Example #1
def save_bins(destination, tl_dict, token_vectors, bins):
    full_list = []

    for bin_ in bins:
        for problem_id in bin_:
            full_list.append(problem_id)

    for i, bin_ in enumerate(bins):
        test_problems = bin_
        training_problems = list(set(full_list) - set(bin_))

        token_vectors_this_fold = {'train': [], 'validation': [], 'test': []}

        for problem_id in training_problems:
            if problem_id in token_vectors['train']:
                token_vectors_this_fold['train'] += token_vectors['train'][
                    problem_id]
            if problem_id in token_vectors['validation']:
                token_vectors_this_fold['validation'] += token_vectors[
                    'validation'][problem_id]

        for problem_id in test_problems:
            if problem_id in token_vectors['validation']:
                token_vectors_this_fold['test'] += token_vectors['validation'][
                    problem_id]

        make_dir_if_not_exists(os.path.join(destination, 'bin_%d' % i))

        print "Fold %d: Train:%d Validation:%d Test:%d" % (
            i, len(token_vectors_this_fold['train']),
            len(token_vectors_this_fold['validation']),
            len(token_vectors_this_fold['test']))

        save_pairs(os.path.join(destination, 'bin_%d' % i),
                   token_vectors_this_fold, tl_dict)
def my_save_bins(destination, tl_dict, token_vectors, rng):
    full_list = list(get_cs_tokenized().keys())  # list() so the ids can be shuffled in place
    rng.shuffle(full_list)
    fold_n = 5
    bins = []
    for i in range(fold_n):
        bins.append(full_list[len(full_list) * i // fold_n:len(full_list) *
                              (i + 1) // fold_n])
    for i, bin_ in enumerate(bins):
        token_vectors_this_fold = {'train': [], 'validation': [], 'test': []}
        for problem_id in set(full_list) - set(bin_):
            if rng.rand() < 0.8:
                token_vectors_this_fold['train'] += token_vectors['train'][
                    problem_id]
            else:
                token_vectors_this_fold['validation'] += token_vectors[
                    'train'][problem_id]
        for problem_id in bin_:
            token_vectors_this_fold['test'] += token_vectors['train'][
                problem_id]
        print('Fold {}: (Train, Validation, Test) == ({} {} {})'.format(
            i, len(token_vectors_this_fold['train']),
            len(token_vectors_this_fold['validation']),
            len(token_vectors_this_fold['test'])))
        make_dir_if_not_exists(os.path.join(destination, 'bin_{}'.format(i)))
        save_pairs(os.path.join(destination, 'bin_{}'.format(i)),
                   token_vectors_this_fold, tl_dict)
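
A minimal sketch of the fold slicing used above, with illustrative stand-in ids
(not part of the original example): the integer-slice arithmetic partitions the
shuffled id list into fold_n contiguous, non-overlapping bins whose sizes differ
by at most one.

import numpy as np

rng = np.random.RandomState(1189)
ids = list(range(23))  # stand-in for the problem ids
rng.shuffle(ids)

fold_n = 5
bins = [ids[len(ids) * i // fold_n:len(ids) * (i + 1) // fold_n]
        for i in range(fold_n)]

# Every id lands in exactly one bin, and bin sizes differ by at most 1.
assert sum(len(b) for b in bins) == len(ids)
assert max(len(b) for b in bins) - min(len(b) for b in bins) <= 1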
Example #3
    def __init__(self, data_folder, shuffle=True, load_only_dicts=False):
        self.rng = np.random.RandomState(1189)
        self.tl_dict, self.rev_tl_dict = load_dictionaries(data_folder)
        assert self.tl_dict is not None and self.rev_tl_dict is not None

        if load_only_dicts:
            return

        if not shuffle:
            self.train_ex, self.valid_ex, self.test_ex = self._deserialize(
                data_folder)

        else:
            try:
                self.train_ex, self.valid_ex, self.test_ex = self._deserialize(
                    os.path.join(data_folder, 'shuffled'))

                print "Successfully loaded shuffled data."
                sys.stdout.flush()

            except IOError:
                print "Generating shuffled data..."
                sys.stdout.flush()

                self.train_ex, self.valid_ex, self.test_ex = self._deserialize(
                    data_folder)

                self.rng.shuffle(self.train_ex)
                self.rng.shuffle(self.valid_ex)
                self.rng.shuffle(self.test_ex)

                make_dir_if_not_exists(os.path.join(data_folder, 'shuffled'))

                np.save(
                    os.path.join(data_folder, 'shuffled',
                                 'examples-train.npy'), self.train_ex)
                np.save(
                    os.path.join(data_folder, 'shuffled',
                                 'examples-validation.npy'), self.valid_ex)
                np.save(
                    os.path.join(data_folder, 'shuffled', 'examples-test.npy'),
                    self.test_ex)
def save_bins(destination, tl_dict, token_vectors, rng):
    fold_n = 5
    bins = []
    full_list = list(get_cs_tokenized().keys())  # list() so the ids can be shuffled in place
    rng.shuffle(full_list)
    for i in range(fold_n):
        bins.append(full_list[len(full_list) * i // fold_n:len(full_list) *
                              (i + 1) // fold_n])
    for i, bin_ in enumerate(bins):
        test_problems = bin_
        training_problems = list(set(full_list) - set(bin_))

        token_vectors_this_fold = {'train': [], 'validation': [], 'test': []}

        for problem_id in training_problems:
            if problem_id in token_vectors['train']:
                token_vectors_this_fold['train'] += token_vectors['train'][
                    problem_id]
            if problem_id in token_vectors['validation']:
                token_vectors_this_fold['validation'] += token_vectors[
                    'validation'][problem_id]

        for problem_id in test_problems:
            if problem_id in token_vectors['validation']:
                token_vectors_this_fold['test'] += token_vectors['validation'][
                    problem_id]

        make_dir_if_not_exists(os.path.join(destination, 'bin_%d' % i))

        print "Fold %d: Train:%d Validation:%d Test:%d" % (
            i, len(token_vectors_this_fold['train']),
            len(token_vectors_this_fold['validation']),
            len(token_vectors_this_fold['test']))

        save_pairs(os.path.join(destination, 'bin_%d' % i),
                   token_vectors_this_fold, tl_dict)
    drop_ids = kind_mutations == 'typo'
    max_program_length = 450
    min_program_length = 75
    max_fix_length = 25

    max_mutations = 5
    max_variants = 4 if kind_mutations == 'ids' else 2

    db_path = os.path.join('data', 'iitk-dataset', 'dataset.db')
    validation_users = np.load(
        os.path.join('data', 'iitk-dataset', 'validation_users.npy')).item()
    bins = np.load(os.path.join('data', 'iitk-dataset', 'bins.npy'))

    seed = 1189

    output_directory = os.path.join('data/network_inputs',
                                    'iitk-%s-%d' % (kind_mutations, seed))

    print('output_directory:', output_directory)
    make_dir_if_not_exists(os.path.join(output_directory))

    result = generate_training_data(db_path, bins, validation_users,
                                    min_program_length, max_program_length,
                                    max_fix_length, kind_mutations,
                                    max_mutations, max_variants, seed)

    np.save(os.path.join(output_directory, 'testing-tokenized-examples.npy'),
            result)
    print('\n\n--------------- all outputs written to {} ---------------\n\n'.
          format(output_directory))
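
A condensed sketch of the shuffle-and-cache pattern used in the __init__ above
(the helper name, file names, and the use of os.makedirs in place of
make_dir_if_not_exists are illustrative): the shuffled copy is loaded from a
'shuffled' subfolder when it exists, and is generated and saved there on first
use.

import os
import numpy as np

def load_shuffled(data_folder, name, rng):
    """Load a cached shuffled copy of <name>.npy, creating it on first use."""
    cache_dir = os.path.join(data_folder, 'shuffled')
    cache_path = os.path.join(cache_dir, name + '.npy')
    try:
        return np.load(cache_path, allow_pickle=True)
    except IOError:
        data = np.load(os.path.join(data_folder, name + '.npy'),
                       allow_pickle=True)
        rng.shuffle(data)                      # shuffle along the first axis, in place
        os.makedirs(cache_dir, exist_ok=True)  # stand-in for make_dir_if_not_exists
        np.save(cache_path, data)
        return data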
Example #6
 def __init__(self,
              data_folder,
              load_real_test_data=False,
              load_seeded_test_data=False,
              load_only_dicts=False,
              shuffle=False,
              seed=1189):
     self.rng = np.random.RandomState(seed)
     self.tl_dict, self.rev_tl_dict = load_dictionaries(data_folder)
     assert self.tl_dict is not None and self.rev_tl_dict is not None
     assert self.tl_dict['-new-line-'] == 2
     if load_only_dicts:
         return
     if load_real_test_data:
         try:
             self.real_test_data = np.load(os.path.join(
                 data_folder, 'test_real_raw.npy'),
                                           allow_pickle=True).item()
         except:
             self.real_test_data = np.load(os.path.join(
                 data_folder, 'test_raw.npy'),
                                           allow_pickle=True).item()
     if load_seeded_test_data:
         try:
             self.seeded_test_data = np.load(os.path.join(
                 data_folder, 'test_real_seeded.npy'),
                                             allow_pickle=True).item()
         except:
             self.seeded_test_data = np.load(os.path.join(
                 data_folder, 'test_seeded.npy'),
                                             allow_pickle=True).item()
     try:
         self.name_dict_store = np.load(os.path.join(
             data_folder, 'name_dict_store.npy'),
                                        allow_pickle=True).item()
         # Here we also load raw_test's name_dict, then merge the two together
         self.test_name_dict_store = np.load(os.path.join(
             data_folder, 'test_name_dict_store.npy'),
                                             allow_pickle=True).item()
         self.name_dict_store = dict(self.name_dict_store,
                                     **self.test_name_dict_store)
     except:
         print('init name_dict_store with {}')
         self.name_dict_store = {}
     if not shuffle:
         # Load originals
         self.train_ex, self.valid_ex, self.test_ex = self._deserialize(
             data_folder)
         print "Successfully loaded data."
     else:
         try:  # to load pre-generated shuffled data
             self.train_ex, self.valid_ex, self.test_ex = self._deserialize(
                 os.path.join(data_folder, 'shuffled'))
             print "Successfully loaded shuffled data."
         # or generate it
         except IOError:
             print "Generating shuffled data..."
             sys.stdout.flush()
             # Load originals
             self.train_ex, self.valid_ex, self.test_ex = self._deserialize(
                 data_folder)
             # Shuffle
             self.rng.shuffle(self.train_ex)
             self.rng.shuffle(self.valid_ex)
             self.rng.shuffle(self.test_ex)
             # Save for later
             make_dir_if_not_exists(os.path.join(data_folder, 'shuffled'))
             np.save(
                 os.path.join(data_folder, 'shuffled',
                              'examples-train.npy'), self.train_ex)
             np.save(
                 os.path.join(data_folder, 'shuffled',
                              'examples-validation.npy'), self.valid_ex)
             np.save(
                 os.path.join(data_folder, 'shuffled', 'examples-test.npy'),
                 self.test_ex)
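
The .npy round-trip for plain Python dicts that the loading code above relies on
(np.save of a dict, then np.load(..., allow_pickle=True).item()) looks like this
in isolation; the file name and contents here are illustrative:

import numpy as np

name_dict = {'var_1': 'count', 'var_2': 'total'}
np.save('name_dict_store.npy', name_dict)  # stored as a 0-d object array

loaded = np.load('name_dict_store.npy', allow_pickle=True).item()  # .item() unwraps the dict
assert loaded == name_dict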
Example #7
parser.add_argument('-v',
                    '--vram',
                    help='Fraction of GPU memory to use',
                    type=float,
                    default=1.0)

args = parser.parse_args()

# Default checkpoints directory
if args.checkpoints_directory is None:
    checkpoints_directory = os.path.join('checkpoints', 'fold_%d' % args.fold)
else:
    checkpoints_directory = args.checkpoints_directory

# Make checkpoint directories
make_dir_if_not_exists(checkpoints_directory)
make_dir_if_not_exists(os.path.join(checkpoints_directory, 'best'))

# Print options
print('Checkpoint every:', args.ckpt_every)
print('Batch size:', args.batch_size)
print('Embedding dim:', args.embedding_dim)
print('Memory dim:', args.memory_dim)
print('Layers:', args.num_layers)
print('Epochs:', args.epochs)
print('Resume at:', args.resume_at)
print('Resume epoch:', args.resume_epoch)
print('Resume training minibatch:', args.resume_training_minibatch)
print('RNN cell:', args.rnn_cell)
print('Bidirectional:', args.bidirectional)
sys.stdout.flush()
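
The --vram fraction defined above is presumably passed into the TensorFlow
session configuration; a minimal sketch of that wiring, assuming TensorFlow 1.x
(this sketch is not taken from the original script):

import tensorflow as tf

gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.vram)
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))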
Example #8
                                                  '''data/results/''')
args.checkpoint_directory = args.checkpoint_directory + \
    ('' if args.checkpoint_directory.endswith('/') else '/')
bin_id = None
try:
    if args.checkpoint_directory.find('bin_') == -1:
        raise ValueError('ERROR: failed to find the bin id')
    bin_id = int(args.checkpoint_directory[-2])
    print('bin_id:', bin_id)
except:
    raise

if args.database:
    database = args.database
else:
    make_dir_if_not_exists(database_path)
    database_name = args.which + '_' + args.task + '.db'
    database = os.path.join(database_path, database_name)

print('using database:', database)

if not args.data_directory:
    training_args = np.load(
        os.path.join(args.checkpoint_directory,
                     'experiment-configuration.npy')).item()['args']
    args.data_directory = training_args.data_directory

print('data directory:', args.data_directory)

conn = sqlite3.connect(database)
c = conn.cursor()
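
The bin id above is taken from the second-to-last character of the checkpoint
path, which only works when the path ends in a single-digit 'bin_<d>/'. A small
regex-based alternative (illustrative, not part of the original script) that
also handles multi-digit bin ids:

import re

def extract_bin_id(checkpoint_directory):
    match = re.search(r'bin_(\d+)', checkpoint_directory)
    if match is None:
        raise ValueError('ERROR: failed to find the bin id')
    return int(match.group(1))

assert extract_bin_id('data/results/bin_3/') == 3
assert extract_bin_id('checkpoints/bin_12/best/') == 12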
Example #9
    def __init__(self, fold, shuffle=True):
        data_folder = os.path.join('network_inputs')
        fold_folder = os.path.join(data_folder, 'fold_%d' % fold)
        self.tl_dict = np.load(os.path.join(data_folder,
                                            'translate_dict.npy')).item()

        if not shuffle:
            # Load originals
            self.train_x = np.load(
                os.path.join(fold_folder, 'mutated-train.npy'))
            self.train_y = np.load(os.path.join(fold_folder,
                                                'fixes-train.npy'))
            self.valid_x = np.load(
                os.path.join(fold_folder, 'mutated-validation.npy'))
            self.valid_y = np.load(
                os.path.join(fold_folder, 'fixes-validation.npy'))
            self.test_x = np.load(os.path.join(fold_folder,
                                               'mutated-test.npy'))
            self.test_y = np.load(os.path.join(fold_folder, 'fixes-test.npy'))

        else:
            try:
                self.train_x = np.load(
                    os.path.join(fold_folder, 'shuffled/mutated-train.npy'))
                self.train_y = np.load(
                    os.path.join(fold_folder, 'shuffled/fixes-train.npy'))
                self.valid_x = np.load(
                    os.path.join(fold_folder,
                                 'shuffled/mutated-validation.npy'))
                self.valid_y = np.load(
                    os.path.join(fold_folder, 'shuffled/fixes-validation.npy'))
                self.test_x = np.load(
                    os.path.join(fold_folder, 'shuffled/mutated-test.npy'))
                self.test_y = np.load(
                    os.path.join(fold_folder, 'shuffled/fixes-test.npy'))

                print "Successfully loaded shuffled data."
                sys.stdout.flush()

            # If not, generate it
            except IOError:
                print "Generating shuffled data..."
                sys.stdout.flush()

                # Load originals
                self.train_x = np.load(
                    os.path.join(fold_folder, 'mutated-train.npy'))
                self.train_y = np.load(
                    os.path.join(fold_folder, 'fixes-train.npy'))
                self.valid_x = np.load(
                    os.path.join(fold_folder, 'mutated-validation.npy'))
                self.valid_y = np.load(
                    os.path.join(fold_folder, 'fixes-validation.npy'))
                self.test_x = np.load(
                    os.path.join(fold_folder, 'mutated-test.npy'))
                self.test_y = np.load(
                    os.path.join(fold_folder, 'fixes-test.npy'))

                # Shuffle
                self.train_x, self.train_y = zip(
                    *_shuffle(zip(list(self.train_x), list(self.train_y))))
                self.valid_x, self.valid_y = zip(
                    *_shuffle(zip(list(self.valid_x), list(self.valid_y))))
                self.test_x, self.test_y = zip(
                    *_shuffle(zip(list(self.test_x), list(self.test_y))))

                # Convert to np array
                self.train_x, self.train_y = np.array(self.train_x), np.array(
                    self.train_y)
                self.valid_x, self.valid_y = np.array(self.valid_x), np.array(
                    self.valid_y)
                self.test_x, self.test_y = np.array(self.test_x), np.array(
                    self.test_y)

                # Save for later
                make_dir_if_not_exists(os.path.join(fold_folder, 'shuffled'))

                np.save(
                    os.path.join(fold_folder, 'shuffled/mutated-train.npy'),
                    self.train_x)
                np.save(os.path.join(fold_folder, 'shuffled/fixes-train.npy'),
                        self.train_y)
                np.save(
                    os.path.join(fold_folder,
                                 'shuffled/mutated-validation.npy'),
                    self.valid_x)
                np.save(
                    os.path.join(fold_folder, 'shuffled/fixes-validation.npy'),
                    self.valid_y)
                np.save(os.path.join(fold_folder, 'shuffled/mutated-test.npy'),
                        self.test_x)
                np.save(os.path.join(fold_folder, 'shuffled/fixes-test.npy'),
                        self.test_y)

        # Check
        assert (len(self.train_x) == len(self.train_y))
        assert (len(self.valid_x) == len(self.valid_y))
        assert (len(self.test_x) == len(self.test_y))
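
The except-branch above shuffles inputs and fixes together by zipping them so
the pairs stay aligned; an equivalent sketch (with illustrative toy arrays)
indexes both arrays with a single random permutation instead:

import numpy as np

rng = np.random.RandomState(1189)
train_x = np.array(['p1', 'p2', 'p3', 'p4'])
train_y = np.array(['f1', 'f2', 'f3', 'f4'])

perm = rng.permutation(len(train_x))  # one permutation keeps x[i] paired with y[i]
train_x, train_y = train_x[perm], train_y[perm]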