def PCFP(self):
    """Fill every unattested cell of ``self.wf_grid`` with a model prediction.

    Steps (numbered to match the inline comments):

    1.   Set up an extrinsic ``Inflector`` whose train/dev files live in
         ``self.model_location``; dump the initial grid for debugging.
    2.   For every row with more than one attested cell, emit one
         source->target instance per ordered pair of attested columns;
         roughly one instance in ten goes to dev, the rest to train.
    3.   Train via ``seq2seq_runner.run``.
    4.   Read ``predictions_dev.txt`` and, per target column, rank the source
         columns by dev accuracy.
    5-6. Write one test instance per empty cell (source chosen at random when
         ``self.baseline == 'random_src'``, otherwise the best-ranked
         attested source) and run the restored checkpoint on it.
    7-8. Read ``predictions_test.txt`` into a copy of the grid and dump the
         completed grid for debugging.

    Returns:
        A numpy array built from ``self.wf_grid`` with predicted forms written
        into the cells that were ``None``.

    Raises:
        IndexError: the test predictions file holds more ``PRD:`` lines than
            there were empty cells.
        Exception: fewer ``PRD:`` lines than empty cells were found.
    """
    # Function-scope import: the file's import block is not touched by this edit.
    from collections import deque

    def format_instance(src_col_idx, src_wf, trg_col_idx, target_side):
        # '<src> c h a r s <trg>\t<target side>'
        return '<{}> {} <{}>\t{}'.format(
            src_col_idx, ' '.join(src_wf), trg_col_idx, target_side)

    def write_grid(path, cells):
        # One tab-separated line per row; cells that are unattested in the
        # ORIGINAL grid are wrapped in <<...>> so predictions stand out.
        with open(path, 'w') as grid_file:
            for row_idx in range(self.r):
                printline = []
                for col_idx in range(self.c):
                    cell = cells[row_idx][col_idx]
                    if self.wf_grid[row_idx][col_idx] is None:
                        printline.append('<<{}>>'.format(cell))
                    else:
                        printline.append(cell)
                grid_file.write('{}\n'.format('\t'.join(printline)))

    # 1. Open a new sub directory for model-to-be-trained
    # NOTE(review): only self.model_location is created here; the sub
    # directory itself is presumably created by Inflector/the runner - confirm.
    ext_model_location = os.path.join(self.model_location,
                                      'Extrinsic_grid_completion')
    if not os.path.isdir(self.model_location):
        os.makedirs(self.model_location)

    # 1.5 Initialize the model
    extrinsic_inflection = Inflector(ext_model_location,
                                     data_format=dataloader.DataFormat.MT,
                                     extrinsic=True)
    extrinsic_inflection.train = os.path.join(self.model_location,
                                              'ext_train.tsv')
    extrinsic_inflection.dev = os.path.join(self.model_location,
                                            'ext_dev.tsv')

    # 1.75. Write out the initial grid for debugging
    write_grid(os.path.join(self.model_location, 'initial_grid.txt'),
               self.wf_grid)
    stderr.write('Finished writing out initial grid.\n')

    # 2. Write out all train and dev instances
    with open(extrinsic_inflection.train, 'w') as train_file, \
            open(extrinsic_inflection.dev, 'w') as dev_file:
        for row_idx in range(self.r):
            row, _wfs, col_idxs = self.get_row(row_idx)
            if len(col_idxs) > 1:
                for trg_col_idx in col_idxs:
                    trg_wf = row[trg_col_idx]
                    for src_col_idx in col_idxs:
                        if src_col_idx == trg_col_idx:
                            continue
                        instance = format_instance(
                            src_col_idx, row[src_col_idx], trg_col_idx,
                            ' '.join(trg_wf))
                        # 1-in-10 chance of landing in dev.
                        if random.choice(range(10)) == 4:
                            dev_file.write('{}\n'.format(instance))
                        else:
                            train_file.write('{}\n'.format(instance))

    # 3. Run the model from scratch.. don't return anything
    extrinsic_inflection.patience = 12
    trained_model = seq2seq_runner.run(extrinsic_inflection)

    # 4. Read in Dev predictions and rank best source cells for each target cell
    # Each entry starts as [correct, total] = [0, 1], so the ratio below can
    # never divide by zero, even for pairs never seen in dev.
    trg_2_src_acc = {
        trg_col_idx: {
            src_col_idx: [0, 1]
            for src_col_idx in range(self.c) if src_col_idx != trg_col_idx
        }
        for trg_col_idx in range(self.c)
    }
    preds = os.path.join(ext_model_location, 'predictions_dev.txt')
    error = False
    with open(preds) as preds_file:
        for line in preds_file:
            line = line.strip()
            if line.startswith('SRC: '):
                # First <n> is the source column, last <n> the target column.
                src_col_idx = int(line.split('<', 1)[1].split('>', 1)[0])
                trg_col_idx = int(line.split('<')[-1].split('>')[0])
                counts = trg_2_src_acc[trg_col_idx][src_col_idx]
                counts[1] += 1
                if not error:
                    counts[0] += 1
            # The flag describes the line just read and is consulted on the
            # NEXT line. NOTE(review): presumably the runner emits an
            # '*ERROR*' line right before the SRC line of a wrong
            # prediction - confirm against the predictions file format.
            error = '*ERROR*' in line

    for src_2_counts in trg_2_src_acc.values():
        for src_col_idx, (correct, total) in list(src_2_counts.items()):
            src_2_counts[src_col_idx] = correct / total

    trg_2_best_srcs = {}
    for trg_col_idx in range(self.c):
        src_2_acc = trg_2_src_acc[trg_col_idx]
        # Stable sort: ties keep ascending column order.
        trg_2_best_srcs[trg_col_idx] = sorted(
            src_2_acc, key=src_2_acc.get, reverse=True)
        stderr.write('Best Predictors for cell {}:\n'.format(trg_col_idx))
        for best_src in trg_2_best_srcs[trg_col_idx]:
            stderr.write('\t{} ({})\n'.format(best_src, src_2_acc[best_src]))

    # 5. Write out test set trying to predict each unattested cell from its
    #    best available predictor
    ext_model_location = os.path.join(self.model_location,
                                      'Extrinsic_grid_completion_final')
    if not os.path.isdir(self.model_location):
        os.makedirs(self.model_location)
    extrinsic_inflection = Inflector(ext_model_location,
                                     data_format=dataloader.DataFormat.MT,
                                     extrinsic=True)
    extrinsic_inflection.train = None
    extrinsic_inflection.dev = None
    extrinsic_inflection.test = os.path.join(self.model_location,
                                             'ext_test.tsv')
    extrinsic_inflection.checkpoint_to_restore = \
        trained_model.best_checkpoint_path

    # FIFO of (row, col) for every test instance, in the order written, so
    # predictions can be matched back positionally. deque gives O(1) pops
    # from the left (list.pop(0) is O(n)).
    empty_slots = deque()
    with open(extrinsic_inflection.test, 'w') as test_file:
        for row_idx in range(self.r):
            row, _wfs, col_idxs = self.get_row(row_idx)
            if None in row and len(col_idxs) > 0:
                for trg_col_idx in range(self.c):
                    if trg_col_idx not in col_idxs:
                        # Make one prediction for every empty cell in grid
                        if self.baseline == 'random_src':
                            src_col_idx = random.choice(col_idxs)
                        else:
                            # Highest-ranked source that is attested in this row.
                            for src_col_idx in trg_2_best_srcs[trg_col_idx]:
                                if src_col_idx in col_idxs:
                                    break
                        instance = format_instance(
                            src_col_idx, row[src_col_idx], trg_col_idx,
                            'PredictMe')
                        test_file.write('{}\n'.format(instance))
                        empty_slots.append((row_idx, trg_col_idx))

    # 6. Continue training the model on dev for one epoch and make
    #    predictions on test
    seq2seq_runner.run(extrinsic_inflection)

    # 7. Parse the predictions file
    full_grid = np.array(self.wf_grid)
    preds = os.path.join(ext_model_location, 'predictions_test.txt')
    with open(preds) as preds_file:
        for line in preds_file:
            line = line.strip()
            if line.startswith('PRD:'):
                row_idx, col_idx = empty_slots.popleft()
                # Drop the inter-character spaces, then restore real spaces
                # which are encoded as underscores.
                pred = ''.join(line.split(':', 1)[1].split()).replace('_', ' ')
                assert self.wf_grid[row_idx][col_idx] is None
                full_grid[row_idx][col_idx] = pred

    # 8. Write out the completed grid for debugging
    write_grid(os.path.join(ext_model_location, 'pred_full_grid.txt'),
               full_grid)

    # Any slot left over means fewer predictions than test instances.
    if empty_slots:
        raise Exception(
            '{}\n\nHow did test instances and predictions get misaligned!?\n\t{}\n\t{}'
            .format(str(full_grid), len(empty_slots),
                    '\n\t'.join(str(x) for x in empty_slots)))

    return full_grid
if to_train:
    # Optional augmentation, configured under params["augmentation"]:
    # either reload a previously generated file or generate it now.
    augmentation_params = params.get("augmentation")
    if augmentation_params is not None:
        suffix = int(augmentation_params["n"])
        generation_params = augmentation_params.get("params", dict())
        augment_file = "augmented/{}-{}-{}".format(language, mode, suffix)
        if os.path.exists(augment_file) and generation_params.get("to_load", True):
            auxiliary_data = read_infile(augment_file)
        else:
            gen_params = copy.copy(generation_params)
            # "to_load" is optional (see the .get(..., True) above) and must
            # not be forwarded as a generation keyword; pop with a default so
            # a missing key does not raise KeyError.
            gen_params.pop("to_load", None)
            auxiliary_data = generate_auxiliary(data, dev_data, suffix, augment_lm_file,
                                                augment_file, **gen_params)
    else:
        auxiliary_data = None
    inflector.train(data, dev_data=dev_data, augmented_data=auxiliary_data, save_file=save_file)
if use_paradigms:
    paradigm_checker = ParadigmChecker().train(data)
if to_rerank_with_lm:
    # Each language model is used only if its saved file exists; otherwise
    # None is passed to the ranker.
    forward_save_file = "language_models/{}-{}.json".format(language, mode)
    forward_lm = load_lm(forward_save_file) if os.path.exists(forward_save_file) else None
    reverse_save_file = "language_models/reverse-{}-{}.json".format(language, mode)
    reverse_lm = load_lm(reverse_save_file) if os.path.exists(reverse_save_file) else None
    lm_ranker = LmRanker(forward_lm, reverse_lm, to_rerank=(to_rerank_with_lm == "rerank"))
if to_test:
    alignment_data = [elem[:2] for elem in data]
    # inflector.evaluate(test_data[:20], alignment_data=alignment_data)
    # sys.exit()
    answer = inflector.predict(test_data, **params["predict"])
    # if use_paradigms:
    #     data_to_filter = [(elem[0], elem[2]) for elem in test_data]