def predict(model, settings, to_predict, elmo, vocabs):
    """Run the model on ``to_predict``, report F1 and write the parses to disk.

    The predictions file is written to ``settings.dir`` and named after the
    last path component of ``to_predict`` with a ``.pred`` suffix.  When the
    model also returns secondary ("other") predictions, their F1 is printed
    too and they are merged into every sentence before it is written.

    Always returns ``True``.
    """
    out_path = settings.dir + to_predict.split("/")[-1] + ".pred"
    entries, predicted, other_predicted = model.predict(to_predict, elmo)

    # Lazily pair (gold, predicted) matrices; zip(*...) regroups them into
    # the two positional arguments sc.score() is called with.
    primary_pairs = (
        (entry[1][settings.pt].numpy(), predicted[entry[0]].numpy())
        for entry in entries
    )
    f1, _ = sc.score(*zip(*primary_pairs))
    print("F1 is {:.2%}".format(f1))

    if len(other_predicted) > 0:
        secondary_pairs = (
            (entry[1][settings.ot].numpy(), other_predicted[entry[0]].numpy())
            for entry in entries
        )
        other_f1, _ = sc.score(*zip(*secondary_pairs))
        print("Other F1 is {:.2%}".format(other_f1))

    with open(out_path, "w") as fh:
        for sentence in cd.read_col_data(to_predict):
            matrix = predicted[sentence.id].numpy()
            if settings.target_style == "scope-":
                # Overlay the cue matrix so cue cells survive in the output.
                cues = sentence.make_matrix("cues", True,
                                            vocabs[settings.td["cue"]].w2i)
                matrix = np.maximum(matrix, cues)
            sentence.update_parse(matrix, settings.target_style,
                                  vocabs[settings.pt].i2w)

            if len(other_predicted) > 0:
                matrix = other_predicted[sentence.id].numpy()
                # When both targets share a style, the secondary parse is
                # stored under "syn" instead.
                if settings.target_style == settings.other_target_style:
                    other_style = "syn"
                else:
                    other_style = settings.other_target_style
                sentence.update_parse(matrix, other_style,
                                      vocabs[settings.pt].i2w)
            print(sentence, file=fh)
    return True
def main():
    """Ensemble several models' probability files by majority vote and score it.

    Reads the comma-separated file names given via ``--files`` from
    ``--prob_dir``, majority-votes them, writes the resulting labels to
    ``<prob_dir>/ensemble.prediction.tmp`` and scores that file against
    ``--key_file``.
    """
    arg_parser = argparse.ArgumentParser(
        description='Take the probability files from several models and do model ensembling.')
    arg_parser.add_argument('--prob_dir', type=str, default='tmp/ensemble/',
                            dest='prob_dir', action='store',
                            help='The dir where the prob files locate.')
    arg_parser.add_argument('--files', type=str, dest='files', action='store',
                            required=True,
                            help='The list of filenames, separated by comma.')
    arg_parser.add_argument('--key_file', type=str, dest='key_file',
                            action='store', required=True,
                            help='Where to find the key file.')
    args = arg_parser.parse_args()

    if not os.path.exists(args.prob_dir):
        raise Exception('Probability file dir does not exist at ' + args.prob_dir)

    names = args.files.split(',')
    if len(names) <= 1:
        raise Exception(
            'Need to provide more than one model prediction files.')
    paths = [os.path.join(args.prob_dir, name) for name in names]

    prob_matrices = []
    for path in paths:
        print("Reading prediction prob file at: " + path)
        prob_matrices.append(read_prob_file(path))

    print("Doing majority vote to generate final predictions...")
    voted_ids = majority_vote(prob_matrices)

    # convert preds from index to labels
    id2label = {idx: label for label, idx in data_utils.LABEL_TO_ID.items()}
    labels = [id2label[idx] for idx in voted_ids]

    # write pred file
    pred_file = args.prob_dir + '/ensemble.prediction.tmp'
    with open(pred_file, 'w') as outfile:
        for label in labels:
            outfile.write(label + '\t1.0' + '\n')

    # score
    scorer.score(args.key_file, [pred_file], 1, True)
def solve(problems):
    """Greedily pick libraries for each requested problem and write the result.

    ``problems`` is an iterable of keys into ``files`` ('a' .. 'f').  For each
    one the input is read from ``./inputs/``, libraries are chosen one at a
    time by highest ``get_score(days_left)``, and the chosen libraries are
    written to ``./outputs/``.

    Returns ``scorer.score(run_file)`` for the last processed file.
    NOTE(review): the original indentation was lost; the final ``return`` is
    placed after the loop here -- confirm it was not meant to sit inside it.
    """
    files = {'a': 'a_example.txt', 'b': 'b_read_on.txt', 'c': 'c_incunabula.txt', 'd': 'd_tough_choices.txt', 'e': 'e_so_many_books.txt', 'f': 'f_libraries_of_the_world.txt'}
    pool = Pool()
    for f in problems:
        run_file = files[f]
        days_left, remaining_libs = reader.read('./inputs/' + run_file)
        outputs = []
        while days_left > 0 and len(remaining_libs) > 0:
            # Tuning:
            # For b, c, f: 50 is better than 0
            # For e: 0 is better than 50
            # NOTE(review): no 0/50 parameter is visible in this function;
            # the tuning note presumably refers to get_score -- confirm.
            scores = pool.map(lambda x: x.get_score(days_left), remaining_libs)
            # Take the library with the best score for the days remaining.
            next_lib = remaining_libs[np.argmax(scores)]
            _ = pool.map(lambda x: x.scan_copy(), next_lib.books.values())
            remaining_libs.remove(next_lib)
            next_lib.books = next_lib.avail_books(days_left)
            # A library with nothing left to offer is dropped without
            # spending any sign-up days on it.
            if not next_lib.books:
                continue
            # Tell the remaining libraries which books are already covered.
            _ = pool.map(lambda x: x.remove_dupes(next_lib.books.keys()), remaining_libs)
            days_left = days_left - next_lib.signup
            outputs.append(next_lib)
        writer.write('./outputs/' + run_file, outputs)
    return scorer.score(run_file)
def simulate_game(players, deck, hand, table):
    """Deal out one random game and report how ``hand`` fares.

    Cards are popped from ``deck`` to complete ``hand`` and ``table`` (both
    are extended in place) and then to deal a full hand to each of the
    ``players - 1`` opponents.  Returns a ``GameOutcome``: WIN when there are
    no opponents or our score beats the best opponent, LOSS when it is
    lower, TIE otherwise.
    """
    # Deal order matters for reproducibility: own hand, table, opponents.
    hand += [deck.pop() for _ in range(CARDS_IN_HAND - len(hand))]
    table += [deck.pop() for _ in range(CARDS_IN_RIVER - len(table))]

    opponent_hands = []
    for _ in range(players - 1):
        opponent_hands.append([deck.pop() for _ in range(CARDS_IN_HAND)])

    own_score = scorer.score(hand + table)
    rival_scores = [scorer.score(cards + table) for cards in opponent_hands]

    if not rival_scores:
        return GameOutcome.WIN

    best_rival = max(rival_scores)
    if own_score < best_rival:
        return GameOutcome.LOSS
    if own_score > best_rival:
        return GameOutcome.WIN
    return GameOutcome.TIE
def cl_corner_bite_render(compressed, size):
    """Fill a corner-bite layout with ``compressed``, solve it and describe it.

    ``compressed`` holds one symbol per cell ('*' mine, '?' secret).  The
    layout from ``cl_corner_bite(size)`` gets its cell symbols and hint
    counts filled in, is solved and scored, and a dict with title, tile
    text, nodes, column hints and score is returned.
    """
    num, board, revealed, constraints, _ = cl_corner_bite(size)
    tile_size = 10
    half_edge = 0.96 * tile_size / 2
    points = '-{s},-{s},{s},-{s},{s},{s},-{s},{s}'.format(s=half_edge)

    def mines_in(cell_ids):
        # Number of cells in cell_ids that hold a mine.
        return sum(compressed[cell] == '*' for cell in cell_ids)

    nodes = []
    for cell in range(num):
        symbol = compressed[cell]
        board[cell][1] = symbol
        nodes.append({
            'id': cell,
            'neighbors': board[cell][2],
            'position': ((cell % size) * tile_size,
                         (cell // size) * tile_size),
            'has_mine': symbol == '*',
            'secret': symbol == '?',
            'revealed': cell in revealed,
            'points': points,
        })

    columns = []
    for j in range(size):
        # horizontal column hints
        horizontal = constraints[2 * j]
        horizontal[0] = mines_in(horizontal[1])
        columns.append({
            'ids': horizontal[1],
            'text_location': (-tile_size, j * tile_size),
        })
        # vertical column hints
        vertical = constraints[2 * j + 1]
        vertical[0] = mines_in(vertical[1])
        columns.append({
            'ids': vertical[1],
            'text_location': (j * tile_size, -tile_size),
        })

    # The last constraint carries the total mine count.
    constraints[-1][0] = compressed.count('*')

    result = Puzzle(board, revealed, constraints).solve()
    scored = score(result, 'seqnum')
    return {
        'title': f'CL Corner Bite {size}x{size} with score {scored}',
        'tile_text': 'CoB',
        'nodes': nodes,
        'columns': columns,
        'scored': scored,
    }
def clone(compressed, filename):
    """Reuse the layout of a saved puzzle, optionally with new cell contents.

    Without ``compressed`` only the layout is returned (``num``, ``board``,
    ``revealed``, ``constraints``).  With it, the unrevealed cells are
    replaced by the symbols in ``compressed``, the puzzle is solved and
    scored, and a render description is returned.
    """
    with open(filename) as handle:
        contents = handle.read()

    puzzle, name, reverse_id_map = load(contents)
    board, revealed = puzzle.board, puzzle.revealed
    constraints = puzzle.og_constraints
    num = len(board)
    # Stable sort: unrevealed cells first, relative order otherwise kept.
    board.sort(key=lambda cell: cell[0] in revealed)

    if not compressed:
        return {
            'num': num,
            'board': board,
            'revealed': revealed,
            'constraints': constraints,
        }

    replace_cells(board, revealed, constraints, compressed)
    result = Puzzle(board, revealed, constraints).solve()
    scored = score(result, 'seqnum')

    data = extract(contents)
    revealed_seen = 0
    for index, node in enumerate(data['nodes']):
        if index in revealed:
            node['revealed'] = True
            revealed_seen += 1
            continue
        # compressed only covers unrevealed cells, so skip past the
        # revealed ones already encountered.
        symbol = compressed[index - revealed_seen]
        node['has_mine'] = symbol == '*'
        node['secret'] = symbol == '?'

    return {
        'title': f'Cloned "{name}" with score {scored}',
        'tile_text': 'CLO',
        'scored': scored,
        'nodes': data['nodes'],
        'columns': data['columns'],
        'colors': data['colors'],
    }
def main():
    """Order the libraries with a per-dataset heuristic, scan, write and score.

    The dataset is identified by the sixth character of the input path
    given as the first command-line argument.
    """
    input_path = sys.argv[1]
    books, libraries, num_days = readFile(input_path)

    for library in libraries:
        library.tot_score = sum(books[book] for book in library.books)

    # reward_func arguments for the datasets that are ordered by reward.
    reward_args = {
        'c': (1, 0, 34),
        'e': (1, 45900, 45900),
        'f': (1, 7000, 7000),
    }

    file_id = input_path[5]
    if file_id in ('a', 'b'):
        libraries.sort(key=lambda lib: lib.su_time)
    elif file_id == 'd':
        libraries = sort_by_diff(libraries)
    elif file_id in reward_args:
        chosen = reward_args[file_id]
        libraries.sort(key=lambda lib: reward_func(lib, *chosen))

    libraries = scan_books(books, libraries, num_days)
    outputFile(input_path, libraries)
    print(score(books, libraries))
def run(level_id=None, verbose=False):
    """Solve the test puzzles and print timing, solved flag and score for each.

    Args:
        level_id: ``'-1'`` selects ``../latest.puz`` (the most recently
            generated puzzle); any other truthy value selects the entry of
            ``test/index`` with that id; ``None`` runs every indexed puzzle.
        verbose: also print the raw result and the solver's summary steps,
            plus a ``P:_:...`` line for puzzles that were not solved.

    ``test/index`` consists of lines of the form ``[id] [filename]``, with
    the files living in that same folder.  The file does not exist in a
    freshly cloned repo.
    """
    # a shorthand to make it easy to test the latest puzzle you generated
    if level_id == '-1':
        filenames = ['../latest.puz']
    else:
        filenames = []
        with open('test/index') as index_file:
            for line in index_file:
                line = line.strip()
                # Blank lines (typically the one after the final newline)
                # used to crash the tuple unpacking below.
                if not line:
                    continue
                entry_id, name = line.split(None, 1)
                filenames.append(name)
                if level_id and entry_id == level_id:
                    filenames = [name]
                    break
    print('filenames:', filenames)

    for index, filename in enumerate(filenames):
        with open(f'test/{filename}') as level:
            puzzle, name, reverse_id_map = load(level.read(), verbose=verbose)
        st = time.time()
        result = puzzle.solve()
        et = time.time()
        # NOTE(review): original indentation was lost; the summary printout
        # is assumed to belong to the verbose branch -- confirm.
        if verbose:
            print('result:', result)
            print('')
            for step in result['summary']:
                print(' ', step)
        if verbose and not result['solved']:
            # Map internal cell ids back to the ids used in the file.
            flagged = ','.join(
                [reverse_id_map[cell_id] for cell_id in result['flagged']])
            revealed = ','.join(
                [reverse_id_map[cell_id] for cell_id in result['revealed']])
            print(f'P:_:{flagged}:{revealed}:n')
        scored = score(result, 'seqnum')
        print(f'{index + 1:3} {filename:20}: {et - st:.3f} seconds, solved {result["solved"]}, score {scored:.3f} - {name}')
def check_cutoff(nrl, cutoff=0.4):
    """Tell whether ``nrl`` scores above ``cutoff`` against its commonest entry.

    Every element of ``nrl`` is scored against ``most_common(nrl)`` using
    the matrix returned by ``sc.blosum62()``; the result is ``True`` when
    the mean of those scores exceeds ``cutoff`` and ``False`` otherwise,
    including when there is nothing to score.
    """
    reference = most_common(nrl)
    order, matrix = sc.blosum62()
    scores = [sc.score(reference, residue, order, matrix) for residue in nrl]

    if len(scores) == 0:
        return False

    mean_score = sum(scores) / len(scores)
    return True if mean_score > cutoff else False
# Grid search over the two scorer parameters: i is swept over 5..55 in steps
# of 5 and j over 0..96 in steps of 4 (named sector / noise thresholds by the
# result lists below).  Relies on max_score, best_sector_threshs, expecteds
# and dirPath being set up before this point.
best_noise_threshs = []
for i in range(5, 60, 5):
    # Print to help track progress
    print(str(i))
    for j in range(0, 100, 4):
        score = 0
        scorer.set_params(i, j)
        out_of_images = False
        cnt = 0
        # Images are numbered consecutively from 0; the first one that
        # cannot be read ends the pass.
        while not out_of_images:
            img = cv2.imread(dirPath + 'cells/' + str(cnt) + '.png')
            if img is not None:
                expected = int(expecteds[cnt])
                # Ignore rejected cells
                if 9 > expected > 0:
                    actual = scorer.score(img)
                    difference = abs(expected - actual)
                    if difference == 0:
                        score += EXACT_WEIGHTING
                    elif difference == 1:
                        score += ONE_OFF_WEIGHTING
                cnt += 1
            else:
                out_of_images = True
        # A strictly better score restarts the lists of best parameters;
        # a tie (the only case the elif can reach) is added to them.
        if score > max_score:
            max_score = score
            best_sector_threshs = [i]
            best_noise_threshs = [j]
        elif score >= max_score:
            best_sector_threshs.append(i)
            best_noise_threshs.append(j)
def holey_render(compressed, size):
    """Fill a "holey" layout with ``compressed``, solve it and describe it.

    ``compressed`` holds one symbol per board entry ('*' mine, '?' secret);
    entries beyond its length are treated as '.'.  Returns a dict with
    title, tile text, nodes, column hints and score.
    """
    num, board, revealed, constraints, _ = holey(size)
    size = 2 * size + 1  # from here on: side length of the full grid
    tile_size = 10
    half_edge = 0.96 * tile_size / 2
    points = '-{s},-{s},{s},-{s},{s},{s},-{s},{s}'.format(s=half_edge)

    nodes = []
    mapped = {}  # cell id -> symbol
    for i in range(size**2):
        symbol = compressed[i] if i < len(compressed) else '.'
        cell = board[i]
        cell[1] = symbol
        cell_id = cell[0]
        mapped[cell_id] = symbol
        nodes.append({
            'id': cell_id,
            'neighbors': cell[2],
            'position': ((cell_id % size) * tile_size,
                         (cell_id // size) * tile_size),
            'has_mine': symbol == '*',
            'secret': symbol == '?',
            'revealed': cell_id in revealed,
            'points': points,
        })

    def mines_in(cell_ids):
        # Number of cells in cell_ids that hold a mine.
        return sum(mapped[cell] == '*' for cell in cell_ids)

    columns = []
    for j in range(size // 2 + 1):
        # horizontal column hints
        horizontal = constraints[2 * j]
        horizontal[0] = mines_in(horizontal[1])
        columns.append({
            'ids': horizontal[1],
            'text_location': (-tile_size, 2 * j * tile_size),
        })
        # vertical column hints
        vertical = constraints[2 * j + 1]
        vertical[0] = mines_in(vertical[1])
        columns.append({
            'ids': vertical[1],
            'text_location': (2 * j * tile_size, -tile_size),
        })

    # The last constraint carries the total mine count.
    constraints[-1][0] = compressed.count('*')

    result = Puzzle(board, revealed, constraints).solve()
    scored = score(result, 'seqnum')
    return {
        'title': f'Holey {size}x{size} with score {scored}',
        'tile_text': 'HOL',
        'nodes': nodes,
        'columns': columns,
        'scored': scored,
    }
def L_shape_grid(compressed, size, depth):
    """Build an L-shaped grid puzzle and, given cell contents, solve it.

    The grid is a square of side ``2 * size * 2**depth`` with the top-left
    quadrant cut away.  Only a one-cell-wide rim of that quadrant is kept,
    as revealed '.' cells.  The remaining L is split into four coloured
    regions, laid out like this at the coarsest level:

          33
          13
        2114
        2244

    Args:
        compressed: one symbol per unrevealed cell in row-major order
            ('*' mine, '?' secret).  When falsy, only the layout is built.
        size: base size (anything ``int()`` accepts).
        depth: number of subdivision levels (anything ``int()`` accepts).

    Returns:
        Without ``compressed``: dict with ``num``, ``board``, ``revealed``
        and ``constraints``.  With it: a render dict with ``title``,
        ``tile_text``, ``nodes``, ``columns``, ``colors`` and ``scored``.
    """
    size, depth = int(size), int(depth)
    side_length = 2 * size * 2**depth
    half = side_length // 2
    points = '-{s},-{s},{s},-{s},{s},{s},-{s},{s}'.format(s=0.96 / 2)
    cindex = 0
    board = []
    revealed = []
    constraints = []
    nodes = []
    columns = []
    colors = [
        dict(ids=[], color='RED', is_dark=False),
        dict(ids=[], color='ORANGE', is_dark=False),
        dict(ids=[], color='GREEN', is_dark=False),
        dict(ids=[], color='BLUE', is_dark=False),
    ]

    # Number the existing cells consecutively in row-major order.
    id_map = dict()
    for y in range(side_length):
        for x in range(side_length):
            if x < half - 1 and y < half - 1:
                continue
            id_map[(x, y)] = len(id_map)

    for y in range(side_length):
        for x in range(side_length):
            if x < half - 1 and y < half - 1:
                continue
            cell_id = id_map[(x, y)]

            neighbors = []
            for dy in range(-1, 2):
                for dx in range(-1, 2):
                    if dx == dy == 0:
                        continue
                    neighbor_id = id_map.get((x + dx, y + dy))
                    # Compare against None explicitly: cell id 0 is a real
                    # cell and must not be dropped for being falsy.
                    if neighbor_id is not None:
                        neighbors.append(neighbor_id)

            if x < half and y < half:
                what = '.'
                revealed.append(cell_id)
            else:
                what = compressed[cindex] if compressed else ''
                cindex += 1
                # NOTE(review): original indentation was lost; colouring is
                # assumed to apply to unrevealed cells only -- confirm.
                # Descend `depth` levels, remapping the coordinates into
                # the sub-L the cell falls in; the last level picks the colour.
                temp_x, temp_y, threshold = x, y, side_length // 4
                for i in range(depth):
                    last_level = i == depth - 1
                    if (threshold <= temp_x < 3 * threshold
                            and threshold <= temp_y < 3 * threshold):
                        if last_level:
                            colors[0]['ids'].append(cell_id)
                        else:
                            temp_x -= threshold
                            temp_y -= threshold
                    elif temp_x < threshold * 2:
                        if last_level:
                            colors[1]['ids'].append(cell_id)
                        else:
                            temp_x, temp_y = (temp_y - threshold * 2,
                                              threshold * 2 - temp_x - 1)
                    elif temp_y < threshold * 2:
                        if last_level:
                            colors[2]['ids'].append(cell_id)
                        else:
                            temp_x, temp_y = (threshold * 2 - temp_y - 1,
                                              temp_x - threshold * 2)
                    else:
                        if last_level:
                            colors[3]['ids'].append(cell_id)
                        else:
                            temp_x -= threshold * 2
                            temp_y -= threshold * 2
                    threshold //= 2

            board.append([cell_id, what, neighbors])
            if compressed:
                nodes.append(
                    dict(
                        id=cell_id,
                        neighbors=neighbors,
                        position=(x, y),
                        has_mine=what == '*',
                        secret=what == '?',
                        revealed=cell_id in revealed,
                        points=points,
                    ))

    # Global constraint: total mines over all unrevealed cells.
    constraints.append([
        sum(c[1] == '*' for c in board),
        [c[0] for c in board if c[0] not in revealed]
    ])
    # One constraint per colour region; board is still in id order here,
    # so board[n] is the cell with id n.
    for color in colors:
        constraints.append(
            [sum(board[n][1] == '*' for n in color['ids']), color['ids']])

    num = 3 * side_length**2 // 4
    # Stable sort: unrevealed cells first, relative order otherwise kept.
    board.sort(key=lambda c: c[0] in revealed)

    if not compressed:
        return dict(
            num=num,
            board=board,
            revealed=revealed,
            constraints=constraints,
        )

    result = Puzzle(board, revealed, constraints).solve()
    scored = score(result, 'seqnum')
    title = f'L-shape {size}-{depth} with score {scored}'
    tile_text = 'L'
    return dict(
        title=title,
        tile_text=tile_text,
        nodes=nodes,
        columns=columns,
        colors=colors,
        scored=scored,
    )
return clf


def load_files():
    """Load the training and test tables from the working directory.

    Reads ``learning.tab`` and ``test.tab`` with ``np.loadtxt``.  In each
    table the first column is returned as the names (pairs), the last
    column as the target and everything in between as the features.

    Returns:
        ``(X_train, y_train, X_test, y_test, names_train, names_test)``
    """
    train = np.loadtxt("learning.tab")
    test = np.loadtxt("test.tab")
    X_train = train[:, 1:-1]
    y_train = train[:, -1]
    X_test = test[:, 1:-1]
    y_test = test[:, -1]
    # pairs
    names_train = train[:, 0]
    names_test = test[:, 0]
    return X_train, y_train, X_test, y_test, names_train, names_test


if __name__ == "__main__":
    # Fit on the training table, predict both tables, print the score of
    # the test predictions and save both prediction vectors.
    X_train, y_train, X_test, y_test, train, test = load_files()
    estimator = svc(X_train, y_train)
    y_train_predict = estimator.predict(X_train)
    y_test_predict = estimator.predict(X_test)
    # NOTE(review): score() only receives the predictions; y_test is loaded
    # but never used here -- confirm that is intended.
    print score(y_test_predict)
    np.savetxt("learning.pred.tab", y_train_predict)
    np.savetxt("test.pred.tab", y_test_predict)
def optimize(self, smiles, vina_conf, vina_log_path,
             log=None, mu=32, lam=64, generation=1000, seed=0, verbose=True):
    """Run a (mu + lambda) evolutionary search over molecules.

    Args:
        smiles: pool of SMILES strings the initial population is drawn from.
        vina_conf: configuration path passed to ``scorer.score_vina``.
        vina_log_path: log path passed to ``scorer.score_vina``.
        log: optional path; when set, all generated candidates are written
            there via ``self._log_file`` (best effort).
        mu: number of parents kept after every generation.
        lam: number of mutation attempts per generation.
        generation: number of generations to run.
        seed: seed for ``np.random``.
        verbose: print progress every 15 generations.

    Returns:
        List of ``(score, smiles, gene)`` tuples for every new candidate
        produced during the search.
    """
    np.random.seed(seed)
    gene_length = 300

    # Initialize population
    print("Initializing Population....")
    # Generation 0, start from input smiles
    initial_smiles = np.random.choice(smiles, mu + lam)
    initial_smiles = [util.canonicalize(s) for s in initial_smiles]
    initial_genes = [self.encode(s, max_len=gene_length)
                     for s in initial_smiles]
    initial_scores = []
    print(r"|0%--------------------50%-------------------100%|")
    for i, s in enumerate(initial_smiles):
        # Keep the score: without it the zip() below is empty and the
        # search starts (and immediately fails) with no population.
        initial_scores.append(
            scorer.score_vina(s, conf_path=vina_conf, log_path=vina_log_path))
        print("*" * int(50 * i / (mu + lam)), end='\r')
    print()

    population = [
        (score, smi, gene)
        for score, gene, smi in zip(initial_scores, initial_genes,
                                    initial_smiles)
    ]
    # Select top $mu$ smiles as generation 0
    population = sorted(population, key=lambda x: x[0], reverse=True)[:mu]

    # Start!
    print("Generation Start!")
    all_smiles = [p[1] for p in population]
    all_result = []
    for epoch in range(generation):
        new_population = []
        # For each mutation in each generation in range $lam$
        for _ in range(lam):
            # random select one smi/gene in top $mu$ smiles
            p = population[np.random.randint(mu)]
            p_gene = p[2]
            c_gene = util.mutation(p_gene)
            c_smiles = util.canonicalize(self.decode(c_gene))
            if c_smiles not in all_smiles:
                # NOTE(review): children are scored with scorer.score while
                # generation 0 uses scorer.score_vina -- confirm both are
                # on the same scale.
                c_score = scorer.score(c_smiles)
                c = (c_score, c_smiles, c_gene)
                new_population.append(c)
                all_smiles.append(c_smiles)
        population.extend(new_population)
        all_result.extend(new_population)
        population = sorted(population, key=lambda x: x[0],
                            reverse=True)[:mu]
        if epoch % 15 == 0 and verbose:
            # Log on screen
            self._log(epoch, population, population_size=len(all_smiles))

    print("\nFinished!")
    if log:
        # Writing the log is best effort, but must not swallow
        # KeyboardInterrupt / SystemExit the way a bare except would.
        try:
            self._log_file(log, all_result)
            print("Log file write into %s" % log)
        except Exception:
            print("Failed writing log to %s" % log)
    return all_result
def evaluate(model, device, eval_dataloader, eval_label_ids, num_labels, id2label, verbose=True, raw_data=None):
    """Evaluate ``model`` on the eval set and dump per-example analysis files.

    Args:
        model: model called as ``model(input_ids, segment_ids, input_mask,
            labels=None)`` and returning ``(logits, _)``.
        device: device the batches are moved to.
        eval_dataloader: yields ``(input_ids, input_mask, segment_ids,
            label_ids)`` batches.
        eval_label_ids: tensor of gold label ids for the whole eval set.
        num_labels: number of classes.
        id2label: mapping from label id to label string.
        verbose: passed to ``score`` and enables logging of the results.
        raw_data: the raw examples (each with an ``'id'``).  Despite the
            ``None`` default it is required: it is indexed with the
            wrong/correct index arrays and iterated below.

    Returns:
        ``(preds, result)``: predicted label ids and a dict of metrics
        (``compute_f1`` output plus ``accuracy`` and ``eval_loss``).

    Side effects: writes id / prediction text files, ``id2preds.json`` and
    ``spanbert_tacred.jsonl`` into a hard-coded directory.
    """
    model.eval()
    eval_loss = 0
    nb_eval_steps = 0
    loss_fct = CrossEntropyLoss()  # stateless, so one instance is enough
    batch_logits = []
    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)
        with torch.no_grad():
            logits, _ = model(input_ids, segment_ids, input_mask, labels=None)
        tmp_eval_loss = loss_fct(logits.view(-1, num_labels),
                                 label_ids.view(-1))
        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        # Collect per batch and concatenate once; appending to a growing
        # array re-copies everything on every batch.
        batch_logits.append(logits.detach().cpu().numpy())

    eval_loss = eval_loss / nb_eval_steps
    preds = np.argmax(np.concatenate(batch_logits, axis=0), axis=1).reshape(-1)
    pred_labels = [id2label[pred_id] for pred_id in preds]
    eval_labels = [
        id2label[label_id] for label_id in eval_label_ids.numpy().reshape(-1)
    ]

    _, indices = score(eval_labels, pred_labels, verbose=verbose)
    structure_parts = compute_structure_parts(raw_data)
    compute_structure_errors(structure_parts, preds=pred_labels,
                             gold_labels=eval_labels)

    wrong_indices = indices['wrong_indices']
    correct_indices = indices['correct_indices']
    wrong_relations = indices['wrong_predictions']
    correct_predictions = indices['correct_predictions']
    all_predictions = indices['all_predictions']
    # presumably raw_data is a numpy array, since it is indexed with index
    # collections here -- verify against the caller
    wrong_ids = [d['id'] for d in raw_data[wrong_indices]]
    correct_ids = [d['id'] for d in raw_data[correct_indices]]
    all_ids = [d['id'] for d in raw_data]

    print('Num Correct: {} | Num Wrong: {}'.format(len(correct_indices),
                                                   len(wrong_indices)))
    # The old message carried an unfilled '{}' placeholder.
    print('Wrong Predictions:')
    print(Counter(wrong_relations))

    # save_dir = os.path.join(cfg_dict['test_save_dir'], cfg_dict['id'])
    save_dir = '/home/ec2-user/apex/SpanBERT/indices_dir/tacred/'
    os.makedirs(save_dir, exist_ok=True)
    print('saving to: {}'.format(save_dir))
    np.savetxt(os.path.join(save_dir, 'correct_ids.txt'), correct_ids,
               fmt='%s')
    np.savetxt(os.path.join(save_dir, 'wrong_ids.txt'), wrong_ids, fmt='%s')
    np.savetxt(os.path.join(save_dir, 'wrong_predictions.txt'),
               wrong_relations, fmt='%s')
    np.savetxt(os.path.join(save_dir, 'correct_predictions.txt'),
               correct_predictions, fmt='%s')
    np.savetxt(os.path.join(save_dir, 'all_predictions.txt'),
               all_predictions, fmt='%s')
    np.savetxt(os.path.join(save_dir, 'all_ids.txt'), all_ids, fmt='%s')

    ids = [instance['id'] for instance in raw_data]
    formatted_data = []
    for instance_id, pred, gold in zip(ids, pred_labels, eval_labels):
        formatted_data.append({
            "id": instance_id.replace("'", '"'),
            "label_true": gold.replace("'", '"'),
            "label_pred": pred.replace("'", '"')
        })

    id2preds = {d['id']: pred for d, pred in zip(raw_data, pred_labels)}
    # Close the handle deterministically so the JSON is flushed to disk.
    with open(os.path.join(save_dir, 'id2preds.json'), 'w') as json_handle:
        json.dump(id2preds, json_handle)

    with open(os.path.join(save_dir, 'spanbert_tacred.jsonl'), 'w') as handle:
        print('Saving to: {}'.format(
            os.path.join(save_dir, 'spanbert_tacred.jsonl')))
        for instance in formatted_data:
            # Each line is the str() of the dict, not json.dumps output.
            line = "{}\n".format(instance)
            handle.write(line)

    result = compute_f1(preds, eval_label_ids.numpy())
    result['accuracy'] = simple_accuracy(preds, eval_label_ids.numpy())
    result['eval_loss'] = eval_loss
    if verbose:
        logger.info("***** Eval results *****")
        for key in sorted(result.keys()):
            logger.info(" %s = %s", key, str(result[key]))
    return preds, result
def compute_score(self):
    """Score this job entry according to the rules in scorefile.csv.

    Preprocesses the body string first, then stores the scorer's two
    results on the instance as ``score`` and ``score_hits``.
    """
    self.preprocess_bodystring()
    outcome = scorer.score(self.processed_tokens)
    self.score, self.score_hits = outcome
if v != '': continue if k in semantic_preditions: binary_predictions[k] = semantic_preditions[k] else: binary_predictions[k] = 'no_relation' predictions = [] for i in range(0, len(binary_predictions)): assert binary_predictions[i] != '' predictions.append(binary_predictions[i]) gold = [] data = open(y['gold_file'], 'r') for d in data: d = d.strip() gold.append(d) if not os.path.exists('saved_models/depot-all-recent/'): os.mkdir('saved_models/depot-all-recent/') out_file = 'saved_models/depot-all-recent/predictions.txt' out_f = open(out_file, 'w') for i, p in enumerate(predictions): out_f.write('%d %s\n' % (i, p)) p, r, f1 = scorer.score(gold, predictions, verbose=True) print("{} set evaluate result: {:.2f}\t{:.2f}\t{:.2f}".format( 'test', p, r, f1)) print("Evaluation ended.")
"""
Score a directory of cells and write the results to a file
"""
import cv2
import scorer

SECTOR_THRESHOLD = 25
NOISE_THRESHOLD = 32
CELL_DIR_PATH = 'img/all_cells/cells/'
RESULTS_FILE = 'img/actual_results.txt'

if __name__ == '__main__':
    cnt = 0
    # The context manager closes the results file even if scoring raises.
    with open(RESULTS_FILE, 'w') as results:
        # Cell images are numbered consecutively from 0; the first one
        # that cannot be read marks the end of the directory.
        while True:
            img = cv2.imread(CELL_DIR_PATH + str(cnt) + '.png')
            if img is None:
                break
            scorer.set_params(SECTOR_THRESHOLD, NOISE_THRESHOLD)
            score = scorer.score(img)
            # One score per line, followed by a single space.
            results.write('%s ' % score)
            results.write('\n')
            cnt += 1
newline='') writer = csv.writer(csvfile) writer.writerow(["sentence", "idx", "predict", "gold"]) for i in tqdm(range(len(error))): for j in range(len(error[i])): inside = 0 for k in range(len(error[i][j])): inside = 1 sentence = " ".join( [vocab.id2word[g] for g in error[i][j][k]["token"]]) sub = [] obj = [] for it, g in enumerate(error[i][j][k]["sub_pos"]): if g == 0 and it < len(error[i][j][k]["token"]): sub.append(vocab.id2word[error[i][j][k]["token"][it]]) for it, g in enumerate(error[i][j][k]["obj_pos"]): if g == 0 and it < len(error[i][j][k]["token"]): obj.append(vocab.id2word[error[i][j][k]["token"][it]]) predict = id2label[error[i][j][k]["preds"]] gold = id2label[error[i][j][k]["label"]] writer.writerow([sentence, idx, predict, gold]) if inside == 1: writer.writerow("") predictions = [id2label[p] for p in predictions] p, r, f1 = scorer.score(batch.gold(), predictions, verbose=True) print("{} set evaluate result: {:.2f}\t{:.2f}\t{:.2f}".format( args.dataset, p, r, f1)) print("Evaluation ended.")
6: 'classification', 7: 'classification', 8: 'classification', } for i in tests.keys(): start_total_time = time.time() folder = r'..\..\check_' + str(i) + '_' + tests[i][0] + '\\' argv = [ '--train-csv', folder + 'train.csv', '--test-csv', folder + 'test.csv', '--prediction-csv', folder + 'prediction.csv', '--test-target-csv', folder + 'test-target.csv', '--model-dir', '.', # '--nrows', '5000' if i in [3, 4, 5, 6, 7] else '500' if i in [8] else '-1', '--mode', tests[i]] args = parser.parse_args(argv) log('processing', folder) model_config = train(args) X, _, _ = preprocess_test_data(args, model_config) prediction = predict(X, model_config['model']) score(args, prediction=prediction) log('all datasets time: {}'.format(time.time() - start_total_time)) log_trail('=', '\n\n') except BaseException as e: log('EXCEPTION:', e) log(traceback.format_exc()) exit(1)
return clf


def load_files():
    """Load the training and test tables from the working directory.

    Reads ``learning.tab`` and ``test.tab`` with ``np.loadtxt``.  In each
    table the first column is returned as the names (pairs), the last
    column as the target and everything in between as the features.

    Returns:
        ``(X_train, y_train, X_test, y_test, names_train, names_test)``
    """
    train = np.loadtxt('learning.tab')
    test = np.loadtxt('test.tab')
    X_train = train[:, 1:-1]
    y_train = train[:, -1]
    X_test = test[:, 1:-1]
    y_test = test[:, -1]
    # pairs
    names_train = train[:, 0]
    names_test = test[:, 0]
    return X_train, y_train, X_test, y_test, names_train, names_test


if __name__ == '__main__':
    # Fit on the training table, predict both tables, print the score of
    # the test predictions and save both prediction vectors.
    X_train, y_train, X_test, y_test, train, test = load_files()
    estimator = svc(X_train, y_train)
    y_train_predict = estimator.predict(X_train)
    y_test_predict = estimator.predict(X_test)
    # NOTE(review): score() only receives the predictions; y_test is loaded
    # but never used here -- confirm that is intended.
    print score(y_test_predict)
    np.savetxt('learning.pred.tab', y_train_predict)
    np.savetxt('test.pred.tab', y_test_predict)
def train(self):
    """Train for ``settings.epochs`` epochs with optional dev-set early stopping.

    After every epoch the loss and throughput are printed.  Unless
    ``settings.disable_val_eval`` is set, the model is evaluated on the
    validation data; a new best primary F1 saves ``best_model.save``, and
    training stops once ``settings.early_stopping`` epochs have passed
    without improvement (when that setting is positive).  Every epoch ends
    with a checkpoint: a timestamped file when ``settings.save_every`` is
    set, otherwise ``last_epoch.save``.

    NOTE(review): the original indentation was lost; the early-stopping
    logic is assumed to sit inside the validation branch, since it needs
    ``f1`` -- confirm.
    """
    settings = self.settings
    print("Training is starting for {} epochs using ".format(
        settings.epochs) + "{} with the following settings:".format(self.device))
    print()
    for key, val in settings.__dict__.items():
        print("{}: {}".format(key, val))
    print(flush=True)
    train_dataloader = self._init_training_data(settings.train)
    best_f1 = 0
    best_f1_epoch = 1 + self.epoch_offset
    # Epoch numbers continue from epoch_offset.
    for epoch in range(1 + self.epoch_offset, settings.epochs + 1 + self.epoch_offset):
        start_time = time.time()
        total_loss, sequences_trained = self._run_train_epoch(
            train_dataloader, epoch, not settings.quiet,
            not settings.disable_gradient_clip)
        total_time = round(time.time() - start_time, 2)
        print("#" * 50)
        print("Epoch {}".format(epoch))
        print("loss {}".format(total_loss))
        # NOTE(review): total_time is rounded to two decimals, so an epoch
        # faster than 5 ms would divide by zero here.
        print("execution time {}s".format(total_time) \
            + " ({} trained sequences/s)".format(round(sequences_trained/(total_time))))
        print("#" * 50, flush=True)
        if not settings.disable_val_eval:
            entries, predicted, other_predicted = self.predict(
                settings.val, settings.elmo_dev)
            #a,d,b,c = zip(*((entry[0], len(entry[4]), entry[1].numpy().shape, predicted[entry[0]].numpy().shape) for entry in entries))
            #print([(x,w,y,z) for x,w,y,z in zip(a,d,b,c) if y!=z])
            # Pair gold and predicted matrices per entry and hand the two
            # resulting sequences to the scorer.
            f1, _ = sc.score(*zip(*((entry[1][self.pt].numpy(), predicted[entry[0]].numpy()) for entry in entries)))
            print("Primary Dev F1 on epoch {} is {:.2%}".format(epoch, f1))
            if len(other_predicted) > 0:
                other_f1, _ = sc.score(*zip(
                    *((entry[1][self.ot].numpy(), other_predicted[entry[0]].numpy()) for entry in entries)))
                print("Secondary Dev F1 on epoch {} is {:.2%}".format(
                    epoch, other_f1))
            #f1 = sc.score()
            # Only the primary F1 drives model selection and early stopping.
            improvement = f1 > best_f1
            elapsed = epoch - best_f1_epoch
            es_active = settings.early_stopping > 0
            if (es_active and not improvement and elapsed == settings.early_stopping):
                print("Have not seen any improvement for {} epochs".format(
                    elapsed))
                print("Best F1 was {} seen at epoch #{}".format(
                    best_f1, best_f1_epoch))
                break
            else:
                if improvement:
                    best_f1 = f1
                    best_f1_epoch = epoch
                    print("Saving {} model".format(best_f1_epoch))
                    self.save("best_model.save", epoch)
                else:
                    print("Have not seen any improvement for {} epochs".
                          format(elapsed))
                    print("Best F1 was {:.2%} seen at epoch #{}".format(
                        best_f1, best_f1_epoch))
        if settings.enable_train_eval:
            # Optional sanity check: the same metrics on the training data.
            entries, predicted, other_predicted = self.predict(
                settings.train, settings.elmo_train)
            train_f1, _ = sc.score(*zip(*((entry[1][self.pt].numpy(), predicted[entry[0]].numpy()) for entry in entries)))
            print("Sem Train F1 on epoch {} is {:.2%}".format(
                epoch, train_f1))
            if len(other_predicted) > 0:
                other_train_f1, _ = sc.score(*zip(
                    *((entry[1][self.ot].numpy(), other_predicted[entry[0]].numpy()) for entry in entries)))
                print("Syn Train F1 on epoch {} is {:.2%}".format(
                    epoch, other_train_f1))
        if settings.save_every:
            # Timestamped name, so every epoch's checkpoint is kept.
            self.save("{}_epoch{}.save".format(int(time.time()), epoch), epoch)
        else:
            self.save("last_epoch.save", epoch)
import solver
import parser
import time
import glob
#import optimize

# NOTE(review): `scorer` and `writer` are used below but are not imported in
# the visible code -- confirm they are brought into scope elsewhere.

#solution = solver.solve(parser.parse('datasets/a_example.txt'))
#print(solution)

# Solve every dataset in name order, time the solver, score the solution and
# write it out.
for idx, filename in enumerate(sorted(glob.glob('datasets/*'))):
    dataset = parser.parse(filename)
    start_time = time.time()
    solution = solver.solve(dataset)
    # print(solution)
    print("--- %.10f seconds ---" % (time.time() - start_time))
    score = scorer.score(solution, dataset)
    #print('Score for %s: %s (%s pizzas for %s person)' % (
    #    filename[9:], score, dataset['nOfPizzas'], 2*dataset['nOfTwo']+3*dataset['nOfThree']+4*dataset['nOfFour']))
    # filename[9] is the character right after the 'datasets/' prefix
    # (the dataset letter for names like 'datasets/a_example.txt').
    writer.writing(solution, filename[9] + '.txt')
    #Optimization Part here
    #opt_solution = optimize.solve(dataset)
    #print(opt_solution)
    #score = opt_scorer.score(opt_solution, dataset)

# dataset = parser.parse('datasets/a_example')
# solution = solver.solve(dataset)
# print(solution)
# score = scorer.score(solution, dataset)
# print("Score =",score)
def train():
    """Train the model and track averaged scores over 100 dev/test splits.

    Loads the training data and 100 dev/test loaders from
    ``FLAGS.data_dir/cv``, builds a training model and a weight-sharing
    evaluation model, then for every epoch trains on all batches, evaluates
    all 100 dev and test splits through ``scorer.score`` and averages the
    results.  The model is checkpointed whenever the averaged dev F score
    reaches a new best.  Temporary key/prediction files are removed at the
    end.

    NOTE(review): Python 2 code (print statements, ``xrange``,
    ``iteritems``); block nesting was reconstructed from a
    whitespace-collapsed source.
    """
    # print training info
    print _get_training_info()
    # dealing with files
    print "Loading data from files..."
    train_loader = data_utils.DataLoader(
        os.path.join(FLAGS.data_dir, 'train.vocab%d.id' % FLAGS.vocab_size),
        FLAGS.batch_size,
        FLAGS.sent_len,
        subsample=FLAGS.subsample,
        unk_prob=FLAGS.corrupt_rate
    )  # use a subsample of the data if specified
    # load cv dataset
    dev_loaders = []
    test_loaders = []
    for i in range(100):
        dev_loader = data_utils.DataLoader(
            os.path.join(FLAGS.data_dir, 'cv',
                         'dev.vocab%d.id.%d' % (FLAGS.vocab_size, i)),
            FLAGS.batch_size, FLAGS.sent_len)
        test_loader = data_utils.DataLoader(
            os.path.join(FLAGS.data_dir, 'cv',
                         'test.vocab%d.id.%d' % (FLAGS.vocab_size, i)),
            FLAGS.batch_size, FLAGS.sent_len)
        dev_loaders.append(dev_loader)
        test_loaders.append(test_loader)
    max_steps = train_loader.num_batches * FLAGS.num_epoch
    print "# Examples in training data:"
    print train_loader.num_examples
    # load label2id mapping and create inverse mapping
    label2id = data_utils.LABEL_TO_ID
    id2label = dict([(v, k) for k, v in label2id.iteritems()])
    # The random key prefixes the temp file names written into train_dir.
    key = random.randint(1e5, 1e6 - 1)  # get a random 6-digit int
    test_key_file_list = []
    test_prediction_file_list = []
    dev_key_file_list = []
    dev_prediction_file_list = []
    for i in range(100):
        test_key_file = os.path.join(
            FLAGS.train_dir,
            str(key) + '.shuffled.test.key.tmp.%d' % i)
        test_prediction_file = os.path.join(
            FLAGS.train_dir,
            str(key) + '.shuffled.test.prediction.tmp.%d' % i)
        dev_key_file = os.path.join(FLAGS.train_dir,
                                    str(key) + '.shuffled.dev.key.tmp.%d' % i)
        dev_prediction_file = os.path.join(
            FLAGS.train_dir,
            str(key) + '.shuffled.dev.prediction.tmp.%d' % i)
        test_key_file_list.append(test_key_file)
        test_prediction_file_list.append(test_prediction_file)
        dev_key_file_list.append(dev_key_file)
        dev_prediction_file_list.append(dev_prediction_file)
        # Gold keys are written once up front; only the prediction files
        # are rewritten every epoch.
        test_loaders[i].write_keys(test_key_file, id2label=id2label)
        dev_loaders[i].write_keys(dev_key_file, id2label=id2label)
    with tf.Graph().as_default():
        print "Constructing model %s..." % (FLAGS.model)
        with tf.variable_scope('model', reuse=None):
            m = _get_model(is_train=True)
        # Second model in the same variable scope with reuse=True, used for
        # evaluation only.
        with tf.variable_scope('model', reuse=True):
            mdev = _get_model(is_train=False)
        saver = tf.train.Saver(tf.all_variables(), max_to_keep=2)
        save_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        # NOTE(review): `config` is never used; the session builds its own
        # ConfigProto below.
        config = tf.ConfigProto()
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=FLAGS.gpu_mem, allow_growth=True)
        sess = tf.Session(config=tf.ConfigProto(device_count={"GPU": 1},
                                                gpu_options=gpu_options))
        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                                graph=sess.graph)
        sess.run(tf.initialize_all_variables())
        if FLAGS.use_pretrain:
            print "Use pretrained embeddings to initialize model ..."
            emb_file = os.path.join(
                FLAGS.data_dir,
                "emb-v%d-d%d.npy" % (FLAGS.vocab_size, FLAGS.hidden_size))
            if not os.path.exists(emb_file):
                raise Exception("Pretrained vector file does not exist at: " +
                                emb_file)
            pretrained_embedding = np.load(emb_file)
            m.assign_embedding(sess, pretrained_embedding)
        current_lr = FLAGS.init_lr
        global_step = 0
        training_history = []
        dev_f_history = []
        test_f_history = []
        best_dev_scores = []
        best_test_scores = []

        def eval_once(mdev, sess, data_loader):
            # Run the eval model over every batch of data_loader; returns
            # (mean batch loss, predictions, confidences).
            data_loader.reset_pointer()
            predictions = []
            confidences = []
            dev_loss = 0.0
            for _ in xrange(data_loader.num_batches):
                x_batch, y_batch, x_lens = data_loader.next_batch()
                feed = _get_feed_dict(mdev, x_batch, y_batch, x_lens,
                                      use_pos=(FLAGS.pos_size > 0),
                                      use_ner=(FLAGS.ner_size > 0),
                                      use_deprel=(FLAGS.deprel_size > 0))
                loss_value, pred, conf = sess.run(
                    [mdev.loss, mdev.prediction, mdev.confidence],
                    feed_dict=feed)
                predictions += list(pred)
                confidences += list(conf)
                dev_loss += loss_value
            dev_loss /= data_loader.num_batches
            return dev_loss, predictions, confidences

        print "Start training with %d epochs, and %d steps per epoch..." % (
            FLAGS.num_epoch, train_loader.num_batches)
        for epoch in xrange(FLAGS.num_epoch):
            train_loss = 0.0
            train_loader.reset_pointer()
            m.assign_lr(sess, current_lr)
            for _ in xrange(train_loader.num_batches):
                global_step += 1
                start_time = time.time()
                x_batch, y_batch, x_lens = train_loader.next_batch()
                feed = _get_feed_dict(m, x_batch, y_batch, x_lens,
                                      use_pos=(FLAGS.pos_size > 0),
                                      use_ner=(FLAGS.ner_size > 0),
                                      use_deprel=(FLAGS.deprel_size > 0))
                _, loss_value = sess.run([m.train_op, m.loss], feed_dict=feed)
                duration = time.time() - start_time
                train_loss += loss_value
                assert not np.isnan(loss_value), "Model loss is NaN."
                if global_step % FLAGS.log_step == 0:
                    format_str = (
                        '%s: step %d/%d (epoch %d/%d), loss = %.6f (%.3f sec/batch), lr: %.6f'
                    )
                    print format_str % (datetime.now(), global_step,
                                        max_steps, epoch + 1, FLAGS.num_epoch,
                                        loss_value, duration, current_lr)
            # summary loss after each epoch
            train_loss /= train_loader.num_batches
            summary_writer.add_summary(_summary_for_scalar(
                'eval/training_loss', train_loss), global_step=epoch)
            # do CV on test set and use average score
            avg_dev_loss = 0.0
            avg_test_loss = 0.0
            avg_dev_f = 0.0
            avg_dev_p = 0.0
            avg_dev_r = 0.0
            avg_test_f = 0.0
            avg_test_p = 0.0
            avg_test_r = 0.0
            for i in range(100):
                dev_loss, dev_preds, dev_confs = eval_once(
                    mdev, sess, dev_loaders[i])
                avg_dev_loss += dev_loss
                summary_writer.add_summary(_summary_for_scalar(
                    'eval/dev_loss%d' % i, dev_loss), global_step=epoch)
                _write_prediction_file(dev_preds, dev_confs, id2label,
                                       dev_prediction_file_list[i])
                # print "Evaluating on dev set..."
                dev_prec, dev_recall, dev_f = scorer.score(
                    dev_key_file_list[i], [dev_prediction_file_list[i]],
                    FLAGS.f_measure)
                avg_dev_f += dev_f
                avg_dev_p += dev_prec
                avg_dev_r += dev_recall
                test_loss, test_preds, test_confs = eval_once(
                    mdev, sess, test_loaders[i])
                avg_test_loss += test_loss
                summary_writer.add_summary(_summary_for_scalar(
                    'eval/test_loss%d' % i, test_loss), global_step=epoch)
                _write_prediction_file(test_preds, test_confs, id2label,
                                       test_prediction_file_list[i])
                # print "Evaluating on test set..."
                test_prec, test_recall, test_f = scorer.score(
                    test_key_file_list[i], [test_prediction_file_list[i]],
                    FLAGS.f_measure)
                avg_test_f += test_f
                avg_test_p += test_prec
                avg_test_r += test_recall
            avg_dev_loss /= 100
            avg_test_loss /= 100
            avg_dev_f /= 100
            avg_dev_p /= 100
            avg_dev_r /= 100
            avg_test_f /= 100
            avg_test_p /= 100
            avg_test_r /= 100
            print "Epoch %d: training_loss = %.6f" % (epoch + 1, train_loss)
            print "Epoch %d: avg_dev_loss = %.6f, avg_dev_f-%g = %.6f" % (
                epoch + 1, avg_dev_loss, FLAGS.f_measure, avg_dev_f)
            print "Epoch %d: avg_test_loss = %.6f, avg_test_f-%g = %.6f" % (
                epoch + 1, avg_test_loss, FLAGS.f_measure, avg_test_f)
            # decrease learning rate if dev_f does not increase after an epoch
            # (only once more than 10 epochs of history exist)
            if len(dev_f_history) > 10 and avg_dev_f <= dev_f_history[-1]:
                current_lr *= FLAGS.lr_decay
            training_history.append(train_loss)
            # save the model when best f score is achieved on dev set
            if len(dev_f_history) == 0 or (len(dev_f_history) > 0 and
                                           avg_dev_f > max(dev_f_history)):
                saver.save(sess, save_path, global_step=epoch)
                print "\tmodel saved at epoch %d, with best dev dataset f-%g score %.6f" % (
                    epoch + 1, FLAGS.f_measure, avg_dev_f)
                best_dev_scores = [avg_dev_p, avg_dev_r, avg_dev_f]
                best_test_scores = [avg_test_p, avg_test_r, avg_test_f]
            dev_f_history.append(avg_dev_f)
            test_f_history.append(avg_test_f)
            # stop learning if lr is too low
            if current_lr < 1e-6:
                break
        # saver.save(sess, save_path, global_step=epoch)
        # NOTE(review): `epoch` is the zero-based index of the last epoch
        # run, so this reports one less than the number of epochs -- confirm.
        print "Training ended with %d epochs." % epoch
        print "\tBest dev scores achieved (P, R, F-%g):\t%.3f\t%.3f\t%.3f" % tuple(
            [FLAGS.f_measure] + [x * 100 for x in best_dev_scores])
        print "\tBest test scores achieved on best dev scores (P, R, F-%g):\t%.3f\t%.3f\t%.3f" % tuple(
            [FLAGS.f_measure] + [x * 100 for x in best_test_scores])
    # clean up
    for dev_key_file, dev_prediction_file, test_key_file, test_prediction_file in zip(
            dev_key_file_list, dev_prediction_file_list, test_key_file_list,
            test_prediction_file_list):
        if os.path.exists(dev_key_file):
            os.remove(dev_key_file)
        if os.path.exists(dev_prediction_file):
            os.remove(dev_prediction_file)
        if os.path.exists(test_key_file):
            os.remove(test_key_file)
        if os.path.exists(test_prediction_file):
            os.remove(test_prediction_file)
def evaluate():
    """Evaluate a restored checkpoint on the ``FLAGS.eval_set`` split.

    Builds the model with ``is_train=False``, restores the checkpoint
    reported by ``tf.train.get_checkpoint_state(FLAGS.train_dir)``, runs
    every full batch of the data loader plus the trailing residual batch
    through the model, writes the key, prediction and probability files into
    ``FLAGS.train_dir`` and passes the key and prediction files to
    ``scorer.score``.  When ``FLAGS.cleanup`` is set, the key and prediction
    files are removed afterwards; the probability file is left on disk.

    Raises:
        IOError: if no usable checkpoint is found in ``FLAGS.train_dir``.
    """
    # Single-argument print(...) is used throughout: it produces the same
    # output under the Python 2 print statement as under Python 3.
    print("Building graph and loading model...")
    with tf.Graph().as_default():
        # the first model will be doing the full batches
        # (a residual of examples will be left)
        with tf.variable_scope('model'):
            m = _get_model(is_train=False)
        saver = tf.train.Saver(tf.all_variables())

        session_config = tf.ConfigProto(
            device_count={"GPU": 1},
            gpu_options=tf.GPUOptions(allow_growth=True))
        # Use the session as a context manager so it is always closed, even
        # when restoring or evaluating raises part-way through.
        with tf.Session(config=session_config) as sess:
            ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
            if not (ckpt and ckpt.model_checkpoint_path):
                raise IOError("Loading checkpoint file failed!")
            saver.restore(sess, ckpt.model_checkpoint_path)

            print("====> Evaluating on %s data" % FLAGS.eval_set)
            print("Loading %s data..." % FLAGS.eval_set)
            # Loading the data with batch_size 1 would avoid the residual
            # handling below, but it is too slow.
            data_path = os.path.join(
                FLAGS.data_dir,
                '%s.vocab%d.id' % (FLAGS.eval_set, FLAGS.vocab_size))
            loader = data_utils.DataLoader(
                data_path, FLAGS.batch_size, FLAGS.sent_len)

            # load label2id mapping and create inverse mapping
            id2label = {v: k for k, v in data_utils.LABEL_TO_ID.items()}

            test_key_file = os.path.join(
                FLAGS.train_dir, 'shuffled.%s.key.tmp' % FLAGS.eval_set)
            test_prediction_file = os.path.join(
                FLAGS.train_dir, 'shuffled.%s.prediction.tmp' % FLAGS.eval_set)
            test_prob_file = os.path.join(
                FLAGS.train_dir, 'shuffled.%s.probs.tmp' % FLAGS.eval_set)
            # write shuffled key to file, used by scorer
            loader.write_keys(test_key_file, id2label=id2label,
                              include_residual=True)

            preds, confs = [], []

            def run_batch(x, y, x_lens):
                """Run one batch, collect its predictions and confidences,
                and return ``(loss, probs)`` for the caller to place."""
                feed = _get_feed_dict(m, x, y, x_lens,
                                      use_pos=(FLAGS.pos_size > 0),
                                      use_ner=(FLAGS.ner_size > 0),
                                      use_deprel=(FLAGS.deprel_size > 0))
                loss_value, predictions, confidences, probs = sess.run(
                    [m.loss, m.prediction, m.confidence, m.probs],
                    feed_dict=feed)
                preds.extend(predictions)
                confs.extend(confidences)
                return loss_value, probs

            test_loss = .0
            print("Evaluating on %d test examples with full batch..." % (
                loader.num_batches * loader.batch_size))
            all_probs = np.zeros([loader.num_examples, FLAGS.num_class])
            for i in range(loader.num_batches):
                x, y, x_lens = loader.next_batch()
                loss_value, probs = run_batch(x, y, x_lens)
                test_loss += loss_value
                start = i * loader.batch_size
                all_probs[start:start + loader.batch_size, :] = probs

            # the examples left over after the full batches are evaluated
            # as one final batch
            if loader.num_residual > 0:
                print("Evaluating on an residual of %d examples..."
                      % loader.num_residual)
                x, y, x_lens = loader.get_residual()
                loss_value, probs = run_batch(x, y, x_lens)
                test_loss += loss_value
                all_probs[loader.num_batches * loader.batch_size:, :] = probs

            if not FLAGS.use_confidence:
                # Confidences are replaced by a constant when disabled.
                confs = [1.0] * len(confs)

            _write_prediction_file(preds, confs, all_probs, id2label,
                                   test_prediction_file, test_prob_file)
            test_loss /= loader.num_examples
            print("%s: test_loss = %.6f" % (datetime.now(), test_loss))

            # The returned (precision, recall, f1) values are not used here.
            scorer.score(test_key_file, [test_prediction_file], verbose=True)

            # clean up (the probability file is intentionally not removed)
            if FLAGS.cleanup:
                for tmp_file in (test_key_file, test_prediction_file):
                    if os.path.exists(tmp_file):
                        os.remove(tmp_file)