def perform_training_and_testing(training_stage, args, data):
    '''Compile the model, optionally load pretrained parameters, then run
    training and/or testing as requested by flags on `args`.

    Parameters
    ----------
    training_stage : label used only to tag the outer timing block.
    args : namespace carrying `perform_training` / `perform_testing` flags
        plus whatever `get_train_test_namespace` and the loaders need.
    data : dataset namespace passed through to training/testing.

    Returns
    -------
    The validation error — a quantity that we want to minimize.
    '''
    stats = None
    with rasengan.tictoc(training_stage):
        with rasengan.debug_support():
            if args.perform_training or args.perform_testing:
                # Compile the computation graph once; it is shared by the
                # training and testing branches below.
                with rasengan.tictoc("Circuit Compilation"):
                    ttns = get_train_test_namespace(args)
                with rasengan.tictoc("Loading Parameters"):
                    load_params_from_pklfile(ttns, args)
                    pass
                rasengan.decrease_print_indent()
                print_pklfn_performance(args)
                rasengan.increase_print_indent()
            # Train
            if args.perform_training:
                with rasengan.tictoc("Training"):
                    stats = lstm_seqlabel_training.training(args, data, ttns)
            # Test (IF asked)
            if args.perform_testing:
                with rasengan.tictoc("Testing"):
                    stats = lstm_seqlabel_validation.testing(args, data, ttns)
                # NOTE(review): this early return assumes `testing` returns a
                # scalar score, and it makes the dict-handling fallthrough
                # below unreachable whenever testing was performed — confirm
                # this is intentional.
                return (100 - stats)
    if stats is None:
        # Neither training nor testing ran; report the worst error (100).
        return 100
    else:
        # `training` presumably returns a dict of per-epoch validation
        # results — convert the best epoch's f1 into an error to minimize.
        best_epoch_id = stats['best_epoch_id']
        return (100 - stats['validation_result'][best_epoch_id]['f1'])
def perform_training_and_testing(training_stage, args, data):
    '''Compile the model, optionally load pretrained parameters, then run
    training and/or testing as requested by flags on `args`.

    NOTE(review): this is a byte-identical duplicate of an earlier
    definition in this source; the later definition shadows the earlier
    one at import time — consider deleting one copy.

    Returns
    -------
    The validation error — a quantity that we want to minimize.
    '''
    stats = None
    with rasengan.tictoc(training_stage):
        with rasengan.debug_support():
            if args.perform_training or args.perform_testing:
                # Compile the computation graph once; it is shared by the
                # training and testing branches below.
                with rasengan.tictoc("Circuit Compilation"):
                    ttns = get_train_test_namespace(args)
                with rasengan.tictoc("Loading Parameters"):
                    load_params_from_pklfile(ttns, args)
                    pass
                rasengan.decrease_print_indent()
                print_pklfn_performance(args)
                rasengan.increase_print_indent()
            # Train
            if args.perform_training:
                with rasengan.tictoc("Training"):
                    stats = lstm_seqlabel_training.training(args, data, ttns)
            # Test (IF asked)
            if args.perform_testing:
                with rasengan.tictoc("Testing"):
                    stats = lstm_seqlabel_validation.testing(args, data, ttns)
                # NOTE(review): early return assumes `testing` returns a
                # scalar; it skips the dict-handling code below — confirm.
                return (100 - stats)
    if stats is None:
        # Neither training nor testing ran; report the worst error (100).
        return 100
    else:
        # `training` presumably returns per-epoch validation results —
        # convert the best epoch's f1 into an error to minimize.
        best_epoch_id = stats['best_epoch_id']
        return (100 - stats['validation_result'][best_epoch_id]['f1'])
def main(): import transducer_score args = transducer_score.args set_dropout_to_zero(args) data = transducer_score.data #--------------------------# # Compile disparate models # #--------------------------# models = [] for pkl_fn, changes in pkl_to_combine: args_clone = rasengan.Namespace(**args) #--------------------# # Update args_clone. # #--------------------# rasengan.warn('NOTE: Seting pretrained_param_pklfile') args_clone.pretrained_param_pklfile = pkl_fn for (k,v) in changes.items(): setattr(args_clone, k, v) print 'Setting args_clone.%s=%s'%(k,str(v)) #---------------------# # Compile args_clone. # #---------------------# ttns_i = rasengan.Namespace('ttns').update_and_append_prefix( compile_args(args_clone), 'test_') load_params_from_pklfile_to_stack_config( pkl_fn, ttns_i.test_stack_config) models.append(ttns_i) #----------------------------# # Aggregate disparate model. # #----------------------------# ttns = Aggregator(models, data) #-----------------------------------------------# # Test performance of Aggregated decision rule. # #-----------------------------------------------# with rasengan.debug_support(): stats_valid = args.validate_predictions_f( data.valid_lex, data.idx2label, args, ttns.test_f_classify, data.valid_y, data.words_valid, fn='/combined.valid.txt') print 'stats_valid', stats_valid
    def test_entity_descriptors(self):
        """Exercise catpeople_preprocessor.entity_descriptors on one
        hand-annotated dependency parse and on the shared fixtures
        imported from test_identify_governors.
        """
        with rasengan.debug_support():
            entity_descriptors = catpeople_preprocessor.entity_descriptors
            from test_identify_governors import l

            # Decode a list of token indices `idi` back to surface strings
            # through the token map TM.
            def decode(s, idi):
                return TM[[s[_] for _ in idi]]
            # Hand-annotated sentence: tokens, dependency heads, dependency
            # labels and coarse POS tags.
            sentence, parent, label, ctags = (
                # 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
                TM(['the', 'musical', 'august', '30th', ',', '2009', '|',
                    'author', ':', 'operator', 'shrek', 'the', 'musical',
                    'is', 'a', 'musical', 'with', 'music', 'by', 'jeanine',
                    'tesori', 'and', 'a', 'book', 'and', 'lyrics', 'by',
                    'david', 'lindsay-abaire', '.']),
                (4, 4, 4, 16, 4, 4, 4, 4, 4, 11, 4, 13, 16, 16, 16, 0, 16,
                 17, 18, 21, 19, 18, 24, 18, 24, 24, 24, 29, 27, 16),
                LABELMAP(['det', 'amod', 'amod', 'dep', 'punct', 'amod',
                          'punct', 'appos', 'punct', 'nn', 'dep', 'det',
                          'nsubj', 'cop', 'det', 'ROOT', 'prep', 'pobj',
                          'prep', 'nn', 'pobj', 'cc', 'det', 'conj', 'cc',
                          'conj', 'prep', 'nn', 'pobj', 'punct']),
                CTMAP(['DET', 'ADJ', 'ADJ', 'NOUN', '.', 'NUM', '.', 'NOUN',
                       '.', 'NOUN', 'VERB', 'DET', 'NOUN', 'VERB', 'DET',
                       'ADJ', 'ADP', 'NOUN', 'ADP', 'NOUN', 'NOUN', 'CONJ',
                       'DET', 'NOUN', 'CONJ', 'NOUN', 'ADP', 'NOUN', 'NOUN',
                       '.']))
            # NOTE(review): `expectEqual` is not a standard unittest
            # assertion (unittest provides `assertEqual`); presumably the
            # enclosing TestCase subclass defines it — confirm.
            referents = [27, 28]
            self.expectEqual(decode(sentence, entity_descriptors(
                sentence, parent, label, ctags, referents)), ['book'])
            referents = [19, 20]
            self.expectEqual(decode(sentence, entity_descriptors(
                sentence, parent, label, ctags, referents)), ['music'])
            referents = [10, 11, 12]
            self.expectEqual(decode(sentence, entity_descriptors(
                sentence, parent, label, ctags, referents)),
                ['music', 'jeanine', 'tesori', 'musical'])
            # Expected descriptor words for each (referents, parse) pair in
            # the imported fixture list `l` (even indices are referents,
            # odd indices are parses).
            expected_output = [['executive'], ['family', 'book'], ['lied'],
                               ['missing'],
                               ['66th', 'united', 'states', 'secretary',
                                'state'],
                               ['left'],
                               ['efforts', '750-page', 'book'],
                               ['ousted', 'board', 'directors', 'hp'],
                               [],
                               ['stressed', 'points', 'book'],
                               ['dabbled', 'politics', 'dismissed', 'board'],
                               [],
                               ['attended', 'dinner'],
                               ['told', 'ceo', 'troops', 'hewlett',
                                'packard'],
                               ['looks', 'beautiful'],
                               ['knew'],
                               ['writes', 'career'], ]
            for referents, parse, ep in zip(l[0::2], l[1::2],
                                            expected_output):
                sentence, parent, label, ctags = convert(parse)
                self.expectEqual(decode(sentence, entity_descriptors(
                    sentence, parent, label, ctags, referents)), ep)
            return
def main():
    """Scatter-plot AUPR vs MRR for several experiment configurations and
    save the figure to args.out_fn.
    """
    if args.out_fn is None:
        # Derive the output file name from the translated titles of all
        # requested configurations.
        basename = ''.join(
            [translate(get_ppcfg_title(e)[1]) for e in args.pptitle])
        args.out_fn = 'figures/%s.pdf' % (basename)
    with debug_support():
        fig = plt.figure(
            figsize=(args.figsize_x, args.figsize_y))
        # give plots a rectangular frame
        ax = fig.add_subplot(111)
        label_to_artists = {}
        cm = plt.get_cmap(args.cmap)
        for pptitle_idx, pptitle in enumerate(args.pptitle):
            ppcfg, title = get_ppcfg_title(pptitle)
            aupr, mrr, _, C, shape = get_stats(ppcfg,
                                               expcfg_str=args.expcfg_str)
            for (a, m, c_, s) in zip(aupr, mrr, C, shape):
                # One colormap sample per configuration; sampled at
                # 0.1 * index, so presumably <= 10 configurations — confirm.
                c = cm(.1 * pptitle_idx)
                print a, m, c, c_, s
                if s == 'circle':
                    # Circles mark test-set points.
                    label = 'Test %s' % title
                    label_to_artists[label] = ax.add_artist(
                        plt.Circle((a, m), .005, color=c, alpha=0.7,
                                   label=label))
                else:
                    # Rectangles mark train-set points.
                    label = 'Train %s' % title
                    label_to_artists[label] = ax.add_artist(
                        plt.Rectangle((a - .005, m - .005), 0.01, 0.01,
                                      color=c, alpha=0.7, label=label))
                # Annotate each point with the shortened title and C value.
                # (`.000 * round(rand())` is always 0 — leftover jitter?)
                plt.text(
                    a + .01, m + .000 * round(rand()),
                    '%s %.1f' % (title.replace('Hinge ', ''), c_),
                    fontsize=2,
                    # verticalalignment='top',
                    alpha=0.7)
            # Axis limits default to just below the smallest data point
            # unless overridden on the command line.
            plt.xlim(xmin=min(aupr) - 0.05 if args.xmin is None
                     else args.xmin)
            plt.ylim(ymin=min(mrr) - 0.05 if args.ymin is None
                     else args.ymin)
            plt.xlabel('AUPR')
            plt.ylabel('MRR')
            plt.title('Various Feature Sets at Different C')
            plt.grid(True)
            continue
        # Build a deterministically-ordered legend from the artists.
        label, handle = zip(
            *sorted(label_to_artists.items(), key=lambda x: x[0]))
        plt.legend([getline2d(e) for e in handle], label,
                   loc='lower right', numpoints=1)
        pass
        print 'Saving file', args.out_fn
        plt.savefig(args.out_fn)
        plt.close()
    return
# NOTE(review): this chunk begins mid-scope — `dcr2emb[e] = ...` reads like
# the tail of a loop whose header is outside this view; `e`, `dcr2emb`,
# `scale_to_unit`, `args` and `debug_support` are bound elsewhere.
dcr2emb[e] = scale_to_unit(dcr2emb[e])
cat2mode = get_cat2mode()
# Candidate transforms of a raw count `x` out of a total `t`; exactly one
# is selected by name through args.cnt_transform below.
CONSTANT = (lambda x, t: 1)
COUNT = (lambda x, t: x)
LOG_COUNT = (lambda x, t: math.log(1 + x))
SQRT_COUNT = (lambda x, t: math.sqrt(x))
# Add-one smoothing on both numerator and denominator.
FREQ = (lambda x, t: float(x + 1) / (t + 1))
SQ_FREQ = (lambda x, t: (float(x + 1) / (t + 1))**2)
SQRT_FREQ = (lambda x, t: math.sqrt(float(x + 1) / (t + 1)))
PROD_SQRT_FREQ_SQRT_COUNT = (
    lambda x, t: SQRT_COUNT(x, t) * SQRT_FREQ(x, t))
GM_SQRT_FREQ_SQRT_COUNT = (
    lambda x, t: math.sqrt(SQRT_COUNT(x, t) * SQRT_FREQ(x, t)))
# SECURITY: eval() of a command-line string — acceptable for a trusted
# research script, but never expose args.cnt_transform to untrusted input.
cnt_transform = eval(args.cnt_transform)
with debug_support():
    def intervene_modes_hook(cat, modes):
        # Manually curated, per-category indices of modes to drop.
        index_to_remove = []
        if cat == '20th-century_women_writers':
            index_to_remove = [1, 4]
        elif cat == 'American_television_reporters_and_correspondents':
            index_to_remove = [1, 4]
        elif cat == 'Recipients_of_the_Purple_Heart_medal':
            index_to_remove = [2, 3, 4]
        elif cat == 'United_States_Army_soldiers':
            index_to_remove = [2, 4]
        modes = [e for i, e in enumerate(modes)
                 if i not in index_to_remove]
        return modes
    idi_list = []
# NOTE(review): the next three statements look like the tail of an
# `--interact` branch of a function defined outside this view (`pkl` is
# bound elsewhere); importing readline enables line editing in the REPL.
import readline, code
print pkl.keys()
code.InteractiveConsole(pkl).interact()


if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser(
        description='Tabulate performance of saved model files.')
    arg_parser.add_argument('--path', nargs='*', default=[],
                            help='A glob of the paths to the pkls')
    arg_parser.add_argument('--interact', default=0, type=int,
                            help='Default={0}')
    arg_parser.add_argument('--keys', nargs='*', default=[],
                            help='Default={0}')
    arg_parser.add_argument('--server', default=0, type=int,
                            help='Default={0}')
    arg_parser.add_argument('--client', default=0, type=int,
                            help='Default={0}')
    # debug_support presumably drops into a debugger on uncaught
    # exceptions — confirm against rasengan.
    with rasengan.debug_support():
        main(args=arg_parser.parse_args())
def main():
    """Greedily extract up to `mode_count` "modes": in each round, pick one
    tag word per entity so the chosen words are mutually similar, then
    remove the chosen words and repeat.
    """
    import argparse
    arg_parser = argparse.ArgumentParser(description='')
    arg_parser.add_argument('--seed', default=0, type=int, help='Default={0}')
    arg_parser.add_argument('--emb_pkl_fn',
                            default='data/demonstrate_similarity_idea.emb.pkl',
                            type=str)
    arg_parser.add_argument(
        '--feat_file',
        default='data/random/details/89c0c894.American_women_writers',
        type=str)
    arg_parser.add_argument('--ctag', default=None, type=int)
    arg_parser.add_argument('--mode_count', default=5, type=int)
    arg_parser.add_argument('--method', default='fast_relax', type=str,
                            choices=['brute_force', 'fast_relax',
                                     'annealed_gibbs', 'maxproduct-bp',
                                     'variational_inference',
                                     'dc_programming'])
    args = arg_parser.parse_args()
    # Seed both RNGs for reproducibility.
    import random
    random.seed(args.seed)
    numpy.random.seed(args.seed)
    cfg.mode_count = args.mode_count
    # entity -> tags already chosen in earlier modes (excluded later).
    tags_to_remove = defaultdict(list)
    with rasengan.tictoc('Loading pkl'):
        embeddings = pkl.load(open(args.emb_pkl_fn))
    if cfg.introduce_NULL_embedding:
        # The NULL tag gets a zero vector of the same shape as the rest.
        embeddings[cfg.NULL_KEY] = numpy.zeros(
            next(embeddings.itervalues()).shape)
    with rasengan.debug_support():
        for mode_idx in range(cfg.mode_count):
            print 'mode_idx=', mode_idx
            entity_tags = {}
            entities = []
            # Each row: "<entity> ||| tag:count tag:count ..."; keep only
            # lowercase tags that actually have an embedding.
            for row in open(args.feat_file):
                _e, _tags = [e.strip() for e in row.strip().split('|||')]
                entities.append(_e)
                entity_tags[_e] = set([
                    t.lower()
                    for t in (e.strip().split(':')[0]
                              for e in _tags.split())
                    if t.lower() in embeddings])
            total_tags = set(
                rasengan.flatten([list(e) for e in entity_tags.values()]))
            assert all(e in embeddings for e in total_tags)
            print (
                'For each of these people our goal is to select one word.'
                ' That word should be as similar to other words picked for other'
                ' entities as possible')
            # entity -> DataFrame of candidate-tag embeddings (rows = tags).
            problem = rasengan.OrderedDict_Indexable_By_StringKey_Or_Index()
            for (a, b) in entity_tags.items():
                b = list(b)
                print 'Entity: ', a, 'tags to remove: ', tags_to_remove[a]
                # Drop tags consumed by earlier modes (best effort).
                for ttr in tags_to_remove[a]:
                    tolerant_remove(b, ttr)
                if cfg.introduce_NULL_embedding and cfg.NULL_KEY not in b:
                    b.append(cfg.NULL_KEY)
                # print '%-25s' % a, '|||', ', '.join(b)
                problem[a] = DataFrame(
                    data=numpy.concatenate(
                        [(scale_to_unit(embeddings[e])
                          if cfg.scale_to_unit
                          else embeddings[e])[None, :] for e in b],
                        axis=0),
                    index=b)
            if args.ctag is None:
                # Default start: every entity picks its first candidate.
                initial_assignment = dict(
                    (__a, 0) for __b, __a in enumerate(entities))
            else:
                # NOTE(review): 'war'.split() has a single element, so only
                # --ctag 0 is valid here — confirm intent.
                ctag = 'war'.split()[args.ctag]
                initial_assignment = dict(
                    (__e, (cfg.NULL_KEY
                           if ctag not in entity_tags[__e]
                           else ctag))
                    for __e in entities)
            print 'Initial chosen tags::', chosen_tags(problem,
                                                       initial_assignment)
            initial_objective = dp_objective_efficient_impl(
                problem, initial_assignment)
            print 'initial_objective=', initial_objective
            # Sanity-check the efficient objective against the naive one.
            assert numpy.isclose(
                dp_objective_naive_impl(problem, initial_assignment),
                initial_objective)
            final_assignment = optimize_assignment(problem,
                                                   initial_assignment,
                                                   method=args.method)
            final_objective = dp_objective_efficient_impl(
                problem, final_assignment)
            # Retire each chosen tag so the next mode picks fresh words.
            for (fa_entity, fa_tag_idx) in final_assignment.iteritems():
                tags_to_remove[fa_entity].append(
                    liloc(problem[fa_entity], fa_tag_idx).name)
            print 'mode_idx=', mode_idx,
            print 'initial_objective=', initial_objective,
            print 'final_objective=', final_objective,
            print 'Final chosen tags=', chosen_tags(problem,
                                                    final_assignment)
    return
def main(args):
    '''Load the train/dev/test transduction corpora, derive the character
    vocabulary, numerize everything and pack it into one namespace.

    Parameters
    ----------
    args : namespace providing `train_fn`/`dev_fn`/`test_fn` paths, the
        `partition_dev_into_train` / `partition_dev_into_test` split sizes,
        `limit_corpus` and the context-window size `win`.

    Returns
    -------
    data : rasengan.Namespace carrying the raw and numerized splits, the
        char<->int maps (`label2idx`/`idx2label`) and `vocsize`.
    '''
    with rasengan.debug_support():
        with rasengan.tictoc("Loading Data"):
            data_list = rasengan.namespacer(
                read_data(args.train_fn))
            val_data_list = rasengan.namespacer(
                read_data(args.dev_fn))
            if args.partition_dev_into_train > 0:
                # BUGFIX: this branch previously computed the split point
                # from `args.partition_dev_into_test`, so the train/dev
                # partition silently depended on the *test* setting.
                # Keep the first `lim` dev examples as dev and move the
                # rest into train.
                lim = args.partition_dev_into_train
                data_list.extend(val_data_list[lim:])
                val_data_list = val_data_list[:lim]
            if args.partition_dev_into_test > 0:
                # Keep the first `lim` dev examples as dev; the remainder
                # becomes the test set (instead of reading args.test_fn).
                lim = args.partition_dev_into_test
                test_data_list = val_data_list[lim:]
                val_data_list = val_data_list[:lim]
            else:
                test_data_list = rasengan.namespacer(
                    read_data(args.test_fn))
            # data_list = val_data_list = [(u'jason', u'eisner')]
            lst_char = get_lst_char(
                data_list + val_data_list + test_data_list)
            data_list = add_bos(data_list)
            val_data_list = add_bos(val_data_list)
            test_data_list = add_bos(test_data_list)
            warnings.warn('''
            NOTE: While preparing sigma, we add 1 to the index returned by
            enumerate because the transducer unit that Ryan wrote uses
            index 0 as the index for the epsilon symbol. So essentially the
            epsilon symbol and the integer 0 are reserved symbols that
            cannot appear in the vocabulary. ALSO, we need to add 1 to the
            vocsize because of that.
            ''')
            # sigma :: char -> int (index 0 is reserved for epsilon)
            sigma = dict((b, a + 1) for (a, b) in enumerate(lst_char))
            # sigma_inv :: int -> char
            sigma_inv = dict((a + 1, b) for (a, b) in enumerate(lst_char))
            if args.limit_corpus > 0:
                data_list = data_list[:args.limit_corpus]
            train_data = numerize(data_list, sigma, args.win)
            val_data = numerize(val_data_list, sigma, args.win)
            test_data = numerize(test_data_list, sigma, args.win)
            data = rasengan.Namespace()
            #-------------------------------------------------------------#
            # Add sets that would be used by the tensorflow seq2seq       #
            # model. See~$PY/tensorflow/models/rnn/translate/translate.py #
            #-------------------------------------------------------------#
            data.train_data = data_list
            data.val_data = val_data_list
            data.test_data = test_data_list
            data.train_set = train_data
            data.dev_set = val_data
            data.test_set = test_data
            # +1 for the reserved epsilon index 0 (see the warning above).
            data.vocsize = len(sigma) + 1
            data.idx2label = sigma_inv
            data.label2idx = sigma
            data.train_lex = [e[0] for e in train_data]
            data.train_y = [e[1] for e in train_data]
            data.valid_lex = [e[0] for e in val_data]
            data.valid_y = util_lstm_seqlabel.convert_id_to_word(
                [e[1] for e in val_data], data.idx2label)
            data.test_lex = [e[0] for e in test_data]
            data.test_y = util_lstm_seqlabel.convert_id_to_word(
                [e[1] for e in test_data], data.idx2label)
            data.words_train = []
            data.words_valid = []
            data.words_test = []
    return data
def main(args):
    '''Load the train/dev/test transduction corpora, derive the character
    vocabulary, numerize everything and pack it into one namespace.

    Parameters
    ----------
    args : namespace providing `train_fn`/`dev_fn`/`test_fn` paths, the
        `partition_dev_into_train` / `partition_dev_into_test` split sizes,
        `limit_corpus` and the context-window size `win`.

    Returns
    -------
    data : rasengan.Namespace carrying the raw and numerized splits, the
        char<->int maps (`label2idx`/`idx2label`) and `vocsize`.
    '''
    with rasengan.debug_support():
        with rasengan.tictoc("Loading Data"):
            data_list = rasengan.namespacer(read_data(args.train_fn))
            val_data_list = rasengan.namespacer(read_data(args.dev_fn))
            if args.partition_dev_into_train > 0:
                # BUGFIX: this branch previously computed the split point
                # from `args.partition_dev_into_test`, so the train/dev
                # partition silently depended on the *test* setting.
                # Keep the first `lim` dev examples as dev and move the
                # rest into train.
                lim = args.partition_dev_into_train
                data_list.extend(val_data_list[lim:])
                val_data_list = val_data_list[:lim]
            if args.partition_dev_into_test > 0:
                # Keep the first `lim` dev examples as dev; the remainder
                # becomes the test set (instead of reading args.test_fn).
                lim = args.partition_dev_into_test
                test_data_list = val_data_list[lim:]
                val_data_list = val_data_list[:lim]
            else:
                test_data_list = rasengan.namespacer(
                    read_data(args.test_fn))
            # data_list = val_data_list = [(u'jason', u'eisner')]
            lst_char = get_lst_char(
                data_list + val_data_list + test_data_list)
            data_list = add_bos(data_list)
            val_data_list = add_bos(val_data_list)
            test_data_list = add_bos(test_data_list)
            warnings.warn('''
            NOTE: While preparing sigma, we add 1 to the index returned by
            enumerate because the transducer unit that Ryan wrote uses
            index 0 as the index for the epsilon symbol. So essentially the
            epsilon symbol and the integer 0 are reserved symbols that
            cannot appear in the vocabulary. ALSO, we need to add 1 to the
            vocsize because of that.
            ''')
            # sigma :: char -> int (index 0 is reserved for epsilon)
            sigma = dict((b, a + 1) for (a, b) in enumerate(lst_char))
            # sigma_inv :: int -> char
            sigma_inv = dict((a + 1, b) for (a, b) in enumerate(lst_char))
            if args.limit_corpus > 0:
                data_list = data_list[:args.limit_corpus]
            train_data = numerize(data_list, sigma, args.win)
            val_data = numerize(val_data_list, sigma, args.win)
            test_data = numerize(test_data_list, sigma, args.win)
            data = rasengan.Namespace()
            #-------------------------------------------------------------#
            # Add sets that would be used by the tensorflow seq2seq       #
            # model. See~$PY/tensorflow/models/rnn/translate/translate.py #
            #-------------------------------------------------------------#
            data.train_data = data_list
            data.val_data = val_data_list
            data.test_data = test_data_list
            data.train_set = train_data
            data.dev_set = val_data
            data.test_set = test_data
            # +1 for the reserved epsilon index 0 (see the warning above).
            data.vocsize = len(sigma) + 1
            data.idx2label = sigma_inv
            data.label2idx = sigma
            data.train_lex = [e[0] for e in train_data]
            data.train_y = [e[1] for e in train_data]
            data.valid_lex = [e[0] for e in val_data]
            data.valid_y = util_lstm_seqlabel.convert_id_to_word(
                [e[1] for e in val_data], data.idx2label)
            data.test_lex = [e[0] for e in test_data]
            data.test_y = util_lstm_seqlabel.convert_id_to_word(
                [e[1] for e in test_data], data.idx2label)
            data.words_train = []
            data.words_valid = []
            data.words_test = []
    return data
#----------------------------------------------------------------------# # Print `keys` from pkl file that were specially mentioned on cmdline. # #----------------------------------------------------------------------# for k in args.keys: print k, pkl[k] #---------------------------------------------------------# # In case we want to interact with the pkl after loading. # #---------------------------------------------------------# if args.interact: import readline, code print pkl.keys() code.InteractiveConsole(pkl).interact() if __name__ == '__main__': arg_parser = argparse.ArgumentParser( description='Tabulate performance of saved model files.') arg_parser.add_argument( '--path', nargs='*', default=[], help='A glob of the paths to the pkls') arg_parser.add_argument( '--interact', default=0, type=int, help='Default={0}') arg_parser.add_argument( '--keys', nargs='*', default=[], help='Default={0}') arg_parser.add_argument('--server', default=0, type=int, help='Default={0}') arg_parser.add_argument('--client', default=0, type=int, help='Default={0}') with rasengan.debug_support(): main(args=arg_parser.parse_args())