def main(args):
    logger.info('loading config file: %s', args[1])
    config = cmn.loadConfig(args[1])
    dsdir = config['dsdir']
    # First, generate the name datasets based on the number of names for each set.
    if not os.path.exists(os.path.join(dsdir, 'train.txt')):
        generate_name_dataset(config)
    trainfile = dsdir + '/train.txt'
    devfile = dsdir + '/dev.txt'
    testfile = dsdir + '/test.txt'
    targetTypesFile = config['typefile']
    vectorFile = config['ent_vectors']
    vectorFile_words = config['word_vectors'] if 'word_vectors' in config else vectorFile
    subword_vectorFile = config['fasttext_vecfile'] if 'fasttext_vecfile' in config else None
    ent2tfidf_features_path = config['ent2tfidf_features_path'] if 'ent2tfidf_features_path' in config else None
    # the_features = config['features'].split(' ')  # i.e. letters entvec words tc
    ngrams = [int(n) for n in config['ngrams_n'].split()] if 'ngrams_n' in config else []
    ngrams_vecfiles = {ngram: config['ngrams' + str(ngram) + '_vecfile'] for ngram in ngrams}
    letter_vecfile = config['letters_vecfile'] if 'letters_vecfile' in config else None
    hs_ngram_path = config['hsngrampath'] if 'hsngrampath' in config else None
    hs_ngram_versions = config['hsngram_vecs'].split() if hs_ngram_path else None
    use_lowercase = str_to_bool(config['use_lowercase']) if 'use_lowercase' in config else False
    logger.info('use_lowercase: %s', use_lowercase)
    upto = -1
    (t2idx, _) = cmn.loadtypes(targetTypesFile)
    trnMentions = load_ent_ds(trainfile)
    devMentions = load_ent_ds(devfile)
    tstMentions = load_ent_ds(testfile)
    logger.info("#train : %d #dev : %d #test : %d", len(trnMentions), len(devMentions), len(tstMentions))
    if not os.path.exists(os.path.join(dsdir, '_targets.h5py')):
        # Build all feature datasets from scratch.
        build_targets_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir)
        build_entvec_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorFile, upto=-1)
        build_letters_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, letter_vecfile, max_len_name=40)
        build_typecosine_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorFile, upto=-1)
        if hs_ngram_path:
            build_hsNgram_ds(config, trnMentions, devMentions, tstMentions, t2idx, dsdir,
                             hs_ngram_path, hs_ngram_versions, vectorsize=300, upto=-1)
        for ng in ngrams:
            build_ngram_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir,
                           ngrams_vecfiles[ng], ng, upto=-1)
        build_type_patterns(trnMentions, t2idx, dsdir, vectorFile)
        save_typevecmatrix(t2idx, dsdir, vectorFile)
        # build_type_words_cosine_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorFile, upto=-1)
        build_subwords_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, subword_vectorFile,
                          use_lowercase=use_lowercase, upto=-1)
        build_words_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorFile_words,
                       use_lowercase=use_lowercase, upto=-1)
        build_desc_features_ds(trnMentions, devMentions, tstMentions, ent2tfidf_features_path,
                               t2idx, dsdir, vectorFile_words, use_lowercase=True, upto=-1)
    else:
        # Targets already built; only (re)build the description features.
        build_desc_features_ds(trnMentions, devMentions, tstMentions, ent2tfidf_features_path,
                               t2idx, dsdir, vectorFile_words, use_lowercase=True, upto=-1)
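# Usage sketch (assumption: this module is run as a standalone script; the file
# name and the __main__ guard here are illustrative, not part of the original).
# The config must define at least dsdir, typefile and ent_vectors; the optional
# keys read above are word_vectors, fasttext_vecfile, ent2tfidf_features_path,
# ngrams_n, letters_vecfile, hsngrampath, hsngram_vecs and use_lowercase.
#   python make_datasets.py myexp/config.cfg
if __name__ == '__main__':
    import sys
    main(sys.argv)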
def main(args):
    logger.info('loading config file: %s', args.config)
    exp_dir, _ = os.path.split(os.path.abspath(args.config))
    config = cmn.loadConfig(args.config)
    config['exp_dir'] = exp_dir
    config['net'] = os.path.join(exp_dir, config['net'])
    batch_size = int(config['batchsize'])
    features = config['features'].split(' ')  # i.e. letters words entvec
    if batch_size == 0:
        batch_size = None
    inp_srcs = []
    for fea in features:
        if 'ngrams' in fea:
            inp_srcs.extend(['ngrams' + ng for ng in config['ngrams_n'].split()])
        else:
            inp_srcs.append(fea)
    our_sources = inp_srcs + ['targets']
    fea2obj = build_input_objs(our_sources, config)
    typer = EntityTypingGlobal(config)
    if args.train:
        import shutil
        # typer.training(fea2obj, batch_size, learning_rate=float(config['lrate']), steprule=config['steprule'], wait_epochs=10, kl_weight_init=1, klw_ep=100, klw_inc_rate=0, num_epochs=50)
        typer.training(fea2obj, batch_size, learning_rate=float(config['lrate']),
                       steprule=config['steprule'], wait_epochs=3, num_epochs=30)
        shutil.copyfile(config['net'] + '.best.pkl', config['net'] + '.toload.pkl')
        shutil.copyfile(config['net'] + '.best.pkl', config['net'] + '.best1.pkl')
        # logger.info('One more epoch training...')
        # typer.training(fea2obj, batch_size, learning_rate=float(config['lrate'])/2, steprule=config['steprule'], wait_epochs=2, klw_ep=10, kl_weight_init=0.008, num_epochs=20)
        # shutil.copyfile(config['net']+'.best.pkl', config['net']+'.toload.pkl')
        # shutil.copyfile(config['net']+'.best.pkl', config['net']+'.best2.pkl')
        # typer.training(fea2obj, batch_size, learning_rate=float(config['lrate'])/2, steprule=config['steprule'], wait_epochs=2, klw_ep=10, kl_weight_init=0.02, num_epochs=10)
        shutil.copyfile(config['net'] + '.best.pkl', config['net'] + '.toload.pkl')
        logger.info('One more epoch training...')
        typer.training(fea2obj, batch_size=100, learning_rate=0.005, steprule='adagrad',
                       wait_epochs=2, klw_ep=10, kl_weight_init=None, num_epochs=10)
    if args.test:
        typer.testing(fea2obj)
    if args.eval:
        typer.evaluate(args.config)
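# Usage sketch (assumption: an argparse parser like get_argument_parser at the
# bottom of this dump supplies --config plus boolean --train/--test flags; the
# --eval flag is inferred from args.eval above and is not shown in that parser):
#   python train_typing.py --config exp1/config.cfg --train True --test True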
if __name__ == '__main__':
    logger.info('loading config file: %s', sys.argv[1])
    config = cmn.loadConfig(sys.argv[1])
    trainfile = config['Etrain']
    devfile = config['Edev']
    testfile = config['Etest']
    targetTypesFile = config['typefile']
    max_names = [int(n) for n in config['name_num'].split()]
    dsdir = config['dsdir']
    upto = -1
    (t2idx, idx2t) = cmn.loadtypes(targetTypesFile)
    numtargets = len(t2idx)
    (etrain2types, etrain2names, _) = cmn.load_entname_ds(trainfile, t2idx)
    logger.info("number of train examples: %d", len(etrain2names))
    (etest2types, etest2names, _) = cmn.load_entname_ds(testfile, t2idx)
    logger.info("number of test examples: %d", len(etest2names))
    (edev2types, edev2names, _) = cmn.load_entname_ds(devfile, t2idx)
    logger.info("number of dev examples: %d", len(edev2names))
    logger.info('number of names for each entity in trn,dev,test: %s', max_names)
    logger.info('generating new datasets based on entity names')
    # dsdir = dsdir + 'maxname' + ','.join([str(n) for n in max_names])
    # if not os.path.exists(dsdir): os.makedirs(dsdir)
    gen_new_ds(etrain2names, etrain2types, max_names[0], outfile=dsdir + '/train.txt')
    gen_new_ds(edev2names, edev2types, max_names[1], outfile=dsdir + '/dev.txt')
    gen_new_ds(etest2names, etest2types, max_names[2], outfile=dsdir + '/test.txt')
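# Example (assumed semantics, inferred from the gen_new_ds(..., max_names[i], ...)
# calls above): a config line "name_num = 5 1 1" would keep up to 5 names per
# training entity and 1 name per dev/test entity when writing train/dev/test.txt.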
    parser.add_argument("--test", "-t", type=bool,
                        help="Applying the model on the test data, or not")
    parser.add_argument("--train", "-tr", type=bool,
                        help="Training the model, or not")
    parser.add_argument("--loaddata", "-lo", type=bool,
                        help="Whether to load the feature matrices")
    return parser


if __name__ == '__main__':
    UPTO = -1
    parser = get_argument_parser()
    args = parser.parse_args()
    config = loadConfig(args.config)
    brownMappingFile = config['brownclusters']
    trainfile = config['Etrain']
    devfile = config['Edev']
    testfile = config['Etest']
    batch_size = int(config['batchsize'])
    targetTypesFile = config['typefile']
    learning_rate = float(config['lrate'])
    networkfile = config['net']
    num_of_hidden_units = int(config['hidden_units'])
    n_epochs = int(config['nepochs'])
    maxngram = int(config['maxngram'])
    MLP = str_to_bool(config['mlp'])
    featuresToUse = config['features'].split(' ')
    npdir = config['npdir']
    if not os.path.exists(npdir):
        os.makedirs(npdir)
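# Note (a sketch, not in the original): argparse's type=bool converts any
# non-empty string to True, so "--train False" would still enable training.
# Assuming this module's str_to_bool accepts strings like "True"/"False",
# a safer variant of the flags above would be:
#   parser.add_argument("--train", "-tr", type=str_to_bool, default=False,
#                       help="Training the model, or not")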