Beispiel #1
0
def main(args):
    """Generate the on-disk feature datasets described by a config file.

    Args:
        args: argv-style sequence; ``args[1]`` is the path handed to
            ``cmn.loadConfig``.

    The config must provide ``dsdir``, ``typefile`` and ``ent_vectors``.
    Optional keys: ``word_vectors`` (falls back to ``ent_vectors``),
    ``fasttext_vecfile``, ``ent2tfidf_features_path``, ``ngrams_n`` (each
    listed N additionally requires ``ngrams<N>_vecfile``),
    ``letters_vecfile``, ``hsngrampath`` (requires ``hsngram_vecs``) and
    ``use_lowercase``.

    The name datasets are generated first when ``<dsdir>/train.txt`` is
    missing. The full set of feature builders only runs when
    ``<dsdir>/_targets.h5py`` does not exist yet; the description features
    are (re)built on every call.
    """

    def _optional(key, default=None):
        # Optional config entry; membership test kept so any mapping-like
        # config object works.
        return config[key] if key in config else default

    # A single pre-formatted argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print('loading config file %s' % (args[1],))
    config = cmn.loadConfig(args[1])
    dsdir = config['dsdir']

    # First generate the name datasets based on the number of names per set.
    if not os.path.exists(os.path.join(dsdir, 'train.txt')):
        generate_name_dataset(config)

    trainfile = dsdir + '/train.txt'
    devfile = dsdir + '/dev.txt'
    testfile = dsdir + '/test.txt'
    targetTypesFile = config['typefile']
    vectorFile = config['ent_vectors']
    vectorFile_words = _optional('word_vectors', vectorFile)
    subword_vectorFile = _optional('fasttext_vecfile')
    ent2tfidf_features_path = _optional('ent2tfidf_features_path')
    ngrams = [int(n) for n in config['ngrams_n'].split()] if 'ngrams_n' in config else []
    ngrams_vecfiles = {ngram: config['ngrams' + str(ngram) + '_vecfile'] for ngram in ngrams}
    letter_vecfile = _optional('letters_vecfile')
    hs_ngram_path = _optional('hsngrampath')
    hs_ngram_versions = config['hsngram_vecs'].split() if hs_ngram_path else None
    use_lowercase = str_to_bool(config['use_lowercase']) if 'use_lowercase' in config else False
    print('uselower:  %s' % (use_lowercase,))

    # Single knob for the ``upto`` argument of every builder (was a literal
    # -1 repeated at each call site next to an unused local).
    upto = -1
    (t2idx, _) = cmn.loadtypes(targetTypesFile)
    trnMentions = load_ent_ds(trainfile)
    devMentions = load_ent_ds(devfile)
    tstMentions = load_ent_ds(testfile)
    logger.info("#train : %d #dev : %d #test : %d", len(trnMentions), len(devMentions), len(tstMentions))

    # The targets file doubles as the "features already built" marker.
    if not os.path.exists(os.path.join(dsdir, '_targets.h5py')):
        build_targets_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir)
        # NOTE(review): this builder used to be invoked a second time further
        # down with identical arguments; it now runs once, assuming its output
        # depends only on its arguments -- confirm.
        build_entvec_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorFile, upto=upto)
        build_letters_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, letter_vecfile, max_len_name=40)
        build_typecosine_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorFile, upto=upto)
        if hs_ngram_path:
            build_hsNgram_ds(config, trnMentions, devMentions, tstMentions, t2idx, dsdir, hs_ngram_path,
                             hs_ngram_versions, vectorsize=300, upto=upto)
        for ng in ngrams:
            build_ngram_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, ngrams_vecfiles[ng], ng, upto=upto)
        build_type_patterns(trnMentions, t2idx, dsdir, vectorFile)
        save_typevecmatrix(t2idx, dsdir, vectorFile)
        build_subwords_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, subword_vectorFile,
                          use_lowercase=use_lowercase, upto=upto)
        build_words_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorFile_words,
                       use_lowercase=use_lowercase, upto=upto)

    # Previously duplicated at the end of both branches: the description
    # features are built on every run, always lower-cased.
    build_desc_features_ds(trnMentions, devMentions, tstMentions, ent2tfidf_features_path, t2idx, dsdir,
                           vectorFile_words, use_lowercase=True, upto=upto)
Beispiel #2
0
def main(args):
    """Train, test and/or evaluate the global entity typer for one experiment.

    Args:
        args: parsed command-line namespace. Reads ``args.config`` (path of
            the config file) and the ``args.train``, ``args.test`` and
            ``args.eval`` flags; each enabled flag triggers the matching
            stage, in that order.

    The directory containing the config file becomes ``config['exp_dir']``
    and ``config['net']`` is re-rooted inside it.
    """
    logger.info('loading config file: %s', args.config)
    exp_dir = os.path.dirname(os.path.abspath(args.config))
    config = cmn.loadConfig(args.config)
    config['exp_dir'] = exp_dir
    config['net'] = os.path.join(exp_dir, config['net'])

    # A configured batch size of 0 is handed to the trainer as None.
    batch_size = int(config['batchsize']) or None

    # i.e. letters words entvec
    # split() without a separator ignores repeated/trailing blanks, so no
    # empty feature name can reach build_input_objs.
    features = config['features'].split()

    # Every feature mentioning 'ngrams' expands to one source per configured n.
    inp_srcs = []
    for fea in features:
        if 'ngrams' in fea:
            inp_srcs.extend('ngrams' + ng for ng in config['ngrams_n'].split())
        else:
            inp_srcs.append(fea)
    our_sources = inp_srcs + ['targets']

    fea2obj = build_input_objs(our_sources, config)

    typer = EntityTypingGlobal(config)
    if args.train:
        import shutil

        typer.training(fea2obj, batch_size, learning_rate=float(config['lrate']),
                       steprule=config['steprule'], wait_epochs=3, num_epochs=30)
        # Snapshot the best model of the first stage. '.toload.pkl' used to be
        # copied a second time right before the next stage although nothing
        # touched either file in between; one copy is enough.
        best_model = config['net'] + '.best.pkl'
        shutil.copyfile(best_model, config['net'] + '.toload.pkl')
        shutil.copyfile(best_model, config['net'] + '.best1.pkl')

        # Second stage with fixed hyper-parameters (not taken from the config).
        logger.info('One more epoch training...')
        typer.training(fea2obj, batch_size=100, learning_rate=0.005, steprule='adagrad',
                       wait_epochs=2, klw_ep=10, kl_weight_init=None, num_epochs=10)

    if args.test:
        typer.testing(fea2obj)

    if args.eval:
        typer.evaluate(args.config)
Beispiel #3
0
    (etest2types, etest2names, _) = cmn.load_entname_ds(testfile, t2idx)
    logger.info("number of test examples: %d",len(etest2names))
    (edev2types, edev2names, _) = cmn.load_entname_ds(devfile, t2idx)
    logger.info("number of dev examples: %d", len(edev2names))

    logger.info('number of names for each entity in trn,dev,test: %s', max_names)
    logger.info('generating new datasets based on entity names')
#     dsdir = dsdir + 'maxname'  + ','.join([str(n) for n in max_names])
#     if not os.path.exists(dsdir): os.makedirs(dsdir)
    gen_new_ds(etrain2names, etrain2types, max_names[0], outfile = dsdir + '/train.txt')
    gen_new_ds(edev2names, edev2types, max_names[1], outfile = dsdir + '/dev.txt')
    gen_new_ds(etest2names, etest2types, max_names[2], outfile = dsdir + '/test.txt')

if __name__ == '__main__':
    # Script entry point: load the config named by the first command-line
    # argument, then read the type inventory and the entity-name datasets.
    # All names below stay module-level because other code in this file may
    # read them as globals.

    # A single pre-formatted argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print('loading config file %s' % (sys.argv[1],))
    config = cmn.loadConfig(sys.argv[1])
    trainfile = config['Etrain']
    devfile = config['Edev']
    testfile = config['Etest']
    targetTypesFile = config['typefile']
    # Whitespace-separated integers, one per listed entry of 'name_num'.
    max_names = [int(n) for n in config['name_num'].split()]
    dsdir = config['dsdir']

    upto = -1
    (t2idx, idx2t) = cmn.loadtypes(targetTypesFile)
    numtargets = len(t2idx)
    (etrain2types, etrain2names, _) = cmn.load_entname_ds(trainfile, t2idx)
    logger.info("number of train examples: %d", len(etrain2names))
    (etest2types, etest2names, _) = cmn.load_entname_ds(testfile, t2idx)
    logger.info("number of test examples: %d", len(etest2names))
    (edev2types, edev2names, _) = cmn.load_entname_ds(devfile, t2idx)
Beispiel #4
0
                        type=bool,
                        help="Training the model on the test data, or not")

    parser.add_argument("--loaddata",
                        "-lo",
                        type=bool,
                        help="To load the feature matrices or not?")
    return parser


if __name__ == '__main__':
    # Script entry point: parse the command line, load the config file it
    # names and pull every setting the run needs into module-level names.
    UPTO = -1
    parser = get_argument_parser()
    args = parser.parse_args()

    config = loadConfig(args.config)

    brownMappingFile = config['brownclusters']
    trainfile = config['Etrain']
    devfile = config['Edev']
    testfile = config['Etest']
    batch_size = int(config['batchsize'])
    targetTypesFile = config['typefile']
    learning_rate = float(config['lrate'])
    networkfile = config['net']
    num_of_hidden_units = int(config['hidden_units'])
    n_epochs = int(config['nepochs'])
    maxngram = int(config['maxngram'])
    MLP = str_to_bool(config['mlp'])
    # Feature names are separated by single spaces in the config value.
    featuresToUse = config['features'].split(' ')

    # Make sure the directory named by 'npdir' exists.
    npdir = config['npdir']
    if not os.path.exists(npdir):
        os.makedirs(npdir)
Beispiel #5
0
    parser.add_argument(
        "--test", "-t", type=bool, help="Applying the model on the test data, or not")
    
    parser.add_argument(
        "--train", "-tr", type=bool, help="Training the model on the test data, or not")

    parser.add_argument(
        "--loaddata", "-lo", type=bool, help="To load the feature matrices or not?")
    return parser

if __name__ == '__main__':
    # Script entry point: parse the command line, load the config file it
    # names and pull every setting the run needs into module-level names.
    UPTO = -1
    parser = get_argument_parser()
    args = parser.parse_args()

    config = loadConfig(args.config)

    brownMappingFile = config['brownclusters']
    trainfile = config['Etrain']
    devfile = config['Edev']
    testfile = config['Etest']
    batch_size = int(config['batchsize'])
    targetTypesFile = config['typefile']
    learning_rate = float(config['lrate'])
    networkfile = config['net']
    num_of_hidden_units = int(config['hidden_units'])
    n_epochs = int(config['nepochs'])
    maxngram = int(config['maxngram'])
    MLP = str_to_bool(config['mlp'])
    # Feature names are separated by single spaces in the config value.
    featuresToUse = config['features'].split(' ')

    # Make sure the directory named by 'npdir' exists.
    npdir = config['npdir']
    if not os.path.exists(npdir):
        os.makedirs(npdir)
def main(args):
    """Generate the on-disk feature datasets described by a config file.

    Args:
        args: argv-style sequence; ``args[1]`` is the path handed to
            ``cmn.loadConfig``.

    The config must provide ``dsdir``, ``typefile`` and ``ent_vectors``.
    Optional keys: ``word_vectors`` (falls back to ``ent_vectors``),
    ``fasttext_vecfile``, ``ent2tfidf_features_path``, ``ngrams_n`` (each
    listed N additionally requires ``ngrams<N>_vecfile``),
    ``letters_vecfile``, ``hsngrampath`` (requires ``hsngram_vecs``) and
    ``use_lowercase``.

    The name datasets are generated first when ``<dsdir>/train.txt`` is
    missing. The full set of feature builders only runs when
    ``<dsdir>/_targets.h5py`` does not exist yet; the description features
    are (re)built on every call.
    """

    def _optional(key, default=None):
        # Optional config entry; membership test kept so any mapping-like
        # config object works.
        return config[key] if key in config else default

    # A single pre-formatted argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print('loading config file %s' % (args[1],))
    config = cmn.loadConfig(args[1])
    dsdir = config['dsdir']

    # First generate the name datasets based on the number of names per set.
    if not os.path.exists(os.path.join(dsdir, 'train.txt')):
        generate_name_dataset(config)

    trainfile = dsdir + '/train.txt'
    devfile = dsdir + '/dev.txt'
    testfile = dsdir + '/test.txt'
    targetTypesFile = config['typefile']
    vectorFile = config['ent_vectors']
    vectorFile_words = _optional('word_vectors', vectorFile)
    subword_vectorFile = _optional('fasttext_vecfile')
    ent2tfidf_features_path = _optional('ent2tfidf_features_path')
    ngrams = ([int(n) for n in config['ngrams_n'].split()]
              if 'ngrams_n' in config else [])
    ngrams_vecfiles = {
        ngram: config['ngrams' + str(ngram) + '_vecfile']
        for ngram in ngrams
    }
    letter_vecfile = _optional('letters_vecfile')
    hs_ngram_path = _optional('hsngrampath')
    hs_ngram_versions = (config['hsngram_vecs'].split()
                         if hs_ngram_path else None)
    use_lowercase = (str_to_bool(config['use_lowercase'])
                     if 'use_lowercase' in config else False)
    print('uselower:  %s' % (use_lowercase,))

    # Single knob for the ``upto`` argument of every builder (was a literal
    # -1 repeated at each call site next to an unused local).
    upto = -1
    (t2idx, _) = cmn.loadtypes(targetTypesFile)
    trnMentions = load_ent_ds(trainfile)
    devMentions = load_ent_ds(devfile)
    tstMentions = load_ent_ds(testfile)
    logger.info("#train : %d #dev : %d #test : %d", len(trnMentions),
                len(devMentions), len(tstMentions))

    # The three mention splits are passed to almost every builder.
    splits = (trnMentions, devMentions, tstMentions)

    # The targets file doubles as the "features already built" marker.
    if not os.path.exists(os.path.join(dsdir, '_targets.h5py')):
        build_targets_ds(*splits + (t2idx, dsdir))
        # NOTE(review): this builder used to be invoked a second time further
        # down with identical arguments; it now runs once, assuming its output
        # depends only on its arguments -- confirm.
        build_entvec_ds(*splits + (t2idx, dsdir, vectorFile), upto=upto)
        build_letters_ds(*splits + (t2idx, dsdir, letter_vecfile),
                         max_len_name=40)
        build_typecosine_ds(*splits + (t2idx, dsdir, vectorFile), upto=upto)
        if hs_ngram_path:
            build_hsNgram_ds(config,
                             trnMentions,
                             devMentions,
                             tstMentions,
                             t2idx,
                             dsdir,
                             hs_ngram_path,
                             hs_ngram_versions,
                             vectorsize=300,
                             upto=upto)
        for ng in ngrams:
            build_ngram_ds(*splits + (t2idx, dsdir, ngrams_vecfiles[ng], ng),
                           upto=upto)
        build_type_patterns(trnMentions, t2idx, dsdir, vectorFile)
        save_typevecmatrix(t2idx, dsdir, vectorFile)
        build_subwords_ds(*splits + (t2idx, dsdir, subword_vectorFile),
                          use_lowercase=use_lowercase,
                          upto=upto)
        build_words_ds(*splits + (t2idx, dsdir, vectorFile_words),
                       use_lowercase=use_lowercase,
                       upto=upto)

    # Previously duplicated at the end of both branches: the description
    # features are built on every run, always lower-cased.
    build_desc_features_ds(trnMentions,
                           devMentions,
                           tstMentions,
                           ent2tfidf_features_path,
                           t2idx,
                           dsdir,
                           vectorFile_words,
                           use_lowercase=True,
                           upto=upto)