def generate_name_dataset(config):
    """Generate the name-based train/dev/test files under config['dsdir'].

    Loads the entity->types and entity->names mappings for each split and
    writes them out via gen_new_ds, keeping at most ``name_num[i]`` names
    per entity for split i (order: train, dev, test).

    :param config: mapping with keys 'Etrain', 'Edev', 'Etest', 'typefile',
        'name_num' (whitespace-separated ints) and 'dsdir'.
    """
    trainfile = config['Etrain']
    devfile = config['Edev']
    testfile = config['Etest']
    targetTypesFile = config['typefile']
    # One max-name count per split: train, dev, test.
    max_names = [int(n) for n in config['name_num'].split()]
    dsdir = config['dsdir']
    (t2idx, _) = cmn.loadtypes(targetTypesFile)
    (etrain2types, etrain2names, _) = cmn.load_entname_ds(trainfile, t2idx)
    logger.info("number of train examples: %d", len(etrain2names))
    (etest2types, etest2names, _) = cmn.load_entname_ds(testfile, t2idx)
    logger.info("number of test examples: %d", len(etest2names))
    (edev2types, edev2names, _) = cmn.load_entname_ds(devfile, t2idx)
    logger.info("number of dev examples: %d", len(edev2names))

    logger.info('number of names for each entity in trn,dev,test: %s',
                max_names)
    logger.info('generating new datasets based on entity names')
    gen_new_ds(etrain2names,
               etrain2types,
               max_names[0],
               outfile=os.path.join(dsdir, 'train.txt'))
    gen_new_ds(edev2names,
               edev2types,
               max_names[1],
               outfile=os.path.join(dsdir, 'dev.txt'))
    gen_new_ds(etest2names,
               etest2types,
               max_names[2],
               outfile=os.path.join(dsdir, 'test.txt'))
# Beispiel #2
# 0
def main(args):
    """Build every feature dataset for entity typing under config['dsdir'].

    ``args[1]`` is the path to the config file.  First generates the
    name-based train/dev/test files if missing; then, unless the targets
    HDF5 file already exists, builds the target, entity-vector, letter,
    type-cosine, hs-ngram, ngram, word, subword and description-feature
    datasets.  If the targets file exists, only the description features
    are (re)built.
    """
    # Use the module logger instead of bare print for consistency with the
    # rest of the file.
    logger.info('loading config file %s', args[1])
    config = cmn.loadConfig(args[1])
    dsdir = config['dsdir']
    # First generate the name datasets based on the number of names per set.
    if not os.path.exists(os.path.join(dsdir, 'train.txt')):
        generate_name_dataset(config)

    trainfile = dsdir + '/train.txt'
    devfile = dsdir + '/dev.txt'
    testfile = dsdir + '/test.txt'
    targetTypesFile = config['typefile']
    vectorFile = config['ent_vectors']
    # Optional config entries; fall back to sensible defaults when missing.
    vectorFile_words = config['word_vectors'] if 'word_vectors' in config else vectorFile
    subword_vectorFile = config['fasttext_vecfile'] if 'fasttext_vecfile' in config else None
    ent2tfidf_features_path = config['ent2tfidf_features_path'] if 'ent2tfidf_features_path' in config else None
    ngrams = [int(n) for n in config['ngrams_n'].split()] if 'ngrams_n' in config else []
    ngrams_vecfiles = {ngram: config['ngrams' + str(ngram) + '_vecfile'] for ngram in ngrams}
    letter_vecfile = config['letters_vecfile'] if 'letters_vecfile' in config else None
    hs_ngram_path = config['hsngrampath'] if 'hsngrampath' in config else None
    hs_ngram_versions = config['hsngram_vecs'].split() if hs_ngram_path else None
    use_lowercase = str_to_bool(config['use_lowercase']) if 'use_lowercase' in config else False
    logger.info('uselower: %s', use_lowercase)
    (t2idx, _) = cmn.loadtypes(targetTypesFile)
    trnMentions = load_ent_ds(trainfile)
    devMentions = load_ent_ds(devfile)
    tstMentions = load_ent_ds(testfile)
    logger.info("#train : %d #dev : %d #test : %d", len(trnMentions), len(devMentions), len(tstMentions))

    if not os.path.exists(os.path.join(dsdir, '_targets.h5py')):
        build_targets_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir)
        # NOTE(review): build_entvec_ds was originally called a second time
        # with identical arguments further down; the redundant call was removed.
        build_entvec_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorFile, upto=-1)
        build_letters_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, letter_vecfile, max_len_name=40)
        build_typecosine_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorFile, upto=-1)
        if hs_ngram_path:
            build_hsNgram_ds(config, trnMentions, devMentions, tstMentions, t2idx, dsdir, hs_ngram_path, hs_ngram_versions, vectorsize=300, upto=-1)
        for ng in ngrams:
            build_ngram_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, ngrams_vecfiles[ng], ng, upto=-1)
        build_type_patterns(trnMentions, t2idx, dsdir, vectorFile)
        save_typevecmatrix(t2idx, dsdir, vectorFile)
        build_subwords_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, subword_vectorFile, use_lowercase=use_lowercase, upto=-1)
        build_words_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorFile_words, use_lowercase=use_lowercase, upto=-1)
        build_desc_features_ds(trnMentions, devMentions, tstMentions, ent2tfidf_features_path, t2idx, dsdir, vectorFile_words, use_lowercase=True, upto=-1)
    else:
        # Everything else exists already; only (re)build description features.
        build_desc_features_ds(trnMentions, devMentions, tstMentions, ent2tfidf_features_path, t2idx, dsdir, vectorFile_words, use_lowercase=True, upto=-1)
# Beispiel #3
# 0
def generate_name_dataset(config):
    """Write name-based train/dev/test files for entity typing.

    Reads entity files and the type inventory from *config*, then emits one
    dataset per split into config['dsdir'], capping the number of names kept
    per entity at the per-split limits given by config['name_num'].
    """
    type_index, _ = cmn.loadtypes(config['typefile'])
    per_split_limits = [int(tok) for tok in config['name_num'].split()]
    out_dir = config['dsdir']

    train_types, train_names, _ = cmn.load_entname_ds(config['Etrain'], type_index)
    logger.info("number of train examples: %d", len(train_names))
    test_types, test_names, _ = cmn.load_entname_ds(config['Etest'], type_index)
    logger.info("number of test examples: %d", len(test_names))
    dev_types, dev_names, _ = cmn.load_entname_ds(config['Edev'], type_index)
    logger.info("number of dev examples: %d", len(dev_names))

    logger.info('number of names for each entity in trn,dev,test: %s', per_split_limits)
    logger.info('generating new datasets based on entity names')
    # One (names, types, limit, filename) tuple per split, written in order.
    splits = [
        (train_names, train_types, per_split_limits[0], '/train.txt'),
        (dev_names, dev_types, per_split_limits[1], '/dev.txt'),
        (test_names, test_types, per_split_limits[2], '/test.txt'),
    ]
    for names, types, limit, suffix in splits:
        gen_new_ds(names, types, limit, outfile=out_dir + suffix)
# Beispiel #4
# 0
    # NOTE(review): orphaned fragment — these three calls duplicate the tail
    # of generate_name_dataset above, but their enclosing def is missing here
    # (a "Beispiel #4" snippet separator precedes them), so every name below
    # is undefined at this point in the file.
    gen_new_ds(etrain2names, etrain2types, max_names[0], outfile = dsdir + '/train.txt')
    gen_new_ds(edev2names, edev2types, max_names[1], outfile = dsdir + '/dev.txt')
    gen_new_ds(etest2names, etest2types, max_names[2], outfile = dsdir + '/test.txt')

if __name__ == '__main__':
    # Script entry point: build the name-based train/dev/test datasets.
    # Uses logger.info instead of bare print for consistency with the rest
    # of this file's logging.
    logger.info('loading config file %s', sys.argv[1])
    config = cmn.loadConfig(sys.argv[1])
    trainfile = config['Etrain']
    devfile = config['Edev']
    testfile = config['Etest']
    targetTypesFile = config['typefile']
    # One max-name count per split: train, dev, test.
    max_names = [int(n) for n in config['name_num'].split()]
    dsdir = config['dsdir']

    (t2idx, idx2t) = cmn.loadtypes(targetTypesFile)
    (etrain2types, etrain2names, _) = cmn.load_entname_ds(trainfile, t2idx)
    logger.info("number of train examples: %d", len(etrain2names))
    (etest2types, etest2names, _) = cmn.load_entname_ds(testfile, t2idx)
    logger.info("number of test examples: %d", len(etest2names))
    (edev2types, edev2names, _) = cmn.load_entname_ds(devfile, t2idx)
    logger.info("number of dev examples: %d", len(edev2names))

    logger.info('number of names for each entity in trn,dev,test: %s', max_names)
    logger.info('generating new datasets based on entity names')
    # NOTE(review): 'maxname...' is appended without a path separator, so the
    # output directory is a sibling whose name is prefixed by dsdir — confirm
    # this is intended before changing it.
    dsdir = dsdir + 'maxname' + ','.join([str(n) for n in max_names])
    if not os.path.exists(dsdir):
        os.makedirs(dsdir)
    gen_new_ds(etrain2names, etrain2types, max_names[0], outfile=dsdir + '/train.txt')
    gen_new_ds(edev2names, edev2types, max_names[1], outfile=dsdir + '/dev.txt')
    gen_new_ds(etest2names, etest2types, max_names[2], outfile=dsdir + '/test.txt')
def main(args):
    """Build every feature dataset for entity typing under config['dsdir'].

    ``args[1]`` is the path to the config file.  First generates the
    name-based train/dev/test files if missing; then, unless the targets
    HDF5 file already exists, builds the target, entity-vector, letter,
    type-cosine, hs-ngram, ngram, word, subword and description-feature
    datasets.  If the targets file exists, only the description features
    are (re)built.
    """
    # Use the module logger instead of bare print for consistency with the
    # rest of the file.
    logger.info('loading config file %s', args[1])
    config = cmn.loadConfig(args[1])
    dsdir = config['dsdir']
    # First generate the name datasets based on the number of names per set.
    if not os.path.exists(os.path.join(dsdir, 'train.txt')):
        generate_name_dataset(config)

    trainfile = dsdir + '/train.txt'
    devfile = dsdir + '/dev.txt'
    testfile = dsdir + '/test.txt'
    targetTypesFile = config['typefile']
    vectorFile = config['ent_vectors']
    # Optional config entries; fall back to sensible defaults when missing.
    vectorFile_words = config['word_vectors'] if 'word_vectors' in config else vectorFile
    subword_vectorFile = config['fasttext_vecfile'] if 'fasttext_vecfile' in config else None
    ent2tfidf_features_path = config['ent2tfidf_features_path'] if 'ent2tfidf_features_path' in config else None
    ngrams = [int(n) for n in config['ngrams_n'].split()] if 'ngrams_n' in config else []
    ngrams_vecfiles = {ngram: config['ngrams' + str(ngram) + '_vecfile'] for ngram in ngrams}
    letter_vecfile = config['letters_vecfile'] if 'letters_vecfile' in config else None
    hs_ngram_path = config['hsngrampath'] if 'hsngrampath' in config else None
    hs_ngram_versions = config['hsngram_vecs'].split() if hs_ngram_path else None
    use_lowercase = str_to_bool(config['use_lowercase']) if 'use_lowercase' in config else False
    logger.info('uselower: %s', use_lowercase)

    (t2idx, _) = cmn.loadtypes(targetTypesFile)
    trnMentions = load_ent_ds(trainfile)
    devMentions = load_ent_ds(devfile)
    tstMentions = load_ent_ds(testfile)
    logger.info("#train : %d #dev : %d #test : %d", len(trnMentions),
                len(devMentions), len(tstMentions))

    if not os.path.exists(os.path.join(dsdir, '_targets.h5py')):
        build_targets_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir)
        # NOTE(review): build_entvec_ds was originally called a second time
        # with identical arguments further down; the redundant call was
        # removed.
        build_entvec_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir,
                        vectorFile, upto=-1)
        build_letters_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir,
                         letter_vecfile, max_len_name=40)
        build_typecosine_ds(trnMentions, devMentions, tstMentions, t2idx,
                            dsdir, vectorFile, upto=-1)
        if hs_ngram_path:
            build_hsNgram_ds(config, trnMentions, devMentions, tstMentions,
                             t2idx, dsdir, hs_ngram_path, hs_ngram_versions,
                             vectorsize=300, upto=-1)
        for ng in ngrams:
            build_ngram_ds(trnMentions, devMentions, tstMentions, t2idx,
                           dsdir, ngrams_vecfiles[ng], ng, upto=-1)
        build_type_patterns(trnMentions, t2idx, dsdir, vectorFile)
        save_typevecmatrix(t2idx, dsdir, vectorFile)
        build_subwords_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir,
                          subword_vectorFile, use_lowercase=use_lowercase,
                          upto=-1)
        build_words_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir,
                       vectorFile_words, use_lowercase=use_lowercase, upto=-1)
        build_desc_features_ds(trnMentions, devMentions, tstMentions,
                               ent2tfidf_features_path, t2idx, dsdir,
                               vectorFile_words, use_lowercase=True, upto=-1)
    else:
        # Everything else exists already; only (re)build description features.
        build_desc_features_ds(trnMentions, devMentions, tstMentions,
                               ent2tfidf_features_path, t2idx, dsdir,
                               vectorFile_words, use_lowercase=True, upto=-1)