Ejemplo n.º 1
0
    ap = 0
    for idx in range(1, len(lst) + 1):
        if lst[idx - 1] in truth:
            rel += 1
            ap += rel / idx
    return ap / len(truth)


# Module-level setup: locate the dataset's "intermediate" folder relative to
# this source file and load the lookup maps through the project's `util`
# helpers. All statements below execute at module top level.
FLAGS_USE_TYPE = True  # not read anywhere in this visible block
# Directory that contains this file (realpath resolves symlinks first).
cur_dir = os.path.dirname(os.path.realpath(__file__))
data = "wiki"
print('dataset:%s' % data)
# Path suffix only: it starts with '/../../' and is appended to cur_dir at
# every load call below, so the data folder sits two levels above cur_dir.
folder = '/../../data/' + data + '/intermediate/'
start = time.time()  # opens the wall-clock window around the loads
print('loading eid and name maps')
# Returns two values, unpacked as forward (eid -> name) and reverse
# (name -> eid) maps — mapping direction presumed from the variable names.
eid2ename, ename2eid = util.loadEidToEntityMap(cur_dir + folder +
                                               'entity2id.txt')  # entity2id.txt
print('loading eid and skipgram maps')
# NOTE(review): the original trailing note says "eidSkipgramCount.txt" but the
# file actually opened is the "reduced_" variant — confirm which is intended.
eid2patterns, pattern2eids = util.loadFeaturesAndEidMap(
    cur_dir + folder + 'reduced_eidSkipgramCounts.txt')  # eidSkipgramCount.txt
print('loading skipgram strength map')
# The meaning of idx=-1 is defined inside util.loadWeightByEidAndFeatureMap
# and is not visible here — TODO confirm (possibly a column selector).
eidAndPattern2strength = util.loadWeightByEidAndFeatureMap(
    cur_dir + folder + 'setexpan_eidSkipgram2TFIDFStrength.txt',
    idx=-1)  # (eid, feature, weight) file
print('loading eid and type maps')
# Same loader as for skipgrams, applied to the type-count file.
eid2types, type2eids = util.loadFeaturesAndEidMap(
    cur_dir + folder + 'eidTypeCounts.txt')  # eidTypeCount.txt
print('loading type strength map')
eidAndType2strength = util.loadWeightByEidAndFeatureMap(
    cur_dir + folder + 'eidType2TFIDFStrength.txt',
    idx=-1)  # (eid, feature, weight) file
end = time.time()  # closes the timing window; no report of it is visible here
    for idx in range(1, len(lst) + 1):
        if lst[idx - 1] in truth:
            rel += 1
            ap += rel / idx
    return ap / len(truth)


# Module-level setup: build the dataset's "intermediate" folder path, load the
# lookup maps through the project's `util` helpers, and report the load time.
# All statements below execute at module top level.
FLAGS_USE_TYPE = True  # not read anywhere in this visible block
# Parent of the current working directory, so the resolved data path depends
# on where the process is launched from rather than on this file's location.
cur_dir = os.path.dirname(os.getcwd())
dataset = "wiki2"
print('dataset:%s' % dataset)
# Full folder path (already prefixed with cur_dir), ending with a slash so
# file names can be appended directly.
folder = cur_dir + '/data/{}/intermediate/'.format(dataset)
start = time.time()  # opens the wall-clock window around the loads
print('data folder: {}'.format(folder))
print('loading eid and name maps')
# Returns two values, unpacked as forward (eid -> name) and reverse
# (name -> eid) maps — mapping direction presumed from the variable names.
eid2ename, ename2eid = util.loadEidToEntityMap(folder + 'entity2id.txt')
print('loading eid and skipgram maps')
eid2patterns, pattern2eids = util.loadFeaturesAndEidMap(
    folder + 'reduced_eidSkipgramCounts.txt')
print('loading skipgram strength maps')
# The meaning of idx=-1 is defined inside util.loadWeightByEidAndFeatureMap
# and is not visible here — TODO confirm (possibly a column selector).
eidAndPattern2strength = util.loadWeightByEidAndFeatureMap(
    folder + 'setexpan_eidSkipgram2TFIDFStrength.txt', idx=-1)
print('loading eid and type maps')
# Same loader as for skipgrams, applied to the type-count file.
eid2types, type2eids = util.loadFeaturesAndEidMap(folder + 'eidTypeCounts.txt')
print('loading type strength maps')
eidAndType2strength = util.loadWeightByEidAndFeatureMap(
    folder + 'eidType2TFIDFStrength.txt', idx=-1)
end = time.time()
# Elapsed wall-clock seconds for all five loads above.
print("Finish loading all dataset, using %s seconds" % (end - start))

# Starts empty; the code that fills or reads it is outside this visible block.
good_gold_set = {}
Ejemplo n.º 3
0
def main():
    """Optionally build the search index, then expand a fixed seed set.

    When ``FLAGS_FIRST_RUN`` is truthy:
      1. if ``ES.check_existing_index`` returns a truthy value, the three
         index types (skipgram2eid, eid2skipgram, eid2eid) are created;
      2. skipgram/entity counts are loaded from ``eidSkipgramFilePath``;
      3. entity-entity similarity is computed from those counts;
      4. all three structures are indexed and ``match_all`` is called on
         each type.

    In every case the entity-name and entity-type maps are then loaded, the
    hard-coded seed names are lower-cased and looked up in ``ename2eid``,
    ``setExpan`` is run with fixed parameters, and each returned item is
    printed as ``item[0]``, its entity name, and ``item[1]``.

    Takes no arguments and returns ``None``; all configuration comes from
    module-level names (``FLAGS_*``, file-path globals).
    """
    client = ES()

    if FLAGS_FIRST_RUN:
        # Index creation is gated on the return value of check_existing_index;
        # what a truthy result means is defined by ES, not visible here.
        check_result = client.check_existing_index(
            index_name=FLAGS_CORPUS_NAME, delete_existing=False)
        if check_result:
            client.create_skipgram2eid_index(index_name=FLAGS_CORPUS_NAME,
                                             type_name="skipgram2eid")
            client.create_eid2skipgram_index(index_name=FLAGS_CORPUS_NAME,
                                             type_name="eid2skipgram")
            client.create_eid2eid_index(index_name=FLAGS_CORPUS_NAME,
                                        type_name="eid2eid")

        # Load the raw skipgram/entity co-occurrence data and time it.
        load_began = time.time()
        loaded = util.load_skipgram2eidcounts(eidSkipgramFilePath)
        skipgram_ids, skipgram_to_eid_counts, eid_to_skipgram_counts = loaded
        load_elapsed = time.time() - load_began
        print("[INFO] Loading data using time %s (seconds)" % load_elapsed)

        # Derive entity-entity similarity from the skipgram counts and time it.
        sim_began = time.time()
        eid_pair_strength = util.calculateEidSimilarity(skipgram_to_eid_counts)
        sim_elapsed = time.time() - sim_began
        print("[INFO] Calculating eid-eid similarity using time %s (seconds)" %
              sim_elapsed)

        # Push the three structures into the index, same order as created.
        client.index_skipgram2eid(index_name=FLAGS_CORPUS_NAME,
                                  type_name="skipgram2eid",
                                  skipgram2id=skipgram_ids,
                                  skipgram2eidcounts=skipgram_to_eid_counts)
        client.index_eid2skipgram(index_name=FLAGS_CORPUS_NAME,
                                  type_name="eid2skipgram",
                                  eid2skipgramcounts=eid_to_skipgram_counts)
        client.index_eid2eid(index_name=FLAGS_CORPUS_NAME,
                             type_name="eid2eid",
                             eid2eid_w_strength=eid_pair_strength)

        # One match_all call per type, in the original order.
        for doc_type in ("skipgram2eid", "eid2skipgram", "eid2eid"):
            client.match_all(index_name=FLAGS_CORPUS_NAME, type_name=doc_type)

    eid2ename, ename2eid = util.loadEidToEntityMap(eidEnameFilePath)
    eid2types = util.loadEidToTypeMap(eidTypeFilePath, ename2eid=ename2eid)

    # Active seed set: TV channels (original author's note: good performance).
    # Alternative seed sets kept from the original, with the author's notes:
    #   ["NBA", "NCAA", "NFL"]                                   sports league, good performance
    #   ["Twitter", "Microsoft", "Lenovo", "Toyota", "Qualcomm"] company, good performance
    #   ["Toyota", "Hyundai", "Mazda", "Chrysler", "Ford"]       car company (top-30, avg.rank=10), good performance
    #   ["Google", "Facebook", "Microsoft", "Amazon", "Twitter"] high tech company, good performance
    #   ["United States", "China", "Japan", "germany", "England", "Russia", "India"]  country, using dist.sim
    #   ["Illinois", "Texas", "California", "Ohio", "Maryland"]  state, using dist.sim
    seed_names = ["BBC", "HBO", "CNN", "Fox", "Channel 4"]

    # Each seed is looked up by its lower-cased name and paired with 0.0.
    # The lookup is a plain subscript, so an unknown name is not handled here.
    seeds_with_confidence = []
    for name in seed_names:
        seeds_with_confidence.append((ename2eid[name.lower()], 0.0))

    negative_seeds = set()
    expan_params = SetExpanParams(index_name=FLAGS_CORPUS_NAME,
                                  max_iter=10,
                                  ensemble_batch=10,
                                  num_of_top_skipgrams=150,
                                  num_of_top_candidate_eids=50,
                                  feature_subset_size_ratio=0.8,
                                  average_rank=10,
                                  skipgramDistLower=3,
                                  skipgramDistUpper=30,
                                  use_type=False)

    expan_began = time.time()
    # The second element of the result is unpacked but not used afterwards.
    expanded, _stop_iter = setExpan(es=None or client,
                                    seedEidsWithConfidence=None or seeds_with_confidence,
                                    negativeSeedEids=negative_seeds,
                                    eid2ename=eid2ename,
                                    eid2types=eid2types,
                                    params=expan_params,
                                    FLAGS_DEBUG=False) if False else setExpan(
                                        client,
                                        seeds_with_confidence,
                                        negative_seeds,
                                        eid2ename,
                                        eid2types,
                                        expan_params,
                                        FLAGS_DEBUG=False)
    expan_elapsed = time.time() - expan_began
    print("[INFO!!!] Finish SetExpan++ in %s seconds" % expan_elapsed)

    for item in expanded:
        eid = item[0]
        print(eid, eid2ename[eid], item[1])