# NOTE(review): the next statements are the tail of a function whose `def`
# line (and the initialisation of `rel`) lies before the visible source.
# The source was whitespace-collapsed, so the nesting below is reconstructed:
# `ap += rel / idx` is assumed to sit inside the `if` -- TODO confirm.
    ap = 0
    # idx is a 1-based rank; lst[idx - 1] is the item at that rank.
    for idx in range(1, len(lst) + 1):
        if lst[idx - 1] in truth:
            rel += 1
            ap += rel / idx
    # The accumulated sum is divided by the size of `truth`.
    return ap / len(truth)


FLAGS_USE_TYPE = True

# Directory containing this script (symlinks resolved by realpath).
cur_dir = os.path.dirname(os.path.realpath(__file__))
data = "wiki"
print('dataset:%s' % data)
# Relative suffix appended to cur_dir for every input file below.
folder = '/../../data/' + data + '/intermediate/'

# `start`/`end` bracket the loading calls below with wall-clock timestamps.
start = time.time()
print('loading eid and name maps')
eid2ename, ename2eid = util.loadEidToEntityMap(cur_dir + folder + 'entity2id.txt')  # entity2id.txt
print('loading eid and skipgram maps')
eid2patterns, pattern2eids = util.loadFeaturesAndEidMap(
    cur_dir + folder + 'reduced_eidSkipgramCounts.txt')  # eidSkipgramCount.txt
print('loading skipgram strength map')
# NOTE(review): meaning of idx=-1 is defined by util.loadWeightByEidAndFeatureMap
# (not visible here) -- confirm before changing.
eidAndPattern2strength = util.loadWeightByEidAndFeatureMap(
    cur_dir + folder + 'setexpan_eidSkipgram2TFIDFStrength.txt', idx=-1)  # (eid, feature, weight) file
print('loading eid and type maps')
eid2types, type2eids = util.loadFeaturesAndEidMap(
    cur_dir + folder + 'eidTypeCounts.txt')  # eidTypeCount.txt
print('loading type strength map')
eidAndType2strength = util.loadWeightByEidAndFeatureMap(
    cur_dir + folder + 'eidType2TFIDFStrength.txt', idx=-1)  # (eid, feature, weight) file
end = time.time()
# NOTE(review): the next statements are the tail of a function whose `def`
# line (and the initialisation of `rel` and `ap`) lies outside the visible
# source. The source was whitespace-collapsed, so the nesting below is
# reconstructed: `ap += rel / idx` is assumed to sit inside the `if` --
# TODO confirm.
    # idx is a 1-based rank; lst[idx - 1] is the item at that rank.
    for idx in range(1, len(lst) + 1):
        if lst[idx - 1] in truth:
            rel += 1
            ap += rel / idx
    # The accumulated sum is divided by the size of `truth`.
    return ap / len(truth)


FLAGS_USE_TYPE = True

# Parent of the current working directory (not of this file's location).
cur_dir = os.path.dirname(os.getcwd())
dataset = "wiki2"
print('dataset:%s' % dataset)
# Prefix used for every input file below.
folder = cur_dir + '/data/{}/intermediate/'.format(dataset)

# `start`/`end` bracket the loading calls below with wall-clock timestamps.
start = time.time()
print('data folder: {}'.format(folder))
print('loading eid and name maps')
eid2ename, ename2eid = util.loadEidToEntityMap(folder + 'entity2id.txt')
print('loading eid and skipgram maps')
eid2patterns, pattern2eids = util.loadFeaturesAndEidMap(
    folder + 'reduced_eidSkipgramCounts.txt')
print('loading skipgram strength maps')
# NOTE(review): meaning of idx=-1 is defined by util.loadWeightByEidAndFeatureMap
# (not visible here) -- confirm before changing.
eidAndPattern2strength = util.loadWeightByEidAndFeatureMap(
    folder + 'setexpan_eidSkipgram2TFIDFStrength.txt', idx=-1)
print('loading eid and type maps')
eid2types, type2eids = util.loadFeaturesAndEidMap(folder + 'eidTypeCounts.txt')
print('loading type strength maps')
eidAndType2strength = util.loadWeightByEidAndFeatureMap(
    folder + 'eidType2TFIDFStrength.txt', idx=-1)
end = time.time()
print("Finish loading all dataset, using %s seconds" % (end - start))

good_gold_set = {}
def main():
    """Optionally build the ES indexes, then run setExpan on a fixed seed list.

    Steps, in order:
      1. Create an ``ES`` helper; when ``FLAGS_FIRST_RUN`` is set, create the
         three index types, load the skipgram counts, compute eid-eid
         similarity and index everything.
      2. Load the entity-name and entity-type maps.
      3. Look up the hard-coded ``userInput`` names, call ``setExpan`` and
         print each returned element.

    NOTE(review): the source was whitespace-collapsed, so the nesting under
    ``if FLAGS_FIRST_RUN:`` and the inner ``if`` is reconstructed. The
    ``index_*`` calls need names that are only assigned in the loading step,
    but where the ``match_all`` calls belong is a guess -- TODO confirm
    against the original file.
    """
    ## create index
    es = ES()
    if FLAGS_FIRST_RUN:
        # NOTE(review): presumably truthy when it is safe to create the
        # indexes -- confirm against ES.check_existing_index.
        if es.check_existing_index(index_name=FLAGS_CORPUS_NAME, delete_existing=False):
            es.create_skipgram2eid_index(index_name=FLAGS_CORPUS_NAME, type_name="skipgram2eid")
            es.create_eid2skipgram_index(index_name=FLAGS_CORPUS_NAME, type_name="eid2skipgram")
            es.create_eid2eid_index(index_name=FLAGS_CORPUS_NAME, type_name="eid2eid")

        # Time the loading of the skipgram/eid count structures.
        start = time.time()
        skipgram2id, skipgram2eidcounts, eid2skipgramcounts = util.load_skipgram2eidcounts(
            eidSkipgramFilePath)
        end = time.time()
        print("[INFO] Loading data using time %s (seconds)" % (end - start))

        # Time the eid-eid similarity computation.
        start = time.time()
        eid2eid_w_strength = util.calculateEidSimilarity(skipgram2eidcounts)
        end = time.time()
        print("[INFO] Calculating eid-eid similarity using time %s (seconds)" % (end - start))

        # Push the three structures into their matching index types.
        es.index_skipgram2eid(index_name=FLAGS_CORPUS_NAME, type_name="skipgram2eid", skipgram2id=skipgram2id, skipgram2eidcounts=skipgram2eidcounts)
        es.index_eid2skipgram(index_name=FLAGS_CORPUS_NAME, type_name="eid2skipgram", eid2skipgramcounts=eid2skipgramcounts)
        es.index_eid2eid(index_name=FLAGS_CORPUS_NAME, type_name="eid2eid", eid2eid_w_strength=eid2eid_w_strength)

        # NOTE(review): placed inside the first-run branch as a post-indexing
        # check; these calls may belong at function level -- TODO confirm.
        es.match_all(index_name=FLAGS_CORPUS_NAME, type_name="skipgram2eid")
        es.match_all(index_name=FLAGS_CORPUS_NAME, type_name="eid2skipgram")
        es.match_all(index_name=FLAGS_CORPUS_NAME, type_name="eid2eid")

    eid2ename, ename2eid = util.loadEidToEntityMap(eidEnameFilePath)
    eid2types = util.loadEidToTypeMap(eidTypeFilePath, ename2eid=ename2eid)

    # Alternative seed sets kept for manual experiments; only one is active.
    # userInput = ["NBA", "NCAA", "NFL"] # sports league, good performance
    userInput = ["BBC", "HBO", "CNN", "Fox", "Channel 4"] # TV Channel, good performance
    # userInput = ["Twitter", "Microsoft", "Lenovo", "Toyota", "Qualcomm"] # company, good performance
    # userInput = ["Toyota", "Hyundai", "Mazda", "Chrysler", "Ford"] # car company (top-30, avg.rank=10), good performance
    # userInput = ["Google", "Facebook", "Microsoft", "Amazon", "Twitter"] # high tech company, good performance
    # # userInput = ["United States", "China", "Japan", "germany", "England", "Russia", "India"] # country, using dist.sim
    # userInput = ["Illinois", "Texas", "California", "Ohio", "Maryland"] # state, using dist.sim

    # Names are lower-cased before lookup; each seed is paired with 0.0.
    # NOTE(review): if ename2eid is a plain dict, a seed name missing from it
    # raises KeyError here -- confirm that is acceptable.
    seedEidsWithConfidence = [(ename2eid[ele.lower()], 0.0) for ele in userInput]
    negativeSeedEids = set()

    params = SetExpanParams(index_name=FLAGS_CORPUS_NAME, max_iter=10, ensemble_batch=10, num_of_top_skipgrams=150, num_of_top_candidate_eids=50, feature_subset_size_ratio=0.8, average_rank=10, skipgramDistLower=3, skipgramDistUpper=30, use_type=False)

    start = time.time()
    (expanded_eids, stop_iter) = setExpan(es, seedEidsWithConfidence, negativeSeedEids, eid2ename, eid2types, params, FLAGS_DEBUG=False)
    end = time.time()
    print("[INFO!!!] Finish SetExpan++ in %s seconds" % (end - start))

    # Each element is indexed as [0] (an eid, used to look up the name) and
    # [1] (presumably a score -- TODO confirm against setExpan's return value).
    for ele in expanded_eids:
        print(ele[0], eid2ename[ele[0]], ele[1])