import pickle
from collections import defaultdict

# clean_uri, rdf2fb, www2fb, processed_text, strip_accents, get_name_ngrams
# and get_all_entity_mids are assumed to come from the project's util module.

def get_mid2wiki(filename):
    # Map each Freebase MID that has a Wikipedia link to True; lookups of
    # unseen MIDs default to False via defaultdict(bool).
    mid2wiki = defaultdict(bool)
    with open(filename) as fin:
        for line in fin:
            items = line.strip().split('\t')
            sub = rdf2fb(clean_uri(items[0]))
            mid2wiki[sub] = True
    return mid2wiki
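# Usage sketch (hypothetical path and MID): the input is a TSV whose first
# column is an entity URI that clean_uri/rdf2fb can normalize.
mid2wiki = get_mid2wiki("data/mid2wikipedia.tsv")
print(mid2wiki["fb:m.0abc12"])  # True only if that MID appeared in the file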
def get_names_for_entities(namespath):
    # Build a map from entity MID to all of its surface names (lowercased).
    print("getting names map...")
    names = {}
    with open(namespath, 'r') as f:
        for i, line in enumerate(f):
            if i % 1000000 == 0:
                print("line: {}".format(i))
            items = line.strip().split("\t")
            if len(items) != 4:
                print("ERROR: line - {}".format(line))
                continue
            entity = clean_uri(items[0])
            literal = clean_uri(items[2]).lower()
            names.setdefault(entity, []).append(literal)
    return names
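# Usage sketch (hypothetical path and MID): the returned dict maps each
# entity MID to all of its lowercased surface names.
names_map = get_names_for_entities("data/names.trimmed.txt")
print(names_map.get("fb:m.0abc12", []))  # e.g. ['some name', 'an alias']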
def trim_names(fbsubsetpath, namespath, outpath):
    # Keep only the name lines whose entity MID appears in the Freebase subset.
    print("getting all entity MIDs from Freebase subset...")
    mids_to_check = get_all_entity_mids(fbsubsetpath)
    print("trimming names...")
    with open(outpath, 'w') as outfile, open(namespath, 'r') as f:
        for i, line in enumerate(f):
            if i % 1000000 == 0:
                print("line: {}".format(i))
            items = line.strip().split("\t")
            if len(items) != 4:
                print("ERROR: line - {}".format(line))
                continue
            entity = www2fb(clean_uri(items[0]))
            if entity in mids_to_check:
                outfile.write(line)
def create_inverted_index_entity(namespath, outpath):
    # Index every n-gram of every entity name to the set of MIDs bearing it.
    print("creating the index map...")
    index = {}
    size = 0
    with open(namespath, 'r') as f:
        for i, line in enumerate(f):
            if i % 1000000 == 0:
                print("line: {}".format(i))
            items = line.strip().split("\t")
            if len(items) != 4:
                print("ERROR: line - {}".format(line))
                continue
            entity_mid = clean_uri(items[0])
            entity_name = clean_uri(items[2])
            name_ngrams = get_name_ngrams(entity_name)
            for ngram_tuple in name_ngrams:
                size += 1
                ngram = strip_accents(" ".join(ngram_tuple))
                index.setdefault(ngram, set()).add(entity_mid)
    print("num keys: {}".format(len(index)))
    print("total key-value pairs: {}".format(size))
    print("dumping to pickle...")
    with open(outpath, 'wb') as f:
        pickle.dump(index, f)
    print("DONE")
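# Usage sketch (hypothetical paths): build the index once, then load the
# pickle and fetch candidate MIDs for a question n-gram.
create_inverted_index_entity("data/names.trimmed.txt", "data/entity_index.pkl")
with open("data/entity_index.pkl", 'rb') as f:
    index = pickle.load(f)
print(index.get("justin bieber", set()))  # all MIDs whose name yields this n-gram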
# (continuation of the candidate-matching loop: for question i, `mids` holds
# the MIDs retrieved from the inverted index for the current n-gram `name`)
        match_mid_list.extend(mids)
        for mid in mids:
            if mid_dic.get(mid) is not None:
                tuplelist.append((mid, name))
    tupleset.extend(tuplelist)
    head_mid_idx[i] = list(set(tuplelist))  # deduplicated (mid, name) candidates
    if tuplelist:
        id_match.add(i)
tupleset = set(tupleset)

# Keep only the (mid, name) pairs that are the canonical topic name in FB5M.
tuple_topic = []
with open('data/FB5M.name.txt', 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i % 1000000 == 0:
            print("line: {}".format(i))
        items = line.strip().split("\t")
        pair = (www2fb(clean_uri(items[0])), processed_text(clean_uri(items[2])))
        if pair in tupleset and items[1] == "<fb:type.object.name>":
            tuple_topic.append(pair)
tuple_topic = set(tuple_topic)

######################## Learn entity representation ########################
head_emb = np.zeros((total_num, args.embed_dim))
TEXT = data.Field(lower=True)
ED = data.Field(sequential=False, use_vocab=False)
train, dev = data.TabularDataset.splits(
    path=args.output, train='entity_train.txt', validation='entity_valid.txt',
    format='tsv', fields=[('text', TEXT), ('mid', ED)])
field = [('id', None), ('sub', None), ('entity', None), ('relation', None),
         ('obj', None), ('text', TEXT), ('ed', None)]
test = data.TabularDataset(path=os.path.join(args.output, 'test.txt'),
                           format='tsv', fields=field)
TEXT.build_vocab(train, dev, test)  # training data includes validation data

# load the model (onto the CPU when args.gpu == -1)
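# A minimal sketch of the loading step the comment above refers to, assuming
# the detector was saved whole with torch.save(model, path); the checkpoint
# attribute name args.entity_model is hypothetical.
import torch

if args.gpu == -1:  # load all tensors onto the CPU
    model = torch.load(args.entity_model, map_location=lambda storage, loc: storage)
else:
    model = torch.load(args.entity_model, map_location=lambda storage, loc: storage.cuda(args.gpu))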
with open(os.path.join(args.dataset, "annotated_wd_data_train_answerable.txt"), 'r') as f:
    for i, line in enumerate(f):
        items = line.strip().split("\t")
        if len(items) != 4:
            print("ERROR: line - {}".format(line))
            break
        entiset.add(items[0])
        # entiset.add(www2fb(items[2]))

# write the trimmed names file: keep a name line if its entity is already in
# entiset, or if the name matches a question n-gram (then add the entity too)
with open(os.path.join(args.output, 'names.trimmed.txt'), 'w') as outfile, \
        open(args.names, 'r') as f:
    for i, line in enumerate(f):
        if i % 1000000 == 0:
            print("line: {}".format(i))
        items = line.strip().split("\t")
        if len(items) != 3:
            print("ERROR: line - {}".format(line))
            continue
        entity = clean_uri(items[0])
        if entity in fb_mids:
            name = processed_text(clean_uri(items[2]))
            if name.strip() != "":
                if entity in entiset:
                    outfile.write("{}\t{}\n".format(entity, name))
                elif name in gramset:
                    entiset.add(entity)
                    outfile.write("{}\t{}\n".format(entity, name))
        # earlier draft: also index every contiguous (len - 1)-token window
        # name_gram = [name]
        # tokens = name.split()
        # maxlen = len(tokens)
        # if maxlen > 2:
        #     j = maxlen - 1
        #     for token in [tokens[idx:idx + j] for idx in range(maxlen - j + 1)]:
        #         name_gram.append(' '.join(token))
for i, line in enumerate(f):
    items = line.strip().split("\t")
    if len(items) != 4:
        print("ERROR: line - {}".format(line))
        break
    entiset.add(www2fb(items[0]))
    # entiset.add(www2fb(items[2]))

# Freebase variant of the trimming step above: here the names file rows have
# four columns and MIDs are normalized with www2fb
with open(os.path.join(args.output, 'names.trimmed.txt'), 'w') as outfile, \
        open(args.names, 'r') as f:
    for i, line in enumerate(f):
        if i % 1000000 == 0:
            print("line: {}".format(i))
        items = line.strip().split("\t")
        if len(items) != 4:
            print("ERROR: line - {}".format(line))
            continue
        entity = www2fb(clean_uri(items[0]))
        if entity in fb_mids:
            name = processed_text(clean_uri(items[2]))
            if name.strip() != "":
                if entity in entiset:
                    outfile.write("{}\t{}\n".format(entity, name))
                elif name in gramset:
                    entiset.add(entity)
                    outfile.write("{}\t{}\n".format(entity, name))
        # earlier draft: also index every contiguous (len - 1)-token window
        # name_gram = [name]
        # tokens = name.split()
        # maxlen = len(tokens)
        # if maxlen > 2:
        #     j = maxlen - 1
        #     for token in [tokens[idx:idx + j] for idx in range(maxlen - j + 1)]:
        #         name_gram.append(' '.join(token))
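# A working sketch of the commented-out n-gram idea above: collect the full
# name plus every contiguous window of (len - 1) tokens, as the draft intended.
def name_grams(name):
    grams = [name]
    tokens = name.split()
    maxlen = len(tokens)
    if maxlen > 2:
        j = maxlen - 1
        for window in (tokens[idx:idx + j] for idx in range(maxlen - j + 1)):
            grams.append(' '.join(window))
    return grams

# name_grams("the lord of the rings")
# -> ['the lord of the rings', 'the lord of the', 'lord of the rings']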