def processDoc(line, dicts, config=defualtconfig):
    """Collect entity candidate windows for a single document record.

    The value fields of ``line`` are joined with single spaces, lowercased,
    stripped and have every space removed; the result is cut into n-grams
    whose joined text is looked up in the inverted list. When at least one
    n-gram is known, ``singleheap.getcandidates`` is asked for candidates
    and they are grouped per entity id.

    Args:
        line: Mapping for one document. Read through
            ``config["document"]["id_attribute"]`` and every name in
            ``config["document"]["value_attribute"]``.
        dicts: Indexable holding seven entries (positions 0-6): inverted
            list, inverted index, entity token counts, inverted-list
            lengths, entity real ids, entity real values and the maximum
            entity length. They are forwarded to ``singleheap.getcandidates``.
        config: Mapping providing ``"threshold"``, ``"token_size"`` and the
            ``"document"`` sub-mapping described above.

    Returns:
        An empty dict when no n-gram of the document is a key of the
        inverted list. Otherwise a dict with ``"document"`` (``"id"`` and
        ``"value"``) and ``"entities"``, which maps each entity id to its
        ``"value"`` and a ``"candwins"`` list of
        ``{"start", "end", "score"}`` dicts.
    """
    # Pull the seven lookup structures out of ``dicts`` by position.
    (inverted_list, inverted_index, entity_tokennum, inverted_list_len,
     entity_realid, entity_real, maxenl) = [dicts[slot] for slot in range(7)]

    threshold = config["threshold"]
    doc_settings = config["document"]
    value_fields = doc_settings["value_attribute"]
    n = config["token_size"]
    documentId = line[doc_settings["id_attribute"]]

    # The original (un-normalised) text is what gets reported back.
    document_real = line[value_fields[0]]
    for extra_field in value_fields[1:]:
        document_real += " " + line[extra_field]

    normalized = document_real.lower().strip().replace(" ", "")
    keys = ["".join(gram) for gram in ngrams(normalized, n)]
    los = len(keys)

    # Seed the heap with the first posting of every n-gram that is indexed.
    heap = []
    for position, key in enumerate(keys):
        try:
            first_posting = inverted_list[key][0]
        except KeyError:
            continue
        heap.append([first_posting, position])

    if not heap:
        return {}

    candidates = singleheap.getcandidates(
        heap, entity_tokennum, inverted_list_len, inverted_index,
        inverted_list, keys, los, maxenl, threshold)

    entities = {}
    jsonline = {
        "document": {"id": documentId, "value": document_real},
        "entities": entities,
    }
    for value in candidates:
        # NOTE(review): the constant 2 added to the end may be tied to a
        # token size of 2 - confirm against singleheap's output format.
        window = {
            "start": value[1],
            "end": value[2] + 2,
            "score": value[3],
        }

        # Try the string form of the entity key first, then the raw value.
        value_o = str(value[0])
        try:
            entity_id = entity_realid[value_o]
        except KeyError:
            value_o = value[0]
            entity_id = entity_realid[value_o]

        try:
            entities[entity_id]["candwins"].append(window)
        except KeyError:
            entities[entity_id] = {
                "value": entity_real[value_o],
                "candwins": [window],
            }
    return jsonline
def processDoc(line, dicts, config=defualtconfig):
    """Match one document record against the dictionary and group the hits.

    Args:
        line: Mapping for the document; its id is read with
            ``config["document"]["id_attribute"]`` and its text is built
            from the fields named in ``config["document"]["value_attribute"]``
            (joined with a single space).
        dicts: Indexable whose positions 0-6 hold, in order, the inverted
            list, inverted index, entity token counts, inverted-list
            lengths, entity real ids, entity real values and the maximum
            entity length.
        config: Mapping with ``"threshold"``, ``"token_size"`` and
            ``"document"`` entries.

    Returns:
        ``{}`` if none of the document's n-grams is present in the inverted
        list; otherwise ``{"document": {"id", "value"}, "entities": {...}}``
        where every entity id maps to ``{"value", "candwins"}`` and each
        candidate window carries ``"start"``, ``"end"`` and ``"score"``.
    """
    inverted_list, inverted_index, entity_tokennum, inverted_list_len = (
        dicts[0], dicts[1], dicts[2], dicts[3])
    entity_realid, entity_real, maxenl = dicts[4], dicts[5], dicts[6]

    threshold = config["threshold"]
    text_fields = config["document"]["value_attribute"]
    gram_size = config["token_size"]
    documentId = line[config["document"]["id_attribute"]]

    document_real = line[text_fields[0]]
    for name in text_fields[1:]:
        document_real += " " + line[name]

    # Matching runs on a lowercased copy with all spaces removed.
    squeezed = document_real.lower().strip().replace(" ", "")
    grams = list(ngrams(squeezed, gram_size))
    los = len(grams)

    heap = []
    keys = []
    for index, gram in enumerate(grams):
        gram_key = "".join(gram)
        keys.append(gram_key)
        try:
            head = inverted_list[gram_key][0]
        except KeyError:
            # n-gram never seen in the dictionary: contributes no heap entry.
            pass
        else:
            heap.append([head, index])

    jsonline = {}
    if heap:
        found = singleheap.getcandidates(
            heap, entity_tokennum, inverted_list_len, inverted_index,
            inverted_list, keys, los, maxenl, threshold)

        jsonline["document"] = {"id": documentId, "value": document_real}
        grouped = jsonline["entities"] = {}

        for value in found:
            # NOTE(review): "+ 2" looks like it assumes 2-character tokens;
            # verify before using a different token_size.
            hit = dict(start=value[1], end=value[2] + 2, score=value[3])

            # Entity keys may be stored as strings or as the raw value.
            value_o = str(value[0])
            try:
                entity_id = entity_realid[value_o]
            except KeyError:
                value_o = value[0]
                entity_id = entity_realid[value_o]

            existing = grouped.get(entity_id)
            if existing is None:
                grouped[entity_id] = {
                    "value": entity_real[value_o],
                    "candwins": [hit],
                }
            else:
                existing["candwins"].append(hit)
    return jsonline
def processDoc(self, line):
    """Find dictionary entity candidates in a piece of document text.

    Uses ``self.dictionary`` (positions 0-6: inverted list, inverted index,
    entity token counts, inverted-list lengths, entity real ids, entity
    real values, maximum entity length), ``self.threshold`` and
    ``self.token_size``.

    Args:
        line: The document text. It is lowercased and stripped before being
            split into n-grams; the unmodified text is echoed in the result.

    Returns:
        ``{}`` when no n-gram of the text is a key of the inverted list.
        Otherwise a dict with ``"document"`` (holding ``"value"``) and
        ``"entities"``, mapping each entity id to its ``"value"`` and a
        ``"candwins"`` list of ``{"start", "end", "score"}`` dicts.
    """
    lookups = self.dictionary
    (inverted_list, inverted_index, entity_tokennum, inverted_list_len,
     entity_realid, entity_real, maxenl) = (lookups[slot] for slot in range(7))
    threshold = self.threshold
    n = self.token_size

    document_real = line
    keys = ["".join(gram) for gram in ngrams(document_real.lower().strip(), n)]
    los = len(keys)

    # One heap entry per n-gram that has a posting list: [first posting, pos].
    heap = []
    for position, key in enumerate(keys):
        try:
            first_posting = inverted_list[key][0]
        except KeyError:
            continue
        heap.append([first_posting, position])

    if not heap:
        return {}

    matches = singleheap.getcandidates(
        heap, entity_tokennum, inverted_list_len, inverted_index,
        inverted_list, keys, los, maxenl, threshold)

    entities = {}
    jsonline = {"document": {"value": document_real}, "entities": entities}
    for value in matches:
        # NOTE(review): the fixed "+ 2" may assume token_size == 2 - confirm.
        window = {"start": value[1], "end": value[2] + 2, "score": value[3]}

        # Look the entity up by its string key, falling back to the raw key.
        value_o = str(value[0])
        try:
            entity_id = entity_realid[value_o]
        except KeyError:
            value_o = value[0]
            entity_id = entity_realid[value_o]

        try:
            entities[entity_id]["candwins"].append(window)
        except KeyError:
            entities[entity_id] = {
                "value": entity_real[value_o],
                "candwins": [window],
            }
    return jsonline
def run(dictfile, inputfile, dictfileds, docfileds, n=2, threshold=0.8):
    """Index a dictionary file, then print entity candidates per document.

    Both files are read line by line and every line is parsed with
    ``json.loads``. Each record must provide ``"uri"`` (the part after the
    last ``/`` becomes the id) and ``"name"``, plus every extra field listed
    in ``dictfileds`` / ``docfileds``.

    Args:
        dictfile: Path of the dictionary file (one JSON object per line).
        inputfile: Path of the document file (one JSON object per line).
        dictfileds: Extra dictionary fields, each written as
            ``"outer.inner"`` and read as ``record[outer][inner]``; their
            values are appended to the entity name separated by a space.
        docfileds: Same as ``dictfileds`` but for document records.
        n: Size of the n-grams produced by ``ngrams``.
        threshold: Accepted for interface compatibility; this function does
            not use it and does not forward it to
            ``singleheap.getcandidates``.

    Returns:
        None. One JSON object per document is printed to standard output,
        shaped as ``{"document": {"id", "value"}, "entities": {...}}``.
    """
    inverted_list = {}      # str(n-gram) -> list of entity indices
    inverted_index = []     # entity index -> normalised entity text
    entity_tokennum = {}    # normalised entity text -> number of n-grams
    inverted_list_len = {}  # str(n-gram) -> length of its posting list
    entity_realid = {}      # entity index -> id taken from the uri
    entity_real = {}        # entity index -> original entity text
    maxenl = 0              # largest n-gram count of any entity

    # ``with`` guarantees the file handle is closed even if a record is bad.
    with open(dictfile) as dict_handle:
        for i, raw in enumerate(dict_handle):
            record = json.loads(raw)
            entity_realid[i] = record["uri"].split("/")[-1]
            entity_real[i] = record["name"]
            for filed in dictfileds:
                path = filed.split(".")
                entity_real[i] += " " + record[path[0]][path[1]]

            entity = entity_real[i].lower().strip()
            inverted_index.append(entity)

            tokens = list(ngrams(entity, n))
            entity_tokennum[entity] = len(tokens)
            maxenl = max(maxenl, len(tokens))

            # Each distinct n-gram of the entity gets exactly one posting.
            for token in set(tokens):
                token = str(token)
                inverted_list.setdefault(token, []).append(i)
                inverted_list_len[token] = inverted_list_len.get(token, 0) + 1

    with open(inputfile) as input_handle:
        for raw in input_handle:
            record = json.loads(raw)
            documentId = record["uri"].split("/")[-1]
            document_real = record["name"]
            for filed in docfileds:
                path = filed.split(".")
                document_real += " " + record[path[0]][path[1]]

            document = document_real.lower().strip()
            entities = {}
            jsonline = {
                "document": {"id": documentId, "value": document_real},
                "entities": entities,
            }

            tokens = list(ngrams(document, n))
            keys = [str(token) for token in tokens]
            los = len(tokens)

            # Seed the heap with the first posting of every indexed n-gram.
            heap = []
            for position, key in enumerate(keys):
                postings = inverted_list.get(key)
                if postings is not None:
                    heap.append([postings[0], position])

            if heap:
                candidates = singleheap.getcandidates(
                    heap, entity_tokennum, inverted_list_len, inverted_index,
                    inverted_list, keys, los, maxenl)
                for value in candidates:
                    window = {
                        "start": value[1],
                        "end": value[2],
                        "score": value[3],
                    }
                    entity_key = value[0]
                    entity_id = entity_realid[entity_key]
                    entry = entities.get(entity_id)
                    if entry is None:
                        entities[entity_id] = {
                            "value": entity_real[entity_key],
                            "candwins": [window],
                        }
                    else:
                        entry["candwins"].append(window)

            # Single-argument call form: valid on both Python 2 and Python 3.
            print(json.dumps(jsonline))