def test_parser():
    # Manual smoke/benchmark test: load the Stanford parser through NLTK,
    # time model start-up, parse one sample sentence, then batch-parse every
    # definition sentence in items_tagged_modified.json.
    # NOTE(review): relies on module-level os/json/codecs/re/datetime and the
    # sibling helper cut_list -- confirm against the file's imports; all
    # paths are machine-specific (Windows).
    import nltk
    from nltk.parse import stanford
    from nltk.parse.stanford import StanfordParser
    os.environ[
        'STANFORD_PARSER'] = 'F:/eclipse_doctor/KnowledgeGraph/stanford-parser/stanford-parser.jar'
    os.environ[
        'STANFORD_MODELS'] = 'F:/eclipse_doctor/KnowledgeGraph/stanford-parser/stanford-parser-3.7.0-models.jar'
    java_path = "C:/ProgramData/Oracle/Java/javapath"
    os.environ['JAVAHOME'] = java_path
    # Time parser construction (loading the serialized PCFG model is the
    # expensive part).
    start = datetime.now()
    print start
    parser = stanford.StanfordParser(
        model_path=
        "F:/eclipse_doctor/KnowledgeGraph/stanford-parser/englishPCFG.ser.gz")
    end = datetime.now()
    print end
    # NOTE(review): .microseconds is only the sub-second component of the
    # timedelta, not the total elapsed time -- confirm this is intended.
    print "cost time: " + str((end - start).microseconds)
    sent = 'angulated abutment is an abutment whose body is not parallel to the long axis of the implant. It is utilized when the implant is at a different inclination in relation to the proposed prosthesis.'
    # Time a single two-sentence parse.
    start = datetime.now()
    print start
    trees = parser.parse(sent.split())
    end = datetime.now()
    print end
    print "cost time: " + str((end - start).microseconds)
    print 'len(trees)', len(list(trees))
    # Batch benchmark: parse every definition in the input JSON, sentence by
    # sentence ('.'-delimited), keeping the (lazy) parse results.
    path_project = os.path.abspath(
        os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data = path_project + os.sep + "input" + os.sep + "items_tagged_modified.json"
    data = json.load(codecs.open(path_data, encoding='UTF-8'))
    start_all = datetime.now()
    cnt = 0
    trees_all = []
    for item in data:
        pos2definition = item["pos2definition"]
        for pos2def in pos2definition:
            definition = pos2def["definition"]
            # Drop parenthesised asides before tokenising.
            definition_pure = re.sub(r'\([\s\S]*?\)', "", definition)
            text = nltk.word_tokenize(definition_pure)
            sents_pos_period = cut_list(text, ['.'])
            for sent_list in sents_pos_period:
                cnt += 1
                start = datetime.now()
                # print start
                trees = parser.parse(' '.join(sent_list).split())
                trees_all.append(trees)
                end = datetime.now()
                # print end
                # print "cost time: "+str((end - start).microseconds)
    end_all = datetime.now()
    print end_all
    # NOTE(review): avg_time divides whole seconds only, dropping the
    # sub-second remainder of the total -- confirm this is intended.
    sum_time = (end_all - start_all).seconds
    sum_time_mic = (end_all - start_all).microseconds
    avg_time = (end_all - start_all).seconds * 1.0 / cnt
    print sum_time, sum_time_mic, avg_time, cnt
def definition_restore(concept_real, def_tokens):
    """Re-insert the concept term (headword) into a definition's tokens.

    Dictionary definitions often omit the headword; this routine picks a
    splice strategy from how the definition starts and delegates the actual
    insertion to ``definition_restore_process``.

    Args:
        concept_real: tokens of the concept/headword (list of str).
        def_tokens: tokens of the definition body (list of str).

    Returns:
        str: the restored definition, tokens joined by single spaces.
    """
    final_tokens = []
    # Tag the concept once; every branch below needs the tagged form.
    concept_real_tokens = nltk.pos_tag(concept_real)
    if def_tokens[0] == 'in':
        # "in <context>, <definition>": keep the leading context clause
        # (through the first comma, when present) untouched.
        if ',' not in def_tokens:
            final_tokens.extend(
                definition_restore_process(concept_real_tokens, def_tokens))
        else:
            comma = def_tokens.index(',')
            final_tokens.extend(def_tokens[0:comma + 1])
            final_tokens.extend(
                definition_restore_process(concept_real_tokens,
                                           def_tokens[comma + 1:]))
    elif def_tokens[0] == ',':
        # Leading comma: restore only the part after it.
        final_tokens.extend(
            definition_restore_process(concept_real_tokens,
                                       def_tokens[def_tokens.index(',') + 1:]))
    elif def_tokens[0] == 'brand':
        # "brand name ..." definitions are kept verbatim.
        final_tokens.extend(def_tokens)
    else:
        seg_point = ['.']
        sents = cut_list(def_tokens, seg_point)
        if ';' in sents[0]:
            # First sentence holds ';'-separated senses: restore each chip
            # separately, re-joining with ';' and a final '.'.
            # NOTE(review): only sents[0] is processed here -- any later
            # sentences are dropped; confirm cut_list semantics make this
            # intentional before relying on it.
            sents_new = []
            chips = cut_list(sents[0], [';'])
            for i in range(len(chips) - 1):
                sents_new.extend(
                    definition_restore_process(concept_real_tokens, chips[i]))
                sents_new.append(';')
            sents_new.extend(
                definition_restore_process(concept_real_tokens, chips[-1]))
            sents_new.append('.')
            final_tokens.extend(sents_new)
        else:
            final_tokens.extend(
                definition_restore_process(concept_real_tokens, def_tokens))
    return ' '.join(final_tokens)
def test_parseDef(item, parser):
    """Parse every sentence of every definition attached to *item*.

    Args:
        item: dict carrying a "pos2definition" list of definition entries.
        parser: parser exposing ``.parse(tokens)`` (e.g. StanfordParser).

    Returns:
        list: one parse result per '.'-delimited sentence, in order.
    """
    import nltk
    parsed_trees = []
    for entry in item["pos2definition"]:
        # Strip parenthesised asides before tokenising.
        stripped = re.sub(r'\([\s\S]*?\)', "", entry["definition"])
        tokens = nltk.word_tokenize(stripped)
        for sentence_tokens in cut_list(tokens, ['.']):
            parsed_trees.append(parser.parse(' '.join(sentence_tokens).split()))
    return parsed_trees
def get_match_result(patterns, pattern2attributes, sent_pos):
    """Match the chosen patterns against a tagged sentence.

    If the sentence contains a ';' token (and is not a "See also ..."
    cross-reference) it is first cut at each semicolon and every chip is
    matched independently; otherwise the whole sentence is matched at once.

    Args:
        patterns: final patterns selected for this sentence.
        pattern2attributes: dict mapping pattern -> attribute name.
        sent_pos: sentence as a list of (word, tag) pairs.

    Returns:
        dict: attribute name -> extracted value string.
    """
    attributes2value = {}
    # Keep the original short-circuit: the 'See also' check only runs when a
    # semicolon is present, so very short sentences cannot IndexError here.
    if (';', ':') in sent_pos and not (sent_pos[0][0] == 'See'
                                       and sent_pos[1][0] == 'also'):
        for chip in cut_list(sent_pos, [(';', ':')]):
            value_pos = get_value_pos(patterns, chip)
            logger.info(str(patterns) + 'value_pos' + str(value_pos))
            _collect_pattern_values(patterns, pattern2attributes, chip,
                                    value_pos, attributes2value)
    else:
        value_pos = get_value_pos(patterns, sent_pos)
        logger.info(str(patterns) + ' value_pos: ' + str(value_pos))
        # print 'value_pos',value_pos
        _collect_pattern_values(patterns, pattern2attributes, sent_pos,
                                value_pos, attributes2value)
    return attributes2value


def _collect_pattern_values(patterns, pattern2attributes, tokens, value_pos,
                            attributes2value):
    # Shared body of both branches of get_match_result (was duplicated
    # verbatim): for every pattern -- scanned last-to-first, as in the
    # original -- join the words of its value span, glue punctuation onto
    # the preceding word, strip one trailing '.', ';' or ',', and store the
    # result under the pattern's attribute name.
    for i in range(len(patterns) - 1, -1, -1):
        begin, end = value_pos[i]
        sent = ""
        for word_tag in tokens[begin:end]:
            if word_tag[0] in ['.', ';', ',']:
                # Attach punctuation directly to the previous word.
                sent = sent.strip() + word_tag[0] + " "
            else:
                sent += word_tag[0] + " "
        value = sent.strip()
        if len(value) > 0 and value[-1] in [';', ',', '.']:
            value = value[:-1]
        attributes2value[pattern2attributes[patterns[i]]] = value
def process_definition(definition, pattern2attrubute, tagger):
    """Extract attribute->value pairs from one definition string.

    The definition is POS-tagged with *tagger*, split into sentences at
    '.' and ';', and each sentence is matched against the known patterns.

    NOTE(review): this file binds the name ``process_definition`` several
    times; only the last definition wins at import time.

    Args:
        definition: raw definition text.
        pattern2attrubute: dict mapping pattern -> attribute name (sic).
        tagger: POS tagger exposing ``.tag(tokens)``.

    Returns:
        dict: attribute name -> extracted value string.
    """
    attributes2value = {}
    # (Removed: unused `type = sys.getfilesystemencoding()`, which also
    # shadowed the builtin `type`.)
    def_pos = tagger.tag(seg_sent(definition).strip().split())
    logger.info(def_pos)
    # Sentence boundaries: tagged full stops and semicolons.
    seg_point = [('.', '.'), (';', ':')]
    sents_pos = cut_list(def_pos, seg_point)
    for sent_pos in sents_pos:
        logger.info("sent_pos: " + str(sent_pos))
        sent = produce_new_sent(sent_pos)
        logger.info("sent_cut: " + sent)
        candidate_patterns = find_candidate_pattern(pattern2attrubute.keys(),
                                                    sent_pos)
        logger.info("candidate_patterns: " + str(candidate_patterns))
        if len(candidate_patterns) == 0:
            continue
        choiced_patterns = choice_final_pattern(candidate_patterns, sent_pos)
        logger.info("choiced_patterns: " + str(choiced_patterns))
        attributes2value.update(
            get_match_result(choiced_patterns, pattern2attrubute, sent_pos))
        logger.info("attributes2value: " + str(attributes2value))
    logger.info("whole attributes2value: " + str(attributes2value))
    return attributes2value
def parseMosby():
    # Batch-parse every definition in the Mosby items file with the Stanford
    # parser, logging the parse trees. Machine-specific Windows paths.
    # path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    # path_data= path_project+os.sep+"input"+os.sep+"items_tagged_modified.json"
    path_data = "F:/eclipse_doctor/KnowledgeGraph/kg/books/extractor/data/items_modified.json"
    data = json.load(codecs.open(path_data, encoding='UTF-8'))
    import nltk
    from nltk.parse import stanford
    from nltk.parse.stanford import StanfordParser
    os.environ[
        'STANFORD_PARSER'] = 'F:/eclipse_doctor/KnowledgeGraph/stanford-parser/stanford-parser.jar'
    os.environ[
        'STANFORD_MODELS'] = 'F:/eclipse_doctor/KnowledgeGraph/stanford-parser/stanford-parser-3.7.0-models.jar'
    java_path = "C:/ProgramData/Oracle/Java/javapath"
    os.environ['JAVAHOME'] = java_path
    parser = stanford.StanfordParser(
        model_path=
        "F:/eclipse_doctor/KnowledgeGraph/stanford-parser/englishPCFG.ser.gz")
    cnt = 0
    for item in data:
        print "parse %dth item" % cnt
        cnt += 1
        # NOTE(review): hard-coded resume checkpoint -- presumably skips
        # items already parsed by an earlier (crashed?) run; confirm before
        # reusing.
        if cnt < 4686:
            continue
        pos2definition = item["pos2definition"]
        for pos2def in pos2definition:
            definition = pos2def["definition"]
            # Drop parenthesised asides before tokenising.
            definition_pure = re.sub(r'\([\s\S]*?\)', "", definition)
            text = nltk.word_tokenize(definition_pure)
            # Split the token stream into sentences at '.'.
            sents_pos_period = cut_list(text, ['.'])
            for sent_list in sents_pos_period:
                # print list(parser.parse(sent_list))
                logger.info(list(parser.parse(sent_list)))
                logger.info('--------')
            # Separator between definitions in the log.
            logger.info('~~~~~~~')
def process_definition(definition, pattern2attrubute, tagger):
    # Instrumented variant of process_definition: tags with both the
    # supplied tagger and nltk.pos_tag, logs their similarity, and
    # accumulates per-stage timings in module globals.
    # NOTE(review): shares its name with other definitions in this file;
    # only the last binding survives at import time.
    attributes2value = {}
    # Cross-reference definitions ("See ..."/"see ...") carry no
    # attributes. NOTE(review): returns None here, not an empty dict --
    # callers must cope with both.
    if definition.strip().startswith('See') or definition.strip().startswith(
            'see'):
        return
    logger.info(definition)
    # type=sys.getfilesystemencoding()
    start = datetime.datetime.now()
    text = nltk.word_tokenize(definition)
    def_pos1 = tagger.tag(text)
    logger.info(def_pos1)
    def_pos2 = nltk.pos_tag(text)
    logger.info(def_pos2)
    # print definition
    logger.info("two tag methods similarity: " +
                str(compare_similar_pos(def_pos1, def_pos2)))
    print compare_similar_pos(def_pos1, def_pos2)
    end = datetime.datetime.now()
    global tag_time_all
    # NOTE(review): .microseconds is only the sub-second part of the
    # timedelta, not total elapsed time -- confirm intended.
    tag_time_all += (end - start).microseconds
    logger.info('tagging time:%d ' % ((end - start).microseconds))
    # logger.info(def_pos)
    # Sentence boundaries: tagged full stops and semicolons; note the
    # nltk tagging (def_pos2) is the one actually matched below.
    seg_point = [('.', '.'), (';', ':')]
    sents_pos = cut_list(def_pos2, seg_point)
    # Zero the per-call stage timers (delta of two back-to-back now()
    # calls, i.e. effectively 0).
    start = datetime.datetime.now()
    end = datetime.datetime.now()
    time_find_candidate_pattern = (end - start).microseconds
    time_choice_final_pattern = (end - start).microseconds
    time_get_match_result = (end - start).microseconds
    for sent_pos in sents_pos:
        logger.info("sent_pos: " + str(sent_pos))
        sent = produce_new_sent(sent_pos)
        logger.info("sent_cut: " + sent)
        # Stage 1: find patterns that could apply to this sentence.
        start = datetime.datetime.now()
        candidate_patterns = find_candidate_pattern(pattern2attrubute.keys(),
                                                    sent_pos)
        end = datetime.datetime.now()
        time_find_candidate_pattern += (end - start).microseconds
        logger.info('find candidate pattern time: ' +
                    str((end - start).microseconds))
        logger.info("candidate_patterns: " + str(candidate_patterns))
        if len(candidate_patterns) == 0:
            continue
        # Stage 2: choose the final pattern set.
        start = datetime.datetime.now()
        choiced_patterns = choice_final_pattern(candidate_patterns, sent_pos)
        end = datetime.datetime.now()
        time_choice_final_pattern += (end - start).microseconds
        logger.info('choice final pattern time: ' +
                    str((end - start).microseconds))
        logger.info("choiced_patterns: " + str(choiced_patterns))
        # Stage 3: extract attribute values; later sentences overwrite
        # earlier values for the same attribute (dict.update).
        start = datetime.datetime.now()
        attributes2value.update(
            get_match_result(choiced_patterns, pattern2attrubute, sent_pos))
        end = datetime.datetime.now()
        time_get_match_result += (end - start).microseconds
        logger.info('get match result time: ' +
                    str((end - start).microseconds))
        logger.info("attributes2value: " + str(attributes2value))
    global find_candidate_time
    find_candidate_time += time_find_candidate_pattern
    logger.info('time_find_candidate_pattern: ' +
                str(time_find_candidate_pattern))
    logger.info('time_choice_final_pattern: ' +
                str(time_choice_final_pattern))
    logger.info('time_get_match_result: ' + str(time_get_match_result))
    logger.info("whole attributes2value: " + str(attributes2value))
    return attributes2value
def process_definition(definition, pattern2attrubute):
    # Final variant of process_definition (shadows earlier same-named
    # definitions in this file): tags with nltk only, keeps "See also ..."
    # sentences whole, and merges duplicate attribute values with '; '.
    attributes2value = {}
    logger.info('definition: %s' % definition)
    # Cross-reference definitions ("See ...") get special handling.
    # NOTE(review): unlike the earlier variant this does NOT return after
    # process_vacant_definition -- the definition is still processed below;
    # confirm that is intended.
    if definition.strip().startswith('See') or definition.strip().startswith(
            'see'):
        process_vacant_definition(definition)
    start = datetime.datetime.now()
    text = nltk.word_tokenize(definition)
    def_pos = nltk.pos_tag(text)
    logger.info(def_pos)
    end = datetime.datetime.now()
    global tag_time_all
    # NOTE(review): .microseconds is only the sub-second part of the
    # timedelta, not total elapsed time -- confirm intended.
    tag_time_all += (end - start).microseconds
    logger.info('tagging time:%d ' % ((end - start).microseconds))
    # logger.info(def_pos)
    # First split at tagged full stops ...
    seg_point = [('.', '.')]
    sents_pos_period = cut_list(def_pos, seg_point)
    sents_pos = []
    for sent_pos_period in sents_pos_period:
        # ... then split each sentence at ';', except "See also ..."
        # sentences, which stay whole.
        # NOTE(review): assumes every sentence has >= 2 tokens -- a
        # one-token sentence would raise IndexError here.
        if sent_pos_period[0][0] == 'See' and sent_pos_period[1][0] == 'also':
            sents_pos.append(sent_pos_period)
        else:
            sents_pos.extend(cut_list(sent_pos_period, [(';', ':')]))
    # Zero the per-call stage timers (delta of two back-to-back now()
    # calls, i.e. effectively 0).
    start = datetime.datetime.now()
    end = datetime.datetime.now()
    time_find_candidate_pattern = (end - start).microseconds
    time_choice_final_pattern = (end - start).microseconds
    time_get_match_result = (end - start).microseconds
    for sent_pos in sents_pos:
        logger.info("sent_pos: " + str(sent_pos))
        # Stage 1: find patterns that could apply to this sentence.
        start = datetime.datetime.now()
        candidate_patterns = find_candidate_pattern(pattern2attrubute.keys(),
                                                    sent_pos)
        end = datetime.datetime.now()
        time_find_candidate_pattern += (end - start).microseconds
        logger.info('find candidate pattern time: ' +
                    str((end - start).microseconds))
        logger.info("candidate_patterns: " + str(candidate_patterns))
        if len(candidate_patterns) == 0:
            continue
        # Stage 2: choose the final pattern set.
        start = datetime.datetime.now()
        choiced_patterns = choice_final_pattern(candidate_patterns, sent_pos)
        end = datetime.datetime.now()
        time_choice_final_pattern += (end - start).microseconds
        logger.info('choice final pattern time: ' +
                    str((end - start).microseconds))
        logger.info("choiced_patterns: " + str(choiced_patterns))
        # Stage 3: extract values; when an attribute was already filled by
        # an earlier sentence, append the new value after '; ' instead of
        # overwriting (unlike the dict.update variant above).
        start = datetime.datetime.now()
        attributes2value_part = get_match_result(choiced_patterns,
                                                 pattern2attrubute, sent_pos)
        for attribute, value in attributes2value_part.iteritems():
            if attribute in attributes2value.keys():
                part1 = attributes2value[attribute]
                attributes2value[attribute] = part1 + '; ' + value
            else:
                attributes2value[attribute] = value
        end = datetime.datetime.now()
        time_get_match_result += (end - start).microseconds
        logger.info('get match result time: ' +
                    str((end - start).microseconds))
        logger.info("attributes2value: " + str(attributes2value))
    global find_candidate_time
    find_candidate_time += time_find_candidate_pattern
    logger.info('time_find_candidate_pattern: ' +
                str(time_find_candidate_pattern))
    logger.info('time_choice_final_pattern: ' +
                str(time_choice_final_pattern))
    logger.info('time_get_match_result: ' + str(time_get_match_result))
    logger.info("whole attributes2value: " + str(attributes2value))
    return attributes2value