Example #1
def test_parser():
    import os
    import re
    import json
    import codecs
    import nltk
    from datetime import datetime
    from nltk.parse import stanford
    # point NLTK at the local Stanford Parser jars (machine-specific paths)
    os.environ['STANFORD_PARSER'] = 'F:/eclipse_doctor/KnowledgeGraph/stanford-parser/stanford-parser.jar'
    os.environ['STANFORD_MODELS'] = 'F:/eclipse_doctor/KnowledgeGraph/stanford-parser/stanford-parser-3.7.0-models.jar'
    os.environ['JAVAHOME'] = "C:/ProgramData/Oracle/Java/javapath"

    start = datetime.now()
    print start
    parser = stanford.StanfordParser(
        model_path="F:/eclipse_doctor/KnowledgeGraph/stanford-parser/englishPCFG.ser.gz")
    end = datetime.now()
    print end
    # total_seconds() reports the full load time; .microseconds alone is only
    # the sub-second component and misleads when loading takes over a second
    print "cost time: " + str((end - start).total_seconds()) + "s"

    sent = 'angulated abutment is an abutment whose body is not parallel to the long axis of the implant. It is utilized when the implant is at a different inclination in relation to the proposed prosthesis.'
    start = datetime.now()
    print start
    trees = parser.parse(sent.split())
    end = datetime.now()
    print end
    print "cost time: " + str((end - start).total_seconds()) + "s"
    print 'len(trees)', len(list(trees))

    path_project = os.path.abspath(
        os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data = path_project + os.sep + "input" + os.sep + "items_tagged_modified.json"
    data = json.load(codecs.open(path_data, encoding='UTF-8'))
    start_all = datetime.now()
    cnt = 0
    trees_all = []
    for item in data:
        pos2definition = item["pos2definition"]
        for pos2def in pos2definition:
            definition = pos2def["definition"]
            # strip parenthesized asides, then tokenize and split on periods
            definition_pure = re.sub(r'\([\s\S]*?\)', "", definition)
            text = nltk.word_tokenize(definition_pure)
            sents_pos_period = cut_list(text, ['.'])
            for sent_list in sents_pos_period:
                cnt += 1
                # sent_list is already a token list, so no join/split round trip
                trees = parser.parse(sent_list)
                trees_all.append(trees)
    end_all = datetime.now()
    print end_all
    # total_seconds() gives the full duration; .seconds and .microseconds are
    # only components of the timedelta
    sum_time = (end_all - start_all).total_seconds()
    avg_time = sum_time / cnt if cnt else 0.0
    print sum_time, avg_time, cnt
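These snippets lean on a cut_list helper whose implementation is not shown. Below is a minimal sketch of what it appears to do, inferred from the call sites (cut_list(text, ['.']) on plain tokens, cut_list(def_pos, [('.', '.'), (';', ':')]) on tagged pairs). Whether the real helper keeps or drops the separator token is an assumption; here it is dropped, matching definition_restore below, which re-appends ';' itself.

def cut_list(tokens, separators):
    # Hypothetical reconstruction, not the project's actual implementation:
    # split a token list into sublists at each separator element.
    chunks, current = [], []
    for token in tokens:
        if token in separators:
            if current:
                chunks.append(current)
            current = []
        else:
            current.append(token)
    if current:
        chunks.append(current)  # trailing chunk with no closing separator
    return chunks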
def definition_restore(concept_real, def_tokens):
    # Rebuild a definition so it reads as a complete sentence, re-inserting
    # the concept tokens via the project's definition_restore_process helper.
    import nltk
    final_tokens = []
    concept_real_tokens = nltk.pos_tag(concept_real)
    if def_tokens[0] == 'in':
        if ',' not in def_tokens:
            final_tokens.extend(
                definition_restore_process(concept_real_tokens, def_tokens))
        else:
            final_tokens.extend(def_tokens[0:def_tokens.index(',') + 1])
            final_tokens.extend(
                definition_restore_process(
                    concept_real_tokens,
                    def_tokens[def_tokens.index(',') + 1:]))
    elif def_tokens[0] == ',':
        final_tokens.extend(
            definition_restore_process(concept_real_tokens,
                                       def_tokens[def_tokens.index(',') + 1:]))
    elif def_tokens[0] == 'brand':
        final_tokens.extend(def_tokens)
    else:
        seg_point = ['.']
        sents = cut_list(def_tokens, seg_point)
        if ';' in sents[0]:
            sents_new = []
            chips = cut_list(sents[0], [';'])
            for i in range(len(chips) - 1):
                sents_new.extend(
                    definition_restore_process(concept_real_tokens, chips[i]))
                sents_new.append(';')
            sents_new.extend(
                definition_restore_process(concept_real_tokens, chips[-1]))
            sents_new.append('.')
            final_tokens.extend(sents_new)
        else:
            final_tokens.extend(
                definition_restore_process(concept_real_tokens, def_tokens))
    return ' '.join(final_tokens)
Example #3
def test_parseDef(item, parser):
    import re
    import nltk
    treeDef = []
    pos2definition = item["pos2definition"]
    for pos2def in pos2definition:
        definition = pos2def["definition"]
        definition_pure = re.sub(r'\([\s\S]*?\)', "", definition)
        text = nltk.word_tokenize(definition_pure)
        sents_pos_period = cut_list(text, ['.'])
        for sent_list in sents_pos_period:
            tree = parser.parse(sent_list)
            treeDef.append(tree)
    return treeDef
def get_match_result(patterns, pattern2attributes, sent_pos):
    '''
    Use the final patterns to match the sentence. If ';' occurs in the
    sentence, cut it into chips first; otherwise rely directly on the
    pattern and value positions to extract the attributes and values.
    patterns: the final patterns used for this sentence
    '''
    attributes2value = {}
    if (';', ':') in sent_pos and not (sent_pos[0][0] == 'See'
                                       and sent_pos[1][0] == 'also'):
        chips = cut_list(sent_pos, [(';', ':')])
        for chip in chips:
            value_pos = get_value_pos(patterns, chip)
            logger.info(str(patterns) + 'value_pos' + str(value_pos))
            for i in range(len(patterns) - 1, -1, -1):
                sent = ""
                end = value_pos[i][1]
                slice_chip = chip[value_pos[i][0]:end]
                for word_tag in slice_chip:
                    if word_tag[0] in ['.', ';', ',']:
                        sent = sent.strip() + word_tag[0] + " "
                    else:
                        sent += word_tag[0] + " "
                # store inside the i-loop, mirroring the else branch below;
                # the original indentation kept only the last pattern's value
                attributes2value[pattern2attributes[patterns[i]]] = sent.strip()
                if len(sent.strip()) > 0 and sent.strip()[-1] in [';', ',', '.']:
                    attributes2value[pattern2attributes[patterns[i]]] = sent.strip()[:-1]
    else:
        value_pos = get_value_pos(patterns, sent_pos)
        logger.info(str(patterns) + ' value_pos: ' + str(value_pos))
        for i in range(len(patterns) - 1, -1, -1):
            sent = ""
            end = value_pos[i][1]
            slice_sent_pos = sent_pos[value_pos[i][0]:end]
            for word_tag in slice_sent_pos:
                if word_tag[0] in ['.', ';', ',']:
                    sent = sent.strip() + word_tag[0] + " "
                else:
                    sent += word_tag[0] + " "

            attributes2value[pattern2attributes[patterns[i]]] = sent.strip()
            if len(sent.strip()) > 0 and sent.strip()[-1] in [';', ',', '.']:
                attributes2value[pattern2attributes[
                    patterns[i]]] = sent.strip()[:-1]
    return attributes2value
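For reference, sent_pos is a list of (word, tag) pairs as produced by nltk.pos_tag; the Penn Treebank tagset tags ';' as ':', which is why the membership test (';', ':') in sent_pos above works. A quick illustration (requires the NLTK tokenizer and tagger models to be downloaded):

import nltk
sent_pos = nltk.pos_tag(nltk.word_tokenize("the implant; see also abutment."))
print(sent_pos)
# e.g. [('the', 'DT'), ('implant', 'NN'), (';', ':'), ('see', 'VB'), ...]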
Example #5
def process_definition(definition, pattern2attrubute, tagger):
    attributes2value = {}
    # POS-tag the segmented definition with the supplied tagger
    def_pos = tagger.tag(seg_sent(definition).strip().split())
    logger.info(def_pos)
    seg_point = [('.', '.'), (';', ':')]
    sents_pos = cut_list(def_pos, seg_point)
    for sent_pos in sents_pos:
        logger.info("sent_pos: " + str(sent_pos))
        sent = produce_new_sent(sent_pos)
        logger.info("sent_cut: " + sent)
        candidate_patterns = find_candidate_pattern(pattern2attrubute.keys(),
                                                    sent_pos)
        logger.info("candidate_patterns: " + str(candidate_patterns))
        if len(candidate_patterns) == 0:
            continue
        choiced_patterns = choice_final_pattern(candidate_patterns, sent_pos)
        logger.info("choiced_patterns: " + str(choiced_patterns))
        attributes2value.update(
            get_match_result(choiced_patterns, pattern2attrubute, sent_pos))
        logger.info("attributes2value: " + str(attributes2value))
    logger.info("whole attributes2value: " + str(attributes2value))
    return attributes2value
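The tagger parameter is only assumed to expose a .tag(tokens) method returning (word, tag) pairs. One plausible instantiation is NLTK's StanfordPOSTagger; the model and jar paths below are hypothetical placeholders in the style of the stanford-parser paths used elsewhere in this file:

from nltk.tag import StanfordPOSTagger

# hypothetical paths -- substitute the actual stanford-postagger layout
tagger = StanfordPOSTagger(
    'F:/eclipse_doctor/KnowledgeGraph/stanford-postagger/models/english-left3words-distsim.tagger',
    'F:/eclipse_doctor/KnowledgeGraph/stanford-postagger/stanford-postagger.jar')
print(tagger.tag('the implant supports the prosthesis'.split()))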
Example #6
def parseMosby():
    import os
    import re
    import json
    import codecs
    import nltk
    from nltk.parse import stanford
    #     path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    #     path_data= path_project+os.sep+"input"+os.sep+"items_tagged_modified.json"
    path_data = "F:/eclipse_doctor/KnowledgeGraph/kg/books/extractor/data/items_modified.json"
    data = json.load(codecs.open(path_data, encoding='UTF-8'))

    # point NLTK at the local Stanford Parser jars (machine-specific paths)
    os.environ['STANFORD_PARSER'] = 'F:/eclipse_doctor/KnowledgeGraph/stanford-parser/stanford-parser.jar'
    os.environ['STANFORD_MODELS'] = 'F:/eclipse_doctor/KnowledgeGraph/stanford-parser/stanford-parser-3.7.0-models.jar'
    os.environ['JAVAHOME'] = "C:/ProgramData/Oracle/Java/javapath"

    parser = stanford.StanfordParser(
        model_path="F:/eclipse_doctor/KnowledgeGraph/stanford-parser/englishPCFG.ser.gz")
    cnt = 0
    for item in data:
        print "parse %dth item" % cnt
        cnt += 1
        if cnt < 4686:
            continue
        pos2definition = item["pos2definition"]
        for pos2def in pos2definition:
            definition = pos2def["definition"]
            definition_pure = re.sub(r'\([\s\S]*?\)', "", definition)
            text = nltk.word_tokenize(definition_pure)
            sents_pos_period = cut_list(text, ['.'])
            for sent_list in sents_pos_period:
                logger.info(list(parser.parse(sent_list)))
            logger.info('--------')
        logger.info('~~~~~~~')
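Note that parser.parse() expects a pre-tokenized sentence and returns an iterator of Tree objects; NLTK's StanfordParser also offers raw_parse() for untokenized strings. A minimal check, assuming the parser object configured above:

trees = list(parser.raw_parse("The implant supports the prosthesis."))
print(trees[0])  # the highest-scoring parse tree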
Example #7
def process_definition(definition, pattern2attrubute, tagger):
    import nltk
    import datetime
    attributes2value = {}
    # "See ..." cross-references carry no extractable attributes; return the
    # empty dict rather than None so callers can .update() the result safely
    if definition.strip().startswith('See') or definition.strip().startswith('see'):
        return attributes2value
    logger.info(definition)
    start = datetime.datetime.now()
    text = nltk.word_tokenize(definition)
    def_pos1 = tagger.tag(text)
    logger.info(def_pos1)
    def_pos2 = nltk.pos_tag(text)
    logger.info(def_pos2)
    similarity = compare_similar_pos(def_pos1, def_pos2)
    logger.info("two tag methods similarity: " + str(similarity))
    print similarity
    end = datetime.datetime.now()
    global tag_time_all
    tag_time_all += (end - start).microseconds
    logger.info('tagging time:%d ' % ((end - start).microseconds))
    seg_point = [('.', '.'), (';', ':')]
    sents_pos = cut_list(def_pos2, seg_point)
    # per-stage timing accumulators
    time_find_candidate_pattern = 0
    time_choice_final_pattern = 0
    time_get_match_result = 0
    for sent_pos in sents_pos:
        logger.info("sent_pos: " + str(sent_pos))
        sent = produce_new_sent(sent_pos)
        logger.info("sent_cut: " + sent)
        start = datetime.datetime.now()
        candidate_patterns = find_candidate_pattern(pattern2attrubute.keys(),
                                                    sent_pos)
        end = datetime.datetime.now()
        time_find_candidate_pattern += (end - start).microseconds
        logger.info('find candidate pattern time: ' +
                    str((end - start).microseconds))
        logger.info("candidate_patterns: " + str(candidate_patterns))

        if len(candidate_patterns) == 0:
            continue
        start = datetime.datetime.now()
        choiced_patterns = choice_final_pattern(candidate_patterns, sent_pos)
        end = datetime.datetime.now()
        time_choice_final_pattern += (end - start).microseconds
        logger.info('choice final pattern time: ' +
                    str((end - start).microseconds))
        logger.info("choiced_patterns: " + str(choiced_patterns))

        start = datetime.datetime.now()
        attributes2value.update(
            get_match_result(choiced_patterns, pattern2attrubute, sent_pos))
        end = datetime.datetime.now()
        time_get_match_result += (end - start).microseconds
        logger.info('get match result time: ' +
                    str((end - start).microseconds))
        logger.info("attributes2value: " + str(attributes2value))
    global find_candidate_time
    find_candidate_time += time_find_candidate_pattern
    logger.info('time_find_candidate_pattern: ' +
                str(time_find_candidate_pattern))
    logger.info('time_choice_final_pattern: ' + str(time_choice_final_pattern))
    logger.info('time_get_match_result: ' + str(time_get_match_result))
    logger.info("whole attributes2value: " + str(attributes2value))
    return attributes2value
def process_definition(definition, pattern2attrubute):
    import nltk
    import datetime
    attributes2value = {}
    logger.info('definition: %s' % definition)
    if definition.strip().startswith('See') or definition.strip().startswith('see'):
        # unlike the variant above, this version records the vacant definition
        # and then continues; "See also" sentences are handled explicitly below
        process_vacant_definition(definition)
    start = datetime.datetime.now()
    text = nltk.word_tokenize(definition)
    def_pos = nltk.pos_tag(text)
    logger.info(def_pos)
    end = datetime.datetime.now()
    global tag_time_all
    tag_time_all += (end - start).microseconds
    logger.info('tagging time:%d ' % ((end - start).microseconds))
    seg_point = [('.', '.')]
    sents_pos_period = cut_list(def_pos, seg_point)
    sents_pos = []
    for sent_pos_period in sents_pos_period:
        if sent_pos_period[0][0] == 'See' and sent_pos_period[1][0] == 'also':
            sents_pos.append(sent_pos_period)
        else:
            sents_pos.extend(cut_list(sent_pos_period, [(';', ':')]))

    # per-stage timing accumulators
    time_find_candidate_pattern = 0
    time_choice_final_pattern = 0
    time_get_match_result = 0
    for sent_pos in sents_pos:
        logger.info("sent_pos: " + str(sent_pos))
        start = datetime.datetime.now()
        candidate_patterns = find_candidate_pattern(pattern2attrubute.keys(),
                                                    sent_pos)
        end = datetime.datetime.now()
        time_find_candidate_pattern += (end - start).microseconds
        logger.info('find candidate pattern time: ' +
                    str((end - start).microseconds))
        logger.info("candidate_patterns: " + str(candidate_patterns))

        if len(candidate_patterns) == 0:
            continue
        start = datetime.datetime.now()
        choiced_patterns = choice_final_pattern(candidate_patterns, sent_pos)
        end = datetime.datetime.now()
        time_choice_final_pattern += (end - start).microseconds
        logger.info('choice final pattern time: ' +
                    str((end - start).microseconds))
        logger.info("choiced_patterns: " + str(choiced_patterns))

        start = datetime.datetime.now()
        attributes2value_part = get_match_result(choiced_patterns,
                                                 pattern2attrubute, sent_pos)
        for attribute, value in attributes2value_part.iteritems():
            # merge values when an attribute appears in more than one sentence
            if attribute in attributes2value:
                attributes2value[attribute] += '; ' + value
            else:
                attributes2value[attribute] = value
        end = datetime.datetime.now()
        time_get_match_result += (end - start).microseconds
        logger.info('get match result time: ' +
                    str((end - start).microseconds))
        logger.info("attributes2value: " + str(attributes2value))
    global find_candidate_time
    find_candidate_time += time_find_candidate_pattern
    logger.info('time_find_candidate_pattern: ' +
                str(time_find_candidate_pattern))
    logger.info('time_choice_final_pattern: ' + str(time_choice_final_pattern))
    logger.info('time_get_match_result: ' + str(time_get_match_result))
    logger.info("whole attributes2value: " + str(attributes2value))
    return attributes2value
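One caveat on the timing code used throughout these examples: timedelta.microseconds is only the sub-second component of a duration, not the total elapsed time in microseconds. For anything that can run longer than a second, total_seconds() is the safe accessor:

import datetime
d = datetime.timedelta(seconds=2, microseconds=500)
print(d.microseconds)     # 500 -- sub-second component only
print(d.total_seconds())  # 2.0005 -- the full duration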