def test(fun):
    result = []
    for item in data_modified:
        pos2defs = item["pos2definition"]
        for pos2def in pos2defs:
            def_tagged_str = pos2def["def_tagged"]
            def_tagged = POSfromstring(def_tagged_str)
            chunks = cut_tuple_list(def_tagged, [('.', '.')])
            for chunk in chunks:
                result.extend(fun(chunk))
    sorted_re = sorted(Counter(result).iteritems(),
                       key=lambda asd: asd[1],
                       reverse=True)

    cnt_1 = 0
    cnt_2 = 0
    cnt_3 = 0
    for item in sorted_re:
        logger.info(item)
        print item
        if item[1] > 1:
            cnt_1 += 1
        if item[1] > 2:
            cnt_2 += 1
        if item[1] > 3:
            cnt_3 += 1

    print len(sorted_re), cnt_1, cnt_2, cnt_3
# Example #2
# 0
def extract_single_item(data, i, new_data, pattern2attrubute, stanford_tagger):
    print i, 'start'
    pos2definition = data[i]["pos2definition"]
    for pos2def in pos2definition:
        definition = pos2def["definition"]
        definition_pure = re.sub(r'\([\s\S]*?\)', "", definition)
        attributes2value = process_definition(definition_pure,
                                              pattern2attrubute,
                                              stanford_tagger)
        pos2def["attributes"] = attributes2value
    logger.info("\n\n")
    print i, ' over'
    new_data.append(data[i])
# Example #3
# 0
def get_start_pos(pattern, sent_pos):
    """Return the index in *sent_pos* where the value matched by *pattern*
    begins.

    The pattern looks like ``w1+w2+...$N...``: the words before '$' anchor
    the match and N shifts the start position.
    """
    match_at = KMP_match(pattern, sent_pos)
    anchor_words = pattern[:pattern.index("$")].split('+')
    shift = int(re.findall('\$(\d+)', pattern)[0])
    if shift > 0:
        logger.info("start move right %d" % shift)
    candidate = match_at + len(anchor_words) - shift
    # skip a leading comma, if present
    if sent_pos[candidate][0] in [',']:
        return candidate + 1
    return candidate
# Example #4
# 0
def get_end_pos(end_current, sent_pos):
    """Walk left from ``end_current - 1`` over trailing function words and
    filler ('DT'/'CC'/... tags, or words like 'be'/'is'/'that'), returning
    the trimmed end index. Note index 0 is never examined.
    """
    skip_tags = ['DT', 'CC', 'TO', 'WDT', 'IN', 'RB']
    skip_words = ['be', 'is', 'are', 'that', 'may', 'can', 'performed', ',']
    moved = 0
    for idx in range(end_current - 1, 0, -1):
        word, tag = sent_pos[idx][0], sent_pos[idx][1]
        if tag in skip_tags or word in skip_words:
            moved += 1
        else:
            break
    if moved > 0:
        logger.info("end move left %d" % moved)
    return end_current - moved
def choice_final_pattern(patterns, sent_pos):
    '''
    Choose the final set of patterns to apply to one sentence.

    Two ranking principles: when two patterns occur at the same position,
    the one covering the bigger range ranks higher; when two patterns share
    a common part or a common match in the sentence, the one occurring
    earlier ranks higher.
    '''
    position_map = get_pos2patterns(patterns, sent_pos)
    ordered_by_pos = sorted(position_map.iteritems(),
                            key=lambda kv: kv[0],
                            reverse=False)
    logger.info('sorted_pattern:¡¡' + str(ordered_by_pos))

    range_map = get_pattern_range(patterns, sent_pos)
    logger.info('pattern2range:¡¡' + str(range_map))

    ranked_by_range = get_prior_by_range(ordered_by_pos, range_map)
    logger.info('patterns_range: ' + str(ranked_by_range))

    final_patterns = get_prior_by_priority(ranked_by_range, range_map)
    logger.info('patterns_final: ' + str(final_patterns))

    return final_patterns
# Example #6
# 0
def _join_words(word_tags):
    # Rebuild surface text from (word, tag) pairs, gluing '.', ';' and ','
    # onto the preceding word instead of leaving a space before them.
    sent = ""
    for word_tag in word_tags:
        if word_tag[0] in ['.', ';', ',']:
            sent = sent.strip() + word_tag[0] + " "
        else:
            sent += word_tag[0] + " "
    return sent


def _store_value(attributes2value, attribute, sent):
    # Strip one trailing punctuation mark from the value, then record it.
    value = sent.strip()
    if len(value) > 0 and value[-1] in [';', ',', '.']:
        value = value[:-1]
    attributes2value[attribute] = value


def get_match_result(patterns, pattern2attributes, sent_pos):
    '''
    Use the final patterns to match the sentence. If a ';' (POS tag ':')
    occurs in the sentence (and it is not a "See also ..." sentence) we cut
    the sentence into chips first and match each chip; otherwise we use the
    pattern/value positions directly on the whole sentence.

    patterns: the final patterns selected for this sentence
    pattern2attributes: maps each pattern to the attribute name it extracts
    sent_pos: list of (word, POS-tag) tuples

    Returns {attribute: value-string}.

    BUG FIX: in the chip branch the value was previously stored OUTSIDE the
    per-pattern loop (unlike the else branch), so only the last-processed
    pattern (i == 0) ever had its value recorded per chip. Both branches
    now store every pattern's value.
    '''
    attributes2value = {}
    if (';', ':') in sent_pos and not (sent_pos[0][0] == 'See'
                                       and sent_pos[1][0] == 'also'):
        for chip in cut_list(sent_pos, [(';', ':')]):
            value_pos = get_value_pos(patterns, chip)
            logger.info(str(patterns) + 'value_pos' + str(value_pos))
            for i in range(len(patterns) - 1, -1, -1):
                sent = _join_words(chip[value_pos[i][0]:value_pos[i][1]])
                _store_value(attributes2value,
                             pattern2attributes[patterns[i]], sent)
    else:
        value_pos = get_value_pos(patterns, sent_pos)
        logger.info(str(patterns) + ' value_pos: ' + str(value_pos))
        for i in range(len(patterns) - 1, -1, -1):
            sent = _join_words(sent_pos[value_pos[i][0]:value_pos[i][1]])
            _store_value(attributes2value,
                         pattern2attributes[patterns[i]], sent)
    return attributes2value
# Example #7
# 0
def pre_process():
    """Scan every definition in the module-level ``data_tagged_modified``
    for non-ASCII tokens, log them, then dump the (unmodified) data to
    ``items_tagged_modified_pre.json``.

    Counts and prints how many definitions raised during tokenisation or
    re-encoding. Python 2 str/unicode semantics: tokens are byte strings,
    probed with chardet and transcoded utf-8 -> gbk for logging.
    """
    cnt_exp = 0  # number of definitions that raised an exception
    for item in data_tagged_modified:
        #     for item in data_modified_test:
        #     for item in data_modified:
        pos2definition = item["pos2definition"]
        for pos2def in pos2definition:
            try:
                definition = pos2def["definition"]
                # drop parenthesised asides before tokenising
                definition_pure = re.sub(r'\([\s\S]*?\)', "", definition)
                tokens = nltk.word_tokenize(definition_pure.encode('utf-8'))
                for token in tokens:
                    #                     try:
                    # log any token chardet does not classify as plain ASCII
                    if chardet.detect(token)['encoding'] != 'ascii':
                        #                         print token,chardet.detect(token)['encoding'],token.decode('utf-8').encode('gbk')
                        logger.info("%s\t%s\t%s" %
                                    (token, chardet.detect(token)['encoding'],
                                     token.decode("utf-8").encode("gbk")))
#                     except Exception:
#                         traceback.print_exc()
#                         print token
            except Exception:
                # best-effort: count and report, keep processing other items
                cnt_exp += 1
                traceback.print_exc()
                #                 print traceback.format_exc()
                print definition


#                 print definition.encode('gbk')
#
# #             print definition_pure.encode('gbk')
#             try:
#                 pos2def["definition"]=pos2def["definition"].decode("utf-8").encode("gbk")
#             except Exception:
#                 cnt_exp+=1
#                 traceback.print_exc()
#                 print pos2def["definition"]
    print cnt_exp
    # NOTE(review): the codecs file handle passed to json.dump is never
    # explicitly closed — relies on CPython refcounting to flush it.
    path_tagged_output = "items_tagged_modified_pre.json"
    json.dump(data_tagged_modified,
              codecs.open(path_tagged_output, 'w', 'utf-8'),
              ensure_ascii=False,
              indent=2)
# Example #8
# 0
def extract_items_single_thread(data, pattern2attrubute):
    """Sequentially extract attributes for every definition in *data*.

    Each item's tagged definitions are parsed with ``tagfromstring`` and fed
    to ``process_definition``; the result is stored under the definition's
    "attributes" key. Per-call and cumulative timings are logged.

    Returns the list of processed items.
    """
    processed = []
    total_ms = 0
    for item in data:
        for pos2def in item["pos2definition"]:
            tagged = tagfromstring(pos2def["def_tagged"])

            begin = datetime.datetime.now()
            extracted = process_definition(pattern2attrubute, tagged)
            finish = datetime.datetime.now()
            delta = finish - begin
            total_ms += delta.seconds * 1000 + delta.microseconds
            logger.info('process_definition time: %ds: %dms ' %
                        (delta.seconds, delta.microseconds))
            pos2def["attributes"] = extracted
        logger.info("\n\n")
        processed.append(item)

    # report the timing counters accumulated by the helpers
    global tag_time_all
    logger.info("tag all time is: %d" % tag_time_all)
    global find_candidate_time
    logger.info("find candidate time is: %d" % find_candidate_time)
    logger.info("all time is: %d" % total_ms)
    return processed
# Example #9
# 0
def IE_multi_thread():
    """Load patterns and tagged data from relative paths, run the
    multi-threaded extractor, and dump the result to
    ``items_tagged_auto.json``.

    Fix: all codecs file handles are now managed with ``with`` so the output
    file is guaranteed to be flushed and closed (the old code handed an
    unclosed writer to json.dump).
    """
    path_data = "data" + os.sep + "items_tagged_modified.json"
    path_pattern = "patterns.json"
    path_tagged_output = "items_tagged_auto.json"

    with codecs.open(path_pattern, encoding='UTF-8') as f:
        pattern2attrubute = json.load(f)
    logger.info("loaded all the patterns")
    with codecs.open(path_data, encoding='UTF-8') as f:
        data = json.load(f)
    logger.info("loaded all the data")

    data_new = extractor_multi_thread(data, pattern2attrubute)
    logger.info("has extracted all the attributes")

    with codecs.open(path_tagged_output, 'w', 'utf-8') as f:
        json.dump(data_new, f, ensure_ascii=False, indent=2)
    logger.info("output over")
# Example #10
# 0
def IE_auto_pattern():
    """Load auto-generated patterns and tagged data from the project tree
    (two levels above cwd), run the single-threaded extractor, and dump the
    result to ``output/items_tagged_auto.json``.

    Fixes: paths built with ``os.path.join`` instead of manual ``os.sep``
    concatenation, and all codecs file handles managed with ``with`` so the
    output file is guaranteed to be flushed and closed.
    """
    path_project = os.path.abspath(
        os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data = os.path.join(path_project, "input",
                             "items_tagged_modified.json")
    path_pattern = os.path.join(path_project, "output", "Patterns_auto.json")
    path_tagged_output = os.path.join(path_project, "output",
                                      "items_tagged_auto.json")

    with codecs.open(path_pattern, encoding='UTF-8') as f:
        pattern2attrubute = json.load(f)
    logger.info("loaded all the patterns")
    with codecs.open(path_data, encoding='UTF-8') as f:
        data = json.load(f)
    logger.info("loaded all the data")

    data_new = extract_items_single_thread(data, pattern2attrubute)
    logger.info("has extracted all the attributes")

    with codecs.open(path_tagged_output, 'w', 'utf-8') as f:
        json.dump(data_new, f, ensure_ascii=False, indent=2)
    logger.info("output over")
        elif lines[i].strip() == '~~~~~~~':
            items.append(itemTmp)
            itemTmp = []
        else:
            sent = tagfromstring(lines[i])
            defTmp.append(sent)
    return items


# path_data="data"+os.sep+"items_tagged_modified.json"

# Module-level data loading: these json.load calls run as import-time side
# effects, so importing this module reads all three data files from disk.
path_data_tagged_modified = "data" + os.sep + "items_tagged_modified_POS.json"
path_data_tagged_modified_extract = "out" + os.sep + "items_tagged_modified_extract.json"
data_tagged_modified = json.load(
    codecs.open(path_data_tagged_modified, encoding='UTF-8'))
logger.info("loaded all the data")

path_data_modified = "data" + os.sep + "items_modified_POS.json"
# NOTE(review): this reassigns path_data_tagged_modified_extract (already set
# above) instead of a distinct name — looks like a copy/paste typo; confirm
# which output path is intended before relying on either value.
path_data_tagged_modified_extract = "out" + os.sep + "items_modified_extract.json"
data_modified = json.load(codecs.open(path_data_modified, encoding='UTF-8'))

path_data_tagged_modified_test = "data" + os.sep + "items_tagged_modified_test.json"
path_data_tagged_modified_extract_test = "out" + os.sep + "items_tagged_modified_extract_test.json"
data_tagged_modified_test = json.load(
    codecs.open(path_data_tagged_modified_test, encoding='UTF-8'))

# path_data_modified_test="data"+os.sep+"items_modified_test.json"
# path_data_tagged_modified_extract_test="out"+os.sep+"items_modified_extract_test.json"
# data_modified_test=json.load(codecs.open(path_data_modified_test, encoding='UTF-8'))

# NOTE(review): message says "patterns" but the code above only loads data.
logger.info("loaded all the patterns")
# Example #12
# 0
def process_definition(pattern2attrubute, def_tagged):
    """Extract an {attribute: value} mapping from one POS-tagged definition.

    pattern2attrubute: dict mapping pattern strings to attribute names.
    def_tagged: list of (word, POS-tag) tuples for one definition.

    Pipeline per sentence: find candidate patterns, choose the final
    patterns, then match them to pull out attribute values. Timings for
    each stage are accumulated and logged.

    Returns the merged {attribute: value} dict — or None when the
    definition is just a cross reference ("See ..." / "see ...").
    """
    attributes2value = {}
    definition = ' '.join([x[0] for x in def_tagged])
    logger.info('definition: %s' % definition)
    if definition.strip().startswith('See') or definition.strip().startswith(
            'see'):
        # NOTE(review): returns None here but a dict on the normal path —
        # callers that iterate the result should guard against None.
        return
    start = datetime.datetime.now()
    #     text = nltk.word_tokenize(definition)
    #     def_pos=nltk.pos_tag(text)
    logger.info(def_tagged)
    end = datetime.datetime.now()
    global tag_time_all
    tag_time_all += (end - start).microseconds
    logger.info('tagging time:%d ' % ((end - start).microseconds))
    #     logger.info(def_pos)
    # split the definition into sentences at full stops
    seg_point = [('.', '.')]
    sents_pos_period = cut_list(def_tagged, seg_point)
    sents_pos = []
    for sent_pos_period in sents_pos_period:
        # keep "See also ..." sentences whole; split all others on ';'
        if sent_pos_period[0][0] == 'See' and sent_pos_period[1][0] == 'also':
            sents_pos.append(sent_pos_period)
        else:
            sents_pos.extend(cut_list(sent_pos_period, [(';', ':')]))

    # seed the per-stage timing accumulators (end - start is ~0 here)
    start = datetime.datetime.now()
    end = datetime.datetime.now()
    time_find_candidate_pattern = (end - start).microseconds
    time_choice_final_pattern = (end - start).microseconds
    time_get_match_result = (end - start).microseconds
    for sent_pos in sents_pos:
        logger.info("sent_pos: " + str(sent_pos))
        start = datetime.datetime.now()
        candidate_patterns = find_candidate_pattern(pattern2attrubute.keys(),
                                                    sent_pos)
        end = datetime.datetime.now()
        time_find_candidate_pattern += (end - start).microseconds
        logger.info('find candidate pattern time: ' +
                    str((end - start).microseconds))
        logger.info("candidate_patterns: " + str(candidate_patterns))

        if len(candidate_patterns) == 0:
            continue
        start = datetime.datetime.now()
        choiced_patterns = choice_final_pattern(candidate_patterns, sent_pos)
        end = datetime.datetime.now()
        time_choice_final_pattern += (end - start).microseconds
        logger.info('choice final pattern time: ' +
                    str((end - start).microseconds))
        logger.info("choiced_patterns: " + str(choiced_patterns))

        start = datetime.datetime.now()
        attributes2value_part = get_match_result(choiced_patterns,
                                                 pattern2attrubute, sent_pos)
        # merge: if an attribute appears in several sentences, join the
        # values with '; ' rather than overwriting
        for attribute, value in attributes2value_part.iteritems():
            if attribute in attributes2value.keys():
                part1 = attributes2value[attribute]
                attributes2value[attribute] = part1 + '; ' + value
            else:
                attributes2value[attribute] = value
        end = datetime.datetime.now()
        time_get_match_result += (end - start).microseconds
        logger.info('get match result time: ' +
                    str((end - start).microseconds))
        logger.info("attributes2value: " + str(attributes2value))
    global find_candidate_time
    find_candidate_time += time_find_candidate_pattern
    logger.info('time_find_candidate_pattern: ' +
                str(time_find_candidate_pattern))
    logger.info('time_choice_final_pattern: ' + str(time_choice_final_pattern))
    logger.info('time_get_match_result: ' + str(time_get_match_result))
    logger.info("whole attributes2value: " + str(attributes2value))
    return attributes2value
# Example #13
# 0
#                 print definition.encode('gbk')
#
# #             print definition_pure.encode('gbk')
#             try:
#                 pos2def["definition"]=pos2def["definition"].decode("utf-8").encode("gbk")
#             except Exception:
#                 cnt_exp+=1
#                 traceback.print_exc()
#                 print pos2def["definition"]
    print cnt_exp
    path_tagged_output = "items_tagged_modified_pre.json"
    json.dump(data_tagged_modified,
              codecs.open(path_tagged_output, 'w', 'utf-8'),
              ensure_ascii=False,
              indent=2)

if __name__ == '__main__':
    # Entry point: time a pre_process() run and report the elapsed time.
    start = datetime.datetime.now()
    logger.info(start)
    print start
    #     IE_auto_pattern()
    #     tagged_def()
    #     IE()
    pre_process()
    end = datetime.datetime.now()
    logger.info(end)
    print end
    # Python 2 print of two values: "<seconds> <microseconds>"
    print(end - start).seconds, (end - start).microseconds
    logger.info("cost time: " + str((end - start).seconds))