Example #1
def compare_all(data_manual,data_auto):
    """Compare manually and automatically extracted attributes, keyed by (item index, definition index)."""
    def2result={}
    for i in range(len(data_manual)):
        concept_manual,pronunciation_manual,pos2definition_manual=extract_item_properties(data_manual[i])
        concept_auto,pronunciation_auto,pos2definition_auto=extract_item_properties(data_auto[i])
        for j in range(len(pos2definition_manual)):
            attributes_manual=pos2definition_manual[j]["attributes"]
            attributes_auto=pos2definition_auto[j]["attributes"]
            key_similarity,attribute2value_similarity=attribute_compare(attributes_manual,attributes_auto)
            #locate the position item and definition
            position=(i,j)
            def2result[position]=(key_similarity,attribute2value_similarity)
    return def2result
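A minimal usage sketch for the comparison above, assuming the module context of the surrounding examples (os, a load_json helper) and two hypothetical input files:
def run_compare():
    # Hypothetical driver: file names and the printed summary are assumptions, not part of the original example.
    path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    data_manual = load_json(path_project + os.sep + "input" + os.sep + "items_manual.json")
    data_auto = load_json(path_project + os.sep + "input" + os.sep + "items_auto.json")
    def2result = compare_all(data_manual, data_auto)
    for (i, j), (key_similarity, attribute2value_similarity) in sorted(def2result.items()):
        print "item %d, definition %d: key similarity %s" % (i, j, str(key_similarity))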
Example #2
def compare_all_different(data_manual,data_auto):
    """Write every attribute difference between the manual and automatic data to output/attribute_diff.txt."""
    path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_diff=path_project+os.sep+"output"+os.sep+"attribute_diff.txt"
    fp_diff=codecs.open(path_diff,'w','utf-8')
    for i in range(len(data_manual)):
        concept_manual,pronunciation_manual,pos2definition_manual=extract_item_properties(data_manual[i])
        concept_auto,pronunciation_auto,pos2definition_auto=extract_item_properties(data_auto[i])
        for j in range(len(pos2definition_manual)):
            attributes_manual=pos2definition_manual[j]["attributes"]
            attributes_auto=pos2definition_auto[j]["attributes"]
            manual_diff,auto_diff=attribute_different(attributes_manual,attributes_auto)
            if len(manual_diff)!=0 or len(auto_diff)!=0:
                write_error((i,j),data_manual,data_auto,fp_diff)
    fp_diff.close()  # close the diff report once all items have been compared
def compare_all(data_manual,data_auto):
    def2result={}
    for i in range(len(data_manual)):
        concept_manual,pronunciation_manual,pos2definition_manual=extract_item_properties(data_manual[i])
        concept_auto,pronunciation_auto,pos2definition_auto=extract_item_properties(data_auto[i])
#         print 'concept_manual',concept_manual,'concept_auto',concept_auto
        for j in range(len(pos2definition_manual)):
            attributes_manual=pos2definition_manual[j]["attributes"]
            attributes_auto=pos2definition_auto[j]["attributes"]
#             print 'attributes_manual',attributes_manual,'\nattributes_auto',attributes_auto
            similarity,attribute2value_similarity=attribute_compare(attributes_manual,attributes_auto)
#             print similarity,attribute2value_similarity
            def2result[pos2definition_manual[j]["definition"]]=(similarity,attribute2value_similarity)
    return def2result
Example #4
def extract_items_single_thread(data, pattern2attrubute, tagger):
    """Extract attribute-value pairs from every definition in a single thread, logging per-definition timing."""
    data_new = []
    all_time = 0
    for item in data:
        concept, pronunciation, pos2definition = extract_item_properties(item)
        for pos2def in pos2definition:
            definition = pos2def["definition"]
            #             tagged_text=stanford_tagger.tag(definition.split())
            definition_pure = re.sub(r'\([\s\S]*?\)', "", definition)
            #             cnt+=1
            start = datetime.datetime.now()
            attributes2value = process_definition(definition_pure,
                                                  pattern2attrubute, tagger)
            end = datetime.datetime.now()
            # accumulate elapsed time in milliseconds
            all_time += (end - start).seconds * 1000 + (end -
                                                        start).microseconds // 1000
            logger.info('process_definition time: %ds %dms ' %
                        ((end - start).seconds,
                         (end - start).microseconds // 1000))
            pos2def["attributes"] = attributes2value
        logger.info("\n\n")
        data_new.append(item)
    global tag_time_all
    logger.info("tag all time is: %d" % tag_time_all)
    global find_candidate_time
    logger.info("find candidate time is: %d" % find_candidate_time)
    logger.info("all time is: %d" % all_time)
    return data_new
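A hedged sketch of how this extractor might be wired together, reusing the load_json, acquire_patterns, and get_tagger helpers that appear in the other examples; the train/apply split and the file choices are assumptions:
def run_extraction_single_thread():
    # Hypothetical driver: learns patterns from the hand-tagged items, then applies them to the raw items.
    path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    data_tagged = load_json(path_project + os.sep + "input" + os.sep + "items_tagged_modified.json")
    data_raw = load_json(path_project + os.sep + "input" + os.sep + "prosthodontic_items_full.json")
    pattern2attrubute = acquire_patterns(data_tagged)
    tagger = get_tagger()
    return extract_items_single_thread(data_raw, pattern2attrubute, tagger)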
Example #5
def upload(data):
    """Upload the extracted items to a local Neo4j instance and link nodes via their cross_reference attributes."""
    from py2neo import Node, Relationship
    from py2neo import Graph
    graph = Graph("http://localhost:7474",
                  username="******",
                  password="******")
    graph.delete_all()
    nodes = []
    for item in data:
        concept, pronunciation, pos2definition = extract_item_properties(item)
        node_tmp = Node("Prosthodontics", name=concept)
        node_tmp.properties["pronunciation"] = pronunciation
        cnt = 1
        for pos2def in pos2definition:
            node_tmp.properties["pos " + str(cnt)] = pos2def["pos"]
            #             node_tmp.properties["definition "+str(cnt)]=pos2def["definition"]
            for attribute, value in pos2def["attributes"].iteritems():
                node_tmp["def " + str(cnt) + " : " + attribute] = value
            cnt += 1  # advance the sense counter so each definition gets its own numbered keys
        graph.create(node_tmp)
        nodes.append(node_tmp)
    print "nodes create over , relation start to create"

    for node1 in nodes:
        properties = node1.properties.keys()
        for property in properties:
            if property[8:] == "cross_reference":
                for node2 in nodes:
                    if node2.properties["name"] == node1[property]:
                        graph.create(
                            Relationship(node1, "cross_reference", node2))
    print "graph create over"
def extract_items_all(data,pattern2attrubute):
    data_new=[]
    all_time=0
#     attributes=set([])
    cnt=0
    for item in data:
#         print 'processing %d item'%cnt
        concept,pronunciation,pos2definition=extract_item_properties(item)
        for pos2def in pos2definition:
            definition=pos2def["definition"]
#             tagged_text=stanford_tagger.tag(definition.split())
            definition_pure=re.sub(r'\([\s\S]*?\)', "", definition)
#             cnt+=1
            start=datetime.datetime.now()
#             for key in pos2def["attributes"]:
#                 attributes.add(key)
            attributes2value=process_definition(definition_pure,pattern2attrubute)
            end=datetime.datetime.now()
            # accumulate elapsed time in milliseconds
            all_time+=(end-start).seconds*1000+(end-start).microseconds//1000
            logger.info('process_definition time: %ds %dms ' % ((end-start).seconds,(end-start).microseconds//1000))
            pos2def["attributes"]=attributes2value
        cnt+=1
        logger.info("\n\n")
        data_new.append(item)
#     for attribue in sorted(list(attributes)):
#         print attribue
    global tag_time_all
    logger.info("tag all time is: %d"%tag_time_all)
    global find_candidate_time
    logger.info("find candidate time is: %d"%find_candidate_time)
    logger.info("all time is: %d"%all_time)
    return data_new
def modify_data(data):
    """Clean the raw items: split concepts from abbreviations, drop 'See ...' and non-noun definitions, and rewrite definitions with definition_restore."""
    logger.info("starting to transfer the data")
    data_new = []
    cnt_item = 0
    print "data size: %d" % len(data)
    while cnt_item < len(data):
        item = data[cnt_item]
        #         print 'processing the %d item' % cnt_item
        logger.info('processing the %d item' % cnt_item)
        concept, pronunciation, pos2definition = extract_item_properties(item)
        concept_result = concept_analysis(concept)
        logger.info(concept + " : concept result is:  " + str(concept_result) +
                    "\n")
        concept_real = concept_result[0]
        item['concept'] = concept_real
        if len(concept_result[1]) > 0:
            item['abbr'] = concept_result[1]
        for i in range(len(pos2definition) - 1, -1, -1):
            pos2def = pos2definition[i]
            definition = pos2def["definition"]
            def_tokens = nltk.word_tokenize(
                re.sub(r'\([\s\S]*?\)', "", definition).strip())
            logger.info(def_tokens[0])
            if def_tokens[0] in ['See', 'see']:
                logger.info('concept: %s \n definition: %s \n is removed ' %
                            (concept, definition))
                logger.info('\n')
                pos2definition.remove(pos2def)
                continue

            pos = pos2def["pos"]
            if 'n' not in pos:
                logger.info('concept: %s \n definition: %s \n is removed ' %
                            (concept, definition))
                logger.info('\n')
                pos2definition.remove(pos2def)
                continue

            definition_new = definition_restore(
                nltk.word_tokenize(concept_real), def_tokens)
            pos2def["definition"] = definition_new
            logger.info("\n" + definition + "\n definition result is:  \n" +
                        definition_new)
        if len(pos2definition) == 0:
            data.remove(item)
            logger.info('concept: %s is removed ' % concept)
            logger.info('\n')
            continue
        cnt_item += 1
        data_new.append(item)


#         logger.info('\n')
    print "items left %d " % cnt_item
    return data_new
def analysis_data(data):
    cnt = 0
    for item in data:
        concept, pronunciation, pos2definition = extract_item_properties(item)
        if len(pos2definition) > 1:
            for pos2def in pos2definition:
                definition = pos2def["definition"]
                if definition[0:4] in ['See ', 'see ']:
                    print concept
                    cnt += 1
    print cnt
def extract_single_item(data,i,new_data,pattern2attrubute,stanford_tagger):
    print i,'start'
    concept,pronunciation,pos2definition=extract_item_properties(data[i])
    for pos2def in pos2definition:
        definition=pos2def["definition"]
        definition_pure=re.sub(r'\([\s\S]*?\)', "", definition)
        attributes2value=process_definition(definition_pure,pattern2attrubute,stanford_tagger)
        pos2def["attributes"]=attributes2value
    logger.info("\n\n")
    print i,' over'
    new_data.append(data[i])
def acquire_patterns(data):
    """Mine candidate prefix patterns for each attribute from the tagged definitions, filter them, and keep patterns that map to a single attribute."""
    attribute2patterns_can_all = {}
    for item in data:
        concept, pronunciation, pos2definition = extract_item_properties(item)
        #         logger.info('concept: %s'%concept)
        for pos2def in pos2definition:
            definition = pos2def['definition']
            #             logger.info('definition: \n%s'%definition)
            text = nltk.word_tokenize(definition)
            definition_pos = nltk.pos_tag(text)
            #             logger.info('definition_pos: \n%s'%definition_pos)
            definition_tokens = get_tokens(definition_pos)
            attributes = pos2def['attributes']
            for attribute_name, attribute_value in attributes.iteritems():
                #                 logger.info('attribute_name: %s'%attribute_name)
                attribute_value_text = nltk.word_tokenize(attribute_value)
                attribute_value_tokens = nltk.pos_tag(attribute_value_text)
                #                 logger.info('attributes value token: %s'%attribute_value_tokens)
                attribute_tokens = get_tokens(attribute_value_tokens)
                prefixs_tokens2intersect = get_fix(prefix_window_size,
                                                   attribute_tokens,
                                                   definition_tokens)
                #                 logger.info('prefixs_token: \n%s'%str([str((token.word,token.pos))+": "+str(intersect) for (tokens,intersect) in prefixs_tokens2intersect for token in tokens]))
                prefixs = get_combination_fix(prefixs_tokens2intersect)
                #                 logger.info('prefix: \n%s'%str(prefixs))
                #                 if attribute_name in ['isA','isThe','purpose','used_for']:
                #                     continue
                #                 if attribute_name in ['isA','isThe']:
                #                     continue
                if attribute_name in attribute2patterns_can_all.keys():
                    attribute2patterns_can_all[attribute_name].extend(
                        list(set(prefixs)))
                else:
                    attribute2patterns_can_all[attribute_name] = list(
                        set(prefixs))


#                 logger.info('\n')
#             logger.info('\n')

    attribute2patterns_all = {}
    for attribue, patterns in attribute2patterns_can_all.iteritems():
        print 'attribute: ' + attribue
        logger.info('final attribute: %s\n' % attribue)
        logger.info('patterns original length: ' + str(len(patterns)))
        logger.info('patterns original: %s\n' % str(patterns))
        filter_patterns = get_filter_pattern_seq(patterns)
        attribute2patterns_all[attribue] = filter_patterns

        #         logger.info('final attribue: %s\n'%attribue)
        logger.info('final pattern: %s\n' % str(filter_patterns))
    pattern2single_attribute = filter_efficient(attribute2patterns_all)
    return pattern2single_attribute
def test6():
    path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data= path_project+os.sep+"input"+os.sep+"items_tagged_modified.json"
    data=load_json(path_data)
    attributes=set([])
    for item in data:
        concept,pronunciation,pos2definition=extract_item_properties(item)
        for pos2def in pos2definition:
            for attribute in pos2def['attributes'].keys():
                attributes.add(attribute)
                
    for x in sorted(list(attributes)):
        print x
def test7():
    path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data= path_project+os.sep+"input"+os.sep+"items_tagged_modified.json"
    data=load_json(path_data)
    attributes=set([])
    for item in data:
        concept,pronunciation,pos2definition=extract_item_properties(item)
        for pos2def in pos2definition:
            definition=pos2def['definition']
            grammar = "NP: {<DT>?<JJ>*<NN>}"
            tokens=nltk.word_tokenize(definition)
            tagged=nltk.pos_tag(tokens)
            print nltk.RegexpParser(grammar).parse(tagged)
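For context, the noun-phrase grammar used in test7 can be tried on a standalone sentence; the sample text below is an assumption for illustration and relies on nltk being importable as in the surrounding code:
def chunk_demo():
    # Hypothetical standalone check of the NP grammar; the sentence is made up.
    grammar = "NP: {<DT>?<JJ>*<NN>}"
    sentence = "the artificial crown restores a damaged tooth"
    tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    print nltk.RegexpParser(grammar).parse(tagged)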
Example #13
def extract_items_single_thread(data, pattern2attrubute, tagger):
    """Extract attribute-value pairs from every definition in a single thread."""
    data_new = []
    for item in data:
        concept, pronunciation, pos2definition = extract_item_properties(item)
        for pos2def in pos2definition:
            definition = pos2def["definition"]
            #             tagged_text=stanford_tagger.tag(definition.split())
            definition_pure = re.sub(r'\([\s\S]*?\)', "", definition)
            #             cnt+=1
            attributes2value = process_definition(definition_pure,
                                                  pattern2attrubute, tagger)
            pos2def["attributes"] = attributes2value
        logger.info("\n\n")
        data_new.append(item)
    return data_new
def extract_all_items(data,patterns):
    data_new=[]
    stanford_tagger=get_tagger()
    cnt=0
    for item in data:
        cnt+=1
        concept,pronunciation,pos2definition=extract_item_properties(item)
        for pos2def in pos2definition:
            definition=pos2def["definition"]
#             tagged_text=stanford_tagger.tag(definition.split())
            definition_pure=re.sub(r'\([\s\S]*?\)', "", definition)
            cnt+=1
            attributes2value=process_definition(definition_pure,patterns,stanford_tagger)
            pos2def["attributes"]=attributes2value
        data_new.append(item)
    return data_new
def acquire_patterns(data):
    attribute2patterns_can_all={}
    for item in data:
        concept,pronunciation,pos2definition=extract_item_properties(item)
        logger.info('concept: %s'%concept)
        for pos2def in pos2definition:
            definition=pos2def['definition']
            logger.info('definition: \n%s'%definition)
            text=nltk.word_tokenize(definition)
            definition_pos=nltk.pos_tag(text)
            logger.info('definition_pos: \n%s'%definition_pos)
            definition_tokens=get_tokens(definition_pos)
            attributes=pos2def['attributes']
            for attribute_name,attribute_value in attributes.iteritems():
                logger.info('attribute_name: %s'%attribute_name)
                attribute_value_text=nltk.word_tokenize(attribute_value)
                attribute_value_tokens=nltk.pos_tag(attribute_value_text)
                logger.info('attributes value token: %s'%attribute_value_tokens)
                attribute_tokens=get_tokens(attribute_value_tokens)
                prefixs_tokens=get_fix('pre',prefix_window_size,attribute_tokens,definition_tokens)
                logger.info('prefixs_token: \n%s'%str([(token.word,token.pos) for tokens in prefixs_tokens for token in tokens]))
#                 postfixs_tokens=get_fix('post',prefix_window_size,attribute_tokens,definition_tokens)
#                 logger.info('postfixs_token: \n%s'%str([(token.word,token.pos) for tokens in postfixs_tokens for token in tokens]))
                prefixs=get_combination_fix(prefixs_tokens)
                logger.info('prefix: \n%s'%str(prefixs))
#                 postfixs=get_combination_fix(postfixs_tokens)
#                 logger.info('postfix: \n%s'%str(postfixs))

#                 logger.info('all candidate patterns: \n%s'%str(patterns))
#                 if attribute_name in attribute2patterns_can_all.keys():
#                     attribute2patterns_can_all[attribute_name].extend(patterns)
#                 else:
#                     attribute2patterns_can_all[attribute_name]=patterns
                logger.info('\n')
            logger.info('\n')
    
    attribute2patterns_all={}      
    for attribue,patterns  in attribute2patterns_can_all.iteritems():
        filter_patterns=get_filter_pattern(patterns)
        attribute2patterns_all[attribue]=filter_patterns
        logger.info('final attribute: %s\n'%attribue)
        logger.info('final pattern: %s\n'%str(filter_patterns))
    return  attribute2patterns_can_all
def main():
    path_project = os.path.abspath(
        os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_book = path_project + os.sep + "input" + os.sep + "prosthodontic_items_full.json"
    path_stop_words = path_project + os.sep + "input" + os.sep + "stop_words"
    #     path_out = path_project+os.sep+"output"+os.sep+"sorted_result.txt"
    path_sent_words_freq = path_project + os.sep + "output" + os.sep + "sent_words_freq.txt"
    data = json.load(open(path_book, "r"), encoding="utf-8")
    definaitons = []
    for item in data:
        concept, pronunciation, pos2definition = extract_item_properties(item)
        for pos2def in pos2definition:
            definaitons.append(pos2def["definition"])
    uni_thre = 5
    bi_thre = 5
    tri_thre = 3
    uni_filter, bi_filter, tri_filter = filter_ngram(definaitons, uni_thre,
                                                     bi_thre, tri_thre)
    fp = codecs.open(path_sent_words_freq, 'w', 'utf-8')

    stop_words = open(path_stop_words, "r").readlines()
    new_stop_words = []
    for stop_word in stop_words:
        new_stop_words.append(stop_word.strip())

    ngrams_filter = get_ngram_filter(new_stop_words, uni_filter, bi_filter,
                                     tri_filter, uni_thre, bi_thre, tri_thre)
    del ngrams_filter[0]
    ngrams_filter_words = []
    ngrams_filter_freqs = []
    for ngram_filter in ngrams_filter:
        ngrams_filter_words.append(ngram_filter[0])
        ngrams_filter_freqs.append(ngram_filter[1])

    for defination in definaitons:
        line_candidate = get_sent_high_freq_word(defination,
                                                 ngrams_filter_words,
                                                 ngrams_filter_freqs)
        fp.write(defination + "\n")
        fp.write(str(line_candidate) + "\n\n")
Example #17
def test2():
    path_project = os.path.abspath(
        os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data = path_project + os.sep + "input" + os.sep + "tagged_items.json"
    data = load_json(path_data)
    tagger = get_tagger()
    cnt_same_pos_all = 0
    cnt_same_word_all = 0
    for item in data:
        concept, pronunciation, pos2definition = extract_item_properties(item)
        # compare the two taggers on every definition of the item
        for pos2def in pos2definition:
            definition = pos2def["definition"]
            text = nltk.word_tokenize(definition)
            def_pos1 = tagger.tag(text)
            logger.info(def_pos1)
            def_pos2 = nltk.pos_tag(text)
            logger.info(def_pos2)
            similar, cnt_same_pos, cnt_same_word = compare_similar_pos(
                def_pos1, def_pos2)
            cnt_same_pos_all += cnt_same_pos
            cnt_same_word_all += cnt_same_word
    print float(cnt_same_pos_all) / cnt_same_word_all
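compare_similar_pos itself is not shown in these examples; a plausible stand-in that counts the tokens on which the two taggers agree might look like the sketch below (this is an assumption, not the original implementation):
def compare_similar_pos_sketch(def_pos1, def_pos2):
    # Hypothetical stand-in: counts shared tokens and how many of them received the same POS tag.
    cnt_same_pos = 0
    cnt_same_word = 0
    for (word1, pos1), (word2, pos2) in zip(def_pos1, def_pos2):
        if word1 == word2:
            cnt_same_word += 1
            if pos1 == pos2:
                cnt_same_pos += 1
    similar = cnt_same_word > 0 and cnt_same_pos == cnt_same_word
    return similar, cnt_same_pos, cnt_same_word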