def main():
    """Compare manually tagged items against auto-tagged items and print counts."""
    # Project root is two directory levels above the current working directory.
    root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    manual_path = root + os.sep + "input" + os.sep + "tagged_items.json"
    auto_path = root + os.sep + "input" + os.sep + "items_tagged_auto.json"
    manual_items = load_json(manual_path)
    auto_items = load_json(auto_path)
    # def2result maps each definition to its comparison outcome.
    results = compare_all(manual_items, auto_items)
    count_result(results, manual_items, auto_items)
def main():
    """Compare modified manual tags with auto-generated tags, then list the differences."""
    # (Calls to IE() / IE_auto_pattern() were left commented out in the original.)
    root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    manual_path = root + os.sep + "input" + os.sep + "items_tagged_modified.json"
    auto_path = root + os.sep + "output" + os.sep + "items_tagged_auto.json"
    manual_items = load_json(manual_path)
    auto_items = load_json(auto_path)
    results = compare_all(manual_items, auto_items)
    count_result(results, manual_items, auto_items)
    compare_all_different(manual_items, auto_items)
def main():
    """Acquire, filter, merge and rank definition patterns, then write them out.

    Reads the tagged items, extracts pattern candidates, filters and merges
    them, computes a priority per pattern, and dumps the prioritised patterns
    plus a version sorted by attribute pattern.
    """
    # NOTE(review): the original also built absolute paths under
    # <project>/output/ for path_pattern and path_pattern_sorted but
    # immediately overwrote them with these relative names, so only the
    # relative paths (and none of path_project) ever took effect; the dead
    # assignments are removed here.
    path_data = "items_tagged_modified.json"
    path_pattern = "Patterns_auto.json"
    path_pattern_sorted = "pattern_auto_sorted_by_attribute_pattern.json"

    data = load_json(path_data)
    logger.info("loaded all the data")
    # Acquire all the pattern candidates.
    attribute2PSFSList = acquire_patterns(data)
    # Filter the pattern candidates.
    PASFFLSsList = filterPatterns(attribute2PSFSList)
    # Merge the pattern candidates.
    patterns = merge_pattern_all(PASFFLSsList)
    patterns_priority = calculate_proprity2pattern(patterns)
    logger.info("has acquired all the patterns")
    json.dump(patterns_priority, codecs.open(path_pattern, 'w', 'utf-8'),
              ensure_ascii=False, indent=2)
    sorted_by_attribute_pattern(patterns_priority, path_pattern_sorted)
    logger.info("output over")
def main():
    """Extract raw pattern tuples from tagged items, then merge, rank and dump them."""
    root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data = root + os.sep + "input" + os.sep + "items_tagged_modified.json"
    path_pattern = root + os.sep + "output" + os.sep + "Patterns_auto.json"
    path_pattern_sorted = root + os.sep + "output" + os.sep + "pattern_auto_sorted_by_attribute_pattern.json"
    data = load_json(path_data)
    logger.info("loaded all the data")
    # Tuples of (pattern, fix tokens, attribute) harvested from the data.
    tuples = acquire_patterns(data)
    logger.info('patternTfix_tokensTattribute_tuples: %d\n' % len(tuples) + str(tuples))
    patterns = merge_pattern_all(tuples)
    patterns_priority = calculate_proprity2pattern(patterns)
    logger.info("has acquired all the patterns")
    json.dump(patterns_priority, codecs.open(path_pattern, 'w', 'utf-8'),
              ensure_ascii=False, indent=2)
    sorted_by_attribute_pattern(patterns_priority, path_pattern_sorted)
    logger.info("output over")
def test2():
    """Recompute priorities for stored patterns and write them out sorted."""
    root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    src = root + os.sep + "input" + os.sep + "patterns.json"
    dst = root + os.sep + "output" + os.sep + "patterns_priority_sorted.json"
    prioritised = calculate_proprity2pattern(load_json(src))
    sorted_pattern(prioritised, dst)
def test(): path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir)) path_data= path_project+os.sep+"input"+os.sep+"items.json" path_output=path_project+os.sep+"output"+os.sep+"special_chars.txt" data=load_json(path_data) special_chars=find_special_char(data) fp=codecs.open(path_output, 'w','utf-8') for i in special_chars: fp.write(i+"\n") print special_chars
def test1():
    """Acquire patterns from the small test file and dump them as JSON."""
    root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data = root + os.sep + "input" + os.sep + "items_tagged_modified_test.json"
    path_pattern = root + os.sep + "output" + os.sep + "patterns_auto_test.json"
    data = load_json(path_data)
    logger.info("loaded all the data")
    patterns = acquire_patterns(data)
    logger.info("has acquired all the patterns")
    json.dump(patterns, codecs.open(path_pattern, 'w', 'utf-8'),
              ensure_ascii=False, indent=2)
    logger.info("output over")
def main():
    """Extract items matching the known patterns and dump the tagged result."""
    root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data = root + os.sep + "input" + os.sep + "tagged_items.json"
    path_pattern = root + os.sep + "input" + os.sep + "Patterns.json"
    path_tagged_output = root + os.sep + "output" + os.sep + "test_items_tagged.txt"
    # Mapping of pattern -> attribute built from the stored pattern file.
    pattern2attribute = get_all_pattern(load_patterns(path_pattern))
    data = load_json(path_data)
    tagged = extract_all_items(data, pattern2attribute.keys())
    json.dump(tagged, codecs.open(path_tagged_output, 'w', 'utf-8'),
              ensure_ascii=False, indent=2)
def test7(): path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir)) path_data= path_project+os.sep+"input"+os.sep+"items_tagged_modified.json" data=load_json(path_data) attributes=set([]) for item in data: concept,pronunciation,pos2definition=extract_item_properties(item) for pos2def in pos2definition: definition=pos2def['definition'] grammar = "NP: {<DT>?<JJ>*<NN>}" tokens=nltk.word_tokenize(definition) tagged=nltk.pos_tag(tokens) print nltk.RegexpParser(grammar).parse(tagged)
def test6(): path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir)) path_data= path_project+os.sep+"input"+os.sep+"items_tagged_modified.json" data=load_json(path_data) attributes=set([]) for item in data: concept,pronunciation,pos2definition=extract_item_properties(item) for pos2def in pos2definition: for attribute in pos2def['attributes'].keys(): attributes.add(attribute) for x in sorted(list(attributes)): print x
def test4():
    """Run pattern-based attribute extraction over the modified items and dump it."""
    root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data = root + os.sep + "input" + os.sep + "items_modified.json"
    path_pattern = root + os.sep + "input" + os.sep + "patterns.json"
    path_tagged_output = root + os.sep + "output" + os.sep + "items_modified_auto.json"
    patterns = load_patterns(path_pattern)
    logger.info("loaded all the patterns")
    data = load_json(path_data)
    logger.info("loaded all the data")
    # Single-threaded extraction (a multi-threaded variant existed but was disabled).
    extracted = extract_items_all(data, patterns)
    logger.info("has extracted all the attributes")
    json.dump(extracted, codecs.open(path_tagged_output, 'w', 'utf-8'),
              ensure_ascii=False, indent=2)
    logger.info("output over")
def main():
    """Normalise the raw items file and write the modified copy to output."""
    root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    src = root + os.sep + "input" + os.sep + "items.json"
    dst = root + os.sep + "output" + os.sep + "items_modified.json"
    items = load_json(src)
    # (An analysis_data(items) pass existed in the original but was disabled.)
    modified = modify_data(items)
    json.dump(modified, codecs.open(dst, 'w', 'utf-8'),
              ensure_ascii=False, indent=2)
def test5(): path_project = os.path.abspath( os.path.join(os.getcwd(), os.pardir, os.pardir)) path_data = path_project + os.sep + "input" + os.sep + "items_tagged_modified.json" # path_pattern= path_project+os.sep+"output"+os.sep+"Patterns_auto.json" data = load_json(path_data) logger.info("loaded all the data") values = [] for item in data: pos2definition = item["pos2definition"] for pos2def in pos2definition: for value in pos2def["attributes"].values(): values.append(len(value.split(" "))) c_value_len = Counter(values) print sorted(c_value_len.iteritems(), key=lambda asd: asd[0], reverse=False) logger.info("output over")
def test8(): path_project = os.path.abspath( os.path.join(os.getcwd(), os.pardir, os.pardir)) path_data = path_project + os.sep + "output" + os.sep + "patterns_priority.json" path_data_new = path_project + os.sep + "output" + os.sep + "patterns_priority_new.json" data = load_json(path_data) data_new = {} print data for pattern, pattern_name in data.iteritems(): # print item pattern_new = tranfer_pattern(pattern) data_new[pattern_new] = pattern_name sorted_data = sorted(data.iteritems(), key=lambda asd: asd[1], reverse=True) json.dump(sorted_data, codecs.open(path_data_new, 'w', 'utf-8'), ensure_ascii=False, indent=2)
def test7():
    """Strip parenthesised spans from every attribute value and dump the result."""
    root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data = root + os.sep + "input" + os.sep + "items_tagged_modified.json"
    path_tagged_output = root + os.sep + "output" + os.sep + "items_tagged_modified_no_bracket.json"
    data = load_json(path_data)
    logger.info("loaded all the data")
    cleaned = []
    for item in data:
        for pos2def in item["pos2definition"]:
            attributes = pos2def["attributes"]
            for attribute in attributes.keys():
                # Remove any "(...)" span, non-greedy, matching across newlines.
                attributes[attribute] = re.sub(r'\([\s\S]*?\)', "", attributes[attribute])
        cleaned.append(item)
    json.dump(cleaned, codecs.open(path_tagged_output, 'w', 'utf-8'),
              ensure_ascii=False, indent=2)
    logger.info("output over")
def test2(): path_project = os.path.abspath( os.path.join(os.getcwd(), os.pardir, os.pardir)) path_data = path_project + os.sep + "input" + os.sep + "tagged_items.json" data = load_json(path_data) tagger = get_tagger() cnt_same_pos_all = 0 cnt_same_word_all = 0 for item in data: concept, pronunciation, pos2definition = extract_item_properties(item) for pos2def in pos2definition: definition = pos2def["definition"] text = nltk.word_tokenize(definition) def_pos1 = tagger.tag(text) logger.info(def_pos1) def_pos2 = nltk.pos_tag(text) logger.info(def_pos2) similar, cnt_same_pos, cnt_same_word = compare_similar_pos( def_pos1, def_pos2) cnt_same_pos_all += cnt_same_pos cnt_same_word_all += cnt_same_word print float(cnt_same_pos_all) / cnt_same_word_all
def main():
    """Acquire patterns and dump them plus reversed and reverse-sorted views."""
    root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data = root + os.sep + "input" + os.sep + "items_tagged_modified.json"
    path_pattern = root + os.sep + "output" + os.sep + "Patterns_auto.json"
    path_pattern_reverse = root + os.sep + "output" + os.sep + "Patterns_auto_reverse.json"
    path_pattern_reverse_sorted = root + os.sep + "output" + os.sep + "Patterns_auto_reverse_sorted.json"
    data = load_json(path_data)
    logger.info("loaded all the data")
    patterns = acquire_patterns(data)
    logger.info("has acquired all the patterns")
    json.dump(patterns, codecs.open(path_pattern, 'w', 'utf-8'),
              ensure_ascii=False, indent=2)
    json.dump(dict_reverse(patterns), codecs.open(path_pattern_reverse, 'w', 'utf-8'),
              ensure_ascii=False, indent=2)
    # dict_reverse is called again here, mirroring the original exactly.
    dict_sorted_value(dict_reverse(patterns), path_pattern_reverse_sorted)
    logger.info("output over")
def load_patterns(path):
    """Load the pattern definitions stored as JSON at *path*."""
    patterns = load_json(path)
    return patterns