# Local import: only needed to report a failed similarity search below.
import traceback

# Starts empty on every run. The disabled snippet below would instead load
# the list from CALCULATED_FILE:
# try:
#     calculated_clauses = fio.readFileToList(CALCULATED_FILE)
# except Exception, e:
#     print e
calculated_clauses = []

# Load the clause -> frequency table and keep only the entries whose key,
# as a str, is longer than 3 characters.
# Original note: "clause length more than 1 chinese word" -- presumably the
# keys are UTF-8 byte strings (3 bytes per Chinese character); TODO confirm.
clause_freq = fio.readFileToDict(CLAUSE_FREQ_FILE)
clause_freq = dict(
    (str(k), v) for k, v in clause_freq.iteritems() if len(str(k)) > 3
)

# All remaining clauses, shortest first. Keys are already str at this point.
# NOTE(review): `clauses` and `clause_num` are not used again in this block;
# presumably they are read as module globals elsewhere -- confirm before
# removing.
clauses = sorted(clause_freq, key=len)
clause_num = len(clauses)

# Split the clauses at a frequency threshold of 2.
high_freq_clauses = [k for k, v in clause_freq.iteritems() if v > 2]
low_freq_clauses = [k for k, v in clause_freq.iteritems() if v <= 2]

# Previously recorded similar clauses, wrapped in a manager-backed list.
# NOTE(review): presumably find_similar_in_lowfreqlist (or processes it
# starts) appends to this shared list -- confirm against its definition.
similar_clauses = fio.readFileToList(SIMIL_FILE)
mgr = multiprocessing.Manager()
similar_clauses = mgr.list(similar_clauses)

try:
    find_similar_in_lowfreqlist(calculated_clauses, low_freq_clauses)
except BaseException:
    # Best-effort run: as before, nothing propagates (not even Ctrl-C), so
    # the results gathered so far are still written out below. The failure
    # is now reported instead of being silently discarded.
    traceback.print_exc()
finally:
    # Convert the manager proxy back to a plain list before persisting.
    similar_clauses = list(similar_clauses)
    fio.recordToFile(HIGH_FREQ_FILE, high_freq_clauses)
    fio.recordToFile(SIMIL_FILE, similar_clauses)
    fio.recordToFile(
        HIGH_FREQ_SIMIL_FILE, set(high_freq_clauses + similar_clauses)
    )
    fio.recordToFile(CALCULATED_FILE, calculated_clauses)
xml_string = "" xml_line = "" while(not xml_line.startswith("</xml4nlp>")): xml_line = f.readline() xml_string+=xml_line if not clause.encode('utf-8') in highfreq_clauses: continue doc = parseString(xml_string.encode('utf-8')) relates = [node.getAttribute('relate') for node in doc.getElementsByTagName("word")] words = [node.getAttribute('cont') for node in doc.getElementsByTagName("word")] relateString = "" for i in relates: relateString+=(i+"@") relateString = relateString.rstrip('@') if relateString in FINDING_PATTERN: word_list.append(words[relates.index(FINDING_RELATE)].encode('utf-8')) if not relate_clauselist.has_key(relateString): relate_clauselist[relateString] = [] relate_clauselist[relateString].append(clause) #print len(relate_clauselist),"relations" #sort relate_clauses by number of clauses sorted_relate_clauses = sorted(relate_clauselist.items(),key = lambda x:len(x[1]),reverse = True) for k,clausel in sorted_relate_clauses: print k for i in clausel: print i.encode('utf-8') fio.recordToFile(WORD_FILE,set(word_list))