コード例 #1
0
    # Fragment of a larger routine (its header is not visible here): load the
    # clause data, split clauses by frequency, run the similarity search and
    # record the results whether or not the search raised.
    # NOTE(review): this empty default is overwritten two lines below; it only
    # mattered while the commented-out try/except was active.
    calculated_clauses = [] 
    #try:
    calculated_clauses = fio.readFileToList(CALCULATED_FILE)
    #except Exception,e:
    #    print e
    #    pass

    clause_freq = fio.readFileToDict(CLAUSE_FREQ_FILE)
    # Keep only entries whose str() key is longer than 3. The original note
    # says this means "more than one Chinese word"; presumably the keys are
    # UTF-8 byte strings (3 bytes per Chinese character) -- TODO confirm.
    clause_freq = dict([str(k),v] for k,v in clause_freq.iteritems() if len(str(k)) > 3)
    clauses = [str(k) for k in clause_freq.keys()]
    # Order the clauses from shortest to longest.
    clauses = sorted(clauses,key = lambda x:len(x))
    clause_num = len(clauses)
    # Split on the value stored in clause_freq: greater than 2 goes to the
    # high-frequency list, everything else to the low-frequency list.
    high_freq_clauses = [str(k) for k,v in clause_freq.iteritems() if v >2]
    low_freq_clauses = [str(k) for k,v in clause_freq.iteritems() if v <=2]

    # Load the previously recorded similar clauses and wrap them in a
    # multiprocessing.Manager list proxy.
    # NOTE(review): presumably worker processes append to this list; where it
    # is mutated is not visible in this fragment -- confirm.
    similar_clauses = fio.readFileToList(SIMIL_FILE)
    mgr = multiprocessing.Manager()
    similar_clauses = mgr.list(similar_clauses)

    try:
        find_similar_in_lowfreqlist(calculated_clauses,low_freq_clauses)
    except:
        # NOTE(review): a bare except discards every exception, including
        # KeyboardInterrupt and SystemExit, without reporting it. Presumably
        # intentional so an interrupted run still saves its progress below --
        # confirm, and consider at least logging the error.
        pass
    finally:
        # Runs on both success and failure: convert the Manager proxy back to a
        # plain list, then hand each result collection to fio.recordToFile.
        similar_clauses = list(similar_clauses)
        fio.recordToFile(HIGH_FREQ_FILE,high_freq_clauses)
        fio.recordToFile(SIMIL_FILE,similar_clauses)
        # De-duplicated union of the high-frequency and similar clauses.
        fio.recordToFile(HIGH_FREQ_SIMIL_FILE,set(high_freq_clauses+similar_clauses))
        fio.recordToFile(CALCULATED_FILE,calculated_clauses)
    
コード例 #2
0
                # Read one XML record from f: accumulate lines up to and
                # including the first one that starts with "</xml4nlp>".
                # NOTE(review): if f is a regular file object and the closing
                # tag is missing, readline() keeps returning '' at EOF and this
                # loop never terminates -- confirm the input is always complete.
                xml_string = ""
                xml_line = ""
                while(not xml_line.startswith("</xml4nlp>")):
                    xml_line = f.readline()
                    xml_string+=xml_line
                # The record is read before this check, so it is consumed from f
                # even when the clause is skipped.
                if not clause.encode('utf-8') in highfreq_clauses:
                    continue
                # NOTE(review): parseString is presumably
                # xml.dom.minidom.parseString -- the import is not visible here.
                doc = parseString(xml_string.encode('utf-8'))
                # 'relate' and 'cont' attributes of every <word> element; both
                # lists are built from the same query, so they line up by index.
                relates = [node.getAttribute('relate') for node in doc.getElementsByTagName("word")]
                words = [node.getAttribute('cont') for node in doc.getElementsByTagName("word")]
                # Build the record's signature: every relate value followed by
                # '@', with all trailing '@' characters stripped afterwards.
                relateString = ""
                for i in relates:
                    relateString+=(i+"@")
                relateString = relateString.rstrip('@')
                # When the signature is found in FINDING_PATTERN, collect the word
                # at the position of the first FINDING_RELATE entry in relates.
                # list.index raises ValueError if FINDING_RELATE is absent.
                if relateString in FINDING_PATTERN:
                    word_list.append(words[relates.index(FINDING_RELATE)].encode('utf-8'))
                # Group clauses by signature (dict.has_key exists only in Python 2).
                if not relate_clauselist.has_key(relateString):
                    relate_clauselist[relateString] = []
                relate_clauselist[relateString].append(clause)

    #print len(relate_clauselist),"relations"

    # Order the (signature, clause list) pairs from the largest clause list to
    # the smallest.
    sorted_relate_clauses = sorted(relate_clauselist.items(),key = lambda x:len(x[1]),reverse = True)
    # Print each signature followed by its clauses, UTF-8 encoded.
    for k,clausel in sorted_relate_clauses:
        print k
        for i in clausel:
            print i.encode('utf-8')

    # Hand the de-duplicated collected words to fio.recordToFile for WORD_FILE.
    fio.recordToFile(WORD_FILE,set(word_list))