Esempio n. 1
0
def map_wordsmodel_overall_weighting(fin, fout, wordsmodel, src_field="g0", dst_field="g1",
                                     weight=lambda wordsmodel, doc_wordid2count, wordid:
                                         tf(doc_wordid2count, wordid) * idf(wordsmodel, wordid)):
    """Map the word counts in src_field to word weights stored in dst_field.

    Records are read from fin with zbl_io.read_zbl_records. For every record
    that contains src_field, the field is unpacked into a word-id -> count
    mapping, weight(wordsmodel, doc_wordid2count, wordid) is evaluated for each
    word id, and the sorted (wordid, weight) pairs are packed into dst_field.
    Every record, enriched or not, is written to fout followed by a newline.

    The default weight is tf(doc_wordid2count, wordid) * idf(wordsmodel, wordid).

    Returns the number of records that contained src_field (i.e. were enriched).
    """
    counter = 0
    for i, record in enumerate(zbl_io.read_zbl_records(fin)):
        if i % 100 == 0:
            # Lazy %-formatting; the separators keep the two numbers readable
            # (the message used to run "processed." straight into the count).
            logging.info("[map_wordsmodel_overall_weighting] %d records processed. %d enriched.",
                         i, counter)
        if src_field in record:
            doc_wordid2count = _di_(zbl_io.unpack_dictionary_field(record[src_field]))
            # Only the word ids are needed here; the weight function looks the
            # counts up itself.
            # NOTE(review): assumes _di_ returns a dict-like mapping whose
            # iteration yields its keys -- confirm against _di_.
            doc_wordid2weight = [(wordid, weight(wordsmodel, doc_wordid2count, wordid))
                                 for wordid in doc_wordid2count]
            record[dst_field] = zbl_io.pack_listpairs_field(sorted(doc_wordid2weight))
            counter += 1
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return counter
Esempio n. 2
0
def modify_wordslist_file(fin, fout, list_of_fields, wordslist_modifier):
    """Rewrite the word lists stored in selected fields of every record.

    Records are read from fin with zbl_io.read_zbl_records. For each field in
    list_of_fields that is present in a record, the field value is split on
    whitespace, passed through wordslist_modifier and stored back as a single
    space-separated string. Every record is then written to fout followed by
    a newline.

    wordslist_modifier(words list) -> modified_words list
    (for example, a function that merges single words into n-grams).

    If the modifier returns nothing usable (an empty result), a warning is
    logged and the original words are kept.
    """
    for record in zbl_io.read_zbl_records(fin):
        for field in list_of_fields:
            if field not in record:
                continue
            words = record[field].split()
            modified_words = wordslist_modifier(words)
            if not modified_words:
                logging.warning(
                    "Error in an=%s in field %s=%s. Using single words instead.",
                    record[zbl_io.ZBL_ID_FIELD], field, record[field])
                modified_words = words
            # str.join copes with an empty list (a field holding no words),
            # where reduce() without an initial value raised TypeError.
            record[field] = ' '.join(modified_words)
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
Esempio n. 3
0
def modify_wordslist_file(fin, fout, list_of_fields, wordslist_modifier):
    """Rewrite the word lists stored in selected fields of every record.

    Records are read from fin with zbl_io.read_zbl_records. For each field in
    list_of_fields that is present in a record, the field value is split on
    whitespace, passed through wordslist_modifier and stored back as a single
    space-separated string. Every record is then written to fout followed by
    a newline.

    wordslist_modifier(words list) -> modified_words list
    (for example, a function that merges single words into n-grams).

    If the modifier returns nothing usable (an empty result), a warning is
    logged and the original words are kept.
    """
    for record in zbl_io.read_zbl_records(fin):
        for field in list_of_fields:
            if field not in record:
                continue
            words = record[field].split()
            modified_words = wordslist_modifier(words)
            if not modified_words:
                logging.warning(
                    "Error in an=%s in field %s=%s. Using single words instead.",
                    record[zbl_io.ZBL_ID_FIELD], field, record[field])
                modified_words = words
            # str.join copes with an empty list (a field holding no words),
            # where reduce() without an initial value raised TypeError.
            record[field] = ' '.join(modified_words)
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
Esempio n. 4
0
 # Fragment of a larger function whose header is not visible here.
 # Debug output: for n = 1, 2, 3 print how many entries n2terms holds under
 # that key (empty list if the key is missing) and the first 500 characters
 # of their string representation.
 print "len(n2terms[1])=",len(n2terms.get(1, [])),"->", str(n2terms.get(1, []))[:500]
 print "len(n2terms[2])=",len(n2terms.get(2, [])), "->", str(n2terms.get(2, []))[:500]
 print "len(n2terms[3])=",len(n2terms.get(3, [])),"->", str(n2terms.get(3, []))[:500]
 
 # Second pass over the source file: rewrite the selected fields of every
 # record and write the record to fout.
 # NOTE(review): the file object returned by open(fin) is not closed in the
 # visible code.
 print "REPROCESSING src file"            
 for N, record in enumerate(zbl_io.read_zbl_records(open(fin))):
     # Progress message every 500 records.
     if N%500==0: print N, "read"
     # NOTE(review): doc is never read in the visible code.
     doc = []
     for field in list_of_fields:
         if not record.has_key(field): continue
         words = record[field].split()
         
         # NOTE(review): prohibited and mgrams are not read after this point
         # in the visible code, so the n-gram loop below does not influence
         # the value written to the record.
         selected_words = []
         prohibited = set()
         mgrams = []
         # n counts down from min(maxn, len(words)+1) to 2.
         for n in xrange(min(maxn, len(words)+1), 1, -1):
             ngrams = build_ngrams(words, n, ngram_separator)
             # Keep only the n-grams contained in `selected`.
             ngrams = [w for w in ngrams if w in selected]
                             
             mgrams.extend( ngrams )
             
         modified_words = build_mgrams(words, maxn)
                     
         # Keep only the terms contained in `selected`.
         selected_words = [w for w in modified_words if w in selected]
         try:
             # Joins the selected terms with single spaces. reduce() without an
             # initial value raises TypeError for an empty selected_words.
             record[field] = reduce(lambda w1,w2: (w1)+' '+(w2), selected_words)
         except:
             # Bare except: any failure (including the empty-list TypeError)
             # is only printed, and record[field] keeps its previous value.
             print "ERR:", modified_words," -> ",selected_words
     # Every record is written out, followed by a newline.
     zbl_io.write_zbl_record(fout, record)
     fout.write("\n")
     
Esempio n. 5
0
    # Fragment of a larger function whose header is not visible here.
    # Debug output: number of entries n2terms holds under key 3 (empty list if
    # the key is missing) and the first 500 characters of their string
    # representation.
    print "len(n2terms[3])=", len(n2terms.get(3, [])), "->", str(
        n2terms.get(3, []))[:500]

    # Second pass over the source file: rewrite the selected fields of every
    # record and write the record to fout.
    # NOTE(review): the file object returned by open(fin) is not closed in the
    # visible code.
    print "REPROCESSING src file"
    for N, record in enumerate(zbl_io.read_zbl_records(open(fin))):
        # Progress message every 500 records.
        if N % 500 == 0: print N, "read"
        # NOTE(review): doc is never read in the visible code.
        doc = []
        for field in list_of_fields:
            if not record.has_key(field): continue
            words = record[field].split()

            # NOTE(review): prohibited and mgrams are not read after this
            # point in the visible code, so the n-gram loop below does not
            # influence the value written to the record.
            selected_words = []
            prohibited = set()
            mgrams = []
            # n counts down from min(maxn, len(words) + 1) to 2.
            for n in xrange(min(maxn, len(words) + 1), 1, -1):
                ngrams = build_ngrams(words, n, ngram_separator)
                # Keep only the n-grams contained in `selected`.
                ngrams = [w for w in ngrams if w in selected]

                mgrams.extend(ngrams)

            modified_words = build_mgrams(words, maxn)

            # Keep only the terms contained in `selected`.
            selected_words = [w for w in modified_words if w in selected]
            try:
                # Joins the selected terms with single spaces. reduce() without
                # an initial value raises TypeError for an empty selected_words.
                record[field] = reduce(lambda w1, w2: (w1) + ' ' + (w2),
                                       selected_words)
            except:
                # Bare except: any failure (including the empty-list TypeError)
                # is only printed, and record[field] keeps its previous value.
                print "ERR:", modified_words, " -> ", selected_words
        # Every record is written out, followed by a newline.
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")