def map_wordsmodel_overall_weighting(
        fin, fout, wordsmodel, src_field="g0", dst_field="g1",
        weight=lambda wordsmodel, doc_wordid2count, wordid:
            tf(doc_wordid2count, wordid) * idf(wordsmodel, wordid)):
    """Map the value of src_field using wordsmodel and a weighting function.

    Every record read from `fin` is written to `fout`, followed by a
    newline.  For records that contain `src_field`, a weight is computed
    for each word id found in that field and the (wordid, weight) pairs,
    sorted in ascending order, are stored in `dst_field`.

    Args:
        fin: input handed to zbl_io.read_zbl_records.
        fout: output handed to zbl_io.write_zbl_record; must also
            provide write().
        wordsmodel: passed through, untouched, to `weight`.
        src_field: name of the record field holding the packed
            wordid -> count dictionary.
        dst_field: name of the record field receiving the packed list of
            (wordid, weight) pairs.
        weight: callable(wordsmodel, doc_wordid2count, wordid) -> weight.
            The default multiplies tf(...) by idf(...).

    Returns:
        The number of records that contained `src_field`.
    """
    counter = 0
    for i, record in enumerate(zbl_io.read_zbl_records(fin)):
        if i % 100 == 0:
            # Lazy %-formatting; the emitted text matches the historical
            # message exactly (including its spacing).
            logging.info(
                "[map_wordsmodel_overall_weighting] %s records processed.%senriched.",
                i, counter)
        if src_field in record:
            # NOTE(review): _di_ is defined elsewhere; presumably it
            # normalises the unpacked dictionary -- confirm.
            doc_wordid2count = _di_(
                zbl_io.unpack_dictionary_field(record[src_field]))
            # Only the word ids are needed here; `weight` looks the counts
            # up itself through doc_wordid2count.
            doc_wordid2weight = [
                (wordid, weight(wordsmodel, doc_wordid2count, wordid))
                for wordid in doc_wordid2count
            ]
            record[dst_field] = zbl_io.pack_listpairs_field(
                sorted(doc_wordid2weight))
            counter += 1
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return counter
def modify_wordslist_file(fin, fout, list_of_fields, wordslist_modifier):
    """Convert single words in selected fields into n-grams by merging words.

    Each record read from `fin` is written to `fout`, followed by a
    newline.  For every field of `list_of_fields` present in a record, the
    field value is split on whitespace, passed through
    `wordslist_modifier` and replaced by the resulting words joined with
    single spaces.

    Args:
        fin: input handed to zbl_io.read_zbl_records.
        fout: output handed to zbl_io.write_zbl_record; must also
            provide write().
        list_of_fields: names of the record fields to rewrite; fields
            missing from a record are skipped.
        wordslist_modifier: callable(words list) -> modified_words list.
            If it returns nothing, a warning is logged and the original
            words are kept.
    """
    for record in zbl_io.read_zbl_records(fin):
        for field in list_of_fields:
            if field not in record:
                continue
            words = record[field].split()
            modified_words = wordslist_modifier(words)
            if not modified_words:
                logging.warning(
                    "Error in an=%s in field %s=%s. Using single words instead.",
                    record[zbl_io.ZBL_ID_FIELD], field, record[field])
                modified_words = words
            # join() copes with an empty list (a field without any words),
            # where reduce() without an initial value raises TypeError.
            record[field] = ' '.join(modified_words)
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
def modify_wordslist_file(fin, fout, list_of_fields, wordslist_modifier):
    """Convert single words in selected fields into n-grams by merging words.

    Each record read from `fin` is written to `fout`, followed by a
    newline.  For every field of `list_of_fields` present in a record, the
    field value is split on whitespace, passed through
    `wordslist_modifier` and replaced by the resulting words joined with
    single spaces.

    Args:
        fin: input handed to zbl_io.read_zbl_records.
        fout: output handed to zbl_io.write_zbl_record; must also
            provide write().
        list_of_fields: names of the record fields to rewrite; fields
            missing from a record are skipped.
        wordslist_modifier: callable(words list) -> modified_words list.
            If it returns nothing, a warning is logged and the original
            words are kept.
    """
    for record in zbl_io.read_zbl_records(fin):
        for field in list_of_fields:
            if field not in record:
                continue
            words = record[field].split()
            modified_words = wordslist_modifier(words)
            if not modified_words:
                logging.warning(
                    "Error in an=%s in field %s=%s. Using single words instead.",
                    record[zbl_io.ZBL_ID_FIELD], field, record[field])
                modified_words = words
            # join() copes with an empty list (a field without any words),
            # where reduce() without an initial value raises TypeError.
            record[field] = ' '.join(modified_words)
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
print "len(n2terms[1])=",len(n2terms.get(1, [])),"->", str(n2terms.get(1, []))[:500] print "len(n2terms[2])=",len(n2terms.get(2, [])), "->", str(n2terms.get(2, []))[:500] print "len(n2terms[3])=",len(n2terms.get(3, [])),"->", str(n2terms.get(3, []))[:500] print "REPROCESSING src file" for N, record in enumerate(zbl_io.read_zbl_records(open(fin))): if N%500==0: print N, "read" doc = [] for field in list_of_fields: if not record.has_key(field): continue words = record[field].split() selected_words = [] prohibited = set() mgrams = [] for n in xrange(min(maxn, len(words)+1), 1, -1): ngrams = build_ngrams(words, n, ngram_separator) ngrams = [w for w in ngrams if w in selected] mgrams.extend( ngrams ) modified_words = build_mgrams(words, maxn) selected_words = [w for w in modified_words if w in selected] try: record[field] = reduce(lambda w1,w2: (w1)+' '+(w2), selected_words) except: print "ERR:", modified_words," -> ",selected_words zbl_io.write_zbl_record(fout, record) fout.write("\n")
print "len(n2terms[3])=", len(n2terms.get(3, [])), "->", str( n2terms.get(3, []))[:500] print "REPROCESSING src file" for N, record in enumerate(zbl_io.read_zbl_records(open(fin))): if N % 500 == 0: print N, "read" doc = [] for field in list_of_fields: if not record.has_key(field): continue words = record[field].split() selected_words = [] prohibited = set() mgrams = [] for n in xrange(min(maxn, len(words) + 1), 1, -1): ngrams = build_ngrams(words, n, ngram_separator) ngrams = [w for w in ngrams if w in selected] mgrams.extend(ngrams) modified_words = build_mgrams(words, maxn) selected_words = [w for w in modified_words if w in selected] try: record[field] = reduce(lambda w1, w2: (w1) + ' ' + (w2), selected_words) except: print "ERR:", modified_words, " -> ", selected_words zbl_io.write_zbl_record(fout, record) fout.write("\n")