def _report_ci_(path, records_filter = allow_all_filter, \ ci_dst_records_filter = allow_all_filter, \ uq_id_field_name = zbl_io.ZBL_ID_FIELD): """Prints report about citations in ZBL file. records_filter(record) - should return True if record is admitted ci_dst_records_filter(record) - should return True if record that citation is pointing at is admitted uq_id_field_name - name of a field that uniquely identifies record """ #wczytywanie zbioru na ktory moga wskazywac cytowania: print "Loading ids of records that may be citation destination." dst_records_ids = set() for i,record in enumerate( zbl_io.read_zbl_records(open(path)) ): if i%100000 == 0: print i," records considered" #progress bar if record.has_key(uq_id_field_name) and ci_dst_records_filter(record): dst_records_ids.add(record[uq_id_field_name]) print "Done.", len(dst_records_ids), " records loaded." #statystyki: cis_len = [] #liczba cytowan cis_matched = [] #liczba cytowan ktore trafiaja w zadany zbior for i,record in enumerate(zbl_io.read_zbl_records(open(path))): if i%100000 == 0: print i," records considered" #progress bar if not record.has_key("ci") or not records_filter(record): continue cis = zbl_io.unpack_list_of_dictionaries(record["ci"]) #identyfikatory cytowan: identified_ci_ids = list(ci[uq_id_field_name] for ci in cis if ci.has_key(uq_id_field_name)) #rekordy dopsowane do cytowan i w zadanym zbiorze: filtered_matched_records = list(id for id in identified_ci_ids if id in dst_records_ids) cis_len.append(len(cis)) cis_matched.append(len(filtered_matched_records)) cis_matched_div_len = list( float(m)/float(l) for m,l in zip(cis_matched, cis_len) ) print "Citation statistics (only on records with citations) [total, min avg max std]: " print "-Number of citations :", "\t", round(sum(cis_len),0), "\t", round(min(cis_len),0), "\t", round(avg(cis_len),2), "\t", round(max(cis_len),0), "\t", round(std(cis_len),2) print "-Matching citations:", "\t", round(sum(cis_matched),0), "\t", 
round(min(cis_matched),0), "\t", round(avg(cis_matched),2), "\t", round(max(cis_matched),0), "\t", round(std(cis_matched),2) print "-Fraction of matching citations: - ", "\t", round(min(cis_matched_div_len),3), "\t", round(avg(cis_matched_div_len),3), "\t", round(max(cis_matched_div_len),3), "\t", round(std(cis_matched_div_len),3) print "-Total Number of citations/Matching citations:", "\t", round(float(sum(cis_matched))/sum(cis_len),3) print "->", round(sum(cis_len),0), (max(cis_len)), round(avg(cis_len),2), round(std(cis_len),2), \ round(sum(cis_matched),0), (max(cis_matched)), round(avg(cis_matched),2), round(std(cis_matched),2), \ round(max(cis_matched_div_len),3), round(avg(cis_matched_div_len),3), round(std(cis_matched_div_len),3) cis_matched_hist = {} for i in xrange(0, max(cis_matched)+1): cis_matched_hist[i] = sum(1 for c in cis_matched if c==i) print "Histogram:", cis_matched_hist n, bins, patches = plt.hist(sorted(cis_matched), bins = max(cis_matched), normed=False, alpha=0.75) plt.xlabel("Liczba dopasowanych cytowan") plt.ylabel("Liczba rekordow") plt.show()
def count_ids(fin):
    """Counts records, distinct 'an' ids and records that have an 'au' field.

    fin - source of records, handed to read_zbl_records

    Returns a tuple (number of records, number of distinct r['an'] values,
    number of records containing the 'au' key). Every record is expected to
    have an 'an' field; r['an'] is looked up unconditionally.
    """
    total = 0
    with_au = 0
    seen_ids = set()
    # Records are streamed; only the distinct ids have to be remembered.
    for r in read_zbl_records(fin):
        total += 1
        if 'au' in r:
            with_au += 1
        seen_ids.add(r['an'])
    return total, len(seen_ids), with_au
def count_ids(fin):
    """Counts records, distinct 'an' ids and records that have an 'au' field.

    fin - source of records, handed to read_zbl_records

    Returns a tuple (number of records, number of distinct r['an'] values,
    number of records containing the 'au' key). Every record is expected to
    have an 'an' field; r['an'] is looked up unconditionally.
    """
    total = 0
    with_au = 0
    seen_ids = set()
    # Records are streamed; only the distinct ids have to be remembered.
    for r in read_zbl_records(fin):
        total += 1
        if 'au' in r:
            with_au += 1
        seen_ids.add(r['an'])
    return total, len(seen_ids), with_au
def _draw_af_hist_(path, records_filter = allow_all_filter): """Draws histogram of authorship.""" af_count = {} #dict{author: count} for i,record in enumerate(zbl_io.read_zbl_records(open(path))): if i%100000 == 0: print i," records considered" #progress bar if not record.has_key("af") or not records_filter(record): continue afs = zbl_io.unpack_multivalue_field(record["af"]) afs_ok = list( af for af in afs if af!='-' ) for af in afs_ok: af_count[af] = af_count.get(af, 0) + 1 print len(af_count), " authors found." print max(af_count.values()), " = max" print min(af_count.values()), " = min" avg_af_values = avg(af_count.values()) print round(avg_af_values, 2), " = avg" print round(std(af_count.values()), 2), " = std" print sum(1 for af in af_count.values() if af > avg_af_values) , " authors above avg" print sum(1 for af in af_count.values() if af < avg_af_values) , " authors below avg" n, bins, patches = plt.hist(af_count.values(), bins = max(af_count.values()), normed=False, log=True, alpha=0.75) plt.xlabel("Liczba wystapien w rekordach") plt.ylabel("Liczba autorow") plt.show()
def _draw_af_hist_(path, records_filter=allow_all_filter): """Draws histogram of authorship.""" af_count = {} #dict{author: count} for i, record in enumerate(zbl_io.read_zbl_records(open(path))): if i % 100000 == 0: print i, " records considered" #progress bar if not record.has_key("af") or not records_filter(record): continue afs = zbl_io.unpack_multivalue_field(record["af"]) afs_ok = list(af for af in afs if af != '-') for af in afs_ok: af_count[af] = af_count.get(af, 0) + 1 print len(af_count), " authors found." print max(af_count.values()), " = max" print min(af_count.values()), " = min" avg_af_values = avg(af_count.values()) print round(avg_af_values, 2), " = avg" print round(std(af_count.values()), 2), " = std" print sum(1 for af in af_count.values() if af > avg_af_values), " authors above avg" print sum(1 for af in af_count.values() if af < avg_af_values), " authors below avg" n, bins, patches = plt.hist(af_count.values(), bins=max(af_count.values()), normed=False, log=True, alpha=0.75) plt.xlabel("Liczba wystapien w rekordach") plt.ylabel("Liczba autorow") plt.show()
def _report_af_quality_(path, records_filter=allow_all_filter): """Prints report about authors' identities quality.""" afs_len = [] afs_ok_len = [] for i, record in enumerate(zbl_io.read_zbl_records(open(path))): if i % 100000 == 0: print i, " records considered" #progress bar if not record.has_key("af") or not records_filter(record): continue afs = zbl_io.unpack_multivalue_field(record["af"]) afs_ok = list(af for af in afs if af != '-') afs_len.append(len(afs)) afs_ok_len.append(len(afs_ok)) afs_ok_frac = list( float(m) / float(l) for m, l in zip(afs_ok_len, afs_len)) print max(afs_len), "\n", round(avg(afs_len), 2), "\n", round(std(afs_len), 2) print max(afs_ok_len), "\n", round(avg(afs_ok_len), 2), "\n", round(std(afs_ok_len), 2) print round(max(afs_ok_frac), 2), "\n", round(avg(afs_ok_frac), 2), "\n", round(std(afs_ok_frac), 2)
def _get_zbl_generator_(zbl_path, must_have_field = 'mc'):
    """Yields records read from zbl_path that contain must_have_field.

    zbl_path - path handed to zbl_io.open_file
    must_have_field - records without this field are silently dropped
    """
    # The same flag is passed to both the opener and the reader
    # (presumably it switches on unicode handling -- confirm in zbl_io).
    uni_flag = True
    source = zbl_io.open_file(zbl_path, uni_flag)
    for candidate in zbl_io.read_zbl_records(source, uni_flag):
        if must_have_field not in candidate:
            continue
        yield candidate
def _get_zbl_generator_(zbl_path, must_have_field='mc'):
    """Yields records read from zbl_path that contain must_have_field.

    zbl_path - path handed to zbl_io.open_file
    must_have_field - records without this field are silently dropped
    """
    # The same flag is passed to both the opener and the reader
    # (presumably it switches on unicode handling -- confirm in zbl_io).
    uni_flag = True
    source = zbl_io.open_file(zbl_path, uni_flag)
    for candidate in zbl_io.read_zbl_records(source, uni_flag):
        if must_have_field not in candidate:
            continue
        yield candidate
def extract_citations_doublelinked_graph_file(fin, fout):
    """Reads zbl records from fin and writes a double-linked citation graph to fout.

    Output lines have the form zbl_id:zbl-id1,...,zbl-idN.
    If record r1 cites r2 then the output graph contains two links:
    r1->r2 and r2->r1.

    Returns whatever write_file_id2ids returns.
    """
    records = zbl_io.read_zbl_records(fin)
    # The extractor returns a mapping; its (id, ids) pairs are what gets written.
    graph = extract_citations_doublelinked_graph(records)
    id_ids_pairs = graph.iteritems()
    return write_file_id2ids(fout, id_ids_pairs, cast_container = set)
def extract_fv_graph_file(fin, fout, multival_field_name="af", empty_value="-"):
    """Reads zbl records from fin and writes to fout the graph extracted from a multi-value field.

    Output lines have the form zbl_id:id1,id2,id3.

    multival_field_name - name of the field the graph is extracted from
    empty_value - passed through to extract_fv_graph together with the field name

    Returns whatever write_file_id2ids returns.
    """
    records = zbl_io.read_zbl_records(fin)
    graph = extract_fv_graph(records, multival_field_name, empty_value, set)
    graph_pairs = graph.iteritems()
    return write_file_id2ids(fout, graph_pairs, cast_container=set)
def build_wordsmodel(fin, fout, src_field = "g0"):
    """Builds a WordsModel out of the src_field of the records read from fin.

    fin - source of records, handed to zbl_io.read_zbl_records
    fout - not used by this function
    src_field - field holding the packed word dictionary; records without it are skipped

    Returns the WordsModel instance after finish_updates() was called on it.
    """
    model = WordsModel()
    for index, record in enumerate(zbl_io.read_zbl_records(fin)):
        if index % 500 == 0:
            logging.info("[build_wordsmodel] "+str(index)+" records processed")
        if src_field not in record:
            continue
        unpacked = zbl_io.unpack_dictionary_field(record[src_field])
        model.update(_di_(unpacked))
    model.finish_updates()
    return model
def extract_citations_doublelinked_graph_file(fin, fout):
    """Reads zbl records from fin and writes a double-linked citation graph to fout.

    Output lines have the form zbl_id:zbl-id1,...,zbl-idN.
    If record r1 cites r2 then the output graph contains two links:
    r1->r2 and r2->r1.

    Returns whatever write_file_id2ids returns.
    """
    records = zbl_io.read_zbl_records(fin)
    # The extractor returns a mapping; its (id, ids) pairs are what gets written.
    graph = extract_citations_doublelinked_graph(records)
    id_ids_pairs = graph.iteritems()
    return write_file_id2ids(fout, id_ids_pairs, cast_container=set)
def count_occurrences(file, required_fields, records_filter=allow_all_filter):
    """Counts records in ZBL file that pass the filter and have all required fields.

    file - source of records, handed to zbl_io.read_zbl_records
    required_fields - fields checked with has_record_fields
    records_filter(record) - should return True if record is admitted

    Returns the number of admitted records for which has_record_fields is true.
    """
    # The filter is consulted first; the field check runs only for admitted records.
    return sum(1 for record in zbl_io.read_zbl_records(file)
               if records_filter(record) and has_record_fields(record, required_fields))
def count_occurrences(file, required_fields, records_filter = allow_all_filter):
    """Counts records in ZBL file that pass the filter and have all required fields.

    file - source of records, handed to zbl_io.read_zbl_records
    required_fields - fields checked with has_record_fields
    records_filter(record) - should return True if record is admitted

    Returns the number of admitted records for which has_record_fields is true.
    """
    # The filter is consulted first; the field check runs only for admitted records.
    return sum(1 for record in zbl_io.read_zbl_records(file)
               if records_filter(record) and has_record_fields(record, required_fields))
def find_fields(fname):
    """Collects the names of all fields that occur in the records of a file.

    fname - path of the file to read

    Returns a tuple (number of records, set of field names seen in any record).
    """
    total = 0
    fields = set()
    # The file is closed as soon as reading is finished.
    with open(fname, 'r') as src:
        for r in read_zbl_records(src):
            total += 1
            fields.update(r.iterkeys())
    return total, fields
def count_msc_occurences(file, records_filter=lambda x: True, field_name="mc"):
    """Counts how often every code occurs in field_name over the admitted records.

    file - source of records, handed to zbl_io.read_zbl_records
    records_filter(record) - should return True if record is admitted
    field_name - multi-value field holding the codes (MSC codes by default)

    Returns dictionary{code_name: count}.
    """
    tally = {}
    for record in zbl_io.read_zbl_records(file):
        if records_filter(record) and field_name in record:
            for code in zbl_io.unpack_multivalue_field(record[field_name]):
                if code in tally:
                    tally[code] += 1
                else:
                    tally[code] = 1
    return tally
def count_msc_occurences(file, records_filter = lambda x: True, field_name = "mc"):
    """Counts how often every code occurs in field_name over the admitted records.

    file - source of records, handed to zbl_io.read_zbl_records
    records_filter(record) - should return True if record is admitted
    field_name - multi-value field holding the codes (MSC codes by default)

    Returns dictionary{code_name: count}.
    """
    tally = {}
    for record in zbl_io.read_zbl_records(file):
        if records_filter(record) and field_name in record:
            for code in zbl_io.unpack_multivalue_field(record[field_name]):
                if code in tally:
                    tally[code] += 1
                else:
                    tally[code] = 1
    return tally
def map_wordsmodel_overall_weighting(fin, fout, wordsmodel, src_field="g0", dst_field="g1",\
                                     weight = lambda wordsmodel,doc_wordid2count,wordid: tf(doc_wordid2count, wordid)*idf(wordsmodel, wordid) ):
    """Maps value of src_field using wordsmodel and a weighting function; stores the result in dst_field.

    fin - source of records, handed to zbl_io.read_zbl_records
    fout - every record read is written here, each followed by a newline
    wordsmodel - model passed through to the weighting function
    src_field - field holding the packed {wordid: count} dictionary
    dst_field - field that receives the packed, sorted (wordid, weight) pairs
    weight(wordsmodel, doc_wordid2count, wordid) - returns the weight of a word (tf*idf by default)

    Returns the number of records that had src_field and were enriched.
    """
    enriched = 0
    for index, record in enumerate(zbl_io.read_zbl_records(fin)):
        if index % 100 == 0:
            logging.info("[map_wordsmodel_overall_weighting] "+str(index)+" records processed."+str(enriched)+"enriched.")
        if src_field in record:
            wordid2count = _di_(zbl_io.unpack_dictionary_field(record[src_field]))
            weighted_pairs = sorted((wordid, weight(wordsmodel, wordid2count, wordid))
                                    for wordid, _ in wordid2count.iteritems())
            record[dst_field] = zbl_io.pack_listpairs_field(weighted_pairs)
            enriched += 1
        # Records are passed through whether or not they were enriched.
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return enriched
def count_records_with_fields(fname, fields):
    """Counts how many records have all of the given fields.

    fname - path of the file to read
    fields - names of the required fields; a field counts as present when it
             is in the record and its stripped value is not the string 'null'

    Returns a tuple (number of all records, number of records having every
    field). With an empty fields list no record is counted.
    """
    total = 0
    cnt = 0
    with open(fname, 'r') as src:
        for r in read_zbl_records(src):
            total += 1
            # all() stops at the first missing field
            if fields and all(f in r and r[f].strip() != 'null' for f in fields):
                cnt += 1
    return total, cnt
def count_records_with_various_fields(fname, lfields):
    """Counts, for each list of fields in lfields, how many records have all of its fields.

    fname - path of the file to read
    lfields - list of field-name lists; a field counts as present when it is
              in the record and its stripped value is not the string 'null'

    Returns a tuple (number of all records, list of counts parallel to
    lfields). An empty field list never matches, so its count stays 0.
    """
    total = 0
    counts = len(lfields)*[0]
    with open(fname, 'r') as src:
        for r in read_zbl_records(src):
            total += 1
            for i, fields in enumerate(lfields):
                # all() stops at the first missing field
                if fields and all(f in r and r[f].strip() != 'null' for f in fields):
                    counts[i] += 1
    return total, counts
def _get_zbl_generator_(zbl_path, must_have_fields):
    """Returns zbl-records generator that has guaranteed presence of must_have_fields.

    zbl_path - path handed to zbl_io.open_file
    must_have_fields - re-iterable collection of field names; records missing
                       any of them are dropped

    Every yielded record has its zbl_io.ZBL_ID_FIELD overwritten with the
    record's position in the file (counting dropped records too).
    """
    UNI = True # flag passed to both the opener and the reader
    f = zbl_io.open_file(zbl_path, UNI)
    for ix, zbl in enumerate(zbl_io.read_zbl_records(f, UNI)):
        # all() stops at the first missing field
        if all(field in zbl for field in must_have_fields):
            zbl[zbl_io.ZBL_ID_FIELD] = ix #replacing ids with numbers for faster processing
            yield zbl
def modify_wordslist_file(fin, fout, list_of_fields, wordslist_modifier):
    """Rewrites selected fields of every record with a modified list of words.

    fin - source of records, handed to zbl_io.read_zbl_records
    fout - every record is written here, each followed by a newline
    list_of_fields - names of the fields to rewrite; fields missing in a record are skipped
    wordslist_modifier(words list) -> modified_words list (e.g. merging words into n-grams)

    When the modifier returns an empty list a warning is logged and the
    original words are kept. The words are stored back joined with single
    spaces; a field without any words becomes an empty string.
    """
    for record in zbl_io.read_zbl_records(fin):
        for field in list_of_fields:
            if field not in record:
                continue
            words = record[field].split()
            modified_words = wordslist_modifier(words)
            if len(modified_words) <= 0:
                logging.warning("Error in an="+str(record[zbl_io.ZBL_ID_FIELD])+" in field "+ str(field)+ "="+str(record[field])+". Using single words instead.")
                modified_words = words
            # join also copes with an empty list of words
            record[field] = ' '.join(modified_words)
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
def count_records_with_fields(fname, fields):
    """Counts how many records have all of the given fields.

    fname - path of the file to read
    fields - names of the required fields; a field counts as present when it
             is in the record and its stripped value is not the string 'null'

    Returns a tuple (number of all records, number of records having every
    field). With an empty fields list no record is counted.
    """
    total = 0
    cnt = 0
    with open(fname, 'r') as src:
        for r in read_zbl_records(src):
            total += 1
            # all() stops at the first missing field
            if fields and all(f in r and r[f].strip() != 'null' for f in fields):
                cnt += 1
    return total, cnt
def count_records_with_various_fields(fname, lfields):
    """Counts, for each list of fields in lfields, how many records have all of its fields.

    fname - path of the file to read
    lfields - list of field-name lists; a field counts as present when it is
              in the record and its stripped value is not the string 'null'

    Returns a tuple (number of all records, list of counts parallel to
    lfields). An empty field list never matches, so its count stays 0.
    """
    total = 0
    counts = len(lfields) * [0]
    with open(fname, 'r') as src:
        for r in read_zbl_records(src):
            total += 1
            for i, fields in enumerate(lfields):
                # all() stops at the first missing field
                if fields and all(f in r and r[f].strip() != 'null' for f in fields):
                    counts[i] += 1
    return total, counts
def modify_wordslist_file(fin, fout, list_of_fields, wordslist_modifier):
    """Rewrites selected fields of every record with a modified list of words.

    fin - source of records, handed to zbl_io.read_zbl_records
    fout - every record is written here, each followed by a newline
    list_of_fields - names of the fields to rewrite; fields missing in a record are skipped
    wordslist_modifier(words list) -> modified_words list (e.g. merging words into n-grams)

    When the modifier returns an empty list a warning is logged and the
    original words are kept. The words are stored back joined with single
    spaces; a field without any words becomes an empty string.
    """
    for record in zbl_io.read_zbl_records(fin):
        for field in list_of_fields:
            if field not in record:
                continue
            words = record[field].split()
            modified_words = wordslist_modifier(words)
            if len(modified_words) <= 0:
                logging.warning("Error in an=" + str(record[zbl_io.ZBL_ID_FIELD]) + " in field " + str(field) + "=" + str(record[field]) + ". Using single words instead.")
                modified_words = words
            # join also copes with an empty list of words
            record[field] = ' '.join(modified_words)
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
def _draw_mc_hist(path, records_filter = allow_all_filter): """Draws histogram of MSC codes occurrence in records.""" mc_counts = [] for i,record in enumerate(zbl_io.read_zbl_records(open(path))): if i%100000 == 0: print i," records considered" #progress bar if not record.has_key("mc") or not records_filter(record): continue mc = zbl_io.unpack_multivalue_field(record["mc"]) mc_counts.append(len(mc)) print len(mc_counts), " record found." print max(mc_counts), " = max" print min(mc_counts), " = min" print round(avg(mc_counts), 2), " = avg" print round(std(mc_counts), 2), " = std" n, bins, patches = plt.hist(mc_counts, bins = max(mc_counts), normed=False, alpha=0.75) plt.xlabel("Liczba kodow MSC w rekordzie") plt.ylabel("Liczba rekordow") plt.show()
def _report_af_quality_(path, records_filter = allow_all_filter): """Prints report about authors' identities quality.""" afs_len = [] afs_ok_len = [] for i,record in enumerate(zbl_io.read_zbl_records(open(path))): if i%100000 == 0: print i," records considered" #progress bar if not record.has_key("af") or not records_filter(record): continue afs = zbl_io.unpack_multivalue_field(record["af"]) afs_ok = list( af for af in afs if af!='-' ) afs_len.append(len(afs)) afs_ok_len.append(len(afs_ok)) afs_ok_frac = list( float(m)/float(l) for m,l in zip(afs_ok_len, afs_len) ) print max(afs_len), "\n", round(avg(afs_len),2), "\n", round(std(afs_len),2) print max(afs_ok_len), "\n", round(avg(afs_ok_len),2), "\n", round(std(afs_ok_len),2) print round(max(afs_ok_frac),2), "\n", round(avg(afs_ok_frac),2), "\n", round(std(afs_ok_frac),2)
def _draw_mc_hist(path, records_filter=allow_all_filter): """Draws histogram of MSC codes occurrence in records.""" mc_counts = [] for i, record in enumerate(zbl_io.read_zbl_records(open(path))): if i % 100000 == 0: print i, " records considered" #progress bar if not record.has_key("mc") or not records_filter(record): continue mc = zbl_io.unpack_multivalue_field(record["mc"]) mc_counts.append(len(mc)) print len(mc_counts), " record found." print max(mc_counts), " = max" print min(mc_counts), " = min" print round(avg(mc_counts), 2), " = avg" print round(std(mc_counts), 2), " = std" n, bins, patches = plt.hist(mc_counts, bins=max(mc_counts), normed=False, alpha=0.75) plt.xlabel("Liczba kodow MSC w rekordzie") plt.ylabel("Liczba rekordow") plt.show()
def extract_citations_graph_file(fin, fout):
    """Reads zbl records from fin and writes the citation graph to fout.

    Output lines have the form zbl_id:citation-id1,...,citation-idN.

    Returns whatever write_file_id2ids returns.
    """
    records = zbl_io.read_zbl_records(fin)
    citation_pairs = yield_citations(records)
    return write_file_id2ids(fout, citation_pairs, cast_container = set)
def _report_ci_(path, records_filter = allow_all_filter, \ ci_dst_records_filter = allow_all_filter, \ uq_id_field_name = zbl_io.ZBL_ID_FIELD): """Prints report about citations in ZBL file. records_filter(record) - should return True if record is admitted ci_dst_records_filter(record) - should return True if record that citation is pointing at is admitted uq_id_field_name - name of a field that uniquely identifies record """ #wczytywanie zbioru na ktory moga wskazywac cytowania: print "Loading ids of records that may be citation destination." dst_records_ids = set() for i, record in enumerate(zbl_io.read_zbl_records(open(path))): if i % 100000 == 0: print i, " records considered" #progress bar if record.has_key(uq_id_field_name) and ci_dst_records_filter(record): dst_records_ids.add(record[uq_id_field_name]) print "Done.", len(dst_records_ids), " records loaded." #statystyki: cis_len = [] #liczba cytowan cis_matched = [] #liczba cytowan ktore trafiaja w zadany zbior for i, record in enumerate(zbl_io.read_zbl_records(open(path))): if i % 100000 == 0: print i, " records considered" #progress bar if not record.has_key("ci") or not records_filter(record): continue cis = zbl_io.unpack_list_of_dictionaries(record["ci"]) #identyfikatory cytowan: identified_ci_ids = list(ci[uq_id_field_name] for ci in cis if ci.has_key(uq_id_field_name)) #rekordy dopsowane do cytowan i w zadanym zbiorze: filtered_matched_records = list(id for id in identified_ci_ids if id in dst_records_ids) cis_len.append(len(cis)) cis_matched.append(len(filtered_matched_records)) cis_matched_div_len = list( float(m) / float(l) for m, l in zip(cis_matched, cis_len)) print "Citation statistics (only on records with citations) [total, min avg max std]: " print "-Number of citations :", "\t", round(sum(cis_len), 0), "\t", round( min(cis_len), 0), "\t", round(avg(cis_len), 2), "\t", round(max(cis_len), 0), "\t", round(std(cis_len), 2) print "-Matching citations:", "\t", round( sum(cis_matched), 0), "\t", 
round(min(cis_matched), 0), "\t", round( avg(cis_matched), 2), "\t", round(max(cis_matched), 0), "\t", round(std(cis_matched), 2) print "-Fraction of matching citations: - ", "\t", round( min(cis_matched_div_len), 3), "\t", round(avg(cis_matched_div_len), 3), "\t", round( max(cis_matched_div_len), 3), "\t", round(std(cis_matched_div_len), 3) print "-Total Number of citations/Matching citations:", "\t", round( float(sum(cis_matched)) / sum(cis_len), 3) print "->", round(sum(cis_len),0), (max(cis_len)), round(avg(cis_len),2), round(std(cis_len),2), \ round(sum(cis_matched),0), (max(cis_matched)), round(avg(cis_matched),2), round(std(cis_matched),2), \ round(max(cis_matched_div_len),3), round(avg(cis_matched_div_len),3), round(std(cis_matched_div_len),3) cis_matched_hist = {} for i in xrange(0, max(cis_matched) + 1): cis_matched_hist[i] = sum(1 for c in cis_matched if c == i) print "Histogram:", cis_matched_hist n, bins, patches = plt.hist(sorted(cis_matched), bins=max(cis_matched), normed=False, alpha=0.75) plt.xlabel("Liczba dopasowanych cytowan") plt.ylabel("Liczba rekordow") plt.show()
sys.exit(-1) print "src=", fin try: if sys.argv[1] == sys.argv[2]: system.exit(-1) print "Paths must be different!" fout = open(sys.argv[2], "w") except: print "Argument expected: output-Zbl-file path" sys.exit(-1) print "dst=", fout print "LOADING" docs = [] for N, record in enumerate(zbl_io.read_zbl_records(open(fin))): if N % 500 == 0: print N, "read" doc = [] for field in list_of_fields: if not record.has_key(field): continue words = record[field].split() modified_words = build_mgrams(words, maxn) doc.extend(modified_words) if len(doc) > 0: docs.append(doc) print "CALC terms vs counts" term2count = {} docs_term2count = [] docs_len = [] for doc in docs: doc_term2count = {}
def extract_fv_graph_file(fin, fout, multival_field_name = "af", empty_value = "-"):
    """Reads zbl records from fin and writes to fout the graph extracted from a multi-value field.

    Output lines have the form zbl_id:id1,id2,id3.

    multival_field_name - name of the field the graph is extracted from
    empty_value - passed through to extract_fv_graph together with the field name

    Returns whatever write_file_id2ids returns.
    """
    records = zbl_io.read_zbl_records(fin)
    graph = extract_fv_graph(records, multival_field_name, empty_value, set)
    graph_pairs = graph.iteritems()
    return write_file_id2ids(fout, graph_pairs, cast_container = set)
sys.exit(-1) print "src=", fin try: if sys.argv[1]==sys.argv[2]: system.exit(-1) print "Paths must be different!" fout = open(sys.argv[2], "w") except: print "Argument expected: output-Zbl-file path" sys.exit(-1) print "dst=", fout print "LOADING" docs = [] for N, record in enumerate(zbl_io.read_zbl_records(open(fin))): if N%500==0: print N, "read" doc = [] for field in list_of_fields: if not record.has_key(field): continue words = record[field].split() modified_words = build_mgrams(words, maxn) doc.extend(modified_words) if len(doc)>0: docs.append(doc) print "CALC terms vs counts" term2count = {} docs_term2count = [] docs_len = [] for doc in docs: doc_term2count = {}
def extract_citations_graph_file(fin, fout):
    """Reads zbl records from fin and writes the citation graph to fout.

    Output lines have the form zbl_id:citation-id1,...,citation-idN.

    Returns whatever write_file_id2ids returns.
    """
    records = zbl_io.read_zbl_records(fin)
    citation_pairs = yield_citations(records)
    return write_file_id2ids(fout, citation_pairs, cast_container=set)