def copy_field(fin, fout, src_field, dst_field):
    """In every record from fin, copies the value of field src_field to field
    dst_field, then writes the record to fout."""
    for record in zbl_io.read_zbl_records(fin):
        if src_field in record:
            record[dst_field] = record[src_field]
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")

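# Usage sketch for the stream-to-stream helpers in this module: fin/fout are
# plain file objects over Pseudo-ZBL data. The file names and the "ti"/"t2"
# field names below are illustrative only, not part of the real data set:
#
#   fin = open("input.zbl", "r")
#   fout = open("output.zbl", "w")
#   copy_field(fin, fout, "ti", "t2")   # duplicate field "ti" into "t2"
#   fin.close()
#   fout.close()
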
def gensim_mapfields_dict_file(fin, fout, fields, filter_by_fields, dictionary,
                               dst_field, dbg_field_name="g_"):
    """For every record from the ZBL fin stream that has all filter_by_fields,
    the given fields are merged, mapped with the gensim dictionary and stored
    in dst_field. Returns the number of processed records."""
    logging.info("[gensim_mapfields_dict_file] filter_by_fields=" + str(filter_by_fields) +
                 " fields=" + str(fields) + " dictionary=" + str(dictionary) +
                 " fin=" + str(fin) + " dst_field=" + str(dst_field))
    id2token = dict((idx, token) for idx, token in dictionary.iteritems())  # only used for debug output
    counter = 0
    for i, record in enumerate(zbl_io.read_zbl_records(fin)):
        if i % 10000 == 0:
            logging.info("[gensim_mapfields_dict_file] " + str(i) + " records processed")
        record = gensim_mapfields_dict(record, fields, filter_by_fields, dictionary,
                                       dst_field, id2token, dbg_field_name)
        if dst_field in record:
            counter = counter + 1
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return counter

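# gensim_mapfields_dict() is defined elsewhere; the dictionary argument is a
# gensim dictionary. A minimal sketch of wiring one up (assumes the Python 2
# era gensim used here, where corpora.Dictionary exposes iteritems(); the
# token lists and field names are illustrative):
#
#   from gensim import corpora
#   texts = [["graph", "theory"], ["graph", "colouring"]]
#   dictionary = corpora.Dictionary(texts)
#   n = gensim_mapfields_dict_file(fin, fout, ["ti", "ab"], ["ti"],
#                                  dictionary, "g0")
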
def copy_file(fin, fout):
    """Copies all records from fin to fout. Returns number of all copied records."""
    counter = 0
    for record in zbl_io.read_zbl_records(fin):
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
        counter = counter + 1
    return counter

def merge_fields(fin, fout, src_fields, dst_field, separator=" "):
    """In every record from fin, merges the values of src_fields into field
    dst_field (joined with separator), then writes the record to fout."""
    for record in zbl_io.read_zbl_records(fin):
        try:
            dst_val = reduce(lambda a, b: a + separator + b,
                             (record[src_field] for src_field in src_fields
                              if src_field in record))
            record[dst_field] = dst_val
        except:
            # reduce() raises TypeError when none of src_fields is present
            print "[merge_fields] Failed merging in record an=", record[zbl_io.ZBL_ID_FIELD]
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")

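# Example (the "ti"/"ab"/"tx" field names are illustrative): for a record
# {"an": "1", "ti": "A title", "ab": "An abstract"},
# merge_fields(fin, fout, ["ti", "ab"], "tx") yields
# record["tx"] == "A title An abstract".
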
def append_not_matched_records(self, fout):
    """Appends to fout all loaded (aux) records that have never been matched
    in self.match_aux_record. Returns number of records appended."""
    counter = 0
    for rec in self.aux_zbl_recs_list:
        if rec[zbl_io.ZBL_ID_FIELD] not in self.aux_used_ids:
            zbl_io.write_zbl_record(fout, rec)
            fout.write("\n")
            counter = counter + 1
    return counter

def keep_records_ids(fin, fout, keep_ids_file):
    """Copies records from fin to fout, keeping only records whose ids are
    listed in the file keep_ids_file (a path). Returns the set of kept ids."""
    filter_ids = set(line.strip() for line in open(keep_ids_file))
    print len(filter_ids), "ids on the 'keep-ids' list"
    kept_ids = set()
    for record in zbl_io.read_zbl_records(fin):
        if record[zbl_io.ZBL_ID_FIELD] not in filter_ids:
            continue
        kept_ids.add(record[zbl_io.ZBL_ID_FIELD])
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return kept_ids

def keep_authors(fin, fout):
    """Copies all records from fin to fout. Removes all fields apart from
    an, au, ai. Returns number of all copied records."""
    counter = 0
    for record in zbl_io.read_zbl_records(fin):
        zbl_io.write_zbl_record(fout, record_keep_authors(record))
        fout.write("\n")
        counter = counter + 1
    return counter

def filter_fields_vals(fin, fout, list_of_fields,
                       text_filter=text_filter_lower_space,
                       word_predicate=def_word_predicate):
    """Copies records from fin to fout, filtering the values of every field
    on list_of_fields."""
    logging.info("[filter_fields_vals] text_filter=" + str(text_filter) +
                 " word_predicate=" + str(word_predicate))
    for record in zbl_io.read_zbl_records(fin):
        for field in list_of_fields:
            if field in record:
                try:
                    record[field] = words_filter(text_filter(record[field]), word_predicate)
                except:
                    logging.warn("Removing field in an=" + str(record[zbl_io.ZBL_ID_FIELD]) +
                                 " (is field empty?): " + field + " = " + record[field])
                    record.pop(field)
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")

def filter_records(fin, fout, bad_ids_file):
    """Copies records from fin to fout, filtering out records whose ids are
    listed in the file bad_ids_file (a path). Returns the set of skipped
    (filtered out) ids."""
    filter_ids = set(line.strip() for line in open(bad_ids_file))
    skipped_ids = set()
    for record in zbl_io.read_zbl_records(fin):
        if record[zbl_io.ZBL_ID_FIELD] in filter_ids:
            skipped_ids.add(record[zbl_io.ZBL_ID_FIELD])
            continue
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return skipped_ids

def keep_records(fin, fout, must_have_fields):
    """Copies records from fin to fout, keeping only those records that have
    all fields from the must_have_fields list. Returns the number of kept records."""
    kept_counter = 0
    for i, record in enumerate(zbl_io.read_zbl_records(fin)):
        if i % 10000 == 0:
            print "[keep_records]", i, "processed", kept_counter, "kept"
        if has_all_fields(record, must_have_fields):
            zbl_io.write_zbl_record(fout, record)
            fout.write("\n")
            kept_counter = kept_counter + 1
    return kept_counter

def filter_duplicates(fin, fout):
    """Copies records from fin to fout, filtering out records with duplicated
    ids. Returns the set of duplicated ids."""
    ids = set()
    duplicated_ids = set()
    for record in zbl_io.read_zbl_records(fin):
        rec_id = fix_id(record[zbl_io.ZBL_ID_FIELD])
        if rec_id in ids:
            duplicated_ids.add(rec_id)
            continue
        ids.add(rec_id)
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return duplicated_ids

def filter_field(fin, fout, field_name):
    """Copies records from fin to fout, keeping only the id field and
    field_name. Returns the number of records in which field_name was found."""
    counter = 0
    for record in zbl_io.read_zbl_records(fin):
        newrec = {zbl_io.ZBL_ID_FIELD: record[zbl_io.ZBL_ID_FIELD]}
        if field_name in record:
            newrec[field_name] = record[field_name]
            counter = counter + 1
        zbl_io.write_zbl_record(fout, newrec)
        fout.write("\n")
    return counter

def filter_af(fin, fout):
    """Copies records from fin to fout, removing af fields that are empty
    (hold only "-" values). Returns the number of removed fields."""
    counter = 0
    for record in zbl_io.read_zbl_records(fin):
        if "af" in record:
            af = zbl_io.unpack_multivalue_field(record["af"])
            empty = all(a == '-' for a in af)
            if empty:
                record.pop("af")
                counter = counter + 1
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return counter

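# Example, assuming zbl_io.unpack_multivalue_field splits a multivalue af
# string into a list of per-author entries: an af that unpacks to
# ['-', '-'] counts as empty and is dropped, while one that unpacks to
# ['Smith, J.', '-'] is kept.
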
def calc_msc_membership(fin, fout, known_msc_codes,
                        src_field='mc', dst_field='m0',
                        re_leaf_pattern=msc_processing.MSC_ORDINARY_LEAF_PATTERN_RE,
                        dbg_field='m_'):
    """Updates records with an additional dst_field holding a membership
    vector calculated from the MSC codes read from src_field."""
    msccodes = filter_msccodes(known_msc_codes, re_leaf_pattern)
    msc2ix = calc_msc2ix(msccodes)
    ix2msc = dict((ix, msc) for msc, ix in msc2ix.iteritems())
    prefix2msc = group_by_prefix(msccodes)
    counter = 0
    for i, record in enumerate(zbl_io.read_zbl_records(fin)):
        if i % 10000 == 0:
            logging.info("[calc_msc_membership] " + str(i) + " records processed. " +
                         str(counter) + " updated.")
        if src_field in record:
            record_msccodes = zbl_io.unpack_multivalue_field(record[src_field])
            record_msccodes = filter_msccodes(record_msccodes, re_leaf_pattern)
            # consider only codes that share a prefix with the record's codes
            compared_codes = set()
            for record_msccode in record_msccodes:
                prefix2 = record_msccode[:2]
                prefix3 = record_msccode[:3]
                compared_codes.update(prefix2msc[prefix2])
                compared_codes.update(prefix2msc[prefix3])
            mscmembership = []
            for compared_code in compared_codes:
                membership = msccode_membership(record_msccodes, compared_code)
                mscmembership.append((msc2ix[compared_code], membership))
            if len(mscmembership) > 0:
                # store the results
                mscmembership = sorted(set(mscmembership))
                record[dst_field] = zbl_io.pack_listpairs_field(mscmembership)
                record[dbg_field] = zbl_io.pack_listpairs_field(
                    [(ix2msc[ix], m) for ix, m in mscmembership])
                counter = counter + 1
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return counter

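# Worked example: MSC codes such as "05C38" are two digits, a letter and two
# digits, so prefix2 == "05" and prefix3 == "05C". For a record whose mc
# lists only "05C38", compared_codes collects every known code grouped under
# the "05" and "05C" prefixes, each is scored by msccode_membership()
# (defined elsewhere) against the record's codes, and the resulting
# (code-index, membership) pairs are packed into dst_field.
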
def gensim_mapfield_model(fin, fout, model, src_field, dst_field,
                          src_field_value_extractor=extract_bag_of_ids,
                          dst_field_value_builder=zbl_io.pack_listpairs_field):
    """For every record from the ZBL fin stream that has src_field, interprets
    its content as a gensim bag-of-ids (with weights/counts/values) and
    transforms it with model, storing the result in dst_field.
    Returns the number of enriched records."""
    logging.info("[gensim_mapfield_model] src_field=" + str(src_field) +
                 " model=" + str(model) + " fin=" + str(fin) +
                 " dst_field=" + str(dst_field) +
                 " src_field_value_extractor=" + str(src_field_value_extractor) +
                 " dst_field_value_builder=" + str(dst_field_value_builder))
    counter = 0
    for i, record in enumerate(zbl_io.read_zbl_records(fin)):
        if i % 10000 == 0:
            logging.info("[gensim_mapfield_model] " + str(i) + " documents mapped...")
        if src_field in record:
            bag_of_ids = src_field_value_extractor(record[src_field])
            tfidf_values = model[bag_of_ids]
            record[dst_field] = dst_field_value_builder(tfidf_values)
            logging.debug("[gensim_mapfield_model] " + record[src_field] + " -> " + record[dst_field])
            counter = counter + 1
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return counter

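# Usage sketch: any gensim transformation supporting model[bag_of_ids] fits
# here, e.g. a TF-IDF model (Python 2 era gensim; the "w0"/"g0" field names
# are illustrative):
#
#   from gensim import models
#   tfidf = models.TfidfModel(corpus)   # corpus: iterable of bags-of-ids
#   n = gensim_mapfield_model(fin, fout, tfidf, "w0", "g0")
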
print "-----------------------------------" zbl_matcher.print_py_report() print "-----------------------------------" ###################################################################################################### #mix records from two files: start_time = time.clock() for main_zbl_record in zbl_io.read_zbl_records(fmain): aux_zbl_record = zbl_matcher.match_aux_record(main_zbl_record, only_fast_match_methods) if not aux_zbl_record is None: #mix two records: main_zbl_record = update_zbl_record(main_zbl_record, aux_zbl_record, forced_fields) #write results: zbl_io.write_zbl_record(fout, main_zbl_record) fout.write("\n") #progress bar: main_counter = zbl_matcher.total_processed_records_num() matched_counter = zbl_matcher.total_matched_records_num() if main_counter%10000 == 0: print (time.clock() - start_time),"s - ",main_counter, "processed,",matched_counter,"matched" ###################################################################################################### if append_not_matched_records_flag: print zbl_matcher.append_not_matched_records(fout), " appended not matched records..." fmain.close() fout.close()
def add_field(fin, fout, add_field_name, add_field_value):
    """Adds field add_field_name with value add_field_value to every record
    from fin and writes the record to fout."""
    for record in zbl_io.read_zbl_records(fin):
        record[add_field_name] = add_field_value
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")

try:
    mapping_path = sys.argv[1]
except:
    print "First argument expected: mapping-file-path (every line in format: src-id dst-id)"
    sys.exit(-1)
try:
    in_path = sys.argv[2]
except:
    print "Second argument expected: input-zbl-file-path (Pseudo-ZBL)"
    sys.exit(-1)
try:
    out_path = sys.argv[3]
except:
    print "Third argument expected: output-zbl-file-path (Pseudo-ZBL)"
    sys.exit(-1)

print "mapping_path =", mapping_path
print "in_path =", in_path
print "out_path =", out_path

id_mapper = ZblIdMapper(mapping_path, False)
fin = open(in_path, 'r')
fout = open(out_path, 'w')
for record in zbl_io.read_zbl_records(fin):
    zbl_io.write_zbl_record(fout, id_mapper.update_record(record))
    fout.write("\n")
fin.close()
fout.close()
id_mapper.print_stats()

print "Second argument expected: output-zbl-file-path (Pseudo-ZBL)" sys.exit(-1) print "src = ", main_zbl_path print "dst = ", out_path cimatch = CitationMatcher(main_zbl_path) fout = open(out_path, "w") main_counter = 0 start_time = time.clock() for record in zbl_io.read_zbl_records( open(main_zbl_path, 'r') ): #update citations: if record.has_key("ci"): cis = zbl_io.unpack_list_of_dictionaries(record["ci"]) for ci in cis: ci = cimatch.add_citation_identity(ci) record["ci"] = zbl_io.pack_list_of_dictionaries(cis) #write output: zbl_io.write_zbl_record(fout, record) fout.write("\n") #progress bar: if main_counter%10000 == 0: print (time.clock() - start_time),"s - ",main_counter, "processed,", (cimatch.matched),"matched",(cimatch.missed),"missed" main_counter = main_counter + 1 fout.close() print "missed=",(cimatch.missed) print "matched=",(cimatch.matched)