def append_file(fin, fout, fappend):
    """Copies all records from fin and fappend to fout.
    Returns number of all copied records."""
    total = 0
    #drain both input streams, in order, into the same output stream
    for src in (fin, fappend):
        for rec in zbl_io.read_zbl_records(src):
            zbl_io.write_zbl_record(fout, rec)
            fout.write("\n")
            total += 1
    return total
def gensim_mapfields_dict_file(fin, fout, fields, filter_by_fields, dictionary, dst_field, dbg_field_name="g_"):
    """For every records from ZBL-fin-stream that have filter_by_fields fields are merged,
    mapped with gensim dictionary and stored in dst-field. Returns number of processed records.

    fields -- fields whose values are merged and mapped (passed to gensim_mapfields_dict)
    filter_by_fields -- records must have these fields to be mapped
    dictionary -- gensim-style dictionary supporting iteritems() over (id, token)
    dst_field -- field that receives the mapped representation
    dbg_field_name -- debug-field name passed through to gensim_mapfields_dict
    """
    logging.info("[gensim_mapfields_dict_file] filter_by_fields="+str(filter_by_fields)+\
                 " fields="+str(fields)+" dictionary="+str(dictionary)+" fin="+str(fin)+" dst_field="+str(dst_field))
    #reverse mapping id -> token, used only for the debug field
    id2token = dict( (idx, token) for idx, token in dictionary.iteritems()) #this-line is for debugging purposes
    counter = 0
    for i, record in enumerate(zbl_io.read_zbl_records(fin)):
        if i % 10000 == 0:
            logging.info("[gensim_mapfields_dict_file] " + str(i) + " records processed")
        record = gensim_mapfields_dict(record, fields, filter_by_fields, dictionary, dst_field, id2token, dbg_field_name)
        #dst_field present after mapping => the record was actually processed
        if dst_field in record:
            counter = counter + 1
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return counter
def copy_field(fin, fout, src_field, dst_field):
    """In every record from fin copies field src_field to field dst_field
    and stores record to fout."""
    for rec in zbl_io.read_zbl_records(fin):
        #duplicate the value only when the source field exists
        if src_field in rec:
            rec[dst_field] = rec[src_field]
        zbl_io.write_zbl_record(fout, rec)
        fout.write("\n")
def add_citation_identity(self, ci, only_fast_match_methods=True):
    """According to records in ZBL file (self.main_zbl_path) and id-maps
    (self.mr_to_id_map, self.zbl_to_id_map) tries to assign identity (<an> field)
    to citation (given as a dictionary).

    Returns ci, possibly enriched with zbl_io.ZBL_ID_FIELD.
    Updates self.matched / self.missed counters as a side effect.
    """
    #fast path: try matching on explicit ids first
    self.__match_identity_on_id__(ci)
    if ci.has_key(zbl_io.ZBL_ID_FIELD):
        self.matched = self.matched + 1
        return ci
    elif only_fast_match_methods:
        self.missed = self.missed + 1
        return ci
    #slow path: scan the whole main ZBL file for similar records
    candidates = []
    f = open(self.main_zbl_path, 'r')
    try:
        for record in zbl_io.read_zbl_records(f):
            #skip records with mismatching publication year (when both are known)
            if ci.has_key("py") and record.has_key("py"):
                if ci["py"] != record["py"]:
                    continue
            if self.similarity_operator(record, ci):
                #BUGFIX: was candidates.append(aux_zbl_record) - an undefined name
                #that raised NameError on the first similarity hit
                candidates.append(record)
    finally:
        f.close()  #close even if reading/similarity raises
    if len(candidates) == 0:
        self.missed = self.missed + 1
        return ci
    matching_record = zbl_similarity.select_best_fitting_record(
        ci, candidates, self.selection_fields)
    ci[zbl_io.ZBL_ID_FIELD] = matching_record[zbl_io.ZBL_ID_FIELD]
    self.matched = self.matched + 1
    return ci
def add_citation_identity(self, ci, only_fast_match_methods = True):
    """According to records in ZBL file (self.main_zbl_path) and id-maps
    (self.mr_to_id_map, self.zbl_to_id_map) tries to assign identity (<an> field)
    to citation (given as a dictionary).

    Returns ci, possibly enriched with zbl_io.ZBL_ID_FIELD.
    Updates self.matched / self.missed counters as a side effect.
    """
    #fast path: try matching on explicit ids first
    self.__match_identity_on_id__(ci)
    if ci.has_key(zbl_io.ZBL_ID_FIELD):
        self.matched = self.matched + 1
        return ci
    elif only_fast_match_methods:
        self.missed = self.missed + 1
        return ci
    #slow path: scan the whole main ZBL file for similar records
    candidates = []
    f = open(self.main_zbl_path, 'r')
    try:
        for record in zbl_io.read_zbl_records(f):
            #skip records with mismatching publication year (when both are known)
            if ci.has_key("py") and record.has_key("py"):
                if ci["py"] != record["py"]:
                    continue
            if self.similarity_operator(record, ci):
                #BUGFIX: was candidates.append(aux_zbl_record) - an undefined name
                #that raised NameError on the first similarity hit
                candidates.append(record)
    finally:
        f.close()  #close even if reading/similarity raises
    if len(candidates) == 0:
        self.missed = self.missed + 1
        return ci
    matching_record = zbl_similarity.select_best_fitting_record(ci, candidates, self.selection_fields)
    ci[zbl_io.ZBL_ID_FIELD] = matching_record[zbl_io.ZBL_ID_FIELD]
    self.matched = self.matched + 1
    return ci
def merge_fields(fin, fout, src_fields, dst_field, separator = " "): """In every record from fin merges fields from src_field to field dst_field and stores record to fout.""" for record in zbl_io.read_zbl_records(fin): try: dst_val = reduce(lambda a,b: a+separator+b, (record[src_field] for src_field in src_fields if src_field in record) ) record[dst_field] = dst_val except: print "[merge_fields] Failed merging in record an=", record[zbl_io.ZBL_ID_FIELD] zbl_io.write_zbl_record(fout, record) fout.write("\n")
def extract_field_value(fin, fout, field_name):
    """Extracts to fout value of a field of field_name.
    Returns number of found fields."""
    found = 0
    for rec in zbl_io.read_zbl_records(fin):
        if field_name in rec:
            #one value per output line
            fout.write(str(rec[field_name]))
            fout.write("\n")
            found += 1
    return found
def filter_records(fin, fout, bad_ids_file):
    """Copies records from fin to fout.
    Filters out records of ids contained in file bad_ids_file (path).
    Returns set of skipped (filtered out) ids."""
    #read the blacklist up front; BUGFIX: the file handle was previously leaked
    #(open(...).xreadlines() with no close)
    ids_f = open(bad_ids_file)
    try:
        filter_ids = set(line.strip() for line in ids_f)
    finally:
        ids_f.close()
    skipped_ids = set()
    for record in zbl_io.read_zbl_records(fin):
        if record[zbl_io.ZBL_ID_FIELD] in filter_ids:
            skipped_ids.add(record[zbl_io.ZBL_ID_FIELD])
            continue  #blacklisted -> drop
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return skipped_ids
def keep_authors(fin, fout):
    """Copies all records from fin to fout. Removes all fields apart from an, au, ai.
    Returns number of all copied records."""
    copied = 0
    for rec in zbl_io.read_zbl_records(fin):
        #record_keep_authors strips the record down to the author-related fields
        zbl_io.write_zbl_record(fout, record_keep_authors(rec))
        fout.write("\n")
        copied += 1
    return copied
def keep_records_ids(fin, fout, keep_ids_file): """Copies records from fin to fout. Keeps only those records of ids contained in file keep_ids_file (path). Returns list of kept ids.""" filter_ids = set(line.strip() for line in open(keep_ids_file).xreadlines()) print len(filter_ids), " on the 'keep-ids' list" kept_ids = set() for record in zbl_io.read_zbl_records(fin): if not record[zbl_io.ZBL_ID_FIELD] in filter_ids: continue kept_ids.add(record[zbl_io.ZBL_ID_FIELD]) zbl_io.write_zbl_record(fout, record) fout.write("\n") return kept_ids
def keep_records_ids(fin, fout, keep_ids_file): """Copies records from fin to fout. Keeps only those records of ids contained in file keep_ids_file (path). Returns list of kept ids.""" filter_ids = set(line.strip() for line in open(keep_ids_file).xreadlines()) print len(filter_ids)," on the 'keep-ids' list" kept_ids = set() for record in zbl_io.read_zbl_records(fin): if not record[zbl_io.ZBL_ID_FIELD] in filter_ids: continue kept_ids.add(record[zbl_io.ZBL_ID_FIELD]) zbl_io.write_zbl_record(fout, record) fout.write("\n") return kept_ids
def keep_records(fin, fout, must_have_fields): """Copies records from fin to fout. Keeps only these records that have all fields from must_have_fields list. """ kept_counter = 0 for i,record in enumerate(zbl_io.read_zbl_records(fin)): if i%10000 == 0: print "[keep_records]", i,"processed", kept_counter, "kept" if has_all_fields(record, must_have_fields): zbl_io.write_zbl_record(fout, record) fout.write("\n") kept_counter = kept_counter + 1 return kept_counter
def filter_fields_vals(fin, fout, list_of_fields, text_filter = text_filter_lower_space, word_predicate = def_word_predicate):
    """Copies records from fin to fout and for fields on list_of_fields filters its' values.

    text_filter -- callable normalizing a field's raw text before word filtering
    word_predicate -- callable passed to words_filter to decide which words survive
    Fields whose filtering raises are removed from the record entirely.
    """
    logging.info("[filter_fields_vals] text_filter="+str(text_filter)+" word_predicate="+str(word_predicate))
    for record in zbl_io.read_zbl_records(fin):
        for field in list_of_fields:
            if record.has_key(field):
                try:
                    record[field] = words_filter(text_filter(record[field]), word_predicate)
                except:
                    #filtering failed (possibly an empty value) -> drop the whole field
                    logging.warn("Removing field in an="+str(record[zbl_io.ZBL_ID_FIELD])+" (is field empty?):"+field+" = "+record[field])
                    record.pop(field)
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
def merge_fields(fin, fout, src_fields, dst_field, separator=" "): """In every record from fin merges fields from src_field to field dst_field and stores record to fout.""" for record in zbl_io.read_zbl_records(fin): try: dst_val = reduce( lambda a, b: a + separator + b, (record[src_field] for src_field in src_fields if src_field in record)) record[dst_field] = dst_val except: print "[merge_fields] Failed merging in record an=", record[ zbl_io.ZBL_ID_FIELD] zbl_io.write_zbl_record(fout, record) fout.write("\n")
def calc_msc2count(fin, src_field='mc'):
    """Returns msc2counts dictionary (MSC code -> number of occurrences in fin)."""
    counts = {}
    for ix, rec in enumerate(zbl_io.read_zbl_records(fin)):
        if ix % 10000 == 0:
            logging.info("[calc_msc_model] "+str(ix)+" records processed")
        if src_field not in rec:
            continue
        for code in zbl_io.unpack_multivalue_field(rec[src_field]):
            counts[code] = counts.get(code, 0) + 1
    return counts
def keep_records(fin, fout, must_have_fields): """Copies records from fin to fout. Keeps only these records that have all fields from must_have_fields list. """ kept_counter = 0 for i, record in enumerate(zbl_io.read_zbl_records(fin)): if i % 10000 == 0: print "[keep_records]", i, "processed", kept_counter, "kept" if has_all_fields(record, must_have_fields): zbl_io.write_zbl_record(fout, record) fout.write("\n") kept_counter = kept_counter + 1 return kept_counter
def gen_record(fname, filtered_by, uni=False):
    """Yields records that contain all fields specified in filtered_by.

    fname -- an already-open file object, or a path to open
    filtered_by -- iterable of required field names
    uni -- when True (and fname is a path) the file is opened as utf-8
    """
    if isinstance(fname, file):  #idiom fix: was type(fname) == file
        ff = fname
    elif uni:
        ff = codecs.open(fname, "r", encoding="utf-8")
    else:
        ff = open(fname, "r")
    for r in read_zbl_records(ff, uni):
        #BUGFIX/robustness: reduce(lambda x,y: x and y, map(...)) raised
        #TypeError for an empty filtered_by; all() yields every record instead
        if all(f in r for f in filtered_by):
            yield r
def gen_record(fname, filtered_by, uni=False):
    """Yields records that contain all fields specified in filtered_by.

    fname -- an already-open file object, or a path to open
    filtered_by -- iterable of required field names
    uni -- when True (and fname is a path) the file is opened as utf-8
    """
    if isinstance(fname, file):  #idiom fix: was type(fname) == file
        ff = fname
    elif uni:
        ff = codecs.open(fname, 'r', encoding='utf-8')
    else:
        ff = open(fname, 'r')
    for r in read_zbl_records(ff, uni):
        #BUGFIX/robustness: reduce(lambda x,y: x and y, map(...)) raised
        #TypeError for an empty filtered_by; all() yields every record instead
        if all(f in r for f in filtered_by):
            yield r
def filter_duplicates(fin, fout):
    """Copies records from fin to fout. Records with duplicated id are filtered out.
    Returns set of duplicated ids."""
    seen = set()
    duplicated = set()
    for rec in zbl_io.read_zbl_records(fin):
        rid = fix_id(rec[zbl_io.ZBL_ID_FIELD])
        if rid in seen:
            duplicated.add(rid)  #already copied once - skip this occurrence
        else:
            seen.add(rid)
            zbl_io.write_zbl_record(fout, rec)
            fout.write("\n")
    return duplicated
def filter_field(fin, fout, field_name):
    """Copies records from fin to fout but keeping only id and field of field_name.
    Returns number of found fields."""
    found = 0
    for rec in zbl_io.read_zbl_records(fin):
        #slim record: id always, the requested field when present
        slim = {zbl_io.ZBL_ID_FIELD: rec[zbl_io.ZBL_ID_FIELD]}
        if field_name in rec:
            slim[field_name] = rec[field_name]
            found += 1
        zbl_io.write_zbl_record(fout, slim)
        fout.write("\n")
    return found
def calc_msc2count(fin, src_field='mc'):
    """Returns msc2counts dictionary (MSC code -> number of occurrences in fin)."""
    counts = {}
    for ix, rec in enumerate(zbl_io.read_zbl_records(fin)):
        if ix % 10000 == 0:
            logging.info("[calc_msc_model] " + str(ix) + " records processed")
        if src_field not in rec:
            continue
        for code in zbl_io.unpack_multivalue_field(rec[src_field]):
            counts[code] = counts.get(code, 0) + 1
    return counts
def filter_af(fin, fout):
    """Copies records from fin to fout but also removes from records empty
    (only "-" values) af fields. Returns number of removed fields."""
    removed = 0
    for rec in zbl_io.read_zbl_records(fin):
        if "af" in rec:
            values = zbl_io.unpack_multivalue_field(rec["af"])
            #field counts as empty when every packed value is the placeholder "-"
            if all(v == '-' for v in values):
                rec.pop("af")
                removed += 1
        zbl_io.write_zbl_record(fout, rec)
        fout.write("\n")
    return removed
def gensim_mapfields_dict_file(fin, fout, fields, filter_by_fields, dictionary, dst_field, dbg_field_name = "g_"):
    """For every records from ZBL-fin-stream that have filter_by_fields fields are merged,
    mapped with gensim dictionary and stored in dst-field. Returns number of processed records.

    fields -- fields whose values are merged and mapped (passed to gensim_mapfields_dict)
    filter_by_fields -- records must have these fields to be mapped
    dictionary -- gensim-style dictionary supporting iteritems() over (id, token)
    dst_field -- field that receives the mapped representation
    dbg_field_name -- debug-field name passed through to gensim_mapfields_dict
    """
    logging.info("[gensim_mapfields_dict_file] filter_by_fields="+str(filter_by_fields)+\
                 " fields="+str(fields)+" dictionary="+str(dictionary)+" fin="+str(fin)+" dst_field="+str(dst_field))
    #reverse mapping id -> token, used only for the debug field
    id2token = dict( (idx,token) for idx,token in dictionary.iteritems() ) #this-line is for debugging purposes
    counter = 0
    for i,record in enumerate(zbl_io.read_zbl_records(fin)):
        if i%10000 == 0:
            logging.info("[gensim_mapfields_dict_file] "+str(i)+" records processed")
        record = gensim_mapfields_dict(record, fields, filter_by_fields, dictionary, dst_field, id2token, dbg_field_name)
        #dst_field present after mapping => the record was actually processed
        if dst_field in record:
            counter = counter + 1
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return counter
def calc_msc_membership(fin, fout, known_msc_codes, \
                        src_field='mc', dst_field='m0', \
                        re_leaf_pattern=msc_processing.MSC_ORDINARY_LEAF_PATTERN_RE, dbg_field='m_'):
    """Updates records with additional dst_field with membership vector calculated
    basing on msc codes read from src_field.

    known_msc_codes -- universe of MSC codes considered for membership
    src_field -- field holding the record's own (packed) MSC codes
    dst_field -- receives packed list of (code-index, membership) pairs
    dbg_field -- receives the same pairs keyed by code text (debugging aid)
    Returns number of updated records."""
    #restrict the known universe to codes matching the leaf pattern
    msccodes = filter_msccodes(known_msc_codes, re_leaf_pattern)
    msc2ix = calc_msc2ix(msccodes)
    ix2msc = dict((ix, msc) for msc, ix in msc2ix.iteritems())
    prefix2msc = group_by_prefix(msccodes)
    counter = 0
    for i, record in enumerate(zbl_io.read_zbl_records(fin)):
        if i % 10000 == 0:
            logging.info("[calc_msc_membership] " + str(i) + " records processed. " + str(counter) + "updated.")
        if src_field in record:
            record_msccodes = zbl_io.unpack_multivalue_field(record[src_field])
            record_msccodes = filter_msccodes(record_msccodes, re_leaf_pattern)
            #we only compare against codes sharing a 2- or 3-char prefix with the record's codes
            compared_codes = set(
            )  #patrzymy po tych ktore maja zgodne fragmenty prefiksow -> see note above
            for record_msccode in record_msccodes:
                prefix2 = record_msccode[:2]
                prefix3 = record_msccode[:3]
                compared_codes.update(prefix2msc[prefix2])
                compared_codes.update(prefix2msc[prefix3])
            mscmembership = []
            for compared_code in compared_codes:
                membership = msccode_membership(record_msccodes, compared_code)
                mscmembership.append((msc2ix[compared_code], membership))
            if len(mscmembership) > 0:  #store the results (was: zapsiujemy wyniki)
                mscmembership = sorted(set(mscmembership))
                record[dst_field] = zbl_io.pack_listpairs_field(mscmembership)
                record[dbg_field] = zbl_io.pack_listpairs_field([
                    (ix2msc[ix], m) for ix, m in mscmembership
                ])
                counter = counter + 1
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return counter
def gensim_mapfield_model(fin, fout, model, src_field, dst_field,\
                          src_field_value_extractor=extract_bag_of_ids, dst_field_value_builder=zbl_io.pack_listpairs_field):
    """For every records from ZBL-fin-stream that have src_field its content is interpreted as
    gensim-bag-of-ids(with weights/counts/values) and transformed using model
    (results are stored into dst_field). Returns number of enriched records.

    model -- object supporting model[bag_of_ids] (gensim-style transformation)
    src_field_value_extractor -- parses src_field text into a bag-of-ids
    dst_field_value_builder -- serializes the model output back into field text
    """
    logging.info("[gensim_mapfield_model] src_field="+str(src_field)+\
                 " model="+str(model)+" fin="+str(fin)+" dst_field="+str(dst_field)+\
                 " src_field_value_extractor="+str(src_field_value_extractor)+" dst_field_value_builder="+str(dst_field_value_builder))
    counter = 0
    for i,record in enumerate(zbl_io.read_zbl_records(fin)):
        if i%10000 == 0:
            logging.info("[gensim_mapfield_model] "+str(i)+" documents mapped...")
        if src_field in record:
            bag_of_ids = src_field_value_extractor(record[src_field])
            tfidf_values = model[bag_of_ids]  #apply the model transformation
            record[dst_field] = dst_field_value_builder(tfidf_values)
            logging.debug("[gensim_mapfield_model]"+record[src_field]+" -> "+record[dst_field])
            counter = counter + 1
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return counter
def filter_fields_vals(fin, fout, list_of_fields, text_filter=text_filter_lower_space, word_predicate=def_word_predicate):
    """Copies records from fin to fout and for fields on list_of_fields filters its' values.

    text_filter -- callable normalizing a field's raw text before word filtering
    word_predicate -- callable passed to words_filter to decide which words survive
    Fields whose filtering raises are removed from the record entirely.
    """
    logging.info("[filter_fields_vals] text_filter=" + str(text_filter) +
                 " word_predicate=" + str(word_predicate))
    for record in zbl_io.read_zbl_records(fin):
        for field in list_of_fields:
            if record.has_key(field):
                try:
                    record[field] = words_filter(text_filter(record[field]), word_predicate)
                except:
                    #filtering failed (possibly an empty value) -> drop the whole field
                    logging.warn("Removing field in an=" + str(record[zbl_io.ZBL_ID_FIELD]) +
                                 " (is field empty?):" + field + " = " + record[field])
                    record.pop(field)
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
def calc_msc_membership(fin, fout, known_msc_codes, \
                        src_field='mc', dst_field='m0', \
                        re_leaf_pattern=msc_processing.MSC_ORDINARY_LEAF_PATTERN_RE, dbg_field='m_'):
    """Updates records with additional dst_field with membership vector calculated
    basing on msc codes read from src_field.

    known_msc_codes -- universe of MSC codes considered for membership
    src_field -- field holding the record's own (packed) MSC codes
    dst_field -- receives packed list of (code-index, membership) pairs
    dbg_field -- receives the same pairs keyed by code text (debugging aid)
    Returns number of updated records."""
    #restrict the known universe to codes matching the leaf pattern
    msccodes = filter_msccodes(known_msc_codes, re_leaf_pattern)
    msc2ix = calc_msc2ix(msccodes)
    ix2msc = dict((ix,msc) for msc,ix in msc2ix.iteritems())
    prefix2msc = group_by_prefix(msccodes)
    counter = 0;
    for i,record in enumerate(zbl_io.read_zbl_records(fin)):
        if i%10000 == 0:
            logging.info("[calc_msc_membership] "+str(i)+" records processed. "+str(counter)+"updated.")
        if src_field in record:
            record_msccodes = zbl_io.unpack_multivalue_field(record[src_field])
            record_msccodes = filter_msccodes(record_msccodes, re_leaf_pattern)
            #we only compare against codes sharing a 2- or 3-char prefix with the record's codes
            compared_codes = set() #patrzymy po tych ktore maja zgodne fragmenty prefiksow -> see note above
            for record_msccode in record_msccodes:
                prefix2 = record_msccode[:2]
                prefix3 = record_msccode[:3]
                compared_codes.update( prefix2msc[prefix2] )
                compared_codes.update( prefix2msc[prefix3] )
            mscmembership = []
            for compared_code in compared_codes:
                membership = msccode_membership(record_msccodes, compared_code)
                mscmembership.append( (msc2ix[compared_code],membership) )
            if len(mscmembership) > 0: #store the results (was: zapsiujemy wyniki)
                mscmembership = sorted(set(mscmembership))
                record[dst_field] = zbl_io.pack_listpairs_field(mscmembership)
                record[dbg_field] = zbl_io.pack_listpairs_field([(ix2msc[ix],m) for ix,m in mscmembership])
                counter = counter + 1
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return counter
def gensim_mapfield_model(fin, fout, model, src_field, dst_field,\
                          src_field_value_extractor=extract_bag_of_ids, dst_field_value_builder=zbl_io.pack_listpairs_field):
    """For every records from ZBL-fin-stream that have src_field its content is interpreted as
    gensim-bag-of-ids(with weights/counts/values) and transformed using model
    (results are stored into dst_field). Returns number of enriched records.

    model -- object supporting model[bag_of_ids] (gensim-style transformation)
    src_field_value_extractor -- parses src_field text into a bag-of-ids
    dst_field_value_builder -- serializes the model output back into field text
    """
    logging.info("[gensim_mapfield_model] src_field="+str(src_field)+\
                 " model="+str(model)+" fin="+str(fin)+" dst_field="+str(dst_field)+\
                 " src_field_value_extractor="+str(src_field_value_extractor)+" dst_field_value_builder="+str(dst_field_value_builder))
    counter = 0
    for i, record in enumerate(zbl_io.read_zbl_records(fin)):
        if i % 10000 == 0:
            logging.info("[gensim_mapfield_model] " + str(i) + " documents mapped...")
        if src_field in record:
            bag_of_ids = src_field_value_extractor(record[src_field])
            tfidf_values = model[bag_of_ids]  #apply the model transformation
            record[dst_field] = dst_field_value_builder(tfidf_values)
            logging.debug("[gensim_mapfield_model]" + record[src_field] + " -> " + record[dst_field])
            counter = counter + 1
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return counter
        #NOTE(review): this chunk starts mid-function (the def line of count_categories
        #and its record loop are above this view) - indentation reconstructed, confirm.
        ccs = filter_categories( cc.strip() for cc in ccstr.split(' ') )
        for cc in ccs:
            if count.has_key(cc):
                count[cc] = count[cc] + 1
            else:
                count[cc] = 1
    return count

if __name__ == "__main__":
    args = sys.argv
    if len(sys.argv) != 3:
        print "[ERROR] Exactly two arguments are expected: input-zbl-file-path output-count-path-prefix"
        exit(-1)
    zblInPath = args[1]
    statsOutPath = args[2]
    #counting (was: zliczenie)
    zbl_src = zbl_io.read_zbl_records(open(zblInPath))
    count = count_categories(zbl_src, filter_XXY_categories)
    #write to file (was: zapis do pliku)
    #f = open(statsOutPath, "w")
    #for cc in count:
    #    f.write(cc+" "+str(count[cc])+"\n")
    #f.close()
    io.fwrite_vector(statsOutPath+"_labels.svector", count.keys())
    io.fwrite_vector(statsOutPath+"_count.ivector", count.values())
    #NOTE(review): chunk starts mid-script, inside the except-branch of the
    #first-argument parsing try block - indentation reconstructed, confirm.
    sys.exit(-1)
try:
    out_path = sys.argv[2]
except:
    print "Second argument expected: output-zbl-file-path (Pseudo-ZBL)"
    sys.exit(-1)

print "src = ", main_zbl_path
print "dst = ", out_path

#assign citation identities for every record of the main ZBL file
cimatch = CitationMatcher(main_zbl_path)
fout = open(out_path, "w")
main_counter = 0
start_time = time.clock()
for record in zbl_io.read_zbl_records( open(main_zbl_path, 'r') ):
    #update citations:
    if record.has_key("ci"):
        cis = zbl_io.unpack_list_of_dictionaries(record["ci"])
        for ci in cis:
            ci = cimatch.add_citation_identity(ci)
        record["ci"] = zbl_io.pack_list_of_dictionaries(cis)
    #write output:
    zbl_io.write_zbl_record(fout, record)
    fout.write("\n")
    #progress bar:
    if main_counter%10000 == 0:
        print (time.clock() - start_time),"s - ",main_counter, "processed,", (cimatch.matched),"matched",(cimatch.missed),"missed"
    main_counter = main_counter + 1
fout.close()
print len(zbl_matcher.aux_zbl_recs_list), "zbl records loaded."
print "Opening main file =", main_zbl_path
fmain = open(main_zbl_path, 'r')
print "Opening output file =", out_path
fout = open(out_path, 'w')
print "-----------------------------------"
zbl_matcher.print_py_report()
print "-----------------------------------"
######################################################################################################
#mix records from two files:
start_time = time.clock()
for main_zbl_record in zbl_io.read_zbl_records(fmain):
    #try to find the matching record from the auxiliary file
    aux_zbl_record = zbl_matcher.match_aux_record(main_zbl_record, only_fast_match_methods)
    if not aux_zbl_record is None:
        #mix two records:
        main_zbl_record = update_zbl_record(main_zbl_record, aux_zbl_record, forced_fields)
    #write results:
    zbl_io.write_zbl_record(fout, main_zbl_record)
    fout.write("\n")
    #progress bar:
    main_counter = zbl_matcher.total_processed_records_num()
    matched_counter = zbl_matcher.total_matched_records_num()
    #NOTE(review): the chunk ends on the next line - the progress print that
    #forms this if's body continues past this view
    if main_counter % 10000 == 0:
print len(zbl_matcher.aux_zbl_recs_list), "zbl records loaded."
print "Opening main file =", main_zbl_path
fmain = open(main_zbl_path, 'r')
print "Opening output file =", out_path
fout = open(out_path, 'w')
print "-----------------------------------"
zbl_matcher.print_py_report()
print "-----------------------------------"
######################################################################################################
#mix records from two files:
start_time = time.clock()
for main_zbl_record in zbl_io.read_zbl_records(fmain):
    #try to find the matching record from the auxiliary file
    aux_zbl_record = zbl_matcher.match_aux_record(main_zbl_record, only_fast_match_methods)
    if not aux_zbl_record is None:
        #mix two records:
        main_zbl_record = update_zbl_record(main_zbl_record, aux_zbl_record, forced_fields)
    #write results:
    zbl_io.write_zbl_record(fout, main_zbl_record)
    fout.write("\n")
    #progress bar:
    main_counter = zbl_matcher.total_processed_records_num()
    matched_counter = zbl_matcher.total_matched_records_num()
    if main_counter%10000 == 0:
        print (time.clock() - start_time),"s - ",main_counter, "processed,",matched_counter,"matched"
if __name__ == "__main__": print "The program splits single ZBL file into several files of required size." try: in_path = sys.argv[1] except: print "First argument expected: source-file" sys.exit(-1) try: part_size = int(sys.argv[2]) except: print "Second argument expected: number of records per output file" sys.exit(-1) print "Source file:", in_path print "Records per file:", part_size part_counter = 0 part_records_counter = 0 fout = open(in_path + ".part" + str(part_counter), "w") for record in zbl_io.read_zbl_records(open(in_path, "r")): part_records_counter = part_records_counter + 1 if part_records_counter >= part_size: print part_records_counter, "records stored to file", fout part_counter = part_counter + 1 part_records_counter = 0 fout = open(in_path + ".part" + str(part_counter), "w") zbl_io.write_zbl_record(fout, record) fout.write("\n") print part_records_counter, "records stored to file", fout
def add_field(fin, fout, add_field_name, add_field_value):
    """To every record from fin adds field (add_field_value:add_field_name)
    and stores record to fout."""
    for rec in zbl_io.read_zbl_records(fin):
        #unconditionally set (or overwrite) the requested field
        rec[add_field_name] = add_field_value
        zbl_io.write_zbl_record(fout, rec)
        fout.write("\n")
def id_bags_generator(fin, src_field, value_extractor=extract_bag_of_ids):
    """Returns generator that generates gensim-bags-of-ids/tfidfs
    (read from src_field) from ZBL-file fin."""
    records = zbl_io.read_zbl_records(fin)
    #lazily extract a bag for each record that carries src_field
    return (value_extractor(rec[src_field]) for rec in records if src_field in rec)
if __name__ == "__main__": print "The program splits single ZBL file into several files of required size." try: in_path = sys.argv[1] except: print "First argument expected: source-file" sys.exit(-1) try: part_size = int(sys.argv[2]) except: print "Second argument expected: number of records per output file" sys.exit(-1) print "Source file:", in_path print "Records per file:", part_size part_counter = 0; part_records_counter = 0 fout = open(in_path+".part"+str(part_counter), "w") for record in zbl_io.read_zbl_records( open(in_path, "r") ): part_records_counter = part_records_counter + 1 if part_records_counter >= part_size: print part_records_counter,"records stored to file", fout part_counter = part_counter + 1 part_records_counter = 0 fout = open(in_path+".part"+str(part_counter), "w") zbl_io.write_zbl_record(fout, record) fout.write("\n") print part_records_counter,"records stored to file", fout
        #NOTE(review): this chunk starts inside count_categories' record loop
        #(the def line is above this view) - indentation reconstructed, confirm.
        ccstr = record["cc"].strip().replace("*", "")
        ccs = filter_categories(cc.strip() for cc in ccstr.split(" "))
        for cc in ccs:
            if count.has_key(cc):
                count[cc] = count[cc] + 1
            else:
                count[cc] = 1
    return count

if __name__ == "__main__":
    args = sys.argv
    if len(sys.argv) != 3:
        print "[ERROR] Exactly two arguments are expected: input-zbl-file-path output-count-path-prefix"
        exit(-1)
    zblInPath = args[1]
    statsOutPath = args[2]
    # counting (was: zliczenie)
    zbl_src = zbl_io.read_zbl_records(open(zblInPath))
    count = count_categories(zbl_src, filter_XXY_categories)
    # write to file (was: zapis do pliku)
    # f = open(statsOutPath, "w")
    # for cc in count:
    #     f.write(cc+" "+str(count[cc])+"\n")
    # f.close()
    io.fwrite_vector(statsOutPath + "_labels.svector", count.keys())
    io.fwrite_vector(statsOutPath + "_count.ivector", count.values())
    #NOTE(review): chunk starts mid-script, inside the except-branch of the
    #first-argument parsing try block - indentation reconstructed, confirm.
    sys.exit(-1)
try:
    out_path = sys.argv[2]
except:
    print "Second argument expected: output-zbl-file-path (Pseudo-ZBL)"
    sys.exit(-1)

print "src = ", main_zbl_path
print "dst = ", out_path

#assign citation identities for every record of the main ZBL file
cimatch = CitationMatcher(main_zbl_path)
fout = open(out_path, "w")
main_counter = 0
start_time = time.clock()
for record in zbl_io.read_zbl_records(open(main_zbl_path, 'r')):
    #update citations:
    if record.has_key("ci"):
        cis = zbl_io.unpack_list_of_dictionaries(record["ci"])
        for ci in cis:
            ci = cimatch.add_citation_identity(ci)
        record["ci"] = zbl_io.pack_list_of_dictionaries(cis)
    #write output:
    zbl_io.write_zbl_record(fout, record)
    fout.write("\n")
    #progress bar:
    if main_counter % 10000 == 0:
        print(time.clock() - start_time), "s - ", main_counter, "processed,", (
            cimatch.matched), "matched", (cimatch.missed), "missed"
    main_counter = main_counter + 1
#NOTE(review): chunk starts mid-script; parsing of sys.argv[1] (mapping_path)
#is above this view.
try:
    in_path = sys.argv[2]
except:
    print "Second argument expected: input-zbl-file-path (Pseudo-ZBL)"
    sys.exit(-1)
try:
    out_path = sys.argv[3]
except:
    print "Third argument expected: output-zbl-file-path (Pseudo-ZBL)"
    sys.exit(-1)

print "mapping_path =", mapping_path
print "in_path =", in_path
print "out_path =", out_path

#rewrite the id of every record according to the mapping file
id_mapper = ZblIdMapper(mapping_path, False)
fin = open(in_path, 'r')
fout = open(out_path, 'w')
for record in zbl_io.read_zbl_records(fin):
    zbl_io.write_zbl_record(fout, id_mapper.update_record(record))
    fout.write("\n")
fin.close()
fout.close()
id_mapper.print_stats()
def id_bags_generator(fin, src_field, value_extractor=extract_bag_of_ids):
    """Returns generator that generates gensim-bags-of-ids/tfidfs
    (read from src_field) from ZBL-file fin."""
    records = zbl_io.read_zbl_records(fin)
    #lazily extract a bag for each record that carries src_field
    return (value_extractor(rec[src_field]) for rec in records if src_field in rec)
@author: mlukasik

Find out the differences between 2 data sets
'''
#NOTE(review): chunk starts inside the module docstring (opening quotes are
#above this view)
from record_store import store_py_records, store_txt_records
from zbl_io import load_zbl_file, read_zbl_records
import sys
from collections import defaultdict

fname1 = sys.argv[1]
fname2 = sys.argv[2]

print "loading records1"
#index the first file's records by 'an', then by 'ti'
records1 = defaultdict(lambda: {})
records1_cnt = 0
for rec1 in read_zbl_records(open(fname1, 'r')):
    try:
        records1[rec1['an']][rec1['ti']] = rec1
        records1_cnt += 1
    except:
        #record lacking 'an' or 'ti' is skipped
        pass
print "loaded records1", len(records1), "all of them:", records1_cnt

print "going through records2"
#print every record of the second file absent from the first one
not_in_rec1 = 0
records2_len = 0
for rec2 in read_zbl_records(open(fname2, 'r')):
    if rec2['an'] not in records1 or rec2['ti'] not in records1[rec2['an']]:
        #print "record not in records1!"
        print rec2
@author: mlukasik

Find out the differences between 2 data sets
'''
#NOTE(review): chunk starts inside the module docstring (opening quotes are
#above this view)
from record_store import store_py_records, store_txt_records
from zbl_io import load_zbl_file, read_zbl_records
import sys
from collections import defaultdict

fname1 = sys.argv[1]
fname2 = sys.argv[2]

print "loading records1"
#index the first file's records by 'an', then by 'ti'
records1 = defaultdict(lambda: {})
records1_cnt = 0
for rec1 in read_zbl_records(open(fname1, 'r')):
    try:
        records1[rec1['an']][rec1['ti']] = rec1
        records1_cnt+=1
    except:
        #record lacking 'an' or 'ti' is skipped
        pass
print "loaded records1", len(records1), "all of them:", records1_cnt

print "going through records2"
#print every record of the second file absent from the first one
not_in_rec1 = 0
records2_len = 0
for rec2 in read_zbl_records(open(fname2, 'r')):
    if rec2['an'] not in records1 or rec2['ti'] not in records1[rec2['an']]:
        #print "record not in records1!"
        print rec2
#Script: rewrites record ids of a Pseudo-ZBL file according to a src-id/dst-id
#mapping file. Arguments: mapping-file input-zbl output-zbl.
try:
    mapping_path = sys.argv[1]
except:
    print "First argument expected: mapping-file-path (every line in format: src-id dst-id)"
    sys.exit(-1)
try:
    in_path = sys.argv[2]
except:
    print "Second argument expected: input-zbl-file-path (Pseudo-ZBL)"
    sys.exit(-1)
try:
    out_path = sys.argv[3]
except:
    print "Third argument expected: output-zbl-file-path (Pseudo-ZBL)"
    sys.exit(-1)

print "mapping_path =", mapping_path
print "in_path =", in_path
print "out_path =", out_path

#rewrite the id of every record according to the mapping file
id_mapper = ZblIdMapper(mapping_path, False)
fin = open(in_path, 'r')
fout = open(out_path, 'w')
for record in zbl_io.read_zbl_records(fin):
    zbl_io.write_zbl_record(fout, id_mapper.update_record(record))
    fout.write("\n")
fin.close()
fout.close()
id_mapper.print_stats()