def are_zbl_records_similar(rec1, rec2):
    """Return True if rec1 and rec2 are believed to describe the same item.

    The checks run in this order:

    1. If both records have ``zbl_io.ZBL_ID_FIELD`` and the values are
       equal, the records are similar. Different values are not decisive
       and the remaining checks still run.
    2. If both records have "zb", the answer is whether those values are
       equal.
    3. If both records have "mr", the answer is whether those values are
       equal.
    4. Otherwise the records are similar only when all of these hold:
       "py" values agree (when both are present), both records have "au"
       and the unpacked author lists are almost equal, and the
       lower-cased "ti" values are almost equal (a missing "ti" is
       treated as an empty string).

    Args:
        rec1: dict-like record (supports ``in``, item access and ``get``).
        rec2: dict-like record of the same kind.

    Returns:
        True if the records are considered similar, False otherwise.
    """
    id_field = zbl_io.ZBL_ID_FIELD
    if id_field in rec1 and id_field in rec2:
        if rec1[id_field] == rec2[id_field]:
            return True

    # When both records carry the same kind of external id, that id decides.
    if "zb" in rec1 and "zb" in rec2:
        return rec1["zb"] == rec2["zb"]
    if "mr" in rec1 and "mr" in rec2:
        return rec1["mr"] == rec2["mr"]

    # If present, publication years must agree.
    if "py" in rec1 and "py" in rec2:
        if rec1["py"] != rec2["py"]:
            return False

    # Both records must have authors, and the author lists must be close.
    if "au" in rec1 and "au" in rec2:
        au1 = zbl_io.unpack_multivalue_field(rec1["au"])
        au2 = zbl_io.unpack_multivalue_field(rec2["au"])
        # NOTE(review): the meaning of the thresholds (4, 2) is defined by
        # are_lists_almost_equal, which is not visible here.
        if not are_lists_almost_equal(au1, au2, 4, 2):
            return False
    else:
        return False

    # Are the titles similar?
    ti1 = rec1.get('ti', '').lower()
    ti2 = rec2.get('ti', '').lower()
    # NOTE(review): the meaning of the thresholds (6, 4) is defined by
    # are_elements_almost_equal, which is not visible here.
    if not are_elements_almost_equal(ti1, ti2, 6, 4):
        return False
    return True
def _update_zbl_record_history_(main_zbl_record, aux_zbl_record):
    """Append the <zz> values of aux_zbl_record to those of main_zbl_record.

    <zz> is the field that identifies a record's source, so it is merged
    rather than overwritten. The merge happens only when both records
    contain "zz"; otherwise main_zbl_record is left untouched.

    Args:
        main_zbl_record: dict-like record, updated in place.
        aux_zbl_record: dict-like record whose "zz" values are appended.

    Returns:
        main_zbl_record (the same object that was passed in).
    """
    if "zz" in main_zbl_record and "zz" in aux_zbl_record:
        merged_sources = zbl_io.unpack_multivalue_field(main_zbl_record["zz"])
        # Plain concatenation: values present in both records are kept twice.
        merged_sources.extend(
            zbl_io.unpack_multivalue_field(aux_zbl_record["zz"]))
        main_zbl_record["zz"] = zbl_io.pack_multivalue_field(merged_sources)
    return main_zbl_record
def calc_msc2count(fin, src_field='mc'):
    """Count occurrences of every code stored in src_field.

    Records are read from fin with zbl_io.read_zbl_records; records that
    do not contain src_field are skipped. A progress message is logged
    for every 10000th record (starting with record 0).

    Args:
        fin: input handed to zbl_io.read_zbl_records.
        src_field: name of the multi-value field holding the codes.

    Returns:
        dict mapping each code to the number of times it was seen.
    """
    counts = {}
    for record_no, zbl_record in enumerate(zbl_io.read_zbl_records(fin)):
        if record_no % 10000 == 0:
            logging.info("[calc_msc_model] " + str(record_no) + " records processed")
        if src_field in zbl_record:
            for code in zbl_io.unpack_multivalue_field(zbl_record[src_field]):
                if code in counts:
                    counts[code] += 1
                else:
                    counts[code] = 1
    return counts
def calc_msc2count(fin, src_field='mc'):
    """Build a dictionary {code: number of occurrences} for src_field.

    Every record produced by zbl_io.read_zbl_records(fin) that contains
    src_field contributes one count per value unpacked from that field.
    Progress is logged once per 10000 records.

    Args:
        fin: input handed to zbl_io.read_zbl_records.
        src_field: name of the multi-value field holding the codes.

    Returns:
        dict mapping code -> count.
    """
    tally = {}
    records = zbl_io.read_zbl_records(fin)
    for position, entry in enumerate(records):
        at_checkpoint = (position % 10000 == 0)
        if at_checkpoint:
            progress = "[calc_msc_model] " + str(position) + " records processed"
            logging.info(progress)
        if src_field not in entry:
            continue
        codes = zbl_io.unpack_multivalue_field(entry[src_field])
        for code in codes:
            seen_so_far = tally.get(code, 0)
            tally[code] = seen_so_far + 1
    return tally
def filter_af(fin, fout):
    """Copy records from fin to fout, dropping empty "af" fields.

    An "af" field is considered empty when every value unpacked from it
    equals "-". Every record is written to fout, followed by a newline,
    whether or not its "af" field was removed.

    Args:
        fin: input handed to zbl_io.read_zbl_records.
        fout: writable output; passed to zbl_io.write_zbl_record and
            written to directly.

    Returns:
        Number of "af" fields that were removed.
    """
    removed = 0
    for record in zbl_io.read_zbl_records(fin):
        if "af" in record:
            af_values = zbl_io.unpack_multivalue_field(record["af"])
            # all() is also True for an empty list, which matches the
            # previous "count of '-' == len" test.
            if all(value == '-' for value in af_values):
                record.pop("af")
                removed += 1
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return removed
def calc_msc_membership(fin, fout, known_msc_codes,
                        src_field='mc', dst_field='m0',
                        re_leaf_pattern=msc_processing.MSC_ORDINARY_LEAF_PATTERN_RE,
                        dbg_field='m_'):
    """Add a membership vector, derived from src_field, to every record.

    known_msc_codes is filtered with filter_msccodes(..., re_leaf_pattern),
    indexed with calc_msc2ix and grouped with group_by_prefix. For each
    record that contains src_field:

    * its codes are unpacked and filtered the same way;
    * the candidate codes are those stored under the 2- and 3-character
      prefixes of each of the record's codes;
    * msccode_membership(record codes, candidate) is computed for every
      candidate;
    * if there is at least one result, the sorted unique
      (index, membership) pairs are packed into dst_field, and the same
      pairs with the code in place of the index are packed into dbg_field.

    Every record, updated or not, is written to fout followed by a
    newline.

    Args:
        fin: input handed to zbl_io.read_zbl_records.
        fout: writable output for the resulting records.
        known_msc_codes: collection of codes that defines the index space.
        src_field: field the record's codes are read from.
        dst_field: field that receives the (index, membership) pairs.
        re_leaf_pattern: pattern passed to filter_msccodes.
        dbg_field: field that receives the (code, membership) pairs.

    Returns:
        Number of records that received dst_field.
    """
    msccodes = filter_msccodes(known_msc_codes, re_leaf_pattern)
    msc2ix = calc_msc2ix(msccodes)
    # items() instead of iteritems(): same pairs, works on Python 2 and 3.
    ix2msc = dict((ix, msc) for msc, ix in msc2ix.items())
    prefix2msc = group_by_prefix(msccodes)

    counter = 0
    for i, record in enumerate(zbl_io.read_zbl_records(fin)):
        if i % 10000 == 0:
            logging.info("[calc_msc_membership] " + str(i) + " records processed. " + str(counter) + "updated.")
        if src_field in record:
            record_msccodes = zbl_io.unpack_multivalue_field(record[src_field])
            record_msccodes = filter_msccodes(record_msccodes, re_leaf_pattern)

            # Only codes that share a prefix fragment with one of the
            # record's codes are compared.
            # NOTE(review): assumes prefix2msc has an entry for every
            # prefix looked up here (or supplies a default) - confirm
            # against group_by_prefix.
            compared_codes = set()
            for record_msccode in record_msccodes:
                prefix2 = record_msccode[:2]
                prefix3 = record_msccode[:3]
                compared_codes.update(prefix2msc[prefix2])
                compared_codes.update(prefix2msc[prefix3])

            mscmembership = []
            for compared_code in compared_codes:
                membership = msccode_membership(record_msccodes, compared_code)
                mscmembership.append((msc2ix[compared_code], membership))

            if mscmembership:
                # Store the results.
                mscmembership = sorted(set(mscmembership))
                record[dst_field] = zbl_io.pack_listpairs_field(mscmembership)
                record[dbg_field] = zbl_io.pack_listpairs_field(
                    [(ix2msc[ix], m) for ix, m in mscmembership])
                counter = counter + 1
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return counter
def calc_msc_membership(fin, fout, known_msc_codes,
                        src_field='mc', dst_field='m0',
                        re_leaf_pattern=msc_processing.MSC_ORDINARY_LEAF_PATTERN_RE,
                        dbg_field='m_'):
    """Write records from fin to fout, adding a membership vector field.

    The known codes are filtered with filter_msccodes and re_leaf_pattern,
    numbered with calc_msc2ix and grouped with group_by_prefix. A record
    containing src_field has its own codes filtered the same way; every
    known code stored under the 2- or 3-character prefix of one of those
    codes is scored with msccode_membership. When at least one score
    exists, dst_field receives the sorted unique (index, membership)
    pairs and dbg_field receives the same pairs keyed by code.

    All records are written to fout, each followed by a newline.

    Args:
        fin: input handed to zbl_io.read_zbl_records.
        fout: writable output for the resulting records.
        known_msc_codes: collection of codes that defines the index space.
        src_field: field the record's codes are read from.
        dst_field: field that receives the (index, membership) pairs.
        re_leaf_pattern: pattern passed to filter_msccodes.
        dbg_field: field that receives the (code, membership) pairs.

    Returns:
        Number of records to which dst_field was added.
    """
    msccodes = filter_msccodes(known_msc_codes, re_leaf_pattern)
    msc2ix = calc_msc2ix(msccodes)
    # dict.iteritems() exists only on Python 2; items() yields the same
    # pairs on both major versions.
    ix2msc = dict((ix, msc) for msc, ix in msc2ix.items())
    prefix2msc = group_by_prefix(msccodes)

    counter = 0
    for i, record in enumerate(zbl_io.read_zbl_records(fin)):
        if i % 10000 == 0:
            logging.info("[calc_msc_membership] " + str(i) + " records processed. " + str(counter) + "updated.")

        if src_field in record:
            record_msccodes = filter_msccodes(
                zbl_io.unpack_multivalue_field(record[src_field]),
                re_leaf_pattern)

            # Look only at known codes whose prefix fragments match one of
            # the record's codes.
            # NOTE(review): a prefix missing from prefix2msc raises here
            # unless group_by_prefix returns a defaulting mapping - confirm.
            compared_codes = set()
            for record_msccode in record_msccodes:
                for prefix_len in (2, 3):
                    compared_codes.update(prefix2msc[record_msccode[:prefix_len]])

            mscmembership = []
            for compared_code in compared_codes:
                membership = msccode_membership(record_msccodes, compared_code)
                mscmembership.append((msc2ix[compared_code], membership))

            if mscmembership:
                # Save the results.
                mscmembership = sorted(set(mscmembership))
                record[dst_field] = zbl_io.pack_listpairs_field(mscmembership)
                record[dbg_field] = zbl_io.pack_listpairs_field(
                    [(ix2msc[ix], m) for ix, m in mscmembership])
                counter = counter + 1

        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return counter