def copy_field(fin, fout, src_field, dst_field):
    """In every record from fin copies field src_field to field dst_field and stores record to fout.

    Records lacking src_field are written through unchanged.
    """
    for record in zbl_io.read_zbl_records(fin):
        # 'in' replaces the deprecated dict.has_key (same semantics)
        if src_field in record:
            record[dst_field] = record[src_field]
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
Example #2
0
def gensim_mapfields_dict_file(fin,
                               fout,
                               fields,
                               filter_by_fields,
                               dictionary,
                               dst_field,
                               dbg_field_name="g_"):
    """Maps merged `fields` of each record carrying `filter_by_fields` through
    the gensim `dictionary` and stores the result under `dst_field`.

    Returns the number of records that received `dst_field`."""
    logging.info("[gensim_mapfields_dict_file] filter_by_fields="+str(filter_by_fields)+\
    " fields="+str(fields)+" dictionary="+str(dictionary)+" fin="+str(fin)+" dst_field="+str(dst_field))
    # id -> token lookup; used only for debugging output downstream
    id2token = dict(dictionary.iteritems())
    mapped = 0
    for rec_no, record in enumerate(zbl_io.read_zbl_records(fin)):
        if rec_no % 10000 == 0:
            logging.info("[gensim_mapfields_dict_file] " + str(rec_no) +
                         " records processed")
        record = gensim_mapfields_dict(record, fields, filter_by_fields,
                                       dictionary, dst_field, id2token,
                                       dbg_field_name)
        if dst_field in record:
            mapped += 1
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return mapped
def copy_field(fin, fout, src_field, dst_field):
    """In every record from fin copies field src_field to field dst_field and stores record to fout.

    Records without src_field pass through unchanged.
    """
    for record in zbl_io.read_zbl_records(fin):
        # membership test instead of the deprecated dict.has_key
        if src_field in record:
            record[dst_field] = record[src_field]
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
def copy_file(fin, fout):
    """Streams every record from fin to fout unchanged.

    Returns the number of records copied."""
    n_copied = 0
    for rec in zbl_io.read_zbl_records(fin):
        zbl_io.write_zbl_record(fout, rec)
        fout.write("\n")
        n_copied += 1
    return n_copied
def merge_fields(fin, fout, src_fields, dst_field, separator = " "):
    """In every record from fin merges fields from src_field to field dst_field and stores record to fout."""
    for record in zbl_io.read_zbl_records(fin):
        try:
            dst_val = reduce(lambda a,b: a+separator+b, (record[src_field] for src_field in src_fields if src_field in record) )
            record[dst_field] = dst_val
        except:
            print "[merge_fields] Failed merging in record an=", record[zbl_io.ZBL_ID_FIELD]
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
def copy_file(fin, fout):
    """Writes every record of fin to fout verbatim.

    Returns how many records were written."""
    written = 0
    for written, record in enumerate(zbl_io.read_zbl_records(fin), 1):
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return written
Example #7
0
 def append_not_matched_records(self, fout):
     """Writes to fout every loaded aux record whose id was never used by
     self.match_aux_record, one record per line.

     Returns the number of records written.
     """
     appended = 0
     for aux_rec in self.aux_zbl_recs_list:
         if aux_rec[zbl_io.ZBL_ID_FIELD] not in self.aux_used_ids:
             zbl_io.write_zbl_record(fout, aux_rec)
             fout.write("\n")
             appended += 1
     return appended
 def append_not_matched_records(self, fout):
     """Appends to fout all loaded (aux) records that were never matched
     via self.match_aux_record.

     Returns number of records appended.
     """
     count = 0
     unmatched = (r for r in self.aux_zbl_recs_list
                  if r[zbl_io.ZBL_ID_FIELD] not in self.aux_used_ids)
     for rec in unmatched:
         zbl_io.write_zbl_record(fout, rec)
         fout.write("\n")
         count = count + 1
     return count
def keep_records_ids(fin, fout, keep_ids_file):
    """Copies records from fin to fout. Keeps only those records of ids contained in file keep_ids_file (path).
    
    Returns list of kept ids."""
    filter_ids = set(line.strip() for line in open(keep_ids_file).xreadlines())
    print len(filter_ids), " on the 'keep-ids' list"
    kept_ids = set()
    for record in zbl_io.read_zbl_records(fin):
        if not record[zbl_io.ZBL_ID_FIELD] in filter_ids: continue
        kept_ids.add(record[zbl_io.ZBL_ID_FIELD])
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return kept_ids
def keep_authors(fin, fout):
    """Copies every record from fin to fout stripped to the author-related
    fields (an, au, ai) via record_keep_authors.

    Returns the number of records copied.
    """
    copied = 0
    for rec in zbl_io.read_zbl_records(fin):
        zbl_io.write_zbl_record(fout, record_keep_authors(rec))
        fout.write("\n")
        copied += 1
    return copied
def filter_fields_vals(fin, fout, list_of_fields, text_filter = text_filter_lower_space, word_predicate = def_word_predicate):
    """Copies records from fin to fout and for fields on list_of_fields filters its' values.

    A field whose value cannot be filtered (e.g. empty) is removed from the record."""
    logging.info("[filter_fields_vals] text_filter="+str(text_filter)+" word_predicate="+str(word_predicate))
    for record in zbl_io.read_zbl_records(fin):
        for field in list_of_fields:
            if field in record:  # 'in' replaces deprecated has_key
                try:
                    record[field] = words_filter(text_filter(record[field]), word_predicate)
                except Exception:  # narrowed from bare except (lets KeyboardInterrupt/SystemExit through)
                    logging.warn("Removing field in an="+str(record[zbl_io.ZBL_ID_FIELD])+" (is field empty?):"+field+" = "+record[field])
                    record.pop(field)
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
def keep_authors(fin, fout):
    """Copies all records from fin to fout.

    Removes all fields apart from an, au, ai (see record_keep_authors).
    Returns number of all copied records.
    """
    total = 0
    for total, record in enumerate(zbl_io.read_zbl_records(fin), 1):
        zbl_io.write_zbl_record(fout, record_keep_authors(record))
        fout.write("\n")
    return total
def filter_records(fin, fout, bad_ids_file):
    """Copies records from fin to fout. Filters out records of ids contained in file bad_ids_file (path).

    Returns the set of skipped (filtered out) ids."""
    # 'with' closes the handle (the original leaked it); plain iteration
    # replaces the deprecated xreadlines().
    with open(bad_ids_file) as ids_f:
        filter_ids = set(line.strip() for line in ids_f)
    skipped_ids = set()
    for record in zbl_io.read_zbl_records(fin):
        if record[zbl_io.ZBL_ID_FIELD] in filter_ids:
            skipped_ids.add(record[zbl_io.ZBL_ID_FIELD])
            continue
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return skipped_ids
def filter_records(fin, fout, bad_ids_file):
    """Copies records from fin to fout. Filters out records of ids contained in file bad_ids_file (path).

    Returns the set of skipped (filtered out) ids."""
    # context manager fixes the leaked file handle; direct file iteration
    # supersedes the deprecated xreadlines().
    with open(bad_ids_file) as ids_f:
        filter_ids = set(line.strip() for line in ids_f)
    skipped_ids = set()
    for record in zbl_io.read_zbl_records(fin):
        if record[zbl_io.ZBL_ID_FIELD] in filter_ids:
            skipped_ids.add(record[zbl_io.ZBL_ID_FIELD])
            continue
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return skipped_ids
def keep_records_ids(fin, fout, keep_ids_file):
    """Copies records from fin to fout. Keeps only those records of ids contained in file keep_ids_file (path).
    
    Returns list of kept ids."""
    filter_ids = set(line.strip() for line in open(keep_ids_file).xreadlines())
    print len(filter_ids)," on the 'keep-ids' list"
    kept_ids = set()        
    for record in zbl_io.read_zbl_records(fin):
        if not record[zbl_io.ZBL_ID_FIELD] in filter_ids: continue
        kept_ids.add(record[zbl_io.ZBL_ID_FIELD])
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return kept_ids
def keep_records(fin, fout, must_have_fields):
    """Copies records from fin to fout. 
    
    Keeps only these records that have all fields from must_have_fields list.
    """
    kept_counter = 0
    for i,record in enumerate(zbl_io.read_zbl_records(fin)):
        if i%10000 == 0: print "[keep_records]", i,"processed", kept_counter, "kept"
        if has_all_fields(record, must_have_fields):
            zbl_io.write_zbl_record(fout, record)
            fout.write("\n")  
            kept_counter = kept_counter + 1  
    return kept_counter
def keep_records(fin, fout, must_have_fields):
    """Copies records from fin to fout. 
    
    Keeps only these records that have all fields from must_have_fields list.
    """
    kept_counter = 0
    for i, record in enumerate(zbl_io.read_zbl_records(fin)):
        if i % 10000 == 0:
            print "[keep_records]", i, "processed", kept_counter, "kept"
        if has_all_fields(record, must_have_fields):
            zbl_io.write_zbl_record(fout, record)
            fout.write("\n")
            kept_counter = kept_counter + 1
    return kept_counter
def merge_fields(fin, fout, src_fields, dst_field, separator=" "):
    """In every record from fin merges fields from src_field to field dst_field and stores record to fout."""
    for record in zbl_io.read_zbl_records(fin):
        try:
            dst_val = reduce(
                lambda a, b: a + separator + b,
                (record[src_field]
                 for src_field in src_fields if src_field in record))
            record[dst_field] = dst_val
        except:
            print "[merge_fields] Failed merging in record an=", record[
                zbl_io.ZBL_ID_FIELD]
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
def filter_duplicates(fin, fout):
    """Copies records from fin to fout. Records with duplicated id are filtered out.

    Returns the set of duplicated ids."""
    seen_ids = set()
    duplicated_ids = set()
    for record in zbl_io.read_zbl_records(fin):
        # renamed from 'id' to avoid shadowing the builtin id()
        rec_id = fix_id(record[zbl_io.ZBL_ID_FIELD])
        if rec_id in seen_ids:
            duplicated_ids.add(rec_id)
            continue
        seen_ids.add(rec_id)
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return duplicated_ids
def filter_duplicates(fin, fout):
    """Copies records from fin to fout. Records with duplicated id are filtered out.

    Returns the set of duplicated ids."""
    seen = set()
    duplicated_ids = set()
    for record in zbl_io.read_zbl_records(fin):
        # 'rec_id' instead of 'id' so the builtin id() is not shadowed
        rec_id = fix_id(record[zbl_io.ZBL_ID_FIELD])
        if rec_id in seen:
            duplicated_ids.add(rec_id)
            continue
        seen.add(rec_id)
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return duplicated_ids
def filter_field(fin, fout, field_name):
    """Copies records from fin to fout but keeping only id and field of field_name.

    Returns number of records in which field_name was found.
    """
    counter = 0
    for record in zbl_io.read_zbl_records(fin):
        newrec = {zbl_io.ZBL_ID_FIELD: record[zbl_io.ZBL_ID_FIELD]}
        if field_name in record:  # 'in' replaces deprecated has_key
            newrec[field_name] = record[field_name]
            counter = counter + 1
        zbl_io.write_zbl_record(fout, newrec)
        fout.write("\n")
    return counter
def filter_field(fin, fout, field_name):
    """Copies records from fin to fout but keeping only id and field of field_name.

    Returns number of records in which field_name was found.
    """
    found = 0
    for record in zbl_io.read_zbl_records(fin):
        # start the reduced record from the mandatory id field
        newrec = {zbl_io.ZBL_ID_FIELD: record[zbl_io.ZBL_ID_FIELD]}
        if field_name in record:  # membership test instead of has_key
            newrec[field_name] = record[field_name]
            found = found + 1
        zbl_io.write_zbl_record(fout, newrec)
        fout.write("\n")
    return found
def filter_af(fin, fout):
    """Copies records from fin to fout but also removes from records empty (only "-" values) af fields.

    Returns number of removed fields.
    """
    counter = 0
    for record in zbl_io.read_zbl_records(fin):
        if "af" in record:  # 'in' replaces deprecated has_key
            af = zbl_io.unpack_multivalue_field(record["af"])
            # all() states the intent directly (sum-vs-len comparison was indirect);
            # like the original, an empty af list also counts as empty
            if all(a == '-' for a in af):
                record.pop("af")
                counter = counter + 1
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return counter
def filter_af(fin, fout):
    """Copies records from fin to fout but also removes from records empty (only "-" values) af fields.

    Returns number of removed fields.
    """
    removed = 0
    for record in zbl_io.read_zbl_records(fin):
        if "af" in record:  # membership test instead of deprecated has_key
            af_values = zbl_io.unpack_multivalue_field(record["af"])
            # all() replaces the opaque sum(...) == len(...) test;
            # an empty unpacked list is treated as empty, as before
            if all(v == '-' for v in af_values):
                record.pop("af")
                removed = removed + 1
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return removed
def gensim_mapfields_dict_file(fin, fout, fields, filter_by_fields, dictionary, dst_field, dbg_field_name = "g_"):
    """Merges `fields` of every record that carries `filter_by_fields`, maps
    the merged text through the gensim `dictionary`, and stores the result
    under `dst_field`.

    Returns the number of records that received `dst_field`."""
    logging.info("[gensim_mapfields_dict_file] filter_by_fields="+str(filter_by_fields)+\
    " fields="+str(fields)+" dictionary="+str(dictionary)+" fin="+str(fin)+" dst_field="+str(dst_field))
    id2token = dict(dictionary.iteritems())  # debug lookup: token by id
    processed = 0
    for rec_no, record in enumerate(zbl_io.read_zbl_records(fin)):
        if rec_no % 10000 == 0:
            logging.info("[gensim_mapfields_dict_file] "+str(rec_no)+" records processed")
        record = gensim_mapfields_dict(record, fields, filter_by_fields,
                                       dictionary, dst_field, id2token, dbg_field_name)
        if dst_field in record:
            processed += 1
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return processed
def calc_msc_membership(fin, fout, known_msc_codes, \
                        src_field='mc', dst_field='m0', \
                        re_leaf_pattern=msc_processing.MSC_ORDINARY_LEAF_PATTERN_RE, dbg_field='m_'):
    """Updates records with additional dst_field with membership vector calculated basing on msc codes read from src_field.

    fin/fout -- input/output ZBL record streams.
    known_msc_codes -- iterable of all known MSC codes; filtered by re_leaf_pattern.
    src_field -- field holding the record's packed MSC codes.
    dst_field -- receives packed (code-index, membership) pairs.
    dbg_field -- receives the same pairs keyed by MSC code text (debug aid).
    Returns the number of records that received dst_field.
    """
    msccodes = filter_msccodes(known_msc_codes, re_leaf_pattern)
    msc2ix = calc_msc2ix(msccodes)  # code -> dense index
    ix2msc = dict((ix, msc) for msc, ix in msc2ix.iteritems())  # inverse map, for dbg_field
    prefix2msc = group_by_prefix(msccodes)  # prefix -> codes sharing it

    counter = 0
    for i, record in enumerate(zbl_io.read_zbl_records(fin)):
        if i % 10000 == 0:
            logging.info("[calc_msc_membership] " + str(i) +
                         " records processed. " + str(counter) + "updated.")
        if src_field in record:
            record_msccodes = zbl_io.unpack_multivalue_field(record[src_field])
            record_msccodes = filter_msccodes(record_msccodes, re_leaf_pattern)

            compared_codes = set(
            )  # consider only codes sharing a 2- or 3-char prefix with the record's codes
            for record_msccode in record_msccodes:
                prefix2 = record_msccode[:2]
                prefix3 = record_msccode[:3]
                # NOTE(review): raises KeyError if a prefix is absent from prefix2msc — confirm inputs guarantee coverage
                compared_codes.update(prefix2msc[prefix2])
                compared_codes.update(prefix2msc[prefix3])

            mscmembership = []
            for compared_code in compared_codes:
                membership = msccode_membership(record_msccodes, compared_code)
                mscmembership.append((msc2ix[compared_code], membership))

            if len(mscmembership) > 0:  # store the results
                mscmembership = sorted(set(mscmembership))
                record[dst_field] = zbl_io.pack_listpairs_field(mscmembership)
                record[dbg_field] = zbl_io.pack_listpairs_field([
                    (ix2msc[ix], m) for ix, m in mscmembership
                ])
                counter = counter + 1

        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return counter
def gensim_mapfield_model(fin, fout, model, src_field, dst_field,\
                          src_field_value_extractor=extract_bag_of_ids, dst_field_value_builder=zbl_io.pack_listpairs_field):
    """Runs the gensim `model` over `src_field` of each record that has it
    (its value is parsed as a bag-of-ids with weights) and stores the
    transformed value under `dst_field`.

    Returns number of enriched records."""
    logging.info("[gensim_mapfield_model] src_field="+str(src_field)+\
     " model="+str(model)+" fin="+str(fin)+" dst_field="+str(dst_field)+\
     " src_field_value_extractor="+str(src_field_value_extractor)+" dst_field_value_builder="+str(dst_field_value_builder))
    enriched = 0
    for doc_no, record in enumerate(zbl_io.read_zbl_records(fin)):
        if doc_no % 10000 == 0:
            logging.info("[gensim_mapfield_model] "+str(doc_no)+" documents mapped...")
        if src_field in record:
            bag = src_field_value_extractor(record[src_field])
            record[dst_field] = dst_field_value_builder(model[bag])
            logging.debug("[gensim_mapfield_model]"+record[src_field]+" -> "+record[dst_field])
            enriched += 1
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return enriched
def filter_fields_vals(fin,
                       fout,
                       list_of_fields,
                       text_filter=text_filter_lower_space,
                       word_predicate=def_word_predicate):
    """Copies records from fin to fout and for fields on list_of_fields filters its' values.

    A field whose value cannot be filtered (e.g. empty) is removed from the record."""
    logging.info("[filter_fields_vals] text_filter=" + str(text_filter) +
                 " word_predicate=" + str(word_predicate))
    for record in zbl_io.read_zbl_records(fin):
        for field in list_of_fields:
            if field in record:  # 'in' replaces deprecated has_key
                try:
                    record[field] = words_filter(text_filter(record[field]),
                                                 word_predicate)
                except Exception:  # narrowed from bare except
                    logging.warn("Removing field in an=" +
                                 str(record[zbl_io.ZBL_ID_FIELD]) +
                                 " (is field empty?):" + field + " = " +
                                 record[field])
                    record.pop(field)
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
def calc_msc_membership(fin, fout, known_msc_codes, \
                        src_field='mc', dst_field='m0', \
                        re_leaf_pattern=msc_processing.MSC_ORDINARY_LEAF_PATTERN_RE, dbg_field='m_'):
    """Updates records with additional dst_field with membership vector calculated basing on msc codes read from src_field.

    known_msc_codes -- iterable of known MSC codes, filtered by re_leaf_pattern.
    dst_field -- receives packed (code-index, membership) pairs.
    dbg_field -- receives the same pairs keyed by MSC code text (debug aid).
    Returns the number of records that received dst_field.
    """
    msccodes    = filter_msccodes(known_msc_codes, re_leaf_pattern)
    msc2ix      = calc_msc2ix(msccodes)  # code -> dense index
    ix2msc      = dict((ix,msc) for msc,ix in msc2ix.iteritems())  # inverse map, for dbg_field
    prefix2msc  = group_by_prefix(msccodes)  # prefix -> codes sharing it

    counter = 0;
    for i,record in enumerate(zbl_io.read_zbl_records(fin)):
        if i%10000 == 0: logging.info("[calc_msc_membership] "+str(i)+" records processed. "+str(counter)+"updated.")
        if src_field in record:
            record_msccodes = zbl_io.unpack_multivalue_field(record[src_field])
            record_msccodes = filter_msccodes(record_msccodes, re_leaf_pattern)

            compared_codes = set() # consider only codes sharing a 2- or 3-char prefix with the record's codes
            for record_msccode in record_msccodes:
                prefix2 = record_msccode[:2]
                prefix3 = record_msccode[:3]
                # NOTE(review): KeyError if a prefix is missing from prefix2msc — confirm inputs guarantee coverage
                compared_codes.update( prefix2msc[prefix2] )
                compared_codes.update( prefix2msc[prefix3] )

            mscmembership = []
            for compared_code in compared_codes:
                membership = msccode_membership(record_msccodes, compared_code)
                mscmembership.append( (msc2ix[compared_code],membership) )

            if len(mscmembership) > 0: # store the results
                mscmembership = sorted(set(mscmembership))
                record[dst_field] = zbl_io.pack_listpairs_field(mscmembership)
                record[dbg_field] = zbl_io.pack_listpairs_field([(ix2msc[ix],m) for ix,m in mscmembership])
                counter = counter + 1

        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return counter
Example #30
0
def gensim_mapfield_model(fin, fout, model, src_field, dst_field,\
                          src_field_value_extractor=extract_bag_of_ids, dst_field_value_builder=zbl_io.pack_listpairs_field):
    """Transforms `src_field` of every record that has it through the gensim
    `model` (value parsed as a bag-of-ids with weights/counts) and writes the
    result into `dst_field`.

    Returns number of enriched records."""
    logging.info("[gensim_mapfield_model] src_field="+str(src_field)+\
     " model="+str(model)+" fin="+str(fin)+" dst_field="+str(dst_field)+\
     " src_field_value_extractor="+str(src_field_value_extractor)+" dst_field_value_builder="+str(dst_field_value_builder))
    n_enriched = 0
    for idx, record in enumerate(zbl_io.read_zbl_records(fin)):
        if idx % 10000 == 0:
            logging.info("[gensim_mapfield_model] " + str(idx) +
                         " documents mapped...")
        if src_field in record:
            bag_of_ids = src_field_value_extractor(record[src_field])
            mapped_values = model[bag_of_ids]
            record[dst_field] = dst_field_value_builder(mapped_values)
            logging.debug("[gensim_mapfield_model]" + record[src_field] +
                          " -> " + record[dst_field])
            n_enriched += 1
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return n_enriched
    # NOTE(review): orphaned script fragment — this code sits after a `return`
    # of the preceding function and appears to be the scraped interior of one
    # or more __main__ blocks (argv parsing, id-mapping, record-mixing).
    # Left byte-identical; only comments added.
    try:
        in_path = sys.argv[2]
    except:
        print "Second argument expected: input-zbl-file-path (Pseudo-ZBL)"        
        sys.exit(-1)
    try:
        out_path = sys.argv[3]
    except:
        print "Third argument expected: output-zbl-file-path (Pseudo-ZBL)"        
        sys.exit(-1)        
        
    print "mapping_path =", mapping_path
    print "in_path =", in_path
    print "out_path =", out_path


    # rewrite every record through the id mapper (mapping_path defined in the unseen part)
    id_mapper = ZblIdMapper(mapping_path, False)
    fin = open(in_path, 'r')
    fout = open(out_path, 'w')
    for record in zbl_io.read_zbl_records(fin):        
        zbl_io.write_zbl_record(fout, id_mapper.update_record(record))           
        fout.write("\n")               
    fin.close()
    fout.close()
    
    id_mapper.print_stats()
    
   

        
    
    print "-----------------------------------"
    zbl_matcher.print_py_report()
    print "-----------------------------------"

    ######################################################################################################
    #mix records from two files:
    start_time = time.clock()
    for main_zbl_record in zbl_io.read_zbl_records(fmain):       

        aux_zbl_record = zbl_matcher.match_aux_record(main_zbl_record, only_fast_match_methods)                        
        if not aux_zbl_record is None: #mix two records:                                        
            main_zbl_record = update_zbl_record(main_zbl_record, aux_zbl_record, forced_fields)
            
        #write results:    
        zbl_io.write_zbl_record(fout, main_zbl_record)  
        fout.write("\n")
        
        #progress bar:   
        main_counter = zbl_matcher.total_processed_records_num()
        matched_counter = zbl_matcher.total_matched_records_num()
        if main_counter%10000 == 0:
            print (time.clock() - start_time),"s - ",main_counter, "processed,",matched_counter,"matched"
           
    ######################################################################################################
    if append_not_matched_records_flag:                   
        print zbl_matcher.append_not_matched_records(fout), " appended not matched records..."
                    
    fmain.close()
    fout.close()
    
    
def add_field(fin, fout, add_field_name, add_field_value):
    """Stamps every record from fin with add_field_name=add_field_value
    and writes the record to fout."""
    for rec in zbl_io.read_zbl_records(fin):
        rec[add_field_name] = add_field_value
        zbl_io.write_zbl_record(fout, rec)
        fout.write("\n")
Example #34
0
        # NOTE(review): orphaned script fragment (interior of an unseen
        # __main__ block): matches citations of every record against
        # CitationMatcher. Left byte-identical; only comments added.
        sys.exit(-1)

    print "src = ", main_zbl_path
    print "dst = ", out_path

    cimatch = CitationMatcher(main_zbl_path)

    fout = open(out_path, "w")
    main_counter = 0
    start_time = time.clock()
    for record in zbl_io.read_zbl_records(open(main_zbl_path, 'r')):
        #update citations:
        if record.has_key("ci"):
            cis = zbl_io.unpack_list_of_dictionaries(record["ci"])
            for ci in cis:
                # NOTE(review): rebinding ci does not change cis; presumably
                # add_citation_identity mutates the dict in place — confirm
                ci = cimatch.add_citation_identity(ci)
            record["ci"] = zbl_io.pack_list_of_dictionaries(cis)
        #write output:
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
        #progress bar:
        if main_counter % 10000 == 0:
            print(time.clock() -
                  start_time), "s - ", main_counter, "processed,", (
                      cimatch.matched), "matched", (cimatch.missed), "missed"
        main_counter = main_counter + 1
    fout.close()

    print "missed=", (cimatch.missed)
    print "matched=", (cimatch.matched)
    # NOTE(review): orphaned script fragment (interior of an unseen __main__
    # block): parses argv, then rewrites record ids with ZblIdMapper.
    # Left byte-identical; only comments added.
    try:
        mapping_path = sys.argv[1]
    except:
        print "First argument expected: mapping-file-path (every line in format: src-id dst-id)"
        sys.exit(-1)
    try:
        in_path = sys.argv[2]
    except:
        print "Second argument expected: input-zbl-file-path (Pseudo-ZBL)"
        sys.exit(-1)
    try:
        out_path = sys.argv[3]
    except:
        print "Third argument expected: output-zbl-file-path (Pseudo-ZBL)"
        sys.exit(-1)

    print "mapping_path =", mapping_path
    print "in_path =", in_path
    print "out_path =", out_path

    id_mapper = ZblIdMapper(mapping_path, False)
    fin = open(in_path, 'r')
    fout = open(out_path, 'w')
    for record in zbl_io.read_zbl_records(fin):
        zbl_io.write_zbl_record(fout, id_mapper.update_record(record))
        fout.write("\n")
    fin.close()
    fout.close()

    id_mapper.print_stats()
def add_field(fin, fout, add_field_name, add_field_value):
    """Adds the constant field add_field_name=add_field_value to each record
    read from fin, then stores the record to fout."""
    record_stream = zbl_io.read_zbl_records(fin)
    for current in record_stream:
        current[add_field_name] = add_field_value
        zbl_io.write_zbl_record(fout, current)
        fout.write("\n")
        # NOTE(review): orphaned script fragment (interior of an unseen
        # __main__ block): citation matching over a Pseudo-ZBL file.
        # Left byte-identical; only comments added.
        print "Second argument expected: output-zbl-file-path (Pseudo-ZBL)"        
        sys.exit(-1)       
        
    print "src = ", main_zbl_path
    print "dst = ", out_path

    cimatch = CitationMatcher(main_zbl_path)
        
    fout = open(out_path, "w")
    main_counter = 0
    start_time = time.clock()
    for record in zbl_io.read_zbl_records( open(main_zbl_path, 'r') ):
        #update citations:
        if record.has_key("ci"):            
            cis = zbl_io.unpack_list_of_dictionaries(record["ci"])                            
            for ci in cis:
                # NOTE(review): rebinding ci leaves cis untouched; presumably
                # add_citation_identity mutates the dict in place — confirm
                ci = cimatch.add_citation_identity(ci)                                     
            record["ci"] = zbl_io.pack_list_of_dictionaries(cis)                                    
        #write output:
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
        #progress bar:
        if main_counter%10000 == 0:
            print (time.clock() - start_time),"s - ",main_counter, "processed,", (cimatch.matched),"matched",(cimatch.missed),"missed"
        main_counter = main_counter + 1         
    fout.close()
    
    print "missed=",(cimatch.missed)
    print "matched=",(cimatch.matched)
    
Example #38
0
    # NOTE(review): orphaned script fragment (interior of an unseen __main__
    # block): mixes matched aux records into the main record stream.
    # Left byte-identical; only comments added.
    zbl_matcher.print_py_report()
    print "-----------------------------------"

    ######################################################################################################
    #mix records from two files:
    start_time = time.clock()
    for main_zbl_record in zbl_io.read_zbl_records(fmain):

        aux_zbl_record = zbl_matcher.match_aux_record(main_zbl_record,
                                                      only_fast_match_methods)
        if not aux_zbl_record is None:  #mix two records:
            main_zbl_record = update_zbl_record(main_zbl_record,
                                                aux_zbl_record, forced_fields)

        #write results:
        zbl_io.write_zbl_record(fout, main_zbl_record)
        fout.write("\n")

        #progress bar:
        main_counter = zbl_matcher.total_processed_records_num()
        matched_counter = zbl_matcher.total_matched_records_num()
        if main_counter % 10000 == 0:
            print(
                time.clock() - start_time
            ), "s - ", main_counter, "processed,", matched_counter, "matched"

    ######################################################################################################
    if append_not_matched_records_flag:
        print zbl_matcher.append_not_matched_records(
            fout), " appended not matched records..."