def append_file(fin, fout, fappend):
    """Copies all records from fin and fappend to fout.
    
    Returns number of all copied records."""
    counter = 0
    for record in zbl_io.read_zbl_records(fin):
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
        counter = counter + 1
    for record in zbl_io.read_zbl_records(fappend):
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
        counter = counter + 1
    return counter
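A minimal usage sketch for append_file, assuming it runs in the module that defines it; the file names are hypothetical and zbl_io is the project's own I/O module:

import zbl_io  # the project's Pseudo-ZBL I/O helpers; assumed importable

fin = open("main.zbl", "r")       # hypothetical input file
fappend = open("extra.zbl", "r")  # hypothetical file to append
fout = open("merged.zbl", "w")    # hypothetical output file
print append_file(fin, fout, fappend), "records copied"
fout.close()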
def gensim_mapfields_dict_file(fin,
                               fout,
                               fields,
                               filter_by_fields,
                               dictionary,
                               dst_field,
                               dbg_field_name="g_"):
    """For every records from ZBL-fin-stream that have filter_by_fields 
     fields are merged, mapped with gensim dictionary and stored in dst-field.

       Returns number of processed records."""
    logging.info("[gensim_mapfields_dict_file] filter_by_fields="+str(filter_by_fields)+\
    " fields="+str(fields)+" dictionary="+str(dictionary)+" fin="+str(fin)+" dst_field="+str(dst_field))
    id2token = dict(
        (idx, token) for idx, token in
        dictionary.iteritems())  # this line is for debugging purposes
    counter = 0
    for i, record in enumerate(zbl_io.read_zbl_records(fin)):
        if i % 10000 == 0:
            logging.info("[gensim_mapfields_dict_file] " + str(i) +
                         " records processed")
        record = gensim_mapfields_dict(record, fields, filter_by_fields,
                                       dictionary, dst_field, id2token,
                                       dbg_field_name)
        if dst_field in record:
            counter = counter + 1
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return counter
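A hedged usage sketch: load a previously built gensim dictionary and map the merged "ti"/"ab" fields of every record into a "gw" field (gensim must be installed; all paths and field names here are assumptions, not part of the original code):

from gensim import corpora  # assumes gensim is installed

dictionary = corpora.Dictionary.load("zbl.dict")  # hypothetical dictionary file
fin = open("main.zbl", "r")
fout = open("mapped.zbl", "w")
print gensim_mapfields_dict_file(fin, fout, ["ti", "ab"], ["ti", "ab"], dictionary, "gw"), "records mapped"
fout.close()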
def copy_field(fin, fout, src_field, dst_field):
    """In every record from fin copies field src_field to field dst_field and stores record to fout."""
    for record in zbl_io.read_zbl_records(fin):
        if record.has_key(src_field):
            record[dst_field] = record[src_field]
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    def add_citation_identity(self, ci, only_fast_match_methods=True):
        """According to records in ZBL file (self.main_zbl_path) and id-maps (self.mr_to_id_map, self.zbl_to_id_map)
            tries to assign identity (<an> field) to citation (given as a dictionary)."""
        self.__match_identity_on_id__(ci)

        if ci.has_key(zbl_io.ZBL_ID_FIELD):
            #print "Assigning to citation [ID/ZBL/MR]:", ci[zbl_io.ZBL_ID_FIELD]
            self.matched = self.matched + 1
            return ci
        elif only_fast_match_methods:
            self.missed = self.missed + 1
            return ci

        candidates = []
        f = open(self.main_zbl_path, 'r')
        for record in zbl_io.read_zbl_records(f):
            if ci.has_key("py") and record.has_key("py"):
                if ci["py"] != record["py"]:
                    continue
            if self.similarity_operator(record, ci):
                candidates.append(record)
        f.close()

        if len(candidates) == 0:
            self.missed = self.missed + 1
            return ci

        matching_record = zbl_similarity.select_best_fitting_record(
            ci, candidates, self.selection_fields)
        ci[zbl_io.ZBL_ID_FIELD] = matching_record[zbl_io.ZBL_ID_FIELD]
        #print "Assigning to citation [SIMILARITY]:", ci[zbl_io.ZBL_ID_FIELD]
        self.matched = self.matched + 1
        return ci
def merge_fields(fin, fout, src_fields, dst_field, separator = " "):
    """In every record from fin merges fields from src_field to field dst_field and stores record to fout."""
    for record in zbl_io.read_zbl_records(fin):
        try:
            dst_val = reduce(lambda a,b: a+separator+b, (record[src_field] for src_field in src_fields if src_field in record) )
            record[dst_field] = dst_val
        except:
            print "[merge_fields] Failed merging in record an=", record[zbl_io.ZBL_ID_FIELD]
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
def extract_field_value(fin, fout, field_name):
    """Extracts to fout value of a field of field_name.    
    
    Returns number of found fields.
    """
    counter = 0
    for record in zbl_io.read_zbl_records(fin):
        if record.has_key(field_name):
            fout.write(str(record[field_name]))
            fout.write("\n")
            counter = counter + 1
    return counter
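Usage sketch: dump every title ("ti") value to a plain text file, one value per line (paths are hypothetical):

fin = open("main.zbl", "r")
fout = open("titles.txt", "w")
print extract_field_value(fin, fout, "ti"), "title fields found"
fout.close()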
def filter_records(fin, fout, bad_ids_file):
    """Copies records from fin to fout. Filters out records of ids contained in file bad_ids_file (path).
    
    Returns set of skipped (filtered out) ids."""
    filter_ids = set(line.strip() for line in open(bad_ids_file).xreadlines())
    skipped_ids = set()        
    for record in zbl_io.read_zbl_records(fin):
        if record[zbl_io.ZBL_ID_FIELD] in filter_ids:
            skipped_ids.add(record[zbl_io.ZBL_ID_FIELD])
            continue
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return skipped_ids
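Usage sketch, assuming bad_ids.txt holds one record id per line (paths are hypothetical):

fin = open("main.zbl", "r")
fout = open("clean.zbl", "w")
skipped = filter_records(fin, fout, "bad_ids.txt")
print len(skipped), "records filtered out"
fout.close()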
def keep_authors(fin, fout):
    """Copies all records from fin to fout. 
    
    
    Removes all fields apart from an, au, ai.
    Returns number of all copied records.
    """
    counter = 0
    for record in zbl_io.read_zbl_records(fin):
        zbl_io.write_zbl_record(fout, record_keep_authors(record))
        fout.write("\n")    
        counter = counter + 1 
    return counter
def keep_records_ids(fin, fout, keep_ids_file):
    """Copies records from fin to fout. Keeps only those records of ids contained in file keep_ids_file (path).
    
    Returns set of kept ids."""
    filter_ids = set(line.strip() for line in open(keep_ids_file).xreadlines())
    print len(filter_ids), "on the 'keep-ids' list"
    kept_ids = set()
    for record in zbl_io.read_zbl_records(fin):
        if not record[zbl_io.ZBL_ID_FIELD] in filter_ids: continue
        kept_ids.add(record[zbl_io.ZBL_ID_FIELD])
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return kept_ids
def keep_records(fin, fout, must_have_fields):
    """Copies records from fin to fout. 
    
    Keeps only these records that have all fields from must_have_fields list.
    """
    kept_counter = 0
    for i,record in enumerate(zbl_io.read_zbl_records(fin)):
        if i%10000 == 0: print "[keep_records]", i,"processed", kept_counter, "kept"
        if has_all_fields(record, must_have_fields):
            zbl_io.write_zbl_record(fout, record)
            fout.write("\n")  
            kept_counter = kept_counter + 1  
    return kept_counter
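Usage sketch: keep only records that carry both a title and an abstract; "ti" and "ab" follow the field conventions used elsewhere in this file, and the paths are hypothetical:

fin = open("main.zbl", "r")
fout = open("ti_ab_only.zbl", "w")
print keep_records(fin, fout, ["ti", "ab"]), "records kept"
fout.close()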
def filter_fields_vals(fin, fout, list_of_fields, text_filter = text_filter_lower_space, word_predicate = def_word_predicate):
    """Copies records from fin to fout and for fields on list_of_fields filters its' values."""
    logging.info("[filter_fields_vals] text_filter="+str(text_filter)+" word_predicate="+str(word_predicate))
    for record in zbl_io.read_zbl_records(fin):
        for field in list_of_fields:   
            if record.has_key(field):         
                try:   
                    record[field] = words_filter(text_filter(record[field]), word_predicate)
                except:
                    logging.warn("Removing field in an="+str(record[zbl_io.ZBL_ID_FIELD])+" (is field empty?):"+field+" = "+record[field])
                    record.pop(field)             
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
def calc_msc2count(fin, src_field='mc'):
    """Returns msc2counts dictionary."""
    msc2count = {}
    for i,record in enumerate(zbl_io.read_zbl_records(fin)):
        if i%10000 == 0: logging.info("[calc_msc2count] "+str(i)+" records processed")
        if not src_field in record: continue
        
        msccodes = zbl_io.unpack_multivalue_field(record[src_field])
        for msc in msccodes:
            msc2count[msc] = msc2count.get(msc, 0) + 1
        
        #zbl_io.write_zbl_record(fout, record)
        #fout.write("\n")            
    return msc2count  
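Usage sketch: report the ten most frequent MSC codes (path hypothetical):

msc2count = calc_msc2count(open("main.zbl", "r"))
for msc, c in sorted(msc2count.iteritems(), key=lambda kv: -kv[1])[:10]:
    print msc, c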
def gen_record(fname, filtered_by, uni=False):
    """
    Returns records that contain fields specified in filtered_by
    
    """
    if type(fname) == file:
        ff = fname
    elif uni:
        ff = codecs.open(fname, "r", encoding="utf-8")
    else:
        ff = open(fname, "r")

    for r in read_zbl_records(ff, uni):
        if reduce(lambda x, y: x and y, map(lambda f: f in r, filtered_by)):
            yield r
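Usage sketch: iterate only over records that have both an id and a title (path hypothetical):

for r in gen_record("main.zbl", ["an", "ti"]):
    print r["an"], "-", r["ti"]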
def filter_duplicates(fin, fout):
    """Copies records from fin to fout. Records with duplicated id are filtered out.
    
    Returns set of duplicated ids."""
    ids = set()
    duplicated_ids = set()
    for record in zbl_io.read_zbl_records(fin):
        id = fix_id(record[zbl_io.ZBL_ID_FIELD])        
        if id in ids:
            duplicated_ids.add(id)
            continue        
        ids.add(id)
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")    
    return duplicated_ids
def filter_field(fin, fout, field_name):
    """Copies records from fin to fout but keeping only id and field of field_name.    
    
    Returns number of found fields.
    """
    counter = 0
    for record in zbl_io.read_zbl_records(fin):
        newrec = {}
        newrec[zbl_io.ZBL_ID_FIELD] = record[zbl_io.ZBL_ID_FIELD]
        if record.has_key(field_name):
            newrec[field_name] = record[field_name]            
            counter = counter + 1
        zbl_io.write_zbl_record(fout, newrec)
        fout.write("\n")            
    return counter    
def filter_af(fin, fout):
    """Copies records from fin to fout but also removes from records empty (only "-" values) af fields.
    
    Returns number of removed fields.
    """
    counter = 0
    for record in zbl_io.read_zbl_records(fin):
        if record.has_key("af"):
            af = zbl_io.unpack_multivalue_field(record["af"])
            empty = sum(1 for a in af if a == '-') == len(af)
            if empty:
                record.pop("af")
                counter = counter + 1
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")            
    return counter
def calc_msc_membership(fin, fout, known_msc_codes, \
                        src_field='mc', dst_field='m0', \
                        re_leaf_pattern=msc_processing.MSC_ORDINARY_LEAF_PATTERN_RE, dbg_field='m_'):
    """Updates records with additional dst_field with membership vector calculated basing on msc codes read from src_field."""
    msccodes = filter_msccodes(known_msc_codes, re_leaf_pattern)
    msc2ix = calc_msc2ix(msccodes)
    ix2msc = dict((ix, msc) for msc, ix in msc2ix.iteritems())
    prefix2msc = group_by_prefix(msccodes)

    counter = 0
    for i, record in enumerate(zbl_io.read_zbl_records(fin)):
        if i % 10000 == 0:
            logging.info("[calc_msc_membership] " + str(i) +
                         " records processed. " + str(counter) + "updated.")
        if src_field in record:
            record_msccodes = zbl_io.unpack_multivalue_field(record[src_field])
            record_msccodes = filter_msccodes(record_msccodes, re_leaf_pattern)

            compared_codes = set()  # consider only codes whose prefix fragments match the record's codes
            for record_msccode in record_msccodes:
                prefix2 = record_msccode[:2]
                prefix3 = record_msccode[:3]
                compared_codes.update(prefix2msc[prefix2])
                compared_codes.update(prefix2msc[prefix3])

            mscmembership = []
            for compared_code in compared_codes:
                membership = msccode_membership(record_msccodes, compared_code)
                mscmembership.append((msc2ix[compared_code], membership))

            if len(mscmembership) > 0:  # save the results
                mscmembership = sorted(set(mscmembership))
                record[dst_field] = zbl_io.pack_listpairs_field(mscmembership)
                record[dbg_field] = zbl_io.pack_listpairs_field([
                    (ix2msc[ix], m) for ix, m in mscmembership
                ])
                counter = counter + 1

        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return counter
def gensim_mapfield_model(fin, fout, model, src_field, dst_field,\
                          src_field_value_extractor=extract_bag_of_ids, dst_field_value_builder=zbl_io.pack_listpairs_field):
    """For every records from ZBL-fin-stream that have src_field its content 
    is interpreted as gensim-bag-of-ids(with weights/counts/values) and transformed using model (results are stored into dst_field).

    Returns number of enriched records."""
    logging.info("[gensim_mapfield_model] src_field="+str(src_field)+\
     " model="+str(model)+" fin="+str(fin)+" dst_field="+str(dst_field)+\
     " src_field_value_extractor="+str(src_field_value_extractor)+" dst_field_value_builder="+str(dst_field_value_builder))
    counter = 0
    for i,record in enumerate(zbl_io.read_zbl_records(fin)):                            
        if i%10000 == 0: logging.info("[gensim_mapfield_model] "+str(i)+" documents mapped...")
        if src_field in record:
            bag_of_ids = src_field_value_extractor(record[src_field])
            tfidf_values = model[bag_of_ids]
            record[dst_field] = dst_field_value_builder(tfidf_values)
            logging.debug("[gensim_mapfield_model]"+record[src_field]+" -> "+record[dst_field])
            counter = counter + 1    
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return counter
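A hedged end-to-end sketch: train a TF-IDF model on the bags stored in a "gw" field (via id_bags_generator, defined further down in this file) and write the weighted bags into a "tw" field; gensim must be installed, and the paths and field names are assumptions:

from gensim import models  # assumes gensim is installed

corpus = list(id_bags_generator(open("mapped.zbl", "r"), "gw"))
tfidf = models.TfidfModel(corpus)
fin = open("mapped.zbl", "r")
fout = open("tfidf.zbl", "w")
print gensim_mapfield_model(fin, fout, tfidf, "gw", "tw"), "records enriched"
fout.close()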
def count_categories(records, filter_categories):
    """Returns a dictionary mapping each category (cc) code to its number of occurrences in records."""
    count = {}
    for record in records:
        if not record.has_key("cc"): continue
        ccstr = record["cc"].strip().replace("*", "")
        ccs = filter_categories(cc.strip() for cc in ccstr.split(" "))
        for cc in ccs:
            if count.has_key(cc):
                count[cc] = count[cc] + 1
            else:
                count[cc] = 1
    return count

if __name__ == "__main__":

    args = sys.argv
    if len(sys.argv) != 3:
        print "[ERROR] Exactly two arguments are expected: input-zbl-file-path output-count-path-prefix"
        exit(-1)
    zblInPath       = args[1]
    statsOutPath    = args[2]


    # count category occurrences
    zbl_src = zbl_io.read_zbl_records(open(zblInPath))
    count = count_categories(zbl_src, filter_XXY_categories)

    # write counts to file
    #f = open(statsOutPath, "w")
    #for cc in count:
    #    f.write(cc+" "+str(count[cc])+"\n")
    #f.close()
    io.fwrite_vector(statsOutPath+"_labels.svector", count.keys())
    io.fwrite_vector(statsOutPath+"_count.ivector", count.values())

if __name__ == "__main__":
    try:
        main_zbl_path = sys.argv[1]
    except:
        print "First argument expected: input-zbl-file-path (Pseudo-ZBL)"
        sys.exit(-1)
    try:
        out_path = sys.argv[2]
    except:
        print "Second argument expected: output-zbl-file-path (Pseudo-ZBL)"        
        sys.exit(-1)       
        
    print "src = ", main_zbl_path
    print "dst = ", out_path

    cimatch = CitationMatcher(main_zbl_path)
        
    fout = open(out_path, "w")
    main_counter = 0
    start_time = time.clock()
    for record in zbl_io.read_zbl_records( open(main_zbl_path, 'r') ):
        #update citations:
        if record.has_key("ci"):            
            cis = zbl_io.unpack_list_of_dictionaries(record["ci"])                            
            for ci in cis:
                ci = cimatch.add_citation_identity(ci)                                     
            record["ci"] = zbl_io.pack_list_of_dictionaries(cis)                                    
        #write output:
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
        #progress bar:
        if main_counter%10000 == 0:
            print (time.clock() - start_time),"s - ",main_counter, "processed,", (cimatch.matched),"matched",(cimatch.missed),"missed"
        main_counter = main_counter + 1         
    fout.close()
    
    print len(zbl_matcher.aux_zbl_recs_list), "zbl records loaded."

    print "Opening main file =", main_zbl_path
    fmain = open(main_zbl_path, 'r')

    print "Opening output file =", out_path
    fout = open(out_path, 'w')

    print "-----------------------------------"
    zbl_matcher.print_py_report()
    print "-----------------------------------"

    ######################################################################################################
    #mix records from two files:
    start_time = time.clock()
    for main_zbl_record in zbl_io.read_zbl_records(fmain):

        aux_zbl_record = zbl_matcher.match_aux_record(main_zbl_record,
                                                      only_fast_match_methods)
        if not aux_zbl_record is None:  #mix two records:
            main_zbl_record = update_zbl_record(main_zbl_record,
                                                aux_zbl_record, forced_fields)

        #write results:
        zbl_io.write_zbl_record(fout, main_zbl_record)
        fout.write("\n")

        #progress bar:
        main_counter = zbl_matcher.total_processed_records_num()
        matched_counter = zbl_matcher.total_matched_records_num()
        if main_counter % 10000 == 0:
            print (time.clock() - start_time), "s - ", main_counter, "processed,", matched_counter, "matched"
if __name__ == "__main__":
    print "The program splits single ZBL file into several files of required size."

    try:
        in_path = sys.argv[1]
    except:
        print "First argument expected: source-file"
        sys.exit(-1)
    try:
        part_size = int(sys.argv[2])
    except:
        print "Second argument expected: number of records per output file"
        sys.exit(-1)

    print "Source file:", in_path
    print "Records per file:", part_size

    part_counter = 0
    part_records_counter = 0
    fout = open(in_path + ".part" + str(part_counter), "w")
    for record in zbl_io.read_zbl_records(open(in_path, "r")):
        if part_records_counter >= part_size:
            print part_records_counter, "records stored to file", fout.name
            fout.close()
            part_counter = part_counter + 1
            part_records_counter = 0
            fout = open(in_path + ".part" + str(part_counter), "w")
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
        part_records_counter = part_records_counter + 1
    print part_records_counter, "records stored to file", fout.name
    fout.close()
def add_field(fin, fout, add_field_name, add_field_value):
    """To every record from fin adds field (add_field_value:add_field_name) and stores record to fout."""
    for record in zbl_io.read_zbl_records(fin):
        record[add_field_name] = add_field_value            
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
def id_bags_generator(fin, src_field, value_extractor=extract_bag_of_ids):
    """Returns generator that generates gensim-bags-of-ids/tfidfs (read from src_field) from ZBL-file fin."""
    return (value_extractor(record[src_field])
            for record in zbl_io.read_zbl_records(fin) if src_field in record)
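Usage sketch: materialize such a corpus and feed it to a gensim model, e.g. LSI (gensim must be installed; the path and field name are assumptions):

from gensim import models  # assumes gensim is installed

corpus = list(id_bags_generator(open("tfidf.zbl", "r"), "tw"))
lsi = models.LsiModel(corpus, num_topics=100)
print lsi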
if __name__ == "__main__":
    print "The program splits single ZBL file into several files of required size."
    
    try:
        in_path = sys.argv[1]
    except:
        print "First argument expected: source-file"        
        sys.exit(-1)
    try:
        part_size = int(sys.argv[2])
    except:
        print "Second argument expected: number of records per output file"        
        sys.exit(-1)        
        
    print "Source file:", in_path
    print "Records per file:", part_size
        
    part_counter = 0;        
    part_records_counter = 0
    fout = open(in_path+".part"+str(part_counter), "w")
    for record in zbl_io.read_zbl_records( open(in_path, "r") ):
        part_records_counter = part_records_counter + 1
        if part_records_counter >= part_size:
            print part_records_counter,"records stored to file", fout
            part_counter = part_counter + 1
            part_records_counter = 0            
            fout = open(in_path+".part"+str(part_counter), "w")
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    print part_records_counter,"records stored to file", fout
    
        ccstr = record["cc"].strip().replace("*", "")
        ccs = filter_categories(cc.strip() for cc in ccstr.split(" "))
        for cc in ccs:
            if count.has_key(cc):
                count[cc] = count[cc] + 1
            else:
                count[cc] = 1
    return count


if __name__ == "__main__":

    args = sys.argv
    if len(sys.argv) != 3:
        print "[ERROR] Exactly two arguments are expected: input-zbl-file-path output-count-path-prefix"
        exit(-1)
    zblInPath = args[1]
    statsOutPath = args[2]

    # zliczenie
    zbl_src = zbl_io.read_zbl_records(open(zblInPath))
    count = count_categories(zbl_src, filter_XXY_categories)

    # zapis do pliku
    # f = open(statsOutPath, "w")
    # for cc in count:
    #    f.write(cc+" "+str(count[cc])+"\n")
    # f.close()
    io.fwrite_vector(statsOutPath + "_labels.svector", count.keys())
    io.fwrite_vector(statsOutPath + "_count.ivector", count.values())
'''
@author: mlukasik

Find out the differences between 2 data sets
'''
from record_store import store_py_records, store_txt_records
from zbl_io import load_zbl_file, read_zbl_records
import sys
from collections import defaultdict

fname1 = sys.argv[1]
fname2 = sys.argv[2]
print "loading records1"

records1 = defaultdict(lambda: {})
records1_cnt = 0
for rec1 in read_zbl_records(open(fname1, 'r')):
    try:
        records1[rec1['an']][rec1['ti']] = rec1
        records1_cnt += 1
    except:
        pass

print "loaded records1", len(records1), "all of them:", records1_cnt

print "going through records2"
not_in_rec1 = 0
records2_len = 0
for rec2 in read_zbl_records(open(fname2, 'r')):
    if rec2['an'] not in records1 or rec2['ti'] not in records1[rec2['an']]:
        #print "record not in records1!"
        print rec2
if __name__ == "__main__":
    try:
        mapping_path = sys.argv[1]
    except:
        print "First argument expected: mapping-file-path (every line in format: src-id dst-id)"
        sys.exit(-1)
    try:
        in_path = sys.argv[2]
    except:
        print "Second argument expected: input-zbl-file-path (Pseudo-ZBL)"
        sys.exit(-1)
    try:
        out_path = sys.argv[3]
    except:
        print "Third argument expected: output-zbl-file-path (Pseudo-ZBL)"
        sys.exit(-1)

    print "mapping_path =", mapping_path
    print "in_path =", in_path
    print "out_path =", out_path

    id_mapper = ZblIdMapper(mapping_path, False)
    fin = open(in_path, 'r')
    fout = open(out_path, 'w')
    for record in zbl_io.read_zbl_records(fin):
        zbl_io.write_zbl_record(fout, id_mapper.update_record(record))
        fout.write("\n")
    fin.close()
    fout.close()

    id_mapper.print_stats()