def _draw_af_hist_(path, records_filter=allow_all_filter):
    """Draws histogram of authorship."""
    af_count = {}  #dict{author: count}

    for i, record in enumerate(zbl_io.read_zbl_records(open(path))):
        if i % 100000 == 0: print i, " records considered"  #progress report
        if "af" not in record or not records_filter(record):
            continue

        afs = zbl_io.unpack_multivalue_field(record["af"])
        afs_ok = list(af for af in afs if af != '-')

        for af in afs_ok:
            af_count[af] = af_count.get(af, 0) + 1

    print len(af_count), " authors found."
    print max(af_count.values()), " = max"
    print min(af_count.values()), " = min"
    avg_af_values = avg(af_count.values())
    print round(avg_af_values, 2), " = avg"
    print round(std(af_count.values()), 2), " = std"
    print sum(1 for af in af_count.values() if af > avg_af_values), " authors above avg"
    print sum(1 for af in af_count.values() if af < avg_af_values), " authors below avg"

    n, bins, patches = plt.hist(af_count.values(), bins=max(af_count.values()),
                                normed=False, log=True, alpha=0.75)
    plt.xlabel("Number of occurrences in records")
    plt.ylabel("Number of authors")
    plt.show()
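These snippets call a few helpers that are not defined in this listing. Below is a minimal sketch of what allow_all_filter, avg and std are assumed to look like, together with the imports the snippets rely on (zbl_io is the project's own record reader); the actual module may implement them differently.

# Sketch of the assumed helpers and imports; not the original module code.
import logging
import matplotlib.pyplot as plt
import zbl_io  # project-specific reader for ZBL record files

def allow_all_filter(record):
    """Default record filter: keeps every record."""
    return True

def avg(values):
    """Arithmetic mean of a non-empty sequence."""
    return float(sum(values)) / len(values)

def std(values):
    """Population standard deviation of a non-empty sequence."""
    m = avg(values)
    return (sum((v - m) ** 2 for v in values) / len(values)) ** 0.5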
def _report_af_quality_(path, records_filter=allow_all_filter):
    """Prints a report about the quality of authors' identities."""
    afs_len = []
    afs_ok_len = []

    for i, record in enumerate(zbl_io.read_zbl_records(open(path))):
        if i % 100000 == 0: print i, " records considered"  #progress report
        if "af" not in record or not records_filter(record):
            continue

        afs = zbl_io.unpack_multivalue_field(record["af"])
        afs_ok = list(af for af in afs if af != '-')

        afs_len.append(len(afs))
        afs_ok_len.append(len(afs_ok))

    # fraction of identified (non-empty) author entries per record
    afs_ok_frac = list(float(m) / float(l) for m, l in zip(afs_ok_len, afs_len))

    # max / avg / std of: all author entries, identified entries, and the
    # identified fraction per record
    print max(afs_len), "\n", round(avg(afs_len), 2), "\n", round(std(afs_len), 2)
    print max(afs_ok_len), "\n", round(avg(afs_ok_len), 2), "\n", round(std(afs_ok_len), 2)
    print round(max(afs_ok_frac), 2), "\n", round(avg(afs_ok_frac), 2), "\n", round(std(afs_ok_frac), 2)
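The doctests in extract_fieldvalue2ids below build records with zbl_io.pack_multivalue_field. A rough sketch of how the pack/unpack pair is assumed to behave, joining and splitting on a single separator character (the real zbl_io may use a different separator or escaping):

# Sketch only: the real zbl_io may use a different separator or escaping.
MULTIVALUE_SEPARATOR = ";"

def pack_multivalue_field(values):
    """Joins a list of values into one multivalue field string."""
    return MULTIVALUE_SEPARATOR.join(values)

def unpack_multivalue_field(field):
    """Splits a multivalue field string back into a list of values."""
    return field.split(MULTIVALUE_SEPARATOR)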
def extract_fieldvalue2ids(zbl_generator, multivalue_field_name="af",
                           empty_value="-", container=list):
    """Returns dictionary{field-value: container-of-ids-of-records-that-have-this-value}.

    >>> r1 = {'an':'1', 'af': zbl_io.pack_multivalue_field(['a1','-','a2']) }
    >>> r2 = {'an':'2', 'af': zbl_io.pack_multivalue_field(['-','a2','a1']) }
    >>> r3 = {'an':'3', 'af': zbl_io.pack_multivalue_field(['a3', '-']) }
    >>> r4 = {'an':'4', 'af': zbl_io.pack_multivalue_field(['a3', '-', 'a2'])}
    >>> sorted(list(extract_fieldvalue2ids([r1,r2,r3]).iteritems()))
    [('a1', ['1', '2']), ('a2', ['1', '2']), ('a3', ['3'])]
    >>> sorted(list(extract_fieldvalue2ids([r1,r2,r3,r4]).iteritems()))
    [('a1', ['1', '2']), ('a2', ['1', '2', '4']), ('a3', ['3', '4'])]
    """
    af2ids = {}
    skipped = 0  # records without the requested field
    novals = 0   # records whose field holds only empty values
    for i, zbl in enumerate(zbl_generator):
        if i % 10000 == 0:
            logging.info("[extract_fieldvalue2ids] " + str(i) + " records processed")
        if not multivalue_field_name in zbl:
            skipped = skipped + 1
            continue
        zbl_id = zbl[zbl_io.ZBL_ID_FIELD]
        afs = zbl_io.unpack_multivalue_field(zbl[multivalue_field_name])
        afs_ok = list(af for af in afs if af != empty_value)
        if len(afs_ok) == 0: novals = novals + 1
        for af in afs_ok:
            af2ids[af] = af2ids.get(af, []) + [zbl_id]
    logging.info("[extract_fieldvalue2ids] " + str(i) + " records processed")
    logging.info("[extract_fieldvalue2ids] " + str(skipped) + " records skipped")
    logging.info("[extract_fieldvalue2ids] " + str(novals) + " records with only empty values in field")
    fv2ids = dict((value, container(ids)) for value, ids in af2ids.iteritems())
    logging.info("[extract_fieldvalue2ids] " + str(len(fv2ids)) + " authors found.")
    return fv2ids
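A hedged usage sketch for extract_fieldvalue2ids, with 'zbl.dat' as a placeholder path: stream records straight from a ZBL file and keep the ids in sets instead of lists.

# Hypothetical usage; 'zbl.dat' is a placeholder path.
author2ids = extract_fieldvalue2ids(zbl_io.read_zbl_records(open('zbl.dat')),
                                    multivalue_field_name="af",
                                    container=set)
print len(author2ids), "distinct authors"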
def update(self, zbl_generator,
           msc_predicate=lambda msc: MSC_ORDINARY_LEAF_PATTERN_RE.match(msc)):
    logging.info("[MscModel.update] building msc2lists")
    for zbl in zbl_generator:
        msc_codes = zbl_io.unpack_multivalue_field(zbl['mc'])
        zbl_id = zbl[zbl_io.ZBL_ID_FIELD]
        _update_(self.msc2zblidlist, msc_codes, zbl_id, msc_predicate)          # all codes
        _update_(self.mscprim2zblidlist, msc_codes[:1], zbl_id, msc_predicate)  # primary code only
        _update_(self.mscsec2zblidlist, msc_codes[1:], zbl_id, msc_predicate)   # secondary codes
    self._update_counts_()
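update relies on an _update_ helper that is not shown in this listing. A plausible sketch, inferred from the call sites (append the record id under every code accepted by the predicate); the actual helper may differ:

# Sketch of the assumed _update_ helper, not the original implementation.
def _update_(code2zblidlist, msc_codes, zbl_id, msc_predicate):
    """Appends zbl_id to the id list of every code accepted by msc_predicate."""
    for msc in msc_codes:
        if not msc_predicate(msc):
            continue
        code2zblidlist.setdefault(msc, []).append(zbl_id)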
def count_msc_occurences(file, records_filter=lambda x: True, field_name="mc"):
    """Counts number of occurrences of MSC codes in ZBL file.
        
    Returns dictionary{code_name: count}"""
    counts = {}
    for record in zbl_io.read_zbl_records(file):
        if not records_filter(record) or field_name not in record:
            continue
        codes = zbl_io.unpack_multivalue_field(record[field_name])
        for code in codes:
            counts[code] = counts.get(code, 0) + 1
    return counts
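A hedged usage sketch for count_msc_occurences, with 'zbl.dat' as a placeholder path: count all MSC codes in a file and print the ten most frequent ones.

# Hypothetical usage; 'zbl.dat' is a placeholder path.
counts = count_msc_occurences(open('zbl.dat'))
top10 = sorted(counts.items(), key=lambda item: item[1], reverse=True)[:10]
for code, n in top10:
    print code, n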
def group_zbl_by_msc(zbl_generator,
                     msc_primary_predicate=lambda mscprim: MSC_ORDINARY_LEAF_PATTERN_RE.match(mscprim),
                     zbl_extract=lambda zbl: zbl[zbl_io.ZBL_ID_FIELD]):
    """Returns dictionary{msc-primary-code: list-of-values-extracted-by-zbl_extract(zbl)}.

    Records whose primary MSC code fails msc_primary_predicate are skipped.
    """
    msc2zbllist = {}
    for zbl in zbl_generator:
        msc_primary_code = zbl_io.unpack_multivalue_field(zbl['mc'])[0]  #first code is primary
        if not msc_primary_predicate(msc_primary_code): continue
        zbllist = msc2zbllist.get(msc_primary_code, [])
        zbllist.append(zbl_extract(zbl))
        msc2zbllist[msc_primary_code] = zbllist
    return msc2zbllist
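A hedged usage sketch for group_zbl_by_msc, with 'zbl.dat' as a placeholder path: group record ids by primary MSC code and report the size of each group (records without an 'mc' field are filtered out first, since the function indexes zbl['mc'] directly).

# Hypothetical usage; 'zbl.dat' is a placeholder path.
records = (zbl for zbl in zbl_io.read_zbl_records(open('zbl.dat')) if 'mc' in zbl)
groups = group_zbl_by_msc(records)
for msc, ids in sorted(groups.iteritems()):
    print msc, len(ids)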
def _draw_mc_hist(path, records_filter=allow_all_filter):
    """Draws histogram of MSC codes occurrence in records."""
    mc_counts = []

    for i, record in enumerate(zbl_io.read_zbl_records(open(path))):
        if i % 100000 == 0: print i, " records considered"  #progress report
        if "mc" not in record or not records_filter(record):
            continue

        mc = zbl_io.unpack_multivalue_field(record["mc"])
        mc_counts.append(len(mc))

    print len(mc_counts), " records found."
    print max(mc_counts), " = max"
    print min(mc_counts), " = min"
    print round(avg(mc_counts), 2), " = avg"
    print round(std(mc_counts), 2), " = std"
    n, bins, patches = plt.hist(mc_counts, bins=max(mc_counts), normed=False, alpha=0.75)
    plt.xlabel("Number of MSC codes per record")
    plt.ylabel("Number of records")
    plt.show()
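A hedged driver sketch, with 'zbl.dat' as a placeholder path: print the author-field report and draw both histograms for a single ZBL file.

# Hypothetical driver; 'zbl.dat' is a placeholder path.
_report_af_quality_('zbl.dat')
_draw_af_hist_('zbl.dat')
_draw_mc_hist('zbl.dat')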
def _update_docmodel_(zbl):
    """Records the primary and secondary MSC codes of one record in the
    surrounding docid2primcode / docid2seccodes dictionaries."""
    if not "mc" in zbl: return
    docid = zbl[zbl_io.ZBL_ID_FIELD]
    msc_codes = zbl_io.unpack_multivalue_field(zbl['mc'])
    docid2primcode[docid] = msc_codes[0]   #first code is primary
    docid2seccodes[docid] = msc_codes[1:]  #remaining codes are secondary
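_update_docmodel_ writes into dictionaries defined outside this snippet. A sketch of the presumed surrounding state and driver loop (not the original code; 'zbl.dat' is a placeholder path):

# Presumed surrounding state and driver loop; sketch only.
docid2primcode = {}  #dict{record id: primary MSC code}
docid2seccodes = {}  #dict{record id: list of secondary MSC codes}
for zbl in zbl_io.read_zbl_records(open('zbl.dat')):
    _update_docmodel_(zbl)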
    except:
        print "First argument expected: zbl-file-path."
        sys.exit(-1)
    #l - low level, m - medium level, h - highest level of MSC tree
    print "MIN_COUNT_MSC=", MIN_COUNT_MSC
    print "NUM_TRIES=", NUM_TRIES
    print "VALID_LEAF_PATTERN_RE=", VALID_LEAF_PATTERN_RE
    print "must_have_fields=", must_have_fields
    print "zbl_path=", zbl_path

    print "Building list of msc-primary-codes that should be considered..."
    start = time.clock()
    msc2count = {}
    for i, zbl in enumerate(_get_zbl_generator_(zbl_path, must_have_fields)):
        if i % 10000 == 0: print "", i, "records processed in", (time.clock() - start), "s ->", sum(msc2count.values()), "kept"
        msc_codes = [zbl_io.unpack_multivalue_field(zbl['mc'])[0]]  #only the primary code
        for msc in msc_codes:
            #print msc,"->",(not VALID_LEAF_PATTERN_RE.match(msc) is None)
            if not VALID_LEAF_PATTERN_RE.match(msc) is None:
                msc2count[msc] = msc2count.get(msc, 0) + 1
    print "Filtering with MIN_COUNT_MSC:", MIN_COUNT_MSC, "out of", sum(msc2count.values())
    msc2count = dict((msc, count) for msc, count in msc2count.iteritems() if count >= MIN_COUNT_MSC)
    print "Building mapping msc2ix"
    msc2ix = dict((msc, ix) for ix, msc in enumerate(msc2count))
    ix2msc = dict((ix, msc) for msc, ix in msc2ix.iteritems())
    leaves = list(msc2ix)
    num_leaves = len(leaves)
    print "Building MSC tree out of", num_leaves, "leaves"
    msc_tree = trees.build_msctree(msc2ix.keys(), msc2ix)
    #print str(trees.map_tree_leaves(msc_tree, ix2msc))[:400]