Python read_zbl_recordsの例、data_io.zbl_io.read_zbl_records Pythonの例

コード例 #1

0

ファイルを表示

ファイル: zbl_analysis.py プロジェクト: pszostek/research-python-backup

def _report_ci_(path, records_filter = allow_all_filter, \
                ci_dst_records_filter = allow_all_filter, \
                uq_id_field_name = zbl_io.ZBL_ID_FIELD):
    """Prints report about citations in ZBL file.
    
    records_filter(record) - should return True if record is admitted
    ci_dst_records_filter(record) -  should return True if record that citation is pointing at is admitted
    uq_id_field_name - name of a field that uniquely identifies record 
    """
    #wczytywanie zbioru na ktory moga wskazywac cytowania:
    print "Loading ids of records that may be citation destination."
    dst_records_ids = set()
    for i,record in enumerate( zbl_io.read_zbl_records(open(path)) ):
        if i%100000 == 0: print i," records considered" #progress bar
        if record.has_key(uq_id_field_name) and ci_dst_records_filter(record):
            dst_records_ids.add(record[uq_id_field_name])            
    print "Done.", len(dst_records_ids), " records loaded."
    
    #statystyki:
    cis_len = [] #liczba cytowan
    cis_matched = [] #liczba cytowan ktore trafiaja w zadany zbior 
    
    for i,record in enumerate(zbl_io.read_zbl_records(open(path))):
        if i%100000 == 0: print i," records considered" #progress bar
        if not record.has_key("ci") or not records_filter(record):
            continue
                
        cis                 = zbl_io.unpack_list_of_dictionaries(record["ci"])
        #identyfikatory cytowan:
        identified_ci_ids   = list(ci[uq_id_field_name] for ci in cis if ci.has_key(uq_id_field_name))
        #rekordy dopsowane do cytowan i w zadanym zbiorze:
        filtered_matched_records = list(id for id in identified_ci_ids if id in dst_records_ids)
                                         
        cis_len.append(len(cis))
        cis_matched.append(len(filtered_matched_records))
      
    cis_matched_div_len = list( float(m)/float(l) for m,l in zip(cis_matched, cis_len) )
        
    print "Citation statistics (only on records with citations) [total, min avg max std]: "
    print "-Number of citations :", "\t", round(sum(cis_len),0), "\t", round(min(cis_len),0), "\t", round(avg(cis_len),2), "\t", round(max(cis_len),0), "\t", round(std(cis_len),2)
    print "-Matching citations:",  "\t", round(sum(cis_matched),0), "\t", round(min(cis_matched),0), "\t", round(avg(cis_matched),2), "\t", round(max(cis_matched),0), "\t", round(std(cis_matched),2)
    print "-Fraction of matching citations: - ",  "\t", round(min(cis_matched_div_len),3), "\t", round(avg(cis_matched_div_len),3), "\t", round(max(cis_matched_div_len),3), "\t", round(std(cis_matched_div_len),3)
    print "-Total Number of citations/Matching citations:", "\t", round(float(sum(cis_matched))/sum(cis_len),3)    
    print "->", round(sum(cis_len),0), (max(cis_len)), round(avg(cis_len),2), round(std(cis_len),2), \
     round(sum(cis_matched),0), (max(cis_matched)), round(avg(cis_matched),2), round(std(cis_matched),2), \
      round(max(cis_matched_div_len),3), round(avg(cis_matched_div_len),3), round(std(cis_matched_div_len),3) 
    
    cis_matched_hist = {}
    for i in xrange(0, max(cis_matched)+1):
        cis_matched_hist[i] = sum(1 for c in cis_matched if c==i)
    print "Histogram:", cis_matched_hist
    
    n, bins, patches = plt.hist(sorted(cis_matched), bins = max(cis_matched), normed=False, alpha=0.75)
    plt.xlabel("Liczba dopasowanych cytowan")
    plt.ylabel("Liczba rekordow")    
    plt.show()

コード例 #2

0

ファイルを表示

def count_ids(fin):
    """
    Counts records, distinct 'an' identifiers and records that have an 'au' field.

    Returns a tuple (number of records, number of distinct r['an'] values,
    number of records containing 'au').
    Raises KeyError if a record has no 'an' field.
    """
    distinct_ans = set()
    with_au = 0
    total = 0
    # Records are streamed; nothing here needs the whole file in memory.
    for r in read_zbl_records(fin):
        total += 1
        if 'au' in r:
            with_au += 1
        distinct_ans.add(r['an'])
    return total, len(distinct_ans), with_au

コード例 #3

0

ファイルを表示

ファイル: count_distinct_ids.py プロジェクト: pszostek/research-python-backup

def count_ids(fin):
    """
    Counts records, distinct 'an' identifiers and records that have an 'au' field.

    Returns a tuple (number of records, number of distinct r['an'] values,
    number of records containing 'au').
    Raises KeyError if a record has no 'an' field.
    """
    seen_ans = set()
    au_vals = 0
    n_records = 0
    # Iterate the reader directly instead of materialising every record.
    for r in read_zbl_records(fin):
        n_records += 1
        if 'au' in r:
            au_vals += 1
        seen_ans.add(r['an'])
    return n_records, len(seen_ans), au_vals

コード例 #4

0

ファイルを表示

ファイル: zbl_analysis.py プロジェクト: pszostek/research-python-backup

def _draw_af_hist_(path, records_filter = allow_all_filter):
    """Draws histogram of authorship."""    
    af_count = {} #dict{author: count}
    
    for i,record in enumerate(zbl_io.read_zbl_records(open(path))):
        if i%100000 == 0: print i," records considered" #progress bar
        if not record.has_key("af") or not records_filter(record):
            continue        
        
        afs = zbl_io.unpack_multivalue_field(record["af"])
        afs_ok = list( af for af in afs if af!='-' )
        
        for af in afs_ok:
            af_count[af] = af_count.get(af, 0) + 1
             
    print len(af_count), " authors found."
    print max(af_count.values()), " = max"
    print min(af_count.values()), " = min"
    avg_af_values = avg(af_count.values())
    print round(avg_af_values, 2), " = avg"
    print round(std(af_count.values()), 2), " = std"
    print sum(1 for af in af_count.values() if af > avg_af_values) , " authors above avg"
    print sum(1 for af in af_count.values() if af < avg_af_values) , " authors below avg"
    
    n, bins, patches = plt.hist(af_count.values(), bins = max(af_count.values()), normed=False, log=True, alpha=0.75)
    plt.xlabel("Liczba wystapien w rekordach")
    plt.ylabel("Liczba autorow")    
    plt.show()

コード例 #5

0

ファイルを表示

def _draw_af_hist_(path, records_filter=allow_all_filter):
    """Draws histogram of authorship."""
    af_count = {}  #dict{author: count}

    for i, record in enumerate(zbl_io.read_zbl_records(open(path))):
        if i % 100000 == 0: print i, " records considered"  #progress bar
        if not record.has_key("af") or not records_filter(record):
            continue

        afs = zbl_io.unpack_multivalue_field(record["af"])
        afs_ok = list(af for af in afs if af != '-')

        for af in afs_ok:
            af_count[af] = af_count.get(af, 0) + 1

    print len(af_count), " authors found."
    print max(af_count.values()), " = max"
    print min(af_count.values()), " = min"
    avg_af_values = avg(af_count.values())
    print round(avg_af_values, 2), " = avg"
    print round(std(af_count.values()), 2), " = std"
    print sum(1 for af in af_count.values()
              if af > avg_af_values), " authors above avg"
    print sum(1 for af in af_count.values()
              if af < avg_af_values), " authors below avg"

    n, bins, patches = plt.hist(af_count.values(),
                                bins=max(af_count.values()),
                                normed=False,
                                log=True,
                                alpha=0.75)
    plt.xlabel("Liczba wystapien w rekordach")
    plt.ylabel("Liczba autorow")
    plt.show()

コード例 #6

0

ファイルを表示

def _report_af_quality_(path, records_filter=allow_all_filter):
    """Prints report about authors' identities quality."""
    afs_len = []
    afs_ok_len = []

    for i, record in enumerate(zbl_io.read_zbl_records(open(path))):
        if i % 100000 == 0: print i, " records considered"  #progress bar
        if not record.has_key("af") or not records_filter(record):
            continue

        afs = zbl_io.unpack_multivalue_field(record["af"])
        afs_ok = list(af for af in afs if af != '-')

        afs_len.append(len(afs))
        afs_ok_len.append(len(afs_ok))

    afs_ok_frac = list(
        float(m) / float(l) for m, l in zip(afs_ok_len, afs_len))

    print max(afs_len), "\n", round(avg(afs_len),
                                    2), "\n", round(std(afs_len), 2)
    print max(afs_ok_len), "\n", round(avg(afs_ok_len),
                                       2), "\n", round(std(afs_ok_len), 2)
    print round(max(afs_ok_frac),
                2), "\n", round(avg(afs_ok_frac),
                                2), "\n", round(std(afs_ok_frac), 2)

コード例 #7

0

ファイルを表示

ファイル: gen_rand_simmatrix.py プロジェクト: pszostek/research-python-backup

def _get_zbl_generator_(zbl_path, must_have_field = 'mc'):
    """Yields, in reading order, the records of the ZBL file at zbl_path that contain must_have_field."""
    use_unicode = True  # passed to both open_file and read_zbl_records; presumably selects unicode handling
    handle = zbl_io.open_file(zbl_path, use_unicode)
    # NOTE(review): handle is never closed explicitly in this function.
    for zbl in zbl_io.read_zbl_records(handle, use_unicode):
        if must_have_field not in zbl:
            continue
        yield zbl

コード例 #8

0

ファイルを表示

ファイル: gen_rand_simmatrix.py プロジェクト: pszostek/research-python-backup

def _get_zbl_generator_(zbl_path, must_have_field='mc'):
    """Generator over records of the ZBL file at zbl_path, skipping those without must_have_field."""
    unicode_mode = True  # same flag goes to open_file and read_zbl_records; presumably unicode handling
    source = zbl_io.open_file(zbl_path, unicode_mode)
    # NOTE(review): source is not closed explicitly anywhere in this function.
    records = zbl_io.read_zbl_records(source, unicode_mode)
    for zbl in records:
        if must_have_field not in zbl:
            continue
        yield zbl

コード例 #9

0

ファイルを表示

ファイル: zbl_extract.py プロジェクト: pszostek/research-python-backup

def extract_citations_doublelinked_graph_file(fin, fout):
    """Reads zbl records from fin and writes a citation graph to fout, one line per id: zbl_id:zbl-id1,...,zbl-idN.

    The graph is built by extract_citations_doublelinked_graph: if record r1
    cites r2 then the output contains both links, r1->r2 and r2->r1.
    Returns whatever write_file_id2ids returns.
    """
    # Despite the "generator" naming used elsewhere, the graph is a mapping:
    # it is consumed through iteritems().
    id2ids = extract_citations_doublelinked_graph(zbl_io.read_zbl_records(fin))
    return write_file_id2ids(fout, id2ids.iteritems(), cast_container = set)

コード例 #10

0

ファイルを表示

ファイル: zbl_extract.py プロジェクト: pszostek/research-python-backup

def extract_fv_graph_file(fin,
                          fout,
                          multival_field_name="af",
                          empty_value="-"):
    """Reads zbl records from fin and writes to fout the graph extracted from field multival_field_name.

    Output lines have the form key:id1,id2,id3. The graph itself comes from
    extract_fv_graph (called with empty_value and set as container); the
    return value is that of write_file_id2ids.
    """
    records = zbl_io.read_zbl_records(fin)
    graph = extract_fv_graph(records, multival_field_name, empty_value, set)
    graph_items = graph.iteritems()
    return write_file_id2ids(fout, graph_items, cast_container=set)

コード例 #11

0

ファイルを表示

def build_wordsmodel(fin, fout, src_field = "g0"):
    """Builds and returns a WordsModel updated with the src_field of every record read from fin.

    Records without src_field are skipped. fout is accepted but not used.
    Progress is logged every 500 records.
    """
    model = WordsModel()
    for idx, record in enumerate(zbl_io.read_zbl_records(fin)):
        if idx % 500 == 0:
            logging.info("[build_wordsmodel] "+str(idx)+" records processed")
        if src_field not in record:
            continue
        packed = record[src_field]
        wordid2count = _di_(zbl_io.unpack_dictionary_field(packed))
        model.update(wordid2count)
    model.finish_updates()
    return model

コード例 #12

0

ファイルを表示

ファイル: zbl_extract.py プロジェクト: pszostek/research-python-backup

def extract_citations_doublelinked_graph_file(fin, fout):
    """Reads zbl records from fin and writes a citation graph to fout, one line per id: zbl_id:zbl-id1,...,zbl-idN.

    The graph is built by extract_citations_doublelinked_graph: if record r1
    cites r2 then the output contains both links, r1->r2 and r2->r1.
    Returns whatever write_file_id2ids returns.
    """
    records = zbl_io.read_zbl_records(fin)
    # The graph is a mapping (it is consumed through iteritems()).
    graph = extract_citations_doublelinked_graph(records)
    graph_items = graph.iteritems()
    return write_file_id2ids(fout, graph_items, cast_container=set)

コード例 #13

0

ファイルを表示

def count_occurrences(file, required_fields, records_filter=allow_all_filter):
    """Returns the number of records in a ZBL file that pass records_filter and have all required_fields.

    records_filter(record) - should return True if record is admitted
    """
    # The filter is evaluated first; has_record_fields runs only for admitted records.
    return sum(1 for record in zbl_io.read_zbl_records(file)
               if records_filter(record)
               and has_record_fields(record, required_fields))

コード例 #14

0

ファイルを表示

ファイル: zbl_analysis.py プロジェクト: pszostek/research-python-backup

def count_occurrences(file, required_fields, records_filter = allow_all_filter):
    """Returns the number of records in a ZBL file that pass records_filter and have all required_fields.

    records_filter(record) - should return True if record is admitted
    """
    matching = 0
    for record in zbl_io.read_zbl_records(file):
        # Filter first; field presence is checked only for admitted records.
        if records_filter(record) and has_record_fields(record, required_fields):
            matching += 1
    return matching

コード例 #15

0

ファイルを表示

ファイル: find_fields.py プロジェクト: pszostek/research-python-backup

def find_fields(fname):
    """
    Collects the names of all fields that occur in the records of a ZBL file.

    Returns a tuple (number of records, set of field names seen in any record).
    """
    total = 0
    fields = set()
    with open(fname, 'r') as fin:  # the handle is closed even if parsing fails
        for r in read_zbl_records(fin):
            total += 1
            fields.update(r.iterkeys())
    return total, fields

コード例 #16

0

ファイルを表示

ファイル: msc_processing.py プロジェクト: pszostek/research-python-backup

def count_msc_occurences(file, records_filter=lambda x: True, field_name="mc"):
    """Counts how often each code occurs in field field_name of the admitted records of a ZBL file.

    records_filter(record) - should return True if record is admitted

    Returns dictionary{code_name: count}"""
    code2count = {}
    for record in zbl_io.read_zbl_records(file):
        # Filter is consulted first, then presence of the field.
        if records_filter(record) and field_name in record:
            for code in zbl_io.unpack_multivalue_field(record[field_name]):
                if code in code2count:
                    code2count[code] += 1
                else:
                    code2count[code] = 1
    return code2count

コード例 #17

0

ファイルを表示

ファイル: msc_processing.py プロジェクト: pszostek/research-python-backup

def count_msc_occurences(file, records_filter = lambda x: True, field_name = "mc"):
    """Counts how often each code occurs in field field_name of the admitted records of a ZBL file.

    records_filter(record) - should return True if record is admitted

    Returns dictionary{code_name: count}"""
    tally = {}
    for record in zbl_io.read_zbl_records(file):
        admitted = records_filter(record)
        if not (admitted and field_name in record):
            continue
        packed_codes = record[field_name]
        for code in zbl_io.unpack_multivalue_field(packed_codes):
            tally[code] = 1 + tally.get(code, 0)
    return tally

コード例 #18

0

ファイルを表示

def map_wordsmodel_overall_weighting(fin, fout, wordsmodel, src_field="g0", dst_field="g1",\
                                     weight = lambda wordsmodel,doc_wordid2count,wordid: tf(doc_wordid2count, wordid)*idf(wordsmodel, wordid) ):
    """Copies records from fin to fout, adding dst_field computed from src_field with the weighting function.

    For each record that has src_field, every word id in it is mapped to
    weight(wordsmodel, doc_wordid2count, wordid); the sorted (wordid, weight)
    pairs are packed into dst_field. Every record, enriched or not, is
    written to fout followed by a newline.

    Returns the number of enriched records.
    """
    enriched = 0
    for idx, record in enumerate(zbl_io.read_zbl_records(fin)):
        if idx % 100 == 0:
            logging.info("[map_wordsmodel_overall_weighting] "+str(idx)+" records processed."+str(enriched)+"enriched.")
        if src_field in record:
            wordid2count = _di_(zbl_io.unpack_dictionary_field(record[src_field]))
            weighted = []
            for wordid, _count in wordid2count.iteritems():
                weighted.append((wordid, weight(wordsmodel, wordid2count, wordid)))
            weighted.sort()
            record[dst_field] = zbl_io.pack_listpairs_field(weighted)
            enriched += 1
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return enriched

コード例 #19

0

ファイルを表示

ファイル: count_fields.py プロジェクト: pszostek/research-python-backup

def count_records_with_fields(fname, fields):
    """
    Counts how many records there are with given fields

    A record matches when every field from `fields` is present and its
    stripped value differs from 'null'. An empty `fields` matches no record.

    Returns a tuple (number of all records, number of matching records).
    """
    total = 0
    cnt = 0
    with open(fname, 'r') as fin:  # the handle is closed even if parsing fails
        for r in read_zbl_records(fin):
            total += 1
            #check if this fields occur
            if fields and all(f in r and r[f].strip() != 'null' for f in fields):
                cnt += 1
    return total, cnt

コード例 #20

0

ファイルを表示

ファイル: count_fields.py プロジェクト: pszostek/research-python-backup

def count_records_with_various_fields(fname, lfields):
    """
    Counts how many records there are with each fields list of given list of field lists

    A record matches a fields list when every field from it is present and
    its stripped value differs from 'null'. An empty fields list matches no
    record (same convention as count_records_with_fields) instead of raising
    TypeError.

    Returns a tuple (number of all records, list of counts parallel to lfields).
    """
    total = 0
    counts = len(lfields)*[0]
    with open(fname, 'r') as fin:  # the handle is closed even if parsing fails
        for r in read_zbl_records(fin):
            total += 1
            for i, fields in enumerate(lfields):
                #check if this fields occur
                if fields and all(f in r and r[f].strip() != 'null' for f in fields):
                    counts[i] += 1
    return total, counts

コード例 #21

0

ファイルを表示

def _get_zbl_generator_(zbl_path, must_have_fields):
    """Yields records of the ZBL file at zbl_path that contain every field listed in must_have_fields.

    The id field (zbl_io.ZBL_ID_FIELD) of each yielded record is overwritten
    with the record's position among ALL records read, including skipped ones.
    """
    use_unicode = True  # passed to both open_file and read_zbl_records; presumably selects unicode handling
    handle = zbl_io.open_file(zbl_path, use_unicode)
    for position, zbl in enumerate(zbl_io.read_zbl_records(handle, use_unicode)):
        if not all(field in zbl for field in must_have_fields):
            continue
        zbl[zbl_io.ZBL_ID_FIELD] = position #replacing ids with numbers for faster processing
        yield zbl

コード例 #22

0

ファイルを表示

ファイル: ngrams.py プロジェクト: pszostek/research-python-backup

def modify_wordslist_file(fin, fout, list_of_fields, wordslist_modifier):
    """Rewrites selected fields of every record from fin using wordslist_modifier and writes records to fout.

    Each field from list_of_fields that is present in a record is split on
    whitespace, passed through wordslist_modifier and re-joined with single
    spaces. If the modifier returns an empty list a warning is logged and
    the original words are kept; a field without any words becomes ''.
    Every record is written to fout followed by a newline.

    wordslist_modifier(words list) -> modified_words list
    """
    for record in zbl_io.read_zbl_records(fin):
        for field in list_of_fields:
            if field not in record: continue
            words = record[field].split()
            modified_words = wordslist_modifier(words)
            if len(modified_words) <= 0:
                logging.warning("Error in an="+str(record[zbl_io.ZBL_ID_FIELD])+" in field "+ str(field)+ "="+str(record[field])+". Using single words instead.")
                modified_words = words
            # join() also handles an empty words list, on which reduce() raised TypeError.
            record[field] = ' '.join(modified_words)
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")

コード例 #23

0

ファイルを表示

ファイル: count_fields.py プロジェクト: pszostek/research-python-backup

def count_records_with_fields(fname, fields):
    """
    Counts how many records there are with given fields

    A record matches when every field from `fields` is present and its
    stripped value differs from 'null'. An empty `fields` matches no record.

    Returns a tuple (number of all records, number of matching records).
    """

    n_records = 0
    cnt = 0
    with open(fname, 'r') as fin:  # the handle is closed even if parsing fails
        for r in read_zbl_records(fin):
            n_records += 1
            #check if this fields occur
            if fields and all(f in r and r[f].strip() != 'null'
                              for f in fields):
                cnt += 1
    return n_records, cnt

コード例 #24

0

ファイルを表示

ファイル: count_fields.py プロジェクト: pszostek/research-python-backup

def count_records_with_various_fields(fname, lfields):
    """
    Counts how many records there are with each fields list of given list of field lists

    A record matches a fields list when every field from it is present and
    its stripped value differs from 'null'. An empty fields list matches no
    record (same convention as count_records_with_fields) instead of raising
    TypeError.

    Returns a tuple (number of all records, list of counts parallel to lfields).
    """

    n_records = 0
    counts = len(lfields) * [0]
    with open(fname, 'r') as fin:  # the handle is closed even if parsing fails
        for r in read_zbl_records(fin):
            n_records += 1
            for i, fields in enumerate(lfields):
                #check if this fields occur
                if fields and all(f in r and r[f].strip() != 'null'
                                  for f in fields):
                    counts[i] += 1
    return n_records, counts

コード例 #25

0

ファイルを表示

def modify_wordslist_file(fin, fout, list_of_fields, wordslist_modifier):
    """Rewrites selected fields of every record from fin using wordslist_modifier and writes records to fout.

    Each field from list_of_fields that is present in a record is split on
    whitespace, passed through wordslist_modifier and re-joined with single
    spaces. If the modifier returns an empty list a warning is logged and
    the original words are kept; a field without any words becomes ''.
    Every record is written to fout followed by a newline.

    wordslist_modifier(words list) -> modified_words list
    """
    for record in zbl_io.read_zbl_records(fin):
        for field in list_of_fields:
            if field not in record: continue
            words = record[field].split()
            modified_words = wordslist_modifier(words)
            if len(modified_words) <= 0:
                logging.warning("Error in an=" +
                                str(record[zbl_io.ZBL_ID_FIELD]) + " in field " +
                                str(field) + "=" + str(record[field]) +
                                ". Using single words instead.")
                modified_words = words
            # join() also handles an empty words list, on which reduce() raised TypeError.
            record[field] = ' '.join(modified_words)
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")

コード例 #26

0

ファイルを表示

ファイル: zbl_analysis.py プロジェクト: pszostek/research-python-backup

def _draw_mc_hist(path, records_filter = allow_all_filter):
    """Draws histogram of MSC codes occurrence in records."""    
    mc_counts = []
    
    for i,record in enumerate(zbl_io.read_zbl_records(open(path))):
        if i%100000 == 0: print i," records considered" #progress bar
        if not record.has_key("mc") or not records_filter(record):
            continue          
        
        mc = zbl_io.unpack_multivalue_field(record["mc"])
        mc_counts.append(len(mc))
        
    print len(mc_counts), " record found."
    print max(mc_counts), " = max"
    print min(mc_counts), " = min"
    print round(avg(mc_counts), 2), " = avg"
    print round(std(mc_counts), 2), " = std"
    n, bins, patches = plt.hist(mc_counts, bins = max(mc_counts), normed=False, alpha=0.75)
    plt.xlabel("Liczba kodow MSC w rekordzie")
    plt.ylabel("Liczba rekordow")    
    plt.show()

コード例 #27

0

ファイルを表示

ファイル: zbl_analysis.py プロジェクト: pszostek/research-python-backup

def _report_af_quality_(path, records_filter = allow_all_filter):
    """Prints report about authors' identities quality."""    
    afs_len = []
    afs_ok_len = []
    
    for i,record in enumerate(zbl_io.read_zbl_records(open(path))):
        if i%100000 == 0: print i," records considered" #progress bar
        if not record.has_key("af") or not records_filter(record):
            continue        
        
        afs = zbl_io.unpack_multivalue_field(record["af"])
        afs_ok = list( af for af in afs if af!='-' )
        
        afs_len.append(len(afs))
        afs_ok_len.append(len(afs_ok))
        
    afs_ok_frac = list( float(m)/float(l) for m,l in zip(afs_ok_len, afs_len) )    
        
    print max(afs_len), "\n", round(avg(afs_len),2), "\n", round(std(afs_len),2)
    print max(afs_ok_len), "\n", round(avg(afs_ok_len),2), "\n", round(std(afs_ok_len),2)
    print round(max(afs_ok_frac),2), "\n", round(avg(afs_ok_frac),2), "\n", round(std(afs_ok_frac),2)

コード例 #28

0

ファイルを表示

def _draw_mc_hist(path, records_filter=allow_all_filter):
    """Draws histogram of MSC codes occurrence in records."""
    mc_counts = []

    for i, record in enumerate(zbl_io.read_zbl_records(open(path))):
        if i % 100000 == 0: print i, " records considered"  #progress bar
        if not record.has_key("mc") or not records_filter(record):
            continue

        mc = zbl_io.unpack_multivalue_field(record["mc"])
        mc_counts.append(len(mc))

    print len(mc_counts), " record found."
    print max(mc_counts), " = max"
    print min(mc_counts), " = min"
    print round(avg(mc_counts), 2), " = avg"
    print round(std(mc_counts), 2), " = std"
    n, bins, patches = plt.hist(mc_counts,
                                bins=max(mc_counts),
                                normed=False,
                                alpha=0.75)
    plt.xlabel("Liczba kodow MSC w rekordzie")
    plt.ylabel("Liczba rekordow")
    plt.show()

コード例 #29

0

ファイルを表示

ファイル: zbl_extract.py プロジェクト: pszostek/research-python-backup

def extract_citations_graph_file(fin, fout):
    """Reads zbl records from fin and writes to fout lines of the form zbl_id:citation-id1,...,citation-idN.

    The (id, ids) pairs come from yield_citations; the return value is that
    of write_file_id2ids.
    """
    citation_pairs = yield_citations(zbl_io.read_zbl_records(fin))
    return write_file_id2ids(fout, citation_pairs, cast_container = set)

コード例 #30

0

ファイルを表示

def _report_ci_(path, records_filter = allow_all_filter, \
                ci_dst_records_filter = allow_all_filter, \
                uq_id_field_name = zbl_io.ZBL_ID_FIELD):
    """Prints report about citations in ZBL file.
    
    records_filter(record) - should return True if record is admitted
    ci_dst_records_filter(record) -  should return True if record that citation is pointing at is admitted
    uq_id_field_name - name of a field that uniquely identifies record 
    """
    #wczytywanie zbioru na ktory moga wskazywac cytowania:
    print "Loading ids of records that may be citation destination."
    dst_records_ids = set()
    for i, record in enumerate(zbl_io.read_zbl_records(open(path))):
        if i % 100000 == 0: print i, " records considered"  #progress bar
        if record.has_key(uq_id_field_name) and ci_dst_records_filter(record):
            dst_records_ids.add(record[uq_id_field_name])
    print "Done.", len(dst_records_ids), " records loaded."

    #statystyki:
    cis_len = []  #liczba cytowan
    cis_matched = []  #liczba cytowan ktore trafiaja w zadany zbior

    for i, record in enumerate(zbl_io.read_zbl_records(open(path))):
        if i % 100000 == 0: print i, " records considered"  #progress bar
        if not record.has_key("ci") or not records_filter(record):
            continue

        cis = zbl_io.unpack_list_of_dictionaries(record["ci"])
        #identyfikatory cytowan:
        identified_ci_ids = list(ci[uq_id_field_name] for ci in cis
                                 if ci.has_key(uq_id_field_name))
        #rekordy dopsowane do cytowan i w zadanym zbiorze:
        filtered_matched_records = list(id for id in identified_ci_ids
                                        if id in dst_records_ids)

        cis_len.append(len(cis))
        cis_matched.append(len(filtered_matched_records))

    cis_matched_div_len = list(
        float(m) / float(l) for m, l in zip(cis_matched, cis_len))

    print "Citation statistics (only on records with citations) [total, min avg max std]: "
    print "-Number of citations :", "\t", round(sum(cis_len), 0), "\t", round(
        min(cis_len),
        0), "\t", round(avg(cis_len),
                        2), "\t", round(max(cis_len),
                                        0), "\t", round(std(cis_len), 2)
    print "-Matching citations:", "\t", round(
        sum(cis_matched), 0), "\t", round(min(cis_matched), 0), "\t", round(
            avg(cis_matched),
            2), "\t", round(max(cis_matched),
                            0), "\t", round(std(cis_matched), 2)
    print "-Fraction of matching citations: - ", "\t", round(
        min(cis_matched_div_len),
        3), "\t", round(avg(cis_matched_div_len), 3), "\t", round(
            max(cis_matched_div_len), 3), "\t", round(std(cis_matched_div_len),
                                                      3)
    print "-Total Number of citations/Matching citations:", "\t", round(
        float(sum(cis_matched)) / sum(cis_len), 3)
    print "->", round(sum(cis_len),0), (max(cis_len)), round(avg(cis_len),2), round(std(cis_len),2), \
     round(sum(cis_matched),0), (max(cis_matched)), round(avg(cis_matched),2), round(std(cis_matched),2), \
      round(max(cis_matched_div_len),3), round(avg(cis_matched_div_len),3), round(std(cis_matched_div_len),3)

    cis_matched_hist = {}
    for i in xrange(0, max(cis_matched) + 1):
        cis_matched_hist[i] = sum(1 for c in cis_matched if c == i)
    print "Histogram:", cis_matched_hist

    n, bins, patches = plt.hist(sorted(cis_matched),
                                bins=max(cis_matched),
                                normed=False,
                                alpha=0.75)
    plt.xlabel("Liczba dopasowanych cytowan")
    plt.ylabel("Liczba rekordow")
    plt.show()

コード例 #31

0

ファイルを表示

ファイル: aent.py プロジェクト: pszostek/research-python-backup

        sys.exit(-1)
    print "src=", fin

    try:
        if sys.argv[1] == sys.argv[2]:
            system.exit(-1)
            print "Paths must be different!"
        fout = open(sys.argv[2], "w")
    except:
        print "Argument expected: output-Zbl-file path"
        sys.exit(-1)
    print "dst=", fout

    print "LOADING"
    docs = []
    for N, record in enumerate(zbl_io.read_zbl_records(open(fin))):
        if N % 500 == 0: print N, "read"
        doc = []
        for field in list_of_fields:
            if not record.has_key(field): continue
            words = record[field].split()
            modified_words = build_mgrams(words, maxn)
            doc.extend(modified_words)
        if len(doc) > 0: docs.append(doc)

    print "CALC terms vs counts"
    term2count = {}
    docs_term2count = []
    docs_len = []
    for doc in docs:
        doc_term2count = {}

コード例 #32

0

ファイルを表示

ファイル: zbl_extract.py プロジェクト: pszostek/research-python-backup

def extract_fv_graph_file(fin, fout, multival_field_name = "af", empty_value = "-"):
    """Reads zbl records from fin and writes to fout the graph extracted from field multival_field_name.

    Output lines have the form key:id1,id2,id3. The graph itself comes from
    extract_fv_graph (called with empty_value and set as container); the
    return value is that of write_file_id2ids.
    """
    graph = extract_fv_graph(zbl_io.read_zbl_records(fin),
                             multival_field_name,
                             empty_value,
                             set)
    return write_file_id2ids(fout, graph.iteritems(), cast_container = set)

コード例 #33

0

ファイルを表示

ファイル: aent.py プロジェクト: pszostek/research-python-backup

        sys.exit(-1)
    print "src=", fin
        
    try:
        if sys.argv[1]==sys.argv[2]:
            system.exit(-1)
            print "Paths must be different!"
        fout = open(sys.argv[2], "w")
    except:
        print "Argument expected: output-Zbl-file path"
        sys.exit(-1)
    print "dst=", fout    
    
    print "LOADING"
    docs = []        
    for N, record in enumerate(zbl_io.read_zbl_records(open(fin))):
        if N%500==0: print N, "read"
        doc = []
        for field in list_of_fields:
            if not record.has_key(field): continue
            words = record[field].split()
            modified_words = build_mgrams(words, maxn)
            doc.extend(modified_words)
        if len(doc)>0: docs.append(doc)

    print "CALC terms vs counts"
    term2count = {}
    docs_term2count = []    
    docs_len = []    
    for doc in docs:
        doc_term2count = {}

コード例 #34

0

ファイルを表示

ファイル: zbl_extract.py プロジェクト: pszostek/research-python-backup

def extract_citations_graph_file(fin, fout):
    """Reads zbl records from fin and writes to fout lines of the form zbl_id:citation-id1,...,citation-idN.

    The (id, ids) pairs come from yield_citations; the return value is that
    of write_file_id2ids.
    """
    records = zbl_io.read_zbl_records(fin)
    pairs = yield_citations(records)
    return write_file_id2ids(fout, pairs, cast_container=set)