# Example #1
# 0
def test_main():
    """Match REF submission records against DBLP papers and staff names.

    Reads REF columns via the project ``parse`` helper (field indices
    presumably map to columns of the REF export -- TODO confirm against
    extractrefdata), fuzzy-matches paper titles and then author names with
    ``ProjectedStrings``, and writes the matched author/paper lists
    (msgpack) plus a per-institution match-quality report.

    NOTE(review): Python 2 code -- print statements, ``file()``,
    ``dict.iteritems()`` and 2.x ``str.translate(None, ...)``.
    """
    # extract REF data (parallel column lists, one entry per REF record)
    ref_inst = parse(field = 0)
    ref_title = parse(field = 5)
    ref_venue = parse(field = 6)
    ref_year = parse(field = 17)

    # Field 30 is free text carrying a "<topic-id>" numeric tag; extract it.
    ref_descriptions = parse(field = 30)
    ref_topics = []
    for descriptor in ref_descriptions:
        topic = re.search("<([0-9]+)>", descriptor)
        if topic is not None:
            ref_topics.append(int(topic.group(1)))
        else:
            # No numeric topic tag present in this description.
            ref_topics.append(None)

    # NOTE(review): the [1:] drops the first zipped record -- presumably a
    # header row from the parse; confirm, otherwise a real paper is lost.
    # (Python 2: zip() returns a list, so the slice is valid here.)
    ref_papers = sorted(zip(ref_inst, ref_title, ref_venue, ref_year, ref_topics)[1:]) # sort by ref_inst

    # Extract DBLP selected data: msgpack-encoded list of
    # (authors, title, booktitle, year) tuples.
    dblp_data = file("data/selectfiles.dat", "rb").read()
    dblp_data = unpackb(dblp_data)
    dblp_titles = [title for (authors, title, booktitle, year) in dblp_data]
    all_l = len(dblp_data)

    # inst titles: institution key -> human-readable institution name
    inst_titles = parse_institutions("data/Institution.csv")

    # Extract ref author data and group it by institution key.
    # FIRST is presumably an itemgetter(0)-style key function -- confirm;
    # groupby requires author_data sorted by that same key (done above).
    author_data = sorted(parse_authors("data/Staff.csv"))
    author_map = {}
    for k, g in groupby(author_data, FIRST):
        author_map[k] = list(g)


    # Fuzzy index over all DBLP titles (character 3..5-grams, 30
    # projection components -- semantics live in ProjectedStrings).
    P = ProjectedStrings(dblp_titles, l = (3,5), n_comp=30)
    P.threshold = 0.3

    # Per-institution counters, each a [total, matched] pair.
    matched_papers = {}
    matched_people = {}

    author_list = []
    paper_list = []

    # Process REF papers one institution at a time (ref_papers is sorted
    # by institution, as groupby requires).
    for k, g in groupby(ref_papers, FIRST):
        print
        print k, inst_titles[k]
        matched_papers[k] = [0, 0]
        # How often each DBLP author name appears on candidate matches
        # for this institution's papers.
        inst_authors = Counter()
        for inst, titl, venu, yr, topic in g:
            matched_papers[k][0] += 1
            #if len(list(P.matches(titl))) == 0:
            #    unmatched_papers[k][1] += 1
            matches_flag = list(P.matches(titl))
            if len(matches_flag) > 0:
                matched_papers[k][1] += 1
                # Only the first hit's DBLP record is kept in the output.
                paper_list.append((k, titl, dblp_data[matches_flag[0][0]]))
            #print ">", titl
            for i, mx, title in matches_flag:
                #matched_papers[k][1] += 1
                #print "(%2.2f) %s" % (mx, title)
                #print ", ".join(dblp_data[i][0])
                # Tally authors across *all* candidate matches, not just the best.
                inst_authors.update(dblp_data[i][0])
            #print

        # Expand each DBLP author into name variants; diversify_name
        # presumably yields (variant, canonical_name) pairs -- confirm.
        diverse_names = sum([diversify_name(a) for a in inst_authors], [])
        just_names = [n1 for n1, _ in diverse_names]

        # Fuzzy index over the name variants (shorter 1..3-grams for names).
        Pauths = ProjectedStrings(just_names, l=(1,3),  n_comp=30)
        Pauths.threshold = 0.45

        matched_people[k] = [0, 0]
        for _, surname, initials in author_map[k]:
            matched_people[k][0] += 1
            # Normalize a bit
            surname1 = surname.lower().translate(None, ' .-,').strip()
            initials1 = initials.lower().translate(None, ' .-,').strip()
            new_name = initials1 + " " + surname1
            matches = sorted(list(Pauths.matches(new_name, k=5)), key=lambda x:x[1], reverse=True)

            # Find a way to break high-ties:
            # weight each raw similarity (> 0.40) by how often the canonical
            # author appeared on this institution's matched papers, then keep
            # only weighted scores above 0.55.
            strong_matches = [(idx, mx * inst_authors[diverse_names[idx][1]], diverse_names[idx][1]) for (idx, mx, name) in matches if mx > 0.40]
            strong_matches = [(idx, mx, name) for (idx, mx, name) in strong_matches if mx > 0.55]
            strong_matches = sorted(strong_matches, key=lambda x:x[1], reverse=True)

            if len(strong_matches) > 0:
                matched_people[k][1] += 1
                author_list.append((k, "%s %s" % (initials, surname), strong_matches[0][2]))
                print "%2.2f | %s %s | %s" % (strong_matches[0][1], initials, surname, strong_matches[0][2])
            else:
                # Unmatched staff member: dump the raw candidates for debugging.
                print "%s | %s %s | %s" % ("***", initials, surname, "")
                print matches

    # Persist the matched author/paper lists as msgpack.
    # NOTE(review): file objects are never closed here -- relies on
    # CPython refcounting to flush; consider `with open(...)`.
    print "Packing data"
    packed_authors = packb(author_list, use_bin_type=True)
    file("data/author_list.dat", "wb").write(packed_authors)

    packed_papers = packb(paper_list, use_bin_type=True)
    file("data/paper_list.dat", "wb").write(packed_papers)

    # Per-institution report: % of people matched | % of papers matched | name.
    fo = open("results/match_quality.txt", "w")
    print  >>fo, "People and Papers matched"
    for k, (v1, v2) in matched_people.iteritems():
        (m1, m2) = matched_papers[k]
        print >> fo, "%2.2f%% | %2.2f%% | %s" % ( 100* float(v2) / float(v1), 100* float(m2) / float(m1), inst_titles[k])
# Example #2
# 0
from extractrefdata import ProjectedStrings, parse
from msgpack import unpackb, packb
import random
from collections import defaultdict

# Get the REF data as parallel column lists.
datas = parse()  # default field -- presumably the REF paper titles; TODO confirm
dates = parse(field=17)  # field 17: publication year (same index used for ref_year above)
# Fuzzy title index over the REF titles (character 3..5-grams).
p = ProjectedStrings(datas, l=(3, 5))
p.threshold = 0.30

# Load the full DBLP dump: msgpack list of (authors, title, booktitle, year).
# NOTE(review): Python 2 `file()`; the handle is never closed explicitly.
dblp_data = file("data/allfiles.dat", "rb").read()
dblp_data = unpackb(dblp_data)
all_l = len(dblp_data)

# Papers-per-venue counts, restricted to 2009-2014 by the loop below.
venue_stats = defaultdict(int)

# Time / progress-tracking state and the filtered output accumulator.
old = 0
new_dblp_list = []
print "starting ..."
# Scan every DBLP record: collect venue statistics and fuzzy-match each
# title against the REF index.  (Loop body continues beyond this excerpt.)
for l, (authors, title, booktitle, year) in enumerate(dblp_data):

    # Gather some generic statistics about the venue_stats
    if 2009 <= int(year) <= 2014:
        venue_stats[booktitle] += 1

    # Up to 10 fuzzy REF matches for this DBLP title; skip unmatched records.
    m = list(p.matches(title, k=10))
    if m == []:
        continue