Example #1
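import bisect
import logging

# Indexer, Paper, and parse_term are assumed to be provided by the
# repository's own helper modules.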
def generate_papers(datafile, feature_begin, feature_end, observation_begin,
                    observation_end, conf_list):
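    """Parse the DBLP text dump in `datafile` into Paper objects, keeping only
    papers from venues in `conf_list`, and split them into a feature window
    (feature_begin, feature_end] and an observation window
    (observation_begin, observation_end]."""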
    logging.info('generating papers ...')

    # try:
    #     result = pickle.load(open('dblp/data/papers_%s.pkl' % path, 'rb'))
    #     return result
    # except IOError:
    #     pass

    indexer = Indexer(['author', 'paper', 'term', 'venue'])

    index, authors, title, year, venue = None, None, None, None, None
    references = []

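    # Edge counters reported in the metadata summary at the end
    # (author writes paper, paper cites paper, venue publishes paper,
    # paper contains term).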
    write = 0
    cite = 0
    include = 0
    published = 0

    min_year = 3000
    max_year = 0

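    # Papers in the feature window are fully indexed up front; papers in the
    # observation window keep their raw author names and reference ids and
    # are resolved against the indexer in a second pass below.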
    papers_feature_window = []
    papers_observation_window = []

    with open(datafile) as file:
        dataset = file.read().splitlines()

    for line in dataset:
        # A blank line marks the end of one paper record.
        if not line:
            if year and venue:
                year = int(year)
                if year > 0 and authors and venue in conf_list:
                    min_year = min(min_year, year)
                    max_year = max(max_year, year)
                    authors = authors.split(',')
                    terms = parse_term(title)
                    write += len(authors)
                    cite += len(references)
                    include += len(terms)
                    published += 1

                    p = Paper(year)
                    if feature_begin < year <= feature_end:
                        p.id = indexer.index('paper', index)
                        p.terms = [
                            indexer.index('term', term) for term in terms
                        ]
                        p.references = [
                            indexer.index('paper', paper_id)
                            for paper_id in references
                        ]
                        p.authors = [
                            indexer.index('author', author_name)
                            for author_name in authors
                        ]
                        p.venue = indexer.index('venue', venue)
                        # Keep the feature window sorted; Paper is assumed to
                        # order by year.
                        bisect.insort(papers_feature_window, p)
                    elif observation_begin < year <= observation_end:
                        p.references = references
                        p.authors = authors
                        papers_observation_window.append(p)

            index, authors, title, year, venue = None, None, None, None, None
            references = []
        else:
            # Each record line starts with '#' plus a tag character:
            # '*' title, '@' author list, 't' year, 'c' venue,
            # 'index' paper id, '%' id of a referenced paper.
            begin = line[1]
            if begin == '*':
                title = line[2:]
            elif begin == '@':
                authors = line[2:]
            elif begin == 't':
                year = line[2:]
            elif begin == 'c':
                venue = line[2:]
            elif begin == 'i':
                index = line[6:]
            elif begin == '%':
                references.append(line[2:])

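    # Second pass: map the observation-window papers' raw author names and
    # reference ids to indices, dropping anything that never appeared in the
    # feature window.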
    for p in papers_observation_window:
        authors = []
        references = []
        for author in p.authors:
            author_id = indexer.get_index('author', author)
            if author_id is not None:
                authors.append(author_id)
        for ref in p.references:
            paper_id = indexer.get_index('paper', ref)
            if paper_id is not None:
                references.append(paper_id)
        p.authors = authors
        p.references = references

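    # Write a human-readable summary of node/edge counts and the time span.
    # `path` is assumed to be a module-level dataset identifier in the original
    # repository (it also appears in the commented-out pickle cache paths).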
    with open('dblp/data/metadata_%s.txt' % path, 'w') as output:
        output.write('Nodes:\n')
        output.write('-----------------------------\n')
        output.write('#Authors: %d\n' % indexer.indices['author'])
        output.write('#Papers: %d\n' % indexer.indices['paper'])
        output.write('#Venues: %d\n' % indexer.indices['venue'])
        output.write('#Terms: %d\n\n' % indexer.indices['term'])
        output.write('\nEdges:\n')
        output.write('-----------------------------\n')
        output.write('#Write: %d\n' % write)
        output.write('#Cite: %d\n' % cite)
        output.write('#Publish: %d\n' % published)
        output.write('#Contain: %d\n' % include)
        output.write('\nTime Span:\n')
        output.write('-----------------------------\n')
        output.write('From: %s\n' % min_year)
        output.write('To: %s\n' % max_year)

    result = papers_feature_window, papers_observation_window, indexer.indices
    # pickle.dump(result, open('dblp/data/papers_%s.pkl' % path, 'wb'))
    return result
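
A minimal usage sketch; the dataset path, conference list, year boundaries, and
the module-level `path` identifier below are illustrative assumptions, not
values taken from the original code.

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    # All of the following values are hypothetical placeholders.
    path = 'example'  # module-level identifier used when writing the metadata file
    conf_list = ['KDD', 'ICDM', 'SDM']

    feature_papers, observation_papers, counts = generate_papers(
        'dblp/data/dblp.txt',
        feature_begin=1990, feature_end=2005,
        observation_begin=2005, observation_end=2010,
        conf_list=conf_list,
    )
    print('papers in feature window:', len(feature_papers))
    print('papers in observation window:', len(observation_papers))
    print('node counts:', counts)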