# Second character of a record line -> (record field, offset where the value starts).
# Reference lines (tag '%') are handled separately because they accumulate.
_RECORD_FIELDS = {
    '*': ('title', 2),
    '@': ('authors', 2),
    't': ('year', 2),
    'c': ('venue', 2),
    'i': ('index', 6),
}


def _blank_record():
    """Return an empty parse state for one paper record."""
    return {
        'index': None,
        'authors': None,
        'title': None,
        'year': None,
        'venue': None,
        'references': [],
    }


def _resolve_known(indexer, kind, keys):
    """Map keys to ids via ``indexer.get_index``, dropping keys that yield None."""
    resolved = []
    for key in keys:
        known_id = indexer.get_index(kind, key)
        if known_id is not None:
            resolved.append(known_id)
    return resolved


def _write_metadata(indexer, stats):
    """Write the node/edge/time-span summary to ``dblp/data/metadata_<path>.txt``.

    NOTE(review): ``path`` is not a parameter or local of this code; it is
    looked up as a module-level name. Confirm it is defined before this runs.
    """
    lines = [
        'Nodes:\n',
        '-----------------------------\n',
        '#Authors: %d\n' % indexer.indices['author'],
        '#Papers: %d\n' % indexer.indices['paper'],
        '#Venues: %d\n' % indexer.indices['venue'],
        '#Terms: %d\n\n' % indexer.indices['term'],
        '\nEdges:\n',
        '-----------------------------\n',
        '#Write: %d\n' % stats['write'],
        '#Cite: %d\n' % stats['cite'],
        '#Publish: %d\n' % stats['published'],
        '#Contain: %d\n' % stats['include'],
        '\nTime Span:\n',
        '-----------------------------\n',
        'From: %s\n' % stats['min_year'],
        'To: %s\n' % stats['max_year'],
    ]
    with open('dblp/data/metadata_%s.txt' % path, 'w') as output:
        output.write(''.join(lines))


def generate_papers(datafile, feature_begin, feature_end, observation_begin, observation_end, conf_list):
    """Parse a paper dump and split the accepted papers into two time windows.

    Input format (as read by this function): records are separated by empty
    lines; within a record the *second* character of each line is a tag:
    ``*`` title, ``@`` comma-separated authors, ``t`` year, ``c`` venue,
    ``i`` paper index (value starts at column 6) and ``%`` one reference per
    line. Lines with any other tag are ignored.

    A record is accepted when it has a year and a venue, the year is a
    positive integer, the author field is non-empty and the venue is in
    ``conf_list``. Every accepted record contributes to the summary counters,
    whether or not it falls in a window.

    * Feature window (``feature_begin < year <= feature_end``): paper, terms,
      references, authors and venue are converted to ids with
      ``indexer.index`` and the paper is inserted with ``bisect.insort``
      (ordering is whatever ``Paper`` comparison defines).
    * Observation window (``observation_begin < year <= observation_end``,
      checked only when the feature window did not match): authors and
      references are afterwards looked up with ``indexer.get_index`` and
      entries for which it returns None are dropped.

    Side effect: writes a summary to ``dblp/data/metadata_<path>.txt`` (see
    ``_write_metadata`` for the caveat about ``path``).

    Args:
        datafile: Path of the text file to parse.
        feature_begin: Exclusive lower year bound of the feature window.
        feature_end: Inclusive upper year bound of the feature window.
        observation_begin: Exclusive lower year bound of the observation window.
        observation_end: Inclusive upper year bound of the observation window.
        conf_list: Container of accepted venue strings (tested with ``in``).

    Returns:
        Tuple ``(papers_feature_window, papers_observation_window,
        indexer.indices)``.

    Raises:
        ValueError: If a record's year field is not an integer literal.
    """
    logging.info('generating papers ...')

    # NOTE: a pickle-based cache of the result used to be sketched here (and
    # before the return) as commented-out code; it was never active.

    indexer = Indexer(['author', 'paper', 'term', 'venue'])
    papers_feature_window = []
    papers_observation_window = []
    stats = {
        'write': 0,
        'cite': 0,
        'include': 0,
        'published': 0,
        'min_year': 3000,
        'max_year': 0,
    }

    def finish_record(record):
        """Validate one parsed record and file it under the matching window."""
        year_text = record['year']
        venue = record['venue']
        if not (year_text and venue):
            return
        year = int(year_text)
        if year <= 0 or not record['authors'] or venue not in conf_list:
            return

        stats['min_year'] = min(stats['min_year'], year)
        stats['max_year'] = max(stats['max_year'], year)
        authors = record['authors'].split(',')
        references = record['references']
        terms = parse_term(record['title'])
        stats['write'] += len(authors)
        stats['cite'] += len(references)
        stats['include'] += len(terms)
        stats['published'] += 1

        paper = Paper(year)
        if feature_begin < year <= feature_end:
            # Indexing order (paper, terms, references, authors, venue) is
            # kept stable because it may determine the ids handed out.
            paper.id = indexer.index('paper', record['index'])
            paper.terms = [indexer.index('term', term) for term in terms]
            paper.references = [indexer.index('paper', ref) for ref in references]
            paper.authors = [indexer.index('author', name) for name in authors]
            paper.venue = indexer.index('venue', venue)
            bisect.insort(papers_feature_window, paper)
        elif observation_begin < year <= observation_end:
            # Keep raw strings for now: they can only be resolved once every
            # feature-window paper has been indexed.
            paper.references = references
            paper.authors = authors
            papers_observation_window.append(paper)

    with open(datafile) as handle:
        dataset = handle.read().splitlines()

    record = _blank_record()
    for line in dataset:
        if not line:
            # An empty line terminates the current record.
            finish_record(record)
            record = _blank_record()
            continue
        if len(line) < 2:
            # Too short to carry a tag; previously this raised IndexError.
            continue
        tag = line[1]
        if tag == '%':
            record['references'].append(line[2:])
        elif tag in _RECORD_FIELDS:
            field, start = _RECORD_FIELDS[tag]
            record[field] = line[start:]

    # splitlines() yields no trailing '' for a file ending in a single
    # newline, so the last record would otherwise be silently dropped.
    # This is a no-op when the file already ended with an empty line.
    finish_record(record)

    for paper in papers_observation_window:
        paper.authors = _resolve_known(indexer, 'author', paper.authors)
        paper.references = _resolve_known(indexer, 'paper', paper.references)

    _write_metadata(indexer, stats)

    return papers_feature_window, papers_observation_window, indexer.indices