def set_author_dict(limit=-1, cache_file='../DATA/py_author_cache.pkl'):
    """Populate the global ``authors_by_id`` mapping (record id -> set of
    author names), loading it from a pickle cache when one exists.

    limit      -- maximum number of metadata records to scan (-1 = all);
                  ignored when the cache file is present.
    cache_file -- path of the pickle cache to read from / write to.
    """
    global authors_by_id
    if os.path.exists(cache_file):
        # Cache hit: load the previously computed dict and skip the scan.
        # 'rb' is required: pickle data is binary (text mode corrupts it
        # on Windows and fails under Python 3).
        print("Reading from cache")
        with open(cache_file, 'rb') as cf:
            authors_by_id = pickle.load(cf)
        return
    # Cache miss: scan the metadata JSON directory and build the dict.
    authors_by_id = defaultdict(set)
    for rec_id, meta_dict in get_json_from_dir(META_DIR, limit=limit):
        authors_by_id[rec_id].update(meta_dict['creator'])
    # Persist for subsequent runs ('wb' to match the binary read above).
    with open(cache_file, 'wb') as cf:
        pickle.dump(authors_by_id, cf)
def meta_fill_db(db=db, limit=-1):
    """Create a paper node in the graph for every arXiv metadata record.

    For each record: create a node carrying the record's metadata, link it
    to the global PAPER node via a --[type]--> relation, register it in the
    source_id index, and link it to one author node per 'creator' entry via
    --[author]--> relations.

    db    -- graph database handle (defaults to the module-level ``db``
             captured at definition time).
    limit -- maximum number of records to read (-1 = no limit).
    """
    start = time()
    chunk_size = 1000
    # One transaction per chunk_size records: keeps transactions bounded
    # while avoiding per-record commit overhead.
    record_stream = get_json_from_dir(meta_json_dir, limit=limit)
    for batch_count, batch in enumerate(group_generator(record_stream,
                                                        chunk_size)):
        # print(...) with a single argument works identically on
        # Python 2 and Python 3 (the py2 print statement does not).
        print('Processing metadata record %d. Time elapsed: %d sec.'
              % (batch_count * chunk_size, time() - start))
        with db.transaction:
            for rec_id, meta_dict in batch:
                # Metadata fields arrive as lists; take the first entry.
                paper_node = db.node(
                    label='paper_node arxiv:' + rec_id,
                    title=meta_dict['title'][0],
                    abstract=meta_dict['description'][0],
                    unknown_references=[''],
                    date=meta_dict['date'][0],
                    source_url=meta_dict['identifier'][0],  # Check if really works?
                    source_id='arxiv:' + rec_id,
                    # arxiv_meta_dict = [x for k, v in meta_dict.items()
                    #                    for x in (k, "|".join(v))],
                )
                # Relation paper_node --[type]--> PAPER.
                paper_node.type(PAPER)
                # Register in the source_id index for later lookup.
                source_idx['id'][paper_node['source_id']] = paper_node
                for author_name in meta_dict['creator']:
                    # Get-or-create the author name node and link it:
                    # paper_node --[author]--> author_node.
                    author_node = add_get_author(author_name)
                    paper_node.author(author_node)
        print('closing transaction')
# NOTE(review): this is a byte-for-byte duplicate of the earlier
# meta_fill_db definition and, being later in the module, shadows it —
# one of the two should be deleted.
def meta_fill_db(db=db, limit=-1):
    """Create a paper node in the graph for every arXiv metadata record.

    For each record: create a node carrying the record's metadata, link it
    to the global PAPER node via a --[type]--> relation, register it in the
    source_id index, and link it to one author node per 'creator' entry via
    --[author]--> relations.

    db    -- graph database handle (defaults to the module-level ``db``
             captured at definition time).
    limit -- maximum number of records to read (-1 = no limit).
    """
    start = time()
    chunk_size = 1000
    # One transaction per chunk_size records: keeps transactions bounded
    # while avoiding per-record commit overhead.
    batches = group_generator(get_json_from_dir(meta_json_dir, limit=limit),
                              chunk_size)
    for batch_count, batch in enumerate(batches):
        # print(...) with a single argument works identically on
        # Python 2 and Python 3 (the py2 print statement does not).
        print('Processing metadata record %d. Time elapsed: %d sec.'
              % (batch_count * chunk_size, time() - start))
        with db.transaction:
            for rec_id, meta_dict in batch:
                # Metadata fields arrive as lists; take the first entry.
                paper_node = db.node(
                    label='paper_node arxiv:' + rec_id,
                    title=meta_dict['title'][0],
                    abstract=meta_dict['description'][0],
                    unknown_references=[''],
                    date=meta_dict['date'][0],
                    source_url=meta_dict['identifier'][0],  # Check if really works?
                    source_id='arxiv:' + rec_id,
                    # arxiv_meta_dict = [x for k, v in meta_dict.items()
                    #                    for x in (k, "|".join(v))],
                )
                # Relation paper_node --[type]--> PAPER.
                paper_node.type(PAPER)
                # Register in the source_id index for later lookup.
                source_idx['id'][paper_node['source_id']] = paper_node
                for author_name in meta_dict['creator']:
                    # Get-or-create the author name node and link it:
                    # paper_node --[author]--> author_node.
                    author_node = add_get_author(author_name)
                    paper_node.author(author_node)
        print('closing transaction')