def get_unique_text_refs():
    """Get the unique INDRA DB TextRefs for all identifiers in CORD19.

    Queries TextRef IDs by the PMCIDs, PMIDs, and DOIs from CORD19,
    deduplicates the IDs that come back, and then loads the TextRef
    entries for that unique set of IDs.

    Returns
    -------
    TextRef query results
        The result of ``db.select_all`` on ``db.TextRef`` restricted to the
        unique TextRef IDs matched by any CORD19 PMCID, PMID, or DOI.
        Note that these are TextRef entries, not bare integer IDs.
    """
    pmcids = get_ids('pmcid')
    # PMIDs and DOIs are passed through their fix_* helpers before querying
    pmids = [fix_pmid(pmid) for pmid in get_ids('pubmed_id')]
    dois = [fix_doi(doi) for doi in get_ids('doi')]
    # Get unique text_refs from the DB
    db = get_primary_db()
    print("Getting TextRefs by PMCID")
    tr_pmcids = db.select_all(db.TextRef.id, db.TextRef.pmcid_in(pmcids))
    print("Getting TextRefs by PMID")
    tr_pmids = db.select_all(db.TextRef.id, db.TextRef.pmid_in(pmids))
    # Unlike the PMCID/PMID lookups, DOIs are queried in batches of 10000
    tr_dois = []
    for ix, doi_batch in enumerate(batch_iter(dois, 10000)):
        print("Getting Text Refs by DOI batch", ix)
        tr_dois.extend(db.select_all(
            db.TextRef.id,
            db.TextRef.doi_in(doi_batch, filter_ids=True)))
    # The same TextRef can be matched through more than one identifier type,
    # so collapse the three result lists into a set of IDs
    ids = {
        res.id
        for res_list in (tr_dois, tr_pmcids, tr_pmids)
        for res in res_list
    }
    print(len(ids), "unique TextRefs in DB")
    return db.select_all(db.TextRef, db.TextRef.id.in_(ids))
def get_indradb_pa_stmts(batch_size=1000, skip_batches=6):
    """Get preassembled INDRA Stmts for PMC articles from INDRA DB.

    DEPRECATED. Get Raw Statements instead.

    The PMCIDs from the corpus metadata are queried in batches. Each
    processed batch is dumped to ``batch_<index>.pkl`` and the statements
    from all processed batches are dumped to ``cord19_pmc_stmts.pkl``.

    Parameters
    ----------
    batch_size : int
        Number of papers to query the DB for at a time. Default: 1000.
    skip_batches : int
        Number of leading batches to skip without querying. The default
        of 6 reproduces the previously hard-coded behavior of skipping
        batches 0 through 5, which means that by default the result does
        NOT contain statements for the first ``6 * batch_size`` papers.
        Pass 0 to process every batch.

    Returns
    -------
    list
        The statement JSONs collected from all processed batches.
    """
    # Get the list of all PMCIDs from the corpus metadata
    pmcids = get_ids('pmcid')
    paper_refs = [('pmcid', p) for p in pmcids]
    stmt_jsons = []
    start = time.time()
    for batch_ix, paper_batch in enumerate(batch_iter(paper_refs, batch_size)):
        if batch_ix < skip_batches:
            continue
        papers = list(paper_batch)
        # Report the actual batch length; the last batch can be shorter
        # than batch_size
        print("Querying DB for statements for %d papers" % len(papers))
        batch_start = time.time()
        result = get_statement_jsons_from_papers(papers)
        batch_elapsed = time.time() - batch_start
        # The result maps statement hashes to JSONs; only the JSONs are kept
        batch_jsons = list(result['statements'].values())
        print("Returned %d stmts in %f sec" %
              (len(batch_jsons), batch_elapsed))
        # Dump each batch separately so partial results exist on disk
        batch_stmts = stmts_from_json(batch_jsons)
        ac.dump_statements(batch_stmts, 'batch_%02d.pkl' % batch_ix)
        stmt_jsons += batch_jsons
    elapsed = time.time() - start
    # NOTE: the paper count printed here is for the whole corpus, including
    # any papers in skipped batches
    print("Total time: %f sec, %d papers" % (elapsed, len(paper_refs)))
    stmts = stmts_from_json(stmt_jsons)
    ac.dump_statements(stmts, 'cord19_pmc_stmts.pkl')
    return stmt_jsons