def from_db_reading(cls, db_reading):
    """Build an instance of this class from a database reading row.

    Parameters
    ----------
    db_reading : object
        A reading entry from the database (as returned by SQL Alchemy). The
        attributes read here are `bytes`, `format`, `reader`,
        `reader_version`, `text_content_id` and `id`.

    Returns
    -------
    An instance of `cls` holding the unpacked reading content, or None as
    the content if the row has no bytes.
    """
    raw_bytes = db_reading.bytes
    if not raw_bytes:
        # Nothing stored for this reading.
        reading = None
    elif db_reading.format == formats.JSON:
        # JSON readings are stored packed, so unpack first and then parse.
        reading = json.loads(unpack(raw_bytes))
    else:
        reading = unpack(raw_bytes)

    return cls(
        db_reading.text_content_id,
        get_reader_class(db_reading.reader),
        db_reading.reader_version,
        db_reading.format,
        reading,
        db_reading.id,
    )
def get_contexts(reach_output):
    """Collect (reading id, context) pairs from REACH reading results.

    Parameters
    ----------
    reach_output : iterable
        Reading entries exposing `reader`, `bytes` and `id` attributes.
        Entries whose reader is not 'REACH' are skipped.

    Returns
    -------
    list[tuple] or None
        One `(reading.id, context)` tuple for every event frame that has a
        'context' key. None is returned as soon as one reading's JSON
        cannot be decoded.
    """
    # Hyphenated keys in the REACH JSON that are rewritten with underscores
    # before the JSON is parsed.
    hyphenated_keys = (
        'frame-id',
        'argument-label',
        'object-meta',
        'doc-id',
        'is-hypothesis',
        'is-negated',
        'is-direct',
        'found-by',
    )

    collected = []
    for reading in reach_output:
        if reading.reader != 'REACH':
            continue

        # Unzip and decode
        json_str = unpack(reading.bytes)
        for key in hyphenated_keys:
            json_str = json_str.replace(key, key.replace('-', '_'))

        try:
            json_dict = json.loads(json_str)
        except ValueError:
            logger.error('Could not decode JSON string.')
            return None

        frames = objectpath.Tree(json_dict).execute("$.events.frames")
        if frames is None:
            continue

        for frame in frames:
            try:
                context = frame['context']
                collected.append((reading.id, context))
            except KeyError:
                # Frames without a context are simply not reported.
                continue
    return collected
def dump_tcs(tcids, dirname):
    """Dump text content entries to files in a newly created directory.

    The content for each given text content ID is loaded through the
    module-level `db`, unpacked, and written to `<dirname>/<tcid>.<fmt>`,
    where the extension is 'txt' for content whose source is 'pubmed' and
    'nxml' otherwise. A `metadata.json` file mapping each text content ID
    to its reference IDs, source and text type is written alongside.

    Parameters
    ----------
    tcids : list
        The text content IDs to dump.
    dirname : str
        The directory to create and write into. It must not exist yet.

    Raises
    ------
    ValueError
        If `dirname` already exists.
    """
    tcs = db.select_all(
        [
            db.TextRef.id,
            db.TextRef.pmid,
            db.TextRef.pmcid,
            db.TextContent.id,
            db.TextContent.source,
            db.TextContent.text_type,
            db.TextContent.content,
        ],
        db.TextContent.id.in_(tcids),
        *db.link(db.TextRef, db.TextContent)
    )

    # Tally the rows per text type. The text type is the second-to-last
    # selected column; the last one is the content itself, so indexing with
    # row[-1] would key the counts by content rather than by text type.
    tt_counts = {}
    for *_, tt, _content in tcs:
        tt_counts[tt] = tt_counts.get(tt, 0) + 1
    print(dirname, tt_counts)

    # Refuse to write into an existing directory so that an earlier dump is
    # never silently mixed with or overwritten by this one.
    if not os.path.exists(dirname):
        os.mkdir(dirname)
    else:
        raise ValueError(f"Directory {dirname} already exists.")

    metadata = {}
    for trid, pmid, pmcid, tcid, src, tt, cont_bytes in tcs:
        metadata[tcid] = {
            'trid': trid,
            'pmid': pmid,
            'tcid': tcid,
            'pmcid': pmcid,
            'source': src,
            'text_type': tt,
        }
        fmt = 'txt' if src == 'pubmed' else 'nxml'
        with open(f'{dirname}/{tcid}.{fmt}', 'w') as f:
            f.write(unpack(cont_bytes))

    with open(f'{dirname}/metadata.json', 'w') as f:
        json.dump(metadata, f, indent=2)
def get_reader_output(db, ref_id, ref_type='tcid', reader=None,
                      reader_version=None):
    """Return reader output for a given text content.

    Parameters
    ----------
    db : :py:class:`DatabaseManager`
        Reference to the DB to query
    ref_id : int or str
        The text reference ID whose reader output should be returned
    ref_type : Optional[str]
        The type of ID to look for, options include
        'tcid' for the database's internal unique text content ID,
        or 'pmid', 'pmcid', 'doi', 'pii', 'manuscript_id'
        Default: 'tcid'
    reader : Optional[str]
        The name of the reader whose output is of interest. The name is
        upper-cased before it is compared with the stored reader name.
    reader_version : Optional[str]
        The specific version of the reader

    Returns
    -------
    reading_results : dict{dict{list[str]}}
        A dict of reader outputs that match the query criteria, indexed first
        by text content id, then by reader. Readings with no content are
        represented by None in the list. The dict is empty if no text refs
        match the given ID.
    """
    if ref_type == 'tcid':
        clauses = [db.Reading.text_content_id == ref_id]
    else:
        trids = _get_trids(db, ref_id, ref_type)
        if not trids:
            # Return an empty dict, not a list, so the result always has the
            # documented type and callers can use `.items()` unconditionally.
            return {}
        logger.debug("Found %d text ref ids.", len(trids))
        clauses = [
            db.TextContent.text_ref_id.in_(trids),
            db.Reading.text_content_id == db.TextContent.id,
        ]
    if reader:
        clauses.append(db.Reading.reader == reader.upper())
    if reader_version:
        clauses.append(db.Reading.reader_version == reader_version)

    res = db.select_all(
        [db.Reading.text_content_id, db.Reading.reader, db.Reading.bytes],
        *clauses
    )

    reading_dict = defaultdict(lambda: defaultdict(list))
    for tcid, reader_name, result in res:
        # `not result` covers both empty bytes and a missing (None) value,
        # for which `len()` would raise a TypeError.
        if not result:
            logger.warning("Got reading result with zero content.")
            unpacked_result = None
        else:
            unpacked_result = unpack(result)
        reading_dict[tcid][reader_name].append(unpacked_result)
    return reading_dict
def _get_trid_title(trid):
    """Return the title of the paper with the given text ref ID, if found.

    A title stored as text content in the primary database is preferred.
    Otherwise the text ref's identifiers are tried in the order PMID,
    PMCID, DOI, using the corresponding title lookup helpers.

    Parameters
    ----------
    trid : int
        The text ref ID of the paper.

    Returns
    -------
    str or None
        The title, or None if no text ref with this ID is found or none of
        the lookups produced a title.
    """
    db = get_db('primary')
    tc = db.select_one(db.TextContent,
                       db.TextContent.text_ref_id == trid,
                       db.TextContent.text_type == 'title')
    if tc:
        return unpack(tc.content)

    tr = db.select_one(db.TextRef, db.TextRef.id == trid)
    if tr is None:
        # No such text ref: there are no identifiers to look up, so report
        # "no title" instead of failing on `None.get_ref_dict()`.
        return None
    ref_dict = tr.get_ref_dict()

    if 'PMID' in ref_dict:
        pmid = ref_dict['PMID']
        pmids_to_titles = _get_pmid_titles([pmid])
        if pmid in pmids_to_titles:
            return pmids_to_titles[pmid]
    if 'PMCID' in ref_dict:
        title = _get_pmcid_title(ref_dict['PMCID'])
        if title:
            return title
    if 'DOI' in ref_dict:
        title = _get_doi_title(ref_dict['DOI'])
        if title:
            return title
    return None
def get_paper_titles_and_links(self, trids):
    """Return titles and publication links for the given text ref IDs.

    Parameters
    ----------
    trids : list
        Text ref IDs to look up in the primary database.

    Returns
    -------
    tuple(dict, dict)
        The first dict maps text ref IDs (as strings) to titles, the second
        maps text ref IDs (as strings) to publication links. Both are empty
        when `self.paper_id_type` is 'pii'.
    """
    if self.paper_id_type == 'pii':
        return {}, {}

    db = get_db('primary')
    text_refs = db.select_all(db.TextRef, db.TextRef.id.in_(trids))
    ref_dicts = [text_ref.get_ref_dict() for text_ref in text_refs]

    titles = {}
    links = {}
    pmid_by_trid = {}
    pmcid_by_trid = {}
    doi_by_trid = {}
    unresolved = []

    # Record a link for every text ref, and file each text ref under the
    # first identifier it has, in the order PMID, PMCID, DOI.
    for ref_dict in ref_dicts:
        trid = ref_dict['TRID']
        links[str(trid)] = _get_publication_link(ref_dict)
        if ref_dict.get('PMID'):
            pmid_by_trid[trid] = ref_dict['PMID']
        elif ref_dict.get('PMCID'):
            pmcid_by_trid[trid] = ref_dict['PMCID']
        elif ref_dict.get('DOI'):
            doi_by_trid[trid] = ref_dict['DOI']
    logger.info(f'From {len(trids)} TRIDs got {len(pmid_by_trid)} PMIDs,'
                f' {len(pmcid_by_trid)} PMCIDs, {len(doi_by_trid)} DOIs')

    # PMIDs are resolved with a single batched lookup.
    if pmid_by_trid:
        logger.info(f'Getting titles for {len(pmid_by_trid)} PMIDs')
        titles_by_pmid = _get_pmid_titles(list(pmid_by_trid.values()))
        for trid, pmid in pmid_by_trid.items():
            if pmid in titles_by_pmid:
                titles[str(trid)] = titles_by_pmid[pmid]
            else:
                unresolved.append(trid)

    # PMCIDs and then DOIs are resolved one identifier at a time.
    one_by_one = (
        (pmcid_by_trid, _get_pmcid_title, 'PMCIDs'),
        (doi_by_trid, _get_doi_title, 'DOIs'),
    )
    for id_by_trid, fetch_title, label in one_by_one:
        if not id_by_trid:
            continue
        logger.info(f'Getting titles for {len(id_by_trid)} {label}')
        for trid, paper_id in id_by_trid.items():
            title = fetch_title(paper_id)
            if title:
                titles[str(trid)] = title
            else:
                unresolved.append(trid)

    # Whatever could not be resolved above is looked up among the title
    # text contents stored in the database.
    if unresolved:
        logger.info(f'Getting titles for {len(unresolved)} remaining '
                    'TRIDs from DB')
        title_contents = db.select_all(
            db.TextContent,
            db.TextContent.text_ref_id.in_(unresolved),
            db.TextContent.text_type == 'title')
        for tc in title_contents:
            titles[str(tc.text_ref_id)] = unpack(tc.content)

    return titles, links
def get_content_by_refs(db, pmid_list=None, trid_list=None, sources=None,
                        formats=None, content_type='abstract', unzip=True):
    """Return content from the database given a list of PMIDs or text ref ids.

    Note that either pmid_list OR trid_list must be set, and only one can be
    set at a time.

    Parameters
    ----------
    db : :py:class:`DatabaseManager`
        Reference to the DB to query
    pmid_list : list[str] or None
        A list of pmids. Default is None, in which case trid_list must be
        given.
    trid_list : list[int] or None
        A list of text ref ids. Default is None, in which case pmid list must
        be given.
    sources : list[str] or None
        A list of sources to include (e.g. 'pmc_oa', or 'pubmed'). Default is
        None, indicating that all sources will be included.
    formats : list[str]
        A list of the formats to be included ('xml', 'text'). Default is None,
        indicating that all formats will be included.
    content_type : str
        Select the type of content to load ('abstract' or 'fulltext'). Note
        that not all refs will have any, or both, types of content.
    unzip : Optional[bool]
        If True, the compressed output is decompressed into clear text.
        Default: True

    Returns
    -------
    content_dict : dict
        A dictionary whose keys are PMIDs if `pmid_list` was given, or text
        ref ids if `trid_list` was given, with each value being the
        corresponding content.

    Raises
    ------
    ValueError
        If neither or both of `pmid_list` and `trid_list` are given (an
        empty list counts as not given), or if `content_type` is not
        recognized.
    """
    # Make sure we only get one type of list. The parentheses matter here:
    # `not pmid_list or trid_list` would reject every call using trid_list.
    if not (pmid_list or trid_list):
        raise ValueError("One of `pmid_list` or `trid_list` must be defined.")
    if pmid_list and trid_list:
        raise ValueError("Only one of `pmid_list` or `trid_list` may be used.")
    if content_type not in ('abstract', 'fulltext'):
        raise ValueError("Unrecognized content type: %s" % content_type)

    # Put together the clauses for the general constraints.
    clauses = [db.TextRef.id == db.TextContent.text_ref_id]
    if sources is not None:
        clauses.append(db.TextContent.source.in_(sources))
    if formats is not None:
        clauses.append(db.TextContent.format.in_(formats))
    clauses.append(db.TextContent.text_type == content_type)

    # Do the query to get the content. Both queries select a TextRef column
    # together with a TextContent column, so both need the condition linking
    # the two tables (the first clause above).
    if pmid_list:
        content_list = db.select_all(
            [db.TextRef.pmid, db.TextContent.content],
            db.TextRef.pmid.in_(pmid_list),
            *clauses
        )
    else:
        content_list = db.select_all(
            [db.TextRef.id, db.TextContent.content],
            db.TextContent.text_ref_id.in_(trid_list),
            *clauses
        )

    if unzip:
        return {id_val: unpack(content) for id_val, content in content_list}
    return dict(content_list)