Ejemplo n.º 1
0
    def from_db_reading(cls, db_reading):
        """Build an instance of this class from a reading row of the database.

        Parameters
        ----------
        db_reading : object
            A reading entry as returned by SQL Alchemy. The attributes read
            here are `bytes`, `format`, `reader`, `reader_version`,
            `text_content_id` and `id`.

        Returns
        -------
        An instance of `cls` whose reading payload is None when the row has
        no bytes, the parsed JSON when the row's format is `formats.JSON`,
        and the plain output of `unpack` otherwise.
        """
        if not db_reading.bytes:
            # Nothing was stored for this reading.
            reading = None
        elif db_reading.format == formats.JSON:
            reading = json.loads(unpack(db_reading.bytes))
        else:
            reading = unpack(db_reading.bytes)

        return cls(
            db_reading.text_content_id,
            get_reader_class(db_reading.reader),
            db_reading.reader_version,
            db_reading.format,
            reading,
            db_reading.id,
        )
Ejemplo n.º 2
0
def get_contexts(reach_output):
    """Collect the context of every event frame found in REACH readings.

    Parameters
    ----------
    reach_output : iterable
        Reading objects exposing `reader`, `bytes` and `id`. Entries whose
        `reader` is not 'REACH' are skipped.

    Returns
    -------
    list[tuple] or None
        A list of `(reading.id, context)` pairs, one per event frame that has
        a 'context' key. None is returned as soon as one REACH reading cannot
        be decoded as JSON.
    """
    # Hyphenated JSON keys are rewritten with underscores before parsing.
    key_rewrites = (
        ('frame-id', 'frame_id'),
        ('argument-label', 'argument_label'),
        ('object-meta', 'object_meta'),
        ('doc-id', 'doc_id'),
        ('is-hypothesis', 'is_hypothesis'),
        ('is-negated', 'is_negated'),
        ('is-direct', 'is_direct'),
        ('found-by', 'found_by'),
    )

    collected = []
    for reading in reach_output:
        if reading.reader != 'REACH':
            continue

        # Unzip and decode, then normalise the key names.
        raw_json = unpack(reading.bytes)
        for hyphenated, underscored in key_rewrites:
            raw_json = raw_json.replace(hyphenated, underscored)

        try:
            parsed = json.loads(raw_json)
        except ValueError:
            logger.error('Could not decode JSON string.')
            return None

        frames = objectpath.Tree(parsed).execute("$.events.frames")
        if frames is None:
            continue

        for frame in frames:
            try:
                context = frame['context']
            except KeyError:
                # Frames without a context are simply ignored.
                continue
            else:
                collected.append((reading.id, context))
    return collected
Ejemplo n.º 3
0
def dump_tcs(tcids, dirname):
    """Dump text content and its metadata into a new directory.

    Parameters
    ----------
    tcids : iterable
        Text content IDs to select from the module-level `db`.
    dirname : str
        Directory to create and fill. It must not exist yet.

    Raises
    ------
    ValueError
        If `dirname` already exists.

    Notes
    -----
    One file `<tcid>.txt` (source 'pubmed') or `<tcid>.nxml` (any other
    source) is written per row, plus a `metadata.json` keyed by tcid. The
    directory name and a count of rows per text type are printed first.
    """
    tcs = db.select_all([
        db.TextRef.id, db.TextRef.pmid, db.TextRef.pmcid, db.TextContent.id,
        db.TextContent.source, db.TextContent.text_type, db.TextContent.content
    ], db.TextContent.id.in_(tcids), *db.link(db.TextRef, db.TextContent))

    # Count rows per text type. text_type is the second-to-last selected
    # column; the last one is the content blob itself.
    tt_counts = {}
    for row in tcs:
        tt = row[-2]
        tt_counts[tt] = tt_counts.get(tt, 0) + 1

    print(dirname, tt_counts)

    # Create the directory atomically rather than check-then-create, so a
    # concurrently created directory is still reported as already existing.
    try:
        os.mkdir(dirname)
    except FileExistsError as err:
        raise ValueError(f"Directory {dirname} already exists.") from err

    metadata = {}
    for trid, pmid, pmcid, tcid, src, tt, cont_bytes in tcs:
        metadata[tcid] = {
            'trid': trid,
            'pmid': pmid,
            'tcid': tcid,
            'pmcid': pmcid,
            'source': src,
            'text_type': tt
        }
        fmt = 'txt' if src == 'pubmed' else 'nxml'
        # NOTE(review): assumes unpack() returns str, since the file is
        # opened in text mode - confirm.
        with open(f'{dirname}/{tcid}.{fmt}', 'w') as f:
            f.write(unpack(cont_bytes))
    with open(f'{dirname}/metadata.json', 'w') as f:
        json.dump(metadata, f, indent=2)
Ejemplo n.º 4
0
def get_reader_output(db,
                      ref_id,
                      ref_type='tcid',
                      reader=None,
                      reader_version=None):
    """Return reader output for a given text content.

    Parameters
    ----------
    db : :py:class:`DatabaseManager`
        Reference to the DB to query
    ref_id : int or str
        The text reference ID whose reader output should be returned
    ref_type : Optional[str]
        The type of ID to look for, options include
        'tcid' for the database's internal unique text content ID,
        or 'pmid', 'pmcid', 'doi', 'pii', 'manuscript_id'
        Default: 'tcid'
    reader : Optional[str]
        The name of the reader whose output is of interest. It is
        upper-cased before being compared with the stored reader name.
    reader_version : Optional[str]
        The specific version of the reader

    Returns
    -------
    reading_results : dict{dict{list[str]}}
        A dict of reader outputs that match the query criteria, indexed first
        by text content id, then by reader. Readings stored with no content
        appear as None in the list. The dict is empty when nothing matches,
        including when `ref_id` resolves to no text refs.
    """
    reading_dict = defaultdict(lambda: defaultdict(list))

    if ref_type == 'tcid':
        clauses = [db.Reading.text_content_id == ref_id]
    else:
        trids = _get_trids(db, ref_id, ref_type)
        if not trids:
            # Same (empty) mapping type as every other return path.
            return reading_dict
        logger.debug("Found %d text ref ids.", len(trids))
        clauses = [
            db.TextContent.text_ref_id.in_(trids),
            db.Reading.text_content_id == db.TextContent.id
        ]
    if reader:
        clauses.append(db.Reading.reader == reader.upper())
    if reader_version:
        clauses.append(db.Reading.reader_version == reader_version)

    res = db.select_all(
        [db.Reading.text_content_id, db.Reading.reader, db.Reading.bytes],
        *clauses)
    # `row_reader` keeps the loop from shadowing the `reader` parameter.
    for tcid, row_reader, result in res:
        if not result:
            # Covers both empty bytes and a NULL column value.
            logger.warning("Got reading result with zero content.")
            unpacked_result = None
        else:
            unpacked_result = unpack(result)
        reading_dict[tcid][row_reader].append(unpacked_result)
    return reading_dict
Ejemplo n.º 5
0
def _get_trid_title(trid):
    """Return a title for a text ref ID, or None if none can be found.

    The primary database is checked first for a 'title' text content entry.
    Failing that, the text ref's identifiers are tried in the order PMID,
    PMCID, DOI, and the first title obtained is returned.

    Parameters
    ----------
    trid : int
        The text ref ID to look up.

    Returns
    -------
    The title, or None when the text ref is unknown or no lookup yields one.
    """
    db = get_db('primary')
    tc = db.select_one(db.TextContent, db.TextContent.text_ref_id == trid,
                       db.TextContent.text_type == 'title')
    if tc:
        return unpack(tc.content)

    tr = db.select_one(db.TextRef, db.TextRef.id == trid)
    if tr is None:
        # NOTE(review): presumably select_one yields None when nothing
        # matches (as the `if tc` check above suggests) - confirm.
        return None
    ref_dict = tr.get_ref_dict()

    if 'PMID' in ref_dict:
        pmid = ref_dict['PMID']
        pmids_to_titles = _get_pmid_titles([pmid])
        if pmid in pmids_to_titles:
            return pmids_to_titles[pmid]
    if 'PMCID' in ref_dict:
        title = _get_pmcid_title(ref_dict['PMCID'])
        if title:
            return title
    if 'DOI' in ref_dict:
        title = _get_doi_title(ref_dict['DOI'])
        if title:
            return title
    return None
Ejemplo n.º 6
0
    def get_paper_titles_and_links(self, trids):
        """Return titles and publication links for a set of text ref IDs.

        Parameters
        ----------
        trids : list
            Text ref IDs to look up in the primary database.

        Returns
        -------
        tuple(dict, dict)
            `(trid_to_title, trid_to_link)`, both keyed by the TRID as a
            string. Two empty dicts are returned when `self.paper_id_type`
            is 'pii'.

        Notes
        -----
        Each text ref is looked up through exactly one identifier, the first
        available of PMID, PMCID, DOI. TRIDs whose lookup yields no title are
        retried against the 'title' text content stored in the database.
        """
        if self.paper_id_type == 'pii':
            return {}, {}

        db = get_db('primary')
        text_refs = db.select_all(db.TextRef, db.TextRef.id.in_(trids))
        ref_dicts = [text_ref.get_ref_dict() for text_ref in text_refs]

        trid_to_title = {}
        trid_to_link = {}
        trid_to_pmids = {}
        trid_to_pmcids = {}
        trid_to_dois = {}
        check_in_db = []

        # Record the link for every ref and bucket it by the first
        # identifier it has: PMID, then PMCID, then DOI.
        for ref_dict in ref_dicts:
            link = _get_publication_link(ref_dict)
            trid_to_link[str(ref_dict['TRID'])] = link
            if ref_dict.get('PMID'):
                bucket, id_key = trid_to_pmids, 'PMID'
            elif ref_dict.get('PMCID'):
                bucket, id_key = trid_to_pmcids, 'PMCID'
            elif ref_dict.get('DOI'):
                bucket, id_key = trid_to_dois, 'DOI'
            else:
                continue
            bucket[ref_dict['TRID']] = ref_dict[id_key]

        logger.info(f'From {len(trids)} TRIDs got {len(trid_to_pmids)} PMIDs,'
                    f' {len(trid_to_pmcids)} PMCIDs, {len(trid_to_dois)} DOIs')

        # PMIDs are resolved in a single batch call.
        if trid_to_pmids:
            logger.info(f'Getting titles for {len(trid_to_pmids)} PMIDs')
            pmids_to_titles = _get_pmid_titles(list(trid_to_pmids.values()))
            for trid, pmid in trid_to_pmids.items():
                if pmid not in pmids_to_titles:
                    check_in_db.append(trid)
                    continue
                trid_to_title[str(trid)] = pmids_to_titles[pmid]

        # PMCIDs are resolved one at a time.
        if trid_to_pmcids:
            logger.info(f'Getting titles for {len(trid_to_pmcids)} PMCIDs')
            for trid, pmcid in trid_to_pmcids.items():
                title = _get_pmcid_title(pmcid)
                if not title:
                    check_in_db.append(trid)
                    continue
                trid_to_title[str(trid)] = title

        # DOIs are resolved one at a time.
        if trid_to_dois:
            logger.info(f'Getting titles for {len(trid_to_dois)} DOIs')
            for trid, doi in trid_to_dois.items():
                title = _get_doi_title(doi)
                if not title:
                    check_in_db.append(trid)
                    continue
                trid_to_title[str(trid)] = title

        # Whatever failed above is retried against stored title content.
        if check_in_db:
            logger.info(f'Getting titles for {len(check_in_db)} remaining '
                        'TRIDs from DB')
            title_rows = db.select_all(
                db.TextContent,
                db.TextContent.text_ref_id.in_(check_in_db),
                db.TextContent.text_type == 'title')
            for title_row in title_rows:
                title = unpack(title_row.content)
                trid_to_title[str(title_row.text_ref_id)] = title

        return trid_to_title, trid_to_link
Ejemplo n.º 7
0
def get_content_by_refs(db, pmid_list=None, trid_list=None, sources=None,
                        formats=None, content_type='abstract', unzip=True):
    """Return content from the database given a list of PMIDs or text ref ids.

    Note that either pmid_list OR trid_list must be set, and only one can be
    set at a time.

    Parameters
    ----------
    db : :py:class:`DatabaseManager`
        Reference to the DB to query
    pmid_list : list[str] or None
        A list of pmids. Default is None, in which case trid_list must be
        given.
    trid_list : list[int] or None
        A list of text ref ids. Default is None, in which case pmid list must
        be given.
    sources : list[str] or None
        A list of sources to include (e.g. 'pmc_oa', or 'pubmed'). Default is
        None, indicating that all sources will be included.
    formats : list[str]
        A list of the formats to be included ('xml', 'text'). Default is None,
        indicating that all formats will be included.
    content_type : str
        Select the type of content to load ('abstract' or 'fulltext'). Note
        that not all refs will have any, or both, types of content.
    unzip : Optional[bool]
        If True, the compressed output is decompressed into clear text.
        Default: True

    Returns
    -------
    content_dict : dict
        A dictionary whose keys are PMIDs (when `pmid_list` is given) or text
        ref ids (when `trid_list` is given), with each value being the
        corresponding content. If several rows share a key, the last one
        returned by the query wins.

    Raises
    ------
    ValueError
        If neither or both of `pmid_list` and `trid_list` are given (an empty
        list counts as not given), or if `content_type` is not recognized.
    """
    # Make sure we get exactly one type of list.
    if not pmid_list and not trid_list:
        raise ValueError("One of `pmid_list` or `trid_list` must be defined.")
    if pmid_list and trid_list:
        raise ValueError("Only one of `pmid_list` or `trid_list` may be used.")

    # Put together the clauses for the general constraints.
    clauses = []
    if sources is not None:
        clauses.append(db.TextContent.source.in_(sources))
    if formats is not None:
        clauses.append(db.TextContent.format.in_(formats))
    if content_type not in ['abstract', 'fulltext']:
        raise ValueError("Unrecognized content type: %s" % content_type)
    clauses.append(db.TextContent.text_type == content_type)

    # Do the query to get the content.
    if pmid_list:
        content_list = db.select_all(
            [db.TextRef.pmid, db.TextContent.content],
            db.TextRef.id == db.TextContent.text_ref_id,
            db.TextRef.pmid.in_(pmid_list),
            *clauses
            )
    else:
        # Take the text ref id from TextContent itself: selecting
        # TextRef.id here without a join condition would pair every
        # matching content row with every text ref.
        content_list = db.select_all(
            [db.TextContent.text_ref_id, db.TextContent.content],
            db.TextContent.text_ref_id.in_(trid_list),
            *clauses
            )

    if unzip:
        content_dict = {id_val: unpack(content)
                        for id_val, content in content_list}
    else:
        content_dict = dict(content_list)
    return content_dict