Ejemplo n.º 1
0
def test_id_lookup_no_pmid():
    """Check DOI <-> PMCID lookups for a paper that has no PMID."""
    doi = '10.1083/jcb.1974if'
    pmcid = 'PMC3352949'
    # Resolve the PMCID starting from the DOI
    by_doi = id_lookup(doi, 'doi')
    assert by_doi['pmcid'] == pmcid
    # ...and the DOI starting from the PMCID
    by_pmcid = id_lookup(pmcid, 'pmcid')
    assert by_pmcid['doi'] == doi
    assert unicode_strs(by_pmcid)
Ejemplo n.º 2
0
def test_id_lookup_no_pmid():
    """Check DOI <-> PMCID lookups for a paper that has no PMID."""
    doi = '10.1083/jcb.1974if'
    pmcid = 'PMC3352949'
    # Resolve the PMCID starting from the DOI
    by_doi = id_lookup(doi, 'doi')
    assert by_doi['pmcid'] == pmcid
    # ...and the DOI starting from the PMCID
    by_pmcid = id_lookup(pmcid, 'pmcid')
    assert by_pmcid['doi'] == doi
    assert unicode_strs(by_pmcid)
Ejemplo n.º 3
0
 def __init__(self, xml_etree):
     """Index the sems of the given XML tree and determine its PMID.

     Parameters
     ----------
     xml_etree :
         The parsed XML content. It must provide ``findall`` and ``attrib``
         (presumably an ElementTree element -- confirm against the caller).
     """
     self.tree = xml_etree
     self.statements = []
     # Group (sem, sentence text) pairs by the category of the sem's ref
     self._sems = collections.defaultdict(list)
     for interpretation in self.tree.findall('interpretation'):
         sentence_text = interpretation.find('sentence-text').text
         for sem_elem in interpretation.findall('sem'):
             ref_elem = sem_elem.find('ref')
             if ref_elem is None:
                 continue
             self._sems[ref_elem.attrib['category']].append(
                 (sem_elem, sentence_text))
     # Citation info: prefer the 'pmid' attribute, then 'id', and only
     # resort to a PMCID-based lookup if neither is set
     attribs = self.tree.attrib
     pmcid = attribs.get('pmcid')
     pmid = attribs.get('pmid') or attribs.get('id')
     self.pmid = None
     if pmid:
         # Drop a leading 'PMID' prefix if there is one
         self.pmid = pmid[4:] if pmid.startswith('PMID') else pmid
     elif pmcid:
         looked_up = id_lookup(pmcid, 'pmcid').get('pmid')
         if looked_up is not None:
             self.pmid = looked_up
Ejemplo n.º 4
0
def process_pmc(pmc_id, offline=False, output_fname=default_output_fname):
    """Fetch a PMC article, read it with REACH and return the processor.

    The full text is obtained through the PMC client and saved as
    ``<pmc_id>.nxml`` (relative path) before being handed over to
    ``process_nxml_file``. If the full text is not available, None is
    returned.

    Parameters
    ----------
    pmc_id : str
        The ID of a PubmedCentral article. The string may start with PMC but
        passing just the ID also works.
        Examples: 3717945, PMC3717945
        https://www.ncbi.nlm.nih.gov/pmc/
    offline : Optional[bool]
        If set to True, the REACH system is run offline. Otherwise (by
        default) the web service is called. Default: False
    output_fname : Optional[str]
        Forwarded unchanged to ``process_nxml_file`` as its ``output_fname``
        argument.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    """
    nxml_content = pmc_client.get_xml(pmc_id)
    if nxml_content is None:
        return None
    # Persist the content so that it can be processed as a file
    nxml_path = pmc_id + '.nxml'
    with open(nxml_path, 'wb') as nxml_file:
        nxml_file.write(nxml_content.encode('utf-8'))
    # The PMID (if any) is used as the citation of the extractions
    pmid = id_lookup(pmc_id, 'pmcid').get('pmid')
    return process_nxml_file(nxml_path, citation=pmid, offline=offline,
                             output_fname=output_fname)
Ejemplo n.º 5
0
 def __init__(self, xml_etree):
     """Index the sems of the given XML tree and determine its PMID.

     Parameters
     ----------
     xml_etree :
         The parsed XML content. It must provide ``findall`` and ``attrib``
         (presumably an ElementTree element -- confirm against the caller).
     """
     self.tree = xml_etree
     self.statements = []
     # Group (sem, sentence text) pairs by the category of the sem's ref
     self._sems = collections.defaultdict(list)
     for interpretation in self.tree.findall('interpretation'):
         sentence_text = interpretation.find('sentence-text').text
         for sem_elem in interpretation.findall('sem'):
             ref_elem = sem_elem.find('ref')
             if ref_elem is None:
                 continue
             self._sems[ref_elem.attrib['category']].append(
                 (sem_elem, sentence_text))
     # Citation info: prefer the 'pmid' attribute, then 'id', and only
     # resort to a PMCID-based lookup if neither is set
     attribs = self.tree.attrib
     pmcid = attribs.get('pmcid')
     pmid = attribs.get('pmid') or attribs.get('id')
     self.pmid = None
     if pmid:
         # Drop a leading 'PMID' prefix if there is one
         self.pmid = pmid[4:] if pmid.startswith('PMID') else pmid
     elif pmcid:
         looked_up = id_lookup(pmcid, 'pmcid').get('pmid')
         if looked_up is not None:
             self.pmid = looked_up
Ejemplo n.º 6
0
def process_pmc(pmc_id, offline=False):
    """Fetch a PMC article, read it with REACH and return the processor.

    The full text is obtained through the PMC client and saved as
    ``<pmc_id>.nxml`` (relative path) before being handed over to
    ``process_nxml_file``. If the full text is not available, None is
    returned.

    Parameters
    ----------
    pmc_id : str
        The ID of a PubmedCentral article. The string may start with PMC but
        passing just the ID also works.
        Examples: 3717945, PMC3717945
        https://www.ncbi.nlm.nih.gov/pmc/
    offline : Optional[bool]
        If set to True, the REACH system is run offline. Otherwise (by
        default) the web service is called. Default: False

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    """
    nxml_content = pmc_client.get_xml(pmc_id)
    if nxml_content is None:
        return None
    # Persist the content so that it can be processed as a file
    nxml_path = pmc_id + '.nxml'
    with open(nxml_path, 'wb') as nxml_file:
        nxml_file.write(nxml_content.encode('utf-8'))
    # The PMID (if any) is used as the citation of the extractions
    pmid = id_lookup(pmc_id, 'pmcid').get('pmid')
    return process_nxml_file(nxml_path, citation=pmid, offline=offline)
Ejemplo n.º 7
0
def process_pmc(pmc_id,
                offline=False,
                url=None,
                output_fname=default_output_fname):
    """Fetch a PMC article, read it with REACH and return the processor.

    The full text is obtained through the PMC client and saved as
    ``<pmc_id>.nxml`` (relative path) before being handed over to
    ``process_nxml_file``. If the full text is not available, None is
    returned.

    Parameters
    ----------
    pmc_id : str
        The ID of a PubmedCentral article. The string may start with PMC but
        passing just the ID also works.
        Examples: 3717945, PMC3717945
        https://www.ncbi.nlm.nih.gov/pmc/
    offline : Optional[bool]
        If set to True, the REACH system is run offline via a JAR file.
        Otherwise (by default) the web service is called. Default: False
    url : Optional[str]
        URL for a REACH web service instance, which is used for reading if
        provided. If not provided but offline is set to False (its default
        value), the Arizona REACH web service is called
        (http://agathon.sista.arizona.edu:8080/odinweb/api/help).
        Default: None
    output_fname : Optional[str]
        The file to output the REACH JSON output to.
        Defaults to reach_output.json in current working directory.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    """
    # Step 1: fetch the article content from PMC
    logger.info('Loading %s from PMC' % pmc_id)
    nxml_content = pmc_client.get_xml(pmc_id)
    if nxml_content is None:
        return None
    # Step 2: persist the content so that it can be processed as a file
    nxml_path = pmc_id + '.nxml'
    with open(nxml_path, 'wb') as nxml_file:
        nxml_file.write(nxml_content.encode('utf-8'))
    # Step 3: find the PMID of the paper so that the pmid attribute of
    # the evidence can be set correctly
    logger.info('Looking up PMID for %s' % pmc_id)
    pmid = id_lookup(pmc_id, 'pmcid').get('pmid')
    # Step 4: run REACH on the saved NXML file with the given arguments
    logger.info('Processing %s with REACH' % pmc_id)
    reach_kwargs = dict(citation=pmid,
                        offline=offline,
                        url=url,
                        output_fname=output_fname)
    return process_nxml_file(nxml_path, **reach_kwargs)
Ejemplo n.º 8
0
def get_pmc_id(stmt):
    """Return a PMC ID for the evidence of a Statement, or '' if none.

    Each evidence's PMID is looked up with ``id_lookup``. If several
    evidences resolve to a PMC ID, the one belonging to the last such
    evidence is returned.

    Parameters
    ----------
    stmt :
        An object whose ``evidence`` attribute is an iterable of objects
        with a ``pmid`` attribute.

    Returns
    -------
    str
        The PMC ID, always carrying the 'PMC' prefix, or an empty string if
        no evidence could be resolved to a PMC ID.
    """
    pmc_id = ''
    for ev in stmt.evidence:
        # .get() so that a lookup result without a 'pmcid' entry does not
        # raise a KeyError
        found = id_lookup(ev.pmid, 'pmid').get('pmcid')
        if not found:
            # Keep whatever was found for a previous evidence rather than
            # resetting the result to ''
            continue
        pmc_id = found if found.startswith('PMC') else 'PMC' + found
    return str(pmc_id)
Ejemplo n.º 9
0
def get_pmc_id(stmt):
    """Return a PMC ID for the evidence of a Statement, or '' if none.

    Each evidence's PMID is looked up with ``id_lookup``. If several
    evidences resolve to a PMC ID, the one belonging to the last such
    evidence is returned.

    Parameters
    ----------
    stmt :
        An object whose ``evidence`` attribute is an iterable of objects
        with a ``pmid`` attribute.

    Returns
    -------
    str
        The PMC ID, always carrying the 'PMC' prefix, or an empty string if
        no evidence could be resolved to a PMC ID.
    """
    pmc_id = ''
    for ev in stmt.evidence:
        # .get() so that a lookup result without a 'pmcid' entry does not
        # raise a KeyError
        found = id_lookup(ev.pmid, 'pmid').get('pmcid')
        if not found:
            # Keep whatever was found for a previous evidence rather than
            # resetting the result to ''
            continue
        pmc_id = found if found.startswith('PMC') else 'PMC' + found
    return str(pmc_id)
Ejemplo n.º 10
0
 def _get_evidence(self, card):
     """Return a list of Evidence objects built from a card's evidence texts.

     The PMID attached to each Evidence is looked up from the card's
     'pmc_id' entry. An empty list is returned if the card has no
     'evidence' entry.
     """
     pmid = id_lookup(card.get('pmc_id'), 'pmcid').get('pmid')
     texts = card.get('evidence')
     if texts is None:
         return []
     return [Evidence(self.source_api, pmid=pmid, text=txt) for txt in texts]
Ejemplo n.º 11
0
 def _get_evidence(self, card):
     """Return a list of Evidence objects built from a card's evidence texts.

     The PMID attached to each Evidence is looked up from the card's
     'pmc_id' entry. An empty list is returned if the card has no
     'evidence' entry.
     """
     pmid = id_lookup(card.get('pmc_id'), 'pmcid').get('pmid')
     texts = card.get('evidence')
     if texts is None:
         return []
     return [Evidence(self.source_api, pmid=pmid, text=txt) for txt in texts]
Ejemplo n.º 12
0
def get_sample(pmids, k, fname):
    """Download the PMC XML of up to k randomly chosen papers.

    The PMIDs are visited in random order. Every PMC ID that can be found
    for a PMID is written to ``fname`` (one per line), and its XML is
    requested from the PMC client. Each XML that is obtained is saved as
    ``docs/pmc_xmls/<pmcid>.nxml``. Sampling stops once k XML files have
    been saved.

    Parameters
    ----------
    pmids : list
        The PMIDs to sample from. The list is not modified.
    k : int
        The number of XML files to download. If k is not positive, nothing
        is downloaded and ``fname`` is left empty.
    fname : str
        Path of the file to which the visited PMC IDs are written.
    """
    # Shuffle a copy so that the caller's list keeps its order
    shuffled = list(pmids)
    random.shuffle(shuffled)
    done = 0
    with open(fname, 'w') as fh:
        for pmid in shuffled:
            # Checked up front (and with >=) so that k <= 0 does not result
            # in downloading every paper
            if done >= k:
                break
            pmcid = id_lookup(pmid, 'pmid').get('pmcid')
            if not pmcid:
                continue
            fh.write('%s\n' % pmcid)
            print('Downloading %s' % pmcid)
            xml = pmc_client.get_xml(pmcid)
            if xml:
                with open('docs/pmc_xmls/%s.nxml' % pmcid, 'w') as xfh:
                    xfh.write(xml)
                done += 1
Ejemplo n.º 13
0
def test_id_lookup():
    """Look up a paper by its PMID and check the DOI that is returned."""
    expected_doi = '10.1158/1535-7163.MCT-06-0807'
    ids = id_lookup('17513615', 'pmid')
    assert ids['doi'] == expected_doi
Ejemplo n.º 14
0
    def make_model(self,
                   template=None,
                   grouping_level='agent-pair',
                   add_full_text_search_link=False,
                   no_redundancy=False,
                   **template_kwargs):
        """Render the assembled Statements as HTML and return the result.

        The rendered content is also stored in ``self.model``.

        Parameters
        ----------
        template : a Template object
            Manually pass a Jinja template to be used in generating the HTML.
            The template is responsible for rendering essentially the output
            of `make_json_model`.
        grouping_level : Optional[str]
            Statements can be grouped under sub-headings at three levels,
            'statement' (ungrouped), 'relation' (grouped by agents and type),
            and 'agent-pair' (grouped by ordered pairs of agents).
            Default: 'agent-pair'.
        add_full_text_search_link : bool
            If True, a 'pmcid' entry is set on each evidence for which a
            PMCID is available (from its text refs or via a lookup by PMID)
            so that a link with text fragment search in PMC can be added.
        no_redundancy : Optional[bool]
            If True, any group of statements that was already presented under
            a previous heading will be skipped. This is typically the case
            for complexes where different permutations of complex members
            are presented. By setting this argument to True, these can be
            eliminated. Default: False

        All other keyword arguments are passed along to the template. If you
        are using a custom template with args that are not passed below, this
        is how you pass them.

        Returns
        -------
        str
            The assembled HTML as a string.
        """
        # Build the JSON model that the template renders
        stmt_data = self.make_json_model(grouping_level=grouping_level,
                                         no_redundancy=no_redundancy)

        if add_full_text_search_link:
            for group_key in stmt_data:
                group = stmt_data[group_key]
                for formatted in group["stmts_formatted"]:
                    for info in formatted["stmt_info_list"]:
                        for ev in info["evidence"]:
                            # A PMCID given in the text refs takes precedence
                            if 'PMCID' in ev.get('text_refs', {}):
                                ev['pmcid'] = ev['text_refs']['PMCID']
                                continue
                            # Otherwise try to find one through the PMID
                            if not ev.get('pmid'):
                                continue
                            looked_up = id_lookup(ev['pmid'], 'pmid')
                            ev_pmcid = looked_up.get('pmcid', None)
                            if ev_pmcid:
                                ev['pmcid'] = ev_pmcid

        # Only scalar metadata entries are shown, under a title-cased label
        metadata = {}
        for key, value in self.metadata.items():
            if isinstance(value, (list, dict)):
                continue
            metadata[key.replace('_', ' ').title()] = value

        db_rest_url = None
        if self.db_rest_url and not self.db_rest_url.endswith('statements'):
            db_rest_url = self.db_rest_url + '/statements'

        # Fall back to defaults for whatever the caller did not provide
        if template is None:
            template = default_template
        if self.source_counts and 'source_key_dict' not in template_kwargs:
            template_kwargs['source_key_dict'] = \
                {src: src for src in all_sources}
        if 'source_colors' not in template_kwargs:
            template_kwargs['source_colors'] = DEFAULT_SOURCE_COLORS
        if 'source_info' not in template_kwargs:
            template_kwargs['source_info'] = SOURCE_INFO.copy()
        if 'simple' not in template_kwargs:
            template_kwargs['simple'] = True

        # Fill the template
        self.model = template.render(
            stmt_data=stmt_data,
            metadata=metadata,
            title=self.title,
            db_rest_url=db_rest_url,
            add_full_text_search_link=add_full_text_search_link,  # noqa
            **template_kwargs)
        return self.model
Ejemplo n.º 15
0
from indra import trips
from indra.literature import id_lookup
from assembly_eval import have_file, run_assembly

if __name__ == '__main__':
    # The papers to process, identified by their PMC IDs
    pmc_ids = ['PMC1234335', 'PMC3178447', 'PMC3690480',
               'PMC4345513', 'PMC534114']
    # Look up the PMID corresponding to each PMC ID
    pmids = [id_lookup(pmcid)['pmid'] for pmcid in pmc_ids]
    # Use the existing EKB extractions.
    folder = 'trips'
    for pmid, pmcid in zip(pmids, pmc_ids):
        prefix = folder + '/' + pmcid
        # A parenthesised single-argument print gives the same output on
        # Python 2 and Python 3
        print('Processing %s...' % pmcid)
        # Read the EKB through a context manager so that the file handle
        # is closed deterministically
        with open(prefix + '-20160503T1152.ekb') as fh:
            tp = trips.process_xml(fh.read())
        # PMIDs from TRIPS need to be set here because it propagates
        # the PMCID by default
        for s in tp.statements:
            for e in s.evidence:
                e.pmid = pmid
        run_assembly(tp.statements, folder, pmcid)
Ejemplo n.º 16
0
from indra import trips, reach
from indra.literature import id_lookup
from assembly_eval import have_file, run_assembly

if __name__ == "__main__":
    # The papers to process, identified by their PMC IDs
    pmc_ids = ["PMC1234335", "PMC3178447", "PMC3690480", "PMC4345513",
               "PMC534114"]
    # Look up the PMID corresponding to each PMC ID
    pmids = [id_lookup(pmcid)["pmid"] for pmcid in pmc_ids]

    for pmid, pmcid in zip(pmids, pmc_ids):
        # A parenthesised single-argument print gives the same output on
        # Python 2 and Python 3
        print("Processing %s..." % pmcid)
        # Read the existing TRIPS EKB through a context manager so that the
        # file handle is closed deterministically
        trips_fname = "trips/" + pmcid + "-20160503T1152.ekb"
        with open(trips_fname) as fh:
            tp = trips.process_xml(fh.read())
        # Overwrite the PMID of every TRIPS evidence with the looked-up one
        for s in tp.statements:
            for e in s.evidence:
                e.pmid = pmid
        # Load the existing REACH extractions for the same paper
        reach_fname = "reach/" + pmcid + ".json"
        rp = reach.process_json_file(reach_fname)
        # Assemble the Statements of the two readers together
        all_statements = tp.statements + rp.statements
        run_assembly(all_statements, "combined", pmcid)
Ejemplo n.º 17
0
    def make_model(self,
                   template=None,
                   with_grouping=True,
                   add_full_text_search_link=False,
                   **template_kwargs):
        """Render the assembled Statements as HTML and return the result.

        The rendered content is also stored in ``self.model``.

        Parameters
        ----------
        template : a Template object
            Manually pass a Jinja template to be used in generating the HTML.
            The template is responsible for rendering essentially the output
            of `make_json_model`.
        with_grouping : bool
            If True, statements will be grouped under multiple sub-headings.
            If False, all headings will be collapsed into one on every level,
            with all statements placed under a single heading.
        add_full_text_search_link : bool
            If True, a 'pmcid' entry is set on each evidence for which a
            PMCID is available (from its text refs or via a lookup by PMID)
            so that a link with text fragment search in PMC can be added.

        All other keyword arguments are passed along to the template. If you
        are using a custom template with args that are not passed below, this
        is how you pass them.

        Returns
        -------
        str
            The assembled HTML as a string.
        """
        # Build the JSON model that the template renders
        stmt_data = self.make_json_model(with_grouping)

        if add_full_text_search_link:
            for group_key in stmt_data:
                group = stmt_data[group_key]
                for formatted in group["stmts_formatted"]:
                    for info in formatted["stmt_info_list"]:
                        for ev in info["evidence"]:
                            # A PMCID given in the text refs takes precedence
                            if 'PMCID' in ev.get('text_refs', {}):
                                ev['pmcid'] = ev['text_refs']['PMCID']
                                continue
                            # Otherwise try to find one through the PMID
                            if not ev.get('pmid'):
                                continue
                            looked_up = id_lookup(ev['pmid'], 'pmid')
                            ev_pmcid = looked_up.get('pmcid', None)
                            if ev_pmcid:
                                ev['pmcid'] = ev_pmcid

        # Only scalar metadata entries are shown, under a title-cased label
        metadata = {}
        for key, value in self.metadata.items():
            if isinstance(value, (list, dict)):
                continue
            metadata[key.replace('_', ' ').title()] = value

        db_rest_url = None
        if self.db_rest_url and not self.db_rest_url.endswith('statements'):
            db_rest_url = self.db_rest_url + '/statements'

        # Fall back to defaults for whatever the caller did not provide
        if template is None:
            template = default_template
        if self.source_counts and 'source_key_dict' not in template_kwargs:
            template_kwargs['source_key_dict'] = SRC_KEY_DICT
        if 'source_colors' not in template_kwargs:
            template_kwargs['source_colors'] = SOURCE_COLORS

        # Fill the template
        self.model = template.render(
            stmt_data=stmt_data,
            metadata=metadata,
            title=self.title,
            db_rest_url=db_rest_url,
            add_full_text_search_link=add_full_text_search_link,  # noqa
            **template_kwargs)
        return self.model
Ejemplo n.º 18
0
    # Set to True to run REACH again even if a JSON output already exists
    rerun = False

    # Download the papers if they are not available yet
    pmids = []
    for pmcid in pmc_ids:
        prefix = folder + '/' + pmcid
        # Only fetch the full text if neither an NXML nor a plain text
        # version of the paper has been saved before
        if not have_file(prefix + '.nxml') and\
           not have_file(prefix + '.txt'):
            txt, txt_format = get_full_text(pmcid)
            # The file extension reflects the format that was obtained
            if txt_format == 'nxml':
                fname = prefix + '.nxml'
            else:
                fname = prefix + '.txt'
            # NOTE(review): writing encoded bytes to a file opened in 'wt'
            # mode only works on Python 2 (which this script targets, see
            # the print statement below) -- confirm before porting
            with open(fname, 'wt') as fh:
                fh.write(txt.encode('utf-8'))
        # The PMID is looked up for every paper, downloaded or not
        pmids.append(id_lookup(pmcid)['pmid'])


    # Read each paper if it hasn't been read yet.
    # Otherwise use the existing json extractions.
    for pmcid, pmid in zip(pmc_ids, pmids):
        prefix = folder + '/' + pmcid
        print 'Processing %s...' % pmcid
        # If REACH already processed it then don't run it again
        if rerun or not have_file(prefix + '.json'):
            # The plain text version is preferred over NXML if both exist
            if have_file(prefix + '.txt'):
                txt = open(prefix + '.txt').read().decode('utf-8')
                rp = reach.process_text(txt, citation=pmid)
            elif have_file(prefix + '.nxml'):
                rp = reach.process_nxml_file(prefix + '.nxml', citation=pmid)
            # NOTE(review): if neither file exists, rp is not assigned here
            # but reach_output.json is still moved -- confirm this is intended
            shutil.move('reach_output.json', prefix + '.json')
Ejemplo n.º 19
0
def test_id_lookup():
    """Look up a paper by its PMID and check the DOI that is returned."""
    expected_doi = '10.1158/1535-7163.MCT-06-0807'
    ids = id_lookup('17513615', 'pmid')
    assert ids['doi'] == expected_doi