Example #1
0
    def _check_reach_env():
        """Check that the environment supports running REACH.

        The REACH JAR path and the REACH version are each looked up first
        with ``get_config`` and then directly in the process environment.
        If no version is configured, it is parsed from the JAR file name,
        which must then look like ``reach-<version>.jar``; a trailing
        ``-SNAP...`` suffix is dropped from the parsed version.

        Returns
        -------
        path_to_reach : str
            The path to the REACH JAR.
        reach_version : str
            The REACH version string.

        Raises
        ------
        ReachError
            If the REACH path is unset or does not exist, or if no version
            is configured and none can be parsed from the JAR file name.
        """
        # Get the path to the REACH JAR
        path_to_reach = get_config('REACHPATH')
        if path_to_reach is None:
            path_to_reach = environ.get('REACHPATH', None)
        if path_to_reach is None or not path.exists(path_to_reach):
            raise ReachError(
                'Reach path unset or invalid. Check REACHPATH environment var '
                'and/or config file.'
                )

        logger.debug('Using REACH jar at: %s' % path_to_reach)

        # Get the reach version.
        reach_version = get_config('REACH_VERSION')
        if reach_version is None:
            reach_version = environ.get('REACH_VERSION', None)
        if reach_version is None:
            logger.debug('REACH version not set in REACH_VERSION')
            jar_name = path.basename(path_to_reach)
            m = re.match(r'reach-(.*?)\.jar', jar_name)
            if m is None:
                # Without this check a JAR with an unexpected name would fail
                # with an opaque AttributeError on None.
                raise ReachError(
                    'Could not determine the REACH version from the JAR file '
                    'name %s. Set REACH_VERSION in the environment or config '
                    'file.' % jar_name
                    )
            reach_version = re.sub('-SNAP.*?$', '', m.group(1))

        logger.debug('Using REACH version: %s' % reach_version)
        return path_to_reach, reach_version
Example #2
0
def get_default_ndex_cred(ndex_cred):
    """Return the NDEx (username, password) pair to use.

    If `ndex_cred` is a non-empty dict whose 'user' and 'password' entries
    are both not None, those values are returned. Otherwise both values are
    taken from the NDEX_USERNAME and NDEX_PASSWORD configuration entries.
    """
    if ndex_cred:
        creds = (ndex_cred.get('user'), ndex_cred.get('password'))
        # Only use the dict if it supplies both halves of the credentials
        if all(value is not None for value in creds):
            return creds

    return get_config('NDEX_USERNAME'), get_config('NDEX_PASSWORD')
Example #3
0
def get_batch_command(command_list, project=None, purpose=None):
    """Build the argument list that runs a command on batch.

    The elements of `command_list` are joined with spaces into a single
    argument for ``indra.util.aws run_in_batch``. If no project is given,
    the DEFAULT_AWS_PROJECT configuration entry is used when it is set.
    ``--project`` and ``--purpose`` are only appended for truthy values.
    """
    joined_command = ' '.join(command_list)
    if not project and has_config('DEFAULT_AWS_PROJECT'):
        project = get_config('DEFAULT_AWS_PROJECT')

    optional_args = []
    if project:
        optional_args.extend(['--project', project])
    if purpose:
        optional_args.extend(['--purpose', purpose])
    return (['python', '-m', 'indra.util.aws', 'run_in_batch', joined_command]
            + optional_args)
Example #4
0
def _set_classpath():
    """Rewrite CLASSPATH so it contains Eidos and no longer contains REACH.

    Entries of the ':'-separated CLASSPATH whose absolute path equals the
    REACHPATH setting are dropped; the EIDOSPATH setting is appended unless
    one of the existing entries already resolves to it.
    """
    current = os.environ.get('CLASSPATH')
    eidos_jar = get_config('EIDOSPATH')
    reach_jar = get_config('REACHPATH')
    entries = current.split(':') if current else []

    # Keep everything except an entry that points at the REACH JAR
    kept = [entry for entry in entries
            if not reach_jar or os.path.abspath(entry) != reach_jar]

    # Append Eidos only if it is configured and not already listed
    eidos_listed = bool(eidos_jar) and any(
        os.path.abspath(entry) == eidos_jar for entry in entries)
    if eidos_jar and not eidos_listed:
        kept.append(eidos_jar)

    os.environ['CLASSPATH'] = ':'.join(kept)
Example #5
0
def get_batch_command(command_list, project=None, purpose=None):
    """Return the command, as an argument list, for running on batch.

    `command_list` is collapsed into one space-separated argument passed to
    ``indra.util.aws run_in_batch``. When `project` is not given and
    DEFAULT_AWS_PROJECT is configured, the configured value is used. The
    ``--project`` / ``--purpose`` flags are added only for truthy values.
    """
    batch_cmd = ['python', '-m', 'indra.util.aws', 'run_in_batch',
                 ' '.join(command_list)]
    if not project and has_config('DEFAULT_AWS_PROJECT'):
        project = get_config('DEFAULT_AWS_PROJECT')
    for flag, value in (('--project', project), ('--purpose', purpose)):
        if value:
            batch_cmd.extend((flag, value))
    return batch_cmd
Example #6
0
def _set_classpath():
    """Rewrite CLASSPATH so it contains REACH and no longer contains Eidos.

    Entries of the ':'-separated CLASSPATH whose absolute path equals the
    EIDOSPATH setting are dropped; the REACHPATH setting is appended unless
    one of the existing entries already resolves to it.
    """
    current = os.environ.get('CLASSPATH')
    eidos_jar = get_config('EIDOSPATH')
    reach_jar = get_config('REACHPATH')
    entries = current.split(':') if current else []

    # Keep everything except an entry that points at the Eidos JAR
    kept = [entry for entry in entries
            if not eidos_jar or os.path.abspath(entry) != eidos_jar]

    # Append REACH only if it is configured and not already listed
    reach_listed = bool(reach_jar) and any(
        os.path.abspath(entry) == reach_jar for entry in entries)
    if reach_jar and not reach_listed:
        kept.append(reach_jar)

    os.environ['CLASSPATH'] = ':'.join(kept)
Example #7
0
def _make_request(meth, end_point, query_str, data=None, params=None, tries=2):
    """Send a request to the INDRA DB REST service and return the response.

    Parameters
    ----------
    meth : str
        Name of the HTTP method. It is lower-cased and looked up as a
        function on the ``requests`` module (e.g. 'get' or 'post').
    end_point : str
        The endpoint path, appended to the INDRA_DB_REST_URL setting.
    query_str : str
        A string appended verbatim to the URL after the endpoint.
    data : Optional[object]
        If truthy, it is JSON-encoded and sent as the request body.
    params : Optional[dict]
        Query parameters. The dict passed in is not modified.
    tries : Optional[int]
        The maximum number of attempts. Only a 504 response is retried.
        Default: 2

    Returns
    -------
    resp
        The response, returned only when its status code is 200. If `tries`
        is not positive no request is made and None is returned.

    Raises
    ------
    ValueError
        If `end_point` is None.
    IndraDBRestAPIError
        For any non-200 response other than a 504 with attempts remaining.
    """
    # Copy so that adding the API key below never leaks into the caller's dict
    params = {} if params is None else dict(params)

    if end_point is None:
        logger.error("Exception in submit request with args: %s" %
                     str([meth, end_point, query_str, data, params, tries]))
        raise ValueError("end_point cannot be None.")
    url = get_config('INDRA_DB_REST_URL', failure_ok=False)
    api_key = get_config('INDRA_DB_REST_API_KEY', failure_ok=True)
    url_path = url.rstrip('/') + '/' + end_point.lstrip('/')
    url_path += query_str
    headers = {}
    if data:
        # This is an assumption which applies to our use cases for now, but may
        # not generalize.
        headers['content-type'] = 'application/json'
        json_data = json.dumps(data)
    else:
        json_data = None
    params['api_key'] = api_key

    def _redact(text):
        # Only substitute when a key is actually set: str(None) is 'None',
        # and replacing that would corrupt unrelated text in the log output.
        if not api_key:
            return text
        return text.replace(str(api_key), '[api-key]')

    logger.info('url and query string: %s', _redact(url_path))
    logger.info('headers: %s', _redact(str(headers)))
    logger.info('data: %s', _redact(str(data)))
    logger.info('params: %s', _redact(str(params)))
    method_func = getattr(requests, meth.lower())
    while tries > 0:
        tries -= 1
        resp = method_func(url_path,
                           headers=headers,
                           data=json_data,
                           params=params)
        if resp.status_code == 200:
            return resp
        elif resp.status_code == 504 and tries > 0:
            logger.warning("Endpoint timed out. Trying again...")
        else:
            raise IndraDBRestAPIError(resp)
Example #8
0
    def __test_redaction(self, method, endpoint, base_qstr, **data):
        """Check that Elsevier evidence text is redacted unless an API key
        is supplied.

        The query is run twice through ``self.__time_query``: first with
        `base_qstr` as given, then with ``api_key=<key>`` appended to it.

        Raises
        ------
        SkipTest
            If the first response contains no Elsevier evidence, or none
            with text longer than 200 characters.
        """
        # First pass: query as given and expect long Elsevier text redacted.
        resp, dt, size = self.__time_query(method, endpoint, base_qstr, **data)
        assert resp.status_code == 200, \
            '%s: %s' % (resp.status_code, resp.data.decode())
        resp_dict = json.loads(resp.data)
        stmt_dict_redact = resp_dict['statements']
        elsevier_found = 0
        elsevier_long_found = 0
        for s in stmt_dict_redact.values():
            for ev in s['evidence']:
                if get_source(ev) == 'elsevier':
                    elsevier_found += 1
                    # Only text longer than 200 characters is checked for
                    # the redaction message.
                    if len(ev['text']) > 200:
                        elsevier_long_found += 1
                        assert ev['text'].endswith(REDACT_MESSAGE), \
                            'Found unredacted Elsevier text: %s.' % ev['text']
                else:
                    # Evidence from any other source must not look redacted.
                    if 'text' in ev.keys():
                        assert not ev['text'].startswith('[Redacted'), \
                            'Found redacted non-elsevier text.'
        # Without redactable Elsevier content the checks above are vacuous.
        if elsevier_found == 0:
            raise SkipTest("No Elsevier content occurred.")
        if elsevier_long_found == 0:
            raise SkipTest("No redactable (>200 char) Elsevier content "
                           "occurred.")

        key = get_config('INDRA_DB_REST_API_KEY')
        if key is None:
            return  # Can't test the behavior with an API key.

        # Second pass: repeat the query with the API key added to the query
        # string and expect the same statements with no redacted text.
        key_param = 'api_key=%s' % key
        if base_qstr:
            new_qstr = '&'.join(
                base_qstr.replace('?', '').split('&') + [key_param])
        else:
            new_qstr = key_param
        resp, dt, size = self.__time_query(method, endpoint, new_qstr, **data)
        resp_dict = json.loads(resp.data)
        stmt_dict_intact = resp_dict['statements']
        assert stmt_dict_intact.keys() == stmt_dict_redact.keys(), \
            "Response content changed: different statements without redaction."
        elsevier_found = 0
        for s in stmt_dict_intact.values():
            for ev in s['evidence']:
                if get_source(ev) == 'elsevier':
                    elsevier_found += 1
                if 'text' in ev.keys() and len(ev['text']) > 200:
                    assert not ev['text'].endswith(REDACT_MESSAGE), \
                        'Found redacted text despite api key.'
        assert elsevier_found > 0, "Elsevier content references went missing."
        return
Example #9
0
def make_db_rest_request(meth,
                         end_point,
                         query_str='',
                         data=None,
                         params=None,
                         tries=2,
                         timeout=None,
                         api_key=None):
    """Send a request to the INDRA DB REST service and return the response.

    Parameters
    ----------
    meth : str
        Name of the HTTP method. It is lower-cased and looked up as a
        function on the ``requests`` module (e.g. 'get' or 'post').
    end_point : str
        The endpoint, turned into the base URL by ``get_url_base``.
    query_str : Optional[str]
        A string appended verbatim to the URL. Default: ''
    data : Optional[object]
        If truthy, it is JSON-encoded and sent as the request body.
    params : Optional[dict]
        Query parameters. The dict passed in is not modified.
    tries : Optional[int]
        The maximum number of attempts. Only a 504 response is retried.
        Default: 2
    timeout : Optional[float]
        Passed through as the ``timeout`` argument of the request call.
    api_key : Optional[str]
        The API key to send. If None, the INDRA_DB_REST_API_KEY
        configuration entry is used.

    Returns
    -------
    resp
        The response, returned only when its status code is 200. If `tries`
        is not positive no request is made and None is returned.

    Raises
    ------
    ValueError
        If `end_point` is None.
    IndraDBRestAPIError
        For any non-200 response other than a 504 with attempts remaining.
    """
    # Copy so that adding the API key below never leaks into the caller's dict
    params = {} if params is None else dict(params)

    if end_point is None:
        logger.error("Exception in submit request with args: %s" %
                     str([meth, end_point, query_str, data, params, tries]))
        raise ValueError("end_point cannot be None.")
    url_path = get_url_base(end_point)
    if api_key is None:
        api_key = get_config('INDRA_DB_REST_API_KEY', failure_ok=True)
    url_path += query_str
    headers = {}
    if data:
        # This is an assumption which applies to our use cases for now, but may
        # not generalize.
        headers['content-type'] = 'application/json'
        json_data = json.dumps(data)
    else:
        json_data = None
    params['api_key'] = api_key

    def remove_api_key(s):
        # Return the text unchanged when there is no key to hide; falling
        # through here would return None and blank out every log line.
        if api_key:
            return s.replace(str(api_key), '[api-key]')
        return s

    logger.info(f'query: {remove_api_key(url_path)}')
    logger.info(f'params: {remove_api_key(str(params))}')
    logger.info(f'data: {remove_api_key(str(data))}')
    logger.debug(f'headers: {remove_api_key(str(headers))}')
    method_func = getattr(requests, meth.lower())
    while tries > 0:
        tries -= 1
        resp = method_func(url_path,
                           headers=headers,
                           data=json_data,
                           params=params,
                           timeout=timeout)
        if resp.status_code == 200:
            return resp
        elif resp.status_code == 504 and tries > 0:
            logger.warning("Endpoint timed out. Trying again...")
        else:
            raise IndraDBRestAPIError(resp)
Example #10
0
File: api.py Project: steppi/indra
    def post(self):
        """Process PubMedCentral article and return INDRA Statements.

        Parameters
        ----------
        pmc_id : str
            The ID of a PubmedCentral article. The string may start with PMC
            but passing just the ID also works.
            Examples: 8511698, PMC8511698
            https://www.ncbi.nlm.nih.gov/pmc/

        offline : Optional[bool]
            If set to True, the REACH system is run offline via a JAR file.
            Otherwise (by default) the web service is called. Default: False

        url : Optional[str]
            URL for a REACH web service instance, which is used for reading if
            provided. If not provided but offline is set to False (its default
            value), REACH_NXML_URL set in configuration will be used. If not
            provided in configuration, the Arizona REACH web service is called
            (http://agathon.sista.arizona.edu:8080/odinweb/api/help).
            Default: None

        Returns
        -------
        statements : list[indra.statements.Statement.to_json()]
            A list of extracted INDRA Statements.
        """
        args = request.json
        pmcid = args.get('pmc_id')
        offline = bool(args.get('offline'))
        config_url = get_config('REACH_NXML_URL', failure_ok=True)
        # URL precedence: an explicit 'url' key in the request (even if its
        # value is None), then the configured URL, then the default REACH web
        # service URL unless offline reading was requested.
        url = None
        if 'url' in args:
            url = args.get('url')
        elif config_url:
            url = config_url
        elif not offline:
            url = reach_nxml_url
        # Any usable URL wins over the offline setting
        run_offline = offline and not url
        rp = reach.process_pmc(pmcid, offline=run_offline, url=url)
        return _stmts_from_proc(rp)
Example #11
0
    def _check_reach_env():
        """Check that the environment supports running REACH.

        The REACH JAR path and the REACH version are each looked up first
        with ``get_config`` and then directly in the process environment.
        If no version is configured, it is parsed from the JAR file name,
        which must then look like ``reach-<version>.jar``; a trailing
        ``-SNAP...`` suffix is dropped from the parsed version.

        Returns
        -------
        path_to_reach : str
            The path to the REACH JAR.
        reach_version : str
            The REACH version string.

        Raises
        ------
        ReachError
            If the REACH path is unset or does not exist, or if no version
            is configured and none can be parsed from the JAR file name.
        """
        # Get the path to the REACH JAR
        path_to_reach = get_config('REACHPATH')
        if path_to_reach is None:
            path_to_reach = environ.get('REACHPATH', None)
        if path_to_reach is None or not path.exists(path_to_reach):
            raise ReachError(
                'Reach path unset or invalid. Check REACHPATH environment var '
                'and/or config file.')

        logger.debug('Using REACH jar at: %s' % path_to_reach)

        # Get the reach version.
        reach_version = get_config('REACH_VERSION')
        if reach_version is None:
            reach_version = environ.get('REACH_VERSION', None)
        if reach_version is None:
            logger.debug('REACH version not set in REACH_VERSION')
            jar_name = path.basename(path_to_reach)
            m = re.match(r'reach-(.*?)\.jar', jar_name)
            if m is None:
                # Without this check a JAR with an unexpected name would fail
                # with an opaque AttributeError on None.
                raise ReachError(
                    'Could not determine the REACH version from the JAR file '
                    'name %s. Set REACH_VERSION in the environment or config '
                    'file.' % jar_name)
            reach_version = re.sub('-SNAP.*?$', '', m.group(1))

        logger.debug('Using REACH version: %s' % reach_version)
        return path_to_reach, reach_version
Example #12
0
        def check_api_keys(*args, **kwargs):
            """Call the wrapped function only if an Elsevier API key exists.

            On the first call the module-level ELSEVIER_KEYS dict is filled
            from the configuration. Whenever the API key is not available,
            an error is logged and `failure_ret` is returned instead of
            calling the wrapped function.
            """
            global ELSEVIER_KEYS
            if ELSEVIER_KEYS is not None:
                # Keys were already loaded by an earlier call
                if 'X-ELS-APIKey' not in ELSEVIER_KEYS:
                    logger.error('No Elsevier API key %s found: cannot %s'
                                 % (API_KEY_ENV_NAME, task_desc))
                    return failure_ret
                return func(*args, **kwargs)

            # First call: load the keys from the environment / config file
            ELSEVIER_KEYS = {}
            # A missing institution key only limits access, so just warn
            if not has_config(INST_KEY_ENV_NAME):
                logger.warning('Institution API key %s not found in config '
                               'file or environment variable: this will '
                               'limit access for %s'
                               % (INST_KEY_ENV_NAME, task_desc))
            ELSEVIER_KEYS['X-ELS-Insttoken'] = get_config(INST_KEY_ENV_NAME)

            # A missing API key means the task cannot be carried out at all
            if not has_config(API_KEY_ENV_NAME):
                logger.error('API key %s not found in configuration file '
                             'or environment variable: cannot %s'
                             % (API_KEY_ENV_NAME, task_desc))
                return failure_ret
            ELSEVIER_KEYS['X-ELS-APIKey'] = get_config(API_KEY_ENV_NAME)
            return func(*args, **kwargs)
Example #13
0
File: api.py Project: steppi/indra
    def post(self):
        """Process text with REACH and return INDRA Statements.

        Parameters
        ----------
        text : str
            The text to be processed.

        offline : Optional[bool]
            If set to True, the REACH system is run offline via a JAR file.
            Otherwise (by default) the web service is called. Default: False

        url : Optional[str]
            URL for a REACH web service instance, which is used for reading if
            provided. If not provided but offline is set to False (its default
            value), REACH_TEXT_URL set in configuration will be used. If not
            provided in configuration, the Arizona REACH web service is called
            (http://agathon.sista.arizona.edu:8080/odinweb/api/help).
            Default: None

        Returns
        -------
        statements : list[indra.statements.Statement.to_json()]
            A list of extracted INDRA Statements.
        """
        args = request.json
        text = args.get('text')
        offline = bool(args.get('offline'))
        config_url = get_config('REACH_TEXT_URL', failure_ok=True)
        # URL precedence: an explicit 'url' key in the request (even if its
        # value is None), then the configured URL, then the default REACH web
        # service URL unless offline reading was requested.
        url = None
        if 'url' in args:
            url = args.get('url')
        elif config_url:
            url = config_url
        elif not offline:
            url = reach_text_url
        # Any usable URL wins over the offline setting
        run_offline = offline and not url
        rp = reach.process_text(text, offline=run_offline, url=url)
        return _stmts_from_proc(rp)
Example #14
0
def make_db_rest_request(meth, end_point, query_str, data=None, params=None, tries=2):
    """Send a request to the INDRA DB REST service and return the response.

    Parameters
    ----------
    meth : str
        Name of the HTTP method. It is lower-cased and looked up as a
        function on the ``requests`` module (e.g. 'get' or 'post').
    end_point : str
        The endpoint, turned into the base URL by ``get_url_base``.
    query_str : str
        A string appended verbatim to the URL.
    data : Optional[object]
        If truthy, it is JSON-encoded and sent as the request body.
    params : Optional[dict]
        Query parameters. The dict passed in is not modified.
    tries : Optional[int]
        The maximum number of attempts. Only a 504 response is retried.
        Default: 2

    Returns
    -------
    resp
        The response, returned only when its status code is 200. If `tries`
        is not positive no request is made and None is returned.

    Raises
    ------
    ValueError
        If `end_point` is None.
    IndraDBRestAPIError
        For any non-200 response other than a 504 with attempts remaining.
    """
    # Copy so that adding the API key below never leaks into the caller's dict
    params = {} if params is None else dict(params)

    if end_point is None:
        logger.error("Exception in submit request with args: %s"
                     % str([meth, end_point, query_str, data, params, tries]))
        raise ValueError("end_point cannot be None.")
    url_path = get_url_base(end_point)
    api_key = get_config('INDRA_DB_REST_API_KEY', failure_ok=True)
    url_path += query_str
    headers = {}
    if data:
        # This is an assumption which applies to our use cases for now, but may
        # not generalize.
        headers['content-type'] = 'application/json'
        json_data = json.dumps(data)
    else:
        json_data = None
    params['api_key'] = api_key

    def _redact(text):
        # Only substitute when a key is actually set: str(None) is 'None',
        # and replacing that would corrupt unrelated text in the log output.
        if not api_key:
            return text
        return text.replace(str(api_key), '[api-key]')

    logger.info('url and query string: %s', _redact(url_path))
    logger.info('headers: %s', _redact(str(headers)))
    logger.info('data: %s', _redact(str(data)))
    logger.info('params: %s', _redact(str(params)))
    method_func = getattr(requests, meth.lower())
    while tries > 0:
        tries -= 1
        resp = method_func(url_path, headers=headers, data=json_data,
                           params=params)
        if resp.status_code == 200:
            return resp
        elif resp.status_code == 504 and tries > 0:
            logger.warning("Endpoint timed out. Trying again...")
        else:
            raise IndraDBRestAPIError(resp)
Example #15
0
def run_on_text(text, python2_path):
    """Run TEES on the given text and return the extracted TEES output.

    The text is written to a temporary directory, the TEES ``classify.py``
    script is run on it with the given Python 2 interpreter, and the result
    of calling ``extract_output`` on that directory is returned. The
    temporary directory is deleted before this function returns or raises.
    This function does not process TEES output into INDRA statements.

    Parameters
    ----------
    text : str
        Text from which to extract relationships
    python2_path : str
        The path to the python 2 interpreter

    Returns
    -------
    output_tuple
        The value returned by ``extract_output`` for the temporary
        directory containing the TEES output files.

    Raises
    ------
    Exception
        If TEES_PATH is not configured and no installation is found among
        the candidate paths, or if the TEES directory does not exist.
    """
    tees_path = get_config('TEES_PATH')

    if tees_path is None:
        # If the TEES directory is not specified, see if any of the candidate
        # paths exist and contain all of the files and directories expected
        # for a TEES installation.
        for cpath in tees_candidate_paths:
            cpath = os.path.expanduser(cpath)
            if not os.path.isdir(cpath):
                continue
            has_expected_files = all(
                os.path.isfile(os.path.join(cpath, f))
                for f in tees_installation_files)
            has_expected_dirs = all(
                os.path.isdir(os.path.join(cpath, d))
                for d in tees_installation_dirs)
            if has_expected_files and has_expected_dirs:
                # Everything expected is there - assume it's a TEES
                # installation
                tees_path = cpath
                print('Found TEES installation at ' + cpath)
                break

    # os.path.isdir(None) would raise a TypeError, so report a missing
    # installation explicitly.
    if tees_path is None:
        raise Exception('TEES_PATH is not set and no TEES installation was '
                        'found in the candidate paths.')

    # Make sure the provided TEES directory exists
    if not os.path.isdir(tees_path):
        raise Exception('Provided TEES directory does not exist.')

    # The child process runs with tees_path as its working directory, so the
    # model path handed to it must not be relative to our own.
    tees_path = os.path.abspath(tees_path)

    # classify.py is given relative to the TEES directory, which is the
    # working directory of the child process.
    classify_path = 'classify.py'

    # Create a temporary directory to tag the shared-task files
    tmp_dir = tempfile.mkdtemp(suffix='indra_tees_processor')

    try:
        # Write text to a file in the temporary directory
        text_path = os.path.join(tmp_dir, 'text.txt')
        # Had some trouble with non-ascii characters. A possible TODO item in
        # the future is to look into resolving this, for now just ignoring
        # non-latin-1 characters
        with codecs.open(text_path, 'w', encoding='latin-1', errors='ignore') \
                as f:
            f.write(text)

        # Run TEES
        output_path = os.path.join(tmp_dir, 'output')
        model_path = os.path.join(tees_path, 'tees_data/models/GE11-test/')
        command = [python2_path, classify_path, '-m', model_path,
                   '-i', text_path,
                   '-o', output_path]
        # cwd= runs the child inside the TEES directory without changing the
        # working directory of this process.
        p = subprocess.Popen(command, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE, cwd=tees_path)
        # communicate() reads both pipes while waiting for the child to exit;
        # calling wait() first can deadlock once a pipe buffer fills up.
        (so, se) = p.communicate()
        print(so)
        print(se)

        return extract_output(tmp_dir)
    finally:
        # Always remove the temporary directory, on success and on failure
        shutil.rmtree(tmp_dir)
Example #16
0
import os
import json
import logging
import signal as sig
import subprocess as sp
import xml.etree.ElementTree as ET
import multiprocessing as mp

from indra.util import UnicodeXMLTreeBuilder as UTB

from .processor import SparserXMLProcessor, SparserJSONProcessor

logger = logging.getLogger(__name__)

# Name of the configuration entry that is looked up for Sparser; presumably
# it holds the path of the Sparser installation - confirm against usage.
sparser_path_var = 'SPARSERPATH'
# Read once, when this module is imported.
sparser_path = get_config(sparser_path_var)


def process_text(text, output_fmt='json', outbuf=None, cleanup=True, key='',
                 **kwargs):
    """Return processor with Statements extracted by reading text with Sparser.

    Parameters
    ----------
    text : str
        The text to be processed
    output_fmt: Optional[str]
        The output format to obtain from Sparser, with the two options being
        'json' and 'xml'. Default: 'json'
    outbuf : Optional[file]
        A file like object that the Sparser output is written to.
Example #17
0
def run_reach(pmid_list,
              base_dir,
              num_cores,
              start_index,
              end_index,
              force_read,
              force_fulltext,
              cleanup=False,
              verbose=True):
    """Run REACH on a list of PMIDs and collect the results.

    Content that still needs reading is run through the REACH JAR in a
    subprocess. PMIDs that ``get_content_to_read`` reports as already read
    are processed from S3 in a multiprocessing pool.

    Parameters
    ----------
    pmid_list, base_dir, start_index, end_index
        Passed through to ``get_content_to_read``.
    num_cores : int
        Written into the REACH configuration file and used as the size of
        the multiprocessing pool.
    force_read, force_fulltext
        Logged, and passed through to ``get_content_to_read``.
    cleanup : Optional[bool]
        If True, the temporary directory is deleted after REACH has been
        run. Default: False
    verbose : Optional[bool]
        If True, every line REACH writes to stdout is logged. Default: True

    Returns
    -------
    stmts : dict
        The results of ``upload_process_reach_files`` for newly read
        content, updated with the per-PMID statement lists obtained from S3.
    pmids_unread
        The unread PMIDs as returned by ``get_content_to_read``.
        If the REACH path is unset or invalid, ``({}, {})`` is returned.

    Raises
    ------
    ValueError
        If REACH_VERSION is not set and the version cannot be parsed from
        the JAR file name.
    Exception
        If the REACH subprocess exits with a non-zero return code.
    """
    logger.info('Running REACH with force_read=%s' % force_read)
    logger.info('Running REACH with force_fulltext=%s' % force_fulltext)

    # Get the path to the REACH JAR
    path_to_reach = get_config('REACHPATH')
    if path_to_reach is None or not os.path.exists(path_to_reach):
        logger.warning(
            'Reach path not set or invalid. Check REACHPATH environment var.')
        return {}, {}

    logger.info('Using REACH jar at: %s' % path_to_reach)

    # Get the REACH version
    reach_version = get_config('REACH_VERSION')
    if reach_version is None:
        logger.info('REACH version not set in REACH_VERSION')
        jar_name = os.path.basename(path_to_reach)
        m = re.match(r'reach-(.*?)\.jar', jar_name)
        if m is None:
            # Fail with a clear message rather than an AttributeError on None
            raise ValueError('Could not determine the REACH version from '
                             'the JAR file name %s. Set REACH_VERSION.'
                             % jar_name)
        reach_version = re.sub('-SNAP.*?$', '', m.group(1))

    logger.info('Using REACH version: %s' % reach_version)

    tmp_dir, _, output_dir, pmids_read, pmids_unread, num_found =\
        get_content_to_read(
            pmid_list, start_index, end_index, base_dir, num_cores,
            force_fulltext, force_read, 'reach', reach_version
            )

    stmts = {}
    mem_tot = get_mem_total()
    if mem_tot is not None and mem_tot <= REACH_MEM + MEM_BUFFER:
        # The parentheses matter: % binds tighter than +, so without them the
        # formatted string would be added to MEM_BUFFER and raise a TypeError.
        logger.error("Too little memory to run reach. At least %s required." %
                     (REACH_MEM + MEM_BUFFER))
        logger.info("REACH not run.")
    elif len(pmids_unread) > 0 and num_found > 0:
        # Create the REACH configuration file
        with open(REACH_CONF_FMT_FNAME, 'r') as fmt_file:
            conf_file_path = os.path.join(tmp_dir, 'indra.conf')
            with open(conf_file_path, 'w') as conf_file:
                conf_file.write(fmt_file.read().format(
                    tmp_dir=os.path.abspath(tmp_dir),
                    num_cores=num_cores,
                    loglevel='INFO'))

        # Run REACH!
        logger.info("Beginning reach.")
        args = [
            'java', '-Xmx24000m',
            '-Dconfig.file=%s' % conf_file_path, '-jar', path_to_reach
        ]
        p = subprocess.Popen(args,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        if verbose:
            # Relay REACH's stdout to the log line by line until it closes
            for line in iter(p.stdout.readline, b''):
                logger.info(line)
        p_out, p_err = p.communicate()
        if p.returncode:
            logger.error('Problem running REACH:')
            logger.error('Stdout: %s' % p_out.decode('utf-8'))
            logger.error('Stderr: %s' % p_err.decode('utf-8'))
            raise Exception('REACH crashed')

        # Process JSON files from local file system, process to INDRA
        # Statements and upload to S3
        some_stmts = upload_process_reach_files(output_dir, pmids_unread,
                                                reach_version, num_cores)
        stmts.update(some_stmts)
        # Delete the tmp directory if desired
        if cleanup:
            shutil.rmtree(tmp_dir)

    # Create a new multiprocessing pool for processing the REACH JSON
    # files previously cached on S3
    logger.info('Creating multiprocessing pool with %d cpus' % num_cores)
    pool = mp.Pool(num_cores)

    # Download and process the JSON files on S3
    logger.info('Processing REACH JSON from S3 in parallel')
    res = pool.map(process_reach_from_s3, pmids_read.keys())
    pool.close()
    logger.info('Multiprocessing pool closed.')
    pool.join()
    logger.info('Multiprocessing pool joined.')
    # Merge the per-PMID result dicts returned by the workers into one dict
    s3_stmts = {
        pmid: stmt_list
        for res_dict in res for pmid, stmt_list in res_dict.items()
    }
    stmts.update(s3_stmts)

    # Save the list of PMIDs with no content found on S3/literature client
    '''
    content_not_found_file = os.path.join(tmp_dir, 'content_not_found.txt')
    with open(content_not_found_file, 'wt') as f:
        for c in content_not_found:
            f.write('%s\n' % c)
    '''
    return stmts, pmids_unread
Example #18
0
import os
import logging
import jnius_config
from indra import get_config

logger = logging.getLogger(__name__)


def _has_xmx(options):
    for option in options:
        if option.startswith('-Xmx'):
            return True
    return False


# Memory limit used for the -Xmx option of the Java VM, taken from the
# INDRA_DEFAULT_JAVA_MEM_LIMIT configuration entry.
default_mem_limit = get_config("INDRA_DEFAULT_JAVA_MEM_LIMIT")
if default_mem_limit is None:
    # Set to 8g if not specified in the configuration
    default_mem_limit = '8g'

# Only add -Xmx if no such option is already set, and only while the VM has
# not been started yet.
if not _has_xmx(jnius_config.get_options()):
    if not jnius_config.vm_running:
        jnius_config.add_options('-Xmx%s' % default_mem_limit)
    else:
        logger.warning("Couldn't set memory limit for Java VM because the VM "
                       "is already running.")

# Path of the paxtools JAR relative to the directory containing this file,
# and the CLASSPATH currently set in the environment (None if unset).
path_here = os.path.dirname(os.path.realpath(__file__))
cp = os.path.join(path_here, 'sources/biopax/jars/paxtools.jar')
cp_existing = os.environ.get('CLASSPATH')
Example #19
0
def run_on_text(text, python2_path):
    """Run TEES on the given text and return the extracted TEES output.

    The text is written to a temporary directory, the TEES ``classify.py``
    script is run on it with the given Python 2 interpreter, and the result
    of calling ``extract_output`` on that directory is returned. The
    temporary directory is deleted before this function returns or raises.
    This function does not process TEES output into INDRA statements.

    Parameters
    ----------
    text : str
        Text from which to extract relationships
    python2_path : str
        The path to the python 2 interpreter

    Returns
    -------
    output_tuple
        The value returned by ``extract_output`` for the temporary
        directory containing the TEES output files.

    Raises
    ------
    Exception
        If TEES_PATH is not configured and no installation is found among
        the candidate paths, or if the TEES directory does not exist.
    """
    tees_path = get_config('TEES_PATH')

    if tees_path is None:
        # If the TEES directory is not specified, see if any of the candidate
        # paths exist and contain all of the files and directories expected
        # for a TEES installation.
        for cpath in tees_candidate_paths:
            cpath = os.path.expanduser(cpath)
            if not os.path.isdir(cpath):
                continue
            has_expected_files = all(
                os.path.isfile(os.path.join(cpath, f))
                for f in tees_installation_files)
            has_expected_dirs = all(
                os.path.isdir(os.path.join(cpath, d))
                for d in tees_installation_dirs)
            if has_expected_files and has_expected_dirs:
                # Everything expected is there - assume it's a TEES
                # installation
                tees_path = cpath
                print('Found TEES installation at ' + cpath)
                break

    # os.path.isdir(None) would raise a TypeError, so report a missing
    # installation explicitly.
    if tees_path is None:
        raise Exception('TEES_PATH is not set and no TEES installation was '
                        'found in the candidate paths.')

    # Make sure the provided TEES directory exists
    if not os.path.isdir(tees_path):
        raise Exception('Provided TEES directory does not exist.')

    # The child process runs with tees_path as its working directory, so the
    # model path handed to it must not be relative to our own.
    tees_path = os.path.abspath(tees_path)

    # classify.py is given relative to the TEES directory, which is the
    # working directory of the child process.
    classify_path = 'classify.py'

    # Create a temporary directory to tag the shared-task files
    tmp_dir = tempfile.mkdtemp(suffix='indra_tees_processor')

    try:
        # Write text to a file in the temporary directory
        text_path = os.path.join(tmp_dir, 'text.txt')
        # Had some trouble with non-ascii characters. A possible TODO item in
        # the future is to look into resolving this, for now just ignoring
        # non-latin-1 characters
        with codecs.open(text_path, 'w', encoding='latin-1', errors='ignore') \
                as f:
            f.write(text)

        # Run TEES
        output_path = os.path.join(tmp_dir, 'output')
        model_path = os.path.join(tees_path, 'tees_data/models/GE11-test/')
        command = [python2_path, classify_path, '-m', model_path,
                   '-i', text_path,
                   '-o', output_path]
        # cwd= runs the child inside the TEES directory without changing the
        # working directory of this process.
        p = subprocess.Popen(command, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE, cwd=tees_path)
        # communicate() reads both pipes while waiting for the child to exit;
        # calling wait() first can deadlock once a pipe buffer fills up.
        (so, se) = p.communicate()
        print(so)
        print(se)

        return extract_output(tmp_dir)
    finally:
        # Always remove the temporary directory, on success and on failure
        shutil.rmtree(tmp_dir)
Example #20
0
    import os
    import pickle
    import logging
    import sys

    logger = logging.getLogger('read_pmids_aws')

    # Setting default force read/fulltext parameters
    force_read = args.force_read
    force_fulltext = args.force_fulltext

    client = boto3.client('s3')
    bucket_name = 'bigmech'
    pmid_list_key = 'reading_results/%s/pmids' % args.basename
    if 'reach' in [rdr.lower() for rdr in args.readers]:
        path_to_reach = get_config('REACHPATH')
        reach_version = get_config('REACH_VERSION')
        if path_to_reach is None or reach_version is None:
            print('REACHPATH and/or REACH_VERSION not defined, exiting.')
            sys.exit(1)

    try:
        pmid_list_obj = client.get_object(
            Bucket=bucket_name,
            Key=pmid_list_key
            )
    # Handle a missing object gracefully
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == 'NoSuchKey':
            logger.info('Could not find PMID list file at %s, exiting' %
                        pmid_list_key)
Example #21
0
"""
This is a Python based command line interface to Eidos
to complement the Python-Java bridge based interface.
EIDOSPATH (in the INDRA config.ini or as an environmental variable)
needs to be pointing to a fat JAR of the Eidos system.
"""
import os
import glob
import logging
import subprocess
from indra import get_config
from .api import process_json_file


# Value of the EIDOSPATH setting; per the module docstring this should point
# to a fat JAR of the Eidos system.
eip = get_config('EIDOSPATH')
# Package prefix that is put in front of the endpoint class name to be run.
eidos_package = 'org.clulab.wm.eidos'
# Module-level logger named after this module.
logger = logging.getLogger(__name__)


def run_eidos(endpoint, *args):
    """Run a given endpoint of Eidos through the command line.

    Parameters
    ----------
    endpoint : str
        The class within the Eidos package to run, for instance
        'apps.ExtractFromDirectory' will run
        'org.clulab.wm.eidos.apps.ExtractFromDirectory'
    *args
        Any further arguments to be passed as inputs to the class
        being run.
Example #22
0
    import pickle
    import logging
    import sys

    logger = \
        logging.getLogger('indra.tools.reading.pmid_reading.read_pmids_aws')

    # Setting default force read/fulltext parameters
    force_read = args.force_read
    force_fulltext = args.force_fulltext

    client = boto3.client('s3')
    bucket_name = 'bigmech'
    pmid_list_key = 'reading_results/%s/pmids' % args.basename
    if 'reach' in [rdr.lower() for rdr in args.readers]:
        path_to_reach = get_config('REACHPATH')
        reach_version = get_config('REACH_VERSION')
        if path_to_reach is None or reach_version is None:
            print('REACHPATH and/or REACH_VERSION not defined, exiting.')
            sys.exit(1)

    try:
        pmid_list_obj = client.get_object(
            Bucket=bucket_name,
            Key=pmid_list_key
            )
    # Handle a missing object gracefully
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == 'NoSuchKey':
            logger.info('Could not find PMID list file at %s, exiting' %
                        pmid_list_key)
Example #23
0
"""
This is a Python based command line interface to Eidos
to complement the Python-Java bridge based interface.
EIDOSPATH (in the INDRA config.ini or as an environment variable)
must point to a fat JAR of the Eidos system.
"""
import os
import glob
import logging
import subprocess
from indra import get_config
from .eidos_api import process_json_ld_file

# Value of the EIDOSPATH setting; per the module docstring this should point
# to a fat JAR of the Eidos system.
eip = get_config('EIDOSPATH')
# Package prefix that is put in front of the endpoint class name to be run.
eidos_package = 'org.clulab.wm.eidos'
# Module-level logger registered under the fixed name 'eidos_cli'.
logger = logging.getLogger('eidos_cli')


def run_eidos(endpoint, *args):
    """Run a given endpoint of Eidos through the command line.

    Parameters
    ----------
    endpoint : str
        The class within the Eidos package to run, for instance
        'apps.ExtractFromDirectory' will run
        'org.clulab.wm.eidos.apps.ExtractFromDirectory'
    *args
        Any further arguments to be passed as inputs to the class
        being run.
    """
Example #24
0
from collections import Counter
from indra.statements import Complex, Agent, Evidence
from indra.databases import hgnc_client
from indra import has_config, get_config

biogrid_url = 'http://webservice.thebiogrid.org/interactions/'

logger = logging.getLogger(__name__)

# For more information see http://wiki.thebiogrid.org/doku.php/biogridrest
# Look up the BioGRID API key in the INDRA config file / environment.
# The name is always bound (None when no key is configured) so that a missing
# key is reported once here instead of surfacing later as a NameError.
api_key = None
if not has_config('BIOGRID_API_KEY'):
    logger.error('BioGRID API key could not be found in config file or '
                 'environment variable.')
else:
    api_key = get_config('BIOGRID_API_KEY')


def get_interactors(gene_name):
    """Collect the interaction partners of a gene from BioGRID results.

    A request is sent for ``gene_name`` with interactors included, and for
    every returned interaction the symbol on the opposite side of
    ``gene_name`` is appended to a list. A self-interaction contributes
    ``gene_name`` itself.

    Parameters
    ----------
    gene_name : str
        Symbol of the gene to look up; it is compared against the
        'OFFICIAL_SYMBOL_A' and 'OFFICIAL_SYMBOL_B' entries of each result.

    Raises
    ------
    AssertionError
        If a returned interaction names ``gene_name`` on neither side.
    """
    res_dict = _send_request([gene_name], include_interactors=True)
    interaction_list = []
    for result in res_dict.values():
        symbol_a = result['OFFICIAL_SYMBOL_A']
        symbol_b = result['OFFICIAL_SYMBOL_B']
        if symbol_a == gene_name:
            # Also covers the self-interaction case, where symbol_b equals
            # gene_name as well.
            interaction_list.append(symbol_b)
        elif symbol_b == gene_name:
            interaction_list.append(symbol_a)
        else:
            # Raised explicitly rather than via `assert False` so the check
            # is not stripped when Python runs with -O.
            raise AssertionError("Interaction doesn't contain target gene!")
    # NOTE(review): no return of interaction_list is visible in this
    # excerpt; confirm against the complete definition.
Example #25
0
from __future__ import absolute_import, print_function, unicode_literals
from builtins import dict, str
import re
import os
import nltk
import zlib
import codecs
import shutil
import logging
import tempfile
import subprocess
from indra import get_config

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Values of the NXML2TXT_PATH and PYTHON2_PATH configuration entries.
# NOTE(review): presumably the location of the nxml2txt tool and of a
# Python 2 interpreter used to run it - confirm against the call sites.
nxml2txt_path = get_config('NXML2TXT_PATH')
python2_path = get_config('PYTHON2_PATH')


class IsiPreprocessor(object):
    """Preprocess a set of documents, one by one, and add the preprocessed
    text to a temporary directory in a format suitable for the ISI reader.
    The ISI reader requires plain text with one sentence per line.

    Attributes
    ----------
    preprocessed_dir : str
        The directory holding the literature text preprocessed and sentence
        tokenized in a format suitable for the ISI reader
    next_file_id : int
        The next file with preprocessed text will be named next_file_id.txt
Example #26
0
from indra.assemblers.html import HtmlAssembler
from indra.assemblers.graph import GraphAssembler
from indra.assemblers.english.assembler import _join_list

# Logger for this module, registered under the fixed name 'MSA'.
logger = logging.getLogger('MSA')

# Lookup from the base form of a verb to the corresponding type name.
# NOTE(review): the values look like INDRA Statement class names - confirm.
mod_map = dict(
    demethylate='Demethylation',
    methylate='Methylation',
    phosphorylate='Phosphorylation',
    dephosphorylate='Dephosphorylation',
    ubiquitinate='Ubiquitination',
    deubiquitinate='Deubiquitination',
    inhibit='Inhibition',
    activate='Activation',
)

# Value of the INDRA_DB_REST_URL configuration entry, looked up once here.
DB_REST_URL = get_config('INDRA_DB_REST_URL')


class EntityError(ValueError):
    """A ValueError subclass that adds no behavior of its own.

    NOTE(review): presumably raised when an entity cannot be handled -
    confirm at the raise sites.
    """


class StatementQuery(object):
    """This is an object that encapsulates the information used to make a query.

    Parameters
    ----------
    subj, obj : Agent or None
        The subject and object of the causal mechanism to be found. If an
        Agent, the db_refs will be used as grounding. If there is no subject or
        object, subj or obj may be None respectively.
Example #27
0
def get_url_base(end_point):
    """Return the configured REST URL joined with an end point.

    Trailing slashes on the configured ``INDRA_DB_REST_URL`` and leading
    slashes on ``end_point`` are removed, then the two parts are joined
    with a single '/'.

    Parameters
    ----------
    end_point : str
        Path of the end point to append to the configured URL.

    Returns
    -------
    str
        The configured URL and the end point separated by one '/'.
    """
    # NOTE(review): failure_ok=False presumably makes get_config raise when
    # the setting is missing instead of returning None - confirm.
    base = get_config('INDRA_DB_REST_URL', failure_ok=False).rstrip('/')
    return '/'.join([base, end_point.lstrip('/')])
Example #28
0
from collections import Counter
from indra.statements import Complex, Agent, Evidence
from indra.databases import hgnc_client
from indra import has_config, get_config

biogrid_url = 'http://webservice.thebiogrid.org/interactions/'

logger = logging.getLogger('biogrid')

# For more information see http://wiki.thebiogrid.org/doku.php/biogridrest
# Look up the BioGRID API key in the INDRA config file / environment.
# The name is always bound (None when no key is configured) so that a missing
# key is reported once here instead of surfacing later as a NameError.
api_key = None
if not has_config('BIOGRID_API_KEY'):
    logger.error('BioGRID API key could not be found in config file or '
                 'environment variable.')
else:
    api_key = get_config('BIOGRID_API_KEY')


def get_interactors(gene_name):
    """Collect the interaction partners of a gene from BioGRID results.

    A request is sent for ``gene_name`` with interactors included, and for
    every returned interaction the symbol on the opposite side of
    ``gene_name`` is appended to a list. A self-interaction contributes
    ``gene_name`` itself.

    Parameters
    ----------
    gene_name : str
        Symbol of the gene to look up; it is compared against the
        'OFFICIAL_SYMBOL_A' and 'OFFICIAL_SYMBOL_B' entries of each result.

    Raises
    ------
    AssertionError
        If a returned interaction names ``gene_name`` on neither side.
    """
    res_dict = _send_request([gene_name], include_interactors=True)
    interaction_list = []
    for result in res_dict.values():
        symbol_a = result['OFFICIAL_SYMBOL_A']
        symbol_b = result['OFFICIAL_SYMBOL_B']
        if symbol_a == gene_name:
            # Also covers the self-interaction case, where symbol_b equals
            # gene_name as well.
            interaction_list.append(symbol_b)
        elif symbol_b == gene_name:
            interaction_list.append(symbol_a)
        else:
            # Raised explicitly rather than via `assert False` so the check
            # is not stripped when Python runs with -O.
            raise AssertionError("Interaction doesn't contain target gene!")
    # NOTE(review): no return of interaction_list is visible in this
    # excerpt; confirm against the complete definition.
Example #29
0
            continue
        # Get the base verb form of the statement, e.g., "phosphorylate"
        base_verb = statement_base_verb(name.lower())
        verb_map[base_verb] = {'stmt': name, 'type': 'base'}
        # Get the present form of the statement, e.g., "inhibits"
        present_verb = statement_present_verb(name.lower())
        verb_map[present_verb] = {'stmt': name, 'type': 'present'}
        # Get the passive / state form of the statement, e.g., "activated"
        passive_verb = statement_passive_verb(name.lower())
        verb_map[passive_verb] = {'stmt': name, 'type': 'passive'}
    return verb_map


# Verb lookup table, built once by _build_verb_map(); each verb form maps to
# a dict with the statement name ('stmt') and the verb form kind ('type').
verb_map = _build_verb_map()

# Value of the INDRA_DB_REST_URL configuration entry, looked up once here.
DB_REST_URL = get_config('INDRA_DB_REST_URL')


class EntityError(ValueError):
    """A ValueError subclass that adds no behavior of its own.

    NOTE(review): presumably raised when an entity cannot be handled -
    confirm at the raise sites.
    """


class StatementQuery(object):
    """This is an object that encapsulates the information used to make a query.

    Parameters
    ----------
    subj, obj : Agent or None
        The subject and object of the causal mechanism to be found. If an
        Agent, the db_refs will be used as grounding. If there is no subject or
        object, subj or obj may be None respectively.
Example #30
0
"""
import math
import logging
import requests
from indra import has_config, get_config


logger = logging.getLogger(__name__)


# Read the NewsAPI key from the INDRA configuration. When it is absent the
# problem is logged and api_key stays bound to None.
if has_config('NEWSAPI_API_KEY'):
    api_key = get_config('NEWSAPI_API_KEY')
else:
    api_key = None
    logger.error('NewsAPI API key could not be found in config file or '
                 'environment variable.')

# URL of the NewsAPI web service (version 2).
newsapi_url = 'https://newsapi.org/v2'


def send_request(endpoint, **kwargs):
    """Return the response to a query as JSON from the NewsAPI web service.

    The basic API is limited to 100 results which is chosen unless explicitly
    given as an argument. Beyond that, paging is supported through the "page"
    argument, if needed.

    Parameters
    ----------
    endpoint : str
        Endpoint to query, e.g. "everything" or "top-headlines"
Example #31
0
import os
import json
import logging
import subprocess
import xml.etree.ElementTree as ET
import multiprocessing as mp

from indra.util import UnicodeXMLTreeBuilder as UTB

from .processor import SparserXMLProcessor, SparserJSONProcessor

# Module-level logger registered under the fixed name 'sparser'.
logger = logging.getLogger('sparser')

# Name of the configuration entry to look up, and the value obtained for it
# from get_config.
sparser_path_var = 'SPARSERPATH'
sparser_path = get_config(sparser_path_var)


def process_text(text, output_fmt='json', outbuf=None, cleanup=True, key=''):
    """Return processor with Statements extracted by reading text with Sparser.

    Parameters
    ----------
    text : str
        The text to be processed
    output_fmt: Optional[str]
        The output format to obtain from Sparser, with the two options being
        'json' and 'xml'. Default: 'json'
    outbuf : Optional[file]
        A file like object that the Sparser output is written to.
    cleanup : Optional[bool]
Example #32
0
def get_url_base(end_point):
    """Return the configured REST URL joined with an end point.

    Trailing slashes on the configured ``INDRA_DB_REST_URL`` and leading
    slashes on ``end_point`` are removed, then the two parts are joined
    with a single '/'.

    Parameters
    ----------
    end_point : str
        Path of the end point to append to the configured URL.

    Returns
    -------
    str
        The configured URL and the end point separated by one '/'.
    """
    # NOTE(review): failure_ok=False presumably makes get_config raise when
    # the setting is missing instead of returning None - confirm.
    base = get_config('INDRA_DB_REST_URL', failure_ok=False).rstrip('/')
    return '/'.join([base, end_point.lstrip('/')])
Example #33
0
import os
import logging
import jnius_config
from indra import get_config

logger = logging.getLogger(__name__)


def _has_xmx(options):
    """Return True if any option string starts with '-Xmx'.

    Parameters
    ----------
    options : iterable of str
        Option strings to inspect.

    Returns
    -------
    bool
        True when at least one option begins with '-Xmx', False otherwise
        (including for an empty iterable).
    """
    return any(opt.startswith('-Xmx') for opt in options)


# Memory limit for the Java VM: the INDRA_DEFAULT_JAVA_MEM_LIMIT setting when
# it is available, otherwise 8g.
default_mem_limit = get_config("INDRA_DEFAULT_JAVA_MEM_LIMIT")
if default_mem_limit is None:
    default_mem_limit = '8g'

# Only add an -Xmx option when none is among the current jnius options.
if not _has_xmx(jnius_config.get_options()):
    if jnius_config.vm_running:
        # No option is added once the VM is running; only warn about it.
        logger.warning("Couldn't set memory limit for Java VM because the VM "
                       "is already running.")
    else:
        jnius_config.add_options('-Xmx%s' % default_mem_limit)

path_here = os.path.dirname(os.path.realpath(__file__))
cp = os.path.join(path_here, 'sources/biopax/jars/paxtools.jar')
cp_existing = os.environ.get('CLASSPATH')