Python extract_duplicatesの例、indra_db.util.distill_statements.extract_duplicates Pythonの例

コード例 #1

0

ファイルを表示

ファイル: knowledgebase.py プロジェクト: kkaris/indra_db

    def _get_statements(self):
        """Fetch the CONIB BEL graph from the web and return unique Statements.

        The node-link JSON is downloaded, loaded into a PyBEL graph and
        processed into INDRA Statements. PMIDs on the evidences are stripped
        of surrounding whitespace before the Statements are run through
        ``_expanded`` and de-duplicated with ``KeyFunc.mk_and_one_ev_src``.
        """
        import pybel
        import requests
        from indra.sources.bel import process_pybel_graph
        logger.info('Processing CONIB from web')
        conib_url = ('https://github.com/pharmacome/conib/raw/master/conib'
                     '/_cache.bel.nodelink.json')
        nodelink = requests.get(conib_url).json()
        bel_graph = pybel.from_nodelink(nodelink)
        # Turn the graph into INDRA statements
        processor = process_pybel_graph(bel_graph)

        # Fix an issue with stray whitespace around PMIDs
        for statement in processor.statements:
            for evidence in statement.evidence:
                if evidence.pmid:
                    evidence.pmid = evidence.pmid.strip()
                pmid_ref = evidence.text_refs.get('PMID')
                if pmid_ref:
                    evidence.text_refs['PMID'] = pmid_ref.strip()

        logger.info('Expanding evidences and deduplicating')
        expanded = list(_expanded(processor.statements))
        uniques, _dups = extract_duplicates(expanded,
                                            KeyFunc.mk_and_one_ev_src)
        return uniques

コード例 #2

0

ファイルを表示

ファイル: knowledgebase.py プロジェクト: kkaris/indra_db

    def _get_statements(self):
        """Download the CBN archive and return de-duplicated Statements.

        The zip archive at ``self.archive_url`` is downloaded into a
        temporary directory and extracted, and every ``.jgf``/``.jgif``
        member is processed with ``process_cbn_jgif_file``. The temporary
        directory is removed once all files have been processed.

        The duplicates found, and their count, are printed to stdout.

        Returns
        -------
        The unique Statements, i.e. the first element returned by
        ``extract_duplicates``.

        Raises
        ------
        requests.HTTPError
            If the archive download comes back with an error status.
        """
        import requests
        import tempfile
        from zipfile import ZipFile
        from indra.sources.bel.api import process_cbn_jgif_file

        stmts = []
        # TemporaryDirectory removes the archive and its extracted contents
        # on exit, so repeated runs do not accumulate files on disk.
        with tempfile.TemporaryDirectory(suffix='cbn_manager') as cbn_dir:
            logger.info('Retrieving CBN network zip archive')
            tmp_zip = os.path.join(cbn_dir, 'cbn_human.zip')
            resp = requests.get(self.archive_url)
            # Fail here instead of writing an error page to disk and
            # hitting an opaque zip error further down.
            resp.raise_for_status()
            with open(tmp_zip, 'wb') as f:
                f.write(resp.content)

            tmp_dir = os.path.join(cbn_dir, 'cbn')
            os.mkdir(tmp_dir)
            with ZipFile(tmp_zip) as zipf:
                logger.info('Extracting archive to %s' % tmp_dir)
                zipf.extractall(path=tmp_dir)
                logger.info('Processing jgif files')
                for jgif in zipf.namelist():
                    if jgif.endswith(('.jgf', '.jgif')):
                        logger.info('Processing %s' % jgif)
                        pbp = process_cbn_jgif_file(
                            os.path.join(tmp_dir, jgif))
                        stmts += pbp.statements

        # Log the step before it runs so the log reflects the actual order.
        logger.info("Deduplicating...")
        uniques, dups = extract_duplicates(stmts,
                                           key_func=KeyFunc.mk_and_one_ev_src)

        print('\n'.join(str(dup) for dup in dups))
        print(len(dups))

        return uniques

コード例 #3

0

ファイルを表示

ファイル: knowledgebase.py プロジェクト: kkaris/indra_db

 def _get_statements(self):
     """Process TRRUST from the web and return the unique Statements.

     The processor's Statements are passed through ``_expanded`` and
     de-duplicated with ``KeyFunc.mk_and_one_ev_src``; the number of
     duplicates found is printed to stdout.
     """
     from indra.sources import trrust
     processor = trrust.process_from_web()
     expanded = _expanded(processor.statements)
     uniques, duplicates = extract_duplicates(
         expanded, key_func=KeyFunc.mk_and_one_ev_src)
     print(len(duplicates))
     return uniques

コード例 #4

0

ファイルを表示

ファイル: knowledgebase.py プロジェクト: kkaris/indra_db

 def _get_statements(self):
     """Process UbiBrowser from the web and return the unique Statements.

     The processor's Statements are passed through ``_expanded`` and then
     de-duplicated with ``KeyFunc.mk_and_one_ev_src`` as the key.
     """
     from indra.sources import ubibrowser
     logger.info('Processing UbiBrowser from web')
     processor = ubibrowser.process_from_web()
     logger.info('Expanding evidences and deduplicating')
     expanded = list(_expanded(processor.statements))
     uniques, _dups = extract_duplicates(expanded,
                                         KeyFunc.mk_and_one_ev_src)
     return uniques

コード例 #5

0

ファイルを表示

ファイル: knowledgebase.py プロジェクト: kkaris/indra_db

 def _get_statements(self):
     """Process DGI version '2020-Nov' and return the unique Statements.

     The processor's Statements are passed through ``_expanded`` and then
     de-duplicated with ``KeyFunc.mk_and_one_ev_src`` as the key.
     """
     from indra.sources import dgi
     logger.info('Processing DGI from web')
     processor = dgi.process_version('2020-Nov')
     logger.info('Expanding evidences and deduplicating')
     expanded = list(_expanded(processor.statements))
     uniques, _dups = extract_duplicates(expanded,
                                         KeyFunc.mk_and_one_ev_src)
     return uniques

コード例 #6

0

ファイルを表示

ファイル: knowledgebase.py プロジェクト: kkaris/indra_db

    def _get_statements(self):
        """Process the large and small BEL corpora and de-duplicate them.

        Statements from both corpora are combined and de-duplicated with
        ``KeyFunc.mk_and_one_ev_src``. The duplicates, followed by the
        counts of unique and duplicate Statements, are printed to stdout.
        """
        from indra.sources import bel

        large_corpus = bel.process_large_corpus()
        small_corpus = bel.process_small_corpus()
        # In-place concatenation, as before: the large corpus list is
        # extended with the small corpus Statements.
        combined = large_corpus.statements
        combined += small_corpus.statements
        uniques, duplicates = extract_duplicates(
            combined, key_func=KeyFunc.mk_and_one_ev_src)
        print('\n'.join(str(dup) for dup in duplicates))
        print(len(uniques), len(duplicates))
        return uniques

コード例 #7

0

ファイルを表示

ファイル: knowledgebase.py プロジェクト: kkaris/indra_db

 def _get_statements(self):
     """Load the DrugBank Statement pickle from S3 and de-duplicate it.

     The unpickled Statements are passed through ``_expanded`` and then
     de-duplicated with ``KeyFunc.mk_and_one_ev_src`` as the key.
     """
     client = boto3.client('s3')
     logger.info('Fetching DrugBank statements from S3...')
     s3_obj = client.get_object(Bucket='bigmech',
                                Key='indra-db/drugbank_5.1.pkl')
     # NOTE(review): unpickling runs arbitrary code from the payload; this
     # presumes the 'bigmech' bucket content is trusted -- confirm.
     raw_stmts = pickle.loads(s3_obj['Body'].read())
     expanded = list(_expanded(raw_stmts))
     # Keep a single copy of statements that are identical in both content
     # and evidence.
     uniques, _dups = extract_duplicates(expanded,
                                         KeyFunc.mk_and_one_ev_src)
     return uniques

コード例 #8

0

ファイルを表示

ファイル: knowledgebase.py プロジェクト: kkaris/indra_db

 def _get_statements(self):
     """Load every CTD subset pickle from S3 and de-duplicate the result.

     One pickle is fetched per entry in ``self.subsets``; its Statements
     are passed through ``_expanded`` and collected. The combined list is
     then de-duplicated with ``KeyFunc.mk_and_one_ev_src`` as the key.
     """
     client = boto3.client('s3')
     collected = []
     for subset in self.subsets:
         logger.info('Fetching CTD subset %s from S3...' % subset)
         s3_obj = client.get_object(Bucket='bigmech',
                                    Key='indra-db/ctd_%s.pkl' % subset)
         # NOTE(review): unpickling runs arbitrary code from the payload;
         # this presumes the 'bigmech' bucket content is trusted -- confirm.
         subset_stmts = pickle.loads(s3_obj['Body'].read())
         collected.extend(_expanded(subset_stmts))
     # Keep a single copy of statements that are identical in both content
     # and evidence.
     uniques, _dups = extract_duplicates(collected,
                                         KeyFunc.mk_and_one_ev_src)
     return uniques

コード例 #9

0

ファイルを表示

ファイル: knowledgebase.py プロジェクト: kkaris/indra_db

    def _get_statements(self):
        """Load the Pathway Commons pickle from S3 and de-duplicate it.

        The unpickled Statements are passed through ``_expanded``, only
        those accepted by ``self._can_include`` are kept, and the result is
        de-duplicated with ``KeyFunc.mk_and_one_ev_src`` as the key.
        """
        client = boto3.client('s3')

        logger.info('Loading PC content pickle from S3')
        s3_obj = client.get_object(Bucket='bigmech',
                                   Key='indra-db/biopax_pc12_pybiopax.pkl')
        logger.info('Loading PC statements from pickle')
        # NOTE(review): unpickling runs arbitrary code from the payload; this
        # presumes the 'bigmech' bucket content is trusted -- confirm.
        raw_stmts = pickle.loads(s3_obj['Body'].read())

        logger.info('Expanding evidences and deduplicating')
        included = list(filter(self._can_include, _expanded(raw_stmts)))
        uniques, _dups = extract_duplicates(included,
                                            KeyFunc.mk_and_one_ev_src)
        return uniques

コード例 #10

0

ファイルを表示

ファイル: knowledgebase_manager.py プロジェクト: kolusask/indra_db

    def _get_statements(self):
        """Process the kinase-substrate OWL file from S3 and de-duplicate it.

        The gzipped OWL object is fetched from S3, decompressed and handed
        to ``biopax.process_owl_str``. The resulting Statements are
        de-duplicated with ``KeyFunc.mk_and_one_ev_src``; the duplicates and
        the unique/duplicate counts are printed to stdout.
        """
        from indra.sources import biopax

        client = boto3.client('s3')
        s3_obj = client.get_object(Bucket='bigmech',
                                   Key='indra-db/Kinase_substrates.owl.gz')
        compressed = s3_obj['Body'].read()
        # Adding 32 to wbits enables automatic zlib/gzip header detection.
        wbits = zlib.MAX_WBITS + 32
        owl_content = zlib.decompress(compressed, wbits)
        processor = biopax.process_owl_str(owl_content)
        uniques, duplicates = extract_duplicates(
            processor.statements, key_func=KeyFunc.mk_and_one_ev_src)
        print('\n'.join(str(dup) for dup in duplicates))
        print(len(uniques), len(duplicates))
        return uniques

コード例 #11

0

ファイルを表示

ファイル: knowledgebase.py プロジェクト: kkaris/indra_db

    def _get_statements(self):
        """Download the HPRD flat files and return de-duplicated Statements.

        The release tarball is downloaded into a temporary directory and
        extracted, the ``FLAT_FILES*`` directory inside it is located, and
        the relevant flat files are passed to ``hprd.process_flat_files``.
        The Statements are passed through ``_expanded`` and de-duplicated
        with ``KeyFunc.mk_and_one_ev_src``. The temporary directory is
        removed once processing is finished.

        The duplicates found are printed to stdout.

        Returns
        -------
        The unique Statements, i.e. the first element returned by
        ``extract_duplicates``.

        Raises
        ------
        requests.HTTPError
            If the tarball download comes back with an error status.
        FileNotFoundError
            If the extracted archive has no ``FLAT_FILES*`` directory.
        """
        import tarfile
        import requests
        from indra.sources import hprd

        # Download the files.
        hprd_base = 'http://www.hprd.org/RELEASE9/'
        resp = requests.get(hprd_base + 'HPRD_FLAT_FILES_041310.tar.gz')
        # Fail here instead of writing an error page to disk and hitting an
        # opaque tarfile error further down.
        resp.raise_for_status()

        # TemporaryDirectory removes the tarball and the extracted files on
        # exit, so repeated runs do not accumulate files on disk.
        with tempfile.TemporaryDirectory(suffix='hprd_files') as tmp_dir:
            tmp_tarfile = os.path.join(tmp_dir, 'hprd_files.tar.gz')
            with open(tmp_tarfile, 'wb') as f:
                f.write(resp.content)

            # Extract the files.
            # NOTE(review): extractall trusts the member paths stored in the
            # archive; consider an extraction filter where the supported
            # Python versions provide one.
            with tarfile.open(tmp_tarfile, 'r:gz') as tf:
                tf.extractall(tmp_dir)

            # Find the relevant files. Previously a failed search silently
            # fell through with whatever entry was listed last.
            files_dir = next((entry for entry in os.listdir(tmp_dir)
                              if entry.startswith('FLAT_FILES')), None)
            if files_dir is None:
                raise FileNotFoundError(
                    'No FLAT_FILES directory found in the HPRD archive '
                    'extracted to %s' % tmp_dir)
            files_path = os.path.join(tmp_dir, files_dir)
            file_names = {
                'id_mappings_file': 'HPRD_ID_MAPPINGS',
                'complexes_file': 'PROTEIN_COMPLEXES',
                'ptm_file': 'POST_TRANSLATIONAL_MODIFICATIONS',
                'ppi_file': 'BINARY_PROTEIN_PROTEIN_INTERACTIONS',
                'seq_file': 'PROTEIN_SEQUENCES'
            }
            kwargs = {
                kw: os.path.join(files_path, fname + '.txt')
                for kw, fname in file_names.items()
            }

            # Run the processor
            hp = hprd.process_flat_files(**kwargs)

            # Filter out exact duplicates. This is done while the files
            # still exist, in case anything is read lazily.
            unique_stmts, dups = \
                extract_duplicates(_expanded(hp.statements),
                                   key_func=KeyFunc.mk_and_one_ev_src)

        print('\n'.join(str(dup) for dup in dups))

        return unique_stmts

コード例 #12

0

ファイルを表示

ファイル: knowledgebase.py プロジェクト: kkaris/indra_db

 def _get_statements(self):
     """Process TAS from the web and return the unique Statements.

     The processor's Statements are passed through ``_expanded`` and then
     de-duplicated with ``KeyFunc.mk_and_one_ev_src`` as the key.
     """
     from indra.sources import tas
     # Rationale for the processor settings:
     # - affinity_class_limit=2: only affinities that indicate binding are
     #   included.
     # - named_only=True: only agents with some kind of name available are
     #   included; agents whose "name" is just an ID are left out.
     # - standardized_only=False: full standardization is not required, so
     #   drugs named via ChEMBL, HMS-LINCS or DrugBank can be extracted.
     tas_settings = {
         'affinity_class_limit': 2,
         'named_only': True,
         'standardized_only': False,
     }
     logger.info('Processing TAS from web')
     processor = tas.process_from_web(**tas_settings)
     logger.info('Expanding evidences and deduplicating')
     expanded = list(_expanded(processor.statements))
     uniques, _dups = extract_duplicates(expanded,
                                         KeyFunc.mk_and_one_ev_src)
     return uniques

コード例 #13

0

ファイルを表示

ファイル: knowledgebase.py プロジェクト: kkaris/indra_db

    def _get_statements(self):
        """Download and process the RLIMS-P files, then de-duplicate.

        Each ``(fname, id_type)`` pair in ``self._rlimsp_files`` is fetched
        from ``self._rlimsp_root``, decoded as UTF-8 and processed with
        ``rlimsp.process_from_jsonish_str``. The collected Statements are
        passed through ``_expanded`` and de-duplicated with
        ``KeyFunc.mk_and_one_ev_src``. Progress, the duplicates and the
        final counts are printed to stdout.
        """
        from indra.sources import rlimsp
        import requests

        collected = []
        for fname, id_type in self._rlimsp_files:
            print("Processing %s..." % fname)
            response = requests.get(self._rlimsp_root + fname)
            processor = rlimsp.process_from_jsonish_str(
                response.content.decode('utf-8'), id_type)
            new_stmts = processor.statements
            collected.extend(new_stmts)
            print("Added %d more statements from %s..." %
                  (len(new_stmts), fname))

        uniques, duplicates = extract_duplicates(
            _expanded(collected), key_func=KeyFunc.mk_and_one_ev_src)
        print('\n'.join(str(dup) for dup in duplicates))
        print(len(uniques), len(duplicates))

        return uniques

コード例 #14

0

ファイルを表示

ファイル: knowledgebase.py プロジェクト: kkaris/indra_db

 def _get_statements(self):
     """Download the PhosphoELM dump from S3 and return unique Statements.

     The dump is downloaded into a temporary directory and processed with
     ``phosphoelm.process_from_dump``. The Statements are passed through
     ``_expanded`` and de-duplicated with ``KeyFunc.mk_and_one_ev_src``.
     The temporary directory is removed once the dump has been processed.

     Returns
     -------
     The unique Statements, i.e. the first element returned by
     ``extract_duplicates``.
     """
     from indra.sources import phosphoelm
     logger.info('Fetching PhosphoElm dump from S3...')
     s3 = boto3.resource('s3')
     # TemporaryDirectory removes the downloaded dump on exit, so repeated
     # runs do not accumulate files on disk.
     with tempfile.TemporaryDirectory(suffix='phosphoelm_files') as tmp_dir:
         dump_file = os.path.join(tmp_dir, 'phosphoelm.dump')
         s3.meta.client.download_file('bigmech',
                                      'indra-db/phosphoELM_all_2015-04.dump',
                                      dump_file)
         logger.info('Processing PhosphoElm dump...')
         pp = phosphoelm.process_from_dump(dump_file)
         logger.info('Expanding evidences on PhosphoElm statements...')
         # Expand evidences just in case, though this processor always
         # produces a single evidence per statement. The list is built
         # while the dump file still exists.
         stmts = list(_expanded(pp.statements))
     # Return exactly one of multiple statements that are exactly the same
     # in terms of content and evidence.
     unique_stmts, _ = extract_duplicates(stmts, KeyFunc.mk_and_one_ev_src)
     return unique_stmts

コード例 #15

0

ファイルを表示

ファイル: knowledgebase_manager.py プロジェクト: kolusask/indra_db

 def _get_statements(self):
     """Process the TAS CSV and return the unique Statements.

     The processor's Statements are de-duplicated with
     ``extract_duplicates`` using its default key; the duplicates are
     printed to stdout.
     """
     from indra.sources.tas import process_csv
     processor = process_csv()
     uniques, duplicates = extract_duplicates(processor.statements)
     print(duplicates)
     return uniques