コード例 #1
0
ファイル: utils.py プロジェクト: D-I-L/django-data-pipeline
 def ensmart_homolog_parse(cls, *args, **kwargs):
     ''' Parse an Ensembl mart homolog download and pass it to the Gene loader.

     The download file is resolved by ``cls._get_download_file``; the
     attributes, index and index type are read from ``kwargs['section']``.
     '''
     mart_path = cls._get_download_file(*args, **kwargs)
     with open(mart_path, 'rt') as mart_handle:
         section = kwargs['section']
         Gene.ensmart_homolog_parse(
             mart_handle,
             section['attrs'],
             section['index'],
             section['index_type'],
         )
コード例 #2
0
ファイル: utils.py プロジェクト: D-I-L/django-data-pipeline
    def gene_info_parse(cls, *args, **kwargs):
        ''' Parse the gzipped NCBI gene_info download using the section's index. '''
        info_path = cls._get_download_file(*args, **kwargs)
        section = kwargs['section']
        index_name = section['index']

        with gzip.open(info_path, 'rt') as info_handle:
            Gene.gene_info_parse(info_handle, index_name)
コード例 #3
0
ファイル: utils.py プロジェクト: D-I-L/django-data-pipeline
 def ensembl_gene_parse(cls, *args, **kwargs):
     ''' Parse the gzipped Ensembl gene GTF and dump the result to the stage file as JSON.

     ``Gene.gene_mapping`` is called with the section's index and index type
     before the download is parsed.
     '''
     stage_path = cls._get_stage_file(*args, **kwargs)
     gtf_path = cls._get_download_file(*args, **kwargs)
     section = kwargs['section']
     Gene.gene_mapping(section['index'], section['index_type'])
     with gzip.open(gtf_path, 'rt') as gtf_handle, open(stage_path, 'w') as stage_handle:
         parsed_genes = Gene.ensembl_gene_parse(gtf_handle)
         json.dump(parsed_genes, stage_handle, indent=0)
コード例 #4
0
    def test__replace_oldids_with_newids(self):
        '''Test that old gene ids are replaced with new ids and discontinued ids are dropped.'''
        gene_sets = ['339457', '197215', '26191']
        new_gene_ids = {'339457': '85452'}

        # Without discontinued ids, only the superseded id is expected to change.
        replaced_gene_sets = Gene._replace_oldids_with_newids(gene_sets, new_gene_ids)
        self.assertEqual(replaced_gene_sets, ['85452', '197215', '26191'], "Replaced 339457 with 85452")

        # With discontinued ids supplied, those ids are expected to be absent from the result.
        discontinued_ids = ['197215']
        replaced_gene_sets = Gene._replace_oldids_with_newids(gene_sets, new_gene_ids, discontinued_ids)
        self.assertEqual(replaced_gene_sets, ['85452', '26191'],
                         "Replaced 339457 with 85452 and removed discontinued 197215")
コード例 #5
0
    def test__convert_entrezid2ensembl(self):
        '''Test the entrez to ensembl id conversion for one and for two input ids.'''
        ini_config = IniParser().read_ini("tests/test_download.ini")
        section = ini_config["BIOPLEX"]
        self.assertIsNotNone(section, "Section is not none")

        # Single entrez id: exactly one, known, ensembl id comes back.
        single_result = Gene._convert_entrezid2ensembl(['26191'], section)
        self.assertTrue(len(single_result) == 1, "Got back one id")
        self.assertEqual(single_result[0], "ENSG00000134242", "Got back the right ensembl id for 26191")

        # Two entrez ids: two ensembl ids come back.
        pair_result = Gene._convert_entrezid2ensembl(['26191', '339457'], section)
        self.assertTrue(len(pair_result) == 2, "Got back 2 ensembl ids")
コード例 #6
0
    def _process_pathway(cls, download_file, stage_output_file, section, source, is_public, config=None):
        '''Parse a pathway input file (eg: kegg, reactome, go) and write it out as JSON documents.

        INPUT file format (tab separated):
        Pathway name \t Pathway url \t List of entrez ids
        REACTOME_RNA_POL_I_TRANSCRIPTION_TERMINATION
        http://www.broadinstitute.org/gsea/msigdb/cards/REACTOME_RNA_POL_I_TRANSCRIPTION_TERMINATION1022
        2068    2071    25885    284119    2965    2966    2967    2968    4331

        The entrez ids are converted to ensembl ids through ``Gene._entrez_ensembl_lookup``;
        ids missing from the lookup result are left out of the written gene set.

        Parameters:
        download_file -- path of the tab separated pathway file to read
        stage_output_file -- stage path; the JSON is written to the same path with ".out" replaced by ".json"
        section -- ini section, passed to the lookup and to ``cls._load_pathway_mappings``
        source -- value stored in the "source" field of every document
        is_public -- value stored in the "is_public" field of every document
        config -- optional configuration passed through to the lookup
        '''
        json_target_file_path = stage_output_file.replace(".out", ".json")

        # The line count decides below whether a document is the last one (no trailing comma).
        with open(download_file, encoding='utf-8') as line_count_file:
            row_count = sum(1 for _ in line_count_file)
        logger.debug('Number of lines in the file ' + str(row_count))

        load_mapping = True

        # First pass: collect every entrez id so the lookup is done in a single call.
        gene_sets = []
        with open(download_file, encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE)
            for row in reader:
                gene_sets.extend(row[2:])
        ens_look_up = Gene._entrez_ensembl_lookup(gene_sets, section, config)

        # Second pass: write one JSON document per pathway row. Both files are managed by
        # the ``with`` statement so the JSON is flushed and closed before the mappings load.
        count = 0
        with open(json_target_file_path, mode='w', encoding='utf-8') as json_target_file, \
                open(download_file, encoding='utf-8') as csvfile:
            json_target_file.write('{"docs":[\n')
            reader = csv.reader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE)

            for row in reader:
                path_object = {
                    "pathway_name": row[0],
                    "pathway_url": row[1],
                    "gene_sets": [ens_look_up[entrez] for entrez in row[2:] if entrez in ens_look_up],
                    "source": source,
                    "is_public": is_public,
                }
                json_target_file.write(json.dumps(path_object))
                count += 1
                json_target_file.write('\n' if row_count == count else ',\n')

            json_target_file.write('\n]}')

        logger.debug("No. genes to load "+str(count))
        logger.debug("Json written to " + json_target_file_path)
        logger.debug("Load mappings")

        if load_mapping:
            status = cls._load_pathway_mappings(section)
            print(status)
    def get_ensemb_ids(self, entrez_list):
        '''Return the result of ``Gene._entrez_ensembl_lookup`` for the given entrez ids.

        The lookup is called with a fixed section (index 'genes_hg38_v0.0.2',
        index type 'gene_history') registered under 'GENE_HISTORY' in the config.
        '''
        section = {'index': 'genes_hg38_v0.0.2', 'index_type': 'gene_history'}
        config = {'GENE_HISTORY': section}
        return Gene._entrez_ensembl_lookup(entrez_list, section, config)
コード例 #8
0
    def test__check_gene_history(self):
        '''Test that gene history returns one replaced id and one discontinued id.'''
        section = IniParser().read_ini("tests/test_download.ini")["BIOPLEX"]
        self.assertIsNotNone(section, "Section is not none")

        entrez_ids = ['339457', '197215', '26191']
        replaced, discontinued = Gene._check_gene_history(entrez_ids, section)
        self.assertTrue(len(replaced) == 1, "Got back one new id")
        self.assertIn('339457', replaced, "Got back 339457 in new gene ids")
        self.assertTrue(len(discontinued) == 1, "Got back one discontinued geneid")
コード例 #9
0
    def test__check_gene_history(self):
        '''Test that gene history returns the expected replaced and discontinued ids.'''
        ini_config = IniParser().read_ini(MY_INI_FILE)
        entrez_ids = ['56730', '26026', '188', '26191']

        replaced, discontinued = Gene._check_gene_history(entrez_ids, ini_config)

        # Replaced ids map old id (key) to new id (value).
        self.assertTrue(len(replaced) == 2, "Got back two new ids")
        self.assertIn('56730', replaced.keys(), "Got back 56730 in new gene ids -key")
        self.assertIn('84666', replaced.values(), "Got back 84666 in new gene ids -value")

        self.assertIn('188', discontinued, "Got back 188 in new discontinued_ids")
        self.assertTrue(len(discontinued) == 1, "Got back one discontinued geneid")
コード例 #10
0
    def test_gene_history_loader(self):
        """ Load the GENE_HISTORY section, then check the document count and index mapping. """
        call_command("pipeline", "--steps", "load", sections="GENE_HISTORY", dir=TEST_DATA_DIR, ini=MY_INI_FILE)

        history_section = IniParser().read_ini(MY_INI_FILE)["GENE_HISTORY"]
        idx = history_section["index"]
        idx_type = history_section["index_type"]
        elastic = Search(idx=idx, idx_type=idx_type)
        Search.index_refresh(idx)

        doc_count = elastic.get_count()["count"]
        self.assertTrue(doc_count > 1, "Count documents in the index")

        # Compare the mapping reported by the index with the one the loader defines.
        expected_props = Gene.gene_history_mapping(idx, idx_type, test_mode=True).mapping_properties
        index_mappings = elastic.get_mapping()
        if idx not in index_mappings:
            logger.error("MAPPING ERROR: " + json.dumps(index_mappings))
        self._cmpMappings(index_mappings[idx]["mappings"], expected_props, idx_type)
コード例 #11
0
    def _process_bioplex(cls, download_file, stage_output_file, section, config):
        '''Process a bioplex data file. The interactors are in the first two columns; they are
        converted to ensembl ids and stored in the temporary stage output file.

        Input File format:
        GeneA    GeneB    UniprotA    UniprotB    SymbolA    SymbolB    pW    pNI    pInt
        100    728378    P00813    A5A3E0    ADA    POTEF    2.38086E-09    0.000331856    0.999668142
        100    345651    P00813    Q562R1    ADA    ACTBL2    9.79E-18    0.211914437    0.788085563

        Output file format:
        interactorA    interactorB
        ENSG00000196839    ENSG00000196604
        ENSG00000196839    ENSG00000169067

        Rows where either interactor is missing from the lookup are not written; both of their
        ids are logged as unmapped. The stage file is then passed to
        ``cls._process_interaction_out_file``.
        '''
        # First pass: collect every entrez id so the lookup is done in a single call.
        gene_sets = []
        with open(download_file, encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE)
            for row in reader:
                gene_sets.extend([row['GeneA'], row['GeneB']])

        ens_look_up = Gene._entrez_ensembl_lookup(gene_sets, section, config)

        mapped_counter = 0
        unmapped_ids = []
        # Second pass: both files are managed by the ``with`` statement so the stage file is
        # closed (even on error) before it is processed further.
        with open(stage_output_file, 'w') as stage_output_file_handler, \
                open(download_file, encoding='utf-8') as csvfile:
            stage_output_file_handler.write('interactorA' + '\t' + 'interactorB\n')
            reader = csv.DictReader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE)
            for row in reader:
                interactor_a = row['GeneA']
                interactor_b = row['GeneB']
                if interactor_a in ens_look_up and interactor_b in ens_look_up:
                    line = ens_look_up[interactor_a] + '\t' + ens_look_up[interactor_b] + '\n'
                    stage_output_file_handler.write(line)
                    mapped_counter += 1
                else:
                    unmapped_ids.append(interactor_a)
                    unmapped_ids.append(interactor_b)

        logger.debug("\n".join(unmapped_ids))
        logger.debug("Mapped {}  Unmapped {} " . format(mapped_counter, len(unmapped_ids)))

        cls._process_interaction_out_file(stage_output_file, section, False)
コード例 #12
0
ファイル: utils.py プロジェクト: D-I-L/django-data-pipeline
 def gene_mgi_parse(cls, *args, **kwargs):
     ''' Open the download file as text and pass it to ``Gene.gene_mgi_parse`` with the section's index. '''
     mgi_path = cls._get_download_file(*args, **kwargs)
     with open(mgi_path, 'rt') as mgi_handle:
         index_name = kwargs['section']['index']
         Gene.gene_mgi_parse(mgi_handle, index_name)
コード例 #13
0
ファイル: utils.py プロジェクト: D-I-L/django-data-pipeline
 def gene_history_parse(cls, *args, **kwargs):
     ''' Parse the gzipped NCBI gene_history download.

     ``Gene.gene_history_mapping`` is called with the section's index and
     index type before the file is parsed.
     '''
     history_path = cls._get_download_file(*args, **kwargs)
     section = kwargs['section']
     Gene.gene_history_mapping(section['index'], section['index_type'])
     with gzip.open(history_path, 'rt') as history_handle:
         section = kwargs['section']
         Gene.gene_history_parse(history_handle, section['index'], section['index_type'])
コード例 #14
0
ファイル: utils.py プロジェクト: D-I-L/django-data-pipeline
 def gene_pub_parse(cls, *args, **kwargs):
     ''' Parse the gzipped NCBI gene2pubmed download using the section's index. '''
     pubs_path = cls._get_download_file(*args, **kwargs)
     with gzip.open(pubs_path, 'rt') as pubs_handle:
         index_name = kwargs['section']['index']
         Gene.gene_pub_parse(pubs_handle, index_name)
コード例 #15
0
ファイル: utils.py プロジェクト: D-I-L/django-data-pipeline
 def gene2ensembl_parse(cls, *args, **kwargs):
     ''' Parse the gzipped NCBI gene2ensembl download using the section's index and index type. '''
     mapping_path = cls._get_download_file(*args, **kwargs)
     with gzip.open(mapping_path, 'rt') as mapping_handle:
         section = kwargs['section']
         Gene.gene2ensembl_parse(mapping_handle, section['index'], section['index_type'])
コード例 #16
0
    def check_bioplex_data(self, child_doc, parent_doc):
        '''
        Check a bioplex interaction document against the bioplex source file.

        Get all interactors, collect the ensembl ids, convert them to entrez ids.
        Fetch the source file from bioplex and search for the parent entrez id.
        If both sets of entrez ids have the same size, compare the interactor counts.
        If they differ, check that the ids in the difference are discontinued in gene_history.
        '''
        config = IniParser().read_ini("download.ini")

        self.assertEqual(getattr(child_doc, "interaction_source"), 'bioplex', 'interaction_source is bioplex')

        # Get interactors and their ensembl ids
        interactors = getattr(child_doc, 'interactors')
        ensembl_ids_interactors = [interactor['interactor'] for interactor in interactors]

        # Do a ensembl to entrez id lookup; only the entrez ids (values) are needed
        section = config["ENSEMBL_GENE"]
        ensembl_entrez_dict = Gene._ensembl_entrez_lookup(ensembl_ids_interactors, section)
        entrez_list_pydgin = set(ensembl_entrez_dict.values())

        number_of_interactors_pydgin = len(interactors)

        parent_id = parent_doc.doc_id()
        self.assertEqual(parent_id, child_doc.parent(), 'Parent id ok')

        parent_entrez = getattr(parent_doc, "dbxrefs")["entrez"]

        # Download bioplex file from source and search for the parent entrez id interactors
        section_bioplex = config["BIOPLEX"]
        file_url = section_bioplex['location'] + section_bioplex['files']

        status = HTTPDownload.download(file_url, '/tmp', 'bioplex.tmp')
        # Fail with a clear message when the download did not succeed, instead of
        # continuing without any bioplex data to compare against.
        self.assertTrue(status, 'Downloaded bioplex file from ' + file_url)

        # Whole-word match so the parent id does not match inside a longer number.
        parent_pattern = re.compile(r"\b" + re.escape(parent_entrez) + r"\b")
        interactor_counter = 0
        entrez_list_bioplex = set()
        with open('/tmp/bioplex.tmp', "r") as data:
            for line in data:
                if parent_pattern.search(line):
                    tmp_list = line.split()
                    # The first two columns hold the interacting pair; keep the partner(s).
                    if tmp_list[0] != parent_entrez:
                        entrez_list_bioplex.add(tmp_list[0])
                    if tmp_list[1] != parent_entrez:
                        entrez_list_bioplex.add(tmp_list[1])
                    interactor_counter += 1

        if len(entrez_list_pydgin) == len(entrez_list_bioplex):
            self.assertEqual(number_of_interactors_pydgin, interactor_counter,
                             "Interactor count is correct " + str(number_of_interactors_pydgin))
        else:
            # find the missing one - subtract the smaller set from the larger one.
            if len(entrez_list_pydgin) > len(entrez_list_bioplex):
                diff = entrez_list_pydgin - entrez_list_bioplex
            else:
                diff = entrez_list_bioplex - entrez_list_pydgin

            # now check if these ids exists in history
            # Do a entrez to ensembl id lookup in gene history
            (newgene_ids, discontinued_ids) = Gene._check_gene_history(list(diff), config)  # @UnusedVariable
            self.assertEqual(len(diff), len(discontinued_ids),
                             "The missing ids where found in gene_history as discontinued ids")
コード例 #17
0
    def test_gene_pipeline(self):
        """ Run the gene pipeline section by section and check the indexed documents after each step. """

        gtf_section = IniParser().read_ini(MY_INI_FILE)["ENSEMBL_GENE_GTF"]
        idx = gtf_section["index"]
        idx_type = gtf_section["index_type"]

        def run_pipeline(section_name, *steps):
            # Run the pipeline command for one section with the given steps, then refresh the index.
            call_command("pipeline", "--steps", *steps, sections=section_name, dir=TEST_DATA_DIR, ini=MY_INI_FILE)
            Search.index_refresh(idx)

        def docs_for_symbol(symbol):
            # Search the index for documents whose "symbol" field matches the given string.
            symbol_query = ElasticQuery.query_string(symbol, fields=["symbol"])
            return Search(symbol_query, idx=idx).search().docs

        # 1. Test ensembl GTF loading.
        run_pipeline("ENSEMBL_GENE_GTF", "stage", "load")
        elastic = Search(idx=idx, idx_type=idx_type)
        self.assertGreaterEqual(elastic.get_count()["count"], 1, "Count documents in the index")
        expected_props = Gene.gene_mapping(idx, idx_type, test_mode=True).mapping_properties
        index_mappings = elastic.get_mapping()
        if idx not in index_mappings:
            logger.error("MAPPING ERROR: " + json.dumps(index_mappings))
        self._cmpMappings(index_mappings[idx]["mappings"], expected_props, idx_type)

        # 2. Test adding entrez ID to documents
        run_pipeline("GENE2ENSEMBL", "load")
        docs = docs_for_symbol("PTPN22")
        self.assertEqual(len(docs), 1)
        self.assertTrue("entrez" in getattr(docs[0], "dbxrefs"))
        self.assertEqual(getattr(docs[0], "dbxrefs")["entrez"], "26191")

        # 3. Add uniprot and fill in missing entrez fields.
        run_pipeline("ENSMART_GENE", "download", "load")
        docs = docs_for_symbol("DNMT3L")
        self.assertTrue("entrez" in getattr(docs[0], "dbxrefs"))
        self.assertTrue("swissprot" in getattr(docs[0], "dbxrefs"))

        # 4. Add gene synonyms and dbxrefs.
        run_pipeline("GENE_INFO", "load")
        docs = docs_for_symbol("PTPN22")
        self.assertTrue("PTPN8" in getattr(docs[0], "synonyms"))

        # 5. Add PMIDs to gene docs.
        run_pipeline("GENE_PUBS", "load")
        docs = docs_for_symbol("PTPN22")
        self.assertGreater(len(getattr(docs[0], "pmids")), 0)

        # 6. Add ortholog data.
        run_pipeline("ENSMART_HOMOLOG", "load")
        docs = docs_for_symbol("PTPN22")
        dbxrefs = getattr(docs[0], "dbxrefs")
        self.assertTrue("orthologs" in dbxrefs, dbxrefs)
        self.assertTrue("mmusculus" in dbxrefs["orthologs"], dbxrefs)
        self.assertEqual("ENSMUSG00000027843", dbxrefs["orthologs"]["mmusculus"]["ensembl"])

        # The same document must also be reachable through the mouse ortholog ensembl id.
        ortholog_query = ElasticQuery.filtered(
            Query.match_all(),
            TermsFilter.get_terms_filter("dbxrefs.orthologs.mmusculus.ensembl", ["ENSMUSG00000027843"]),
        )
        docs = Search(ortholog_query, idx=idx, size=1).search().docs
        self.assertEqual(len(docs), 1)

        # 7. Add mouse ortholog link to MGI
        run_pipeline("ENSEMBL2MGI", "load")
        docs = Search(ortholog_query, idx=idx, size=1).search().docs
        dbxrefs = getattr(docs[0], "dbxrefs")
        self.assertEqual("ENSMUSG00000027843", dbxrefs["orthologs"]["mmusculus"]["ensembl"])
        self.assertEqual("107170", dbxrefs["orthologs"]["mmusculus"]["MGI"])