Esempio n. 1
0
    def test__check_gene_history(self):
        '''Check that Gene._check_gene_history reports replaced and discontinued gene ids.

        Reads the ini file named by MY_INI_FILE and passes the whole parsed config
        together with four gene ids. Expects two replacement entries (with 56730
        among the keys and 84666 among the values) and exactly one discontinued
        id (188).
        '''
        # NOTE(review): a second method with this exact name follows in this listing;
        # if both live in the same class the later definition replaces this one - confirm.
        config = IniParser().read_ini(MY_INI_FILE)
        gene_sets = ['56730', '26026', '188', '26191']

        (newgene_ids, discontinued_ids) = Gene._check_gene_history(gene_sets, config)

        # assertEqual reports the actual length on failure, assertTrue(len(..) == n) does not
        self.assertEqual(len(newgene_ids), 2, "Got back two new ids")
        self.assertIn('56730', newgene_ids, "Got back 56730 in new gene ids -key")
        self.assertIn('84666', newgene_ids.values(), "Got back 84666 in new gene ids -value")
        self.assertIn('188', discontinued_ids, "Got back 188 in new discontinued_ids")
        self.assertEqual(len(discontinued_ids), 1, "Got back one discontinued geneid")
    def test__check_gene_history(self):
        '''Check that Gene._check_gene_history reports replaced and discontinued gene ids.

        Reads tests/test_download.ini and passes its BIOPLEX section together with
        three gene ids. Expects one replacement entry keyed by 339457 and exactly
        one discontinued id.
        '''
        config = IniParser().read_ini("tests/test_download.ini")
        section = config["BIOPLEX"]
        self.assertIsNotNone(section, "Section is not none")

        gene_sets = ['339457', '197215', '26191']
        (newgene_ids, discontinued_ids) = Gene._check_gene_history(gene_sets, section)

        # assertEqual reports the actual length on failure, assertTrue(len(..) == n) does not
        self.assertEqual(len(newgene_ids), 1, "Got back one new id")
        self.assertIn('339457', newgene_ids, "Got back 339457 in new gene ids")
        self.assertEqual(len(discontinued_ids), 1, "Got back one discontinued geneid")
    def check_bioplex_data(self, child_doc, parent_doc):
        '''
        Cross-check the bioplex interactors stored on child_doc against the source file.

        1. Collect the ensembl ids of all interactors on child_doc and convert them
           to entrez ids with Gene._ensembl_entrez_lookup.
        2. Download the bioplex file configured in download.ini and scan it for lines
           mentioning the parent's entrez id, collecting the ids found in the first
           two whitespace-separated columns (the parent itself excluded).
        3. If both id sets have the same size, the number of interactors on child_doc
           must equal the number of matching source lines. Otherwise the ids present
           only in the larger set must all be reported as discontinued by
           Gene._check_gene_history.

        child_doc  -- interaction document; read via interaction_source, interactors
                      and parent()
        parent_doc -- gene document; read via doc_id() and dbxrefs["entrez"]
        '''
        config = IniParser().read_ini("download.ini")

        self.assertEqual(child_doc.interaction_source, 'bioplex', 'interaction_source is bioplex')

        # Ensembl ids of every interactor stored on the child document
        interactors = child_doc.interactors
        ensembl_ids_interactors = [interactor['interactor'] for interactor in interactors]

        # Ensembl -> entrez lookup; only the entrez ids (dict values) are needed
        section = config["ENSEMBL_GENE"]
        ensembl_entrez_dict = Gene._ensembl_entrez_lookup(ensembl_ids_interactors, section)
        entrez_list_pydgin = set(ensembl_entrez_dict.values())

        number_of_interactors_pydgin = len(interactors)

        self.assertEqual(parent_doc.doc_id(), child_doc.parent(), 'Parent id ok')

        parent_entrez = parent_doc.dbxrefs["entrez"]

        # Download bioplex file from source and search for the parent entrez id interactors
        section_bioplex = config["BIOPLEX"]
        file_url = section_bioplex['location'] + section_bioplex['files']

        status = HTTPDownload.download(file_url, '/tmp', 'bioplex.tmp')
        # Fail with a clear message when the download did not succeed; previously the
        # comparison below hit an undefined entrez_list_bioplex and raised NameError.
        # (Assumes a truthy status means success, as the original `if status:` did.)
        self.assertTrue(status, 'Downloaded bioplex file from ' + file_url)

        # Compile once, the pattern does not change while scanning the file
        parent_pattern = re.compile(r"\b" + re.escape(parent_entrez) + r"\b")
        interactor_counter = 0
        entrez_list_bioplex = set()
        with open('/tmp/bioplex.tmp', "r") as data:
            for line in data:
                # NOTE(review): the pattern is matched against the whole line, not only
                # the first two columns - confirm no other column can contain the id.
                if not parent_pattern.search(line):
                    continue
                columns = line.split()
                # The partner is whichever of the first two columns is not the parent
                entrez_list_bioplex.update(col for col in columns[:2] if col != parent_entrez)
                interactor_counter += 1

        if len(entrez_list_pydgin) == len(entrez_list_bioplex):
            self.assertEqual(number_of_interactors_pydgin, interactor_counter,
                             "Interactor count is correct " + str(number_of_interactors_pydgin))
        else:
            # Find the missing ids: those present only in the larger of the two sets
            if len(entrez_list_pydgin) > len(entrez_list_bioplex):
                diff = entrez_list_pydgin - entrez_list_bioplex
            else:
                diff = entrez_list_bioplex - entrez_list_pydgin

            # Every missing id is expected to show up in gene history as discontinued.
            # NOTE(review): the whole config is passed here rather than a single
            # section - confirm this is what Gene._check_gene_history expects.
            (_, discontinued_ids) = Gene._check_gene_history(list(diff), config)
            self.assertEqual(len(diff), len(discontinued_ids),
                             "The missing ids where found in gene_history as discontinued ids")