Esempio n. 1
0
 def test_get_set_of_pbdid_from_pdb_seqres_txt_empty_file(self):
     """A zero-length pdb_seqres.txt file yields an empty set of PDB ids."""
     work_dir = tempfile.mkdtemp()
     try:
         blast_task = MakeBlastDBTask(work_dir, D3RParameters())
         blast_task.create_dir()

         # Create the sequence file without writing anything to it.
         with open(blast_task.get_pdb_seqres_txt(), 'a'):
             pass
         self.assertTrue(os.path.isfile(blast_task.get_pdb_seqres_txt()))

         found_ids = blast_task.get_set_of_pbdid_from_pdb_seqres_txt()
         self.assertEqual(len(found_ids), 0)
     finally:
         shutil.rmtree(work_dir)
Esempio n. 2
0
 def test_get_set_of_pbdid_from_pdb_seqres_txt_file_no_seqs(self):
     """A pdb_seqres.txt file with no '>' header lines yields no PDB ids."""
     temp_dir = tempfile.mkdtemp()
     try:
         params = D3RParameters()
         task = MakeBlastDBTask(temp_dir, params)
         task.create_dir()

         # The context manager closes (and therefore flushes) the file
         # even if the write raises.
         with open(task.get_pdb_seqres_txt(), 'w') as f:
             f.write('hi\nhow\nare\nyou')

         self.assertTrue(os.path.isfile(task.get_pdb_seqres_txt()))
         pdbset = task.get_set_of_pbdid_from_pdb_seqres_txt()
         self.assertEqual(len(pdbset), 0)
     finally:
         shutil.rmtree(temp_dir)
Esempio n. 3
0
    def test_get_set_of_pdbid_in_crystalph_tsv_and_pdb_seqres_w_hits(self):
        """Only the PDB id present in both files is returned.

        The crystal pH tsv lists five ids (one of them, ``4rfr``, in lower
        case) and the sequence file holds ``4rfr`` and ``102l``; the
        expected result is the single upper-cased id ``4RFR``.
        """
        temp_dir = tempfile.mkdtemp()
        try:
            params = D3RParameters()
            task = DataImportTask(temp_dir, params)
            task.create_dir()

            # Context managers guarantee both fixture files are closed
            # before they are read back, even if a write fails.
            with open(task.get_crystalph_tsv(), 'w') as f:
                f.write('PDB_ID  _exptl_crystal_grow.pH\n')
                f.write('4X09\t6.5\n')
                f.write('4rfr\t8\n')
                f.write('4XET\t6.2\n')
                f.write('4XF1\t6.2\n')
                f.write('4XF3\t6.2\n')

            makeblast = MakeBlastDBTask(temp_dir, params)
            makeblast.create_dir()
            with open(makeblast.get_pdb_seqres_txt(), 'w') as f:
                f.write('>4rfr_A mol:protein length:154  MYOGLOBIN\n')
                f.write('MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRVK'
                        'HLKTEAEMKASEDLKKHG\n')
                f.write('>102l_A mol:protein length:165  T4 LYSOZYME\n')
                f.write('MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAAKSELDKA'
                        'IGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAAL'
                        'INMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRV'
                        'ITTFRTGTWDAYKNL\n')

            pdbset = task.get_set_of_pdbid_in_crystalph_tsv_and_pdb_seqres()
            self.assertEqual(len(pdbset), 1)
            self.assertTrue('4RFR' in pdbset)
        finally:
            shutil.rmtree(temp_dir)
Esempio n. 4
0
    def test_run_where_everything_is_successful(self):
        """run() succeeds when the download, gunzip and makeblastdb all work.

        A small gzip file is served through a ``file://`` URL and ``echo``
        stands in for the makeblastdb binary, so the arguments the task
        passed to it can be read back from ``echo.stdout`` in the task
        directory.
        """
        temp_dir = tempfile.mkdtemp()
        try:
            params = D3RParameters()
            fakegz = os.path.join(temp_dir, 'fake.gz')

            # The gzip stream is opened in binary mode, so write bytes;
            # a str payload raises TypeError on Python 3. try/finally is
            # used instead of `with` because GzipFile only gained context
            # manager support in Python 2.7.
            gz = gzip.open(fakegz, 'wb')
            try:
                gz.write(b'hello\n')
            finally:
                gz.close()

            params.pdbsequrl = 'file://' + fakegz
            params.makeblastdb = 'echo'
            task = MakeBlastDBTask(temp_dir, params)
            task._retrysleep = 0
            task._maxretries = 1
            task.run()
            self.assertEqual(task.get_error(), None)

            # check echo.stdout file for valid arguments
            with open(os.path.join(task.get_dir(), 'echo.stdout'), 'r') as f:
                line = f.readline()

            self.assertEqual(
                line, '-in ' + task.get_pdb_seqres_txt() + ' -out ' +
                os.path.join(task.get_dir(), 'pdb_db') + ' -dbtype prot\n')

            # The payload has no '>' header lines, so the third line of
            # the email log is expected to report zero sequences.
            lines = task.get_email_log().split('\n')
            self.assertEqual(lines[2], '# sequence(s): 0')
        finally:
            shutil.rmtree(temp_dir)
Esempio n. 5
0
    def test_get_set_of_pbdid_from_pdb_seqres_txt_with_400k_file(self):
        """Every distinct id in a very large sequence file is returned.

        Up to 400000 unique four-character ids are generated from
        permutations of lower-case letters and the digits 1-9, one
        sequence entry is written per id, and the resulting set must
        contain as many ids as entries were written.
        """
        temp_dir = tempfile.mkdtemp()
        try:
            params = D3RParameters()
            task = MakeBlastDBTask(temp_dir, params)
            task.create_dir()

            perms = itertools.permutations(
                string.ascii_lowercase + '123456789', 4)
            limit = 400000
            counter = 0
            with open(task.get_pdb_seqres_txt(), 'w') as f:
                # islice stops at `limit` or when the permutations run
                # out, whichever comes first, so there is no need to call
                # the Python-2-only perms.next() or catch StopIteration.
                for perm in itertools.islice(perms, limit):
                    f.write('>' + ''.join(perm) +
                            '_A mol:protein length:165  T4 LYSOZYME\n')
                    f.write('MVLSEGEWQLVLH\n')
                    counter += 1

            pdbset = task.get_set_of_pbdid_from_pdb_seqres_txt()
            self.assertEqual(len(pdbset), counter)

        finally:
            shutil.rmtree(temp_dir)
Esempio n. 6
0
    def test_get_set_of_pbdid_from_pdb_seqres_txt_with_seqs(self):
        """Ids are upper-cased and de-duplicated; the odd-length id is dropped.

        The fixture has seven header lines: ``101m``, ``102l`` (twice),
        ``103l``, ``10jj3m``, and ``104l`` (chains A and B). The expected
        result is the four ids ``101M``, ``102L``, ``103L`` and ``104L``.
        """
        temp_dir = tempfile.mkdtemp()
        try:
            params = D3RParameters()
            task = MakeBlastDBTask(temp_dir, params)
            task.create_dir()

            # The context manager closes the fixture file even if a
            # write fails part way through.
            with open(task.get_pdb_seqres_txt(), 'w') as f:
                f.write('>101m_A mol:protein length:154  MYOGLOBIN\n')
                f.write('MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRVK'
                        'HLKTEAEMKASEDLKKHG\n')
                f.write('>102l_A mol:protein length:165  T4 LYSOZYME\n')
                f.write('MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAAKSELDKA'
                        'IGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAAL'
                        'INMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRV'
                        'ITTFRTGTWDAYKNL\n')
                f.write('>102l_A mol:protein length:154  MYOGLOBIN\n')
                f.write('MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRFKHL'
                        'KTEAEMKASEDLKKAGVTVLTALGAILKKKGHHEAELKPLAQSHATKHKI'
                        'PIKYLEFISEAIIHVLHSRHPGNFGADAQGAMNKALELFRKDIAAKYKELGYQG\n')
                f.write('>103l_A mol:protein length:167  T4 LYSOZYME\n')
                f.write('MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNSLDAAKSELD'
                        'KAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRA'
                        'ALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAK'
                        'RVITTFRTGTWDAYKNL\n')
                f.write('>10jj3m_A mol:protein length:154  MYOGLOBIN\n')
                f.write('MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRF\n')
                f.write('>104l_A mol:protein length:166  T4 LYSOZYME\n')
                f.write('MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSAAE\n')
                f.write('>104l_B mol:protein length:166  T4 LYSOZYME\n')
                f.write('MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSAKNL\n')

            self.assertTrue(os.path.isfile(task.get_pdb_seqres_txt()))
            pdbset = task.get_set_of_pbdid_from_pdb_seqres_txt()
            self.assertEqual(len(pdbset), 4)
            for expected_id in ('101M', '102L', '103L', '104L'):
                self.assertTrue(expected_id in pdbset)
        finally:
            shutil.rmtree(temp_dir)
Esempio n. 7
0
 def test_get_sequence_count_file_has_zero_size(self):
     """A zero-length sequence file is reported as holding 0 sequences."""
     work_dir = tempfile.mkdtemp()
     try:
         blast_task = MakeBlastDBTask(work_dir, D3RParameters())
         blast_task.create_dir()

         # Create the sequence file without writing anything to it.
         with open(blast_task.get_pdb_seqres_txt(), 'a'):
             pass

         message = blast_task._get_sequence_count_message()
         self.assertEqual(message, '# sequence(s): 0')
     finally:
         shutil.rmtree(work_dir)
Esempio n. 8
0
    def get_set_of_pdbid_in_crystalph_tsv_and_pdb_seqres(self):
        """Gets set of PDBIDs that are in both tsv and sequence file

           Examines `DataImportTask.CRYSTALPH_TSV` and
           `MakeBlastDBTask.PDB_SEQRES_TXT` and returns a set of PDBIDs
           that are in both files. A warning is logged and an empty set
           is returned when the sequence file does not exist or when
           either file yields no PDBIDs.

           :returns: set of PDBIDs uppercase that are in both files above
        """
        make_blastdb = MakeBlastDBTask(self._path, self._args)
        seqres_txt = make_blastdb.get_pdb_seqres_txt()

        if not os.path.isfile(seqres_txt):
            logger.warning('No ' + seqres_txt + ' file found')
            return set()

        c_pdbid_set = self.get_set_of_pdbid_from_crystalph_tsv()

        if not c_pdbid_set:
            logger.warning('No PDBIds found in ' + self.get_crystalph_tsv())
            return set()

        seq_pdbid_set = make_blastdb.get_set_of_pbdid_from_pdb_seqres_txt()

        if not seq_pdbid_set:
            logger.warning('No PDBIds found in ' + seqres_txt)
            return set()

        # Keep the tsv ids that also appear in the sequence file. The loop
        # variable is named pdbid so it does not shadow the builtin id().
        common_pdbid = set(pdbid for pdbid in c_pdbid_set
                           if pdbid in seq_pdbid_set)

        logger.debug('Found ' + str(len(common_pdbid)) + ' PDBIDs in ' +
                     self.get_crystalph_tsv() + ' and ' + seqres_txt)

        return common_pdbid
Esempio n. 9
0
    def test_get_set_of_pbdid_from_pdb_seqres_txt_wrong_len_pdbids(self):
        """Header ids ``1m`` and ``abcdel`` are both rejected.

        The fixture contains only those two headers and the expected
        result is an empty set.
        """
        temp_dir = tempfile.mkdtemp()
        try:
            params = D3RParameters()
            task = MakeBlastDBTask(temp_dir, params)
            task.create_dir()

            # The context manager closes the fixture file even if a
            # write fails part way through.
            with open(task.get_pdb_seqres_txt(), 'w') as f:
                f.write('>1m_A mol:protein length:154  MYOGLOBIN\n')
                f.write('MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRVK'
                        'HLKTEAEMKASEDLKKHG\n')
                f.write('>abcdel_A mol:protein length:165  T4 LYSOZYME\n')
                f.write('MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAAKSELDKA'
                        'IGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAAL'
                        'INMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRV'
                        'ITTFRTGTWDAYKNL\n')

            self.assertTrue(os.path.isfile(task.get_pdb_seqres_txt()))
            pdbset = task.get_set_of_pbdid_from_pdb_seqres_txt()
            self.assertEqual(len(pdbset), 0)
        finally:
            shutil.rmtree(temp_dir)
Esempio n. 10
0
 def test_get_sequence_count_file_has_multiple_seqs(self):
     """A file with three '>' header lines is reported as 3 sequences."""
     temp_dir = tempfile.mkdtemp()
     try:
         params = D3RParameters()
         task = MakeBlastDBTask(temp_dir, params)
         task.create_dir()

         # The context manager closes (and therefore flushes) the file
         # even if the write raises.
         with open(task.get_pdb_seqres_txt(), 'w') as f:
             f.write('>hi\n>seq\n>are\n')

         self.assertEqual(task._get_sequence_count_message(),
                          '# sequence(s): 3')
     finally:
         shutil.rmtree(temp_dir)
Esempio n. 11
0
    def test_run_where_gunzip_fails(self):
        """run() reports an uncompress error when the download is not gzip.

        ``fake.gz`` is written as plain text, so after run() the task's
        error must be the 'Unable to uncompress file' message for the
        sequence file path.
        """
        temp_dir = tempfile.mkdtemp()
        try:
            params = D3RParameters()
            fakegz = os.path.join(temp_dir, 'fake.gz')

            # Deliberately plain text despite the .gz suffix; the context
            # manager closes the file even if the write raises.
            with open(fakegz, 'w') as f:
                f.write('hello\n')

            params.pdbsequrl = 'file://' + fakegz
            params.makeblastdb = 'makeblastdb'
            task = MakeBlastDBTask(temp_dir, params)
            task._retrysleep = 0
            task._maxretries = 1
            task.run()
            self.assertEqual(
                task.get_error(),
                'Unable to uncompress file: ' + task.get_pdb_seqres_txt())
        finally:
            shutil.rmtree(temp_dir)
Esempio n. 12
0
    def test_get_set_of_pdbid_in_crystalph_tsv_and_pdb_seqres_empty_seq(self):
        """A populated tsv paired with an empty sequence file yields no ids."""
        temp_dir = tempfile.mkdtemp()
        try:
            params = D3RParameters()
            task = DataImportTask(temp_dir, params)
            task.create_dir()

            # The context manager closes the tsv fixture even if a write
            # fails part way through.
            with open(task.get_crystalph_tsv(), 'w') as f:
                f.write('PDB_ID  _exptl_crystal_grow.pH\n')
                f.write('4X09\t6.5\n')
                f.write('4rfr\t8\n')
                f.write('4XET\t6.2\n')
                f.write('4XF1\t6.2\n')
                f.write('4XF3\t6.2\n')

            makeblast = MakeBlastDBTask(temp_dir, params)
            makeblast.create_dir()
            # Create the sequence file without writing anything to it.
            with open(makeblast.get_pdb_seqres_txt(), 'a'):
                pass

            pdbset = task.get_set_of_pdbid_in_crystalph_tsv_and_pdb_seqres()
            self.assertEqual(len(pdbset), 0)
        finally:
            shutil.rmtree(temp_dir)
Esempio n. 13
0
 def test_get_pdbseqres_txt(self):
     """The sequence file path is <base>/<task dir name>/pdb_seqres.txt."""
     blast_task = MakeBlastDBTask('/foo', D3RParameters())
     expected_path = os.path.join('/foo', blast_task.get_dir_name(),
                                  'pdb_seqres.txt')
     self.assertEqual(blast_task.get_pdb_seqres_txt(), expected_path)