def test_get_set_of_pbdid_from_pdb_seqres_txt_empty_file(self):
    """A zero-byte pdb_seqres.txt must produce an empty PDB id set."""
    work_dir = tempfile.mkdtemp()
    try:
        blast_task = MakeBlastDBTask(work_dir, D3RParameters())
        blast_task.create_dir()

        # Opening in append mode and closing right away creates the
        # file without writing anything to it.
        seqres_path = blast_task.get_pdb_seqres_txt()
        open(seqres_path, 'a').close()
        self.assertTrue(os.path.isfile(blast_task.get_pdb_seqres_txt()))

        found_ids = blast_task.get_set_of_pbdid_from_pdb_seqres_txt()
        self.assertEqual(len(found_ids), 0)
    finally:
        shutil.rmtree(work_dir)
def test_get_set_of_pbdid_from_pdb_seqres_txt_file_no_seqs(self):
    """Text without any '>' header lines must produce an empty set."""
    work_dir = tempfile.mkdtemp()
    try:
        blast_task = MakeBlastDBTask(work_dir, D3RParameters())
        blast_task.create_dir()

        # None of these lines is a sequence header.
        with open(blast_task.get_pdb_seqres_txt(), 'w') as seqres:
            seqres.write('hi\nhow\nare\nyou')

        self.assertTrue(os.path.isfile(blast_task.get_pdb_seqres_txt()))
        found_ids = blast_task.get_set_of_pbdid_from_pdb_seqres_txt()
        self.assertEqual(len(found_ids), 0)
    finally:
        shutil.rmtree(work_dir)
def test_get_set_of_pdbid_in_crystalph_tsv_and_pdb_seqres_w_hits(self):
    """Only the id present in both the tsv and the seqres file is returned.

    The tsv lists five ids and the sequence file lists two; '4rfr' is
    the single id they share, and the test expects it back as '4RFR'.
    """
    work_dir = tempfile.mkdtemp()
    try:
        params = D3RParameters()

        import_task = DataImportTask(work_dir, params)
        import_task.create_dir()
        tsv_rows = [
            'PDB_ID _exptl_crystal_grow.pH\n',
            '4X09\t6.5\n',
            '4rfr\t8\n',
            '4XET\t6.2\n',
            '4XF1\t6.2\n',
            '4XF3\t6.2\n',
        ]
        with open(import_task.get_crystalph_tsv(), 'w') as tsv:
            tsv.writelines(tsv_rows)

        blast_task = MakeBlastDBTask(work_dir, params)
        blast_task.create_dir()
        seqres_rows = [
            '>4rfr_A mol:protein length:154 MYOGLOBIN\n',
            ('MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRVK'
             'HLKTEAEMKASEDLKKHG\n'),
            '>102l_A mol:protein length:165 T4 LYSOZYME\n',
            ('MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAAKSELDKA'
             'IGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAAL'
             'INMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRV'
             'ITTFRTGTWDAYKNL\n'),
        ]
        with open(blast_task.get_pdb_seqres_txt(), 'w') as seqres:
            seqres.writelines(seqres_rows)

        shared_ids = (
            import_task.get_set_of_pdbid_in_crystalph_tsv_and_pdb_seqres())
        self.assertEqual(len(shared_ids), 1)
        self.assertTrue('4RFR' in shared_ids)
    finally:
        shutil.rmtree(work_dir)
def test_run_where_everything_is_successful(self):
    """run() succeeds when download, gunzip and makeblastdb all work.

    A real gzip file is served through a file:// URL and `echo` stands
    in for makeblastdb, so the arguments passed to the command can be
    read back from the echo.stdout file in the task directory.
    """
    temp_dir = tempfile.mkdtemp()
    try:
        params = D3RParameters()
        fakegz = os.path.join(temp_dir, 'fake.gz')

        # try/finally rather than `with`: GzipFile only became a context
        # manager in Python 2.7.  The payload is a bytes literal so the
        # write is valid on both Python 2 and Python 3.
        gz = gzip.open(fakegz, 'wb')
        try:
            gz.write(b'hello\n')
        finally:
            gz.close()

        params.pdbsequrl = 'file://' + fakegz
        params.makeblastdb = 'echo'
        task = MakeBlastDBTask(temp_dir, params)
        # No sleeping and a single attempt keep the test fast.
        task._retrysleep = 0
        task._maxretries = 1
        task.run()
        self.assertEqual(task.get_error(), None)

        # check echo.stdout file for valid arguments; the handle is
        # closed before asserting so a failed assertion cannot leak it.
        with open(os.path.join(task.get_dir(), 'echo.stdout'), 'r') as f:
            line = f.readline()
        self.assertEqual(
            line,
            '-in ' + task.get_pdb_seqres_txt() +
            ' -out ' + os.path.join(task.get_dir(), 'pdb_db') +
            ' -dbtype prot\n')

        # The payload has no '>' header lines, so the email log is
        # expected to report zero sequences on its third line.
        lines = task.get_email_log().split('\n')
        self.assertEqual(lines[2], '# sequence(s): 0')
    finally:
        shutil.rmtree(temp_dir)
def test_get_set_of_pbdid_from_pdb_seqres_txt_with_400k_file(self):
    """Parsing a seqres file with up to 400,000 distinct ids finds them all.

    Each id is a unique 4-character permutation of lowercase letters and
    the digits 1-9, so the resulting set must have one entry per header
    written.
    """
    temp_dir = tempfile.mkdtemp()
    try:
        params = D3RParameters()
        task = MakeBlastDBTask(temp_dir, params)
        task.create_dir()

        limit = 400000
        perms = itertools.permutations(
            string.ascii_lowercase + '123456789', 4)
        counter = 0
        # islice stops at `limit` or when the permutations run out,
        # whichever comes first; it replaces the Python 2-only
        # perms.next() call and the manual StopIteration handling.
        with open(task.get_pdb_seqres_txt(), 'w') as f:
            for perm in itertools.islice(perms, limit):
                f.write('>' + ''.join(perm) +
                        '_A mol:protein length:165 T4 LYSOZYME\n')
                f.write('MVLSEGEWQLVLH\n')
                counter += 1

        pdbset = task.get_set_of_pbdid_from_pdb_seqres_txt()
        self.assertEqual(len(pdbset), counter)
    finally:
        shutil.rmtree(temp_dir)
def test_get_set_of_pbdid_from_pdb_seqres_txt_with_seqs(self):
    """A realistic seqres file yields exactly the four expected ids.

    The fixture repeats '102l', lists two chains of '104l', and includes
    a header whose id ('10jj3m') is longer than four characters; the
    test expects only 101M, 102L, 103L and 104L in the result.
    """
    work_dir = tempfile.mkdtemp()
    try:
        blast_task = MakeBlastDBTask(work_dir, D3RParameters())
        blast_task.create_dir()

        seqres_rows = [
            '>101m_A mol:protein length:154 MYOGLOBIN\n',
            ('MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRVK'
             'HLKTEAEMKASEDLKKHG\n'),
            '>102l_A mol:protein length:165 T4 LYSOZYME\n',
            ('MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAAKSELDKA'
             'IGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAAL'
             'INMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRV'
             'ITTFRTGTWDAYKNL\n'),
            '>102l_A mol:protein length:154 MYOGLOBIN\n',
            ('MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRFKHL'
             'KTEAEMKASEDLKKAGVTVLTALGAILKKKGHHEAELKPLAQSHATKHKI'
             'PIKYLEFISEAIIHVLHSRHPGNFGADAQGAMNKALELFRKDIAAKYKELGYQG\n'),
            '>103l_A mol:protein length:167 T4 LYSOZYME\n',
            ('MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNSLDAAKSELD'
             'KAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRA'
             'ALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAK'
             'RVITTFRTGTWDAYKNL\n'),
            '>10jj3m_A mol:protein length:154 MYOGLOBIN\n',
            'MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRF\n',
            '>104l_A mol:protein length:166 T4 LYSOZYME\n',
            'MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSAAE\n',
            '>104l_B mol:protein length:166 T4 LYSOZYME\n',
            'MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSAKNL\n',
        ]
        with open(blast_task.get_pdb_seqres_txt(), 'w') as seqres:
            seqres.writelines(seqres_rows)

        self.assertTrue(os.path.isfile(blast_task.get_pdb_seqres_txt()))
        found_ids = blast_task.get_set_of_pbdid_from_pdb_seqres_txt()
        self.assertEqual(len(found_ids), 4)
        for expected_id in ('101M', '102L', '103L', '104L'):
            self.assertTrue(expected_id in found_ids)
    finally:
        shutil.rmtree(work_dir)
def test_get_sequence_count_file_has_zero_size(self):
    """An empty seqres file must be reported as containing 0 sequences."""
    work_dir = tempfile.mkdtemp()
    try:
        blast_task = MakeBlastDBTask(work_dir, D3RParameters())
        blast_task.create_dir()

        # Create the file without writing anything to it.
        open(blast_task.get_pdb_seqres_txt(), 'a').close()

        count_message = blast_task._get_sequence_count_message()
        self.assertEqual(count_message, '# sequence(s): 0')
    finally:
        shutil.rmtree(work_dir)
def get_set_of_pdbid_in_crystalph_tsv_and_pdb_seqres(self):
    """Returns the PDB ids found in both the crystal pH tsv and seqres file.

    Builds a `MakeBlastDBTask` for the same path and arguments as this
    task, then intersects the ids from
    `get_set_of_pdbid_from_crystalph_tsv()` with those from the blast
    task's `get_set_of_pbdid_from_pdb_seqres_txt()`.  A warning is
    logged and an empty set returned as soon as the sequence file is
    missing or either source yields no ids.

    :returns: set of PDB ids present in both files; empty set when the
              sequence file is missing or either file has no ids
    """
    blast_task = MakeBlastDBTask(self._path, self._args)
    seqres_path = blast_task.get_pdb_seqres_txt()

    if not os.path.isfile(seqres_path):
        logger.warning('No ' + seqres_path + ' file found')
        return set()

    tsv_ids = self.get_set_of_pdbid_from_crystalph_tsv()
    if len(tsv_ids) == 0:
        logger.warning('No PDBIds found in ' + self.get_crystalph_tsv())
        return set()

    seqres_ids = blast_task.get_set_of_pbdid_from_pdb_seqres_txt()
    if len(seqres_ids) == 0:
        logger.warning('No PDBIds found in ' + seqres_path)
        return set()

    # Keep every tsv id that also appears among the sequence ids.
    shared_ids = set(pdb_id for pdb_id in tsv_ids if pdb_id in seqres_ids)

    logger.debug('Found ' + str(len(shared_ids)) + ' PDBIDs in ' +
                 self.get_crystalph_tsv() + ' and ' + seqres_path)
    return shared_ids
def test_get_set_of_pbdid_from_pdb_seqres_txt_wrong_len_pdbids(self):
    """Headers whose ids are not four characters long yield no ids.

    The fixture uses a two-character id ('1m') and a six-character id
    ('abcdel'); the test expects an empty result set.
    """
    work_dir = tempfile.mkdtemp()
    try:
        blast_task = MakeBlastDBTask(work_dir, D3RParameters())
        blast_task.create_dir()

        seqres_rows = [
            '>1m_A mol:protein length:154 MYOGLOBIN\n',
            ('MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRVK'
             'HLKTEAEMKASEDLKKHG\n'),
            '>abcdel_A mol:protein length:165 T4 LYSOZYME\n',
            ('MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAAKSELDKA'
             'IGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAAL'
             'INMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRV'
             'ITTFRTGTWDAYKNL\n'),
        ]
        with open(blast_task.get_pdb_seqres_txt(), 'w') as seqres:
            seqres.writelines(seqres_rows)

        self.assertTrue(os.path.isfile(blast_task.get_pdb_seqres_txt()))
        found_ids = blast_task.get_set_of_pbdid_from_pdb_seqres_txt()
        self.assertEqual(len(found_ids), 0)
    finally:
        shutil.rmtree(work_dir)
def test_get_sequence_count_file_has_multiple_seqs(self):
    """Three '>' header lines must be reported as 3 sequences."""
    work_dir = tempfile.mkdtemp()
    try:
        blast_task = MakeBlastDBTask(work_dir, D3RParameters())
        blast_task.create_dir()

        with open(blast_task.get_pdb_seqres_txt(), 'w') as seqres:
            seqres.write('>hi\n>seq\n>are\n')

        count_message = blast_task._get_sequence_count_message()
        self.assertEqual(count_message, '# sequence(s): 3')
    finally:
        shutil.rmtree(work_dir)
def test_run_where_gunzip_fails(self):
    """run() records an uncompress error when the download is not gzip.

    The 'fake.gz' file is plain text, so the task is expected to end
    with the 'Unable to uncompress file' error.
    """
    work_dir = tempfile.mkdtemp()
    try:
        # Plain text written under a .gz name.
        bogus_gz = os.path.join(work_dir, 'fake.gz')
        with open(bogus_gz, 'w') as plain:
            plain.write('hello\n')

        params = D3RParameters()
        params.pdbsequrl = 'file://' + bogus_gz
        params.makeblastdb = 'makeblastdb'

        blast_task = MakeBlastDBTask(work_dir, params)
        # No sleeping and a single attempt keep the test fast.
        blast_task._retrysleep = 0
        blast_task._maxretries = 1
        blast_task.run()

        expected_error = ('Unable to uncompress file: ' +
                          blast_task.get_pdb_seqres_txt())
        self.assertEqual(blast_task.get_error(), expected_error)
    finally:
        shutil.rmtree(work_dir)
def test_get_set_of_pdbid_in_crystalph_tsv_and_pdb_seqres_empty_seq(self):
    """A populated tsv with an empty seqres file yields no common ids."""
    work_dir = tempfile.mkdtemp()
    try:
        params = D3RParameters()

        import_task = DataImportTask(work_dir, params)
        import_task.create_dir()
        tsv_rows = [
            'PDB_ID _exptl_crystal_grow.pH\n',
            '4X09\t6.5\n',
            '4rfr\t8\n',
            '4XET\t6.2\n',
            '4XF1\t6.2\n',
            '4XF3\t6.2\n',
        ]
        with open(import_task.get_crystalph_tsv(), 'w') as tsv:
            tsv.writelines(tsv_rows)

        # The sequence file exists but holds no data.
        blast_task = MakeBlastDBTask(work_dir, params)
        blast_task.create_dir()
        open(blast_task.get_pdb_seqres_txt(), 'a').close()

        shared_ids = (
            import_task.get_set_of_pdbid_in_crystalph_tsv_and_pdb_seqres())
        self.assertEqual(len(shared_ids), 0)
    finally:
        shutil.rmtree(work_dir)
def test_get_pdbseqres_txt(self):
    """get_pdb_seqres_txt() is <path>/<task dir name>/pdb_seqres.txt."""
    blast_task = MakeBlastDBTask('/foo', D3RParameters())
    actual_path = blast_task.get_pdb_seqres_txt()
    expected_path = os.path.join(
        '/foo', blast_task.get_dir_name(), 'pdb_seqres.txt')
    self.assertEqual(actual_path, expected_path)