def test_run_advanced(self):
    """Queue several commands on one Sultan object and check the result.

    Two ``mkdir`` and two ``touch`` calls are chained and executed with a
    single ``run()``; a separate ``ls -1`` on the directory is then expected
    to return the three created entries. The scratch directory is removed
    afterwards regardless of the outcome.
    """
    scratch_dir = '/tmp/mytestdir'
    sultan = Sultan()
    try:
        setup_chain = (
            sultan
            .mkdir("-p /tmp/mytestdir")
            .mkdir("-p /tmp/mytestdir/foobar")
            .touch("/tmp/mytestdir/a")
            .touch("/tmp/mytestdir/b")
        )
        setup_chain.run()

        listing = sultan.ls("-1 /tmp/mytestdir/").run()
        self.assertEqual(listing, ['a', 'b', 'foobar'])
    finally:
        # Clean up only if the commands actually created the directory.
        if os.path.exists(scratch_dir):
            shutil.rmtree(scratch_dir)
def _read_pssm_profile(pssm_path, structure):
    """Parse an ASCII PSSM file into a per-residue profile DataFrame.

    Args:
        pssm_path: Path of the ASCII PSSM file to read.
        structure: Structure string already padded with ONE leading
            character; index 0 is attached to the header row and index
            ``i`` to the i-th data row.

    Returns:
        A DataFrame whose first column is ``Structure``, followed by
        ``Sequence`` and the columns labelled ``A`` .. ``V`` divided by 100.

    Raises:
        IndexError: if the file has more matrix rows than ``structure`` has
            characters, or if no matrix rows are found at all.
    """
    rows = []
    with open(pssm_path, 'r') as pssm_file:
        # The first two lines are skipped before the header row.
        pssm_file.readline()
        pssm_file.readline()
        for position, raw_line in enumerate(pssm_file):
            stripped = raw_line.rstrip()
            if not stripped:
                # The first blank line ends the block that is parsed.
                break
            fields = stripped.split()
            fields.append(structure[position])
            if position == 0:
                # Pad the header row with two empty cells so its labels line
                # up with the data rows, which start with two extra fields.
                fields[:0] = ['', '']
            rows.append(fields)

    profile = pd.DataFrame(rows)
    # Drop positional columns 2..21 (presumably the log-odds scores; confirm
    # against the PSI-BLAST ASCII PSSM layout), keeping the second block.
    profile = profile.drop(columns=profile.columns[2:22])
    # Drop the second- and third-to-last columns (presumably the two
    # per-position summary values; TODO confirm); the last one is kept
    # because it holds the structure character on data rows.
    profile = profile.drop(columns=profile.columns[-3:-1])
    # Drop the first column (the leading field of every data row).
    profile = profile.drop(columns=profile.columns[0])

    # Promote the header row to column labels and discard it from the data.
    profile.columns = profile.iloc[0]
    profile = profile[1:]
    profile = profile.rename(columns={profile.columns[0]: "Sequence"})
    profile = profile.rename(columns={profile.columns[-1]: "Structure"})

    # Move the Structure column to the front.
    profile = profile[
        ['Structure'] + [col for col in profile.columns if col != 'Structure']]
    profile.loc[:, 'A':'V'] = profile.loc[:, 'A':'V'].astype(
        float).divide(100)
    return profile


def generate_profiles(in_dataframe, out_path):
    """Generate PSI-BLAST profiles for every sequence in a dataframe.

    Intended to be used internally.

    The function decompresses ``../data/swiss-prot/uniprot_sprot.fasta.gz``
    (path relative to the current working directory), builds a protein BLAST
    database from it, and runs PSI-BLAST once per row. Each resulting PSSM is
    converted to a tab-separated ``<out_path>/profile/<name>.profile`` file.

    Args:
        in_dataframe: DataFrame with ``Sequence`` and ``Structure`` columns.
            Row labels are either plain ids or ``(id, chain)`` tuples.
            NOTE: rows for which no PSSM is produced are dropped from this
            object IN PLACE.
        out_path: Output directory (str or Path). Created, together with its
            ``profile`` subdirectory, if missing.

    Returns:
        None. The cleaned dataframe is written with ``dump`` to
        ``full_test_summary.joblib`` (tuple labels) or
        ``jpred_summary.joblib`` (plain labels) inside ``out_path``.
    """
    out_path = Path(out_path)
    dataset = in_dataframe
    shell = Sultan()

    print('Unpacking and generating Uniprot DB.')
    shell.gunzip('-fk ../data/swiss-prot/uniprot_sprot.fasta.gz').run()
    make_db = NcbimakeblastdbCommandline(
        input_file='../data/swiss-prot/uniprot_sprot.fasta', dbtype='prot')
    make_db()

    # pathlib instead of a shell ``mkdir``: also creates missing parents and
    # is safe for paths containing spaces.
    profile_dir = out_path / 'profile'
    profile_dir.mkdir(parents=True, exist_ok=True)

    # Default for an empty dataframe, where the loop never assigns it (this
    # used to raise UnboundLocalError). Assumes tuple row labels come from a
    # MultiIndex; the per-row check below stays authoritative.
    if isinstance(dataset.index, pd.MultiIndex):
        dump_path = out_path / 'full_test_summary.joblib'
    else:
        dump_path = out_path / 'jpred_summary.joblib'

    # Labels of rows without a profile; dropped after the loop so the frame
    # is not modified while it is being iterated.
    unprofiled = []

    with TemporaryDirectory() as psi_temp:
        psi_temp = Path(psi_temp)
        for _, sample in tqdm(dataset.iterrows(),
                              total=len(dataset),
                              desc='Generating profiles'):
            if isinstance(sample.name, tuple):
                sample_id, chain = sample.name[0], sample.name[1]
                out_name = f'{sample_id}_{chain}'
                dump_path = out_path / 'full_test_summary.joblib'
            else:
                out_name = sample.name
                dump_path = out_path / 'jpred_summary.joblib'

            sequence, structure = sample[['Sequence', 'Structure']]
            pssm_path = psi_temp / f'{out_name}.pssm'

            with NamedTemporaryFile(mode='w') as blast_in:
                # Single-record FASTA query for this sample.
                print(f'>{out_name}', file=blast_in)
                print(sequence, file=blast_in)
                # Make sure the query is on disk before PSI-BLAST reads it.
                blast_in.flush()
                psiblast = NcbipsiblastCommandline(
                    query=blast_in.name,
                    db='../data/swiss-prot/uniprot_sprot.fasta',
                    evalue=0.01,
                    num_iterations=3,
                    out_ascii_pssm=str(pssm_path),
                    num_descriptions=10000,
                    num_alignments=10000,
                    num_threads=8)
                psiblast()

            if not pssm_path.exists():
                tqdm.write(
                    f'Unable to generate profile for {out_name}. No hits in the database.'
                )
                unprofiled.append(sample.name)
                continue

            # One padding character so the header row of the PSSM consumes
            # index 0 and data rows line up with the real structure string.
            profile = _read_pssm_profile(pssm_path, ' ' + structure)
            profile.to_csv(profile_dir / f'{out_name}.profile',
                           sep='\t',
                           index=False)

    if unprofiled:
        dataset.drop(index=unprofiled, inplace=True)

    print(
        f'Dumping clean test to {dump_path}. Profiles are generated in {out_path}/profile'
    )
    dump(dataset, dump_path)