def call_hhblits(input_file,
                 output_file,
                 database,
                 maxfilt=100000,
                 realign_max=100000,
                 B=100000,
                 Z=100000,
                 n=3,
                 e=0.001,
                 retry_hhblits_with_memory_limit_if_fail=False,
                 hhr_file=None,
                 **kwargs):
    """Call HH-blits with the arguments recommended for CCMpred.

    See https://github.com/soedinglab/CCMpred/wiki/FAQ

    Args:
        input_file: query file given to hhblits through ``-i``.
        output_file: alignment written by hhblits through ``-oa3m``
            (``str`` or ``pathlib.Path``).
        database: database given to hhblits through ``-d``.
        maxfilt, realign_max, B, Z, n, e: values forwarded to the hhblits
            options ``-maxfilt``, ``-realign_max``, ``-B``, ``-Z``, ``-n``
            and ``-e`` respectively.
        retry_hhblits_with_memory_limit_if_fail: when true and the first run
            produced no output file, hhblits is run a second time with
            ``-cpu 1 -maxmem 1`` appended.
        hhr_file: file given to hhblits through ``-o``; defaults to
            ``output_file`` with its extension replaced by ``.hhr``.
        **kwargs: accepted and ignored.

    Returns:
        The hhr file path (the derived ``pathlib.Path`` or the value passed
        by the caller).

    Raises:
        Exception: if ``output_file`` does not exist once hhblits (and the
            optional retry) has finished.
    """
    # Keep the caller's value for the command line, but do the existence
    # checks through a Path so that plain strings are accepted as well.
    output_path = pathlib.Path(output_file)
    if hhr_file is None:
        # with_suffix only touches the file name, so dots in parent
        # directories or an extension-less name cannot mangle the result.
        hhr_file = output_path.with_suffix(".hhr")
    print("calling hhblits on " + str(input_file) + " using " +
          str(database) + ", output will be available at " +
          str(output_file))
    fm.check_if_file_ok(input_file)
    # NOTE: the paths are interpolated unquoted into a shell command line.
    hhblits_call = ("hhblits -maxfilt " + str(maxfilt) +
                    " -realign_max " + str(realign_max) +
                    " -d " + str(database) +
                    " -all -B " + str(B) +
                    " -Z " + str(Z) +
                    " -n " + str(n) +
                    " -e " + str(e) +
                    " -i " + str(input_file) +
                    " -oa3m " + str(output_file) +
                    " -o " + str(hhr_file))
    print(hhblits_call)
    subprocess.Popen(hhblits_call, shell=True).wait()
    if not output_path.exists() and retry_hhblits_with_memory_limit_if_fail:
        print("HH-blits failed for some reason, trying with a memory limit")
        memory_friendly_call = hhblits_call + " -cpu 1 -maxmem 1"
        subprocess.Popen(memory_friendly_call, shell=True).wait()
    if not output_path.exists():
        raise Exception("HH-blits failed. Protein is probably too long ?")
    return hhr_file
def from_msgpack(cls, binary_file, **kwargs):
    """Initialize an MRF from a msgpack file given by CCMpredPy.

    Args:
        binary_file: path of the msgpack file to read.
        **kwargs: forwarded to the class constructor. When no ``name`` is
            given, one is derived from ``binary_file`` by replacing every
            ``/`` with ``-`` and dropping one leading ``-``.

    Returns:
        The new instance, with ``binary_file`` stored as an attribute.
    """
    print("getting Potts model from " + str(binary_file))
    # Validate the path before reading it: the check is pointless once the
    # file has already been opened and parsed.
    fm.check_if_file_ok(binary_file)
    with open(str(binary_file), 'rb') as data_file:
        df = msgpack.unpackb(data_file.read())
    # df is a dictionary with the entries:
    #   b'format'
    #   b'ncol'
    #   b'x_single': np.array(ncol, 20)
    #   b'x_pair'  : np.array(ncol, ncol, 21, 21)
    #   b'meta'
    # NOTE(review): the byte-string keys assume msgpack hands back raw bytes
    # keys for this file - confirm against the installed msgpack version.
    ncol = df[b'ncol']

    # Single-site fields are stored with 20 columns; they are copied into a
    # 21-column array whose last column stays at zero.
    v_20 = np.array(df[b'x_single']).reshape((ncol, 20))
    v = np.zeros((ncol, 21))
    v[:, :-1] = v_20

    # Each stored pair (i, j) fills w[i, j] and, transposed, w[j, i].
    w = np.zeros((ncol, ncol, 21, 21))
    for p in df[b'x_pair'].values():
        i = p[b'i']
        j = p[b'j']
        mat = np.array(p[b'x']).reshape((21, 21))
        w[i, j, :, :] = mat
        w[j, i, :, :] = mat.T

    if 'name' not in kwargs:
        kwargs['name'] = str(binary_file).replace('/', '-')
        if kwargs['name'].startswith('-'):
            kwargs['name'] = kwargs['name'][1:]
    mrf = cls(v, w, **kwargs)
    mrf.binary_file = binary_file
    return mrf
def call_hhfilter(input_file, output_file, hhid):
    """Run hhfilter on an alignment.

    The command is ``hhfilter -i <input_file> -o <output_file> -id <hhid>``,
    executed through the shell.

    Raises:
        Exception: if ``output_file`` is not a file once hhfilter is done.
    """
    print("calling hhfilter", hhid, "on", input_file)
    fm.check_if_file_ok(input_file)
    command = " ".join([
        "hhfilter",
        "-i", str(input_file),
        "-o", str(output_file),
        "-id", str(hhid),
    ])
    process = subprocess.Popen(command, shell=True)
    process.wait()
    if output_file.is_file():
        return
    raise Exception("HHfilter failed")
def call_muscle_profile(msa_file, seq_file, output_file):
    """Run Muscle in profile mode on an MSA file and a sequence file.

    ``msa_file`` is passed as ``in1``, ``seq_file`` as ``in2`` and
    ``output_file`` as ``out``; the gap open value is set to -0.1.
    Only ``seq_file`` is checked before the call.
    """
    fm.check_if_file_ok(seq_file)
    muscle_options = {
        "profile": True,
        "in1": str(msa_file),
        "in2": str(seq_file),
        "out": str(output_file),
        "gapopen": -0.1,
    }
    run_muscle = MuscleCommandline(**muscle_options)
    # The captured output streams are not used.
    _stdout, _stderr = run_muscle()
def call_mafft(input_file, output_file=None):
    """Run MAFFT on ``input_file`` and redirect its output to a file.

    Args:
        input_file: file handed to mafft.
        output_file: destination of the mafft standard output (``str`` or
            ``pathlib.Path``). Defaults to the input file name with its
            extension replaced by ``_mafft.fasta``, in the same directory.

    Returns:
        The output file (the derived ``pathlib.Path`` when ``output_file``
        was not given, otherwise the value passed by the caller).

    Raises:
        Exception: if the command exits with a non-zero status or the
            output file does not exist afterwards.
    """
    print("calling MAFFT")
    if output_file is None:
        input_path = pathlib.Path(input_file)
        # Build the name from the file stem only, so that dots in parent
        # directories or a missing extension cannot mangle the result.
        output_file = input_path.with_name(input_path.stem + "_mafft.fasta")
    fm.check_if_file_ok(input_file)
    cmd = "mafft " + str(input_file) + " > " + str(output_file)
    return_code = subprocess.Popen(cmd, shell=True).wait()
    # The shell redirection creates the output file even when mafft itself
    # fails, so the exit status has to be checked as well.
    if return_code != 0 or not pathlib.Path(output_file).is_file():
        raise Exception("MAFFT failed")
    return output_file
def infer_insertion_penalties_in_file(seed_a3m_file, seed_length, output_file):
    """Infer insertion penalties with DCAbuild and store them in a file.

    Calls the DCAbuild function to infer insertion penalties from the MSA
    @seed_a3m_file, with insertions indicated as lower case letters with
    respect to an MSA of length @seed_length. The result is written as a
    .tsv file in DCAbuild format to @output_file, then read back, adjusted
    and written again to the same file.
    """
    fm.check_if_file_ok(seed_a3m_file)
    infer_with_dcabuild = Main.include(julia_script_insertions_file)
    infer_with_dcabuild(str(seed_a3m_file), seed_length, str(output_file))

    penalties = get_insertion_penalties_from_file(output_file)
    # External insertions are set to 0: the first entry of each list is
    # overwritten and an extra 0 is appended at the end.
    for penalty_list in (penalties['open'], penalties['extend']):
        penalty_list[0] = 0
        penalty_list.append(0)
    write_insertion_penalties_in_file(penalties, output_file)
def call_reformat(input_file, output_file):
    """Convert an a3m file to fas with Soeding's reformat.pl.

    The command is ``reformat.pl a3m fas <input_file> <output_file> -r``,
    executed through the shell with its output streams discarded.

    Raises:
        Exception: if ``output_file`` is not a file once the script is done.
    """
    print("will reformat", input_file, "thanks to Soeding's reformat.pl to",
          output_file)
    fm.check_if_file_ok(input_file)
    call = " ".join(
        ["reformat.pl", "a3m", "fas", str(input_file), str(output_file), "-r"])
    print(call)
    process = subprocess.Popen(call,
                               shell=True,
                               stdout=subprocess.DEVNULL,
                               stderr=subprocess.DEVNULL)
    process.wait()
    if output_file.is_file():
        return
    raise Exception("Reformat failed")
def get_insertion_penalties_from_file(insertion_penalties_file):
    """Read an insertion penalties file in DCAbuild .tsv format.

    Each tab-separated row provides the gap open penalty in its first
    column and the gap extend penalty in its second column.

    Returns:
        A dictionary {"open": [list of gap open penalties],
        "extend": [list of gap extend penalties]}.
    """
    fm.check_if_file_ok(insertion_penalties_file)
    gap_open = []
    gap_extend = []
    with open(str(insertion_penalties_file), 'r') as handle:
        for fields in csv.reader(handle, delimiter='\t'):
            gap_open.append(float(fields[0]))
            gap_extend.append(float(fields[1]))
    return {'open': gap_open, 'extend': gap_extend}
def call_muscle(input_file, output_file=None):
    """Run Muscle on ``input_file``.

    Args:
        input_file: file handed to Muscle as ``input``.
        output_file: file handed to Muscle as ``out`` (``str`` or
            ``pathlib.Path``). Defaults to the input file name with its
            extension replaced by ``_muscle.fasta``, in the same directory.

    Returns:
        The output file (the derived ``pathlib.Path`` when ``output_file``
        was not given, otherwise the value passed by the caller).

    Raises:
        Exception: if the output file does not exist after the run.
    """
    print("Calling Muscle")
    if output_file is None:
        input_path = pathlib.Path(input_file)
        # Build the name from the file stem only, so that dots in parent
        # directories or a missing extension cannot mangle the result.
        output_file = input_path.with_name(input_path.stem + "_muscle.fasta")
    muscle_cline = MuscleCommandline(input=str(input_file),
                                     out=str(output_file))
    print(muscle_cline)
    fm.check_if_file_ok(input_file)
    stdout, stderr = muscle_cline()
    # Go through Path so that a plain string output_file is accepted too.
    if not pathlib.Path(output_file).is_file():
        raise Exception("Muscle failed")
    return output_file
def call_trimal(input_file, output_file, trimal_gt, cons, colnumbering_file=None):
    """Run trimal on an alignment.

    The command is ``trimal -in <input_file> -out <output_file>
    -gt <trimal_gt> -cons <cons>``, executed through the shell. When
    ``colnumbering_file`` is given, ``-colnumbering`` is added and the
    standard output of trimal is redirected to that file.

    Raises:
        Exception: if ``output_file`` is not a file once trimal is done.
    """
    print("calling trimal gt", trimal_gt, "cons", cons, "on", input_file)
    fm.check_if_file_ok(input_file)
    command_parts = [
        "trimal",
        "-in", str(input_file),
        "-out", str(output_file),
        "-gt", str(trimal_gt),
        "-cons", str(cons),
    ]
    if colnumbering_file is not None:
        command_parts += ["-colnumbering", ">", str(colnumbering_file)]
    process = subprocess.Popen(" ".join(command_parts), shell=True)
    process.wait()
    if output_file.is_file():
        return
    raise Exception("Trimal failed")
def from_training_set(cls,
                      aln_file,
                      binary_file,
                      write_readme=True,
                      readme_file=None,
                      **kwargs):
    """Initialize a Potts model from a train MSA file.

    Builds a ``ccmpred <aln_file> -b <binary_file>`` command line, runs it
    through the shell and loads the resulting binary file with
    ``cls.from_msgpack``.

    Args:
        aln_file: training alignment handed to ccmpred.
        binary_file: file ccmpred is asked to write through ``-b``.
        write_readme: when true, the command line is dumped as JSON into
            ``readme_file`` before the command is run.
        readme_file: destination of that dump (``str`` or
            ``pathlib.Path``). Defaults to ``binary_file`` without its
            ``.mrf`` extension, followed by ``_mrf_README.txt``.
        **kwargs: each key, with ``_`` replaced by ``-``, that belongs to
            POSSIBLE_CCMPRED_OPTIONS is added as a ``--option``. Boolean
            values add the bare flag when True and nothing when False;
            other values are appended after the option name.

    Returns:
        The new instance, with ``training_set`` set to ``aln_file`` as a
        ``pathlib.Path``.

    Raises:
        Exception: if ``binary_file`` does not exist after the run.
    """
    fm.check_if_file_ok(aln_file)
    call = "ccmpred " + str(aln_file) + " -b " + str(binary_file)
    for key_arg, value in kwargs.items():
        arg_ccm = key_arg.replace('_', '-')
        if arg_ccm not in POSSIBLE_CCMPRED_OPTIONS:
            continue
        if isinstance(value, bool):
            if value is True:
                # A boolean option is a bare flag without a value.
                call += " --" + arg_ccm + " "
        else:
            call += " --" + arg_ccm + " " + str(value)

    if write_readme:
        if readme_file is None:
            binary_name = str(binary_file)
            # Only strip the extension when it really is ".mrf", instead
            # of blindly dropping the last four characters.
            if binary_name.endswith(".mrf"):
                binary_name = binary_name[:-len(".mrf")]
            readme_file = pathlib.Path(binary_name + "_mrf_README.txt")
        # Go through Path so that a plain string readme_file works too.
        with pathlib.Path(readme_file).open(mode='w') as f:
            json.dump(call, f, default=str)

    print(call)
    subprocess.Popen(call, shell=True).wait()
    if not os.path.exists(str(binary_file)):
        raise Exception(
            "CCMpredPy wasn't able to infer the MRF. Protein is probably too long ?"
        )
    mrf = cls.from_msgpack(binary_file)
    mrf.training_set = pathlib.Path(aln_file)
    return mrf
def main(args=sys.argv[1:]):
    """Command line entry point: draw a circos image for a Potts folder.

    Parses the command line, loads the Potts object from ``--potts_folder``,
    fetches the PDB file from ``--pdb_id`` when no ``--pdb_file`` is given,
    and hands every parsed option to
    ``create_circos_from_potts_object_and_pdb_chain``.

    Args:
        args: list of command line arguments (without the program name).

    Raises:
        SystemExit: on invalid arguments, including when neither
            ``--pdb_file`` nor ``--pdb_id`` is provided.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-f',
                        '--potts_folder',
                        help="Feature folder",
                        type=pathlib.Path,
                        required=True)
    parser.add_argument('--pdb_file',
                        help="PDB file",
                        type=pathlib.Path,
                        default=None)
    parser.add_argument('-i', '--pdb_id', help="PDB id", required=False)
    parser.add_argument('-cid',
                        '--chain_id',
                        help="PDB chain id (default : A)",
                        default='A')
    parser.add_argument(
        '-sep',
        '--coupling_sep_min',
        help="Min. nb residues between members of a coupling (default : 3)",
        type=int,
        default=3)
    parser.add_argument('-n',
                        '--top',
                        help="Nb of couplings displayed (default : 20)",
                        type=int,
                        default=20)
    parser.add_argument(
        '--auto_top',
        help=
        "Nb couplings displayed = elbow of the score curve (default : False)",
        default=False,
        action='store_true')
    parser.add_argument('--wij_cutoff',
                        help="||wij|| <= wij_cutoff (default : no cutoff)",
                        default=None,
                        type=float)
    parser.add_argument(
        '-num',
        '--numbering_type',
        help=
        "Use the same numbering type around the circle as sequence (sequence) or PDB structure (pdb) (default : numbering as sequence)",
        default='sequence')
    parser.add_argument(
        '-o',
        '--output_circos_image',
        help=
        "Output circos image (default : [potts_folder]/circos_output/circos.png)",
        type=pathlib.Path,
        default=None)
    parser.add_argument('-t',
                        '--thickness',
                        help="Couplings thickness factor (default : 1)",
                        type=float,
                        default=1)
    args = vars(parser.parse_args(args))

    # Without a PDB file the structure is fetched from its id, so at least
    # one of the two is needed; report it as a usage error rather than
    # failing later on a str + None concatenation.
    if args['pdb_file'] is None and args['pdb_id'] is None:
        parser.error("one of --pdb_file or -i/--pdb_id is required")

    fm.check_if_dir_ok(args["potts_folder"])
    potts_object = Potts_Object.from_folder(args['potts_folder'])
    if args['pdb_file'] is None:
        name = str(potts_object.folder) + '/' + args['pdb_id']
        args['pdb_file'] = fm.fetch_pdb_file(args['pdb_id'], name)
    fm.check_if_file_ok(args["pdb_file"])
    create_circos_from_potts_object_and_pdb_chain(potts_object, **args)
def test_no_file(self):
    """The file check must raise for a file name that does not exist."""
    missing_path = "patate.fasta"
    self.assertRaises(Exception, fm.check_if_file_ok, missing_path)
def test_file_exists(self):
    """The file check must go through without raising for SEQ_1CC8."""
    fm.check_if_file_ok(SEQ_1CC8)
def from_sequence_file_with_submat(cls, seq_file, tau=0.5, **kwargs):
    """Build an instance from a sequence file, with substitution matrix pseudocounts.

    The first sequence of the fasta file ``seq_file`` is read, upper-cased
    and handed to ``cls.from_sequence_with_submat`` together with
    ``seq_file``, ``tau`` and the remaining keyword arguments.

    Args:
        seq_file: fasta file the first sequence is read from.
        tau: forwarded to ``cls.from_sequence_with_submat``.
        **kwargs: forwarded to ``cls.from_sequence_with_submat``.
    """
    fm.check_if_file_ok(seq_file)
    seq = fm.get_first_sequence_in_fasta_file(seq_file).upper()
    # tau used to be accepted here but silently dropped, so callers always
    # got the callee's default value.
    # NOTE(review): assumes from_sequence_with_submat takes a `tau` keyword
    # that is not its second positional parameter - confirm.
    return cls.from_sequence_with_submat(seq, seq_file, tau=tau, **kwargs)
def from_sequence_file_to_one_hot(cls, seq_file, **kwargs):
    """Build an instance from a sequence file, with one hot encoding.

    The first sequence of the fasta file ``seq_file`` is read, upper-cased
    and handed to ``cls.from_sequence_to_one_hot`` together with
    ``seq_file`` and the keyword arguments.
    """
    fm.check_if_file_ok(seq_file)
    first_sequence = fm.get_first_sequence_in_fasta_file(seq_file)
    return cls.from_sequence_to_one_hot(first_sequence.upper(),
                                        seq_file=seq_file,
                                        **kwargs)