Esempio n. 1
0
def call_hhblits(input_file,
                 output_file,
                 database,
                 maxfilt=100000,
                 realign_max=100000,
                 B=100000,
                 Z=100000,
                 n=3,
                 e=0.001,
                 retry_hhblits_with_memory_limit_if_fail=False,
                 hhr_file=None,
                 **kwargs):
    """Call HH-blits with the arguments recommended for CCMpred.

    See https://github.com/soedinglab/CCMpred/wiki/FAQ

    Args:
        input_file: query file, passed to hhblits as ``-i``.
        output_file: A3M output path, passed as ``-oa3m`` (str or Path).
        database: database passed as ``-d``.
        maxfilt, realign_max, B, Z, n, e: forwarded to the hhblits options
            of the same name.
        retry_hhblits_with_memory_limit_if_fail: when True and the first run
            produced no output file, run again with ``-cpu 1 -maxmem 1``.
        hhr_file: path passed as ``-o``; defaults to ``output_file`` with its
            suffix replaced by ``.hhr``.
        **kwargs: accepted and ignored.

    Returns:
        The path of the .hhr file (``hhr_file`` as given, or the derived one).

    Raises:
        Exception: if the retry was requested and still produced no output.
    """
    # The existence checks below need a Path even when a str was passed in.
    output_path = pathlib.Path(output_file)
    if hhr_file is None:
        # with_suffix only touches the file name, so dots in parent
        # directories or a suffix-less name cannot corrupt the result.
        hhr_file = output_path.with_suffix(".hhr")
    print("calling hhblits on " + str(input_file) + " using " + str(database) +
          ", output will be available at " + str(output_file))
    fm.check_if_file_ok(input_file)
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    hhblits_args = [
        "hhblits",
        "-maxfilt", str(maxfilt),
        "-realign_max", str(realign_max),
        "-d", str(database),
        "-all",
        "-B", str(B),
        "-Z", str(Z),
        "-n", str(n),
        "-e", str(e),
        "-i", str(input_file),
        "-oa3m", str(output_file),
        "-o", str(hhr_file),
    ]
    print(" ".join(hhblits_args))
    subprocess.Popen(hhblits_args).wait()
    if not output_path.exists() and retry_hhblits_with_memory_limit_if_fail:
        print("HH-blits failed for some reason, trying with a memory limit")
        memory_friendly_args = hhblits_args + ["-cpu", "1", "-maxmem", "1"]
        subprocess.Popen(memory_friendly_args).wait()
        if not output_path.exists():
            raise Exception("HH-blits failed. Protein is probably too long ?")
    return hhr_file
Esempio n. 2
0
 def from_msgpack(cls, binary_file, **kwargs):
     """Initialize an MRF from a msgpack file written by CCMpredPy.

     Args:
         binary_file: path of the msgpack file.
         **kwargs: forwarded to the class constructor; ``name`` defaults to
             the file path with '/' replaced by '-' and a leading '-'
             removed.

     Returns:
         The new instance, with ``binary_file`` recorded on it.
     """
     # Validate the path before reading it; checking afterwards could never
     # catch anything because open() would already have failed.
     fm.check_if_file_ok(binary_file)
     with open(str(binary_file), 'rb') as data_file:
         df = msgpack.unpackb(data_file.read())
     # df is a dictionary with byte-string keys. The ones read below are:
     #     b'ncol'     : number of columns
     #     b'x_single' : single-site values, reshaped to (ncol, 20)
     #     b'x_pair'   : mapping whose values hold b'i', b'j' and b'x'
     #                   (b'x' is reshaped to (21, 21))
     print("getting Potts model from " + str(binary_file))
     ncol = df[b'ncol']
     v_20 = np.array(df[b'x_single']).reshape((ncol, 20))
     # Fields get a 21st column that stays at zero.
     v = np.zeros((ncol, 21))
     v[:, :-1] = v_20
     w = np.zeros((ncol, ncol, 21, 21))
     for p in df[b'x_pair'].values():
         i = p[b'i']
         j = p[b'j']
         mat = np.array(p[b'x']).reshape((21, 21))
         # Fill both (i, j) and (j, i) so the coupling tensor is symmetric.
         w[i, j, :, :] = mat
         w[j, i, :, :] = mat.T
     if 'name' not in kwargs:
         kwargs['name'] = str(binary_file).replace('/', '-')
         if kwargs['name'].startswith('-'):
             kwargs['name'] = kwargs['name'][1:]
     mrf = cls(v, w, **kwargs)
     mrf.binary_file = binary_file
     return mrf
Esempio n. 3
0
def call_hhfilter(input_file, output_file, hhid):
    """Run hhfilter on an alignment.

    Args:
        input_file: alignment passed to hhfilter as ``-i``.
        output_file: destination passed as ``-o`` (str or Path).
        hhid: value passed as ``-id``.

    Raises:
        Exception: if the output file does not exist after the run.
    """
    import pathlib

    print("calling hhfilter " + str(hhid) + " on " + str(input_file))
    fm.check_if_file_ok(input_file)
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    hhfilter_args = [
        "hhfilter",
        "-i", str(input_file),
        "-o", str(output_file),
        "-id", str(hhid),
    ]
    subprocess.Popen(hhfilter_args).wait()
    # Coerce so the check also works when output_file was given as a str.
    if not pathlib.Path(output_file).is_file():
        raise Exception("HHfilter failed")
Esempio n. 4
0
def call_muscle_profile(msa_file, seq_file, output_file):
    """Run MUSCLE in profile mode on an MSA file and a sequence file.

    Args:
        msa_file: passed to MUSCLE as ``in1``.
        seq_file: passed to MUSCLE as ``in2``; checked with
            ``fm.check_if_file_ok`` first.
        output_file: passed to MUSCLE as ``out``.
    """
    fm.check_if_file_ok(seq_file)
    profile_options = {
        'profile': True,
        'in1': str(msa_file),
        'in2': str(seq_file),
        'out': str(output_file),
        'gapopen': -0.1,
    }
    run_muscle = MuscleCommandline(**profile_options)
    # The captured output streams are not used.
    _out, _err = run_muscle()
Esempio n. 5
0
def call_mafft(input_file, output_file=None):
    """Align the sequences of a file with MAFFT.

    Args:
        input_file: file handed to mafft.
        output_file: where the alignment is written (str or Path); defaults
            to the input path with its suffix replaced by ``_mafft.fasta``.

    Returns:
        The path of the alignment file, as a ``pathlib.Path``.

    Raises:
        Exception: if mafft cannot be run, exits with a non-zero status or
            leaves no output file.
    """
    print("calling MAFFT")
    fm.check_if_file_ok(input_file)
    if output_file is None:
        input_path = pathlib.Path(input_file)
        # stem/with_name only look at the file name, so dots in parent
        # directories cannot corrupt the derived path.
        output_path = input_path.with_name(input_path.stem + "_mafft.fasta")
    else:
        output_path = pathlib.Path(output_file)
    try:
        # mafft writes the alignment on stdout; redirect it without a shell.
        with output_path.open('w') as aligned:
            return_code = subprocess.Popen(["mafft", str(input_file)],
                                           stdout=aligned).wait()
    except OSError as err:
        raise Exception("MAFFT failed") from err
    # The redirection creates the output file even when mafft fails, so the
    # exit status has to be checked as well as the file's existence.
    if return_code != 0 or not output_path.is_file():
        raise Exception("MAFFT failed")
    return output_path
Esempio n. 6
0
def infer_insertion_penalties_in_file(seed_a3m_file, seed_length, output_file):
    """Infer insertion penalties from a seed MSA with the DCAbuild Julia script.

    Insertions in @seed_a3m_file are indicated as lower case letters with
    respect to an MSA of length @seed_length. The result is written to
    @output_file as a .tsv file in DCAbuild format, then read back, adjusted
    so that external insertions are 0, and written again.
    """
    fm.check_if_file_ok(seed_a3m_file)
    infer_with_dcabuild = Main.include(julia_script_insertions_file)
    infer_with_dcabuild(str(seed_a3m_file), seed_length, str(output_file))
    penalties = get_insertion_penalties_from_file(output_file)
    # External insertions are set to 0: the first entry is overwritten and
    # one extra trailing entry is appended.
    for penalty_kind in ('open', 'extend'):
        values = penalties[penalty_kind]
        values[0] = 0
        values.append(0)
    write_insertion_penalties_in_file(penalties, output_file)
Esempio n. 7
0
def call_reformat(input_file, output_file):
    """Convert an A3M file to FASTA with Soeding's reformat.pl.

    Args:
        input_file: file to convert.
        output_file: destination file (str or Path).

    Raises:
        Exception: if the output file does not exist after the run.
    """
    import pathlib

    print("will reformat " + str(input_file) +
          " thanks to Soeding's reformat.pl to " + str(output_file))
    fm.check_if_file_ok(input_file)
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    reformat_args = [
        "reformat.pl", "a3m", "fas",
        str(input_file),
        str(output_file),
        "-r",
    ]
    print(" ".join(reformat_args))
    subprocess.Popen(reformat_args,
                     stdout=subprocess.DEVNULL,
                     stderr=subprocess.DEVNULL).wait()
    # Coerce so the check also works when output_file was given as a str.
    if not pathlib.Path(output_file).is_file():
        raise Exception("Reformat failed")
Esempio n. 8
0
def get_insertion_penalties_from_file(insertion_penalties_file):
    """Read an insertion penalties file in DCAbuild .tsv format.

    Returns a dictionary {"open": [gap open penalties],
    "extend": [gap extend penalties]}; each row of the file contributes its
    first column to "open" and its second column to "extend".
    """
    fm.check_if_file_ok(insertion_penalties_file)
    gap_open = []
    gap_extend = []

    with open(str(insertion_penalties_file), 'r') as handle:
        for fields in csv.reader(handle, delimiter='\t'):
            gap_open.append(float(fields[0]))
            gap_extend.append(float(fields[1]))

    return {'open': gap_open, 'extend': gap_extend}
Esempio n. 9
0
def call_muscle(input_file, output_file=None):
    """Align the sequences of a file with MUSCLE.

    Args:
        input_file: file passed to MUSCLE as ``input``.
        output_file: file passed to MUSCLE as ``out``; when None, it is the
            input path up to its last '.' followed by ``_muscle.fasta``.

    Returns:
        The output file path.

    Raises:
        Exception: if the output file does not exist after the run.
    """
    print("Calling Muscle")
    if output_file is None:
        stem = '.'.join(str(input_file).split('.')[:-1])
        output_file = pathlib.Path(stem + "_muscle.fasta")
    muscle_command = MuscleCommandline(input=str(input_file),
                                       out=str(output_file))
    print(muscle_command)
    fm.check_if_file_ok(input_file)
    # The captured output streams are not used.
    _out, _err = muscle_command()
    if output_file.is_file():
        return output_file
    raise Exception("Muscle failed")
Esempio n. 10
0
def call_trimal(input_file,
                output_file,
                trimal_gt,
                cons,
                colnumbering_file=None):
    """Trim an alignment with trimAl.

    Args:
        input_file: alignment passed to trimal as ``-in``.
        output_file: destination passed as ``-out`` (str or Path).
        trimal_gt: value passed as ``-gt``.
        cons: value passed as ``-cons``.
        colnumbering_file: when given, trimal is also run with
            ``-colnumbering`` and its standard output is written there.

    Raises:
        Exception: if the output file does not exist after the run.
    """
    import pathlib

    print("calling trimal gt " + str(trimal_gt) + " cons " + str(cons) +
          " on " + str(input_file))
    fm.check_if_file_ok(input_file)
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    trimal_args = [
        "trimal",
        "-in", str(input_file),
        "-out", str(output_file),
        "-gt", str(trimal_gt),
        "-cons", str(cons),
    ]
    if colnumbering_file is None:
        subprocess.Popen(trimal_args).wait()
    else:
        # Replaces the shell redirection: standard output goes to the file.
        with open(str(colnumbering_file), 'w') as colnumbering:
            subprocess.Popen(trimal_args + ["-colnumbering"],
                             stdout=colnumbering).wait()
    # Coerce so the check also works when output_file was given as a str.
    if not pathlib.Path(output_file).is_file():
        raise Exception("Trimal failed")
Esempio n. 11
0
 def from_training_set(cls,
                       aln_file,
                       binary_file,
                       write_readme=True,
                       readme_file=None,
                       **kwargs):
     """Initialize a Potts model from a training MSA file by running ccmpred.

     Args:
         aln_file: training alignment handed to ccmpred.
         binary_file: output path passed to ccmpred as ``-b``.
         write_readme: when True, the command line is dumped as JSON into
             ``readme_file``.
         readme_file: README path (str or Path); defaults to ``binary_file``
             without its ``.mrf`` suffix, followed by ``_mrf_README.txt``.
         **kwargs: each key, with '_' replaced by '-', that appears in
             POSSIBLE_CCMPRED_OPTIONS becomes a ``--option``. A bool value
             adds a bare flag only when True; any other value is passed as
             the option's argument. Other keys are ignored.

     Returns:
         The model loaded from ``binary_file``, with ``training_set`` set to
         the alignment path.

     Raises:
         Exception: if ccmpred did not produce ``binary_file``.
     """
     fm.check_if_file_ok(aln_file)
     # An argument list (no shell) keeps paths and option values containing
     # spaces or shell metacharacters intact.
     ccmpred_args = ["ccmpred", str(aln_file), "-b", str(binary_file)]
     for key_arg, value in kwargs.items():
         arg_ccm = key_arg.replace('_', '-')
         if arg_ccm not in POSSIBLE_CCMPRED_OPTIONS:
             continue
         if isinstance(value, bool):
             if value:
                 ccmpred_args.append("--" + arg_ccm)
         else:
             ccmpred_args.extend(["--" + arg_ccm, str(value)])
     call = " ".join(ccmpred_args)
     if write_readme:
         if readme_file is None:
             readme_stem = str(binary_file)
             # Strip the suffix only when it really is ".mrf"; chopping four
             # characters blindly would mangle any other file name.
             if readme_stem.endswith(".mrf"):
                 readme_stem = readme_stem[:-len(".mrf")]
             readme_file = readme_stem + "_mrf_README.txt"
         with pathlib.Path(readme_file).open(mode='w') as f:
             json.dump(call, f, default=str)
     print(call)
     subprocess.Popen(ccmpred_args).wait()
     if not os.path.exists(str(binary_file)):
         raise Exception(
             "CCMpredPy wasn't able to infer the MRF. Protein is probably too long ?"
         )
     mrf = cls.from_msgpack(binary_file)
     mrf.training_set = pathlib.Path(aln_file)
     return mrf
Esempio n. 12
0
def main(args=None):
    """Command-line entry point: draw a circos image for a Potts folder.

    Args:
        args: list of command-line arguments; defaults to ``sys.argv[1:]``
            read at call time.

    The parsed options are forwarded as keyword arguments to
    ``create_circos_from_potts_object_and_pdb_chain``. When ``--pdb_file`` is
    not given, the file is fetched with ``fm.fetch_pdb_file`` from
    ``--pdb_id``; giving neither is reported as a usage error.
    """
    # Resolve the default here: a default of sys.argv[1:] in the signature
    # would be frozen when the module is imported.
    if args is None:
        args = sys.argv[1:]

    parser = argparse.ArgumentParser()
    parser.add_argument('-f',
                        '--potts_folder',
                        help="Feature folder",
                        type=pathlib.Path,
                        required=True)
    parser.add_argument('--pdb_file',
                        help="PDB file",
                        type=pathlib.Path,
                        default=None)
    parser.add_argument('-i', '--pdb_id', help="PDB id", required=False)
    parser.add_argument('-cid',
                        '--chain_id',
                        help="PDB chain id (default : A)",
                        default='A')
    parser.add_argument(
        '-sep',
        '--coupling_sep_min',
        help="Min. nb residues between members of a coupling (default : 3)",
        type=int,
        default=3)
    parser.add_argument('-n',
                        '--top',
                        help="Nb of couplings displayed (default : 20)",
                        type=int,
                        default=20)
    parser.add_argument(
        '--auto_top',
        help=
        "Nb couplings displayed = elbow of the score curve (default : False)",
        default=False,
        action='store_true')
    parser.add_argument('--wij_cutoff',
                        help="||wij|| <= wij_cutoff (default : no cutoff)",
                        default=None,
                        type=float)
    parser.add_argument(
        '-num',
        '--numbering_type',
        help=
        "Use the same numbering type around the circle as sequence (sequence) or PDB structure (pdb) (default : numbering as sequence)",
        default='sequence')
    parser.add_argument(
        '-o',
        '--output_circos_image',
        help=
        "Output circos image (default : [potts_folder]/circos_output/circos.png)",
        type=pathlib.Path,
        default=None)
    parser.add_argument('-t',
                        '--thickness',
                        help="Couplings thickness factor (default : 1)",
                        type=float,
                        default=1)

    args = vars(parser.parse_args(args))

    # Without a PDB file the id is needed to fetch one; fail with a usage
    # message instead of a TypeError on str + None further down.
    if args['pdb_file'] is None and args['pdb_id'] is None:
        parser.error("one of --pdb_file or -i/--pdb_id is required")

    fm.check_if_dir_ok(args["potts_folder"])

    potts_object = Potts_Object.from_folder(args['potts_folder'])
    if args['pdb_file'] is None:
        name = str(potts_object.folder) + '/' + args['pdb_id']
        args['pdb_file'] = fm.fetch_pdb_file(args['pdb_id'], name)
    fm.check_if_file_ok(args["pdb_file"])
    create_circos_from_potts_object_and_pdb_chain(potts_object, **args)
Esempio n. 13
0
 def test_no_file(self):
     """check_if_file_ok is expected to raise for a nonexistent file name."""
     missing_path = "patate.fasta"
     with self.assertRaises(Exception):
         fm.check_if_file_ok(missing_path)
Esempio n. 14
0
 def test_file_exists(self):
     """check_if_file_ok is expected not to raise for an existing file."""
     fm.check_if_file_ok(SEQ_1CC8)
Esempio n. 15
0
 def from_sequence_file_with_submat(cls, seq_file, tau=0.5, **kwargs):
     """Build an instance from a FASTA file (substitution matrix pseudocounts).

     The first sequence of ``seq_file`` is upper-cased and handed, together
     with ``seq_file`` and ``**kwargs``, to ``cls.from_sequence_with_submat``.
     """
     fm.check_if_file_ok(seq_file)
     first_sequence = fm.get_first_sequence_in_fasta_file(seq_file)
     # NOTE(review): tau is accepted but never forwarded to
     # from_sequence_with_submat, so a caller's value has no effect here.
     # Confirm against that method's signature whether it should be passed.
     return cls.from_sequence_with_submat(first_sequence.upper(), seq_file,
                                          **kwargs)
Esempio n. 16
0
 def from_sequence_file_to_one_hot(cls, seq_file, **kwargs):
     """Build an instance from a FASTA file (one hot encoding).

     The first sequence of ``seq_file`` is upper-cased and handed, together
     with ``seq_file`` and ``**kwargs``, to ``cls.from_sequence_to_one_hot``.
     """
     fm.check_if_file_ok(seq_file)
     first_sequence = fm.get_first_sequence_in_fasta_file(seq_file)
     return cls.from_sequence_to_one_hot(first_sequence.upper(),
                                         seq_file=seq_file,
                                         **kwargs)