Example #1
0
def make_msa(seqbuddy, aligner, trimal=()):
    """
    Create a multiple sequence alignment
    :param seqbuddy: SeqBuddy object
    :param aligner: path to alignment program
    :param trimal: List of TrimAl thresholds to try
    :return: AlignBuddy object
    """
    thresholds = list(trimal) if trimal else ["clean"]

    # A single sequence is trivially "aligned" already; skip the aligner entirely
    if len(seqbuddy) == 1:
        return Alb.AlignBuddy(str(seqbuddy))

    alignment = Alb.generate_msa(Sb.make_copy(seqbuddy), aligner, quiet=True)
    orig_ave_len = Sb.ave_seq_length(seqbuddy)
    for thresh in thresholds:
        trimmed = Alb.trimal(Alb.make_copy(alignment), threshold=thresh)
        survivors = Sb.delete_small(Sb.clean_seq(Sb.SeqBuddy(str(trimmed))), 1)
        # Structured this way for unit test purposes
        # Accept the trimmed alignment only if no sequence was wiped out and
        # the average length was not reduced by more than half.
        if len(alignment.records()) == len(survivors) \
                and Sb.ave_seq_length(survivors) / orig_ave_len >= 0.5:
            alignment = trimmed
            break
    return alignment
Example #2
0
    # Cache a copy of the raw reference sequences if not already present
    if not os.path.isfile("%s%s.gb" % (ref_dir, ref_name)):
        shutil.copy(in_args.reference, "%s%s.gb" % (ref_dir, ref_name))

    # Protein translation of the reference CDS sequences
    if not os.path.isfile("%s%s_pep.gb" % (ref_dir, ref_name)):
        sb_pep = Sb.translate_cds(Sb.make_copy(seqbuddy))
        sb_pep.write("%s%s_pep.gb" % (ref_dir, ref_name))

    # RNA transcription of the reference DNA sequences
    if not os.path.isfile("%s%s_rna.gb" % (ref_dir, ref_name)):
        sb_rna = Sb.dna2rna(Sb.make_copy(seqbuddy))
        sb_rna.write("%s%s_rna.gb" % (ref_dir, ref_name))

    # Alignment of the reference sequences; reuse the file on disk when present
    if not os.path.isfile("%s%s_aln.gb" % (ref_dir, ref_name)):
        alignbuddy = Alb.generate_msa(Sb.make_copy(seqbuddy), "mafft")
        alignbuddy.write("%s%s_aln.gb" % (ref_dir, ref_name))
    else:
        alignbuddy = Alb.AlignBuddy("%s%s_aln.gb" % (ref_dir, ref_name))

    # Protein version of the alignment
    if not os.path.isfile("%s%s_pep_aln.gb" % (ref_dir, ref_name)):
        alb_pep = Alb.translate_cds(Alb.make_copy(alignbuddy))
        alb_pep.write("%s%s_pep_aln.gb" % (ref_dir, ref_name))

    # RNA version of the alignment
    if not os.path.isfile("%s%s_rna_aln.gb" % (ref_dir, ref_name)):
        alb_rna = Alb.dna2rna(Alb.make_copy(alignbuddy))
        alb_rna.write("%s%s_rna_aln.gb" % (ref_dir, ref_name))

    # Phylogenetic tree built from the alignment; reuse the file on disk when present
    if not os.path.isfile("%s%s_tree.nwk" % (ref_dir, ref_name)):
        phylobuddy = Pb.generate_tree(Alb.make_copy(alignbuddy), "fasttree")
        phylobuddy.write("%s%s_tree.nwk" % (ref_dir, ref_name))
    else:
        phylobuddy = Pb.PhyloBuddy("%s%s_tree.nwk" % (ref_dir, ref_name))
Example #3
0
    def start(self):
        """
        Main worker loop.

        Registers this Worker on disk (a check file whose deletion stops the
        loop), then repeatedly pulls jobs from the queue, builds/reads the
        alignment, scores all-by-all comparisons across multiple cores, and
        records results, until the check file disappears.
        """
        # Track wall-clock time split between idle and running states
        self.split_time = time.time()
        self.start_time = time.time()

        self.heartbeat.start()
        # Presence of this file keeps the loop alive; deleting it is the
        # sanctioned way to stop the worker (see message written below).
        self.worker_file = os.path.join(self.working_dir,
                                        "Worker_%s" % self.heartbeat.id)
        with open(self.worker_file, "w") as ofile:
            ofile.write("To terminate this Worker, simply delete this file.")

        # Scratch file accumulating per-comparison scores for the current job
        self.data_file = os.path.join(self.working_dir,
                                      ".Worker_%s.dat" % self.heartbeat.id)
        open(self.data_file, "w").close()

        helpers.dummy_func()

        self.last_heartbeat_from_master = time.time()
        self.printer.write("Starting Worker_%s" % self.heartbeat.id)
        self.printer.new_line(1)

        idle_countdown = 1
        while os.path.isfile(self.worker_file):
            # Percentage of lifetime spent idle
            # NOTE(review): assumes self.idle + self.running > 0 here — confirm initial values
            idle = round(100 * self.idle / (self.idle + self.running), 2)
            if not idle_countdown:
                self.printer.write("Idle %s%%" % idle)
                idle_countdown = 5

            # Make sure there are some masters still kicking around
            self.check_masters(idle)

            # Check for and clean up dead threads and orphaned jobs every twentieth(ish) time through
            rand_check = random()
            if rand_check > 0.95:
                self.clean_dead_threads()

            # Fetch a job from the queue
            data = self.fetch_queue_job()
            if data:
                full_name, psipred_dir, align_m, align_p, trimal, gap_open, gap_extend = data
                # Job names are either "<hash>" (full job) or
                # "<subjob_num>_<num_subjobs>_<hash>" (a spawned sub-job)
                subjob_num, num_subjobs, id_hash = [1, 1, full_name] if len(full_name.split("_")) == 1 \
                    else full_name.split("_")
                subjob_num = int(subjob_num)
                num_subjobs = int(num_subjobs)
                self.printer.write("Running %s" % full_name)
            else:
                time.sleep(
                    random() * self.idle_workers()
                )  # Pause for some time relative to num idle workers
                idle_countdown -= 1
                self.idle += time.time() - self.split_time
                self.split_time = time.time()
                continue

            try:
                idle_countdown = 1
                seqbuddy = Sb.SeqBuddy("%s/%s.seqs" % (self.output, id_hash),
                                       in_format="fasta")

                # Prepare alignment
                if len(seqbuddy) == 1:
                    raise ValueError("Queued job of size 1 encountered: %s" %
                                     id_hash)
                else:
                    if num_subjobs == 1:
                        # Full job: generate the alignment from scratch
                        self.printer.write("Creating MSA (%s seqs)" %
                                           len(seqbuddy))
                        alignment = Alb.generate_msa(Sb.make_copy(seqbuddy),
                                                     align_m,
                                                     params=align_p,
                                                     quiet=True)
                    else:
                        # Sub-job: the alignment was written earlier by the parent job
                        self.printer.write("Reading MSA (%s seqs)" %
                                           len(seqbuddy))
                        alignment = Alb.AlignBuddy(
                            os.path.join(self.output, "%s.aln" % id_hash))

                # Prepare psipred dataframes
                psipred_dfs = self.prepare_psipred_dfs(seqbuddy, psipred_dir)

                if num_subjobs == 1:  # This is starting a full job from scratch, not a sub-job
                    # Need to specify what columns the PsiPred files map to now that there are gaps.
                    psipred_dfs = rdmcl.update_psipred(alignment, psipred_dfs,
                                                       "msa")

                    # TrimAl
                    self.printer.write("Trimal (%s seqs)" % len(seqbuddy))
                    alignment = rdmcl.trimal(seqbuddy, trimal, alignment)

                    with helpers.ExclusiveConnect(os.path.join(
                            self.output, "write.lock"),
                                                  max_lock=0):
                        # Place these write commands in ExclusiveConnect to ensure a writing lock
                        if not os.path.isfile(
                                os.path.join(self.output, "%s.aln" % id_hash)):
                            alignment.write(os.path.join(
                                self.output, "%s.aln" % id_hash),
                                            out_format="fasta")

                    # Re-update PsiPred files now that some columns, possibly including non-gap characters, are removed
                    self.printer.write("Updating %s psipred dataframes" %
                                       len(seqbuddy))
                    psipred_dfs = rdmcl.update_psipred(alignment, psipred_dfs,
                                                       "trimal")

                # Prepare all-by-all list
                self.printer.write("Preparing all-by-all data")
                data_len, data = rdmcl.prepare_all_by_all(
                    seqbuddy, psipred_dfs, self.cpus)

                # Split jobs that are too large for this worker into sub-jobs,
                # or load the data slice corresponding to this sub-job
                if num_subjobs == 1 and data_len > self.cpus * self.job_size_coff:
                    data_len, data, subjob_num, num_subjobs = self.spawn_subjobs(
                        id_hash, data, psipred_dfs, gap_open, gap_extend)
                elif subjob_num > 1:
                    data_len, data = self.load_subjob(id_hash, subjob_num,
                                                      num_subjobs, psipred_dfs)

                # Launch multicore
                self.printer.write("Running all-by-all data (%s comparisons)" %
                                   data_len)
                # CSV header for the score output file
                with open(self.data_file, "w") as ofile:
                    ofile.write("seq1,seq2,subsmat,psi")

                br.run_multicore_function(data,
                                          rdmcl.mc_score_sequences,
                                          quiet=True,
                                          max_processes=self.cpus,
                                          func_args=[
                                              alignment, gap_open, gap_extend,
                                              self.data_file
                                          ])

                self.printer.write("Processing final results")
                self.process_final_results(id_hash, subjob_num, num_subjobs)

                self.running += time.time() - self.split_time
                self.split_time = time.time()

            except (OSError, FileNotFoundError, br.GuessError,
                    ValueError) as err:
                if num_subjobs == 1:
                    # A broken primary cluster is unrecoverable for this worker
                    self.terminate(
                        "something wrong with primary cluster %s\n%s" %
                        (full_name, err))
                else:
                    # Failed sub-job: drop it from the processing table so it
                    # can be rescheduled, then keep looping
                    with helpers.ExclusiveConnect(self.wrkdb_path) as cursor:
                        cursor.execute("DELETE FROM processing WHERE hash=?",
                                       (full_name, ))
                    continue

        # Broken out of while loop, clean up and terminate worker
        if os.path.isfile(self.data_file):
            os.remove(self.data_file)

        self.terminate("deleted check file")
Example #4
0
outdir = os.path.abspath(in_args.out_dir)
if not os.path.exists(outdir):
    print("Output directory not found, creating it...\n%s" % outdir, file=sys.stderr)
    os.mkdir(outdir)

for block in blocks:
    path = os.path.abspath(block.split("\n")[0])
    file_name = path.split("/")[-1].split(".")[:-1]
    file_name = "_".join(file_name)
    file_name = "_".join(file_name.split(" "))

    new_dir = os.path.abspath("%s/%s" % (outdir, file_name))
    if not os.path.exists(new_dir):
        os.mkdir(new_dir)

    alignbuddy = Alb.AlignBuddy(path)
    with open("%s/%s.phy" % (new_dir, file_name), 'w') as ofile:
        ofile.write(screw_formats_align(alignbuddy))

    # partitionfinder is super fussy about the PHYLIP format used. There needs to be exactly 10 characters in the IDs,
    # with at least 2 spaces before the start of the actual sequence. So annoying.
    Alb.hash_ids(alignbuddy, hash_length=8)
    with open("%s/%s_hashed.phy" % (new_dir, file_name), "w") as ofile:
        ofile.write(screw_formats_align(alignbuddy))

    # Make cfg file
    seq_ranges = block.split("\n")[1:]
    if seq_ranges[-1] == "":
        seq_ranges = seq_ranges[:-1]

    alphabet = Alb.guess_alphabet(alignbuddy)
def test_make_msa(hf, monkeypatch):
    """
    Exercise group_by_cluster.make_msa() across its main code paths:
    a normal multi-sequence alignment, the single-sequence shortcut, and
    the TrimAl threshold fallback logic (reject trims that delete a
    sequence or over-shorten the average length).
    """
    # Two-sequence alignment path
    seqbuddy = hf.get_data("cteno_panxs")
    seqbuddy.records = seqbuddy.records[:2]
    alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo")
    assert type(alb_obj) == Alb.AlignBuddy
    assert str(alb_obj) == """\
>Bab-PanxαA Be_abyssicola|m.8 and m.21|ML036514|937+ 2.
MLLLGSLGTIKNLSIFKDLSLDDWLDQMNRTFMFLLLCFMGTIVAVSQYTGKNISCNGFE
KFSDDFSQDYCWTQGLYTIKEAYDLPESQIPYPGIIPENVPACREHSLKNGGKIICPPPE
EIKPLTRARHLWYQWIPFYFWVIAPVFYLPYMFVKRMGLDRMKPLLKIMSDYYHCTTETP
SEEIIVKCADWVYNSIVDRLSEGSSWTSWRNRHGLGLAVLFSKLMYLGGSILVMMVTTLM
FQVGDFKTYGIEWLKQFPSDENYTTSVKHKLFPKMVACEIKRWGPSGLEEENGMCVLAPN
VIYQYIFLIMWFALAITICTNFFNIFFWVFKLTATRYTYSKLVATGHFSHKHPGWKFMYY
RIGTSGRVLLNIVAQNTNPIIFGAIMEKLTPSVIKHLRIGHVPGEYLTDPA
>Bab-PanxαB Be_abyssicola|m.19|ML47742|1063 2.
--MLDILSKFKGVTPFKGITIDDGWDQLNRSFMFVLLVVMGTTVTVRQYTGSVISCDGFK
KFGSTFAEDYCWTQGLYTVLEGYDQPSYNIPYPGLLPDELPACTPVKLKDGTRLKCPDAD
QLMSPTRISHLWYQWVPFYFWLAAAAFFMPYLLYKNFGMGDIKPLVRLLHNPVESDQ--E
LKKMTDKAATWLFYKFDLYMSEQSLVASLTRKHGLGLSMVFVKILYAAVSFCCFILTAEM
FSIGDFKTYGSKWIKKMRYEDTLATEEKDKLFPKMVACEVKRWGASGIEEEQGMCVLAPN
VINQYLFLILWFCLVFVMICNIVSIFVSLIKLLFTYGSYRRLLST-AFLRDDSAIKHMYF
NVGSSGRLILHVLANNTAPRVFEDILLTLAPKLIQRKLRGNGKAV------
"""

    # Single-sequence path: the "alignment" is just the sequence itself
    seqbuddy.records = [seqbuddy.records[0]]
    alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo")
    assert type(alb_obj) == Alb.AlignBuddy
    assert str(alb_obj) == """\
>Bab-PanxαA Be_abyssicola|m.8 and m.21|ML036514|937+ 2.
MLLLGSLGTIKNLSIFKDLSLDDWLDQMNRTFMFLLLCFMGTIVAVSQYTGKNISCNGFE
KFSDDFSQDYCWTQGLYTIKEAYDLPESQIPYPGIIPENVPACREHSLKNGGKIICPPPE
EIKPLTRARHLWYQWIPFYFWVIAPVFYLPYMFVKRMGLDRMKPLLKIMSDYYHCTTETP
SEEIIVKCADWVYNSIVDRLSEGSSWTSWRNRHGLGLAVLFSKLMYLGGSILVMMVTTLM
FQVGDFKTYGIEWLKQFPSDENYTTSVKHKLFPKMVACEIKRWGPSGLEEENGMCVLAPN
VIYQYIFLIMWFALAITICTNFFNIFFWVFKLTATRYTYSKLVATGHFSHKHPGWKFMYY
RIGTSGRVLLNIVAQNTNPIIFGAIMEKLTPSVIKHLRIGHVPGEYLTDPA
"""

    # Don't modify if any sequence is reduced to nothing
    align = Alb.AlignBuddy("""\
>A
MSTGTC-------
>B
M---TC-------
>C
M---TC---AILP
>D
-STP---YWAILP
""",
                           in_format="fasta")

    seqbuddy = Sb.SeqBuddy(Alb.make_copy(align).records(), in_format="fasta")
    seqbuddy = Sb.clean_seq(seqbuddy)

    # Stub out the aligner so make_msa sees our hand-crafted alignment
    monkeypatch.setattr(Alb, "generate_msa", lambda *_, **__: align)
    alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo", trimal=[0.3])
    assert str(alb_obj) == str(align)

    # Don't modify if average sequence length is reduced by more than half
    align = Alb.AlignBuddy("""\
>A
MSTGTC-------
>B
M---TC-------
>C
M---TC---AILP
>D
-STPTC-YWAILP
""",
                           in_format="fasta")

    seqbuddy = Sb.SeqBuddy(Alb.make_copy(align).records(), in_format="fasta")
    seqbuddy = Sb.clean_seq(seqbuddy)

    monkeypatch.setattr(Alb, "generate_msa", lambda *_, **__: align)
    alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo", trimal=[0.3])
    assert str(alb_obj) == str(align)

    # Remove some gaps
    alb_obj = group_by_cluster.make_msa(seqbuddy,
                                        "clustalo",
                                        trimal=[0.3, 0.55])
    assert str(alb_obj) == """\
Example #6
0
Cfu     6
Dgl     9
Edu     9
Hca     8
Hru     5
Hvu     14
Lcr     12
Lla     3
Mle     12
Oma     4
Pba     7
Tin     6
Vpa     7
'''
# Ctenophore pannexin test resources shared across the test suite
cteno_panxs = Sb.SeqBuddy("%s%sCteno_pannexins.fa" % (RESOURCE_PATH, SEP))
cteno_panxs_aln = Alb.AlignBuddy("%s%sCteno_pannexins_aln.fa" %
                                 (RESOURCE_PATH, SEP))
# Sorted record ids for deterministic comparisons
ids = sorted([rec.id for rec in cteno_panxs.records])
# Pre-computed pairwise similarity scores (headerless CSV); columns named below
sim_scores = pd.read_csv("%sCteno_pannexins_sim.scores" % RESOURCE_PATH,
                         index_col=False,
                         header=None)
sim_scores.columns = ["seq1", "seq2", "subsmat", "psi", "raw_score", "score"]


# #################################  -  Helper class  -  ################################## #
class HelperMethods(object):
    def __init__(self):
        self.sep = SEP
        self.resource_path = RESOURCE_PATH
        self._cteno_panxs = cteno_panxs
        self._cteno_panxs_aln = cteno_panxs_aln
        self._cteno_ids = ids