Beispiel #1
0
 def get_data(self, data):
     """
     Hand out one of the cached test resources by name.

     The cached objects themselves are never returned: the SeqBuddy/AlignBuddy
     resources are passed through their module's ``make_copy`` and the plain
     containers through ``deepcopy``. The two ``ss2_*`` resources are rebuilt
     on every call from the records of the cached ``_cteno_panxs`` object.

     :param data: One of "cteno_panxs", "cteno_panxs_aln", "cteno_ids",
                  "cteno_sim_scores", "ss2_dfs" or "ss2_paths"
     :return: Copy of the requested resource. "ss2_paths" gives an OrderedDict
              of record id -> path of its .ss2 file; "ss2_dfs" gives an
              OrderedDict of record id -> DataFrame read from that file
     :raises AttributeError: If ``data`` is not one of the names above
     """
     if data == "cteno_panxs":
         return Sb.make_copy(self._cteno_panxs)
     if data == "cteno_panxs_aln":
         return Alb.make_copy(self._cteno_panxs_aln)
     if data == "cteno_ids":
         return deepcopy(self._cteno_ids)
     if data == "cteno_sim_scores":
         return deepcopy(self._cteno_sim_scores)

     if data in ("ss2_dfs", "ss2_paths"):
         # Imported here so the method does not lean on another module
         # happening to re-export OrderedDict.
         from collections import OrderedDict

         # Walk the instance's own records; a bare `cteno_panxs` is not a name
         # this method defines.
         ss2_paths = OrderedDict()
         for rec in self._cteno_panxs.records:
             ss2_paths[rec.id] = os.path.join(self.resource_path, "psi_pred",
                                              "%s.ss2" % rec.id)
         if data == "ss2_paths":
             return ss2_paths

         ss2_dfs = OrderedDict()
         for rec_id, path in ss2_paths.items():
             # sep=r"\s+" is the documented equivalent of the deprecated
             # delim_whitespace=True
             ss2_df = pd.read_csv(path, comment="#", header=None, sep=r"\s+")
             ss2_df.columns = [
                 "indx", "aa", "ss", "coil_prob", "helix_prob", "sheet_prob"
             ]
             ss2_dfs[rec_id] = ss2_df
         return ss2_dfs

     raise AttributeError("Unknown data type: %s" % data)
Beispiel #2
0
def main():
    """
    Command-line entry point: keep only the longest isoform of each gene model.

    Records are grouped by their ID with the last dot-separated field (the
    ``.t#`` isoform suffix) removed, and the longest record of each group is
    kept; on a tie the first one encountered wins. The surviving records are
    printed, or written back over the input file when -i/--in_place is given.
    """
    def fmt(prog):
        return br.CustomHelpFormatter(prog)

    parser = argparse.ArgumentParser(prog="largest_isoform",
                                     formatter_class=fmt,
                                     add_help=False,
                                     usage=argparse.SUPPRESS,
                                     description='''\
\033[1mLargest Isoform\033[m
  Select only the largest isoform from Augustus protein models  

  Pass in a file containing protein sequences with the .t# suffix at the end
  of each sequence ID.
  
\033[1mUsage\033[m:
  largest_isoform.py "/path/to/sequences"
''')

    # Positional
    parser.add_argument("sequences", help="Specify a sequence file")
    # Optional
    parser.add_argument(
        "-i",
        "--in_place",
        action="store_true",
        help="Overwrite original file. Be sure you want to do this!!")
    in_args = parser.parse_args()

    seqs = Sb.SeqBuddy(in_args.sequences)
    # The single pass below only compares neighbouring records, so it relies
    # on the isoforms of one gene being adjacent after ordering by ID.
    seqs = Sb.order_ids(seqs)

    final_records = []
    if seqs.records:  # An empty input simply yields an empty output
        iso_id = ".".join(seqs.records[0].id.split(".")[:-1])
        max_seq = seqs.records[0]
        for rec in seqs.records:
            cur_id = ".".join(rec.id.split(".")[:-1])
            if cur_id != iso_id:
                # A new gene starts here: bank the winner of the previous one
                iso_id = cur_id
                final_records.append(max_seq)
                max_seq = rec
            elif len(max_seq.seq) < len(rec.seq):
                max_seq = rec
        # The loop only banks a winner when the *next* gene begins, so the
        # last gene's winner has to be added explicitly.
        final_records.append(max_seq)

    seqs.records = final_records
    if in_args.in_place:
        seqs.write(in_args.sequences)
    else:
        print(seqs)
Beispiel #3
0
def test_start_worker_1seq_error(hf, capsys, monkeypatch):
    """Worker.start() is expected to exit and report a queued job that holds one sequence."""
    # Work on private copies of both databases
    sandbox = br.TempDir()
    for db_name in ("work_db.sqlite", "heartbeat_db.sqlite"):
        sandbox.copy_to("%s%s" % (hf.resource_path, db_name))
    worker = launch_worker.Worker(sandbox.path, heartrate=1, max_wait=20)

    # Register job 'foo' in both the waiting and the queue tables
    psipred_dir = os.path.join(hf.resource_path, "psi_pred")
    connection = sqlite3.connect(os.path.join(sandbox.path, "work_db.sqlite"))
    cursor = connection.cursor()
    cursor.execute("INSERT INTO waiting (hash, master_id) VALUES ('foo', 2)")
    cursor.execute(
        "INSERT INTO queue (hash, psi_pred_dir, align_m, align_p, trimal, gap_open, gap_extend) "
        "VALUES ('foo', ?, 'clustalo', '', 'gappyout 50 90 clean', 0, 0)",
        (psipred_dir, ))
    connection.commit()

    # Only a single sequence present
    lone_record = Sb.pull_recs(hf.get_data("cteno_panxs"), "Oma-PanxαC")
    lone_record.write(os.path.join(worker.output, "foo.seqs"))

    # Bypass the master check inside the worker loop
    monkeypatch.setattr(launch_worker.Worker, "check_masters",
                        lambda *_, **__: True)
    with pytest.raises(SystemExit):
        worker.start()

    stdout = capsys.readouterr()[0]
    assert "Queued job of size 1 encountered: foo" in stdout
    connection.close()
Beispiel #4
0
def test_start_worker_1seq_error(hf, capsys, monkeypatch):
    """A queued job whose .seqs file holds a single record should make start() raise SystemExit."""
    scratch = br.TempDir()
    scratch.copy_to("%swork_db.sqlite" % hf.resource_path)
    scratch.copy_to("%sheartbeat_db.sqlite" % hf.resource_path)
    worker = launch_worker.Worker(scratch.path, heartrate=1, max_wait=20)

    # SQL used to register job 'foo' as waiting and queued
    insert_waiting = "INSERT INTO waiting (hash, master_id) VALUES ('foo', 2)"
    insert_queue = ("INSERT INTO queue (hash, psi_pred_dir, align_m, align_p, trimal, gap_open, gap_extend) "
                    "VALUES ('foo', ?, 'clustalo', '', 'gappyout 50 90 clean', 0, 0)")

    work_db = sqlite3.connect(os.path.join(scratch.path, "work_db.sqlite"))
    db_cursor = work_db.cursor()
    db_cursor.execute(insert_waiting)
    db_cursor.execute(insert_queue, (os.path.join(hf.resource_path, "psi_pred"),))
    work_db.commit()

    # Only a single sequence present
    single_seq = hf.get_data("cteno_panxs")
    single_seq = Sb.pull_recs(single_seq, "Oma-PanxαC")
    seqs_path = os.path.join(worker.output, "foo.seqs")
    single_seq.write(seqs_path)

    # Bypass the master check inside the worker loop
    monkeypatch.setattr(launch_worker.Worker, "check_masters", lambda *_, **__: True)
    with pytest.raises(SystemExit):
        worker.start()

    stdout, _ = capsys.readouterr()
    assert "Queued job of size 1 encountered: foo" in stdout
    work_db.close()
def test_main_strip_taxa(monkeypatch, hf, capsys):
    """Run group_by_cluster.main() with -s on sequences whose IDs were renamed, and check the output hash."""
    tmp_file = br.TempFile()
    seqbuddy = Sb.SeqBuddy(os.path.join(hf.resource_path,
                                        "Cteno_pannexins.fa"))
    # Raw string: "\-" is not a valid str escape, so the non-raw form only
    # works by accident and triggers an invalid-escape warning.
    seqbuddy = Sb.rename(seqbuddy, r"^.*?\-")
    tmp_file.write(str(seqbuddy))
    argv = [
        'rdmcl.py',
        os.path.join(hf.resource_path, "final_clusters.txt"), tmp_file.path,
        "-s"
    ]
    monkeypatch.setattr(sys, "argv", argv)
    group_by_cluster.main()
    out, err = capsys.readouterr()
    # print() is only evaluated when the assertion fails; it dumps the
    # captured output for debugging.
    assert hf.string2hash(out) == "3020ea067affd21c77b7446f35689a6a", print(
        out)
def test_main_strip_taxa(monkeypatch, hf, capsys):
    """Run group_by_cluster.main() with -s on sequences whose IDs were renamed, and check the output hash."""
    tmp_file = br.TempFile()
    seqbuddy = Sb.SeqBuddy(os.path.join(hf.resource_path, "Cteno_pannexins.fa"))
    # Raw string: "\-" is not a valid str escape, so the non-raw form only
    # works by accident and triggers an invalid-escape warning.
    seqbuddy = Sb.rename(seqbuddy, r"^.*?\-")
    tmp_file.write(str(seqbuddy))
    argv = ['rdmcl.py', os.path.join(hf.resource_path, "final_clusters.txt"),
            tmp_file.path, "-s"]
    monkeypatch.setattr(sys, "argv", argv)
    group_by_cluster.main()
    out, err = capsys.readouterr()
    # print() is only evaluated when the assertion fails; it dumps the
    # captured output for debugging.
    assert hf.string2hash(out) == "3020ea067affd21c77b7446f35689a6a", print(out)
Beispiel #7
0
def make_msa(seqbuddy, aligner, trimal=()):
    """
    Align a set of sequences and trim the result with the first acceptable TrimAl threshold
    :param seqbuddy: SeqBuddy object holding the sequences to align
    :param aligner: path to alignment program
    :param trimal: TrimAl thresholds to try, in order (falls back to ["clean"] when empty)
    :return: AlignBuddy object (left untrimmed when no threshold is acceptable)
    """
    thresholds = trimal or ["clean"]

    # A lone sequence is wrapped as-is; no aligner or trimming is run
    if len(seqbuddy) == 1:
        return Alb.AlignBuddy(str(seqbuddy))

    alignment = Alb.generate_msa(Sb.make_copy(seqbuddy), aligner, quiet=True)
    original_ave_length = Sb.ave_seq_length(seqbuddy)

    for threshold in thresholds:
        trimmed = Alb.trimal(Alb.make_copy(alignment), threshold=threshold)
        # Clean the trimmed alignment back into plain sequences and drop any
        # that fall below the size cutoff passed to delete_small
        survivors = Sb.delete_small(Sb.clean_seq(Sb.SeqBuddy(str(trimmed))), 1)

        # Reject the threshold if a sequence was lost entirely...
        if len(alignment.records()) != len(survivors):
            continue
        # ...or if the average length fell below half of the original
        if Sb.ave_seq_length(survivors) / original_ave_length < 0.5:
            continue

        alignment = trimmed
        break
    return alignment
Beispiel #8
0
def test_start_worker_fetch_queue(hf, capsys, monkeypatch):
    """Run Worker.start() on a queued four-sequence job and check every progress message it prints."""
    sandbox = br.TempDir()
    for db_name in ("work_db.sqlite", "heartbeat_db.sqlite"):
        sandbox.copy_to("%s%s" % (hf.resource_path, db_name))

    def kill(*args):
        # Stand-in for process_final_results: deleting the worker's check
        # file stops the worker's main loop.
        os.remove(args[0].worker_file)

    monkeypatch.setattr(launch_worker.Worker, "process_final_results", kill)

    worker = launch_worker.Worker(sandbox.path, heartrate=1, max_wait=1)

    # Register job 'foo' in both the waiting and the queue tables
    connection = sqlite3.connect(os.path.join(sandbox.path, "work_db.sqlite"))
    cursor = connection.cursor()
    cursor.execute("INSERT INTO waiting (hash, master_id) VALUES ('foo', 2)")
    cursor.execute(
        "INSERT INTO queue (hash, psi_pred_dir, align_m, align_p, trimal, gap_open, gap_extend) "
        "VALUES ('foo', ?, 'clustalo', '', 'gappyout 50 90 clean', 0, 0)",
        (os.path.join(hf.resource_path, "psi_pred"), ))
    connection.commit()

    # Only 4 records, which means 6 comparisons
    oma_seqs = Sb.pull_recs(hf.get_data("cteno_panxs"), "Oma")
    oma_seqs.write(os.path.join(worker.output, "foo.seqs"))

    with pytest.raises(SystemExit):
        worker.start()

    stdout = capsys.readouterr()[0]
    assert "Running foo" in stdout
    assert "Creating MSA (4 seqs)" in stdout
    assert "Trimal (4 seqs)" in stdout
    assert os.path.isfile(os.path.join(worker.output, "foo.aln"))
    assert "Updating 4 psipred dataframes" in stdout
    assert "Preparing all-by-all data" in stdout
    assert "Running all-by-all data (6 comparisons)" in stdout
    assert "Processing final results" in stdout
    connection.close()
Beispiel #9
0
def test_start_worker_fetch_queue(hf, capsys, monkeypatch):
    """Feed Worker.start() one queued job of four sequences and verify the messages it prints along the way."""
    scratch = br.TempDir()
    scratch.copy_to("%swork_db.sqlite" % hf.resource_path)
    scratch.copy_to("%sheartbeat_db.sqlite" % hf.resource_path)

    def kill(*args):
        # Replacement for process_final_results: removing the check file
        # ends the worker's main loop.
        worker_obj = args[0]
        os.remove(worker_obj.worker_file)

    monkeypatch.setattr(launch_worker.Worker, "process_final_results", kill)

    worker = launch_worker.Worker(scratch.path, heartrate=1, max_wait=1)

    # SQL used to register job 'foo' as waiting and queued
    insert_waiting = "INSERT INTO waiting (hash, master_id) VALUES ('foo', 2)"
    insert_queue = ("INSERT INTO queue (hash, psi_pred_dir, align_m, align_p, trimal, gap_open, gap_extend) "
                    "VALUES ('foo', ?, 'clustalo', '', 'gappyout 50 90 clean', 0, 0)")

    work_db = sqlite3.connect(os.path.join(scratch.path, "work_db.sqlite"))
    db_cursor = work_db.cursor()
    db_cursor.execute(insert_waiting)
    db_cursor.execute(insert_queue, (os.path.join(hf.resource_path, "psi_pred"),))
    work_db.commit()

    # Only 4 records, which means 6 comparisons
    four_seqs = hf.get_data("cteno_panxs")
    four_seqs = Sb.pull_recs(four_seqs, "Oma")
    four_seqs.write(os.path.join(worker.output, "foo.seqs"))

    with pytest.raises(SystemExit):
        worker.start()

    stdout, _ = capsys.readouterr()
    expected_messages = ["Running foo", "Creating MSA (4 seqs)", "Trimal (4 seqs)"]
    for message in expected_messages:
        assert message in stdout
    assert os.path.isfile(os.path.join(worker.output, "foo.aln"))
    expected_messages = ["Updating 4 psipred dataframes", "Preparing all-by-all data",
                         "Running all-by-all data (6 comparisons)", "Processing final results"]
    for message in expected_messages:
        assert message in stdout
    work_db.close()
def mc_blast(records_list, args):
    """
    Run blastp on one batch of records and append the tabular hits to a shared output file
    :param records_list: records to write out as the blastp query
    :param args: two-item sequence of (database, outfile)
    :return: None
    :raises OSError: if the blastp executable cannot be launched
    """
    database, outfile = args

    # Write this batch to its own temporary FASTA file to serve as the query
    temp_file = br.TempFile()
    sub_input_seqs = sb.SeqBuddy(records_list, out_format='fasta')
    sub_input_seqs.write(temp_file.path)

    # Pass the command as an argument list with no shell, so paths containing
    # spaces or shell metacharacters reach blastp verbatim.
    blast_cmd = [
        "blastp",
        "-query", str(temp_file.path),
        "-db", str(database),
        "-num_threads", "3",
        "-max_target_seqs", "1",
        "-outfmt", "6",
    ]
    # Only stdout is piped; stderr is inherited from this process
    stdout, _ = Popen(blast_cmd, stdout=PIPE).communicate()
    output = stdout.decode()

    # write to file while locked so no other processes can write at the same time
    with lock:
        with open(outfile, 'a') as ofile:
            ofile.write(output)
    return
Beispiel #11
0
    num_duplications_range = num_duplications_range if len(num_duplications_range) == 1 \
        else list(range(num_duplications_range[0], num_duplications_range[1] + 1))

    models = in_args.models
    alphas = make_range_from_inargs(in_args.alpha)

    category_range = sorted(in_args.categories)
    category_range = category_range if len(category_range) == 1 \
        else list(range(num_drops_range[0], num_drops_range[1] + 1))

    seed_file = in_args.seed_file
    assert os.path.exists(seed_file)

    with open(seed_file, 'r') as seed_io:
        seed_seq = seed_io.read()
    seed_seq = str(Sb.clean_seq(Sb.SeqBuddy(seed_seq, out_format='raw')))
    seed_seq = seed_seq.upper().strip()

    # ugly-ass loop
    arguments = []
    for grp in group_range:
        for tax in taxa_range:
            for mdl in models:
                for gbr in gene_branch_len:
                    for gstdv in gene_branch_stdev:
                        for sbr in species_branch_len:
                            for sstdv in species_branch_stdev:
                                for alp in alphas:
                                    for cat in category_range:
                                        for drp in drop_chances:
                                            for ndr in num_drops_range:
Beispiel #12
0
                        help="Print out the result of each tool")
    parser.add_argument(
        "-p",
        "--pause",
        action="store_true",
        help=
        "Stop execution until 'return' key pressed (only workes in combination with -v)"
    )
    in_args = parser.parse_args()

    # Validate input reference file
    if not os.path.isfile(in_args.reference):
        sys.stderr("Error: Reference file does not exist\n")
        sys.exit()

    seqbuddy = Sb.SeqBuddy(in_args.reference)
    if seqbuddy.alpha != IUPAC.ambiguous_dna:
        sys.stderr("Error: Reference file must be DNA\n")
        sys.exit()

    if seqbuddy.in_format not in ["genbank", "gb"]:
        sys.stderr("Error: Reference file must be GenBank format\n")
        sys.exit()

    # Create or load all necessary reference files
    ref_dir = "{0}{1}reference{1}".format(
        os.path.dirname(os.path.realpath(__file__)), os.path.sep)
    ref_name = in_args.reference.split(os.sep)[-1]
    ref_name = os.path.splitext(ref_name)[0]

    if not os.path.isfile("%s%s.gb" % (ref_dir, ref_name)):
        '--speed up the process')
    parser.add_argument('input_file', help='transdecoder file')
    parser.add_argument('database', help='blastp database path')
    parser.add_argument('num_cores', type=int, help='number of cores')
    parser.add_argument('-gs', '--group_size', type=int, help='group size')
    parser.add_argument('-o',
                        '--out_file',
                        default='blastp.outfmt6',
                        help='output file')
    parser.add_argument('-q',
                        '--quiet',
                        help='suppress run time output counter',
                        action='store_true')
    in_args = parser.parse_args()
    # sb.Seqbuddy(input_file) creates a variable of input sequences
    input_seqs = sb.SeqBuddy(in_args.input_file)
    # number of 'groups' of cores -- we divide the total number requested by 3 (number of jobs to perform at once)
    # we have to floor it b/c remainders/leftover cores are not allowed -- need at least 3 cores per job
    num_cores = floor(in_args.num_cores / 3)
    # if we specify group sizes, we're good to go -- otherwise, group size is the ceil(len(input_seqs)/num_cores)
    # You should usually try to specify group sizes
    group_size = ceil(
        len(input_seqs) /
        num_cores) if not in_args.group_size else in_args.group_size
    # specifies which records/seqs are in each group based on group size -- list comprehension
    records_list = [
        input_seqs.records[i:i + group_size]
        for i in range(0, len(input_seqs.records), group_size)
    ]

    ##########
Beispiel #14
0
    def start(self):
        """
        Run the worker's main loop until its check file is deleted.

        Starts the heartbeat, creates the ``Worker_<id>`` check file and the
        ``.Worker_<id>.dat`` data file in ``self.working_dir``, then keeps
        fetching jobs from the queue for as long as the check file exists.
        For each job the sequences are read from ``<id_hash>.seqs`` in
        ``self.output``, an alignment is created (full job) or read back
        (sub-job), psipred dataframes are prepared, the all-by-all data is
        scored through ``br.run_multicore_function`` and the final results
        are processed. Elapsed time is accumulated in ``self.idle`` and
        ``self.running``. Once the loop ends the data file is removed and
        ``self.terminate`` is called.
        """
        self.split_time = time.time()
        self.start_time = time.time()

        self.heartbeat.start()
        # The existence of this file is the loop condition below
        self.worker_file = os.path.join(self.working_dir,
                                        "Worker_%s" % self.heartbeat.id)
        with open(self.worker_file, "w") as ofile:
            ofile.write("To terminate this Worker, simply delete this file.")

        self.data_file = os.path.join(self.working_dir,
                                      ".Worker_%s.dat" % self.heartbeat.id)
        # Create (or truncate) the data file up front
        open(self.data_file, "w").close()

        # NOTE(review): purpose of dummy_func is not visible from this method
        helpers.dummy_func()

        self.last_heartbeat_from_master = time.time()
        self.printer.write("Starting Worker_%s" % self.heartbeat.id)
        self.printer.new_line(1)

        # Counts down on every empty fetch; the idle message is printed when it reaches 0
        idle_countdown = 1
        while os.path.isfile(self.worker_file):
            # Percentage of the tracked time (idle + running) spent idle
            idle = round(100 * self.idle / (self.idle + self.running), 2)
            if not idle_countdown:
                self.printer.write("Idle %s%%" % idle)
                idle_countdown = 5

            # Make sure there are some masters still kicking around
            self.check_masters(idle)

            # Check for and clean up dead threads and orphaned jobs every twentieth(ish) time through
            rand_check = random()
            if rand_check > 0.95:
                self.clean_dead_threads()

            # Fetch a job from the queue
            data = self.fetch_queue_job()
            if data:
                full_name, psipred_dir, align_m, align_p, trimal, gap_open, gap_extend = data
                # A name without "_" is a full job (sub-job 1 of 1); otherwise the name
                # must split into exactly subjob_num, num_subjobs and id_hash
                subjob_num, num_subjobs, id_hash = [1, 1, full_name] if len(full_name.split("_")) == 1 \
                    else full_name.split("_")
                subjob_num = int(subjob_num)
                num_subjobs = int(num_subjobs)
                self.printer.write("Running %s" % full_name)
            else:
                time.sleep(
                    random() * self.idle_workers()
                )  # Pause for some time relative to num idle workers
                idle_countdown -= 1
                # Book the time since the last split as idle
                self.idle += time.time() - self.split_time
                self.split_time = time.time()
                continue

            try:
                idle_countdown = 1
                seqbuddy = Sb.SeqBuddy("%s/%s.seqs" % (self.output, id_hash),
                                       in_format="fasta")

                # Prepare alignment
                if len(seqbuddy) == 1:
                    # Caught by the except clause at the bottom of this try block
                    raise ValueError("Queued job of size 1 encountered: %s" %
                                     id_hash)
                else:
                    if num_subjobs == 1:
                        self.printer.write("Creating MSA (%s seqs)" %
                                           len(seqbuddy))
                        alignment = Alb.generate_msa(Sb.make_copy(seqbuddy),
                                                     align_m,
                                                     params=align_p,
                                                     quiet=True)
                    else:
                        # Sub-jobs read the alignment file back from self.output
                        self.printer.write("Reading MSA (%s seqs)" %
                                           len(seqbuddy))
                        alignment = Alb.AlignBuddy(
                            os.path.join(self.output, "%s.aln" % id_hash))

                # Prepare psipred dataframes
                psipred_dfs = self.prepare_psipred_dfs(seqbuddy, psipred_dir)

                if num_subjobs == 1:  # This is starting a full job from scratch, not a sub-job
                    # Need to specify what columns the PsiPred files map to now that there are gaps.
                    psipred_dfs = rdmcl.update_psipred(alignment, psipred_dfs,
                                                       "msa")

                    # TrimAl
                    self.printer.write("Trimal (%s seqs)" % len(seqbuddy))
                    alignment = rdmcl.trimal(seqbuddy, trimal, alignment)

                    with helpers.ExclusiveConnect(os.path.join(
                            self.output, "write.lock"),
                                                  max_lock=0):
                        # Place these write commands in ExclusiveConnect to ensure a writing lock
                        # An existing alignment file is never overwritten
                        if not os.path.isfile(
                                os.path.join(self.output, "%s.aln" % id_hash)):
                            alignment.write(os.path.join(
                                self.output, "%s.aln" % id_hash),
                                            out_format="fasta")

                    # Re-update PsiPred files now that some columns, possibly including non-gap characters, are removed
                    self.printer.write("Updating %s psipred dataframes" %
                                       len(seqbuddy))
                    psipred_dfs = rdmcl.update_psipred(alignment, psipred_dfs,
                                                       "trimal")

                # Prepare all-by-all list
                self.printer.write("Preparing all-by-all data")
                data_len, data = rdmcl.prepare_all_by_all(
                    seqbuddy, psipred_dfs, self.cpus)

                # A full job above the size cutoff is handed to spawn_subjobs;
                # sub-jobs after the first load their data via load_subjob
                if num_subjobs == 1 and data_len > self.cpus * self.job_size_coff:
                    data_len, data, subjob_num, num_subjobs = self.spawn_subjobs(
                        id_hash, data, psipred_dfs, gap_open, gap_extend)
                elif subjob_num > 1:
                    data_len, data = self.load_subjob(id_hash, subjob_num,
                                                      num_subjobs, psipred_dfs)

                # Launch multicore
                self.printer.write("Running all-by-all data (%s comparisons)" %
                                   data_len)
                # Reset the data file to just the header (written without a trailing newline)
                with open(self.data_file, "w") as ofile:
                    ofile.write("seq1,seq2,subsmat,psi")

                br.run_multicore_function(data,
                                          rdmcl.mc_score_sequences,
                                          quiet=True,
                                          max_processes=self.cpus,
                                          func_args=[
                                              alignment, gap_open, gap_extend,
                                              self.data_file
                                          ])

                self.printer.write("Processing final results")
                self.process_final_results(id_hash, subjob_num, num_subjobs)

                # Book the time since the last split as running
                self.running += time.time() - self.split_time
                self.split_time = time.time()

            except (OSError, FileNotFoundError, br.GuessError,
                    ValueError) as err:
                # A failed full job terminates the worker; a failed sub-job is
                # removed from the processing table and the loop carries on
                if num_subjobs == 1:
                    self.terminate(
                        "something wrong with primary cluster %s\n%s" %
                        (full_name, err))
                else:
                    with helpers.ExclusiveConnect(self.wrkdb_path) as cursor:
                        cursor.execute("DELETE FROM processing WHERE hash=?",
                                       (full_name, ))
                    continue

        # Broken out of while loop, clean up and terminate worker
        if os.path.isfile(self.data_file):
            os.remove(self.data_file)

        self.terminate("deleted check file")
Beispiel #15
0
    def start(self):
        """
        Run the worker's main loop until its check file is deleted.

        Starts the heartbeat, creates the ``Worker_<id>`` check file and the
        ``.Worker_<id>.dat`` data file in ``self.working_dir``, then keeps
        fetching jobs from the queue for as long as the check file exists.
        For each job the sequences are read from ``<id_hash>.seqs`` in
        ``self.output``, an alignment is created (full job) or read back
        (sub-job), psipred dataframes are prepared, the all-by-all data is
        scored through ``br.run_multicore_function`` and the final results
        are processed. Elapsed time is accumulated in ``self.idle`` and
        ``self.running``. Once the loop ends the data file is removed and
        ``self.terminate`` is called.
        """
        self.split_time = time.time()
        self.start_time = time.time()

        self.heartbeat.start()
        # The existence of this file is the loop condition below
        self.worker_file = os.path.join(self.working_dir, "Worker_%s" % self.heartbeat.id)
        with open(self.worker_file, "w") as ofile:
            ofile.write("To terminate this Worker, simply delete this file.")

        self.data_file = os.path.join(self.working_dir, ".Worker_%s.dat" % self.heartbeat.id)
        # Create (or truncate) the data file up front
        open(self.data_file, "w").close()

        # NOTE(review): purpose of dummy_func is not visible from this method
        helpers.dummy_func()

        self.last_heartbeat_from_master = time.time()
        self.printer.write("Starting Worker_%s" % self.heartbeat.id)
        self.printer.new_line(1)

        # Counts down on every empty fetch; the idle message is printed when it reaches 0
        idle_countdown = 1
        while os.path.isfile(self.worker_file):
            # Percentage of the tracked time (idle + running) spent idle
            idle = round(100 * self.idle / (self.idle + self.running), 2)
            if not idle_countdown:
                self.printer.write("Idle %s%%" % idle)
                idle_countdown = 5

            # Make sure there are some masters still kicking around
            self.check_masters(idle)

            # Check for and clean up dead threads and orphaned jobs every twentieth(ish) time through
            rand_check = random()
            if rand_check > 0.95:
                self.clean_dead_threads()

            # Fetch a job from the queue
            data = self.fetch_queue_job()
            if data:
                full_name, psipred_dir, align_m, align_p, trimal, gap_open, gap_extend = data
                # A name without "_" is a full job (sub-job 1 of 1); otherwise the name
                # must split into exactly subjob_num, num_subjobs and id_hash
                subjob_num, num_subjobs, id_hash = [1, 1, full_name] if len(full_name.split("_")) == 1 \
                    else full_name.split("_")
                subjob_num = int(subjob_num)
                num_subjobs = int(num_subjobs)
                self.printer.write("Running %s" % full_name)
            else:
                time.sleep(random() * self.idle_workers())  # Pause for some time relative to num idle workers
                idle_countdown -= 1
                # Book the time since the last split as idle
                self.idle += time.time() - self.split_time
                self.split_time = time.time()
                continue

            try:
                idle_countdown = 1
                seqbuddy = Sb.SeqBuddy("%s/%s.seqs" % (self.output, id_hash), in_format="fasta")

                # Prepare alignment
                if len(seqbuddy) == 1:
                    # Caught by the except clause at the bottom of this try block
                    raise ValueError("Queued job of size 1 encountered: %s" % id_hash)
                else:
                    if num_subjobs == 1:
                        self.printer.write("Creating MSA (%s seqs)" % len(seqbuddy))
                        alignment = Alb.generate_msa(Sb.make_copy(seqbuddy), align_m,
                                                     params=align_p, quiet=True)
                    else:
                        # Sub-jobs read the alignment file back from self.output
                        self.printer.write("Reading MSA (%s seqs)" % len(seqbuddy))
                        alignment = Alb.AlignBuddy(os.path.join(self.output, "%s.aln" % id_hash))

                # Prepare psipred dataframes
                psipred_dfs = self.prepare_psipred_dfs(seqbuddy, psipred_dir)

                if num_subjobs == 1:  # This is starting a full job from scratch, not a sub-job
                    # Need to specify what columns the PsiPred files map to now that there are gaps.
                    psipred_dfs = rdmcl.update_psipred(alignment, psipred_dfs, "msa")

                    # TrimAl
                    self.printer.write("Trimal (%s seqs)" % len(seqbuddy))
                    alignment = rdmcl.trimal(seqbuddy, trimal, alignment)

                    with helpers.ExclusiveConnect(os.path.join(self.output, "write.lock"), max_lock=0):
                        # Place these write commands in ExclusiveConnect to ensure a writing lock
                        # An existing alignment file is never overwritten
                        if not os.path.isfile(os.path.join(self.output, "%s.aln" % id_hash)):
                            alignment.write(os.path.join(self.output, "%s.aln" % id_hash), out_format="fasta")

                    # Re-update PsiPred files now that some columns, possibly including non-gap characters, are removed
                    self.printer.write("Updating %s psipred dataframes" % len(seqbuddy))
                    psipred_dfs = rdmcl.update_psipred(alignment, psipred_dfs, "trimal")

                # Prepare all-by-all list
                self.printer.write("Preparing all-by-all data")
                data_len, data = rdmcl.prepare_all_by_all(seqbuddy, psipred_dfs, self.cpus)

                # A full job above the size cutoff is handed to spawn_subjobs;
                # sub-jobs after the first load their data via load_subjob
                if num_subjobs == 1 and data_len > self.cpus * self.job_size_coff:
                    data_len, data, subjob_num, num_subjobs = self.spawn_subjobs(id_hash, data, psipred_dfs,
                                                                                 gap_open, gap_extend)
                elif subjob_num > 1:
                    data_len, data = self.load_subjob(id_hash, subjob_num, num_subjobs, psipred_dfs)

                # Launch multicore
                self.printer.write("Running all-by-all data (%s comparisons)" % data_len)
                # Reset the data file to just the header (written without a trailing newline)
                with open(self.data_file, "w") as ofile:
                    ofile.write("seq1,seq2,subsmat,psi")

                br.run_multicore_function(data, rdmcl.mc_score_sequences, quiet=True, max_processes=self.cpus,
                                          func_args=[alignment, gap_open, gap_extend, self.data_file])

                self.printer.write("Processing final results")
                self.process_final_results(id_hash, subjob_num, num_subjobs)

                # Book the time since the last split as running
                self.running += time.time() - self.split_time
                self.split_time = time.time()

            except (OSError, FileNotFoundError, br.GuessError, ValueError) as err:
                # A failed full job terminates the worker; a failed sub-job is
                # removed from the processing table and the loop carries on
                if num_subjobs == 1:
                        self.terminate("something wrong with primary cluster %s\n%s" % (full_name, err))
                else:
                    with helpers.ExclusiveConnect(self.wrkdb_path) as cursor:
                        cursor.execute("DELETE FROM processing WHERE hash=?", (full_name,))
                    continue

        # Broken out of while loop, clean up and terminate worker
        if os.path.isfile(self.data_file):
            os.remove(self.data_file)

        self.terminate("deleted check file")
Beispiel #16
0
def main():
    """
    Command-line entry point: run PSI-PRED on every record of a sequence file.

    Parses the command line, loads the sequences, makes sure the ss2 output
    directory exists and then maps ``mc_psi_pred`` over all records with
    ``br.run_multicore_function``, using at most --max_cpus processes.
    """
    def fmt(prog):
        return br.CustomHelpFormatter(prog)

    # prog matches the script named in the usage text below
    parser = argparse.ArgumentParser(prog="run_psipred",
                                     formatter_class=fmt,
                                     add_help=False,
                                     usage=argparse.SUPPRESS,
                                     description='''\
\033[1mRun PSI-PRED\033[m
  For Sofia, to do awesome stuff with

  Pass in a file of sequences, get secondary structure in return.
  
\033[1mUsage\033[m:
  run_psipred.py "/path/to/seqs" [-options]
''')

    # Positional
    positional = parser.add_argument_group(
        title="\033[1mPositional argument\033[m")

    positional.add_argument(
        "seqs", help="Specify sequence file (most formats accepted)")
    positional.add_argument("save_ss2",
                            action="store",
                            help="Specify directory to save/read ss2 files.")

    # Optional commands
    parser_flags = parser.add_argument_group(
        title="\033[1mAvailable commands\033[m")
    parser_flags.add_argument(
        "-cpu",
        "--max_cpus",
        type=int,
        action="store",
        default=CPUS,
        metavar="",
        help="Specify the maximum number of cores RD-MCL can use (default=%s)"
        % CPUS)

    # Misc
    misc = parser.add_argument_group(title="\033[1mMisc options\033[m")
    misc.add_argument('-v', '--version', action='version', version="1.0")
    misc.add_argument('-h',
                      '--help',
                      action="help",
                      help="Show this help message and exit")

    in_args = parser.parse_args()

    sequences = Sb.SeqBuddy(in_args.seqs)
    # save_ss2 is positional, so this branch is only taken for an empty string
    if not in_args.save_ss2:
        # Keep the TempDir object referenced until the work is done.
        # NOTE(review): presumably TempDir removes its directory once the
        # object is discarded -- confirm against buddy_resources.
        temp_dir = br.TempDir()
        ss2_files = temp_dir.path
    else:
        ss2_files = os.path.abspath(in_args.save_ss2)
        os.makedirs(ss2_files, exist_ok=True)

    br.run_multicore_function(sequences.records,
                              mc_psi_pred, [ss2_files],
                              max_processes=in_args.max_cpus)
def test_make_msa(hf, monkeypatch):
    seqbuddy = hf.get_data("cteno_panxs")
    seqbuddy.records = seqbuddy.records[:2]
    alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo")
    assert type(alb_obj) == Alb.AlignBuddy
    assert str(alb_obj) == """\
>Bab-PanxαA Be_abyssicola|m.8 and m.21|ML036514|937+ 2.
MLLLGSLGTIKNLSIFKDLSLDDWLDQMNRTFMFLLLCFMGTIVAVSQYTGKNISCNGFE
KFSDDFSQDYCWTQGLYTIKEAYDLPESQIPYPGIIPENVPACREHSLKNGGKIICPPPE
EIKPLTRARHLWYQWIPFYFWVIAPVFYLPYMFVKRMGLDRMKPLLKIMSDYYHCTTETP
SEEIIVKCADWVYNSIVDRLSEGSSWTSWRNRHGLGLAVLFSKLMYLGGSILVMMVTTLM
FQVGDFKTYGIEWLKQFPSDENYTTSVKHKLFPKMVACEIKRWGPSGLEEENGMCVLAPN
VIYQYIFLIMWFALAITICTNFFNIFFWVFKLTATRYTYSKLVATGHFSHKHPGWKFMYY
RIGTSGRVLLNIVAQNTNPIIFGAIMEKLTPSVIKHLRIGHVPGEYLTDPA
>Bab-PanxαB Be_abyssicola|m.19|ML47742|1063 2.
--MLDILSKFKGVTPFKGITIDDGWDQLNRSFMFVLLVVMGTTVTVRQYTGSVISCDGFK
KFGSTFAEDYCWTQGLYTVLEGYDQPSYNIPYPGLLPDELPACTPVKLKDGTRLKCPDAD
QLMSPTRISHLWYQWVPFYFWLAAAAFFMPYLLYKNFGMGDIKPLVRLLHNPVESDQ--E
LKKMTDKAATWLFYKFDLYMSEQSLVASLTRKHGLGLSMVFVKILYAAVSFCCFILTAEM
FSIGDFKTYGSKWIKKMRYEDTLATEEKDKLFPKMVACEVKRWGASGIEEEQGMCVLAPN
VINQYLFLILWFCLVFVMICNIVSIFVSLIKLLFTYGSYRRLLST-AFLRDDSAIKHMYF
NVGSSGRLILHVLANNTAPRVFEDILLTLAPKLIQRKLRGNGKAV------
"""

    seqbuddy.records = [seqbuddy.records[0]]
    alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo")
    assert type(alb_obj) == Alb.AlignBuddy
    assert str(alb_obj) == """\
>Bab-PanxαA Be_abyssicola|m.8 and m.21|ML036514|937+ 2.
MLLLGSLGTIKNLSIFKDLSLDDWLDQMNRTFMFLLLCFMGTIVAVSQYTGKNISCNGFE
KFSDDFSQDYCWTQGLYTIKEAYDLPESQIPYPGIIPENVPACREHSLKNGGKIICPPPE
EIKPLTRARHLWYQWIPFYFWVIAPVFYLPYMFVKRMGLDRMKPLLKIMSDYYHCTTETP
SEEIIVKCADWVYNSIVDRLSEGSSWTSWRNRHGLGLAVLFSKLMYLGGSILVMMVTTLM
FQVGDFKTYGIEWLKQFPSDENYTTSVKHKLFPKMVACEIKRWGPSGLEEENGMCVLAPN
VIYQYIFLIMWFALAITICTNFFNIFFWVFKLTATRYTYSKLVATGHFSHKHPGWKFMYY
RIGTSGRVLLNIVAQNTNPIIFGAIMEKLTPSVIKHLRIGHVPGEYLTDPA
"""

    # Don't modify if any sequence is reduced to nothing
    align = Alb.AlignBuddy("""\
>A
MSTGTC-------
>B
M---TC-------
>C
M---TC---AILP
>D
-STP---YWAILP
""", in_format="fasta")

    seqbuddy = Sb.SeqBuddy(Alb.make_copy(align).records(), in_format="fasta")
    seqbuddy = Sb.clean_seq(seqbuddy)

    monkeypatch.setattr(Alb, "generate_msa", lambda *_, **__: align)
    alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo", trimal=[0.3])
    assert str(alb_obj) == str(align)

    # Don't modify if average sequence length is reduced by more than half
    align = Alb.AlignBuddy("""\
>A
MSTGTC-------
>B
M---TC-------
>C
M---TC---AILP
>D
-STPTC-YWAILP
""", in_format="fasta")

    seqbuddy = Sb.SeqBuddy(Alb.make_copy(align).records(), in_format="fasta")
    seqbuddy = Sb.clean_seq(seqbuddy)

    monkeypatch.setattr(Alb, "generate_msa", lambda *_, **__: align)
    alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo", trimal=[0.3])
    assert str(alb_obj) == str(align)

    # Remove some gaps
    alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo", trimal=[0.3, 0.55])
    assert str(alb_obj) == """\
        "Stop execution until 'return' key pressed (only works in combination with -v)"
    )
    parser.add_argument("-to",
                        "--timeout",
                        action='store',
                        default=31536000,
                        type=int,
                        help="Set max execution time")
    in_args = parser.parse_args()

    # Validate input reference file
    if not os.path.isfile(in_args.reference):
        sys.stderr("Error: Reference file does not exist\n")
        sys.exit()

    seqbuddy = Sb.SeqBuddy(in_args.reference)
    if seqbuddy.alpha != IUPAC.ambiguous_dna:
        sys.stderr("Error: Reference file must be DNA\n")
        sys.exit()

    if seqbuddy.in_format not in ["genbank", "gb"]:
        sys.stderr("Error: Reference file must be GenBank format\n")
        sys.exit()

    # Create or load all necessary reference files
    ref_dir = "{0}{1}reference{1}".format(
        os.path.dirname(os.path.realpath(__file__)), os.path.sep)
    ref_name = in_args.reference.split(os.sep)[-1]
    ref_name = os.path.splitext(ref_name)[0]

    res_dir = "{0}{1}results{1}{2}{1}".format(
Beispiel #19
0
def main():
    """Command-line entry point: report clusters as lists, sequences, alignments or consensus.

    Reads the parsed arguments from ``argparse_init()``, loads the cluster
    file via ``prepare_clusters(..., hierarchy=True)`` and the sequence file
    into a SeqBuddy object, then builds one output string per cluster (rank).

    The mode argument may be any prefix of "sequences", "alignment",
    "consensus" or "list"; it is normalised to "seqs", "aln", "con" or "list".
    Clusters can be filtered with ``groups`` (regex match on the rank name),
    ``min_size`` and ``max_size``.

    The result is printed to stdout, or, if ``in_args.write`` is set, written
    as one file per rank into that directory.  The process exits early if the
    mode is not recognised or if ``make_msa`` raises SystemError/AttributeError.
    """
    in_args = argparse_init()

    # Expand an abbreviated mode to its canonical short name.  The checks are
    # applied one after another, exactly like a chain of reassignments.
    mode = in_args.mode.lower()
    for full_name, short_name in (("sequences", "seqs"),
                                  ("alignment", "aln"),
                                  ("consensus", "con"),
                                  ("list", "list")):
        if full_name.startswith(mode):
            mode = short_name

    if mode not in ["seqs", "aln", "con", "list"]:
        Sb.br._stderr('Unrecognized mode, please select from ["seqs", "aln", "con", "list"].\n')
        sys.exit()

    if in_args.groups:
        # Build an anchored alternation so only whole rank names match.
        in_args.groups = [x.lower() for x in in_args.groups[0]]
        in_args.groups = "^%s$" % "$|^".join(in_args.groups)

    cluster_file = prepare_clusters(in_args.clusters, hierarchy=True)
    seqbuddy = Sb.SeqBuddy(in_args.sequence_file)
    output = OrderedDict()

    for rank, node in cluster_file.items():
        # Only the first whitespace-delimited token of the key is the rank name.
        rank = rank.split()[0]
        if in_args.groups and not re.search(in_args.groups, rank):
            continue

        if in_args.min_size and len(node) < in_args.min_size:
            continue

        if in_args.max_size and len(node) > in_args.max_size:
            continue

        if in_args.strip_taxa:
            # Drop everything up to and including the first hyphen.
            node = [re.sub(r"^.*?\-", "", x) for x in node]

        # Pull the records of this cluster out of a copy of the full data set.
        ids = "^%s$" % "$|^".join(node)
        subset = Sb.pull_recs(Sb.make_copy(seqbuddy), ids)
        subset = Sb.order_ids(subset)

        rank_output = ""
        if mode == "list":
            rank_output += rank
            for rec in subset.records:
                rec.description = re.sub("^%s" % rec.id, "", rec.description)
                rank_output += "\n%s %s" % (rec.id, rec.description)
            rank_output += "\n"

        elif mode == "seqs":
            for rec in subset.records:
                rec.description = "%s %s" % (rank, rec.description)
            rank_output += str(subset)

        elif mode in ["aln", "con"]:
            try:
                rank_output = make_msa(subset, in_args.aligner, in_args.trimal)
            except (SystemError, AttributeError) as err:
                print(err)
                sys.exit()
            rank_output.out_format = "phylip-relaxed"

        if mode == "con":
            # The record is edited in place and rank_output is what gets
            # stringified below.
            # NOTE(review): presumably consensus_sequence() modifies
            # rank_output itself — confirm.
            rec = Alb.consensus_sequence(rank_output).records()[0]
            rec.id = rank
            rec.name = rank
            rec.description = ""
            rank_output.out_format = "fasta"

        output[rank] = str(rank_output)

    if not in_args.write:
        print("\n".join(output.values()).strip())
        return

    outdir = os.path.abspath(in_args.write)
    os.makedirs(outdir, exist_ok=True)

    # Pick the file extension from the *normalised* mode.  The mode is "seqs"
    # (never "seq") at this point, so that is the value to compare against.
    if mode == "seqs":
        extension = ".%s" % seqbuddy.out_format[:3]
    elif mode == "list":
        extension = ".txt"
    elif mode == "aln":
        extension = ".phy"
    else:
        extension = ".fa"

    for rank, data in output.items():
        with open(os.path.join(outdir, rank + extension), "w") as ofile:
            ofile.write(data)
def test_make_msa(hf, monkeypatch):
    seqbuddy = hf.get_data("cteno_panxs")
    seqbuddy.records = seqbuddy.records[:2]
    alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo")
    assert type(alb_obj) == Alb.AlignBuddy
    assert str(alb_obj) == """\
>Bab-PanxαA Be_abyssicola|m.8 and m.21|ML036514|937+ 2.
MLLLGSLGTIKNLSIFKDLSLDDWLDQMNRTFMFLLLCFMGTIVAVSQYTGKNISCNGFE
KFSDDFSQDYCWTQGLYTIKEAYDLPESQIPYPGIIPENVPACREHSLKNGGKIICPPPE
EIKPLTRARHLWYQWIPFYFWVIAPVFYLPYMFVKRMGLDRMKPLLKIMSDYYHCTTETP
SEEIIVKCADWVYNSIVDRLSEGSSWTSWRNRHGLGLAVLFSKLMYLGGSILVMMVTTLM
FQVGDFKTYGIEWLKQFPSDENYTTSVKHKLFPKMVACEIKRWGPSGLEEENGMCVLAPN
VIYQYIFLIMWFALAITICTNFFNIFFWVFKLTATRYTYSKLVATGHFSHKHPGWKFMYY
RIGTSGRVLLNIVAQNTNPIIFGAIMEKLTPSVIKHLRIGHVPGEYLTDPA
>Bab-PanxαB Be_abyssicola|m.19|ML47742|1063 2.
--MLDILSKFKGVTPFKGITIDDGWDQLNRSFMFVLLVVMGTTVTVRQYTGSVISCDGFK
KFGSTFAEDYCWTQGLYTVLEGYDQPSYNIPYPGLLPDELPACTPVKLKDGTRLKCPDAD
QLMSPTRISHLWYQWVPFYFWLAAAAFFMPYLLYKNFGMGDIKPLVRLLHNPVESDQ--E
LKKMTDKAATWLFYKFDLYMSEQSLVASLTRKHGLGLSMVFVKILYAAVSFCCFILTAEM
FSIGDFKTYGSKWIKKMRYEDTLATEEKDKLFPKMVACEVKRWGASGIEEEQGMCVLAPN
VINQYLFLILWFCLVFVMICNIVSIFVSLIKLLFTYGSYRRLLST-AFLRDDSAIKHMYF
NVGSSGRLILHVLANNTAPRVFEDILLTLAPKLIQRKLRGNGKAV------
"""

    seqbuddy.records = [seqbuddy.records[0]]
    alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo")
    assert type(alb_obj) == Alb.AlignBuddy
    assert str(alb_obj) == """\
>Bab-PanxαA Be_abyssicola|m.8 and m.21|ML036514|937+ 2.
MLLLGSLGTIKNLSIFKDLSLDDWLDQMNRTFMFLLLCFMGTIVAVSQYTGKNISCNGFE
KFSDDFSQDYCWTQGLYTIKEAYDLPESQIPYPGIIPENVPACREHSLKNGGKIICPPPE
EIKPLTRARHLWYQWIPFYFWVIAPVFYLPYMFVKRMGLDRMKPLLKIMSDYYHCTTETP
SEEIIVKCADWVYNSIVDRLSEGSSWTSWRNRHGLGLAVLFSKLMYLGGSILVMMVTTLM
FQVGDFKTYGIEWLKQFPSDENYTTSVKHKLFPKMVACEIKRWGPSGLEEENGMCVLAPN
VIYQYIFLIMWFALAITICTNFFNIFFWVFKLTATRYTYSKLVATGHFSHKHPGWKFMYY
RIGTSGRVLLNIVAQNTNPIIFGAIMEKLTPSVIKHLRIGHVPGEYLTDPA
"""

    # Don't modify if any sequence is reduced to nothing
    align = Alb.AlignBuddy("""\
>A
MSTGTC-------
>B
M---TC-------
>C
M---TC---AILP
>D
-STP---YWAILP
""",
                           in_format="fasta")

    seqbuddy = Sb.SeqBuddy(Alb.make_copy(align).records(), in_format="fasta")
    seqbuddy = Sb.clean_seq(seqbuddy)

    monkeypatch.setattr(Alb, "generate_msa", lambda *_, **__: align)
    alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo", trimal=[0.3])
    assert str(alb_obj) == str(align)

    # Don't modify if average sequence length is reduced by more than half
    align = Alb.AlignBuddy("""\
>A
MSTGTC-------
>B
M---TC-------
>C
M---TC---AILP
>D
-STPTC-YWAILP
""",
                           in_format="fasta")

    seqbuddy = Sb.SeqBuddy(Alb.make_copy(align).records(), in_format="fasta")
    seqbuddy = Sb.clean_seq(seqbuddy)

    monkeypatch.setattr(Alb, "generate_msa", lambda *_, **__: align)
    alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo", trimal=[0.3])
    assert str(alb_obj) == str(align)

    # Remove some gaps
    alb_obj = group_by_cluster.make_msa(seqbuddy,
                                        "clustalo",
                                        trimal=[0.3, 0.55])
    assert str(alb_obj) == """\
Beispiel #21
0
Bfr     4
Cfu     6
Dgl     9
Edu     9
Hca     8
Hru     5
Hvu     14
Lcr     12
Lla     3
Mle     12
Oma     4
Pba     7
Tin     6
Vpa     7
'''
# Module-level fixtures: loaded once at import time from RESOURCE_PATH.

# Unaligned sequences and the matching alignment.
cteno_panxs = Sb.SeqBuddy(
    "%s%sCteno_pannexins.fa" % (RESOURCE_PATH, SEP)
)
cteno_panxs_aln = Alb.AlignBuddy(
    "%s%sCteno_pannexins_aln.fa" % (RESOURCE_PATH, SEP)
)

# Sorted record identifiers of the unaligned sequences.
ids = [rec.id for rec in cteno_panxs.records]
ids.sort()

# Similarity-score table; the file has no header row, so columns are named here.
# NOTE(review): unlike the two paths above, this one is built without SEP, so it
# only resolves if RESOURCE_PATH already ends with a path separator — confirm.
sim_scores = pd.read_csv(
    "%sCteno_pannexins_sim.scores" % RESOURCE_PATH,
    index_col=False,
    header=None,
)
sim_scores.columns = [
    "seq1",
    "seq2",
    "subsmat",
    "psi",
    "raw_score",
    "score",
]


# #################################  -  Helper class  -  ################################## #
class HelperMethods(object):
    def __init__(self):
        self.sep = SEP
        self.resource_path = RESOURCE_PATH
        self._cteno_panxs = cteno_panxs
Beispiel #22
0
    num_duplications_range = num_duplications_range if len(num_duplications_range) == 1 \
        else list(range(num_duplications_range[0], num_duplications_range[1] + 1))

    models = in_args.models
    alphas = make_range_from_inargs(in_args.alpha)

    category_range = sorted(in_args.categories)
    category_range = category_range if len(category_range) == 1 \
        else list(range(num_drops_range[0], num_drops_range[1] + 1))

    seed_file = in_args.seed_file
    assert os.path.exists(seed_file)

    with open(seed_file, 'r') as seed_io:
        seed_seq = seed_io.read()
    seed_seq = str(Sb.clean_seq(Sb.SeqBuddy(seed_seq, out_format='raw')))
    seed_seq = seed_seq.upper().strip()

    # ugly-ass loop
    arguments = []
    for grp in group_range:
        for tax in taxa_range:
            for mdl in models:
                for gbr in gene_branch_len:
                    for gstdv in gene_branch_stdev:
                        for sbr in species_branch_len:
                            for sstdv in species_branch_stdev:
                                for alp in alphas:
                                    for cat in category_range:
                                        for drp in drop_chances:
                                            for ndr in num_drops_range:
# Collect the input files that have an accepted extension and are not already
# listed in prev_blast_dbs.
seq_files = []
for _file in files:
    # Text after the last "." (the whole name when there is no dot).
    extension = _file.rpartition(".")[2]
    # NOTE(review): the extension is used as a regex pattern and every match in
    # the base name is removed, while the "." before it is kept — confirm that
    # the resulting name is what prev_blast_dbs is expected to contain.
    name = re.sub(extension, "", _file.rpartition("/")[2])
    if extension not in in_args.extensions or name in prev_blast_dbs:
        continue
    seq_files.append("%s/%s" % (in_args.indir, _file))

print("***Hashing proteomes***")
# Alphabet for the random hashes: upper-case letters followed by digits.
chars = "".join((string.ascii_uppercase, string.digits))
for i in range(len(seq_files)):
    _file = seq_files[i]
    name = _file.split("/")[-1]
    name = "_".join(name.split(".")[:-1])

    seqbuddy = Sb.SeqBuddy(_file)
    seqbuddy = Sb.clean_seq(seqbuddy)

    if "%s/blastdbs/%s" % (in_args.outdir, name) in prev_blast_dbs:
        for record in seqbuddy.records:
            if not in_args.original_names:
                record.id = reverse_hash_map["%s@%s" % (name, record.id)]
            else:
                record.id = reverse_hash_map[record.id]
        prev_records_list += seqbuddy.records
        continue

    print(name)
    for indx, rec in enumerate(seqbuddy.records):
        while True:
            new_hash = "".join([random.choice(chars) for _ in range(10)])