Beispiel #1
0
    def test_write_fasta_1(self):
        """Verify write_fasta() creates a readable fasta formatted file.

        Writes ``self.fasta_dict_1`` to ``self.fasta_file``, reads the file
        back as one record, and checks that the record's id and sequence
        match the first entry of the dictionary.
        """
        fileio.write_fasta(self.fasta_dict_1, self.fasta_file)

        # Read the file back as a single record.
        parsed = SeqIO.read(self.fasta_file, "fasta")

        # The first dictionary entry is the one compared against the file.
        expected_id, expected_seq = list(self.fasta_dict_1.items())[0]

        self.assertEqual(parsed.id, expected_id)
        self.assertEqual(str(parsed.seq), expected_seq)
Beispiel #2
0
def create_genes_fasta(alchemist, values, fasta_dir, db_name,
                       mol_type="prot", verbose=False, data_cache=None):
    """Write the translations mapped from ``values`` to a fasta file.

    Args:
        alchemist: Forwarded unchanged to ``map_translations()``.
        values: Forwarded unchanged to ``map_translations()``.
        fasta_dir: Directory object exposing ``joinpath()``; the output
            file is created inside it.
        db_name: String stem of the output file; ".fasta" is appended.
        mol_type: Accepted for interface compatibility; not read by this
            function body.
        verbose: Accepted for interface compatibility; not read by this
            function body.
        data_cache: Optional cache. Replaced by a fresh dict when None,
            but not otherwise used by this function body.

    Returns:
        The path of the fasta file that was written.
    """
    # Never share a mutable default; note that nothing below reads the cache.
    data_cache = {} if data_cache is None else data_cache

    out_name = ".".join((db_name, "fasta"))
    fasta_path = fasta_dir.joinpath(out_name)

    # Mapping handed straight to write_fasta(); presumably gene id ->
    # translation, judging by the original variable name (gs_to_ts).
    id_to_translation = map_translations(alchemist, values)
    pdm_fileio.write_fasta(id_to_translation, fasta_path)

    return fasta_path
Beispiel #3
0
    def test_reintroduce_fasta_duplicates_2(self):
        """Verify reintroduce_duplicates() recognizes duplicate sequences.

        Writes ``self.test_fa_1_gs_to_ts`` to ``self.fasta_file``, calls
        reintroduce_fasta_duplicates() with ``self.test_fa_2_ts_to_gs``, and
        then checks every record in the file against
        ``self.test_fa_2_gs_to_ts``.
        """
        fileio.write_fasta(self.test_fa_1_gs_to_ts, self.fasta_file)
        fileio.reintroduce_fasta_duplicates(
            self.test_fa_2_ts_to_gs, self.fasta_file)

        with self.fasta_file.open(mode="r") as handle:
            for entry in SeqIO.parse(handle, "fasta"):
                # One sub-test per record so a failure names the record id.
                with self.subTest(seq_id=entry.id):
                    expected = self.test_fa_2_gs_to_ts[entry.id]
                    self.assertTrue(expected is not None)
                    self.assertEqual(str(entry.seq), expected)
Beispiel #4
0
    def test_write_fasta_2(self):
        """Verify write_fasta() can properly concatenate fasta files.

        Writes ``self.fasta_dict_2`` to ``self.fasta_file`` and checks that
        every record parsed back has an id present in the dictionary and a
        sequence equal to the dictionary value for that id.
        """
        fileio.write_fasta(self.fasta_dict_2, self.fasta_file)

        parsed_records = SeqIO.parse(self.fasta_file, "fasta")
        expected_ids = list(self.fasta_dict_2.keys())

        for parsed in parsed_records:
            # The id must be known before its sequence can be looked up.
            self.assertTrue(parsed.id in expected_ids)

            expected_seq = self.fasta_dict_2[parsed.id]
            self.assertEqual(str(parsed.seq), expected_seq)
Beispiel #5
0
def write_phams_process(fasta_dir, aln_dir, pham, pham_translations):
    """Write the fasta file, and when applicable the alignment, for a pham.

    Args:
        fasta_dir: Directory object exposing ``joinpath()``; receives
            "<pham>_genes.fasta".
        aln_dir: Directory object exposing ``joinpath()``; receives
            "<pham>_genes.aln".
        pham: Pham identifier; converted with ``str()`` for the file names.
        pham_translations: Mapping of translation -> sequence of gene ids
            (indexed with ``[0]`` below).

    The function returns nothing; its results are the files it writes.
    """
    pham_label = str(pham)
    fasta_path = fasta_dir.joinpath(pham_label + "_genes.fasta")
    aln_path = aln_dir.joinpath(pham_label + "_genes.aln")

    # Each distinct translation is written once, under its first gene id.
    representatives = {
        gene_ids[0]: translation
        for translation, gene_ids in pham_translations.items()
    }
    fileio.write_fasta(representatives, fasta_path)

    # The alignment step is skipped when there is only one distinct
    # translation.
    if len(pham_translations) > 1:
        run_clustalo(fasta_path, aln_path)
        fileio.reintroduce_fasta_duplicates(pham_translations, aln_path)

    fileio.reintroduce_fasta_duplicates(pham_translations, fasta_path)
Beispiel #6
0
def create_pham_hmms_process(working_dir, pham, ts_to_gs, name, M, seq_id,
                             add_cons, seq_lim):
    """Align a pham's distinct translations and build an HMM file from them.

    Args:
        working_dir: Directory object exposing ``joinpath()``; receives
            "<pham>.aln" and "<pham>.hhm".
        pham: Pham identifier; converted with ``str()`` for the file names.
        ts_to_gs: Mapping of translation -> sequence of gene ids (indexed
            with ``[0]`` below).
        name: Forwarded unchanged to ``hhmake()``.
        M: Forwarded unchanged to ``hhmake()``.
        seq_id: Forwarded unchanged to ``hhmake()``.
        add_cons: Forwarded unchanged to ``hhmake()``.
        seq_lim: Forwarded unchanged to ``hhmake()``.

    Returns:
        A ``(pham, hmm_path)`` tuple.
    """
    aln_path = working_dir.joinpath(".".join([str(pham), "aln"]))
    hmm_path = working_dir.joinpath(".".join([str(pham), "hhm"]))

    # write_fasta() takes identifier -> sequence, so each distinct
    # translation is keyed by its first gene id.  The previous code built
    # the mapping the other way round (translation -> gene id), which put
    # the translation in the fasta header and the gene id in the sequence.
    gs_to_ts = {ids[0]: ts for ts, ids in ts_to_gs.items()}

    pdm_fileio.write_fasta(gs_to_ts, aln_path)

    # Align in place (same path for input and output); the two None
    # arguments are the matrix and tree output paths, which are not needed.
    clustalo(aln_path, aln_path, None, None, "fasta", "fasta")

    hhmake(aln_path, hmm_path, name, add_cons, seq_lim, M, seq_id)

    return (pham, hmm_path)
Beispiel #7
0
def estimate_linker_sequence(source_cluster, target_centroid_seq, temp_dir):
    """Descend the cluster's guide tree towards the target centroid.

    At each step the furthest sequence of the first two child clades is
    scored against ``target_centroid_seq`` and the walk continues into the
    higher-scoring child (the second child wins ties).  The furthest
    sequence of the node where the walk stops is returned.

    Args:
        source_cluster: Cluster object whose ``GT`` attribute holds the
            guide tree (its ``clade`` attribute is the starting node).
        target_centroid_seq: Sequence each candidate leaf is scored against.
        temp_dir: Directory object exposing ``joinpath()``; receives
            "<linker name>.fasta" when the linker has no file recorded yet.

    Returns:
        The chosen leaf.  Its ``comment`` mapping is updated in place: a
        "TargetIdentity" entry is stored on every leaf scored here, and
        "FilePath" is set on the returned leaf if it was missing.
    """
    def identity_to_target(leaf):
        # Reuse a score already stored on the leaf; otherwise compute and
        # store it.
        pid = leaf.comment.get("TargetIdentity")
        if pid is None:
            pid = alignment.calculate_levenshtein(
                leaf.comment["Sequence"],
                target_centroid_seq,
                identity=True)
            leaf.comment["TargetIdentity"] = pid
        return pid

    node = source_cluster.GT.clade
    while node.is_bifurcating() and len(node.clades) > 1:
        # Only the first two children are ever considered.
        left_child, right_child = node.clades[0], node.clades[1]

        left_leaf = get_furthest_sequence(source_cluster, left_child)
        right_leaf = get_furthest_sequence(source_cluster, right_child)

        left_pid = identity_to_target(left_leaf)
        right_pid = identity_to_target(right_leaf)

        node = left_child if left_pid > right_pid else right_child

    linker = get_furthest_sequence(source_cluster, node)

    # A linker that already has a recorded file needs no further work.
    if linker.comment.get("FilePath") is not None:
        return linker

    linker_path = temp_dir.joinpath(".".join([linker.name, "fasta"]))
    if not linker_path.is_file():
        pdm_fileio.write_fasta({linker.name: linker.comment["Sequence"]},
                               linker_path)
    linker.comment["FilePath"] = linker_path

    return linker
Beispiel #8
0
def create_pham_alns_process(working_dir, pham, ts_to_gs, mat_out, tree_out,
                             infile_type, outfile_type):
    """Align a pham's distinct translations and restore its duplicates.

    Args:
        working_dir: Directory object exposing ``joinpath()``; receives
            "<pham>.aln" and, on request, "<pham>.mat" and "<pham>.tree".
        pham: Pham identifier; converted with ``str()`` for the file names.
        ts_to_gs: Mapping of translation -> sequence of gene ids (indexed
            with ``[0]`` below).
        mat_out: When truthy, a "<pham>.mat" path is passed to
            ``clustalo()``; otherwise None is passed.
        tree_out: When truthy, a "<pham>.tree" path is passed to
            ``clustalo()``; otherwise None is passed.
        infile_type: Forwarded unchanged to ``clustalo()``.
        outfile_type: Forwarded unchanged to ``clustalo()``.

    Returns:
        A ``(pham, aln_path)`` tuple.
    """
    pham_label = str(pham)

    def pham_file(extension):
        # Build "<pham>.<extension>" inside the working directory.
        return working_dir.joinpath(".".join([pham_label, extension]))

    aln_path = pham_file("aln")
    mat_path = pham_file("mat") if mat_out else None
    tree_path = pham_file("tree") if tree_out else None

    # Each distinct translation is written once, under its first gene id.
    representatives = {ids[0]: ts for ts, ids in ts_to_gs.items()}
    pdm_fileio.write_fasta(representatives, aln_path)

    # The alignment overwrites its own input file.
    clustalo(aln_path, aln_path, mat_path, tree_path, infile_type,
             outfile_type)

    pdm_fileio.reintroduce_fasta_duplicates(ts_to_gs, aln_path)

    return (pham, aln_path)
Beispiel #9
0
def estimate_min_distance(source_cluster,
                          target_cluster,
                          temp_dir,
                          data_dir,
                          align=False):
    """Score a representative ("linker") sequence pair of two clusters.

    For each cluster the linker is chosen by ``estimate_linker_sequence()``
    when the cluster has a guide tree (``GT`` is not None) and is the
    cluster centroid otherwise.

    Args:
        source_cluster: First cluster object.
        target_cluster: Second cluster object.
        temp_dir: Directory object exposing ``joinpath()``; holds the
            per-linker fasta files.
        data_dir: Directory object exposing ``joinpath()``; receives the
            pairwise alignment file "<source>__<target>.fasta".
        align: When true, the linkers are aligned with
            ``alignment.pairwise_align(..., tool="needle")``; otherwise
            ``alignment.calculate_levenshtein()`` is used on the sequences.

    Returns:
        With ``align`` true: the alignment's "identity" annotation divided
        by the longer linker length, times 100.  Otherwise: whatever
        ``alignment.calculate_levenshtein()`` returns for the two linker
        sequences with its default arguments.
    """
    def resolve_linker(cluster, other_cluster):
        # Returns (name, sequence, fasta path) of the cluster's linker.
        if cluster.GT is not None:
            leaf = estimate_linker_sequence(cluster,
                                            other_cluster.centroid_seq_str,
                                            temp_dir)
            return (leaf.name,
                    leaf.comment["Sequence"],
                    leaf.comment["FilePath"])

        centroid_id = cluster.CentroidID
        centroid_seq = cluster.centroid_seq_str
        centroid_path = temp_dir.joinpath(".".join([centroid_id, "fasta"]))
        return centroid_id, centroid_seq, centroid_path

    source_linker, source_linker_seq, source_linker_path = resolve_linker(
        source_cluster, target_cluster)
    target_linker, target_linker_seq, target_linker_path = resolve_linker(
        target_cluster, source_cluster)

    if not align:
        return alignment.calculate_levenshtein(source_linker_seq,
                                               target_linker_seq)

    source_len = len(source_linker_seq)
    target_len = len(target_linker_seq)

    # The aligner works on files, so make sure both linkers exist on disk.
    if not source_linker_path.is_file():
        pdm_fileio.write_fasta({source_linker: source_linker_seq},
                               source_linker_path)

    if not target_linker_path.is_file():
        pdm_fileio.write_fasta({target_linker: target_linker_seq},
                               target_linker_path)

    pairwise_path = data_dir.joinpath("".join(
        [source_linker, "__", target_linker, ".fasta"]))

    # Retry on ApplicationError with a short pause between attempts.
    # NOTE(review): there is no attempt limit, so a persistent failure
    # loops forever.
    while True:
        try:
            linker_alignment = alignment.pairwise_align(source_linker_path,
                                                        target_linker_path,
                                                        pairwise_path,
                                                        tool="needle")
        except ApplicationError:
            time.sleep(0.2)
        else:
            break

    # Normalise by the longer of the two linker sequences.
    longer_len = source_len if source_len > target_len else target_len

    return (linker_alignment.annotations["identity"] / longer_len) * 100
Beispiel #10
0
def write_centroids_threadtask(temp_dir, source_path, target_path,
                               source_gs_to_ts, target_gs_to_ts):
    """Create ``temp_dir`` and write the source and target fasta files.

    Args:
        temp_dir: Directory object exposing ``mkdir()``; created first.
        source_path: Output path for the source fasta file.
        target_path: Output path for the target fasta file.
        source_gs_to_ts: Mapping written to ``source_path``.
        target_gs_to_ts: Mapping written to ``target_path``.

    The function returns nothing; its results are the directory and files
    it creates.
    """
    # Called without arguments; assuming temp_dir is a pathlib.Path, this
    # raises if the directory already exists.
    temp_dir.mkdir()

    # Source is written before target, as in the original ordering.
    for mapping, fasta_path in ((source_gs_to_ts, source_path),
                                (target_gs_to_ts, target_path)):
        pdm_fileio.write_fasta(mapping, fasta_path)