Ejemplo n.º 1
0
    def _compute_block(self, seqs1, seqs2, origin):
        """Compute the alignment distances for one block of the distance matrix.

        Every sequence in `seqs1` is globally aligned (parasail
        Needleman-Wunsch) against the sequences in `seqs2`. The distance of a
        pair is the smaller of the two self-alignment scores minus the
        pairwise alignment score.

        Parameters
        ----------
        seqs1
            Sequences forming the rows of the block.
        seqs2
            Sequences forming the columns of the block. If `None`, `seqs1` is
            compared against itself and only the upper triangle (including
            the diagonal) is computed.
        origin
            `(row, col)` offset that is added to the block-local indices of
            every reported entry.

        Returns
        -------
        List of `(distance + 1, row, col)` tuples for all pairs whose
        distance is `<= self.cutoff`.
        """
        subst_mat = parasail.Matrix(self.subst_mat)
        origin_row, origin_col = origin

        square_matrix = seqs2 is None
        if square_matrix:
            seqs2 = seqs1

        # Self-alignment scores are the reference values for turning
        # alignment scores into distances.
        self_alignment_scores1 = self._self_alignment_scores(seqs1)
        if square_matrix:
            self_alignment_scores2 = self_alignment_scores1
        else:
            self_alignment_scores2 = self._self_alignment_scores(seqs2)

        result = []
        for row, s1 in enumerate(seqs1):
            # The query profile depends only on `s1`, so build it once per
            # row instead of once per pair.
            profile = parasail.profile_create_16(s1, subst_mat)
            self_score1 = self_alignment_scores1[row]
            # For a square block, skip the redundant lower triangle.
            col_start = row if square_matrix else 0
            for col, s2 in enumerate(seqs2[col_start:], start=col_start):
                r = parasail.nw_scan_profile_16(
                    profile, s2, self.gap_open, self.gap_extend
                )
                max_score = min(self_score1, self_alignment_scores2[col])
                d = max_score - r.score
                if d <= self.cutoff:
                    # NOTE(review): the +1 offset presumably keeps a distance
                    # of 0 distinguishable from "no entry" in a sparse
                    # matrix -- confirm against the caller.
                    result.append((d + 1, origin_row + row, origin_col + col))

        return result
Ejemplo n.º 2
0
def genome_vs_genome(gen1, gen2):
    """Find homologs in `gen2` for the genes of `gen1`.

    Each gene of `gen1` is written to a temporary FASTA file and aligned
    against `gen2` with the external `opal_aligner`. The output is filtered
    by `parse_opal.py`, and the remaining hits are re-aligned with parasail
    to count matching positions.

    Relies on the module-level names `req_match_fraction` (minimum fraction
    of matching positions) and `num_entries` (stop after this many genes).

    Parameters
    ----------
    gen1
        Path of the query genome FASTA file.
    gen2
        Path of the target genome FASTA file.

    Returns
    -------
    Dict mapping each processed gene id of `gen1` to a list of
    `(hit_id, match_fraction)` tuples.
    """
    homolog_db = {}

    # Define names of files to avoid thread collisions
    fname = gen1[8:-3] + '_' + gen2[8:-3]
    query_gene = 'query_gene_' + fname + '.fa'
    opal_output = 'OPAL_OUTPUT_' + fname + '.txt'
    parse_out = 'PARSE_OUT_' + fname + '.fa'

    iden_matrix = parasail.Matrix("WM_IDENTITY_MATRIX_parasail.txt")
    try:
        for i, gene in enumerate(SeqIO.parse(gen1, "fasta"), start=1):
            gene.id = gene.id.split("|")[0]  # Necessary for H37Rv genome
            SeqIO.write(gene, query_gene, "fasta")

            # Call query_gene against gen2 with opal. Argument lists (no
            # shell) keep paths with spaces or metacharacters intact.
            with open(opal_output, "w") as opal_out:
                subprocess.run(
                    [
                        './opal_aligner', '-o', '0', '-e', '0', '-a', 'NW',
                        '-f', 'Identity_score_matrix.txt', '-x', '1',
                        query_gene, gen2,
                    ],
                    stdout=opal_out,
                )

            # Read in opal output
            subprocess.run([
                './parse_opal.py', opal_output, gen2, parse_out,
                '-t', str(req_match_fraction),
            ])

            # Follow-up opal hits with parasail to count matches
            profile = parasail.profile_create_stats_16(
                str(gene.seq), iden_matrix)
            homolog_ls = []
            for s2 in SeqIO.parse(parse_out, "fasta"):
                s2.id = s2.id.split("|")[0]  # Necessary for H37Rv genome
                result = parasail.nw_stats_scan_profile_16(
                    profile, str(s2.seq), 20, 1)
                match_fraction = result.matches / len(gene.seq)
                if match_fraction > req_match_fraction:
                    homolog_ls.append((s2.id, match_fraction))

            homolog_db[gene.id] = homolog_ls
            if i == num_entries:
                break
    finally:
        # Always clean up, even on errors. A file may not exist if `gen1`
        # had no records or a step failed before creating it.
        for tmp_file in (query_gene, opal_output, parse_out):
            try:
                os.remove(tmp_file)
            except FileNotFoundError:
                pass

    return homolog_db
Ejemplo n.º 3
0
 def _self_alignment_scores(self, seqs: Sequence) -> np.ndarray:
     """Calculate the score of aligning each sequence with itself.

     The self-alignment scores are needed as reference values to turn
     alignment scores into distances.

     Parameters
     ----------
     seqs
         Sequences to align with themselves. Must support `len()`.

     Returns
     -------
     Integer array with one self-alignment score per entry of `seqs`,
     in the same order.
     """
     # The substitution matrix is identical for all sequences; build it once.
     subst_mat = parasail.Matrix(self.subst_mat)
     return np.fromiter(
         (
             parasail.nw_scan_16(
                 s, s, self.gap_open, self.gap_extend, subst_mat
             ).score
             for s in seqs
         ),
         dtype=int,
         count=len(seqs),
     )
Ejemplo n.º 4
0
    def calc_dist_mat(self, seqs: Collection) -> coo_matrix:
        """Calculate the distances between amino acid sequences based on
        all-against-all pairwise sequence alignments.

        Parameters
        ----------
        seqs
            Array of amino acid sequences

        Returns
        -------
        Upper diagonal distance matrix of normalized alignment distances.
        """
        # First, calculate self-alignments. We need them as reference values
        # to turn scores into dists. The substitution matrix is the same for
        # every sequence, so it is created only once.
        subst_mat = parasail.Matrix(self.subst_mat)
        self_alignment_scores = np.array(
            [
                parasail.nw_scan_16(
                    s, s, self.gap_open, self.gap_extend, subst_mat
                ).score
                for s in seqs
            ]
        )

        # One task per matrix row; each task aligns seqs[i] against seqs[i:].
        p = Pool(self.n_jobs)
        try:
            rows = p.starmap_progress(
                self._align_row,
                zip(
                    itertools.repeat(seqs),
                    itertools.repeat(self_alignment_scores),
                    range(len(seqs)),
                ),
                chunksize=200,
                total=len(seqs),
            )
        finally:
            # Release the pool even if one of the alignment tasks failed.
            p.close()

        score_mat = scipy.sparse.vstack(rows)
        score_mat.eliminate_zeros()
        assert score_mat.shape[0] == score_mat.shape[1]

        return score_mat
Ejemplo n.º 5
0
    def _align_row(
        self, seqs: np.ndarray, self_alignment_scores: np.ndarray, i_row: int
    ) -> coo_matrix:
        """Generates a row of the triangular distance matrix.

        Aligns `seqs[i_row]` with all other sequences in `seqs[i_row:]`.

        Parameters
        ----------
        seqs
            Array of amino acid sequences
        self_alignment_scores
            Array containing the scores of aligning each sequence in `seqs`
            with itself. This is used as a reference value to turn
            alignment scores into distances.
        i_row
            Index of the row in the final distance matrix. Determines the
            target sequence.

        Returns
        -------
        The i_th row of the final score matrix as a sparse matrix of shape
        `(1, len(seqs))`. Each stored entry holds `distance + 1`; the row has
        no stored entries if no alignment is within `self.cutoff`.
        """
        subst_mat = parasail.Matrix(self.subst_mat)
        target = seqs[i_row]
        profile = parasail.profile_create_16(target, subst_mat)
        target_self_score = self_alignment_scores[i_row]

        dists = []
        cols = []
        for j, s2 in enumerate(seqs[i_row:], start=i_row):
            r = parasail.nw_scan_profile_16(
                profile, s2, self.gap_open, self.gap_extend
            )
            # The smaller self-alignment score is the reference value.
            max_score = min(target_self_score, self_alignment_scores[j])
            d = max_score - r.score
            if d <= self.cutoff:
                dists.append(d + 1)
                cols.append(j)

        # Building from lists (rather than unpacking `zip(*...)`) also works
        # when no pair is within the cutoff.
        rows = np.zeros(len(cols), dtype="int")
        return coo_matrix(
            (dists, (rows, np.asarray(cols, dtype="int"))),
            dtype=self.DTYPE,
            shape=(1, len(seqs)),
        )
Ejemplo n.º 6
0
    references = {}
    for record in SeqIO.parse(ref_file, "fasta"):
         references[record.id + "_forward"] = str(record.seq)
         references[record.id + "_reverse"] = str(record.seq.reverse_complement())
    return references

# Script entry point: load the inputs named on the command line, process the
# reads, and start building one comma-separated line of per-CpG proportions.
if __name__ == '__main__':

    args = parse_args()
    references = load_reference_dict(args.references)
    cpg_dict = load_cpg_dict(args.cpg_csv)
    cpg_counter = make_cpg_counter(args.cpg_csv)
    # NOTE(review): both handles are opened without a context manager and
    # are not closed within the visible part of the script -- confirm they
    # are closed further down.
    fw = open(str(args.report),"w")
    fw2 = open(str(args.counts),"w")
    nuc_matrix = parasail.Matrix(str(args.substitution_matrix))

    counts, cpg_counts = process_file(str(args.reads), references, cpg_dict, args.sample, cpg_counter,nuc_matrix)

    # The output line starts with the sample name.
    count_str = str(args.sample) + ","

    # Append one value per column of the header, skipping the "sample" column.
    cpg_order = args.cpg_header.split(",")
    for i in cpg_order:
        if not i == "sample":
            c_and_t = cpg_counts[i]["C"] + cpg_counts[i]["T"]
            # Report "NA" unless more than 50 C/T observations are available.
            prop = "NA"
            if c_and_t > 50:
                # Fraction of C among the C and T counts, rounded to 3 digits.
                prop = round(cpg_counts[i]["C"] / c_and_t, 3)
            count_str += f"{prop},"
            
            
Ejemplo n.º 7
0
def load_matrix(matrix_file):
    """Load a parasail substitution matrix from `matrix_file`.

    Parameters
    ----------
    matrix_file
        Path of the substitution matrix file to load.

    Returns
    -------
    The `parasail.Matrix` built from `matrix_file`.
    """
    print(f"Loading substitution matrix from {matrix_file}")
    # Load the file that was requested (and announced above) rather than a
    # hard-coded path; `str()` lets path-like arguments work as well.
    return parasail.Matrix(str(matrix_file))