def _compute_block(self, seqs1, seqs2, origin):
    """Compute the alignment distances for one block of the distance matrix.

    Every sequence in `seqs1` is aligned (parasail `nw_scan_profile_16`)
    against the sequences in `seqs2`. The alignment score is converted into
    a distance by subtracting it from the smaller of the two self-alignment
    scores of the pair.

    Parameters
    ----------
    seqs1
        Sequences forming the rows of the block.
    seqs2
        Sequences forming the columns of the block. If `None`, `seqs1` is
        compared against itself and only the pairs with `col >= row` are
        computed.
    origin
        `(row, col)` offset that is added to the block-local indices of
        every reported pair.

    Returns
    -------
    List of `(distance + 1, row, col)` tuples, one for each pair whose
    distance does not exceed `self.cutoff`.
    """
    subst_mat = parasail.Matrix(self.subst_mat)
    origin_row, origin_col = origin

    square_matrix = seqs2 is None
    if square_matrix:
        seqs2 = seqs1

    self_alignment_scores1 = self._self_alignment_scores(seqs1)
    if square_matrix:
        # Same sequences on both axes: reuse the scores instead of realigning.
        self_alignment_scores2 = self_alignment_scores1
    else:
        self_alignment_scores2 = self._self_alignment_scores(seqs2)

    result = []
    for row, s1 in enumerate(seqs1):
        # The profile only depends on the row sequence, so build it once per
        # row rather than once per (row, col) pair.
        profile = parasail.profile_create_16(s1, subst_mat)
        # In the square case the lower triangle mirrors the upper one; skip it.
        col_start = row if square_matrix else 0
        for col, s2 in enumerate(seqs2[col_start:], start=col_start):
            r = parasail.nw_scan_profile_16(
                profile, s2, self.gap_open, self.gap_extend
            )
            max_score = np.min(
                [self_alignment_scores1[row], self_alignment_scores2[col]]
            )
            d = max_score - r.score
            if d <= self.cutoff:
                # NOTE(review): the +1 offset presumably keeps a distance of 0
                # distinguishable from "no entry" in a sparse matrix -- confirm
                # against the caller.
                result.append((d + 1, origin_row + row, origin_col + col))

    return result
def genome_vs_genome(gen1, gen2):
    """Find homologs in `gen2` for the genes of `gen1`.

    For every FASTA record in `gen1` the record is written to a scratch
    file, searched against `gen2` with the external `opal_aligner`, the
    hits are filtered by `parse_opal.py`, and each remaining hit is
    re-aligned with parasail to count matching positions.

    Reads the module-level names `req_match_fraction` (minimum fraction of
    the query length that must match) and `num_entries` (processing stops
    once this many query records have been handled).

    Parameters
    ----------
    gen1
        Path of the FASTA file holding the query genes.
    gen2
        Path of the FASTA file that is searched.

    Returns
    -------
    Dict mapping each query gene id to a list of `(hit_id, match_fraction)`
    tuples whose match fraction exceeds `req_match_fraction`.
    """
    # Define names of files to avoid thread collisions.
    # NOTE(review): the [8:-3] slice presumably strips a fixed-length
    # directory prefix and a ".fa" suffix -- confirm against the callers.
    fname = gen1[8:-3] + '_' + gen2[8:-3]
    query_gene = 'query_gene_' + fname + '.fa'
    opal_output = 'OPAL_OUTPUT_' + fname + '.txt'
    parse_out = 'PARSE_OUT_' + fname + '.fa'

    iden_matrix = parasail.Matrix("WM_IDENTITY_MATRIX_parasail.txt")
    homolog_db = {}

    try:
        for i, gene in enumerate(SeqIO.parse(gen1, "fasta"), start=1):
            gene.id = gene.id.split("|")[0]  # Necessary for H37Rv genome
            SeqIO.write(gene, query_gene, "fasta")

            # Call query_gene against gen2 with opal. Arguments are passed as
            # a list (no shell) so that unusual characters in the file names
            # cannot be interpreted by a shell; stdout goes to opal_output.
            with open(opal_output, "w") as opal_fh:
                subprocess.run(
                    [
                        './opal_aligner',
                        '-o', '0',
                        '-e', '0',
                        '-a', 'NW',
                        '-f', 'Identity_score_matrix.txt',
                        '-x', '1',
                        query_gene,
                        gen2,
                    ],
                    stdout=opal_fh,
                )

            # Read in opal output and write the surviving hits to parse_out.
            subprocess.run(
                [
                    './parse_opal.py',
                    opal_output,
                    gen2,
                    parse_out,
                    '-t', str(req_match_fraction),
                ]
            )

            # Follow-up opal hits with parasail to count matches.
            profile = parasail.profile_create_stats_16(str(gene.seq), iden_matrix)
            homolog_ls = []
            for s2 in SeqIO.parse(parse_out, "fasta"):
                s2.id = s2.id.split("|")[0]  # Necessary for H37Rv genome
                result = parasail.nw_stats_scan_profile_16(
                    profile, str(s2.seq), 20, 1
                )
                match_fraction = result.matches / len(gene.seq)
                if match_fraction > req_match_fraction:
                    homolog_ls.append((s2.id, match_fraction))
            homolog_db[gene.id] = homolog_ls

            if i == num_entries:
                break
    finally:
        # Always clean up the scratch files, even if a step above failed.
        for scratch in (query_gene, opal_output, parse_out):
            try:
                os.remove(scratch)
            except FileNotFoundError:
                # The file was never created (e.g. gen1 held no records).
                pass

    return homolog_db
def _self_alignment_scores(self, seqs: Sequence) -> np.ndarray:
    """Calculate the score of aligning every sequence with itself.

    The self-alignment scores serve as reference values to turn alignment
    scores into distances.

    Parameters
    ----------
    seqs
        Sequences to align with themselves.

    Returns
    -------
    Integer array with one self-alignment score per entry of `seqs`, in the
    same order.
    """
    # The substitution matrix is the same for every sequence; build it once.
    subst_mat = parasail.Matrix(self.subst_mat)
    return np.fromiter(
        (
            parasail.nw_scan_16(
                s, s, self.gap_open, self.gap_extend, subst_mat
            ).score
            for s in seqs
        ),
        dtype=int,
        count=len(seqs),
    )
def calc_dist_mat(self, seqs: Collection) -> coo_matrix:
    """Calculate the distances between amino acid sequences based on
    all-against-all pairwise sequence alignments.

    Parameters
    ----------
    seqs
        Array of amino acid sequences

    Returns
    -------
    Upper diagonal distance matrix of normalized alignment distances.
    """
    # First, calculate self-alignments. We need them as reference values
    # to turn scores into dists. The substitution matrix is identical for
    # all sequences, so it is built once outside the loop.
    subst_mat = parasail.Matrix(self.subst_mat)
    self_alignment_scores = np.array(
        [
            parasail.nw_scan_16(
                s, s, self.gap_open, self.gap_extend, subst_mat
            ).score
            for s in seqs
        ]
    )

    # One task per row of the matrix; every task receives the full sequence
    # list and the self-alignment scores.
    p = Pool(self.n_jobs)
    try:
        rows = p.starmap_progress(
            self._align_row,
            zip(
                itertools.repeat(seqs),
                itertools.repeat(self_alignment_scores),
                range(len(seqs)),
            ),
            chunksize=200,
            total=len(seqs),
        )
    finally:
        # Close the pool even when a worker raised.
        p.close()

    score_mat = scipy.sparse.vstack(rows)
    score_mat.eliminate_zeros()
    assert score_mat.shape[0] == score_mat.shape[1]

    return score_mat
def _align_row(
    self, seqs: np.ndarray, self_alignment_scores: np.ndarray, i_row: int
) -> coo_matrix:
    """Generates a row of the triangular distance matrix.

    Aligns `seqs[i_row]` with all other sequences in `seqs[i_row:]`.

    Parameters
    ----------
    seqs
        Array of amino acid sequences
    self_alignment_scores
        Array containing the scores of aligning each sequence in `seqs`
        with itself. This is used as a reference value to turn
        alignment scores into distances.
    i_row
        Index of the row in the final distance matrix. Determines the target sequence.

    Returns
    -------
    The i_th row of the final score matrix as a sparse `1 x len(seqs)`
    matrix. Entries hold `distance + 1` for every pair whose distance does
    not exceed `self.cutoff`; the row is empty if there is no such pair.
    """
    subst_mat = parasail.Matrix(self.subst_mat)
    target = seqs[i_row]
    profile = parasail.profile_create_16(target, subst_mat)

    def coord_generator():
        # Only columns j >= i_row are aligned (upper triangle).
        for j, s2 in enumerate(seqs[i_row:], start=i_row):
            r = parasail.nw_scan_profile_16(
                profile, s2, self.gap_open, self.gap_extend
            )
            max_score = np.min(self_alignment_scores[[i_row, j]])
            d = max_score - r.score
            if d <= self.cutoff:
                # Stored offset by one so that a distance of 0 is still an
                # explicit (non-zero) entry of the sparse row.
                yield d + 1, j

    hits = list(coord_generator())
    if not hits:
        # Nothing within the cutoff: `zip(*[])` cannot be unpacked into two
        # names, so return an explicitly empty row instead.
        return coo_matrix((1, len(seqs)), dtype=self.DTYPE)

    d, col = zip(*hits)
    row = np.zeros(len(col), dtype="int")
    return coo_matrix((d, (row, col)), dtype=self.DTYPE, shape=(1, len(seqs)))
references = {} for record in SeqIO.parse(ref_file, "fasta"): references[record.id + "_forward"] = str(record.seq) references[record.id + "_reverse"] = str(record.seq.reverse_complement()) return references if __name__ == '__main__': args = parse_args() references = load_reference_dict(args.references) cpg_dict = load_cpg_dict(args.cpg_csv) cpg_counter = make_cpg_counter(args.cpg_csv) fw = open(str(args.report),"w") fw2 = open(str(args.counts),"w") nuc_matrix = parasail.Matrix(str(args.substitution_matrix)) counts, cpg_counts = process_file(str(args.reads), references, cpg_dict, args.sample, cpg_counter,nuc_matrix) count_str = str(args.sample) + "," cpg_order = args.cpg_header.split(",") for i in cpg_order: if not i == "sample": c_and_t = cpg_counts[i]["C"] + cpg_counts[i]["T"] prop = "NA" if c_and_t > 50: prop = round(cpg_counts[i]["C"] / c_and_t, 3) count_str += f"{prop},"
def load_matrix(matrix_file):
    """Load a parasail substitution matrix from a file.

    Parameters
    ----------
    matrix_file
        Path of the substitution matrix file. If empty or `None`, the
        matrix at ``finding_defining_mutations/substitution_matrix.txt``
        is loaded instead.

    Returns
    -------
    The `parasail.Matrix` read from the file.
    """
    # Previously the argument was only printed and the hard-coded path was
    # always loaded; honour the argument and keep the old path as fallback.
    if matrix_file:
        path = str(matrix_file)
    else:
        path = "finding_defining_mutations/substitution_matrix.txt"
    print(f"Loading substitution matrix from {path}")
    return parasail.Matrix(path)