def ssw_check_parasail(query, target):
    """Align every query against every target on both strands and keep the better hit.

    Each query sequence and its reverse complement are aligned to each target
    sequence with ``parasail.ssw`` (match 2, mismatch -2, gap open 3, gap
    extend 1). The strand with the strictly higher ``score1`` is reported; a
    tie is reported as the ``'-'`` strand.

    Args:
        query: Passed to ``read``; expected to yield ``(id, sequence, quality)``.
        target: Passed to ``read`` once per query record; same record shape.

    Returns:
        dict mapping target id to
        ``[score, qstart, qend, tstart, tend, strand]``. The dict is keyed by
        target id only, so when several query records are present the entry
        written for a later query replaces the one from an earlier query.
    """
    scoring = parasail.matrix_create("ACGT", 2, -2)
    hits_by_target = {}

    for _query_id, query_seq, _query_qual in read(query):
        query_seq_rc = reverse_complement(query_seq)

        for target_id, target_seq, _target_qual in read(target):
            # NOTE(review): other callers of parasail.ssw in this code base
            # check for a None result; here a None would raise AttributeError.
            fwd = parasail.ssw(query_seq, target_seq, 3, 1, scoring)
            rev = parasail.ssw(query_seq_rc, target_seq, 3, 1, scoring)

            if fwd.score1 > rev.score1:
                # Offsets of +1 presumably turn parasail's 0-based positions
                # into 1-based ones -- TODO confirm.
                hit = [
                    fwd.score1,
                    int(fwd.read_begin1 + 1),
                    int(fwd.read_end1 + 1),
                    int(fwd.ref_begin1 + 1),
                    int(fwd.ref_end1 + 1),
                    '+',
                ]
            else:
                # Query coordinates are mirrored back onto the original
                # (non-reversed) query by subtracting from its length.
                hit = [
                    rev.score1,
                    int(len(query_seq) - rev.read_end1),
                    int(len(query_seq) - rev.read_begin1),
                    int(rev.ref_begin1 + 1),
                    int(rev.ref_end1 + 1),
                    '-',
                ]

            hits_by_target[target_id] = hit

    return hits_by_target
def get_refined_bp_sbnd(tconsensus, fasta_file_h, tchr, tstart, tend, tdir, hout_log, margin = 200):
    """Refine a breakpoint position by aligning a reference window to a consensus.

    A window of the reference around ``tstart``-``tend`` (extended by
    ``margin`` on one side, and reverse-complemented when ``tdir`` is not
    ``'+'``) is aligned to ``tconsensus`` with ``parasail.ssw``. The reference
    window is the first sequence given to ``ssw`` and the consensus the
    second, so ``read_end1`` indexes the window and ``ref_end1`` indexes the
    consensus.

    Args:
        tconsensus: Consensus sequence.
        fasta_file_h: Reference handle providing ``get_reference_length`` and
            ``fetch``.
        tchr: Reference sequence name.
        tstart: Start coordinate; values below 1 are raised to 1.
        tend: End coordinate; values above the reference length are lowered
            to it.
        tdir: ``'+'`` or any other value (treated as the minus direction).
        hout_log: Not used by this function.
        margin: Extra reference bases fetched beyond the interval.

    Returns:
        ``(bp_pos_reference, tconsensus_after)`` where ``tconsensus_after`` is
        the part of the consensus after the aligned region, or ``None`` when
        ``parasail.ssw`` returns ``None``.
    """
    # NOTE(review): this truncated consensus is computed but never used; the
    # alignment below runs on the full consensus. Confirm which was intended.
    tconsensus_part = tconsensus[:1000] if len(tconsensus) > 1000 else tconsensus

    # Clamp the interval to the reference sequence.
    chrom_len = fasta_file_h.get_reference_length(tchr)
    tstart = max(tstart, 1)
    tend = min(tend, chrom_len)

    if tdir == '+':
        window = fasta_file_h.fetch(tchr, max(int(tstart) - margin, 0), int(tend))
    else:
        window = reverse_complement(
            fasta_file_h.fetch(tchr, max(int(tstart) - 1, 0), int(tend) + margin)
        )

    scoring = parasail.matrix_create("ACGT", 1, -2)
    aln = parasail.ssw(window, tconsensus, 3, 1, scoring)
    if aln is None:
        logger.debug(f"Alignment for breakpoint localization failed for {tchr},{tstart},{tend},{tdir}")
        return None

    # Number of window bases lying beyond the end of the alignment.
    unaligned_tail = len(window) - aln.read_end1 - 1
    if tdir == '+':
        bp_pos_reference = tend - unaligned_tail
    else:
        bp_pos_reference = tstart + unaligned_tail

    return (bp_pos_reference, tconsensus[(aln.ref_end1 + 1):])
def generate_paf_file(query_fasta, target_fasta, output_file):
    """Align each query sequence line to the target and write PAF-style rows.

    The target file is scanned completely; the id of its last header line
    (text before the first space, leading ``>`` removed) and its last
    non-header line are used as the target. Every non-header line of the
    query file is then aligned to that target with ``parasail.ssw`` (match 2,
    mismatch -2, gap open 3, gap extend 1) and one tab-separated row is
    written per successful alignment. Strand is always written as ``+``,
    columns 10 and 11 as ``*`` and the last column as ``60``.

    Args:
        query_fasta: Path of the query FASTA file.
        target_fasta: Path of the target FASTA file.
        output_file: Path of the file to write.

    Returns:
        None. A warning is logged for every alignment that returns ``None``.
    """
    scoring = parasail.matrix_create("ACGT", 2, -2)

    with open(target_fasta, 'r') as target_fh:
        for raw in target_fh:
            text = raw.rstrip('\n')
            if raw.startswith('>'):
                tid = text.split(' ')[0].lstrip('>')
            else:
                # Each sequence line overwrites the previous one.
                tseq = text

    with open(query_fasta, 'r') as query_fh, open(output_file, 'w') as hout:
        for raw in query_fh:
            text = raw.rstrip('\n')
            if raw.startswith('>'):
                # Unlike the target id, the whole header line is kept.
                qid = text.lstrip('>')
                continue

            qseq = text
            res = parasail.ssw(qseq, tseq, 3, 1, scoring)
            if res is None:
                logger.warning(
                    f'Error occured in the alignment of {qid} and {tid} via parasail'
                )
                continue

            row = (qid, len(qseq), res.read_begin1, res.read_end1,
                   tid, len(tseq), res.ref_begin1, res.ref_end1)
            print("%s\t%d\t%d\t%d\t+\t%s\t%d\t%d\t%d\t*\t*\t60" % row, file=hout)
def generate_paf_file(self, query_fasta, target_fasta, output_file):
    """Align each query sequence line to the target and write PAF-style rows.

    The target is taken from the last header line (text before the first
    space, leading ``>`` removed) and the last non-header line of
    ``target_fasta``. Every non-header line of ``query_fasta`` is aligned to
    it with ``parasail.ssw`` (match 2, mismatch -2, gap open 3, gap extend 1).

    Args:
        query_fasta: Path of the query FASTA file.
        target_fasta: Path of the target FASTA file.
        output_file: Path of the file to write.

    Returns:
        Number of rows written to ``output_file``. For each alignment that
        returns ``None`` the pair ``(qid, tid)`` is appended to
        ``self.parasail_error`` instead of writing a row.
    """
    scoring = parasail.matrix_create("ACGT", 2, -2)
    paf_rec_count = 0

    with open(target_fasta, 'r') as target_fh:
        for raw in target_fh:
            text = raw.rstrip('\n')
            if raw.startswith('>'):
                tid = text.split(' ')[0].lstrip('>')
            else:
                # Each sequence line overwrites the previous one.
                tseq = text

    with open(query_fasta, 'r') as query_fh, open(output_file, 'w') as hout:
        for raw in query_fh:
            text = raw.rstrip('\n')
            if raw.startswith('>'):
                # Unlike the target id, the whole header line is kept.
                qid = text.lstrip('>')
                continue

            qseq = text
            res = parasail.ssw(qseq, tseq, 3, 1, scoring)
            if res is None:
                self.parasail_error.append((qid, tid))
                continue

            print(f"{qid}\t{len(qseq)}\t{res.read_begin1}\t{res.read_end1}\t+\t" +
                  f"{tid}\t{len(tseq)}\t{res.ref_begin1}\t{res.ref_end1}\t*\t*\t60",
                  file=hout)
            paf_rec_count += 1

    return paf_rec_count
def ssw_alignment(s1, s2, match_score=2, mismatch_penalty=-2, opening_penalty=3, gap_ext=1):
    """Run ``parasail.ssw`` on two sequences and dump the result object.

    Builds an ACGT scoring matrix from ``match_score`` / ``mismatch_penalty``,
    aligns ``s1`` against ``s2`` with the given gap penalties, and prints the
    result object, its type, its ``dir()`` listing (twice) and every entry of
    its ``__dict__``.

    NOTE(review): this looks like unfinished exploratory code. The return
    statement refers to ``s1_alignment``, ``s2_alignment``, ``cigar_string``
    and ``cigar_tuples``, none of which is assigned in this function, so it
    will raise ``NameError`` unless those names exist at module level. It also
    reads ``result.score``; other users of ``parasail.ssw`` read ``score1`` --
    confirm which attribute exists.
    """
    scoring = parasail.matrix_create("ACGT", match_score, mismatch_penalty)
    result = parasail.ssw(s1, s2, opening_penalty, gap_ext, scoring)

    # Debug output: inspect what the SSW result object exposes.
    members = dir(result)
    print(result, type(result), members)
    print(members)
    for attr_name, attr_value in result.__dict__.items():
        print(attr_name, attr_value)

    return s1_alignment, s2_alignment, cigar_string, cigar_tuples, result.score
def generate_paf_file(query_fasta, target_fasta, output_file):
    """Align each query sequence line to the target and write PAF-style rows.

    The target is taken from the last header line (leading ``>`` removed, the
    rest of the line kept) and the last non-header line of ``target_fasta``.
    Every non-header line of ``query_fasta`` is aligned to it with
    ``parasail.ssw`` (match 2, mismatch -2, gap open 3, gap extend 1) and one
    tab-separated row is written per alignment. Strand is always written as
    ``+``, columns 10 and 11 as ``*`` and the last column as ``60``.

    Args:
        query_fasta: Path of the query FASTA file.
        target_fasta: Path of the target FASTA file.
        output_file: Path of the file to write.

    Returns:
        None. Query sequences for which ``parasail.ssw`` yields ``None`` are
        skipped with a logged warning instead of raising ``AttributeError``.
    """
    import logging  # function-scope import so the block does not rely on file-level names

    user_matrix = parasail.matrix_create("ACGT", 2, -2)

    with open(target_fasta, 'r') as hin:
        for line in hin:
            if line.startswith('>'):
                tid = line.rstrip('\n').lstrip('>')
            else:
                # Each sequence line overwrites the previous one.
                tseq = line.rstrip('\n')

    with open(query_fasta, 'r') as hin, open(output_file, 'w') as hout:
        for line in hin:
            if line.startswith('>'):
                qid = line.rstrip('\n').lstrip('>')
                continue

            qseq = line.rstrip('\n')
            res = parasail.ssw(qseq, tseq, 3, 1, user_matrix)
            if res is None:
                # The alignment produced no result object; writing a row would
                # dereference None, so report the pair and move on.
                logging.getLogger(__name__).warning(
                    "parasail.ssw returned no alignment for %s against %s; record skipped",
                    qid, tid)
                continue

            print("%s\t%d\t%d\t%d\t+\t%s\t%d\t%d\t%d\t*\t*\t60" %
                  (qid, len(qseq), res.read_begin1, res.read_end1,
                   tid, len(tseq), res.ref_begin1, res.ref_end1),
                  file=hout)

# NOTE(review): the original text ended with a dangling, unterminated
# triple-quote after this function; it has been dropped so the code parses.
# Command-line interface: both input paths are mandatory.
parser = argparse.ArgumentParser(
    description='Align inserts to a series of target sequences')
parser.add_argument('--inserts', required=True)
parser.add_argument('--targets', required=True)
args = parser.parse_args()

# Read in the control sequences. The targets file is consumed two lines at a
# time: the first line of each pair is kept (stripped) as the name and the
# second as the sequence. A trailing unpaired line is ignored.
with open(args.targets) as control_fh:
    control_seqs = [
        [name_line.strip(), seq_line.strip()]
        for name_line, seq_line in zip(control_fh, control_fh)
    ]

scoring_matrix = parasail.matrix_create("ACGT", 5, -1)

print("Read\tReadLen\tReadStart\tReadEnd\tRef\tRefLen\tRefStart\tRefEnd\tScore\tMatches")
for record in pysam.FastxFile(args.inserts):
    # Align this read against all oligos.
    for control_name, control_sequence in control_seqs:
        if len(record.sequence) == 0:
            continue

        # The SSW call supplies coordinates and score; the stats call is run
        # on the same pair only to obtain the number of matches.
        result = parasail.ssw(
            record.sequence, control_sequence, 5, 4, scoring_matrix)
        result2 = parasail.sw_stats_table_striped_16(
            record.sequence, control_sequence, 5, 4, scoring_matrix)

        columns = [
            record.name,
            str(len(record.sequence)),
            str(result.read_begin1),
            str(result.read_end1),
            control_name,
            str(len(control_sequence)),
            str(result.ref_begin1),
            str(result.ref_end1),
            str(result.score1),
            str(result2.matches),
        ]
        print("\t".join(columns))
def filter_sv_insertion_match(self, sv, ins, filter_item="Duplicate_with_insertion"):
    """Flag ``sv`` when its junction sequence is reproduced by a nearby insertion.

    Two sequences are assembled from the reference and aligned with
    ``parasail.ssw`` (match 2, mismatch -2, gap open 3, gap extend 1):

    * ``ins_seg``: reference flank before ``ins.pos1`` + ``ins.inseq`` +
      reference flank after ``ins.pos2``.
    * ``sv_seg``: flank at breakpoint 1 + inserted sequence + flank at
      breakpoint 2, with orientation chosen from ``sv.dir1`` / ``sv.dir2``.

    ``sv_seg`` and its reverse complement are each aligned against
    ``ins_seg``; the strictly higher-scoring alignment is used (ties use the
    reverse-complement one). ``filter_item`` is appended to ``sv.filter`` when
    the score divided by twice the aligned span on ``ins_seg`` exceeds 0.75
    and the alignment starts within the first 10% and ends within the last
    10% of ``sv_seg``.

    Args:
        sv: Candidate with ``chr1/pos1/dir1``, ``chr2/pos2/dir2``, ``inseq``
            and a ``filter`` list.
        ins: Insertion-type candidate with ``chr1``, ``pos1``, ``pos2``
            and ``inseq``.
        filter_item: Label appended to ``sv.filter`` on a match.

    Returns:
        None; the only effect is the possible append to ``sv.filter``.
    """
    # only apply when the first sv is not insertion and the second sv is insertion type
    if len(sv.inseq) >= 100 or len(ins.inseq) < 100:
        return

    # Require at least one breakpoint of sv to lie close to the insertion.
    if not ((sv.chr1 == ins.chr1 and abs(sv.pos1 - ins.pos1) <= self.bp_dist_margin)
            or (sv.chr2 == ins.chr2 and abs(sv.pos2 - ins.pos2) <= self.bp_dist_margin)):
        return

    fetch = self.reference_h.fetch
    flank = self.validate_seg_len
    margin = self.bp_dist_margin

    # Insertion haplotype: left flank + inserted sequence + right flank.
    ins_seg = (
        fetch(ins.chr1, max(ins.pos1 - flank - margin - 1, 0), ins.pos1 - 1)
        + ins.inseq
        + fetch(ins.chr1, ins.pos2 - 1, ins.pos2 + flank + margin - 1)
    )

    # Breakpoint 1 side, followed by the inserted sequence of sv.
    if sv.dir1 == '+':
        bp1_flank = fetch(sv.chr1, max(sv.pos1 - flank - 1, 0), sv.pos1 - 1)
        sv_seg = bp1_flank + sv.inseq
    else:
        bp1_flank = reverse_complement(
            fetch(sv.chr1, sv.pos1 - 1, sv.pos1 + flank - 1))
        sv_seg = bp1_flank + reverse_complement(sv.inseq)

    # Breakpoint 2 side.
    if sv.dir2 == '-':
        bp2_flank = fetch(sv.chr2, sv.pos2 - 1, sv.pos2 + flank - 1)
    else:
        bp2_flank = reverse_complement(
            fetch(sv.chr2, max(sv.pos2 - flank - 1, 0), sv.pos2 - 1))
    sv_seg = sv_seg + bp2_flank

    scoring = parasail.matrix_create("ACGT", 2, -2)
    # NOTE(review): a None result from parasail.ssw is not handled here and
    # would raise AttributeError on the score comparison below.
    fwd = parasail.ssw(sv_seg, ins_seg, 3, 1, scoring)
    rev = parasail.ssw(reverse_complement(sv_seg), ins_seg, 3, 1, scoring)
    best = fwd if fwd.score1 > rev.score1 else rev

    # With a match score of 2, a gap-free perfect alignment gives a ratio of 1.
    match_ratio = float(best.score1) / (2 * (best.ref_end1 - best.ref_begin1 + 1))
    if (match_ratio > 0.75
            and best.read_begin1 < 0.1 * len(sv_seg)
            and best.read_end1 > 0.9 * len(sv_seg)):
        sv.filter.append(filter_item)

    return