def extract_seqs_for_edlib(ref_seq, contig_seq, rstart, rend, qstart, qend):
    """Cut the aligned reference and contig regions out of their sequences.

    Coordinates are 1-based and the end coordinate is inclusive.  For each
    sequence, if ``end > start`` the region ``[start, end]`` is returned
    as-is; otherwise (including ``end == start``) the region ``[end, start]``
    is taken and passed through ``fastqparser.revcomp_seq``.

    NOTE(review): a second function with this same name (different
    signature) appears later in this source; if both live in the same
    module the later definition shadows this one -- confirm which is used.

    Returns:
        A two-element list ``[nw_ref, nw_contig]``.
    """
    def _oriented_region(seq, start, end):
        # Slice start is (coord - 1) because coordinates are 1-based; the
        # slice end is the raw coord because the end base is inclusive.
        if end > start:
            return seq[(start - 1):end]
        return fastqparser.revcomp_seq(seq[(end - 1):start])

    nw_ref = _oriented_region(ref_seq, rstart, rend)
    nw_contig = _oriented_region(contig_seq, qstart, qend)
    return [nw_ref, nw_contig]
def get_circular_score(ref_path, contig_path, temp_folder):
    """Run Edlib on the contigs against a doubled, reverse-complemented reference.

    Every reference sequence read from ``ref_path`` is written out
    concatenated with itself (presumably to emulate a circular reference --
    confirm with the author), in two FASTA files inside ``temp_folder``:

    * ``circ-fwd.fa`` -- the sequence as read;
    * ``circ-rev.fa`` -- its reverse complement.

    Only the reverse file is aligned: Edlib is invoked with ``-m HW`` on
    ``contig_path`` and ``circ-rev.fa`` and the parsed scores are printed
    to stdout, one per line, followed by a blank line.  The forward file
    is still produced but not aligned.

    Args:
        ref_path: Path of the reference FASTA/FASTQ file.
        contig_path: Path of the contig file handed to Edlib.
        temp_folder: Working folder for the intermediate files; created if
            it does not exist.

    Returns:
        None.  The scores are only written to stdout.
    """
    if not os.path.exists(temp_folder):
        os.makedirs(temp_folder)

    # Quality strings are not needed: the output is always FASTA.
    [headers_ref, seqs_ref, _] = fastqparser.read_fastq(ref_path)

    circularized_fwd_path = '%s/circ-fwd.fa' % (temp_folder)
    circularized_rev_path = '%s/circ-rev.fa' % (temp_folder)

    # 'with' guarantees both files are closed even if a write or the
    # reverse-complement step raises.
    with open(circularized_fwd_path, 'w') as fp_fwd, \
            open(circularized_rev_path, 'w') as fp_rev:
        for header, seq in zip(headers_ref, seqs_ref):
            rev_seq = fastqparser.revcomp_seq(seq)
            fp_fwd.write('>%s\n%s%s\n' % (header, seq, seq))
            fp_rev.write('>%s\n%s%s\n' % (header, rev_seq, rev_seq))

    sys.stdout.write('Aligning the rev orientation...\n')
    command = '%s %s %s -m HW' % (EDLIB_PATH, contig_path, circularized_rev_path)
    [rc_rev, rstdout_rev, rstderr_rev] = execute_command_with_ret(DRY_RUN, command)
    scores_rev = parse_edlib_scores(rstdout_rev)
    for i, score in enumerate(scores_rev):
        sys.stdout.write('[%d] %d %s\n' % (i, score, 'rev'))
    sys.stdout.write('\n')
def extract_seqs_for_edlib(temp_folder, temp_suffix, ref_path, contig_path, rstart, rend, qstart, qend, is_fwd, rname, qname, generate_kmer_spectrum=False):
    """Extract an aligned reference/contig region pair and score it with Edlib.

    The named reference and contig are looked up in their files, the
    requested regions are cut out (coordinates are 1-based, end inclusive),
    written to ``nw-ref<suffix>.fasta`` and ``nw-contig<suffix>.fasta`` in
    ``temp_folder``, and Edlib is run on them with ``-m NW``.  A summary
    line is written to stderr in which the final edit distance is the first
    Edlib score plus the number of reference bases outside the region.

    Args:
        temp_folder: Working folder for intermediate files; created if
            missing.
        temp_suffix: String inserted into the intermediate file names.
        ref_path: Path of the reference FASTA/FASTQ file.
        contig_path: Path of the contig FASTA/FASTQ file.
        rstart, rend: Reference region; ``rend`` must not be smaller than
            ``rstart``.
        qstart, qend: Contig region.  If the coordinates run "backwards"
            for the given strand, the region is taken to wrap around the
            end of the contig sequence.
        is_fwd: True if the contig region is on the forward strand; if
            False the extracted region is reverse-complemented.
        rname: Reference sequence name, looked up via ``hash_headers``.
        qname: Contig sequence name, looked up via ``hash_headers``.
        generate_kmer_spectrum: If True, additionally run the k-mer
            comparison tool on the two intermediate files, writing to
            ``nw-kmers<suffix>.spect``.

    Returns:
        None.  Results are reported on stderr/stdout and in the files.

    Exits the process with status 1 if either name is not found or if
    ``rend < rstart``.
    """
    if not os.path.exists(temp_folder):
        os.makedirs(temp_folder)

    [headers_ref, seqs_ref, _] = fastqparser.read_fastq(ref_path)
    [headers_contig, seqs_contig, _] = fastqparser.read_fastq(contig_path)

    ref_hash = hash_headers(headers_ref)
    contig_hash = hash_headers(headers_contig)

    if rname not in ref_hash:
        sys.stderr.write(
            'ERROR: Reference name "%s" not found in file "%s"! Exiting.\n' % (rname, ref_path))
        sys.exit(1)
    if qname not in contig_hash:
        sys.stderr.write(
            'ERROR: Contig name "%s" not found in file "%s"! Exiting.\n' % (qname, contig_path))
        sys.exit(1)
    if rend < rstart:
        # The original message formatted undefined names (NameError) and
        # stated the constraint the wrong way around.
        sys.stderr.write(
            'ERROR: Reference start should come before reference end (it is expected that the ref is forward oriented), but ref_start = %d, ref_end = %d. Exiting.\n' % (rstart, rend))
        sys.exit(1)

    rid = ref_hash[rname]
    ref_header = headers_ref[rid]
    full_ref_seq = seqs_ref[rid]
    ref_seq = full_ref_seq[(rstart - 1):rend]  # Coordinates are 1-based.

    qid = contig_hash[qname]
    contig_header = headers_contig[qid]
    full_contig_seq = seqs_contig[qid]
    if is_fwd:
        if qend >= qstart:
            contig_seq = full_contig_seq[(qstart - 1):qend]
        else:
            # Region wraps past the end of the contig back to its start.
            contig_seq = full_contig_seq[(qstart - 1):] + full_contig_seq[0:qend]
    else:
        if qend > qstart:
            # Reverse-strand region that wraps around the contig end.
            contig_seq = full_contig_seq[(qend - 1):] + full_contig_seq[0:qstart]
        else:
            contig_seq = full_contig_seq[(qend - 1):qstart]
        # Both reverse-strand cases are brought to the reference orientation.
        contig_seq = fastqparser.revcomp_seq(contig_seq)

    nw_ref_path = '%s/nw-ref%s.fasta' % (temp_folder, temp_suffix)
    nw_contig_path = '%s/nw-contig%s.fasta' % (temp_folder, temp_suffix)
    nw_kmer_comp_path = '%s/nw-kmers%s.spect' % (temp_folder, temp_suffix)

    # 'with' closes (and flushes) the files before Edlib reads them, even
    # if a write fails.
    with open(nw_ref_path, 'w') as fp_nw_ref:
        fp_nw_ref.write('>%s\n%s\n' % (ref_header, ref_seq))
    with open(nw_contig_path, 'w') as fp_nw_contig:
        fp_nw_contig.write('>%s\n%s\n' % (contig_header, contig_seq))

    sys.stderr.write('Running Edlib to determine the edit distance...\n')
    command = '%s %s %s -m NW' % (EDLIB_PATH, nw_contig_path, nw_ref_path)
    [rc, rstdout, rstderr] = execute_command_with_ret(DRY_RUN, command)
    scores = parse_edlib_scores(rstdout)

    # Reference bases outside the extracted region count as edits.
    unaligned_len = len(full_ref_seq) - len(ref_seq)
    if len(scores) == 0:
        # Report the problem and skip the summary; indexing scores[0] here
        # would raise IndexError.
        sys.stderr.write(
            'ERROR: len(scores) == 0!\nreturn code: %d\nrstdout:\n%s\n' % (rc, rstdout))
    else:
        sys.stderr.write(
            'Final edit distance: %d, aligned edit distance: %d, unaligned ref len: %d, aligned ref len: %d, aligned contig len: %d\n' % (
                (scores[0] + unaligned_len), scores[0], unaligned_len, len(ref_seq), len(contig_seq)))
    sys.stdout.write('\n')

    if generate_kmer_spectrum:
        sys.stderr.write('Generating the kmer spectrum.\n')
        command = '%s -o %s %s %s' % (KMERCOMP_PATH, nw_kmer_comp_path, nw_contig_path, nw_ref_path)
        [rc, rstdout, rstderr] = execute_command_with_ret(DRY_RUN, command)
        sys.stderr.write('Stdout:\n%s\nStderr:\n%s\n' % (rstdout, rstderr))
        sys.stderr.write('Done generating the kmer spectrum!\n')
# In this case, something weird happened. Most likely the header got messed up. # Another option is that someone changed the reads file. In any case, if the original read # cannot be found, we will call this alignment unmapped. seq = '*'; flag = 4; qual = '*'; if (len(read_quals) > 0): try: qual = read_quals[read_header_hash[qseqid]]; except: qual = '*'; sam_start = (int(sstart) - 1) if (sstrand == 'plus') else (int(send) - 1); sam_end = (int(send) - 1) if (sstrand == 'plus') else (int(sstart) - 1); sam_seq = (seq) if (sstrand == 'plus' or seq == '*') else (fastqparser.revcomp_seq(seq)); # Reverse the seq field if necessary. sam_qual = (qual) if (sstrand == 'plus' or qual == '*') else (qual[::-1]); # Reverse the quality values if necessary. num_clip_front = int(qstart) - 1; num_clip_back = int(qlen) - (int(qend)); sam_cigar = convert_btop_to_cigar(btop, num_clip_front, num_clip_back, sstrand); sam_line = ''; sam_line += '%s\t' % (qseqid); # 1. qname sam_line += '%d\t' % (flag); # 2. flag sam_line += '%s\t' % (sseqid); # 3. rname sam_line += '%d\t' % (sam_start + 1); # 4. pos sam_line += '255\t'; # 5. mapq sam_line += '%s\t' % (sam_cigar); # 6. CIGAR sam_line += '*\t'; # 7. rnext sam_line += '0\t'; # 8. pnext sam_line += '0\t'; # 9. tlen