def eval_contigs(ref_path, contig_path, temp_folder, generate_kmer_spectrum=False):
    if not os.path.exists(temp_folder):
        os.makedirs(temp_folder)

    [headers_contigs, seqs_contigs, quals_contigs] = fastqparser.read_fastq(contig_path)
    [headers_ref, seqs_ref, quals_ref] = fastqparser.read_fastq(ref_path)
    ref_hash = hash_headers(headers_ref)
    contig_hash = hash_headers(headers_contigs)

    single_contig_path = '%s/singlecontig.fasta' % (temp_folder)

    for i in xrange(0, len(seqs_contigs)):
        contig_name = headers_contigs[i].split()[0]
        contig_seq = seqs_contigs[i]

        # Write the current contig to its own FASTA file so MUMmer aligns it alone.
        fp_contig = open(single_contig_path, 'w')
        fp_contig.write('>%s\n%s\n' % (contig_name, contig_seq))
        fp_contig.close()

        nucmer_out_prefix = '%s/nucmer' % (temp_folder)
        sys.stderr.write('\n')
        sys.stderr.write('Running MUMmer on contig: "%s"\n' % (contig_name))
        command = '%s --maxmatch --extend -p %s %s %s; delta-filter -r -q %s.delta > %s.filt.delta; show-coords -r -c %s.filt.delta > %s.filt.coords' % \
                  (NUCMER_PATH, nucmer_out_prefix, ref_path, single_contig_path,
                   nucmer_out_prefix, nucmer_out_prefix, nucmer_out_prefix, nucmer_out_prefix)
        [rc, rstdout, rstderr] = execute_command_with_ret(DRY_RUN, command)

        sys.stderr.write('\n')
        sys.stderr.write('Parsing the coords file.\n')
        coords_path = '%s.filt.coords' % (nucmer_out_prefix)
        fp = open(coords_path, 'r')
        lines = fp.readlines()
        fp.close()
        coords = parse_coords_lines(lines, contig_name, seqs_ref, ref_hash, seqs_contigs, contig_hash)

        print ''
        print 'coords: "%s"' % (coords)
        print 'lines:'
        for line in lines:
            print line
        sys.stdout.flush()

        [rstart, rend, qstart, qend, is_fwd, rname, qname] = coords
        extract_seqs_for_edlib(temp_folder, '.%d' % (i), ref_path, contig_path,
                               rstart, rend, qstart, qend, is_fwd, rname, qname,
                               generate_kmer_spectrum=generate_kmer_spectrum)
        sys.stderr.write('\n')
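# Example usage (a sketch; assumes the MUMmer binaries referenced through NUCMER_PATH,
# plus delta-filter and show-coords, are installed, and that the helper functions used
# above are available in this module; file names are hypothetical):
#   eval_contigs('ref.fa', 'contigs.fa', 'temp-eval')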
def extractFromFAST(fast_fname, qnames_fname):
    sys.stderr.write('\nLoading qnames file!')
    qnames_dict = {}
    with open(qnames_fname, 'rU') as qnames_f:
        qnames = qnames_f.readlines()
    # Creating a dictionary for faster search
    # Also removing '\n' from the end
    for qname in qnames:
        qnames_dict[qname[:-1]] = 1

    sys.stderr.write('\nLoading FASTA/FASTQ file!')
    [headers, seqs, quals] = read_fastq(fast_fname)

    # The output format follows the input file's extension.
    fname, fext = os.path.splitext(fast_fname)

    sys.stderr.write('\nExtracting ...')
    for i in xrange(len(headers)):
        header = headers[i]
        seq = seqs[i]
        qual = quals[i]
        # Match either the full header or its first token against the qnames list.
        qname = header.split()[0]
        if qname in qnames_dict or header in qnames_dict:
            if fext.lower() in ['.fa', '.fna', '.faa', '.fasta']:
                sys.stdout.write('>' + header + '\n')
                sys.stdout.write(seq + '\n')
            elif fext.lower() in ['.fq', '.fastq']:
                sys.stdout.write('@' + header + '\n')
                sys.stdout.write(seq + '\n')
                sys.stdout.write('+' + header + '\n')
                sys.stdout.write(qual + '\n')
    sys.stderr.write('\nFinished!')
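# Example (a sketch, hypothetical file names): extract the records listed in
# 'qnames.txt' from 'reads.fastq'; matching records go to stdout, so redirect it
# to collect the subset:
#   extractFromFAST('reads.fastq', 'qnames.txt')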
def prepare_human_genome(genome_file):
    filename, file_extension = os.path.splitext(genome_file)
    processed_genome_file = filename + '_P' + file_extension
    [headers, seqs, quals] = read_fastq(genome_file)
    with open(processed_genome_file, 'w') as pgfile:
        for i in range(len(headers)):
            header = headers[i]
            new_header = 'chr19'
            seq = seqs[i]
            qual = quals[i]
            # Keep only the primary-assembly chromosome 19 record, renamed to 'chr19'.
            if header.find('chromosome 19') > -1 and header.find('Primary Assembly') > -1:
                if file_extension.lower() in ['.fa', '.fna', '.faa', '.fasta']:
                    pgfile.write('>' + new_header + '\n')
                    pgfile.write(seq + '\n')
                elif file_extension.lower() in ['.fq', '.fastq']:
                    pgfile.write('@' + new_header + '\n')
                    pgfile.write(seq + '\n')
                    pgfile.write('+' + new_header + '\n')
                    pgfile.write(qual + '\n')
                else:
                    pgfile.write('@ERROR occurred. File is NOT COMPLETE!')
                    raise Exception('Invalid file extension: %s' % file_extension)
                break
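# Example (a sketch, hypothetical input): given an NCBI-style assembly FASTA such as
# 'GRCh38.fa' whose chromosome 19 header mentions both 'chromosome 19' and
# 'Primary Assembly', this writes a single renamed record to 'GRCh38_P.fa':
#   prepare_human_genome('GRCh38.fa')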
def prepare_dm_genome(genome_file):
    filename, file_extension = os.path.splitext(genome_file)
    processed_genome_file = filename + '_P' + file_extension
    [headers, seqs, quals] = read_fastq(genome_file)
    with open(processed_genome_file, 'w') as pgfile:
        for i in range(len(headers)):
            header = headers[i]
            new_header = 'ERROR!'       # In case it somehow slips through
            seq = seqs[i]
            qual = quals[i]
            goodLine = True
            # Check if the header contains any disqualifying entries
            for badstring in bad_strings_genomes:
                if header.find(badstring) > -1:
                    goodLine = False
                    break
            if goodLine:
                pos = header.find('chromosome')
                if pos > -1:
                    pos2 = header[pos:].find(' ')
                    pos3 = header[pos + pos2 + 1:].find(' ')    # Looking for the second space
                    if pos3 == -1:
                        new_header = 'chr' + header[pos + pos2 + 1:]
                    else:
                        new_header = 'chr' + header[pos + pos2 + 1:pos + pos2 + 1 + pos3]
                elif header.find('chr') > -1:
                    # If we can find 'chr' and not 'chromosome', assume the header is already as it should be
                    new_header = header
                else:
                    pos = header.find('mitochondrion')
                    if pos > -1:
                        new_header = 'chrM'
                    else:
                        # This shouldn't happen
                        raise Exception('Invalid DM genome header: %s!' % header)
            if goodLine:
                if file_extension.lower() in ['.fa', '.fna', '.faa', '.fasta']:
                    pgfile.write('>' + new_header + '\n')
                    pgfile.write(seq + '\n')
                elif file_extension.lower() in ['.fq', '.fastq']:
                    pgfile.write('@' + new_header + '\n')
                    pgfile.write(seq + '\n')
                    pgfile.write('+' + new_header + '\n')
                    pgfile.write(qual + '\n')
                else:
                    pgfile.write('@ERROR occurred. File is NOT COMPLETE!')
                    raise Exception('Invalid file extension: %s' % file_extension)
def TEST_SAM_TO_CONTIG(single_contig_file, contig_sam, output_alt_contig_fasta):
    [ctg_headers, ctg_seqs, ctg_quals] = fastqparser.read_fastq(single_contig_file)
    [headers, contig_sams] = utility_sam.LoadSAM(contig_sam)
    [new_contig, non_clipped_len, new_contig_cigar] = construct_contig_from_overlapping_sams(ctg_seqs, contig_sams)
    fp = open(output_alt_contig_fasta, 'w')
    fp.write('>Alternate contig\n')
    fp.write('%s\n' % new_contig)
    fp.close()
def split(readsfile, namesfile):
    fname, fext = os.path.splitext(readsfile)
    readsfile1 = fname + '1' + fext
    readsfile2 = fname + '2' + fext
    file1 = open(readsfile1, 'w')
    file2 = open(readsfile2, 'w')

    [headers, seqs, quals] = read_fastq(readsfile)
    names = []
    nfile = open(namesfile, 'rU')
    for line in nfile:
        names.append(line[:-1])

    count1 = count2 = 0
    for i in range(len(headers)):
        header = headers[i]
        # Removing everything after the first space (if there is one)
        pos = header.find(' ')
        if pos > -1:
            header = header[:pos]
        seq = seqs[i]
        qual = quals[i]
        if header in names:
            if fext.lower() in ['.fa', '.fna', '.faa', '.fasta']:
                file1.write('>' + header + '\n')
                file1.write(seq + '\n')
            elif fext.lower() in ['.fq', '.fastq']:
                file1.write('@' + header + '\n')
                file1.write(seq + '\n')
                file1.write('+' + header + '\n')
                file1.write(qual + '\n')
            else:
                raise Exception('Invalid extension for reads file: %s' % fext)
            count1 += 1
        else:
            if fext.lower() in ['.fa', '.fna', '.faa', '.fasta']:
                file2.write('>' + header + '\n')
                file2.write(seq + '\n')
            elif fext.lower() in ['.fq', '.fastq']:
                file2.write('@' + header + '\n')
                file2.write(seq + '\n')
                file2.write('+' + header + '\n')
                file2.write(qual + '\n')
            count2 += 1

    file1.close()
    file2.close()
    sys.stderr.write('\n%d reads in file1; %d reads in file2\n' % (count1, count2))
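# The FASTA/FASTQ write branch above recurs in several functions in this module.
# A minimal sketch of a shared helper that could replace those branches (hypothetical,
# not part of the original module; fext must include the leading dot, as returned by
# os.path.splitext):
def write_record(fp, fext, header, seq, qual=''):
    # Write one record, choosing FASTA or FASTQ layout by file extension.
    if fext.lower() in ['.fa', '.fna', '.faa', '.fasta']:
        fp.write('>%s\n%s\n' % (header, seq))
    elif fext.lower() in ['.fq', '.fastq']:
        fp.write('@%s\n%s\n+%s\n%s\n' % (header, seq, header, qual))
    else:
        raise Exception('Invalid file extension: %s' % fext)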
def get_sam_header(reference_file):
    [headers, seqs, quals] = fastqparser.read_fastq(reference_file)
    line = ''
    i = 0
    while i < len(headers):
        # SAM @SQ header line: sequence name (first token of the FASTA header) and length.
        line += '@SQ\tSN:%s\tLN:%d\n' % (headers[i].split()[0], len(seqs[i]))
        i += 1
    return line
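# Example (a sketch, hypothetical path): emit the @SQ lines for a reference before
# streaming alignment records:
#   sys.stdout.write(get_sam_header('ref.fa'))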
def ProcessFromFiles(reference_file, sam_path, out_accuracy_counts_path, count_indels_as_events=False):
    [ref_headers, ref_seqs, ref_quals] = fastqparser.read_fastq(reference_file)
    # Index each reference under both its full header and its first token.
    references = {}
    i = 0
    while i < len(ref_headers):
        header = ref_headers[i]
        seq = ref_seqs[i]
        references[header] = seq
        references[header.split()[0]] = seq
        i += 1
    ProcessSAM(references, sam_path, out_accuracy_counts_path, count_indels_as_events)
def convert_blast_to_sam(reference_file, reads_file, blast_out_file, sam_file):
    sys.stderr.write('[%s wrapper] Converting BLAST output to SAM file...\n' % (MAPPER_NAME))

    [ref_headers, ref_seqs, ref_quals] = fastqparser.read_fastq(reference_file)
    ref_header_hash = {}
    i = 0
    while i < len(ref_headers):
        ref_header_hash[ref_headers[i]] = i
        ref_header_hash[ref_headers[i].split()[0]] = i
        i += 1

    [read_headers, read_seqs, read_quals] = fastqparser.read_fastq(reads_file)
    read_header_hash = {}
    i = 0
    while i < len(read_headers):
        read_header_hash[read_headers[i]] = i
        read_header_hash[read_headers[i].split()[0]] = i
        i += 1

    try:
        fp_in = open(blast_out_file, 'r')
    except Exception as e:
        sys.stderr.write('ERROR: Could not open file "%s" for reading!\n' % blast_out_file)
        exit(1)
def load_and_process_sam(in_sam, ref_file, fp_out):
    [sam_headers, sam_lines] = parse_sam(in_sam)
    [headers_ref, seqs_ref, quals_ref] = fastqparser.read_fastq(ref_file)
    seqs_ref_hash = {}
    for i in xrange(0, len(seqs_ref)):
        seqs_ref_hash[headers_ref[i]] = seqs_ref[i]
        seqs_ref_hash[headers_ref[i].split()[0]] = seqs_ref[i]

    for sam_line in sam_lines:
        if not sam_line.IsMapped():
            continue
        seq_ref = seqs_ref_hash[sam_line.RefName]
        stablyLeftAlign(sam_line, seq_ref, 1, False)
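# Example (a sketch; parse_sam and stablyLeftAlign are assumed to be provided
# elsewhere in this codebase): left-align indels in every mapped line of 'in.sam'
# against 'ref.fa' (hypothetical paths):
#   load_and_process_sam('in.sam', 'ref.fa', sys.stdout)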
def fixAfterRacon(consensus_file, original_file, output_file=sys.stdout):
    [cheaders, cseqs, cquals] = read_fastq(consensus_file)
    [oheaders, oseqs, oquals] = read_fastq(original_file)
    clen = len(cheaders)
    olen = len(oheaders)
    for oidx in xrange(olen):
        csame = 0
        oheader = oheaders[oidx]
        oseq = oseqs[oidx]
        for cidx in xrange(clen):
            cheader = cheaders[cidx]
            cseq = cseqs[cidx]
            # Racon prefixes consensus headers; strip the first 10 characters to compare.
            if oheader == cheader[10:]:
                csame += 1
                # Write consensus sequence to output
                output_file.write('>%s\n' % cheader)
                output_file.write('%s\n' % cseq)
        if csame == 0:
            # No consensus found; write the original sequence to output instead
            output_file.write('>%s\n' % oheader)
            output_file.write('%s\n' % oseq)
        if csame > 1:
            sys.stderr.write('\nFound an original with %d corresponding consensuses' % csame)
            sys.stderr.write('\n%s' % oheader)

    # Summary goes to stderr so it does not corrupt the FASTA output.
    sys.stderr.write('\n')
    sys.stderr.write('\nNumber of sequences in original file: %d' % olen)
    sys.stderr.write('\nNumber of sequences in consensus file: %d' % clen)
    sys.stderr.write('\n')
def get_kmers_from_positions(fastq_file, pos_list, k):
    kmers = []
    [headers, seqs, quals] = fastqparser.read_fastq(fastq_file)
    header_hash = {}
    i = 0
    while i < len(headers):
        header_hash[headers[i]] = i
        header_hash[headers[i].split()[0]] = i
        i += 1

    num_homo = 0
    i = 0
    for pos_item in pos_list:
        i += 1
        chrom = pos_item[0]
        pos = pos_item[1] - 1       # Positions are 1-based; convert to 0-based.
        ref = pos_item[2]
        alt = pos_item[3]
        info = pos_item[4]
        try:
            seq = seqs[header_hash[chrom]]
        except Exception as e:
            sys.stderr.write(str(e) + '\n')
            continue

        # k bases before the position, the base itself, and k bases after it.
        kmer_before = seq[pos - k:pos]
        kmer_after = seq[(pos + 1):(pos + 1 + k)]
        kmer = kmer_before + '_' + seq[pos] + '_' + kmer_after
        kmers.append([kmer, chrom, pos])
        kmer_ref = kmer_before + '_' + ref + '_' + kmer_after
        kmer_alt = kmer_before + '_' + alt + '_' + kmer_after
        # Count positions where the reference base matches a neighbouring base (homopolymer-like).
        num_homo += 1 if (kmer_before[-1] == ref or kmer_after[0] == ref) else 0
        sys.stdout.write('[%d] %s\t%s\t%s\t%d\t%s\n' % (i, kmer, kmer_ref, kmer_alt, (pos + 1), info))
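# Example (a sketch, hypothetical inputs): for k = 5 and a variant at 1-based
# position 1000 of chr1, this prints one line per position with the observed, REF
# and ALT context in the form '<left 5 bases>_<base>_<right 5 bases>':
#   get_kmers_from_positions('genome.fa', [['chr1', 1000, 'A', 'G', 'info']], 5)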
def split_transcriptome(transcriptome_file):
    # Split ratio, e.g. split = {1: 4000, 2: 1000, 3: 1000} with limits = [4000, 5000, 6000]
    split = split_sc
    limits = limits_sc

    filename, file_extension = os.path.splitext(transcriptome_file)
    g1_filename = filename + '_G1' + file_extension
    g2_filename = filename + '_G2' + file_extension
    g3_filename = filename + '_G3' + file_extension

    [headers, seqs, quals] = read_fastq(transcriptome_file)
    total = sum(split.values())
    if len(headers) > total:
        total = len(headers)

    random.seed()
    with open(g1_filename, 'w') as g1file, open(g2_filename, 'w') as g2file, open(g3_filename, 'w') as g3file:
        for i in xrange(len(headers)):
            header = headers[i]
            seq = seqs[i]
            qual = quals[i]
            rnum = random.randint(0, total)     # Generate a random number
            gfile = None
            if rnum < limits[0]:
                gfile = g1file
            elif rnum < limits[1]:
                gfile = g2file
            elif rnum < limits[2]:
                gfile = g3file
            else:
                continue        # Skip this sequence
            if file_extension.lower() in ['.fa', '.fna', '.faa', '.fasta']:
                gfile.write('>' + header + '\n')
                gfile.write(seq + '\n')
            elif file_extension.lower() in ['.fq', '.fastq']:
                gfile.write('@' + header + '\n')
                gfile.write(seq + '\n')
                gfile.write('+' + header + '\n')
                gfile.write(qual + '\n')
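# Example (a sketch, assuming module-level split_sc = {1: 4000, 2: 1000, 3: 1000}
# and limits_sc = [4000, 5000, 6000], as in the comment above): each record then
# lands in _G1/_G2/_G3 in roughly a 4:1:1 ratio, and is skipped when rnum falls
# beyond the last limit:
#   split_transcriptome('transcripts.fasta')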
def get_circular_score(ref_path, contig_path, temp_folder):
    if not os.path.exists(temp_folder):
        os.makedirs(temp_folder)

    [headers_ref, seqs_ref, quals_ref] = fastqparser.read_fastq(ref_path)

    # Each reference is written doubled ('%s%s'), so that a contig spanning the
    # origin of a circular sequence can still align in one contiguous piece.
    circularized_fwd_path = '%s/circ-fwd.fa' % (temp_folder)
    circularized_rev_path = '%s/circ-rev.fa' % (temp_folder)
    fp_fwd = open(circularized_fwd_path, 'w')
    fp_rev = open(circularized_rev_path, 'w')
    for i in xrange(0, len(seqs_ref)):
        rev_seq = fastqparser.revcomp_seq(seqs_ref[i])
        rev_qual = quals_ref[i][::-1]
        fp_fwd.write('>%s\n%s%s\n' % (headers_ref[i], seqs_ref[i], seqs_ref[i]))
        fp_rev.write('>%s\n%s%s\n' % (headers_ref[i], rev_seq, rev_seq))
    fp_fwd.close()
    fp_rev.close()

    # Alignment of the fwd orientation is currently disabled:
    # command = '%s %s %s -m HW' % (EDLIB_PATH, contig_path, circularized_fwd_path)
    # [rc_fwd, rstdout_fwd, rstderr_fwd] = execute_command_with_ret(DRY_RUN, command)
    # scores_fwd = parse_edlib_scores(rstdout_fwd)

    sys.stdout.write('Aligning the rev orientation...\n')
    command = '%s %s %s -m HW' % (EDLIB_PATH, contig_path, circularized_rev_path)
    [rc_rev, rstdout_rev, rstderr_rev] = execute_command_with_ret(DRY_RUN, command)
    scores_rev = parse_edlib_scores(rstdout_rev)
    for i in xrange(0, len(scores_rev)):
        sys.stdout.write('[%d] %d %s\n' % (i, scores_rev[i], 'rev'))
    sys.stdout.write('\n')
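# Example (a sketch; EDLIB_PATH must point to an edlib aligner binary that accepts
# '-m HW' for infix alignment; paths are hypothetical):
#   get_circular_score('ref.fa', 'assembled-contig.fa', 'temp-circ')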
def adjustFqHeaders(fastqfile, findStr, replaceStr):
    # Reading the FASTA/FASTQ file
    [headers, seqs, quals] = read_fastq(fastqfile)
    filename, file_extension = os.path.splitext(fastqfile)
    totalSeqs = len(headers)
    findLen = len(findStr)
    replaced = 0
    notreplaced = 0
    # Replace the prefix findStr with replaceStr in every header that starts with it.
    for i in xrange(totalSeqs):
        header = headers[i]
        if header[:findLen] == findStr:
            headers[i] = replaceStr + header[findLen:]
            replaced += 1
        else:
            notreplaced += 1

    # Rewrite the file in place with the adjusted headers.
    with open(fastqfile, 'w') as ffile:
        for i in xrange(totalSeqs):
            header = headers[i]
            seq = seqs[i]
            qual = quals[i]
            if file_extension.lower() in ['.fa', '.fna', '.faa', '.fasta']:
                ffile.write('>' + header + '\n')
                ffile.write(seq + '\n')
            elif file_extension.lower() in ['.fq', '.fastq']:
                ffile.write('@' + header + '\n')
                ffile.write(seq + '\n')
                ffile.write('+' + header + '\n')
                ffile.write(qual + '\n')
            else:
                ffile.write('@ERROR occurred. File is NOT COMPLETE!')
                raise Exception('Invalid file extension: %s' % file_extension)
    return replaced, notreplaced
def process_sam_on_the_fly(in_sam, ref_file, fp_out):
    if not os.path.exists(in_sam):
        sys.stderr.write('ERROR: File "%s" does not exist!\n' % (in_sam))
        exit(1)
    if not os.path.exists(ref_file):
        sys.stderr.write('ERROR: File "%s" does not exist!\n' % (ref_file))
        exit(1)

    [headers_ref, seqs_ref, quals_ref] = fastqparser.read_fastq(ref_file)
    seqs_ref_hash = {}
    for i in xrange(0, len(seqs_ref)):
        seqs_ref_hash[headers_ref[i]] = seqs_ref[i]
        seqs_ref_hash[headers_ref[i].split()[0]] = seqs_ref[i]

    try:
        fp_in = open(in_sam, 'r')
    except IOError as e:
        sys.stderr.write('ERROR: Could not open file %s for reading! Exiting.\n' % (in_sam))
        sys.stderr.write(str(e))
        exit(1)
def expandHeader(fastfile, sstring):
    filename, file_extension = os.path.splitext(fastfile)
    [headers, seqs, quals] = read_fastq(fastfile)
    # Rewrite the file in place, prepending sstring to every header.
    with open(fastfile, 'w') as ffile:
        for i in range(len(headers)):
            header = headers[i]
            new_header = sstring + header
            seq = seqs[i]
            qual = quals[i]
            if file_extension.lower() in ['.fa', '.fna', '.faa', '.fasta']:
                ffile.write('>' + new_header + '\n')
                ffile.write(seq + '\n')
            elif file_extension.lower() in ['.fq', '.fastq']:
                ffile.write('@' + new_header + '\n')
                ffile.write(seq + '\n')
                ffile.write('+' + new_header + '\n')
                ffile.write(qual + '\n')
            else:
                ffile.write('@ERROR occurred. File is NOT COMPLETE!')
                raise Exception('Invalid file extension: %s' % file_extension)
def run(reads_file, reference_file, machine_name, output_path, output_suffix=''):
    # Sparse runs only on FASTA; parameters are hardcoded at the moment.
    # TODO: if FASTQ is given, convert it to FASTA.
    # Calculate the estimated genome size (GS) from the reference and/or reads files.
    num_threads = multiprocessing.cpu_count() / 2

    # ATM using the same set of parameters for all sequencers
    if machine_name in basicdefines.TECH:
        genomesize = 60000000       # Starting value / historical reasons
        # Calculating reference size
        reference_fastq = fastqparser.read_fastq(reference_file)
        reference_seq = reference_fastq[1][0]
        genomesize = len(reference_seq)

        memtime_path = os.path.join(output_path, ASSEMBLER_NAME + '.memtime')
        command = 'cd %s; %s %s -t %d k 21 GS %d f %s' % (
            output_path, basicdefines.measure_command(memtime_path),
            ASSEMBLER_BIN, num_threads, 10 * genomesize, reads_file)
        subprocess.call(command, shell=True)
    else:
        sys.stderr.write('\nInvalid machine_name parameter for assembler %s' % ASSEMBLER_NAME)
        sys.stderr.write('\nSkipping ....')
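# Example (a sketch; machine_name must be one of basicdefines.TECH, and
# ASSEMBLER_BIN/ASSEMBLER_NAME are module-level constants; paths and the
# 'pacbio' technology name are hypothetical):
#   run('reads.fasta', 'ref.fasta', 'pacbio', '/tmp/sparse-out')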
# Looking through MSA
for i in xrange(numseq):
    seqname = lines[i * 2 + 1][1:-1]
    seqalign = lines[i * 2 + 2][:-1]
    base = seqalign[position]
    if base == '-':
        base = 'N'
    if base in reads_dict:
        reads_dict[base].append(seqname)
    else:
        reads_dict[base] = [seqname]

# Loading reads
[headers, seqs, quals] = read_fastq(reads_filename)
r_fname, r_fext = os.path.splitext(os.path.basename(reads_filename))

# Separating reads into files, but only if there is a sufficient number of them!
max_coverage = numseq
factor = 0.2
for base, readname_list in reads_dict.iteritems():
    if len(readname_list) < factor * max_coverage:
        continue
    sep_filename = os.path.join(results_folder, r_fname + '_' + base + r_fext)
    sep_file = open(sep_filename, 'w')
    for i in xrange(len(headers)):
        header = headers[i]
        seq = seqs[i]
        qual = quals[i]
def processData(datafolder, resultfile, annotationfile, Array, SS_list, csv_path):
    sys.stderr.write('\n(%s) Loading and processing SAM file with mappings ... ' % datetime.now().time().isoformat())
    all_sam_lines = load_and_process_SAM(resultfile, BBMapFormat=True)

    # Reading annotation file
    annotations = Annotation_formats.Load_Annotation_From_File(annotationfile)

    # Hashing annotations according to name
    annotation_dict = {}
    for annotation in annotations:
        if annotation.transcriptname in annotation_dict:
            pass    # WARNING: annotation with this name is already in the dictionary
        else:
            annotation_dict[annotation.transcriptname] = annotation

    # 'A': with an internal exon < 30; 'B': all exons > 30
    # 'C': single splicing; 'D': alternative splicing
    # key = ['All', 'A', 'B', 'C', 'D', 'E', 'F', 'G']
    static_dict = {}
    key = ['All', 'A', 'B', 'C', 'D']
    for i in range(len(key)):
        static_dict[key[i]] = Static()

    ss_array = list()
    with open(SS_list, 'r') as f_ss:
        for line in f_ss:
            ss_array.append(line.strip())

    allowed_inacc = Annotation_formats.DEFAULT_ALLOWED_INACCURACY   # Allowing some shift in positions
    # Setting allowed inaccuracy, e.g.:
    # allowed_inacc = 25

    # All samlines in a list should have the same query name
    for samline_list in all_sam_lines:
        qname = samline_list[0].qname

        # Checking the SAM file if all samlines in a list have the same qname
        for samline in samline_list[1:]:
            if samline.qname != qname:
                sys.stderr.write('\nWARNING: two samlines in the same list with different query names (%s/%s)' % (qname, samline.qname))

        # Look for the first underscore in the query name.
        # Everything before it is the simulation folder name,
        # everything after it is the simulated query name.
        pos = qname.find('_')
        if pos < 0:
            raise Exception('Invalid query name in results file (%s)!' % qname)
        simFolderKey = qname[:pos]
        if simFolderKey not in simFolderDict:
            raise Exception('Bad simulation folder short name (%s)!' % simFolderKey)
        simFolder = simFolderDict[simFolderKey]
        simQName = qname[pos + 1:]

        simFileSuffix = 'SimG2_S'

        pos = simQName.find('_')
        pos2 = simQName.find('_part')
        if pos < 0:
            raise Exception('Invalid simulated query name in results file (%s)!' % simQName)

        # BBMap separates a query into smaller parts it can manage,
        # extending the query name with '_part_#', which has to be ignored.
        if pos2 != -1:
            simQName = simQName[:pos2]
        simRefNumber = int(simQName[1:pos])
        simFileName = simFileSuffix + '_%04d' % simRefNumber
        simRefFileName = simFileName + '.ref'
        simSeqFileName = simFileName + '.fastq'
        simMafFileName = simFileName + '.maf'
        simFilePath = os.path.join(datafolder, simFolder)
        simRefFilePath = os.path.join(simFilePath, simRefFileName)
        simMafFilePath = os.path.join(simFilePath, simMafFileName)

        if not os.path.exists(simRefFilePath):
            raise Exception('Reference file for simulated read %s does not exist!' % qname)
        if not os.path.exists(simMafFilePath):
            raise Exception('Sequence alignment (MAF) for simulated read %s does not exist!' % qname)

        # Reading reference file
        [headers, seqs, quals] = read_fastq(simRefFilePath)
        simGeneName = headers[0]
        annotation = annotation_dict[simGeneName]       # Getting the correct annotation

        # Reading the MAF file to get the original position and length of the simulated read.
        # Query name should be the second item.
        maf_startpos = maf_length = 0
        maf_reflen = 0
        with open(simMafFilePath, 'rU') as maffile:
            for line in maffile:
                if line[0] == 's':
                    elements = line.split()
                    maf_qname = elements[1]
                    if maf_qname == 'ref':
                        # Have to remember data for the last reference before the actual read.
                        # The simulated reference is concatenated three times, hence the
                        # division by 3 here and the wrap-around below.
                        maf_startpos = int(elements[2])
                        maf_length = int(elements[3])
                        maf_strand = elements[4]
                        maf_reflen = int(int(elements[5]) / 3)
                    if maf_qname == simQName:
                        break

        if maf_qname != simQName:
            print('maf_qname = %s, simQName = %s' % (maf_qname, simQName))
            raise Exception('ERROR: could not find query %s in maf file %s' % (qname, simMafFileName))

        # IMPORTANT: if the reads were generated from an annotation on the reverse strand,
        # expected partial alignments must be reversed.
        if annotation.strand == Annotation_formats.GFF_STRANDRV:
            maf_startpos = maf_reflen * 3 - maf_length - maf_startpos
            if maf_startpos > maf_reflen * 2:
                maf_startpos = maf_startpos - maf_reflen * 2
            elif maf_startpos > maf_reflen:
                maf_startpos = maf_startpos - maf_reflen

        # Calculating expected partial alignments from MAF and annotations
        sigA = False
        sigB = True
        sigC = False
        sigD = False

        # 1. Calculating the index of the first exon
        # i - the index of the exon currently being considered
        i = 0
        flag_wrong = 0
        while annotation.items[i].getLength() <= maf_startpos:
            maf_startpos -= annotation.items[i].getLength()
            i += 1
            if len(annotation.items) == i:
                flag_wrong = 1
                break
        if flag_wrong == 1:
            continue

        # Calculating expected partial alignments by filling up exons using maf_length
        maf_length = int(maf_length / 3)
        expected_partial_alignments = []
        while maf_length > 0:
            start = annotation.items[i].start + maf_startpos
            end = annotation.items[i].end
            assert start <= end
            # OLD: length = end - start + 1
            # KK: 'end' already indicates the position after the last base, so adding
            # one when calculating the length is not correct.
            length = end - start
            if length <= maf_length:
                expected_partial_alignments.append((start, end))
                maf_length -= length
                i += 1
                if len(annotation.items) == i:
                    maf_length = 0
            else:
                expected_partial_alignments.append((start, start + maf_length))
                maf_length = 0
                i += 1
            # Start position should only be considered for the first exon
            maf_startpos = 0

        # Level 2: reads with a short (< 30bp) internal exon
        for ele in expected_partial_alignments[1:-1]:
            if ele[1] - ele[0] < 30:
                sigA = True
                sigB = False
                break

        # Level 4
        n = len(expected_partial_alignments)

        # Level 3: single vs. alternative splicing
        if simGeneName in ss_array:
            sigC = True
        else:
            sigD = True

        if DEBUG:
            print('exon in expected alignment---------------')
            for i in range(len(expected_partial_alignments)):
                print('(%d, %d)' % (expected_partial_alignments[i][0], expected_partial_alignments[i][1]))
            print('exon in real alignment-------------')

        numparts = len(expected_partial_alignments)
        # For each part of the expected partial alignments, these maps will count
        # how many real partial alignments overlap or equal it.
        parteqmap = {(i + 1): 0 for i in range(numparts)}
        parthitmap = {(i + 1): 0 for i in range(numparts)}

        if getChromName(samline_list[0].rname) != getChromName(annotation.seqname):
            static_dict['All'].Total_aligned_reads += 1
            part_cal.cal(static_dict, sigA, sigC, 'Total_aligned_reads', 1)
        else:
            for samline in samline_list:
                # sl_startpos = samline.pos - 1     # SAM positions are 1-based
                sl_startpos = samline.pos
                reflength = samline.CalcReferenceLengthFromCigar()
                readlength = samline.CalcReadLengthFromCigar()
                sl_endpos = sl_startpos + reflength

                if DEBUG:
                    print('(%d, %d)' % (sl_startpos, sl_endpos))

                # Comparing a samline to all expected partial alignments
                tmp_aln = 0
                for i in range(len(expected_partial_alignments)):
                    expected_alignement = expected_partial_alignments[i]
                    maf_startpos = expected_alignement[0]
                    maf_endpos = expected_alignement[1]
                    if numparts > 2 and i == 0 and abs(sl_endpos - maf_endpos) < allowed_inacc:
                        parteqmap[i + 1] += 1
                        parthitmap[i + 1] += 1
                    elif numparts > 2 and (i == len(expected_partial_alignments) - 1) and abs(sl_startpos - maf_startpos) < allowed_inacc:
                        parteqmap[i + 1] += 1
                        parthitmap[i + 1] += 1
                    elif interval_equals((sl_startpos, sl_endpos), (maf_startpos, maf_endpos), allowed_inacc):
                        parteqmap[i + 1] += 1
                        parthitmap[i + 1] += 1
                    elif interval_overlaps((sl_startpos, sl_endpos), (maf_startpos, maf_endpos), 5):
                        parthitmap[i + 1] += 1
                    if interval_overlaps((sl_startpos, sl_endpos), (maf_startpos, maf_endpos), 5):
                        l = basesInside(sl_startpos, sl_endpos, maf_startpos, maf_endpos)
                        if tmp_aln < l:
                            tmp_aln = l

                if tmp_aln > readlength:
                    tmp_aln = readlength
                static_dict['All'].Total_aligned_bases += tmp_aln
                part_cal.cal(static_dict, sigA, sigC, 'Total_aligned_bases', tmp_aln)

            num_recover_exons = len([x for x in parteqmap.values() if x == 1])
            num_hit_exons = len([x for x in parthitmap.values() if x == 1])
            if num_hit_exons == numparts:
                static_dict['All'].Hit100 += 1
                part_cal.cal(static_dict, sigA, sigC, 'Hit100', 1)
            if num_hit_exons >= int(0.8 * numparts):
                static_dict['All'].Hit80 += 1
                part_cal.cal(static_dict, sigA, sigC, 'Hit80', 1)
            sam_l = len(samline_list)
            if num_recover_exons == numparts:
                static_dict['All'].ExR100 += 1
                part_cal.cal(static_dict, sigA, sigC, 'ExR100', 1)
            if num_recover_exons == sam_l:
                static_dict['All'].ExA100 += 1
                part_cal.cal(static_dict, sigA, sigC, 'ExA100', 1)
            if num_recover_exons >= int(0.8 * numparts):
                static_dict['All'].ExR80 += 1
                part_cal.cal(static_dict, sigA, sigC, 'ExR80', 1)
            if num_recover_exons >= int(0.8 * sam_l):
                static_dict['All'].ExA80 += 1
                part_cal.cal(static_dict, sigA, sigC, 'ExA80', 1)
            static_dict['All'].Total_aligned_exons += num_recover_exons
            part_cal.cal(static_dict, sigA, sigC, 'Total_aligned_exons', num_recover_exons)
            static_dict['All'].Total_aligned_reads += 1
            part_cal.cal(static_dict, sigA, sigC, 'Total_aligned_reads', 1)

    # Writing the CSV summary
    static_dict['All'].Total_reads = Array.Total_reads
    static_dict['All'].Total_bases = Array.Total_bases
    static_dict['All'].Total_expected_exons = Array.Total_expected_exons

    static_dict['A'].Total_reads = Array.Total_level2_reads
    static_dict['A'].Total_bases = Array.Total_level2_bases
    static_dict['A'].Total_expected_exons = Array.Total_level2_expected_exons

    static_dict['B'].Total_reads = Array.Total_level2_r_reads
    static_dict['B'].Total_bases = Array.Total_level2_r_bases
    static_dict['B'].Total_expected_exons = Array.Total_level2_r_expected_exons

    static_dict['C'].Total_reads = Array.Total_level3_SS_reads
    static_dict['C'].Total_bases = Array.Total_level3_SS_bases
    static_dict['C'].Total_expected_exons = Array.Total_level3_SS_expected_exons

    static_dict['D'].Total_reads = Array.Total_level3_AS_reads
    static_dict['D'].Total_bases = Array.Total_level3_AS_bases
    static_dict['D'].Total_expected_exons = Array.Total_level3_AS_expected_exons

    # The level 4 splits ('E', 'F', 'G': Array.Total_level4_* fields) are currently disabled.

    with open(csv_path, 'w') as fw:
        csv_write = csv.writer(fw, dialect='excel')
        header = [' ', resultfile]
        csv_write.writerow(header)
        for item in key:
            level = [
                item,
                str(static_dict[item].Total_reads) + ' reads/' +
                str(static_dict[item].Total_bases) + ' bases/' +
                str(static_dict[item].Total_expected_exons) + ' exons'
            ]
            row1 = ['Aligned', round(100 * static_dict[item].Total_aligned_reads / float(static_dict[item].Total_reads), 2)]
            row2 = ['bases%', round(100 * static_dict[item].Total_aligned_bases / float(static_dict[item].Total_bases), 2)]
            # The ExR100/80% and Hit100/80% rows are currently disabled.
            line = str(round(100 * static_dict[item].ExA100 / float(static_dict[item].Total_reads), 2)) + '/' + \
                   str(round(100 * static_dict[item].ExA80 / float(static_dict[item].Total_reads), 2))
            row4 = ['Read100/80%', line]
            row6 = ['Exons%', round(100 * static_dict[item].Total_aligned_exons / float(static_dict[item].Total_expected_exons), 2)]
            csv_write.writerow(level)
            csv_write.writerow(row1)
            csv_write.writerow(row2)
            csv_write.writerow(row4)
            csv_write.writerow(row6)
def processData(datafolder, resultfile, annotationfile, paramdict): split_qnames = False filename = "" if "--split-qnames" in paramdict: split_qnames = True filename = paramdict["--split-qnames"][0] filename_correct = filename + "_correct.names" filename_hitall = filename + "_hitall.names" filename_hitone = filename + "_hitone.names" filename_bad = filename + "_incorrect.names" filename_unmapped = filename + "_unmapped.names" printMap = False filename_mapping = "" if "--print_mapping" in paramdict: filename_mapping = paramdict["--print_mapping"][0] printMap = True file_correct = None file_hitall = None file_hitone = None file_bad = None file_unmapped = None folder = os.getcwd() # If splittng qnames into files, have to open files first if split_qnames: file_correct = open(os.path.join(folder, filename_correct), "w+") file_hitall = open(os.path.join(folder, filename_hitall), "w+") file_hitone = open(os.path.join(folder, filename_hitone), "w+") file_bad = open(os.path.join(folder, filename_bad), "w+") # Loading results SAM file report = EvalReport(ReportType.FASTA_REPORT ) # not really needed, used for unmapped query names # Have to preserve the paramdict # paramdict = {} sys.stderr.write( "\n(%s) Loading and processing SAM file with mappings ... " % datetime.now().time().isoformat()) all_sam_lines = RNAseqEval.load_and_process_SAM(resultfile, paramdict, report, BBMapFormat=True) # Reading annotation file annotations = Annotation_formats.Load_Annotation_From_File(annotationfile) s_num_multiexon_genes = 0 mapfile = None if printMap: mapfile = open(filename_mapping, "w+") # Hashing annotations according to name annotation_dict = {} for annotation in annotations: if annotation.genename in annotation_dict: sys.stderr.write( "\nWARNING: anotation with name %s already in the dictionary!" 
% annotation.genename) else: annotation_dict[annotation.genename] = annotation if len(annotation.items) > 1: s_num_multiexon_genes += 1 # Statistical information for evaluating the qualitiy of mapping s_gene_hits = 0 s_gene_misses = 0 s_whole_alignment_hits = 0 s_whole_alignment_misses = 0 s_partial_alignment_hits = 0 s_partial_alignment_misses = 0 s_num_start_hits = 0 s_num_end_hits = 0 s_num_start_end_hits = 0 s_num_fw_strand = 0 s_num_rv_strand = 0 s_num_split_alignment = 0 s_num_oversplit_alignment = 0 # Alignments that have more parts than exons s_num_good_alignments = 0 s_num_badchrom_alignments = 0 s_maf_suspicious_alignments = 0 s_maf_bad_alignments = 0 s_maf_good_alignments = 0 s_maf_split_reads = 0 s_maf_good_split_alignments = 0 s_maf_bad_split_alignments = 0 s_maf_hit_all_parts = 0 s_maf_hit_one_part = 0 s_maf_eq_one_part = 0 s_maf_multihit_parts = 0 s_maf_split_hit_all_parts = 0 s_maf_split_hit_one_part = 0 s_maf_split_eq_one_part = 0 s_maf_miss_alignment = 0 s_maf_too_many_alignments = 0 s_num_potential_bad_strand = 0 allowed_inacc = (Annotation_formats.DEFAULT_ALLOWED_INACCURACY ) # Allowing some shift in positions min_overlap = (Annotation_formats.DEFAULT_MINIMUM_OVERLAP ) # Minimum overlap that is considered # Setting allowed_inaccuracy from parameters if "--allowed_inacc" in paramdict: allowed_inacc = int(paramdict["--allowed_inacc"][0]) elif "-ai" in paramdict: allowed_inacc = int(paramdict["-ai"][0]) # Setting minimum overlap from parameters if "--allowed_inacc" in paramdict: min_overlap = int(paramdict["--allowed_inacc"][0]) elif "-mo" in paramdict: min_overlap = int(paramdict["-mo"][0]) # All samlines in a list should have the same query name for samline_list in all_sam_lines: qname = samline_list[0].qname isSplitAlignment = False if len(samline_list) > 1: s_num_split_alignment += 1 isSplitAlignment = True # Checking the SAM file if all samlines in a list have the same qname for samline in samline_list[1:]: if samline.qname != qname: sys.stderr.write( "\nWARNING: two samlines in the same list with different query names (%s/%s)" % (qname, samline.qname)) # Look for the first underscore in query name # Everything before that is the simulation folder name # Everything after that is simulated query name pos = qname.find("_") if pos < 0: raise Exception("Invalid query name in results file (%s)!" % qname) simFolderKey = qname[:pos] if simFolderKey not in simFolderDict: # import pdb # pdb.set_trace() raise Exception("Bad simulation folder short name (%s)!" % simFolderKey) simFolder = simFolderDict[simFolderKey] simQName = qname[pos + 1:] # Due to error in data preparation, have to make some extra processing if simQName[:6] == "SimG2_": simQName = simQName[6:] # if simFolderKey == 'SimG1': # simFileSuffix = 'g1' # elif simFolderKey == 'SimG2': # simFileSuffix = 'g2' # elif simFolderKey == 'SimG3': # simFileSuffix = 'g3' # else: # simFileSuffix = 'sd' simFileSuffix = "sd" pos = simQName.find("_") pos2 = simQName.find("_part") if pos < 0: raise Exception( "Invalid simulated query name in results file (%s)!" 
% simQName) simQLetter = simQName[0] # Should always be S # BBMap separates a query into smaller parts he can manage # Extends query with '_part_#', which has to be ignored if pos2 != -1: simQName = simQName[:pos2] simRefNumber = int(simQName[1:pos]) simQNumber = int(simQName[pos + 1:]) simFileName = simFileSuffix + "_%04d" % simRefNumber simRefFileName = simFileName + ".ref" simSeqFileName = simFileName + ".fastq" simMafFileName = simFileName + ".maf" simFilePath = os.path.join(datafolder, simFolder) simRefFilePath = os.path.join(simFilePath, simRefFileName) simSeqFilePath = os.path.join(simFilePath, simSeqFileName) simMafFilePath = os.path.join(simFilePath, simMafFileName) if not os.path.exists(simRefFilePath): # import pdb # pdb.set_trace() raise Exception( "Reference file for simulated read %s does not exist!" % qname) if not os.path.exists(simSeqFilePath): # import pdb # pdb.set_trace() raise Exception( "Sequence file for simulated read %s does not exist!" % qname) if not os.path.exists(simMafFilePath): # import pdb # pdb.set_trace() raise Exception( "Sequence alignment (MAF) for simulated read %s does not exist!" % qname) # Reading reference file [headers, seqs, quals] = read_fastq(simRefFilePath) simGeneName = headers[0] annotation = annotation_dict[ simGeneName] # Getting the correct annotation if len(samline_list) > len(annotation.items): # sys.stderr.write('\nWARNING: A number of partial alignments exceeds the number of exons for query %s! (%d / %d)' % (qname, len(samline_list), len(annotation.items))) s_num_oversplit_alignment += 1 # Reading MAF file to get original position and length of the simulated read # Query name should be a second item maf_startpos = maf_length = 0 maf_strand = "0" maf_reflen = 0 i = 0 with open(simMafFilePath, "rU") as maffile: i += 1 for line in maffile: if line[0] == "s": elements = line.split() maf_qname = elements[1] if ( maf_qname == "ref" ): # Have to remember data for the last reference before the actual read maf_startpos = int(elements[2]) maf_length = int(elements[3]) maf_strand = elements[4] maf_reflen = int(elements[5]) if maf_qname == simQName: # maf_startpos = int(elements[2]) # maf_length = int(elements[3]) break if maf_qname != simQName: # import pdb # pdb.set_trace() raise Exception("ERROR: could not find query %s in maf file %s" % (qname, simMafFileName)) # IMPORTANT: If the reads were generated from an annotation on reverse strand # expected partial alignments must be reversed if annotation.strand == Annotation_formats.GFF_STRANDRV: maf_startpos = maf_reflen - maf_length - maf_startpos # Saving "maf_length" and "maf_startpos" to be able to check it later t_maf_length = maf_length t_maf_startpos = maf_startpos # Calculating expected partial alignmetns from MAF and annotations # 1. 
Calculating the index of the first exon # i - the index of exon currently being considered i = 0 while annotation.items[i].getLength() < maf_startpos: maf_startpos -= annotation.items[i].getLength() i += 1 # Calculating expected partial alignments by filling up exons using maf_length expected_partial_alignments = [] while maf_length > 0: start = annotation.items[i].start + maf_startpos end = annotation.items[i].end assert start <= end # OLD: length = end-start+1 # KK: End is already indicating position after the last base, so adding one when callculating length is not correct length = end - start if length <= maf_length: expected_partial_alignments.append((start, end)) maf_length -= length i += 1 else: expected_partial_alignments.append((start, start + maf_length)) maf_length = 0 i += 1 # Start position should only be considered for the first exon maf_startpos = 0 # import pdb # pdb.set_trace() numparts = len(expected_partial_alignments) # For each part of expected partial alignments, these maps will count # how many real partial alignments overlap or equal it parthitmap = {(i + 1): 0 for i in range(numparts)} parteqmap = {(i + 1): 0 for i in range(numparts)} isSplitRead = False if len(expected_partial_alignments) > 1: s_maf_split_reads += 1 isSplitRead = True oneHit = False allHits = False oneEq = False multiHit = False good_alignment = False has_miss_alignments = False if RNAseqEval.getChromName( samline_list[0].rname) != RNAseqEval.getChromName( annotation.seqname): # import pdb # pdb.set_trace() s_num_badchrom_alignments += 1 else: if len(samline_list) != len(expected_partial_alignments): # sys.stderr.write('\nWARNING: suspicious number of alignments for query %s!' % qname) s_maf_suspicious_alignments += 1 # import pdb # pdb.set_trace() good_alignment = True k = 0 for samline in samline_list: # sl_startpos = samline.pos - 1 # SAM positions are 1-based sl_startpos = samline.pos reflength = samline.CalcReferenceLengthFromCigar() sl_endpos = sl_startpos + reflength # Comparing a samline to the corresponding expected partial alignment if k < len(expected_partial_alignments): expected_alignement = expected_partial_alignments[k] maf_startpos = expected_alignement[0] maf_endpos = expected_alignement[1] if (abs(sl_startpos - maf_startpos) > allowed_inacc or abs(sl_endpos - maf_endpos) > allowed_inacc): good_alignment = False else: good_alignment = False k += 1 # Comparing a samline to all expected partial alignments for i in range(len(expected_partial_alignments)): expected_alignement = expected_partial_alignments[i] maf_startpos = expected_alignement[0] maf_endpos = expected_alignement[1] if interval_equals( (sl_startpos, sl_endpos), (maf_startpos, maf_endpos), allowed_inacc, min_overlap, ): parteqmap[i + 1] += 1 if interval_overlaps( (sl_startpos, sl_endpos), (maf_startpos, maf_endpos), allowed_inacc, min_overlap, ): parthitmap[i + 1] += 1 has_miss_alignments = False for expected_alignement in expected_partial_alignments: maf_startpos = expected_alignement[0] maf_endpos = expected_alignement[1] overlap = False for samline in samline_list: sl_startpos = samline.pos reflength = samline.CalcReferenceLengthFromCigar() sl_endpos = sl_startpos + reflength if interval_overlaps( (sl_startpos, sl_endpos), (maf_startpos, maf_endpos), allowed_inacc, min_overlap, ): overlap = True if not overlap: has_miss_alignments = True break if len(samline_list) < len(expected_partial_alignments): s_maf_too_many_alignments += 1 # Testing the evaluation process # import pdb # pdb.set_trace() if len(samline_list) != 
len(expected_partial_alignments): good_alignment = False if good_alignment: s_maf_good_alignments += 1 # Writing qnames to files if split_qnames: file_correct.write(samline_list[0].qname + "\n") if isSplitRead: s_maf_good_split_alignments += 1 else: # import pdb # pdb.set_trace() s_maf_bad_alignments += 1 if isSplitRead: s_maf_bad_split_alignments += 1 # TODO: check which alignments are bad and why # If the chromosome is different it's obviously a bad alignment if RNAseqEval.getChromName( samline.rname) == RNAseqEval.getChromName( annotation.seqname): # import pdb # pdb.set_trace() pass else: s_num_badchrom_alignments += 1 # Analyzing parthitmap and parteqmap oneHit = False allHits = True oneEq = False multiHit = False for i in range(numparts): if parthitmap[i + 1] > 0: oneHit = True if parthitmap[i + 1] == 0: allHits = False if parthitmap[i + 1] > 1: multiHit = True if parteqmap[i + 1] > 0: oneEq = True if printMap: status = "INCORRECT" if good_alignment: status = "CORRECT" elif allHits: status = "HITALL" elif oneHit: status = "HITONE" mapfile.write("QNAME: %s, STATUS: %s\n\n" % (samline_list[0].qname, status)) mapfile.write("EXPECTED (%s, %s):\t" % (RNAseqEval.getChromName( annotation.seqname), annotation.strand)) for epa in expected_partial_alignments: mapfile.write("(%d, %d)\t" % (epa[0], epa[1])) mapfile.write("\n") if samline_list[0].flag & 16 == 0: readstrand = Annotation_formats.GFF_STRANDFW else: readstrand = Annotation_formats.GFF_STRANDRV mapfile.write( "ACTUAL (%s, %s):\t" % (RNAseqEval.getChromName(samline_list[0].rname), readstrand)) for samline in samline_list: mapfile.write("(%d, %d)\t" % ( samline.pos, samline.pos + samline.CalcReferenceLengthFromCigar(), )) mapfile.write("\n\n") if oneHit: s_maf_hit_one_part += 1 if isSplitRead: s_maf_split_hit_one_part += 1 # Writing qnames to files if split_qnames: file_hitone.write(samline_list[0].qname + "\n") if not allHits: if "--debug" in paramdict: import pdb pdb.set_trace() # Misses are calculated only for alignments that have at least one hit if has_miss_alignments: s_maf_miss_alignment += 1 else: # Writing qnames to files if split_qnames: file_bad.write(samline_list[0].qname + "\n") # if '--debug' in paramdict: # import pdb # pdb.set_trace() if allHits: s_maf_hit_all_parts += 1 if isSplitRead: s_maf_split_hit_all_parts += 1 # Writing qnames to files if split_qnames: file_hitall.write(samline_list[0].qname + "\n") # Sanity check if "--debug" in paramdict and good_alignment and not allHits: import pdb pdb.set_trace() pass if oneEq: s_maf_eq_one_part += 1 if isSplitRead: s_maf_split_eq_one_part += 1 if multiHit: s_maf_multihit_parts += 1 num_start_hits = 0 num_end_hits = 0 num_hits = 0 num_partial_alignements = len(samline_list) whole_alignment_hit = False for samline in samline_list: startpos = samline.pos - 1 reflength = samline.CalcReferenceLengthFromCigar() endpos = startpos + reflength if samline.flag & 16 == 0: readstrand = Annotation_formats.GFF_STRANDFW s_num_fw_strand += 1 else: readstrand = Annotation_formats.GFF_STRANDRV s_num_rv_strand += 1 chromname = RNAseqEval.getChromName(samline.rname) if (chromname == RNAseqEval.getChromName(annotation.seqname) and readstrand != annotation.strand and annotation.overlapsGene(startpos, endpos)): s_num_potential_bad_strand += 1 if (chromname == RNAseqEval.getChromName(annotation.seqname) and annotation.overlapsGene(startpos, endpos) and (not P_CHECK_STRAND or readstrand == annotation.strand)): whole_alignment_hit = True s_partial_alignment_hits += 1 else:
s_partial_alignment_misses += 1 # Checking how well partial alignments match exons startsItem = False endsItem = False for item in annotation.items: if item.overlapsItem(startpos, endpos): num_hits += 1 if item.startsItem(startpos, endpos): num_start_hits += 1 startsItem = True if item.endsItem(startpos, endpos): num_end_hits += 1 endsItem = True if startsItem and endsItem: s_num_start_end_hits += 1 s_num_start_hits += num_start_hits s_num_end_hits += num_end_hits # I'm allowing one start and one end not to match starts and ends of exons if (num_hits == num_partial_alignements) and ( num_start_hits + num_end_hits >= 2 * num_partial_alignements - 2): s_num_good_alignments += 1 # else: # if num_hits > 0: # import pdb # pdb.set_trace() if whole_alignment_hit: s_whole_alignment_hits += 1 else: s_whole_alignment_misses += 1 if printMap: mapfile.close() # Writing unmapped query names to a file, if so specified if split_qnames: with open(filename_unmapped, "w+") as file_unmapped: file_unmapped.write(report.get_unmapped_names()) # Printing out results: NEW # Variable names match the RNA benchmark paper sys.stdout.write("\n\nAnalysis results:") sys.stdout.write("\nOriginal Samlines: %d" % report.num_alignments) sys.stdout.write( "\nUsable whole alignments (with valid CIGAR string): %d" % len(all_sam_lines)) sys.stdout.write("\nAnnotations: %d" % len(annotation_dict)) sys.stdout.write("\nMultiexon genes: %d" % s_num_multiexon_genes) sys.stdout.write("\nNumber of exon start hits: %d" % s_num_start_hits) sys.stdout.write("\nNumber of exon end hits: %d" % s_num_end_hits) sys.stdout.write("\nNumber of exon start and end hits: %d" % s_num_start_end_hits) sys.stdout.write("\nNumber of good whole alignments: %d" % s_num_good_alignments) sys.stdout.write( "\nNumber of alignments mapped to an incorrect chromosome: %d" % s_num_badchrom_alignments) sys.stdout.write("\nMAF: Correct alignment: %d" % s_maf_good_alignments) sys.stdout.write("\nMAF: Hit all parts: %d" % s_maf_hit_all_parts) sys.stdout.write("\nMAF: Hit at least one part: %d" % s_maf_hit_one_part) sys.stdout.write("\nMAF: Equals at least one part: %d" % s_maf_eq_one_part) sys.stdout.write("\nMAF: Number of split reads: %d" % s_maf_split_reads) sys.stdout.write("\nMAF: Correct alignment, SPLIT read: %d" % s_maf_good_split_alignments) sys.stdout.write("\nMAF: Hit all parts, SPLIT read: %d" % s_maf_split_hit_all_parts) sys.stdout.write("\nMAF: Hit at least one part, SPLIT read: %d" % s_maf_split_hit_one_part) sys.stdout.write("\nMAF: Equals at least one part, SPLIT read: %d" % s_maf_split_eq_one_part) sys.stdout.write("\nMAF: Partial alignment that misses: %d" % s_maf_miss_alignment) sys.stdout.write("\nMAF: More alignments than expected: %d" % s_maf_too_many_alignments) sys.stdout.write("\nMAF: Multihit parts (fragmented) alignments: %d" % s_maf_multihit_parts) sys.stdout.write("\nDone!\n") # Closing files with names if split_qnames: file_correct.close() file_hitall.close() file_hitone.close() file_bad.close()
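# The checks above rely on interval_equals() and interval_overlaps(), which are
# defined elsewhere. A minimal sketch of the semantics implied by their call
# sites; the bodies and default values below are assumptions, not the actual
# implementation:
def interval_equals(interval1, interval2, allowed_inacc=5, min_overlap=5):
    # Intervals are considered equal if both endpoints match within allowed_inacc.
    return (abs(interval1[0] - interval2[0]) <= allowed_inacc
            and abs(interval1[1] - interval2[1]) <= allowed_inacc)

def interval_overlaps(interval1, interval2, allowed_inacc=5, min_overlap=5):
    # Intervals overlap if they share at least min_overlap bases.
    return min(interval1[1], interval2[1]) - max(interval1[0], interval2[0]) >= min_overlap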
def eval_contigs(ref_path, contig_path, temp_folder, generate_kmer_spectrum=False, silent=False): if (not os.path.exists(temp_folder)): os.makedirs(temp_folder); [headers_contigs, seqs_contigs, quals_contigs] = fastqparser.read_fastq(contig_path); [headers_ref, seqs_ref, quals_ref] = fastqparser.read_fastq(ref_path); ref_hash = hash_headers(headers_ref); contig_hash = hash_headers(headers_contigs); avg_accuracy_overall = 0.0; avg_id_overall = 0.0; num_valid_contigs = 0; single_contig_path = '%s/singlecontig.fasta' % (temp_folder); for i in xrange(0, len(seqs_contigs)): contig_name = headers_contigs[i].split()[0]; contig_seq = seqs_contigs[i]; fp_contig = open(single_contig_path, 'w'); fp_contig.write('>%s\n%s\n' % (contig_name, seqs_contigs[i])); fp_contig.close(); ### Run nucmer to align the contig to the reference; also filter the delta file and generate alignment coordinates. nucmer_out_prefix = '%s/nucmer' % (temp_folder); log('Running MUMmer on contig %d / %d: "%s"' % ((i + 1), len(seqs_contigs), contig_name), sys.stderr, silent=silent); log('Contig length: %d' % (len(contig_seq)), sys.stderr, silent=silent); command = '%s --maxmatch --extend -p %s %s %s; delta-filter -r -q %s.delta > %s.filt.delta; show-coords -r -c %s.filt.delta > %s.filt.coords' % \ (NUCMER_PATH, nucmer_out_prefix, ref_path, single_contig_path, nucmer_out_prefix, nucmer_out_prefix, nucmer_out_prefix, nucmer_out_prefix); [rc, rstdout, rstderr] = execute_command_with_ret(DRY_RUN, command, silent=True); ### Load the coordinates. log('Parsing the coords file.', sys.stderr, silent=silent); # fp = open('/home/isovic/work/eclipse-workspace/git/consise/temp2-mummer/test-data/out/nucmer.coords2', 'r'); coords_path = '%s.filt.coords' % (nucmer_out_prefix); fp = open(coords_path, 'r'); lines = fp.readlines(); fp.close(); frags = parse_coords_lines(lines, contig_name, seqs_ref, ref_hash, seqs_contigs, contig_hash); avg_accuracy_contig = 0.0; avg_id_contig = 0.0; log('Running Edlib to determine the edit distance for each fragment...', sys.stderr, silent=silent); debug_frags_path = '%s/frags.%d.csv' % (temp_folder, i); fp_frags = open(debug_frags_path, 'w'); fp_frags.write('frag_or_summary\trstart\trend\tqstart\tqend\tfwd\trname\tqname\tidentity\trlen\tqlen\tedit_dist\tnormalized_edit_dist\taccuracy\n') for frag in frags: # print frag; [rstart, rend, qstart, qend, fwd, rname, qname, identity] = frag; ref_seq = seqs_ref[ref_hash[rname]]; [nw_ref, nw_contig] = extract_seqs_for_edlib(ref_seq, contig_seq, rstart, rend, qstart, qend); temp_suffix = '.%d' % (i); nw_ref_path = '%s/nw-ref%s.fasta' % (temp_folder, temp_suffix); nw_contig_path = '%s/nw-contig%s.fasta' % (temp_folder, temp_suffix); fp_nw_ref = open(nw_ref_path, 'w'); fp_nw_contig = open(nw_contig_path, 'w'); fp_nw_ref.write('>%s\n%s\n' % (rname, nw_ref)); fp_nw_contig.write('>%s\n%s\n' % (qname, nw_contig)); fp_nw_ref.close(); fp_nw_contig.close(); command = '%s %s %s -m NW' % (EDLIB_PATH, nw_contig_path, nw_ref_path); [rc, rstdout, rstderr] = execute_command_with_ret(DRY_RUN, command, silent=True); scores = parse_edlib_scores(rstdout); if (len(scores) == 0): log('ERROR: len(scores) == 0!\nreturn code: %d\nrstdout:\n%s' % (rc, rstdout), sys.stderr); continue; # sys.stderr.write('Final edit distance: %d\n' % (scores[0])); normalized_edit_dist = float(scores[0]) / float(abs(qend - qstart + 1)); accuracy = (1.0 - normalized_edit_dist); frag.append(abs(rend - rstart + 1)); frag.append(abs(qend - qstart + 1)); frag.append(scores[0]); frag.append(100.0 * normalized_edit_dist);
frag.append(100.0 * accuracy); # print frag; avg_accuracy_contig += accuracy; avg_id_contig += frag[7]; fp_frags.write('f\t' + '\t'.join([str(val) for val in frag]) + '\n'); if (len(frags) > 0): avg_accuracy_contig /= float(len(frags)); avg_id_contig /= float(len(frags)); log('Average ID for contig "%s": %f%%' % (contig_name, avg_id_contig), sys.stderr, silent=silent); log('Average accuracy for contig "%s": %f%%' % (contig_name, 100.0*avg_accuracy_contig), sys.stderr, silent=silent); log('', sys.stderr, silent=silent); avg_accuracy_overall += avg_accuracy_contig; avg_id_overall += avg_id_contig; num_valid_contigs += 1; else: log('ERROR: There are no frags for contig "%s"! Continuing; this contig will not be taken into account.' % (contig_name), sys.stderr, silent=silent); rname = '-'; # No frags means rname was never set for this contig; use a placeholder in the summary row. # fp_frags.write('s\t%s\t%f\t%f\n' % (rname, avg_id_contig, 100.0*avg_accuracy_contig)) fp_frags.write('s\t0\t0\t0\t0\t-\t%s\t-\t%f\t0\t0\t0\t0\t%f\n' % (rname, avg_id_contig, 100.0*avg_accuracy_contig)) fp_frags.close(); if (num_valid_contigs > 0): avg_accuracy_overall /= float(num_valid_contigs); avg_id_overall /= float(num_valid_contigs); else: log('ERROR: There are no valid contigs in file "%s"! None of the contigs had valid MUMmer alignments.\n' % (contig_path), sys.stderr, silent=silent); log('Draft assembly: "%s"' % (contig_path), sys.stderr, silent=silent); log('Overall average ID for the draft assembly: %f%%' % (avg_id_overall), sys.stderr, silent=silent); log('Overall average accuracy for the draft assembly: %f%%' % (100.0*avg_accuracy_overall), sys.stderr, silent=silent); log('', sys.stderr, silent=silent); sys.stdout.write('================= Summary ===================\n'); sys.stdout.write('Draft assembly: "%s"\n' % (contig_path)); sys.stdout.write('Overall average ID for the draft assembly: %f%%\n' % (avg_id_overall)); sys.stdout.write('Overall average accuracy for the draft assembly: %f%%\n' % (100.0*avg_accuracy_overall)); sys.stdout.write('=============================================\n\n');
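### eval_contigs() above looks sequences up through hash_headers(), which is defined elsewhere.
### Judging from its usage (ref_hash[rname] indexes into seqs_ref, where rname is the first
### whitespace-delimited token of a header), it plausibly maps that token to the sequence's
### ordinal. A minimal sketch under that assumption, not the actual implementation:
def hash_headers(headers):
    header_hash = {};
    for i in xrange(0, len(headers)):
        header_hash[headers[i].split()[0]] = i;
    return header_hash;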
def processData(datafolder, resultfile, annotationfile): # Loading results SAM file report = EvalReport(ReportType.FASTA_REPORT) # not needed paramdict = {} sys.stderr.write( '\n(%s) Loading and processing SAM file with mappings ... ' % datetime.now().time().isoformat()) all_sam_lines = RNAseqEval.load_and_process_SAM(resultfile, paramdict, report, BBMapFormat=True) # Reading annotation file annotations = Annotation_formats.Load_Annotation_From_File(annotationfile) s_num_multiexon_genes = 0 # Hashing annotations according to name annotation_dict = {} for annotation in annotations: if annotation.genename in annotation_dict: sys.stderr.write( '\nWARNING: annotation with name %s already in the dictionary!' % annotation.genename) else: annotation_dict[annotation.genename] = annotation if len(annotation.items) > 1: s_num_multiexon_genes += 1 # Statistical information for evaluating the quality of mapping s_gene_hits = 0 s_gene_misses = 0 s_whole_alignment_hits = 0 s_whole_alignment_misses = 0 s_partial_alignment_hits = 0 s_partial_alignment_misses = 0 s_num_start_hits = 0 s_num_end_hits = 0 s_num_start_end_hits = 0 s_num_fw_strand = 0 s_num_rv_strand = 0 s_num_split_alignment = 0 s_num_oversplit_alignment = 0 # Alignments that have more parts than exons s_num_good_alignments = 0 s_num_badchrom_alignments = 0 s_maf_suspicious_alignments = 0 s_maf_bad_alignments = 0 s_maf_good_alignments = 0 s_maf_split_reads = 0 s_maf_good_split_alignments = 0 s_maf_bad_split_alignments = 0 s_maf_hit_all_parts = 0 s_maf_hit_one_part = 0 s_maf_eq_one_part = 0 s_maf_multihit_parts = 0 s_maf_split_hit_all_parts = 0 s_maf_split_hit_one_part = 0 s_maf_split_eq_one_part = 0 s_num_potential_bad_strand = 0 # allowed_inacc = Annotation_formats.DEFAULT_ALLOWED_INACCURACY # Allowing some shift in positions # Setting allowed inaccuracy allowed_inacc = 5 # All samlines in a list should have the same query name for samline_list in all_sam_lines: qname = samline_list[0].qname isSplitAlignment = False if len(samline_list) > 1: s_num_split_alignment += 1 isSplitAlignment = True # Checking that all samlines in a list have the same qname for samline in samline_list[1:]: if samline.qname != qname: sys.stderr.write( '\nWARNING: two samlines in the same list with different query names (%s/%s)' % (qname, samline.qname)) # Look for the first underscore in query name # Everything before that is the simulation folder name # Everything after that is the simulated query name pos = qname.find('_') if pos < 0: raise Exception('Invalid query name in results file (%s)!' % qname) simFolderKey = qname[:pos] if simFolderKey not in simFolderDict: # import pdb # pdb.set_trace() raise Exception('Bad simulation folder short name (%s)!' % simFolderKey) simFolder = simFolderDict[simFolderKey] simQName = qname[pos + 1:] # Due to an error in data preparation, some extra processing is needed if simQName[:6] == 'SimG2_': simQName = simQName[6:] # if simFolderKey == 'SimG1': # simFileSuffix = 'g1' # elif simFolderKey == 'SimG2': # simFileSuffix = 'g2' # elif simFolderKey == 'SimG3': # simFileSuffix = 'g3' # else: # simFileSuffix = 'sd' simFileSuffix = 'sd' pos = simQName.find('_') pos2 = simQName.find('_part') if pos < 0: raise Exception( 'Invalid simulated query name in results file (%s)!'
% simQName) simQLetter = simQName[0] # Should always be S # BBMap separates a query into smaller parts it can manage # Extends query with '_part_#', which has to be ignored if pos2 != -1: simQName = simQName[:pos2] simRefNumber = int(simQName[1:pos]) simQNumber = int(simQName[pos + 1:]) simFileName = simFileSuffix + '_%04d' % simRefNumber simRefFileName = simFileName + '.ref' simSeqFileName = simFileName + '.fastq' simMafFileName = simFileName + '.maf' simFilePath = os.path.join(datafolder, simFolder) simRefFilePath = os.path.join(simFilePath, simRefFileName) simSeqFilePath = os.path.join(simFilePath, simSeqFileName) simMafFilePath = os.path.join(simFilePath, simMafFileName) if not os.path.exists(simRefFilePath): # import pdb # pdb.set_trace() raise Exception( 'Reference file for simulated read %s does not exist!' % qname) if not os.path.exists(simSeqFilePath): # import pdb # pdb.set_trace() raise Exception( 'Sequence file for simulated read %s does not exist!' % qname) if not os.path.exists(simMafFilePath): # import pdb # pdb.set_trace() raise Exception( 'Sequence alignment (MAF) for simulated read %s does not exist!' % qname) # Reading reference file [headers, seqs, quals] = read_fastq(simRefFilePath) simGeneName = headers[0] annotation = annotation_dict[ simGeneName] # Getting the correct annotation if len(samline_list) > len(annotation.items): # sys.stderr.write('\nWARNING: The number of partial alignments exceeds the number of exons for query %s! (%d / %d)' % (qname, len(samline_list), len(annotation.items))) s_num_oversplit_alignment += 1 # Reading MAF file to get the original position and length of the simulated read # Query name should be the second item maf_startpos = maf_length = 0 i = 0 with open(simMafFilePath, 'rU') as maffile: i += 1 for line in maffile: if line[0] == 's': elements = line.split() maf_qname = elements[1] if maf_qname == 'ref': # Have to remember data for the last reference before the actual read maf_startpos = int(elements[2]) maf_length = int(elements[3]) if maf_qname == simQName: # maf_startpos = int(elements[2]) # maf_length = int(elements[3]) break if maf_qname != simQName: # import pdb # pdb.set_trace() raise Exception('ERROR: could not find query %s in maf file %s' % (qname, simMafFileName)) # Calculating expected partial alignments from MAF and annotations # Saving "maf_length" to be able to check it later t_maf_length = maf_length # 1.
Calculating the index of the first exon # i - the index of the exon currently being considered i = 0 while annotation.items[i].getLength() < maf_startpos: maf_startpos -= annotation.items[i].getLength() i += 1 # Calculating expected partial alignments by filling up exons using maf_length expected_partial_alignments = [] while maf_length > 0: # try: # start = annotation.items[i].start + maf_startpos # end = annotation.items[i].end # except Exception: # import pdb # pdb.set_trace() #if not start < end: # import pdb # pdb.set_trace() start = annotation.items[i].start + maf_startpos end = annotation.items[i].end assert start <= end length = end - start + 1 if length <= maf_length: expected_partial_alignments.append((start, end)) maf_length -= length i += 1 else: expected_partial_alignments.append((start, start + maf_length)) maf_length = 0 i += 1 # Start position should only be considered for the first exon maf_startpos = 0 numparts = len(expected_partial_alignments) # For each part of expected partial alignments, these maps will count # how many real partial alignments overlap or equal it parthitmap = {(i + 1): 0 for i in xrange(numparts)} parteqmap = {(i + 1): 0 for i in xrange(numparts)} isSplitRead = False if len(expected_partial_alignments) > 1: s_maf_split_reads += 1 isSplitRead = True if RNAseqEval.getChromName( samline_list[0].rname) != RNAseqEval.getChromName( annotation.seqname): # import pdb # pdb.set_trace() s_num_badchrom_alignments += 1 else: if len(samline_list) != len(expected_partial_alignments): # sys.stderr.write('\nWARNING: suspicious number of alignments for query %s!' % qname) s_maf_suspicious_alignments += 1 # import pdb # pdb.set_trace() good_alignment = True k = 0 for samline in samline_list: # sl_startpos = samline.pos - 1 # SAM positions are 1-based sl_startpos = samline.pos reflength = samline.CalcReferenceLengthFromCigar() sl_endpos = sl_startpos + reflength # Comparing a samline to the corresponding expected partial alignment if k < len(expected_partial_alignments): expected_alignement = expected_partial_alignments[k] maf_startpos = expected_alignement[0] maf_endpos = expected_alignement[1] if abs(sl_startpos - maf_startpos) > allowed_inacc or abs( sl_endpos - maf_endpos) > allowed_inacc: good_alignment = False else: good_alignment = False k += 1 # Comparing a samline to all expected partial alignments for i in xrange(len(expected_partial_alignments)): expected_alignement = expected_partial_alignments[i] maf_startpos = expected_alignement[0] maf_endpos = expected_alignement[1] if interval_equals((sl_startpos, sl_endpos), (maf_startpos, maf_endpos), allowed_inacc): parteqmap[i + 1] += 1 if interval_overlaps((sl_startpos, sl_endpos), (maf_startpos, maf_endpos), allowed_inacc): parthitmap[i + 1] += 1 if good_alignment: s_maf_good_alignments += 1 if isSplitRead: s_maf_good_split_alignments += 1 else: # import pdb # pdb.set_trace() s_maf_bad_alignments += 1 if isSplitRead: s_maf_bad_split_alignments += 1 # TODO: check which alignments are bad and why # If the chromosome is different it's obviously a bad alignment if RNAseqEval.getChromName( samline.rname) == RNAseqEval.getChromName( annotation.seqname): # import pdb # pdb.set_trace() pass else: s_num_badchrom_alignments += 1 # Analyzing parthitmap and parteqmap oneHit = False allHits = True oneEq = False multiHit = False for i in xrange(numparts): if parthitmap[i + 1] > 0: oneHit = True if parthitmap[i + 1] == 0: allHits = False if parthitmap[i + 1] > 1: multiHit = True if parteqmap[i + 1] > 0: oneEq = True if oneHit:
s_maf_hit_one_part += 1 if isSplitRead: s_maf_split_hit_one_part += 1 if allHits: s_maf_hit_all_parts += 1 if isSplitRead: s_maf_split_hit_all_parts += 1 #import pdb #pdb.set_trace() if oneEq: s_maf_eq_one_part += 1 if isSplitRead: s_maf_split_eq_one_part += 1 if multiHit: s_maf_multihit_parts += 1 num_start_hits = 0 num_end_hits = 0 num_hits = 0 num_partial_alignements = len(samline_list) whole_alignment_hit = False for samline in samline_list: startpos = samline.pos - 1 reflength = samline.CalcReferenceLengthFromCigar() endpos = startpos + reflength if samline.flag & 16 == 0: readstrand = Annotation_formats.GFF_STRANDFW s_num_fw_strand += 1 else: readstrand = Annotation_formats.GFF_STRANDRV s_num_rv_strand += 1 chromname = RNAseqEval.getChromName(samline.rname) if chromname == RNAseqEval.getChromName( annotation.seqname ) and readstrand != annotation.strand and annotation.overlapsGene( startpos, endpos): s_num_potential_bad_strand += 1 if chromname == RNAseqEval.getChromName( annotation.seqname) and annotation.overlapsGene( startpos, endpos) and (not P_CHECK_STRAND or readstrand == annotation.strand): whole_alignment_hit = True s_partial_alignment_hits += 1 else: s_partial_alignment_misses += 1 # Checking how well partial alignments match exons startsItem = False endsItem = False for item in annotation.items: if item.overlapsItem(startpos, endpos): num_hits += 1 if item.startsItem(startpos, endpos): num_start_hits += 1 startsItem = True if item.endsItem(startpos, endpos): num_end_hits += 1 endsItem = True if startsItem and endsItem: s_num_start_end_hits += 1 s_num_start_hits += num_start_hits s_num_end_hits += num_end_hits # I'm allowing one start and one end not to match starts and ends of exons if (num_hits == num_partial_alignements) and ( num_start_hits + num_end_hits >= 2 * num_partial_alignements - 2): s_num_good_alignments += 1 # else: # if num_hits > 0: # import pdb # pdb.set_trace() if whole_alignment_hit: s_whole_alignment_hits += 1 else: s_whole_alignment_misses += 1 # Printing out results: NEW # Variable names match the RNA benchmark paper sys.stdout.write('\n\nAnalysis results:') sys.stdout.write('\nOriginal Samlines: %d' % report.num_alignments) sys.stdout.write( '\nUsable whole alignments (with valid CIGAR string): %d' % len(all_sam_lines)) sys.stdout.write('\nAnnotations: %d' % len(annotation_dict)) sys.stdout.write('\nMultiexon genes: %d' % s_num_multiexon_genes) sys.stdout.write('\nNumber of exon start hits: %d' % s_num_start_hits) sys.stdout.write('\nNumber of exon end hits: %d' % s_num_end_hits) sys.stdout.write('\nNumber of exon start and end hits: %d' % s_num_start_end_hits) sys.stdout.write('\nNumber of good whole alignments: %d' % s_num_good_alignments) sys.stdout.write('\nMAF: Correct alignment: %d' % s_maf_good_alignments) sys.stdout.write('\nMAF: Hit all parts: %d' % s_maf_hit_all_parts) sys.stdout.write('\nMAF: Hit at least one part: %d' % s_maf_hit_one_part) sys.stdout.write('\nMAF: Equals at least one part: %d' % s_maf_eq_one_part) sys.stdout.write('\nMAF: Number of split reads: %d' % s_maf_split_reads) sys.stdout.write('\nMAF: Correct alignment, SPLIT read: %d' % s_maf_good_split_alignments) sys.stdout.write('\nMAF: Hit all parts, SPLIT read: %d' % s_maf_split_hit_all_parts) sys.stdout.write('\nMAF: Hit at least one part, SPLIT read: %d' % s_maf_split_hit_one_part) sys.stdout.write('\nMAF: Equals at least one part, SPLIT read: %d' % s_maf_split_eq_one_part) sys.stdout.write('\nDone!\n')
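# A small worked example of the exon fill-up loop used above (the toy exon
# class, function name, and numbers are illustrative only). A read simulated
# from transcript offset 50 with 120 bases, over exons (100, 200) and
# (400, 500), yields two expected partial alignments:
#
#   expected_alignments([_ToyExon(100, 200), _ToyExon(400, 500)], 50, 120)
#   ->  [(150, 200), (400, 469)]
class _ToyExon(object):
    def __init__(self, start, end):
        self.start = start
        self.end = end
    def getLength(self):
        return self.end - self.start + 1

def expected_alignments(exons, maf_startpos, maf_length):
    i = 0
    # Skip exons that are fully consumed by the read's start offset
    while exons[i].getLength() < maf_startpos:
        maf_startpos -= exons[i].getLength()
        i += 1
    result = []
    while maf_length > 0:
        start = exons[i].start + maf_startpos
        end = exons[i].end
        length = end - start + 1
        if length <= maf_length:
            result.append((start, end))  # the read covers the rest of this exon
            maf_length -= length
        else:
            result.append((start, start + maf_length))  # the read ends inside this exon
            maf_length = 0
        i += 1
        maf_startpos = 0  # the start offset applies only to the first exon
    return result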
def run_poa_sequentially_v2(seq_path, out_consensus_file): temp_subseq_file = '%s/tmp.subseq.fasta' % (os.path.dirname(out_consensus_file)); temp_msa_file = '%s/tmp.subseq.fasta.pir' % (os.path.dirname(out_consensus_file)); # out_consensus_file = '%s/consensus-poa.fasta' % (os.path.dirname(seq_path)); out_consensus_file_chunks = '%s/tmp.consensus.chunks.fasta' % (os.path.dirname(out_consensus_file)); fp_out_all = open(out_consensus_file, 'w'); fp_out_chunks = open(out_consensus_file_chunks, 'w'); timestamp = strftime("%Y/%m/%d %H:%M:%S", gmtime()); fp_out_all.write('>Consensus_with_POA all %s\n' % (timestamp)); print 'seq_path = "%s"' % (seq_path); [ret_string, num_seqs, total_seq_len, average_seq_len, max_seq_len] = fastqparser.count_seq_length(seq_path); window_len = 5000; # window_len = 1000; # window_len = max_seq_len; start_coord = 0; while (start_coord < max_seq_len): end_coord = start_coord + window_len; if (end_coord > (max_seq_len - window_len)): end_coord = max_seq_len; sys.stderr.write('Window: start = %d, end = %d\n' % (start_coord, end_coord)); execute_command('%s/fastqfilter.py subseqs %s %d %d %s' % (SAMSCRIPTS_PATH, seq_path, start_coord, end_coord, temp_subseq_file)); # if (start_coord == 0 or end_coord == max_seq_len): # execute_command('%s/poaV2/poa -do_progressive -read_fasta %s -pir %s %s/poaV2/blosum80.mat' % (TOOLS_PATH, temp_subseq_file, temp_msa_file, TOOLS_PATH)); # execute_command('%s/poaV2/poa -do_progressive -read_fasta %s -pir %s %s/poaV2/all1.mat' % (TOOLS_PATH, temp_subseq_file, temp_msa_file, TOOLS_PATH)); # else: execute_command('%s/poaV2/poa -do_global -do_progressive -read_fasta %s -pir %s %s/poaV2/blosum80.mat' % (TOOLS_PATH, temp_subseq_file, temp_msa_file, TOOLS_PATH)); # execute_command('%s/poaV2/poa -do_global -do_progressive -read_fasta %s -pir %s %s/poaV2/all1.mat' % (TOOLS_PATH, temp_subseq_file, temp_msa_file, TOOLS_PATH)); timestamp = strftime("%Y/%m/%d %H:%M:%S", gmtime()); fp_out_chunks.write('>Consensus_with_POA %d-%d %s\n' % (start_coord, end_coord, timestamp)); [headers, seqs, quals] = fastqparser.read_fastq(temp_msa_file); cons_seq = ''; for i in xrange(0, len(seqs[0])): base_counts = {'A': 0, 'C': 0, 'T': 0, 'G': 0, '.': 0}; for j in xrange(0, len(seqs)): base_counts[seqs[j][i]] += 1; sorted_base_counts = sorted(base_counts.items(), key=operator.itemgetter(1)); # print sorted_base_counts; if (sorted_base_counts[-1][0] != '.'): cons_seq += sorted_base_counts[-1][0] fp_out_all.write('%s' % (cons_seq)); fp_out_chunks.write('%s\n' % (cons_seq)); # # print temp_subseq_file; # # print headers; # i = 0; # while (i < len(headers)): # if ('consensus' in headers[i]): # # print seqs[i]; # # print seqs[i].replace('.', ''); # chunk_seq = seqs[i].replace('.', ''); # fp_out_all.write('%s' % (chunk_seq)); # fp_out_chunks.write('%s\n' % (chunk_seq)); # break; # i += 1; # break; start_coord = end_coord; fp_out_all.write('\n'); fp_out_all.close(); fp_out_chunks.close();
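### The consensus step above is a per-column majority vote over the PIR MSA, where a
### winning '.' (gap) emits nothing. A standalone toy illustration of the same rule
### (not used by the pipeline); ties are broken arbitrarily by sort order, as above:
import operator;
def majority_vote_consensus(msa_rows):
    cons_seq = '';
    for i in xrange(0, len(msa_rows[0])):
        base_counts = {};
        for row in msa_rows:
            base_counts[row[i]] = base_counts.get(row[i], 0) + 1;
        sorted_base_counts = sorted(base_counts.items(), key=operator.itemgetter(1));
        if (sorted_base_counts[-1][0] != '.'):
            cons_seq += sorted_base_counts[-1][0];
    return cons_seq;
### Example: majority_vote_consensus(['AC.T', 'ACGT', 'AC.T']) returns 'ACT'.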
def processData(datafolder, resultfile, annotationfile, paramdict): split_qnames = False filename = '' if '--split-qnames' in paramdict: split_qnames = True filename = paramdict['--split-qnames'][0] filename_correct = filename + '_correct.names' filename_hitall = filename + '_hitall.names' filename_hitone = filename + '_hitone.names' filename_bad = filename + '_incorrect.names' filename_unmapped = filename + '_unmapped.names' printMap = False filename_mapping = '' if '--print_mapping' in paramdict: filename_mapping = paramdict['--print_mapping'][0] printMap = True file_correct = None file_hitall = None file_hitone = None file_bad = None file_unmapped = None folder = os.getcwd() # If splitting qnames into files, have to open files first if split_qnames: file_correct = open(os.path.join(folder, filename_correct), 'w+') file_hitall = open(os.path.join(folder, filename_hitall), 'w+') file_hitone = open(os.path.join(folder, filename_hitone), 'w+') file_bad = open(os.path.join(folder, filename_bad), 'w+') # Loading results SAM file report = EvalReport(ReportType.FASTA_REPORT) # not really needed, used for unmapped query names # Have to preserve the paramdict # paramdict = {} sys.stderr.write('\n(%s) Loading and processing SAM file with mappings ... ' % datetime.now().time().isoformat()) all_sam_lines = RNAseqEval.load_and_process_SAM(resultfile, paramdict, report, BBMapFormat = True) # Reading annotation file annotations = Annotation_formats.Load_Annotation_From_File(annotationfile) s_num_multiexon_genes = 0 mapfile = None if printMap: mapfile = open(filename_mapping, 'w+') # Hashing annotations according to name annotation_dict = {} for annotation in annotations: if annotation.genename in annotation_dict: sys.stderr.write('\nWARNING: annotation with name %s already in the dictionary!'
% annotation.genename) else: annotation_dict[annotation.genename] = annotation if len(annotation.items) > 1: s_num_multiexon_genes += 1 # Statistical information for evaluating the quality of mapping s_gene_hits = 0 s_gene_misses = 0 s_whole_alignment_hits = 0 s_whole_alignment_misses = 0 s_partial_alignment_hits = 0 s_partial_alignment_misses = 0 s_num_start_hits = 0 s_num_end_hits = 0 s_num_start_end_hits = 0 s_num_fw_strand = 0 s_num_rv_strand = 0 s_num_split_alignment = 0 s_num_oversplit_alignment = 0 # Alignments that have more parts than exons s_num_good_alignments = 0 s_num_badchrom_alignments = 0 s_maf_suspicious_alignments = 0 s_maf_bad_alignments = 0 s_maf_good_alignments = 0 s_maf_split_reads = 0 s_maf_good_split_alignments = 0 s_maf_bad_split_alignments = 0 s_maf_hit_all_parts = 0 s_maf_hit_one_part = 0 s_maf_eq_one_part = 0 s_maf_multihit_parts = 0 s_maf_split_hit_all_parts = 0 s_maf_split_hit_one_part = 0 s_maf_split_eq_one_part = 0 s_maf_miss_alignment = 0 s_maf_too_many_alignments = 0 s_num_potential_bad_strand = 0 allowed_inacc = Annotation_formats.DEFAULT_ALLOWED_INACCURACY # Allowing some shift in positions min_overlap = Annotation_formats.DEFAULT_MINIMUM_OVERLAP # Minimum overlap that is considered # Setting allowed_inaccuracy from parameters if '--allowed_inacc' in paramdict: allowed_inacc = int(paramdict['--allowed_inacc'][0]) elif '-ai' in paramdict: allowed_inacc = int(paramdict['-ai'][0]) # Setting minimum overlap from parameters (the long-form option is assumed to be '--min_overlap', mirroring '-mo') if '--min_overlap' in paramdict: min_overlap = int(paramdict['--min_overlap'][0]) elif '-mo' in paramdict: min_overlap = int(paramdict['-mo'][0]) # All samlines in a list should have the same query name for samline_list in all_sam_lines: qname = samline_list[0].qname isSplitAlignment = False if len(samline_list) > 1: s_num_split_alignment += 1 isSplitAlignment = True # Checking that all samlines in a list have the same qname for samline in samline_list[1:]: if samline.qname != qname: sys.stderr.write('\nWARNING: two samlines in the same list with different query names (%s/%s)' % (qname, samline.qname)) # Look for the first underscore in query name # Everything before that is the simulation folder name # Everything after that is the simulated query name pos = qname.find('_') if pos < 0: raise Exception('Invalid query name in results file (%s)!' % qname) simFolderKey = qname[:pos] if simFolderKey not in simFolderDict: # import pdb # pdb.set_trace() raise Exception('Bad simulation folder short name (%s)!' % simFolderKey) simFolder = simFolderDict[simFolderKey] simQName = qname[pos+1:] # Due to an error in data preparation, some extra processing is needed if simQName[:6] == 'SimG2_': simQName = simQName[6:] # if simFolderKey == 'SimG1': # simFileSuffix = 'g1' # elif simFolderKey == 'SimG2': # simFileSuffix = 'g2' # elif simFolderKey == 'SimG3': # simFileSuffix = 'g3' # else: # simFileSuffix = 'sd' simFileSuffix = 'sd' pos = simQName.find('_') pos2 = simQName.find('_part') if pos < 0: raise Exception('Invalid simulated query name in results file (%s)!'
% simQName) simQLetter = simQName[0] # Should always be S # BBMap separates a query into smaller parts it can manage # Extends query with '_part_#', which has to be ignored if pos2 != -1: simQName = simQName[:pos2] simRefNumber = int(simQName[1:pos]) simQNumber = int(simQName[pos+1:]) simFileName = simFileSuffix + '_%04d' % simRefNumber simRefFileName = simFileName + '.ref' simSeqFileName = simFileName + '.fastq' simMafFileName = simFileName + '.maf' simFilePath = os.path.join(datafolder, simFolder) simRefFilePath = os.path.join(simFilePath, simRefFileName) simSeqFilePath = os.path.join(simFilePath, simSeqFileName) simMafFilePath = os.path.join(simFilePath, simMafFileName) if not os.path.exists(simRefFilePath): # import pdb # pdb.set_trace() raise Exception('Reference file for simulated read %s does not exist!' % qname) if not os.path.exists(simSeqFilePath): # import pdb # pdb.set_trace() raise Exception('Sequence file for simulated read %s does not exist!' % qname) if not os.path.exists(simMafFilePath): # import pdb # pdb.set_trace() raise Exception('Sequence alignment (MAF) for simulated read %s does not exist!' % qname) # Reading reference file [headers, seqs, quals] = read_fastq(simRefFilePath) simGeneName = headers[0] annotation = annotation_dict[simGeneName] # Getting the correct annotation if len(samline_list) > len(annotation.items): # sys.stderr.write('\nWARNING: The number of partial alignments exceeds the number of exons for query %s! (%d / %d)' % (qname, len(samline_list), len(annotation.items))) s_num_oversplit_alignment += 1 # Reading MAF file to get the original position and length of the simulated read # Query name should be the second item maf_startpos = maf_length = 0 maf_strand = '0' maf_reflen = 0 i = 0 with open(simMafFilePath, 'rU') as maffile: i += 1 for line in maffile: if line[0] == 's': elements = line.split() maf_qname = elements[1] if maf_qname == 'ref': # Have to remember data for the last reference before the actual read maf_startpos = int(elements[2]) maf_length = int(elements[3]) maf_strand = elements[4] maf_reflen = int(elements[5]) if maf_qname == simQName: # maf_startpos = int(elements[2]) # maf_length = int(elements[3]) break if maf_qname != simQName: # import pdb # pdb.set_trace() raise Exception('ERROR: could not find query %s in maf file %s' % (qname, simMafFileName)) # IMPORTANT: If the reads were generated from an annotation on the reverse strand # expected partial alignments must be reversed if annotation.strand == Annotation_formats.GFF_STRANDRV: maf_startpos = maf_reflen - maf_length - maf_startpos # Saving "maf_length" and "maf_startpos" to be able to check them later t_maf_length = maf_length t_maf_startpos = maf_startpos # Calculating expected partial alignments from MAF and annotations # 1.
Calculating the index of the first exon # i - the index of the exon currently being considered i = 0 while annotation.items[i].getLength() < maf_startpos: maf_startpos -= annotation.items[i].getLength() i += 1 # Calculating expected partial alignments by filling up exons using maf_length expected_partial_alignments = [] while maf_length > 0: start = annotation.items[i].start + maf_startpos end = annotation.items[i].end assert start <= end # OLD: length = end-start+1 # KK: End is already indicating the position after the last base, so adding one when calculating length is not correct length = end - start if length <= maf_length: expected_partial_alignments.append((start, end)) maf_length -= length i += 1 else: expected_partial_alignments.append((start, start + maf_length)) maf_length = 0 i += 1 # Start position should only be considered for the first exon maf_startpos = 0 # import pdb # pdb.set_trace() numparts = len(expected_partial_alignments) # For each part of expected partial alignments, these maps will count # how many real partial alignments overlap or equal it parthitmap = {(i+1):0 for i in xrange(numparts)} parteqmap = {(i+1):0 for i in xrange(numparts)} isSplitRead = False if len(expected_partial_alignments) > 1: s_maf_split_reads += 1 isSplitRead = True oneHit = False allHits = False oneEq = False multiHit = False good_alignment = False has_miss_alignments = False if RNAseqEval.getChromName(samline_list[0].rname) != RNAseqEval.getChromName(annotation.seqname): # import pdb # pdb.set_trace() s_num_badchrom_alignments += 1 else: if len(samline_list) != len(expected_partial_alignments): # sys.stderr.write('\nWARNING: suspicious number of alignments for query %s!' % qname) s_maf_suspicious_alignments += 1 # import pdb # pdb.set_trace() good_alignment = True k = 0 for samline in samline_list: # sl_startpos = samline.pos - 1 # SAM positions are 1-based sl_startpos = samline.pos reflength = samline.CalcReferenceLengthFromCigar() sl_endpos = sl_startpos + reflength # Comparing a samline to the corresponding expected partial alignment if k < len(expected_partial_alignments): expected_alignement = expected_partial_alignments[k] maf_startpos = expected_alignement[0] maf_endpos = expected_alignement[1] if abs(sl_startpos - maf_startpos) > allowed_inacc or abs(sl_endpos - maf_endpos) > allowed_inacc: good_alignment = False else: good_alignment = False k += 1 # Comparing a samline to all expected partial alignments for i in xrange(len(expected_partial_alignments)): expected_alignement = expected_partial_alignments[i] maf_startpos = expected_alignement[0] maf_endpos = expected_alignement[1] if interval_equals((sl_startpos, sl_endpos), (maf_startpos, maf_endpos), allowed_inacc, min_overlap): parteqmap[i+1] += 1 if interval_overlaps((sl_startpos, sl_endpos), (maf_startpos, maf_endpos), allowed_inacc, min_overlap): parthitmap[i+1] += 1 has_miss_alignments = False for expected_alignement in expected_partial_alignments: maf_startpos = expected_alignement[0] maf_endpos = expected_alignement[1] overlap = False for samline in samline_list: sl_startpos = samline.pos reflength = samline.CalcReferenceLengthFromCigar() sl_endpos = sl_startpos + reflength if interval_overlaps((sl_startpos, sl_endpos), (maf_startpos, maf_endpos), allowed_inacc, min_overlap): overlap = True if not overlap: has_miss_alignments = True break if len(samline_list) < len(expected_partial_alignments): s_maf_too_many_alignments += 1 # Testing the evaluation process # import pdb # pdb.set_trace() if len(samline_list) !=
len(expected_partial_alignments): good_alignment = False if good_alignment: s_maf_good_alignments += 1 # Writing qnames to files if split_qnames: file_correct.write(samline_list[0].qname + '\n') if isSplitRead: s_maf_good_split_alignments += 1 else: # import pdb # pdb.set_trace() s_maf_bad_alignments += 1 if isSplitRead: s_maf_bad_split_alignments += 1 # TODO: check which alignments are bad and why # If the chromosome is different it's obviously a bad alignment if RNAseqEval.getChromName(samline.rname) == RNAseqEval.getChromName(annotation.seqname): # import pdb # pdb.set_trace() pass else: s_num_badchrom_alignments += 1 # Analyzing parthitmap and parteqmap oneHit = False allHits = True oneEq = False multiHit = False for i in xrange(numparts): if parthitmap[i+1] > 0: oneHit = True if parthitmap[i+1] == 0: allHits = False if parthitmap[i+1] > 1: multiHit = True if parteqmap[i+1] > 0: oneEq = True if printMap: status = 'INCORRECT' if good_alignment: status = 'CORRECT' elif allHits: status = 'HITALL' elif oneHit: status = 'HITONE' mapfile.write('QNAME: %s, STATUS: %s\n\n' % (samline_list[0].qname, status)) mapfile.write('EXPECTED (%s, %s):\t' % (RNAseqEval.getChromName(annotation.seqname), annotation.strand)) for epa in expected_partial_alignments: mapfile.write('(%d, %d)\t' % (epa[0], epa[1])) mapfile.write('\n') if samline_list[0].flag & 16 == 0: readstrand = Annotation_formats.GFF_STRANDFW else: readstrand = Annotation_formats.GFF_STRANDRV mapfile.write('ACTUAL (%s, %s):\t' % (RNAseqEval.getChromName(samline_list[0].rname), readstrand)) for samline in samline_list: mapfile.write('(%d, %d)\t' % (samline.pos, samline.pos + samline.CalcReferenceLengthFromCigar())) mapfile.write('\n\n') if oneHit: s_maf_hit_one_part += 1 if isSplitRead: s_maf_split_hit_one_part += 1 # Writing qnames to files if split_qnames: file_hitone.write(samline_list[0].qname + '\n') if not allHits: if '--debug' in paramdict: import pdb pdb.set_trace() # Misses are calculated only for alignments that have at least one hit if has_miss_alignments: s_maf_miss_alignment += 1 else: # Writing qnames to files if split_qnames: file_bad.write(samline_list[0].qname + '\n') # if '--debug' in paramdict: # import pdb # pdb.set_trace() if allHits: s_maf_hit_all_parts += 1 if isSplitRead: s_maf_split_hit_all_parts += 1 # Writing qnames to files if split_qnames: file_hitall.write(samline_list[0].qname + '\n') # Sanity check if '--debug' in paramdict and good_alignment and not allHits: import pdb pdb.set_trace() pass if oneEq: s_maf_eq_one_part += 1 if isSplitRead: s_maf_split_eq_one_part += 1 if multiHit: s_maf_multihit_parts += 1 num_start_hits = 0 num_end_hits = 0 num_hits = 0 num_partial_alignements = len(samline_list) whole_alignment_hit = False for samline in samline_list: startpos = samline.pos - 1 reflength = samline.CalcReferenceLengthFromCigar() endpos = startpos + reflength if samline.flag & 16 == 0: readstrand = Annotation_formats.GFF_STRANDFW s_num_fw_strand += 1 else: readstrand = Annotation_formats.GFF_STRANDRV s_num_rv_strand += 1 chromname = RNAseqEval.getChromName(samline.rname) if chromname == RNAseqEval.getChromName(annotation.seqname) and readstrand != annotation.strand and annotation.overlapsGene(startpos, endpos): s_num_potential_bad_strand += 1 if chromname == RNAseqEval.getChromName(annotation.seqname) and annotation.overlapsGene(startpos, endpos) and (not P_CHECK_STRAND or readstrand == annotation.strand): whole_alignment_hit = True s_partial_alignment_hits += 1 else: s_partial_alignment_misses += 1 # Checking
how well partial alignments match exons startsItem = False endsItem = False for item in annotation.items: if item.overlapsItem(startpos, endpos): num_hits += 1 if item.startsItem(startpos, endpos): num_start_hits += 1 startsItem = True if item.endsItem(startpos, endpos): num_end_hits += 1 endsItem = True if startsItem and endsItem: s_num_start_end_hits += 1 s_num_start_hits += num_start_hits s_num_end_hits += num_end_hits # I'm allowing one start and one end not to match starts and ends of exons if (num_hits == num_partial_alignements) and (num_start_hits + num_end_hits >= 2*num_partial_alignements - 2): s_num_good_alignments += 1 # else: # if num_hits > 0: # import pdb # pdb.set_trace() if whole_alignment_hit: s_whole_alignment_hits += 1 else: s_whole_alignment_misses += 1 if printMap: mapfile.close() # Writing unmapped query names to a file, if so specified if split_qnames: with open(filename_unmapped, 'w+') as file_unmapped: file_unmapped.write(report.get_unmapped_names()) # Printing out results: NEW # Variable names match the RNA benchmark paper sys.stdout.write('\n\nAnalysis results:') sys.stdout.write('\nOriginal Samlines: %d' % report.num_alignments) sys.stdout.write('\nUsable whole alignments (with valid CIGAR string): %d' % len(all_sam_lines)) sys.stdout.write('\nAnnotations: %d' % len(annotation_dict)) sys.stdout.write('\nMultiexon genes: %d' % s_num_multiexon_genes) sys.stdout.write('\nNumber of exon start hits: %d' % s_num_start_hits) sys.stdout.write('\nNumber of exon end hits: %d' % s_num_end_hits) sys.stdout.write('\nNumber of exon start and end hits: %d' % s_num_start_end_hits) sys.stdout.write('\nNumber of good whole alignments: %d' % s_num_good_alignments) sys.stdout.write('\nNumber of alignments mapped to an incorrect chromosome: %d' % s_num_badchrom_alignments) sys.stdout.write('\nMAF: Correct alignment: %d' % s_maf_good_alignments) sys.stdout.write('\nMAF: Hit all parts: %d' % s_maf_hit_all_parts) sys.stdout.write('\nMAF: Hit at least one part: %d' % s_maf_hit_one_part) sys.stdout.write('\nMAF: Equals at least one part: %d' % s_maf_eq_one_part) sys.stdout.write('\nMAF: Number of split reads: %d' % s_maf_split_reads) sys.stdout.write('\nMAF: Correct alignment, SPLIT read: %d' % s_maf_good_split_alignments) sys.stdout.write('\nMAF: Hit all parts, SPLIT read: %d' % s_maf_split_hit_all_parts) sys.stdout.write('\nMAF: Hit at least one part, SPLIT read: %d' % s_maf_split_hit_one_part) sys.stdout.write('\nMAF: Equals at least one part, SPLIT read: %d' % s_maf_split_eq_one_part) sys.stdout.write('\nMAF: Partial alignment that misses: %d' % s_maf_miss_alignment) sys.stdout.write('\nMAF: More alignments than expected: %d' % s_maf_too_many_alignments) sys.stdout.write('\nMAF: Multihit parts (fragmented) alignments: %d' % s_maf_multihit_parts) sys.stdout.write('\nDone!\n') # Closing files with names if split_qnames: file_correct.close() file_hitall.close() file_hitone.close() file_bad.close()
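# processData() expects paramdict to map option strings to lists of values, as
# parsed from the command line elsewhere in this script. A hypothetical
# invocation (paths and values below are illustrative, not real data):
#
#   paramdict = {'--allowed_inacc': ['10'], '-mo': ['5'],
#                '--split-qnames': ['eval/qnames']}
#   processData('data/simulations', 'results/mappings.sam',
#               'annotations/genes.gtf', paramdict)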
def extract_alternate_contigs(single_contig_file, reads_file, out_alt_ctg_file, ref_file=''): ### Generate file paths for some temporary files. path_aligns_basename = '%s/tmp.allreads' % (os.path.dirname(out_alt_ctg_file)); path_aligns = '%s.sam' % (path_aligns_basename); path_aligns_sorted_basename = '%s.sorted' % (path_aligns_basename); path_aligns_sorted_sam = '%s.sam' % (path_aligns_sorted_basename); path_alt_contig_sams = '%s.altctgs.sam' % (path_aligns_basename); if (not os.path.exists(os.path.dirname(out_alt_ctg_file))): os.makedirs(os.path.dirname(out_alt_ctg_file)); ### Generate alignments. # execute_command('%s/graphmap/bin/Linux-x64/graphmap -a anchor -b 3 -r %s -d %s -o %s' % (TOOLS_PATH, single_contig_file, reads_file, path_aligns)); # execute_command('samtools view -Sb %s | samtools sort - %s && samtools view -h %s.bam > %s' % (path_aligns, path_aligns_sorted_basename, path_aligns_sorted_basename, path_aligns_sorted_sam)); [ctg_headers, ctg_seqs, ctg_quals] = fastqparser.read_fastq(single_contig_file); [headers, all_sam_lines] = utility_sam.LoadSAM(path_aligns_sorted_sam); sys.stderr.write('Number of lines in the original SAM file: %d\n' % (len(all_sam_lines))); sam_lines = []; for sam_line in all_sam_lines: if (sam_line.IsMapped() == False): continue; seq_len = len(sam_line.seq) - sam_line.clip_count_front - sam_line.clip_count_back; cigop_counts = sam_line.CountAlignmentOps(); ### Check if the CIGAR string is actually in the extended format. if ('M' in cigop_counts): sys.stderr.write('Warning: alignment does not contain the *extended* CIGAR format! Exiting.\n'); exit(1); else: matches = cigop_counts['=']; errors = cigop_counts['X'] + cigop_counts['D'] + cigop_counts['I']; if (float(matches) / float(seq_len) >= 0.70 and float(errors) / float(seq_len) < 0.40): sam_lines.append(sam_line); sys.stderr.write('Number of filtered SAM lines (only mapped and with errors below threshold): %d\n' % (len(sam_lines))); fp_out_alt_ctg = open(out_alt_ctg_file, 'w'); fp_out_alt_ctg_sams = open(path_alt_contig_sams, 'w'); fp_out_alt_ctg_sams.write('\n'.join(headers) + '\n'); ### Find alternate contigs from alignments. sams_to_process = sam_lines; coverage = 0; while (coverage < 100 and len(sams_to_process) > 0): coverage += 1; print '---------------------------------------'; print 'Coverage = %d' % (coverage); sys.stderr.write('Number of alignments in pool: %d\n' % (len(sams_to_process))); contig_sams = []; unused_sams = []; i = 0; candidate_read = i; contig_sams.append(sams_to_process[candidate_read]); # for candidate_read in xrange((i+1), len(sams_to_process)): start1 = sams_to_process[candidate_read].pos - 1; end1 = start1 + sams_to_process[candidate_read].CalcReferenceLengthFromCigar(); print 'candidate: start = %d, end = %d' % (start1, end1); while ((candidate_read + 1) < len(sams_to_process)): max_overlap_len = 0; max_overlap_id = -1; # j = candidate_read + 1; # while (j < len(sams_to_process)): for j in xrange(candidate_read + 1, len(sams_to_process)): overlap_len = check_overlap(sams_to_process[candidate_read], sams_to_process[j], 0); if (overlap_len == 0): print 'break 1'; print ' j = %d (in the range of %d to %d)' % (j, candidate_read + 1, len(sams_to_process)); break; elif (overlap_len == -1 or overlap_len == -2): ### -1 is for contained sequences, and -2 is for overlaps which are below the threshold.
# j += 1; continue; if (max_overlap_id == -1 or overlap_len >= max_overlap_len): max_overlap_len = overlap_len; max_overlap_id = j; # j += 1; if (max_overlap_id > 0): print ' starting read = %d' % (candidate_read); print ' candidate_read = %d' % (max_overlap_id); print ' max_overlap_len = %d' % (max_overlap_len); print ' unused overlapping reads: %d - %d' % ((candidate_read + 1), max_overlap_id); start1 = sams_to_process[max_overlap_id].pos - 1; end1 = start1 + sams_to_process[max_overlap_id].CalcReferenceLengthFromCigar(); print ' candidate: start = %d, end = %d' % (start1, end1); unused_sams += sams_to_process[(candidate_read + 1):max_overlap_id]; candidate_read = max_overlap_id; contig_sams.append(sams_to_process[candidate_read]); else: print 'break 2'; break; print ' unused reads: %d - %d' % ((candidate_read + 1), len(sams_to_process)); unused_sams += sams_to_process[(candidate_read + 1):len(sams_to_process)]; sams_to_process = unused_sams + []; # if ((candidate_read + 1) == len(sam_lines)): # break; # i += 1; # max_overlap_len = 0; # max_overlap_id = -1; # while (i < len(sam_lines)): # overlap_len = check_overlap(sam_lines[candidate_read], sam_lines[i + 1]); # if ((i + 1) >= len(sam_lines) or overlap_len <= 0): # break; # else: # unused_sams.append(sam_lines[i]); # overlap_len = check_overlap(sam_lines[candidate_read], sam_lines[i]); # if (overlap_len >= max_overlap_len): # max_overlap_len = overlap_len; # max_overlap_id = i; # i += 1; # contig_sams.append(sam_lines[candidate_read]); # # candidate_read = i; # if (max_overlap_id > 0): # candidate_read = max_overlap_id; # else: # break; # i += 1; print ' after coverage %d:' % (coverage); print ' len(sams_to_process) = %d' % (len(sams_to_process)); print ' len(contig_sams) = %d' % len(contig_sams); print ' len(unused_sams) = %d' % len(unused_sams); [new_contig, non_clipped_len, new_contig_cigar] = construct_contig_from_overlapping_sams(ctg_seqs, contig_sams); test_sam_line = utility_sam.SAMLine(); test_sam_line.seq = new_contig; test_sam_line.cigar = new_contig_cigar; print 'test_sam_line.CalcReadLengthFromCigar() = %d' % (test_sam_line.CalcReadLengthFromCigar()); print 'test_sam_line.CalcReferenceLengthFromCigar() = %d' % (test_sam_line.CalcReferenceLengthFromCigar()); print 'len(test_sam_line.seq) = %d' % (len(test_sam_line.seq)); print '********************* len(new_contig) = %d, non_clipped_len = %d' % (len(new_contig), non_clipped_len); exit(1); if (float(non_clipped_len) < 0.85*float(len(ctg_seqs[0]))): # print 'Here I am!'; # exit(1); continue; else: print '++++++++++++++++++++++++++++++++++++++++'; fp_out_alt_ctg.write('>%s %d\n' % (ctg_headers[0], coverage)); fp_out_alt_ctg.write('%s\n' % (new_contig)); for sam_line in contig_sams: fp_out_alt_ctg_sams.write(sam_line.original_line + '\n'); fp_out_alt_ctg_sams.write('\n'); fp_out_alt_ctg_sams.close(); fp_out_alt_ctg.close();
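### extract_alternate_contigs() depends on check_overlap(), defined elsewhere. Its call
### site and the comment above it ("-1 is for contained sequences, and -2 is for overlaps
### which are below the threshold") imply roughly the following contract for alignments
### sorted by position; the body below is an assumed sketch, not the real implementation:
def check_overlap(sam1, sam2, min_overlap_len=0):
    start1 = sam1.pos - 1;
    end1 = start1 + sam1.CalcReferenceLengthFromCigar();
    start2 = sam2.pos - 1;
    end2 = start2 + sam2.CalcReferenceLengthFromCigar();
    if (start2 >= end1):
        return 0;       ### No overlap; since alignments are sorted, later reads cannot overlap either.
    if (end2 <= end1):
        return -1;      ### sam2 is contained within sam1.
    overlap_len = end1 - start2;
    if (overlap_len < min_overlap_len):
        return -2;      ### Overlap is below the threshold.
    return overlap_len;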