def calculate_kmer_distribution(taxa, bin_width):
    """Return a histogram of inter-label distances for a cmap, binned by bin_width.

    Results are cached on disk next to the cmap:
      <taxa>.<bin_width>.lengths -- the binned histogram (fast path),
      <taxa>.raw_lengths         -- the raw per-label distances.
    Either cache is regenerated if missing or unreadable.

    :param taxa: path to a .cmap file
    :param bin_width: histogram bin width (same units as label positions)
    :returns: mapping of bin upper edge -> count (insertion-ordered when computed)
    """
    # Escape the dot: the original pattern '.cmap' would match any char + 'cmap'.
    lengths_file = re.sub(r'\.cmap', '.' + str(bin_width) + '.lengths', taxa)
    # Fast path: a previously binned distribution cached on disk.
    try:
        with open(lengths_file) as i_file:
            lengths = {}
            for line in i_file:
                line_data = line.split("\t")
                lengths[float(line_data[0])] = int(line_data[1])
            return lengths
    except (IOError, OSError, ValueError, IndexError):
        pass  # missing or corrupt cache -- recompute below

    raw_lengths_file = re.sub(r'\.cmap', '.raw_lengths', taxa)
    raw_lengths = []
    max_length = -1
    try:
        # Raw-length cache is a single tab-separated line of floats.
        with open(raw_lengths_file) as i_file:
            for line in i_file:
                raw_lengths = [float(x) for x in line.strip().split("\t")]
                max_length = max(raw_lengths)
    except (IOError, OSError, ValueError):
        # No usable raw-length cache: derive distances between consecutive
        # channel-1 labels from the cmap itself.
        cmap = CmapFile(taxa)
        current_contig_id = None
        previous_position = 0.0
        max_length = -1.0
        for label in cmap.parse():
            if label.channel != "1":
                continue
            if label.contig_id != current_contig_id:
                current_contig_id = label.contig_id
                previous_position = 0.0
            length = label.position - previous_position
            # Bug fix: advance the cursor so each length is the gap to the
            # PREVIOUS label, not the distance from the contig start.
            previous_position = label.position
            raw_lengths.append(length)
            if length > max_length:
                max_length = length
        with open(raw_lengths_file, 'w') as o_file:
            for raw_length in raw_lengths:
                o_file.write(str(raw_length) + "\t")

    # Build bins (prev_edge, bin_max] covering everything up to max_length.
    lengths = OrderedDict()
    bin_max = 0
    while bin_max < max_length:
        bin_max += bin_width
        lengths[bin_max] = 0
    for raw_length in raw_lengths:
        for bin_max in lengths:
            # Bug fix: use <= so a length equal to the top bin edge (e.g. the
            # maximum when it is an exact multiple of bin_width) is counted.
            if raw_length <= bin_max:
                lengths[bin_max] += 1
                break
    with open(lengths_file, 'w') as o_file:
        for bin_max in lengths:
            o_file.write(str(bin_max) + "\t" + str(lengths[bin_max]) + "\n")
    return lengths
def createQualityObject(self):
    """Aggregate assembly statistics from this step's cmap files into a Quality object.

    Scans every per-contig .cmap in the step directory for contig counts,
    lengths, N50 and label occurrence stats, reads the molecule-alignment
    count from the step's output file, then stores the result on
    self.quality and persists it.

    :raises Exception: if the step has not finished yet.
    """
    if not self.isComplete():
        raise Exception("The step is not complete yet")
    count = 0
    total_length = 0.0
    lengths = []
    label_occurrences = 0
    label_count = 0
    # This glob relies on there not being a merged .cmap in the same
    # directory (i.e. Summarize has not been run).
    for cmap_name in glob(self.getStepDir() + "/*.cmap"):
        contigs = set()
        cmap_file = CmapFile(cmap_name)
        for label in cmap_file.parse():
            # Count each contig (and its length) only once per file.
            if not label.contig_id in contigs:
                count += 1
                total_length += label.contig_len
                contigs.add(label.contig_id)
                lengths.append(label.contig_len)
            label_occurrences += label.occurrences
            label_count += 1
    sorted_lengths = sorted(lengths, reverse=True)
    minlen = sorted_lengths[-1]
    maxlen = sorted_lengths[0]
    # N50: length of the contig at which the running total (longest first)
    # first reaches half the assembly length.  Reuse the already-sorted list
    # instead of sorting a second time.
    n50 = 0
    length_included_in_n50 = 0
    target_length_included = total_length / 2.0
    for length in sorted_lengths:
        length_included_in_n50 += length
        if length_included_in_n50 >= target_length_included:
            n50 = length
            break
    # Bug fix: initialise nummaps so a file without any contig ("C") line
    # cannot leave it undefined (NameError).
    nummaps = "0"
    with open(self.getOutputFile()) as contig_file:
        for line in contig_file:
            # startswith() also tolerates empty lines, unlike line[0].
            if not line.startswith("C"):
                continue
            contig_data = line.split(",")
            # Last field looks like "NumMaps=<n>"; keep the value after "=".
            nummaps = contig_data[-1].split("=")[-1]
    self.quality = Quality(length=total_length, count=count,
                           average_length=total_length / count, n50=n50,
                           min=minlen, max=maxlen,
                           average_occurrences=float(label_occurrences) / label_count,
                           total_mols_aligned=nummaps,
                           avg_mols_aligned=float(nummaps) / count)
    self.saveQualityObjectToFile()
class tCmapFile(tFile_base):
    """Unit tests for CmapFile: extension lookup, parsing and label serialisation."""

    def setUp(self):
        # Touch (truncate/create) the input file so CmapFile has something to open.
        with open(self.input_file, "w"):
            self.obj = CmapFile(self.input_file)

    def test_getExtension(self):
        self.assertEqual("cmap", CmapFile.getExtension())

    def test_parse(self):
        self.assertEqual(CmapFile_iter(self.input_file), self.obj.parse())

    def test_write(self):
        label = Mock(contig_id=1, contig_len=1.0, contig_site_count=1,
                     label_id=1, channel="1", position=1.0, stdev=1.0,
                     coverage=1.0, occurrences=1, snr_mean=1.0,
                     snr_stdev=1.0, snr_count=1.0)
        # Column order matches the cmap record layout written by CmapFile.write.
        fields = (label.contig_id, label.contig_len, label.contig_site_count,
                  label.label_id, label.channel, label.position, label.stdev,
                  label.coverage, label.occurrences, label.snr_mean,
                  label.snr_stdev, label.snr_count)
        expected = "\t".join(str(field) for field in fields) + "\n"
        out = StringIO()
        self.obj.write(label, out)
        self.assertEqual(expected, out.getvalue())
class AssessReferenceAlignment(object):
    """Assess an XMAP alignment against its anchor (_r) and query (_q) cmaps.

    Classifies labels as true positives, false negatives (in anchor, skipped
    by the alignment) and false positives (in query, unmatched between two
    aligned pairs), and offers neighbor-distance utilities over loci dicts of
    the form {contig_id: [position, ...]}.
    """

    def __init__(self, xmap_file_name):
        # Work relative to the xmap's own directory so the companion
        # "<name>_r.cmap" / "<name>_q.cmap" files resolve.
        file_name_parts = xmap_file_name.split('/')
        file_name_parts_length = len(file_name_parts)
        if file_name_parts_length > 1:
            self.workspace = "/".join(file_name_parts[0:(file_name_parts_length - 1)])
        else:
            self.workspace = "."
        with CD(self.workspace):
            file_name = file_name_parts[file_name_parts_length - 1]
            self.xmap = XmapFile(file_name)
            self.anchor_cmap = CmapFile(file_name.replace(".xmap", "_r.cmap"))
            self.query_cmap = CmapFile(file_name.replace(".xmap", "_q.cmap"))
        # Matches one "(anchor_label,query_label)" pair from an xmap alignment
        # string.  Raw string so the backslashes reach the regex engine intact.
        self.ALIGNED_LABELS = re.compile(r"\(([\d]+),([\d]+)\)")

    def extractTruePositives(self):
        """Return {anchor_id: [position, ...]} for every aligned anchor label."""
        self.true_positive_labels = {}
        self.true_positive_locations = {}
        for alignment in self.xmap.parse():
            anchor = alignment.anchor_id
            if not anchor in self.true_positive_labels:
                self.true_positive_labels[anchor] = set()
            if not anchor in self.true_positive_locations:
                self.true_positive_locations[anchor] = []
            for label_pair in self.ALIGNED_LABELS.finditer(alignment.alignment):
                self.true_positive_labels[anchor].add(int(label_pair.group(1)))
        # Second pass over the anchor cmap converts label ids to positions.
        for label in self.anchor_cmap.parse():
            if not label.contig_id in self.true_positive_labels:
                continue
            if label.label_id in self.true_positive_labels[label.contig_id]:
                self.true_positive_locations[label.contig_id].append(label.position)
        return self.true_positive_locations

    def extractFalseNegatives(self):
        # False negative labels are present in the anchor, not in the query:
        # any anchor label id falling in the gap between two consecutively
        # aligned anchor labels was skipped by the alignment.
        self.false_negative_labels = {}
        self.false_negative_locations = {}
        for alignment in self.xmap.parse():
            anchor = alignment.anchor_id
            if not anchor in self.false_negative_labels:
                self.false_negative_labels[anchor] = set()
            if not anchor in self.false_negative_locations:
                self.false_negative_locations[anchor] = []
            previous_label = None
            for label_pair in self.ALIGNED_LABELS.finditer(alignment.alignment):
                anchor_label = int(label_pair.group(1))
                if previous_label is None:
                    previous_label = anchor_label
                    continue
                for i in xrange(previous_label + 1, anchor_label):
                    self.false_negative_labels[anchor].add(i)
                previous_label = anchor_label
        for label in self.anchor_cmap.parse():
            if not label.contig_id in self.false_negative_labels:
                continue
            if label.label_id in self.false_negative_labels[label.contig_id]:
                self.false_negative_locations[label.contig_id].append(label.position)
        return self.false_negative_locations

    def extractFalsePositives(self):
        """Return {anchor_id: [position, ...]} of query-only labels projected
        onto the anchor via their offset from the last true-positive label."""
        self.false_positive_labels = {}
        for alignment in self.xmap.parse():
            anchor = alignment.anchor_id
            query = alignment.query_id
            if not query in self.false_positive_labels:
                self.false_positive_labels[query] = {}
            previous_label_pair = None
            for label_pair in self.ALIGNED_LABELS.finditer(alignment.alignment):
                if previous_label_pair is None:
                    previous_label_pair = label_pair
                    continue
                previous_query_label = int(previous_label_pair.group(2))
                query_label = int(label_pair.group(2))
                # Query label ids run backwards for "-" orientation alignments.
                if alignment.orientation == "+":
                    start = previous_query_label + 1
                    stop = query_label
                else:
                    start = query_label + 1
                    stop = previous_query_label
                for i in xrange(start, stop):
                    self.false_positive_labels[query][i] = {"anchor_id": anchor,
                                                            "anchor_last_true_positive": int(previous_label_pair.group(1)),
                                                            "query_last_true_positive": int(previous_label_pair.group(2))}
                previous_label_pair = label_pair
        false_positive_offsets = {}
        # NOTE(review): assumes the first label encountered in the query cmap
        # is never itself a false positive (last_true_positive would be None).
        last_true_positive = None
        for label in self.query_cmap.parse():
            if not label.contig_id in self.false_positive_labels:
                last_true_positive = label
                continue
            if not label.label_id in self.false_positive_labels[label.contig_id]:
                last_true_positive = label
                continue
            false_positive = self.false_positive_labels[label.contig_id][label.label_id]
            anchor = false_positive["anchor_id"]
            anchor_label = false_positive["anchor_last_true_positive"]
            if not anchor in false_positive_offsets:
                false_positive_offsets[anchor] = {}
            if not anchor_label in false_positive_offsets[anchor]:
                false_positive_offsets[anchor][anchor_label] = []
            false_positive_offsets[anchor][anchor_label].append(label.position - last_true_positive.position)
        self.false_positive_locations = {}
        for label in self.anchor_cmap.parse():
            if not label.contig_id in false_positive_offsets:
                continue
            if not label.label_id in false_positive_offsets[label.contig_id]:
                continue
            if not label.contig_id in self.false_positive_locations:
                self.false_positive_locations[label.contig_id] = []
            for offset in false_positive_offsets[label.contig_id][label.label_id]:
                self.false_positive_locations[label.contig_id].append(label.position + offset)
        return self.false_positive_locations

    def extractPartialMatches(self, output_name='partial_matches.xmap'):
        """Collect alignments covering <90% of their query and also write them
        to output_name.  Returns {anchor_id: [(anchor_start, anchor_end), ...]}."""
        self.partial_match_locations = {}
        with open(output_name, 'w') as o_file:
            for align in self.xmap.parse():
                proportion = abs(align.query_start - align.query_end) / float(align.query_len)
                if proportion < 0.9:
                    anchor = align.anchor_id  # bug fix: was align.anchor (no such attr)
                    if not anchor in self.partial_match_locations:
                        self.partial_match_locations[anchor] = []
                    # Bug fix: list.append takes ONE argument; store the span
                    # as a tuple (the original raised TypeError).
                    self.partial_match_locations[anchor].append((align.anchor_start, align.anchor_end))
                    # Bug fix: was xfile.write -- an undefined name.
                    self.xmap.write(align, o_file)
        return self.partial_match_locations

    def extractSequenceContexts(self, loci):
        # Not implemented yet.
        pass

    def processSeqeuenceContexts(self, fasta_file, motif):
        """For each record in fasta_file, print whether it contains a gap run
        (>=7 Ns) and whether it contains any single-base variant of motif.

        (Method name typo preserved -- renaming would break callers.)
        """
        # Enumerate every 1-SNV neighbor of the motif.
        snvs = set()
        for i in xrange(0, len(motif)):
            for base in ['A', 'T', 'C', 'G']:
                if base == motif[i]:
                    continue
                snv = motif[0:i] + base + motif[i + 1:len(motif)]
                snvs.add(snv)
        # Bug fix: header was space-separated while rows are tab-separated.
        print("HasGap\tHasSNV")
        for record in SeqIO.parse(fasta_file, 'fasta'):
            output = "0"
            if "NNNNNNN" in record.seq or "nnnnnnn" in record.seq:
                output = "1"
            contains_snv = False
            for snv in snvs:
                if snv in record.seq:
                    contains_snv = True
            if contains_snv:
                output += "\t1"
            else:
                output += "\t0"
            print(output)

    def findNearestNeighbors(self, loci, neighbor_locis):
        """For each locus, distance to its nearest same-contig neighbor across
        all dicts in neighbor_locis; loci with no candidate are skipped.

        :returns: {contig_id: [nearest_distance, ...]}
        """
        neighbors = {}
        for chrom in loci:  # renamed from 'chr' -- don't shadow the builtin
            if not chrom in neighbors:
                neighbors[chrom] = []
            for locus in loci[chrom]:
                nearest_dist = None
                for neighbor_loci in neighbor_locis:
                    if not chrom in neighbor_loci:
                        continue
                    for neighbor_locus in neighbor_loci[chrom]:
                        dist = abs(locus - neighbor_locus)
                        if nearest_dist is None or dist < nearest_dist:
                            nearest_dist = dist
                if nearest_dist is not None:
                    neighbors[chrom].append(nearest_dist)
        return neighbors

    def findLabelsWithNearNeighbors(self, loci, neighbor_locis, threshold=301):
        """Count loci whose nearest neighbor lies closer than threshold.

        Bug fixes: the original referenced the undefined names 'af' and
        'nearest_neighgbors', and ignored its threshold parameter by
        hard-coding 301 in the comparison.
        """
        nearest_neighbors = self.findNearestNeighbors(loci, neighbor_locis)
        offending_count = 0
        for chrom in nearest_neighbors:
            for distance in nearest_neighbors[chrom]:
                if distance < threshold:
                    offending_count += 1
        return offending_count