def calculate_kmer_distribution(taxa, bin_width):
    """Return a histogram of inter-label distances for a cmap, binned by bin_width.

    Results are cached on disk next to the cmap:
      <taxa>.<bin_width>.lengths -- the binned histogram (fast path),
      <taxa>.raw_lengths         -- the raw per-label distances.
    Either cache is regenerated if missing or unreadable.

    :param taxa: path to a .cmap file
    :param bin_width: histogram bin width (same units as label positions)
    :returns: mapping of bin upper edge -> count (insertion-ordered when computed)
    """
    # Escape the dot: the original pattern '.cmap' would match any char + 'cmap'.
    lengths_file = re.sub(r'\.cmap', '.' + str(bin_width) + '.lengths', taxa)
    # Fast path: a previously binned distribution cached on disk.
    try:
        with open(lengths_file) as i_file:
            lengths = {}
            for line in i_file:
                line_data = line.split("\t")
                lengths[float(line_data[0])] = int(line_data[1])
            return lengths
    except (IOError, OSError, ValueError, IndexError):
        pass  # missing or corrupt cache -- recompute below

    raw_lengths_file = re.sub(r'\.cmap', '.raw_lengths', taxa)
    raw_lengths = []
    max_length = -1
    try:
        # Raw-length cache is a single tab-separated line of floats.
        with open(raw_lengths_file) as i_file:
            for line in i_file:
                raw_lengths = [float(x) for x in line.strip().split("\t")]
                max_length = max(raw_lengths)
    except (IOError, OSError, ValueError):
        # No usable raw-length cache: derive distances between consecutive
        # channel-1 labels from the cmap itself.
        cmap = CmapFile(taxa)
        current_contig_id = None
        previous_position = 0.0
        max_length = -1.0
        for label in cmap.parse():
            if label.channel != "1":
                continue
            if label.contig_id != current_contig_id:
                current_contig_id = label.contig_id
                previous_position = 0.0
            length = label.position - previous_position
            # Bug fix: advance the cursor so each length is the gap to the
            # PREVIOUS label, not the distance from the contig start.
            previous_position = label.position
            raw_lengths.append(length)
            if length > max_length:
                max_length = length
        with open(raw_lengths_file, 'w') as o_file:
            for raw_length in raw_lengths:
                o_file.write(str(raw_length) + "\t")

    # Build bins (prev_edge, bin_max] covering everything up to max_length.
    lengths = OrderedDict()
    bin_max = 0
    while bin_max < max_length:
        bin_max += bin_width
        lengths[bin_max] = 0
    for raw_length in raw_lengths:
        for bin_max in lengths:
            # Bug fix: use <= so a length equal to the top bin edge (e.g. the
            # maximum when it is an exact multiple of bin_width) is counted.
            if raw_length <= bin_max:
                lengths[bin_max] += 1
                break
    with open(lengths_file, 'w') as o_file:
        for bin_max in lengths:
            o_file.write(str(bin_max) + "\t" + str(lengths[bin_max]) + "\n")
    return lengths
def createQualityObject(self):
    """Aggregate assembly statistics from this step's cmap files into a Quality object.

    Scans every per-contig .cmap in the step directory for contig counts,
    lengths, N50 and label occurrence stats, reads the molecule-alignment
    count from the step's output file, then stores the result on
    self.quality and persists it.

    :raises Exception: if the step has not finished yet.
    """
    if not self.isComplete():
        raise Exception("The step is not complete yet")
    count = 0
    total_length = 0.0
    lengths = []
    label_occurrences = 0
    label_count = 0
    # This glob relies on there not being a merged .cmap in the same
    # directory (i.e. Summarize has not been run).
    for cmap_name in glob(self.getStepDir() + "/*.cmap"):
        contigs = set()
        cmap_file = CmapFile(cmap_name)
        for label in cmap_file.parse():
            # Count each contig (and its length) only once per file.
            if not label.contig_id in contigs:
                count += 1
                total_length += label.contig_len
                contigs.add(label.contig_id)
                lengths.append(label.contig_len)
            label_occurrences += label.occurrences
            label_count += 1
    sorted_lengths = sorted(lengths, reverse=True)
    minlen = sorted_lengths[-1]
    maxlen = sorted_lengths[0]
    # N50: length of the contig at which the running total (longest first)
    # first reaches half the assembly length.  Reuse the already-sorted list
    # instead of sorting a second time.
    n50 = 0
    length_included_in_n50 = 0
    target_length_included = total_length / 2.0
    for length in sorted_lengths:
        length_included_in_n50 += length
        if length_included_in_n50 >= target_length_included:
            n50 = length
            break
    # Bug fix: initialise nummaps so a file without any contig ("C") line
    # cannot leave it undefined (NameError).
    nummaps = "0"
    with open(self.getOutputFile()) as contig_file:
        for line in contig_file:
            # startswith() also tolerates empty lines, unlike line[0].
            if not line.startswith("C"):
                continue
            contig_data = line.split(",")
            # Last field looks like "NumMaps=<n>"; keep the value after "=".
            nummaps = contig_data[-1].split("=")[-1]
    self.quality = Quality(length=total_length, count=count,
                           average_length=total_length / count, n50=n50,
                           min=minlen, max=maxlen,
                           average_occurrences=float(label_occurrences) / label_count,
                           total_mols_aligned=nummaps,
                           avg_mols_aligned=float(nummaps) / count)
    self.saveQualityObjectToFile()
class tCmapFile(tFile_base):
    """Unit tests for CmapFile: extension lookup, parsing and label serialisation."""

    def setUp(self):
        # Touch (truncate/create) the input file so CmapFile has something to open.
        with open(self.input_file, "w"):
            self.obj = CmapFile(self.input_file)

    def test_getExtension(self):
        self.assertEqual("cmap", CmapFile.getExtension())

    def test_parse(self):
        self.assertEqual(CmapFile_iter(self.input_file), self.obj.parse())

    def test_write(self):
        label = Mock(contig_id=1, contig_len=1.0, contig_site_count=1,
                     label_id=1, channel="1", position=1.0, stdev=1.0,
                     coverage=1.0, occurrences=1, snr_mean=1.0,
                     snr_stdev=1.0, snr_count=1.0)
        # Column order matches the cmap record layout written by CmapFile.write.
        fields = (label.contig_id, label.contig_len, label.contig_site_count,
                  label.label_id, label.channel, label.position, label.stdev,
                  label.coverage, label.occurrences, label.snr_mean,
                  label.snr_stdev, label.snr_count)
        expected = "\t".join(str(field) for field in fields) + "\n"
        out = StringIO()
        self.obj.write(label, out)
        self.assertEqual(expected, out.getvalue())
class AssessReferenceAlignment(object):
    """Assess an XMAP alignment against its anchor (_r) and query (_q) cmaps.

    Classifies labels as true positives, false negatives (in anchor, skipped
    by the alignment) and false positives (in query, unmatched between two
    aligned pairs), and offers neighbor-distance utilities over loci dicts of
    the form {contig_id: [position, ...]}.
    """

    def __init__(self, xmap_file_name):
        # Work relative to the xmap's own directory so the companion
        # "<name>_r.cmap" / "<name>_q.cmap" files resolve.
        file_name_parts = xmap_file_name.split('/')
        file_name_parts_length = len(file_name_parts)
        if file_name_parts_length > 1:
            self.workspace = "/".join(file_name_parts[0:(file_name_parts_length - 1)])
        else:
            self.workspace = "."
        with CD(self.workspace):
            file_name = file_name_parts[file_name_parts_length - 1]
            self.xmap = XmapFile(file_name)
            self.anchor_cmap = CmapFile(file_name.replace(".xmap", "_r.cmap"))
            self.query_cmap = CmapFile(file_name.replace(".xmap", "_q.cmap"))
        # Matches one "(anchor_label,query_label)" pair from an xmap alignment
        # string.  Raw string so the backslashes reach the regex engine intact.
        self.ALIGNED_LABELS = re.compile(r"\(([\d]+),([\d]+)\)")

    def extractTruePositives(self):
        """Return {anchor_id: [position, ...]} for every aligned anchor label."""
        self.true_positive_labels = {}
        self.true_positive_locations = {}
        for alignment in self.xmap.parse():
            anchor = alignment.anchor_id
            if not anchor in self.true_positive_labels:
                self.true_positive_labels[anchor] = set()
            if not anchor in self.true_positive_locations:
                self.true_positive_locations[anchor] = []
            for label_pair in self.ALIGNED_LABELS.finditer(alignment.alignment):
                self.true_positive_labels[anchor].add(int(label_pair.group(1)))
        # Second pass over the anchor cmap converts label ids to positions.
        for label in self.anchor_cmap.parse():
            if not label.contig_id in self.true_positive_labels:
                continue
            if label.label_id in self.true_positive_labels[label.contig_id]:
                self.true_positive_locations[label.contig_id].append(label.position)
        return self.true_positive_locations

    def extractFalseNegatives(self):
        # False negative labels are present in the anchor, not in the query:
        # any anchor label id falling in the gap between two consecutively
        # aligned anchor labels was skipped by the alignment.
        self.false_negative_labels = {}
        self.false_negative_locations = {}
        for alignment in self.xmap.parse():
            anchor = alignment.anchor_id
            if not anchor in self.false_negative_labels:
                self.false_negative_labels[anchor] = set()
            if not anchor in self.false_negative_locations:
                self.false_negative_locations[anchor] = []
            previous_label = None
            for label_pair in self.ALIGNED_LABELS.finditer(alignment.alignment):
                anchor_label = int(label_pair.group(1))
                if previous_label is None:
                    previous_label = anchor_label
                    continue
                for i in xrange(previous_label + 1, anchor_label):
                    self.false_negative_labels[anchor].add(i)
                previous_label = anchor_label
        for label in self.anchor_cmap.parse():
            if not label.contig_id in self.false_negative_labels:
                continue
            if label.label_id in self.false_negative_labels[label.contig_id]:
                self.false_negative_locations[label.contig_id].append(label.position)
        return self.false_negative_locations

    def extractFalsePositives(self):
        """Return {anchor_id: [position, ...]} of query-only labels projected
        onto the anchor via their offset from the last true-positive label."""
        self.false_positive_labels = {}
        for alignment in self.xmap.parse():
            anchor = alignment.anchor_id
            query = alignment.query_id
            if not query in self.false_positive_labels:
                self.false_positive_labels[query] = {}
            previous_label_pair = None
            for label_pair in self.ALIGNED_LABELS.finditer(alignment.alignment):
                if previous_label_pair is None:
                    previous_label_pair = label_pair
                    continue
                previous_query_label = int(previous_label_pair.group(2))
                query_label = int(label_pair.group(2))
                # Query label ids run backwards for "-" orientation alignments.
                if alignment.orientation == "+":
                    start = previous_query_label + 1
                    stop = query_label
                else:
                    start = query_label + 1
                    stop = previous_query_label
                for i in xrange(start, stop):
                    self.false_positive_labels[query][i] = {"anchor_id": anchor,
                                                            "anchor_last_true_positive": int(previous_label_pair.group(1)),
                                                            "query_last_true_positive": int(previous_label_pair.group(2))}
                previous_label_pair = label_pair
        false_positive_offsets = {}
        # NOTE(review): assumes the first label encountered in the query cmap
        # is never itself a false positive (last_true_positive would be None).
        last_true_positive = None
        for label in self.query_cmap.parse():
            if not label.contig_id in self.false_positive_labels:
                last_true_positive = label
                continue
            if not label.label_id in self.false_positive_labels[label.contig_id]:
                last_true_positive = label
                continue
            false_positive = self.false_positive_labels[label.contig_id][label.label_id]
            anchor = false_positive["anchor_id"]
            anchor_label = false_positive["anchor_last_true_positive"]
            if not anchor in false_positive_offsets:
                false_positive_offsets[anchor] = {}
            if not anchor_label in false_positive_offsets[anchor]:
                false_positive_offsets[anchor][anchor_label] = []
            false_positive_offsets[anchor][anchor_label].append(label.position - last_true_positive.position)
        self.false_positive_locations = {}
        for label in self.anchor_cmap.parse():
            if not label.contig_id in false_positive_offsets:
                continue
            if not label.label_id in false_positive_offsets[label.contig_id]:
                continue
            if not label.contig_id in self.false_positive_locations:
                self.false_positive_locations[label.contig_id] = []
            for offset in false_positive_offsets[label.contig_id][label.label_id]:
                self.false_positive_locations[label.contig_id].append(label.position + offset)
        return self.false_positive_locations

    def extractPartialMatches(self, output_name='partial_matches.xmap'):
        """Collect alignments covering <90% of their query and also write them
        to output_name.  Returns {anchor_id: [(anchor_start, anchor_end), ...]}."""
        self.partial_match_locations = {}
        with open(output_name, 'w') as o_file:
            for align in self.xmap.parse():
                proportion = abs(align.query_start - align.query_end) / float(align.query_len)
                if proportion < 0.9:
                    anchor = align.anchor_id  # bug fix: was align.anchor (no such attr)
                    if not anchor in self.partial_match_locations:
                        self.partial_match_locations[anchor] = []
                    # Bug fix: list.append takes ONE argument; store the span
                    # as a tuple (the original raised TypeError).
                    self.partial_match_locations[anchor].append((align.anchor_start, align.anchor_end))
                    # Bug fix: was xfile.write -- an undefined name.
                    self.xmap.write(align, o_file)
        return self.partial_match_locations

    def extractSequenceContexts(self, loci):
        # Not implemented yet.
        pass

    def processSeqeuenceContexts(self, fasta_file, motif):
        """For each record in fasta_file, print whether it contains a gap run
        (>=7 Ns) and whether it contains any single-base variant of motif.

        (Method name typo preserved -- renaming would break callers.)
        """
        # Enumerate every 1-SNV neighbor of the motif.
        snvs = set()
        for i in xrange(0, len(motif)):
            for base in ['A', 'T', 'C', 'G']:
                if base == motif[i]:
                    continue
                snv = motif[0:i] + base + motif[i + 1:len(motif)]
                snvs.add(snv)
        # Bug fix: header was space-separated while rows are tab-separated.
        print("HasGap\tHasSNV")
        for record in SeqIO.parse(fasta_file, 'fasta'):
            output = "0"
            if "NNNNNNN" in record.seq or "nnnnnnn" in record.seq:
                output = "1"
            contains_snv = False
            for snv in snvs:
                if snv in record.seq:
                    contains_snv = True
            if contains_snv:
                output += "\t1"
            else:
                output += "\t0"
            print(output)

    def findNearestNeighbors(self, loci, neighbor_locis):
        """For each locus, distance to its nearest same-contig neighbor across
        all dicts in neighbor_locis; loci with no candidate are skipped.

        :returns: {contig_id: [nearest_distance, ...]}
        """
        neighbors = {}
        for chrom in loci:  # renamed from 'chr' -- don't shadow the builtin
            if not chrom in neighbors:
                neighbors[chrom] = []
            for locus in loci[chrom]:
                nearest_dist = None
                for neighbor_loci in neighbor_locis:
                    if not chrom in neighbor_loci:
                        continue
                    for neighbor_locus in neighbor_loci[chrom]:
                        dist = abs(locus - neighbor_locus)
                        if nearest_dist is None or dist < nearest_dist:
                            nearest_dist = dist
                if nearest_dist is not None:
                    neighbors[chrom].append(nearest_dist)
        return neighbors

    def findLabelsWithNearNeighbors(self, loci, neighbor_locis, threshold=301):
        """Count loci whose nearest neighbor lies closer than threshold.

        Bug fixes: the original referenced the undefined names 'af' and
        'nearest_neighgbors', and ignored its threshold parameter by
        hard-coding 301 in the comparison.
        """
        nearest_neighbors = self.findNearestNeighbors(loci, neighbor_locis)
        offending_count = 0
        for chrom in nearest_neighbors:
            for distance in nearest_neighbors[chrom]:
                if distance < threshold:
                    offending_count += 1
        return offending_count