예제 #1
0
파일: maf.py 프로젝트: chris-zen/phd-thesis
	def __init__(self, f, fname, default_sample_id):
		"""Consume the leading metadata lines and the column header.

		Every leading line starting with "#" is skipped; a "##fileformat="
		line has its value remembered. The first line that does not start
		with "#" is taken as the tab-separated header and the position of
		each column named in _COLUMNS (matched case-insensitively) is stored.

		The three arguments are passed unchanged to TextParser.__init__.

		Raises ParserException when there is no header line, or when a
		column named in _COLUMNS is absent from the header.
		"""
		TextParser.__init__(self, f, fname, default_sample_id)

		self.__format = None

		# Metadata and comments
		format_prefix = "##fileformat="
		line = self._readline()
		while len(line) > 0 and line.startswith("#"):
			if line.startswith(format_prefix):
				# Strip the line terminator so it does not become part of the value
				self.__format = line[len(format_prefix):].rstrip("\r\n")
			line = self._readline()

		# An empty string is treated as end of input: nothing left for a header
		if len(line) == 0:
			# NOTE(review): the other error below passes self._location() as
			# second argument while this one passes the file name -- confirm
			# which form ParserException expects.
			raise ParserException("Header not found", self._fname)

		# Header
		columns = line.rstrip().split("\t")
		self._col_size = len(columns)

		column_indices = {}
		for i, name in enumerate(columns):
			name = name.lower()
			if name in _COLUMNS:
				column_indices[name] = i
		self._col_name_indices = column_indices

		# Only the lookup can raise KeyError, so only it sits inside the try
		try:
			self._col_indices = [column_indices[name] for name in _COLUMNS]
		except KeyError as ex:
			raise ParserException("Header column not found: {0}".format(ex.args[0]), self._location())
예제 #2
0
파일: vcf.py 프로젝트: chris-zen/phd-thesis
	def __init__(self, f, fname, default_sample_id):
		"""Consume the leading "#" lines and queue the first record line.

		A "##INDIVIDUAL=" line overrides the default sample id and a
		"##fileformat=" line has its value remembered. The first line that
		does not start with "#", if any, is pushed back with _queue_line.

		The three arguments are passed unchanged to TextParser.__init__.
		"""
		TextParser.__init__(self, f, fname, default_sample_id)

		self.__format = None

		individual_prefix = "##INDIVIDUAL="
		format_prefix = "##fileformat="

		# Metadata and comments
		line = self._readline()
		while len(line) > 0 and line.startswith("#"):
			# The line terminator is stripped from both values; otherwise it
			# would end up inside the sample id / format string.
			if line.startswith(individual_prefix):
				self._default_sample_id = line[len(individual_prefix):].rstrip("\r\n")
			elif line.startswith(format_prefix):
				self.__format = line[len(format_prefix):].rstrip("\r\n")
			line = self._readline()

		if len(line) > 0: # First line
			self._queue_line(line)
예제 #3
0
파일: maf.py 프로젝트: chris-zen/phd-thesis
	def next(self):
		"""Return the next variant parsed from the input.

		Lines are read until one yields a variant. A line is discarded (via
		_discard_line) when its chromosome, start, strand or alleles cannot
		be interpreted. When the two alternate alleles differ, the first one
		is used now and a copy of the line carrying the second allele in both
		allele columns is queued for a later call. Lines whose normalized
		reference and alternate are equal are skipped silently.

		Raises StopIteration when _readline returns an empty string.
		"""
		TextParser.next(self)

		var = None
		while var is None:
			line = self._readline()

			if len(line) == 0:
				raise StopIteration()

			fields = line.rstrip("\n").split("\t")

			# Columns that a short line does not reach become None. The bound
			# must be the length of THIS line: the stored indices always fit
			# within the header width, so comparing against it never fails.
			# NOTE(review): assumes _COLUMNS names these eight columns in
			# this order -- it is defined outside this block.
			num_fields = len(fields)
			chrom, start, strand, _, ref, alt1, alt2, sample = [
				fields[i] if i < num_fields else None for i in self._col_indices]

			# Chromosome

			if chrom is not None:
				chrom = parse_chromosome(chrom)
			if chrom is None:
				self._discard_line()
				continue

			# Start

			try:
				start = int(start)
			except (TypeError, ValueError):
				# TypeError: column missing (None); ValueError: not an integer
				self._discard_line()
				continue

			# Strand (a missing or empty value defaults to the forward strand)

			if not strand or strand == "1" or strand == "+1":
				strand = "+"
			elif strand == "-1":
				strand = "-"
			elif strand not in ["+", "-"]:
				self._discard_line()
				continue

			# Ref & alt

			if ref is None or alt1 is None or alt2 is None:
				self._discard_line()
				continue

			if any(_ALLELE_RE.match(x) is None for x in (ref, alt1, alt2)):
				self._discard_line()
				continue

			alt = alt1

			if ref == "-":
				# [1   2]  -->  [1 2] 3
				#  . - .         N
				#  . T .         N T
				ref = "N"
				alt = "N" + alt if alt != "-" else "N"
			elif alt == "-":
				# 1 [2] 3  -->  [1 2] 3
				# .  T  .        N T
				# .  -  .        N
				start -= 1
				ref = "N" + ref
				alt = "N"

			# The type is derived from the allele lengths; the type column
			# read from the file is not used.
			vtype = Variant.SUBST if len(ref) == len(alt) else Variant.INDEL

			if not sample:
				sample = self._default_sample_id

			if alt1 != alt2:
				# Re-queue the line with the second allele copied over the
				# first, so that a later call handles the other allele.
				fields[self._col_name_indices[_COL_ALLELE1]] = fields[self._col_name_indices[_COL_ALLELE2]]
				self._queue_line("\t".join(fields))

			if ref == alt:
				continue

			var = Variant(
				type=vtype, chr=chrom, start=start, ref=ref, alt=alt, strand=strand,
				samples=[Sample(name=sample)])

		return var
예제 #4
0
파일: tab.py 프로젝트: chris-zen/phd-thesis
	def next(self):
		"""Return the next variant parsed from the input.

		Each record provides chromosome, start, end, strand, an allele
		change written as "REF>ALT" and optionally a sample; a missing
		sixth field is filled with the default sample id. Records that
		cannot be interpreted are discarded (via _discard_line) and the
		next one is tried.

		Accepted allele forms, besides plain "REF>ALT":
		  "->A", "GCT>-"     insertion / deletion with "-" placeholder
		  "*>-ACG", "*>+CG"  deletion / insertion with a sign prefix
		  "A/A>-/GGT", ...   two alleles per side; these are split into
		                     single-allele records that are queued and
		                     parsed on later iterations
		"""
		TextParser.next(self)

		var = None
		while var is None:
			fields = self.__read_fields()

			if len(fields) < 5:
				self._discard_line()
				continue

			if len(fields) < 6:
				fields += [self._default_sample_id]

			chrom, start, end, strand, allele, sample = fields[0:6]

			# Chromosome

			chrom = parse_chromosome(chrom)
			if chrom is None:
				self._discard_line()
				continue

			# Start and end

			try:
				start = int(start)
			except (TypeError, ValueError):
				self._discard_line()
				continue

			try:
				end = int(end)
			except (TypeError, ValueError):
				self._discard_line()
				continue

			if start > end:
				start, end = end, start

			# Strand

			if len(strand) == 0 or strand == "1" or strand == "+1":
				strand = "+"
			elif strand == "-1":
				strand = "-"
			elif strand not in ["+", "-"]:
				self._discard_line()
				continue

			# Alleles

			alleles = allele.split(">")
			if len(alleles) != 2:
				self._discard_line()
				continue

			ref, alt = alleles

			# Check that are well formed
			if any(_ALLELE_RE.match(a) is None for a in (ref, alt)):
				self._discard_line()
				continue

			# Special cases
			if ref == "-": # ->A
				# [1   2]  -->  [1 2] 3
				#  . - .         N
				#  . T .         N T
				ref = "N"
				alt = "N" + alt if alt != "-" else "N"
			elif alt == "-": # GCT>-
				# 1 [2] 3  -->  [1 2] 3
				# .  T  .        N T
				# .  -  .        N
				start -= 1
				ref = "N" + ref
				alt = "N"
			elif ref == "*" and len(alt) > 1 and alt[0] in ["-", "+"]: # *>-ACG, *>+CG
				if alt[0] == "-":
					start -= 1
					ref = "N" + alt[1:]
					alt = "N"
				else:
					ref = "N"
					alt = "N" + alt[1:]
			elif "/" in ref or "/" in alt: # A/A>-/GGT, C/C>A/T, C/C>G/G
				ref = ref.split("/")
				alt = alt.split("/")
				if len(ref) != 2 or len(ref) != len(alt):
					self._discard_line()
					continue

				# Same allele on both sides: a single record is enough
				if ref[0] == ref[1] and alt[0] == alt[1]:
					ref.pop()
					alt.pop()

				# Queue one plain "REF>ALT" record per allele pair, built from
				# the already normalized chromosome, positions and strand.
				for single_ref, single_alt in zip(ref, alt):
					single = "{0}>{1}".format(single_ref, single_alt)
					self._queue_line("\t".join([chrom, str(start), str(end), strand, single, sample]))

				continue

			vtype = Variant.SUBST if len(ref) == len(alt) else Variant.INDEL

			# Sample

			if len(sample) == 0:
				sample = self._default_sample_id

			var = Variant(
				type=vtype, chr=chrom, start=start, ref=ref, alt=alt, strand=strand,
				samples=[Sample(name=sample)])

		return var
예제 #5
0
파일: tab.py 프로젝트: chris-zen/phd-thesis
	def __init__(self, f, fname, default_sample_id):
		"""Initialize the parser by delegating to TextParser.__init__.

		All three arguments are forwarded unchanged; no other state is
		set up here.
		"""
		TextParser.__init__(self, f, fname, default_sample_id)
예제 #6
0
파일: vcf.py 프로젝트: chris-zen/phd-thesis
	def next(self):
		"""Return the next variant parsed from the input.

		Lines starting with "#" (after optional leading whitespace) are
		skipped; a "##INDIVIDUAL=" line among them replaces the default
		sample id. The first five tab-separated fields of a record are read
		as chromosome, position, id, reference and alternate. Records that
		cannot be interpreted are discarded (via _discard_line) and the
		next one is tried.

		The variant is always reported on the "+" strand with the current
		default sample id as its only sample.

		Raises StopIteration when _readline returns an empty string.
		"""
		TextParser.next(self)

		individual_prefix = "##INDIVIDUAL="

		var = None
		while var is None:
			line = self._readline()
			while len(line) > 0 and line.lstrip().startswith("#"):
				if line.startswith(individual_prefix):
					# Strip the line terminator so it does not end up in the sample id
					self._default_sample_id = line[len(individual_prefix):].rstrip("\r\n")
				line = self._readline()

			if len(line) == 0:
				raise StopIteration()

			fields = line.rstrip("\n").split("\t")

			if len(fields) < 5:
				self._discard_line()
				continue

			# The id column is read but not used
			chrom, start, _external_id, ref, alt = fields[0:5]

			# Chromosome

			chrom = parse_chromosome(chrom)
			if chrom is None:
				self._discard_line()
				continue

			# Start

			try:
				start = int(start)
			except (TypeError, ValueError):
				self._discard_line()
				continue

			# Check ref and alt
			if _REF_RE.match(ref) is None:
				self._discard_line()
				continue

			if _ALT_RE.match(alt) is None:
				self._discard_line()
				continue

			# Classify every comma-separated alternate allele against the
			# reference length. If they all agree that class is the variant
			# type, otherwise the variant is COMPLEX. A single allele is the
			# one-element case of the same rule.
			ref_len = len(ref)
			allele_types = [
				Variant.SUBST if ref_len == len(allele) else Variant.INDEL
				for allele in alt.split(",")]
			vtype = allele_types[0]
			if any(t != vtype for t in allele_types):
				vtype = Variant.COMPLEX

			var = Variant(
				type=vtype, chr=chrom, start=start, ref=ref, alt=alt, strand="+",
				samples=[Sample(name=self._default_sample_id)])

		return var