def __iter__(self):
    seq = None
    name = None
    descr = None
    for line in FileOrSequence.__iter__(self):
        if line.startswith(">"):
            # Header line: emit the previous record, if any, then start a new one
            if seq:
                if self.raw_iterator:
                    s = (seq, name, descr)
                else:
                    s = Sequence(seq.encode(), name)
                    s.descr = descr
                yield s
            mo = _re_fasta_header_line.match(line)
            name = mo.group(1)
            descr = mo.group(2)
            seq = ""
        else:
            assert seq is not None, "FASTA file does not start with '>'."
            seq += line[:-1]
    # Emit the last record in the file
    if seq is not None:
        if self.raw_iterator:
            s = (seq, name, descr)
        else:
            s = Sequence(seq.encode(), name)
            s.descr = descr
        yield s
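# Usage sketch: the identifiers above (FileOrSequence, Sequence,
# _re_fasta_header_line) suggest an HTSeq-style FastaReader class; the class
# name and file name below are assumptions for illustration only.
#
#     for s in FastaReader("genome.fa"):      # hypothetical file name
#         print(s.name, len(s.seq))           # one Sequence object per record
#
# With raw_iterator=True each record is instead a plain (seq, name, descr)
# tuple of strings.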
def __iter__(self):
    for line in FileOrSequence.__iter__(self):
        try:
            algnt = BowtieAlignment(line)
        except ValueError:
            if line.startswith("Reported "):
                continue
            warnings.warn(
                "BowtieReader: Ignoring the following line, which could "
                "not be parsed:\n%s\n" % line, RuntimeWarning)
            # Skip unparseable lines instead of yielding a stale alignment
            continue
        yield algnt
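# Usage sketch (assumption: this method belongs to a BowtieReader-style
# class; the file name is hypothetical). Unparseable lines are skipped with
# a RuntimeWarning, and Bowtie's "Reported ..." summary lines are ignored
# silently:
#
#     for algnt in BowtieReader("hits.bowtie"):
#         print(algnt.read.name, algnt.iv)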
def meta_info(self, header_filename=None):
    ret = []
    if header_filename is None:
        the_iter = FileOrSequence.__iter__(self)
    else:
        the_iter = open(header_filename, "r")
    for line in the_iter:
        if line.startswith('#'):
            ret.append(line)
        else:
            break
    return ret
def __iter__(self):
    fin = FileOrSequence.__iter__(self)
    il = 0
    id1 = None
    id2 = None
    seq = None
    qual = None
    for line in fin:
        # Collect the four lines of one FASTQ record
        if il == 0:
            id1 = line
            il += 1
            continue
        elif il == 1:
            seq = line
            il += 1
            continue
        elif il == 2:
            id2 = line
            il += 1
            continue
        qual = line
        il = 0
        if qual == "":
            if id1 != "":
                warnings.warn(
                    "Number of lines in FASTQ file is not "
                    "a multiple of 4. Discarding the last, "
                    "incomplete record")
            break
        if not qual.endswith("\n"):
            qual += "\n"
        if not id1.startswith("@"):
            raise ValueError(
                "Primary ID line in FASTQ file does "
                "not start with '@'. Either this is not FASTQ data or the "
                "parser got out of sync.")
        if not id2.startswith("+"):
            raise ValueError(
                "Secondary ID line in FASTQ file does "
                "not start with '+'. Maybe got out of sync.")
        if len(id2) > 2 and id1[1:] != id2[1:]:
            raise ValueError(
                "Primary and secondary ID line in FASTQ disagree.")
        if self.raw_iterator:
            s = (seq[:-1], id1[1:-1], qual[:-1], self.qual_scale)
        else:
            s = SequenceWithQualities(
                seq[:-1].encode(), id1[1:-1], qual[:-1].encode(),
                self.qual_scale)
        yield s
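# Usage sketch (assumption: this sits in a FastqReader-style class with
# raw_iterator and qual_scale attributes; the file name is hypothetical).
# Each record consumes four lines (@id, sequence, +id, qualities); the
# parser raises ValueError when the '@'/'+' markers are missing or the two
# ID lines disagree.
#
#     for read in FastqReader("reads.fastq"):
#         print(read.name, read.seq)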
def __iter__(self):
    span = 1
    pos = None
    step = None
    chrom = None
    for line in FileOrSequence.__iter__(self):
        if line.startswith('track'):
            fields = shlex.split(line)[1:]
            self.attributes = dict([
                (p[0], p[1].strip('"'))
                for p in [x.split("=") for x in fields]])
        elif line.startswith('fixedStep'):
            # fixedStep declaration: start position and step width are fixed
            self.stepType = 'fixed'
            fields = shlex.split(line)[1:]
            declarations = dict([
                (p[0], p[1].strip('"'))
                for p in [x.split("=") for x in fields]])
            pos = int(declarations['start'])
            step = int(declarations['step'])
            chrom = declarations['chrom']
            if 'span' in declarations:
                span = int(declarations['span'])
            else:
                span = 1
        elif line.startswith('variableStep'):
            # variableStep declaration: each data line carries its own position
            self.stepType = 'variable'
            fields = shlex.split(line)[1:]
            declarations = dict([
                (p[0], p[1].strip('"'))
                for p in [x.split("=") for x in fields]])
            chrom = declarations['chrom']
            if 'span' in declarations:
                span = int(declarations['span'])
            else:
                span = 1
        elif line.startswith('browser') or line.startswith('#'):
            # Browser line or comment: ignored
            if self.verbose:
                print("Ignored line:", line)
            continue
        else:
            # Data line: yield an (interval, value) pair
            if self.stepType == 'fixed':
                yield (GenomicInterval(chrom, pos, pos + span, '.'),
                       float(line.strip()))
                pos += step
            elif self.stepType == 'variable':
                tmp = line.strip().split(" ")
                pos = int(tmp[0])
                yield (GenomicInterval(chrom, pos, pos + span, '.'),
                       float(tmp[1]))
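# Worked example (illustrative input, not from the source): for a track such
# as
#
#     fixedStep chrom=chr1 start=11 step=2 span=2
#     1.5
#     2.5
#
# the loop yields (GenomicInterval("chr1", 11, 13, "."), 1.5) and then
# (GenomicInterval("chr1", 13, 15, "."), 2.5), since pos advances by step
# after every fixedStep value. Note that the wiggle coordinates are used
# as-is, with no conversion between 1-based and 0-based conventions.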
def get_sequence_lengths(self):
    seqname = None
    length = 0
    seqlengths = {}
    for line in FileOrSequence.__iter__(self):
        if line.startswith(">"):
            if seqname is not None:
                seqlengths[seqname] = length
            mo = _re_fasta_header_line.match(line)
            seqname = mo.group(1)
            length = 0
        else:
            assert seqname is not None, "FASTA file does not start with '>'."
            length += len(line.rstrip())
    if seqname is not None:
        seqlengths[seqname] = length
    return seqlengths
def __iter__(self): for line in FileOrSequence.__iter__(self): if line.startswith("track"): continue fields = line.split() if len(fields) < 3: raise ValueError("BED file line contains less than 3 fields") if len(fields) > 12: raise ValueError("BED file line contains more than 12 fields") iv = GenomicInterval(fields[0], int(fields[1]), int(fields[2]), fields[5] if len(fields) > 5 else ".") f = GenomicFeature(fields[3] if len(fields) > 3 else "unnamed", "BED line", iv) f.score = float(fields[4]) if len(fields) > 4 else None f.thick = GenomicInterval(iv.chrom, int(fields[6]), int(fields[7]), iv.strand) if len(fields) > 7 else None f.itemRgb = [int(a) for a in fields[8].split(",") ] if len(fields) > 8 else None yield (f)
def __iter__(self):
    for line in FileOrSequence.__iter__(self):
        record = SolexaExportAlignment()
        fields = SolexaExportReader.parse_line_bare(line)
        if fields['read_nbr'] != "1":
            warnings.warn(
                "Paired-end read encountered. PE is so far supported only "
                "for SAM files, not yet for SolexaExport. All PE-related "
                "fields are ignored.")
        record.read = SequenceWithQualities(
            fields['read_seq'],
            "%s:%s:%s:%s:%s#0" % (fields['machine'], fields['lane'],
                                  fields['tile'], fields['x_coord'],
                                  fields['y_coord']),
            fields['qual_str'], self.qualscale)
        if fields['passed_filtering'] == 'Y':
            record.passed_filter = True
        elif fields['passed_filtering'] == 'N':
            record.passed_filter = False
        else:
            raise ValueError(
                "Illegal 'passed filter' value in Solexa export data: '%s'."
                % fields['passed_filtering'])
        record.index_string = fields['index_string']
        if fields['pos'] == '':
            record.iv = None
            record.nomatch_code = fields['chrom']
        else:
            if fields['strand'] == 'F':
                strand = '+'
            elif fields['strand'] == 'R':
                strand = '-'
            else:
                raise ValueError(
                    "Illegal strand value in Solexa export data.")
            start = int(fields['pos'])
            chrom = fields['chrom']
            if fields['chrom'] == "":
                chrom = fields['contig']
            record.iv = GenomicInterval(
                chrom, start, start + len(fields['read_seq']), strand)
        yield record
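# Usage sketch (assumption: this is a SolexaExportReader-style class; the
# constructor argument shown is hypothetical). Reads without a mapping
# position get record.iv = None and the no-match code in record.nomatch_code:
#
#     for record in SolexaExportReader("s_1_export.txt"):
#         if record.passed_filter and record.iv is not None:
#             print(record.read.name, record.iv)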
def parse_meta(self, header_filename=None):
    if header_filename is None:
        the_iter = FileOrSequence.__iter__(self)
    else:
        the_iter = open(header_filename, "r")
    for line in the_iter:
        if line.startswith('#'):
            if line.startswith("##"):
                mo = _re_vcf_meta_comment.match(line)
                if mo:
                    value = mo.group(2)
                    if mo.group(1) == "INFO":
                        value = dict(
                            e.rstrip(",").split("=", 1)
                            for e in _re_vcf_meta_descr.findall(value))
                        key = value["ID"]
                        del value["ID"]
                        self.info[key] = value
                    elif mo.group(1) == "FILTER":
                        value = dict(
                            e.rstrip(",").split("=", 1)
                            for e in _re_vcf_meta_descr.findall(value))
                        key = value["ID"]
                        del value["ID"]
                        self.filters[key] = value
                    elif mo.group(1) == "FORMAT":
                        value = dict(
                            e.rstrip(",").split("=", 1)
                            for e in _re_vcf_meta_descr.findall(value))
                        key = value["ID"]
                        del value["ID"]
                        self.formats[key] = value
                    else:
                        self.metadata[mo.group(1)] = mo.group(2)
            else:
                self.sampleids = line.rstrip("\t\n").split("\t")[9:]
                self.nsamples = len(self.sampleids)
                continue
        else:
            break
def __iter__(self):
    for line in FileOrSequence.__iter__(self):
        if line == "\n" or line.startswith('#'):
            continue
        vc = VariantCall.fromline(line, self.nsamples, self.sampleids)
        yield vc
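# Usage sketch (assumption: this is a VCF_Reader-style class and the file
# name is hypothetical). __iter__ relies on self.nsamples and self.sampleids,
# which parse_meta() fills in, so the metadata appears to need parsing first:
#
#     reader = VCF_Reader("variants.vcf")
#     reader.parse_meta()
#     for vc in reader:
#         print(vc)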