コード例 #1
0
 def __iter__(self):
     """Iterate over the records of a FASTA stream.

     Lines are taken from ``FileOrSequence.__iter__``. A line starting with
     ">" opens a new record; its name and description are groups 1 and 2 of
     the ``_re_fasta_header_line`` match. All following lines up to the next
     header are concatenated (without their line terminator) into the
     record's sequence.

     Yields:
         ``(seq, name, descr)`` tuples when ``self.raw_iterator`` is true,
         otherwise ``Sequence`` objects built from the encoded sequence and
         the name, with ``descr`` attached as an attribute.

     Raises:
         AssertionError: if sequence data appears before the first header.
     """
     def make_record(seq, name, descr):
         # Single place that builds the yielded object for both yield sites.
         if self.raw_iterator:
             return (seq, name, descr)
         record = Sequence(seq.encode(), name)
         record.descr = descr
         return record

     chunks = None  # stays None until the first header line has been seen
     name = None
     descr = None
     for line in FileOrSequence.__iter__(self):
         if line.startswith(">"):
             if chunks is not None:
                 seq = "".join(chunks)
                 # NOTE(review): a record with an empty sequence is skipped
                 # here but yielded when it is the last record of the input;
                 # this asymmetry is kept as-is for compatibility.
                 if seq:
                     yield make_record(seq, name, descr)
             mo = _re_fasta_header_line.match(line)
             name = mo.group(1)
             descr = mo.group(2)
             chunks = []
         else:
             if chunks is None:
                 # Explicit raise (same exception type as the former assert)
                 # so the check is not stripped under ``python -O``.
                 raise AssertionError("FASTA file does not start with '>'.")
             # Strip only the line terminator: slicing off the last character
             # would drop a base when the final line has no trailing newline
             # and would leave a stray "\r" on CRLF files.
             chunks.append(line.rstrip("\r\n"))
     if chunks is not None:
         yield make_record("".join(chunks), name, descr)
コード例 #2
0
 def __iter__(self):
     """Iterate over the alignments of a Bowtie output stream.

     Each line obtained from ``FileOrSequence.__iter__`` is passed to
     ``BowtieAlignment``. Lines for which that raises ``ValueError`` are
     skipped: silently when they start with "Reported ", otherwise after
     emitting a ``RuntimeWarning`` that quotes the line.

     Yields:
         One ``BowtieAlignment`` per successfully parsed line.
     """
     for line in FileOrSequence.__iter__(self):
         try:
             algnt = BowtieAlignment(line)
         except ValueError:
             if not line.startswith("Reported "):
                 warnings.warn(
                     "BowtieReader: Ignoring the following line, which could "
                     "not be parsed:\n%s\n" % line, RuntimeWarning)
             # Skip the line in every failure case. Falling through to the
             # yield would re-emit the previous alignment, or raise NameError
             # if no line had been parsed yet.
             continue
         yield algnt
コード例 #3
0
    def meta_info(self, header_filename=None):
        """Return the leading block of '#' lines.

        Args:
            header_filename: optional path of a separate file to read the
                header from. When ``None``, the lines come from
                ``FileOrSequence.__iter__(self)``.

        Returns:
            A list with every line (terminator included) from the start of
            the input up to, but excluding, the first line that does not
            start with '#'.
        """
        def leading_comments(lines):
            # Collect lines until the first one that is not a '#' line.
            header = []
            for line in lines:
                if not line.startswith('#'):
                    break
                header.append(line)
            return header

        if header_filename is None:
            return leading_comments(FileOrSequence.__iter__(self))
        # Context manager so the header file is closed even though reading
        # stops at the first non-comment line.
        with open(header_filename, "r") as header_file:
            return leading_comments(header_file)
コード例 #4
0
    def __iter__(self):
        """Iterate over the records of a FASTQ stream.

        Lines from ``FileOrSequence.__iter__`` are consumed in groups of
        four: primary ID line, sequence, secondary ID line, quality string.

        Yields:
            When ``self.raw_iterator`` is true, tuples
            ``(seq, name, qual, self.qual_scale)`` of plain strings with the
            leading "@" and the trailing line terminator character removed;
            otherwise ``SequenceWithQualities`` objects built from the
            encoded sequence and quality strings.

        Raises:
            ValueError: if the primary ID line does not start with "@", the
                secondary ID line does not start with "+", or a non-empty
                secondary ID differs from the primary one.

        Warns:
            If the input ends in the middle of a record, that record is
            discarded and a warning is issued.
        """
        incomplete_msg = ("Number of lines in FASTQ file is not "
                          "a multiple of 4. Discarding the last, "
                          "incomplete record")
        pending = []  # lines of the record currently being assembled
        for line in FileOrSequence.__iter__(self):
            pending.append(line)
            if len(pending) < 4:
                continue
            id1, seq, id2, qual = pending
            pending = []

            # An empty string can only come from a non-file line source;
            # it is treated as end of data.
            if qual == "":
                if id1 != "":
                    warnings.warn(incomplete_msg)
                break

            # The last line of a file may lack its terminator; normalise so
            # the slicing below removes exactly one character.
            if not qual.endswith("\n"):
                qual += "\n"
            if not id1.startswith("@"):
                raise ValueError(
                    "Primary ID line in FASTQ file does "
                    "not start with '@'. Either this is not FASTQ data or the "
                    "parser got out of sync.")
            if not id2.startswith("+"):
                # Message previously read "doesnot" (missing blank between
                # the concatenated literals).
                raise ValueError("Secondary ID line in FASTQ file does "
                                 "not start with '+'. Maybe got out of sync.")
            # A bare "+" line is allowed; a repeated ID must match exactly.
            if len(id2) > 2 and id1[1:] != id2[1:]:
                # Message previously read "FASTQdisagree".
                raise ValueError("Primary and secondary ID line in FASTQ "
                                 "disagree.")

            if self.raw_iterator:
                s = (seq[:-1], id1[1:-1], qual[:-1], self.qual_scale)
            else:
                s = SequenceWithQualities(seq[:-1].encode(), id1[1:-1],
                                          qual[:-1].encode(), self.qual_scale)
            yield s

        # Iterating a file never produces "", so a truncated file simply
        # ends the loop with a partial record pending. Report it here;
        # trailing blank lines alone are not considered a record.
        if any(item.strip() for item in pending):
            warnings.warn(incomplete_msg)
コード例 #5
0
 def __iter__(self):
     """Iterate over the data points of a wiggle track.

     Lines come from ``FileOrSequence.__iter__``:

     * ``track`` lines store their ``key=value`` pairs in
       ``self.attributes``.
     * ``fixedStep`` / ``variableStep`` lines set ``self.stepType`` and the
       current chromosome, span (default 1) and, for fixed steps, the
       start position and step width.
     * ``browser`` lines and lines starting with "#" are ignored (and
       printed when ``self.verbose`` is true).
     * Every other line is a data line, interpreted according to
       ``self.stepType``.

     Yields:
         ``(GenomicInterval(chrom, pos, pos + span, '.'), value)`` tuples
         with ``value`` as a float.
     """
     def parse_declarations(text):
         # shlex keeps quoted values containing blanks together. Splitting
         # on the first "=" only keeps values that themselves contain "="
         # intact (a plain split would silently truncate them).
         pairs = [field.split("=", 1) for field in shlex.split(text)[1:]]
         return {p[0]: p[1].strip('"') for p in pairs}

     span = 1
     pos = None
     step = None
     chrom = None
     for line in FileOrSequence.__iter__(self):
         if line.startswith('track'):
             self.attributes = parse_declarations(line)
         elif line.startswith('fixedStep'):
             self.stepType = 'fixed'
             declarations = parse_declarations(line)
             pos = int(declarations['start'])
             step = int(declarations['step'])
             chrom = declarations['chrom']
             span = int(declarations.get('span', 1))
         elif line.startswith('variableStep'):
             self.stepType = 'variable'
             declarations = parse_declarations(line)
             chrom = declarations['chrom']
             span = int(declarations.get('span', 1))
         elif line.startswith(('browser', '#')):  # Comment or ignored
             if self.verbose:
                 print("Ignored line:", line)
             continue
         else:
             if self.stepType == 'fixed':
                 yield (GenomicInterval(chrom, pos, pos + span,
                                        '.'), float(line.strip()))
                 pos += step
             elif self.stepType == 'variable':
                 # Split on any run of whitespace so tab-separated or
                 # multiply-spaced "position value" lines parse as well.
                 tmp = line.split()
                 pos = int(tmp[0])
                 yield (GenomicInterval(chrom, pos, pos + span,
                                        '.'), float(tmp[1]))
コード例 #6
0
 def get_sequence_lengths(self):
     """Map each FASTA record name to the length of its sequence.

     Lines come from ``FileOrSequence.__iter__``. A line starting with ">"
     opens a record whose name is group 1 of the ``_re_fasta_header_line``
     match; the lengths of all following lines, with trailing whitespace
     removed, are summed up for that record.

     Returns:
         A dict from record name to sequence length.

     Raises:
         AssertionError: if sequence data appears before the first header.
     """
     lengths = {}
     current = None
     count = 0
     for row in FileOrSequence.__iter__(self):
         if not row.startswith(">"):
             # Sequence line: add its length to the running total.
             assert current is not None, "FASTA file does not start with '>'."
             count += len(row.rstrip())
             continue
         # Header line: store the finished record, then start a new one.
         if current is not None:
             lengths[current] = count
         header = _re_fasta_header_line.match(row)
         current = header.group(1)
         count = 0
     # Store the record that was still open when the input ended.
     if current is not None:
         lengths[current] = count
     return lengths
コード例 #7
0
 def __iter__(self):
     """Iterate over the features of a BED stream.

     Lines starting with "track" are skipped. Every other line is split on
     whitespace and must have between 3 and 12 columns.

     Yields:
         ``GenomicFeature`` objects of type "BED line" whose interval is
         built from columns 1-3 (strand from column 6, "." if absent) and
         whose name is column 4 ("unnamed" if absent). The attributes
         ``score``, ``thick`` and ``itemRgb`` are filled from columns 5,
         7-8 and 9 respectively, or set to ``None`` when those columns are
         missing.

     Raises:
         ValueError: if a line has fewer than 3 or more than 12 columns.
     """
     for row in FileOrSequence.__iter__(self):
         if row.startswith("track"):
             continue
         cols = row.split()
         ncols = len(cols)
         if ncols < 3:
             raise ValueError("BED file line contains less than 3 fields")
         if ncols > 12:
             raise ValueError("BED file line contains more than 12 fields")

         strand = cols[5] if ncols > 5 else "."
         iv = GenomicInterval(cols[0], int(cols[1]), int(cols[2]), strand)

         label = cols[3] if ncols > 3 else "unnamed"
         feature = GenomicFeature(label, "BED line", iv)

         if ncols > 4:
             feature.score = float(cols[4])
         else:
             feature.score = None

         if ncols > 7:
             feature.thick = GenomicInterval(
                 iv.chrom, int(cols[6]), int(cols[7]), iv.strand)
         else:
             feature.thick = None

         if ncols > 8:
             feature.itemRgb = [int(part) for part in cols[8].split(",")]
         else:
             feature.itemRgb = None

         yield feature
コード例 #8
0
 def __iter__(self):
     """Iterate over the records of a Solexa export stream.

     Each line from ``FileOrSequence.__iter__`` is split into named fields
     by ``SolexaExportReader.parse_line_bare`` and turned into a
     ``SolexaExportAlignment``.

     Yields:
         ``SolexaExportAlignment`` objects with ``read``, ``passed_filter``
         and ``index_string`` set. If the position field is empty, ``iv``
         is ``None`` and ``nomatch_code`` holds the chromosome field;
         otherwise ``iv`` is a ``GenomicInterval`` starting at the position
         and spanning the read length.

     Raises:
         ValueError: on a passed-filtering value other than "Y"/"N" or a
             strand value other than "F"/"R".

     Warns:
         When the read number field is not "1" (paired-end data).
     """
     for line in FileOrSequence.__iter__(self):
         record = SolexaExportAlignment()
         fields = SolexaExportReader.parse_line_bare(line)

         if fields['read_nbr'] != "1":
             warnings.warn(
                 "Paired-end read encountered. PE is so far supported only "
                 "for SAM files, not yet for SolexaExport. All PE-related "
                 "fields are ignored.")

         # The read name is assembled from the instrument coordinates.
         read_name = "%s:%s:%s:%s:%s#0" % (
             fields['machine'], fields['lane'], fields['tile'],
             fields['x_coord'], fields['y_coord'])
         record.read = SequenceWithQualities(
             fields['read_seq'], read_name, fields['qual_str'],
             self.qualscale)

         filter_flag = fields['passed_filtering']
         if filter_flag == 'Y':
             record.passed_filter = True
         elif filter_flag == 'N':
             record.passed_filter = False
         else:
             raise ValueError(
                 "Illegal 'passed filter' value in Solexa export data: '%s'."
                 % fields['passed_filtering'])

         record.index_string = fields['index_string']

         if fields['pos'] == '':
             # No position: the chromosome field is stored as nomatch code.
             record.iv = None
             record.nomatch_code = fields['chrom']
         else:
             strand_code = fields['strand']
             if strand_code == 'F':
                 strand = '+'
             elif strand_code == 'R':
                 strand = '-'
             else:
                 raise ValueError(
                     "Illegal strand value in Solexa export data.")
             start = int(fields['pos'])
             # Fall back to the contig field when the chromosome is empty.
             if fields['chrom'] == "":
                 chrom = fields['contig']
             else:
                 chrom = fields['chrom']
             end = start + len(fields['read_seq'])
             record.iv = GenomicInterval(chrom, start, end, strand)

         yield record
コード例 #9
0
    def parse_meta(self, header_filename=None):
        """Parse the leading '#' lines of a VCF stream into attributes.

        Reading stops at the first line that does not start with '#'.

        * "##" lines are matched against ``_re_vcf_meta_comment``; lines
          that do not match are ignored. For the keys "INFO", "FILTER" and
          "FORMAT" the value is broken into a dict (entries found by
          ``_re_vcf_meta_descr``, split on the first "="), its "ID" entry is
          removed and used as the key under which the dict is stored in
          ``self.info``, ``self.filters`` or ``self.formats``. Any other key
          is stored verbatim in ``self.metadata``.
        * A line with a single '#' is the column header: the tab-separated
          columns from the tenth onwards are stored in ``self.sampleids``
          and their count in ``self.nsamples``.

        Args:
            header_filename: optional path of a separate file to read the
                header from. When ``None``, the lines come from
                ``FileOrSequence.__iter__(self)``.

        Raises:
            KeyError: if an INFO/FILTER/FORMAT line has no "ID" entry.
        """
        # Attribute name that receives the parsed dict for each structured
        # key; resolved lazily so only the attribute actually needed is read.
        structured = {"INFO": "info", "FILTER": "filters", "FORMAT": "formats"}

        def consume(lines):
            for line in lines:
                if not line.startswith('#'):
                    break
                if not line.startswith("##"):
                    # Column header line: sample names follow the 9 fixed
                    # columns.
                    self.sampleids = line.rstrip("\t\n").split("\t")[9:]
                    self.nsamples = len(self.sampleids)
                    continue
                mo = _re_vcf_meta_comment.match(line)
                if not mo:
                    continue
                kind = mo.group(1)
                target = structured.get(kind)
                if target is None:
                    self.metadata[kind] = mo.group(2)
                    continue
                value = dict(
                    e.rstrip(",").split("=", 1)
                    for e in _re_vcf_meta_descr.findall(mo.group(2)))
                key = value.pop("ID")
                getattr(self, target)[key] = value

        if header_filename is None:
            consume(FileOrSequence.__iter__(self))
        else:
            # Context manager so the header file is closed even though
            # reading stops at the first non-header line.
            with open(header_filename, "r") as header_file:
                consume(header_file)
コード例 #10
0
 def __iter__(self):
     """Iterate over the variant calls of a VCF stream.

     Lines from ``FileOrSequence.__iter__`` that consist of a lone newline
     or start with '#' are skipped.

     Yields:
         The result of ``VariantCall.fromline`` for every remaining line,
         called with ``self.nsamples`` and ``self.sampleids``.
     """
     for raw in FileOrSequence.__iter__(self):
         is_blank = raw == "\n"
         if is_blank or raw.startswith('#'):
             continue
         yield VariantCall.fromline(raw, self.nsamples, self.sampleids)