Ejemplo n.º 1
0
def convert(in_file, in_format, out_file, out_format, alphabet=None):
    """Convert between two alignment files, returns number of alignments.

    Arguments:
        - in_file - an input handle or filename
        - in_format - input file format, lower case string
        - out_file - an output handle or filename
        - out_format - output file format, lower case string
        - alphabet - optional alphabet to assume

    The returned value is whatever ``write`` reports for the converted data.

    **NOTE** - If you provide an output filename, it will be opened which will
    overwrite any existing file without warning. This may happen even if the
    conversion is aborted (e.g. an invalid out_format name is given).
    """
    # No optimised conversions exist yet; the work is delegated to parse/write.
    with as_handle(in_file, 'rU') as source:
        # parse() receives the input before the output is opened.
        parsed = parse(source, in_format, None, alphabet)

        # write() validates its arguments only after the output has been
        # opened, so a bad out_format is reported too late to spare the file.
        with as_handle(out_file, 'w') as sink:
            n_written = write(parsed, sink, out_format)

    return n_written
Ejemplo n.º 2
0
def convert(in_file, in_format, out_file, out_format, alphabet=None):
    """Convert between two alignment files, returns number of alignments.

     - in_file - an input handle or filename
     - in_format - input file format, lower case string
     - out_file - an output handle or filename
     - out_format - output file format, lower case string
     - alphabet - optional alphabet to assume

    The count handed back is the value returned by ``write``.

    NOTE - If you provide an output filename, it will be opened which will
    overwrite any existing file without warning. This may happen even if the
    conversion is aborted (e.g. an invalid out_format name is given).
    """
    # Generic route only: parse the input, then write it back out.
    with as_handle(in_file, 'rU') as reader:
        # The input side is set up first; the output is opened afterwards.
        alignment_source = parse(reader, in_format, None, alphabet)

        # Argument checking for the output happens inside write(), i.e.
        # after the output file has already been opened.
        with as_handle(out_file, 'w') as writer:
            total = write(alignment_source, writer, out_format)

    return total
Ejemplo n.º 3
0
def convert(in_file, in_format, out_file, out_format, alphabet=None):
    """Convert between two sequence file formats, return number of records.

    Arguments:
     - in_file - an input handle or filename
     - in_format - input file format, lower case string
     - out_file - an output handle or filename
     - out_format - output file format, lower case string
     - alphabet - optional alphabet to assume

    The input is opened in binary mode when in_format is listed in
    ``_BinaryFormats``; the output is opened in binary mode for the
    "sff" and "sff_trim" formats. Text mode is used otherwise.

    **NOTE** - If you provide an output filename, it will be opened which will
    overwrite any existing file without warning. This may happen even if
    the conversion is aborted (e.g. an invalid out_format name is given).

    For example, going from a filename to a handle:

    >>> from Bio import SeqIO
    >>> try:
    ...     from StringIO import StringIO # Python 2
    ... except ImportError:
    ...     from io import StringIO # Python 3
    ...
    >>> handle = StringIO("")
    >>> SeqIO.convert("Quality/example.fastq", "fastq", handle, "fasta")
    3
    >>> print(handle.getvalue())
    >EAS54_6_R1_2_1_413_324
    CCCTTCTTGTCTTCAGCGTTTCTCC
    >EAS54_6_R1_2_1_540_792
    TTGGCAGGCCAAGGCCGATGGATCA
    >EAS54_6_R1_2_1_443_348
    GTTGCTTCTGGCGTGGGTGGGGGGG
    <BLANKLINE>
    """
    # Binary formats need a binary handle (currently a special case for SFF).
    in_mode = 'rb' if in_format in _BinaryFormats else 'rU'
    out_mode = 'wb' if out_format in ["sff", "sff_trim"] else 'w'

    # The converter checks the arguments and reports errors, but only once
    # both files are already open.
    from ._convert import _handle_convert  # Lazy import
    with as_handle(in_file, in_mode) as source:
        with as_handle(out_file, out_mode) as sink:
            n_records = _handle_convert(source, in_format,
                                        sink, out_format,
                                        alphabet)
    return n_records
Ejemplo n.º 4
0
def _handle_convert(in_file, in_format, out_file, out_format, alphabet=None):
    """Convert handles from one format to another (PRIVATE).

    A dedicated converter registered in ``_converter`` under the key
    ``(in_format, out_format)`` is used when one exists; otherwise the
    records are passed through ``SeqIO.parse`` and ``SeqIO.write``.
    Returns whatever the chosen route returns.
    """
    try:
        converter = _converter[(in_format, out_format)]
    except KeyError:
        converter = None

    if not converter:
        # No specialised converter: use the generic parse/write route.
        parsed = SeqIO.parse(in_file, in_format, alphabet)
        return SeqIO.write(parsed, out_file, out_format)

    with as_handle(in_file, "r") as source:
        with as_handle(out_file, "w") as sink:
            return converter(source, sink, alphabet)
Ejemplo n.º 5
0
def parse(handle, file_format):
    """
    Iterate over a gene ontology file.

    Parameters:
     - handle - File handle object to read from, or filename as a string,
     - file_format - lower case string describing the file format to read,
         Formats:
             - obo
             - tsv

    This is a generator function: the argument checks and the opening of
    the file only happen once iteration starts.

    Raises TypeError if file_format is not a string, and ValueError if it
    is empty, not lower case, or not a key of ``_FormatToIterator``.

    You should close the handle after calling this function.
    """

    if not isinstance(file_format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not file_format:
        raise ValueError("Format required (lower case string)")
    if file_format != file_format.lower():
        # Interpolate the offending argument, not the ``format`` builtin.
        raise ValueError("Format string '%s' should be lower case"
                         % file_format)
    with as_handle(handle, 'rU') as fp:
        if file_format not in _FormatToIterator:
            raise ValueError("Unknown format '%s'" % file_format)

        # Map the file format to its iterator and relay every element.
        iterator_generator = _FormatToIterator[file_format]
        for el in iterator_generator(fp):
            yield el
Ejemplo n.º 6
0
def read(handle, file_format, **params):
    """
    Read file in given format.

    Parameters:
     - handle - File handle object to read from, or filename as a string,
     - file_format - lower case string describing the file format to read,
         Formats:
             - nexo
             - obo
             - etsv
             - gaf
     - params - additional parameters, passed on to the reader class

    Returns the result of the selected reader's ``read()`` method.

    Raises TypeError if file_format is not a string, and ValueError if it
    is empty, not lower case, or not a key of ``_FormatToReader``.

    You should close the handle after calling this function.
    """

    if not isinstance(file_format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not file_format:
        raise ValueError("Format required (lower case string)")
    if file_format != file_format.lower():
        # Interpolate the offending argument, not the ``format`` builtin.
        raise ValueError("Format string '%s' should be lower case"
                         % file_format)
    with as_handle(handle, 'rU') as fp:
        if file_format not in _FormatToReader:
            raise ValueError("Unknown format '%s'" % file_format)

        # Map the file format to a reader class and let it do the work.
        reader_generator = _FormatToReader[file_format]
        return reader_generator(fp, **params).read()
Ejemplo n.º 7
0
def write(data, handle, file_format, **params):
    """
    Write the given data to a file.

    Parameters:
     - data - data to write to a file,
     - handle - File handle object to write to, or filename as string,
     - file_format - lower case string describing the file format to write,
         Formats:
             - png - writes picture of graph to png format (this feature needs
               pygraphviz to be installed)
             - etsv
     - params - additional parameters, passed on to the writer class

    Raises TypeError if file_format is not a string, and ValueError if it
    is empty or not a key of ``_FormatToWriter``.

    You should close the handle after calling this function.

    """

    if not isinstance(file_format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not file_format:
        raise ValueError("Format required (lower case string)")

    with as_handle(handle, 'w') as out:
        # The format is only looked up once the output has been opened.
        if file_format not in _FormatToWriter:
            raise ValueError("Unknown format '%s'" % file_format)
        make_writer = _FormatToWriter[file_format]
        make_writer(out, **params).write(data)
Ejemplo n.º 8
0
def pretty_print(enrichment, graph, handle, file_format, **params):
    """
    Print results returned by enrichment finder in a specified format.

     Parameters:
     - enrichment - result from EnrichmentFinder
     - graph - OntologyGraph containing the enriched nodes
     - handle - File handle object to write to, or filename as a string,
     - file_format - lower case string describing the file format to write,
         Formats:
             - gml
             - png
             - txt
             - html
     - params - additional parameters, passed on to the printer class

    Raises TypeError if file_format is not a string, and ValueError if it
    is empty or not a key of ``_FormatToPrinter``.

    You should close the handle after calling this function.
    """

    if not isinstance(file_format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not file_format:
        raise ValueError("Format required (lower case string)")

    with as_handle(handle, 'w') as out:
        # The format is only looked up once the output has been opened.
        if file_format not in _FormatToPrinter:
            raise ValueError("Unknown format '%s'" % file_format)
        printer = _FormatToPrinter[file_format](out, **params)
        printer.pretty_print(enrichment, graph)
Ejemplo n.º 9
0
    def restore(cls, fpOrFilePrefix):
        """
        Load a database from a file.

        @param fpOrFilePrefix: A file pointer, or the C{str} prefix of a file
            name. If a C{str}, cls.SAVE_SUFFIX is appended to get the full
            file name.
        @return: An instance built by calling C{cls} with the restored
            parameters and connector.
        @raises ValueError: If a now non-existent connector class name is
            found in the saved database file.
        """
        gotPrefix = isinstance(fpOrFilePrefix, str)
        if gotPrefix:
            source = fpOrFilePrefix + cls.SAVE_SUFFIX
        else:
            source = fpOrFilePrefix
        filePrefix = fpOrFilePrefix if gotPrefix else None

        with as_handle(source) as fp:
            dbParams = DatabaseParameters.restore(fp)
            # The state is stored on the next line; its final character
            # is dropped before decoding.
            stateLine = fp.readline()
            state = loads(stateLine[:-1])

        # Pick the connector class matching the name recorded in the state.
        savedName = state['_connectorClassName']
        if savedName == SimpleConnector.__name__:
            connector = SimpleConnector.restore(fpOrFilePrefix)
        elif six.PY3 and savedName == WampServerConnector.__name__:
            connector = WampServerConnector.restore(fpOrFilePrefix)
        else:
            raise ValueError('Unknown backend connector class %r.' %
                             savedName)

        return cls(dbParams, connector, filePrefix=filePrefix)
Ejemplo n.º 10
0
def parse_text_coords(fname, coord_only, _keep_strand):
    """Parse text coordinates: chrom:start-end

    Text coordinates are assumed to be counting from 1, so the start value
    is decremented by one in the yielded tuples.

    Yields (chrom, start, end) when coord_only is true, otherwise
    (chrom, start, end, name) where name defaults to '-' if the line has
    no third colon-separated field. The _keep_strand argument is not used.
    """
    if coord_only:

        @report_bad_line
        def _parse_line(line):
            chrom, span = line.rstrip().split(':', 1)
            lo, hi = span.split('-')
            # Discard anything after a further ':' (a no-op without one).
            hi = hi.split(':', 1)[0]
            return chrom, int(lo) - 1, int(hi)
    else:

        @report_bad_line
        def _parse_line(line):
            parts = line.split(':')
            n_parts = len(parts)
            if n_parts == 2:
                chrom, span = parts
                name = '-'
            elif n_parts == 3:
                chrom, span, name = parts
            else:
                raise ValueError
            lo, hi = span.split('-')
            return chrom, int(lo) - 1, int(hi), name.rstrip()

    with as_handle(fname, 'rU') as infile:
        for text_line in infile:
            yield _parse_line(text_line)
Ejemplo n.º 11
0
def _read_text_integers(handleish, sep=None, header=False):
    """
    Read and separate text and integers, where integers can be found in the form of ranges, e.g. "4-10"

    Every line is stripped and split on ``sep``. A cell that ``int()``
    accepts is stored as a number; a cell for which ``st.re_range`` returns
    something other than None contributes all the returned values as
    numbers; any other cell is stored as text.

    :param handleish: file name or handle, passed to ``as_handle``
    :param sep: separator given to ``str.split`` (None splits on whitespace)
    :param header: if true, the first line is skipped; an empty file is
        tolerated and yields two empty lists
    :return: [[text,...],...], [[number,...],...] with one inner list per line
    """
    texts, numbers = [], []

    with as_handle(handleish) as fh:
        if header:
            # The default keeps an empty file from raising StopIteration.
            next(fh, None)

        for line in fh:
            row_texts, row_numbers = [], []
            texts.append(row_texts)
            numbers.append(row_numbers)
            for cell in line.strip().split(sep):
                try:
                    value = int(cell)
                except ValueError:
                    value = None
                if value is not None:
                    row_numbers.append(value)
                    continue
                # Not a plain integer: try it as a range, else keep as text.
                _range = st.re_range(cell)
                if _range is not None:
                    row_numbers.extend(_range)
                else:
                    row_texts.append(cell)

    return texts, numbers
Ejemplo n.º 12
0
def write(alignments, handle, format):
    """Write complete set of alignments to a file.

    Arguments:
     - alignments - A list (or iterator) of MultipleSeqAlignment objects,
       or a single alignment object.
     - handle    - File handle object to write to, or filename as string.
     - format    - lower case string describing the file format to write.

    Formats with a writer in this module's ``_FormatToWriter`` are written
    directly; formats only known to ``SeqIO._FormatToWriter`` are written
    one alignment at a time through ``SeqIO.write``.

    You should close the handle after calling this function.

    Returns the number of alignments written (as an integer).
    """
    from Bio import SeqIO

    # Validate the format argument up front to give helpful messages.
    if not isinstance(format, str):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)

    # A lone alignment is accepted and treated as a one-element list.
    if isinstance(alignments, MultipleSeqAlignment):
        alignments = [alignments]

    with as_handle(handle, "w") as out:
        if format in _FormatToWriter:
            # Dedicated alignment writer for this format.
            count = _FormatToWriter[format](out).write_file(alignments)
        elif format in SeqIO._FormatToWriter:
            # Fall back on the SeqIO writer, one alignment per call.
            count = 0
            for count, alignment in enumerate(alignments, 1):
                if not isinstance(alignment, MultipleSeqAlignment):
                    raise TypeError(
                        "Expect a list or iterator of MultipleSeqAlignment "
                        "objects, got: %r" % alignment
                    )
                SeqIO.write(alignment, out, format)
        else:
            read_only = (
                format in _FormatToIterator or format in SeqIO._FormatToIterator
            )
            if read_only:
                raise ValueError(
                    "Reading format '%s' is supported, but not writing" % format
                )
            raise ValueError("Unknown format '%s'" % format)

    if not isinstance(count, int):
        raise RuntimeError(
            "Internal error - the underlying %s "
            "writer should have returned the alignment count, not %s"
            % (format, repr(count))
        )

    return count
Ejemplo n.º 13
0
def process_output(fg_aln, bg_aln, hits, alpha, output, pattern, pdb_data):
    """Generate the output files from the processed data.

    - The hits are written to ``output`` with ``write_pvalues``.
    - If ``pattern`` is given, the top hits are written to that file with
      ``write_mcbpps``; when ``bg_aln`` is also given, pair logos are made
      using the pattern file name without its extension as the base name.
    - If ``pdb_data`` holds exactly one entry, a script built by
      ``pmlscript.build_single`` is written to "<pdb file name>.pml".
      With several entries no script is written (not implemented yet) and
      a warning is logged instead.
    """
    with as_handle(output, 'w+') as outfile:
        write_pvalues(hits, outfile, alpha)
    tophits = top_hits(hits, alpha)
    if pattern:
        with open(pattern, 'w+') as ptnfile:
            write_mcbpps(tophits, ptnfile)
        # XXX hack: don't make pairlogo in single mode
        if bg_aln:
            pairlogo.make_pairlogos(fg_aln, bg_aln, tophits,
                                    pattern.rsplit('.', 1)[0],
                                    10)
    if pdb_data:
        if len(pdb_data) == 1:
            patterns = [t[0] for t in tophits]
            pdb_fname, pdb_rec, pdb_resnums, pdb_inserts = pdb_data[0]
            script = pmlscript.build_single(pdb_resnums, pdb_inserts,
                                            patterns, pdb_fname,
                                            pdb_rec.annotations['chain'])
            pml_fname = pdb_fname + ".pml"
            with open(pml_fname, 'w+') as pmlfile:
                pmlfile.write(script)
            logging.info("Wrote %s", pml_fname)
        else:
            # TODO multi-PDB mode
            # No script can be built here yet, so do not create an empty
            # .pml file and then fail on the missing script.
            logging.warning("Got %d PDB structures; .pml script output is "
                            "not implemented for more than one, skipping",
                            len(pdb_data))
Ejemplo n.º 14
0
def SnapGeneIterator(handle):
    """Parse a SnapGene file and return a SeqRecord object.

    Note that a SnapGene file can only contain one sequence, so this
    iterator will always return a single record.

    Raises ValueError if the file holds no packets, if its first packet is
    not of type 0x09 (the cookie packet), or if the record has no sequence
    once all packets have been handled.
    """
    record = SeqRecord(None)
    seen_packet = False

    with as_handle(handle, "rb") as stream:

        for packet_type, length, data in _PacketIterator(stream):
            if not seen_packet:
                # Only the very first packet is checked for the cookie type.
                seen_packet = True
                if packet_type != 0x09:
                    raise ValueError(
                        "The file does not start with a SnapGene cookie packet")

            # Packet types without a registered handler are ignored.
            if packet_type in _packet_handlers:
                handler = _packet_handlers[packet_type]
                handler(length, data, record)

        if not seen_packet:
            raise ValueError("Empty file.")

        if not record.seq:
            raise ValueError("No DNA packet in file")

        yield record
Ejemplo n.º 15
0
def write_PDB(entity, file, pdbid=None, chainid=None):
    """Write PDB file with HEADER and TITLE.

    Arguments:
     - entity - object whose ``level`` must be 'S'; its ``header`` mapping
       supplies the HEADER and TITLE records
     - file - output handle or filename, opened for writing
     - pdbid - ID for the HEADER record; defaults to the header's 'idcode'
     - chainid - accepted but not used by this function

    A HEADER line is only written if the header has a 'head' entry, and a
    TITLE line only if it has a 'name' entry. The coordinates are then
    saved with PDBIO.

    Raises PDBException if the entity level is not 'S', and Exception if a
    KeyError occurs while writing.
    """
    with as_handle(file, 'w') as fp:
        try:
            if 'S' != entity.level:
                raise PDBException("level not 'S': "
                                   + str(entity.level))

            if not pdbid:
                pdbid = entity.header.get('idcode', None)
            hdr = entity.header.get('head', None)
            dd = entity.header.get('deposition_date', None)
            if hdr:
                fp.write(('HEADER    {:40}{:8}   {:4}\n'
                          ).format(hdr.upper(), (dd or ''), (pdbid or '')))
            nam = entity.header.get('name', None)
            if nam:
                fp.write('TITLE     ' + nam.upper() + '\n')
            io = PDBIO()
            io.set_structure(entity)
            io.save(fp)
        except KeyError:
            # Name this function in the message (it used to say write_PIC).
            raise Exception(
                "write_PDB: argument is not a Biopython PDB Entity "
                + str(entity))
Ejemplo n.º 16
0
def FastaTwoLineIterator(handle, alphabet=single_letter_alphabet):
    """Iterate over two-line Fasta records (as SeqRecord objects).

    Arguments:
     - handle - input file
     - alphabet - optional alphabet

    This uses a strict interpretation of the FASTA as requiring
    exactly two lines per record (no line wrapping).

    The first whitespace-separated word of the title is used as both the
    record id and name (an empty string if the title is empty), and the
    whole title becomes the description.
    """
    with as_handle(handle) as stream:
        for title, sequence in FastaTwoLineParser(stream):
            words = title.split(None, 1)
            if words:
                identifier = words[0]
            else:
                # No words at all should mean the title itself is empty.
                assert not title, repr(title)
                identifier = ""
            yield SeqRecord(
                Seq(sequence, alphabet),
                id=identifier,
                name=identifier,
                description=title,
            )
Ejemplo n.º 17
0
def pretty_print(enrichment, graph, handle, file_format, **params):
    """
    Print results returned by enrichment finder in a specified format.

     Parameters:
     - enrichment - result from EnrichmentFinder
     - graph - OntologyGraph containing the enriched nodes
     - handle - File handle object to write to, or filename as a string,
     - file_format - lower case string describing the file format to write,
         Formats:
             - gml
             - png
             - txt
             - html
     - params - additional parameters, passed on to the printer class

    Raises TypeError if file_format is not a string, and ValueError if it
    is empty or not a key of ``_FormatToPrinter``.

    You should close the handle after calling this function.
    """

    if not isinstance(file_format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not file_format:
        raise ValueError("Format required (lower case string)")

    with as_handle(handle, 'w') as out:
        # The format is only looked up once the output has been opened.
        if file_format not in _FormatToPrinter:
            raise ValueError("Unknown format '%s'" % file_format)
        printer = _FormatToPrinter[file_format](out, **params)
        printer.pretty_print(enrichment, graph)
Ejemplo n.º 18
0
    def get_structure(self, id, file):
        """Return the structure.

        Arguments:
         - id - string, the id that will be used for the structure
         - file - name of the PDB file OR an open filehandle

        When ``self.QUIET`` is true, PDBConstructionWarning is ignored
        while the file is parsed. The warning filters that were active
        before the call are put back afterwards, even if parsing raises.
        """
        quiet = self.QUIET
        if quiet:
            warning_list = warnings.filters[:]
            warnings.filterwarnings("ignore", category=PDBConstructionWarning)

        try:
            self.header = None
            self.trailer = None
            # Make a StructureBuilder instance (pass id of structure as
            # parameter)
            self.structure_builder.init_structure(id)

            with as_handle(file) as handle:
                self._parse(handle.readlines())

            self.structure_builder.set_header(self.header)
            # Fetch the Structure instance that was built
            structure = self.structure_builder.get_structure()
        finally:
            # Restore the saved filters on failure as well as on success.
            if quiet:
                warnings.filters = warning_list

        return structure
Ejemplo n.º 19
0
    def restore(cls, fpOrFilePrefix):
        """
        Restore state from a file.

        @param fpOrFilePrefix: A file pointer or the C{str} prefix of a file
            name. If a C{str}, cls.SAVE_SUFFIX is appended to get the full
            file name.
        @return: An instance built by calling C{cls} with the restored
            parameters and state.
        @raises ValueError: If valid JSON cannot be loaded from C{fp}.
        """
        gotPrefix = isinstance(fpOrFilePrefix, str)
        if gotPrefix:
            source = fpOrFilePrefix + cls.SAVE_SUFFIX
        else:
            source = fpOrFilePrefix
        filePrefix = fpOrFilePrefix if gotPrefix else None

        with as_handle(source) as fp:
            dbParams = DatabaseParameters.restore(fp)
            # The state is stored on the next line; its final character
            # is dropped before decoding.
            stateLine = fp.readline()
            state = loads(stateLine[:-1])

        # Rebuild the per-backend records, wrapping each stored checksum.
        disconnectedBackends = {
            name: {
                'checksum': Checksum(info['checksum']),
                'subjectCount': info['subjectCount'],
            }
            for name, info in state['disconnectedBackends'].items()
        }

        return cls(dbParams,
                   _id=state['id'],
                   checksum=Checksum(state['checksum']),
                   disconnectedBackends=disconnectedBackends,
                   filePrefix=filePrefix)
Ejemplo n.º 20
0
def PhdIterator(handle):
    """Return SeqRecord objects from a PHD file.

    This uses the Bio.Sequencing.Phd module to do the hard work.

    Each record's id and name are the first whitespace-separated word of
    the PHD file name; the full file name becomes the description.
    """
    with as_handle(handle, "rU") as stream:
        for phd_record in Phd.parse(stream):
            # The "filename" can contain spaces, e.g.
            # 'HWI-EAS94_4_1_1_602_99 1' in the unit test file phd_solexa.
            # That would cause problems as a record identifier (e.g. in
            # FASTQ output), so only the first word is used.
            full_name = phd_record.file_name
            name = full_name.split(None, 1)[0]
            seq_record = SeqRecord(phd_record.seq,
                                   id=name,
                                   name=name,
                                   description=full_name)
            # The comments dictionary doubles as the record's annotations.
            seq_record.annotations = phd_record.comments
            # Qualities and peak locations go in as per-letter annotation.
            sites = phd_record.sites
            seq_record.letter_annotations["phred_quality"] = [
                int(entry[1]) for entry in sites
            ]
            try:
                seq_record.letter_annotations["peak_location"] = [
                    int(entry[2]) for entry in sites
                ]
            except IndexError:
                # peak locations are not always there according to
                # David Gordon (the Consed author)
                pass
            yield seq_record
Ejemplo n.º 21
0
def parse_text_coords(fname, coord_only, _keep_strand):
    """Parse text coordinates: chrom:start-end

    Text coordinates are assumed to be counting from 1, so the start value
    is decremented by one in the yielded tuples.

    Yields (chrom, start, end) when coord_only is true, otherwise
    (chrom, start, end, name) where name defaults to '-' if the line has
    no third colon-separated field. The _keep_strand argument is not used.
    """
    if coord_only:
        @report_bad_line
        def _parse_line(line):
            chrom, span = line.rstrip().split(':', 1)
            lo, hi = span.split('-')
            # Discard anything after a further ':' (a no-op without one).
            hi = hi.split(':', 1)[0]
            return chrom, int(lo) - 1, int(hi)
    else:
        @report_bad_line
        def _parse_line(line):
            parts = line.split(':')
            n_parts = len(parts)
            if n_parts == 2:
                chrom, span = parts
                name = '-'
            elif n_parts == 3:
                chrom, span, name = parts
            else:
                raise ValueError
            lo, hi = span.split('-')
            return chrom, int(lo) - 1, int(hi), name.rstrip()

    with as_handle(fname, 'rU') as infile:
        for text_line in infile:
            yield _parse_line(text_line)
Ejemplo n.º 22
0
def read_bed(infile):
    """UCSC Browser Extensible Data (BED) format.

    A BED file has these columns:
        chromosome, start position, end position, [gene, strand, other stuff...]

    Coordinate indexing is from 0.

    Sets of regions are separated by "track" lines. This function stops reading
    after encountering a track line other than the first one in the file.

    Returns a pandas DataFrame with the columns chromosome, start, end,
    gene and strand. A missing gene is recorded as '-' and a missing
    strand as '.'. An empty input gives a table with no rows.
    """
    # ENH: just pd.read_table, skip 'track'
    @report_bad_line
    def _parse_line(line):
        fields = line.split('\t', 6)
        chrom, start, end = fields[:3]
        gene = (fields[3].rstrip() if len(fields) >= 4 else '-')
        strand = (fields[5].rstrip() if len(fields) >= 6 else '.')
        return chrom, int(start), int(end), gene, strand

    def track2track(handle):
        """Yield the lines of the first track, leaving out "track" lines."""
        try:
            firstline = next(handle)
        except StopIteration:
            # Empty input: finish cleanly. A StopIteration escaping from a
            # generator body becomes a RuntimeError under PEP 479.
            return
        if not firstline.startswith("track"):
            yield firstline
        for line in handle:
            if line.startswith('track'):
                # A further track line ends this set of regions; use
                # ``return`` rather than ``raise StopIteration`` (PEP 479).
                return
            yield line

    with as_handle(infile, 'rU') as handle:
        rows = map(_parse_line, track2track(handle))
        return pd.DataFrame.from_records(
            rows, columns=["chromosome", "start", "end", "gene", "strand"])
Ejemplo n.º 23
0
def read_vcf_simple(infile):
    """Read the body of a VCF file into a table, without parsing samples.

    Lines starting with '##' are skipped. The column header line (which
    must start with '#CHR') supplies the names of any columns after the
    ninth, which are kept as plain string columns. The 'start' column is
    read as an integer and decremented by one, 'qual' is converted with
    ``parse_qual``, and an 'end' column is derived from 'info' with
    ``parse_end_from_info`` before ``set_ends`` is applied.

    Raises ValueError if the column header line is missing or if another
    kind of line comes before it.
    """
    # ENH: Make all readers return a tuple (header_string, body_table)
    # ENH: usecols -- need to trim dtypes dict to match?
    header_line = None
    with as_handle(infile, 'rU') as handle:
        for line in handle:
            if line.startswith('##'):
                continue
            if not line.startswith('#CHR'):
                raise ValueError("Expected the VCF column header line "
                                 "(#CHROM ...), got: %r" % line)
            header_line = line
            break
        if header_line is None:
            raise ValueError("VCF column header line (#CHROM ...) not found")

        # Extract sample names from VCF header, keep as column names.
        # Strip the line ending first so the last name stays clean.
        header_fields = header_line.rstrip('\r\n').split('\t')
        sample_ids = header_fields[9:]
        colnames = ['chromosome', 'start', 'id', 'ref', 'alt',
                    'qual', 'filter', 'info', 'format'] + sample_ids
        dtypes = {c: str for c in colnames}
        dtypes['start'] = int
        # 'qual' is handled by a converter below, so it needs no dtype.
        del dtypes['qual']
        # The handle now sits on the first record line.
        table = pd.read_csv(handle, sep='\t', header=None, na_filter=False,
                            names=colnames,
                            converters={'qual': parse_qual},
                            dtype=dtypes)
    # ENH: do things with filter, info
    table['start'] -= 1
    table['end'] = table['info'].apply(parse_end_from_info)
    set_ends(table)
    logging.info("Loaded %d plain records", len(table))
    return table
Ejemplo n.º 24
0
    def save(self, fpOrFilePrefix=None):
        """
        Save state to a file.

        @param fpOrFilePrefix: A file pointer, or the C{str} prefix of a file
            name, or C{None}. If a C{str}, self.SAVE_SUFFIX is appended to get
            the full file name. If C{None}, self._filePrefix will be used as a
            file prefix unless it is also C{None}.
        @raises ValueError: If C{fpOrFilePrefix} and C{self._filePrefix} are
            both C{None}
        """
        if fpOrFilePrefix is None:
            # Fall back on the prefix remembered by this instance, if any.
            if self._filePrefix is None:
                raise ValueError('save must be given an argument (or the '
                                 'database must have been restored from a '
                                 'file).')
            target = self._filePrefix + self.SAVE_SUFFIX
        elif isinstance(fpOrFilePrefix, str):
            target = fpOrFilePrefix + self.SAVE_SUFFIX
        else:
            target = fpOrFilePrefix

        with as_handle(target, 'w') as fp:
            self.dbParams.save(fp)

        # The backend is handed the original argument, not the derived name.
        self._backend.save(fpOrFilePrefix)
Ejemplo n.º 25
0
def parse(handle, file_format):
    """
    Iterate over a gene ontology file.

    Parameters:
     - handle - File handle object to read from, or filename as a string,
     - file_format - lower case string describing the file format to read,
         Formats:
             - obo
             - tsv

    This is a generator function: the argument checks and the opening of
    the file only happen once iteration starts.

    Raises TypeError if file_format is not a string, and ValueError if it
    is empty, not lower case, or not a key of ``_FormatToIterator``.

    You should close the handle after calling this function.
    """

    if not isinstance(file_format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not file_format:
        raise ValueError("Format required (lower case string)")
    if file_format != file_format.lower():
        # Interpolate the offending argument, not the ``format`` builtin.
        raise ValueError("Format string '%s' should be lower case"
                         % file_format)
    with as_handle(handle, 'rU') as fp:
        if file_format not in _FormatToIterator:
            raise ValueError("Unknown format '%s'" % file_format)

        # Map the file format to its iterator and relay every element.
        iterator_generator = _FormatToIterator[file_format]
        for el in iterator_generator(fp):
            yield el
Ejemplo n.º 26
0
def read(handle, file_format, **params):
    """
    Read file in given format.

    Parameters:
     - handle - File handle object to read from, or filename as a string,
     - file_format - lower case string describing the file format to read,
         Formats:
             - nexo
             - obo
             - etsv
             - gaf
     - params - additional parameters, passed on to the reader class

    Returns the result of the selected reader's ``read()`` method.

    Raises TypeError if file_format is not a string, and ValueError if it
    is empty, not lower case, or not a key of ``_FormatToReader``.

    You should close the handle after calling this function.
    """

    if not isinstance(file_format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not file_format:
        raise ValueError("Format required (lower case string)")
    if file_format != file_format.lower():
        # Interpolate the offending argument, not the ``format`` builtin.
        raise ValueError("Format string '%s' should be lower case"
                         % file_format)
    with as_handle(handle, 'rU') as fp:
        if file_format not in _FormatToReader:
            raise ValueError("Unknown format '%s'" % file_format)

        # Map the file format to a reader class and let it do the work.
        reader_generator = _FormatToReader[file_format]
        return reader_generator(fp, **params).read()
Ejemplo n.º 27
0
def write(data, handle, file_format, **params):
    """
    Writes given data to file.

    Parameters:
     - data - data to write to a file,
     - handle - File handle object to write to, or filename as string
                   (note older versions of Biopython only took a handle),
     - file_format - lower case string describing the file format to write,
         Formats:
             - png - writes picture of graph to png format (this feature needs
               pygraphviz to be installed)
             - etsv
     - params - additional parameters, forwarded to the format's writer

    Raises TypeError if file_format is not a string, and ValueError if it is
    empty or not a registered writer format.

    You should close the handle after calling this function.

    """

    if not isinstance(file_format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not file_format:
        raise ValueError("Format required (lower case string)")

    # Resolve the writer *before* opening the output: opening a filename in
    # 'w' mode truncates it, so an unknown format must be rejected first.
    if file_format not in _FormatToWriter:
        raise ValueError("Unknown format '%s'" % file_format)
    writer_class = _FormatToWriter[file_format]

    with as_handle(handle, 'w') as fp:
        writer_class(fp, **params).write(data)
Ejemplo n.º 28
0
def write_PDB(entity: Structure,
              file: str,
              pdbid: str = None,
              chainid: str = None) -> None:
    """Write PDB file with HEADER and TITLE.

    Arguments:
     - entity - the entity to write; its ``level`` must be "S"
     - file - output filename or handle, opened via as_handle in "w" mode
     - pdbid - id code for the HEADER line; when empty, the "idcode" entry
       of ``entity.header`` is used if a header is present
     - chainid - accepted for interface compatibility; not used here

    A HEADER line is written only if the header has a "head" entry, and a
    TITLE line only if it has a "name" entry.  The coordinates are then
    saved with PDBIO, preserving atom numbering.

    Raises PDBException if ``entity.level`` is not "S", and Exception if a
    KeyError is raised while writing.
    """
    enumerate_atoms(entity)
    with as_handle(file, "w") as fp:
        try:
            if "S" == entity.level:
                if hasattr(entity, "header"):
                    if not pdbid:
                        pdbid = entity.header.get("idcode", None)
                    hdr = entity.header.get("head", None)
                    dd = pdb_date(entity.header.get("deposition_date", None))

                    if hdr:
                        fp.write(("HEADER    {:40}{:8}   {:4}\n").format(
                            hdr.upper(), (dd or ""), (pdbid or "")))
                    nam = entity.header.get("name", None)
                    if nam:
                        fp.write("TITLE     " + nam.upper() + "\n")
                io = PDBIO()
                io.set_structure(entity)
                io.save(fp, preserve_atom_numbering=True)

            else:
                raise PDBException("level not 'S': " + str(entity.level))
        except KeyError:
            # Name this function in the message so the failure is traceable.
            raise Exception(
                "write_PDB: argument is not a Biopython PDB Entity " +
                str(entity))
Ejemplo n.º 29
0
    def get_structure(self, id, file):
        """Return the structure.

        Arguments:
         - id - string, the id that will be used for the structure
         - file - name of the PDB file OR an open filehandle

        When ``self.QUIET`` is true, PDBConstructionWarning is ignored for
        the duration of the parse.
        """
        # catch_warnings() restores the previous warning filters on exit,
        # including when parsing raises, so a failed parse cannot leave the
        # "ignore" filter installed globally.
        with warnings.catch_warnings():
            if self.QUIET:
                warnings.filterwarnings('ignore',
                                        category=PDBConstructionWarning)

            self.header = None
            self.trailer = None
            # Make a StructureBuilder instance (pass id of structure as
            # parameter)
            self.structure_builder.init_structure(id)

            with as_handle(file) as handle:
                self._parse(handle.readlines())

            self.structure_builder.set_header(self.header)
            # Return the Structure instance
            structure = self.structure_builder.get_structure()

        return structure
    def get_structure(self, id, file):
        """Parse a PDB file and return the resulting structure.

        Arguments:
         - id - string, the id that will be used for the structure
         - file - name of the PDB file OR an open filehandle

        Raises ValueError if the file contains no lines.  When ``self.QUIET``
        is true, PDBConstructionWarning is ignored while parsing.
        """
        with warnings.catch_warnings():
            if self.QUIET:
                warnings.filterwarnings(
                    "ignore", category=PDBConstructionWarning
                )

            # Reset per-parse state, then start a new structure with this id.
            self.header = self.trailer = None
            self.structure_builder.init_structure(id)

            with as_handle(file, mode="rU") as handle:
                content = handle.readlines()
                if not content:
                    raise ValueError("Empty file.")
                self._parse(content)

            # Attach the header collected during parsing and hand back the
            # finished Structure instance.
            self.structure_builder.set_header(self.header)
            return self.structure_builder.get_structure()
Ejemplo n.º 31
0
def write(sequences, handle, format):
    """Write complete set of sequences to a file.

        - sequences - A list (or iterator) of SeqRecord objects, or (if using
          Biopython 1.54 or later) a single SeqRecord.
        - handle    - File handle object to write to, or filename as string
          (note older versions of Biopython only took a handle).
        - format    - lower case string describing the file format to write.

    You should close the handle after calling this function.

    Raises TypeError if format is not a string, and ValueError if it is
    empty, not lower case, read-only, or unknown.  The format is checked
    before the output is opened, so a bad format name does not truncate an
    existing file.

    Returns the number of records written (as an integer).
    """
    from Bio import AlignIO

    # Try and give helpful error messages:
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)

    # Reject formats we cannot write *before* opening the output, because
    # opening a filename in write mode truncates it.
    if format not in _FormatToWriter and format not in AlignIO._FormatToWriter:
        if format in _FormatToIterator or format in AlignIO._FormatToIterator:
            raise ValueError("Reading format '%s' is supported, but not writing"
                             % format)
        raise ValueError("Unknown format '%s'" % format)

    if isinstance(sequences, SeqRecord):
        # This raised an exception in order version of Biopython
        sequences = [sequences]

    if format in _BinaryFormats:
        mode = 'wb'
    else:
        mode = 'w'

    with as_handle(handle, mode) as fp:
        # Map the file format to a writer class
        if format in _FormatToWriter:
            writer_class = _FormatToWriter[format]
            count = writer_class(fp).write_file(sequences)
        else:
            # Only AlignIO can write this format (checked above).
            # Try and turn all the records into a single alignment,
            # and write that using Bio.AlignIO
            alignment = MultipleSeqAlignment(sequences)
            alignment_count = AlignIO.write([alignment], fp, format)
            assert alignment_count == 1, \
                "Internal error - the underlying writer " \
                " should have returned 1, not %s" % repr(alignment_count)
            count = len(alignment)
            del alignment_count, alignment

        assert isinstance(count, int), "Internal error - the underlying %s " \
            "writer should have returned the record count, not %s" \
            % (format, repr(count))

    return count
Ejemplo n.º 32
0
def write(sequences, handle, format):
    """Write complete set of sequences to a file.

     - sequences - A list (or iterator) of SeqRecord objects, or (if using
                   Biopython 1.54 or later) a single SeqRecord.
     - handle    - File handle object to write to, or filename as string
                   (note older versions of Biopython only took a handle).
     - format    - lower case string describing the file format to write.

    You should close the handle after calling this function.

    Raises TypeError if format is not a string, and ValueError if it is
    empty, not lower case, read-only, or unknown.  The format is checked
    before the output is opened, so a bad format name does not truncate an
    existing file.

    Returns the number of records written (as an integer).
    """
    from Bio import AlignIO

    #Try and give helpful error messages:
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)

    #Reject formats we cannot write *before* opening the output, because
    #opening a filename in write mode truncates it.
    if format not in _FormatToWriter and format not in AlignIO._FormatToWriter:
        if format in _FormatToIterator or format in AlignIO._FormatToIterator:
            raise ValueError(
                "Reading format '%s' is supported, but not writing" % format)
        raise ValueError("Unknown format '%s'" % format)

    if isinstance(sequences, SeqRecord):
        #This raised an exception in order version of Biopython
        sequences = [sequences]

    if format in _BinaryFormats:
        mode = 'wb'
    else:
        mode = 'w'

    with as_handle(handle, mode) as fp:
        #Map the file format to a writer class
        if format in _FormatToWriter:
            writer_class = _FormatToWriter[format]
            count = writer_class(fp).write_file(sequences)
        else:
            #Only AlignIO can write this format (checked above).
            #Try and turn all the records into a single alignment,
            #and write that using Bio.AlignIO
            alignment = MultipleSeqAlignment(sequences)
            alignment_count = AlignIO.write([alignment], fp, format)
            assert alignment_count == 1, \
                    "Internal error - the underlying writer " \
                    " should have returned 1, not %s" % repr(alignment_count)
            count = len(alignment)
            del alignment_count, alignment

        assert isinstance(count, int), "Internal error - the underlying %s " \
               "writer should have returned the record count, not %s" \
               % (format, repr(count))

    return count
Ejemplo n.º 33
0
def write_fasta(records, fname):
    """Write records as FASTA, each sequence on a single unwrapped line.

    The header is ">id description" when the stripped description is
    non-empty, otherwise just ">id".
    """
    with as_handle(fname, 'w+') as outfile:
        for record in records:
            description = record.description.strip()
            if description:
                entry = ">%s %s\n%s\n" % (record.id, description, record.seq)
            else:
                entry = ">%s\n%s\n" % (record.id, record.seq)
            outfile.write(entry)
Ejemplo n.º 34
0
def write_fasta(records, fname):
    """Write each record to a FASTA file with its sequence on one line.

    A record whose description is blank after stripping gets a header with
    the id only.
    """
    with as_handle(fname, 'w+') as outfile:
        for rec in records:
            label = rec.description.strip()
            text = (">%s %s\n%s\n" % (rec.id, label, rec.seq) if label
                    else ">%s\n%s\n" % (rec.id, rec.seq))
            outfile.write(text)
Ejemplo n.º 35
0
def read_dict(infile):
    """Load the rows produced by ``_parse_lines`` into a DataFrame.

    The resulting table has the columns "chromosome", "start" and "end".
    """
    with as_handle(infile, 'r') as handle:
        return pd.DataFrame.from_records(
            _parse_lines(handle),
            columns=["chromosome", "start", "end"],  # "file", "md5"
        )
Ejemplo n.º 36
0
def get_mmcif_dictionary(filename):
    """Build an MMCIF2Dict from ``filename``.

    If the first attempt raises UnicodeDecodeError, the file is reopened
    with the utf-16 encoding and the open handle is parsed instead.
    """
    try:
        return MMCIF2Dict(filename)
    except UnicodeDecodeError:
        with as_handle(filename, 'r', encoding='utf-16') as utf16_handle:
            return MMCIF2Dict(utf16_handle)
Ejemplo n.º 37
0
def convert(in_file, in_format, out_file, out_format, alphabet=None):
    """Convert one sequence file format to another, return the record count.

    Arguments:
     - in_file - an input handle or filename
     - in_format - input file format, lower case string
     - out_file - an output handle or filename
     - out_format - output file format, lower case string
     - alphabet - optional alphabet to assume

    **NOTE** - If you provide an output filename, it will be opened which will
    overwrite any existing file without warning. This may happen even if
    the conversion is aborted (e.g. an invalid out_format name is given).

    For example, going from a filename to a handle:

    >>> from Bio import SeqIO
    >>> from io import StringIO
    >>> handle = StringIO("")
    >>> SeqIO.convert("Quality/example.fastq", "fastq", handle, "fasta")
    3
    >>> print(handle.getvalue())
    >EAS54_6_R1_2_1_413_324
    CCCTTCTTGTCTTCAGCGTTTCTCC
    >EAS54_6_R1_2_1_540_792
    TTGGCAGGCCAAGGCCGATGGATCA
    >EAS54_6_R1_2_1_443_348
    GTTGCTTCTGGCGTGGGTGGGGGGG
    <BLANKLINE>
    """
    # Binary formats need binary handles on the matching side.
    if in_format in _BinaryFormats:
        in_mode = "rb"
    else:
        in_mode = "r"

    if out_format in _BinaryFormats:
        out_mode = "wb"
    else:
        out_mode = "w"

    # The argument checks and error messages come from _handle_convert,
    # which only runs after both files are open, which is a shame.
    from ._convert import _handle_convert  # Lazy import

    with as_handle(in_file, in_mode) as in_handle, \
            as_handle(out_file, out_mode) as out_handle:
        return _handle_convert(
            in_handle, in_format, out_handle, out_format, alphabet
        )
Ejemplo n.º 38
0
def parse(handle, format, seq_count=None):
    """Iterate over an alignment file as MultipleSeqAlignment objects.

    Arguments:
     - handle    - handle to the file, or the filename as a string
       (note older versions of Biopython only took a handle).
     - format    - string describing the file format.
     - seq_count - Optional integer, number of sequences expected in each
       alignment.  Recommended for fasta format files.

    If you have the file name in a string 'filename', use:

    >>> from Bio import AlignIO
    >>> filename = "Emboss/needle.txt"
    >>> format = "emboss"
    >>> for alignment in AlignIO.parse(filename, format):
    ...     print("Alignment of length %i" % alignment.get_alignment_length())
    Alignment of length 124
    Alignment of length 119
    Alignment of length 120
    Alignment of length 118
    Alignment of length 125

    If you have a string 'data' containing the file contents, use::

      from Bio import AlignIO
      from io import StringIO
      my_iterator = AlignIO.parse(StringIO(data), format)

    Use the Bio.AlignIO.read() function when you expect a single record only.
    """
    from Bio import SeqIO

    # Argument checks, each with its own explanatory message:
    if not isinstance(format, str):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)
    if not (seq_count is None or isinstance(seq_count, int)):
        raise TypeError("Need integer for seq_count (sequences per alignment)")

    with as_handle(handle) as fp:
        if format in _FormatToIterator:
            # A dedicated alignment iterator exists for this format.
            alignments = _FormatToIterator[format](fp, seq_count)
        elif format in SeqIO._FormatToIterator:
            # Let the existing SeqIO parser do the dirty work.
            alignments = _SeqIO_to_alignment_iterator(
                fp, format, seq_count=seq_count
            )
        else:
            raise ValueError("Unknown format '%s'" % format)

        yield from alignments
Ejemplo n.º 39
0
def XdnaIterator(handle):
    """Parse a Xdna file and yield its single SeqRecord.

    Note that this is an "iterator" in name only since a Xdna file always
    contain a single sequence.
    """
    with as_handle(handle, "rb") as stream:
        # Fixed-size header, followed by some rudimentary checks.
        #
        # "_neg_length" is the length of the part of the sequence before the
        # nucleotide considered as the "origin" (nucleotide number 1, which
        # in DNA Strider is not always the first nucleotide).  It is unpacked
        # but deliberately ignored; the original author notes that SeqRecord
        # has no matching concept as far as they know.
        raw_header = _read_header(stream, 112)
        fields = unpack(">BBB25xII60xI12x", raw_header)
        version, seq_type, topology, length, _neg_length, com_length = fields
        if version != 0:
            raise ValueError("Unsupported XDNA version")
        if seq_type not in _seq_types:
            raise ValueError("Unknown sequence type")

        # The sequence and the comment follow the header.
        sequence = _read(stream, length).decode("ASCII")
        comment = _read(stream, com_length).decode("ASCII")

        # The first "word" of the comment doubles as name and id.
        name = comment.partition(" ")[0]

        record = SeqRecord(
            Seq(sequence, _seq_types[seq_type]),
            description=comment,
            name=name,
            id=name,
        )
        if topology in _seq_topologies:
            record.annotations["topology"] = _seq_topologies[topology]

        if len(stream.read(1)) == 1:
            # One more byte means the optional annotation section is present.

            # The overhangs are read and discarded (no SeqRecord
            # representation for them).
            _read_overhang(stream)  # right-side overhang
            _read_overhang(stream)  # left-side overhang

            # A one-byte count, then that many features.
            (num_features,) = unpack(">B", _read(stream, 1))
            for _ in range(num_features):
                _read_feature(stream, record)

        yield record
Ejemplo n.º 40
0
def sniff_region_format(infile):
    """Guess the format of the given file from its first informative line.

    Blank lines, "track" lines and unrecognized '#' comment lines are
    skipped.  If the filename extension names a known format, that format's
    pattern is tried first on each candidate line.

    Returns
    -------
    str or None
        The detected format name, or None if the file is empty.

    Raises
    ------
    ValueError
        If the first informative line matches none of the known formats.
    """
    # If the filename extension indicates the format, try that first
    fname_fmt = None
    fname = get_filename(infile)
    if fname:
        _base, ext = os.path.splitext(fname)
        # splitext() keeps the leading dot; after stripping it, `ext` is the
        # bare extension and must be looked up as-is (slicing it again would
        # drop its first letter, e.g. "bed" -> "ed").
        ext = ext.lstrip('.')
        if ext in format_patterns:
            fname_fmt = ext

    # Fallback: regex detection
    with as_handle(infile, 'rU') as handle:
        for line in handle:
            if not line.strip():
                # Skip blank lines
                continue
            if line.startswith('track'):
                # NB: Could be UCSC BED or Ensembl GFF
                continue
            if fname_fmt and format_patterns[fname_fmt].match(line):
                return fname_fmt
            # Formats that (may) declare themselves in an initial '#' comment
            if (line.startswith('##gff-version') or
                format_patterns['gff'].match(line)):
                return 'gff'
            if line.startswith(('##fileformat=VCF', '#CHROM\tPOS\tID')):
                return 'vcf'
            if line.startswith('#'):
                continue
            # Formats that need to be guessed solely by regex
            if format_patterns['text'].match(line):
                return 'text'
            if format_patterns['tab'].match(line):
                return 'tab'
            if line.startswith('@') or format_patterns['interval'].match(line):
                return 'interval'
            if format_patterns['refflat'].match(line):
                return 'refflat'
            if format_patterns['bed'].match(line):
                return 'bed'

            raise ValueError("File %r does not appear to be a recognized "
                             "format! (Any of: %s)\n"
                             "First non-blank line:\n%s"
                             % (fname, ', '.join(format_patterns.keys()), line))
Ejemplo n.º 41
0
def parse_bed(fname, coord_only, keep_strand):
    """Parse a BED file.

    A BED file has these columns:
        chromosome, start position, end position,
        [name, score, strand, other stuff...]

    Counting is from 0.

    Sets of regions are separated by "track" lines. This function stops
    iteration after encountering a track line other than the first one in the
    file.

    Yields one tuple per data line, depending on the flags:
     - coord_only and keep_strand: (chrom, start, end, strand)
     - coord_only only: (chrom, start, end)
     - keep_strand only: (chrom, start, end, name, strand)
     - neither: (chrom, start, end, name)

    A missing name defaults to '-' and a missing strand to '.' in the
    non-coord_only modes.  An empty file yields nothing.
    """
    if coord_only:
        if keep_strand:

            @report_bad_line
            def _parse_line(line):
                chrom, start, end, _name, _score, strand = line.split('\t',
                                                                      6)[:6]
                return chrom, int(start), int(end), strand.rstrip()
        else:

            @report_bad_line
            def _parse_line(line):
                chrom, start, end = line.split('\t', 3)[:3]
                return chrom, int(start), int(end)
    elif keep_strand:

        @report_bad_line
        def _parse_line(line):
            fields = line.split('\t', 6)
            chrom, start, end = fields[:3]
            name = (fields[3].rstrip() if len(fields) >= 4 else '-')
            strand = (fields[5].rstrip() if len(fields) >= 6 else '.')
            return chrom, int(start), int(end), name, strand
    else:

        @report_bad_line
        def _parse_line(line):
            fields = line.split('\t', 4)
            chrom, start, end = fields[:3]
            name = (fields[3].rstrip() if len(fields) >= 4 else '-')
            return chrom, int(start), int(end), name

    with as_handle(fname, 'rU') as handle:
        # Use a default so an empty file ends the generator cleanly; a
        # StopIteration escaping a generator body is a RuntimeError under
        # PEP 479.
        firstline = next(handle, None)
        if firstline is None:
            return
        if not firstline.startswith("track"):
            yield _parse_line(firstline)

        for line in handle:
            if line.startswith('track'):
                # A second track line ends this set of regions.  Plain
                # `return` is the correct way to finish a generator.
                return
            yield _parse_line(line)
Ejemplo n.º 42
0
def sniff_region_format(infile):
    """Guess the format of the given file from its first informative line.

    Blank lines, "track" lines and unrecognized '#' comment lines are
    skipped.  If the filename extension names a known format, that format's
    pattern is tried first on each candidate line.

    Returns
    -------
    str or None
        The detected format name, or None if the file is empty.

    Raises
    ------
    ValueError
        If the first informative line matches none of the known formats.
    """
    # If the filename extension indicates the format, try that first
    fname_fmt = None
    fname = get_filename(infile)
    if fname:
        _base, ext = os.path.splitext(fname)
        # splitext() keeps the leading dot; after stripping it, `ext` is the
        # bare extension and must be looked up as-is (slicing it again would
        # drop its first letter, e.g. "bed" -> "ed").
        ext = ext.lstrip('.')
        if ext in format_patterns:
            fname_fmt = ext

    # Fallback: regex detection
    with as_handle(infile, 'rU') as handle:
        for line in handle:
            if not line.strip():
                # Skip blank lines
                continue
            if line.startswith('track'):
                # NB: Could be UCSC BED or Ensembl GFF
                continue
            if fname_fmt and format_patterns[fname_fmt].match(line):
                return fname_fmt
            # Formats that (may) declare themselves in an initial '#' comment
            if (line.startswith('##gff-version')
                    or format_patterns['gff'].match(line)):
                return 'gff'
            if line.startswith(('##fileformat=VCF', '#CHROM\tPOS\tID')):
                return 'vcf'
            if line.startswith('#'):
                continue
            # Formats that need to be guessed solely by regex
            if format_patterns['text'].match(line):
                return 'text'
            if format_patterns['tab'].match(line):
                return 'tab'
            if line.startswith('@') or format_patterns['interval'].match(line):
                return 'interval'
            if format_patterns['refflat'].match(line):
                return 'refflat'
            if format_patterns['bed'].match(line):
                return 'bed'

            raise ValueError("File %r does not appear to be a recognized "
                             "format! (Any of: %s)\n"
                             "First non-blank line:\n%s" %
                             (fname, ', '.join(format_patterns.keys()), line))
Ejemplo n.º 43
0
def write(alignments, handle, format):
    """Write complete set of alignments to a file.

    Arguments:
     - alignments - A list (or iterator) of Alignment objects (ideally the
                   new MultipleSeqAlignment objects), or (if using Biopython
                   1.54 or later) a single alignment object.
     - handle    - File handle object to write to, or filename as string
                   (note older versions of Biopython only took a handle).
     - format    - lower case string describing the file format to write.

    You should close the handle after calling this function.

    Raises TypeError if format is not a string or (on the SeqIO path) an
    item is not an Alignment, and ValueError if format is empty, not lower
    case, read-only, or unknown.  The format is checked before the output is
    opened, so a bad format name does not truncate an existing file.

    Returns the number of alignments written (as an integer).
    """
    from Bio import SeqIO

    #Try and give helpful error messages:
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)

    #Reject formats we cannot write *before* opening the output, because
    #opening a filename in 'w' mode truncates it.
    if format not in _FormatToWriter and format not in SeqIO._FormatToWriter:
        if format in _FormatToIterator or format in SeqIO._FormatToIterator:
            raise ValueError("Reading format '%s' is supported, but not writing"
                             % format)
        raise ValueError("Unknown format '%s'" % format)

    if isinstance(alignments, Alignment):
        #This raised an exception in older versions of Biopython
        alignments = [alignments]

    with as_handle(handle, 'w') as fp:
        #Map the file format to a writer class
        if format in _FormatToWriter:
            writer_class = _FormatToWriter[format]
            count = writer_class(fp).write_file(alignments)
        else:
            #Only SeqIO can write this format (checked above).
            #Exploit the existing SeqIO parser to do the dirty work!
            #TODO - Can we make one call to SeqIO.write() and count the alignments?
            count = 0
            for alignment in alignments:
                if not isinstance(alignment, Alignment):
                    raise TypeError(
                        "Expect a list or iterator of Alignment objects.")
                SeqIO.write(alignment, fp, format)
                count += 1

    assert isinstance(count, int), "Internal error - the underlying %s " \
           "writer should have returned the alignment count, not %s" \
           % (format, repr(count))

    return count
Ejemplo n.º 44
0
def parse_bed(fname, coord_only, keep_strand):
    """Parse a BED file.

    A BED file has these columns:
        chromosome, start position, end position,
        [name, score, strand, other stuff...]

    Counting is from 0.

    Sets of regions are separated by "track" lines. This function stops
    iteration after encountering a track line other than the first one in the
    file.

    Yields one tuple per data line, depending on the flags:
     - coord_only and keep_strand: (chrom, start, end, strand)
     - coord_only only: (chrom, start, end)
     - keep_strand only: (chrom, start, end, name, strand)
     - neither: (chrom, start, end, name)

    A missing name defaults to '-' and a missing strand to '.' in the
    non-coord_only modes.  An empty file yields nothing.
    """
    if coord_only:
        if keep_strand:
            @report_bad_line
            def _parse_line(line):
                chrom, start, end, _name, _score, strand = line.split('\t', 6)[:6]
                return chrom, int(start), int(end), strand.rstrip()
        else:
            @report_bad_line
            def _parse_line(line):
                chrom, start, end = line.split('\t', 3)[:3]
                return chrom, int(start), int(end)
    elif keep_strand:
        @report_bad_line
        def _parse_line(line):
            fields = line.split('\t', 6)
            chrom, start, end = fields[:3]
            name = (fields[3].rstrip()
                    if len(fields) >= 4 else '-')
            strand = (fields[5].rstrip()
                      if len(fields) >= 6 else '.')
            return chrom, int(start), int(end), name, strand
    else:
        @report_bad_line
        def _parse_line(line):
            fields = line.split('\t', 4)
            chrom, start, end = fields[:3]
            name = (fields[3].rstrip()
                    if len(fields) >= 4 else '-')
            return chrom, int(start), int(end), name

    with as_handle(fname, 'rU') as handle:
        # Use a default so an empty file ends the generator cleanly; a
        # StopIteration escaping a generator body is a RuntimeError under
        # PEP 479.
        firstline = next(handle, None)
        if firstline is None:
            return
        if not firstline.startswith("track"):
            yield _parse_line(firstline)

        for line in handle:
            if line.startswith('track'):
                # A second track line ends this set of regions.  Plain
                # `return` is the correct way to finish a generator.
                return
            yield _parse_line(line)
Ejemplo n.º 45
0
 def iter(self):
     """
     Yield one instance of the desired read class for every sequence
     found in self.file_.
     """
     # FastqGeneralIterator is used because it hands back the quality
     # string unconverted (it makes no attempt to work out the numeric
     # quality values, which are not needed at this point).
     with as_handle(self.file_) as fp:
         for title, bases, qualityString in FastqGeneralIterator(fp):
             yield self.readClass(title, bases, qualityString)
Ejemplo n.º 46
0
def parse_tsv(infile, keep_header=False):
    """Parse a tabular data table into an iterable of lists.

    Rows are split on tabs.  The first line is treated as the header row and
    is included in the output only if `keep_header` is true.  Empty input
    yields nothing.
    """
    with as_handle(infile) as handle:
        lines = iter(handle)
        # Use a default so empty input ends the generator cleanly; a
        # StopIteration escaping a generator body is a RuntimeError under
        # PEP 479.
        header = next(lines, None)
        if header is None:
            return
        if keep_header:
            yield header.rstrip().split('\t')
        for line in lines:
            yield line.rstrip().split('\t')
Ejemplo n.º 47
0
 def records(self):
     """
     Yield the BLAST records produced by the BioPython NCBIXML.parse
     method. self.params is set from the first record before that
     record is yielded.
     """
     with as_handle(self._filename) as fp:
         for index, record in enumerate(NCBIXML.parse(fp)):
             if index == 0:
                 self.params = self._convertBlastParamsToDict(record)
             yield record
Ejemplo n.º 48
0
def parse(handle, format=None, **kwargs):
    """Turns a search output file into a generator that yields QueryResult
    objects.

    Arguments:
    handle -- Handle to the file, or the filename as a string.
    format -- Lower case string denoting one of the supported formats.
    kwargs -- Format-specific keyword arguments.

    This function is used to iterate over each query in a given search output
    file:

    >>> from Bio import SearchIO
    >>> qresults = SearchIO.parse('Blast/mirna.xml', 'blast-xml')
    >>> qresults
    <generator object ...>
    >>> for qresult in qresults:
    ...     print "Search %s has %i hits" % (qresult.id, len(qresult))
    ...
    Search 33211 has 100 hits
    Search 33212 has 44 hits
    Search 33213 has 95 hits

    Depending on the file format, `parse` may also accept additional keyword
    argument(s) that modifies the behavior of the format parser. Here is a
    simple example, where the keyword argument enables parsing of a commented
    BLAST tabular output file:

    >>> from Bio import SearchIO
    >>> for qresult in SearchIO.parse('Blast/mirna.tab', 'blast-tab', comments=True):
    ...     print "Search %s has %i hits" % (qresult.id, len(qresult))
    ...
    Search 33211 has 100 hits
    Search 33212 has 44 hits
    Search 33213 has 95 hits

    """
    # look up the iterator for this format (this also does the error checking)
    make_iterator = get_processor(format, _ITERATOR_MAP)

    # HACK: force BLAST XML decoding to use utf-8
    needs_utf8 = format == 'blast-xml' and sys.version_info[0] > 2
    open_options = {'encoding': 'utf-8'} if needs_utf8 else {}

    # open the source and hand back each query result as it is parsed
    with as_handle(handle, 'rU', **open_options) as source_file:
        for qresult in make_iterator(source_file, **kwargs):
            yield qresult
Ejemplo n.º 49
0
def read_text(infile):
    """Read the text coordinate format: "chr:start-end", one per line.

    Or sometimes: "chrom:start-end gene" or "chrom:start-end REF>ALT"

    Coordinate indexing is assumed to be from 1.

    Returns a DataFrame with the columns chromosome, start, end and gene;
    empty gene values are replaced with '-'.
    """
    column_names = ["chromosome", "start", "end", "gene"]
    parse_line = report_bad_line(from_label)
    with as_handle(infile, 'rU') as handle:
        rows = list(map(parse_line, handle))
    table = pd.DataFrame.from_records(rows, columns=column_names)
    table['gene'] = table['gene'].replace('', '-')
    return table
Ejemplo n.º 50
0
    def get_structure(self, structure_id, filename):
        """Return the structure.

        Arguments:
         - structure_id - string, the id that will be used for the structure
         - filename - name of the mmCIF file OR an open filehandle

        When ``self.QUIET`` is true, PDBConstructionWarning is ignored while
        the structure is being built.
        """
        with warnings.catch_warnings():
            if self.QUIET:
                warnings.filterwarnings("ignore", category=PDBConstructionWarning)
            # Build inside the catch_warnings block: the filter installed
            # above is discarded as soon as the block exits, so parsing
            # outside it would never be silenced.
            with as_handle(filename) as handle:
                self._build_structure(structure_id, handle)

        return self._structure_builder.get_structure()
Ejemplo n.º 51
0
def diamondTabularFormatToDicts(filename, fieldNames=None):
    """
    Read DIAMOND tabular (--outfmt 6) output and convert lines to dictionaries.

    Note that this is a generator: no validation is done and no input is
    read until the first item is requested.

    @param filename: Either a C{str} file name or an open file pointer.
    @param fieldNames: A C{list} or C{tuple} of C{str} DIAMOND field names.
        Run 'diamond -help' to see the full list. If C{None} (or empty), the
        default set of fields in C{FIELDS} will be used, as compatible with
        convert-diamond-to-sam.py
    @raise ValueError: If a line of C{filename} does not have the expected
        number of TAB-separated fields (i.e., len(fieldNames)). Or if the
        resulting list of field names is empty or contains duplicates.
    @return: A generator that yields C{dict}s with keys that are the DIAMOND
        field names and values as converted by DIAMOND_FIELD_CONVERTER.
    """
    fieldNames = fieldNames or FIELDS.split()
    nFields = len(fieldNames)
    if not nFields:
        raise ValueError('fieldNames cannot be empty.')

    c = Counter(fieldNames)
    if c.most_common(1)[0][1] > 1:
        raise ValueError(
            'fieldNames contains duplicated names: %s.' %
            (', '.join(sorted(x[0] for x in c.most_common() if x[1] > 1))))

    def identity(x):
        return x

    # Look up the converter of each field once, instead of once per value
    # on every line. Fields without a registered converter are passed
    # through unchanged.
    converters = [DIAMOND_FIELD_CONVERTER.get(fieldName, identity)
                  for fieldName in fieldNames]

    with as_handle(filename) as fp:
        for count, line in enumerate(fp, start=1):
            # Only remove an actual newline. Blindly dropping the last
            # character would corrupt the final field of a last line that
            # is not newline-terminated.
            if line.endswith('\n'):
                line = line[:-1]
            values = line.split('\t')
            if len(values) != nFields:
                raise ValueError(
                    'Line %d of %s had %d field values (expected %d). '
                    'To provide input for this function, DIAMOND must be '
                    'called with "--outfmt 6 %s" (without the quotes). '
                    'The offending input line was %r.' %
                    (count,
                     (filename if isinstance(filename, six.string_types)
                      else 'input'),
                     len(values), nFields, FIELDS, line))
            result = {}
            for fieldName, convert, value in zip(fieldNames, converters,
                                                 values):
                result[fieldName] = convert(value)
            yield result
Ejemplo n.º 52
0
    def __init__(self, filename):
        """Parse a mmCIF file and store its contents in this dictionary.

        Arguments:
         - filename - name of the mmCIF file OR an open filehandle

        Plain "key value" token pairs are stored as ``self[key] = value``.
        Inside a ``loop_`` construct every key maps to a list, and the values
        that follow are distributed over those lists in column order.
        If the input yields no tokens at all the dictionary is left empty.
        """
        # Character classes kept on the instance
        # (presumably consulted by self._tokenize -- confirm there).
        self.quote_chars = ['\'', '\"']
        self.whitespace_chars = [' ', '\t']
        with as_handle(filename) as handle:
            loop_flag = False  # True while inside a loop_ construct
            key = None  # pending key of a plain key/value pair
            tokens = self._tokenize(handle)
            try:
                token = next(tokens)
            except StopIteration:
                # No tokens at all: nothing to store
                return  # for Python 3.7 and PEP 479
            # The first token is split after its first five characters:
            # the prefix becomes the key and the remainder the value
            # (presumably a "data_<name>" header -- confirm).
            self[token[0:5]] = token[5:]
            i = 0  # number of loop values consumed so far
            n = 0  # number of keys (columns) in the current loop
            for token in tokens:
                if token.lower() == "loop_":
                    # Start a new loop: reset the column bookkeeping
                    loop_flag = True
                    keys = []
                    i = 0
                    n = 0
                    continue
                elif loop_flag:
                    # The second condition checks we are in the first column
                    # Some mmCIF files (e.g. 4q9r) have values in later columns
                    # starting with an underscore and we don't want to read
                    # these as keys
                    if token.startswith("_") and (n == 0 or i % n == 0):
                        if i > 0:
                            # Values were already read, so this key ends the
                            # loop; fall through so that the token is handled
                            # as a plain key below.
                            loop_flag = False
                        else:
                            # Still in the loop header: register a new column
                            self[token] = []
                            keys.append(token)
                            n += 1
                            continue
                    else:
                        # Loop value: append it to the column it belongs to
                        self[keys[i % n]].append(token)
                        i += 1
                        continue
                # Outside a loop, tokens alternate between key and value
                if key is None:
                    key = token
                else:
                    self[key] = token
                    key = None
Ejemplo n.º 53
0
def write(qresults, handle, format=None, **kwargs):
    """Write QueryResult objects to a file in the given format.

    Arguments:
    qresults -- A single QueryResult object, or an iterable returning
                QueryResult objects.
    handle -- Handle to the file, or the filename as a string.
    format -- Lower case string denoting one of the supported formats.
    kwargs -- Format-specific keyword arguments, passed on to the writer.

    Returns a tuple of four values: the number of QueryResult, Hit, HSP,
    and HSPFragment objects written to the output file. The result is the
    same whether a single QueryResult or an iterable of them is given.

    from Bio import SearchIO
    qresults = SearchIO.parse('Blast/mirna.xml', 'blast-xml')
    SearchIO.write(qresults, 'results.tab', 'blast-tab')
    <stdout> (3, 239, 277, 277)

    Format-specific keyword arguments may be used to adjust the output.
    For instance, this writes a BLAT PSL output file with a header:

    from Bio import SearchIO
    qresults = SearchIO.parse('Blat/psl_34_001.psl', 'blat-psl')
    SearchIO.write(qresults, 'results.tab', 'blat-psl', header=True)
    <stdout> (2, 13, 22, 26)

    """
    # a lone QueryResult is wrapped so that both cases become an iterator
    if isinstance(qresults, QueryResult):
        qresults = [qresults]
    result_iter = iter(qresults)

    # look up the writer for this format (this also validates the format)
    writer_class = get_processor(format, _WRITER_MAP)

    with as_handle(handle, 'w') as out_handle:
        counts = writer_class(out_handle, **kwargs).write_file(result_iter)
        # the writer reports how many objects of each kind it wrote
        n_qresults, n_hits, n_hsps, n_frags = counts

    return n_qresults, n_hits, n_hsps, n_frags
Ejemplo n.º 54
0
def parse_interval_list(fname, coord_only, keep_strand):
    """Parse a Picard-compatible interval list.

    Expected tabular columns:
        chromosome, start position, end position, strand, region name

    Counting is from 1 in the file; the yielded start is shifted down by
    one, the end is kept as-is.

    Arguments:
     - fname - input passed to `as_handle` and read in 'rU' mode
     - coord_only - if true, do not include the region name in the output
     - keep_strand - if true, include the strand in the output

    Lines starting with '@' (the SAM-style header) are skipped. This is a
    generator yielding one tuple per remaining line:

     - coord_only, keep_strand: (chrom, start, end, strand)
     - coord_only only:         (chrom, start, end)
     - keep_strand only:        (chrom, start, end, name, strand)
     - neither:                 (chrom, start, end, name)

    The name is taken from the last column, or is '-' if the line has no
    column beyond those that are required.
    """
    if coord_only:
        if keep_strand:
            @report_bad_line
            def _parse_line(line):
                chrom, start, end, strand = line.split('\t')[:4]
                return chrom, int(start) - 1, int(end), strand.rstrip()
        else:
            @report_bad_line
            def _parse_line(line):
                chrom, start, end = line.split('\t')[:3]
                return chrom, int(start) - 1, int(end)
    elif keep_strand:
        @report_bad_line
        def _parse_line(line):
            fields = line.split('\t')
            chrom, start, end, strand = fields[:4]
            if len(fields) > 4:
                name = fields[-1].rstrip()
            else:
                name = '-'
            # Strip the strand too: on a 4-column line it is the last
            # field and still carries the line ending.
            return chrom, int(start) - 1, int(end), name, strand.rstrip()
    else:
        @report_bad_line
        def _parse_line(line):
            fields = line.split('\t')
            chrom, start, end = fields[:3]
            if len(fields) > 3:
                name = fields[-1].rstrip()
            else:
                name = '-'
            return chrom, int(start) - 1, int(end), name

    with as_handle(fname, 'rU') as handle:
        for line in handle:
            if line.startswith('@'):
                # Skip the SAM header
                continue
            yield _parse_line(line)
Ejemplo n.º 55
0
 def read(cls, infile, sample_id=None):
     """Build an instance from a table of coverage data.

     Arguments:
      - infile - a handle or filename holding the table
      - sample_id - optional sample name; by default it is derived from
        `infile` with `core.fbase` if that is a string, and is
        '<unknown>' otherwise

     If the parsed input runs out before a first data row is found, an
     instance without any rows is returned.
     """
     if sample_id is None:
         sample_id = (core.fbase(infile) if isinstance(infile, basestring)
                      else '<unknown>')
     with as_handle(infile) as handle:
         parsed = _parse_lines(handle)
         try:
             # The first parsed item is the extra info, the rest are rows
             xtra = next(parsed)
             row_data = [next(parsed)] + list(parsed)
         except StopIteration:
             # Empty input must not crash: return an empty instance
             return cls(sample_id, [], [], [], [], [])
     return cls.from_rows(sample_id, row_data, xtra)
Ejemplo n.º 56
0
def group_bed_tracks(bedfile):
    """Group the lines of a BED file by track.

    Yields (track_name, list_of_lines) pairs, much like itertools.groupby.

    Lines found before the first "track" line are reported under the name
    'DEFAULT'. A group without any lines is not yielded when a new track
    starts, but the last group of the file is always yielded, even if it
    is empty.
    """
    # ENH - make this memory-efficient w/ generators or something
    with as_handle(bedfile, 'r') as handle:
        track_name, pending = 'DEFAULT', []
        for line in handle:
            if not line.startswith('track'):
                pending.append(line)
                continue
            # A new track begins: hand out what was collected so far
            if pending:
                yield track_name, pending
                pending = []
            track_name = parse_bed_track(line)
        yield track_name, pending
Ejemplo n.º 57
0
 def read(cls, infile, sample_id=None):
     """Build an instance from a table of coverage data.

     Arguments:
      - infile - a handle or filename holding the tab-separated table
      - sample_id - optional sample name; by default it is derived from
        `infile` with `core.fbase` if that is a string, and is
        '<unknown>' otherwise

     The first line is the header; the remaining lines are loaded with
     numpy.loadtxt. For input without any lines, ``cls(sample_id)`` is
     returned.
     """
     if sample_id is None:
         sample_id = (core.fbase(infile) if isinstance(infile, basestring)
                      else '<unknown>')
     with as_handle(infile) as handle:
         try:
             header = next(handle)
         except StopIteration:
             # Empty input must not crash
             return cls(sample_id)
         # The header determines the extra info of the new instance
         cnarr = cls(sample_id, _sniff_xtra(header))
         loaded = numpy.loadtxt(handle, delimiter="\t",
                                dtype=cnarr.data.dtype, ndmin=1)
     cnarr.data = loaded
     return cnarr
Ejemplo n.º 58
0
Archivo: bedio.py Proyecto: etal/cnvkit
def read_bed(infile):
    """Read a UCSC Browser Extensible Data (BED) file into a DataFrame.

    A BED file has these columns:
        chromosome, start position, end position, [gene, strand, other stuff...]

    Coordinate indexing is from 0.

    Only the first set of regions is read: sets are separated by "track"
    lines, and reading stops at any track line other than one that opens
    the file. A single leading "browser " line is ignored.

    Returns a DataFrame with the columns "chromosome", "start", "end",
    "gene" and "strand". A missing gene is reported as '-', a missing
    strand as '.'.
    """
    # ENH: just pd.read_csv, skip 'track'
    @report_bad_line
    def _parse_line(line):
        cols = line.split('\t', 6)
        n_cols = len(cols)
        chrom, start, end = cols[:3]
        # Gene is the 4th column and strand the 6th, when present
        gene = cols[3].rstrip() if n_cols > 3 else '-'
        strand = cols[5].rstrip() if n_cols > 5 else '.'
        return chrom, int(start), int(end), gene, strand

    def track2track(handle):
        # Yield the lines of the first track only
        firstline = next(handle, None)
        if firstline is not None and firstline.startswith("browser "):
            # UCSC Genome Browser feature -- ignore it
            firstline = next(handle, None)
        if firstline is None:
            # Ran out of lines before any data was seen
            return
        if not firstline.startswith("track"):
            yield firstline
        for line in handle:
            if line.startswith("track"):
                break
            yield line

    column_names = ["chromosome", "start", "end", "gene", "strand"]
    with as_handle(infile, 'rU') as handle:
        rows = map(_parse_line, track2track(handle))
        return pd.DataFrame.from_records(rows, columns=column_names)
Ejemplo n.º 59
0
    def __init__(self, filename):
        """Parse a mmCIF file and store its contents in this dictionary.

        Arguments:
         - filename - name of the mmCIF file OR an open filehandle

        Plain "key value" token pairs are stored as ``self[key] = value``.
        Inside a ``loop_`` construct every key maps to a list, and the values
        that follow are distributed over those lists in column order.
        """
        with as_handle(filename) as handle:
            loop_flag = False  # True while inside a loop_ construct
            key = None  # pending key of a plain key/value pair
            tokens = self._tokenize(handle)
            # NOTE(review): next() raises StopIteration here if the
            # tokenizer yields nothing (e.g. empty input).
            token = next(tokens)
            # The first token is split after its first five characters:
            # the prefix becomes the key and the remainder the value
            # (presumably a "data_<name>" header -- confirm).
            self[token[0:5]] = token[5:]
            i = 0  # number of loop values consumed so far
            n = 0  # number of keys (columns) in the current loop
            for token in tokens:
                if token == "loop_":
                    # Start a new loop: reset the column bookkeeping
                    loop_flag = True
                    keys = []
                    i = 0
                    n = 0
                    continue
                elif loop_flag:
                    if token.startswith("_"):
                        if i > 0:
                            # Values were already read, so this key ends the
                            # loop; fall through so that the token is handled
                            # as a plain key below.
                            loop_flag = False
                        else:
                            # Still in the loop header: register a new column
                            self[token] = []
                            keys.append(token)
                            n += 1
                            continue
                    else:
                        # Loop value: append it to the column it belongs to
                        self[keys[i % n]].append(token)
                        i += 1
                        continue
                # Outside a loop, tokens alternate between key and value
                if key is None:
                    key = token
                else:
                    self[key] = token
                    key = None