def convert(in_file, in_format, out_file, out_format, alphabet=None):
    """Convert between two alignment files, returns number of alignments.

     - in_file - an input handle or filename
     - in_format - input file format, lower case string
     - out_file - an output handle or filename
     - out_format - output file format, lower case string
     - alphabet - optional alphabet to assume

    **NOTE** - If you provide an output filename, it will be opened which will
    overwrite any existing file without warning. This may happen even if the
    conversion is aborted (e.g. an invalid out_format name is given).
    """
    # TODO - Add optimised versions of important conversions
    # For now just off-load the work to SeqIO parse/write
    with as_handle(in_file, 'rU') as in_handle:
        # Don't open the output file until we've checked the input is OK:
        alignments = parse(in_handle, in_format, None, alphabet)

        # This will check the arguments and issue error messages,
        # after we have opened the file which is a shame.
        with as_handle(out_file, 'w') as out_handle:
            count = write(alignments, out_handle, out_format)

    return count

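# Every function in this collection leans on as_handle to accept either an
# open file handle or a filename. A minimal sketch of that contract follows;
# this is an assumption-level sketch, since the real Bio.File.as_handle also
# accepts path-like objects and forwards extra keyword arguments to open().
from contextlib import contextmanager


@contextmanager
def as_handle(handleish, mode='r', **kwargs):
    """Yield a file-like object, opening handleish if it is a filename."""
    if isinstance(handleish, str):
        # Given a path: open it here and close it when the block exits.
        with open(handleish, mode, **kwargs) as fp:
            yield fp
    else:
        # Given a handle: pass it through unchanged; the caller keeps
        # ownership and is responsible for closing it.
        yield handleish
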
def convert(in_file, in_format, out_file, out_format, alphabet=None):
    """Convert between two sequence file formats, return number of records.

    Arguments:
     - in_file - an input handle or filename
     - in_format - input file format, lower case string
     - out_file - an output handle or filename
     - out_format - output file format, lower case string
     - alphabet - optional alphabet to assume

    **NOTE** - If you provide an output filename, it will be opened which will
    overwrite any existing file without warning. This may happen even if the
    conversion is aborted (e.g. an invalid out_format name is given).

    For example, going from a filename to a handle:

    >>> from Bio import SeqIO
    >>> try:
    ...     from StringIO import StringIO  # Python 2
    ... except ImportError:
    ...     from io import StringIO  # Python 3
    ...
    >>> handle = StringIO("")
    >>> SeqIO.convert("Quality/example.fastq", "fastq", handle, "fasta")
    3
    >>> print(handle.getvalue())
    >EAS54_6_R1_2_1_413_324
    CCCTTCTTGTCTTCAGCGTTTCTCC
    >EAS54_6_R1_2_1_540_792
    TTGGCAGGCCAAGGCCGATGGATCA
    >EAS54_6_R1_2_1_443_348
    GTTGCTTCTGGCGTGGGTGGGGGGG
    <BLANKLINE>

    """
    # Hack for SFF, will need to make this more general in future
    if in_format in _BinaryFormats:
        in_mode = 'rb'
    else:
        in_mode = 'rU'

    # Don't open the output file until we've checked the input is OK?
    if out_format in ["sff", "sff_trim"]:
        out_mode = 'wb'
    else:
        out_mode = 'w'

    # This will check the arguments and issue error messages,
    # after we have opened the file which is a shame.
    from ._convert import _handle_convert  # Lazy import

    with as_handle(in_file, in_mode) as in_handle:
        with as_handle(out_file, out_mode) as out_handle:
            count = _handle_convert(in_handle, in_format,
                                    out_handle, out_format, alphabet)

    return count

def _handle_convert(in_file, in_format, out_file, out_format, alphabet=None):
    """Convert handles from one format to another (PRIVATE)."""
    try:
        f = _converter[(in_format, out_format)]
    except KeyError:
        f = None
    if f:
        with as_handle(in_file, "r") as in_handle:
            with as_handle(out_file, "w") as out_handle:
                return f(in_handle, out_handle, alphabet)
    else:
        records = SeqIO.parse(in_file, in_format, alphabet)
        return SeqIO.write(records, out_file, out_format)

def parse(handle, file_format):
    """Iterate over a gene ontology file.

    Parameters:
     - handle - File handle object to read from, or filename as a string,
     - file_format - lower case string describing the file format to read.

    Formats:
     - obo
     - tsv

    You should close the handle after calling this function.
    """
    if not isinstance(file_format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not file_format:
        raise ValueError("Format required (lower case string)")
    if file_format != file_format.lower():
        raise ValueError("Format string '%s' should be lower case"
                         % file_format)

    with as_handle(handle, 'rU') as fp:
        if file_format in _FormatToIterator:
            iterator_generator = _FormatToIterator[file_format]
            it = iterator_generator(fp)
            for el in it:
                yield el
        else:
            raise ValueError("Unknown format '%s'" % file_format)

def read(handle, file_format, **params):
    """Read file in given format.

    Parameters:
     - handle - File handle object to read from, or filename as a string,
     - file_format - lower case string describing the file format to read.
       Formats:
        - nexo
        - obo
        - etsv
        - gaf
     - params - additional parameters

    You should close the handle after calling this function.
    """
    if not isinstance(file_format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not file_format:
        raise ValueError("Format required (lower case string)")
    if file_format != file_format.lower():
        raise ValueError("Format string '%s' should be lower case"
                         % file_format)

    with as_handle(handle, 'rU') as fp:
        if file_format in _FormatToReader:
            reader_generator = _FormatToReader[file_format]
            return reader_generator(fp, **params).read()
        else:
            raise ValueError("Unknown format '%s'" % file_format)

def write(data, handle, file_format, **params):
    """Write the given data to a file.

    Parameters:
     - data - data to write to a file,
     - handle - File handle object to write to, or filename as string
       (note older versions of Biopython only took a handle),
     - file_format - lower case string describing the file format to write.
       Formats:
        - png - writes picture of graph to png format (this feature
          needs pygraphviz to be installed)
        - etsv
     - params - additional parameters

    You should close the handle after calling this function.
    """
    if not isinstance(file_format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not file_format:
        raise ValueError("Format required (lower case string)")

    with as_handle(handle, 'w') as fp:
        # Map the file format to a writer class
        if file_format in _FormatToWriter:
            writer_class = _FormatToWriter[file_format]
            writer_class(fp, **params).write(data)
        else:
            raise ValueError("Unknown format '%s'" % file_format)

def pretty_print(enrichment, graph, handle, file_format, **params):
    """Print results returned by enrichment finder in a specified format.

    Parameters:
     - enrichment - result from EnrichmentFinder,
     - graph - OntologyGraph containing the enriched nodes,
     - handle - File handle object to write to, or filename as a string,
     - file_format - lower case string describing the file format to write.
       Formats:
        - gml
        - png
        - txt
        - html
     - params - additional parameters

    You should close the handle after calling this function.
    """
    if not isinstance(file_format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not file_format:
        raise ValueError("Format required (lower case string)")

    with as_handle(handle, 'w') as fp:
        # Map the file format to a printer class
        if file_format in _FormatToPrinter:
            writer_class = _FormatToPrinter[file_format]
            writer = writer_class(fp, **params)
            writer.pretty_print(enrichment, graph)
        else:
            raise ValueError("Unknown format '%s'" % file_format)

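# A hedged usage sketch tying the four ontology I/O helpers above together.
# The 'obo' and 'png' format names come from the docstrings above; the file
# names are hypothetical.
terms = list(parse("gene_ontology.obo", "obo"))  # lazily iterate over terms
graph = read("gene_ontology.obo", "obo")         # read the whole ontology at once
write(graph, "ontology.png", "png")              # requires pygraphviz
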
def restore(cls, fpOrFilePrefix):
    """
    Load a database from a file.

    @param fpOrFilePrefix: A file pointer, or the C{str} prefix of a file
        name, or C{None}. If a C{str}, self.SAVE_SUFFIX is appended to get
        the full file name.
    @return: An instance of L{Database}.
    @raises ValueError: If a now non-existent connector class name is found
        in the saved database file.
    """
    if isinstance(fpOrFilePrefix, str):
        saveFile = fpOrFilePrefix + cls.SAVE_SUFFIX
        filePrefix = fpOrFilePrefix
    else:
        saveFile = fpOrFilePrefix
        filePrefix = None

    with as_handle(saveFile) as fp:
        dbParams = DatabaseParameters.restore(fp)
        state = loads(fp.readline()[:-1])

    connectorClassName = state['_connectorClassName']
    if connectorClassName == SimpleConnector.__name__:
        connector = SimpleConnector.restore(fpOrFilePrefix)
    elif six.PY3 and connectorClassName == WampServerConnector.__name__:
        connector = WampServerConnector.restore(fpOrFilePrefix)
    else:
        raise ValueError('Unknown backend connector class %r.' %
                         connectorClassName)

    new = cls(dbParams, connector, filePrefix=filePrefix)
    return new

def parse_text_coords(fname, coord_only, _keep_strand):
    """Parse text coordinates: chrom:start-end

    Text coordinates are assumed to be counting from 1.
    """
    if coord_only:
        @report_bad_line
        def _parse_line(line):
            chrom, _rest = line.rstrip().split(':', 1)
            start, end = _rest.split('-')
            if ':' in end:
                end = end.split(':', 1)[0]
            return chrom, int(start) - 1, int(end)
    else:
        @report_bad_line
        def _parse_line(line):
            fields = line.split(':')
            if len(fields) == 3:
                chrom, start_end, name = fields
            elif len(fields) == 2:
                chrom, start_end = fields
                name = '-'
            else:
                raise ValueError
            start, end = start_end.split('-')
            return chrom, int(start) - 1, int(end), name.rstrip()

    with as_handle(fname, 'rU') as handle:
        for line in handle:
            yield _parse_line(line)

def _read_text_integers(handleish, sep=None, header=False):
    """Read and separate text and integers, where integers can be found
    in the form of ranges, e.g. "4-10".

    :param handleish: a file handle or filename to read from
    :return: [[text,...],...], [[number,...],...]
    """
    texts, numbers = [], []
    with as_handle(handleish) as fh:
        if header:
            next(fh)
        for line in fh:
            line = line.strip().split(sep)
            texts.append([])
            numbers.append([])
            for cell in line:
                try:
                    numbers[-1].append(int(cell))
                except ValueError:
                    pass
                else:
                    continue
                # Match a range, i.e. numbers separated by one or two
                # non-alphabet characters.
                _range = st.re_range(cell)
                if _range is not None:
                    numbers[-1].extend(_range)
                else:
                    texts[-1].append(cell)
    return texts, numbers

def write(alignments, handle, format):
    """Write complete set of alignments to a file.

    Arguments:
     - alignments - A list (or iterator) of MultipleSeqAlignment objects,
       or a single alignment object.
     - handle - File handle object to write to, or filename as string
       (note older versions of Biopython only took a handle).
     - format - lower case string describing the file format to write.

    You should close the handle after calling this function.

    Returns the number of alignments written (as an integer).
    """
    from Bio import SeqIO

    # Try and give helpful error messages:
    if not isinstance(format, str):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)

    if isinstance(alignments, MultipleSeqAlignment):
        # This raised an exception in older versions of Biopython
        alignments = [alignments]

    with as_handle(handle, "w") as fp:
        # Map the file format to a writer class
        if format in _FormatToWriter:
            writer_class = _FormatToWriter[format]
            count = writer_class(fp).write_file(alignments)
        elif format in SeqIO._FormatToWriter:
            # Exploit the existing SeqIO parser to do the dirty work!
            # TODO - Can we make one call to SeqIO.write() and count the alignments?
            count = 0
            for alignment in alignments:
                if not isinstance(alignment, MultipleSeqAlignment):
                    raise TypeError(
                        "Expect a list or iterator of MultipleSeqAlignment "
                        "objects, got: %r" % alignment
                    )
                SeqIO.write(alignment, fp, format)
                count += 1
        elif format in _FormatToIterator or format in SeqIO._FormatToIterator:
            raise ValueError(
                "Reading format '%s' is supported, but not writing" % format
            )
        else:
            raise ValueError("Unknown format '%s'" % format)

    if not isinstance(count, int):
        raise RuntimeError(
            "Internal error - the underlying %s "
            "writer should have returned the alignment count, not %s"
            % (format, repr(count))
        )

    return count

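# A hedged usage sketch for the alignment writer above, built from standard
# Biopython objects; it assumes 'clustal' is registered in _FormatToWriter.
from io import StringIO

from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

aln = MultipleSeqAlignment([
    SeqRecord(Seq("ACTGCTAGCTAG"), id="seq1"),
    SeqRecord(Seq("ACT-CTAGCTAG"), id="seq2"),
])
out = StringIO()
count = write(aln, out, "clustal")  # a single alignment is also accepted
assert count == 1
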
def process_output(fg_aln, bg_aln, hits, alpha, output, pattern, pdb_data):
    """Generate the output files from the processed data."""
    with as_handle(output, 'w+') as outfile:
        write_pvalues(hits, outfile, alpha)
    tophits = top_hits(hits, alpha)
    if pattern:
        with open(pattern, 'w+') as ptnfile:
            write_mcbpps(tophits, ptnfile)
        # XXX hack: don't make pairlogo in single mode
        if bg_aln:
            pairlogo.make_pairlogos(fg_aln, bg_aln, tophits,
                                    pattern.rsplit('.', 1)[0], 10)
    if pdb_data:
        patterns = [t[0] for t in tophits]
        if len(pdb_data) == 1:
            pdb_fname, pdb_rec, pdb_resnums, pdb_inserts = pdb_data[0]
            script = pmlscript.build_single(pdb_resnums, pdb_inserts,
                                            patterns, pdb_fname,
                                            pdb_rec.annotations['chain'])
            pml_fname = pdb_fname + ".pml"
        else:
            pdb_fnames, pdb_recs, pdb_resnumses, pdb_insertses = zip(*pdb_data)
            pml_fname = pdb_fnames[0] + "-etc.pml"
            # TODO multi-PDB mode -- `script` is never built in this branch,
            # so fail loudly here instead of hitting a NameError below.
            raise NotImplementedError("multi-PDB mode is not yet implemented")
        with open(pml_fname, 'w+') as pmlfile:
            pmlfile.write(script)
        logging.info("Wrote %s", pml_fname)

def SnapGeneIterator(handle):
    """Parse a SnapGene file and return a SeqRecord object.

    Note that a SnapGene file can only contain one sequence, so this
    iterator will always return a single record.
    """
    record = SeqRecord(None)
    n = 0

    # check if file is empty
    empty = True

    with as_handle(handle, "rb") as handle:
        for n, (type, length, data) in enumerate(_PacketIterator(handle)):
            empty = False
            if n == 0 and type != 0x09:
                raise ValueError(
                    "The file does not start with a SnapGene cookie packet")
            if type in _packet_handlers:
                _packet_handlers[type](length, data, record)

    if empty:
        raise ValueError("Empty file.")

    if not record.seq:
        raise ValueError("No DNA packet in file")

    yield record

def write_PDB(entity, file, pdbid=None, chainid=None):
    """Write PDB file with HEADER and TITLE."""
    with as_handle(file, 'w') as fp:
        try:
            if 'S' == entity.level:
                if not pdbid:
                    pdbid = entity.header.get('idcode', None)
                hdr = entity.header.get('head', None)
                dd = entity.header.get('deposition_date', None)
                if hdr:
                    fp.write(('HEADER    {:40}{:8}   {:4}\n'
                              ).format(hdr.upper(), (dd or ''), (pdbid or '')))
                nam = entity.header.get('name', None)
                if nam:
                    fp.write('TITLE     ' + nam.upper() + '\n')
                io = PDBIO()
                io.set_structure(entity)
                io.save(fp)
            else:
                raise PDBException("level not 'S': " + str(entity.level))
        except KeyError:
            raise Exception(
                "write_PDB: argument is not a Biopython PDB Entity "
                + str(entity))

def FastaTwoLineIterator(handle, alphabet=single_letter_alphabet):
    """Iterate over two-line Fasta records (as SeqRecord objects).

    Arguments:
     - handle - input file
     - alphabet - optional alphabet

    This uses a strict interpretation of the FASTA format as requiring
    exactly two lines per record (no line wrapping).

    Only the default title to ID/name/description parsing offered
    by the relaxed FASTA parser is offered.
    """
    with as_handle(handle) as handle:
        for title, sequence in FastaTwoLineParser(handle):
            try:
                first_word = title.split(None, 1)[0]
            except IndexError:
                assert not title, repr(title)
                # Should we use SeqRecord default for no ID?
                first_word = ""
            yield SeqRecord(
                Seq(sequence, alphabet),
                id=first_word,
                name=first_word,
                description=title,
            )

def get_structure(self, id, file):
    """Return the structure.

    Arguments:
     o id - string, the id that will be used for the structure
     o file - name of the PDB file OR an open filehandle
    """
    if self.QUIET:
        warning_list = warnings.filters[:]
        warnings.filterwarnings("ignore", category=PDBConstructionWarning)

    self.header = None
    self.trailer = None
    # Make a StructureBuilder instance (pass id of structure as parameter)
    self.structure_builder.init_structure(id)

    with as_handle(file) as handle:
        self._parse(handle.readlines())

    self.structure_builder.set_header(self.header)
    # Return the Structure instance
    structure = self.structure_builder.get_structure()

    if self.QUIET:
        warnings.filters = warning_list

    return structure

def restore(cls, fpOrFilePrefix):
    """
    Restore state from a file.

    @param fpOrFilePrefix: A file pointer or the C{str} prefix of a file
        name. If a C{str}, self.SAVE_SUFFIX is appended to get the full
        file name.
    @return: An instance of L{WampServerConnector}.
    @raises ValueError: If valid JSON cannot be loaded from C{fp}.
    """
    if isinstance(fpOrFilePrefix, str):
        saveFile = fpOrFilePrefix + cls.SAVE_SUFFIX
        filePrefix = fpOrFilePrefix
    else:
        saveFile = fpOrFilePrefix
        filePrefix = None

    with as_handle(saveFile) as fp:
        dbParams = DatabaseParameters.restore(fp)
        state = loads(fp.readline()[:-1])

    disconnectedBackends = {}
    for name, backendInfo in state['disconnectedBackends'].items():
        disconnectedBackends[name] = {
            'checksum': Checksum(backendInfo['checksum']),
            'subjectCount': backendInfo['subjectCount'],
        }

    return cls(dbParams, _id=state['id'],
               checksum=Checksum(state['checksum']),
               disconnectedBackends=disconnectedBackends,
               filePrefix=filePrefix)

def PhdIterator(handle):
    """Return SeqRecord objects from a PHD file.

    This uses the Bio.Sequencing.Phd module to do the hard work.
    """
    with as_handle(handle, "rU") as handle:
        phd_records = Phd.parse(handle)
        for phd_record in phd_records:
            # Convert the PHD record into a SeqRecord...
            # The "filename" can contain spaces, e.g. 'HWI-EAS94_4_1_1_602_99 1'
            # from unit test example file phd_solexa.
            # This will cause problems if used as the record identifier
            # (e.g. output for FASTQ format).
            name = phd_record.file_name.split(None, 1)[0]
            seq_record = SeqRecord(phd_record.seq,
                                   id=name, name=name,
                                   description=phd_record.file_name)
            # Just re-use the comments dictionary as the SeqRecord's annotations
            seq_record.annotations = phd_record.comments
            # And store the qualities and peak locations as per-letter-annotation
            seq_record.letter_annotations["phred_quality"] = [
                int(site[1]) for site in phd_record.sites
            ]
            try:
                seq_record.letter_annotations["peak_location"] = [
                    int(site[2]) for site in phd_record.sites
                ]
            except IndexError:
                # peak locations are not always there according to
                # David Gordon (the Consed author)
                pass
            yield seq_record

def read_bed(infile):
    """UCSC Browser Extensible Data (BED) format.

    A BED file has these columns:
        chromosome, start position, end position, [gene, strand, other stuff...]

    Coordinate indexing is from 0.

    Sets of regions are separated by "track" lines. This function stops reading
    after encountering a track line other than the first one in the file.
    """
    # ENH: just pd.read_table, skip 'track'
    @report_bad_line
    def _parse_line(line):
        fields = line.split('\t', 6)
        chrom, start, end = fields[:3]
        gene = (fields[3].rstrip()
                if len(fields) >= 4 else '-')
        strand = (fields[5].rstrip()
                  if len(fields) >= 6 else '.')
        return chrom, int(start), int(end), gene, strand

    def track2track(handle):
        firstline = next(handle)
        if not firstline.startswith("track"):
            yield firstline
        for line in handle:
            if line.startswith('track'):
                # PEP 479: raising StopIteration inside a generator becomes
                # a RuntimeError on Python 3.7+, so return instead.
                return
            yield line

    with as_handle(infile, 'rU') as handle:
        rows = map(_parse_line, track2track(handle))
        return pd.DataFrame.from_records(
            rows, columns=["chromosome", "start", "end", "gene", "strand"])

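# A hedged usage sketch; 'targets.bed' is a hypothetical BED file. The column
# names match the DataFrame built by read_bed above.
regions = read_bed("targets.bed")
print(regions[["chromosome", "start", "end"]].head())
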
def read_vcf_simple(infile):
    """Read VCF file w/o samples."""
    # ENH: Make all readers return a tuple (header_string, body_table)
    # ENH: usecols -- need to trim dtypes dict to match?
    header_lines = []
    with as_handle(infile, 'rU') as handle:
        for line in handle:
            if line.startswith('##'):
                header_lines.append(line)
            else:
                assert line.startswith('#CHR')
                header_line = line
                header_lines.append(line)
                break

        # Extract sample names from VCF header, keep as column names
        header_fields = header_line.split('\t')
        sample_ids = header_fields[9:]
        colnames = ['chromosome', 'start', 'id', 'ref', 'alt', 'qual',
                    'filter', 'info', 'format'] + sample_ids
        dtypes = {c: str for c in colnames}
        dtypes['start'] = int
        del dtypes['qual']
        table = pd.read_csv(handle, sep='\t', header=None, na_filter=False,
                            names=colnames,
                            converters={'qual': parse_qual},
                            dtype=dtypes)

    # ENH: do things with filter, info
    table['start'] -= 1
    table['end'] = table['info'].apply(parse_end_from_info)
    set_ends(table)
    logging.info("Loaded %d plain records", len(table))
    return table

def save(self, fpOrFilePrefix=None):
    """
    Save state to a file.

    @param fpOrFilePrefix: A file pointer, or the C{str} prefix of a file
        name, or C{None}. If a C{str}, self.SAVE_SUFFIX is appended to get
        the full file name. If C{None}, self._filePrefix will be used as a
        file prefix unless it is also C{None}.
    @raises ValueError: If C{fpOrFilePrefix} and C{self._filePrefix} are
        both C{None}
    """
    if isinstance(fpOrFilePrefix, str):
        saveFile = fpOrFilePrefix + self.SAVE_SUFFIX
    elif fpOrFilePrefix is None:
        if self._filePrefix is None:
            raise ValueError('save must be given an argument (or the '
                             'database must have been restored from a '
                             'file).')
        else:
            saveFile = self._filePrefix + self.SAVE_SUFFIX
    else:
        saveFile = fpOrFilePrefix

    with as_handle(saveFile, 'w') as fp:
        self.dbParams.save(fp)

    self._backend.save(fpOrFilePrefix)

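# A hedged round-trip sketch for the save/restore pair above; 'my-db' is a
# hypothetical file prefix (SAVE_SUFFIX is appended internally, per the
# docstrings).
db.save('my-db')                       # writes 'my-db' + Database.SAVE_SUFFIX
restored = Database.restore('my-db')   # rebuilds params, connector and prefix
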
def write_PDB(entity: Structure, file: str, pdbid: str = None,
              chainid: str = None) -> None:
    """Write PDB file with HEADER and TITLE."""
    enumerate_atoms(entity)

    with as_handle(file, "w") as fp:
        try:
            if "S" == entity.level:
                if hasattr(entity, "header"):
                    if not pdbid:
                        pdbid = entity.header.get("idcode", None)
                    hdr = entity.header.get("head", None)
                    dd = pdb_date(entity.header.get("deposition_date", None))
                    if hdr:
                        fp.write(("HEADER    {:40}{:8}   {:4}\n").format(
                            hdr.upper(), (dd or ""), (pdbid or "")))
                    nam = entity.header.get("name", None)
                    if nam:
                        fp.write("TITLE     " + nam.upper() + "\n")
                io = PDBIO()
                io.set_structure(entity)
                io.save(fp, preserve_atom_numbering=True)
            else:
                raise PDBException("level not 'S': " + str(entity.level))
        except KeyError:
            raise Exception(
                "write_PDB: argument is not a Biopython PDB Entity "
                + str(entity))

def get_structure(self, id, file):
    """Return the structure.

    Arguments:
     - id - string, the id that will be used for the structure
     - file - name of the PDB file OR an open filehandle
    """
    with warnings.catch_warnings():
        if self.QUIET:
            warnings.filterwarnings("ignore", category=PDBConstructionWarning)

        self.header = None
        self.trailer = None
        # Make a StructureBuilder instance (pass id of structure as parameter)
        self.structure_builder.init_structure(id)

        with as_handle(file, mode="rU") as handle:
            lines = handle.readlines()
            if not lines:
                raise ValueError("Empty file.")
            self._parse(lines)

        self.structure_builder.set_header(self.header)
        # Return the Structure instance
        structure = self.structure_builder.get_structure()
        return structure

def write(sequences, handle, format):
    """Write complete set of sequences to a file.

     - sequences - A list (or iterator) of SeqRecord objects, or (if using
       Biopython 1.54 or later) a single SeqRecord.
     - handle - File handle object to write to, or filename as string
       (note older versions of Biopython only took a handle).
     - format - lower case string describing the file format to write.

    You should close the handle after calling this function.

    Returns the number of records written (as an integer).
    """
    from Bio import AlignIO

    # Try and give helpful error messages:
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)

    if isinstance(sequences, SeqRecord):
        # This raised an exception in older versions of Biopython
        sequences = [sequences]

    if format in _BinaryFormats:
        mode = 'wb'
    else:
        mode = 'w'

    with as_handle(handle, mode) as fp:
        # Map the file format to a writer class
        if format in _FormatToWriter:
            writer_class = _FormatToWriter[format]
            count = writer_class(fp).write_file(sequences)
        elif format in AlignIO._FormatToWriter:
            # Try and turn all the records into a single alignment,
            # and write that using Bio.AlignIO
            alignment = MultipleSeqAlignment(sequences)
            alignment_count = AlignIO.write([alignment], fp, format)
            assert alignment_count == 1, \
                "Internal error - the underlying writer " \
                "should have returned 1, not %s" % repr(alignment_count)
            count = len(alignment)
            del alignment_count, alignment
        elif format in _FormatToIterator or format in AlignIO._FormatToIterator:
            raise ValueError("Reading format '%s' is supported, but not writing"
                             % format)
        else:
            raise ValueError("Unknown format '%s'" % format)

    assert isinstance(count, int), "Internal error - the underlying %s " \
        "writer should have returned the record count, not %s" \
        % (format, repr(count))

    return count

def write_fasta(records, fname):
    """Write a FASTA file without wrapping lines."""
    with as_handle(fname, 'w+') as outfile:
        for rec in records:
            descr = rec.description.strip()
            if descr:
                outfile.write(">%s %s\n%s\n" % (rec.id, descr, rec.seq))
            else:
                outfile.write(">%s\n%s\n" % (rec.id, rec.seq))

def read_dict(infile):
    colnames = ["chromosome", "start", "end",
                # "file", "md5"
                ]
    with as_handle(infile, 'r') as handle:
        rows = _parse_lines(handle)
        return pd.DataFrame.from_records(rows, columns=colnames)

def get_mmcif_dictionary(filename):

    def get_mmcif_dictionary_local_function(fnm):
        return MMCIF2Dict(fnm)

    try:
        return get_mmcif_dictionary_local_function(filename)
    except UnicodeDecodeError:
        # Some mmCIF files are not UTF-8 encoded; retry assuming UTF-16.
        with as_handle(filename, 'r', encoding='utf-16') as f:
            return get_mmcif_dictionary_local_function(f)

def convert(in_file, in_format, out_file, out_format, alphabet=None):
    """Convert between two sequence file formats, return number of records.

    Arguments:
     - in_file - an input handle or filename
     - in_format - input file format, lower case string
     - out_file - an output handle or filename
     - out_format - output file format, lower case string
     - alphabet - optional alphabet to assume

    **NOTE** - If you provide an output filename, it will be opened which will
    overwrite any existing file without warning. This may happen even if the
    conversion is aborted (e.g. an invalid out_format name is given).

    For example, going from a filename to a handle:

    >>> from Bio import SeqIO
    >>> from io import StringIO
    >>> handle = StringIO("")
    >>> SeqIO.convert("Quality/example.fastq", "fastq", handle, "fasta")
    3
    >>> print(handle.getvalue())
    >EAS54_6_R1_2_1_413_324
    CCCTTCTTGTCTTCAGCGTTTCTCC
    >EAS54_6_R1_2_1_540_792
    TTGGCAGGCCAAGGCCGATGGATCA
    >EAS54_6_R1_2_1_443_348
    GTTGCTTCTGGCGTGGGTGGGGGGG
    <BLANKLINE>

    """
    in_mode = "rb" if in_format in _BinaryFormats else "r"
    out_mode = "wb" if out_format in _BinaryFormats else "w"

    # This will check the arguments and issue error messages,
    # after we have opened the file which is a shame.
    from ._convert import _handle_convert  # Lazy import

    with as_handle(in_file, in_mode) as in_handle:
        with as_handle(out_file, out_mode) as out_handle:
            count = _handle_convert(
                in_handle, in_format, out_handle, out_format, alphabet
            )

    return count

def parse(handle, format, seq_count=None):
    """Iterate over an alignment file as MultipleSeqAlignment objects.

    Arguments:
     - handle - handle to the file, or the filename as a string
       (note older versions of Biopython only took a handle).
     - format - string describing the file format.
     - seq_count - Optional integer, number of sequences expected in each
       alignment. Recommended for fasta format files.

    If you have the file name in a string 'filename', use:

    >>> from Bio import AlignIO
    >>> filename = "Emboss/needle.txt"
    >>> format = "emboss"
    >>> for alignment in AlignIO.parse(filename, format):
    ...     print("Alignment of length %i" % alignment.get_alignment_length())
    Alignment of length 124
    Alignment of length 119
    Alignment of length 120
    Alignment of length 118
    Alignment of length 125

    If you have a string 'data' containing the file contents, use::

        from Bio import AlignIO
        from io import StringIO
        my_iterator = AlignIO.parse(StringIO(data), format)

    Use the Bio.AlignIO.read() function when you expect a single record only.
    """
    from Bio import SeqIO

    # Try and give helpful error messages:
    if not isinstance(format, str):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)
    if seq_count is not None and not isinstance(seq_count, int):
        raise TypeError("Need integer for seq_count (sequences per alignment)")

    with as_handle(handle) as fp:
        # Map the file format to a sequence iterator:
        if format in _FormatToIterator:
            iterator_generator = _FormatToIterator[format]
            i = iterator_generator(fp, seq_count)
        elif format in SeqIO._FormatToIterator:
            # Exploit the existing SeqIO parser to do the dirty work!
            i = _SeqIO_to_alignment_iterator(fp, format, seq_count=seq_count)
        else:
            raise ValueError("Unknown format '%s'" % format)
        yield from i

def XdnaIterator(handle):
    """Parse a Xdna file and return a SeqRecord object.

    Note that this is an "iterator" in name only since a Xdna file always
    contains a single sequence.
    """
    # Parse fixed-size header and do some rudimentary checks
    #
    # The "neg_length" value is the length of the part of the sequence
    # before the nucleotide considered as the "origin" (nucleotide number 1,
    # which in DNA Strider is not always the first nucleotide).
    # Biopython's SeqRecord has no such concept of a sequence origin as far
    # as I know, so we ignore that value. SerialCloner has no such concept
    # either and always generates files with a neg_length of zero.
    with as_handle(handle, "rb") as handle:
        header = _read_header(handle, 112)
        (version, type, topology, length, neg_length,
         com_length) = unpack(">BBB25xII60xI12x", header)
        if version != 0:
            raise ValueError("Unsupported XDNA version")
        if type not in _seq_types:
            raise ValueError("Unknown sequence type")

        # Read actual sequence and comment found in all XDNA files
        sequence = _read(handle, length).decode("ASCII")
        comment = _read(handle, com_length).decode("ASCII")

        # Try to derive a name from the first "word" of the comment
        name = comment.split(" ")[0]

        # Create record object
        record = SeqRecord(Seq(sequence, _seq_types[type]),
                           description=comment, name=name, id=name)
        if topology in _seq_topologies:
            record.annotations["topology"] = _seq_topologies[topology]

        if len(handle.read(1)) == 1:
            # This is an XDNA file with an optional annotation section.

            # Skip the overhangs as I don't know how to represent
            # them in the SeqRecord model.
            _read_overhang(handle)  # right-side overhang
            _read_overhang(handle)  # left-side overhang

            # Read the features
            num_features = unpack(">B", _read(handle, 1))[0]
            while num_features > 0:
                _read_feature(handle, record)
                num_features -= 1

        yield record

def sniff_region_format(infile):
    """Guess the format of the given file by reading the first line.

    Returns
    -------
    str or None
        The detected format name, or None if the file is empty.
    """
    # If the filename extension indicates the format, try that first
    fname_fmt = None
    fname = get_filename(infile)
    if fname:
        _base, ext = os.path.splitext(fname)
        ext = ext.lstrip('.')
        # if ext in known_extensions:
        # NB: the dot was already stripped above, so compare the whole
        # extension (the earlier `ext[1:]` dropped its first letter).
        if ext in format_patterns:
            fname_fmt = ext

    # Fallback: regex detection
    # has_track = False
    with as_handle(infile, 'rU') as handle:
        for line in handle:
            if not line.strip():
                # Skip blank lines
                continue
            if line.startswith('track'):
                # NB: Could be UCSC BED or Ensembl GFF
                # has_track = True
                continue
            if fname_fmt and format_patterns[fname_fmt].match(line):
                return fname_fmt
            # Formats that (may) declare themselves in an initial '#' comment
            if (line.startswith('##gff-version') or
                    format_patterns['gff'].match(line)):
                return 'gff'
            if line.startswith(('##fileformat=VCF', '#CHROM\tPOS\tID')):
                return 'vcf'
            if line.startswith('#'):
                continue
            # Formats that need to be guessed solely by regex
            if format_patterns['text'].match(line):
                return 'text'
            if format_patterns['tab'].match(line):
                return 'tab'
            if line.startswith('@') or format_patterns['interval'].match(line):
                return 'interval'
            if format_patterns['refflat'].match(line):
                return 'refflat'
            if format_patterns['bed'].match(line):
                return 'bed'
            raise ValueError("File %r does not appear to be a recognized "
                             "format! (Any of: %s)\n"
                             "First non-blank line:\n%s"
                             % (fname, ', '.join(format_patterns.keys()), line))

def parse_bed(fname, coord_only, keep_strand):
    """Parse a BED file.

    A BED file has these columns:
        chromosome, start position, end position, [name, strand, other stuff...]

    Counting is from 0.

    Sets of regions are separated by "track" lines. This function stops
    iteration after encountering a track line other than the first one in
    the file.
    """
    if coord_only:
        if keep_strand:
            @report_bad_line
            def _parse_line(line):
                chrom, start, end, _name, _score, strand = line.split('\t', 6)[:6]
                return chrom, int(start), int(end), strand.rstrip()
        else:
            @report_bad_line
            def _parse_line(line):
                chrom, start, end = line.split('\t', 3)[:3]
                return chrom, int(start), int(end)
    elif keep_strand:
        @report_bad_line
        def _parse_line(line):
            fields = line.split('\t', 6)
            chrom, start, end = fields[:3]
            name = (fields[3].rstrip()
                    if len(fields) >= 4 else '-')
            strand = (fields[5].rstrip()
                      if len(fields) >= 6 else '.')
            return chrom, int(start), int(end), name, strand
    else:
        @report_bad_line
        def _parse_line(line):
            fields = line.split('\t', 4)
            chrom, start, end = fields[:3]
            name = (fields[3].rstrip()
                    if len(fields) >= 4 else '-')
            return chrom, int(start), int(end), name

    with as_handle(fname, 'rU') as handle:
        firstline = next(handle)
        if not firstline.startswith("track"):
            yield _parse_line(firstline)
        for line in handle:
            if line.startswith('track'):
                # PEP 479: don't raise StopIteration inside a generator;
                # returning ends the iteration cleanly.
                return
            yield _parse_line(line)

def write(alignments, handle, format):
    """Write complete set of alignments to a file.

    Arguments:
     - alignments - A list (or iterator) of Alignment objects (ideally the
       new MultipleSeqAlignment objects), or (if using Biopython 1.54 or
       later) a single alignment object.
     - handle - File handle object to write to, or filename as string
       (note older versions of Biopython only took a handle).
     - format - lower case string describing the file format to write.

    You should close the handle after calling this function.

    Returns the number of alignments written (as an integer).
    """
    from Bio import SeqIO

    # Try and give helpful error messages:
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)

    if isinstance(alignments, Alignment):
        # This raised an exception in older versions of Biopython
        alignments = [alignments]

    with as_handle(handle, 'w') as fp:
        # Map the file format to a writer class
        if format in _FormatToWriter:
            writer_class = _FormatToWriter[format]
            count = writer_class(fp).write_file(alignments)
        elif format in SeqIO._FormatToWriter:
            # Exploit the existing SeqIO parser to do the dirty work!
            # TODO - Can we make one call to SeqIO.write() and count the alignments?
            count = 0
            for alignment in alignments:
                if not isinstance(alignment, Alignment):
                    raise TypeError(
                        "Expect a list or iterator of Alignment objects.")
                SeqIO.write(alignment, fp, format)
                count += 1
        elif format in _FormatToIterator or format in SeqIO._FormatToIterator:
            raise ValueError("Reading format '%s' is supported, but not writing"
                             % format)
        else:
            raise ValueError("Unknown format '%s'" % format)

    assert isinstance(count, int), "Internal error - the underlying %s " \
        "writer should have returned the alignment count, not %s" \
        % (format, repr(count))

    return count

def iter(self):
    """
    Iterate over the sequences in self.file_, yielding each as an
    instance of the desired read class.
    """
    # Use FastqGeneralIterator because it provides access to the
    # unconverted quality string (i.e., it doesn't try to figure out
    # the numeric quality values, which we don't care about at this
    # point).
    with as_handle(self.file_) as fp:
        for sequenceId, sequence, quality in FastqGeneralIterator(fp):
            yield self.readClass(sequenceId, sequence, quality)

def parse_tsv(infile, keep_header=False):
    """Parse a tabular data table into an iterable of lists.

    Rows are split on tabs. Header row is optionally included in the output.
    """
    with as_handle(infile) as handle:
        lines = iter(handle)
        header = next(lines)
        if keep_header:
            yield header.rstrip().split('\t')
        for line in lines:
            yield line.rstrip().split('\t')

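# A hedged usage sketch for parse_tsv; 'samples.tsv' is a hypothetical
# tab-separated file with a header row.
for fields in parse_tsv("samples.tsv", keep_header=True):
    print(fields)  # each row as a list of column strings
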
def records(self):
    """
    Yield BLAST records, as read by the BioPython NCBIXML.parse method.

    Set self.params from data in the first record.
    """
    first = True
    with as_handle(self._filename) as fp:
        for record in NCBIXML.parse(fp):
            if first:
                self.params = self._convertBlastParamsToDict(record)
                first = False
            yield record

def parse(handle, format=None, **kwargs):
    """Turns a search output file into a generator that yields QueryResult
    objects.

    Arguments:
    handle -- Handle to the file, or the filename as a string.
    format -- Lower case string denoting one of the supported formats.
    kwargs -- Format-specific keyword arguments.

    This function is used to iterate over each query in a given search output
    file:

    >>> from Bio import SearchIO
    >>> qresults = SearchIO.parse('Blast/mirna.xml', 'blast-xml')
    >>> qresults
    <generator object ...>
    >>> for qresult in qresults:
    ...     print "Search %s has %i hits" % (qresult.id, len(qresult))
    ...
    Search 33211 has 100 hits
    Search 33212 has 44 hits
    Search 33213 has 95 hits

    Depending on the file format, `parse` may also accept additional keyword
    argument(s) that modifies the behavior of the format parser. Here is a
    simple example, where the keyword argument enables parsing of a commented
    BLAST tabular output file:

    >>> from Bio import SearchIO
    >>> for qresult in SearchIO.parse('Blast/mirna.tab', 'blast-tab', comments=True):
    ...     print "Search %s has %i hits" % (qresult.id, len(qresult))
    ...
    Search 33211 has 100 hits
    Search 33212 has 44 hits
    Search 33213 has 95 hits

    """
    # get the iterator object and do error checking
    iterator = get_processor(format, _ITERATOR_MAP)

    # HACK: force BLAST XML decoding to use utf-8
    handle_kwargs = {}
    if format == 'blast-xml' and sys.version_info[0] > 2:
        handle_kwargs['encoding'] = 'utf-8'

    # and start iterating
    with as_handle(handle, 'rU', **handle_kwargs) as source_file:
        generator = iterator(source_file, **kwargs)
        for qresult in generator:
            yield qresult

def read_text(infile):
    """Text coordinate format: "chr:start-end", one per line.

    Or sometimes: "chrom:start-end gene" or "chrom:start-end REF>ALT"

    Coordinate indexing is assumed to be from 1.
    """
    parse_line = report_bad_line(from_label)
    with as_handle(infile, 'rU') as handle:
        rows = [parse_line(line) for line in handle]
    table = pd.DataFrame.from_records(rows, columns=["chromosome", "start",
                                                     "end", "gene"])
    table['gene'] = table['gene'].replace('', '-')
    return table

def get_structure(self, structure_id, filename):
    """Return the structure.

    Arguments:
     - structure_id - string, the id that will be used for the structure
     - filename - name of the mmCIF file OR an open filehandle
    """
    with warnings.catch_warnings():
        if self.QUIET:
            warnings.filterwarnings("ignore", category=PDBConstructionWarning)
        with as_handle(filename) as handle:
            self._build_structure(structure_id, handle)

    return self._structure_builder.get_structure()

def diamondTabularFormatToDicts(filename, fieldNames=None):
    """
    Read DIAMOND tabular (--outfmt 6) output and convert lines to
    dictionaries.

    @param filename: Either a C{str} file name or an open file pointer.
    @param fieldNames: A C{list} or C{tuple} of C{str} DIAMOND field names.
        Run 'diamond -help' to see the full list. If C{None}, a default set
        of fields will be used, as compatible with convert-diamond-to-sam.py
    @raise ValueError: If a line of C{filename} does not have the expected
        number of TAB-separated fields (i.e., len(fieldNames)). Or if
        C{fieldNames} is empty or contains duplicates.
    @return: A generator that yields C{dict}s with keys that are the DIAMOND
        field names and values as converted by DIAMOND_FIELD_CONVERTER.
    """
    fieldNames = fieldNames or FIELDS.split()
    nFields = len(fieldNames)
    if not nFields:
        raise ValueError('fieldNames cannot be empty.')

    c = Counter(fieldNames)
    if c.most_common(1)[0][1] > 1:
        raise ValueError(
            'fieldNames contains duplicated names: %s.' %
            (', '.join(sorted(x[0] for x in c.most_common() if x[1] > 1))))

    def identity(x):
        return x

    convertFunc = DIAMOND_FIELD_CONVERTER.get

    with as_handle(filename) as fp:
        for count, line in enumerate(fp, start=1):
            result = {}
            line = line[:-1]
            values = line.split('\t')
            if len(values) != nFields:
                raise ValueError(
                    'Line %d of %s had %d field values (expected %d). '
                    'To provide input for this function, DIAMOND must be '
                    'called with "--outfmt 6 %s" (without the quotes). '
                    'The offending input line was %r.' %
                    (count,
                     (filename if isinstance(filename, six.string_types)
                      else 'input'),
                     len(values), nFields, FIELDS, line))
            for fieldName, value in zip(fieldNames, values):
                value = convertFunc(fieldName, identity)(value)
                result[fieldName] = value
            yield result

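# A hedged usage sketch; 'diamond.tsv' is a hypothetical file produced by
# DIAMOND with "--outfmt 6" and the default field layout in FIELDS.
for hit in diamondTabularFormatToDicts('diamond.tsv'):
    print(sorted(hit))  # one dict per alignment line, keyed by field name
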
def __init__(self, filename):
    """Parse a mmCIF file and return a dictionary.

    Arguments:
     - file - name of the PDB file OR an open filehandle
    """
    self.quote_chars = ['\'', '\"']
    self.whitespace_chars = [' ', '\t']
    with as_handle(filename) as handle:
        loop_flag = False
        key = None
        tokens = self._tokenize(handle)
        try:
            token = next(tokens)
        except StopIteration:
            return  # for Python 3.7 and PEP 479
        self[token[0:5]] = token[5:]
        i = 0
        n = 0
        for token in tokens:
            if token.lower() == "loop_":
                loop_flag = True
                keys = []
                i = 0
                n = 0
                continue
            elif loop_flag:
                # The second condition checks we are in the first column
                # Some mmCIF files (e.g. 4q9r) have values in later columns
                # starting with an underscore and we don't want to read
                # these as keys
                if token.startswith("_") and (n == 0 or i % n == 0):
                    if i > 0:
                        loop_flag = False
                    else:
                        self[token] = []
                        keys.append(token)
                        n += 1
                        continue
                else:
                    self[keys[i % n]].append(token)
                    i += 1
                    continue
            if key is None:
                key = token
            else:
                self[key] = token
                key = None

def write(qresults, handle, format=None, **kwargs):
    """Writes QueryResult objects to a file in the given format.

    Arguments:
    qresults -- An iterator returning QueryResult objects or a single
                QueryResult object.
    handle -- Handle to the file, or the filename as a string.
    format -- Lower case string denoting one of the supported formats.
    kwargs -- Format-specific keyword arguments.

    The `write` function writes QueryResult object(s) into the given output
    handle / filename. You can supply it with a single QueryResult object or
    an iterable returning one or more QueryResult objects. In both cases, the
    function will return a tuple of four values: the number of QueryResult,
    Hit, HSP, and HSPFragment objects it writes to the output file.

        from Bio import SearchIO
        qresults = SearchIO.parse('Blast/mirna.xml', 'blast-xml')
        SearchIO.write(qresults, 'results.tab', 'blast-tab')
        <stdout> (3, 239, 277, 277)

    The output of different formats may be adjusted using the format-specific
    keyword arguments. Here is an example that writes BLAT PSL output file
    with a header:

        from Bio import SearchIO
        qresults = SearchIO.parse('Blat/psl_34_001.psl', 'blat-psl')
        SearchIO.write(qresults, 'results.tab', 'blat-psl', header=True)
        <stdout> (2, 13, 22, 26)

    """
    # turn qresults into an iterator if it's a single QueryResult object
    if isinstance(qresults, QueryResult):
        qresults = iter([qresults])
    else:
        qresults = iter(qresults)

    # get the writer object and do error checking
    writer_class = get_processor(format, _WRITER_MAP)

    # write to the handle
    with as_handle(handle, 'w') as target_file:
        writer = writer_class(target_file, **kwargs)
        # count how many qresults, hits, and hsps
        qresult_count, hit_count, hsp_count, frag_count = \
            writer.write_file(qresults)

    return qresult_count, hit_count, hsp_count, frag_count

def parse_interval_list(fname, coord_only, keep_strand):
    """Parse a Picard-compatible interval list.

    Expected tabular columns:
        chromosome, start position, end position, strand, region name

    Counting is from 1.
    """
    if coord_only:
        if keep_strand:
            @report_bad_line
            def _parse_line(line):
                chrom, start, end, strand = line.split('\t')[:4]
                return chrom, int(start) - 1, int(end), strand.rstrip()
        else:
            @report_bad_line
            def _parse_line(line):
                chrom, start, end = line.split('\t')[:3]
                return chrom, int(start) - 1, int(end)
    elif keep_strand:
        @report_bad_line
        def _parse_line(line):
            fields = line.split('\t')
            chrom, start, end, strand = fields[:4]
            if len(fields) > 4:
                name = fields[-1].rstrip()
            else:
                name = '-'
            return chrom, int(start) - 1, int(end), name, strand
    else:
        @report_bad_line
        def _parse_line(line):
            fields = line.split('\t')
            chrom, start, end = fields[:3]
            if len(fields) > 3:
                name = fields[-1].rstrip()
            else:
                name = '-'
            return chrom, int(start) - 1, int(end), name

    with as_handle(fname, 'rU') as handle:
        for line in handle:
            if line.startswith('@'):
                # Skip the SAM header
                continue
            yield _parse_line(line)

def read(cls, infile, sample_id=None):
    """Parse a tabular table of coverage data from a handle or filename."""
    if sample_id is None:
        if isinstance(infile, basestring):
            sample_id = core.fbase(infile)
        else:
            sample_id = '<unknown>'
    with as_handle(infile) as handle:
        rows = _parse_lines(handle)
        try:
            xtra = next(rows)
            row_data = [next(rows)]
            row_data.extend(rows)
        except StopIteration:
            # Don't crash on empty files
            return cls(sample_id, [], [], [], [], [])
    return cls.from_rows(sample_id, row_data, xtra)

def group_bed_tracks(bedfile):
    """Group the parsed rows in a BED file by track.

    Yields (track_name, iterable_of_lines), much like itertools.groupby.
    """
    # ENH - make this memory-efficient w/ generators or something
    with as_handle(bedfile, 'r') as handle:
        curr_track = 'DEFAULT'
        curr_lines = []
        for line in handle:
            if line.startswith('track'):
                if curr_lines:
                    yield curr_track, curr_lines
                    curr_lines = []
                curr_track = parse_bed_track(line)
            else:
                curr_lines.append(line)
        yield curr_track, curr_lines

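# A hedged usage sketch; 'tracks.bed' is a hypothetical BED file containing
# several 'track' lines.
for track_name, lines in group_bed_tracks("tracks.bed"):
    print(track_name, len(lines), "lines")
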
def read(cls, infile, sample_id=None):
    """Parse a tabular table of coverage data from a handle or filename."""
    if sample_id is None:
        if isinstance(infile, basestring):
            sample_id = core.fbase(infile)
        else:
            sample_id = '<unknown>'
    with as_handle(infile) as handle:
        try:
            header = next(handle)
        except StopIteration:
            # Don't crash on empty files
            return cls(sample_id)
        # Build CNA...
        xtra = _sniff_xtra(header)
        cnarr = cls(sample_id, xtra)
        arr = numpy.loadtxt(handle, delimiter="\t",
                            dtype=cnarr.data.dtype, ndmin=1)
    cnarr.data = arr
    return cnarr

def read_bed(infile):
    """UCSC Browser Extensible Data (BED) format.

    A BED file has these columns:
        chromosome, start position, end position, [gene, strand, other stuff...]

    Coordinate indexing is from 0.

    Sets of regions are separated by "track" lines. This function stops reading
    after encountering a track line other than the first one in the file.
    """
    # ENH: just pd.read_csv, skip 'track'
    @report_bad_line
    def _parse_line(line):
        fields = line.split('\t', 6)
        chrom, start, end = fields[:3]
        gene = (fields[3].rstrip()
                if len(fields) >= 4 else '-')
        strand = (fields[5].rstrip()
                  if len(fields) >= 6 else '.')
        return chrom, int(start), int(end), gene, strand

    def track2track(handle):
        try:
            firstline = next(handle)
            if firstline.startswith("browser "):
                # UCSC Genome Browser feature -- ignore it
                firstline = next(handle)
        except StopIteration:
            pass
        else:
            if not firstline.startswith("track"):
                yield firstline
            for line in handle:
                if line.startswith("track"):
                    break
                yield line

    with as_handle(infile, 'rU') as handle:
        rows = map(_parse_line, track2track(handle))
        return pd.DataFrame.from_records(
            rows, columns=["chromosome", "start", "end", "gene", "strand"])

def __init__(self, filename):
    """Parse a mmCIF file and return a dictionary.

    Arguments:
     - file - name of the PDB file OR an open filehandle
    """
    with as_handle(filename) as handle:
        loop_flag = False
        key = None
        tokens = self._tokenize(handle)
        token = next(tokens)
        self[token[0:5]] = token[5:]
        i = 0
        n = 0
        for token in tokens:
            if token == "loop_":
                loop_flag = True
                keys = []
                i = 0
                n = 0
                continue
            elif loop_flag:
                if token.startswith("_"):
                    if i > 0:
                        loop_flag = False
                    else:
                        self[token] = []
                        keys.append(token)
                        n += 1
                        continue
                else:
                    self[keys[i % n]].append(token)
                    i += 1
                    continue
            if key is None:
                key = token
            else:
                self[key] = token
                key = None

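# A hedged usage sketch for the mmCIF parser above; '1abc.cif' is a
# hypothetical file. MMCIF2Dict behaves like a dict mapping mmCIF tags
# to their values.
mmcif = MMCIF2Dict("1abc.cif")
print(mmcif["_entry.id"])  # the PDB entry identifier, for example
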