def write_vcf(path, callset, rename=None, number=None, description=None,
              fill=None, write_header=True):
    """Preliminary support for writing a VCF file. Currently does not
    support sample data. Needs further work."""

    names, callset = normalize_callset(callset)

    def write_file(vcf_file):
        if write_header:
            write_vcf_header(vcf_file, names, callset=callset, rename=rename,
                             number=number, description=description)
        write_vcf_data(vcf_file, names, callset=callset, rename=rename,
                       fill=fill)

    path = resolve_path(path)
    if hasattr(path, 'write'):
        write_file(path)
    else:
        with open(path, 'w') as f:
            write_file(f)
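
# Example usage (a sketch, not taken from this module's tests): assuming
# normalize_callset() accepts a dict-like mapping of column names to numpy
# arrays, a minimal callset could be written out as below. The column names
# and values are hypothetical.
#
#     import numpy as np
#     callset = {
#         'CHROM': np.array(['chr1', 'chr1'], dtype=object),
#         'POS': np.array([105, 208]),
#         'REF': np.array(['A', 'T'], dtype=object),
#         'ALT': np.array(['C', 'G'], dtype=object),
#         'DP': np.array([23, 45]),
#         'FILTER_PASS_QC': np.array([True, False]),
#     }
#     write_vcf('example.vcf', callset, description={'DP': 'Raw read depth'})
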
def write_fasta(path, sequences, names, mode='w', width=80):
    """Write nucleotide sequences stored as numpy arrays to a FASTA file.

    Parameters
    ----------
    path : string
        File path.
    sequences : sequence of arrays
        One or more ndarrays of dtype 'S1' containing the sequences.
    names : sequence of strings
        Names of the sequences.
    mode : string, optional
        Use 'a' to append to an existing file.
    width : int, optional
        Maximum line width.

    """

    # check inputs
    if isinstance(sequences, np.ndarray):
        # single sequence
        sequences = [sequences]
        names = [names]
    if len(sequences) != len(names):
        raise ValueError('must provide the same number of sequences and names')
    for sequence in sequences:
        if sequence.dtype != np.dtype('S1'):
            raise ValueError('expected S1 dtype, found %r' % sequence.dtype)

    # force binary mode
    mode = 'ab' if 'a' in mode else 'wb'

    def save_as_fasta(fasta):
        for name, sequence in zip(names, sequences):
            # force bytes
            if isinstance(name, str):
                name = name.encode('ascii')
            header = b'>' + name + b'\n'
            fasta.write(header)
            for i in range(0, sequence.size, width):
                # tobytes() replaces the deprecated tostring(), which has
                # been removed in recent numpy releases
                line = sequence[i:i + width].tobytes() + b'\n'
                fasta.write(line)

    # write to file
    path = resolve_path(path)
    if hasattr(path, 'write'):
        save_as_fasta(path)
    else:
        with open(path, mode=mode) as f:
            save_as_fasta(f)
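
# Example usage (a sketch): sequences must be numpy arrays of dtype 'S1',
# one array element per base. The sequence data below is made up.
#
#     import numpy as np
#     seq = np.array([b'A', b'C', b'G', b'T'] * 30, dtype='S1')
#     write_fasta('example.fasta', seq, 'chr1', width=60)
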
def write_vcf_header(vcf_file, names, callset, rename, number, description):
    if rename is None:
        rename = dict()
    if number is None:
        number = dict()
    if description is None:
        description = dict()

    # write file format version
    print('##fileformat=VCFv4.1', file=vcf_file)

    # write today's date
    today = date.today().strftime('%Y%m%d')
    print('##fileDate=%s' % today, file=vcf_file)

    # write source
    print('##source=scikit-allel-%s' % allel.__version__, file=vcf_file)

    info_names = [n for n in names
                  if not n.upper().startswith('FILTER_')
                  and n.upper() not in VCF_FIXED_FIELDS]
    info_ids = [rename[n] if n in rename else n for n in info_names]

    # write INFO headers, sorted by ID
    for name, vcf_id in sorted(zip(info_names, info_ids), key=itemgetter(1)):
        col = callset[name]

        # determine VCF Number
        if name in number:
            vcf_number = number[name]
        elif col.ndim == 1 and col.dtype.kind == 'b':
            # Flag
            vcf_number = 0
        elif col.ndim == 1:
            vcf_number = 1
        elif col.ndim == 2:
            vcf_number = col.shape[1]
        else:
            raise NotImplementedError('only columns with one or two '
                                      'dimensions are supported')

        # determine VCF Type
        kind = col.dtype.kind
        if kind == 'b':
            vcf_type = 'Flag'
        elif kind in 'ui':
            vcf_type = 'Integer'
        elif kind == 'f':
            vcf_type = 'Float'
        else:
            vcf_type = 'String'

        # determine VCF Description
        if name in description:
            vcf_description = description[name]
        else:
            vcf_description = ''

        # construct INFO header line
        header_line = ('##INFO=<ID=%s,Number=%s,Type=%s,Description="%s">'
                       % (vcf_id, vcf_number, vcf_type, vcf_description))
        print(header_line, file=vcf_file)

    filter_names = [n for n in names if n.upper().startswith('FILTER_')]
    filter_ids = [rename[n] if n in rename else n[7:] for n in filter_names]

    # write FILTER headers, sorted by ID
    for name, vcf_id in sorted(zip(filter_names, filter_ids),
                               key=itemgetter(1)):

        # determine VCF Description
        if name in description:
            vcf_description = description[name]
        else:
            vcf_description = ''

        # construct FILTER header line
        header_line = ('##FILTER=<ID=%s,Description="%s">'
                       % (vcf_id, vcf_description))
        print(header_line, file=vcf_file)

    # write column names
    line = '#' + '\t'.join(VCF_FIXED_FIELDS)
    print(line, file=vcf_file)
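
# For illustration, with the hypothetical callset shown above, the headers
# written for the integer 'DP' column and the boolean 'FILTER_PASS_QC'
# column would look something like:
#
#     ##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw read depth">
#     ##FILTER=<ID=PASS_QC,Description="">
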
def write_vcf_data(vcf_file, names, callset, rename, fill):
    if rename is None:
        rename = dict()
    if fill is None:
        fill = dict()

    # find the fixed columns, allowing for case insensitive naming in the
    # input array
    col_chrom = None
    col_pos = None
    col_id = None
    col_ref = None
    col_alt = None
    col_qual = None
    for n in names:
        if n.upper() == 'CHROM':
            col_chrom = callset[n]
        elif n.upper() == 'POS':
            col_pos = callset[n]
        elif n.upper() == 'ID':
            col_id = callset[n]
        elif n.upper() == 'REF':
            col_ref = callset[n]
        elif n.upper() == 'ALT':
            col_alt = callset[n]
        elif n.upper() == 'QUAL':
            col_qual = callset[n]

    # check for required columns
    if col_chrom is None:
        raise ValueError('CHROM column not found')
    if col_pos is None:
        raise ValueError('POS column not found')

    # pad optional columns
    dot = itertools.repeat('.')
    if col_id is None:
        col_id = dot
    if col_ref is None:
        col_ref = dot
    if col_alt is None:
        col_alt = dot
    if col_qual is None:
        col_qual = dot

    # find FILTER columns
    filter_names = [n for n in names if n.upper().startswith('FILTER_')]
    filter_ids = [rename[n] if n in rename else n[7:] for n in filter_names]
    filter_cols = [callset[n] for n in filter_names]
    # sort by ID
    if filter_names:
        filters = sorted(zip(filter_names, filter_ids, filter_cols),
                         key=itemgetter(1))
        filter_names, filter_ids, filter_cols = zip(*filters)

    # find INFO columns
    info_names = [n for n in names
                  if not n.upper().startswith('FILTER_')
                  and n.upper() not in VCF_FIXED_FIELDS]
    info_ids = [rename[n] if n in rename else n for n in info_names]
    info_cols = [callset[n] for n in info_names]
    # sort by ID
    if info_names:
        infos = sorted(zip(info_names, info_ids, info_cols),
                       key=itemgetter(1))
        info_names, info_ids, info_cols = zip(*infos)

    # setup writer
    writer = csv.writer(vcf_file, delimiter='\t', lineterminator='\n')

    # zip up data as rows
    rows = zip(col_chrom, col_pos, col_id, col_ref, col_alt, col_qual)
    filter_rows = zip(*filter_cols)
    info_rows = zip(*info_cols)
    for row, filter_row, info_row in itertools.zip_longest(
            rows, filter_rows, info_rows):

        # unpack main row (id_ avoids shadowing the builtin id)
        chrom, pos, id_, ref, alt, qual = row
        chrom = _vcf_value_str(chrom)
        pos = _vcf_value_str(pos)
        id_ = _vcf_value_str(id_)
        ref = _vcf_value_str(ref)
        alt = _vcf_value_str(alt, fill=fill.get('ALT', None))
        qual = _vcf_value_str(qual)

        # construct FILTER value
        if filter_row is not None:
            flt = [i for i, v in zip(filter_ids, filter_row) if v]
            if flt:
                flt = ';'.join(flt)
            else:
                flt = 'PASS'
        else:
            flt = '.'

        # construct INFO value
        if info_row is not None:
            info_vals = [_vcf_info_str(n, i, v, fill)
                         for n, i, v in zip(info_names, info_ids, info_row)]
            info_vals = [x for x in info_vals if x is not None]
            info = ';'.join(info_vals)
        else:
            info = '.'

        # repack
        row = chrom, pos, id_, ref, alt, qual, flt, info
        writer.writerow(row)
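
# Continuing the illustration: each variant becomes one tab-delimited data
# line. For the first row of the hypothetical callset above, and assuming
# _vcf_value_str() simply stringifies scalar values (its definition is not
# shown in this module), the output would be roughly:
#
#     chr1    105    .    A    C    .    PASS_QC    DP=23
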
def iter_gff3(path, attributes=None, region=None, score_fill=-1,
              phase_fill=-1, attributes_fill='.', tabix='tabix'):
    """Iterate over records in a GFF3 file.

    Parameters
    ----------
    path : string, pathlib.Path or any file-like object
        Path to input file.
    attributes : list of strings, optional
        List of columns to extract from the "attributes" field.
    region : string, optional
        Genome region to extract. If given, file must be position
        sorted, bgzipped and tabix indexed. Tabix must also be installed
        and on the system path.
    score_fill : int, optional
        Value to use where score field has a missing value.
    phase_fill : int, optional
        Value to use where phase field has a missing value.
    attributes_fill : object or list of objects, optional
        Value(s) to use where attribute field(s) have a missing value.
    tabix : string, optional
        Tabix command.

    Returns
    -------
    Iterator

    """

    # prepare fill values for attributes
    if attributes is not None:
        attributes = list(attributes)
        if isinstance(attributes_fill, (list, tuple)):
            if len(attributes) != len(attributes_fill):
                raise ValueError('number of fills does not match attributes')
        else:
            attributes_fill = [attributes_fill] * len(attributes)

    # open input stream
    path = resolve_path(path)
    if isinstance(path, str):
        if region is not None:
            cmd = [tabix, path, region]
            buffer = subprocess.Popen(cmd, stdout=subprocess.PIPE).stdout
        elif path.endswith('.gz') or path.endswith('.bgz'):
            buffer = gzip.open(path, mode='rb')
        else:
            buffer = open(path, mode='rb')
    else:
        buffer = path

    try:
        for line in buffer:
            # N.B., lines are bytes, and indexing a bytes object yields an
            # int, so compare prefixes via startswith()
            if line.startswith(b'>'):
                # assume begin embedded FASTA
                return
            if line.startswith(b'#'):
                # skip comment lines
                continue
            vals = line.split(b'\t')
            if len(vals) == 9:
                # unpack for processing
                (fseqid, fsource, ftype, fstart, fend, fscore, fstrand,
                 fphase, fattrs) = vals
                # convert numerics
                fstart = int(fstart)
                fend = int(fend)
                if fscore == b'.':
                    fscore = score_fill
                else:
                    fscore = float(fscore)
                if fphase == b'.':
                    fphase = phase_fill
                else:
                    fphase = int(fphase)
                fseqid = str(fseqid, 'ascii')
                fsource = str(fsource, 'ascii')
                ftype = str(ftype, 'ascii')
                fstrand = str(fstrand, 'ascii')
                fattrs = str(fattrs, 'ascii')
                rec = (fseqid, fsource, ftype, fstart, fend, fscore,
                       fstrand, fphase)
                if attributes is not None:
                    dattrs = gff3_parse_attributes(fattrs)
                    vattrs = tuple(
                        dattrs.get(k, f)
                        for k, f in zip(attributes, attributes_fill)
                    )
                    rec += vattrs
                yield rec
    finally:
        buffer.close()
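
# Example usage (a sketch): iterate over a hypothetical annotation file,
# pulling the 'ID' and 'Parent' keys out of the attributes field. Using the
# region argument requires the file to be bgzipped and tabix indexed.
#
#     for rec in iter_gff3('annotations.gff3.gz',
#                          attributes=['ID', 'Parent'],
#                          region='chr1:1-100000'):
#         (seqid, source, ftype, start, end, score, strand, phase,
#          id_, parent) = rec
#         print(seqid, start, end, ftype, id_)
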