예제 #1
0
def write_vcf(path,
              callset,
              rename=None,
              number=None,
              description=None,
              fill=None,
              write_header=True):
    """Write a callset to a VCF file.

    Preliminary support only: sample data are not written. Needs further
    work.

    Parameters
    ----------
    path : object
        Output destination. It is first passed through ``resolve_path``; if
        the result has a ``write`` attribute it is used directly as the
        output stream, otherwise it is opened with ``open(..., 'w')``.
    callset : object
        Variant data. Passed through ``normalize_callset``, which yields the
        column names and the callset actually used for writing.
    rename : optional
        Forwarded unchanged to ``write_vcf_header`` and ``write_vcf_data``.
    number : optional
        Forwarded unchanged to ``write_vcf_header``.
    description : optional
        Forwarded unchanged to ``write_vcf_header``.
    fill : optional
        Forwarded unchanged to ``write_vcf_data``.
    write_header : bool, optional
        If true (the default), the header is written before the data rows.

    """

    names, callset = normalize_callset(callset)

    def emit(handle):
        # header first (when requested), then the data rows
        if write_header:
            write_vcf_header(handle, names, callset=callset, rename=rename,
                             number=number, description=description)
        write_vcf_data(handle, names, callset=callset, rename=rename,
                       fill=fill)

    target = resolve_path(path)

    if not hasattr(target, 'write'):
        # not a stream: treat as a filesystem location and open it ourselves
        with open(target, 'w') as handle:
            emit(handle)
        return

    # already a writable stream; the caller keeps ownership of it
    emit(target)
예제 #2
0
def write_fasta(path, sequences, names, mode='w', width=80):
    """Write nucleotide sequences stored as numpy arrays to a FASTA file.

    Parameters
    ----------

    path : string or file-like object
        Output destination. It is passed through ``resolve_path``; if the
        result has a ``write`` attribute it is written to directly (it must
        accept bytes), otherwise it is opened as a file in binary mode.
    sequences : sequence of arrays
        One or more ndarrays of dtype 'S1' containing the sequences. A single
        ndarray is treated as one sequence, in which case `names` is taken
        to be the single name.
    names : sequence of strings
        Names of the sequences. ``str`` names are encoded as ASCII.
    mode : string, optional
        Use 'a' to append to an existing file. Any mode containing 'a'
        becomes 'ab'; everything else becomes 'wb'.
    width : int, optional
        Maximum line width.

    Raises
    ------
    ValueError
        If the number of sequences and names differ, or if any sequence
        does not have dtype 'S1'.

    """

    # check inputs
    if isinstance(sequences, np.ndarray):
        # single sequence
        sequences = [sequences]
        names = [names]
    if len(sequences) != len(names):
        raise ValueError('must provide the same number of sequences and names')
    for sequence in sequences:
        if sequence.dtype != np.dtype('S1'):
            raise ValueError('expected S1 dtype, found %r' % sequence.dtype)

    # force binary mode
    mode = 'ab' if 'a' in mode else 'wb'

    def save_as_fasta(fasta):
        for name, sequence in zip(names, sequences):
            # force bytes
            if isinstance(name, str):
                name = name.encode('ascii')
            fasta.write(b'>' + name + b'\n')
            for i in range(0, sequence.size, width):
                # tobytes() replaces ndarray.tostring(), which was deprecated
                # and then removed in NumPy 2.0; the bytes produced are the
                # same
                fasta.write(sequence[i:i + width].tobytes() + b'\n')

    # write to file
    path = resolve_path(path)
    if hasattr(path, 'write'):
        save_as_fasta(path)
    else:
        with open(path, mode=mode) as f:
            save_as_fasta(f)
예제 #3
0
def write_vcf_header(vcf_file, names, callset, rename, number, description):
    """Write VCF meta-information lines and the column header line.

    Emits, in order: the ``##fileformat``, ``##fileDate`` and ``##source``
    lines, one ``##INFO`` line per INFO column (sorted by ID), one
    ``##FILTER`` line per filter column (sorted by ID), and finally the
    ``#``-prefixed column names taken from ``VCF_FIXED_FIELDS``.

    Parameters
    ----------
    vcf_file : object
        Output stream; passed through ``resolve_path`` and then used as the
        ``file`` argument of ``print``.
    names : sequence of strings
        Column names. Names starting with ``FILTER_`` (case-insensitive) are
        filters; names whose upper-case form is in ``VCF_FIXED_FIELDS`` are
        skipped; all remaining names are INFO fields.
    callset : mapping
        Maps each name to its column. INFO columns must expose ``ndim``,
        ``dtype.kind`` and, for 2-D columns, ``shape``.
    rename : dict or None
        Maps column name to the VCF ID to use. Without an entry, INFO fields
        use the name itself and filters use the name with its first seven
        characters removed.
    number : dict or None
        Maps INFO column name to the VCF Number. Without an entry, the number
        is 0 for 1-D boolean columns, 1 for other 1-D columns and the size of
        the second dimension for 2-D columns.
    description : dict or None
        Maps column name to description text; defaults to an empty string.

    Raises
    ------
    NotImplementedError
        If an INFO column without an explicit number has more than two
        dimensions.

    """

    rename = dict() if rename is None else rename
    number = dict() if number is None else number
    description = dict() if description is None else description

    out = resolve_path(vcf_file)

    # fixed preamble: format version, today's date, producing software
    print('##fileformat=VCFv4.1', file=out)
    print('##fileDate=%s' % date.today().strftime('%Y%m%d'), file=out)
    print('##source=scikit-allel-%s' % allel.__version__, file=out)

    # collect (name, ID) pairs for every INFO column
    info_pairs = []
    for n in names:
        upper = n.upper()
        if upper.startswith('FILTER_') or upper in VCF_FIXED_FIELDS:
            continue
        info_pairs.append((n, rename[n] if n in rename else n))
    info_pairs.sort(key=itemgetter(1))

    for name, vcf_id in info_pairs:
        col = callset[name]
        kind = col.dtype.kind

        # VCF Number: explicit override wins, otherwise derive from shape
        if name in number:
            vcf_number = number[name]
        elif col.ndim == 1:
            # a 1-D boolean column is a Flag, which carries no values
            vcf_number = 0 if kind == 'b' else 1
        elif col.ndim == 2:
            vcf_number = col.shape[1]
        else:
            raise NotImplementedError('only columns with 1 or two '
                                      'dimensions are supported')

        # VCF Type follows the dtype kind of the column
        if kind == 'b':
            vcf_type = 'Flag'
        else:
            if kind in 'ui':
                vcf_type = 'Integer'
            else:
                vcf_type = 'Float' if kind == 'f' else 'String'

        vcf_description = description[name] if name in description else ''

        print('##INFO=<ID=%s,Number=%s,Type=%s,Description="%s">'
              % (vcf_id, vcf_number, vcf_type, vcf_description),
              file=out)

    # collect (name, ID) pairs for every filter column
    filter_pairs = sorted(
        ((n, rename[n] if n in rename else n[7:])
         for n in names if n.upper().startswith('FILTER_')),
        key=itemgetter(1),
    )

    for name, vcf_id in filter_pairs:
        vcf_description = description[name] if name in description else ''
        print('##FILTER=<ID=%s,Description="%s">'
              % (vcf_id, vcf_description),
              file=out)

    # column names
    print('#' + '\t'.join(VCF_FIXED_FIELDS), file=out)
예제 #4
0
def write_vcf_data(vcf_file, names, callset, rename, fill):
    """Write the VCF data rows (one tab-delimited line per variant).

    Each row holds CHROM, POS, ID, REF, ALT, QUAL, FILTER and INFO, in that
    order. Sample data are not written.

    Parameters
    ----------
    vcf_file : object
        Output stream; passed through ``resolve_path`` and then handed to
        ``csv.writer``.
    names : sequence of strings
        Column names. The fixed columns CHROM, POS, ID, REF, ALT and QUAL
        are matched case-insensitively. Names starting with ``FILTER_``
        (case-insensitive) are filters. Names whose upper-case form is not
        in ``VCF_FIXED_FIELDS`` and which are not filters are INFO fields.
    callset : mapping
        Maps each name to an iterable column of per-variant values.
    rename : dict or None
        Maps column name to the VCF ID to use. Without an entry, INFO fields
        use the name itself and filters use the name with its first seven
        characters removed.
    fill : dict or None
        Fill values. The ``'ALT'`` entry is passed to ``_vcf_value_str`` for
        the ALT column and the whole dict is passed to ``_vcf_info_str``.

    Raises
    ------
    ValueError
        If no CHROM or no POS column is found.

    Notes
    -----
    The FILTER field is the ';'-joined IDs of the filters whose value is
    truthy, ``PASS`` if none is, or ``.`` if there are no filter columns.
    The INFO field is the ';'-joined non-None results of ``_vcf_info_str``,
    or ``.`` if there is nothing to report.

    """

    if rename is None:
        rename = dict()
    if fill is None:
        fill = dict()

    vcf_file = resolve_path(vcf_file)

    # find the fixed columns, allowing for case insensitive naming in the
    # input; if several names map to the same field, the last one wins
    fixed = dict.fromkeys(('CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL'))
    for n in names:
        key = n.upper()
        if key in fixed:
            fixed[key] = callset[n]

    # check for required columns
    col_chrom = fixed['CHROM']
    col_pos = fixed['POS']
    if col_chrom is None:
        raise ValueError('CHROM column not found')
    if col_pos is None:
        raise ValueError('POS column not found')

    # pad optional columns with an endless stream of missing values; the
    # zip below stops at the shortest real column, so this never overruns
    dot = itertools.repeat('.')
    col_id, col_ref, col_alt, col_qual = (
        dot if fixed[k] is None else fixed[k]
        for k in ('ID', 'REF', 'ALT', 'QUAL')
    )

    # find FILTER columns, sorted by ID
    filters = sorted(
        ((n, rename[n] if n in rename else n[7:], callset[n])
         for n in names if n.upper().startswith('FILTER_')),
        key=itemgetter(1),
    )
    filter_ids = [f[1] for f in filters]
    filter_cols = [f[2] for f in filters]

    # find INFO columns, sorted by ID
    infos = sorted(
        ((n, rename[n] if n in rename else n, callset[n])
         for n in names
         if not n.upper().startswith('FILTER_')
         and n.upper() not in VCF_FIXED_FIELDS),
        key=itemgetter(1),
    )
    info_names = [x[0] for x in infos]
    info_ids = [x[1] for x in infos]
    info_cols = [x[2] for x in infos]

    # setup writer
    writer = csv.writer(vcf_file, delimiter='\t', lineterminator='\n')

    # zip up data as rows; with no filter/INFO columns the corresponding
    # iterator is empty and zip_longest pads it with None
    rows = zip(col_chrom, col_pos, col_id, col_ref, col_alt, col_qual)
    filter_rows = zip(*filter_cols)
    info_rows = zip(*info_cols)

    for row, filter_row, info_row in itertools.zip_longest(
            rows, filter_rows, info_rows):

        # unpack and stringify the main row
        chrom, pos, id_, ref, alt, qual = row
        chrom = _vcf_value_str(chrom)
        pos = _vcf_value_str(pos)
        id_ = _vcf_value_str(id_)
        ref = _vcf_value_str(ref)
        alt = _vcf_value_str(alt, fill=fill.get('ALT', None))
        qual = _vcf_value_str(qual)

        # construct FILTER value
        if filter_row is None:
            flt = '.'
        else:
            failed = [i for i, v in zip(filter_ids, filter_row) if v]
            flt = ';'.join(failed) if failed else 'PASS'

        # construct INFO value
        if info_row is None:
            info = '.'
        else:
            info_vals = [
                _vcf_info_str(n, i, v, fill)
                for n, i, v in zip(info_names, info_ids, info_row)
            ]
            # if every value was suppressed, emit the VCF missing marker
            # rather than an empty (invalid) field
            info = ';'.join(x for x in info_vals if x is not None) or '.'

        # repack
        writer.writerow((chrom, pos, id_, ref, alt, qual, flt, info))
예제 #5
0
def iter_gff3(path, attributes=None, region=None, score_fill=-1,
              phase_fill=-1, attributes_fill='.', tabix='tabix'):
    """Iterate over records in a GFF3 file.

    Parameters
    ----------
    path : string, pathlib.Path or any file-like object
        Path to input file. A file-like object must yield lines as bytes.
    attributes : list of strings, optional
        List of columns to extract from the "attributes" field.
    region : string, optional
        Genome region to extract. If given, file must be position
        sorted, bgzipped and tabix indexed. Tabix must also be installed
        and on the system path.
    score_fill : int, optional
        Value to use where score field has a missing value.
    phase_fill : int, optional
        Value to use where phase field has a missing value.
    attributes_fill : object or list of objects, optional
        Value(s) to use where attribute field(s) have a missing value.
    tabix : string
        Tabix command.

    Returns
    -------
    Iterator
        Yields one tuple per feature line:
        ``(seqid, source, type, start, end, score, strand, phase)``,
        followed by one value per requested attribute. Text fields are
        decoded as ASCII.

    Raises
    ------
    ValueError
        If `attributes_fill` is a list or tuple whose length differs from
        `attributes`. As this is a generator, the error is raised on first
        iteration.

    Notes
    -----
    Lines starting with ``#`` are skipped, as are lines that do not have
    exactly nine tab-separated fields. Iteration stops at the first line
    starting with ``>``, which is assumed to begin an embedded FASTA section.
    The input stream is closed when iteration ends, including when `path`
    was a file-like object supplied by the caller.

    """

    # prepare fill values for attributes
    if attributes is not None:
        attributes = list(attributes)
        if isinstance(attributes_fill, (list, tuple)):
            if len(attributes) != len(attributes_fill):
                raise ValueError('number of fills does not match attributes')
        else:
            attributes_fill = [attributes_fill] * len(attributes)

    # open input stream
    path = resolve_path(path)
    proc = None
    if isinstance(path, str):
        if region is not None:
            # keep the process handle so the child can be reaped afterwards
            proc = subprocess.Popen([tabix, path, region],
                                    stdout=subprocess.PIPE)
            buffer = proc.stdout
        elif path.endswith(('.gz', '.bgz')):
            buffer = gzip.open(path, mode='rb')
        else:
            buffer = open(path, mode='rb')
    else:
        buffer = path

    try:
        for line in buffer:
            # NB: indexing bytes yields an int in Python 3, so the prefix
            # must be tested with startswith rather than ``line[0] == b'>'``
            if line.startswith(b'>'):
                # assume begin embedded FASTA
                return
            if line.startswith(b'#'):
                # skip comment lines
                continue
            vals = line.split(b'\t')
            if len(vals) != 9:
                # not a feature line
                continue

            # unpack for processing
            (fseqid, fsource, ftype, fstart, fend, fscore, fstrand, fphase,
             fattrs) = vals

            # convert numerics, substituting fills for missing values
            fstart = int(fstart)
            fend = int(fend)
            fscore = score_fill if fscore == b'.' else float(fscore)
            fphase = phase_fill if fphase == b'.' else int(fphase)

            # decode text fields
            fseqid = str(fseqid, 'ascii')
            fsource = str(fsource, 'ascii')
            ftype = str(ftype, 'ascii')
            fstrand = str(fstrand, 'ascii')
            fattrs = str(fattrs, 'ascii')

            rec = (fseqid, fsource, ftype, fstart, fend, fscore, fstrand,
                   fphase)
            if attributes is not None:
                dattrs = gff3_parse_attributes(fattrs)
                rec += tuple(
                    dattrs.get(k, f)
                    for k, f in zip(attributes, attributes_fill)
                )
            yield rec

    finally:
        try:
            buffer.close()
        finally:
            if proc is not None:
                # reap the tabix child so it does not linger as a zombie
                proc.wait()