コード例 #1
0
def parse_vcf(assembly,
              vcf_infile,
              compressed=True,
              verbose=True,
              by_id=True,
              **tabix_params):
    """Yield parsed documents from a tabix-indexed VCF file.

    Args:
        assembly: forwarded to ``parse_one_rec``; also the key of the
            per-id list in the parsed doc that is split when ``by_id``
            is true.
        vcf_infile: path of the VCF file to read.
        compressed: whether the file is gzip-compressed. A path ending in
            ``.gz`` is always treated as compressed.
        verbose: when true, log the ``rsid`` and ``_id`` of every yielded
            document.
        by_id: when true, a record whose ``_id`` is a list is split into
            one document per id; otherwise one document per record.
        **tabix_params: when given, passed to the tabix ``fetch`` call so
            that only the selected region is iterated.

    Yields:
        dict: one document per id (or per record). Records whose ``_id``
        is falsy are counted as skipped and not yielded.
    """
    t0 = time.time()
    # This used to be a bare ``==`` comparison whose result was discarded.
    # Assign instead so a ``.gz`` path is always opened as compressed, while
    # an explicit ``compressed=True`` is still honoured for other extensions.
    compressed = compressed or vcf_infile.endswith('.gz')
    vcf_r = Reader(filename=vcf_infile, compressed=compressed)
    vcf_r.fetch('1', 1)  # call a dummy fetch to initialize vcf_r._tabix
    if tabix_params:
        vcf_r.reader = vcf_r._tabix.fetch(**tabix_params)
    # cnt_1: records seen, cnt_2: documents yielded, cnt_3: records skipped
    cnt_1, cnt_2, cnt_3 = 0, 0, 0
    for rec in vcf_r:
        doc = parse_one_rec(assembly, rec)
        cnt_1 += 1
        if not doc['_id']:
            cnt_3 += 1
            continue
        if by_id and isinstance(doc['_id'], list):
            # one hgvs id, one doc
            for i, _id in enumerate(doc['_id']):
                _doc = copy.copy(doc)
                _doc['alt'] = doc['alt'][i]
                _doc[assembly] = doc[assembly][i]
                _doc['_id'] = _id
                yield _doc
                cnt_2 += 1
                if verbose:
                    logging.info("%s\t%s", _doc['rsid'], _doc['_id'])
        else:
            # one rsid, one doc (or a record that carries a single id)
            yield doc
            cnt_2 += 1
            if verbose:
                logging.info("%s\t%s", doc['rsid'], doc['_id'])
    logging.info("Done. [{}]".format(timesofar(t0)))
    logging.info("Total rs: {}; total docs: {}; skipped rs: {}".format(
        cnt_1, cnt_2, cnt_3))
コード例 #2
0
    def assertVcfHasVariantWithCall(self, vcf, chrom, pos, sample, call):
        """
        Assert that a call is made for a given sample in a given position. `call` is a dict corresponding to elements
        in the vcf sample field. Example:

        self.assertVcfHasVariantWithCall(my_vcf, 1, 3184885, 'B',
                                         call={'GT': '1/2', 'DP': 10})

        Every key/value pair in `call` must be present in the sample's call
        data; extra fields in the call data are ignored. Raises
        AssertionError when no matching call is found.
        """
        self.assertVcfHasSample(vcf, sample)

        v = Reader(filename=vcf)
        variants = v.fetch(chrom=chrom, start=pos - 1, end=pos)
        variant_found = False
        for variant in variants:
            if variant.CHROM != str(chrom) or variant.POS != pos:
                continue
            for cc in variant.samples:
                if cc.sample != sample:
                    continue
                data = cc.data
                # Namedtuple-style call data exposes its fields through
                # _asdict(); fall back to the instance dict otherwise.
                if hasattr(data, '_asdict'):
                    fields = data._asdict()
                else:
                    fields = vars(data)
                # Compare key by key instead of building sets of items:
                # multi-valued fields are lists, which are unhashable and
                # would make set() raise TypeError.
                if all(key in fields and fields[key] == value
                       for key, value in call.items()):
                    variant_found = True

        if not variant_found:
            raise AssertionError(
                "Call {} not present for sample {} at {}:{} in {}".format(
                    call, sample, chrom, pos, vcf))
コード例 #3
0
def get_haplotype_stats(template_vcf: vcf.Reader, in_vcf: vcf.Reader, out):
    """Accumulate per-contig haplotype statistics and write a total line.

    For every contig listed in ``in_vcf.contigs`` the haplotypes of both
    files are built, compared with ``get_haplotype_stats_chromo`` (which
    also receives ``out``), and merged into a running ``HapStats``. A
    contig whose processing raises is skipped.

    Args:
        template_vcf: reader for the template VCF.
        in_vcf: reader for the VCF being evaluated; its contig list drives
            the loop.
        out: writable text handle that receives the tab-separated
            ``total`` summary line.
    """
    hap_stats = HapStats()
    for contig in in_vcf.contigs:
        try:
            # Result is discarded; presumably this only probes that the
            # template can serve this contig - TODO confirm.
            template_vcf.fetch(contig)
            template_chromo = ChromosomoHaplotype(template_vcf, contig)
            in_chromo = ChromosomoHaplotype(in_vcf, contig)
            chromo_hap_stats = get_haplotype_stats_chromo(
                template_chromo, in_chromo, out, contig)
            hap_stats.insert_hap_stats(chromo_hap_stats)
        except Exception:
            # Best-effort: skip contigs that cannot be processed, but let
            # KeyboardInterrupt / SystemExit propagate (a bare ``except``
            # would swallow them too).
            continue
    out.write("%s\t%d\t%d\t%d\t%d\t%.8f\t%.8f\n" %
              ("total", hap_stats.get_AN50(), hap_stats.get_N50(),
               hap_stats.get_total_phased(), hap_stats.get_total_spanned(),
               hap_stats.get_switch_error(), hap_stats.get_mismatch_error()))
コード例 #4
0
def parse_vcf(vcf_infile, compressed=True, verbose=True, by_id=True, **tabix_params):
    """Yield parsed documents from a tabix-indexed VCF file.

    Args:
        vcf_infile: path of the VCF file to read.
        compressed: whether the file is gzip-compressed. A path ending in
            ``.gz`` is always treated as compressed.
        verbose: when true, print the ``rsid`` and ``_id`` of every yielded
            document.
        by_id: when true, a record whose ``_id`` is a list is split into
            one document per id; otherwise one document per record.
        **tabix_params: when given, passed to the tabix ``fetch`` call so
            that only the selected region is iterated.

    Yields:
        dict: one document per id (or per record). Records whose ``_id``
        is falsy are counted as skipped and not yielded.
    """
    t0 = time.time()
    # This used to be a bare ``==`` comparison whose result was discarded.
    # Assign instead so a ``.gz`` path is always opened as compressed, while
    # an explicit ``compressed=True`` is still honoured for other extensions.
    compressed = compressed or vcf_infile.endswith('.gz')
    vcf_r = Reader(filename=vcf_infile, compressed=compressed)
    vcf_r.fetch('1', 1)   # call a dummy fetch to initialize vcf_r._tabix
    if tabix_params:
        vcf_r.reader = vcf_r._tabix.fetch(**tabix_params)
    # cnt_1: records seen, cnt_2: documents yielded, cnt_3: records skipped
    cnt_1, cnt_2, cnt_3 = 0, 0, 0
    for rec in vcf_r:
        doc = parse_one_rec(rec)
        cnt_1 += 1
        if not doc['_id']:
            cnt_3 += 1
            continue
        if by_id and isinstance(doc['_id'], list):
            # one hgvs id, one doc
            for i, _id in enumerate(doc['_id']):
                _doc = copy.copy(doc)
                _doc['alt'] = doc['alt'][i]
                _doc[POS_KEY] = doc[POS_KEY][i]
                _doc['_id'] = _id
                yield _doc
                cnt_2 += 1
                if verbose:
                    print(_doc['rsid'], '\t', _doc['_id'])
        else:
            # one rsid, one doc (or a record that carries a single id)
            yield doc
            cnt_2 += 1
            if verbose:
                print(doc['rsid'], '\t', doc['_id'])
    print("Done. [{}]".format(timesofar(t0)))
    print("Total rs: {}; total docs: {}; skipped rs: {}".format(cnt_1, cnt_2, cnt_3))
コード例 #5
0
def write_chromosome(in_vcf: vcf.Reader, out_vcf: vcf.Writer,
                     chromo_haplotype: ChromosomoHaplotype, contig: str):
    """Copy every record of ``contig`` from ``in_vcf`` to ``out_vcf``.

    Records whose first sample has ``gt_type == 1`` (het loci) are first
    passed to ``finalize_record`` of the matching entry in
    ``chromo_haplotype.chromo_record`` (looked up by ``POS``); all other
    records are written unchanged.
    """
    vcf_rec: vcf.model._Record
    for vcf_rec in in_vcf.fetch(contig):
        is_het = vcf_rec.samples[0].gt_type == 1
        if is_het:
            hap_record = chromo_haplotype.chromo_record[vcf_rec.POS]
            hap_record.finalize_record(vcf_rec)
        out_vcf.write_record(vcf_rec)
コード例 #6
0
    def assertVcfHasVariantAt(self, vcf, chrom, pos):
        """Assert that the VCF file `vcf` has a record at `chrom`:`pos`.

        Raises AssertionError when no fetched record has a matching CHROM
        (compared as a string) and POS.
        """
        reader = Reader(filename=vcf)
        candidates = reader.fetch(chrom=chrom, start=pos - 1, end=pos)
        hits = [
            record for record in candidates
            if record.CHROM == str(chrom) and record.POS == pos
        ]
        if hits:
            return
        message = "Variant at {}:{} not present in {}".format(chrom, pos, vcf)
        raise AssertionError(message)
コード例 #7
0
    def assertVcfHasVariantWithChromPosRefAlt(self, vcf, chrom, pos, ref, alt):
        """Assert that `vcf` has a record at `chrom`:`pos` with `ref` and `alt`.

        A record matches when its CHROM (compared as a string), POS and REF
        are equal to the arguments and `alt` is one of its ALT entries.
        Raises AssertionError when no fetched record matches.
        """
        reader = Reader(filename=vcf)
        candidates = reader.fetch(chrom=chrom, start=pos - 1, end=pos)

        def is_match(record):
            # Checks run in the same order as a chained ``and``.
            if record.CHROM != str(chrom):
                return False
            if record.POS != pos:
                return False
            if record.REF != ref:
                return False
            return alt in record.ALT

        hits = [record for record in candidates if is_match(record)]
        if hits:
            return
        message = "Variant at {}:{} {}/{} not present in {}".format(
            chrom, pos, ref, alt, vcf)
        raise AssertionError(message)
コード例 #8
0
    def __init__(self, in_vcf: vcf.Reader, chromo: str):
        """Collect the het records of `chromo` and group them by phase set.

        Only records whose first sample has ``gt_type == 1`` are kept. Each
        kept record is stored in ``chromo_record`` under its position; those
        with a non-zero phase set are also added to the matching ``PhaseSet``
        in ``chromo_phase_set`` and indexed in ``chromo_record2phaseset_map``.

        The phase-set label handed to ``Record.copy_from_rec`` is:
        0 for unphased samples, 1 for phased samples without a ``PS`` FORMAT
        field, and otherwise the position of the first record seen with the
        same ``PS`` value.
        """
        self.chromo_record = {}
        self.chromo_phase_set = {}
        self.chromo_record2phaseset_map = {}
        self.graph_struct = graph.Graph()
        vcf_rec: vcf.model._Record
        first_pos_of_ps = {}  # PS value -> POS of the first record carrying it
        het_index = 0

        for vcf_rec in in_vcf.fetch(chromo):
            sample = vcf_rec.samples[0]
            if sample.gt_type != 1:  # not het loci
                continue

            if not sample.phased:
                ps_label = 0
            elif 'PS' in vcf_rec.FORMAT.split(':'):
                ps_label = first_pos_of_ps.setdefault(sample['PS'],
                                                      vcf_rec.POS)
            else:
                ps_label = 1

            record = Record()
            record.copy_from_rec(vcf_rec, ps_label, het_index)
            het_index += 1
            self.chromo_record[record.pos] = record

            if record.ps == 0:
                continue
            ps_key = record.ps
            self.chromo_record2phaseset_map[record.pos] = ps_key
            if ps_key not in self.chromo_phase_set:
                self.chromo_phase_set[ps_key] = PhaseSet(record.ps)
            self.chromo_phase_set[ps_key].insert_record(record)
コード例 #9
0
ファイル: writers.py プロジェクト: JoseBlanca/vcf_crumbs
class IlluminaWriter(object):
    '''It writes the SNPs in Illumina format

    ref_fpath should be the path to a fasta file; the name of each of its
    records is compared with the CHROM of the SNVs being written.
    min_length is the shortest flanking segment accepted on either side of
    the SNV (it defaults to length).
    '''

    # TODO add extra error classes
    # TODO include the error classes inside this class to easy access
    class NotEnoughAdjacentSequenceError(Exception):
        '''Raised when a flanking segment is shorter than min_length'''
        pass

    def __init__(self,  ref_fpath, out_fhand, length=60, vcf_fpath=None,
                 min_length=None):
        '''It inits.

        The vcf will be used to replace in the reference sequence the SNPs
        around the SNP of interest with IUPAC codes

        It writes the header line to out_fhand.
        It raises ValueError if min_length is greater than length.
        '''
        self._sep = u'\t'
        self._len = length
        if min_length is None:
            min_length = length
        if min_length > length:
            msg = 'Minimum length must be smaller than required length'
            raise ValueError(msg)
        self._min_len = min_length

        self._ref_seqs = SeqIO.index(ref_fpath, format='fasta')

        if vcf_fpath:
            self._snvs = Reader(filename=vcf_fpath)
        else:
            self._snvs = None
        self._out_fhand = out_fhand
        out_fhand.write(u'CHROM\tPOS\tID\tseq\n')
        self._prev_chrom = None

    @staticmethod
    def _flank_start(snv_start, length):
        '''It returns the 0-based start of the segment preceding the SNV.

        The start is clamped to 0: a negative slice start would be counted
        from the end of the sequence and would give an empty or wrong
        segment for SNVs closer than length to the sequence start.
        '''
        return max(0, snv_start - length)

    def write(self, snv):
        '''It writes one line with the SNV and its flanking sequences.

        It raises NotEnoughAdjacentSequenceError if any of the flanking
        segments available in the reference is shorter than min_length.
        '''
        chrom_name = snv.CHROM

        # The last reference record used is cached to avoid looking it up
        # again for consecutive SNVs located in the same chromosome.
        prev_chrom = self._prev_chrom
        if prev_chrom is None or prev_chrom.name != chrom_name:
            chrom = self._ref_seqs[chrom_name]
            self._prev_chrom = chrom
        else:
            chrom = prev_chrom

        length = self._len
        min_len = self._min_len

        snv_start = snv.start   # 0 based
        snv_end = snv.end       # 1 based
        desired_start = self._flank_start(snv_start, length)
        end = snv_end + length      # desired segment end
        chrom_seq = chrom.seq
        first_segment = unicode(chrom_seq[desired_start:snv_start])

        if len(first_segment) < min_len:
            # the segment that precedes the SNV is its 5' flank
            msg = "Not enough sequence in 5'. ID: %s, POS: %d, CHROM: %s"
            msg %= (snv.ID, snv.POS, snv.CHROM)
            raise self.NotEnoughAdjacentSequenceError(msg)

        if self._snvs:
            real_start = snv_start - len(first_segment)
            close_snvs = self._snvs.fetch(chrom.name, start=real_start,
                                          end=snv_start)
            first_segment = _replace_snvs_with_iupac(first_segment, close_snvs,
                                                     seq_offset=real_start)

        snv_segment = _build_snv_section(snv)
        # slicing past the end of the sequence just gives a shorter segment
        second_segment = unicode(chrom_seq[snv_end:end])
        if len(second_segment) < min_len:
            # the segment that follows the SNV is its 3' flank
            msg = "Not enough sequence in 3'. ID: %s, POS: %d, CHROM: %s"
            msg %= (snv.ID, snv.POS, snv.CHROM)
            raise self.NotEnoughAdjacentSequenceError(msg)

        if self._snvs:
            real_end = snv_end + len(second_segment)
            close_snvs = self._snvs.fetch(chrom.name, start=snv_end,
                                          end=real_end)
            second_segment = _replace_snvs_with_iupac(second_segment,
                                                      close_snvs,
                                                      seq_offset=snv_end)

        out_fhand = self._out_fhand
        sep = self._sep
        out_fhand.write(unicode(snv.CHROM))
        out_fhand.write(sep)
        out_fhand.write(unicode(snv.POS))
        out_fhand.write(sep)
        snp_id = snv.ID
        if snp_id is None:
            snp_id = u'.'
        out_fhand.write(snp_id)
        out_fhand.write(sep)
        out_fhand.write(first_segment)
        out_fhand.write(snv_segment)
        out_fhand.write(second_segment)
        out_fhand.write(u'\n')

    def flush(self):
        '''It flushes the output file handle'''
        self._out_fhand.flush()

    def close(self):
        '''It closes the output file handle'''
        self._out_fhand.close()