Esempio n. 1
0
 def vcf_to_vca(self, vcf_path):
     """Read the records of a VCF file into a list.

     Reading is best-effort: if opening or iterating the file raises, the
     records collected so far (possibly none) are returned and the failure
     is logged instead of being silently discarded.

     :param vcf_path: path handed to ``ht.VCF_Reader``
     :return: list of the records yielded by the reader
     """
     # function-scope import: the file's import block is not visible here
     import logging

     vca = []
     try:
         for vc in ht.VCF_Reader(vcf_path):
             vca.append(vc)
     except Exception as err:
         # keep the partial result, but leave a trace of what went wrong
         logging.getLogger(__name__).warning(
             "Could not read VCF file %r: %s", vcf_path, err)
     return vca
Esempio n. 2
0
def vcf_glob_to_svultd(path_glob,chroms,offset_map,flt=0,flt_exclude=None):
    """Run ``construct_svult`` on every VCF file matching a glob pattern.

    :param path_glob: glob pattern selecting the VCF files
    :param chroms: passed through to ``construct_svult``
    :param offset_map: passed through to ``construct_svult``
    :param flt: filter value given to ``construct_svult`` for each sample
    :param flt_exclude: sample ids (as produced by ``id_trim``) for which
        ``-1`` is passed instead of ``flt``; ``None`` means no such samples
    :return: tuple ``(S, V)`` of dictionaries keyed by sample id, holding the
        first and second value returned by ``construct_svult``
    """
    # None sentinel instead of a shared mutable default list
    excluded = () if flt_exclude is None else flt_exclude
    S, V = {}, {}
    for vcf in glob.glob(path_glob):
        vcr = ht.VCF_Reader(vcf)
        s_id = id_trim(vcf)
        # NOTE(review): -1 presumably disables filtering for excluded
        # samples -- confirm against construct_svult
        sample_flt = -1 if s_id in excluded else flt
        S[s_id], V[s_id] = construct_svult(vcr, chroms, offset_map, s_id,
                                           sample_flt)
    return S, V
Esempio n. 3
0
def parse_vcf(vcf_file,
              snp_data,
              min_reads,
              min_af,
              min_qual,
              annotations,
              seqs,
              options,
              line_num=100000):
    """
    Read a VCF file and hand every SNP that passes the filters to
    :func:`check_snp_in_set`, which updates the per sample SNP counts

    A record is discarded if its sequence has no annotations, its quality is
    below *min_qual*, it is flagged as an indel or its depth is below
    *min_reads*. Each ALT allele of the remaining records is discarded if its
    allele frequency is below *min_af*.

    :param file vcf_file: file handle to a VCF file
    :param dict snp_data: dictionary from :func:`init_count_set` with per
        sample SNPs information
    :param int min_reads: minimum number of reads to accept a SNP
    :param float min_af: minimum allele frequency to accept a SNP
    :param int min_qual: minimum quality (Phred score) to accept a SNP
    :param dict annotations: annotations grouped by their reference sequence
    :param dict seqs: reference sequences
    :param options: object whose `bcftools_vcf` attribute selects how the
        samples carrying a SNP are found: from the genotype of each sample
        if true, from the `set` INFO field otherwise
    :param int line_num: the interval in number of lines at which progress
        will be printed
    """
    reader = HTSeq.VCF_Reader(compressed_handle(vcf_file))
    reader.parse_meta()
    reader.make_info_dict()

    # tallies reported in the progress message
    accepted = 0        # SNPs passed to check_snp_in_set
    rejected_qual = 0   # records below the quality threshold
    rejected_indel = 0  # records flagged as indels
    rejected_depth = 0  # records below the depth threshold
    rejected_freq = 0   # ALT alleles below the frequency threshold

    for record in reader:
        # sequences without annotations are of no interest
        if record.chrom not in annotations:
            continue

        if float(record.qual) < min_qual:
            rejected_qual += 1
            continue

        # after this call record.info is a dictionary
        record.unpack_info(reader.infodict)
        info = record.info

        if info['INDEL']:
            rejected_indel += 1
            continue

        depth = info['DP']
        if not isinstance(depth, int):
            LOG.warning(depth)
        if depth < min_reads:
            rejected_depth += 1
            continue

        # Samtools mpileup -> bcftools call doesn't output the allele freq.
        # When AF is absent it is derived as AC/AN for each ALT nucleotide,
        # see the bcftools manual (roh command):
        # https://samtools.github.io/bcftools/bcftools.html
        try:
            allele_freqs = info['AF']
        except KeyError:
            alt_counts = info['AC']
            if isinstance(alt_counts, list):
                allele_freqs = [count / info['AN'] for count in alt_counts]
            else:
                allele_freqs = alt_counts / info['AN']

        # a lone frequency is wrapped in a list so it can be paired with
        # the ALT alleles below
        if isinstance(allele_freqs, float):
            allele_freqs = [allele_freqs]

        # ALT alleles are numbered from 1 in the genotype field
        for alt_number, (allele_freq, change) in enumerate(
                zip(allele_freqs, record.alt), 1):
            if allele_freq < min_af:
                rejected_freq += 1
                continue

            if options.bcftools_vcf:
                # collect the samples whose genotype includes this ALT
                samples = set()
                for sample_id, sample_info in record.samples.items():
                    # phased ('|') and unphased ('/') genotypes are split
                    # the same way
                    calls = sample_info['GT'].replace('|', '/').split('/')
                    for call in calls:
                        if call != '.' and int(call) == alt_number:
                            samples.add(sample_id)
            else:
                # the sample names are stored as one string joined by '-'
                samples = info['set'].split('-')

            check_snp_in_set(samples, snp_data, record.pos.start, change,
                             annotations[record.chrom], seqs[record.chrom])
            accepted += 1

        if reader.line_no % line_num == 0:
            LOG.info(
                "Line %d, SNPs passed %d; skipped for: qual %d, "
                "depth %d, freq %d, indels %d",
                reader.line_no, accepted, rejected_qual, rejected_depth,
                rejected_freq, rejected_indel)
Esempio n. 4
0
# Index the annotation: each transcript with the intervals of its CDS
# features, and each CDS feature under its own interval.
for feature in gff_file:
    if feature.type == "transcript":
        # feature.iv is a GenomicInterval
        transcript[feature.name] = dict(iv=feature.iv, CDSfeats=[])
    elif feature.type == "CDS":
        transcript[feature.attr["Parent"]]['CDSfeats'].append(feature.iv)
        ## Future worry: do I need CDS.frame in transcript object?
        CDSfeat[feature.iv] = feature

# Header line of the tab-separated report
print("# Chrom\tPos\tPos in CDS\tBase change\tAA change"
      "\tAA pos in transcript\ttranscript ID")

# The VCF file is the third command-line argument
vcfr = HTSeq.VCF_Reader(sys.argv[3])

for vc in vcfr:
    vCDS = CDSfeat[vc.pos]
    # vCDS.iv.start is base before 1st base of CDS
    if not vCDS == None and not vc.pos.start == vCDS.iv.start:
        vTranscript = transcript[vCDS.attr["Parent"]]
        refseq = str(
            HTSeq.Sequence(
                sequences[vCDS.iv.chrom].seq[vCDS.iv.start:vCDS.iv.end]))
        refseqT = ''.join(
            str(HTSeq.Sequence(sequences[CDS.chrom].seq[CDS.start:CDS.end]))
            for CDS in vTranscript['CDSfeats'])
        relpos = vc.pos.start - vCDS.iv.start
        # if variant is 1st base of CDS, relpos=1
        if refseq[relpos - 1] != vc.ref:
Esempio n. 5
0
    def __iter__(self):
        """Iterate over the VCF file and yield a ``SNP`` for each usable call.

        For every record the genotype (``GT``) of ``self.sample`` is read.
        A record is skipped when its genotype contains a missing allele
        (``.``), when any called allele is not exactly one base long, or
        when all called alleles are identical.

        :raises AnnotationParseError: if ``self.sample`` is not among the
            samples of a record, or if the number of alleles in the genotype
            differs from ``self.ploidy``
        """
        self.mc.log_debug('vcf_path: {}'.format(self.vcf_path))
        self.mc.log_debug('sample: {}'.format(self.sample))
        self.mc.log_debug('ploidy: {}'.format(self.ploidy))
        self.mc.log_debug('add_chrom_prefix: {}'.format(self.add_chrom_prefix))

        vcf = HTSeq.VCF_Reader(self.vcf_path)
        vcf.parse_meta()

        self.mc.handle_progress('Reading VCF file...')

        for n, vc in enumerate(vcf):
            # report progress every 500000 records
            if n != 0 and n % 500000 == 0:
                self.mc.handle_progress(
                    '{} lines read from VCF file...'.format(n))

            if self.sample not in vc.samples:
                raise AnnotationParseError(
                    self.vcf_path,
                    'Sample "{}" not in VCF file.'.format(self.sample))

            gt = vc.samples[self.sample]['GT']
            if '.' in gt:
                # at least one allele was not called
                continue
            if '/' in gt:
                # any '/' makes the call unphased; a mixed genotype is
                # normalised to '/' so that one split is enough
                phased = False
                if '|' in gt:
                    gt = gt.replace('|', '/')
                sep = '/'
            else:
                assert '|' in gt
                phased = True
                sep = '|'

            gt = gt.split(sep)
            if len(gt) != self.ploidy:
                raise AnnotationParseError(
                    self.vcf_path,
                    'The ploidy({}) may be inconsistent with the '
                    'sample "{}"({}).'.format(self.ploidy, self.sample,
                                              len(gt)))

            # genotype index 0 is the reference, 1.. are the ALT alleles
            ref_alt = [vc.ref] + vc.alt
            alleles = [ref_alt[int(g)] for g in gt]

            # Only single-base alleles are kept. The test must sit on the
            # record loop: a `continue` inside a per-allele loop would only
            # advance that inner loop and let multi-base alleles through.
            if any(len(allele) != 1 for allele in alleles):
                continue
            # all alleles identical: nothing to distinguish
            if len(set(alleles)) < 2:
                continue

            chrom = vc.pos.chrom
            if self.add_chrom_prefix:
                chrom = 'chr{}'.format(chrom)
            # NOTE(review): presumably converts the 1-based VCF coordinate
            # to 0-based -- confirm against SNP
            pos = vc.pos.pos - 1

            snp = SNP(chrom, pos, alleles, phased)
            assert self.ploidy == snp.ploidy
            yield snp
Esempio n. 6
0
    )
    sys.exit()


# Read the command-line arguments:
#   1: VCF file, 2: first parent, 3: second parent,
#   4: genotype quality threshold, 5: missingness threshold,
#   6: --removeSDCO or --keepSDCO
# Any problem with them prints a message and calls usage().
try:
    bool_keepSDCO = True
    str_vcfName = sys.argv[1]
    parent1 = sys.argv[2]
    parent2 = sys.argv[3]
    GENOTYPEQUALITYTHRESHOLD = float(sys.argv[4])
    MISSINGNESSTHRESHOLD = float(sys.argv[5])
    sys.stderr.write("\tvcfToRqtl\n\tMissingness threshold: %s\n" %
                     MISSINGNESSTHRESHOLD)
    sys.stderr.write("\tGenotype Quality threshold: %s\n" %
                     GENOTYPEQUALITYTHRESHOLD)
    vcfFile = HTSeq.VCF_Reader(str_vcfName)
    if sys.argv[6] == "--removeSDCO":
        sys.stderr.write("\tWill remove short range double crossovers\n")
        bool_keepSDCO = False
    elif sys.argv[6] == "--keepSDCO":
        sys.stderr.write("\tWill keep short range double crossovers\n")
        bool_keepSDCO = True
    else:
        usage()
except IndexError:  # Fewer arguments than expected were given.
    sys.stderr.write(
        "Insufficient arguments received. Please check your input:\n")
    usage()
except ValueError:  # A threshold could not be converted to a number.
    sys.stderr.write(
        "Thresholds must be numeric. Please check your input:\n")
    usage()
except IOError:  # The file could not be opened.
    sys.stderr.write("Cannot open target file. Please check your input:\n")
    usage()
Esempio n. 7
0
        fh.write('\n')

    fh.close()

    return out_geno, mID_lookup


if __name__ == "__main__":

    # Command line: VCF file, output base name, genotype-quality cutoff and
    # fract_max (the last two are converted to float).
    #vcfn,qd,gq,chi2crit = sys.argv[1:]
    vcfn, outbase, gq, fract_max = sys.argv[1:5]
    gq = float(gq)
    fract_max = float(fract_max)
    #outbase = os.path.splitext(vcfn)[0]

    # Parse the header so that the sample ids and INFO types are available.
    vcfr = HTSeq.VCF_Reader(vcfn)
    vcfr.parse_meta()
    vcfr.make_info_dict()

    ped, recombinants, parents, parents_spp = sample_data_from_DB(
        vcfr.sampleids)
    tests = species_tests_by_family(ped, recombinants, parents_spp)

    polarized_loci, polarized_geno = cross_genotypes_from_htseq_vcf(vcfr,
                                                                    tests,
                                                                    gq_cut=gq)
    # For each locus, count the recombinants that have an entry for it.
    # `in` is used because dict.has_key() no longer exists in Python 3.
    loc_counts = {
        loc: sum(loc in polarized_geno[ind] for ind in recombinants)
        for loc in polarized_loci
    }
    # Largest number of recombinants observed at any single locus
    mct = max(loc_counts.values())