Example #1
def main(fname_in, fname_out, ambiguous_base_coverage_threshold):
    """
    ambiguous_base_coverage_threshold:
        frequency threshold to include a variant in computation of ambiguous code
    """
    vcf_reader = VCF(fname_in)
    vcf_writer = Writer(fname_out, vcf_reader)

    for variant in vcf_reader:
        base_list = [variant.REF] + variant.ALT
        coverage_list = variant.INFO.get("AD")

        total_coverage = sum(coverage_list)
        assert len(base_list) == len(coverage_list)

        # genotype 0 is reference (base is not really needed)
        genotype = [
            i
            for i, (base, coverage) in enumerate(zip(base_list, coverage_list))
            if coverage / total_coverage >= ambiguous_base_coverage_threshold
        ]

        variant.genotypes = [[*genotype, False]]

        vcf_writer.write_record(variant)

    vcf_writer.close()
    vcf_reader.close()
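The thresholding above is just a frequency filter over the per-allele depths; a minimal standalone sketch of the same computation, with made-up numbers, looks like this:

# Minimal standalone sketch of the thresholding used above (hypothetical depths).
base_list = ["A", "G", "T"]     # REF followed by the ALT alleles
coverage_list = [60, 35, 5]     # per-allele depths, e.g. from an AD-style field
threshold = 0.1
total_coverage = sum(coverage_list)
genotype = [i for i, cov in enumerate(coverage_list)
            if cov / total_coverage >= threshold]
print(genotype)  # [0, 1]: REF and the first ALT pass the 10% frequency cutoff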
Example #2
def count_TP_FP_FN(directory_combined_caller, type_combined_caller,
                   directory_individual_caller):
    vcf = VCF(
        f'{directory_combined_caller}/{type_combined_caller}.sorted.vcf.gz')

    TP = 0
    FP = 0
    for variant in vcf:
        if variant.INFO.get('TruScore'):
            TP += 1
        else:
            FP += 1

    vcf.close()

    with open(f'{directory_individual_caller}/summary.txt',
              'r') as fh_individual:
        counts = json.load(fh_individual)
        event_count = counts['TP-base'] + counts['FN']

    with open(f'{directory_combined_caller}/counts.json', 'w') as fh_combined:
        json.dump(
            {
                'TP-base':
                TP,  # evaluate.ipynb assumes the existence of this key 
                'FP': FP,
                'FN': event_count -
                TP,  # evaluate.ipynb assumes the existence of this key 
            },
            fh_combined,
            indent=2)
Example #3
def filter_annotate_calls():
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--alignments', type=str, help='')
    parser.add_argument('--regions', type=str, help='')
    parser.add_argument('--calls', type=str, help='')
    parser.add_argument('--parameters', type=str, help='')
    args = parser.parse_args()

    import json
    parameters = json.load(open('{}.json'.format(args.parameters)))

    vcf = VCF(args.calls + '.vcf.gz')
    vcf.add_info_to_header({
        'ID': 'Confidence',
        'Description':
        'Measure of confidence in call based upon unitig structure',
        'Type': 'String',
        'Number': '1'
    })
    with pysam.AlignmentFile(args.alignments + '.bam',
                             'rb') as unitigs, gzip.open(
                                 args.regions + '.bed.gz', 'rt') as regions:
        print(vcf.raw_header, end='')
        for region in regions:
            chromosome, start, end = region.strip().split('\t')
            region = '{}:{}-{}'.format(chromosome, start, end)
            for variant in vcf(region):
                retain_call, call_confidence = retainCall_reportConfidence(
                    unitigs, variant, region, parameters)
                if retain_call:
                    print(annotate(variant, call_confidence), end='')
    vcf.close()
Example #4
def find_SVs():
  parser = argparse.ArgumentParser(description='')
  parser.add_argument('--calls', type=str, help='')
  parser.add_argument('--svtype', type=str, help='')
  parser.add_argument('--parameters', type=str, help='')
  args = parser.parse_args()

  import json 
  parameters = json.load(open('{}.json'.format(args.parameters)))

  variants = VCF('/dev/stdin') if args.calls == 'stdin' else VCF(args.calls)
  
  svtype = args.svtype
  if svtype not in ['DEL', 'INS']:
    print('svtype', svtype, 'not permitted!', file=sys.stderr) 
    sys.exit(1) 
 
  print(variants.raw_header, end="")  
  for variant in variants: 
    # Decomposition may cause vcf records with genotype "0/0" to appear. 
    # These should be removed because truvari flags these as FPs, artificially inflating the FP rate
    if hom_ref(variant): 
      continue

    minSVSize = int(parameters['filterCalls']['minSVSize'])
    if get_svtype(variant) == svtype and abs(get_sv_length(variant)) >= minSVSize: 
      print(variant, end='')
 
  variants.close()
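The helpers hom_ref, get_svtype, and get_sv_length are defined elsewhere in the originating project. Purely as an illustration, plausible sketches of hom_ref and get_sv_length (not the project's actual implementations) could look like this:

# Plausible sketches of two helpers used above (not the project's actual code).
def hom_ref_sketch(variant):
  # assumes a single-sample VCF; cyvcf2 default encoding uses 0 for HOM_REF
  return variant.gt_types[0] == 0

def get_sv_length_sketch(variant):
  # prefer the SVLEN INFO field, fall back to the REF/ALT length difference
  svlen = variant.INFO.get('SVLEN')
  if svlen is not None:
    return int(svlen)
  return len(variant.ALT[0]) - len(variant.REF)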
Example #5
def write_pass_vcf(annotated_vcf):

    out_vcf = re.sub(r'\.annotated\.vcf\.gz$', '.annotated.pass.vcf',
                     annotated_vcf)
    vcf = VCF(annotated_vcf)
    w = Writer(out_vcf, vcf)

    num_rejected = 0
    num_pass = 0
    for rec in vcf:
        if rec.FILTER is None or rec.FILTER == 'None':
            w.write_record(rec)
            num_pass += 1
        else:
            num_rejected += 1

    vcf.close()
    w.close()

    logger.info('Number of non-PASS/REJECTED variant calls: ' +
                str(num_rejected))
    logger.info('Number of PASSed variant calls: ' + str(num_pass))
    if num_pass == 0:
        logger.warning(
            'There are zero variants with a \'PASS\' filter in the VCF file')
        os.system('bgzip -dc ' + str(annotated_vcf) + ' | egrep \'^#\' > ' +
                  str(out_vcf))
    #else:
    os.system('bgzip -f ' + str(out_vcf))
    os.system('tabix -f -p vcf ' + str(out_vcf) + '.gz')

    return
Example #6
def main():
    args = get_args()
    vcf_in = VCF(args.vcf)
    vcf_in.add_info_to_header({
        'ID': 'SVLEN',
        'Description': 'length of sv',
        'Type': 'Integer',
        'Number': '1'
    })
    vcf_in.add_info_to_header({
        'ID': 'SVTYPE',
        'Description': 'type of sv - just DEL or INS based on SVLEN',
        'Type': 'String',
        'Number': '1'
    })
    vcf_out = Writer(args.output, vcf_in)
    for v in vcf_in:
        if abs(len(v.REF) - max([len(alt) for alt in v.ALT])) > 49:
            v.INFO["SVLEN"] = max([len(alt) for alt in v.ALT]) - len(v.REF)
            if v.INFO["SVLEN"] > 0:
                v.INFO["SVTYPE"] = "INS"
            else:
                v.INFO["SVTYPE"] = "DEL"
            vcf_out.write_record(v)
    vcf_in.close()
    vcf_out.close()
Example #7
def compute_min_SV_size():
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--calls', type=str, help='')
    parser.add_argument('--svtype', type=str, help='')
    args = parser.parse_args()

    variants = VCF('/dev/stdin') if args.calls == 'stdin' else VCF(args.calls)

    svtype = args.svtype
    if svtype not in ['DEL', 'INS']:
        print('svtype', svtype, 'not permitted!', file=sys.stderr)
        sys.exit(1)

    min_size_variant = None
    min_size = 10000

    for variant in variants:
        # Decomposition may cause vcf records with genotype "0/0" to appear.
        # These should be removed
        if hom_ref(variant):
            continue

        if get_svtype(variant) == svtype and abs(
                get_sv_length(variant)) < min_size:
            min_size_variant = variant
            min_size = abs(get_sv_length(variant))

    variants.close()

    print('min-size variant: {}'.format(str(min_size_variant)), end='')
    print('min size: {}'.format(min_size))
    print()
Example #8
def extract_pharmcat_pgx_regions(tabix_executable_path, input_vcf, output_dir,
                                 input_ref_pgx_vcf):
    '''
    extract pgx regions in input_ref_pgx_vcf from input_vcf and save variants to path_output
    '''

    print(
        'Modify chromosome names.\nExtract PGx regions based on the input reference PGx position file.'
    )
    path_output = os.path.join(
        output_dir,
        obtain_vcf_file_prefix(input_vcf) + '.pgx_regions.vcf.gz')

    input_vcf_cyvcf2 = VCF(input_vcf)
    input_ref_pgx_pos_cyvcf2 = VCF(input_ref_pgx_vcf)

    # get pgx regions in each chromosome
    input_ref_pgx_pos_pandas = allel.vcf_to_dataframe(input_ref_pgx_vcf)
    input_ref_pgx_pos_pandas['CHROM'] = input_ref_pgx_pos_pandas[
        'CHROM'].replace({
            'chr': ''
        }, regex=True).astype(str).astype(int)
    ref_pgx_regions = input_ref_pgx_pos_pandas.groupby(
        ['CHROM'])['POS'].agg(get_vcf_pos_min_max).reset_index()
    # fix chr names
    chr_name_match = re.compile("^chr")
    if any(chr_name_match.match(line) for line in input_vcf_cyvcf2.seqnames):
        # chromosomes have leading 'chr' characters in the original VCF
        # pgx regions to be extracted
        ref_pgx_regions = ref_pgx_regions.apply(
            lambda row: ':'.join(row.values.astype(str)),
            axis=1).replace({'^': 'chr'}, regex=True)
    else:
        # chromosomes do not have leading 'chr' characters in the original VCF
        # add chromosome name with leading 'chr' to the VCF header
        for single_chr in input_vcf_cyvcf2.seqnames:
            input_vcf_cyvcf2.add_to_header('##contig=<ID=chr' + single_chr +
                                           '>')
        # pgx regions to be extracted
        ref_pgx_regions = ref_pgx_regions.apply(
            lambda row: ':'.join(row.values.astype(str)), axis=1)

    # write to a VCF output file
    # header
    output_vcf_cyvcf2 = Writer(path_output, input_vcf_cyvcf2, mode="wz")
    # content
    for single_region in ref_pgx_regions:
        for single_variant in input_vcf_cyvcf2(single_region):
            single_variant.CHROM = re.sub(r'^([0-9]+)', r'chr\1',
                                          single_variant.CHROM)
            output_vcf_cyvcf2.write_record(single_variant)

    # close pipe
    input_vcf_cyvcf2.close()
    input_ref_pgx_pos_cyvcf2.close()
    output_vcf_cyvcf2.close()

    tabix_index_vcf(tabix_executable_path, path_output)

    return path_output
Example #9
def main(invcf: str = typer.Argument(..., help="input VCF file"),
         outvcf: str = typer.Argument(..., help="output VCF file"),
         mindepth: int = typer.Option(10, help="minimum read depth"),
         het_altrange: Tuple[float, float] = typer.Option(
             (0.2, 0.8), help="allowed ALT-read frequency range for heterozygous sites"),
         homref_maxaltrate: float = typer.Option(
             0, help="for hom-ref genotypes, the maximum allowed ALT-read fraction"),
         homalt_minaltrate: float = typer.Option(
             1, help="for hom-alt genotypes, the minimum required ALT-read fraction")):
    """
    Mask genotypes that meet any of the following:
    heterozygous sites whose ALT-read frequency is outside the 20%-80% range;
    homozygous sites whose supporting-read fraction is not 100%;
    sites covered by fewer than 10 reads.
    """
    vcf = VCF(invcf)
    w = Writer(outvcf, vcf)
    for v in vcf:
        indices_mask = filter_samples(v, mindepth, het_altrange,
                                      homref_maxaltrate, homalt_minaltrate)
        if indices_mask:
            for index in indices_mask:
                v.genotypes[index] = [-1] * v.ploidy + [False]
            v.genotypes = v.genotypes
        w.write_record(v)
    w.close()
    vcf.close()
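filter_samples is defined elsewhere in the originating project. Purely as an illustration of the masking criteria in the docstring, a rough sketch of such a helper using cyvcf2's per-sample arrays (default gt_types encoding: 0=hom-ref, 1=het, 2=unknown, 3=hom-alt) might look like this:

# Rough, illustrative sketch of a filter_samples-style helper (not the actual code).
def filter_samples_sketch(v, mindepth, het_altrange, homref_maxaltrate, homalt_minaltrate):
    masked = []
    for i, gt_type in enumerate(v.gt_types):
        dp = v.gt_depths[i]
        if dp < mindepth or dp <= 0:
            masked.append(i)          # too few (or missing) reads
            continue
        alt_rate = v.gt_alt_depths[i] / dp
        if gt_type == 1 and not (het_altrange[0] <= alt_rate <= het_altrange[1]):
            masked.append(i)          # het call with ALT fraction outside the allowed range
        elif gt_type == 0 and alt_rate > homref_maxaltrate:
            masked.append(i)          # hom-ref call with too many ALT reads
        elif gt_type == 3 and alt_rate < homalt_minaltrate:
            masked.append(i)          # hom-alt call with too few ALT reads
    return masked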
Example #10
@contextmanager  # from contextlib; implied by the Iterator[VCF] annotation and the docstring below
def open_vcf(path: PathType) -> Iterator[VCF]:
    """A context manager for opening a VCF file."""
    vcf = VCF(path)
    try:
        yield vcf
    finally:
        vcf.close()
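A usage sketch for the context manager above, assuming it is wrapped with contextlib.contextmanager as its annotation and docstring imply; the file name is a placeholder:

# Hypothetical usage of open_vcf(); "sample.vcf.gz" is a placeholder path.
with open_vcf("sample.vcf.gz") as vcf:
    for variant in vcf:
        print(variant.CHROM, variant.POS)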
Example #11
def augment_vcf(vcf_in_file, vcf_out_file, bed_files, decimals):
    """Parses and augments VCF file."""

    # Read in the input VCF file
    vcf_in = VCF(vcf_in_file)

    # Add rows to the header for each new field
    vcf_in = modify_header(vcf_in, bed_files)

    # Set up a write based on the tweaked input VCF file
    vcf_out = Writer(vcf_out_file, vcf_in)

    # Parse BED files
    bed = parse_bed_files(bed_files)

    # Iterate over every variant record
    for variant in vcf_in:
        # Augment the variant by adding new fields (if there are samples)
        num_samples = len(vcf_in.samples)
        if num_samples > 0:
            variant = add_fields_to_variant(variant, bed, decimals)
        # Output the augmented variant
        vcf_out.write_record(variant)

    # Close input and output VCF files
    vcf_in.close()
    vcf_out.close()
Example #12
def output_pharmcat_ready_vcf(input_vcf, output_dir, output_prefix):
    '''
    iteratively write to a PharmCAT-ready VCF for each sample

    "bcftools view <options> <input_vcf>". For bcftools common options, see running_bcftools().
    "-U" exclude sites without a called genotype, i.e., GT = './.'
    '''

    input_vcf_cyvcf2 = VCF(input_vcf)
    input_vcf_sample_list = input_vcf_cyvcf2.samples
    input_vcf_sample_list.remove('PharmCAT')
    input_vcf_cyvcf2.close()

    # output each single sample to a separate VCF
    for single_sample in input_vcf_sample_list:
        print('Generating a PharmCAT-ready VCF for ' + single_sample)
        input_vcf_cyvcf2 = VCF(input_vcf, samples=single_sample)

        # write to a VCF output file
        output_file_name = os.path.join(
            output_dir, output_prefix + '.' + single_sample + '.vcf')
        # header
        output_vcf_cyvcf2 = Writer(output_file_name,
                                   input_vcf_cyvcf2,
                                   mode='w')
        # content
        for single_var in input_vcf_cyvcf2:
            output_vcf_cyvcf2.write_record(single_var)
        output_vcf_cyvcf2.close()
        input_vcf_cyvcf2.close()
Example #13
    def seperate_vcffile(self):
        # start = time.time()
        file_list = self.search_vcf_file(self.from_directory)
        for file in file_list:
            vcf_read = VCF(file)
            samples = vcf_read.samples
            chromosome_num = ""
            for variant in vcf_read:
                chromosome_num = variant.CHROM
                break

            for sample in samples:
                start = time.time()
                # print(sample, "file write start...  ", start)
                try:
                    if not (os.path.isdir(self.target_directory)):
                        os.makedirs(os.path.join(self.target_directory))
                    if not (os.path.isdir(self.target_directory + "/" +
                                          sample)):
                        os.makedirs(
                            os.path.join(self.target_directory + "/" + sample))
                except OSError as e:
                    print("Failed to create directory!!!!!", e)
                    raise

                filepath = os.path.join(self.target_directory + "/" + sample,
                                        chromosome_num + "-" + sample + ".vcf")
                index = 0
                while os.path.exists(filepath):
                    index = index + 1
                    filepath = os.path.join(
                        self.target_directory + "/" + sample,
                        chromosome_num + "-" + sample + str(index) + ".vcf")

                out_read_vcf = VCF(file, samples=[sample])
                write_file = Writer(filepath, out_read_vcf)

                for variant in out_read_vcf:
                    if chromosome_num == "Y":
                        if not variant.genotypes[0][0] == 0:
                            write_file.write_record(variant)

                    elif not (variant.genotypes[0][0] == 0
                              and variant.genotypes[0][1] == 0):
                        write_file.write_record(variant)

                write_file.close()
                out_read_vcf.close()

                with open(filepath, "rb") as f_in:
                    with gzip.open(filepath + ".gz", "wb") as f_out:
                        shutil.copyfileobj(f_in, f_out)

                os.remove(filepath)
                sec = time.time() - start
                print(sample + " write end...",
                      time.strftime("%H:%M:%S", time.gmtime(sec)))
                break
            vcf_read.close()
Example #14
def main(
    vcf_path: str,
    loci_info: TextIO,
    outdir: str,
    verbose: bool,
    chrom: str,
    max_indel_len: int,
    loci_dir: str,
):
    """Apply all ALT variants in a VCF to their corresponding loci reference sequences.

    Creates multiple mutants of the reference loci sequence.
    """
    log_level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(format="%(asctime)s [%(levelname)s]: %(message)s",
                        level=log_level)
    loci_dir = Path(loci_dir)
    outdir = Path(outdir)

    if not outdir.exists():
        outdir.mkdir()

    logging.info("Loading loci info file...")
    ivtree = load_loci_info(loci_info)
    logging.info(f"Loaded info for {len(ivtree)} loci")

    vcf = VCF(vcf_path)

    logging.info("Applying variants to loci...")
    for iv in ivtree:
        start = iv.begin  # 1-based inclusive
        end = iv.end  # 1-based inclusive
        loci_name = iv.data
        loci_path = loci_dir / chrom / f"{loci_name}.fa"
        loci_record = get_record_for_loci(loci_path)

        outpath = outdir / loci_path.name
        with outpath.open("w") as outstream:
            write_record(loci_record, outstream)
            count = 0
            alt_records: List[Record] = []
            for variant in vcf(f"{chrom}:{start}-{end}"):
                count += 1
                alt_records.extend(
                    loci_record.apply_variant(variant,
                                              relative_start=start,
                                              max_indel_len=max_indel_len))

            for rec in alt_records:
                write_record(rec, outstream)

        if count < 1:
            logging.info(f"No records associated with loci {loci_name}")
        else:
            logging.debug(
                f"{count} record(s) associated with loci {loci_name}")

    vcf.close()
    logging.info("All done.")
Example #15
def test_missing_samples():
    samples = ['101976-101976', 'sample_not_in_vcf']
    vcf = VCF(VCF_PATH, gts012=True, samples=samples)
    assert len(vcf.samples) == 1
    vcf.close()
    samples = '101976-101976,sample_not_in_vcf'
    vcf = VCF(VCF_PATH, gts012=True, samples=samples)
    assert len(vcf.samples) == 1
    vcf.close()
Example #16
def main():
    args = get_args()
    vcf_in = VCF(args.vcf)
    vcf_out = Writer(args.output, vcf_in)
    for v in vcf_in:
        if v.INFO["SVLEN"] > 49:
            vcf_out.write_record(v)
    vcf_in.close()
    vcf_out.close()
Example #17
def test_fd():

    fh = open(os.path.join(HERE, "decomposed.vcf"))
    fn = fh.fileno()

    vcf = VCF(fn)
    v = next(vcf)
    assert np.all(v.gt_types == np.array([vcf.HOM_REF, vcf.HOM_REF, vcf.HET, vcf.HET, vcf.UNKNOWN]))
    fh.close()
    vcf.close()
Example #18
def compute_sv_lengths():
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--calls', type=str, help='')
    args = parser.parse_args()

    variants = VCF('/dev/stdin') if args.calls == 'stdin' else VCF(args.calls)

    for variant in variants:
        print(abs(get_sv_length(variant)))

    variants.close()
Example #19
def print_vcf(sample):
  vcf = VCF('/dev/stdin')

  for header_line in vcf.raw_header.split('\n'):
    if header_line.startswith('#CHROM'): continue
    if len(header_line) == 0 : continue
    print(header_line)    
  print('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}'.format(sample))

  for variant in vcf: print(variant, end='')  
  
  vcf.close()
Example #20
def iter_vcf(input_file,
             output_file,
             proc_rec,
             proc_hdr=None,
             postproc_hdr=None,
             **kwargs):
    """
    :param input_file: path to input VCF file
    :param output_file: path to output VCF file (can be .vcf or .vcf.gz, but it will always bgzip/tabix and write with .vcf.gz extention)
    :param proc_rec: a function to process a single cyvcf Record object. Returns either a (new) Record object to write, or None to indicate that the record should be discarded
    :param proc_hdr: a function to process cyvcf object once (i.e. to add values to the header with vcf.add_info_to_header, etc)
    :param postproc_hdr: a function to postprocess finalized header string (vcf.rawheader), e.g. in order to remove values
    :param kwargs: any paramters to pass directly into proc_rec
    """
    from cyvcf2 import VCF
    vcf = VCF(input_file, gts012=True)
    if proc_hdr is not None:
        proc_hdr(vcf)

    # w = None
    if output_file is not None:
        out_ungz, out_gz = get_ungz_gz(output_file)
        # w = Writer(out_ungz, vcf)
        # w.write_header()
        w = open(out_ungz, 'w')
    else:
        # sys.stdout.write(vcf.raw_header)
        w = sys.stdout

    header = vcf.raw_header
    if postproc_hdr is not None:
        header = postproc_hdr(header)
    w.write(header)

    for rec in vcf:
        if proc_rec:
            rec_res = proc_rec(rec, vcf, **kwargs)
            if rec_res is not None:
                # if w is not None:
                #     sys.stderr.write('Writing record', rec_res, '\n')
                #     w.write_record(rec_res)
                # else:
                #     print(rec_res)
                # sys.stderr.write(f'Writing record {rec_res}\n')
                w.write(f'{rec_res}')

    sys.stderr.write(f'Finished writing {output_file}\n')
    vcf.close()
    if output_file is not None:
        w.close()
        run_simple(f'bgzip -f {out_ungz} && tabix -f -p vcf {out_gz}')
        sys.stderr.write(f'Compressed {output_file}\n')
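A usage sketch for iter_vcf; the paths, the DP cutoff, and both callback names are made up for illustration:

# Illustrative only: keep records above a made-up DP cutoff, registering a header line first.
def _add_header(vcf):
    vcf.add_info_to_header({'ID': 'KEPT', 'Description': 'Record kept by the DP filter',
                            'Type': 'Flag', 'Number': '0'})

def _keep_deep(rec, vcf, min_dp=10):
    dp = rec.INFO.get('DP')
    return rec if dp is not None and dp >= min_dp else None

iter_vcf('input.vcf.gz', 'output.vcf.gz', proc_rec=_keep_deep, proc_hdr=_add_header, min_dp=10)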
Example #21
def compute_sv_coordinates():
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--calls', type=str, help='')
    args = parser.parse_args()

    variants = VCF('/dev/stdin') if args.calls == 'stdin' else VCF(args.calls)

    for variant in variants:
        chromosome = variant.CHROM
        start, end = coordinates(variant)
        print('{}\t{}\t{}\n'.format(chromosome, start, end), end='')

    variants.close()
Example #22
def main():
    args = get_args()
    vcf = VCF(args.vcf)
    output = Writer(args.output, vcf)
    incorrect = 0
    for v in vcf:
        if v.REF == v.ALT[0] and v.INFO["SVTYPE"] == "DEL":
            v.ALT = "<DEL>"
            incorrect += 1
        output.write_record(v)
    print("Fixed {} positions".format(incorrect))
    output.close()
    vcf.close()
Example #23
def main():
    args = get_args()
    genome = Fasta(args.genome)
    vcf = VCF(args.vcf)
    output = Writer(args.output, vcf)
    incorrect_reference = 0
    for v in vcf:
        ref_nucl = get_reference_nucleotide(v.CHROM, v.start, genome)
        if v.REF != ref_nucl:
            v.REF = ref_nucl
            incorrect_reference += 1
        output.write_record(v)
    print("Fixed {} positions".format(incorrect_reference))
    output.close()
    vcf.close()
Example #24
def main():
    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("--vcf", help="VCF file", type=str, required=True)
    parser.add_argument("--statsfile", help="File with chrom, start, locus stats", type=str, required=True)
    parser.add_argument("--out", help="Prefix for output files", type=str, required=True)
    parser.add_argument("--min-hwep", help="Minimum HWE p-value", type=float, default=0)
    parser.add_argument("--min-callrate", help="Minimum call rate", type=float, default=0)
    parser.add_argument("--min-het", help="Minimum heterozygosity", type=float, default=0)
    parser.add_argument("--max-hrun-offset", help="For periods 5+, discard if the ref has " \
                            "homopolymer run > period+offset", type=int, default=100000)
    parser.add_argument("--filter-segdup", help="Filter loci overlapping a segdup", action="store_true")
    args = parser.parse_args()

    # Get VCF reader
    reader = VCF(args.vcf)

    # Load locus filters
    sys.stderr.write("Getting filters...\n")
    locstats = pd.read_csv(args.statsfile, sep="\t")
    locstats["FILTER"] = locstats.apply(lambda x: GetFilters(x, args, len(reader.samples)), 1)
    locstats.to_csv(args.out + ".tab", sep="\t", index=False)

    # Get filter dictionary
    sys.stderr.write("Getting filter dictionary...\n")
    filter_dict = dict(zip(list(locstats["start"]), list(locstats["FILTER"])))

    # Set filter field
    sys.stderr.write("Setting filter field in VCFs...\n")
    adict = {
        "HWE": "HWE less than %s"%args.min_hwep,
        "Callrate": "Callrate less than %s"%args.min_callrate,
        "Het": "Het less than %s"%args.min_het,
        "Hrun": "Hrun greater than %s"%args.max_hrun_offset,
        "Segdup": "Locus in a segmental duplication",
        "MissingInfo": "No stats provided for the locus",
        }
    for f in adict:
        reader.add_filter_to_header({"ID": f, "Description": adict[f]})
    writer = Writer("/dev/stdout", reader)
    for record in reader:
        filters = filter_dict.get(record.INFO["START"], "MissingInfo")
        if filters != ".":
            record.FILTER = filters.split(";")
        else: record.FILTER = "PASS"
        writer.write_record(record)
    writer.close()
    reader.close()
Example #25
def parseVCF(invcf,outbasename,reportMultipleSamples,reportNoSamples):
    vcf_data = VCF(invcf,gts012=True)
    samples=vcf_data.samples
#    print(len(samples))
    multiple_samples=defaultdict(list)
    absent_samples=copy.deepcopy(samples)
    with open(outbasename + "_genotypes.txt",'w') as out:
        out.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format("CHROM","POS","REF","ALT","NUM_HET","HET_SAMPLES","NUM_HOM_ALT","HOM_ALT_SAMPLES"))
        for record in vcf_data:
            # gts012=True, so gt_types encodes 0=hom-ref, 1=het, 2=hom-alt, 3=unknown
            hom_var = (record.gt_types == 2).nonzero()[0]
            heterozygous = (record.gt_types == 1).nonzero()[0]
            samples_het, samples_homvar = [], []
            for i in heterozygous:
                samples_het.append(samples[i])
                multiple_samples[samples[i]].append((record.CHROM, str(record.POS)))
                if reportNoSamples and samples[i] in absent_samples:
                    absent_samples.remove(samples[i])
            for i in hom_var:
                samples_homvar.append(samples[i])
                multiple_samples[samples[i]].append((record.CHROM, str(record.POS)))
                if reportNoSamples and samples[i] in absent_samples:
                    absent_samples.remove(samples[i])
            out.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(record.CHROM, record.POS, record.REF, record.ALT[0],
                                                                record.num_het, ';'.join(samples_het), record.num_hom_alt, ';'.join(samples_homvar)))
    vcf_data.close()
    if multiple_samples and reportMultipleSamples:
        outlist = [[k,v]for k, v in multiple_samples.items() if len(v) > 1]
        if not outlist:
            print("{}".format("No samples carry more than one variant represented in the input VCF."))
        else:
            with open(outbasename + "_multipleVariantSamples.txt",'w') as outmult:
                outmult.write("{}\t{}\t{}\n".format("Sample","#variants","variant_id"))
                for sample in outlist:
                    outmult.write("{}\t{}\t{}\n".format(sample[0],len(sample[1]),';'.join('_'.join(i) for i in sample[1])))

    if reportNoSamples:
        if absent_samples:
            with open(outbasename + "_samplesWithNoVariant.txt","w") as out:
                out.write('\n'.join(absent_samples))
        else:
            print("All samples have a variant in the given VCF. Are you sure the input VCF is a subset from the original? If it's not,"
                  "the '-n' flag doesn't make sense.")
Example #26
def process_vcf(vcf):

    vcf_data = VCF(vcf, gts012=True)
    vcf_data.add_info_to_header({
        'ID': 'Gene_SpliceAI',
        'Description': 'Gene for which spliceAI gave the prediction.',
        'Type': 'String',
        'Number': '.'
    })
    vcf_data.add_info_to_header({
        'ID': 'DS_AG',
        'Description': 'SpliceAI score for an acceptor gain.',
        'Type': 'String',
        'Number': '.'
    })
    vcf_data.add_info_to_header({
        'ID': 'DS_AL',
        'Description': 'SpliceAI score for an acceptor loss.',
        'Type': 'String',
        'Number': '.'
    })
    vcf_data.add_info_to_header({
        'ID': 'DS_DG',
        'Description': 'SpliceAI score for a donor gain.',
        'Type': 'String',
        'Number': '.'
    })
    vcf_data.add_info_to_header({
        'ID': 'DS_DL',
        'Description': 'SpliceAI score for a donor loss.',
        'Type': 'String',
        'Number': '.'
    })
    print(vcf_data.raw_header.rstrip())
    for record in vcf_data:
        snvs = record.INFO.get('SpliceAI')
        indels = record.INFO.get('SpliceAI_ind')
        if snvs:
            record = set_new_fields(record, snvs)
        elif indels:
            record = set_new_fields(record, indels)
        print(str(record).rstrip())
    vcf_data.close()
Example #27
def get_variants(vcf_file, padding, sv_padding, vcf_parse=None):
    """

        Given a vcf file, this function parses through the file and yields the variant with all
        relevant information

        Args:
            vcf_file (string): Path to vcf file

        Yields:
            variant (mutacc.builds.build_variant.Variant): Variant object
    """

    vcf_file = parse_path(vcf_file)
    vcf = VCF(str(vcf_file), "r")
    samples = vcf.samples
    parser = None
    if vcf_parse:
        parser = INFOParser(vcf_parse, "read")
    for entry in vcf:
        yield Variant(entry, samples, padding, sv_padding, parser=parser)
    vcf.close()
Example #28
def main(
    in_vcf: str, out_vcf: str, keep_mnps: bool, verbose: bool,
):
    """Extract SNPs from a pandora VCF. It keeps records regardless of the called
    allele, provided they record is a SNP. If the ALT allele is a '.' and the REF is a
    single character, it is considered a SNP.
    It is assumed that each record has no more than 1 alternate allele.
    """
    log_level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(
        format="%(asctime)s [%(levelname)s]: %(message)s", level=log_level
    )
    vcf_reader = VCF(in_vcf)
    vcf_writer = Writer(out_vcf, tmpl=vcf_reader)

    logging.info("Checking for records to keep...")

    for record in vcf_reader:
        keep_this_record = False
        ref_len = len(record.REF)
        ref_is_single_base = ref_len == 1
        empty_alt = not bool(record.ALT)
        if empty_alt and (keep_mnps or ref_is_single_base):
            keep_this_record = True
        elif all(ref_len == len(alt) for alt in record.ALT) and (
            ref_is_single_base or keep_mnps
        ):
            keep_this_record = True

        if keep_this_record:
            vcf_writer.write_record(record)
        else:
            logging.debug(
                f"Discarding record CHROM: {record.CHROM} at POS: {record.POS}"
            )

    vcf_writer.close()
    vcf_reader.close()
    logging.info("Done!")
Example #29
def main(vcf_path: str, loci_info: TextIO, output: str, verbose: bool, chrom: str):
    """Associate information about loci to the relevant VCF records based on position.

    This script will add three new INFO fields to the VCF. The INFO fields are
    loci_name, start, and end. See the VCF header entries for these fields for more
    information."""
    log_level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(
        format="%(asctime)s [%(levelname)s]: %(message)s", level=log_level
    )

    logging.info("Loading loci info file...")
    ivtree = load_loci_info(loci_info)
    vcf = VCF(vcf_path)
    write_new_info_fields(vcf)
    vcf_writer = Writer(output, vcf)

    logging.info("Associating loci info to VCF records...")
    for iv in ivtree:
        start = iv.begin
        end = iv.end
        name = iv.data
        count = 0
        for record in vcf(f"{chrom}:{start}-{end}"):
            count += 1
            record.INFO[LOCI_ID] = name
            record.INFO[START_ID] = start
            record.INFO[END_ID] = end
            vcf_writer.write_record(record)

        if count < 1:
            logging.info(f"No records associated with loci {name}")
        else:
            logging.debug(f"{count} record(s) associated with loci {name}")

    vcf_writer.close()
    vcf.close()
    logging.info("All done.")
Example #30
def add_absent_records(vcf_absent_gnomad, outfile, nind):
    #format_fields = get_format_fields_from_vcf(vcf_absent_gnomad)
    logging.info("Processing variants absent in gnomAD")
    gt, gt_dp, gt_ref_depth, gt_alt_depth, gt_qual = "0/0", 100, 100, 0, 50
    gt_phred_ll_homref, gt_phred_ll_het, gt_phred_ll_homalt = 0, 1500, 1500
    fmt = [
        "{}:{},{}:{}:{}:{},{},{}".format(gt, gt_ref_depth, gt_alt_depth, gt_dp,
                                         gt_qual, gt_phred_ll_homref,
                                         gt_phred_ll_het, gt_phred_ll_homalt)
    ] * nind
    vcf_data = VCF(vcf_absent_gnomad, gts012=True)
    info_fields = [
        field["ID"] for field in vcf_data.header_iter()
        if field["HeaderType"] == "INFO"
    ]

    with open(outfile, 'a') as out:
        #with gzip.open(outfile, 'ab') as out:
        for record in vcf_data:
            str_info = []
            for i in info_fields:
                try:
                    str_info.append(i + "=" + str(record.INFO[i]))
                except KeyError:
                    continue
            write_record = [
                '.' if v is None else v for v in [
                    record.CHROM,
                    str(record.POS), record.ID, record.REF, record.ALT[0],
                    str(record.QUAL), record.FILTER, ';'.join(str_info),
                    "GT:AD:DP:GQ:PL"
                ]
            ]

            out.write('\t'.join(write_record + fmt) + "\n")
            #out.write('\t'.join(write_record + fmt).encode() + "\n".encode())
        vcf_data.close()