def _pileup_bases(bamfile, chrom, pos):
    """Return the sorted read bases observed at 1-based position ``pos``."""
    bases = []
    # pysam regions are 0-based and half-open, so the single column for the
    # 1-based VCF position ``pos`` is [pos - 1, pos).
    for pileupcolumn in bamfile.pileup(chrom, pos - 1, pos):
        # pileup() also yields neighbouring columns covered by the fetched
        # reads; keep only the column of interest.
        if pileupcolumn.pos + 1 != pos:
            continue
        for pileupread in pileupcolumn.pileups:
            # Deletions and reference skips carry no base at this column.
            if pileupread.is_del or pileupread.is_refskip:
                continue
            bases.append(
                pileupread.alignment.query_sequence[pileupread.query_position])
    bases.sort()
    return bases


def main(variant_file, bam_file, samplename, loglevel, filter_hom):
    """Add pileup-derived allele counts for a new sample to a VCF.

    Every simple single-base substitution in ``variant_file`` gets an extra
    call for ``samplename`` whose DP/RO/AO fields are counted from the reads
    in ``bam_file``; the record is then written to standard output. Records
    that are not simple substitutions are not written.

    Args:
        variant_file: Path of the input VCF file.
        bam_file: Path of the BAM file to pile up.
        samplename: Name of the sample column appended to the output.
        loglevel: Forwarded to ``setup_logging``.
        filter_hom: When true, skip records in which any existing call has
            genotype '0/0' or '1/1'.

    Returns:
        0 once all records have been processed.
    """
    setup_logging(loglevel)
    with open(variant_file, 'r') as vcf_in, \
            open('/dev/stdout', 'w') as vcf_out:
        vcf_reader = vcf.Reader(vcf_in)
        bamfile = pysam.AlignmentFile(bam_file, "rb")
        try:
            vcf_reader.samples.append(samplename)
            vcf_writer = vcf.Writer(vcf_out, vcf_reader)
            for variant in vcf_reader:
                calls = [call.data.GT for call in variant.samples]
                if filter_hom and ('0/0' in calls or '1/1' in calls):
                    continue

                # only work on simple substitutions
                is_simple_substitution = (
                    len(variant.REF) == 1 and len(variant.ALT) == 1
                    and len(variant.ALT[0]) == 1)
                if not is_simple_substitution:
                    continue

                bases = _pileup_bases(bamfile, variant.CHROM, variant.POS)
                logging.debug("pileup at {}:{} {}/{} = {}".format(
                    variant.CHROM, variant.POS, variant.REF, variant.ALT,
                    "".join(bases)))

                # Lightweight genotype container built from this record's
                # FORMAT keys; fields that are not supplied default to '.'.
                Genotype = namedtuple('Genotype', variant.FORMAT.split(":"))
                Genotype.__new__.__defaults__ = ('.', ) * len(
                    Genotype._fields)

                newgt = Genotype(AO=bases.count(variant.ALT[0]),
                                 RO=bases.count(variant.REF),
                                 DP=len(bases),
                                 GT="./.")
                newcall = Call(site=variant, sample=samplename, data=newgt)
                variant.samples.append(newcall)
                vcf_writer.write_record(variant)
        finally:
            bamfile.close()
    return 0
    def __init__(self, out_file, file_type='VCF4.1', template_file=None, template_reader=None, new_source=None,
        new_info_fields=None, new_format_fields=None, new_filters=None):
        """Create a VCF writer, optionally extending the template header.

        Args:
            out_file: Stream handed to ``vcf.Writer`` for output.
            file_type: Output format; only 'VCF4.1' is supported.
            template_file: Passed to ``vcf.Reader`` to obtain a template when
                ``template_reader`` is not given.
            template_reader: Existing reader used as the header template. It
                is modified in place when new header entries are supplied.
            new_source: Value appended to the header's "source" metadata.
            new_info_fields: Iterable of 6-tuples
                ``(id, len, type, desc, _, _)``; the last two items are
                ignored. Defaults to no new fields.
            new_format_fields: Iterable of ``(id, len, type, desc)`` tuples.
            new_filters: Iterable of ``(id, desc)`` tuples.

        Raises:
            NotSupportedException: If ``file_type`` is not 'VCF4.1'.
        """
        self.file_type = file_type
        if self.file_type != 'VCF4.1':
            raise NotSupportedException('File type unsupported: ' + file_type)

        # None instead of [] as default avoids sharing one list object
        # between every call of this constructor.
        new_info_fields = () if new_info_fields is None else new_info_fields
        new_format_fields = () if new_format_fields is None else new_format_fields
        new_filters = () if new_filters is None else new_filters

        if template_reader is None:
            if template_file is not None:
                template_reader = vcf.Reader(template_file)
            else:
                # No template at all: build a bare stand-in exposing the
                # header containers as attributes.
                template_reader = namedtuple('template', ['metadata', 'infos', 'formats', 'filters', 'alts', 'contigs'])
                template_reader.metadata = OrderedDict()
                template_reader.infos = OrderedDict()
                template_reader.formats = OrderedDict()
                template_reader.filters = OrderedDict()
                template_reader.alts = OrderedDict()
                template_reader.contigs = OrderedDict()

        # Add new source to metadata of header
        if new_source is not None:
            sources = template_reader.metadata.setdefault("source", [])
            sources.append(new_source)

        # Add new info fields to header
        for info_id, info_len, info_type, info_desc, _, _ in new_info_fields:
            info_field = vcf.parser._Info(info_id, info_len, info_type, info_desc, None, None)
            template_reader.infos[info_id] = info_field

        # Add new format fields to header
        for format_id, format_len, format_type, format_desc in new_format_fields:
            format_field = vcf.parser._Format(format_id, format_len, format_type, format_desc)
            template_reader.formats[format_id] = format_field

        # Add new filters to header
        for filter_id, filter_desc in new_filters:
            filter_field = vcf.parser._Filter(filter_id, filter_desc)
            template_reader.filters[filter_id] = filter_field

        self.writer = vcf.Writer(out_file, template_reader, lineterminator='\n')
Example #3
0
def writer():
    """Interactively rewrite records of a reference VCF into a toy VCF.

    For each record of the reference file the user is prompted for new POS,
    REF and ALT values; the edited record is written to the toy file. Entering
    anything other than an empty line at the "continue" prompt stops the
    loop. The written file is then read back and its records are printed.
    """
    # file path to the reference file
    data_file_path = "/home/ubuntu/GSoC-Strain_Diffrential/data/original.vcf.gz"
    # A space needs no escaping inside a Python string; the former "\ " put a
    # literal backslash into the directory name.
    write_file_path = "/home/ubuntu/GSoC-Strain_Diffrential/data/Toy Examples/test_toy3.vcf"

    with open(data_file_path) as reference_fh, \
            open(write_file_path, 'w') as output_fh:
        vcf_reader = vcf.Reader(reference_fh)
        vcf_writer = vcf.Writer(output_fh, vcf_reader)
        count = 1
        for record in vcf_reader:
            # Single-argument print() behaves the same on Python 2 and 3.
            print("\nEnter the values for input" + str(count) + ":\n")
            print(record)
            print(record.ALT)
            print(record.is_snp)
            print(record.is_indel)
            print(record.is_deletion)
            record.POS = raw_input("Enter POS:")
            record.REF = raw_input("Enter REF for POS " + record.POS + " :")
            # ALT is a list of alleles; a bare string would be written out
            # one character per allele.
            record.ALT = raw_input(
                "Enter ALT for POS " + record.POS + " :").split(",")
            vcf_writer.write_record(record)
            if raw_input("Press enter if you want to continue else press any other key:") != "":
                break

            count = count + 1

        # writing the value to permanent storage.
        vcf_writer.flush()

    # Read the freshly written file back as a sanity check.
    with open(write_file_path) as written_fh:
        for record in vcf.Reader(written_fh):
            print(record)
Example #4
0
def splitVcfByChromosome(source, output_folder, create_subfolders=False):
    """Split a VCF file into one output file per chromosome.

    All records are grouped in memory by their CHROM value (contigs declared
    in the header start out with an empty group). A file named
    ``<source stem>.<chromosome>.vcf`` is then written for every chromosome
    whose name matches the pattern ``chr[0-9MT]{1,3}$``.

    Parameters
    ----------
        source: string [PATH]
        output_folder: string [PATH]
        create_subfolders: bool; default False
            If 'True', each chromosome will be saved to a separate folder.
    """
    stem = os.path.splitext(os.path.basename(source))[0]
    chrom_pattern = re.compile("chr[0-9MT]{1,3}$")

    with open(source, 'r') as input_vcf_file:
        reader = vcf.Reader(input_vcf_file)

        # Group the records by chromosome, seeding with the header contigs.
        grouped = {contig: [] for contig in reader.contigs}
        for entry in reader:
            grouped.setdefault(entry.CHROM, []).append(entry)

        for chromosome, entries in grouped.items():
            match = chrom_pattern.search(chromosome)
            if not match:
                continue
            print(chromosome, match)
            output_basename = "{}.{}.vcf".format(stem, chromosome)
            print(output_basename)

            chromosome_folder = output_folder
            if create_subfolders:
                chromosome_folder = os.path.join(output_folder, chromosome)

            filetools.checkDir(chromosome_folder, True)
            target = os.path.join(chromosome_folder, output_basename)
            with open(target, 'w') as output_vcf:
                chrom_writer = vcf.Writer(output_vcf, reader)
                for entry in entries:
                    chrom_writer.write_record(entry)
Example #5
0
def repeat_merging(f_in, f_out):
    """Collapse repeated variant rows of a VCF and write the merged result.

    Records are keyed by "chr<CHROM>:<POS>:<REF>><first ALT>". The first
    record seen for a key is kept; INFO values of later records with the same
    key are merged into it. Both file objects are closed before returning.

    Args:
        f_in: Open file object of the input VCF.
        f_out: Open file object the merged VCF is written to.
    """
    try:
        vcf_reader = vcf.Reader(f_in, strict_whitespace=True)
        variant_dict = {}
        num_repeats = 0
        for record in vcf_reader:
            genome_coor = "chr{0}:{1}:{2}>{3}".format(record.CHROM,
                                                      str(record.POS),
                                                      record.REF,
                                                      record.ALT[0])
            # Test membership on the dict itself: `.keys()` builds a list on
            # Python 2, which made every lookup linear.
            if genome_coor not in variant_dict:
                variant_dict[genome_coor] = deepcopy(record)
                continue

            num_repeats += 1
            merged_info = variant_dict[genome_coor].INFO
            for key, value in record.INFO.items():
                if key not in merged_info:
                    merged_info[key] = deepcopy(value)
                    continue

                new_value = deepcopy(value)
                old_value = merged_info[key]
                if not isinstance(new_value, list):
                    new_value = [new_value]
                if not isinstance(old_value, list):
                    old_value = [old_value]
                if new_value == old_value:
                    continue

                if key == "individuals":
                    # LOVD individuals field values are all meaningful even
                    # if repeated, e.g. if two LOVD submissions for the same
                    # variant each have one individual associated with them,
                    # "1,1" is a more sensible value for the variant than "1"
                    # since 2 individuals are associated.
                    merged_value = new_value + old_value
                else:
                    merged_value = list(set(new_value + old_value))
                merged_info[key] = merged_value

        # Single-string print() yields the same text on Python 2 and 3.
        print("number of repeat records:  {0} \n".format(num_repeats))
        vcf_writer = vcf.Writer(f_out, vcf_reader)
        for record in variant_dict.values():
            vcf_writer.write_record(record)
    finally:
        f_in.close()
        f_out.close()
Example #6
0
 def test_null_mono(self):
     """Round-trip 'null_genotype_mono.vcf' through the writer and re-parse it.

     Null qualities were written as blank, causing the subsequent parse to
     fail; the re-parsed first record must still carry samples.
     """
     fixture_path = os.path.join(os.path.dirname(__file__),
                                 'null_genotype_mono.vcf')
     print(os.path.abspath(fixture_path))

     source = vcf.Reader(fh('null_genotype_mono.vcf'))
     assert source.samples

     buf = StringIO()
     sink = vcf.Writer(buf, source)
     for entry in source:
         sink.write_record(entry)

     buf.seek(0)
     print(buf.getvalue())

     reparsed = vcf.Reader(buf)
     first = next(reparsed)
     assert first.samples
Example #7
0
 def __init__(self, outstream, template, lineterminator='\n'):
     """Build a VCF writer whose header is extended with extra fields.

     A fresh ``vcf.Reader`` is opened on ``template.filename`` and used as the
     header template. INFO definitions come from ``PEDINFO`` when
     ``template.family`` is truthy and from ``INFO`` otherwise; all ``FORMAT``
     definitions are added, and a 'GBStools' metadata entry records the input
     file and the dispersion slope/intercept from ``template.disp``.
     """
     filename = template.filename
     disp = template.disp
     self.template = vcf.Reader(filename=filename)

     info_fields = PEDINFO if template.family else INFO
     for field in info_fields:
         self.template.infos[field.id] = field
     for field in FORMAT:
         self.template.formats[field.id] = field

     analysis_parts = [
         "input_file=%s " % filename,
         "disp_slope=%f " % disp['slope'],
         "disp_intercept=%f" % disp['intercept'],
     ]
     self.template.metadata['GBStools'] = [''.join(analysis_parts)]
     self.writer = vcf.Writer(outstream, self.template, lineterminator)
Example #8
0
    def testWrite(self):
        """Check that PEDIGREE and SAMPLE meta lines survive a write.

        Records of 'example-4.1-bnd.vcf' are written to an in-memory buffer;
        the PEDIGREE line must be reproduced exactly and SAMPLE lines must be
        written in ``##SAMPLE=<...>`` form rather than as a dictionary repr.
        """
        reader = vcf.Reader(fh('example-4.1-bnd.vcf'))
        out = StringIO()
        writer = vcf.Writer(out, reader)

        records = list(reader)

        for record in records:
            writer.write_record(record)
        out.seek(0)
        out_str = out.getvalue()
        for line in out_str.split("\n"):
            if line.startswith("##PEDIGREE"):
                # assertEquals is a deprecated alias that was removed in
                # Python 3.12; assertEqual is the supported spelling.
                self.assertEqual(line, '##PEDIGREE=<Derived="Tumor",Original="Germline">')
            if line.startswith("##SAMPLE"):
                assert line.startswith('##SAMPLE=<'), "Found dictionary in meta line: {0}".format(line)
Example #9
0
def GetWriter(reader, filters):
    """
    Get VCF Writer with the appropriate metadata

    The header lines of ``reader`` are copied to a temporary file together
    with one ##FILTER line per entry of ``filters`` (ID is the key followed by
    its "Value"; the text is its "Description") and an FT FORMAT line. That
    file is parsed back as the template of a writer bound to ``sys.stdout``.

    Args:
        reader: ``vcf.Reader`` whose header and sample names are reused.
        filters: Mapping of filter name to a dict with "Value" and
            "Description" keys.

    Returns:
        A ``vcf.Writer`` writing to standard output.
    """
    tmpdir = tempfile.mkdtemp(prefix="lobstr.")
    tmpfile = os.path.join(tmpdir, "header.vcf")
    try:
        with open(tmpfile, "w") as f:
            for line in reader._header_lines:
                f.write(line.strip() + "\n")
            for ft, spec in filters.items():
                name = ft + str(spec["Value"])
                desc = spec["Description"]
                f.write("##FILTER=<ID=%s,Description=\"%s\">\n" % (name, desc))
            f.write("##FORMAT=<ID=FT,Number=1,Type=String,Description=\"Call-level filter.\">\n")
            f.write("#" + "\t".join(reader._column_headers + reader.samples) + "\n")
        # Text mode: the header was written as text and is parsed as text.
        with open(tmpfile, "r") as header_fh:
            template = vcf.Reader(header_fh)
    finally:
        # The scratch file only exists to be parsed once; remove it and its
        # directory instead of leaving them behind on every call.
        try:
            if os.path.exists(tmpfile):
                os.remove(tmpfile)
            os.rmdir(tmpdir)
        except OSError:
            pass  # best-effort cleanup must not mask the real outcome
    return vcf.Writer(sys.stdout, template)
Example #10
0
def copyVcf(source, destination):
    """Copy a VCF file, dropping records whose REF or first ALT contains '/'.

    When the source path contains 'Varscan', the DP4 FORMAT definition is
    rewritten to Number=4 / Type=Integer before the header is written.
    Records the writer rejects with ``ValueError`` are printed instead of
    written.

    Returns:
        The ``destination`` path.
    """
    with open(source, 'r') as input_file:
        reader = vcf.Reader(input_file)
        if 'Varscan' in source:
            dp4 = reader.formats['DP4']._replace(num=4)
            reader.formats['DP4'] = dp4._replace(type='Integer')

        with open(destination, 'w') as output_file:
            writer = vcf.Writer(output_file, reader)
            for record in reader:
                has_slash = '/' in str(record.ALT[0]) or '/' in record.REF
                if has_slash:
                    continue
                try:
                    writer.write_record(record)
                except ValueError:
                    print(record)
    return destination
def main():
    """Copy an INFO annotation from a validation VCF onto a merged VCF.

    ``get_info_field`` supplies the header definition and a per-variant
    mapping of items for the chosen INFO field. For every input record whose
    variant appears in that mapping, the items not listed in ``--skipif`` are
    joined with commas and stored under the field name. All records are
    written to the output.

    Returns:
        0 on completion.
    """
    parser = argparse.ArgumentParser(
        description='Fix dbsnp VP calls and add OXOG filter')
    parser.add_argument('validationvcf', help="Validation vcf file")
    parser.add_argument('-i',
                        '--input',
                        type=argparse.FileType('r'),
                        default=sys.stdin,
                        help="Merged and annotated VCF file (default: stdin)")
    parser.add_argument('-o',
                        '--output',
                        type=argparse.FileType('w'),
                        default=sys.stdout,
                        help="Specify output file (default:stdout)")
    parser.add_argument(
        '-f',
        '--fieldname',
        default="Validation_status",
        help="Specify INFO field name to use (default: Validation_status)")
    parser.add_argument(
        '-s',
        '--skipif',
        default="LOWDEPTH",
        help=
        "Comma-delimited list of items which won't get carried over (default: LOWDEPTH)"
    )
    args = parser.parse_args()

    skips = args.skipif.split(',')
    header, infos = get_info_field(args.validationvcf, args.fieldname)
    reader = vcf.Reader(args.input)
    # Only declare the INFO field when there is something to annotate.
    if infos:
        reader.infos[args.fieldname] = header
    writer = vcf.Writer(args.output, reader)

    for record in reader:
        assert len(record.ALT) == 1, (
            "expected exactly one ALT allele at {0}:{1}".format(
                record.CHROM, record.POS))
        variant = variant_tuple(record, record.ALT[0])
        if variant in infos:
            apply_items = [item for item in infos[variant]
                           if item not in skips]
            if apply_items:
                record.INFO[args.fieldname] = ','.join(apply_items)

        writer.write_record(record)
    return 0
Example #12
0
def filter_somatic_variants(in_file, out_file):
    """Write the unfiltered SOMATIC records of ``in_file`` to ``out_file``.

    A record is kept when its FILTER is ``None`` or empty and its INFO
    contains the 'SOMATIC' key.

    Args:
        in_file: Path of the input VCF.
        out_file: Path of the VCF to create.
    """
    reader = vcf.Reader(filename=in_file)

    # Text mode: the writer emits str, which a binary-mode handle rejects on
    # Python 3.
    with open(out_file, 'w') as out_fh:
        writer = vcf.Writer(out_fh, reader)

        for record in reader:
            pass_filter = not record.FILTER
            if pass_filter and ('SOMATIC' in record.INFO):
                writer.write_record(record)

        writer.close()
Example #13
0
def the_thread(block, output_dir):
    """Copy the records with QUAL above 20 from one VCF into a new file.

    Args:
        block: ``(index, input_filename)`` pair describing the work item.
        output_dir: Directory passed to ``Blister.Output`` to derive the
            output file name.
    """
    index, input_filename = block
    output_filename = Blister.Output(input_filename,
                                     output_dir,
                                     "QUAL20",
                                     "vcf",
                                     rewrite=True,
                                     index=index)
    with Blister.Timestamp("Filtering",
                           filename_1=input_filename,
                           filename_2=output_filename,
                           index=index):
        with open(input_filename, 'r') as in_fh, \
                open(output_filename, 'w') as out_fh:
            vcf_reader = vcf.Reader(in_fh)
            vcf_writer = vcf.Writer(out_fh, template=vcf_reader)
            for record in vcf_reader:
                # A missing QUAL is None, which cannot be ordered against an
                # int on Python 3; such records are not written.
                if record.QUAL is not None and record.QUAL > 20:
                    vcf_writer.write_record(record)
Example #14
0
def generate_s(metaFile, tree, l, sv_cn_idx_dict, r, seg_cn_idx_dict,
               seg_bgn_idx_dict, seg_end_idx_dict, F, U, C, c_p, c_m, a, h,
               mate_dict, outputFolder):
    """Write one VCF of CNV and SV records per row of ``U``.

    The header of ``metaFile`` (with 'filedate' set to today's date) is used
    as the template for every output file, which is named
    ``<outputFolder>/sample<k>.vcf`` with k starting at 1. For sample row i,
    CNV records take their copy numbers from ``dot(U, c_p)`` and
    ``dot(U, c_m)``; SV records take ``F[i]`` plus the rounded values of
    ``dot(U, a)`` and ``dot(U, h)``.

    ``tree``, ``seg_bgn_idx_dict``, ``seg_end_idx_dict`` and ``C`` are
    accepted but not used here.
    """
    # Records of the template are never iterated in this function, so the
    # file is closed as soon as the reader has been built from its header.
    with open(metaFile, 'r') as meta_fh:
        vcf_reader = vcf.Reader(meta_fh)
    vcf_reader.metadata['filedate'][0] = datetime.datetime.now().date(
    ).strftime('%Y%m%d')  # set date to current date
    f_p = np.dot(U, c_p)
    f_m = np.dot(U, c_m)
    mixed_a = np.dot(U, a)  # m * l
    mixed_h = np.dot(U, h)  # m * l

    alt_type, gt_cnv = 'CNV', '1|1'  # constants for all cnv records
    alt_ori, alt_cS, alt_wMA, gt_sv = True, str(
    ), True, '1|0'  # constants for all sv records

    for i in range(len(U)):
        sample_idx = i + 1
        temp_file = outputFolder + '/sample' + str(sample_idx) + '.vcf'
        # The context manager flushes and closes each sample file.
        with open(temp_file, 'w') as sample_fh:
            temp_writer = vcf.Writer(sample_fh, vcf_reader)

            for chrom in sorted(seg_cn_idx_dict.keys()):
                for (key, val) in sorted(seg_cn_idx_dict[chrom].items(),
                                         key=lambda x: x[1]):
                    pos = key[0]
                    rec_id = get_cnv_rec_id(val, r)
                    info_end = key[1]
                    cn = [f_p[i][val], f_m[i][val]]
                    temp_writer.write_record(
                        generate_cnv(chrom, pos, rec_id, alt_type, info_end,
                                     gt_cnv, cn))

            for chrom in sorted(sv_cn_idx_dict.keys()):
                for (key, val) in sorted(sv_cn_idx_dict[chrom].items(),
                                         key=lambda x: x[1]):
                    pos, isLeft = key[0], key[1]
                    rec_id = get_sv_rec_id(val, l)
                    (mate_chrom, mate_pos,
                     mate_isLeft) = mate_dict[(chrom, pos, isLeft)]
                    mate_id = sv_cn_idx_dict[mate_chrom][(mate_pos,
                                                          mate_isLeft)]
                    alt_chr, alt_pos = mate_chrom, mate_pos
                    cnadj = F[i][val]
                    bdp = int(round(mixed_a[i][val]))
                    dp = int(round(mixed_h[i][val]))
                    info_mateid = get_sv_rec_id(mate_id, l)
                    # Kept as an equality test so non-bool flags behave
                    # exactly as before.
                    alt_rO = False if mate_isLeft == True else True
                    temp_writer.write_record(
                        generate_sv(chrom, pos, rec_id, alt_chr, alt_pos,
                                    alt_ori, alt_rO, alt_cS, alt_wMA,
                                    info_mateid, gt_sv, cnadj, bdp, dp))
Example #15
0
def filterHomo(input_vcffil, output_vcffile, P1, P2):
    '''Write the single-ALT records accepted by ``judgeGT``.

    For each record with exactly one ALT allele, the GT values of the samples
    named in P1 and in P2 are collected and handed to ``judgeGT``; the record
    is written when it returns a true value.

    e.g. P1=[L14-1, L14-2, L14-3]
    P2 = [L17-1, L17-2, L17-3]'''
    with open(input_vcffil, 'r') as inputvcf, \
            open(output_vcffile, 'w') as outputvcf:
        invcf = vcf.Reader(inputvcf)
        outvcf = vcf.Writer(outputvcf, invcf)
        for record in invcf:
            if len(record.ALT) != 1:
                continue
            # Collect each group on its own: zipping P1 with P2 silently
            # dropped samples whenever the two groups differed in size.
            P1GT = [record.genotype(name)['GT'] for name in P1]
            P2GT = [record.genotype(name)['GT'] for name in P2]
            if judgeGT(P1GT, P2GT):
                outvcf.write_record(record)
Example #16
0
    def write_vcf(self, vcf_path):
        """
        Write VCF file.

        One record is built from every entry of ``self.sequences`` and written
        using ``self.reader`` as the header template.

        *Keyword arguments:*

        - vcf_path -- VCF file

        Raises ``Exception`` when no reader is available.
        """

        if not self.reader:
            raise Exception("No data available")

        # The context manager guarantees the output is flushed and closed.
        with open(vcf_path, 'w') as out_fh:
            writer = vcf.Writer(out_fh, self.reader)
            for v in self.sequences:
                record = vcf.model._Record(v.chrom, v.pos, v.id, v.ref, v.alt, v.qual, [], v.info, v.format, [], v.samples)
                writer.write_record(record)
Example #17
0
    def test_writer(self):
        """The written header must omit FORMAT when the template has none.

        Also checks that no written line after the '##' meta lines ends with
        a stray tab character.
        """
        source = vcf.Reader(fh('1kg.sites.vcf', 'r'))
        buf = StringIO()
        sink = vcf.Writer(buf, source, lineterminator='\n')

        for entry in source:
            sink.write_record(entry)

        buf.seek(0)
        written_lines = buf.getvalue().split('\n')
        for text in written_lines:
            if not text.startswith('##'):
                if text.startswith('#CHROM'):
                    assert 'FORMAT' not in text
                assert not text.endswith('\t')
def convert_snpeff_info_fields(vcf_input_fh, vcf_output_fh):
    """Rewrite a VCF stream so the SnpEff EFF field becomes separate fields.

    The input VCF is read from ``vcf_input_fh``; one INFO header line per
    entry of ``SNPEFF_FIELDS`` is added, and every record is passed through
    ``populate_record_eff`` before being written to ``vcf_output_fh``.

    SnpEff stores its effects in the INFO column under the 'EFF' tag as a
    long pipe-separated string; several effects may be given, separated by
    commas. Each effect has the form:

    Effect ( Effect_Impact | Codon_Change | Amino_Acid_change | Gene_Name
            | Gene_BioType | Coding | Transcript | Rank [ | ERRORS
            | WARNINGS ] )

    Field details:
        http://snpeff.sourceforge.net/SnpEff_manual.html

    The individual parts are exposed as separate INFO_EFF_* fields in the
    output.
    """
    source = vcf.Reader(vcf_input_fh)

    # TODO: _vcf_metadata_parser is internal to pyVCF and may change under
    # us. Maybe we should copy their code?
    header_parser = vcf.parser._vcf_metadata_parser()
    for _field_name, template_values in SNPEFF_FIELDS.items():
        # Render the header text for this field and register it as raw text.
        header_line = SNPEFF_INFO_TEMPLATE.substitute(template_values)
        source._header_lines.append(header_line)

        # Register the parsed Info object under its key as well.
        info_key, info_obj = header_parser.read_info(header_line)
        source.infos[info_key] = info_obj

    sink = vcf.Writer(vcf_output_fh, source)

    # Emit every original record with its expanded EFF INFO fields.
    for entry in source:
        sink.write_record(populate_record_eff(entry))
def main():
    """Annotate a merged VCF with per-caller VAFs and their median.

    One lookup table is built by ``populate_dict`` for each of the
    Broad, DKFZ, Sanger and Muse files. For every record, the non-None values
    found under its (CHROM, POS, REF, first ALT) key are rounded with
    ``round_three`` and stored in INFO['VAFs']; the rounded median of the
    unrounded values goes to INFO['medianVAF']. All records are written.

    Returns:
        0 on completion.
    """
    parser = argparse.ArgumentParser(
        description='Annotate merged vcf with VAF information where available')
    # nargs='?' makes the positional optional; without it argparse always
    # requires the argument and the stdin default can never take effect.
    parser.add_argument('mergedvcf',
                        nargs='?',
                        type=argparse.FileType('r'),
                        default=sys.stdin,
                        help="Merged VCF file")
    parser.add_argument('-o',
                        '--output',
                        type=argparse.FileType('w'),
                        default=sys.stdout,
                        help="Specify output file (default:stdout)")
    parser.add_argument('-b', '--broad', type=str, help="Broad file")
    parser.add_argument('-d', '--dkfz', type=str, help="DKFZ file")
    parser.add_argument('-s', '--sanger', type=str, help="Sanger file")
    parser.add_argument('-m', '--muse', type=str, help="Muse file")
    parser.add_argument('-i',
                        '--indel',
                        action='store_true',
                        help="Variant type == indel (default:snv_mnv)")
    args = parser.parse_args()
    snvs = not args.indel

    dicts = [
        populate_dict(args.broad, broad=True, SNV=snvs),
        populate_dict(args.dkfz, dkfz=True, SNV=snvs),
        populate_dict(args.sanger, sanger=True, SNV=snvs),
        populate_dict(args.muse, muse=True, SNV=snvs)
    ]

    vcf_reader = vcf.Reader(args.mergedvcf)
    vcf_writer = vcf.Writer(args.output, vcf_reader)
    for variant in vcf_reader:
        key = variant.CHROM, variant.POS, variant.REF, str(variant.ALT[0])
        vafs = [
            vaf_dict[key] for vaf_dict in dicts
            if key in vaf_dict and vaf_dict[key] is not None
        ]
        if vafs:
            variant.INFO['VAFs'] = [round_three(vaf) for vaf in vafs]
            variant.INFO['medianVAF'] = round_three(numpy.median(vafs))
        vcf_writer.write_record(variant)

    return 0
Example #20
0
def tab_to_vcf(input_file, output_file, reference_file):
    """
    Turn a tab-delimited variant table into a VCF file.

    The fixed VCF columns are supported: #CHROM, POS, ID, REF, ALT, QUAL,
    FILTER, INFO. The header comes from ``TEMPLATE_VCF_FILE``.

    Each row is converted into the positional arguments PyVCF's _Record class
    expects:

    CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, sample_indexes
    """
    reference_dict = FastaHack(reference_file)

    with open(input_file, "r") as input_fh, \
            open(TEMPLATE_VCF_FILE, "r") as template_fh:
        rows = csv.DictReader(input_fh, delimiter="\t")
        template = vcf.Reader(template_fh)

        with open(output_file, "w") as output_fh:
            sink = vcf.Writer(output_fh, template, lineterminator='\n')

            for row in rows:
                # Missing columns fall back to the VCF "no value" marker.
                fields = [row.get(tab_field, ".")
                          for _, tab_field in VCF_TO_FIELDS]

                # Position must be an integer.
                fields[POSITION_INDEX] = int(fields[POSITION_INDEX])

                # GATK-style indels ("+..."/"-...") become VCF-style alleles.
                alt = fields[ALT_INDEX]
                if alt.startswith(("+", "-")) and "/" not in alt:
                    fields = gatk_indel_to_vcf(fields, reference_dict)

                # ALT is a list in the record model.
                fields[ALT_INDEX] = [fields[ALT_INDEX]]

                # Empty INFO, FORMAT and sample_indexes.
                fields.extend([{}, ".", []])

                sink.write_record(_Record(*fields))
Example #21
0
def construct_vcf_dict(vcf_file_path, vcf_file_output, hg38_centromere_dict):
    """Copy a VCF, leaving out records that fall inside centromere regions.

    Args:
        vcf_file_path: Path of the input VCF.
        vcf_file_output: Path of the filtered VCF to create.
        hg38_centromere_dict: Mapping of chromosome name to a dict with
            parallel "start" and "end" sequences of region bounds; both
            bounds are treated as inclusive.

    Returns:
        An empty list.
        NOTE(review): ``variants_list`` is never filled in; confirm whether
        the kept records were meant to be collected before changing this.
    """
    variants_list = []
    with open(vcf_file_path, "r") as vcf_in, \
            open(vcf_file_output, 'w') as vcf_out:
        vcffile = vcf.Reader(vcf_in)
        vcf_writer = vcf.Writer(vcf_out, vcffile)
        for rec in vcffile:
            regions = hg38_centromere_dict[rec.CHROM]
            pos = int(rec.POS)
            remove = any(
                start <= pos <= end
                for start, end in zip(regions["start"], regions["end"]))
            if not remove:
                print(rec.CHROM, rec.POS, rec.REF, rec.ALT)
                print("to keep")
                vcf_writer.write_record(rec)
    return variants_list
Example #22
0
    def merge_chrs_into_one_vcf(self, file1, file2):
        '''
        Creates one VCF ("merged_file.vcf") containing all records of
        ``file1`` followed by all records of ``file2``; the header is taken
        from ``file1``.
        :return:
        '''

        print("Merging chr21_new.vcf with chr22_new.vcf")

        # "r" is the mode for open(); it used to be handed to vcf.Reader,
        # whose second positional parameter is the file name.
        with open(file1, "r") as first_fh, \
                open(file2, "r") as second_fh, \
                open("merged_file.vcf", "w") as merged_fh:
            vcf_file1 = vcf.Reader(first_fh)
            vcf_file2 = vcf.Reader(second_fh)

            vcf_writer = vcf.Writer(merged_fh, vcf_file1)

            for reader in (vcf_file1, vcf_file2):
                for record in reader:
                    vcf_writer.write_record(record)

        print("Merge successful. File 'merged_file.vcf' created.")
Example #23
0
def one_variant_transform(f_in, f_out):
    """Split multi-ALT rows of a VCF into one row per alternate allele.

    Rows with a single ALT are written unchanged. For any other row, one copy
    is written per ALT entry; INFO values that are lists with one item per
    ALT are reduced to the item belonging to that allele.
    """
    source = vcf.Reader(f_in, strict_whitespace=True)
    sink = vcf.Writer(f_out, source)
    for entry in source:
        alt_count = len(entry.ALT)
        if alt_count == 1:
            sink.write_record(entry)
            continue

        for idx, allele in enumerate(entry.ALT):
            single = deepcopy(entry)
            single.ALT = [deepcopy(allele)]
            for info_key, raw_value in entry.INFO.items():
                info_value = deepcopy(raw_value)
                per_allele = (type(info_value) == list
                              and len(info_value) == alt_count)
                if per_allele:
                    single.INFO[info_key] = [info_value[idx]]
            sink.write_record(single)
Example #24
0
def main():
    """Append bias-failure filters from a filter VCF to the input records.

    ``get_bias_failures`` supplies a mapping from variant to filter values.
    Every input record found in that mapping gets those values assigned to,
    or appended to, its FILTER column; all records are written to the output.
    When ``--filtername`` is given, a matching FILTER header entry is declared
    first.

    Returns:
        0 on completion.
    """
    cli = argparse.ArgumentParser(
        description='Fix dbsnp VP calls and add OXOG filter')
    cli.add_argument('filtervcf', help="Filter vcf file")
    cli.add_argument('-i', '--input',
                     type=argparse.FileType('r'),
                     default=sys.stdin,
                     help="Merged and annotated VCF file (default: stdin)")
    cli.add_argument('-o', '--output',
                     type=argparse.FileType('w'),
                     default=sys.stdout,
                     help="Specify output file (default:stdout)")
    cli.add_argument(
        '-f', '--filtername',
        help="Specify filter name to use (default: use filter field from VCF file)")
    cli.add_argument('-d', '--filterdesc',
                     default="",
                     help="Specify description of filter")
    args = cli.parse_args()

    reader = vcf.Reader(args.input)
    if args.filtername is not None:
        new_filter = vcf.parser._Filter(id=args.filtername,
                                        desc=args.filterdesc)
        reader.filters[args.filtername] = new_filter
    writer = vcf.Writer(args.output, reader)

    failuredict = get_bias_failures(args.filtervcf, args.filtername)
    for record in reader:
        assert len(record.ALT) == 1
        variant = variant_tuple(record, record.ALT[0])
        if variant in failuredict:
            if record.FILTER:
                record.FILTER = record.FILTER + failuredict[variant]
            else:
                record.FILTER = failuredict[variant]

        writer.write_record(record)

    return 0
def __main__():
    """Write the heterozygous SNPs of a VarScan VCF to heterozygousSNPs.vcf.

    A record counts as heterozygous when max(AD, RD) / DP of its single
    sample is at most 0.75. The process exits with an error message when the
    input cannot be found, the output cannot be opened, a record does not
    have exactly one sample, or the AD/RD/DP tags are missing. On success the
    process exits with status 0.
    """
    parser = argparse.ArgumentParser(
        description='Make a heterozygosity plot from a vcf file')
    parser.add_argument('--vcfFile',
                        help='VCF file generated by VarScan',
                        required=True)

    args = parser.parse_args()

    try:
        vcfReader = vcf.Reader(filename=args.vcfFile)
    except FileNotFoundError:
        raise SystemExit(
            'File {} cannot be found. Please check and try again\n'.format(
                args.vcfFile))

    try:
        outFile = open('heterozygousSNPs.vcf', 'w')
    except OSError:
        raise SystemExit(
            'File heterozygousSNPs.vcf cannot be opened for writing.\n')

    with outFile:
        vcfWriter = vcf.Writer(outFile, vcfReader)
        for record in vcfReader:
            # at the point where we run this in the workflow, we should only
            # ever have one sample
            if len(record.samples) != 1:
                raise SystemExit(
                    'VCF file {} has more than one sample. Please check you are using the correct VCF file\n'
                    .format(args.vcfFile))

            if 'AD' not in record.FORMAT or 'RD' not in record.FORMAT or 'DP' not in record.FORMAT:
                raise SystemExit(
                    'Record format is missing AD, RD or DP tags. Tags available in this file are "{0}". Please check that {1} was generated with VarScan\n'
                    .format(record.FORMAT, args.vcfFile))

            sample = record.samples[0]
            depth = sample['DP']
            # A missing or zero depth gives no ratio to evaluate (and would
            # divide by zero), so such records are skipped.
            if not depth:
                continue
            ratio = max(sample['AD'], sample['RD']) / depth

            # this is our cutoff for being heterozygous (as used by YMAP)
            if ratio <= 0.75:
                print(record)
                vcfWriter.write_record(record)

    raise SystemExit(0)
Example #26
0
def induce_mutations(inFile, outFile, delta):
    """
    Copy a VCF file, randomly rewriting the genotype of some sample calls.

    Every record of ``inFile`` whose first FORMAT field is ``GT`` is written
    to ``outFile``. For each sample call that ``already_mutated`` (defined
    elsewhere in this module) does not reject, a draw from ``uniform(0, 9)``
    is compared with ``delta``; when the draw is smaller, the GT value is
    replaced by one of three randomly chosen shapes: ``alt/0``, ``0/alt`` or
    ``alt/alt``, where each ``alt`` is a random 1-based ALT index. The
    phasing separator (``|`` or ``/``) of the original genotype is kept.

    Records whose FORMAT does not start with ``GT`` are reported on stdout
    and left out of the output.

    Parameters
    ----------
    inFile : str
        Path of the VCF file to read.
    outFile : str
        Path of the VCF file to write.
    delta : number
        Threshold compared against a ``uniform(0, 9)`` draw; larger values
        rewrite more calls.
    """
    # Context managers make sure both files are flushed and closed, including
    # when a record fails part-way through.
    with open(inFile) as in_handle, open(outFile, 'w') as out_handle:
        vcf_reader = vcf.Reader(in_handle)
        vcf_writer = vcf.Writer(out_handle, vcf_reader)

        for record in vcf_reader:
            rec_toWrite = copy.deepcopy(record)
            print(record.num_called)

            f_keys = record.FORMAT.split(":")
            if f_keys[0] != "GT":
                print("Error with FORMAT column at POSITION=" + str(record.POS) +
                      ": \'GT\' is missing.\n")
                continue

            # One call-data type per record is enough; every sample shares
            # the record's FORMAT.
            call_data = collections.namedtuple('CallData', f_keys)
            n_alt = len(record.ALT)

            for call, out_call in zip(record.samples, rec_toWrite.samples):
                gt_read = str(call["GT"])
                f_vals = list(call.data)

                if not already_mutated(gt_read) and uniform(0, 9) < delta:
                    # Search the whole genotype for the phasing bar: looking
                    # only at index 1 breaks on one-character genotypes and
                    # on multi-digit allele indices such as "10|1".
                    separator = "|" if "|" in gt_read else "/"
                    mutation_type = randint(0, 2)

                    if mutation_type == 0:
                        alleles = (str(randint(1, n_alt)), "0")
                    elif mutation_type == 1:
                        alleles = ("0", str(randint(1, n_alt)))
                    else:
                        first = str(randint(1, n_alt))
                        second = str(randint(1, n_alt))
                        alleles = (first, second)

                    f_vals[0] = separator.join(alleles)

                out_call.data = call_data._make(f_vals)

            vcf_writer.write_record(rec_toWrite)
Example #27
0
def main(args):
    """
    Filter a VCF by read support and variant class, writing survivors to stdout.

    A record is written only if every one of its ALT alleles appears in the
    base counts returned by ``basecount`` (defined elsewhere) for the record's
    position with at least ``args.minreads`` reads, and it passes all of the
    optional filters selected on ``args``.

    Parameters
    ----------
    args : object
        Parsed options. The attributes read here are ``vcffile``, ``bamfile``,
        ``minreads``, ``vtype`` (``None``, ``'SNV'``, ``'INDEL'`` or ``'SV'``),
        ``passonly``, ``failonly``, ``somaticonly`` and ``germlineonly``.

    Raises
    ------
    AssertionError
        If ``args.vtype`` is set to anything other than the three supported
        variant classes.
    """
    invcf  = vcf.Reader(filename=args.vcffile)
    outvcf = vcf.Writer(sys.stdout, invcf)

    vtype = None
    if args.vtype is not None:
        assert args.vtype in ('SNV', 'INDEL', 'SV')
        vtype = args.vtype

    bam = pysam.Samfile(args.bamfile, 'rb')

    try:
        for rec in invcf:
            output = True
            bc = basecount(bam, rec.CHROM, rec.POS)
            for alt in rec.ALT:
                # Test membership with the same string key that is used to
                # read the count. The ALT objects themselves are not
                # guaranteed to be hashable, or to hash like the keys.
                allele = str(alt)
                if allele not in bc or bc[allele] < int(args.minreads):
                    output = False

            # For SNV mode, non-SNPs and SNPs flagged VT=LOH are both dropped.
            if vtype == 'SNV' and (not rec.is_snp or rec.INFO.get('VT') == 'LOH'):
                output = False

            if vtype == 'INDEL' and not rec.is_indel:
                output = False

            if vtype == 'SV' and not rec.is_sv:
                output = False

            if args.passonly and rec.FILTER:
                output = False

            if args.failonly and not rec.FILTER:
                output = False

            if args.somaticonly and not is_somatic(rec):
                output = False

            if args.germlineonly and is_somatic(rec):
                output = False

            if output:
                outvcf.write_record(rec)
    finally:
        # Release the BAM handle even if a record fails part-way through.
        bam.close()
Example #28
0
    def write_header(self, sample_id, filters, reference):
        """
        Emit the standard SNP Pipeline VCF header and prepare per-call helpers.

        The header text is assembled from the module's VCF template constants,
        parsed by PyVCF as a template, and a PyVCF writer bound to
        ``self.file_handle`` is stored on ``self.pyvcf_writer``. The FORMAT
        ids found in the template are also cached on ``self.format_str``
        (colon-joined) and ``self.VcfCallData`` (a namedtuple class).

        Parameters
        ----------
        sample_id : str
            Sample ID substituted into the header column line.
        filters : list of tuple(str, str)
            (name, description) pairs; each becomes one FILTER header line,
            following the built-in "PASS" line.
        reference : str
            Reference name substituted into the header reference line.
        """
        # Collect every header fragment in output order.
        header_parts = [
            VCF_VERSION,
            datetime.datetime.now().strftime(VCF_DATE),
            VCF_SOURCE,
            VCF_INFO,
            VCF_FILTER % ("PASS", "All filters passed"),
        ]
        header_parts.extend(
            VCF_FILTER % (name, description) for name, description in filters)
        header_parts.append(VCF_FORMAT)
        header_parts.append(VCF_REFERENCE % reference)
        header_parts.append(VCF_HDR_LINE  % sample_id)

        # Stage the template in memory, then rewind so PyVCF reads it from
        # the start.
        template_buffer = StringIO()
        template_buffer.name = "header.vcf"
        template_buffer.write("".join(header_parts))
        template_buffer.seek(0)

        # Creating the writer from the parsed template puts the header into
        # our vcf file.
        self.pyvcf_writer = vcf.Writer(
            self.file_handle, template=vcf.Reader(template_buffer))

        # The FORMAT ids are the same for every position, so derive them
        # once: strip the line prefix and keep the text before the first
        # comma.
        tokens = [
            line.replace("##FORMAT=<ID=", "").split(',')[0]
            for line in VCF_FORMAT.split('\n')
            if line
        ]
        self.format_str = ':'.join(tokens)
        # namedtuple() builds a new class named VcfCallData.
        self.VcfCallData = collections.namedtuple('VcfCallData', tokens)
Example #29
0
def write_diff(diff_entries, template):
    """
    Print a three-part VCF diff report to stdout.

    Parameters
    ----------
    diff_entries : sequence of three row collections
        In order: rows found only in the input file, rows found only in the
        truth file, and matched rows that differ between the two.
    template : object
        Template handed to ``vcf.Writer`` when the stdout writer is created.

    Each non-empty collection is introduced by a ``##`` heading line and then
    written record by record; empty collections produce no output at all.
    """
    only_in_input, only_in_truth, mismatched = diff_entries

    out = vcf.Writer(sys.stdout, template)

    sections = (
        ("## Unmatched vcf row(s) in Input File", only_in_input),
        ("## Unmatched vcf row(s) in Truth File", only_in_truth),
        ("## Matched row(s) which differ in Input and Truth Files", mismatched),
    )

    for heading, rows in sections:
        if not rows:
            continue
        print(heading)
        for entry in rows:
            out.write_record(entry)
Example #30
0
def _binom_pvalue(successes, trials, prob):
    """Return the two-sided binomial-test p-value with whichever SciPy API exists."""
    # scipy.stats.binom_test was deprecated in favour of binomtest and is
    # absent from recent SciPy releases; fall back to it only on old installs.
    binomtest = getattr(scipy.stats, 'binomtest', None)
    if binomtest is not None:
        return binomtest(successes, trials, prob).pvalue
    return scipy.stats.binom_test(successes, trials, prob)


def main():
    """
    Driver program - Read in one or more VCF files and annotate germline
    het/hom evidence from the normal-sample read counts.

    The header of the first input file is used as the output template. For
    every record of every input file that carries both ``NormalReads`` and
    ``NormalEvidenceReads`` INFO entries, is not filtered as ``LOWDEPTH`` and
    has a positive normal depth, two binomial tests are run and the record is
    written to the output. ``HET`` and/or ``HOM`` INFO entries are added when
    the corresponding p-value exceeds ``1 - alpha``.

    Returns
    -------
    int
        Always 0, once all input files have been processed.
    """
    parser = argparse.ArgumentParser(description='Search validation data for germline homs/hets')
    parser.add_argument('vcffile', nargs='+', help='Vcf file(s) to check')
    parser.add_argument('-o', '--output', type=argparse.FileType('w'),
                        default=sys.stdout, help='Output VCF (default: stdout)')
    parser.add_argument('-e', '--errorrate', type=float, default=0.02,
                        help='Error rate')
    parser.add_argument('-a', '--alpha', type=float, default=0.05,
                        help='prob threshold for calling hets/homs')
    args = parser.parse_args()

    # The first file only supplies the header template for the writer.
    vcf_reader = vcf.Reader(filename=args.vcffile[0])
    vcf_writer = vcf.Writer(args.output, vcf_reader)

    for filename in args.vcffile:
        vcf_reader = vcf.Reader(filename=filename)

        for record in vcf_reader:
            if 'NormalReads' not in record.INFO or \
                    'NormalEvidenceReads' not in record.INFO:
                continue

            # FILTER may be None when the column is unset, so guard the
            # membership test instead of letting it raise TypeError.
            if record.FILTER and 'LOWDEPTH' in record.FILTER:
                continue

            normdepth = int(record.INFO['NormalReads'][0])
            normevidence = sum(int(nr) for nr in record.INFO['NormalEvidenceReads'])

            if normdepth <= 0:
                # No normal coverage: the allele fraction is undefined.
                continue

            impl_vaf = float(normevidence) / normdepth
            p_het = _binom_pvalue(normevidence, normdepth, max(args.errorrate, impl_vaf))
            p_hom = _binom_pvalue(normevidence, normdepth, max(.75, impl_vaf))

            if p_het > 1. - args.alpha:
                record.INFO['HET'] = 1. - p_het
            if p_hom > 1. - args.alpha:
                record.INFO['HOM'] = 1. - p_hom
            vcf_writer.write_record(record)

    # Return only after every input file has been handled; returning inside
    # the loop would stop after the first file.
    return 0