Beispiel #1
0
def test_reading_parse_nosample(tmpdir, nosample_vcf_file):
    """Read VCF file without samples, write file with samples.

    Each record read from ``nosample_vcf_file`` gets one empty call per new
    sample plus a ``GT`` FORMAT entry and is written to a fresh file below
    ``tmpdir``; the written text is then compared with the expected VCF.
    """
    sample_names = ["NA00001", "NA00002", "NA00003"]
    # Perform record-wise copying, attaching calls for the new samples
    path_out = tmpdir.mkdir("output").join("output.vcf")
    with vcfpy.Reader.from_path(nosample_vcf_file) as reader:
        header = reader.header.copy()
        header.samples = vcfpy.SamplesInfos(sample_names)
        with vcfpy.Writer.from_path(str(path_out), header) as writer:
            for record in reader:
                record.update_calls(
                    [vcfpy.Call(sample, {}) for sample in sample_names])
                record.add_format("GT", "./.")
                writer.write_record(record)

    # NOTE(review): "H**o sapiens" in the contig line below looks like a
    # mangled "Homo sapiens" -- confirm against the fixture file.
    expected = textwrap.dedent("""
    ##fileformat=VCFv4.3
    ##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="H**o sapiens",taxonomy=x>
    ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
    #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA00001	NA00002	NA00003
    20	14370	.	G	A	29	.	.	GT	.	.	.
    20	17330	.	T	A	3	.	.	GT	.	.	.
    20	1110696	.	A	G,T	67	.	.	GT	.	.	.
    20	1230237	.	T	.	47	.	.	GT	.	.	.
    20	1234567	.	GTC	G,GTCT	50	.	.	GT	.	.	.
    """).lstrip()

    # Read through a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with path_out.open("rt") as written:
        assert written.read() == expected
 def _open(self):
     """Build the VCF header and open the writer on the temporary file.

     The header holds the ``fileformat`` line, FORMAT lines for AD, DP, GQ
     and GT, and one contig line per ``(name, length)`` pair of
     ``CONTIGS_GRCH37``; its samples come from ``self.members``.  The
     writer is opened on ``self.tmp_file.name`` and stored in
     ``self.vcf_writer``.
     """
     # (ID, Number, Type, Description) of every FORMAT field, in output order
     format_keys = ("ID", "Number", "Type", "Description")
     format_fields = (
         ("AD", "R", "Integer",
          "Allelic depths for the ref and alt alleles in the order listed"),
         ("DP", "1", "Integer", "Approximate read depth at the locus"),
         ("GQ", "1", "Integer", "Phred-scaled genotype quality"),
         ("GT", "1", "String", "Genotype"),
     )

     # Setup header
     header_lines = [vcfpy.HeaderLine("fileformat", "VCFv4.2")]
     header_lines.extend(
         vcfpy.FormatHeaderLine.from_mapping(dict(zip(format_keys, field)))
         for field in format_fields)

     # Add header lines for contigs.
     # TODO: switch based on release in case
     header_lines.extend(
         vcfpy.ContigHeaderLine.from_mapping({"ID": contig, "length": size})
         for contig, size in CONTIGS_GRCH37)

     sample_infos = vcfpy.SamplesInfos(self.members)
     header = vcfpy.Header(lines=header_lines, samples=sample_infos)

     # Open VCF writer
     self.vcf_writer = vcfpy.Writer.from_path(self.tmp_file.name, header)
Beispiel #3
0
def main():
    """Write a VCF file holding one record to the path given on the command line.

    Expects exactly one command-line argument, the output path.  With any
    other argument count a usage message is printed to stderr and 1 is
    returned; on the normal path the function returns ``None``.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Usage: vcf_from_scratch.py OUTPUT.vcf", file=sys.stderr)
        return 1

    # Header without any sample columns
    empty_header = vcfpy.Header(samples=vcfpy.SamplesInfos([]))
    record_fields = dict(
        CHROM="1",
        POS=1,
        ID=[],
        REF="N",
        ALT=[],
        QUAL=None,
        FILTER=[],
        INFO={},
        FORMAT=[],
    )
    with vcfpy.Writer.from_path(sys.argv[1], empty_header) as writer:
        writer.write_record(vcfpy.Record(**record_fields))
Beispiel #4
0
def write_vcf(vcffilename, sample_name, records):
    """
    Generate a VCF with the given records and randomly generated genotypes

    The file is written through a bgzf writer (``use_bgzf=True``).  Every
    input record is copied with a single ``GT`` FORMAT field whose value is
    drawn at random from ``0/0``, ``0/1`` and ``1/1``.

    Arguments:
    vcffilename - path to generated file
    sample_name - name of the single sample column in the generated file
    records - list of vcfpy.Record describing the variants
    """
    # One length per contig, in contig order (written with assembly GRCh37)
    lengths = [249250621, 243199373, 198022430, 191154276, 180915260,
               171115067, 159138663, 146364022, 141213431, 135534747,
               135006516, 133851895, 115169878, 107349540, 102531392,
               90354753,  81195210,  78077248,  59128983,  63025520,
               48129895,  51304566]

    samples = vcfpy.SamplesInfos([sample_name])
    header = vcfpy.Header(samples=samples)
    header.add_line(vcfpy.HeaderLine("fileformat", "VCFv4.3"))
    header.add_line(vcfpy.HeaderLine("fileDate", "20200901"))
    # Number the contigs from 1: the first length belongs to contig "1", so a
    # zero-based index would label every contig one too low ("0".."21").
    for chrom, length in enumerate(lengths, start=1):
        header.add_contig_line({"ID": str(chrom), "assembly": "GRCh37", "length": length})
    header.add_format_line({"ID":"GT", "Number":1, "Type":"String", "Description": "Genotype"})

    with open(vcffilename, 'wb') as vcffile:
        # NOTE(review): ``samples`` is kept in the third positional slot as
        # before; confirm which parameter that is in the installed vcfpy.
        writer = vcfpy.Writer.from_stream(vcffile, header, samples, use_bgzf=True)
        try:
            for record in records:
                genotype = random.choice(['0/0', '0/1', '1/1'])
                newrecord = vcfpy.Record(record.CHROM,
                                         record.POS,
                                         record.ID,
                                         record.REF,
                                         record.ALT,
                                         record.QUAL,
                                         record.FILTER,
                                         record.INFO,
                                         ["GT"],
                                         calls=[vcfpy.record.Call(sample_name, {"GT": genotype})])
                writer.write_record(newrecord)
        finally:
            # Close the writer even when a record fails to write, so the
            # compressed stream is finalised before the file is closed.
            writer.close()
def build_header(contigs, species):
    """Create the VCF header used for the alignment output.

    The header has no samples and contains, in order: the ``fileformat``
    line, one contig line per ``(name, length)`` pair in ``contigs``, a
    ``species`` line with the comma-joined ``species`` names, and the INFO
    lines END, UCSC_GENE, EXON, EXON_COUNT and ALIGNMENT.

    :param contigs: iterable of ``(name, length)`` pairs
    :param species: iterable of strings joined with "," for the species line
    :return: the populated vcfpy.Header
    """
    # (ID, Description, Type) of each INFO field; all of them use Number=1
    info_fields = (
        ("END", "End position of the alignment", "Integer"),
        ("UCSC_GENE", "UCSC gene ID", "String"),
        ("EXON", "Index of exon in transcript", "Integer"),
        ("EXON_COUNT", "Number of exons in transcript", "Integer"),
        ("ALIGNMENT", "Amino acid alignment at this location", "String"),
    )

    result = vcfpy.Header()
    result.samples = vcfpy.SamplesInfos([])
    result.add_line(vcfpy.HeaderLine("fileformat", "VCFv4.2"))
    for contig_id, contig_length in contigs:
        result.add_contig_line({"ID": contig_id, "length": contig_length})
    result.add_line(vcfpy.HeaderLine("species", ",".join(species)))
    for field_id, description, value_type in info_fields:
        result.add_info_line({
            "ID": field_id,
            "Description": description,
            "Type": value_type,
            "Number": 1,
        })
    return result
Beispiel #6
0
def get_header(sample_name_to_header, chromosome_set):
    """
    Returns the header of the output VCF file

    Contig lines are copied from the header of the first sample in
    iteration order, restricted to ``chromosome_set`` unless it is None.
    The INFO and FORMAT lines are fixed, and the sample names are sorted.

    :param sample_name_to_header: a dictionary from the sample names to the headers
    :param chromosome_set: the set of chromosomes selected for analysis
    :return: a vcfpy.Header
    """
    # (ID, Type, Description) of each field; every field uses Number=1
    info_fields = (
        ("END", "Integer", "Stop position of the interval"),
        ("SVTYPE", "String", "Type of structural variant"),
        ("INSSEQ", "String",
         "Insertion sequence of structural variant, not including sequence marked as duplication"),
        ("TRANCHE2", "String",
         "Quality category of GRIDSS structural variant calls determined using FILTER,SRQ,AS,RAS. Values are LOW INTERMEDIATE HIGH"),
        ("BNDVAF", "Float",
         "VAF of this gridss-called BND calculated as (SR+RP+IC+AS)/(REF+SR+RP+IC+AS)"),
    )
    format_fields = (
        ("GT", "String", "Genotype"),
        ("TRANCHE2", "String",
         "Quality category of GRIDSS structural variant calls determined using FILTER,SRQ,AS,RAS. Values are LOW INTERMEDIATE HIGH"),
        ("BNDVAF", "Float",
         "VAF of this gridss-called BND calculated as (SR+RP+IC+AS)/(REFPAIR+SR+RP+IC+AS)"),
        ("VAF", "Float",
         "VAF of this SV call, derived from BNDVAF values of BND calls used to call this SV"),
        ("INSSEQ", "String",
         "Insertion sequence of structural variant, not including sequence marked as duplication"),
    )

    def single_value_field(field_id, value_type, description):
        """Build the mapping of a header line with Number=1."""
        return vcfpy.OrderedDict(ID=field_id,
                                 Number=1,
                                 Type=value_type,
                                 Description=description)

    header = vcfpy.Header()
    header.add_line(vcfpy.HeaderLine(key="fileformat", value="VCFv4.2"))

    # CONTIG headers, taken from the first sample's header
    reference_header = next(iter(sample_name_to_header.values()))
    for line in reference_header.lines:
        if not isinstance(line, vcfpy.ContigHeaderLine):
            continue
        if chromosome_set is not None and line.mapping[
                "ID"] not in chromosome_set:
            continue
        header.add_line(line)

    # INFO fields
    for field in info_fields:
        header.add_info_line(single_value_field(*field))

    # FORMAT fields
    for field in format_fields:
        header.add_format_line(single_value_field(*field))

    # Samples, sorted to ensure determinism
    header.samples = vcfpy.SamplesInfos(sorted(sample_name_to_header.keys()))

    return header
Beispiel #7
0
def extract_vcf_records(
        sample_name,
        # input paths
        alignments_path,
        contigs_path,
        ref_fasta_path,
        vcf_template_path,
        # output paths
        vcf_out_path,
        selected_contigs_path,
        flanked_contigs_path,
        flank_length,
        min_insert_size):
    """Extract large insertions from contig alignments into VCF and FASTA files.

    Reads the space-separated alignment table at ``alignments_path`` and, for
    every row whose ``Hit`` is not greater than 0, walks its CIGAR string.
    Each insertion longer than ``min_insert_size`` produces:

    * one record in ``vcf_out_path`` (header taken from
      ``vcf_template_path`` with ``sample_name`` as the only sample),
    * the whole contig sequence in ``selected_contigs_path`` (written once
      per contig/sample locus),
    * the inserted sequence, surrounded by ``flank_length`` reference bases
      on each side when ``flank_length`` > 0, in ``flanked_contigs_path``.

    Query names must consist of six "_"-separated fields: reference
    chromosome, window start, window end, phase set, phase and one further
    field that is not used.  The first two characters of the phase set and
    phase fields are stripped.

    All files opened here are closed before returning, also on error.

    :return: the number of VCF records written
    """
    # Local import so the block does not depend on the file's import section.
    from contextlib import ExitStack

    n_records = 0
    # loci whose full contig sequence has already been written out
    contig_loci = set()

    with ExitStack() as stack:
        # Every resource is registered with the stack as soon as it exists,
        # so that all of them are closed (and the outputs flushed) on exit.
        ref_fasta = pysam.FastaFile(ref_fasta_path)
        stack.callback(ref_fasta.close)
        contig_fasta = pysam.FastaFile(contigs_path)
        stack.callback(contig_fasta.close)

        selected_contig_fasta = stack.enter_context(
            open(selected_contigs_path, "w"))
        flanked_contig_fasta = stack.enter_context(
            open(flanked_contigs_path, "w"))

        alns = pandas.read_csv(alignments_path, sep=" ")

        reader = vcfpy.Reader.from_path(vcf_template_path)
        stack.callback(reader.close)
        reader.header.samples = vcfpy.SamplesInfos([sample_name])

        writer = vcfpy.Writer.from_path(vcf_out_path, reader.header)
        stack.callback(writer.close)

        # parse each alignment and look for insertions above min_insert_size
        for _, aln in alns.iterrows():
            # skip secondary alignments
            if aln["Hit"] > 0:
                continue

            query_name = aln["QName"]

            # local alignment window in the reference
            (ref_chrom, ref_start, ref_end, phase_set, phase,
             _) = query_name.split("_")

            phase_set = phase_set[2:]
            phase = phase[2:]

            # convert to ints
            ref_start, ref_end = (int(ref_start), int(ref_end))

            # alignment start for reference and query sequence
            target_start = aln["TStart"]
            query_start = aln["QStart"]

            # strand-ness of the query sequence
            strand = aln["Strand"]

            # parse cigar for variant extraction
            cig = cigar.Cigar(aln["CIGAR"])
            ops = list(cig.items())

            # convert sequences to the positive strand
            query_seq = contig_fasta.fetch(query_name)
            if strand == "-":
                query_seq = str(Bio.Seq.Seq(query_seq).reverse_complement())

            ref_seq = ref_fasta.fetch(ref_chrom, ref_start, ref_end)

            # the genotype depends only on the phase of the contig
            if phase == "1":
                gt = "1|0"
            elif phase == "2":
                gt = "0|1"
            else:
                gt = "0/1"

            # initialize iterators for the cigar string
            query_pos = query_start
            target_pos = target_start

            # we are looking to extract insertions larger than min_insert_size
            for length, op_type in ops:
                # skip matches
                if op_type == 'M':
                    query_pos += length
                    target_pos += length

                # skip deletions in the query sequence
                elif op_type == 'D':
                    target_pos += length

                # insertions in the query sequence
                elif op_type == 'I':
                    # only interested in large insertions
                    if length > min_insert_size:
                        # need to check conversion from 0-based coordinates to 1-based
                        ref_allele = ref_seq[target_pos]
                        alt_allele = ref_allele + query_seq[
                            query_pos:query_pos + length]

                        break_point = ref_start + target_pos
                        # output VCF record corresponding to the insertion
                        rec = vcfpy.Record(
                            CHROM=ref_chrom,
                            POS=break_point + 1,
                            ID=[query_name],
                            REF=ref_allele,
                            ALT=[vcfpy.Substitution("INS", alt_allele)],
                            QUAL=999,
                            FILTER=["PASS"],
                            INFO={},
                            FORMAT=[
                                "GT", "SVLEN", "PS", "HP", "CIGAR", "STRAND",
                                "CONTIG_START"
                            ],
                            calls=[
                                vcfpy.Call(sample=sample_name,
                                           data=vcfpy.OrderedDict(
                                               GT=gt,
                                               SVLEN=length,
                                               PS=phase_set,
                                               HP=phase,
                                               CIGAR=str(cig),
                                               STRAND=strand,
                                               CONTIG_START=str(query_start)))
                            ])

                        n_records += 1
                        writer.write_record(rec)

                        # output contig that contains this insertion
                        contig_locus = ">" + query_name + "_" + sample_name
                        contig_hash = sha1("_{chrom}_{pos}_{alt}".format(
                            chrom=ref_chrom, pos=ref_start,
                            alt=alt_allele[1:]).encode()).hexdigest()

                        contig_name = (contig_locus + "_" + contig_hash +
                                       "_" + str(length))

                        if contig_locus not in contig_loci:
                            selected_contig_fasta.writelines(
                                [contig_name + "\n", query_seq + "\n"])
                            contig_loci.add(contig_locus)

                        # output same insertion, but with flanking sequences
                        # note, the interval is [start, end[
                        if flank_length > 0:
                            left_flank = ref_fasta.fetch(
                                ref_chrom, break_point - flank_length,
                                break_point)
                            right_flank = ref_fasta.fetch(
                                ref_chrom, break_point,
                                break_point + flank_length)
                        else:
                            left_flank = ""
                            right_flank = ""
                        flanked_contig_fasta.writelines([
                            contig_name + "\n",
                            left_flank + alt_allele[1:] + right_flank + "\n"
                        ])

                    # inserted bases consume the query only
                    query_pos += length
    return n_records
Beispiel #8
0
def main():
    """Write a one-record VCF whose INFO header lines mirror the input's FORMAT lines.

    Command line: ``input.vcf output.vcf``.  The output header carries
    ``source``, ``fileformat`` and ``fileDate`` lines, FORMAT lines for GT
    and DP, and one INFO line per FORMAT id of the input file (built from
    ``str_to_mapping`` applied to the FORMAT line's value).  A single record
    for sample "Sample1" is then written to the output file.
    """
    parser = argparse.ArgumentParser(description="vcf writer")
    for arg_name, placeholder in (("input", 'input.vcf'),
                                  ("output", 'output.vcf')):
        parser.add_argument(arg_name,
                            metavar=placeholder,
                            action='store',
                            help='vcf file.',
                            type=str)
    args = parser.parse_args()
    outvcf, invcf = args.output, args.input

    # --- creating the header ---
    # Besides the typed lines (INFO, FORMAT, FILTER, ...) a header can hold
    # general key/value lines.  Here they record the generating program, the
    # file format and the creation date; the sample name goes into the
    # samples of the header.
    general_lines = [
        vcfpy.HeaderLine(key="source", value=sys.argv[0]),
        vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
        vcfpy.HeaderLine(key="fileDate",
                         value=date.today().strftime("%d/%m/%Y")),
    ]
    header = vcfpy.Header(lines=general_lines,
                          samples=vcfpy.SamplesInfos(["Sample1"]))

    # adding format lines: (ID, Type, Description), all with Number "1"
    for field_id, value_type, description in (
        ("GT", "String", "Genotype"),
        ("DP", "Integer", "Filtered read depth (MAPQ > 30)"),
    ):
        header.add_format_line(
            OrderedDict([("ID", field_id), ("Number", "1"),
                         ("Type", value_type),
                         ("Description", description)]))

    # read the input vcf
    with vcfpy.Reader.from_path(invcf) as reader:
        # get the FORMAT header lines of the input file
        # and convert them in INFO header lines of the output file
        for format_id in reader.header.format_ids():
            format_line = reader.header.get_format_field_info(format_id)
            # Example of such a line:
            #   key   = 'FORMAT'
            #   value = '<ID=AD,Number=R,Type=Integer,Description="Allelic
            #            depths for the ref and alt alleles in the order
            #            listed">'
            header.add_info_line(str_to_mapping(format_line.value))

    # write the vcf
    with vcfpy.Writer.from_path(outvcf, header) as writer:
        # creating one record
        sample_call = vcfpy.Call(
            "Sample1", OrderedDict([("GT", "0/1"), ("DP", "47")]))
        record = vcfpy.Record(
            CHROM="1",
            POS=1,
            ID=[],
            REF="C",
            ALT=[vcfpy.Substitution(type_="SNV", value="G")],
            QUAL=None,
            FILTER=[],
            INFO={},
            FORMAT=["GT", "DP"],
            calls=[sample_call])
        writer.write_record(record)
Beispiel #9
0
def main():
    """Count, per cell barcode, the reads supporting each bulk SNV of a VCF file.

    Command line: ``sample.bam barcodes.list variants.vcf sample1
    outdir/sample [--gt GT]``.  For every SNV record of the input VCF
    (optionally only those whose first call has the genotype given by
    ``--gt``) the pileup at that position is scanned.  Reads are assigned to
    cells through their ``CB`` tag (the part before the first "-"), and
    depth, reference and alternate allele counts are accumulated per cell.
    The result is written to ``<out_prefix>.snpseeker.vcf`` with one sample
    column per barcode.

    The per-cell counters come from ``samples_dict`` -- presumably a mapping
    barcode -> dict with the keys 'dp', 'rd', 'ad', 'gt' and 'af'; verify
    against that helper.
    """
    parser = argparse.ArgumentParser(
        description="Looks for a given set of SNPs whithin a bam file.")

    parser.add_argument("bam",
                        metavar='sample.bam',
                        action='store',
                        help='BAM file.',
                        type=str)

    parser.add_argument(
        "barcodes",
        metavar='barcodes.list',
        action='store',
        help=
        "File containing cell barcodes (the same used in the alignment file to identify cell reads).",
        type=str)

    parser.add_argument("vcf",
                        metavar='variants.vcf',
                        action='store',
                        help="VCF file storing BULK SNPs.",
                        type=str)

    parser.add_argument("sample_name",
                        metavar='sample1',
                        action='store',
                        help="Sample identifier.",
                        type=str)

    parser.add_argument("out_prefix",
                        metavar="outdir/sample",
                        action="store",
                        help="Output VCF file prefix.",
                        type=str)

    parser.add_argument(
        "--gt",
        metavar='1/1 (0/1)',
        choices=["0/0", "0/1", "1/1"],
        action='store',
        help=
        "Genotype filter: considers only mutations with the specified GT in the original vcf file.",
        type=str)

    args = parser.parse_args()
    bam = args.bam
    barcodes = args.barcodes
    invcf = args.vcf
    sample = args.sample_name
    outvcf = args.out_prefix + ".snpseeker.vcf"

    # --gt is optional; without it no genotype filtering takes place
    gt = args.gt
    gt_filter = bool(gt)

    with open(barcodes, "r") as f:
        samples = f.read().splitlines()
    # set for the per-read membership test; the list keeps the column order
    known_barcodes = set(samples)

    # build the header of the output vcf
    header_out = vcfpy.Header(lines=[
        vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
        vcfpy.HeaderLine(key="source", value=sys.argv[0]),
        vcfpy.HeaderLine(key="fileDate",
                         value=date.today().strftime("%d/%m/%Y"))
    ],
                              samples=vcfpy.SamplesInfos(samples))

    # sample header lines
    header_out.add_line(
        vcfpy.SampleHeaderLine.from_mapping(
            OrderedDict([("ID", sample), ("Description", "Sample name")])))

    # filter header lines, one per genotype that --gt accepts
    for filter_id in ("1/1", "0/1", "0/0"):
        header_out.add_filter_line(
            OrderedDict([("ID", filter_id), ("Number", "1"),
                         ("Description", "Filtered on such GT")]))

    # format header lines
    header_out.add_format_line(
        OrderedDict([("ID", "GT"), ("Number", "1"), ("Type", "String"),
                     ("Description", "Genotype (0/1, 0/0)")]))
    header_out.add_format_line(
        OrderedDict([
            ("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
            ("Description",
             "Filtered read depth (reads with MAPQ < 30, indels and gaps are filtered)"
             )
        ]))
    header_out.add_format_line(
        OrderedDict([("ID", "RD"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Reference allele read depth")]))
    header_out.add_format_line(
        OrderedDict([("ID", "AD"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Alternate allele read depth")]))
    header_out.add_format_line(
        OrderedDict([
            ("ID", "AF"), ("Number", "1"), ("Type", "Float"),
            ("Description",
             "Allele frequency: AD/(RD+AD). Other alleles, in case of mutli-allelic regions, are ignored."
             )
        ]))

    # read input vcf
    with vcfpy.Reader.from_path(invcf) as reader:
        # info header lines
        header_out.add_info_line(
            OrderedDict([("ID", "SUPP"), ("Number", "1"),
                         ("Type", "Integer"),
                         ("Description",
                          "Number of cells supporting the mutation.")]))

        # Use input FORMAT lines as output INFO lines.  Example of a line:
        #   key   = 'FORMAT'
        #   value = '<ID=AD,Number=R,Type=Integer,Description="Allelic depths
        #            for the ref and alt alleles in the order listed">'
        for format_id in reader.header.format_ids():
            format_line = reader.header.get_format_field_info(format_id)
            mapping = str_to_mapping(format_line.value)
            mapping["Description"] = "(Info about bulk mutation)" + mapping[
                "Description"]
            # add the mapping carrying the prefixed description; re-parsing
            # the line here would silently drop the prefix again
            header_out.add_info_line(mapping)

        # Open the BAM file (once) before the output, so that an unreadable
        # BAM does not leave an empty output file behind; both are closed
        # when the block exits, also on error.
        with pysam.AlignmentFile(bam, "rb") as samfile, \
                vcfpy.Writer.from_path(outvcf, header_out) as writer:
            # for each mutation in the vcf file
            for record_in in reader:
                # optional genotype filter on the (single) input sample
                if gt_filter and record_in.calls[0].data.get('GT') != gt:
                    continue

                # filter out indels: only interested in snvs in this analysis phase
                if not record_in.is_snv():
                    continue

                d = samples_dict(samples)
                supp = 0
                chrom = record_in.CHROM
                pos = record_in.POS - 1  # to correct on 1-based positions
                ref = record_in.REF
                # record.ALT is a list by construction which contains only one value
                alt = record_in.ALT[0].value

                # look for the pileup in the samfile at position (chrom,pos)
                for pileupcolumn in samfile.pileup(chrom,
                                                   pos,
                                                   pos + 1,
                                                   stepper='all',
                                                   truncate=True,
                                                   max_depth=10000):
                    for base in pileupcolumn.pileups:
                        # .is_del -> the base is a deletion?
                        # .is_refskip -> the base is a N in the CIGAR string ?
                        if base.is_del or base.is_refskip:
                            continue
                        if base.alignment.mapping_quality < 30:
                            continue

                        tags = list_to_dict(base.alignment.tags)
                        if "CB" not in tags:
                            # reads with no error-corrected barcode are discarded
                            continue
                        cb = tags["CB"].split("-")[0]  # 10x barcodes
                        if cb not in known_barcodes:
                            # the barcode has not been labeled as belonging
                            # to a cell by cellranger (floating DNA)
                            continue

                        # update info for the sample identified by CB
                        d[cb]['dp'] += 1
                        read_base = base.alignment.query_sequence[
                            base.query_position]
                        if read_base == alt:
                            d[cb]['ad'] += 1
                        elif read_base == ref:
                            d[cb]['rd'] += 1

                # finalise the counters and generate one call per sample/cell
                calls = []
                for cb in d.keys():
                    cell = d[cb]
                    if cell['ad'] > 0:
                        supp += 1
                        # temporary, all the supported mutations are set to 0/1
                        cell['gt'] = "0/1"
                        cell['af'] = cell['ad'] / (cell['rd'] + cell['ad'])
                    calls.append(
                        vcfpy.Call(
                            cb,
                            OrderedDict([("GT", cell['gt']),
                                         ("DP", cell['dp']),
                                         ("RD", cell['rd']),
                                         ("AD", cell['ad']),
                                         ("AF", cell['af'])])))

                # create a mapping between each FORMAT entry and the
                # corresponding value, in the call, in the input vcf file
                # note that the input vcf contains only one sample, so
                # the calls field of each record contains only one entry
                info_d = {}
                info_d['SUPP'] = supp
                for f in record_in.FORMAT:
                    info_d[f] = record_in.calls[0].data.get(f)

                filter_l = [gt] if gt_filter else []

                # build and write the output record
                record_out = vcfpy.Record(
                    CHROM=chrom,
                    POS=pos + 1,
                    ID=[],
                    REF=ref,
                    ALT=[vcfpy.Substitution(type_="SNV", value=alt)],
                    QUAL=None,
                    FILTER=filter_l,
                    INFO=info_d,
                    FORMAT=["GT", "DP", "RD", "AD", "AF"],
                    calls=calls)
                writer.write_record(record_out)
Beispiel #10
0
def main():
    """Write a demonstration VCF file with one record for two samples.

    Command line: ``output.vcf``.  The header holds ``source``,
    ``fileformat`` and ``fileDate`` lines, a PASS filter line, INFO lines DP
    and MUT, FORMAT lines GT and DP, a contig line for chr1 and a SAMPLE line
    for Sample1.  One SNV record with calls for Sample1 and Sample2 is
    written to the output path.
    """
    parser = argparse.ArgumentParser(description="vcf writer")
    parser.add_argument("output",
                        type=str,
                        action='store',
                        metavar='output.vcf',
                        help='vcf file.')
    outvcf = parser.parse_args().output

    def numbered_field(field_id, value_type, description):
        """Mapping of an INFO/FORMAT header line with Number "1"."""
        return OrderedDict([("ID", field_id), ("Number", "1"),
                            ("Type", value_type),
                            ("Description", description)])

    # --- creating the header ---
    # Besides the typed lines (INFO, FORMAT, FILTER, ...) a header can hold
    # general key/value lines; here they record the generating program, the
    # file format and the creation date.  The analysed sample names go into
    # the samples of the header.
    general_lines = [
        vcfpy.HeaderLine(key="source", value=sys.argv[0]),
        vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
        vcfpy.HeaderLine(key="fileDate",
                         value=date.today().strftime("%d/%m/%Y")),
    ]
    header = vcfpy.Header(lines=general_lines,
                          samples=vcfpy.SamplesInfos(["Sample1", "Sample2"]))

    # Reference notes on valid header entries:
    #   INFO value types:   "Integer", "Float", "Flag", "Character", "String"
    #   FORMAT value types: "Integer", "Float", "Character", "String"
    #   "Number" entries besides integers:
    #     "A" number of alleles excluding reference
    #     "R" number of alleles including reference
    #     "G" number of genotypes
    #     "." unbounded number of values
    #   header lines that contain an "ID" entry:
    #     "ALT", "contig", "FILTER", "FORMAT", "INFO", "META", "PEDIGREE",
    #     "SAMPLE"

    # adding filter lines
    header.add_filter_line(
        OrderedDict([("ID", "PASS"), ("Description", "All filters passed")]))

    # adding info lines
    header.add_info_line(
        numbered_field("DP", "Integer",
                       "Raw read depth (without mapping quality filters)"))
    header.add_info_line(
        numbered_field(
            "MUT", "Integer",
            "States if the record mutation is supported (1) or not (0)."))

    # adding format lines
    header.add_format_line(numbered_field("GT", "String", "Genotype"))
    header.add_format_line(
        numbered_field("DP", "Integer", "Filtered read depth (MAPQ > 30)"))

    # adding contig lines
    header.add_contig_line(
        OrderedDict([("ID", "chr1"), ("length", "248956422")]))

    # adding sample lines
    sample_mapping = OrderedDict([("ID", "Sample1"),
                                  ("Description", "Tumor")])
    header.add_line(vcfpy.SampleHeaderLine.from_mapping(sample_mapping))

    # writing the vcf
    with vcfpy.Writer.from_path(outvcf, header) as writer:
        # creating one record with one call per sample
        calls = [
            vcfpy.Call(name, OrderedDict([("GT", "0/1"), ("DP", depth)]))
            for name, depth in (("Sample1", "47"), ("Sample2", "31"))
        ]
        record = vcfpy.Record(
            CHROM="1",
            POS=1,
            ID=[],
            REF="C",
            ALT=[vcfpy.Substitution(type_="SNV", value="G")],
            QUAL=None,
            FILTER=["PASS"],
            INFO={"DP": "50", "MUT": 0},
            FORMAT=["GT", "DP"],
            calls=calls)
        writer.write_record(record)
Beispiel #11
0
def main():
    """Check which SNVs of a VCF file are supported by the reads of a BAM file.

    Positional command-line arguments: the BAM file, the VCF file listing the
    SNVs, the sample identifier and the output prefix.  The result is written
    to ``<out_prefix>.snpseeker.vcf``.

    Non-SNV records of the input VCF are skipped.  For every SNV the pileup
    column at its position is scanned; deletions, reference skips and reads
    with a mapping quality below 30 are ignored.  The remaining reads are
    counted into DP, and into RD / AD when they carry the reference / first
    alternate base.  A mutation with AD > 0 is reported as supported
    (``SUPP=1``, ``GT=0/1``), otherwise as ``SUPP=0`` and ``GT=0/0``.  The
    FORMAT values of the first call of the input record are carried over as
    INFO fields of the output record.
    """
    parser = argparse.ArgumentParser(description="Looks for a given set of SNPs whithin a bam file.")
    parser.add_argument("bam", metavar='sample.bam', action='store',
                        help='BAM file.', type=str)
    parser.add_argument("vcf", metavar='file.vcf', action='store',
                        help="VCF file storing SNPs.", type=str)
    parser.add_argument("sample_name", metavar='sample1', action='store',
                        help="Sample identifier.", type=str)
    parser.add_argument("out_prefix", metavar="outdir/sample", action="store",
                        help="Output VCF file prefix.", type=str)

    args = parser.parse_args()
    bam = args.bam
    invcf = args.vcf
    sample = args.sample_name
    outvcf = args.out_prefix + ".snpseeker.vcf"

    # Build the header of the output VCF.
    header_out = vcfpy.Header(
        lines=[
            vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
            vcfpy.HeaderLine(key="source", value=sys.argv[0]),
            vcfpy.HeaderLine(key="fileDate", value=date.today().strftime("%d/%m/%Y")),
        ],
        samples=vcfpy.SamplesInfos([sample]),
    )

    # Sample header line.
    header_out.add_line(vcfpy.HeaderLine(key="SampleName", value=sample))

    # INFO header line.
    header_out.add_info_line(OrderedDict([("ID", "SUPP"), ("Number", "1"), ("Type", "Integer"), ("Description", "States if the mutation is supported (1) or not (0).")]))

    # FORMAT header lines: (ID, Type, Description); Number is always 1.
    format_fields = (
        ("GT", "String", "Genotype (0/1, 0/0)"),
        ("SDP", "Integer", "Samtools read depth (secondary alignments, PCR duplicates, unppammed reads and reads not passing vendor QC are filtered)"),
        ("DP", "Integer", "Filtered read depth (reads with MAPQ < 30, indels and gaps are filtered)"),
        ("RD", "Integer", "Reference allele read depth"),
        ("AD", "Integer", "Alternate allele read depth"),
        ("AF", "Float", "Allele frequency: AD/(RD+AD). Other alleles, in case of mutli-allelic regions, are ignored."),
    )
    for field_id, field_type, description in format_fields:
        header_out.add_format_line(OrderedDict([("ID", field_id), ("Number", "1"), ("Type", field_type), ("Description", description)]))

    # The BAM file is opened exactly once; every handle is released even when
    # an error interrupts the processing.
    with pysam.AlignmentFile(bam, "rb") as samfile, \
            vcfpy.Reader.from_path(invcf) as reader:

        # Every FORMAT field of the input VCF becomes an INFO field of the
        # output VCF.  format_line.value looks like
        # '<ID=AD,Number=R,Type=Integer,Description="...">'.
        for format_id in reader.header.format_ids():
            format_line = reader.header.get_format_field_info(format_id)
            mapping = str_to_mapping(format_line.value)
            mapping["Description"] = "(Info about mutation in the original vcf)" + mapping["Description"]
            # Register the mapping carrying the prefixed description (a
            # freshly parsed mapping would drop the prefix again).
            header_out.add_info_line(mapping)

        with vcfpy.Writer.from_path(outvcf, header_out) as writer:
            for record_in in reader:
                # Filter out indels: only SNVs are of interest in this phase.
                if not record_in.is_snv():
                    continue
                chrom = record_in.CHROM
                pos = record_in.POS - 1  # VCF is 1-based, the pileup is 0-based
                ref = record_in.REF
                # record.ALT is a list holding a single value for an SNV.
                alt = record_in.ALT[0].value

                # Reset the counters for every record so that a position
                # without any pileup column yields zeros instead of stale or
                # undefined values.
                sdp = 0  # number of reads reported by the pileup column
                dp = 0   # reads passing the filters below
                rd = 0   # filtered reads carrying the reference base
                ad = 0   # filtered reads carrying the alternate base

                # truncate=True restricts the pileup to [pos, pos + 1), i.e.
                # to at most one column.
                for pileupcolumn in samfile.pileup(chrom, pos, pos + 1, stepper='all', truncate=True, max_depth=10000):
                    sdp = pileupcolumn.n
                    for base in pileupcolumn.pileups:
                        # Ignore deletions, reference skips (N in the CIGAR
                        # string) and reads with a mapping quality below 30.
                        if base.is_del or base.is_refskip or base.alignment.mapping_quality < 30:
                            continue
                        dp += 1
                        read_base = base.alignment.query_sequence[base.query_position]
                        if read_base == alt:
                            ad += 1
                        elif read_base == ref:
                            rd += 1

                if ad > 0:
                    af = ad / (rd + ad)
                    supp = 1
                    gt = "0/1"  # temporary, all the supported mutations are set to 0/1
                else:
                    af = 0.0
                    supp = 0
                    gt = "0/0"

                info_d = {'SUPP': supp}
                for f in record_in.FORMAT:
                    info_d[f] = record_in.calls[0].data.get(f)

                record_out = vcfpy.Record(
                    CHROM=chrom, POS=pos + 1, ID=[], REF=ref,
                    ALT=[vcfpy.Substitution(type_="SNV", value=alt)],
                    QUAL=None, FILTER=[], INFO=info_d,
                    FORMAT=["GT", "SDP", "DP", "RD", "AD", "AF"],
                    calls=[vcfpy.Call(sample, OrderedDict([("GT", gt), ("SDP", sdp), ("DP", dp), ("RD", rd), ("AD", ad), ("AF", af)]))],
                )
                writer.write_record(record_out)
Beispiel #12
0
def main():
    """Collapse a single-cell VCF into a VCF with one sample per cluster.

    Positional command-line arguments: the single-cell VCF file, the clusters
    list (a CSV file read with pandas, providing the ``cellid`` and
    ``cluster`` columns) and the output prefix.  The result is written to
    ``<outprefix>_clusters.vcf``.

    For every input record the DP, RD and AD values of the cells belonging to
    a cluster are summed.  A cluster with a summed AD > 0 gets ``GT=0/1`` and
    ``AF = AD / (RD + AD)``; the record's ``SUPP`` INFO field is 1 when at
    least one cluster supports the mutation and 0 otherwise.
    """
    parser = argparse.ArgumentParser(description="From single cell VCF to clones vcf.")
    parser.add_argument("input1", metavar="sample.muts.vcf", action="store", help="Single cell VCF file.", type=str)
    parser.add_argument("input2", metavar="clusters.list", action="store", help="Clusters list.", type=str)
    parser.add_argument("outprefix", metavar="out/path/prefix", action="store", help="Output prefix", type=str)

    args = parser.parse_args()
    input1 = args.input1
    input2 = args.input2
    prefix = args.outprefix

    clusters_df = pd.read_csv(input2)

    # The cluster ids and the cells of each cluster do not depend on the
    # record, so they are computed once instead of once per record.
    cluster_ids = clusters_df['cluster'].unique()
    clusters = [str(cluster) for cluster in cluster_ids]
    cells_by_cluster = {
        c: list(clusters_df['cellid'][clusters_df['cluster'] == c])
        for c in cluster_ids
    }

    # Create the output header.
    header_out = vcfpy.Header(
        lines=[
            vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
            vcfpy.HeaderLine(key="source", value=sys.argv[0]),
            vcfpy.HeaderLine(key="fileDate", value=date.today().strftime("%d/%m/%Y")),
        ],
        samples=vcfpy.SamplesInfos(clusters),
    )

    # FORMAT header lines: (ID, Type, Description); Number is always 1.
    format_fields = (
        ("GT", "String", "Genotype (0/1, 0/0)"),
        ("DP", "Integer", "Filtered read depth (reads with MAPQ < 30, indels and gaps are filtered)"),
        ("RD", "Integer", "Reference allele read depth"),
        ("AD", "Integer", "Alternate allele read depth"),
        ("AF", "Float", "Allele frequency: AD/(RD+AD). Other alleles, in case of mutli-allelic regions, are ignored."),
    )
    for field_id, field_type, description in format_fields:
        header_out.add_format_line(OrderedDict([("ID", field_id), ("Number", "1"), ("Type", field_type), ("Description", description)]))

    # INFO header line.
    header_out.add_info_line(OrderedDict([("ID", "SUPP"), ("Number", "1"), ("Type", "Integer"), ("Description", "Whether the mutation is supported or not.")]))

    # Both files are closed even when an error interrupts the processing.
    with vcfpy.Reader.from_path(input1) as reader, \
            vcfpy.Writer.from_path(prefix + "_clusters.vcf", header_out) as writer:
        for record_in in reader:
            # Fresh per-cluster accumulators ('dp', 'rd', 'ad', 'gt', 'af')
            # for this record.
            d = samples_dict(cluster_ids)
            supp = 0

            # For each cluster compute 'GT:DP:RD:AD:AF'.
            for c in cluster_ids:
                stats = d[c]
                # Sum total, reference and alternate read counts over the
                # cells of the cluster; a missing value counts as 0 instead
                # of failing on ``int + None``.
                for cell in cells_by_cluster[c]:
                    data = record_in.call_for_sample[cell].data
                    stats['dp'] += data.get('DP') or 0
                    stats['rd'] += data.get('RD') or 0
                    stats['ad'] += data.get('AD') or 0

                if stats['ad'] > 0:
                    stats['gt'] = "0/1"
                    stats['af'] = stats['ad'] / (stats['rd'] + stats['ad'])
                    supp = 1

            # Create one call for each cluster.
            calls = [
                vcfpy.Call(str(c), OrderedDict([("GT", stats['gt']), ("DP", stats['dp']), ("RD", stats['rd']), ("AD", stats['ad']), ("AF", stats['af'])]))
                for c, stats in d.items()
            ]

            # Write the new record; every record is emitted as an SNV
            # carrying the first ALT allele of the input record.
            record_out = vcfpy.Record(
                CHROM=record_in.CHROM, POS=record_in.POS, ID=[], REF=record_in.REF,
                ALT=[vcfpy.Substitution(type_="SNV", value=record_in.ALT[0].value)],
                QUAL=None, FILTER=[], INFO={"SUPP": supp},
                FORMAT=["GT", "DP", "RD", "AD", "AF"],
                calls=calls,
            )
            writer.write_record(record_out)
Beispiel #13
0
def _build_sample_calls(samples, contig_loci):
    """Return one ``vcfpy.Call`` per sample for an insertion on one contig.

    Samples present in ``contig_loci`` get the phased genotype built from
    their ``"1"`` and ``"2"`` entries and their ``"ps"`` value; every other
    sample gets ``0/0`` with ``PS=0``.
    """
    calls = []
    for sample in samples:
        sample_gt = "0/0"
        ps = 0
        if sample in contig_loci:
            phased = contig_loci[sample]
            sample_gt = phased["1"] + "|" + phased["2"]
            ps = phased["ps"]
        calls.append(vcfpy.Call(sample = sample,
                                data = vcfpy.OrderedDict(GT = sample_gt, PS = ps)))
    return calls


def extract_consensus_insertions(contig_path, cons_path, ref_fasta_path, vcf_out_path, vcf_template_path, min_insertion_size, flank_length, flanked_contigs_path):
    """Write large insertions found in consensus contigs to a VCF file.

    Every sequence of the consensus FASTA is aligned against the reference
    interval encoded in its name (``<chrom>_<start>_<end>``).  The CIGAR of
    the longest alignment is walked and every insertion longer than
    ``min_insertion_size`` is written as one VCF record, together with a
    FASTA entry of the inserted sequence surrounded by reference flanks.

    Args:
        contig_path: input passed to ``collect_genotypes`` to obtain the
            samples and the per-contig phased genotypes.
        cons_path: FASTA file with the consensus contigs.
        ref_fasta_path: reference FASTA file.
        vcf_out_path: path of the VCF file to write.
        vcf_template_path: VCF file whose header is reused for the output.
        min_insertion_size: insertions must be strictly longer than this.
        flank_length: number of reference bases added on each side of the
            insertion in the flanked FASTA; no flanks when not positive.
        flanked_contigs_path: path of the flanked FASTA file to write.

    Returns:
        The number of VCF records written.
    """
    n_records = 0

    # All handles are context-managed so that the output files are flushed
    # and closed even when an error interrupts the processing.
    with pysam.FastaFile(cons_path) as cons_fasta, \
            pysam.FastaFile(ref_fasta_path) as ref_fasta, \
            open(flanked_contigs_path, "w") as flanked_contig_fasta:

        (samples, loci) = collect_genotypes(contig_path)
        print("Found", len(samples), "samples for", len(loci), "phased loci")

        # Only the header of the template is needed.
        with vcfpy.Reader.from_path(vcf_template_path) as reader:
            header = reader.header
        header.samples = vcfpy.SamplesInfos(list(samples))

        with vcfpy.Writer.from_path(vcf_out_path, header) as writer:
            for contig in cons_fasta.references:
                # Parse the coordinates from the right so that chromosome
                # names containing underscores are kept intact.
                (chrom, start, end) = contig.rsplit("_", 2)
                (start, end) = int(start), int(end)

                cons_seq = cons_fasta.fetch(contig)
                ref_seq = ref_fasta.fetch(chrom, start, end)

                aligner = mappy.Aligner(seq = ref_seq, preset = None , k = 15, w = 10, n_threads = 1,
                                        max_join_long = 20000, max_join_short = 10000, min_join_flank_sc = 10,
                                        min_join_flank_ratio = 0.1, max_gap = 10000, bw = 2000, end_bonus = 10,
                                        zdrop = 10000, zdrop_inv = 1000,
                                        scoring = (2, 4, 4, 10, 300, 0, 1),
                                        extra_flags = 0x1)
                alignments = list(aligner.map(cons_seq, seq2 = None, cs = True, MD = False))

                if not alignments:
                    print("No hits in", contig)
                    continue

                # Keep the alignment with the largest block length.
                aln = max(alignments, key = lambda x: x.blen)
                cig = cigar.Cigar(aln.cigar_str)

                target_pos = aln.r_st
                if aln.strand == -1:
                    # The CIGAR follows the reference, i.e. the reverse
                    # complement of the consensus; q_st / q_en refer to the
                    # forward strand, so the walk starts at the mirrored end.
                    cons_seq = str(Bio.Seq.Seq(cons_seq).reverse_complement())
                    strand = "-"
                    cons_pos = len(cons_seq) - aln.q_en
                else:
                    strand = "+"
                    cons_pos = aln.q_st

                for (length, op_type) in cig.items():
                    # matches / mismatches consume both sequences
                    if op_type in ('M', '=', 'X'):
                        cons_pos += length
                        target_pos += length
                        continue

                    # deletions and skips consume the reference only
                    if op_type in ('D', 'N'):
                        target_pos += length
                        continue

                    if op_type != 'I':
                        continue

                    # Only large insertions are reported.  An insertion at
                    # target_pos == 0 has no preceding base inside ref_seq
                    # (index -1 would silently wrap to the last base), so it
                    # cannot be anchored and is not reported.
                    if length > min_insertion_size and target_pos > 0:
                        # ref_seq is 0-based, hence ref_seq[target_pos - 1]
                        # is the base at the 1-based position break_point.
                        ref_allele = ref_seq[target_pos - 1]
                        alt_allele = cons_seq[cons_pos:cons_pos + length]
                        break_point = start + target_pos

                        rec = vcfpy.Record(CHROM = chrom, POS = break_point, ID = [contig + "_" + str(cons_pos)],
                                           REF = ref_allele, ALT = [vcfpy.Substitution("INS", ref_allele + alt_allele)],
                                           QUAL = 999, FILTER = ["PASS"],
                                           INFO = vcfpy.OrderedDict(SVLEN = length,
                                                                    CIGAR = [str(cig)],
                                                                    STRAND = strand,
                                                                    CONTIG_START = str(aln.q_st)),
                                           FORMAT = ["GT", "PS"],
                                           calls = _build_sample_calls(samples, loci[contig]))

                        # output the record describing this insertion
                        writer.write_record(rec)

                        # output the same insertion with flanking sequences;
                        # note, the interval is [start, end[
                        if flank_length > 0:
                            # clamp so that a break point close to the start
                            # of the chromosome does not yield a negative
                            # coordinate
                            left_flank = ref_fasta.fetch(chrom, max(0, break_point - flank_length), break_point)
                            right_flank = ref_fasta.fetch(chrom, break_point, break_point + flank_length)
                        else:
                            left_flank = ""
                            right_flank = ""

                        # NOTE(review): alt_allele holds only the inserted
                        # bases (the anchor base is not part of it), so
                        # [1:] drops the first inserted base - confirm that
                        # this is intended.
                        flanked_contig_fasta.writelines([ ">" + contig + "_" + str(cons_pos) + "\n",
                                                         left_flank + alt_allele[1:] + right_flank + "\n"])

                        n_records += 1

                    cons_pos += length

    return n_records
Beispiel #14
0
# Accessions (keys of genoSample) in sorted order; they become the sample
# columns of the VCF header built below.
# NOTE(review): despite the name no filtering happens here - presumably the
# low-frequency-only accessions were already dropped when genoSample was
# built; confirm.
filteredEPIs = sorted(genoSample.keys())
logging.info(
    "genotypes collected for : n = %s isolates, at %s",
    len(filteredEPIs),
    datetime.datetime.now(),
)

#############################
# construct VCF records
#############################
timePre = datetime.datetime.now()  # timestamp taken before the VCF records are built
varCt = 0  # starts at zero; presumably a running count of variants - confirm

header = vcfpy.Header(
    samples=vcfpy.SamplesInfos(filteredEPIs),
    lines=[
        vcfpy.HeaderLine('fileformat', 'VCFv4.0'),
        vcfpy.HeaderLine('fileDate', str(datetime.datetime.now())),
        vcfpy.HeaderLine('source', parser.prog),
        vcfpy.ContigHeaderLine('contig', '<ID=String,Length=Integer>', {
            'ID': 'EPI_ISL_406030',
            'length': 29903
        }),
        vcfpy.InfoHeaderLine(
            'INFO',
            '<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">',
            {
                'ID': 'NS',
                'Number': 1,
                'Type': 'Integer',