Example #1
import argparse
import sys
from collections import OrderedDict
from datetime import date

import vcfpy


def main():

    parser = argparse.ArgumentParser(description="vcf writer")
    parser.add_argument("input", metavar='input.vcf', action='store',
                            help='vcf file.', type=str)
    parser.add_argument("output", metavar='output.vcf', action='store',
                            help='vcf file.', type=str)

    args = parser.parse_args()

    outvcf = args.output
    invcf = args.input
    
    
    #########################
    #                       #
    #  creating the header  #
    #                       #
    #########################

    # The header can contain some fixed-type lines (INFO, FORMAT, FILTER, etc.) and some general ones.
    # In this case, the header will contain a line storing the name of the program which generated
    # the file. We also add the name of the sample that has been analyzed.

    header = vcfpy.Header(lines=[vcfpy.HeaderLine(key="source", value=sys.argv[0]), vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"), vcfpy.HeaderLine(key="fileDate", value=date.today().strftime("%d/%m/%Y")) ], samples=vcfpy.SamplesInfos(["Sample1"]))

    
    # adding format lines 
    header.add_format_line(OrderedDict([("ID", "GT"),("Number", "1"), ("Type","String"), ("Description", "Genotype")]))
    header.add_format_line(OrderedDict([("ID", "DP"),("Number", "1"), ("Type","Integer"), ("Description", "Filtered read depth (MAPQ > 30)")]))

    # read the input vcf
    with vcfpy.Reader.from_path(invcf) as reader:

        # get the FORMAT header lines of the input file
        # and convert them in INFO header lines of the output file 
        format_ids = reader.header.format_ids()
        for format_id in format_ids:
            format_line = reader.header.get_format_field_info(format_id)
            '''
            output example:
        
            FormatHeaderLine('FORMAT', '<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">', {'ID': 'AD', 'Number': 'R', 'Type': 'Integer', 'Description': 'Allelic depths for the ref and alt alleles in the order listed'})

            key = 'FORMAT'
            value = '<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
            '''
            header.add_info_line(str_to_mapping(format_line.value))
            #print(header)
    
    # write the vcf
    with vcfpy.Writer.from_path(outvcf, header) as writer:
        
        # creating one record
        record = vcfpy.Record(
                CHROM="1", POS=1, ID=[], REF="C", ALT=[vcfpy.Substitution(type_="SNV", value="G")], QUAL=None, FILTER=[], INFO={}, FORMAT=["GT", "DP"], calls=[vcfpy.Call("Sample1", OrderedDict([("GT", "0/1"),("DP", "47")]))]
       )
        #print(record)
        writer.write_record(record)
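Examples #1, #15 and #16 call a str_to_mapping() helper that is defined elsewhere in the original scripts and is not shown here. The following is only a minimal sketch of what such a helper might look like, assuming all it has to do is turn the value of a FORMAT header line (e.g. <ID=AD,Number=R,Type=Integer,Description="...">) into an OrderedDict that add_info_line() accepts; it is an assumption, not part of the vcfpy API.

import re
from collections import OrderedDict

def str_to_mapping(value):
    # Hypothetical helper: parse a header-line value such as
    # '<ID=AD,Number=R,Type=Integer,Description="Allelic depths ...">'
    # into an OrderedDict with ID/Number/Type/Description keys.
    mapping = OrderedDict()
    body = value.strip().lstrip("<").rstrip(">")
    # quoted Description values may contain commas, so match them as one unit
    for key, val in re.findall(r'(\w+)=("[^"]*"|[^,]+)', body):
        mapping[key] = val.strip('"')
    return mapping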
Example #2
 def _open(self):
     # Setup header
     lines = [
         vcfpy.HeaderLine("fileformat", "VCFv4.2"),
         vcfpy.FormatHeaderLine.from_mapping({
             "ID": "AD",
             "Number": "R",
             "Type": "Integer",
             "Description": "Allelic depths for the ref and alt alleles in the order listed",
         }),
         vcfpy.FormatHeaderLine.from_mapping({
             "ID": "DP",
             "Number": "1",
             "Type": "Integer",
             "Description": "Approximate read depth at the locus",
         }),
         vcfpy.FormatHeaderLine.from_mapping({
             "ID": "GQ",
             "Number": "1",
             "Type": "Integer",
             "Description": "Phred-scaled genotype quality",
         }),
         vcfpy.FormatHeaderLine.from_mapping({
             "ID": "GT",
             "Number": "1",
             "Type": "String",
             "Description": "Genotype",
         }),
     ]
     # Add header lines for contigs.
     # TODO: switch based on release in case
     for name, length in CONTIGS_GRCH37:
         lines.append(
             vcfpy.ContigHeaderLine.from_mapping({
                 "ID": name,
                 "length": length
             }))
     header = vcfpy.Header(lines=lines,
                           samples=vcfpy.SamplesInfos(self.members))
     # Open VCF writer
     self.vcf_writer = vcfpy.Writer.from_path(self.tmp_file.name, header)
Example #3
 def __init__(self, input_vcf, info_fields, sample_fields, caller_priority,
              output_vcf):
     self.reader = vcfpy.Reader.from_path(input_vcf)
     self.info_fields = info_fields
     self.sample_fields = sample_fields
     self.caller_priority = caller_priority
     self.write_header = vcfpy.Header(samples=self.reader.header.samples)
     self.add_file_format()
     self.select_contig_header()
     self.select_filter_header()
     self.select_info_header()
     self.select_format_header()
     self.records = self.select_record_fields()
     self.output_vcf = output_vcf
     self.write_merged(self.records)
Example #4
 def __init__(self, readers, callers, output_vcf):
     self.readers = readers
     self.callers = callers
     self.samples = list(set([name for reader in self.readers for name in reader.header.samples.names]))
     # TODO: multi-sample? using first vcf samples here
     self.merge_header = vcfpy.Header(samples=[reader.header.samples for reader in self.readers][0])
     self.add_file_format()
     self.merge_contig_header()
     self.add_caller_filter_header()
     self.merge_filter_header()
     self.merge_info_header()
     self.merge_format_header()
     self.records = self.merge_records()
     self.output_vcf = output_vcf
     self.write_merged(self.records)
Example #5
def main():
    if len(sys.argv) != 2:
        print("Usage: vcf_from_scratch.py OUTPUT.vcf", file=sys.stderr)
        return 1

    header = vcfpy.Header(samples=vcfpy.SamplesInfos([]))
    with vcfpy.Writer.from_path(sys.argv[1], header) as writer:
        record = vcfpy.Record(CHROM="1",
                              POS=1,
                              ID=[],
                              REF="N",
                              ALT=[],
                              QUAL=None,
                              FILTER=[],
                              INFO={},
                              FORMAT=[])
        writer.write_record(record)
Example #6
def create_vcf_writer(args, vcf_reader):
    if args.output_vcf:
        output_file = args.output_vcf
    else:
        (head, sep, tail) = args.input_vcf.rpartition('.vcf')
        output_file = ('').join([head, '.readcount.vcf', tail])
    new_header = vcfpy.Header(samples=vcf_reader.header.samples)
    if args.data_type == 'DNA':
        for line in vcf_reader.header.lines:
            if not (line.key == 'FORMAT' and line.id in ['DP', 'AD', 'AF']):
                new_header.add_line(line)
        new_header.add_format_line(
            OrderedDict([('ID', 'DP'), ('Number', '1'), ('Type', 'Integer'),
                         ('Description', 'Read depth')]))
        new_header.add_format_line(
            OrderedDict([
                ('ID', 'AD'), ('Number', 'R'), ('Type', 'Integer'),
                ('Description',
                 'Allelic depths for the ref and alt alleles in the order listed'
                 )
            ]))
        new_header.add_format_line(
            OrderedDict([('ID', 'AF'), ('Number', 'A'), ('Type', 'Float'),
                         ('Description',
                          'Variant-allele frequency for the alt alleles')]))
    if args.data_type == 'RNA':
        for line in vcf_reader.header.lines:
            if not (line.key == 'FORMAT' and line.id in ['RDP', 'RAD', 'RAF']):
                new_header.add_line(line)
        new_header.add_format_line(
            OrderedDict([('ID', 'RDP'), ('Number', '1'), ('Type', 'Integer'),
                         ('Description', 'RNA Read depth')]))
        new_header.add_format_line(
            OrderedDict([
                ('ID', 'RAD'), ('Number', 'R'), ('Type', 'Integer'),
                ('Description',
                 'RNA Allelic depths for the ref and alt alleles in the order listed'
                 )
            ]))
        new_header.add_format_line(
            OrderedDict([('ID', 'RAF'), ('Number', 'A'), ('Type', 'Float'),
                         ('Description',
                          'RNA Variant-allele frequency for the alt alleles')
                         ]))
    return vcfpy.Writer.from_path(output_file, new_header)
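A minimal, hypothetical invocation of this helper, assuming create_vcf_writer() and the vcfpy/OrderedDict imports it needs are already in scope; the attribute names mirror what the function reads from args, and my_sample.vcf is a placeholder path.

import argparse
import vcfpy

# hypothetical argument object with the attributes create_vcf_writer expects
args = argparse.Namespace(input_vcf="my_sample.vcf", output_vcf=None, data_type="DNA")
vcf_reader = vcfpy.Reader.from_path(args.input_vcf)
# output_vcf is not set, so the writer targets my_sample.readcount.vcf
writer = create_vcf_writer(args, vcf_reader)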
Example #7
def create_vcf_writer(args, vcf_reader):
    if args.output_vcf:
        output_file = args.output_vcf
    else:
        (head, sep, tail) = args.input_vcf.rpartition('.vcf')
        output_file = ('').join([head, '.genotype.vcf', tail])
    sample_info = vcf_reader.header.samples
    if args.sample_name in sample_info.names:
        append_to_existing_sample = True
    else:
        append_to_existing_sample = False
        sample_info.names.append(args.sample_name)
        sample_info.name_to_idx[args.sample_name] = len(sample_info.names) - 1
    new_header = vcfpy.Header(samples=sample_info)
    for line in vcf_reader.header.lines:
        if not (line.key == 'FORMAT' and line.id == 'GT'):
            new_header.add_line(line)
    new_header.add_format_line(
        OrderedDict([('ID', 'GT'), ('Number', '1'), ('Type', 'String'),
                     ('Description', 'Genotype')]))
    return (vcfpy.Writer.from_path(output_file,
                                   new_header), append_to_existing_sample)
Example #8
def build_header(contigs, species):
    header = vcfpy.Header()
    header.samples = vcfpy.SamplesInfos([])
    header.add_line(vcfpy.HeaderLine("fileformat", "VCFv4.2"))
    for name, length in contigs:
        header.add_contig_line({"ID": name, "length": length})
    header.add_line(vcfpy.HeaderLine("species", ",".join(species)))
    header.add_info_line({
        "ID": "END",
        "Description": "End position of the alignment",
        "Type": "Integer",
        "Number": 1,
    })
    header.add_info_line({
        "ID": "UCSC_GENE",
        "Description": "UCSC gene ID",
        "Type": "String",
        "Number": 1
    })
    header.add_info_line({
        "ID": "EXON",
        "Description": "Index of exon in transcript",
        "Type": "Integer",
        "Number": 1
    })
    header.add_info_line({
        "ID": "EXON_COUNT",
        "Description": "Number of exons in transcript",
        "Type": "Integer",
        "Number": 1,
    })
    header.add_info_line({
        "ID": "ALIGNMENT",
        "Description": "Amino acid alignment at this location",
        "Type": "String",
        "Number": 1,
    })
    return header
Example #9
def write_vcf(vcffilename, sample_name, records):
    """
    Generate a VCF with the given records and randomly generated genotypes

    Arguments:
    vcffilename - path to the generated file
    sample_name - name of the sample column to write
    records - list of vcfpy.Record describing the variants
    """
    lengths = [249250621, 243199373, 198022430, 191154276, 180915260,
               171115067, 159138663, 146364022, 141213431, 135534747,
               135006516, 133851895, 115169878, 107349540, 102531392,
               90354753,  81195210,  78077248,  59128983,  63025520,
               48129895,  51304566]

    samples = vcfpy.SamplesInfos([sample_name])
    header = vcfpy.Header(samples=samples)
    header.add_line(vcfpy.HeaderLine("fileformat", "VCFv4.3"))
    header.add_line(vcfpy.HeaderLine("fileDate", "20200901"))
    # contig IDs 1..22, matching the GRCh37 chromosome lengths listed above
    for chrom, length in enumerate(lengths, start=1):
        header.add_contig_line({"ID": str(chrom), "assembly": "GRCh37", "length": length})
    header.add_format_line({"ID":"GT", "Number":1, "Type":"String", "Description": "Genotype"})

    with open(vcffilename, 'wb') as vcffile:
        writer = vcfpy.Writer.from_stream(vcffile, header, samples, use_bgzf=True)
        for record in records:
            genotype = random.choice(['0/0', '0/1', '1/1'])
            newrecord = vcfpy.Record(record.CHROM,
                                     record.POS,
                                     record.ID,
                                     record.REF,
                                     record.ALT,
                                     record.QUAL,
                                     record.FILTER,
                                     record.INFO,
                                     ["GT"],
                                     calls=[vcfpy.record.Call(sample_name, {"GT": genotype})])
            writer.write_record(newrecord)
        writer.close()
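One possible way to call write_vcf(), assuming the function above and the imports it relies on (vcfpy, random) are in scope and that the records come from an existing VCF; input.vcf.gz and the output name are placeholders.

import vcfpy

# collect the records of an existing VCF and re-emit them with random genotypes
with vcfpy.Reader.from_path("input.vcf.gz") as reader:
    records = list(reader)
write_vcf("random_genotypes.vcf.gz", "Sample1", records)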
Example #10
def main():

    parser = argparse.ArgumentParser(description="vcf writer")
    parser.add_argument("output",
                        metavar='output.vcf',
                        action='store',
                        help='vcf file.',
                        type=str)

    args = parser.parse_args()

    outvcf = args.output

    #########################
    #                       #
    #  creating the header  #
    #                       #
    #########################

    # The header can contain some fixed-type lines (INFO, FORMAT, FILTER, etc.) and some general ones.
    # In this case, the header will contain a line storing the name of the program which generated
    # the file. We also add the names of the samples that have been analyzed.

    header = vcfpy.Header(lines=[
        vcfpy.HeaderLine(key="source", value=sys.argv[0]),
        vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
        vcfpy.HeaderLine(key="fileDate",
                         value=date.today().strftime("%d/%m/%Y"))
    ],
                          samples=vcfpy.SamplesInfos(["Sample1", "Sample2"]))

    # Tuples of valid entries -----------------------------------------------------
    #
    #: valid INFO value types
    # INFO_TYPES = ("Integer", "Float", "Flag", "Character", "String")
    #: valid FORMAT value types
    # FORMAT_TYPES = ("Integer", "Float", "Character", "String")
    #: valid values for "Number" entries, except for integers
    # VALID_NUMBERS = ("A", "R", "G", ".")
    #: header lines that contain an "ID" entry
    # LINES_WITH_ID = ("ALT", "contig", "FILTER", "FORMAT", "INFO", "META", "PEDIGREE", "SAMPLE")
    # Constants for "Number" entries ----------------------------------------------
    #
    #: number of alleles excluding reference
    # HEADER_NUMBER_ALLELES = "A"
    #: number of alleles including reference
    # HEADER_NUMBER_REF = "R"
    #: number of genotypes
    # HEADER_NUMBER_GENOTYPES = "G"
    #: unbounded number of values
    # HEADER_NUMBER_UNBOUNDED = "."

    # adding filter lines
    header.add_filter_line(
        OrderedDict([("ID", "PASS"), ("Description", "All filters passed")]))

    # adding info lines
    header.add_info_line(
        OrderedDict([("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
                     ("Description",
                      "Raw read depth (without mapping quality filters)")]))
    header.add_info_line(
        OrderedDict([
            ("ID", "MUT"), ("Number", "1"), ("Type", "Integer"),
            ("Description",
             "States if the record mutation is supported (1) or not (0).")
        ]))

    # adding format lines
    header.add_format_line(
        OrderedDict([("ID", "GT"), ("Number", "1"), ("Type", "String"),
                     ("Description", "Genotype")]))
    header.add_format_line(
        OrderedDict([("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Filtered read depth (MAPQ > 30)")]))
    #header.add_format_line(OrderedDict([vcfpy.header.RESERVED_FORMAT["GT"]]))

    # adding contig lines
    header.add_contig_line(
        OrderedDict([("ID", "chr1"), ("length", "248956422")]))

    # adding sample lines
    header.add_line(
        vcfpy.SampleHeaderLine.from_mapping(
            OrderedDict([("ID", "Sample1"), ("Description", "Tumor")])))

    # writing the vcf
    with vcfpy.Writer.from_path(outvcf, header) as writer:

        # creating one record
        calls = []
        calls.append(
            vcfpy.Call("Sample1", OrderedDict([("GT", "0/1"), ("DP", "47")])))
        calls.append(
            vcfpy.Call("Sample2", OrderedDict([("GT", "0/1"), ("DP", "31")])))

        record = vcfpy.Record(CHROM="1",
                              POS=1,
                              ID=[],
                              REF="C",
                              ALT=[vcfpy.Substitution(type_="SNV", value="G")],
                              QUAL=None,
                              FILTER=["PASS"],
                              INFO={
                                  "DP": "50",
                                  "MUT": 0
                              },
                              FORMAT=["GT", "DP"],
                              calls=calls)
        #record.add_format(key="GT")
        #record.calls.append(vcfpy.Call("Sample1", OrderedDict([("GT", "0|1")])))
        writer.write_record(record)
Example #11
def get_header(sample_name_to_header, chromosome_set):
    """
    Returns the header of the output VCF file
    :param sample_name_to_header: a dictionary from the sample names to the headers
    :param chromosome_set: the set of chromosomes selected for analysis
    :return: a vcfpy.Header
    """
    header = vcfpy.Header()

    header.add_line(vcfpy.HeaderLine(key="fileformat", value="VCFv4.2"))

    # CONTIG headers
    first_sample_header = next(iter(sample_name_to_header.values()))
    for input_header_line in first_sample_header.lines:
        if isinstance(input_header_line, vcfpy.ContigHeaderLine):
            if (chromosome_set is None
                    or input_header_line.mapping["ID"] in chromosome_set):
                header.add_line(input_header_line)

    # INFO fields
    header.add_info_line(
        vcfpy.OrderedDict(ID="END",
                          Number=1,
                          Type="Integer",
                          Description="Stop position of the interval"))
    header.add_info_line(
        vcfpy.OrderedDict(ID="SVTYPE",
                          Number=1,
                          Type="String",
                          Description="Type of structural variant"))
    header.add_info_line(
        vcfpy.OrderedDict(
            ID="INSSEQ",
            Number=1,
            Type="String",
            Description=
            "Insertion sequence of structural variant, not including sequence marked as duplication"
        ))
    header.add_info_line(
        vcfpy.OrderedDict(
            ID="TRANCHE2",
            Number=1,
            Type="String",
            Description=
            "Quality category of GRIDSS structural variant calls determined using FILTER,SRQ,AS,RAS. Values are LOW INTERMEDIATE HIGH"
        ))
    header.add_info_line(
        vcfpy.OrderedDict(
            ID="BNDVAF",
            Number=1,
            Type="Float",
            Description=
            "VAF of this gridss-called BND calculated as (SR+RP+IC+AS)/(REF+SR+RP+IC+AS)"
        ))
    # FORMAT fields
    header.add_format_line(
        vcfpy.OrderedDict(ID="GT",
                          Number=1,
                          Type="String",
                          Description="Genotype"))
    header.add_format_line(
        vcfpy.OrderedDict(
            ID="TRANCHE2",
            Number=1,
            Type="String",
            Description=
            "Quality category of GRIDSS structural variant calls determined using FILTER,SRQ,AS,RAS. Values are LOW INTERMEDIATE HIGH"
        ))
    header.add_format_line(
        vcfpy.OrderedDict(
            ID="BNDVAF",
            Number=1,
            Type="Float",
            Description=
            "VAF of this gridss-called BND calculated as (SR+RP+IC+AS)/(REFPAIR+SR+RP+IC+AS)"
        ))
    header.add_format_line(
        vcfpy.OrderedDict(
            ID="VAF",
            Number=1,
            Type="Float",
            Description=
            "VAF of this SV call, derived from BNDVAF values of BND calls used to call this SV"
        ))
    header.add_format_line(
        vcfpy.OrderedDict(
            ID="INSSEQ",
            Number=1,
            Type="String",
            Description=
            "Insertion sequence of structural variant, not including sequence marked as duplication"
        ))

    # Samples, sorted to ensure determinism
    sample_names = sample_name_to_header.keys()
    header.samples = vcfpy.SamplesInfos(sorted(sample_names))

    return header
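A sketch of how get_header() might be used, assuming one vcfpy header per sample VCF; the sample names and file paths are placeholders.

import vcfpy

# hypothetical per-sample input headers
sample_name_to_header = {
    "sampleA": vcfpy.Reader.from_path("sampleA.vcf").header,
    "sampleB": vcfpy.Reader.from_path("sampleB.vcf").header,
}
header = get_header(sample_name_to_header, chromosome_set={"chr1", "chr2"})
writer = vcfpy.Writer.from_path("merged_sv.vcf", header)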
Example #12
        bam_readcount_position = str(entry.POS)
        ref_base = reference
        var_base = alt
    return (bam_readcount_position, ref_base, var_base)


(script, vcf_filename, bam_readcount_filenames, samples_list,
 output_dir) = sys.argv

samples = samples_list.split(',')
bam_readcount_files = bam_readcount_filenames.split(',')
read_counts = parse_bam_readcount_file(bam_readcount_files, samples)

vcf_reader = vcfpy.Reader.from_path(vcf_filename)

new_header = vcfpy.Header(samples=vcf_reader.header.samples)
for line in vcf_reader.header.lines:
    if not (line.key == 'FORMAT' and line.id in ['DP', 'AD', 'AF']):
        new_header.add_line(line)
new_header.add_format_line(
    OrderedDict([('ID', 'DP'), ('Number', '1'), ('Type', 'Integer'),
                 ('Description', 'Read depth')]))
new_header.add_format_line(
    OrderedDict([
        ('ID', 'AD'), ('Number', 'R'), ('Type', 'Integer'),
        ('Description',
         'Allelic depths for the ref and alt alleles in the order listed')
    ]))
new_header.add_format_line(
    OrderedDict([('ID', 'AF'), ('Number', 'A'), ('Type', 'Float'),
                 ('Description',
                  'Variant-allele frequency for the alt alleles')]))
Example #13
    cmd_vt = path_vt+" decompose -s "+path_filteredSort_vcf+" | "+path_vt+" normalize -r "+pathFasta+" -o "+path_normalized_vcf+" -"
    process = subprocess.Popen([cmd_vt], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    out, err = process.communicate()
    if process.returncode!=0: exit("🅴 🆁 🆁 🅾 🆁\n[Nk_mergeVCF] Decompose & Normalize\n    "+err.decode('utf-8'))



#***** MERGE callers VCFs *****#
lst_caller_name = []
lst_contig_line = []
dico_filter_line = {}
dico_vcf = {}
pathMergeVCF = sample+"_Nk.vcf"
pathMergeUnsortedVCF = pathMergeVCF.replace(".vcf","_unsorted.vcf")
#***** INIT new vcf header *****#
new_header = vcfpy.Header(lines=None, samples=None)
new_header.add_line(vcfpy.HeaderLine("fileformat","VCFv4.2"))
new_header.add_line(vcfpy.HeaderLine("Nk_version",niourkVersion))
#***** BROWSE caller vcf *****#
for path_vcf in lst_vcf_sample:
    caller_name = os.path.basename(path_vcf).split("_")[2].replace(".vcf","")
    lst_caller_name.append(caller_name)
    path_normalized_vcf = path_vcf.replace(".vcf","_normalize.vcf")
    vcf_tool_reader = vcfpy.Reader.from_path(path_normalized_vcf)
    vcf_header = vcf_tool_reader.header
    #***** READ HEADERS *****#
    # check header sample
    if new_header.samples==None: new_header.samples = vcf_header.samples
    # check header filters
    for filter_line in vcf_header.get_lines("FILTER"):
        if not filter_line.id in dico_filter_line: dico_filter_line[filter_line.id] = filter_line.description
Example #14
varCt = 0

header = vcfpy.Header(
    samples=vcfpy.SamplesInfos(filteredEPIs),
    lines=[
        vcfpy.HeaderLine('fileformat', 'VCFv4.0'),
        vcfpy.HeaderLine('fileDate', str(datetime.datetime.now())),
        vcfpy.HeaderLine('source', parser.prog),
        vcfpy.ContigHeaderLine('contig', '<ID=String,Length=Integer>', {
            'ID': 'EPI_ISL_406030',
            'length': 29903
        }),
        vcfpy.InfoHeaderLine(
            'INFO',
            '<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">',
            {
                'ID': 'NS',
                'Number': 1,
                'Type': 'Integer',
                'Description': 'Number of Samples With Data'
            }),
        vcfpy.FormatHeaderLine(
            'FORMAT', '<ID=GT,Number=1,Type=String,Description="Genotype">', {
                'ID': 'GT',
                'Number': 1,
                'Type': 'String',
                'Description': 'Genotype'
            })
    ])

with vcfpy.Writer.from_path(args.vcf, header) as writer:
Example #15
def main():
    parser = argparse.ArgumentParser(
        description="Looks for a given set of SNPs whithin a bam file.")

    parser.add_argument("bam",
                        metavar='sample.bam',
                        action='store',
                        help='BAM file.',
                        type=str)

    parser.add_argument(
        "barcodes",
        metavar='barcodes.list',
        action='store',
        help=
        "File containing cell barcodes (the same used in the alignment file to identify cell reads).",
        type=str)

    parser.add_argument("vcf",
                        metavar='variants.vcf',
                        action='store',
                        help="VCF file storing BULK SNPs.",
                        type=str)

    parser.add_argument("sample_name",
                        metavar='sample1',
                        action='store',
                        help="Sample identifier.",
                        type=str)

    parser.add_argument("out_prefix",
                        metavar="outdir/sample",
                        action="store",
                        help="Output VCF file prefix.",
                        type=str)

    parser.add_argument(
        "--gt",
        metavar='1/1 (0/1)',
        choices=["0/0", "0/1", "1/1"],
        action='store',
        help=
        "Genotype filter: considers only mutations with the specified GT in the original vcf file.",
        type=str)

    args = parser.parse_args()
    bam = args.bam
    barcodes = args.barcodes
    invcf = args.vcf
    sample = args.sample_name
    outvcf = args.out_prefix + ".snpseeker.vcf"

    if args.gt:
        gt_filter = True
        gt = args.gt

    else:
        gt_filter = False

    with open(barcodes, "r") as f:
        samples = f.read().splitlines()
    #read bam file
    samfile = pysam.AlignmentFile(bam, "rb")

    #build the header of the output vcf
    header_out = vcfpy.Header(lines=[
        vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
        vcfpy.HeaderLine(key="source", value=sys.argv[0]),
        vcfpy.HeaderLine(key="fileDate",
                         value=date.today().strftime("%d/%m/%Y"))
    ],
                              samples=vcfpy.SamplesInfos(samples))

    # sample header lines
    header_out.add_line(
        vcfpy.SampleHeaderLine.from_mapping(
            OrderedDict([("ID", sample), ("Description", "Sample name")])))

    # filter header lines
    header_out.add_filter_line(
        OrderedDict([("ID", "1/1"), ("Number", "1"),
                     ("Description", "Filtered on such GT")]))
    header_out.add_filter_line(
        OrderedDict([("ID", "0/1"), ("Number", "1"),
                     ("Description", "Filtered on such GT")]))
    header_out.add_filter_line(
        OrderedDict([("ID", "0/0"), ("Number", "1"),
                     ("Description", "Filtered on such GT")]))

    #header_out.add_info_line(OrderedDict([("ID", "MUT"), ("Number", "1"), ("Type","Integer"), ("Description", "States if the record mutation is supported (1) or not (0).")]))

    # format header lines
    header_out.add_format_line(
        OrderedDict([("ID", "GT"), ("Number", "1"), ("Type", "String"),
                     ("Description", "Genotype (0/1, 0/0)")]))
    header_out.add_format_line(
        OrderedDict([
            ("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
            ("Description",
             "Filtered read depth (reads with MAPQ < 30, indels and gaps are filtered)"
             )
        ]))
    header_out.add_format_line(
        OrderedDict([("ID", "RD"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Reference allele read depth")]))
    header_out.add_format_line(
        OrderedDict([("ID", "AD"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Alternate allele read depth")]))
    header_out.add_format_line(
        OrderedDict([
            ("ID", "AF"), ("Number", "1"), ("Type", "Float"),
            ("Description",
             "Allele frequency: AD/(RD+AD). Other alleles, in case of mutli-allelic regions, are ignored."
             )
        ]))

    # read input vcf
    reader = vcfpy.Reader.from_path(invcf)

    # info header lines
    # Use input FORMAT lines as output INFO line
    header_out.add_info_line(
        OrderedDict([("ID", "SUPP"), ("Number", "1"), ("Type", "Integer"),
                     ("Description",
                      "Number of cells supporting the mutation.")]))

    format_ids = reader.header.format_ids()
    for format_id in format_ids:
        format_line = reader.header.get_format_field_info(format_id)
        '''
            output example:
        
            FormatHeaderLine('FORMAT', '<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">', {'ID': 'AD', 'Number': 'R', 'Type': 'Integer', 'Description': 'Allelic depths for the ref and alt alleles in the order listed'})
            key = 'FORMAT'
            value = '<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
        '''
        mapping = str_to_mapping(format_line.value)
        mapping["Description"] = "(Info about bulk mutation)" + mapping[
            "Description"]
        header_out.add_info_line(str_to_mapping(format_line.value))

    # open the output vcf
    writer = vcfpy.Writer.from_path(outvcf, header_out)

    #read bam file
    samfile = pysam.AlignmentFile(bam, "rb")

    #for each mutation in the vcf file
    for record_in in reader:
        d = samples_dict(samples)
        supp = 0
        # genotype filter: keep only records whose original call matches the requested GT
        if gt_filter:
            if record_in.calls[0].data.get('GT') != gt:
                continue

        # filter out indels: only interested in SNVs in this analysis phase
        if not record_in.is_snv():
            continue
        chrom = record_in.CHROM
        pos = record_in.POS - 1  # convert the 1-based VCF position to 0-based for pysam
        ref = record_in.REF
        # record_in.ALT is a list which, by construction, contains only one value (the mutation is a SNV)
        alt = record_in.ALT[0].value
        #line += [call.data.get('GT') or './.' for call in record.calls]

        #look for the pileup in the samfile at position (chrom,pos)
        for pileupcolumn in samfile.pileup(chrom,
                                           pos,
                                           pos + 1,
                                           stepper='all',
                                           truncate=True,
                                           max_depth=10000):
            for base in pileupcolumn.pileups:
                # .is_del -> the base is a deletion?
                # .is_refskip -> the base is a N in the CIGAR string ?
                if not base.is_del and not base.is_refskip and not base.alignment.mapping_quality < 30:
                    #iterate on cells
                    tags = list_to_dict(base.alignment.tags)
                    if "CB" not in tags.keys():
                        ''' reads with no error-corrected barcode are discarded '''
                        continue
                    elif tags["CB"].split("-")[0] not in samples:
                        ''' The barcode hasn't been labeled as belonging to a cell by cellranger (floating DNA)'''
                        continue
                    cb = tags["CB"].split("-")[0]  #10x barcodes
                    #print("barcode {} is a cell barcode ".format(cb))
                    d[cb][
                        'dp'] += 1  #update info for the sample identified by CB
                    if base.alignment.query_sequence[
                            base.query_position] == alt:
                        d[cb]['ad'] += 1
                    elif base.alignment.query_sequence[
                            base.query_position] == ref:
                        d[cb]['rd'] += 1
        for cb in d.keys():
            if d[cb]['ad'] > 0:
                supp += 1
                d[cb][
                    'gt'] = "0/1"  #temporary, all the supported mutations are set to 0/1
                d[cb]['af'] = d[cb]['ad'] / (d[cb]['rd'] + d[cb]['ad'])

        # generate calls for each sample/cell
        calls = []
        for cb in d.keys():
            calls.append(
                vcfpy.Call(
                    cb,
                    OrderedDict([("GT", d[cb]['gt']), ("DP", d[cb]['dp']),
                                 ("RD", d[cb]['rd']), ("AD", d[cb]['ad']),
                                 ("AF", d[cb]['af'])])))

        # create a mapping between each FORMAT entry and the
        # corresponding value, in the call, in the input vcf file
        # note that the input vcf contains only one sample, so
        # the calls field of each record contains only one entry
        info_d = {}
        info_d['SUPP'] = supp
        for f in record_in.FORMAT:
            info_d[f] = record_in.calls[0].data.get(f)

        if gt_filter == True:
            filter_l = [gt]
        else:
            filter_l = []

        # build and write the output record

        record_out = vcfpy.Record(
            CHROM=chrom,
            POS=pos + 1,
            ID=[],
            REF=ref,
            ALT=[vcfpy.Substitution(type_="SNV", value=alt)],
            QUAL=None,
            FILTER=filter_l,
            INFO=info_d,
            FORMAT=["GT", "DP", "RD", "AD", "AF"],
            calls=calls)
        writer.write_record(record_out)

    reader.close()
    writer.close()
    samfile.close()
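Examples #15 and #17 rely on two helpers, samples_dict() and list_to_dict(), that are not shown in these snippets. A possible sketch, assuming samples_dict() only initialises one counter dict per cell barcode (or cluster) and list_to_dict() simply turns pysam's (tag, value) pairs into a dict; both names and default values are assumptions based on how the helpers are used above.

def samples_dict(samples):
    # hypothetical helper: one counter dict per cell barcode / cluster
    return {s: {"gt": "0/0", "dp": 0, "rd": 0, "ad": 0, "af": 0.0} for s in samples}

def list_to_dict(tags):
    # hypothetical helper: convert pysam alignment tags [(tag, value), ...] into a dict
    return dict(tags)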
Example #16
def main():
    parser = argparse.ArgumentParser(description="Looks for a given set of SNPs whithin a bam file.")


    parser.add_argument("bam", metavar='sample.bam', action='store',
        help='BAM file.', type=str)

    parser.add_argument("vcf", metavar='file.vcf', action='store',
        help="VCF file storing SNPs.", type=str)

    parser.add_argument("sample_name", metavar='sample1', action='store',
                help="Sample identifier.", type=str)



    parser.add_argument("out_prefix", metavar="outdir/sample", action="store",
        help="Output VCF file prefix.", type=str)

    #parser.add_argument("--sample_name2", metavar='sample2', action='store',
    #                            help="Another sample name", type=str)

    args = parser.parse_args()
    bam= args.bam
    invcf = args.vcf
    sample = args.sample_name
    outvcf = args.out_prefix + ".snpseeker.vcf"

    '''
    if args.sample_name2:
    sample_name2 = args.sample_name2
    else:
    sample_name2 = null
    '''

    #read bam file
    samfile = pysam.AlignmentFile(bam, "rb")

    #build the header of the output vcf
    header_out = vcfpy.Header(lines=[vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"), vcfpy.HeaderLine(key="source", value=sys.argv[0]), vcfpy.HeaderLine(key="fileDate", value=date.today().strftime("%d/%m/%Y")) ], samples=vcfpy.SamplesInfos([sample]))

    # sample header lines
    header_out.add_line(vcfpy.HeaderLine(key="SampleName", value=sample))
    '''
    if sample_name2 is not null:
    header_out.add_line(vcfpy.SampleHeaderLine.from_mapping(OrderedDict([("ID", sample_name2),("Description", "Second sample name")])))
    '''
    # info header lines
    header_out.add_info_line(OrderedDict([("ID", "SUPP"), ("Number", "1"), ("Type","Integer"), ("Description", "States if the mutation is supported (1) or not (0).")]))

    # adding format lines 
    header_out.add_format_line(OrderedDict([("ID", "GT"),("Number", "1"), ("Type","String"), ("Description", "Genotype (0/1, 0/0)")]))
    header_out.add_format_line(OrderedDict([("ID", "SDP"),("Number", "1"), ("Type","Integer"), ("Description", "Samtools read depth (secondary alignments, PCR duplicates, unppammed reads and reads not passing vendor QC are filtered)")]))
    header_out.add_format_line(OrderedDict([("ID", "DP"),("Number", "1"), ("Type","Integer"), ("Description", "Filtered read depth (reads with MAPQ < 30, indels and gaps are filtered)")]))
    header_out.add_format_line(OrderedDict([("ID", "RD"),("Number", "1"), ("Type","Integer"), ("Description", "Reference allele read depth")]))
    header_out.add_format_line(OrderedDict([("ID", "AD"),("Number", "1"), ("Type","Integer"), ("Description", "Alternate allele read depth")]))
    header_out.add_format_line(OrderedDict([("ID", "AF"),("Number", "1"), ("Type","Float"), ("Description", "Allele frequency: AD/(RD+AD). Other alleles, in case of mutli-allelic regions, are ignored.")]))

    # read input vcf
    reader = vcfpy.Reader.from_path(invcf)

    format_ids = reader.header.format_ids()
    for format_id in format_ids:
        format_line = reader.header.get_format_field_info(format_id)
        '''
            output example:
        
            FormatHeaderLine('FORMAT', '<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">', {'ID': 'AD', 'Number': 'R', 'Type': 'Integer', 'Description': 'Allelic depths for the ref and alt alleles in the order listed'})
            key = 'FORMAT'
            value = '<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
        '''
        mapping = str_to_mapping(format_line.value)
        mapping["Description"] = "(Info about mutation in the original vcf)" + mapping["Description"]
        header_out.add_info_line(str_to_mapping(format_line.value))

    # open the output vcf
    writer = vcfpy.Writer.from_path(outvcf, header_out) 

    #read bam file
    samfile = pysam.AlignmentFile(bam, "rb")

    #for each mutation in the vcf file
    for record_in in reader:
        # filter out indels: only interested in SNVs in this analysis phase
        if not record_in.is_snv():
            continue
        chrom = record_in.CHROM
        pos = record_in.POS - 1  # convert the 1-based VCF position to 0-based for pysam
        ref = record_in.REF
        # record_in.ALT is a list which, by construction, contains only one value (the mutation is a SNV)
        alt = record_in.ALT[0].value
        #line += [call.data.get('GT') or './.' for call in record.calls]

        #look for the pileup in the samfile at position (chrom,pos)
        # initialise the counters first so that positions without coverage still yield a record
        sdp = 0  # samtools read depth at this position
        ad = 0   # alternate allele read depth
        rd = 0   # reference allele read depth
        dp = 0   # filtered read depth
        af = 0.0
        for pileupcolumn in samfile.pileup(chrom, pos, pos+1, stepper='all', truncate=True, max_depth=10000):
            # number of reads at this position
            sdp = pileupcolumn.n
            for base in pileupcolumn.pileups:
                # .is_del -> the base is a deletion?
                # .is_refskip -> the base is a N in the CIGAR string ?
                if not base.is_del and not base.is_refskip and not base.alignment.mapping_quality < 30:
                    dp += 1
                    if base.alignment.query_sequence[base.query_position] == alt:
                        ad += 1
                    elif base.alignment.query_sequence[base.query_position] == ref:
                        rd += 1

        if ad > 0:
            af = ad / (rd + ad)
            supp = 1
            gt = "0/1" #temporary, all the supported mutations are set to 0/1
        else:
            supp = 0
            gt = "0/0" 


        #af = ad / (rd + ad)

        info_d = {}
        info_d['SUPP'] = supp
        for f in record_in.FORMAT:
            info_d[f] = record_in.calls[0].data.get(f)

        record_out = vcfpy.Record(CHROM=chrom, POS=pos+1, ID=[], REF=ref, ALT=[vcfpy.Substitution(type_="SNV", value=alt)], QUAL=None, FILTER=[], INFO=info_d, FORMAT=["GT","SDP","DP","RD","AD","AF"],
                calls=[vcfpy.Call(sample, OrderedDict([("GT", gt), ("SDP",sdp), ("DP", dp), ("RD", rd), ("AD", ad), ("AF", af)]))]
           )
        writer.write_record(record_out)


    reader.close()
    writer.close()
    samfile.close()
Example #17
def main():
    parser = argparse.ArgumentParser(description="From single cell VCF to clones vcf.")
    parser.add_argument("input1", metavar="sample.muts.vcf", action="store", help="Single cell VCF file.", type=str)
    parser.add_argument("input2", metavar="clusters.list", action="store", help="Clusters list.", type=str)
    #parser.add_argument("input_type", choices=["gz", "vcf"], help="VCF input type (vcf/gz).", type=str)
    #parser.add_argument("sample", metavar="sample_name", action="store", help="Sample name", type=str)
    parser.add_argument("outprefix", metavar="out/path/prefix", action="store", help="Output prefix", type=str)

    args = parser.parse_args()

    input1 = args.input1
    input2 = args.input2
    prefix = args.outprefix
    #sample = args.sample
    #input_type = args.input_type

        
    clusters_df = pd.read_csv(input2)
    #clusters_df['cluster'] = clusters_df['a'].apply(lambda x: "{}_{}".format(sample, x))    

    clusters = [str(cluster) for cluster in clusters_df['cluster'].unique()]
    # Create out header
    header_out = vcfpy.Header(lines=[ vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"), vcfpy.HeaderLine(key="source", value=sys.argv[0]), vcfpy.HeaderLine(key="fileDate", value=date.today().strftime("%d/%m/%Y")) ], samples=vcfpy.SamplesInfos(clusters))
     
    # format header lines 
    header_out.add_format_line(OrderedDict([("ID", "GT"),("Number", "1"), ("Type","String"), ("Description", "Genotype (0/1, 0/0)")]))
    header_out.add_format_line(OrderedDict([("ID", "DP"),("Number", "1"), ("Type","Integer"), ("Description", "Filtered read depth (reads with MAPQ < 30, indels and gaps are filtered)")]))
    header_out.add_format_line(OrderedDict([("ID", "RD"),("Number", "1"), ("Type","Integer"), ("Description", "Reference allele read depth")]))
    header_out.add_format_line(OrderedDict([("ID", "AD"),("Number", "1"), ("Type","Integer"), ("Description", "Alternate allele read depth")]))
    header_out.add_format_line(OrderedDict([("ID", "AF"),("Number", "1"), ("Type","Float"), ("Description", "Allele frequency: AD/(RD+AD). Other alleles, in case of mutli-allelic regions, are ignored.")]))
    
    # info header lines
 
    header_out.add_info_line(OrderedDict([("ID", "SUPP"), ("Number", "1"), ("Type","Integer"), ("Description", "Whether the mutation is supported or not.")]))
    
    # read input vcf
    reader = vcfpy.Reader.from_path(input1)
    # open the output vcf
    writer = vcfpy.Writer.from_path(prefix+"_clusters.vcf", header_out)
 
    """
    snps = read_vcf(input1, input_type)
    #Filtering bulk mutations not supported by cells
    snps = snps[~snps['INFO'].str.startswith("SUPP=0")]
    
    #Create mutation id column and set it as index
    snps["mutid"] = snps["CHROM"] + "_"+snps["POS"].map(str) + "_" + snps["REF"] + "_" +snps["ALT"]
    snps = snps.set_index('mutid')
    """

    #for each record in the vcf file
    for record_in in reader:
        d = samples_dict(clusters_df['cluster'].unique())
        supp = 0
        chrom = record_in.CHROM
        pos = record_in.POS - 1  # convert the 1-based VCF position to 0-based
        ref = record_in.REF
        alt = record_in.ALT[0].value
        
        #for each cluster compute 'GT:DP:RD:AD:AF' to be provided as call argument
        for c in clusters_df['cluster'].unique():
            #retrieve cell columns for cells in current cluster
            cells = clusters_df['cellid'][clusters_df['cluster'] == c]
            
          
            #retrieve cell data
            calls = [record_in.call_for_sample[cell] for cell in cells]
            #sum total read count, alt read count and ref read count of cells in the cluster
            for call in calls:    
                d[c]['dp'] = d[c]['dp'] + call.data.get('DP') 
                d[c]['rd'] = d[c]['rd'] + call.data.get('RD')
                d[c]['ad'] = d[c]['ad'] + call.data.get('AD')

            if d[c]['ad'] > 0:
                d[c]['gt'] = "0/1"
                d[c]['af'] = d[c]['ad'] / (d[c]['rd'] + d[c]['ad'])
                supp = 1
    
        calls = []
        # create one call for each cluster
        for c in d.keys():
            calls.append(vcfpy.Call(str(c), OrderedDict([("GT", d[c]['gt']), ("DP", d[c]['dp']), ("RD", d[c]['rd']), ("AD", d[c]['ad']), ("AF", d[c]['af'])])))        
        print(calls)
         
        # write new record
        record_out = vcfpy.Record(CHROM=chrom, POS=pos+1, ID=[], REF=ref, ALT=[vcfpy.Substitution(type_="SNV", value=alt)], QUAL=None, FILTER=[], INFO={"SUPP":supp}, FORMAT=["GT","DP","RD","AD","AF"],
                calls=calls
           )
        writer.write_record(record_out)
        
    reader.close()
    writer.close()