Esempio n. 1
0
def get_variants_as_records(varfile):
    """
    Build vcfpy records from a tab-delimited table of variants.

    Arguments:
    varfile -- iterable of text lines (e.g. an open file) listing the
               desired variants as tab-separated values; the header row
               must provide the CHROM, POS, REF and ALT columns

    Returns:
    recordlist -- list of those variants in vcfpy.Record form
    """
    recordlist = []
    reader = csv.DictReader(varfile, delimiter='\t')
    for row in reader:
        ref, alt = row['REF'], row['ALT']

        # Equal-length alleles are substitutions: a single base is an SNV,
        # several bases an MNV.  Anything else changes the length.
        if len(ref) == len(alt) == 1:
            alttype = vcfpy.SNV
        elif len(ref) == len(alt):
            alttype = vcfpy.MNV
        else:
            alttype = vcfpy.INDEL

        altrecord = vcfpy.Substitution(alttype, alt)

        # Positional arguments: CHROM, POS, ID, REF, ALT, QUAL, FILTER,
        # INFO, FORMAT, calls -- the records carry no sample columns.
        record = vcfpy.Record(row['CHROM'], int(row['POS']), [], ref,
                              [altrecord], None, ['PASS'], {}, None, None)

        recordlist.append(record)
    return recordlist
Esempio n. 2
0
def main():
    """Write a one-record example VCF whose header reuses definitions from an input VCF.

    Command-line arguments:
    input  -- path of the VCF whose FORMAT header lines are read
    output -- path of the VCF file to write

    Every FORMAT header line of the input is re-declared as an INFO header
    line of the output.  The output holds a single hard-coded record with
    one call for the sample "Sample1".
    """
    parser = argparse.ArgumentParser(description="vcf writer")
    parser.add_argument("input", metavar='input.vcf', action='store',
                        help='vcf file.', type=str)
    parser.add_argument("output", metavar='output.vcf', action='store',
                        help='vcf file.', type=str)

    args = parser.parse_args()

    outvcf = args.output
    invcf = args.input

    #########################
    #                       #
    #  creating the header  #
    #                       #
    #########################

    # The header can contain some fixed type lines (INFO, FORMAT, FILTER, etc.) and some general ones.
    # Here it stores the VCF version, the name of the program which generated the file and the
    # creation date, plus the name of the sample which has been analyzed.
    # The fileformat line comes first: the VCF specification requires it to be the first line
    # of the file, and header lines are written in the order given here.
    header = vcfpy.Header(
        lines=[
            vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
            vcfpy.HeaderLine(key="source", value=sys.argv[0]),
            vcfpy.HeaderLine(key="fileDate",
                             value=date.today().strftime("%d/%m/%Y")),
        ],
        samples=vcfpy.SamplesInfos(["Sample1"]))

    # adding format lines
    header.add_format_line(OrderedDict([("ID", "GT"), ("Number", "1"), ("Type", "String"), ("Description", "Genotype")]))
    header.add_format_line(OrderedDict([("ID", "DP"), ("Number", "1"), ("Type", "Integer"), ("Description", "Filtered read depth (MAPQ > 30)")]))

    # read the input vcf
    with vcfpy.Reader.from_path(invcf) as reader:

        # get the FORMAT header lines of the input file
        # and convert them in INFO header lines of the output file
        for format_id in reader.header.format_ids():
            format_line = reader.header.get_format_field_info(format_id)
            # Example of a returned line:
            #   FormatHeaderLine('FORMAT', '<ID=AD,Number=R,Type=Integer,Description="...">', {...})
            # i.e. key = 'FORMAT' and value = '<ID=AD,Number=R,Type=Integer,Description="...">'
            header.add_info_line(str_to_mapping(format_line.value))

    # write the vcf
    with vcfpy.Writer.from_path(outvcf, header) as writer:

        # creating one record
        record = vcfpy.Record(
            CHROM="1", POS=1, ID=[], REF="C",
            ALT=[vcfpy.Substitution(type_="SNV", value="G")],
            QUAL=None, FILTER=[], INFO={}, FORMAT=["GT", "DP"],
            calls=[vcfpy.Call("Sample1", OrderedDict([("GT", "0/1"), ("DP", "47")]))]
        )
        writer.write_record(record)
    def test_cell_vcf_mutation_counting(self):
        """Check per-gene mutation counts computed from an in-memory VCF.

        A VCF is written to a string buffer from the "lung" rows of
        ``self.cosmic_df`` using the header of ``self.template_vcf``; the
        expected per-gene counts are tallied while writing and compared with
        what ``self.mutation_counter.find_cell_gene_mut_counts`` reports.
        """
        with io.StringIO() as vcf_stream:
            with vcfpy.Reader.from_path(
                    self.template_vcf) as template_vcf_reader:
                vcf_writer = vcfpy.Writer.from_stream(
                    vcf_stream, header=template_vcf_reader.header)

            cosmic_subset = self.cosmic_df.loc[self.cosmic_df["Primary site"]
                                               == "lung"]

            # Write test VCF

            expected_gene_mut_counts = {}

            hgvs_parser = hgvs.parser.Parser()
            for _, row in cosmic_subset.iterrows():
                genome_pos = GenomePosition.from_str(
                    str(row["Mutation genome position"]))

                if genome_pos is None:
                    continue

                # Rows whose CDS description cannot be parsed are skipped on
                # purpose; catching Exception (rather than everything) keeps
                # KeyboardInterrupt and SystemExit from being swallowed.
                try:
                    posedit = hgvs_parser.parse_c_posedit(
                        row["Mutation CDS"][2:])  # pylint: disable=no-member
                except Exception:
                    continue

                record = vcfpy.Record(
                    CHROM=genome_pos.chrom,
                    POS=genome_pos.start + 1,
                    ID='.',
                    REF=posedit.edit.ref,
                    ALT=[vcfpy.Substitution(None, posedit.edit.alt)],
                    QUAL=0,
                    FILTER='.',
                    INFO={})

                vcf_writer.write_record(record)

                # Remove any gene name suffixes
                gene_name = row["Gene name"].split('_')[0]
                expected_gene_mut_counts[
                    gene_name] = expected_gene_mut_counts.get(gene_name, 0) + 1

            # Test mutation counting

            # Reset the buffer's cursor position
            vcf_stream.seek(0)
            _, filtered_gene_mut_counts = self.mutation_counter.find_cell_gene_mut_counts(
                stream=vcf_stream)

            self.assertDictEqual(expected_gene_mut_counts,
                                 filtered_gene_mut_counts)
Esempio n. 4
0
def main():
    """Write a minimal VCF holding a single placeholder record.

    Expects exactly one command-line argument, the output path.  With any
    other argument count a usage message goes to stderr and 1 is returned.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Usage: vcf_from_scratch.py OUTPUT.vcf", file=sys.stderr)
        return 1

    # Header without samples, and a record with every column left empty
    # apart from the mandatory CHROM, POS and REF.
    empty_header = vcfpy.Header(samples=vcfpy.SamplesInfos([]))
    record_fields = dict(
        CHROM="1",
        POS=1,
        ID=[],
        REF="N",
        ALT=[],
        QUAL=None,
        FILTER=[],
        INFO={},
        FORMAT=[],
    )
    with vcfpy.Writer.from_path(sys.argv[1], empty_header) as vcf_out:
        vcf_out.write_record(vcfpy.Record(**record_fields))
Esempio n. 5
0
    def test_from_vcf_record(self):
        """GenomePosition.from_vcf_record must recover each fixture position.

        For every fixture a record is built at ``start + 1`` with a REF made
        of as many placeholder characters as the position is long.
        """
        fixtures = (
            self.GPOS_A,
            self.GPOS_B,
            self.GPOS_C,
            self.GPOS_D,
            self.GPOS_E,
        )

        # Build every record up front, then run the assertions.
        cases = []
        for gpos in fixtures:
            vcf_record = vcfpy.Record(
                CHROM=gpos.chrom,
                POS=gpos.start + 1,
                ID='.',
                REF='.' * len(gpos),
                ALT=[],
                QUAL=0,
                FILTER='.',
                INFO={},
            )
            cases.append((vcf_record, gpos))

        for vcf_record, wanted in cases:
            self.assertEqual(wanted,
                             GenomePosition.from_vcf_record(vcf_record))
Esempio n. 6
0
def generate_sv_record(records, comparison_result, sample_names):
    """
    Build the single SV record that summarises a call made over a group of input records.
    :param records: the input records involved in the SV call; the first one supplies CHROM and REF
    :param comparison_result: call outcome; its svtype, initial_position, final_position and
        insseq attributes are read
    :param sample_names: names of the samples, in output order, for which a call is generated
    :return: a vcfpy.Record with one call per entry of sample_names
    """

    # Index the records by sample name; a sample may own several records
    records_by_sample = group_by(records,
                                 lambda rec: get_sample_name(rec))

    # One call per requested sample; samples without records get None passed along
    calls = []
    for sample_name in sample_names:
        records_of_sample = records_by_sample.get(sample_name, None)
        calls.append(get_sample_call(sample_name, records_of_sample))

    # By construction all grouped records share CHROM and REF,
    # so the first record stands in for the whole group
    representative = records[0]
    chrom = representative.CHROM
    record_id = generate_id(chrom, comparison_result.initial_position)

    info = vcfpy.OrderedDict([
        ("SVTYPE", comparison_result.svtype),
        ("END", comparison_result.final_position),
    ])
    if comparison_result.insseq is not None:
        info["INSSEQ"] = comparison_result.insseq

    # Symbolic ALT allele named after the SV type
    symbolic_alt = vcfpy.Substitution(
        type_=comparison_result.svtype,
        value='<{}>'.format(comparison_result.svtype))

    return vcfpy.Record(
        CHROM=chrom,
        POS=comparison_result.initial_position,
        ID=[record_id],
        REF=representative.REF,
        ALT=[symbolic_alt],
        QUAL=maximum_qual(records),
        FILTER=["PASS"],
        INFO=info,
        FORMAT=["GT", "TRANCHE2", "VAF"],
        calls=calls)
 def __call__(self, record: vcfpy.Record) -> Union[vcfpy.Record, None]:
     """Keep only the ALT alleles whose AO/DP percentage reaches ``self.min_percentage``.

     Returns None when the record lacks the AO or DP INFO field, when DP is
     zero, or when no ALT allele passes.  Otherwise returns a new record
     holding the retained ALTs and a filtered copy of INFO.
     """
     if not ("AO" in record.INFO and "DP" in record.INFO):
         return None
     # VCF records have 1 (or 0?) or more ALT records supported by calls from 1 or more samples
     # and AO INFO fields with dimension matching the ALT dimensions
     # This Transform type Filter retains only those ALTs and corresponding INFO matching the
     # criteria of the filter
     # It does not modify the calls which might cause problems of calls not matching ALT
     depth = float(record.INFO["DP"])
     if depth == 0:
         # Without any depth the percentage is undefined, so no ALT can be
         # shown to meet the threshold (previously a ZeroDivisionError).
         return None
     retain = [
         not (float(record.INFO["AO"][i]) / depth * 100.0) < self.min_percentage
         for i, _ in enumerate(record.ALT)
     ]
     if not any(retain):
         return None
     new_ALT = [alt for i, alt in enumerate(record.ALT) if retain[i]]
     new_INFO = OrderedDict()
     # these are produced by snpEff and keys occur once per implicated gene
     # the simplest solution is to copy them all across
     snpeff_keys = {"ANN", "LOF", "NMD"}
     for key, value in record.INFO.items():
         if isinstance(value, list):
             # retain all snpEff records and only those other records that correspond to alts that we retain
             # NOTE(review): assumes every other list-valued INFO field has one entry per ALT;
             # a longer list would raise IndexError here -- confirm against the input VCFs
             new_INFO[key] = [
                 el for i, el in enumerate(value)
                 if key in snpeff_keys or retain[i]
             ]
         else:
             new_INFO[key] = value
     new_record = vcfpy.Record(
         record.CHROM,
         record.POS,
         record.ID,
         record.REF,
         new_ALT,
         record.QUAL,
         record.FILTER,
         new_INFO,
         record.FORMAT,
         record.calls,
     )
     return new_record
 def _write_variants_data(self):
     """Write one VCF record per small variant from ``self._yield_smallvars()``.

     Each record gets one call per entry of ``self.members`` with the GT, GQ,
     AD and DP values found in the variant's genotype mapping.  A missing GT
     becomes "./.", the other missing values become None.
     """
     for variant in self._yield_smallvars():
         # Classify by allele lengths: unequal -> INDEL, both one base -> SNV,
         # equal otherwise -> MNV
         ref_len = len(variant.reference)
         alt_len = len(variant.alternative)
         if ref_len != alt_len:
             kind = vcfpy.INDEL
         elif ref_len == 1:
             kind = vcfpy.SNV
         else:
             kind = vcfpy.MNV

         # One call per member, falling back to defaults for absent members
         member_calls = []
         for member in self.members:
             call_data = {}
             call_data["GT"] = variant.genotype.get(member, {}).get("gt", "./.")
             call_data["GQ"] = variant.genotype.get(member, {}).get("gq", None)
             allele_depth = variant.genotype.get(member, {}).get("ad", None)
             # AD is written as a one-element list when present
             call_data["AD"] = None if allele_depth is None else [allele_depth]
             call_data["DP"] = variant.genotype.get(member, {}).get("dp", None)
             member_calls.append(vcfpy.Call(member, call_data))

         # Assemble the record (no ID, QUAL, FILTER or INFO) and write it out
         vcf_record = vcfpy.Record(
             variant.chromosome,
             variant.start,
             [],
             variant.reference,
             [vcfpy.Substitution(kind, variant.alternative)],
             None,
             [],
             {},
             ["GT", "GQ", "AD", "DP"],
             member_calls,
         )
         self.vcf_writer.write_record(vcf_record)
Esempio n. 9
0
def write_vcf(vcffilename, sample_name, records):
    """
    Generate a bgzip-compressed VCF with the given records and randomly
    generated genotypes

    Arguments:
    vcffilename - path to generated file
    sample_name - name of the single sample column written to the file
    records - list of vcfpy.Record describing the variants
    """
    # GRCh37 lengths of chromosomes 1..22, in order
    lengths = [249250621, 243199373, 198022430, 191154276, 180915260,
               171115067, 159138663, 146364022, 141213431, 135534747,
               135006516, 133851895, 115169878, 107349540, 102531392,
               90354753,  81195210,  78077248,  59128983,  63025520,
               48129895,  51304566]

    samples = vcfpy.SamplesInfos([sample_name])
    header = vcfpy.Header(samples=samples)
    header.add_line(vcfpy.HeaderLine("fileformat", "VCFv4.3"))
    header.add_line(vcfpy.HeaderLine("fileDate", "20200901"))
    # Chromosome numbering starts at 1, so the contig IDs must too;
    # plain enumerate() labelled the contigs "0".."21".
    for chrom, length in enumerate(lengths, start=1):
        header.add_contig_line({"ID": str(chrom), "assembly": "GRCh37", "length": length})
    header.add_format_line({"ID":"GT", "Number":1, "Type":"String", "Description": "Genotype"})

    with open(vcffilename, 'wb') as vcffile:
        # NOTE(review): the third positional argument of from_stream differs
        # between vcfpy releases (samples vs. path) -- confirm against the
        # installed version.
        writer = vcfpy.Writer.from_stream(vcffile, header, samples, use_bgzf=True)
        for record in records:
            # Each variant gets an independent, uniformly drawn genotype
            genotype = random.choice(['0/0', '0/1', '1/1'])
            newrecord = vcfpy.Record(record.CHROM,
                                     record.POS,
                                     record.ID,
                                     record.REF,
                                     record.ALT,
                                     record.QUAL,
                                     record.FILTER,
                                     record.INFO,
                                     ["GT"],
                                     calls=[vcfpy.record.Call(sample_name, {"GT": genotype})])
            writer.write_record(newrecord)
        writer.close()
Esempio n. 10
0
def extract_vcf_records(
        sample_name,
        # input paths
        alignments_path,
        contigs_path,
        ref_fasta_path,
        vcf_template_path,
        # output paths
        vcf_out_path,
        selected_contigs_path,
        flanked_contigs_path,
        flank_length,
        min_insert_size):
    """Extract large insertions from contig alignments and write them as VCF records.

    For every primary alignment (``Hit`` not above 0) in the space-separated
    table at ``alignments_path`` the CIGAR string is walked, and each
    insertion longer than ``min_insert_size`` produces:

    * a record in ``vcf_out_path`` (header taken from ``vcf_template_path``,
      with ``sample_name`` as the only sample),
    * the contig sequence in ``selected_contigs_path`` (once per contig),
    * the inserted sequence, padded with ``flank_length`` reference bases on
      each side, in ``flanked_contigs_path``.

    Every file opened here is closed before returning, including when an
    error interrupts processing.

    Returns the number of VCF records written.
    """
    # Function-scope import keeps the block self-contained.
    from contextlib import ExitStack

    n_records = 0
    contig_loci = set()

    with ExitStack() as stack:
        ref_fasta = pysam.FastaFile(ref_fasta_path)
        stack.callback(ref_fasta.close)
        contig_fasta = pysam.FastaFile(contigs_path)
        stack.callback(contig_fasta.close)

        selected_contig_fasta = stack.enter_context(
            open(selected_contigs_path, "w"))
        flanked_contig_fasta = stack.enter_context(
            open(flanked_contigs_path, "w"))

        alns = pandas.read_csv(alignments_path, sep=" ")

        reader = vcfpy.Reader.from_path(vcf_template_path)
        stack.callback(reader.close)
        reader.header.samples = vcfpy.SamplesInfos([sample_name])

        # Closing the writer matters: otherwise buffered records may never
        # reach vcf_out_path.
        writer = vcfpy.Writer.from_path(vcf_out_path, reader.header)
        stack.callback(writer.close)

        # parse each alignment and look for insertions above min_insert_size
        for _, aln in alns.iterrows():
            # skip secondary alignments
            if aln["Hit"] > 0:
                continue

            query_name = aln["QName"]

            # local alignment window in the reference; the contig name must
            # consist of exactly six "_"-separated fields
            ref_chrom, ref_start, ref_end, phase_set, phase, _ = \
                query_name.split("_")

            # drop the two-character prefix of each phasing field
            phase_set = phase_set[2:]
            phase = phase[2:]

            # convert to ints
            ref_start, ref_end = int(ref_start), int(ref_end)

            # alignment start for reference and query sequences
            target_start = aln["TStart"]
            query_start = aln["QStart"]

            # strand-ness of the query sequence
            strand = aln["Strand"]

            # parse cigar for variant extraction
            cig = cigar.Cigar(aln["CIGAR"])

            # convert sequences to the positive strand
            query_seq = contig_fasta.fetch(query_name)
            if strand == "-":
                query_seq = str(Bio.Seq.Seq(query_seq).reverse_complement())

            ref_seq = ref_fasta.fetch(ref_chrom, ref_start, ref_end)

            # initialize cursors for the cigar walk
            query_pos = query_start
            target_pos = target_start

            # we are looking to extract insertions larger than min_insert_size
            for op_length, op_kind in list(cig.items()):
                # skip matches
                if op_kind == 'M':
                    query_pos += op_length
                    target_pos += op_length

                # skip deletions in the query sequence
                elif op_kind == 'D':
                    target_pos += op_length

                # insertions in the query sequence
                elif op_kind == 'I':
                    # only interested in large insertions
                    if op_length > min_insert_size:
                        # need to check conversion from 0-based coordinates to 1-based
                        ref_allele = ref_seq[target_pos]
                        alt_allele = ref_allele + query_seq[
                            query_pos:query_pos + op_length]

                        # phased genotype when the haplotype is known,
                        # unphased heterozygous otherwise
                        if phase == "1":
                            gt = "1|0"
                        elif phase == "2":
                            gt = "0|1"
                        else:
                            gt = "0/1"

                        break_point = ref_start + target_pos
                        # output VCF record corresponding to the insertion
                        rec = vcfpy.Record(
                            CHROM=ref_chrom,
                            POS=break_point + 1,
                            ID=[query_name],
                            REF=ref_allele,
                            ALT=[vcfpy.Substitution("INS", alt_allele)],
                            QUAL=999,
                            FILTER=["PASS"],
                            INFO={},
                            FORMAT=[
                                "GT", "SVLEN", "PS", "HP", "CIGAR", "STRAND",
                                "CONTIG_START"
                            ],
                            calls=[
                                vcfpy.Call(sample=sample_name,
                                           data=vcfpy.OrderedDict(
                                               GT=gt,
                                               SVLEN=op_length,
                                               PS=phase_set,
                                               HP=phase,
                                               CIGAR=str(cig),
                                               STRAND=strand,
                                               CONTIG_START=str(query_start)))
                            ])

                        n_records += 1
                        writer.write_record(rec)

                        # output contig that contains this insertion
                        contig_locus = ">" + query_name + "_" + sample_name
                        contig_hash = sha1("_{chrom}_{pos}_{alt}".format(
                            chrom=ref_chrom, pos=ref_start,
                            alt=alt_allele[1:]).encode()).hexdigest()

                        contig_name = (contig_locus + "_" + contig_hash +
                                       "_" + str(op_length))

                        # each contig sequence is written only once
                        if contig_locus not in contig_loci:
                            selected_contig_fasta.writelines(
                                [contig_name + "\n", query_seq + "\n"])
                            contig_loci.add(contig_locus)

                        # output same insertion, but with flanking sequences
                        # note, the interval is [start, end[
                        if flank_length > 0:
                            left_flank = ref_fasta.fetch(
                                ref_chrom, break_point - flank_length,
                                break_point)
                            right_flank = ref_fasta.fetch(
                                ref_chrom, break_point,
                                break_point + flank_length)
                        else:
                            left_flank = ""
                            right_flank = ""
                        flanked_contig_fasta.writelines([
                            contig_name + "\n",
                            left_flank + alt_allele[1:] + right_flank + "\n"
                        ])

                    query_pos += op_length

    return n_records
Esempio n. 11
0
    def write_haplotype_to_vcf(self, fake_genome_mapping_filename,
                               isoform_tally, output_prefix):
        """
        Write the haplotypes as ``<output_prefix>.vcf`` plus a human-readable
        count table ``<output_prefix>.human_readable.txt``.

        The isoform names (keys of ``isoform_tally``) are used as the VCF
        samples; ``isoform_tally[isoform]`` maps haplotype index --> count.
        ``fake_genome_mapping_filename`` is a comma-separated file with three
        fields per line: fake position, reference chromosome, reference
        position.

        The following function must be called first:
        -- self.get_haplotype_vcf_assignment

        Raises Exception if ``self.haplotype_vcf_index`` or
        ``self.alt_at_pos`` is still None.
        """
        if self.haplotype_vcf_index is None or self.alt_at_pos is None:
            raise Exception(
                "Must call self.get_haplotype_vcf_assignment() first!")

        self.sanity_check()

        # isoform names in sorted order; they become the sample columns
        name_isoforms = list(isoform_tally.keys())
        name_isoforms.sort()

        # write a fake VCF example so we can read the headers in
        # (the file is created in the current working directory)
        with open("template.vcf", "w") as f:
            f.write(__VCF_EXAMPLE__)
        # NOTE(review): the file handle opened here is never closed, and the
        # Writer below receives the reader object itself plus an output path
        # as first argument -- this matches PyVCF's Writer(stream, template)
        # style more than vcfpy's Writer.from_path(path, header); confirm
        # against the vcfpy version in use.
        reader = vcfpy.Reader(open("template.vcf"))
        reader.samples = name_isoforms
        f_vcf = vcfpy.Writer(f"{output_prefix}.vcf", reader)

        # human readable text:
        # first line: assoc VCF filename
        # second line: haplotype, list of sorted isoforms
        # third line onwards: haplotype and assoc count
        with open(f"{output_prefix}.human_readable.txt", "w") as f_human:
            f_human.write(f"Associated VCF file: {output_prefix}.vcf\n")
            f_human.write("haplotype\t{samples}\n".format(
                samples="\t".join(name_isoforms)))
            for hap_index, hap_str in enumerate(self.haplotypes):
                f_human.write(hap_str)
                for _iso in name_isoforms:
                    # isoforms that never showed this haplotype get a count of 0
                    if hap_index in isoform_tally[_iso]:
                        f_human.write(f"\t{isoform_tally[_iso][hap_index]}")
                    else:
                        f_human.write("\t0")
                f_human.write("\n")

        # read fake genome mapping file
        fake_map = {}  # 0-based position on fake --> (ref chromosome, 0-based ref position)
        with open(fake_genome_mapping_filename) as f:
            for line in f:
                fake_pos, ref_chr, ref_pos = line.strip().split(",")
                fake_map[int(fake_pos)] = (ref_chr, int(ref_pos))

        # for each position, write out the ref and alt bases
        # then fill in for each isoform (aka "sample"):
        #  if this isoform only shows one allele, then it's just that allele (0 for ref, 1+ otherwise)
        #  if this isoform shows 2+ allele, then the first allele is indicated by self.haplotypes[0]
        for i, pos in enumerate(self.hap_var_positions):
            ref_chr, ref_pos = fake_map[pos]
            total_count = sum(self.count_of_vars_by_pos[pos].values())
            # frequency of each alt base at this position, as a 2-decimal string
            alt_freq = [
                f"{self.count_of_vars_by_pos[pos][b] * 1.0 / total_count:.2f}"
                for b in self.alt_at_pos[pos]
            ]
            # NOTE(review): Substitution with a single argument, string-valued
            # ID/QUAL/FILTER/FORMAT and the sample_indexes keyword look like
            # PyVCF's _Record signature rather than vcfpy.Record -- verify.
            rec = vcfpy.Record(
                CHROM=ref_chr,
                POS=ref_pos + 1,
                ID=".",
                REF=self.ref_at_pos[pos],
                ALT=[vcfpy.Substitution(b) for b in self.alt_at_pos[pos]],
                QUAL=".",
                FILTER="PASS",
                INFO={
                    "AF": alt_freq,
                    "DP": total_count
                },
                FORMAT="GT:HQ",
                sample_indexes=None,
            )

            rec.samples = []
            for _iso in name_isoforms:
                # isoform_tally[_iso] is a dict of haplotype index --> count
                # the allele index for the base at this pos is looked up as
                # haplotype_vcf_index[hap_index][pos]
                # we always need to show the phases in haplotype index order sorted
                hap_indices = list(isoform_tally[_iso].keys())
                hap_indices.sort()
                # GT: "|"-joined allele indices, one per haplotype of this isoform
                genotype = "|".join(
                    str(self.haplotype_vcf_index[hap_index][pos])
                    for hap_index in hap_indices)
                # HQ: ","-joined read counts, in the same haplotype order
                counts = ",".join(
                    str(isoform_tally[_iso][hap_index])
                    for hap_index in hap_indices)
                # NOTE(review): Call(rec, sample, data) is the PyVCF argument
                # order; vcfpy.Call elsewhere takes (sample, data) -- confirm.
                rec.samples.append(
                    vcfpy.Call(
                        rec, _iso,
                        vcfpy.OrderedDict([("GT", genotype), ("HQ", counts)])))
            f_vcf.write_record(rec)
        f_vcf.close()
Esempio n. 12
0
def main():
    """Count, per cell barcode, the reads supporting each bulk SNV and write a multi-sample VCF.

    Command-line arguments: the BAM file, a file with one cell barcode per
    line, the bulk VCF, a sample identifier and the output prefix; the
    optional ``--gt`` keeps only input records whose first call has that
    genotype.

    For every SNV of the input VCF the BAM pileup at that position is
    scanned.  Reads with mapping quality below 30, deletions and reference
    skips are ignored, as are reads without a ``CB`` tag or whose barcode is
    not listed.  Each barcode becomes one sample of the output file
    ``<out_prefix>.snpseeker.vcf``.
    """
    parser = argparse.ArgumentParser(
        description="Looks for a given set of SNPs whithin a bam file.")

    parser.add_argument("bam",
                        metavar='sample.bam',
                        action='store',
                        help='BAM file.',
                        type=str)

    parser.add_argument(
        "barcodes",
        metavar='barcodes.list',
        action='store',
        help=
        "File containing cell barcodes (the same used in the alignment file to identify cell reads).",
        type=str)

    parser.add_argument("vcf",
                        metavar='variants.vcf',
                        action='store',
                        help="VCF file storing BULK SNPs.",
                        type=str)

    parser.add_argument("sample_name",
                        metavar='sample1',
                        action='store',
                        help="Sample identifier.",
                        type=str)

    parser.add_argument("out_prefix",
                        metavar="outdir/sample",
                        action="store",
                        help="Output VCF file prefix.",
                        type=str)

    parser.add_argument(
        "--gt",
        metavar='1/1 (0/1)',
        choices=["0/0", "0/1", "1/1"],
        action='store',
        help=
        "Genotype filter: considers only mutations with the specified GT in the original vcf file.",
        type=str)

    args = parser.parse_args()
    bam = args.bam
    barcodes = args.barcodes
    invcf = args.vcf
    sample = args.sample_name
    outvcf = args.out_prefix + ".snpseeker.vcf"

    gt = args.gt
    gt_filter = bool(gt)

    with open(barcodes, "r") as f:
        samples = f.read().splitlines()
    # set copy for the per-read membership test (a list lookup is linear)
    cell_barcodes = set(samples)

    # read bam file (opened once; the original opened it a second time and
    # leaked the first handle)
    samfile = pysam.AlignmentFile(bam, "rb")

    # build the header of the output vcf
    header_out = vcfpy.Header(
        lines=[
            vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
            vcfpy.HeaderLine(key="source", value=sys.argv[0]),
            vcfpy.HeaderLine(key="fileDate",
                             value=date.today().strftime("%d/%m/%Y"))
        ],
        samples=vcfpy.SamplesInfos(samples))

    # sample header lines
    header_out.add_line(
        vcfpy.SampleHeaderLine.from_mapping(
            OrderedDict([("ID", sample), ("Description", "Sample name")])))

    # filter header lines, one per genotype selectable with --gt
    for genotype in ("1/1", "0/1", "0/0"):
        header_out.add_filter_line(
            OrderedDict([("ID", genotype), ("Number", "1"),
                         ("Description", "Filtered on such GT")]))

    # format header lines
    header_out.add_format_line(
        OrderedDict([("ID", "GT"), ("Number", "1"), ("Type", "String"),
                     ("Description", "Genotype (0/1, 0/0)")]))
    header_out.add_format_line(
        OrderedDict([
            ("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
            ("Description",
             "Filtered read depth (reads with MAPQ < 30, indels and gaps are filtered)"
             )
        ]))
    header_out.add_format_line(
        OrderedDict([("ID", "RD"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Reference allele read depth")]))
    header_out.add_format_line(
        OrderedDict([("ID", "AD"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Alternate allele read depth")]))
    header_out.add_format_line(
        OrderedDict([
            ("ID", "AF"), ("Number", "1"), ("Type", "Float"),
            ("Description",
             "Allele frequency: AD/(RD+AD). Other alleles, in case of mutli-allelic regions, are ignored."
             )
        ]))

    # read input vcf
    reader = vcfpy.Reader.from_path(invcf)

    # info header lines
    header_out.add_info_line(
        OrderedDict([("ID", "SUPP"), ("Number", "1"), ("Type", "Integer"),
                     ("Description",
                      "Number of cells supporting the mutation.")]))

    # Use input FORMAT lines as output INFO lines.  A returned line looks like
    #   FormatHeaderLine('FORMAT', '<ID=AD,Number=R,Type=Integer,Description="...">', {...})
    # i.e. key = 'FORMAT' and value = '<ID=AD,Number=R,Type=Integer,Description="...">'
    for format_id in reader.header.format_ids():
        format_line = reader.header.get_format_field_info(format_id)
        mapping = str_to_mapping(format_line.value)
        mapping["Description"] = "(Info about bulk mutation)" + mapping[
            "Description"]
        # register the prefixed mapping (the original re-parsed the line
        # here and so dropped the prefix)
        header_out.add_info_line(mapping)

    # open the output vcf
    writer = vcfpy.Writer.from_path(outvcf, header_out)

    try:
        # for each mutation in the vcf file
        for record_in in reader:
            d = samples_dict(samples)
            supp = 0

            # optional genotype filter on the (single) call of the bulk record
            if gt_filter:
                if record_in.calls[0].data.get('GT') != gt:
                    continue

            # filter out indels: only interested in snvs in this analysis phase
            if not record_in.is_snv():
                continue

            chrom = record_in.CHROM
            pos = record_in.POS - 1  # to correct on 1-based positions
            ref = record_in.REF
            # record.ALT is a list by construction which contains only one value
            alt = record_in.ALT[0].value

            # look for the pileup in the samfile at position (chrom,pos)
            for pileupcolumn in samfile.pileup(chrom,
                                               pos,
                                               pos + 1,
                                               stepper='all',
                                               truncate=True,
                                               max_depth=10000):
                for base in pileupcolumn.pileups:
                    # .is_del -> the base is a deletion?
                    # .is_refskip -> the base is a N in the CIGAR string ?
                    if base.is_del or base.is_refskip:
                        continue
                    if base.alignment.mapping_quality < 30:
                        continue

                    tags = list_to_dict(base.alignment.tags)
                    # reads with no error-corrected barcode are discarded
                    if "CB" not in tags:
                        continue
                    cb = tags["CB"].split("-")[0]  # 10x barcodes
                    # the barcode hasn't been labeled as belonging to a cell
                    # by cellranger (floating DNA)
                    if cb not in cell_barcodes:
                        continue

                    # update info for the sample identified by CB
                    d[cb]['dp'] += 1
                    read_base = base.alignment.query_sequence[
                        base.query_position]
                    if read_base == alt:
                        d[cb]['ad'] += 1
                    elif read_base == ref:
                        d[cb]['rd'] += 1

            # flag supporting cells, then generate calls for each sample/cell
            calls = []
            for cb in d:
                if d[cb]['ad'] > 0:
                    supp += 1
                    # temporary, all the supported mutations are set to 0/1
                    d[cb]['gt'] = "0/1"
                    d[cb]['af'] = d[cb]['ad'] / (d[cb]['rd'] + d[cb]['ad'])
                calls.append(
                    vcfpy.Call(
                        cb,
                        OrderedDict([("GT", d[cb]['gt']), ("DP", d[cb]['dp']),
                                     ("RD", d[cb]['rd']), ("AD", d[cb]['ad']),
                                     ("AF", d[cb]['af'])])))

            # create a mapping between each FORMAT entry and the
            # corresponding value, in the call, in the input vcf file
            # note that the input vcf contains only one sample, so
            # the calls field of each record contains only one entry
            info_d = {'SUPP': supp}
            for fmt_key in record_in.FORMAT:
                info_d[fmt_key] = record_in.calls[0].data.get(fmt_key)

            filter_l = [gt] if gt_filter else []

            # build and write the output record
            record_out = vcfpy.Record(
                CHROM=chrom,
                POS=pos + 1,
                ID=[],
                REF=ref,
                ALT=[vcfpy.Substitution(type_="SNV", value=alt)],
                QUAL=None,
                FILTER=filter_l,
                INFO=info_d,
                FORMAT=["GT", "DP", "RD", "AD", "AF"],
                calls=calls)
            writer.write_record(record_out)
    finally:
        # release the files even when processing stops on an error
        reader.close()
        writer.close()
        samfile.close()
Esempio n. 13
0
#                logging.info("ref alleles assigned for %s at %s", acc, site)
            gt = str(allele) + "|" + str(allele) if args.diploid else str(
                allele)
            sampleCall = vcfpy.Call(
                sample=acc,
                data={'GT': gt},  # has to be string; diploid
                #                data = {'GT': str(allele) }, # has to be string
                site=site)
            genoCalls.append(sampleCall)

        record = vcfpy.Record(
            CHROM=refEPI,
            POS=site,
            ID=snpInfo[site]['varID'],
            REF=snpInfo[site]['refNT'],
            ALT=subs,
            QUAL=None,
            FILTER=[],  # PASS
            INFO={},  # consequence calls, locus, etc; a dict
            FORMAT=['GT'],  # a list
            calls=genoCalls)
        varCt += 1
        writer.write_record(record)
    logging.info("SNPs records written to file: n = %s at %s", len(sitesSNPs),
                 datetime.datetime.now())

    for change in indelDict:  # change is "cv-" varID
        geno = {}
        genoCalls = []
        refNT = indelInfo[change]['refNT']  # 'NAAAAA'
        altNTs = indelInfo[change]['altNT']  # [ 'N' ]
Esempio n. 14
0
def main():
    """Write a small demonstration VCF file with one record and two samples.

    The single positional command-line argument is the path of the VCF file
    to create.  The header declares one FILTER, two INFO and two FORMAT
    fields, one contig (``chr1``) and a SAMPLE line for ``Sample1``; the body
    holds one SNV record with a call for ``Sample1`` and ``Sample2``.
    """
    parser = argparse.ArgumentParser(description="vcf writer")
    parser.add_argument("output",
                        metavar='output.vcf',
                        action='store',
                        help='vcf file.',
                        type=str)

    args = parser.parse_args()

    outvcf = args.output

    #########################
    #                       #
    #  creating the header  #
    #                       #
    #########################

    # The header can contain some fixed type lines (INFO, FORMAT, FILTER, etc.) and some general ones.
    # Here it also stores the name of the program which generated the file and the names of the
    # analyzed samples.
    # "fileformat" must be the first meta-information line of a VCF file, so it is listed first.
    header = vcfpy.Header(
        lines=[
            vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
            vcfpy.HeaderLine(key="source", value=sys.argv[0]),
            vcfpy.HeaderLine(key="fileDate",
                             value=date.today().strftime("%d/%m/%Y")),
        ],
        samples=vcfpy.SamplesInfos(["Sample1", "Sample2"]))

    # Reference (values accepted in header lines):
    #   INFO types:    "Integer", "Float", "Flag", "Character", "String"
    #   FORMAT types:  "Integer", "Float", "Character", "String"
    #   "Number" entries other than integers:
    #     "A" number of alleles excluding reference
    #     "R" number of alleles including reference
    #     "G" number of genotypes
    #     "." unbounded number of values
    #   header lines that contain an "ID" entry:
    #     "ALT", "contig", "FILTER", "FORMAT", "INFO", "META", "PEDIGREE", "SAMPLE"

    # adding filter lines
    header.add_filter_line(
        OrderedDict([("ID", "PASS"), ("Description", "All filters passed")]))

    # adding info lines
    header.add_info_line(
        OrderedDict([("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
                     ("Description",
                      "Raw read depth (without mapping quality filters)")]))
    header.add_info_line(
        OrderedDict([
            ("ID", "MUT"), ("Number", "1"), ("Type", "Integer"),
            ("Description",
             "States if the record mutation is supported (1) or not (0).")
        ]))

    # adding format lines
    header.add_format_line(
        OrderedDict([("ID", "GT"), ("Number", "1"), ("Type", "String"),
                     ("Description", "Genotype")]))
    header.add_format_line(
        OrderedDict([("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Filtered read depth (MAPQ > 30)")]))

    # adding contig lines
    header.add_contig_line(
        OrderedDict([("ID", "chr1"), ("length", "248956422")]))

    # adding sample lines
    header.add_line(
        vcfpy.SampleHeaderLine.from_mapping(
            OrderedDict([("ID", "Sample1"), ("Description", "Tumor")])))

    # writing the vcf
    with vcfpy.Writer.from_path(outvcf, header) as writer:

        # one call per sample declared in the header; DP is declared as
        # Integer, so integers are passed rather than strings
        calls = [
            vcfpy.Call("Sample1", OrderedDict([("GT", "0/1"), ("DP", 47)])),
            vcfpy.Call("Sample2", OrderedDict([("GT", "0/1"), ("DP", 31)])),
        ]

        # CHROM uses the contig ID declared in the header ("chr1")
        record = vcfpy.Record(CHROM="chr1",
                              POS=1,
                              ID=[],
                              REF="C",
                              ALT=[vcfpy.Substitution(type_="SNV", value="G")],
                              QUAL=None,
                              FILTER=["PASS"],
                              INFO={
                                  "DP": 50,
                                  "MUT": 0
                              },
                              FORMAT=["GT", "DP"],
                              calls=calls)
        writer.write_record(record)
Esempio n. 15
0
def write_snp_to_vcf(
    snp_filename: Path,
    vcf_filename: Path,
    genome_filename: Path,
    genome_d: LazyFastaReader = None,
) -> None:
    """Convert the records of a SNP file into a single-sample VCF file.

    Consecutive SNP records are merged into one VCF record when they share
    the same reference position (multi-nucleotide insertion) or when both
    have ``"."`` as query base (multi-nucleotide deletion).

    Args:
        snp_filename: file read through ``SNPReader``.
        vcf_filename: path of the VCF file to write.
        genome_filename: reference FASTA, only read when ``genome_d`` is None.
        genome_d: already loaded reference, indexed by reference name.

    The VCF header is taken from ``__VCF_EXAMPLE__``, which is written to
    ``template.vcf`` in the current directory.  The sample name is the query
    name of the first SNP record.

    NOTE(review): the vcfpy calls follow the API used by the other writers in
    this file (``Reader/Writer.from_path``, ``Record(..., calls=...)``,
    ``Call(sample, data)``, ``Substitution(type_, value)``) -- confirm against
    the installed vcfpy version.
    """
    # read the genome if genome_d is not given
    if genome_d is None:
        genome_d = LazyFastaReader(genome_filename)

    # read the first SNP record so we know the query name
    snp_reader = SNPReader(snp_filename)
    snp_rec = next(snp_reader)
    sample_name = snp_rec.query_name
    cur_recs = [snp_rec]
    genome_rec = genome_d[snp_rec.ref_name]

    def _group_to_record(group, ref_seq):
        """Turn one group of merged SNP records into a VCF record."""
        first = group[0]
        if (len(group) == 1 and first.ref_base != "." and
                first.query_base != "."):  # just a SNP record
            pos = first.ref_pos
            ref_base = first.ref_base
            alt_base = first.query_base
        elif first.ref_base == ".":
            # single or multi-nt insertion, must retrieve ref base from genome
            # ex: in out.snps_files it is . --> ATG
            # in VCF it should be T --> TATG (meaning insertion of ATG)
            pos = first.ref_pos
            ref_base = ref_seq[first.ref_pos]
            alt_base = ref_base + "".join(r.query_base for r in group)
        else:
            # single or multi-nt deletion, we need one more ref base before the first deletion
            # ex: in out.snps_files it is GGG --> deletion
            # in VCF it should be TGGG --> T (meaning deletion of GGG)
            pos = first.ref_pos - 1
            ref_base_prev = ref_seq[pos]
            ref_base = ref_base_prev + "".join(r.ref_base for r in group)
            alt_base = ref_base_prev

        alt_type = vcfpy.SNV if len(ref_base) == len(alt_base) else vcfpy.INDEL
        return vcfpy.Record(
            # the chromosome of this group, not of the first record of the file
            CHROM=first.ref_name,
            POS=pos + 1,
            ID=[],
            REF=ref_base,
            ALT=[vcfpy.Substitution(alt_type, alt_base)],
            QUAL=None,
            FILTER=["PASS"],
            INFO={"AF": 0.5},
            FORMAT=["GT"],
            calls=[vcfpy.Call(sample_name, {"GT": "0|1"})],
        )

    # The template is closed before it is parsed, so the reader starts at the
    # beginning of the file instead of at the end of what was just written.
    with open("template.vcf", "w") as f:
        f.write(f"{__VCF_EXAMPLE__}\n")
    with vcfpy.Reader.from_path("template.vcf") as template:
        header = template.header
    header.samples = vcfpy.SamplesInfos([sample_name])

    with vcfpy.Writer.from_path(str(vcf_filename), header) as f_vcf:
        for r1 in snp_reader:
            same_position = r1.ref_pos == cur_recs[-1].ref_pos
            both_deleted = (r1.query_base == "." and
                            cur_recs[-1].query_base == ".")
            if same_position or both_deleted:
                # multi-nt insertion / deletion, keep recording
                cur_recs.append(r1)
                continue

            # time to write out the current set of records
            f_vcf.write_record(_group_to_record(cur_recs, genome_rec))
            if r1.ref_name != cur_recs[0].ref_name:
                genome_rec = genome_d[r1.ref_name]
            cur_recs = [r1]

        # the group still being accumulated when the input ends
        f_vcf.write_record(_group_to_record(cur_recs, genome_rec))
Esempio n. 16
0
def main():
    """Aggregate the per-cell calls of a single-cell VCF into one call per cluster.

    Command-line arguments: the single-cell VCF, a CSV file with the columns
    ``cluster`` and ``cellid``, and an output prefix.  For every input record
    the DP, RD and AD values of the cells of each cluster are summed and
    written as one sample column per cluster to ``<prefix>_clusters.vcf``.
    INFO/SUPP is 1 when at least one cluster has alternate-allele reads.
    """
    parser = argparse.ArgumentParser(description="From single cell VCF to clones vcf.")
    parser.add_argument("input1", metavar="sample.muts.vcf", action="store", help="Single cell VCF file.", type=str)
    parser.add_argument("input2", metavar="clusters.list", action="store", help="Clusters list.", type=str)
    parser.add_argument("outprefix", metavar="out/path/prefix", action="store", help="Output prefix", type=str)

    args = parser.parse_args()

    input1 = args.input1
    input2 = args.input2
    prefix = args.outprefix

    clusters_df = pd.read_csv(input2)

    # cluster ids and their cells do not depend on the record: compute them once
    cluster_ids = clusters_df['cluster'].unique()
    cells_by_cluster = {
        c: list(clusters_df['cellid'][clusters_df['cluster'] == c])
        for c in cluster_ids
    }
    clusters = [str(cluster) for cluster in cluster_ids]

    # Create out header
    header_out = vcfpy.Header(lines=[ vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"), vcfpy.HeaderLine(key="source", value=sys.argv[0]), vcfpy.HeaderLine(key="fileDate", value=date.today().strftime("%d/%m/%Y")) ], samples=vcfpy.SamplesInfos(clusters))

    # format header lines
    header_out.add_format_line(OrderedDict([("ID", "GT"),("Number", "1"), ("Type","String"), ("Description", "Genotype (0/1, 0/0)")]))
    header_out.add_format_line(OrderedDict([("ID", "DP"),("Number", "1"), ("Type","Integer"), ("Description", "Filtered read depth (reads with MAPQ < 30, indels and gaps are filtered)")]))
    header_out.add_format_line(OrderedDict([("ID", "RD"),("Number", "1"), ("Type","Integer"), ("Description", "Reference allele read depth")]))
    header_out.add_format_line(OrderedDict([("ID", "AD"),("Number", "1"), ("Type","Integer"), ("Description", "Alternate allele read depth")]))
    header_out.add_format_line(OrderedDict([("ID", "AF"),("Number", "1"), ("Type","Float"), ("Description", "Allele frequency: AD/(RD+AD). Other alleles, in case of mutli-allelic regions, are ignored.")]))

    # info header lines
    header_out.add_info_line(OrderedDict([("ID", "SUPP"), ("Number", "1"), ("Type","Integer"), ("Description", "Whether the mutation is supported or not.")]))

    # the context managers close both files even when a record fails
    with vcfpy.Reader.from_path(input1) as reader, \
            vcfpy.Writer.from_path(prefix+"_clusters.vcf", header_out) as writer:
        # for each record in the vcf file
        for record_in in reader:
            d = samples_dict(cluster_ids)
            supp = 0

            # for each cluster compute 'GT:DP:RD:AD:AF' to be provided as call argument
            for c in cluster_ids:
                # retrieve the calls of the cells in the current cluster
                cell_calls = [record_in.call_for_sample[cell] for cell in cells_by_cluster[c]]
                # sum total, ref and alt read counts; a missing value counts as 0
                for call in cell_calls:
                    d[c]['dp'] += call.data.get('DP') or 0
                    d[c]['rd'] += call.data.get('RD') or 0
                    d[c]['ad'] += call.data.get('AD') or 0

                if d[c]['ad'] > 0:
                    d[c]['gt'] = "0/1"
                    d[c]['af'] = d[c]['ad'] / (d[c]['rd'] + d[c]['ad'])
                    supp = 1

            # create one call for each cluster
            calls = [
                vcfpy.Call(str(c), OrderedDict([("GT", d[c]['gt']), ("DP", d[c]['dp']), ("RD", d[c]['rd']), ("AD", d[c]['ad']), ("AF", d[c]['af'])]))
                for c in d
            ]

            # write new record; only the first ALT allele of the input is kept
            record_out = vcfpy.Record(
                CHROM=record_in.CHROM,
                POS=record_in.POS,
                ID=[],
                REF=record_in.REF,
                ALT=[vcfpy.Substitution(type_="SNV", value=record_in.ALT[0].value)],
                QUAL=None,
                FILTER=[],
                INFO={"SUPP": supp},
                FORMAT=["GT", "DP", "RD", "AD", "AF"],
                calls=calls,
            )
            writer.write_record(record_out)
Esempio n. 17
0
def generate_non_sv_records(colocated_records, sample_names):
    """
    Build the output records for input records that were not used to call a SV.

    The records are grouped by CHROM, POS, REF, ALT and the INFO fields END
    and INSSEQ; every group produces exactly one ``vcfpy.Record`` holding one
    call per entry of ``sample_names``.

    :param colocated_records: records to regroup
    :param sample_names: names of the samples, one call is generated for each
    :return: list of new records, in no particular order (per the original
        author's note they are sorted later, before the VCF is generated)
    """

    def full_coordinates(record):
        # true position (CHROM+POS) plus everything that makes two records "the same"
        return (record.CHROM, record.POS, record.REF, str(record.ALT),
                record.INFO.get("END", None), record.INFO.get("INSSEQ", None))

    grouped = group_by(colocated_records, key=full_coordinates)

    new_records = []
    for _, members in grouped.items():
        # map each sample name to its records within this group
        by_sample = group_by(members, get_sample_name)

        calls = []
        for sample_name in sample_names:
            sample_records = by_sample.get(sample_name, [])
            calls.append(get_sample_call(sample_name, sample_records))

        # all members share CHROM, POS, REF, ALT, END and INSSEQ by
        # construction, so the first one can stand for the whole group
        representative = members[0]
        record_id = generate_id(representative.CHROM, representative.POS)

        info = vcfpy.OrderedDict()
        info["SVTYPE"] = "BND"
        info["TRANCHE2"] = maximum_tranche(members)
        info["BNDVAF"] = get_average_vaf(members)
        for optional_key in ("END", "INSSEQ"):
            if optional_key in representative.INFO:
                info[optional_key] = representative.INFO[optional_key]

        new_record = vcfpy.Record(
            CHROM=representative.CHROM,
            POS=representative.POS,
            ID=[record_id],
            REF=representative.REF,
            ALT=representative.ALT,
            QUAL=maximum_qual(members),
            FILTER=["PASS"],
            INFO=info,
            FORMAT=["GT", "TRANCHE2", "VAF"],
            calls=calls)
        new_records.append(new_record)

    return new_records
Esempio n. 18
0
    #***** FORMAT *****#
    lst_format_id = ['GT', 'DP', 'AF']
    # Merge GT field
    try: set_gt.remove('./.')
    except: pass
    if len(set_gt)==1: field_gt = set_gt.pop()
    else: field_gt = "./."
    # Merge DP field
    field_dp = int(round(numpy.median(lst_dp),0))
    # Merge AF field
    while lst_af.count(".")>0: lst_af.remove(".")
    field_af = float(round(numpy.median(lst_af),2))
    # Create call
    dico_calls = [vcfpy.Call(dico_vcf[var_id]["sample"], {'GT':field_gt, 'DP':field_dp, 'AF':[field_af]})] 
    #***** WRITE VARIANT *****#
    new_record = vcfpy.Record(chrom, pos, ".", ref, dico_vcf[var_id]["ALT"], field_qual, [field_filter], dico_info, lst_format_id, dico_calls)
    writer.write_record(new_record)
writer.close()



#***** POST-PROCESSING *****#
# Sort
sortVCF(pathMergeUnsortedVCF,pathMergeVCF)
# Validate
boolvalid,lst_errors = validateVCF(path_vcfvalidator,pathMergeVCF)
if boolvalid==False: exit("🅴 🆁 🆁 🅾 🆁\n[Nk_mergeVCF] Validate VCF `"+os.path.basename(pathMergeVCF)+"`\n    "+"\n    ".join(lst_errors))
# bgzip
# The command is passed as an argument list and run without a shell, so a
# path containing spaces or shell metacharacters is handed to bgzip verbatim.
cmd_bgzip = ["bgzip", "-f", pathMergeVCF]
process = subprocess.Popen(cmd_bgzip, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = process.communicate()
def block_to_records(block, prev_block):
    """Yield the VCF records for one alignment block, one per codon.

    Each record has REF "N", no ALT, and an INFO dict with END, UCSC_GENE,
    EXON, EXON_COUNT and ALIGNMENT (one character taken from every sequence
    in ``aa_seqs``).  Coordinates are obtained from ``pos_magic``.

    NB: the first/last amino acids are stored for the exon with the major
    part of the codon.  ``prev_block`` is required whenever the block does
    not start on a codon border (``in_frame`` is non-zero).
    """
    logging.debug("Starting new block")
    block_meta = block.meta
    block_loc = block.meta.location
    frame_in = block.meta.in_frame
    frame_out = block.meta.out_frame

    logging.debug("prev_block: %s", prev_block)
    logging.debug("block: %s", block)
    logging.debug("%s\n%s", block_meta, "\n".join(block.aa_seqs))

    def codon_record(chrom, first, last, gene_meta, column):
        # All records of this generator share the same layout; only the
        # coordinates, the exon metadata and the alignment column differ.
        return vcfpy.Record(
            CHROM=chrom,
            POS=first + 1,
            ID=[],
            REF="N",
            ALT=[],
            FILTER=[],
            QUAL=None,
            INFO={
                "END": last,
                "UCSC_GENE": gene_meta.ucsc_gene_id,
                "EXON": gene_meta.exon_idx,
                "EXON_COUNT": gene_meta.exon_count,
                "ALIGNMENT": column,
            },
            FORMAT={},
            calls=[],
        )

    # Special case handling for first codon.
    assert not frame_in or prev_block
    if frame_in == 1:
        # This exon's alignment holds the amino acid; emit it on the last
        # codon position of the previous exon.
        prev_loc = prev_block.meta.location
        prev_len = prev_loc[2] - prev_loc[1]
        first, last = pos_magic(prev_loc, prev_len - 1, prev_len)
        logging.debug("in_frame == 1, start, end = %d, %d", first, last)
        leading_column = "".join(seq[0] for seq in block.aa_seqs)
        yield codon_record(prev_block.meta.location[0], first, last,
                           prev_block.meta, leading_column)
        # We start at the first codon of this exon
        starts, ends = [0], [2]
    elif frame_in == 2:
        # The previous exon's alignment holds the amino acid; emit it on the
        # first codon position of this exon.
        first, last = pos_magic(block_loc, 0, 1)
        logging.debug("in_frame == 2, start, end = %d, %d", first, last)
        trailing_column = "".join(seq[-1] for seq in prev_block.aa_seqs)
        yield codon_record(block_loc[0], first, last, block_meta,
                           trailing_column)
        # We start at the second codon on this exon
        starts, ends = [1], [4]
    else:
        # Start at codon border
        logging.debug("in_frame == 0, start, end = 0, 3")
        starts, ends = [0], [3]

    # Handle major part of exon.
    nts = block_loc[2] - block_loc[1]
    assert (nts - (3 - frame_in) - frame_out) % 3 == 0
    first_end = ends[0]
    starts = starts + list(range(first_end, nts, 3))
    ends = ends + list(range(first_end + 3, nts, 3))
    if ends[-1] != nts:
        ends.append(nts)
    logging.debug("nts=%s", nts)
    logging.debug("starts=%s", starts)
    logging.debug("ends=%s", ends)

    if frame_out == 2:
        # We have the amino acid for the last partial codon.  If out_frame == 1
        # then the next exon will take care of writing the record.
        last_end = ends[-1]
        starts.append(last_end)
        ends.append(last_end + 2)

    for idx, (rel_start, rel_end) in enumerate(zip(starts, ends)):
        if idx >= block_meta.exon_len:
            logging.debug("Too short AA seq found for %s, this happens...",
                          block_meta.ucsc_gene_id)
            continue
        first, last = pos_magic(block_loc, rel_start, rel_end)
        column = "".join(seq[idx] for seq in block.aa_seqs)
        yield codon_record(block_loc[0], first, last, block_meta, column)
Esempio n. 20
0
def _count_alleles(samfile, chrom, pos, ref, alt):
    """Count the reads supporting ``ref`` and ``alt`` at one 0-based position.

    Returns ``(sdp, dp, rd, ad)``: the pileup depth reported by pysam, the
    number of reads left after dropping deletions, reference skips and reads
    with mapping quality below 30, and how many of those carry the reference
    and the alternate base.  All four are 0 when the position has no pileup.
    """
    sdp = dp = rd = ad = 0
    for pileupcolumn in samfile.pileup(chrom, pos, pos+1, stepper='all', truncate=True, max_depth=10000):
        # number of reads at this position
        sdp = pileupcolumn.n
        dp = rd = ad = 0
        for base in pileupcolumn.pileups:
            # .is_del -> the base is a deletion
            # .is_refskip -> the base is a N in the CIGAR string
            if base.is_del or base.is_refskip or base.alignment.mapping_quality < 30:
                continue
            dp += 1
            nucleotide = base.alignment.query_sequence[base.query_position]
            if nucleotide == alt:
                ad += 1
            elif nucleotide == ref:
                rd += 1
    return sdp, dp, rd, ad


def main():
    """Look up the SNVs of a VCF file in a BAM file and write the read support.

    Command-line arguments: BAM file, VCF file, sample identifier and output
    prefix.  For every SNV of the input VCF one record is written to
    ``<prefix>.snpseeker.vcf`` with the counts found in the BAM file as the
    sample call; records that are not SNVs are skipped.  The FORMAT values of
    the first sample of the input record are copied into INFO, and INFO/SUPP
    tells whether at least one read carries the alternate base.
    """
    parser = argparse.ArgumentParser(description="Looks for a given set of SNPs whithin a bam file.")

    parser.add_argument("bam", metavar='sample.bam', action='store',
        help='BAM file.', type=str)

    parser.add_argument("vcf", metavar='file.vcf', action='store',
        help="VCF file storing SNPs.", type=str)

    parser.add_argument("sample_name", metavar='sample1', action='store',
                help="Sample identifier.", type=str)

    parser.add_argument("out_prefix", metavar="outdir/sample", action="store",
        help="Output VCF file prefix.", type=str)

    args = parser.parse_args()
    bam = args.bam
    invcf = args.vcf
    sample = args.sample_name
    outvcf = args.out_prefix + ".snpseeker.vcf"

    # build the header of the output vcf
    header_out = vcfpy.Header(lines=[vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"), vcfpy.HeaderLine(key="source", value=sys.argv[0]), vcfpy.HeaderLine(key="fileDate", value=date.today().strftime("%d/%m/%Y")) ], samples=vcfpy.SamplesInfos([sample]))

    # sample header lines
    header_out.add_line(vcfpy.HeaderLine(key="SampleName", value=sample))

    # info header lines
    header_out.add_info_line(OrderedDict([("ID", "SUPP"), ("Number", "1"), ("Type","Integer"), ("Description", "States if the mutation is supported (1) or not (0).")]))

    # adding format lines
    header_out.add_format_line(OrderedDict([("ID", "GT"),("Number", "1"), ("Type","String"), ("Description", "Genotype (0/1, 0/0)")]))
    header_out.add_format_line(OrderedDict([("ID", "SDP"),("Number", "1"), ("Type","Integer"), ("Description", "Samtools read depth (secondary alignments, PCR duplicates, unppammed reads and reads not passing vendor QC are filtered)")]))
    header_out.add_format_line(OrderedDict([("ID", "DP"),("Number", "1"), ("Type","Integer"), ("Description", "Filtered read depth (reads with MAPQ < 30, indels and gaps are filtered)")]))
    header_out.add_format_line(OrderedDict([("ID", "RD"),("Number", "1"), ("Type","Integer"), ("Description", "Reference allele read depth")]))
    header_out.add_format_line(OrderedDict([("ID", "AD"),("Number", "1"), ("Type","Integer"), ("Description", "Alternate allele read depth")]))
    header_out.add_format_line(OrderedDict([("ID", "AF"),("Number", "1"), ("Type","Float"), ("Description", "Allele frequency: AD/(RD+AD). Other alleles, in case of mutli-allelic regions, are ignored.")]))

    # The BAM file is opened once; the context managers close every file even
    # when processing a record fails.
    with pysam.AlignmentFile(bam, "rb") as samfile, \
            vcfpy.Reader.from_path(invcf) as reader:

        # Every FORMAT field of the input becomes an INFO field of the output.
        # format_line.value looks like
        #   <ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
        for format_id in reader.header.format_ids():
            format_line = reader.header.get_format_field_info(format_id)
            mapping = str_to_mapping(format_line.value)
            mapping["Description"] = "(Info about mutation in the original vcf)" + mapping["Description"]
            # register the mapping that carries the prefixed description
            header_out.add_info_line(mapping)

        # open the output vcf
        with vcfpy.Writer.from_path(outvcf, header_out) as writer:
            # for each mutation in the vcf file
            for record_in in reader:
                # filter out indels: only interested in snvs in this analysis phase
                if not record_in.is_snv():
                    continue
                chrom = record_in.CHROM
                pos = record_in.POS-1  # pysam positions are 0-based, VCF ones 1-based
                ref = record_in.REF
                # record_in.ALT is a list which contains only one value if the mutation is a SNV
                alt = record_in.ALT[0].value

                # counts start from zero for every record, also when the
                # position is not covered by any read
                sdp, dp, rd, ad = _count_alleles(samfile, chrom, pos, ref, alt)

                if ad > 0:
                    af = ad / (rd + ad)
                    supp = 1
                    gt = "0/1"  # temporary, all the supported mutations are set to 0/1
                else:
                    af = 0.0
                    supp = 0
                    gt = "0/0"

                info_d = {}
                info_d['SUPP'] = supp
                for f in record_in.FORMAT:
                    info_d[f] = record_in.calls[0].data.get(f)

                record_out = vcfpy.Record(
                    CHROM=chrom,
                    POS=pos+1,
                    ID=[],
                    REF=ref,
                    ALT=[vcfpy.Substitution(type_="SNV", value=alt)],
                    QUAL=None,
                    FILTER=[],
                    INFO=info_d,
                    FORMAT=["GT","SDP","DP","RD","AD","AF"],
                    calls=[vcfpy.Call(sample, OrderedDict([("GT", gt), ("SDP",sdp), ("DP", dp), ("RD", rd), ("AD", ad), ("AF", af)]))],
                )
                writer.write_record(record_out)
Esempio n. 21
0
def _build_sample_calls(samples, contig_loci):
    """Build one vcfpy.Call (GT and PS fields) per sample for a single locus.

    Arguments:
    samples -- iterable of sample names, in output order
    contig_loci -- mapping sample name -> dict with keys "1", "2" (the two
                   haplotype alleles) and "ps" for this locus

    Returns:
    calls -- list of vcfpy.Call; samples absent from contig_loci get the
             genotype "0/0" with PS 0
    """
    calls = []
    for sample in samples:
        sample_gt = "0/0"
        ps = 0
        if sample in contig_loci:
            sample_locus = contig_loci[sample]
            sample_gt = sample_locus["1"] + "|" + sample_locus["2"]
            ps = sample_locus["ps"]
        calls.append(vcfpy.Call(sample=sample,
                                data=vcfpy.OrderedDict(GT=sample_gt, PS=ps)))
    return calls


def extract_consensus_insertions(contig_path, cons_path, ref_fasta_path, vcf_out_path, vcf_template_path, min_insertion_size, flank_length, flanked_contigs_path):
    """
    Align each consensus sequence to its reference window and write every
    large insertion found in the best alignment as a VCF record, plus the
    inserted sequence with reference flanks as a FASTA entry.

    Arguments:
    contig_path -- input handed to collect_genotypes() to obtain the samples
                   and the per-locus genotypes
    cons_path -- FASTA of consensus sequences; each sequence name must end in
                 "_<start>_<end>" preceded by the chromosome name
    ref_fasta_path -- reference FASTA
    vcf_out_path -- path of the VCF file to write
    vcf_template_path -- VCF whose header is reused for the output (its sample
                         list is replaced by the collected samples)
    min_insertion_size -- only insertions strictly longer than this are reported
    flank_length -- number of reference bases fetched on each side of the
                    break point for the flanked FASTA (no flanks if <= 0)
    flanked_contigs_path -- path of the flanked-insertion FASTA to write

    Returns:
    n_records -- number of insertion records written
    """
    # Local import: keeps this block self-contained without touching the
    # file-level imports.
    from contextlib import ExitStack

    n_records = 0

    # Every handle is registered on the stack so that all of them (most
    # importantly the VCF writer, whose buffered output would otherwise be
    # lost) are closed on both the success and the error path.
    with ExitStack() as stack:
        # open input sequences
        cons_fasta = pysam.FastaFile(cons_path)
        stack.callback(cons_fasta.close)
        ref_fasta = pysam.FastaFile(ref_fasta_path)
        stack.callback(ref_fasta.close)

        flanked_contig_fasta = stack.enter_context(open(flanked_contigs_path, "w"))

        (samples, loci) = collect_genotypes(contig_path)
        print("Found", len(samples), "samples for", len(loci), "phased loci")

        reader = vcfpy.Reader.from_path(vcf_template_path)
        stack.callback(reader.close)
        reader.header.samples = vcfpy.SamplesInfos(list(samples))
        writer = vcfpy.Writer.from_path(vcf_out_path, reader.header)
        stack.callback(writer.close)

        for contig in cons_fasta.references:
            # parse coordinates; split from the right so chromosome names that
            # themselves contain "_" are still handled
            (chrom, start, end) = contig.rsplit("_", 2)
            (start, end) = int(start), int(end)

            cons_seq = cons_fasta.fetch(contig)
            ref_seq = ref_fasta.fetch(chrom, start, end)

            aligner = mappy.Aligner(seq = ref_seq, preset = None , k = 15, w = 10, n_threads = 1,
                                    max_join_long = 20000, max_join_short = 10000, min_join_flank_sc = 10,
                                    min_join_flank_ratio = 0.1, max_gap = 10000, bw = 2000, end_bonus = 10,
                                    zdrop = 10000, zdrop_inv = 1000,
                                    scoring = (2, 4, 4, 10, 300, 0, 1),
                                    extra_flags = 0x1)
            alignments = list(aligner.map(cons_seq, seq2 = None, cs = True, MD = False))

            if not alignments:
                print("No hits in", contig)
                continue

            # keep only the alignment with the largest blen
            aln = max(alignments, key = lambda x: x.blen)

            cig = cigar.Cigar(aln.cigar_str)
            ops = list(cig.items())

            # running offsets into the consensus and into the reference window
            cons_pos = aln.q_st
            target_pos = aln.r_st

            strand = "+"
            if aln.strand == -1:
                # NOTE(review): cons_seq is reverse-complemented here but
                # cons_pos still starts at aln.q_st; if q_st is reported on the
                # original query strand the offset should probably be
                # len(cons_seq) - aln.q_en -- confirm against the mappy docs.
                cons_seq = str(Bio.Seq.Seq(cons_seq).reverse_complement())
                strand = "-"

            for (op_len, op_type) in ops:
                # skip matches
                if op_type == 'M':
                    cons_pos += op_len
                    target_pos += op_len

                # skip deletions in the query sequence
                elif op_type == 'D':
                    target_pos += op_len

                # insertions in the query sequence
                elif op_type == 'I':
                    # only interested in large insertions
                    if op_len > min_insertion_size:
                        # anchor base = reference base just before the insertion
                        # NOTE(review): the 0-based -> 1-based conversion of
                        # break_point was flagged as unchecked by the original
                        # author -- confirm.
                        ref_allele = ref_seq[target_pos-1]
                        alt_allele = cons_seq[cons_pos:cons_pos + op_len]

                        break_point = start + target_pos
                        record_id = contig + "_" + str(cons_pos)

                        # output VCF record (vcfpy.Record) corresponding to
                        # the insertion
                        rec = vcfpy.Record(CHROM = chrom, POS = break_point, ID = [record_id],
                                           REF = ref_allele, ALT = [vcfpy.Substitution("INS", ref_allele + alt_allele)],
                                           QUAL = 999, FILTER = ["PASS"],
                                           INFO = vcfpy.OrderedDict(SVLEN = op_len,
                                                                    CIGAR = [str(cig)],
                                                                    STRAND = strand,
                                                                    CONTIG_START = str(aln.q_st)),
                                           FORMAT = ["GT", "PS"],
                                           calls = _build_sample_calls(samples, loci[contig]))
                        writer.write_record(rec)

                        # output same insertion, but with flanking sequences
                        # note, the interval is [start, end[
                        if flank_length > 0:
                            left_flank = ref_fasta.fetch(chrom, break_point - flank_length, break_point)
                            right_flank = ref_fasta.fetch(chrom, break_point, break_point + flank_length)
                        else:
                            left_flank = ""
                            right_flank = ""

                        # NOTE(review): alt_allele holds only the inserted bases
                        # (the anchor base is not included), so [1:] drops the
                        # first inserted base -- confirm this is intended.
                        flanked_contig_fasta.writelines([ ">" + record_id + "\n",
                                                         left_flank + alt_allele[1:] + right_flank + "\n"])

                        n_records += 1

                    cons_pos += op_len

    return n_records