コード例 #1
0
def main(args_input=sys.argv[1:]):
    """Set a genotype (GT) value for one sample on every record of a VCF.

    The arguments are parsed with the parser returned by ``define_parser()``.
    The reader and writer come from ``create_vcf_reader`` and
    ``create_vcf_writer``; the latter also reports whether the sample named
    by ``args.sample_name`` is already present (``append_to_existing_sample``).

    :param args_input: list of command-line arguments (defaults to the
        process arguments captured at import time).
    """
    parser = define_parser()
    args = parser.parse_args(args_input)

    vcf_reader = create_vcf_reader(args)
    # try/finally so that reader and writer are closed even when a record
    # fails half-way through; previously an exception leaked both handles.
    try:
        (vcf_writer,
         append_to_existing_sample) = create_vcf_writer(args, vcf_reader)
        try:
            for entry in vcf_reader:
                # Make sure GT is declared in the record's FORMAT column.
                if "GT" not in entry.FORMAT:
                    if isinstance(entry.FORMAT, tuple):
                        # A tuple cannot be modified in place, so it is
                        # replaced by a fresh list holding only GT.
                        entry.FORMAT = ["GT"]
                    else:
                        entry.FORMAT.insert(0, 'GT')
                if append_to_existing_sample:
                    # Sample already has a call: overwrite its GT value.
                    entry.call_for_sample[
                        args.sample_name].data['GT'] = args.genotype_value
                else:
                    # New sample: add a call carrying only the GT value.
                    new_sample_call = vcfpy.Call(
                        args.sample_name, data={'GT': args.genotype_value})
                    if entry.calls:
                        entry.calls.append(new_sample_call)
                    else:
                        entry.calls = [new_sample_call]
                    # Rebuild the sample -> call lookup so it includes the
                    # call that was just added.
                    entry.call_for_sample = {
                        call.sample: call for call in entry.calls
                    }
                vcf_writer.write_record(entry)
        finally:
            vcf_writer.close()
    finally:
        vcf_reader.close()
コード例 #2
0
ファイル: test_reader_nosample.py プロジェクト: yeemey/vcfpy
def test_reading_parse_nosample(tmpdir, nosample_vcf_file):
    """Read VCF file without samples, write file with samples.

    Every record read from ``nosample_vcf_file`` gets one empty call for each
    of three samples plus a ``GT`` format entry, and the file written to
    ``tmpdir`` is compared verbatim against the expected text below.
    """
    # Perform record-wise copying, saving results in records
    path_out = tmpdir.mkdir("output").join("output.vcf")
    with vcfpy.Reader.from_path(nosample_vcf_file) as reader:
        # Work on a copy of the input header and declare the three samples
        # that the output file should carry.
        header = reader.header.copy()
        header.samples = vcfpy.SamplesInfos(["NA00001", "NA00002", "NA00003"])
        with vcfpy.Writer.from_path(str(path_out), header) as writer:
            for record in reader:
                # One call with empty data per declared sample.
                record.update_calls([
                    vcfpy.Call(sample, {})
                    for sample in ("NA00001", "NA00002", "NA00003")
                ])
                # Add GT to the record's FORMAT with "./." as the value
                # passed for it.
                record.add_format("GT", "./.")
                writer.write_record(record)

    # The columns in the expected text are TAB-separated; keep the tabs intact.
    # NOTE(review): the species value reads `H**o sapiens`, which looks like a
    # text-filter artifact of "Homo sapiens" -- confirm against the fixture.
    expected = textwrap.dedent("""
    ##fileformat=VCFv4.3
    ##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="H**o sapiens",taxonomy=x>
    ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
    #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA00001	NA00002	NA00003
    20	14370	.	G	A	29	.	.	GT	.	.	.
    20	17330	.	T	A	3	.	.	GT	.	.	.
    20	1110696	.	A	G,T	67	.	.	GT	.	.	.
    20	1230237	.	T	.	47	.	.	GT	.	.	.
    20	1234567	.	GTC	G,GTCT	50	.	.	GT	.	.	.
    """).lstrip()

    assert path_out.open("rt").read() == expected
コード例 #3
0
def collect_all_vcf(
    dirs: str,
    vcf_filename: str = "phased.partial.vcf",
    output: str = "IsoSeq_IsoPhase.vcf",
) -> None:
    """Merge the per-directory phased VCF files into one sorted VCF.

    For every directory in ``dirs`` the file ``vcf_filename`` is read; the
    genotype/HQ values of all samples of a record are tallied and replaced by
    a single call for the sample ``"SAMPLE"``. Records are then written to
    ``output`` ordered by chromosome and position. Nothing is written when no
    input file could be read.

    :param dirs: iterated element by element, each element being used as a
        directory path (despite the ``str`` annotation).
    :param vcf_filename: name of the VCF file expected inside each directory.
    :param output: path of the merged VCF to write.
    """
    # Marker file looked up relative to the current working directory.
    no_snp_found_filename = Path(f"{Path(vcf_filename).stem}.NO_SNPS_FOUND")
    snps_by_chrom = defaultdict(list)

    reader = None

    for d in dirs:
        filename = Path(d, vcf_filename)
        if not filename.exists():
            if not no_snp_found_filename.exists():
                # The original message lacked the f-prefix and logged the
                # literal text "{filename}"; pass the path as a lazy argument.
                logger.info("VCF file %s does not exist. Skipping.", filename)
            continue
        with open(filename) as rf:
            reader = vcfpy.Reader(rf)

            # NOTE(review): ``r.samples``, ``x.data.GT`` and the three-argument
            # ``Call`` below look like the PyVCF record model rather than
            # vcfpy's -- confirm against the vcfpy version in use.
            for r in reader:
                c = Counter()  # genotype -> count
                for x in r.samples:
                    if "|" not in x.data.GT:
                        c[x.data.GT] += x.data.HQ
                    else:
                        # Phased genotype: HQ holds one count per allele.
                        for i, gt in enumerate(x.data.GT.split("|")):
                            c[gt] += x.data.HQ[i]
                genotype = "|".join(str(k) for k in c)
                counts = ",".join(str(v) for v in c.values())
                r.samples = [
                    vcfpy.Call(
                        r,
                        "SAMPLE",
                        vcfpy.OrderedDict([("GT", genotype), ("HQ", counts)]),
                    )
                ]
                snps_by_chrom[r.CHROM].append((r.POS, r))

    if reader is None:
        # No input VCF was found, so there is nothing to write.
        return

    # The last reader that was opened is reused as the template for the writer.
    reader.samples = ["SAMPLE"]
    with open(output, "w") as f:
        writer = vcfpy.Writer(f, reader)
        for chrom in sorted(snps_by_chrom):
            # Stable sort on position only, keeping input order for ties.
            for _, rec in sorted(snps_by_chrom[chrom], key=lambda x: x[0]):
                writer.write_record(rec)
    print("Output written to:", output)
コード例 #4
0
def get_sample_call(sample_name, records):
    """
    Build the Call of a single sample at a given location from a single
    record, multiple records or no record at all.

    :param sample_name: name of the sample the call belongs to
    :param records: records found for the sample at this location; when empty
        or None, every field of the call is left as None
    :return: a ``vcfpy.Call`` with the fields GT, TRANCHE2 and VAF (in that order)
    """
    if records:
        vaf = get_average_vaf(records)
        # GT is derived from the averaged VAF.
        values = (get_gt(vaf), maximum_tranche(records), vaf)
    else:
        values = (None, None, None)

    fields = vcfpy.OrderedDict(zip(("GT", "TRANCHE2", "VAF"), values))
    return vcfpy.Call(sample=sample_name, data=fields)
コード例 #5
0
 def _write_variants_data(self):
     """Write one VCF record for each small variant from ``self._yield_smallvars()``."""
     # (genotype key, default when the key is missing, converter for the value)
     field_specs = (
         ("gt", "./.", lambda x: x),
         ("gq", None, lambda x: x),
         ("ad", None, lambda x: None if x is None else [x]),
         ("dp", None, lambda x: x),
     )
     for small_var in self._yield_smallvars():
         # Classify the variant by comparing reference and alternative lengths.
         ref_len = len(small_var.reference)
         alt_len = len(small_var.alternative)
         if ref_len != alt_len:
             var_type = vcfpy.INDEL
         elif ref_len == 1:
             var_type = vcfpy.SNV
         else:
             var_type = vcfpy.MNV

         # One call per member; members without genotype data get the defaults.
         calls = []
         for member in self.members:
             member_genotype = small_var.genotype.get(member, {})
             call_data = {}
             for key, default_value, convert in field_specs:
                 call_data[key.upper()] = convert(
                     member_genotype.get(key, default_value))
             calls.append(vcfpy.Call(member, call_data))

         # Assemble the ``Record`` and hand it to the writer.
         record = vcfpy.Record(
             small_var.chromosome,
             small_var.start,
             [],
             small_var.reference,
             [vcfpy.Substitution(var_type, small_var.alternative)],
             None,
             [],
             {},
             ["GT", "GQ", "AD", "DP"],
             calls,
         )
         self.vcf_writer.write_record(record)
コード例 #6
0
def generate_records(locus, individuals, chrom, offset):
    """Generate VCF records for all mutations of the given locus.

    :param locus: mapping that is read under the keys ``"allele frequencies"``
        (allele -> frequency), ``"individuals"`` (individual -> allele ->
        mapping with ``"mutations"`` and ``"cov"``) and ``"allele coverages"``.
    :param individuals: individuals that receive a call in every record.
    :param chrom: value used as CHROM of every record.
    :param offset: passed through to ``parse_mutation`` / ``parse_alleles``.
    :return: list with one ``vcfpy`` record per mutation, ordered by position.
    :raises ValueError: if ``allele_present`` reports an unexpected presence
        pattern.
    """
    records = []

    mutation_allele_frequencies = dict()
    mutation_sample_count = defaultdict(set)

    # Build, for every normalized mutation, the summed frequency of all alleles
    # carrying it and the set of individuals in which it was observed.
    for allele, frequency in locus["allele frequencies"].items():
        if allele == 0:
            # skip reference alleles
            continue

        # collect all mutated positions of this allele
        all_mutations = set()
        for ind in individuals:
            try:
                mutations_per_individual = set(
                    locus["individuals"][ind][allele]["mutations"])
                for m in mutations_per_individual:
                    # The original split ``.add`` and ``(ind)`` over two
                    # statements, so no sample was ever recorded (NS == 0).
                    mutation_sample_count[parse_mutation(m, offset)].add(ind)
                all_mutations.update(mutations_per_individual)
            except KeyError:
                # this allele is not present in this individual
                continue

        # normalize them and sort them by position in merged read
        # this is rad seq stacks specific
        allele_mutations = sorted(
            (parse_mutation(a, offset) for a in all_mutations),
            key=lambda mut: mut.pos,
        )
        for mut in allele_mutations:
            if mut in mutation_allele_frequencies:
                mutation_allele_frequencies[mut] += frequency
            else:
                mutation_allele_frequencies[mut] = frequency

    # Every mutation seen on any non-reference allele gets one record. The
    # original reused the list built for the last allele only, which dropped
    # mutations of earlier alleles and raised NameError when the locus had no
    # non-reference allele at all.
    normalized_mutations = sorted(
        mutation_allele_frequencies, key=lambda mut: mut.pos)

    # Coverage per (individual, mutation); key (individual, 0) collects the
    # coverage of alleles without mutations.
    individual_allele_coverage = defaultdict(int)
    for ind_name, alleles in locus["individuals"].items():
        for allele in alleles.values():
            if not allele["mutations"]:
                individual_allele_coverage[ind_name, 0] += allele["cov"]
            else:
                for mut in allele["mutations"]:
                    individual_allele_coverage[
                        ind_name, parse_mutation(mut, offset)
                    ] += allele["cov"]

    # TODO: make sure that there is no position with two different alt bases
    # right now, these are not handled properly
    #
    # create one record for each mutation,
    # i.e. each variant at each mutated position
    for mut in normalized_mutations:
        info = OrderedDict()
        locus_calls = []

        # round allele frequencies
        info["AF"] = [round(mutation_allele_frequencies[mut], 3)]
        # coverage of the variant site is the sum of all reads
        # of all individuals
        info["DP"] = sum(locus["allele coverages"].values())
        # number of samples with mutation
        info["NS"] = len(mutation_sample_count[mut])

        # check for each individual, if the reference base
        # or another base is present at this location
        for ind in individuals:
            individual_calls = OrderedDict()
            individual_alleles = parse_alleles(locus["individuals"][ind],
                                               offset)
            # coverage for the individual is the sum of all reads covering
            # the site
            individual_calls["DP"] = sum(
                i.cov for i in individual_alleles.values()
            )
            # get call strings
            allele_presence, allele_str = allele_present(
                individual_alleles, mut)
            individual_calls["GT"] = allele_str

            ref_cov = individual_allele_coverage[(ind, 0)]
            alt_cov = individual_allele_coverage[(ind, mut)]
            # fill individual allele coverage as a tuple of
            # (coverage of ref allele, coverage of alt allele)
            if allele_presence is None:
                ind_allele_cov = (0, 0)
            elif allele_presence == (0, 0):
                ind_allele_cov = (ref_cov, 0)
            elif allele_presence == (1, 1):
                ind_allele_cov = (0, alt_cov)
            elif allele_presence == (1, 0):
                # NOTE(review): kept as in the original, which reports
                # (alt, ref) here although the tuple is described as
                # (ref, alt) -- confirm the intended AD order.
                ind_allele_cov = (alt_cov, ref_cov)
            elif allele_presence == (0, 1):
                ind_allele_cov = (ref_cov, alt_cov)
            else:
                raise ValueError("Invalid mutation")
            individual_calls["AD"] = ind_allele_cov

            # TODO: handle different variants of the same base on
            # different alleles => REF = A, ALT = C,T, GT= 0|1|2
            locus_calls.append(vcfpy.Call(ind, individual_calls))

        records.append(
            vcfpy.record.Record(
                CHROM=chrom,
                POS=mut.pos,
                ID=[""],
                REF=mut.ref,
                ALT=[vcfpy.Substitution("SNP", mut.alt)],
                QUAL="",
                FILTER=["PASS"],
                INFO=info,
                FORMAT=["GT", "DP", "AD"],
                calls=locus_calls,
            )
        )
    return records
コード例 #7
0
ファイル: Nk_mergeVCF.py プロジェクト: dooguypapua/NiourK_nf
    # Info
    # INFO column of the merged record: number of calls plus the per-call AF,
    # FILTER and QUAL values, each joined with "|" into one string.
    dico_info = { "CALLNB":[nb_call], "CALLAF":["|".join(numpy.array(lst_af,dtype=str))], "CALLFILTER":["|".join(lst_filter).replace(" ","")], "CALLQUAL":["|".join(numpy.array(lst_qual,dtype=str))] }
    #***** FORMAT *****#
    lst_format_id = ['GT', 'DP', 'AF']
    # Merge GT field
    # Drop the missing genotype first; the merged GT is kept only when exactly
    # one value remains in set_gt, otherwise it falls back to "./.".
    # NOTE(review): the bare `except` swallows every error, not only the
    # "value not present" case -- presumably set_gt is a set; confirm.
    try: set_gt.remove('./.')
    except: pass
    if len(set_gt)==1: field_gt = set_gt.pop()
    else: field_gt = "./."
    # Merge DP field
    # Median of lst_dp, rounded and converted to int.
    field_dp = int(round(numpy.median(lst_dp),0))
    # Merge AF field
    # Remove every "." entry from lst_af (in place) before taking the median;
    # the CALLAF string above was already built from the unfiltered list.
    while lst_af.count(".")>0: lst_af.remove(".")
    field_af = float(round(numpy.median(lst_af),2))
    # Create call
    # A single call for the sample stored in dico_vcf[var_id]["sample"].
    dico_calls = [vcfpy.Call(dico_vcf[var_id]["sample"], {'GT':field_gt, 'DP':field_dp, 'AF':[field_af]})] 
    #***** WRITE VARIANT *****#
    new_record = vcfpy.Record(chrom, pos, ".", ref, dico_vcf[var_id]["ALT"], field_qual, [field_filter], dico_info, lst_format_id, dico_calls)
    writer.write_record(new_record)
writer.close()



#***** POST-PROCESSING *****#
# Sort
# Sort the unsorted merged VCF into pathMergeVCF.
sortVCF(pathMergeUnsortedVCF,pathMergeVCF)
# Validate
# Abort with the collected validator errors when the merged VCF is not valid.
boolvalid,lst_errors = validateVCF(path_vcfvalidator,pathMergeVCF)
if boolvalid==False: exit("🅴 🆁 🆁 🅾 🆁\n[Nk_mergeVCF] Validate VCF `"+os.path.basename(pathMergeVCF)+"`\n    "+"\n    ".join(lst_errors))
# bgzip
# Only the command string is built here; where it is executed is not visible
# in this part of the file.
cmd_bgzip = "bgzip -f "+pathMergeVCF
コード例 #8
0
ファイル: alignment_to_vcf.py プロジェクト: cgroza/BarcodeAsm
def extract_vcf_records(
        sample_name,
        # input paths
        alignments_path,
        contigs_path,
        ref_fasta_path,
        vcf_template_path,
        # output paths
        vcf_out_path,
        selected_contigs_path,
        flanked_contigs_path,
        flank_length,
        min_insert_size):
    """Extract large insertions from contig alignments and write them as VCF.

    Every alignment row whose ``Hit`` is not greater than 0 is walked along
    its CIGAR string. Each insertion longer than ``min_insert_size`` becomes
    one VCF record; the contig that carries it is written to
    ``selected_contigs_path`` (once per contig) and the inserted sequence,
    optionally flanked by reference sequence, to ``flanked_contigs_path``.

    :param sample_name: sample name used in the VCF header and the calls.
    :param alignments_path: space-separated table with the columns Hit, QName,
        TStart, QStart, Strand and CIGAR.
    :param contigs_path: FASTA with the contig sequences, keyed by QName.
    :param ref_fasta_path: FASTA of the reference genome.
    :param vcf_template_path: VCF whose header is reused for the output.
    :param vcf_out_path: path of the VCF to write.
    :param selected_contigs_path: FASTA output of contigs with an insertion.
    :param flanked_contigs_path: FASTA output of the flanked insertions.
    :param flank_length: number of reference bases added on each side
        (no flanks when not greater than 0).
    :param min_insert_size: insertions must be longer than this to be reported.
    :return: number of VCF records written.
    """
    # Function-scope import: only this function needs these helpers.
    from contextlib import ExitStack, closing

    n_records = 0
    contig_loci = set()
    # phase (haplotype) -> genotype string; any other phase is unphased.
    phased_genotypes = {"1": "1|0", "2": "0|1"}

    # The ExitStack closes every handle (including the VCF writer, which the
    # original never closed) even when processing fails half-way through.
    with ExitStack() as stack:
        ref_fasta = stack.enter_context(
            closing(pysam.FastaFile(ref_fasta_path)))
        contig_fasta = stack.enter_context(
            closing(pysam.FastaFile(contigs_path)))

        selected_contig_fasta = stack.enter_context(
            open(selected_contigs_path, "w"))
        flanked_contig_fasta = stack.enter_context(
            open(flanked_contigs_path, "w"))

        alns = pandas.read_csv(alignments_path, sep=" ")

        reader = stack.enter_context(
            closing(vcfpy.Reader.from_path(vcf_template_path)))
        reader.header.samples = vcfpy.SamplesInfos([sample_name])

        writer = stack.enter_context(
            closing(vcfpy.Writer.from_path(vcf_out_path, reader.header)))

        # parse each alignment and look for insertions above min_insert_size
        for _, row in alns.iterrows():
            # skip secondary alignments
            if row["Hit"] > 0:
                continue

            query_name = row["QName"]

            # local alignment window in the reference; the query name must
            # consist of exactly six "_"-separated fields
            (ref_chrom, ref_start, ref_end,
             phase_set, phase, _) = query_name.split("_")

            # drop the two leading characters of both fields
            # (presumably a "PS"/"HP"-style prefix -- TODO confirm)
            phase_set = phase_set[2:]
            phase = phase[2:]

            # convert to ints
            ref_start, ref_end = int(ref_start), int(ref_end)

            # alignment start for reference and query sequence
            target_start = row["TStart"]
            query_start = row["QStart"]

            # strand-ness of the query sequence
            strand = row["Strand"]

            # parse cigar for variant extraction
            cig = cigar.Cigar(row["CIGAR"])

            # convert sequences to the positive strand
            query_seq = contig_fasta.fetch(query_name)
            if strand == "-":
                query_seq = str(Bio.Seq.Seq(query_seq).reverse_complement())

            ref_seq = ref_fasta.fetch(ref_chrom, ref_start, ref_end)

            # initialize iterators for the cigar string
            query_pos = query_start
            target_pos = target_start

            for length, op_code in cig.items():
                # skip matches
                if op_code == 'M':
                    query_pos += length
                    target_pos += length

                # skip deletions in the query sequence
                elif op_code == 'D':
                    target_pos += length

                # insertions in the query sequence
                elif op_code == 'I':
                    # only interested in large insertions
                    if length > min_insert_size:
                        # need to check conversion from 0-based coordinates
                        # to 1-based
                        ref_allele = ref_seq[target_pos]
                        inserted_seq = query_seq[query_pos:query_pos + length]
                        alt_allele = ref_allele + inserted_seq

                        gt = phased_genotypes.get(phase, "0/1")

                        break_point = ref_start + target_pos
                        # output VCF record corresponding to the insertion
                        rec = vcfpy.Record(
                            CHROM=ref_chrom,
                            POS=break_point + 1,
                            ID=[query_name],
                            REF=ref_allele,
                            ALT=[vcfpy.Substitution("INS", alt_allele)],
                            QUAL=999,
                            FILTER=["PASS"],
                            INFO={},
                            FORMAT=[
                                "GT", "SVLEN", "PS", "HP", "CIGAR", "STRAND",
                                "CONTIG_START"
                            ],
                            calls=[
                                vcfpy.Call(sample=sample_name,
                                           data=vcfpy.OrderedDict(
                                               GT=gt,
                                               SVLEN=length,
                                               PS=phase_set,
                                               HP=phase,
                                               CIGAR=str(cig),
                                               STRAND=strand,
                                               CONTIG_START=str(query_start)))
                            ])

                        n_records += 1
                        writer.write_record(rec)

                        # FASTA header: contig, sample, hash of the locus and
                        # inserted sequence, and the insertion length
                        contig_locus = ">" + query_name + "_" + sample_name
                        contig_hash = sha1("_{chrom}_{pos}_{alt}".format(
                            chrom=ref_chrom, pos=ref_start,
                            alt=inserted_seq).encode()).hexdigest()

                        contig_name = (contig_locus + "_" + contig_hash +
                                       "_" + str(length))

                        # output contig that contains this insertion
                        # (only once per contig)
                        if contig_locus not in contig_loci:
                            selected_contig_fasta.writelines(
                                [contig_name + "\n", query_seq + "\n"])
                            contig_loci.add(contig_locus)

                        # output same insertion, but with flanking sequences
                        # note, the interval is [start, end[
                        if flank_length > 0:
                            left_flank = ref_fasta.fetch(
                                ref_chrom, break_point - flank_length,
                                break_point)
                            right_flank = ref_fasta.fetch(
                                ref_chrom, break_point,
                                break_point + flank_length)
                        else:
                            left_flank = ""
                            right_flank = ""
                        flanked_contig_fasta.writelines([
                            contig_name + "\n",
                            left_flank + inserted_seq + right_flank + "\n"
                        ])

                    query_pos += length
    return n_records
コード例 #9
0
    def write_haplotype_to_vcf(self, fake_genome_mapping_filename,
                               isoform_tally, output_prefix):
        """
        Write the haplotypes to ``<output_prefix>.vcf`` and a tab-separated
        summary to ``<output_prefix>.human_readable.txt``.

        The following functions must first be called first:
        -- self.get_haplotype_vcf_assignment

        :param fake_genome_mapping_filename: file with one
            ``fake_pos,ref_chr,ref_pos`` line per position
        :param isoform_tally: dict of isoform name --> dict of
            haplotype index --> count
        :param output_prefix: prefix of the two output files
        :raises Exception: if the haplotype/VCF assignment has not been
            computed yet
        """
        if self.haplotype_vcf_index is None or self.alt_at_pos is None:
            raise Exception(
                "Must call self.get_haplotype_vcf_assignment() first!")

        self.sanity_check()

        name_isoforms = sorted(isoform_tally)

        # write a fake VCF example so we can read the headers in
        with open("template.vcf", "w") as f:
            f.write(__VCF_EXAMPLE__)

        # The template handle used to be opened inline and never closed; the
        # ``with`` block releases it once the VCF has been written.
        # NOTE(review): ``reader.samples``, ``Writer(path, reader)`` and the
        # ``Record``/``Call``/``Substitution`` signatures below look like the
        # PyVCF API rather than vcfpy's -- confirm against the vcfpy version.
        with open("template.vcf") as template_handle:
            reader = vcfpy.Reader(template_handle)
            reader.samples = name_isoforms
            f_vcf = vcfpy.Writer(f"{output_prefix}.vcf", reader)
            # try/finally so the writer is closed even if a record fails.
            try:
                # human readable text:
                # first line: assoc VCF filename
                # second line: haplotype, list of sorted isoforms
                # third line onwards: haplotype and assoc count
                with open(f"{output_prefix}.human_readable.txt",
                          "w") as f_human:
                    f_human.write(
                        f"Associated VCF file: {output_prefix}.vcf\n")
                    f_human.write("haplotype\t{samples}\n".format(
                        samples="\t".join(name_isoforms)))
                    for hap_index, hap_str in enumerate(self.haplotypes):
                        f_human.write(hap_str)
                        for _iso in name_isoforms:
                            if hap_index in isoform_tally[_iso]:
                                f_human.write(
                                    f"\t{isoform_tally[_iso][hap_index]}")
                            else:
                                f_human.write("\t0")
                        f_human.write("\n")

                # read fake genome mapping file
                # 0-based position on fake --> (ref chr, 0-based ref position)
                fake_map = {}
                with open(fake_genome_mapping_filename) as f:
                    for line in f:
                        fake_pos, ref_chr, ref_pos = line.strip().split(",")
                        fake_map[int(fake_pos)] = (ref_chr, int(ref_pos))

                # for each position, write out the ref and alt bases
                # then fill in for each isoform (aka "sample"):
                #  if this isoform only shows one allele, then it's just that
                #  allele (0 for ref, 1+ otherwise)
                #  if this isoform shows 2+ allele, then the first allele is
                #  indicated by self.haplotypes[0]
                for pos in self.hap_var_positions:
                    ref_chr, ref_pos = fake_map[pos]
                    total_count = sum(self.count_of_vars_by_pos[pos].values())
                    alt_freq = [
                        f"{self.count_of_vars_by_pos[pos][b] * 1.0 / total_count:.2f}"
                        for b in self.alt_at_pos[pos]
                    ]
                    rec = vcfpy.Record(
                        CHROM=ref_chr,
                        POS=ref_pos + 1,  # VCF positions are 1-based
                        ID=".",
                        REF=self.ref_at_pos[pos],
                        ALT=[
                            vcfpy.Substitution(b)
                            for b in self.alt_at_pos[pos]
                        ],
                        QUAL=".",
                        FILTER="PASS",
                        INFO={
                            "AF": alt_freq,
                            "DP": total_count
                        },
                        FORMAT="GT:HQ",
                        sample_indexes=None,
                    )

                    rec.samples = []
                    for _iso in name_isoforms:
                        # isoform_tally[_iso] is a dict of
                        # haplotype index --> count; the phases are always
                        # shown in sorted haplotype index order
                        hap_indices = sorted(isoform_tally[_iso])
                        genotype = "|".join(
                            str(self.haplotype_vcf_index[hap_index][pos])
                            for hap_index in hap_indices)
                        counts = ",".join(
                            str(isoform_tally[_iso][hap_index])
                            for hap_index in hap_indices)
                        rec.samples.append(
                            vcfpy.Call(
                                rec, _iso,
                                vcfpy.OrderedDict([("GT", genotype),
                                                   ("HQ", counts)])))
                    f_vcf.write_record(rec)
            finally:
                f_vcf.close()
コード例 #10
0
def main():
    """Write a one-record example VCF whose header is derived from an input VCF.

    Command line: ``input.vcf output.vcf``. The output header holds the
    fileformat, source and fileDate lines, FORMAT lines for GT and DP, and one
    INFO line for every FORMAT line found in the input file. A single
    hard-coded record for the sample ``Sample1`` is then written.
    """
    parser = argparse.ArgumentParser(description="vcf writer")
    parser.add_argument("input",
                        metavar='input.vcf',
                        action='store',
                        help='vcf file.',
                        type=str)
    parser.add_argument("output",
                        metavar='output.vcf',
                        action='store',
                        help='vcf file.',
                        type=str)

    args = parser.parse_args()

    outvcf = args.output
    invcf = args.input

    #########################
    #                       #
    #  creating the header  #
    #                       #
    #########################

    # The header can contain some fixed type lines (INFO, FORMAT, FILTER, etc.)
    # and some general ones. Here it stores the name of the program which
    # generated the file and the name of the sample which has been analyzed.
    header = vcfpy.Header(
        lines=[
            # The VCF specification requires ##fileformat to be the very
            # first line of the file, so it must precede ##source.
            vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
            vcfpy.HeaderLine(key="source", value=sys.argv[0]),
            vcfpy.HeaderLine(key="fileDate",
                             value=date.today().strftime("%d/%m/%Y")),
        ],
        samples=vcfpy.SamplesInfos(["Sample1"]),
    )

    # adding format lines
    header.add_format_line(
        OrderedDict([("ID", "GT"), ("Number", "1"), ("Type", "String"),
                     ("Description", "Genotype")]))
    header.add_format_line(
        OrderedDict([("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Filtered read depth (MAPQ > 30)")]))

    # read the input vcf
    with vcfpy.Reader.from_path(invcf) as reader:

        # get the FORMAT header lines of the input file
        # and convert them in INFO header lines of the output file
        for format_id in reader.header.format_ids():
            format_line = reader.header.get_format_field_info(format_id)
            # Example of a returned line:
            #   FormatHeaderLine('FORMAT', '<ID=AD,Number=R,Type=Integer,
            #       Description="Allelic depths for the ref and alt alleles
            #       in the order listed">', {'ID': 'AD', 'Number': 'R', ...})
            #   key   = 'FORMAT'
            #   value = '<ID=AD,Number=R,Type=Integer,Description="...">'
            header.add_info_line(str_to_mapping(format_line.value))

    # write the vcf
    with vcfpy.Writer.from_path(outvcf, header) as writer:

        # creating one record
        record = vcfpy.Record(CHROM="1",
                              POS=1,
                              ID=[],
                              REF="C",
                              ALT=[vcfpy.Substitution(type_="SNV", value="G")],
                              QUAL=None,
                              FILTER=[],
                              INFO={},
                              FORMAT=["GT", "DP"],
                              calls=[
                                  vcfpy.Call(
                                      "Sample1",
                                      OrderedDict([("GT", "0/1"),
                                                   ("DP", "47")]))
                              ])
        writer.write_record(record)
コード例 #11
0
def main():
    """Count, cell by cell, the reads supporting each bulk SNV in a BAM file.

    Command-line arguments: the BAM file, a file listing one cell barcode
    per line, the bulk VCF, a sample identifier, an output prefix and an
    optional ``--gt`` genotype filter.  The result is written to
    ``<out_prefix>.snpseeker.vcf`` with one sample column per barcode.

    For every SNV of the input VCF the reads piled up at that position are
    assigned to cells through their ``CB`` tag, and per-cell depth (DP),
    reference depth (RD), alternate depth (AD) and allele frequency (AF) are
    reported.  The FORMAT values of the first call of the input record are
    copied into the INFO column of the output record.
    """
    parser = argparse.ArgumentParser(
        description="Looks for a given set of SNPs whithin a bam file.")

    parser.add_argument("bam",
                        metavar='sample.bam',
                        action='store',
                        help='BAM file.',
                        type=str)

    parser.add_argument(
        "barcodes",
        metavar='barcodes.list',
        action='store',
        help=
        "File containing cell barcodes (the same used in the alignment file to identify cell reads).",
        type=str)

    parser.add_argument("vcf",
                        metavar='variants.vcf',
                        action='store',
                        help="VCF file storing BULK SNPs.",
                        type=str)

    parser.add_argument("sample_name",
                        metavar='sample1',
                        action='store',
                        help="Sample identifier.",
                        type=str)

    parser.add_argument("out_prefix",
                        metavar="outdir/sample",
                        action="store",
                        help="Output VCF file prefix.",
                        type=str)

    parser.add_argument(
        "--gt",
        metavar='1/1 (0/1)',
        choices=["0/0", "0/1", "1/1"],
        action='store',
        help=
        "Genotype filter: considers only mutations with the specified GT in the original vcf file.",
        type=str)

    args = parser.parse_args()
    bam = args.bam
    barcodes = args.barcodes
    invcf = args.vcf
    sample = args.sample_name
    outvcf = args.out_prefix + ".snpseeker.vcf"

    # The genotype filter is active only when --gt was given.
    gt = args.gt
    gt_filter = bool(gt)

    with open(barcodes, "r") as f:
        samples = f.read().splitlines()
    # Set copy of the barcodes: membership is tested once per read.
    sample_set = set(samples)

    # Build the header of the output vcf: one sample column per barcode.
    header_out = vcfpy.Header(lines=[
        vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
        vcfpy.HeaderLine(key="source", value=sys.argv[0]),
        vcfpy.HeaderLine(key="fileDate",
                         value=date.today().strftime("%d/%m/%Y"))
    ],
                              samples=vcfpy.SamplesInfos(samples))

    # sample header line
    header_out.add_line(
        vcfpy.SampleHeaderLine.from_mapping(
            OrderedDict([("ID", sample), ("Description", "Sample name")])))

    # filter header lines: one per genotype accepted by --gt
    for genotype in ("1/1", "0/1", "0/0"):
        header_out.add_filter_line(
            OrderedDict([("ID", genotype), ("Number", "1"),
                         ("Description", "Filtered on such GT")]))

    # format header lines
    header_out.add_format_line(
        OrderedDict([("ID", "GT"), ("Number", "1"), ("Type", "String"),
                     ("Description", "Genotype (0/1, 0/0)")]))
    header_out.add_format_line(
        OrderedDict([
            ("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
            ("Description",
             "Filtered read depth (reads with MAPQ < 30, indels and gaps are filtered)"
             )
        ]))
    header_out.add_format_line(
        OrderedDict([("ID", "RD"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Reference allele read depth")]))
    header_out.add_format_line(
        OrderedDict([("ID", "AD"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Alternate allele read depth")]))
    header_out.add_format_line(
        OrderedDict([
            ("ID", "AF"), ("Number", "1"), ("Type", "Float"),
            ("Description",
             "Allele frequency: AD/(RD+AD). Other alleles, in case of mutli-allelic regions, are ignored."
             )
        ]))

    # info header lines
    header_out.add_info_line(
        OrderedDict([("ID", "SUPP"), ("Number", "1"), ("Type", "Integer"),
                     ("Description",
                      "Number of cells supporting the mutation.")]))

    # The BAM file is opened exactly once, and every handle is released
    # even if processing fails half-way.
    with pysam.AlignmentFile(bam, "rb") as samfile, \
            vcfpy.Reader.from_path(invcf) as reader:

        # Re-publish every FORMAT header line of the input as an INFO header
        # line of the output.  `format_line.value` is the raw text between
        # the angle brackets, e.g.
        #   <ID=AD,Number=R,Type=Integer,Description="Allelic depths ...">
        for format_id in reader.header.format_ids():
            format_line = reader.header.get_format_field_info(format_id)
            mapping = str_to_mapping(format_line.value)
            mapping["Description"] = "(Info about bulk mutation)" + mapping[
                "Description"]
            # Register the mapping carrying the prefixed description.
            header_out.add_info_line(mapping)

        with vcfpy.Writer.from_path(outvcf, header_out) as writer:
            for record_in in reader:
                # Optional genotype filter on the first call of the record
                # (the input vcf is expected to hold a single sample).
                if gt_filter and record_in.calls[0].data.get('GT') != gt:
                    continue
                # Only SNVs are analysed in this phase.
                if not record_in.is_snv():
                    continue

                # Fresh per-cell counters for this record.
                # NOTE(review): relies on samples_dict providing the keys
                # 'gt', 'dp', 'rd', 'ad' and 'af' for every barcode - confirm.
                d = samples_dict(samples)
                chrom = record_in.CHROM
                pos = record_in.POS - 1  # pysam positions are 0-based
                ref = record_in.REF
                # For an SNV, ALT holds a single substitution.
                alt = record_in.ALT[0].value

                # Pile up the reads covering (chrom, pos).
                for pileupcolumn in samfile.pileup(chrom,
                                                   pos,
                                                   pos + 1,
                                                   stepper='all',
                                                   truncate=True,
                                                   max_depth=10000):
                    for base in pileupcolumn.pileups:
                        # Skip deletions, reference skips (N in the CIGAR)
                        # and low mapping-quality reads.
                        if (base.is_del or base.is_refskip
                                or base.alignment.mapping_quality < 30):
                            continue
                        tags = list_to_dict(base.alignment.tags)
                        # Reads with no error-corrected barcode are discarded.
                        if "CB" not in tags:
                            continue
                        cb = tags["CB"].split("-")[0]  # 10x barcode
                        # Barcodes not listed as cells are discarded.
                        if cb not in sample_set:
                            continue
                        counts = d[cb]
                        counts['dp'] += 1
                        observed = base.alignment.query_sequence[
                            base.query_position]
                        if observed == alt:
                            counts['ad'] += 1
                        elif observed == ref:
                            counts['rd'] += 1

                # A cell supports the mutation when it has at least one
                # alternate read; such cells are provisionally set to 0/1.
                supp = 0
                for counts in d.values():
                    if counts['ad'] > 0:
                        supp += 1
                        counts['gt'] = "0/1"
                        counts['af'] = counts['ad'] / (counts['rd'] +
                                                       counts['ad'])

                # One call per cell.
                calls = [
                    vcfpy.Call(
                        cb,
                        OrderedDict([("GT", counts['gt']),
                                     ("DP", counts['dp']),
                                     ("RD", counts['rd']),
                                     ("AD", counts['ad']),
                                     ("AF", counts['af'])]))
                    for cb, counts in d.items()
                ]

                # Copy the FORMAT values of the first input call into the
                # INFO column of the output record.
                info_d = {'SUPP': supp}
                for f in record_in.FORMAT:
                    info_d[f] = record_in.calls[0].data.get(f)

                filter_l = [gt] if gt_filter else []

                record_out = vcfpy.Record(
                    CHROM=chrom,
                    POS=pos + 1,
                    ID=[],
                    REF=ref,
                    ALT=[vcfpy.Substitution(type_="SNV", value=alt)],
                    QUAL=None,
                    FILTER=filter_l,
                    INFO=info_d,
                    FORMAT=["GT", "DP", "RD", "AD", "AF"],
                    calls=calls)
                writer.write_record(record_out)
コード例 #12
0
ファイル: export-vcf.py プロジェクト: weigangq/cov-db
            if site in genoSample[acc]:  # is mutated
                #               print(genoSample[acc][site])
                if genoSample[acc][site] in geno:  # alt is valid
                    allele = geno[genoSample[acc][site]]
#                    logging.info("alt assigned for %s at %s: %s", acc, site, allele)
                else:  # alt is singleton/discarded
                    logging.warning(
                        "alt is singleton for %s at %s: assign ref allele",
                        acc, site)
#            else:
#                logging.info("ref alleles assigned for %s at %s", acc, site)
            gt = str(allele) + "|" + str(allele) if args.diploid else str(
                allele)
            sampleCall = vcfpy.Call(
                sample=acc,
                data={'GT': gt},  # has to be string; diploid
                #                data = {'GT': str(allele) }, # has to be string
                site=site)
            genoCalls.append(sampleCall)

        record = vcfpy.Record(
            CHROM=refEPI,
            POS=site,
            ID=snpInfo[site]['varID'],
            REF=snpInfo[site]['refNT'],
            ALT=subs,
            QUAL=None,
            FILTER=[],  # PASS
            INFO={},  # consequence calls, locus, etc; a dict
            FORMAT=['GT'],  # a list
            calls=genoCalls)
コード例 #13
0
def write_snp_to_vcf(
    snp_filename: Path,
    vcf_filename: Path,
    genome_filename: Path,
    genome_d: LazyFastaReader = None,
) -> None:
    """Convert the records of an SNP file into VCF records.

    Consecutive SNP records that share a reference position (insertions) or
    that are consecutive deletions are merged into a single VCF record.

    Args:
        snp_filename: SNP file read through ``SNPReader``.
        vcf_filename: destination handed to the VCF writer.
        genome_filename: reference genome, loaded only when ``genome_d`` is
            not supplied.
        genome_d: optional pre-loaded mapping from reference name to
            sequence.

    NOTE(review): in the code visible here the group still held in
    ``cur_recs`` when the input is exhausted is never written - confirm
    whether that is handled elsewhere.
    """
    # read the genome if genome_d is not given
    if genome_d is None:
        genome_d = LazyFastaReader(genome_filename)

    # read the first SNP record so we know the query name
    snp_reader = SNPReader(snp_filename)
    snp_rec = next(snp_reader)
    sample_name = snp_rec.query_name
    cur_recs = [snp_rec]
    genome_rec = genome_d[snp_rec.ref_name]

    with open("template.vcf", "w+") as f:
        f.write(f"{__VCF_EXAMPLE__}\n")
        reader = vcfpy.Reader(f)
        reader.samples = [sample_name]
        f_vcf = vcfpy.Writer(vcf_filename, reader)

        for r1 in snp_reader:
            if r1.ref_pos == cur_recs[
                    -1].ref_pos:  # multi-nt insertion, keep recording
                cur_recs.append(r1)
            elif (r1.query_base == "." and cur_recs[-1].query_base
                  == "."):  # multi-nt deletion, keep recording
                cur_recs.append(r1)
            else:  # time to write out the current set of records
                # multiple records mean it could be:
                # 1. multi-nucleotide insertions
                # 2. multi-nucleotide deletions

                if (len(cur_recs) == 1 and cur_recs[0].ref_base != "." and
                        cur_recs[0].query_base != "."):  # just a SNP record
                    pos = cur_recs[0].ref_pos
                    ref_base = cur_recs[0].ref_base
                    alt_base = cur_recs[0].query_base
                elif cur_recs[0].ref_base == ".":
                    # single or multi-nt insertion: the ref base must be
                    # retrieved from the genome
                    # ex: in out.snps_files it is . --> ATG
                    # in VCF it should be T --> TATG (meaning insertion of ATG)
                    pos = cur_recs[0].ref_pos
                    ref_base = genome_rec[cur_recs[0].ref_pos]
                    alt_base = ref_base + "".join(r.query_base
                                                  for r in cur_recs)
                else:
                    # single or multi-nt deletion: one more ref base before
                    # the first deleted base is needed
                    # ex: in out.snps_files it is GGG --> deletion
                    # in VCF it should be TGGG --> T (meaning deletion of GGG)
                    pos = cur_recs[0].ref_pos - 1
                    ref_base_prev = genome_rec[pos]
                    ref_base = ref_base_prev + "".join(r.ref_base
                                                       for r in cur_recs)
                    alt_base = ref_base_prev

                rec = vcfpy.Record(
                    # Use the reference of the group being written, not that
                    # of the very first record of the file: the input may
                    # span several references (genome_rec is switched below
                    # for exactly that reason).
                    CHROM=cur_recs[0].ref_name,
                    POS=pos + 1,
                    ID=".",
                    REF=ref_base,
                    ALT=[vcfpy.Substitution(alt_base)],
                    QUAL=".",
                    FILTER="PASS",
                    INFO={"AF": 0.5},
                    FORMAT="GT",
                    sample_indexes=None,
                )

                rec.samples.append(
                    vcfpy.Call(rec, sample_name,
                               vcfpy.OrderedDict([("GT", "0|1")])))
                f_vcf.write_record(rec)
                # Moving on to another reference: switch the sequence used
                # for ref-base lookups.
                if r1.ref_name != cur_recs[0].ref_name:
                    genome_rec = genome_d[r1.ref_name]
                cur_recs = [r1]
コード例 #14
0
def main():
    """Write a small two-sample example VCF to the path given on the CLI.

    The script builds a header by hand (generic lines plus FILTER, INFO,
    FORMAT, contig and SAMPLE lines) and writes a single record with one
    call per sample.
    """
    parser = argparse.ArgumentParser(description="vcf writer")
    parser.add_argument("output",
                        metavar='output.vcf',
                        action='store',
                        help='vcf file.',
                        type=str)

    args = parser.parse_args()

    outvcf = args.output

    #########################
    #                       #
    #  creating the header  #
    #                       #
    #########################

    # The header can contain some fixed type lines (INFO, FORMAT, FILTER,
    # etc.) and some general ones.  Here it stores the name of the program
    # which generated the file and the names of the analysed samples.
    # The VCF specification requires `fileformat` to be the first line of
    # the file, so it is listed first.
    header = vcfpy.Header(lines=[
        vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
        vcfpy.HeaderLine(key="source", value=sys.argv[0]),
        vcfpy.HeaderLine(key="fileDate",
                         value=date.today().strftime("%d/%m/%Y"))
    ],
                          samples=vcfpy.SamplesInfos(["Sample1", "Sample2"]))

    # Reference notes on valid header entries -------------------------------
    #
    # valid INFO value types:
    #   "Integer", "Float", "Flag", "Character", "String"
    # valid FORMAT value types:
    #   "Integer", "Float", "Character", "String"
    # valid values for "Number" entries, except for integers:
    #   "A" number of alleles excluding reference
    #   "R" number of alleles including reference
    #   "G" number of genotypes
    #   "." unbounded number of values
    # header lines that contain an "ID" entry:
    #   "ALT", "contig", "FILTER", "FORMAT", "INFO", "META", "PEDIGREE",
    #   "SAMPLE"

    # adding filter lines
    header.add_filter_line(
        OrderedDict([("ID", "PASS"), ("Description", "All filters passed")]))

    # adding info lines
    header.add_info_line(
        OrderedDict([("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
                     ("Description",
                      "Raw read depth (without mapping quality filters)")]))
    header.add_info_line(
        OrderedDict([
            ("ID", "MUT"), ("Number", "1"), ("Type", "Integer"),
            ("Description",
             "States if the record mutation is supported (1) or not (0).")
        ]))

    # adding format lines
    header.add_format_line(
        OrderedDict([("ID", "GT"), ("Number", "1"), ("Type", "String"),
                     ("Description", "Genotype")]))
    header.add_format_line(
        OrderedDict([("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Filtered read depth (MAPQ > 30)")]))

    # adding contig lines
    header.add_contig_line(
        OrderedDict([("ID", "chr1"), ("length", "248956422")]))

    # adding sample lines
    header.add_line(
        vcfpy.SampleHeaderLine.from_mapping(
            OrderedDict([("ID", "Sample1"), ("Description", "Tumor")])))

    # writing the vcf
    with vcfpy.Writer.from_path(outvcf, header) as writer:

        # one call per sample declared in the header
        calls = [
            vcfpy.Call("Sample1", OrderedDict([("GT", "0/1"),
                                               ("DP", "47")])),
            vcfpy.Call("Sample2", OrderedDict([("GT", "0/1"),
                                               ("DP", "31")])),
        ]

        # creating one record
        # NOTE(review): CHROM is "1" while the only declared contig is
        # "chr1" - confirm which naming is intended.
        record = vcfpy.Record(CHROM="1",
                              POS=1,
                              ID=[],
                              REF="C",
                              ALT=[vcfpy.Substitution(type_="SNV", value="G")],
                              QUAL=None,
                              FILTER=["PASS"],
                              INFO={
                                  "DP": "50",
                                  "MUT": 0
                              },
                              FORMAT=["GT", "DP"],
                              calls=calls)
        writer.write_record(record)
コード例 #15
0
def main():
    """Count the reads supporting each SNV of a VCF in a BAM file.

    Command-line arguments: the BAM file, the VCF with the SNVs, a sample
    identifier and an output prefix.  The result is written to
    ``<out_prefix>.snpseeker.vcf`` with a single sample column.

    For every SNV the pileup at its position yields the samtools depth
    (SDP), the filtered depth (DP), the reference and alternate depths
    (RD, AD) and the allele frequency (AF).  The FORMAT values of the first
    call of the input record are copied into the INFO column of the output.
    """
    parser = argparse.ArgumentParser(description="Looks for a given set of SNPs whithin a bam file.")

    parser.add_argument("bam", metavar='sample.bam', action='store',
        help='BAM file.', type=str)

    parser.add_argument("vcf", metavar='file.vcf', action='store',
        help="VCF file storing SNPs.", type=str)

    parser.add_argument("sample_name", metavar='sample1', action='store',
                help="Sample identifier.", type=str)

    parser.add_argument("out_prefix", metavar="outdir/sample", action="store",
        help="Output VCF file prefix.", type=str)

    args = parser.parse_args()
    bam = args.bam
    invcf = args.vcf
    sample = args.sample_name
    outvcf = args.out_prefix + ".snpseeker.vcf"

    # build the header of the output vcf
    header_out = vcfpy.Header(lines=[vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"), vcfpy.HeaderLine(key="source", value=sys.argv[0]), vcfpy.HeaderLine(key="fileDate", value=date.today().strftime("%d/%m/%Y")) ], samples=vcfpy.SamplesInfos([sample]))

    # sample header line
    header_out.add_line(vcfpy.HeaderLine(key="SampleName", value=sample))

    # info header lines
    header_out.add_info_line(OrderedDict([("ID", "SUPP"), ("Number", "1"), ("Type","Integer"), ("Description", "States if the mutation is supported (1) or not (0).")]))

    # format header lines
    header_out.add_format_line(OrderedDict([("ID", "GT"),("Number", "1"), ("Type","String"), ("Description", "Genotype (0/1, 0/0)")]))
    header_out.add_format_line(OrderedDict([("ID", "SDP"),("Number", "1"), ("Type","Integer"), ("Description", "Samtools read depth (secondary alignments, PCR duplicates, unppammed reads and reads not passing vendor QC are filtered)")]))
    header_out.add_format_line(OrderedDict([("ID", "DP"),("Number", "1"), ("Type","Integer"), ("Description", "Filtered read depth (reads with MAPQ < 30, indels and gaps are filtered)")]))
    header_out.add_format_line(OrderedDict([("ID", "RD"),("Number", "1"), ("Type","Integer"), ("Description", "Reference allele read depth")]))
    header_out.add_format_line(OrderedDict([("ID", "AD"),("Number", "1"), ("Type","Integer"), ("Description", "Alternate allele read depth")]))
    header_out.add_format_line(OrderedDict([("ID", "AF"),("Number", "1"), ("Type","Float"), ("Description", "Allele frequency: AD/(RD+AD). Other alleles, in case of mutli-allelic regions, are ignored.")]))

    # The BAM file is opened exactly once, and every handle is released
    # even if processing fails half-way.
    with pysam.AlignmentFile(bam, "rb") as samfile, \
            vcfpy.Reader.from_path(invcf) as reader:

        # Re-publish every FORMAT header line of the input as an INFO header
        # line of the output.  `format_line.value` is the raw text between
        # the angle brackets, e.g.
        #   <ID=AD,Number=R,Type=Integer,Description="Allelic depths ...">
        for format_id in reader.header.format_ids():
            format_line = reader.header.get_format_field_info(format_id)
            mapping = str_to_mapping(format_line.value)
            mapping["Description"] = "(Info about mutation in the original vcf)" + mapping["Description"]
            # Register the mapping carrying the prefixed description.
            header_out.add_info_line(mapping)

        with vcfpy.Writer.from_path(outvcf, header_out) as writer:
            for record_in in reader:
                # Only SNVs are analysed in this phase.
                if not record_in.is_snv():
                    continue
                chrom = record_in.CHROM
                pos = record_in.POS - 1  # pysam positions are 0-based
                ref = record_in.REF
                # For an SNV, ALT holds a single substitution.
                alt = record_in.ALT[0].value

                # Reset the counters for every record, so that a site with
                # no pileup column is reported with zero depth instead of
                # reusing the previous record's values (or failing on the
                # first record).
                sdp = 0
                ad = 0
                rd = 0
                dp = 0
                af = 0.0

                # Pile up the reads covering (chrom, pos).
                for pileupcolumn in samfile.pileup(chrom, pos, pos + 1, stepper='all', truncate=True, max_depth=10000):
                    # number of reads at this position
                    sdp = pileupcolumn.n
                    for base in pileupcolumn.pileups:
                        # Skip deletions, reference skips (N in the CIGAR)
                        # and low mapping-quality reads.
                        if (base.is_del or base.is_refskip
                                or base.alignment.mapping_quality < 30):
                            continue
                        dp += 1
                        observed = base.alignment.query_sequence[base.query_position]
                        if observed == alt:
                            ad += 1
                        elif observed == ref:
                            rd += 1

                if ad > 0:
                    af = ad / (rd + ad)
                    supp = 1
                    gt = "0/1"  # temporary: every supported mutation is set to 0/1
                else:
                    supp = 0
                    gt = "0/0"

                # Copy the FORMAT values of the first input call into the
                # INFO column of the output record.
                info_d = {'SUPP': supp}
                for f in record_in.FORMAT:
                    info_d[f] = record_in.calls[0].data.get(f)

                record_out = vcfpy.Record(
                    CHROM=chrom,
                    POS=pos + 1,
                    ID=[],
                    REF=ref,
                    ALT=[vcfpy.Substitution(type_="SNV", value=alt)],
                    QUAL=None,
                    FILTER=[],
                    INFO=info_d,
                    FORMAT=["GT", "SDP", "DP", "RD", "AD", "AF"],
                    calls=[vcfpy.Call(sample, OrderedDict([("GT", gt), ("SDP", sdp), ("DP", dp), ("RD", rd), ("AD", ad), ("AF", af)]))])
                writer.write_record(record_out)
コード例 #16
0
def main():
    """Aggregate a single-cell VCF into one sample column per cluster.

    Command-line arguments: the single-cell VCF, a CSV file with (at least)
    the columns ``cellid`` and ``cluster``, and an output prefix.  The result
    is written to ``<outprefix>_clusters.vcf``.

    For every input record the DP, RD and AD values of the cells of each
    cluster are summed; a cluster with at least one alternate read gets
    genotype 0/1 and AF = AD/(RD+AD), and the record is flagged SUPP=1.
    """
    parser = argparse.ArgumentParser(description="From single cell VCF to clones vcf.")
    parser.add_argument("input1", metavar="sample.muts.vcf", action="store", help="Single cell VCF file.", type=str)
    parser.add_argument("input2", metavar="clusters.list", action="store", help="Clusters list.", type=str)
    parser.add_argument("outprefix", metavar="out/path/prefix", action="store", help="Output prefix", type=str)

    args = parser.parse_args()

    input1 = args.input1
    input2 = args.input2
    prefix = args.outprefix

    clusters_df = pd.read_csv(input2)

    # Cluster ids and the cells of each cluster do not depend on the record,
    # so they are computed once instead of once per record.
    cluster_ids = clusters_df['cluster'].unique()
    clusters = [str(cluster) for cluster in cluster_ids]
    cells_by_cluster = {
        c: clusters_df['cellid'][clusters_df['cluster'] == c]
        for c in cluster_ids
    }

    # Create out header: one sample column per cluster
    header_out = vcfpy.Header(lines=[ vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"), vcfpy.HeaderLine(key="source", value=sys.argv[0]), vcfpy.HeaderLine(key="fileDate", value=date.today().strftime("%d/%m/%Y")) ], samples=vcfpy.SamplesInfos(clusters))

    # format header lines
    header_out.add_format_line(OrderedDict([("ID", "GT"),("Number", "1"), ("Type","String"), ("Description", "Genotype (0/1, 0/0)")]))
    header_out.add_format_line(OrderedDict([("ID", "DP"),("Number", "1"), ("Type","Integer"), ("Description", "Filtered read depth (reads with MAPQ < 30, indels and gaps are filtered)")]))
    header_out.add_format_line(OrderedDict([("ID", "RD"),("Number", "1"), ("Type","Integer"), ("Description", "Reference allele read depth")]))
    header_out.add_format_line(OrderedDict([("ID", "AD"),("Number", "1"), ("Type","Integer"), ("Description", "Alternate allele read depth")]))
    header_out.add_format_line(OrderedDict([("ID", "AF"),("Number", "1"), ("Type","Float"), ("Description", "Allele frequency: AD/(RD+AD). Other alleles, in case of mutli-allelic regions, are ignored.")]))

    # info header lines
    header_out.add_info_line(OrderedDict([("ID", "SUPP"), ("Number", "1"), ("Type","Integer"), ("Description", "Whether the mutation is supported or not.")]))

    # Both files are closed even if a record fails to process.
    with vcfpy.Reader.from_path(input1) as reader, \
            vcfpy.Writer.from_path(prefix + "_clusters.vcf", header_out) as writer:
        for record_in in reader:
            # Fresh per-cluster counters for this record.
            # NOTE(review): relies on samples_dict providing the keys 'gt',
            # 'dp', 'rd', 'ad' and 'af' for every cluster - confirm.
            d = samples_dict(cluster_ids)
            supp = 0
            alt = record_in.ALT[0].value

            # for each cluster compute the 'GT:DP:RD:AD:AF' values
            for c in cluster_ids:
                totals = d[c]
                # sum total, ref and alt read counts over the cluster's cells;
                # a missing value counts as zero reads
                for cell in cells_by_cluster[c]:
                    data = record_in.call_for_sample[cell].data
                    totals['dp'] += data.get('DP') or 0
                    totals['rd'] += data.get('RD') or 0
                    totals['ad'] += data.get('AD') or 0

                if totals['ad'] > 0:
                    totals['gt'] = "0/1"
                    totals['af'] = totals['ad'] / (totals['rd'] + totals['ad'])
                    supp = 1

            # create one call for each cluster
            calls = [
                vcfpy.Call(str(c), OrderedDict([("GT", totals['gt']), ("DP", totals['dp']), ("RD", totals['rd']), ("AD", totals['ad']), ("AF", totals['af'])]))
                for c, totals in d.items()
            ]

            # write new record
            record_out = vcfpy.Record(
                CHROM=record_in.CHROM,
                POS=record_in.POS,
                ID=[],
                REF=record_in.REF,
                ALT=[vcfpy.Substitution(type_="SNV", value=alt)],
                QUAL=None,
                FILTER=[],
                INFO={"SUPP": supp},
                FORMAT=["GT", "DP", "RD", "AD", "AF"],
                calls=calls)
            writer.write_record(record_out)
コード例 #17
0
def extract_consensus_insertions(contig_path, cons_path, ref_fasta_path, vcf_out_path, vcf_template_path, min_insertion_size, flank_length, flanked_contigs_path):
    """Write large insertions found in consensus contigs as VCF records.

    Every sequence in ``cons_path`` is named ``<chrom>_<start>_<end>``. The
    matching reference window is fetched from ``ref_fasta_path``, the
    consensus is aligned against it with mappy, and the alignment with the
    largest ``blen`` is scanned for CIGAR insertions. Each insertion strictly
    longer than ``min_insertion_size`` produces one VCF record and one entry
    in the flanked-contig FASTA.

    Args:
        contig_path: Passed to ``collect_genotypes`` to obtain the samples
            and the per-contig phased genotypes.
        cons_path: FASTA file with the consensus sequences.
        ref_fasta_path: Reference FASTA file.
        vcf_out_path: Path of the VCF file to write.
        vcf_template_path: VCF file whose header is reused for the output
            (its sample list is replaced by the collected samples).
        min_insertion_size: Insertions must be longer than this to be
            reported.
        flank_length: Number of reference bases fetched on each side of the
            break point for the flanked FASTA; values <= 0 disable flanks.
        flanked_contigs_path: Path of the FASTA file receiving the inserted
            sequences with their flanks.

    Returns:
        The number of VCF records written.

    Raises:
        ValueError: If a consensus name does not end in ``_<start>_<end>``
            with integer coordinates.
    """
    from contextlib import ExitStack

    n_records = 0

    # ExitStack guarantees that every input and output handle is closed
    # (and therefore flushed) even when an exception interrupts processing.
    with ExitStack() as stack:
        # open input sequences
        cons_fasta = stack.enter_context(pysam.FastaFile(cons_path))
        ref_fasta = stack.enter_context(pysam.FastaFile(ref_fasta_path))

        flanked_contig_fasta = stack.enter_context(open(flanked_contigs_path, "w"))

        (samples, loci) = collect_genotypes(contig_path)
        print("Found", len(samples), "samples for", len(loci), "phased loci")

        reader = stack.enter_context(vcfpy.Reader.from_path(vcf_template_path))
        reader.header.samples = vcfpy.SamplesInfos(list(samples))
        writer = stack.enter_context(vcfpy.Writer.from_path(vcf_out_path, reader.header))

        for contig in cons_fasta.references:
            # parse coordinates; split from the right so that chromosome
            # names containing underscores are kept intact
            (chrom, start, end) = contig.rsplit("_", 2)
            (start, end) = int(start), int(end)

            cons_seq = cons_fasta.fetch(contig)
            ref_seq = ref_fasta.fetch(chrom, start, end)

            aligner = mappy.Aligner(seq=ref_seq, preset=None, k=15, w=10, n_threads=1,
                                    max_join_long=20000, max_join_short=10000, min_join_flank_sc=10,
                                    min_join_flank_ratio=0.1, max_gap=10000, bw=2000, end_bonus=10,
                                    zdrop=10000, zdrop_inv=1000,
                                    scoring=(2, 4, 4, 10, 300, 0, 1),
                                    extra_flags=0x1)
            alignments = list(aligner.map(cons_seq, seq2=None, cs=True, MD=False))

            if not alignments:
                print("No hits in", contig)
                continue

            # keep only the alignment with the largest block length
            aln = max(alignments, key=lambda x: x.blen)

            cig = cigar.Cigar(aln.cigar_str)

            cons_pos = aln.q_st
            target_pos = aln.r_st

            strand = "+"
            if aln.strand == -1:
                # NOTE(review): cons_pos still starts at aln.q_st after the
                # sequence is reverse-complemented -- confirm the coordinates
                # are meant to index the reverse-complemented sequence.
                cons_seq = str(Bio.Seq.Seq(cons_seq).reverse_complement())
                strand = "-"

            for (op_len, op_type) in cig.items():
                # skip matches
                if op_type == 'M':
                    cons_pos += op_len
                    target_pos += op_len

                # skip deletions in the query sequence
                elif op_type == 'D':
                    target_pos += op_len

                # insertions in the query sequence
                elif op_type == 'I':
                    # only interested in large insertions
                    if op_len > min_insertion_size:
                        # need to check conversion from 0-based coordinates to 1-based
                        ref_allele = ref_seq[target_pos - 1]
                        alt_allele = cons_seq[cons_pos:cons_pos + op_len]

                        break_point = start + target_pos
                        record_id = contig + "_" + str(cons_pos)

                        # build calls data structure: samples without a
                        # phased genotype at this contig are homozygous
                        # reference with phase set 0
                        calls = []
                        for sample in samples:
                            if sample in loci[contig]:
                                phased = loci[contig][sample]
                                sample_gt = phased["1"] + "|" + phased["2"]
                                ps = phased["ps"]
                            else:
                                sample_gt = "0/0"
                                ps = 0
                            calls.append(vcfpy.Call(sample=sample,
                                                    data=vcfpy.OrderedDict(GT=sample_gt, PS=ps)))

                        rec = vcfpy.Record(CHROM=chrom, POS=break_point, ID=[record_id],
                                           REF=ref_allele,
                                           ALT=[vcfpy.Substitution("INS", ref_allele + alt_allele)],
                                           QUAL=999, FILTER=["PASS"],
                                           INFO=vcfpy.OrderedDict(SVLEN=op_len,
                                                                  CIGAR=[str(cig)],
                                                                  STRAND=strand,
                                                                  CONTIG_START=str(aln.q_st)),
                                           FORMAT=["GT", "PS"],
                                           calls=calls)

                        # output VCF record corresponding to the insertion
                        writer.write_record(rec)

                        # output same insertion, but with flanking sequences
                        # note, the interval is [start, end[
                        if flank_length > 0:
                            left_flank = ref_fasta.fetch(chrom, break_point - flank_length, break_point)
                            right_flank = ref_fasta.fetch(chrom, break_point, break_point + flank_length)
                        else:
                            left_flank = ""
                            right_flank = ""

                        # NOTE(review): alt_allele[1:] drops the first
                        # inserted base -- confirm this is intended.
                        flanked_contig_fasta.writelines([">" + record_id + "\n",
                                                         left_flank + alt_allele[1:] + right_flank + "\n"])

                        n_records += 1

                    cons_pos += op_len

    return n_records