Beispiel #1
0
    def makeHeader(self):
        """Assemble the pysam VariantHeader used for the VCF output.

        Declares the FILTER, INFO and FORMAT fields, adds one contig for
        each name/length pair of ``self.reference``, records the command
        line returned by ``self._getCommandLine()`` and registers every
        entry of ``self.samples`` as a sample column.

        Returns:
            pysam.VariantHeader: the populated header.
        """
        vcf_header = pysam.VariantHeader()

        # FILTER declarations carry neither a Number nor a Type.
        filter_defs = (
            ('PBAAFAIL', 'Consensus failed pbAA filters'),
            ('HP', 'Homopolymer length variant'),
        )
        for filter_id, description in filter_defs:
            vcf_header.filters.add(filter_id, None, None, description)

        info_defs = (
            ('NS', 1, 'Integer', 'Number of samples with data'),
            ('AF', 'A', 'Float', 'Allele frequency'),
        )
        for info_def in info_defs:
            vcf_header.info.add(*info_def)

        # Each tuple is (ID, Number, Type, Description); declaration order
        # is kept because it determines the order of the header lines.
        format_defs = (
            ('GT', 1, 'String', 'Genotype '),
            ('DP', 1, 'Integer', "Read depth"),
            ('FT', '.', 'String', 'pbAA filter'),
            ('AQ', '.', 'Float', "pbAA mean cluster quality"),
            ('AD', '.', 'Integer', "Reads supporting each alt call"),
            ('VAF', '.', 'Float', "pbAA cluster frequency"),
            ('TG', '.', 'String', 'pbAA guide'),
            ('HP', '.', 'Integer', "pbAA cluster identifier"),
            ('DV', '.', 'Float', "pbAA diversity score"),
            ('CH', '.', 'Float', "pbAA chimera score"),
        )
        for format_def in format_defs:
            vcf_header.formats.add(*format_def)

        contig_names = self.reference.references
        contig_sizes = self.reference.lengths
        for name, size in zip(contig_names, contig_sizes):
            vcf_header.contigs.add(name, length=size)

        vcf_header.add_meta('commandline', value=self._getCommandLine())

        for sample_name in self.samples:
            vcf_header.add_sample(sample_name)

        return vcf_header
Beispiel #2
0
def build_header(
    reader: VariantFileT,
    patient_barcode: str,
    case_id: str,
    tumor_barcode: str,
    tumor_aliquot_uuid: str,
    tumor_bam_uuid: str,
    normal_barcode: str,
    normal_aliquot_uuid: str,
    normal_bam_uuid: str,
    reference_name: str,
) -> VcfHeaderT:
    """
    Build the GDC-formatted header from the input VCF and user arguments.

    Header records of ``reader`` are carried over as text, except for the
    ``fileDate``, ``fileformat`` and ``reference`` records. New
    ``fileDate``, ``center``, ``reference``, ``INDIVIDUAL`` and
    NORMAL/TUMOR ``SAMPLE`` lines are appended after them, and the sample
    names of ``reader`` are copied onto the new header last.
    """
    # Records that are regenerated below (or intentionally dropped).
    replaced_keys = {"fileDate", "fileformat", "reference"}
    header_lines = [
        str(old_record)
        for old_record in reader.header.records
        if old_record.key not in replaced_keys
    ]

    # GDC specific metadata, in the order it appears in the output header.
    today_stamp = datetime.date.today().strftime("%Y%m%d")
    normal_line = "##SAMPLE=<ID=NORMAL,NAME={0},ALIQUOT_ID={1},BAM_ID={2}>".format(
        normal_barcode, normal_aliquot_uuid, normal_bam_uuid
    )
    tumor_line = "##SAMPLE=<ID=TUMOR,NAME={0},ALIQUOT_ID={1},BAM_ID={2}>".format(
        tumor_barcode, tumor_aliquot_uuid, tumor_bam_uuid
    )
    header_lines += [
        "##fileDate={0}".format(today_stamp),
        '##center="NCI Genomic Data Commons (GDC)"',
        "##reference={0}".format(reference_name),
        "##INDIVIDUAL=<NAME={0},ID={1}>".format(patient_barcode, case_id),
        normal_line,
        tumor_line,
    ]

    gdc_header = pysam.VariantHeader()
    for header_line in header_lines:
        gdc_header.add_line(header_line)

    for sample_name in reader.header.samples:
        gdc_header.add_sample(sample_name)

    return gdc_header
def get_header(old_header: VariantHeaderT) -> VariantHeaderT:
    """
    Creates a new header with the new elements that will be
    handled in this tool.

    Changes relative to ``old_header``:

    * a ``forcedHet`` INFO flag is declared, placed just before the first
      existing INFO record, or after all records when the old header has
      no INFO record at all;
    * the INFO record with ID ``SVTYPE`` is re-declared under the ID
      ``TYPEOFSV`` (its ``IDX`` item is dropped and double quotes are
      stripped from the remaining values);
    * a generic ``center`` record whose value is ``""`` is dropped.

    All other records and all samples are copied unchanged.
    """
    forced_het_items = [
        ("ID", "forcedHet"),
        ("Number", 0),
        ("Type", "Flag"),
        (
            "Description",
            "The original homozygous-reference call "
            "was converted to heterozygous-alt.",
        ),
    ]

    header = pysam.VariantHeader()
    added_flag = False
    for record in old_header.records:
        if record.type == "INFO":
            if not added_flag:
                header.add_meta("INFO", items=forced_het_items)
                added_flag = True

            if record.get("ID", "") == "SVTYPE":
                # Rebuild the record under its new ID; IDX is skipped
                # rather than copied from the old header.
                curr = [
                    (k, "TYPEOFSV") if k == "ID" else (k, v.replace('"', ""))
                    for k, v in record.items()
                    if k != "IDX"
                ]
                header.add_meta(record.key, items=curr)
            else:
                header.add_record(record)

        elif (
            record.type == "GENERIC" and record.key == "center" and record.value == '""'
        ):
            continue
        else:
            header.add_record(record)

    # An input header without any INFO record never reaches the branch
    # above, so declare the flag here to keep the header complete.
    if not added_flag:
        header.add_meta("INFO", items=forced_het_items)

    for sample in old_header.samples:
        header.add_sample(sample)

    return header
Beispiel #4
0
    def _get_test_vcf_header(meta=None, samples=None):
        """Wrap a freshly built VariantHeader in a GenericVcfObject.

        ``meta`` is a single dict of ``add_meta`` keyword arguments or a
        list of such dicts; ``samples`` is a single sample name or a list
        of names. Falsy values for either argument add nothing.
        """
        header = pysam.VariantHeader()

        if meta:
            # Treat a lone dict like a one-element list.
            meta_records = meta if isinstance(meta, list) else [meta]
            for meta_kwargs in meta_records:
                header.add_meta(**meta_kwargs)

        if samples:
            sample_names = samples if isinstance(samples, list) else [samples]
            for sample_name in sample_names:
                header.add_sample(sample_name)

        return GenericVcfObject(header=header)
Beispiel #5
0
def write_vcf(call_data, output_file, verboseness=0):
    """
    Write out variant calling data to a VCF file. By default only writes
    entries where something else than the reference has been observed.

    The header is rendered from ``VCF_TEMPLATE`` with the current time, the
    reference FASTA and one ``##contig`` line per scaffold, and a single
    sample named ``straingr`` is added. The writer is closed before
    returning, also when writing a record fails, so the output is flushed.

    Parameters
    ----------
    call_data : VariantCallData
    output_file : file
    verboseness : int
        Determines the verboseness of the VCF. Higher value will result in
        more entries in the VCF. Accepted values:

        0: Only output StrainGR strong SNPs
        1: Output strong and weak SNPs
        2: Output an entry for every position in the genome, even if nothing
           else but the reference is observed.
    """

    contig_lengths = "\n".join(
        f"##contig=<ID={scaffold.name},length={scaffold.length}>"
        for scaffold in call_data.scaffolds_data.values())

    header_str = VCF_TEMPLATE.format(date=datetime.now(),
                                     ref=call_data.reference_fasta,
                                     contig_lengths=contig_lengths)

    header = pysam.VariantHeader()
    for line in header_str.split('\n'):
        header.add_line(line)

    header.add_sample("straingr")

    # The context manager guarantees the writer is closed (and therefore
    # flushed) instead of relying on garbage collection.
    with pysam.VariantFile(output_file, 'w', header=header) as vcf_writer:
        for scaffold in call_data.scaffolds_data.values():
            for record in vcf_records_for_scaffold(vcf_writer, scaffold,
                                                   verboseness):
                vcf_writer.write(record)
Beispiel #6
0
def create_vcf_file(path,
                    sample,
                    fai_path="test_data/human_g1k_v37_decoy.fasta.fai"):
    """ Creates VCF header and Variant File.
    Writes VCF header in Variant File and returns it.

    The header holds the sample, today's date, the source, one contig line
    per row of the FASTA index, and the ALT/FORMAT declarations.

    Parameters
    ----------
    path: str
        Name and path of an output vcf file, for example output/out.vcf
    sample: str
        Name of a sample to add to the VCF file
    fai_path: str
        Path of the tab-separated FASTA index (.fai) whose first two
        columns (contig name, contig length) become the contig lines.
        Defaults to the previously hard-coded location.

    Returns
    -------
    pysam.VariantFile
        Created VCF file with header written in it
    """

    vcf_header = pysam.VariantHeader()
    vcf_header.add_sample(sample)

    date = datetime.datetime.now().strftime('%Y%m%d')
    vcf_header.add_line('##fileDate=' + date)
    vcf_header.add_line('##source=Ema&Nikola')

    # The context manager closes the index even if a header line is rejected.
    with open(fai_path) as faifile:
        for line in faifile:
            split_line = line.split("\t")
            if len(split_line) < 2:
                # Blank or malformed row: no name/length pair to declare.
                continue
            contig = ('##contig=<ID=' + split_line[0] + ', length=' +
                      split_line[1] + '>')
            vcf_header.add_line(contig)

    vcf_header.add_line(
        "##ALT=<ID=*,Description=Different allele than referent.>")
    vcf_header.add_line(
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=Genotype>")
    vcf_header.add_line(
        "##FORMAT=<ID=VAF,Number=1,Type=String,Description=Variant allele frequency>"
    )

    return pysam.VariantFile(path, 'w', header=vcf_header)
Beispiel #7
0
 def _compile_common_header(self, varcall_template, no_filters=False):
     """Build the header shared by the outputs from a VarScan template.

     A fresh header receives the reference, the reference contigs, the
     filter declarations (unless ``no_filters`` is true), a ``source``
     entry of ``varscan.py`` and the INDEL INFO flag. The sample names of
     the ``varcall_template`` file are then added and its header is merged
     in to supply the remaining metadata.
     """
     merged_header = pysam.VariantHeader()

     # VarScan's own header lacks reference and contig information.
     merged_header.add_meta('reference', value=self.ref_genome)
     self._add_ref_contigs_to_header(merged_header)

     include_filters = not no_filters
     if include_filters:
         self._add_filters_to_header(merged_header)

     # Replace the source information and declare the INDEL flag used in
     # record INFO fields.
     merged_header.add_meta('source', value='varscan.py')
     self._add_indel_info_flag_to_header(merged_header)

     # Everything else comes from the template header produced by VarScan.
     with pysam.VariantFile(varcall_template, 'r') as template_file:
         template_header = template_file.header

     for sample_name in template_header.samples:
         merged_header.samples.add(sample_name)
     merged_header.merge(template_header)

     return merged_header
Beispiel #8
0
    def testConstructionWithRecords(self):
        """Build an output header by copying header records one by one.

        Writing records through such a header currently segfaults, so the
        records are only iterated and no comparison of input and output is
        performed. Both files are closed on exit; previously an early
        ``return`` skipped the close and left the handles open.
        """
        fn_in = os.path.join(DATADIR, self.filename)

        with pysam.VariantFile(fn_in) as vcf_in:
            header = pysam.VariantHeader()

            for record in vcf_in.header.records:
                header.add_record(record)

            fn = str("tmp_VariantFileTest_testConstructionWithRecords") + ".vcf"
            with pysam.VariantFile(fn, "w", header=header) as vcf_out:
                for record in vcf_in:
                    # currently segfaults here:
                    # vcf_out.write(record)
                    pass

        # TODO: once writing works, write to a temporary file and compare
        # it with fn_in via self.complete_check().
Beispiel #9
0
    def testConstructionWithLines(self):
        """Round-trip a VCF through a header rebuilt from text lines.

        The samples are added first, then each header record of the input
        is re-added via its string form; all records are copied to a
        temporary file which is compared with the input.
        """
        source_path = os.path.join(CBCF_DATADIR, self.filename)
        target_path = get_temp_filename(suffix=".vcf")
        reader = pysam.VariantFile(source_path)

        rebuilt_header = pysam.VariantHeader()
        for sample_name in reader.header.samples:
            rebuilt_header.add_sample(sample_name)

        for header_record in reader.header.records:
            rebuilt_header.add_line(str(header_record))

        writer = pysam.VariantFile(target_path, "w", header=rebuilt_header)

        for variant in reader:
            writer.write(variant)

        writer.close()
        reader.close()

        self.complete_check(source_path, target_path)
def VCFHeader(sample, fai_path="human_g1k_v37_decoy.fasta.fai"):
    '''Creates VariantHeader object for the vcf file containing variant calling results for sample from the input

    Parameters
    ----------
    sample: str
        Name of the sample to add to the header
    fai_path: str
        Path of the tab-separated FASTA index (.fai) whose first two
        columns (contig name, contig length) become the contig lines.
        Defaults to the previously hard-coded file name.

    Returns
    -------
    pysam.VariantHeader
        Header with the sample, the contigs and the ALT/FORMAT declarations
    '''

    VCFheader = pysam.VariantHeader()
    VCFheader.add_sample(sample)

    # The context manager closes the index even if a header line is rejected.
    with open(fai_path) as faifile:
        for line in faifile:
            split_line = line.split("\t")
            if len(split_line) < 2:
                # Blank or malformed row: no name/length pair to declare.
                continue
            contig = ('##contig=<ID=' + split_line[0] + ', length=' +
                      split_line[1] + '>')
            VCFheader.add_line(contig)

    VCFheader.add_line(
        "##ALT=<ID=*,Description=Represents allele(s) other than observed.>")
    VCFheader.add_line(
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=Genotype>")
    VCFheader.add_line(
        "##FORMAT=<ID=VAF,Number=1,Type=String,Description=Variant allele frequency>"
    )

    return VCFheader
Beispiel #11
0
    def extract(self, contig, start, end, lVarFiles):
        """Extract region with 0-based indexing system

        For every variant file, the samples are registered through
        ``self.addSamples`` and every record fetched from
        ``contig:start-end`` is turned into one ``Position`` per sample,
        stored in ``self.dSamples``. The genotype is read from the GT
        format tag only when the computed depth is positive; otherwise it
        is ``(None, None)``.

        Each variant file is closed once its region has been processed.

        Args:
            contig: contig name; converted with ``str`` before fetching.
            start: region start; converted with ``int`` before fetching.
            end: region end; converted with ``int`` before fetching.
            lVarFiles: iterable of variant file paths.
        """

        logging.info('Extracting from variant files')

        self.checkIndexOfVariantFiles(lVarFiles)

        for varFile in lVarFiles:
            varFileHeader = pysam.VariantHeader()
            # Close each file deterministically instead of keeping one
            # open handle per input until garbage collection.
            with pysam.VariantFile(varFile, header=varFileHeader) as f:

                # register the samples of this file
                self.addSamples(list(f.header.samples), varFile)

                # get list of pos
                for val in f.fetch(str(contig), int(start), int(end)):
                    for sampleName, sampleData in val.samples.items():
                        lAlleles, depth = self.getAllelesFromPosition(
                            list(val.alleles), dict(val.info),
                            sampleData.items())
                        if depth > 0:
                            genotype = (self._getFormatTagValue(
                                sampleData.items(), 'GT', index=0),
                                        self._getFormatTagValue(
                                            sampleData.items(), 'GT',
                                            index=1))
                        else:
                            genotype = (None, None)
                        # NOTE(review): val.pos is pysam's 1-based
                        # position - confirm Position expects that.
                        self.dSamples[sampleName].addPosition(
                            Position(val.contig,
                                     val.pos,
                                     genotype=genotype,
                                     depth=depth,
                                     lAlleles=lAlleles,
                                     lFilters=val.filter))
Beispiel #12
0
    type=str,
    nargs='+',
    required=True,
    help=
    'List of sample names. Order must correspond to samtools mpileup input BAMs/CRAMs.'
)
argparser.add_argument('-o',
                       '--output',
                       metavar='file',
                       dest='output_file',
                       required=True,
                       help='Output file compressed using bgzip.')

if __name__ == '__main__':
    args = argparser.parse_args()
    header = pysam.VariantHeader()
    header.add_line(
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Number of bases">')
    for chrom in list(map(str, range(1, 23))) + [
            'X', 'Y'
    ]:  # add all possible chromosome names. needed by htslibs to track chromosomes in records.
        header.contigs.add(chrom)
        header.contigs.add('chr' + chrom)
    for sample in args.in_samples:
        header.add_sample(sample)
    with pysam.VariantFile(args.output_file, 'wb', header=header) as ofile:
        for line in args.in_mpileup_file:
            fields = line.rstrip().split('\t')
            record = header.new_record(contig=fields[0],
                                       start=int(fields[1]) - 1,
                                       stop=int(fields[1]),
def vcf_header():
    """Return a VariantHeader with samples sample1, sample2 and contig "1"."""
    header = pysam.VariantHeader()
    for sample_name in ("sample1", "sample2"):
        header.add_sample(sample_name)
    header.contigs.add("1")
    return header
Beispiel #14
0
    def write_to_file(gwas_file,
                      gwas_idx,
                      path,
                      fasta,
                      build,
                      trait_id,
                      sample_metadata=None,
                      file_metadata=None,
                      csi=False):
        """Write pickled GWAS results to a VCF/BCF file and index it.

        Args:
            gwas_file: seekable binary file holding pickled result objects.
            gwas_idx: mapping of contig name to a heap whose entries carry
                the file offset of a result at index 1. The heaps are
                consumed (emptied) by this call.
            path: output file path; also the file that gets indexed.
            fasta: object exposing ``references`` and ``lengths``; supplies
                the contig lines and the contig output order.
            build: written as the ``assembly`` of every contig line.
            trait_id: name of the single sample column.
            sample_metadata: optional mapping appended as ``key=value``
                pairs to the ``##SAMPLE`` line. The line is written
                whenever this is given, independent of ``file_metadata``.
            file_metadata: optional mapping written as ``##key=value`` lines.
            csi: forwarded to ``pysam.tabix_index``.
        """
        logging.info("Writing headers to BCF/VCF: {}".format(path))

        header = pysam.VariantHeader()

        field_lines = (
            # INFO
            '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
            # FORMAT
            '##FORMAT=<ID=ES,Number=A,Type=Float,Description="Effect size estimate relative to the alternative allele">',
            '##FORMAT=<ID=SE,Number=A,Type=Float,Description="Standard error of effect size estimate">',
            '##FORMAT=<ID=LP,Number=A,Type=Float,Description="-log10 p-value for effect estimate">',
            '##FORMAT=<ID=AF,Number=A,Type=Float,Description="Alternate allele frequency in the association study">',
            '##FORMAT=<ID=SS,Number=A,Type=Integer,Description="Sample size used to estimate genetic effect">',
            '##FORMAT=<ID=EZ,Number=A,Type=Float,Description="Z-score provided if it was used to derive the EFFECT and SE fields">',
            '##FORMAT=<ID=SI,Number=A,Type=Float,Description="Accuracy score of summary data imputation">',
            '##FORMAT=<ID=NC,Number=A,Type=Integer,Description="Number of cases used to estimate genetic effect">',
            '##FORMAT=<ID=ID,Number=1,Type=String,Description="Study variant identifier">',
            # META
            '##META=<ID=TotalVariants,Number=1,Type=Integer,Description="Total number of variants in input">',
            '##META=<ID=VariantsNotRead,Number=1,Type=Integer,Description="Number of variants that could not be read">',
            '##META=<ID=HarmonisedVariants,Number=1,Type=Integer,Description="Total number of harmonised variants">',
            '##META=<ID=VariantsNotHarmonised,Number=1,Type=Integer,Description="Total number of variants that could not be harmonised">',
            '##META=<ID=SwitchedAlleles,Number=1,Type=Integer,Description="Total number of variants strand switched">',
            '##META=<ID=TotalControls,Number=1,Type=Integer,Description="Total number of controls in the association study">',
            '##META=<ID=TotalCases,Number=1,Type=Integer,Description="Total number of cases in the association study">',
            '##META=<ID=StudyType,Number=1,Type=String,Description="Type of GWAS study [Continuous or CaseControl]">',
        )
        for line in field_lines:
            header.add_line(line)

        # SAMPLES
        header.samples.add(trait_id)
        # Guard on sample_metadata itself: the previous check looked at
        # file_metadata, which either crashed on a missing sample_metadata
        # or silently dropped it.
        if sample_metadata is not None:
            s = "".join(",{}={}".format(k, sample_metadata[k])
                        for k in sample_metadata)
            header.add_line('##SAMPLE=<ID={}{}>'.format(trait_id, s))

        # CONTIG
        assert len(fasta.references) == len(fasta.lengths)
        for contig, length in zip(fasta.references, fasta.lengths):
            header.add_line("##contig=<ID={},length={}, assembly={}>".format(
                contig, length, build))

        # add metadata
        if file_metadata is not None:
            for k in file_metadata:
                header.add_line('##{}={}'.format(k, file_metadata[k]))

        # The context manager closes the output even if a record fails.
        with pysam.VariantFile(path, "w", header=header) as vcf:

            # recall variant objects in chromosome position order
            logging.info("Writing variants to BCF/VCF: {}".format(path))
            for contig in fasta.references:
                if contig not in gwas_idx:
                    continue

                while gwas_idx[contig]:
                    chr_pos = heappop(gwas_idx[contig])

                    # load GWAS result
                    # NOTE(review): pickle.load is unsafe on untrusted
                    # data; confirm gwas_file is only written by this tool.
                    gwas_file.seek(chr_pos[1])
                    result = pickle.load(gwas_file)

                    lpval = -np.log10(result.pval)

                    # check floats
                    if Vcf.is_float32_lossy(result.b):
                        logging.warning(
                            "Effect field cannot fit into float32. Expect loss of precision for: {}"
                            .format(result.b))
                    if Vcf.is_float32_lossy(result.se):
                        # The standard error is replaced by the smallest
                        # positive normal float32; the warning reports the
                        # replacement value.
                        result.se = np.float64(
                            np.finfo(np.float32).tiny).item()
                        logging.warning(
                            "Standard error field cannot fit into float32. Expect loss of precision for: {}"
                            .format(result.se))
                    if Vcf.is_float32_lossy(lpval):
                        logging.warning(
                            "-log10(pval) field cannot fit into float32. Expect loss of precision for: {}"
                            .format(lpval))
                    if Vcf.is_float32_lossy(result.alt_freq):
                        logging.warning(
                            "Allele frequency field cannot fit into float32. Expect loss of precision for: {}"
                            .format(result.alt_freq))
                    if Vcf.is_float32_lossy(result.imp_z):
                        logging.warning(
                            "Imputation Z score field cannot fit into float32. Expect loss of precision for: {}"
                            .format(result.imp_z))
                    if Vcf.is_float32_lossy(result.imp_info):
                        logging.warning(
                            "Imputation INFO field cannot fit into float32. Expect loss of precision for: {}"
                            .format(result.imp_info))

                    record = vcf.new_record()
                    record.chrom = result.chrom
                    assert " " not in record.chrom
                    record.pos = result.pos
                    assert record.pos > 0
                    record.id = Vcf.remove_illegal_chars(result.dbsnpid)
                    record.alleles = (result.ref, result.alt)
                    record.filter.add(result.vcf_filter)

                    if result.alt_freq is not None:
                        record.info['AF'] = result.alt_freq

                    sample = record.samples[trait_id]
                    if result.b is not None:
                        sample['ES'] = result.b
                    if result.se is not None:
                        sample['SE'] = result.se
                    if lpval is not None:
                        sample['LP'] = lpval
                    if result.alt_freq is not None:
                        sample['AF'] = result.alt_freq
                    if result.n is not None:
                        sample['SS'] = round(result.n)
                    if result.imp_z is not None:
                        sample['EZ'] = result.imp_z
                    if result.imp_info is not None:
                        sample['SI'] = result.imp_info
                    if result.ncase is not None:
                        sample['NC'] = round(result.ncase)
                    if result.dbsnpid is not None:
                        sample['ID'] = record.id

                    # write to file
                    vcf.write(record)

        # index output file
        logging.info("Indexing output file")
        pysam.tabix_index(path, preset="vcf", force=True, csi=csi)
Beispiel #15
0
def writevcf_pysam(outpath,bampath):
	"""Convert the merged DeBreak SV list into a VCF file.

	Reads ``outpath+'debreak-allsv-merged-final'`` (one tab-separated SV
	per line) and writes ``outpath+'debreak.vcf'``. Contigs come from the
	header of the BAM at ``bampath``, which also serves as the name of the
	single sample column. A line for which no record can be created
	(e.g. too few columns, non-numeric start, rejected contig) is skipped.
	A final line without a trailing newline is kept.

	Returns 0.
	"""
	# Only the contig names and lengths are needed from the BAM, so it is
	# closed again right away.
	with pysam.AlignmentFile(bampath,'r') as bamfile:
		contigs=list(zip(bamfile.header.references,bamfile.header.lengths))

	filedate=str(datetime.datetime.now()).split(' ')[0]
	vcfheader=pysam.VariantHeader()
	vcfheader.add_sample(bampath)
	vcfheader.add_line('##source=DeBreak\n')
	vcfheader.add_line('##fileDate='+filedate+'\n')

	for contigname,contiglength in contigs:
		vcfheader.contigs.add(contigname,length=contiglength)

	vcfheader.add_line('##ALT=<ID=DEL,Description="Deletion">\n')
	vcfheader.add_line('##ALT=<ID=INS,Description="Insertion">\n')
	vcfheader.add_line('##ALT=<ID=DUP,Description="Duplication">\n')
	vcfheader.add_line('##ALT=<ID=INV,Description="Inversion">\n')
	vcfheader.add_line('##ALT=<ID=TRA,Description="Translocation">\n')

	vcfheader.info.add('CHR2',1,'String','Chromosome for END')
	vcfheader.info.add('END',1,'Integer','End position of the structural variant')
	vcfheader.info.add('MAPQ',1,'Integer','Mean mapping quality of supporting reads')
	vcfheader.info.add('SUPPREAD',1,'Integer','Number of supporting reads')
	vcfheader.info.add('SVLEN',1,'Integer','Length of the SV')
	vcfheader.info.add('SVMETHOD',1,'String','Type of approach used to detect SV')
	vcfheader.info.add('SVTYPE',1,'String','Type of structural variant')
	vcfheader.info.add('PRECISE',0,'Flag','Variant with precise breakpoint position from POA')

	vcfheader.info.add('MULTI',0,'Flag','If the SV is multi-allelic SV')
	vcfheader.info.add('LARGEINS',0,'Flag','Large insertion indentified from local assembly')
	vcfheader.info.add('START2',1,'Integer','SV start position on the second haplotype of multi-allilic SV')
	vcfheader.info.add('END2',1,'Integer','SV end position on the second haplotype of multi-allilic SV')
	vcfheader.info.add('SVLEN2',1,'Integer','SV length on the second haplotype of multi-allilic SV')

	vcfheader.add_meta('FORMAT', items=[('ID',"GT"), ('Number',1), ('Type','String'),('Description','Genotype')])
	vcfheader.add_line('##CommandLine=debreak '+" ".join(sys.argv[1:])+'\n')

	with open(outpath+'debreak-allsv-merged-final','r') as svfile:
		allsv=svfile.read().split('\n')
	# Drop only the empty string produced by a trailing newline, so a
	# last line without a newline is not lost.
	if allsv and allsv[-1]=='':
		allsv.pop()

	svid=1

	with pysam.VariantFile(outpath+'debreak.vcf','w',header=vcfheader) as f:
		for sv in allsv:
			sv=sv.split('\t')
			try:
				newrec=f.new_record(contig=sv[0],start=int(sv[1]),filter='PASS')
			except Exception:
				# Best effort: skip lines that cannot become a record,
				# without swallowing KeyboardInterrupt/SystemExit.
				continue
			newrec.id='DB'+str(svid)
			newrec.ref='N'
			svid+=1

			istra='Translocation' in sv
			isins='Insertion' in sv
			# The rescue marker may sit inside any column.
			isrescued='rescue_largeins_' in ''.join(sv)

			if istra:
				newrec.info['CHR2']=sv[2]
			else:
				newrec.info['CHR2']=sv[0]
			if isins:
				newrec.stop=newrec.start+1
			elif istra:
				newrec.stop=int(sv[3])
			else:
				newrec.stop=newrec.start+int(sv[2])
			if istra:
				newrec.info['SVLEN']=0
				newrec.info['SUPPREAD']=int(sv[4])
				newrec.info['MAPQ']=int(float(sv[5]))
			else:
				newrec.info['SVLEN']=int(sv[2])
				if isrescued:
					newrec.info['SUPPREAD']=0
				else:
					newrec.info['SUPPREAD']=int(sv[3])
				newrec.info['MAPQ']=int(float(sv[5]))
			newrec.info['SVMETHOD']='DeBreak'
			if isins and isrescued:
				newrec.info['LARGEINS']=True

			if 'Precise' in sv:
				newrec.info['PRECISE']=True

			if 'Deletion' in sv:
				svtype='DEL'
			elif isins:
				svtype='INS'
			elif 'Inversion' in sv:
				svtype='INV'
			elif 'Duplication' in sv:
				svtype='DUP'
			else:
				svtype='TRA'
			newrec.info['SVTYPE']=svtype

			if 'GT=1/0' in sv:
				newrec.samples[bampath]['GT']=(0,1)
			if 'GT=1/1' in sv:
				newrec.samples[bampath]['GT']=(1,1)
			# NOTE(review): an unknown genotype is written as 0/1 -
				# confirm this default is intended.
			if 'GT=./.' in sv:
				newrec.samples[bampath]['GT']=(0,1)
			if 'CompoundSV' in sv:
				newrec.info['MULTI']=True

			f.write(newrec)

	return 0