def __init__(self, output, header_str):
    """Remember the output target and header, then emit the header.

    Args:
        output: Object passed as ``file=`` to ``print`` when the header
            is written.
        header_str: Complete VCF header text. It is printed as-is, with
            no extra trailing newline.
    """
    self.output = output
    self.header_str = header_str

    # The cyvcf2 writer built here is used for formatting only, not for
    # producing the output file, so it is pointed at a scratch temp file.
    scratch_file = tempfile.NamedTemporaryFile(suffix=".vcf", mode="w")
    self.vcf = Writer.from_string(scratch_file, self.header_str)

    # The header goes straight to the real output.
    print(self.header_str, file=self.output, end="")
def to_vcf(self, path):
    """Write every variant yielded by iterating ``self`` to a minimal VCF.

    Each record carries only CHROM, POS, REF and ALT; ID, QUAL, FILTER
    and INFO are written as ``.``.

    Args:
        path: Destination handed to ``cyvcf2.Writer.from_string``.
    """
    from cyvcf2 import Writer

    # Minimal header: file format line plus the eight fixed VCF columns.
    header = (
        "##fileformat=VCFv4.2\n"
        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"
    )
    writer = Writer.from_string(path, header)
    try:
        for v in self:
            fields = [v.chrom, str(v.pos), '.', v.ref, v.alt, '.', '.', '.']
            record = writer.variant_from_string('\t'.join(fields))
            writer.write_record(record)
    finally:
        # Close explicitly so the output is finalised deterministically,
        # even when a record fails, instead of relying on garbage
        # collection of the writer.
        writer.close()
def test_writer_from_string():
    """Build a Writer from a header string and write a single record."""
    header_text = (
        "##fileformat=VCFv4.1\n"
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"
        "##contig=<ID=chr2,length=249250621,assembly=hg19>\n"
        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsamplea\n"
    )

    writer = Writer.from_string("out.vcf", header_text)
    writer.write_header()

    # One single-sample record, parsed against the header above.
    record_line = "chr1\t234\t.\tA\tC\t40\tPASS\t.\tGT\t0/0"
    record = writer.variant_from_string(record_line)
    writer.write_record(record)

    writer.close()
def to_vcf(self, path, remove_samples=False, clean_info=False):
    """Write the query result to a VCF file.

    The header is taken from ``self.vcf.raw_header`` and each record from
    the ``source`` attribute of the items yielded by iterating ``self``.

    Args:
        path: Destination handed to ``cyvcf2.Writer.from_string``.
        remove_samples: Drop every column after the eighth (FORMAT and
            the per-sample columns) from the header and from each record.
        clean_info: Replace the INFO column of each record with ``.``.
    """
    from cyvcf2 import Writer

    header = self.vcf.raw_header

    if remove_samples:
        # Find the column line by its '#CHROM' prefix rather than by
        # position, so the result does not depend on whether the raw
        # header ends with a newline. Only the first eight columns stay.
        header = '\n'.join(
            '\t'.join(line.strip().split('\t')[:8])
            if line.startswith('#CHROM') else line
            for line in header.split('\n')
        )

    writer = Writer.from_string(path, header)
    try:
        for v in self:
            variant = v.source

            if remove_samples or clean_info:
                # VCF columns are tab-separated. Splitting on arbitrary
                # whitespace would shift the columns whenever a field
                # contains a space.
                fields = str(variant).rstrip('\r\n').split('\t')

                if remove_samples:
                    # Everything after the eighth column is FORMAT/sample data.
                    fields = fields[:8]

                if clean_info:
                    # The eighth column (index 7) is INFO; blank it out.
                    fields[7] = '.'

                variant = writer.variant_from_string('\t'.join(fields))

            writer.write_record(variant)
    finally:
        # Close explicitly so the output is finalised even if a record
        # fails, instead of relying on garbage collection of the writer.
        writer.close()
def main():
    """Command-line entry point: convert a bed-like allele file to a VCF.

    Parses the arguments, builds the header from the fasta index via
    ``build_header``, opens a cyvcf2 ``Writer`` on the chosen output and
    hands both to ``bed_to_vcf``.
    """
    parser = argparse.ArgumentParser(description="""
        Utility to convert bed-like file with alleles to a simple VCF
        """)
    parser.add_argument('-b', '--bed',
                        help="""
                        bed file to convert to vcf
                        must have chrom, start, end, ref, and alt columns
                        """,
                        required=True)
    parser.add_argument('-f', '--fai',
                        help="""
                        fasta index file generated by samtools faidx
                        """,
                        required=True)
    # The original help text began with a stray ", " which showed up in
    # the --help output; it is removed here.
    parser.add_argument('-o', '--output',
                        help="""
                        output file name,
                        prints to standard out if not supplied
                        """)

    args = parser.parse_args()

    # - is accepted as standard out for cyvcf2
    output = "-" if args.output is None else args.output

    hdr = build_header(args.fai)
    w = Writer.from_string(output, hdr)
    try:
        bed_to_vcf(args.bed, w)
    finally:
        # Always close the writer, even when the conversion raises.
        w.close()
# Getting the vcf header
rawheader = vcf.raw_header
# Column line template; %s is replaced by the sample name of each output.
newline = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s"

# Keep only the '##' meta-information lines; the original column line is
# dropped and replaced by the single-sample template above.
meta_lines = [line for line in rawheader.split('\n') if line.startswith('##')]
headerlist = meta_lines + [newline]
header = '\n'.join(map(str, headerlist))

# Substitute the sample name into the column line only. Applying `%` to
# the whole joined header would treat any literal '%' inside a meta line
# as a format directive and raise or corrupt the output.
header_a = '\n'.join(meta_lines + [newline % sample_a])
header_b = '\n'.join(meta_lines + [newline % sample_b])

w_a = Writer.from_string(''.join([sample, '_a.vcf.gz']), header_a)
w_a.write_header()
w_b = Writer.from_string(''.join([sample, '_b.vcf.gz']), header_b)
w_b.write_header()

for v in vcf:
    # class to get genotype in 0/0, 0/1, ... etc. format.
    gts = v.genotypes

    class Genotype(object):
        __slots__ = ('alleles', 'phased')

        def __init__(self, li):
            # Last element is stored as the phased flag, the rest as alleles.
            self.alleles = li[:-1]
            self.phased = li[-1]