def setUpClass(cls):
    """Create the shared fixtures: an indexed FASTA, option values and an input VCF.

    Everything is written into a fresh temporary directory stored on
    ``cls.dir``; the FASTA path is stored on ``cls.fa``.  The ``tempdir``,
    ``reference``, ``vcf`` and ``out`` attributes of ``get_options`` are
    pointed at paths inside that directory.
    """
    workdir = tempfile.mkdtemp()
    cls.dir = workdir

    # reference genome with a single short contig
    cls.fa = os.path.join(workdir, 'genome.fa')
    with open(cls.fa, 'wt') as fasta:
        fasta.write('>chr1\n'
                    'ACTGATGCTAGCTAGTATCTGACTCAGTAGCTCGAT\n')

    # index the fasta file (the handle is only opened for that and closed
    # again straight away)
    Faidx(cls.fa).close()

    # the remaining args all depend on the temp directory
    vcf_in = os.path.join(workdir, 'in.vcf.gz')
    vcf_out = os.path.join(workdir, 'out.vcf.gz')
    option_values = (
        ('tempdir', workdir),
        ('reference', cls.fa),
        ('vcf', vcf_in),
        ('out', vcf_out),
    )
    for attr_name, attr_value in option_values:
        get_options.set_attr(attr_name, attr_value)

    # VCF to be converted. This includes one variant which cannot be
    # converted. TODO: make a unit test to check for expected log output for
    # unconvertible variant
    vcf_lines = (
        '##fileformat=VCFv4.1\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
        '1\t10\t.\tT\tG\t100\tPASS\tAC=100\n',
        '1\t1000000\t.\tT\tG\t100\tPASS\tAC=100\n',
        '1\t2000000\t.\tA\tG\t100\tPASS\tAC=100\n',
    )
    with gzip.open(vcf_in, 'wt') as vcf:
        vcf.write(''.join(vcf_lines))
def test_reindex_on_modification(self):
    """ This test ensures that the index is regenerated when the FASTA
    modification time is newer than the index modification time.
    mdshw5/pyfaidx#50 """
    fasta_path = 'data/genes.fasta'

    # Record the modification time of the current index.  The handle is
    # released in ``finally`` so a failing ``getmtime`` cannot leak it.
    faidx = Faidx(fasta_path)
    try:
        index_mtime = getmtime(faidx.indexname)
    finally:
        faidx.close()

    # Make the FASTA look newer than its index (atime and mtime are both
    # set to ten seconds after the index mtime).
    os.utime(fasta_path, (index_mtime + 10, ) * 2)
    # NOTE(review): presumably the pause guarantees that a rebuilt index gets
    # a strictly later mtime even on filesystems with coarse timestamps.
    time.sleep(2)

    # Re-opening must rebuild the index; close the handle even when the
    # assertion fails.
    faidx = Faidx(fasta_path)
    try:
        assert getmtime(faidx.indexname) > index_mtime
    finally:
        faidx.close()
class Genome(object):
    """Sequence access to the FASTA file of one genome database.

    The FASTA is looked up at ``<DATA_FOLDER>/<db>/<db>.fa`` where
    ``DATA_FOLDER`` comes from ``app.config``.  The instance keeps the
    ``Faidx`` handle open until :meth:`destroy` is called; it can also be
    used as a context manager, in which case the handle is released when the
    ``with`` block exits::

        with Genome(db) as genome:
            genome.get_sequence(name, start, end)
    """

    def __init__(self, db):
        """Open the FASTA file belonging to the database named *db*."""
        # imported here rather than at module level, as in the original code
        from pyfaidx import Faidx
        fa = os.path.join(app.config["DATA_FOLDER"], db, db + ".fa")
        self.fasta = Faidx(fa)

    def __enter__(self):
        """Return the instance itself for use inside a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Release the FASTA handle; exceptions from the block propagate."""
        self.destroy()
        return False

    def get_sequence(self, chr, start, end):
        """Return whatever ``Faidx.fetch`` yields for *chr*, *start*, *end*.

        The arguments are passed through unchanged.
        """
        return self.fasta.fetch(chr, start, end)

    def destroy(self):
        """Close the underlying FASTA handle."""
        self.fasta.close()
def setUpClass(cls):
    """Create a temporary directory holding a small indexed FASTA file.

    The directory is stored on ``cls.dir`` and the FASTA path on ``cls.fa``.
    The single contig is named ``chrN`` and its sequence starts with two
    ``N`` bases.
    """
    workdir = tempfile.mkdtemp()
    cls.dir = workdir

    # write the fasta file: one header line followed by one sequence line
    cls.fa = os.path.join(workdir, 'genome.fa')
    with open(cls.fa, 'wt') as fasta:
        fasta.write('>chrN\n'
                    'NNTGATGCTAGCTAGTATCTG\n')

    # index the fasta file (the handle is only opened for that and closed
    # again straight away)
    Faidx(cls.fa).close()
def profile(self,
            input_vcf_file,
            ref_genome_file,
            output_file,
            raw_gt_format="GTR",
            sample_id=None,
            ):
    """Count single-base substitutions per feature for every sample in a VCF.

    The input is piped through ``vt decompose`` and a few ``grep`` filters
    into a temporary ``clean.vcf``, which is then queried with ``vcf-query``.
    For each remaining record whose REF and ALT are both a single base, the
    three-base window around the position is fetched from the reference and
    mapped to a feature id through ``SNV_FEATURES_HASH[ref][alt][triplet]``.
    Every sample whose genotype string is not ``"0/0"`` has that feature's
    ``FEATURE_QUANTITY`` incremented.  The result is written to
    *output_file* as a tab-separated table with one row per feature and one
    column per sample.

    Args:
        input_vcf_file: path of the VCF to profile; read through ``gunzip -c``
            when the name ends in ``.gz``, otherwise through ``cat``.
        ref_genome_file: path of the reference FASTA, opened with ``Faidx``.
        output_file: path of the feature table to write.
        raw_gt_format: name of the per-sample tag requested from
            ``vcf-query`` (inserted as ``%<raw_gt_format>``).
        sample_id: accepted for interface compatibility; this method does not
            read it.

    Raises:
        KeyError: if a ref/alt/triplet combination is missing from
            ``SNV_FEATURES_HASH``.
        AttributeError: if a per-sample column does not have the
            ``<sample>=<genotype>`` shape.
    """
    # unzip, decompose and clean the vcf file
    # NOTE(review): the paths are spliced into a shell pipeline unquoted, so
    # a path containing spaces or shell metacharacters will break (or alter)
    # the command.  The directory created by mkdtemp() is also never removed.
    clean_vcf_file = join_path(mkdtemp(), "clean.vcf")
    if input_vcf_file.endswith(".gz"):
        cmd = "gunzip -c " + input_vcf_file
    else:
        cmd = "cat " + input_vcf_file
    cmd += " | vt decompose -s -"
    # the backslash is doubled so that a literal "\*" reaches grep without
    # relying on Python passing an unknown escape sequence through
    cmd += " | grep -Pv \"\t\\*\t\""
    cmd += " | grep -v \"\\x3b\""
    cmd += " | grep -v \"^M\""
    cmd += " > " + clean_vcf_file
    exec_sh(cmd, silent=True)

    # parse the clean vcf file for the required fields
    vcf_query_format = (
        "'"
        "%CHROM"
        "\t%POS"
        "\t%REF"
        "\t%ALT"
        "[\t%SAMPLE=%" + raw_gt_format + "]"
        "\n"
        "'"
    )
    cmd = "vcf-query"
    cmd += " -f " + vcf_query_format
    cmd += " " + clean_vcf_file
    p, stdout_data = exec_sh(cmd, silent=True)

    # decode and split the query output once; it is walked twice below
    records = stdout_data.decode('utf-8').split("\n")
    # every column after the first four looks like "<sample>=<genotype>"
    gt_pattern = re.compile(r"(?P<sample_id>.*)=(?P<raw_gt>.*)")

    # get the list of sample ids from the first record and give each sample
    # its own copy of the feature template
    samples_features = {}
    for gt_data in records[0].strip().split("\t")[4:]:
        sample_name = gt_pattern.match(gt_data).group("sample_id")
        samples_features[sample_name] = copy.deepcopy(SNV_FEATURES_TEMPLATE)

    # iterate over all vcf records and count variants for each sample
    fa = Faidx(ref_genome_file)
    try:
        for variant_record in records:
            variant_items = variant_record.strip().split("\t")
            if len(variant_items) < 4:
                continue
            chrom, pos, ref, alt = variant_items[:4]
            # keep single-base substitutions only
            if len(ref) > 1 or ref == "-" or len(alt) > 1 or alt == "-":
                continue
            position = int(pos)
            # the variant base together with one neighbour on each side
            triplet = fa.fetch(chrom, position - 1, position + 1).seq
            feature_id = SNV_FEATURES_HASH[ref][alt][triplet]
            # iterate over all samples in the record
            for gt_data in variant_items[4:]:
                m = gt_pattern.match(gt_data)
                if m.group("raw_gt") == "0/0":
                    continue
                samples_features[m.group("sample_id")][feature_id][FEATURE_QUANTITY] += 1
    finally:
        # release the reference handle even when a record fails
        fa.close()

    # write output feature file
    with open(output_file, "w") as f_o:
        header_fields = [VARIANT_TYPE, VARIANT_SUBGROUP, FEATURE_ID]
        header_fields.extend(samples_features)
        f_o.write("\t".join(header_fields) + "\n")
        for feature_id in SNV_FEATURES_TEMPLATE:
            feature = SNV_FEATURES_TEMPLATE[feature_id]
            row_fields = ["{:s}\t{:s}\t{:s}".format(feature[VARIANT_TYPE],
                                                    feature[VARIANT_SUBGROUP],
                                                    feature_id,
                                                    )]
            row_fields.extend(
                str(samples_features[sample_name][feature_id][FEATURE_QUANTITY])
                for sample_name in samples_features
            )
            f_o.write("\t".join(row_fields) + "\n")
    self.info()
    self.info("Done!! The output file is at " + output_file)