def extracAllels(chrom, vcf_coord, var_descr, genref):
    '''compute alternative allele from description in bed file.
    must be one of sub(C->T), ins(CCCT), del(5)
    '''
    ref_allel = pysam.faidx(genref, chrom + ':' + vcf_coord + '-' + vcf_coord)[1].strip()
    if 'sub' in var_descr:
        yy = var_descr.split('->')
        ref = yy[0][-1]
        if ref == ref_allel:
            return '\t'.join([ref, yy[1][0]])
        else:
            print >> sys.stderr, 'ref alleles do not match, exiting...'
            print >> sys.stderr, chrom, vcf_coord, var_descr
            sys.exit(1)
    elif 'ins' in var_descr:
        yy = var_descr.split('(')[1]
        return '\t'.join([ref_allel, ref_allel + yy[:-1]])
    elif 'del' in var_descr:
        yy = var_descr.split('(')[1]
        del_len = int(yy[:-1])
        vcf_coord_end = str(int(vcf_coord) + del_len)
        ref = pysam.faidx(genref, chrom + ':' + vcf_coord + '-' + vcf_coord_end)[1].strip()
        if ref[0] == ref_allel:
            return '\t'.join([ref, ref_allel])
        else:
            print >> sys.stderr, 'ref alleles do not match in del, exiting...'
            print >> sys.stderr, chrom, vcf_coord, var_descr
            sys.exit(1)
    else:
        print >> sys.stderr, 'format not found, exiting...'
        sys.exit(1)
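# A minimal usage sketch for extracAllels (hypothetical coordinates and a
# hypothetical faidx-indexed reference 'genome.fa'; not part of the original
# source). Returns a "REF\tALT" pair ready for a VCF line:
def _demo_extracAllels():
    sub_pair = extracAllels('chr1', '12345', 'sub(C->T)', 'genome.fa')  # e.g. "C\tT"
    ins_pair = extracAllels('chr1', '12345', 'ins(CCCT)', 'genome.fa')
    del_pair = extracAllels('chr1', '12345', 'del(5)', 'genome.fa')
    return sub_pair, ins_pair, del_pair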
def gatk_realigner(align_bam, ref_file, config, dbsnp=None, region=None,
                   out_file=None, deep_coverage=False):
    """Realign a BAM file around indels using GATK, returning sorted BAM.
    """
    runner = broad.runner_from_config(config)
    runner.run_fn("picard_index", align_bam)
    runner.run_fn("picard_index_ref", ref_file)
    if not os.path.exists("%s.fai" % ref_file):
        pysam.faidx(ref_file)
    if region:
        align_bam = subset_bam_by_region(align_bam, region, out_file)
        runner.run_fn("picard_index", align_bam)
    if has_aligned_reads(align_bam, region):
        variant_regions = config["algorithm"].get("variant_regions", None)
        realign_target_file = gatk_realigner_targets(runner, align_bam,
                                                     ref_file, dbsnp, region,
                                                     out_file, deep_coverage,
                                                     variant_regions)
        realign_bam = gatk_indel_realignment(runner, align_bam, ref_file,
                                             realign_target_file, region,
                                             out_file, deep_coverage)
        # No longer required in recent GATK (> Feb 2011) -- now done on the fly
        # realign_sort_bam = runner.run_fn("picard_fixmate", realign_bam)
        return realign_bam
    elif out_file:
        shutil.copy(align_bam, out_file)
        return out_file
    else:
        return align_bam
def __init__(self, filename):
    if not os.path.exists(filename + '.fai'):
        import pysam
        pysam.faidx(filename)
    self.fasta = open(filename)
    self.index = self.load_index(filename + '.fai')
def resolved_tool_contract_runner(resolved_contract):
    rc = resolved_contract
    alignment_path = rc.task.input_files[0]
    reference_path = rc.task.input_files[1]
    gff_path = rc.task.output_files[0]
    dataset_path = rc.task.output_files[1]
    fasta_path = re.sub(".contigset.xml", ".fasta", dataset_path)
    fastq_path = rc.task.output_files[2]
    args = [
        alignment_path,
        "--verbose",
        "--reference", reference_path,
        "--outputFilename", gff_path,
        "--outputFilename", fasta_path,
        "--outputFilename", fastq_path,
        "--numWorkers", str(rc.task.nproc),
        "--minCoverage", str(rc.task.options[Constants.MIN_COVERAGE_ID]),
        "--minConfidence", str(rc.task.options[Constants.MIN_CONFIDENCE_ID]),
        "--algorithm", rc.task.options[Constants.ALGORITHM_ID],
        "--alignmentSetRefWindows",
    ]
    if rc.task.options[Constants.DIPLOID_MODE_ID]:
        args.append("--diploid")
    args_ = get_parser().arg_parser.parser.parse_args(args)
    rc = args_runner(args_)
    if rc == 0:
        pysam.faidx(fasta_path)
        ds = ContigSet(fasta_path, strict=True)
        ds.write(dataset_path)
    return rc
def upstream_and_downstream_seq(args):
    chromosome = split_coords(args.coords)[0]
    start = str(split_coords(args.coords)[1])
    downstream = str(int(start) - 1000)
    end = str(split_coords(args.coords.replace('"', ""))[2])
    upstream = str(int(end) + 1000)
    # Using the samtools faidx function to take the appropriate sequence from a reference genome
    downstream_fa = Seq(pysam.faidx(args.genome, chromosome + ":" + downstream + "-" + start), generic_dna)
    upstream_fa = Seq(pysam.faidx(args.genome, chromosome + ":" + end + "-" + upstream), generic_dna)
    # Selecting only the sequence (dropping the FASTA header line) and converting to uppercase
    downstream_seq = downstream_fa[(len(downstream_fa.split('\n')[0])):-1].upper()
    # Selecting only the sequence, converting to uppercase, reversing and then
    # getting the complementary sequence
    reverse_compliment_upstream_seq = upstream_fa[(len(upstream_fa.split('\n')[0])):-1].upper().reverse_complement()
    # Making sequence records with ID header and sequence
    downstream_seq = SeqRecord(downstream_seq, id="downstream_sequence")
    reverse_compliment_upstream_seq = SeqRecord(reverse_compliment_upstream_seq, id="upstream_sequence")
    if not os.path.isdir(args.directory + "tmp/"):
        os.mkdir(args.directory + "tmp/")
    # Writing sequences to fasta files (context managers ensure the files are closed)
    with open(os.path.join(args.directory + "tmp/", "downstream.fa"), "w") as downstream_outfile:
        downstream_outfile.write(">" + str(downstream_seq.id) + "\n" + str(downstream_seq.seq))
    with open(os.path.join(args.directory + "tmp/", "upstream.fa"), "w") as upstream_outfile:
        upstream_outfile.write(">" + str(reverse_compliment_upstream_seq.id) + "\n" + str(reverse_compliment_upstream_seq.seq))
def run(referenceset, fastq, gff, fasta, contigset, alignmentset, options, log_level):
    #'--log-file foo.log',
    #'--verbose',
    #'--debug', # requires 'ipdb'
    #'-j NWORKERS',
    #'--algorithm quiver',
    #'--diploid', # binary
    #'--minConfidence 40',
    #'--minCoverage 5',
    #'--alignmentSetRefWindows',
    cmd = "variantCaller --log-level {log_level} {options} --referenceFilename {referenceset} -o {fastq} -o {gff} -o {fasta} {alignmentset}"
    system(cmd.format(**locals()))
    try:
        say('Converting fasta {!r} to contigset {!r}'.format(fasta, contigset))
        # Convert to contigset.xml
        import pysam
        pysam.faidx(fasta)  # pylint: disable=no-member
        # I do not know why pylint does not see this defined.
        ds = ContigSet(fasta, strict=True)
        ds.write(contigset, relPaths=True)
        say('Successfully wrapped fasta {!r} in contigset {!r}'.format(fasta, contigset))
    except Exception:
        say(traceback.format_exc())
        say('Skipping conversion to contigset.')
def do_download(output_path):
    real_url = self.base_url + url
    raw = get_page(real_url)
    if not raw:  # pragma: no cover
        raise ValueError("Retrieving url failed: %s" % real_url)
    for aregexps in regexps:
        matches = re.findall(aregexps, raw)
        if len(matches) == 1:
            Path(str(output_path / output_filename) + ".url").write_text(
                (real_url + matches[0])
            )
            download_func(
                real_url + match_transformer(matches[0]),
                output_path / output_filename,
            )
            break
    else:
        raise ValueError(  # pragma: no cover - defensive
            "Found either too few or too many matches for every regexp.\nRaw was %s"
            % (raw,)
        )
    if Path(output_filename).suffix == ".fasta":
        import pysam

        pysam.faidx(str((output_path / output_filename).absolute()))
def chrom_length(fasta_in):
    """
    Compute chromosome lengths of fasta file and store them into a file.

    More about the .fai file format can be found here:
    http://www.htslib.org/doc/faidx.html

    Parameters
    ----------
    fasta_in : str
        Path to genome FASTA file (can be .gz).

    Returns
    -------
    str
        Absolute path to output file.

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)

    temp = iCount.files.decompress_to_tempfile(fasta_in)
    pysam.faidx(temp)  # pylint: disable=no-member
    fai_file = os.path.abspath(fasta_in + '.fai')
    shutil.move(temp + '.fai', fai_file)
    LOGGER.info('Fai file saved to : %s', fai_file)
    return fai_file
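# Usage sketch for chrom_length (hypothetical path, not from the original
# source; requires the iCount package this function belongs to). The .fai
# index is written next to the input FASTA:
def _demo_chrom_length():
    fai_path = chrom_length('genome.fa.gz')  # -> '/abs/path/genome.fa.gz.fai'
    return fai_path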
def merge_mut2(mutation_file_list, output_file, reference):
    mut2sample = {}
    sample_ind = 0
    for mut_file in mutation_file_list:
        sample_ind = sample_ind + 1
        is_vcf = mut_file.endswith(".vcf") or mut_file.endswith(".vcf.gz")
        hin2 = gzip.open(mut_file, 'r') if mut_file.endswith(".gz") else open(mut_file, 'r')
        for line2 in hin2:
            F2 = line2.rstrip('\n').split('\t')
            if F2[0].startswith('#'):
                continue
            if F2[0] == "Chr":
                continue
            if not is_vcf:
                pos, ref, alt = F2[1], F2[3], F2[4]
                # insertion
                if F2[3] == "-":
                    # get the sequence for the reference base
                    seq = ""
                    for item in pysam.faidx(reference, F2[0] + ":" + str(F2[1]) + "-" + str(F2[1])):
                        seq = seq + item.rstrip('\n')
                    seq = seq.replace('>', '')
                    seq = seq.replace(F2[0] + ":" + str(F2[1]) + "-" + str(F2[1]), '')
                    ref, alt = seq, seq + F2[4]
                # deletion
                if F2[4] == "-":
                    # get the sequence for the reference base
                    seq = ""
                    for item in pysam.faidx(reference, F2[0] + ":" + str(int(F2[1]) - 1) + "-" + str(int(F2[1]) - 1)):
                        seq = seq + item.rstrip('\n')
                    seq = seq.replace('>', '')
                    seq = seq.replace(F2[0] + ":" + str(int(F2[1]) - 1) + "-" + str(int(F2[1]) - 1), '')
                    pos, ref, alt = str(int(F2[1]) - 1), seq + F2[3], seq
                QUAL = 60
                INFO = "SOMATIC"
                key = '\t'.join([F2[0], pos, '.', ref, alt, str(QUAL), "PASS", INFO])
            else:
                key = '\t'.join(F2[0:8])
            if key not in mut2sample:
                mut2sample[key] = []
            mut2sample[key].append(str(sample_ind))
        hin2.close()

    sample_num = sample_ind
    hout = open(output_file, 'w')
    for mut in sorted(mut2sample):
        if len(mut2sample[mut]) == sample_num:
            continue
        print >> hout, mut + '\t' + ','.join(mut2sample[mut])
    hout.close()
def extracAllels(chrom, vcf_coord, var_descr, genref):
    '''compute alternative allele from description in bed file.
    must be one of sub(C->T), ins(CCCT), del(5)
    '''
    ref_allel = pysam.faidx(genref, chrom + ':' + vcf_coord + '-' + vcf_coord)[1].strip()
    if 'sub' in var_descr:
        yy = var_descr.split('->')
        ref = yy[0][-1]
        if ref == ref_allel:
            return '\t'.join([ref, yy[1][0]])
        else:
            print 'ref alleles do not match, exiting...'
            print chrom, vcf_coord, var_descr
            sys.exit(1)
    elif 'ins' in var_descr:
        yy = var_descr.split('(')[1]
        return '\t'.join([ref_allel, ref_allel + yy[:-1]])
    elif 'del' in var_descr:
        yy = var_descr.split('(')[1]
        del_len = int(yy[:-1])
        vcf_coord_end = str(int(vcf_coord) + del_len)
        ref = pysam.faidx(genref, chrom + ':' + vcf_coord + '-' + vcf_coord_end)[1].strip()
        if ref[0] == ref_allel:
            return '\t'.join([ref, ref_allel])
        else:
            print 'ref alleles do not match in del, exiting...'
            print chrom, vcf_coord, var_descr
            sys.exit(1)
    else:
        print 'format not found, exiting...'
        sys.exit(1)
def bed_tofasta(bed, ref_fasta, min_size=50, stranded=True, include_name=False, out=sys.stdout):
    if not os.path.exists('%s.fai' % ref_fasta):
        pysam.faidx(ref_fasta)

    fasta = pysam.Fastafile(ref_fasta)

    refs = set()
    with open('%s.fai' % ref_fasta) as f:
        for line in f:
            refs.add(line.split('\t')[0].strip())

    name = ''
    for region in bed:
        if include_name:
            name = '%s|' % (region.name.strip())

        if region.end - region.start >= min_size and region.chrom in refs:
            seq = fasta.fetch(region.chrom, region.start, region.end)
            if stranded and region.strand:
                if region.strand == '-':
                    seq = revcomp(seq)
                out.write('>%s%s:%d-%d[%s]\n%s\n' % (name, region.chrom, region.start, region.end, region.strand, seq))
            else:
                out.write('>%s%s:%d-%d\n%s\n' % (name, region.chrom, region.start, region.end, seq))

    fasta.close()
def finalize_outputs(options, tdb_writer, out_fasta, out_genepred, out_genepred_annovar, out_fasta_annovar, gbk_dir, out_id, out_excl):
    tdb_writer.finalize(options)
    out_fasta.close()
    out_id.close()
    out_excl.close()
    pysam.faidx(options.output + '.fa')
    out_genepred.close()
    if options.annovar:
        out_genepred_annovar.close()
        out_fasta_annovar.close()
        pysam.faidx('{}_refGeneMrna.fa'.format(options.output))
    if options.gbk:
        shutil.make_archive('{}_gbk'.format(options.output), "zip", './', gbk_dir)
        shutil.rmtree(gbk_dir)
def gen_restricted_reference(reference, regions_bed, out_reference, use_short_contigs_names=False):
    logger = logging.getLogger(gen_restricted_reference.__name__)
    reference_handle = pysam.Fastafile(reference)
    regions_bedtool = pybedtools.BedTool(regions_bed)
    with open(out_reference, "w") as out_fasta:
        for region_index, region in enumerate(regions_bedtool, start=1):
            sequence = reference_handle.fetch(reference=str(region.chrom), start=region.start, end=region.end)
            region_name = str(region_index) if use_short_contigs_names else (
                "%s_%d_%d" % (str(region.chrom), region.start, region.end))
            if region_index == 1:
                out_fasta.write(">{}\n{}".format(region_name, sequence))
            else:
                out_fasta.write("\n>{}\n{}".format(region_name, sequence))
    pysam.faidx(out_reference)
    logger.info("Lifted over the reference to {}".format(out_reference))
    reference_handle.close()
    return out_reference
def make_index(file_name):
    """Make index file for input file"""
    f_bs, f_ext = os.path.splitext(file_name)

    def indexed(fn, ext):
        return os.path.exists(fn + ext)

    def uptodate(fn, ext):
        return os.path.getmtime(fn) < os.path.getmtime(fn + ext)

    infomsg = "{} was indexed and is uptodate. Skipping".format(file_name)
    if f_ext == ".fa":
        if indexed(file_name, ".fai") and uptodate(file_name, ".fai"):
            print(infomsg)
        else:
            pysam.faidx(file_name)
    elif f_ext in [".bam", ".cram"]:
        if indexed(file_name, ".bai") and uptodate(file_name, ".bai"):
            print(infomsg)
        else:
            pysam.index(file_name)
    elif f_ext in [".gff", ".bed", ".vcf", ".sam"]:
        if indexed(file_name, ".gz.tbi") and uptodate(file_name, ".gz.tbi"):
            print(infomsg)
        else:
            pysam.tabix_index(file_name, preset=f_ext.replace(".", ""))
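# Usage sketch for make_index (hypothetical file names, not from the original
# source). The extension selects the indexer: .fa -> faidx, .bam/.cram ->
# samtools index, tab-delimited formats -> tabix (which bgzips the file first):
def _demo_make_index():
    make_index("genome.fa")   # creates genome.fa.fai
    make_index("reads.bam")   # creates reads.bam.bai
    make_index("calls.vcf")   # creates calls.vcf.gz and calls.vcf.gz.tbi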
def bed_tofasta(bed, ref_fasta, min_size=50, stranded=True, include_name=False, out=sys.stdout):
    if not os.path.exists('%s.fai' % ref_fasta):
        pysam.faidx(ref_fasta)

    fasta = pysam.Fastafile(ref_fasta)

    refs = set()
    with open('%s.fai' % ref_fasta) as f:
        for line in f:
            refs.add(line.split('\t')[0].strip())

    name = ''
    for region in bed:
        if include_name:
            name = '%s|' % (region.name.strip())

        if region.end - region.start >= min_size and region.chrom in refs:
            seq = fasta.fetch(region.chrom, region.start, region.end)
            if stranded and region.strand:
                if region.strand == '-':
                    seq = revcomp(seq)
                out.write('>%s%s:%d-%d[%s]\n%s\n' % (name, region.chrom, region.start, region.end, region.strand, seq))
            else:
                out.write('>%s%s:%d-%d\n%s\n' % (name, region.chrom, region.start, region.end, seq))

    fasta.close()
def resolved_tool_contract_runner(resolved_contract):
    rc = resolved_contract
    alignment_path = rc.task.input_files[0]
    reference_path = rc.task.input_files[1]
    gff_path = rc.task.output_files[0]
    vcf_path = rc.task.output_files[1]
    dataset_path = rc.task.output_files[2]
    fasta_path = re.sub(".contigset.xml", ".fasta", dataset_path)
    fastq_path = rc.task.output_files[3]
    args = [
        alignment_path,
        "--verbose",
        "--reference", reference_path,
        "--outputFilename", gff_path,
        "--outputFilename", fasta_path,
        "--outputFilename", fastq_path,
        "--outputFilename", vcf_path,
        "--numWorkers", str(rc.task.nproc),
        "--minCoverage", str(rc.task.options[Constants.MIN_COVERAGE_ID]),
        "--minConfidence", str(rc.task.options[Constants.MIN_CONFIDENCE_ID]),
        "--maskRadius", (str(Constants.DEFAULT_MASK_RADIUS)
                         if bool(rc.task.options[Constants.MASKING_ID]) else "0"),
        "--algorithm", rc.task.options[Constants.ALGORITHM_ID],
        "--alignmentSetRefWindows",
    ]
    args_ = get_parser().arg_parser.parser.parse_args(args)
    rc = args_runner(args_)
    if rc == 0:
        pysam.faidx(fasta_path)
        ds = ContigSet(fasta_path, strict=True)
        ds.write(dataset_path)
    return rc
def generate_data_files(dir_name=None):
    import logging
    logging.basicConfig(level=logging.INFO)
    if dir_name is not None:
        os.chdir(dir_name)
    with open("tst1.fasta", "w") as f:
        f.write(">ecoliK12_pbi_March2013_2955000_to_2980000\n")
        f.write("AAAGAGAGAG" * 2500)
    pysam.faidx("tst1.fasta")
    for i in range(len(sam_strings)):
        sam_file = "tst_%d_subreads.sam" % (i + 1)
        bam_file = "tst_%d_subreads.bam" % (i + 1)
        with open(sam_file, "w") as sam_out:
            sam_out.write(sam_strings[i])
        logging.info("Converting {s} to BAM".format(s=sam_file))
        # FIXME pysam is way broken - can't handle unmapped input?
        # convert to bam using pysam
        # with pysam.AlignmentFile(sam_file, "r", check_sq=False) as sam_in:
        #     with pysam.AlignmentFile(bam_file, "wb",
        #                              template=sam_in) as bam_out:
        #         for s in sam_in:
        #             bam_out.write(s)
        args = ["samtools", "view", "-b", "-o", bam_file, sam_file]
        assert subprocess.call(args) == 0, args
        os.remove(sam_file)
        # XXX don't create .pbi for this file, we want it to be absent
        if bam_file != "tst_2_subreads.bam":
            logging.info("Indexing {b}".format(b=bam_file))
            subprocess.call(["pbindex", bam_file])
def run(self):
    AbstractAnalysis.run(self)  # Call base method to do some logging
    localBamFile = os.path.join(self.getLocalTempDir(), "mapping.bam")
    localSortedBamFile = os.path.join(self.getLocalTempDir(), "mapping.sorted")
    samToBamFile(self.samFile, localBamFile)
    pysam.sort(localBamFile, localSortedBamFile)
    pysam.index(localSortedBamFile + ".bam")
    pysam.faidx(self.referenceFastaFile)
    file_header = self.readFastqFile.split(".fastq")[0].split("/")[-1] + "_" + self.referenceFastaFile.split(".fa")[0].split("/")[-1]
    consensus_vcf = os.path.join(self.outputDir, file_header + "_Consensus.vcf")
    consensus_fastq = os.path.join(self.outputDir, file_header + "_Consensus.fastq")
    system("samtools mpileup -Q 0 -uf %s %s | bcftools view -cg - > %s"
           % (self.referenceFastaFile, localSortedBamFile + ".bam", consensus_vcf))
    system("vcfutils.pl vcf2fq %s > %s" % (consensus_vcf, consensus_fastq))
    system("rm -rf %s" % (self.referenceFastaFile + ".fai"))
    formatted_consensus_fastq = os.path.join(self.getLocalTempDir(), "Consensus.fastq")
    formatConsensusFastq(consensus_fastq, formatted_consensus_fastq)
    system("mv %s %s" % (formatted_consensus_fastq, consensus_fastq))
    self.finish()
def fetch_file(options):
    if len(options) != 4:
        sys.exit('fetch_ucsc.py hg19/hg38/mm9/mm10 ref/kg/ens/fa out')
    if options[1] in {'hg19', 'hg38', 'mm9', 'mm10'}:
        path = 'http://hgdownload.soe.ucsc.edu/goldenPath/%s/' % options[1]
    else:
        sys.exit('Only support human or mouse!')
    s = {32: 95}  # str.translate table mapping spaces (32) to underscores (95)
    if options[2] == 'ref':  # RefSeq gene annotations
        download_file(path + 'database/refFlat.txt.gz', 'refFlat.txt.gz')
        with open(options[3], 'wb') as outf:
            outf.write(gzip.open('refFlat.txt.gz', 'rb').read())
    elif options[2] == 'kg':  # KnownGenes gene annotations
        download_file(path + 'database/knownGene.txt.gz', 'knownGene.txt.gz')
        download_file(path + 'database/kgXref.txt.gz', 'kgXref.txt.gz')
        kg_iso = {}
        with gzip.open('kgXref.txt.gz', 'rb') as kg_id_f:
            for line in kg_id_f:
                iso = line.decode().split('\t')[0]
                gene = line.decode().split('\t')[4].translate(s)
                kg_iso[iso] = gene
        with gzip.open('knownGene.txt.gz', 'rb') as kg_f:
            with open(options[3], 'w') as outf:
                for line in kg_f:
                    entry = line.decode().split('\t')
                    iso = entry[0]
                    outf.write('\t'.join([kg_iso[iso]] + entry[:10]) + '\n')
    elif options[2] == 'ens':  # Ensembl gene annotations
        if options[1] == 'hg38' or options[1] == 'mm10':
            sys.exit('No Ensembl gene annotations for hg38 or mm10!')
        download_file(path + 'database/ensGene.txt.gz', 'ensGene.txt.gz')
        download_file(path + 'database/ensemblToGeneName.txt.gz', 'ensemblToGeneName.txt.gz')
        ens_iso = {}
        with gzip.open('ensemblToGeneName.txt.gz', 'rb') as ens_id_f:
            for line in ens_id_f:
                iso, gene = line.decode().split()
                ens_iso[iso] = gene
        with gzip.open('ensGene.txt.gz', 'rb') as ens_f:
            with open(options[3], 'w') as outf:
                for line in ens_f:
                    entry = line.decode().split()
                    iso = entry[1]
                    outf.write('\t'.join([ens_iso[iso]] + entry[1:11]) + '\n')
    elif options[2] == 'fa':  # Genome sequences
        if options[1] == 'hg38':
            fa_path = 'bigZips/hg38.chromFa.tar.gz'
        else:
            fa_path = 'bigZips/chromFa.tar.gz'
        download_file(path + fa_path, 'chromFa.tar.gz')
        with tarfile.open('chromFa.tar.gz', 'r:gz') as fa:
            with open(options[3], 'w') as outf:
                for f in fa:
                    if f.isfile():
                        content = fa.extractfile(f).read()
                        outf.write(content.decode())
        pysam.faidx(options[3])
    else:
        sys.exit('Only support ref/kg/ens/fa!')
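# Usage sketch for fetch_file, following the usage string above (output file
# names are hypothetical):
#
#   python fetch_ucsc.py hg19 ref refFlat.txt    # RefSeq annotations
#   python fetch_ucsc.py hg38 kg  knownGene.txt  # UCSC KnownGenes
#   python fetch_ucsc.py mm9  ens ensGene.txt    # Ensembl (not hg38/mm10)
#   python fetch_ucsc.py hg19 fa  hg19.fa        # genome FASTA, faidx-indexed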
def fetch_file(options):
    if len(options) != 4:
        sys.exit('fetch_ucsc.py hg19/hg38/mm10 ref/kg/ens/fa out')
    if options[1] in {'hg19', 'hg38', 'mm10'}:
        path = 'http://hgdownload.soe.ucsc.edu/goldenPath/%s/' % options[1]
    else:
        sys.exit('Only support human or mouse!')
    s = string.maketrans(' ', '_')
    if options[2] == 'ref':  # RefSeq gene annotations
        urllib.urlretrieve(path + 'database/refFlat.txt.gz', 'refFlat.txt.gz')
        with open(options[3], 'w') as outf:
            outf.write(gzip.open('refFlat.txt.gz', 'rb').read())
    elif options[2] == 'kg':  # KnownGenes gene annotations
        urllib.urlretrieve(path + 'database/knownGene.txt.gz', 'knownGene.txt.gz')
        urllib.urlretrieve(path + 'database/kgXref.txt.gz', 'kgXref.txt.gz')
        kg_iso = {}
        with gzip.open('kgXref.txt.gz', 'rb') as kg_id_f:
            for line in kg_id_f:
                iso = line.split('\t')[0]
                gene = line.split('\t')[4].translate(s)
                kg_iso[iso] = gene
        with gzip.open('knownGene.txt.gz', 'rb') as kg_f:
            with open(options[3], 'w') as outf:
                for line in kg_f:
                    entry = line.split('\t')
                    iso = entry[0]
                    outf.write('\t'.join([kg_iso[iso]] + entry[:10]) + '\n')
    elif options[2] == 'ens':  # Ensembl gene annotations
        if options[1] == 'hg38':
            sys.exit('No Ensembl gene annotations for hg38!')
        urllib.urlretrieve(path + 'database/ensGene.txt.gz', 'ensGene.txt.gz')
        urllib.urlretrieve(path + 'database/ensemblToGeneName.txt.gz', 'ensemblToGeneName.txt.gz')
        ens_iso = {}
        with gzip.open('ensemblToGeneName.txt.gz', 'rb') as ens_id_f:
            for line in ens_id_f:
                iso, gene = line.split()
                ens_iso[iso] = gene
        with gzip.open('ensGene.txt.gz', 'rb') as ens_f:
            with open(options[3], 'w') as outf:
                for line in ens_f:
                    entry = line.split()
                    iso = entry[1]
                    outf.write('\t'.join([ens_iso[iso]] + entry[1:11]) + '\n')
    elif options[2] == 'fa':  # Genome sequences
        if options[1] == 'hg38':
            fa_path = 'bigZips/hg38.chromFa.tar.gz'
        else:
            fa_path = 'bigZips/chromFa.tar.gz'
        urllib.urlretrieve(path + fa_path, 'chromFa.tar.gz')
        with tarfile.open('chromFa.tar.gz', 'r:gz') as fa:
            with open(options[3], 'w') as outf:
                for f in fa:
                    if f.isfile():
                        outf.write(fa.extractfile(f).read())
        pysam.faidx(options[3])
    else:
        sys.exit('Only support ref/kg/ens/fa!')
def write_fa_subset(seq_names, infile, outfile):
    if not os.path.exists(infile + '.fai'):
        pysam.faidx(infile)

    f = pyfastaq.utils.open_file_write(outfile)
    for name in seq_names:
        print(pysam.faidx(infile, name), end='', file=f)
    pyfastaq.utils.close(f)
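# Usage sketch for write_fa_subset (hypothetical names/paths; requires
# pyfastaq). Extracts the named records from a faidx-indexed FASTA into a
# new file:
def _demo_write_fa_subset():
    write_fa_subset(['chr1', 'chrM'], 'genome.fa', 'subset.fa')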
def _generate_chunk_output_file(self, i=None):
    fn = tempfile.NamedTemporaryFile(suffix=".fasta").name
    suffix = "|arrow"
    with open(fn, "w") as f:
        header, seq = self.CHUNK_CONTIGS[i]
        f.write(">{h}{s}\n{q}".format(h=header, s=suffix, q=seq))
    pysam.faidx(fn)
    return self._make_dataset_file(fn)
def check_fasta(fa_f, pysam_flag=True):
    if not os.path.isfile(fa_f + '.fai'):
        pysam.faidx(fa_f)
    if pysam_flag:  # return pysam FastaFile object
        fa = pysam.FastaFile(fa_f)
        return fa
    else:  # return fasta file path
        return fa_f
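# Usage sketch for check_fasta (hypothetical path). Both modes ensure the
# .fai index exists first; pysam.FastaFile.fetch uses zero-based, half-open
# coordinates:
def _demo_check_fasta():
    fa = check_fasta('genome.fa')                       # pysam.FastaFile handle
    seq = fa.fetch('chr1', 0, 100)
    path = check_fasta('genome.fa', pysam_flag=False)   # just the path
    return seq, path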
def index_fasta(infile):
    '''index fasta file using samTools'''
    if not os.path.isfile(infile + '.fai'):
        print >>sys.stderr, "Indexing " + infile + ' ...',
        pysam.faidx(infile)
        print >>sys.stderr, "Done!"
def ensure_fasta_index(fasta_fname):
    """Ensure a FASTA file is indexed for samtools, to enable fast lookup."""
    fai_fname = fasta_fname + '.fai'
    if not is_newer_than(fai_fname, fasta_fname):
        echo("Indexing FASTA file", fasta_fname)
        pysam.faidx(fasta_fname)
    assert os.path.isfile(fai_fname), "Failed to generate index " + fai_fname
    return fai_fname
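# Usage sketch for ensure_fasta_index (hypothetical path). It re-indexes
# only when the .fai is missing or older than the FASTA itself:
def _demo_ensure_fasta_index():
    fai = ensure_fasta_index('genome.fa')  # -> 'genome.fa.fai'
    return fai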
def __init__(self, num, refname):
    self.num = int(num)
    self.refname = refname

    if not os.path.exists('%s.fai' % refname):
        pysam.faidx(refname)

    self.ref = pysam.Fastafile(refname)
def index_fasta(infile):
    '''index fasta file using samTools'''
    if not os.path.isfile(infile + '.fai'):
        print("Indexing " + infile + ' ...', end=' ', file=sys.stderr)
        pysam.faidx(infile)
        print("Done!", file=sys.stderr)
def run_spades_parallel(bam=None, spades=None, bed=None, work=None, pad=SPADES_PAD, nthreads=1, chrs=[],
                        max_interval_size=SPADES_MAX_INTERVAL_SIZE,
                        timeout=SPADES_TIMEOUT, isize_min=ISIZE_MIN, isize_max=ISIZE_MAX,
                        svs_to_assemble=SVS_ASSEMBLY_SUPPORTED,
                        stop_on_fail=False, max_read_pairs=EXTRACTION_MAX_READ_PAIRS):
    pybedtools.set_tempdir(work)

    logger.info("Running SPAdes on the intervals in %s" % bed)
    if not bed:
        logger.info("No BED file specified")
        return None, None

    bedtool = pybedtools.BedTool(bed)
    total = bedtool.count()

    chrs = set(chrs)
    all_intervals = [interval for interval in bedtool] if not chrs else [interval for interval in bedtool if interval.chrom in chrs]
    selected_intervals = filter(partial(should_be_assembled, max_interval_size=max_interval_size, svs_to_assemble=svs_to_assemble),
                                all_intervals)
    ignored_intervals = filter(partial(shouldnt_be_assembled, max_interval_size=max_interval_size, svs_to_assemble=svs_to_assemble),
                               all_intervals)

    pool = multiprocessing.Pool(nthreads)
    assembly_fastas = []
    for i in xrange(nthreads):
        intervals = [interval for (j, interval) in enumerate(selected_intervals) if (j % nthreads) == i]
        kwargs_dict = {"intervals": intervals, "bam": bam, "spades": spades, "work": "%s/%d" % (work, i),
                       "pad": pad, "timeout": timeout, "isize_min": isize_min, "isize_max": isize_max,
                       "stop_on_fail": stop_on_fail, "max_read_pairs": max_read_pairs}
        pool.apply_async(run_spades_single, kwds=kwargs_dict,
                         callback=partial(run_spades_single_callback, result_list=assembly_fastas))

    pool.close()
    pool.join()

    logger.info("Merging the contigs from %s" % (str(assembly_fastas)))
    assembled_fasta = os.path.join(work, "spades_assembled.fa")
    with open(assembled_fasta, "w") as assembled_fd:
        for line in fileinput.input(assembly_fastas):
            assembled_fd.write("%s\n" % (line.strip()))

    if os.path.getsize(assembled_fasta) > 0:
        logger.info("Indexing the assemblies")
        pysam.faidx(assembled_fasta)
    else:
        logger.error("No assembly generated")
        assembled_fasta = None

    ignored_bed = None
    if ignored_intervals:
        ignored_bed = os.path.join(work, "ignored.bed")
        pybedtools.BedTool(ignored_intervals).each(add_breakpoints).saveas(ignored_bed)

    pybedtools.cleanup(remove_all=True)

    return assembled_fasta, ignored_bed
def checkFASTA(fastaFileStr):
    fastaIndex = fastaFileStr + '.fai'
    if not os.path.isfile(fastaFileStr):
        print('ERROR: FASTA file does not exist')
        exit()
    elif not os.path.isfile(fastaIndex):
        print('WARNING: FASTA index file does not exist...creating')
        pysam.faidx(fastaFileStr)
    return True
def write_fasta(seqs, fasta_path, index=True):
    with open(fasta_path, 'w') as fasta:
        for k in seqs:
            fasta.write('\n'.join(
                ['>%s' % k] +
                [seqs[k][i:(i + 80)] for i in range(0, len(seqs[k]), 80)] +
                ['\n']))
    if index:
        pysam.faidx(fasta_path)  # reindex
    return True
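# Usage sketch for write_fasta (hypothetical records). Sequence lines are
# wrapped at 80 columns and the result is faidx-indexed by default:
def _demo_write_fasta():
    write_fasta({'contig1': 'ACGT' * 50, 'contig2': 'TTGCA' * 40}, 'contigs.fa')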
def __init__(self, num, refname, dbsnpname):
    sys.stderr.write('Note: MismatchRefDbSNP is considered *experimental*\n')
    self.num = int(num)
    self.refname = refname
    self.dbsnp = DBSNP(dbsnpname)

    if not os.path.exists('%s.fai' % refname):
        pysam.faidx(refname)

    self.ref = pysam.Fastafile(refname)
def get_genome_stats(genome_fasta):
    reference_fasta_index = genome_fasta + '.fai'
    if not os.path.exists(reference_fasta_index):
        print("\nIndexing %s\n" % os.path.abspath(genome_fasta))
        pysam.faidx(genome_fasta)
    reference_genome = pysam.FastaFile(genome_fasta)
    total_bases = sum(reference_genome.lengths)
    return reference_genome.nreferences, total_bases
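# Usage sketch for get_genome_stats (hypothetical path). Returns the contig
# count and total genome size, indexing the FASTA first if needed:
def _demo_get_genome_stats():
    n_contigs, n_bases = get_genome_stats('genome.fa')
    return n_contigs, n_bases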
def __faidx(self):
    if not os.path.isfile(self.fafile + '.fai'):
        try:
            pysam.faidx(self.fafile)
            return True
        except Exception:
            raise RuntimeError()
    else:
        print "already exists"
        return False
def get_reference_sequence(ref_location, contig, start_pos, end_pos):
    # ensure faidx
    if not os.path.isfile("{}.fai".format(ref_location)):
        subprocess.check_call(['samtools', 'faidx', ref_location])
    if not os.path.isfile("{}.fai".format(ref_location)):
        pysam.faidx(ref_location)

    # use pysam
    with closing(pysam.FastaFile(ref_location)) as ref:
        return ref.fetch(reference=contig, start=start_pos, end=end_pos)
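# Usage sketch for get_reference_sequence (hypothetical path and coordinates;
# pysam.FastaFile.fetch uses zero-based, half-open coordinates):
def _demo_get_reference_sequence():
    return get_reference_sequence('genome.fa', 'chr1', 10000, 10100)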
def index_genomefq(self):
    """
    Index whole genome fasta with samtools
    :return:
    """
    try:
        pysam.faidx(self.whole_genome)
    except Exception as e:
        print('Problem in pysam faidx')
        print(e)
def _write_fasta_or_contigset(file_name, make_faidx=False, n_records=251):
    fasta_file = re.sub(".contigset.xml", ".fasta", file_name)
    rec = [">chr%d\nacgtacgtacgt" % x for x in range(n_records)]
    with open(fasta_file, "w") as f:
        f.write("\n".join(rec))
        f.flush()
    if make_faidx:
        pysam.faidx(fasta_file)
    if file_name.endswith(".xml"):
        cs = ContigSet(fasta_file, strict=make_faidx)
        cs.write(file_name)
def main():
    if len(sys.argv) != 4:
        sys.exit('fetch_ucsc.py human/mouse ref/kg/ens/fa out')
    if sys.argv[1] == 'human':
        path = 'http://hgdownload.soe.ucsc.edu/goldenPath/hg19/'
    elif sys.argv[1] == 'mouse':
        path = 'http://hgdownload.soe.ucsc.edu/goldenPath/mm10/'
    else:
        sys.exit('Only support human or mouse!')
    s = string.maketrans(' ', '_')
    if sys.argv[2] == 'ref':  # RefSeq gene annotations
        urllib.urlretrieve(path + 'database/refFlat.txt.gz', 'refFlat.txt.gz')
        with open(sys.argv[3], 'w') as outf:
            outf.write(gzip.open('refFlat.txt.gz', 'rb').read())
    elif sys.argv[2] == 'kg':  # KnownGenes gene annotations
        urllib.urlretrieve(path + 'database/knownGene.txt.gz', 'knownGene.txt.gz')
        urllib.urlretrieve(path + 'database/kgXref.txt.gz', 'kgXref.txt.gz')
        kg_iso = {}
        with gzip.open('kgXref.txt.gz', 'rb') as kg_id_f:
            for line in kg_id_f:
                iso = line.split('\t')[0]
                gene = line.split('\t')[4].translate(s)
                kg_iso[iso] = gene
        with gzip.open('knownGene.txt.gz', 'rb') as kg_f:
            with open(sys.argv[3], 'w') as outf:
                for line in kg_f:
                    entry = line.split('\t')
                    iso = entry[0]
                    outf.write('\t'.join([kg_iso[iso]] + entry[:10]) + '\n')
    elif sys.argv[2] == 'ens':  # Ensembl gene annotations
        urllib.urlretrieve(path + 'database/ensGene.txt.gz', 'ensGene.txt.gz')
        urllib.urlretrieve(path + 'database/ensemblToGeneName.txt.gz', 'ensemblToGeneName.txt.gz')
        ens_iso = {}
        with gzip.open('ensemblToGeneName.txt.gz', 'rb') as ens_id_f:
            for line in ens_id_f:
                iso, gene = line.split()
                ens_iso[iso] = gene
        with gzip.open('ensGene.txt.gz', 'rb') as ens_f:
            with open(sys.argv[3], 'w') as outf:
                for line in ens_f:
                    entry = line.split()
                    iso = entry[1]
                    outf.write('\t'.join([ens_iso[iso]] + entry[1:11]) + '\n')
    elif sys.argv[2] == 'fa':  # Genome sequences
        urllib.urlretrieve(path + 'bigZips/chromFa.tar.gz', 'chromFa.tar.gz')
        with tarfile.open('chromFa.tar.gz', 'r:gz') as seq:
            with open(sys.argv[3], 'w') as outf:
                for f in seq:
                    outf.write(seq.extractfile(f).read())
        pysam.faidx(sys.argv[3])
    else:
        sys.exit('Only support ref/kg/ens/fa!')
def prep(output_filename):
    import pysam
    with open(output_filename, "wb") as op:
        for fn in filenames:
            for key, seq in iter_fasta(
                    fn, lambda x: x[:x.find(b" ")] if b" " in x else x):
                op.write(b">%s\n%s\n" % (key, b"\n".join(wrappedIterator(80)(seq))))
    pysam.faidx(output_filename)
def create_index(cls, fasta_file, force_overwrite=False):
    logger = logging.getLogger(cls.__name__)
    fasta_file = Path(fasta_file)
    if not fasta_file.is_file():
        logger.error("File {} not found".format(fasta_file))
        exit(1)
    if not fasta_file.with_name(fasta_file.name + '.fai').is_file() or force_overwrite:
        pysam.faidx(str(fasta_file))
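# Usage sketch for create_index, assuming it is bound as a classmethod on
# some indexer class ('FastaIndexer' is a hypothetical name):
#
#   FastaIndexer.create_index('genome.fa')
#   FastaIndexer.create_index('genome.fa', force_overwrite=True)  # rebuild .fai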
def build_index(self, force=False):
    self._import_pysam()
    if not isinstance(self.fos, str):
        raise TypeError, "This function only works with FastaReader objects " + \
            "connected to a fasta file via file name"
    index_filename = self.fos + ".fai"
    if os.access(index_filename, os.R_OK):
        if (not force) and os.stat(self.fos).st_mtime <= os.stat(index_filename).st_mtime:
            # index is up to date
            return
    pysam.faidx(self.fos)
    if not os.access(index_filename, os.R_OK):
        raise SystemError, "Building of Fasta index failed due to unknown error."
def index_fasta(infile):
    '''
    Index fasta file using samTools.
    '''
    try:
        if os.path.getsize(infile + '.fai'):
            logging.debug("\"%s\" exists. Skip indexing!" % (infile + '.fai'))
    except OSError:
        logging.warning("Can not find the index file: \"%s\"" % (infile + '.fai'))
        logging.info("Indexing \"%s\" using the \"pysam\" module..." % infile)
        pysam.faidx(infile)
        logging.info("Done!")
def _make_barcodes(file_name=None):
    if file_name is None:
        file_name = tempfile.NamedTemporaryFile(suffix=".barcodeset.xml").name
    fasta_file_name = file_name
    if file_name.endswith(".barcodeset.xml"):
        fasta_file_name = re.sub(".barcodeset.xml", ".fasta", file_name)
    with FastaWriter(fasta_file_name) as fa_out:
        for i in range(1010):
            fa_out.writeRecord("%04d_Forward" % i, "A" * 16)
    pysam.faidx(fasta_file_name, catch_stdout=False)
    ds = BarcodeSet(fasta_file_name, strict=True)
    ds.write(file_name)
    return file_name
def gatk_realigner(align_bam, ref_file, config, dbsnp=None, deep_coverage=False):
    """Realign a BAM file around indels using GATK, returning sorted BAM.
    """
    runner = broad.runner_from_config(config)
    runner.run_fn("picard_index", align_bam)
    runner.run_fn("picard_index_ref", ref_file)
    if not os.path.exists("%s.fai" % ref_file):
        pysam.faidx(ref_file)
    realign_target_file = gatk_realigner_targets(runner, align_bam,
                                                 ref_file, dbsnp, deep_coverage)
    realign_bam = gatk_indel_realignment(runner, align_bam, ref_file,
                                         realign_target_file, deep_coverage)
    # No longer required in recent GATK (> Feb 2011) -- now done on the fly
    # realign_sort_bam = runner.run_fn("picard_fixmate", realign_bam)
    return realign_bam
def check_fasta(fa, return_handle=True):
    '''
    Check fasta files.
    http://pysam.readthedocs.io/en/latest/api.html?highlight=faidx#fasta-files
    '''
    if not os.path.isfile(fa):
        sys.exit('No such file: %s!' % fa)
    if not os.path.isfile(fa + '.fai'):
        pysam.faidx(fa)
    if return_handle:
        return pysam.FastaFile(fa)
    else:
        return fa
def makeTwoReference(self, chr, start, end, ref, alt, output):
    hOUT = open(output, 'w')

    seq = ""
    label = ','.join([chr, str(start), str(end), ref, alt])
    range = chr + ":" + str(int(start) - self.window + 1) + "-" + str(int(end) + self.window)
    for item in pysam.faidx(self.reference_genome, range):
        if item[0] == ">":
            continue
        seq = seq + item.rstrip('\n').upper()

    print >> hOUT, '>' + label + "_ref"
    print >> hOUT, seq

    # for insertion
    if ref == "-":
        seq = seq[0:(self.window + 1)] + alt + seq[-self.window:]
    # for deletion
    elif alt == "-":
        seq = seq[0:self.window] + seq[-self.window:]
    # for SNV
    else:
        seq = seq[0:self.window] + alt + seq[-self.window:]

    print >> hOUT, '>' + label + "_alt"
    print >> hOUT, seq

    hOUT.close()
def bgzip_index(original_file, new_file, file_format):
    """
    bgzip-compress and index a file.

    :param original_file: path to the uncompressed input file
    :param new_file: path for the bgzipped output file (used for 'fa' only)
    :param file_format: 'fa' or 'vcf'
    :return:
    """
    if file_format.lower() == 'fa':
        tabix_compress(original_file, new_file)
        faidx(new_file)
        delete_file(original_file)
    elif file_format.lower() == 'vcf':
        tabix_index(original_file, preset="vcf", force=True)
    else:
        raise G2GValueError("Unknown file format: {0}".format(file_format))
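# Usage sketch for bgzip_index (hypothetical paths; tabix_compress, faidx and
# tabix_index come from pysam, while delete_file and G2GValueError belong to
# the host project):
def _demo_bgzip_index():
    bgzip_index('genome.fa', 'genome.fa.gz', 'fa')   # writes .gz + .fai, removes input
    bgzip_index('calls.vcf', 'calls.vcf.gz', 'vcf')  # bgzips and .tbi-indexes in place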
def read_pysam(f, headers):
    tstart = time.time()
    for k in islice(headers, 0, None, 100):
        for start, end in intervals:
            if time.time() - tstart > 300:
                print(k)
                tstart = time.time()
            str(pysam.faidx(f, '{0}:{1}-{2}'.format(k, start + 1, end)))