def __determine_sequences(self, query_seqs, target_seqs):
    """Assign the query and target sequence indexes on the instance.

    :param query_seqs: path to a FASTA file (opened with ``pyfaidx.Fasta``),
        ``None`` (stored as ``None``), or any other object, which is stored
        unchanged after logging its type.
    :param target_seqs: iterable of FASTA file paths; each is opened with
        ``pyfaidx.Fasta`` and appended to ``self.target_seqs``.
    :raises ValueError: if the query path or any target path does not exist.
    :return: None
    """
    if isinstance(query_seqs, str):
        # Explicit check rather than ``assert``: assertions are stripped
        # under ``python -O``. Uses the same error as the target check below.
        if not os.path.exists(query_seqs):
            raise ValueError("{} not found!".format(query_seqs))
        self.query_seqs = pyfaidx.Fasta(query_seqs)
    elif query_seqs is None:
        self.query_seqs = None
    else:
        # ``Logger.warn`` is a deprecated alias of ``Logger.warning``.
        self.logger.warning("Query type: %s", type(query_seqs))
        self.query_seqs = query_seqs

    self.target_seqs = []
    for target in target_seqs:
        if not os.path.exists(target):
            raise ValueError("{} not found!".format(target))
        self.target_seqs.append(pyfaidx.Fasta(target))
    return
def test_write(self):
    """
    Test the writing method of the fragment simulator.

    Writes the map, fragment and chromosome files to temporary paths,
    checks the number of sequences in both FASTA files, and reads the map
    back with ``Map.read``.
    """
    # mkstemp() returns an open OS-level descriptor together with the path.
    # Only the path is used, so the descriptor is closed straight away
    # instead of being leaked.
    temp_paths = []
    for _ in range(3):
        descriptor, path = tempfile.mkstemp()
        os.close(descriptor)
        temp_paths.append(path)
    self.__fragments, self.__chromosomes, self.__map = temp_paths

    try:
        self.__simulator.write(self.__map, self.__fragments,
                               self.__chromosomes)

        # check if the correct number of fragment and chromosome
        # sequences was written
        fragment_fasta = pyfaidx.Fasta(self.__fragments)
        self.assertEqual(len(fragment_fasta.keys()),
                         self.__fragment_number + self.__unplaced_number)
        chromosome_fasta = pyfaidx.Fasta(self.__chromosomes)
        self.assertEqual(len(chromosome_fasta.keys()),
                         self.__chromosome_number)

        # check if a correct fragment map was written
        test_map = Map()
        test_map.read(self.__map)
    finally:
        # Clean up even when an assertion above fails; the .fai indexes only
        # exist once pyfaidx has opened the corresponding FASTA file.
        for leftover in (self.__fragments,
                         self.__fragments + '.fai',
                         self.__chromosomes,
                         self.__chromosomes + '.fai',
                         self.__map):
            if os.path.exists(leftover):
                os.unlink(leftover)
def load_fasta(names, *filepaths):
    """
    Gather FASTA records by name from one or several files.

    Files are opened through ``pyfaidx.Fasta`` with ``as_raw=True``. With a
    single file the records are looked up on the ``Fasta`` object itself;
    with several files the ``records`` mappings are merged, so a name that
    occurs in more than one file resolves to the last file given.

    Parameters
    ----------
    names : sequence of str
        Names of the sequence records to collect, in the desired order.
    filepaths : str
        Paths to one or more FASTA files.

    Returns
    -------
    OrderedDict of sequence name -> sequence record

    Raises
    ------
    ValueError
        If no file path is given.
    """
    import pyfaidx

    if not filepaths:
        raise ValueError("Need at least one file")

    if len(filepaths) > 1:
        lookup = {}
        for path in filepaths:
            lookup.update(pyfaidx.Fasta(path, as_raw=True).records)
    else:
        (only_path,) = filepaths
        lookup = pyfaidx.Fasta(only_path, as_raw=True)

    selected = OrderedDict()
    for chrom in names:
        selected[chrom] = lookup[chrom]
    return selected
def SSSimulate(cores, haplotype, chromosome, start, end, error, coverage, length, indels, probability, insertsize, standarddev, output):
    """Simulate paired reads for one region and align them back to the haplotype.

    Steps, all writing into the ``output`` directory:

    1. extract ``chromosome:start-end`` with ``samtools faidx``;
    2. simulate read pairs with ``wgsim``;
    3. align with ``bwa mem`` and convert/sort/index with ``samtools``,
       leaving ``region.tmp.srt.bam`` and its index.

    :param cores: thread count for ``bwa mem``; ``cores - 1`` goes to ``samtools sort -@``
    :param haplotype: path to the FASTA file to simulate from and align to
    :param chromosome: sequence name inside ``haplotype``
    :param start: region start, used in the ``samtools faidx`` region string
    :param end: region end, used in the ``samtools faidx`` region string
    :param error: value for ``wgsim -e``
    :param coverage: target coverage used to derive the number of pairs
    :param length: read length (``wgsim -1`` and ``-2``)
    :param indels: value for ``wgsim -R``
    :param probability: value for ``wgsim -X``
    :param insertsize: accepted but not used by this function
    :param standarddev: accepted but not used by this function
    :param output: directory receiving the intermediate and final files
    :return: ``'Abort'`` when ``chromosome`` is absent from ``haplotype``,
        otherwise ``None``
    """
    def out_path(name):
        # Absolute path of a file inside the output directory.
        return os.path.abspath(output + '/' + name)

    #prepare region
    fa = pyfaidx.Fasta(os.path.abspath(haplotype))
    if chromosome not in fa.keys():
        message = 'Abort'
        return message

    # The record length is all that is needed here; avoid reading the whole
    # chromosome into memory just to measure it.
    chrom_len = len(fa[chromosome])

    region = chromosome + ':' + str(start) + '-' + str(end)
    region_fa = out_path('region.tmp.fa')
    read1 = out_path('region.1.fq')
    read2 = out_path('region.2.fq')
    sam_path = out_path('region.tmp.sam')
    bam_path = out_path('region.tmp.bam')
    sorted_bam = out_path('region.tmp.srt.bam')

    # One shared null sink for every child process, closed on exit, instead
    # of a fresh (and never closed) handle per call.
    with open(os.devnull, 'wb') as devnull:
        with open(region_fa, 'w') as regionout:
            subprocess.call(['samtools', 'faidx', haplotype, region], stdout=regionout, stderr=devnull)

        regionfa = pyfaidx.Fasta(region_fa)
        chrf = regionfa[region]
        Ns = chrf[:len(chrf)].seq.count('N')
        # Release the handle before the file is deleted further down.
        regionfa.close()

        # Number of pairs = bases to cover / read length, halved because
        # reads come in pairs. N bases are not counted.
        # NOTE(review): the result is a float and is passed to wgsim via
        # str(); confirm wgsim accepts the "123.0" form.
        if chrom_len < end - start:
            logging.warning(str(chromosome) + ' in haplotype ' + os.path.abspath(haplotype) + ' is shorter than region to simulate.')
            numreads = round((coverage * (chrom_len - Ns)) / length) / 2
        else:
            numreads = round((coverage * (end - start - Ns)) / length) / 2

        #simulate reads
        subprocess.call(['wgsim', '-e', str(error), '-N', str(numreads), '-1', str(length), '-2', str(length), '-R', str(indels), '-X', str(probability), region_fa, read1, read2], stderr=devnull, stdout=devnull)

        os.remove(region_fa)
        os.remove(out_path('region.tmp.fa.fai'))

        #align to modified reference
        with open(sam_path, 'w') as samout:
            subprocess.call(['bwa', 'mem', '-t', str(cores), haplotype, read1, read2], stdout=samout, stderr=devnull)

        with open(bam_path, 'w') as bamout:
            subprocess.call(['samtools', 'view', '-b', sam_path], stdout=bamout, stderr=devnull)

        os.remove(sam_path)

        with open(sorted_bam, 'w') as srtbamout:
            subprocess.call(['samtools', 'sort', '-@', str(cores-1), bam_path], stdout=srtbamout, stderr=devnull)

        os.remove(bam_path)

        subprocess.call(['samtools', 'index', sorted_bam], stderr=devnull)
def ancestral_fasta(args):
    """subroutine for ancestor subcommand

    Copies ``args.reference`` to ``args.output`` and edits the copy in place
    through a mutable ``pyfaidx.Fasta``:

    * with ``args.bed`` set, every base outside the BED intervals is
      replaced by ``N`` (``'-'`` reads the BED from standard input);
    * variants from ``args.vcf`` that are not biallelic SNPs are replaced by
      ``N``;
    * biallelic SNPs are lifted over with ``args.chain`` to ``args.outgroup``.
      Sites with no unique lift-over become ``N``. Otherwise the outgroup
      base (reverse-complemented on the minus strand) is written when it
      matches ALT, the site is left alone when it matches REF, and it
      becomes ``N`` when it matches neither.

    :param args: parsed command-line namespace providing ``reference``,
        ``output``, ``outgroup``, ``chain``, ``vcf`` and ``bed``
    :raises ValueError: if a variant's REF allele disagrees with the
        reference sequence, or if the BED input contains no intervals
    """
    # single chromosome fasta file for reference genome
    ref = pyfaidx.Fasta(args.reference, read_ahead=10000)
    # make a copy to build our ancestor for this chromosome
    copyfile(args.reference, args.output)
    anc = pyfaidx.Fasta(args.output, read_ahead=10000, mutable=True)
    # reference genome for outgroup species (all chromosomes)
    out = pyfaidx.Fasta(args.outgroup, read_ahead=10000)
    # outgroup to reference alignment chain file
    lo = LiftOver(args.chain)
    # snps database for the same chromosome
    vcf = cyvcf2.VCF(args.vcf)

    # change regions outside of callability mask to all N bases
    if args.bed:
        reading_stdin = args.bed == '-'
        bed = sys.stdin if reading_stdin else open(args.bed, 'r')
        chrom = None
        last_end = 0
        try:
            for line in bed:
                chrom, start, end = line.rstrip().split('\t')[:3]
                start = int(start)
                anc[chrom][last_end:start] = 'N' * (start - last_end)
                last_end = int(end)
        finally:
            # Close the file we opened ourselves; stdin is left alone.
            if not reading_stdin:
                bed.close()
        if chrom is None:
            # Without any interval the chromosome name is unknown, so the
            # trailing mask below cannot be applied.
            raise ValueError('BED mask contains no intervals')
        # mask from the end of the last interval to the chromosome end
        anc[chrom][last_end:len(anc[chrom])] = 'N' * (len(anc[chrom]) - last_end)

    for variant in vcf:
        # change variants that are not biallelic SNPs to N bases
        if not (variant.is_snp and len(variant.ALT) == 1):
            anc[variant.CHROM][variant.start:variant.end] = 'N' * (
                variant.end - variant.start)
        else:
            out_coords = lo.convert_coordinate(variant.CHROM, variant.start)
            # change ambiguously aligning sites to N bases
            if out_coords is None or len(out_coords) != 1:
                anc[variant.CHROM][variant.start] = 'N'
            else:
                if variant.REF != ref[variant.CHROM][
                        variant.start].seq.upper():
                    raise ValueError(f'variant reference allele {variant.REF} '
                                     f'mismatches reference sequence '
                                     f'{ref[variant.CHROM][variant.start]}')
                out_chromosome, out_position, out_strand = out_coords[0][:3]
                out_allele = out[out_chromosome][out_position].seq
                # if negative strand, take reverse complement base
                if out_strand == '-':
                    out_allele = reverse_complement(out_allele)
                # and finally, polarize
                if out_allele.upper() == variant.ALT[0]:
                    anc[variant.CHROM][variant.start] = out_allele
                elif out_allele.upper() != variant.REF:
                    # triallelic
                    anc[variant.CHROM][variant.start] = 'N'
def __init__(self, input_path, in_memory=False):
    """
    Build a `Genome` from the FASTA file at `input_path`.

    When `in_memory` is exactly ``True`` every sequence is read, upper-cased
    and kept in a plain dict, and the FASTA handle is closed. Otherwise
    ``self.data`` is the open ``pyfaidx.Fasta`` object. In both cases
    ``self.chrom_len_dict`` maps each sequence name to its length.
    """
    super(Genome, self).__init__()
    self.in_memory = in_memory

    if in_memory is not True:
        self.data = pyfaidx.Fasta(input_path)
    else:
        handle = pyfaidx.Fasta(input_path)
        loaded = {}
        for name in handle.keys():
            loaded[name] = str(handle[name][:].seq).upper()
        self.data = loaded
        handle.close()

    lengths = {}
    for name in self.data.keys():
        lengths[name] = len(self.data[name])
    self.chrom_len_dict = lengths
def parse_fasta(fa_file):
    """Open ``fa_file`` with pyfaidx, transparently handling plain gzip.

    If pyfaidx reports an unsupported compression format, the file is
    decompressed with ``gunzip``, opened, and compressed again with ``gzip``.

    :param fa_file: path to the FASTA file
    :return: the ``pyfaidx.Fasta`` object
    """
    import subprocess

    _LOGGER.debug("Hashing {}".format(fa_file))
    try:
        fa_object = pyfaidx.Fasta(fa_file)
    except pyfaidx.UnsupportedCompressionFormat:
        # pyfaidx can handle bgzip but not gzip; so we just hack it here and
        # unzip the file for checksumming, then rezip it for the rest of the
        # asset build.
        # TODO: streamline this to avoid repeated compress/decompress
        #
        # The commands are run from an argument list rather than a shell
        # string, so paths with spaces or shell metacharacters are passed
        # through verbatim. As before, their exit status is not checked.
        subprocess.call(["gunzip", fa_file])
        # Strip only the trailing ".gz"; str.replace would also remove any
        # ".gz" occurring earlier in the path.
        if fa_file.endswith(".gz"):
            fa_file_unzipped = fa_file[:-len(".gz")]
        else:
            fa_file_unzipped = fa_file
        fa_object = pyfaidx.Fasta(fa_file_unzipped)
        subprocess.call(["gzip", fa_file_unzipped])
    return fa_object
def test_reorder(self):
    """
    Verify that `Reorder.write` emits the sequences in the requested order.

    The expected output order is the requested order restricted to names
    present in the input file. The same call with ``ignore_missing=False``
    is expected to raise `BioformatsError`.
    """
    reorderer = Reorder(self.__order)
    reorderer.write(self.__input, self.__output, ignore_missing=True)

    # requested names that actually occur in the input, in requested order
    source = pyfaidx.Fasta(self.__input)
    result = pyfaidx.Fasta(self.__output)
    expected = []
    for seq_name in reorderer.order:
        if seq_name in source.keys():
            expected.append(seq_name)
    self.assertEqual(expected, list(result.keys()))

    with self.assertRaises(BioformatsError):
        reorderer.write(self.__input, self.__output, ignore_missing=False)
def parse_fasta(fa_file):
    """Open ``fa_file`` with pyfaidx, transparently handling plain gzip.

    If pyfaidx reports an unsupported compression format, the file is
    decompressed with ``gunzip``, opened, and compressed again with ``gzip``.

    :param fa_file: path to the FASTA file
    :return: the ``pyfaidx.Fasta`` object
    """
    import subprocess

    try:
        fa_object = pyfaidx.Fasta(fa_file)
    except pyfaidx.UnsupportedCompressionFormat:
        # pyfaidx can handle bgzip but not gzip; so we just hack it here and
        # unzip the file for checksumming, then rezip it for the rest of the
        # asset build.
        # TODO: streamline this to avoid repeated compress/decompress
        # in refgenie we feed this function with uncompressed, newly built
        # FASTA file, so compression issues are not relevant
        #
        # The commands are run from an argument list rather than a shell
        # string, so paths with spaces or shell metacharacters are passed
        # through verbatim. As before, their exit status is not checked.
        subprocess.call(["gunzip", fa_file])
        # Strip only the trailing ".gz"; str.replace would also remove any
        # ".gz" occurring earlier in the path.
        if fa_file.endswith(".gz"):
            fa_file_unzipped = fa_file[:-len(".gz")]
        else:
            fa_file_unzipped = fa_file
        fa_object = pyfaidx.Fasta(fa_file_unzipped)
        subprocess.call(["gzip", fa_file_unzipped])
    return fa_object
def main():
    """Parse the command line, prepare the output folder and call ``storeIndels``.

    All five options are required. The output folder is created when it
    does not exist, and the genome reference is opened with ``pyfaidx.Fasta``
    before being handed to ``storeIndels``.
    """
    parser = argparse.ArgumentParser(
        description='Gather all the indels for the tag-targeted sites in a 40 bp window.')

    # (flag, help text) pairs, registered in this order
    required_options = (
        ('--bam_file', 'Sorted bam file with the mapped reads'),
        ('--primer_file',
         'Tab separated. A single line per target containing the closest primer to it.'),
        ('--basename', 'basename to be used'),
        ('--genome_reference', 'Indexed Genome Reference'),
        ('--output_folder', 'output folder'),
    )
    for flag, description in required_options:
        parser.add_argument(flag, help=description, required=True)

    args = parser.parse_args()

    out_dir = args.output_folder
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    genome = pyfaidx.Fasta(args.genome_reference)
    print('*** Running indelsGathering ***', file=sys.stderr)
    storeIndels(args.bam_file, args.primer_file, args.basename, genome,
                out_dir)
def __init__(
        self,
        submission_queue,
        logging_queue,
        fasta,
        identifier,
        fasta_out,
        gtf_out,
        tmpdir,
        lenient=False,
        # strand_specific=False,
        canonical_splices=(("GT", "AG"), ("GC", "AG"), ("AT", "AC")),
        log_level="WARNING"):
    """Set up a checker worker.

    :param submission_queue: stored as ``self.submission_queue``
    :param logging_queue: stored as ``self.logging_queue`` before
        ``create_queue_logger`` is called
    :param fasta: path of the genome FASTA, opened with ``pyfaidx.Fasta``
    :param identifier: used in the worker name and as the suffix of both
        output file names
    :param fasta_out: base name of the FASTA output inside ``tmpdir``
    :param gtf_out: base name of the GTF output inside ``tmpdir``
    :param tmpdir: directory in which both output paths are placed
    :param lenient: stored as ``self.lenient``
    :param canonical_splices: pairs of strings stored as ``self.canonical``
    :param log_level: stored as ``self.log_level``
    """

    super().__init__()
    self.__identifier = identifier
    # self.strand_specific = strand_specific
    self.canonical = canonical_splices
    self.log_level = log_level
    self.logger = None
    self.logging_queue = logging_queue
    # ``self.identifier`` is not assigned in this method; presumably a
    # property exposing ``self.__identifier`` - confirm on the class.
    self.name = "Checker-{0}".format(self.identifier)
    # Presumably replaces the ``None`` logger set above (``self.logger`` is
    # used at the end of this method) - verify in create_queue_logger.
    create_queue_logger(self)
    self.lenient = lenient
    self.__fasta = fasta
    self.submission_queue = submission_queue
    self.fasta = pyfaidx.Fasta(self.__fasta)
    # Output paths are "<tmpdir>/<base name>-<identifier>".
    self.fasta_out = os.path.join(
        tmpdir, "{0}-{1}".format(fasta_out, self.identifier))
    self.gtf_out = os.path.join(tmpdir,
                                "{0}-{1}".format(gtf_out, self.identifier))
    self.logger.debug(self.canonical)
def SPARKcreateBam(DataFrame, outbam):
    """Write the rows of ``DataFrame`` as alignments to the BAM file ``outbam``.

    The BAM header is built from the sequences in ``chr1.fa``. A row with
    ``flag == 4`` is written as unmapped; any other row is written as mapped
    and is marked reverse when ``flag == 16``. Base qualities are decoded
    from ``QUAL`` as ``ord(char) - 33``.

    :param DataFrame: object offering ``count()`` and ``take(n)``; rows must
        expose ``flag``, ``Rname``, ``seq``, ``QUAL`` and, for mapped rows,
        ``contig``, ``pos``, ``cigar``, ``mapq``, ``MDtag`` and ``cstag``
        (presumably a Spark DataFrame - confirm with the caller)
    :param outbam: path of the BAM file to write
    """
    fa = pyfaidx.Fasta('chr1.fa')
    dict_fa = {'HD': {'VN': 1.6, 'SO': 'coordinate'},
               'SQ': [{'SN': x, 'LN': len(fa[x])} for x in fa.keys()]}

    # Count once and iterate over what was fetched, rather than counting a
    # second time and indexing by that second result.
    rows = DataFrame.take(DataFrame.count())

    fh = pysam.AlignmentFile(outbam, mode="wb", header=dict_fa)
    try:
        for row in rows:
            s = pysam.AlignedSegment(fh.header)
            qualities = np.array([ord(x) - 33 for x in list(row.QUAL)])
            if row.flag == 4:
                s.is_unmapped = True
                s.query_name = row.Rname
                s.query_sequence = row.seq
                # qualities are assigned after the sequence, as before
                s.query_qualities = qualities
            else:
                s.is_unmapped = False
                s.reference_name = row.contig
                s.query_name = row.Rname
                s.query_sequence = row.seq
                s.reference_start = row.pos
                s.cigarstring = row.cigar
                s.is_reverse = bool(row.flag == 16)
                s.mapping_quality = row.mapq
                s.set_tags([("MD", row.MDtag, "Z"), ("cs", row.cstag, "Z")])
                s.query_qualities = qualities
            fh.write(s)
    finally:
        # Always release the BAM handle, even if a row is malformed.
        fh.close()

    # NOTE(review): sorting/indexing uses the fixed names "test.bam" and
    # "test.srt.bam" rather than ``outbam`` - confirm this is intended.
    pysam.sort("-o", "test.srt.bam", "test.bam")
    pysam.index("test.srt.bam")
def create_Bam(alignments, outbam):
    """Write ``alignments`` to the BAM file ``outbam``, ordered by contig and position.

    The BAM header is built from the sequences in ``chr1.fa``. A record with
    ``flag == 4`` is written as unmapped; any other record is written as
    mapped and is marked reverse when ``flag == 16``. Base qualities are
    decoded from ``basequal`` as ``ord(char) - 33``.

    :param alignments: iterable of records exposing ``contig``, ``pos``,
        ``flag``, ``Rname``, ``seq``, ``basequal`` and, for mapped records,
        ``cigar``, ``mapq``, ``MDtag`` and ``cstag``
    :param outbam: path of the BAM file to write
    """
    fa = pyfaidx.Fasta('chr1.fa')
    dict_fa = {'HD': {'VN': 1.6, 'SO': 'coordinate'},
               'SQ': [{'SN': x, 'LN': len(fa[x])} for x in fa.keys()]}

    alignmentsSorted = sorted(alignments, key=attrgetter('contig', 'pos'))

    fh = pysam.AlignmentFile(outbam, mode="wb", header=dict_fa)
    try:
        for subreads in alignmentsSorted:
            s = pysam.AlignedSegment(fh.header)
            qualities = np.array([ord(x) - 33 for x in list(subreads.basequal)])
            if subreads.flag == 4:
                s.is_unmapped = True
                s.query_name = subreads.Rname
                s.query_sequence = subreads.seq
                # qualities are assigned after the sequence, as before
                s.query_qualities = qualities
            else:
                s.is_unmapped = False
                s.reference_name = subreads.contig
                s.query_name = subreads.Rname
                s.query_sequence = subreads.seq
                s.reference_start = subreads.pos
                s.cigarstring = subreads.cigar
                s.is_reverse = bool(subreads.flag == 16)
                s.mapping_quality = subreads.mapq
                s.set_tags([("MD", subreads.MDtag, "Z"),
                            ("cs", subreads.cstag, "Z")])
                s.query_qualities = qualities
            fh.write(s)
    finally:
        # Always release the BAM handle, even if a record is malformed.
        fh.close()

    # NOTE(review): sorting/indexing uses the fixed names "test.bam" and
    # "test.srt.bam" rather than ``outbam`` - confirm this is intended.
    pysam.sort("-o", "test.srt.bam", "test.bam")
    pysam.index("test.srt.bam")
def __init__(self, input_path, features):
    """
    Build a `FastaFeatures` object from a directory of FASTA files.

    Every directory entry whose name contains ``.fa`` but not ``.fai`` is
    opened with ``pyfaidx.Fasta`` (``duplicate_action="first"``) and
    appended to ``self.data``. Lookup tables are built in both directions
    for the feature names and for the accepted file names; file indices
    follow the order returned by ``os.listdir``.
    """
    self.data = []
    kept_names = []
    for entry in os.listdir(input_path):
        # substring tests: keep names containing '.fa', drop index files
        if '.fa' not in entry or '.fai' in entry:
            continue
        fasta_path = os.path.join(input_path, entry)
        self.data.append(pyfaidx.Fasta(fasta_path, duplicate_action="first"))
        kept_names.append(entry)

    self.n_features = len(features)
    self.feature_index_dict = {
        feat: position for position, feat in enumerate(features)}
    self.index_feature_dict = {
        position: feat for position, feat in enumerate(features)}
    self.file_index_dict = {
        name: position for position, name in enumerate(kept_names)}
    self.index_file_dict = {
        position: name for position, name in enumerate(kept_names)}
    self._features = features
def __init__(self, input_path):
    """
    Build a `Proteome` from the FASTA file at `input_path`.

    Stores the open ``pyfaidx.Fasta`` object, the sorted list of its
    sequence names, and the result of ``self._get_len_prots()``.
    """
    fasta = pyfaidx.Fasta(input_path)
    self.proteome = fasta

    names = list(fasta.keys())
    names.sort()
    self.prots = names

    # called last: ``proteome`` and ``prots`` are already set at this point
    self.len_prots = self._get_len_prots()
def main():
    """Run the sub-command selected on the command line.

    The genome FASTA and the whole VCF are loaded first, then:

    * ``select``: writes the variants lying in annotated regions of type
      ``args.selectionAnnotationType`` to
      ``<output_prefix>_<selectionAnnotationType>.vcf``;
    * ``synonymous``: splits the variants lying in a CDS between
      ``<output_prefix>_synonymous.vcf`` and
      ``<output_prefix>_nonsynonymous.vcf``;
    * ``flank``: writes the flanking sequences of the variants to
      ``<output_prefix>_flanking.fasta``;
    * any other command prints a goodbye message and exits with status 0.

    Every file opened here is closed before the function returns, so the
    output VCFs are fully flushed.
    """
    args = parse_args()

    def build_annotation_db():
        # Recreate the gffutils database from scratch for this run.
        dbfnFile = 'currentgff.db'
        ## read GFF3 file
        if os.path.exists(dbfnFile):
            os.remove(dbfnFile)
        return gffutils.create_db(args.annotation, dbfn=dbfnFile)

    ## read FASTA genome file
    fasta = pyfaidx.Fasta(args.genome)

    ## read VCF file
    with open(args.vcf, 'r') as vcf_handle:
        vcf_reader = list(vcf.Reader(vcf_handle))

    ## commands
    #### SELECT COMMAND
    if args.command == 'select':
        print("read VCF and select variants located on annotated '"+args.selectionAnnotationType+"' genome regions...", end="")
        db = build_annotation_db()

        ## list of selected type annotated region
        annotationRegionList = select_annotation_type(db, fasta, args.selectionAnnotationType)

        ## write variant VCF into annotated region
        # The input VCF is reopened only to serve as header template.
        with open(args.vcf, 'r') as template_handle, \
                open(args.output_prefix+'_'+args.selectionAnnotationType+'.vcf', 'w') as annotated_handle:
            vcf_writer_annotated = vcf.Writer(annotated_handle, vcf.Reader(template_handle))
            for variant in vcf_reader:
                for region in annotationRegionList:
                    if variant_position_within(variant, region):
                        vcf_writer_annotated.write_record(variant)
                        # one record per variant: stop at the first match
                        break

    #### SYNONYMOUS COMMAND
    elif args.command == 'synonymous':
        print("read VCF and detect synonymous and non-synonymous coding variants...", end="")
        db = build_annotation_db()

        ## From the genome(GFF3, FASTA),
        ## extract a list of CDS (coding sequences) objects
        cdsSeqList = dbfasta2CdsSeq(db, fasta)

        ## check whether variant is within a CDS
        with open(args.vcf, 'r') as syn_template, \
                open(args.vcf, 'r') as nonsyn_template, \
                open(args.output_prefix+'_synonymous.vcf', 'w') as syn_handle, \
                open(args.output_prefix+'_nonsynonymous.vcf', 'w') as nonsyn_handle:
            vcf_writer_synonymous = vcf.Writer(syn_handle, vcf.Reader(syn_template))
            vcf_writer_non_synonymous = vcf.Writer(nonsyn_handle, vcf.Reader(nonsyn_template))
            for variant in vcf_reader:
                for cdsSeq in cdsSeqList:
                    if variant_position_within(variant, cdsSeq):
                        if is_synonymous(variant, cdsSeq):
                            vcf_writer_synonymous.write_record(variant)
                        else:
                            vcf_writer_non_synonymous.write_record(variant)
                        # only the first CDS containing the variant counts
                        break

    #### FLANK COMMAND
    elif args.command == 'flank':
        print("read VCF and extract flanking sequences of variants from the genome...", end= "")
        windowsSize = args.windowsSize
        sequences = vcf_flanking_sequences(vcf_reader, fasta, windowsSize)
        with open(args.output_prefix+"_flanking.fasta", "w") as output_handle:
            SeqIO.write(sequences, output_handle, "fasta")

    else:
        print("Au revoir !")
        sys.exit(0)

    print("done")
def worker(task_queue, output_queue, rest, fasta):
    """Consume sequence names from a queue and emit restriction-site positions.

    For every name taken from ``task_queue`` the full sequence is read and
    one tuple ``(name, '+', positions)`` is put on ``output_queue``, where
    ``positions`` is ``0``, each match start of the site plus the cutting
    index, and finally the sequence length. When the reverse complement of
    the site differs from the site, a second tuple ``(name, '-', positions)``
    follows, holding only the match starts of the reverse complement plus
    the cutting index. A ``None`` item ends the worker.

    :param task_queue: queue yielding sequence names, then ``None``
    :param output_queue: queue receiving the result tuples
    :param rest: restriction-site description, decoded by ``parse_rest``
    :param fasta: path to the FASTA file, opened with ``pyfaidx.Fasta``
    """
    cutting_idx, rest_seq = parse_rest(rest)
    rest_seq_rc = rc(rest_seq)
    faidx = pyfaidx.Fasta(fasta)
    needs_reverse_scan = rest_seq_rc != rest_seq

    def cut_positions(pattern, sequence):
        # case-insensitive match starts, shifted by the cutting index
        return [hit.start() + cutting_idx
                for hit in re.finditer(pattern, sequence, re.IGNORECASE)]

    while True:
        chr_ = task_queue.get()
        if chr_ is None:
            log.debug("Process-%d done" % mp.current_process().pid)
            return

        seq = faidx[chr_][:].seq  # read sequence

        forward = [0]
        forward.extend(cut_positions(rest_seq, seq))
        forward.append(len(seq))
        output_queue.put((chr_, '+', forward))

        if needs_reverse_scan:
            # find reverse complement restriction site
            output_queue.put((chr_, '-', cut_positions(rest_seq_rc, seq)))
def main():
    """Split the coding variants of a VCF into synonymous and non-synonymous files.

    Rebuilds the gffutils database ``currentgff.db`` from the annotation,
    extracts the CDS objects from the genome, then writes each variant
    lying in a CDS either to ``<outputPrefix>_synonymous.vcf`` or to
    ``<outputPrefix>_nonsynonymous.vcf``. Only the first CDS containing a
    variant is considered. Every file opened here is closed before the
    function returns, so the output VCFs are fully flushed.
    """
    fichiers = parse_args()
    dbfnFile = 'currentgff.db'

    ## read GFF3 file
    if os.path.exists(dbfnFile):
        os.remove(dbfnFile)
    db = gffutils.create_db(fichiers.genomeAnnotation, dbfn=dbfnFile)

    ## read FASTA genome file
    fasta = pyfaidx.Fasta(fichiers.genomeFasta)

    ## From the genome(GFF3, FASTA),
    ## extract a list of CDS (coding sequences) objects
    cdsSeqList = dbfasta2CdsSeq(db, fasta)

    ## read VCF file
    with open(fichiers.vcf, 'r') as vcf_handle:
        vcf_reader = list(vcf.Reader(vcf_handle))

    ## check whether variant is within a CDS
    # The input VCF is reopened once per writer to serve as header template.
    with open(fichiers.vcf, 'r') as syn_template, \
            open(fichiers.vcf, 'r') as nonsyn_template, \
            open(fichiers.outputPrefix + '_synonymous.vcf', 'w') as syn_handle, \
            open(fichiers.outputPrefix + '_nonsynonymous.vcf', 'w') as nonsyn_handle:
        vcf_writer_synonymous = vcf.Writer(syn_handle,
                                           vcf.Reader(syn_template))
        vcf_writer_non_synonymous = vcf.Writer(nonsyn_handle,
                                               vcf.Reader(nonsyn_template))
        for variant in vcf_reader:
            for cdsSeq in cdsSeqList:
                if variant_position_within(variant, cdsSeq):
                    if is_synonymous(variant, cdsSeq):
                        vcf_writer_synonymous.write_record(variant)
                    else:
                        vcf_writer_non_synonymous.write_record(variant)
                    break
def canonical_transcripts(db, fasta_filename):
    """Yield ``(transcript, sequence)`` for the canonical transcript of each gene.

    For every ``gene`` feature, each level-1 child is treated as a
    transcript, and the lengths of that transcript's own level-1 children
    are summed (once over all of them, once over those of type ``CDS``).
    The canonical transcript is the one with the greatest CDS length or,
    when no transcript of the gene has any CDS, the one with the greatest
    total length. Ties go to the transcript seen first. Genes without
    children are skipped.

    :param db: gffutils-style database offering ``features_of_type`` and
        ``children``
    :param fasta_filename: path to the genome FASTA file
    :return: generator of ``(transcript, str)`` pairs, the string being the
        concatenated sequences of the chosen transcript's children
    """
    import pyfaidx

    fasta = pyfaidx.Fasta(fasta_filename, as_raw=True)
    for gene in db.features_of_type('gene'):

        # candidates holds (CDS_length, total_length, transcript, [exons])
        # tuples, one per transcript of this gene.
        candidates = []
        for transcript in db.children(gene, level=1):
            exons = list(db.children(transcript, level=1))
            cds_len = 0
            total_len = 0
            for exon in exons:
                exon_length = len(exon)
                if exon.featuretype == 'CDS':
                    cds_len += exon_length
                total_len += exon_length
            candidates.append((cds_len, total_len, transcript, exons))

        if not candidates:
            # A gene with no transcripts has no canonical transcript;
            # max() below would fail on the empty list.
            continue

        # Select by an explicit key. Sorting the raw tuples and taking the
        # first element picked the *shortest* entry and could end up
        # comparing feature objects; passing the key function positionally
        # to sorted() is a TypeError.
        if max(candidate[0] for candidate in candidates) > 0:
            # If we have CDS, then use the longest coding transcript
            best = max(candidates, key=lambda candidate: candidate[0])
        else:
            # Otherwise, just choose the longest
            best = max(candidates, key=lambda candidate: candidate[1])

        # NOTE(review): diagnostic output kept from the original; consider
        # switching to logging if nothing reads it from stdout.
        print(best)

        canonical_exons = best[-1]
        transcript = best[-2]
        seqs = [i.sequence(fasta) for i in canonical_exons]
        yield transcript, ''.join(seqs)
def test_build(self, build_kws, mocker):
    """Build a reference from the example files and inspect the result.

    ``star.star_index`` is patched out; the test then checks the files
    produced by the indexer, the sequence names in the augmented FASTA and
    the arguments of the single call made to the patched function.
    """
    # Replace the STAR call so no external program runs.
    star_index_mock = mocker.patch.object(star, 'star_index')

    # Build reference.
    star.StarIndexer().build(**build_kws)

    output_dir = build_kws['output_dir']
    ref = star.StarReference(output_dir)

    # Check if reference files exist.
    for attribute in ('base_path', 'fasta_path', 'gtf_path',
                      'indexed_gtf_path'):
        assert getattr(ref, attribute).exists()
    # assert ref.index_path.exists()

    assert ref.transposon_name == 'T2onc'
    for attribute in ('transposon_path', 'features_path'):
        assert getattr(ref, attribute).exists()

    # Check presence of augmented reference sequences.
    refseq = pyfaidx.Fasta(str(ref.fasta_path))
    assert sorted(refseq.keys()) == ['1', '2', 'T2onc']

    # Check call to STAR for building the index.
    expected_call = dict(fasta_path=ref.fasta_path,
                         gtf_path=ref.gtf_path,
                         output_dir=ref.index_path,
                         log_path=output_dir / 'star.log',
                         overhang=100,
                         threads=1)
    star_index_mock.assert_called_once_with(**expected_call)
def create_vac(self, bam_filename: str, vcf_filename: str, out_vac_filename: str, ref_fasta_filename: str, skip_indels: bool):
    """
    BAM and VCF should use same reference genome.
    VCF must contain INFO column with sub-fields AC and AN.
    :param bam_filename: filename of the SAM/BAM file, from which the header is extracted
    :param vcf_filename: filename of the input VCF file
    :param out_vac_filename: filename of the output VAC file
    :param ref_fasta_filename: filename to reference FASTA file, or None to
        run without a reference
    :param skip_indels: whether to skip indels and keep only SNPs
    """
    # TODO use fasta index / vcf header instead of BAM header

    # load the reference FASTA
    ref_fasta = None
    if ref_fasta_filename is not None:
        if self._verbose:
            print('--- Loading Reference Fasta ---')
        ref_fasta = pyfaidx.Fasta(ref_fasta_filename)

    try:
        # open all files and create the VAC file
        if self._verbose:
            print('--- Processing VCF %s ---' % vcf_filename)

        with pysam.VariantFile(vcf_filename) as vcf_file, \
                open_bam(bam_filename, 'rb') as sam_file, \
                open(out_vac_filename, 'wb') as out_vac_file:
            vac = Vac(FastaIndex.from_bam(sam_file), self._verbose)
            vac.vcf2vac(vcf_file, out_vac_file, ref_fasta, skip_indels)
    finally:
        # The reference is used only during the conversion above; release
        # its file handle like the other inputs.
        if ref_fasta is not None:
            ref_fasta.close()
def main():
    """Write the sequence of every gene in a GTF file to a FASTA-like file.

    Command-line arguments: GTF file, genome FASTA file, output file. The
    gffutils database ``<gtf file>_gffutils.db`` is created only when it
    does not exist yet. Each record is a header line
    ``>id chrom start end`` followed by the gene sequence on the next line.
    """
    gtf_file = argv[1]
    fasta_file = argv[2]
    out_file = argv[3]

    gtf_db = gtf_file + "_gffutils.db"
    database_missing = not os.path.isfile(gtf_db)
    if database_missing:
        gffutils.create_db(gtf_file,
                           dbfn=gtf_db,
                           disable_infer_genes=True,
                           disable_infer_transcripts=True)
    db = gffutils.FeatureDB(gtf_db)
    fasta = pyfaidx.Fasta(fasta_file)

    # All records are built before the output file is opened.
    records = []
    for gene in db.features_of_type("gene"):
        header_fields = (">" + gene.id, gene.chrom, str(gene.start),
                         str(gene.end))
        # the header keeps its trailing space before the newline
        records.append(" ".join(header_fields) + " " + "\n" +
                       gene.sequence(fasta))

    with open(out_file, 'w') as out:
        out.writelines("%s\n" % record for record in records)
def test_lowercase_ref_splice_site(self):
    """Check the canonical splice count for a minus-strand model on the test genome.

    Builds a five-exon transcript description on ``NC_037283.1``, passes it
    with the matching genomic sequence to ``checking.create_transcript`` and
    expects a ``TranscriptChecker`` whose ``canonical_number`` is 4.
    """
    reference = pkg_resources.resource_filename("Mikado.tests", "NC_037283.1.fa.gz")
    fasta = pyfaidx.Fasta(reference)
    fai = pysam.FastaFile(reference)

    lines = {
        "chrom": "NC_037283.1",
        "strand": '-',
        "start": 200431,
        "end": 204262,
        "attributes": dict(),
        "tid": "STRG.4616.1",
        "parent": "STRG.4616",
        "features": {
            "exon": [(200431, 200919), (201096, 201282), (201446, 201512),
                     (201776, 203421), (203570, 204262)],
        },
    }

    first, last = lines["start"], lines["end"]
    # slice start is shifted by one relative to the model's start coordinate
    seq = str(fasta[lines["chrom"]][first - 1:last])

    logger, listener, logging_queue = self.create_logger("test_example_model")
    res = checking.create_transcript(lines, seq, first, last, logger=logger)
    listener.stop()

    self.assertIsInstance(res, transcripts.TranscriptChecker)
    self.assertEqual(res.attributes["canonical_number"], 4)
def __init__(self, input_path, blacklist_regions=None, bases_order=None):
    """
    Build a `Genome` from the FASTA file at `input_path`.

    `blacklist_regions` may be ``"hg19"`` or ``"hg38"`` (a file bundled in
    the ``selene_sdk`` package is opened with tabix), a path to a
    user-supplied tabix file, or ``None`` for no blacklist. When
    `bases_order` is given, the base lookup tables are rebuilt from its
    upper-cased entries (lower-case keys map to the same indices) and
    ``update_bases_order`` is called.
    """
    self.genome = pyfaidx.Fasta(input_path)
    self.chrs = sorted(self.genome.keys())
    self.len_chrs = self._get_len_chrs()

    self._blacklist_tabix = None
    bundled_resource = None
    if blacklist_regions == "hg19":
        bundled_resource = "sequences/data/hg19_blacklist_ENCFF001TDO.bed.gz"
    elif blacklist_regions == "hg38":
        bundled_resource = "sequences/data/hg38.blacklist.bed.gz"

    if bundled_resource is not None:
        self._blacklist_tabix = tabix.open(
            pkg_resources.resource_filename("selene_sdk", bundled_resource))
    elif blacklist_regions is not None:
        # user-specified file
        self._blacklist_tabix = tabix.open(blacklist_regions)

    if bases_order is None:
        return

    upper_bases = [str.upper(base) for base in bases_order]
    self.BASES_ARR = upper_bases

    # upper-case keys first, then the lower-case spellings
    base_to_index = {}
    for position, base in enumerate(upper_bases):
        base_to_index[base] = position
    for position, base in enumerate(upper_bases):
        base_to_index[str.lower(base)] = position
    self.BASE_TO_INDEX = base_to_index

    self.INDEX_TO_BASE = dict(enumerate(upper_bases))
    self.update_bases_order(upper_bases)
def get_fasta_regions(fastaname, threads):
    """Split a reference into roughly equal groups of regions, one per thread.

    The step length is the total reference length divided by ``threads``,
    rounded up. Sequences are visited in sorted name order, and sequences
    shorter than 250000 are skipped (they still count towards the total
    length). Each group is a list of ``(chrom, start, end)`` tuples covering
    about one step length. A trailing partial group is merged into the last
    one when ``threads`` groups already exist.

    :param fastaname: path to the FASTA file; sequence names are cut at the
        first whitespace
    :param threads: number of groups to aim for
    :return: list of groups, each a list of ``(chrom, start, end)`` tuples
    """
    # Use the function's own arguments; the previous code read the global
    # ``args.fasta`` / ``args.threads`` and ignored both parameters.
    fasta = pyfaidx.Fasta(fastaname, key_function=lambda key: key.split()[0])

    # Look every length up once, in the order the chromosomes are visited.
    chrom_lengths = [(chrom, len(fasta[chrom]))
                     for chrom in sorted(fasta.keys())]
    total_reference_length = 0
    for _, chrom_length in chrom_lengths:
        total_reference_length += chrom_length
    step_length = int(math.ceil(total_reference_length / threads))

    regions = []
    region = []
    region_so_far = 0   # bases already collected in the current group
    chrom_so_far = 0    # bases of the current chromosome already assigned
    for chrom, chrom_length in chrom_lengths:
        if chrom_length < 250000:
            continue
        while True:
            if region_so_far + (chrom_length - chrom_so_far) < step_length:
                # the rest of this chromosome fits into the current group
                region.append((chrom, chrom_so_far, chrom_length))
                region_so_far += chrom_length - chrom_so_far
                chrom_so_far = 0
                break
            else:
                # fill the current group up to one step and start a new one
                region.append((chrom, chrom_so_far,
                               chrom_so_far + step_length - region_so_far))
                regions.append(region)
                region = []
                chrom_so_far += step_length - region_so_far
                region_so_far = 0

    if len(region) > 0:
        if len(regions) == threads:
            regions[-1] = regions[-1] + region
        else:
            regions.append(region)

    return regions
def calc_all(all_genes, bases_to_exclude, rscu_fh, gerp_fp, genome_fa, syn_gerp_out, bed_out):
    """
    Run ``calc_syn_gerp`` on every Gene object and write the results.

    A header line is written to both outputs first. Then, for each gene,
    one ``gene<TAB>syn_gerp`` line goes to ``syn_gerp_out`` and the gene's
    ``bed`` lines go to ``bed_out``.

    :param all_genes: dict whose values are Gene objects
    :param bases_to_exclude: passed through to ``Gene.calc_syn_gerp``
    :param rscu_fh: passed to ``read_rscu_f`` to load the RSCU data
    :param gerp_fp: path to the gzip-compressed, tabix-indexed gerp file; its
        first line is split on tabs and used as header
    :param genome_fa: path to reference genome fasta that has been indexed via samtools faidx
    :param syn_gerp_out: writable handle for the per-gene summary
    :param bed_out: writable handle for the per-site lines
    """
    with gzip.open(gerp_fp, 'rt') as gerp_f:
        gerp_header = gerp_f.readline().strip().split("\t")

    gerp_tb = tabix.open(gerp_fp)
    genome = pyfaidx.Fasta(genome_fa)
    rscu = read_rscu_f(rscu_fh)

    syn_gerp_out.write("#GENE\tSYN_GERP\n")
    bed_out.write("#CHROM\tPOS\tSTRAND\tGENE\tCDS_POS\tCODON\tRSCU\tGERP\n")

    for gene_obj in all_genes.values():
        gene_obj.calc_syn_gerp(genome, gerp_header, gerp_tb, rscu,
                               bases_to_exclude)
        summary_line = "{}\t{}\n".format(gene_obj.gene, gene_obj.syn_gerp)
        syn_gerp_out.write(summary_line)
        for bed_line in gene_obj.bed:
            bed_out.write(bed_line)
def calculate_refpos():
    """Detect SNPs whose ancestral allele differs from the reference genome.

    Reads the SNP definitions from the cached CSV named by
    ``config['b38_snp_file']`` and compares each ancestral allele with the
    base at the SNP's start position in the reference FASTA. SNPs that are
    not ``ins``/``del``, disagree with the reference and start beyond
    position 1 are written, with their definition, to
    ``cache/refpos-detect.out``.
    """
    trace(1, 'calculating refpos')
    fname = get_FASTA_fromweb(config['ref_fasta_hg38'])

    # assumes format for first line is >seqname bla bla bla
    # (read through a context manager so the handle is closed again)
    with open(fname) as fasta_head:
        seqname = fasta_head.read(100).split()[0][1:]
    ref = pyfaidx.Fasta(fname)

    # helper: return False if SNP definition's anc != reference genome's value
    def check(snpname):
        position = int(snpdict[snpname][0])
        a1 = snpdict[snpname][1]
        a2 = str(ref.faidx.fetch(seqname, position, position)).upper()
        return a1 == a2

    snpdict = {}
    snpdef = data_path(os.path.join('cache', config['b38_snp_file']))
    with open(snpdef) as snpfile:
        c = csv.DictReader(snpfile)
        for line in c:
            snpdict[line['Name']] = (line['start'], line['allele_anc'],
                                     line['allele_der'])

    # write out the detected refpos along with its definition
    refpos = data_path(os.path.join('cache', 'refpos-detect.out'))
    with open(refpos, 'w') as fn:
        for snp in snpdict:
            # the tests run in the original order: type, mismatch, position
            if (snpdict[snp][1] not in ('ins','del')) and \
               (not check(str(snp))) and (int(snpdict[snp][0]) > 1):
                fn.write('{} {}\n'.format(snp, snpdict[snp]))
    return
def calc_proteins(sourcebase, reffile, vcffile):
    """Translate and write the exon and gene sequences of every GFF file.

    For each file returned by ``get_gff_files(sourcebase)`` an in-memory
    gffutils database is built. For every gene, ``get_gene_sequences``
    supplies the exon sequences, the gene sequence and a modification
    count, and the translated records are written with ``write_records``.
    A summary line is printed per input file.

    :param sourcebase: passed to ``get_gff_files`` to locate the GFF files
    :param reffile: path to the reference FASTA, opened with pyfaidx
    :param vcffile: path to a VCF file, or None to run without variants (an
        empty string is then passed on in place of the variant file)
    """
    gfffiles = get_gff_files(sourcebase)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    if vcffile is not None:
        print("VCFFile Provided")
        vcfrecords = VariantFile(vcffile)
    else:
        vcfrecords = ''

    print("DEBUG: reffile for pyfaidx is: %s" % reffile)
    fasta = pyfaidx.Fasta(reffile)

    for infile in gfffiles:
        total_genes = 0
        total_exons = 0
        total_genes_wrote = 0
        total_exons_wrote = 0
        total_mod = 0

        db = gffutils.create_db(infile, ':memory:')
        for gene in db.features_of_type('gene'):
            total_genes += 1
            seqs, geneseq, num_mod = get_gene_sequences(
                gene, db, vcfrecords, fasta)
            total_exons += len(seqs)
            total_mod += num_mod
            total_exons_wrote += write_records(translate_records(seqs),
                                               'exons')
            total_genes_wrote += write_records(translate_records(geneseq),
                                               'genes')

        print("processed (and wrote): %s with %i(%i) genes and %i(%i) exons (%i modified))" % (
            infile, total_genes, total_genes_wrote, total_exons,
            total_exons_wrote, total_mod))
def main():
    """Retrieve FASTA records by identifier and print them wrapped at 60 columns.

    The identifiers come either from a file (one per line, blank lines
    ignored) or from a comma-separated command-line value. With
    ``-v/--reverse`` the records *not* named in the list are printed
    instead, in the order they appear in the FASTA file.

    :raises KeyError: in normal mode, when a requested identifier is not in
        the FASTA file
    """
    def check_type(string):
        # An existing non-directory path is opened as the id file; anything
        # else is treated as a comma-separated list of names.
        if os.path.exists(string) and not os.path.isdir(string):
            return open(string)
        else:
            return set(string.split(','))

    parser = argparse.ArgumentParser(description='A simple script that retrieves the FASTA sequences from a file given a list of ids.')
    parser.add_argument("-v", "--reverse", action="store_true", default=False, help="Retrieve entries which are not in the list, as in grep -v (a homage).")
    parser.add_argument('list', type=check_type, help='File with the list of the ids to recover, one by line. Alternatively, names separated by commas.')
    parser.add_argument('fasta', type=argparse.FileType('r'), help='FASTA file.')
    parser.add_argument('out', type=argparse.FileType('w'), help='Optional output file.', nargs='?', default=sys.stdout)
    args = parser.parse_args()

    if isinstance(args.list, IOBase):
        # Close the id file once it is read, and ignore blank lines.
        with args.list as id_file:
            ids = set(line.rstrip() for line in id_file)
        ids.discard("")
    else:
        ids = args.list

    # argparse opened the FASTA only to validate the path; pyfaidx reopens it.
    args.fasta.close()
    fasta = pyfaidx.Fasta(args.fasta.name)

    if args.reverse:
        # Honour -v: everything that is NOT in the list.
        selected = [name for name in fasta.keys() if name not in ids]
    else:
        selected = ids

    for name in selected:
        # Explicit check rather than ``assert``, which ``python -O`` strips.
        if name not in fasta:
            raise KeyError("{0} not found in {1}".format(name, args.fasta.name))
        print(">{0}".format(name), file=args.out)
        print(*textwrap.wrap(str(fasta[name]), width=60), sep="\n", file=args.out)
def __load_configuration(self):
    """Private method to load the configuration

    Accepts ``self.configuration`` either as a path (loaded and validated
    with ``load_and_validate_config``) or as an already-built
    ``MikadoConfiguration`` / ``DaijinConfiguration``. It then sets the
    multiprocessing start method, the input file, the logger and the
    multiprocessing context. When transcript padding is enabled, the
    reference genome must be readable by pyfaidx; otherwise an error is
    logged and the process exits with status 1.

    :raises TypeError: if ``self.configuration`` is neither a string nor one
        of the two configuration classes
    """
    if isinstance(self.configuration, str):
        assert os.path.exists(self.configuration)
        self.configuration = load_and_validate_config(self.configuration, logger=self.logger)
        assert isinstance(self.configuration, (MikadoConfiguration, DaijinConfiguration))
        # pylint: disable=no-member
    elif not isinstance(self.configuration, (MikadoConfiguration, DaijinConfiguration)):
        raise TypeError(type(self.configuration))

    multiprocessing.set_start_method(self.configuration.multiprocessing_method, force=True)
    self.input_file = self.configuration.pick.files.input
    self.setup_logger()

    if self.configuration.pick.alternative_splicing.pad is True:
        # Check that, when asks for padding, the reference genome is present
        self.logger.debug("Checking for the presence of the reference genome")
        try:
            genome_probe = pyfaidx.Fasta(self.configuration.reference.genome)
        except (pyfaidx.FastaIndexingError, FileNotFoundError, pyfaidx.FastaNotFoundError):
            # Adjacent literals keep the message on one clean line; the old
            # backslash continuation leaked a backslash and indentation
            # whitespace into the logged text.
            self.logger.error(
                "Transcript padding cannot be executed without a valid genome file. "
                "Please, either disable the padding or provide a valid genome sequence.")
            sys.exit(1)
        # The handle was opened only to validate the file.
        genome_probe.close()
        self.logger.debug("Valid reference genome found")

    self.context = multiprocessing.get_context()
    self.logger.debug("Configuration loaded successfully")