def test_exception(self):
    with self.assertRaises(TypeError):
        pyfastx.Fasta(flat_fasta, key_func=1)
    with self.assertRaises(FileExistsError):
        pyfastx.Fasta('a_file_not_exists')
    with self.assertRaises(ValueError):
        self.fastx.fetch('seq1', {'a': 1})
    with self.assertRaises(NameError):
        self.fastx.fetch('seq1', (1, 10))
    with self.assertRaises(ValueError):
        self.fastx.fetch(self.fastx[0].name, (1, 10, 20))
    with self.assertRaises(ValueError):
        self.fastx.fetch(self.fastx[0].name, (20, 10))
    with self.assertRaises(ValueError):
        self.fastx.fetch(self.fastx[0].name, [20, 10])
    with self.assertRaises(IndexError):
        _ = self.fastx[self.count]
    with self.assertRaises(KeyError):
        _ = self.fastx[list()]
    with self.assertRaises(ValueError):
        self.fastx.nl(101)
def setUp(self):
    self.fastx = pyfastx.Fasta(gzip_fasta)
    self.fasta = pyfastx.Fasta(flat_fasta)
    self.faidx = pyfaidx.Fasta(flat_fasta, sequence_always_upper=True)
    self.count = len(self.fastx)
def test_seq_type(self):
    # test DNA format
    self.assertEqual(self.fastx.type, 'DNA')

    # test RNA format
    rna = pyfastx.Fasta(rna_fasta)
    self.assertEqual(rna.type, "RNA")

    # test protein format
    prot = pyfastx.Fasta(protein_fasta)
    self.assertEqual(prot.type, "protein")
def load_seqfile(infile):
    fxifile = infile + ".fxi"

    if infile.endswith(FASTA_SUFFIX):
        # reuse an existing .fxi index, otherwise build one
        seqfile = pyfastx.Fasta(infile, build_index=not os.path.exists(fxifile))
    elif infile.endswith(FASTQ_SUFFIX):
        seqfile = pyfastx.Fastq(infile, build_index=not os.path.exists(fxifile))
    else:
        # the original fell through with seqfile unbound here
        raise ValueError("unrecognized sequence file: {}".format(infile))

    return seqfile
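# A minimal usage sketch for load_seqfile. FASTA_SUFFIX and FASTQ_SUFFIX are
# assumed to be extension tuples defined elsewhere in the module; the values
# below and the input file name are hypothetical.
FASTA_SUFFIX = (".fa", ".fasta", ".fa.gz", ".fasta.gz")
FASTQ_SUFFIX = (".fq", ".fastq", ".fq.gz", ".fastq.gz")

fq = load_seqfile("reads.fastq.gz")  # builds reads.fastq.gz.fxi on first use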
def setUp(self):
    self.fastx = pyfastx.Fasta(gzip_fasta, build_index=False)
    self.fastx.build_index()
    self.fastx.rebuild_index()

    # reload index
    self.fastx = pyfastx.Fasta(gzip_fasta)
    self.fasta = pyfastx.Fasta(flat_fasta)
    self.faidx = pyfaidx.Fasta(flat_fasta, sequence_always_upper=True)
    self.count = len(self.fastx)
def fasta_sample(args):
    fa = pyfastx.Fasta(args.fastx)

    if args.num is not None and args.num > 0:
        seq_num = min(args.num, len(fa))
    elif args.prop is not None and 0 < args.prop <= 1:
        seq_num = round(len(fa) * args.prop)
        if seq_num == 0:
            raise RuntimeError("the proportion is too small")
    else:
        raise RuntimeError("specify a valid sequence number or proportion")

    selected = random.sample(range(len(fa)), k=seq_num)

    fw = sys.stdout if args.outfile is None else open(args.outfile, 'w')

    for idx in selected:
        s = fa[idx]
        fw.write(">{}\n{}\n".format(s.name, s.seq))

    if args.outfile is None:
        fw.flush()
    else:
        fw.close()
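# Usage sketch: fasta_sample expects argparse-style args with fastx, num,
# prop and outfile attributes (the values here are hypothetical); exactly
# one of num/prop should be set.
from argparse import Namespace

fasta_sample(Namespace(fastx="genome.fa", num=100, prop=None, outfile="sample.fa"))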
def fastx_subseq(args):
    fa = pyfastx.Fasta(args.fastx)

    if args.chr is not None:
        if args.chr not in fa:
            raise RuntimeError("no sequence named {} in fasta file".format(args.chr))
        subseq = fa[args.chr]
    else:
        if args.id <= 0:
            raise RuntimeError("sequence id must be an integer between 1 and {}".format(len(fa)))
        subseq = fa[args.id - 1]  # args.id is 1-based, pyfastx indexing is 0-based

    if args.region:
        start, end = args.region.split(':')
        start = int(start) - 1 if start else 0
        end = int(end) if end else len(subseq)  # was len(s): an undefined name
        sys.stdout.write("{}\n".format(subseq[start:end].seq))
    else:
        sys.stdout.write("{}\n".format(subseq.seq))

    sys.stdout.flush()
def get_output_handle(fpath: str, fastx: bool = False, out: bool = True):
    if fpath == "-":
        handle = sys.stdout if out else sys.stdin
    else:
        p = Path(fpath)

        if not p.parent.is_dir():
            raise NotADirectoryError(
                "Directory specified for output file does not exist: {}".format(
                    p.parent
                )
            )

        if fastx:
            # crude suffix check: fasta paths (".fa", ".fasta") end in "a"
            if fpath.endswith("a"):
                handle = pyfastx.Fasta(str(p))  # pyfastx expects a str path
            else:
                handle = pyfastx.Fastq(str(p))
        else:
            handle = p.open("w")

    return handle
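# Usage sketch (hypothetical, existing files): "-" selects stdout/stdin, a
# fastx=True path opens a pyfastx object, anything else opens a text handle.
records = get_output_handle("sample.fa", fastx=True)  # pyfastx.Fasta
table = get_output_handle("results.tsv")              # text handle for writing
stream = get_output_handle("-")                       # sys.stdout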
def main():
    # Configure argparser
    argparser = get_argparser()

    # Parse the arguments
    args = argparser.parse_args()

    # Input and output fasta files
    in_fasta = args.in_fasta
    out_fasta = args.out_fasta

    # Configure logging appropriate for verbosity
    configure_logging(args.verbosity_level)

    # Count total length of bases in fasta
    logging.info("Reading input fasta...")
    fa = pyfastx.Fasta(in_fasta)
    total_contigs = len(fa)
    total_bases = fa.size
    logging.info("Total input contigs: {0}".format(total_contigs))
    logging.info("Total input bases: {0}".format(total_bases))

    # Filter contigs
    logging.info("Cleaning fasta records")
    filter_fasta(fa, out_fasta)
def test_build(self):
    self.fastx = pyfastx.Fasta(gzip_fasta, build_index=False)

    if os.path.exists('{}.fxi'.format(gzip_fasta)):
        os.remove('{}.fxi'.format(gzip_fasta))

    self.fastx.build_index()
def generateHtml(self): sql = "SELECT * FROM primer,primer_meta WHERE id=pid AND id=%s" % self.id primer = self.db.get_row(sql) table = primer.category tid = primer.target #table, tid = primer.target.split('-') sql = "SELECT path FROM fasta LIMIT 1" fasta_file = self.db.get_one(sql.format(table, tid)) self.fasta = pyfastx.Fasta(fasta_file) sql = "SELECT * FROM %s WHERE id=%s" % (table, tid) ssr = self.db.get_row(sql) seq, left, right = self.getSequence(ssr.sequence, ssr.start, ssr.end) tandem = "%s%s%s" % ( self.formatPrimer(left, primer.start1, primer.length1), self.formatTarget(seq), self.formatPrimer(right, primer.start2-primer.length2-len(seq)-len(left)+1, primer.length2) ) return template_render("sequence.html", tandem=tandem, ssr=ssr, table=self.table)
def parse_fasta(fasta_file):
    busco_seqs = pyfastx.Fasta(fasta_file)
    ids = busco_seqs.keys()

    busco_names = pd.DataFrame(data=list(ids), columns=['seqNames'])
    # split "buscoId_sampleId" names on the first underscore only
    busco_names = pd.DataFrame(
        busco_names.seqNames.str.split("_", n=1).tolist(),
        columns=['buscoId', 'sampleId'])

    return busco_names, busco_seqs
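# Usage sketch (hypothetical file): for headers of the form
# ">buscoId_sampleId", parse_fasta returns a two-column DataFrame alongside
# the indexed pyfastx.Fasta object.
names_df, seqs = parse_fasta("busco_sequences.fasta")
print(names_df.head())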
def fastx_info(args):
    fastx_type = fastx_format_check(args.fastx)

    if fastx_type == 'fasta':
        fa = pyfastx.Fasta(args.fastx)
        comp = fa.composition
        print("Sequence counts: {}".format(len(fa)))
        print("Total bases: {}".format(fa.size))
        print("GC content: {:.2f}%".format(fa.gc_content))

        for b in comp:
            print("{} counts: {}".format(b, comp[b]))

        print("Mean length: {:.2f}".format(fa.mean))
        print("Median length: {:.2f}".format(fa.median))
        print("Max length: {}".format(len(fa.longest)))
        print("Min length: {}".format(len(fa.shortest)))
        print("N50, L50: {}, {}".format(*fa.nl()))
        print("length >= 1000: {}".format(fa.count(1000)))

    elif fastx_type == 'fastq':
        fq = pyfastx.Fastq(args.fastx)
        comp = fq.composition
        print("Read counts: {}".format(len(fq)))
        print("Total bases: {}".format(fq.size))
        print("GC content: {:.2f}%".format(fq.gc_content))

        for b in comp:
            print("{} counts: {}".format(b, comp[b]))

        print("Quality encoding system may be: {}".format(", ".join(fq.encoding_type)))
def create_fastx_index(fastx):
    if is_fasta(fastx):
        return pyfastx.Fasta(str(fastx), build_index=True), build_read_fasta
    elif is_fastq(fastx):
        return pyfastx.Fastq(str(fastx), build_index=True), build_read_fastq
    else:
        raise ValueError(f'Could not determine input file format: {fastx}')
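# Usage sketch: is_fasta/is_fastq and the build_read_* callbacks are assumed
# to be defined elsewhere in this module; "reads.fq" and the callback
# signature are hypothetical.
fastx_index, build_read = create_fastx_index("reads.fq")
for read in fastx_index:
    record = build_read(read)  # hypothetical per-read conversion
    break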
def stat_query_mismatch(alnfile: str, reffile: str, sitesfile: str):
    """
    Count mismatches at the given sites.

    alnfile: a bam/cram file
    reffile: the matching reference genome
    sitesfile: sites to check, two columns, gzip- or bgzip-compressed;
        column 1 is the chromosome, column 2 the position (1-based).
        A vcf.gz file fits this format and can be used directly as input.
    """
    print(f'stat_query_mismatch: {alnfile}, {sitesfile}')

    # open reffile; always return uppercase sequence to simplify the comparison below
    reffa = pyfastx.Fasta(reffile, uppercase=True)

    # read the site list
    sites = []
    with gzip.open(sitesfile, 'rb') as f:
        for line in f:
            tline = line.decode().strip().split()
            if tline[0][0] != '#':
                # pysam expects 0-based coordinates, so convert here
                sites.append([tline[0], int(tline[1]) - 1])

    # open the alignment file
    if alnfile.endswith('bam'):
        alnfile = pysam.AlignmentFile(alnfile, 'rb', threads=10)
    elif alnfile.endswith('.cram'):
        alnfile = pysam.AlignmentFile(alnfile, 'rc', reference_filename=reffile, threads=10)

    # iterate over the sites:
    # reads2QMis counts, per read pair, the mismatches against the reference
    # at the query sites (filling missing values with 0 later is then
    # reasonable); reads2nQuery records how many query sites each read covers,
    # to distinguish "no mismatch" from "site not covered".
    reads2QMis = defaultdict(int)
    reads2nQuery = defaultdict(int)
    for chrom, pos in sites:
        refbase = reffa[chrom][pos:pos + 1].seq
        # stepper='all': skip reads in which any of the following flags are set:
        # BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
        for ncolumn, pileupcolumn in enumerate(
                alnfile.pileup(chrom, pos, pos + 1, truncate=True, stepper='all',
                               ignore_orphans=False, min_base_quality=0,
                               min_mapping_quality=0)):
            assert pileupcolumn.reference_pos == pos
            for pileupread in pileupcolumn.pileups:
                readname = pileupread.alignment.query_name
                reads2nQuery[readname] += 1
                if not pileupread.is_del:
                    # query_position is None if is_del or is_refskip is set;
                    # case was already normalized for the fasta above
                    if pileupread.alignment.query_sequence[pileupread.query_position].upper() != refbase:
                        reads2QMis[readname] += 1
                else:
                    reads2QMis[readname] += 1
    alnfile.close()

    maxnQuery = max(reads2nQuery.values())
    nQuery_dtype = select_min_dtype_uint(maxnQuery)
    reads2nQuery = pd.Series(reads2nQuery, dtype=nQuery_dtype)
    print(f'max(nQuery): {maxnQuery}, select dtype: {nQuery_dtype}')

    maxQMis = max(reads2QMis.values())
    QMis_dtype = select_min_dtype_uint(maxQMis)
    reads2QMis = pd.Series(reads2QMis, dtype=QMis_dtype)
    print(f'max(QMis): {maxQMis}, select dtype: {QMis_dtype}')

    return reads2nQuery, reads2QMis
def build_fasta_index(self, fasta_id, fasta_path):
    '''
    Build index for fasta file and write fasta sequence to database
    @param fasta_id int, the fasta file id in database
    @param fasta_path str, the file path of fasta
    @return Fasta object
    '''
    #seqs = fasta.GzipFasta(fasta_path)
    self.emit_message("Building fasta index for %s" % fasta_path)

    with multiprocessing.Pool() as pool:
        pool.apply_async(build_full_index, (fasta_path,))
        pool.close()
        pool.join()

    seqs = pyfastx.Fasta(fasta_path)

    #get sequence detail information
    #sql = "SELECT * FROM seq INNER JOIN fasta ON (seq.fid=fasta.id) WHERE fasta.path='{}' LIMIT 1".format(fasta_path)
    #if not self.db.get_one(sql):
    #    rows = []
    #    for seq in seqs:
    #        compos = seq.composition
    #        ns = sum(compos[b] for b in compos if b not in ['A', 'T', 'G', 'C'])
    #        row = (None, seq.name, fasta_id, len(seq), compos.get('G',0)+compos.get('C',0), ns)
    #        rows.append(row)
    #    self.db.insert("INSERT INTO seq VALUES (?,?,?,?,?,?)", rows)

    sql = "SELECT * FROM option WHERE name='gc_content'"
    if not self.db.get_one(sql):
        gc = seqs.gc_content
        compos = seqs.composition
        ns = sum(compos[b] for b in compos if b not in ['A', 'T', 'G', 'C'])
        self.db.insert("INSERT INTO option (name, value) VALUES (?,?)", [
            ('total_base', str(seqs.size)),
            ('total_seqs', str(len(seqs))),
            ('gc_content', str(gc)),
            ('unkown_base', str(ns))
        ])

    self.total_bases = seqs.size
    seqs = pyfastx.Fasta(fasta_path, build_index=False)

    return seqs
def randomizeMatchingPositions():
    ########################
    #command line arguments#
    ########################
    parser = argparse.ArgumentParser()

    #MANDATORY PARAMETERS
    parser.add_argument("outfile", help="Output fasta-file name.", type=str)

    #OPTIONAL PARAMETERS
    parser.add_argument("--wt", help="Full path to a fasta-file containing the wild type sequence.", type=str)
    parser.add_argument("--seqs", help="Full path to the fasta-file containing the sequences where we want to randomize the positions matching the wild type.", type=str)
    parser.add_argument("--N", help="Exact number of mismatches in seqs needed for inclusion in the output.", type=int, default=2)
    parser.add_argument("--addToReadName", help="String added to read names to distinguish them from input reads (default=:randomized).", type=str, default=":randomized")
    parser.add_argument("--alphabet", help="Alphabet used as a string containing each possible character (case sensitive, default=ACGT).", type=str, default='ACGT')

    args = parser.parse_args()

    #read in the wild type sequence
    #build_index=False makes iteration yield (name, seq) tuples
    for name, seq in pyfastx.Fasta(args.wt, build_index=False):
        wtseq = seq

    #read in the rest of the sequences; save to outfile those that have exactly
    #N mismatches to wtseq and randomize the other positions
    with open(args.outfile, 'wt') as outfile:
        w = csv.writer(outfile, delimiter='\t')
        for name, seq in pyfastx.Fasta(args.seqs, build_index=False):
            #draw the random sequence
            rands = np.random.randint(0, high=len(args.alphabet), size=len(seq))
            newseq = ""
            N_mismatch = 0  #mismatch counter
            for i in range(0, len(seq)):
                if seq[i] != wtseq[i]:
                    N_mismatch += 1
                    newseq += seq[i]
                else:
                    newseq += args.alphabet[rands[i]]
                if N_mismatch > args.N:
                    break
            if N_mismatch == args.N:
                #save the sequence
                w.writerow(['>' + name + args.addToReadName])
                w.writerow([newseq])
def get_total_seq_len(fasta_file):
    """Use pyfastx to quickly read a fasta file and return the sum of its
    sequence lengths.
    """
    try:
        x = [len(seq) for h, seq in pyfastx.Fasta(fasta_file, build_index=False)]
    except RuntimeError:
        x = [0]
    return sum(x)
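# Usage sketch (hypothetical path); an unreadable fasta falls back to 0.
total = get_total_seq_len("assembly.fasta")
print("total assembly length: {} bp".format(total))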
def test_key_func(self):
    del self.fastx

    # remove previously created index file
    if os.path.exists("{}.fxi".format(gzip_fasta)):
        os.remove("{}.fxi".format(gzip_fasta))

    self.fastx = pyfastx.Fasta(gzip_fasta, key_func=lambda x: x.split()[1])
    idx = self.get_random_index()
    self.assertEqual(self.fastx[idx].name, self.fastx[idx].description.split()[1])
def build_index(infile):
    fxifile = infile + ".fxi"

    if os.path.exists(fxifile):
        print("fxi index is present")
    else:
        print("building fxi index for {}".format(infile))
        if infile.endswith((".fa", ".fa.gz", ".fasta", ".fasta.gz")):
            pyfastx.Fasta(infile)
        else:
            pyfastx.Fastq(infile)
        print("fxi index has been created for {}".format(infile))
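# Usage sketch (hypothetical file): the first call builds genome.fa.fxi,
# a repeated call detects the index and skips the rebuild.
build_index("genome.fa")
build_index("genome.fa")  # prints "fxi index is present"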
def main(): if len(sys.argv) < 3: print(f"USAGE: {sys.argv[0]} <a.fa> <b.fa>") sys.exit(1) fp1 = sys.argv[1] fp2 = sys.argv[2] if CLEAN_IDX: for fp in (fp1, fp2): Path(fp + ".fxi").unlink(missing_ok=True) fa1 = pyfastx.Fasta(fp1) fa2 = pyfastx.Fasta(fp2) n_seqs = len(fa1) if n_seqs != len(fa2): raise ValueError("Different number of sequences in the two files") ids = sorted(fa1.keys()) all_seq_ids_same = all(x == y for x, y in zip(ids, sorted(fa2.keys()))) if not all_seq_ids_same: raise ValueError( "Sequence IDs are not identical between the two files") mtx = np.zeros(shape=(n_seqs, n_seqs), dtype=np.int) for i, j in tqdm(product(range(n_seqs), range(n_seqs)), total=n_seqs * n_seqs): dist = hamming(fa1[ids[i]].seq, fa2[ids[j]].seq) mtx[i][j] = dist print(DELIM.join(["sample", *ids])) for i, sample in enumerate(ids): row = DELIM.join(map(str, mtx[i])) print(f"{sample}{DELIM}{row}") if CLEAN_IDX: for fp in (fp1, fp2): Path(fp + ".fxi").unlink(missing_ok=True)
def generateHtml(self): sql = "SELECT path FROM fasta LIMIT 1" fasta_file = self.db.get_one(sql.format(self.table, self.id)) self.fasta = pyfastx.Fasta(fasta_file) sql = "SELECT * FROM %s WHERE id=%s" % (self.table, self.id) ssr = self.db.get_row(sql) seq, left, right = self.getSequence(ssr.sequence, ssr.start, ssr.end) tandem = "%s%s%s" % (self.formatFlank(left), self.formatTarget(seq), self.formatFlank(right)) return template_render("sequence.html", tandem=tandem, ssr=ssr, table=self.table)
def chunkPolishSeq(chunkBarcodeWithReadIter, nanoporeReadPath, tempDirPath,
                   finalDirPath, penaltyPath, i, minimapPath, poaPath, raconPath):
    nanoporeRead = pyfastx.Fasta(nanoporeReadPath)
    commandExecuted = list(
        map(polishSeq, chunkBarcodeWithReadIter, repeat(nanoporeRead),
            repeat(tempDirPath), repeat(finalDirPath), repeat(penaltyPath),
            repeat(minimapPath), repeat(poaPath), repeat(raconPath)))
    commandExecuted = ' ;\\\n'.join(commandExecuted)
    os.system(commandExecuted)
    if i % 100 == 0:
        logger.info(f'{i*100} reads processed')
def generate_coverage(read1, read2, mapping, ref, pwid=0.95, ncpu=1,
                      chunk_size=500000, quiet=False):
    if not quiet:
        print("Building index and data structures...")

    seq_cov = {}
    for name, seq in pyfastx.Fasta(ref, build_index=False):
        seq_cov[name] = np.zeros(len(seq), dtype=int)

    nreads = 0
    read_len = 0
    for r in mp.fastx_read(read1):
        nreads += 1
        read_len += len(r[1])
    read_len /= nreads

    min_chain_score = int(0.9 * read_len)
    min_mis_match = int(read_len - pwid * read_len)

    # load or build index
    a = mp.Aligner(ref, preset='sr', n_threads=ncpu, best_n=1000,
                   min_chain_score=min_chain_score)
    if not a:
        raise Exception("ERROR: failed to load/build index")

    def mpile(seqs):
        if seqs is None:
            return []
        thrbuf = mp.ThreadBuffer()
        hits = []
        chrom = None
        for hit in a.map(seqs[1], buf=thrbuf):
            if (hit.NM <= min_mis_match) and ('S' not in hit.cigar_str) and ('H' not in hit.cigar_str):
                if chrom is None:
                    chrom = mapping[hit.ctg]
                    hits.append((hit.ctg, hit.r_st - 1, hit.r_en))
                elif mapping[hit.ctg] == chrom:
                    hits.append((hit.ctg, hit.r_st - 1, hit.r_en))
                else:
                    break
        return hits

    if not quiet:
        print("Aligning reads...")

    pool = ThreadPool(ncpu)
    for reads in tqdm(grouper(chain(mp.fastx_read(read1), mp.fastx_read(read2)), chunk_size),
                      total=int(1 + 2 * nreads / chunk_size), disable=quiet):
        hits = pool.map(mpile, reads)
        for hit in chain.from_iterable(hits):
            if hit is None:
                continue
            seq_cov[hit[0]][hit[1]:hit[2]] += 1

    # close the pool and wait for the work to finish
    pool.close()
    pool.join()

    return seq_cov
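# generate_coverage relies on a grouper() helper that is not shown above; a
# minimal sketch following the classic itertools recipe (the None fill values
# it pads with are exactly what mpile() already guards against):
from itertools import zip_longest

def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks, padding the last with fillvalue."""
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)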
def fasta_split(args):
    fa = pyfastx.Fasta(args.fastx)

    if args.seq_count:
        parts_num = math.ceil(len(fa) / args.seq_count)
    else:
        parts_num = args.file_num

    name, suffix1 = os.path.splitext(os.path.basename(args.fastx))
    if fa.is_gzip:
        name, suffix2 = os.path.splitext(name)

    digit = len(str(parts_num))
    lens = [0] * parts_num

    if args.seq_count:
        seqs = [0] * parts_num

    fhs = []
    for i in range(1, parts_num + 1):
        if fa.is_gzip:
            subfile = "{}.{}{}{}".format(name, str(i).zfill(digit), suffix2, suffix1)
        else:
            subfile = "{}.{}{}".format(name, str(i).zfill(digit), suffix1)

        if args.outdir is not None:
            subfile = os.path.join(args.outdir, subfile)

        if fa.is_gzip:
            fh = gzip.open(subfile, 'wt')
        else:
            fh = open(subfile, 'w')

        fhs.append(fh)

    ids = fa.keys()
    for chrom in ids.sort('length', reverse=True):
        idx = min_index(lens)
        fhs[idx].write(">%s\n%s\n" % (chrom, fa[chrom].seq))
        lens[idx] += len(fa[chrom])

        if args.seq_count:
            seqs[idx] += 1
            if seqs[idx] == args.seq_count:
                lens[idx] = fa.size

    for fh in fhs:
        fh.close()
def main(args): with open(args.prefix + ".fasta", "w") as O: with open(args.prefix + ".meta.csv", "w") as M: writer = csv.DictWriter(M, fieldnames=[ "id", "iso_a3", "country", "continent", "date", "seqlen", "missing_fraction" ]) writer.writeheader() for entry in tqdm(pyfastx.Fasta(args.fasta, full_name=True)): seqname = entry.name # print(seqname) if check_for_disallowed_countries(seqname): continue meta = seqname.split("|") if len(meta) != 3: continue if len(meta[0].split("/")) == 1: continue country = meta[0].split("/")[1] country = country2country.get(country, country) iso_a3 = country2iso_a3[country2country.get(country, country)] if country == "": continue continent = country2continent[country] date = meta[2] if date_qc(date) == False: continue if entry.end < args.seqlen: continue missing_chars = sum([ n for d, n in entry.composition.items() if d.upper() not in acgt ]) if missing_chars / entry.end > args.missing: continue seqid = meta[1] writer.writerow({ "id": seqid, "country": country, "continent": continent, "iso_a3": iso_a3, "date": date, "seqlen": entry.end, "missing_fraction": missing_chars / entry.end, }) seq = list(entry.seq) for pos in [i for i, n in enumerate(seq) if n not in acgt]: seq[pos] = "N" seq = "".join(seq) O.write(">%s\n%s\n" % (seqid, seq))
def main():
    args = get_options()

    print("Calling genes from reads is still under active development and may change frequently!")

    # make sure trailing forward slash is present
    args.output_dir = os.path.join(args.output_dir, "")

    # create temporary directory
    temp_dir = os.path.join(tempfile.mkdtemp(dir=args.output_dir), "")

    # check files exist
    # clean database and remove sequences of 100bp or shorter
    temp_db = temp_dir + "temp_db.fasta"
    mapping = {}
    mapping_clust = {}
    with open(temp_db, 'w') as outfile:
        index = 0
        for name, seq in pyfastx.Fasta(args.db, build_index=False):
            if len(seq) <= 100:
                continue
            outfile.write('>' + str(index) + '\n' + seq + '\n')
            mapping[str(index)] = name
            mapping_clust[str(index)] = name.split("__")[0]
            index += 1

    # align reads
    coverage = generate_coverage(
        read1=args.r1,
        read2=args.r2,
        ref=temp_db,
        mapping=mapping_clust,
        pwid=args.pwid,
        ncpu=args.n_cpu,
        chunk_size=500000,
        quiet=args.quiet)

    # call genes and write output
    prefix = os.path.basename(args.r1).split('.')[0].strip('1').strip('_')
    prefix += '_' + os.path.basename(args.db).split('.')[0]
    find_genes(coverage, mapping,
               cov_threshold=args.min_cov,
               prefix=prefix,
               outdir=args.output_dir,
               fold_threshold=args.min_fold,
               quiet=args.quiet)

    # clean up
    shutil.rmtree(temp_dir)

    return
def main():
    # Parse arguments: requires the input FASTA path
    parser = argparse.ArgumentParser(
        description='Produce Byte Pair Encoder trained from .FASTA file.')
    parser.add_argument('-i', metavar='INFILE', type=str,
                        help='Path of FASTA file for training model.')
    args = parser.parse_args()
    assert args.i

    OUTFILE = "{}mer_compare_bpe_dna_wordsize_256.model"

    # Load all seqs into memory. This may be an issue depending on the machine.
    fa = pyfastx.Fasta(args.i)

    # Make everything upper case DNA
    #seqs = ['{}'.format(record.seq).upper().replace('U', 'T') for record in fa]
    seqs = ['{}'.format(record.seq).upper() for record in fa]

    k_compares = [6, 8]

    # Calculate target vocabulary sizes, in descending order
    k_compares = sorted(k_compares, reverse=True)
    vocab_sizes = [4**k for k in k_compares]

    # Calculation loop.
    # SentencePiece does not appear to have a natural way of continuing
    # training from a checkpoint, so the approach here is
    # train -> encode -> train etc...
    for i, vocab_size in enumerate(vocab_sizes):
        # Create iterable for model input
        s_iter = iter(seqs)

        # Create bytes stream for model output
        model = BytesIO()

        # Train encoder
        spm.SentencePieceTrainer.train(
            sentence_iterator=s_iter,
            model_writer=model,
            vocab_size=vocab_size,
            hard_vocab_limit=False,  # was the string "False"
            max_sentencepiece_length=256)

        # Save the model
        with open(OUTFILE.format(k_compares[i]), 'wb') as f:
            f.write(model.getvalue())

        # Encode the corpus to the reduced vocabulary
        sp = spm.SentencePieceProcessor(model_proto=model.getvalue())

    print("DONE.")
def create_fastx_index(fastx: Path) -> (pyfastx.Fasta, Path): if is_fasta(fastx): return pyfastx.Fasta( str(fastx), build_index=True ), Path(str(fastx) + '.fxi') elif is_fastq(fastx): return pyfastx.Fastq( str(fastx), build_index=True ), Path(str(fastx) + '.fxi') else: raise ValueError( f'Could not determine input file format: {fastx}' )
def process(self):
    self.emit_message("Exporting fasta sequence to %s" % self.outfile)

    table_name = self.model.tableName()
    whole_ssrs = self.db.get_one("SELECT COUNT(1) FROM %s" % table_name)
    total_ssrs = whole_ssrs

    if self.selected == 'whole' or len(self.model.selected) == whole_ssrs:
        sql = "SELECT * FROM {}".format(table_name)
    else:
        ids = sorted(self.model.selected)
        total_ssrs = len(ids)
        sql = "SELECT * FROM {} WHERE id IN ({})".format(
            table_name, ",".join(map(str, ids)))

    current = 0
    progress = 0
    prev_progress = 0
    current_seq = None
    current_name = None

    with open(self.outfile, 'wt') as fp:
        for item in self.db.query(sql):
            if item.sequence != current_name:
                sql = "SELECT path FROM fasta LIMIT 1"
                seqfile = self.db.get_one(sql)
                seqs = pyfastx.Fasta(seqfile)
                current_seq = seqs[item.sequence].seq
                current_name = item.sequence

            start = item.start - self.flank
            if start < 1:
                start = 1
            end = item.end + self.flank

            #ssr = seqs.fetch(item.sequence, (start, end))
            ssr = current_seq[start - 1:end]

            name = ">{}{} {}:{}-{}|motif:{}".format(
                table_name.upper(), item.id, item.sequence,
                item.start, item.end, item.motif)
            fp.write("{}\n{}".format(name, format_fasta_sequence(ssr, 70)))

            current += 1
            progress = int(current / total_ssrs * 100)
            if progress > prev_progress:
                self.emit_progress(progress)
                prev_progress = progress

    self.emit_finish("Successfully exported to fasta %s" % self.outfile)