def main(input, blastndb, output, probe_length, search_step, evalue, blastn_tmpdir, threads):
    """
    Search uniquely mapped probes (sub-sequences) within a series of
    sequences stored in a fasta file.

    \b
    For example:
        select 30 candidate probe regions with length 500bp, firstly,
        ```
        $ python uniformly_spaced.py data/hg19.fa ./candidate.fa chr1:89000000-90000000 -n 30 -l 500
        ```
        then select uniquely mapped probes (sub-sequences) from it.
        ```
        $ python search_uniq.py candidate.fa example/blastn_db/hg19 probe.fa
        ```

    \b
    Args
    ----
    input : str
        Path to input fasta file.
    blastndb : str
        Path to blastn database, built with the `makeblastdb` command.
    output : str
        Path to output fasta file.
    """
    with open(input) as fasta_handle:
        candidates = FastaIO.FastaIterator(fasta_handle)
        # The remaining options are handed to search_passed_probes as-is;
        # note that its positional order differs from this function's.
        unique_probes = search_passed_probes(
            candidates,
            blastndb,
            evalue,
            probe_length,
            search_step,
            blastn_tmpdir,
            threads,
        )
        # Saved while the input handle is still open, in case the probes
        # are produced lazily from the record iterator (not visible here).
        save_fasta(unique_probes, output)
def load_data(k, stride, pos_fasta, neg_fasta):
    """Build a k-mer count feature matrix and labels from two fasta files.

    Records from `pos_fasta` are read first, then records from `neg_fasta`.
    A record is skipped when its sequence contains the vocabulary's unknown
    character, or when `Vocabulary.kmer_count` raises `AssertionError`.

    Args:
        k: Passed to `Vocabulary(k=k)`.
        stride: Passed to `Vocabulary.kmer_count` for every sequence.
        pos_fasta: Path to the fasta file whose kept records get label 1.
        neg_fasta: Path to the fasta file whose kept records get label 0.

    Returns:
        A tuple `(X, y)`: `X` is the `np.vstack` of the per-sequence k-mer
        counts (presumably one row per kept record -- depends on what
        `kmer_count` returns), and `y` holds ones for the kept positive
        records followed by zeros for the kept negative records.
    """
    vocab = Vocabulary(k=k)
    features = []
    # Number of kept records, keyed by "does this file count as positive?".
    kept = {True: 0, False: 0}

    for path in (pos_fasta, neg_fasta):
        is_positive = path == pos_fasta
        with open(path) as handle:
            for record in tqdm(FastaIO.FastaIterator(handle)):
                sequence = str(record.seq)
                if vocab.unknow_char in sequence:
                    continue
                try:
                    counts = vocab.kmer_count(sequence, stride)
                except AssertionError:
                    continue
                features.append(counts)
                kept[is_positive] += 1

    X = np.vstack(features)
    # Labels follow the append order above: positives first, then negatives.
    y = np.hstack([np.ones(kept[True]), np.zeros(kept[False])])
    return X, y
def input_text_to_df(input_text):
    """Parse fasta-formatted text into a DataFrame.

    Args:
        input_text: The fasta contents as a single string.

    Returns:
        A DataFrame with one row per parsed record and two columns:
        `sequence_name` (the record's `name`) and `sequence` (the record's
        `seq` converted to `str`).
    """
    with io.StringIO(initial_value=input_text) as buffer:
        rows = [
            (record.name, str(record.seq))
            for record in FastaIO.FastaIterator(buffer)
        ]
    return pd.DataFrame(rows, columns=['sequence_name', 'sequence'])
def _assert_fasta_parsable(input_text):
    """Raise `ValueError` if no fasta record can be read from `input_text`.

    Args:
        input_text: The fasta contents as a single string.

    Raises:
        ValueError: If the fasta iterator yields no records at all. The
            message includes the first (up to) three lines of the input.
    """
    nothing_parsed = object()
    with io.StringIO(initial_value=input_text) as buffer:
        # Only the first record is requested, so the whole input is never
        # parsed. Per the original author's note, a malformed fasta file
        # yields no entries rather than raising an error, hence the
        # sentinel default.
        first_record = next(FastaIO.FastaIterator(buffer), nothing_parsed)

    if first_record is not nothing_parsed:
        return

    raise ValueError(
        'Failed to parse any input from fasta file. '
        'Consider checking the formatting of your fasta file. '
        'First bit of contents from the fasta file was\n'
        '{}'.format(input_text.splitlines()[:3]))
help="An ensembl cdna fasta file with the properly formatted header") parser.add_argument("--output", default=None, type=str, help="output tr2gene file") args = parser.parse_args() L.info("args:") print(args) outf = iotools.open_file(args.output, "w") with iotools.open_file(args.fasta, "r") as handle: for record in FastaIO.FastaIterator(handle): description = record.description trans = description.split(" ")[0] m = re.search('gene:(\S+)', description) gene = m.group(1) try: x = re.search('gene_symbol:(\S+)', description) symbol = x.group(1) except Exception: pass outf.write("%s\t%s\t%s\n" % (trans, gene, symbol)) outf.close()