def subset_fasta(fasta, seqids_l):
    """Filter a FASTA file by sequence id and write out the result.

    Args:
        fasta: FASTA input, passed unchanged to ``bfas.fasta_2_dict``.
        seqids_l: Sequence ids handed to ``bfas.filter_by_id`` as the
            selection criterion.

    The filtered sequences are given to ``bfas.write_as_fas``; nothing is
    returned.
    """
    # NOTE(review): the output destination is decided by bfas.write_as_fas
    # (presumably stdout) -- confirm against that helper.
    bfas.write_as_fas(
        bfas.filter_by_id(bfas.fasta_2_dict(fasta), seqids_l)
    )
def get_chro_len(g_fas):
    """Return the summed length of every entry loaded from ``g_fas``.

    Args:
        g_fas: FASTA input, passed unchanged to ``bf.fasta_2_dict``.

    Returns:
        The sum of ``len(value)`` over all entries of the mapping returned
        by ``bf.fasta_2_dict`` (0 when that mapping is empty).
    """
    # presumably keys are chromosome ids and values their sequences --
    # verify against bf.fasta_2_dict.
    records = bf.fasta_2_dict(g_fas)
    return sum(len(records[name]) for name in records)
def filterFastaByIds(fasta, idsFile):
    """Filter a FASTA file using sequence ids read from a text file.

    Args:
        fasta: FASTA input, passed unchanged to ``bf.fasta_2_dict``.
        idsFile: Path to a text file holding one sequence id per line.
            Surrounding whitespace is stripped and blank lines are ignored.

    The ids are passed to ``bf.filter_by_id`` with ``False`` as the third
    positional argument, and the filtered result is handed to
    ``bf.write_as_fas``. Nothing is returned.
    """
    seqs = bf.fasta_2_dict(fasta)
    with open(idsFile, 'r') as f:
        # Drop blank lines so an empty string is never used as an id.
        ids_to_keep = [seqid for seqid in map(str.strip, f) if seqid]
    # NOTE(review): the meaning of the third argument (False) is defined by
    # bf.filter_by_id -- confirm against that helper.
    filt_seqs = bf.filter_by_id(seqs, ids_to_keep, False)
    bf.write_as_fas(filt_seqs)
def loadFasta(fas_f, jid, dbName):
    """Insert every sequence of a FASTA file into the ``seqs`` table.

    Args:
        fas_f: FASTA input, passed unchanged to ``bfas.fasta_2_dict``.
        jid: Value stored in the ``jid`` column of every inserted row.
        dbName: Database location given to ``sqlite3.connect``.

    One row ``(jid, seqid, seq, len(seq))`` is inserted per sequence. All
    inserts share a single transaction that is committed on success and
    rolled back if an exception is raised. The connection is always closed
    before the function returns.
    """
    from contextlib import closing

    fas_d = bfas.fasta_2_dict(fas_f)
    rows = ((jid, seqid, seq, len(seq)) for seqid, seq in fas_d.items())
    # ``with conn`` only manages the transaction (commit/rollback); it does
    # not close the connection, so ``closing`` is needed to release it.
    with closing(sqlite3.connect(dbName)) as conn:
        with conn:
            conn.executemany(
                'INSERT INTO seqs (jid, seqid, seq, len) VALUES (?,?,?,?)',
                rows,
            )
def fasta_stats(fasta, graphs_path=None):
    """Load a FASTA file and run ``bfas.make_summary`` on its sequences.

    Args:
        fasta: FASTA input, passed unchanged to ``bfas.fasta_2_dict``.
        graphs_path: Forwarded as the second argument of
            ``bfas.make_summary``; defaults to ``None``.

    Nothing is returned.
    """
    # NOTE(review): graphs_path looks like an output location for plots --
    # confirm against bfas.make_summary.
    sequences = bfas.fasta_2_dict(fasta)
    bfas.make_summary(sequences, graphs_path)
def sample_fasta(fasta, nseq):
    """Pick sequences from a FASTA file and write them out.

    Args:
        fasta: FASTA input, passed unchanged to ``bfas.fasta_2_dict``.
        nseq: Forwarded to ``bfas.get_random_seqs`` (presumably the number
            of sequences to draw -- confirm against that helper).

    The selection is handed to ``bfas.write_as_fas``; nothing is returned.
    """
    bfas.write_as_fas(
        bfas.get_random_seqs(bfas.fasta_2_dict(fasta), nseq)
    )