def _msa_count_columns(input_fn):
    """Count columns in a MSA in FASTA format.

    :param input_fn: path of the FASTA file holding the alignment
    :returns: the common length of all sequences in the file
    :raises ValueError: if the file yields no record, or if the records do
        not all have the same sequence length
    """
    # Plain "r" instead of "rU": the "U" flag was removed in Python 3.11 and
    # text mode already translates newlines on Python 3.
    with open(input_fn, "r") as input_handle:
        parser = SimpleFastaParser(input_handle)
        try:
            # next() builtin works on Python 2.6+ and 3; the .next() method
            # only exists on Python 2 iterators.
            title, seq = next(parser)
        except StopIteration:
            raise ValueError("{} is not a valid FASTA template file"
                             .format(input_fn))
        seqlen = len(seq)
        for title, seq in parser:
            if len(seq) != seqlen:
                raise ValueError("sequences in template file must all be the "
                                 "same length")
    return seqlen
def read_fasta(files, in_type=Peptide, id_position=1):
    """
    Read a (couple of) peptide, protein or rna sequence from a FASTA file.
    User needs to specify the correct type of the underlying sequences. It can
    either be: Peptide, Protein or Transcript (for RNA).

    :param files: A (list) of file names to read in
    :type files: list(str) or str
    :param in_type: The type to read in
    :type in_type: :class:`~Fred2.Core.Peptide.Peptide` or
                   :class:`~Fred2.Core.Transcript.Transcript` or
                   :class:`~Fred2.Core.Protein.Protein`
    :param int id_position: the position of the id specified counted by |
    :returns: a list of the specified sequence type derived from the FASTA
              file sequences. Duplicates are collapsed (a set is used
              internally), so the order of the list is not defined.
    :rtype: (list(:attr:`in_type`))
    :raises ValueError: if a file does not exist
    """
    try:
        string_types = basestring  # noqa: F821 -- Python 2
    except NameError:
        string_types = str

    if isinstance(files, string_types):
        files = [files]
    # Validate in both cases; previously a single file name bypassed the
    # existence check.
    if any(not os.path.exists(f) for f in files):
        raise ValueError("Specified Files do not exist")

    collect = set()
    # open all specified files:
    for name in files:
        with open(name, 'r') as handle:
            # iterate over all FASTA entries:
            for _id, seq in SimpleFastaParser(handle):
                # generate element:
                try:
                    _id = _id.split("|")[id_position]
                except IndexError:
                    # header has fewer "|" fields: keep the full title as id
                    pass

                sequence = seq.strip().upper()
                try:
                    collect.add(in_type(sequence, transcript_id=_id))
                except TypeError:
                    # in_type does not accept a transcript_id keyword
                    collect.add(in_type(sequence))
    return list(collect)
def main(fasta_file, bin_composition, set_bins, output):
    """Split a FASTA file into one file per bin.

    :param fasta_file: path of the FASTA file holding all contigs
    :param bin_composition: path of a comma-separated file whose first column
        is a contig id and whose second column is a bin name
    :param set_bins: container of bin names to keep; contigs assigned to any
        other bin are ignored
    :param output: directory receiving the ``Bin_<bin>.<ext>`` files, where
        ``<ext>`` is whatever follows the last "." of ``fasta_file``
    """
    # contig id -> bin name, restricted to the requested bins
    contig_to_bin = {}
    with open(bin_composition) as composition_handle:
        for line in composition_handle:
            fields = line.rstrip().split(",")
            if fields[1] in set_bins:
                contig_to_bin[fields[0]] = fields[1]

    extension = fasta_file.split(".")[-1]
    bin_handles = {}
    try:
        for bin_name in set(contig_to_bin.values()):
            bin_handles[bin_name] = open(
                output + "/Bin_" + bin_name + "." + extension, "w")
        with open(fasta_file) as fasta_handle:
            for contig_id, seq in SimpleFastaParser(fasta_handle):
                # the composition file is keyed on the first word of the title
                short_id = contig_id.split()[0]
                if short_id in contig_to_bin:
                    bin_handles[contig_to_bin[short_id]].write(
                        ">" + contig_id + "\n" + seq + "\n")
    finally:
        # close every output file even if parsing or writing failed
        for handle in bin_handles.values():
            handle.close()
def _rename_seqids(input_fn, otuids_fn, prefix=""):
    """Rename the records of a FASTA file in place to ``<prefix><n>``.

    Records are numbered from 1 in file order. A tab-separated table mapping
    each new id to the first word of the original title is written to
    ``otuids_fn``. The renamed FASTA is written to a temporary file created
    in the directory of ``input_fn`` and then moved over ``input_fn``.

    :param input_fn: FASTA file to rename (replaced on success)
    :param otuids_fn: output file for the ``new id <TAB> original id`` table
    :param prefix: string prepended to the running number
    """
    output_dir = os.path.dirname(input_fn)
    tmp_fn = micca.ioutils.make_tempfile(output_dir)

    # Text modes ("r"/"w") so that str can be written on Python 3; "rU" was
    # removed in Python 3.11. The context managers close all three handles
    # even when parsing or writing fails.
    with open(input_fn, "r") as input_handle, \
            open(otuids_fn, "w") as otuids_handle, \
            open(tmp_fn, "w") as tmp_handle:
        for i, (title, seq) in enumerate(SimpleFastaParser(input_handle)):
            origid = title.split()[0]
            newid = "{}{:d}".format(prefix, i + 1)
            otuids_handle.write("{}\t{}\n".format(newid, origid))
            tmp_handle.write(">{}\n{}\n".format(newid, seq))

    # only replace the input once everything has been written and closed
    os.rename(tmp_fn, input_fn)
def minimise_large_cluster_1(nodes, orig_roary_dirs):
    ''' find redundant sequences in large clusters using cdhit and return
    the nodes that can be removed from the graph to make everything smoother

    nodes: iterable of node names shaped "<cluster>_<member>"
    orig_roary_dirs: list of directories; the one whose basename starts with
        "<cluster>_" supplies the reference FASTA for that cluster
    Return: list of member names found on non-representative lines (lines
        without "*") of the cd-hit ".clstr" output
    Raises: ValueError if no directory matches a cluster

    Side effects: writes "1_cluster_fasta.fa" in the working directory and
    runs cd-hit-est, which produces "1_cluster_clustered.fa" and its ".clstr".
    NOTE(review): `prefix` is not defined in this function; it is presumably
    a module-level name - confirm.'''
    # a set per cluster gives O(1) membership tests while scanning the FASTA
    members_per_cluster = {}
    for node in nodes:
        cluster, _, member = node.partition("_")
        members_per_cluster.setdefault(cluster, set()).add(member)

    with open("1_cluster_fasta.fa", "w") as tmp_out:
        for curr_cluster, members in members_per_cluster.items():
            # Look the directory up afresh for every cluster; previously a
            # miss silently reused the directory of the preceding cluster.
            ref_dir = None
            for d in orig_roary_dirs:
                if d.split("/")[-1].split("_")[0] == str(curr_cluster):
                    ref_dir = d
                    break
            if ref_dir is None:
                raise ValueError(
                    "no roary directory found for cluster " +
                    str(curr_cluster))
            with open(
                    os.path.join(
                        ref_dir,
                        prefix + "mode_pan_genome_reference.fa")) as handle:
                for title, seq in SimpleFastaParser(handle):
                    # the member name is the second word of the title
                    name = title.split()[1]
                    if name in members:
                        ## THis output proves how choosing a different reference can really alter the results.
                        tmp_out.write(">" + curr_cluster + "_" + name + "\n" +
                                      seq.split(";")[0] + "\n")

    ## run cdhit to reduce the number of sequences
    p = subprocess.Popen([
        "cd-hit-est", "-i", "1_cluster_fasta.fa", "-o",
        "1_cluster_clustered.fa", "-c", "0.9", "-T", "4", "-d", "0", "-A",
        "0.9", "-n", "8", "-s", "0.9"
    ])
    p.wait()

    nodes_to_remove = []
    with open("1_cluster_clustered.fa.clstr") as f:
        for line in f:
            if line.startswith(">"):
                # cluster header line
                continue
            member = line.strip().split("...")[0].split(">")[-1]
            if "*" not in line:
                nodes_to_remove.append(member)
    return nodes_to_remove
def Sortbysize(input, n50, minlen=500):
    """Split FASTA headers into two lists by sequence length.

    Records are visited from longest to shortest. Records shorter than
    ``minlen`` are dropped. When ``n50`` is truthy, records at least ``n50``
    long go to the second list and the rest to the first; otherwise every
    retained record goes to the first list.

    Returns a tuple ``(contigs, keep)`` of header lists.
    """
    with open(input, 'r') as infile:
        sizes = [(title, len(seq)) for title, seq in SimpleFastaParser(infile)]

    # longest first; the sort is stable so ties keep file order
    sizes.sort(key=lambda item: item[1], reverse=True)

    contigs, keep = [], []
    for title, size in sizes:
        if size < minlen:
            continue
        if n50 and size >= n50:
            keep.append(title)
        else:
            contigs.append(title)
    return contigs, keep
def count_total_c_and_g_in_reads(ref_name, cg_posits, in_file):
    """Count reads' bases at the given C and G positions.

    For every record of ``./input_data/<in_file>`` whose title differs from
    ``ref_name``, each index in ``cg_posits["C_pos"]`` (resp. ``"G_pos"``)
    contributes one to the C (resp. G) counter when the read carries one of
    A, T, G or C at that index.

    Returns the tuple ``(c_counter, g_counter)``.
    """
    valid_bases = ("A", "T", "G", "C")
    c_total = 0
    g_total = 0
    with open("./input_data/" + in_file) as in_handle:
        for title, read in SimpleFastaParser(in_handle):
            if title == ref_name:
                continue
            c_total += sum(
                1 for pos in cg_posits["C_pos"] if read[pos] in valid_bases)
            g_total += sum(
                1 for pos in cg_posits["G_pos"] if read[pos] in valid_bases)
    return c_total, g_total
def qc_fasta(fasta_file, qc, out, name=""):
    ''' check a FASTA file against the QC thresholds

    fasta_file: path of the FASTA file
    qc: mapping with "max_contigs", "min_length" and "max_length"; the two
        lengths are compared against the total length divided by 1,000,000
    out: writable handle receiving "<name>,<contigs>,<length>" on failure
    name: label used in that line; defaults to fasta_file when empty

    Return: True if all thresholds are met, False otherwise '''
    label = fasta_file if name == "" else name

    contig_count = 0
    total_bases = 0
    with open(fasta_file) as handle:
        for _title, seq in SimpleFastaParser(handle):
            contig_count += 1
            total_bases += len(seq.strip())
    total_length = total_bases / 1000000.0

    failed = (qc["max_contigs"] < contig_count
              or qc["min_length"] > total_length
              or qc["max_length"] < total_length)
    if not failed:
        return True
    out.write(label + "," + str(contig_count) + "," + str(total_length) + "\n")
    return False
def launch_seq_kmers_pool(fastx, ftype, k, threads, target_range,
                          combined_kmers, count, frac):
    """Collect per-read k-mer jobs from a FASTA/FASTQ file and run them.

    Each record is handed to ``build_args_for_kmer_calc`` together with its
    running index; reading stops as soon as that helper reports the status
    ``"over"``. The accumulated arguments are then passed to ``launch_pool``
    with ``calc_seq_kmer_freqs``.

    :param fastx: path of the sequence file
    :param ftype: ``"fastq"`` or ``"fasta"``; any other value reads nothing
    :returns: ``(dict(results), lengths_d)`` where ``lengths_d`` is the dict
        that was passed to ``build_args_for_kmer_calc`` for every record
    """
    args = []
    lengths_d = {}

    if ftype == "fastq":
        parser = FastqGeneralIterator
    elif ftype == "fasta":
        parser = SimpleFastaParser
    else:
        parser = None

    if parser is not None:
        # the context manager closes the input even when we break early
        with open(fastx) as handle:
            for read_num, record in enumerate(parser(handle)):
                # FASTQ records carry a third (quality) field that is unused
                read_id, seq = record[0], record[1]
                args, status = build_args_for_kmer_calc(
                    read_num, target_range, args, read_id, seq, k,
                    combined_kmers, lengths_d, count, frac)
                if status == "over":
                    break

    results = launch_pool(threads, calc_seq_kmer_freqs, args)
    return dict(results), lengths_d
def quick_FASTA_reader(file):
    """Simple FASTA reader, returning a list of string tuples (DEPRECATED).

    The single argument 'file' should be the filename of a FASTA format file.
    This function will open and read in the entire file, constructing a list
    of all the records, each held as a tuple of strings (the sequence name or
    title, and its sequence).

    >>> seqs = quick_FASTA_reader("Fasta/dups.fasta")
    >>> for title, sequence in seqs:
    ...     print("%s %s" % (title, sequence))
    alpha ACGTA
    beta CGTC
    gamma CCGCC
    alpha (again - this is a duplicate entry to test the indexing code) ACGTA
    delta CGCGC

    This function is fast, but because it returns the data as a single in
    memory list, is unsuitable for large files where an iterator approach is
    preferable.

    You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta") which
    allows you to iterate over the records one by one (avoiding having all the
    records in memory at once).  Using Bio.SeqIO also makes it easy to switch
    between different input file formats.  However, please note that rather
    than simple strings, Bio.SeqIO uses SeqRecord objects for each record.

    If you want to use simple strings, use the function SimpleFastaParser
    added to Bio.SeqIO.FastaIO in Biopython 1.61 instead.
    """
    import warnings
    from Bio import BiopythonDeprecationWarning
    from Bio.SeqIO.FastaIO import SimpleFastaParser

    message = ("The quick_FASTA_reader has been deprecated and will be "
               "removed in a future release of Biopython. Please try "
               "function SimpleFastaParser from Bio.SeqIO.FastaIO "
               "instead.")
    warnings.warn(message, BiopythonDeprecationWarning)

    with open(file) as handle:
        return list(SimpleFastaParser(handle))
def cat_by_id(in_fastas=False, out_name="cat_by_id_"):
    """
    cats sequences by their ids from separate fasta files
    into one sequence and outputs a single file
    ----------------
    in_fastas: list or tuple
        input fasta files, concatenated in the order given. When omitted,
        every file of the current directory ending in .fasta/.fas/.fa is
        used, ordered by its numeric prefix
    out_name: str
        name for output fasta file (a time stamp and ".fasta" are appended)
    ----------------
    NOTE: when files are discovered automatically they must start with a
    number like this : '1_', to denote the order of concatenation
    """
    valid_extensions = ["fasta", "fas", "fa"]

    def _numeric_prefix_key(filename):
        # "2_x.fa" sorts before "10_x.fa"; names without a numeric prefix
        # go last, alphabetically
        head = filename.split("_", 1)[0]
        if head.isdigit():
            return (0, int(head), filename)
        return (1, 0, filename)

    if not in_fastas:
        # os.listdir returns entries in arbitrary order, so sort explicitly
        in_fastas = sorted(
            (f for f in os.listdir("./")
             if f.rsplit(".", 1)[-1] in valid_extensions),
            key=_numeric_prefix_key)

    intersected_ids = _get_intersected_ids(in_fastas)
    intersected_ids.sort()

    # Parse every file exactly once instead of once per id.
    wanted = set(intersected_ids)
    pieces = {seq_id: [] for seq_id in intersected_ids}
    for f in in_fastas:
        seen_in_file = set()
        with open(f) as in_handle:
            for title, seq in SimpleFastaParser(in_handle):
                # only the first record with a given title counts per file
                if title in wanted and title not in seen_in_file:
                    seen_in_file.add(title)
                    pieces[title].append(seq)

    # use the id itself, not whatever title the parsing loop ended on
    intersected_records = [
        SeqRecord(Seq("".join(pieces[seq_id])), id=seq_id, description="")
        for seq_id in intersected_ids
    ]

    time_stamp = _get_current_time()
    time_stamp = _format_time_stamp(time_stamp)
    SeqIO.write(intersected_records, out_name + time_stamp + ".fasta", "fasta")
def check_property(input, stop_table):
    """Collect per-record properties of a FASTA file into module globals.

    For every record this stores the name (title up to the first ';', with
    any '>' removed), the result of ``freq_check`` on the title, the result
    of ``length_check`` on the sequence, and the minimum of what
    ``stopcount`` returns for frames 1-3 using ``stop_table``.

    Side effects: rebinds the globals ``uniqs`` (list of names) and
    ``out_df`` (DataFrame with columns name, freq, length, stop_codon).
    Returns nothing.
    """
    global uniqs, out_df

    with open(input) as infasta:
        uniqs = []  # row names
        frequencies = []
        lengths = []
        min_stops = []

        print('#Start checking freq and len#')
        for head, seq in SimpleFastaParser(infasta):
            # keep the part before the first ';' and drop any '>' symbol
            record_name = head.split(';')[0].replace('>', '')
            uniqs.append(record_name)
            frequencies.append(freq_check(head))
            lengths.append(length_check(seq))
            # retain the smallest stop-codon count over the three frames
            per_frame = stopcount(
                SeqRecord(Seq(seq)), stop_table, frame=(1, 2, 3))
            min_stops.append(min(per_frame))

        out_df = pd.DataFrame(data=uniqs, columns=['name'])
        out_df['freq'] = frequencies
        out_df['length'] = lengths
        out_df['stop_codon'] = min_stops
def get_assembly_stats(assembly_fp):
    """Compute summary statistics over the contig lengths of an assembly.

    ``assembly_fp`` must provide an ``open`` method (it is called as
    ``assembly_fp.open('r')``). Returns the tuple
    ``(contig_number, n50, q1, q2, q3, mean, smallest, largest, length)``.
    """
    with assembly_fp.open('r') as handle:
        sizes = [len(seq) for _desc, seq in SimpleFastaParser(handle)]

    count = len(sizes)
    total = sum(sizes)
    shortest = min(sizes)
    longest = max(sizes)
    average = int(round(statistics.mean(sizes), 0))
    q1, q2, q3 = calculate_quartiles(sizes)

    # calculate_n50 needs more than one contig; a lone contig is its own N50
    n50 = calculate_n50(sizes, total / 2) if count > 1 else longest

    return count, n50, q1, q2, q3, average, shortest, longest, total
def read_fasta(fasta, length=False):
    """
    Read fasta format file in.

    Parameters:
    -----------
    fasta:str
        fasta format file
    length:bool
        output length instead of sequence, default False.

    Returns:
    --------
    Return a dict as id & sequence/length of sequence as key-value pairs.
    When a title occurs more than once, the last record wins.
    """
    seqs = {}
    fh = files.perfect_open(fasta)
    try:
        for t, seq in SimpleFastaParser(fh):
            seqs[t] = len(seq) if length else seq
    finally:
        # release the handle even if parsing fails part-way through
        fh.close()
    return seqs
def main():
    """Filter a FASTA file by record length (command-line entry point).

    Options: ``-i/--input`` and ``-o/--output`` are required; ``--min``
    (default 1000) and the optional ``--max`` bound the sequence length of
    the records copied to the output, both inclusive.
    """
    # To parse command line
    usage = "usage: %prog [options]"
    p = optparse.OptionParser(usage)
    p.add_option('-i', '--input', help='Input fasta [None,REQD]')
    p.add_option('--min', type="int", default=1000,
                 help="Minimum size of record to keep [1000]")
    p.add_option('--max', type="int",
                 help="Maximum size of record to keep [None]")
    p.add_option('-o', '--output', help='Output fasta [None,REQD]')
    opts, args = p.parse_args()

    # Fail with a usage message instead of a TypeError from open(None).
    if not opts.input or not opts.output:
        p.error("both --input and --output are required")

    with open(opts.input, "r") as fin:
        with open(opts.output, "w") as fout:
            for title, seq in SimpleFastaParser(fin):
                seq_len = len(seq)
                if seq_len < opts.min:
                    continue
                # a missing (or zero) --max means "no upper bound"
                if opts.max and seq_len > opts.max:
                    continue
                fout.write(">%s\n%s\n" % (title, seq))
def _count_ins(input_file, ref_seq, ref_seq_id, cov):
    """Accumulate insertion lengths per position over all reads.

    Every record of ``./input_data/<input_file>`` except the one titled
    ``ref_seq_id`` is paired with ``ref_seq``; the reference side of the
    pairing is scanned for insertions. Reads with at least one non-None
    insertion length are appended to a FASTA file through
    ``_write_read_into_fasta`` and their insertion lengths are appended to
    the structure returned by ``_get_ref_array(ref_seq)``.

    ``cov`` is accepted but not used here.
    """
    total_ins = _get_ref_array(ref_seq)
    with open("./input_data/" + input_file) as in_handle:
        for record in SimpleFastaParser(in_handle):
            title, read = record[0], record[1]
            if title == ref_seq_id:
                continue

            aligned_pairs = _split_gaps_from_pairs(ref_seq, read)
            # take reference, not read. different from deletions counting
            ref_side = "".join(pair[0] for pair in aligned_pairs)

            # a read consisting of gaps only is not valid to count
            if _seq_has_gaps_only(ref_side):
                continue

            starts = _get_indel_start_positions(ref_side)
            raw_lengths = _count_indel_lens(ref_side, starts)
            if all(length is None for length in raw_lengths.values()):
                continue

            _write_read_into_fasta(input_file, record, "a+")

            # shift each position left by the total length of the insertions
            # that precede it
            shifted_lengths = {}
            offset = 0
            for position, length in raw_lengths.items():
                shifted_lengths[position - offset] = length
                offset += length

            for position, length in shifted_lengths.items():
                total_ins[position].append(length)
    return total_ins
def make_taxonomy(fasta):
    """Build a taxonomy table from the titles of a FASTA file.

    Each title, after applying the ``FIXES`` substring replacements, must
    split on whitespace into exactly an accession and a ';'-separated
    lineage. Only the first accession seen for each lineage (compared
    case-insensitively) is kept.

    Returns a DataFrame indexed by ``acc`` with the columns kingdom, phylum,
    class, order, family, genus and species.
    """
    seen_lineages = set()
    ranks_by_acc = {}
    with open(fasta) as reader:
        for header, _seq in SimpleFastaParser(reader):
            for needle, replacement in FIXES.items():
                header = header.replace(needle, replacement)
            acc, lineage = header.split()
            lineage_key = lineage.lower()
            if lineage_key in seen_lineages:
                continue
            ranks_by_acc[acc] = lineage.split(";")
            seen_lineages.add(lineage_key)

    tax_df = pd.DataFrame(ranks_by_acc).T
    tax_df.index.name = "acc"
    tax_df.columns = [
        "kingdom", "phylum", "class", "order", "family", "genus", "species"
    ]
    return tax_df
def extract(input_file, parameters, output_file, file_type):
    """Write a sub-sequence of one FASTA record to ``output_file``.

    :param input_file: FASTA file to search
    :param parameters: ``"read_id:start:stop:margin:strand"``; ``start`` and
        ``stop`` are 1-based and inclusive, ``margin`` extends the region on
        both sides, ``strand`` is ``+`` or ``-`` (reverse complement)
    :param output_file: FASTA file to write (always created/truncated)
    :param file_type: accepted but not used here

    Every record whose full title equals ``read_id`` is written. If the
    strand is neither ``+`` nor ``-`` a message is printed and the function
    returns when the first matching record is reached.
    """
    read_id, start, stop, margin, strand = parameters.split(':')
    start = int(start)
    stop = int(stop)
    margin = int(margin)

    # Clamp at 0: a negative slice start would wrap around to the end of the
    # sequence when the margin reaches past the first base.
    lower = max(0, start - 1 - margin)
    upper = stop + margin

    with open(input_file) as in_handle, open(output_file, "w") as handle:
        for title, seq in SimpleFastaParser(in_handle):
            if title != read_id:
                continue
            subseq = seq[lower:upper]
            if strand == '+':
                handle.write(">%s\n%s\n" % (title, subseq))
            elif strand == '-':
                subseq = str(Seq.Seq(subseq).reverse_complement())
                handle.write(">%s\n%s\n" % (title, subseq))
            else:
                print('Strand is not specified correctly!')
                return
    return
def delete_N(input_file, parameters, output_file, file_type):
    """Copy the reads that contain no ``N`` base to ``output_file``.

    :param input_file: FASTQ file when ``file_type == "fastq"``, otherwise
        parsed as FASTA
    :param parameters: accepted but not used here
    :param output_file: file receiving the retained reads in the same format
    :param file_type: ``"fastq"`` selects FASTQ; anything else means FASTA

    The test is case-insensitive (``n`` also excludes a read).
    """
    print('\nReading your file... \n')
    print('Searching reads containing N \n')

    # both handles are closed on exit, including on a parsing error
    with open(input_file) as in_handle, open(output_file, "w") as handle:
        if file_type == "fastq":
            for title, seq, qual in FastqGeneralIterator(in_handle):
                if 'N' not in seq.upper():
                    handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
        else:
            for title, seq in SimpleFastaParser(in_handle):
                if 'N' not in seq.upper():
                    handle.write(">%s\n%s\n" % (title, seq))

    # the output holds the reads WITHOUT N; the old message said otherwise
    print('Reads without N are written to {} \n'.format(output_file))
    return
def add_rep_seqs_to_feature_table(feature_table, rep_seq_filepath):
    """
    Adds representative sequences as the ReprSequence column to a QIIME2 feature table

    :param feature_table: QIIME2 FeatureTable[Frequency] artifact loaded as a pandas DataFrame;
                          must carry a 'Feature ID' column, which is the merge key
    :param rep_seq_filepath: Path to the dna-sequences.fasta file output by the QIIME2 denoising/clustering step
    :return: the feature table left-merged with the sequences (new ReprSequence column).
             Exits the process with status 1 if that column is already present.
    """
    existing_columns = feature_table.columns.values.tolist()
    if 'ReprSequence' in existing_columns:
        logger.error(
            '"ReprSequence" column already exists in provided feature table. '
            'Cannot add representative sequences. Exiting...')
        sys.exit(1)

    # Load the FastA file as a pandas dataframe
    # Based on https://stackoverflow.com/a/19452991 (accessed Sept. 12, 2019)
    logger.info('Loading representative sequences FastA file')
    with open(rep_seq_filepath, 'r') as fasta_data:
        records = list(SimpleFastaParser(fasta_data))
    rep_seq_table = pd.DataFrame({
        'Feature ID': [title for title, _ in records],
        'ReprSequence': [sequence for _, sequence in records],
    })

    # Left merge keeps every feature; one_to_one rejects duplicated ids
    logger.debug('Adding representative sequences')
    merged = pd.merge(feature_table,
                      rep_seq_table,
                      how='left',
                      on='Feature ID',
                      sort=False,
                      validate='one_to_one')
    return merged
def filterFasta(assemblies, binid, output, ncontigs, field, reverse):
    """
    Copy records from the assemblies to `output` according to a 0/1 flag
    list obtained from readBinID(binid, ncontigs, field). The flag of a
    record is looked up at the integer found in the second '_'-separated
    field of its defline. Records flagged 1 are written, or those flagged 0
    when `reverse` is truthy. Always returns 0.
    """
    contigList = readBinID(binid, ncontigs, field)
    wanted_flag = 0 if reverse else 1

    with open(output, 'w') as out_handle:
        for assembly in assemblies:
            with open(assembly) as in_handle:
                for defline, sequence in SimpleFastaParser(in_handle):
                    contig_number = int(defline.split('_')[1])
                    if contigList[contig_number] != wanted_flag:
                        continue
                    out_handle.write('>' + defline + '\n')
                    out_handle.write(sequence + '\n')
    return 0
def fasta_to_pandas(path, separator=";"):
    """
    Load a fasta file into a table.

    Args:
        path: of the fasta file
        separator: splits each entry title once; the part before it becomes
            the id, the remainder (or "" when absent) becomes the title

    Returns:
        pandas dataframe with 3 columns (id, title, sequence)
    """
    columns = {"id": [], "title": [], "sequence": []}
    with open(path) as fasta_file:
        for header, residues in SimpleFastaParser(fasta_file):
            pieces = header.split(separator, 1)
            columns["id"].append(pieces[0])
            # at most one element remains after the id
            columns["title"].append("|".join(pieces[1:]))
            columns["sequence"].append(residues)
    return pd.DataFrame(columns)
def parse_and_exclude(infile, exclusion_list, fastx_type):
    """
    Takes in path to a fasta or fastq. Keeps sequences whose IDs are not in
    the exclusion_list.

    :param infile: path of the sequence file
    :param exclusion_list: container of full record titles to drop
    :param fastx_type: "fasta" or "fastq"; any other value yields ""
    :return: the retained records as one string, formatted like the input
        type (one sequence line per record)
    """
    kept = []
    with open(infile) as infile_handle:
        # Parse based on type
        if fastx_type == "fasta":
            for title, seq in SimpleFastaParser(infile_handle):
                if title not in exclusion_list:
                    kept.append(">{}\n{}\n".format(title, seq))
        elif fastx_type == "fastq":
            for title, seq, qual in FastqGeneralIterator(infile_handle):
                if title not in exclusion_list:
                    kept.append("@{}\n{}\n+\n{}\n".format(title, seq, qual))
    # a single join avoids the quadratic cost of repeated string +=
    return "".join(kept)
def fasta_to_saf(path, compression="infer"):
    r"""
    Build a SAF table (one row per sequence spanning its full length) from a
    fasta file. `path` may be "stdin" to read from standard input.

    # GeneID	Chr	Start	End	Strand
    # http://bioinf.wehi.edu.au/featureCounts/

    GeneID and Chr are both the title up to its first space, Start is 1,
    End is the sequence length and Strand is "+".

    # Useful:
    import re
    record_id = "lcl|NC_018632.1_cds_WP_039228897.1_1 [gene=dnaA] [locus_tag=MASE_RS00005] [protein=chromosomal replication initiator protein DnaA] [protein_id=WP_039228897.1] [location=410..2065] [gbkey=CDS]"
    re.search("\[locus_tag=(\w+)\]", record_id).group(1)
    # 'MASE_RS00005'
    """
    if path == "stdin":
        f = sys.stdin
    else:
        f = get_file_object(path,
                            mode="read",
                            compression=compression,
                            verbose=False)

    saf_data = []
    try:
        for id_record, seq in pv(SimpleFastaParser(f),
                                 "Reading sequences [{}]".format(path)):
            id_record = id_record.split(" ")[0]
            saf_data.append([id_record, id_record, 1, len(seq), "+"])
    finally:
        # never close stdin; close real files even if parsing fails
        if f is not sys.stdin:
            f.close()
    return pd.DataFrame(saf_data,
                        columns=["GeneID", "Chr", "Start", "End", "Strand"])
def output_file_parser(folder, prefix):
    """Collect the set of results from a folder.

    Looks for ``<folder>/<prefix><ext>`` for several extensions and returns a
    dict holding whatever exists: ``contigs`` (.fna), ``transcripts`` (.ffn)
    and ``proteins`` (.faa) as lists of ``(title, sequence)`` tuples,
    ``features`` (.tsv) as returned by ``read_tsv``, and ``genbank`` (.gbk)
    and ``gff`` (.gff) as lists of lines.
    """
    output = {}

    # Collect the FASTA records for contigs, transcripts, and proteins
    for tag, file_ending in [
        ("contigs", ".fna"),
        ("transcripts", ".ffn"),
        ("proteins", ".faa"),
    ]:
        filepath = os.path.join(folder, prefix + file_ending)
        if os.path.exists(filepath):
            # Read in the FASTA
            logging.info("Reading in {}".format(filepath))
            # the context manager closes the handle once all records are read
            with open(filepath, "rt") as handle:
                output[tag] = list(SimpleFastaParser(handle))

    # Record the features from the TSV
    features_fp = os.path.join(folder, prefix + ".tsv")
    if os.path.exists(features_fp):
        logging.info("Reading in {}".format(features_fp))
        output["features"] = read_tsv(features_fp)

    # Also read in the Genbank file
    genbank_fp = os.path.join(folder, prefix + ".gbk")
    if os.path.exists(genbank_fp):
        logging.info("Reading in {}".format(genbank_fp))
        with open(genbank_fp, "rt") as f:
            output["genbank"] = f.readlines()

    # Also read in the GFF file
    gff_fp = os.path.join(folder, prefix + ".gff")
    if os.path.exists(gff_fp):
        logging.info("Reading in {}".format(gff_fp))
        with open(gff_fp, "rt") as f:
            output["gff"] = f.readlines()

    return output
def insert_reference_genes(args, temp_dir, cxn):
    """Prepare reference sequences for exonerate.

    Reads ``args.reference_genes`` as FASTA and stores, for every record, its
    cleaned name, its sequence and the absolute path
    ``<temp_dir>/<cleaned name>.fasta`` through ``db.insert_reference_genes``.
    """
    fasta_path = args.reference_genes
    log.info('Preparing reference genes: {}'.format(fasta_path))

    def _as_row(raw_name, sequence):
        gene = util.clean_name(raw_name)
        return {
            'ref_name': gene,
            'ref_seq': sequence,
            'ref_file': abspath(join(temp_dir, '{}.fasta'.format(gene))),
        }

    with open(fasta_path) as ref_in:
        rows = [_as_row(name, seq) for name, seq in SimpleFastaParser(ref_in)]

    db.insert_reference_genes(cxn, rows)
def insert_exonerate_results(cxn, iteration, results_file):
    """Insert the exonerate results into the database.

    Every FASTA title in ``results_file`` must consist of exactly five
    comma-separated fields: ref_name, taxon_name, contig_name, beg, end.
    Each record is stored with those fields plus ``iteration`` and ``seq``.
    """
    ExonerateHeader = namedtuple(
        'ExonerateHeader',
        ['ref_name', 'taxon_name', 'contig_name', 'beg', 'end'])

    rows = []
    with open(results_file) as results_fasta:
        for title, sequence in SimpleFastaParser(results_fasta):
            # the namedtuple constructor rejects titles with a wrong number
            # of fields
            parsed = ExonerateHeader(*title.split(','))
            row = dict(parsed._asdict(), iteration=iteration, seq=sequence)
            rows.append(row)

    db.insert_exonerate_results(cxn, rows)
def createGFF(file, output):
    '''
    Write one tab-separated GFF-style line per record of a FASTA file.

    The sequence name is the full FASTA title and the end coordinate is the
    fourth '_'-separated field of that title. All other columns are fixed:
    source "bowtie2", feature "contig", start '0', score '40', strand and
    frame ".", and a single-space attribute. Always returns 0.
    '''
    leading = ("bowtie2", "contig", '0')
    trailing = ('40', ".", ".", ' ')

    with open(output, 'w') as gff_handle, open(file) as fasta_handle:
        for seqname, _sequence in SimpleFastaParser(fasta_handle):
            end = seqname.split('_')[3]
            columns = (seqname,) + leading + (end,) + trailing
            gff_handle.write('\t'.join(columns) + '\n')
    return 0
def test_separator(protein_file, separator):
    """Check that `separator` splits every FASTA header into two parts.

    The first parts and the second parts are collected into two sets. An
    AssertionError is raised when a header does not split into exactly two
    parts, or when both sets end up with the same size.

    Returns False when there are more distinct first parts than second
    parts, True otherwise.
    """
    first_parts = set()
    second_parts = set()
    with open(protein_file) as in_handle:
        for header, _seq in SimpleFastaParser(in_handle):
            pieces = header.split(separator)
            assert (
                len(pieces) == 2
            ), 'header {} in file {} does not work with separator {}'.format(
                header, protein_file, separator)
            first_parts.add(pieces[0])
            second_parts.add(pieces[1])

    assert (
        len(first_parts) != len(second_parts)
    ), 'Protein file {} does not look properly formatted (same number of protein IDs and contig IDs)'.format(
        protein_file)

    return len(first_parts) <= len(second_parts)
def extract_kmers(input_path: str, kmer_size: int) -> List[Tuple[str, str]]:
    """Enumerate all k-mers without "N" from the records of a FASTA file.

    Each k-mer is returned as ``(name, sequence)`` where the name is
    ``<record header>|<i+1>:<i+kmer_size+1>`` and ``i`` is the 0-based offset
    of the k-mer in its record.

    Raises AssertionError when ``input_path`` is not an existing file.
    """
    if not isfile(input_path):
        raise AssertionError

    kmers: List[Tuple[str, str]] = []
    with open(input_path) as fasta_handle:
        records = tqdm(
            SimpleFastaParser(fasta_handle), desc="Parsing region", leave=False
        )
        for header, raw_sequence in records:
            sequence = str(raw_sequence)
            offsets = tqdm(
                range(len(raw_sequence) - kmer_size + 1),
                desc="Extracting oligos",
                leave=False,
            )
            for start in offsets:
                stop = start + kmer_size
                kmer = sequence[start:stop]
                if "N" in kmer:
                    continue
                kmers.append((f"{header}|{start+1}:{stop+1}", kmer))
    return kmers
def fasta_to_numpy(path, length):
    """
    Encode the sequences of a fasta file as fixed-width id arrays.

    Args:
        path: of the fasta file
        length: width of every row; longer sequences are truncated and
            shorter ones are padded on the right with '0'

    Returns:
        numpy array of sequences, one row per fasta entry, each symbol mapped
        through AMINO_ACID_TO_ID
    """
    # NOTE(review): the previous code also called rjust() with a width that
    # never exceeded the current length (a no-op) and had debug prints that
    # could not be reached, so padding has always been right-only. Centered
    # padding looks like the original intent - confirm before changing the
    # layout, since consumers of these arrays may depend on it.
    sequences = []
    with open(path) as fasta_file:
        for _title, sequence in SimpleFastaParser(fasta_file):
            padded = sequence[:length].ljust(length, '0')
            sequences.append(
                np.asarray([AMINO_ACID_TO_ID[a] for a in padded]))
    return np.stack(sequences, axis=0)