def create_annotations_table(annotations, output_directory, header,
                             schema_name, loci_info):
    """
    Write the loci annotations to a tab-separated file.

    Parameters
    ----------
    annotations : dict
        Dictionary with loci identifiers as keys and lists with
        information about loci as values. Elements 1 to 8 of each
        list are written when `loci_info` is True, elements 7 and 8
        otherwise. The last element is iterated as a list of
        proteome matches (it may be empty).
    output_directory : str
        Path to the output directory where the table will be
        written to.
    header : list
        File header (first line with column names).
    schema_name : str
        Name of the schema, used as prefix of the output file name.
    loci_info : bool
        True if the user passed the "cds_info.tsv" table to the
        process, False otherwise.

    Returns
    -------
    output_table : str
        Path to the table with loci information.
    """
    rows = [header]
    for locus_id, locus_data in annotations.items():
        row = [locus_id]
        # wider slice also includes the columns that precede the
        # last two selected fields (presumably the "cds_info.tsv"
        # columns -- TODO confirm against the caller)
        if loci_info is True:
            row.extend(locus_data[1:9])
        else:
            row.extend(locus_data[7:9])

        proteome_hits = locus_data[-1]
        if len(proteome_hits) > 0:
            # per match: fields from index 4 onwards followed by the
            # value at index 3 rounded to two decimal places
            per_hit = [hit[4:] + [str(round(hit[3], 2))]
                       for hit in proteome_hits]
            # transpose so that each column groups one field
            # across all matches
            for column in zip(*per_hit):
                joined = ';'.join(map(str, column))
                # a column made only of separators holds no data
                row.append('' if set(joined) == {';'} else joined)

        rows.append(row)

    table_text = '\n'.join(['\t'.join(row) for row in rows])

    output_table = fo.join_paths(output_directory,
                                 ['{0}_annotations.tsv'.format(schema_name)])
    with open(output_table, 'w') as outfile:
        outfile.write(table_text + '\n')

    return output_table
def get_self_scores(fasta_file, output_directory, blast_threads,
                    blastp_path, makeblastdb_path):
    """
    Align a set of protein sequences against itself with BLASTp.

    Parameters
    ----------
    fasta_file : str
        Path to a FASTA file with protein sequences.
    output_directory : str
        Path to the directory where intermediate files will be
        created.
    blast_threads : int
        Number of threads for BLASTp execution.
    blastp_path : str
        Path to the BLASTp executable.
    makeblastdb_path : str
        Path to the makeblastdb executable.

    Returns
    -------
    self_lines_ids : dict
        Dictionary with sequence identifiers as keys and the value
        in the last column of the BLASTp output as values.
    """
    prefix = fo.file_basename(fasta_file, suffix=False)

    # copy of the input whose headers were replaced; `ids_dict` is
    # used below to map the first BLAST column back to the
    # original identifiers
    integer_seqids = fo.join_paths(output_directory,
                                   ['{0}_int.fasta'.format(prefix)])
    ids_dict = integer_headers(fasta_file, integer_seqids)

    blastdb = fo.join_paths(output_directory, ['{0}_db'.format(prefix)])
    bw.make_blast_db(makeblastdb_path, integer_seqids, blastdb, 'prot')

    blastout = fo.join_paths(output_directory, ['self_blastout.tsv'])
    # only one target is requested per query
    # NOTE(review): assumes the single reported hit is the
    # self-alignment -- confirm
    bw.run_blast(blastp_path, blastdb, integer_seqids, blastout,
                 threads=blast_threads, max_targets=1)

    self_lines_ids = {}
    for fields in fo.read_tabular(blastout):
        self_lines_ids[ids_dict[fields[0]]] = fields[-1]

    return self_lines_ids
def cds_batch_extractor(genomes, prodigal_path, temp_directory, index):
    """
    Extract coding sequences from a set of genomes.

    Parameters
    ----------
    genomes : list
        Paths to the FASTA files with genomic sequences.
    prodigal_path : str
        Path to the directory with the Prodigal results. A file
        named "<genome basename>_ORF.txt" is expected for each
        genome.
    temp_directory : str
        Path to the directory where the output files are created.
    index : int or str
        Identifier added to the names of the output files.

    Returns
    -------
    A list with the following elements:
        protein_table : str
            Path to the TSV file to which coding sequences
            info was written.
        cds_file : str
            Path to the FASTA file to which coding sequences
            were written.
        batch_total : int
            Total number of coding sequences extracted from
            the set of input genomes.
    """
    protein_table = fo.join_paths(temp_directory,
                                  ['protein_info_{0}.tsv'.format(index)])
    cds_file = fo.join_paths(temp_directory,
                             ['coding_sequences_{0}.fasta'.format(index)])

    def extract(genome):
        # locate the Prodigal ORF file that matches the genome
        genome_id = fo.file_basename(genome, False)
        orf_file = fo.join_paths(prodigal_path,
                                 ['{0}_ORF.txt'.format(genome_id)])
        return save_extracted_cds(genome, genome_id, orf_file,
                                  protein_table, cds_file)

    batch_total = sum(extract(genome) for genome in genomes)

    return [protein_table, cds_file, batch_total]
def get_proteomes(proteome_ids, output_dir):
    """
    Download reference proteomes from UniProt's FTP.

    Parameters
    ----------
    proteome_ids : list
        List with a sublist per proteome to download. Elements 0
        and 1 of each sublist are combined into the file name and
        element 3 (with its first character uppercased) selects
        the remote subdirectory.
    output_dir : str
        Path to the output directory where downloaded proteomes
        will be saved to.

    Returns
    -------
    proteomes_files : list
        Local paths to the downloaded proteomes.
    """
    print('Downloading reference proteomes...')

    total = len(proteome_ids)
    proteomes_files = []
    for count, record in enumerate(proteome_ids, start=1):
        # only the first character is uppercased, the remainder of
        # the string is kept as is
        domain = record[3][0].upper() + record[3][1:]
        file_name = '{0}_{1}.fasta.gz'.format(record[0], record[1])

        local_path = fo.join_paths(output_dir, [file_name])
        remote_url = fo.join_paths(ct.UNIPROT_PROTEOMES_FTP,
                                   [domain, record[0], file_name])

        fo.download_file(remote_url, local_path)
        proteomes_files.append(local_path)

        # carriage return rewrites the progress message in place
        print('\r', 'Downloaded {0}/{1}'.format(count, total), end='')
        # short pause between consecutive requests
        time.sleep(0.1)

    return proteomes_files
def split_fasta(fasta_path, output_path, num_seqs, filenames):
    """
    Split a FASTA file into files with a fixed number of records.

    Records are streamed from the input file and written in batches
    of `num_seqs`. Records left over after the last full batch are
    written to a final, smaller file.

    Parameters
    ----------
    fasta_path : str
        Path to a FASTA file.
    output_path : str
        Path to the output directory where new FASTA files will be
        created.
    num_seqs : int
        Split FASTA file into files with this number of sequences.
    filenames : gen
        Generator with names to attribute to new files. One name is
        consumed per file that is created.

    Returns
    -------
    splitted_files : list
        List with paths to the new files that were created by
        splitting the input FASTA file.
    """
    splitted_files = []

    def write_batch(batch):
        # take the next name and replace the characters listed in
        # ct.CHAR_REPLACEMENTS before building the output path
        file_name = next(filenames)
        file_name = im.replace_multiple_characters(file_name,
                                                   ct.CHAR_REPLACEMENTS)
        new_file = fo.join_paths(output_path,
                                 ['{0}{1}'.format(file_name, '.fasta')])
        splitted_files.append(new_file)
        write_records(batch, new_file)

    current_recs = []
    for record in SeqIO.parse(fasta_path, 'fasta'):
        current_recs.append(record)
        if len(current_recs) == num_seqs:
            write_batch(current_recs)
            current_recs = []

    # flush the remainder by position rather than by comparing the
    # record identifier with the identifier of the last record, which
    # closed a batch early when identifiers were repeated
    if current_recs:
        write_batch(current_recs)

    return splitted_files
def translate_fastas(fasta_paths, output_directory, translation_table):
    """
    Translate the DNA sequences in a set of FASTA files.

    Parameters
    ----------
    fasta_paths : list
        List with the paths to the FASTA files that contain the
        DNA sequences to translate.
    output_directory : str
        Path to the output directory where FASTA files with
        protein sequences will be written to.
    translation_table : int
        Genetic code used to translate DNA sequences.

    Returns
    -------
    protein_files : list
        List that contains the paths to the FASTA files with
        translated sequences.
    """
    protein_files = []
    for dna_file in fasta_paths:
        dna_records = import_sequences(dna_file)

        proteins = {}
        for seqid, dna in dna_records.items():
            translation = sm.translate_dna(dna, translation_table, 0)
            # keep only the first element of the first item
            # returned by the translation helper
            proteins[seqid] = str(translation[0][0])

        protein_lines = fasta_lines(list(proteins), proteins)

        # output name derives from the input file name
        out_name = fo.file_basename(dna_file)
        out_name = out_name.replace('.fasta', '_protein.fasta')
        out_path = fo.join_paths(output_directory, [out_name])

        fo.write_lines(protein_lines, out_path)
        protein_files.append(out_path)

    return protein_files
def write_gene_list(schema_dir):
    """
    Save the list of gene files in a schema to a file.

    The names of the entries in the schema directory that contain
    ".fasta" are stored in a ".genes_list" file inside that
    directory.

    Parameters
    ----------
    schema_dir : str
        Path to the directory with schema files.

    Returns
    -------
    A list with two elements. A boolean value that is True if the
    file with the list of genes exists after the call and False
    otherwise. The second element is the path to the file.
    """
    gene_files = []
    for entry in os.listdir(schema_dir):
        # substring match, not a suffix match
        if '.fasta' in entry:
            gene_files.append(entry)

    list_path = fo.join_paths(schema_dir, ['.genes_list'])
    fo.pickle_dumper(gene_files, list_path)

    was_created = os.path.isfile(list_path)

    return [was_created, list_path]
def main(input_files, output_directory, protein_table, blast_score_ratio,
         cpu_cores, taxa, proteome_matches, no_cleanup, blast_path):
    """
    Annotate the loci of a schema and write a table with the results.

    Parameters
    ----------
    input_files : str
        Input passed to `pv.check_input_type` to obtain the file
        with the list of paths to the loci FASTA files. It is also
        the location where the ".schema_config" file is searched.
    output_directory : str
        Path to the output directory. A "temp" subdirectory is
        created inside it for intermediate files.
    protein_table : str or None
        Path to the "cds_info.tsv" table, or None to skip it.
    blast_score_ratio : float
        BLAST Score Ratio value passed to `proteome_annotations`.
    cpu_cores : int
        Number of CPU cores/threads to use.
    taxa : list or None
        Taxa names used to select reference proteomes. Proteome
        annotations are skipped when it is None.
    proteome_matches : int
        Maximum number of proteome matches to report.
    no_cleanup : bool
        The temporary directory is removed only if this is False.
    blast_path : str
        Path to the directory with the BLAST executables.
    """
    # create output directory
    fo.create_directory(output_directory)

    # create temp directory
    temp_directory = fo.join_paths(output_directory, ['temp'])
    fo.create_directory(temp_directory)

    # validate input files
    genes_list = fo.join_paths(temp_directory, ['listGenes.txt'])
    genes_list = pv.check_input_type(input_files, genes_list)
    loci_paths = fo.read_lines(genes_list)

    schema_directory = os.path.dirname(loci_paths[0])
    schema_basename = fo.file_basename(schema_directory)
    print('Schema: {0}'.format(schema_directory))
    print('Number of loci: {0}'.format(len(loci_paths)))

    # find annotations based on reference proteomes for species
    proteome_results = {}
    if taxa is not None:
        proteome_results = proteome_annotations(schema_directory,
                                                temp_directory,
                                                taxa,
                                                blast_score_ratio,
                                                cpu_cores,
                                                proteome_matches,
                                                blast_path)

    # find annotations in SPARQL endpoint
    print('\nQuerying UniProt\'s SPARQL endpoint...')
    # child components are passed as a list, like every other call
    # to join_paths in this file
    # NOTE(review): the config is searched in `input_files`, which
    # is only correct if that is the schema directory -- confirm
    config_file = fo.join_paths(input_files, ['.schema_config'])
    if os.path.isfile(config_file):
        # NOTE(review): presumably unpickles the file; only load
        # configuration files from trusted schemas
        config = fo.pickle_loader(config_file)
        translation_table = config.get('translation_table', [11])[0]
    else:
        translation_table = 11

    sparql_results = sparql_annotations(loci_paths,
                                        translation_table,
                                        cpu_cores)

    loci_info = {}
    if protein_table is not None:
        # read "cds_info.tsv" file created by CreateSchema
        table_lines = fo.read_tabular(protein_table)
        for l in table_lines[1:]:
            # create locus identifier based on genome identifier and
            # cds identifier in file
            locus_id = l[0].replace('_', '-')
            locus_id = locus_id + '-protein{0}'.format(l[-2])
            loci_info[locus_id] = l

    annotations = join_annotations(sparql_results, proteome_results,
                                   loci_info)

    # table header
    loci_info_bool = len(loci_info) > 0
    header = ['Locus_ID']
    if loci_info_bool:
        # loci_info can only be populated after table_lines was read
        header += table_lines[0]

    header += ['Uniprot_Name', 'UniProt_URL']

    if len(proteome_results) > 0:
        header.extend(['Proteome_ID', 'Proteome_Product',
                       'Proteome_Gene_Name', 'Proteome_Species',
                       'Proteome_BSR'])

    output_table = create_annotations_table(annotations, output_directory,
                                            header, schema_basename,
                                            loci_info_bool)

    if no_cleanup is False:
        shutil.rmtree(temp_directory)

    print('\n\nThe table with new information can be found at:'
          '\n{0}'.format(output_table))
def proteome_annotations(schema_directory, temp_directory, taxa,
                         blast_score_ratio, cpu_cores, proteome_matches,
                         blast_path, translation_table=11):
    """
    Determine loci annotations based on alignment against UniProt's
    reference proteomes.

    Parameters
    ----------
    schema_directory : str
        Path to the schema's directory. Representative sequences
        are read from its "short" subdirectory.
    temp_directory : str
        Path to the temporary directory where intermediate files
        will be written to.
    taxa : list
        List of taxa scientific names used to select lines from
        the README file that lists the reference proteomes.
    blast_score_ratio : float
        BLAST Score Ratio value passed to `extract_annotations`
        to select the hits that can be reported.
    cpu_cores : int
        Number of threads/processes used to run BLASTp.
    proteome_matches : int
        Maximum number of proteome matches to report.
    blast_path : str
        Path to the directory with the BLAST executables.
    translation_table : int, optional
        Genetic code used to translate the representative
        sequences. Defaults to 11.

    Returns
    -------
    proteome_results : dict
        Value returned by `extract_annotations`, or an empty
        dictionary if no reference proteome was selected.
    """
    # get paths to files with representative sequences
    short_directory = fo.join_paths(schema_directory, ['short'])
    reps_paths = [fo.join_paths(short_directory, [file])
                  for file in os.listdir(short_directory)
                  if file.endswith('.fasta')]

    print('Translating representative sequences...', end='')
    # translate representatives for all loci
    translated_reps = fo.join_paths(temp_directory, ['translated_reps'])
    fo.create_directory(translated_reps)
    reps_protein_files = fao.translate_fastas(reps_paths, translated_reps,
                                              translation_table)
    print('done.')

    print('Downloading list of reference proteomes...', end='')
    remote_readme = fo.join_paths(ct.UNIPROT_PROTEOMES_FTP, ['README'])
    local_readme = fo.join_paths(temp_directory,
                                 ['reference_proteomes_readme.txt'])
    # get README file with list of reference proteomes
    fo.download_file(remote_readme, local_readme)
    print('done.')

    # get lines with proteomes info for species of interest
    readme_lines = fo.read_lines(local_readme, strip=False)
    selected_proteomes = im.contained_terms(readme_lines, taxa)
    selected_proteomes = [line.strip('\n').split('\t')
                          for line in selected_proteomes]
    print('Found {0} reference proteomes for '
          '{1}.'.format(len(selected_proteomes), taxa))

    proteome_results = {}
    if len(selected_proteomes) > 0:
        # create directory to store proteomes
        proteomes_directory = fo.join_paths(temp_directory, ['proteomes'])
        fo.create_directory(proteomes_directory)
        proteomes_files = ur.get_proteomes(selected_proteomes,
                                           proteomes_directory)

        # uncompress files and concatenate into single FASTA
        uncompressed_proteomes = [fo.unzip_file(file)
                                  for file in proteomes_files]
        proteomes_concat = fo.join_paths(proteomes_directory,
                                         ['full_proteome.fasta'])
        proteomes_concat = fo.concatenate_files(uncompressed_proteomes,
                                                proteomes_concat)

        # concatenate protein files to get self-scores
        reps_concat = fo.concatenate_files(
            reps_protein_files,
            fo.join_paths(temp_directory, ['reps_concat.fasta']))

        print('\nDetermining self-score of representatives...', end='')
        blastp_path = os.path.join(blast_path, ct.BLASTP_ALIAS)
        makeblastdb_path = os.path.join(blast_path, ct.MAKEBLASTDB_ALIAS)
        self_scores = fao.get_self_scores(reps_concat, temp_directory,
                                          cpu_cores, blastp_path,
                                          makeblastdb_path)
        print('done.')

        # create BLASTdb with proteome sequences; use the executable
        # resolved from `blast_path`, as done for the self-scores,
        # instead of relying on "makeblastdb" being in PATH
        proteome_blastdb = fo.join_paths(proteomes_directory,
                                         ['proteomes_db'])
        bw.make_blast_db(makeblastdb_path, proteomes_concat,
                         proteome_blastdb, 'prot')

        # BLASTp to determine annotations; the last element of each
        # sublist is the function applied to the preceding arguments
        blast_inputs = [[blastp_path, proteome_blastdb, file,
                         file+'_blastout.tsv', 1, 1, None, None,
                         proteome_matches, None, bw.run_blast]
                        for file in reps_protein_files]

        print('\nBLASTing representatives against proteomes...')
        mo.map_async_parallelizer(blast_inputs,
                                  mo.function_helper,
                                  cpu_cores,
                                  show_progress=True)

        # BLAST outputs are written next to the translated files
        blastout_files = [fo.join_paths(translated_reps, [file])
                          for file in os.listdir(translated_reps)
                          if 'blastout' in file]

        # index proteome file
        indexed_proteome = SeqIO.index(proteomes_concat, 'fasta')

        # process results for each BLASTp
        proteome_results = extract_annotations(blastout_files,
                                               indexed_proteome,
                                               self_scores,
                                               blast_score_ratio,
                                               proteome_matches)

    return proteome_results
def main(input_files, output_directory, cpu_cores, blast_score_ratio,
         minimum_length, translation_table, ptf_path, size_threshold,
         blast_path):
    """
    Adapt an external schema and write summary files.

    The adapted schema is written to `output_directory`. Three files
    are written to the parent of that directory, prefixed with the
    output directory name: "invalid_alleles.txt", "invalid_genes.txt"
    and "summary_stats.txt".

    Parameters
    ----------
    input_files : str
        Input passed to `pv.check_input_type` to obtain the file
        with the list of paths to the gene files to adapt.
    output_directory : str
        Path to the directory where the adapted schema is created.
    cpu_cores : int
        Number of processes used for parallel steps.
    blast_score_ratio : float
        BLAST Score Ratio value passed to `adapt_loci`.
    minimum_length : int
        Minimum sequence length value passed to `adapt_loci`.
    translation_table : int
        Genetic code passed to `adapt_loci`.
    ptf_path : str
        Path to the Prodigal training file (only printed here).
    size_threshold : float
        Sequence size variation threshold passed to `adapt_loci`.
    blast_path : str
        Path to the directory with the BLAST executables.
    """
    print('Adapting schema in the following '
          'directory:\n{0}'.format(os.path.abspath(input_files)))
    print('Prodigal training file:\n{0}'.format(ptf_path))
    print('Number of cores: {0}'.format(cpu_cores))
    print('BLAST Score Ratio: {0}'.format(blast_score_ratio))
    print('Translation table: {0}'.format(translation_table))
    print('Minimum accepted sequence length: {0}'.format(minimum_length))
    print('Size threshold: {0}'.format(size_threshold))

    # define output paths
    schema_path = os.path.abspath(output_directory)
    schema_short_path = fo.join_paths(schema_path, ['short'])

    # create output directories
    fo.create_directory(schema_path)
    fo.create_directory(schema_short_path)

    # list schema gene files
    genes_file = pv.check_input_type(input_files,
                                     os.path.join(output_directory,
                                                  'schema_genes.txt'))

    # import list of schema files
    with open(genes_file, 'r') as gf:
        genes_list = [line.rstrip('\n') for line in gf]
    os.remove(genes_file)

    print('Number of genes to adapt: {0}\n'.format(len(genes_list)))

    print('Determining the total number of alleles and '
          'allele mean length per gene...\n')

    # count number of sequences and mean length per gene
    # the context manager releases the worker processes and
    # Pool.map re-raises exceptions from the workers instead of
    # silently leaving the list of results empty
    with multiprocessing.Pool(processes=cpu_cores) as genes_pool:
        genes_info = genes_pool.map(fao.gene_seqs_info, genes_list)

    # split files according to number of sequences and sequence mean length
    # in each file to pass even groups of sequences to all cores
    even_genes_groups = mo.split_genes_by_core(genes_info, cpu_cores*4,
                                               'seqcount')
    # with few inputs, some sublists might be empty
    even_genes_groups = [i for i in even_genes_groups if len(i) > 0]

    # add common arguments; the last element is the function that
    # receives the preceding elements
    blastp_path = os.path.join(blast_path, ct.BLASTP_ALIAS)
    makeblastdb_path = os.path.join(blast_path, ct.MAKEBLASTDB_ALIAS)
    even_genes_groups = [[i, schema_path, schema_short_path,
                          blast_score_ratio, minimum_length,
                          translation_table, size_threshold,
                          blastp_path, makeblastdb_path,
                          adapt_loci] for i in even_genes_groups]

    print('Adapting {0} genes...\n'.format(len(genes_list)))

    invalid_data = mo.map_async_parallelizer(even_genes_groups,
                                             mo.function_helper,
                                             cpu_cores,
                                             show_progress=True)

    # define paths and write files with list of invalid
    # alleles and invalid genes
    output_schema_basename = os.path.basename(output_directory.rstrip('/'))
    schema_parent_directory = os.path.dirname(schema_path)

    # write file with alleles that were determined to be invalid
    invalid_alleles = [sub[0] for sub in invalid_data]
    invalid_alleles = list(itertools.chain.from_iterable(invalid_alleles))
    invalid_alleles_file = os.path.join(
        schema_parent_directory,
        '{0}_{1}'.format(output_schema_basename, 'invalid_alleles.txt'))

    with open(invalid_alleles_file, 'w') as inv:
        lines = ['{0}: {1}\n'.format(allele[0], allele[1])
                 for allele in invalid_alleles]
        inv.writelines(lines)

    # write file with identifiers of genes that had no valid alleles
    invalid_genes = [sub[1] for sub in invalid_data]
    invalid_genes = list(itertools.chain.from_iterable(invalid_genes))
    invalid_genes_file = os.path.join(
        schema_parent_directory,
        '{0}_{1}'.format(output_schema_basename, 'invalid_genes.txt'))

    with open(invalid_genes_file, 'w') as inv:
        inv.write('\n'.join(invalid_genes))

    # write file with the per-gene summary statistics
    stats_lines = [sub[2] for sub in invalid_data]
    stats_lines = list(itertools.chain.from_iterable(stats_lines))
    stats_lines = ['\t'.join(line) for line in stats_lines]
    stats_genes_file = os.path.join(
        schema_parent_directory,
        '{0}_{1}'.format(output_schema_basename, 'summary_stats.txt'))

    with open(stats_genes_file, 'w') as stats:
        summary_stats_text = '\n'.join(stats_lines)
        stats.write('Gene\tTotal_alleles\tValid_alleles\tNumber_representatives\n')
        stats.write(summary_stats_text)

    print('\n\nNumber of invalid genes: {0}'.format(len(invalid_genes)))
    print('Number of invalid alleles: {0}'.format(len(invalid_alleles)))

    print('\nSuccessfully adapted {0}/{1} genes present in the '
          'input schema.'.format(len(genes_list)-len(invalid_genes),
                                 len(genes_list)))
def adapt_loci(genes, schema_path, schema_short_path, bsr, min_len, table_id, size_threshold, blastp_path, makeblastdb_path):
    """
    Adapts a set of genes/loci from an external schema so that
    that schema can be used with chewBBACA. Removes invalid alleles
    and selects representative alleles to include in the "short"
    directory.

    Parameters
    ----------
    genes : list
        Paths to the files to be processed. The locus identifier
        is the file basename up to the first ".f".
    schema_path : str
        Path to the schema directory. The file with all valid
        alleles ("<locus>.fasta") and a temporary working directory
        ("<locus>_temp") are created here for each locus.
    schema_short_path : str
        Path to the "short" directory where the file with the
        representatives ("<locus>_short.fasta") is written.
    bsr : float
        BLAST Score Ratio value. Passed to `bsr_categorizer`
        together with `bsr+0.1`.
    min_len : int
        Minimum sequence length value (passed to
        `sm.get_seqs_dicts`).
    table_id : int
        Genetic code (passed to `sm.get_seqs_dicts`).
    size_threshold : float
        Sequence size variation threshold (passed to
        `sm.get_seqs_dicts`).
    blastp_path : str
        Path to the BLASTp executable.
    makeblastdb_path : str
        Path to the makeblastdb executable.

    Returns
    -------
    A list with the following elements:
        invalid_alleles : list
            List with the alleles that were determined to be
            invalid (as returned by `sm.get_seqs_dicts`).
        invalid_genes : list
            List with the identifiers of the genes that had no
            valid alleles.
        summary_stats : list of list
            List with one sublist per processed locus. Each
            sublist has four elements (all strings):
                - The identifier of the locus.
                - The number of alleles in the external file.
                - The number of alleles that were a valid CDS.
                - The number of representatives determined by
                  the process.

    Raises
    ------
    ValueError
        If `bw.run_blast` returns a non-empty stderr.

    The function writes the schema files.
    """
    # results accumulated over all loci
    summary_stats = []
    invalid_genes = []
    invalid_alleles = []
    for gene in genes:
        representatives = []
        final_representatives = []

        # get gene basename and identifier
        gene_basename = os.path.basename(gene)
        gene_id = gene_basename.split('.f')[0]

        # create paths to gene files in new schema
        gene_file = fo.join_paths(schema_path, ['{0}{1}'.format(gene_id, '.fasta')])
        gene_short_file = fo.join_paths(schema_short_path, ['{0}{1}'.format(gene_id, '_short.fasta')])

        # create path to temp working directory for current gene
        gene_temp_dir = fo.join_paths(schema_path, ['{0}{1}'.format(gene_id, '_temp')])

        # create temp directory for the current gene
        fo.create_directory(gene_temp_dir)

        # dictionaries mapping gene identifiers to DNA sequences
        # and Protein sequences
        gene_seqs, prot_seqs, gene_invalid, seqids_map, total_sequences = \
            sm.get_seqs_dicts(gene, gene_id, table_id, min_len, size_threshold)
        invalid_alleles.extend(gene_invalid)

        # if locus has no valid CDS sequences,
        # continue to next locus
        if len(prot_seqs) == 0:
            shutil.rmtree(gene_temp_dir)
            invalid_genes.append(gene_id)
            summary_stats.append([gene_id, str(total_sequences), '0', '0'])
            continue

        if len(gene_seqs) > 1:
            # identify DNA sequences that code for same protein
            equal_prots = sm.determine_duplicated_seqs(prot_seqs)

            # get only one identifier per protein
            ids_to_blast = [protids[0] for protein, protids in equal_prots.items()]

            # get longest sequence as first representative
            longest = sm.determine_longest(ids_to_blast, prot_seqs)
            representatives.append(longest)
            final_representatives.append(longest)

            # create FASTA file with distinct protein sequences
            protein_file = fo.join_paths(gene_temp_dir, ['{0}_protein.fasta'.format(gene_id)])
            protein_lines = fao.fasta_lines(ids_to_blast, prot_seqs)
            fo.write_list(protein_lines, protein_file)

            # create blastdb with all distinct proteins
            blastp_db = os.path.join(gene_temp_dir, gene_id)
            bw.make_blast_db(makeblastdb_path, protein_file, blastp_db, 'prot')

            # determine appropriate blastp task (proteins < 30aa need blastp-short)
            blastp_task = bw.determine_blast_task(equal_prots)

            # cycles to BLAST representatives against non-representatives until
            # all non-representatives have a representative
            while len(set(ids_to_blast) - set(representatives)) != 0:
                # create FASTA file with representative sequences
                rep_file = fo.join_paths(gene_temp_dir, ['{0}_rep_protein.fasta'.format(gene_id)])
                rep_protein_lines = fao.fasta_lines(representatives, prot_seqs)
                fo.write_list(rep_protein_lines, rep_file)

                # create file with seqids to BLAST against
                ids_str = im.concatenate_list([str(i) for i in ids_to_blast], '\n')
                ids_file = fo.join_paths(gene_temp_dir, ['{0}_ids.txt'.format(gene_id)])
                fo.write_to_file(ids_str, ids_file, 'w', '')

                # BLAST representatives against non-represented
                blast_output = fo.join_paths(gene_temp_dir, ['{0}_blast_out.tsv'.format(gene_id)])
                # set max_target_seqs to huge number because BLAST only
                # returns 500 hits by default
                blast_stderr = bw.run_blast(blastp_path, blastp_db, rep_file, blast_output, 1, 1, ids_file, blastp_task, 100000, ignore=ct.IGNORE_RAISED)
                # any stderr content that was not ignored aborts the process
                if len(blast_stderr) > 0:
                    raise ValueError(blast_stderr)

                # import BLAST results
                blast_results = fo.read_tabular(blast_output)

                # get self-score for representatives
                # (rows where the first two columns are equal)
                rep_self_scores = {res[1]: res[2] for res in blast_results if res[0] == res[1]}

                # divide results into high, low and hot BSR values
                hitting_high, hitting_low, hotspots, high_reps, low_reps, hot_reps = \
                    bsr_categorizer(blast_results, representatives, rep_self_scores, bsr, bsr+0.1)

                # representatives to discard at the end of this cycle
                excluded_reps = []

                # remove high BSR hits that have representative
                hitting_high = set(hitting_high)
                ids_to_blast = [i for i in ids_to_blast if i not in hitting_high]

                # remove representatives that led to high BSR with subjects that were removed
                prunned_high_reps = {k: [r for r in v if r in ids_to_blast] for k, v in high_reps.items()}
                reps_to_remove = [k for k, v in prunned_high_reps.items() if len(v) == 0]
                excluded_reps.extend(reps_to_remove)

                # determine smallest set of representatives that allow to get all cycle candidates
                # a representative is excluded if none of its hits remains
                # in the set of candidates not yet covered
                excluded = []
                hotspot_reps = set(im.flatten_list(list(hot_reps.values())))
                for rep, hits in hot_reps.items():
                    common = hotspot_reps.intersection(set(hits))
                    if len(common) > 0:
                        hotspot_reps = hotspot_reps - common
                    else:
                        excluded.append(rep)
                excluded_reps.extend(excluded)

                # remove representatives that only led to low BSR
                excluded_reps.extend(low_reps)
                representatives = [rep for rep in representatives if rep not in excluded_reps]
                ids_to_blast = [i for i in ids_to_blast if i not in excluded_reps]

                # determine next representative from candidates
                rep_candidates = list(set(hotspots) - hitting_high)
                # sort to guarantee reproducible results with same datasets
                # (requires identifiers that can be converted to int)
                rep_candidates = sorted(rep_candidates, key=lambda x: int(x))
                representatives, final_representatives = select_candidate(rep_candidates, prot_seqs, ids_to_blast, representatives, final_representatives)

                # remove files created for current gene iteration
                os.remove(rep_file)
                os.remove(blast_output)
                os.remove(ids_file)
        else:
            # single valid allele: it is its own representative
            final_representatives = list(prot_seqs.keys())

        # write schema file with all alleles
        gene_lines = fao.fasta_lines(list(gene_seqs.keys()), gene_seqs)
        fo.write_list(gene_lines, gene_file)

        # get total number of valid sequences
        # NOTE(review): assumes fasta_lines returns one item per record -- confirm
        valid_sequences = len(gene_lines)

        # write schema file with representatives
        # (seqids_map converts protein identifiers into the keys of gene_seqs)
        final_representatives = [seqids_map[rep] for rep in final_representatives]
        gene_rep_lines = fao.fasta_lines(final_representatives, gene_seqs)
        fo.write_list(gene_rep_lines, gene_short_file)

        # get number of representatives
        representatives_number = len(gene_rep_lines)

        summary_stats.append([gene_id, str(total_sequences), str(valid_sequences), str(representatives_number)])

        shutil.rmtree(gene_temp_dir)

    return [invalid_alleles, invalid_genes, summary_stats]