Example #1
def create_annotations_table(annotations, output_directory, header,
                             schema_name, loci_info):
    """ Creates output table with loci information.

        Parameters
        ----------
        annotations : dict
            Dictionary with loci identifiers as keys and
            lists with information about loci as values (each
            list contains the information extracted from the
            "cds_info.tsv" table, if it was passed to the process,
            and the product and URL link for the match found
            through UniProt's SPARQL endpoint).
        output_directory : str
            Path to the output directory where the table
            will be written to.
        header : list
            File header (first line with column names).
        schema_name : str
            Name of the schema.
        loci_info : bool
            True if the user passed the "cds_info.tsv" table
            to the process, False otherwise.

        Returns
        -------
        output_table : str
            Path to the table with loci information.
    """

    new_lines = [header]
    for locus, data in annotations.items():
        new_line = [locus]
        if loci_info is True:
            new_line += data[1:9]
        else:
            new_line += data[7:9]

        if len(data[-1]) > 0:
            relevant_data = [d[4:] + [str(round(d[3], 2))] for d in data[-1]]
            proteome_data = list(zip(*relevant_data))
            proteome_data = [
                ';'.join(list(map(str, d))) for d in proteome_data
            ]
            proteome_data = [
                '' if set(d) == {';'} else d for d in proteome_data
            ]
            new_line.extend(proteome_data)
        new_lines.append(new_line)

    new_lines = ['\t'.join(l) for l in new_lines]
    table_text = '\n'.join(new_lines)

    table_basename = '{0}_annotations.tsv'.format(schema_name)
    output_table = fo.join_paths(output_directory, [table_basename])
    with open(output_table, 'w') as outfile:
        outfile.write(table_text + '\n')

    return output_table
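# A standalone sketch (with made-up match data) of the transpose-and-join
# step above: in data[-1], each match list presumably stores the BSR at
# index 3 and the descriptive fields from index 4 onward; values from
# multiple matches are collapsed into one ';'-separated string per column.
matches = [
    ['q1', 's1', '300', 0.92, 'UP000000625', 'DNA polymerase I', 'polA', 'Escherichia coli'],
    ['q1', 's2', '250', 0.78, 'UP000000318', 'DNA polymerase I', 'polA', 'Salmonella enterica'],
]
relevant_data = [d[4:] + [str(round(d[3], 2))] for d in matches]
# transpose so that each tuple holds the values of one output column
proteome_data = [';'.join(map(str, col)) for col in zip(*relevant_data)]
print(proteome_data)
# ['UP000000625;UP000000318', 'DNA polymerase I;DNA polymerase I',
#  'polA;polA', 'Escherichia coli;Salmonella enterica', '0.92;0.78']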
Example #2
def get_self_scores(fasta_file, output_directory, blast_threads,
                    blastp_path, makeblastdb_path):
    """ Aligns a set of sequences against itself to determine
        the raw score of the self-alignment.

        Parameters
        ----------
        fasta_file : str
            Path to a FASTA file with protein sequences.
        output_directory : str
            Path to the directory where intermediate files
            will be created.
        blast_threads : int
            Number of threads for BLASTp execution.
        blastp_path : str
            Path to the BLASTp executable.
        makeblastdb_path : str
            Path to the makeblastdb executable.

        Returns
        -------
        self_lines_ids : dict
            Dictionary with sequence identifiers as keys
            and the BLASTp raw scores from self-alignment
            as values.
    """

    basename = fo.file_basename(fasta_file, suffix=False)

    integer_seqids = fo.join_paths(output_directory,
                                   ['{0}_int.fasta'.format(basename)])
    ids_dict = integer_headers(fasta_file, integer_seqids)

    blastdb = fo.join_paths(output_directory, ['{0}_db'.format(basename)])
    stderr = bw.make_blast_db(makeblastdb_path, integer_seqids,
                              blastdb, 'prot')

    blastout = fo.join_paths(output_directory, ['self_blastout.tsv'])
    self_results = bw.run_blast(blastp_path, blastdb, integer_seqids,
                                blastout, threads=blast_threads,
                                max_targets=1)

    self_lines = fo.read_tabular(blastout)
    self_lines_ids = {ids_dict[l[0]]: l[-1] for l in self_lines}

    return self_lines_ids
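# The self-scores are typically used downstream to compute the BLAST
# Score Ratio (BSR): a hit's raw score divided by the query's raw
# self-alignment score. A minimal sketch with made-up values (note that
# fo.read_tabular yields strings, hence the cast to float):
self_scores = {'locus1_1': '250', 'locus2_1': '180'}
hits = [('locus1_1', 'subject_A', 230.0), ('locus2_1', 'subject_B', 90.0)]
for query, subject, raw_score in hits:
    bsr = raw_score / float(self_scores[query])
    print('{0} vs {1}: BSR={2:.2f}'.format(query, subject, bsr))
# locus1_1 vs subject_A: BSR=0.92
# locus2_1 vs subject_B: BSR=0.50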
Example #3
def cds_batch_extractor(genomes, prodigal_path, temp_directory, index):
    """ Extracts coding sequences from a set of genomes.

        Parameters
        ----------
        genomes : list
            List with paths to FASTA files with genomic
            sequences.
        prodigal_path : str
            Path to the directory with the files with
            Prodigal results.
        temp_directory : str
            Path to the temporary directory for all files
            and directories that will be read and written.
        index : int
            Index/identifier to add to the output files
            with coding sequences and coding sequences info.

        Returns
        -------
        A list with the following elements:
            protein_table : str
                Path to the TSV file to which coding sequences
                info was written.
            cds_file : str
                Path to the FASTA file to which coding sequences
                were written.
            batch_total : int
                Total number of coding sequences extracted from
                the set of input genomes.
    """

    protein_table = fo.join_paths(temp_directory,
                                  ['protein_info_{0}.tsv'.format(index)])

    cds_file = fo.join_paths(temp_directory,
                             ['coding_sequences_{0}.fasta'.format(index)])

    batch_total = 0
    for g in genomes:
        # determine Prodigal ORF file path for current genome
        identifier = fo.file_basename(g, False)
        orf_file_path = fo.join_paths(prodigal_path,
                                      ['{0}_ORF.txt'.format(identifier)])
        total = save_extracted_cds(g, identifier, orf_file_path, protein_table,
                                   cds_file)
        batch_total += total

    return [protein_table, cds_file, batch_total]
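# Hedged usage sketch with hypothetical paths: the real pipeline
# presumably dispatches batches through a multiprocessing helper, but
# the per-batch call looks the same.
genomes = ['genome1.fasta', 'genome2.fasta', 'genome3.fasta', 'genome4.fasta']
batch_size = 2
batches = [genomes[i:i+batch_size] for i in range(0, len(genomes), batch_size)]
results = [cds_batch_extractor(batch, 'prodigal_results', 'temp', index)
           for index, batch in enumerate(batches)]
# sum the per-batch totals to get the overall number of extracted CDSs
total_cds = sum(result[2] for result in results)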
Example #4
def get_proteomes(proteome_ids, output_dir):
    """ Downloads reference proteomes from UniProt's FTP.
    
        Parameters
        ----------
        proteome_ids : list
            List with one sublist per proteome to download.
            Each sublist holds the information about a
            proteome taken from the README file that lists
            UniProt's reference proteomes.
        output_dir : str
            Path to the output directory where downloaded
            proteomes will be saved.

        Returns
        -------
        proteomes_files : list
            List with the local paths to the downloaded
            proteomes.
    """

    print('Downloading reference proteomes...')
    # construct FTP URLs for each proteome
    downloaded = 0
    proteomes_files = []
    for pid in proteome_ids:
        domain = '{0}{1}'.format(pid[3][0].upper(), pid[3][1:])
        proteome_id = '{0}_{1}'.format(pid[0], pid[1])
        proteome_file = '{0}.fasta.gz'.format(proteome_id)
        local_proteome_file = fo.join_paths(output_dir, [proteome_file])
        proteome_url = fo.join_paths(ct.UNIPROT_PROTEOMES_FTP, [domain, pid[0], proteome_file])
        res = fo.download_file(proteome_url, local_proteome_file)
        proteomes_files.append(local_proteome_file)
        downloaded += 1
        print('\r', 'Downloaded {0}/{1}'.format(downloaded, len(proteome_ids)), end='')
        time.sleep(0.1)

    return proteomes_files
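# Worked example of the URL construction above, assuming the README row
# layout the code relies on (pid[0] is the proteome identifier, pid[1]
# the taxon identifier, pid[3] the superregnum) and a plausible base URL;
# the real base is the value of ct.UNIPROT_PROTEOMES_FTP.
pid = ['UP000000625', '83333', 'ECOLI', 'bacteria']
base = ('https://ftp.uniprot.org/pub/databases/uniprot/current_release/'
        'knowledgebase/reference_proteomes')
domain = '{0}{1}'.format(pid[3][0].upper(), pid[3][1:])    # 'Bacteria'
proteome_file = '{0}_{1}.fasta.gz'.format(pid[0], pid[1])  # 'UP000000625_83333.fasta.gz'
proteome_url = '/'.join([base, domain, pid[0], proteome_file])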
Example #5
def split_fasta(fasta_path, output_path, num_seqs, filenames):
    """ Splits a FASTA file.

        Parameters
        ----------
        fasta_path : str
            Path to a FASTA file.
        output_path : str
            Path to the output directory where new FASTA
            files will be created.
        num_seqs : int
            Split FASTA file into files with this number
            of sequences.
        filenames : generator
            Generator with the names to assign to the new
            files.

        Returns
        -------
        splitted_files : list
            List with paths to the new files that were
            created by splitting the input FASTA file.
    """

    splitted_files = []
    current_recs = []
    records = list(SeqIO.parse(fasta_path, 'fasta'))
    for i, record in enumerate(records):
        current_recs.append(record)
        # write a new file when the batch is full or after the last record
        # (comparing indices instead of record identifiers, which may not
        # be unique)
        if len(current_recs) == num_seqs or i == len(records)-1:
            file_name = next(filenames)
            file_name = im.replace_multiple_characters(file_name, ct.CHAR_REPLACEMENTS)

            new_file = fo.join_paths(output_path,
                                     ['{0}{1}'.format(file_name, '.fasta')])

            splitted_files.append(new_file)

            write_records(current_recs, new_file)

            current_recs = []

    return splitted_files
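# Hedged usage sketch: any generator of base names works for filenames.
# The input file and output directory below are hypothetical.
import itertools

# endless generator of base names: chunk_1, chunk_2, ...
filenames = ('chunk_{0}'.format(i) for i in itertools.count(1))
# creates splits/chunk_1.fasta, splits/chunk_2.fasta, ... with at most
# 50 records each (the last file holds the remainder)
new_files = split_fasta('sequences.fasta', 'splits', 50, filenames)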
Example #6
def translate_fastas(fasta_paths, output_directory, translation_table):
    """ Translates DNA sequences in a set of FASTA files.

        Parameters
        ----------
        fasta_paths : list
            List with the paths to the FASTA files that contain
            the DNA sequences to translate.
        output_directory : str
            Path to the output directory where FASTA files with
            protein sequences will be written to.
        translation_table : int
            Genetic code used to translate DNA sequences.

        Returns
        -------
        protein_files : list
            List that contains the paths to the FASTA files with
            translated sequences.
    """

    protein_files = []
    for path in fasta_paths:
        records = import_sequences(path)
        translated_records = {seqid: str(sm.translate_dna(seq, translation_table, 0)[0][0])
                              for seqid, seq in records.items()}
        translated_lines = fasta_lines(list(translated_records.keys()),
                                       translated_records)

        basename = fo.file_basename(path).replace('.fasta', '_protein.fasta')
        prot_file = fo.join_paths(output_directory, [basename])

        fo.write_lines(translated_lines, prot_file)
        protein_files.append(prot_file)

    return protein_files
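# For context, a standalone sketch of the per-sequence translation step
# using Biopython directly with genetic code 11; sm.translate_dna
# presumably adds validation and strand handling that this sketch omits.
from Bio.Seq import Seq

dna = 'ATGGCTAAAGAGAAATTTGAACGTTAA'
protein = str(Seq(dna).translate(table=11)).rstrip('*')
print(protein)  # MAKEKFER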
Example #7
def write_gene_list(schema_dir):
    """ Creates list with gene files in a schema and
        uses the pickle module to save the list to a file.

        Parameters
        ----------
        schema_dir : str
            Path to the directory with schema files.

        Returns
        -------
        A list with two elements: a boolean value that
        is True if the file with the list of genes was
        created (False otherwise) and the path to the
        created file.
    """

    schema_files = [
        file for file in os.listdir(schema_dir) if '.fasta' in file
    ]
    schema_list_file = fo.join_paths(schema_dir, ['.genes_list'])
    fo.pickle_dumper(schema_files, schema_list_file)

    return [os.path.isfile(schema_list_file), schema_list_file]
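# Assuming fo.pickle_dumper wraps the standard pickle module, the list
# can be read back like this (hypothetical schema path):
import pickle

schema_list_file = '/path/to/schema/.genes_list'
with open(schema_list_file, 'rb') as infile:
    gene_files = pickle.load(infile)  # list of FASTA file names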
Example #8
def main(input_files, output_directory, protein_table, blast_score_ratio,
         cpu_cores, taxa, proteome_matches, no_cleanup, blast_path):

    # create output directory
    fo.create_directory(output_directory)

    # create temp directory
    temp_directory = fo.join_paths(output_directory, ['temp'])
    fo.create_directory(temp_directory)

    # validate input files
    genes_list = fo.join_paths(temp_directory, ['listGenes.txt'])
    genes_list = pv.check_input_type(input_files, genes_list)
    loci_paths = fo.read_lines(genes_list)

    schema_directory = os.path.dirname(loci_paths[0])
    schema_basename = fo.file_basename(schema_directory)
    print('Schema: {0}'.format(schema_directory))
    print('Number of loci: {0}'.format(len(loci_paths)))

    # find annotations based on reference proteomes for species
    proteome_results = {}
    if taxa is not None:
        proteome_results = proteome_annotations(schema_directory,
                                                temp_directory,
                                                taxa,
                                                blast_score_ratio,
                                                cpu_cores,
                                                proteome_matches,
                                                blast_path)

    # find annotations in SPARQL endpoint
    print('\nQuerying UniProt\'s SPARQL endpoint...')
    config_file = fo.join_paths(input_files, ['.schema_config'])
    if os.path.isfile(config_file) is True:
        config = fo.pickle_loader(config_file)
        translation_table = config.get('translation_table', [11])[0]
    else:
        translation_table = 11
    sparql_results = sparql_annotations(loci_paths,
                                        translation_table,
                                        cpu_cores)

    loci_info = {}
    if protein_table is not None:
        # read "cds_info.tsv" table created by CreateSchema
        table_lines = fo.read_tabular(protein_table)
        for l in table_lines[1:]:
            # create locus identifier based on genome identifier and
            # cds identifier in file
            locus_id = l[0].replace('_', '-')
            locus_id = locus_id + '-protein{0}'.format(l[-2])
            loci_info[locus_id] = l

    annotations = join_annotations(sparql_results, proteome_results, loci_info)

    # table header
    header = ['Locus_ID']
    if len(loci_info) > 0:
        header += table_lines[0]

    header += ['Uniprot_Name', 'UniProt_URL']

    if len(proteome_results) > 0:
        header.extend(['Proteome_ID', 'Proteome_Product',
                       'Proteome_Gene_Name', 'Proteome_Species',
                       'Proteome_BSR'])

    loci_info_bool = len(loci_info) > 0
    output_table = create_annotations_table(annotations, output_directory,
                                            header, schema_basename,
                                            loci_info_bool)

    if no_cleanup is False:
        shutil.rmtree(temp_directory)

    print('\n\nThe table with new information can be found at:'
          '\n{0}'.format(output_table))
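# Worked example (made-up row; only the first and second-to-last columns
# matter here) of the locus identifier built from each "cds_info.tsv"
# line above: underscores in the genome identifier become hyphens and
# the CDS number is appended.
row = ['GCA_000007265', 'contig_1', '100', '1000', '1', '+']
locus_id = row[0].replace('_', '-') + '-protein{0}'.format(row[-2])
print(locus_id)  # GCA-000007265-protein1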
Example #9
def proteome_annotations(schema_directory, temp_directory, taxa,
                         blast_score_ratio, cpu_cores, proteome_matches,
                         blast_path):
    """ Determines loci annotations based on alignment against
        UniProt's reference proteomes.

        Parameters
        ----------
        schema_directory : str
            Path to the schema's directory.
        temp_directory : str
            Path to the temporary directory where intermediate
            files will be written to.
        taxa : list
            List of taxa scientific names. The process will
            search for reference proteomes whose "Species Name"
            field contains any of the provided taxa names.
        blast_score_ratio : float
            BLAST Score Ratio value. Hits with a BSR value
            equal to or greater than this value are considered
            high-scoring hits that can be included in the final
            table, up to the maximum number of matches to
            report.
        cpu_cores : int
            Number of threads used to run BLASTp.
        proteome_matches : int
            Maximum number of proteome matches to report.
        blast_path : str
            Path to BLAST executables.

        Returns
        -------
        proteome_results : dict
            Dictionary with loci identifiers as keys and a list
            with information about loci retrieved from the most
            similar records in UniProt's reference proteomes.
    """

    # get paths to files with representative sequences
    short_directory = fo.join_paths(schema_directory, ['short'])
    reps_paths = [fo.join_paths(short_directory, [file])
                  for file in os.listdir(short_directory)
                  if file.endswith('.fasta') is True]

    print('Translating representative sequences...', end='')
    # translate representatives for all loci
    translated_reps = fo.join_paths(temp_directory, ['translated_reps'])
    fo.create_directory(translated_reps)

    reps_protein_files = fao.translate_fastas(reps_paths, translated_reps, 11)
    print('done.')

    print('Downloading list of reference proteomes...', end='')
    remote_readme = fo.join_paths(ct.UNIPROT_PROTEOMES_FTP, ['README'])
    local_readme = fo.join_paths(temp_directory,
                                 ['reference_proteomes_readme.txt'])

    # get README file with list of reference proteomes
    res = fo.download_file(remote_readme, local_readme)
    print('done.')

    # get lines with proteomes info for species of interest
    readme_lines = fo.read_lines(local_readme, strip=False)

    selected_proteomes = im.contained_terms(readme_lines, taxa)
    selected_proteomes = [line.strip('\n') for line in selected_proteomes]
    selected_proteomes = [line.split('\t') for line in selected_proteomes]
    print('Found {0} reference proteomes for '
          '{1}.'.format(len(selected_proteomes), taxa))
    proteome_results = {}
    if len(selected_proteomes) > 0:
        # create directory to store proteomes
        proteomes_directory = fo.join_paths(temp_directory, ['proteomes'])
        fo.create_directory(proteomes_directory)

        proteomes_files = ur.get_proteomes(selected_proteomes,
                                           proteomes_directory)

        # uncompress files and concatenate into single FASTA
        uncompressed_proteomes = [fo.unzip_file(file) for file in proteomes_files]
        proteomes_concat = fo.join_paths(proteomes_directory,
                                         ['full_proteome.fasta'])
        proteomes_concat = fo.concatenate_files(uncompressed_proteomes,
                                                proteomes_concat)

        # get self-scores
        # concatenate protein files
        reps_concat = fo.concatenate_files(reps_protein_files,
                                           fo.join_paths(temp_directory,
                                                         ['reps_concat.fasta']))

        print('\nDetermining self-score of representatives...', end='')
        blastp_path = os.path.join(blast_path, ct.BLASTP_ALIAS)
        makeblastdb_path = os.path.join(blast_path, ct.MAKEBLASTDB_ALIAS)
        self_scores = fao.get_self_scores(reps_concat, temp_directory, cpu_cores,
                                          blastp_path, makeblastdb_path)
        print('done.')

        # create BLASTdb with proteome sequences
        proteome_blastdb = fo.join_paths(proteomes_directory,
                                         ['proteomes_db'])
        stderr = bw.make_blast_db(makeblastdb_path, proteomes_concat,
                                  proteome_blastdb, 'prot')

        # BLASTp to determine annotations
        blast_inputs = [[blastp_path, proteome_blastdb, file, file+'_blastout.tsv',
                         1, 1, None, None, proteome_matches, None, bw.run_blast]
                        for file in reps_protein_files]

        print('\nBLASTing representatives against proteomes...')
        blast_results = mo.map_async_parallelizer(blast_inputs,
                                                  mo.function_helper,
                                                  cpu_cores,
                                                  show_progress=True)

        blastout_files = [fo.join_paths(translated_reps, [file])
                          for file in os.listdir(translated_reps)
                          if 'blastout' in file]

        # index proteome file
        indexed_proteome = SeqIO.index(proteomes_concat, 'fasta')

        # process results for each BLASTp
        proteome_results = extract_annotations(blastout_files,
                                               indexed_proteome,
                                               self_scores,
                                               blast_score_ratio,
                                               proteome_matches)

    return proteome_results
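# Hedged sketch of the README filtering step above: im.contained_terms
# presumably keeps the lines that contain any of the taxa names, which
# a simple comprehension reproduces (rows truncated and made up).
readme_lines = [
    'UP000000625\t83333\tECOLI\tbacteria\tEscherichia coli (strain K12)\n',
    'UP000005640\t9606\tHUMAN\teukaryota\tHomo sapiens (Human)\n',
]
taxa = ['Escherichia coli']
selected = [line for line in readme_lines
            if any(taxon in line for taxon in taxa)]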
Example #10
def main(input_files, output_directory, cpu_cores, blast_score_ratio,
         minimum_length, translation_table, ptf_path, size_threshold,
         blast_path):

    print('Adapting schema in the following '
          'directory:\n{0}'.format(os.path.abspath(input_files)))
    print('Prodigal training file:\n{0}'.format(ptf_path))
    print('Number of cores: {0}'.format(cpu_cores))
    print('BLAST Score Ratio: {0}'.format(blast_score_ratio))
    print('Translation table: {0}'.format(translation_table))
    print('Minimum accepted sequence length: {0}'.format(minimum_length))
    print('Size threshold: {0}'.format(size_threshold))

    # define output paths
    schema_path = os.path.abspath(output_directory)
    schema_short_path = fo.join_paths(schema_path, ['short'])

    # create output directories
    # check if they exist first
    fo.create_directory(schema_path)
    fo.create_directory(schema_short_path)

    # list schema gene files
    genes_file = pv.check_input_type(input_files,
                                     os.path.join(output_directory, 'schema_genes.txt'))

    # import list of schema files
    with open(genes_file, 'r') as gf:
        genes_list = [line.rstrip('\n') for line in gf]
    os.remove(genes_file)

    print('Number of genes to adapt: {0}\n'.format(len(genes_list)))

    print('Determining the total number of alleles and '
          'allele mean length per gene...\n')

    # count number of sequences and mean length per gene
    genes_info = []
    genes_pools = multiprocessing.Pool(processes=cpu_cores)
    gp = genes_pools.map_async(fao.gene_seqs_info, genes_list,
                               callback=genes_info.extend)
    gp.wait()

    # split files according to number of sequences and sequence mean length
    # in each file to pass even groups of sequences to all cores
    even_genes_groups = mo.split_genes_by_core(genes_info, cpu_cores*4,
                                               'seqcount')
    # with few inputs, some sublists might be empty
    even_genes_groups = [i for i in even_genes_groups if len(i) > 0]

    # add common arguments
    blastp_path = os.path.join(blast_path, ct.BLASTP_ALIAS)
    makeblastdb_path = os.path.join(blast_path, ct.MAKEBLASTDB_ALIAS)
    even_genes_groups = [[i, schema_path, schema_short_path,
                          blast_score_ratio, minimum_length,
                          translation_table, size_threshold,
                          blastp_path, makeblastdb_path,
                          adapt_loci] for i in even_genes_groups]

    print('Adapting {0} genes...\n'.format(len(genes_list)))

    invalid_data = mo.map_async_parallelizer(even_genes_groups,
                                             mo.function_helper,
                                             cpu_cores,
                                             show_progress=True)

    # define paths and write files with list of invalid
    # alleles and invalid genes
    output_schema_basename = os.path.basename(output_directory.rstrip('/'))
    schema_parent_directory = os.path.dirname(schema_path)

    # write file with alleles that were determined to be invalid
    invalid_alleles = [sub[0] for sub in invalid_data]
    invalid_alleles = list(itertools.chain.from_iterable(invalid_alleles))
    invalid_alleles_file = os.path.join(schema_parent_directory,
                                        '{0}_{1}'.format(output_schema_basename, 'invalid_alleles.txt'))

    with open(invalid_alleles_file, 'w') as inv:
        lines = ['{0}: {1}\n'.format(allele[0], allele[1]) for allele in invalid_alleles]
        inv.writelines(lines)

    # write file with identifiers of genes that had no valid alleles
    invalid_genes = [sub[1] for sub in invalid_data]
    invalid_genes = list(itertools.chain.from_iterable(invalid_genes))
    invalid_genes_file = os.path.join(schema_parent_directory,
                                      '{0}_{1}'.format(output_schema_basename, 'invalid_genes.txt'))

    with open(invalid_genes_file, 'w') as inv:
        invalid_seqids = '\n'.join(invalid_genes)
        inv.write(invalid_seqids)

    stats_lines = [sub[2] for sub in invalid_data]
    stats_lines = list(itertools.chain.from_iterable(stats_lines))
    stats_lines = ['\t'.join(line) for line in stats_lines]
    stats_genes_file = os.path.join(schema_parent_directory,
                                    '{0}_{1}'.format(output_schema_basename,
                                                     'summary_stats.txt'))

    with open(stats_genes_file, 'w') as stats:
        summary_stats_text = '\n'.join(stats_lines)
        stats.write('Gene\tTotal_alleles\tValid_alleles\tNumber_representatives\n')
        stats.write(summary_stats_text)

    print('\n\nNumber of invalid genes: {0}'.format(len(invalid_genes)))
    print('Number of invalid alleles: {0}'.format(len(invalid_alleles)))

    print('\nSuccessfully adapted {0}/{1} genes present in the '
          'input schema.'.format(len(genes_list)-len(invalid_genes),
                                 len(genes_list)))
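# Hedged sketch of the balancing idea behind mo.split_genes_by_core:
# greedily assign each gene to the currently lightest group so that all
# groups carry a similar total sequence count. The real helper may
# differ in details; the genes_info layout is assumed.
def split_by_load(genes_info, num_groups):
    """Greedy longest-processing-time assignment by sequence count."""
    groups = [[] for _ in range(num_groups)]
    loads = [0] * num_groups
    # genes_info: one [gene_path, seqcount, mean_length] entry per gene
    for info in sorted(genes_info, key=lambda i: i[1], reverse=True):
        lightest = loads.index(min(loads))
        groups[lightest].append(info)
        loads[lightest] += info[1]
    return groups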
Example #11
def adapt_loci(genes, schema_path, schema_short_path, bsr, min_len,
               table_id, size_threshold, blastp_path, makeblastdb_path):
    """ Adapts a set of genes/loci from an external schema so that
        the schema can be used with chewBBACA. Removes invalid alleles
        and selects representative alleles to include in the "short" directory.

        Parameters
        ----------
        genes : list
            List with paths to the gene files to be processed.
        schema_path : str
            Path to the schema directory.
        schema_short_path : str
            Path to the "short" directory.
        bsr : float
            BLAST Score Ratio value.
        min_len : int
            Minimum sequence length value.
        table_id : int
            Genetic code used to translate alleles.
        size_threshold : float
            Sequence size variation threshold.
        blastp_path : str
            Path to the BLASTp executable.
        makeblastdb_path : str
            Path to the makeblastdb executable.

        Returns
        -------
        invalid_alleles : list
            List with the identifiers of the alleles that were
            determined to be invalid.
        invalid_genes : list
            List with the identifiers of the genes that had no
            valid alleles.
        summary_stats : list of list
            List with one sublist per processed locus. Each
            sublist has four elements:

            - The identifier of the locus.
            - The number of alleles in the external file.
            - The number of alleles that were a valid CDS.
            - The number of representatives determined by
              the process.

        The function also writes the schema files to the schema
        and "short" directories.
    """

    summary_stats = []
    invalid_genes = []
    invalid_alleles = []
    for gene in genes:

        representatives = []
        final_representatives = []

        # get gene basename and identifier
        gene_basename = os.path.basename(gene)
        gene_id = gene_basename.split('.f')[0]

        # create paths to gene files in new schema
        gene_file = fo.join_paths(schema_path,
                                  ['{0}{1}'.format(gene_id, '.fasta')])

        gene_short_file = fo.join_paths(schema_short_path,
                                        ['{0}{1}'.format(gene_id, '_short.fasta')])

        # create path to temp working directory for current gene
        gene_temp_dir = fo.join_paths(schema_path,
                                      ['{0}{1}'.format(gene_id, '_temp')])

        # create temp directory for the current gene
        fo.create_directory(gene_temp_dir)

        # dictionaries mapping gene identifiers to DNA sequences
        # and Protein sequences
        gene_seqs, prot_seqs, gene_invalid, seqids_map, total_sequences = \
            sm.get_seqs_dicts(gene, gene_id, table_id, min_len, size_threshold)
        invalid_alleles.extend(gene_invalid)

        # if locus has no valid CDS sequences,
        # continue to next locus
        if len(prot_seqs) == 0:
            shutil.rmtree(gene_temp_dir)
            invalid_genes.append(gene_id)
            summary_stats.append([gene_id, str(total_sequences), '0', '0'])
            continue

        if len(gene_seqs) > 1:
            # identify DNA sequences that code for same protein
            equal_prots = sm.determine_duplicated_seqs(prot_seqs)

            # get only one identifier per protein
            ids_to_blast = [protids[0] for protein, protids in equal_prots.items()]

            # get longest sequence as first representative
            longest = sm.determine_longest(ids_to_blast, prot_seqs)
            representatives.append(longest)
            final_representatives.append(longest)

            # create FASTA file with distinct protein sequences
            protein_file = fo.join_paths(gene_temp_dir,
                                         ['{0}_protein.fasta'.format(gene_id)])
            protein_lines = fao.fasta_lines(ids_to_blast, prot_seqs)
            fo.write_list(protein_lines, protein_file)

            # create blastdb with all distinct proteins
            blastp_db = os.path.join(gene_temp_dir, gene_id)
            bw.make_blast_db(makeblastdb_path, protein_file, blastp_db, 'prot')

            # determine appropriate blastp task (proteins < 30aa need blastp-short)
            blastp_task = bw.determine_blast_task(equal_prots)

            # cycles to BLAST representatives against non-representatives until
            # all non-representatives have a representative
            while len(set(ids_to_blast) - set(representatives)) != 0:

                # create FASTA file with representative sequences
                rep_file = fo.join_paths(gene_temp_dir,
                                         ['{0}_rep_protein.fasta'.format(gene_id)])
                rep_protein_lines = fao.fasta_lines(representatives, prot_seqs)
                fo.write_list(rep_protein_lines, rep_file)

                # create file with seqids to BLAST against
                ids_str = im.concatenate_list([str(i) for i in ids_to_blast], '\n')
                ids_file = fo.join_paths(gene_temp_dir,
                                         ['{0}_ids.txt'.format(gene_id)])
                fo.write_to_file(ids_str, ids_file, 'w', '')

                # BLAST representatives against non-represented
                blast_output = fo.join_paths(gene_temp_dir,
                                             ['{0}_blast_out.tsv'.format(gene_id)])
                # set max_target_seqs to huge number because BLAST only
                # returns 500 hits by default

                blast_stderr = bw.run_blast(blastp_path, blastp_db, rep_file,
                                            blast_output, 1, 1, ids_file,
                                            blastp_task, 100000, ignore=ct.IGNORE_RAISED)
                if len(blast_stderr) > 0:
                    raise ValueError(blast_stderr)

                # import BLAST results
                blast_results = fo.read_tabular(blast_output)

                # get self-score for representatives
                rep_self_scores = {res[1]: res[2] for res in blast_results
                                   if res[0] == res[1]}

                # divide results into high, low and hot BSR values
                hitting_high, hitting_low, hotspots, high_reps, low_reps, hot_reps = \
                    bsr_categorizer(blast_results, representatives,
                                    rep_self_scores, bsr, bsr+0.1)

                excluded_reps = []

                # remove high BSR hits that have representative
                hitting_high = set(hitting_high)
                ids_to_blast = [i for i in ids_to_blast if i not in hitting_high]

                # remove representatives that led to high BSR with subjects that were removed
                pruned_high_reps = {k: [r for r in v if r in ids_to_blast] for k, v in high_reps.items()}
                reps_to_remove = [k for k, v in pruned_high_reps.items() if len(v) == 0]

                excluded_reps.extend(reps_to_remove)

                # determine smallest set of representatives that allow to get all cycle candidates
                excluded = []
                hotspot_reps = set(im.flatten_list(list(hot_reps.values())))
                for rep, hits in hot_reps.items():
                    common = hotspot_reps.intersection(set(hits))
                    if len(common) > 0:
                        hotspot_reps = hotspot_reps - common
                    else:
                        excluded.append(rep)

                excluded_reps.extend(excluded)

                # remove representatives that only led to low BSR
                excluded_reps.extend(low_reps)

                representatives = [rep for rep in representatives if rep not in excluded_reps]
                ids_to_blast = [i for i in ids_to_blast if i not in excluded_reps]

                # determine next representative from candidates
                rep_candidates = list(set(hotspots) - hitting_high)
                # sort to guarantee reproducible results with same datasets
                rep_candidates = sorted(rep_candidates, key=lambda x: int(x))
                representatives, final_representatives = select_candidate(rep_candidates,
                                                                          prot_seqs,
                                                                          ids_to_blast,
                                                                          representatives,
                                                                          final_representatives)

                # remove files created for current gene iteration
                os.remove(rep_file)
                os.remove(blast_output)
                os.remove(ids_file)

        else:
            final_representatives = list(prot_seqs.keys())

        # write schema file with all alleles
        gene_lines = fao.fasta_lines(list(gene_seqs.keys()), gene_seqs)
        fo.write_list(gene_lines, gene_file)

        # get total number of valid sequences
        valid_sequences = len(gene_lines)

        # write schema file with representatives
        final_representatives = [seqids_map[rep] for rep in final_representatives]
        gene_rep_lines = fao.fasta_lines(final_representatives, gene_seqs)
        fo.write_list(gene_rep_lines, gene_short_file)

        # get number of representatives
        representatives_number = len(gene_rep_lines)

        summary_stats.append([gene_id,
                              str(total_sequences),
                              str(valid_sequences),
                              str(representatives_number)])

        shutil.rmtree(gene_temp_dir)

    return [invalid_alleles, invalid_genes, summary_stats]
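# Toy illustration (not the real bsr_categorizer) of the three BSR bands
# used in the loop above, given the thresholds bsr and bsr+0.1: "high"
# hits are already well represented, "low" hits are unrelated to the
# current representatives, and hits in between are "hot" candidates for
# the next representative.
bsr = 0.6
raw_score, self_score = 130.0, 200.0
ratio = raw_score / self_score  # 0.65
if ratio >= bsr + 0.1:
    band = 'high'
elif ratio < bsr:
    band = 'low'
else:
    band = 'hot'
print(band)  # hot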