Code Example #1
File: down_schema.py  Project: anaSDCorreia/chewBBACA
def download_compressed(zip_uri, species_name, schema_name, download_folder,
                        headers_get):
    """ Downloads and extracts a ZIP archive with a ready-to-use
        version of a schema in the Chewie-NS.

        Parameters
        ----------
        zip_uri : str
            Endpoint URL to make the request to download
            the compressed schema.
        species_name : str
            Scientific name of the schema species.
        schema_name : str
            Name of the schema in the Chewie-NS.
        download_folder : str
            Path to the directory to which the ZIP archive
            will be saved.
        headers_get : dict
            HTTP headers for GET requests.

        Returns
        -------
        schema_path : str
            Path to the directory where the ZIP archive
            contents were extracted.
    """

    zip_name = '{0}{1}_{2}.zip'.format(species_name[0].lower(),
                                       species_name.split(' ')[-1],
                                       schema_name)
    schema_path = os.path.join(download_folder, zip_name.split('.zip')[0])
    fo.create_directory(schema_path)

    # download ZIP archive
    url, zip_response = cr.simple_get_request(
        zip_uri, headers_get, parameters={'request_type': 'download'})
    zip_path = os.path.join(schema_path, zip_name)
    # use a context manager so the file handle is closed
    with open(zip_path, 'wb') as zip_file:
        zip_file.write(zip_response.content)
    # uncompress
    print('Decompressing schema...')
    shutil.unpack_archive(zip_path, extract_dir=schema_path)
    # delete ZIP
    os.remove(zip_path)

    return schema_path
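
A hedged usage sketch for this helper follows; the endpoint URL, token, and paths are illustrative placeholders (not real Chewie-NS values), and cr/fo are chewBBACA's own utility modules.

# hypothetical invocation; URL, token and paths are placeholders
headers_get = {'Authorization': 'Bearer <token>',
               'accept': 'application/octet-stream'}
schema_path = download_compressed(
    zip_uri='https://example.org/NS/api/species/1/schemas/1/zip',
    species_name='Streptococcus agalactiae',
    schema_name='schema_seed',
    download_folder='/tmp/schemas',
    headers_get=headers_get)
# the ZIP name would be 'sagalactiae_schema_seed.zip' and the archive
# is extracted to /tmp/schemas/sagalactiae_schema_seed
print(schema_path)
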
Code Example #2
def main(input_files, output_directory, protein_table, blast_score_ratio,
         cpu_cores, taxa, proteome_matches, no_cleanup, blast_path):

    # create output directory
    fo.create_directory(output_directory)

    # create temp directory
    temp_directory = fo.join_paths(output_directory, ['temp'])
    fo.create_directory(temp_directory)

    # validate input files
    genes_list = fo.join_paths(temp_directory, ['listGenes.txt'])
    genes_list = pv.check_input_type(input_files, genes_list)
    loci_paths = fo.read_lines(genes_list)

    schema_directory = os.path.dirname(loci_paths[0])
    schema_basename = fo.file_basename(schema_directory)
    print('Schema: {0}'.format(schema_directory))
    print('Number of loci: {0}'.format(len(loci_paths)))

    # find annotations based on reference proteomes for species
    proteome_results = {}
    if taxa is not None:
        proteome_results = proteome_annotations(schema_directory,
                                                temp_directory,
                                                taxa,
                                                blast_score_ratio,
                                                cpu_cores,
                                                proteome_matches,
                                                blast_path)

    # find annotations in SPARQL endpoint
    print('\nQuerying UniProt\'s SPARQL endpoint...')
    config_file = fo.join_paths(input_files, ['.schema_config'])
    if os.path.isfile(config_file):
        config = fo.pickle_loader(config_file)
        translation_table = config.get('translation_table', [11])[0]
    else:
        translation_table = 11
    sparql_results = sparql_annotations(loci_paths,
                                        translation_table,
                                        cpu_cores)

    loci_info = {}
    if protein_table is not None:
        # read the "cds_info.tsv" table created by CreateSchema
        table_lines = fo.read_tabular(protein_table)
        for line in table_lines[1:]:
            # create locus identifier based on genome identifier and
            # CDS identifier in file
            locus_id = line[0].replace('_', '-')
            locus_id = locus_id + '-protein{0}'.format(line[-2])
            loci_info[locus_id] = line

    annotations = join_annotations(sparql_results, proteome_results, loci_info)

    # table header
    header = ['Locus_ID']
    if len(loci_info) > 0:
        header += table_lines[0]

    header += ['UniProt_Name', 'UniProt_URL']

    if len(proteome_results) > 0:
        header.extend(['Proteome_ID', 'Proteome_Product',
                       'Proteome_Gene_Name', 'Proteome_Species',
                       'Proteome_BSR'])

    loci_info_bool = len(loci_info) > 0
    output_table = create_annotations_table(annotations, output_directory,
                                            header, schema_basename,
                                            loci_info_bool)

    if not no_cleanup:
        shutil.rmtree(temp_directory)

    print('\n\nThe table with new information can be found at:'
          '\n{0}'.format(output_table))
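
For reference, a hedged invocation sketch; every path and parameter value below is an illustrative placeholder, not a value taken from the code above.

# hypothetical call; paths and values are placeholders
main(input_files='/data/schema_seed',
     output_directory='/data/annotations',
     protein_table='/data/cds_info.tsv',
     blast_score_ratio=0.6,
     cpu_cores=4,
     taxa=['Streptococcus agalactiae'],
     proteome_matches=2,
     no_cleanup=False,
     blast_path='/usr/local/bin')
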
Code Example #3
def main(input_files, output_directory, cpu_cores, blast_score_ratio,
         minimum_length, translation_table, ptf_path, size_threshold,
         blast_path):

    print('Adapting schema in the following '
          'directory:\n{0}'.format(os.path.abspath(input_files)))
    print('Prodigal training file:\n{0}'.format(ptf_path))
    print('Number of cores: {0}'.format(cpu_cores))
    print('BLAST Score Ratio: {0}'.format(blast_score_ratio))
    print('Translation table: {0}'.format(translation_table))
    print('Minimum accepted sequence length: {0}'.format(minimum_length))
    print('Size threshold: {0}'.format(size_threshold))

    # define output paths
    schema_path = os.path.abspath(output_directory)
    schema_short_path = fo.join_paths(schema_path, ['short'])

    # create output directories if they do not exist
    fo.create_directory(schema_path)
    fo.create_directory(schema_short_path)

    # list schema gene files
    genes_file = pv.check_input_type(input_files,
                                     os.path.join(output_directory, 'schema_genes.txt'))

    # import list of schema files
    with open(genes_file, 'r') as gf:
        genes_list = [line.rstrip('\n') for line in gf]
    os.remove(genes_file)

    print('Number of genes to adapt: {0}\n'.format(len(genes_list)))

    print('Determining the total number of alleles and '
          'allele mean length per gene...\n')

    # count number of sequences and mean length per gene
    genes_info = []
    with multiprocessing.Pool(processes=cpu_cores) as genes_pools:
        gp = genes_pools.map_async(fao.gene_seqs_info, genes_list,
                                   callback=genes_info.extend)
        gp.wait()

    # split files according to number of sequences and sequence mean length
    # in each file to pass even groups of sequences to all cores
    even_genes_groups = mo.split_genes_by_core(genes_info, cpu_cores*4,
                                               'seqcount')
    # with few inputs, some sublists might be empty
    even_genes_groups = [i for i in even_genes_groups if len(i) > 0]

    # add common arguments
    blastp_path = os.path.join(blast_path, ct.BLASTP_ALIAS)
    makeblastdb_path = os.path.join(blast_path, ct.MAKEBLASTDB_ALIAS)
    even_genes_groups = [[i, schema_path, schema_short_path,
                          blast_score_ratio, minimum_length,
                          translation_table, size_threshold,
                          blastp_path, makeblastdb_path,
                          adapt_loci] for i in even_genes_groups]

    print('Adapting {0} genes...\n'.format(len(genes_list)))

    invalid_data = mo.map_async_parallelizer(even_genes_groups,
                                             mo.function_helper,
                                             cpu_cores,
                                             show_progress=True)

    # define paths and write files with list of invalid
    # alleles and invalid genes
    output_schema_basename = os.path.basename(output_directory.rstrip('/'))
    schema_parent_directory = os.path.dirname(schema_path)

    # write file with alleles that were determined to be invalid
    invalid_alleles = [sub[0] for sub in invalid_data]
    invalid_alleles = list(itertools.chain.from_iterable(invalid_alleles))
    invalid_alleles_file = os.path.join(schema_parent_directory,
                                        '{0}_{1}'.format(output_schema_basename, 'invalid_alleles.txt'))

    with open(invalid_alleles_file, 'w') as inv:
        lines = ['{0}: {1}\n'.format(allele[0], allele[1]) for allele in invalid_alleles]
        inv.writelines(lines)

    # write file with identifiers of genes that had no valid alleles
    invalid_genes = [sub[1] for sub in invalid_data]
    invalid_genes = list(itertools.chain.from_iterable(invalid_genes))
    invalid_genes_file = os.path.join(schema_parent_directory,
                                      '{0}_{1}'.format(output_schema_basename, 'invalid_genes.txt'))

    with open(invalid_genes_file, 'w') as inv:
        invalid_gene_ids = '\n'.join(invalid_genes)
        inv.write(invalid_gene_ids)

    stats_lines = [sub[2] for sub in invalid_data]
    stats_lines = list(itertools.chain.from_iterable(stats_lines))
    stats_lines = ['\t'.join(line) for line in stats_lines]
    stats_genes_file = os.path.join(schema_parent_directory,
                                    '{0}_{1}'.format(output_schema_basename,
                                                     'summary_stats.txt'))

    with open(stats_genes_file, 'w') as stats:
        summary_stats_text = '\n'.join(stats_lines)
        stats.write('Gene\tTotal_alleles\tValid_alleles\tNumber_representatives\n')
        stats.write(summary_stats_text)

    print('\n\nNumber of invalid genes: {0}'.format(len(invalid_genes)))
    print('Number of invalid alleles: {0}'.format(len(invalid_alleles)))

    print('\nSuccessfully adapted {0}/{1} genes present in the '
          'input schema.'.format(len(genes_list)-len(invalid_genes),
                                 len(genes_list)))
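
The even-split step above hands each worker groups of FASTA files with a similar total sequence count. A minimal standalone sketch of that kind of greedy balancing follows; it illustrates the idea, not chewBBACA's actual mo.split_genes_by_core, and assumes genes_info rows shaped like (path, seqcount, mean_length).

import heapq

def split_by_seqcount(genes_info, num_groups):
    """Greedily assign each gene file to the currently lightest group."""
    # place the heaviest files first so small files fill the gaps
    ordered = sorted(genes_info, key=lambda g: g[1], reverse=True)
    # min-heap of (total_seqcount, group_index)
    heap = [(0, i) for i in range(num_groups)]
    heapq.heapify(heap)
    groups = [[] for _ in range(num_groups)]
    for gene in ordered:
        total, idx = heapq.heappop(heap)
        groups[idx].append(gene)
        heapq.heappush(heap, (total + gene[1], idx))
    # with few inputs some groups stay empty, as noted above
    return [group for group in groups if group]

# e.g. split_by_seqcount([('a.fasta', 120, 500), ('b.fasta', 30, 480),
#                         ('c.fasta', 90, 510)], 2)
# -> [[('a.fasta', 120, 500)], [('c.fasta', 90, 510), ('b.fasta', 30, 480)]]
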
Code Example #4
def proteome_annotations(schema_directory, temp_directory, taxa,
                         blast_score_ratio, cpu_cores, proteome_matches,
                         blast_path):
    """ Determines loci annotations based on alignment against
        UniProt's reference proteomes.

        Parameters
        ----------
        schema_directory : str
            Path to the schema's directory.
        temp_directory : str
            Path to the temporary directory where intermediate
            files will be written.
        taxa : list
            List of taxa scientific names. The process will
            search for reference proteomes whose "Species Name"
            field contains any of the provided taxa names.
        blast_score_ratio : float
            BLAST Score Ratio value. Hits with a BSR greater
            than or equal to this value are considered
            high-scoring hits that can be included in the
            final table, up to the maximum number of matches
            to report.
        cpu_cores : int
            Number of threads used to run BLASTp.
        proteome_matches : int
            Maximum number of proteome matches to report.
        blast_path : str
            Path to BLAST executables.

        Returns
        -------
        proteome_results : dict
            Dictionary with loci identifiers as keys and a list
            with information about loci retrieved from the most
            similar records in UniProt's reference proteomes.
    """

    # get paths to files with representative sequences
    short_directory = fo.join_paths(schema_directory, ['short'])
    reps_paths = [fo.join_paths(short_directory, [file])
                  for file in os.listdir(short_directory)
                  if file.endswith('.fasta')]

    print('Translating representative sequences...', end='')
    # translate representatives for all loci
    translated_reps = fo.join_paths(temp_directory, ['translated_reps'])
    fo.create_directory(translated_reps)

    reps_protein_files = fao.translate_fastas(reps_paths, translated_reps, 11)
    print('done.')

    print('Downloading list of reference proteomes...', end='')
    remote_readme = fo.join_paths(ct.UNIPROT_PROTEOMES_FTP, ['README'])
    local_readme = fo.join_paths(temp_directory,
                                 ['reference_proteomes_readme.txt'])

    # get README file with list of reference proteomes
    fo.download_file(remote_readme, local_readme)
    print('done.')

    # get lines with proteomes info for species of interest
    readme_lines = fo.read_lines(local_readme, strip=False)

    selected_proteomes = im.contained_terms(readme_lines, taxa)
    selected_proteomes = [line.strip('\n') for line in selected_proteomes]
    selected_proteomes = [line.split('\t') for line in selected_proteomes]
    print('Found {0} reference proteomes for '
          '{1}.'.format(len(selected_proteomes), ', '.join(taxa)))
    proteome_results = {}
    if len(selected_proteomes) > 0:
        # create directory to store proteomes
        proteomes_directory = fo.join_paths(temp_directory, ['proteomes'])
        fo.create_directory(proteomes_directory)

        proteomes_files = ur.get_proteomes(selected_proteomes,
                                           proteomes_directory)

        # uncompress files and concatenate into single FASTA
        uncompressed_proteomes = [fo.unzip_file(file) for file in proteomes_files]
        proteomes_concat = fo.join_paths(proteomes_directory,
                                         ['full_proteome.fasta'])
        proteomes_concat = fo.concatenate_files(uncompressed_proteomes,
                                                proteomes_concat)

        # get self-scores
        # concatenate protein files
        reps_concat = fo.concatenate_files(reps_protein_files,
                                           fo.join_paths(temp_directory,
                                                         ['reps_concat.fasta']))

        print('\nDetermining self-score of representatives...', end='')
        blastp_path = os.path.join(blast_path, ct.BLASTP_ALIAS)
        makeblastdb_path = os.path.join(blast_path, ct.MAKEBLASTDB_ALIAS)
        self_scores = fao.get_self_scores(reps_concat, temp_directory, cpu_cores,
                                          blastp_path, makeblastdb_path)
        print('done.')

        # create BLASTdb with proteome sequences
        proteome_blastdb = fo.join_paths(proteomes_directory,
                                         ['proteomes_db'])
        stderr = bw.make_blast_db(makeblastdb_path, proteomes_concat,
                                  proteome_blastdb, 'prot')

        # BLASTp to determine annotations
        blast_inputs = [[blastp_path, proteome_blastdb, file, file+'_blastout.tsv',
                         1, 1, None, None, proteome_matches, None, bw.run_blast]
                        for file in reps_protein_files]

        print('\nBLASTing representatives against proteomes...')
        blast_results = mo.map_async_parallelizer(blast_inputs,
                                                  mo.function_helper,
                                                  cpu_cores,
                                                  show_progress=True)

        blastout_files = [fo.join_paths(translated_reps, [file])
                          for file in os.listdir(translated_reps)
                          if 'blastout' in file]

        # index proteome file
        indexed_proteome = SeqIO.index(proteomes_concat, 'fasta')

        # process results for each BLASTp
        proteome_results = extract_annotations(blastout_files,
                                               indexed_proteome,
                                               self_scores,
                                               blast_score_ratio,
                                               proteome_matches)

    return proteome_results
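
The BLAST Score Ratio mentioned in the docstring is a hit's raw alignment score divided by the query's self-alignment score, which normalizes scores to the 0-1 range. A minimal sketch of that filter follows, assuming blast_results rows of [query, subject, raw_score] and a self_scores dict keyed by query identifier (both shapes are assumptions for illustration).

def high_scoring_hits(blast_results, self_scores, bsr_threshold):
    """Keep hits whose BSR (raw score / self-score) meets the threshold."""
    hits = []
    for query, subject, raw_score in blast_results:
        bsr = float(raw_score) / float(self_scores[query])
        if bsr >= bsr_threshold:
            hits.append((query, subject, bsr))
    return hits

# e.g. a hit with raw score 150 against a self-score of 200 has
# BSR 0.75 and passes a 0.6 threshold
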
Code Example #5
def adapt_loci(genes, schema_path, schema_short_path, bsr, min_len,
               table_id, size_threshold, blastp_path, makeblastdb_path):
    """ Adapts a set of genes/loci from an external schema so that
        that schema  can be used with chewBBACA. Removes invalid alleles
        and selects representative alleles to include in the "short" directory.

        Parameters
        ----------
        genes : list
            List with paths to the loci FASTA files to process.
        schema_path : str
            Path to the schema directory.
        schema_short_path : str
            Path to the "short" directory.
        bsr : float
            BLAST Score Ratio value.
        min_len : int
            Minimum sequence length value.
        table_id : int
            Genetic code used to translate alleles.
        size_threshold : float
            Sequence size variation threshold.
        blastp_path : str
            Path to the blastp executable.
        makeblastdb_path : str
            Path to the makeblastdb executable.

        Returns
        -------
        invalid_alleles : list
            List with the identifiers of the alleles that were
            determined to be invalid.
        invalid_genes : list
            List with the identifiers of the genes that had no
            valid alleles.
        summary_stats : list of list
            List with one sublist per processed locus. Each
            sublist has four elements:

            - The identifier of the locus.
            - The number of alleles in the external file.
            - The number of alleles that were a valid CDS.
            - The number of representatives determined
              by the process.

        The function also writes the adapted schema files.
    """

    # lists to accumulate per-gene results
    summary_stats = []
    invalid_genes = []
    invalid_alleles = []
    for gene in genes:

        representatives = []
        final_representatives = []

        # get gene basename and identifier
        gene_basename = os.path.basename(gene)
        gene_id = gene_basename.split('.f')[0]

        # create paths to gene files in new schema
        gene_file = fo.join_paths(schema_path,
                                  ['{0}{1}'.format(gene_id, '.fasta')])

        gene_short_file = fo.join_paths(schema_short_path,
                                        ['{0}{1}'.format(gene_id, '_short.fasta')])

        # create path to temp working directory for current gene
        gene_temp_dir = fo.join_paths(schema_path,
                                      ['{0}{1}'.format(gene_id, '_temp')])

        # create temp directory for the current gene
        fo.create_directory(gene_temp_dir)

        # dictionaries mapping gene identifiers to DNA sequences
        # and Protein sequences
        gene_seqs, prot_seqs, gene_invalid, seqids_map, total_sequences = \
            sm.get_seqs_dicts(gene, gene_id, table_id, min_len, size_threshold)
        invalid_alleles.extend(gene_invalid)

        # if locus has no valid CDS sequences,
        # continue to next locus
        if len(prot_seqs) == 0:
            shutil.rmtree(gene_temp_dir)
            invalid_genes.append(gene_id)
            summary_stats.append([gene_id, str(total_sequences), '0', '0'])
            continue

        if len(gene_seqs) > 1:
            # identify DNA sequences that code for same protein
            equal_prots = sm.determine_duplicated_seqs(prot_seqs)

            # get only one identifier per protein
            ids_to_blast = [protids[0] for protids in equal_prots.values()]

            # get longest sequence as first representative
            longest = sm.determine_longest(ids_to_blast, prot_seqs)
            representatives.append(longest)
            final_representatives.append(longest)

            # create FASTA file with distinct protein sequences
            protein_file = fo.join_paths(gene_temp_dir,
                                         ['{0}_protein.fasta'.format(gene_id)])
            protein_lines = fao.fasta_lines(ids_to_blast, prot_seqs)
            fo.write_list(protein_lines, protein_file)

            # create blastdb with all distinct proteins
            blastp_db = os.path.join(gene_temp_dir, gene_id)
            bw.make_blast_db(makeblastdb_path, protein_file, blastp_db, 'prot')

            # determine appropriate blastp task (proteins < 30aa need blastp-short)
            blastp_task = bw.determine_blast_task(equal_prots)

            # cycles to BLAST representatives against non-representatives until
            # all non-representatives have a representative
            while len(set(ids_to_blast) - set(representatives)) != 0:

                # create FASTA file with representative sequences
                rep_file = fo.join_paths(gene_temp_dir,
                                         ['{0}_rep_protein.fasta'.format(gene_id)])
                rep_protein_lines = fao.fasta_lines(representatives, prot_seqs)
                fo.write_list(rep_protein_lines, rep_file)

                # create file with seqids to BLAST against
                ids_str = im.concatenate_list([str(i) for i in ids_to_blast], '\n')
                ids_file = fo.join_paths(gene_temp_dir,
                                         ['{0}_ids.txt'.format(gene_id)])
                fo.write_to_file(ids_str, ids_file, 'w', '')

                # BLAST representatives against non-represented
                blast_output = fo.join_paths(gene_temp_dir,
                                             ['{0}_blast_out.tsv'.format(gene_id)])
                # set max_target_seqs to huge number because BLAST only
                # returns 500 hits by default

                blast_stderr = bw.run_blast(blastp_path, blastp_db, rep_file,
                                            blast_output, 1, 1, ids_file,
                                            blastp_task, 100000, ignore=ct.IGNORE_RAISED)
                if len(blast_stderr) > 0:
                    raise ValueError(blast_stderr)

                # import BLAST results
                blast_results = fo.read_tabular(blast_output)

                # get self-score for representatives
                rep_self_scores = {res[1]: res[2] for res in blast_results
                                   if res[0] == res[1]}

                # divide results into high, low and hot BSR values
                hitting_high, hitting_low, hotspots, high_reps, low_reps, hot_reps = \
                    bsr_categorizer(blast_results, representatives,
                                    rep_self_scores, bsr, bsr+0.1)

                excluded_reps = []

                # remove high BSR hits that have representative
                hitting_high = set(hitting_high)
                ids_to_blast = [i for i in ids_to_blast if i not in hitting_high]

                # remove representatives that led to high BSR with subjects that were removed
                pruned_high_reps = {k: [r for r in v if r in ids_to_blast]
                                    for k, v in high_reps.items()}
                reps_to_remove = [k for k, v in pruned_high_reps.items() if len(v) == 0]

                excluded_reps.extend(reps_to_remove)

                # determine the smallest set of representatives that covers all cycle candidates
                excluded = []
                hotspot_reps = set(im.flatten_list(list(hot_reps.values())))
                for rep, hits in hot_reps.items():
                    common = hotspot_reps.intersection(set(hits))
                    if len(common) > 0:
                        hotspot_reps = hotspot_reps - common
                    else:
                        excluded.append(rep)

                excluded_reps.extend(excluded)

                # remove representatives that only led to low BSR
                excluded_reps.extend(low_reps)

                representatives = [rep for rep in representatives if rep not in excluded_reps]
                ids_to_blast = [i for i in ids_to_blast if i not in excluded_reps]

                # determine next representative from candidates
                rep_candidates = list(set(hotspots) - hitting_high)
                # sort to guarantee reproducible results with same datasets
                rep_candidates = sorted(rep_candidates, key=lambda x: int(x))
                representatives, final_representatives = select_candidate(rep_candidates,
                                                                          prot_seqs,
                                                                          ids_to_blast,
                                                                          representatives,
                                                                          final_representatives)

                # remove files created for current gene iteration
                os.remove(rep_file)
                os.remove(blast_output)
                os.remove(ids_file)

        else:
            final_representatives = list(prot_seqs.keys())

        # write schema file with all alleles
        gene_lines = fao.fasta_lines(list(gene_seqs.keys()), gene_seqs)
        fo.write_list(gene_lines, gene_file)

        # get total number of valid sequences
        valid_sequences = len(gene_lines)

        # write schema file with representatives
        final_representatives = [seqids_map[rep] for rep in final_representatives]
        gene_rep_lines = fao.fasta_lines(final_representatives, gene_seqs)
        fo.write_list(gene_rep_lines, gene_short_file)

        # get number of representatives
        representatives_number = len(gene_rep_lines)

        summary_stats.append([gene_id,
                              str(total_sequences),
                              str(valid_sequences),
                              str(representatives_number)])

        shutil.rmtree(gene_temp_dir)

    return [invalid_alleles, invalid_genes, summary_stats]
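
The duplicate-collapsing step at the start of the BLAST cycle keeps a single identifier per distinct protein, since alleles with different DNA can translate to the same sequence. A minimal sketch of that grouping follows; it illustrates the idea behind sm.determine_duplicated_seqs, not its actual implementation.

def group_by_protein(prot_seqs):
    """Map each distinct protein to the allele identifiers encoding it."""
    groups = {}
    for seqid, protein in prot_seqs.items():
        groups.setdefault(protein, []).append(seqid)
    return groups

prots = {'1': 'MKTAYIAK', '2': 'MKTAYIAK', '3': 'MRVLKFGG'}
# {'MKTAYIAK': ['1', '2'], 'MRVLKFGG': ['3']} -> BLAST only ids '1' and '3'
print(group_by_protein(prots))
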