Example #1
def write_clusters(clusters, outfile):
    """ Writes information about clusters to file.

        Parameters
        ----------
        clusters : dict
            Dictionary with the identifiers of sequences
            that are cluster representatives as keys and
            a list with tuples as values. Each tuple has
            the identifier of a sequence that was added to
            the cluster, the decimal proportion of shared
            distinct kmers/minimizers and the length of the
            clustered sequence.
        outfile : str
            Path to the file that will be created to save
            information about clusters.
    """

    cluster_lines = []
    for rep, seqids in clusters.items():
        current_cluster = []
        current_cluster.append('>{0}'.format(rep))
        clustered = [', '.join(['{}'] * len(s)).format(*s) for s in seqids]
        current_cluster.extend(clustered)
        cluster_lines.append(current_cluster)

    # sort by number of lines to get clusters with more sequences first
    cluster_lines = im.sort_data(cluster_lines,
                                 sort_key=lambda x: len(x),
                                 reverse=True)
    cluster_lines = im.flatten_list(cluster_lines)
    cluster_text = im.join_list(cluster_lines, '\n')

    fo.write_to_file(cluster_text, outfile, 'w', '\n')
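
A minimal usage sketch of the expected input, with a hypothetical clusters dictionary (representative identifier mapped to tuples of clustered sequence identifier, shared k-mer/minimizer proportion and sequence length); it reproduces the per-cluster text block built above without the im/fo helpers.

# Hypothetical clusters dictionary in the format described in the docstring.
clusters = {'seq_1': [('seq_7', 0.85, 1250), ('seq_9', 0.72, 1180)]}

cluster_lines = []
for rep, seqids in clusters.items():
    cluster_lines.append('>{0}'.format(rep))
    cluster_lines.extend(', '.join(['{}'] * len(s)).format(*s) for s in seqids)

print('\n'.join(cluster_lines))
# >seq_1
# seq_7, 0.85, 1250
# seq_9, 0.72, 1180
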
def determine_distinct(sequences_file, unique_fasta):
    """ Identifies duplicated sequences in a FASTA file.
        Returns a single sequence identifier per distinct
        sequence and saves distinct sequences to a FASTA
        file.

        Parameters
        ----------
        sequences_file : str
            Path to a FASTA file.
        unique_fasta : str
            Path to a FASTA file that will be created to
            store distinct sequences.

        Returns
        -------
        List with the following elements:
            total : int
                Total number of times sequences were repeated.
            unique_seqids : list
                List with one sequence identifier per distinct
                sequence. The first identifier observed for a
                distinct sequence is the one stored in the list.
    """

    total = 0
    seqs_dict = {}
    out_limit = 10000
    out_seqs = []
    exhausted = False
    seq_generator = SeqIO.parse(sequences_file, 'fasta')
    while not exhausted:
        record = next(seq_generator, None)
        if record is not None:
            # seq object has to be converted to string
            sequence = str(record.seq.upper())
            seqid = record.id
            seq_hash = im.hash_sequence(sequence)

            # store only the hash for distinct sequences
            if seq_hash not in seqs_dict:
                seqs_dict[seq_hash] = seqid
                recout = fao.fasta_str_record(seqid, sequence)
                out_seqs.append(recout)
            else:
                total += 1
        else:
            exhausted = True

        if len(out_seqs) == out_limit or exhausted:
            if len(out_seqs) > 0:
                out_seqs = im.join_list(out_seqs, '\n')
                fo.write_to_file(out_seqs, unique_fasta, 'a', '\n')
                out_seqs = []

    unique_seqids = list(seqs_dict.values())

    return [total, unique_seqids]
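
A self-contained sketch of the same hash-based deduplication, assuming im.hash_sequence computes a digest of the uppercase sequence string (SHA-256 is used here as a stand-in) and omitting the batched writing controlled by out_limit.

import hashlib

from Bio import SeqIO

def distinct_records(fasta_path):
    # keep the first identifier seen for each distinct sequence hash
    seen = {}
    repeated = 0
    for record in SeqIO.parse(fasta_path, 'fasta'):
        seq_hash = hashlib.sha256(str(record.seq).upper().encode()).hexdigest()
        if seq_hash not in seen:
            seen[seq_hash] = record.id
        else:
            repeated += 1
    return repeated, list(seen.values())
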
Example #3
def blast_inputs(clusters, output_directory, ids_dict):
    """ Creates files with the identifiers of the sequences
        in each cluster.

        Parameters
        ----------
        clusters : dict
            Dictionary with the identifiers of cluster
            representatives as keys and a list with tuples
            as values (each tuple has the identifier of a
            sequence that is in the cluster, the decimal
            proportion of shared minimizers and the length
            of that sequence).
        output_directory : str
            Path to the directory where files with identifiers
            will be created.
        ids_dict : dict
            Dictionary that maps sequence identifiers to
            shorter and unique identifiers that will be
            saved in the files and used as sequence
            identifiers during BLAST to avoid errors
            related to sequence headers/identifiers
            that exceed the length limit allowed by BLAST.

        Returns
        -------
        ids_to_blast : list
            List with one tuple per cluster, containing the
            short identifier of the cluster representative
            and the number of identifiers written for that
            cluster.
    """

    rev_ids = {v: k for k, v in ids_dict.items()}

    ids_to_blast = []
    for rep in clusters:

        cluster_file = os.path.join(output_directory,
                                    '{0}_ids.txt'.format(rev_ids[rep]))
        cluster_ids = [rev_ids[rep]] + [rev_ids[seqid[0]] for seqid in clusters[rep]]
        cluster_lines = im.join_list(cluster_ids, '\n')
        fo.write_to_file(cluster_lines, cluster_file, 'w', '')
        ids_to_blast.append((rev_ids[rep], len(cluster_ids)))

    return ids_to_blast
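
A short sketch of the identifier remapping, with hypothetical input values: the long headers are swapped for the short BLAST-safe identifiers before the per-cluster files are written.

import os
import tempfile

# Hypothetical mapping of short identifiers to original (long) headers.
ids_dict = {'1': 'GCA_000001-protein12', '2': 'GCA_000001-protein45'}
rev_ids = {v: k for k, v in ids_dict.items()}

clusters = {'GCA_000001-protein12': [('GCA_000001-protein45', 0.9, 300)]}

output_directory = tempfile.mkdtemp()
for rep, members in clusters.items():
    cluster_file = os.path.join(output_directory, '{0}_ids.txt'.format(rev_ids[rep]))
    cluster_ids = [rev_ids[rep]] + [rev_ids[m[0]] for m in members]
    with open(cluster_file, 'w') as outfile:
        outfile.write('\n'.join(cluster_ids) + '\n')
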
Example #4
def get_sequences_by_id(sequences, seqids, out_file, limit=5000):
    """ Retrieves sequences from an indexed FASTA file.

        Parameters
        ----------
        sequences : dict or Bio.File._IndexedSeqFileDict
            Dictionary with seqids as keys and sequences
            as values or a Fasta file index created with
            BioPython.
        seqids : list
            List with the identifiers of the sequences
            that should be retrieved.
        out_file : str
            Path to the FASTA file to which selected
            sequences will be saved.
        limit : int
            Maximum number of sequences that will be
            kept in memory at a time (to avoid keeping
            huge datasets in memory).

        Returns
        -------
        None
            Creates a FASTA file with the sequences that have
            the identifiers in the input list.
    """

    if isinstance(sequences, dict):
        seqs = [(seqid, sequences[seqid]) for seqid in seqids]
    else:
        seqs = [(seqid, str(sequences[seqid].seq)) for seqid in seqids]

    records = []
    for seq in seqs:
        record = fasta_str_record(seq[0], seq[1])
        records.append(record)

        if len(records) == limit or seq[0] == seqids[-1]:
            lines = im.join_list(records, '\n')
            fo.write_to_file(lines, out_file, 'a', '\n')
            records = []
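
A minimal sketch of the indexed branch, with hypothetical file names and identifiers: SeqIO.index returns a dictionary-like object whose values expose a .seq attribute, which is why the two branches above are handled differently. The batching controlled by limit is omitted here.

from Bio import SeqIO

# Hypothetical input FASTA file and identifiers to retrieve.
sequences = SeqIO.index('all_sequences.fasta', 'fasta')
seqids = ['seq_1', 'seq_3']

with open('selected.fasta', 'w') as out_file:
    for seqid in seqids:
        out_file.write('>{0}\n{1}\n'.format(seqid, str(sequences[seqid].seq)))
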
Example #5
def write_protein_table(output_file, genome_id, cds_info):
    """ Writes information about coding sequences in a
        genome to a file.

        Parameters
        ----------
        output_file : str
            Path to the output file to which info will
            be saved.
        genome_id : str
            Identifier of the genome to add to first field
            of every new line.
        cds_info : list
            List with information about each coding sequence
            identified in the genome (contig identifier,
            CDS start position, CDS stop position, CDS
            identifier and CDS coding strand).
    """

    table_lines = [[genome_id] + protein_info for protein_info in cds_info]
    table_lines = [im.join_list(line, '\t') for line in table_lines]
    table_text = im.join_list(table_lines, '\n')
    fo.write_to_file(table_text, output_file, 'a', '\n')
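
A small worked example of the tab-separated output, with hypothetical genome and CDS values: each row is the genome identifier followed by the fields listed in the docstring.

genome_id = 'GCA_000001'
# Hypothetical CDS info: contig identifier, start, stop, CDS identifier, strand.
cds_info = [['contig_1', '100', '1300', '1', '1'],
            ['contig_1', '1500', '2400', '2', '-1']]

table_lines = ['\t'.join([genome_id] + info) for info in cds_info]
print('\n'.join(table_lines))
# GCA_000001	contig_1	100	1300	1	1
# GCA_000001	contig_1	1500	2400	2	-1
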
def translate_coding_sequences(seqids, sequences_file, translation_table,
                               minimum_length, dna_file, protein_file):
    """ Translates CDSs into protein sequences.

        Parameters
        ----------
        seqids : list
            List with the sequence identifiers of the sequences
            to be translated.
        sequences_file : str
            Path to the FASTA file that contains the DNA sequences.
        translation_table : int
            Translation table identifier.
        minimum_length : int
            The minimum sequence length value.
        dna_file : str
            Path to a file to save DNA sequences.
        protein_file : str
            Path to a file to save protein sequences.

        Returns
        -------
        A list with the following elements:
            invalid_alleles : list
                List with one sublist per invalid allele.
                Each sublist contains a sequence identifier
                and the exception message returned after
                attempting translation.
            total_seqs : int
                Total number of DNA sequences that were
                translated.
    """

    # define limit of records to keep in memory
    dna_lines = []
    total_seqs = 0
    prot_lines = []
    line_limit = 5000
    invalid_alleles = []

    cds_index = SeqIO.index(sequences_file, 'fasta')

    for i, seqid in enumerate(seqids):
        try:
            sequence = str(cds_index.get(seqid).seq)
        except Exception as e:
            # skip identifiers that cannot be fetched from the index
            print(e)
            continue

        translation = sm.translate_dna(sequence, translation_table,
                                       minimum_length)
        if isinstance(translation, list):
            dna_lines.append('>{0}'.format(seqid))
            dna_lines.append(translation[0][1])
            prot_lines.append('>{0}'.format(seqid))
            prot_lines.append(str(translation[0][0]))
            total_seqs += 1
        # if returned value is a string, translation failed and
        # the string contains the exception message
        elif isinstance(translation, str):
            invalid_alleles.append([seqid, translation])

        if len(dna_lines) // 2 == line_limit or i + 1 == len(seqids):

            dna_lines = im.join_list(dna_lines, '\n')
            fo.write_to_file(dna_lines, dna_file, 'a', '\n')
            dna_lines = []

            prot_lines = im.join_list(prot_lines, '\n')
            fo.write_to_file(prot_lines, protein_file, 'a', '\n')
            prot_lines = []

    return [invalid_alleles, total_seqs]
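
A rough, self-contained stand-in for the translation step (sm.translate_dna itself is not shown here): Biopython's Seq.translate with cds=True enforces a valid start codon, a single terminal stop codon and a length that is a multiple of three, and the length check mirrors the minimum_length parameter. Unlike the real helper, success and failure are both returned as strings here.

from Bio.Seq import Seq

def try_translate(dna_sequence, translation_table, minimum_length):
    # returns the protein string on success or an error message on failure
    if len(dna_sequence) < minimum_length:
        return 'sequence shorter than {0} nucleotides'.format(minimum_length)
    try:
        return str(Seq(dna_sequence).translate(table=translation_table, cds=True))
    except Exception as exc:
        return 'translation failed: {0}'.format(exc)

print(try_translate('ATGGCAGCATAA', 11, 0))  # MAA
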
Example #7
def adapt_loci(genes, schema_path, schema_short_path, bsr, min_len,
               table_id, size_threshold, blastp_path, makeblastdb_path):
    """ Adapts a set of genes/loci from an external schema so that
        that schema  can be used with chewBBACA. Removes invalid alleles
        and selects representative alleles to include in the "short" directory.

        Parameters
        ----------
        genes : list
            List with paths to the FASTA files of the loci
            to be adapted.
        schema_path : str
            Path to the schema directory.
        schema_short_path : str
            Path to the schema's "short" directory.
        bsr : float
            BLAST Score Ratio value.
        min_len : int
            Minimum sequence length value.
        table_id : int
            Genetic code used to translate alleles.
        size_threshold : float
            Sequence size variation threshold.
        blastp_path : str
            Path to the blastp executable.
        makeblastdb_path : str
            Path to the makeblastdb executable.

        Returns
        -------
        invalid_alleles : list
            List with the identifiers of the alleles that were
            determined to be invalid.
        invalid_genes : list
            List with the identifiers of the genes that had no
            valid alleles.
        summary_stats : list of list
            List with one sublist per processed locus. Each
            sublist has four elements:

            - The identifier of the locus.
            - The number of alleles in the external file.
            - The number of alleles that were a valid CDS.
            - The number of representatives determined by the
              process.

        The function also writes the FASTA files for each adapted
        locus to the schema and "short" directories.
    """

    summary_stats = []
    invalid_genes = []
    invalid_alleles = []
    for gene in genes:

        representatives = []
        final_representatives = []

        # get gene basename and identifier
        gene_basename = os.path.basename(gene)
        gene_id = gene_basename.split('.f')[0]

        # create paths to gene files in new schema
        gene_file = fo.join_paths(schema_path,
                                  ['{0}{1}'.format(gene_id, '.fasta')])

        gene_short_file = fo.join_paths(schema_short_path,
                                        ['{0}{1}'.format(gene_id, '_short.fasta')])

        # create path to temp working directory for current gene
        gene_temp_dir = fo.join_paths(schema_path,
                                      ['{0}{1}'.format(gene_id, '_temp')])

        # create temp directory for the current gene
        fo.create_directory(gene_temp_dir)

        # dictionaries mapping gene identifiers to DNA sequences
        # and Protein sequences
        gene_seqs, prot_seqs, gene_invalid, seqids_map, total_sequences = \
            sm.get_seqs_dicts(gene, gene_id, table_id, min_len, size_threshold)
        invalid_alleles.extend(gene_invalid)

        # if locus has no valid CDS sequences,
        # continue to next locus
        if len(prot_seqs) == 0:
            shutil.rmtree(gene_temp_dir)
            invalid_genes.append(gene_id)
            summary_stats.append([gene_id, str(total_sequences), '0', '0'])
            continue

        if len(gene_seqs) > 1:
            # identify DNA sequences that code for same protein
            equal_prots = sm.determine_duplicated_seqs(prot_seqs)

            # get only one identifier per protein
            ids_to_blast = [protids[0] for protein, protids in equal_prots.items()]

            # get longest sequence as first representative
            longest = sm.determine_longest(ids_to_blast, prot_seqs)
            representatives.append(longest)
            final_representatives.append(longest)

            # create FASTA file with distinct protein sequences
            protein_file = fo.join_paths(gene_temp_dir,
                                         ['{0}_protein.fasta'.format(gene_id)])
            protein_lines = fao.fasta_lines(ids_to_blast, prot_seqs)
            fo.write_list(protein_lines, protein_file)

            # create blastdb with all distinct proteins
            blastp_db = os.path.join(gene_temp_dir, gene_id)
            bw.make_blast_db(makeblastdb_path, protein_file, blastp_db, 'prot')

            # determine appropriate blastp task (proteins < 30aa need blastp-short)
            blastp_task = bw.determine_blast_task(equal_prots)

            # BLAST representatives against non-representatives until
            # all non-representatives are assigned to a representative
            while len(set(ids_to_blast) - set(representatives)) != 0:

                # create FASTA file with representative sequences
                rep_file = fo.join_paths(gene_temp_dir,
                                         ['{0}_rep_protein.fasta'.format(gene_id)])
                rep_protein_lines = fao.fasta_lines(representatives, prot_seqs)
                fo.write_list(rep_protein_lines, rep_file)

                # create file with seqids to BLAST against
                ids_str = im.concatenate_list([str(i) for i in ids_to_blast], '\n')
                ids_file = fo.join_paths(gene_temp_dir,
                                         ['{0}_ids.txt'.format(gene_id)])
                fo.write_to_file(ids_str, ids_file, 'w', '')

                # BLAST representatives against non-represented sequences
                blast_output = fo.join_paths(gene_temp_dir,
                                             ['{0}_blast_out.tsv'.format(gene_id)])
                # set max_target_seqs to huge number because BLAST only
                # returns 500 hits by default

                blast_stderr = bw.run_blast(blastp_path, blastp_db, rep_file,
                                            blast_output, 1, 1, ids_file,
                                            blastp_task, 100000, ignore=ct.IGNORE_RAISED)
                if len(blast_stderr) > 0:
                    raise ValueError(blast_stderr)

                # import BLAST results
                blast_results = fo.read_tabular(blast_output)

                # get self-score for representatives
                rep_self_scores = {res[1]: res[2] for res in blast_results
                                   if res[0] == res[1]}

                # divide results into high, low and hot BSR values
                hitting_high, hitting_low, hotspots, high_reps, low_reps, hot_reps = \
                    bsr_categorizer(blast_results, representatives,
                                    rep_self_scores, bsr, bsr+0.1)

                excluded_reps = []

                # remove hits with high BSR that already have a representative
                hitting_high = set(hitting_high)
                ids_to_blast = [i for i in ids_to_blast if i not in hitting_high]

                # remove representatives that led to high BSR with subjects that were removed
                pruned_high_reps = {k: [r for r in v if r in ids_to_blast] for k, v in high_reps.items()}
                reps_to_remove = [k for k, v in pruned_high_reps.items() if len(v) == 0]

                excluded_reps.extend(reps_to_remove)

                # determine the smallest set of representatives needed to cover all cycle candidates
                excluded = []
                hotspot_reps = set(im.flatten_list(list(hot_reps.values())))
                for rep, hits in hot_reps.items():
                    common = hotspot_reps.intersection(set(hits))
                    if len(common) > 0:
                        hotspot_reps = hotspot_reps - common
                    else:
                        excluded.append(rep)

                excluded_reps.extend(excluded)

                # remove representatives that only led to low BSR
                excluded_reps.extend(low_reps)

                representatives = [rep for rep in representatives if rep not in excluded_reps]
                ids_to_blast = [i for i in ids_to_blast if i not in excluded_reps]

                # determine next representative from candidates
                rep_candidates = list(set(hotspots) - hitting_high)
                # sort to guarantee reproducible results with same datasets
                rep_candidates = sorted(rep_candidates, key=lambda x: int(x))
                representatives, final_representatives = select_candidate(rep_candidates,
                                                                          prot_seqs,
                                                                          ids_to_blast,
                                                                          representatives,
                                                                          final_representatives)

                # remove files created for current gene iteration
                os.remove(rep_file)
                os.remove(blast_output)
                os.remove(ids_file)

        else:
            final_representatives = list(prot_seqs.keys())

        # write schema file with all alleles
        gene_lines = fao.fasta_lines(list(gene_seqs.keys()), gene_seqs)
        fo.write_list(gene_lines, gene_file)

        # get total number of valid sequences
        valid_sequences = len(gene_lines)

        # write schema file with representatives
        final_representatives = [seqids_map[rep] for rep in final_representatives]
        gene_rep_lines = fao.fasta_lines(final_representatives, gene_seqs)
        fo.write_list(gene_rep_lines, gene_short_file)

        # get number of representatives
        representatives_number = len(gene_rep_lines)

        summary_stats.append([gene_id,
                              str(total_sequences),
                              str(valid_sequences),
                              str(representatives_number)])

        shutil.rmtree(gene_temp_dir)

    return [invalid_alleles, invalid_genes, summary_stats]
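
A short illustration of the BLAST Score Ratio that drives the representative selection loop: the raw alignment score of a representative against a candidate divided by the representative's self-alignment score. The scores and the comparison against the bsr cut-off below are hypothetical; the actual split into high/low/hot categories is performed by bsr_categorizer, which is not shown here.

def blast_score_ratio(hit_score, self_score):
    # BSR: raw score of the hit divided by the query's self-hit raw score
    return hit_score / self_score

# Hypothetical raw scores taken from BLASTp tabular output.
self_score = 520.0   # representative aligned against itself
hit_score = 390.0    # representative aligned against a candidate sequence
bsr = 0.6

ratio = blast_score_ratio(hit_score, self_score)
print(round(ratio, 2), ratio >= bsr)  # 0.75 True -> candidate is covered by this representative
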