Example #1
0
def internal_homology_blast(record: secmet.Record) -> Dict[int, List[List[str]]]:
    """ Finds groups of internally homologous proteins for every cluster of a
        record, by running BLAST with each cluster's proteins against
        themselves.

        Groups are kept for every query, including those without any homolog.

        Arguments:
            record: the Record whose clusters are to be searched

        Returns:
            a dictionary mapping cluster number to
                that cluster's list of groups, where each group is
                    a list of query ids
    """
    fasta_path = "internal_input.fasta"
    groups_by_cluster = {}
    # intermediate files are created while the temporary directory is active
    # (presumably change=True makes it the working directory - TODO confirm)
    with TemporaryDirectory(change=True):
        logging.info("Finding internal homologs in each gene cluster...")
        for cluster in record.get_clusters():
            number = cluster.get_cluster_number()
            names, sequences = create_blast_inputs(cluster)
            # the same filename is reused (overwritten) for every cluster
            fasta.write_fasta(names, sequences, fasta_path)
            raw_output = run_internal_blastsearch(fasta_path)
            # only the first of the two parsed results is used here
            queries, _ = blastparse(raw_output, record,
                                    min_seq_coverage=25, min_perc_identity=30)
            groups_by_cluster[number] = find_internal_orthologous_groups(queries, names)
    return groups_by_cluster
Example #2
0
def run_muscle_single(seq_name: str, seq: str,
                      comparison_file: str) -> Dict[str, str]:
    """ Aligns a single sequence against the sequences of a comparison file
        by running muscle in profile mode.

        Arguments:
            seq_name: the name of the query
            seq: the sequence to align
            comparison_file: the path of the file containing comparison sequences

        Returns:
            a dictionary mapping sequence name (query or reference) to alignment

        Raises:
            RuntimeError: if the muscle run is reported as unsuccessful
    """
    # both temporary files only need to exist for the duration of the run
    with NamedTemporaryFile(mode="w+") as query_file, \
            NamedTemporaryFile(mode="w+") as aligned_file:
        write_fasta([seq_name], [seq], query_file.name)
        command = [
            get_config().executables.muscle,
            "-profile",
            "-quiet",
            "-in1", comparison_file,
            "-in2", query_file.name,
            "-out", aligned_file.name,
        ]
        outcome = execute(command)
        if not outcome.successful():
            # newlines are stripped so the error stays on a single line
            details = (outcome.return_code,
                       outcome.stderr.replace("\n", ""),
                       seq_name)
            raise RuntimeError("muscle returned %d: %r while comparing query named %s" % details)
        # must be read before the context exits and the output file is removed
        alignments = read_fasta(aligned_file.name)
    return alignments
Example #3
0
def smcog_tree_analysis(cds: CDSFeature, input_number: int, smcog: str, output_dir: str) -> None:
    """ Runs the smCOG tree steps for a single CDS: writes the CDS translation
        as a one-sequence fasta file, aligns it via alignsmcogs(), trims the
        resulting alignment and finally draws the tree.

        Arguments:
            cds: the CDS feature providing the name and translation
            input_number: a number embedded in the names of the files used
            smcog: the smCOG identifier handed to alignsmcogs()
            output_dir: the directory handed to draw_tree()

        Returns:
            None
    """
    name = cds.get_name()
    # single query sequence used as the input for the alignment step
    query_filename = "input" + str(input_number) + ".fasta"
    fasta.write_fasta([name], [cds.translation], query_filename)
    # align, then trim that alignment
    trim_alignment(input_number, alignsmcogs(smcog, input_number))
    # draw the phylogenetic tree
    draw_tree(input_number, output_dir, name)
Example #4
0
def trim_alignment(input_number: int, alignment_file: str) -> None:
    """ Trims an alignment down to the region between the first and the last
        column in which the most common character is not a gap and is shared
        by at least a third of all sequences.

        The trimmed sequences are written to
        "trimmed_alignment<input_number>.fasta".

        Arguments:
            input_number: the number used in the name of the output file
            alignment_file: the path of the fasta file containing the alignment

        Returns:
            None
    """
    def leading_offset(columns: List[Dict[str, int]], total: int) -> int:
        """ Returns the index of the first sufficiently shared column,
            or 0 if there is no such column
        """
        required = total / 3
        for index, column in enumerate(columns):
            # most frequent character, ties resolved by the larger character
            residue, occurrences = max(column.items(),
                                       key=lambda pair: (pair[1], pair[0]))
            # columns dominated by gaps never qualify
            if residue != "-" and occurrences >= required:
                return index
        return 0  # nothing qualified, so nothing is trimmed from this side

    alignment = fasta.read_fasta(alignment_file)
    sequences = list(alignment.values())

    # an alignment is only meaningful if every row has the same width
    width = len(sequences[0])
    for name, sequence in alignment.items():
        assert width == len(sequence), "%s has different sequence length" % name

    # parentheses break newick tree parsing, so they are replaced, and only
    # the last two '|' separated fields (id and description) are kept
    trimmed_names = []
    for name in alignment:
        cleaned = name.replace("(", "_").replace(")", "_")
        trimmed_names.append("|".join(cleaned.rsplit('|', 2)[-2:]))

    # count how often each character occurs in each column
    columns = [defaultdict(int) for _ in range(width)]  # type: List[Dict[str, int]]
    for sequence in sequences:
        for index, residue in enumerate(sequence):
            columns[index][residue] += 1

    # the end is found by scanning the columns from the back
    start = leading_offset(columns, len(sequences))
    end = width - leading_offset(columns[::-1], len(sequences))

    trimmed = [sequence[start:end] for sequence in sequences]
    output_name = "trimmed_alignment" + str(input_number) + ".fasta"
    fasta.write_fasta(trimmed_names, trimmed, output_name)
Example #5
0
def write_fastas_with_all_genes(regions: Iterable[secmet.Region],
                                filename: str,
                                partitions: int = 1) -> List[str]:
    """ Write fasta files containing all genes in all regions in a
        blast friendly form.

        If partitioning the data into multiple files, the index of the partition
        will be included in the filename before the extension, e.g.
        input.fasta -> input0.fasta, input1.fasta, ...

        Every file except the last receives len(sequences) // partitions
        entries, the last file additionally receives the remainder. If there
        are fewer sequences than partitions, all files but the last are
        written without any entries.

        Arguments:
            regions: an iterable of regions to find genes in
            filename: the filename to use for the file
            partitions: the number of files to create (approx. equally sized)

        Returns:
            a list containing filenames of the written files

        Raises:
            TypeError: if partitions is not an int
            ValueError: if partitions is less than 1 or if the regions
                        provide no sequences at all
    """
    if not isinstance(partitions, int):
        raise TypeError("Partitions must be an int greater than 0")
    if partitions < 1:
        raise ValueError("Partitions must be greater than 0")

    all_names, all_seqs = [], []
    for region in regions:
        names, seqs = create_blast_inputs(region)
        all_names.extend(names)
        all_seqs.extend(seqs)
    if not (all_names and all_seqs):
        raise ValueError("Diamond search space contains no sequences")
    if partitions == 1:
        fasta.write_fasta(all_names, all_seqs, filename)
        return [filename]

    # chunk names are built by concatenation rather than "%" formatting, so
    # a literal "%" in the supplied filename can't be misread as a directive
    root, extension = os.path.splitext(filename)
    size = len(all_names) // partitions
    written = []
    for index in range(partitions):
        start = index * size
        # the final chunk also takes whatever the integer division left over
        end = None if index == partitions - 1 else start + size
        chunk_filename = root + str(index) + extension
        fasta.write_fasta(all_names[start:end], all_seqs[start:end], chunk_filename)
        written.append(chunk_filename)
    return written