コード例 #1
0
    def clusterization(filename, outdir, eps, ms, dm=None):
        """
        Cluster the sequences of a fasta file and save the results.

        Every cluster is written to ``cluster_<key>.fasta`` inside ``outdir``; afterwards the
        ``ClusterVisualisation`` helper is asked to write the distance matrix and to visualize
        the clusters into the same directory.

        :param filename: fasta file with the sequences to cluster
        :param outdir: output directory passed to every writer
        :param eps: the maximum distance between two samples
        :param ms: the number of samples (or total weight) in a neighborhood for a point to be
        considered as a core point, the point itself included
        :param dm: optional path to a distance matrix. When it is not None the matrix is loaded
        from that path instead of being built from the records.
        """
        records = Records(Extractor.extract_records(filename))
        # Build the matrix from the records unless the caller supplied a path to one.
        dist_matrix = (records.create_dist_matrix() if dm is None
                       else Extractor.extract_dist_matrix(dm))

        clusterizer = Clusterization(dist_matrix)
        cluster_indexes = clusterizer.get_clusters_indexes(eps=eps, ms=ms)
        coordinates = clusterizer.get_coordinates()

        # One fasta file per cluster, named after the cluster key.
        clusters = records.get_clusters(cluster_indexes)
        for cluster_key in clusters:
            RecordsWriter(clusters[cluster_key]).write_to_dir(
                f"cluster_{cluster_key}.fasta", outdir)

        # Visualisation and saving.
        visualisation = ClusterVisualisation(coordinates, dist_matrix,
                                             cluster_indexes)
        visualisation.write_dm(outdir)
        visualisation.visualize(outdir)
コード例 #2
0
    def unite_aligns(dirname,
                     outfile,
                     full_length,
                     ignore_gaps=False,
                     ignore_level=0.9):
        """
        Unite the consensuses of all alignments found in a directory into one fasta file.

        For every file returned by ``Extractor.extract_filenames(dirname)`` the alignment is
        loaded, its consensus string is computed and stored as one record. The record id/name is
        the file's base name up to the first dot followed by a colon; the description lists the
        parameters the consensus was built with.

        :param dirname: directory name with alignments
        :param outfile: out filename
        :param full_length: boolean. If True the confidence level is calculated using the whole
        length of the sequences; if False the 'full gap' ends of the sequences are left out
        :param ignore_gaps: boolean. Ignore gaps with a high level of confidence
        (with confidence >= ignore_level)
        :param ignore_level: float, level of ignoring gaps
        """
        # The description only depends on the call parameters, so build it once.
        # (The original text misspelled the last key as "gnore_level".)
        description = (
            f"consensus sequence with parameters: full_length={full_length}, "
            f"ignore_gaps={ignore_gaps}, ignore_level={ignore_level}")

        records = []
        for file in Extractor.extract_filenames(dirname):
            align = AlignController.__get_alignment_from(file)
            consensus = align.get_consensus(full_length=full_length)
            str_cons = consensus.get_str_consensus(ignore_gaps=ignore_gaps,
                                                   ignore_level=ignore_level)
            # Record name: base file name up to the first dot, with a trailing colon.
            name = f"{basename(file).split('.')[0]}:"
            records.append(
                SeqRecord(Seq(str_cons),
                          id=name,
                          name=name,
                          description=description))
        RecordsWriter(records).write_to(outfile)
コード例 #3
0
    def get_random(filename, out_file, number_of_random_seqs):
        """
        Pick 'number_of_random_seqs' sequences from the input file and write them out.

        :param filename: fasta file with the source sequences
        :param out_file: file that receives the chosen sequences
        :param number_of_random_seqs: how many random sequences to take from the input file
        """
        source_records = Records(Extractor.extract_records(filename))
        chosen = source_records.get_random_seqs(number_of_random_seqs)
        writer = RecordsWriter(chosen)
        writer.write_to(out_file)
コード例 #4
0
    def node_filtrating(filename, out_file, min_cov):
        """
        Filter NODES by depth of coverage and write the result to a file.

        :param filename: fasta file with the source sequences
        :param out_file: file that receives the filtered sequences
        :param min_cov: minimal depth of coverage handed to the coverage filter
        """
        source_records = Records(Extractor.extract_records(filename))
        kept = source_records.filtr_by_coverage(min_cov)
        writer = RecordsWriter(kept)
        writer.write_to(out_file)
コード例 #5
0
    def __get_alignment_from(filename):
        """
        Private helper that builds an Alignment from a fasta file.

        The file is expected to hold an alignment, i.e. all of its sequences should have the
        same length for correct work.

        :param filename: fasta file with sequences of the same length (alignment in fasta format)
        :return: Alignment object built from the sequences of the file
        """
        aligned_seqs = []
        for record in Extractor.extract_records(filename):
            # Only the sequence of each record is kept.
            aligned_seqs.append(AlignedSeq(record.seq))
        return Alignment(aligned_seqs)
コード例 #6
0
    def filtrating(filename, out_file, organism, minsize, maxsize):
        """
        Filter sequences by organism, minsize and maxsize and write the result to a file.

        :param filename: fasta file with the source sequences
        :param out_file: file that receives the filtered sequences
        :param organism: full name of organism
        :param minsize: minimal size of sequences
        :param maxsize: maximal size of sequences
        """
        source_records = Records(Extractor.extract_records(filename))
        kept = source_records.filtr_organism_by_size(organism, minsize, maxsize)
        writer = RecordsWriter(kept)
        writer.write_to(out_file)
コード例 #7
0
    def grouping(filename, outdir, minsog, maxsog):
        """
        Group sequences by names and save every group as ``<group key>.fasta`` in ``outdir``.

        All big sequences (genomes) with minsog <= size <= maxsog form the separate group 'cds'.

        :param filename: fasta file with the source sequences
        :param outdir: output directory for saving groups
        :param minsog: int, min size of genome
        :param maxsog: int, max size of genome
        """
        source_records = Records(Extractor.extract_records(filename))
        grouped = source_records.group(minsog, maxsog)

        for group_key in grouped:
            # The group key doubles as the base name of the output file.
            RecordsWriter(grouped[group_key]).write_to_dir(group_key + ".fasta",
                                                           outdir)
コード例 #8
0
    def consensus_with(align_file, seqs_file, outdir):
        """
        Build a new consensus from an alignment of complete genomes plus short database sequences.

        The alignment of the genomes serves as the first approximation of the consensus, which is
        then recalculated taking the shorter sequences into account. The result is written to
        ``outdir`` under the prefix "reconsensused".

        :param align_file: filename of the alignment of complete genomes in fasta format
        :param seqs_file: filename with short sequences in fasta
        :param outdir: outdir name
        """
        alignment = AlignController.__get_alignment_from(align_file)

        short_seqs = []
        for record in Extractor.extract_records(seqs_file):
            short_seqs.append(record.seq)

        updated_consensus = alignment.consensus_with(short_seqs,
                                                     full_length=True)
        writer = ConsensusWriter(updated_consensus)
        writer.write_all(outdir, "reconsensused", 0.9)
コード例 #9
0
    def convert_to_mutations(html_file,
                             outfile,
                             levels_of_confidence,
                             cut_from=0,
                             cut_to=None,
                             fmt="fasta"):
        """
        Convert an html consensus into a '.fasta' or 'primer explorer' format file.

        The output holds the consensus string and the confidence string with information about
        mutations. Mutations are positions with a low level of confidence (positions where
        different nucleotides are likely to be found), marked as '-'.

        :param html_file: filename with consensus
        :param outfile: out filename
        :param levels_of_confidence: list with classes that are considered 'reliable' positions.
        All other positions are marked as mutations. Format: ['c90', 'c80', ... ]
        :param cut_from: start of the slice taken from both strings (inclusive)
        :param cut_to: end of the slice taken from both strings (exclusive); None keeps
        everything up to the end
        :param fmt: format of output file: 'fasta' or 'pe' for primer explorer
        :raises AttributeError: if ``fmt`` is neither 'fasta' nor 'pe'
        """
        # Reject an unsupported format before reading and parsing the html file.
        if fmt not in ("fasta", "pe"):
            raise AttributeError(
                "Only 'fasta' or 'pe' output formats are available")

        html_consensus = Extractor.extract_html_consensus(html_file)

        html_consensus_parser = HtmlConsensusParser()
        html_consensus_parser.parse_html_consensus(html_consensus,
                                                   levels_of_confidence)

        consensus_str = html_consensus_parser.consensus_string[cut_from:cut_to]
        confidence_str = html_consensus_parser.confidence_string[cut_from:cut_to]

        if fmt == "fasta":
            # Record ids are kept as they were: "sequence" holds the consensus string and
            # "consensus" holds the confidence string.
            consensus_record = SeqRecord(
                Seq(consensus_str),
                id="sequence",
                description=f"sequence of {html_file}")
            confidence_record = SeqRecord(Seq(confidence_str),
                                          id="consensus",
                                          description=f"consensus with levels:"
                                          f" {levels_of_confidence}")
            RecordsWriter([consensus_record,
                           confidence_record]).write_to(outfile)
        else:
            # fmt == "pe": primer explorer format.
            PeMutateConsensusWriter(consensus_str,
                                    confidence_str).write_in_pe_format(outfile)
コード例 #10
0
    def __write_html_to(self,
                        filename,
                        coloring="c",
                        ignore_gaps=False,
                        ignore_level=0.9):
        """
        Private helper that writes the html consensus (header followed by body) to a file.

        :param filename: file to write; it is passed to ``check_dir`` before being opened
        :param coloring: 'c' or 'd'. The first colors by confidence, the second by deeps
        :param ignore_gaps: boolean. Ignore gaps with a high level of confidence
        (with confidence >= ignore_level)
        :param ignore_level: float, level of ignoring gaps
        """
        header = Extractor.extract_html_header(coloring)
        body = self.__get_html_body(coloring, ignore_gaps, ignore_level)

        # NOTE(review): check_dir presumably prepares the target directory - confirm.
        check_dir(filename)

        with open(filename, "w") as out:
            out.write(header + body)