Ejemplo n.º 1
0
    def test_get_alignment_int_form(self):
        """Check that both test alignments can be read in integer form.

        Calls ``fasta_reader.get_alignment_int_form`` first on the RNA MSA
        file and then on the protein MSA file, asserting after each call that
        the returned value is not ``None``.
        """
        # RNA alignment
        rna_alignment = fasta_reader.get_alignment_int_form(
            self.__rna_msa_file, biomolecule=self.__rna)
        self.assertIsNotNone(rna_alignment)

        # Protein alignment
        protein_alignment = fasta_reader.get_alignment_int_form(
            self.__protein_msa_file, biomolecule=self.__protein)
        self.assertIsNotNone(protein_alignment)
Ejemplo n.º 2
0
    def get_single_site_freqs(self):
        """Compute single-site frequencies from the MSA data.

        The alignment is read from the instance's MSA file and converted to a
        numpy array, sequence weights are computed for it with the instance's
        sequence identity, and both are handed to
        ``msa_numerics.compute_single_site_freqs``.

        Parameters
        ----------
            self : PlmDCA
                An instance of PlmDCA class

        Returns
        -------
            single_site_freqs :
                The value returned by
                ``msa_numerics.compute_single_site_freqs``; documented as a
                2d numpy array of type float64 with shape
                (seqs_len, num_site_states), where seqs_len is the length of
                the sequences in the alignment data.
        """
        msa_int_form = get_alignment_int_form(
            self.__msa_file,
            biomolecule=self.__biomolecule,
        )
        alignment_data = np.array(msa_int_form)

        # Weights are recomputed here rather than read from the instance.
        weights = msa_numerics.compute_sequences_weight(
            alignment_data=alignment_data,
            sequence_identity=self.__seqid,
        )

        logger.info('\n\tComputing single site frequencies')
        return msa_numerics.compute_single_site_freqs(
            alignment_data=alignment_data,
            num_site_states=self.__num_site_states,
            seqs_weight=weights,
        )
Ejemplo n.º 3
0
    def compute_seqs_weight(self):
        """Compute the weight of every sequence in the alignment.

        Reads the alignment from the instance's MSA file, computes the
        sequence weights with the instance's sequence identity, and stores
        both the weights and their sum (the effective number of sequences)
        on the instance before returning the weights.

        Parameters
        ----------
            self : PlmDCA
                An instance of PlmDCA class

        Returns
        -------
            seqs_weight : np.array
                The value returned by
                ``msa_numerics.compute_sequences_weight``; documented as a
                1d numpy array containing sequences weight.
        """
        start_msg = '\n\tComputing sequences weight with sequence identity {}'
        logger.info(start_msg.format(self.__seqid))

        msa_int_form = get_alignment_int_form(
            self.__msa_file,
            biomolecule=self.__biomolecule,
        )
        alignment_data = np.array(msa_int_form)
        seqs_weight = msa_numerics.compute_sequences_weight(
            alignment_data=alignment_data,
            sequence_identity=self.__seqid,
        )

        # The effective number of sequences is the sum of all weights.
        effective_count = np.sum(seqs_weight)
        logger.info(
            '\n\tEffective number of sequences: {}'.format(effective_count))

        # Keep both results on the instance.
        self.__seqs_weight = seqs_weight
        self.__eff_num_seqs = effective_count
        return seqs_weight
Ejemplo n.º 4
0
    def setUp(self):
        """Build the RNA and protein backmappers used by the tests.

        Stores the MSA and reference-sequence file paths taken from
        ``InputFilesPath``, reads each alignment in integer form, and wraps
        each one in a ``seq_backmapper.SequenceBackmapper`` together with its
        reference-sequence file.
        """
        # Input file locations
        self.__rna_msa_file = InputFilesPath.rna_msa_file
        self.__rna_ref_file = InputFilesPath.rna_ref_file
        self.__protein_msa_file = InputFilesPath.protein_msa_file
        self.__protein_ref_file = InputFilesPath.protein_ref_file

        # RNA backmapper
        rna_msa_int = fasta_reader.get_alignment_int_form(
            self.__rna_msa_file,
            biomolecule='rna',
        )
        self.__rna_backmapper = seq_backmapper.SequenceBackmapper(
            alignment_data=rna_msa_int,
            refseq_file=self.__rna_ref_file,
            biomolecule='rna')

        # Protein backmapper
        protein_msa_int = fasta_reader.get_alignment_int_form(
            self.__protein_msa_file,
            biomolecule='protein',
        )
        self.__protein_backmapper = seq_backmapper.SequenceBackmapper(
            alignment_data=protein_msa_int,
            refseq_file=self.__protein_ref_file,
            biomolecule='protein')
Ejemplo n.º 5
0
    def __init__(self,
                 msa_file_name,
                 biomolecule,
                 pseudocount=None,
                 seqid=None):
        """MeanFieldDCA object class initializer

        Validates the numeric parameters and the biomolecule type, reads the
        alignment in integer form, and computes the sequence weights and the
        effective number of sequences.

        Parameters
        ----------
            msa_file_name : str
                Name of the FASTA formatted file containing alignment
            biomolecule : str
                Type of biomolecule (must be protein or RNA, lower or
                upper case; surrounding whitespace is ignored)
            pseudocount : float
                Parameter for regularizing data before DCA analysis.
                Must satisfy 0 <= pseudocount < 1. Default value is 0.5
            seqid : float
                This parameter's value measures the maximum
                similarity two or more sequences can have so that they can be
                considered distinct, or lumped together otherwise.
                Must satisfy 0 < seqid <= 1. Default value is 0.8. When it
                equals 1 every sequence is given a weight of one.

        Raises
        ------
            ValueError
                If pseudocount or seqid is out of range, if biomolecule is
                neither protein nor RNA, or if the alignment contains no
                sequences.
        """

        self.__pseudocount = pseudocount if pseudocount is not None else 0.5
        self.__seqid = seqid if seqid is not None else 0.8
        # Validate the pseudocount in case the user provided an invalid one.
        if self.__pseudocount >= 1.0 or self.__pseudocount < 0:
            logger.error('\n\tValue of relative pseudo-count must be'
                         ' between 0 and 1.0. Typical value is 0.5')
            raise ValueError(
                'pseudocount must satisfy 0 <= pseudocount < 1, '
                'got {!r}'.format(self.__pseudocount))
        # Validate the value of sequence identity.
        if self.__seqid > 1.0 or self.__seqid <= 0.0:
            logger.error(
                '\n\tValue of sequence-identity must'
                ' not exceed 1 nor less than 0. Typical values are 0.7, 0.8., 0.9'
            )
            raise ValueError(
                'seqid must satisfy 0 < seqid <= 1, '
                'got {!r}'.format(self.__seqid))
        # Normalize so that the comparison below is case-insensitive.
        biomolecule = biomolecule.strip().upper()
        self.__msa_file_name = msa_file_name
        if biomolecule == 'RNA':
            self.__num_site_states = 5
        elif biomolecule == 'PROTEIN':
            self.__num_site_states = 21
        else:
            logger.error(
                '\n\tUnknown biomolecule ... must be protein (PROTEIN) or rna (RNA)',
            )
            raise ValueError(
                'biomolecule must be protein or RNA, '
                'got {!r}'.format(biomolecule))

        self.__sequences = fasta_reader.get_alignment_int_form(
            self.__msa_file_name,
            biomolecule=biomolecule,
        )

        self.__num_sequences = len(self.__sequences)
        # Fail with a clear error instead of an IndexError on the first
        # sequence lookup below when the alignment is empty.
        if self.__num_sequences == 0:
            logger.error('\n\tNo sequences found in alignment file {}'.format(
                self.__msa_file_name))
            raise ValueError(
                'alignment file {!r} contains no sequences'.format(
                    self.__msa_file_name))
        # The length of the first sequence is taken as the alignment length.
        self.__sequences_len = len(self.__sequences[0])
        self.__biomolecule = biomolecule
        if self.__seqid < 1.0:
            self.__sequences_weight = self.compute_sequences_weight()
        else:
            # assign each sequence a weight of one
            self.__sequences_weight = np.ones((self.__num_sequences, ),
                                              dtype=np.float64)
        self.__effective_num_sequences = np.sum(self.__sequences_weight)

        mf_dca_info = """\n\tCreated a MeanFieldDCA object with the following attributes
        \tbiomolecule: {}
        \ttotal states at sites: {}
        \tpseudocount: {}
        \tsequence identity: {}
        \talignment length: {}
        \ttotal number of unique sequences (excluding redundant sequences with 100 percent similarity): {}
        \teffective number of sequences (with sequence identity {}): {}
        """.format(
            biomolecule,
            self.__num_site_states,
            self.__pseudocount,
            self.__seqid,
            self.__sequences_len,
            self.__num_sequences,
            self.__seqid,
            self.__effective_num_sequences,
        )
        logger.info(mf_dca_info)