Ejemplo n.º 1
0
def pseudo_bg(bgs, no_reverse):
    """
        Process the background frequencies: optionally average them
        with their reverse complement, then normalize them
        ----
        Parameters:
            bgs (dict) : dictionary of the background frequencies
            no_reverse (bool) : if set to False, the background
                                frequencies are first passed through
                                average_bg_with_rc(); if set to True
                                they are used as given
        ----
        Returns:
            bgs_proc : value returned by norm_bg() on the (possibly
                       averaged) background frequencies
                       (presumably a dict -- confirm against norm_bg)
        ----
        Raises:
            NotValidBGException : if bgs is not a dict
            ValueException : if no_reverse is not a bool
    """

    # validate the arguments before touching the data
    if not isinstance(bgs, dict):
        raise NotValidBGException(
            "\n\nERROR: unable to add the pseudocount to the background")

    if not isinstance(no_reverse, bool):
        got = str(type(no_reverse))
        raise ValueException(' '.join(["Boolean value required, got", got]))

    # both strands requested -> average with the reverse complement
    to_normalize = bgs if no_reverse else average_bg_with_rc(bgs)

    return norm_bg(to_normalize)
Ejemplo n.º 2
0
def print_scoring_msg(no_reverse, motif):
    """Announce on the terminal which motif strands are being scored.

    A line for the forward motif ('+' prefixed ID) is always printed,
    preceded by an empty line. When ``no_reverse`` is falsy a second line
    for the reverse motif ('-' prefixed ID) follows, terminated by an
    extra empty line.

    Parameters
    ----------
    no_reverse : bool
        if True only the forward strand message is printed
    motif : Motif
        Motif object whose ID is read through ``getMotifID()``

    Raises
    ------
    ValueException
        if ``motif`` is not a Motif instance
    """

    if not isinstance(motif, Motif):
        raise ValueException(
            '\n\nERROR: The given motif is not an instance of Motif')

    motif_id: str = motif.getMotifID()

    print('\nScoring hits for motif', ''.join(('+', motif_id)))

    # forward strand only: nothing else to report
    if no_reverse:
        return

    print('Scoring hits for motif', ''.join(('-', motif_id)), end="\n\n")
Ejemplo n.º 3
0
def compute_qvalues(pvalues: List[np.double]) -> List[np.double]:
    """Correct a list of P-values with the Benjamini-Hochberg procedure.

    The correction is delegated to ``multipletests`` called with
    ``method="fdr_bh"``; the second item of its result is returned as a
    plain list. A progress message is printed before the computation.

    Parameters
    ----------
    pvalues : list
        P-values to correct

    Returns
    -------
    list
        q-values, one per input P-value

    Raises
    ------
    ValueException
        if ``pvalues`` is not a list
    """

    if isinstance(pvalues, list):
        print("\nComputing q-values...\n")

        # item 1 of the multipletests result holds the corrected P-values
        corrected = multipletests(pvalues, method="fdr_bh")[1]
        return list(corrected)

    raise ValueException("\n\nERROR: P-values must be in a list")
Ejemplo n.º 4
0
    def setScale(self, scale: int) -> None:
        """Store the scale factor in ``self._scale`` after validating it.

        Parameters
        ----------
        scale : int
            scale factor; must be strictly positive

        Raises
        ------
        ValueException
            if ``scale`` is not an int or is not strictly positive
        """

        if not isinstance(scale, int):
            raise ValueException("\n\nERROR: the scale factor must be an int")

        # explicit check rather than `assert`: assertions are removed when
        # Python runs with -O, which would let a non-positive scale through
        if scale <= 0:
            raise ValueException(
                "\n\nERROR: the scale factor must be a positive int")

        self._scale = scale
Ejemplo n.º 5
0
    def setMax_val(self, max_val: int) -> None:
        """Store ``max_val`` in ``self._max_val``, rejecting +infinity.

        Parameters
        ----------
        max_val : int
            value to store; must compare lower than ``np.inf``

        Raises
        ------
        ValueException
            if ``max_val >= np.inf``
        """

        if max_val >= np.inf:
            # str() is required here: str.join() raises TypeError when it
            # is handed a number, which used to hide the intended error
            errmsg = ' '.join([
                "\n\nERROR: impossible to assign", str(max_val),
                "to Motif.max_val"
            ])
            raise ValueException(errmsg)

        self._max_val = max_val
Ejemplo n.º 6
0
    def setMin_val(self, min_val):
        """Store ``min_val`` in ``self._min_val``, rejecting -infinity.

        Parameters
        ----------
        min_val : number
            value to store; must compare greater than ``-np.inf``

        Raises
        ------
        ValueException
            if ``min_val <= -np.inf``
        """

        if min_val <= -np.inf:
            # str() is required here: str.join() raises TypeError when it
            # is handed a number, which used to hide the intended error
            errmsg = ' '.join([
                "\n\nERROR: impossible to assign", str(min_val),
                "to Motif.min_val"
            ])
            raise ValueException(errmsg)

        self._min_val = min_val
Ejemplo n.º 7
0
def pseudo_bg(bgs: Dict, no_reverse: bool) -> Dict:
    """Prepare the nucleotide background probabilities used to build the
    motif scoring matrix.

    When both strands are considered (``no_reverse`` is False) the
    background is first passed through ``average_bg_with_rc``; otherwise
    it is used as given.

    The result is then handed to ``norm_bg``, whose output is returned.
    NOTE(review): the pseudocount is presumably added inside ``norm_bg``;
    that helper is not visible here -- confirm.

    Parameters
    ----------
    bgs : dict
        background probability distribution
    no_reverse : bool
        if True only the forward strand is considered and no averaging
        takes place; if False the background is averaged with its
        reverse complement

    Returns
    -------
    dict
        background probability distribution returned by ``norm_bg``

    Raises
    ------
    NotValidBGException
        if ``bgs`` is not a dict
    ValueException
        if ``no_reverse`` is not a bool
    """

    if not isinstance(bgs, dict):
        raise NotValidBGException(
            "\n\nERROR: unable to add the pseudocount to the background")

    if not isinstance(no_reverse, bool):
        got: str = str(type(no_reverse))
        raise ValueException(' '.join(["Boolean value required, got", got]))

    # both strands requested -> average with the reverse complement
    to_normalize: Dict = bgs if no_reverse else average_bg_with_rc(bgs)

    return norm_bg(to_normalize)
Ejemplo n.º 8
0
def print_scoring_msg(no_reverse, motif):
    """
        Print on the terminal which motif strands are being scored.
        The message block is surrounded by one empty line before
        and one after
        ----
        Parameters:
            no_reverse (bool) : if True only the forward ('+') motif
                                line is printed, otherwise a line for
                                the reverse ('-') motif is printed too
            motif (Motif) : motif whose ID is read with getMotifID()
        ----
        Raises:
            ValueException : if motif is not an instance of Motif
    """

    if not isinstance(motif, Motif):
        errmsg = '\n\nERROR: The given motif is not an instance of Motif'
        raise ValueException(errmsg)

    motif_id = motif.getMotifID()

    # strand-tagged IDs to report, forward strand first
    labels = [''.join(('+', motif_id))]
    if not no_reverse:
        labels.append(''.join(('-', motif_id)))

    print()  # leading empty line
    for label in labels:
        print('Scoring hits for motif', label)
    print()  # trailing empty line
Ejemplo n.º 9
0
def compute_qvalues(pvalues):
    """
        Correct a list of P-values with the Benjamini-Hochberg
        method (multipletests called with method="fdr_bh") and
        return the corrected values as q-values.
        A progress message is printed before the computation
        ----
        Parameters:
            pvalues (list) : list of P-values
        ----
        Returns:
            qvalues (list) : list of computed q-values
        ----
        Raises:
            ValueException : if pvalues is not a list
    """

    if isinstance(pvalues, list):
        print("\nComputing q-values...\n")

        # item 1 of the multipletests result holds the corrected P-values
        corrected = multipletests(pvalues, method="fdr_bh")[1]
        return list(corrected)

    raise ValueException("\n\nERROR: P-values must be in a list")
Ejemplo n.º 10
0
def build_df(motif: Motif,
             seqnames: List[str],
             starts: List[int],
             stops: List[int],
             strands: List[str],
             scores: List[np.double],
             pvalues: List[np.double],
             qvalues: List[np.double],
             sequences: List[str],
             frequencies: List[int],
             references: List[str],
             threshold: float,
             qval_t: bool,
             no_qvalue: bool, 
             recomb: bool
) -> pd.DataFrame:
    """Build the results summary report as a pandas DataFrame.

    The motif occurrence candidates are filtered by applying a threshold
    on the P-value or on the q-value; candidates whose frequency is 0 are
    also discarded unless ``recomb`` is True.

    The surviving entries are sorted by P-value and indexed from 1.

    Output columns, in order: motif_id, motif_alt_id, sequence_name,
    start, stop, strand, score, p-value, q-value (only when q-values are
    available), matched_sequence, haplotype_frequency, reference.

    Parameters
    ----------
    motif : Motif
        Motif object; its ID and name fill the first two columns
    seqnames : list
        sequence names
    starts : list
        starting coordinates
    stops : list
        stopping coordinates
    strands : list
        DNA strands
    scores : list
        candidate scores
    pvalues : list
        P-values
    qvalues : list
        q-values (only read when ``no_qvalue`` is False)
    sequences : list
        sequences
    frequencies : list
        per-candidate frequencies; a candidate whose ``int()`` value is
        0 is skipped when ``recomb`` is False
    references : list
        flag values stating if the sequences contain genomic variants
    threshold : float
        threshold to apply on P-values or q-values in order to filter
        the motif occurrence candidates to report (strict ``<``)
    qval_t : bool
        if True the threshold is applied on q-values rather than on
        P-values
    no_qvalue : bool
        if True the q-values have not been computed and no q-value
        column is produced
    recomb : bool
        if True candidates with frequency 0 are reported as well

    Returns
    -------
    pandas.DataFrame
        final results report

    Raises
    ------
    ValueException
        if one of the type-checked arguments has an unexpected type; the
        message names the offending argument
    AssertionError
        if the input lists do not share the same length, or if
        ``qval_t`` is True while q-values are not available
    """

    # each entry: (argument name, value, required type).
    # NOTE: scores, frequencies and threshold are not type-checked here;
    # only the lengths of scores and frequencies are verified below.
    type_checks = (
        ("motif", motif, Motif),
        ("seqnames", seqnames, list),
        ("starts", starts, list),
        ("stops", stops, list),
        ("strands", strands, list),
        ("pvalues", pvalues, list),
        ("qvalues", qvalues, list),
        ("sequences", sequences, list),
        ("references", references, list),
        ("qval_t", qval_t, bool),
        ("no_qvalue", no_qvalue, bool),
        ("recomb", recomb, bool),
    )

    for argname, value, required in type_checks:
        if not isinstance(value, required):
            # name the offending argument instead of always blaming motif
            errmsg: str = ' '.join(
                ["\n\nERROR: unknown data-type for", argname])
            raise ValueException(errmsg)

    lst_len: int = len(seqnames)

    # all per-candidate lists must be aligned
    assert len(starts) == lst_len
    assert len(stops) == lst_len
    assert len(strands) == lst_len
    assert len(scores) == lst_len
    assert len(pvalues) == lst_len
    assert len(sequences) == lst_len
    assert len(frequencies) == lst_len
    assert len(references) == lst_len

    # check if we want also the q-values
    if not no_qvalue:
        assert len(qvalues) == lst_len

    # apply the threshold on the q-values rather than on P-values
    if qval_t:
        assert (not no_qvalue)
        assert len(qvalues) > 0

    # values the threshold is compared against
    tested_values: List[np.double] = qvalues if qval_t else pvalues

    # indexes of the surviving candidates: candidates with frequency 0 are
    # ignored unless the user asked for them (recomb), and only values
    # strictly under the threshold survive
    kept: List[int] = [
        i for i in range(lst_len)
        if (recomb or int(frequencies[i]) != 0)
        and tested_values[i] < threshold
    ]

    def pick(values: list) -> list:
        # restrict an input list to the surviving candidates
        return [values[i] for i in kept]

    df_len: int = len(kept)

    df = pd.DataFrame()

    # TF's ID and name are repeated on every row
    df['motif_id'] = [motif.getMotifID()] * df_len
    df['motif_alt_id'] = [motif.getMotifName()] * df_len
    df['sequence_name'] = pick(seqnames)
    df['start'] = pick(starts)
    df['stop'] = pick(stops)
    df['strand'] = pick(strands)
    df['score'] = pick(scores)
    df['p-value'] = pick(pvalues)

    # add the q-values to the final data frame if they have been computed
    if not no_qvalue:
        df['q-value'] = pick(qvalues)

    # finish to build the data frame
    df['matched_sequence'] = pick(sequences)
    df['haplotype_frequency'] = pick(frequencies)
    df['reference'] = pick(references)

    # sort entries by p-value
    df = df.sort_values(['p-value'], ascending=True)

    # reindex the data frame in order to have indexes in range [1, df_len]
    df.index = list(range(1, (df_len + 1)))

    return df
Ejemplo n.º 11
0
    def get_references(self):
        """Return ``self._references``.

        Raises
        ------
        ValueException
            if ``self._references`` is falsy (e.g. empty or None)
        """

        references = self._references

        if references:
            return references

        raise ValueException(
            "\n\nERROR: attempting to access an empty attribute")
Ejemplo n.º 12
0
def build_df(motif, seqnames, starts, stops, strands, scores, pvalues, qvalues,
             sequences, references, threshold, qval_t, no_qvalue):
    """
        Build a pandas DataFrame to summarize the results
        of GRAFIMO analysis. Only the candidates whose P-value
        (or q-value) is strictly under the threshold are kept;
        rows are sorted by P-value and indexed starting from 1
        ----
        Parameters:
            motif (Motif) : motif
            seqnames (list) : list of sequence names
            starts (list) : list of sequence starting positions
            stops (list) : list of sequence ending positions
            strands (list) : list of sequence strands
            scores (list) : list of sequence scores
            pvalues (list) : list of sequence score P-values
            qvalues (list) : list of sequence q-values
            sequences (list) : list of sequences
            references (list) : list of sequence flag values. If 'ref',
                                then the sequence belong to the reference genome,
                                if 'non.ref', then the sequence contains variants
            threshold (float) : threshold to apply on the P-value (default behavior)
                                or on the q-values
            qval_t (bool) : if set to True, the threshold will be applied on the
                            q-values, on the P-values otherwise
            no_qvalue (bool) : if set to True the q-values are not used and
                               the 'q-value' column is not added
        ----
        Returns:
             df (pd.DataFrame) : columns, in order:
                 motif_id, motif_alt_id, sequence_name, start, stop,
                 strand, score, p-value, q-value (optional),
                 matched_sequence, reference
        ----
        Raises:
            ValueException : if a type-checked argument has the wrong type
            AssertionError : if the lists have different lengths, or if
                             qval_t is True while no_qvalue is True
    """

    if not isinstance(motif, Motif):
        raise ValueException("\n\nERROR: unknown data-type for motif")

    # every other type failure is reported with the same message
    generic_msg = "\n\nERROR: unknown data-type, cannot proceed"

    for list_arg in (seqnames, starts, stops, strands, pvalues, qvalues,
                     sequences, references):
        if not isinstance(list_arg, list):
            raise ValueException(generic_msg)

    for flag in (qval_t, no_qvalue):
        if not isinstance(flag, bool):
            raise ValueException(generic_msg)

    # all lists must have the same length
    n_candidates = len(seqnames)

    for column in (starts, stops, strands, scores, pvalues, sequences,
                   references):
        assert len(column) == n_candidates

    # the q-values are checked only when they are wanted
    if not no_qvalue:
        assert len(qvalues) == n_candidates

    # thresholding on q-values requires them to have been computed
    if qval_t:
        assert (not no_qvalue)
        assert len(qvalues) > 0

    # values the threshold is compared against
    tested = qvalues if qval_t else pvalues

    # indexes of the candidates strictly under the threshold
    kept = [i for i in range(n_candidates) if tested[i] < threshold]

    def pick(values):
        # restrict an input list to the surviving candidates
        return [values[i] for i in kept]

    n_kept = len(kept)

    df = pd.DataFrame()

    # TF's ID and name are repeated on every row
    df['motif_id'] = [motif.getMotifID()] * n_kept
    df['motif_alt_id'] = [motif.getMotifName()] * n_kept
    df['sequence_name'] = pick(seqnames)
    df['start'] = pick(starts)
    df['stop'] = pick(stops)
    df['strand'] = pick(strands)
    df['score'] = pick(scores)
    df['p-value'] = pick(pvalues)

    # the q-value column exists only if the q-values have been computed
    if not no_qvalue:
        df['q-value'] = pick(qvalues)

    df['matched_sequence'] = pick(sequences)
    df['reference'] = pick(references)

    # sort entries by p-value
    df = df.sort_values(['p-value'], ascending=True)

    # reindex the data frame in order to have indexes in range [1, n_kept]
    df.index = list(range(1, (n_kept + 1)))

    return df
Ejemplo n.º 13
0
    def get_strands(self):
        """Return ``self._strands``.

        Raises
        ------
        ValueException
            if ``self._strands`` is falsy (e.g. empty or None)
        """

        strands = self._strands

        if strands:
            return strands

        raise ValueException(
            "\n\nERROR: attempting to access an empty attribute")