Esempio n. 1
0
def score_seqs(sequences: List[str],
               motif: Motif,
               no_reverse: bool,
               return_dict: DictProxy,
               scanned_seqs_dict: DictProxy,
               scanned_nucs_dict: DictProxy,
               pid: int
) -> None:
    """Score the retrieved sequences using motif scoring matrix data.

    Every entry of `sequences` is opened as a text file and read line by
    line. Each line is split on tabs and its fields are used as follows:

    * field 0: region name; 'chr' is prepended to obtain the sequence
      name, and the part before the first ':' is the chromosome
    * field 1: the sequence to score
    * field 2: start descriptor; the text after the first ':' minus its
      last character is the start, and the last character is the strand
    * field 3: stop descriptor, parsed like the start
    * field 4: frequency
    * field 5: reference flag

    The partial results are stored in the shared dictionaries, using
    `pid` as key for the current entry. The different entries will be
    merged at the end, to obtain the final results report.

    If a KeyboardInterrupt is received while scoring, the function
    returns silently without storing anything.

    Parameters
    ----------
    sequences : list
        paths of the files holding the sequences to score
    motif : Motif
        motif object containing the scoring matrix and the P-value matrix
    no_reverse : bool
        if True only the sequences belonging to the forward strand
        ('+') will be scored
    return_dict : multiprocessing.managers.DictProxy
        dictionary where the current chunk of results will be
        stored
    scanned_seqs_dict : mp.managers.DictProxy
        dictionary storing the number of sequences scanned in each
        sequence chunk
    scanned_nucs_dict : mp.managers.DictProxy
        dictionary storing the number of nucleotides scanned in each
        sequence chunk (scanned sequences times motif width)
    pid : int
        key under which this call stores its entries in the three
        shared dictionaries
    """

    try:
        # get Motif attributes to score sequences
        score_matrix: np.ndarray = motif.getMotif_scoreMatrix()
        pval_mat: np.ndarray = motif.getMotif_pval_mat()
        min_score: int = motif.getMin_val()
        scale: int = motif.getScale()
        width: int = motif.getWidth()
        offset: np.double = motif.getOffset()

        # initialize lists where results will be stored
        seqs: List[str] = list()
        scores: List[np.double] = list()
        pvalues: List[np.double] = list()
        seqnames: List[str] = list()
        chroms: List[str] = list()
        # coordinates and frequencies are stored exactly as read from the
        # file, i.e. as strings
        starts: List[str] = list()
        stops: List[str] = list()
        strands: List[str] = list()
        frequencies: List[str] = list()
        references: List[str] = list()

        seqs_scanned: int = 0  # counter for scanned sequences

        for s in sequences:
            with open(s, mode='r') as raw_sequences:
                for line in raw_sequences:
                    data = line.split('\t')

                    # the strand is the last character of the start field
                    strand = data[2][-1]

                    # when only the forward strand is requested, records
                    # on any other strand are skipped untouched
                    if no_reverse and strand != '+':
                        continue

                    # read the final values
                    seq = data[1]
                    seqname = ''.join(['chr', data[0]])
                    chrom = seqname.split(':')[0]
                    # drop the trailing strand character from coordinates
                    start = data[2].split(':')[1][:-1]
                    stop = data[3].split(':')[1][:-1]
                    freq = data[4]
                    ref = data[5]

                    score, pvalue = compute_score_seq(seq, score_matrix,
                                                      pval_mat, min_score,
                                                      scale, width, offset)
                    seqs_scanned += 1
                    seqs.append(seq)
                    scores.append(score)
                    pvalues.append(pvalue)
                    seqnames.append(seqname)
                    chroms.append(chrom)
                    starts.append(start)
                    stops.append(stop)
                    strands.append(strand)
                    frequencies.append(freq)

                    # fix indels reference report bug: a sequence flagged
                    # as reference whose span differs from the motif
                    # width is reported as non-reference
                    distance: int = np.abs(int(stop) - int(start))
                    if ref == "ref" and distance != width:
                        ref = "non.ref"

                    references.append(ref)
                # end for
            # end open
        # end for

    except KeyboardInterrupt:
        # deliberate best-effort: leave without publishing partial results
        pass

    else:

        res_tmp = ResultTmp(seqnames, seqs, chroms, starts, stops, strands,
                            scores, pvalues, frequencies, references)

        return_dict[pid] = res_tmp
        scanned_seqs_dict[pid] = seqs_scanned
        scanned_nucs_dict[pid] = seqs_scanned * width
Esempio n. 2
0
def build_df(motif: Motif,
             seqnames: List[str],
             starts: List[int],
             stops: List[int],
             strands: List[str],
             scores: List[np.double],
             pvalues: List[np.double],
             qvalues: List[np.double],
             sequences: List[str],
             frequencies: List[int],
             references: List[str],
             threshold: float,
             qval_t: bool,
             no_qvalue: bool, 
             recomb: bool
) -> pd.DataFrame:
    """Build the results summary report. The results are stored in a 
    pandas DataFrame object.

    The motif occurrence candidates are filtered applying a threshold on
    the P-value or on the q-value: only candidates whose value is
    strictly below `threshold` are kept.

    The remaining entries are reported in the final results, sorted by
    increasing P-value and indexed from 1.

    Parameters
    ----------
    motif : Motif
        Motif object
    seqnames : list
        sequence names
    starts : list
        starting coordinates
    stops : list
        stopping coordinates
    strands : list
        DNA strands
    scores : list
        scores
    pvalues: list
        P-values
    qvalues : list
        q-values
    sequences : list
        sequences
    frequencies : list
        frequency of each sequence; entries are converted with int()
        and a value of 0 marks a candidate to drop unless `recomb` is
        True
    references : list
        flag values stating if the sequences contain genomic variants
    threshold : float
        threshold to apply on P-values or q-values in order to filter
        the motif occurrence candidates to report
    qval_t : bool
        if True the threshold will be applied on q-values rather than on
        P-values
    no_qvalue:
        if True the q-values have not been computed
    recomb : bool
        if True will be reported also sequences which can be built with 
        the given set of genomic variants but do not appear in the
        available samples haplotypes
    
    Returns
    -------
    pandas.DataFrame
        final results report

    Raises
    ------
    ValueException
        if `motif` is not a Motif, if one of the type-checked list
        arguments is not a list, or if one of the flags is not a bool
    AssertionError
        if the input lists do not share the same length, or if the
        threshold is requested on q-values that were not computed
    """

    # the message names the offending argument
    errmsg: str = "\n\nERROR: unknown data-type for {}"
    if not isinstance(motif, Motif):
        raise ValueException(errmsg.format("motif"))

    list_args = (
        ("seqnames", seqnames),
        ("starts", starts),
        ("stops", stops),
        ("strands", strands),
        ("pvalues", pvalues),
        ("qvalues", qvalues),
        ("sequences", sequences),
        ("references", references),
    )
    for argname, value in list_args:
        if not isinstance(value, list):
            raise ValueException(errmsg.format(argname))

    flag_args = (
        ("qval_t", qval_t),
        ("no_qvalue", no_qvalue),
        ("recomb", recomb),
    )
    for argname, value in flag_args:
        if not isinstance(value, bool):
            raise ValueException(errmsg.format(argname))

    lst_len: int = len(seqnames)

    assert len(starts) == lst_len
    assert len(stops) == lst_len
    assert len(strands) == lst_len
    assert len(scores) == lst_len
    assert len(pvalues) == lst_len
    assert len(sequences) == lst_len
    assert len(frequencies) == lst_len
    assert len(references) == lst_len

    # check if we want also the q-values
    if not no_qvalue: 
        assert len(qvalues) == lst_len

    # apply the threshold on the q-values rather than on P-values
    if qval_t:  
        assert (not no_qvalue)
        assert len(qvalues) > 0 

    # values the threshold is compared against
    filter_values = qvalues if qval_t else pvalues

    # indexes of the candidates surviving the filters. Candidates which do
    # not appear in any sample (frequency 0) are ignored unless the user
    # asked for them (recomb); the frequency is only inspected when
    # recomb is False
    kept: List[int] = [
        i for i in range(lst_len)
        if (recomb or int(frequencies[i]) != 0)
        and filter_values[i] < threshold
    ]

    df_len: int = len(kept)

    # TF's name and ID list
    motif_ids: List[str] = [motif.getMotifID()] * df_len
    motif_names: List[str] = [motif.getMotifName()] * df_len

    df = pd.DataFrame()
    df['motif_id'] = motif_ids
    df['motif_alt_id'] = motif_names
    df['sequence_name'] = [seqnames[i] for i in kept]
    df['start'] = [starts[i] for i in kept]
    df['stop'] = [stops[i] for i in kept]
    df['strand'] = [strands[i] for i in kept]
    df['score'] = [scores[i] for i in kept]
    df['p-value'] = [pvalues[i] for i in kept]

    # add the q-values to the final data frame if they have been computed
    if not no_qvalue:
        df['q-value'] = [qvalues[i] for i in kept]

    # finish to build the data frame
    df['matched_sequence'] = [sequences[i] for i in kept]
    df['haplotype_frequency'] = [frequencies[i] for i in kept]
    df['reference'] = [references[i] for i in kept]

    # sort entries by p-value
    df = df.sort_values(['p-value'], ascending=True)

    # reindex the data frame in order to have indexes in range [1, (df_len + 1)]
    df.index = list(range(1, (df_len + 1)))

    return df
Esempio n. 3
0
def read_MEME_motif(motif_file: str, bg_file: str, pseudocount: float,
                    no_reverse: bool, verbose: bool,
                    debug: bool) -> List[Motif]:
    """Read motif PWM in MEME format.

    The data read are then used to build the scoring matrix for the 
    motif, the P-value matrix, etc.

    Since a MEME file can contain one or more motifs, for each stored PWM
    is built the corresponding Motif object. The resulting set of motifs are 
    stored in a list, which will constitute a MotifSet object.

    Errors are reported through `exception_handler`, which receives the
    exception class, the message and the `debug` flag.

    ...
    
    Parameters
    ----------
    motif_file : str
        path to the motif file in MEME format
    bg_file
        path to the background file in Markov Background Format
        (http://meme-suite.org/doc/bfile-format.html), or the UNIF
        constant to use a uniform background
    pseudocount : float
        value to add to motif PWM counts; must be > 0
    no_reverse : bool
        strand-selection flag, forwarded to `pseudo_bg`.
        NOTE(review): the previous text said "if False only the forward
        strand will be considered", which looks inverted with respect to
        the flag name -- confirm against `pseudo_bg`.
    verbose : bool
        print additional information
    debug:
        trace the full error stack

    Returns
    -------
    List[Motif]
        list of Motif objects
    """

    if not isinstance(motif_file, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(motif_file).__name__),
                          debug)
    if not os.path.isfile(motif_file):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(motif_file), debug)
    if not isinstance(bg_file, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(bg_file).__name__),
                          debug)
    if bg_file != UNIF and not os.path.isfile(bg_file):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(bg_file), debug)
    if not isinstance(pseudocount, float):
        errmsg = "Expected float, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(pseudocount).__name__),
                          debug)
    if pseudocount <= 0:
        errmsg = "The pseudocount must be > 0.\n"
        exception_handler(ValueError, errmsg, debug)
    if not isinstance(no_reverse, bool):
        errmsg = "Expected bool, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(no_reverse).__name__),
                          debug)

    motifs_raw = list()
    motifs: List[Motif] = list()
    motifs_num = 0
    proceed = False
    # bound before the try block so that the finally clause can always
    # inspect it, even when open() itself fails; otherwise the cleanup
    # would raise UnboundLocalError and hide the real error
    ifstream = None
    # begin motif parsing
    try:
        ifstream = open(motif_file, mode="r")
        alphabet = __read_alphabet_meme(motif_file, ifstream,
                                        debug)  # shared by all motifs
        # symbol -> row index, used with np object
        nucsmap = {nuc: i for i, nuc in enumerate(alphabet)}
        while True:
            # advance to the next "MOTIF" header line
            for line in ifstream:
                if line.startswith("MOTIF"): break  # new motif instance
            else:
                # file exhausted: every motif has been read
                assert motifs_num == len(motifs_raw)
                proceed = True
                break
            if proceed: break  # read all motifs
            if verbose: start_rm = time.time()
            motifids = line.split()
            if len(motifids) == 2:  # only name
                motif_id = motifids[1]
                motif_name = motif_id
            else:  # assume first two fields: id, name
                motif_id, motif_name = motifids[1:3]
            statistics = __read_statistics_meme(motif_file, ifstream, debug)
            probs = __read_counts_meme(motif_file, ifstream,
                                       statistics["width"], debug)
            motifs_raw.append({
                "motifId": motif_id,
                "motifName": motif_name,
                "statistics": statistics,
                "counts": probs
            })
            motifs_num += 1
            if verbose:
                end_rm = time.time()
                print("Read motif %s in %.2fs." % (motif_name,
                                                   (end_rm - start_rm)))
        if not proceed:
            errmsg = "Unexpected premature EOF in {}.\n"
            exception_handler(EOFError, errmsg.format(motif_file), debug)
    except:
        # NOTE(review): the bare except is kept on purpose to preserve the
        # original behaviour (anything raised while parsing is reported as
        # a MotifFileReadError); narrowing it needs knowledge of
        # exception_handler that is not available here
        errmsg = "An error occurred while reading {}.\n"
        exception_handler(MotifFileReadError, errmsg.format(motif_file), debug)
    else:
        if bg_file == UNIF:
            bgs = get_uniformBG(alphabet, debug)
        elif os.path.isfile(bg_file):
            bgs = readBGfile(bg_file, debug)
        else:
            errmsg = "Unable to parse {}.\n"
            exception_handler(BGFileError, errmsg.format(bg_file), debug)
        bgs = pseudo_bg(bgs, no_reverse, debug)  # add pseudocount to bg
        for raw in motifs_raw:
            stats = raw["statistics"]
            motif_probs = pd.DataFrame(np.matrix(raw["counts"]))
            motif_probs.index = alphabet
            motif_probs = norm_motif(motif_probs, stats["width"], alphabet,
                                     debug)
            motif_probs = apply_pseudocount_meme(motif_probs.to_numpy(),
                                                 pseudocount,
                                                 stats["nsites"],
                                                 stats["width"],
                                                 bgs, alphabet, nucsmap, debug)
            motif: Motif = Motif(motif_probs, stats["width"],
                                 alphabet, raw["motifId"],
                                 raw["motifName"], nucsmap)
            motif.setBg(bgs)
            motifs.append(motif)
    finally:
        if ifstream is not None:
            ifstream.close()

    return motifs
Esempio n. 4
0
def process_motif_for_logodds(motif: Motif, debug: bool) -> Motif:
    """Turn the probability matrix of a motif into scaled log-odds scores.

    Three steps are applied in sequence to the given motif, each one
    storing its result on the motif itself:

    1. the log-odds matrix is computed from the motif count matrix,
       width, background, alphabet and symbol map;
    2. the log-odds matrix is scaled, and the scaled scores together with
       scale, minimum value, maximum value and offset are recorded;
    3. the P-value matrix is computed from the updated motif.

    ...

    Parameters
    ----------
    motif : Motif
        DNA motif to process; it is modified in place
    debug : bool
        trace the full error stack

    Returns
    -------
    Motif
        the same motif object received in input, after the update
    """

    # anything but a Motif is reported through the common error handler
    if not isinstance(motif, Motif):
        errmsg = "Expected Motif, got {}.\n"
        exception_handler(
            TypeError, errmsg.format(type(motif).__name__), debug
        )

    # step 1: log-odds
    log_odds = compute_log_odds(
        motif.countMatrix,
        motif.width,
        motif.bg,
        motif.alphabet,
        motif.nucsmap,
        debug,
    )
    motif.set_motifScoreMatrix(log_odds)

    # step 2: scaling of the log-odds matrix just stored on the motif
    scaling = scale_pwm(
        motif.scoreMatrix,
        motif.alphabet,
        motif.width,
        motif.nucsmap,
        debug,
    )
    scaled, lowest, highest, factor, shift = scaling
    motif.set_motifScoreMatrix(scaled)
    motif.set_isScaled()
    motif.set_scale(factor)
    motif.set_minVal(lowest)
    motif.set_maxVal(highest)
    motif.set_offset(shift)

    # step 3: P-value matrix
    motif.set_motifPvalMatrix(comp_pval_mat(motif, debug))

    return motif
Esempio n. 5
0
def read_JASPAR_motif(motif_file: str, bg_file: str, pseudocount: float,
                      no_reverse: bool, verbose: bool, debug: bool) -> Motif:
    """Read a motif PWM in JASPAR format.

    The data read are then used to build the scoring matrix for the 
    motif, the P-value matrix, etc.

    The first line of the file is the header: its first character is
    dropped and the first two tab-separated fields are taken as motif ID
    and motif name. Every following line, up to the first empty one,
    contributes one row of counts: the first character is the symbol and
    the values enclosed between the two bracket tokens are the counts.

    Errors are reported through `exception_handler`, which receives the
    exception class, the message and the `debug` flag.

    ...

    Parameters
    ----------
    motif_file : str
        path to the motif PWM in JASPAR format
    bg_file
        path to the background file in Markov Background Format
        (http://meme-suite.org/doc/bfile-format.html), or the UNIF
        constant to use a uniform background
    pseudocount : float
        value to add to motif PWM counts
    no_reverse : bool
        strand-selection flag, forwarded to `pseudo_bg`.
        NOTE(review): the previous text said "if False only the forward
        strand will be considered", which looks inverted with respect to
        the flag name -- confirm against `pseudo_bg`.
    verbose : bool
        print additional information
    debug:
        trace the full error stack

    Returns
    -------
    Motif
        Motif object 
    """

    nucs: List[str] = list()
    counts: List[List[float]] = list()  # one list of counts per symbol
    if verbose:
        start_rm: float = time.time()
    # bound before the try block so that the finally clause can always
    # inspect it, even when open() itself fails; otherwise the cleanup
    # would raise UnboundLocalError and hide the real error
    ifstream = None
    try:
        ifstream = open(motif_file, mode="r")
        readlines = 0  # check for empty files
        # begin parsing
        header: str = str(ifstream.readline().strip()[1:])
        if not header:  # empty file?
            errmsg = "{} seems to empty.\n"
            exception_handler(IOError, errmsg.format(motif_file), debug)
        motifID, motifName = header.split('\t')[0:2]
        readlines += 1
        while True:
            line = ifstream.readline().strip()
            if not line: break  # EOF or empty file?
            nuc = line[:1]
            # skip the symbol, then drop the opening and closing brackets
            count = list(map(float, line[1:].split()[1:][:-1]))
            nucs.append(nuc.upper())
            counts.append(count)
            readlines += 1
        if readlines <= 1:  # only header read ?
            errmsg = "{} seems to be empty.\n"
            exception_handler(IOError, errmsg.format(motif_file), debug)
    except:
        # NOTE(review): the bare except is kept on purpose to preserve the
        # original behaviour (anything raised while parsing is reported as
        # a MotifFileReadError); narrowing it needs knowledge of
        # exception_handler that is not available here
        errmsg = "An error occurred while reading {}.\n"
        exception_handler(MotifFileReadError, errmsg.format(motif_file), debug)
    else:
        if any(len(c) != len(counts[0]) for c in counts):
            errmsg = "Motif counts width mismatch.\n"
            exception_handler(ValueError, errmsg, debug)
        # symbol -> row index (file order), used with np object
        nucsmap = {nuc: i for i, nuc in enumerate(nucs)}
        motif_counts: pd.DataFrame = pd.DataFrame(
            data=counts, index=nucs)  # motif count matrix
        motif_width: int = int(len(counts[0]))
        alphabet: list = sorted(nucs)

        # compute background
        if bg_file == UNIF:
            bgs = get_uniformBG(alphabet, debug)
        elif os.path.isfile(bg_file):
            bgs = readBGfile(bg_file, debug)
        else:
            errmsg = "Unable to parse {}.\n"
            exception_handler(BGFileError, errmsg.format(bg_file), debug)
        bgs = pseudo_bg(bgs, no_reverse, debug)  # add pseudocount to bg

        # motif probability matrix
        motif_probs = (motif_counts / motif_counts.sum(0))
        motif_probs = norm_motif(motif_probs, motif_width, alphabet, debug)
        motif_probs = apply_pseudocount_jaspar(motif_counts.to_numpy(),
                                               motif_probs.to_numpy(),
                                               pseudocount, bgs, motif_width,
                                               alphabet, nucsmap, debug)
        motif: Motif = Motif(motif_probs, motif_width, alphabet, motifID,
                             motifName, nucsmap)
        motif.setBg(bgs)

        if verbose:
            end_rm: float = time.time()
            msg: str = "Read motif %s in %.2fs" % (motifID,
                                                   (end_rm - start_rm))
            print(msg)
    finally:
        if ifstream is not None:
            ifstream.close()

    return motif
Esempio n. 6
0
def read_JASPAR_motif(motif_file: str, bg_file: str, pseudocount: float,
                      no_reverse: bool, verbose: bool) -> Motif:
    """Read a motif PWM in JASPAR format.

    The data read are then used to build the scoring matrix for the 
    motif, the P-value matrix, etc.

    The first line of the file is the header: its first character is
    dropped and the first two tab-separated fields are taken as motif ID
    and motif name. Every following non-empty line contributes one row
    of counts: the first character is the symbol and the values enclosed
    between the two bracket tokens are the counts.

    Parameters
    ----------
    motif_file : str
        path to the motif PWM in JASPAR format
    bg_file
        path to the background file in Markov Background Format
        (http://meme-suite.org/doc/bfile-format.html), or 'UNIF' to use
        a uniform background
    pseudocount : float
        value to add to motif PWM counts
    no_reverse : bool
        strand-selection flag, forwarded to `pseudo_bg`.
        NOTE(review): the previous text said "if False only the forward
        strand will be considered", which looks inverted with respect to
        the flag name -- confirm against `pseudo_bg`.
    verbose : bool
        print additional information

    Returns
    -------
    Motif
        Motif object storing the data contained in motif_file

    Raises
    ------
    FileReadingException
        if anything goes wrong while opening or parsing motif_file
    NotValidBGException
        if bg_file is neither 'UNIF' nor an existing path
    """

    # lists where store nucleotides and raw counts
    nucs: List[str] = list()
    counts: List[List[float]] = list()

    if verbose:
        start_rm: float = time.time()

    try:
        # the with statement closes the file on every exit path, so no
        # explicit close() is needed; the former `finally: in_mtf.close()`
        # raised UnboundLocalError whenever open() itself failed, hiding
        # the FileReadingException raised below
        with open(motif_file) as in_mtf:

            header: str
            motifID: str
            motifName: str

            # read the header
            header = str(in_mtf.readline()[1:])
            # get the jaspar ID and the common TF name
            motifID, motifName = header.split('\t')[0:2]
            # remove the trailing '\n' only when it is actually there (the
            # name is not the last field when the header has extra columns)
            motifName = motifName.rstrip('\n')

            for line in in_mtf:
                line = line.strip()
                if not line:
                    # a blank line carries no symbol and no counts
                    continue
                nuc = line[:1]
                # skip the symbol, then drop the opening and closing brackets
                count = list(map(float, line[1:].split()[1:][:-1]))

                nucs.append(nuc)
                counts.append(count)
            # end for
        # end open

    except:
        # NOTE(review): bare except kept to preserve the original contract
        # (any failure while reading surfaces as FileReadingException)
        errmsg: str = ' '.join(["\n\nERROR: unable to read file", motif_file])
        raise FileReadingException(errmsg)

    else:

        motif_counts = pd.DataFrame(data=counts, index=nucs)
        motif_width: int = int(len(counts[0]))
        alphabet: list = sorted(nucs)  # alphabet as list

        bgs: Dict
        if bg_file == 'UNIF':
            bgs = get_uniformBG(alphabet)
        elif os.path.exists(bg_file):
            bgs = readBGfile(bg_file)
        else:
            errmsg = "\n\nERROR: unable to find the given background file"
            raise NotValidBGException(errmsg)
        # end if

        bgs = pseudo_bg(bgs, no_reverse)

        motif_probs: pd.DataFrame
        motif_probs = (motif_counts / motif_counts.sum(0))
        motif_probs = norm_motif(motif_probs, motif_width, alphabet)
        motif_probs = apply_pseudocount_jaspar(motif_counts, motif_probs,
                                               pseudocount, bgs, motif_width,
                                               alphabet)

        motif: Motif = Motif(motif_probs, motif_width, alphabet, motifID,
                             motifName)
        motif.setBg(bgs)

        if verbose:
            end_rm: float = time.time()
            msg: str = ''.join(
                ["Read motif ", motifID, " in ",
                 str(end_rm - start_rm), "s"])
            print(msg)
        # end if

        return motif
Esempio n. 7
0
def process_motif_for_logodds(motif: Motif) -> Motif:
    """Turn the probability matrix of a motif into scaled log-odds scores.

    Three steps are applied in sequence to the given motif, each one
    storing its result on the motif itself:

    1. the log-odds matrix is computed from the motif matrix, width,
       background and alphabet;
    2. the log-odds matrix is scaled, and the scaled scores together with
       scale, minimum value, maximum value and offset are recorded;
    3. the P-value matrix is computed from the updated motif (dynamic
       programming algorithm presented in Staden, 1994).

    As a last step the score matrix is stored again, this time as the
    `.values` of the scaled scores.

    Parameters
    ----------
    motif : Motif
        DNA motif to process; it is modified in place

    Returns
    -------
    Motif
        the same motif object received in input, after the update
    """

    # step 1: log-odds
    log_odds = compute_log_odds(
        motif.getMotif_matrix(),
        motif.getWidth(),
        motif.getBg(),
        motif.getAlphabet(),
    )
    motif.setMotif_scoreMatrix(log_odds)

    # step 2: scaling of the log-odds matrix just stored on the motif
    scaling = scale_pwm(
        motif.getMotif_scoreMatrix(),
        motif.getAlphabet(),
        motif.getWidth(),
    )
    scaled, lowest, highest, factor, shift = scaling
    motif.setMotif_scoreMatrix(scaled)
    motif.setIsScaled(True)
    motif.setScale(factor)
    motif.setMin_val(lowest)
    motif.setMax_val(highest)
    motif.setOffset(shift)

    # step 3: P-value matrix
    motif.setMotif_pval_matrix(comp_pval_mat(motif))

    # keep only the underlying values of the scaled scores on the motif
    motif.setMotif_scoreMatrix(scaled.values)

    return motif
Esempio n. 8
0
def read_MEME_motif(motif_file: str, bg_file: str, pseudocount: float,
                    no_reverse: bool, verbose: bool) -> List[Motif]:
    """Read the motif PWMs stored in a MEME-format file.

    Since a MEME file can contain one or more motifs, a Motif object is
    built for each probability matrix found. The motifs are returned in
    the same order they appear in the file.

    The parser reacts to three kinds of lines:

    * lines starting with ``ALPHABET``: the alphabet symbols, which must
      match DNA_ALPHABET
    * lines starting with ``MOTIF``: the motif identifier and, when
      given, the motif alternate name
    * the first non-empty line following a ``MOTIF`` line: the matrix
      header carrying the alphabet length, the motif width and the
      number of sites; the next `width` lines are read as the
      A, C, G, T values of each motif position

    After parsing, each probability matrix is normalized with
    norm_motif() and the pseudocount is applied with
    apply_pseudocount_meme().

    Parameters
    ----------
    motif_file : str
        path to the motif PWM
    bg_file : str
        path to the background probability distribution, or 'UNIF' to
        use a uniform background
    pseudocount : float
        pseudocount to add to the PWM values
    no_reverse : bool
        if True only the forward strand is considered, otherwise both
        forward and reverse strands are considered (the flag is handed
        to pseudo_bg() when the background is prepared)
    verbose : bool
        print additional information

    Returns
    -------
    List[Motif]
        List of Motif objects storing the data contained in motif_file

    Raises
    ------
    FileReadingException
        if the motif file cannot be opened or its content cannot be
        parsed (the original error is chained as the cause)
    NotValidBGException
        if bg_file is not 'UNIF' and does not exist
    """

    # per-motif data collected while parsing
    motifID_lst: List[str] = list()
    motifName_lst: List[str] = list()
    motif_width_lst: List[int] = list()
    site_counts_lst: List[int] = list()
    motif_probs_lst: List[pd.DataFrame] = list()
    # per-motif lists of the A, C, G, T values (one value per position)
    a_lst: List[List[np.double]] = list()
    c_lst: List[List[np.double]] = list()
    g_lst: List[List[np.double]] = list()
    t_lst: List[List[np.double]] = list()

    # parser state
    infostart: bool = False  # the next non-empty line is the matrix header
    datastart: bool = False  # the matrix rows are being read
    motif_width = None  # width of the motif being read (None if no motif)
    pos_read: int = 0  # matrix rows read so far for the current motif

    try:
        # the context manager closes the file on success and on failure
        with open(motif_file, 'r') as in_mtf:
            for line in in_mtf:
                if line[0:8] == 'ALPHABET':
                    # strip() rather than dropping the last character, so
                    # a line with no trailing newline keeps its last letter
                    alphabet: List[str] = sorted(set(line[10:].strip()))
                    if not isListEqual(alphabet, DNA_ALPHABET):
                        raise ValueError("the motif alphabet is not DNA")

                if line[0:5] == 'MOTIF':

                    if verbose:
                        start_rm: float = time.time()

                    # read motif ID and full name
                    motif_header: List[str] = line.split()

                    # there are two ways to define the motif name line
                    # in MEME file
                    # (refer to http://meme-suite.org/doc/meme-format.html?man_type=web):
                    #   1 - MOTIF motif_alternate_name
                    #   2 - MOTIF motif_identifier motif_alternate_name
                    if len(motif_header) < 2:
                        raise ValueError("MOTIF line without a motif name")

                    motifID: str
                    motifName: str

                    if len(motif_header) == 2:  # support case (1)
                        motifID = motifName = motif_header[1]
                    else:  # support case (2)
                        motifID, motifName = motif_header[1:3]
                    # end if

                    motifID_lst.append(motifID)
                    motifName_lst.append(motifName)

                    # the informations about motif start here
                    infostart = True
                    continue
                # end if

                if infostart and len(line.strip()) != 0:
                    # the fields are read from column 26 onwards; this
                    # assumes the line begins with the 26-character
                    # "letter-probability matrix:" label
                    infosplit: List[str] = line[26:].split()

                    alphalen: int = int(infosplit[1])
                    if alphalen != len(alphabet):
                        raise ValueError(
                            "alphabet length does not match the alphabet")

                    motif_width = int(infosplit[3])
                    site_counts: int = int(infosplit[5])
                    infostart = False  # informations end here

                    # allocate space for the motif probability matrix
                    motif_probs: pd.DataFrame = pd.DataFrame(
                        index=alphabet,
                        columns=range(motif_width),
                        data=np.double(0))

                    motif_width_lst.append(motif_width)
                    site_counts_lst.append(site_counts)
                    motif_probs_lst.append(motif_probs)

                    datastart = True  # at next step begin data

                    # initialize nucleotide data
                    a = list()
                    c = list()
                    g = list()
                    t = list()
                    continue
                # end if

                if datastart and pos_read < motif_width:
                    freqs = line.split()
                    a.append(np.double(freqs[0]))
                    c.append(np.double(freqs[1]))
                    g.append(np.double(freqs[2]))
                    t.append(np.double(freqs[3]))
                    pos_read += 1
                # end if

                # we read all current motif data
                # (never true while motif_width is None)
                if pos_read == motif_width:
                    a_lst.append(a)
                    c_lst.append(c)
                    g_lst.append(g)
                    t_lst.append(t)

                    # reset the parser state for the next motif
                    pos_read = 0
                    motif_width = None
                    datastart = False

                    if verbose:
                        end_rm: float = time.time()
                        msg: str = ''.join([
                            "Read motif ", motifID, " in ",
                            str(end_rm - start_rm), "s"
                        ])
                        print(msg)
                    # end if
                # end if

    except Exception as exc:  # something went wrong
        # Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate untouched
        errmsg: str = ' '.join(["Unable to read file", motif_file])
        raise FileReadingException(errmsg) from exc

    bgs: dict
    # read the background
    if bg_file == 'UNIF':
        bgs = get_uniformBG(alphabet)
    elif os.path.exists(bg_file):
        bgs = readBGfile(bg_file)
    else:
        errmsg = "\n\nERROR: unable to find the given background file"
        raise NotValidBGException(errmsg)
    # end if

    bgs = pseudo_bg(bgs, no_reverse)

    motif_lst: List[Motif] = list()

    # the nucleotide lists hold only the completely read motifs, so zip()
    # stops after the last complete one
    for motif_id, motif_name, width, sites, probs, a, c, g, t in zip(
            motifID_lst, motifName_lst, motif_width_lst, site_counts_lst,
            motif_probs_lst, a_lst, c_lst, g_lst, t_lst):

        probs.loc['A'] = a
        probs.loc['C'] = c
        probs.loc['G'] = g
        probs.loc['T'] = t

        probs = norm_motif(probs, width, alphabet)
        probs = apply_pseudocount_meme(probs, pseudocount, sites, width,
                                       bgs, alphabet)

        motif: Motif = Motif(probs, width, alphabet, motif_id, motif_name)
        motif.setBg(bgs)

        motif_lst.append(motif)
    # end for

    return motif_lst
Esempio n. 9
0
def _check_graph_files(xg: str, gbwt: str) -> None:
    """Raise FileNotFoundError unless the XG and its GBWT index exist."""

    if not os.path.exists(xg):
        errmsg = ''.join(["\n\nERROR: unable to use ", xg, ". Exiting"])
        raise FileNotFoundError(errmsg)

    if not os.path.isfile(gbwt):
        errmsg = "ERROR: unable to find GBWT file for"
        errmsg = ' '.join([errmsg, xg])
        raise FileNotFoundError(errmsg)


def _vg_find_query(chrom: str, start: int, stop: int, xg: str, gbwt: str,
                   motif_width: int) -> str:
    """Build the `vg find` command line extracting the k-mers of a region."""

    # the region is addressed with the text following the 'chr' prefix
    # (presumably the graph paths are named without it -- TODO confirm)
    region_index: str = ''.join(
        [chrom.split('chr')[-1], ':', str(start), '-', str(stop)])
    region_name: str = ''.join([chrom, '_', str(start), '-', str(stop)])
    # the output file is relative to the directory the command runs in
    seqs: str = os.path.join('.', ''.join([region_name, '.tsv']))

    # NOTE(review): the paths are interpolated unquoted and the command
    # uses a '>' redirection, so it presumably runs through a shell in
    # get_kmers(); paths containing spaces or shell metacharacters would
    # break it -- confirm before accepting untrusted paths
    return 'vg find -p {0} -x {1} -H {2} -K {3} -E > {4}'.format(
        region_index, xg, gbwt, motif_width, seqs)


def scan_graph(motif: Motif, args_obj: Findmotif) -> str:
    """Obtain all the sequences of length K from the given genome
    variation graph, where K is the motif length.

    The sequences are obtained from the regions defined in the input
    BED file (UCSC BED file format). For every region a `vg find`
    query is built and the whole set of queries is handed to
    get_kmers(), which runs inside a newly created temporary directory
    where the extracted sequences are written (one TSV file per region).

    The graph can be given either as a single XG file or as a directory
    holding one `<chrom>.xg` per chromosome; in both cases a GBWT index
    with the same prefix as the XG is required. When both are available
    in args_obj, the single XG file is used.

    According to the original notes, the extracted sequences correspond
    to the recombinants obtainable from the variants used to build the
    graph and are later filtered by haplotype unless the --recomb
    option is used; that filtering is not performed in this function.

    Parameters
    ----------
    motif : Motif
        DNA motif PWM to search on the VG
    args_obj : Findmotif
        container of the arguments used during genome variation graph
        scanning

    Returns
    -------
    str
        location of the files with the sequences extracted

    Raises
    ------
    ValueError
        if motif or args_obj have an unexpected type
    VGException
        if no graph is given or the given graph is not in XG format
    FileNotFoundError
        if an XG file or its GBWT index cannot be found
    """

    errmsg: str
    vg: str
    chroms: List[str]

    # check the input arguments
    if not isinstance(motif, Motif):
        errmsg = "\n\nERROR: unknown motif object type"
        raise ValueError(errmsg)

    if not isinstance(args_obj, Findmotif):
        errmsg = "Unknown arguments object type. "
        errmsg += "Cannot scan the genome variation graph. Exiting"
        raise ValueError(errmsg)

    # decide once how the graph is given, so that the choice of `vg` and
    # the way the queries are built can never disagree
    single_graph: bool = args_obj.has_graph_genome()

    if single_graph:
        vg = args_obj.get_graph_genome()

        if not isGraph_genome_xg(vg):
            errmsg = "\n\nERROR: the genome variation graph is not in XG format"
            raise VGException(errmsg)
        # end if

    elif args_obj.has_graph_genome_dir():
        vg = args_obj.get_graph_genome_dir()

    else:
        raise VGException("\n\nERROR: the genome variation graph is missing")
    # end if

    bedfile: str = args_obj.get_bedfile()
    motif_width: int = motif.getWidth()
    cores: int = args_obj.get_cores()

    global verbose
    verbose = args_obj.get_verbose()

    print("\nExtracting regions defined in", bedfile, "\n")

    # read the regions where search the motif occurrences from the given
    # BED file
    regions: Dict
    region_num: int
    regions, region_num = getBEDregions(bedfile)

    if (args_obj.get_chroms_num() == 1
            and args_obj.get_chroms()[0] == 'ALL_CHROMS'):
        chroms = list(regions.keys())
    else:
        chroms = [''.join(['chr', c]) for c in args_obj.get_chroms()]

    if verbose:
        print("\nFound", region_num, "regions in", bedfile)

    # the queries run inside a temporary directory: anchor a relative
    # graph location to the current directory so it stays reachable
    cwd: str = os.getcwd()
    if not os.path.isabs(vg):
        vg = os.path.join(cwd, vg)

    single_gbwt: str = ''
    if single_graph:
        # the GBWT must have the same prefix as XG; only the trailing
        # '.xg' is dropped, so '.xg' inside a directory name is harmless
        single_gbwt = ''.join([vg.rsplit('.xg', 1)[0], '.gbwt'])
    elif not vg.endswith("/"):
        # vg -> directory containing a set of VGs
        vg = ''.join([vg, "/"])
    # end if

    # build the queries before any temporary directory or process pool
    # is created, so a missing graph file leaves nothing behind
    queries: List[str] = list()
    positions: List[Tuple[int, int]]

    for chrom in chroms:
        positions = regions[chrom]
        if not positions:
            continue

        if single_graph:
            xg: str = vg
            gbwt: str = single_gbwt
        else:
            xg = ''.join([vg, chrom, '.xg'])
            # the GBWT must have the same prefix as XG
            gbwt = ''.join([vg, chrom, '.gbwt'])
        # end if

        _check_graph_files(xg, gbwt)

        queries.extend(
            _vg_find_query(chrom, pos[0], pos[1], xg, gbwt, motif_width)
            for pos in positions)
    # end for

    # create a tmp working directory and enter it: the extracted
    # sequences are stored there
    tmpwd: str = tempfile.mkdtemp(prefix='grafimo_')
    os.chdir(tmpwd)

    try:
        # SIGINT is ignored while the pool is created and the previous
        # handler is restored right after (even if the creation fails)
        # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
        original_sigint_handler = signal.signal(signal.SIGINT,
                                                signal.SIG_IGN)
        try:
            pool: mp.Pool = mp.Pool(processes=cores)  # use no. cores processes
        finally:
            signal.signal(signal.SIGINT, original_sigint_handler)

        # extract from the graph the binding site candidates to score
        get_kmers(queries, pool, verbose)

        # the extracted sequences are stored in the tmp working directory
        sequence_loc: str = os.getcwd()

    finally:
        # always go back to the directory the caller was in
        os.chdir(cwd)

    return sequence_loc