コード例 #1
0
    def __init__(self, count_matrix, width, alphabet, motif_id, motif_name):
        """Initialize the motif after validating every argument.

        Parameters
        ----------
        count_matrix : pandas.DataFrame
            motif count matrix; it must not be empty
        width : int
            motif width; it must be a non-negative integer
        alphabet : list
            motif alphabet; it must be equal to DNA_ALPHABET according
            to isListEqual
        motif_id : str
            motif ID; it must be a non-empty string
        motif_name : str
            motif name; it must be a non-empty string

        Raises
        ------
        NoDataFrameException
            if count_matrix is not a pandas.DataFrame
        NotValidMotifMatrixException
            if count_matrix is empty
        WrongMotifWidthException
            if width is not an int or is negative
        WrongMotifIDException
            if motif_id is not a non-empty string
        WrongMotifNameException
            if motif_name is not a non-empty string
        NotValidAlphabetException
            if alphabet is not a list or does not match DNA_ALPHABET
        """

        # the type check comes first: reading `.empty` on an arbitrary
        # object would raise AttributeError instead of the intended
        # NoDataFrameException
        if not isinstance(count_matrix, pd.DataFrame):
            raise NoDataFrameException(
                "\n\nERROR: the given value is not a pandas.DataFrame instance")

        if count_matrix.empty:
            errmsg = "\n\nERROR: attempt to initialize the motif object with an empty count matrix"
            raise NotValidMotifMatrixException(errmsg)

        if not isinstance(width, int) or width < 0:
            errmsg = "\n\nERROR: attempt to initialize motif without a valid width"
            raise WrongMotifWidthException(errmsg)

        if not isinstance(motif_id, str) or not motif_id:
            raise WrongMotifIDException(
                "\n\nERROR: cannot initialize the motif with the given ID")

        if not isinstance(motif_name, str) or not motif_name:
            raise WrongMotifNameException(
                "\n\nERROR: cannot initialize the motif with the given name")

        if not isinstance(alphabet, list) or not isListEqual(
                alphabet, DNA_ALPHABET):
            errmsg = "\n\nERROR: cannot initialize a motif object with a wrong alphabet"
            raise NotValidAlphabetException(errmsg)

        self._count_matrix = count_matrix
        self._width = width
        self._motif_id = motif_id
        self._motif_name = motif_name
        self._alphabet = alphabet
コード例 #2
0
    def setMotif_matrix(self, motif_matrix):
        """Replace the matrix stored in the motif with `motif_matrix`.

        Parameters
        ----------
        motif_matrix : pandas.DataFrame
            new motif matrix; it must not be empty

        Raises
        ------
        NoDataFrameException
            if motif_matrix is not a pandas.DataFrame
        NotValidMotifMatrixException
            if motif_matrix is empty
        """

        # the type check comes first: reading `.empty` on an arbitrary
        # object would raise AttributeError instead of the intended
        # NoDataFrameException
        if not isinstance(motif_matrix, pd.DataFrame):
            raise NoDataFrameException(
                "\n\nERROR: the given value is not a pandas.DataFrame instance"
            )

        if motif_matrix.empty:
            errmsg = "\n\nERROR: attempt to use an empty motif matrix"
            raise NotValidMotifMatrixException(errmsg)

        self._count_matrix = motif_matrix
コード例 #3
0
ファイル: res_writer.py プロジェクト: ManuelTgn/GRAFIMO
def print_results(results):
    """Print GRAFIMO results on terminal without storing them on
    the three files (TSV, HTML, GFF3)

    Parameters
    ----------
    results : pandas.DataFrame
        GRAFIMO results

    Raises
    ------
    NoDataFrameException
        if results is not a pandas.DataFrame
    """

    if not isinstance(results, pd.DataFrame):
        errmsg: str = "\n\nERROR: the results must be stored in a pandas DataFrame"
        raise NoDataFrameException(errmsg)

    # pandas truncates long frames when printing them; the row limit is
    # raised to the frame length only for the duration of the print.
    # option_context restores whatever value was set before the call
    # (not the pandas default), also when printing raises.
    with pd.option_context("display.max_rows", len(results)):
        print()  # newline
        print(results)
コード例 #4
0
ファイル: utils.py プロジェクト: kant/GRAFIMO
def list_data(data, qvalue):
    """
        Convert a pandas DataFrame in a list of lists, where
        each column is a list of values
        ----
        Parameters:
            data (pd.DataFrame) : input pandas DataFrame; it must have
                                  10 or 11 columns
            qvalue (bool) : if True the 'q-value' column is appended
                            as last element of the result
        ----
        Returns:
            summary (list) : the DataFrame columns as lists, ordered as
                             motif_id, motif_alt_id, sequence_name,
                             start, stop, strand, score, p-value,
                             matched_sequence, reference and, when
                             requested, q-value
        ----
        Raises:
            NoDataFrameException : if data is not a pandas DataFrame
            AssertionError : if data does not have 10 or 11 columns
    """

    if not isinstance(data, pd.DataFrame):
        raise NoDataFrameException(
            "DataFrame given is not an instance of pandas.DataFrame")

    assert len(data.columns) <= 11
    assert len(data.columns) >= 10

    # order of the columns in the returned summary
    columns = [
        'motif_id', 'motif_alt_id', 'sequence_name', 'start', 'stop',
        'strand', 'score', 'p-value', 'matched_sequence', 'reference'
    ]
    if qvalue:
        columns.append('q-value')

    summary = [data[name].to_list() for name in columns]

    # sanity check: every extracted column has one value per row
    summary_len = len(data.index)
    assert all(len(values) == summary_len for values in summary)

    return summary
コード例 #5
0
ファイル: res_writer.py プロジェクト: ManuelTgn/GRAFIMO
def write_results(results: pd.DataFrame, motif_id: str, motif_num: int,
                  args_obj: Findmotif) -> None:
    """Write GRAFIMO analysis results in three files (TSV report, HTML
    report, GFF3 file).

    The first two reports contain in a tabular format the motif
    occurrence candidates retrieved by GRAFIMO, with scores, P-values,
    q-values, etc. for each table entry.

    The third file contains a suitable input for custom track on the
    UCSC genome browser.

    When the number of top graphs stored in `args_obj` is greater than
    zero, the graphs of the top regions are also computed (through
    getRegion_graph) inside a dedicated sub-directory.

    The function works inside the output directory; the working
    directory found at call time is always restored before returning,
    also when an error occurs.

    Parameters
    ----------
    results : pandas.DataFrame
        results of GRAFIMO analysis
    motif_id : str
        motif ID
    motif_num : int
        number of searched motifs
    args_obj : Findmotif
        container for arguments needed to store the results

    Raises
    ------
    ValueError
        if args_obj is not a Findmotif instance
    NoDataFrameException
        if results is not a pandas.DataFrame
    VGException
        if args_obj carries neither a graph genome nor a graph genome
        directory
    SubprocessError
        if an output directory cannot be created
    """

    errmsg: str
    if not isinstance(args_obj, Findmotif):
        errmsg = "\n\nERROR: incorrect data-type. Exiting"
        raise ValueError(errmsg)

    if not isinstance(results, pd.DataFrame):
        errmsg = "\n\nERROR: results must be stored in a pandas DataFrame"
        raise NoDataFrameException(errmsg)

    # get results storing arguments
    outdir: str = args_obj.get_outdir()
    no_qvalue: bool = args_obj.get_no_qvalue()
    top_graphs: int = args_obj.get_top_graphs()
    verbose: bool = args_obj.get_verbose()

    vg: str
    if args_obj.has_graph_genome():
        vg = args_obj.get_graph_genome()
    elif args_obj.has_graph_genome_dir():
        vg = args_obj.get_graph_genome_dir()
    else:
        errmsg = "\n\nERROR: no VG given"
        raise VGException(errmsg)

    dirname_default: bool = False

    cwd: str = os.getcwd()

    if outdir == DEFAULT_OUTDIR:
        # to make unique the output directory we add the PID
        # to the name.
        #
        # This is useful when calling grafimo in different runs on the
        # same machine.

        # append the PID and the motif ID
        outdir = '_'.join(["grafimo_out", str(os.getpid()), motif_id])
        dirname_default = True
    # end if

    # NB if the directory already exists its content will be
    # automatically overwritten
    if not os.path.isdir(outdir):
        _make_directory(outdir)
    os.chdir(outdir)

    # from here on the working directory is changed: the finally clause
    # guarantees that the caller gets its original directory back
    try:
        print("\nWriting results in %s\n" % outdir)

        prefix: str
        # get the filename prefix
        if not dirname_default and motif_num > 1:
            # each file is labeled with the motif ID
            prefix = '_'.join(['grafimo_out', motif_id])
        else:
            prefix = 'grafimo_out'

        if verbose:
            start_tsv: float = time.time()

        # write the TSV
        results.to_csv(''.join([prefix, '.tsv']), sep='\t', encoding='utf-8')

        if verbose:
            end_tsv: float = time.time()
            print("%s.tsv written in %.2fs" % (prefix, (end_tsv - start_tsv)))
            start_html: float = time.time()

        # write the HTML
        results.to_html(''.join([prefix, '.html']))

        if verbose:
            end_html: float = time.time()
            print("%s.html written in %.2fs" %
                  (prefix, (end_html - start_html)))
            start_gff: float = time.time()

        # write the GFF3
        writeGFF3(prefix, results, no_qvalue)

        if verbose:
            end_gff: float = time.time()
            print("%s.gff written in %.2fs" % (prefix, (end_gff - start_gff)))

        # get the graphs of the top n regions
        regions: List[str]
        if top_graphs > 0:
            regions = results['sequence_name'].to_list()

            # the results are empty
            if not regions:
                print(
                    "WARNING: no region was available. Are your results empty?\n"
                )
                return

            # get the n different top regions' graph
            regions = unique_lst(regions, size=top_graphs)

            if verbose:
                print("Extracting %d region variation graphs" % top_graphs)

            if len(regions) < top_graphs:
                top_graphs = len(regions)
                print(
                    "WARNING: possible to visualize only the top %d regions\n"
                    % (top_graphs))

            # create the directory for the regions pictures
            image_dir: str
            if motif_num > 1:
                image_dir = '_'.join(["top_graphs", motif_id])
            else:
                image_dir = "top_graphs"

            if verbose:
                print("Graphs will be stored in %s" % image_dir)

            _make_directory(image_dir)
            os.chdir(image_dir)

            print("Writing the top %d graphs in %s\n" %
                  (top_graphs, image_dir))

            for region in regions[:top_graphs]:
                # the VG accepts only queries like 1:100-200 and not like
                # chr1:100-200
                region = _strip_chr_prefix(region)

                if verbose:
                    print("Computing the PNG image of %s" % region)

                getRegion_graph(region, vg)
    finally:
        os.chdir(cwd)


def _make_directory(path: str) -> None:
    """Create `path` with its missing parents; an existing directory is fine."""
    try:
        # no shell involved: names with spaces or shell metacharacters
        # are handled verbatim
        os.makedirs(path, exist_ok=True)
    except OSError as err:
        errmsg: str = ''.join([
            "\n\nERROR: An error occurred while creating the directory ",
            path, "\n"
        ])
        raise SubprocessError(errmsg) from err


def _strip_chr_prefix(region: str) -> str:
    """Drop a leading 'chr' from `region`; other names are returned as they are."""
    if region.startswith("chr"):
        return region[len("chr"):]
    return region
コード例 #6
0
ファイル: res_writer.py プロジェクト: ManuelTgn/GRAFIMO
def writeGFF3(prefix: str, data: pd.DataFrame, no_qvalue: bool) -> None:
    """Write a GFF3 file
    (https://www.ensembl.org/info/website/upload/gff3.html) containing
    all the motif occurrence candidates retrieved by GRAFIMO.

    The resulting file can be loaded on the UCSC genome browser to view
    the occurrences.

    The file is named `<prefix>.gff` and starts with the
    "##gff-version 3" header, followed by one line per row of `data`.

    Parameters
    ----------
    prefix : str
        filename prefix
    data : pandas.DataFrame
        results of GRAFIMO analysis
    no_qvalue : bool
        if set to True the q-value attribute is left out of the GFF3
        entries; otherwise each entry also reports its q-value

    Raises
    ------
    NoDataFrameException
        if data is not a pandas.DataFrame
    FileReadingException
        if the report cannot be assembled or written; the original
        error is chained as the cause
    """

    errmsg: str
    if not isinstance(data, pd.DataFrame):
        errmsg = "\n\nERROR: the object is not an instance of pandas.DataFrame"
        raise NoDataFrameException(errmsg)

    qvalue: bool = not no_qvalue

    try:
        # list of columns produced by list_data; the positions read
        # below are 0-8 (motif ID, motif name, sequence name, start,
        # stop, strand, score, P-value, matched sequence), 10
        # (reference) and, only with q-values, 11
        data_list: List[list] = list_data(data, qvalue)

        if qvalue and len(data_list) < 12:
            errmsg = "\n\nERROR: wrong data size. Unable to write the GFF3 report\n"
            raise ValueError(errmsg)

        gfflines: List[str] = [
            _gff3_line(data_list, i, qvalue)
            for i in range(len(data_list[0]))
        ]

        # the context manager closes the file also on failure and never
        # touches an unbound handle when open() itself fails
        with open(''.join([prefix, '.gff']), mode='w') as f:
            f.write("##gff-version 3\n")
            f.writelines(gfflines)
    except Exception as err:
        # callers are used to receive FileReadingException for every
        # failure of this step: the type is kept, the real cause is
        # attached instead of being discarded
        errmsg = ''.join(
            ["\n\nERROR: unable to open or write data on ", prefix, ".gff"])
        raise FileReadingException(errmsg) from err


def _gff3_line(data_list: List[list], idx: int, with_qvalue: bool) -> str:
    """Build the GFF3 line (newline included) for the entry at position `idx`."""
    seqname: str = data_list[2][idx]
    chrom: str = seqname.split(':')[0]  # takes only the chromosome name
    score: float = round(data_list[6][idx], 1)
    strand: str = data_list[5][idx]

    if strand == '-':
        # keep forward strand coordinates
        start, end = data_list[4][idx], data_list[3][idx]
    else:
        start, end = data_list[3][idx], data_list[4][idx]

    motif_id: str = data_list[0][idx]
    motif_name: str = data_list[1][idx]
    pvalue: str = np.format_float_scientific(data_list[7][idx], exp_digits=2)
    sequence: str = data_list[8][idx]
    reference: str = data_list[10][idx]

    attributes: List[str] = [
        ''.join(['Name=', motif_id, '_', seqname, strand, ':', reference]),
        ''.join(["Alias=", motif_name]),
        ''.join(["ID=", motif_id, '-', motif_name, '-', seqname]),
        ''.join(['pvalue=', str(pvalue)]),
    ]
    if with_qvalue:
        entry_qvalue: str = np.format_float_scientific(data_list[11][idx],
                                                       exp_digits=2)
        attributes.append(''.join(['qvalue=', str(entry_qvalue)]))
    attributes.append(''.join(['sequence=', sequence, ';\n']))

    # start and end come from numeric columns: str.join accepts only
    # strings, so they are converted like the score
    return '\t'.join([
        chrom, SOURCE, TP,
        str(start),
        str(end),
        str(score), strand, PHASE, ';'.join(attributes)
    ])
コード例 #7
0
def scale_pwm(motif_matrix: pd.DataFrame, alphabet: List[str],
              motif_width: int) -> Tuple[np.ndarray, int, int, int, np.double]:
    """Rescale the log-odds values of a motif scoring matrix to integers.

    Every value is shifted by an offset (the floor of the smallest
    matrix value), multiplied by a scaling factor derived from the
    module-level RANGE constant, rounded and converted to int. The
    original comments state that the resulting values lie in
    [0, 1000].

    Only the cells addressed by the symbols in `alphabet` (rows) and
    the positions 0 .. motif_width - 1 (columns) are scaled; the scaled
    matrix is built with the same index and columns as the input and
    starts filled with zeros.

    Parameters
    ----------
    motif_matrix : pd.DataFrame
        motif log-odds matrix
    alphabet: list
        DNA motif alphabet
    motif_width: int
        motif width; it must be greater than zero

    Returns
    -------
    pandas.DataFrame
        scaled motif scoring matrix (NOTE: the signature annotates it
        as numpy.ndarray, but a DataFrame is what is built here)
    int
        minimum value of the scaled scoring matrix
    int
        maximum value of the scaled scoring matrix
    int
        scaling factor
    numpy.double
        scaling offset

    Raises
    ------
    NoDataFrameException
        if motif_matrix is not a pandas.DataFrame
    NotValidMotifMatrixException
        if motif_matrix is empty
    NotValidAlphabetException
        if alphabet is not a list or is not a valid DNA alphabet
    AssertionError
        if motif_width is not positive
    """

    if not isinstance(motif_matrix, pd.DataFrame):
        raise NoDataFrameException(
            "\n\nERROR: The given motif matrix must be an instance of pandas.DataFrame"
        )

    if motif_matrix.empty:
        raise NotValidMotifMatrixException(
            "\n\nERROR: The given motif matrix is empty")

    if not isinstance(alphabet, list):
        raise NotValidAlphabetException(
            "\n\nERROR: The alphabet given is not in a list")

    if not isListEqual(alphabet, DNA_ALPHABET):
        raise NotValidAlphabetException(
            "\n\nERROR: The alphabet given is not a valid DNA alphabet")

    assert motif_width > 0

    smallest = min(motif_matrix.min())
    largest = max(motif_matrix.max())

    # same shape and labels as the input, zero-filled
    scaled: pd.DataFrame = pd.DataFrame(data=0,
                                        index=list(motif_matrix.index),
                                        columns=list(motif_matrix.columns))

    upper = largest
    # a constant matrix would give a zero-width interval: widen it by 1
    lower = np.double(upper - 1) if smallest == upper else smallest
    lower = np.floor(lower)

    offset: np.double = np.round(np.floor(lower))
    scale_factor = np.floor(RANGE / (upper - lower))

    # shift and stretch one cell at a time
    for nuc in alphabet:
        for pos in range(motif_width):
            shifted = motif_matrix.loc[nuc, pos] - (offset)
            scaled.loc[nuc, pos] = np.round(shifted * scale_factor)

    # make sure the values are integers
    scaled[:] = scaled[:].astype(int)

    # extremes of the scaled matrix
    scaled_min = min(scaled.min())
    scaled_max = max(scaled.max())

    return scaled, scaled_min, scaled_max, int(scale_factor), offset
コード例 #8
0
ファイル: utils.py プロジェクト: ManuelTgn/GRAFIMO
def list_data(data: pd.DataFrame, qvalue: bool) -> List:
    """Turn the columns of a results DataFrame into a list of lists.

    The returned list holds, in this order, the values of the columns
    motif_id, motif_alt_id, sequence_name, start, stop, strand, score,
    p-value, matched_sequence, haplotype_frequency and reference; the
    q-value column is appended as last element when requested.

    Parameters
    ----------
    data : pandas.DataFrame
        input DataFrame; it must have 11 or 12 columns
    qvalue : bool
        if True the column of q-values has to be considered

    Returns
    -------
    list
        list containing DataFrame's columns as list of values

    Raises
    ------
    NoDataFrameException
        if data is not a pandas.DataFrame
    AssertionError
        if data does not have 11 or 12 columns
    """

    if not isinstance(data, pd.DataFrame):
        raise NoDataFrameException("\n\nERROR: not allowed data type given")

    n_columns: int = len(data.columns)
    assert n_columns <= 12
    assert n_columns >= 11

    # the columns are extracted in this order...
    read_order = [
        'sequence_name', 'start', 'stop', 'score', 'strand', 'motif_id',
        'motif_alt_id', 'p-value', 'matched_sequence',
        'haplotype_frequency', 'reference'
    ]
    # ...and returned in this one
    output_order = [
        'motif_id', 'motif_alt_id', 'sequence_name', 'start', 'stop',
        'strand', 'score', 'p-value', 'matched_sequence',
        'haplotype_frequency', 'reference'
    ]
    if qvalue:
        read_order.append('q-value')
        output_order.append('q-value')

    values_by_column = {}
    for column in read_order:
        values_by_column[column] = data[column].to_list()

    summary = [values_by_column[column] for column in output_order]

    # sanity checks: one value per DataFrame row in every list
    n_rows: int = len(values_by_column['motif_id'])
    assert n_rows == len(data.index)
    for column in output_order[1:]:
        assert n_rows == len(values_by_column[column])

    return summary
コード例 #9
0
def scale_pwm(motif_matrix, alphabet, motif_width):
    """
        Scale the motif matrix values to integers: each value is
        shifted by an offset (the floor of the smallest matrix value),
        multiplied by a scaling factor derived from RANGE, rounded and
        converted to int.
        Only the rows listed in alphabet and the columns
        0 .. motif_width - 1 are scaled; the other cells of the
        result are left at zero.
        ----
        Parameters:
            motif_matrix (pd.DataFrame) : motif matrix to scale
            alphabet (list) : motif alphabet
            motif_width (int) : motif width, greater than zero
        ----
        Returns:
            motif_matrix_sc (pd.DataFrame) : scaled motif matrix
            min_val (int) : lowest value in the scaled motif matrix
            max_val (int) : highest value in the scaled motif matrix
            scale_factor (int)
            offset (np.double)
        ----
        Raises:
            NoDataFrameException : if motif_matrix is not a pandas
                                   DataFrame
            NotValidMotifMatrixException : if motif_matrix is empty
            NotValidAlphabetException : if alphabet is not a list or
                                        is not a valid DNA alphabet
            AssertionError : if motif_width is not positive
    """

    if not isinstance(motif_matrix, pd.DataFrame):
        raise NoDataFrameException(
            "The given motif matrix must be an instance of pandas.DataFrame")

    if motif_matrix.empty:
        raise NotValidMotifMatrixException("The given motif matrix is empty")

    if not isinstance(alphabet, list):
        raise NotValidAlphabetException("The alphabet given is not in a list")

    if not isListEqual(alphabet, DNA_ALPHABET):
        raise NotValidAlphabetException(
            "The alphabet given is not a valid DNA alphabet")

    assert motif_width > 0

    min_val = min(motif_matrix.min())
    max_val = max(motif_matrix.max())
    motif_matrix_sc = pd.DataFrame(index=list(motif_matrix.index),
                                   columns=list(motif_matrix.columns),
                                   data=0)

    lower = min_val
    upper = max_val

    if lower == upper:  # all values are equal
        # widen the interval by one to avoid a zero-width range
        lower = np.double(upper - 1)

    lower = np.floor(lower)
    offset = np.round(np.floor(lower))
    scale_factor = np.floor(RANGE / (upper - lower))

    # values will be in [0, 1000]
    for nuc in alphabet:
        for j in range(motif_width):
            scaled_score = np.round(
                (motif_matrix.loc[nuc, j] - (offset)) * scale_factor)
            motif_matrix_sc.loc[nuc, j] = scaled_score
        # end for
    # end for

    # make sure the values are integers
    motif_matrix_sc[:] = motif_matrix_sc[:].astype(int)

    # now they are scaled
    min_val = min(motif_matrix_sc.min())
    max_val = max(motif_matrix_sc.max())

    return motif_matrix_sc, min_val, max_val, int(scale_factor), offset