def __init__(self, count_matrix, width, alphabet, motif_id, motif_name):
    """Initialize the motif after validating every argument.

    Parameters
    ----------
    count_matrix : pandas.DataFrame
        motif count matrix; must be a non-empty DataFrame
    width : int
        motif width; must be a non-negative integer
    alphabet : list
        motif alphabet; must be a list equal to DNA_ALPHABET according
        to isListEqual()
    motif_id : str
        motif ID; must be a non-empty string
    motif_name : str
        motif name; must be a non-empty string

    Raises
    ------
    NoDataFrameException
        if count_matrix is not a pandas.DataFrame
    NotValidMotifMatrixException
        if count_matrix is empty
    WrongMotifWidthException
        if width is not an int or is negative
    WrongMotifIDException
        if motif_id is not a non-empty string
    WrongMotifNameException
        if motif_name is not a non-empty string
    NotValidAlphabetException
        if alphabet is not a list or does not match DNA_ALPHABET
    """
    # The type check has to come first: `.empty` only exists on pandas
    # objects, so probing it on anything else would raise AttributeError
    # instead of the documented NoDataFrameException.
    if not isinstance(count_matrix, pd.DataFrame):
        raise NoDataFrameException(
            "\n\nERROR: the given value is not a pandas.DataFrame instance")

    if count_matrix.empty:
        errmsg = "\n\nERROR: attempt to initialize the motif object with an empty count matrix"
        raise NotValidMotifMatrixException(errmsg)

    if not isinstance(width, int) or width < 0:
        errmsg = "\n\nERROR: attempt to initialize motif without a valid width"
        raise WrongMotifWidthException(errmsg)

    if not isinstance(motif_id, str) or not motif_id:
        raise WrongMotifIDException(
            "\n\nERROR: cannot initialize the motif with the given ID")

    if not isinstance(motif_name, str) or not motif_name:
        raise WrongMotifNameException(
            "\n\nERROR: cannot initialize the motif with the given name")

    if not isinstance(alphabet, list) or not isListEqual(
            alphabet, DNA_ALPHABET):
        errmsg = "\n\nERROR: cannot initialize a motif object with a wrong alphabet"
        raise NotValidAlphabetException(errmsg)

    self._count_matrix = count_matrix
    self._width = width
    self._motif_id = motif_id
    self._motif_name = motif_name
    self._alphabet = alphabet
def setMotif_matrix(self, motif_matrix):
    """Replace the matrix stored in the motif object.

    The new matrix is stored in the `_count_matrix` attribute.

    Parameters
    ----------
    motif_matrix : pandas.DataFrame
        new motif matrix; must be a non-empty DataFrame

    Raises
    ------
    NoDataFrameException
        if motif_matrix is not a pandas.DataFrame
    NotValidMotifMatrixException
        if motif_matrix is empty
    """
    # Check the type before touching `.empty`, otherwise a non-DataFrame
    # argument fails with AttributeError rather than NoDataFrameException.
    if not isinstance(motif_matrix, pd.DataFrame):
        raise NoDataFrameException(
            "\n\nERROR: the given value is not a pandas.DataFrame instance"
        )

    if motif_matrix.empty:
        errmsg = "\n\nERROR: attempt to use an empty motif matrix"
        raise NotValidMotifMatrixException(errmsg)

    self._count_matrix = motif_matrix
def print_results(results):
    """Print GRAFIMO results on terminal without storing them on the
    three files (TSV, HTML, GFF3)

    Parameters
    ----------
    results : pandas.DataFrame
        GRAFIMO results

    Raises
    ------
    NoDataFrameException
        if results is not a pandas.DataFrame
    """

    if not isinstance(results, pd.DataFrame):
        errmsg: str = "\n\nERROR: the results must be stored in a pandas DataFrame"
        raise NoDataFrameException(errmsg)

    # By default pandas truncates long frames when printing them; raise
    # the row limit to the size of the results so every line is shown.
    pd.set_option("display.max_rows", len(results))
    try:
        print()  # newline
        print(results)
    finally:
        # `display.max_rows` is process-wide state: put it back to its
        # default even if printing fails, so later output is unaffected.
        pd.reset_option("display.max_rows")
def list_data(data, qvalue):
    """Convert a pandas DataFrame in a list of lists, where each column
    is a list of values

    The columns are returned in this fixed order: motif_id,
    motif_alt_id, sequence_name, start, stop, strand, score, p-value,
    matched_sequence, reference and, only when `qvalue` is True,
    q-value.

    ----
    Parameters:
        data (pd.DataFrame) : input pandas DataFrame; it must have 10
                              or 11 columns
        qvalue (bool) : if True the 'q-value' column is appended to
                        the result
    ----
    Returns:
        summary (list) : pandas DataFrame converted in a list of lists
    ----
    Raises:
        NoDataFrameException : if data is not a pandas.DataFrame
        AssertionError : if the number of columns is not 10 or 11
        KeyError : if one of the required columns is missing
    """

    if not isinstance(data, pd.DataFrame):
        raise NoDataFrameException(
            "DataFrame given is not an instance of pandas.DataFrame")

    assert len(data.columns) <= 11
    assert len(data.columns) >= 10

    seqnames = data['sequence_name'].to_list()
    starts = data['start'].to_list()
    stops = data['stop'].to_list()
    scores = data['score'].to_list()
    strands = data['strand'].to_list()
    motifIDs = data['motif_id'].to_list()
    motifNames = data['motif_alt_id'].to_list()
    pvalues = data['p-value'].to_list()
    sequences = data['matched_sequence'].to_list()
    references = data['reference'].to_list()

    summary = [
        motifIDs, motifNames, seqnames, starts, stops, strands, scores,
        pvalues, sequences, references
    ]

    # the q-values are optional and always come last
    if qvalue:
        summary.append(data['q-value'].to_list())

    # sanity check: every extracted column has one value per row
    summary_len = len(motifIDs)
    assert summary_len == len(data.index)
    for column in summary:
        assert summary_len == len(column)

    return summary
def write_results(results: pd.DataFrame, motif_id: str, motif_num: int,
                  args_obj: Findmotif) -> None:
    """Write GRAFIMO analysis results in three files (TSV report, HTML
    report, GFF3 file).

    The first two reports contain, in a tabular format, the motif
    occurrence candidates retrieved by GRAFIMO, with scores, P-values,
    q-values, etc. for each table entry.

    The third file contains a suitable input for custom tracks on the
    UCSC genome browser.

    If requested (top_graphs > 0), the variation graphs of the top
    regions are also extracted into a sub-directory of the output
    directory.

    The function temporarily changes the current working directory to
    the output directory; the original working directory is restored
    before returning, also when an error occurs.

    Parameters
    ----------
    results : pandas.DataFrame
        results of GRAFIMO analysis
    motif_id : str
        motif ID
    motif_num : int
        number of searched motifs
    args_obj : Findmotif
        container for arguments needed to store the results

    Raises
    ------
    ValueError
        if args_obj is not a Findmotif instance
    NoDataFrameException
        if results is not a pandas.DataFrame
    VGException
        if args_obj carries neither a genome graph nor a directory of
        genome graphs
    SubprocessError
        if the output directory or the graphs directory cannot be
        created
    """

    errmsg: str
    if not isinstance(args_obj, Findmotif):
        errmsg = "\n\nERROR: incorrect data-type. Exiting"
        raise ValueError(errmsg)

    if not isinstance(results, pd.DataFrame):
        errmsg = "\n\nERROR: results must be stored in a pandas DataFrame"
        raise NoDataFrameException(errmsg)

    # get results storing arguments
    outdir: str = args_obj.get_outdir()
    no_qvalue: bool = args_obj.get_no_qvalue()
    top_graphs: int = args_obj.get_top_graphs()
    verbose: bool = args_obj.get_verbose()

    vg: str
    if args_obj.has_graph_genome():
        vg = args_obj.get_graph_genome()
    elif args_obj.has_graph_genome_dir():
        vg = args_obj.get_graph_genome_dir()
    else:
        errmsg = "\n\nERROR: no VG given"
        raise VGException(errmsg)

    dirname_default: bool = False
    cwd: str = os.getcwd()

    if outdir == DEFAULT_OUTDIR:
        # to make unique the output directory we add the PID
        # to the name.
        #
        # This is useful when calling grafimo in different runs on the
        # same machine.

        # append the PID and the motif ID
        outdir = '_'.join(["grafimo_out", str(os.getpid()), motif_id])
        dirname_default = True

    # NB if the directory already exists its content will be
    # automatically overwritten
    _make_results_dir(outdir)
    os.chdir(outdir)

    # everything below runs inside the output directory: the finally
    # clause guarantees we always go back to where we started
    try:
        print("\nWriting results in %s\n" % outdir)

        # get the filename prefix
        prefix: str
        if not dirname_default and motif_num > 1:
            # each file is labeled with the motif ID
            prefix = '_'.join(['grafimo_out', motif_id])
        else:
            prefix = 'grafimo_out'

        # write the TSV
        if verbose:
            start_tsv: float = time.time()
        results.to_csv(''.join([prefix, '.tsv']), sep='\t', encoding='utf-8')
        if verbose:
            end_tsv: float = time.time()
            print("%s.tsv written in %.2fs" % (prefix, (end_tsv - start_tsv)))

        # write the HTML
        if verbose:
            start_html: float = time.time()
        results.to_html(''.join([prefix, '.html']))
        if verbose:
            end_html: float = time.time()
            print("%s.html written in %.2fs" %
                  (prefix, (end_html - start_html)))

        # write the GFF3
        if verbose:
            start_gff: float = time.time()
        writeGFF3(prefix, results, no_qvalue)
        if verbose:
            end_gff: float = time.time()
            print("%s.gff written in %.2fs" % (prefix, (end_gff - start_gff)))

        # get the graphs of the top n regions
        if top_graphs > 0:
            regions: List[str] = results['sequence_name'].to_list()

            # the results are empty
            if len(regions) == 0:
                print(
                    "WARNING: no region was available. Are your results empty?\n")
                return

            # get the n different top regions' graph
            regions = unique_lst(regions, size=top_graphs)

            if verbose:
                print("Extracting %d region variation graphs" % top_graphs)

            if len(regions) < top_graphs:
                top_graphs = len(regions)
                print("WARNING: possible to visualize only the top %d regions\n"
                      % (top_graphs))

            # create the directory for the regions pictures
            image_dir: str
            if motif_num > 1:
                image_dir = '_'.join(["top_graphs", motif_id])
            else:
                image_dir = "top_graphs"

            if verbose:
                print("Graphs will be stored in %s" % image_dir)

            _make_results_dir(image_dir)
            os.chdir(image_dir)

            print("Writing the top %d graphs in %s\n" % (top_graphs, image_dir))

            for region in regions[:top_graphs]:
                # the VG accepts only queries like 1:100-200 and not like
                # chr1:100-200; names without the prefix are used as they
                # are
                if region.startswith("chr"):
                    region = region[len("chr"):]

                if verbose:
                    print("Computing the PNG image of %s" % region)

                getRegion_graph(region, vg)
    finally:
        os.chdir(cwd)


def _make_results_dir(dirname: str) -> None:
    """Create `dirname` together with any missing parent directory.

    An already existing directory is not an error. Failures are reported
    as SubprocessError, with the underlying OSError chained as cause.
    """

    try:
        os.makedirs(dirname, exist_ok=True)
    except OSError as exc:
        errmsg: str = ''.join(
            ["\n\nERROR: unable to create the directory ", dirname, "\n"])
        raise SubprocessError(errmsg) from exc
def writeGFF3(prefix: str, data: pd.DataFrame, no_qvalue: bool) -> None:
    """Write a GFF3 file (https://www.ensembl.org/info/website/upload/gff3.html)
    containing all the motif occurrence candidates retrieved by GRAFIMO.

    The resulting file can be loaded on the UCSC genome browser to view
    the occurrences.

    The file is named `<prefix>.gff`. For occurrences on the reverse
    strand the start and stop coordinates are swapped before being
    written.

    Parameters
    ----------
    prefix : str
        filename prefix
    data : pandas.DataFrame
        results of GRAFIMO analysis
    no_qvalue : bool
        if set to True the GFF3 entries will NOT contain the q-value;
        if False each entry also reports the corresponding q-value

    Raises
    ------
    NoDataFrameException
        if data is not a pandas.DataFrame
    FileReadingException
        if the report cannot be built or written; the original error is
        chained as the cause of the exception
    """

    errmsg: str
    if not isinstance(data, pd.DataFrame):
        errmsg = "\n\nERROR: the object is not an instance of pandas.DataFrame"
        raise NoDataFrameException(errmsg)

    qvalue: bool = not no_qvalue

    try:
        # the context manager closes the file on every path and, unlike
        # a `finally: f.close()`, does not fail when open() itself fails
        with open(''.join([prefix, '.gff']), mode='w+') as f:
            f.write("##gff-version 3\n")

            data_list: List[list] = list_data(data, qvalue)

            if qvalue and len(data_list) < 12:
                errmsg = "\n\nERROR: wrong data size. Unable to write the GFF3 report\n"
                raise Exception(errmsg)

            data_list_size: int = len(data_list[0])

            for i in range(data_list_size):
                seqname: str = data_list[2][i]
                # takes only the chromosome name
                chrom: str = seqname.split(':')[0]
                score: float = round(data_list[6][i], 1)
                strand: str = data_list[5][i]

                if strand == '-':
                    # keep forward strand coordinates
                    start = data_list[4][i]
                    end = data_list[3][i]
                else:
                    start = data_list[3][i]
                    end = data_list[4][i]

                motifID: str = data_list[0][i]
                motifName: str = data_list[1][i]
                pvalue: str = np.format_float_scientific(data_list[7][i],
                                                         exp_digits=2)
                sequence: str = data_list[8][i]
                reference: str = data_list[10][i]

                att1: str = ''.join(
                    ['Name=', motifID, '_', seqname, strand, ':', reference])
                att2: str = ''.join(["Alias=", motifName])
                att3: str = ''.join(
                    ["ID=", motifID, '-', motifName, '-', seqname])
                att4: str = ''.join(['pvalue=', str(pvalue)])
                att5: str = ''.join(['sequence=', sequence, ';\n'])

                atts: str
                if qvalue:
                    # a dedicated name keeps the boolean flag intact
                    # across iterations
                    qval: str = np.format_float_scientific(data_list[11][i],
                                                           exp_digits=2)
                    attqv: str = ''.join(['qvalue=', str(qval)])
                    atts = ';'.join([att1, att2, att3, att4, attqv, att5])
                else:
                    atts = ';'.join([att1, att2, att3, att4, att5])

                # str.join() accepts only strings: the coordinates must
                # be converted explicitly
                gffline: str = '\t'.join([
                    chrom, SOURCE, TP, str(start), str(end), str(score),
                    strand, PHASE, atts
                ])

                f.write(gffline)
            # end for
    except Exception as exc:
        # every failure is reported with the same exception type, but
        # KeyboardInterrupt/SystemExit are no longer intercepted and the
        # real cause stays available through exception chaining
        errmsg = ''.join(
            ["\n\nERROR: unable to open or write data on ", prefix, ".gff"])
        raise FileReadingException(errmsg) from exc
def scale_pwm(motif_matrix: pd.DataFrame, alphabet: List[str],
              motif_width: int
              ) -> Tuple[pd.DataFrame, int, int, int, np.double]:
    """Scale the log-odds values of the motif scoring matrix.

    Each value is shifted by an offset (the floor of the smallest matrix
    value) and multiplied by an integer scaling factor computed from
    RANGE, then rounded to an integer.

    The scaling improves computational speed while computing the score
    for each motif occurrence candidate, and allows a constant time
    computation of the corresponding P-value.

    Only the cells addressed by the alphabet symbols (rows) and by the
    positions 0..motif_width-1 (columns) are scaled; any other cell of
    the returned matrix is 0.

    Parameters
    ----------
    motif_matrix : pd.DataFrame
        motif log-odds matrix
    alphabet: list
        DNA motif alphabet
    motif_width: int
        motif width

    Returns
    -------
    pandas.DataFrame
        scaled motif scoring matrix (integer values), with the same
        index and columns of the input matrix
    int
        minimum value of the scaled scoring matrix
    int
        maximum value of the scaled scoring matrix
    int
        scaling factor
    numpy.double
        scaling offset

    Raises
    ------
    NoDataFrameException
        if motif_matrix is not a pandas.DataFrame
    NotValidMotifMatrixException
        if motif_matrix is empty
    NotValidAlphabetException
        if alphabet is not a list or is not a valid DNA alphabet
    AssertionError
        if motif_width is not positive
    """

    errmsg: str
    if not isinstance(motif_matrix, pd.DataFrame):
        errmsg = "\n\nERROR: The given motif matrix must be an instance of pandas.DataFrame"
        raise NoDataFrameException(errmsg)

    if motif_matrix.empty:
        errmsg = "\n\nERROR: The given motif matrix is empty"
        raise NotValidMotifMatrixException(errmsg)

    if not isinstance(alphabet, list):
        errmsg = "\n\nERROR: The alphabet given is not in a list"
        raise NotValidAlphabetException(errmsg)

    if not isListEqual(alphabet, DNA_ALPHABET):
        errmsg = "\n\nERROR: The alphabet given is not a valid DNA alphabet"
        raise NotValidAlphabetException(errmsg)

    assert motif_width > 0

    # extreme values of the unscaled matrix
    min_val = min(motif_matrix.min())
    max_val = max(motif_matrix.max())

    motif_matrix_sc: pd.DataFrame = pd.DataFrame(
        index=list(motif_matrix.index),
        columns=list(motif_matrix.columns),
        data=0)

    lower = min_val
    upper = max_val

    if lower == upper:  # all values are equal
        # widen the interval to avoid a division by zero below
        lower = np.double(upper - 1)

    lower = np.floor(lower)
    offset: np.double = np.round(lower)
    scale_factor = np.floor(RANGE / (upper - lower))

    # values will be in [0, RANGE]
    for nuc in alphabet:
        for j in range(motif_width):
            scaled_score = np.round(
                (motif_matrix.loc[nuc, j] - (offset)) * scale_factor)
            motif_matrix_sc.loc[nuc, j] = scaled_score
        # end for
    # end for

    # make sure the values are integers: astype() returns a new frame
    # with the requested dtype, while an in-place `[:] =` assignment
    # may leave the dtype of the existing columns untouched
    motif_matrix_sc = motif_matrix_sc.astype(np.int64)

    # now they are scaled
    min_val = min(motif_matrix_sc.min())
    max_val = max(motif_matrix_sc.max())

    return motif_matrix_sc, min_val, max_val, int(scale_factor), offset
def list_data(data: pd.DataFrame, qvalue: bool) -> List:
    """Turn the columns of a results DataFrame into a list of lists.

    The returned list holds one list of values per column, in this
    order: motif_id, motif_alt_id, sequence_name, start, stop, strand,
    score, p-value, matched_sequence, haplotype_frequency, reference
    and, only if requested, q-value.

    Parameters
    ----------
    data : pandas.DataFrame
        input DataFrame; it must have 11 or 12 columns
    qvalue : bool
        if True the column of q-values is appended to the result

    Returns
    -------
    list
        list containing DataFrame's columns as lists of values
    """

    if not isinstance(data, pd.DataFrame):
        errmsg: str = "\n\nERROR: not allowed data type given"
        raise NoDataFrameException(errmsg)

    ncols: int = len(data.columns)
    assert ncols <= 12
    assert ncols >= 11

    # columns are read from the frame in this order...
    read_order: List[str] = [
        'sequence_name', 'start', 'stop', 'score', 'strand', 'motif_id',
        'motif_alt_id', 'p-value', 'matched_sequence',
        'haplotype_frequency', 'reference'
    ]
    # ...and laid out in the result in this one
    out_order: List[str] = [
        'motif_id', 'motif_alt_id', 'sequence_name', 'start', 'stop',
        'strand', 'score', 'p-value', 'matched_sequence',
        'haplotype_frequency', 'reference'
    ]
    if qvalue:
        read_order.append('q-value')
        out_order.append('q-value')

    columns = {}
    for name in read_order:
        columns[name] = data[name].to_list()

    summary = [columns[name] for name in out_order]

    # every column must provide exactly one value per DataFrame row
    expected: int = len(columns['motif_id'])
    assert expected == len(data.index)
    for values in summary[1:]:
        assert expected == len(values)

    return summary
def scale_pwm(motif_matrix, alphabet, motif_width):
    """Scale the motif matrix values

    Each value is shifted by an offset (the floor of the smallest
    matrix value) and multiplied by an integer scaling factor computed
    from RANGE, then rounded to an integer. Only the cells addressed by
    the alphabet symbols (rows) and by the positions 0..motif_width-1
    (columns) are scaled; any other cell of the returned matrix is 0.

    ----
    Parameters:
        motif_matrix (pd.DataFrame) : motif matrix to scale
        alphabet (list) : motif alphabet
        motif_width (int) : motif width
    ----
    Returns:
        motif_matrix_sc (pd.DataFrame) : scaled motif matrix
        min_val (int) : lowest value in the scaled motif matrix
        max_val (int) : highest value in the scaled motif matrix
        scale_factor (int) : factor used to scale the values
        offset (np.double) : offset subtracted before scaling
    ----
    Raises:
        NoDataFrameException : if motif_matrix is not a
                               pandas.DataFrame
        NotValidMotifMatrixException : if motif_matrix is empty
        NotValidAlphabetException : if alphabet is not a list or is not
                                    a valid DNA alphabet
        AssertionError : if motif_width is not positive
    """

    if not isinstance(motif_matrix, pd.DataFrame):
        raise NoDataFrameException(
            "The given motif matrix must be an instance of pandas.DataFrame")

    if motif_matrix.empty:
        raise NotValidMotifMatrixException("The given motif matrix is empty")

    if not isinstance(alphabet, list):
        raise NotValidAlphabetException("The alphabet given is not in a list")

    if not isListEqual(alphabet, DNA_ALPHABET):
        raise NotValidAlphabetException(
            "The alphabet given is not a valid DNA alphabet")

    assert motif_width > 0

    # extreme values of the unscaled matrix
    min_val = min(motif_matrix.min())
    max_val = max(motif_matrix.max())

    motif_matrix_sc = pd.DataFrame(index=list(motif_matrix.index),
                                   columns=list(motif_matrix.columns),
                                   data=0)

    lower = min_val
    upper = max_val

    if lower == upper:  # all values are equal
        # widen the interval to avoid a division by zero below
        lower = np.double(upper - 1)

    lower = np.floor(lower)
    offset = np.round(lower)
    scale_factor = np.floor(RANGE / (upper - lower))

    # values will be in [0, RANGE]
    for nuc in alphabet:
        for j in range(motif_width):
            scaled_score = np.round(
                (motif_matrix.loc[nuc, j] - (offset)) * scale_factor)
            motif_matrix_sc.loc[nuc, j] = scaled_score
        # end for
    # end for

    # make sure the values are integers: astype() returns a new frame
    # with the requested dtype, while an in-place `[:] =` assignment
    # may leave the dtype of the existing columns untouched
    motif_matrix_sc = motif_matrix_sc.astype(np.int64)

    # now they are scaled
    min_val = min(motif_matrix_sc.min())
    max_val = max(motif_matrix_sc.max())

    return motif_matrix_sc, min_val, max_val, int(scale_factor), offset