def read_MEME_motif(
    motif_file: str,
    bg_file: str,
    pseudocount: float,
    no_reverse: bool,
    verbose: bool,
    debug: bool,
) -> List[Motif]:
    """Read motif PWMs in MEME format.

    The data read are then used to build the scoring matrix for the motif,
    the P-value matrix, etc. Since a MEME file can contain one or more
    motifs, a Motif object is built for each stored PWM. The resulting
    motifs are stored in a list, which will constitute a MotifSet object.

    ...

    Parameters
    ----------
    motif_file : str
        path to the motif PWMs in MEME format
    bg_file : str
        UNIF, or path to the background file in Markov Background Format
        (http://meme-suite.org/doc/bfile-format.html).
    pseudocount : float
        value to add to motif PWM counts; must be a float > 0
    no_reverse : bool
        strand flag, forwarded to pseudo_bg when the background is prepared
        (NOTE(review): presumably True means "forward strand only" --
        confirm against pseudo_bg)
    verbose : bool
        print additional information
    debug : bool
        trace the full error stack

    Returns
    -------
    List[Motif]
        list of Motif objects, one per MOTIF entry, in file order
    """

    # ---- argument validation (errors are reported via exception_handler)
    if not isinstance(motif_file, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(
            TypeError, errmsg.format(type(motif_file).__name__), debug
        )
    if not os.path.isfile(motif_file):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(motif_file), debug)
    if not isinstance(bg_file, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(
            TypeError, errmsg.format(type(bg_file).__name__), debug
        )
    if bg_file != UNIF and not os.path.isfile(bg_file):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(bg_file), debug)
    if not isinstance(pseudocount, float):
        errmsg = "Expected float, got {}.\n"
        exception_handler(
            TypeError, errmsg.format(type(pseudocount).__name__), debug
        )
    if pseudocount <= 0:
        errmsg = "The pseudocount must be > 0.\n"
        exception_handler(ValueError, errmsg, debug)
    if not isinstance(no_reverse, bool):
        errmsg = "Expected bool, got {}.\n"
        exception_handler(
            TypeError, errmsg.format(type(no_reverse).__name__), debug
        )

    motifs_raw = []  # one dict of raw parsed data per MOTIF entry
    motifs: List[Motif] = []

    # ---- motif parsing
    try:
        # The context manager closes the stream on every path and, unlike a
        # trailing `finally: ifstream.close()`, cannot fail on an unbound
        # name when open() itself raises.
        with open(motif_file, mode="r") as ifstream:
            # the alphabet is shared by all motifs in the file
            alphabet = __read_alphabet_meme(motif_file, ifstream, debug)
            # symbol -> row index, used with the numpy matrices
            nucsmap = {nuc: i for i, nuc in enumerate(alphabet)}
            while True:
                # advance to the next "MOTIF" header line
                for line in ifstream:
                    if line.startswith("MOTIF"):
                        break  # new motif instance
                else:
                    break  # stream exhausted: every motif has been read
                if verbose:
                    start_rm = time.time()
                motifids = line.split()
                if len(motifids) == 2:  # only name
                    motif_id = motifids[1]
                    motif_name = motif_id
                else:  # assume first two fields: id, name
                    motif_id, motif_name = motifids[1:3]
                statistics = __read_statistics_meme(
                    motif_file, ifstream, debug
                )
                probs = __read_counts_meme(
                    motif_file, ifstream, statistics["width"], debug
                )
                motifs_raw.append(
                    {
                        "motifId": motif_id,
                        "motifName": motif_name,
                        "statistics": statistics,
                        "counts": probs,
                    }
                )
                if verbose:
                    end_rm = time.time()
                    print(
                        "Read motif %s in %.2fs."
                        % (motif_name, (end_rm - start_rm))
                    )
    except:  # noqa: E722
        # NOTE(review): catch-all kept on purpose. exception_handler is
        # defined elsewhere, so whatever it (or the parsing helpers) raises
        # inside the try is still re-reported as MotifFileReadError.
        errmsg = "An error occurred while reading {}.\n"
        exception_handler(
            MotifFileReadError, errmsg.format(motif_file), debug
        )
    else:
        # ---- background distribution
        if bg_file == UNIF:
            bgs = get_uniformBG(alphabet, debug)
        elif os.path.isfile(bg_file):
            bgs = readBGfile(bg_file, debug)
        else:
            errmsg = "Unable to parse {}.\n"
            exception_handler(BGFileError, errmsg.format(bg_file), debug)
        bgs = pseudo_bg(bgs, no_reverse, debug)  # add pseudocount to bg

        # ---- build one Motif per parsed entry
        for raw in motifs_raw:
            stats = raw["statistics"]
            width = stats["width"]
            mp = pd.DataFrame(np.matrix(raw["counts"]))
            mp.index = alphabet
            mp = norm_motif(mp, width, alphabet, debug)
            mp = apply_pseudocount_meme(
                mp.to_numpy(),
                pseudocount,
                stats["nsites"],
                width,
                bgs,
                alphabet,
                nucsmap,
                debug,
            )
            motif: Motif = Motif(
                mp, width, alphabet, raw["motifId"], raw["motifName"], nucsmap
            )
            motif.setBg(bgs)
            motifs.append(motif)
    return motifs
def read_JASPAR_motif(
    motif_file: str,
    bg_file: str,
    pseudocount: float,
    no_reverse: bool,
    verbose: bool,
    debug: bool,
) -> Motif:
    """Read a motif PWM in JASPAR format.

    The data read are then used to build the scoring matrix for the motif,
    the P-value matrix, etc.

    ...

    Parameters
    ----------
    motif_file : str
        path to the motif PWM in JASPAR format
    bg_file : str
        UNIF, or path to the background file in Markov Background Format
        (http://meme-suite.org/doc/bfile-format.html).
    pseudocount : float
        value to add to motif PWM counts
    no_reverse : bool
        strand flag, forwarded to pseudo_bg when the background is prepared
        (NOTE(review): presumably True means "forward strand only" --
        confirm against pseudo_bg)
    verbose : bool
        print additional information
    debug : bool
        trace the full error stack

    Returns
    -------
    Motif
        Motif object
    """

    nucs: List[str] = []
    counts: List[List[float]] = []  # one row of raw counts per nucleotide
    if verbose:
        start_rm: float = time.time()
    try:
        # The context manager closes the stream on every path and, unlike a
        # trailing `finally: ifstream.close()`, cannot fail on an unbound
        # name when open() itself raises.
        with open(motif_file, mode="r") as ifstream:
            readlines = 0  # check for empty files
            # header line: first character dropped, then "<id>\t<name>"
            header: str = str(ifstream.readline().strip()[1:])
            if not header:  # empty file?
                errmsg = "{} seems to be empty.\n"
                exception_handler(IOError, errmsg.format(motif_file), debug)
            motifID, motifName = header.split("\t")[0:2]
            readlines += 1
            while True:
                line = ifstream.readline().strip()
                if not line:
                    break  # EOF or blank line ends the matrix
                # first character is the nucleotide; of the remaining
                # whitespace-separated tokens the first and the last are
                # discarded (the row delimiters) and the rest are counts
                nuc = line[:1]
                count = list(map(float, line[1:].split()[1:][:-1]))
                nucs.append(nuc.upper())
                counts.append(count)
                readlines += 1
            if readlines <= 1:  # only header read ?
                errmsg = "{} seems to be empty.\n"
                exception_handler(IOError, errmsg.format(motif_file), debug)
    except:  # noqa: E722
        # NOTE(review): catch-all kept on purpose. exception_handler is
        # defined elsewhere, so whatever it raises inside the try is still
        # re-reported as MotifFileReadError.
        errmsg = "An error occurred while reading {}.\n"
        exception_handler(
            MotifFileReadError, errmsg.format(motif_file), debug
        )
    else:
        if any(len(c) != len(counts[0]) for c in counts):
            errmsg = "Motif counts width mismatch.\n"
            exception_handler(ValueError, errmsg, debug)
        # symbol -> row index (file order), used with the numpy matrices
        nucsmap = {nuc: i for i, nuc in enumerate(nucs)}
        motif_counts: pd.DataFrame = pd.DataFrame(
            data=counts, index=nucs
        )  # motif count matrix
        motif_width: int = int(len(counts[0]))
        alphabet: list = sorted(nucs)
        # compute background
        if bg_file == UNIF:
            bgs = get_uniformBG(alphabet, debug)
        elif os.path.isfile(bg_file):
            bgs = readBGfile(bg_file, debug)
        else:
            errmsg = "Unable to parse {}.\n"
            exception_handler(BGFileError, errmsg.format(bg_file), debug)
        bgs = pseudo_bg(bgs, no_reverse, debug)  # add pseudocount to bg
        # motif probability matrix: column-wise normalized counts
        motif_probs = motif_counts / motif_counts.sum(0)
        motif_probs = norm_motif(motif_probs, motif_width, alphabet, debug)
        motif_probs = apply_pseudocount_jaspar(
            motif_counts.to_numpy(),
            motif_probs.to_numpy(),
            pseudocount,
            bgs,
            motif_width,
            alphabet,
            nucsmap,
            debug,
        )
        motif: Motif = Motif(
            motif_probs, motif_width, alphabet, motifID, motifName, nucsmap
        )
        motif.setBg(bgs)
        if verbose:
            end_rm: float = time.time()
            msg: str = "Read motif %s in %.2fs" % (motifID, (end_rm - start_rm))
            print(msg)
    return motif
def read_JASPAR_motif(
    motif_file: str,
    bg_file: str,
    pseudocount: float,
    no_reverse: bool,
    verbose: bool,
) -> Motif:
    """Read a motif PWM in JASPAR format.

    The data read are then used to build the scoring matrix for the motif,
    the P-value matrix, etc.

    Parameters
    ----------
    motif_file : str
        path to the motif PWM in JASPAR format
    bg_file : str
        'UNIF', or path to the background file in Markov Background Format
        (http://meme-suite.org/doc/bfile-format.html).
    pseudocount : float
        value to add to motif PWM counts
    no_reverse : bool
        strand flag, forwarded to pseudo_bg when the background is prepared
        (NOTE(review): presumably True means "forward strand only" --
        confirm against pseudo_bg)
    verbose : bool
        print additional information

    Returns
    -------
    Motif
        Motif object storing the data contained in motif_file

    Raises
    ------
    FileReadingException
        if the motif file cannot be opened or parsed
    NotValidBGException
        if bg_file is neither 'UNIF' nor an existing path
    """

    # lists where nucleotides and raw counts are stored
    nucs: List[str] = []
    counts: List[List[float]] = []

    if verbose:
        start_rm: float = time.time()

    try:
        # the with-statement closes the file on every path; no explicit
        # close() is needed (and none can fail when open() itself raises)
        with open(motif_file) as in_mtf:
            # header line: first character dropped, then "<id>\t<name>"
            header: str = str(in_mtf.readline()[1:])
            # get the jaspar ID and the common TF name
            motifID, motifName = header.split('\t')[0:2]
            # drop only a trailing newline: the name keeps its last
            # character when further header fields (or EOF) follow it
            motifName = motifName.rstrip("\n")

            for line in in_mtf:
                line = line.strip()
                if not line:
                    continue  # a blank line is not a matrix row
                # first character is the nucleotide; of the remaining
                # whitespace-separated tokens the first and the last are
                # discarded (the row delimiters) and the rest are counts
                nuc = line[:1]
                count = list(map(float, line[1:].split()[1:][:-1]))
                nucs.append(nuc)
                counts.append(count)
    except Exception as exc:
        errmsg: str = ' '.join(["\n\nERROR: unable to read file", motif_file])
        raise FileReadingException(errmsg) from exc

    motif_counts = pd.DataFrame(data=counts, index=nucs)  # raw counts
    motif_width: int = int(len(counts[0]))
    alphabet: list = sorted(nucs)  # alphabet as list

    # read the background
    if bg_file == 'UNIF':
        bgs = get_uniformBG(alphabet)
    elif os.path.exists(bg_file):
        bgs = readBGfile(bg_file)
    else:
        errmsg = "\n\nERROR: unable to find the given background file"
        raise NotValidBGException(errmsg)
    bgs = pseudo_bg(bgs, no_reverse)

    # column-wise normalized counts
    motif_probs = motif_counts / motif_counts.sum(0)
    motif_probs = norm_motif(motif_probs, motif_width, alphabet)
    motif_probs = apply_pseudocount_jaspar(
        motif_counts, motif_probs, pseudocount, bgs, motif_width, alphabet
    )

    motif: Motif = Motif(motif_probs, motif_width, alphabet, motifID, motifName)
    motif.setBg(bgs)

    if verbose:
        end_rm: float = time.time()
        msg: str = ''.join(
            ["Read motif ", motifID, " in ", str(end_rm - start_rm), "s"]
        )
        print(msg)

    return motif
def read_MEME_motif(
    motif_file: str,
    bg_file: str,
    pseudocount: float,
    no_reverse: bool,
    verbose: bool,
) -> List[Motif]:
    """Read motif PWMs in MEME format.

    The data read are then used to build the scoring matrix for the motif,
    the P-value matrix, etc. Since a MEME file can contain one or more
    motifs, a Motif object is built for each PWM found. The resulting
    motifs are returned in a list.

    Parameters
    ----------
    motif_file : str
        path to the motif PWM file
    bg_file : str
        'UNIF', or path to the background probability distribution
    pseudocount : float
        pseudocount to add to the PWM values
    no_reverse : bool
        strand flag, forwarded to pseudo_bg when the background is prepared
        (NOTE(review): presumably True means "forward strand only" --
        confirm against pseudo_bg)
    verbose : bool
        print additional information

    Returns
    -------
    List[Motif]
        List of Motif objects storing the data contained in motif_file

    Raises
    ------
    FileReadingException
        if the motif file cannot be opened or parsed
    NotValidBGException
        if bg_file is neither 'UNIF' nor an existing path
    """

    try:
        # the with-statement closes the file on every path; no explicit
        # close() is needed (and none can fail when open() itself raises)
        with open(motif_file, 'r') as in_mtf:
            infostart = False  # the next non-blank line holds motif infos
            datastart = False  # the following lines hold probability rows
            motifs_found = 0  # number of motifs found in the MEME file

            motifID_lst: List[str] = []
            motifName_lst: List[str] = []
            motif_width_lst: List[int] = []
            site_counts_lst: List[int] = []
            motif_probs_lst: List[pd.DataFrame] = []
            # per-motif probability columns, one list per nucleotide
            a_lst: List[List[np.double]] = []
            c_lst: List[List[np.double]] = []
            g_lst: List[List[np.double]] = []
            t_lst: List[List[np.double]] = []

            motif_width = None  # None while no motif is being read
            pos_read = 0  # rows of the current motif read so far

            for line in in_mtf:
                if line[0:8] == 'ALPHABET':
                    alphabet: List = sorted(set(line[10:-1]))
                    # explicit raise: an assert would vanish under -O
                    if not isListEqual(alphabet, DNA_ALPHABET):
                        raise ValueError("the motif alphabet is not DNA")

                if line[0:5] == 'MOTIF':
                    if verbose:
                        start_rm: float = time.time()

                    # there are two ways to define the motif name line
                    # in MEME file
                    # (refer to http://meme-suite.org/doc/meme-format.html?man_type=web):
                    #   1 - MOTIF motif_alternate_name
                    #   2 - MOTIF motif_identifier motif_alternate_name
                    motif_header: List[str] = line.split()
                    if len(motif_header) == 2:  # support case (1)
                        motifID = motif_header[1]
                        motifName = motif_header[1]
                    else:  # support case (2)
                        motifID, motifName = motif_header[1:3]

                    motifID_lst.append(motifID)
                    motifName_lst.append(motifName)

                    # the informations about motif start here
                    infostart = True
                    continue

                if infostart and line.strip():
                    # fields are picked by position after a fixed
                    # 26-character prefix of the infos line
                    infosplit: List[str] = line[26:].split()
                    alphalen: int = int(infosplit[1])
                    if alphalen != len(alphabet):
                        raise ValueError(
                            "alphabet length mismatch in motif infos"
                        )
                    motif_width = int(infosplit[3])
                    site_counts: int = int(infosplit[5])
                    infostart = False  # informations end here

                    # allocate space for the motif probability matrix
                    motif_probs: pd.DataFrame = pd.DataFrame(
                        index=alphabet,
                        columns=range(motif_width),
                        data=np.double(0),
                    )

                    motif_width_lst.append(motif_width)
                    site_counts_lst.append(site_counts)
                    motif_probs_lst.append(motif_probs)

                    datastart = True  # at next step begin data

                    # initialize nucleotide data
                    a, c, g, t = [], [], [], []
                    continue

                if datastart and pos_read < motif_width:
                    # row columns are read in A, C, G, T order
                    freqs = line.split()
                    a.append(np.double(freqs[0]))
                    c.append(np.double(freqs[1]))
                    g.append(np.double(freqs[2]))
                    t.append(np.double(freqs[3]))
                    pos_read += 1

                # we read all current motif data
                if pos_read == motif_width:
                    a_lst.append(a)
                    c_lst.append(c)
                    g_lst.append(g)
                    t_lst.append(t)

                    # update stats about found motifs
                    motifs_found += 1

                    # clear the statistics
                    pos_read = 0
                    motif_width = None
                    datastart = False

                    if verbose:
                        end_rm: float = time.time()
                        msg: str = ''.join([
                            "Read motif ", motifID, " in ",
                            str(end_rm - start_rm), "s"
                        ])
                        print(msg)
    except Exception as exc:  # something went wrong
        errmsg: str = ' '.join(["Unable to read file", motif_file])
        raise FileReadingException(errmsg) from exc

    # read the background
    if bg_file == 'UNIF':
        bgs = get_uniformBG(alphabet)
    elif os.path.exists(bg_file):
        bgs = readBGfile(bg_file)
    else:
        errmsg = "\n\nERROR: unable to find the given background file"
        raise NotValidBGException(errmsg)
    bgs = pseudo_bg(bgs, no_reverse)

    motif_lst: List[Motif] = []
    for i in range(motifs_found):
        # fill the pre-allocated matrix with the parsed columns
        mp: pd.DataFrame = motif_probs_lst[i]
        mp.loc['A'] = a_lst[i]
        mp.loc['C'] = c_lst[i]
        mp.loc['G'] = g_lst[i]
        mp.loc['T'] = t_lst[i]

        mw: int = motif_width_lst[i]
        sc: int = site_counts_lst[i]

        mp = norm_motif(mp, mw, alphabet)
        mp = apply_pseudocount_meme(mp, pseudocount, sc, mw, bgs, alphabet)

        motif: Motif = Motif(mp, mw, alphabet, motifID_lst[i], motifName_lst[i])
        motif.setBg(bgs)
        motif_lst.append(motif)

    return motif_lst
def read_MEME_motif(motif_file, bg_file, pseudocount, no_reverse, verbose):
    """
        Read the motif file in MEME format and build a Motif
        object for each motif found in it.
        Note that a MEME file can contain a variable number of
        motifs
        ----
        Params:
            motif_file (str) : path to the motif file
            bg_file (str) : 'UNIF' or path to the background file
            pseudocount (np.double) : pseudocount to add to motif frequencies
            no_reverse (bool) : if set to True, only data related to
                                forward strand will be used
            verbose (bool) : if True, print the time spent reading
                             each motif
        ----
        Returns:
            motif_lst (list) : list of Motif objects, in file order
        ----
        Raises:
            FileReadingException : the motif file cannot be opened
                                   or parsed
            NotValidBGException : bg_file is neither 'UNIF' nor an
                                  existing path
    """

    try:
        # the with-statement closes the file on every path; no explicit
        # close() is needed (and none can fail when open() itself raises)
        with open(motif_file, 'r') as in_mtf:
            infostart = False  # the next non-blank line holds motif infos
            datastart = False  # the following lines hold probability rows
            motifs_found = 0  # number of motifs found in the MEME file

            motifID_lst = []  # list of the found motif IDs
            motifName_lst = []  # list of the found motif names
            motif_width_lst = []  # list of the found motif widths
            site_counts_lst = []  # list of the found motif site counts
            motif_probs_lst = []  # list of the motif probability matrices
            a_lst = []  # As probabilities for each motif
            c_lst = []  # Cs probabilities for each motif
            g_lst = []  # Gs probabilities for each motif
            t_lst = []  # Ts probabilities for each motif

            motif_width = None  # None while no motif is being read
            pos_read = 0  # rows of the current motif read so far

            for line in in_mtf:
                if line[0:8] == 'ALPHABET':
                    alphabet = sorted(set(line[10:-1]))
                    # explicit raise: an assert would vanish under -O
                    if not isListEqual(alphabet, DNA_ALPHABET):
                        raise ValueError("the motif alphabet is not DNA")

                if line[0:5] == 'MOTIF':
                    if verbose:
                        start_rm = time.time()

                    # the MEME format allows two forms of motif name line
                    # (http://meme-suite.org/doc/meme-format.html):
                    #   1 - MOTIF motif_alternate_name
                    #   2 - MOTIF motif_identifier motif_alternate_name
                    motif_header = line.split()
                    if len(motif_header) == 2:  # form (1)
                        motifID = motifName = motif_header[1]
                    else:  # form (2)
                        motifID, motifName = motif_header[1:3]

                    motifID_lst.append(motifID)
                    motifName_lst.append(motifName)

                    # the informations about motif start here
                    infostart = True
                    continue

                if infostart and line.strip():
                    # fields are picked by position after a fixed
                    # 26-character prefix of the infos line
                    infosplit = line[26:].split()
                    alphalen = int(infosplit[1])
                    if alphalen != len(alphabet):
                        raise ValueError(
                            "alphabet length mismatch in motif infos"
                        )
                    motif_width = int(infosplit[3])
                    site_counts = int(infosplit[5])
                    infostart = False  # informations end here

                    # allocate space for the motif probability matrix
                    motif_probs = pd.DataFrame(index=alphabet,
                                               columns=range(motif_width),
                                               data=np.double(0))

                    motif_width_lst.append(motif_width)
                    site_counts_lst.append(site_counts)
                    motif_probs_lst.append(motif_probs)

                    datastart = True  # at next step begin data

                    # initialize nucleotide data
                    a, c, g, t = [], [], [], []
                    continue

                if datastart and pos_read < motif_width:
                    # row columns are read in A, C, G, T order
                    freqs = line.split()
                    a.append(np.double(freqs[0]))
                    c.append(np.double(freqs[1]))
                    g.append(np.double(freqs[2]))
                    t.append(np.double(freqs[3]))
                    pos_read += 1

                # we read all current motif data
                if pos_read == motif_width:
                    a_lst.append(a)
                    c_lst.append(c)
                    g_lst.append(g)
                    t_lst.append(t)

                    # update stats about found motifs
                    motifs_found += 1

                    # clear the statistics
                    pos_read = 0
                    motif_width = None
                    datastart = False

                    if verbose:
                        end_rm = time.time()
                        msg = ''.join([
                            "Read motif ", motifID, " in ",
                            str(end_rm - start_rm), "s"
                        ])
                        print(msg)
    except Exception as exc:  # something went wrong
        errmsg = ' '.join(["Unable to read file", motif_file])
        raise FileReadingException(errmsg) from exc

    # read the background
    if bg_file == 'UNIF':
        bgs = get_uniformBG(alphabet)
    elif os.path.exists(bg_file):
        bgs = readBGfile(bg_file)
    else:
        raise NotValidBGException(
            "\n\nERROR: unable to find the given background file")
    bgs = pseudo_bg(bgs, no_reverse)

    motif_lst = []  # list of found motifs
    for i in range(motifs_found):
        # fill the pre-allocated matrix with the parsed columns
        mp = motif_probs_lst[i]
        mp.loc['A'] = a_lst[i]
        mp.loc['C'] = c_lst[i]
        mp.loc['G'] = g_lst[i]
        mp.loc['T'] = t_lst[i]

        mw = motif_width_lst[i]
        sc = site_counts_lst[i]

        mp = norm_motif(mp, mw, alphabet)
        mp = apply_pseudocount_meme(mp, pseudocount, sc, mw, bgs, alphabet)

        motif = Motif(mp, mw, alphabet, motifID_lst[i], motifName_lst[i])
        motif.setBg(bgs)
        motif_lst.append(motif)

    return motif_lst
def read_JASPAR_motif(motif_file, bg_file, pseudocount, no_reverse, verbose):
    """
        Read data contained in a JASPAR motif file and build a Motif
        object from them
        ----
        Params:
            motif_file (str) : path to the motif file (in JASPAR format)
            bg_file (str) : 'UNIF' or path to the background file
            pseudocount : pseudocount forwarded to
                          apply_pseudocount_jaspar
            no_reverse (bool) : flag parameter to consider or not the
                                reverse complement building the Motif
                                object
            verbose (bool) : if True, print the time spent reading
                             the motif
        ----
        Returns:
            motif (Motif) : Motif object summarizing data contained in
                            motif_file
        ----
        Raises:
            FileReadingException : the motif file cannot be opened
                                   or parsed
            NotValidBGException : bg_file is neither 'UNIF' nor an
                                  existing path
    """

    # lists where nucleotides and raw counts are stored
    nucs = []
    counts = []

    if verbose:
        start_rm = time.time()

    try:
        # the with-statement closes the file on every path; no explicit
        # close() is needed (and none can fail when open() itself raises)
        with open(motif_file) as in_mtf:
            # header line: first character dropped, then "<id>\t<name>"
            header = str(in_mtf.readline()[1:])
            # get the jaspar ID and the common TF name
            motifID, motifName = header.split('\t')[0:2]
            # drop only a trailing newline: the name keeps its last
            # character when further header fields (or EOF) follow it
            motifName = motifName.rstrip("\n")

            for line in in_mtf:
                line = line.strip()
                if not line:
                    continue  # a blank line is not a matrix row
                # first character is the nucleotide; of the remaining
                # whitespace-separated tokens the first and the last are
                # discarded (the row delimiters) and the rest are counts
                nuc = line[:1]
                count = list(map(float, line[1:].split()[1:][:-1]))
                nucs.append(nuc)
                counts.append(count)
    except Exception as exc:  # something went wrong
        errmsg = ' '.join(["\n\nERROR: unable to read file", motif_file])
        raise FileReadingException(errmsg) from exc

    motif_counts = pd.DataFrame(data=counts, index=nucs)  # raw counts
    motif_width = int(len(counts[0]))
    alphabet = sorted(nucs)  # alphabet as list

    # read the background file
    if bg_file == 'UNIF':
        bgs = get_uniformBG(alphabet)
    elif os.path.exists(bg_file):
        bgs = readBGfile(bg_file)
    else:
        raise NotValidBGException(
            "\n\nERROR: unable to find the given background file")
    bgs = pseudo_bg(bgs, no_reverse)

    motif_probs = (motif_counts / motif_counts.sum(0))  # get probabilities
    motif_probs = norm_motif(motif_probs, motif_width, alphabet)
    motif_probs = apply_pseudocount_jaspar(motif_counts, motif_probs,
                                           pseudocount, bgs, motif_width,
                                           alphabet)

    motif = Motif(motif_probs, motif_width, alphabet, motifID, motifName)
    motif.setBg(bgs)

    if verbose:
        end_rm = time.time()
        msg = ''.join(
            ["Read motif ", motifID, " in ", str(end_rm - start_rm), "s"])
        print(msg)

    return motif