def score_seqs(sequences: List[str],
               motif: Motif,
               no_reverse: bool,
               return_dict: DictProxy,
               scanned_seqs_dict: DictProxy,
               scanned_nucs_dict: DictProxy,
               pid: int
) -> None:
    """Score the retrieved sequences using the motif scoring matrix data.

    The partial results are stored in dictionaries using the current
    process ID as key. The different entries are meant to be merged by the
    caller to obtain the final results report.

    Each element of `sequences` is opened as a text file and read line by
    line; every line is split on tabs and the fields are used as follows:

    * field 0: sequence name (``'chr'`` is prepended to it)
    * field 1: the sequence to score
    * field 2: ``<something>:<start><strand>`` (last character is the strand)
    * field 3: ``<something>:<stop><strand>``
    * field 4: haplotype frequency
    * field 5: reference flag

    Parameters
    ----------
    sequences : list
        paths of the files containing the sequences to score
    motif : Motif
        motif object containing the scoring matrix and the P-value matrix
    no_reverse : bool
        if True only the sequences belonging to the forward strand ('+')
        will be scored
    return_dict : multiprocessing.managers.DictProxy
        dictionary where the current chunk of results will be stored
    scanned_seqs_dict : mp.managers.DictProxy
        dictionary storing the number of sequences scanned in each
        sequence chunk
    scanned_nucs_dict : mp.managers.DictProxy
        dictionary storing the number of nucleotides scanned in each
        sequence chunk (scanned sequences times motif width)
    pid : int
        key under which the results of this call are stored in the three
        dictionaries

    Notes
    -----
    If a KeyboardInterrupt is received while scoring, nothing is stored
    for `pid` and the function returns silently.
    """

    try:
        # get Motif attributes to score sequences
        score_matrix: np.ndarray = motif.getMotif_scoreMatrix()
        pval_mat: np.array = motif.getMotif_pval_mat()
        min_score: int = motif.getMin_val()
        scale: int = motif.getScale()
        width: int = motif.getWidth()
        offset: np.double = motif.getOffset()

        # initialize lists where results will be stored
        seqs: List[str] = list()
        scores: List[np.double] = list()
        pvalues: List[np.double] = list()
        seqnames: List[str] = list()
        chroms: List[str] = list()
        # coordinates and frequencies are stored as parsed (strings)
        starts: List[str] = list()
        stops: List[str] = list()
        strands: List[str] = list()
        frequencies: List[str] = list()
        references: List[str] = list()

        seqs_scanned: int = 0  # counter for scanned sequences

        for s in sequences:
            with open(s, mode='r') as raw_sequences:
                for line in raw_sequences:
                    data = line.split('\t')
                    strand = data[2][-1]

                    # when only the forward strand is requested, skip
                    # everything that does not belong to it
                    if no_reverse and strand != '+':
                        continue

                    # read the final values
                    seq = data[1]
                    seqname = ''.join(['chr', data[0]])
                    chrom = seqname.split(':')[0]
                    # drop the trailing strand character from coordinates
                    start = data[2].split(':')[1][:-1]
                    stop = data[3].split(':')[1][:-1]
                    freq = data[4]
                    # NOTE(review): if field 5 is the last one of the line
                    # it still carries the trailing newline, so the
                    # comparison with "ref" below would never match --
                    # confirm the input has further columns
                    ref = data[5]

                    score, pvalue = compute_score_seq(seq, score_matrix,
                                                      pval_mat, min_score,
                                                      scale, width, offset)
                    seqs_scanned += 1

                    seqs.append(seq)
                    scores.append(score)
                    pvalues.append(pvalue)
                    seqnames.append(seqname)
                    chroms.append(chrom)
                    starts.append(start)
                    stops.append(stop)
                    strands.append(strand)
                    frequencies.append(freq)

                    # fix indels reference report bug: a "ref" sequence
                    # whose span differs from the motif width is reported
                    # as non-reference
                    distance: int = abs(int(stop) - int(start))
                    if ref == "ref" and distance != width:
                        ref = "non.ref"
                    references.append(ref)

    except KeyboardInterrupt:
        pass

    else:
        res_tmp = ResultTmp(seqnames, seqs, chroms, starts, stops, strands,
                            scores, pvalues, frequencies, references)
        return_dict[pid] = res_tmp
        scanned_seqs_dict[pid] = seqs_scanned
        scanned_nucs_dict[pid] = seqs_scanned * width
def build_df(motif: Motif,
             seqnames: List[str],
             starts: List[int],
             stops: List[int],
             strands: List[str],
             scores: List[np.double],
             pvalues: List[np.double],
             qvalues: List[np.double],
             sequences: List[str],
             frequencies: List[int],
             references: List[str],
             threshold: float,
             qval_t: bool,
             no_qvalue: bool,
             recomb: bool
) -> pd.DataFrame:
    """Build the results summary report.

    The results are stored in a pandas DataFrame. The motif occurrence
    candidates are filtered applying a threshold on the P-value or on the
    q-value; the remaining entries are reported, sorted by P-value and
    indexed starting from 1.

    Parameters
    ----------
    motif : Motif
        Motif object (its ID and name fill the first two columns)
    seqnames : list
        sequence names
    starts : list
        starting coordinates
    stops : list
        stopping coordinates
    strands : list
        DNA strands
    scores : list
        motif scores
    pvalues : list
        P-values
    qvalues : list
        q-values (must be a list even when `no_qvalue` is True)
    sequences : list
        sequences
    frequencies : list
        haplotype frequencies; each entry must be convertible with int()
    references : list
        flag values stating if the sequences contain genomic variants
    threshold : float
        threshold to apply on P-values or q-values in order to filter the
        motif occurrence candidates to report (strictly-lower values pass)
    qval_t : bool
        if True the threshold will be applied on q-values rather than on
        P-values
    no_qvalue : bool
        if True the q-values have not been computed and the 'q-value'
        column is omitted
    recomb : bool
        if True also the sequences with frequency 0 are reported, i.e.
        those which can be built with the given set of genomic variants
        but do not appear in the available samples haplotypes

    Returns
    -------
    pandas.DataFrame
        final results report

    Raises
    ------
    ValueException
        if one of the type-checked arguments has an unexpected type
    AssertionError
        if the input lists have inconsistent lengths, or if `qval_t` is
        requested without q-values
    """

    # the message names the offending argument
    errmsg: str = "\n\nERROR: unknown data-type for {}"

    if not isinstance(motif, Motif):
        raise ValueException(errmsg.format("motif"))

    list_args = (
        ("seqnames", seqnames),
        ("starts", starts),
        ("stops", stops),
        ("strands", strands),
        ("pvalues", pvalues),
        ("qvalues", qvalues),
        ("sequences", sequences),
        ("references", references),
    )
    for arg_name, arg_value in list_args:
        if not isinstance(arg_value, list):
            raise ValueException(errmsg.format(arg_name))

    flag_args = (
        ("qval_t", qval_t),
        ("no_qvalue", no_qvalue),
        ("recomb", recomb),
    )
    for arg_name, arg_value in flag_args:
        if not isinstance(arg_value, bool):
            raise ValueException(errmsg.format(arg_name))

    lst_len: int = len(seqnames)

    assert len(starts) == lst_len
    assert len(stops) == lst_len
    assert len(strands) == lst_len
    assert len(scores) == lst_len
    assert len(pvalues) == lst_len
    assert len(sequences) == lst_len
    assert len(frequencies) == lst_len
    assert len(references) == lst_len

    # check if we want also the q-values
    if not no_qvalue:
        assert len(qvalues) == lst_len

    # apply the threshold on the q-values rather than on P-values
    if qval_t:
        assert (not no_qvalue)
        assert len(qvalues) > 0

    # values the threshold is compared against
    tested_values = qvalues if qval_t else pvalues

    # indexes of the candidates surviving the filters; candidates which do
    # not appear in any sample are ignored unless required by the user
    kept: List[int] = [
        i for i in range(lst_len)
        if (recomb or int(frequencies[i]) != 0)
        and tested_values[i] < threshold
    ]
    df_len: int = len(kept)

    df = pd.DataFrame()
    # TF's ID and name are repeated on every row
    df['motif_id'] = [motif.getMotifID()] * df_len
    df['motif_alt_id'] = [motif.getMotifName()] * df_len
    df['sequence_name'] = [seqnames[i] for i in kept]
    df['start'] = [starts[i] for i in kept]
    df['stop'] = [stops[i] for i in kept]
    df['strand'] = [strands[i] for i in kept]
    df['score'] = [scores[i] for i in kept]
    df['p-value'] = [pvalues[i] for i in kept]

    # add the q-values to the final data frame if they have been computed
    if not no_qvalue:
        df['q-value'] = [qvalues[i] for i in kept]

    # finish to build the data frame
    df['matched_sequence'] = [sequences[i] for i in kept]
    df['haplotype_frequency'] = [frequencies[i] for i in kept]
    df['reference'] = [references[i] for i in kept]

    # sort entries by p-value
    df = df.sort_values(['p-value'], ascending=True)

    # reindex the data frame in order to have indexes in range [1, df_len]
    df.index = list(range(1, (df_len + 1)))

    return df
def read_MEME_motif(motif_file: str,
                    bg_file: str,
                    pseudocount: float,
                    no_reverse: bool,
                    verbose: bool,
                    debug: bool
) -> List[Motif]:
    """Read motif PWMs in MEME format.

    The data read are then used to build, for each motif, the probability
    matrix (normalized and with pseudocount applied) and the background
    distribution. Since a MEME file can contain one or more motifs, a
    Motif object is built for each stored PWM and the resulting objects
    are returned in a list.

    ...

    Parameters
    ----------
    motif_file : str
        path to the motif PWM file in MEME format
    bg_file : str
        path to the background file in Markov Background Format
        (http://meme-suite.org/doc/bfile-format.html), or the `UNIF`
        constant to use a uniform background.
    pseudocount : float
        value to add to motif PWM counts (must be > 0)
    no_reverse : bool
        forwarded to `pseudo_bg`; presumably True means that only the
        forward strand is considered -- confirm against `pseudo_bg`
    verbose : bool
        print additional information
    debug : bool
        trace the full error stack (forwarded to `exception_handler`)

    Returns
    -------
    List[Motif]
        list of Motif objects
    """

    if not isinstance(motif_file, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(motif_file).__name__), debug)
    if not os.path.isfile(motif_file):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(motif_file), debug)
    if not isinstance(bg_file, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(bg_file).__name__), debug)
    if bg_file != UNIF and not os.path.isfile(bg_file):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(bg_file), debug)
    if not isinstance(pseudocount, float):
        errmsg = "Expected float, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(pseudocount).__name__), debug)
    if pseudocount <= 0:
        errmsg = "The pseudocount must be > 0.\n"
        exception_handler(ValueError, errmsg, debug)
    if not isinstance(no_reverse, bool):
        errmsg = "Expected bool, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(no_reverse).__name__), debug)

    motifs_raw = list()
    motifs: List[Motif] = list()
    motifs_num = 0
    proceed = False
    # bound before the try so that `finally` never hits an unbound name
    # when open() itself fails
    ifstream = None
    # begin motif parsing
    try:
        ifstream = open(motif_file, mode="r")
        # the alphabet is shared by all motifs
        alphabet = __read_alphabet_meme(motif_file, ifstream, debug)
        # nucleotide -> row index, used with np object
        nucsmap = {nuc: i for i, nuc in enumerate(alphabet)}
        while True:
            # move forward to the next motif header
            for line in ifstream:
                if line.startswith("MOTIF"):
                    break  # new motif instance
            else:
                # EOF reached without finding further motifs
                assert motifs_num == len(motifs_raw)
                proceed = True
                break
            if proceed:
                break  # read all motifs
            if verbose:
                start_rm = time.time()
            motifids = line.split()
            if len(motifids) == 2:  # only name
                motif_id = motifids[1]
                motif_name = motif_id
            else:  # assume first two fields: id, name
                motif_id, motif_name = motifids[1:3]
            statistics = __read_statistics_meme(motif_file, ifstream, debug)
            probs = __read_counts_meme(motif_file, ifstream,
                                       statistics["width"], debug)
            motifs_raw.append({
                "motifId": motif_id,
                "motifName": motif_name,
                "statistics": statistics,
                "counts": probs
            })
            motifs_num += 1
            if verbose:
                end_rm = time.time()
                print("Read motif %s in %.2fs." % (motif_name, (end_rm - start_rm)))
        if not proceed:
            errmsg = "Unexpected premature EOF in {}.\n"
            exception_handler(EOFError, errmsg.format(motif_file), debug)
    # NOTE(review): kept as a bare except on purpose -- whether narrowing
    # it is safe depends on how exception_handler reports errors
    except:
        errmsg = "An error occurred while reading {}.\n"
        exception_handler(MotifFileReadError, errmsg.format(motif_file), debug)
    else:
        if bg_file == UNIF:
            bgs = get_uniformBG(alphabet, debug)
        elif os.path.isfile(bg_file):
            bgs = readBGfile(bg_file, debug)
        else:
            errmsg = "Unable to parse {}.\n"
            exception_handler(BGFileError, errmsg.format(bg_file), debug)
        bgs = pseudo_bg(bgs, no_reverse, debug)  # add pseudocount to bg
        for i in range(motifs_num):
            mp = pd.DataFrame(np.matrix(motifs_raw[i]["counts"]))
            mp.index = alphabet
            mp = norm_motif(mp, motifs_raw[i]["statistics"]["width"],
                            alphabet, debug)
            mp = apply_pseudocount_meme(mp.to_numpy(), pseudocount,
                                        motifs_raw[i]["statistics"]["nsites"],
                                        motifs_raw[i]["statistics"]["width"],
                                        bgs, alphabet, nucsmap, debug)
            motif: Motif = Motif(mp, motifs_raw[i]["statistics"]["width"],
                                 alphabet, motifs_raw[i]["motifId"],
                                 motifs_raw[i]["motifName"], nucsmap)
            motif.setBg(bgs)
            motifs.append(motif)
    finally:
        # close the file only if it was actually opened
        if ifstream is not None:
            ifstream.close()

    return motifs
def process_motif_for_logodds(motif: Motif, debug: bool) -> Motif:
    """Fill a motif with its scaled log-odds matrix and P-value matrix.

    The log-odds values are computed from the motif count matrix and its
    background, then scaled; the scaling parameters are stored on the
    motif. Finally the P-value matrix is computed from the updated motif
    and stored on it as well.

    ...

    Parameters
    ----------
    motif : Motif
        DNA motif; it is modified in place
    debug : bool
        trace the full error stack

    Returns
    -------
    Motif
        the input motif, updated with score and P-value matrices
    """

    if not isinstance(motif, Motif):
        exception_handler(
            TypeError,
            "Expected Motif, got {}.\n".format(type(motif).__name__),
            debug
        )

    # log-odds matrix from counts and background
    log_odds = compute_log_odds(
        motif.countMatrix,
        motif.width,
        motif.bg,
        motif.alphabet,
        motif.nucsmap,
        debug
    )
    motif.set_motifScoreMatrix(log_odds)

    # scale the log-odds matrix and keep track of the scaling parameters
    scaling = scale_pwm(
        motif.scoreMatrix,
        motif.alphabet,
        motif.width,
        motif.nucsmap,
        debug
    )
    scaled, lowest, highest, factor, shift = scaling
    motif.set_motifScoreMatrix(scaled)
    motif.set_isScaled()
    motif.set_scale(factor)
    motif.set_minVal(lowest)
    motif.set_maxVal(highest)
    motif.set_offset(shift)

    # P-value matrix, computed on the scaled motif
    motif.set_motifPvalMatrix(comp_pval_mat(motif, debug))

    return motif
def read_JASPAR_motif(motif_file: str,
                      bg_file: str,
                      pseudocount: float,
                      no_reverse: bool,
                      verbose: bool,
                      debug: bool
) -> Motif:
    """Read a motif PWM in JASPAR format.

    The first line is the header: its first character is dropped and the
    first two tab-separated fields are used as motif ID and motif name.
    Every following non-empty line holds the counts of one nucleotide;
    reading stops at the first empty line or at EOF. The counts are then
    turned into a probability matrix (normalized, pseudocount applied) and
    stored in a Motif object together with the background distribution.

    ...

    Parameters
    ----------
    motif_file : str
        path to the motif PWM in JASPAR format
    bg_file : str
        path to the background file in Markov Background Format
        (http://meme-suite.org/doc/bfile-format.html), or the `UNIF`
        constant to use a uniform background.
    pseudocount : float
        value to add to motif PWM counts
    no_reverse : bool
        forwarded to `pseudo_bg`; presumably True means that only the
        forward strand is considered -- confirm against `pseudo_bg`
    verbose : bool
        print additional information
    debug : bool
        trace the full error stack (forwarded to `exception_handler`)

    Returns
    -------
    Motif
        Motif object
    """

    nucs: List[str] = list()
    counts: List[List[float]] = list()
    if verbose:
        start_rm: float = time.time()
    # bound before the try so that `finally` never hits an unbound name
    # when open() itself fails
    ifstream = None
    try:
        ifstream = open(motif_file, mode="r")
        readlines = 0  # check for empty files
        # begin parsing
        header: str = str(ifstream.readline().strip()[1:])
        if not header:  # empty file?
            errmsg = "{} seems to be empty.\n"
            exception_handler(IOError, errmsg.format(motif_file), debug)
        motifID, motifName = header.split('\t')[0:2]
        readlines += 1
        while True:
            line = ifstream.readline().strip()
            if not line:
                break  # EOF or empty file?
            # first character is the nucleotide, the rest are its counts
            # enclosed by a leading and a trailing token (e.g. brackets)
            nuc = line.strip()[:1]
            count = list(map(float, line.strip()[1:].split()[1:][:-1]))
            nucs.append(nuc.upper())
            counts.append(count)
            readlines += 1
        if readlines <= 1:  # only header read ?
            errmsg = "{} seems to be empty.\n"
            exception_handler(IOError, errmsg.format(motif_file), debug)
    # NOTE(review): kept as a bare except on purpose -- whether narrowing
    # it is safe depends on how exception_handler reports errors
    except:
        errmsg = "An error occurred while reading {}.\n"
        exception_handler(MotifFileReadError, errmsg.format(motif_file), debug)
    else:
        if any(len(c) != len(counts[0]) for c in counts):
            errmsg = "Motif counts width mismatch.\n"
            exception_handler(ValueError, errmsg, debug)
        # nucleotide -> row index, used with np object
        nucsmap = {nuc: i for i, nuc in enumerate(nucs)}
        motif_counts: pd.DataFrame = pd.DataFrame(
            data=counts, index=nucs)  # motif count matrix
        motif_width: int = int(len(counts[0]))
        alphabet: list = sorted(nucs)

        # compute background
        if bg_file == UNIF:
            bgs = get_uniformBG(alphabet, debug)
        elif os.path.isfile(bg_file):
            bgs = readBGfile(bg_file, debug)
        else:
            errmsg = "Unable to parse {}.\n"
            exception_handler(BGFileError, errmsg.format(bg_file), debug)
        bgs = pseudo_bg(bgs, no_reverse, debug)  # add pseudocount to bg

        # motif probability matrix
        motif_probs = (motif_counts / motif_counts.sum(0))
        motif_probs = norm_motif(motif_probs, motif_width, alphabet, debug)
        motif_probs = apply_pseudocount_jaspar(motif_counts.to_numpy(),
                                               motif_probs.to_numpy(),
                                               pseudocount, bgs, motif_width,
                                               alphabet, nucsmap, debug)
        motif: Motif = Motif(motif_probs, motif_width, alphabet, motifID,
                             motifName, nucsmap)
        motif.setBg(bgs)

        if verbose:
            end_rm: float = time.time()
            msg: str = "Read motif %s in %.2fs" % (motifID, (end_rm - start_rm))
            print(msg)
    finally:
        # close the file only if it was actually opened
        if ifstream is not None:
            ifstream.close()

    return motif
def read_JASPAR_motif(motif_file: str,
                      bg_file: str,
                      pseudocount: float,
                      no_reverse: bool,
                      verbose: bool
) -> Motif:
    """Read a motif PWM in JASPAR format.

    The first line is the header: its first character is dropped and the
    first two tab-separated fields are used as motif ID and motif name.
    Every following non-blank line holds the counts of one nucleotide.
    The counts are then turned into a probability matrix (normalized,
    pseudocount applied) and stored in a Motif object together with the
    background distribution.

    Parameters
    ----------
    motif_file : str
        path to the motif PWM in JASPAR format
    bg_file : str
        path to the background file in Markov Background Format
        (http://meme-suite.org/doc/bfile-format.html), or 'UNIF' to use a
        uniform background.
    pseudocount : float
        value to add to motif PWM counts
    no_reverse : bool
        forwarded to `pseudo_bg`; presumably True means that only the
        forward strand is considered -- confirm against `pseudo_bg`
    verbose : bool
        print additional information

    Returns
    -------
    Motif
        Motif object storing the data contained in motif_file

    Raises
    ------
    FileReadingException
        if the motif file cannot be opened or parsed
    NotValidBGException
        if `bg_file` is neither 'UNIF' nor an existing path
    """

    # lists where store nucleotides and raw counts
    nucs: List[str] = list()
    counts: List[List[float]] = list()

    if verbose:
        start_rm: float = time.time()

    try:
        # the context manager closes the file in every case, also when
        # open() succeeds but parsing fails
        with open(motif_file) as in_mtf:
            # read the header (the first character is dropped)
            header: str = str(in_mtf.readline()[1:])

            # get the jaspar ID and the common TF name
            motifID, motifName = header.split('\t')[0:2]
            # remove the line terminator only when it is really there (the
            # name is not necessarily the last field of the header)
            motifName = motifName.rstrip('\r\n')

            for line in in_mtf:
                line = line.strip()
                if not line:
                    continue  # blank lines carry no counts

                # first character is the nucleotide, the rest are its
                # counts enclosed by a leading and a trailing token
                nuc = line[:1]
                count = list(map(float, line[1:].split()[1:][:-1]))

                nucs.append(nuc)
                counts.append(count)

    except Exception as exc:
        errmsg: str = ' '.join(["\n\nERROR: unable to read file", motif_file])
        raise FileReadingException(errmsg) from exc

    motif_counts = pd.DataFrame(data=counts, index=nucs)
    # the check of equal length for all raw counts is made building
    # the DataFrame
    motif_width: int = int(len(counts[0]))
    alphabet: list = sorted(nucs)  # alphabet as list

    bgs: Dict
    if bg_file == 'UNIF':
        bgs = get_uniformBG(alphabet)
    elif os.path.exists(bg_file):
        bgs = readBGfile(bg_file)
    else:
        errmsg = "\n\nERROR: unable to find the given background file"
        raise NotValidBGException(errmsg)

    bgs = pseudo_bg(bgs, no_reverse)

    motif_probs: pd.DataFrame
    motif_probs = (motif_counts / motif_counts.sum(0))
    motif_probs = norm_motif(motif_probs, motif_width, alphabet)
    motif_probs = apply_pseudocount_jaspar(motif_counts, motif_probs,
                                           pseudocount, bgs, motif_width,
                                           alphabet)

    motif: Motif = Motif(motif_probs, motif_width, alphabet, motifID,
                         motifName)
    motif.setBg(bgs)

    if verbose:
        end_rm: float = time.time()
        msg: str = ''.join(
            ["Read motif ", motifID, " in ", str(end_rm - start_rm), "s"])
        print(msg)

    return motif
def process_motif_for_logodds(motif: Motif) -> Motif:
    """Fill a motif with its scaled log-odds matrix and P-value matrix.

    The log-odds matrix is computed from the motif matrix and its
    background, then scaled; the scaling parameters are stored on the
    motif. The P-value matrix is computed afterwards (the original
    docstring refers to the dynamic programming algorithm presented in
    Staden, 1994). As last step the score matrix is replaced by the
    `.values` of the scaled scores.

    Parameters
    ----------
    motif : Motif
        DNA motif; it is modified in place

    Returns
    -------
    Motif
        Input DNA motif with the log-odds matrix
    """

    # log-odds matrix from the motif matrix and the background
    log_odds: pd.DataFrame = compute_log_odds(
        motif.getMotif_matrix(),
        motif.getWidth(),
        motif.getBg(),
        motif.getAlphabet()
    )
    motif.setMotif_scoreMatrix(log_odds)

    # scale the log-odds scores and remember how they were scaled
    scaling_result = scale_pwm(
        motif.getMotif_scoreMatrix(),
        motif.getAlphabet(),
        motif.getWidth()
    )
    scaled, lowest, highest, factor, shift = scaling_result
    motif.setMotif_scoreMatrix(scaled)
    motif.setIsScaled(True)
    motif.setScale(factor)
    motif.setMin_val(lowest)
    motif.setMax_val(highest)
    motif.setOffset(shift)

    # P-value matrix, computed on the scaled motif
    motif.setMotif_pval_matrix(comp_pval_mat(motif))

    # from now on the score matrix is stored through its `.values`
    motif.setMotif_scoreMatrix(scaled.values)

    return motif
def read_MEME_motif(motif_file: str,
                    bg_file: str,
                    pseudocount: float,
                    no_reverse: bool,
                    verbose: bool
) -> List[Motif]:
    """Read motif PWMs in MEME format.

    Since a MEME file can contain one or more motifs, a Motif object is
    built for each PWM found and the resulting objects are returned in a
    list. The parser works line by line:

    * a line starting with 'ALPHABET' defines the alphabet, which must
      match `DNA_ALPHABET`;
    * a line starting with 'MOTIF' opens a new motif (ID and name);
    * the first non-blank line after it is read as the matrix description
      (alphabet length, width and number of sites);
    * the following `width` lines are read as A, C, G, T probabilities.

    Parameters
    ----------
    motif_file : str
        path to the motif PWM
    bg_file : str
        path to the background probability distribution, or 'UNIF' to
        use a uniform background
    pseudocount : float
        pseudocount to add to the PWM values
    no_reverse : bool
        forwarded to `pseudo_bg`; presumably True means that only the
        forward strand is considered -- confirm against `pseudo_bg`
    verbose : bool
        print additional information

    Returns
    -------
    List[Motif]
        List of Motif objects storing the data contained in motif_file

    Raises
    ------
    FileReadingException
        if the motif file cannot be opened or parsed
    NotValidBGException
        if `bg_file` is neither 'UNIF' nor an existing path
    """

    # per-motif data collected while parsing
    motifID_lst: List[str] = list()
    motifName_lst: List[str] = list()
    motif_width_lst: List[int] = list()
    site_counts_lst: List[int] = list()
    motif_probs_lst: List[pd.DataFrame] = list()
    # A, C, G and T probabilities found for each motif
    a_lst: List[List[np.double]] = list()
    c_lst: List[List[np.double]] = list()
    g_lst: List[List[np.double]] = list()
    t_lst: List[List[np.double]] = list()

    try:
        # the context manager closes the file in every case
        with open(motif_file, 'r') as in_mtf:
            infostart: bool = False  # the next non-blank line describes the matrix
            datastart: bool = False  # the next lines are matrix rows
            motifs_found: int = 0  # number of motifs found in the MEME file
            motif_width = None  # width of the motif currently being read
            pos_read: int = 0  # matrix rows read for the current motif

            for line in in_mtf:
                if line[0:8] == 'ALPHABET':
                    alphabet: List = sorted(set(line[10:-1]))
                    assert isListEqual(alphabet, DNA_ALPHABET)

                if line[0:5] == 'MOTIF':
                    if verbose:
                        start_rm: float = time.time()

                    # read motif ID and full name
                    motif_header: List[str] = line.split()
                    assert len(motif_header) > 0

                    # there are two ways to define the motif name line
                    # in MEME file
                    # (refer to http://meme-suite.org/doc/meme-format.html?man_type=web):
                    #   1 - MOTIF motif_alternate_name
                    #   2 - MOTIF motif_identifier motif_alternate_name
                    motifID: str
                    motifName: str

                    if len(motif_header) == 2:  # support case (1)
                        motifID = motif_header[1]
                        motifName = motif_header[1]
                    else:  # support case (2)
                        motifID, motifName = motif_header[1:3]

                    motifID_lst.append(motifID)
                    motifName_lst.append(motifName)

                    # the informations about motif start here
                    infostart = True
                    continue

                if infostart and len(line.strip()) != 0:
                    # skip the fixed-width prefix of the description line
                    # and read the key/value pairs following it
                    infos: str = line[26:]
                    infosplit: List[str] = infos.split()
                    alphalen: int = int(infosplit[1])
                    assert alphalen == len(alphabet)

                    motif_width = int(infosplit[3])
                    site_counts: int = int(infosplit[5])
                    infostart = False  # informations end here

                    # allocate space for the motif probability matrix
                    motif_probs: pd.DataFrame = pd.DataFrame(
                        index=alphabet,
                        columns=range(motif_width),
                        data=np.double(0))

                    motif_width_lst.append(motif_width)
                    site_counts_lst.append(site_counts)
                    motif_probs_lst.append(motif_probs)

                    datastart = True  # at next step begin data

                    # initialize nucleotide data
                    a = list()
                    c = list()
                    g = list()
                    t = list()
                    continue

                if datastart and pos_read < motif_width:
                    freqs = line.split()
                    a.append(np.double(freqs[0]))
                    c.append(np.double(freqs[1]))
                    g.append(np.double(freqs[2]))
                    t.append(np.double(freqs[3]))
                    pos_read += 1

                # we read all current motif data
                if pos_read == motif_width:
                    a_lst.append(a)
                    c_lst.append(c)
                    g_lst.append(g)
                    t_lst.append(t)

                    # update stats about found motifs
                    motifs_found += 1

                    # clear the statistics
                    pos_read = 0
                    motif_width = None
                    datastart = False

                    if verbose:
                        end_rm: float = time.time()
                        msg: str = ''.join([
                            "Read motif ", motifID, " in ",
                            str(end_rm - start_rm), "s"
                        ])
                        print(msg)

    except Exception as exc:  # something went wrong
        errmsg: str = ' '.join(["Unable to read file", motif_file])
        raise FileReadingException(errmsg) from exc

    # read the background
    bgs: dict
    if bg_file == 'UNIF':
        bgs = get_uniformBG(alphabet)
    elif os.path.exists(bg_file):
        bgs = readBGfile(bg_file)
    else:
        errmsg = "\n\nERROR: unable to find the given background file"
        raise NotValidBGException(errmsg)

    bgs = pseudo_bg(bgs, no_reverse)

    motif_lst: List[Motif] = list()
    for i in range(motifs_found):
        # fill the pre-allocated matrix with the probabilities read
        mp: pd.DataFrame = motif_probs_lst[i]
        mp.loc['A'] = a_lst[i]
        mp.loc['C'] = c_lst[i]
        mp.loc['G'] = g_lst[i]
        mp.loc['T'] = t_lst[i]

        mw: int = motif_width_lst[i]
        sc: int = site_counts_lst[i]

        mp = norm_motif(mp, mw, alphabet)
        mp = apply_pseudocount_meme(mp, pseudocount, sc, mw, bgs, alphabet)

        motif: Motif = Motif(mp, mw, alphabet, motifID_lst[i],
                             motifName_lst[i])
        motif.setBg(bgs)

        motif_lst.append(motif)

    return motif_lst
def scan_graph(motif: Motif, args_obj: Findmotif) -> str:
    """Obtain all the sequences of length K from the given genome variation
    graph, where K is the motif length.

    The sequences are obtained from the regions defined in the input BED
    file (UCSC BED file format). For each region a `vg find` query is
    built, which writes the extracted sequences to a TSV file named after
    the region inside a freshly created temporary directory; the queries
    are then handed to `get_kmers` together with a worker pool.

    According to the original documentation, the sequences extracted
    correspond to all possible recombinant ones which can be obtained
    using the genomic variants given in the VCF file used to build the
    queried VG, and the user can decide to keep them all using the
    --recomb option.

    The graph can be given either as a single XG file or as a directory
    containing one `<chrom>.xg` per chromosome; in both cases a GBWT file
    with the same prefix as the XG is required. When both are available
    on `args_obj`, the single XG file is used.

    Parameters
    ----------
    motif : Motif
        DNA motif PWM to search on the VG
    args_obj : Findmotif
        container of the arguments used during genome variation graph
        scanning

    Returns
    -------
    str
        location of the files with the sequences extracted

    Raises
    ------
    ValueError
        if `motif` or `args_obj` have an unexpected type
    VGException
        if the graph is missing or is not in XG format
    FileNotFoundError
        if an XG file or its GBWT cannot be found

    Notes
    -----
    The working directory is temporarily changed to the temporary
    directory and is restored before returning, also when an error
    occurs. The module-level `verbose` flag is overwritten.
    """

    errmsg: str
    vg: str
    chroms: List[str]

    # check the input arguments
    if not isinstance(motif, Motif):
        errmsg = "\n\nERROR: unknown motif object type"
        raise ValueError(errmsg)

    if not isinstance(args_obj, Findmotif):
        errmsg = "Unknown arguments object type. "
        errmsg += "Cannot scan the genome variation graph. Exiting"
        raise ValueError(errmsg)

    # decide once how `vg` has to be interpreted, so that the value read
    # here and the way it is used below cannot disagree
    single_graph: bool
    if args_obj.has_graph_genome():
        vg = args_obj.get_graph_genome()
        if not isGraph_genome_xg(vg):
            errmsg = "\n\nERROR: the genome variation graph is not in XG format"
            raise VGException(errmsg)
        single_graph = True
    elif args_obj.has_graph_genome_dir():
        vg = args_obj.get_graph_genome_dir()
        single_graph = False
    else:
        raise VGException("\n\nERROR: the genome variation graph is missing")

    bedfile: str = args_obj.get_bedfile()
    motif_width: int = motif.getWidth()
    cores: int = args_obj.get_cores()

    global verbose
    verbose = args_obj.get_verbose()

    print("\nExtracting regions defined in", bedfile, "\n")

    # read the regions where search the motif occurrences from the given
    # BED file
    regions: Dict
    region_num: int
    regions, region_num = getBEDregions(bedfile)

    if (args_obj.get_chroms_num() == 1 and
            args_obj.get_chroms()[0] == 'ALL_CHROMS'):
        chroms = list(regions.keys())
    else:
        chroms = [''.join(['chr', c]) for c in args_obj.get_chroms()]

    if verbose:
        print("\nFound", region_num, "regions in", bedfile)

    # create a tmp working directory
    tmpwd: str = tempfile.mkdtemp(prefix='grafimo_')

    # remember where we are, to come back once done
    cwd: str = os.getcwd()

    # enter the tmp dir where store the extracted sequences
    # NOTE(review): from here on relative graph paths are resolved against
    # the tmp dir -- confirm that callers always provide absolute paths
    os.chdir(tmpwd)

    try:
        if not single_graph and vg[-1] != "/":
            # vg -> directory containing a set of VGs
            vg = ''.join([vg, "/"])

        # list of queries
        queries: List[str] = list()

        positions: List[Tuple[int, int]]
        for chrom in chroms:
            positions = regions[chrom]
            for pos in positions:
                start: int = pos[0]
                stop: int = pos[1]

                region_index: str = ''.join(
                    [chrom.split('chr')[-1], ':', str(start), '-', str(stop)])
                region_name: str = ''.join(
                    [chrom, '_', str(start), '-', str(stop)])
                seqs: str = os.path.join('.', ''.join([region_name, '.tsv']))

                # the GBWT must have the same prefix as XG
                if single_graph:
                    xg: str = vg
                    if xg.endswith(".xg"):
                        # strip only the extension, even when ".xg" also
                        # occurs somewhere else in the path
                        xg_prefix: str = xg[:-len(".xg")]
                    else:
                        xg_prefix = xg.split(".xg")[-2]
                    gbwt: str = ''.join([xg_prefix, '.gbwt'])
                else:
                    xg = ''.join([vg, chrom, '.xg'])
                    gbwt = ''.join([vg, chrom, '.gbwt'])

                if not os.path.exists(xg):
                    errmsg = ''.join(
                        ["\n\nERROR: unable to use ", xg, ". Exiting"])
                    raise FileNotFoundError(errmsg)

                if not os.path.isfile(gbwt):
                    errmsg = "ERROR: unable to find GBWT file for"
                    errmsg = ' '.join([errmsg, xg])
                    raise FileNotFoundError(errmsg)

                # NOTE(review): the query is a plain command string using
                # '>' redirection, so it is presumably run through a shell
                # by get_kmers; region names and paths come from user
                # input and are not quoted -- confirm and consider quoting
                query: str = 'vg find -p {0} -x {1} -H {2} -K {3} -E > {4}'.format(
                    region_index, xg, gbwt, motif_width, seqs)
                queries.append(query)

        # the pool is created only once all the queries have been
        # validated, so that a failure above does not leave workers behind

        # ignore SIGINT while the workers are spawned, so that they
        # inherit the ignoring handler
        original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        pool: mp.Pool = mp.Pool(processes=cores)  # use no. cores processes
        # restore the original SIGINT handler in the parent to exit
        # gracefully
        # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
        signal.signal(signal.SIGINT, original_sigint_handler)

        # extract from the graph the binding site candidates to score
        get_kmers(queries, pool, verbose)

        # the extracted sequences are stored in the current (tmp) directory
        sequence_loc: str = os.getcwd()
    finally:
        # go back to the original working directory in every case
        os.chdir(cwd)

    return sequence_loc