def get_motif_list(self, pseudocounts=1.0, fpr=0.0001):
    """
    Builds a Motif for every annotated motif that has a PWM file on disk.

    Each directory of self.motif_data.pwm_list is combined with each name of
    self.motifs_map to form "<directory>/<motif_name>.pwm". When that file
    exists, the threshold stored in the annotation under the key fpr is used
    if it is present and truthy; otherwise the threshold is computed from the
    PWM file and written back into the annotation.

    Keyword arguments:
    pseudocounts -- Pseudocounts handed to tools.log_odds and to Motif.
    fpr -- Key into ma.thresholds; also passed to tools.threshold_from_p.

    Return:
    List of Motif objects, ordered by directory first, then by annotation.
    """
    found = []

    for pwm_dir in self.motif_data.pwm_list:
        for name, annotation in self.motifs_map.items():
            pwm_path = os.path.join(pwm_dir, name + ".pwm")

            # annotations without a PWM file in this directory contribute nothing
            if not os.path.isfile(pwm_path):
                continue

            if fpr in annotation.thresholds and annotation.thresholds[fpr]:
                # the annotation already carries a usable threshold
                cutoff = annotation.thresholds[fpr]
            else:
                # compute the threshold from the PWM and remember it
                matrix = parsers.pfm(str(pwm_path))
                background = tools.flat_bg(len(matrix))
                scores = tools.log_odds(matrix, background, pseudocounts, 2)
                cutoff = tools.threshold_from_p(scores, background, fpr)
                annotation.thresholds[fpr] = cutoff

            found.append(Motif(pwm_path, pseudocounts, cutoff))

    return found
def get_motif_list(self, pseudocounts=1.0, fpr=0.0001):
    """
    Returns the Motif objects of all annotations that have a PWM file.

    Candidate files are "<directory>/<motif_name>.pwm" for every directory in
    self.motif_data.pwm_list and every entry of self.motifs_map. Candidates
    that are not existing files are ignored.

    Keyword arguments:
    pseudocounts -- Pseudocounts given to tools.log_odds and to Motif.
    fpr -- Key used to look up the threshold in the annotation; if no truthy
           value is stored, the threshold is computed with
           tools.threshold_from_p and stored under this key.

    Return:
    motif_list -- List of Motif objects.
    """
    # every (PWM path, annotation) pair, directories in the outer position
    candidates = (
        (os.path.join(motif_dir_path, motif_name + ".pwm"), ma)
        for motif_dir_path in self.motif_data.pwm_list
        for motif_name, ma in self.motifs_map.items()
    )

    motif_list = []
    for motif_file_name, ma in candidates:
        if os.path.isfile(motif_file_name):
            must_compute = fpr not in ma.thresholds or not ma.thresholds[fpr]
            if must_compute:
                # no usable stored threshold: derive it from the PWM and store it
                pfm = parsers.pfm(str(motif_file_name))
                bg = tools.flat_bg(len(pfm))
                pssm = tools.log_odds(pfm, bg, pseudocounts, 2)
                threshold = tools.threshold_from_p(pssm, bg, fpr)
                ma.thresholds[fpr] = threshold
            else:
                threshold = ma.thresholds[fpr]

            motif_list.append(Motif(motif_file_name, pseudocounts, threshold))

    return motif_list
def __init__(self, input_file_name, pseudocounts, fpr, thresholds):
    """
    Initializes Motif.

    Keyword arguments:
    input_file_name -- Path of the PWM file. The motif name is the file's base
                       name without its last extension and the repository is
                       the name of the directory that contains the file.
    pseudocounts -- Pseudocounts passed to tools.log_odds. Stored thresholds
                    are only used when this is exactly 1.0.
    fpr -- Key of the stored threshold; also passed to tools.threshold_from_p.
    thresholds -- Object whose "dict" attribute is indexed as
                  dict[repository][name][fpr].

    Fields:
    name -- Motif name derived from the file name.
    pfm -- Position Frequency Matrix.
    bg -- Background frequencies.
    pssm -- Position Specific Scoring Matrix.
    pssm_rc -- Reverse complement of the PSSM.
    alphabet -- A list of letters, eg ["Aa", "Cc", "Gg", "Tt"]
    threshold -- Motif matching threshold.
    threshold_rc -- Threshold computed for the reverse complement PSSM.
    len -- Number of columns of the PFM.
    max -- Largest single value found in the PSSM.
    consensus -- First letter of the best scoring row for every column.
    consensus_rc -- Same as consensus, for the reverse complement PSSM.
    is_palindrome -- True if consensus equals consensus_rc.
    """
    # local import so the block does not depend on a file-level "dirname"
    from os.path import dirname

    # Initializing name
    self.name = ".".join(basename(input_file_name).split(".")[:-1])

    # Name of the directory holding the file. For a path without a directory
    # part this is "" (the stored lookup then simply misses) instead of the
    # IndexError raised by input_file_name.split("/")[-2].
    repository = basename(dirname(input_file_name))

    # Creating PFM & PSSM
    self.pfm = parsers.pfm(str(input_file_name))
    self.bg = tools.flat_bg(len(self.pfm))
    self.pssm = tools.log_odds(self.pfm, self.bg, pseudocounts, 2)
    self.pssm_rc = tools.reverse_complement(self.pssm)

    # how many bases this motif has
    self.len = len(self.pfm[0])

    # maximum value found in the whole PSSM
    self.max = max(max(row) for row in self.pssm)

    # we only support pure DNA or methylated DNA, for now.
    self.alphabet = ["Aa", "Cc", "Gg", "Tt"]
    if len(self.pfm) == 6:
        self.alphabet += ["m", "1"]

    # Evaluating threshold: stored values are only trusted for pseudocounts
    # of 1.0; any failed lookup falls back to recomputing.
    stored = None
    if pseudocounts == 1.0:
        try:
            stored = thresholds.dict[repository][self.name][fpr]
        except (AttributeError, LookupError, TypeError):
            stored = None

    if stored is None:
        # NOTE: passing an extra precision argument to threshold_from_p
        # (e.g. 2000.0) requires a modified version of MOODS.
        self.threshold = tools.threshold_from_p(self.pssm, self.bg, fpr)
        print(">>> recomputing threshold for %s: %f" % (self.name, self.threshold))
    else:
        self.threshold = stored

    self.threshold_rc = tools.threshold_from_p(self.pssm_rc, self.bg, fpr)

    self.consensus = "".join(
        [self.alphabet[i][0] for i in argmax(self.pssm, axis=0)])
    self.consensus_rc = "".join(
        [self.alphabet[i][0] for i in argmax(self.pssm_rc, axis=0)])

    # Evaluating if motif is palindromic
    self.is_palindrome = self.consensus == self.consensus_rc
def __init__(self, input_file_name, pseudocounts, threshold):
    """
    Initializes Motif from a PWM file and a precomputed threshold.

    Keyword arguments:
    input_file_name -- Path of the PWM file.
    pseudocounts -- Pseudocounts passed to tools.log_odds.
    threshold -- Motif matching threshold, stored unchanged.

    Fields:
    name -- File base name without its last extension.
    pfm -- Position Frequency Matrix.
    bg -- Background frequencies.
    pssm -- Position Specific Scoring Matrix.
    pssm_rc -- Reverse complement of the PSSM.
    alphabet -- A list of letters, eg ["Aa", "Cc", "Gg", "Tt"]
    threshold -- Motif matching threshold.
    len -- Number of columns of the PFM.
    max -- Largest single value found in the PSSM.
    consensus / consensus_rc -- Consensus strings of pssm / pssm_rc.
    is_palindrome -- True if consensus equals consensus_rc.
    """

    def consensus_of(matrix):
        # first letter of the alphabet entry whose row scores best per column
        return "".join([self.alphabet[row][0] for row in argmax(matrix, axis=0)])

    # everything before the last "." of the file's base name
    self.name = basename(input_file_name).rpartition(".")[0]

    # PFM, flat background and the two scoring matrices
    self.pfm = parsers.pfm(str(input_file_name))
    n_rows = len(self.pfm)
    self.bg = tools.flat_bg(n_rows)
    self.pssm = tools.log_odds(self.pfm, self.bg, pseudocounts, 2)
    self.pssm_rc = tools.reverse_complement(self.pssm)

    # motif length and largest PSSM entry
    self.len = len(self.pfm[0])
    self.max = max(max(row) for row in self.pssm)

    # plain DNA has four rows; a six-row matrix adds the two methylation symbols
    dna_letters = ["Aa", "Cc", "Gg", "Tt"]
    self.alphabet = dna_letters + ["m", "1"] if n_rows == 6 else dna_letters

    self.threshold = threshold

    self.consensus = consensus_of(self.pssm)
    self.consensus_rc = consensus_of(self.pssm_rc)

    # palindromic when both strands give the same consensus
    self.is_palindrome = self.consensus == self.consensus_rc
def __init__(self, input_file_name, pseudocounts, threshold):
    """
    Initializes Motif.

    Keyword arguments:
    input_file_name -- Path of the PWM file to load.
    pseudocounts -- Pseudocounts passed to tools.log_odds.
    threshold -- Motif matching threshold; stored as given.

    Fields:
    name -- Base name of the file with its last extension removed.
    pfm -- Position Frequency Matrix.
    bg -- Background frequencies.
    pssm -- Position Specific Scoring Matrix.
    pssm_rc -- Reverse complement of the PSSM.
    alphabet -- A list of letters, eg ["Aa", "Cc", "Gg", "Tt"]
    threshold -- Motif matching threshold.
    len -- Number of columns of the PFM.
    max -- Largest single value found in the PSSM.
    consensus -- Consensus string of the PSSM.
    consensus_rc -- Consensus string of the reverse complement PSSM.
    is_palindrome -- True if consensus equals consensus_rc.
    """
    # Motif name: drop the final dot-separated piece of the base name
    name_parts = basename(input_file_name).split(".")
    self.name = ".".join(name_parts[:-1])

    # Matrices
    pfm_path = str(input_file_name)
    self.pfm = parsers.pfm(pfm_path)
    self.bg = tools.flat_bg(len(self.pfm))
    self.pssm = tools.log_odds(self.pfm, self.bg, pseudocounts, 2)
    self.pssm_rc = tools.reverse_complement(self.pssm)

    # Number of bases in the motif
    first_row = self.pfm[0]
    self.len = len(first_row)

    # Largest value over all rows of the PSSM
    row_maxima = [max(row) for row in self.pssm]
    self.max = max(row_maxima)

    # Only pure DNA or methylated DNA (six rows) is supported
    self.alphabet = ["Aa", "Cc", "Gg", "Tt"]
    if len(self.pfm) == 6:
        self.alphabet += ["m", "1"]

    self.threshold = threshold

    # Consensus: for every column take the letter of the best scoring row
    forward_letters = []
    for best_row in argmax(self.pssm, axis=0):
        forward_letters.append(self.alphabet[best_row][0])
    self.consensus = "".join(forward_letters)

    reverse_letters = []
    for best_row in argmax(self.pssm_rc, axis=0):
        reverse_letters.append(self.alphabet[best_row][0])
    self.consensus_rc = "".join(reverse_letters)

    # The motif is palindromic when both consensus strings agree
    self.is_palindrome = self.consensus == self.consensus_rc
# Annotation fields of this PWM: everything after the second "." of the name
# is the version; the remaining fields come from the hocomoco_anno entry.
version = ".".join(pwm_name.split(".")[2:])
annotation = hocomoco_anno[pwm_name]
gene_names = annotation[0]
group = annotation[1] or "."  # an empty group is written as "."
uniprot = annotation[2]
data_source = annotation[3]
taxGroup = "vertebrates"

# The species tag sits between the first "_" and the following "." of the name
species = (pwm_name.split("_")[1]).split(".")[0]
if species == "HUMAN":
    # fixed: the literal was mangled to "H**o sapiens"
    species = "Homo sapiens"
elif species == "MOUSE":
    species = "Mus musculus"

# Creating PSSM
pfm = parsers.pfm(inputFileName)
bg = tools.flat_bg(len(pfm))
pssm = tools.log_odds(pfm, bg, pseudocounts, 2)

# Evaluating thresholds: one value per entry of fprList, joined with commas.
# Note: passing an extra precision argument to threshold_from_p (e.g. 10000.0)
# requires a modified version of MOODS.
threshold_list = []
for fpr in fprList:
    threshold_list.append(str(tools.threshold_from_p(pssm, bg, fpr)))
threshold = ",".join(threshold_list)

resultMatrix.append([matrix_id, pwm_name, version, gene_names, group,
                     uniprot, data_source, taxGroup, species, threshold])
version = ".".join(pwm_name.split(".")[2:]) gene_names = hocomoco_anno[pwm_name][0] group = hocomoco_anno[pwm_name][1] if not group: group = "." uniprot = hocomoco_anno[pwm_name][2] data_source = hocomoco_anno[pwm_name][3] taxGroup = "vertebrates" species = (pwm_name.split("_")[1]).split(".")[0] if species == "HUMAN": species = "H**o sapiens" elif species == "MOUSE": species = "Mus musculus" # Creating PSSM pfm = parsers.pfm(inputFileName) bg = tools.flat_bg( len(pfm)) # total number of "points" to add, not per-row pssm = tools.log_odds(pfm, bg, pseudocounts, 2) threshold_list = [] # Evaluating thresholds for fpr in fprList: # Note: this requires a modified version of MOODS. Only use it if you know what you are doing # resVec.append(str(tools.threshold_from_p(pssm, bg, fpr, 10000.0))) threshold = tools.threshold_from_p(pssm, bg, fpr) threshold_list.append(str(threshold)) threshold = ",".join(threshold_list) resultMatrix.append([ matrix_id, pwm_name, version, gene_names, group, uniprot,
# Command-line arguments: folder prefix of the PWM files, output table path
inFolder = sys.argv[1]
outFileName = sys.argv[2]

# Parameters
fprList = [0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001]
pseudocounts = 1.0

# The context manager closes the output file even when a PWM fails to parse,
# which the previous manual open()/close() pair did not guarantee.
with open(outFileName, "w") as outFile:
    # Header: motif name followed by one column per fpr value
    outFile.write("\t".join(["MOTIF"] + [str(e) for e in fprList]) + "\n")

    # Iterating on all PWMs.
    # NOTE: inFolder is concatenated as-is with "*.pwm", so it has to end with
    # a path separator for the pattern to match files inside the folder.
    for pwmFileName in sorted(glob(inFolder + "*.pwm")):

        # Creating PSSM
        name = ".".join(basename(pwmFileName).split(".")[:-1])
        pfm = parsers.pfm(pwmFileName)
        bg = tools.flat_bg(len(pfm))
        pssm = tools.log_odds(pfm, bg, pseudocounts, 2)

        # Evaluating thresholds.
        # Note: passing an extra precision argument to threshold_from_p
        # (e.g. 10000.0) requires a modified version of MOODS.
        resVec = [name]
        for fpr in fprList:
            resVec.append(str(tools.threshold_from_p(pssm, bg, fpr)))

        # Writing results
        outFile.write("\t".join(resVec) + "\n")