def main():
    """Print a PFM file as a tab-separated, pseudocount-normalized matrix.

    Command-line options:
        -i / --input: path of the PFM file to read (required).
        -n / --name: motif name written into the header line (required).

    Output on stdout is a ``>consensus<TAB>name`` header followed by one
    line per motif position holding the four normalized values.
    """
    cli = argparse.ArgumentParser(description='')
    cli.add_argument("-i", "--input", dest="infile", required=True,
                     help="input file")
    cli.add_argument("-n", "--name", dest="name", required=True,
                     help="name of motif")
    options = cli.parse_args()
    motif_name = options.name

    with open(options.infile) as pfm_handle:
        motif = motifs.read(pfm_handle, "pfm")

    pseudo = {"A": 0.6, "C": 0.4, "G": 0.4, "T": 0.6}
    pwm = motif.counts.normalize(pseudocounts=pseudo)

    print(">{0}\t{1}".format(str(pwm.consensus), motif_name))
    # pwm[0]..pwm[3] are the four rows; walk them position by position.
    for column in zip(pwm[0], pwm[1], pwm[2], pwm[3]):
        print("{0:f}\t{1:f}\t{2:f}\t{3:f}".format(*column))
def process_data(data, data_type='counts', seq_type='dna'):
    """Turn motif input into a formatted frequency matrix and information content.

    Args:
        data: Raw counts when ``data_type == 'counts'``; otherwise a path to
            a file in the format named by ``data_type``.
        data_type: One of 'counts', 'fasta', 'stockholm', 'alignace', 'meme',
            'mast', 'transfac', 'pfm', 'sites' or 'jaspar'.
        seq_type: Sequence type forwarded to ``read_alignment``.

    Returns:
        Tuple ``(format_matrix(pfm), format_matrix(ic))``.

    Raises:
        ValueError: If ``data_type`` is not one of the supported values.
    """
    if data_type == 'counts':
        pfm, total = count_to_pfm(data)
    elif data_type in ('fasta', 'stockholm'):
        data, total = read_alignment(data, data_type, seq_type)
        pfm, _ = count_to_pfm(data)
    elif data_type in ('alignace', 'meme', 'mast', 'transfac', 'pfm',
                       'sites', 'jaspar'):
        if data_type in ('jaspar', 'transfac'):
            with open(data, 'r') as handle:
                motif = motifs.parse(handle, data_type.upper())[0]
            pfm = dict(motif.counts.normalize())
            total = sum(list(motif.counts.values())[0])
        else:
            with open(data, 'r') as handle:
                motif = motifs.read(handle, data_type)
            try:
                # The keyword used to be misspelled ("psuedocounts"), which
                # always raised and silently fell back to no pseudocounts.
                pfm = motif.counts.normalize(pseudocounts=1)
            except Exception:
                # Best-effort fallback kept from the original implementation.
                pfm = motif.counts.normalize()
            # NOTE(review): the other branches pass a number as `total`;
            # here the whole counts matrix is passed — confirm this is what
            # calc_relative_information expects.
            total = motif.counts
    else:
        raise ValueError("Unsupported data_type: {!r}".format(data_type))

    ic = calc_relative_information(pfm, total)
    return (format_matrix(pfm), format_matrix(ic))
def load_motifs(motif_dir, pseudocounts=0.05, key='full'):
    '''
    Read every motif file in a folder; files must be in JASPAR format, e.g.:

        >MA0002.2 RUNX1
        A [ 287 234 123 57 0 87 0 17 10 131 500 ]
        C [ 496 485 1072 0 75 127 0 42 400 463 158 ]
        G [ 696 467 149 7 1872 70 1987 1848 251 81 289 ]
        T [ 521 814 656 1936 53 1716 13 93 1339 1325 1053 ]

    Parameters:
        motif_dir: folder that contains motif files; one file per motif.
            A trailing path separator is optional.
        pseudocounts: fraction of the mean per-position count total that is
            assigned to each motif as its pseudocount
        key: how motifs are named in the output dictionary
            options: 'full' (default, ``name$matrix_id``), 'id' (``matrix_id``)

    Returns:
        dict mapping the chosen key to the motif object. With any other
        value of ``key`` no motif is stored and the dict stays empty.
    '''
    motif_dict = {}
    nuc = ['A', 'C', 'G', 'T']
    for mf in os.listdir(motif_dir):
        # os.path.join works whether or not motif_dir ends with a separator.
        with open(os.path.join(motif_dir, mf)) as f:
            m = motifs.read(f, 'jaspar')
        counts = np.array([m.counts[n] for n in nuc])
        # Column sums are the per-position totals; average them over positions.
        avg_counts = counts.sum(axis=0).mean()
        m.pseudocounts = avg_counts * pseudocounts
        m.background = None
        if key == 'full':
            motif_dict[m.name + '$' + m.matrix_id] = m
        elif key == 'id':
            motif_dict[m.matrix_id] = m
    return motif_dict
def __init__(self,matrixfile=None,pfmdir=None):
    """Load the annotation table and every PFM listed in the TF matrix table.

    Populates:
        self.medline_dict: accession -> value, taken from annotation rows
            whose second column is 'medline'.
        self.motif_dict: SGD id -> motif. A motif whose common name cannot
            be converted to an SGD id is dropped.

    Args:
        matrixfile: Accepted for interface compatibility.
            NOTE(review): the value is overwritten below, so a caller-supplied
            path is ignored — confirm whether that is intended.
        pfmdir: Folder holding the ``<id>.<version>.pfm`` files. Defaults to
            ``Data/SaccCerePFMFlatFileDir`` under ``current_directory``.
    """
    annotationmatrixfile = path.join('Data', 'SaccCereAnnotation.txt')
    matrixfile = path.join('Data', 'SaccCereTFMATRIX.txt')
    TFmatrixFile = path.join(current_directory, matrixfile)
    annotationfile = path.join(current_directory, annotationmatrixfile)

    self.medline_dict = {}
    # Context manager so the annotation file is closed even if a row is malformed.
    with open(annotationfile, 'r') as medline:
        for row in csv.reader(medline, dialect='excel-tab'):
            if row[1] == 'medline':
                self.medline_dict[row[0]] = row[2]

    if pfmdir is None:
        pfmdir = "SaccCerePFMFlatFileDir"
        pfm_folder = path.join(current_directory, 'Data', pfmdir)
    else:
        pfm_folder = pfmdir

    self.motif_dict = dict()
    with open(TFmatrixFile, 'r') as TFmatrixReader:
        for row in csv.reader(TFmatrixReader, dialect='excel-tab'):
            tf_accession = row[0]
            common_name = row[4]
            tf_pfm = "{}.{}.pfm".format(row[2], row[3])
            filename = path.join('Data', pfm_folder, tf_pfm)
            with open(filename, 'r') as handle:
                self.motif_dict[common_name] = motifs.read(handle, 'pfm')
            try:
                # Re-key the motif by SGD id and attach its medline reference.
                sgdid = idconverter.getgene(common_name).SGDID
                self.motif_dict[sgdid] = self.motif_dict.pop(common_name)
                self.motif_dict[sgdid].medline = self.medline_dict[tf_accession]
            except Exception:
                # Best effort: drop the entry still stored under the common name.
                self.motif_dict.pop(common_name, None)
                continue
def jaspar_to_pwm(output_dir="./"):
    """
    Convert every ``*.jaspar`` profile under each taxon folder to a ``.pwm``
    file (PWMScan format).

    For each profile the PSSM is computed with JASPAR-style pseudocounts and
    written one motif position per line, as four integers (score * 100,
    rounded, width 7) in A, C, G, T order. Existing ``.pwm`` files are left
    untouched.
    """
    base_dir = os.path.abspath(output_dir)

    for taxon in taxons:
        taxon_dir = os.path.join(base_dir, taxon)

        for fname in os.listdir(taxon_dir):
            # Only JASPAR profiles are converted.
            if not fname.endswith(".jaspar"):
                continue

            with open(os.path.join(taxon_dir, fname)) as src:
                m = motifs.read(src, "jaspar")
            m.pseudocounts = motifs.jaspar.calculate_pseudocounts(m)

            # Transpose the four per-letter rows into one row per position.
            per_letter = [m.pssm[nt] for nt in "ACGT"]
            per_position = [list(column) for column in zip(*per_letter)]

            # The output name is the first 8 characters of the input name.
            pwm_file = os.path.join(taxon_dir, f"{fname[:8]}.pwm")
            if os.path.exists(pwm_file):
                continue

            with open(pwm_file, "w") as out:
                for scores in per_position:
                    cells = ["{:7d}".format(round(v * 100)) for v in scores]
                    out.write("%s\n" % " ".join(cells))
def motif2pssm(path2motif, format):
    """Load a motif file and return its position-specific scoring matrix.

    Args:
        path2motif: Path of the motif file.
        format: "ppm" to load a numeric matrix with ``np.loadtxt``; any other
            value is passed to ``motifs.read`` as the file format.

    Returns:
        For "ppm": ``log2((ppm + 1e-9) / 0.25)`` with the shape of the loaded
        array. Otherwise a ``(4, motif_length)`` array whose rows are the
        A, C, G, T log-odds scores.

    The intermediate matrices are echoed to stdout. ``print`` is called with
    a single parenthesized argument so the output is the same on Python 2
    and the function also runs on Python 3.
    """
    if format == "ppm":
        ppm = np.loadtxt(path2motif)
        print("PPM:")
        print(ppm)
        print("")
        # The small constant keeps log2 finite for zero probabilities.
        pssm = np.log2((ppm + 1E-9) / 0.25)
        return pssm

    with open(path2motif) as handle:
        m = motifs.read(handle, format)
    pfm = m.counts
    print("PFM:")
    print(pfm)
    ppm = pfm.normalize(pseudocounts=C_PSEUDOCOUNTS)
    print("PPM:")
    print(ppm)
    pssm = ppm.log_odds(background=C_BACKGROUND)
    np_pssm = np.zeros(shape=(4, pssm.length))
    for i, nt in enumerate(['A', 'C', 'G', 'T']):
        np_pssm[i] = pssm[nt]
    return np_pssm
def read_motif(motif):
    """
    motif: path of a single motif file within the directory.

    Returns the motif parsed from that file in "pfm" format. The file is
    opened in a context manager so the handle is closed once parsing is
    done, including when parsing fails.
    """
    with open(motif) as handle:
        return motifs.read(handle, "pfm")
def readMotifFile(motifPath):
    """Load a JASPAR motif file.

    Returns a tuple ``(name, matrix, matrix_id)`` where ``matrix`` is a numpy
    array built from the motif's A, C, G, T pwm rows and then transposed, so
    each row of the result corresponds to one motif position.
    """
    with open(motifPath) as jaspar_handle:
        motif = motifs.read(jaspar_handle, 'jaspar')

    motif_name = motif.name
    per_letter = [motif.pwm[base] for base in ('A', 'C', 'G', 'T')]
    per_position = np.array(per_letter).T
    return (motif_name, np.array(per_position), motif.matrix_id)
def __init__(self, input_file_name, pseudocounts, precision, fpr, thresholds):
    """
    Initializes Motif.

    Variables:
    pfm -- Position Frequency Matrix.
    pwm -- Position Weight Matrix.
    pssm -- Position Specific Scoring Matrix.
    threshold -- Motif matching threshold.
    len -- Length of the motif.
    max -- Maximum PSSM score possible.
    is_palindrome -- True if consensus is biologically palindromic.

    Parameters:
    input_file_name -- Path of the PFM file. The motif name is the base name
                       without its last extension; the parent folder name is
                       used as the repository key, so the path must contain
                       at least one "/".
    pseudocounts -- Pseudocounts passed to the count normalization.
    precision -- Precision used when the score distribution is recomputed.
    fpr -- False positive rate used to select the threshold.
    thresholds -- Object whose ``dict`` attribute holds pre-computed
                  thresholds indexed by repository, motif name and fpr.
    """

    # Initializing error handler
    err = ErrorHandler()

    # Initializing name
    self.name = ".".join(basename(input_file_name).split(".")[:-1])
    repository = input_file_name.split("/")[-2]

    # Creating PFM & PWM (context manager closes the file even if parsing fails)
    with open(input_file_name, "r") as input_file:
        self.pfm = motifs.read(input_file, "pfm")
    self.pwm = self.pfm.counts.normalize(pseudocounts)
    self.len = len(self.pfm)

    # Creating PSSM
    background = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}
    self.pssm = self.pwm.log_odds(background)
    self.pssm_list = [self.pssm[e] for e in ["A", "C", "G", "T"]]
    self.max = self.pssm.max

    # Evaluating threshold
    try:
        # The pre-computed table is only used for these exact parameters.
        if pseudocounts != 0.1 or precision != 10000:
            raise ValueError()
        self.threshold = thresholds.dict[repository][self.name][fpr]
    except Exception:
        err.throw_warning(
            "DEFAULT_WARNING",
            add_msg="Parameters not matching pre-computed Fpr data. "
            "Recalculating (might take a while)..")
        try:
            distribution = self.pssm.distribution(background=background,
                                                  precision=precision)
        except Exception:
            # NOTE(review): the next statement needs `distribution`, so this
            # presumably relies on throw_error not returning — confirm.
            err.throw_error("MM_PSEUDOCOUNT_0")
        self.threshold = distribution.threshold_fpr(fpr)

    # Evaluating if motif is palindromic
    consensus = self.pfm.consensus
    self.is_palindrome = str(consensus) == str(consensus.reverse_complement())
def yield_motifs(path):
    """Yield named motifs, each followed by its reverse complement.

    The file is split into alternating groups of header lines (starting with
    '>') and matrix lines. The last whitespace-separated token of the first
    header line in a group, lower-cased, is the motif name. For each matrix
    group two pairs are yielded: ``(name, motif)`` and
    ``(name + '-R', true_motif_rev_complement(motif))``.

    The matrix lines must be preceded by a header; otherwise ``name`` is
    not yet bound when the first motif is yielded.
    """
    with open(path) as handle:
        for key, lines in groupby(handle, methodcaller('startswith', '>')):
            if key:
                # Built-in next() works on Python 2.6+ and Python 3, unlike
                # the Python-2-only iterator method .next().
                name = next(lines).strip().split()[-1].lower()
            else:
                tmp = ''.join(lines)
                mot = motifs.read(StringIO(tmp), 'pfm')
                yield name, mot
                yield name+'-R', true_motif_rev_complement(mot)
def MotifToBP(motif,name):
    """Build a motif object from a dict holding per-base rows under 'PWM'.

    The rows ``motif['PWM'][base]`` for A, C, G, T are rendered as text
    (commas stripped from their string form) under a ``>name`` header, and
    that text is parsed as JASPAR format.
    """
    text_rows = ['>' + name + '\n']
    for base in ('A', 'C', 'G', 'T'):
        row_text = str(motif['PWM'][base]).replace(',','')
        text_rows.append(base + ' ' + row_text + '\n')
    return motifs.read(StringIO(''.join(text_rows)), 'jaspar')
def pearsonpwm(pwm1, pwm2):
    """Compute the Pearson-based similarity between two PWM files.

    Args:
        pwm1: Path of the first matrix file.
        pwm2: Path of the second matrix file.

    Returns:
        A pair of lists, one per input file, each holding
        ``[path, 1 - distance, abs(offset), n]`` where distance and offset
        come from ``tf2.dist_pearson(tf1)`` and ``n`` is the length of the
        transposed array that ``np.loadtxt(path, skiprows=1)`` returns.
    """
    # Context managers close the handles that motifs.read consumes.
    with open(pwm1) as handle1:
        cisbpmat = motifs.read(handle1, "pfm")
    cisbpmat.pseudocounts = 3.0
    pwmnp1 = np.loadtxt(pwm1, skiprows=1)
    tf1 = cisbpmat.pssm

    with open(pwm2) as handle2:
        cisbpmat2 = motifs.read(handle2, "pfm")
    cisbpmat2.pseudocounts = 3.0
    pwmnp2 = np.loadtxt(pwm2, skiprows=1)
    tf2 = cisbpmat2.pssm

    distance, offset = tf2.dist_pearson(tf1)
    similarity = 1 - distance
    shift = math.fabs(offset)
    return [pwm1, similarity, shift, len(np.transpose(pwmnp1))], [
        pwm2, similarity, shift, len(np.transpose(pwmnp2))
    ]
def pfm(self):
    """Return this object's ``self.PFM`` counts as a parsed motif.

    The rows stored under 'a', 'c', 'g', 't' are written out one row per
    line, with every value followed by a single space, and the resulting
    text is parsed in "pfm" format.
    """
    text_rows = []
    for letter in ('a', 'c', 'g', 't'):
        cells = ''.join(str(count) + ' ' for count in self.PFM[letter])
        text_rows.append(cells + '\n')
    return motifs.read(StringIO(''.join(text_rows)), 'pfm')
def __init__(self, input_file_name):
    """Load a PFM file and derive its PWM and PSSM.

    Sets ``pfm``, ``pwm`` (counts normalized with pseudocount 0.0001),
    ``len``, ``pssm`` (log-odds against a uniform background),
    ``pssm_list`` (the A, C, G, T score rows), ``max`` and ``min``.

    Args:
        input_file_name: Path of the PFM file.
    """
    # Context manager closes the file even if parsing fails.
    with open(input_file_name, "r") as input_file:
        self.pfm = motifs.read(input_file, "pfm")
    self.pwm = self.pfm.counts.normalize(0.0001)
    self.len = len(self.pfm)
    background = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}
    self.pssm = self.pwm.log_odds(background)
    self.pssm_list = [self.pssm[e] for e in ["A", "C", "G", "T"]]
    self.max = self.pssm.max
    self.min = self.pssm.min
def motif_reader(path_name):
    """Return the consensus sequence of every PFM file in a folder.

    Each file name and each consensus string is also printed.

    Args:
        path_name: Folder containing the PFM files. A trailing path
            separator is optional.

    Returns:
        List of consensus strings, in ``os.listdir`` order.
    """
    motif_list = []
    for filename in os.listdir(path_name):
        print(filename)
        # os.path.join works with or without a trailing separator; the
        # context manager closes the handle, so no explicit close() is needed.
        with open(os.path.join(path_name, filename)) as handle:
            word = motifs.read(handle, "pfm")
        motif = str(word.consensus)
        print(motif)
        motif_list.append(motif)
    return motif_list
def read_jaspar_motif_file(motifPath, pseudocount):
    '''
    Read a JASPAR motif file and set its pseudocounts.

    inputs: path to a JASPAR motif file; a pseudocount scale where 0.01
            corresponds to the JASPAR default pseudocount for 'A'
    outputs: a tuple (motif name, motif object)

    The scaled pseudocount is truncated to an integer before it is assigned.
    '''
    with open(motifPath) as jaspar_handle:
        motif = motifs.read(jaspar_handle, 'jaspar')

    baseline = motifs.jaspar.calculate_pseudocounts(motif)
    motif.pseudocounts = int(pseudocount/0.01 * baseline['A'])
    return (motif.name, motif)
def __init__(self, input_file_name, pseudocounts, precision, fpr, thresholds):
    """
    Initializes Motif.

    Variables:
    pfm -- Position Frequency Matrix.
    pwm -- Position Weight Matrix.
    pssm -- Position Specific Scoring Matrix.
    threshold -- Motif matching threshold.
    len -- Length of the motif.
    max -- Maximum PSSM score possible.
    is_palindrome -- True if consensus is biologically palindromic.

    Parameters:
    input_file_name -- Path of the PFM file. The motif name is the base name
                       without its last extension; the parent folder name is
                       used as the repository key, so the path must contain
                       at least one "/".
    pseudocounts -- Pseudocounts passed to the count normalization.
    precision -- Precision used when the score distribution is recomputed.
    fpr -- False positive rate used to select the threshold.
    thresholds -- Object whose ``dict`` attribute holds pre-computed
                  thresholds indexed by repository, motif name and fpr.
    """

    # Initializing error handler
    err = ErrorHandler()

    # Initializing name
    self.name = ".".join(basename(input_file_name).split(".")[:-1])
    repository = input_file_name.split("/")[-2]

    # Creating PFM & PWM (context manager closes the file even if parsing fails)
    with open(input_file_name, "r") as input_file:
        self.pfm = motifs.read(input_file, "pfm")
    self.pwm = self.pfm.counts.normalize(pseudocounts)
    self.len = len(self.pfm)

    # Creating PSSM
    background = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}
    self.pssm = self.pwm.log_odds(background)
    self.pssm_list = [self.pssm[e] for e in ["A", "C", "G", "T"]]
    self.max = self.pssm.max

    # Evaluating threshold
    try:
        # The pre-computed table is only used for these exact parameters.
        if pseudocounts != 0.1 or precision != 10000:
            raise ValueError()
        self.threshold = thresholds.dict[repository][self.name][fpr]
    except Exception:
        err.throw_warning("DEFAULT_WARNING",
                          add_msg="Parameters not matching pre-computed Fpr data. "
                                  "Recalculating (might take a while)..")
        try:
            distribution = self.pssm.distribution(background=background,
                                                  precision=precision)
        except Exception:
            # NOTE(review): the next statement needs `distribution`, so this
            # presumably relies on throw_error not returning — confirm.
            err.throw_error("MM_PSEUDOCOUNT_0")
        self.threshold = distribution.threshold_fpr(fpr)

    # Evaluating if motif is palindromic
    consensus = self.pfm.consensus
    self.is_palindrome = str(consensus) == str(consensus.reverse_complement())
def read_pfm(filename):
    """Read a PFM file into a motif object with preset parameters.

    The returned motif has its pseudocounts set to 0.25 and its background
    set to A/T = 0.3, C/G = 0.2, so its ``.pwm`` and ``.pssm`` attributes
    can be used directly.

    The expected PFM file format can be found in 'docs/ex_pfmfile.txt'.
    Additional information can be found on the Biopython motifs page.
    """
    with open(filename, "r") as pfm_handle:
        parsed = motifs.read(pfm_handle, "pfm")

    parsed.pseudocounts = .25
    parsed.background = {'A': 0.3, 'C': 0.2, 'G': 0.2, 'T': 0.3}
    return parsed
def __init__(self, input_file_name):
    """Load a PFM file and derive its PWM and PSSM.

    Sets ``pfm``, ``pwm`` (counts normalized with pseudocount 0.1), ``len``,
    ``pssm`` (log-odds against a uniform background), ``pssm_list`` (the
    A, C, G, T score rows), ``max`` and ``min``.

    Args:
        input_file_name: Path of the PFM file; it is expected to hold only
            the nucleotide frequencies.
    """
    # Context manager closes the file even if parsing fails.
    with open(input_file_name, "r") as input_file:
        self.pfm = motifs.read(input_file, "pfm")
    self.pwm = self.pfm.counts.normalize(0.1)
    self.len = len(self.pfm)
    background = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}
    self.pssm = self.pwm.log_odds(background)
    self.pssm_list = [self.pssm[e] for e in ["A", "C", "G", "T"]]
    self.max = self.pssm.max
    self.min = self.pssm.min
def get_motifs(self, transcription_factor):
    """Collect every motif on disk that matches a transcription factor name.

    Two folders are searched, case-insensitively:
      * JASPAR: files whose name contains the factor name.
      * SELEX: files whose name starts with the factor name.

    Args:
        transcription_factor: Name of the transcription factor.

    Returns:
        List of motifs parsed from the matching files (JASPAR hits first).

    Each hit is reported on stdout. The messages are built as one string so
    the output is the same on Python 2 and the function also runs on
    Python 3.
    """
    motifs = []
    tf_upper = transcription_factor.upper()

    # Try Jaspar
    JASPAR_dir = '../data/preprocess/JASPAR/'
    for f in os.listdir(JASPAR_dir):
        if tf_upper in f.upper():
            with open(os.path.join(JASPAR_dir, f)) as handle:
                motif = read(handle, 'pfm')
            print("motif found in JASPAR %s" % f)
            motifs.append(motif)

    # Try SELEX
    SELEX_dir = '../data/preprocess/SELEX_PWMs_for_Ensembl_1511_representatives/'
    for f in os.listdir(SELEX_dir):
        if f.upper().startswith(tf_upper):
            with open(os.path.join(SELEX_dir, f)) as handle:
                motif = read(handle, 'pfm')
            print("motif found in SELEX %s" % f)
            motifs.append(motif)

    # Try Factorbook
    return motifs
def read_motif(motif_filename, verb=0):
    """Reads a motif as a collection of sites from a file

    Reads a motif and uses the biopython.motifs class to store it. If the
    motif is in FASTA format, it uses the parser directly. Otherwise, it
    loads and reads a concatenated text file and creates the motif.

    File type is determined by extension:
        * FASTA for .fas, .fasta and .fa files
        * One-per-line text file otherwise

    Input:
        * The motif filename; required
        * Verbose mode (default=0)
    Returns:
        * the read motif

    If the file cannot be opened, the error is printed and the process exits
    with status 1.
    """
    # create file handler for reading file
    try:
        motif_file = open(motif_filename, "r")
    except (IOError, OSError) as file_open_exception:
        # Single pre-formatted strings keep the text identical to the old
        # comma-separated print statements while also running on Python 3.
        print("*** The file name provided: %s  does not exist"
              % motif_filename)
        print("*** Error:  %s  -  %s" % (file_open_exception.errno,
                                         file_open_exception.strerror))
        # Non-zero status so callers and shells can detect the failure.
        sys.exit(1)

    # Figure out file type based on extension, read sites and create motif
    extension = motif_filename.split('.')[-1]
    # The context manager closes the file even if parsing raises.
    with motif_file:
        if extension not in ['fas', 'fasta', 'fa']:
            if verb:
                print('Reading motif... raw text sequence mode assumed '
                      '(one site per line, not FASTA parsing)')
            sites = [Seq(line.rstrip('\n\r'), IUPAC.unambiguous_dna)
                     for line in motif_file]
            mot = motifs.create(sites)
            if verb:
                print(mot.degenerate_consensus)
        else:
            if verb:
                print('Reading motif... attempting to parse FASTA file')
            mot = motifs.read(motif_file, 'sites')

    return mot
def get_observed_result(self, protein):
    """ Find the DNA binding motif for a given protein

    Args:
        protein (:obj:`models.ProteinSubunit`): protein subunit to find data for

    Returns:
        :obj:`list` of :obj:`data_model.Observable`: list of observables

    Each motif's per-position A/C/G/T frequencies are written to a temporary
    PFM file and parsed back to obtain the counts matrix and its consensus.
    The temporary folder is removed even when writing or parsing fails.
    """
    versions = self.get_DNA_by_protein(protein)
    observed_result = []

    for motif in versions:
        per_position = []
        for position in motif.all():
            per_position.append([
                position.frequency_a, position.frequency_c,
                position.frequency_g, position.frequency_t
            ])

        self.cache_dirname = tempfile.mkdtemp()
        pfm_path = self.cache_dirname + '/data.pfm'
        try:
            with open(pfm_path, 'w') as pfm:
                writer = csv.writer(pfm, delimiter='\t')
                # Transpose so each written row is one nucleotide.
                writer.writerows(zip(*per_position))
            # Context manager closes the handle that motifs.read consumes.
            with open(pfm_path) as pfm:
                m = motifs.read(pfm, 'pfm')
        finally:
            shutil.rmtree(self.cache_dirname)

        dna_specie = data_model.DnaSpecie(binding_matrix=m.counts,
                                          sequence=str(m.counts.consensus))
        metadata = self.metadata_dump(motif.all()[0].dataset)
        observed = data_model.ObservedSpecie(specie=dna_specie,
                                             metadata=metadata)
        observed_result.append(observed)

        # Only the first position is used for the cross reference.
        for position in motif.all():
            # The trailing comma is kept from the original: the attribute is
            # assigned a 1-tuple holding the resource.
            observed.specie.cross_references = data_model.Resource(
                namespace='pubmed',
                id=position.dataset._metadata.resource[0]._id),
            break

    return observed_result
def get_protein_by_DNA_sequence(self, sequence, select=models.ProteinSubunit):
    """
    NOTE: Currently there are no Gene objects in common schema models. When
    added this query will be updated to input models.Gene and output
    data_model.ProteinSpecie

    Args:
        sequence (:obj:`data_model.DnaSpecie.sequence`): sequence of DNA segment
        select: not used by this implementation; kept for interface
            compatibility

    Returns:
        :obj:`list` of :obj:`tuple`: Returns the query for a protein,
        sequence position, and score

    Only binding datasets whose number of positions equals the sequence
    length are scored. Each one is written to a temporary PFM file, parsed
    back, and searched with a PSSM threshold of 2. The temporary folder is
    removed after parsing, including when writing or parsing fails.
    """
    # Local import: the temporary folders created below must be cleaned up.
    import shutil

    # TODO: Make more efficient. Add gene location filter
    ans = []
    size = len(sequence)
    all_matricies = self.data_source.session.query(
        models.DNABindingDataset).all()

    for matricies in all_matricies:
        if len(matricies.dna_binding_data) != size:
            continue

        per_position = []
        for position in matricies.dna_binding_data:
            per_position.append([
                position.frequency_a, position.frequency_c,
                position.frequency_g, position.frequency_t
            ])

        self.cache_dirname = tempfile.mkdtemp()
        pfm_path = self.cache_dirname + '/data.pfm'
        try:
            with open(pfm_path, 'w') as pfm:
                writer = csv.writer(pfm, delimiter='\t')
                # Transpose so each written row is one nucleotide.
                writer.writerows(zip(*per_position))
            # Context manager closes the handle that motifs.read consumes.
            with open(pfm_path) as pfm:
                m = motifs.read(pfm, 'pfm')
        finally:
            # Previously the folder was never removed, leaking one temporary
            # directory per matching dataset.
            shutil.rmtree(self.cache_dirname)

        my_seq = Seq(sequence, IUPAC.unambiguous_dna)

        # TODO: Include selective threshold
        for position, score in m.pssm.search(my_seq, threshold=2):
            ans.append((matricies.subunit, position, score))

    return ans
def read_motif(motif_filename, verb=0):
    """Reads a motif as a collection of sites from a file

    Reads a motif and uses the biopython.motifs class to store it. If the
    motif is in FASTA format, it uses the parser directly. Otherwise, it
    loads and reads a concatenated text file and creates the motif.

    File type is determined by extension:
        * FASTA for .fas, .fasta and .fa files
        * One-per-line text file otherwise

    Input:
        * The motif filename; required
        * Verbose mode (default=0)
    Returns:
        * the read motif

    If the file cannot be opened, the error is printed and the process exits
    with status 1.
    """
    # create file handler for reading file
    try:
        motif_file = open(motif_filename, "r")
    except (IOError, OSError) as file_open_exception:
        # Single pre-formatted strings keep the text identical to the old
        # comma-separated print statements while also running on Python 3.
        print("*** The file name provided: %s  does not exist"
              % motif_filename)
        print("*** Error:  %s  -  %s" % (file_open_exception.errno,
                                         file_open_exception.strerror))
        # Non-zero status so callers and shells can detect the failure.
        sys.exit(1)

    # Figure out file type based on extension, read sites and create motif
    extension = motif_filename.split('.')[-1]
    # The context manager closes the file even if parsing raises.
    with motif_file:
        if extension not in ['fas', 'fasta', 'fa']:
            if verb:
                print('Reading motif... raw text sequence mode assumed '
                      '(one site per line, not FASTA parsing)')
            sites = [Seq(line.rstrip('\n\r'), IUPAC.unambiguous_dna)
                     for line in motif_file]
            mot = motifs.create(sites)
            if verb:
                print(mot.degenerate_consensus)
        else:
            if verb:
                print('Reading motif... attempting to parse FASTA file')
            mot = motifs.read(motif_file, 'sites')

    return mot
def _get_profiles(profiles_dir, latest=False, profile=[], taxon=taxons):
    """Load JASPAR profiles from the per-taxon folders under ``profiles_dir``.

    Args:
        profiles_dir: Folder containing one sub-folder per taxon.
        latest: When true, keep only the first profile encountered for each
            6-character file-name prefix. Files are visited in reverse
            sorted order within each taxon folder.
        profile: Optional collection of 8-character file-name prefixes; when
            non-empty, files with any other prefix are skipped.
        taxon: Iterable of taxon folder names to scan.

    Returns:
        Flat list of the retained profiles.
    """
    by_prefix = {}
    root_dir = os.path.abspath(profiles_dir)

    for t in taxon:
        taxon_dir = os.path.join(root_dir, t)

        for profile_file in sorted(os.listdir(taxon_dir), reverse=True):
            # Honor the explicit profile filter, if one was given.
            if len(profile) > 0 and profile_file[:8] not in profile:
                continue

            with open(os.path.join(taxon_dir, profile_file)) as f:
                p = motifs.read(f, "jaspar")

            versions = by_prefix.setdefault(profile_file[:6], [])

            # With `latest`, one stored version per prefix is enough.
            if latest and len(versions) == 1:
                continue

            versions.append(p)

    return [p for versions in by_prefix.values() for p in versions]
def parse_cisBP_pwm(self):
    """Load the cisBP-RNA matrices into ``self.pwms_dict``.

    The JASPAR-formatted copies live in ``pwms_all_motifs/jaspar`` under
    ``self.cisBP_rna_dir``. That folder is created if missing and, when it
    is empty, filled by ``self.create_jaspar_files``. Every file in it is
    then parsed in "pfm" format and stored under its file name without
    extension; ids already present in ``self.pwms_dict`` are left untouched.
    """
    pwms_original_dir = os.path.join(self.cisBP_rna_dir, "pwms_all_motifs")
    pwms_jaspar_dir = os.path.join(self.cisBP_rna_dir, "pwms_all_motifs",
                                   "jaspar")
    pwms_info = os.path.join(self.cisBP_rna_dir,
                             "RBP_information_all_motifs.txt")

    # NOTE(review): the parsed table is not used afterwards; the read is kept
    # so a missing or unreadable info file still raises here as before.
    pd.read_csv(pwms_info, sep='\t', header=0)

    if not os.path.isdir(pwms_jaspar_dir):
        os.mkdir(pwms_jaspar_dir)

    if len(os.listdir(pwms_jaspar_dir)) == 0:
        self.create_jaspar_files(pwms_original_dir, pwms_jaspar_dir)

    for file_name in os.listdir(pwms_jaspar_dir):
        motif_ID = os.path.splitext(file_name)[0]
        # The context manager closes the handle; no explicit close() needed.
        with open(os.path.join(pwms_jaspar_dir, file_name)) as handle:
            pwm = motifs.read(handle, "pfm")
        if motif_ID not in self.pwms_dict:
            self.pwms_dict[motif_ID] = pwm
def from_fasta(fasta, motifid, name=None):
    """
    Create a motif from a fasta file, parsed with the "sites" reader
    (capital letters mark the motif sites; see the JASPAR sites format).

    Parameters:
        fasta (string): Path to fasta file.
        motifid (string): Unique id of the motif.
        name (string): Name of the motif. Defaults to 'None'.

    Returns:
        OneMotif object built from the A, C, G, T count rows
    """
    with open(fasta) as sites_handle:
        parsed = motifs.read(sites_handle, "sites")

    count_rows = [parsed.counts[base] for base in ("A", "C", "G", "T")]
    return OneMotif(motifid=motifid, counts=count_rows, name=name)
def get_names(output_dir="./"):
    """
    Collect the name of every JASPAR profile under the taxon folders and
    save the ``matrix_id -> name`` mapping to ``names.json`` in
    ``output_dir``. Nothing is done when that file already exists.

    When a profile's name starts with its matrix id, that prefix and the
    character following it are stripped. The first name seen for a matrix
    id is the one kept.
    """
    json_file = os.path.join(output_dir, "names.json")

    # Skip if already done
    if os.path.exists(json_file):
        return

    names = {}
    base_dir = os.path.abspath(output_dir)

    for taxon in taxons:
        taxon_dir = os.path.join(base_dir, taxon)

        for fname in os.listdir(taxon_dir):
            # Only JASPAR profiles carry the names we want.
            if not fname.endswith(".jaspar"):
                continue

            with open(os.path.join(taxon_dir, fname)) as src:
                m = motifs.read(src, "jaspar")

            has_id_prefix = m.name.startswith(m.matrix_id)
            name = m.name[len(m.matrix_id) + 1:] if has_id_prefix else m.name
            names.setdefault(m.matrix_id, name)

    with open(json_file, "w") as out:
        json.dump(names, out, sort_keys=True, indent=4)
def line_plot(arguments):
    """Plot two mean signals around a motif together with its sequence logo.

    Args:
        arguments: Tuple of (mpbs_name, num_fp, signal_1, signal_2, factor1,
            factor2, condition1, condition2, pwm_dict, output_location,
            window_size, standardize).

    Side effects (all inside ``output_location``):
        * writes ``<name>.txt`` with the two mean signals, one row per
          window position;
        * writes temporary ``.pwm``, ``.logo.eps``, ``.line.eps`` and ``.eps``
          files, runs ``epstopdf`` on the merged ``.eps`` file, then deletes
          those four temporary files.

    Signals are divided by ``num_fp`` and by their factor; when
    ``standardize`` is true they are additionally passed through
    ``standard`` before plotting (the text file holds the values from
    before that step).
    """
    (mpbs_name, num_fp, signal_1, signal_2, factor1, factor2, condition1,
     condition2, pwm_dict, output_location, window_size,
     standardize) = arguments

    # Parentheses are replaced/removed from the name used in file names.
    mpbs_name = mpbs_name.replace("(", "_")
    mpbs_name = mpbs_name.replace(")", "")
    mean_signal_1 = (signal_1 / num_fp) / factor1
    mean_signal_2 = (signal_2 / num_fp) / factor2

    # output signal
    signal_fname = os.path.join(output_location, "{}.txt".format(mpbs_name))
    with open(signal_fname, "w") as f:
        f.write(condition1 + "\t" + condition2 + "\n")
        for i in range(window_size):
            f.write(str(mean_signal_1[i]) + "\t" + str(mean_signal_2[i]) + "\n")

    if standardize:
        mean_signal_1, mean_signal_2 = standard(mean_signal_1, mean_signal_2)

    # Output PWM and create logo
    pwm_fname = os.path.join(output_location, "{}.pwm".format(mpbs_name))
    with open(pwm_fname, "w") as pwm_file:
        for e in ["A", "C", "G", "T"]:
            pwm_file.write(
                " ".join([str(int(count)) for count in pwm_dict[e]]) + "\n")

    logo_fname = os.path.join(output_location,
                              "{}.logo.eps".format(mpbs_name))
    # Context manager closes the handle that motifs.read consumes.
    with open(pwm_fname) as pwm_handle:
        pwm = motifs.read(pwm_handle, "pfm")
    pwm.weblogo(logo_fname, format="eps", stack_width="large",
                stacks_per_line=str(window_size),
                color_scheme="color_classic", unit_name="",
                show_errorbars=False, logo_title="", show_xaxis=False,
                xaxis_label="", show_yaxis=False, yaxis_label="",
                show_fineprint=False, show_ends=False)

    start = -(window_size / 2)
    end = (window_size / 2) - 1
    x = np.linspace(start, end, num=window_size)

    plt.close('all')
    fig, ax = plt.subplots()
    ax.plot(x, mean_signal_2, color='red', label=condition2)
    ax.plot(x, mean_signal_1, color='blue', label=condition1)
    ax.text(0.15, 0.9, 'n = {}'.format(num_fp), verticalalignment='bottom',
            horizontalalignment='right', transform=ax.transAxes,
            fontweight='bold')

    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_position(('outward', 15))
    ax.tick_params(direction='out')
    ax.set_xticks([start, 0, end])
    ax.set_xticklabels([str(start), 0, str(end)])
    min_signal = min(min(mean_signal_1), min(mean_signal_2))
    max_signal = max(max(mean_signal_1), max(mean_signal_2))
    ax.set_yticks([min_signal, max_signal])
    ax.set_yticklabels([str(round(min_signal, 2)), str(round(max_signal, 2))],
                       rotation=90)

    ax.set_title(mpbs_name, fontweight='bold')
    ax.set_xlim(start, end)
    ax.set_ylim([min_signal, max_signal])
    ax.legend(loc="upper right", frameon=False)

    ax.spines['bottom'].set_position(('outward', 70))

    figure_name = os.path.join(output_location,
                               "{}.line.eps".format(mpbs_name))
    fig.tight_layout()
    fig.savefig(figure_name, format="eps", dpi=300)

    # Creating canvas and printing eps / pdf with merged results
    output_fname = os.path.join(output_location, "{}.eps".format(mpbs_name))
    c = pyx.canvas.canvas()
    c.insert(pyx.epsfile.epsfile(0, 0, figure_name, scale=1.0))
    c.insert(pyx.epsfile.epsfile(0.45, 0.8, logo_fname, width=16.5, height=3))
    c.writeEPSfile(output_fname)
    # NOTE(review): the command is a space-joined shell string, so an output
    # path containing spaces or shell metacharacters will not work here.
    os.system(" ".join(["epstopdf", output_fname]))

    # Remove the intermediate files.
    os.remove(figure_name)
    os.remove(logo_fname)
    os.remove(output_fname)
    os.remove(pwm_fname)
# Create one PNG sequence logo per ".pwm" file for each selected motif
# repository found under <curr_dir>/motifs, written to <curr_dir>/logos/<repo>.
# NOTE(review): the original indentation was not available; the final
# print("OK") is assumed to run once per processed repository — confirm.
output_logos_dir = path.join(curr_dir, "logos")
if not path.exists(output_logos_dir):
    mkdir(output_logos_dir)
for dir_name, subdir_list, file_list in walk(path.join(curr_dir, "motifs")):
    base_name = path.basename(dir_name)
    # A repository is processed only when its option flag is set and the
    # directory's base name matches that repository.
    if ((options.hocomoco and base_name == "hocomoco") or
            (options.jaspar_vertebrates and base_name == "jaspar_vertebrates") or
            (options.uniprobe_primary and base_name == "uniprobe_primary") or
            (options.uniprobe_secondary and base_name == "uniprobe_secondary")):
        output_dir = path.join(curr_dir, "logos", base_name)
        if not path.exists(output_dir):
            mkdir(output_dir)
    else:
        continue
    print("Creating logos for " + base_name)
    for pwm_file_name in file_list:
        pwm_full_file_name = path.join(dir_name, pwm_file_name)
        # Skip every file whose last extension is not "pwm".
        if pwm_file_name.split(".")[-1] != "pwm":
            continue
        pwm_file = open(pwm_full_file_name, "r")
        # The logo keeps the input file name with its extension swapped for ".png".
        logo_file_name = path.join(
            output_dir, ".".join(pwm_file_name.split(".")[:-1]) + ".png")
        pwm = motifs.read(pwm_file, "pfm")
        pwm.weblogo(logo_file_name, format="png_print", stack_width="medium",
                    color_scheme="color_classic")
        pwm_file.close()
    print("OK")
def estimate_bias_pwm(args):
    """Estimate k-mer cleavage bias tables from position frequency matrices.

    For every region, observed k-mer base counts are collected around each
    read's shifted cut site (forward and reverse strands separately) and
    expected counts are collected from the k-mers of the region's sequence.
    The four count matrices are written as PFM files, rendered as logos, and
    the ratio of observed to expected k-mer scores becomes the bias table
    that is passed to ``write_table``.

    Args:
        args: Options object providing ``reads_file``, ``organism``,
            ``regions_file``, ``k_nb``, ``forward_shift``, ``reverse_shift``,
            ``output_location`` and ``output_prefix``.

    Side effects:
        Creates ``pfm`` and ``logo`` folders under ``args.output_location``
        and writes the matrices, logos and bias tables.
    """
    # Parameters
    max_duplicates = 100

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    # One row of k_nb counters per base; "N" is tracked but never written out.
    obs_f_pwm_dict = dict([(base, [0.0] * args.k_nb) for base in "ACGTN"])
    exp_f_pwm_dict = dict([(base, [0.0] * args.k_nb) for base in "ACGTN"])
    obs_r_pwm_dict = dict([(base, [0.0] * args.k_nb) for base in "ACGTN"])
    exp_r_pwm_dict = dict([(base, [0.0] * args.k_nb) for base in "ACGTN"])

    # Iterating on HS regions
    for region in regions:
        # Initialization
        prev_pos = -1
        true_counter = 0

        # Evaluating observed frequencies
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):
            # Calculating positions
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
            else:
                cut_site = r.aend + args.reverse_shift + 1
            p1 = cut_site - int(floor(args.k_nb / 2))
            p2 = p1 + args.k_nb

            # Verifying PCR artifacts: consecutive reads at the same start
            # are counted, and skipped once they exceed max_duplicates.
            if p1 == prev_pos:
                true_counter += 1
            else:
                prev_pos = p1
                true_counter = 0
            if true_counter > max_duplicates:
                continue

            # Fetching k-mer
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse:
                currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary
            target = obs_r_pwm_dict if r.is_reverse else obs_f_pwm_dict
            for i in range(0, len(currStr)):
                target[currStr[i]][i] += 1

        # Evaluating expected frequencies
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial,
                                          region.final)).upper()
        except Exception:
            continue

        # Iterating on each sequence position
        s = None
        for i in range(0, len(currStr) - args.k_nb):
            # Counting k-mer in dictionary
            s = currStr[i:i + args.k_nb]
            # A separate index is used so the outer position is not reused.
            for j in range(0, len(s)):
                exp_f_pwm_dict[s[j]][j] += 1

            # Counting k-mer in dictionary for reverse complement
            s = AuxiliaryFunctions.revcomp(s)
            for j in range(0, len(s)):
                exp_r_pwm_dict[s[j]][j] += 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Output pwms
    # Create the folder directly instead of building a shell command from
    # the output path.
    pfm_dir = os.path.join(args.output_location, "pfm")
    if not os.path.isdir(pfm_dir):
        os.makedirs(pfm_dir)
    pwm_dict_list = [obs_f_pwm_dict, obs_r_pwm_dict, exp_f_pwm_dict,
                     exp_r_pwm_dict]
    pwm_obs_f = os.path.join(args.output_location, "pfm",
                             "obs_{}_f.pfm".format(str(args.k_nb)))
    pwm_obs_r = os.path.join(args.output_location, "pfm",
                             "obs_{}_r.pfm".format(str(args.k_nb)))
    pwm_exp_f = os.path.join(args.output_location, "pfm",
                             "exp_{}_f.pfm".format(str(args.k_nb)))
    pwm_exp_r = os.path.join(args.output_location, "pfm",
                             "exp_{}_r.pfm".format(str(args.k_nb)))
    pwm_file_list = [pwm_obs_f, pwm_obs_r, pwm_exp_f, pwm_exp_r]

    for pwm_dict, pwm_path in zip(pwm_dict_list, pwm_file_list):
        with open(pwm_path, "w") as pwm_file:
            for e in ["A", "C", "G", "T"]:
                pwm_file.write(
                    " ".join([str(int(f)) for f in pwm_dict[e]]) + "\n")

    # Read the matrices back; context managers close the handles.
    loaded_motifs = []
    for pwm_path in pwm_file_list:
        with open(pwm_path) as pwm_handle:
            loaded_motifs.append(motifs.read(pwm_handle, "pfm"))
    motif_obs_f, motif_obs_r, motif_exp_f, motif_exp_r = loaded_motifs

    # Output logos
    logo_dir = os.path.join(args.output_location, "logo")
    if not os.path.isdir(logo_dir):
        os.makedirs(logo_dir)
    logo_obs_f = os.path.join(args.output_location, "logo",
                              "obs_{}_f.pdf".format(str(args.k_nb)))
    logo_obs_r = os.path.join(args.output_location, "logo",
                              "obs_{}_r.pdf".format(str(args.k_nb)))
    logo_exp_f = os.path.join(args.output_location, "logo",
                              "exp_{}_f.pdf".format(str(args.k_nb)))
    logo_exp_r = os.path.join(args.output_location, "logo",
                              "exp_{}_r.pdf".format(str(args.k_nb)))

    motif_obs_f.weblogo(logo_obs_f, format="pdf", stack_width="large",
                        color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_obs_r.weblogo(logo_obs_r, format="pdf", stack_width="large",
                        color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_exp_f.weblogo(logo_exp_f, format="pdf", stack_width="large",
                        color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)
    motif_exp_r.weblogo(logo_exp_r, format="pdf", stack_width="large",
                        color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)

    # Creating bias dictionary
    alphabet = ["A", "C", "G", "T"]
    k_mer_comb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    bias_table_F = dict([(e, 0.0) for e in k_mer_comb])
    bias_table_R = dict([(e, 0.0) for e in k_mer_comb])
    for k_mer in k_mer_comb:
        obs_f = get_ppm_score(k_mer, motif_obs_f.pwm, args.k_nb)
        exp_f = get_ppm_score(k_mer, motif_exp_f.pwm, args.k_nb)
        bias_table_F[k_mer] = round(obs_f / exp_f, 6)
        obs_r = get_ppm_score(k_mer, motif_obs_r.pwm, args.k_nb)
        exp_r = get_ppm_score(k_mer, motif_exp_r.pwm, args.k_nb)
        bias_table_R[k_mer] = round(obs_r / exp_r, 6)

    write_table(args.output_location, args.output_prefix,
                [bias_table_F, bias_table_R])
#threshold= math.pow(10,-float(sys.argv[2])) # P-val, 5 means -log(n,10) out = open(out_file, 'w') out.write("Sequence\tMotif\tMotif_consensus\tHit_Seq\tHit_position\tHit_score\tThreshold\n") # Specify if you only want to include certain pwm's in the directory if include == "all": include_if = os.listdir(f) else: include_if = open(include, 'r').read().splitlines() if f[-4:] == ".pwm": #Make PSSM for mapping motif_name = f.strip().split("/")[-1] m = motifs.read(open(f), "pfm") motif_len = len(m) pssm, consensus, threshold = make_pssm(m, ATbias, GCbias, p) hits = mapping(seq_file, pssm, threshold, consensus, motif_name, motif_len) else: for motif_file in os.listdir(f): if motif_file in include_if: motif_name = motif_file.strip().split("/")[-1] print(motif_name) path = f + motif_file m = motifs.read(open(path), "pfm") motif_len = len(m) pssm, consensus, threshold = make_pssm(m, ATbias, GCbias, p) hits = mapping(seq_file, pssm, threshold, consensus, motif_name, motif_len) else:
def line(self):
    """Plot average bias and ATAC-seq signal around motif-predicted binding sites.

    Reads the regions in ``self.motif_file`` and keeps those whose name ends
    in ``:Y``. For each kept region it accumulates, over a window of
    ``self.window_size`` positions centred on the region:

    * the raw and bias-corrected signal from ``self.bam_file`` (merged or
      per strand, depending on ``self.strands_specific``),
    * a nucleotide count matrix of the underlying genomic sequence,
    * the forward / reverse bias-table value of the k-mer at each position.

    It then writes into ``self.output_loc``, all prefixed by
    ``self.motif_name``: ``.pwm`` (count matrix), ``.logo.eps`` (sequence
    logo), ``.txt`` (averaged signals and, if ``self.protection_score`` is
    set, the protection score), ``.line.eps`` (line plot) and ``.eps``
    (plot merged with the logo). ``epstopdf`` is invoked on the three EPS
    files.

    Side effect: sets ``self.k_nb`` to the k-mer length of the bias table.

    Raises:
        ZeroDivisionError: if no region is marked ``Y`` (the averages are
            divided by the number of sites).
    """
    x_ticks = [-50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 49]

    def _style_axis(ax):
        # Shared look of every panel: ticks outside on the left/bottom only,
        # detached left spine, fixed x range and tick labels.
        ax.xaxis.set_ticks_position('bottom')
        ax.yaxis.set_ticks_position('left')
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.spines['left'].set_position(('outward', 15))
        ax.tick_params(direction='out')
        ax.set_xticks(x_ticks)
        ax.set_xticklabels([str(tick) for tick in x_ticks])
        ax.set_xlim(-50, 49)

    def _unit_yaxis(ax):
        # Signal panels are drawn on a [0, 1] y axis (after self.standardize).
        ax.set_yticks([0, 1])
        ax.set_yticklabels([str(0), str(1)], rotation=90)
        ax.set_ylim([0, 1])

    signal = GenomicSignal(self.bam_file)
    signal.load_sg_coefs(slope_window_size=9)

    # self.bias_table holds "<forward table file>,<reverse table file>".
    bias_table = BiasTable()
    bias_table_list = self.bias_table.split(",")
    table = bias_table.load_table(table_file_name_F=bias_table_list[0],
                                  table_file_name_R=bias_table_list[1])
    bias_table_f = table[0]
    bias_table_r = table[1]
    # k-mer length = length of any key of the forward table. next(iter(...))
    # works on Python 2 and 3, whereas .keys()[0] only works on Python 2.
    self.k_nb = len(next(iter(bias_table_f)))
    half_k = self.k_nb // 2

    genome_data = GenomeData(self.organism)
    # NOTE(review): assumes get_genome() is a plain accessor, so it is called
    # once here instead of once per signal query - confirm.
    genome_file = genome_data.get_genome()
    fasta = Fastafile(genome_file)

    # Keyword arguments shared by every signal query below.
    signal_kwargs = dict(downstream_ext=self.atac_downstream_ext,
                         upstream_ext=self.atac_upstream_ext,
                         forward_shift=self.atac_forward_shift,
                         reverse_shift=self.atac_reverse_shift,
                         genome_file_name=genome_file)

    pwm_dict = dict((base, [0.0] * self.window_size) for base in "ACGTN")

    mean_raw_signal = np.zeros(self.window_size)
    mean_bc_signal = np.zeros(self.window_size)
    mean_raw_signal_f = np.zeros(self.window_size)
    mean_bc_signal_f = np.zeros(self.window_size)
    mean_raw_signal_r = np.zeros(self.window_size)
    mean_bc_signal_r = np.zeros(self.window_size)
    mean_bias_signal_f = np.zeros(self.window_size)
    mean_bias_signal_r = np.zeros(self.window_size)

    num_sites = 0

    mpbs_regions = GenomicRegionSet("Motif Predicted Binding Sites")
    mpbs_regions.read_bed(self.motif_file)

    total_nc_signal = 0
    total_nl_signal = 0
    total_nr_signal = 0

    for region in mpbs_regions:
        # Only regions whose name ends in ":Y" are used.
        if str(region.name).split(":")[-1] != "Y":
            continue
        num_sites += 1

        # Window centred on the region. Floor division keeps the
        # coordinates integral on Python 3 as well as Python 2.
        mid = (region.initial + region.final) // 2
        p1 = mid - (self.window_size // 2)
        p2 = mid + (self.window_size // 2)

        if not self.strands_specific:
            # Fetch raw signal
            raw_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                              **signal_kwargs)
            mean_raw_signal = np.add(mean_raw_signal, raw_signal)

            # Fetch bias correction signal
            bc_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                             bias_table=table, **signal_kwargs)
            mean_bc_signal = np.add(mean_bc_signal, bc_signal)
        else:
            raw_signal_f, _, raw_signal_r, _ = signal.get_signal_per_strand(
                ref=region.chrom, start=p1, end=p2, **signal_kwargs)
            mean_raw_signal_f = np.add(mean_raw_signal_f, raw_signal_f)
            mean_raw_signal_r = np.add(mean_raw_signal_r, raw_signal_r)

            bc_signal_f, _, bc_signal_r, _ = signal.get_signal_per_strand(
                ref=region.chrom, start=p1, end=p2, bias_table=table, **signal_kwargs)
            mean_bc_signal_f = np.add(mean_bc_signal_f, bc_signal_f)
            mean_bc_signal_r = np.add(mean_bc_signal_r, bc_signal_r)

        # Update pwm
        # The reverse-strand window is shifted by one base when the region
        # length is odd.
        aux_plus = 1
        dna_seq = str(fasta.fetch(region.chrom, p1, p2)).upper()
        if (region.final - region.initial) % 2 == 0:
            aux_plus = 0
        dna_seq_rev = AuxiliaryFunctions.revcomp(
            str(fasta.fetch(region.chrom, p1 + aux_plus, p2 + aux_plus)).upper())
        if region.orientation == "+":
            for i, base in enumerate(dna_seq):
                pwm_dict[base][i] += 1
        elif region.orientation == "-":
            for i, base in enumerate(dna_seq_rev):
                pwm_dict[base][i] += 1

        # Create bias signal
        bias_signal_f = []
        bias_signal_r = []
        p1_wk = p1 - half_k
        p2_wk = p2 + half_k
        dna_seq = str(fasta.fetch(region.chrom, p1_wk, p2_wk - 1)).upper()
        dna_seq_rev = AuxiliaryFunctions.revcomp(
            str(fasta.fetch(region.chrom, p1_wk, p2_wk + 1)).upper())
        seq_len = len(dna_seq)
        for i in range(half_k, seq_len - half_k + 1):
            fseq = dna_seq[i - half_k:i + half_k]
            rseq = dna_seq_rev[seq_len - half_k - i:seq_len + half_k - i]
            # k-mers missing from the table (e.g. containing N) count as a
            # neutral bias of 1.
            bias_signal_f.append(bias_table_f.get(fseq, 1))
            bias_signal_r.append(bias_table_r.get(rseq, 1))
        mean_bias_signal_f = np.add(mean_bias_signal_f, np.array(bias_signal_f))
        mean_bias_signal_r = np.add(mean_bias_signal_r, np.array(bias_signal_r))

        if self.protection_score:
            # signal in the center of the MPBS
            p1 = region.initial
            p2 = region.final
            nc_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                             bias_table=table, **signal_kwargs)
            total_nc_signal += sum(nc_signal)

            # right flank, same width as the region
            p1 = region.final
            p2 = 2 * region.final - region.initial
            nr_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                             bias_table=table, **signal_kwargs)
            total_nr_signal += sum(nr_signal)

            # NOTE(review): this "left" window ends at region.final, so it
            # spans the left flank AND the region itself; by symmetry with
            # the right flank one would expect it to end at region.initial.
            # Left unchanged because it alters the reported score - confirm.
            p1 = 2 * region.initial - region.final
            p2 = region.final
            nl_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                             bias_table=table, **signal_kwargs)
            total_nl_signal += sum(nl_signal)

    # Turn the accumulated sums into per-site averages.
    mean_raw_signal = mean_raw_signal / num_sites
    mean_bc_signal = mean_bc_signal / num_sites
    mean_raw_signal_f = mean_raw_signal_f / num_sites
    mean_raw_signal_r = mean_raw_signal_r / num_sites
    mean_bc_signal_f = mean_bc_signal_f / num_sites
    mean_bc_signal_r = mean_bc_signal_r / num_sites
    mean_bias_signal_f = mean_bias_signal_f / num_sites
    mean_bias_signal_r = mean_bias_signal_r / num_sites
    protection_score = (total_nl_signal + total_nr_signal - 2 * total_nc_signal) / (2 * num_sites)

    # Output PWM and create logo
    pwm_fname = os.path.join(self.output_loc, "{}.pwm".format(self.motif_name))
    with open(pwm_fname, "w") as pwm_file:
        for base in ["A", "C", "G", "T"]:
            pwm_file.write(" ".join([str(int(f)) for f in pwm_dict[base]]) + "\n")

    logo_fname = os.path.join(self.output_loc, "{}.logo.eps".format(self.motif_name))
    with open(pwm_fname) as pwm_handle:
        pwm = motifs.read(pwm_handle, "pfm")
    pwm.weblogo(logo_fname, format="eps", stack_width="large", stacks_per_line="100",
                color_scheme="color_classic", unit_name="", show_errorbars=False, logo_title="",
                show_xaxis=False, xaxis_label="", show_yaxis=False, yaxis_label="",
                show_fineprint=False, show_ends=False)

    # Output the raw, bias corrected signal and protection score
    output_fname = os.path.join(self.output_loc, "{}.txt".format(self.motif_name))
    with open(output_fname, "w") as output_file:
        if not self.strands_specific:
            output_file.write("raw signal: \n" + np.array_str(mean_raw_signal) + "\n")
            output_file.write("bias corrected signal: \n" + np.array_str(mean_bc_signal) + "\n")
        else:
            output_file.write("raw forward signal: \n" + np.array_str(mean_raw_signal_f) + "\n")
            output_file.write("bias corrected forward signal: \n" + np.array_str(mean_bc_signal_f) + "\n")
            output_file.write("raw reverse signal: \n" + np.array_str(mean_raw_signal_r) + "\n")
            output_file.write("bias reverse corrected signal: \n" + np.array_str(mean_bc_signal_r) + "\n")
        output_file.write("forward bias signal: \n" + np.array_str(mean_bias_signal_f) + "\n")
        output_file.write("reverse bias signal: \n" + np.array_str(mean_bias_signal_r) + "\n")
        if self.protection_score:
            output_file.write("protection score: \n" + str(protection_score) + "\n")

    # Two panels (bias, signal) for merged strands; three panels (bias,
    # uncorrected, corrected) for the strand-specific plot.
    if self.strands_specific:
        fig, (ax1, ax2, ax3) = plt.subplots(3, figsize=(12.0, 10.0))
    else:
        fig, (ax1, ax2) = plt.subplots(2)
    x = np.linspace(-50, 49, num=self.window_size)

    # Panel 1: average bias signal per strand.
    ax1.plot(x, mean_bias_signal_f, color='red', label='Forward')
    ax1.plot(x, mean_bias_signal_r, color='blue', label='Reverse')
    _style_axis(ax1)
    ax1.spines['bottom'].set_position(('outward', 5))
    min_bias_signal = min(min(mean_bias_signal_f), min(mean_bias_signal_r))
    max_bias_signal = max(max(mean_bias_signal_f), max(mean_bias_signal_r))
    ax1.set_yticks([min_bias_signal, max_bias_signal])
    ax1.set_yticklabels([str(round(min_bias_signal, 2)), str(round(max_bias_signal, 2))], rotation=90)
    ax1.text(-48, max_bias_signal, '# Sites = {}'.format(str(num_sites)), fontweight='bold')
    ax1.set_title(self.motif_name, fontweight='bold')
    ax1.set_ylim([min_bias_signal, max_bias_signal])
    ax1.legend(loc="upper right", frameon=False)
    ax1.set_ylabel("Average Bias \nSignal", rotation=90, fontweight='bold')

    # Panel 2 (and 3): standardized ATAC-seq signal.
    if not self.strands_specific:
        mean_raw_signal = self.standardize(mean_raw_signal)
        mean_bc_signal = self.standardize(mean_bc_signal)
        ax2.plot(x, mean_raw_signal, color='red', label='Uncorrected')
        ax2.plot(x, mean_bc_signal, color='green', label='Corrected')
    else:
        mean_raw_signal_f = self.standardize(mean_raw_signal_f)
        mean_raw_signal_r = self.standardize(mean_raw_signal_r)
        mean_bc_signal_f = self.standardize(mean_bc_signal_f)
        mean_bc_signal_r = self.standardize(mean_bc_signal_r)
        ax2.plot(x, mean_raw_signal_f, color='red', label='Forward')
        ax2.plot(x, mean_raw_signal_r, color='green', label='Reverse')
        ax3.plot(x, mean_bc_signal_f, color='red', label='Forward')
        ax3.plot(x, mean_bc_signal_r, color='green', label='Reverse')

    _style_axis(ax2)
    _unit_yaxis(ax2)

    if not self.strands_specific:
        # Extra bottom offset leaves room for the logo inserted below the
        # last panel when the canvases are merged.
        ax2.spines['bottom'].set_position(('outward', 40))
        ax2.set_xlabel("Coordinates from Motif Center", fontweight='bold')
        ax2.set_ylabel("Average ATAC-seq \nSignal", rotation=90, fontweight='bold')
        ax2.legend(loc="center", frameon=False, bbox_to_anchor=(0.85, 0.06))
    else:
        ax2.spines['bottom'].set_position(('outward', 5))
        ax2.set_ylabel("Average ATAC-seq \n Uncorrected Signal", rotation=90, fontweight='bold')
        ax2.legend(loc="lower right", frameon=False)

        _style_axis(ax3)
        _unit_yaxis(ax3)
        ax3.legend(loc="lower right", frameon=False)
        ax3.spines['bottom'].set_position(('outward', 40))
        ax3.set_xlabel("Coordinates from Motif Center", fontweight='bold')
        ax3.set_ylabel("Average ATAC-seq \n Corrected Signal", rotation=90, fontweight='bold')
        ax3.text(-48, 0.05, '# K-mer = {}\n# Forward Shift = {}'.format(str(self.k_nb), str(self.atac_forward_shift)),
                 fontweight='bold')

    figure_name = os.path.join(self.output_loc, "{}.line.eps".format(self.motif_name))
    fig.subplots_adjust(bottom=.2, hspace=.5)
    fig.tight_layout()
    fig.savefig(figure_name, format="eps", dpi=300)

    # Creating canvas and printing eps / pdf with merged results
    output_fname = os.path.join(self.output_loc, "{}.eps".format(self.motif_name))
    c = pyx.canvas.canvas()
    c.insert(pyx.epsfile.epsfile(0, 0, figure_name, scale=1.0))
    if self.strands_specific:
        c.insert(pyx.epsfile.epsfile(2.76, 1.58, logo_fname, width=27.2, height=2.45))
    else:
        c.insert(pyx.epsfile.epsfile(2.5, 1.54, logo_fname, width=16, height=1.75))
    c.writeEPSfile(output_fname)

    # NOTE(review): paths are concatenated into a shell command unquoted;
    # output locations containing spaces or shell metacharacters will break.
    os.system("epstopdf " + figure_name)
    os.system("epstopdf " + logo_fname)
    os.system("epstopdf " + output_fname)
test_seq=Seq(str(record.seq), m.alphabet) out.write(str(record.id) + "\t" + file1 + "\t" + str(record.seq) + "\n") if(check == True): #list = open(sys.argv[7]).readlines() #print list for l in list: for filename in glob.glob("/home/ngs/vivek/python/scripts/matrices/converted/Homo_sapiens/"+ l.rstrip() +"_top10align_pfm.txt"): #print filename fname = os.path.basename(filename) fname = re.sub(r"_top10align_pfm.txt","",str(fname)) test_pwm = motifs.read(open(filename), "pfm") pwm = test_pwm.counts.normalize(pseudocounts=0.5) pssm = pwm.log_odds(background) IUPAC = test_pwm.counts.degenerate_consensus for position, score in pssm.search (test_seq, threshold = float(sys.argv[2])): print str(record.id) + "\t" + str(abs(position)) +"\t"+ str(abs(position)+len(IUPAC)) +"\t"+ str("+" if position > 0 else "-") +"\t"+ str(fname) +"\t"+ str(data[fname]) +"\t"+ str(IUPAC) +"\t"+ str(score) +"\t"+ str(file1) +"\t"+ str(now) +"\t"+ str(file1) else: test_pwm = motifs.read(open(sys.argv[7]), "pfm") #print sys.argv[7] pwm = test_pwm.counts.normalize(pseudocounts=0.5) pssm = pwm.log_odds(background) IUPAC = test_pwm.counts.degenerate_consensus
# Import import sys from Bio import motifs from glob import glob from copy import deepcopy # Reading input inList = ["./ZBT7B_M00405.pfm", "./MA0138.2.REST.pfm", "./MA0527.1.ZBTB33.pfm"] # Execution for inFileName in inList: inFile = open(inFileName, "r") outFileName = inFileName[:-3] + "pdf" pwm = motifs.read(inFile, "pfm") # Revert complement tempA = deepcopy(pwm.counts["A"][::-1]) pwm.counts["A"] = pwm.counts["T"][::-1] pwm.counts["T"] = tempA tempC = deepcopy(pwm.counts["C"][::-1]) pwm.counts["C"] = pwm.counts["G"][::-1] pwm.counts["G"] = tempC # Complement #tempA = deepcopy(pwm.counts["A"]) #pwm.counts["A"] = pwm.counts["T"] #pwm.counts["T"] = tempA #tempC = deepcopy(pwm.counts["C"]) #pwm.counts["C"] = pwm.counts["G"] #pwm.counts["G"] = tempC
import re

from Bio import SeqIO
from Bio import motifs
from Bio.Alphabet import IUPAC

# Load all ChIP sequences up front; they are scanned three times below.
chp_list = list(SeqIO.parse("srf_chip.fasta", "fasta", IUPAC.unambiguous_dna))

# 1) Total number of exact occurrences of one fixed site
#    (sequences are upper-cased first).
count = sum(dna.seq.upper().count('GCCCATATATGG') for dna in chp_list)

# 2) Number of sequences containing at least one match of the degenerate
#    pattern. The regex needs a plain string, hence str(dna.seq).
chp_match = sum(
    1 for dna in chp_list
    if re.search(r'[GT][CA]CC[AT]TATA[AT]GG', str(dna.seq))
)

# 3) PSSM scan with the JASPAR "sites" motif. The handle is closed as soon
#    as the motif has been parsed.
with open("MA0083.1.sites") as sites_handle:
    srf_m = motifs.read(sites_handle, "sites")
srf_m.pseudocounts = 1
srf_m.background = 0.4

for dna in chp_list:
    # search() expects the sequence itself, so pass the record's .seq.
    for pos, score in srf_m.pssm.search(dna.seq, threshold=7.0):
        # Parenthesised single-argument print is valid on Python 2 and 3.
        print("Position %d: score = %5.2f" % (pos, score))
#! /usr/bin/env python3
from Bio import SeqIO
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
from Bio import motifs
from Bio import SeqUtils

# --- p53 ---------------------------------------------------------------
# Build the degenerate (IUPAC) consensus of the motif and look it up in
# every lncRNA transcript. One FASTA-like header line plus one result line
# is written per transcript.
with open("sites/MA0106.1.sites") as handle:
    p53 = motifs.read(handle, "sites")
motif = p53.degenerate_consensus

with open("output/motif_result_p53.txt", "w") as f:
    for seq_record in SeqIO.parse('input/gencode.v26.lncRNA_transcripts.fa', 'fasta'):
        f.write(">" + str(seq_record.id) + "\n")
        # Search the nucleotide sequence itself: str(seq_record) would be the
        # record's multi-line summary. The pattern is the consensus computed
        # above (the previous name `m` was never defined).
        result = SeqUtils.nt_search(str(seq_record.seq), str(motif))
        f.write(str(result) + "\n")

##
# --- AGL3 --------------------------------------------------------------
# Same procedure for the second motif.
with open("sites/MA0001.1.sites") as handle:
    AGL3 = motifs.read(handle, "sites")
motif = AGL3.degenerate_consensus

with open("output/motif_result_AGL3.txt", "w") as f:
    for seq_record in SeqIO.parse('input/gencode.v26.lncRNA_transcripts.fa', 'fasta'):
        f.write(">" + str(seq_record.id) + "\n")
        result = SeqUtils.nt_search(str(seq_record.seq), str(motif))
        # Mirror the p53 block: the result was computed but never written.
        f.write(str(result) + "\n")
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC, generic_dna, generic_protein
from collections import defaultdict

#####################
## sys Inputs - to do
#####################

## read in alignment and motif
# `except Exception` instead of a bare `except:` so that Ctrl-C and
# SystemExit are not reported as a bad input file. The script now exits
# with status 1 on these errors so callers can detect the failure.
try:
    alignment = list(SeqIO.parse(sys.argv[1], "fasta"))
except Exception:
    print("ERROR This is not a fasta alignment file")
    sys.exit(1)

try:
    # Context manager: the motif file is closed even if parsing fails.
    with open(sys.argv[2]) as motif_handle:
        motif = motifs.read(motif_handle, "pfm")
except Exception:
    print("ERROR This is not a pfm file")
    sys.exit(1)

# Optional third argument: score threshold; default is -10000.
# NOTE(review): a user-supplied threshold stays a string while the default
# is an int - confirm the later consumer converts it before comparing.
try:
    threshold = sys.argv[3]
except IndexError:
    threshold = -10000

# Used later when marking output file
alignment_file_name = os.path.basename(sys.argv[1])
motif_file_name = os.path.basename(sys.argv[2])
print("alignment file: " + alignment_file_name)
print("motif file: " + motif_file_name)
#threshold= math.pow(10,-float(sys.argv[2])) # P-val, 5 means -log(n,10) out = open(out_file, 'w') out.write( "Sequence\tMotif\tMotif_consensus\tHit_Seq\tHit_position\tHit_score\tThreshold\n" ) # Specify if you only want to include certain pwm's in the directory if include == "all": include_if = os.listdir(f) else: include_if = open(include, 'r').read().splitlines() if f[-4:] == ".pwm": #Make PSSM for mapping motif_name = f.strip().split("/")[-1] m = motifs.read(open(f), "pfm") motif_len = len(m) pssm, consensus, threshold = make_pssm(m, ATbias, GCbias, p) hits = mapping(seq_file, pssm, threshold, consensus, motif_name, motif_len) else: for motif_file in os.listdir(f): if motif_file in include_if: motif_name = motif_file.strip().split("/")[-1] print(motif_name) path = f + motif_file m = motifs.read(open(path), "pfm") motif_len = len(m) pssm, consensus, threshold = make_pssm(m, ATbias, GCbias, p) hits = mapping(seq_file, pssm, threshold, consensus,
# Validate the requested repositories against the known ones.
repositories = set(repositories)
query = set(args.folders)
if not repositories.issuperset(query):
    print("ERROR: query repositories %s do not exist" % str(list(query.difference(repositories))))
    exit(1)

# From here on only the requested repositories are processed.
repositories = args.folders

print(">>> CREATING logos for", repositories)

for repo in repositories:
    dir_name = path.join(curr_dir, "motifs", repo)
    for root, _, file_list in walk(dir_name):
        # All logos of a repository go to <curr_dir>/logos/<repo>.
        output_dir = path.join(curr_dir, "logos", repo)
        if not path.exists(output_dir):
            mkdir(output_dir)
        print(">>", repo)
        for pwm_file_name in file_list:
            # Text after the last dot is the extension; only "pwm" files
            # are converted.
            stem, _sep, extension = pwm_file_name.rpartition(".")
            if extension != "pwm":
                continue
            # Join with the directory walk() is currently visiting. Joining
            # with dir_name pointed at a non-existent path for files found
            # in sub-directories.
            pwm_full_file_name = path.join(root, pwm_file_name)
            logo_file_name = path.join(output_dir, stem + ".png")
            # Context manager: the handle is closed even if parsing or
            # logo rendering raises.
            with open(pwm_full_file_name, "r") as pwm_file:
                pwm = motifs.read(pwm_file, "pfm")
                pwm.weblogo(logo_file_name, format="png_print", stack_width="medium", color_scheme="color_classic")
# Parameters: false-positive rates to evaluate, pseudocount added when
# normalizing counts, uniform nucleotide background, and the precision
# passed to the score distribution.
fprList = [0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001]
pseudocounts = 0.1
background = {'A':0.25,'C':0.25,'G':0.25,'T':0.25}
precision = 10000

# Creating output file
# Header row: "MOTIF" followed by one column per false-positive rate.
outFile = open(outFileName,"w")
header_fields = ["MOTIF"]
header_fields.extend(str(e) for e in fprList)
outFile.write("\t".join(header_fields)+"\n")

# Iterating on all PWMs
for pwmFileName in glob(inFolder+"*.pwm"):

    # Creating PSSM
    # Motif name = file name without its last extension.
    name_parts = basename(pwmFileName).split(".")
    name = ".".join(name_parts[:-1])
    with open(pwmFileName,"r") as input_file:
        pfm = motifs.read(input_file, "pfm")
        pwm = pfm.counts.normalize(pseudocounts)
    pssm = pwm.log_odds(background)
    pssm_list = [pssm[e] for e in ["A","C","G","T"]]
    distribution = pssm.distribution(background=background, precision=precision)

    # Evaluating thresholds
    # One score threshold per requested false-positive rate.
    resVec = [name] + [str(distribution.threshold_fpr(fpr)) for fpr in fprList]

    # Writing results
    outFile.write("\t".join(resVec)+"\n")
# tf_class, tf_family: structural class & family of motif
# species: as taxonomy IDs
# tax_group: taxonomic supergroup of motif
# acc: accession number of transcription factor protein
# data_type: type of data used to construct the motif
# medline: Pubmed ID or literature supporting motif
# pazar_id: reference ID to PAZAR DB
# comment: text, notes about motif

# stores motifs in 3 main formats: 2 flat files & SQL DB

# JASPAR sites format
#>ID name count
#seqseqMOTIFMOTIFseqseq
# no added meta info
with open('Arnt.sites') as handle:
    arnt = motifs.read(handle, 'sites')  # motif format 'sites'
# Single pre-formatted argument: prints the same text on Python 2 and 3
# (the old `print a, b` statement is a syntax error on Python 3).
print("%s %s" % (len(arnt.instances), arnt.instances))
print(arnt.counts)

# JASPAR pfm format
#2 9 0 1 32 4
#1 33 4 51 1 0
#9 3 10 0 0 0
#20 0 31 0 0 50
# only count profile matrix
with open('SRF.pfm') as handle:
    srf = motifs.read(handle, 'pfm')  # motif format 'pfm'
print(srf.counts)
print(srf.instances)  # direct matrix, it didn't save instances

print("%s %s" % (arnt.counts.consensus, srf.counts.consensus))
from sys import argv
import numpy as np
from Bio import motifs

# Output names are derived from everything before the FIRST dot of the
# input path.
# NOTE(review): a path such as "./matrix.txt" therefore yields an empty
# name (and writes ".pfm") - confirm whether that is intended.
name = str(argv[1]).split('.')[0]
name_pfm = name + '.pfm'
name_transfac = name + '.transfac'

# Read the numeric matrix and transpose it, so that each column of the
# input becomes one line of the pfm file.
mat = np.genfromtxt(argv[1])
mat_trans = mat.transpose()

# Each value is truncated to an int and followed by a single space; each
# row ends with a newline. Joined once instead of growing a string with +=.
t = ''.join(
    ''.join(str(int(j)) + ' ' for j in row) + '\n'
    for row in mat_trans
)

with open(name_pfm, 'w') as pfm_out:
    pfm_out.write(t)

# Re-read the file just written as a motif; the handle is closed right
# after parsing.
with open(name_pfm) as pfm_in:
    mot = motifs.read(pfm_in, 'pfm')

# Parenthesised single-argument print is valid on Python 2 and 3.
print(mot.format("transfac"))

# with open(name_transfac, 'w') as transfac_out:
#     transfac_out.write(mot.format("transfac"))