Ejemplo n.º 1
0
    def list_from_file(
        cls,
        motifs_file,
        file_format,
        threshold=None,
        pseudocounts="jaspar",
        relative_threshold=None,
    ):
        """Return a list of PSSM patterns from a file in JASPAR, MEME, etc.

        Parameters
        ----------

        motifs_file
          Path to a motifs file, or file handle.

        file_format
          File format. One of "jaspar", "meme", "TRANSFAC".

        pseudocounts
          Either a dict {"A": 0.01, "T": ...} or "jaspar" for automatic
          pseudocounts from the Biopython.motifs.jaspar module (recommended),
          or None for no pseudocounts at all (not recommended!)

        threshold
          Locations of the sequence with a PSSM score above this value will be
          considered matches. For convenience, a relative_threshold can be
          given instead.

        relative_threshold
          Value between 0 and 1 from which the threshold will be auto-computed.
          0 means "match everything", 1 means "only match the one (or several)
          sequence(s) with the absolute highest possible score".

        Returns
        -------

        A list with one ``MotifPssmPattern`` per motif found in the file.
        """
        if isinstance(motifs_file, str):
            # Open the path supplied by the caller; a hard-coded file name
            # here would silently ignore the ``motifs_file`` argument.
            with open(motifs_file, "r") as f:
                motifs_list = motifs.parse(f, format=file_format)
        else:
            # Already a file handle: the caller keeps ownership of it.
            motifs_list = motifs.parse(motifs_file, format=file_format)

        if pseudocounts is not None:
            for motif in motifs_list:
                cls.apply_pseudocounts(motif, pseudocounts)

        return [
            MotifPssmPattern(
                pssm,
                threshold=threshold,
                relative_threshold=relative_threshold,
            )
            for pssm in motifs_list
        ]
Ejemplo n.º 2
0
def process_data(data, data_type='counts', seq_type='dna'):
    """Build a frequency matrix and its information content from ``data``.

    Parameters
    ----------
    data
        For ``data_type='counts'`` the object handed to ``count_to_pfm``;
        for ``'fasta'``/``'stockholm'`` the input given to
        ``read_alignment``; for every motif format a path to the motif file.
    data_type : str
        One of ``'counts'``, ``'fasta'``, ``'stockholm'``, ``'alignace'``,
        ``'meme'``, ``'mast'``, ``'transfac'``, ``'pfm'``, ``'sites'`` or
        ``'jaspar'``.
    seq_type : str
        Forwarded to ``read_alignment`` for alignment inputs.

    Returns
    -------
    tuple
        ``(format_matrix(pfm), format_matrix(ic))``.

    Raises
    ------
    ValueError
        If ``data_type`` is not one of the supported values (previously this
        surfaced as an UnboundLocalError on ``pfm``).
    """
    if data_type == 'counts':
        pfm, total = count_to_pfm(data)
    elif data_type in ('fasta', 'stockholm'):
        data, total = read_alignment(data, data_type, seq_type)
        pfm, _ = count_to_pfm(data)
    elif data_type in ('jaspar', 'transfac'):
        # These formats can hold several motifs; only the first one is used.
        with open(data, 'r') as handle:
            motif = motifs.parse(handle, data_type.upper())[0]
        pfm = dict(motif.counts.normalize())
        total = sum(list(motif.counts.values())[0])
    elif data_type in ('alignace', 'meme', 'mast', 'pfm', 'sites'):
        with open(data, 'r') as handle:
            motif = motifs.read(handle, data_type)
        try:
            # The keyword used to be misspelled ("psuedocounts"), so this
            # call always raised and the pseudocount was never applied.
            pfm = motif.counts.normalize(pseudocounts=1)
        except Exception:
            # Best-effort fallback kept from the original implementation.
            pfm = motif.counts.normalize()
        total = motif.counts
    else:
        raise ValueError('Unsupported data_type: {!r}'.format(data_type))

    ic = calc_relative_information(pfm, total)
    return (format_matrix(pfm), format_matrix(ic))
Ejemplo n.º 3
0
def get_summary(job_id, meme_file, peaks):
    """
    Write a summary of a MEME run to ``summary.json`` and return it.

    Parameters
    ----------
    job_id: str
        Name of the job directory under ``STATIC_PATH``.
    meme_file: str
        Path to the MEME output file.
    peaks: int
        Number of peaks originally supplied.

    Returns
    -------
    summary: dict
        Keys ``motif_occurrences`` (``'motif<N>'`` -> occurrence count),
        ``original_peaks`` and ``peaks`` (capped at ``MAX_PEAKS_TO_KEEP``).

    Raises
    ------
    ValueError
        If a record has no ``num_occurrences`` attribute: the ``'Unknown'``
        fallback cannot be converted to ``int``.
    """
    summary = {
        'motif_occurrences': {},
        'original_peaks': peaks,
        'peaks': min(MAX_PEAKS_TO_KEEP, peaks),
    }
    # The parser reads the whole file, so the handle can be closed at once.
    with open(meme_file) as handle:
        records = motifs.parse(handle, 'meme')
    num_occurrences = [
        int(getattr(record, 'num_occurrences', 'Unknown'))
        for record in records
    ]

    sorted_occurences = sorted(enumerate(num_occurrences), key=lambda x: x[1])
    summary['motif_occurrences'] = {
        'motif{}'.format(index + 1): value
        for index, value in sorted_occurences
    }
    fp = os.path.join(STATIC_PATH, job_id, 'summary.json')
    with open(fp, 'w') as f:
        json.dump(summary, f)
    # Single-argument call form prints identically on Python 2 and 3.
    print(summary)
    return summary
Ejemplo n.º 4
0
def jaspar2pfm(jasparFile, outDir):
    """Split a JASPAR file into one ``<NAME>.pfm`` file per motif in outDir.

    The output file name is the upper-cased motif name with every ``:``
    replaced by ``_``; each file holds the motif in JASPAR format.
    """
    with open(jasparFile) as source:
        parsed = motifs.parse(source, "jaspar")
        for entry in parsed:
            stem = str(entry.name).replace(":", "_").upper()
            target = outDir + "/" + stem + ".pfm"
            with open(target, "w") as sink:
                sink.write(entry.format("jaspar"))
Ejemplo n.º 5
0
def main(argv):
    """Correlate a MEME motif's dominant-base counts with PhyloP scores.

    Parses ``-i/--meme``, ``-m/--motif`` (1-based motif number) and
    ``-c/--phylo`` from ``argv``, prints the Pearson correlation and saves a
    scatter plot (``<phylo>_motif<N>_scatter.png``) and a two-panel bar plot
    (``<phylo>_motif<N>_trend.png``).

    The third tab-separated column of the phylo file is read as the score.
    """
    parser = argparse.ArgumentParser(description='Process meme files')
    parser.add_argument('-i', '--meme', metavar='<meme_out>', help='Meme input file', required=True)
    parser.add_argument('-m', '--motif', metavar='<motif_no>', help='Motif number', required=True, type=int)
    parser.add_argument('-c', '--phylo', metavar='<phylo_out>', help='PhyloP conservation scores', required=True)
    parsed = parser.parse_args(argv)

    # Both inputs are read completely inside their ``with`` blocks so the
    # handles are released before any plotting starts.
    with open(parsed.meme) as handle:
        records = motifs.parse(handle, 'meme')
    record = records[parsed.motif-1]
    with open(parsed.phylo, 'r') as phylo_handle:
        phylo_scores = [
            float(line[2])
            for line in csv.reader(phylo_handle, delimiter='\t')
        ]

    # Single-argument print calls give the same output on Python 2 and 3.
    print("Motif length {}".format(record.length))
    print("phylo length {}".format(len(phylo_scores)))
    profile = position_wise_profile(record.counts, record.length)
    max_occur = find_max_occurence(profile, max_count=1)
    motif_scores = [position[0][1] for position in max_occur]
    pr = pearsonr(np.array(motif_scores), np.array(phylo_scores))
    print('Pearson correlation: {}'.format(pr))

    fig, ax = plt.subplots()
    ax = sns.regplot(y=np.array(motif_scores), x=np.array(phylo_scores), scatter=True)
    ax.set(ylabel="Count of most freq nucleotide", xlabel="PhyloP scores", title='CTCF | pearsonr = {}, p-val={}'.format(pr[0],pr[1]))
    fig.savefig('{}_motif{}_scatter.png'.format(parsed.phylo, parsed.motif))

    x = np.linspace(1,len(phylo_scores)+1,num=len(phylo_scores), endpoint=False)
    f, (ax1, ax2) = plt.subplots(2, 1)
    x1 = sns.barplot(x,y=np.array(motif_scores), ax=ax1)
    x2 = sns.barplot(x,y=np.array(phylo_scores), ax=ax2)
    x1.set(ylabel='Counts of most freq nucleotide', xlabel='Position in motif')
    x2.set(ylabel='Phylop Score', xlabel='Position in motif')
    f.tight_layout()
    f.savefig('{}_motif{}_trend.png'.format(parsed.phylo, parsed.motif))
Ejemplo n.º 6
0
def get_jaspar_motif(tfName):
    """Return the first JASPAR motif whose name equals ``tfName``.

    The comparison ignores case. The motif database path is read from the
    ``data``/``pfm_db_jaspar`` configuration entry. Returns ``None`` when no
    motif matches.
    """
    with open(Config.get("data", "pfm_db_jaspar")) as handle:
        candidates = motifs.parse(handle, "jaspar")
        wanted = str(tfName).upper()
        return next(
            (entry for entry in candidates
             if str(entry.name).upper() == wanted),
            None,
        )
Ejemplo n.º 7
0
    def fit(self, fasta_file=''):
        """Run MEME on ``fasta_file``, parse its output and build the PWMs.

        Parameters
        ----------
        fasta_file : str
            Path of the input FASTA file; must be non-empty.

        Raises
        ------
        NameError
            If ``fasta_file`` is empty.
        """
        if not fasta_file:
            raise NameError('Input fasta file not specified')

        cmd_params = self._make_param_string()
        self._command_exec(fasta_file, cmd_params)

        # Parse the MEME output file with Biopython; the ``with`` block
        # releases the handle even when parsing fails.
        filename = os.path.join(self.output_dir, 'meme.txt')
        with open(filename) as handle:
            record = motifs.parse(handle, 'meme')

        # store names of sequences given as input to fit()
        self.seq_names = record.sequences[:]
        self.n_seqs = len(record.sequences)

        # one parsed motif object per requested motif
        self.motives_db = [record[i] for i in range(self.nmotifs)]

        # store length, number of occurrences and e-value of each motif
        self._get_stats(self.nmotifs)

        # get string representation of motives
        self.motives_list = list(self._get_motives_list())

        # create PWMs from a copy so the parent cannot mutate our list
        motives_list = self.motives_list[:]
        super(Meme, self).fit(motives=motives_list)
Ejemplo n.º 8
0
Archivo: meme.py Proyecto: saketkc/moca
def read_memefile(meme_file):
    """Summarise a MEME output file.

    Parameters
    ----------
    meme_file: str
        Location of MEME file

    Returns
    -------
    summary: dict
        A summary containing the following details:
            - motif_occurrences: dict with the number of times each motif
              occurs, keyed by 'motif1', 'motif2', etc.
            - total_motifs: number of motif records in the file
            - motif_records: the parsed Biopython motif records
            - bg_frequencies: result of ``get_motif_bg_freq(meme_file)``

    Raises
    ------
    ValueError
        If a record has no ``num_occurrences`` attribute: the ``'Unknown'``
        fallback cannot be converted to ``int``.
    """
    summary = {}
    summary['motif_occurrences'] = {}
    # The parser consumes the file eagerly, so close the handle right away
    # instead of leaking it.
    with open(os.path.abspath(meme_file)) as handle:
        records = motifs.parse(handle, 'meme')
    summary['total_motifs'] = len(records)
    num_occurrences = [
        int(getattr(record, 'num_occurrences', 'Unknown'))
        for record in records
    ]

    sorted_occurences = sorted(enumerate(num_occurrences), key=lambda x: x[1])
    summary['motif_occurrences'] = {
        'motif{}'.format(index + 1): value
        for index, value in sorted_occurences
    }
    summary['motif_records'] = records
    # Background frequencies are read separately by a project helper.
    summary['bg_frequencies'] = get_motif_bg_freq(meme_file)
    return summary
Ejemplo n.º 9
0
def main(motif_file, motif_outfile, d_th, pc, bp, ow, fpr, pe):
    """Compute a score threshold per JASPAR motif and write them out.

    Parameters
    ----------
    motif_file : str
        Path of the JASPAR motif file to read.
    motif_outfile, d_th, ow
        Forwarded unchanged to ``motif.get_put_motifs``.
    pc
        Pseudocounts passed to ``counts.normalize``.
    bp : sequence
        Background frequencies in the order A, C, T, G.
    fpr : float
        False-positive rate passed to ``threshold_fpr``.
    pe : int
        Exponent of the score-distribution precision (``10 ** pe``).
    """
    thresholds = []
    background = {'A': bp[0], 'C': bp[1], 'T': bp[2], 'G': bp[3]}
    print("Baseline nucleotide frequencies:\n\t" + str(background))

    print("Calculating thresholds (" + timeString() + "). This could take a while.")
    sys.stdout.flush()
    idx = 0
    print_exponent = 1

    # Calculate thresholds using biopython. The handle is closed before
    # get_put_motifs() is called on the same path further down.
    with open(motif_file) as fh:
        for m in motifs.parse(fh, "jaspar"):
            pwm = m.counts.normalize(pseudocounts=pc)    # dictionary-like representation
            pssm = pwm.log_odds(background)              # log-odds vs background
            # Building the score distribution is the slow step.
            # YYY - JA - 05/05/2017 - This could be sped up by using TFM-PVALUE's C++
            # functions for determining thresholds. May consider implementing: it
            # calculates precise p-values much more quickly and can also generate
            # p-values from a score.
            distribution = pssm.distribution(background=background, precision=10 ** pe)
            thresholds.append(distribution.threshold_fpr(fpr))
            # report progress at 10, 100, 1000, ... motifs
            idx += 1
            if idx >= 10 ** print_exponent:
                print(str(idx) + " thresholds calculated... at " + timeString())
                print_exponent += 1
                sys.stdout.flush()

    print("Total motifs read: " + str(len(thresholds)))

    print("Outputing thresholds")
    motif.get_put_motifs(motif_file, motif_outfile, d_th, ow, thresholds)

    print("Done (" + timeString() + ")")
Ejemplo n.º 10
0
Archivo: ESO.py Proyecto: NivAmi/iGEM
def suspect_site_extractor(example_seq, compute_motifs, num_sites, motifs_path, extension=''):
    """Collect recombination, slippage and (optionally) motif site tables.

    Returns a dict keyed by ``'df_recombination' + extension``,
    ``'df_slippage' + extension`` and, only when ``compute_motifs`` equals
    ``True``, ``'df_motifs' + extension``.
    """
    recombination_sites = find_recombination_sites(str(example_seq).upper(), num_sites)
    print('finished finding RMD sites')

    slippage_sites = find_slippage_sites(str(example_seq).upper(), num_sites)
    print('finished finding SSR sites')

    collected = {
        'df_recombination' + extension: recombination_sites,
        'df_slippage' + extension: slippage_sites,
    }

    # Motif search is opt-in; the exact ``== True`` comparison is kept so
    # that other truthy values keep skipping it.
    if compute_motifs == True:
        with open(motifs_path, "r") as motif_handle:
            parsed_motifs = motifs.parse(motif_handle, "minimal")

        motif_sites = find_motif_sites(example_seq, num_sites, parsed_motifs)
        print('finished finding motif sites')
        collected['df_motifs' + extension] = motif_sites

    return collected
Ejemplo n.º 11
0
def motifs_list(jasp_motifs_file):
    """Return every motif of the JASPAR file ``jasp_motifs_file`` as a list.

    The file is opened in a ``with`` block so the handle is released even
    if parsing raises.
    """
    with open(jasp_motifs_file, 'r') as jasp_motifs:
        return list(motifs.parse(jasp_motifs, "jaspar"))
Ejemplo n.º 12
0
def tffm_from_meme(meme_output, kind, name="TFFM"):
    """
    Construct a TFFM from the output of MEME on ChIP-seq data.

    :arg meme_output: File containing the output of MEME.
    :type meme_output: str
    :arg kind: Type of TFFM to construct between '1st-order' and 'detailed'.
        Any other value builds a 0-order HMM.
    :type kind: str
    :arg name: Name of the TFFM (default: "TFFM")
    :type name: str

    :returns: The TFFM initialized from MEME results.
    :rtype: :class:`TFFM`

    :note: As the PFM is used to initialize the TFFM, a pseudocount of 1 is
        added to all the values in the PFM

    :note: The process exits through ``sys.exit`` when the MEME record does
        not use the unambiguous DNA alphabet.

    """

    # Parse inside a ``with`` block so the MEME file handle is not leaked.
    with open(meme_output) as handle:
        record = motifs.parse(handle, 'MEME')
    if record.alphabet != IUPAC.unambiguous_dna:
        sys.exit("### Wrong alphabet used in MEME ###")
    # Only the first motif reported by MEME seeds the model.
    motif = record[0]
    nb_seq, nb_res, first_letters = utils.get_sequences_info(record.datafile)
    if kind == TFFM_KIND.FIRST_ORDER:
        hmm = create_1storder_hmm(nb_seq, nb_res, first_letters, motif)
    elif kind == TFFM_KIND.DETAILED:
        hmm = create_detailed_hmm(nb_seq, nb_res, first_letters, motif)
    else:  # 0-order HMM here
        hmm = create_0order_hmm(nb_seq, nb_res, first_letters, motif)
    return TFFM(hmm.emissionDomain, hmm.distribution, hmm.cmodel, kind, name)
Ejemplo n.º 13
0
    def __init__(self, fasta, motifs_input, bg=None):
        """Load JASPAR motifs and index the genome FASTA.

        ``motifs_input`` is a path to a JASPAR file, ``fasta`` the genome
        passed to ``pyfasta.Fasta`` and ``bg`` is stored unchanged.
        """
        self.all_motifs = []
        with open(motifs_input, "r") as motif_handle:
            parsed_motifs = motifs.parse(motif_handle, "jaspar")
            self.all_motifs = list(parsed_motifs)

        def first_token(header):
            # Long sequence headers: keep only the text before the first space.
            return header.split()[0]

        self.genome_seq = pyfasta.Fasta(fasta, key_fn=first_token)
        self.bg = bg
Ejemplo n.º 14
0
def read_jaspar_pwms(file='jaspar/pfm_vertebrates.txt', dir='d:/sequence'):
    """Map upper-cased motif names to their log-odds matrices.

    The motif file is ``os.path.join(dir, file)``. Only the part of each
    motif name before the first ``:`` is used as key, so a later motif with
    the same key replaces an earlier one.
    """
    jaspar = {}
    with open(os.path.join(dir, file), 'r') as handle:
        for entry in motifs.parse(handle, 'jaspar'):
            key = entry.name.strip().split(':')[0].upper()
            jaspar[key] = entry.counts.normalize().log_odds()
    return jaspar
def process_meme_output(meme_out_folder, pfm_filename):
    """
    Convert ``<meme_out_folder>/meme.txt`` into a JASPAR-formatted file.

    The MEME result is parsed with the 'MINIMAL' format and all motifs are
    written to ``pfm_filename`` (opened in 'w+' mode).
    """
    meme_path = f'{meme_out_folder}/meme.txt'
    with open(meme_path) as meme_handle:
        parsed = motifs.parse(meme_handle, 'MINIMAL')
        with open(pfm_filename, 'w+') as pfm_handle:
            jaspar_text = motifs.write(parsed, 'jaspar')
            pfm_handle.write(jaspar_text)
Ejemplo n.º 16
0
def read_pfm(jaspar_motifs_file, tf_name):
    """Return the first motif named exactly ``tf_name``, or ``None``.

    The name comparison is case-sensitive.
    """
    with open(jaspar_motifs_file) as handle:
        candidates = motifs.parse(handle, "jaspar")
        return next(
            (entry for entry in candidates if entry.name == tf_name),
            None,
        )
Ejemplo n.º 17
0
def getmotiflist(pwmfilename, filetype="TRANSFAC", prefix=None):
    """Return the motifs of ``pwmfilename``, optionally filtered by ID prefix.

    Parameters
    ----------
    pwmfilename : str
        Path of the motif file.
    filetype : str
        Format name handed to ``motifs.parse``.
    prefix : str or None
        When given, only motifs whose stripped ``'ID'`` entry starts with
        this prefix are returned; ``None`` keeps every motif.

    Returns
    -------
    list
        The selected motif objects, in file order.
    """
    with open(pwmfilename, 'r') as mhandle:
        motiflist = motifs.parse(mhandle, filetype)
    # Identity test: ``None`` is the "no filter" sentinel.
    if prefix is None:
        return list(motiflist)
    return [
        motifpwm for motifpwm in motiflist
        if motifpwm['ID'].strip().startswith(prefix)
    ]
Ejemplo n.º 18
0
def pwm2pval(motifName, seq):
    """Print the PWM score of ``seq`` for a JASPAR motif and its p-value.

    The motif named ``motifName`` (case-insensitive) is looked up in the
    JASPAR database configured under ``data``/``pfm_db_jaspar``. Its
    log-odds matrix is shifted to be non-negative, scaled to integers in
    ``[0, C_PRECISION]`` and used to count, by dynamic programming, how many
    sequences reach each integer total score. The score of ``seq`` and the
    value returned by ``score2pval`` are printed; nothing is returned.

    Raises
    ------
    ValueError
        If no motif with that name exists (previously an unbound-variable
        error on ``pwm``).
    """
    with open(Config.get("data", "pfm_db_jaspar")) as handle:
        for m in motifs.parse(handle, "jaspar"):
            if str(m.name).upper() == str(motifName).upper():
                ppm = m.counts.normalize(pseudocounts=C_PSEUDOCOUNTS)
                pwm = ppm.log_odds(background=C_BACKGROUND)
                writePWM(m)
                break  # stop at the first matching motif
        else:
            raise ValueError(
                "Motif {!r} not found in JASPAR database".format(motifName))
    # Single-argument print calls behave the same on Python 2 and 3.
    print("PWM:")
    print(pwm)

    motif_len = len(m)

    # Copy the raw PWM scores into a 4 x L array, rows ordered A, C, G, T.
    scaled_pwm = np.zeros(shape=(4, motif_len))
    for i, nt in enumerate(['A', 'C', 'G', 'T']):
        scaled_pwm[i] = pwm[nt]

    # Subtract the minimum score to make the matrix non-negative.
    scale_const = np.min(scaled_pwm)
    nonneg_pwm = scaled_pwm - scale_const

    # Scale so the largest entry equals C_PRECISION, then round to integers.
    scale_factor = (C_PRECISION / np.max(nonneg_pwm))
    scaled_pwm = np.rint(nonneg_pwm * scale_factor).astype(int)

    print("Scaled PWM:")
    print(scaled_pwm)

    # score_distribution[j, s]: number of prefixes of length j + 1 whose
    # summed scaled score is s. ``int`` replaces the removed ``np.int`` alias.
    score_distribution = np.zeros(
        shape=(motif_len, motif_len * C_PRECISION + 1), dtype=int)
    print(np.shape(score_distribution))

    # First position: one prefix per nucleotide.
    for i in range(4):
        score_distribution[0, scaled_pwm[i, 0]] += 1

    # Remaining positions: extend every reachable score by each nucleotide.
    for j in range(1, motif_len):
        print("MOTIF pos: {}".format(j))
        for k in scaled_pwm[:, j]:
            for idx, count in enumerate(score_distribution[j - 1, :]):
                if count > 0:
                    score_distribution[j, idx + k] += count

    scaled_score = int(seq2score(seq, scaled_pwm))
    raw_score = seq2score(seq, pwm)
    print("scaled score: {}".format(scaled_score))
    print("raw score: {}".format(raw_score))
    pval = score2pval(scaled_score, score_distribution)
    print("pval: {}".format(pval))
Ejemplo n.º 19
0
def build_pssm(meme):
    """Return the PSSM of the first motif in a minimal-MEME file.

    The motif's background is assigned as its pseudocounts before the PSSM
    is read.
    """
    with open(meme) as handle:
        parsed = motifs.parse(handle, 'minimal')

    first_motif = parsed[0]
    first_motif.pseudocounts = first_motif.background
    return first_motif.pssm
Ejemplo n.º 20
0
def get_motif(pcm_file):
    """Return the first motif parsed from ``pcm_file``.

    Returns ``None`` when the file holds no motif, or when it cannot be
    read; in the latter case the IOError is logged.
    """
    try:
        with open(pcm_file) as handle:
            parsed = motifs.parse(handle, HOCOMOCO_PARSE_FORMAT)
            has_motifs = parsed is not None and len(parsed) > 0
            return parsed[0] if has_motifs else None
    except IOError as err:
        logger.error(ERROR_MSG_IO.format(pcm_file, err.strerror))
        return None
Ejemplo n.º 21
0
 def create_pfm_from_meme(self, memexml, nameforpfm, tf_name, species_name):
     """Write the first MEME motif's counts to ``nameforpfm`` as a PFM.

     The output starts with a ``>tf_name<TAB>species_name`` header followed
     by one ``BASE  [ v1  v2 ... ]`` row for each of the four rows that
     follow the header line of the printed counts matrix; values are
     rounded to integers.
     """
     with open(memexml) as handle:
         parsed = motifs.parse(handle, "meme")
     counts_text = str(parsed[0].counts)
     with open(nameforpfm, 'w') as out:
         out.write('>{}\t{}\n'.format(tf_name, species_name))
         # Row 0 of the printed matrix is the position header; rows 1-4 are
         # the per-base count rows.
         for row in counts_text.split('\n')[1:5]:
             base, raw_values = row.strip().split(':')
             rounded = [
                 str(round(float(token)))
                 for token in raw_values.split(' ')
                 if token != ''
             ]
             out.write('{}  [ {} ]\n'.format(base, '  '.join(rounded)))
Ejemplo n.º 22
0
def get_chipseq_ranges(chip_fimo, chip_all, motif_file):
    """Return ``(start, end)`` windows around every ChIP-seq position.

    Each window spans the motif length plus 10 on both sides of a value in
    the ``Position`` column of ``chip_all``. The motif is chosen by the
    (1-based) id taken from the ``#pattern name`` column of ``chip_fimo``.
    """
    chip_all_data = pd.read_csv(chip_all)
    chip_fimo_data = pd.read_table(chip_fimo)
    # set.pop() returns an arbitrary element of the distinct pattern names.
    motif_id = set(chip_fimo_data['#pattern name']).pop()
    with open(motif_file, 'r') as handle:
        parsed = motifs.parse(handle, 'MEME')
    mot_len = int(parsed[motif_id - 1].length)
    return [
        (pos - mot_len - 10, pos + mot_len + 10)
        for pos in list(chip_all_data['Position'])
    ]
Ejemplo n.º 23
0
    def __init__(self, ref_path, bg=None):
        """Load motifs and genome sequence for the reference at ``ref_path``.

        ``all_motifs`` stays empty when the reference has no motif file;
        ``bg`` is stored unchanged.
        """
        ref_manager = ReferenceManager(ref_path)
        self.all_motifs = []
        motifs_path = ref_manager.motifs
        if motifs_path is not None:
            with open(motifs_path, "r") as motif_handle:
                parsed_motifs = motifs.parse(motif_handle, "jaspar")
                self.all_motifs = list(parsed_motifs)

        def first_token(header):
            # Long sequence headers: keep only the text before the first space.
            return header.split()[0]

        self.genome_seq = pyfasta.Fasta(ref_manager.fasta, key_fn=first_token)
        self.bg = bg
Ejemplo n.º 24
0
def build_motif_db():
    """Build ``{motif name: (pssm, threshold)}`` from ``jaspar_curated.pfm``.

    Each motif is normalised with fixed pseudocounts, converted to log-odds
    against a fixed background and given the score threshold that
    corresponds to a false-positive rate of 0.001.
    """
    motif_dict = {}
    background = {'A': 0.3, 'C': 0.2, 'T': 0.3, 'G': 0.2}
    pseudocounts = {'A': 0.6, 'C': 0.4, 'T': 0.6, 'G': 0.4}
    # ``with`` closes the database file, which was previously left open.
    with open("jaspar_curated.pfm") as jf:
        for m in motifs.parse(jf, "jaspar"):
            pwm = m.counts.normalize(pseudocounts=pseudocounts)
            pssm = pwm.log_odds(background)
            distribution = pssm.distribution(background=background, precision=10**3)
            threshold = distribution.threshold_fpr(0.001)
            motif_dict[m.name] = (pssm, threshold)
    return motif_dict
Ejemplo n.º 25
0
def load_meme(filename):
    """
    Parse a MEME output file with Biopython.

    The records are sorted in place by ascending e-value and then passed
    through ``convert`` with a gapped extended-IUPAC DNA alphabet.
    """
    gapped_alphabet = Gapped(ExtendedIUPACDNA(), '-')

    with open(filename, "r") as handle:
        parsed = motifs.parse(handle, "MEME")

    parsed.sort(key=lambda entry: entry.evalue)  # lowest e-value first
    return convert(parsed, gapped_alphabet)
Ejemplo n.º 26
0
def load_meme(filename):
    """
    Read MEME results from ``filename`` using Biopython.

    Returns the output of ``convert`` applied to the parsed records (sorted
    in place by ascending e-value) and a gapped extended-IUPAC DNA alphabet.
    """
    alphabet_with_gaps = Gapped(ExtendedIUPACDNA(), '-')

    with open(filename, "r") as meme_handle:
        meme_records = motifs.parse(meme_handle, "MEME")

    def by_evalue(item):
        return item.evalue

    meme_records.sort(key=by_evalue)
    return convert(meme_records, alphabet_with_gaps)
Ejemplo n.º 27
0
def MEMERIS(args):
    """Return a DataFrame of motif-instance locations from MEMERIS results.

    Reads ``results.txt`` in ``args.indir`` (MEME format, non-strict) and
    returns ``start``/``end`` columns indexed by the sequence names produced
    by ``extendSeqName`` with ``sequences.fa`` from the same directory.
    """
    from Bio import motifs
    hits = []
    fastaPath = os.path.join(args.indir, "sequences.fa")
    with open(os.path.join(args.indir, 'results.txt')) as handle:
        for found in motifs.parse(handle, 'MEME', strict=False):
            for occurrence in found.instances:
                # shift the reported start by one
                begin = occurrence.start - 1
                stop = begin + len(str(occurrence))
                hits.append((occurrence.sequence_name, begin, stop))
    locations = pd.DataFrame.from_records(hits)
    locations.columns = ["sequence_id", "start", "end"]
    locations = locations.set_index("sequence_id")
    nameMapping = pd.Series(extendSeqName(list(locations.index), fastaPath))
    locations.index = nameMapping.loc[locations.index]
    return locations
Ejemplo n.º 28
0
def motif_search_calc(input_jaspar, lincRNA_seq, output_motif):
    """Scan every lincRNA with every JASPAR motif and report the hits.

    For each PSSM hit scoring at least 7.0, the hit position divided by the
    transcript length (second-to-last ``|``-separated field of the FASTA id)
    is appended to ``all_positions`` and a two-line report is written to
    ``output_motif``.

    The report file is opened once for the whole scan. It used to be
    reopened in ``"w"`` mode for every hit, which truncated it each time and
    left only the last hit on disk. As a consequence the file is now created
    even when there are no hits.

    NOTE(review): ``all_positions`` is not defined in this function; it is
    presumably a module-level list - confirm against the rest of the file.

    Returns
    -------
    The ``all_positions`` collection.
    """
    with open(input_jaspar, "r") as fm, open(output_motif, "w") as out:
        for m in motifs.parse(fm, "jaspar"):
            # Re-open the FASTA for each motif so every motif sees all
            # sequences.
            with open(lincRNA_seq, "r") as lincs:
                for lincseq in SeqIO.parse(lincs, 'fasta', alphabet=IUPAC.unambiguous_dna):
                    d = lincseq.id.split("|")
                    for pos, score in m.pssm.search(lincseq.seq, threshold=7.0):
                        # abs(): hits may be reported with negative positions
                        position = abs(pos)
                        final_position = (int(position)/int(d[-2]))
                        all_positions.append(final_position)
                        out.write("For lincRNA" + "\t" + str(d[0]) + "\t" + str(d[1]) + "\t" + "total length:" + "\t" + str(d[-2]) + "\n")
                        out.write("Motif" + "\t" + str(m.name) + "\t" + "binds at position" + "\t" + str(pos) + "\t" + " with score: " + "\t" + str(score) + "\n")

    return all_positions
Ejemplo n.º 29
0
def plot_pwm(filename, output=None):
    """
    Plots a mononuc probability matrix from a JASPAR file
    Caution: only ONE motif in the file !!!
    The output must be given as the name of a PDF file, otherwise it is
    derived from the input name (base name up to the first dot + ".pdf").

    Uses WebLogo through Biopython, which requires an internet connection.
    A failure to plot is reported on stdout and does not raise.
    Always returns 0.
    """
    if output is None:
        output = filename.split("/")[-1].split(".")[0] + ".pdf"
    # ``with`` closes the file even if parsing fails part-way through.
    with open(filename) as fh:
        for m in motifs.parse(fh, "jaspar"):
            try:
                m.weblogo(output, format="PDF")
            except Exception:
                # Best effort: report and continue. ``Exception`` (not a bare
                # except) so Ctrl-C and SystemExit still propagate.
                print(("ERROR trying to plot the sequence logo using the online website WebLogo (http://weblogo.berkeley.edu/). A possible cause is the absence of a working internet connection, otherwise the pwm file %s may be corrupted. If you wish to plot the sequence logo, consider using the alternate website STAMP (http://www.benoslab.pitt.edu/stamp/)") % filename)
    return 0
Ejemplo n.º 30
0
def motif_identifier(tf, meme_folder):
    """Return the last character of the best motif name in a MEME run.

    Reads ``<meme_folder>/meme.txt``; the best motif is the one whose record
    has the lowest e-value. Returns ``'1'`` when the file cannot be parsed
    (ValueError). ``tf`` is currently unused.
    """
    meme_path = meme_folder + '/meme.txt'
    with open(meme_path, 'r') as handle:
        try:
            parsed = motifs.parse(handle, 'MEME')
        except ValueError:
            return '1'

    # One (motif name, record e-value) pair per motif instance.
    scored = []
    for entry in parsed:
        entry_evalue = entry.evalue
        for occurrence in entry.instances:
            scored.append((occurrence.motif_name, entry_evalue))

    # Stable sort: ties keep file order, as before.
    scored.sort(key=lambda pair: pair[1])
    best_name = scored[0][0]
    return best_name[-1]
Ejemplo n.º 31
0
def find_pwm_hits(narrow_peak, reference, pfm, output, treat_cov):
    """
    Score every peak against a position frequency matrix and re-centre it.

    Args:
        narrow_peak (str) - path to the narrowPeak file output by MACS2
        reference (str) - file path to the reference genome
        pfm (str) - file path to the position frequency matrix
        output (str) - prefix for the output files
            (``<output>_centeredpeaks.bed`` / ``.fasta``)
        treat_cov - forwarded unchanged to ``recenter_peak``
    """

    with open(narrow_peak, "r") as peaks, open(reference, "r") as ref:
        # Index the reference genome by record id.
        ref_seq = {}
        for record in SeqIO.parse(ref, "fasta", alphabet=IUPAC.unambiguous_dna):
            ref_seq[record.id] = record

        # Only the first matrix in the file is used.
        with open(pfm, "r") as pfm_handle:
            matrix = motifs.parse(pfm_handle, "jaspar")[0]
            pssm = matrix.counts.normalize(pseudocounts=.5).log_odds()

        bed_path = output + "_centeredpeaks.bed"
        fasta_path = output + "_centeredpeaks.fasta"
        with open(bed_path, "w") as out_bed, open(fasta_path, "w") as out_fasta:
            for peak in peaks:
                fields = peak.strip().split("\t")
                peak_chrom = fields[0]
                peak_start = int(fields[1])
                peak_end = int(fields[2])
                seq = ref_seq[peak_chrom].seq[peak_start:peak_end]

                # Best-scoring hits first.
                hits = sorted(
                    pssm.search(seq),
                    key=lambda hit: hit[1],
                    reverse=True,
                )

                recenter_peak(out_bed, out_fasta, ref_seq, peak_chrom,
                              peak_start, peak_end, 100, hits, matrix,
                              treat_cov)
Ejemplo n.º 32
0
def motif_search(input_jaspar, lincRNA_seq):
    """Record, per lincRNA, the positions hit by any JASPAR motif.

    Every sequence in ``lincRNA_seq`` is scanned with every motif of
    ``input_jaspar``; absolute hit positions are appended to
    ``motif_positions[linc_id]``, where ``linc_id`` is the first
    ``|``-separated field of the FASTA id with quotes and spaces stripped.

    The FASTA records are loaded into a list once. Iterating the open
    handle inside the motif loop exhausted it after the first motif, so
    every later motif was silently scanned against nothing.

    NOTE(review): ``motif_positions`` is not defined in this function; it is
    presumably a module-level dict - confirm against the rest of the file.

    Returns
    -------
    The ``motif_positions`` mapping (also printed).
    """
    with open(input_jaspar, "r") as fm, open(lincRNA_seq, "r") as lincs:
        sequences = list(
            SeqIO.parse(lincs, 'fasta', alphabet=IUPAC.unambiguous_dna))
        for m in motifs.parse(fm, "jaspar"):
            for lincseq in sequences:
                linc_id = lincseq.id.split("|")[0].strip("\" ")
                for pos, _score in m.pssm.search(lincseq.seq):
                    motif_positions.setdefault(linc_id, []).append(
                        int(abs(pos)))

    print(motif_positions)
    return motif_positions
Ejemplo n.º 33
0
def get_summary(job_id, meme_file, peaks):
    """
    Write a summary of a MEME run to ``summary.json`` and return it.

    Parameters
    ----------
    job_id: str
        Name of the job directory under ``STATIC_PATH``.
    meme_file: str
        Path to the MEME output file.
    peaks: int
        Number of peaks originally supplied.

    Returns
    -------
    summary: dict
        Keys ``motif_occurrences`` (``'motif<N>'`` -> occurrence count),
        ``original_peaks`` and ``peaks`` (capped at ``MAX_PEAKS_TO_KEEP``).

    Raises
    ------
    ValueError
        If a record has no ``num_occurrences`` attribute: the ``'Unknown'``
        fallback cannot be converted to ``int``.
    """
    summary = {
        'motif_occurrences': {},
        'original_peaks': peaks,
        'peaks': min(MAX_PEAKS_TO_KEEP, peaks),
    }
    # Parse inside ``with`` so the MEME file handle is closed, not leaked.
    with open(meme_file) as handle:
        records = motifs.parse(handle, 'meme')
    num_occurrences = [
        int(getattr(record, 'num_occurrences', 'Unknown'))
        for record in records
    ]

    sorted_occurences = sorted(enumerate(num_occurrences), key=lambda x: x[1])
    summary['motif_occurrences'] = {
        'motif{}'.format(index + 1): value
        for index, value in sorted_occurences
    }
    fp = os.path.join(STATIC_PATH, job_id, 'summary.json')
    with open(fp, 'w') as f:
        json.dump(summary, f)
    # Single-argument call form prints identically on Python 2 and 3.
    print(summary)
    return summary
Ejemplo n.º 34
0
def build_pssm(meme):
    """Return the PSSM of the first motif plus score statistics and cutoffs.

    The motif's background is assigned as its pseudocounts before the PSSM
    is computed.

    Returns
    -------
    tuple
        ``(pssm, mean, std, max_value, cutoff_1, cutoff_2, cutoff_3,
        cutoff_4)`` where the cutoffs are ``mean - std``, ``mean - 2*std``,
        ``mean + std`` and ``mean + 2*std``, each replaced by 0 when
        negative.
    """
    with open(meme) as f:
        record = motifs.parse(f, 'minimal')

    motif = record[0]
    motif.pseudocounts = motif.background
    # Compute the PSSM once and reuse it instead of re-reading ``motif.pssm``
    # for every statistic.
    pssm = motif.pssm

    mean = pssm.mean()
    std = pssm.std()
    max_value = pssm.max

    def _clamp_at_zero(value):
        # Same result as the former ``[value, 0][value < 0]`` index trick.
        return 0 if value < 0 else value

    cutoff_1 = _clamp_at_zero(mean - std)
    cutoff_2 = _clamp_at_zero(mean - 2 * std)
    cutoff_3 = _clamp_at_zero(mean + std)
    cutoff_4 = _clamp_at_zero(mean + 2 * std)
    return pssm, mean, std, max_value, cutoff_1, cutoff_2, cutoff_3, cutoff_4
def get_motifscores(df):
    """Score motif hits upstream of, inside and downstream of each intron.

    For every motif in ``pwm_transposed.txt`` whose name is the ``__``
    prefix of a column in the pickled selection table, each row of ``df``
    is scanned and three sums of hit scores are stored in the columns
    ``<name>__score_motifs_up`` / ``_alt`` / ``_down``.

    ``df`` must provide ``sequence``, ``intronstart`` and ``intronend``
    (indexed by the same labels as ``df.index``).

    Returns the result restricted to the columns of the selection table.
    """
    with open('../additional/ATtRACT/pwm_transposed.txt', 'r') as f:
        records = parse(f, 'jaspar')
    # NOTE(review): read_pickle unpickles arbitrary objects; only load
    # trusted files from this path.
    Xs_sel = pd.read_pickle(
        '/net/mraid08/export/genie/Runs/Martin/ATtRACT/irmotifs_scores.pkl')
    # Unique motif names = column-name prefixes before the first '__'.
    mtfs = []
    for i in Xs_sel.columns:
        if i.split('__')[0] not in mtfs:
            mtfs.append(i.split('__')[0])

    def find_motifs_ir(varid):
        # Reads ``mm`` from the enclosing loop below, i.e. the motif that is
        # current at call time. Returns [up, alt, down] score sums.
        motifs = pd.Series()
        for pos, seq in mm.counts.log_odds().search(Seq(df.sequence[varid], \
                        alphabet=IUPAC.IUPACUnambiguousDNA()), threshold=0, both=False):
            # despite its name, ``seq`` is the hit score yielded by search()
            motifs.loc[pos] = seq
        motifs_up = motifs[motifs.index < df.intronstart[varid]]
        motifs_alt = motifs[(motifs.index > df.intronstart[varid])
                            & (motifs.index < df.intronend[varid])]
        motifs_down = motifs[motifs.index > df.intronend[varid]]

        return list([motifs_up.sum(), motifs_alt.sum(), motifs_down.sum()])

    junirmotifs = pd.DataFrame(index=df.index)

    # ``database`` is only referenced by the commented-out filter below.
    database = pd.read_table('../additional/ATtRACT/ATtRACT_db.txt')
    database.drop_duplicates('Matrix_id', inplace=True)
    database.set_index('Matrix_id', inplace=True)

    for mm in records:
        #        if (mm.name in database[database.Organism=='Homo_sapiens'].index):
        if (mm.name in mtfs):
            # Re-class the counts so that log_odds() is available on them.
            # NOTE(review): presumably the file already holds weights rather
            # than raw counts - confirm.
            mm.counts.__class__ = matrix.PositionWeightMatrix
            # Temporary 'motifs' column holds the [up, alt, down] list per
            # row; it is expanded into three columns and then dropped.
            junirmotifs['motifs'] = df.index.map(lambda x: find_motifs_ir(x))
            junirmotifs[[str(mm.name) + '__score_motifs_up',str(mm.name) + '__score_motifs_alt',\
                          str(mm.name) +  '__score_motifs_down']]=junirmotifs.motifs.apply(lambda x: pd.Series(x))
            junirmotifs.drop('motifs', axis=1, inplace=True)

    return junirmotifs[Xs_sel.columns]
Ejemplo n.º 36
0
def motif_format_checker(motif_infile):
    """Validate that ``motif_infile`` is a well-formed JASPAR motif file.

    Returns ``None`` immediately when ``motif_infile`` is ``None``.
    Otherwise the process exits through ``sys.exit`` with a message when

    * the file cannot be parsed as JASPAR,
    * a header line (starting with ``>``) contains a tab, or
    * the number of header lines differs from the number of parsed motifs.

    Nothing is returned on success.
    """
    if motif_infile is None:
        return None

    # Imported only when there is something to check.
    from Bio import motifs

    try:
        with open(motif_infile) as handle:
            all_motifs = motifs.parse(handle, "jaspar")
    except Exception:
        # ``Exception`` rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not rewritten into a format error.
        sys.exit("Motif file is not in JASPAR format.")

    nmotif = 0
    with open(motif_infile) as motif_in:
        for i, line in enumerate(motif_in):
            if not line.startswith('>'):
                continue
            nmotif += 1
            if '\t' in line:
                sys.exit(
                    "Motif name cannot contain tabs('\t') at line {} in {}.".format(i+1, motif_infile)
                )

    if nmotif != len(all_motifs):
        sys.exit("Motif file is not in JASPAR format.")
Ejemplo n.º 37
0
def get_motifscores(df):
    """Score motif hits before, between and after two acceptor sites.

    For every motif in ``pwm_transposed.txt`` whose name is the ``__``
    prefix of a column in the pickled selection table, each row of ``df``
    is scanned and three sums of hit scores are stored in the columns
    ``<name>__score_motifs_up`` / ``_alt`` / ``_down``.

    ``df`` must provide ``sequence``, ``acceptor1`` and ``acceptor2``
    (indexed by the same labels as ``df.index``).

    Returns the result restricted to the columns of the selection table.
    """
    with open('../additional/ATtRACT/pwm_transposed.txt', 'r') as f:
        records = parse(f, 'jaspar')
    # NOTE(review): read_pickle unpickles arbitrary objects; only load
    # trusted files from this path.
    Xs_sel = pd.read_pickle(
        './dataframes/ml/three/Xs_three_motifs_rna_sel_.pkl')
    # Unique motif names = column-name prefixes before the first '__'.
    mtfs = []
    for i in Xs_sel.columns:
        if i.split('__')[0] not in mtfs:
            mtfs.append(i.split('__')[0])

    def find_motifs(varid):
        # Reads ``mm`` from the enclosing loop below, i.e. the motif that is
        # current at call time. Returns [up, alt, down] score sums.
        motifs = pd.Series()
        for pos, seq in mm.counts.log_odds().search(Seq(df.sequence[varid], \
                        alphabet=IUPAC.IUPACUnambiguousDNA()), threshold=0, both=False):
            # despite its name, ``seq`` is the hit score yielded by search()
            motifs.loc[pos] = seq
        motifs_up = motifs[motifs.index < df.acceptor1[varid]]
        motifs_alt = motifs[(motifs.index > df.acceptor1[varid])
                            & (motifs.index < df.acceptor2[varid])]
        motifs_down = motifs[motifs.index > df.acceptor2[varid]]

        return list([motifs_up.sum(), motifs_alt.sum(), motifs_down.sum()])

    motifscores = pd.DataFrame(index=df.index)

    # ``database`` is loaded and re-indexed but not used afterwards.
    database = pd.read_table('../additional/ATtRACT/ATtRACT_db.txt')
    database.drop_duplicates('Matrix_id', inplace=True)
    database.set_index('Matrix_id', inplace=True)

    for mm in records:
        if (mm.name in mtfs):
            # Re-class the counts so that log_odds() is available on them.
            # NOTE(review): presumably the file already holds weights rather
            # than raw counts - confirm.
            mm.counts.__class__ = matrix.PositionWeightMatrix
            # Temporary 'motifs' column holds the [up, alt, down] list per
            # row; it is expanded into three columns and then dropped.
            motifscores['motifs'] = df.index.map(lambda x: find_motifs(x))
            motifscores[[str(mm.name) + '__score_motifs_up',str(mm.name) + '__score_motifs_alt',\
                          str(mm.name) +  '__score_motifs_down']]=motifscores.motifs.apply(lambda x: pd.Series(x))
            motifscores.drop('motifs', axis=1, inplace=True)

    return motifscores[Xs_sel.columns]
Ejemplo n.º 38
0
def parseMastOut(mastOut):
    """Parse MAST output that is held in memory as a string.

    Parameters
    ----------
    mastOut
      Text of a MAST result.

    Returns
    -------
    The object returned by ``motifs.parse(handle, "mast")`` for that text.
    """
    from Bio import motifs

    # The ``StringIO`` module exists only on Python 2; fall back to
    # ``io.StringIO`` so the function also runs on Python 3.
    try:
        from StringIO import StringIO
    except ImportError:
        from io import StringIO

    handle = StringIO(mastOut)
    try:
        return motifs.parse(handle, "mast")
    finally:
        # Release the buffer even when parsing raises.
        handle.close()
Ejemplo n.º 39
0
def main(argv):
    """Print how many motifs a MEME file holds and each motif's consensus.

    Parameters
    ----------
    argv
      Argument list; ``argv[0]`` is the path of the MEME output file.
    """
    # The with-block closes the file, which the original never did.
    with open(argv[0], 'r') as handle:
        records = motifs.parse(handle, 'meme')
        # A single parenthesised argument prints identically on Python 2 and 3.
        print("Total motifs present: {}".format(len(records)))
        for i, record in enumerate(records):
            print("Motif {} \t Length: {} \t, Seq: {}".format(i, len(record.consensus), record.consensus))
Ejemplo n.º 40
0
# Concatenate every .bed file in the current directory into one file.
# NOTE(review): the output name also matches *.bed, so a re-run would feed the
# previous output back into cat -- confirm this is only run once per directory.
all_states = "all_states_all_lines.bed"
os.system("cat *.bed > {0}".format(all_states))

# Get CD19 peripheral blood HMM state annotation:
# extract the E032 member from the archive, gunzip it, and move it to ../data.
# `roadmap_15statesHMM` (the archive path) is defined outside this excerpt.
roadmap_15statesHMM_CD19 = "E032_15_coreMarks_mnemonics.bed.gz"
os.system("tar zxvf {0} {1}".format(roadmap_15statesHMM, roadmap_15statesHMM_CD19))
os.system("gzip -d {0}".format(roadmap_15statesHMM_CD19))
os.system("mv E032_15_coreMarks_mnemonics.bed ../data/E032_15_coreMarks_mnemonics.bed")


# Footprinting
# The bare "wget ..." strings below are expression statements with no effect;
# they only record where each input file was downloaded from.
# get all jaspar motifs
"wget http://jaspar.genereg.net/html/DOWNLOAD/JASPAR_CORE/pfm/nonredundant/pfm_all.txt"
jaspar = motifs.parse(open("data/external/pfm_all.txt", 'r'), "jaspar")
# motif annotation
"wget http://jaspar.genereg.net/html/DOWNLOAD/database/MATRIX.txt"
annot = pd.read_table("data/external/MATRIX.txt", names=["index", "db", "id", 0, "TF"])
# get species annotation
"wget http://jaspar.genereg.net/html/DOWNLOAD/database/MATRIX_SPECIES.txt"
spec = pd.read_table("data/external/MATRIX_SPECIES.txt", names=["index", "species_id"])
# merge both tables on the shared "index" column
annot = pd.merge(annot, spec, on=['index'])

# get ids of only human motifs
# NOTE(review): compares species_id with the string "9606"; if read_table
# parses that column as integers nothing will match -- confirm the dtype.
human_annot = annot[annot['species_id'] == "9606"]

# filter out any not uniquely mappable gene name
human_annot = human_annot[
    (~human_annot['TF'].str.contains("\(")) &
Ejemplo n.º 41
0
def read_jaspar_pwms(file='jaspar/pfm_vertebrates.txt', dir='d:/sequence'):
    """Load a JASPAR matrix file into a dict of log-odds matrices.

    Parameters
    ----------
    file
      Matrix file name, relative to ``dir``.
    dir
      Directory that contains ``file``.

    Returns
    -------
    Dict keyed by the upper-cased motif name (only the part before the first
    ':'), whose values are the normalized counts converted to log-odds.
    A later motif with the same key replaces an earlier one.
    """
    pwms = {}
    with open(os.path.join(dir, file), 'r') as handle:
        for motif in motifs.parse(handle, 'jaspar'):
            key = motif.name.strip().split(':')[0].upper()
            pwms[key] = motif.counts.normalize().log_odds()
    return pwms
Ejemplo n.º 42
0
def create_plot(
    meme_file,
    motif_number,
    flanking_sites,
    sample_phylop_file,
    control_phylop_file,
    sample_gerp_file,
    control_gerp_file,
    peak_file,
    fimo_file,
    annotate,
):
    """Draw and save the combined conservation/enrichment figure for one motif.

    The motif is read from ``meme_file``.  One figure is built for the forward
    logo (``logo{N}.png``) and one for the reverse-complement logo
    (``logo_rc{N}.png``); both images are looked up next to ``meme_file``.
    Each figure holds the logo, a stem plot of the sample PhyloP scores, a
    PhyloP scatter panel with regression lines, a GERP scatter panel when GERP
    input was given, a histogram of motif distances with an enrichment legend,
    and a metadata table when ``annotate`` is set.

    Parameters
    ----------
    meme_file
      Path to the MEME output, parsed with ``motifs.parse(handle, "meme")``.
    motif_number
      1-based motif index; ``records[motif_number - 1]`` is plotted.
    flanking_sites
      Number of positions at each end of the score arrays treated as flanks.
    sample_phylop_file, control_phylop_file
      Tab-delimited files; the second field of every row is read as a float.
    sample_gerp_file, control_gerp_file
      Same layout as the PhyloP files.  GERP data is loaded only when both
      arguments are truthy.
    peak_file, fimo_file
      Passed to ``get_motif_distances`` to obtain the distances histogrammed
      in the enrichment panel.
    annotate
      Path to a JSON file providing the keys "title", "gene_name", "dataset"
      and "assembly" for the metadata table.  "" and " " mean no annotation.

    Returns
    -------
    None.  Each figure is written with ``f.savefig``.

    Notes
    -----
    Reads names that are not defined in this function and must exist at
    module level: ``score_type``, ``a``, ``bases``, ``__scale__``, ``dpi``,
    ``flankingstemcolor``, ``greycolor``, ``linewidth``, ``markersize``,
    ``markeredgewidth``, ``fontsize``, ``tickpad``, ``ticklength``,
    ``legend_xmultiplier``, ``txty``, ``legend_fontsize``, ``pointsize``,
    ``plot_linewidth``, ``histogram_nbins``, ``ENRICHMENT_SEQ_LENGTH`` and
    the helpers ``position_wise_profile``, ``find_max_occurence``,
    ``perform_t_test`` and ``get_motif_distances``.
    """
    handle = open(meme_file)
    records = motifs.parse(handle, "meme")
    record = records[motif_number - 1]
    num_occurrences = getattr(record, "num_occurrences", "Unknown")
    sample_phylo_data = None
    control_phylo_data = None
    sample_gerp_data = None
    control_gerp_data = None
    annotate_dict = None
    if annotate == "" or annotate == " ":
        annotate = None
    elif annotate:
        with open(annotate) as f:
            annotate_dict = json.load(f)

    # `handle` is rebound by every open() below; only the last file opened is
    # closed by the single handle.close() further down.
    handle = open(sample_phylop_file, "r")
    sample_phylo_data = csv.reader(handle, delimiter="\t")

    handle = open(control_phylop_file, "r")
    control_phylo_data = csv.reader(handle, delimiter="\t")

    if sample_gerp_file and control_gerp_file:

        handle = open(sample_gerp_file, "r")
        sample_gerp_data = csv.reader(handle, delimiter="\t")

        handle = open(control_gerp_file, "r")
        control_gerp_data = csv.reader(handle, delimiter="\t")

    # Column 1 of each row holds the score.
    sample_phylo_scores = []
    for line in sample_phylo_data:
        sample_phylo_scores.append(float(line[1]))
    control_phylo_scores = []
    for line in control_phylo_data:
        control_phylo_scores.append(float(line[1]))

    # From here on `sample_gerp_data` (None or a csv reader) doubles as the
    # "GERP input was supplied" flag.
    if sample_gerp_data:
        sample_gerp_scores = []
        for line in sample_gerp_data:
            sample_gerp_scores.append(float(line[1]))
        control_gerp_scores = []
        for line in control_gerp_data:
            control_gerp_scores.append(float(line[1]))

    assert len(sample_phylo_scores) == len(control_phylo_scores)

    handle.close()
    profile = position_wise_profile(getattr(record, score_type), record.length)
    max_occur = find_max_occurence(profile, max_count=1)
    ## motif_scores is an array of scores of the max occurring base at each position of the motif
    motif_scores = []
    for position in max_occur:
        motif_scores.append(position[0][1])

    motif_scores = np.asarray(motif_scores)
    sample_phylo_scores = np.asarray(sample_phylo_scores)
    control_phylo_scores = np.asarray(control_phylo_scores)
    if sample_gerp_data:
        sample_gerp_scores = np.asarray(sample_gerp_scores)
        control_gerp_scores = np.asarray(control_gerp_scores)

    # Pad the motif scores with `flanking_sites` zeros on both sides.
    motif_junk = [0 for i in range(0, flanking_sites)]
    motif_junk = np.asarray(motif_junk)
    motif_concat = np.concatenate((motif_junk, motif_scores))
    motif_concat = np.concatenate((motif_concat, motif_junk))

    ##Mean of flanking sites
    ms_p = np.mean(np.concatenate((sample_phylo_scores[0:flanking_sites], sample_phylo_scores[-flanking_sites:])))
    mc_p = np.mean(np.concatenate((control_phylo_scores[0:flanking_sites], control_phylo_scores[-flanking_sites:])))

    if sample_gerp_data:
        ms_g = np.mean(np.concatenate((sample_gerp_scores[0:flanking_sites], sample_gerp_scores[-flanking_sites:])))
        mc_g = np.mean(np.concatenate((control_gerp_scores[0:flanking_sites], control_gerp_scores[-flanking_sites:])))
        flanking_sample_gerp_scores = np.concatenate(
            (sample_gerp_scores[0:flanking_sites], sample_gerp_scores[-flanking_sites:])
        )
        flanking_control_gerp_scores = np.concatenate(
            (control_gerp_scores[0:flanking_sites], control_gerp_scores[-flanking_sites:])
        )
        motif_control_gerp_scores = control_gerp_scores[flanking_sites:-flanking_sites]
        motif_sample_gerp_scores = sample_gerp_scores[flanking_sites:-flanking_sites]

    # NOTE(review): with flanking_sites == 0 the slices [0:0] and [-0:] select
    # nothing and everything respectively, and [0:-0] is empty -- only the
    # "shifted" arrays below special-case zero flanks.
    flanking_sample_phylo_scores = np.concatenate(
        (sample_phylo_scores[0:flanking_sites], sample_phylo_scores[-flanking_sites:])
    )
    flanking_control_phylo_scores = np.concatenate(
        (control_phylo_scores[0:flanking_sites], control_phylo_scores[-flanking_sites:])
    )
    motif_control_phylo_scores = control_phylo_scores[flanking_sites:-flanking_sites]
    motif_sample_phylo_scores = sample_phylo_scores[flanking_sites:-flanking_sites]

    # Motif-site scores with the flank mean subtracted.
    if flanking_sites > 0:
        shifted_sample_phylo_scores = sample_phylo_scores[flanking_sites:-flanking_sites] - ms_p
        shifted_control_phylo_scores = control_phylo_scores[flanking_sites:-flanking_sites] - mc_p
        if sample_gerp_data:
            shifted_sample_gerp_scores = sample_gerp_scores[flanking_sites:-flanking_sites] - ms_g
            shifted_control_gerp_scores = control_gerp_scores[flanking_sites:-flanking_sites] - mc_g
    else:
        shifted_sample_phylo_scores = sample_phylo_scores
        shifted_control_phylo_scores = control_phylo_scores
        if sample_gerp_data:
            shifted_sample_gerp_scores = sample_gerp_scores
            shifted_control_gerp_scores = control_gerp_scores

    pr_p = pearsonr(motif_scores, motif_sample_phylo_scores)
    if sample_gerp_data:
        pr_g = pearsonr(motif_scores, motif_sample_gerp_scores)

    ## H_0: Mean phylop scores for motif sites and flanking sites are the same
    ## H_1: Mean phylop score for motif sites > Mean phylop score of flanking sites
    ## NOTE: the perform_t_test function returns a 2-tailed p-value for an independent t-test with unequal sample size, equal variances

    T_deltaphylop, p_deltaphylop = perform_t_test(motif_sample_phylo_scores, flanking_sample_phylo_scores)
    delta_phylop = np.mean(motif_sample_phylo_scores) - np.mean(
        flanking_sample_phylo_scores
    )  # -shifted_control_phylo_scores)
    if sample_gerp_data:
        T_deltagerp, p_deltagerp = perform_t_test(motif_sample_gerp_scores, flanking_sample_gerp_scores)
        delta_gerp = np.mean(motif_sample_gerp_scores) - np.mean(flanking_sample_gerp_scores)
        # Convert the two-tailed p-value into the one-tailed value for H_1.
        if T_deltagerp < 0:
            p_deltagerp = 1 - p_deltagerp * 0.5
        else:
            p_deltagerp = p_deltagerp * 0.5

    # Same two-tailed to one-tailed conversion for PhyloP.
    if T_deltaphylop < 0:
        p_deltaphylop = 1 - p_deltaphylop * 0.5
    else:
        p_deltaphylop = p_deltaphylop * 0.5

    ## Ordinary least square fit for phylop scores and motif_scores
    reg_phylop_sample = sm.OLS(motif_sample_phylo_scores, sm.add_constant(motif_scores)).fit()
    # Fewer than two fitted parameters: fall back to plotting motif_scores itself.
    if len(reg_phylop_sample.params) < 2:
        y_reg_phylop_sample = motif_scores
    else:
        y_reg_phylop_sample = motif_scores * reg_phylop_sample.params[1] + reg_phylop_sample.params[0]
    reg_phylop_control = sm.OLS(motif_control_phylo_scores, sm.add_constant(motif_scores)).fit()
    if len(reg_phylop_control.params) < 2:
        y_reg_phylop_control = motif_scores
    else:
        y_reg_phylop_control = motif_scores * reg_phylop_control.params[1] + reg_phylop_control.params[0]

    if sample_gerp_data:
        reg_gerp_sample = sm.OLS(motif_sample_gerp_scores, sm.add_constant(motif_scores)).fit()
        if len(reg_gerp_sample.params) == 1:
            y_reg_gerp_sample = motif_scores
        else:
            y_reg_gerp_sample = motif_scores * reg_gerp_sample.params[1] + reg_gerp_sample.params[0]

        reg_gerp_control = sm.OLS(motif_control_gerp_scores, sm.add_constant(motif_scores)).fit()
        if len(reg_gerp_control.params) == 1:
            y_reg_gerp_control = motif_scores
        else:
            y_reg_gerp_control = motif_scores * reg_gerp_control.params[1] + reg_gerp_control.params[0]

    motif = record
    motif_length = motif.length
    meme_dir = os.path.dirname(meme_file)
    X = [40 + 15]  ## this is by trial and error, the position for the first base logo
    logo = plt.imread(os.path.join(meme_dir, "logo{}.png".format(motif_number)))
    ## Generate all other X coordinates
    fs = flanking_sites
    for j in range(1, len(motif) + 2 * fs):
        t = X[j - 1] + a + 1.9
        X.append(t)
    # NOTE(review): `s = 2 - s` runs once per base inside the inner loop, not
    # once per position -- confirm that is intended.  `motif_bits` is not read
    # again in this function.
    motif_bits = []
    for i in range(0, motif.length):
        s = 0
        for base in bases:
            s = s + -motif.pwm[base][i] * log(motif.pwm[base][i], 2) if motif.pwm[base][i] != 0 else s
            s = 2 - s
        motif_bits.append(s)

    y_phylop_pixels = [__scale__ * x for x in sample_phylo_scores]  # [fs:-fs]]#[flanking_sites:-flanking_sites]]

    ##FIXME This is a big dirty hack to generate the plots for the reverse complement logo too
    logo_name = ["logo{}.png".format(motif_number), "logo_rc{}.png".format(motif_number)]
    for ln in logo_name:
        # The reverse-complement pass plots the PhyloP stems in reverse order.
        if "rc" in ln:
            y_phylop_pixels.reverse()
        logo = plt.imread(os.path.join(meme_dir, ln))
        height_px = logo.shape[0]  # Should be 212

        # Figure width grows with the number of side panels:
        # GERP and annotation each add two logo heights.
        if sample_gerp_data:
            if annotate:
                total_px = X[-1] + 8 * height_px + 140
                right = (8 * height_px + 10 + 140 - 0.2 * height_px) / total_px
            else:
                total_px = X[-1] + 6 * height_px + 140
                right = (6 * height_px + 10 + 140 - 0.2 * height_px) / total_px
        else:
            if annotate:
                total_px = X[-1] + 6 * height_px + 140
                right = (6 * height_px + 10 + 140 - 0.2 * height_px) / total_px
            else:
                total_px = X[-1] + 4 * height_px + 140
                right = (4 * height_px + 10 + 140 - 0.2 * height_px) / total_px

        figsize = (total_px / 100, (2 * height_px) / 100 + 0.6)

        # `gs` holds the left column: logo on top, stem plot below.
        gs = gridspec.GridSpec(2, 1)  # , width_ratios=[1, right], height_ratios=[1,1])
        gs.update(
            top=1.0, bottom=0.14, left=0.08, right=1 - right
        )  # , right=0.8)#, left=0.06)#, right=right, wspace=0.025, hspace=0.03, wd)
        f = plt.figure(figsize=figsize, dpi=dpi, facecolor="w", edgecolor="k")

        # ax => Logo
        # stem_plot => Trend
        # gerp_scatter_plot => Phylop
        # enrichment_plot => Gerp
        logo_plot = plt.Subplot(f, gs[0])
        ##TODO Check this
        # Hand-tuned scale factors that line the logo image up with the stem plot.
        if motif_length > 45:
            XSCALE_FACTOR = motif_length / 1.9
            z = 2
        elif motif_length > 40:
            XSCALE_FACTOR = motif_length / 2.25
            z = 2.5
        elif motif_length > 36:
            XSCALE_FACTOR = motif_length / 1.95
            z = 2
        elif motif_length > 21:
            XSCALE_FACTOR = motif_length / 5
            z = 3
        else:
            XSCALE_FACTOR = 4.5
            z = 3

        logo_plot.imshow(
            logo, extent=[40 + 15 + z * (a + 1.9), logo.shape[1] + 15 + XSCALE_FACTOR * (a + 1.9), 0, logo.shape[0]]
        )
        logo_plot.set_axis_off()
        f.add_subplot(logo_plot)

        # Three stem() calls: left flank, motif positions, right flank.
        stem_plot = plt.Subplot(f, gs[1], sharex=logo_plot)
        markerline, stemlines, baseline = stem_plot.stem(
            X[:fs],
            [y for y in y_phylop_pixels[:fs]],
            markerfmt="_",
            linefmt="-",
            markerfacecolor=flankingstemcolor,
            color=greycolor,
        )
        setp(stemlines, "color", flankingstemcolor)
        setp(markerline, "markerfacecolor", flankingstemcolor)
        setp(markerline, "color", flankingstemcolor)
        setp(stemlines, "linewidth", linewidth)
        setp(markerline, "markersize", markersize)
        setp(baseline, "linewidth", linewidth - 0.5)
        setp(markerline, "markeredgewidth", markeredgewidth)
        markerline, stemlines, baseline = stem_plot.stem(
            X[fs:-fs], [y for y in y_phylop_pixels[fs:-fs]], markerfmt="g_", linefmt="g-", basefmt="r-"
        )
        setp(stemlines, "linewidth", linewidth)
        setp(markerline, "markersize", markersize)
        setp(markerline, "markeredgewidth", markeredgewidth)
        setp(baseline, "linewidth", linewidth - 0.5)
        markerline, stemlines, baseline = stem_plot.stem(
            X[-fs:],
            [y for y in y_phylop_pixels[-fs:]],
            markerfmt="_",
            linefmt="-",
            markerfacecolor=flankingstemcolor,
            color=greycolor,
        )
        setp(stemlines, "color", flankingstemcolor)
        setp(markerline, "markerfacecolor", flankingstemcolor)
        setp(stemlines, "linewidth", linewidth)
        setp(markerline, "markersize", markersize)
        setp(markerline, "markeredgewidth", markeredgewidth)
        setp(markerline, "color", flankingstemcolor)
        setp(baseline, "linewidth", linewidth - 0.5)

        # Tick labels: blank at the two ends of each flank, and every fifth
        # motif position labelled with its 1-based index.
        indices_str = []
        indices1 = np.linspace(-fs, -1, 2)
        for i in indices1:
            indices_str.append("")
        indices2 = np.arange(0, len(X) - 2 * fs, 5)
        for i in indices2:
            indices_str.append("${}$".format(int(i) + 1))

        indices3 = np.linspace(motif_length, motif_length + fs - 1, 2)

        for i in indices3:
            indices_str.append("")

        indices12 = np.concatenate((indices1, indices2))
        indices = np.concatenate((indices12, indices3))
        xticks = [X[int(i) + fs] for i in indices]

        max_yticks = 3
        yloc = plt.MaxNLocator(max_yticks)
        stem_plot.yaxis.set_major_locator(yloc)

        # ticks_and_labels = np.linspace(1.02*min(min(y_phylop_pixels), -0.1), 1.02*max(y_phylop_pixels), num = 5, endpoint=True)
        # stem_plot.set_yticks(ticks_and_labels)
        # stem_plot.set_yticklabels(['$%.2f$' %x for x in ticks_and_labels])#(["%0.2f"%(min(y_phylop_pixels)/__scale__), "%0.2f"%(np.mean(y_phylop_pixels)/__scale__), "%0.2f"%(max(y_phylop_pixels)/__scale__)], fontsize=fontsize)
        stem_plot.set_xlabel("$\mathrm{Base}\ \mathrm{Position}$", fontsize=fontsize, fontweight="bold")
        stem_plot.set_xlim([1.2 * a, X[-1] + linewidth * 1.8])
        # NOTE(review): the second positional argument of np.max is `axis`, so
        # 0.01 is passed as an axis here; the lower bound on this line suggests
        # max(np.max(y_phylop_pixels), 0.01) was intended -- confirm.
        stem_plot.set_ylim([min(np.min(y_phylop_pixels), -0.01) - 0.03, np.max(y_phylop_pixels, 0.01)])
        stem_plot.get_xaxis().tick_bottom()
        stem_plot.get_yaxis().tick_left()
        stem_plot.set_xticks(xticks)
        stem_plot.set_xticklabels(indices_str, fontsize=fontsize)
        stem_plot.spines["top"].set_visible(False)
        stem_plot.spines["right"].set_visible(False)
        stem_plot.yaxis.set_ticks_position("left")
        stem_plot.xaxis.set_ticks_position("bottom")
        stem_plot.spines["left"].set_position("zero")
        # stem_plot.spines['bottom'].set_position(matplotlib.transforms.Bbox(array([[0.125,0.63],[0.25,0.25]])))
        stem_plot.get_yaxis().set_tick_params(direction="out")
        stem_plot.get_xaxis().set_tick_params(direction="out")
        stem_plot.tick_params(axis="y", which="major", pad=tickpad)
        stem_plot.tick_params(axis="x", which="major", pad=tickpad)
        stem_plot.tick_params("both", length=ticklength, width=2, which="major")
        stem_plot.set_ylabel("$\mathrm{PhyloP}\ \mathrm{Score}$", fontsize=fontsize)
        f.add_subplot(stem_plot)

        # `gs1` lays out the side panels: a header row above a plot row, with
        # one column each for PhyloP, optional GERP, histogram and optional
        # annotation.
        if sample_gerp_data:
            if annotate:
                gs1 = gridspec.GridSpec(2, 4, height_ratios=[1, 4], width_ratios=[1, 1, 1, 1])
                gerp_header_subplot_gs = gs1[0, 1]
                gerp_subplot_gs = gs1[1, 1]
                histogram_header_subplot_gs = gs1[0, 2]
                histogram_subplot_gs = gs1[1, 2]
                ann_header_subplot_gs = gs1[0, 3]
                ann_subplot_gs = gs1[1, 3]
            else:
                gs1 = gridspec.GridSpec(2, 3, height_ratios=[1, 4], width_ratios=[1, 1, 1])
                gerp_header_subplot_gs = gs1[0, 1]
                gerp_subplot_gs = gs1[1, 1]
                histogram_header_subplot_gs = gs1[0, 2]
                histogram_subplot_gs = gs1[1, 2]
        else:
            if annotate:
                gs1 = gridspec.GridSpec(2, 3, height_ratios=[1, 4], width_ratios=[1, 1, 1])
                histogram_header_subplot_gs = gs1[0, 1]
                histogram_subplot_gs = gs1[1, 1]
                ann_header_subplot_gs = gs1[0, 2]
                ann_subplot_gs = gs1[1, 2]
            else:
                gs1 = gridspec.GridSpec(2, 2, height_ratios=[1, 4], width_ratios=[1, 1])
                histogram_header_subplot_gs = gs1[0, 1]
                histogram_subplot_gs = gs1[1, 1]

        gs1.update(bottom=0.14, right=0.95, left=1 - right * 0.85, wspace=0.5)

        # PhyloP legend; "1e-05"-style p-values are rewritten as "1*10^{-5}".
        phlyop_plots_leg = plt.Subplot(f, gs1[0, 0], autoscale_on=True)
        pearsonr_pval = str("%.1g" % pr_p[1])
        if "e" in pearsonr_pval:
            pearsonr_pval += "}"
            pearsonr_pval = pearsonr_pval.replace("e", "*10^{").replace("-0", "-")
        score_pval = str("%.1g" % p_deltaphylop)
        if "e" in score_pval:
            score_pval += "}"
            score_pval = score_pval.replace("e", "*10^{").replace("-0", "-")

        textstr = r"\noindent$R_{pearson}=%.2f$($p=%s$)\\~\\$\Delta_{Phylop}=%.2f$($p=%s$)\\~\\" % (
            pr_p[0],
            pearsonr_pval,
            delta_phylop,
            score_pval,
        )  # , reg_phylop_control.rsquared, num_occurrences*reg_phylop_control.params[1])
        txtx = 1 - legend_xmultiplier * len(textstr) / 100.0
        phlyop_plots_leg.set_frame_on(False)
        phlyop_plots_leg.set_xticks([])
        phlyop_plots_leg.set_yticks([])
        phlyop_plots_leg.text(txtx, txty, textstr, fontsize=legend_fontsize)
        f.add_subplot(phlyop_plots_leg)

        phylop_scatter_plot = plt.Subplot(f, gs1[1, 0], autoscale_on=True)
        fit = np.polyfit(motif_scores, motif_sample_phylo_scores, 1)
        fit_fn = np.poly1d(fit)

        phylop_scatter_plot.scatter(
            motif_scores, motif_sample_phylo_scores, color="g", s=[pointsize for i in motif_scores]
        )
        phylop_scatter_plot.plot(
            motif_scores,
            y_reg_phylop_sample,
            "g",
            motif_scores,
            fit_fn(motif_scores),
            color="g",
            linewidth=plot_linewidth,
        )
        phylop_scatter_plot.scatter(
            motif_scores, motif_control_phylo_scores, color=greycolor, s=[pointsize for i in motif_scores]
        )
        phylop_scatter_plot.plot(motif_scores, y_reg_phylop_control, color=greycolor, linewidth=plot_linewidth)

        ticks_and_labels = np.linspace(1.02 * min(motif_scores), 1.02 * max(motif_scores), num=5, endpoint=True)
        phylop_scatter_plot.set_xticks(ticks_and_labels)
        # NOTE(review): num_occurrences falls back to the string "Unknown" when
        # the record lacks the attribute, in which case this division raises.
        ticks_and_labels = ["$%.2f$" % (x / num_occurrences) for x in ticks_and_labels]
        phylop_scatter_plot.set_xticklabels(ticks_and_labels)

        ##max_xticks = 5
        ##xloc = plt.MaxNLocator(max_xticks)
        ##print xloc
        ##phylop_scatter_plot.xaxis.set_major_locator(xloc)
        # ticks_and_labels = np.linspace(1.02*min(min(shifted_sample_phylo_scores), min(shifted_control_phylo_scores)), 1.02*max(max(shifted_sample_phylo_scores),max(shifted_control_phylo_scores)),
        # num = 4, endpoint=True)
        # phylop_scatter_plot.set_yticks(ticks_and_labels)
        # phylop_scatter_plot.set_yticklabels(["$%0.2f$"%x for x in ticks_and_labels])
        max_yticks = 4
        yloc = plt.MaxNLocator(max_yticks)
        phylop_scatter_plot.yaxis.set_major_locator(yloc)
        phylop_scatter_plot.set_xlabel("$\mathrm{Base}\ \mathrm{Frequency}$", fontsize=fontsize, fontweight="bold")
        phylop_scatter_plot.get_xaxis().tick_bottom()
        phylop_scatter_plot.get_yaxis().tick_left()
        phylop_scatter_plot.set_ylabel("$\mathrm{PhyloP}\ \mathrm{Score}$", fontsize=fontsize, fontweight="bold")
        phylop_scatter_plot.tick_params(axis="y", which="major", pad=tickpad)
        phylop_scatter_plot.tick_params(axis="x", which="major", pad=tickpad)
        phylop_scatter_plot.get_yaxis().set_tick_params(direction="out")
        phylop_scatter_plot.get_xaxis().set_tick_params(direction="out")
        phylop_scatter_plot.tick_params("both", length=ticklength, width=2, which="major")

        f.add_subplot(phylop_scatter_plot)

        # NOTE(review): gerp_header_subplot_gs is assigned only in the
        # sample_gerp_data branches above, yet this line runs unconditionally.
        gerp_plots_leg = plt.Subplot(f, gerp_header_subplot_gs, autoscale_on=True)
        gerp_plots_leg.set_frame_on(False)
        gerp_plots_leg.set_xticks([])
        gerp_plots_leg.set_yticks([])
        # NOTE(review): the GERP legend below pairs pr_g[0] with a p-value
        # formatted from pr_p[1]; pr_g[1] may have been intended -- confirm.
        pearsonr_pval = str("%.1g" % pr_p[1])
        if "e" in pearsonr_pval:
            pearsonr_pval += "}"
            pearsonr_pval = pearsonr_pval.replace("e", "*10^{").replace("-0", "-")

        if sample_gerp_data:
            score_pval = str("%.1g" % p_deltagerp)
            if "e" in score_pval:
                score_pval += "}"
                score_pval = score_pval.replace("e", "*10^{").replace("-0", "-")
            textstr = r"\noindent$R_{pearson}=%.2f$($p=%s$)\\~\\$\Delta_{{Gerp}}=%.2f$($p=%s$)\\~\\" % (
                pr_g[0],
                pearsonr_pval,
                delta_gerp,
                score_pval,
            )
            txtx = 1 - legend_xmultiplier * len(textstr) / 100.0
            gerp_plots_leg.text(txtx, txty, textstr, fontsize=legend_fontsize)
            f.add_subplot(gerp_plots_leg)

            gerp_scatter_plot = plt.Subplot(f, gerp_subplot_gs, autoscale_on=True)
            gerp_scatter_plot.scatter(
                motif_scores, motif_sample_gerp_scores, color="g", s=[pointsize for i in motif_scores]
            )
            gerp_scatter_plot.plot(motif_scores, y_reg_gerp_sample, color="g", linewidth=plot_linewidth)
            gerp_scatter_plot.scatter(
                motif_scores, motif_control_gerp_scores, color=greycolor, s=[pointsize for i in motif_scores]
            )
            gerp_scatter_plot.plot(motif_scores, y_reg_gerp_control, color=greycolor, linewidth=plot_linewidth)
            ticks_and_labels = np.linspace(1.02 * min(motif_scores), 1.02 * max(motif_scores), num=5, endpoint=True)
            gerp_scatter_plot.set_xticks(ticks_and_labels)
            ticks_and_labels = ["$%.2f$" % (x / num_occurrences) for x in ticks_and_labels]
            gerp_scatter_plot.set_xticklabels(ticks_and_labels)

            ##max_xticks = 5
            ##xloc = plt.MaxNLocator(max_xticks)
            ##gerp_scatter_plot.xaxis.set_major_locator(xloc)
            max_yticks = 4
            yloc = plt.MaxNLocator(max_yticks)
            gerp_scatter_plot.yaxis.set_major_locator(yloc)
            gerp_scatter_plot.set_xlabel("$\mathrm{Base}\ \mathrm{Frequency}$", fontsize=fontsize, fontweight="bold")
            gerp_scatter_plot.set_ylabel("$\mathrm{GERP}\ \mathrm{Score}$", fontsize=fontsize, fontweight="bold")
            gerp_scatter_plot.get_xaxis().tick_bottom()
            gerp_scatter_plot.get_yaxis().tick_left()
            gerp_scatter_plot.get_yaxis().set_tick_params(direction="out")
            gerp_scatter_plot.get_xaxis().set_tick_params(direction="out")
            gerp_scatter_plot.tick_params(axis="y", which="major", pad=tickpad)
            gerp_scatter_plot.tick_params(axis="x", which="major", pad=tickpad)
            gerp_scatter_plot.tick_params("both", length=ticklength, width=2, which="major")
            f.add_subplot(gerp_scatter_plot)

        # Enrichment legend: count of distances within +/-100 divided by the
        # count in the 100-200 bands on either side.
        enrichment_plot4 = plt.Subplot(f, histogram_header_subplot_gs, autoscale_on=True)
        enrichment_plot4.set_frame_on(False)
        enrichment_plot4.set_xticks([])
        enrichment_plot4.set_yticks([])
        all_distances = get_motif_distances(peak_file, fimo_file)
        fimo_dir = os.path.dirname(fimo_file)
        # NOTE(review): len() is applied to these filter() results below, which
        # only works where filter returns a list (Python 2).  Distances of
        # exactly +/-100... are in the first set; exactly +/-200 are in neither.
        motifs_within_100 = filter(lambda x: x <= 100 and x >= -100, all_distances)
        motifs_within_100_200 = filter(lambda x: (x < 200 and x > 100) or (x > -200 and x < -100), all_distances)
        if len(motifs_within_100_200) > 0:
            enrichment = len(motifs_within_100) / (len(motifs_within_100_200))  # +len(motifs_within_100))
        else:
            enrichment = 1
        enrichment_pval = 0
        number_of_sites = len(motifs_within_100) + len(motifs_within_100_200)  # fimo_sites_intersect(parsed.fimo_file)
        probability = 200 / (ENRICHMENT_SEQ_LENGTH - motif_length)
        # Binomial survival function of the central count out of all counted sites.
        enrichment_pval = binom.sf(len(motifs_within_100), number_of_sites, probability)
        enrichment_pval = str("%.1g" % enrichment_pval)
        if "e" in enrichment_pval:
            enrichment_pval += "}"
            enrichment_pval = enrichment_pval.replace("e", "*10^{").replace("-0", "-")
        textstr = r"\noindent$Enrichment={0:.2f}$\\~\\$(p={1})$".format(enrichment, enrichment_pval)
        txtx = 0.1 * len(textstr) / 100.0
        enrichment_plot4.text(txtx, txty, textstr, fontsize=legend_fontsize)
        f.add_subplot(enrichment_plot4)
        enrichment_plot = plt.Subplot(f, histogram_subplot_gs, autoscale_on=True)
        enrichment_plot.hist(all_distances, histogram_nbins, color="white", alpha=0.8, range=[-200, 200])
        enrichment_plot.set_xticks([-200, -100, 0, 100, 200])
        max_yticks = 3
        yloc = plt.MaxNLocator(max_yticks)
        enrichment_plot.yaxis.set_major_locator(yloc)
        # enrichment_plot.set_yticks(range(1,6))
        ticks_and_labels = [-200, -100, 0, 100, 200]
        all_distances = np.asarray(all_distances)
        enrichment_plot.set_xticklabels(["${}$".format(x) for x in ticks_and_labels])
        enrichment_plot.tick_params(axis="y", which="major", pad=tickpad)
        enrichment_plot.tick_params(axis="x", which="major", pad=tickpad)
        enrichment_plot.tick_params("both", length=ticklength, width=2, which="major")
        enrichment_plot.get_xaxis().tick_bottom()
        enrichment_plot.get_yaxis().tick_left()
        enrichment_plot.get_yaxis().set_tick_params(direction="out")
        enrichment_plot.get_xaxis().set_tick_params(direction="out")
        enrichment_plot.axvline(x=-100, linewidth=3, color="red", linestyle="-.")
        enrichment_plot.axvline(x=100, linewidth=3, color="red", linestyle="-.")
        f.add_subplot(enrichment_plot)
        # In both branches the fimo_dir path is immediately replaced by the bare
        # file name, so the figure is saved relative to the working directory.
        if "rc" not in ln:
            out_file = os.path.join(fimo_dir, "motif{}Combined_plots.png".format(motif_number))
            out_file = "motif{}Combined_plots.png".format(motif_number)
        else:
            out_file = os.path.join(fimo_dir, "motif{}Combined_plots_rc.png".format(motif_number))
            out_file = "motif{}Combined_plots_rc.png".format(motif_number)

        if annotate:
            # `annotate` is the JSON path string here, so these index single
            # characters of it; none of the four values is used by the table
            # built from annotate_dict below.
            filename = r"$" + annotate[0] + "$"
            try:
                a_motif = r"$" + annotate[1] + "$"
            except IndexError:
                a_motif = ""
            try:
                cell_line = r"$" + annotate[2] + "$"
            except IndexError:
                cell_line = ""
            try:
                assay = r"$" + annotate[3] + "$"
            except IndexError:
                assay = ""

            # data = [[r'$Filename$', filename], [r'$Motif$', a_motif], [r'$Cell\ Line$', cell_line], [r'Assay', assay]]
            keys = ["title", "gene_name", "dataset", "assembly"]
            data = [[r"$" + key.replace("_", " ").upper() + "$", r"$" + annotate_dict[key] + "$"] for key in keys]
            ann_header = plt.Subplot(f, ann_header_subplot_gs, autoscale_on=True)
            ann_header.set_frame_on(False)
            ann_header.set_xticks([])
            ann_header.set_yticks([])
            f.add_subplot(ann_header)
            textstr = r"$Metadata$"
            txtx = 1.7 * len(textstr) / 100.0
            ann_header.text(txtx, txty, textstr, fontsize=legend_fontsize)
            ann_plot = plt.Subplot(f, ann_subplot_gs, autoscale_on=True)
            ann_plot.set_xticks([])
            ann_plot.set_yticks([])
            ann_plot.set_frame_on(False)
            table = ann_plot.table(cellText=data, loc="center")
            table.scale(1, 2)
            fontproperties = FontProperties(size=legend_fontsize * 8)  # , family='serif' )
            for key, cell in table.get_celld().items():
                row, col = key
                if row > 0 and col > 0:
                    cell.set_text_props(fontproperties=fontproperties)

            table.set_fontsize(legend_fontsize * 8)
            f.add_subplot(ann_plot)

        f.savefig(out_file, figsize=figsize, dpi=dpi)
    if line.strip():
      fixed_pfm_file.write(line)

fixed_pfm_file.close()

# Output is printed to stdout so that it can be piped to other processes. The output gives the effect of each possible mutation
# for each JASPAR pfm. The output is tab-delimited, with a header line for easy reading into R or other downstream analyses.
# Each line of the output has the following fields (column names as in the header printed below):
#
# 1)    name    -- the name of the pfm in JASPAR
# 2)    pos     -- the relative position within the matrix. The value is from -1 to 1, where 0 is the center of the motif.
# 3)    IC      -- presumably the information content at that position (TODO confirm)
# 4)    DegCons -- presumably the degenerate-consensus base at that position (TODO confirm)
# 5-10) NN      -- the change in pssm score associated with each possible mutation at that position in the motif.
#
print "name\tpos\tIC\tDegCons\tAG\tCT\tAC\tAT\tCG\tGT"
with open("pfm_all.fixed.txt") as handle:
 for m in motifs.parse(handle, "jaspar"):

#
# Get the counts and the consensus motif for the pfm
#
    counts = m.counts
    cons = m.consensus
    deg_cons = m.degenerate_consensus

#
# convert to pssm, adding a pseudocount of 0.1 to each base.
#
    pssm = m.counts.normalize(pseudocounts=0.1).log_odds()
    cons_score = pssm.calculate(cons)
    cons_list = list(cons)
    cons_str =  str(cons)
Ejemplo n.º 44
0
# Command-line arguments, in order:
#   1) input newick tree file   2) MEME motif file
#   3) output file              4) name of the mRNA
inputnewick_file = sys.argv[1]
inputmotifs = sys.argv[2]

# read in name of output file
outputfile = sys.argv[3]

# read in name of mRNA
gene = sys.argv[4]

# read in motif information; the with-block closes the file even when
# parsing raises, which the separate open()/close() pair did not guarantee
with open(inputmotifs) as handle:
    results = motifs.parse(handle, "meme")

# create list to store motifs
motif_branch_lengths_list = []

# read in tree
tree = Phylo.read(inputnewick_file, "newick")

# create and print cutoff value
# (log10(0.01) is -2, so the denominator is 98)
cutoff = (60/ (math.log10(0.01) + 100))
print ("cutoff = " + str(cutoff))

# function to calculate branch length of Drosophila with motifs
Ejemplo n.º 45
0
def get_motifs(meme_file):
    """Return the number of motifs found in a MEME output file.

    Parameters
    ----------
    meme_file
      Path to the MEME output, parsed with ``motifs.parse(handle, 'meme')``.

    Returns
    -------
    ``len()`` of the parsed records.
    """
    # The with-block closes the file; the original left the handle open.
    with open(meme_file, 'r') as handle:
        return len(motifs.parse(handle, 'meme'))
Ejemplo n.º 46
0
from uuid import uuid4
from celery import Celery
from peaks_processor_celery import run_conservation_analysis, run_motif_analysis, run_analysis
from flask.ext.sqlalchemy import SQLAlchemy
import shutil
import json
from celery import group
from config_processor import read_config
from encode_peak_file_downloader import get_encode_peakfiles, get_metadata_for_peakfile
import subprocess
from bed_operations.format_peakfile import convert_to_scorefile
from query import get_async_id, encode_job_exists, insert_encode_job, update_job_status, insert_new_job, get_encode_metadata, get_filename, get_job_status, job_exists, encode_job_status, get_encode_jobid, is_job_type_encode,get_encode_from_jobid, get_all_encode_results
from database import SqlAlchemyTask
import operator
from Bio import motifs
# Load the JASPAR vertebrate matrices once, at import time. The context manager
# closes the file as soon as parsing is done instead of leaking the handle.
with open('../data/pfm_vertebrates.txt') as jaspar_handle:
    jaspar_motifs = motifs.parse(jaspar_handle, 'jaspar')

# Configuration sections read through the project's read_config helper.
server_config = read_config('Server')
path_config = read_config('StaticPaths')


# Flask application; the Celery and SQLAlchemy settings are copied from the
# 'Server' configuration section.
app = Flask(__name__)
app.config['CELERY_BROKER_URL'] = server_config['celery_broker_url']
app.config['CELERY_RESULT_BACKEND'] = server_config['celery_result_backend']
app.config['SQLALCHEMY_DATABASE_URI'] = server_config['sqlalchemy_database_uri']
app.config['CELERYD_MAX_TASKS_PER_CHILD'] = server_config['celery_max_tasks_per_child']
app.config['CELERY_IMPORTS'] = ('app',)
# NOTE(review): presumably a time limit in seconds -- confirm against the Celery docs.
app.config['CELERYD_TASK_TIME_LIMIT'] = 1000000
# Do not enforce trailing-slash handling on routes.
app.url_map.strict_slashes = False

Ejemplo n.º 47
0
import Bio
sys.path.insert(0,'/mnt/lustre/home/cusanovich/Programs/lib/python2.6/site-packages/Bio')
from Bio import motifs
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
import numpy
from datetime import datetime
# Reference genome used to extract sequences.
fasta = pybedtools.BedTool('/mnt/lustre/data/share/HumanGenome/allhg18_norandom.fasta')

# Parse the TRANSFAC matrices. The with-block closes the file even when
# parsing raises (the original only closed it on the success path).
with open('/data/share/TRANSFAC/2011.3_nb/dat/matrix.dat','r') as matrices:
	test = motifs.parse(matrices,"TRANSFAC")

# Set a pseudocount of 0.5 on every motif and record its 'AC' field;
# motifers keeps the same order as the parsed record, so a position in
# motifers is also a position in test.
motifers = []
for motif_entry in test:
	motif_entry.pseudocounts = 0.5
	motifers.append(motif_entry['AC'])

# Input bed file and the output bed file to be written.
bed = open('/mnt/lustre/home/cusanovich/centipede/jack_centipede_sorted.bed','r')
outbed = open('/mnt/lustre/home/cusanovich/centipede/jack_centipede_sorted_pwms_timedright2.bed','w')
#liner = ['chr1','847521','847534','M01066','0.9675','-']

# Small one-interval BedTool built from a string, and its sequence taken from the genome fasta.
d = pybedtools.BedTool("""chr1 840146 840165""", from_string=True)
genome = d.sequence(fi=fasta)

x=1
pssms = {}
for line in bed:
	liner = line.strip().split()
	motifed = motifers.index(liner[3])
	#For some reason, Jack and Roger's bed files seem to be off in coordinates?!?!?!
	testbed = liner[0] + ':' + str(int(liner[1])-2) + '-' + str(int(liner[2])+2)
	testseq = Seq(genome.seq(testbed,fasta),IUPAC.unambiguous_dna)
	if liner[5] == '-':
Ejemplo n.º 48
0
## fetch_motifs method: fetch the motifs that match some criteria.
## Criteria can be any of the metadata fields, min_ic (minimum information content),
## the minimum length of the matrix, or the minimum number of sites used to construct it.
#motfs = jdb.fetch_motifs( collection='CORE', tax_group=['vertebrates','insects'], min_ic=12)
#for motif in motfs:
    #print 'do something with the motif'

# JASPAR pseudocounts demo. `arnt` is a motif object created earlier in the
# script (not shown in this part of the file).
print motifs.jaspar.calculate_pseudocounts(arnt)    # compute pseudocounts for the motif (does not store them)
print arnt.pseudocounts    # pseudocounts currently stored on the motif; usually zeros
# Store the calculated pseudocounts on the motif, then show the updated value.
arnt.pseudocounts = motifs.jaspar.calculate_pseudocounts(arnt)
print arnt.pseudocounts
# The raw counts matrix of the motif.
print arnt.counts#, arnt.pssm()

#MEME
# input DNA or protein seqs, output number of motifs requested
# Parse a MEME output file; the with-block closes the file after parsing.
with open('meme.txt','r') as handle:
    motfs = motifs.parse(handle,'meme')    # parse in MEME format
# motfs is a Bio.motifs.meme.Record object, which is a list of Motif objects.
# Record-level attributes:
print motfs.version, motfs.datafile, motfs.command, motfs.alphabet
print motfs.sequences    # list of sequence names
print len(motfs)    # number of motifs
for motif in motfs:
    # Per-motif attributes
    print motif.name, motif.num_occurrences, motif.length, motif.evalue
    print motif.consensus
    print motif.degenerate_consensus
    # Number of instances, the first instance, and that instance's start position.
    print len(motif.instances), motif.instances[0], motif.instances[0].start
    print motif.instances[0].pvalue
# A motif can be retrieved from the record either way:
print motfs[0]    # by index
print motfs['Motif 2']    # by name