Esempio n. 1
0
    def test_memeprofile(self):
        """ Test entropy caclulations
	         bits    2.3
                 2.1
                 1.8                        *
                 1.6                        **
Relative         1.4                *     * **
Entropy          1.2 *      **  *** *     * **
(23.5 bits)      0.9 ** * * **  *******  ** **
                 0.7 **** * **  *******  *****
                 0.5 ********** **************
                 0.2 *************************
                 0.0 -------------------------


        """
        motifs = read_memefile(self.meme_file)
        record = motifs['motif_records'][0]
        motif_ic = get_motif_ic(self.meme_file, 0)
        target = np.array([1.2,0.9,0.7,0.9,0.5,
                           0.9,0.5,1.2,1.2,0.5,
                           0.2,1.2,1.2,1.2,0.9,
                           1.4,1.2,1.2,0.5,0.9,
                           1.2,1.4,0.7,1.8,1.6])

        assert np.allclose(target, motif_ic, atol=0.6)
Esempio n. 2
0
    def test_meme(self):
        """Run MEME on the fixture FASTA and check the motifs it reports.

        The run must exit with code 0, produce five motifs, and the first
        three motifs must have the expected consensus sequences.
        """
        out_dir = 'tests/data/generated_out/meme_analysis'
        # Remove output left behind by a previous run.
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)

        # Strip the ' -p <cpu count>' option from the default arguments
        # before handing them to MEME.
        meme_args = self.pipeline.get_meme_default_params
        serial_args = meme_args.replace(' -p {}'.format(get_cpu_count()), '')
        output = self.pipeline.run_meme(fasta_in=self.meme_fasta,
                                        out_dir=out_dir,
                                        strargs=serial_args)
        #TODO Check if meme.txt is same and created
        #TODO This check is too stringent, specially if logos are being produced.
        #MEME installation leads to hard coded paths
        # Parenthesised form prints the same on Python 2 and is valid on 3.
        print(output)
        assert output['exitcode'] == 0

        meme_record = read_memefile('{}/meme.txt'.format(out_dir))
        assert meme_record['total_motifs'] == 5

        expected_consensus = ['CAGAACGCTGCTGCCAACCCGACCT', 'AGCAGA', 'CAGTTT']
        for index, consensus in enumerate(expected_consensus):
            assert meme_record['motif_records'][index].consensus == consensus
Esempio n. 3
0
# Value handed to get_max_occuring_bases() as its count_type argument.
COUNT_TYPE = 'counts'

# Connect with MongoClient's default arguments and select the
# moca_encode_tf database.
client = MongoClient()
db = client.moca_encode_tf
# NOTE(review): remove() is called without a filter -- presumably this
# empties encode_tf_stats before it is repopulated; confirm against the
# pymongo version in use.
db.encode_tf_stats.remove()

# Every entry of __root_dir__ is treated as one experiment identifier.
for d in os.listdir(__root_dir__):

    # Metadata lookup keyed on the experiment's '@id' path.
    results = db.tf_metadata.find({'@id': '/experiments/{}/'.format(d)})
    meme_file = os.path.join(__root_dir__, d, 'moca_output', 'meme_out',
                             'meme.txt')
    centrimo_dir = os.path.join(__root_dir__, d, 'moca_output', 'centrimo_out')
    # Entries without a meme.txt are reported and skipped.
    if not os.path.isfile(meme_file):
        print 'Skipping {}'.format(d)
        continue
    meme_info = read_memefile(meme_file)

    total_sequences = get_total_sequences(meme_file)

    # Visit every motif recorded in the MEME output.
    for i in range(0, meme_info['total_motifs']):
        record = meme_info['motif_records'][i]
        max_occur = get_max_occuring_bases(record,
                                           max_count=1,
                                           count_type=COUNT_TYPE)
        # Collect element [0][1] of each returned entry -- presumably the
        # count of the most frequent base at that position (max_count=1);
        # TODO confirm against get_max_occuring_bases.
        motif_freq = []
        for position in max_occur:
            motif_freq.append(position[0][1])

        motif_freq = np.asarray(motif_freq)

        fimo_sample = os.path.join(os.path.dirname(meme_file),
Esempio n. 4
0
def create_plot(meme_file,
                plot_title,
                output_dir=None,
                centrimo_dir=None,
                motif_number=1,
                flank_length=5,
                sample_score_files=None,
                control_score_files=None,
                reg_plot_titles=None,
                annotate=None,
                save=True):
    """Create the summary plots for one motif of a MEME run.

    For every (sample scores, control scores, title) triple two figures are
    built: one using the motif logo and one using its reverse-complement
    logo. Each figure holds the logo, a stem plot, a bar plot, an OLS legend
    and a scatter plot, plus an enrichment column when ``centrimo_dir`` is
    given and an annotation column when ``annotate`` is given.

    Parameters
    ----------
    meme_file: string
        Path to meme.txt
    plot_title: string
        Figure title; ``' \\# <motif_number>'`` is appended to it
    output_dir: string
        Directory the PNG files are written to. Defaults to a
        ``moca_plots`` directory next to the directory holding ``meme_file``
    centrimo_dir: string
        Path to centrimo's output directory; ``centrimo.txt`` and
        ``site_counts.txt`` are looked up inside it
    motif_number: int
        1-based number of motif in the motif file
    flank_length: int
        Number of flanking positions on each side of the motif
    sample_score_files: list
        Path to conservation scores files for sample
    control_score_files: list
        Path to conservation score files for control
    reg_plot_titles: list
        One title per pair of score files; also used in the output file
        names. Pairs without a matching title produce no figure because
        the three lists are combined with ``zip``
    annotate: object
        Passed on to ``create_annnotation_plot``; ``None``, ``''`` and
        ``' '`` all mean "no annotation"
    save: bool
        Write each figure to ``output_dir`` when true

    Returns
    -------
    figures: list
        The created figures, in creation order. They have already been
        closed with ``plt.close('all')``

    Raises
    ------
    MocaException
        If no sample or control score files are given, or if their numbers
        differ
    """
    # None stands in for "empty list" so that no list object is shared
    # between calls through the default arguments.
    sample_score_files = [] if sample_score_files is None else sample_score_files
    control_score_files = [] if control_score_files is None else control_score_files
    reg_plot_titles = [] if reg_plot_titles is None else reg_plot_titles

    meme_record = read_memefile(meme_file)
    total_sequences = get_total_sequences(meme_file)
    record = meme_record['motif_records'][motif_number-1]
    num_occurrences = getattr(record, 'num_occurrences', 'Unknown')
    all_meme_occurrences = [getattr(motif_record, 'num_occurrences', 'Unknown')
                            for motif_record in meme_record['motif_records']]

    meme_dir = os.path.abspath(os.path.dirname(meme_file))
    if not output_dir:
        output_dir = os.path.join(os.path.join(meme_dir, '..'), 'moca_plots')
    safe_makedir(output_dir)

    if not sample_score_files:
        raise MocaException('Found no sample score files')
    elif not control_score_files:
        raise MocaException('Found no control score filees')
    elif len(sample_score_files) != len(control_score_files):
        raise MocaException('Found unequal size of sample and control score files')

    # Column 0 always holds the regression plots; the enrichment and
    # annotation plots each claim one more column when they are requested.
    subplot_ncols = 1
    if annotate in ('', ' '):
        annotate = None
    if annotate:
        # The annotation plot is drawn in the last column, so it needs a
        # column of its own; otherwise it would land on top of another plot.
        subplot_ncols += 1

    max_occur = get_max_occuring_bases(record, max_count=1, count_type=COUNT_TYPE)
    motif_freq = np.asarray([position[0][1] for position in max_occur])

    sample_conservation_scores = [np.loadtxt(score_file)
                                  for score_file in sample_score_files]
    control_conservation_scores = [np.loadtxt(score_file)
                                   for score_file in control_score_files]

    motif = record
    motif_length = motif.length
    motif_evalue = motif.evalue

    X_values = [40+15] ## this is by trial and error, the position for the first base logo
    ## Generate all other X coordinates
    for _ in range(1, len(motif)+2*flank_length):
        X_values.append(X_values[-1]+OFFSET+1.9)

    if centrimo_dir:
        subplot_ncols += 1
        centrimo_dir = os.path.abspath(centrimo_dir)
        centrimo_txt = os.path.join(centrimo_dir, 'centrimo.txt')
        centrimo_stats = os.path.join(centrimo_dir, 'site_counts.txt')

    plot_title += r' \# {}'.format(motif_number)
    ##FIXME This is a big dirty hack to generate plots for the reverse complement logo too
    # (logo file name, is reverse complement); the forward logo must come
    # first because the sample scores are reversed in place for the second.
    logo_variants = [('logo{}.png'.format(motif_number), False),
                     ('logo_rc{}.png'.format(motif_number), True)]
    figures = []
    for sample_score, control_score, subplot_legend_title in zip(sample_conservation_scores,
                                                                 control_conservation_scores,
                                                                 reg_plot_titles):
        for logo_filename, is_rc in logo_variants:
            setup_matplotlib()
            if is_rc:
                sample_score = sample_score[::-1]
            matplot_dict = init_figure(meme_dir=meme_dir, X_values=X_values,
                                       motif=motif_number,
                                       subplot_ncols=subplot_ncols, annotate=annotate)
            f = matplot_dict['figure']
            gs = matplot_dict['gs']
            right_margin = matplot_dict['right_margin']

            title = r'\textbf{' + '\\underline{'+'{}'.format(plot_title)+'}}'
            f.suptitle(title, fontsize=LEGEND_FONTSIZE)
            logo_plot = create_logo_plot({'figure':f, 'gridspec': gs[0]}, meme_dir, logo_filename, motif_length)

            subgrid = gridspec.GridSpec(2, subplot_ncols, height_ratios=[1,2], width_ratios=[1]*subplot_ncols)
            subgrid.update(bottom=0.14, right=0.9, left=1-right_margin*0.85, wspace=0.58)
            X_left, X_center, X_right = create_stemplot({'figure': f,
                                                         'gridspec': gs[1],
                                                         'shareX': logo_plot},
                                                        X_values,
                                                        sample_score,
                                                        motif_length,
                                                        flank_length=flank_length,
                                                        legend_title=subplot_legend_title)

            create_bar_plot(logo_plot, X_right, matplot_dict['height_px'],
                            total_sequences, all_meme_occurrences, motif_number, motif_evalue)
            create_ols_legend_plot({'figure':f, 'gridspec': subgrid[0,0]}, motif_freq,
                                   sample_score, control_score,
                                   flank_length, legend_title=subplot_legend_title)
            create_scatter_plot({'figure':f, 'gridspec': subgrid[1,0]}, motif_freq,
                                sample_score, control_score,
                                flank_length, num_occurrences, y_label=subplot_legend_title)

            if centrimo_dir:
                create_enrichment_plot({'figure': f,
                                        'gridspec_header': subgrid[0,1],
                                        'gridspec_body': subgrid[1,1]},
                                       motif_number,
                                       centrimo_txt,
                                       centrimo_stats)

            if annotate:
                create_annnotation_plot({'figure': f,
                                         'gridspec_header': subgrid[0,-1],
                                         'gridspec_body': subgrid[1,-1]},
                                        annotate)

            if save:
                suffix = '_rc' if is_rc else ''
                out_file = os.path.join(output_dir,
                                        'moca_{}_{}{}.png'.format(subplot_legend_title,
                                                                  motif_number,
                                                                  suffix))
                # figsize is a property of the figure, not an option of
                # savefig, so it is no longer passed here.
                f.savefig(out_file, dpi=DPI)
            figures.append(f)
            plt.close('all')
    return figures
Esempio n. 5
0
def _set_meme_cores(meme_params, cores):
    """Return ``meme_params`` with its ``-p <n>`` option adjusted to ``cores``.

    A falsy ``cores`` leaves the parameters untouched, ``cores == 1`` drops
    the option, and any other value replaces the existing option or appends
    one when none is present.
    """
    if not cores:
        return meme_params
    # Anchoring on start-of-string or whitespace keeps other options that
    # merely contain '-p' out of the match.
    parallel_option = re.compile(r'(^|\s)-p\s+\d+')
    if cores == 1:
        return parallel_option.sub('', meme_params)
    if parallel_option.search(meme_params):
        return parallel_option.sub(
            lambda match: '{}-p {}'.format(match.group(1), cores), meme_params)
    return '{} -p {}'.format(meme_params, cores)


def find_motifs(bedfile, oc, configuration, slop_length,
                flank_motif, n_motif, cores, genome_build, show_progress):
    """Run meme to locate motifs and create conservation stacked plots

    The peaks in ``bedfile`` are split into a training and a test set.
    MEME runs on the training sequences, CENTRIMO and FIMO run on the test
    sequences, and FIMO is also run on a shuffled copy of the training
    sequences as control. Conservation scores around the FIMO hits are
    saved for every available conservation track and plotted per motif.

    Parameters
    ----------
    bedfile: string
        Path to the BED file with peaks
    oc: string
        Output directory; ``moca_output`` in the current working directory
        is used when empty
    configuration: object
        Passed to ``pipeline.Pipeline``
    slop_length: int
        Flank length used when extending the peaks before sequence
        extraction
    flank_motif: int
        Flank length around each motif hit used for scoring and plotting
    n_motif: int
        Number of motifs used to size the progress-bar step list
    cores: int
        Number of cores for MEME's ``-p`` option; 1 removes the option and
        a falsy value keeps the pipeline default
    genome_build: string
        Genome build whose data is requested from the pipeline
    show_progress: bool
        Display a progress bar while running

    Notes
    -----
    Exits the process with status 1 when MEME writes anything to stderr.
    """
    moca_out_dir = oc if oc else os.path.join(os.getcwd(), 'moca_output')
    moca_pipeline = pipeline.Pipeline(configuration)
    genome_data = moca_pipeline.get_genome_data(genome_build)
    genome_fasta = genome_data['fasta']
    genome_table = genome_data['genome_table']

    # Tracks missing from the genome data are skipped. The ordered key list
    # is reused below so scoring and plot titles only refer to tracks that
    # really exist.
    wigfiles = {}
    available_wig_keys = []
    for key in conservation_wig_keys:
        try:
            wigfiles[key] = genome_data['{}_wig'.format(key)]
        except KeyError:
            continue
        available_wig_keys.append(key)
    safe_makedir(moca_out_dir)
    bedfile_fn, _ = filename_extension(bedfile)

    progress_bar = None
    if show_progress:
        msg_list = ['Extracting Fasta',
                    'Running MEME',
                    'Running CENTRIMO']
        msg_list_e = (['Generating random Fasta', 'Running fimo random', 'Running fimo main']
                      + ['Extracting Scores']*len(wigfiles)
                      + ['Creating PLot'])
        msg_list = msg_list + msg_list_e*n_motif
        progress_bar = ProgressBar(msg_list)

    def report(message):
        """Forward ``message`` to the progress bar when one is in use."""
        if progress_bar is not None:
            progress_bar.show_progress(message)

    query_train_fasta = os.path.join(moca_out_dir,
                                     bedfile_fn + '_train_flank_{}.fasta'.format(slop_length))
    query_test_fasta = os.path.join(moca_out_dir,
                                    bedfile_fn + '_test_flank_{}.fasta'.format(slop_length))

    report('Extracting Fasta')
    bed_o = bedoperations.Bedfile(bedfile, genome_table, moca_out_dir)
    bed_train, bed_test = bed_o.split_train_test_bed(train_peaks_count=500, test_peaks_count=500)

    bed_train_slopped = bed_o.slop_bed(bed_train, flank_length=slop_length)
    bed_test_slopped = bed_o.slop_bed(bed_test, flank_length=slop_length)

    bed_o.extract_fasta(bed_train_slopped, fasta_in=genome_fasta, fasta_out=query_train_fasta)
    bed_o.extract_fasta(bed_test_slopped, fasta_in=genome_fasta, fasta_out=query_test_fasta)

    meme_out_dir = os.path.join(moca_out_dir, 'meme_out')
    # FIMO output lives alongside the MEME output.
    memechip_out_dir = meme_out_dir
    # re.sub returns a new string, so the adjusted parameters must be
    # captured rather than discarded.
    meme_params = _set_meme_cores(moca_pipeline.get_meme_default_params, cores)
    report('Running MEME')

    meme_run_out = moca_pipeline.run_meme(fasta_in=query_train_fasta,
                                          out_dir=meme_out_dir,
                                          strargs=meme_params)
    if meme_run_out['stderr'] != '':
        sys.stdout.write('Error running MEME: {}'.format(meme_run_out['stderr']))
        sys.exit(1)
    meme_file = os.path.join(meme_out_dir, 'meme.txt')
    meme_summary = read_memefile(meme_file)

    report('Running CENTRIMO')
    centrimo_main_dir = os.path.join(moca_out_dir, 'centrimo_out')
    moca_pipeline.run_centrimo(meme_file=meme_file,
                               fasta_in=query_test_fasta,
                               out_dir=centrimo_main_dir)

    reg_plot_titles = [key.capitalize() for key in available_wig_keys]
    for motif in range(1, meme_summary['total_motifs']+1):
        fimo_rand_dir = os.path.join(memechip_out_dir, 'fimo_random_{}'.format(motif))
        fimo_main_dir = os.path.join(memechip_out_dir, 'fimo_out_{}'.format(motif))

        safe_makedir(fimo_rand_dir)
        random_fasta = os.path.join(fimo_rand_dir, 'random_{}.fa'.format(motif))
        report('Generating Random Fasta: {}'.format(motif))
        moca_pipeline.run_fasta_shuffler(fasta_in=query_train_fasta, fasta_out=random_fasta)

        #Random
        report('Running FIMO Random')
        moca_pipeline.run_fimo(motif_file=meme_file,
                               motif_num=motif,
                               sequence_file=random_fasta,
                               out_dir=fimo_rand_dir)
        #Main
        report('Running FIMO Main')
        moca_pipeline.run_fimo(motif_file=meme_file,
                               motif_num=motif,
                               sequence_file=query_test_fasta,
                               out_dir=fimo_main_dir)

        fimo_rand_file = os.path.join(fimo_rand_dir, 'fimo.txt')
        fimo_main_file = os.path.join(fimo_main_dir, 'fimo.txt')

        main_intervals = get_start_stop_intervals(fimo_main_file, flank_length=flank_motif)
        random_intervals = get_start_stop_intervals(fimo_rand_file, flank_length=flank_motif)

        sample_score_files = []
        control_score_files = []
        for key in available_wig_keys:
            wigfile = wigfiles[key]
            report('Extracting Scores')
            sample_score_files.append(
                moca_pipeline.save_conservation_scores(main_intervals, wigfile,
                                                       fimo_main_dir, out_prefix=key))
            control_score_files.append(
                moca_pipeline.save_conservation_scores(random_intervals, wigfile,
                                                       fimo_rand_dir, out_prefix=key))
        report('Creating Plot')
        create_plot(meme_file,
                    bedfile_fn,
                    output_dir=moca_out_dir,
                    centrimo_dir=centrimo_main_dir,
                    motif_number=motif,
                    flank_length=flank_motif,
                    sample_score_files=sample_score_files,
                    control_score_files=control_score_files,
                    reg_plot_titles=reg_plot_titles,
                    annotate=None)

    if progress_bar is not None:
        progress_bar.close()