Ejemplo n.º 1
0
    def _split_data(self, precision=6):
        """precision is to get same results as in the original perl script

        Drops flagged gold-standard rows, then writes one tab-separated file
        per transcription factor pairing the gold-standard answer with the
        user signal.
        """
        #FIXME::

        keep = self.gs.Flag == 0
        self.user_data_clean = self.user_data[keep].copy()
        n_removed = self.gs.shape[0] - keep.sum()
        print(
            'Splitting the user data set and removing flagged data (%s out of %s)'
            % (n_removed, self.gs.shape[0]))
        self.gs_clean = self.gs[keep].copy()

        pb = progress_bar(self.Ntf, interval=1)
        for tf_index in range(1, self.Ntf + 1):
            this_tf = 'TF_%s' % tf_index
            # gold-standard answers and user predictions for this TF only
            answers = self.gs_clean.query("Id == @this_tf").Answer
            signals = self.user_data_clean.query("TF_Id == @this_tf").Signal_Mean
            combined = pd.concat([answers, signals], axis=1)
            combined.to_csv(self._setfile(tf_index, 'Data'),
                            index=False,
                            sep='\t',
                            header=False,
                            float_format="%f")
            pb.animate(tf_index)
Ejemplo n.º 2
0
    def compute_statistics(self):
        """Returns final results of the user prediction

        :return: a dataframe with various metrics (Pearson, Spearman,
            log-Pearson correlations plus AUROC/AUPR on 8mers and probes)
            for each transcription factor.

        Must call :meth:`score` before.

        """
        data = {
            'Pearson': [],
            'Spearman': [],
            'Pearson_Log': [],
            "AUROC_8mer": [],
            "AUPR_8mer": [],
            "AUROC_probe": [],
            "AUPR_probe": []
        }

        pb = progress_bar(self.Ntf, interval=1)
        for tf_index in range(1, self.Ntf + 1):
            dfdata = pd.read_csv(self._setfile(tf_index, "Data"),
                                 sep='\t',
                                 header=None)
            # .ix was removed from pandas; .iloc is the positional equivalent
            pearson = dfdata.corr('pearson').iloc[0, 1]
            spearman = dfdata.corr('spearman').iloc[0, 1]
            pearsonLog = np.log10(dfdata).corr('pearson').iloc[0, 1]

            data['Pearson'].append(pearson)
            data['Pearson_Log'].append(pearsonLog)
            data['Spearman'].append(spearman)

            # ROC statistics on the 8mer detection vector
            dvdata = self._dvs[tf_index]
            r = ROCDiscovery(dvdata.values)
            rocdata = r.get_statistics()
            auroc = r.compute_auc(roc=rocdata)
            aupr = r.compute_aupr(roc=rocdata)
            data['AUROC_8mer'].append(auroc)
            data['AUPR_8mer'].append(aupr)

            # ROC statistics on the probe detection vector
            dvdata = self._dvps[tf_index]
            r = ROCDiscovery(dvdata.values)
            rocdata = r.get_statistics()
            auroc = r.compute_auc(roc=rocdata)
            aupr = r.compute_aupr(roc=rocdata)
            data['AUROC_probe'].append(auroc)
            data['AUPR_probe'].append(aupr)
            pb.animate(tf_index)

        df = pd.DataFrame(data)
        # fix the column order
        df = df[[
            'Pearson', 'Spearman', 'Pearson_Log', 'AUROC_8mer',
            'AUPR_8mer', 'AUROC_probe', 'AUPR_probe'
        ]]

        return df
Ejemplo n.º 3
0
    def _preprocessing(self):
        """Create temporary files for before further processing

        Reads the gold-standard 8mers, the probe files and the user
        predictions, then builds, for each TF, a sorted
        (Sequence, Signal_Mean) dataframe stored in ``self._probes``.

        :return: nothing
        """
        # Read file octomers gold standard
        filename = self.directory + os.sep + '8mers_gs.txt'
        self.octomers_gs = pd.read_csv(filename, sep='\t', header=None)

        # Read file octomers
        filename = self.directory + os.sep + 'all_8mers.txt'
        self.octomers = pd.read_csv(filename, sep='\t',
                                    header=None)  # contains reverse complement
        self.octomers.columns = ['octomer', 'octomerRC']

        # Read probes gs
        filename = self.directory + os.sep + 'probe35_gs.txt'
        self.probes_gs = pd.read_csv(filename, header=None, sep='\t')
        self.probes_gs.columns = ['Id', 'Sequence']

        # reads probes (sequences)
        print('Reading probes')
        filename = self.directory + os.sep + 'probes35.txt'
        # just one column so no need for a separator
        probes = pd.read_csv(filename)

        # Extract information (first and third column of pred.txt)
        df = self.user_data[['TF_Id', 'Signal_Mean']].copy()
        df['Signal_Mean'] = df['Signal_Mean'].map(lambda x: round(x, 6))

        # data.txt is paste of probes35.txt and val.txt
        data = pd.concat([probes, df], axis=1)

        # Creates probes/TF_1.dat that contains the sequence from the GS and
        # the answer from the user for each TF
        print('Creating probes/TF_1.csv + sorting')
        pb = progress_bar(self.Ntf, interval=1)
        for i in range(1, self.Ntf + 1):
            # could use a groupby here ? faster  maybe
            tag = 'TF_%s' % i
            # .ix was removed from pandas; .loc with a boolean mask is the
            # supported equivalent
            sequence = data[['Sequence']].loc[self.gs.Id == tag]
            answer = data.Signal_Mean[data.TF_Id == tag]
            df = pd.concat([sequence, answer], axis=1)
            try:
                df.sort_values(by=['Signal_Mean', 'Sequence'],
                               ascending=[False, False],
                               inplace=True)
            except AttributeError:
                # pandas < 0.17 only provides DataFrame.sort()
                df.sort(columns=['Signal_Mean', 'Sequence'],
                        ascending=[False, False],
                        inplace=True)

            df['Signal_Mean'] = df['Signal_Mean'].map(lambda x: round(x, 6))

            self._probes[i] = df
            pb.animate(i)
Ejemplo n.º 4
0
 def get_jaccard(self, progress=True):
     """Return the pairwise Jaccard similarity matrix of the rows of ``self.df``.

     :param bool progress: show a progress bar while computing
         (the original accepted but ignored this parameter).
     :return: a symmetric numpy array of shape (N, N).
     """
     import sklearn.metrics
     from easydev import progress_bar
     N = len(self.df)
     J = np.zeros((N, N))
     pb = progress_bar(N)
     for ic, i in enumerate(self.df.index):
         for jc, j in enumerate(self.df.index):
             if jc < ic:
                 # the score is symmetric: reuse the already-computed value
                 J[ic][jc] = J[jc][ic]
             else:
                 # .ix was removed from pandas; .loc is the label-based
                 # equivalent. NOTE(review): jaccard_similarity_score was
                 # removed from recent scikit-learn (renamed jaccard_score
                 # with different semantics) — pin sklearn or port.
                 J[ic][jc] = sklearn.metrics.jaccard_similarity_score(
                     self.df.loc[i], self.df.loc[j])
         if progress:
             pb.animate(1 + ic)
     return J
Ejemplo n.º 5
0
    def _preprocessing(self):
        """Create temporary files for before further processing

        Loads the 8mer/probe gold-standard files and the user predictions,
        then stores one sorted (Sequence, Signal_Mean) dataframe per TF in
        ``self._probes``.

        :return: nothing
        """
        # Read file octomers gold standard
        filename = self.directory + os.sep + '8mers_gs.txt'
        self.octomers_gs = pd.read_csv(filename, sep='\t', header=None)

        # Read file octomers
        filename = self.directory + os.sep + 'all_8mers.txt'
        self.octomers = pd.read_csv(filename, sep='\t', header=None)  # contains reverse complement
        self.octomers.columns = ['octomer', 'octomerRC']

        # Read probes gs
        filename = self.directory + os.sep + 'probe35_gs.txt'
        self.probes_gs = pd.read_csv(filename, header=None, sep='\t')
        self.probes_gs.columns = ['Id', 'Sequence']

        # reads probes (sequences)
        print('Reading probes')
        filename = self.directory + os.sep + 'probes35.txt'
        # just one column so no need for a separator
        probes = pd.read_csv(filename)

        # Extract information (first and third column of pred.txt)
        df = self.user_data[['TF_Id', 'Signal_Mean']].copy()
        df['Signal_Mean'] = df['Signal_Mean'].map(lambda x: round(x, 6))

        # data.txt is paste of probes35.txt and val.txt
        data = pd.concat([probes, df], axis=1)

        # Creates probes/TF_1.dat that contains the sequence from the GS and
        # the answer from the user for each TF
        print('Creating probes/TF_1.csv + sorting')
        pb = progress_bar(self.Ntf, interval=1)
        for i in range(1, self.Ntf + 1):
            # could use a groupby here ? faster  maybe
            tag = 'TF_%s' % i
            # .ix was removed from pandas; .loc with a boolean mask is the
            # supported equivalent
            sequence = data[['Sequence']].loc[self.gs.Id == tag]
            answer = data.Signal_Mean[data.TF_Id == tag]
            df = pd.concat([sequence, answer], axis=1)
            try:
                df.sort_values(by=['Signal_Mean', 'Sequence'],
                               ascending=[False, False], inplace=True)
            except AttributeError:
                # pandas < 0.17 only provides DataFrame.sort()
                df.sort(columns=['Signal_Mean', 'Sequence'],
                        ascending=[False, False], inplace=True)

            df['Signal_Mean'] = df['Signal_Mean'].map(lambda x: round(x, 6))

            self._probes[i] = df
            pb.animate(i)
Ejemplo n.º 6
0
 def get_null_timecourse_model1(self, N=10000, indices=(10, 39)):
     """Return the null distribution of scores for timecourse model1.

     :param int N: number of random timecourses to draw.
     :param indices: the two column indices forwarded to the scoring
         function; previously hardcoded to 10 and 39 (see former FIXME).
     :return: a list of N distances
     """
     from easydev import progress_bar
     data = self._get_random_timecourse_model1(N=N)
     distances = []
     pb = progress_bar(N)
     # range works on both Python 2 and 3 (xrange is Python 2 only)
     for i in range(0, N):
         df = data[:, :, i]
         distance = self._compute_score_timecourse_model1(df, indices[0],
                                                          indices[1])
         distances.append(distance)
         pb.animate(i + 1)  # i+1 so the progress bar reaches 100%
     return distances
Ejemplo n.º 7
0
 def analyse(self):
     """Simulate every model and record its truth table.

     Fills ``self.truth_tables`` with one flattened simulation output per
     model index.

     :return: nothing
     """
     from easydev import progress_bar
     models = self.simulator.results.models
     self.truth_tables = {}
     pb = progress_bar(len(models.df))
     for i, index in enumerate(models.df.index):
         # .ix was removed from pandas; .loc is the label-based equivalent.
         # A model's reactions are the columns set to 1 in its row.
         row = models.df.loc[index]
         reactions = list(row[row == 1].index)
         self.simulator.simulate(reactions=reactions)
         tt = self.simulator.simulated[self.simulator.time].flatten()
         self.truth_tables[index] = tt
         pb.animate(i + 1)
Ejemplo n.º 8
0
 def _load_complexes(self, show_progress=True):
     """Fetch details for every complex accession and cache them in ``self._complexes``."""
     import time
     from easydev import progress_bar

     pb = progress_bar(len(self.df.complexAC))
     self.logging.info("Loading all details from the IntactComplex database")
     results = {}
     for count, accession in enumerate(self.df.complexAC):
         results[accession] = self.webserv.details(accession)
         if show_progress:
             pb.animate(count + 1, time.time() - pb.start)
     self._complexes = results
Ejemplo n.º 9
0
    def get_null_parameters_model1(self, N=10000):
        """Returns score distribution (parameter model1)

        :param int N: number of random parameter draws.
        :return: a list of N distances forming the null distribution.
        """
        from easydev import progress_bar
        df = self._get_random_parameters_model1(N=N)

        distances = []
        pb = progress_bar(N)
        # range works on both Python 2 and 3 (xrange is Python 2 only)
        for i in range(0, N):
            # .ix was removed from pandas; .iloc selects rows by position
            df1 = df.iloc[i].to_frame(name='values')
            distance = self._compute_score_model1_parameters(df1)
            distances.append(distance)
            pb.animate(i + 1)
        return distances
Ejemplo n.º 10
0
    def compute_statistics(self):
        """Returns final results of the user prediction

        :return: a dataframe with various metrics (Pearson, Spearman,
            log-Pearson correlations plus AUROC/AUPR on 8mers and probes)
            for each transcription factor.

        Must call :meth:`score` before.

        """
        data = {'Pearson': [],
                'Spearman': [],
                'Pearson_Log': [],
                "AUROC_8mer": [],
                "AUPR_8mer": [],
                "AUROC_probe": [],
                "AUPR_probe": []}

        pb = progress_bar(self.Ntf, interval=1)
        for tf_index in range(1, self.Ntf + 1):
            dfdata = pd.read_csv(self._setfile(tf_index, "Data"), sep='\t', header=None)
            # .ix was removed from pandas; .iloc is the positional equivalent
            pearson = dfdata.corr('pearson').iloc[0, 1]
            spearman = dfdata.corr('spearman').iloc[0, 1]
            pearsonLog = np.log10(dfdata).corr('pearson').iloc[0, 1]

            data['Pearson'].append(pearson)
            data['Pearson_Log'].append(pearsonLog)
            data['Spearman'].append(spearman)

            # ROC statistics on the 8mer detection vector
            dvdata = self._dvs[tf_index]
            r = ROCDiscovery(dvdata.values)
            rocdata = r.get_statistics()
            auroc = r.compute_auc(roc=rocdata)
            aupr = r.compute_aupr(roc=rocdata)
            data['AUROC_8mer'].append(auroc)
            data['AUPR_8mer'].append(aupr)

            # ROC statistics on the probe detection vector
            dvdata = self._dvps[tf_index]
            r = ROCDiscovery(dvdata.values)
            rocdata = r.get_statistics()
            auroc = r.compute_auc(roc=rocdata)
            aupr = r.compute_aupr(roc=rocdata)
            data['AUROC_probe'].append(auroc)
            data['AUPR_probe'].append(aupr)
            pb.animate(tf_index)

        df = pd.DataFrame(data)
        # fix the column order
        df = df[['Pearson', 'Spearman', 'Pearson_Log', 'AUROC_8mer',
                 'AUPR_8mer', 'AUROC_probe', 'AUPR_probe']]

        return df
Ejemplo n.º 11
0
 def compute_distances(self, N=100, show=True, progress=True):
     """Perform N single swaps and record the graph distance after each.

     :param int N: number of swap/measure iterations.
     :param bool show: plot the distance trajectory at the end.
     :param bool progress: display a progress bar.
     :return: the list of N distances
     """
     from easydev import progress_bar
     self.init()
     distances = []
     pb = progress_bar(N)
     for i in range(0, N):
         self.swap(1, inplace=True)
         dist = self.get_distance(self.graph)
         distances.append(dist)
         if progress:
             # i+1 so the bar reaches 100% (animate(i) started at 0 and
             # never completed)
             pb.animate(i + 1)
     if show is True:
         import pylab
         pylab.plot(distances)
         pylab.grid(True)
     return distances
Ejemplo n.º 12
0
 def exhaustive(self):
     """Score every possible model (all 2**N reaction bitstrings).

     :return: the list of scores; also stores them in ``self.scores`` and
         the model sizes (number of active reactions) in ``self.sizes``.
     """
     from cno.optimisers.binary_tools import permutations
     from easydev import progress_bar
     scores = []
     sizes = []
     N = len(self.model.reactions)
     pb = progress_bar(2**N)
     for i, this in enumerate(permutations(N)):
         self.simulate(self.parameters2reactions(this))
         scores.append(self.score())
         pb.animate(i + 1)  # i+1 so the bar reaches 100%
         sizes.append(sum(this))
     self.scores = scores
     self.sizes = sizes
     return scores
Ejemplo n.º 13
0
    def compute_gtts(self):
        """Group models by their simulated truth table (GTT).

        Simulates every model, groups identical simulation outputs, plots a
        histogram of group sizes, and stores everything in ``self.gtts``.

        :return: dict with keys ``df``, ``simulation`` and ``grouped``
        """
        print("init R library")
        self._init()
        N = len(self.models)
        from easydev import progress_bar
        b = progress_bar(N)
        d = {}
        for i in range(0, N):
            # .ix was removed from pandas; .iloc selects rows by position
            res = np.array(self._get_sim(self.models.df.iloc[i].values))
            b.animate(i + 1)  # i+1 so the bar reaches 100%
            d[i] = res

        df = pd.DataFrame(d).transpose()
        # models sharing an identical simulation output fall in one group
        grouped = df.groupby(list(df.columns))
        pylab.hist([len(this) for this in grouped.groups.values()], 100)
        res = {'df': df, 'simulation': d, 'grouped': grouped}
        self.gtts = res
        return self.gtts
Ejemplo n.º 14
0
    def _split_data(self, precision=6):
        """precision is to get same results as in the original perl script

        Removes flagged gold-standard rows and writes, per TF, a
        tab-separated file pairing answers with user signals.
        """
        keep = self.gs.Flag == 0
        self.user_data_clean = self.user_data[keep].copy()
        n_removed = self.gs.shape[0] - keep.sum()
        print('Splitting the user data set and removing flagged data (%s out of %s)' % (n_removed, self.gs.shape[0]))
        self.gs_clean = self.gs[keep].copy()

        pb = progress_bar(self.Ntf, interval=1)
        for tf_index in range(1, self.Ntf + 1):
            this_tf = 'TF_%s' % tf_index
            answers = self.gs_clean.query("Id == @this_tf").Answer
            signals = self.user_data_clean.query("TF_Id == @this_tf").Signal_Mean
            combined = pd.concat([answers, signals], axis=1)
            combined.to_csv(self._setfile(tf_index, 'Data'), index=False,
                            sep='\t', header=False, float_format="%f")
            pb.animate(tf_index)
Ejemplo n.º 15
0
    def run(self, N=10, nswap=20, verbose=True, maxstallgen=50, maxtime=60):
        """Optimise N randomised (edge-swapped) models and collect best scores.

        :param int N: number of randomised models to optimise.
        :param int nswap: number of edge swaps applied to each model.
        :param bool verbose: forwarded to the optimiser.
        :param int maxstallgen: forwarded to the optimiser.
        :param int maxtime: forwarded to the optimiser (seconds).

        Appends one best score per run to ``self.best_scores``.
        """
        from easydev import progress_bar

        # NOTE(review): this pre-loop simulator is rebuilt immediately inside
        # the loop; kept in case callers rely on self.sim being set early.
        self.sim = steady.Steady(self.real.cnograph, self.real.midas)
        # creates the model, preprocessed
        self.sim.preprocessing()

        pb = progress_bar(N)
        # range works on both Python 2 and 3 (xrange is Python 2 only)
        for i in range(0, N):
            self.sim = steady.Steady(self.real.cnograph, self.real.midas)
            self.sim.preprocessing()
            self.sim.model.swap_edges(nswap)
            self.sim.preprocessing()
            self.sim.optimise(verbose=verbose, reuse_best=False,
                              maxstallgen=maxstallgen, maxtime=maxtime)
            score = self.sim.results.results.best_score[-1]
            self.best_scores.append(score)
            pb.animate(i + 1)
Ejemplo n.º 16
0
    def clean_models(self, tolerance=0.1):
        """Prune and deduplicate models kept within *tolerance* of the best.

        Each model is simplified by dropping non-essential reactions when
        doing so does not degrade its score; duplicates and models outside
        the tolerance are then removed again.

        :param float tolerance: forwarded to ``drop_scores_above``.
        :return: the cleaned models object.
        """
        from easydev import progress_bar

        models = self.results.models.copy()
        models.midas = self.midas
        print("Found %s models within the tolerance" % len(models.df))
        models.drop_duplicates()
        print("Removing duplicates found %s" % len(models.df))
        models.drop_scores_above(tolerance=tolerance)
        print("Keeping within tolerance, found %s" % len(models.df))
        # use len(models.df) so the bar total matches the loop below
        pb = progress_bar(len(models.df))
        count = 0
        changed = 0
        for index in models.df.index:
            count += 1
            # .ix was removed from pandas; .loc is the label-based equivalent
            reactions = list(models.df.columns[models.df.loc[index] == 1])
            self.simulate(reactions)
            score = self.score()

            # compute essentiality to simplify models
            dummy, newr = self.essentiality(reactions, show=False)
            self.simulate(newr)
            new_score = self.score()
            if new_score <= score:
                # keep that pruned model
                models.df.loc[index] = self.reactions2parameters(newr)
                models.scores.loc[index] = new_score
                changed += 1
            # else: keep the original model

            pb.animate(count)
        # report a true percentage (the original printed a 0-1 fraction
        # despite the %% in the message)
        print('Simplified %s %% of the model'
              % (100 * changed / float(len(models.df))))
        models.drop_duplicates()
        print("Removing duplicates found %s" % len(models.df))
        models.drop_scores_above(tolerance=tolerance)
        print("Keeping within tolerance, found %s" % len(models.df))

        return models
Ejemplo n.º 17
0
 def plot_average_distance(self, repeat=10, N=100):
     """Plot the mean +/- std band over ``repeat`` distance trajectories of length N."""
     import pandas as pd
     import pylab
     from easydev import progress_bar

     trajectories = []
     pb = progress_bar(repeat)
     for count in range(0, repeat):
         trajectories.append(
             self.compute_distances(N=N, show=False, progress=False))
         pb.animate(count + 1)

     df = pd.DataFrame(trajectories)
     mean = df.mean()
     std = df.std()
     pylab.clf()
     pylab.fill_between(range(0, N), mean + std, y2=mean - std)
     pylab.plot(mean, 'r', lw=2)
     pylab.grid(True)
     pylab.ylim([0, 1])
     pylab.ylabel('similarity')
     return trajectories
Ejemplo n.º 18
0
    def download_all_data(self):
        """Download all large data sets from Synapse"""
        # the StringIO *module* is Python 2 only; the sibling implementation
        # of this method uses io.BytesIO (ZIP.read returns bytes there)
        from io import BytesIO
        pb = progress_bar(5)
        # load the large gold standard file from D5C2 synapse main page
        filename = self._download_data('DREAM5_GoldStandard_probes.zip',
                                       'syn2898469')
        pb.animate(1)
        z = ZIP()
        z.loadZIPFile(filename)
        data = z.read('Answers.txt')
        self.gs = pd.read_csv(BytesIO(data), sep='\t')

        # download 4 other filenames from dreamtools synapse project
        self._download_data('all_8mers.txt', 'syn4483185')
        pb.animate(2)
        self._download_data('8mers_gs.txt', 'syn4483187')
        pb.animate(3)
        self._download_data('probe35_gs.txt', 'syn4483184')
        pb.animate(4)
        self._download_data('probes35.txt', 'syn4483183')
        pb.animate(5)
Ejemplo n.º 19
0
    def download_all_data(self):
        """Download all large data sets from Synapse"""
        pb = progress_bar(5)
        # the main gold standard archive lives on the D5C2 synapse page
        gs_archive = self._download_data('DREAM5_GoldStandard_probes.zip',
                                         'syn2898469')
        pb.animate(1)
        zipper = ZIP()
        zipper.loadZIPFile(gs_archive)
        raw = zipper.read('Answers.txt')
        self.gs = pd.read_csv(BytesIO(raw), sep='\t')

        # the four remaining files come from the dreamtools synapse project
        remaining = [('all_8mers.txt', 'syn4483185'),
                     ('8mers_gs.txt', 'syn4483187'),
                     ('probe35_gs.txt', 'syn4483184'),
                     ('probes35.txt', 'syn4483183')]
        for step, (name, synapse_id) in enumerate(remaining, start=2):
            self._download_data(name, synapse_id)
            pb.animate(step)
Ejemplo n.º 20
0
    def _processing(self):
        """Build the per-TF Out/TF_XX.dat files and the DV/DVP vectors.

        :return: nothing; fills ``self._dvs`` and ``self._dvps``.
        """
        ########################################  1 Create the Out/TF_XX.dat files
        octomers = self.octomers.octomer
        octomersRC = self.octomers.octomerRC
        # reverse-complement lookup: canonical octomer given its RC
        mapping2 = dict(zip(octomersRC.values, octomers.values))
        # set of canonical octomers for O(1) membership tests
        lm = set(octomers.values)

        pb = progress_bar(self.Ntf, interval=1)
        pb.animate(0)
        for tf_index in range(1, self.Ntf + 1):
            tf = self._probes[tf_index]
            tf.columns = ['Sequence', 'Score']
            ids = collections.defaultdict(list)
            ###### TODO: most of the time is spent in the "for curR in generator" loop
            for seq, score in zip(tf.Sequence, tf.Score):
                # scan the sequence by chunk of octomers using a generator
                # for speed (although gain is small); range replaces the
                # Python-2-only xrange
                generator = (seq[i:i + 8] for i in range(0, 28))
                for curR in generator:
                    # dict.has_key() was removed in Python 3; a set
                    # membership test on the canonical octomers is equivalent
                    if curR not in lm:
                        curR = mapping2[curR]
                    ids[curR].append(score)

            # now let us build the new dataframe for the indices found
            df = pd.DataFrame({0: list(ids.keys()),
                               1: [np.median(v) for v in ids.values()]})
            try:
                df.sort_values(by=[1, 0], ascending=[False, False],
                               inplace=True)
            except AttributeError:
                # pandas < 0.17 only provides DataFrame.sort()
                df.sort(columns=[1, 0], ascending=[False, False],
                        inplace=True)
            df[1] = df[1].map(lambda x: round(x, 6))

            df.to_csv(self._setfile(tf_index, 'Out'), sep=' ', index=False,
                      header=None, float_format="%.6f")
            pb.animate(tf_index)
        ################################################# 2 create the DVP

        pb = progress_bar(self.Ntf, interval=1)
        for tf_index in range(1, self.Ntf + 1):
            tag = 'TF_%s' % tf_index
            # .ix was removed from pandas; .loc is the label-based equivalent
            tf_probes = list(self.probes_gs.loc[
                self.probes_gs.groupby('Id').groups[tag]].Sequence)

            tf = self._probes[tf_index]
            dv = tf.Sequence.apply(lambda x: x in tf_probes).astype(int)
            self._dvps[tf_index] = dv
            pb.animate(tf_index)
        print("")

        ########################################################## DV
        gs_octomers = self.octomers_gs.copy()
        gs_octomers.columns = ['id', 'octomer']
        pb = progress_bar(self.Ntf, interval=1)
        for tf_index in range(1, self.Ntf + 1):
            tag = 'TF_%s' % tf_index
            tf_octomers = list(gs_octomers.loc[
                gs_octomers.groupby('id').groups[tag]].octomer)
            tf = pd.read_csv(self._setfile(tf_index, "Out"), sep=" ",
                             header=None)
            tf.columns = ['Octomer', 'Score']
            dv = tf.Octomer.apply(lambda x: x in tf_octomers).astype(int)

            # Stores the dataframe
            self._dvs[tf_index] = dv
            pb.animate(tf_index)
Ejemplo n.º 21
0
    def run(self, eval_func, N, nswap=3, proposal=None):
        """Metropolis-like random search over binary parameter vectors.

        :param eval_func: callable scoring a parameter list; lower is better.
        :param int N: number of parameters in the bitstring.
        :param int nswap: number of swaps used to build each proposal.
        :param proposal: optional initial parameter list (defaults to all 1s).

        Stores a copy of the :class:`Results` object in ``self.results``.
        NOTE(review): the iteration count is ``self.N``, not the ``N``
        argument — confirm this is intended.
        """
        from easydev import progress_bar

        self.Nparameters = N

        results = Results(N=N, step=1)

        t1 = time.time()
        self.alpha = []
        if proposal is None:
            proposal_parameter = [1] * self.Nparameters
        else:
            proposal_parameter = proposal[:]
        init_bs = proposal_parameter[:]

        # compute the score for the initial bitstring
        prev_score = eval_func(proposal_parameter)
        prev_bs = init_bs[:]
        best_score = prev_score
        results['best_score'] = best_score
        best_parameters = init_bs[:]

        # store the initial values
        results['scores'].append(prev_score)
        results['parameters'].append(prev_bs)

        pb = progress_bar(self.N)

        for i in range(0, self.N):

            proposal_parameter = self.swaps(best_parameters, nswap)
            proposal_score = eval_func(proposal_parameter)

            # best score is the smallest one, so alpha > 1 means the new
            # proposal is better
            alpha = prev_score / proposal_score
            self.alpha.append(alpha)

            # always accept a better proposal; accept a worse one with
            # probability alpha (both former branches did identical work)
            if alpha >= 1:
                accept = True
            else:
                accept = random.uniform(0, 1) <= alpha

            if accept:
                prev_score = proposal_score
                score = proposal_score
                results['parameters'].append(proposal_parameter)
                prev_bs = proposal_parameter[:]
                accepted = 1
            else:
                score = prev_score
                results['parameters'].append(prev_bs)
                accepted = 0
            self.acceptance.append(accepted)
            results['scores'].append(score)

            if score < best_score:
                best_score = score
                best_parameters = proposal_parameter[:]
                results['best_score'] = best_score
            results['best_scores'].append(best_score)
            results['best_score'] = best_score  # just for the progress bar
            pb.animate(i + 1)  # i+1 so the bar reaches 100%

        # remove first elements to have a length of N values
        del results['scores'][0]
        del results['parameters'][0]
        results['best_parameters'] = best_parameters[:]
        # store the index of the minimum score from the best_scores list
        results['min_index'] = numpy.argmin(results['best_scores'])

        t2 = time.time()
        # Python 3 call syntax; the original Python 2 print statement is a
        # SyntaxError under Python 3
        print("simulation took", t2 - t1, "seconds.")

        self.results = results.copy()
Ejemplo n.º 22
0
    def _processing(self):
        """Build the per-TF Out/TF_XX.dat files and the DV/DVP vectors.

        :return: nothing; fills ``self._dvs`` and ``self._dvps``.
        """
        ########################################  1 Create the Out/TF_XX.dat files
        octomers = self.octomers.octomer
        octomersRC = self.octomers.octomerRC
        # forward and reverse-complement lookup tables
        mapping1 = dict([(k, v)
                         for k, v in zip(octomers.values, octomersRC.values)])
        mapping2 = dict([(k, v)
                         for k, v in zip(octomersRC.values, octomers.values)])
        keys = tuple(sorted(octomers.values))

        # set of canonical octomers for O(1) membership tests
        lm = set(octomers.values)

        pb = progress_bar(self.Ntf, interval=1)
        pb.animate(0)
        for tf_index in range(1, self.Ntf + 1):
            tf = self._probes[tf_index]
            tf.columns = ['Sequence', 'Score']
            ids = collections.defaultdict(list)
            ## TODO: most of the time is spent in "for curR in generator" loop

            for seq, score in zip(tf.Sequence, tf.Score):
                # scan the sequence by chunk of octomers using a generator
                # for speed (although gain is small)
                generator = (seq[i:i + 8] for i in range(0, 28))
                # octomers not in the canonical set are mapped back through
                # their reverse complement
                for curR in generator:
                    if curR not in lm:
                        curR = mapping2[curR]
                    ids[curR].append(score)
            # now let us build the new dataframe for the indices found
            df = pd.DataFrame({
                0: [k for k in ids.keys()],
                1: [np.median(v) for v in ids.values()]
            })
            try:
                df.sort_values(by=[1, 0],
                               ascending=[False, False],
                               inplace=True)
            except AttributeError:
                # pandas < 0.17 only provides DataFrame.sort()
                df.sort(columns=[1, 0], ascending=[False, False], inplace=True)
            df[1] = df[1].map(lambda x: round(x, 6))

            df.to_csv(self._setfile(tf_index, 'Out'),
                      sep=' ',
                      index=False,
                      header=None,
                      float_format="%.6f")
            pb.animate(tf_index)
        ################################################# 2 create the DVP

        pb = progress_bar(self.Ntf, interval=1)
        for tf_index in range(1, self.Ntf + 1):
            tag = 'TF_%s' % tf_index
            # .ix was removed from pandas; .loc is the label-based equivalent
            tf_probes = list(self.probes_gs.loc[self.probes_gs.groupby(
                'Id').groups[tag]].Sequence)

            tf = self._probes[tf_index]
            dv = tf.Sequence.apply(lambda x: x in tf_probes).astype(int)
            self._dvps[tf_index] = dv
            pb.animate(tf_index)
        print("")

        ########################################################## DV
        gs_octomers = self.octomers_gs.copy()
        gs_octomers.columns = ['id', 'octomer']
        pb = progress_bar(self.Ntf, interval=1)
        for tf_index in range(1, self.Ntf + 1):
            tag = 'TF_%s' % tf_index
            tf_octomers = list(
                gs_octomers.loc[gs_octomers.groupby('id').groups[tag]].octomer)
            tf = pd.read_csv(self._setfile(tf_index, "Out"),
                             sep=" ",
                             header=None)
            tf.columns = ['Octomer', 'Score']
            dv = tf.Octomer.apply(lambda x: x in tf_octomers).astype(int)

            # Stores the dataframe
            self._dvs[tf_index] = dv
            pb.animate(tf_index)