Example 1
 def _get_parameters_range():
     text = []
     text.append('\n\n\tParameters range:')
     text.append('\nPre_processor:')
     text.append(serialize_dict(pre_processor_parameters))
     text.append('\nVectorizer:')
     text.append(serialize_dict(vectorizer_parameters))
     text.append('\nEstimator:')
     text.append(serialize_dict(estimator_parameters))
     return '\n'.join(text)
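All of these snippets delegate the actual formatting to serialize_dict (in EDEN this helper lives in eden.util). For reference, here is a minimal, hypothetical stand-in that mimics the behavior the examples assume, including the offset keyword used further below; the real implementation may differ.

def serialize_dict(the_dict, offset='small'):
    # Hypothetical stand-in for eden.util.serialize_dict, for illustration only.
    # Render one 'key: value' line per entry; 'offset' sets the key column width.
    if not the_dict:
        return 'None'
    width = 50 if offset == 'large' else 20
    return '\n'.join('%*s: %s' % (width, key, the_dict[key])
                     for key in sorted(the_dict))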
Example 2
 def get_parameters(self):
     text = []
     text.append('\n\tModel parameters:')
     text.append('\nPre_processor:')
     text.append(serialize_dict(self.pre_processor_args))
     text.append('\nVectorizer:')
     text.append(serialize_dict(self.vectorizer_args))
     text.append('\nEstimator:')
     text.append(serialize_dict(self.estimator_args))
     return '\n'.join(text)
Example 3
 def __repr__(self):
     serial = []
     serial.append('Embedder2D:')
     if self.compiled is True:
         serial.append('compiled: yes')
         serial.append('learning_rate: %.6f' % self.deepnet_learning_rate)
         serial.append('n_features_hidden_factor: %d' % self.deepnet_n_features_hidden_factor)
     else:
         serial.append('compiled: no')
     serial.append('layout: %s' % (self.layout))
     serial.append('layout_prog: %s' % (self.layout_prog))
     if self.layout_prog_args:
         serial.append('layout_prog_args: %s' % (self.layout_prog_args))
     serial.append('n_nearest_neighbor_links: %s' % (self.n_nearest_neighbor_links))
     if self.n_nearest_neighbors is None:
         serial.append('n_nearest_neighbors: None')
     else:
         serial.append('n_nearest_neighbors: %d' % self.n_nearest_neighbors)
     serial.append('metric: %s' % self.metric)
     if self.kwds:
         serial.append('params:')
         serial.append(serialize_dict(self.kwds))
     serial.append('selectors [%d]:' % len(self.selectors))
     for i, selector in enumerate(self.selectors):
         if len(self.selectors) > 1:
             serial.append('%d/%d  ' % (i + 1, len(self.selectors)))
         serial.append(str(selector))
     return '\n'.join(serial)
Example 4
def main_script(prog_name=None, logger=None):

    parser = argparse_setup()
    args = parser.parse_args()

    if args.no_logging:
        configure_logging(logger, verbosity=args.verbosity)
    else:
        configure_logging(logger,
                          verbosity=args.verbosity,
                          filename=args.logging_dir +
                          'logs_gc%.2f_len%d_num%d' %
                          (args.gc_content, args.length, args.num) + '.log')

    logger.debug('-' * 80)
    logger.debug('Program: %s' % prog_name)
    logger.debug('\n')
    logger.debug('Called with parameters:\n\n %s \n\n' %
                 serialize_dict(args.__dict__))

    start_time = time.asctime(time.localtime(time.time()))
    logger.info('Initializing program execution %s \n\n' % (start_time))
    try:
        main(args, logger)
    except Exception:
        import datetime
        curr_time = datetime.datetime.now().strftime("%A, %d. %B %Y %I:%M%p")
        logger.exception("Program run failed on %s" % curr_time)
        exit(1)
    finally:
        end_time = time.asctime(time.localtime(time.time()))
        logger.info('Finished program execution %s' % (end_time))
        logger.info('-' * 80)
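For context, a harness like this would typically be wired up from the script entry point; a minimal sketch, assuming the module also defines argparse_setup and main as used above (the names below are illustrative):

import logging
import os
import sys

if __name__ == '__main__':
    # Hypothetical wiring; the real script may construct its logger differently.
    main_script(prog_name=os.path.basename(sys.argv[0]),
                logger=logging.getLogger(__name__))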
Example 5
 def __repr__(self):
     serial = []
     serial.append('Embedder2D:')
     if self.compiled is True:
         serial.append('compiled: yes')
         serial.append('learning_rate: %.6f' % self.learning_rate)
         serial.append('n_features_hidden_factor: %d' %
                       self.n_features_hidden_factor)
     else:
         serial.append('compiled: no')
     serial.append('layout: %s' % (self.layout))
     serial.append('layout_prog: %s' % (self.layout_prog))
     if self.layout_prog_args:
         serial.append('layout_prog_args: %s' % (self.layout_prog_args))
     serial.append('n_links: %s' % (self.n_links))
     if self.n_nearest_neighbors is None:
         serial.append('n_nearest_neighbors: None')
     else:
         serial.append('n_nearest_neighbors: %d' % self.n_nearest_neighbors)
     serial.append('metric: %s' % self.metric)
     if self.kwds:
         serial.append('params:')
         serial.append(serialize_dict(self.kwds))
     serial.append('selectors [%d]:' % len(self.selectors))
     for i, selector in enumerate(self.selectors):
         if len(self.selectors) > 1:
             serial.append('%d/%d  ' % (i + 1, len(self.selectors)))
         serial.append(str(selector))
     return '\n'.join(serial)
Example 6
def main_script(model_initializer=None,
                description=None,
                epilog=None,
                prog_name=None,
                logger=None):
    parser = argparse_setup(model_initializer, description, epilog)
    args = parser.parse_args()

    if args.no_logging:
        configure_logging(logger, verbosity=args.verbosity)
    else:
        configure_logging(logger,
                          verbosity=args.verbosity,
                          filename=prog_name + '.log')

    logger.debug('-' * 80)
    logger.debug('Program: %s' % prog_name)
    logger.debug('Called with parameters:\n %s' %
                 serialize_dict(args.__dict__))

    start_time = time()
    try:
        main(model_initializer, args)
    except Exception:
        import datetime
        curr_time = datetime.datetime.now().strftime("%A, %d. %B %Y %I:%M%p")
        logger.exception("Program run failed on %s" % curr_time)
    finally:
        end_time = time()
        logger.info('Elapsed time: %.1f sec', end_time - start_time)
Example 7
    def visualize(self, data, target, title='', region_only=False):
        """visualize."""
        auc = self.score(data, target)
        title += 'roc:%.2f' % (auc)
        title += '\nparams:%s' % serialize_dict(self.get_params())

        x2dim = self.transform(data)

        x_min, x_max = x2dim[:, 0].min(), x2dim[:, 0].max()
        y_min, y_max = x2dim[:, 1].min(), x2dim[:, 1].max()
        b = max((x_max - x_min) / 10, (y_max - y_min) / 10)  # border size
        x_min, x_max = x_min - b, x_max + b
        y_min, y_max = y_min - b, y_max + b
        h = b / 20  # step size in the mesh
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))

        grid2d = np.c_[xx.ravel(), yy.ravel()]
        z = self.est2d.predict_proba(grid2d)
        z = 1 - z.reshape(xx.shape)
        plt.contourf(xx,
                     yy,
                     z,
                     cmap=plt.get_cmap('BrBG'),
                     alpha=.3,
                     levels=[0.05, 0.25, 0.5, 0.75, 0.95],
                     extend='both')
        plt.contour(xx,
                    yy,
                    z,
                    levels=[-1, 0.5, 2],
                    colors='w',
                    linewidths=[.5, 4, .5],
                    linestyles=['solid', 'solid', 'solid'],
                    extend='both')
        plt.contour(xx,
                    yy,
                    z,
                    levels=[-1, 0.5, 2],
                    colors='k',
                    linewidths=[.5, 2, .5],
                    linestyles=['solid', 'solid', 'solid'],
                    extend='both')
        if region_only is False:
            plt.scatter(x2dim[:, 0],
                        x2dim[:, 1],
                        alpha=.8,
                        c=target,
                        s=30,
                        edgecolors='k',
                        cmap=plt.get_cmap('gray'))
        plt.title(title)
        plt.grid(False)
        plt.axis('off')
        return self
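The plotting idiom above (a mesh grid over the bounding box, predict_proba evaluated on the grid, filled contours plus a highlighted 0.5 level) is the standard way to render a 2D decision surface. A self-contained sketch of the same technique with scikit-learn, independent of this class:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

# Toy 2D data: two Gaussian blobs.
rng = np.random.RandomState(0)
data = np.vstack([rng.randn(50, 2) - 1, rng.randn(50, 2) + 1])
target = np.array([0] * 50 + [1] * 50)
est = LogisticRegression().fit(data, target)

# Evaluate the class-1 probability on a grid covering the data.
xx, yy = np.meshgrid(np.linspace(data[:, 0].min() - 1, data[:, 0].max() + 1, 200),
                     np.linspace(data[:, 1].min() - 1, data[:, 1].max() + 1, 200))
z = est.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1].reshape(xx.shape)

plt.contourf(xx, yy, z, cmap=plt.get_cmap('BrBG'), alpha=.3)
plt.contour(xx, yy, z, levels=[0.5], colors='k')  # the decision boundary
plt.scatter(data[:, 0], data[:, 1], c=target, edgecolors='k')
plt.show()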
Example 8
 def __repr__(self):
     serial = []
     serial.append('Projector:')
     serial.append('metric: %s' % self.metric)
     if self.kwds:
         serial.append('params:')
         serial.append(serialize_dict(self.kwds))
     serial.append(str(self.selector))
     return '\n'.join(serial)
Example 9
 def __repr__(self):
     serial = []
     serial.append(self.name)
     serial.append('n_instances: %d' % (self.n_instances))
     serial.append('metric: %s' % (self.metric))
     if self.kwds:
         serial.append('params:')
         serial.append(serialize_dict(self.kwds))
     serial.append('random_state: %d' % (self.random_state))
     return '\n'.join(serial)
Example 10
def main_script(model_initializer=None, description=None, epilog=None, prog_name=None, logger=None):
    parser = argparse_setup(model_initializer, description, epilog)
    args = parser.parse_args()

    if args.no_logging:
        configure_logging(logger, verbosity=args.verbosity)
    else:
        configure_logging(logger, verbosity=args.verbosity, filename=prog_name + '.log')

    logger.debug('-' * 80)
    logger.debug('Program: %s' % prog_name)
    logger.debug('Called with parameters:\n %s' % serialize_dict(args.__dict__))

    start_time = time()
    try:
        main(model_initializer, args)
    except Exception:
        import datetime
        curr_time = datetime.datetime.now().strftime("%A, %d. %B %Y %I:%M%p")
        logger.exception("Program run failed on %s" % curr_time)
    finally:
        end_time = time()
        logger.info('Elapsed time: %.1f sec', end_time - start_time)
Example 11
 def __repr__(self):
     """string."""
     return serialize_dict(self.__dict__, offset='large')
Example 12
    def optimize(self, sequences=None, init_params=1):
        """
        Function to generate a list of average of ROC mean and ROC standard deviation and its 
        corresponding parameters 

        Parameters
        ----------

        sequences: list (default value = None)
            the list of sequences generated      
        no_of_times_of_parameter_initialization: int (default value = 1)
            number of times we call parameter generation function to randomly initialize the parameters
        no_of_times_fit_predict: int (default value = 1)
            number of times for a set of parameters we fit()/predict() the samples using neural network

        Returns
        --------

        list_of_ROC_score_and_parameters: 
            a list of average of ROC mean and ROC standard deviation and corresponding parameters
        """

        opt_net = None
        min_mean_ROC = 0.0
        min_std_dev_ROC = 0.0
        params_to_log = dict()

        # Split sequences into train and test halves
        sequences_train = sequences[:len(sequences) // 2]
        sequences_test = sequences[len(sequences) // 2:]

        # Get training and testing structure matrices
        struct_matrix_train = self.transformer.seq_to_struct(sequences_train)
        struct_matrix_test = self.transformer.seq_to_struct(sequences_test)

        min_score = 1

        for i in range(init_params):

            # Set the network parameters
            params = self.randomize(self.random_state)

            # Concatenate the parameter dictionaries
            parameters = dict(params)
            parameters.update(self.transformer.params)

            # Instantiate the neural network class
            deep_neural_network = DeepNeuralNetwork(
                params=parameters, seq_pre_processor=self.seq_pre_processor)

            # Train the model and get the predicted matrix
            predict_matrix = deep_neural_network.fit_predict(
                sequences_train, sequences_test, struct_matrix_train)

            # Compare the predicted to the original matrix
            ROC_mean_score, ROC_std_dev_score = self.estimate_data_representation_equivalence(
                struct_matrix_test, predict_matrix)
            curr_score = abs(float(ROC_mean_score) - 0.5) + ROC_std_dev_score
            if curr_score < min_score:
                min_mean_ROC = ROC_mean_score
                min_std_dev_ROC = ROC_std_dev_score
                params_to_log = parameters
                min_score = curr_score
                opt_net = deep_neural_network
                #deep_neural_network.save()

        logger.info('\n\n')
        logger.info('On train set:')
        logger.info('AUC ROC: %.4f +- %.4f' % (min_mean_ROC, min_std_dev_ROC))
        logger.info('\n\n')
        logger.info('Trained and tested with parameters:\n\n %s \n\n' %
                    serialize_dict(params_to_log))

        return opt_net
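A hedged usage sketch (the optimizer instance and sequence list are assumed to exist; the names are illustrative): the method is driven with a pool of sequences and a budget of random initializations, and returns the best-scoring trained network.

# Hypothetical driver; 'optimizer' and 'sequences' come from the application.
best_net = optimizer.optimize(sequences=sequences, init_params=10)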
Example 13
    def sample(self, graph_iter,

               probabilistic_core_choice=True,
               score_core_choice=False,
               max_size_diff=-1,

               similarity=-1,
               n_samples=None,
               proposal_probability=False,
               batch_size=10,
               n_jobs=0,

               target_orig_cip=False,
               n_steps=50,
               quick_skip_orig_cip=False,
               improving_threshold=-1,
               improving_linear_start=0,
               accept_static_penalty=0.0,
               accept_min_similarity=0.0,
               select_cip_max_tries=20,
               burnin=0,
               backtrack=0,

               include_seed=False,
               keep_duplicates=False,
               monitor=False):

        '''Sample graphs starting from the seeds in graph_iter.

        Parameters
        ----------
        graph_iter : iterator over networkx graphs
            the previously trained preprocessor will turn these into
            graph wrappers

        probabilistic_core_choice : bool
            cores are chosen according to their frequency in the grammar
        score_core_choice : bool
            cores are chosen probabilistically according to their score
        max_size_diff : int
            a linearly increasing penalty is applied to enforce that the
            graphs stay in the desired size range

        similarity : float
            stop condition for sampling: stop once the desired similarity
            is reached. Be warned that the similarity measure behaves
            oddly due to the high dimensionality of the EDEN vectors.
        n_samples : int
            collect this many samples for each seed graph
        proposal_probability : bool
            only available when not dealing with abstract graphs; enable
            it to comply with Metropolis-Hastings
        batch_size : int
            this many graphs will be processed by one worker instance
            (computing the maximum effective number automatically might
            be preferable)
        n_jobs : int (-1)
            number of processes to use; -1 means the CPU count
        target_orig_cip : bool
            only replace low-scoring parts of the graph; see the
            implementation for details

        n_steps : int
            number of sampling steps

        quick_skip_orig_cip : bool
            for each CIP on the original graph, only try one entry from
            the grammar

        improving_threshold : float
            starting from this fraction of n_steps, only accept a graph
            if it improves the score
        improving_linear_start : float
            starting from this fraction of n_steps, apply a linearly
            increasing penalty to the score until the improving_threshold
            value is reached
        accept_static_penalty : float
            decrease the probability of accepting a worse graph
        accept_min_similarity : float in [0, 1]
            acceptance requirement: graphs must be at least this similar
            to be accepted; zero disables the check
        select_cip_max_tries : int
            try this many times to get a CIP from the original graph
            before declaring the seed dead
        burnin : int
            ignore this many graphs before n_samples starts collecting
        backtrack : int
            sometimes a dead-end graph is generated: a graph that is
            valid but for which no proposal can be found. You can take
            one step back this many times. This is of questionable
            efficiency currently because we cannot detect the exact
            place where things went wrong.
        include_seed : bool
            also output the seed graph as a sample
        keep_duplicates : bool
            Metropolis compliance requires outputting duplicates, but
            otherwise duplicates are not interesting
        monitor : bool
            enable monitoring; after sampling, sampler.monitors will
            contain all the collected information

        Returns
        -------
        list of graphs
        '''

        self.maxbacktrack = backtrack

        self.monitor = monitor
        self.monitors = []
        self.accept_min_similarity = accept_min_similarity
        self.proposal_probability = proposal_probability

        self.similarity = similarity

        # count the enabled core-choice strategies; at most one is allowed
        if probabilistic_core_choice + score_core_choice + (max_size_diff != -1) > 1:
            raise Exception('choose at most one CIP choice strategy')

        if n_samples:
            self.sampling_interval = int((n_steps - burnin) / (n_samples + include_seed - 1))
        else:
            self.sampling_interval = 9999

        self.n_steps = n_steps
        self.quick_skip_orig_cip = quick_skip_orig_cip
        self.n_jobs = n_jobs
        self.target_orig_cip = target_orig_cip

        # the user doesn't know about edge nodes, so the size limit must be doubled
        max_size_diff = max_size_diff * 2
        self.max_core_size_diff = max_size_diff

        # calculate the actual step thresholds for the improving schedule
        self.improving_threshold = improving_threshold
        if improving_threshold > 0:
            self.improving_threshold = int(self.improving_threshold * self.n_steps)

        self.improving_linear_start = improving_linear_start
        if improving_linear_start > 0:
            self.improving_linear_start = int(improving_linear_start * n_steps)
        if self.improving_linear_start == self.improving_threshold:
            self.improving_threshold += 1

        self.improving_penalty_per_step = ((1 - accept_static_penalty) /
                                           float(self.improving_threshold -
                                                 self.improving_linear_start))

        self.accept_static_penalty = accept_static_penalty
        self.select_cip_max_tries = select_cip_max_tries
        self.burnin = burnin
        self.include_seed = include_seed
        self.batch_size = batch_size
        self.probabilistic_core_choice = probabilistic_core_choice
        self.score_core_choice = score_core_choice

        self.keep_duplicates = keep_duplicates
        # adapt grammar to task:
        self.lsgg.preprocessing(n_jobs,
                                max_size_diff,
                                probabilistic_core_choice)

        if score_core_choice:
            self.score_core_choice_dict = {}
            for interface in self.lsgg.productions:
                for core in self.lsgg.productions[interface]:
                    gr = self.lsgg.productions[interface][core].graph.copy()
                    transformed_graph = self.vectorizer.transform_single(gr)
                    score = self.estimatorobject.cal_estimator.predict_proba(transformed_graph)[0, 1]
                    self.score_core_choice_dict[core] = score


        logger.debug(serialize_dict(self.__dict__))

        if self.random_state is not None:
            random.seed(self.random_state)
        # sampling
        if n_jobs in [0, 1]:
            for graph in graph_iter:
                sampled_graph, monitor_data = self._sample(graph)
                for new_graph in self.return_formatter(sampled_graph, monitor_data):
                    yield new_graph
        else:
            if n_jobs > 1:
                pool = Pool(processes=n_jobs)
            else:
                pool = Pool()
            sampled_graphs = pool.imap_unordered(_sample_multi, self._argbuilder(graph_iter))

            for batch in sampled_graphs:
                for graph, monitor_data in batch:
                    for new_graph in self.return_formatter(graph, monitor_data):
                        yield new_graph
            pool.close()
            pool.join()
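Since sample is a generator, callers must iterate it to drive the sampling. A minimal hedged sketch using only parameters documented above ('sampler' and 'input_graphs' are assumed to exist):

# Hypothetical usage; consume the generator to collect the samples.
samples = list(sampler.sample(input_graphs,
                              n_steps=100,
                              n_samples=5,
                              burnin=10,
                              keep_duplicates=False))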
Example 14
 def __repr__(self):
     return serialize_dict(self.__dict__, offset='large')
Example 15
 def __str__(self):
     """String."""
     return "%s:\n%s" % (self.__class__, serialize_dict(self.__dict__))
Example 16
def main(args):
    """Main."""
    # read variables
    # if no -i is given then read from stdin
    seq = args['-i']
    seq = (sys.stdin.readline().strip() if args['-i'] == 'stdin' else seq)
    k = int(args['-k'])
    complexity = int(args['--complexity'][0])
    nbits = int(args['--nbits'][0])
    window_size = int(args['--window_size'][0])
    window_size = min(len(seq), window_size)
    max_bp_span = int(args['--max_bp_span'][0])
    max_bp_span = min(len(seq), max_bp_span)
    avg_bp_prob_cutoff = float(args['--avg_bp_prob_cutoff'][0])
    hard_threshold = float(args['--hard_threshold'][0])
    max_num_edges = int(args['--max_num_edges'][0])
    no_lonely_bps = args['--no_lonely_bps']
    no_nesting = args['--no_nesting']
    draw = args['--draw']
    jpg = args['--jpg']
    svg = args['--svg']
    png = args['--png']
    pdf = args['--pdf']

    nesting = not no_nesting
    # setup logger
    if args['--verbose']:
        verbosity = 2
    else:
        verbosity = 1
    configure_logging(logger, verbosity=verbosity, filename='log')
    logger.debug(serialize_dict(args))

    # setup folding algorithm
    rase = StructuralStabilityEstimator(seq,
                                        alphabet='ACGU',
                                        k=k,
                                        complexity=complexity,
                                        nbits=nbits,
                                        window_size=window_size,
                                        max_bp_span=max_bp_span,
                                        avg_bp_prob_cutoff=avg_bp_prob_cutoff,
                                        hard_threshold=hard_threshold,
                                        max_num_edges=max_num_edges,
                                        no_lonely_bps=no_lonely_bps,
                                        nesting=nesting)
    # print: nt pos, original nt, most de-stabilizing nt, dotbracket, score
    for line in rase.transform(seq):
        print(line)

    # if drawing is required use the folding algorithm to compute the graph
    if draw:
        suffix = 'pdf'
        if jpg:
            suffix = 'jpg'
        if svg:
            suffix = 'svg'
        if png:
            suffix = 'png'
        if pdf:
            suffix = 'pdf'
        structure_fname = 'structure.' + suffix
        score_fname = 'score.' + suffix
        all_plots_fname = 'structures.' + suffix
        rase.draw(file_name=structure_fname)
        rase.plot(file_name=score_fname)
        rase.draw_all(file_name=all_plots_fname)
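The args['-i'] and args['--complexity'][0] indexing suggests a docopt-style argument dictionary. A minimal hedged sketch of how such a dict is typically produced (the usage string below is illustrative, not the script's real interface):

"""Illustrative docopt interface (hypothetical).

Usage:
  rase.py [-i <seq>] [-k <k>] [--complexity=<c>...] [--verbose] [--draw]
"""
from docopt import docopt

if __name__ == '__main__':
    args = docopt(__doc__)
    main(args)  # main as defined above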