def _get_parameters_range():
    text = []
    text.append('\n\n\tParameters range:')
    text.append('\nPre_processor:')
    text.append(serialize_dict(pre_processor_parameters))
    text.append('\nVectorizer:')
    text.append(serialize_dict(vectorizer_parameters))
    text.append('\nEstimator:')
    text.append(serialize_dict(estimator_parameters))
    return '\n'.join(text)

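# All of the helpers in this listing format parameter dictionaries through
# serialize_dict, whose implementation is not shown here (in EDEN it lives in
# eden.util). The stand-in below is a minimal sketch, assuming only the call
# signatures actually used in this file: serialize_dict(a_dict) and
# serialize_dict(a_dict, offset='large').
def serialize_dict(the_dict, offset='small'):
    # hypothetical helper: one 'key: value' pair per line, keys padded to a
    # fixed width that depends on the offset flag
    if not the_dict:
        return 'None'
    width = 45 if offset == 'large' else 25
    return '\n'.join('%s: %s' % (str(key).ljust(width), value)
                     for key, value in sorted(the_dict.items()))
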
def get_parameters(self):
    text = []
    text.append('\n\tModel parameters:')
    text.append('\nPre_processor:')
    text.append(serialize_dict(self.pre_processor_args))
    text.append('\nVectorizer:')
    text.append(serialize_dict(self.vectorizer_args))
    text.append('\nEstimator:')
    text.append(serialize_dict(self.estimator_args))
    return '\n'.join(text)

def __repr__(self):
    serial = []
    serial.append('Embedder2D:')
    if self.compiled is True:
        serial.append('compiled: yes')
        serial.append('learning_rate: %.6f' % self.deepnet_learning_rate)
        serial.append('n_features_hidden_factor: %d' % self.deepnet_n_features_hidden_factor)
    else:
        serial.append('compiled: no')
    serial.append('layout: %s' % self.layout)
    serial.append('layout_prog: %s' % self.layout_prog)
    if self.layout_prog_args:
        serial.append('layout_prog_args: %s' % self.layout_prog_args)
    serial.append('n_nearest_neighbor_links: %s' % self.n_nearest_neighbor_links)
    if self.n_nearest_neighbors is None:
        serial.append('n_nearest_neighbors: None')
    else:
        serial.append('n_nearest_neighbors: %d' % self.n_nearest_neighbors)
    serial.append('metric: %s' % self.metric)
    if self.kwds:
        serial.append('params:')
        serial.append(serialize_dict(self.kwds))
    serial.append('selectors [%d]:' % len(self.selectors))
    for i, selector in enumerate(self.selectors):
        if len(self.selectors) > 1:
            serial.append('%d/%d ' % (i + 1, len(self.selectors)))
        serial.append(str(selector))
    return '\n'.join(serial)

def main_script(prog_name=None, logger=None):
    parser = argparse_setup()
    args = parser.parse_args()
    if args.no_logging:
        configure_logging(logger, verbosity=args.verbosity)
    else:
        configure_logging(logger,
                          verbosity=args.verbosity,
                          filename=args.logging_dir + 'logs_gc%.2f_len%d_num%d.log' %
                          (args.gc_content, args.length, args.num))
    logger.debug('-' * 80)
    logger.debug('Program: %s' % prog_name)
    logger.debug('\n')
    logger.debug('Called with parameters:\n\n %s \n\n' % serialize_dict(args.__dict__))
    start_time = time.asctime(time.localtime(time.time()))
    logger.info('Initializing program execution %s \n\n' % start_time)
    try:
        main(args, logger)
    except Exception:
        import datetime
        curr_time = datetime.datetime.now().strftime("%A, %d. %B %Y %I:%M%p")
        logger.exception("Program run failed on %s" % curr_time)
        exit(1)
    finally:
        end_time = time.asctime(time.localtime(time.time()))
        logger.info('Terminating program execution %s' % end_time)
        logger.info('-' * 80)

def __repr__(self):
    serial = []
    serial.append('Embedder2D:')
    if self.compiled is True:
        serial.append('compiled: yes')
        serial.append('learning_rate: %.6f' % self.learning_rate)
        serial.append('n_features_hidden_factor: %d' % self.n_features_hidden_factor)
    else:
        serial.append('compiled: no')
    serial.append('layout: %s' % self.layout)
    serial.append('layout_prog: %s' % self.layout_prog)
    if self.layout_prog_args:
        serial.append('layout_prog_args: %s' % self.layout_prog_args)
    serial.append('n_links: %s' % self.n_links)
    if self.n_nearest_neighbors is None:
        serial.append('n_nearest_neighbors: None')
    else:
        serial.append('n_nearest_neighbors: %d' % self.n_nearest_neighbors)
    serial.append('metric: %s' % self.metric)
    if self.kwds:
        serial.append('params:')
        serial.append(serialize_dict(self.kwds))
    serial.append('selectors [%d]:' % len(self.selectors))
    for i, selector in enumerate(self.selectors):
        if len(self.selectors) > 1:
            serial.append('%d/%d ' % (i + 1, len(self.selectors)))
        serial.append(str(selector))
    return '\n'.join(serial)

def main_script(model_initializer=None, description=None, epilog=None,
                prog_name=None, logger=None):
    parser = argparse_setup(model_initializer, description, epilog)
    args = parser.parse_args()
    if args.no_logging:
        configure_logging(logger, verbosity=args.verbosity)
    else:
        configure_logging(logger, verbosity=args.verbosity,
                          filename=prog_name + '.log')
    logger.debug('-' * 80)
    logger.debug('Program: %s' % prog_name)
    logger.debug('Called with parameters:\n %s' % serialize_dict(args.__dict__))
    start_time = time()
    try:
        main(model_initializer, args)
    except Exception:
        import datetime
        curr_time = datetime.datetime.now().strftime("%A, %d. %B %Y %I:%M%p")
        logger.exception("Program run failed on %s" % curr_time)
    finally:
        end_time = time()
        logger.info('Elapsed time: %.1f sec', end_time - start_time)

def visualize(self, data, target, title='', region_only=False):
    """Plot the 2D embedding of the data with the estimator's decision surface."""
    auc = self.score(data, target)
    title += 'roc:%.2f' % auc
    title += '\nparams:%s' % serialize_dict(self.get_params())
    x2dim = self.transform(data)
    x_min, x_max = x2dim[:, 0].min(), x2dim[:, 0].max()
    y_min, y_max = x2dim[:, 1].min(), x2dim[:, 1].max()
    b = max((x_max - x_min) / 10, (y_max - y_min) / 10)  # border size
    x_min, x_max = x_min - b, x_max + b
    y_min, y_max = y_min - b, y_max + b
    h = b / 20  # step size in the mesh
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    grid2d = np.c_[xx.ravel(), yy.ravel()]
    z = self.est2d.predict_proba(grid2d)
    z = 1 - z.reshape(xx.shape)
    plt.contourf(xx, yy, z, cmap=plt.get_cmap('BrBG'), alpha=.3,
                 levels=[0.05, 0.25, 0.5, 0.75, 0.95], extend='both')
    plt.contour(xx, yy, z, levels=[-1, 0.5, 2], colors='w',
                linewidths=[.5, 4, .5],
                linestyles=['solid', 'solid', 'solid'], extend='both')
    plt.contour(xx, yy, z, levels=[-1, 0.5, 2], colors='k',
                linewidths=[.5, 2, .5],
                linestyles=['solid', 'solid', 'solid'], extend='both')
    if not region_only:
        plt.scatter(x2dim[:, 0], x2dim[:, 1], alpha=.8, c=target, s=30,
                    edgecolors='k', cmap=plt.get_cmap('gray'))
    plt.title(title)
    plt.grid(False)
    plt.axis('off')
    return self

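# The contour logic above is a standard 2D decision-surface plot: predict the
# class probability on a dense mesh over the embedding, then draw filled
# probability bands and emphasize the 0.5 iso-line. The stand-alone sketch
# below reproduces the same technique on toy data; the dataset and estimator
# (make_moons, LogisticRegression) are illustrative stand-ins, not the est2d
# used by the class above.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.linear_model import LogisticRegression

data, target = make_moons(n_samples=200, noise=0.3, random_state=1)
est2d = LogisticRegression().fit(data, target)
x_min, x_max = data[:, 0].min(), data[:, 0].max()
y_min, y_max = data[:, 1].min(), data[:, 1].max()
b = max((x_max - x_min) / 10, (y_max - y_min) / 10)  # border size
xx, yy = np.meshgrid(np.arange(x_min - b, x_max + b, b / 20),
                     np.arange(y_min - b, y_max + b, b / 20))
# positive-class probability on every mesh point, reshaped back to the grid
z = est2d.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1].reshape(xx.shape)
plt.contourf(xx, yy, z, cmap=plt.get_cmap('BrBG'), alpha=.3,
             levels=[0.05, 0.25, 0.5, 0.75, 0.95], extend='both')
plt.contour(xx, yy, z, levels=[0.5], colors='k', linewidths=[2])
plt.scatter(data[:, 0], data[:, 1], c=target, s=30, edgecolors='k')
plt.axis('off')
plt.show()
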
def __repr__(self):
    serial = []
    serial.append('Projector:')
    serial.append('metric: %s' % self.metric)
    if self.kwds:
        serial.append('params:')
        serial.append(serialize_dict(self.kwds))
    serial.append(str(self.selector))
    return '\n'.join(serial)

def __repr__(self):
    serial = []
    serial.append(self.name)
    serial.append('n_instances: %d' % self.n_instances)
    serial.append('metric: %s' % self.metric)
    if self.kwds:
        serial.append('params:')
        serial.append(serialize_dict(self.kwds))
    serial.append('random_state: %d' % self.random_state)
    return '\n'.join(serial)

def __repr__(self):
    """string."""
    return serialize_dict(self.__dict__, offset='large')

def optimize(self, sequences=None, init_params=1):
    """Search random parameter initializations for the best network.

    Generates `init_params` random parameter settings, trains a network for
    each, and keeps the one whose predicted structure matrix is closest to
    the true one (mean ROC AUC nearest 0.5, with low standard deviation).

    Parameters
    ----------
    sequences : list (default value = None)
        the list of generated sequences

    init_params : int (default value = 1)
        number of times the parameter generation function is called to
        randomly initialize the parameters

    Returns
    -------
    opt_net : the trained network that achieved the best score; its mean ROC,
        ROC standard deviation and parameters are logged
    """
    opt_net = None
    min_mean_ROC = 0.0
    min_std_dev_ROC = 0.0
    params_to_log = dict()
    # Split sequences into train and test halves
    sequences_train = sequences[:len(sequences) // 2]
    sequences_test = sequences[len(sequences) // 2:]
    # Get the training and testing structure matrices
    struct_matrix_train = self.transformer.seq_to_struct(sequences_train)
    struct_matrix_test = self.transformer.seq_to_struct(sequences_test)
    min_score = 1
    for i in range(init_params):
        # Set the network parameters
        params = self.randomize(self.random_state)
        # Merge the parameter dictionaries
        parameters = dict(params)
        parameters.update(self.transformer.params)
        # Instantiate the neural network class
        deep_neural_network = DeepNeuralNetwork(
            params=parameters, seq_pre_processor=self.seq_pre_processor)
        # Train the model and get the predicted matrix
        predict_matrix = deep_neural_network.fit_predict(
            sequences_train, sequences_test, struct_matrix_train)
        # Compare the predicted to the original matrix
        ROC_mean_score, ROC_std_dev_score = \
            self.estimate_data_representation_equivalence(
                struct_matrix_test, predict_matrix)
        curr_score = abs(float(ROC_mean_score) - 0.5) + ROC_std_dev_score
        if curr_score < min_score:
            min_mean_ROC = ROC_mean_score
            min_std_dev_ROC = ROC_std_dev_score
            params_to_log = parameters
            min_score = curr_score
            opt_net = deep_neural_network
            # deep_neural_network.save()
    logger.info('\n\n')
    logger.info('On train set:')
    logger.info('AUC ROC: %.4f +- %.4f' % (min_mean_ROC, min_std_dev_ROC))
    logger.info('\n\n')
    logger.info('Trained and tested with parameters:\n\n %s \n\n' %
                serialize_dict(params_to_log))
    return opt_net

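# The selection criterion above is worth spelling out: a run is better when
# its mean ROC AUC is closer to 0.5 (the predicted structure matrix is
# statistically indistinguishable from the true one) and its deviation is
# small. A minimal restatement of the same criterion, with made-up numbers:
def representation_score(roc_mean, roc_std):
    # lower is better: distance of the mean AUC from chance level plus spread
    return abs(float(roc_mean) - 0.5) + roc_std

assert representation_score(0.52, 0.03) < representation_score(0.70, 0.01)
# 0.05 beats 0.21, so the (0.52, 0.03) run would become the new opt_net
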
def sample(self, graph_iter,
           probabilistic_core_choice=True,
           score_core_choice=False,
           max_size_diff=-1,
           similarity=-1,
           n_samples=None,
           proposal_probability=False,
           batch_size=10,
           n_jobs=0,
           target_orig_cip=False,
           n_steps=50,
           quick_skip_orig_cip=False,
           improving_threshold=-1,
           improving_linear_start=0,
           accept_static_penalty=0.0,
           accept_min_similarity=0.0,
           select_cip_max_tries=20,
           burnin=0,
           backtrack=0,
           include_seed=False,
           keep_duplicates=False,
           monitor=False):
    '''
    Parameters
    ----------
    graph_iter : iterator over networkx graphs
        the previously trained preprocessor will turn them into graph wrappers
    probabilistic_core_choice : bool
        cores are chosen according to their frequency in the grammar
    score_core_choice : bool
        cores are chosen probabilistically according to their score
    max_size_diff : int
        a linearly increasing penalty is applied to enforce that the graphs
        stay in the desired size range
    similarity : float
        stop condition for sampling: stop once the desired similarity is
        reached; the similarity measure behaves unintuitively due to the high
        dimensionality of the EDEN vector, be warned
    n_samples : int
        collect this many samples for each seed graph
    proposal_probability : bool
        if you are not dealing with abstract graphs you get this option;
        enable it to comply with Metropolis-Hastings
    batch_size : int
        this many graphs will be processed by one worker instance (maybe the
        maximum effective number should be calculated and used instead)
    n_jobs : int (-1)
        number of processes used; -1 means cpu count
    target_orig_cip : bool
        only replace low-scoring parts of the graph; see implementation for
        details
    n_steps : int
        number of sample steps
    quick_skip_orig_cip : bool
        for each cip on the original graph, only try one entry from the
        grammar
    improving_threshold : float
        starting from this fraction of n_steps, only accept a graph if it
        improves the score
    improving_linear_start : float
        starting from this fraction, a linearly increasing penalty is applied
        to the score until the improving_threshold value is reached
    accept_static_penalty : float
        decrease the probability of accepting a worse graph
    accept_min_similarity : float in [0, 1]
        acceptance requirement: graphs must be at least this similar to be
        accepted; zero disables the check
    select_cip_max_tries : int
        try this many times to get a cip from the original graph before
        declaring the seed dead
    burnin : int
        ignore this many graphs before n_samples starts collecting
    backtrack : int
        sometimes you generate a dead-end graph, a graph that is valid but
        for which no proposal can be found; you can take one step back this
        many times; this is of questionable efficiency currently because we
        cannot detect the exact place where we went wrong
    include_seed : bool
        also collect the seed graph as a sample
    keep_duplicates : bool
        Metropolis compliance says that we should output duplicates, but
        otherwise duplicates are not interesting
    monitor : bool
        enable monitoring; after sampling, sampler.monitors will contain all
        the information

    Returns
    -------
    list of graphs
    '''
    self.maxbacktrack = backtrack
    self.monitor = monitor
    self.monitors = []
    self.accept_min_similarity = accept_min_similarity
    self.proposal_probability = proposal_probability
    self.similarity = similarity

    # each flag counts as one core-choice strategy; at most one may be active
    if probabilistic_core_choice + score_core_choice + (max_size_diff != -1) > 1:
        raise Exception('choose max one cip choice strategy')

    if n_samples:
        self.sampling_interval = int((n_steps - burnin) / (n_samples + include_seed - 1))
    else:
        self.sampling_interval = 9999
    self.n_steps = n_steps
    self.quick_skip_orig_cip = quick_skip_orig_cip
    self.n_jobs = n_jobs
    self.target_orig_cip = target_orig_cip

    # the user doesn't know about edge nodes, so the size difference is doubled
    max_size_diff = max_size_diff * 2
    self.max_core_size_diff = max_size_diff

    # calculate the actual steps for improving
    self.improving_threshold = improving_threshold
    if improving_threshold > 0:
        self.improving_threshold = int(self.improving_threshold * self.n_steps)
    self.improving_linear_start = improving_linear_start
    if improving_linear_start > 0:
        self.improving_linear_start = int(improving_linear_start * n_steps)
    if self.improving_linear_start == self.improving_threshold:
        self.improving_threshold += 1
    self.improving_penalty_per_step = (1 - accept_static_penalty) / float(
        self.improving_threshold - self.improving_linear_start)

    self.accept_static_penalty = accept_static_penalty
    self.select_cip_max_tries = select_cip_max_tries
    self.burnin = burnin
    self.include_seed = include_seed
    self.batch_size = batch_size
    self.probabilistic_core_choice = probabilistic_core_choice
    self.score_core_choice = score_core_choice
    self.keep_duplicates = keep_duplicates

    # adapt grammar to task:
    self.lsgg.preprocessing(n_jobs, max_size_diff, probabilistic_core_choice)
    if score_core_choice:
        self.score_core_choice_dict = {}
        for interface in self.lsgg.productions:
            for core in self.lsgg.productions[interface]:
                gr = self.lsgg.productions[interface][core].graph.copy()
                transformed_graph = self.vectorizer.transform_single(gr)
                score = self.estimatorobject.cal_estimator.predict_proba(
                    transformed_graph)[0, 1]
                self.score_core_choice_dict[core] = score

    logger.debug(serialize_dict(self.__dict__))

    if self.random_state is not None:
        random.seed(self.random_state)

    # sampling
    if n_jobs in [0, 1]:
        for graph in graph_iter:
            sampled, moni = self._sample(graph)
            for new_graph in self.return_formatter(sampled, moni):
                yield new_graph
    else:
        if n_jobs > 1:
            pool = Pool(processes=n_jobs)
        else:
            pool = Pool()
        sampled_graphs = pool.imap_unordered(_sample_multi, self._argbuilder(graph_iter))
        for batch in sampled_graphs:
            for graph, moni in batch:
                for new_graph in self.return_formatter(graph, moni):
                    yield new_graph
        pool.close()
        pool.join()

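# The improving_* bookkeeping above sets up a penalty schedule for the accept
# step: before improving_linear_start any score ratio is used as-is, between
# improving_linear_start and improving_threshold a linearly growing penalty
# is added, and past improving_threshold only improvements are accepted. The
# accept rule itself lives elsewhere in the sampler; the function below is a
# sketch of that schedule under these assumptions, not graphlearn's exact
# formula.
import random

def accept(score_old, score_new, step, n_steps,
           improving_linear_start=0.2, improving_threshold=0.6,
           accept_static_penalty=0.0):
    if score_new >= score_old:
        return True  # improvements are always accepted
    start = int(improving_linear_start * n_steps)
    stop = int(improving_threshold * n_steps)
    if step >= stop:
        return False  # late phase: only accept improvements
    penalty = accept_static_penalty
    if step > start:  # linearly growing penalty between start and stop
        penalty += (1 - accept_static_penalty) * (step - start) / float(stop - start)
    return random.random() < score_new / score_old - penalty
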
def __repr__(self):
    return serialize_dict(self.__dict__, offset='large')

def __str__(self):
    """String."""
    return "%s:\n%s" % (self.__class__, serialize_dict(self.__dict__))

def main(args):
    """Main."""
    # read variables
    # if no -i is given then read from stdin
    seq = args['-i']
    seq = (sys.stdin.readline().strip() if args['-i'] == 'stdin' else seq)
    k = int(args['-k'])
    complexity = int(args['--complexity'][0])
    nbits = int(args['--nbits'][0])
    window_size = int(args['--window_size'][0])
    window_size = min(len(seq), window_size)
    max_bp_span = int(args['--max_bp_span'][0])
    max_bp_span = min(len(seq), max_bp_span)
    avg_bp_prob_cutoff = float(args['--avg_bp_prob_cutoff'][0])
    hard_threshold = float(args['--hard_threshold'][0])
    max_num_edges = int(args['--max_num_edges'][0])
    no_lonely_bps = args['--no_lonely_bps']
    no_nesting = args['--no_nesting']
    draw = args['--draw']
    jpg = args['--jpg']
    svg = args['--svg']
    png = args['--png']
    pdf = args['--pdf']
    nesting = not no_nesting
    # setup logger
    verbosity = 2 if args['--verbose'] else 1
    configure_logging(logger, verbosity=verbosity, filename='log')
    logger.debug(serialize_dict(args))
    # setup folding algorithm
    rase = StructuralStabilityEstimator(seq,
                                        alphabet='ACGU',
                                        k=k,
                                        complexity=complexity,
                                        nbits=nbits,
                                        window_size=window_size,
                                        max_bp_span=max_bp_span,
                                        avg_bp_prob_cutoff=avg_bp_prob_cutoff,
                                        hard_threshold=hard_threshold,
                                        max_num_edges=max_num_edges,
                                        no_lonely_bps=no_lonely_bps,
                                        nesting=nesting)
    # print: nt pos, original nt, most de-stabilizing nt, dotbracket, score
    for line in rase.transform(seq):
        print(line)
    # if drawing is required use the folding algorithm to compute the graph
    if draw:
        suffix = 'pdf'
        if jpg:
            suffix = 'jpg'
        if svg:
            suffix = 'svg'
        if png:
            suffix = 'png'
        if pdf:
            suffix = 'pdf'
        structure_fname = 'structure.' + suffix
        score_fname = 'score.' + suffix
        all_plots_fname = 'structures.' + suffix
        rase.draw(file_name=structure_fname)
        rase.plot(file_name=score_fname)
        rase.draw_all(file_name=all_plots_fname)

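# A hypothetical direct invocation of main() for reference. The flag names
# match exactly what main() reads above (a docopt-style dict, where repeated
# options arrive as single-element lists), but the sequence and values are
# made-up examples.
main({'-i': 'GCGCUUCGCCGCGCAAGCGC', '-k': '5',
      '--complexity': ['3'], '--nbits': ['20'],
      '--window_size': ['150'], '--max_bp_span': ['100'],
      '--avg_bp_prob_cutoff': ['0.1'], '--hard_threshold': ['0.5'],
      '--max_num_edges': ['2'],
      '--no_lonely_bps': True, '--no_nesting': False,
      '--draw': False, '--jpg': False, '--svg': False,
      '--png': False, '--pdf': False, '--verbose': False})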