def main_script(model_initializer=None, description=None, epilog=None, prog_name=None, logger=None):
    """Parse CLI arguments, configure logging, run main() and log timing/failures.

    Parameters
    ----------
    model_initializer : callable, optional
        Forwarded to argparse_setup() and main() -- presumably builds the
        model from parsed options; TODO confirm against argparse_setup.
    description, epilog : str, optional
        Help texts forwarded to the argparse parser.
    prog_name : str, optional
        Program name used in the log header and as the log file basename.
    logger : logging.Logger, optional
        Logger handed to configure_logging and used for all messages.
    """
    parser = argparse_setup(model_initializer, description, epilog)
    args = parser.parse_args()
    if args.no_logging:
        configure_logging(logger, verbosity=args.verbosity)
    else:
        # Guard against prog_name=None (the default), which previously raised
        # TypeError on `prog_name + '.log'` whenever file logging was enabled.
        configure_logging(logger, verbosity=args.verbosity,
                          filename='%s.log' % (prog_name or 'program'))
    logger.debug('-' * 80)
    logger.debug('Program: %s' % prog_name)
    logger.debug('Called with parameters:\n %s' % serialize_dict(args.__dict__))
    start_time = time()
    try:
        main(model_initializer, args)
    except Exception:
        import datetime
        curr_time = datetime.datetime.now().strftime("%A, %d. %B %Y %I:%M%p")
        # logger.exception records the full traceback alongside the message.
        logger.exception("Program run failed on %s" % curr_time)
    finally:
        # Elapsed time is logged even when main() raised.
        end_time = time()
        logger.info('Elapsed time: %.1f sec', end_time - start_time)
def main_script(prog_name=None, logger=None):
    """Parse CLI arguments, configure logging, run main() and log start/end times.

    Parameters
    ----------
    prog_name : str, optional
        Program name recorded in the log header.
    logger : logging.Logger, optional
        Logger configured here and used for all messages.
    """
    parser = argparse_setup()
    args = parser.parse_args()
    if args.no_logging:
        configure_logging(logger, verbosity=args.verbosity)
    else:
        # Log file name encodes the generation parameters for traceability.
        log_fname = (args.logging_dir +
                     'logs_gc%.2f_len%d_num%d' % (args.gc_content, args.length, args.num) +
                     '.log')
        configure_logging(logger, verbosity=args.verbosity, filename=log_fname)
    logger.debug('-' * 80)
    logger.debug('Program: %s' % prog_name)
    logger.debug('\n')
    logger.debug('Called with parameters:\n\n %s \n\n' % serialize_dict(args.__dict__))
    start_time = time.asctime(time.localtime(time.time()))
    logger.info('Initializing program execution %s \n\n' % (start_time))
    try:
        main(args, logger)
    except Exception:
        import datetime
        import sys
        curr_time = datetime.datetime.now().strftime("%A, %d. %B %Y %I:%M%p")
        logger.exception("Program run failed on %s" % curr_time)
        # sys.exit raises SystemExit so the finally block still runs; the
        # bare exit() builtin depends on the site module and is unreliable
        # in non-interactive scripts.
        sys.exit(1)
    finally:
        end_time = time.asctime(time.localtime(time.time()))
        # Fixed misleading end-of-run message (was "Executing program
        # execution", the counterpart of "Initializing program execution").
        logger.info('Ending program execution %s' % (end_time))
        logger.info('-' * 80)
def load_PUBCHEM_data(assay_id, max_size=20):
    """Download positive/negative graphs for a PUBCHEM assay and pre-process them.

    Parameters
    ----------
    assay_id : str
        PUBCHEM assay identifier passed to the download helpers.
    max_size : int, optional
        Maximum graph size kept by pre_process (default 20).

    Returns
    -------
    tuple(list, list)
        (pos_graphs, neg_graphs) after filtering/outlier removal.
    """
    # Temporarily raise verbosity while the dataset is being loaded.
    configure_logging(logger, verbosity=2)
    logger.debug('_' * 80)
    logger.debug('Dataset %s info:' % assay_id)
    desc = get_assay_description(assay_id)
    # Use the module logger consistently: the original mixed logger.debug with
    # logging.debug (root logger), so some messages could bypass the
    # configured handlers.
    logger.debug('\n%s' % desc)
    # extract pos and neg graphs
    all_pos_graphs, all_neg_graphs = get_pos_graphs(assay_id), get_neg_graphs(assay_id)
    # remove too large and too small graphs and outliers
    initial_max_size = max(2000, max_size)
    args = dict(initial_max_size=initial_max_size,
                fraction_to_remove=.1,
                n_neighbors_for_outliers=9,
                remove_similar=False,
                max_size=max_size)
    logger.debug('\nPositive graphs')
    pos_graphs = pre_process(all_pos_graphs, **args)
    logger.debug('\nNegative graphs')
    neg_graphs = pre_process(all_neg_graphs, **args)
    logger.debug('-' * 80)
    # Restore default verbosity before returning.
    configure_logging(logger, verbosity=1)
    return pos_graphs, neg_graphs
def do():
    """Evaluate the predictive performance of samples on the RF00005 dataset,
    save the results, and plot them.

    Side effects: writes a log file and a results .data file, produces a plot,
    and prints progress/timing to stdout.
    """
    # special case: bursi
    pos_dataset_fname = 'RF00005.fa'
    neg_dataset_fname = None  # None will permute the first dataset
    dataset = '%s_vs_%s' % (pos_dataset_fname, neg_dataset_fname)
    # logging (the original `if True:` toggle with a dead else-branch removed)
    logger_fname = '%s_predictive_performance_of_samples.log' % dataset
    configure_logging(logger, verbosity=1, filename=logger_fname)
    # main
    start = time()
    print('Working with dataset: %s' % dataset)
    logger.info('Working with dataset: %s' % dataset)
    # NOTE: a first assignment ([.08, .2, .4, .6, .8, .95]) was dead code,
    # immediately overwritten by the line below; removed.
    percentages = [.07, 0.1, 0.15, 0.2]
    # set size to 900 in production
    original_repetitions, original_sample_repetitions, sample_repetitions = evaluate(
        pos_dataset_fname,
        neg_dataset_fname,
        size=100,
        percentages=percentages,
        n_repetitions=3,
        train_test_split=0.7)
    # save and display results
    result_fname = '%s_predictive_performance_of_samples.data' % dataset
    save_results(result_fname, percentages, original_repetitions,
                 original_sample_repetitions, sample_repetitions)
    # NOTE(review): results are saved to result_fname but re-loaded from
    # 'asd.data' -- looks like leftover debugging; confirm the intended file.
    percentages_l, original_repetitions_l, original_sample_repetitions_l, sample_repetitions_l = load_results(
        'asd.data')
    plot(dataset, percentages_l, original_sample_repetitions_l,
         original_repetitions_l, sample_repetitions_l)
    print('Time elapsed: %s' % (datetime.timedelta(seconds=(time() - start))))
def test_pareto():
    """Smoke-test pareto.LocalLandmarksDistanceOptimizer on random graphs.

    Generates a small synthetic graph set, fits an InstanceMaker, and runs the
    optimizer; the result is discarded -- the test only checks for crashes.
    """
    configure_logging(logging.getLogger(), verbosity=2)
    graphs = rg.make_graphs_static(
        100,  # how many to generate
        5,    # graph size
        4,    # node-labelcount
        2,    # edgelabelcount
        labeldistribution='uniform',
        allow_cycles=False)
    im = InstanceMaker(n_landmarks=5, n_neighbors=50).fit(graphs, ntargets=2)
    optimizer = pareto.LocalLandmarksDistanceOptimizer(
        n_iter=7, context_size=1, multiproc=True)
    landmark_graphs, desired_distances, ranked_graphs, target_graph = im.get()
    # Return value intentionally discarded (was bound to an unused NONE var).
    optimizer.optimize(
        landmark_graphs,
        desired_distances,
        ranked_graphs,
        # start_graph_list=[landmark_graphs[0]])
        start_graph_list=landmark_graphs)
    return None
def do():
    """Evaluate the predictive performance of samples on the RF00005 dataset,
    save the results, and plot them.

    NOTE(review): a near-identical copy of this function appears elsewhere in
    the file; consider deduplicating.
    """
    # special case: bursi
    pos_dataset_fname = 'RF00005.fa'
    neg_dataset_fname = None  # None will permute the first dataset
    dataset = '%s_vs_%s' % (pos_dataset_fname, neg_dataset_fname)
    # logging (the original `if True:` toggle with a dead else-branch removed)
    logger_fname = '%s_predictive_performance_of_samples.log' % dataset
    configure_logging(logger, verbosity=1, filename=logger_fname)
    # main
    start = time()
    print('Working with dataset: %s' % dataset)
    logger.info('Working with dataset: %s' % dataset)
    # NOTE: a first assignment ([.08, .2, .4, .6, .8, .95]) was dead code,
    # immediately overwritten by the line below; removed.
    percentages = [.07, 0.1, 0.15, 0.2]
    # set size to 900 in production
    original_repetitions, original_sample_repetitions, sample_repetitions = evaluate(
        pos_dataset_fname,
        neg_dataset_fname,
        size=100,
        percentages=percentages,
        n_repetitions=3,
        train_test_split=0.7)
    # save and display results
    result_fname = '%s_predictive_performance_of_samples.data' % dataset
    save_results(result_fname, percentages, original_repetitions,
                 original_sample_repetitions, sample_repetitions)
    # NOTE(review): results are saved to result_fname but re-loaded from
    # 'asd.data' -- looks like leftover debugging; confirm the intended file.
    percentages_l, original_repetitions_l, original_sample_repetitions_l, sample_repetitions_l = load_results(
        'asd.data')
    plot(dataset, percentages_l, original_sample_repetitions_l,
         original_repetitions_l, sample_repetitions_l)
    print('Time elapsed: %s' % (datetime.timedelta(seconds=(time() - start))))
# Python 2 CLI driver fragment: parse args, configure logging, build the
# graph iterator, and create a Sampler.  `parser` is defined earlier in the
# file (outside this chunk).
args=vars(parser.parse_args())

# Bail out early if the mandatory input file is missing.
import os.path
if not os.path.isfile(args['start_graphs']):
    parser.print_usage()
    print 'at least provide a path to input'
    exit()

# Echo the raw arguments for debugging.
print "*raw args"
print "*" * 80
print args

# verbosity
from eden.util import configure_logging
import logging
configure_logging(logging.getLogger(),verbosity=args['verbose'])
args.pop('verbose')

# graphs: lazily read up to num_graphs gspan graphs from the input file.
from eden.io.gspan import gspan_to_eden
from itertools import islice
args['graph_iter'] = islice(gspan_to_eden(args.pop('start_graphs')),args.pop('num_graphs'))

#output destinations popped out of args before they reach the sampler
OUTFILE=args.pop('out')
MODEL=args.pop('model')

# CREATE SAMPLER
from graphlearn01.graphlearn import Sampler
s=Sampler()
# Python 2 CLI driver fragment (formatted twin of the chunk above): parse
# args, configure logging, build the graph iterator, and create a Sampler.
# `parser` is defined earlier in the file (outside this chunk).
args = vars(parser.parse_args())

# Bail out early if the mandatory input file is missing.
import os.path
if not os.path.isfile(args['start_graphs']):
    parser.print_usage()
    print 'at least provide a path to input'
    exit()

# Echo the raw arguments for debugging.
print "*raw args"
print "*" * 80
print args

# verbosity
from eden.util import configure_logging
import logging
configure_logging(logging.getLogger(), verbosity=args['verbose'])
args.pop('verbose')

# graphs: lazily read up to num_graphs gspan graphs from the input file.
from eden.io.gspan import gspan_to_eden
from itertools import islice
args['graph_iter'] = islice(gspan_to_eden(args.pop('start_graphs')), args.pop('num_graphs'))

#output destinations popped out of args before they reach the sampler
OUTFILE = args.pop('out')
MODEL = args.pop('model')

# CREATE SAMPLER
from graphlearn01.graphlearn import Sampler
s = Sampler()
import os.path if not os.path.isfile(args['input']): parser.print_usage() print 'at least provide a path to input' exit() print "*raw args" print "*"*80 print args # verbosity from eden.util import configure_logging import logging configure_logging(logging.getLogger(),verbosity=args.pop('verbose')) # handle Vectorizer: from eden.graph import Vectorizer args['vectorizer'] = Vectorizer(args.pop('vectorizer_complexity')) # estimator, if the user is providing a negative graph set, we use # the twoclass esti OO import graphlearn01.estimate as estimate if args['negative_input']==None: args['estimator']=estimate.OneClassEstimator(nu=.5, cv=2, n_jobs=-1) else: args['estimator']=estimate.TwoClassEstimator( cv=2, n_jobs=-1)
from ego.decomposition.paired_neighborhoods import decompose_paired_neighborhoods, decompose_neighborhood
from ego.vectorize import hash_graph
from ego.vectorize import set_feature_size, vectorize
from ego.encode import make_encoder
from utils_oracle_with_target import oracle_setup as oracle_setup_target
from utils_oracle_from_dataset import oracle_setup as oracle_setup_dataset
from eden_chem.io.rdkitutils import nx_to_inchi
from eden_chem.io.rdkitutils import nx_to_smi
from datetime import datetime

# Module-level logger configured at default verbosity.
logger = logging.getLogger()
configure_logging(logger, verbosity=1)

# Curried download helpers: same downloader with the 'active' flag pre-bound.
download_active = curry(download)(active=True)
download_inactive = curry(download)(active=False)


def get_pos_graphs(assay_id):
    # Download the assay's active compounds and convert the SDF records
    # into a list of networkx graphs.
    return pipe(assay_id, download_active, sdf_to_nx, list)


def get_neg_graphs(assay_id):
    # Same pipeline as get_pos_graphs, but for inactive compounds.
    return pipe(assay_id, download_inactive, sdf_to_nx, list)


colormap = 'tab20c'

#assay_ids = ['624466','492992','463230','651741','743219','588350','492952','624249','463213','2631','651610']


# (function body continues beyond this chunk)
def rank_and_persist_molecules(graphs, scores, name='', plot=True):
# (continuation of a parser.add_argument call opened above this chunk)
    default="", help="Prefix to use for output filenames")
parser.add_argument(
    "--chromosome_limits",
    help="Path to file containing chromosome limites as required by bedtools. Use this parameter disables automatic lookup via the genome id.")
parser.add_argument(
    "--negative_site_candidate_regions_fn",
    help="Path to regions considered for placement of negatives in bed format")
parser.add_argument(
    "-v",
    "--verbosity",
    action="count",
    help="Increase output verbosity")
args = parser.parse_args()

# Configure the root logger with the user-requested verbosity.
logger = logging.getLogger()
configure_logging(logger, verbosity=args.verbosity)

# fixed global variables
npeek = 2  # NOTE(review): purpose not visible in this chunk -- confirm at use site

# check chromsizes retreival
if (args.chromosome_limits is None):
    # check if genome_id can be found,
    chromsizes = get_chromsizes_from_ucsc(args.genome_id)
    logging.debug("Number of chromosomes: {}.".format(len(chromsizes)))
    # otherwise request manual definition of chromosome limits
    if (len(chromsizes) == 0):
        logging.error("Error: retrieving chromosome sizes from UCSC failed. Please specify manually using parameter --chromosome_limits")
        exit(1)

# output file arguments
value = int(round(value)) # parameter_setting[key] = value break value = np.random.normal(mu, 2 * sigma) n_iter += 1 success, value = check_validity(key, value, noise) parameter_setting[key] = value return parameter_setting # In[ ]: # %%time logger = logging.getLogger() configure_logging(logger, verbosity=1) filename = "Result_at_" + str(noise_level) + ".txt" best_config = {'min_score': 6, # atleast motif_length/2 'min_freq': 0.1, # can not be more than (1- noise level) 'min_cluster_size': 3, # atleast 3 'p_value': 0.1, # atleast 0.1 'similarity_th': 0.8, # 0.8 'regex_th': 0.3, # max 0.3 'freq_th': 0.05, # 0.05 'std_th': 0.2} # 0.2 results_dic = {}
from eden.util import configure_logging
import logging
configure_logging(logging.getLogger(),verbosity=1)

'''
GET RNA DATA
'''
from eden.converter.fasta import fasta_to_sequence
import itertools
from eden.util import random_bipartition_iter
import random
import numpy


def rfam_uri(family_id):
    # Remote RFAM alignment URL for the family.
    # NOTE(review): this definition is shadowed by the redefinition below.
    return 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0'%(family_id,family_id)


def rfam_uri(family_id):
    # Redefinition deliberately overrides the URL version above, switching
    # the data source to a local '<family>.fa' file.
    return '%s.fa'%(family_id)


RFAM="RF01725"
#cutoff 162 (44.0)
#cutoff 1725 (38.0)
#cutoff rest (29)
sizes=[50,100,200,400]
repeats=3
# CLI setup fragment (formatted twin of an earlier chunk): remaining
# arguments, logging configuration, and chromosome-size retrieval.
# `parser` is defined earlier in the file (outside this chunk).
parser.add_argument(
    "--chromosome_limits",
    help=
    "Path to file containing chromosome limites as required by bedtools. Use this parameter disables automatic lookup via the genome id."
)
parser.add_argument(
    "--negative_site_candidate_regions_fn",
    help="Path to regions considered for placement of negatives in bed format")
parser.add_argument("-v",
                    "--verbosity",
                    action="count",
                    help="Increase output verbosity")
args = parser.parse_args()

# Configure the root logger with the user-requested verbosity.
logger = logging.getLogger()
configure_logging(logger, verbosity=args.verbosity)

# fixed global variables
npeek = 2  # NOTE(review): purpose not visible in this chunk -- confirm at use site

# check chromsizes retreival
if (args.chromosome_limits is None):
    # check if genome_id can be found,
    chromsizes = get_chromsizes_from_ucsc(args.genome_id)
    logging.debug("Number of chromosomes: {}.".format(len(chromsizes)))
    # otherwise request manual definition of chromosome limits
    if (len(chromsizes) == 0):
        logging.error(
            "Error: retrieving chromosome sizes from UCSC failed. Please specify manually using parameter --chromosome_limits"
        )
        exit(1)
from eden.util import configure_logging
import logging
configure_logging(logging.getLogger(), verbosity=1)

'''
GET RNA DATA
'''
from eden.converter.fasta import fasta_to_sequence
import itertools
from eden.util import random_bipartition_iter
import random
import numpy


def rfam_uri(family_id):
    # Remote RFAM alignment URL for the family.
    # NOTE(review): this definition is shadowed by the redefinition below.
    return 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0' % (
        family_id, family_id)


def rfam_uri(family_id):
    # Redefinition deliberately overrides the URL version above, switching
    # the data source to a local '<family>.fa' file.
    return '%s.fa' % (family_id)


RFAM = "RF01725"
#cutoff 162 (44.0)
#cutoff 1725 (38.0)
#cutoff rest (29)
sizes = [50, 100, 200, 400]
repeats = 3


# (function body continues beyond this chunk)
def get_sequences(size=9999, rand=False):
def main(args):
    """Run structural stability estimation for one RNA sequence.

    Parameters
    ----------
    args : dict
        docopt-style option mapping ('-i', '-k', '--complexity', ...);
        list-valued options carry their value at index 0.

    Side effects: prints one line per position to stdout, writes a 'log'
    file, and optionally renders structure/score plots.
    """
    # read variables
    # if no -i is given then read from stdin
    seq = args['-i']
    seq = (sys.stdin.readline().strip() if args['-i'] == 'stdin' else seq)
    k = int(args['-k'])
    complexity = int(args['--complexity'][0])
    nbits = int(args['--nbits'][0])
    # window and base-pair span cannot exceed the sequence length
    window_size = min(len(seq), int(args['--window_size'][0]))
    max_bp_span = min(len(seq), int(args['--max_bp_span'][0]))
    avg_bp_prob_cutoff = float(args['--avg_bp_prob_cutoff'][0])
    hard_threshold = float(args['--hard_threshold'][0])
    max_num_edges = int(args['--max_num_edges'][0])
    no_lonely_bps = args['--no_lonely_bps']
    no_nesting = args['--no_nesting']
    draw = args['--draw']
    jpg = args['--jpg']
    svg = args['--svg']
    png = args['--png']
    pdf = args['--pdf']
    # Nesting is disabled only when the flag is literally True -- exact
    # equivalent of the original `if no_nesting is True:` inversion.
    nesting = no_nesting is not True
    # setup logger
    verbosity = 2 if args['--verbose'] else 1
    configure_logging(logger, verbosity=verbosity, filename='log')
    logger.debug(serialize_dict(args))
    # setup folding algorithm
    rase = StructuralStabilityEstimator(seq,
                                        alphabet='ACGU',
                                        k=k,
                                        complexity=complexity,
                                        nbits=nbits,
                                        window_size=window_size,
                                        max_bp_span=max_bp_span,
                                        avg_bp_prob_cutoff=avg_bp_prob_cutoff,
                                        hard_threshold=hard_threshold,
                                        max_num_edges=max_num_edges,
                                        no_lonely_bps=no_lonely_bps,
                                        nesting=nesting)
    # print: nt pos, original nt, most de-stabilizing nt, dotbracket, score
    for line in rase.transform(seq):
        print(line)
    # if drawing is required use the folding algorithm to compute the graph
    if draw:
        # Last matching flag wins: pdf overrides png overrides svg overrides
        # jpg; 'pdf' is also the default when no format flag is given.
        suffix = 'pdf'
        if jpg:
            suffix = 'jpg'
        if svg:
            suffix = 'svg'
        if png:
            suffix = 'png'
        if pdf:
            suffix = 'pdf'
        structure_fname = 'structure.' + suffix
        score_fname = 'score.' + suffix
        all_plots_fname = 'structures.' + suffix
        rase.draw(file_name=structure_fname)
        rase.plot(file_name=score_fname)
        rase.draw_all(file_name=all_plots_fname)