def analyse_all(filename_pattern): import glob from analyse import analyse datafiles = glob.glob(filename_pattern) for filename in datafiles: print filename analyse(filename)
def process(clean_word):
    """Advance *clean_word* one step through the processing pipeline.

    Looks up the word record, skips it if already completed, otherwise
    dispatches to the stage implied by its latest state and re-queues
    itself via the deferred task queue until the word is completed.
    """
    # `new` flag is unused here; get_word presumably creates-or-fetches.
    w, new = word.get_word(clean_word)
    if w.completed():
        logging.info('Processing COMPLETED for %s' % clean_word)
        return
    state = status.get_latest_state(clean_word)
    logging.info('PROCESSING %s [%i]' % (w.word(), state))
    # Dispatch word to appropriate processing stage.
    # States <10 are analysis, 10-19 generation; anything else is terminal.
    if state < 10:
        analyse.analyse(w, state)
    elif state < 20:
        generate.generate(w, state)
    else:
        # Completion state!
        # NOTE(review): 808 is assigned but never persisted/read below —
        # presumably a sentinel; confirm against status module.
        state = 808
    # Persist latest word payload if needed.
    w.persist_payload()
    # Queue up next processing step if required.
    if not w.completed():
        deferred.defer(process, clean_word)
def archive(args): realoutput = '' output = '' inputs = [] uncompiled = [] skipping = True; for arg in args[1:]: if len(realoutput) == 0 and (arg[0] == '-' or len(arg) == 1): continue; elif (len(realoutput) == 0): realoutput = os.path.abspath(arg) output = callconfig.cachefile(realoutput) else: inputs.append(arg) if not len(inputs): return makecachedir(output) print "callcatcher - detecting archiving:" print "\tautojoining", \ realoutput, "from\n\t", inputs combine.combine(output, inputs) print "callcatcher - dump currently unused:" print "\tUse \"callanalyse\" to manually analyse a set of compiler output files" print "\tautoanalysing", realoutput print "\tCurrently unused functions are..." analyse.analyse(output, "\t\t")
def archive(args): realoutput = '' output = '' inputs = [] uncompiled = [] skipping = True for arg in args[1:]: if len(realoutput) == 0 and (arg[0] == '-' or len(arg) == 1): continue elif (len(realoutput) == 0): realoutput = os.path.abspath(arg) output = callconfig.cachefile(realoutput) else: inputs.append(arg) if not len(inputs): return makecachedir(output) print "callcatcher - detecting archiving:" print "\tautojoining", \ realoutput, "from\n\t", inputs combine.combine(output, inputs) print "callcatcher - dump currently unused:" print "\tUse \"callanalyse\" to manually analyse a set of compiler output files" print "\tautoanalysing", realoutput print "\tCurrently unused functions are..." analyse.analyse(output, "\t\t")
def graph():
    """Plot the age-category counts of the crowd when it is heterogeneous."""
    fig = Figure()
    global ListPart
    enfants = 0
    adultes = 0
    anciens = 0
    if str(interface.getvar(name="crowd_type")) == "Hétérogène":
        for participant in ListPart:
            if participant.name == "Enfant":
                enfants += 1
            if participant.name == "Adulte":
                adultes += 1
            if participant.name == "Ancien":
                anciens += 1
        print(enfants)
        labels = ("Enfant", "Adulte", "Ancien")
        counts = (enfants, adultes, anciens)
        fig.add_subplot(111).plot(labels, counts)
        canvas = FigureCanvasTkAgg(fig, master=Simulation)  # A tk.DrawingArea.
        canvas.draw()
        canvas.get_tk_widget().grid(row=1, column=2)
    else:
        pass  # homogeneous population: nothing to plot
    analyse()
def process(line):
    """POS-tag *line*, convert the output to CoNLL, then analyse its dependencies."""
    tagged = run_irishfst(line)
    dependencies = parse_dependencies(irishfst_output_to_conll(tagged))
    analyse.analyse(dependencies)
def main():
    """Scrape the Goodreads list, build/save the dataframe, then analyse it."""
    links = le.get_all_1000_links(
        'https://www.goodreads.com/list/show/6.Best_Books_of_the_20th_Century?page='
    )
    df_all = s_s.create_df_and_save_as_csv(links)
    df_cleaned = preprocessing(df_all)
    analyse(df_cleaned, df_all)
def main():
    """
    Main function of the script
    """
    args = getArgs()
    # User specifies motif size range instead of giving a repeats file
    if args.repeats is None:
        min_motif_size = args.min_motif_size
        max_motif_size = args.max_motif_size
        args.repeats = generate_repeats(min_motif_size, max_motif_size)
    # User specifies minimum length
    if args.min_length:
        getSSRNative(args)
    # User specific minimum number of units
    elif args.min_units:
        # unit_cutoff maps motif size -> minimum number of repeat units;
        # key 0 acts as the catch-all when a single integer was supplied.
        unit_cutoff = dict()
        try:
            # First interpretation: a single integer applied to all sizes.
            args.min_units = int(args.min_units)
            unit_cutoff[0] = args.min_units
        except ValueError:
            # Second interpretation: a file of "<motif-size> <min-units>" rows.
            try:
                with open(args.min_units, 'r') as unitsIn:
                    for line in unitsIn:
                        L = line.strip().split()
                        try:
                            L[0] = int(L[0])
                            L[1] = int(L[1])
                            # A unit cutoff of 1 would report every position.
                            if L[1] == 1:
                                print(
                                    'Warning: Repeat unit of 1 used for size %d.'
                                    % (L[0]),
                                    file=sys.stderr)
                            unit_cutoff[L[0]] = L[1]
                        except ValueError:
                            sys.exit(
                                'Invalid file format given for minimum units. Refer to help for more details'
                            )
            except FileNotFoundError:
                sys.exit(
                    'Units file specified is not found. Please provide a valid file'
                )
        getSSR_units(args, unit_cutoff)
    # Default settings
    elif args.min_length is None and args.min_units is None:
        args.min_length = 12
        getSSRNative(args)
    # Specifies to generate a HTML report
    if args.analyse:
        analyse(args)
def link(args): realoutput = abslinkoutput(args) output = callconfig.cachefile(realoutput) inputs = [] fakeargs = [ args[0], ] uncompiled = [] skip = False for arg in args[1:]: if skip: skip = False continue if arg[0] == '-' and len(arg) > 1 and arg[1] != 'o': if arg[1] == 'l': print 'linking against lib' + arg[2:] + '[.so|.a]' fakeargs.append(arg) elif arg == '-o': skip = True else: name, suffix = os.path.splitext(arg) if suffix == '.c' or suffix == '.cc' \ or suffix == '.cp' or suffix == '.cxx' \ or suffix == '.cpp' or suffix == '.CPP' \ or suffix == '.c++' or suffix == '.C' \ or suffix == '.s': inputs.append(name + '.o') uncompiled.append(arg) else: inputs.append(arg) if len(uncompiled): print 'callcatcher - linkline contains source files, forcing',\ 'compile of:' print '\t', uncompiled fakeargs.append('-c') for uncompile in uncompiled: compileline = fakeargs compileline.append(uncompile) compile(compileline) if not len(inputs): return makecachedir(output) print "callcatcher - detecting link:" print "\tautojoining", \ realoutput, "from\n\t", inputs combine.combine(output, inputs) print "callcatcher - dump currently unused:" print "\tUse \"callanalyse\" to manually analyse a set of compiler output files" print "\tautoanalysing", realoutput print "\tCurrently unused functions are..." analyse.analyse(output, "\t\t")
def monitor():
    """Take one random reading; report it and trigger analyse() on a bad value."""
    monitoring = True
    while monitoring:
        reading = randint(1, 100)
        if reading >= 3:
            print("Good value: {}".format(reading))
        else:
            print("Bad value: {}".format(reading))
            analyse()
        monitoring = False  # single pass only
def do_the_business():
    """Run the enabled pipeline stages (analyse, crossreference, convert,
    disambiguation, index) in order, timing each and logging progress.

    All configuration (file_list, dbfile_name, do_* flags, directories,
    use_multiprocessing) comes from module-level names — presumably set by
    a config module or CLI parsing elsewhere in the file; confirm there.
    """
    # open logfile
    with open(logfile_name,'w') as logfile:
        # some details
        broadcast(logfile,"File list contains %d files"%len(file_list))
        # delete the database
        if do_delete_db:
            os.remove(dbfile_name)
        # analysis stage
        if do_analyse:
            start = datetime.now()
            analyse.analyse(file_list=file_list,dbfile_name=dbfile_name,logfile=logfile,use_multiprocessing=use_multiprocessing,rel_judgment_dir=rel_judgment_dir)
            elapsed = datetime.now() - start
            broadcast(logfile,"Analyse phase took %s"%elapsed)
        # crossreference stage
        if do_crossreference:
            start = datetime.now()
            crossreference.crossreference(file_list=file_list,dbfile_name=dbfile_name,logfile=logfile,use_multiprocessing=use_multiprocessing)
            elapsed = datetime.now() - start
            broadcast(logfile,"Crossreference phase took %s"%elapsed)
        # convert stage
        if do_convert:
            # conversion_start (wall-clock) is used by delete_html to remove
            # pages older than this conversion run.
            conversion_start = time.time()
            start = datetime.now()
            convert.convert(file_list=file_list,dbfile_name=dbfile_name,logfile=logfile,public_html_dir=public_html_dir,use_multiprocessing=use_multiprocessing,do_legislation=do_legislation)
            elapsed = datetime.now() - start
            broadcast(logfile,"Convert phase took %s"%elapsed)
            if do_delete_html:
                delete_html.delete_html(conversion_start,output_dir)
        # disambiguation stage
        if do_disambiguation:
            # NOTE(review): disambiguation_start is assigned but never used.
            disambiguation_start = time.time()
            start = datetime.now()
            disambiguation.disambiguation(file_list=file_list,dbfile_name=dbfile_name,logfile=logfile,output_dir=output_dir,use_multiprocessing=use_multiprocessing)
            elapsed = datetime.now() - start
            broadcast(logfile,"Disambiguation phase took %s"%elapsed)
        # index stage
        if do_index:
            start = datetime.now()
            indexes.make_indexes(dbfile_name=dbfile_name,logfile=logfile,output_dir=output_dir,use_multiprocessing=use_multiprocessing)
            elapsed = datetime.now() - start
            broadcast(logfile,"Index phase took %s"%elapsed)
def run(dataset_path, results_path, parser_names, limit):
    """Run the parser-comparison experiment.

    dataset_path  -- path to a JSON-lines file (one record per line).
    results_path  -- where output_results writes the results.
    parser_names  -- keys into the module-level PARSERS registry.
    limit         -- passed to Experiment; presumably caps dataset size
                     (confirm against the Experiment class).
    """
    parsers = [PARSERS[name] for name in parser_names]
    logger.info("Starting experiment run - reading dataset")
    with open(dataset_path) as dataset_file:
        # Skip blank lines; every other line must be valid JSON.
        dataset = [json.loads(row) for row in dataset_file if len(row.strip()) > 0]
    experiment = Experiment(limit)
    logger.info("Running experiment")
    results = experiment.run(dataset, parsers)
    output_results(results, results_path)
    logger.info("Experiment complete")
    # Python 2 print statement: dumps the analysis summary to stdout.
    print analyse(results)
def formatLineBreaks(self, page_info):
    """ Consolidate code for creating line breaks here.

    Uses the most frequent vertical gap between lines as the 'normal'
    line spacing, then inserts <br/> (one gap) or <br/><br/> (larger gap)
    between the stripped text lines accordingly.
    """
    # determine the 'normal' line break
    gaps = page_info['gaps_analysis']
    most_frequent_gap = (0, 0)  # (index into gaps, occurrence count)
    for i, gap in enumerate(gaps):  # loop through the dictionaries
        # BUG FIX: the original compared gap['n'] against the *tuple*
        # most_frequent_gap and stored the dict itself where an index was
        # needed, so gaps[most_frequent_gap[0]] could never see the real
        # winner.  Compare against the stored count and keep the position.
        if gap['n'] > most_frequent_gap[1]:
            most_frequent_gap = (i, gap['n'])  # taking this to be 'normal'
    lower = gaps[most_frequent_gap[0]]['min']  # lower boundary
    upper = gaps[most_frequent_gap[0]]['max']  # upper boundary
    previous_y = 0
    string = ""
    for line in page_info['stripped_text']:
        coords = analyse().getCoordinates(line)
        if not coords:
            string = string + line
        else:
            top_y = int(coords[1])
            if previous_y != 0:  # skip the first iteration
                whitespace = int(top_y - previous_y)
                if whitespace >= lower and whitespace <= upper:
                    string = string + "<br/>%s" % line
                elif whitespace > upper:
                    string = string + "<br/><br/>%s" % line
                else:
                    string = string + line
            # NOTE(review): the first line with coordinates is never appended
            # to the output — preserved from the original; confirm intended.
            previous_y = int(coords[3])  # our bottom y is now 'previous'
    return string
def hello_world():
    """Render the index page; when the form was submitted, classify the input
    and re-render with the predicted kind and tags filled in."""
    name = request.args.get('cd-name')
    text = request.args.get('cd-textarea')
    # Idiom fix: compare to None with `is`, not `==` (PEP 8).
    if name is None and text is None:
        # No form data yet — first visit.
        return render_template('index.html')
    else:
        kind, tags = analyse(name, text)
        return render_template('index.html', kind="类别:" + kind, tags="标签:" + tags)
def main(): if len(sys.argv)<3: print 'no data!. Usage:' print 'python run.py <.xsls document path> <sheet name> <no. of bins>' return fname = sys.argv[1] sname = sys.argv[2] try: nbins = int(sys.argv[3]) except: print('\nno binning specified, using Freedman-Diaconis automatic bin optimisation\n') nbins = None ana.analyse(fname, sname, nbins)
def test_all(self):
    """End-to-end check: highlights and stats sections of analyse()'s result."""
    results = analyse(['asdf', 'hijkl'])
    self.assertEqual(results['highlights']['longest'], {'hijkl': 5})
    self.assertEqual(results['stats']['longest_10'],
                     [{'hijkl': 5}, {'asdf': 4}])
def scrape_from_file(filename): """ The main method which will ask for a file to read from, read it, analyse it and store it. (Using other methods)""" print "Filename: " + filename # Global variables global SOLR_SERVER global CONFIG # What file do you want to read from? # file_path = getFile() file = open(filename, "r") # Set the read variables: current_user = "" tweet_content = [] in_tweet = False # Setup the Solr server variable solr_server = CONFIG.get_solr_server sh = solr_server if isinstance(solr_server, StorageHandler) else StorageHandler(solr_server) # Start reading the file for text in file.readline(): if not "TwitterHelp.get_all_statuses():" in text: # Look for a user and store the username in a variable: if text == "#####_NEW_USER_#####": current_user = file.readline() # Look for a new Tweet and read till new_user or new_tweet. elif text == "#####_NEW_TWEET_#####": in_tweet = True # Look for the end of a Tweet elif text == "#####_END_OF_TWEET_#####": in_tweet = False if tweet_content != []: # Analyse if there's any content (lovekeywords, hatekeywords) = addalyse(filter_analysis(analyse(tweet_content))) # Store into Solr # parameter 4 = 1, update everything on the next update # parameter 5 = 0, full update on next update sh.add_profile(current_user, lovekeywords, hatekeywords, 1, 0) # Debug print print "Username: "******" has the following content:\n" + tweet_content print "\n\n The following lovekeywords were found: \n" + lovekeywords print "\n\n The following hatekeywords were found: \n" + hatekeywords # Store the content of a Tweet. elif in_tweet: if text != "": tweet_content.append(text)
def doAnalyse(self, imgName):
    """Analyse *imgName* inside the selected rectangle and refresh the
    line/graph preview images produced in the tmp directory."""
    imgFullPath = str(self._ui.txtPath.text()) + "/" + imgName
    analyser = analyse.analyse()
    x0, y0, x1, y1 = self.coords[0], self.coords[1], self.coords[2], self.coords[3]
    analyser.analyseImage(imgFullPath, x0, y0, x1, y1)
    tmpPath = os.path.dirname(os.path.abspath(__file__)) + "/tmp/"
    self.setImageForControl(self._ui.lblSelect, tmpPath + "line.png")
    self.setImageForControl(self._ui.lblGraph, tmpPath + "graph.png")
def compareQuery():
    """Flask view: analyse up to two Twitter queries and render a comparison.

    Reads twitter_query1/twitter_query2 and countryDataset from the form;
    each non-empty term is analysed and stored in the module-level
    resultlist before the comparison template is rendered.
    """
    term1 = request.form.get('twitter_query1', None)
    term2 = request.form.get('twitter_query2', None)
    country = request.form.get('countryDataset', 'global')
    if len(term1) == 0:
        if len(term2) == 0:
            compare = None
            return compare_err("You must add a search query in at least one of the input fields")
    if len(term1) != 0:
        result1, err = analyse(term1, country)
        # NOTE(review): this flash dereferences result1.tweetsetInfo *before*
        # the error-string checks below — if analyse() returned one of the
        # error strings together with a non-None err, this would raise
        # AttributeError.  Confirm analyse()'s contract.
        if err != None:
            flash("Analysing fewer than 20 tweets will lead to less accurate results. Only "+str(result1.tweetsetInfo.tweet_count)+" tweets analysed for "+str(result1.tweetsetInfo.term))
        if result1 == "invalidSearchQuery":
            return compare_err(term1+" is not a valid Twitter hashtag or user handle, please try again")
        elif result1 == "noHashorAt":
            return compare_err("You must enter a #tag or @user in the first input field, please try again")
        elif result1 == "noTweetsFound":
            return compare_err("No tweets found for the query "+term1+", please try again")
        resultlist[0] = result1
    if len(term2) != 0:
        result2, err = analyse(term2, country)
        # NOTE(review): same ordering hazard as for result1 above.
        if err != None:
            flash("Analysing fewer than 20 tweets will lead to less accurate results. Only "+str(result2.tweetsetInfo.tweet_count)+" tweets analysed for "+str(result2.tweetsetInfo.term))
        if result2 == "invalidSearchQuery":
            return compare_err(term2+" is not a valid Twitter hashtag or user handle, please try again")
        elif result2 == "noHashorAt":
            return compare_err("You must enter a #tag or @user in input second input field, please try again")
        elif result2 == "noTweetsFound":
            return compare_err("No tweets found for this query "+term2+", please try again")
        resultlist[1] = result2
    # NOTE(review): if only one term was supplied, the other resultlist slot
    # keeps its previous (stale) value from an earlier request.
    compare = compare_results(resultlist[0], resultlist[1], country)
    return render_template('tabs/compare_tweets.html', resultlist=resultlist, compare=compare)
def snippet_to_results(snippet):
    """Write *snippet* to a temporary .js file, extract its variable names
    and return the analyse() statistics for them."""
    import os
    fd, path = tempfile.mkstemp('.js')
    results = None
    try:
        # need to do this to flush the file
        with open(path, 'w') as f:
            f.write(snippet)
        variables = get_var_names(fullpath=path)
        # finally get interest statistics
        results = analyse(variables)
    finally:
        # BUG FIX: mkstemp's file descriptor was leaked and the temp file
        # was left behind on disk.
        os.close(fd)
        os.remove(path)
    return results
def super_analyze(strings):
    """Analyse each string, align their category sequences with the
    multi-sequence NW algorithm, and factorise the aligned result."""
    custom_gap = '-'
    analyzed = []
    categories = []
    for text in strings:
        result = analyse.analyse(text)
        analyzed.append(result)
        categories.append(result.get_categories())
    aligned = multi_nw_algorithm(map(translate_categories, categories),
                                 gap=custom_gap, null=[], concatenate=prepend)
    return factorize(analyzed, aligned, categories, custom_gap)
def main(): id_ = args.jobid n = args.nEvents if args.nEvents else 100000 # TODO read total number from muon file directly OR # TODO always pass from steering process? with tempfile.NamedTemporaryFile() as t: outFile = t.name generate(args.input, args.geofile, n, outFile, args.lofi) chain = r.TChain('cbmsim') chain.Add(outFile) xs = analyse(chain, args.results) res = r.TFile.Open(args.results, 'update') res.WriteObject(xs, 'results') res.Close() print 'Slave: Worker process {} done.'.format(id_)
def netclamp_analyse(channel, GIDs, multiplier, make_plots=False):
    """Analyse network-clamp voltage traces for the given cell GIDs.

    Loads one soma trace per GID, runs analyse() on each to get frequency
    and inter-spike-interval statistics, prints per-trace and column-mean
    summaries, and optionally plots every trace with the spike threshold.
    """
    threshold = -30  # mV, spike-detection reference drawn on the plots
    directory = "C:/Users/spand/OneDrive - University of Toronto/Year 4/Skinner Lab/Spiking Data/network_clamp_results/"
    directory += (channel + '/' + multiplier)
    # Idiom: build the path list in one comprehension instead of append loop.
    file_list = [directory + '/' + multiplier + '_mytrace_' +
                 gid + '_soma.dat' for gid in GIDs]
    freq_data = np.zeros((len(file_list), 4))
    isi_data = np.zeros((len(file_list), 4))
    if make_plots:  # idiom fix: was `== True`
        fig, ax = plt.subplots(len(file_list), 1, sharex=True, sharey=True)
        fig.suptitle(channel + ': ' + multiplier)
        plt.ylabel('Voltage (mV)')
        plt.xlabel('Time (ms)')
    for i, trace_file in enumerate(file_list):
        x, y = np.loadtxt(trace_file, unpack=True, skiprows=1)
        if make_plots:
            ax[i].plot(x, y)
            ax[i].plot(x, x * 0 + threshold, 'r')  # threshold reference line
        F, T = analyse(x, y)
        freq_data[i] = F
        isi_data[i] = T
    # Idiom: column means via axis=0 instead of a manual fill loop.
    avg_freq_data = freq_data.mean(axis=0)
    avg_isi_data = isi_data.mean(axis=0)
    print('f1 \t f2 \t f_avg \t f_sd')
    print(freq_data)
    print(avg_freq_data)
    print('t1 \t t2 \t t_avg \t t_sd')
    print(isi_data)
    print(avg_isi_data)
    if make_plots:
        plt.show()
def parameters_grid_search(train_s, valid_s, method_name, kwargs):
    """
    methode name = string, name of the method (eg :"naiveBayes")
    kwargs = dictionnary of the parameters of the method: range to be tested

    Returns a dict mapping experiment index -> analyse() result, one entry
    per combination in the Cartesian product of the parameter ranges.
    """
    dTuning = {}
    # Iterate the Cartesian product of all value ranges.  BUG FIX: the
    # original mutated one shared kwargs_test dict across iterations — if
    # analyse() ever stores the dict, every entry would alias the last
    # combination.  Build a fresh dict each time (keys()/values() iterate
    # in a consistent order).
    for exp, items in enumerate(product(*kwargs.values())):
        kwargs_test = dict(zip(kwargs.keys(), items))
        dTuning[exp] = analyse.analyse(train_s, valid_s, method_name, kwargs_test)
    return dTuning
def analyseQuery():
    """Flask view: analyse a single Twitter query and render the results tab."""
    term = request.form.get('twitter_query', '')
    if len(term) == 0:
        return analyse_err("You must add a search query")
    country = request.form.get('countryDataset', 'global')
    resultitem, err = analyse(term, country)
    # NOTE(review): the sibling compareQuery view also handles an
    # "invalidSearchQuery" sentinel — confirm whether it can occur here.
    if resultitem == "noHashorAt":
        return analyse_err("You must enter a #tag or @user, please try again")
    elif resultitem == "noTweetsFound":
        return analyse_err("No tweets found for this query, please try again")
    # Idiom fix: `is not None` instead of `!= None` (PEP 8).
    if err is not None:
        flash("Analysing fewer than 20 tweets will lead to less accurate results. Only "+str(resultitem.tweetsetInfo.tweet_count)+" tweets analysed for "+str(resultitem.tweetsetInfo.term),'info')
    return render_template('tabs/analyse_tweets.html', result=resultitem)
def reflash_today():
    """Background refresh loop: poll the Bilibili room status, refresh the
    cached danmu for today when the Redis DB changed or a stream is live,
    and print the current viewer board.  Runs forever.
    """
    global today, today_danmu, status, lasttime, user_ip, lastsize
    while True:
        today = time.strftime("%Y-%m-%d", time.localtime())
        status = requests.get(
            'https://api.live.bilibili.com/room/v1/Room/room_init?id=801580'
        ).json()
        flash_lock.acquire()
        now_time = time.time()
        # Drop the viewer table if nobody触发了 an update in 5 minutes.
        if (now_time - lasttime > 300):
            user_ip = {}
            lasttime = now_time
        reflashflag = False
        nowsize = rd.dbsize()
        # Re-query today's danmu only when someone is watching AND either the
        # Redis DB grew or the room is currently live.
        if len(user_ip) > 0 and (nowsize != lastsize
                                 or status['data']['live_status'] == 1):
            today_danmu = search_danmu(today, True)
            reflashflag = True
            lastsize = nowsize
        flash_lock.release()
        #print(status['data'])
        # NOTE(review): top/hantalk are rebound locally here but are NOT in
        # the global statement above, so the module-level values set at
        # import time are never updated — confirm whether that is intended.
        top, hantalk = analyse()
        os.system("clear")
        board = "当前在线dd:\n"
        for iip in user_ip:
            board = board + ' ' + str(iip) + ' ' + str(
                user_ip[iip]['cname']) + ' ' + str(
                    user_ip[iip]['date']) + user_ip[iip]['mode'] + '\n'
        print(board)
        print('\n常规更新 数据库变动:', reflashflag, '开播模式:',
              status['data']['live_status'], '更新时间',
              time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        # Poll faster while live, slower when idle.
        if status['data']['live_status'] == 1:
            time.sleep(6)
        elif reflashflag == True:
            time.sleep(9)
        else:
            time.sleep(19)
def analyse(self): #exception handling try: fname = self.fname if not fname: raise IOError('File name is empty') except Exception as e: print e tkMessageBox.showwarning('File error', 'No file imported!') return try: sname = str(self.sheetbox.get(self.sheetbox.curselection())) except Exception as e: print e tkMessageBox.showwarning('Sheet selection', 'Please select a sheet') return #check binning options if self.bvar.get() == 1: nbins = None else: bin_entry = self.bentry.get() try: nbins = int(bin_entry) except ValueError as e: tkMessageBox.showwarning('Binning error', 'Please enter a valid bin number') return except Exception as e: print e tkMessageBox.showwarning('Binning error', 'Please check your binning options') #analyse data and display results results = ana.analyse(fname, sname, nbins) out_text = '\n'.join('{} {}'.format(*x) for x in zip(self.outstrs, results.values())) self.output_text.set(out_text)
def predesigned_network(network_type): print "running mode:" + network_type prompt = ">" print "What kind of operation you want to run?" print "#1 Run a single experiment;" print "#2 Run a batched experiment;" print "#3 analyse existing experimental results or doing further experiments on existing data" if_batch = int(raw_input(prompt)) if if_batch == 1: run_single(0, network_type) elif if_batch == 2: print "#1: fixed CNN, different ratio; #2:..." run_type = int(raw_input(prompt)) if run_type == 1: os.system('clear') print "============================================================================================" print "Enter a sery of numbers of the ratio of training samples, end with an 'e' or 'end'," print "if you want to use the default sequence 1,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90, enter an 'a' or 'all':" ratios = [] temp_ratio = raw_input(prompt) if temp_ratio == 'a' or temp_ratio == 'all': temp_ratio = [ 1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90 ] else: while temp_ratio != 'e' and temp_ratio != 'end': ratios.append(int(temp_ratio)) temp_ratio = raw_input(prompt) #ratios = temp_ratio print ratios #def run_batch(learning_ratio): # mix_model_svm_ratio = 0 # file_name, neighbors = data_util.prepare(learning_ratio) # print "now gathering the parameters of the network..." # neighbors = neighbors + 1 # print "the neighbors strategy is: " + str(neighbors) print "enter the dataset name:" dataset_fixed = raw_input(prompt) print "enter the neighbor strategy, choose from 1, 4, or 8, end with an 'e' or 'end'. if you want to run on all the strategies, enter an 'a' or 'all' for all 1,4,8 strategies." 
temp_strategies_list = [] temp_strategy_input = raw_input(prompt) if temp_strategy_input == 'a' or temp_strategy_input == 'all': temp_strategies_list = [1, 4, 8] else: while temp_strategy_input != 'e' and temp_strategy_input != 'end': temp_strategies_list.append(int(temp_strategy_input)) temp_strategy_input = raw_input(prompt) #strategy_fixed = raw_input(prompt) os.system('clear') print "Now gathering network configuration parameters for prior proposed Cube CNN...." print "--------------------------------------------------------------------------------------------" print "enter the number of convolutional neurons:" neurons = int(raw_input(prompt)) print "enter the number of layers you want the CNN to operate convolutional operation:" neuronLayersCount = int(raw_input(prompt)) print "enter the kernel size of the maxpooling layer:" maxpoolings = int(raw_input(prompt)) print "enter the number of full layers\' neurons, default is 100:" fullLayers = int(raw_input(prompt)) print "enter the batch size for bsgd:" batch_size = int(raw_input(prompt)) print "enter the learning ratio:" learning = float(raw_input(prompt)) print "enter the train decay:" train_decay = float(raw_input(prompt)) print "enter the epoches you want the network to be trained:" epoches = int(raw_input(prompt)) print "now choose the following strategy after the cnn network been trained:" print "#1:train a cnn-svm joint framework;" print "#2:train a cnn-rf joint framework;" print "#3:train both cnn-svm and cnn-rf joint frameworks;" #if network_type == '3': # print "#4:compare the cube cnn with the new hic framework;" # print "#5:TODO: train a mix assemble cnn-classifier model." #elif network_type == '1' print "#4:TODO: train a mix assemble cnn-classifier model." 
if network_type == '3': print "#5: run and compare the cube cnn with the new hic framework" following_strategy = int(raw_input(prompt)) if network_type == '1' and following_strategy == 4: print "enter the ratio of svm classifier:" mix_model_svm_ratio = int(row_input(prompt)) tress = 0 if following_strategy == 2 or following_strategy == 3: print "enter the count of trees you want to set in Random Forest:" trees = int(raw_input(prompt)) #if network_type == '3' and following_strategy == 4: # print "Now gathering parameter for hic network:" print "How many individual experiments want to take?" experiment_times = raw_input(prompt) for time_counts in range(int(experiment_times)): ltime = time.localtime() time_stamp = str(ltime[0]) + "#" + str(ltime[1]) + "#" + str( ltime[2]) + "#" + str(ltime[3]) + "#" + str(ltime[4]) file = open( "../experiments/BatchExpsFixedCNN_" + time_stamp + ".txt", 'w') resultFile = open( "../experiments/BatchResults_" + time_stamp + ".txt", 'w') file.write("======== Experimental Folders ==========\n") resultFile.write( "=============== Batch Exprimental Results ===============\n" ) resultFile.write( "=========================================================\n" ) #strategiesList = [] #if str(strategy_fixed) == 'a' or strategy_fixed == 'all': # strategiesList = [1,4,8] #else: # strategiesList = [int(strategy_fixed)] # strategiesList = temp_strategies_list for neighbor_strategy_mark in range(len(strategiesList)): neighbor_strategy = strategiesList[neighbor_strategy_mark] print "now is running on strategy " + str( neighbor_strategy) file.write("~~~~~~~~~~~~~~~ Neighbors Strategies:" + str(neighbor_strategy) + " ~~~~~~~~~~~~~~~\n") for temp_mark in range(len(ratios)): learning_ratio = 0 train_decay_inner = 0 batch_size_inner = 0 if ratios[temp_mark] < 10: learning_ratio = learning / 10 train_decay_inner = train_decay / 10 batch_size_inner = batch_size / 10 #elif ratios[temp_mark] < 5: # learning_ratio = learning / 100 # train_decay_inner = train_decay 
/ 100 # batch_size_inner = batch_size / 100 else: learning_ratio = learning train_decay_inner = train_decay batch_size_inner = batch_size #set the full layers nodes to satisfy the change of neighbors strategies. #TODO: need to check if this makes sense #actual_full_layers = 0 #if neighbor_strategy == 4: # actual_full_layers = fullLayers / 2 #elif neighbor_strategy == 1: # actual_full_layers = fullLayers / 4 # for time_counts in range(int(experiment_times)): file_name = run_batch(dataset_fixed, neighbor_strategy, neurons, neuronLayersCount, maxpoolings, fullLayers, batch_size_inner, learning_ratio, train_decay_inner, epoches, following_strategy, trees, ratios[temp_mark], 2) #file_name = run_single(ratitemp_mark]) file.write(file_name + "\n") fileCNNRFResultsPath = file_name + "_CNNRFdescription.txt" if following_strategy == 3: fileCNNSVMResultsPath = file_name + "CNNSVMdescription.txt" resultFile.write( "=========================================================\n" ) resultFile.write(file_name + "\n") inputFileRF = open(fileCNNRFResultsPath, "r") if following_strategy == 3: inputFileSVM = open(fileCNNSVMResultsPath, "r") allLinesRF = inputFileRF.readlines() if following_strategy == 3: allLinesSVM = inputFileSVM.readlines() resultFile.write("CNN-RF Results:\n") for eachLine in allLinesRF: resultFile.write(eachLine) resultFile.write( "-----------------------------------------\n") if following_strategy == 3: resultFile.write("CNN-SVM Results:\n") for eachLine in allLinesSVM: resultFile.write(eachLine) inputFileRF.close() inputFileSVM.close() resultFile.write( "##################################################\n" ) #file.close() resultFile.close() print "The results are stored in the file " + "BatchResults_" + time_stamp + ".txt" print "All folders contains the experiments are stored in the file " + "BatchExpsFixedCNN_" + time_stamp + ".txt" elif if_batch == 3: os.system('clear') analyse.analyse()
# --- Module-level state for the Flask/Redis danmu dashboard ---
async_mode = None
app = Flask(__name__)
app.config['SECRET_KEY'] = 'secret!'
#socketio = SocketIO(app)
# Connected-viewer table, keyed by IP; pruned elsewhere after inactivity.
user_ip = {}
lasttime = time.time()
# decode_responses=True so Redis returns str instead of bytes.
rd = redis.StrictRedis(host='localhost',
                       port=4514,
                       db=0,
                       decode_responses=True)
lastk = []
# One-off room status fetch at import time (room id 801580).
status = requests.get(
    'https://api.live.bilibili.com/room/v1/Room/room_init?id=801580').json()
top, hantalk = analyse()
lastsize = 0
# limiter.init_app(app)
today_danmu = ""


def main_server():
    """Run the Flask app (SocketIO variant left commented out)."""
    #socketio.run(app, host='0.0.0.0', port=14514,debug=False)
    app.run(host='0.0.0.0', port=14514, debug=False, threaded=True)


# NOTE(review): this definition is truncated in this chunk — it continues
# beyond the visible source; left exactly as found.
def reflash_today():
    global today, today_danmu, status, lasttime, user_ip, lastsize
    while True:
        today = time.strftime("%Y-%m-%d", time.localtime())
        status = requests.get(
import sys

from analyse import analyse
from display import print_word_info, print_error_message

# CLI: approximate the word given as command-line argument(s).
if len(sys.argv) < 2:
    print_error_message(
        'Ooops, something went wrong. Have you supplied a word to be approximated?'
    )
    exit(1)

# Multiple arguments are treated as one space-joined word/phrase.
actual_arguments = sys.argv[1:]
word = ' '.join(actual_arguments)

try:
    info = analyse(word)
    print_word_info(info)
# BUG FIX: narrowed the bare `except:` so KeyboardInterrupt/SystemExit
# still propagate instead of being reported as internal errors.
except Exception:
    print_error_message('Ooops, something went wrong internally!')
def main():
    """HiggsML pipeline: extract data, train a random forest per subset,
    re-threshold its predictions, report AMS scores and build a submission.

    Returns the submission object from submission.print_submission.
    """
    ###############
    ### IMPORT ####
    ###############
    # Importation parameters:
    split= True
    normalize = True
    noise_var = 0.
    ratio_train = 0.9
    # Import the training data:
    print("Extracting the data sets...")
    start = time.clock()
    train_s, valid_s, test_s = tokenizer.extract_data(split= split, normalize= normalize, noise_variance= noise_var, ratio_train= ratio_train)
    stop = time.clock()
    print ("Extraction time: %i s") %(stop-start)
    print(" ")
    print(" ")
    ######################
    ### PRE-TREATMENT ####
    ######################
    print("------------------------- Pre-treatment --------------------------")
    ### Average number of signal per subset:
    print("Train subsets signal average:")
    train_s_average = preTreatment.ratio_sig_per_dataset(train_s[2])
    print(" ")
    print("Valid subsets signal average:")
    valid_s_average = preTreatment.ratio_sig_per_dataset(valid_s[2])
    print(" ")
    print(" ")
    ############
    # ANALYSES #
    ############
    # Dictionnary that will contain all the data for each methods. In the end
    # we'll have a dict of dict
    # Keys of the methods : {naiveBayes, svm, kNeighbors, lda, qda, adaBoost,
    # randomForest}
    dMethods ={}
    # RANDOM FOREST:
    kwargs_rdf= {'n_trees': 50}
    dMethods['randomForest'] = analyse.analyse(train_s, valid_s, 'randomForest', kwargs_rdf)
    print(" ")
    ##################
    # POST-TREATMENT #
    ##################
    print("post treatment")
    yProba_s = dMethods['randomForest']['yProba_s']
    yPredicted_s = dMethods['randomForest']['yPredicted_s']
    # Re-threshold: keep only the top 45% most confident positives per subset.
    for n in range(8):
        L = []
        for i in range(yPredicted_s[n].shape[0]):
            if yPredicted_s[n][i] == 1:
                L.append(yProba_s[n][i][1])
        L.sort(reverse = True)
        prob_limit = L[int(len(L)*0.45)]
        for i in range(yPredicted_s[n].shape[0]):
            if yProba_s[n][i][1] < prob_limit:
                yPredicted_s[n][i] = 0
            else:
                yPredicted_s[n][i] = 1
    # Numerical score:
    if type(yPredicted_s) == list:
        for i in range(len(yPredicted_s)):
            sum_s, sum_b = submission.get_numerical_score(yPredicted_s[i],
                                                          valid_s[2][i])
            print "Subset %i: %i elements - sum_s[%i] = %i - sum_b[%i] = %i" \
                %(i, yPredicted_s[i].shape[0], i, sum_s, i, sum_b)
    # Get s and b for each group (s_s, b_s) and the final final_s and
    # final_b:
    final_s, final_b, s_s, b_s = submission.get_s_b_8(yPredicted_s, valid_s[2],
                                                      valid_s[3])
    # Balance the s and b
    final_s *= 250000/25000
    final_b *= 250000/25000
    # AMS final:
    AMS = hbc.AMS(final_s , final_b)
    print ("Expected AMS score for randomforest : %f") %AMS
    #AMS by group
    AMS_s = []
    for i, (s,b) in enumerate(zip(s_s, b_s)):
        s *= 250000/yPredicted_s[i].shape[0]
        b *= 250000/yPredicted_s[i].shape[0]
        score = hbc.AMS(s,b)
        AMS_s.append(score)
        print("Expected AMS score for randomforest : for group %i is : %f" %(i, score))
    print(" ")
    ##############
    # SUBMISSION #
    ##############
    print("-------------------------- Submission ---------------------------")
    # Prediction on the test set:
    # method used for the submission
    # TODO : Verifier que le nom de la method a bien la bonne forme(
    # creer une liste de noms de methodes)
    #method = "randomForest"
    #test_prediction_s, test_proba_s = eval(method).get_test_prediction(
    #                                        dMethods[method]['predictor_s'],
    #                                        test_s[1])
    # NOTE(review): dSl is not defined anywhere in this function — this call
    # will raise NameError unless dSl is a module-level name; confirm.
    test_prediction_s, test_proba_s = postTreatment.get_SL_test_prediction(
        dMethods, dSl, test_s[1])
    print("Test subsets signal average:")
    test_s_average = preTreatment.ratio_sig_per_dataset(test_prediction_s)
    print(" ")
    #RankOrder = np.arange(1,550001)
    if type(test_prediction_s) == list:
        test_prediction_s = np.concatenate(test_prediction_s)
        test_proba_s = np.concatenate(test_proba_s)
        RankOrder = postTreatment.rank_signals(test_proba_s)
        ID = np.concatenate(test_s[0])
    else:
        # NOTE(review): in this branch RankOrder is never assigned (the
        # np.arange fallback above is commented out) — print_submission below
        # would raise NameError; confirm which path is actually exercised.
        ID = test_s[0]
    # Create a submission file:
    sub = submission.print_submission(ID, RankOrder , test_prediction_s)
    return sub
def FitnessFunction(point, sample):
    """Evaluate one muon-shield parameter point for one sample: build (or
    reuse) the geometry on shared EOS storage, run the simulation, analyse
    the output, and write [weight, length, muons_w] to the CSV output.

    NOTE(review): depends on module-level names outside this view
    (config, create_id, generate_geo, generate, analyse, FCN, filelock,
    root_input_name, root_output_name, csv_output_name, r, np, copy, os,
    shutil, subprocess, csv, sleep). Python 2 syntax.
    """
    try:
        tmpl = copy.deepcopy(config.RESULTS_TEMPLATE)
        params = point
        # Shared files are keyed by sample number plus a hash of the
        # parameters so concurrent workers can reuse each other's geometry.
        paramFile = '/eos/experiment/ship/user/ffedship/EA_V2/Shared/params' + str(
            sample) + '_{}.root'.format(create_id(params))
        geoinfoFile = paramFile.replace('params', 'geoinfo')
        heavy = '/eos/experiment/ship/user/ffedship/EA_V2/Shared/heavy' + str(
            sample) + '_{}'.format(create_id(params))
        lockfile = paramFile + '.lock'
        print heavy, lockfile
        if os.path.exists(geoinfoFile):
            # Geometry info already computed by another worker: read it.
            geolockfile = geoinfoFile + '.lock'
            lock = filelock.FileLock(geolockfile)
            if not lock.is_locked:
                with lock:
                    with open(geoinfoFile, 'r') as f:
                        length, weight = map(float, f.read().strip().split(','))
                        tmpl['weight'] = weight
                        tmpl['length'] = length
        # Wait until a geometry exists, creating it ourselves if we can take
        # the lock; 'heavy' marks parameter sets already rejected.
        while not os.path.exists(paramFile) and not os.path.exists(heavy):
            lock = filelock.FileLock(lockfile)
            if not lock.is_locked:
                with lock:
                    tmpl['status'] = 'Acquired lock.'
                    tmp_paramFile = generate_geo(
                        paramFile.replace('.r', '.tmp.r'), params)
                    # Side process computes geometry length/weight into
                    # geoinfoFile.
                    subprocess.call([
                        'python2',
                        '/afs/cern.ch/user/f/ffedship/private/EA_Muon_Shield_V2/get_geo.py',
                        '-g', tmp_paramFile, '-o', geoinfoFile
                    ])
                    shutil.move(
                        '/eos/experiment/ship/user/ffedship/EA_V2/Shared/' +
                        os.path.basename(tmp_paramFile),
                        paramFile.replace('shared', 'output').replace('params', 'geo'))
                    with open(geoinfoFile, 'r') as f:
                        length, weight = map(float, f.read().strip().split(','))
                        tmpl['weight'] = weight
                        tmpl['length'] = length
                    shutil.move(
                        '/eos/experiment/ship/user/ffedship/EA_V2/Geometry/' +
                        os.path.basename(tmp_paramFile), paramFile)
                    tmpl['status'] = 'Created geometry.'
                    print "Fitness Function Message: Geometry has been generated using config ", point
                    print "Fitness Function Message: Length ", length
                    print "Fitness Function Message: Weight ", weight
            else:
                # Another worker holds the lock: wait, then re-check.
                sleep(60)
        outFile = root_output_name
        tmpl['status'] = 'Simulating...'
        generate(inputFile=root_input_name,
                 paramFile=paramFile,
                 outFile=root_output_name,
                 seed=1,
                 nEvents=10000)
        tmpl['status'] = 'Analysing...'
        chain = r.TChain('cbmsim')
        chain.Add(outFile)
        # xs: per-muon weights surviving the shield.
        xs = analyse(chain, 'hists.root')
        tmpl['muons'] = len(xs)
        tmpl['muons_w'] = sum(xs)
        print "muons: ", tmpl['muons']
        print "muons_w: ", tmpl['muons_w']
        print "Fitness", FCN(tmpl['weight'], np.array(xs), tmpl['length'])[0]
        XS_output = open(csv_output_name, "w")
        XS_write = csv.writer(XS_output)
        XS_write.writerow([tmpl['weight'], tmpl['length'], tmpl['muons_w']])
        XS_output.close()
        tmpl['error'] = None
        tmpl['status'] = 'Done.'
        os.remove(root_output_name)
    except:
        # NOTE(review): bare except hides ANY failure (including genuine
        # bugs and KeyboardInterrupt) behind the penalty values below --
        # consider narrowing to Exception and logging the traceback.
        print "EA_LL_FCN Message: Wrong geometry, operation rejected, negative values assigned"
        XS_output = open(csv_output_name, "w")
        XS_write = csv.writer(XS_output)
        XS_write.writerow([100000000, 10000000, 100000000])
        XS_output.close()
def predesigned_network(network_type): print "running mode:" + network_type prompt = ">" print "What kind of operation you want to run?" print "#1 Run a single experiment;" print "#2 Run a batched experiment;" print "#3 analyse existing experimental results or doing further experiments on existing data" if_batch = int(raw_input(prompt)) if if_batch == 1: run_single(0, network_type) elif if_batch == 2: print "#1: fixed CNN, different ratio; #2:..." run_type = int(raw_input(prompt)) if run_type == 1: os.system('clear') print "============================================================================================" print "Enter a sery of numbers of the ratio of training samples, end with an 'e' or 'end'," print "if you want to use the default sequence 1,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90, enter an 'a' or 'all':" ratios = [] temp_ratio = raw_input(prompt) if temp_ratio == 'a' or temp_ratio == 'all': temp_ratio = [1,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90] else: while temp_ratio != 'e' and temp_ratio != 'end': ratios.append(int(temp_ratio)) temp_ratio = raw_input(prompt) #ratios = temp_ratio print ratios #def run_batch(learning_ratio): # mix_model_svm_ratio = 0 # file_name, neighbors = data_util.prepare(learning_ratio) # print "now gathering the parameters of the network..." # neighbors = neighbors + 1 # print "the neighbors strategy is: " + str(neighbors) print "enter the dataset name:" dataset_fixed = raw_input(prompt) print "enter the neighbor strategy, choose from 1, 4, or 8, end with an 'e' or 'end'. if you want to run on all the strategies, enter an 'a' or 'all' for all 1,4,8 strategies." 
temp_strategies_list = [] temp_strategy_input = raw_input(prompt) if temp_strategy_input == 'a' or temp_strategy_input == 'all': temp_strategies_list = [1,4,8] else: while temp_strategy_input != 'e' and temp_strategy_input != 'end': temp_strategies_list.append(int(temp_strategy_input)) temp_strategy_input = raw_input(prompt) #strategy_fixed = raw_input(prompt) os.system('clear') print "Now gathering network configuration parameters for prior proposed Cube CNN...." print "--------------------------------------------------------------------------------------------" print "enter the number of convolutional neurons:" neurons = int(raw_input(prompt)) print "enter the number of layers you want the CNN to operate convolutional operation:" neuronLayersCount = int(raw_input(prompt)) print "enter the kernel size of the maxpooling layer:" maxpoolings = int(raw_input(prompt)) print "enter the number of full layers\' neurons, default is 100:" fullLayers = int(raw_input(prompt)) print "enter the batch size for bsgd:" batch_size = int(raw_input(prompt)) print "enter the learning ratio:" learning = float(raw_input(prompt)) print "enter the train decay:" train_decay = float(raw_input(prompt)) print "enter the epoches you want the network to be trained:" epoches = int(raw_input(prompt)) print "now choose the following strategy after the cnn network been trained:" print "#1:train a cnn-svm joint framework;" print "#2:train a cnn-rf joint framework;" print "#3:train both cnn-svm and cnn-rf joint frameworks;" #if network_type == '3': # print "#4:compare the cube cnn with the new hic framework;" # print "#5:TODO: train a mix assemble cnn-classifier model." #elif network_type == '1' print "#4:TODO: train a mix assemble cnn-classifier model." 
if network_type == '3': print "#5: run and compare the cube cnn with the new hic framework" following_strategy = int(raw_input(prompt)) if network_type == '1' and following_strategy == 4: print "enter the ratio of svm classifier:" mix_model_svm_ratio = int(row_input(prompt)) tress = 0 if following_strategy == 2 or following_strategy == 3: print "enter the count of trees you want to set in Random Forest:" trees = int(raw_input(prompt)) #if network_type == '3' and following_strategy == 4: # print "Now gathering parameter for hic network:" print "How many individual experiments want to take?" experiment_times = raw_input(prompt) for time_counts in range(int(experiment_times)): ltime = time.localtime() time_stamp = str(ltime[0]) + "#" + str(ltime[1]) + "#" + str(ltime[2]) + "#" + str(ltime[3]) + "#" + str(ltime[4]) file = open("../experiments/BatchExpsFixedCNN_" + time_stamp + ".txt", 'w') resultFile = open("../experiments/BatchResults_" + time_stamp + ".txt", 'w') file.write("======== Experimental Folders ==========\n") resultFile.write("=============== Batch Exprimental Results ===============\n") resultFile.write("=========================================================\n") #strategiesList = [] #if str(strategy_fixed) == 'a' or strategy_fixed == 'all': # strategiesList = [1,4,8] #else: # strategiesList = [int(strategy_fixed)] # strategiesList = temp_strategies_list for neighbor_strategy_mark in range(len(strategiesList)): neighbor_strategy = strategiesList[neighbor_strategy_mark] print "now is running on strategy " + str(neighbor_strategy) file.write("~~~~~~~~~~~~~~~ Neighbors Strategies:" + str(neighbor_strategy) +" ~~~~~~~~~~~~~~~\n") for temp_mark in range(len(ratios)): learning_ratio = 0 train_decay_inner = 0 batch_size_inner = 0 if ratios[temp_mark] < 10: learning_ratio = learning / 10 train_decay_inner = train_decay / 10 batch_size_inner = batch_size / 10 #elif ratios[temp_mark] < 5: # learning_ratio = learning / 100 # train_decay_inner = train_decay / 100 # 
batch_size_inner = batch_size / 100 else: learning_ratio = learning train_decay_inner = train_decay batch_size_inner = batch_size #set the full layers nodes to satisfy the change of neighbors strategies. #TODO: need to check if this makes sense #actual_full_layers = 0 #if neighbor_strategy == 4: # actual_full_layers = fullLayers / 2 #elif neighbor_strategy == 1: # actual_full_layers = fullLayers / 4 # for time_counts in range(int(experiment_times)): file_name = run_batch(dataset_fixed,neighbor_strategy, neurons, neuronLayersCount, maxpoolings,fullLayers, batch_size_inner, learning_ratio, train_decay_inner, epoches, following_strategy, trees, ratios[temp_mark], 2) #file_name = run_single(ratitemp_mark]) file.write(file_name + "\n") fileCNNRFResultsPath = file_name + "_CNNRFdescription.txt" if following_strategy == 3: fileCNNSVMResultsPath = file_name + "CNNSVMdescription.txt" resultFile.write("=========================================================\n") resultFile.write(file_name + "\n") inputFileRF = open(fileCNNRFResultsPath, "r") if following_strategy == 3: inputFileSVM = open(fileCNNSVMResultsPath, "r") allLinesRF = inputFileRF.readlines() if following_strategy == 3: allLinesSVM = inputFileSVM.readlines() resultFile.write("CNN-RF Results:\n") for eachLine in allLinesRF: resultFile.write(eachLine) resultFile.write("-----------------------------------------\n") if following_strategy == 3: resultFile.write("CNN-SVM Results:\n") for eachLine in allLinesSVM: resultFile.write(eachLine) inputFileRF.close() inputFileSVM.close() resultFile.write("##################################################\n") #file.close() resultFile.close() print "The results are stored in the file " + "BatchResults_" + time_stamp + ".txt" print "All folders contains the experiments are stored in the file " + "BatchExpsFixedCNN_" + time_stamp + ".txt" elif if_batch == 3: os.system('clear') analyse.analyse()
# NOTE(review): fragment of a larger acquisition script -- anadbprocess,
# control_flag, mbqueue, dbqueue, live_ports, livedatadict, datadict, dt,
# time and ana are defined outside this view.
anadbprocess.start()
print('Started analyse database process')
timestamp = int(dt.datetime.now().timestamp())
# Poll the measurement queue until the shared control flag is raised.
while control_flag.value == 0:
    if mbqueue.qsize() == 0:
        # Nothing queued yet: back off briefly instead of busy-waiting.
        time.sleep(0.2)
    else:
        time1 = time.time()
        # get data from mbqueue
        pq_data, timestamp = mbqueue.get()
        # send data to analysis func
        frequency_10s, status_dict = ana.analyse(pq_data)
        # create dict for database insert and showing on website
        for addr in pq_data.index:
            if addr in live_ports:
                livedatadict['port_' + str(addr)] = pq_data[addr]
            datadict['port_' + str(addr)] = pq_data[addr]
        # add primary key to every dict and frequency 10s
        datadict['timestamp'] = timestamp
        datadict['frequency_10s'] = frequency_10s
        # insert data in dbqueue
        dbqueue.put(datadict)
        # create data json
        # NOTE(review): tail of a CSV-dump helper whose definition lies
        # outside this view (writer, src, neuron_index, csvFile).
        writer.writerow(src[neuron_index])
    csvFile.close()


def mapping_process():
    """Build the PN->KC spiking network on sPyNNaker, run it for SIM_TIME
    ms, and return the recorded KC spike trains.

    :return: the Neo spike trains of the KC population
        (``segments[0].spiketrains``).

    NOTE(review): depends on module-level helpers outside this view
    (spynnaker, readData, setupLayer_PN, setupLayer_KC,
    setupProjection_PN_KC) and on the global SIM_TIME.
    """
    # SIM_TIME is set in the __main__ block below before this is called.
    assert SIM_TIME > 0
    spynnaker.setup(timestep=1)
    spynnaker.set_number_of_neurons_per_core(spynnaker.IF_curr_exp, 50)
    time_space = readData()
    pn_population = setupLayer_PN(time_space)
    kc_population = setupLayer_KC()
    # Record spikes so they can be extracted after the run.
    kc_population.record(["spikes"])
    pn_kc_projection = setupProjection_PN_KC(pn_population, kc_population)
    spynnaker.run(SIM_TIME)
    neo = kc_population.get_data(variables=["spikes"])
    spikeData_original = neo.segments[0].spiketrains
    spynnaker.end()
    return spikeData_original


if __name__ == '__main__':
    # Number of input samples comes from the command line.
    NUMBER_OF_DATA = int(sys.argv[1])
    SIM_TIME = NUMBER_OF_DATA * 50  # for now, default expose_time = 50 ms per sample
    begin = time.time()
    spikeData_original = mapping_process()
    end = time.time()
    operation_time("***Whole process of Map.py", begin=begin, end=end)
    ana.analyse(spikeData_original)
def search():
    """Analyse the password supplied via the ``passwd`` query parameter."""
    return analyse(request.args.get('passwd'))
valid_s = tuple(valid_s) test_s = tuple(test_s) print(" ") ### Classifier # Linear SVM: dMethods = {} kwargs_linearSVM= {'penalty': 'l2', 'loss': 'l2', 'dual': True, 'tol': 0.0001, 'C': 1.0, 'multi_class': 'ovr', 'fit_intercept': True, 'intercept_scaling': 1, 'class_weight': None, 'verbose': 0, 'random_state': None} dMethods['linearSVM'] = analyse.analyse(train_s= train_s, train2_s= train_s_2, valid_s= valid_s, method_name = 'linearSVM', kwargs = kwargs_linearSVM) print dMethods['linearSVM']['AMS_treshold_valid'] """ if load_only == True: # Load learning: print(" ") sub_folder = 'unsupervised' load_dir = os.path.join(load_path,load_path,sub_folder) stack_AE = SAE.load(load_dir) reconstructed_layer_value, error = stack_AE.reconstruct(test_set_x) print("The error of the loaded network reconstruction is: {0}".format(error.eval()), "%")
for self.i in self.out: if (self.i["view"] == "pro"): self.procount = self.procount + 1 else: self.negcount = self.negcount + 1 print("Tweets for:", self.procount) print("Tweets against:", self.negcount) if self.procount > self.negcount: print("Twitter user are in favor of:", self.input) else: print("Twitter user are not in favor of:", self.input) elif (self.userAnswer == "4"): anas.twitPollCompare() elif (self.userAnswer == "5"): anas.outOldData() else: print("Plase enter a valid input (1,2,3,4,5).") go.menu() return 0 dis = display() threading.Thread(target=dis.slider, args=("Connecting ", )).start() twit = twitterAPI() mong = mongo() anas = analyse(mong.conn()) coll = collection(twit.authentigate(False), mong.conn()) go = Main(twit.authentigate(False), mong.conn()) dis.stop() go.menu() #calls the function that gets tweets and puts them in the DB
def highchart(request):
    """Refresh the analysis, then render the scholarship highchart page."""
    analyse()
    page = loader.get_template('scholarship/highchart.html')
    return HttpResponse(page.render(request))
def main():
    """Fit several classifiers on the tokenized data sets, search the best
    decision threshold globally and per subset, and return the per-subset
    winning predictions together with the validation data.

    :return: (best_yPredicted_s, valid_s)

    NOTE(review): relies on module-level imports not visible here
    (time, tokenizer, preTreatment, analyse, tuningModel,
    combineClassifiers, tresholding, submission, hbc, np). Python 2.
    """
    ###############
    ### IMPORT ####
    ###############
    # Importation parameters:
    split= True
    normalize = True
    noise_var = 0.
    ratio_train = 0.9
    # Import the training data:
    print("Extracting the data sets...")
    start = time.clock()
    train_s, valid_s, test_s = tokenizer.extract_data(split= split, \
            normalize= normalize, \
            noise_variance= noise_var, \
            ratio_train= ratio_train)
    # Concatenated labels and weights of the 8 validation subsets.
    yValid_conca = preTreatment.concatenate_vectors(valid_s[2])
    weights_conca = preTreatment.concatenate_vectors(valid_s[3])
    stop = time.clock()
    print ("Extraction time: %i s") %(stop-start)
    print(" ")
    print(" ")
    # Create the elected vectors for each group (best AMS score)
    best_yPredicted_s = [np.zeros(valid_s[2][i].shape[0]) for i in range(8)]
    best_yProba_s = [np.zeros(valid_s[2][i].shape[0]) for i in range(8)]
    best_AMS_s = [0. for i in range(8)]
    best_method_s = [0 for i in range(8)]
    best_ratio_s = [0 for i in range(8)]
    best_AMS_1_method = 0.
    best_method = "methode"
    # NOTE(review): initialised to a string but later printed with %f --
    # TypeError if no method ever beats best_AMS_1_method; confirm.
    best_ratio = "0."
    ######################
    ### PRE-TREATMENT ####
    ######################
    print("------------------------- Pre-treatment --------------------------")
    ### Average number of signal per subset:
    print("Train subsets signal average:")
    train_s_average = preTreatment.ratio_sig_per_dataset(train_s[2])
    print(" ")
    print("Valid subsets signal average:")
    valid_s_average = preTreatment.ratio_sig_per_dataset(valid_s[2])
    print(" ")
    print(" ")
    ############
    # ANALYSES #
    ############
    # Dictionary that will contain all the data for each method. In the end
    # we'll have a dict of dict.
    # Keys of the methods : {naiveBayes, svm, kNeighbors, lda, qda, adaBoost,
    # randomForest, gradientBoosting}
    dMethods ={}
    # NAIVE BAYES:
    kwargs_bayes = {}
    dMethods['naiveBayes'] = analyse.analyse(train_s, valid_s, 'naiveBayes',
                                             kwargs_bayes)
    # NOTE(review): naiveBayes is fitted twice with identical arguments --
    # the second call just overwrites the first; probably a copy/paste slip.
    kwargs_bayes = {}
    dMethods['naiveBayes'] = analyse.analyse(train_s, valid_s, 'naiveBayes',
                                             kwargs_bayes)
    # SVM
    kwargs_svm ={}
    dMethods['svm'] = analyse.analyse(train_s, valid_s,'svm', kwargs_svm)
    # K NEIGHBORS
    kwargs_tuning_kn = {'n_neighbors': [20,50]}
    dTuning = tuningModel.parameters_grid_search(train_s, valid_s,
                                                 'kNeighbors', kwargs_tuning_kn)
    dMethods['kNeighbors'] = combineClassifiers.select_best_classifiers(dTuning,
                                                                        valid_s)
    # LDA
    kwargs_lda = {}
    dMethods['lda'] = analyse.analyse(train_s, valid_s, 'lda', kwargs_lda)
    # QDA
    kwargs_qda= {}
    dMethods['qda'] = analyse.analyse(train_s, valid_s, 'qda', kwargs_qda)
    # ADABOOST
    kwargs_ada= { 'n_estimators': 50,
                  'learning_rate': 1.,
                  'algorithm': 'SAMME.R',
                  'random_state':None}
    dMethods['adaBoost'] = analyse.analyse(train_s, valid_s, 'adaBoost',
                                           kwargs_ada)
    # RANDOM FOREST:
    kwargs_tuning_rdf = {'n_estimators': [10,50,100]}
    dTuning = tuningModel.parameters_grid_search(train_s, valid_s,
                                                 'randomForest', kwargs_tuning_rdf)
    dMethods['randomForest'] = combineClassifiers.select_best_classifiers(dTuning,
                                                                          valid_s)
    # GRADIENT BOOSTING
    # NOTE(review): this first fit is immediately overwritten by the
    # grid-search result below.
    kwargs_gradB = {}
    dMethods['gradientBoosting'] = analyse.analyse(train_s, valid_s,
                                                   'gradientBoosting', kwargs_gradB)
    kwargs_tuning_gradB = {'loss': ['deviance'], 'learning_rate': [0.1],
                           'n_estimators': [100], 'subsample': [1.0],
                           'min_samples_split': [2], 'min_samples_leaf': [1],
                           'max_depth': [10], 'init': [None],
                           'random_state': [None], 'max_features': [None],
                           'verbose': [0]}
    dTuning = tuningModel.parameters_grid_search(train_s, valid_s,
                                                 'gradientBoosting', kwargs_tuning_gradB)
    dMethods['gradientBoosting'] = combineClassifiers.select_best_classifiers(
                                                 dTuning, valid_s)
    print(" ")
    ##################
    # POST-TREATMENT #
    ##################
    print("-------------------- Best overall combination --------------------")
    dCombine = combineClassifiers.select_best_classifiers(dMethods, valid_s)
    print("-------------------------- Thresholding --------------------------")
    # COMBINED CLASSIFIERS:
    # NOTE(review): f is opened but never written to nor closed.
    f = open("Tests/test_treshold_combined.txt","w")
    yProba_s = dCombine['yProba_s']
    yPredicted_s = dCombine['yPredicted_s']
    #Let's concatenate the vectors
    yProba_conca = preTreatment.concatenate_vectors(yProba_s)
    yPredicted_conca = preTreatment.concatenate_vectors(yPredicted_s)
    # Best treshold global
    best_treshold = tresholding.best_treshold(yProba_conca, yValid_conca,
                                              weights_conca)
    yPredicted_treshold = tresholding.get_yPredicted_treshold(yProba_conca,
                                                              best_treshold)
    s, b = submission.get_s_b(yPredicted_treshold, yValid_conca, weights_conca)
    # Rescale the validation split to the full data set.
    s *= 10
    b *= 10
    ams = hbc.AMS(s,b)
    if ams > best_AMS_1_method:
        best_AMS_1_method = ams
        # NOTE(review): `i` is not defined at this point (the loop below has
        # not run yet) -- NameError if this branch is taken; confirm intent.
        best_method = dCombine['method'][i]
        best_ratio = best_treshold
    # Best treshold group by group
    for i in range(8):
        best_treshold = tresholding.best_treshold(yProba_s[i], valid_s[2][i],
                                                  valid_s[3][i])
        yPredicted_s[i] = tresholding.get_yPredicted_treshold(yProba_s[i],
                                                              best_treshold)
        s, b = submission.get_s_b(yPredicted_s[i], valid_s[2][i], valid_s[3][i])
        s *= 250000/yPredicted_s[i].shape[0]
        b *= 250000/yPredicted_s[i].shape[0]
        ams = hbc.AMS(s,b)
        if ams > best_AMS_s[i]:
            best_yPredicted_s[i] = yPredicted_s[i]
            best_yProba_s[i] = yProba_s[i]
            best_AMS_s[i] = ams
            best_method_s[i] = dCombine['method'][i]
            best_ratio_s[i] = best_treshold
    # FOR EACH METHOD: repeat the same global / per-group threshold search.
    for method in dMethods:
        yProba_s = dMethods[method]['yProba_s']
        yPredicted_s = dMethods[method]['yPredicted_s']
        #Let's concatenate the vectors
        yProba_conca = preTreatment.concatenate_vectors(yProba_s)
        yPredicted_conca = preTreatment.concatenate_vectors(yPredicted_s)
        # Best treshold global
        best_treshold = tresholding.best_treshold(yProba_conca, yValid_conca,
                                                  weights_conca)
        yPredicted_treshold = tresholding.get_yPredicted_treshold(yProba_conca,
                                                                  best_treshold)
        s, b = submission.get_s_b(yPredicted_treshold, yValid_conca,
                                  weights_conca)
        s *= 10
        b *= 10
        ams = hbc.AMS(s,b)
        if ams > best_AMS_1_method:
            best_AMS_1_method = ams
            best_method = str(method)
            best_ratio = best_treshold
        # Best treshold group by group
        for i in range(8):
            best_treshold = tresholding.best_treshold(yProba_s[i],
                                                      valid_s[2][i],
                                                      valid_s[3][i])
            yPredicted_s[i] = tresholding.get_yPredicted_treshold(yProba_s[i],
                                                                  best_treshold)
            s, b = submission.get_s_b(yPredicted_s[i], valid_s[2][i],
                                      valid_s[3][i])
            s *= 250000/yPredicted_s[i].shape[0]
            b *= 250000/yPredicted_s[i].shape[0]
            ams = hbc.AMS(s,b)
            if ams > best_AMS_s[i]:
                best_yPredicted_s[i] = yPredicted_s[i]
                best_yProba_s[i] = yProba_s[i]
                best_AMS_s[i] = ams
                best_method_s[i] = str(method)
                best_ratio_s[i] = best_treshold
    # Let's concatenate the 8 vectors which performs the best on each on
    # each of the sub group and tresholding it
    best_yPredicted_conca = preTreatment.concatenate_vectors(best_yPredicted_s)
    best_treshold_conca = tresholding.best_treshold(best_yPredicted_conca,
                                                    yValid_conca, weights_conca)
    best_yPredicted_conca_treshold = tresholding.get_yPredicted_treshold(best_yPredicted_conca,
                                                                         best_treshold_conca)
    best_final_s, best_final_b, best_s_s, best_b_s = submission.get_s_b_8(best_yPredicted_s,
                                                                          valid_s[2],
                                                                          valid_s[3])
    best_s_treshold, best_b_treshold = submission.get_s_b(best_yPredicted_conca_treshold,
                                                          yValid_conca,
                                                          weights_conca)
    best_final_s *= 10
    best_final_b *= 10
    best_s_treshold *= 10
    best_b_treshold *= 10
    best_AMS = hbc.AMS(best_final_s, best_final_b)
    best_AMS_treshold = hbc.AMS(best_s_treshold, best_b_treshold)
    print "Best AMS using one of the methods : %f" %best_AMS_1_method
    print " method : %s" %(str(method))
    print " ratio : %f" %(best_ratio)
    print " "
    print "Best AMS final : %f" %best_AMS
    print "Best AMS final after final tresholding : %f" %best_AMS_treshold
    print "best ratio on the concatenated vector : %f" %best_treshold_conca
    print " "
    for n in range(8):
        print "Best AMS group %i: %f - method %s - ratio %f" \
                %(n, best_AMS_s[n], best_method_s[n], best_ratio_s[n])
    return best_yPredicted_s, valid_s
def main():
    """Fit a set of classifiers (some via grid search), select the best per
    validation subset, print the winning parameters, and return the
    combined-selection dict.

    :return: dict produced by combineClassifiers.select_best_classifiers.

    NOTE(review): relies on module-level imports not visible here
    (time, tokenizer, preTreatment, analyse, tuningModel,
    combineClassifiers). Python 2.
    """
    ###############
    ### IMPORT ####
    ###############
    # Importation parameters:
    split= True
    normalize = True
    noise_var = 0.
    ratio_train = 0.9
    # Import the training data:
    print("Extracting the data sets...")
    start = time.clock()
    train_s, valid_s, test_s = tokenizer.extract_data(split= split,
                                                      normalize= normalize,
                                                      noise_variance= noise_var,
                                                      ratio_train= ratio_train)
    stop = time.clock()
    print ("Extraction time: %i s") %(stop-start)
    print(" ")
    print(" ")
    ######################
    ### PRE-TREATMENT ####
    ######################
    print("------------------------- Pre-treatment --------------------------")
    ### Average number of signal per subset:
    print("Train subsets signal average:")
    train_s_average = preTreatment.ratio_sig_per_dataset(train_s[2])
    print(" ")
    print("Valid subsets signal average:")
    valid_s_average = preTreatment.ratio_sig_per_dataset(valid_s[2])
    print(" ")
    print(" ")
    ############
    # ANALYSES #
    ############
    # Dictionary that will contain all the data for each method. In the end
    # we'll have a dict of dict.
    # Keys of the methods : {naiveBayes, svm, kNeighbors, lda, qda, adaBoost,
    # randomForest}
    dMethods ={}
    # NAIVE BAYES:
    kwargs_bayes = {}
    dMethods['naiveBayes'] = analyse.analyse(train_s, valid_s, 'naiveBayes',
                                             kwargs_bayes)
    # SVM (disabled)
    """
    kwargs_tuning_svm ={'kernel': ["rbf", "poly"], 'C' : [0.025],
                        'probability': [True]}
    dTuning = tuningModel.parameters_grid_search(train_s, valid_s, 'svm',
                                                 kwargs_tuning_svm)
    dMethods['svm'] = combineClassifiers.select_best_classifiers(dTuning, valid_s)
    """
    # K NEIGHBORS
    kwargs_tuning_kn = {'n_neighbors': [10,20]}
    dTuning = tuningModel.parameters_grid_search(train_s, valid_s,
                                                 'kNeighbors', kwargs_tuning_kn)
    dMethods['kNeighbors'] = combineClassifiers.select_best_classifiers(dTuning,
                                                                        valid_s)
    # LDA
    kwargs_lda = {}
    dMethods['lda'] = analyse.analyse(train_s, valid_s, 'lda', kwargs_lda)
    # QDA
    kwargs_qda= {}
    dMethods['qda'] = analyse.analyse(train_s, valid_s, 'qda', kwargs_qda)
    # ADABOOST
    kwargs_ada= {'n_estimators': 50,
                 'learning_rate': 1.0,
                 'algorithm': 'SAMME.R',
                 'random_state': None}
    #kwargs_ada = {}
    dMethods['adaBoost'] = analyse.analyse(train_s, valid_s, 'adaBoost',
                                           kwargs_ada)
    # GRADIENT BOOSTING:
    kwargs_tuning_gradB = {'loss': ['deviance'], 'learning_rate': [0.1],
                           'n_estimators': [100,200], 'subsample': [1.0],
                           'min_samples_split': [2], 'min_samples_leaf': [200],
                           'max_depth': [10], 'init': [None],
                           'random_state': [None], 'max_features': [None],
                           'verbose': [0]}
    dTuning = tuningModel.parameters_grid_search(train_s, valid_s,
                                                 'gradientBoosting',
                                                 kwargs_tuning_gradB)
    dMethods['gradientBoosting'] = combineClassifiers.select_best_classifiers(
                                                 dTuning, valid_s)
    # RANDOM FOREST:
    kwargs_tuning_rdf = {'n_estimators': [10,20,50,100]}
    dTuning = tuningModel.parameters_grid_search(train_s, valid_s,
                                                 'randomForest',
                                                 kwargs_tuning_rdf)
    dMethods['randomForest'] = combineClassifiers.select_best_classifiers(dTuning,
                                                                          valid_s)
    print(" ")
    ##################
    # POST-TREATMENT #
    ##################
    print("------------------------ Post Treatment -----------------------")
    d = combineClassifiers.select_best_classifiers(dMethods, valid_s)
    print (" ")
    for i in range(len(d['parameters'])):
        print "Best classifier for subset %i : " %i
        if type(d['method'][i]) == list:
            # NOTE(review): double index [i][i] looks suspicious -- probably
            # meant a fixed inner index; confirm the structure of d['method'].
            print d['method'][i][i], ": ", d['parameters'][i]
        else:
            print d['method'][i], ": ", d['parameters'][i]
    # Submission stage kept disabled below.
    """
    ##############
    # SUBMISSION #
    ##############
    print("-------------------------- Submission ---------------------------")
    # Prediction on the test set:
    # method used for the submission
    # TODO : check that the method name has the right form
    # (build a list of valid method names)
    #method = "randomForest"
    #test_prediction_s, test_proba_s = eval(method).get_test_prediction(
    #                                       dMethods[method]['predictor_s'],
    #                                       test_s[1])
    test_prediction_s, test_proba_s = onTopClassifier.get_SL_test_prediction(
                                                dMethods, dSl, test_s[1])
    print("Test subsets signal average:")
    test_s_average = preTreatment.ratio_sig_per_dataset(test_prediction_s)
    print(" ")
    #RankOrder = np.arange(1,550001)
    if type(test_prediction_s) == list:
        test_prediction_s = np.concatenate(test_prediction_s)
        test_proba_s = np.concatenate(test_proba_s)
        RankOrder = onTopClassifier.rank_signals(test_proba_s)
        ID = np.concatenate(test_s[0])
    else:
        ID = test_s[0]
    # Create a submission file:
    sub = submission.print_submission(ID, RankOrder , test_prediction_s)
    """
    return d
'penalty': 'l2', 'loss': 'l2', 'dual': True, 'tol': 0.0001, 'C': 1.0, 'multi_class': 'ovr', 'fit_intercept': True, 'intercept_scaling': 1, 'class_weight': None, 'verbose': 0, 'random_state': None } dMethods['linearSVM'] = analyse.analyse(train_s=train_s, train2_s=train_s_2, valid_s=valid_s, method_name='linearSVM', kwargs=kwargs_linearSVM) print dMethods['linearSVM']['AMS_treshold_valid'] """ if load_only == True: # Load learning: print(" ") sub_folder = 'unsupervised' load_dir = os.path.join(load_path,load_path,sub_folder) stack_AE = SAE.load(load_dir) reconstructed_layer_value, error = stack_AE.reconstruct(test_set_x) print("The error of the loaded network reconstruction is: {0}".format(error.eval()), "%") """
dMethods['naiveBayes'] = analyse.analyse(train_s= train_RM_s, train2_s= train_RM_s_2, valid_s= valid_RM_s, method_name = 'naiveBayes', kwargs = kwargs_bayes) """ # SVM """ kwargs_svm ={} dMethods['svm'] = analyse.analyse(train_s, valid_s,'svm', kwargs_svm) """ # K NEIGHBORS kwargs_kn = {'n_neighbors': 20} dMethods['kNeighbors_RM_' + str(n_removeFeatures)] = analyse.analyse( train_s= train_RM_s, train2_s= train_RM_s_2, valid_s= valid_RM_s, method_name= 'kNeighbors', kwargs= kwargs_kn) """ # LDA kwargs_lda = {} dMethods['lda'] = analyse.analyse(train_s= train_RM_s, train2_s= train_RM_s_2, valid_s= valid_RM_s, method_name = 'lda', kwargs = kwargs_lda) # QDA kwargs_qda= {} dMethods['qda'] = analyse.analyse(train_s= train_RM_s, train2_s= train_RM_s_2,
#!/usr/bin/env python3
#
# Copyright (c) 2014 Paul Gerrard
# This program is free software.
# license: GNU General Public License version 3
#
# This code is an example from `Lean Python`: http://leanpy.com/
#
# Read numbers from stdin until a blank line, then print the min, max,
# average and sum computed by analyse().
from analyse import analyse

numlist = []
while True:
    nextnum = input('Enter a number or blank line:')
    if len(nextnum) == 0:
        break
    #
    # try and obtain a floating point number from the input
    #
    try:
        num = float(nextnum)
        numlist.append(num)
    except ValueError:
        # BUG FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt and genuine bugs; only the float()
        # conversion failure should be handled here.
        print(nextnum, 'is not numeric')

nmin, nmax, navg, nsum = analyse(numlist)
print(nmin, nmax, navg, nsum)
def main():
    """Train several classifiers, merge them with an on-top (stacked) SVM,
    report AMS scores on the validation set, and build a submission file.

    :return: the submission object from submission.print_submission.

    NOTE(review): relies on module-level imports not visible here
    (time, tokenizer, preTreatment, analyse, onTopClassifier,
    postTreatment, submission, hbc, np). Python 2.
    """
    ###############
    ### IMPORT ####
    ###############
    # Importation parameters:
    split= True
    normalize = True
    noise_var = 0.
    # NOTE(review): ratio_train is unused -- extract_data below is called
    # with explicit train/valid sizes instead.
    ratio_train = 0.9
    # Import the training data:
    print("Extracting the data sets...")
    start = time.clock()
    train_s, valid_s, test_s = tokenizer.extract_data(split = split,
                                    normalize = normalize,
                                    noise_variance = 0.,
                                    #n_classes = "multiclass",
                                    n_classes = "binary",
                                    train_size = 200000,
                                    train_size2 = 0,
                                    valid_size = 50000)
    stop = time.clock()
    print ("Extraction time: %i s") %(stop-start)
    print train_s[4]
    print(" ")
    print(" ")
    ######################
    ### PRE-TREATMENT ####
    ######################
    print("------------------------- Pre-treatment --------------------------")
    ### Average number of signal per subset:
    print("Train subsets signal average:")
    train_s_average = preTreatment.ratio_sig_per_dataset(train_s[2])
    print(" ")
    print("Valid subsets signal average:")
    valid_s_average = preTreatment.ratio_sig_per_dataset(valid_s[2])
    print(" ")
    print(" ")
    ############
    # ANALYSES #
    ############
    # Dictionary that will contain all the data for each method. In the end
    # we'll have a dict of dict.
    # Keys of the methods : {naiveBayes, svm, kNeighbors, lda, qda, adaBoost,
    # randomForest}
    dMethods ={}
    # NAIVE BAYES:
    kwargs_bayes = {}
    dMethods['naiveBayes'] = analyse.analyse(train_s, valid_s, 'naiveBayes',
                                             kwargs_bayes)
    # SVM (disabled)
    """
    kwargs_svm ={}
    dMethods['svm'] = analyse.analyse(train_s, valid_s,'svm', kwargs_svm)
    """
    # K NEIGHBORS
    kwargs_kn = {'n_neighbors':50}
    dMethods['kNeighbors'] = analyse.analyse(train_s, valid_s, 'kNeighbors',
                                             kwargs_kn)
    # LDA
    kwargs_lda = {}
    dMethods['lda'] = analyse.analyse(train_s, valid_s, 'lda', kwargs_lda)
    # QDA
    kwargs_qda= {}
    dMethods['qda'] = analyse.analyse(train_s, valid_s, 'qda', kwargs_qda)
    # ADABOOST
    # NOTE(review): 'base_estimators' looks misspelled -- scikit-learn's
    # AdaBoost parameter is 'base_estimator'; confirm what analyse expects.
    kwargs_ada= { 'base_estimators': None,
                  'n_estimators': 50,
                  'learning_rate': 1.,
                  'algorithm': 'SAMME.R',
                  'random_state':None}
    dMethods['adaBoost'] = analyse.analyse(train_s, valid_s, 'adaBoost',
                                           kwargs_ada)
    # RANDOM FOREST:
    kwargs_rdf= {'n_trees': 10}
    dMethods['randomForest'] = analyse.analyse(train_s, valid_s,
                                               'randomForest', kwargs_rdf)
    # RANDOM FOREST 2:
    kwargs_rdf= {'n_trees': 100}
    dMethods['randomForest2'] = analyse.analyse(train_s, valid_s,
                                                'randomForest', kwargs_rdf)
    # ADABOOST2
    kwargs_ada= { 'base_estimators': None,
                  'n_estimators': 100,
                  'learning_rate': .5,
                  'algorithm': 'SAMME.R',
                  'random_state':None}
    dMethods['adaBoost2'] = analyse.analyse(train_s, valid_s, 'adaBoost',
                                            kwargs_ada)
    print(" ")
    ##################
    # POST-TREATMENT #
    ##################
    print("------------------------ Merged predictor -----------------------")
    #ignore = ['randomForest2', 'randomForest']
    ignore = []
    # Stacked ("on top") classifier combining all base methods.
    final_prediction_s, dSl = onTopClassifier.SL_classification(dMethods,
                                    valid_s, train_s, method='svm',
                                    ignore = ignore)
    # Transform the probabilities in rank:
    #final_pred = postTreatment.rank_signals(final_pred)
    # Trunk the vectors
    # NOTE(review): yPredicted_treshold_s is recomputed per method but never
    # used afterwards; after this loop yProba_s/yPredicted_s hold whichever
    # method iterated last (dict order).
    for method in dMethods:
        yProba_s = dMethods[str(method)]['yProba_s']
        yPredicted_s = dMethods[str(method)]['yPredicted_s']
        yPredicted_treshold_s = postTreatment.proba_treshold(yPredicted_s,
                                                             yProba_s, 0.5)
    # Numerical score:
    if type(yPredicted_s) == list:
        for i in range(len(yPredicted_s)):
            sum_s, sum_b = submission.get_numerical_score(yPredicted_s[i],
                                                          valid_s[2][i])
            print "Subset %i: %i elements - sum_s[%i] = %i - sum_b[%i] = %i" \
                    %(i, yPredicted_s[i].shape[0], i, sum_s, i, sum_b)
    # Get s and b for each group (s_s, b_s) and the final final_s and
    # final_b:
    final_s, final_b, s_s, b_s = submission.get_s_b_8(yPredicted_s,
                                                      valid_s[2],
                                                      valid_s[3])
    # Balance the s and b (Python 2 integer division: exactly 10).
    final_s *= 250000/25000
    final_b *= 250000/25000
    # AMS final:
    AMS = hbc.AMS(final_s , final_b)
    print ("Expected AMS score for randomforest : %f") %AMS
    #AMS by group
    AMS_s = []
    for i, (s,b) in enumerate(zip(s_s, b_s)):
        s *= 250000/yPredicted_s[i].shape[0]
        b *= 250000/yPredicted_s[i].shape[0]
        score = hbc.AMS(s,b)
        AMS_s.append(score)
        print("Expected AMS score for randomforest : for group %i is : %f" %(i, score))
    print(" ")
    ##############
    # SUBMISSION #
    ##############
    print("-------------------------- Submission ---------------------------")
    # Prediction on the test set:
    # method used for the submission
    # TODO : check that the method name has the right form
    # (build a list of valid method names)
    #method = "randomForest"
    #test_prediction_s, test_proba_s = eval(method).get_test_prediction(
    #                                       dMethods[method]['predictor_s'],
    #                                       test_s[1])
    test_prediction_s, test_proba_s = onTopClassifier.get_SL_test_prediction(
                                                dMethods, dSl, test_s[1])
    print("Test subsets signal average:")
    test_s_average = preTreatment.ratio_sig_per_dataset(test_prediction_s)
    print(" ")
    #RankOrder = np.arange(1,550001)
    if type(test_prediction_s) == list:
        test_prediction_s = np.concatenate(test_prediction_s)
        test_proba_s = np.concatenate(test_proba_s)
        RankOrder = onTopClassifier.rank_signals(test_proba_s)
        ID = np.concatenate(test_s[0])
    else:
        ID = test_s[0]
    # Create a submission file:
    sub = submission.print_submission(ID, RankOrder , test_prediction_s)
    return sub
def insert_null_events(timeout=60): import time from Queue import Empty while pool.call_queue.unfinished_tasks > 0: try: yield pool.returns.get(timeout=timeout) except Empty: yield (time.time(),) out = file('live.log', 'a') lastactive = time.time() lastcheck = time.time() print 'STARTUP' startup() for k, v in analyse(insert_null_events(timeout=10)): # store out.write("%s: %s\n" % (k, v)) out.flush() print k, v # put logic here: # if currently inactive period -- check emails every 10 minutes # so that we are up to date when he comes back # also check facebook inactive = lastactive - time.time() > 10 * 60 and lastcheck - time.time() > 10 * 60 if (inactive and v) or 'switchtabs' in v: # good time to interrupt! print 'DECIDING to interrupt user' interrupt_user()
#encoding: UTF-8 import sys,os from hashlib import sha256 import json import analyse import process import remove if not os.path.exists('config.py'): sys.exit(1) result = '' init_path = sys.argv[1] if len(sys.argv) > 1 else '' if not init_path == '' or os.path.exists(init_path): result = analyse.analyse(init_path) processed = process.process( result ) remove.remove( result ,True) else: print 'i need a valid path' sys.exit(1)
import argparse


def parse_args():
    """Build the CLI and parse the current command line.

    Exactly one of --build (rebuild the pubmed vector index) or --find
    (keyword search against the index) is expected.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--build", action='store_true',
                     help="build pubmed articles vector index",
                     required=False)
    cli.add_argument("--find", nargs="+",
                     help="find article by keywords in index",
                     required=False)
    return cli.parse_args()


if __name__ == "__main__":
    args = parse_args()
    if args.find and args.build:
        # The two commands are mutually exclusive.
        print("Only one command can be passed")
    elif args.find:
        from analyse import worker as analyse
        analyse(" ".join(args.find))
    elif args.build:
        from build import worker as build
        build()
    exit(0)
def main():
    """Run the full Higgs-challenge pipeline: data extraction, per-group
    training of several classifiers, an 'on-top' (stacking) classifier,
    and AMS scoring under several thresholding strategies.

    Returns 0; the submission-file generation at the end is commented out.

    NOTE(review): depends on module-level imports defined outside this
    block (time, np, tokenizer, preTreatment, analyse, onTopClassifier,
    tresholding, submission, hbc) -- confirm against the file header.
    """
    ###############
    ### IMPORT ####
    ###############
    # Importation parameters:
    split = True
    normalize = True
    noise_var = 0.
    n_classes = "binary"
    train_size = 200000
    train_size2 = 25000
    valid_size = 25000

    # Import the training data:
    print("Extracting the data sets...")
    start = time.clock()
    train_s, train2_s, valid_s, test_s = tokenizer.extract_data(
                                            split= split,
                                            normalize= normalize,
                                            noise_variance= noise_var,
                                            n_classes = n_classes,
                                            train_size = train_size,
                                            train_size2 = train_size2,
                                            valid_size = valid_size)

    # Re-merge the labels (y) and weights of the validation set when it
    # comes back split into sub-groups:
    if type(valid_s[2]) == list:
        yValid_conca = preTreatment.concatenate_vectors(valid_s[2])
        weights_conca = preTreatment.concatenate_vectors(valid_s[3])

    stop = time.clock()
    print ("Extraction time: %i s") %(stop-start)
    print(" ")
    print(" ")

    ######################
    ### PRE-TREATMENT ####
    ######################
    print("------------------------- Pre-treatment --------------------------")
    ### Average number of signal per subset:
    print("Train subsets signal average:")
    train_s_average = preTreatment.ratio_sig_per_dataset(train_s[2])
    print(" ")
    print("Valid subsets signal average:")
    valid_s_average = preTreatment.ratio_sig_per_dataset(valid_s[2])
    print(" ")
    print(" ")

    ############
    # ANALYSES #
    ############
    # Dictionary that will contain all the data for each method; in the
    # end we'll have a dict of dicts.
    # Keys of the methods : {naiveBayes, svm, kNeighbors, lda, qda,
    # adaBoost, randomForest}
    dMethods = {}

    # NAIVE BAYES:
    kwargs_bayes = {}
    dMethods['naiveBayes'] = analyse.analyse(train_s= train_s,
                                             train2_s= train2_s,
                                             valid_s= valid_s,
                                             method_name = 'naiveBayes',
                                             kwargs = kwargs_bayes)
    # SVM (disabled)
    """
    kwargs_svm ={}
    dMethods['svm'] = analyse.analyse(train_s, valid_s,'svm', kwargs_svm)
    """
    # K NEIGHBORS (disabled)
    """
    kwargs_kn = {'n_neighbors':50}
    dMethods['kNeighbors'] = analyse.analyse(train_s, valid_s, 'kNeighbors', kwargs_kn)
    """
    # LDA
    kwargs_lda = {}
    dMethods['lda'] = analyse.analyse(train_s= train_s,
                                      train2_s= train2_s,
                                      valid_s= valid_s,
                                      method_name = 'lda',
                                      kwargs = kwargs_lda)
    # QDA
    kwargs_qda= {}
    dMethods['qda'] = analyse.analyse(train_s= train_s,
                                      train2_s= train2_s,
                                      valid_s= valid_s,
                                      method_name = 'qda',
                                      kwargs = kwargs_qda)
    # ADABOOST (disabled)
    """
    kwargs_ada= { 'n_estimators': 50,
                  'learning_rate': 1.,
                  'algorithm': 'SAMME.R',
                  'random_state':None}
    dMethods['adaBoost'] = analyse.analyse(train_s, valid_s, 'adaBoost', kwargs_ada)
    """
    # RANDOM FOREST:
    kwargs_randomForest= {'n_estimators': 10}
    dMethods['randomForest'] = analyse.analyse(train_s= train_s,
                                               train2_s= train2_s,
                                               valid_s= valid_s,
                                               method_name = 'randomForest',
                                               kwargs = kwargs_randomForest)
    # RANDOM FOREST 2:
    kwargs_randomForest= {'n_estimators': 100}
    dMethods['randomForest2'] = analyse.analyse(train_s= train_s,
                                                train2_s= train2_s,
                                                valid_s= valid_s,
                                                method_name = 'randomForest',
                                                kwargs = kwargs_randomForest)
    # ADABOOST2 / RANDOM FOREST 3-5 / GRADIENT BOOSTING (all disabled)
    """
    # ADABOOST2
    kwargs_ada= { 'n_estimators': 100,
                  'learning_rate': .5,
                  'algorithm': 'SAMME.R',
                  'random_state':None}
    dMethods['adaBoost2'] = analyse.analyse(train_s, valid_s, 'adaBoost', kwargs_ada)

    # RANDOM FOREST 3:
    kwargs_randomForest= {'n_estimators': 100}
    dMethods['randomForest3'] = analyse.analyse(train_s= train_s, train2_s= train2_s, valid_s= valid_s, method_name = 'randomForest', kwargs = kwargs_randomForest)

    # RANDOM FOREST 4:
    kwargs_randomForest= {'n_estimators': 100}
    dMethods['randomForest4'] = analyse.analyse(train_s= train_s, train2_s= train2_s, valid_s= valid_s, method_name = 'randomForest', kwargs = kwargs_randomForest)

    # RANDOM FOREST 5:
    kwargs_randomForest= {'n_estimators': 100}
    dMethods['randomForest5'] = analyse.analyse(train_s= train_s, train2_s= train2_s, valid_s= valid_s, method_name = 'randomForest', kwargs = kwargs_randomForest)

    # GRADIENT BOOSTING:
    kwargs_gradB = {'loss': 'deviance', 'learning_rate': 0.1,
                    'n_estimators': 100, 'subsample': 1.0,
                    'min_samples_split': 2, 'min_samples_leaf': 200,
                    'max_depth': 10, 'init': None, 'random_state': None,
                    'max_features': None, 'verbose': 0}
    dMethods['gradientBoosting'] = analyse.analyse(train_s, valid_s, 'gradientBoosting', kwargs_gradB)
    """
    print(" ")

    ##################
    # POST-TREATMENT #
    ##################
    print("------------------------ Feaure importance: -----------------------")
    # Report random-forest feature importances: per subset when the
    # predictors come back as a list (split mode), once otherwise.
    if type(dMethods['randomForest2']['predictor_s']) == list:
        for i, predictor_s in enumerate(dMethods['randomForest2']['predictor_s']):
            print "Subset %i:" %i
            print predictor_s.feature_importances_
    else:
        print "Dataset: "
        print dMethods['randomForest2']['predictor_s'].feature_importances_

    print("------------------------ On-top predictor -----------------------")
    # Classifiers to be ignored by the stacking step:
    #ignore = ['randomForest2', 'randomForest']
    ignore = []
    clf_onTop = 'randomForest'
    # Example SVC parameter set kept for reference:
    parameters = {}#{'C': 0.5, 'kernel': 'rbf', 'degree': 3, 'gamma': 0.0,
                  # 'coef0': 0.0, 'shrinking':True, 'probability':True,
                  # 'tol': 0.001, 'cache_size': 200, 'class_weight': None}
    print ("We will use an 'on-top' predictor on %i classifiers using a %s.") \
            %(len(dMethods.keys())-len(ignore), clf_onTop)

    final_prediction_s, dOnTop = onTopClassifier.SL_classification(
                                        dMethods, valid_s, train_s,
                                        ignore = ignore,
                                        method= clf_onTop,
                                        parameters= parameters)

    print("-------------------------- Tresholding -------------------------")
    ### ON THE 'ON-TOP' CLASSIFIER:
    # Create the elected vectors for each of the 8 groups (best AMS score):
    OT_best_yPredicted_s = [np.zeros(valid_s[2][i].shape[0]) for i in range(8)]
    OT_best_yProba_s = [np.zeros(valid_s[2][i].shape[0]) for i in range(8)]
    OT_best_AMS_s = [0. for i in range(8)]
    OT_best_method_s = [0 for i in range(8)]
    OT_best_ratio_s = [0 for i in range(8)]
    OT_best_sum_s_s = [0 for i in range(8)]
    OT_best_sum_b_s = [0 for i in range(8)]
    OT_best_method = "On-top"

    OT_yProba_s = dOnTop['yProba_s']
    OT_yPredicted_s = dOnTop['yPredicted_s']

    # Concatenate the per-group vectors:
    OT_yProba_conca = preTreatment.concatenate_vectors(OT_yProba_s)
    OT_yPredicted_conca = preTreatment.concatenate_vectors(OT_yPredicted_s)

    # Best global threshold for the on-top classifier:
    OT_best_ratio = tresholding.best_treshold(OT_yProba_conca, yValid_conca, weights_conca)
    OT_yPredicted_treshold = tresholding.get_yPredicted_treshold(OT_yProba_conca, OT_best_ratio)

    OT_s, OT_b = submission.get_s_b(OT_yPredicted_treshold, yValid_conca, weights_conca)
    # Scale s and b from the validation fraction up to the full data set:
    OT_s *= 10
    OT_b *= 10
    OT_best_AMS = hbc.AMS(OT_s,OT_b)

    # COMPARISON BEST TRESHOLD IN DMETHOD
    # FOR EACH METHOD:
    best_yPredicted_s = [np.zeros(valid_s[2][i].shape[0]) for i in range(8)]
    best_yProba_s = [np.zeros(valid_s[2][i].shape[0]) for i in range(8)]
    best_AMS_s = [0. for i in range(8)]
    best_method_s = [0 for i in range(8)]
    best_ratio_s = [0 for i in range(8)]
    best_AMS_1_method = 0.
    best_method = "methode"
    best_ratio = "0."

    for method in dMethods:
        yProba_s = dMethods[method]['yProba_s']
        yPredicted_s = dMethods[method]['yPredicted_s']
        # Concatenate the per-group vectors:
        yProba_conca = preTreatment.concatenate_vectors(yProba_s)
        yPredicted_conca = preTreatment.concatenate_vectors(yPredicted_s)
        # Best global threshold for this method:
        best_treshold = tresholding.best_treshold(yProba_conca, yValid_conca, weights_conca)
        yPredicted_treshold = tresholding.get_yPredicted_treshold(yProba_conca, best_treshold)
        s, b = submission.get_s_b(yPredicted_treshold, yValid_conca, weights_conca)
        s *= 10
        b *= 10
        ams = hbc.AMS(s,b)
        # Keep the single best-performing method overall:
        if ams > best_AMS_1_method:
            best_AMS_1_method = ams
            best_method = str(method)
            best_ratio = best_treshold

    # Concatenate the 8 per-group winning vectors and threshold the result.
    # NOTE(review): best_yPredicted_s is still all-zeros at this point --
    # the code meant to fill it lives in the commented-out block below.
    best_yPredicted_conca = preTreatment.concatenate_vectors(best_yPredicted_s)
    best_treshold_conca = tresholding.best_treshold(best_yPredicted_conca, yValid_conca, weights_conca)
    best_yPredicted_conca_treshold = tresholding.get_yPredicted_treshold(best_yPredicted_conca, best_treshold_conca)

    best_final_s, best_final_b, best_s_s, best_b_s = submission.get_s_b_8(best_yPredicted_s, valid_s[2], valid_s[3])
    best_s_treshold, best_b_treshold = submission.get_s_b(best_yPredicted_conca_treshold, yValid_conca, weights_conca)

    best_final_s *= 10
    best_final_b *= 10
    best_s_treshold *= 10
    best_b_treshold *= 10
    best_AMS = hbc.AMS(best_final_s, best_final_b)
    best_AMS_treshold = hbc.AMS(best_s_treshold, best_b_treshold)

    print "Best AMS using one of the methods : %f" %best_AMS_1_method
    print " method : %s" %(str(method))
    print " ratio : %f" %(best_ratio)
    print " "
    print "Best AMS concatenate: %f" %best_AMS
    print "Best AMS concatenate after final tresholding : %f" %best_AMS_treshold
    print "best ratio on the concatenated vector : %f" %best_treshold_conca
    print " "
    print "Best AMS on-top : %f" %OT_best_AMS
    print "Best ratio on the concatenated vector : %f" %OT_best_ratio
    print " "

    # Per-group thresholding of the on-top classifier (disabled):
    """
    # Best treshold group by group
    for i in range(8):
        OT_best_treshold_s = tresholding.best_treshold(OT_yProba_s[i], valid_s[2][i], valid_s[3][i])
        OT_yPredicted_s[i] = tresholding.get_yPredicted_treshold(OT_yProba_s[i], OT_best_treshold_s)
        s, b = submission.get_s_b(OT_yPredicted_s[i], valid_s[2][i], valid_s[3][i])
        s *= 250000/yPredicted_s[i].shape[0]
        b *= 250000/yPredicted_s[i].shape[0]
        ams = hbc.AMS(s,b)
        if ams > best_AMS_s[i]:
            best_yPredicted_s[i] = yPredicted_s[i]
            best_yProba_s[i] = yProba_s[i]
            best_AMS_s[i] = ams
            best_method_s[i] = dOnTop['method']
            best_ratio_s[i] = best_treshold
            best_sum_s_s[i] = s
            best_sum_b_s[i] = b

    for n in range(8):
        print "Best AMS group %i: %f - method %s - ratio %f" \
                %(n, best_AMS_s[n], best_method_s[n], best_ratio_s[n])
    print "Best AMS : %f" %best_AMS_1_method
    print "     ratio : %f" %(best_ratio)
    print " "
    """

    # Submission-file generation (disabled):
    """
    ##############
    # SUBMISSION #
    ##############
    print("-------------------------- Submission ---------------------------")

    # Prediction on the test set:
    # method used for the submission
    # TODO : Verifier que le nom de la method a bien la bonne forme(
    # creer une liste de noms de methodes)
    #method = "randomForest"
    #test_prediction_s, test_proba_s = eval(method).get_test_prediction(
    #                                            dMethods[method]['predictor_s'],
    #                                            test_s[1])
    test_prediction_s, test_proba_s = onTopClassifier.get_SL_test_prediction(
                                                dMethods, dOnTop, test_s[1])

    print("Test subsets signal average:")
    test_s_average = preTreatment.ratio_sig_per_dataset(test_prediction_s)
    print(" ")

    #RankOrder = np.arange(1,550001)
    if type(test_prediction_s) == list:
        test_prediction_s = np.concatenate(test_prediction_s)
        test_proba_s = np.concatenate(test_proba_s)
        RankOrder = onTopClassifier.rank_signals(test_proba_s)
        ID = np.concatenate(test_s[0])
    else:
        ID = test_s[0]

    # Create a submission file:
    sub = submission.print_submission(ID, RankOrder , test_prediction_s)
    return sub
    """
    return 0
def _addalyse(solr_server, username, since_id=0, remake_profile=True, update_count=1):
    """Analyse a Twitter user's tweets and store or update their keyword
    profile in Solr.

    :param solr_server: Solr server address, or an existing StorageHandler.
    :param username: Twitter user to profile (canonicalized below).
    :param since_id: on incremental updates, only fetch tweets newer than
        this status id.
    :param remake_profile: force a full rebuild; also forced when the user
        is not yet in Solr.
    :param update_count: passed through to sh.add_profile (update counter).
    :raises AddalyseUserNotOnTwitterError: the user cannot be found.
    :raises AddalyseUnableToProcureTweetsError: no tweets were retrieved;
        its `remake_profile` attribute records which branch failed.
    :returns: True once the profile has been written.
    """
    th = TwitterHelp()
    # does not use a Twitter API call
    if not th.twitter_contains(username):
        raise AddalyseUserNotOnTwitterError("Couldn't find any trace of '" + username + "'")
    # canonicalize the name like a bawz (in the future, though,
    # th.twitter_contains(sdf) might just return this canonical stuffs)
    username = th.get_screen_name(username)
    # solr_server can now optionally be a StorageHandler object
    sh = solr_server if isinstance(solr_server, StorageHandler) else StorageHandler(solr_server)
    # remake if not in Solr
    remake_profile = remake_profile or not sh.contains(username)
    if remake_profile:
        # get all tweeets from Twitter API
        tweets = th.get_all_statuses(username)
        if not tweets:
            e = AddalyseUnableToProcureTweetsError("I couldn't for the love of me extract some tweets for '" + username + "'. Maybe they just doesn't have any?")
            e.remake_profile = True
            raise e
        # latest tweet is first in lists
        new_since_id = tweets[0].id  # assumes that the
        # send to analysis
        print "addalyse(remake_profile=" + str(remake_profile) + "): analyzing, '" + username + "'"
        (lovekeywords, hatekeywords) = filter_analysis(analyse(map(lambda x: x.GetText(), tweets)))
        # store result in sunburnt
        print "addalyse(remake_profile=" + str(remake_profile) + "): adding, '" + username + "'"
        sh.add_profile(username, lovekeywords, hatekeywords, new_since_id, update_count)
        print "addalyse(remake_profile=" + str(remake_profile) + "): done"
    else:
        # get all tweets since since_id
        tweets = th.get_all_statuses(username, since_id)
        if not tweets:
            e = AddalyseUnableToProcureTweetsError("I couldn't for the love of me extract some tweets for '" + username + "'. Maybe they just doesn't have any new ones?")
            e.remake_profile = False
            raise e
        new_since_id = tweets[0].id
        # MERGING
        # send to analysis
        print "addalyse(remake_profile=" + str(remake_profile) + "): analyzing, '" + username + "'"
        # Don't filter the new analysis just yet, merge it first!
        (lovekeywords, hatekeywords) = analyse(map(lambda x: x.GetText(), tweets))
        # get a users old hatekeywords_list and lovekeywords_list
        doc = sh.get_user_documents(username, 'lovekeywords_list', 'hatekeywords_list')[0]
        (lovekeywords_old, hatekeywords_old) = (doc.lovekeywords_pylist, doc.hatekeywords_pylist)
        # merge tuples. Also now that we are done mergeing we can start
        # looking for keywords with a too low weight
        (lovemerge, hatemerge) = filter_analysis((merge_keywords(lovekeywords, lovekeywords_old), merge_keywords(hatekeywords, hatekeywords_old)))
        # add merged result to database
        print "addalyse(remake_profile=" + str(remake_profile) + "): adding, '" + username + "'"
        sh.add_profile(username, lovemerge, hatemerge, new_since_id, update_count)
        print "addalyse(remake_profile=" + str(remake_profile) + "): done"
    # returns true if added to database
    return True  #TODO: should this return True?
# NOTE(review): mid-script fragment -- xsTrain_s, xsValid_s, xsTest_s,
# L_delete, train_s and valid_s are defined earlier, outside this view.
print "Deleting the column..."
# Drop the columns listed in L_delete from each of the 8 event groups.
# NOTE(review): deleting columns one at a time shifts the indices of the
# remaining columns, so this is only correct if L_delete is ordered to
# account for that (e.g. descending) -- confirm against its definition.
for i in range(8):
    for index_column in L_delete:
        # axis=1: remove a feature column, not a sample row.
        xsTrain_s[i] = np.delete(xsTrain_s[i], np.s_[index_column], 1)
        xsValid_s[i] = np.delete(xsValid_s[i], np.s_[index_column], 1)
        xsTest_s[i] = np.delete(xsTest_s[i], np.s_[index_column], 1)

print "Training each groups"

# One entry per classifier; each value is the result dict returned by
# analyse.analyse for that method.
dMethods = {}

# NAIVE BAYES:
kwargs_bayes = {}
dMethods['naiveBayes'] = analyse.analyse(train_s, valid_s, 'naiveBayes', kwargs_bayes)

# SVM (disabled)
"""
kwargs_svm ={}
dMethods['svm'] = analyse.analyse(train_s, valid_s,'svm', kwargs_svm)
"""

# K NEIGHBORS
kwargs_kn = {'n_neighbors': 50}
dMethods['kNeighbors'] = analyse.analyse(train_s, valid_s, 'kNeighbors', kwargs_kn)

# LDA
kwargs_lda = {}
dMethods['lda'] = analyse.analyse(train_s, valid_s, 'lda', kwargs_lda)

# QDA
kwargs_qda = {}
dMethods['qda'] = analyse.analyse(train_s, valid_s, 'qda', kwargs_qda)

# ADABOOST
def analysis(sentence):
    """Thin wrapper: delegate sentence analysis to the module-level
    `analyse` callable and return its result unchanged."""
    outcome = analyse(sentence)
    return outcome