def plot_spectrograms(data, config):
    """
    Plot 4 random spectrograms together with their preprocessed variants.

    data (np.array): the processed data cube loaded in train.py
    config (dict):   configuration with 'n_frequencies' and 'n_time_steps'
    """
    fig, axs = plt.subplots(4, 7, figsize=(10, 10))

    # Get all the correct cubes
    p = preprocessor(data)
    p.get_magnitude_and_phase()
    mag, phase = p.get_processed_cube()[..., 0:1], p.get_processed_cube()[..., 1:2]

    p = preprocessor(data)
    p.interp(config['n_frequencies'], config['n_time_steps'])
    p.get_magnitude_and_phase()
    mag_interp, phase_interp = p.get_processed_cube()[..., 0:1], p.get_processed_cube()[..., 1:2]

    p = preprocessor(data)
    p.interp(config['n_frequencies'], config['n_time_steps'])
    p.get_magnitude()
    p.median_threshold()
    p.minmax(per_baseline=True, feature_range=(np.min(phase_interp), np.max(phase_interp)))
    mag_interp_thresh = p.get_processed_cube()

    for i in range(4):
        r = np.random.randint(0, data.shape[0])
        panels = [(data[r, ..., 0], 'Real Component'),
                  (data[r, ..., 1], 'Imaginary Component'),
                  (mag[r, ..., 0], 'Magnitude Component'),
                  (phase[r, ..., 0], 'Phase Component'),
                  (mag_interp[r, ..., 0], 'Magnitude component interpolated'),
                  (phase_interp[r, ..., 0], 'Phase component interpolated'),
                  (mag_interp_thresh[r, ..., 0], 'Magnitude component interpolated and thresholded')]
        for j, (img, title) in enumerate(panels):
            im = axs[i, j].imshow(img)
            axs[i, j].title.set_text(title)
            plt.colorbar(im, ax=axs[i, j])

    return plt
def data_generator(num_files):
    first_flag = False
    ms_files = get_files(filter='None')
    for i in tqdm(range(0, num_files)):
        c = next(ms_files)
        cubes = get_cube(c)
        p = preprocessor.preprocessor(cubes)
        p.interp(32, 128)
        if not first_flag:
            output = p.get_processed_cube()
            first_flag = True
        else:
            output = np.concatenate((output, p.get_processed_cube()), axis=0)

    if not os.path.exists('datasets'):
        os.mkdir('datasets')

    info = {'Description': 'LOFAR training set',
            'Features': 'Unlabelled',
            'Dimensions': (32, 128),
            'Source': 'LOFAR MS'}
    f_name = 'datasets/LOFAR_dataset_{}.pkl'.format(
        datetime.datetime.now().strftime("%d-%m-%Y"))
    pickle.dump([output, np.zeros([1, 1, 1, 1]), np.zeros([1, 1, 1, 1]),
                 np.zeros([1, 1, 1, 1]), info],
                open(f_name, 'wb'), protocol=4)
    print('{} Saved!'.format(f_name))
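# Hedged usage sketch (not from the original module): how a dataset written by
# data_generator() above could be read back. The five-element pickle layout
# [data, zeros, zeros, zeros, info] mirrors the pickle.dump() call; the file
# name below is only an illustrative placeholder.
import pickle

with open('datasets/LOFAR_dataset_01-01-2020.pkl', 'rb') as f:
    train_x, _, _, _, info = pickle.load(f)
print(info['Description'], train_x.shape)  # e.g. 'LOFAR training set' (N, 32, 128, C)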
def test_empty_lines_are_removed(self):
    grammar = """

    """
    self.assertEqual("\n" + preprocessor(grammar),
                     "" + self.permanent_suffix)
def evaluateSavedModel():
    # Choose saved model to evaluate
    model = load_model("saved/Piczak_CNN_pretrainsingle_trainmulti.h5")

    # Set up preprocessor for loading extracted features
    pp = preprocessor(parent_dir='../UrbanSound8K/audio')

    # Extracted features that should be loaded to calculate mean and std values
    train_dirs = [
        "audio_overlap/folder1_overlap", "audio_overlap/folder3_overlap",
        "audio_overlap/folder4_overlap", "audio_overlap/folder5_overlap",
        "audio_overlap/folder6_overlap", "audio_overlap/folder7_overlap",
        "audio_overlap/folder8_overlap", "audio_overlap/folder9_overlap",
        "audio_overlap/folder10_overlap"
    ]
    # pp.data_prep(train_dirs=[], test_fold="fold2",
    #              load_path="../UrbanSound8K/audio/extracted_short_60/")

    # Load features
    test_folder = "fold2"
    pp.load_extracted_fts_lbs(train_dirs=train_dirs, test_fold=test_folder)

    tb = TensorBoard(
        log_dir='./TensorBoard/piczak_CNN_singlelabel_pretrain_continue_multilabel')
    # model.fit(pp.train_x, pp.train_y, validation_split=.1, epochs=25,
    #           batch_size=256, verbose=2, callbacks=[tb])

    print("model evaluation")
    scores = model.evaluate(pp.test_x, pp.test_y, verbose=2)
    print("loss: {0}, test-acc: {1}".format(scores[0], scores[1]))

    # Make predictions
    preds = model.predict(pp.test_x)

    # Evaluate predictions
    evaluateModel(pp, preds, test_folder)
def test_parent_hierarchy(self):
    semtypes = {"#foo": 1}
    grammar = "FOO: BAR"
    self.assertEqual(
        preprocessor(grammar, semtypes),
        """FOO: "#foo" | (BAR)
eps_foo: FOO | empty_foo
empty_foo:
""" + self.permanent_suffix)
def explain_timestep_distribution(self):
    """
    Goal: Display info about the distribution of timesteps
    """
    from scipy.stats import norm
    from scipy.stats import mode

    if not os.path.isfile(self.DATA_FILE_PATH):
        p = preprocessor.preprocessor()
        p.preprocess(start_anew=True, quick_validation=True, display_epochs=False)

    data = h5py.File(self.DATA_FILE_PATH, 'r')
    timesteps = []
    for key in data.keys():
        timestep = data[key]['eeg'].shape[0]
        timesteps.append(float(timestep))

    # code adapted from
    # http://stackoverflow.com/questions/20011122/fitting-a-normal-distribution-to-1d-data
    # Fit a normal distribution to the data:
    mu, std = norm.fit(timesteps)
    print("{} examples. Shortest # samples is {}. Longest # samples is {}. "
          "Mean is {:.2f}. Mode is {}. Standard Deviation is {:.2f}.".format(
              len(timesteps), min(timesteps), max(timesteps), mu,
              mode(timesteps)[0], std))

    # 'normed' was removed in newer Matplotlib releases; 'density' is the equivalent.
    plt.hist(timesteps, bins=100, density=True, alpha=0.6, color='g')

    # # Plot the PDF.
    # xmin, xmax = plt.xlim()
    # x = np.linspace(xmin, xmax, 100)
    # p = norm.pdf(x, mu, std)
    # plt.plot(x, p, 'k', linewidth=2)

    # sampling frequency -> 1100 samples per second * x seconds = 2500 samples, solve for x
    title = "Fit results (# samples at 1100 Hz sfreq): mu = %.2f, std = %.2f" % (mu, std)
    plt.title(title)
    plt.show()
    data.close()
def parse(self, string):
    global err, syn_error
    string2 = preprocessor.preprocessor(string)
    if string != string2:
        print("Program code after preprocessing: ", string2)
        string = string2
    from preprocessor import err as err
    from preprocessor import syn_error as syn_error
    # print(string)
    lexer = MU0Parser.__Lexer()
    self.parser.parse(string, lexer=lexer.lexer)
    # print(syn_error, err)
    if MU0Parser.s.length() == 0 and MU0Parser.brstack.length() == 0 \
            and MU0Parser.contstack.length() == 0 and MU0Parser.forstack.length() == 0 \
            and syn_error == False:
        err = ''
        err = err + "No syntax errors detected."
        print("No syntax errors detected.")
    else:
        if MU0Parser.s.length() == 0:
            print("Syntax error detected - continue/break used outside loop")
        else:
            print("Syntax error detected.")
    ins = MU0Parser.ins
    data = MU0Parser.data
    self.restart()
    return ins + list(data)
def __init__(self):
    self.filename = os.path.join(os.path.expandvars("%appdata%"), "latex-access.conf")
    self.speech_translator = speech.speech()
    self.preprocessor = preprocessor.preprocessor()
    self.activateSettings()
    self.newcommands = preprocessor.newcommands(self.preprocessor)
def test_if_epsilon_nonterminal_was_added_for_terminal(self):
    grammar1 = """
    TERMINAL: "foo"
    """
    self.assertEqual(
        preprocessor(grammar1).strip(),
        'TERMINAL:("foo")\neps_terminal: TERMINAL | empty_terminal\nempty_terminal: '
        + self.permanent_suffix)

    grammar2 = """
    TERMINAL: "foo"
    TERMINAL: "bar"
    """
    self.assertEqual(
        preprocessor(grammar2).strip(),
        'TERMINAL:("foo")|("bar")\neps_terminal: TERMINAL | empty_terminal\nempty_terminal: '
        + self.permanent_suffix)
def test_if_coordination_is_added_for_single_suffix(self):
    grammar = """
    t_attr_single: t_quality* ATTR
    """
    self.assertEqual(
        preprocessor(grammar),
        "t_attr_single:(t_quality* ATTR)\nt_attr: (t_attr_single) | ((t_attr_single \",\")+ t_attr_single) | ((t_attr_single \",\")* t_attr_single COORD_A t_attr_single)"
        + self.permanent_suffix)
def test_merge_of_two_left_sides(self):
    """
    Test if the left sides are merged if they are across multiple lines
    """
    grammar = """
    sentence: foo
    sentence: bar
    """
    self.assertEqual(
        preprocessor(grammar).strip(),
        'sentence:(foo)|(bar)' + self.permanent_suffix)
def to_magnitude(complex_data):
    data = np.array([complex_data.real, complex_data.imag])
    data = np.swapaxes(np.array(data), 0, -1)
    p = preprocessor(np.expand_dims(data, axis=0))
    p.interp(32, 128)  # interpolate
    p.get_magnitude()
    # index [0, ...] drops the leading batch axis added by expand_dims above
    return p.get_processed_cube()[0, ...]
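# Hedged usage sketch (assumption, not from the repo): feeding to_magnitude() a
# single complex-valued spectrogram. The helper stacks real/imaginary parts,
# interpolates to 32x128 via the repo's preprocessor class and returns the
# magnitude plane.
import numpy as np

complex_spec = np.random.randn(64, 256) + 1j * np.random.randn(64, 256)
mag = to_magnitude(complex_spec)  # roughly (32, 128, 1), depending on preprocessor.interp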
def test_order_of_rules_is_untouched(self):
    """
    Test if the content of rule is untouched
    """
    grammar = """
    sentence: foo
    foo: bar
    """
    self.assertEqual(
        preprocessor(grammar).strip(),
        'sentence:(foo)\nfoo:(bar)' + self.permanent_suffix)
def piczac_cross_validation(epochs, load_path):
    train_dirs = []
    n_folders = 10
    for i in range(1, n_folders + 1):
        # train_dirs.append('fold{0}'.format(i))
        train_dirs.append('folder{0}_overlap'.format(i))
    print(train_dirs)

    for fold in ((10, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7),
                 (7, 8), (8, 9), (9, 10)):
        val_fold = 'folder{0}_overlap'.format(fold[0])
        test_fold = 'folder{0}_overlap'.format(fold[1])
        # val_fold = 'fold{0}'.format(fold[0])
        # test_fold = 'fold{0}'.format(fold[1])
        train_dirs.remove(val_fold)
        train_dirs.remove(test_fold)

        pp = preprocessor(parent_dir='../../data/UrbanSound8K/audio')
        pp.load_extracted_fts_lbs(train_dirs=train_dirs, val_fold=val_fold,
                                  test_fold=test_fold, load_path=load_path)

        model = piczak_CNN_multi(input_dim=pp.train_x[0].shape,
                                 output_dim=pp.train_y.shape[1])
        print("done")
        print("OPTIMIZER")
        # print(model.optimizer.lr)
        # K.set_value(model.optimizer.lr, 0.002)
        # model.optimizer.lr.set_value(0.0001)
        # model.save('Models/model1_all_p2_bn{0}.h5'.format(str(fold)))
        # model = load_model('Models/model1_all_p2{0}.h5'.format(str(fold)))
        # model = load_model('Models/model1_all_p2_bnsec_overlap_{0}.h5'.format(str(fold)))

        tb = TensorBoard(log_dir='./TensorBoard/' + 'overlap_run{0}'.format(fold[1]))
        es = EarlyStopping(patience=10, verbose=1)
        model.fit(pp.train_x, pp.train_y, validation_data=[pp.val_x, pp.val_y],
                  epochs=epochs, batch_size=1000, verbose=2, callbacks=[tb, es])
        # model.save('Models/model1_all_p2_bnsec_overlap_9010_{0}.h5'.format(str(fold)))

        preds = model.predict(pp.test_x)
        evaluateModel(pp, preds, fold)
        K.clear_session()
        train_dirs.append(val_fold)
        train_dirs.append(test_fold)
def test_merge_of_two_left_sides_with_inserted_lines(self):
    grammar = """
    sentence: foo
    // this is my comment
    foo: bar
    sentence: bar
    """
    self.assertEqual(
        preprocessor(grammar).strip(),
        'sentence:(foo)|(bar)\nfoo:(bar)' + self.permanent_suffix)
def plot_scatter(autoencoder, data):
    """
    Plot a 2D scatter plot of the embedding with the input data superimposed
    over each point.

    autoencoder (keras.Model): the autoencoder-based model
    data (np.array or list):   the preprocessed training data; when it is a
                               list, the first entry is magnitude and the
                               second is phase.
    """
    plt.rcParams['image.cmap'] = 'viridis'
    encoder, mag_phase_flag = load_encoder(autoencoder)

    if not mag_phase_flag:
        mag_data = data
        p = preprocessor(mag_data)
        it = 1
    else:
        mag_data = data
        mag_data = [mag_data[0], mag_data[1]]
        it = 2

    fig, ax = plt.subplots(1, it, figsize=(20, 10))
    for i in range(it):
        p = preprocessor(mag_data[i])
        embeddings, _, _ = encoder.predict(mag_data)
        p.interp(20, 20)
        _data = p.get_processed_cube()
        for x, y, image_path in zip(embeddings[:, 0], embeddings[:, 1], _data[..., 0]):
            imscatter(x, y, image_path, zoom=0.7, ax=ax[i])
        if it == 1:
            ax = [ax]  # a hack to deal with single index
        ax[i].title.set_text(titles[i])
        ax[i].grid()
        ax[i].set_xlim([-6, 6])
        ax[i].set_ylim([-6, 6])

    plt.suptitle('Scatter Plot of Embedding with Inputs Overlayed')
    plt.savefig('/tmp/temp.png', dpi=600)
    img = mpimg.imread('/tmp/temp.png')
    fig = plt.figure(figsize=(10, 10))
    plt.imshow(img)
    plt.axis('off')
    return plt
def num_examples(self):
    """
    Goal: Return the number of examples in the data file
    """
    if not os.path.isfile(self.DATA_FILE_PATH):
        p = preprocessor.preprocessor()
        p.preprocess(start_anew=True, quick_validation=True, display_epochs=False)
    data = h5py.File(self.DATA_FILE_PATH, 'r')
    num_examples = len(list(data.keys()))
    data.close()
    return num_examples
def plot_preprocessing(data):
    fig, ax = plt.subplots(1, 4)
    r = randint(0, data.shape[0])

    p = preprocessor(data)
    p.interp(config['n_frequencies'], config['n_time_steps'])  # always interpolate
    p.get_magnitude_and_phase()
    d = p.get_cube()

    # Interpolated cube
    im = ax[0].imshow(d[r, ..., 0])
    ax[0].title.set_text('Original Interpolated Image')
    fig.colorbar(im, ax=ax[0])

    # Magnitude of original cube
    p.get_magnitude_and_phase()
    p_cube = p.get_processed_cube()
    im = ax[1].imshow(p_cube[r, ..., 0])
    fig.colorbar(im, ax=ax[1])
    ax[1].title.set_text('Magnitude of Interpolated Image')

    p.sigma_threshold(2)

    # Standardised cube
    p.standardise(per_baseline=config['per_baseline'])
    s_cube = p.get_processed_cube()
    im = ax[2].imshow(s_cube[r, ..., 0])
    ax[2].title.set_text('Standardised Interpolated Image')
    fig.colorbar(im, ax=ax[2])

    # Min-max scaled cube
    p = preprocessor(p_cube)
    p.minmax(per_baseline=config['per_baseline'])
    m_cube = p.get_processed_cube()
    im = ax[3].imshow(m_cube[r, ..., 0])
    ax[3].title.set_text('Min Max Interpolated Image')
    fig.colorbar(im, ax=ax[3])

    return plt
def filter_and_dict(table_of_strings, stop_words):
    data = []
    for tweet in table_of_strings:
        tweet_words = preprocessor().preprocess(tweet[0], stop_words)
        temp_dict = {}
        for word in tweet_words:
            if word in temp_dict:
                temp_dict[word] = temp_dict[word] + 1
            else:
                temp_dict[word] = 1
        data.append(temp_dict)
    return data
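# Hedged illustration (assumption): filter_and_dict() returns one bag-of-words
# dictionary per row, so a toy input like the one below would map to counts such
# as [{'heavy': 2, 'traffic': 1, ...}]; the exact tokens depend on what the
# repo's preprocessor().preprocess() strips (stop words, punctuation, casing).
rows = [("heavy traffic on the bridge, heavy delays",)]
bags = filter_and_dict(rows, stop_words=[])
print(bags)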
def test_generate_merged_terminals_wo_naives(self):
    semtypes = {'#floskule^#measure': 1}
    grammar = ""
    self.assertEqual(
        preprocessor(grammar, semtypes),
        """FLOSKULE: "#floskule" | "#floskule^#measure"
MEASURE: "#floskule^#measure" | "#measure"
eps_floskule: FLOSKULE | empty_floskule
empty_floskule:
eps_measure: MEASURE | empty_measure
empty_measure:
""" + self.permanent_suffix)
def piczac_cross_validation(epochs, load_path):
    train_dirs = []
    n_folders = 10
    for i in range(1, n_folders + 1):
        train_dirs.append('fold{0}'.format(i))

    cvscores = []
    for folds in [(9, 10)]:
        val_fold = 'fold' + str(folds[0])
        test_fold = 'fold' + str(folds[1])

        # Remove validation and test from train
        train_dirs.remove(val_fold)
        train_dirs.remove(test_fold)
        print("Run {0}: test folder is fold{0}".format(folds[1]) +
              ", validation folder is fold{0}".format(folds[0]))

        # tb = TensorBoard(log_dir='./TensorBoard/short_60/' + 'run{0}'.format(folds[1]))
        es = EarlyStopping(patience=10, verbose=1)

        pp = preprocessor()
        pp.load_extracted_fts_lbs(load_path=load_path, train_dirs=train_dirs,
                                  test_fold=test_fold, val_fold=val_fold)
        train_dirs.append(val_fold)
        train_dirs.append(test_fold)
        print("Data prep completed")

        model = piczak_CNN(input_dim=pp.train_x[0].shape,
                           output_dim=pp.train_y.shape[1])
        print("Model built")
        model.fit(pp.train_x, pp.train_y, validation_data=[pp.val_x, pp.val_y],
                  epochs=epochs, batch_size=1000, verbose=2, callbacks=[es])
        print("Model trained")

        output_model_file = 'models/long60_' + str(epochs) + '_' + str(folds) + '.h5'
        model.save(output_model_file)

        scores = model.evaluate(pp.test_x, pp.test_y, verbose=0)
        print("loss: {0}, test-acc: {1}".format(scores[0], scores[1]))
        cvscores.append(scores[1] * 100)
        K.clear_session()

    print("Average performance after cross-validation: %.2f%% (+/- %.2f%%)" %
          (np.mean(cvscores), np.std(cvscores)))
def piczac_cross_validation(epochs, load_path):
    for fold in ((10, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7),
                 (7, 8), (8, 9), (9, 10)):
        val_fold = 'overlap/fold{0}_overlap_{1}dB'.format(fold[0], volumeOverlay)
        test_fold = 'overlap/fold{0}_overlap_{1}dB'.format(fold[1], volumeOverlay)
        single_fold = 'fold{0}'.format(fold[1])
        train_dirs.remove(val_fold)
        train_dirs.remove(test_fold)

        model = load_model('Models/short60_300_{0}.h5'.format(str(fold)))

        pp = preprocessor(
            parent_dir='C:\\Deep Learning Dataset\\UrbanSound8K\\audio_overlap')
        pp.load_extracted_fts_lbs(load_path=load_path, train_dirs=train_dirs,
                                  val_fold=val_fold, test_fold=test_fold,
                                  single_fold=single_fold)

        scores = model.evaluate(pp.test_x, pp.test_y, verbose=0)
        print("Test_fold: {2} Pretrain overlap - loss: {0}, test-acc: {1}".format(
            scores[0], scores[1], fold[1]))
        scores = model.evaluate(pp.single_x, pp.single_y, verbose=0)
        print("Test_fold: {2} Pretrain single - loss: {0}, test-acc: {1}".format(
            scores[0], scores[1], fold[1]))

        tb = TensorBoard(log_dir='./TensorBoard/' + 'overlap_run{0}'.format(fold[1]))
        es = EarlyStopping(patience=10, verbose=1)
        model.fit(pp.train_x, pp.train_y, validation_data=[pp.val_x, pp.val_y],
                  epochs=epochs, batch_size=1000, verbose=0, callbacks=[tb, es])

        scores = model.evaluate(pp.test_x, pp.test_y, verbose=0)
        print("Test_fold: {2} Posttrain - loss: {0}, test-acc: {1}".format(
            scores[0], scores[1], fold[1]))
        scores = model.evaluate(pp.single_x, pp.single_y, verbose=0)
        print("Test_fold: {2} Posttrain single - loss: {0}, test-acc: {1}".format(
            scores[0], scores[1], fold[1]))

        K.clear_session()
        train_dirs.append(val_fold)
        train_dirs.append(test_fold)
def preprocess(self, data, is_cat=[], num_quantiles=20, weighted=False, nthread=-1):
    self.prep = preprocessor()
    IDs, X, w, delta = self.prep.preprocess(data=data, is_cat=is_cat,
                                            num_quantiles=num_quantiles,
                                            weighted=weighted, nthread=nthread)
    self.X_colnames = X.columns.values.tolist()
    self.X_colnames = [
        item if item != 't_start' else 'time' for item in self.X_colnames
    ]
    return IDs, X, w, delta
def main():
    # Send in the clowns!!
    cmdline = sys.argv
    fil = ""
    translated = ""
    output_file = "zzzz_output.bridge"
    exec_code = False
    processCommandline(cmdline)
    # print(cmdline)
    if len(cmdline) == 1:
        print("Usage: lark_bridge file_to_translate [options]")
        print("You must pass a file to translate!")
        sys.exit(1)
    else:
        fil = open(cmdline[1], "r", encoding=Config.encoding).read()
        preprocessed = preprocessor.preprocessor(fil)
        translated = bridge(preprocessed)  # ,True)
    print(translated)
    if exec_code:
        exec(translated)
def plot_confusion_matrix(model_filename, load_path, save=False):
    classes = ['air_conditioner', 'car_horn', 'children_playing', 'dog_bark',
               'drilling', 'engine_idling', 'gun_shot', 'jackhammer', 'siren',
               'street_music']
    model = load_model(model_filename)

    n_folders = 10
    train_dirs = []
    for i in range(1, n_folders + 1):
        train_dirs.append('fold{0}'.format(i))

    # example filename: long200_150_(1, 2).h5 -> val fold 1, test fold 2
    test_fold = 'fold' + model_filename.split(', ')[1].split(')')[0]
    val_fold = 'fold' + model_filename.split(', ')[0].split('(')[1]

    pp = preprocessor()
    pp.load_extracted_fts_lbs(train_dirs=train_dirs, test_fold=test_fold,
                              val_fold=val_fold, load_path=load_path)

    preds = model.predict_classes(pp.train_x, verbose=0)
    # write_preds(preds, output_predictions_file)
    cm = metrics.confusion_matrix(np.argmax(pp.train_y, axis=1), preds)
    if save:
        utils.save_confusion_matrix(cm, classes)
    else:
        utils.plot_confusion_matrix(cm, classes)
def __init__(self):
    self.__preprocessor = preprocessor()
    self.__imgsize = 0
    self.__kernels = []
    self.__kernel_indx = []
    self.__path = ""
def preprocessor():
    # Preprocess the data produced by the fetch step
    for file in getFile('result'):
        pre.preprocessor(file)
def encode(self, complex_db_cube):
    """
    Encodes the cube with a pretrained encoding Keras model that is specified in settings.

    :param complex_db_cube: numpy.array with shape (baseline, subband, timestamp, pol)
    :return: numpy.array with shape (baseline, D) where D depends on the input shape and model.
    """
    if complex_db_cube is None or len(complex_db_cube.shape) != 4:
        raise ValueError(
            'Data is not in correct format: numpy.array with shape (baseline,subband,timestamp,pol)')
    print('This is complex cube shape after mean {}'.format(complex_db_cube.shape))

    ##################################
    complex_db_cube = np.concatenate(
        [complex_db_cube[..., 0:1], complex_db_cube[..., 4:5]], axis=3)
    print('This is complex cube shape after mean {}'.format(complex_db_cube.shape))
    ##################################

    p = preprocessor.preprocessor(complex_db_cube)
    p.interp(self.config['n_frequencies']['value'],
             self.config['n_time_steps']['value'])
    cube = p.get_processed_cube()

    if self.config['architecture']['value'] == 'skip':
        p.get_phase()
        phase_cube = p.get_processed_cube()

        p = preprocessor.preprocessor(cube)
        p.get_magnitude()
        p.median_threshold()
        p.minmax(per_baseline=True,
                 feature_range=(np.min(phase_cube), np.max(phase_cube)))

        encoded, _, _ = self.encoder.predict([p.get_processed_cube(), phase_cube])
        return encoded.reshape(encoded.shape[0], np.product(encoded.shape[1:]))
    elif self.config['mag_phase']['value']:
        p.get_magnitude_and_phase()
    elif self.config['magnitude']['value']:
        p.get_magnitude()
        p_cube = p.get_processed_cube()
        self.config['n_layers']['value'] = p_cube.shape[-1]  # TODO: This might cause problems
    elif self.config['phase']['value']:
        p.get_phase()
        p_cube = p.get_processed_cube()
        self.config['n_layers']['value'] = p_cube.shape[-1]  # TODO: This might cause problems

    if self.config['median_threshold']['value']:
        p.median_threshold(per_baseline=self.config['per_baseline']['value'])
    if self.config['db']['value']:
        p.mag2db()
    if self.config['wavelets']['value']:
        p.wavelet_decomp_2D()
    if self.config['flag']['value']:
        # TODO
        raise Exception('Flagging Code Not Written')
    if self.config['freq']['value']:
        # TODO
        raise Exception('Frequency Domain Code Not Written')

    if self.config['standardise']['value']:
        p.standardise(per_baseline=self.config['per_baseline']['value'])
    elif self.config['minmax']['value']:
        p.minmax(per_baseline=self.config['per_baseline']['value'])

    real_cube = p.get_processed_cube()  # use preprocessor to reshape cubes

    if self.config['architecture']['value'] == 'vae':
        encoded, _, _ = self.encoder.predict(real_cube)
    else:
        encoded = self.encoder.predict(real_cube)
    # encoded = np.mean(real_cube, axis=3)[:, ::4, ::4]
    print('This is complex cube shape after mean {}'.format(encoded.shape))
    return encoded.reshape(encoded.shape[0], np.product(encoded.shape[1:]))
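# Hedged usage sketch (assumption, not from the repo): calling encode() on a dummy
# complex dB cube of shape (baseline, subband, timestamp, pol). The 8-element
# polarisation axis is inferred from the slicing of channels 0 and 4 above, and
# 'model' stands in for a hypothetical, already-configured instance of this class.
import numpy as np

dummy_cube = np.random.randn(16, 64, 256, 8) + 1j * np.random.randn(16, 64, 256, 8)
embeddings = model.encode(dummy_cube)
print(embeddings.shape)  # (16, D), with D set by the encoder architecture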
import nltk.data
from preprocessor import preprocessor

# Classify the text from the Search API
classifier = nltk.data.load("classifiers/naive_bayes.pickle")
text = preprocessor().preprocess(textt, [])
features = features_extractor(text)
label = classifier.classify(features)

# Find its probability
if label == 'traffic':
    probability_dict = classifier.prob_classify(features)
    probability = probability_dict.prob('traffic')
def test_commands_are_left_untouched(self):
    grammar = "\%ignore abc"
    self.assertEqual(preprocessor(grammar),
                     '\%ignore abc' + self.permanent_suffix)
def trainClassifier(conn, cursor, tablename, test_tweet, enable_evaluation):
    """Train the Naive Bayes"""
    stop_words = []
    # Fetch all the stop words
    # try:
    #     query_sw = "SELECT word FROM stop_words limit 35"
    #     cursor.execute(query_sw)
    #     sw = cursor.fetchall()
    #     stop_words = filter_tweets(sw)
    #     print(stop_words)
    # except:
    #     # Get the most recent exception
    #     exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
    #     print "Select Error -> %s" % exceptionValue
    #     lastid = "0"

    # Fetch all the traffic tweets
    try:
        query_pt = "SELECT tweet FROM " + tablename + " WHERE ptraffic='y' ORDER BY tid ASC LIMIT 681"
        cursor.execute(query_pt)
        ttweets = cursor.fetchall()
    except:
        # Get the most recent exception
        exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
        print "Select Error -> %s" % exceptionValue
        lastid = "0"

    # Fetch all the non-traffic tweets
    try:
        query_nt = "SELECT tweet FROM " + tablename + " WHERE ntraffic='y' ORDER BY tid ASC LIMIT 681"
        cursor.execute(query_nt)
        nttweets = cursor.fetchall()
    except:
        # Get the most recent exception
        exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
        print "Select Error -> %s" % exceptionValue
        lastid = "0"

    # If the user chose to evaluate the classifier, fetch more labelled tweets for testing
    if enable_evaluation == 'test':
        # Fetch all the traffic tweets for the evaluation
        try:
            query_pt = "SELECT tweet FROM " + tablename + " WHERE ptraffic='y' ORDER BY tid DESC LIMIT 375"
            cursor.execute(query_pt)
            ttweets_test = cursor.fetchall()
        except:
            # Get the most recent exception
            exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
            print "Select Error -> %s" % exceptionValue
            lastid = "0"

        # Fetch all the non-traffic tweets for the evaluation
        try:
            query_nt = "SELECT tweet FROM " + tablename + " WHERE ntraffic='y' ORDER BY tid DESC LIMIT 375"
            cursor.execute(query_nt)
            nttweets_test = cursor.fetchall()
        except:
            # Get the most recent exception
            exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
            print "Select Error -> %s" % exceptionValue
            lastid = "0"

    try:
        # >>>>>>>>>>>>>>>>>>>>>>>>>> TRAIN SET <<<<<<<<<<<<<<<<<<<<<<<<<<
        # Apply preprocessing on the traffic tweets for the train set
        data = []
        for text in ttweets:
            temp = preprocessor().preprocess(text[0], stop_words)
            data.append(temp)
        traffic_tweets = add_label(data, 'traffic')

        # Apply preprocessing on the non-traffic tweets for the train set
        data = []
        for text in nttweets:
            temp = preprocessor().preprocess(text[0], stop_words)
            data.append(temp)
        nontraffic_tweets = add_label(data, 'nontraffic')

        # Merge the tweets for the train set
        combined_tweets = traffic_tweets + nontraffic_tweets

        # Extract the features for the train set
        temp = []
        for i in range(len(combined_tweets)):
            temp.append((features_extractor(combined_tweets[i][0]), combined_tweets[i][1]))
        train_set = temp

        # >>>>>>>>>>>>>>>>>>>>>>>>>> TEST SET <<<<<<<<<<<<<<<<<<<<<<<<<<<
        # If the user chose to evaluate the classifier create a test_set
        if enable_evaluation == 'test':
            # Apply preprocessing on the traffic tweets for the test set
            data = []
            for text in ttweets_test:
                temp = preprocessor().preprocess(text[0], stop_words)
                data.append(temp)
            traffic_tweets_test = add_label(data, 'traffic')

            # Apply preprocessing on the non-traffic tweets for the test set
            data = []
            for text in nttweets_test:
                temp = preprocessor().preprocess(text[0], stop_words)
                data.append(temp)
            nontraffic_tweets_test = add_label(data, 'nontraffic')

            # Merge the tweets for the test set
            combined_tweets_test = traffic_tweets_test + nontraffic_tweets_test

            # Extract the features for the test set
            temp = []
            for i in range(len(combined_tweets_test)):
                temp.append((features_extractor(combined_tweets_test[i][0]),
                             combined_tweets_test[i][1]))
            test_set = temp

        # >>>>>>>>>>>>>>>>>>>>>>>>>> TRAIN THE CLASSIFIER <<<<<<<<<<<<<<<<<<<<<<<<<<
        # Train our classifier using the training set
        classifier = nltk.NaiveBayesClassifier.train(train_set)

        # Save the classifier in a .pickle file
        name = 'naive_bayes.pickle'
        fname = os.path.join(os.path.expanduser('~/nltk_data/classifiers'), name)
        dump_classifier(classifier, fname)

        # Classify the tweet
        test_tweet1 = preprocessor().preprocess(test_tweet, stop_words)
        test = features_extractor(test_tweet1)
        proba = classifier.prob_classify(test)
        print "\nThe tweet '%s' is about: %s with probability: %s\n" % (
            test_tweet, classifier.classify(test), proba.prob('traffic'))

        # >>>>>>>>>>>>>>>>>>>>>>>>>> TEST THE CLASSIFIER <<<<<<<<<<<<<<<<<<<<<<<<<<
        # If the user chose to evaluate the classifier apply the evaluation techniques
        if enable_evaluation == 'test':
            evaluation(test_set, classifier)
    except:
        # Get the most recent exception
        exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
        print "Error -> %s" % exceptionValue
        lastid = "0"
def t_SOURCE(t):
    r'[^(\-\>|\|\#|\,)].*'
    scope = 0
    token_lexpos = 0
    dquotes = 0
    squotes = 0
    # i.e. 0 if token is "1" in "1 -> print" and 5 if token is "print" in "1 -> print"
    # TODO: .find() overlapping in `1 -> 1 -> 1`, .rfind() not, is .rfind() safe?
    relative_lexpos = t.lexer.lexdata.rfind(t.value)

    # Iterate over the token
    while token_lexpos < len(t.value):
        char = t.value[token_lexpos]
        # Inside " ... " or ' ... ' ?
        if dquotes or squotes:
            if char == '\"':
                if dquotes:
                    dquotes = dquotes - 1
            if char == '\'':
                if squotes:
                    squotes = squotes - 1
            # Ignore other characters
        else:
            # `[]`, `()` and `{}`
            if char in '[({':
                scope = scope + 1
            if char in '])}':
                scope = scope - 1
            # `"` and `'`
            if char == '\"':
                dquotes = dquotes + 1
            if char == '\'':
                squotes = squotes + 1
            # i.e. `... X` and not `[ ... X ]`, `( ... X )`, `{ ... X }`, `" ... X"` or `' ... X'`
            if not dquotes and not squotes and not scope:
                # Comma
                if char == ',':
                    # END OF TOKEN (i.e. 1 -> print , ... )
                    break
                # Operator
                if char == '|' or \
                        (char == '-' and token_lexpos + 1 < len(t.value) and
                         t.value[token_lexpos + 1] == '>'):
                    # END OF TOKEN (i.e. 1 -> ...)
                    break
        token_lexpos = token_lexpos + 1

    # Did we break out early?
    if token_lexpos != len(t.value):
        # Calculate a new lexpos
        t.lexer.lexpos = relative_lexpos + token_lexpos

    # Send strip()'ed token to preprocessor
    t.type, t.value = preprocessor.preprocessor(t.value[:token_lexpos].rstrip(),
                                                stmt_as_is=t.lexer.stmt_as_is)
    return t
from crldriver import crldriver
from SQLinterface import SQLinterface
from preprocessor import preprocessor

crldriver = crldriver(headless=True)
p = preprocessor()
interface = SQLinterface(passwd='0000', dbname='mining')

# interface.push('keywords', type1=1, type2=1, word='test', count=1, date='2020.07.29')
# interface.showall('keywords')
# interface.init_table('keywords')
# interface.init_id('keywords')
# interface.delete_column('keywords', 'date')
# interface.add_column('keywords', 'date', 'DATE')
# interface.new_table('testtb')
# interface.show_table()
# interface.delete_table('testtb')


def dbupload(packet, type1, type2):
    for m in packet.keys():
        kwrds = interface.showall('Keywords')
        interface.init_table('Keywords')
        interface.init_id('Keywords')
        searched = {}
        for i in kwrds:
            searched[i['word']] = i['count']
        # print(searched)
        for n in packet[m]:
            L = list(p.keywording(n))
            for i in L:
def wavelet_decomp(self, data):
    p = preprocessor.preprocessor(data)
    p.wavelet_decomp_2D()
    return p.get_processed_cube()
def test_parentheses_are_added_to_right_side(self):
    """
    Test if the parentheses are added to the right side of the rule
    """
    grammar = " sentence: foo "
    self.assertEqual(
        preprocessor(grammar).strip(),
        'sentence:(foo)' + self.permanent_suffix)
def main(args):
    # Build mask parameters DataFrame
    df_params = pd.DataFrame({"mask_name": args.mask_name,
                              "slice_axis": args.slice_axis,
                              "n_patches": args.n_patches,
                              "overlap": args.overlap,
                              "rotation": args.rotation})
    # print(df_params)

    mpl.use(args.mpl_agg)
    data_io.show_header()

    if not os.path.exists(args.seg_path):
        os.makedirs(args.seg_path)

    if args.run_seg:
        # Understand input data format
        if os.path.isdir(args.ct_fpath):
            tiff_input = True
        elif args.ct_fpath.split('.')[-1] in ("hdf5", "h5"):
            tiff_input = False
            if args.ct_data_tag == "":
                raise ArgumentTypeError("dataset-name required for hdf5")
        else:
            raise ArgumentTypeError(
                "input file type not recognized. must be tiff folder or hdf5 file")

        ct_dfile = data_io.DataFile(args.ct_fpath, tiff=tiff_input,
                                    data_tag=args.ct_data_tag,
                                    VERBOSITY=args.rw_verbosity)
        ct_dfile.show_stats()
        chunk_shape = ct_dfile.chunk_shape

        if args.stats_only:
            print("\nSet stats_only = False and start over to run program.")
            sys.exit()

        # Load model from model repo
        model_filename = os.path.join(args.model_path, args.model_name + '.hdf5')
        print("\nStarting segmentation mode ...")
        segmenter = Segmenter(model_filename=model_filename)

        print("Reading CT volume into memory...")
        dd = ct_dfile.read_full()

        if args.preprocess:
            print("\tPreprocessing volume...")
            if not os.path.exists("preprocessor.py"):
                input("Looked for preprocessor.py, but not found! "
                      "Please create one and press enter. Or press CTRL+C to exit")
            from preprocessor import preprocessor
            dd = preprocessor(dd)

        for idx, row in df_params.iterrows():  # iterate over masks
            # assign arguments from df_params for this mask
            slice_axis = row["slice_axis"]
            max_patches = row["n_patches"]
            segfile_tag = row["mask_name"]
            overlap = row["overlap"]
            rotation = row["rotation"]

            # define DataFile object for mask
            seg_fname = os.path.join(args.seg_path, segfile_tag)
            if not args.tiff_output:
                seg_fname = seg_fname + ".hdf5"
            seg_dfile = data_io.DataFile(seg_fname, data_tag="SEG",
                                         tiff=args.tiff_output,
                                         d_shape=ct_dfile.d_shape,
                                         d_type=np.uint8,
                                         chunk_shape=chunk_shape,
                                         VERBOSITY=args.rw_verbosity)
            seg_dfile.create_new(overwrite=args.overwrite_OK)

            t0 = time.time()
            print("\nWorking on %s\n" % segfile_tag)
            ch = process_data(dd, segmenter, slice_axis=slice_axis,
                              rot_angle=rotation, max_patches=max_patches,
                              overlap=overlap, nprocs=args.nprocs,
                              arr_split=args.arr_split,
                              arr_split_infer=args.arr_split_infer,
                              crops=args.crops)
            seg_dfile.write_full(ch)
            t1 = time.time()
            total_time = (t1 - t0) / 60.0
            print("\nDONE on %s\nTotal time for generating %s mask: %.2f minutes"
                  % (time.ctime(), segfile_tag, total_time))

            del slice_axis
            del max_patches
            del segfile_tag
            del rotation
            del ch

    if args.run_ensemble:
        print("\nStarting ensemble mode ...\n")
        t0 = time.time()

        # get the d_shape of one of the masks
        temp_fname = os.path.join(args.seg_path, df_params.loc[0, "mask_name"])
        if not args.tiff_output:
            temp_fname = temp_fname + ".hdf5"
        temp_ds = data_io.DataFile(temp_fname, data_tag="SEG",
                                   tiff=args.tiff_output, VERBOSITY=0)
        mask_shape = temp_ds.d_shape
        chunk_shape = temp_ds.chunk_shape
        if not args.run_seg:
            temp_ds.show_stats()
        del temp_ds
        del temp_fname

        if args.stats_only:
            print("\nSet stats_only = False and start over to run program.")
            sys.exit()

        vote_fname = os.path.join(args.seg_path, args.vote_maskname)
        if not args.tiff_output:
            vote_fname = vote_fname + ".hdf5"
        vote_dfile = data_io.DataFile(vote_fname, tiff=args.tiff_output,
                                      data_tag="SEG", d_shape=mask_shape,
                                      d_type=np.uint8, chunk_shape=chunk_shape,
                                      VERBOSITY=args.rw_verbosity)
        vote_dfile.create_new(overwrite=args.overwrite_OK)

        slice_start = 0
        n_masks = len(df_params)
        pbar = tqdm(total=mask_shape[0])
        while slice_start < mask_shape[0]:
            ch = [0] * len(df_params)
            for idx, row in df_params.iterrows():
                seg_fname = os.path.join(args.seg_path, row["mask_name"])
                if not args.tiff_output:
                    seg_fname = seg_fname + ".hdf5"
                seg_dfile = data_io.DataFile(seg_fname, tiff=args.tiff_output,
                                             data_tag="SEG",
                                             VERBOSITY=args.rw_verbosity)
                if mask_shape != seg_dfile.d_shape:
                    raise ValueError("Shape of all masks must be same")
                ch[idx], s = seg_dfile.read_chunk(axis=0,
                                                  slice_start=slice_start,
                                                  max_GB=args.mem_thres / (n_masks))
            ch = np.asarray(ch)
            ch = np.median(ch, axis=0).astype(np.uint8)
            vote_dfile.write_chunk(ch, axis=0, s=s)
            del ch
            slice_start = s.stop
            pbar.update(s.stop - s.start)
        pbar.close()

        t1 = time.time()
        total_time = (t1 - t0) / 60.0
        print("\nDONE on %s\nTotal time for ensemble mask %s : %.2f minutes"
              % (time.ctime(), args.vote_maskname, total_time))

        if args.remove_masks:
            print("Intermediate masks will be removed.")
            for idx, row in df_params.iterrows():  # iterate over masks
                seg_fname = os.path.join(args.seg_path, row["mask_name"])
                if not args.tiff_output:
                    seg_fname = seg_fname + ".hdf5"
                    os.remove(seg_fname)
                else:
                    rmtree(seg_fname)

    if args.morpho_filt:
        print("\nApplying morphological operations on ensemble vote...")
        vote_fname = os.path.join(args.seg_path, args.vote_maskname)
        if not args.tiff_output:
            vote_fname = vote_fname + ".hdf5"
        vote_dfile = data_io.DataFile(vote_fname, tiff=args.tiff_output,
                                      data_tag="SEG",
                                      VERBOSITY=args.rw_verbosity)
        from ct_segnet.morpho import morpho_filter
        vol = vote_dfile.read_full()
        vol = morpho_filter(vol, radius=args.radius, ops=args.ops,
                            crops=args.crops, invert_mask=args.invert_mask)
        vote_dfile.write_full(vol)
    return
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.cm as cm
import preprocessor
import mne

p = preprocessor.preprocessor()
raw = p.mne_open(p.triples[0][0], preload=True)
raw = raw.pick_types(eeg=True, meg=True)
# raw.drop_channels(['EEG061', 'EEG062', 'EEG063'])  # drop mistakenly labeled HEOG, VEOG, ECG channels
# print(raw.info)
# print(raw.info['ch_names'])
raw.plot_projs_topomap()
# print(raw.info['chs'])
# eeg_picks = mne.pick_types(raw.info, eeg=True, meg=False)
# print(list(eeg_picks))
# print(list(eeg_picks))
# print(list(raw.info['ch_names']))
# print()
# print(raw.info['ch_names'] - eeg_picks)
# raw.drop_channels(raw.info['ch_names'] - eeg_picks)