def loadData(tarball, params):
    sourceFile, formattedFile = extractFile(tarball)
    # open and format input file
    print "<DataLoader> formatting file: %s" % sourceFile
    try:
        inFile = open(sourceFile)
    except:
        print "<DataLoader> Could not retrieve data from file: %s" % sourceFile
        return
    DataFormatter.formatFile(inFile, formattedFile)
    inFile.close()
    print "<DataLoader> Successfully formatted file: %s" % sourceFile
    # attempt to upload file
    uploaded = DataUploader.upload(formattedFile, params["bucket"], **params)
    if not uploaded:
        print "<DataLoader> Failed to upload file, exiting"
        return
    # create bigquery object
    try:
        bigquery = Google_Service_Builder.buildBigQuery(**params)
    except Exception as e:
        print "<DataLoader>", e
        return
    source = "gs://" + params["bucket"] + "/" + formattedFile
    BigQuery_Append.appendData(bigquery, source, **params)

def get_model_RFE_top_features(self, expression_file, ic50_file, target_features, drug):
    expression_frame, ic50_series = dfm.get_expression_frame_and_ic50_series_for_drug(
        expression_file, ic50_file, drug, normalized=True, trimmed=True, threshold=None)
    scikit_data, scikit_target = dfm.get_scikit_data_and_target(expression_frame, ic50_series)
    step_length = int(len(scikit_data.tolist()[0]) / 100) + 1
    selector = RFE(self.model, int(target_features), step=step_length)
    selector.fit(scikit_data, scikit_target)
    return [expression_frame.index[i] for i in xrange(0, len(expression_frame.index)) if selector.support_[i]]

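# Illustrative sketch only (not part of the original pipeline): a self-contained
# example of the same recursive-feature-elimination idea on synthetic data, using
# scikit-learn's RFE with a linear SVM. The dataset sizes and the number of
# features to keep are made-up values; only the step heuristic mirrors the code above.
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

X, y = make_classification(n_samples=60, n_features=200, n_informative=10, random_state=0)
step = int(X.shape[1] / 100) + 1                         # same step heuristic as above
selector = RFE(SVC(kernel="linear"), n_features_to_select=20, step=step)
selector.fit(X, y)
kept_columns = [i for i in range(X.shape[1]) if selector.support_[i]]
print(kept_columns)
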
def get_model_coefficients_threshold(self, expression_file, ic50_file, threshold, drug):
    if self.model_type == 'svm' and self.kernel == 'linear':
        expression_frame, ic50_series = dfm.get_expression_frame_and_ic50_series_for_drug(
            expression_file, ic50_file, drug, normalized=True, trimmed=True, threshold=threshold)
        scikit_data, scikit_target = dfm.get_scikit_data_and_target(expression_frame, ic50_series)
        self.model.fit(scikit_data, scikit_target)
        return expression_frame.index, self.model.coef_[0]
    else:
        raise Exception("Method only defined for the SVM linear model")

def get_model_accuracy_filter_feature_size(self, expression_file, ic50_file, feature_size, num_permutations, drug):
    scikit_data, scikit_target = dfm.get_expression_scikit_data_target_for_drug(
        expression_file, ic50_file, drug, normalized=True, trimmed=True, threshold=None)
    for i in range(0, num_permutations):
        try:
            shuffled_data, shuffled_target = dfm.shuffle_scikit_data_target(scikit_data, scikit_target)
            accuracy = cv.cross_val_score_filter_feature_selection(
                self.model, cv.trim_X_num_features, feature_size, shuffled_data, shuffled_target, cv=5)
            yield accuracy.mean()
        except:
            yield 0.0

def get_predictions_full_CCLE_dataset_threshold(self, expression_file, ic50_file, threshold, drug):
    training_frame, training_series = dfm.get_expression_frame_and_ic50_series_for_drug(
        expression_file, ic50_file, drug, normalized=True, trimmed=True, threshold=threshold)
    training_data, training_target = dfm.get_scikit_data_and_target(training_frame, training_series)
    cell_lines, testing_data = dfm.get_normalized_full_expression_identifiers_and_data(
        expression_file, training_frame.index)
    self.model.fit(training_data, training_target)
    predictions = self.model.predict(testing_data)
    return cell_lines, predictions

def main(argv=None):
    # Importer.loadFromRaw(argv[1], numFiles = 2)
    DataFormatter.formatData('.', argv[2])
    tmp = h5py.File("{0}.hdf5".format(argv[2]), "r+")
    data = []
    for dataset in tmp["raw_data"].keys():
        data.append(tmp["raw_data"][dataset])
        print tmp["raw_data"][dataset][0]
    datavisualization.analyze8x8data(data=data, samprate=20000, time=2)
    tmp.close()

def get_predictions_full_CCLE_dataset_top_features(self, expression_file, ic50_file, num_features, drug):
    expression_frame, ic50_series = dfm.get_expression_frame_and_ic50_series_for_drug(
        expression_file, ic50_file, drug, normalized=True, trimmed=True)
    top_features = dfm.get_pval_top_n_features(expression_frame, ic50_series, num_features)
    expression_frame = expression_frame.ix[top_features]
    scikit_data, scikit_target = dfm.get_scikit_data_and_target(expression_frame, ic50_series)
    cell_lines, testing_data = dfm.get_normalized_full_expression_identifiers_and_data(
        expression_file, expression_frame.index)
    self.model.fit(scikit_data, scikit_target)
    predictions = self.model.predict(testing_data)
    return cell_lines, predictions, list(top_features)

def get_cross_validation_time(self, expression_file, ic50_file, feature_size, num_permutations, drug):
    scikit_data, scikit_target = dfm.get_expression_scikit_data_target_for_drug(
        expression_file, ic50_file, drug, normalized=True, trimmed=True, threshold=None)
    for i in range(0, num_permutations):
        try:
            shuffled_data, shuffled_target = dfm.shuffle_scikit_data_target(scikit_data, scikit_target)
            start_time = datetime.datetime.now()
            cv.cross_val_score_filter_feature_selection(
                self.model, cv.trim_X_num_features, feature_size, shuffled_data, shuffled_target, cv=5)
            end_time = datetime.datetime.now()
            yield float((end_time - start_time).microseconds) / 100000
        except:
            yield 0.0

def get_model_accuracy_RFE(self, expression_file, ic50_file, target_features, num_permutations, drug):
    scikit_data, scikit_target = dfm.get_expression_scikit_data_target_for_drug(
        expression_file, ic50_file, drug, normalized=True, trimmed=True, threshold=None)
    step_length = int(len(scikit_data.tolist()[0]) / 100) + 1
    for i in xrange(0, num_permutations):
        try:
            shuffled_data, shuffled_target = dfm.shuffle_scikit_data_target(scikit_data, scikit_target)
            selector = RFE(self.model, target_features, step=step_length)
            yield cross_val_score(selector, shuffled_data, shuffled_target, cv=5).mean()
        except:
            yield 0.0

def get_patient_predictions_rfe(self, expression_file, ic50_file, patient_directory, target_features, drug):
    e_data, e_target, p_identifiers, p_data = dfm.get_cell_line_and_patient_expression_data_target_for_drug(
        expression_file, ic50_file, patient_directory, 1.0, drug)
    step_length = int(len(e_data.tolist()[0]) / 100) + 1
    model = RFE(self.model, target_features, step=step_length)
    model.fit(e_data, e_target)
    predictions = model.predict(p_data)
    all_features = dfm.get_cell_line_and_patient_expression_gene_intersection(
        dfm.get_cell_line_expression_frame(expression_file),
        dfm.get_patients_expression_frame(patient_directory))[0]
    top_features = [all_features[i] for i in xrange(0, len(all_features)) if model.support_[i]]
    return p_identifiers, predictions, top_features

def _get_data():
    try:
        global training_input
        global training_output
        training_input = []
        training_output = []
        log.info('Fetching training data')
        try:
            number_of_files = int(len(os.listdir('trainingData')))
        except:
            log.error('Directory trainingData does not exist')
            return False
        for file_num in range(0, int(number_of_files / 2)):
            state_file = 'trainingData/ExportedState{}.txt'.format(file_num)
            move_file = 'trainingData/ExportedMove{}.txt'.format(file_num)
            state_data = df._format_array_v2(state_file)
            move_data = df._format_array_v2(move_file)
            if not state_data:
                log.error('Failed to load board state data')
                return False
            if not move_data:
                log.error('Failed to load move data')
                return False
            state_data = np.array(state_data)
            move_data = np.array(move_data)
            training_input.append(state_data)
            training_output.append(move_data)
        training_input = np.array(training_input)
        training_output = np.array(training_output)
        log.info('\tData fetched')
        log.info('\t\tTraining_input length: {}\tShape: {}'.format(
            len(training_input), training_input.shape))
        log.info('\t\tTraining_out length: {}\tShape: {}\n'.format(
            len(training_output), training_output.shape))
        return True
    except:
        log.error('\tUnknown error in NeuralNetwork._get_data\n')
        return False

def _get_data():
    try:
        global training_input
        global training_output
        training_input = []
        training_output = []
        log.info('Fetching training data')
        number_of_files = int(len(os.listdir('trainingData')))
        for file_num in range(0, int(number_of_files / 2)):
            state_file = 'trainingData/ExportedState{}.txt'.format(file_num)
            move_file = 'trainingData/ExportedMove{}.txt'.format(file_num)
            data = df._format_array_v2(state_file)
            if not data:
                log.error('Failed to load data')
                return False
            data = np.array(data)
            training_input.append(data)
            data = df._format_array_v2(move_file)
            if not data:
                log.error('Failed to load data')
                return False
            data = np.array(data)
            training_output.append(data)
        training_input = np.array(training_input)
        training_output = np.array(training_output)
        log.info("Input array shape: {}".format(training_input.shape))
        log.info("Input data array shape: {}".format(training_input[0].shape))
        log.info('\tData fetched')
        log.info('\t\tTraining_input length: {}'.format(len(training_input)))
        log.info('\t\tTraining_out length: {}\n'.format(len(training_output)))
        return True
    except:
        log.error('\tUnknown error in NeuralNetwork._get_data\n')
        return False

def click():
    print("Event Triggered")
    # Initialise elasticsearch request handler passing Elasticsearch Address
    requestHandler = ElasticRequest.elasticRequest('http://10.2.5.21:9200')
    requestHandler.add_search_term(entry_box.get())
    try:
        r = requestHandler.send_request()
    except elasticsearch.exceptions.ConnectionError:
        print("Exception Caught!")
        messagebox.showerror("Error", "Connection Error, Unable to connect to server")
        return
    except Exceptions.noDataFoundError:
        print("No search results found!")
        messagebox.showerror("Error", "No Results found!")
        return
    # except Exception as e:
    #     messagebox.showerror("Error", "UNKNOWN ERROR")
    #     print(e)
    #     return
    data_formatter = DataFormatter.DataFormatter(r)
    to_display = data_formatter.get_display()
    self.generate_table(to_display)

def click(event=None):
    # Initialise elasticsearch request handler passing Elasticsearch Address
    a = StaticValues.StaticValues()
    requestHandler = ElasticRequest.elasticRequest(a.get_elasticsearch_server())
    # Check if queries are empty, else add each query to the request body
    if self.check_if_empty(entry_box1.get(), entry_box2.get(), entry_box3.get(), entry_box4.get()):
        requestHandler.set_minimum_matches(0)
    else:
        requestHandler.add_search_term(entry_box1.get())
        requestHandler.add_search_term(entry_box2.get())
        requestHandler.add_search_term(entry_box3.get())
        requestHandler.add_search_term(entry_box4.get())
    # Checking logger checkbuttons
    is_adapter_selected = False
    for i in var:
        if var[i].get() != 0:
            is_adapter_selected = True
            requestHandler.add_logger_name(str(i))
    if not is_adapter_selected:
        print("Please select at least one adapter")
        messagebox.showerror("Error", "Please check at least one Program")
        return
    # Checking Max Results entry
    if self.contains_value(request_entry_box.get()):
        requestHandler.change_max_results(request_entry_box.get())
    if self.contains_value(datepicker1.get()):
        requestHandler.set_from_date(datepicker1.get())
    if self.contains_value(datepicker2.get()):
        requestHandler.set_to_date(datepicker2.get())
    try:
        r = requestHandler.send_request()
    except elasticsearch.exceptions.ConnectionError:
        print("Exception Caught!")
        messagebox.showerror("Error", "Connection Error, Unable to connect to server")
        return
    except Exceptions.noDataFoundError:
        print("No search results found!")
        messagebox.showerror("Error", "No Results found!")
        return
    except Exception as e:
        messagebox.showerror("Error", "UNKNOWN ERROR")
        print(e)
        return
    data_formatter = DataFormatter.DataFormatter(r)
    to_display = data_formatter.get_display()
    self.generate_table(to_display)

def _reshape_input(input, abs_max):
    return DataFormatter.reshape_recurrent_input(input,
                                                 rows=rows,
                                                 columns=columns,
                                                 global_columns=global_columns,
                                                 global_column_forecast_timesteps=global_column_forecast_timesteps,
                                                 abs_max=abs_max)

def backward_step(model, expression_frame, ic50_series, backward_step_size, forward_features_selected, backward_features_removed):
    removable_features = set(expression_frame.index) - set(forward_features_selected) - set(backward_features_removed)
    expression_frame = dfm.get_expression_frame_with_features(
        expression_frame, set(expression_frame.index) - set(backward_features_removed))
    scikit_data, scikit_target = dfm.get_scikit_data_and_target(expression_frame, ic50_series)
    model.fit(scikit_data, scikit_target)
    coefs = model.coef_[0]
    feature_names = list(expression_frame.index)
    # sort features by the absolute value of their coefficients, smallest first
    coefs, feature_names = zip(*sorted(zip(coefs, feature_names), key=lambda x: math.fabs(x[0])))
    num_features_removed = 0
    feature_index = 0
    while num_features_removed < backward_step_size and feature_index < len(feature_names):
        if feature_names[feature_index] in removable_features:
            backward_features_removed.append(feature_names[feature_index])
            num_features_removed += 1
        feature_index += 1
    return backward_features_removed

def get_patient_predictions_top_features(self, expression_file, ic50_file, patient_directory, num_features, drug):
    e_data, e_target, p_identifiers, p_data, top_features = dfm.get_cell_line_and_patient_expression_data_target_top_features_for_drug(
        expression_file, ic50_file, patient_directory, num_features, drug)
    self.model.fit(e_data, e_target)
    predictions = self.model.predict(p_data)
    return p_identifiers, predictions, top_features

def forward_step(model, expression_frame, ic50_series, forward_features_selected, backward_features_removed):
    potential_features = set(expression_frame.index) - set(forward_features_selected) - set(backward_features_removed)
    max_score = -1
    best_feature = None
    for feature in potential_features:
        model = copy.copy(model)
        # score the already-selected features plus the candidate feature, keeping the
        # full expression frame intact for the remaining candidates
        model_features = set(forward_features_selected) | {feature}
        candidate_frame = dfm.get_expression_frame_with_features(expression_frame, model_features)
        scikit_data, scikit_target = dfm.get_scikit_data_and_target(candidate_frame, ic50_series)
        score = cv.cross_val_score(model, scikit_data, scikit_target, cv=5).mean()
        if score > max_score:
            max_score = score
            best_feature = feature
    if best_feature:
        forward_features_selected.append(best_feature)
    return forward_features_selected

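# Hypothetical driver sketch (not from the original source): one way forward_step and
# backward_step could be alternated for stepwise feature selection. The iteration count
# and backward step size are made-up parameters, not values taken from the original code.
def stepwise_feature_selection(model, expression_frame, ic50_series, num_iterations=10, backward_step_size=5):
    forward_features_selected = []
    backward_features_removed = []
    for _ in range(num_iterations):
        # greedily add the single feature that most improves cross-validated accuracy
        forward_features_selected = forward_step(model, expression_frame, ic50_series,
                                                 forward_features_selected, backward_features_removed)
        # then mark the weakest-coefficient, not-yet-selected features for removal
        backward_features_removed = backward_step(model, expression_frame, ic50_series, backward_step_size,
                                                  forward_features_selected, backward_features_removed)
    return forward_features_selected
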
def get_accuracy_and_runtime_vs_num_generations(expression_file, ic50_file, num_features, generation_range, num_permutations, num_threads):
    drug = "SMAP"
    scikit_data, scikit_target = dfm.get_expression_scikit_data_target_for_drug(
        expression_file, ic50_file, drug, normalized=True, trimmed=True, threshold=None)
    p = Pool(num_threads)
    scores = p.map(wrap, [(g, scikit_data, scikit_target, num_features, num_permutations) for g in generation_range])
    scores = {g: scores[i] for i, g in enumerate(generation_range)}
    print(scores)
    return scores

def get_predictions_full_CCLE_dataset_rfe(self, expression_file, ic50_file, target_features, drug):
    scikit_data, scikit_target = dfm.get_expression_scikit_data_target_for_drug(
        expression_file, ic50_file, drug, normalized=True, trimmed=True, threshold=None)
    step_length = int(len(scikit_data.tolist()[0]) / 100) + 1
    model = RFE(self.model, target_features, step=step_length)
    model.fit(scikit_data, scikit_target)
    expression_frame = dfm.normalize_expression_frame(dfm.get_cell_line_expression_frame(expression_file))
    cell_lines = expression_frame.columns
    testing_data = dfm.get_scikit_data(expression_frame)
    predictions = model.predict(testing_data)
    top_features = [expression_frame.index[i] for i in xrange(0, len(expression_frame.index)) if model.support_[i]]
    return cell_lines, predictions, top_features

def Main():
    config = Configuration('configuration.json')
    if config.isProcessingDataFirst:
        ProcessData.process_data_files()
    if config.isGeneratingGraph:
        GraphData.create_plots()
    if config.isGeneratingClocklabFiles:
        FormatDataForClockLab.create_clock_lab_formatted_bulked_out_with_zeros_text_file()
    if config.isGeneratingChronosFitFile:
        FormatDataForChronosFit.create_cronos_fit_formatted_file()
    if config.isShowingIndividualFish and config.isGeneratingDistanceSums:
        DataFormatter.generate_distance_sums_for_individual_fish()

def get_patient_predictions_threshold(self, expression_file, ic50_file, patient_directory, threshold, drug):
    """
    Returns predictions for which patients are likely to be sensitive to SMAPs and which are likely to be resistant.
    First trains the given SVM model on cell-line expression data, then uses the trained model to predict patient outcome.
    Returns a list of patient identifiers and a list of predictions of each patient's response to the given drug.
    """
    e_data, e_target, p_identifiers, p_data = dfm.get_cell_line_and_patient_expression_data_target_for_drug(
        expression_file, ic50_file, patient_directory, threshold, drug)
    self.model.fit(e_data, e_target)
    predictions = self.model.predict(p_data)
    return p_identifiers, predictions

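# Generic illustration only (toy data, not the CCLE pipeline): the same
# train-on-cell-lines / predict-on-patients pattern with a linear SVM.
# All array sizes and labels below are made up.
import numpy as np
from sklearn.svm import SVC

cell_line_data = np.random.randn(40, 50)              # 40 cell lines x 50 genes
cell_line_sensitivity = np.random.randint(0, 2, 40)   # 1 = sensitive, 0 = resistant
patient_data = np.random.randn(10, 50)                # 10 patients, same gene set

svm = SVC(kernel="linear")
svm.fit(cell_line_data, cell_line_sensitivity)
patient_predictions = svm.predict(patient_data)
print(patient_predictions)
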
def trim_X_num_features(X, y, train, num_features):
    """
    Trim X by keeping the top num_features features ranked by training-set p-value.
    """
    all_samples = pd.DataFrame(X)
    all_labels = pd.Series(y)
    train_samples, train_labels = get_training_samples_labels(all_samples, all_labels, train)
    features = dfm.get_pval_top_n_features(train_samples.T, train_labels, num_features)
    trimmed_all_samples = all_samples[features]
    return np.array([list(trimmed_all_samples.ix[row]) for row in trimmed_all_samples.index])

def trim_X_threshold(X, y, train, threshold):
    """
    Trim X by keeping only features whose training-set p-value falls below threshold.
    """
    all_samples = pd.DataFrame(X)
    all_labels = pd.Series(y)
    train_samples, train_labels = get_training_samples_labels(all_samples, all_labels, train)
    features = dfm.get_features_below_pval_threshold(train_samples.T, train_labels, threshold)
    trimmed_all_samples = all_samples[features]
    return np.array([list(trimmed_all_samples.ix[row]) for row in trimmed_all_samples.index])

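# Self-contained sketch of the same idea the two trim_X helpers implement
# (per-fold filter feature selection, so feature ranking never sees the test fold),
# shown here with scikit-learn's Pipeline/SelectKBest rather than the project's
# cv helpers. Dataset sizes and k are made-up values.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

X, y = make_classification(n_samples=80, n_features=300, n_informative=15, random_state=0)
pipeline = Pipeline([
    ("select", SelectKBest(f_classif, k=20)),   # fitted on each training fold only
    ("svm", SVC(kernel="linear")),
])
print(cross_val_score(pipeline, X, y, cv=5).mean())
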
def hello_world():
    url = request.args.get('url')
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    # content = DataFormatter.strip_html_tags(soup)
    headline = soup.find("h1", {"class": "entry-title"})
    body = ''
    for paragraph in soup.find("div", {"class": "entry-content"}).findAll("p"):
        body += DataFormatter.strip_html_tags(paragraph)
    content = {
        'headline': DataFormatter.strip_html_tags(headline),
        'body': body
    }
    # content_df = pandas.DataFrame(columns=['Body ID', 'Headline', 'articleBody'])
    # content_df.append(['0', DataFormatter.normalize_string(content['headline'])],
    #                   DataFormatter.normalize_string(content['body']))
    # content_df.loc[1]['Body ID'] =
    # content_df.loc[1]['Headline'] =
    # content_df.loc[1]['articleBody'] =
    # print(content_df)
    X_prediction = numpy.array([
        DataFormatter.normalize_string(content['headline'] + content['body'])
    ])
    # print(X_prediction.reshape(-1,1))
    # print(X_prediction.reshape(-1,1).shape)
    prediction = model.predict(X_prediction)
    prob = model.predict_proba(X_prediction)
    # train_set = read_dataframe_train()
    # test_dataframe(train_set)
    # data_frame_to_csv(train_set)
    # create_distribution(train_set)
    return jsonify(prediction, prob)

def acc_and_run(g, scikit_data, scikit_target, num_features, num_permutations):
    results = []
    for perm in xrange(0, num_permutations):
        try:
            start_time = datetime.datetime.now()
            model = n.NeatClassifier(max_generations=g)
            shuffled_data, shuffled_target = dfm.shuffle_scikit_data_target(scikit_data, scikit_target)
            acc = cv.cross_val_score_filter_feature_selection(
                model, cv.trim_X_num_features, num_features, shuffled_data, shuffled_target, cv=5)
            end_time = datetime.datetime.now()
            results.append((acc.mean(), float((end_time - start_time).microseconds) / 100000))
        except:
            results.append((0.0, 1000.0))
            print(sys.exc_info()[0])
    return results

def read_police_call_weather_data(csv_file, rows, columns, train_test_split=0.80, backward_timesteps=28,
                                  forward_timesteps=4, forecast_timesteps=4, regression=True):
    """
    Read the raw police-call/weather CSV and build recurrent train/test sets.

    :param csv_file: raw data file
    :param rows: number of rows of the map grid
    :param columns: number of columns of the map grid
    :param train_test_split: float designating the train/test split percentage (0.80 yields 80% train and 20% test)
    :param backward_timesteps: designates how many timesteps back the network will analyze
    :param forward_timesteps: designates the number of timesteps forward the network will predict
    :param forecast_timesteps: how many timesteps forward to forecast global variables such as
        temperature/relative humidity/precipitation
    :param regression: if True, regression is performed, else classification is performed.
        NOTE: classification does not work well currently because inputs are not diffed
    :return: shape of input, (training input data, training output data), (testing input data, testing output data)
    """
    from pandas import read_csv
    import DataFormatter

    raw = read_csv(csv_file)
    global_columns = ["Fahrenheit", "Precipitation", "Relative Humidity"]
    raw.drop(["Start Date", "End Date", "Num of Calls"], axis=1, inplace=True)
    global_column_forecast_timesteps = dict(zip(global_columns, [forecast_timesteps] * len(global_columns)))
    # raw.drop(global_columns, axis=1, inplace=True)

    recurrent = DataFormatter.make_recurrent(raw, backward_timesteps, forward_timesteps)
    training_set, testing_set = DataFormatter.partition(recurrent, train_test_split)

    if regression:
        train_input, train_output = DataFormatter.split_recurrent_data(training_set)
        test_input, test_output = DataFormatter.split_recurrent_data(testing_set)
    else:
        format_regression_to_classification = lambda x: DataFormatter.recurrent_regression_to_classification(
            *DataFormatter.split_recurrent_data(x),
            minimum_delta=2, minimum_delta_percentage=0.10, enforce_both_minimums=True)
        train_input, train_output = format_regression_to_classification(training_set)
        test_input, test_output = format_regression_to_classification(testing_set)

    def _reshape_input(input, abs_max):
        return DataFormatter.reshape_recurrent_input(input,
                                                     rows=rows,
                                                     columns=columns,
                                                     global_columns=global_columns,
                                                     global_column_forecast_timesteps=global_column_forecast_timesteps,
                                                     abs_max=abs_max)

    train_input, abs_max = _reshape_input(train_input, 0)
    test_input, _ = _reshape_input(test_input, abs_max)

    remove_global_output_columns = lambda x: DataFormatter.remove_global_output_columns(x, global_columns)
    train_output = remove_global_output_columns(train_output).values.astype(float)[:train_input.shape[0]]
    test_output = remove_global_output_columns(test_output).values.astype(float)[:test_input.shape[0]]
    train_output /= max(train_output.max(), abs(train_output.min()))
    test_output /= max(test_output.max(), abs(test_output.min()))

    return train_input.shape, (train_input, train_output), (test_input, test_output)

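# Hypothetical call sketch (the CSV path, grid size, and printout are assumptions,
# not from the original source): the first return value gives the input shape a
# recurrent model would be built around.
input_shape, (train_X, train_y), (test_X, test_y) = read_police_call_weather_data(
    "police_calls_with_weather.csv",   # hypothetical file name
    rows=8, columns=8,
    train_test_split=0.80,
    backward_timesteps=28,
    forward_timesteps=4,
    forecast_timesteps=4,
    regression=True)
print(input_shape, train_X.shape, test_X.shape)
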
def main(argv=None):
    if argv is None:
        argv = sys.argv
    run_importer = True
    run_formatter = True
    run_analysis = True
    try:
        try:
            opts, args = getopt.getopt(
                argv[1:], "h",
                ["help", "skip-importer", "skip-formatter", "skip-analysis"])
            print "got args"
        except getopt.error, msg:
            raise Usage(msg)
        for option, data in opts:
            if ('-h' == option or '--help' == option):
                print "LSCE test script. Usage: \"python testscript.py [--skip-importer] [--skip-formatter] [--skip-analysis]" +\
                      " mat_source hdf5_dest" +\
                      "\"\n\nSupply the following arguments to run pipeline:\n\n\tmat_source: " +\
                      "The path to the raw .mat files to be imported.\n\thdf5_dest: the name to save hdf5 output file under" +\
                      "\n\nAvailable modes:\n\t--skip-importer: skip importation step. Formatter will still run using" +\
                      " mat_source as src directory." +\
                      "\n\t--skip-formatter: skip formatting step. Importer will use mat_source as usual. \n\t\t\t Analysis will" +\
                      " use hdf5_dest if it exists." + \
                      "\n\t--skip-analysis: skip computation of analysis data. Formatter will still output to hdf5_dest. "
                return
            if ('--skip-importer' == option):
                run_importer = False
            if ('--skip-formatter' == option):
                run_formatter = False
            if ('--skip-analysis' == option):
                run_analysis = False
        if (len(args) < 2):
            raise Usage("Insufficient arguments supplied.")
        else:
            print args.__repr__()
        print "Welcome to LSCE test script.\nThis script will perform a " + \
              "complete iteration of our pipeline, starting with the data importer."
        if (run_importer):
            print "Importing data from directory " + args[0]
            Importer.loadFromMat(args[0])
        else:
            print "Skipped importing data."
        if (run_formatter):
            print "Formatting data as hdf5 in " + args[1] + ".hdf5"
            DataFormatter.formatData(args[0], args[1])
        else:
            print "Skipped formatting data."
        os.system("PAUSE")
        testing = None
        raw_data = None
        if (run_analysis):
            dtool = DataAnalysis.data_analysis()
            dtool.load_hdf5(args[1], dataset_name="Electrode_12_master", group_name="raw_data")
            dtool.sampling_rate = 1000
            testing = dtool.high_demo_filter(20)
            raw_data = dtool.f["raw_data"]["Electrode_12_master"]
        else:
            print "Skipped data analysis.\nPlaceholder groups " + \
                  "\"/data_analysis/demo_filter_results\" and \"/raw_data/Electrode_12_master\" will be used."
            hdfile = h5py.File(args[1] + ".hdf5", "r+")
            if ("data_analysis" not in hdfile or "demo_filter_results" not in hdfile["data_analysis"]):
                print "Skipping graphs..."
                return
            testing = hdfile["data_analysis"]["demo_filter_results"]
            raw_data = hdfile["raw_data"]["Electrode_12_master"]
        os.system("PAUSE")
        plt.subplot(2, 1, 1)
        plt.plot(testing)
        plt.subplot(2, 1, 2)
        plt.plot(raw_data)
        plt.show()
        if (run_analysis):
            dtool.close()
    except Usage, err:
        # assumed minimal handler for the Usage errors raised above;
        # the original handler body was not preserved in this snippet
        print >> sys.stderr, err
        return 2

def predictData(data):
    data = df.getFormattedData(data)
    return model.predict(data)

def get_data(keyword, keyword2):
    # # Generate sub table from json in "_source" column
    # def generate_subTable(data):
    #     for i in range(len(data.index)):
    #         if i == 0:
    #             subtable = pd.DataFrame(actualData.iloc[i, 3])
    #             subtable.loc[i, "timestamp"] = dateutil.parser.parse(subtable["@timestamp"][i])
    #             subtable.loc[i, "date"] = (subtable["timestamp"][i] + datetime.timedelta(hours=8)).strftime("%H:%M:%S %d/%m/%Y")
    #             if isinstance(subtable.iloc[i]['response_body'], str):
    #                 subtable.loc[i, "response_message"] = json.loads(subtable.iloc[i]['response_body'])['message']
    #         else:
    #             subtable = subtable.append(pd.DataFrame(actualData.iloc[i, 3]), ignore_index=True)
    #             subtable.loc[i, 'timestamp'] = dateutil.parser.parse(subtable["@timestamp"][i])
    #             subtable.loc[i, "date"] = (subtable["timestamp"][i] + datetime.timedelta(hours=8)).strftime("%H:%M:%S %d/%m/%Y")
    #             if isinstance(subtable.iloc[i]['response_body'], str):
    #                 subtable.loc[i, "response_message"] = json.loads(subtable.iloc[i]['response_body'])['message']
    #     return subtable
    #
    # # Copies values of desired columns to be displayed
    # def form_displayTable(mainDataFrame):
    #     displayTable = pd.DataFrame()
    #     displayTable['timestamp'] = mainDataFrame['date']
    #     displayTable['logger_name'] = mainDataFrame['logger_name']
    #     displayTable['path'] = mainDataFrame['path']
    #     displayTable['response_status'] = mainDataFrame['response_status']
    #     displayTable['response_message'] = mainDataFrame['response_message']
    #     displayTable['request_body'] = mainDataFrame['request_body']
    #     displayTable['response_body'] = mainDataFrame['response_body']
    #     return displayTable

    print("Sendrequest Module Initiated")
    requestHandler = ElasticRequest.elasticRequest('http://10.2.5.21:9200')
    r = 0
    try:
        requestHandler.add_search_term(keyword)
        requestHandler.add_search_term(keyword2)
        r = requestHandler.send_request()
    except elasticsearch.exceptions.ConnectionError as e:
        print("Except block entered")
        raise elasticsearch.exceptions.ConnectionError
    # Checking for any search results
    if r['hits']['total'] == 0:
        print("No Hits!")
        raise Exceptions.noDataFoundError
    print("Finished log extraction from ELK")

    # Generate dataframe based on contents of r
    # data = pd.DataFrame(r)
    # # pull out desired data (search results from the json <cell 1,3>)
    # actualData = pd.DataFrame(data.iloc[1, 3])
    # main = generate_subTable(actualData)
    #
    # # Join the subtable back with original dataFrame
    # result = pd.concat([actualData, main], axis=1, join_axes=[actualData.index])
    # result = result.sort_values(by=['timestamp'], ascending=False)
    # result = result.reset_index(drop=True)
    #
    # # Remove '_source' column from original Dataframe
    # result = result.drop('_source', 1)
    #
    # # Copy desired fields to separate new df
    # to_display = form_displayTable(result)

    data_formatter = DataFormatter.DataFormatter(r)
    to_display = data_formatter.get_display()
    return to_display

""" A sample program that predicts MSFT stock data using linear regression and MatPlotLib """ import DataFormatter import matplotlib.pyplot as plt import pandas as pd import numpy as np from math import ceil from sklearn import linear_model, preprocessing, model_selection import datetime AV_key = "" #AVKey here DataFormatter.set_av_key(AV_key) if __name__ == "__main__": df = DataFormatter.get_alpha_vantage_data('MSFT', AV_key) df.index = pd.to_datetime(df.index) projection_out = int(ceil(0.05 * len(df))) df['label'] = df['adjusted_close'].shift(projection_out) df.dropna(inplace=True) X = preprocessing.scale(np.array(df.drop(['label'], 1))) y = np.array(df['label']) X_train, X_test, y_train, y_test = model_selection.train_test_split( X, y, test_size=0.2) clf = linear_model.LinearRegression(n_jobs=-1) clf.fit(X_train, y_train) print(clf.score(X_test, y_test)) projection_set = clf.predict(X[:projection_out])[::-1]