def __init__(self):
    """Create the scaler and RBF SVC, load both data sets and standardise them.

    The scaler is fitted on the training features only and then applied to
    the test features, so both are expressed in the training set's scale.
    """
    scaler = StandardScaler()
    model = svm.SVC(kernel="rbf")
    train_features, train_labels = parse_data(training_data)
    test_features, test_labels = parse_data(testing_data)

    self.sc = scaler
    self.classifier = model
    self.y_train = train_labels
    self.y_test = test_labels

    # Standardise features (zero mean, unit variance) using statistics
    # learned from the training set.
    self.x_train = scaler.fit_transform(train_features)
    self.x_test = scaler.transform(test_features)

    # Pessimistic defaults until an evaluation has been run.
    self.accuracy_rate = 0
    self.error_rate = 100
def test__parse_data(self):
    """Check pd.parse_data against the cell and the CM perturbagen header maps."""

    def build_header_map(column_names, section):
        # Header maps are derived from the pipeline config file on disk.
        config = ConfigParser.RawConfigParser()
        config.read("prism_pipeline.cfg")
        return pd.generate_header_map(
            column_names, config.items(section), False)

    # --- cell metadata: one parsed object per input row ---
    header_map = build_header_map(
        ["pool_id", "analyte", "strippedname"],
        _prism_cell_config_file_section)
    data = [["1", "analyte 2", "my cell's name"],
            ["3", "analyte 5", "autre cell nom"]]

    parsed = pd.parse_data(header_map, data, Dummy)
    logger.debug("r: {}".format(parsed))
    assert len(parsed) == len(data), len(parsed)

    # A mapped header whose column index is beyond every row must not
    # change the number of parsed rows.
    header_map["extra header that doesn't have data in any row"] = 10
    parsed = pd.parse_data(header_map, data, Dummy)
    logger.debug("r: {}".format(parsed))
    assert len(parsed) == len(data), len(parsed)

    # An empty cell is expected to come back as None.
    data.append(["7", "", "blah"])
    parsed = pd.parse_data(header_map, data, Dummy)
    assert parsed[2].analyte_id is None

    # --- CM perturbagen input: numeric fields are type-converted ---
    header_map = build_header_map(
        ["well_position", "compound_well_mmoles_per_liter", "dilution_factor"],
        _perturbagen_CM_input_config_file_section)
    data = [["A01", "1.010101", "2"], ["B07", "3.030303", "5"]]

    parsed = pd.parse_data(header_map, data, Dummy)
    logger.debug("r: {}".format(parsed))
    assert len(parsed) == len(data), len(parsed)

    first = parsed[0]
    assert hasattr(first, "compound_well_mmoles_per_liter"), first.__dict__
    assert isinstance(first.compound_well_mmoles_per_liter, float)
    assert first.compound_well_mmoles_per_liter == 1.010101, \
        first.compound_well_mmoles_per_liter
    assert isinstance(first.dilution_factor, int)
    assert first.dilution_factor == 2, first.dilution_factor

    second = parsed[1]
    assert isinstance(second.compound_well_mmoles_per_liter, float)
    assert isinstance(second.dilution_factor, int)
def main():
    """Run the pipeline: parse, extract features, classify, then evaluate.

    Command-line arguments (positional): junk_param, classifier_param,
    train_param. A fourth "split" argument is currently not read.
    """
    junk_param = sys.argv[1]
    classifier_param = sys.argv[2]
    train_param = sys.argv[3]

    print('Main Program Begins : ')
    write_csv.generate_inkml_file_list()

    parsed_lists = parse_data.parse_data(junk_param)
    symbol_data_obj_list, junk_data_obj_list, test_data_obj_list = parsed_lists
    for obj_list in (symbol_data_obj_list, junk_data_obj_list,
                     test_data_obj_list):
        print(len(obj_list))
    print('object created')

    # The return values are not used afterwards; get_features is called for
    # each list together with the CSV file name it is given.
    feature_jobs = (
        (symbol_data_obj_list, 'symbol_feature_list.csv'),
        (junk_data_obj_list, 'junk_feature_list.csv'),
        (test_data_obj_list, 'test_feature_list.csv'),
    )
    for obj_list, csv_name in feature_jobs:
        feature_extraction.get_features(obj_list, csv_name)
    print('Features extracted')

    prediction_file, GT_file = classification_driver.classification(
        junk_param, classifier_param, train_param)

    if prediction_file is not None and GT_file is not None:
        # Evaluate predictions against the ground truth; the HTML report is
        # captured through shell redirection.
        ground_truth_path = data_folder + GT_file
        prediction_path = data_folder + prediction_file
        command = ('python evalSymbIsole.py ' + ground_truth_path + ' ' +
                   prediction_path + ' HTML > output.html')
        # Idea noted by the original author: store all features in one CSV
        # with the ground truth as last column to avoid re-parsing ISO files.
        os.system(command)

    print('Done!')
def trainTheModel(self, filePath):
    """Train an RBF SVC on the data in *filePath* and report hold-out metrics.

    Stores the third value returned by parse_data on
    ``self.means_and_frequents`` (used later by testTheModel) and returns
    the fitted classifier.
    """
    features, labels, self.means_and_frequents = parse_data(filePath)

    # Standardise the feature matrix before handing it to the SVM.
    features = preprocessing.scale(features)

    # 70% of the rows train the model, 30% are held out for evaluation.
    split = train_test_split(features, labels, test_size=0.3,
                             random_state=24, shuffle=True)
    train_features, holdout_features, train_labels, holdout_labels = split

    model = svm.SVC(kernel='rbf', gamma=0.0001, C=1000, verbose=10)
    model = model.fit(train_features, train_labels)

    holdout_predictions = model.predict(holdout_features)

    # Confusion matrix, accuracy and recall on the held-out rows.
    print(confusion_matrix(holdout_labels, holdout_predictions))
    print("Accuracy:",
          metrics.accuracy_score(holdout_labels, holdout_predictions))
    print("Recall:",
          metrics.recall_score(holdout_labels, holdout_predictions))
    return model
def run_train(self):
    """Parse the training file, keep the best K features and fit ``self.clf``.

    Does nothing unless ``training_status`` is UNINITIALIZED. Any exception
    is printed and resets ``training_status`` to UNINITIALIZED.
    """
    try:
        if self.training_status != Status.UNINITIALIZED:
            return

        # ---- parse ----
        self.training_status = Status.IN_PROGRESS
        features, labels, corrupted = parse_data('data/adult.data')
        self.corrupted_data = corrupted

        # ---- feature selection ----
        # Lists are converted to arrays so columns can be sliced.
        features = np.array(features)
        labels = np.array(labels)
        selector = SelectKBest(f_classif, k=BEST_K_FEATURES)
        selector.fit(features, labels)
        self.cols = selector.get_support(indices=True)
        features = features[:, self.cols]

        # ---- fit ----
        self.clf.fit(features, labels)
        self.training_status = Status.DONE
    except Exception as exc:
        print(exc)
        self.training_status = Status.UNINITIALIZED
def train(self):
    """Load the train and test files and fit ``self.classifier``.

    Errors are printed rather than raised; the elapsed time is always
    printed.
    """
    try:
        started_at = time.time()
        print("Start training...")
        train_set = parse_data(self.data_file_full_path)
        self.X_train, self.y_train = train_set
        test_set = parse_data(self.test_file_full_path)
        self.X_test, self.y_test = test_set
        self.classifier.fit(self.X_train, self.y_train)
    except Exception as exc:
        print("Error: ", exc)
    finally:
        elapsed = time.time() - started_at
        print("---Train: %s seconds ---" % elapsed)
def test_parse_data(self):
    """parse.parse_data returns the expected seed, satellites and endpoints."""
    parsed = parse.parse_data(self.raw_data)
    seed, sats, start, end = parsed

    self.assertEqual(seed, self.test_seed)

    self.assertEqual(len(sats), 20)
    first_sat = sats[0]
    self.assertEqual(first_sat, self.test_sat)

    # Compare as plain lists so the container type does not matter.
    self.assertListEqual(list(start), list(self.start))
    self.assertListEqual(list(end), list(self.end))
def main():
    """Plot mid spectrums, variance graphs and sample spectrums of the parsed data."""
    data = parse_data.parse_data()
    driving_runs = data["Driving"]
    # Not plotted below; the lookups are kept so a missing activity still
    # fails here.
    jumping_runs = data["Jumping"]
    standing_runs = data["Standing"]
    walking_runs = data["Walking"]

    # One figure per acceleration axis.
    fig = 0
    for axis in ('xAccl', 'yAccl', 'zAccl'):
        fig = nextFig(fig)
        plotMidSpectrums(data, axis)

    # Variance graphs are only drawn for the z axis at the moment.
    fig = nextFig(fig)
    varGraphs(data, 'zAccl')

    # y-axis spectrums of recordings 0 and 3 for driving and walking.
    for sample_index in (0, 3):
        fig = nextFig(fig)
        plotSpectrum(driving_runs[sample_index], 'yAccl', 'drive')
        fig = nextFig(fig)
        plotSpectrum(walking_runs[sample_index], 'yAccl', 'walk')
def __init__(self):
    """Load the adult train/test files, create a LinearSVC and rescale features.

    Features are mapped to the range [-1, 1] with a scaler fitted on the
    training features and applied to both sets.
    """
    train_parsed = parse_data("data\\adult.data")
    (self.train_data_size, self.valid_train_data_size,
     self.x_train, self.y_train) = train_parsed

    test_parsed = parse_data("data\\adult.test")
    (self.test_data_size, self.valid_test_data_size,
     self.x_test, self.y_test) = test_parsed

    self.svclassifier = svm.LinearSVC()

    # Rescaling is an optimisation to reduce runtime and improve accuracy.
    scaler = MinMaxScaler(feature_range=(-1, 1))
    fitted_scaler = scaler.fit(self.x_train)
    self.x_train = fitted_scaler.transform(self.x_train)
    self.x_test = fitted_scaler.transform(self.x_test)
def test(self):
    """Predict on the test file and store the resulting error percentage.

    Progress timestamps and the final error percentage are printed.
    """
    def timestamp():
        return str(datetime.datetime.now())

    print("Test Started at {0}".format(timestamp()))
    features, labels = parse_data(self.test_path_str)
    print("Test parse Finished at {0}".format(timestamp()))

    predictions = self.classifier.predict(features)
    self.error_percentage = calculate_error_percentage(labels, predictions)

    print("Test Finished at {0}".format(timestamp()))
    print("The error percentage is {0}".format(self.error_percentage))
def train(self):
    """Fit the SVM on the training file and persist the fitted classifier.

    The classifier is handed to ``save_classifier`` so the user does not
    have to retrain on the same data every time the program is reopened.
    """
    def timestamp():
        return str(datetime.datetime.now())

    print("Training Started at {0}".format(timestamp()))
    features, labels = parse_data(self.training_path_str)
    print("Training parse Finished at {0}".format(timestamp()))

    self.classifier.fit(features, labels)
    self.save_classifier(self.classifier)

    print("Training Finished at {0}".format(timestamp()))
def read_prism_cell_from_file(row_metadata_file, items):
    """Read cell metadata from *row_metadata_file* and return PrismCell objects.

    Args:
        row_metadata_file: path of the metadata file to read.
        items: config items passed to ``parse_data.generate_header_map``.

    Returns:
        Whatever ``parse_data.parse_data`` builds for the non-comment rows.
    """
    filepath = row_metadata_file
    (headers, data) = parse_data.read_data(filepath)

    # Drop comment rows (first cell starts with "#"). ``startswith`` is used
    # instead of indexing the first character so that an empty first cell,
    # or an empty row, no longer raises IndexError; such rows are kept and
    # passed on unchanged.
    data = [row for row in data if not (row and row[0].startswith("#"))]

    header_map = parse_data.generate_header_map(headers, items, False)
    logger.debug("header_map: {}".format(header_map))
    return parse_data.parse_data(header_map, data, PrismCell)
def _read_perturbagen_from_file(filepath, do_keep_all):
    """Read a perturbagen map file and return Perturbagen objects.

    Args:
        filepath: path of the map file to read.
        do_keep_all: forwarded to ``parse_data.generate_header_map``.

    Returns:
        Whatever ``parse_data.parse_data`` builds for the file's rows.

    Raises:
        Exception: if the file has a "well_position" header, which marks the
            unsupported CM map type.
    """
    (headers, data) = parse_data.read_data(filepath)

    # todo: think about other checks / better notification of wrong map type
    if "well_position" in headers:
        # The exception used to be constructed without being raised, so CM
        # maps were silently processed.
        raise Exception(
            "Merino no longer supports CM map type, please convert map to CMap map type"
        )

    header_map = parse_data.generate_header_map(headers, None, do_keep_all)
    logger.debug("header_map: {}".format(header_map))
    return parse_data.parse_data(header_map, data, Perturbagen)
def write_record(i, proxy, queue, min_time, max_time):
    # Worker loop: take URLs from `queue`, fetch each one through `proxy`,
    # parse the content and append one CSV row per URL to a file named after
    # the proxy host. Stops when the queue is empty or after too many empty
    # responses.
    #
    # i        -- not used in this function.
    # proxy    -- "host:port" string; the host part names the output file.
    # queue    -- queue of URLs; failed URLs are put back for a retry.
    # min_time, max_time -- forwarded to stop_time() after each written row.
    #
    # NOTE(review): `open(url, proxy)` is presumably a project-level fetch
    # helper that shadows the builtin -- confirm.
    try:
        data_file = codecs.open(os.path.join(data_path, '%s.csv'%proxy.split(':')[0]), 'ab')
        error_time = 0
        url = ""
        while True:
            print queue.qsize()
            url = ""
            #gc.collect()
            #objgraph.show_most_common_types(limit=50)
            try:
                # Non-blocking get: any exception (normally an empty queue)
                # ends the worker.
                url = queue.get(block=False)
            except:
                break
            print "proxy:%s,url:%s"%(proxy, url)
            content = open(url, proxy)
            if not content or content == "503":
                # Empty responses count towards the give-up limit; "503"
                # responses are retried without counting.
                if not content:
                    error_time+=1
                    if error_time > 100:
                        break
                queue.put(url)
                time.sleep(1)
                continue
            if content == "404":
                # The URL is dropped, not re-queued.
                continue
            # Parse the price
            try:
                result_row = parse_data(content)
            except:
                logger.error(url + "".join(traceback.format_exception(*sys.exc_info())))
                queue.put(url)
                # On a read/parse error, pause for 1 second
                time.sleep(1)
                # Original note: a thread with more than 5 errors should be
                # terminated (no such check is implemented here).
                continue
            if not result_row:
                continue
            # The sixth "/"-separated piece of the URL is the row's first column.
            csv.writer(data_file).writerow([url.split('/')[5]] + result_row)
            del result_row, url
            # Pause for a few seconds
            stop_time(min_time, max_time)
    except:
        # NOTE(review): `url` was deleted just before stop_time(); if
        # stop_time raises, this handler hits a NameError -- confirm and fix.
        logger.error(url + "".join(traceback.format_exception(*sys.exc_info())))
    finally:
        # data_file only exists if codecs.open succeeded.
        l = locals()
        if 'data_file' in l:
            data_file.close()
def main():
    """Insert every charge of every parsed case into the ``charges`` table.

    Cases come from ``parse_data()``. All rows are inserted in a single
    transaction that is committed once at the end. Errors are printed, not
    raised, and the connection is always closed.

    Inserts into the person / holds / cases tables are currently disabled.
    """
    parsed_cases = parse_data()
    # Values are passed as query parameters so the driver does the quoting;
    # building the statement by string concatenation broke on values that
    # contain a quote and allowed SQL injection.
    insert_charge = (
        "INSERT INTO charges(case_number, eligibility, convicted) "
        "VALUES (%s, %s, %s)"
    )
    conn = None
    try:
        conn = psycopg2.connect(host="localhost", database="expunge",
                                port='5432', user="******")
        cur = conn.cursor()
        for parsed_case in parsed_cases:
            for charge in parsed_case['charges']:
                cur.execute(
                    insert_charge,
                    (parsed_case['case_number'],
                     charge['eligibility'],
                     charge['convicted']),
                )
        conn.commit()
        cur.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        if conn is not None:
            conn.close()
def run_test(self):
    """Evaluate the trained classifier on the test file.

    Runs only when training is DONE and testing is still UNINITIALIZED.
    Stores the result in ``error_percentage``; any exception is printed and
    resets ``testing_status`` to UNINITIALIZED.
    """
    ready = (self.training_status == Status.DONE
             and self.testing_status == Status.UNINITIALIZED)
    if not ready:
        return

    try:
        # ---- parse ----
        self.corrupted_data = False
        self.testing_status = Status.IN_PROGRESS
        features, labels, corrupted = parse_data('data/adult.test')
        self.corrupted_data = corrupted

        # ---- predict ----
        # Keep only the columns that were selected during training.
        features = np.array(features)
        features = features[:, self.cols]
        predictions = self.clf.predict(features)

        self.error_percentage = calculate_error_percentage(
            labels, predictions)
        self.testing_status = Status.DONE
    except Exception as exc:
        self.testing_status = Status.UNINITIALIZED
        print(exc)
import io

from build_csv import build_csv
from parse_data import parse_data
from perform_analysis import check_range, check_countour, check_key
from music21 import stream, midi, musicxml

# Build the CSV for one composition, load it into a music21 stream, run the
# analysis checks, then export the stream as MIDI and MusicXML files.
composition_id = '2'
build_csv(composition_id)

note_stream = stream.Stream()
parse_data(note_stream, composition_id)

check_range(note_stream)
check_countour(note_stream)
check_key(note_stream)

# MIDI export; the file is closed even if writing fails.
midi_file = midi.translate.streamToMidiFile(note_stream)
midi_file.open('../user_data/midi' + composition_id + '.midi', 'wb')
try:
    midi_file.write()
finally:
    midi_file.close()

# MusicXML export. The exporter returns bytes that are decoded as UTF-8, so
# the file is written as UTF-8 too instead of the platform default encoding,
# which could fail or corrupt non-ASCII text.
exporter = musicxml.m21ToXml.GeneralObjectExporter(note_stream)
xml_bytes = exporter.parse()
xml_string = xml_bytes.decode('utf-8')
with io.open('../user_data/xml' + composition_id + '.xml', 'w',
             encoding='utf-8') as xml_file:
    xml_file.write(xml_string)
def main(argv):
    """Command-line entry point: extract image features or reduce dimensions.

    Args:
        argv: full argument vector; ``argv[0]`` (the program name) is skipped.

    Modes:
        features (alias: extract): extract features from an image or folder.
        tsne / umap: reduce the feature columns of a .csv file.

    Raises:
        AssertionError: in features mode when the data path does not exist.
        Exception: in tsne/umap mode when --feature-cols is missing.
        SystemExit: for an unknown mode (argparse usage error, status 2).
    """
    parser = argparse.ArgumentParser()
    # The help used to advertise "extract" although only "features" was
    # handled; both spellings are accepted now.
    parser.add_argument('mode', help='features | tsne | umap')
    parser.add_argument(
        'data',
        help='[features]: Filepath to an image or folder containing images '
        'to extract features from. [tsne/umap]: Filepath to a .csv file to '
        'read into a DataFrame. ')
    parser.add_argument('out', help='Output filepath of operation')
    parser.add_argument(
        '--feature-cols', '-f',
        help='[tsne/umap]: Numerical data column indices to treat as '
        'features. Ex: "B,C,F", use "all" to consider all columns '
        '(excluding optional unique-col).')
    parser.add_argument(
        '--unique-col', '-u',
        help='[tsne/umap]: The column index containing unique IDs for each '
        'row (typically "ID" or "Name" column). Not required. Omitted from '
        '"all" feature-cols')
    parser.add_argument(
        '--reduce', '-r',
        help='[tsne/umap]: How many dimensions to reduce features to. '
        'Default is 2.',
        default='2')
    parser.add_argument(
        '--model', '-m',
        help='[features]: Which model to use. ResNet50 | Xception | VGG16 | '
        'VGG19 | InceptionV3 | MobileNet. Default: ResNet50',
        default='ResNet50')
    args = parser.parse_args(argv[1:])

    # === FEATURE EXTRACTION ===
    # We expect an image filepath or folder of images
    if args.mode in ('features', 'extract'):
        assert os.path.exists(args.data),\
            'Features mode (data arg): File or directory not found: "{}"'\
            .format(args.data)
        # Calculate and write to args.out
        extract_features(args.data, model=args.model, write_to=args.out)

    # === DIMENSION REDUCTION ===
    # We expect a .csv file of features
    elif args.mode in ['tsne', 'umap']:
        # Columns to use numerically: a list of strings, or 'all'
        feature_cols = args.feature_cols
        if feature_cols is None:
            raise Exception(
                'Feature reduction mode: No data column indices provided. \n'
                'Example usage: "--feature-cols B,C,F", "--feature-cols all"'
            )
        elif feature_cols != 'all':
            feature_cols = [
                s.strip() for s in feature_cols.split(',') if s.strip() != ''
            ]

        # Parse the data into a squashed DataFrame whose first column holds
        # the unique keys
        df = parse_data(args.data, feature_cols, args.unique_col)

        if args.mode == 'tsne':
            tsne(df, dims=int(args.reduce), write_to=args.out)
        elif args.mode == 'umap':
            # NOTE(review): --reduce is documented for umap but not passed
            # here -- confirm whether umap() accepts a dims argument.
            umap(df, write_to=args.out)

    else:
        # An unrecognised mode used to be a silent no-op.
        parser.error(
            'unknown mode "{}": expected features, tsne or umap'.format(
                args.mode))
# Step 1: for every team listed in the abbreviation CSV, parse its data,
# save the shot log to MongoDB and update the shot distance / percentage.
filename = 'NBA team name vs abbreviation.csv'
with open(filename, "r") as f:
    # NOTE: `f` is rebound from the file object to the DictReader.
    f = csv.DictReader(f, delimiter = ',')
    for i, line in enumerate(f):
        if i >= 0:
            # The website might have issues if the data for all 30 teams is
            # downloaded in one run, so it may be necessary to download team
            # by team by modifying the value of i. The data has already been
            # downloaded and saved, so you can also just run with the
            # downloaded data.
            Team_name = line['Abbreviation/Acronym']
            # Download steps, disabled because the data is already saved:
            #print 'downloading data for ' + line['Abbreviation/Acronym']
            #Game_ID(Team_name)
            #download_play_by_play(Team_name)
            #parse_shot_log(Team_name)
            print 'parsing data for ' + line['Abbreviation/Acronym']
            parse_data(Team_name)
            print 'saving to mongodb for ' + line['Abbreviation/Acronym']
            save_to_db_shot_log(Team_name)
            update_db_shot_distance_shot_percentage(Team_name)

##########################
# Step 2: predict the difficulty of making a shot (probability of missing
# it). The training dataset is the shot data from the other 29 teams and the
# testing dataset is the shot data of the target team. The shot data is
# loaded from the database 'NBA_shot_log2'.
from pymongo import MongoClient
client = MongoClient("mongodb://localhost:27017")
db = client.NBA_shot_log2
shot_difficulty = {}
def run(args):
    """Fill the averaged temperature columns of the KPI workbook from CSV logs.

    Reads the KPI workbook (``args.kpi``) and the recipe workbook
    (``args.recipe``), then for each test row whose CSV file exists in the
    ``args.data`` folder calls ``parse_data`` and writes the first three
    returned values to the "Avg Bot" column and the two columns to its left,
    plus a ROUND(AVERAGE(...)) formula to "Time Zero Temp". The workbook is
    saved back under the same name.

    Args:
        args: object with attributes ``data``, ``kpi``, ``recipe``, ``end``,
            ``top``, ``mid`` and ``bot``.
    """
    ### Setup ###
    # Folders are resolved relative to the current working directory.
    base = os.getcwd()  #+ "\\CDT Test"
    csv_folder = base + "\\" + args.data  #"+\\Use 1"
    #csv_folder = "C:\\Users\\pkellicker\\Desktop\\Coffee Code\\Git Code\\Excel\\Use 2"
    # C:\Users\pkellicker\Desktop\Coffee Code\Code
    # base = "C:\\Users\\pkellicker\\Desktop\\Coffee Code\\Git Code\\Excel"
    os.chdir(base)

    # import static files
    cesar_name = args.kpi  #= "CP300 & CF90 Cup temp-volume.xlsm"
    recipe_name = args.recipe  #= "CP300 EB2.xlsx"
    cesar = pd.read_excel(cesar_name)
    recipe = pd.read_excel(recipe_name, skiprows=1)

    # ********************************************************************* #
    # * * * work cesar to correspond w/ csv * * * #
    # DataFrame with the CSV file name column only (this skips the Tea line)
    cesar_csv = cesar.filter(items=['CSV Filename'])
    cesar_csv = cesar_csv.dropna()  # Clean up df

    # The row count is taken from this column - should be more accurate
    cesar_length = cesar.filter(items=["Pre Test Chamber Weight grams"])
    cesar_length = cesar_length.dropna()
    file_count = cesar_length.shape[0]  # get number of csv files
    # Array of the file names, fed into skip_to below
    names = cesar_csv.values

    # ********************************************************************* #
    # * * * work recipe to correspond w/ csv * * * #
    tea = 0  # flag for when we hit tea

    # Array of dates -> the field in which the Tea brew start is noted
    cesar_date = cesar.filter(items=['Date'])
    cesar_date = cesar_date.dropna()
    dates = cesar_date.values
    # Number of date entries, to avoid an index error
    num_dates = cesar_date.shape[0]

    # ********************************************************************* #
    count = 0  # NOTE(review): not used below
    array = np.zeros((file_count, 6))
    end_time = args.end

    # Load workbook for storing data & find correct columns' index
    wb = load_workbook(base + "\\" + cesar_name, read_only=False, keep_vba=True)
    ws = wb['Data']

    # Column numbers: the DataFrame position is converted to a 1-based index;
    # mid and top are the two columns to the left of "Avg Bot".
    avg_bot_col = cesar.columns.get_loc("Avg Bot") + 1
    avg_mid_col = avg_bot_col - 1
    avg_top_col = avg_bot_col - 2
    time_zero_col = cesar.columns.get_loc("Time Zero Temp") + 1

    # convert column number to letter
    avg_bot_col_let = get_column_letter(avg_bot_col)
    avg_mid_col_let = get_column_letter(avg_mid_col)
    avg_top_col_let = get_column_letter(avg_top_col)
    #time_zero_col_let = get_column_letter(time_zero_col)

    ### Loop ###
    for i in range(0, file_count):
        # Check for when tea starts. Once set, the flag shifts the test
        # number by one because one line in the xlsx is the tea marker.
        # Careful: it needs to find the tea line, i.e. start before it.
        if i < num_dates:
            if tea == 0:
                if dates[i] == "EB2 TEA BREWS":
                    tea = 1

        file_temp = names[i] + ".csv"  # add the .csv extension to file name
        # remove first 2 and last 2 chars -> ['']
        file_temp = str(file_temp)[2:-2]

        # Skip rows whose CSV file does not exist
        if os.path.isfile(csv_folder + "\\" + file_temp) == False:
            continue

        data_curr = skip_to(file_temp, "Number", csv_folder)  # current csv file working with
        data_table = skip_to(file_temp, "CH", csv_folder)
        #data_f = data_table.iloc[0:10, 0:2] # get top of csv file -> channel table
        #data_table.head(8).to_excel("csv_Headers.xlsx")

        # Boolean filter for the current row, then the row itself
        # (*** needs to step by 1 once tea has started)
        cesar_rowi = cesar['Test Number'] == i + 1 + tea
        cesar_curr = cesar[cesar_rowi]

        recipe_curr_1 = find_curr_recipe(cesar_curr, recipe, tea)
        recipe_curr = recipe_curr_1[0]
        brew_type = recipe_curr_1[1]  # NOTE(review): not used below

        data = parse_data(recipe_cur=recipe_curr,
                          data_cur=data_curr,
                          data_f=data_table,
                          cesar_cur=cesar_curr,
                          end_time=end_time,
                          top=args.top,
                          mid=args.mid,
                          bot=args.bot)

        array[i][0] = data[0]  # bot
        array[i][1] = data[1]  # mid
        array[i][2] = data[2]  # top
        array[i][3] = data[3]  # one
        array[i][4] = data[4]  # two

        # store data (row i + 2 is used for loop index i)
        #os.chdir("C:\\Users\\pkellicker\\Desktop\\Coffee Code\\Git Code\\Excel")
        ws.cell(row=i + 2, column=avg_bot_col).value = array[i][0]
        ws.cell(row=i + 2, column=avg_mid_col).value = array[i][1]
        ws.cell(row=i + 2, column=avg_top_col).value = array[i][2]
        #ws.cell(row = i+2, column = avg_top_col-2).value = array[i][4]
        #ws.cell(row = i+2, column = avg_top_col-1).value = array[i][5]

        # input average formula into column "Time Zero Temp"
        # (original author's note: "changed could be bug")
        string_1 = "=ROUND(AVERAGE(" + avg_bot_col_let + str(
            i + 2) + "," + avg_mid_col_let + str(
                i + 2) + "," + avg_top_col_let + str(
                    i + 2) + "),1)"
        ws.cell(row=i + 2, column=time_zero_col).value = string_1

    wb.save(base + "\\" + cesar_name)
def run_test(self, file_path):
    """Predict labels for the data in *file_path* and keep them for scoring.

    Stores the true labels on ``self.y`` and the predictions on
    ``self.predicted_y``.
    """
    features, labels = parse_data(file_path)
    # NOTE(review): the test features are standardised on their own, not
    # with the statistics of the training data -- confirm this is intended.
    scaled_features = preprocessing.scale(features)
    predictions = self.clf.predict(scaled_features)
    self.y = labels
    self.predicted_y = predictions
def run_training(self, file_path):
    """Fit ``self.clf`` on the standardised features parsed from *file_path*."""
    features, labels = parse_data(file_path)
    # Standardised features give the classifier better performance.
    scaled_features = preprocessing.scale(features)
    self.clf.fit(scaled_features, labels)
def get_data():
    """Fetch the raw data and return it parsed by ``parse.parse_data``."""
    raw = get_raw_data()
    parsed = parse.parse_data(raw)
    return parsed
import flask
import json
from flask import jsonify, request, abort
import parse_data as p

app = flask.Flask(__name__)
app.config["DEBUG"] = True

# parse_data supplies the pokemon list and the search parameters (the
# different characteristics of a pokemon). The list is then replaced by the
# content of the JSON resource file.
pokemons, parameters = p.parse_data()
with open("ressources\\pokemon.json", "r", encoding="utf-8-sig") as j:
    pokemons = json.load(j)


@app.route('/', methods=['GET'])
def home():
    """Landing page with a short description of the API."""
    return "<h1>POKEMON LIBRARY</h1><p>This site is a prototype API for accessing data about Pokemons.</p>"


@app.route('/api/v1/resources/pokemon/all', methods=['GET'])
def api_all_pokemon():
    """Return every pokemon as JSON."""
    return jsonify(pokemons)


@app.route('/api/v1/resources/pokemon/<int:pokemon_id>', methods=['GET'])
def api_id(pokemon_id):
    """Return the first pokemon whose '#' field equals *pokemon_id*, else 404."""
    wanted = str(pokemon_id)
    matches = []
    for candidate in pokemons:
        if str(candidate['#']) == wanted:
            matches.append(candidate)
    if not matches:
        abort(404)
    return jsonify(matches[0])
def get_data(data, data_name):
    """Split the table parsed for *data_name* into features and target.

    Returns:
        (X, Y): all columns except the last, and the last column.
    """
    parsed_tables = par.parse_data(data)
    features = parsed_tables[data_name][:, 0:-1]
    target = parsed_tables[data_name][:, -1]
    return features, target
def get_data(data, data_name):
    """Return ``(X, Y)`` for the *data_name* entry of the parsed *data*.

    X holds every column but the last; Y is the last column.
    """
    tables_by_name = par.parse_data(data)
    inputs = tables_by_name[data_name][:, 0:-1]
    labels = tables_by_name[data_name][:, -1]
    return inputs, labels
def policyGen():
    """Search for candidate policies and write those that pass the safety test.

    Repeats until 100 policies have been written: draw a random permutation
    of the episodes, run PolicyImprovement on a training slice, evaluate the
    candidate, safety-test it when the evaluation result is below 100, and
    write each passing theta to "../policy<k>.txt".
    """
    # Parse data.csv
    print("\nReading data.csv")
    numStates, numActions, n, Data = parse_data("data.csv")
    # The episode count actually used is the number of loaded episodes.
    n = len(Data)
    theta_b = np.random.rand(numStates, numActions)
    print("Data reading done of episodes: ", n)

    policy_e = Softmax(numStates, numActions)
    delta = 0.05
    c = 1.41537
    gamma = 0.95
    sigma = 0.5
    runs = 100

    i = 0  # number of policies written so far
    iterations = 0
    print("Start to generate policies\n\n")
    while i < runs:
        iterations += 1
        print("Iteration: ", iterations)

        # Random permutation of the data: the first 10% is the candidate
        # (training) slice, the last 6% is the safety (testing) slice.
        Data_sample = np.random.choice(Data, n, replace=False)
        train_limit = int(n * 0.1)
        train_c = Data_sample[:train_limit]
        test_limit = int(n * 0.94)
        test_s = Data_sample[test_limit:]

        # generate policies
        candidate_policy = PolicyImprovement(Data, train_c, test_s, theta_b,
                                             policy_e, delta, c, gamma)

        # evaluate
        selected_theta, result = candidate_policy.evaluate(sigma)
        print("Result: ", result)

        # conduct a safety test
        safety_pass = False
        if result < 100:
            print("\nTheta found and performing safety test")
            policy_c = Softmax(numStates, numActions)
            policy_c.parameters = selected_theta
            safety_pass, _ = candidate_policy.safetyTest(
                theta_c=selected_theta, policy_c=policy_c)
            # NOTE(review): the source between "safety pass:" and the Theta
            # print was destroyed by a redaction artifact; this print and
            # the `if safety_pass:` below are reconstructed from the
            # surviving `else` branch -- confirm against the original.
            print("safety pass:", safety_pass)

        if safety_pass:
            print("Theta(4s+a): \n", selected_theta)
            print("Writing returns and policy to file")
            # write to output
            write_policy_to_file("../policy" + str(i + 1) + ".txt",
                                 selected_theta)
            i += 1
        else:
            print("Safety pass failed")

        print("returns: ", result)
        print("Number of theta obtained so far: ", i)
        print("\n")

    print("Total number of theta obtained: ", i)