def save(ptosave):
    f = open('data.csv', 'a+')
    # print(ptosave)
    for pt in ptosave:
        ps = pt[0]
        if not is_necessary(ps):
            continue
        psts = pt[1]
        month = datetime.datetime.fromtimestamp(psts).strftime('%m')
        dayofmonth = datetime.datetime.fromtimestamp(psts).strftime('%d')
        dayofweek = datetime.datetime.fromtimestamp(psts).strftime('%w')
        hour = datetime.datetime.fromtimestamp(psts).strftime('%H')
        minute = datetime.datetime.fromtimestamp(psts).strftime('%M')
        data = ps + "," + getprocess_file_name(ps) + "," + month + "," + dayofmonth + "," + dayofweek + \
               "," + hour + "," + minute + "," + str(psts)
        # print(data)
        f.write(data + "\n")
        print("Logged for : '" + getprocess_file_name(ps) + "' at " +
              datetime.datetime.fromtimestamp(psts).strftime('%d-%m-%y %H:%M'))
    f.close()
    # Copy the log next to the original as odata.csv, preserving file metadata.
    src = path.realpath('data.csv')
    head, tail = path.split(src)
    dst = path.join(head, "odata.csv")
    shutil.copy(src, dst)
    shutil.copystat(src, dst)
    data_parser.parse_data()
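
# A minimal usage sketch, not from the original repo: save() appears to expect an
# iterable of (process_path, unix_timestamp) pairs. The example paths and timestamp
# values below are hypothetical.
if __name__ == '__main__':
    ptosave = [
        ('C:\\Windows\\System32\\notepad.exe', 1546300800),
        ('C:\\Program Files\\Example\\example.exe', 1546304400),
    ]
    save(ptosave)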
def main():
    # 1. Load the data.
    matches = dp.parse_data("sc_data/wc_group_matches.csv")
    teams = dp.parse_data("sc_data/team_info.csv")
    history_matches = dp.parse_data("sc_data/history_matches.csv")
    squads = dp.parse_data("sc_data/squads.csv")

    # 2. Initialise the structure that holds the results.
    match_details = dict()

    # 3. Compute a recommendation score for every match.
    for match in matches:
        # 3.1 Basic match information.
        match_id = match["No."]
        team_a, team_b = match["TeamA"], match["TeamB"]
        rank_a, code_a = get_team_info(teams, team_a)
        rank_b, code_b = get_team_info(teams, team_b)
        # 3.2 How much suspense (uncertainty) the match carries.
        uncertainty_score = get_match_uncertainty(match, rank_a, rank_b)
        # 3.3 Expected number of goals.
        goal_score = get_match_goals(history_matches, team_a, team_b)
        # 3.4 Fame of the players involved.
        fame_score = get_match_fame(squads, code_a, code_b)
        # 3.5 Combined recommendation score.
        recommendation_score = (uncertainty_score + goal_score + fame_score) / 3
        print(match_id + " " + team_a + " vs. " + team_b + ":")
        print("unc_score: %.2f goal_score: %.2f fame_score: %.2f" %
              (uncertainty_score, goal_score, fame_score))
        print("recommendation score: %.2f" % recommendation_score)
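
# Hypothetical sketch, not the repo's actual helper: one simple way a function like
# get_match_uncertainty could score suspense from the two team ranks is to map the
# rank gap onto a 0-1 scale, with closer ranks giving a higher score. The max_rank_gap
# constant and the linear scaling are assumptions made purely for illustration.
def get_match_uncertainty_sketch(match, rank_a, rank_b, max_rank_gap=50):
    # 'match' is accepted to mirror the real signature but is unused in this sketch.
    gap = abs(rank_a - rank_b)  # smaller rank gap -> more uncertain outcome
    return max(0.0, 1.0 - gap / max_rank_gap)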
def run_configs(data_dir, reviews_filename):
    # Directory of the raw data, e.g. {root}/data/preprocessed_files/electronics/reviews_Electronics_5
    filename = data_dir + reviews_filename

    # File endings.
    raw = filename + ".json.gz"
    reviews = filename + "_reviews.txt"
    ratings = filename + "_ratings.npy"

    # If the review or rating file doesn't exist, parse the raw data and create them.
    if not os.path.isfile(reviews) or not os.path.isfile(ratings):
        data_parser.parse_data(raw)
def run_pipeline():
    """
    Runs all functions in the pipeline.

    Parses tracking and events data from 52 XML files and preprocesses the
    DataFrame to conform to the Metrica Sports format. Then calculates EPV
    values to get the optimal passes, using the Friends Of Tracking code found
    in the EPV_code folder. Next, multiple features are created from the
    tracking and events data, followed by the analysis with a Linear
    Regression and a Decision Tree. After each step, files are saved to the
    /data folder.
    """
    data_parser.parse_data()
    preprocessing.preprocess()
    generate_EPV_values.generate_epv_files()
    feature_engineering.engineer_features()
    analysis.run_analysis()
def main():
    # logReg()
    in_size = 32           # length * width of image, but 1 for now
    num_Of_Input = 1000    # number of images
    out_size = 2           # size of output
    step_Size = 0.0001     # learning rate
    num_iters = 100        # iterations of gradient descent
    norm = 0.99 / 255
    normBias = 0.01

    X = np.random.randint(0, 2, (in_size, num_Of_Input))  # input layer, tested with random ints 0 to 1 for grayscale
    Y = np.random.rand(out_size, num_Of_Input)  # output-layer truths, where rows are outputs and columns are examples

    train_images_stack, train_labels_stack, test_images_stack, test_labels_stack = \
        data_parser.parse_data(norm, normBias)

    layers = [784, 16, 10]
    trainSize = 50000  # max 60000
    testSize = 10000   # max 10000

    model = Model(layers, trainSize)  # [:, 0:trainSize]
    model.generateLayers(train_labels_stack[:, 0:trainSize])
    # model.train(100, train_images_stack)  # TODO: the 10 vs 100 caused an error
    model.batch_train(512, train_images_stack[:, 0:trainSize], 100, trainSize)
    model.print_cost()

    model.update_num_Of_Input(testSize)
    model.test(test_images_stack[:, 0:testSize], test_labels_stack[:, 0:testSize])
    model.feedForward(test_images_stack)
    model.print_cost()
def process(self):
    """Process the file at self.file_path.

    :return: ResultObj
    """
    with open(self.file_path, "r") as f:
        data = f.read()
    result = parse_data(data)
    return ResultObj(self.file, **result)
def create_training_array(train_data):
    x_train_arr = []
    y_train_arr = []
    count = 0
    for value in dp.parse_data(train_data):
        if value[1] == 0.0:
            # Downsample the 0.0 class: keep only ~10% of these samples.
            if random.uniform(0, 1) > 0.9:
                x_train_arr.append(value[0])
                y_train_arr.append(value[1])
                count += 1
        else:
            x_train_arr.append(value[0])
            y_train_arr.append(value[1])
            count += 1
    x_train = x_train_arr
    y_train = y_train_arr
    return (x_train, y_train)
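
# Hedged usage sketch, not from the original repo: assuming dp.parse_data(train_data)
# yields (features, label) pairs, the effect of the ~10% keep rate on the 0.0 class
# can be checked like this. "train.csv" is a hypothetical file name.
x_train, y_train = create_training_array("train.csv")
zero_count = sum(1 for label in y_train if label == 0.0)
print("kept %d samples, %d of them labelled 0.0" % (len(y_train), zero_count))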
import numpy as np
import matplotlib.pyplot as plt
from utils import *
import statistics

if __name__ == '__main__':
    file_name = 'dataset_1'
    sample_num = 200  # 100
    # k = 5
    min_k = 2
    max_k = 50
    k_step = 4  # 2
    # print('Working on %s, Randomly select %d samples, %d Centers' % (file_name, sample_num, k))
    print('Working on %s, Randomly select %d samples, k from %d to %d' %
          (file_name, sample_num, min_k, max_k))
    parsed_data = parse_data(file_name)
    print('Succeed in Parsing Data')

    tot_exp = 10  # 20
    criteria = [9, 8, 3]  # Gender, race, highest degree
    criteria_text = ['Gender', 'Race', 'Educational Status']

    # for alpha in [0.8, 1.0, 1.2]:
    for alpha in [1.0]:
        # for beta in [0.8, 1.0, 1.2]:
        # for beta in [0.8, 1.0]:
        for beta in [1.0]:
            k_values = range(min_k, max_k + 1, k_step)
            control_kcenter_avg, control_kcenter_betas_avg = [], []
            control_kcenter_grouped_betas_avg = []
            control_kmedian_avg, control_kmedian_betas_avg = [], []
option = parser.parse_args()

file_name = "./data/" + option.file_name
output_file = "./result/" + option.file_name + "_result.txt"
rho_value = option.rho
min_k = 2
max_k = 10
k_step = 1
sample_num = option.sample_num
center_num = option.center_num
sample_type = "Random" if option.sample_type else "Full"  # Choose from Full, Random
tot_exp = 1  # Number of experiments run for each k

print('Working on %s, Randomly select %d samples, k from %d to %d' %
      (file_name, sample_num, min_k, max_k))

parsed_data, kmeans_parsed_data = parse_data(file_name)
dim = len(kmeans_parsed_data[0])
# Assertions make sure we get an identical copy of the data in both formats.
assert (len(kmeans_parsed_data[0]) == parsed_data[0].dim)
assert (len(parsed_data) == len(kmeans_parsed_data))
print('Succeed in Parsing Data')

if sample_type == "Random":
    all_clients, reverse_map, _ = random_sample(parsed_data, sample_num)
    print('Succeed in Sampling %d Clients' % len(all_clients))
    all_centers, original_centers = kmeansinitialization(
        kmeans_parsed_data, center_num)
    print('Succeed in Sampling %d Centers' % len(all_centers))
else:
    all_centers = parsed_data
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import data_parser as dp
import database_connector as connector
import text_processing as text

select_reviews = "SELECT id, text FROM reviews WHERE positive IS NULL LIMIT 10000"
set_sentiment = "UPDATE reviews SET positive = %s WHERE id = %s"

if __name__ == "__main__":
    generator = dp.parse_data("train_reviews.json.gz")
    stop_words = nltk.corpus.stopwords.words('english')
    dbcursor = connector.get_cursor()

    # Setting output labels.
    reviews = list()
    for review in generator:
        overall = float(review.get("overall"))
        if overall > 3:
            review["label"] = 1
        else:
            review["label"] = 0
        reviews.append(review)

    # Splitting the data.
    train_data, test_data = train_test_split(reviews, train_size=0.8, shuffle=True)
# Adding argument parser.
parser = argparse.ArgumentParser()
parser.add_argument('--data', '-d', help='add path to data file')
args = parser.parse_args()

# Global variables.
DATA_FNAME = args.data

if __name__ == '__main__':
    if DEBUG_MODE:
        data_dict = mock_data.DATA_DICT
    elif path.isfile('../data/retail_data_dict.pkl'):
        data_dict = pkl.load(open('../data/retail_data_dict.pkl', 'rb'))
    else:
        data_dict = parse_data(DATA_FNAME)
        print('Data dictionary formed...')
        print('storing data dict for retail.txt in pickled dictionary')
        pkl.dump(data_dict, open('../data/retail_data_dict.pkl', 'wb'))

    if DEBUG_MODE:
        kUI_idx = mock_data.KUI_IDX
        arc = mock_data.ARC
    else:
        kUI_idx = get_kui_index(data_dict)
    print('Done kUI Index and ARC...')

    num_slots = sum([len(k) for k in data_dict.keys()])
    type_slots = 24
    zipf = 0.7
user="******", password="******", database="recommender", auth_plugin="mysql_native_password" ) dbcursor = db.cursor() insert_book = "INSERT INTO books (asin, title, description) VALUES (%s, %s, %s)" select_book = "SELECT id FROM books WHERE asin = " insert_review = "INSERT INTO reviews (book_id, weight, text) VALUES (%s, %s, %s)" delete_book = "DELETE FROM books WHERE id = " delete_reviews = "DELETE FROM reviews WHERE book_id = " if __name__ == "__main__": generator = dp.parse_data("meta_Books.json.gz") counter = 0 train_counter = 0 # Adding books to database. for book in generator: if counter < 103443: if "description" in book and "title" in book: desc = book.get("description") if len(desc.split()) > 100: counter += 1 asin = book["asin"] title = book["title"] new_book = (asin, title, desc)
                    action='store_true')
parser.add_argument('--plot_overhead',
                    required=False,
                    default=False,
                    action='store_true')
parser.add_argument('--no_norm',
                    dest='norm',
                    action='store_false',
                    required=False,
                    default=True)
opts = vars(parser.parse_args())

script_dir = os.path.dirname(os.path.normpath(__file__))

data_clean = None
try:
    with open(os.path.join(script_dir, 'data.pickle'), 'rb') as file:
        data_clean = pickle.load(file)
except:
    pass
finally:
    if data_clean is None or opts['rebuild']:
        data_clean = data_parser.parse_data(rebuild=opts['rebuild'])
        with open(os.path.join(script_dir, 'data.pickle'), 'wb') as file:
            pickle.dump(data_clean, file)

options = {
    k: opts[k]
    for k in opts
    if opts[k] is not None and k != 'rebuild'
}
plotter(data_clean, **options)