def outputs(model, pred, df, csv_path_name):
    df = clean_data(df)
    pred = clean_data(pred)
    y = df['price']
    x = df.drop(columns='price')
    model = model()
    model.fit(x, y)
    y_pred = model.predict(pred)
    return pd.DataFrame(y_pred, columns=["price"]).to_csv(csv_path_name)
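# Usage sketch for outputs() above (hedged: RandomForestRegressor, the tiny
# frames, and the output path below are illustrative, and clean_data is
# assumed to be in scope). Note that the `model` argument is a class, not an
# instance, since outputs() instantiates it with model().
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

train_df = pd.DataFrame({'size': [50, 80, 120], 'price': [100, 160, 230]})
pred_df = pd.DataFrame({'size': [60, 90]})
outputs(RandomForestRegressor, pred_df, train_df, './predictions.csv')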
def main(argv):
    df = clean_data(argv[0])
    X = df.drop(['PassengerId', 'Survived'], axis=1)  # positional `axis` was removed in pandas 2.0
    y = df['Survived']
    select_features(X, y)
def test(testfile, outputfile, best=False):
    features_n = 14 if best else 18  # number of features used
    test_data = pd.read_csv(testfile)
    if best:
        # "測項" is the measurement-item column; drop four low-value features
        test_data = test_data[test_data["測項"] != "RAINFALL"]
        test_data = test_data[test_data["測項"] != "THC"]
        test_data = test_data[test_data["測項"] != "WD_HR"]
        test_data = test_data[test_data["測項"] != "WIND_DIREC"]
    test_data = clean_data(test_data)

    # flatten every block of features_n rows into one test sample
    X_test = []
    for i in range(0, test_data.shape[0], features_n):
        X_test.append(test_data[i:i + features_n, :].ravel())
    X_test = np.array(X_test)
    # prepend a bias column of ones
    X_test = np.concatenate((np.ones((X_test.shape[0], 1)), X_test), axis=1)

    weightsfile = "./models/weights-best.npy" if best else "./models/weight.npy"
    Y_test = np.dot(X_test, np.load(weightsfile))

    # create the output directory if it does not exist
    dirname = os.path.dirname(outputfile)
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    with open(outputfile, "w", newline="") as csvfile:  # newline="" avoids blank rows on Windows
        writer = csv.writer(csvfile)
        writer.writerow(["id", "value"])
        for i, y in enumerate(Y_test):
            writer.writerow(["id_%d" % i, y])
def main(country, yeari, yearf):
    data = get_data()
    cldata = clean.clean_data(data)
    filfix = filter_fixer(cldata, country, yeari, yearf)
    pop_plot_r, gr_plot_r = analyse_this(filfix)
    nomb, altnomb, capi, regi, langu, cpopul = enrich_that(filfix)
    rpdf = make_PDF(nomb, altnomb, capi, regi, langu, cpopul,
                    pop_plot_r, gr_plot_r)
    return rpdf
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-source", choices=["local", "remote"], nargs=1,
                        help="where the data should be read from")
    args = parser.parse_args()
    location = args.source[0]
    if location == "local":
        download()
        grab_data_from_downloaded_raw_files()
    else:
        grab_data_by_scraping_and_api_requests()
    future_dates, past_dates_format = clean_data()
    draw_picuture(future_dates, past_dates_format)
def main():
    dir_path = './raw_html'
    # dir_path = input("Input data file path: ")
    df_record = []
    file_list = get_file_list(dir_path)
    # print(file_list)
    for file_name in file_list:
        file_path = os.path.join(dir_path, file_name)
        tree = read_html(file_path)
        record = parse_html(tree)
        df_record.append(record)
    df = pd.DataFrame(df_record)
    lagou = clean_data(df)
    # print(df.head())
    print(lagou)
    lagou.to_csv('./output/lagou.csv', encoding='gbk')
find_thresh(preds, y_test, 10)
cross_val_recall_auc(X_train, y_train, best_rf_model)

####################################################
#################### MAIN ##########################
if __name__ == '__main__':
    df = pd.read_json('website/data/data.json')

    # Clean the data
    cleaned = clean.clean_data(df)
    print(cleaned.columns)

    # Get targets and cleaned features
    y = clean.get_target(cleaned)

    # log_reg_cols = ['user_age', 'age_dummy']
    # fit_logreg(y, cleaned, log_reg_cols)

    # Available cols: 'USD', 'GBP', 'CAD', 'AUD', 'EUR', 'NZD', 'MXN',
    # 'age_dummy', 'user_age', 'payoutdiff', 'eventdiff', 0.0, 1.0, 3.0,
    # 'gts', 'num_order', 'num_payouts', 'payee_exists'
    rf_cols = [
        'USD', 'GBP', 'CAD', 'AUD', 'EUR', 'NZD', 'MXN', 'age_dummy',
        'user_age', 'payoutdiff', 'gts', 'num_order', 'num_payouts',
        'payee_exists', 'dict_elements'
    ]
    rf_model = fit_rf(y, cleaned, rf_cols)
def main(argv):
    df_train = clean_data(argv[0])
    df_test = clean_data(argv[1])
    select_parameters(df_train, df_test)
def main(argv):
    df_train = clean_data(argv[0])
    df_test = clean_data(argv[1])
    make_prediction(df_train, df_test, fit_tree, './output/prediction_tree.csv')
modules = ["CMY 127"]
# modules = ["CMY 382", "CMY 383", "CMY 384", "CMY 385"]
cohorts = [2011, 2012, 2013, 2014, 2015]  # was misspelled "chorts"
show_plans = [
    'Chemistry', 'Microbiology', 'Biochemistry', 'Physics', 'Geology',
    'Medical', 'Biological', 'Human Physiology', 'Genetics',
    'Veterinary Science'
]
names = ["Marks", "Plans", "Students"]
file_names = {
    name: [
        "../Data/{0}/{1}".format(name, f)
        for f in os.listdir('../Data/{0}'.format(name))
    ]
    for name in names
}
plans = clean.read_plan(file_names["Plans"])
for cohort in cohorts:
    sn, marks = clean.read_marks(file_names["Marks"], modules, cohort, True)
    df = clean.read_student(file_names["Students"], cohort, sn, plans, marks)
    dfnew = clean.clean_data(df, show_plans, 15)
    for key, val in d3s.items():
        gen_html("base_bl.html", get_json.get(dfnew), modules[0], str(cohort),
                 val, "../Output/Dark/{}".format(key))
        gen_html("base_mbl.html", get_json2.get(dfnew), modules[0], str(cohort),
                 val, "../Output/Dark/{}".format(key), True)
model_platforms = [RandomForest]  # , LogisticRegression]
accuracy_dict = dict(
    zip([platform.__name__ for platform in model_platforms],
        [[] for i in range(len(model_platforms))]))
raw_df = pd.read_csv(filepath)
'''
bin_map = {'age': linspace(0, 100, 11).tolist()}#,
           #'capital.gain': linspace(0, 100000, 21).tolist()}#,
           #'capital.loss': unique(raw_df['capital.loss']).tolist(),
           #'hours.per.week': linspace(0, 100, 11).tolist()}
raw_df = cont2cat(raw_df, bin_map)
'''
for platform in model_platforms:
    df = clean_data(raw_df.copy(), data_cleaning_dict)
    model = platform()
    df1 = model.format_attr(df)
    # 100 random train/test splits per platform, collecting test accuracy
    for it in range(100):
        X_train, X_test, y_train, y_test = split_data(df1, response_variable)
        trained_model = model.create_model(X_train, y_train)
        accuracy_dict[platform.__name__].append(
            model.get_results(X_test, y_test)['Accuracy'])
'''
for it in range(100):
    skf = StratifiedKFold(n_splits=10)
def main(argv):
    df_train = clean_data(argv[0])
    df_test = clean_data(argv[1])
    make_prediction(df_train, df_test, fit_linear_model,
                    './output/prediction_logit.csv')
codes_dict = dict_all  # all code:plan pairs for every year's worth of files
# Generate the master dataframe containing all data for all students
master_ = process.get_master(filelist, years)
pbar = ProgBar()
i = 1
for k, p in plans_desc.items():
    pbar.pvar.set(i / len(list(plans_desc.values())) * 100)
    pbar.update()
    # Get the raw dataframe with all student data for plan p (code k)
    df_raw = process.get_plan(k, master_, cohort, int(mg_dict[k]), codes_dict)
    # Check that df has enough data before continuing
    # (maybe change to check whether multiple "terms" appear in df)
    if len(df_raw) > 10:
        df_clean = clean.clean_data(df_raw, cohort, int(years[-1]))
        newname = p.replace(":", " ").replace(" ", "_")
        name = "{0}_{1}".format(str(cohort), newname)
        gen_html(get_json.get(df_clean), name, saveloc)
    if int(pbar.pvar.get()) == 100:
        pbar.button.config(state="normal", command=pbar.finish_button)
    i += 1
pbar.mainloop()
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from clean import clean_data
from train import get_scores

df = pd.read_csv('avocado.csv')
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train = clean_data(df_train)
avoc_labels = df_train['AveragePrice']
df_train.drop(['AveragePrice'], axis=1, inplace=True)

lin_reg = LinearRegression()
lin_scores = get_scores(lin_reg, df_train, avoc_labels)
print(lin_scores.mean())

tree_reg = DecisionTreeRegressor()
tree_scores = get_scores(tree_reg, df_train, avoc_labels)
print(tree_scores.mean())

rf_reg = RandomForestRegressor()
rf_scores = get_scores(rf_reg, df_train, avoc_labels)
print(rf_scores.mean())
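# get_scores lives in train.py and is not shown in this excerpt. Purely as a
# hedged sketch of what such a helper commonly looks like (an assumption, not
# the author's code): per-fold cross-validated RMSE for a given estimator.
from sklearn.model_selection import cross_val_score


def get_scores_sketch(model, X, y):
    # cross_val_score returns negated MSE for this scoring string
    neg_mse = cross_val_score(model, X, y,
                              scoring='neg_mean_squared_error', cv=10)
    return np.sqrt(-neg_mse)  # per-fold RMSE, so .mean() matches the prints above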
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, f1_score

df = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

# clean dataset
clean = clean_data(df)

# downsample the majority (no-impact) class to rebalance the dataset
df_no_impact = clean[clean.Impact == 0]
df_impact = clean[clean.Impact == 1]
df_no_impact = df_no_impact.sample(frac=0.2, replace=False, random_state=1)
df_cleaned = pd.concat([df_impact, df_no_impact])

X = df_cleaned.drop(columns=['Impact'])
y = df_cleaned['Impact']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=0, stratify=y)
X_train.head()

# logistic regression
model_lr = LogisticRegression(
    'Express SocSci views', 'Science is difficult', 'Observe experiments',
    'Conduct experiments', 'Solve science problems', 'Express science views',
    'Watch TV', 'Read magazine', 'Read a book', 'Play games',
    'Help in household'
]

# target variables
target = ['performance', 'Maths %', 'Reading %', 'Science %', 'Social %']

# a new column "performance" is added as the average of
# ('Maths %', 'Reading %', 'Science %', 'Social %')
marks["performance"] = marks[['Maths %', 'Reading %', 'Science %',
                              'Social %']].mean(axis=1, skipna=True)

# split the cleaned data into a feature set (X) and target (y)
X, y = clean_data(marks, category, 'performance')

# print the stats of the cleaned data
print("No. of null values in X:\n", X.isnull().sum())
print("No. of null values in y:\n", y.isnull().sum())

# Create a pipeline
pipe = Pipeline(steps=[
    # ('poly', PolynomialFeatures(degree=2, interaction_only=False)),  # uncomment to include extra polynomial features
    # ('selK', SelectKBest(score_func=f_regression, k='all')),
    # ('pca', PCA(n_components=10)),
    # LinearRegression's `normalize` argument was removed in scikit-learn 1.2;
    # scale explicitly with StandardScaler (sklearn.preprocessing) instead
    ('scaler', StandardScaler()),
    ('regression', LinearRegression(fit_intercept=True))
])
pipe.fit(X, y)
# Print the regressor
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import clean

customers, transactions = clean.get_data("Datasource")
charities = clean.get_charities(transactions)
data = clean.clean_data(customers, transactions, charities)

# 70/30 chronological train/test split
cut = round(data.shape[0] * 0.7)
train = data.iloc[:cut, :]
test = data.iloc[cut:, :]
X_train, y_train = clean.sep_Xy(train, charities)
X_test, y_test = clean.sep_Xy(test, charities)

model = Sequential()
model.add(Dense(250, activation="relu", input_dim=X_train.shape[1]))
model.add(Dense(150, activation="relu"))
model.add(Dense(y_train.shape[1], activation="relu"))
model.compile(optimizer="adam", loss="mean_squared_error",
              metrics=["accuracy", "mae", "mse"])
model.fit(X_train, y_train, batch_size=4, epochs=5)
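# Hedged follow-up sketch (not in the source): evaluate on the held-out 30%
# using the variables above. Keras' model.evaluate returns the loss followed
# by the compiled metrics, in the order they were passed to compile().
loss, acc, mae, mse = model.evaluate(X_test, y_test, batch_size=4)
print("test MAE: %.4f, test MSE: %.4f" % (mae, mse))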
def shape_element(element):
    if element.tag != "node" and element.tag != "way":
        return None
    # Create an empty JSON object.
    # Note: "node" is used as a generic name here for both "nodes" and "ways".
    node = {}
    node['created'] = {}
    node['type'] = element.tag
    # Create an empty list for the GPS position
    if node['type'] == "node":
        node['pos'] = []
    # Insert all attributes of the element
    key_list = element.keys()
    for key in key_list:
        # Insert "created" values into a nested dictionary
        if key in created:
            node['created'][key] = element.attrib[key]
        # Insert "pos" coordinates, ordered, into a list
        elif key == 'lat' or key == 'lon':
            node['pos'] = [float(element.attrib['lat']),
                           float(element.attrib['lon'])]
        # Create a key/value entry for all other attributes
        else:
            node[key] = element.attrib[key]
    # Loop through all children of the element and include their attributes
    for child in element:
        if child.tag == 'tag':
            tag_key, tag_value = child.attrib['k'], child.attrib['v']
            # If the tag is all lowercase, clean the data if needed,
            # then insert it
            if lower.search(tag_key) is not None:
                if tag_key in needs_cleaning:
                    tag_value = clean.clean_data(tag_key, tag_value)
                node[tag_key] = tag_value
            # If there is a colon in the key, split it and create
            # a nested dictionary as the entry
            elif lower_colon.search(tag_key) is not None:
                colon_location = tag_key.find(":")
                main_key = tag_key[:colon_location]
                nested_key = tag_key[colon_location + 1:]
                # name:(language) contains foreign characters and is only
                # present on a few entries. These are skipped.
                if main_key == "name":
                    # Continue to the next child in the element
                    continue
                # Categories like "building" vs. "building:levels" conflict
                # because "building" holds a string while "building" with a
                # colon wants to hold a dictionary. To preserve all data,
                # "building" is converted to a nested dictionary and the old
                # value is stored under ['building']['type'].
                if main_key in node.keys() and isinstance(node[main_key], str):
                    temp = node[main_key]
                    node[main_key] = {}
                    node[main_key]["type"] = temp
                if main_key not in node.keys():
                    node[main_key] = {}
                if nested_key in needs_cleaning:
                    tag_value = clean.clean_data(nested_key, tag_value)
                node[main_key][nested_key] = tag_value
        # Add node references in list format
        if child.tag == 'nd':
            if "node_refs" not in node:
                node["node_refs"] = []
            node["node_refs"].append(child.attrib['ref'])
    return node
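# Hedged usage sketch for shape_element(): the module-level names it relies on
# (created, lower, lower_colon, needs_cleaning) are not shown in this excerpt,
# so the values below are illustrative assumptions, not the author's.
import re
import xml.etree.ElementTree as ET

created = {"version", "changeset", "timestamp", "user", "uid"}
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
needs_cleaning = set()

elem = ET.fromstring('<node id="1" lat="51.5" lon="-0.1" user="alice">'
                     '<tag k="addr:street" v="Baker Street"/></node>')
print(shape_element(elem))
# -> {'created': {'user': 'alice'}, 'type': 'node', 'pos': [51.5, -0.1],
#     'id': '1', 'addr': {'street': 'Baker Street'}}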
    # temp = spam_emails[j].get_content()
    msg = spam_emails[j]
    print(j)
    for part in msg.walk():
        # keep only the plain-text part of the email
        if part.get_content_type() == 'text/plain':
            try:
                temp = part.get_content()
            except LookupError:
                print('exception at ', j)  # was `i`, a leftover from the ham loop
    spam.append(temp)

# clean up the email data
new_ham, new_spam = clean.clean_data(ham, spam)

# write every text email into an individual text file
new_ham_location = "C:\\Users\\Student\\Desktop\\email_files\\ham"
new_spam_location = "C:\\Users\\Student\\Desktop\\email_files\\spam"
count = 1
for script in new_ham:
    filename = str(count) + '.txt'
    print(filename, type(filename))
    filename = new_ham_location + '\\' + filename
    print(filename, type(filename))
    script_file = open(filename, "w", encoding='utf-8')
    script_file.write(script)
    script_file.close()
    count += 1
def train(X_train, Y_train, b=10.):
    # add bias column
    X_train = np.concatenate((np.full((X_train.shape[0], 1), b), X_train),
                             axis=1)
    w = np.zeros(X_train.shape[1])
    lr = 0.1
    iteration = 10000
    n = X_train.shape[0]
    grad_squared_sum = np.zeros(X_train.shape[1])  # for AdaGrad
    for t in range(iteration):
        loss = Y_train - np.dot(X_train, w)
        RMSE = math.sqrt(np.sum(loss ** 2) / n)
        grad = -2 * np.dot(X_train.T, loss)
        grad_squared_sum += grad ** 2
        w = w - lr * grad / np.sqrt(grad_squared_sum)
        print("Iteration %d: RMSE = %f" % (t, RMSE))
    np.save("./models/weights-best", w)
    return w


if __name__ == "__main__":
    features_n = 14  # number of features used
    train_data_list = []
    for i in range(1, len(sys.argv)):
        train_data_list.append(pd.read_csv(sys.argv[i]))
    data = clean_data(pd.concat(train_data_list))
    data = flatten_data_remove_outliers(data, features_n)
    X_train, Y_train = parse_train_data(data)
    w = train(X_train, Y_train)
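# Reference sketch (not the author's code): the AdaGrad step used in train()
# above, with a small epsilon guard the original omits. If any gradient
# component is exactly zero on the first iteration, 0 / sqrt(0) yields NaN.
import numpy as np


def adagrad_step(w, grad, grad_squared_sum, lr=0.1, eps=1e-8):
    # accumulate squared gradients, then scale the step per coordinate
    grad_squared_sum += grad ** 2
    w = w - lr * grad / (np.sqrt(grad_squared_sum) + eps)
    return w, grad_squared_sum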
import pandas as pd
import clean

# read and clean the data
train = pd.read_csv('train.csv')
clean.clean_data(train)  # assumed to clean `train` in place
test = pd.read_csv('test.csv')

# first model (all women survive, all men die),
# independent of the training set
test["survived_prediction"] = 0
test.loc[test.Sex == "female", "survived_prediction"] = 1

submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": test["survived_prediction"]
})
submission.to_csv("titanic_gender.csv", index=False)
driver.find_element_by_tag_name('body').send_keys(Keys.ARROW_RIGHT)
try:
    video_time = driver.find_element_by_class_name('ytp-time-duration')
    if video_time.text == '':
        # an ad is playing: wait it out, skip ahead, then re-read the duration
        time_ad = int(current_ad.split('\n')[1][-2:])
        time.sleep(time_ad + 1)
        driver.find_element_by_tag_name('body').send_keys(Keys.ARROW_RIGHT)
        video_time = driver.find_element_by_class_name('ytp-time-duration')
        time_ls.append(str(video_time.text))
    else:
        time_ls.append(str(video_time.text))
    print("Time: ", video_time.text)
except Exception:  # was a bare except
    time_ls.append('0')
    print("Time: 0")
i += 1
print('------------------------------------------')
insert_to_df()

end = time.time()
print(f'Time needed for scraping {end_video - start_video} videos:',
      timer(start, end))
driver.close()

# ------------------------------------------------------------------
# Data cleaning
clean_data(df_export, yt_channel, df_error)

# Create dashboard
create_dashboard(df_export, yt_channel, sub)
import logging
import os

import clean
import model
import scraper

# region SETTINGS
logging.basicConfig(level='INFO')
ROWS_TO_SCRAPE = 600
sb_url = 'https://sfbay.craigslist.org/search/sby/apa'
nb_url = 'https://sfbay.craigslist.org/search/nby/apa'
# endregion

# Scrape Craigslist if scraped data is not present
if not os.path.isfile('../data/northbay.csv') or not os.path.isfile(
        '../data/southbay.csv'):
    southbay = scraper.scrape_apts(ROWS_TO_SCRAPE, sb_url)
    northbay = scraper.scrape_apts(ROWS_TO_SCRAPE, nb_url)
    southbay.to_csv('../data/southbay.csv')
    northbay.to_csv('../data/northbay.csv')

if not os.path.isfile('../data/merge.csv'):
    clean.clean_data()

# Option to change to 'merge2.csv'
x, y = clean.prep_model('merge2.csv')

# Fit model and generate plots
lm, selected_x = model.model(x, y)

# List of model coefficients; no application at the moment
coefficient_list = model.return_coefficients(lm, selected_x)
model.plot_analyze(lm, selected_x, y)
def preprocess():
    logging.debug('preprocess function in process')
    # Load dataset (location of the test data)
    os.listdir('C:\\Users\\Student\\Desktop\\extension_data\\hamnspam')
    ham_filenames = [
        name for name in sorted(
            os.listdir(
                'C:\\Users\\Student\\Desktop\\extension_data\\hamnspam\\ham'))
        if len(name) > 20
    ]
    spam_filenames = [
        name for name in sorted(
            os.listdir(
                'C:\\Users\\Student\\Desktop\\extension_data\\hamnspam\\spam'))
        if len(name) > 20
    ]
    # build lists so index values match filenames
    ham_emails = [
        load_email(is_spam=False, filename=name) for name in ham_filenames
    ]
    spam_emails = [
        load_email(is_spam=True, filename=name) for name in spam_filenames
    ]

    # 80/20 train/test split
    numTrainHam = round(len(ham_emails) * 0.8, 0)
    numTrainSpam = round(len(spam_emails) * 0.8, 0)
    train_emails = []
    test_emails = []
    trainHam = 0
    trainSpam = 0
    testHam = 0
    testSpam = 0

    logging.debug('Entering ham for loop of adding data to test/train lists')
    for i in range(len(ham_emails)):  # was range(0, len - 1), which skipped the last email
        # temp = ham_emails[i].get_content()
        msg = ham_emails[i]
        for part in msg.walk():
            # keep only the plain-text part of the email
            if part.get_content_type() == 'text/plain':
                temp = part.get_content()
                if i < numTrainHam:
                    train_emails.append(temp)
                    trainHam += 1
                    logging.debug('current # ham emails for training: %d' %
                                  trainHam)
                else:
                    test_emails.append(temp)
                    testHam += 1
                    logging.debug('current # ham emails for testing: %d' %
                                  testHam)
    logging.debug(
        'Finished ham - total # ham training emails: %d - total # ham test emails: %d'
        % (trainHam, testHam))

    logging.debug('Entering spam for loop of adding data to test/train lists')
    for j in range(len(spam_emails)):  # same off-by-one fix as above
        # temp = spam_emails[j].get_content()
        msg = spam_emails[j]
        for part in msg.walk():
            # keep only the plain-text part of the email
            if part.get_content_type() == 'text/plain':
                temp = part.get_content()
                if j < numTrainSpam:
                    train_emails.append(temp)
                    trainSpam += 1
                    logging.debug('current # spam emails for training: %d' %
                                  trainSpam)
                else:
                    test_emails.append(temp)
                    testSpam += 1
    logging.debug('Finished spam - total # spam training emails: %d' %
                  trainSpam)

    # train labels
    train_labels = []
    for x in range(0, trainHam):
        train_labels.append("ham")
        logging.debug('current # ham labels: %d' % (x + 1))
    ham_labels_len = len(train_labels)
    for x in range(0, trainSpam):
        train_labels.append("spam")
        logging.debug('current # spam labels: %d' % (x + 1))  # was mislabelled "ham"
    logging.debug('total # ham labels: %d - total # spam labels: %d' %
                  (ham_labels_len, len(train_labels) - ham_labels_len))

    # test labels
    test_labels = []
    for x in range(0, testHam):
        test_labels.append("ham")
    for x in range(0, testSpam):
        test_labels.append("spam")

    # clean up the email data
    logging.debug('going to clean.py')
    train_emails_cfd, test_emails_cfd = clean.clean_data(
        train_emails, test_emails)
    logging.debug('back from clean.py')
    return train_emails_cfd, train_labels, test_emails_cfd, test_labels
import matplotlib.pyplot as plt
import pandas as pd
from clean import clean_data

future_dates, past_dates_format = clean_data()


def future_picture(future_dates):
    # plot future_temperature_from_url against future_temperature_from_api
    plt.style.use('ggplot')
    fig = plt.figure(figsize=(10, 6))
    colors1 = '#6D6D6D'
    data1 = pd.read_csv(
        'future_temperature_from_url.csv')['Day Temperature'].values.tolist()
    for i in range(len(data1)):
        data1[i] = float(data1[i][:-1])  # strip the trailing degree symbol
    data3 = pd.read_csv('future_temperature_from_url.csv'
                        )['Night Temperature'].values.tolist()
    for i in range(len(data3)):  # was len(data1)
        data3[i] = float(data3[i][:-1])
    data2 = pd.read_csv('future_temperature_from_api_cleaned.csv'
                        )['Day Temperature'].values.tolist()
    data4 = pd.read_csv('future_temperature_from_api_cleaned.csv'
                        )['Night Temperature'].values.tolist()
    plt.plot(future_dates, data1, label='day_temperature_url')
    plt.plot(future_dates, data2, label='day_temperature_api')
    plt.plot(future_dates, data3, label='night_temperature_url')
    plt.plot(future_dates, data4, label='night_temperature_api')