def parse_file(self, sgmfiles):
    '''Accesses the given files and finds the content between the required tags.'''
    for filename in sgmfiles:
        with open(filename, 'r') as fopen:
            lines = fopen.read()
        pattern = r"<REUTERS[^>]*>([\s\S]*?)</REUTERS>"
        reuters = re.findall(pattern, lines)
        for line in reuters:
            rec_dict = {}
            # Remove special characters from the obtained text.
            line = CleanData().clean_spc_articles(line)
            rec_dict['REUTERS'] = line
            t_pttrn = r"<TITLE>(.*?)</TITLE>"
            title = re.findall(t_pttrn, line)
            if len(title) != 0:
                title = CleanData().clean_spc_articles(title[0])
                rec_dict['TITLE'] = title
            txt_pttrn = r"<TEXT[^>]*>([\s\S]*?)</TEXT>"
            text = re.findall(txt_pttrn, line)
            if len(text) != 0:
                text = CleanData().clean_spc_articles(text[0])
                rec_dict['TEXT'] = text
            # Insert each article into the Articles collection of ReuterDb.
            ConnectDb().insert_data(rec_dict, self.connect, 'ReuterDb', 'Articles')
def process_data(self):
    '''Cleans the fetched tweets and inserts them into the process database.'''
    data = ConnectDb().find_data(self.connect, 'RawDb', 'RawTweets')
    for value in data:
        processed_data = {}
        for prop in value:
            if isinstance(value[prop], bson.objectid.ObjectId) or value[prop] is None:
                if value[prop] is None:
                    processed_data[prop] = value[prop]
            else:
                try:
                    formatted = CleanData().clean_emoji_data(value[prop])  # remove emoji
                    clean_url = CleanData().clean_url_data(formatted)      # remove URLs
                    clean_spc = CleanData().clean_spc_chars(clean_url)     # remove special characters
                    processed_data[prop] = clean_spc
                except Exception:
                    pass
        # Insert the cleaned tweet into the process database.
        ConnectDb().insert_data(processed_data, self.connect, 'ProcessDb', 'Tweets')
def predict(self):
    X = self.__X_test.drop('rent_amount_boxcox', axis=1)
    y = self.__X_test['rent_amount_boxcox']
    ypred = self.__xgbRegression.predict(X)
    print('MAE:', metrics.mean_absolute_error(y, ypred))
    print('MSE:', metrics.mean_squared_error(y, ypred))
    print('RMSE:', np.sqrt(metrics.mean_squared_error(y, ypred)))
    print('r2_score:', metrics.r2_score(y, ypred))

"""
def test(self, lambda_):
    test_data = np.array([2, 1, 4, 2, 3, 0, 1, 0, 1, 5, 3, 2, 10])
    ypred = self.__linearRegression.predict(test_data)
    scipy.special.inv_boxcox(ypred, lambda_)
"""

start = time.time()

clean_data = CleanData("house_price.csv")
data = clean_data.fit()

encode_data = EncodeData(data)
data = encode_data.fit()

corr = Correlation(data)
data = corr.corr_fit()

split = SplitData(data)
X, x = split.fit()
para_x = split.getParameters()
print(para_x)

xgb = XGBReg(X, x)
xgb.fit_()
xgb.predict()

print("Total time taken:", time.time() - start)
from download_data import vars_to_pull
from clean_data import CleanData
from analyze_data import Results, initial_data
from analyze_data_sklearn import SkLearnResults, init_data
from linear_regression_analysis import LinearData

if __name__ == '__main__':
    data_cleaner = CleanData(vars_to_pull, is_test=True)
    # Note: downloading all data can take 12+ hours.
    # The download is skipped if the data already exists (if a partial download
    # occurred, the files must be moved before a fresh download).
    data_cleaner.run_download()
    data_cleaner.clean_data()

    # Calculating residuals can take 30+ minutes.
    results = Results(initial_data)
    results.run_analyze()

    sklresults = SkLearnResults(init_data)
    sklresults.run_sk_analysis()

    lrd = LinearData(init_data)
    lrd.run_linear_analysis()
def plot_feature_size(num_iter):
    """Tests various feature sizes and plots the error.

    Args:
        num_iter: Number of times to test for each point.
    """
    points = [100, 500, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
    errors = []
    train_errors = []

    # Iterate over all points defined.
    for point in points:
        print "Testing for point", point, "features."
        error = 0
        train_error = 0

        # Repeat the test the desired number of times.
        for i in range(0, num_iter):
            cd = CleanData(tfidf=True, max_train_size=25000, max_features=point)

            try:
                # Get and train data.
                training_data = cd.bag_of_words(in_file="data/clean_train_input.csv")
                ids, X, y = get_numpy_matrices(training_data)
                del training_data

                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
                del X, y, ids

                nb = NaiveBayes()
                nb.train(X_train, y_train)

                # Calculate training and validation errors.
                out = nb.classify(X_test)
                error += nb.compute_error(out, y_test)
                train_out = nb.classify(X_train)
                train_error += nb.compute_error(train_out, y_train)
            except MemoryError:
                print "Memory error. Continuing."
                continue

            del X_train, X_test, y_train, y_test

        errors.append(error / num_iter)
        train_errors.append(train_error / num_iter)

    # PLOT.
    plt.figure(2)
    plt.title("Error vs Features")
    plt.xlabel("Number of features")
    plt.ylabel("Error")
    # plt.xscale('log')
    plt.plot(points, errors, '-ro')
    plt.plot(points, train_errors, '-bo')
    plt.show()
print "Training Naive Bayes classifier." nb.train(X_train, y_train) print "Done training." print "Classifying training input." out = nb.classify(X_train) print "Training error:", nb.compute_error(out, y_train) # Clean up unused arrays. del out del y_train del X_train if __name__ == '__main__': cd = CleanData(tfidf=True, max_train_size=15000, max_features=7000) nb = NaiveBayes() train_naive_bayes(cd, nb) # Classify the test data. classify_test_data(cd, nb, "results/nb_predictions2.csv") # Tests. # plot_max_train_size(3) # plot_feature_size(3) print "Completed successfully."
columns.remove("rent_amount_boxcox") test_data = np.array([2, 1, 4, 2, 3, 0, 1, 0, 1, 5, 3, 2, 91]).T index = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] df = pd.DataFrame(test_data, index=index, columns=columns) #for i in range(len(self.__X_train.columns)): # df.loc[0, df.columns[i]] = test[i] #ypred = self.__model.predict(test_data) #return scipy.special.inv_boxcox(ypred, lambda_) return df """ start = time.time() clean_data = CleanData("NO-CHANGES\hyd_v2.csv") data = clean_data.fit() encode_data = EncodeData(data) data = encode_data.fit() corr = Correlation(data) data = corr.corr_fit() split = SplitData(data) X, x = split.fit() xgb_ = XGBReg(X, x) print(xgb_.fit_()) choice = input("Enter do you want to save this model.....type 'yes' to save or 'no' to ignore: ") choice = choice.lower() if choice == 'yes': xgb_.save_XGBmodel() else:
from sklearn import svm

from clean_data import CleanData

__author__ = "Yacine Sibous"

cd = CleanData()

print "Getting the training data."
training_data = cd.bag_of_words(in_file="data/clean_train_input.csv")
print "Done collecting data."

X = [x[1] for x in training_data]
y = [y[2] for y in training_data]

print X[0:5]
print y[0:5]

clf = svm.SVC()
clf.fit(X, y)
        y = exp_smooth.exp_smooth_forecast(xs_fit_opt, True)[-forecast_periods:]
    else:
        # Consumption forecasting with elasticity and income
        y = 8
    # Mask any negative, zero, infinity, or n/a values before returning
    y = np.ma.masked_less_equal(y, 0)
    y = np.ma.fix_invalid(y)
    return y


# Format all rows
new_datum_xs = np.ma.masked_all(datum_xs.shape, np.float)
count = 0
for row in datum_xs:
    try:
        start, stop = np.ma.flatnotmasked_edges(row[VALUE_COLUMN:][np.newaxis, :])
        values = CleanData(row[VALUE_COLUMN:stop + VALUE_COLUMN + 1][np.newaxis, :], X)
        xs = np.ma.hstack((values.get_return_values().flatten(),
                           np.ma.masked_all(X.shape[0] - stop - 1)))
    except TypeError:
        # Some GDP rows do not have any values, therefore remove them
        xs = np.ma.array([0])
    if np.ma.sum(xs):
        new_datum_xs[count] = np.ma.hstack((row[ID_SLICE], xs))
        count += 1

# Resize the array to remove blank rows of data
new_datum_xs = np.ma.resize(new_datum_xs, (count, new_datum_xs.shape[1]))

# Append population and population net change arrays to the formatted and forecasted datum table
count = 0
Q = "SELECT * FROM Datum WHERE element_id BETWEEN 511 AND 703"
pop_xs = np.ma.masked_equal(cursor.execute(Q).fetchall(), -1)[:, 1:]
pop_xs = np.ma.filled(np.ma.column_stack(
from sklearn import svm
from sklearn.cross_validation import cross_val_score

from clean_data import CleanData

import numpy as np
import csv

__author__ = "Yacine Sibous, Jana Pavlasek"

# Initialize data for final submission.
cd = CleanData(tfidf=True, max_features=2500000, n_grams=3)

# Get features and output.
print 'Getting Training data.'
X, y = cd.bag_of_words(in_file="data/clean_train_input.csv", sparse=True)
print 'Done collecting data.'

# Train.
print 'Training the model.'
lin_clf = svm.LinearSVC()
lin_clf.fit(X, y)
print 'Done training.'

# 3-fold cross validation.
print 'Cross Validation'
c_validation = cross_val_score(lin_clf, X, y, scoring='accuracy')
print c_validation.mean()

# Get and predict on the final test data.
print 'Collecting test data.'
test = cd.get_x_in(sparse=True)
print 'Done collecting data.'
import folium
from folium import plugins
from folium.plugins import MarkerCluster

from clean_data import CleanData

met_df = CleanData()

m = folium.Map(zoom_start=4, tiles="OpenStreetMap")
cluster_map = MarkerCluster().add_to(m)

# Create an individual marker for each meteorite, adding it to a cluster
for coord in [tuple(x) for x in met_df.to_records(index=False)]:
    ID = coord[1]
    name = coord[0]
    year = coord[6]
    mass = coord[4]
    rec_class = coord[3]
    latitude = coord[7]
    longitude = coord[8]

    # Manually generate row index
    # index = met_df[(met_df["reclat"] == latitude) & (met_df["reclong"] == longitude)].index.tolist()[0]

    # Create custom marker icon | causes size increase of output html file
    # meteorite_icon = folium.features.CustomIcon('Assets/meteorite.png', icon_size=(80, 80))

    html = f"""
    <table border="1">
        <tr>
            <th> ID </th>