def cycle_coord(self, ncycle=0):
    """
    Cyclic shift of coordinates, e.g. x-y-z to y-z-x.
    """
    ncycle = ncycle % 3
    if ncycle == 0:
        return None
    for icycle in range(ncycle):
        a1, a2, a3 = self.get_lattice_vectors()
        a1 = [a1[1], a1[2], a1[0]]
        a2 = [a2[1], a2[2], a2[0]]
        a3 = [a3[1], a3[2], a3[0]]
        self.set_lattice(1.0, a2, a3, a1)
    from_col = ['x', 'y', 'z']
    to_col = [
        from_col[(0 + ncycle) % 3],
        from_col[(1 + ncycle) % 3],
        from_col[(2 + ncycle) % 3],
    ]
    newdf = pd.DataFrame(columns=['x', 'y', 'z'])
    for i in range(3):
        newdf[to_col[i]] = self.atoms[from_col[i]]
    self.atoms[['x', 'y', 'z']] = newdf[['x', 'y', 'z']]
    return None
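# A minimal sketch of the column-cycling step above on a plain DataFrame,
# independent of the surrounding lattice class; the data here is made up.
import pandas as pd

atoms = pd.DataFrame({'x': [0.1, 0.2], 'y': [0.3, 0.4], 'z': [0.5, 0.6]})
ncycle = 1
from_col = ['x', 'y', 'z']
to_col = [from_col[(i + ncycle) % 3] for i in range(3)]
newdf = pd.DataFrame(columns=['x', 'y', 'z'])
for i in range(3):
    newdf[to_col[i]] = atoms[from_col[i]]
atoms[['x', 'y', 'z']] = newdf[['x', 'y', 'z']]
# with ncycle=1 the original x column ends up in 'y', y in 'z', and z in 'x'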
import h5py
import pandas as pd


def importDatasetFromHDF5(filepath, dataset_name):
    """Read an HDF5 input file and return the named dataset as a DataFrame."""
    with h5py.File(filepath, 'r') as hf:
        ds = hf[dataset_name]
        df = pd.DataFrame(data=ds[:])
    return df
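# A minimal usage sketch for importDatasetFromHDF5; the file name and dataset
# key below are hypothetical placeholders, not part of the original code.
if __name__ == '__main__':
    df = importDatasetFromHDF5('example.h5', 'my_dataset')
    print(df.head())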
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


def plot_importance(model, X, num=None):
    """Plot the top `num` feature importances of a fitted model."""
    if num is None:
        num = len(X.columns)
    feature_imp = pd.DataFrame({'Value': model.feature_importances_,
                                'Feature': X.columns})
    plt.figure(figsize=(10, 15))
    sns.set(font_scale=1)
    sns.barplot(x="Value", y="Feature",
                data=feature_imp.sort_values(by="Value", ascending=False)[0:num])
    plt.title('Feature Importance')
    plt.tight_layout()
    plt.savefig('importances-01.png')
    plt.show()
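# A minimal usage sketch for plot_importance, assuming any fitted estimator
# that exposes feature_importances_; the RandomForest below is illustrative.
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
rf = RandomForestClassifier(n_estimators=50).fit(X, iris.target)
plot_importance(rf, X)  # plots and saves importances-01.png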
import pandas as pd
from sklearn.metrics import classification_report


def evaluate_model(model, X_test, Y_test, category_names):
    '''
    INPUT
    model - a model that uses a prespecified pipeline for vectorization and
            classification, combined with grid search to find the best
            parameters for prediction
    X_test - test input data obtained from the train/test split
    Y_test - test output data obtained from the train/test split
    category_names - the names of the Y (i.e. output) categories

    OUTPUT
    Prints a classification report showing how well the model predicts
    each category of the test data
    '''
    Y_pred = model.predict(X_test)
    Y_pred_df = pd.DataFrame(Y_pred, columns=category_names)
    for i in range(len(category_names)):
        print('Category: {}'.format(category_names[i].upper()), "\n\n",
              classification_report(Y_test.iloc[:, i], Y_pred_df.iloc[:, i]))
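# A self-contained usage sketch for evaluate_model; the tiny synthetic
# multi-label data and the RandomForest-based classifier are illustrative
# stand-ins for the text pipeline described in the docstring.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(0)
X = pd.DataFrame(rng.rand(100, 4), columns=['f1', 'f2', 'f3', 'f4'])
Y = pd.DataFrame(rng.randint(0, 2, size=(100, 3)),
                 columns=['cat_a', 'cat_b', 'cat_c'])
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)

clf = MultiOutputClassifier(RandomForestClassifier(n_estimators=10, random_state=0))
clf.fit(X_train, Y_train)
evaluate_model(clf, X_test, Y_test, category_names=list(Y.columns))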
def image_udflip(imageDir_path="", trainLabel_path="", outputpath=""):
    """
    Flip each image upside down and flip its bounding boxes as well.
    Output pics end with "-updown".

    Args:
        imageDir_path: the path of the train pic dir
        trainLabel_path: the path of the train label file
        outputpath: the path for the flipped pics and train label
    """
    bounding_boxs = get_bounding_box(trainLabel_path)
    image_data = read_images(imageDir_path)
    ids = get_ids(imageDir_path)
    newImages = {}
    newBounding_boxes = {}
    new_pd_data = []
    for id in ids:
        height = image_data[id].shape[0]
        newImages[id + "-updown"] = np.flipud(image_data[id])
        for bounding in bounding_boxs[id]:
            if (id + "-updown") not in newBounding_boxes:
                newBounding_boxes[id + "-updown"] = []
            # mirror the y coordinates of the box about the image height
            num = str(bounding.split()[0])
            num1 = str(height - int(bounding.split()[1]))
            num2 = str(bounding.split()[2])
            num3 = str(height - int(bounding.split()[3]))
            new_pd_data.append({
                "ID": id + "-updown.jpg",
                "Detection": " ".join((num, num1, num2, num3))
            })
    output_csv_path = outputpath + "updown_label.csv"
    pd.DataFrame(new_pd_data).to_csv(output_csv_path)
    for id in ids:
        name = re.split(r".jpg|.png|.jpeg", id)[0]
        print(outputpath + name + "-updown.jpg")
        # cv2.imwrite expects (filename, image)
        cv2.imwrite(outputpath + name + "-updown.jpg", newImages[id + "-updown"])
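# A small self-contained check of the flip used above: np.flipud mirrors row
# (y) coordinates, sending row y of an image with `height` rows to row
# height-1-y. The single-pixel image here is illustrative only.
import numpy as np

img = np.zeros((100, 60), dtype=np.uint8)
img[20, 10] = 255                    # one bright pixel at (y=20, x=10)
flipped = np.flipud(img)
print(np.argwhere(flipped == 255))   # -> [[79 10]]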
def image_saltNoise(imageDir_path="", trainLabel_path="", outputpath=""):
    """
    Add salt noise to each pic.
    Output pics end with "-salt".

    Args:
        imageDir_path: the path of the train pic dir
        trainLabel_path: the path of the train label file
        outputpath: the path for the noisy pics and train label
    """
    def salt(image, n):
        # set n random pixels to white
        height, width = image.shape[:2]
        for k in range(n):
            x = int(np.random.random() * height)
            y = int(np.random.random() * width)
            if image.ndim == 2:
                image[x, y] = 255
            else:
                image[x, y, 0] = 255
                image[x, y, 1] = 255
                image[x, y, 2] = 255
        return image

    bounding_boxs = get_bounding_box(trainLabel_path)
    image_data = read_images(imageDir_path)
    ids = get_ids(imageDir_path)
    new_pd_data = []
    for id in ids:
        salt(image_data[id], 5000)
        for box in bounding_boxs[id]:
            new_pd_data.append({"ID": id + "-salt.jpg", "Detection": box})
    output_csv_path = outputpath + "-salt.csv"
    pd.DataFrame(new_pd_data).to_csv(output_csv_path)
    for id in ids:
        name = re.split(r".jpg|.png|.jpeg", id)[0]
        print(outputpath + name + "-salt.jpg")
        # cv2.imwrite expects (filename, image)
        cv2.imwrite(outputpath + name + "-salt.jpg", image_data[id])
time_2018x住院.rename(
    columns=lambda x: 'y' + str(int(x.replace('理賠金額_住院', '')) - 2003),
    inplace=True)
time_2018x手術.rename(
    columns=lambda x: 'y' + str(int(x.replace('理賠金額_手術', '')) - 2003),
    inplace=True)

time_trainx住院_pca = reduced_Scaler住院.fit_transform(time_trainx住院)
time_2018x住院_pca = reduced_Scaler住院.transform(time_2018x住院)
time_trainx手術_pca = reduced_Scaler手術.fit_transform(time_trainx手術)
time_2018x手術_pca = reduced_Scaler手術.transform(time_2018x手術)

time_2018x住院_pca = pd.DataFrame(time_2018x住院_pca, columns=[
    'pca_1', 'pca_2', 'pca_3', 'pca_4', 'pca_5',
    'pca_6', 'pca_7', 'pca_8', 'pca_9', 'pca_10'
])
time_2018x手術_pca = pd.DataFrame(time_2018x手術_pca, columns=[
    'pca_11', 'pca_12', 'pca_13', 'pca_14', 'pca_15',
    'pca_16', 'pca_17', 'pca_18', 'pca_19', 'pca_20'
])
test_2018x_pca = nontime_2018x.join(time_2018x住院_pca)
test_2018x_pca = test_2018x_pca.join(time_2018x手術_pca)

######################################################################
test2018_y_binary = test2018_y.astype('bool').astype('int')
meta_data = pd.concat(search_data_list)
X_meta = meta_data.drop(["score", "eval_time"], axis=1)
y_meta = meta_data["score"]

print("X_meta", X_meta)
print("y_meta", y_meta)

gbr = GradientBoostingRegressor()
gbr.fit(X_meta, y_meta)

data = load_iris()
X_new, y_new = data.data, data.target

X_meta_test = pd.DataFrame(range(1, 100), columns=["n_neighbors"])
X_meta_test["size_X"] = X_new.size
X_meta_test["itemsize_X"] = X_new.itemsize
X_meta_test["ndim_X"] = X_new.ndim
X_meta_test["size_y"] = y_new.size
X_meta_test["itemsize_y"] = y_new.itemsize
X_meta_test["ndim_y"] = y_new.ndim

y_meta_pred = gbr.predict(X_meta_test)
print("y_meta_pred", y_meta_pred)

y_meta_pred_max_idx = y_meta_pred.argmax()
n_neighbors_best = search_space["n_neighbors"][y_meta_pred_max_idx]
print("n_neighbors_best", n_neighbors_best)
import pandas as pd
import pylab as P
import matplotlib.colors as colours
import datetime
import sklearn.ensemble

test_forest = sklearn.ensemble.RandomForestClassifier(n_estimators=50)

print(processed_data.info())
processed_data_X = processed_data[['year', 'X', 'Y', 'day', 'hour']]
processed_data_y = processed_data['Category']

test_forest.fit(processed_data_X, processed_data_y)
test_forest.score(processed_data_X, processed_data_y)

test_data = read_test_data()
test_data_X = test_data[['year', 'X', 'Y', 'day', 'hour']]
submission = test_forest.predict(test_data_X)

types = processed_data['Category'].unique()
types.sort()
submission_formatted = pd.DataFrame(columns=['Id'] + list(types),
                                    index=range(0, len(submission)))
for type in submission_formatted.columns:
    submission_formatted[type] = 1 * (submission == str(type))
submission_formatted['Id'] = range(0, len(submission))
submission_formatted.to_csv("submission.csv", sep=',', index=False)
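# An alternative sketch for the one-hot formatting above using pd.get_dummies;
# note it only creates columns for classes actually present in `submission`,
# so the explicit loop over all category types is safer for a full submission.
submission_onehot = pd.get_dummies(pd.Series(submission)).astype(int)
submission_onehot.insert(0, 'Id', list(range(len(submission))))
submission_onehot.to_csv("submission_onehot.csv", sep=',', index=False)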
row.append(TOFH)
row.append(TOAH)
row.append(TOFA)
row.append(TOAA)
row.append(ORFH)
row.append(ORAH)
row.append(ORFA)
row.append(ORAA)
row.append(FTRFH)
row.append(FTRAH)
row.append(FTRFA)
row.append(FTRAA)
data.append(row)

# Create DataFrame of Team Stats
teamStats = pd.DataFrame(data, columns=columns)

refHomeTeams = teamStats[[
    'Team', 'PFH', 'PAH', 'eFGFH', 'eFGAH', 'TOFH', 'TOAH', 'ORFH', 'ORAH',
    'FTRFH', 'FTRAH'
]].copy()
refAwayTeams = teamStats[[
    'Team', 'PFA', 'PAA', 'eFGFA', 'eFGAA', 'TOFA', 'TOAA', 'ORFA', 'ORAA',
    'FTRFA', 'FTRAA'
]].copy()

# Create DataFrame that adds stats to each team's games
addHome = pd.merge(frame, refHomeTeams, left_on='homeTeam', right_on='Team')
addAway = pd.merge(addHome, refAwayTeams, left_on='awayTeam', right_on='Team')
finalFrame = addAway.drop(['Team_x', 'Team_y'], axis=1)
# -*- coding: utf-8 -*-
"""
Created on Thu May  6 20:13:37 2021

@author: User
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

iris = load_iris()
iris.feature_names

Data_iris = iris.data
Data_iris = pd.DataFrame(Data_iris, columns=iris.feature_names)
Data_iris['label'] = iris.target

plt.scatter(Data_iris.iloc[:, 2], Data_iris.iloc[:, 3])
my_hour = 13
my_minute = 30
my_second = 15

my_date = datetime(my_year, my_month, my_day)
my_date.day

first_two = [datetime(2016, 1, 1), datetime(2016, 1, 2)]

# Datetime Index
dt_ind = pd.DatetimeIndex(first_two)

data = np.random.randn(2, 2)
cols = ['a', 'b']
df = pd.DataFrame(data, dt_ind, cols)

df.index.argmax()  # position of the latest index (use argmin for the first)
df.index.max()  # latest date

# Time Resampling
df = pd.read_csv('data/walmart_stock.csv')  # parse_dates=True, index_col='Date'
df.info()

df['Date'] = pd.to_datetime(df['Date'])
df['Date'] = df['Date'].apply(pd.to_datetime)  # equivalent element-wise conversion
df.set_index('Date', inplace=True)
df.head()

df.resample(rule='A').mean()  # 'A' = year-end frequency
def image_rotate(imageDir_path="", trainLabel_path="", outputpath="", degree=90):
    """
    Rotate the image by a given degree.
    Output pics end with "-rotated".

    Args:
        imageDir_path: the path of the train pic dir
        trainLabel_path: the path of the train label file
        outputpath: the path for the rotated pics and train label
        degree: the degree the pic will rotate
    """
    bounding_boxs = get_bounding_box(trainLabel_path)
    image_data = read_images(imageDir_path)
    ids = get_ids(imageDir_path)
    newImages = {}
    new_pd_data = []

    def transfer(box):
        lst = [int(item) for item in box.split()]
        # the four corners of the box
        lst1 = [lst[:2], [lst[2], lst[1]], lst[2:4], [lst[0], lst[3]]]
        ones = [1, 1, 1, 1]
        newMat = []
        for point in np.c_[lst1, ones]:
            newMat.append(M.dot(point))
        # start minX/minY larger than any pic dimension and
        # maxX/maxY smaller than any coordinate
        minX = 10000
        maxX = -1
        minY = minX
        maxY = maxX
        for item in newMat:
            minX = min(item[0], minX)
            maxX = max(item[0], maxX)
            minY = min(item[1], minY)
            maxY = max(item[1], maxY)
        return np.array([minX, minY, maxX, maxY])

    for id in ids:
        cols, rows = image_data[id].shape[:2]
        # the first arg is the rotation centre, the second is the angle
        # in degrees, the third is the scale factor
        M = cv2.getRotationMatrix2D((cols / 2, rows / 2), degree, 0.5)
        # rotate the pic
        newImages[id + "-rotated"] = cv2.warpAffine(image_data[id], M, (cols, rows))
        # transfer the box
        for box in bounding_boxs[id]:
            newBox = []
            for item in transfer(box):
                newBox.append(int(item))
            new_pd_data.append({
                "ID": id + "-rotated.jpg",
                "Detection": " ".join(str(v) for v in newBox)
            })
    output_csv_path = outputpath + "-rotated.csv"
    pd.DataFrame(new_pd_data).to_csv(output_csv_path)
    for id in ids:
        name = re.split(r".jpg|.png|.jpeg", id)[0]
        print(outputpath + name + "-rotated.jpg")
        # cv2.imwrite expects (filename, image)
        cv2.imwrite(outputpath + name + "-rotated.jpg", newImages[id + "-rotated"])
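# A small self-contained check of the corner-transform idea used in transfer():
# the 2x3 matrix from cv2.getRotationMatrix2D maps homogeneous points [x, y, 1]
# to their rotated positions. The centre, angle, and point are illustrative.
import numpy as np
import cv2

M = cv2.getRotationMatrix2D((50, 50), 90, 1.0)  # centre, degrees, scale
corner = np.array([10, 20, 1])                  # [x, y, 1]
print(M.dot(corner))                            # rotated corner coordinates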
train_X = train[predictors_cols]

my_model = RandomForestRegressor()
# Fit the model: capture patterns from the provided data. This is the heart of modeling.
my_model.fit(train_X, train_y)

# Read test data
test = pd.read_csv('test.csv')
# Pull the same features/columns as the training data from the test data
test_X = test[predictors_cols]
# Use the model to make predictions
predicted_prices = my_model.predict(test_X)
print(predicted_prices)

#************************
# Submission
#************************
# Submissions usually have two columns, ID and the prediction column.
# ID comes from the test data. Prediction column will use target_field?
my_submission = pd.DataFrame({'Id': test.Id, 'SalePrice': predicted_prices})
# print(my_submission)

# Save into a file called submission
my_submission.to_csv('submission.csv', index=False)
print('Finished!')
import pandas as pd
from sklearn import preprocessing
import talib
from matplotlib import pyplot as plt

#%%
file = '/Users/wanjun/Desktop/LSTM模型/data/data_train_latest.csv'
file_1 = '/Users/wanjun/Desktop/LSTM模型/data/MINUTE_zhuli_IF_20170509.csv'
data = pd.read_csv(file)
data_1 = pd.read_csv(file_1, index_col=1, parse_dates=True)
data.index = data_1.index
data['datetime'] = data_1.index

#%%
data = data.sort_index()
data = data['2016-12-19':]
index = data.drop_duplicates('datetime').resample('D').mean().dropna().index
data_clean = pd.DataFrame(columns=['open', 'high', 'low', 'volume', 'close'])
lst_len = []
lst = []
for i in index:
    i = str(i)[:10]
    temp = data[i]
    start = i + ' 09:30:00'
    end = i + ' 15:01:00'
    data_clean = data_clean.append(temp[start:end])
    if len(temp[start:end]) > 242 or len(temp[start:end]) < 241:
        lst.append(i)
        lst_len.append(len(temp[start:end]))

#%%
# Handle 2016-11-18 separately: it has more than 242 rows, and after
# inspection this row is dropped.
# data_clean = data_clean.drop('2016-11-18 13:01:00')