Example #1
    def cycle_coord(self, ncycle=0):
        """
        Cyclic shift of coordinates, e.g. x-y-z to y-z-x.
        """
        ncycle = ncycle % 3
        if ncycle == 0:
            return None
        for icycle in range(ncycle):
            a1, a2, a3 = self.get_lattice_vectors()
            a1 = [a1[1], a1[2], a1[0]]
            a2 = [a2[1], a2[2], a2[0]]
            a3 = [a3[1], a3[2], a3[0]]
            self.set_lattice(1.0, a2, a3, a1)

        from_col = ['x', 'y', 'z']
        to_col = [
            from_col[(0 + ncycle) % 3],
            from_col[(1 + ncycle) % 3],
            from_col[(2 + ncycle) % 3],
        ]
        newdf = pd.DataFrame(columns=['x', 'y', 'z'])  # pandas, not numpy: np.DataFrame does not exist
        for i in range(3):
            newdf[to_col[i]] = self.atoms[from_col[i]]
        self.atoms[['x', 'y', 'z']] = newdf[['x', 'y', 'z']]
        return None
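A minimal smoke test, assuming cycle_coord is dedented to module level; the Cell class and its values below are invented stand-ins for the real host object:

import pandas as pd

class Cell:
    def __init__(self):
        self.vectors = ([1.0, 0.0, 0.0], [0.0, 2.0, 0.0], [0.0, 0.0, 3.0])
        self.atoms = pd.DataFrame({'x': [0.1], 'y': [0.2], 'z': [0.3]})

    def get_lattice_vectors(self):
        return self.vectors

    def set_lattice(self, alat, a1, a2, a3):
        self.vectors = (a1, a2, a3)

Cell.cycle_coord = cycle_coord  # attach the method defined above

cell = Cell()
cell.cycle_coord(1)  # one cyclic shift: x -> y, y -> z, z -> x
print(cell.atoms)    # the old 'x' values now sit in column 'y'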
Example #2
import h5py
import pandas as pd


def importDatasetFromHDF5(filepath, dataset_name):
    """Read a dataset from an HDF5 input file into a DataFrame."""
    with h5py.File(filepath, 'r') as hf:
        ds = hf[dataset_name]
        df = pd.DataFrame(data=ds[:])  # pd.DataFrame; numpy has no DataFrame

    return df
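A quick round-trip check for importDatasetFromHDF5, writing a tiny dataset first (the file and dataset names are invented):

import h5py
import numpy as np

with h5py.File('example.h5', 'w') as hf:
    hf.create_dataset('measurements', data=np.arange(6).reshape(3, 2))

df = importDatasetFromHDF5('example.h5', 'measurements')
print(df)  # a 3x2 DataFrame with integer columns 0 and 1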
Example #3
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


def plot_importance(model, X, num=None):
    # a default of len(X) was evaluated at def time (with X undefined); resolve at call time
    if num is None:
        num = len(X.columns)
    feature_imp = pd.DataFrame({'Value': model.feature_importances_, 'Feature': X.columns})
    plt.figure(figsize=(10, 15))
    sns.set(font_scale=1)
    sns.barplot(x="Value", y="Feature",
                data=feature_imp.sort_values(by="Value", ascending=False)[0:num])
    plt.title('Feature Importance')
    plt.tight_layout()
    plt.savefig('importances-01.png')
    plt.show()
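A hedged usage sketch on the iris data; any fitted estimator exposing feature_importances_ together with a DataFrame X should work:

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
model = RandomForestClassifier(n_estimators=50, random_state=0).fit(X, iris.target)
plot_importance(model, X, num=4)  # bar chart of the four iris features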
Example #4
import pandas as pd
from sklearn.metrics import classification_report


def evaluate_model(model, X_test, Y_test, category_names):
    '''
    INPUT
    model - a model that uses a prespecified pipeline for vectorization and
    classification, combined with grid search to find the best parameters
    X_test - test input data obtained from the train/test split
    Y_test - test output data obtained from the train/test split
    category_names - the names of the Y (i.e. output) categories

    OUTPUT
    Tables containing stats on how well the model predicts the test data
    '''
    Y_pred = model.predict(X_test)

    Y_pred_df = pd.DataFrame(Y_pred, columns=category_names)

    # iterate over all output categories instead of hard-coding 36
    for i in range(len(category_names)):
        print('Category: {}'.format(category_names[i].upper()), "\n\n",
              classification_report(Y_test.iloc[:, i], Y_pred_df.iloc[:, i]))
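A hedged smoke test for evaluate_model on synthetic multilabel data; the MultiOutputClassifier and the category names below are invented stand-ins for the snippet's grid-searched pipeline:

import pandas as pd
from sklearn.datasets import make_multilabel_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

X, Y = make_multilabel_classification(n_samples=200, n_classes=3, random_state=0)
names = ['cat_a', 'cat_b', 'cat_c']  # invented category names
clf = MultiOutputClassifier(RandomForestClassifier(random_state=0)).fit(X, Y)
evaluate_model(clf, X, pd.DataFrame(Y, columns=names), names)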
Example #5
# np, pd, cv2, re and the helpers get_bounding_box/read_images/get_ids are
# assumed to be imported/defined at module level in the source project
def image_udflip(imageDir_path="", trainLabel_path="", outputpath=""):
    """
    Flip the image top-to-bottom and flip its bounding boxes to match;
    output pics end with "-updown".

    Args:
        imageDir_path: the path of the train pic dir
        trainLabel_path: the path of the train label file
        outputpath: the path for the flipped pics and train label
    """
    bounding_boxs = get_bounding_box(trainLabel_path)
    image_data = read_images(imageDir_path)
    ids = get_ids(imageDir_path)

    newImages = {}
    newBounding_boxes = {}  # used below but never initialised in the original
    new_pd_data = []

    for id in ids:
        height = image_data[id].shape[0]
        newImages[id + "-updown"] = np.flipud(image_data[id])
        for bounding in bounding_boxs[id]:
            if (id + "-updown") not in newBounding_boxes:
                newBounding_boxes[id + "-updown"] = []
            num = str(bounding.split()[0])
            num1 = str(height - int(bounding.split()[1]))
            num2 = str(bounding.split()[2])
            num3 = str(height - int(bounding.split()[3]))
            new_pd_data.append({
                "ID": id + "-updown.jpg",
                "Detection": " ".join((num, num1, num2, num3))
            })

    output_csv_path = outputpath + "updown_label.csv"
    np.DataFrame(new_pd_data).to_csv(output_csv_path)

    for id in ids:
        name = re.split(r".jpg|.png|.jpeg", id)[0]
        print(outputpath + name + "-updown.jpg")
        cv2.imwrite(outputpath + name + "-updown.jpg",
                    newImages[id + "-updown"])  # cv2.imwrite(filename, img)
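The y-flip arithmetic can be sanity-checked without any files; a small sketch with a dummy image and an invented box string:

import numpy as np

img = np.zeros((100, 60), dtype=np.uint8)  # height=100, width=60
img[10, 5] = 255                           # mark one pixel
flipped = np.flipud(img)
print(flipped[100 - 1 - 10, 5])            # 255: row y maps to height-1-y

height, box = 100, "5 10 20 40"            # x1 y1 x2 y2, as in the label file
x1, y1, x2, y2 = (int(v) for v in box.split())
print(height - y1, height - y2)            # flipped y-coordinates: 90 60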
Example #6
def image_saltNoise(imageDir_path="", trainLabel_path="", outputpath=""):
    """
    Add salt noise to the pic; output pics end with "-salt".

    Args:
        imageDir_path: the path of the train pic dir
        trainLabel_path: the path of the train label file
        outputpath: the path for the noisy pics and train label
    """
    def salt(image, n):
        height, width = image.shape[:2]
        for k in range(n):
            x = int(np.random.random() * height)
            y = int(np.random.random() * width)
            if image.ndim == 2:  # greyscale; the original mixed up `image` and `img`
                image[x, y] = 255
            else:                # colour: whiten all three channels
                image[x, y, 0] = 255
                image[x, y, 1] = 255
                image[x, y, 2] = 255
        return image

    bounding_boxs = get_bounding_box(trainLabel_path)
    image_data = read_images(imageDir_path)
    ids = get_ids(imageDir_path)

    new_pd_data = []

    for id in ids:
        salt(image_data[id], 5000)
        for box in bounding_boxs[id]:
            new_pd_data.append({"ID": id + "-salt.jpg", "Detection": box})

    output_csv_path = outputpath + "-salt.csv"
    pd.DataFrame(new_pd_data).to_csv(output_csv_path, index=False)  # pd, not np

    for id in ids:
        name = re.split(r".jpg|.png|.jpeg", id)[0]
        print(outputpath + name + "-salt.jpg")
        cv2.imwrite(outputpath + name + "-salt.jpg", image_data[id])  # (filename, img)
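The salt logic in isolation, as a self-contained sketch of the inner helper:

import numpy as np

def salt_demo(image, n):
    # whiten n random pixel positions, as the inner salt() helper does
    rng = np.random.default_rng(0)
    h, w = image.shape[:2]
    for _ in range(n):
        x, y = int(rng.random() * h), int(rng.random() * w)
        image[x, y] = 255  # broadcasts across all channels of a colour image
    return image

noisy = salt_demo(np.zeros((50, 50, 3), dtype=np.uint8), 200)
print(int((noisy == 255).all(axis=2).sum()))  # ~200 white pixels, minus collisions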
Example #7
time_2018x住院.rename(
    columns=lambda x: 'y' + str(int(x.replace('理賠金額_住院', '')) - 2003),
    inplace=True)
time_2018x手術.rename(
    columns=lambda x: 'y' + str(int(x.replace('理賠金額_手術', '')) - 2003),
    inplace=True)

time_trainx住院_pca = reduced_Scaler住院.fit_transform(time_trainx住院)
time_2018x住院_pca = reduced_Scaler住院.transform(time_2018x住院)
time_trainx手術_pca = reduced_Scaler手術.fit_transform(time_trainx手術)
time_2018x手術_pca = reduced_Scaler手術.transform(time_2018x手術)

time_2018x住院_pca = pd.DataFrame(time_2018x住院_pca,  # was undefined time_testx住院_pca
                                columns=[
                                    'pca_1', 'pca_2', 'pca_3', 'pca_4',
                                    'pca_5', 'pca_6', 'pca_7', 'pca_8',
                                    'pca_9', 'pca_10'
                                ])
time_2018x手術_pca = pd.DataFrame(time_2018x手術_pca,  # was undefined time_testx手術_pca
                                columns=[
                                    'pca_11', 'pca_12', 'pca_13', 'pca_14',
                                    'pca_15', 'pca_16', 'pca_17', 'pca_18',
                                    'pca_19', 'pca_20'
                                ])

test_2018x_pca = nontime_2018x.join(time_2018x住院_pca)
test_2018x_pca = test_2018x_pca.join(time_2018x手術_pca)

######################################################################
test2018_y_binary = test2018_y.astype('bool').astype('int')
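A toy check of the rename lambda above, using two of the snippet's column names:

import pandas as pd

demo = pd.DataFrame(columns=['理賠金額_住院2004', '理賠金額_住院2005'])
demo.rename(columns=lambda x: 'y' + str(int(x.replace('理賠金額_住院', '')) - 2003),
            inplace=True)
print(list(demo.columns))  # ['y1', 'y2']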
Example #8
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.ensemble import GradientBoostingRegressor

meta_data = pd.concat(search_data_list)

X_meta = meta_data.drop(["score", "eval_time"], axis=1)
y_meta = meta_data["score"]

print("X_meta", X_meta)
print("y_meta", y_meta)

gbr = GradientBoostingRegressor()
gbr.fit(X_meta, y_meta)

data = load_iris()
X_new, y_new = data.data, data.target

X_meta_test = pd.DataFrame(range(1, 100), columns=["n_neighbors"])

X_meta_test["size_X"] = X_new.size
X_meta_test["itemsize_X"] = X_new.itemsize
X_meta_test["ndim_X"] = X_new.ndim

X_meta_test["size_y"] = y_new.size
X_meta_test["itemsize_y"] = y_new.itemsize
X_meta_test["ndim_y"] = y_new.ndim

y_meta_pred = gbr.predict(X_meta_test)
print("y_meta_pred", y_meta_pred)

y_meta_pred_max_idx = y_meta_pred.argmax()
n_neighbors_best = search_space["n_neighbors"][y_meta_pred_max_idx]
print("n_neighbors_best", n_neighbors_best)
Example #9
import pandas as pd
import sklearn.ensemble

# processed_data and read_test_data() are assumed to be defined earlier

test_forest = sklearn.ensemble.RandomForestClassifier(n_estimators=50)

processed_data.info()  # info() prints directly; wrapping it in print() just adds "None"
processed_data_X = processed_data[['year','X','Y','day','hour']]
processed_data_y = processed_data['Category']

test_forest.fit(processed_data_X,processed_data_y)
test_forest.score(processed_data_X,processed_data_y)

test_data = read_test_data()

test_data_X = test_data[['year','X','Y','day','hour']]
submission = test_forest.predict(test_data_X)

types = processed_data['Category'].unique()
types.sort()
submission_formatted = pd.DataFrame(columns=['Id'] + list(types), index=range(0, len(submission)))

for category in types:  # avoid shadowing the builtin `type`; skip the 'Id' column
    submission_formatted[category] = 1 * (submission == str(category))

submission_formatted['Id'] = range(0,len(submission))
submission_formatted.to_csv("submission.csv",sep=',',index=False)
Example #10
    row.append(TOFH)
    row.append(TOAH)
    row.append(TOFA)
    row.append(TOAA)
    row.append(ORFH)
    row.append(ORAH)
    row.append(ORFA)
    row.append(ORAA)
    row.append(FTRFH)
    row.append(FTRAH)
    row.append(FTRFA)
    row.append(FTRAA)
    data.append(row)

# Create DataFrame of team stats
teamStats = pd.DataFrame(data, columns=columns)

refHomeTeams = teamStats[[
    'Team', 'PFH', 'PAH', 'eFGFH', 'eFGAH', 'TOFH', 'TOAH', 'ORFH', 'ORAH',
    'FTRFH', 'FTRAH'
]].copy()
refAwayTeams = teamStats[[
    'Team', 'PFA', 'PAA', 'eFGFA', 'eFGAA', 'TOFA', 'TOAA', 'ORFA', 'ORAA',
    'FTRFA', 'FTRAA'
]].copy()

# Create a DataFrame that adds stats to each team's games
addHome = pd.merge(frame, refHomeTeams, left_on='homeTeam', right_on='Team')
addAway = pd.merge(addHome, refAwayTeams, left_on='awayTeam', right_on='Team')
finalFrame = addAway.drop(['Team_x', 'Team_y'], axis=1)
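The home/away join pattern in miniature, with invented toy data:

import pandas as pd

frame = pd.DataFrame({'homeTeam': ['A', 'B'], 'awayTeam': ['B', 'A']})
teams = pd.DataFrame({'Team': ['A', 'B'], 'PFH': [100, 95], 'PFA': [98, 99]})
home = pd.merge(frame, teams[['Team', 'PFH']], left_on='homeTeam', right_on='Team')
both = pd.merge(home, teams[['Team', 'PFA']], left_on='awayTeam', right_on='Team')
print(both.drop(['Team_x', 'Team_y'], axis=1))  # stats attached to both sides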
Example #11
# -*- coding: utf-8 -*-
"""
Created on Thu May  6 20:13:37 2021

@author: User
"""

import numpy as np  # was mistakenly aliased as pd
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris

iris = load_iris()
print(iris.feature_names)  # a bare expression here has no visible effect in a script

Data_iris = iris.data
Data_iris = pd.DataFrame(Data_iris, columns=iris.feature_names)

Data_iris['label'] = iris.target

plt.scatter(Data_iris.iloc[:, 2], Data_iris.iloc[:, 3])
plt.show()


Example #12
import numpy as np
import pandas as pd
from datetime import datetime

# example values; my_year, my_month and my_day were undefined in the original
my_year, my_month, my_day = 2017, 1, 2
my_hour = 13
my_minute = 30
my_second = 15

my_date = datetime(my_year, my_month, my_day, my_hour, my_minute, my_second)
my_date.day

first_two = [datetime(2016, 1, 1), datetime(2016, 1, 2)]

# Datetime Index
dt_ind = pd.DatetimeIndex(first_two)

data = np.random.randn(2, 2)
cols = ['a', 'b']

df = pd.DataFrame(data, dt_ind, cols)
df.index.argmax()  # latest index (use min for first)
df.index.max()  # latest date

# Time Resampling
df = pd.read_csv('data/walmart_stock.csv')  # or pass parse_dates=True, index_col='Date'
df.info()

df['Date'] = pd.to_datetime(df['Date'])  # the empty format='' was invalid
# df['Date'] = df['Date'].apply(pd.to_datetime)  # equivalent element-wise alternative

df.set_index('Date', inplace=True)
df.head()

df.resample(rule='A').mean()  # year end frequency
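If walmart_stock.csv is not at hand, the resampling step can be reproduced on synthetic data; a short sketch:

import numpy as np
import pandas as pd

idx = pd.date_range('2016-01-01', periods=500, freq='D')
stock = pd.DataFrame({'Close': np.random.randn(500).cumsum()}, index=idx)
print(stock.resample(rule='A').mean())  # one mean per year-end period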
Example #13
def image_rotate(imageDir_path="",
                 trainLabel_path="",
                 outputpath="",
                 degree=90):
    """
    Rotate the image clockwise by the given degree;
    output pics end with "-rotated".

    Args:
        imageDir_path: the path of the train pic dir
        trainLabel_path: the path of the train label file
        outputpath: the path for the rotated pics and train label
        degree: the degree the pic will be rotated by
    """
    bounding_boxs = get_bounding_box(trainLabel_path)
    image_data = read_images(imageDir_path)
    ids = get_ids(imageDir_path)

    newImages = {}
    new_pd_data = []  # list, not dict: .append() is used below

    def transfer(box):
        lst = [int(item) for item in box.split()]
        lst1 = [lst[:2], [lst[2], lst[1]], lst[2:4], [lst[0], lst[3]]]
        ones = [1, 1, 1, 1]
        newMat = []
        for point in np.c_[lst1, ones]:
            newMat.append(M.dot(point))
        # minX/minY should start bigger than your pic width and height
        minX = 10000
        maxX = -1
        minY = minX
        maxY = maxX

        for item in newMat:
            minX = min(item[0], minX)
            maxX = max(item[0], maxX)
            minY = min(item[1], minY)
            maxY = max(item[1], maxY)  # was min(), which broke the box

        return np.array([minX, minY, maxX, maxY])

    for id in ids:
        rows, cols = image_data[id].shape[:2]  # shape is (height, width)
        """
        the first param of getRotationMatrix2D is the rotation centre,
        the second is the degree, the third is the scale ratio
        """
        M = cv2.getRotationMatrix2D((cols / 2, rows / 2), degree, 0.5)
        # rotate the pic
        newImages[id + "-rotated"] = cv2.warpAffine(image_data[id], M,
                                                    (cols, rows))

        # transform the box coordinates
        for box in bounding_boxs[id]:
            newBox = []
            for item in transfer(box):
                newBox.append(int(item))
            new_pd_data.append({
                "ID": id + "-rotated.jpg",
                "Detection": " ".join(str(v) for v in newBox)  # ints must be stringified
            })

    output_csv_path = outputpath + "-rotated.csv"
    pd.DataFrame(new_pd_data).to_csv(output_csv_path, index=False)  # pd, not np

    for id in ids:
        name = re.split(r".jpg|.png|.jpeg", id)[0]
        print(outputpath + name + "-rotated.jpg")
        cv2.imwrite(outputpath + name + "-rotated.jpg",
                    newImages[id + "-rotated"])  # cv2.imwrite(filename, img)
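The corner-transform step can be checked without images; a sketch applying the same 2x3 affine matrix to four invented box corners:

import cv2
import numpy as np

M = cv2.getRotationMatrix2D((50, 50), 90, 0.5)  # centre, degrees, scale
corners = np.array([[10, 20], [30, 20], [30, 40], [10, 40]], dtype=float)
rotated = np.c_[corners, np.ones(4)] @ M.T      # M.dot(point) for each corner
print(rotated.min(axis=0), rotated.max(axis=0)) # new bounding-box extremes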
Example #14
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# train, train_y and predictors_cols are assumed to be defined by earlier cells
train_X = train[predictors_cols]

my_model = RandomForestRegressor()

# Fit the model: Capture patterns from provided data. This is the heart of modeling.
my_model.fit(train_X, train_y)

# Read test data
test = pd.read_csv('test.csv')

# Pull the same features/columns as training data from the test data
test_X = test[predictors_cols]

# Use the model to make predictions
predicted_prices = my_model.predict(test_X)

print(predicted_prices)

#************************
#		Submission
#************************

# Submissions usually have two columns: ID and the prediction column.
# ID comes from the test data; the prediction column holds the predicted target.

my_submission = pd.DataFrame({'Id': test.Id, 'SalePrice': predicted_prices})
# print(my_submission)

# save into a file called submission
my_submission.to_csv('submission.csv', index=False)
print('Finished!')
Example #15
import pandas as pd
from sklearn import preprocessing
import talib
from matplotlib import pyplot as plt
#%%
file = '/Users/wanjun/Desktop/LSTM模型/data/data_train_latest.csv'
file_1 = '/Users/wanjun/Desktop/LSTM模型/data/MINUTE_zhuli_IF_20170509.csv'
data = pd.read_csv(file)
data_1 = pd.read_csv(file_1, index_col=1, parse_dates=True)
data.index = data_1.index
data['datetime'] = data_1.index
#%%
data = data.sort_index()
data = data['2016-12-19':]
index = data.drop_duplicates('datetime').resample('D').mean().dropna().index
data_clean = pd.DataFrame(columns=['open', 'high', 'low', 'volume', 'close'])
lst_len = []
lst = []
for i in index:
    i = str(i)[:10]
    temp = data.loc[i]  # row lookup by date string; plain data[i] would hit columns
    start = i + ' 09:30:00'
    end = i + ' 15:01:00'
    data_clean = pd.concat([data_clean, temp[start:end]])  # DataFrame.append was removed
    if len(temp[start:end]) > 242 or len(temp[start:end]) < 241:
        lst.append(i)
    lst_len.append(len(temp[start:end]))

#%%
# 2016-11-18 is handled separately because it has more than 242 rows; after analysis that row is deleted
#data_clean=data_clean.drop('2016-11-18 13:01:00')