def test_train(x_yarr,
               offset=0,
               x_split=0.9,
               nshuffles=1,
               x_cols=1,
               label_cols=-1,
               norm=True):
    """
    Split the data into train and test data.

    The rows of ``x_yarr`` are shuffled IN PLACE ``nshuffles`` times before
    splitting, so the caller's array is reordered. When ``norm`` is True the
    normalized test features are also written back into ``test_data``, which
    is a slice (view) of ``x_yarr``.

    Args:
        x_yarr (numpy.array): 2-D array containing the data to separate
        offset (int): index of the test fold. If 0, the end of the array is
            used as test data; otherwise rows
            ``[offset * test_size, (offset + 1) * test_size)`` are used,
            where ``test_size = int(n_rows * (1 - x_split))``
        x_split (float): fraction of the data to use as training
        nshuffles (int): how many times to shuffle the data
        x_cols (int): the column axis starting point of data in x_yarr.
            NOTE(review): currently unused -- features are always taken
            from columns ``1:-1``
        label_cols (int): the column index of the labels in x_yarr
        norm (bool): normalize the data if True

    Returns:
        x_train (numpy.array): x columns for training
        x_test (numpy.array): x columns for testing
        y_train (numpy.array): y columns for training
        y_test (numpy.array): y columns for testing
        test_data (numpy.array): all the columns of test_data (both x_test and y_test)
        m_t (numpy.array): mean of the columns (None when ``norm`` is False)
        s_t (numpy.array): std of the columns (None when ``norm`` is False)
    """
    for _ in range(nshuffles):
        np.random.shuffle(x_yarr)

    n_rows = x_yarr.shape[0]
    if offset == 0:
        train_size = int(n_rows * x_split)
        train_data = x_yarr[:train_size]
        test_data = x_yarr[train_size:]
    else:
        test_size = int(n_rows * (1 - x_split))
        test_rows = slice(offset * test_size, (offset + 1) * test_size)
        test_data = x_yarr[test_rows]
        # np.delete returns a new array, so x_yarr itself keeps all rows.
        train_data = np.delete(x_yarr, test_rows, axis=0)

    if norm:
        # The statistics come from the training features only and are then
        # reused for the test features. Presumably normalize returns
        # (normalized, mean, std) -- it is defined outside this block.
        x_train, m_t, s_t = normalize(train_data[:, 1:-1])
        x_test, m_t, s_t = normalize(test_data[:, 1:-1], m_t, s_t)
        test_data[:, 1:-1] = x_test
    else:
        x_train = train_data[:, 1:-1]
        x_test = test_data[:, 1:-1]
        m_t = None
        s_t = None

    y_train = train_data[:, label_cols]
    y_test = test_data[:, label_cols]
    return x_train, x_test, y_train, y_test, test_data, m_t, s_t


# The name starts with "test_", so pytest would try to collect this helper as
# a test case (and fail resolving its arguments as fixtures). Opt out.
test_train.__test__ = False
Beispiel #2
0
def convert_to_corpus(name, rows):
    """Write ``rows`` as a labelled text corpus to ``<project>/data/<name>.txt``.

    Each row becomes one line of the form ``__label__<label> <text>``, where
    spaces in the label are replaced by underscores and Windows line breaks
    in the text are replaced by spaces before the text is passed through
    ``normalize`` (defined outside this block). The normalized text is also
    printed to stdout.

    Args:
        name: base name (without extension) of the output file.
        rows: iterable of mappings with string values under the keys
            ``'label'`` and ``'text'``.
    """
    file = join(dirname(dirname(__file__)), "data", "{}.txt".format(name))
    # The context manager guarantees the file is flushed and closed even if
    # a row is malformed or normalize raises.
    with open(file, "w") as f:
        for row in rows:
            label = '__label__' + row['label'].replace(" ", "_")
            # Normalize once and reuse the result for both print and write.
            text = normalize(row['text'].replace("\r\n", " "))
            print(text)
            f.write(label + " " + text + "\n")
def generate_stat_feature(station_id: int, forecast_date: pd.Timestamp,
                          df: pd.DataFrame, days, name):
    """
    Build max/min/mean/variance features from a window of history data.

    The window is made of the first 24 rows of each of the ``days`` days
    before ``forecast_date`` (days missing from ``df`` are skipped) followed
    by all rows of ``forecast_date`` itself. The window is normalized, then
    reduced per column; every resulting index label gets the suffix
    ``_<name>_<stat>``.
    """
    window = []
    for days_back in range(days, 0, -1):
        try:
            day_rows = df.loc[station_id,
                              forecast_date - pd.DateOffset(days=days_back)]
            window.append(day_rows.iloc[:24])
        except KeyError:
            # No data for that day: leave it out of the window.
            continue
    window.append(df.loc[station_id, forecast_date])
    h = normalize(pd.concat(window))

    stats_and_suffixes = (
        (h.max(), '_{}_max'),
        (h.min(), '_{}_min'),
        (h.mean(), '_{}_mean'),
        (h.var(), '_{}_var'),
    )

    features = []
    for stat, suffix in stats_and_suffixes:
        stat.index = stat.index + suffix.format(name)
        features.append(stat)

    return pd.concat(features)
def generate_one_set(station_id: int,
                     forecast_date: pd.Timestamp,
                     df: pd.DataFrame,
                     previous_days=2,
                     predict=False):
    """
    Assemble one sample from the forecast date and the preceding days.

    The first 24 rows of each of the ``previous_days`` days before
    ``forecast_date`` are stacked in chronological order, followed by all
    rows of ``forecast_date``. The result goes through ``check_invalid``,
    linear interpolation (gaps of up to 8 rows, both directions) and
    ``normalize``.

    Returns a tuple ``(history_obs, history_m, prediction)``:
    the ``obs_names`` columns of the first ``24 * previous_days + 4`` rows,
    the ``m_names`` columns of all rows, and the three ``*_obs`` target
    columns of the remaining rows.

    Raises AssertionError if nulls remain after interpolation; with
    ``predict=True`` only the first ``24 * previous_days + 4`` rows are
    checked.
    """
    forecast_date = pd.Timestamp(forecast_date)
    # Number of leading rows that form the observed part of the sample.
    n_observed = 24 * previous_days + 4

    day_frames = [
        df.loc[station_id, forecast_date - pd.DateOffset(days=back)].iloc[:24]
        for back in range(previous_days, 0, -1)
    ]
    day_frames.append(df.loc[station_id, forecast_date])

    history = check_invalid(pd.concat(day_frames))
    history = history.interpolate(method='linear',
                                  limit=8,
                                  limit_direction='both')
    # history = fill_nan_with_m(history)

    must_be_complete = history.iloc[:n_observed] if predict else history
    assert not must_be_complete.isnull().values.any(
    ), 'Empty data found in station {} date {}'.format(
        station_id, forecast_date)

    history = normalize(history)

    history_obs = history.iloc[:n_observed][obs_names]
    history_m = history[m_names]
    prediction = history.iloc[n_observed:][['t2m_obs', 'rh2m_obs',
                                            'w10m_obs']]

    return history_obs, history_m, prediction
Beispiel #5
0
    """
    problem1_df = df["vidsWatched"] >= 5
    problem1_df = df[problem1_df]
    print(problem1_df.head())
    xy = problem1_df.drop(['VidID', 's', 's_rel_avg', 's_tot_avg', 'stdPBR'],
                          axis=1)
    return xy


# Load the data set; readdata (defined outside this view) returns two objects.
dft, dfs = readdata("data-sets/behavior-performance.txt")
# NOTE(review): `filter` is called with a single argument and its result is
# used as a DataFrame below, so it must be a project function that shadows
# the builtin `filter` -- confirm and consider renaming it.
xy = filter(dfs)

# Number of clusters, chosen by find_k with plotting disabled.
k = find_k(plot=False)

xy2 = xy.to_numpy()
# Drop the first column before normalizing; normalize returns three values
# (presumably the normalized array plus mean and std -- TODO confirm).
xy2, m, s = normalize(xy2[:, 1:])
kmeans = KMeans(n_clusters=k)  #number of clusters
kmeans.fit(xy2)
# One row per cluster, one column per normalized feature.
centers = kmeans.cluster_centers_
# 10x10 inch figure with tight outer margins and generous spacing between
# subplots.
figure2 = plt.figure(figsize=(10, 10))
plt.subplots_adjust(bottom=.05,
                    top=0.91,
                    hspace=.5,
                    wspace=.5,
                    left=.01,
                    right=.99)
# Initial state for the loops that follow (their bodies are outside this
# view; presumably a subplot counter and a record of plotted pairs).
count = 1

graph_set = []
for col in range(centers.shape[-1]):
    for two in range(centers.shape[-1]):
Beispiel #6
0
import fasttext
from load_data import normalize

# Text file whose content is classified when this module is run as a script.
PATH = 'example.txt'

if __name__ == '__main__':
    # Load the trained model, read the whole input file (undecodable bytes
    # are dropped), normalize it, and print the three best predictions.
    classifier = fasttext.load_model('snapshots/model.bin')
    with open(PATH, errors='ignore') as f:
        text = f.read()
    # `text` rather than `str`, so the builtin `str` is not shadowed.
    text = normalize(text)
    predict = classifier.predict(text, k=3)
    print(predict)