def transform(self, X):
     #print 'getting metadata features'
     features_to_use = [ "requester_account_age_in_days_at_request", \
                     "requester_days_since_first_post_on_raop_at_request", \
                     "requester_number_of_comments_at_request", \
                     "requester_number_of_comments_in_raop_at_request", \
                     "requester_number_of_posts_at_request", \
                     "requester_number_of_posts_on_raop_at_request", \
                     "requester_number_of_subreddits_at_request", \
                     "requester_upvotes_minus_downvotes_at_request", \
                     "requester_upvotes_plus_downvotes_at_request", \
                     ]
     utc_difference = (X["unix_timestamp_of_request_utc"] - X["unix_timestamp_of_request"]).as_matrix()
     length_of_post = [len(post) for post in X['request_text_edit_aware']]
     length_of_title = [len(title) for title in X['request_title']]
     timestamps = X["unix_timestamp_of_request"]
     date_times = [datetime.fromtimestamp(ts) for ts in timestamps]
     year = np.array([dt.year for dt in date_times])
     month = np.array([dt.month for dt in date_times])
     enc = OneHotEncoder()
     weekday = np.array([[dt.isocalendar()[2]] for dt in date_times])
     weekday = enc.fit_transform(weekday).toarray()  
     hours =  np.array([[dt.hour] for dt in date_times])        
     hours = enc.fit_transform(hours).toarray()    
     return np.c_[X[features_to_use].as_matrix(), utc_difference,length_of_title,length_of_post, year , month,  weekday]
Example #2
    def transformTestData(self, train_data, test_data):
        #Select the right features for both training and testing data
        X_train, y_train = self.__selectRelevantFeatures(train_data)
        X_test, y_test = self.__selectRelevantFeatures(test_data)

        #Transform categorical variables into integer labels
        martial_le = LabelEncoder()
        occupation_le = LabelEncoder()
        relationship_le = LabelEncoder()
        race_le = LabelEncoder()
        sex_le = LabelEncoder()
        transformers = [martial_le, occupation_le, relationship_le, race_le, sex_le]

        for i in range(len(transformers)):
            X_train[:,i] = transformers[i].fit_transform(X_train[:,i])
            X_test[:,i] = transformers[i].transform(X_test[:,i])

        #Dummy code categorical variables
        dummy_code = OneHotEncoder(categorical_features = range(5))
        X_train = dummy_code.fit_transform(X_train).toarray()
        X_test = dummy_code.transform(X_test).toarray()

        #Normalize all features
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        #Encode y
        class_le = LabelEncoder()
        y_train = class_le.fit_transform(y_train)
        y_test = class_le.transform(y_test)
        #print class_le.transform(["<=50K", ">50K"])

        return X_train, X_test, y_train, y_test
def load_data():
    # Read file content
    training_file_content = pd.read_csv(TRAINING_FILE_PATH)
    testing_file_content = pd.read_csv(TESTING_FILE_PATH)
    combined_file_content = pd.concat([training_file_content, testing_file_content])

    # Manipulate file content
    X = combined_file_content.drop([ID_COLUMN_NAME, LABEL_COLUMN_NAME], axis=1).as_matrix()
    categorical_features_mask_list = []
    for column_vector in X.T:
        valid_elements_mask = np.logical_not(pd.isnull(column_vector))
        if np.can_cast(type(column_vector[valid_elements_mask][0]), np.float):
            categorical_features_mask_list.append(False)
            min_value = np.min(column_vector[valid_elements_mask])
            column_vector[np.logical_not(valid_elements_mask)] = min_value - 1
        else:
            categorical_features_mask_list.append(True)
            column_vector[np.logical_not(valid_elements_mask)] = "Missing"
            column_vector[:] = perform_categorization(column_vector)
    encoder = OneHotEncoder(categorical_features=categorical_features_mask_list)
    X = encoder.fit_transform(X).toarray()

    # Separate the data set
    Y = combined_file_content[LABEL_COLUMN_NAME].as_matrix()
    ID = combined_file_content[ID_COLUMN_NAME].as_matrix()
    test_data_mask = pd.isnull(Y)
    X_train = X[np.logical_not(test_data_mask)]
    Y_train = Y[np.logical_not(test_data_mask)]
    X_test = X[test_data_mask]
    ID_test = ID[test_data_mask]

    return X_train, Y_train, X_test, ID_test
Example #4
def get_toy_classification_data(n_samples=100, centers=3, n_features=2, type_data = "blobs"):
    # generate 2d classification dataset
    if (type_data == "blobs"):
        X, y = make_blobs(n_samples=n_samples, centers=centers, n_features=n_features)
    elif(type_data == "moons"):
        X, y = make_moons(n_samples=n_samples, noise=0.1)
    elif(type_data == "circles"):
        X, y =  make_circles(n_samples=n_samples, noise=0.05)
    # scatter plot, dots colored by class value
#    df = DataFrame(dict(x=X[:,0], y=X[:,1], label=y))
#    colors = {0:'red', 1:'blue', 2:'green'}
#    fig, ax = pyplot.subplots()
#    grouped = df.groupby('label')
#    for key, group in grouped:
#        group.plot(ax=ax, kind='scatter', x='x', y='y', label=key, color=colors[key])
#    pyplot.show()
    
    X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.25, stratify = None)
    
    classes = np.unique(y_train)
    
    if(0):
        enc = OneHotEncoder().fit(classes.reshape(-1,1))
        
        y_train = enc.transform(y_train.reshape(-1, 1))
        print (y_test)
        y_test = enc.transform(y_test.reshape(-1, 1))
        print (y_test)
    
    y_train = one_hot_encode(y_train, classes)
    y_test = one_hot_encode(y_test, classes)
    
    return  X_train, y_train, X_test, y_test, classes
Example #5
def get_coded_data(cases_df, case_ids, coded_feature_names):
    """
    Retrieves the valences corresponding to case_ids, 
    along with coded features, if any
    Recode unknown valences to neutral.
    args:
      cases_df: A dataframe containing the case variables.
      case_ids: list of sorted case_ids
      coded_feature_names: list of column names to pull from cases_df (ie 'geniss' or ['geniss','casetyp1'])
    returns:
      valences: np array of valences
      coded_feature_array: np array of coded features
      filtered_cases_df: Dataframe containing the sorted, filtered case variables
    """
    UNKNOWN_VALENCE = 0
    NEUTRAL_VALENCE = 2

    if isinstance(coded_feature_names, str):
        coded_feature_names = [coded_feature_names]

    print("coded_feature_names: ", coded_feature_names)

    valences = []
    coded_feature_list = []
    for case_id in case_ids:
        valence = cases_df[cases_df['caseid'] == case_id]['direct1'].values[0]
        if np.isnan(valence)==False:
            valence = int(valence)
        else: valence = 2

        if coded_feature_names is not None:
            coded_feature_row = cases_df[cases_df['caseid'] == case_id][coded_feature_names].values[0]
            clean_row = []

            #clean row
            for val in coded_feature_row:
                if val and np.isnan(val) == False:
                    clean_row.append(int(val))
                else:
                    clean_row.append(0)
            assert clean_row[0]>=0, ""
            coded_feature_list.append(clean_row)
            
        # Replacing unknown valence variables with netural scores.
        if valence == UNKNOWN_VALENCE:
            valence = NEUTRAL_VALENCE
        valences.append(valence)

    #one-hot encoding
    if coded_feature_names is not None:
        enc = OneHotEncoder()
        coded_feature_array = enc.fit_transform(np.array(coded_feature_list))
        print("Coded Feature Array shape: ", coded_feature_array.shape)
    else: 
        coded_feature_array = np.array([])

    #Filter case df
    filtered_case_df = filter_cases_df(cases_df,case_ids)

    return np.array(valences),coded_feature_array,filtered_case_df
Example #6
def process(discrete, cont):
  # Create discrete and continuous data matrices
  discrete_X = np.array(discrete)
  cont_X = np.array(cont)

  # Impute discrete values
  imp = Imputer(strategy='most_frequent')
  discrete_X = imp.fit_transform(discrete_X)

  # Impute continuous values
  imp_c = Imputer(strategy='mean')
  cont_X = imp_c.fit_transform(cont_X)

  # Discrete basis representation
  enc = OneHotEncoder()
  enc.fit(discrete_X)
  discrete_X = enc.transform(discrete_X).toarray()

  # Continuous scaling
  scaler = StandardScaler()
  scaler.fit(cont_X)
  cont_X = scaler.transform(cont_X)

  # Merge to one array
  X = np.concatenate((discrete_X, cont_X), axis=1)
  return X
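
Note that Imputer was removed in scikit-learn 0.22 in favour of sklearn.impute.SimpleImputer; below is a minimal sketch of the same impute/encode/scale steps with the current API (the toy input lists are assumptions for illustration):

import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Toy inputs: two discrete columns and two continuous columns with gaps
discrete = [[0, 1], [1, np.nan], [0, 2]]
cont = [[1.0, np.nan], [2.0, 3.0], [4.0, 5.0]]

# Impute, then one-hot encode the discrete block and scale the continuous block
discrete_X = SimpleImputer(strategy='most_frequent').fit_transform(discrete)
cont_X = SimpleImputer(strategy='mean').fit_transform(cont)
discrete_X = OneHotEncoder().fit_transform(discrete_X).toarray()
cont_X = StandardScaler().fit_transform(cont_X)

# Merge to one array, as in process() above
X = np.concatenate((discrete_X, cont_X), axis=1)
print(X.shape)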
Example #7
def load_bees():
    '''
    helper function to load our data
    '''
    train_fp = "/home/ubuntu/bee_images/train"
    labels = "/home/ubuntu/bee_images"
    train_labels = pd.read_csv(labels + '/' + "train_labels.csv")
    train_labels.set_index('id', inplace = True)

    bee_images = os.listdir(train_fp)
    bee_images = filter(lambda f: f[-3:] == 'jpg', bee_images)
    bee_images = filter(lambda f: f != '1974.jpg', bee_images)

    bees = []
    for i in bee_images:
        im = imread(train_fp + "/" + i, as_grey = False)
        im = resize(im, (48, 48))
        bees.append(im)

    # divide bees by 255 to give it a 0 - 1 scale
    # (255 is the current max val and zero is the min)
    bees = np.array(bees)/255.0

    Y = train_labels.ix[[int(x.split('.')[0]) for x in bee_images]].values

    onehot = OneHotEncoder(sparse = False, n_values = 2)

    Y = onehot.fit_transform(Y)
    bees, Y = gen_data(bees, Y)
    return balance(bees, Y)
def test_one_hot_encoder_not_fitted():
    X = np.array([['a'], ['b']])
    enc = OneHotEncoder(categories=['a', 'b'])
    msg = ("This OneHotEncoder instance is not fitted yet. "
           "Call 'fit' with appropriate arguments before using this method.")
    with pytest.raises(NotFittedError, match=msg):
        enc.transform(X)
Example #9
def transform_with_gbm_to_categorical(header, tr_x, tr_y, ts_x, n_est=100, learning_rate=0.1, max_depth=5):

    clf = GradientBoostingClassifier(n_estimators=n_est, learning_rate=learning_rate, max_depth=max_depth)
    clf = clf.fit(tr_x, tr_y)

    """ #Node count
    estimators = clf.estimators_
    for row in estimators:
        for e in row:
            print(e.tree_.node_count)"""
    leaf_indices = clf.apply(tr_x)
    leaf_indices = leaf_indices.reshape(leaf_indices.shape[0], -1)

    ts_leaf_indices = clf.apply(ts_x)
    ts_leaf_indices = ts_leaf_indices.reshape(ts_leaf_indices.shape[0], -1)

    enc = OneHotEncoder()
    enc.fit(np.append(leaf_indices, ts_leaf_indices, axis=0))

    tr_cat_features = enc.transform(leaf_indices).toarray()
    ts_cat_features = enc.transform(ts_leaf_indices).toarray()

    header = ["cat_" + str(i) for i in range(ts_cat_features.shape[1])]
    print("[gbm_cat] Features size: ", len(header))
    return header, tr_cat_features, ts_cat_features
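
A self-contained sketch of the same leaf-index trick on synthetic data (the make_classification dataset and the fixed 400/100 split are illustrative assumptions, not part of the original example):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder

X, y = make_classification(n_samples=500, n_features=10, random_state=0)
tr_x, tr_y, ts_x = X[:400], y[:400], X[400:]

clf = GradientBoostingClassifier(n_estimators=20, max_depth=3).fit(tr_x, tr_y)

# apply() returns the leaf index each sample falls into, one column per tree
tr_leaves = clf.apply(tr_x).reshape(tr_x.shape[0], -1)
ts_leaves = clf.apply(ts_x).reshape(ts_x.shape[0], -1)

# Fit the encoder on all observed leaf indices, as the function above does
enc = OneHotEncoder().fit(np.append(tr_leaves, ts_leaves, axis=0))
tr_cat = enc.transform(tr_leaves).toarray()
ts_cat = enc.transform(ts_leaves).toarray()
print(tr_cat.shape, ts_cat.shape)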
def prepare_features(data, enc=None, scaler=None):
    '''
    One-hot encode all boolean/string (categorical) features,
    and shift/scale integer/float features
    '''
    # X needs to contain only non-negative integers
    bfs = data['bfeatures'] + 1
    sfs = data['sfeatures'] + 1
    
    # Shift/scale integer and float features to have mean=0, std=1
    ifs = data['ifeatures']
    ffs = data['ffeatures']
    x2 = np.hstack((ifs,ffs))
    if scaler is None:
        scaler = StandardScaler()
        x2 = scaler.fit_transform(x2)
        print("Training features have mean: %s" % scaler.mean_)
        print("and standard deviation: %s" % scaler.std_)
    else:
        x2 = scaler.transform(x2, copy=False)
        
    # one-hot encode categorical features
    X = np.hstack((bfs,sfs,x2))
    categorical = np.arange(bfs.shape[1]+sfs.shape[1])
    if enc is None:
        enc = OneHotEncoder(n_values='auto', categorical_features=categorical)
        X = enc.fit_transform(X)
        print("One-hot encoded features have dimension %d" % X.shape[1])
    else:
        X = enc.transform(X)
    return X, enc, scaler
Example #11
def modelselect(input_filename, num_test_examples, block_size, n_estimators=100):
    # Perform some model selection to determine good parameters
    # Load data
    X_train, y_train, X_test, y_test, scaler = loaddata(input_filename, num_test_examples, block_size)

    # Feature generation using random forests
    forest = RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1)
    forest.fit(X_train, y_train)
    encoder = OneHotEncoder()
    encoder.fit(forest.apply(X_train))
    X_train = encoder.transform(forest.apply(X_train))
    learner = SGDClassifier(
        loss="hinge",
        penalty="l2",
        learning_rate="invscaling",
        alpha=0.001,
        average=10 ** 4,
        eta0=0.5,
        class_weight="balanced",
    )

    metric = "f1"
    losses = ["log", "hinge", "modified_huber", "squared_hinge", "perceptron"]
    penalties = ["l2", "l1", "elasticnet"]
    alphas = 10.0 ** numpy.arange(-5, 0)
    learning_rates = ["constant", "optimal", "invscaling"]
    param_grid = [{"alpha": alphas, "loss": losses, "penalty": penalties, "learning_rate": learning_rates}]
    grid_search = GridSearchCV(learner, param_grid, n_jobs=-1, verbose=2, scoring=metric, refit=True)

    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_, grid_search.best_score_)
    return grid_search
Example #12
class CategoricalExpansion(BaseEstimator, TransformerMixin):
    """
    Uses one hot encoder to expand categorical columns
    Don't use this in a pipeline

    Arguments:
    =========
    threshold: int
        The maximum number of unique values that a column can have
        for it to be considered categorical

    Returns:
    ========
    Sparse matrix of expanded column.
    """
    def __init__(self, threshold):
        self.threshold = threshold

    def fit(self, X, y=None):
        uniques = [(len(x.unique()), x.dtype.kind) for n, x in X.iteritems()]
        self.mask_ = [(x[0] < self.threshold and x[1] == 'i') for x in uniques]
        self.encoder_ = OneHotEncoder()
        self.encoder_.fit(X.loc[:, self.mask_])
        return self

    def transform(self, X):
        return self.encoder_.transform(X.loc[:, self.mask_])
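
A short usage sketch for the transformer above; the toy DataFrame is an assumption, and fit() relies on DataFrame.iteritems, which was removed in pandas 2.0, so an older pandas (or swapping in .items()) is assumed:

import pandas as pd

df = pd.DataFrame({
    "color_id": [0, 1, 2, 1, 0],            # few unique integers -> expanded
    "amount": [10.5, 3.2, 7.7, 1.0, 4.4],   # float column -> excluded by the mask
})
expander = CategoricalExpansion(threshold=10)
expanded = expander.fit(df).transform(df)   # sparse matrix of one-hot columns
print(expanded.toarray())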
Example #13
class ExpandCategorical(BaseEstimator, TransformerMixin):
    def __init__(self, columns, append=False, only_new=False):
        if isinstance(columns, str):
            columns = [columns]
        self.columns = columns
        self.append = append
        self.only_new = only_new

    def fit(self, X=None, y=None):
        self.encoder_ = OneHotEncoder()
        self.encoder_.fit(X.loc[:, self.columns])
        # Expand the column names
        new_colnames = []
        for i, c in enumerate(self.columns):
            this_map = self.encoder_.active_features_[self.encoder_.feature_indices_[i]:self.encoder_.feature_indices_[i+1]]
            for n in this_map:
                new_colnames.append("{}_{}".format(c, str(n)))

        self.new_colnames_ = new_colnames
        return self

    def transform(self, X):
        new_data = pd.DataFrame(self.encoder_.transform(X.loc[:, self.columns]).toarray(), index=X.index, columns=self.new_colnames_)
        assert new_data.shape[0] == X.shape[0], "Row lengths do not match"
        if self.only_new:
            return new_data
        res = X.copy()
        if not self.append:
            # Remove the unexpanded columns from the data frame
            for c in self.columns:
                res.drop(c, 1, inplace=True)
        return res.join(new_data)
Example #14
	def apply_onehot(self, columns=[]):
		enc = OneHotEncoder()
		enc.fit(self.M[:, columns])
		R = enc.transform(self.M[:, columns]).toarray()
		self.M = np.c_[self.M[:,[x for x in range(self.M.shape[1]) if x not in columns]], R]
		self.class_index -= len([c for c in columns if c < self.class_index])
		return self
Example #15
def cost(all_thetas, weights, X, y, lamb):
    thetas = unpack_thetas(all_thetas, weights)
    
    # add column of 1's
    X = X/255
    a1 = np.insert(X, 0, 1, 1)
    
    # create a binary index matrix of y data and initialize activation layers
    encoder = OneHotEncoder(sparse=False)
    y_matrix = encoder.fit_transform(y.T)
    act_layers = activation_layers(a1, thetas)
    
    # cost function created in seperate parts
    first = np.multiply(-y_matrix, np.log(act_layers[-1]))
    second = np.multiply(1 - y_matrix, np.log(1 - act_layers[-1]))
    
    # regularization
    reg_1 = lamb/(2 * len(X))
    reg_2 = 0
    for i in range(len(thetas)):
        reg_2 += np.power(thetas[i][...,1:], 2).sum()
    
    J = 1/len(X) * (first - second).sum() + (reg_1 * reg_2)
    print('Current Cost')
    print(J)
    print('*' * 20)
    return J
Example #16
def prepare_items_features(user_items_csv, out_dir):
    array = np.loadtxt(user_items_csv, delimiter='|',
            dtype=np.dtype(np.uint64))

    le = LabelEncoder()
    col1 = le.fit_transform(array[:, 1].T)
    col2 = le.fit_transform(array[:, 2].T)
    col3 = le.fit_transform(array[:, 3].T)
    col4 = le.fit_transform(array[:, 4].T)

    columns = np.array([col1, col2, col3, col4]).T
    enc = OneHotEncoder()
    print(array[:10])
    encoded = np.c_[array[:, 0], enc.fit_transform(columns).toarray()]
    print(encoded[:10])
    print(encoded.shape)

    user_id = encoded[0][0]
    rows = []
    current = np.zeros(encoded.shape[1]-1)
    for i in range(encoded.shape[0]):
        if encoded[i][0] != user_id:
            rows.append(np.concatenate([[user_id], current]))
            user_id = encoded[i][0]
            current = np.zeros(encoded.shape[1]-1)
        else:
            current = np.sum([current, encoded[i, 1:]], axis=0)
    rows.append(np.concatenate([[user_id], current]))

    array = np.array(rows)
    print(array.shape)

    # let's serialize array
    np.save(os.path.join(out_dir, "user_items"), array)
def getdataset(datasetname, onehot_encode_strings=True):
    # load
    dataset = fetch_mldata(datasetname)
    # get X and y
    X = dshape(dataset.data)
    try:
        target = dshape(dataset.target)
    except:
        print("WARNING: No target found. Taking last column of data matrix as target")
        target = X[:, -1]
        X = X[:, :-1]
    if len(target.shape) > 1 and target.shape[1] > X.shape[1]:  # some mldata sets are mixed up...
        X = target
        target = dshape(dataset.data)
    if len(X.shape) == 1 or X.shape[1] <= 1:
        for k in dataset.keys():
            if k != 'data' and k != 'target' and len(dataset[k]) == X.shape[1]:
                X = np.hstack((X, dshape(dataset[k])))
    # one-hot for categorical values
    if onehot_encode_strings:
        cat_ft = [i for i in range(X.shape[1]) if 'str' in str(
            type(unpack(X[0, i]))) or 'unicode' in str(type(unpack(X[0, i])))]
        if len(cat_ft):
            for i in cat_ft:
                X[:, i] = tonumeric(X[:, i])
            X = OneHotEncoder(categorical_features=cat_ft).fit_transform(X)
    # if sparse, make dense
    try:
        X = X.toarray()
    except:
        pass
    # convert y to monotonically increasing ints
    y = tonumeric(target).astype(int)
    return np.nan_to_num(X.astype(float)), y
def one_hot_encode(train_discrete_features, test_discrete_features):
    """ Perform one hot encoding to both train and test set.
        Use this when having memory limitation, otherwise to use
        scikit-learn's OneHotEncoder.
    parameters:
    --------------------------------------------------------
    train_discrete_features: discrete features of training data
    test_discrete_features: discrete features of test data
    """
    m, n = train_discrete_features.shape
    train_encoded_features = lil_matrix((LENGTH_OF_TRAIN, MAX_OF_DIM))
    test_encoded_features = lil_matrix((LENGTH_OF_TEST, MAX_OF_DIM))
    cnt = 0

    for i in range(n):
        print("processing " + str(i) + "th feature...")
        train_column = train_discrete_features[:, i]
        test_column = test_discrete_features[:, i]

        # one hot encode the value in train and test
        encoder = OneHotEncoder(handle_unknown="ignore")
        train_encoded_column = lil_matrix(encoder.fit_transform(np.mat(train_column).T))
        test_encoded_column = lil_matrix(encoder.transform(np.mat(test_column).T))

        # get number of features
        _, num = train_encoded_column.shape

        # put the column into matrix
        for j in range(num):
            train_encoded_features[:,cnt+j] = train_encoded_column[:,j]
            test_encoded_features[:,cnt+j] = test_encoded_column[:,j]
            
        cnt += num

    return csr_matrix(train_encoded_features[:, 0:cnt]), csr_matrix(test_encoded_features[:, 0:cnt])
def encode_non_numeric(train, test, column):

    # compose full list of options
    options = list(set(list(train[column].unique()) + list(test[column].unique())))

    # encode them with integers
    for i, option in enumerate(options):
        train.loc[:, column] = train.loc[:, column].replace(option, i + 1)
        test.loc[:, column] = test.loc[:, column].replace(option, i + 1)

    # recode into one-hot vectors
    options = list(set(list(train[column].unique()) + list(test[column].unique())))
    enc = OneHotEncoder(sparse=False)
    enc.fit(np.matrix(options).T)
    original_names = dict((i, a) for i, a in enumerate(train.columns.values))
    train = pd.concat([train, pd.DataFrame(enc.transform(np.matrix(train[column]).T))], axis=1, ignore_index=True)
    test = pd.concat([test, pd.DataFrame(enc.transform(np.matrix(test[column]).T))], axis=1, ignore_index=True)
    train = train.rename(columns=original_names)
    test = test.rename(columns=original_names)

    # drop the original of the encoded column
    train = train.drop(column, axis=1)
    test = test.drop(column, axis=1)

    return train, test
Example #20
class Fileio(object):
	""" Fileio helper """
	def __init__(self, train='../data/train.csv', test='../data/test.csv'):
		# Create a OneHotEncoder
		self.encoder = OneHotEncoder()
		self.trainDF = pd.read_csv(train,usecols=[0])
		self.trainDF['ID'] = map(lambda x: "%s.%06i"%(x[0],x[1]), zip(['train']*NUMTRAIN, range(NUMTRAIN)))

		self.testDF = pd.read_csv(test)
		self.testDF['ID'] = map(lambda x: "%s.%06i"%(x[0],x[1]), zip(['test']*NUMTEST, range(NUMTEST)))

	def encode(self,usecols):
		self.encoder.fit(np.array(self.df.ix[:,usecols],dtype='float'))

	def transformTrain(self,cols,idCol=8):
		""" Transform the training set"""
	
		x = pd.merge(self.trainDF,self.df.ix[:,[idCol]+cols],how='left',on='ID',sort=False)
		ignore = ['ID','ACTION']
		usecols = [c for c in x.columns if c not in ignore]
		return self.encoder.transform(np.array(x.ix[:,usecols],dtype='float')), np.array(x.ACTION)

	def transformTest(self,cols,idCol=8):
		""" Transform the testing set"""

		x = pd.merge(self.testDF.ix[:,['ID','ROLL_CODE']],self.df.ix[:,[idCol]+cols]
			,how='left',on='ID',sort=False)
		ignore = ['ID','ROLL_CODE']
		usecols = [c for c in x.columns if c not in ignore]
		return self.encoder.transform(np.array(x.ix[:,usecols],dtype='float'))
Example #21
def main():
    enc = OneHotEncoder(n_values=[7,7,7,7,7,7])
    conn = sqlite3.connect('server.db')
    cursor = conn.cursor()
    all_ = pandas.read_sql_query('SELECT layers.burger, labels.output, layers.layer0, layers.layer1, layers.layer2, layers.layer3, layers.layer4, layers.layer5 FROM layers,labels WHERE layers.burger = labels.burger', conn, index_col='burger')
    
    X = all_.drop(['output'], axis=1)
    y = all_['output']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

               
    clf = MLPClassifier(solver='adam',  activation='relu',
                        verbose=False,
                        max_iter=10000,
                        tol=1e-9,
                        random_state=1)
    
    X_train_categoricals = X_train[column_names]
    tX_train_categoricals = enc.fit_transform(X_train_categoricals)
    clf.fit(tX_train_categoricals, y_train.as_matrix().astype(int))

    
    X_test_categoricals = X_test[column_names]
    tX_test_categoricals = enc.transform(X_test_categoricals)  # reuse the encoder fitted on the training split
    prediction = clf.predict(tX_test_categoricals)
    
    print(classification_report(y_test, prediction))
    
    print_eval(y_test, prediction)
Example #22
def vectorize_data(df):

    cat_vars = ["UniqueCarrier",
                "OriginAirportID",
                "OriginAirportSeqID",
                "OriginCityMarketID",
                "OriginState",
                "DestAirportID",
                "DestAirportSeqID",
                "DestCityMarketID",
                "DepTimeBlk",
                "ArrTimeBlk",
                "DistanceGroup",
                "DestState"]

    con_vars = ["CRSElapsedTime",
                "Distance",
                "CRSDepTime",
                "CRSArrTime",
                "WeekDay",
                "YearDay"]

    df = df.dropna()

    Xenc = OneHotEncoder()

    X1 = Xenc.fit_transform(df[cat_vars].as_matrix())
    X2 = df[con_vars].as_matrix()

    X = sparse.hstack((X1, X2))
    X = X.tocsr()

    y = df["Cancelled"].as_matrix()

    return X, y, Xenc
Example #23
def load_dataset_from_file(filename, examples_count, is_labeled=True, expand_categorical=True):
    data = open (filename, 'r').readlines()
    # Next two lines verifies that the parsing result of header is what
    # we expect.
    header, _unused = parse_line(data[0], is_labeled, is_header=True)
    assert header == EXPECTED_HEADER
    data_X = []
    data_y = []
    cnt = 0
    for line in data[1:]:
        cnt += 1
        if len(data_X) == examples_count:
            break
        parse_result = get_features(line, is_labeled)
        if parse_result is None:
            continue
        (features, label) = parse_result
        data_X.append(np.array(features))
        data_y.append(label)
        if len(data_X) % 100000 == 0:
            print("Processed %d rows, loaded %d examples." % (
                cnt, len(data_X)))
    cat_X = data_X
    if expand_categorical:
        encoder = OneHotEncoder(categorical_features=list(CATEGORICAL_FEATURES), sparse=False)
        cat_X = encoder.fit_transform(cat_X)
        cat_X = MaxAbsScaler().fit_transform(cat_X)
        print("Feature indices: ", encoder.feature_indices_)
        print("Cat_X shape: ", cat_X.shape)
    return (data_X, cat_X, np.array(data_y) if is_labeled else None)
Example #24
def pywfmLocalModel(trainFeature, testFeature, trainLabel, testLabel, trainIndex, testIndex, fm, cvIndex):


	print('run local: folds: ' + str(cvIndex))

	trainIndex, testIndex, value1, value2 = getIntId(trainIndex, testIndex)
	encoder = OneHotEncoder(n_values=[value1, value2])
	trainIndex_encode = encoder.fit_transform(trainIndex)
	testIndex_encode = encoder.transform(testIndex)

	trainFeature = hstack((trainIndex_encode, trainFeature))
	testFeature = hstack((testIndex_encode, testFeature))

	'''
	for i in range(len(trainLabel)):
		if i == 0:
			trainLabel[i] = -1
	for i in range(len(testLabel)):
		if i == 0:
			testLabel[i] = -1
	'''
	model = fm.run(trainIndex_encode, trainLabel, testIndex_encode, testLabel)

	predict = model.predictions

	predict = np.array(predict, np.float)

	predict = (predict - np.min(predict))/(np.max(predict) - np.min(predict))


	return predict
def convert_network(filename,final_filename, var_flag = 0):
	'''
	Filename : input filename of csv filename
	final_filename : o/p filename of .pickle file
	'''
	res = {'x':[],'y':[]}
	with open(filename,'rb') as csvfile:
		f = csv.reader(csvfile)
		count = 0
		for line in f:
			if count != 0:
				if var_flag == 0:
					res['x'].append(line[:-2]+[line[-1]])
					res['y'].append(float(line[-2]))
				else:
					res['x'].append(line[:-1])
					res['y'].append(float(line[-1]))
			count += 1

	res['x'] = get_num(res['x'])
	m = len(res['x'][0])-1
	enc = OneHotEncoder(categorical_features = range(m),sparse = False)
	enc.fit(res['x'])
	res['x'] = enc.transform(res['x'])
	
	with open(final_filename,'wb') as f:
		pickle.dump(res,f)
Example #26
class CategoricalColumn(BaseEstimator, TransformerMixin):
    '''
    Take a string or key categorical column and transform it
    to one hot encodings.
    '''

    def __init__(self):
        '''
        Set up the internal transformation.
        '''
        self._labeler = LabelEncoder()
        self._encoder = OneHotEncoder()

    def fit(self, X, y=None):
        '''
        Fit the label and encoding
        '''
        handle_none = list(map(str, X))
        encoded = self._labeler.fit_transform(handle_none)
        self._encoder.fit(encoded.reshape(-1, 1))
        return self

    def transform(self, X):
        '''
        Transform a column of data into one hot encodings.

        Parameters
        ----------
        X : pandas series or numpy array
        '''
        handle_none = list(map(str, X))
        encoded = self._labeler.transform(handle_none)
        return self._encoder.transform(encoded.reshape(-1, 1)).todense().astype(np.float32)
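
A brief usage sketch for the class above (the toy Series values are assumptions; numpy and pandas are assumed to be imported as np and pd, as in the original module):

import numpy as np
import pandas as pd

col = CategoricalColumn()
col.fit(pd.Series(["red", "green", None, "red"]))        # None is stringified to "None"
print(col.transform(pd.Series(["green", None, "red"])))  # dense float32 one-hot matrix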
Example #27
def pywfmPredictModel(trainFeature, testFeature, trainLabel, trainIndex, testIndex, fm):


	print('run online!')

	trainIndex, testIndex, value1, value2 = getIntId(trainIndex, testIndex)
	encoder = OneHotEncoder(n_values=[value1, value2])
	trainIndex_encode = encoder.fit_transform(trainIndex)
	testIndex_encode = encoder.transform(testIndex)

	trainFeature = hstack((trainIndex_encode, trainFeature))
	testFeature = hstack((testIndex_encode, testFeature))

	#print trainFeature

	'''
	for i in range(len(trainLabel)):
		if i == 0:
			trainLabel[i] = -1
	for i in range(len(testLabel)):
		if i == 0:
			testLabel[i] = -1
	'''
	testLabel = np.zeros((testFeature.shape[0]))
	model = fm.run(trainFeature, trainLabel, testFeature, testLabel)

	predict = model.predictions

	predict = np.array(predict, np.float)
	print(np.max(predict), np.min(predict))

	#predict = (predict - np.min(predict))/(np.max(predict) - np.min(predict))


	return predict
Example #28
def convert_categorical_to_numeric(state_holiday):
    enc = OneHotEncoder()
    state_holiday[state_holiday=='a'] = 1
    state_holiday[state_holiday=='b'] = 2
    state_holiday[state_holiday=='c'] = 3
    enc.fit(state_holiday)
    return enc.transform(state_holiday).toarray()
def loadData(experiment):
    if "size" in experiment:
        size = experiment["size"]
    else:
        size = 0
    data, label, description, reduce = experiment["dataset"]()

    if size > 0:
        initialReduceBlockSize = np.arange(size, size+0.2, 0.1)
        testSetPercentage = 0.2
        trainDataBlocks, trainLabelBlocks, testDataBlocks, testLabelBlocks = data_factory.splitDatasetInBlocks(data, np.array(label), initialReduceBlockSize, testSetPercentage)

        data = trainDataBlocks[0][0]
        label = trainLabelBlocks[0][0]

    # if required (cancer datasets) perform binary encoding
    if experiment['binary_encode']:
        print("perform binary encode")
        analyze(data, label, "before encode")
        # encode features (one-hot-encoder / dummy coding)
        enc = OneHotEncoder()
        enc.fit(data)
        data = enc.transform(data).toarray()
        analyze(data, label, "after encode")

    return data, label, description, reduce
Example #30
def _to_one_hot_encoding(labels, dtype=np.float64):
    """Creates a one-hot encoding of the labels."""
    labels = labels.reshape((labels.shape[0], 1))
    from sklearn.preprocessing import OneHotEncoder

    enc = OneHotEncoder(dtype=dtype)
    return enc.fit_transform(labels).toarray()
Example #31
X = df[c_vars.header_useful].as_matrix()
y = df['click'].as_matrix()

del df

print (str(datetime.now()) + ' Label Encoding Started')
label_encoder = [LabelEncoder() for _ in range(3)]
for i in range(len(label_encoder)):
    label_encoder[i].fit(X[:,i])
    # print (i, c_vars.header_useful[i], label_encoder[i].get_params(deep=True))
    X[:,i] = label_encoder[i].transform(X[:,i])
print (str(datetime.now()) + ' Label Encoding Completed')

print (str(datetime.now()) + ' OHE Started')

ohe = OneHotEncoder(sparse = False)
ohe.fit(X[:,[0,1,2,3,4]])
# X_ohe = ohe.transform(X[:,[0,1,2,3,4]])
print (str(datetime.now()) + ' OHE Completed')

# X = X[:,[i for i in range(len(c_vars.header_useful)) if i not in [0,1,2,3,4,5]]]

# X = np.hstack((X, X_ohe))

'''
'''
# save the label encoder and the one hot encoding to disk
with open('../analysis_graphs/label_encoder', 'wb') as f:
    pickle.dump(label_encoder, f)

with open('../analysis_graphs/ohe', 'wb') as f:
Example #32
def test_one_hot_encoder_inverse(sparse_, drop):
    X = [["abc", 2, 55], ["def", 1, 55], ["abc", 3, 55]]
    enc = OneHotEncoder(sparse=sparse_, drop=drop)
    X_tr = enc.fit_transform(X)
    exp = np.array(X, dtype=object)
    assert_array_equal(enc.inverse_transform(X_tr), exp)

    X = [[2, 55], [1, 55], [3, 55]]
    enc = OneHotEncoder(sparse=sparse_, categories="auto", drop=drop)
    X_tr = enc.fit_transform(X)
    exp = np.array(X)
    assert_array_equal(enc.inverse_transform(X_tr), exp)

    if drop is None:
        # with unknown categories
        # drop is incompatible with handle_unknown=ignore
        X = [["abc", 2, 55], ["def", 1, 55], ["abc", 3, 55]]
        enc = OneHotEncoder(
            sparse=sparse_,
            handle_unknown="ignore",
            categories=[["abc", "def"], [1, 2], [54, 55, 56]],
        )
        X_tr = enc.fit_transform(X)
        exp = np.array(X, dtype=object)
        exp[2, 1] = None
        assert_array_equal(enc.inverse_transform(X_tr), exp)

        # with an otherwise numerical output, still object if unknown
        X = [[2, 55], [1, 55], [3, 55]]
        enc = OneHotEncoder(
            sparse=sparse_, categories=[[1, 2], [54, 56]], handle_unknown="ignore"
        )
        X_tr = enc.fit_transform(X)
        exp = np.array(X, dtype=object)
        exp[2, 0] = None
        exp[:, 1] = None
        assert_array_equal(enc.inverse_transform(X_tr), exp)

    # incorrect shape raises
    X_tr = np.array([[0, 1, 1], [1, 0, 1]])
    msg = re.escape("Shape of the passed X data is not correct")
    with pytest.raises(ValueError, match=msg):
        enc.inverse_transform(X_tr)
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from  sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# Importing the dataset
data = pd.read_csv('50_Startups.csv')
x = data.iloc[:, :-1].values
y = data.iloc[:, -1].values


ct  = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
x = np.array(ct.fit_transform(x))

x_train, x_test, y_train, y_test  = train_test_split(x, y, test_size=0.2,  random_state=1)

regressor = LinearRegression()
regressor.fit(x_train, y_train)

percentErrors = (abs(regressor.predict(x_test) - y_test) / y_test) * 100
AveragePercentError = sum(percentErrors) / len(percentErrors)
y_pred = regressor.predict(x_test)

np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))
print("the accuracy of the model is:", 100 - AveragePercentError)
X_train[:,1] = sexe_le.fit_transform(X_train[:,1])
X_test[:,1] = sexe_le.transform(X_test[:,1])


# In[35]:


X_train[:,5] = embark_le.fit_transform(X_train[:,5])
X_test[:,5] = embark_le.transform(X_test[:,5])


# In[37]:


from sklearn.preprocessing import OneHotEncoder
embark_ohe = OneHotEncoder(categorical_features = [5])
X_train = embark_ohe.fit_transform(X_train)
X_test = embark_ohe.transform(X_test)


# In[40]:


X_train = X_train.toarray()
X_test = X_test.toarray()


# In[42]:


X_train = X_train[:,1:]
Example #35
X = dataset.iloc[:, -9:-1].values
Y = dataset.iloc[:, -1].values

# Impute NaN values with the column mean
imptr = Imputer(missing_values="NaN", strategy="mean", axis=0)

# fit and transform each column separately: a single shared fit would be
# overwritten by the second call and both columns would get the same mean
X[:, 0:1] = imptr.fit_transform(X[:, 0:1])
X[:, 7:8] = imptr.fit_transform(X[:, 7:8])

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

labEncr_X = LabelEncoder()
X[:, 2] = labEncr_X.fit_transform(X[:, 2])
X[:, 5] = labEncr_X.fit_transform(X[:, 5])
onehotEncr = OneHotEncoder(categorical_features=[2, 5])  # encode both label-encoded columns
X = onehotEncr.fit_transform(X).toarray()

# Encode the target values
labEnc_Y = LabelEncoder()
Y = labEnc_Y.fit_transform(Y)

# Split into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=0)
Example #36
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, StandardScaler

dataset = pd.read_csv("Social_Network_Ads.csv")

print(dataset.shape)
print(dataset.head())

columns_to_encode = ['Gender']
columns_to_scale = ['Age', 'EstimatedSalary']

encoder = OneHotEncoder(sparse=False)
scaler = StandardScaler()

encoded_columns = encoder.fit_transform(dataset[columns_to_encode])
scaled_columns = scaler.fit_transform(dataset[columns_to_scale])

print("shape: ", encoded_columns.shape)

processed_dataset = np.concatenate([encoded_columns, scaled_columns], axis=1)

dataset = pd.concat([pd.DataFrame(processed_dataset), dataset.Purchased],
                    axis=1)

print(dataset.head())

X = dataset.iloc[:, :-1].values
Example #37
hour_x_train['dteday'] = (hour_x_train['dteday'] - pd.to_datetime('2011-01-01')
                          ) / pd.Timedelta('1 days')
hour_x_val['dteday'] = (hour_x_val['dteday'] -
                        pd.to_datetime('2011-01-01')) / pd.Timedelta('1 days')
numeric_features = ['dteday', 'temp', 'atemp', 'hum', 'windspeed']
numeric_transformer = Pipeline(
    steps=[('imputer',
            SimpleImputer(strategy='median')), ('scaler', StandardScaler())])

categorical_features = [
    'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday',
    'weathersit'
]
categorical_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='most_frequent')
            ), ('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(transformers=[(
    'num', numeric_transformer,
    numeric_features), ('cat', categorical_transformer, categorical_features)])

preprocessor.fit(hour_x_train)
x_train = preprocessor.transform(hour_x_train).todense()
x_val = preprocessor.transform(hour_x_val).todense()
pickle.dump(preprocessor, open('encoder.p', "wb"))  # Save encoder
print('Predictors prepared')

# Prepare targets
y_train = hour_y_train.values.astype(float)
y_val = hour_y_val.values.astype(float)
print('all data prepared')
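
The same numeric/categorical ColumnTransformer pattern in a compact, self-contained form (the toy frame and its column names are assumptions for illustration):

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

toy = pd.DataFrame({"temp": [0.2, None, 0.5],
                    "season": ["winter", "summer", "winter"]})

pre = ColumnTransformer(transformers=[
    ("num", Pipeline([("imputer", SimpleImputer(strategy="median")),
                      ("scaler", StandardScaler())]), ["temp"]),
    ("cat", Pipeline([("imputer", SimpleImputer(strategy="most_frequent")),
                      ("onehot", OneHotEncoder(handle_unknown="ignore"))]), ["season"]),
])
print(pre.fit_transform(toy))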
Example #38

if __name__ == '__main__':
    files=os.listdir(".")
    files=[i for i in files if i.split(spliter)[-1]=="p"]
    output=[file+spliter+"toPre.fa" for file in files]
    featureSize=200
    lists=[[files[i],output[i],200] for i in range(0,len(files))]
    pool = multiprocessing.Pool(int(t))
    d = pool.map(getSeqFragment,lists)
    pool.close()
    pool.join()


integer_encoder = LabelEncoder()
one_hot_encoder = OneHotEncoder()
input_features = []


def getData(file):
    feature_integer_encoder = LabelEncoder()
    input_features = []
    records=SeqIO.parse(file,"fasta")
    l_seq=[str(rec.seq) for rec in records]
    records=SeqIO.parse(file,"fasta")
    l_target=[rec1.id.split("_")[-1] for rec1 in records]
    voc=["A","C","G","T","N"]
    feature_integer_encoder.fit(voc)
    sequences = list(filter(None, l_seq))
    for sequence in sequences:
        integer_encoded = feature_integer_encoder.transform(list(sequence))
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Churn_Modelling.csv')
X = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
onehotencoder = OneHotEncoder(categorical_features = [1])
X = onehotencoder.fit_transform(X).toarray()
X = X[:, 1:]

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Part 2 - Now let's make the ANN!
Example #40
class_mapping = {label:idx for idx, label in enumerate(np.unique(df['classlabel']))}
inv_class_mapping = {v: k for k, v in class_mapping.items()}
df['classlabel'] = df['classlabel'].map(class_mapping)
print(df)
df['classlabel'] = df['classlabel'].map(inv_class_mapping)
print(df)
#method 2: use LabelEncoder
from sklearn.preprocessing import LabelEncoder
#make instance of label encoder
class_le = LabelEncoder()
#convert classlabel to integer
#fit + transform
y = class_le.fit_transform(df['classlabel'].values)
print(df)
print(y)
#extract color size price
X = df[['color', 'size', 'price']].values
color_le = LabelEncoder()
#convert color label to integer
X[:, 0] = color_le.fit_transform(X[:, 0])
print(X)
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(categorical_features=[0], sparse=True)
print(ohe.fit_transform(X))
print(ohe.fit_transform(X).toarray())
#automatically calculate one hot vector
print(pd.get_dummies(df[['price', 'color', 'size']]))
#avoid multicollinearity by dropping the first dummy column
print(pd.get_dummies(df[['price', 'color', 'size']], drop_first=True))

Example #41
import dash_html_components as html
import dash_daq as daq

import flask

import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

# load our data
mtcars = pd.read_csv('mtcars.csv', dtype={'cyl': str, 'am': np.float64})

# create and fit a one-hot encoder--we'll want to reuse this in the app as well
cyl_enc = OneHotEncoder(categories='auto', sparse=False)
cyl_enc.fit(mtcars['cyl'].values.reshape(-1, 1))

y = mtcars['mpg']
# we need to concatenate the one-hot (dummy) encoded values with
# the values from mtcars
X = np.concatenate((mtcars[['disp', 'qsec', 'am']].values,
                    cyl_enc.transform(mtcars['cyl'].values.reshape(-1, 1))),
                   axis=1)

# fit our regression model
fit = LinearRegression()
fit.fit(X=X, y=y)


def preds(fit, cyl_enc, disp, qsec, am, cyl):
Example #42
        labelencoder_X_col = LabelEncoder()
        df[col] = labelencoder_X_col.fit_transform(df[col])
        return df
    elif axis == 1:
        labelencoder_y = LabelEncoder()
        df = labelencoder_y.fit_transform(df)
    return df


for col in [
        'Generic Group', 'Generic Brand', 'Generic Product Category',
        'Generic Product', 'Variable Group', 'Units'
]:
    X = label_encode(X, col, axis=0)

onehotencoder = OneHotEncoder(sparse=False)
X = onehotencoder.fit_transform(X)
y = pd.get_dummies(y)

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, PReLU

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.30,
                                                    random_state=42)

classifier = Sequential()
classifier.add(
Example #43
    data_voiced = []
    data_unvoiced = []
    for i in range(len(labels_tot)):
        if labels_tot[i] == 1:  #voiced
            data_voiced.append(data_tot[i])
        else:  #unvoiced
            data_unvoiced.append(data_tot[i])

    data_length = min(len(data_unvoiced), len(data_voiced))
    data_voiced = np.asarray(data_voiced[:data_length])
    data_unvoiced = np.asarray(data_unvoiced[:data_length])
    data = np.concatenate((data_voiced, data_unvoiced), axis=0)
    labels = np.concatenate((np.ones(data_length), np.zeros(data_length)))

    # One hot encoding the labels
    onehotencoder = OneHotEncoder(categorical_features='all')
    labels_encoded = onehotencoder.fit_transform(
        np.asarray(labels).reshape(-1, 1)).toarray()

    training_ratio = 0.75
    training_index = int(data_length * training_ratio)
    training_data, test_data = np.concatenate(
        (data_voiced[:training_index],
         data_unvoiced[:training_index])), np.concatenate(
             (data_voiced[training_index:], data_unvoiced[training_index:]))

    # Build the neural networks
    # The autoencoder
    encoding_dim = 200
    input_dim = Input(shape=(max_length, ))
    encoded = Dense(encoding_dim, activation='relu')(input_dim)
Example #44
class BaseModel(mlflow.pyfunc.PythonModel):
    def __init__(self, model_cls, model_params={}, table_columns=[]):
        self._model_params = model_params
        self._model_obj = model_cls(**model_params)

        self._features = DEFAULT_FEATURES
        self._cat_features = DEFAULT_CATEGORICAL_FEATURES
        self._cont_features = [
            f for f in self._features if f not in self._cat_features
        ]

        self._label = 'device_operational_status'

        self._table_columns = table_columns

    def fit(self, X, y):
        self._cat_x_encoder = OneHotEncoder(handle_unknown='ignore').fit(
            X[self._cat_features])
        self._y_encoder = LabelEncoder().fit(y)

        _X = self._preprocess_X(X)
        _y = self._preprocess_y(y)

        self._model_obj = self._model_obj.fit(X=_X, y=_y)
        return self

    def predict(self, context, X):
        _y = self._predict(X)
        return _y

    def _predict(self, X):
        #TODO: hide
        if len(X.columns) == len(self._table_columns):
            X.columns = self._table_columns

        _X = self._preprocess_X(X)
        _y_num = self._model_obj.predict(_X)
        _y = self._y_encoder.inverse_transform(_y_num)
        return _y

    def load_context(self, context):
        with open(context.artifacts['model'], 'rb') as file:
            self._model_obj = pickle.load(file)

    def get_label_names(self):
        out = self._y_encoder.classes_
        return out

    def _preprocess_X(self, X):
        _X_processed = np.concatenate([
            X[self._cont_features],
            self._cat_x_encoder.transform(X[self._cat_features]).todense()
        ],
                                      axis=1)
        return _X_processed

    def _preprocess_y(self, y):
        _y_preprocessed = self._y_encoder.transform(y)
        return _y_preprocessed

    def log_to_mlflow(self):
        with TempDir() as local_artifacts_dir:
            # dumping model
            model_path = local_artifacts_dir.path('model.pkl')
            with open(model_path, 'wb') as m:
                pickle.dump(self._model_obj, m)

            # dumping feature encoder
            cat_encoder_path = local_artifacts_dir.path('cat_encoder.pkl')
            with open(cat_encoder_path, 'wb') as m:
                pickle.dump(self._cat_x_encoder, m)

            # dumping label encoder
            label_encoder_path = local_artifacts_dir.path('label_encoder.pkl')
            with open(label_encoder_path, 'wb') as m:
                pickle.dump(self._y_encoder, m)

            # all of the model subcomponents will need to go here
            artifacts = {
                'model': model_path,
                'cat_encoder': cat_encoder_path,
                'label_encoder': label_encoder_path
            }

            mlflow.pyfunc.log_model(artifact_path='model',
                                    python_model=self,
                                    artifacts=artifacts)
Example #45
    else:
        x_test2[i, 6] = 1
unique, counts = np.unique(x_train[:, 6], return_counts=True)

#Categorical
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
label1 = LabelEncoder()
x_train[:, 1] = label1.fit_transform(x_train[:, 1])
label2 = LabelEncoder()
x_train[:, 7] = label2.fit_transform(x_train[:, 7])
label3 = LabelEncoder()
x_train[:, 0] = label3.fit_transform(x_train[:, 0])
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer(
    [
        ('one_hot_encoder', OneHotEncoder(), [7])
    ],  # The column numbers to be transformed (here is [0] but can be [0, 1, 3])
    remainder='passthrough'  # Leave the rest of the columns untouched
)
x_train = np.array(ct.fit_transform(x_train), dtype=np.float)
ct = ColumnTransformer(
    [
        ('one_hot_encoder', OneHotEncoder(), [3])
    ],  # The column numbers to be transformed (here is [0] but can be [0, 1, 3])
    remainder='passthrough'  # Leave the rest of the columns untouched
)
x_train = np.array(ct.fit_transform(x_train), dtype=np.float)
x_train = x_train[:, [1, 2, 4, 5, 6, 7, 8, 9, 10, 11]]

#For test2
label1 = LabelEncoder()
Example #46
def train_model(train, target, features = train.columns, model = LinearRegression()):
    pipe = make_pipeline(OneHotEncoder(handle_unknown = 'ignore'), model)
    mod = pipe.fit(train, target)
    return mod
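
A self-contained usage sketch of the same one-hot-plus-model pipeline (the toy DataFrame, its column names, and the target values are assumptions for illustration):

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

toy = pd.DataFrame({"neighborhood": ["A", "B", "A", "C"],
                    "rooms": ["2", "3", "2", "4"]})
prices = pd.Series([100.0, 150.0, 110.0, 200.0])

# Unknown categories at predict time are ignored rather than raising an error
pipe = make_pipeline(OneHotEncoder(handle_unknown="ignore"), LinearRegression())
pipe.fit(toy, prices)
print(pipe.predict(toy))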
Example #47
standing_filename = 'NBA/standings.csv'
standings = pd.read_csv(standing_filename, skiprows=[0])

dataset = pd.read_csv('NBA/March.csv', parse_dates=["Date"])
dataset.columns = [
    "Date", "Start", "Visitor Team", "VisitorPts", "Home Team", "HomePts",
    "Score Type", "OT", "Notes"
]

# Label-encode the teams, then convert to one-hot (binary) features
encoding.fit(dataset['Home Team'].values)
home_teams = encoding.transform(dataset["Home Team"].values)
visitor_teams = encoding.transform(dataset["Visitor Team"].values)
X_teams = np.vstack([home_teams, visitor_teams]).T

onehot = OneHotEncoder()
X_teams = onehot.fit_transform(X_teams).todense()

# step 1
won_last = defaultdict(int)
dataset["HomeLastWin"] = False
dataset["VisitorLastWin"] = False
dataset["HomeWin"] = dataset["VisitorPts"] < dataset["HomePts"]

for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    row["HomeLastWin"] = won_last[home_team]
    row["VisitorLastWin"] = won_last[visitor_team]
    dataset.iloc[index] = row
    won_last[home_team] = row["HomeWin"]
Example #48
train_y = train_data['Survived']
train_x = train_data.drop(
    ['PassengerId', 'Survived', 'Name', 'Cabin', 'Ticket'], axis=1)
test_x = pd.read_csv('./data/test.csv')
test_x = test_x.drop(['PassengerId', 'Name', 'Cabin', 'Ticket'], axis=1)
total_data = [train_x, test_x]

# Map Sex to integers for one-hot encoding; fill missing Embarked with "S"
for data in total_data:
    data['Sex'] = data.Sex.map({'male': 0, 'female': 1})
    data['Embarked'] = data['Embarked'].fillna("S")
# Fill nulls in the Age field with the mean
for data in total_data:
    data['Age'] = data.Age.fillna(data['Age'].mean())

enc = OneHotEncoder(sparse=False)
sex_onehot = enc.fit_transform(pd.DataFrame(train_x['Sex']))
train_x["sex_0"] = sex_onehot[:, 0]
train_x["sex_1"] = sex_onehot[:, 1]
train_x = train_x.drop(["Sex"], axis=1)

sex_onehot_test = enc.transform(pd.DataFrame(test_x['Sex']))
test_x["sex_0"] = sex_onehot_test[:, 0]
test_x["sex_1"] = sex_onehot_test[:, 1]
test_x = test_x.drop(["Sex"], axis=1)

Embarked_onehot = OneHotEncoder(sparse=False)
Embarked_onehot_data = Embarked_onehot.fit_transform(
    pd.DataFrame(train_x['Embarked']))
train_x["Embarked_0"] = Embarked_onehot_data[:, 0]
train_x["Embarked_1"] = Embarked_onehot_data[:, 1]
def make_model(file_name="TD20200309210544.json",
               column_review="reviewText",
               column_rating="overall",
               json_balanced=True,
               have_corpus=True,
               size=10000):

    # Making a json file with balanced ratings
    if json_balanced == False:
        make_balance_json(r'static/DBAlpha/TrainingDB/Files/' + file_name,
                          column_review, column_rating,
                          "main/files/uniform_json.json", size / 5)
    dataset = read_json('main/files/uniform_json.json', lines=True)
    dataset = dataset[:size]

    # Making corpus, in case corpus doesn't exists
    if have_corpus == False:
        corpus = basic.preprocess_lemm_dataset(dataset, 'review')
        process_corpus.write_corpus(corpus)

    # If corpus exists, read it directly
    else:
        corpus = []
        corpus = process_corpus.read_corpus()
        corpus = corpus[:size]

    # Getting the ratings
    y = dataset.iloc[:size, 0]

    # Maximum words to consider
    TRAINING_VOCAB = 5000

    # Tokenizing the words upto the maximum vocabulary
    tokenizer = Tokenizer(num_words=TRAINING_VOCAB,
                          lower=True,
                          char_level=False)
    # Fitting the corpus to tokenizer
    tokenizer.fit_on_texts(corpus)
    training_sequences = tokenizer.texts_to_sequences(corpus)
    # Getting the encoding dictionary
    vocab_to_int = tokenizer.word_index

    sequence_length = 150

    # Padding to maximum sequence length
    features = pad_sequences(training_sequences, maxlen=sequence_length)
    """
    EMBEDDING_DIM = 300
    # Loading google's words to vect embedding
    print("\nLoading the Google's word2vec \nPlease Wait...")
    word2vec_path = 'resources/GoogleNews-vectors-negative300.bin'
    word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
    
    train_embedding_weights = np.zeros((len(vocab_to_int), EMBEDDING_DIM))
    for word,index in vocab_to_int.items():
        if word in word2vec:
            train_embedding_weights[index,:] = word2vec[word]  
        else:
            np.random.rand(EMBEDDING_DIM)
    print(train_embedding_weights.shape)
    """

    # Variables for RNN LSTM
    vocab_size = len(vocab_to_int)
    embedding_dim = 512

    # Training parameters
    batch_size = int(size // 100)
    num_epochs = 30

    # Encoding y data into different categorical columns
    labelencoder_y = LabelEncoder()
    y = labelencoder_y.fit_transform(y)
    y = y.reshape(len(y), 1)
    onehotencoder = OneHotEncoder()
    y = onehotencoder.fit_transform(y).toarray()

    # Splitting the dataset into the Training set and Test set
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        y,
                                                        test_size=0.20,
                                                        random_state=0)

    # Initialising the RNN
    model = Sequential()

    # Adding Layers to RNN
    #model.add(Embedding(vocab_size, embedding_dim, weights = [train_embedding_weights],input_length=sequence_length))
    if size > 2000:
        model.add(
            Embedding(TRAINING_VOCAB,
                      embedding_dim,
                      input_length=sequence_length))
    else:
        model.add(
            Embedding(TRAINING_VOCAB, size // 10, input_length=sequence_length))

    model.add(LSTM(100, return_sequences=True))
    model.add(LSTM(100))
    model.add(Dense(units=200, kernel_initializer='uniform',
                    activation='relu'))

    model.add(Dense(5, activation='sigmoid'))
    #rmsprop=optimizers.rmsprop(lr=0.01)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # Fitting the ANN to the Training set
    model.fit(X_train, y_train, batch_size=batch_size, epochs=num_epochs)

    # Predicting the Test set results over trained model
    y_pred = model.predict(X_test)

    # Convert the probabilistic predictions into one-hot format
    for i in range(len(y_pred)):
        ind_ = 0
        max_ = y_pred[i][0]
        for j in range(5):
            if y_pred[i][j] > max_:
                max_ = y_pred[i][j]
                ind_ = j
            y_pred[i][j] = 0
        y_pred[i][ind_] = 1

    # Inverse Transforming the categorical encodings on y_pred and y_test
    y_pred = onehotencoder.inverse_transform(y_pred)
    y_test = onehotencoder.inverse_transform(y_test)

    # Measuring the performance
    accuracy = accuracy_score(y_test,
                              y_pred,
                              normalize=True,
                              sample_weight=None)

    #
    file_name = re.sub(".json", "", file_name)
    with open(r'static/DBAlpha/TrainingDB/Models/TOKEN_' + file_name + ".pkl",
              'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    model.save(r'static/DBAlpha/TrainingDB/Models/' + file_name + '.h5')

    # Returning the measured test accuracy
    return accuracy
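# A hedged usage sketch (not part of the original script): reloading the tokenizer and model
# saved above for later predictions. `file_name` and `new_texts` are illustrative stand-ins,
# and the paths simply mirror the save step.
from keras.models import load_model

with open(r'static/DBAlpha/TrainingDB/Models/TOKEN_' + file_name + '.pkl', 'rb') as handle:
    loaded_tokenizer = pickle.load(handle)
loaded_model = load_model(r'static/DBAlpha/TrainingDB/Models/' + file_name + '.h5')

# New texts must be tokenized and padded exactly as during training (sequence length 150)
new_sequences = pad_sequences(loaded_tokenizer.texts_to_sequences(new_texts), maxlen=150)
predictions = loaded_model.predict(new_sequences)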
Example #50
0
#handling missing values
#for Age
ds.Age = ds.Age.fillna(ds.Age.median())
dssub.Age = dssub.Age.fillna(dssub.Age.median())
#for Embarked
ds.Embarked = ds.Embarked.fillna('S')
dssub.Embarked = dssub.Embarked.fillna('S')

X_all = np.concatenate((X, X_sub), axis=0)
y = dataset.loc[:, 'Survived'].values
#Handling categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X_all[:, 1] = labelencoder_X.fit_transform(X_all[:, 1])
X_all[:, 5] = labelencoder_X.fit_transform(X_all[:, 5])
onehotencoder = OneHotEncoder(categorical_features=[0, 5])
X_all = onehotencoder.fit_transform(X_all).toarray()
X = X_all[:891, :]
X_sub = X_all[891:, :]
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X,
                                                  y,
                                                  test_size=0.33,
                                                  random_state=1)

from keras.models import Sequential
from keras.layers import Dense
model = Sequential()

model.add(
    Dense(26,
import pandas as pd
from tensorflow import keras
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

ct = OneHotEncoder(sparse=False)


# function to load data from csv
def load_data(filename):
    return pd.read_csv(filename)


def processFlightData(data):
    return data.filter(['Month', 'Day', 'Origin_Airport', 'WeatherDelay'])  #


def main():
    # loading the smaller 2017 flight data
    flight2017 = load_data('flight-delays/fl_samp.csv')
    flight2017Processed = processFlightData(flight2017)

    # loading the other flight data
    flightData = load_data('flight-delays/flight.csv')
    flightDateProcessed = processFlightData(flightData)

    # Combine the two datasets
    frames = [flight2017Processed, flightDateProcessed]
    combinedFlightData = pd.concat(frames)
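    # A hedged sketch (the snippet is cut off here): one-hot encoding the categorical
    # Origin_Airport column with the module-level encoder `ct`; `origin_encoded` is an
    # illustrative name, not from the original.
    origin_encoded = ct.fit_transform(combinedFlightData[['Origin_Airport']])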
Example #52
0
    'num_common_interest5', 'num_common_topic1'
]].values)
train_x = scaler.transform(train[[
    'num_advertise_touser', 'num_common_interest1', 'num_common_interest2',
    'num_common_interest5', 'num_common_topic1'
]].values)

test_x = scaler.transform(test[[
    'num_advertise_touser', 'num_common_interest1', 'num_common_interest2',
    'num_common_interest5', 'num_common_topic1'
]].values)
train_x = np.hstack((train_x, ct_trains))
test_x = np.hstack((test_x, ct_tests))

# One-hot encode each categorical feature and stack the results onto the feature matrices
oc_encoder = OneHotEncoder()
for feature in one_hot_feature:
    oc_encoder.fit(data[feature].values.reshape(-1, 1))
    train_a = oc_encoder.transform(train[feature].values.reshape(-1, 1))
    test_a = oc_encoder.transform(test[feature].values.reshape(-1, 1))
    train_x = sparse.hstack((train_x, train_a))
    test_x = sparse.hstack((test_x, test_a))
print('one-hot prepared !')

# Vectorize the count-style features with CountVectorizer

ct_encoder = CountVectorizer(min_df=0.0009)
for feature in vector_feature:
    ct_encoder.fit(data[feature])
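    # Hedged continuation (the snippet is cut off here): transforming and stacking the
    # count features, mirroring the one-hot loop above; train_b/test_b are illustrative names.
    train_b = ct_encoder.transform(train[feature])
    test_b = ct_encoder.transform(test[feature])
    train_x = sparse.hstack((train_x, train_b))
    test_x = sparse.hstack((test_x, test_b))
print('count features prepared !')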
day and month are cyclical in nature, so we can do the following:
"""

train['dy_sin'] = np.sin((train['day']-1)*(2.*np.pi/7))
train['dy_cos'] = np.cos((train['day']-1)*(2.*np.pi/7))
train['mnth_sin'] = np.sin((train['month']-1)*(2.*np.pi/12))
train['mnth_cos'] = np.cos((train['month']-1)*(2.*np.pi/12))

train = train.drop(columns="day")
train = train.drop(columns="month")
train = train.drop(columns="id")
train.head()

"""# Nominal Features (Low Cardinality)"""

column_trans = make_column_transformer(
    (OneHotEncoder(sparse=False), ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4']),
    remainder='passthrough')
train_after_low_car_nom = column_trans.fit_transform(train)

pd.DataFrame(train_after_low_car_nom).head()

"""# Nominal Features (High Cardinality)

## Dummy Encoding
"""

train_after_low_car_nom = pd.DataFrame(train_after_low_car_nom)
train_after_high_car_nom = pd.get_dummies(train_after_low_car_nom,
                                          columns=train_after_low_car_nom.columns,
                                          drop_first=True,
                                          sparse=True)

"""## Hash Encoding (not used anymore)"""

#hashing_encoder = ce.HashingEncoder(cols=[30, 31, 32, 33, 34])
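# A hedged sketch of what the hash encoding would look like, assuming the category_encoders
# package (imported here as ce); the column indices follow the commented-out line above and
# n_components is an illustrative choice. This is not wired into the rest of the pipeline.
import category_encoders as ce

hashing_encoder = ce.HashingEncoder(cols=[30, 31, 32, 33, 34], n_components=8)
train_after_hash_nom = hashing_encoder.fit_transform(train_after_low_car_nom)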
Example #54
0
import numpy as np
from seqlearn.evaluation import bio_f_score
from seqlearn.hmm import MultinomialHMM
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import cross_validate

from data import *
from epam_nlp import CustomHMM, get_bio_f1

DATA_PATH = Path('../data')
RAW_DATA_PATH = DATA_PATH / 'processed.tsv'
df = load_data(RAW_DATA_PATH, nrows=1000)
X, y, lengths = get_X_y_lengths(df, cols_to_keep={'token'})
le = LabelEncoder()
ohe = OneHotEncoder(handle_unknown='ignore')
clf = CustomHMM(y=y)
pipeline = Pipeline([('one_hot', ohe), ('hmm', clf)])
cv = get_cv(lengths=lengths)

res = cross_validate(pipeline,
                     X.reshape(-1, 1),
                     y,
                     cv=cv,
                     n_jobs=1,
                     scoring=get_bio_f1)
print(res)

# cv = get_cv(X, y, lengths)
# i = 1
# scores = []
X5 = X5.reshape(-1, 1)
missingvalues = missingvalues.fit(X5)
X5 = missingvalues.transform(X5)

X6 = X2[:, 0]
X6 = X6.reshape(-1, 1)

X7 = X2[:, 2:4]

X_train = np.concatenate((X1, X6, X5, X7, X3, X4), axis=1)

X_class = X_train[:, 0]
X_class = X_class.reshape(-1, 1)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
ohe = OneHotEncoder()
X_class = ohe.fit_transform(X_class).toarray()
X_class = X_class[:, 1:3]

X_embark = X_train[:, 6]
X_embark = X_embark.reshape(-1, 1)

missingvalues1 = SimpleImputer(missing_values=np.nan,
                               strategy='most_frequent',
                               verbose=0)
missingvalues1 = missingvalues1.fit(X_embark)
X_embark = missingvalues1.transform(X_embark)

le = LabelEncoder()
ohe1 = OneHotEncoder()
X_embark = le.fit_transform(X_embark.ravel())  # LabelEncoder expects a 1-d array
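# Hedged continuation (the snippet is cut off here): one-hot encoding the label-encoded
# embarkation column would typically follow, mirroring the treatment of X_class above.
X_embark = ohe1.fit_transform(X_embark.reshape(-1, 1)).toarray()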
Example #56
0
def onehot(x): return np.array(OneHotEncoder().fit_transform(x.values.reshape(-1,1)).todense())

def format(data):
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(dataset, labels, test_size=0.25, random_state=random_state, shuffle=True)
print("Train dataset shape:", X_train.shape)
print("Train label shape:", y_train.shape)
print("Test dataset shape:", X_valid.shape)
print("Test label shape:", y_valid.shape)

print("Dataset example:")
print(X_train[0, 2].reshape(height, width))
print(X_valid[0, 2].reshape(height, width))

# One-hot encoding the labels

from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse=False)

y_train = y_train.reshape(-1, 1)
y_train = ohe.fit_transform(y_train)
y_valid = y_valid.reshape(-1, 1)
y_valid = ohe.transform(y_valid)

with open(results_path + "/ohe", "wb") as file:
    pickle.dump(ohe, file)

# Dataset Normalization

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
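# Hedged continuation (the snippet is cut off here): a typical pattern is to fit the scaler
# on the flattened training data only and reuse its statistics on the validation split;
# the reshape assumes each sample can be flattened to one row.
X_train = scaler.fit_transform(X_train.reshape(len(X_train), -1))
X_valid = scaler.transform(X_valid.reshape(len(X_valid), -1))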
def main():
    # Load data and run brief analysis on it
    raw_data = load_data('train.csv')
    quick_analysis(raw_data)

    plt.hist(raw_data['SalePrice'])
    plt.show()

    # View all unique values of categorical features
    non_numeric_cols = raw_data.loc[:, raw_data.dtypes == object]

    for col in non_numeric_cols.columns:
        print(non_numeric_cols[col].value_counts())

    # Analyze correlations between the features and the label
    corr_matrix = raw_data.corr()
    sale_correl = corr_matrix['SalePrice'].sort_values(ascending=False)
    print(sale_correl)

    # Feature engineering the following:
    #   Grade = OverallQual / OverallCond
    #   Age = YrSold - YearBuilt
    #   RemodAge = YrSold - YearRemodAdd
    #   TotalSF = TotalBsmtSF + 1stFlrSF + 2ndFlrSF

    raw_data['Grade'] = raw_data['OverallQual'] / raw_data['OverallCond']
    raw_data['Age'] = raw_data['YrSold'] - raw_data['YearBuilt']
    raw_data['RemodAge'] = raw_data['YrSold'] - raw_data['YearRemodAdd']
    raw_data['TotalSF'] = raw_data['TotalBsmtSF'] + raw_data[
        '1stFlrSF'] + raw_data['2ndFlrSF']

    # Correlation matrix for the new features
    corr_matrix = raw_data.corr()
    sale_correl = corr_matrix['SalePrice'].sort_values(ascending=False)
    print(sale_correl)

    # Check correlation of new features with their respective components
    age_correl = corr_matrix['Age'].sort_values(ascending=False)
    print('Age correlations:', age_correl, '\n')

    remod_age_correl = corr_matrix['RemodAge'].sort_values(ascending=False)
    print('RemodAge correlations:', remod_age_correl, '\n')

    grade_correl = corr_matrix['Grade'].sort_values(ascending=False)
    print('Grade correlations:', grade_correl, '\n')

    totalsf_correl = corr_matrix['TotalSF'].sort_values(ascending=False)
    print('TotalSF correlations:', totalsf_correl, '\n')

    # Correlation matrix visualization
    corr_plot(raw_data, 'SalePrice', fig_size=(4, 4))
    corr_plot(raw_data, 'SalePrice', plot_type='hist', fig_size=(4, 4))

    # Change type of columns to reflect their nature. Concretely, change the YrSold, MoSold, MSZoning and OverallCond features to categorical ones
    raw_data['YrSold_C'] = raw_data['YrSold'].copy().astype(str)
    raw_data['MoSold'] = raw_data['MoSold'].astype(str)
    raw_data['MSZoning'] = raw_data['MSZoning'].astype(str)
    raw_data['OverallCond_C'] = raw_data['OverallCond'].copy().astype(str)

    num_cols = [
        'OverallQual',
        'OverallCond',
        'YearBuilt',
        'YearRemodAdd',
        'TotalBsmtSF',
        '1stFlrSF',
        '2ndFlrSF',
        'GarageCars',
        'GarageArea',
        'FullBath',
        'YrSold',
    ]
    cat_cols = [
        'MSZoning',
        'Street',
        'Utilities',
        'Neighborhood',
        'ExterQual',
        'ExterCond',
        'BsmtQual',
        'BsmtCond',
        'Heating',
        'CentralAir',
        'PavedDrive',
        'SaleType',
        'SaleCondition',
        'YrSold_C',
        'MoSold',
        'OverallCond_C',
    ]

    # Create a list of all values that the categorical features can take
    cat_cols_categs = [raw_data[col].unique() for col in cat_cols]
    print(cat_cols_categs)

    # Create the pipeline to process data
    num_pipeline = Pipeline([
        ('feat_sel', FeatureSelector(num_cols, True)),
        ('Grade',
         FeatureCreator(['OverallCond', 'OverallQual'],
                        lambda x, y: x / y,
                        as_dataframe=True,
                        feat_name='Grade')),
        ('Age',
         FeatureCreator(['YrSold', 'YearBuilt'],
                        lambda x, y: x - y,
                        as_dataframe=True,
                        feat_name='Age')),
        ('RemodAge',
         FeatureCreator(['YrSold', 'YearRemodAdd'],
                        lambda x, y: x - y,
                        as_dataframe=True,
                        feat_name='RemodAge')),
        ('TotalSF',
         FeatureCreator(['TotalBsmtSF', '1stFlrSF', '2ndFlrSF'],
                        lambda x, y: x + y,
                        as_dataframe=True,
                        feat_name='TotalSF')),
        ('drop_cat_feat',
         FeatureDropper(['YrSold', 'OverallCond'], as_dataframe=True)),
        ('imputer_mean', Imputer(strategy='mean')),
        ('std_scaler', RobustScaler())
    ])

    cat_pipeline = Pipeline([
        ('feat_sel', FeatureSelector(cat_cols, True)),
        ('imputer_most_frequent', CategoricalImputer()),
        ('encode', OneHotEncoder(categories=cat_cols_categs, sparse=False)),
    ])
    feat_union = FeatureUnion(transformer_list=[
        ('num_features', num_pipeline),
        ('cat_features', cat_pipeline),
    ])

    # Create the train data and labels
    train_labels = raw_data['SalePrice'].copy()
    train_feat = feat_union.fit_transform(raw_data)

    # Check the linear regression model
    lin_reg = LinearRegression()
    print('Linear regression best hyperparameters:')
    final_lr_model = find_best_estimator(lin_reg, [{}], train_feat,
                                         train_labels)

    # Check the decision tree model
    hyperparams_vals = [
        {
            'max_features': [6, 10, 12, 16, 18, 20, 24]
        },
    ]

    dt_reg = DecisionTreeRegressor(random_state=42)
    print('Decision tree best hyperparameters:')
    final_dt_model = find_best_estimator(dt_reg, hyperparams_vals, train_feat,
                                         train_labels)

    # Check the random forest model
    hyperparams_vals = [
        {
            'n_estimators': [200, 225, 250],
            'max_features': [16, 24, 30]
        },
        {
            'bootstrap': [False],
            'n_estimators': [220, 225],
            'max_features': [24, 28]
        },
    ]

    forest_reg = RandomForestRegressor(n_jobs=-1, random_state=42)
    print('Random forest best hyperparameters:')
    final_rf_model = find_best_estimator(forest_reg, hyperparams_vals,
                                         train_feat, train_labels)

    # Check the XGBoost model
    hyperparams_vals = [
        {
            'n_estimators': [450, 500, 400],
            'max_features': [2, 4, 8],
            'max_depth': [3, 4, None]
        },
    ]

    xgbr_reg = XGBRegressor(learning_rate=0.05, n_jobs=-1, random_state=42)
    print('XGBoost regressor best hyperparameters:')
    final_xgb_model = find_best_estimator(xgbr_reg, hyperparams_vals,
                                          train_feat, train_labels)

    # Check the SVM model
    hyperparams_vals = [
        {
            'kernel': ['linear', 'sigmoid', 'rbf'],
            'gamma': ['auto', 'scale']
        },
        {
            'kernel': ['poly'],
            'gamma': ['auto', 'scale'],
            'degree': [3, 4, 5]
        },
    ]

    svm_reg = SVR()
    print('Support vector machine best hyperparameters:')
    final_svm_model = find_best_estimator(svm_reg, hyperparams_vals,
                                          train_feat, train_labels)

    # Check the ElasticNet model
    hyperparams_vals = [
        {
            'alpha': [0.0005, 0.005, 0.05, 0.2],
            'l1_ratio': [0.1, 0.25, 0.75, 0.9]
        },
    ]

    enet_reg = ElasticNet(max_iter=100000000, tol=0.001)
    print('ElasticNet best hyperparameters:')
    final_enet_model = find_best_estimator(enet_reg, hyperparams_vals,
                                           train_feat, train_labels)

    # Check the feature importances for both random forest algorithms
    rf_feat_imp = final_rf_model.feature_importances_
    xgb_feat_imp = final_xgb_model.feature_importances_

    other_feat = ['Grade', 'RemodAge', 'TotalSF']
    all_features = num_cols.copy()
    print(num_cols)
    for cat_values in cat_cols_categs.copy():
        all_features.extend(cat_values)
    all_features.extend(other_feat.copy())

    print('Random forest feature importances:')
    for feat in sorted(zip(rf_feat_imp, all_features), reverse=True):
        print(feat)

    print('\nXGBoost feature importances:')
    for feat in zip(xgb_feat_imp, all_features):
        print(feat)

    # Load and process test data
    test_data = load_data('test.csv')
    test_data['YrSold_C'] = test_data['YrSold'].copy().astype(str).replace(
        'nan', None)
    test_data['MoSold'] = test_data['MoSold'].astype(str).replace('nan', None)
    test_data['MSZoning'] = test_data['MSZoning'].astype(str).replace(
        'nan', None)
    test_data['OverallCond_C'] = test_data['OverallCond'].copy().astype(
        str).replace('nan', None)
    test_feat = feat_union.transform(test_data)

    # Predict using the combination of Random Forest and XGBoost
    rf_predictions = final_rf_model.predict(test_feat)
    xgb_predictions = final_xgb_model.predict(test_feat)
    predictions = rf_predictions * 0.35 + xgb_predictions * 0.65

    # Save resulting predictions
    pred_df = pd.DataFrame()
    pred_df['Id'] = test_data['Id']
    pred_df['SalePrice'] = predictions.flatten()

    print(pred_df)
    pred_df.to_csv('submission_rf_xgb.csv', index=False)

    # Predict using only the XGBoost model
    xgb_predictions = final_xgb_model.predict(test_feat)
    predictions = xgb_predictions.copy()

    pred_df = pd.DataFrame()
    pred_df['Id'] = test_data['Id']
    pred_df['SalePrice'] = predictions.flatten()

    print(pred_df)
    pred_df.to_csv('submission_xgb.csv', index=False)
def data_processing(positive_data_file, negative_data_file):
    # Process the input movie reviews; each line is a sequence of words
    # x = [N,max_len,300]
    neg_dir = negative_data_file
    pos_dir = positive_data_file
    with open(neg_dir, "r", encoding='Windows-1252') as f:
        data = f.read().split('\n')  # one review per line
        neg_words = [0] * len(data)
        for d in range(len(data)):
            neg_words[d] = data[d].split(' ')  # split the review into words on spaces
    print(neg_words[1])
    max_len = 0
    for d in neg_words:
        if len(d) > max_len:
            max_len = len(d)
    print(max_len)
    with open(pos_dir, "r", encoding='Windows-1252') as f:
        data = f.read().split('\n')
        pos_words = [0] * len(data)
        for d in range(len(data)):
            pos_words[d] = data[d].split(' ')
    print(pos_words[1])
    for d in pos_words:
        if len(d) > max_len:
            max_len = len(d)
    print(max_len)

    # word_to_vector
    vectors_dir = r'rt-polaritydata/test2.w2v'
    with open(vectors_dir, "r", encoding='Windows-1252') as f:
        data = f.read()
        # data = str(data)
        data = data.split('\n')
        i = 0
        word_to_vec = {}
        vec = []
        print(len(data))
        for d in range(len(data)):
            if i:
                dd = data[d].split(' ')
                word = dd[0]
                vecs = dd[1:]
                vec = []
                for v in vecs:
                    if v and word != '':
                        vec.append(float(v))
                word_to_vec[word] = vec
            i += 1

    # text word to vector
    worddim = 300
    null_fill = [0.0] * worddim
    x_neg = np.zeros((len(neg_words), max_len, worddim))
    print(x_neg.shape)
    for line in range(len(neg_words)):
        for word_ind in range(max_len):
            if word_ind >= len(neg_words[line]):
                x_neg[line][word_ind] = np.array(null_fill)
            else:
                if neg_words[line][word_ind] in word_to_vec and word_to_vec[
                        neg_words[line][word_ind]]:
                    x_neg[line][word_ind] = np.array(
                        word_to_vec[neg_words[line][word_ind]])
                else:
                    x_neg[line][word_ind] = np.array(null_fill)

    null_fill = [0.0] * worddim
    x_pos = np.zeros((len(pos_words), max_len, worddim))
    print(x_pos.shape)
    for line in range(len(pos_words)):
        for word_ind in range(max_len):
            if word_ind >= len(pos_words[line]):
                x_pos[line][word_ind] = np.array(null_fill)
            else:
                if pos_words[line][word_ind] in word_to_vec and word_to_vec[
                        pos_words[line][word_ind]]:
                    x_pos[line][word_ind] = np.array(
                        word_to_vec[pos_words[line][word_ind]])
                else:
                    x_pos[line][word_ind] = np.array(null_fill)

    # x of shape(14012, 447, 300) and y of shape(14012,)
    from sklearn.preprocessing import OneHotEncoder
    y_neg = np.zeros((len(neg_words)))
    y_pos = np.ones((len(pos_words)))
    # y_ = np.concatenate((y_neg,y_pos),axis=0)
    # x_ = np.concatenate((x_neg,x_pos),axis=0)
    train_data = np.concatenate(
        (x_neg[len(neg_words) // 10:], x_pos[len(neg_words) // 10:]),
        axis=0)  # concatenate the positive and negative data
    train_label = np.concatenate(
        (y_neg[len(neg_words) // 10:], y_pos[len(neg_words) // 10:]), axis=0)

    ohe = OneHotEncoder()
    ohe.fit([[0], [1]])
    train_label = np.array(
        ohe.transform(np.transpose([
            train_label,
        ])).toarray())  # transform into one-hot labels

    np.random.seed(231)
    np.random.shuffle(train_data)
    np.random.seed(231)
    np.random.shuffle(train_label)

    test_data = np.concatenate(
        (x_neg[:len(neg_words) // 10], x_pos[:len(pos_words) // 10]), axis=0)
    test_label = np.concatenate(
        (y_neg[:len(neg_words) // 10], y_pos[:len(pos_words) // 10]), axis=0)
    test_label = np.array(
        ohe.transform(np.transpose([
            test_label,
        ])).toarray())  # transform into one-hot labels
    np.random.seed(131)
    np.random.shuffle(test_data)
    np.random.seed(131)
    np.random.shuffle(test_label)
    print(test_data.shape)
    print(test_label.shape)
    print(train_data.shape)

    return [train_data, train_label, test_data, test_label]
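# Hedged usage sketch: the positive/negative file names are assumptions based on the
# rt-polaritydata paths referenced inside the function.
train_data, train_label, test_data, test_label = data_processing(
    'rt-polaritydata/rt-polarity.pos', 'rt-polaritydata/rt-polarity.neg')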
Example #60
0
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values  # the last column (Profit)

# Data encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
le = LabelEncoder()
X[:, -1] = le.fit_transform(X[:, -1])  # labelling the State
enc = OneHotEncoder(categorical_features=[3])
X = enc.fit_transform(X).toarray()

# Avoiding the dummy variable trap
X = X[:, 1:]  # ignoring the first dummy column :)

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# Fitting Multiple Linear Regression to the Training Set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
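# Hedged continuation (the snippet is cut off here): fitting on the training split and
# predicting on the test split would typically follow.
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)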