import time

from sklearn.feature_extraction import FeatureHasher


def hash(data, labels, new_dimension):
    print "start hashing trick..."
    # convert features to dicts
    dictList = list()
    if hasattr(data, "indices"):
        # sparse input: densify first, then keep only the non-zero entries
        data = data.toarray()
        indices = range(len(data[0]))
        for item in data:
            row = dict()
            for index, value in zip(indices, item):
                if value != 0:
                    row[str(index)] = value
            dictList.append(row)
    else:
        indices = map(str, range(len(data[0])))
        for row in data:
            dictList.append(dict(zip(indices, row)))
    start = time.time()
    hasher = FeatureHasher(n_features=new_dimension)  # input_type='dict' is the default
    reduced = hasher.fit_transform(dictList).toarray()
    end = time.time()
    return (reduced, end - start)
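# A minimal usage sketch for hash() above; the 4x3 toy matrix is made up for
# illustration. The snippet is Python 2 (print statements, list-returning
# map()); under Python 3 the map() in the dense branch would need list().
import numpy as np

toy = np.array([[1, 0, 2], [0, 3, 0], [4, 5, 6], [0, 0, 1]])
reduced, elapsed = hash(toy, labels=None, new_dimension=2)
# reduced is a dense (4, 2) array; elapsed is the hashing time in seconds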
import numpy as np
from sklearn.feature_extraction import FeatureHasher


def hash_array(feature_dict, feature_num):
    if feature_num == 1:
        # a single feature needs no hashing; just reshape to a column vector
        x_new = np.asarray(feature_dict)
        x_h = x_new.reshape(len(feature_dict), 1)
    else:
        # note: non_negative was removed in recent scikit-learn releases
        hasher = FeatureHasher(n_features=feature_num, non_negative=True,
                               input_type='dict')
        X_new = hasher.fit_transform(feature_dict)
        x_h = X_new.toarray()
    # a DictVectorizer would work here too, at the cost of a learned vocabulary:
    # vec = DictVectorizer()
    # x_h = vec.fit_transform(feature_dict).toarray()
    return x_h
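# Hypothetical call (assumes an older scikit-learn where non_negative is still
# accepted): hash each row-dict into an 8-dimensional dense array.
rows = [{"color": 1.0, "size": 2.0}, {"color": 3.0}]
x = hash_array(rows, feature_num=8)  # -> shape (2, 8)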
def train_regression_model(dataset, labelset, epochs=1000, resample=False, remove=False):
    label_distribution = Counter(labelset)
    classif = Perceptron()
    # classif = LogisticRegression(solver='liblinear', penalty='l1')  # , class_weight='balanced'
    # print('## train sample distribution', label_distribution)
    FH = FeatureHasher()
    dataset = FH.fit_transform(dataset)
    # dataset = transform_vectors(dataset)
    # samp, nx, ny = dataset.shape
    # dataset = dataset.reshape((samp, nx*ny))
    if resample:
        resample_dataset(dataset, labelset)
    return classif.fit(dataset, labelset), FH
def run_hash_trick(df, columns, table):
    replace_col_names = [
        cols for cols in df.columns
        if len(re.findall('_{}$'.format(columns), cols)) > 0
    ]
    df[replace_col_names] = df[replace_col_names].astype(str)
    hasher = FeatureHasher(
        n_features=HASH_TRICK_FEATURES['{}_hash_features'.format(table)],
        input_type="string")
    hashed_features = hasher.fit_transform(
        df[replace_col_names].values).todense()
    hashed_features = pd.DataFrame(hashed_features)
    hashed_features.columns = [
        '{}_seq_feat{}'.format(table, i)
        for i in range(HASH_TRICK_FEATURES['{}_hash_features'.format(table)])
    ]
    df = df.drop(replace_col_names, axis=1)
    return (df, hashed_features)
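# Sketch of how run_hash_trick might be invoked; HASH_TRICK_FEATURES and the
# '_<columns>' suffix convention are assumptions inferred from the code above.
HASH_TRICK_FEATURES = {'orders_hash_features': 4}
df = pd.DataFrame({'item_orders': ['a', 'b'], 'shop_orders': ['x', 'y']})
df_rest, hashed = run_hash_trick(df, columns='orders', table='orders')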
def hash_features(basic_features):
    """Hash each row of ``basic_features`` into a 20-dimensional vector.

    Args:
        basic_features: pandas object whose rows are stringified and hashed.

    Returns:
        Sparse matrix of hashed features, one row per input row.
    """
    h = FeatureHasher(n_features=20,
                      input_type='string',
                      dtype=int,
                      alternate_sign=False)
    features_ = [
        str(basic_features.values.tolist()[i])
        for i in range(len(basic_features.values.tolist()))
    ]
    features = h.fit_transform(features_)
    return features
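# Usage sketch (the toy frame is illustrative). Note that each sample passed to
# a FeatureHasher with input_type='string' is iterated element-wise, so the
# stringified rows above are effectively hashed character by character.
basic = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})
hashed = hash_features(basic)  # sparse, shape (2, 20)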
class RegCBLearner(Learner):
    """A learner using the RegCB algorithm by Foster et al.
    and the online bin search implementation by Bietti et al.

    References:
        Foster, Dylan, Alekh Agarwal, Miroslav Dudík, Haipeng Luo, and Robert
        Schapire. "Practical contextual bandits with regression oracles." In
        International Conference on Machine Learning, pp. 1539-1548. PMLR, 2018.

        Bietti, Alberto, Alekh Agarwal, and John Langford. "A contextual bandit
        bake-off." arXiv preprint arXiv:1802.04064 (2018).
    """

    @property
    def family(self) -> str:
        """The family of the learner.

        See the base class for more information.
        """
        return "RegCB"

    @property
    def params(self) -> Dict[str, Any]:
        """The parameters of the learner.

        See the base class for more information.
        """
        return {'beta': self._beta, 'alpha': self._alpha, 'interactions': self._interactions}

    def __init__(self, *, beta: float, alpha: float, learning_rate: float = 0.1,
                 interactions: Sequence[str] = ['a', 'ax']) -> None:
        """Instantiate a RegCBLearner.

        Args:
            beta: square-loss tolerance
            alpha: confidence bounds precision
            interactions: the set of interactions the learner will use. x refers
                to context and a refers to actions, e.g. xaa would mean
                interactions between context, actions and actions.
        """
        PackageChecker.sklearn("RegCBLearner")
        from sklearn.feature_extraction import FeatureHasher
        from sklearn.preprocessing import PolynomialFeatures

        self._beta = beta
        self._alpha = alpha
        self._iter = 0
        self._core_model = []
        self._times = [0, 0, 0, 0]
        self._interactions = interactions
        self._terms = []
        self._learning_rate = learning_rate

        for term in self._interactions:
            term = term.lower()
            x_num = term.count('x')
            a_num = term.count('a')
            if x_num + a_num != len(term):
                raise Exception("Letters other than x and a were passed for parameter interactions. Please remove other letters/characters.")
            self._terms.append((x_num, a_num))

        max_x_term = max(max(term[0] for term in self._terms), 1)
        max_a_term = max(max(term[1] for term in self._terms), 1)

        self._x_p = PolynomialFeatures(degree=max_x_term, include_bias=False, interaction_only=False)
        self._a_p = PolynomialFeatures(degree=max_a_term, include_bias=False, interaction_only=False)
        self._h = FeatureHasher(input_type='pair')

    def predict(self, key: Key, context: Context, actions: Sequence[Action]) -> Sequence[float]:
        """Determine a PMF with which to select the given actions.

        Args:
            key: The key identifying the interaction we are choosing for.
            context: The context we're currently in. See the base class for more information.
            actions: The actions to choose from. See the base class for more information.

        Returns:
            The probability of taking each action. See the base class for more information.
        """
        import numpy as np
        from scipy import sparse

        if self._iter == 0:
            if isinstance(context, dict) or isinstance(actions[0], dict):
                self._core_model = sparse.csr_matrix(self._featurize(context, actions[0]).shape)
            else:
                self._core_model = np.zeros(self._featurize(context, actions[0]).shape)

        if self._iter == 200:
            self._times = [0, 0, 0, 0]

        if self._iter < 200:
            # burn-in period: choose uniformly at random
            return [1 / len(actions)] * len(actions)
        else:
            maxScore = -float('inf')
            maxAction = None

            for action in actions:
                features = self._featurize(context, action)
                score = self._bin_search(features, len(actions))

                if score > maxScore:
                    maxAction = action
                    maxScore = score

            return [int(action == maxAction) for action in actions]

    def learn(self, key: Key, context: Context, action: Action, reward: float, probability: float) -> None:
        """Learn from the given interaction.

        Args:
            key: The key identifying the interaction this observed reward came from.
            context: The context we're learning about. See the base class for more information.
            action: The action that was selected in the context. See the base class for more information.
            reward: The reward that was gained from the action. See the base class for more information.
            probability: The probability that the given action was taken.
        """
        start = time.time()

        features = self._featurize(context, action)
        self._core_model = self._update_model(self._core_model, features, reward, 1)

        self._times[2] += time.time() - start
        self._iter += 1

        # if (self._iter - 200 - 1) % 50 == 0 and self._iter > 200:
        #     print(f'avg phi time: {round(self._times[0] / (self._iter - 200), 2)}')
        #     print(f'avg bin time: {round(self._times[1] / (self._iter - 200), 2)}')
        #     print(f'avg lrn time: {round(self._times[2] / (self._iter - 200), 2)}')

    def _bin_search(self, features, K_t) -> float:
        start = time.time()

        y_u = 2
        w = 1

        f_u_a_w = self._update_model(self._core_model, features, y_u, w)
        f_x_t_a = self._predict_model(self._core_model, features)
        s_u_a = (self._predict_model(f_u_a_w, features) - f_x_t_a) / w

        obj = lambda w: w * (f_x_t_a - y_u) ** 2 - w * (f_x_t_a + s_u_a * w - y_u) ** 2

        lower_search_bound = 0
        upper_search_bound = (f_x_t_a - y_u) / (-s_u_a)
        width_search_bound = upper_search_bound - lower_search_bound

        constraint = self._alpha * math.log(K_t)

        w_old = lower_search_bound
        w_now = lower_search_bound + 1 / 2 * width_search_bound
        o = obj(w_now)

        while abs(w_now - w_old) > width_search_bound * (1 / 2) ** 30 or o >= constraint:
            w_diff = abs(w_now - w_old)
            w_old = w_now
            if o < constraint:
                w_now += w_diff / 2
            else:
                w_now -= w_diff / 2
            o = obj(w_now)

        self._times[1] += time.time() - start

        return f_x_t_a + s_u_a * w_now

    def _featurize(self, context, action):
        import numpy as np  # type: ignore

        start = time.time()

        is_sparse = isinstance(context, dict) or isinstance(action, dict)

        if isinstance(context, dict):
            context_values = list(context.values())
            context_names = list([f"x{k}" for k in context.keys()])
        else:
            context_values = (context or [1])
            context_names = [''] if not is_sparse else [f"x{i}" for i in range(len(context_values))]

        if isinstance(action, dict):
            action_names = list([f"a{k}" for k in action.keys()])
            action_values = list(action.values())
        else:
            action_values = action
            action_names = [''] if not is_sparse else [f"a{i}" for i in range(len(action_values))]

        x_terms_by_degree = self._terms_by_degree(len(context_values), self._x_p.fit_transform([context_values])[0])
        a_terms_by_degree = self._terms_by_degree(len(action_values), self._a_p.fit_transform([action_values])[0])

        features = self._interaction_terms(x_terms_by_degree, a_terms_by_degree, [1])

        if is_sparse:
            x_names_by_degree = self._terms_by_degree(len(context_values), self._x_p.get_feature_names(context_names))
            a_names_by_degree = self._terms_by_degree(len(action_values), self._a_p.get_feature_names(action_names))
            names = self._interaction_terms(x_names_by_degree, a_names_by_degree, [''])

        final_features = np.array(features) if not is_sparse else self._h.fit_transform([list(zip(names, features))])

        self._times[0] += time.time() - start

        return final_features

    def _terms_by_degree(self, base_term_count: int, terms: Sequence[Any], with_bias: bool = False) -> Dict[int, Sequence[Any]]:
        terms_by_degree = {}

        index = 0 if not with_bias else 1
        degree = 1

        while index != len(terms):
            degree_terms_count = int((base_term_count ** degree + base_term_count) / 2)
            terms_by_degree[degree] = terms[index:index + degree_terms_count]
            index += degree_terms_count
            degree += 1

        return terms_by_degree

    def _interaction_terms(self, x_terms_by_degree, a_terms_by_degree, default):
        import numpy as np

        interaction_terms = []

        for term in self._terms:
            x_for_degree = x_terms_by_degree.get(term[0], default)
            a_for_degree = a_terms_by_degree.get(term[1], default)

            if not isinstance(x_for_degree[0], str):
                outer = np.outer(x_for_degree, a_for_degree)
            else:
                outer = np.char.array(x_for_degree)[:, None] + np.char.array(a_for_degree)

            interaction_terms += outer.T.reshape((1, -1)).squeeze().tolist()

        return interaction_terms

    def _predict_model(self, model, features):
        import numpy as np
        import scipy.sparse as sp

        if sp.issparse(model):
            return model.multiply(features).data.sum()
        else:
            return np.dot(model, features)

    def _update_model(self, model, features, value, importance):
        error = self._predict_model(model, features) - value
        return model - self._learning_rate * features * error * importance
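# FeatureHasher(input_type='pair'), as used in _featurize above, expects each
# sample to be a sequence of (feature_name, value) pairs; a standalone sketch:
from sklearn.feature_extraction import FeatureHasher
h = FeatureHasher(n_features=16, input_type='pair')
X = h.fit_transform([[('x0', 1.0), ('a0', 0.5)]])  # 1 x 16 sparse row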
    }, {
        'feature_3': -2,
        'feature_4': 10
    }]

# Vectorize the dictionary data
print('Dictionary data vectorization')
dv = DictVectorizer()
Y_dict = dv.fit_transform(data)
print(Y_dict.todense())

print('Vocabulary:')
print(dv.vocabulary_)

# Feature hashing
print('Feature hashing')
fh = FeatureHasher()
Y_hashed = fh.fit_transform(data)

# Decode the features
print('Feature decoding')
print(Y_hashed.todense())

# One-hot encoding
data1 = [[0, 10], [1, 11], [1, 8], [0, 12], [0, 15]]

# Encode data (categorical_features was removed in later scikit-learn releases)
oh = OneHotEncoder(categorical_features=[0])
Y_oh = oh.fit_transform(data1)
print(Y_oh.todense())
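# Unlike DictVectorizer, FeatureHasher keeps no fitted vocabulary, so keys that
# were never seen before can be transformed directly (illustrative sketch):
fh2 = FeatureHasher(n_features=8)
print(fh2.transform([{'feature_9': 1.0}]).todense())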
# data = pd.concat([data, one_hot_column], axis=1)
# del data[colname]
# return data

columns = ['workclass', 'marital-status', 'occupation', 'relationship',
           'race', 'sex', 'native-country']
columns_prefix = ['workclass_px', 'marital-status_px', 'occupation_px',
                  'relationship_px', 'race_px', 'sex_px', 'native-country_px']
# columns_other = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

x_train_onehot = pd.get_dummies(x_train, prefix=columns_prefix, columns=columns)
# print(x_train.head())
print(x_train.shape)

# hash 'native-country' into 8 columns (with input_type='string', each string
# sample is iterated character by character by the hasher)
fh = FeatureHasher(n_features=8, input_type='string')
sp = fh.fit_transform(x_train['native-country'])
df1 = pd.DataFrame(sp.toarray(),
                   columns=['country_1', 'country_2', 'country_3', 'country_4',
                            'country_5', 'country_6', 'country_7', 'country_8'])
x_train_hashed = pd.concat([df1, x_train.drop(columns=['native-country'])], axis=1)
print('x_train\n', x_train.columns)
print('x_train_hashed\n', x_train_hashed.columns)
# print('x_train_onehot\n', x_train_onehot.columns)

from sklearn.model_selection import train_test_split

data_train, data_test, target_train, target_test = train_test_split(
    x_train_onehot, y_train, test_size=0.20, random_state=10)
def load_data(file_name, size=-1):
    example_path = 'experiments_data'
    df = pd.read_csv(
        os.path.join(os.getcwd(), '../..', f'{example_path}/{file_name}_data.csv'))
    if size > -1:
        df = df.sample(size, random_state=seed)
    if file_name == 'wine':
        y = df['y']
        df = df.drop(columns=['y'])
        return df, y
    elif file_name == 'fake_job_posting':
        df.fillna(" ", inplace=True)
        df['text'] = df['title'] + ' ' + df['location'] + ' ' + df['department'] + ' ' + df['company_profile'] + ' ' + \
            df['description'] + ' ' + df['requirements'] + ' ' + df['benefits'] + ' ' + df['employment_type'] + \
            ' ' + df['required_education'] + ' ' + df['industry'] + ' ' + df['function']
        return df['text'], df['fraudulent']
    elif file_name == 'hotel_bookings':
        X = df.drop(["is_canceled"], axis=1)
        y = df["is_canceled"]
        return X, y
    elif file_name == 'hr_employee_attrition':
        target_map = {'Yes': 1, 'No': 0}
        # numerically encode the attrition target variable
        y = df["Attrition"].apply(lambda x: target_map[x])
        X = df.drop(["Attrition"], axis=1)
        return X, y
    elif file_name == 'nomao':
        X = df.drop(["__TARGET__"], axis=1)
        y = df["__TARGET__"]
        return X, y
    elif file_name == 'placement_full_class':
        X = df[[
            'gender', 'ssc_p', 'ssc_b', 'hsc_p', 'hsc_b', 'hsc_s', 'degree_p',
            'degree_t', 'workex', 'etest_p', 'specialisation', 'mba_p'
        ]]
        y = df['status']
        return X, y
    elif file_name == 'rain_weather_aus':
        df = df.drop(columns=[
            'Sunshine', 'Evaporation', 'Cloud3pm', 'Cloud9am', 'Location',
            'RISK_MM', 'Date'
        ], axis=1)
        df = df.dropna(how='any')
        X = df.loc[:, df.columns != 'RainTomorrow']
        y = df['RainTomorrow']
        return X, y
    elif file_name == 'cervical_cancer':
        df = df.replace('?', np.nan)
        df = df.rename(columns={'Biopsy': 'Cancer'})
        df = df.apply(pd.to_numeric)
        df = df.fillna(df.mean().to_dict())
        X = df.drop('Cancer', axis=1)
        y = df['Cancer']
        return X, y
    elif file_name == 'glass':
        features = df.columns[:-1].tolist()
        X = df[features]
        y = df['Type']
        return X, y
    elif file_name == 'mobile_price':
        y = df.price_range
        X = df.drop(["price_range"], axis=1)
        return X, y
    elif file_name == 'clinvar_conflicting':
        toBeConsidered = [
            'CHROM', 'POS', 'REF', 'ALT', 'AF_ESP', 'AF_EXAC', 'AF_TGP',
            'CLNDISDB', 'CLNDN', 'CLNHGVS', 'CLNVC', 'MC', 'ORIGIN', 'CLASS',
            'Allele', 'Consequence', 'IMPACT', 'SYMBOL', 'Feature_type',
            'Feature', 'BIOTYPE', 'STRAND', 'CADD_PHRED', 'CADD_RAW'
        ]
        df2 = df[toBeConsidered]
        df2 = df2.dropna()
        cutdowns = []
        for i in df2.columns.values:
            if df2[i].nunique() < 1000:
                cutdowns.append(i)
        df_final = df2[cutdowns]
        df_final['CHROM'] = df_final['CHROM'].astype(str)

        from sklearn.feature_extraction import FeatureHasher
        fh = FeatureHasher(n_features=5, input_type='string')

        hashed1 = fh.fit_transform(df_final['REF'])
        hashed1 = hashed1.toarray()
        hashedFeatures1 = pd.DataFrame(hashed1)
        nameList = {}
        for i in hashedFeatures1.columns.values:
            nameList[i] = "REF" + str(i + 1)
        hashedFeatures1.rename(columns=nameList, inplace=True)

        hashed2 = fh.fit_transform(df_final['ALT'])
        hashed2 = hashed2.toarray()
        hashedFeatures2 = pd.DataFrame(hashed2)
        nameList2 = {}
        for i in hashedFeatures2.columns.values:
            nameList2[i] = "ALT" + str(i + 1)
        hashedFeatures2.rename(columns=nameList2, inplace=True)

        binaryFeature1 = pd.get_dummies(df_final['CLNVC'])
        df_final = df_final.drop(columns=['MC'], axis=1)

        hashed0 = fh.fit_transform(df_final['CHROM'])
        hashed0 = hashed0.toarray()
        hashedFeatures0 = pd.DataFrame(hashed0)
        nameList0 = {}
        for i in hashedFeatures0.columns.values:
            nameList0[i] = "CHROM" + str(i + 1)
        hashedFeatures0.rename(columns=nameList0, inplace=True)

        hashed3 = fh.fit_transform(df_final['Allele'])
        hashed3 = hashed3.toarray()
        hashedFeatures3 = pd.DataFrame(hashed3)
        nameList3 = {}
        for i in hashedFeatures3.columns.values:
            nameList3[i] = "Allele" + str(i + 1)
        hashedFeatures3.rename(columns=nameList3, inplace=True)

        hashed4 = fh.fit_transform(df_final['Consequence'])
        hashed4 = hashed4.toarray()
        hashedFeatures4 = pd.DataFrame(hashed4)
        nameList4 = {}
        for i in hashedFeatures4.columns.values:
            nameList4[i] = "Consequence" + str(i + 1)
        hashedFeatures4.rename(columns=nameList4, inplace=True)

        binaryFeature3 = pd.get_dummies(df_final['IMPACT'])
        df_final = df_final.drop(columns=['Feature_type'], axis=1)
        binaryFeature4 = pd.get_dummies(df_final['BIOTYPE'], drop_first=True)
        binaryFeature5 = pd.get_dummies(df_final['STRAND'], drop_first=True)

        df3 = pd.concat([
            binaryFeature1, binaryFeature3, binaryFeature4, binaryFeature5,
            hashedFeatures1, hashedFeatures2, hashedFeatures3, hashedFeatures4,
            hashedFeatures0, df_final['CLASS']
        ], axis=1)
        df3 = df3.dropna()
        df3.rename(columns={1: "one", 16: "sixteen"}, inplace=True)
        y = df3['CLASS']
        X = df3.drop(columns=['CLASS'], axis=1)
        return X, y
    elif file_name == 'heart_failure_clinical':
        y = df['DEATH_EVENT']
        X = df.drop('DEATH_EVENT', axis=1)
        return X, y
    elif file_name == 'churn_modelling':
        df['EstimatedSalary'] = df['EstimatedSalary'].astype(int)
        df.drop(columns=['RowNumber', 'CustomerId', 'Surname', 'Geography'],
                inplace=True)
        le = preprocessing.LabelEncoder()
        df['Gender'] = le.fit_transform(df['Gender'])
        X = df.drop('Exited', axis=1)
        y = df['Exited']
        return X, y
    elif file_name == 'hr_leaving':
        y = df['left']
        X = df.drop('left', axis=1)
        return X, y
    elif file_name == 'bank_churners':
        df = pd.get_dummies(df, drop_first=True)
        norm = MinMaxScaler().fit(df)
        data_norm_arr = norm.transform(df)
        X = pd.DataFrame(
            data=data_norm_arr,
            columns=[
                'CLIENTNUM', 'Customer_Age', 'Dependent_count', 'Months_on_book',
                'Total_Relationship_Count', 'Months_Inactive_12_mon',
                'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
                'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
                'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',
                'Attrition_Flag_Existing Customer', 'Gender_M',
                'Education_Level_Doctorate', 'Education_Level_Graduate',
                'Education_Level_High School', 'Education_Level_Post-Graduate',
                'Education_Level_Uneducated', 'Education_Level_Unknown',
                'Marital_Status_Married', 'Marital_Status_Single',
                'Marital_Status_Unknown', 'Income_Category_$40K - $60K',
                'Income_Category_$60K - $80K', 'Income_Category_$80K - $120K',
                'Income_Category_Less than $40K', 'Income_Category_Unknown',
                'Card_Category_Gold', 'Card_Category_Platinum',
                'Card_Category_Silver'
            ])
        X = df.drop("Attrition_Flag_Existing Customer", axis=1)
        y = df["Attrition_Flag_Existing Customer"]
        return X, y
    elif file_name == 'fetal_health':
        X = df.drop(["fetal_health"], axis=1)
        y = df["fetal_health"]
        return X, y
    elif file_name == 'stroke':
        df.drop("id", axis=1, inplace=True)
        for column in ['bmi']:
            df[column].fillna(df[column].mode()[0], inplace=True)
        for label, content in df.items():
            if pd.api.types.is_string_dtype(content):
                df[label] = content.astype("category").cat.as_ordered()
        for label, content in df.items():
            if not pd.api.types.is_numeric_dtype(content):
                df[label] = pd.Categorical(content).codes + 1
        X = df.drop("stroke", axis=1)
        y = df["stroke"]
        return X, y
    elif file_name == 'company_bankruptcy_prediction':
        df.columns = [str(col).strip() for col in list(df.columns)]
        X = df.drop(["Bankrupt?"], axis=1)
        y = df['Bankrupt?']
        return X, y
    elif file_name == 'airline_passenger_satisfaction':
        df['satisfaction'] = df['satisfaction'].map({
            'neutral or dissatisfied': 0,
            'satisfied': 1
        })
        X = df.drop(["satisfaction"], axis=1)
        y = df['satisfaction']
        return X, y
    elif file_name == 'banking_marketing_targets':
        X = df.drop(["y"], axis=1)
        target_map = {'yes': 1, 'no': 0}
        y = df['y'].apply(lambda x: target_map[x])
        return X, y
    else:
        raise ValueError(
            f"file_name can be one of the following: wine, fake_job_posting, hotel_bookings, "
            f"hr_employee_attrition, nomao, placement_full_class, rain_weather_aus, cervical_cancer, "
            f"glass or mobile_price. "
            f"The file_name passed is of type {type(file_name)}")
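# Example call; assumes experiments_data/wine_data.csv exists at the relative
# path used above and that a global `seed` is defined for sampling.
X, y = load_data('wine', size=500)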
# %%
# new_effect

# %%
# new_effect.describe()

# %% [markdown]
# # Feature Hasher

# %%
from sklearn.feature_extraction import FeatureHasher

# %%
effect_hasher = FeatureHasher(n_features=3, input_type="string")
x = effect_hasher.fit_transform(total_effects)

# %%
# print(total_effects)
# print(x.toarray())
# len(np.unique(x.toarray(), axis=0))

# %%
flavor_hasher = FeatureHasher(n_features=10, input_type="string")
y = flavor_hasher.fit_transform(total_flavor)

# %%
# print(total_flavor)
# print(y.toarray())
# len(np.unique(y.toarray(), axis=0))
df['INICIO_SINTOMAS'] = df['INICIO_SINTOMAS'].apply(
    lambda x: int(time.mktime(x.timetuple())))

# Format age with two decimal places
df['IDADE'] = df['IDADE'].map('{:,.2f}'.format)

# Drop records whose sex is 'missing'
df = df.drop(df[df.SEXO == 'missing'].index)

# Encode sex as a code
df['SEXO'] = df['SEXO'].apply(lambda x: '0' if (x == 'f') else '1')

# Hash-encode territory
len(df.groupby('TERRITORIO').size())
fh = FeatureHasher(n_features=10, input_type='string')
hashTerritorio = fh.fit_transform(df['TERRITORIO'])
dfTerritorio = pd.DataFrame(fh.fit_transform(df['TERRITORIO']).toarray(),
                            columns=['hf0', 'hf1', 'hf2', 'hf3', 'hf4',
                                     'hf5', 'hf6', 'hf7', 'hf8', 'hf9'])
df[dfTerritorio.columns] = dfTerritorio

# One-hot encode race
len(df.groupby('RACA_COR').size())
df['RACA_COR'].value_counts()
enc = preprocessing.OneHotEncoder()
dfRacaCor = pd.DataFrame(
    preprocessing.OneHotEncoder().fit_transform(
        df['RACA_COR'].to_frame()).toarray(),
    columns=['rc0', 'rc1', 'rc2', 'rc3', 'rc4'])
df[dfRacaCor.columns] = dfRacaCor
display(unique_races.head(10))

s = sum(unique_races.values)
h = unique_races.values / s
c_sum = np.cumsum(h)
plt.plot(c_sum, label="Cumulative-sum distribution of races")
plt.grid()
plt.legend()
# -

# > With the top 10 we cover more than 85% of the data

# +
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.FeatureHasher.html
fh = FeatureHasher(n_features=10, input_type='string')
hashed_features = fh.fit_transform(
    dataset['Race'].astype(str).values.reshape(-1, 1)).todense()
pd.DataFrame(hashed_features).add_prefix('Race_').head(10).join(
    dataset['Race'].head(10))
# -

# ## Numeric features
# The set has two numeric variables, *Weight* and *Height*; let's look at their distribution

# +
df = dataset[dataset.Alignment != 'neutral'].reset_index(drop=True)


def plot_weight_vs_height(df, title=""):
    fig = px.scatter(
# In[44]:

train_merge = pd.concat([train_merge, pd.get_dummies(train_merge['DeviceType'], prefix='DeviceType')], axis=1).drop(['DeviceType'], axis=1)
train_merge = pd.concat([train_merge, pd.get_dummies(train_merge['ProductCD'], prefix='ProductCD')], axis=1).drop(['ProductCD'], axis=1)
train_merge = pd.concat([train_merge, pd.get_dummies(train_merge['card4'], prefix='card4')], axis=1).drop(['card4'], axis=1)
train_merge = pd.concat([train_merge, pd.get_dummies(train_merge['card6'], prefix='card6')], axis=1).drop(['card6'], axis=1)


# ### I used feature hashing for DeviceInfo, R_emaildomain, and P_emaildomain.

# In[45]:

fh = FeatureHasher(n_features=5, input_type='string')
sp = fh.fit_transform(train_merge['DeviceInfo'])
dev_0 = pd.DataFrame(sp.toarray(), columns=['DeviceInfo1', 'DeviceInfo2', 'DeviceInfo3', 'DeviceInfo4', 'DeviceInfo5'])
train_merge = pd.concat([train_merge, dev_0], axis=1)


# In[46]:

fh = FeatureHasher(n_features=5, input_type='string')
sp = fh.fit_transform(train_merge['R_emaildomain'])
dev_1 = pd.DataFrame(sp.toarray(), columns=['R_emaildomain1', 'R_emaildomain2', 'R_emaildomain3', 'R_emaildomain4', 'R_emaildomain5'])
train_merge = pd.concat([train_merge, dev_1], axis=1)


# In[47]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Data import
battles = pd.read_csv("01_data/raw/Battle_Results.csv", sep="|")

# Prep
cat_vars = battles.select_dtypes(object).columns.values.tolist()
battles = battles.astype({"Legendary_1": int, "Legendary_2": int})

h1 = FeatureHasher(n_features=5, input_type='string')
h2 = FeatureHasher(n_features=5, input_type='string')
d1 = h1.fit_transform(battles["Name_1"])
d2 = h2.fit_transform(battles["Name_2"])
d1 = pd.DataFrame(data=d1.toarray())
d1.columns = ["Name_1_" + str(x) for x in range(5)]
d2 = pd.DataFrame(data=d2.toarray())
d2.columns = ["Name_2_" + str(x) for x in range(5)]

battles = battles.drop(columns=cat_vars[0:2])
battles = pd.concat([battles, d1, d2], axis=1)
battles = pd.get_dummies(battles)

X = battles.drop(labels="BattleResult", axis=1)
y = battles.BattleResult
X_train, X_test, y_train, y_test = train_test_split(X,
Data Cleansing
"""

countVectorizer = CountVectorizer()
# ----- Convert to dense array
cv_trainData_x = countVectorizer.fit_transform(trainData.Phrase).toarray()
cv_trainData_x.shape
"""
Total features: (156060, 15240)
"""

# note: non_negative was removed in recent scikit-learn releases
featureHasher = FeatureHasher(input_type='string', n_features=5000,
                              non_negative=True)
fh_trainData_x = featureHasher.fit_transform(trainData.Phrase).toarray()
fh_trainData_x.shape

trainData_y = trainData.Sentiment.astype('category')
trainData_y.shape

# ----------- Implement Gaussian NB model
from sklearn.naive_bayes import GaussianNB

gaussianModel = GaussianNB()
gaussianModel.fit(fh_trainData_x, trainData_y)
gaussian_predict = gaussianModel.predict(fh_trainData_x)
(trainData_y == gaussian_predict).sum() / len(trainData)
"""
# -- Observations for the Gaussian model
feature["FromTimestamp"] = click[1] feature["ToTimestamp"] = 0 feature["ItemId"] = click[2] feature["Category"] = click[3] feature["Price"] = 0 feature["Quantitiy"] = 0 X.append(feature) sys.stderr.write("\rProgress:%.2f%%" % (100. * i / len(clicks))) # make dictvect print "make dict vect" v = DictVectorizer() X_dict_sparse = v.fit_transform(X) X_dict = [zip(map(str, row.indices), row.data) for row in X_dict_sparse] # Feature Hashing print "Feature Hashing" n_features = 2**24 hasher = FeatureHasher(n_features=n_features, input_type='pair') X_hash_sparse = hasher.fit_transform(X_dict) X_hash = [zip(row.indices, row.data) for row in X_hash_sparse] # make libsvm data with open("./data/yoochoose-train.dat", "w") as f: for val, features in zip(c, X_hash): features_list = [] for feature in features: features_list.append(str(feature[0]) + ":" + str(feature[1])) features_line = " ".join(features_list) f.write(str(val)+" "+features_line+"\n")
class Project:
    def __init__(self, train_filepath, test_filepath, sample_filepath,
                 is_generate_feature=True, is_sample_data=False):
        if is_sample_data:
            self.train_data = pd.read_csv(sample_filepath)
        else:
            self.train_data = pd.read_table(train_filepath)
        self.test_data = pd.read_table(test_filepath)
        self.columns = self.train_data.columns
        self.hash_number = 1
        self.feature_hash = FeatureHasher(n_features=self.hash_number,
                                          input_type='string')
        self.spark = SparkSession \
            .builder \
            .appName("Python Spark SQL basic example") \
            .config("spark.some.config.option", "some-value") \
            .getOrCreate()
        self.data_cleaning()
        if not os.path.exists('./data/feature.csv') or is_generate_feature:
            self.feature_generation()

    def data_cleaning(self):
        # remove meaningless parts of the data
        self.train_data.drop(columns=[
            "Row", "First Transaction Time", "Step Start Time",
            "Correct Transaction Time", "Step End Time",
            "Step Duration (sec)", "Hints"
        ], inplace=True)
        self.columns = self.train_data.columns
        check_list = ["Correct Step Duration (sec)", "Incorrects", "Corrects"]
        data = {}
        for item in check_list:
            data[item] = {
                "std": self.train_data[item].std(),
                "mean": self.compute_mean_spark(self.train_data, item)
            }

        def check_outlier(record):
            dict_re = dict(zip(self.columns, record))
            # items on the check list are float64
            for item in check_list:
                if dict_re[item] and abs(dict_re[item] - data[item]["mean"]) > 10 * data[item]["std"]:
                    return True
            return False

        def check_error(record):
            # wrong data
            dict_re = dict(zip(self.columns, record))
            if pd.isna(dict_re['Correct Step Duration (sec)']) and dict_re['Correct First Attempt'] == 0:
                return True
            elif pd.isna(dict_re['Error Step Duration (sec)']) and dict_re['Incorrects'] == 0:
                return True
            return False

        index = 0
        remove_list = []
        problem_unit = []
        problem_section = []
        oppo_num = []
        for i in self.train_data.values:
            if check_outlier(i) or check_error(i):
                remove_list.append(index)
            else:
                # split the problem hierarchy into unit and section
                dict_re = dict(zip(self.columns, i))
                unit = dict_re["Problem Hierarchy"].split(", ")[0]
                unit = re.sub("Unit ", "", unit)
                section = dict_re["Problem Hierarchy"].split(", ")[1]
                section = re.sub("Section ", "", section)
                problem_unit.append(unit)
                problem_section.append(section)
                if type(dict_re["Opportunity(Default)"]) == str:
                    oppo_num.append(dict_re["Opportunity(Default)"])
                else:
                    oppo_num.append("0")
                # dealing with KC
            index += 1
        write_save_log("number of cleaned records: {}".format(len(remove_list)))
        self.train_data.drop(remove_list, inplace=True)
        self.train_data.drop(columns=[
            "Problem Hierarchy", "Opportunity(Default)",
            "Error Step Duration (sec)"
        ], inplace=True)
        self.train_data["Problem Unit"] = problem_unit
        self.train_data["Problem Section"] = problem_section
        self.train_data["Opportunity(Default)"] = oppo_num
        # for col in self.train_data.columns:
        #     print(self.train_data[col].describe())
        self.columns = self.train_data.columns

    @staticmethod
    def one_hot_encoder_generator(features_pd, column, data):
        id_ohe = OneHotEncoder()
        id_le = LabelEncoder()
        id_labels = id_le.fit_transform(data[column])
        # id_feature_arr: num(rows) x num(unique ids)
        id_feature_arr = id_ohe.fit_transform(pd.DataFrame(id_labels)).toarray()
        id_feature_arr = np.transpose(id_feature_arr)
        for label in id_le.classes_:
            features_pd[label] = id_feature_arr[list(id_le.classes_).index(label)]

    def hash_encoder_generator(self, features_pd, column, data):
        sn_feature = self.feature_hash.fit_transform(data[column]).toarray()
        sn_feature = np.transpose(sn_feature)
        for i in range(self.hash_number):
            features_pd["{}_{}".format(column, i)] = sn_feature[i]

    # @staticmethod
    # def count_intelligent_score(cor_time, cor_first, cor_num, in_cur):
    #     return cor_time * cor_first * (in_cur / cor_num)

    @staticmethod
    def count_intelligent_score(cor_time, cor_first, cor_num, in_cur):
        cor_step_time_score = -cor_time + 100
        if cor_step_time_score < -100:
            cor_step_time_score = -100
        normalize_step = (cor_step_time_score + 100) / 200
        return normalize_step * cor_first * (in_cur / cor_num)

    def compute_mean_spark(self, dataframe, column):
        dataframe[column].to_csv('./data/tem.csv')
        spark_df = self.spark.read.csv('./data/tem.csv')
        spark_df.createOrReplaceTempView("train")
        sqlDF = self.spark.sql(
            "SELECT AVG(_c1) as mean FROM train WHERE _c1 is not Null")
        return json.loads(sqlDF.toJSON().first())['mean']

    def feature_generation(self):
        """
        feature 1: compute the intelligence score
        feature 2: compute the difficulty of a problem
        feature 3: sum of the difficulties of the knowledge components
        """
        # for col in self.train_data.columns:
        #     print(self.train_data[col].describe())
        write_save_log("start to generate features")
        features_pd = pd.DataFrame()
        # hash-encode the student ID
        # self.one_hot_encoder_generator(features_pd, "Anon Student Id", self.train_data)
        self.hash_encoder_generator(features_pd, "Anon Student Id", self.train_data)
        write_save_log("ID feature generated")
        # hash-encode Problem Name
        self.hash_encoder_generator(features_pd, "Problem Name", self.train_data)
        write_save_log("Problem Name feature generated")
        # hash-encode Problem Unit
        self.hash_encoder_generator(features_pd, "Problem Unit", self.train_data)
        write_save_log("Problem Unit feature generated")
        # hash-encode Problem Section
        self.hash_encoder_generator(features_pd, "Problem Section", self.train_data)
        write_save_log("Problem Section feature generated")
        # directly add problem view, imputing missing values with the mean
        features_pd["Problem View"] = self.train_data["Problem View"]
        mean_pv = self.compute_mean_spark(features_pd, "Problem View")
        new_pv = []
        for row in features_pd["Problem View"]:
            if not np.isnan(row):
                new_pv.append(row)
            else:
                new_pv.append(mean_pv)
        features_pd.drop(columns=["Problem View"], inplace=True)
        features_pd["Problem View"] = new_pv
        write_save_log("Problem View feature generated")
        # hash Step Name to features
        self.hash_encoder_generator(features_pd, "Step Name", self.train_data)
        write_save_log("Step Name feature generated")

        # the next features are precomputed values from the training data set
        # person intelligence
        write_save_log("start to generate person intelligent")
        id_unique = self.train_data["Anon Student Id"]
        intelligent_table = dict(zip(id_unique, [0 for i in range(len(id_unique))]))
        id_group = self.train_data.groupby(["Anon Student Id"]).mean()
        for i in range(len(id_group.values)):
            write_save_log("id group process: {}".format(i))
            stu_id = id_group.index[i]
            dict_row = dict(zip(id_group.columns, id_group.values[i]))
            intelligent_table[stu_id] = self.count_intelligent_score(
                dict_row['Correct Step Duration (sec)'],
                dict_row['Correct First Attempt'],
                dict_row['Corrects'],
                dict_row['Incorrects'])

        problem_group = self.train_data.groupby(["Step Name"]).mean()
        problem_difficulty = {}
        problem_group_cor_first = problem_group["Correct First Attempt"]
        for i in range(len(problem_group_cor_first.index)):
            problem_difficulty[problem_group_cor_first.index[i]] = problem_group_cor_first.values[i]
        problem_difficulty['mean'] = problem_group_cor_first.mean()
        write_save_log("problem difficulty mean : {}".format(problem_difficulty['mean']))
        with open("./data/problem.json", 'w') as f:
            f.write(json.dumps(problem_difficulty))

        unique_KC = self.train_data["KC(Default)"].unique()
        unique_KC_list = []
        for kc in unique_KC:
            if type(kc) == str:
                for true_kc in kc.split("~~"):
                    unique_KC_list.append(true_kc)
        # [correct, total]
        kc_difficulty = dict(zip(unique_KC_list,
                                 [[0, 0] for i in range(len(unique_KC_list))]))
        person_intelligent = []
        kc_length = []
        index_count = 0
        for row in self.train_data.values:
            dict_row = dict(zip(self.train_data.columns, row))
            if index_count % 10000 == 0:
                write_save_log("loading feature to dataframe process: {}".format(index_count))
            # look up the intelligence score
            stu_id = dict_row["Anon Student Id"]
            person_intelligent.append(intelligent_table[stu_id])
            # extract KC
            stu_kc = dict_row["KC(Default)"]
            kc_num = 0
            if type(stu_kc) == str:
                kc_num = len(stu_kc.split("~~"))
                for true_kc in stu_kc.split("~~"):
                    if dict_row["Correct First Attempt"] == 1:
                        kc_difficulty[true_kc][0] += 1
                    kc_difficulty[true_kc][1] += 1
            kc_length.append(kc_num)
            index_count += 1

        with open('./data/kc_difficulty.json', 'w') as f:
            re_kc = {}
            for key, value in kc_difficulty.items():
                re_kc[key] = value[0] / value[1]
            kc_difficulty = re_kc
            kc_df = pd.DataFrame({"value": list(kc_difficulty.values())})
            kc_mean = self.compute_mean_spark(kc_df, "value")
            kc_difficulty["mean"] = kc_mean
            f.write(json.dumps(kc_difficulty))
        write_save_log("kc mean: {}".format(kc_mean))

        kc_features = []
        oppo_feature = []
        problem_diff_value = []
        for row in self.train_data.values:
            dict_row = dict(zip(self.train_data.columns, row))
            stu_kc = dict_row["KC(Default)"]
            sum_difficult = 0
            oppo_value = 0
            if type(stu_kc) == str:
                oppo_list = dict_row["Opportunity(Default)"].split("~~")
                for true_kc in stu_kc.split("~~"):
                    oppo_value += int(oppo_list[stu_kc.split("~~").index(true_kc)]) * kc_difficulty[true_kc]
                    sum_difficult += kc_difficulty[true_kc]
                sum_difficult /= len(stu_kc.split("~~"))
                oppo_value /= len(stu_kc.split("~~"))
            else:
                oppo_value = kc_difficulty["mean"]
                sum_difficult = kc_difficulty["mean"]
            # problem difficulty
            problem_diff_value.append(problem_difficulty[dict_row["Step Name"]])
            kc_features.append(sum_difficult)
            oppo_feature.append(oppo_value)

        features_pd["kc difficulty"] = kc_features
        features_pd["kc number"] = kc_length
        features_pd["person_intelligent"] = person_intelligent
        features_pd["oppo value"] = oppo_feature
        features_pd['Problem difficulty'] = problem_diff_value
        write_save_log("feature length: {}".format(len(features_pd.columns)))
        features_pd.to_csv("./data/feature.csv", mode='w', index=False)
        with open('./data/intelligent_table.json', 'w') as f:
            f.write(json.dumps(intelligent_table))

    def predict(self):
        write_save_log("start to predict")
        correct_answer = []
        first_attempt_index = list(self.columns).index('Correct First Attempt')
        for row in self.train_data.values:
            re_cor = row[first_attempt_index]
            if np.isnan(re_cor):
                re_cor = 0
            correct_answer.append(re_cor)
        correct_answer = np.array(correct_answer)
        features = pd.read_csv("./data/feature.csv")
        # for col in features.columns:
        #     print(features[col].describe())
        with open('./data/intelligent_table.json', 'r') as f:
            intelligent_table = json.loads(f.read())
        with open('./data/kc_difficulty.json', 'r') as f:
            kc_table = json.loads(f.read())
        with open('./data/problem.json', 'r') as f:
            problem_table = json.loads(f.read())

        # generate features for the test data
        test_features_pd = pd.DataFrame()
        problem_unit = []
        problem_section = []
        problem_values = []
        for row in self.test_data.values:
            dict_re = dict(zip(self.test_data.columns, row))
            unit = dict_re["Problem Hierarchy"].split(", ")[0]
            unit = re.sub("Unit ", "", unit)
            section = dict_re["Problem Hierarchy"].split(", ")[1]
            section = re.sub("Section ", "", section)
            problem_unit.append(unit)
            problem_section.append(section)
            if dict_re["Step Name"] in problem_table.keys():
                problem_values.append(problem_table[dict_re["Step Name"]])
            else:
                problem_values.append(problem_table['mean'])
        self.test_data["Problem Unit"] = problem_unit
        self.test_data["Problem Section"] = problem_section

        # hash-encode the student ID
        self.hash_encoder_generator(test_features_pd, "Anon Student Id", self.test_data)
        # self.one_hot_encoder_generator(test_features_pd, "Anon Student Id", self.test_data)
        write_save_log("ID feature generated")
        # hash-encode Problem Name
        self.hash_encoder_generator(test_features_pd, "Problem Name", self.test_data)
        write_save_log("Problem Name feature generated")
        # hash-encode Problem Unit
        self.hash_encoder_generator(test_features_pd, "Problem Unit", self.test_data)
        write_save_log("Problem Unit feature generated")
        # hash-encode Problem Section
        self.hash_encoder_generator(test_features_pd, "Problem Section", self.test_data)
        write_save_log("Problem Section feature generated")
        # directly add problem view
        test_features_pd["Problem View"] = self.test_data["Problem View"]
        self.hash_encoder_generator(test_features_pd, "Step Name", self.test_data)

        intel_values = []
        kc_values = []
        test_answer = []
        kc_length = []
        index_count = 0
        remove_list = []
        oppo_feature = []
        for row in self.test_data.values:
            dict_re = dict(zip(self.test_data.columns, row))
            if np.isnan(dict_re["Correct First Attempt"]):
                remove_list.append(index_count)
                test_answer.append(-1)
            else:
                test_answer.append(dict_re["Correct First Attempt"])
            intel_values.append(intelligent_table[dict_re["Anon Student Id"]])
            stu_kc = dict_re["KC(Default)"]
            sum_difficult = 0
            kc_num = 0
            oppo_value = 0
            if type(stu_kc) == str:
                oppo_list = dict_re["Opportunity(Default)"].split("~~")
                kc_num = len(stu_kc.split("~~"))
                for true_kc in stu_kc.split("~~"):
                    oppo_value += int(oppo_list[stu_kc.split("~~").index(true_kc)]) * kc_table[true_kc]
                    sum_difficult += kc_table[true_kc]
                sum_difficult /= len(stu_kc.split("~~"))
            else:
                oppo_value = kc_table["mean"]
                sum_difficult = kc_table["mean"]
            kc_values.append(sum_difficult)
            kc_length.append(kc_num)
            oppo_feature.append(oppo_value)
            index_count += 1

        test_features_pd["kc difficulty"] = kc_values
        test_features_pd["kc number"] = kc_length
        test_features_pd["person_intelligent"] = intel_values
        test_features_pd["oppo value"] = oppo_feature
        test_features_pd['Problem difficulty'] = problem_values
        # test_features_pd.drop(remove_list, inplace=True)

        clf = HistGradientBoostingRegressor(random_state=1,
                                            max_iter=331,
                                            loss='least_squares',
                                            learning_rate=0.4,
                                            l2_regularization=0.2)
        clf.fit(features.values, correct_answer)
        res = clf.predict(test_features_pd.values)
        re_res = []
        for i in res:
            if i >= 0.5:
                re_res.append(1)
            else:
                re_res.append(0)
        for i in range(len(re_res)):
            if test_answer[i] == -1:
                test_answer[i] = re_res[i]
        self.test_data.drop(columns=['Correct First Attempt'])
        self.test_data['Correct First Attempt'] = test_answer
        self.test_data.to_csv('./data/final.csv', index=False)

    def train(self):
        write_save_log("start to train")
        correct_answer = []
        first_attempt_index = list(self.columns).index('Correct First Attempt')
        for row in self.train_data.values:
            re_cor = row[first_attempt_index]
            if np.isnan(re_cor):
                re_cor = 0
            correct_answer.append(re_cor)
        correct_answer = np.array(correct_answer)
        features = pd.read_csv("./data/feature.csv")
        # for col in features.columns:
        #     print(features[col].describe())
        with open('./data/intelligent_table.json', 'r') as f:
            intelligent_table = json.loads(f.read())
        with open('./data/kc_difficulty.json', 'r') as f:
            kc_table = json.loads(f.read())
        with open('./data/problem.json', 'r') as f:
            problem_table = json.loads(f.read())

        # generate features for the test data
        test_features_pd = pd.DataFrame()
        problem_unit = []
        problem_section = []
        problem_values = []
        for row in self.test_data.values:
            dict_re = dict(zip(self.test_data.columns, row))
            unit = dict_re["Problem Hierarchy"].split(", ")[0]
            unit = re.sub("Unit ", "", unit)
            section = dict_re["Problem Hierarchy"].split(", ")[1]
            section = re.sub("Section ", "", section)
            problem_unit.append(unit)
            problem_section.append(section)
            if dict_re["Step Name"] in problem_table.keys():
                problem_values.append(problem_table[dict_re["Step Name"]])
            else:
                problem_values.append(problem_table['mean'])
        self.test_data["Problem Unit"] = problem_unit
        self.test_data["Problem Section"] = problem_section

        # hash-encode the student ID
        self.hash_encoder_generator(test_features_pd, "Anon Student Id", self.test_data)
        # self.one_hot_encoder_generator(test_features_pd, "Anon Student Id", self.test_data)
        write_save_log("ID feature generated")
        # hash-encode Problem Name
        self.hash_encoder_generator(test_features_pd, "Problem Name", self.test_data)
        write_save_log("Problem Name feature generated")
        # hash-encode Problem Unit
        self.hash_encoder_generator(test_features_pd, "Problem Unit", self.test_data)
        write_save_log("Problem Unit feature generated")
        # hash-encode Problem Section
        self.hash_encoder_generator(test_features_pd, "Problem Section", self.test_data)
        write_save_log("Problem Section feature generated")
        # directly add problem view
        test_features_pd["Problem View"] = self.test_data["Problem View"]
        self.hash_encoder_generator(test_features_pd, "Step Name", self.test_data)

        intel_values = []
        kc_values = []
        test_answer = []
        kc_length = []
        index_count = 0
        remove_list = []
        oppo_feature = []
        for row in self.test_data.values:
            dict_re = dict(zip(self.test_data.columns, row))
            if np.isnan(dict_re["Correct First Attempt"]):
                remove_list.append(index_count)
            else:
                test_answer.append(dict_re["Correct First Attempt"])
            intel_values.append(intelligent_table[dict_re["Anon Student Id"]])
            stu_kc = dict_re["KC(Default)"]
            sum_difficult = 0
            kc_num = 0
            oppo_value = 0
            if type(stu_kc) == str:
                oppo_list = dict_re["Opportunity(Default)"].split("~~")
                kc_num = len(stu_kc.split("~~"))
                for true_kc in stu_kc.split("~~"):
                    oppo_value += int(oppo_list[stu_kc.split("~~").index(true_kc)]) * kc_table[true_kc]
                    sum_difficult += kc_table[true_kc]
                sum_difficult /= len(stu_kc.split("~~"))
            else:
                oppo_value = kc_table["mean"]
                sum_difficult = kc_table["mean"]
            kc_values.append(sum_difficult)
            kc_length.append(kc_num)
            oppo_feature.append(oppo_value)
            index_count += 1

        test_features_pd["kc difficulty"] = kc_values
        test_features_pd["kc number"] = kc_length
        test_features_pd["person_intelligent"] = intel_values
        test_features_pd["oppo value"] = oppo_feature
        test_features_pd['Problem difficulty'] = problem_values
        test_features_pd.drop(remove_list, inplace=True)

        parameter_range = {
            "random_state": [i for i in range(0, 40)],
            "max_iter": [i for i in range(100, 500)],
            "loss": ['least_squares', 'least_absolute_deviation', 'poisson'],
            "learning_rate": [0.1 * i for i in range(1, 7)],
            "l2_regularization": [0.1 * i for i in range(1, 10)],
        }
        best_score = 1
        bes_policy = {}
        while best_score > 0.35:
            random_state = {}
            for key, value in parameter_range.items():
                random_state[key] = random.sample(value, 1)
            write_save_log(random_state)
            # clf1 = HistGradientBoostingRegressor()
            # clf2 = AdaBoostRegressor()
            # clf = VotingRegressor(estimators=[('hgb', clf1), ('rf', clf2)], weights=[2, 1])
            clf = HistGradientBoostingRegressor(
                random_state=random_state["random_state"][0],
                max_iter=random_state["max_iter"][0],
                loss=random_state['loss'][0],
                learning_rate=random_state['learning_rate'][0],
                l2_regularization=random_state['l2_regularization'][0])
            clf.fit(features.values, correct_answer)
            for i in range(len(test_features_pd.columns)):
                if test_features_pd.columns[i] != features.columns[i]:
                    raise KeyError("feature order error!")
            res = clf.predict(test_features_pd.values)
            re_res = []
            for i in res:
                if i >= 0.5:
                    re_res.append(1)
                else:
                    re_res.append(0)
            re_score = MSER(re_res, test_answer)
            write_save_log("result error: {}".format(re_score))
            if best_score > re_score:
                best_score = re_score
                bes_policy = copy.deepcopy(random_state)
                write_save_log("\nbest policy and score\n" + str(bes_policy))
                write_save_log(str(best_score) + '\n')
labels = data_frame['Category']
pd_frame = data_frame['PdDistrict']
resolution = data_frame['Resolution']
data_frame.drop(['Category'], inplace=True, axis=1)
# training_data = pd.concat([pd_frame, resolution], axis=1)
training_data = data_frame.as_matrix(['Dates', 'DayOfWeek', 'Address'])
testing_data = data_frame_test.as_matrix(['Dates', 'DayOfWeek', 'Address'])

gnb = MultinomialNB(alpha=0)
# gnb = LinearSVC()
print 'Made it till here-1'

# FeatureHasher is stateless, so fit_transform on the test set yields the
# same hashing that transform would
fh = FeatureHasher(input_type='string', non_negative=True)
X = fh.fit_transform(training_data)
X_test = fh.fit_transform(testing_data)
print 'Made it till here-2'
print training_data.shape
# print X.toarray()
print 'Made it till here-3'

gnb_model = gnb.fit(X, labels)
y_pred = gnb_model.predict(X_test)
print len(y_pred)
# for actual, predicted in zip(labels, y_pred):
def preprocess_single_predict_manual(self, feature_list):
    """
    Method Name: preprocess_single_predict_manual
    Description: Preprocesses prediction data entered manually.
    Input: feature_list
    Input Type: list
    Output: Returns a dataframe
    Written By: Vaishnavi Ambati
    Version: 1.0
    Revisions: None
    """
    try:
        self.log_file = self.loggerObj.write_log(self.log_file, 'Entered preprocess_single_predict_manual of dataProcessor class')
        self.log_file = self.loggerObj.write_log(self.log_file, 'Data preprocessing has been initiated.')

        columns = ['name', 'city', 'ranking', 'no_of_reviews', 'no_of_cuisines',
                   'review1', 'review2', 'cheap', 'high', 'medium']
        features = feature_list[:-1]
        price = feature_list[-1]

        # one-hot encode the price bucket as (cheap, high, medium)
        if price == 'cheap':
            features.extend([1, 0, 0])
        elif price == 'medium':
            features.extend([0, 0, 1])
        elif price == 'high':
            features.extend([0, 1, 0])

        feature_dic = dict(zip(columns, features))
        df = pd.DataFrame(feature_dic, index=[0])

        # feature hashing of the city column
        fh = FeatureHasher(n_features=7, input_type='string')
        hashed_features = fh.fit_transform(df['city'])
        hashed_features = hashed_features.toarray()
        hashed_df = pd.DataFrame(hashed_features,
                                 columns=['city_1', 'city_2', 'city_3', 'city_4',
                                          'city_5', 'city_6', 'city_7'])
        df_hashed = pd.concat([df.drop('city', axis=1), hashed_df], axis=1)

        df_hashed['review1_sentiment'] = df_hashed.apply(lambda row: self.sentiment_analyzer(row['review1']), axis=1)
        df_hashed['review2_sentiment'] = df_hashed.apply(lambda row: self.sentiment_analyzer(row['review2']), axis=1)
        df_hashed.drop(['review1', 'review2'], axis=1, inplace=True)

        self.log_file = self.loggerObj.write_log(self.log_file, 'Data preprocessing has completed. Exiting the preprocess_single_predict_manual method of dataPreprocessor class.')

        return df_hashed
    except Exception as e:
        self.log_file = self.loggerObj.write_log(self.log_file, "Exception occurred in preprocess_single_predict_manual method of dataPreprocessor class. Exception is " + str(e))
        self.log_file = self.loggerObj.write_log(self.log_file, 'Exiting the preprocess_single_predict_manual method of dataPreprocessor class.')
        self.log_file.to_csv("Logs\\Prediction Logs\\prediction_logs.csv")
        raise Exception
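# Hypothetical call for the method above: seven raw fields followed by the
# price bucket ('cheap' | 'medium' | 'high'); `processor` stands in for an
# instance of the surrounding class.
sample = ['Ristorante X', 'London', 12.0, 100, 3, 'great food', 'ok service', 'medium']
df_ready = processor.preprocess_single_predict_manual(sample)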
train_shift = train.copy()
train_shift['month_num'] = train_shift['month_num'] + month_shift
train_shift = train_shift.rename(columns={"amount": 'amount_{0}'.format(month_shift)})
train_shift = train_shift[['year_num', 'month_num', 'customer_id', 'mcc_code',
                           'amount_{0}'.format(month_shift)]]

train = pd.merge(train, train_shift,
                 on=['year_num', 'month_num', 'customer_id', 'mcc_code'],
                 how='left').fillna(0)
test = pd.merge(test, train_shift,
                on=['year_num', 'month_num', 'customer_id', 'mcc_code'],
                how='left').fillna(0)

hasher = FeatureHasher(n_features=10000, input_type='string')
train_sparse = hasher.fit_transform(
    train[['year_num', 'month_num', 'customer_id', 'mcc_code']].astype(str).as_matrix())
test_sparse = hasher.transform(
    test[['year_num', 'month_num', 'customer_id', 'mcc_code']].astype(str).as_matrix())

train_sparse = sparse.hstack([
    train_sparse,
    np.log(np.abs(train[['amount_1', 'amount_2']]) + 1).as_matrix()
])
test_sparse = sparse.hstack([
    test_sparse,
    np.log(np.abs(test[['amount_1', 'amount_2']]) + 1).as_matrix()
])
# Implement random under-sampling
# First, shuffle the dataframe
df = df.sample(frac=1)

# Create a balanced dataset
number_of_clicks = len(df.loc[df['clicks'] == 1])
df_clicks = df.loc[df['clicks'] == 1]
df_non_clicks = df.loc[df['clicks'] == 0][:number_of_clicks]
df_balanced = pd.concat([df_clicks, df_non_clicks])

#%%
# Encode categorical data using the "hashing trick"
vectorizer = FeatureHasher(n_features=2**25, input_type='string')
invent_src = vectorizer.fit_transform(df_balanced.inventory_source)
# geo_zip = vectorizer.fit_transform(df_balanced.geo_zip)
screen_size = vectorizer.fit_transform(df_balanced.platform_device_screen_size)
carrier = vectorizer.fit_transform(df_balanced.platform_carrier)
bandwidth = vectorizer.fit_transform(df_balanced.platform_bandwidth)
maker = vectorizer.fit_transform(df_balanced.platform_device_make)
model = vectorizer.fit_transform(df_balanced.platform_device_model)
day_of_week = vectorizer.fit_transform(df_balanced.day_of_week)

scaler = RobustScaler()  # StandardScaler()
bid_floor = np.transpose(
    csr_matrix(scaler.fit_transform([df_balanced.bid_floor.values])))
# spend = np.transpose(csr_matrix(scaler.fit_transform([df_balanced.spend.values])))

#%%
y = df_balanced['clicks']
X = hstack([
def preprocess_training_data(self, dataframe):
    """
    Method Name: preprocess_training_data
    Description: Preprocesses the training data.
    Input: dataframe
    Input Type: dataframe
    Output: Returns a processed dataframe.
    Written By: Vaishnavi Ambati
    Version: 1.0
    Revisions: None
    """
    try:
        self.log_file = self.loggerObj.write_log(self.log_file, 'Entered preprocess_training_data of dataProcessor class')
        self.log_file = self.loggerObj.write_log(self.log_file, 'Data preprocessing has been initiated.')

        # renaming the columns, replacing ' ' with '_'
        dataframe.rename(
            columns={'Name': 'name', 'City': 'city', 'Cuisine Style': 'cuisine_style',
                     'Ranking': 'ranking', 'Price Range': 'price_range',
                     'Number of Reviews': 'no_of_reviews', 'Reviews': 'reviews',
                     'URL_TA': 'url_ta', 'ID_TA': 'id_ta', 'Rating': 'rating'},
            inplace=True)

        # processing cuisine_style
        dataframe['cuisine_style'] = dataframe.apply(lambda row: self.cuisine_process(row['cuisine_style']), axis=1)

        # adding a new column to the dataframe
        dataframe['no_of_cuisines'] = dataframe.apply(
            lambda row: len(row['cuisine_style']) if row['cuisine_style'] != 'Not Available' else 0, axis=1)

        # processing price_range
        dataframe['price_range'] = dataframe['price_range'].map({'$': 'cheap', '$$ - $$$': 'medium', '$$$$': 'high'})
        dataframe['price_range'].fillna('medium', inplace=True)

        # dropping rows whose rating is -1
        drop_index = list(dataframe[dataframe['rating'] == -1].index)
        dataframe.drop(drop_index, inplace=True)
        dataframe['rating'] = dataframe['rating'].map(
            {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 1.5: 6, 2.5: 7, 3.5: 8, 4.5: 9})

        # converting reviews to a list of reviews
        dataframe['reviews'] = dataframe.apply(lambda row: self.review_to_words(row['reviews']), axis=1)

        # keeping only the relevant features
        features = ['city', 'ranking', 'price_range', 'no_of_reviews', 'no_of_cuisines', 'rating', 'reviews']
        df_features = dataframe[features]

        # dropping the null values
        df_features = df_features.dropna()

        # one-hot encoding price_range
        df_feat = pd.concat([df_features.drop(['price_range'], axis=1),
                             pd.get_dummies(df_features['price_range'])], axis=1)
        df_feat.reset_index(inplace=True, drop=True)

        # feature hashing of the city column
        fh = FeatureHasher(n_features=7, input_type='string')
        hashed_features = fh.fit_transform(df_feat['city'])
        hashed_features = hashed_features.toarray()
        hashed_df = pd.DataFrame(hashed_features,
                                 columns=['city_1', 'city_2', 'city_3', 'city_4',
                                          'city_5', 'city_6', 'city_7'])
        df_hashed = pd.concat([df_feat.drop('city', axis=1), hashed_df], axis=1)

        df_hashed['review1'] = df_hashed.apply(lambda row: row['reviews'][0], axis=1)
        df_hashed['review2'] = df_hashed.apply(lambda row: row['reviews'][1] if len(row['reviews']) == 2 else np.NaN, axis=1)
        df_hashed = df_hashed.dropna()
        df_hashed['review1_sentiment'] = df_hashed.apply(lambda row: self.sentiment_analyzer(row['review1']), axis=1)
        df_hashed['review2_sentiment'] = df_hashed.apply(lambda row: self.sentiment_analyzer(row['review2']), axis=1)
        df_hashed.drop(['review1', 'review2', 'reviews'], axis=1, inplace=True)

        self.log_file = self.loggerObj.write_log(self.log_file, 'Data preprocessing has completed. Exiting the preprocess_training_data method of dataPreprocessor class.')

        return df_hashed
    except Exception as e:
        self.log_file = self.loggerObj.write_log(self.log_file, 'An error occurred in the preprocess_training_data method of dataPreprocessor class. The exception is ' + str(e))
        self.log_file = self.loggerObj.write_log(self.log_file, 'Exiting the preprocess_training_data method of dataPreprocessor class.')
        self.log_file.to_csv("Logs\\Prediction Logs\\prediction_logs.csv")
        raise Exception
# +
races = df.race.value_counts()
races = races[races < 2].index
df.race.replace(to_replace=races, value='other').value_counts()
# Note: this would be saved back into the dataframe with
# df['race'] = df.race.replace(to_replace=races, value='other')
# -

# Another option is to produce a hash with a lower dimension than the one-hot columns
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.FeatureHasher.html
fh = FeatureHasher(n_features=3, input_type='string')
hashed_features = fh.fit_transform(df['race'].astype(str)).todense()
hashed_features = pd.DataFrame(hashed_features).add_prefix('race_')
pd.concat([df[['race']], hashed_features], ignore_index=True, axis=1)

# ## Numeric features
# The set has two numeric variables, *Weight* and *Height*; let's look at their distribution

# +
def plot_weight_vs_height(df, title=""):
    fig = px.scatter(
        df.dropna(),
        x="weight",
        y="height",
        color="alignment",
if __name__ == '__main__':
    data_frame = read_training_file('/Users/prateek.jain/work/datasets/kaggle-competition/sf-crime/train.csv')
    labels = data_frame['Category']
    pd_frame = data_frame['PdDistrict']
    resolution = data_frame['Resolution']
    data_frame.drop(['Category'], inplace=True, axis=1)
    training_data = pd.concat([pd_frame, resolution], axis=1)
    training_data = data_frame.as_matrix(['PdDistrict', 'Address'])

    regr = linear_model.LinearRegression()
    # gnb = LinearSVC()
    print 'Made it till here-1'

    fh = FeatureHasher(input_type='string', non_negative=True)
    X = fh.fit_transform(training_data)
    fhy = FeatureHasher(input_type='string', non_negative=True)
    Y = fhy.fit_transform(labels)

    knn_prediction = regr.fit(X, Y)
    print(regr.coef_)
    prediction = regr.predict(X)
    # note: this scores the model against its own predictions, not held-out labels
    print regr.score(X, prediction)
    print 'Made it till here-2'
    print prediction
    # print X.toarray()
    # print 'Made it till here-3'
temp_dict = {'Cold': 1, 'Warm': 2, 'Hot': 3}
dnew['Ord_2_encod'] = dnew.ord_2.map(temp_dict)
dnew = dnew.drop(['ord_2'], axis=1)

# Binary encoding
from category_encoders import BinaryEncoder

encoder = BinaryEncoder(cols=['ord_2'])
newdata = encoder.fit_transform(df['ord_2'])
df = pd.concat([df, newdata], axis=1)
df = df.drop(['ord_2'], axis=1)
df.head(10)

# Hash encoding
from sklearn.feature_extraction import FeatureHasher

h = FeatureHasher(n_features=3, input_type='string')
hashed_Feature = h.fit_transform(df['nom_0'])
hashed_Feature = hashed_Feature.toarray()
df = pd.concat([df, pd.DataFrame(hashed_Feature)], axis=1)
df.head(10)

df.insert(6, "Target", [0, 1, 1, 0, 0, 1, 0, 0, 0, 1], True)

# Mean encoding / target encoding
mean = train['target'].mean()
agg = train.groupby(col)['target'].agg(['count', 'mean'])
counts = agg['count']
means = agg['mean']
weight = 100
# smoothed category means: blend of per-category mean and the global mean
smooth = ((counts * means) + (weight * mean)) / (counts + weight)
train.loc[:, "{}_mean_encode".format(col)] = train[col].map(smooth)
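# The target-encoding block above assumes a DataFrame `train` with a 'target'
# column and a categorical column name bound to `col`; a toy setup that makes
# it runnable (values are illustrative):
train = pd.DataFrame({'city': ['a', 'a', 'b', 'b', 'b'],
                      'target': [1, 0, 1, 1, 0]})
col = 'city'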
feature["FromTimestamp"] = click[1] feature["ToTimestamp"] = 0 feature["ItemId"] = click[2] feature["Category"] = click[3] feature["Price"] = 0 feature["Quantitiy"] = 0 X.append(feature) sys.stderr.write("\rProgress:%.2f%%" % (100. * i / len(clicks))) # make dictvect print "make dict vect" v = DictVectorizer() X_dict_sparse = v.fit_transform(X) X_dict = [zip(map(str, row.indices), row.data) for row in X_dict_sparse] # Feature Hashing print "Feature Hashing" n_features = 2**24 hasher = FeatureHasher(n_features=n_features, input_type='pair') X_hash_sparse = hasher.fit_transform(X_dict) X_hash = [zip(row.indices, row.data) for row in X_hash_sparse] # make libsvm data with open("./data/yoochoose-train.dat", "w") as f: for val, features in zip(c, X_hash): features_list = [] for feature in features: features_list.append(str(feature[0]) + ":" + str(feature[1])) features_line = " ".join(features_list) f.write(str(val) + " " + features_line + "\n")
# In[18]:

gen_onehot_features = pd.get_dummies(poke_df['Generation'])
gen_effect_features = gen_onehot_features.iloc[:, :-1]
gen_effect_features.loc[np.all(gen_effect_features == 0, axis=1)] = -1.
pd.concat([poke_df[['Name', 'Generation']], gen_effect_features], axis=1).iloc[4:10]

# ## Feature Hashing scheme

# In[19]:

unique_genres = np.unique(vg_df[['Genre']])
print("Total game genres:", len(unique_genres))
print(unique_genres)

# In[20]:

from sklearn.feature_extraction import FeatureHasher

fh = FeatureHasher(n_features=6, input_type='string')
# hash whole genre names; a bare Series of strings would be hashed character by character
hashed_features = fh.fit_transform(vg_df['Genre'].apply(lambda g: [g]))
hashed_features = hashed_features.toarray()
pd.concat([vg_df[['Name', 'Genre']], pd.DataFrame(hashed_features)], axis=1).iloc[1:7]

# In[21]:

fh.get_params()
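# With only 6 hash buckets for a dozen genres, bucket collisions are
# unavoidable by the pigeonhole principle. A hedged sketch (toy genre list,
# not the actual vg_df data) that reports which categories share a bucket:
from sklearn.feature_extraction import FeatureHasher
import numpy as np

genres = ['Action', 'Sports', 'Puzzle', 'Racing', 'RPG', 'Strategy', 'Shooter']

fh = FeatureHasher(n_features=6, input_type='string')
X = fh.transform([[g] for g in genres]).toarray()

# each row has exactly one nonzero entry; its index is the genre's bucket
buckets = [int(np.flatnonzero(row)[0]) for row in X]
for i in range(len(genres)):
    for j in range(i + 1, len(genres)):
        if buckets[i] == buckets[j]:
            print(genres[i], "and", genres[j], "share bucket", buckets[i])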
def main():
    storting_csv = sys.argv[1]
    annotations_path = sys.argv[2]
    loc = os.path.dirname(os.path.abspath(__file__))
    stopwords = [w for w in codecs.open(os.path.join(loc, 'stop.txt'), 'r', 'utf8').read().split()
                 if not w.startswith('|')]
    csv_reader = csv.DictReader(open(storting_csv))
    examples = []
    #v = DictVectorizer(sparse=False)
    v = FeatureHasher()
    print('Reading speeches and extracting features...')
    for speech in csv_reader:
        if speech['title'] == 'Representant':
            sys.stdout.write(speech['id'])
            sys.stdout.write("\b" * len(speech['id']))
            metadata = {}
            for name in csv_reader.fieldnames:
                if name != 'text':
                    metadata[name] = speech[name]
            label = metadata['party_id']
            example = Example(label, metadata=metadata)
            annotations = codecs.open(os.path.join(annotations_path, '%s.tsv' % (speech['id'])), 'r', 'utf8').read()
            sentlengths = []
            for sentence in parse_conll(annotations):
                sentlengths.append(float(len(sentence)))
                for token in sentence:
                    if token[1] not in stopwords:
                        #example.add_feature('#token:' + token[1])
                        example.add_feature('#lemma-pos:%s-%s' % (token[2], token[3]))
            average_sent_length = sum(sentlengths) / len(sentlengths)
            example.add_feature('#avg-s-length:%s' % (average_sent_length))
            examples.append(example)
    print()
    print('Done!')
    print('Vectorizing...')
    X = v.fit_transform([e.features for e in examples])
    print('Done!')
    print('Tfidf weighting...')
    t = TfidfTransformer()
    X = t.fit_transform(X)
    print('Done!')
    print('Binning vectors...')
    parties = {}
    for e, x in zip(examples, X):
        if e.label not in parties:
            parties[e.label] = {}
        year = int(e.metadata['date'].split('-')[0])
        if year not in parties[e.label]:
            parties[e.label][year] = []
        parties[e.label][year].append(x)
    print('Done!')
    # for p in parties:
    #     print(sorted(parties[p].keys()))
    results = {}
    for p in tqdm(parties, desc='Computing similarities:'):
        results[p] = {}
        for year in tqdm(parties[p], desc=p):
            results[p][year] = []
            for i, x in enumerate(tqdm(parties[p][year], desc=str(year))):
                for j, y in enumerate(parties[p][year]):
                    if j != i:
                        score = cosine_similarity(x, y)[0][0]
                        results[p][year].append(score)
    print('Done!')
    print('Saving results...')
    na_counter = 0
    for p in results:
        if not p:
            out = open('na_%s' % (na_counter) + '.out', 'w')
            na_counter += 1
        else:
            out = open(p + '.out', 'w')
        years = sorted(results[p].keys())
        for y in years:
            try:
                avg = sum(results[p][y]) / len(results[p][y])
            except ZeroDivisionError:
                avg = 0
            out.write("%s\t%s\n" % (y, avg))
        out.close()
    print('All done!')

# for i, x in enumerate(X):
#     for j, y in enumerate(X):
#         if j != i:
#             #print(cosine_similarity(x.reshape(1, -1), y.reshape(1, -1))[0][0])
#             print(cosine_similarity(x, y)[0][0])

print('done')
    count = len(Counter(df[c]))
    print("%s: %i" % (c, count))
    if count > N_FEATURES:
        cols_to_hash.append(c)

# use the hashing trick on high-cardinality columns to bound the final
# number of features, since the following features have too many labels:
# block: 6747
# apartment_number: 3834
cols_not_hash = [c for c in cols_categorical if c not in cols_to_hash]
print("one hot encode %s" % cols_not_hash)
df = pd.get_dummies(data=df, drop_first=True, columns=cols_not_hash)
y = df['sale_price']
for col in cols_to_hash:
    print("hash encode %s" % col)
    encoder = FeatureHasher(n_features=N_FEATURES, input_type='string')
    # hash each value as a whole token (a bare string would be hashed per
    # character); FeatureHasher is stateless, so no target is needed
    encoded = encoder.fit_transform([[str(v)] for v in df[col].values])
    df_encoded = pd.DataFrame(
        encoded.toarray(),
        columns=["%s_hash_%i" % (col, i) for i in range(N_FEATURES)])
    df = pd.concat([df, df_encoded], axis=1)
    df.drop(col, axis=1, inplace=True)

# move sale_price to the last column for processing's sake
cols = list(df)
col_y = cols.pop(cols.index('sale_price'))
cols.append(col_y)
df = df.loc[:, cols]  # .ix is deprecated; .loc works for label-based selection
df.to_csv("../data/encoded.csv", index=False)
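# The pattern above, one-hot for low-cardinality columns and hashing for the
# rest, can be packaged into a helper. A sketch under assumed names
# (df, cat_cols), not the script's actual code:
import pandas as pd
from sklearn.feature_extraction import FeatureHasher

def encode_mixed_cardinality(df, cat_cols, n_features=10):
    """One-hot encode columns with <= n_features levels; hash the rest."""
    low = [c for c in cat_cols if df[c].nunique() <= n_features]
    high = [c for c in cat_cols if c not in low]
    out = pd.get_dummies(df, columns=low, drop_first=True)
    for col in high:
        hasher = FeatureHasher(n_features=n_features, input_type='string')
        hashed = hasher.fit_transform([[str(v)] for v in out[col]])
        names = ["%s_hash_%i" % (col, i) for i in range(n_features)]
        hashed_df = pd.DataFrame(hashed.toarray(), columns=names, index=out.index)
        out = pd.concat([out.drop(col, axis=1), hashed_df], axis=1)
    return out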
def cluster(self, dataset):
    """
    Clusters the provided data into the number of clusters set by
    self.numberOfClusters.

    dataset: dict where
        dataset['data']   = data in a 2d array
        dataset['labels'] = labels for each array

    returns
    -------
    a list of clusters, where a cluster is a list of station names
    """
    outputlabels = []   # the set of stations per cluster
    outputdata = []     # list of sets of artists per cluster
    finaloutputdata = []
    hasher = FeatureHasher(input_type="string")
    transformer = TfidfTransformer()
    km = KMeans(n_clusters=self.numberOfClusters, init="k-means++",
                max_iter=10, n_init=1, verbose=0)
    # edit the dataset so that it contains only the artist name and not
    # the artist popularity
    artistdataset = dataset["data"]
    newartistdataset = []
    for i in range(0, len(artistdataset)):
        if len(artistdataset[i]) != 0:
            newartistdataset.append(artistdataset[i][0][0])
    # if the number of artists is not enough, get more artists here!!!
    print("clustering " + str(len(artistdataset)) + " artists")
    if len(artistdataset) < self.maximumArtistsToCluster:
        print("we need more artists to cluster")
        self.getMoreArtists(artistdataset)
    # note: with input_type="string", each artist-name string is iterated
    # character by character, so artists are clustered on character frequencies
    datacounts = hasher.fit_transform(newartistdataset)
    # tfidfcounts = transformer.fit_transform(datacounts)
    # disabled tf-idf because too slow
    # km.fit(tfidfcounts)
    km.fit(datacounts)
    labeleddata = km.labels_
    # init output arrays (assumes no rows were filtered out above, so that
    # labeleddata stays aligned with dataset["labels"])
    for i in range(0, len(set(labeleddata))):
        outputlabels.append([])
        outputdata.append([])
    # add items to output arrays
    for i in range(0, len(labeleddata)):
        currentcluster = labeleddata[i]
        outputlabels[currentcluster].append(dataset["labels"][i])
        outputdata[currentcluster].append(dataset["data"][i])
    # change the artist lists to artist sets
    for item in outputdata:
        listofartists = []
        for artistlist in item:
            for artist in artistlist:
                listofartists.append(artist)
        finaloutputdata.append(list(set(listofartists)))
    return {"labels": outputlabels, "data": finaloutputdata}
if True and FIT:
    est = LogisticRegression(multi_class='auto', solver='liblinear')
    t1 = time.time()
    est.fit(X_train, y_train)
    print(f"Score {est.score(X_test, y_test)} in {time.time()-t1}")

print("\nFeatureHasher")
print("FeatureHasher on frequency dicts")
n_features = 1048576  # FeatureHasher's default width (2**20)
#n_features = int(1048576 / 2)
hasher = FeatureHasher(n_features=n_features)
t1 = time.time()
X_train = hasher.fit_transform(token_freqs(d) for d in X_train_text)
X_test = hasher.transform(token_freqs(d) for d in X_test_text)
print(f"FeatureHasher X_train shape {X_train.shape} with {X_train.data.nbytes:,} bytes "
      f"and nnz {X_train.nnz:,} in {time.time()-t1}")

if FIT:
    est = LogisticRegression(multi_class='auto', solver='liblinear')
    t1 = time.time()
    est.fit(X_train, y_train)
    print(f"Score {est.score(X_test, y_test)} in {time.time()-t1}")

#NGRAM_MAX 1
#CountVectorizer
#CountVectorizer shape (8485, 112359) with 10,723,592 bytes and nnz 1,340,449
#Vocab length 112359
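# token_freqs is not defined in this excerpt; a plausible minimal
# implementation, matching the scikit-learn FeatureHasher-vs-DictVectorizer
# benchmark this script resembles, would be:
import re
from collections import defaultdict

def token_freqs(doc):
    """Map a document string to a {token: count} dict for FeatureHasher."""
    freq = defaultdict(int)
    for tok in re.findall(r"\w+", doc.lower()):
        freq[tok] += 1
    return freq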
del orltest
feature = orldata.columns.values.tolist()
orldata = orldata.astype(object)  # astype returns a copy, so assign it back
orldata.dtypes.value_counts()
sample = orldata.iloc[0:100, :]

from sklearn.feature_extraction import FeatureHasher

bin_columns_name = ['pkgname', 'ver', 'adunitshowid', 'mediashowid', 'apptype',
                    'city', 'reqrealip', 'idfamd5', 'openudidmd5', 'model',
                    'make', 'osv']
for i in bin_columns_name:
    fh = FeatureHasher(n_features=5, input_type='string')
    orldata[i] = orldata[i].astype('str')
    # hash each value as a single token; a bare string column would be
    # hashed character by character
    hashed_features = fh.fit_transform([[v] for v in orldata[i]])
    hashed_features = pd.DataFrame(hashed_features.toarray())
    hashed_features.columns = [i + str(n) for n in range(5)]
    orldata = orldata.join(hashed_features)
    orldata = orldata.drop(columns=i)

oh_columns = ['os', 'lan']
orldata_oh = pd.get_dummies(orldata[oh_columns].astype('object'))
orldata_oh = orldata_oh.reset_index(drop=True)
orldata = orldata.join(orldata_oh)
# orldata = orldata.drop(columns=oh_columns)
orldata = orldata.drop(columns='sid')
label = orldata['label']
# Hashing
# HashingEncoder uses the md5 hashing algorithm by default. A feature with 5
# categories can be represented using N new features; likewise, a feature
# with 100 categories can be transformed using the same N new features.
import category_encoders as ce
import pandas as pd

encoder = ce.HashingEncoder(cols='Var', n_components=6)
encoder.fit_transform(Df)

# 11. Feature Hashing
# Default is 8 columns.
# If the number of categories is, say, 12, feature hashing captures that
# information in a smaller number of variables: 6 in the example below.
from sklearn.feature_extraction import FeatureHasher

fh = FeatureHasher(n_features=6, input_type='string')
# wrap each value in a list so whole categories are hashed, not characters
hashed_features = fh.fit_transform([[v] for v in df['StringVar'].astype(str)])
hashed_features = hashed_features.toarray()
pd.DataFrame(hashed_features)  # hashed features

# 12. M-Estimate Encoder
%%time
MEE_encoder = ce.MEstimateEncoder()
train_mee = MEE_encoder.fit_transform(train[feature_list], target)
test_mee = MEE_encoder.transform(test[feature_list])

# 13. Target Encoder
%%time
TE_encoder = ce.TargetEncoder()
train_te = TE_encoder.fit_transform(train[feature_list], target)
test_te = TE_encoder.transform(test[feature_list])
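# To make the fixed-width claim above concrete: the output width is
# n_components regardless of cardinality (toy frames, illustrative names).
import pandas as pd
import category_encoders as ce

df5 = pd.DataFrame({'Var': list('abcde')})                      # 5 categories
df100 = pd.DataFrame({'Var': ['c%d' % i for i in range(100)]})  # 100 categories

print(ce.HashingEncoder(cols='Var', n_components=6).fit_transform(df5).shape)    # (5, 6)
print(ce.HashingEncoder(cols='Var', n_components=6).fit_transform(df100).shape)  # (100, 6)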
def make_regression_data(num_examples=100, train_test_ratio=0.5, num_features=2, sd_noise=1.0, use_feature_hashing=False, feature_bins=4, start_feature_num=1, random_state=1234567890): # if we are doing feature hashing and we have asked for more # feature bins than number of total features, we need to # handle that because `make_regression()` doesn't know # about hashing if use_feature_hashing and num_features < feature_bins: num_features = feature_bins # use sklearn's make_regression to generate the data for us X, y, weights = make_regression(n_samples=num_examples, n_features=num_features, noise=sd_noise, random_state=random_state, coef=True) # since we want to use SKLL's FeatureSet class, we need to # create a list of IDs ids = ['EXAMPLE_{}'.format(n) for n in range(1, num_examples + 1)] # create a list of dictionaries as the features index_width_for_feature_name = int(floor(log10(num_features))) + 1 feature_names = [] for n in range(start_feature_num, start_feature_num + num_features): index_str = str(n).zfill(index_width_for_feature_name) feature_name = 'f{}'.format(index_str) feature_names.append(feature_name) features = [dict(zip(feature_names, row)) for row in X] # At this point the labels are generated using unhashed features # even if we want to do feature hashing. `make_regression()` from # sklearn doesn't know anything about feature hashing, so we need # a hack here to compute the updated labels ourselves # using the same command that sklearn uses inside `make_regression()` # which is to generate the X and the weights and then compute the # y as the dot product of the two. This y will then be used as our # labels instead of the original y we got from `make_regression()`. # Note that we only want to use the number of weights that are # equal to the number of feature bins for the hashing if use_feature_hashing: feature_hasher = FeatureHasher(n_features=feature_bins) hashed_X = feature_hasher.fit_transform(features) y = hashed_X.dot(weights[:feature_bins]) # convert the weights array into a dictionary for convenience # if we are using feature hashing, we need to use the names # that would be output by `model_params()` instead of the # original names since that's what we would get from SKLL if use_feature_hashing: index_width_for_feature_name = int(floor(log10(feature_bins))) + 1 hashed_feature_names = [] for i in range(feature_bins): index_str = str(i + 1).zfill(index_width_for_feature_name) feature_name = 'hashed_feature_{}'.format(index_str) hashed_feature_names.append(feature_name) weightdict = dict(zip(hashed_feature_names, weights[:feature_bins])) else: weightdict = dict(zip(feature_names, weights)) # split everything into training and testing portions num_train_examples = int(round(train_test_ratio * num_examples)) train_features, test_features = (features[:num_train_examples], features[num_train_examples:]) train_y, test_y = y[:num_train_examples], y[num_train_examples:] train_ids, test_ids = ids[:num_train_examples], ids[num_train_examples:] # create a FeatureHasher if we are asked to use feature hashing # with the specified number of feature bins vectorizer = (FeatureHasher( n_features=feature_bins) if use_feature_hashing else None) train_fs = FeatureSet('regression_train', train_ids, labels=train_y, features=train_features, vectorizer=vectorizer) test_fs = FeatureSet('regression_test', test_ids, labels=test_y, features=test_features, vectorizer=vectorizer) return (train_fs, test_fs, weightdict)
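# Hypothetical usage of the generator above (assumes SKLL is installed and
# the function's imports are in scope); with feature_bins=4 the returned
# weight dict is keyed by the hashed feature names:
train_fs, test_fs, weights = make_regression_data(num_examples=200,
                                                  num_features=4,
                                                  use_feature_hashing=True,
                                                  feature_bins=4)
print(sorted(weights))  # ['hashed_feature_1', ..., 'hashed_feature_4']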
    item['browser_family'] = parsed_string['user_agent']['family'] if parsed_string['user_agent']['family'] else 'N/a'
    item['os_family'] = parsed_string['os']['family'] if parsed_string['os']['family'] else 'N/a'
    del item['http_user_agent']
    del item['http_referer']
    del item['time_local']
    del item['request']
    del item['version']
    yield item

tic = time.time()
vec = FeatureHasher()
items = list(load_data())
# trains, tests = train_test_split(items, train_size=0.8)
X_train = vec.fit_transform(items)
print("Total", len(items))
# print("Train", len(trains))
# print("Test", len(tests))
print("Done fit train")
# X_test = vec.transform(tests)
print("Done fit test")

# fit the model
# clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
rng = np.random.RandomState(42)
clf = IsolationForest(random_state=rng, n_estimators=10)
print("Start Fit Model")
clf.fit(X_train)
print("Done Fit Model")
    'feature_4': 10.0
}]

# use DictVectorizer
dv = DictVectorizer()
Ydict = dv.fit_transform(data)
print('the array encoded by DictVectorizer is:')
print(Ydict)
print('as a dense array:')
print(Ydict.todense())
print("the DictVectorizer's `vocabulary_` is:")
print(dv.vocabulary_)

# use FeatureHasher
dh = FeatureHasher()
Yhash = dh.fit_transform(data)
print('the array encoded by FeatureHasher is:')
print(Yhash)
print('as a dense array:')
YhashArray = Yhash.todense()
print(YhashArray)
print('the shape of the dense array from FeatureHasher is:')
print(YhashArray.shape)

# use a one-hot encoder to expand the first (categorical) column;
# categorical_features was removed from OneHotEncoder, so use ColumnTransformer
from sklearn.compose import ColumnTransformer
data = [[0, 10], [1, 11], [2, 8], [3, 12], [0, 15]]
ct = ColumnTransformer([('onehot', OneHotEncoder(), [0])], remainder='passthrough')
Yoh = ct.fit_transform(data)
print('the array encoded by OneHotEncoder is:')
print(Yoh)
print('as a dense array:')
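# One thing the printouts above make obvious: FeatureHasher defaults to
# 2**20 columns, while DictVectorizer's width equals the vocabulary size.
# A small sketch of pinning the hasher's width (toy dicts):
from sklearn.feature_extraction import DictVectorizer, FeatureHasher

data = [{'feature_1': 1.0, 'feature_2': 2.0},
        {'feature_1': 3.0, 'feature_3': 4.0}]

print(DictVectorizer().fit_transform(data).shape)             # (2, 3): one column per key
print(FeatureHasher().fit_transform(data).shape)              # (2, 1048576): default 2**20
print(FeatureHasher(n_features=8).fit_transform(data).shape)  # (2, 8): fixed width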
def hash_tweets(tweetlist):
    """Hash each tweet's text and return a sparse matrix representation."""
    hasher = FeatureHasher(input_type="string")
    # split each tweet into word tokens; passing the raw text string would
    # make FeatureHasher hash individual characters instead of words
    hashed_tweets = hasher.fit_transform(tweet['text'].split() for tweet in tweetlist)
    return hashed_tweets
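# A minimal usage sketch, with made-up tweets in the dict shape the function
# expects:
tweets = [
    {'text': 'feature hashing scales to huge vocabularies'},
    {'text': 'the hashing trick keeps memory bounded'},
]
X = hash_tweets(tweets)
print(X.shape)  # (2, 1048576): FeatureHasher's default n_features is 2**20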