Code example #1
def hash(data, labels, new_dimension):
    print "start hashing trick..."
    # convert features as dict
    dictList = list()
    if hasattr(data, "indices"):
        #ind = data.indices
        #dat = data.data
        data = data.toarray()
        indices = range(len(data[0]))
        for item in data:
            zipped = zip(indices, item)
            row = dict()
            for index,value in zipped:
                if value != 0:
                    row[str(index)] = value
            dictList.append(row)

    else:
        indices = [str(i) for i in range(len(data[0]))]  # a list, so it can be reused for every row under Python 3
        for row in data:
            dictList.append(dict(zip(indices, row)))

    start = time.time()
    hasher = FeatureHasher(n_features=new_dimension) # , input_type='dict'
    reduced = hasher.fit_transform(dictList).toarray()
    end = time.time()
    return (reduced, end-start)
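
A minimal usage sketch for the helper above (the input array is hypothetical, and it assumes the snippet's module also has `import time` and `from sklearn.feature_extraction import FeatureHasher`):

import numpy as np

# three samples with four original features (made-up data)
X = np.array([[1.0, 0.0, 2.0, 0.0],
              [0.0, 3.0, 0.0, 1.0],
              [4.0, 0.0, 0.0, 0.0]])

reduced, elapsed = hash(X, labels=None, new_dimension=2)
print(reduced.shape)  # (3, 2): the four columns are hashed down to two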
Code example #2
File: nn_CTR.py  Project: Nikhil112/CTR_prediction
def hash_array(feature_dict, feature_num):
	# print feature_dict[0]
	if feature_num == 1:
		x_new = np.asarray(feature_dict)
		x_h = x_new.reshape(len(feature_dict), 1)
	else:
		hasher = FeatureHasher(n_features=feature_num, non_negative=True, input_type='dict')
		X_new = hasher.fit_transform(feature_dict)
		x_h = X_new.toarray()
		# vec = DictVectorizer()
		# x_h = vec.fit_transform(feature_dict).toarray()
		# print x_h.shape, type(x_h)
	return x_h
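
Note that non_negative=True was removed from FeatureHasher in newer scikit-learn releases. A rough equivalent of the hashing branch above on current versions (a sketch, not the project's code) uses alternate_sign=False, which keeps the output non-negative whenever the input values are:

from sklearn.feature_extraction import FeatureHasher

def hash_array_current(feature_dict, feature_num):
    # alternate_sign=False stands in for the removed non_negative=True:
    # the random sign flip is disabled, so non-negative inputs stay non-negative
    hasher = FeatureHasher(n_features=feature_num,
                           alternate_sign=False,
                           input_type='dict')
    return hasher.fit_transform(feature_dict).toarray()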
Code example #3
def train_regression_model(dataset, labelset, epochs=1000, resample=False, remove=False):
    label_distribution = Counter(labelset)

    classif = Perceptron()
    #classif = LogisticRegression(solver='liblinear', penalty='l1')#, class_weight='balanced')

    #print('## train sample distribution', label_distribution)

    FH = FeatureHasher()
    dataset = FH.fit_transform(dataset)
    #dataset = transform_vectors(dataset)
    #samp, nx, ny = dataset.shape
    #dataset = dataset.reshape((samp, nx*ny))

    if resample:
        resample_dataset(dataset, labelset)

    return classif.fit(dataset, labelset), FH
Code example #4

def run_hash_trick(df, columns, table):

    replace_col_names = [
        cols for cols in df.columns
        if len(re.findall('_{}$'.format(columns), cols)) > 0
    ]
    df[replace_col_names] = df[replace_col_names].astype(str)
    hasher = FeatureHasher(
        n_features=HASH_TRICK_FEATURES['{}_hash_features'.format(table)],
        input_type="string")
    hashed_features = hasher.fit_transform(
        df[replace_col_names].values).todense()
    hashed_features = pd.DataFrame(hashed_features)
    hashed_features.columns = [
        '{}_seq_feat{}'.format(table, i)
        for i in range(HASH_TRICK_FEATURES['{}_hash_features'.format(table)])
    ]

    df = df.drop(replace_col_names, axis=1)
    return (df, hashed_features)
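
A hedged usage sketch for run_hash_trick (the HASH_TRICK_FEATURES config, the column names, and the table name below are invented for illustration; the real module defines its own):

import re
import pandas as pd
from sklearn.feature_extraction import FeatureHasher

# hypothetical module-level config that run_hash_trick looks up
HASH_TRICK_FEATURES = {'orders_hash_features': 4}

df = pd.DataFrame({'color_cat': ['red', 'blue', 'red'],
                   'size_cat': ['S', 'M', 'L'],
                   'price': [1.0, 2.0, 3.0]})

# hashes every column ending in "_cat" into 4 columns named orders_seq_feat0..3
df_rest, hashed = run_hash_trick(df, 'cat', 'orders')
print(df_rest.columns.tolist())  # ['price']
print(hashed.shape)              # (3, 4)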
Code example #5
File: features.py  Project: Renwoxin/CIDDS
def hash_features(basic_features):
    """

    Args:
        basic_features:

    Returns:

    """

    h = FeatureHasher(n_features=20,
                      input_type='string',
                      dtype=int,
                      alternate_sign=False)
    features_ = [
        str(basic_features.values.tolist()[i])
        for i in range(len(basic_features.values.tolist()))
    ]
    features = h.fit_transform(features_)

    return features
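
A small usage sketch for hash_features, assuming the function above and its FeatureHasher import are in scope (the column names are invented; each row's list representation is turned into one string, so the hasher effectively counts characters into 20 buckets):

import pandas as pd

# hypothetical flow records
basic_features = pd.DataFrame({'proto': ['TCP', 'UDP', 'TCP'],
                               'bytes': [1032, 88, 40]})

features = hash_features(basic_features)
print(features.shape)         # (3, 20), a sparse matrix of integer counts
print(features.toarray()[0])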
Code example #6
File: regCB.py  Project: VowpalWabbit/coba
class RegCBLearner(Learner):
    """A learner using the RegCB algorithm by Foster et al.
        and the online bin search implementation by Bietti et al. 

    References:
        Foster, Dylan, Alekh Agarwal, Miroslav Dudík, Haipeng Luo, and Robert Schapire.
        "Practical contextual bandits with regression oracles." In International 
        Conference on Machine Learning, pp. 1539-1548. PMLR, 2018.

        Bietti, Alberto, Alekh Agarwal, and John Langford.
        "A contextual bandit bake-off." arXiv preprint 
        arXiv:1802.04064 (2018).
    """

    @property
    def family(self) -> str:
        """The family of the learner.

        See the base class for more information
        """
        return f"RegCB"

    @property
    def params(self) -> Dict[str, Any]:
        """The parameters of the learner.

        See the base class for more information
        """
        return {'beta': self._beta, 'alpha': self._alpha, 'interactions': self._interactions}

    def __init__(self, *, beta: float, alpha: float, learning_rate:float=0.1, interactions: Sequence[str] = ['a', 'ax']) -> None:
        """Instantiate a RegCBLearner.

        Args:
            beta : square-loss tolerance
            alpha: confidence bounds precision
            interactions: the set of interactions the learner will use. x refers to context and a refers to actions, 
                e.g. xaa would mean interactions between context, actions and actions. 
        """

        PackageChecker.sklearn("RegCBLearner")
        from sklearn.feature_extraction import FeatureHasher
        from sklearn.preprocessing import PolynomialFeatures

        self._beta  = beta
        self._alpha = alpha
        self._iter  = 0

        self._core_model = []

        self._times         = [0,0,0,0]
        self._interactions  = interactions
        self._terms         = []
        self._learning_rate = learning_rate

        for term in self._interactions:
            term = term.lower()
            x_num = term.count('x')
            a_num = term.count('a')

            if x_num + a_num != len(term):
                raise Exception("Letters other than x and a were passed for parameter interactions. Please remove other letters/characters.")

            self._terms.append((x_num, a_num))

        max_x_term = max(max(term[0] for term in self._terms),1)
        max_a_term = max(max(term[1] for term in self._terms),1)

        self._x_p = PolynomialFeatures(degree=max_x_term, include_bias=False, interaction_only=False)
        self._a_p = PolynomialFeatures(degree=max_a_term, include_bias=False, interaction_only=False)
        self._h   = FeatureHasher(input_type='pair')

    def predict(self, key: Key, context: Context, actions: Sequence[Action]) -> Sequence[float]:
        """Determine a PMF with which to select the given actions.

        Args:
            key: The key identifying the interaction we are choosing for.
            context: The context we're currently in. See the base class for more information.
            actions: The actions to choose from. See the base class for more information.

        Returns:
            The probability of taking each action. See the base class for more information.
        """

        import numpy as np
        from scipy import sparse

        if self._iter == 0:
            if isinstance(context,dict) or isinstance(actions[0],dict):
                self._core_model = sparse.csr_matrix(self._featurize(context, actions[0]).shape)
            else:
                self._core_model = np.zeros(self._featurize(context, actions[0]).shape)

        if self._iter == 200:
            self._times = [0,0,0,0]

        if (self._iter < 200):
            return [1/len(actions)] * len(actions)

        else:
            maxScore  = -float('inf')
            maxAction = None

            for action in actions:
                features = self._featurize(context,action)
                score = self._bin_search(features, len(actions))

                if score > maxScore:
                    maxAction = action
                    maxScore  = score

            return [int(action == maxAction) for action in actions]

    def learn(self, key: Key, context: Context, action: Action, reward: float, probability: float) -> None:
        """Learn from the given interaction.

        Args:
            key: The key identifying the interaction this observed reward came from.
            context: The context we're learning about. See the base class for more information.
            action: The action that was selected in the context. See the base class for more information.
            reward: The reward that was gained from the action. See the base class for more information.
            probability: The probability that the given action was taken.
        """

        start = time.time()
        features = self._featurize(context, action)
        self._core_model = self._update_model(self._core_model, features, reward, 1)
        self._times[2] += time.time()-start

        self._iter += 1

        # if (self._iter-200-1) % 50 == 0 and self._iter > 200:
        #     print(f'avg phi time: {round(self._times[0]/(self._iter-200),2)}')
        #     print(f'avg bin time: {round(self._times[1]/(self._iter-200),2)}')
        #     print(f'avg lrn time: {round(self._times[2]/(self._iter-200),2)}')

    def _bin_search(self, features, K_t) -> float:

        start = time.time()

        y_u = 2
        w   = 1

        f_u_a_w = self._update_model(self._core_model, features, y_u, w)
        f_x_t_a = self._predict_model(self._core_model, features)
        s_u_a   = (self._predict_model(f_u_a_w, features) - f_x_t_a) / w

        obj = lambda w: w*(f_x_t_a-y_u)**2 - w*(f_x_t_a+s_u_a*w-y_u)**2

        lower_search_bound = 0
        upper_search_bound = (f_x_t_a-y_u)/(-s_u_a)
        width_search_bound = upper_search_bound - lower_search_bound

        constraint = self._alpha * math.log(K_t)

        w_old = lower_search_bound
        w_now = lower_search_bound + 1/2*width_search_bound
        o     = obj(w_now)

        while abs(w_now-w_old) > width_search_bound*(1/2)**30 or o >= constraint:
            w_diff = abs(w_now-w_old)
            w_old  = w_now
            if o < constraint:
                w_now += w_diff/2
            else:
                w_now -= w_diff/2
            o = obj(w_now)

        self._times[1] += time.time() - start

        return f_x_t_a + s_u_a*w_now

    def _featurize(self, context, action):
        import numpy as np #type: ignore

        start = time.time()

        is_sparse = isinstance(context, dict) or isinstance(action, dict)

        if isinstance(context, dict):
            context_values = list(context.values())
            context_names  = list([ f"x{k}" for k in context.keys() ])
        else:
            context_values = (context or [1])
            context_names  = [''] if not is_sparse else [ f"x{i}" for i in range(len(context_values)) ]

        if isinstance(action, dict):
            action_names  = list([ f"a{k}" for k in action.keys() ])
            action_values = list(action.values())
        else:
            action_values = action
            action_names  = [''] if not is_sparse else [ f"a{i}" for i in range(len(action_values)) ]

        x_terms_by_degree = self._terms_by_degree(len(context_values), self._x_p.fit_transform([context_values])[0])
        a_terms_by_degree = self._terms_by_degree(len(action_values) , self._a_p.fit_transform([action_values])[0])
        features          = self._interaction_terms(x_terms_by_degree, a_terms_by_degree, [1])

        if is_sparse:
            x_names_by_degree = self._terms_by_degree(len(context_values), self._x_p.get_feature_names(context_names))
            a_names_by_degree = self._terms_by_degree(len(action_values) , self._a_p.get_feature_names(action_names))
            names             = self._interaction_terms(x_names_by_degree, a_names_by_degree, [''])

        final_features = np.array(features) if not is_sparse else self._h.fit_transform([list(zip(names,features))])

        self._times[0] += time.time() - start

        return final_features

    def _terms_by_degree(self, base_term_count:int, terms:Sequence[Any], with_bias:bool = False) -> Dict[int,Sequence[Any]]:
        terms_by_degree = {} 

        index  = 0 if not with_bias else 1
        degree = 1

        while index != len(terms):
            degree_terms_count = int((base_term_count**degree + base_term_count)/2)
            terms_by_degree[degree] = terms[index:degree_terms_count]

            index  += degree_terms_count
            degree += 1

        return terms_by_degree

    def _interaction_terms(self, x_terms_by_degree, a_terms_by_degree, default):

        import numpy as np

        interaction_terms = []

        for term in self._terms:
            x_for_degree = x_terms_by_degree.get(term[0], default)
            a_for_degree = a_terms_by_degree.get(term[1], default)

            if not isinstance(x_for_degree[0],str):
                outer = np.outer(x_for_degree, a_for_degree)
            else:
                outer = np.char.array(x_for_degree)[:,None] + np.char.array(a_for_degree)

            interaction_terms += outer.T.reshape((1,-1)).squeeze().tolist()

        return interaction_terms

    def _predict_model(self, model, features):
        import numpy as np
        import scipy.sparse as sp

        if sp.issparse(model):
            return model.multiply(features).data.sum()
        else:
            return np.dot(model, features)

    def _update_model(self, model, features, value, importance):
        error = self._predict_model(model, features) - value
        return model - self._learning_rate*features*error*importance
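
For reference, the input_type='pair' mode used by _featurize expects each sample to be an iterable of (feature_name, value) tuples. A minimal standalone sketch (the feature names are illustrative, mirroring the list(zip(names, features)) call above):

from sklearn.feature_extraction import FeatureHasher

h = FeatureHasher(n_features=2**20, input_type='pair')  # 2**20 is the default width

# one sample = the named interaction terms for a single (context, action) pair
sample = [('x0', 1.0), ('a0', 0.5), ('x0a0', 0.5)]
X = h.fit_transform([sample])
print(X.shape)  # (1, 1048576), a sparse row with (typically) three non-zero entries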
Code example #7
    }, {
        'feature_3': -2,
        'feature_4': 10
    }]

    # Vectorize the dictionary data
    print('Dictionary data vectorization')
    dv = DictVectorizer()
    Y_dict = dv.fit_transform(data)
    print(Y_dict.todense())

    print('Vocabulary:')
    print(dv.vocabulary_)

    # Feature hashing
    print('Feature hashing')
    fh = FeatureHasher()
    Y_hashed = fh.fit_transform(data)

    # Inspect the hashed features (feature hashing is one-way, so they cannot be decoded)
    print('Hashed features')
    print(Y_hashed.todense())

    # One-hot encoding
    data1 = [[0, 10], [1, 11], [1, 8], [0, 12], [0, 15]]

    # Encode data (categorical_features requires an older scikit-learn release; see the note below)
    oh = OneHotEncoder(categorical_features=[0])
    Y_oh = oh.fit_transform(data1)
    print(Y_oh.todense())
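
The categorical_features argument of OneHotEncoder was deprecated in scikit-learn 0.20 and removed in later releases; on current versions the same "encode column 0 only" step is usually written with a ColumnTransformer, roughly as below:

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

data1 = np.array([[0, 10], [1, 11], [1, 8], [0, 12], [0, 15]])

# one-hot encode column 0 only, pass column 1 through untouched
ct = ColumnTransformer([('onehot', OneHotEncoder(), [0])],
                       remainder='passthrough')
Y_oh = ct.fit_transform(data1)
print(Y_oh)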
Code example #8
#         data = pd.concat([data, one_hot_column], axis=1)
#         del data[colname]
#     return data


columns = [ 'workclass',  'marital-status', 'occupation', 'relationship', 'race', 'sex',  'native-country']
columns_prefix = [ 'workclass_px',  'marital-status_px', 'occupation_px', 'relationship_px', 'race_px', 'sex_px',  'native-country_px']

# columns_other = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

x_train_onehot = pd.get_dummies(x_train, prefix=columns_prefix, columns=columns)
# print(x_train.head())
print(x_train.shape)

fh = FeatureHasher(n_features=8, input_type='string')
sp = fh.fit_transform(x_train['native-country'])
df1 = pd.DataFrame(sp.toarray(), columns=['country_1', 'country_2','country_3','country_4','country_5','country_6', 'country_7','country_8'])
x_train_hashed = pd.concat([df1, x_train.drop(columns=['native-country'])], axis=1)


print('x_train\n', x_train.columns)
print('x_train_hashed\n', x_train_hashed.columns)
# print('x_train_onehot\n', x_train_onehot.columns)




from sklearn.model_selection import train_test_split
data_train, data_test, target_train, target_test = train_test_split(x_train_onehot, y_train, test_size=0.20, random_state=10 )

Code example #9
def load_data(file_name, size=-1):
    example_path = 'experiments_data'
    df = pd.read_csv(
        os.path.join(os.getcwd(), '../..',
                     f'{example_path}/{file_name}_data.csv'))
    if size > -1:
        df = df.sample(size, random_state=seed)

    if file_name == 'wine':
        y = df['y']
        df = df.drop(columns=['y'])
        return df, y

    elif file_name == 'fake_job_posting':
        df.fillna(" ", inplace=True)
        df['text'] = df['title'] + ' ' + df['location'] + ' ' + df['department'] + ' ' + df['company_profile'] + ' ' + \
                     df['description'] + ' ' + df['requirements'] + ' ' + df['benefits'] + ' ' + df['employment_type'] \
                     + ' ' + df['required_education'] + ' ' + df['industry'] + ' ' + df['function']
        return df['text'], df['fraudulent']

    elif file_name == 'hotel_bookings':
        X = df.drop(["is_canceled"], axis=1)
        y = df["is_canceled"]
        return X, y

    elif file_name == 'hr_employee_attrition':
        target_map = {'Yes': 1, 'No': 0}
        # Use the pandas apply method to numerically encode our attrition target variable
        y = df["Attrition"].apply(lambda x: target_map[x])
        X = df.drop(["Attrition"], axis=1)
        return X, y

    elif file_name == 'nomao':
        X = df.drop(["__TARGET__"], axis=1)
        y = df["__TARGET__"]
        return X, y

    elif file_name == 'placement_full_class':
        X = df[[
            'gender', 'ssc_p', 'ssc_b', 'hsc_p', 'hsc_b', 'hsc_s', 'degree_p',
            'degree_t', 'workex', 'etest_p', 'specialisation', 'mba_p'
        ]]
        y = df['status']
        return X, y

    elif file_name == 'rain_weather_aus':
        df = df.drop(columns=[
            'Sunshine', 'Evaporation', 'Cloud3pm', 'Cloud9am', 'Location',
            'RISK_MM', 'Date'
        ],
                     axis=1)
        df = df.dropna(how='any')
        X = df.loc[:, df.columns != 'RainTomorrow']
        y = df['RainTomorrow']
        return X, y

    elif file_name == 'cervical_cancer':
        df = df.replace('?', np.nan)
        df = df.rename(columns={'Biopsy': 'Cancer'})
        df = df.apply(pd.to_numeric)
        df = df.fillna(df.mean().to_dict())
        X = df.drop('Cancer', axis=1)
        y = df['Cancer']
        return X, y

    elif file_name == 'glass':
        features = df.columns[:-1].tolist()
        X = df[features]
        y = df['Type']
        return X, y

    elif file_name == 'mobile_price':
        y = df.price_range
        X = df.drop(["price_range"], axis=1)
        return X, y

    elif file_name == 'clinvar_conflicting':
        toBeConsidered = [
            'CHROM', 'POS', 'REF', 'ALT', 'AF_ESP', 'AF_EXAC', 'AF_TGP',
            'CLNDISDB', 'CLNDN', 'CLNHGVS', 'CLNVC', 'MC', 'ORIGIN', 'CLASS',
            'Allele', 'Consequence', 'IMPACT', 'SYMBOL', 'Feature_type',
            'Feature', 'BIOTYPE', 'STRAND', 'CADD_PHRED', 'CADD_RAW'
        ]
        df2 = df[toBeConsidered]
        df2 = df2.dropna()
        cutdowns = []
        for i in df2.columns.values:
            if df2[i].nunique() < 1000:
                cutdowns.append(i)
        df_final = df2[cutdowns]
        df_final['CHROM'] = df_final['CHROM'].astype(str)
        from sklearn.feature_extraction import FeatureHasher
        fh = FeatureHasher(n_features=5, input_type='string')
        hashed1 = fh.fit_transform(df_final['REF'])
        hashed1 = hashed1.toarray()
        hashedFeatures1 = pd.DataFrame(hashed1)
        nameList = {}
        for i in hashedFeatures1.columns.values:
            nameList[i] = "REF" + str(i + 1)
        hashedFeatures1.rename(columns=nameList, inplace=True)
        hashed2 = fh.fit_transform(df_final['ALT'])
        hashed2 = hashed2.toarray()
        hashedFeatures2 = pd.DataFrame(hashed2)
        nameList2 = {}
        for i in hashedFeatures2.columns.values:
            nameList2[i] = "ALT" + str(i + 1)
        hashedFeatures2.rename(columns=nameList2, inplace=True)
        binaryFeature1 = pd.get_dummies(df_final['CLNVC'])
        df_final = df_final.drop(columns=['MC'], axis=1)
        hashed0 = fh.fit_transform(df_final['CHROM'])
        hashed0 = hashed0.toarray()
        hashedFeatures0 = pd.DataFrame(hashed0)
        nameList0 = {}
        for i in hashedFeatures0.columns.values:
            nameList0[i] = "CHROM" + str(i + 1)
        hashedFeatures0.rename(columns=nameList0, inplace=True)
        hashed3 = fh.fit_transform(df_final['Allele'])
        hashed3 = hashed3.toarray()
        hashedFeatures3 = pd.DataFrame(hashed3)
        nameList3 = {}
        for i in hashedFeatures3.columns.values:
            nameList3[i] = "Allele" + str(i + 1)
        hashedFeatures3.rename(columns=nameList3, inplace=True)
        hashed4 = fh.fit_transform(df_final['Consequence'])
        hashed4 = hashed4.toarray()
        hashedFeatures4 = pd.DataFrame(hashed4)
        nameList4 = {}
        for i in hashedFeatures4.columns.values:
            nameList4[i] = "Consequence" + str(i + 1)
        hashedFeatures4.rename(columns=nameList4, inplace=True)
        binaryFeature3 = pd.get_dummies(df_final['IMPACT'])
        df_final = df_final.drop(columns=['Feature_type'], axis=1)
        binaryFeature4 = pd.get_dummies(df_final['BIOTYPE'], drop_first=True)
        binaryFeature5 = pd.get_dummies(df_final['STRAND'], drop_first=True)
        df3 = pd.concat([
            binaryFeature1, binaryFeature3, binaryFeature4, binaryFeature5,
            hashedFeatures1, hashedFeatures2, hashedFeatures3, hashedFeatures4,
            hashedFeatures0, df_final['CLASS']
        ],
                        axis=1)
        df3 = df3.dropna()
        df3.rename(columns={1: "one", 16: "sixteen"}, inplace=True)
        y = df3['CLASS']
        X = df3.drop(columns=['CLASS'], axis=1)
        return X, y

    elif file_name == 'heart_failure_clinical':
        y = df['DEATH_EVENT']
        X = df.drop('DEATH_EVENT', axis=1)
        return X, y

    elif file_name == 'churn_modelling':
        df['EstimatedSalary'] = df['EstimatedSalary'].astype(int)
        df.drop(columns=['RowNumber', 'CustomerId', 'Surname', 'Geography'],
                inplace=True)
        le = preprocessing.LabelEncoder()
        df['Gender'] = le.fit_transform(df['Gender'])
        X = df.drop('Exited', axis=1)
        y = df['Exited']
        return X, y
    elif file_name == 'hr_leaving':
        y = df['left']
        X = df.drop('left', axis=1)
        return X, y
    elif file_name == 'bank_churners':
        df = pd.get_dummies(df, drop_first=True)
        norm = MinMaxScaler().fit(df)
        data_norm_arr = norm.transform(df)
        X = pd.DataFrame(
            data=data_norm_arr,
            columns=[
                'CLIENTNUM', 'Customer_Age', 'Dependent_count',
                'Months_on_book', 'Total_Relationship_Count',
                'Months_Inactive_12_mon', 'Contacts_Count_12_mon',
                'Credit_Limit', 'Total_Revolving_Bal', 'Avg_Open_To_Buy',
                'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt', 'Total_Trans_Ct',
                'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',
                'Attrition_Flag_Existing Customer', 'Gender_M',
                'Education_Level_Doctorate', 'Education_Level_Graduate',
                'Education_Level_High School', 'Education_Level_Post-Graduate',
                'Education_Level_Uneducated', 'Education_Level_Unknown',
                'Marital_Status_Married', 'Marital_Status_Single',
                'Marital_Status_Unknown', 'Income_Category_$40K - $60K',
                'Income_Category_$60K - $80K', 'Income_Category_$80K - $120K',
                'Income_Category_Less than $40K', 'Income_Category_Unknown',
                'Card_Category_Gold', 'Card_Category_Platinum',
                'Card_Category_Silver'
            ])
        X = df.drop("Attrition_Flag_Existing Customer", axis=1)
        y = df["Attrition_Flag_Existing Customer"]
        return X, y
    elif file_name == 'fetal_health':
        X = df.drop(["fetal_health"], axis=1)
        y = df["fetal_health"]
        return X, y
    elif file_name == 'stroke':
        df.drop("id", axis=1, inplace=True)
        for column in ['bmi']:
            df[column].fillna(df[column].mode()[0], inplace=True)
        for label, content in df.items():
            if pd.api.types.is_string_dtype(content):
                df[label] = content.astype("category").cat.as_ordered()
        for label, content in df.items():
            if not pd.api.types.is_numeric_dtype(content):
                df[label] = pd.Categorical(content).codes + 1
        X = df.drop("stroke", axis=1)
        y = df["stroke"]
        return X, y
    elif file_name == 'company_bankruptcy_prediction':
        df.columns = [str(col).strip() for col in list(df.columns)]
        X = df.drop(["Bankrupt?"], axis=1)
        y = df['Bankrupt?']
        return X, y
    elif file_name == 'airline_passenger_satisfaction':
        df['satisfaction'] = df['satisfaction'].map({
            'neutral or dissatisfied': 0,
            'satisfied': 1
        })
        X = df.drop(["satisfaction"], axis=1)
        y = df['satisfaction']
        return X, y
    elif file_name == 'banking_marketing_targets':
        X = df.drop(["y"], axis=1)
        target_map = {'yes': 1, 'no': 0}
        y = df['y'].apply(lambda x: target_map[x])
        return X, y
    else:
        raise ValueError(
            f"file_name must be one of the supported dataset names: wine, fake_job_posting, "
            f"hotel_bookings, hr_employee_attrition, nomao, placement_full_class, rain_weather_aus, "
            f"cervical_cancer, glass, mobile_price, clinvar_conflicting, heart_failure_clinical, "
            f"churn_modelling, hr_leaving, bank_churners, fetal_health, stroke, "
            f"company_bankruptcy_prediction, airline_passenger_satisfaction, banking_marketing_targets. "
            f"file_name that was passed is {file_name!r}")
Code example #10
# %%
# new_effect

# %%
# new_effect.describe()

# %% [markdown]
# # Feature Hasher

# %%
from sklearn.feature_extraction import FeatureHasher

# %%
effect_hasher = FeatureHasher(n_features=3, input_type="string")
x = effect_hasher.fit_transform(total_effects)

# %%
# print(total_effects)
# print(x.toarray())
# len(np.unique(x.toarray(), axis=0))

# %%
flavor_hasher = FeatureHasher(n_features=10, input_type="string")
y = flavor_hasher.fit_transform(total_flavor)

# %%
# print(total_flavor)
# print(y.toarray())
# len(np.unique(y.toarray(), axis=0))
Code example #11
df['INICIO_SINTOMAS'] = df['INICIO_SINTOMAS'].apply(
    lambda x: int(time.mktime(x.timetuple())))

# Format IDADE (age) with two decimal places
df['IDADE'] = df['IDADE'].map('{:,.2f}'.format)

#Drop records where SEXO is 'missing'
df = df.drop(df[df.SEXO == 'missing'].index)

#Encode SEXO as a code ('f' -> '0', otherwise '1')
df['SEXO'] = df['SEXO'].apply(lambda x: '0' if (x == 'f') else '1')

#Hash TERRITORIO into 10 features
len(df.groupby('TERRITORIO').size())
fh = FeatureHasher(n_features=10, input_type='string')
hashTerritorio = fh.fit_transform(df['TERRITORIO'])
dfTerritorio = pd.DataFrame(fh.fit_transform(df['TERRITORIO']).toarray(),
                            columns=[
                                'hf0', 'hf1', 'hf2', 'hf3', 'hf4', 'hf5',
                                'hf6', 'hf7', 'hf8', 'hf9'
                            ])
df[dfTerritorio.columns] = dfTerritorio

#One-hot encode RACA_COR
len(df.groupby('RACA_COR').size())
df['RACA_COR'].value_counts()
enc = preprocessing.OneHotEncoder()
dfRacaCor = pd.DataFrame(preprocessing.OneHotEncoder().fit_transform(
    df['RACA_COR'].to_frame()).toarray(),
                         columns=['rc0', 'rc1', 'rc2', 'rc3', 'rc4'])
df[dfRacaCor.columns] = dfRacaCor
Code example #12
display(unique_races.head(10))

s = sum(unique_races.values)
h = unique_races.values / s
c_sum = np.cumsum(h)
plt.plot(c_sum, label="Cumulative-sum distribution of races")
plt.grid()
plt.legend()
# -

# > With the top 10 we cover more than 85% of the data

# +
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.FeatureHasher.html
fh = FeatureHasher(n_features=10, input_type='string')
hashed_features = fh.fit_transform(dataset['Race'].astype(str).values.reshape(
    -1, 1)).todense()

pd.DataFrame(hashed_features).add_prefix('Race_').head(10).join(
    dataset['Race'].head(10))
# -

# ## Numeric variables

# The set has two numeric variables, *Weight* and *Height*; let's look at their distribution

# +
df = dataset[dataset.Alignment != 'neutral'].reset_index(drop=True)


def plot_weight_vs_height(df, title=""):
    fig = px.scatter(
Code example #13
# In[44]:


train_merge = pd.concat([train_merge,pd.get_dummies(train_merge['DeviceType'],prefix='DeviceType')],axis=1).drop(['DeviceType'],axis=1)
train_merge = pd.concat([train_merge,pd.get_dummies(train_merge['ProductCD'],prefix='ProductCD')],axis=1).drop(['ProductCD'],axis=1)
train_merge = pd.concat([train_merge,pd.get_dummies(train_merge['card4'],prefix='card4')],axis=1).drop(['card4'],axis=1)
train_merge = pd.concat([train_merge,pd.get_dummies(train_merge['card6'],prefix='card6')],axis=1).drop(['card6'],axis=1)


# ###  I applied feature hashing to DeviceInfo, R_emaildomain, and P_emaildomain.

# In[45]:


fh = FeatureHasher(n_features=5, input_type='string')
sp = fh.fit_transform(train_merge['DeviceInfo'])
dev_0 = pd.DataFrame(sp.toarray(), columns=['DeviceInfo1', 'DeviceInfo2', 'DeviceInfo3', 'DeviceInfo4', 'DeviceInfo5'])
train_merge = pd.concat([train_merge, dev_0], axis=1)


# In[46]:


fh = FeatureHasher(n_features=5, input_type='string')
sp = fh.fit_transform(train_merge['R_emaildomain'])
dev_1 = pd.DataFrame(sp.toarray(), columns=['R_emaildomain1', 'R_emaildomain2', 'R_emaildomain3', 'R_emaildomain4', 'R_emaildomain5'])
train_merge = pd.concat([train_merge, dev_1], axis=1)


# In[47]:
Code example #14
import pandas as pd
from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn import metrics
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Data import
battles = pd.read_csv("01_data/raw/Battle_Results.csv", sep="|")

# Prep
cat_vars = battles.select_dtypes(object).columns.values.tolist()
battles = battles.astype({"Legendary_1": int, "Legendary_2": int})

h1 = FeatureHasher(n_features=5, input_type='string')
h2 = FeatureHasher(n_features=5, input_type='string')
d1 = h1.fit_transform(battles["Name_1"])
d2 = h2.fit_transform(battles["Name_2"])

d1 = pd.DataFrame(data=d1.toarray())
d1.columns = ["Name_1_" + str(x) for x in range(5)]
d2 = pd.DataFrame(data=d2.toarray())
d2.columns = ["Name_2_" + str(x) for x in range(5)]

battles = battles.drop(columns=cat_vars[0:2])
battles = pd.concat([battles, d1, d2], axis=1)
battles = pd.get_dummies(battles)

X = battles.drop(labels="BattleResult", axis=1)
y = battles.BattleResult

X_train, X_test, y_train, y_test = train_test_split(X,
Code example #15
Data Cleansing

"""

countVectorizer = CountVectorizer()
#-----Convert to dense array
cv_trainData_x = countVectorizer.fit_transform(trainData.Phrase).toarray()
cv_trainData_x.shape
"""
Total features: (156060, 15240)
"""

featureHasher = FeatureHasher(input_type='string',
                              n_features=5000,
                              non_negative=True)
fh_trainData_x = featureHasher.fit_transform(trainData.Phrase).toarray()
fh_trainData_x.shape

trainData_y = trainData.Sentiment.astype('category')
trainData_y.shape

#-----------Implement gaussianNB model
from sklearn.naive_bayes import GaussianNB

gaussianModel = GaussianNB()
gaussianModel.fit(fh_trainData_x, trainData_y)
gaussian_predict = gaussianModel.predict(fh_trainData_x)

(trainData_y == gaussian_predict).sum() / len(trainData)
"""
#--Observations for Gaussian model
Code example #16
        feature["FromTimestamp"] = click[1]
        feature["ToTimestamp"] = 0
        feature["ItemId"] = click[2]
        feature["Category"] = click[3]
        feature["Price"] = 0
        feature["Quantitiy"] = 0
        X.append(feature)
    sys.stderr.write("\rProgress:%.2f%%" % (100. * i / len(clicks)))

# make dictvect
print "make dict vect"
v = DictVectorizer()
X_dict_sparse = v.fit_transform(X)
X_dict = [zip(map(str, row.indices), row.data) for row in X_dict_sparse]

# Feature Hashing
print "Feature Hashing"
n_features = 2**24
hasher = FeatureHasher(n_features=n_features, input_type='pair')
X_hash_sparse = hasher.fit_transform(X_dict)
X_hash = [zip(row.indices, row.data) for row in X_hash_sparse]

# make libsvm data
with open("./data/yoochoose-train.dat", "w") as f:
    for val, features in zip(c, X_hash):
        features_list = []
        for feature in features:
            features_list.append(str(feature[0]) + ":" + str(feature[1]))
        features_line = " ".join(features_list)
        f.write(str(val)+" "+features_line+"\n")
Code example #17
class Project:
    def __init__(self,
                 train_filepath,
                 test_filepath,
                 sample_filepath,
                 is_generate_feature=True,
                 is_sample_data=False):
        if is_sample_data:
            self.train_data = pd.read_csv(sample_filepath)
        else:
            self.train_data = pd.read_table(train_filepath)
        self.test_data = pd.read_table(test_filepath)
        self.columns = self.train_data.columns
        self.hash_number = 1
        self.feature_hash = FeatureHasher(n_features=self.hash_number,
                                          input_type='string')
        self.spark = SparkSession \
            .builder \
            .appName("Python Spark SQL basic example") \
            .config("spark.some.config.option", "some-value") \
            .getOrCreate()

        self.data_cleaning()
        if not os.path.exists('./data/feature.csv') or is_generate_feature:
            self.feature_generation()

    def data_cleaning(self):
        # remove meaningless part of data
        self.train_data.drop(columns=[
            "Row", "First Transaction Time", "Step Start Time",
            "Correct Transaction Time", "Step End Time", "Step Duration (sec)",
            "Hints"
        ],
                             inplace=True)
        self.columns = self.train_data.columns

        check_list = ["Correct Step Duration (sec)", "Incorrects", "Corrects"]
        data = {}
        for item in check_list:
            data[item] = {
                "std": self.train_data[item].std(),
                "mean": self.compute_mean_spark(self.train_data, item)
            }

        def check_outlier(record):
            dict_re = dict(zip(self.columns, record))
            # items on the check list are float64
            for item in check_list:
                if dict_re[item] and abs(dict_re[item] - data[item]["mean"]
                                         ) > 10 * data[item]["std"]:
                    return True
            return False

        def check_error(record):
            # inconsistent rows; pd.isna is needed because `== np.NaN` is always False
            dict_re = dict(zip(self.columns, record))
            if pd.isna(dict_re['Correct Step Duration (sec)']) and dict_re['Correct First Attempt'] == 0:
                return True
            elif pd.isna(dict_re['Error Step Duration (sec)']) and dict_re['Incorrects'] == 0:
                return True
            return False

        index = 0
        remove_list = []
        problem_unit = []
        problem_section = []
        oppo_num = []

        for i in self.train_data.values:
            if check_outlier(i) or check_error(i):
                remove_list.append(index)
            else:
                # dealing with problem hierarchy
                dict_re = dict(zip(self.columns, i))
                unit = dict_re["Problem Hierarchy"].split(", ")[0]
                unit = re.sub("Unit ", "", unit)
                section = dict_re["Problem Hierarchy"].split(", ")[1]
                section = re.sub("Section ", "", section)
                problem_unit.append(unit)
                problem_section.append(section)

                if type(dict_re["Opportunity(Default)"]) == str:
                    oppo_num.append(dict_re["Opportunity(Default)"])
                else:
                    oppo_num.append("0")

                # dealing with KC

            index += 1
        write_save_log("number of cleaned record: {}".format(len(remove_list)))
        self.train_data.drop(remove_list, inplace=True)
        self.train_data.drop(columns=[
            "Problem Hierarchy", "Opportunity(Default)",
            "Error Step Duration (sec)"
        ],
                             inplace=True)
        self.train_data["Problem Unit"] = problem_unit
        self.train_data["Problem Section"] = problem_section
        self.train_data["Opportunity(Default)"] = oppo_num
        # for col in self.train_data.columns:
        #     print(self.train_data[col].describe())
        self.columns = self.train_data.columns

    @staticmethod
    def one_hot_encoder_generator(features_pd, column, data):
        id_ohe = OneHotEncoder()
        id_le = LabelEncoder()
        id_labels = id_le.fit_transform(data[column])

        # id_feature_arr: num(row) * num(unique(id))
        id_feature_arr = id_ohe.fit_transform(
            pd.DataFrame(id_labels)).toarray()
        id_feature_arr = np.transpose(id_feature_arr)
        for label in id_le.classes_:
            features_pd[label] = id_feature_arr[list(
                id_le.classes_).index(label)]

    def hash_encoder_generator(self, features_pd, column, data):
        sn_feature = self.feature_hash.fit_transform(data[column]).toarray()
        sn_feature = np.transpose(sn_feature)
        for i in range(self.hash_number):
            features_pd["{}_{}".format(column, i)] = sn_feature[i]

    # @staticmethod
    # def count_intelligent_score(cor_time, cor_first, cor_num, in_cur):
    #     return cor_time * cor_first * (in_cur / cor_num)

    @staticmethod
    def count_intelligent_score(cor_time, cor_first, cor_num, in_cur):
        cor_step_time_score = -cor_time + 100
        if cor_step_time_score < -100:
            cor_step_time_score = -100
        normalize_step = (cor_step_time_score + 100) / 200
        return normalize_step * cor_first * (in_cur / cor_num)

    def compute_mean_spark(self, dataframe, column):
        dataframe[column].to_csv('./data/tem.csv')
        spark_df = self.spark.read.csv('./data/tem.csv')
        spark_df.createOrReplaceTempView("train")
        sqlDF = self.spark.sql(
            "SELECT AVG(_c1) as mean FROM train WHERE _c1 is not Null")
        return json.loads(sqlDF.toJSON().first())['mean']

    def feature_generation(self):
        """
        feature 1: compute the intelligent
        feature 2: compute the difficulty of a problem
        feature 3: sum of difficulty of  knowledge component
        """
        # for col in self.train_data.columns:
        #     print(self.train_data[col].describe())
        write_save_log("start to generate features")
        features_pd = pd.DataFrame()

        # hash-encode the student ID
        # self.one_hot_encoder_generator(features_pd, "Anon Student Id", self.train_data)
        self.hash_encoder_generator(features_pd, "Anon Student Id",
                                    self.train_data)
        write_save_log("ID feature generated")
        # hash encoder Problem Name
        self.hash_encoder_generator(features_pd, "Problem Name",
                                    self.train_data)
        write_save_log("Problem Name feature generated")

        # hash encoder Problem Unit
        self.hash_encoder_generator(features_pd, "Problem Unit",
                                    self.train_data)
        write_save_log("Problem Unit feature generated")

        # hash encoder Problem Section
        self.hash_encoder_generator(features_pd, "Problem Section",
                                    self.train_data)
        write_save_log("Problem Section feature generated")

        # directly add problem view
        features_pd["Problem View"] = self.train_data["Problem View"]

        mean_pv = self.compute_mean_spark(features_pd, "Problem View")
        new_pv = []
        for row in features_pd["Problem View"]:
            if not np.isnan(row):
                new_pv.append(row)
            else:
                new_pv.append(mean_pv)
        features_pd.drop(columns=["Problem View"], inplace=True)
        features_pd["Problem View"] = new_pv

        write_save_log("Problem View feature generated")
        # Step Name hash to features

        self.hash_encoder_generator(features_pd, "Step Name", self.train_data)
        write_save_log("Step Name feature generated")

        # next features are precomputed values in train data set
        # person intelligent
        write_save_log("start to generate person intelligent")
        id_unique = self.train_data["Anon Student Id"]
        intelligent_table = dict(
            zip(id_unique, [0 for i in range(len(id_unique))]))

        id_group = self.train_data.groupby(["Anon Student Id"]).mean()
        for i in range(len(id_group.values)):
            write_save_log("id group process: {}".format(i))
            stu_id = id_group.index[i]
            dict_row = dict(zip(id_group.columns, id_group.values[i]))
            intelligent_table[stu_id] = self.count_intelligent_score(
                dict_row['Correct Step Duration (sec)'],
                dict_row['Correct First Attempt'], dict_row['Corrects'],
                dict_row['Incorrects'])

        problem_group = self.train_data.groupby(["Step Name"]).mean()

        problem_difficulty = {}
        problem_group_cor_first = problem_group["Correct First Attempt"]
        for i in range(len(problem_group_cor_first.index)):
            problem_difficulty[problem_group_cor_first.
                               index[i]] = problem_group_cor_first.values[i]
        problem_difficulty['mean'] = problem_group_cor_first.mean()
        write_save_log("problem difficulty mean : {}".format(
            problem_difficulty['mean']))
        with open("./data/problem.json", 'w') as f:
            f.write(json.dumps(problem_difficulty))

        unique_KC = self.train_data["KC(Default)"].unique()
        unique_KC_list = []
        for kc in unique_KC:
            if type(kc) == str:
                for true_kc in kc.split("~~"):
                    unique_KC_list.append(true_kc)

        # [correct, total]
        kc_difficulty = dict(
            zip(unique_KC_list, [[0, 0] for i in range(len(id_unique))]))
        person_intelligent = []
        kc_length = []
        index_count = 0
        for row in self.train_data.values:
            dict_row = dict(zip(self.train_data.columns, row))
            if index_count % 10000 == 0:
                write_save_log(
                    "loading feature to dataframe process: {}".format(
                        index_count))

            # processing intelligent_table
            stu_id = dict_row["Anon Student Id"]
            person_intelligent.append(intelligent_table[stu_id])

            # extract kc
            stu_kc = dict_row["KC(Default)"]
            kc_num = 0
            if type(stu_kc) == str:
                kc_num = len(stu_kc.split("~~"))
                for true_kc in stu_kc.split("~~"):
                    if dict_row["Correct First Attempt"] == 1:
                        kc_difficulty[true_kc][0] += 1
                    kc_difficulty[true_kc][1] += 1
            kc_length.append(kc_num)

            index_count += 1

        with open('./data/kc_difficulty.json', 'w') as f:
            re_kc = {}
            for key, value in kc_difficulty.items():
                re_kc[key] = value[0] / value[1]

            kc_difficulty = re_kc

            kc_df = pd.DataFrame({"value": list(kc_difficulty.values())})
            kc_mean = self.compute_mean_spark(kc_df, "value")
            kc_difficulty["mean"] = kc_mean

            f.write(json.dumps(kc_difficulty))

        write_save_log("kc mean: {}".format(kc_mean))
        kc_features = []
        oppo_feature = []
        problem_diff_value = []
        for row in self.train_data.values:
            dict_row = dict(zip(self.train_data.columns, row))
            stu_kc = dict_row["KC(Default)"]
            sum_difficult = 0
            oppo_value = 0
            if type(stu_kc) == str:
                oppo_list = dict_row["Opportunity(Default)"].split("~~")
                for true_kc in stu_kc.split("~~"):
                    oppo_value += int(oppo_list[stu_kc.split("~~").index(
                        true_kc)]) * kc_difficulty[true_kc]
                    sum_difficult += kc_difficulty[true_kc]
                sum_difficult /= len(stu_kc.split("~~"))
                oppo_value /= len(stu_kc.split("~~"))
            else:
                oppo_value = kc_difficulty["mean"]
                sum_difficult = kc_difficulty["mean"]

            # problem difficulty
            problem_diff_value.append(
                problem_difficulty[dict_row["Step Name"]])

            kc_features.append(sum_difficult)
            oppo_feature.append(oppo_value)

        features_pd["kc difficulty"] = kc_features
        features_pd["kc number"] = kc_length
        features_pd["person_intelligent"] = person_intelligent
        features_pd["oppo value"] = oppo_feature
        features_pd['Problem difficulty'] = problem_diff_value
        write_save_log("feature length: {}".format(len(features_pd.columns)))

        features_pd.to_csv("./data/feature.csv", mode='w', index=False)

        with open('./data/intelligent_table.json', 'w') as f:
            f.write(json.dumps(intelligent_table))

    def predict(self):
        write_save_log("start to predict")
        correct_answer = []
        first_attempt_index = list(self.columns).index('Correct First Attempt')
        for row in self.train_data.values:
            re_cor = row[first_attempt_index]
            if np.isnan(re_cor):
                re_cor = 0
            correct_answer.append(re_cor)

        correct_answer = np.array(correct_answer)
        features = pd.read_csv("./data/feature.csv")

        # for col in features.columns:
        #     print(features[col].describe())

        with open('./data/intelligent_table.json', 'r') as f:
            intelligent_table = json.loads(f.read())

        with open('./data/kc_difficulty.json', 'r') as f:
            kc_table = json.loads(f.read())

        with open('./data/problem.json', 'r') as f:
            problem_table = json.loads(f.read())

        # generate features for the test data

        test_features_pd = pd.DataFrame()

        problem_unit = []
        problem_section = []
        problem_values = []
        for row in self.test_data.values:
            dict_re = dict(zip(self.test_data.columns, row))
            unit = dict_re["Problem Hierarchy"].split(", ")[0]
            unit = re.sub("Unit ", "", unit)
            section = dict_re["Problem Hierarchy"].split(", ")[1]
            section = re.sub("Section ", "", section)
            problem_unit.append(unit)
            problem_section.append(section)
            if dict_re["Step Name"] in problem_table.keys():
                problem_values.append(problem_table[dict_re["Step Name"]])
            else:
                problem_values.append(problem_table['mean'])
        self.test_data["Problem Unit"] = problem_unit
        self.test_data["Problem Section"] = problem_section

        # hash-encode the student ID (one-hot alternative kept below)
        self.hash_encoder_generator(test_features_pd, "Anon Student Id",
                                    self.test_data)
        # self.one_hot_encoder_generator(test_features_pd, "Anon Student Id", self.test_data)
        write_save_log("ID feature generated")
        # hash encoder Problem Name
        self.hash_encoder_generator(test_features_pd, "Problem Name",
                                    self.test_data)
        write_save_log("Problem Name feature generated")

        # hash encoder Problem Unit
        self.hash_encoder_generator(test_features_pd, "Problem Unit",
                                    self.test_data)
        write_save_log("Problem Unit feature generated")

        # hash encoder Problem Section
        self.hash_encoder_generator(test_features_pd, "Problem Section",
                                    self.test_data)
        write_save_log("Problem Section feature generated")

        # directly add problem view
        test_features_pd["Problem View"] = self.test_data["Problem View"]

        self.hash_encoder_generator(test_features_pd, "Step Name",
                                    self.test_data)

        intel_values = []
        kc_values = []
        test_answer = []
        kc_length = []
        index_count = 0
        remove_list = []
        oppo_feature = []
        for row in self.test_data.values:
            dict_re = dict(zip(self.test_data.columns, row))
            if np.isnan(dict_re["Correct First Attempt"]):
                remove_list.append(index_count)
                test_answer.append(-1)
            else:
                test_answer.append(dict_re["Correct First Attempt"])
            intel_values.append(intelligent_table[dict_re["Anon Student Id"]])

            stu_kc = dict_re["KC(Default)"]
            sum_difficult = 0
            kc_num = 0
            oppo_value = 0
            if type(stu_kc) == str:
                oppo_list = dict_re["Opportunity(Default)"].split("~~")
                kc_num = len(stu_kc.split("~~"))
                for true_kc in stu_kc.split("~~"):
                    oppo_value += int(oppo_list[stu_kc.split("~~").index(
                        true_kc)]) * kc_table[true_kc]
                    sum_difficult += kc_table[true_kc]
                sum_difficult /= len(stu_kc.split("~~"))
            else:
                oppo_value = kc_table["mean"]
                sum_difficult = kc_table["mean"]
            kc_values.append(sum_difficult)
            kc_length.append(kc_num)
            oppo_feature.append(oppo_value)

            index_count += 1

        test_features_pd["kc difficulty"] = kc_values
        test_features_pd["kc number"] = kc_length
        test_features_pd["person_intelligent"] = intel_values
        test_features_pd["oppo value"] = oppo_feature
        test_features_pd['Problem difficulty'] = problem_values
        # test_features_pd.drop(remove_list, inplace=True)

        clf = HistGradientBoostingRegressor(random_state=1,
                                            max_iter=331,
                                            loss='least_squares',
                                            learning_rate=0.4,
                                            l2_regularization=0.2)

        clf.fit(features.values, correct_answer)
        res = clf.predict(test_features_pd.values)
        re_res = []
        for i in res:
            if i >= 0.5:
                re_res.append(1)
            else:
                re_res.append(0)

        for i in range(len(re_res)):
            if test_answer[i] == -1:
                test_answer[i] = re_res[i]

        self.test_data.drop(columns=['Correct First Attempt'])
        self.test_data['Correct First Attempt'] = test_answer
        self.test_data.to_csv('./data/final.csv', index=False)

    def train(self):
        write_save_log("start to train")
        correct_answer = []
        first_attempt_index = list(self.columns).index('Correct First Attempt')
        for row in self.train_data.values:
            re_cor = row[first_attempt_index]
            if np.isnan(re_cor):
                re_cor = 0
            correct_answer.append(re_cor)

        correct_answer = np.array(correct_answer)
        features = pd.read_csv("./data/feature.csv")

        # for col in features.columns:
        #     print(features[col].describe())

        with open('./data/intelligent_table.json', 'r') as f:
            intelligent_table = json.loads(f.read())

        with open('./data/kc_difficulty.json', 'r') as f:
            kc_table = json.loads(f.read())

        with open('./data/problem.json', 'r') as f:
            problem_table = json.loads(f.read())

        # generate features for the test data

        test_features_pd = pd.DataFrame()

        problem_unit = []
        problem_section = []
        problem_values = []
        for row in self.test_data.values:
            dict_re = dict(zip(self.test_data.columns, row))
            unit = dict_re["Problem Hierarchy"].split(", ")[0]
            unit = re.sub("Unit ", "", unit)
            section = dict_re["Problem Hierarchy"].split(", ")[1]
            section = re.sub("Section ", "", section)
            problem_unit.append(unit)
            problem_section.append(section)
            if dict_re["Step Name"] in problem_table.keys():
                problem_values.append(problem_table[dict_re["Step Name"]])
            else:
                problem_values.append(problem_table['mean'])
        self.test_data["Problem Unit"] = problem_unit
        self.test_data["Problem Section"] = problem_section

        # hash-encode the student ID (one-hot alternative kept below)
        self.hash_encoder_generator(test_features_pd, "Anon Student Id",
                                    self.test_data)
        # self.one_hot_encoder_generator(test_features_pd, "Anon Student Id", self.test_data)
        write_save_log("ID feature generated")
        # hash encoder Problem Name
        self.hash_encoder_generator(test_features_pd, "Problem Name",
                                    self.test_data)
        write_save_log("Problem Name feature generated")

        # hash encoder Problem Unit
        self.hash_encoder_generator(test_features_pd, "Problem Unit",
                                    self.test_data)
        write_save_log("Problem Unit feature generated")

        # hash encoder Problem Section
        self.hash_encoder_generator(test_features_pd, "Problem Section",
                                    self.test_data)
        write_save_log("Problem Section feature generated")

        # directly add problem view
        test_features_pd["Problem View"] = self.test_data["Problem View"]

        self.hash_encoder_generator(test_features_pd, "Step Name",
                                    self.test_data)

        intel_values = []
        kc_values = []
        test_answer = []
        kc_length = []
        index_count = 0
        remove_list = []
        oppo_feature = []
        for row in self.test_data.values:
            dict_re = dict(zip(self.test_data.columns, row))
            if np.isnan(dict_re["Correct First Attempt"]):
                remove_list.append(index_count)
            else:
                test_answer.append(dict_re["Correct First Attempt"])
            intel_values.append(intelligent_table[dict_re["Anon Student Id"]])

            stu_kc = dict_re["KC(Default)"]
            sum_difficult = 0
            kc_num = 0
            oppo_value = 0
            if type(stu_kc) == str:
                oppo_list = dict_re["Opportunity(Default)"].split("~~")
                kc_num = len(stu_kc.split("~~"))
                for true_kc in stu_kc.split("~~"):
                    oppo_value += int(oppo_list[stu_kc.split("~~").index(
                        true_kc)]) * kc_table[true_kc]
                    sum_difficult += kc_table[true_kc]
                sum_difficult /= len(stu_kc.split("~~"))
            else:
                oppo_value = kc_table["mean"]
                sum_difficult = kc_table["mean"]
            kc_values.append(sum_difficult)
            kc_length.append(kc_num)
            oppo_feature.append(oppo_value)

            index_count += 1

        test_features_pd["kc difficulty"] = kc_values
        test_features_pd["kc number"] = kc_length
        test_features_pd["person_intelligent"] = intel_values
        test_features_pd["oppo value"] = oppo_feature
        test_features_pd['Problem difficulty'] = problem_values
        test_features_pd.drop(remove_list, inplace=True)

        parameter_range = {
            "random_state": [i for i in range(0, 40)],
            "max_iter": [i for i in range(100, 500)],
            "loss": ['least_squares', 'least_absolute_deviation', 'poisson'],
            "learning_rate": [0.1 * i for i in range(1, 7)],
            "l2_regularization": [0.1 * i for i in range(1, 10)],
        }
        best_score = 1
        bes_policy = {}
        while best_score > 0.35:
            random_state = {}
            for key, value in parameter_range.items():
                random_state[key] = random.sample(value, 1)
            write_save_log(random_state)

            # clf1 = HistGradientBoostingRegressor()
            # clf2 = AdaBoostRegressor()
            #
            # clf = VotingRegressor(estimators=[('hgb', clf1), ('rf', clf2)], weights=[2, 1])

            clf = HistGradientBoostingRegressor(
                random_state=random_state["random_state"][0],
                max_iter=random_state["max_iter"][0],
                loss=random_state['loss'][0],
                learning_rate=random_state['learning_rate'][0],
                l2_regularization=random_state['l2_regularization'][0])

            clf.fit(features.values, correct_answer)

            for i in range(len(test_features_pd.columns)):
                if test_features_pd.columns[i] != features.columns[i]:
                    raise KeyError("feature order error!")

            res = clf.predict(test_features_pd.values)
            re_res = []
            for i in res:
                if i >= 0.5:
                    re_res.append(1)
                else:
                    re_res.append(0)

            re_score = MSER(re_res, test_answer)

            write_save_log("result error: {}".format(re_score))
            if best_score > re_score:
                best_score = re_score
                bes_policy = copy.deepcopy(random_state)
            write_save_log("\nbest policy and score\n" + str(bes_policy))
            write_save_log(str(best_score) + '\n')
コード例 #18
0
    labels =  data_frame['Category']
    pd_frame = data_frame['PdDistrict']
    resolution = data_frame['Resolution']
    data_frame.drop(['Category'],inplace=True,axis=1)
    #training_data = pd.concat([pd_frame,resolution], axis=1)
    training_data = data_frame.as_matrix(['Dates','DayOfWeek','Address'])
    testing_data = data_frame_test.as_matrix(['Dates','DayOfWeek','Address'])


    gnb = MultinomialNB(alpha=0)
    #gnb = LinearSVC()

    print 'Made it till here-1'
    fh = FeatureHasher(input_type='string',non_negative=True)
    X=fh.fit_transform(training_data)
    X_test = fh.fit_transform(testing_data)


    print 'Made it till here-2'
    print training_data.shape

    #print X.toarray()
    print 'Made it till here-3'

    gnb_model = gnb.fit(X,labels)
    y_pred=gnb_model.predict(X_test)

    print len(y_pred)

    #for actual,predicted in zip(labels,y_pred):
コード例 #19
0
    def preprocess_single_predict_manual(self, feature_list):
        """
                     Method Name: preprocess_single_predict_manual
                     Description: Preprocesses the prediction data entered manually.
                     Input: feature_list
                     Input Type: list
                     Output: Returns dataframe

                     Written By: Vaishnavi Ambati
                     Version: 1.0
                     Revisions: None
                """

        try:
            self.log_file = self.loggerObj.write_log(self.log_file,'Entered preprocess_single_predict_manual of dataProcessor class')
            self.log_file = self.loggerObj.write_log(self.log_file, 'Data preprocessing has been initiated.')

            columns = ['name', 'city', 'ranking', 'no_of_reviews', 'no_of_cuisines', 'review1', 'review2', 'cheap',
                       'high', 'medium']

            features = feature_list[:-1]
            price = feature_list[-1]
            if price == 'cheap':
                features.extend([1, 0, 0])
            elif price == 'medium':
                features.extend([0, 0, 1])
            elif price == 'high':
                features.extend([0, 1, 0])

            feature_dic = dict(zip(columns, features))

            df = pd.DataFrame(feature_dic, index=[0])

            # feature hashing city column
            fh = FeatureHasher(n_features=7, input_type='string')
            hashed_features = fh.fit_transform(df['city'])
            hashed_features = hashed_features.toarray()
            hashed_df = pd.DataFrame(hashed_features,
                                     columns=['city_1', 'city_2', 'city_3', 'city_4', 'city_5', 'city_6', 'city_7'])
            df_hashed = pd.concat([df.drop('city', axis=1), hashed_df],
                                  axis=1)

            df_hashed['review1_sentiment'] = df_hashed.apply(lambda row: self.sentiment_analyzer(row['review1']),
                                               axis=1)
            df_hashed['review2_sentiment'] = df_hashed.apply(lambda row: self.sentiment_analyzer(row['review2']),
                                               axis=1)

            df_hashed.drop(['review1', 'review2'], axis=1, inplace=True)

            self.log_file = self.loggerObj.write_log(self.log_file,'Data Preprocessing has completed. Exiting the preprocess_single_predict_manual method of dataPreprocessor class.')

            return df_hashed

        except Exception as e:

            self.log_file = self.loggerObj.write_log(self.log_file, "Exception occured in preprocess_single_predict_manual method of dataPreprocessor class. Exception is " + str(e))
            self.log_file = self.loggerObj.write_log(self.log_file,'Exiting the preprocess_single_predict_manual method of dataPreprocessor class.')

            self.log_file.to_csv("Logs\\Prediction Logs\\prediction_logs.csv")

            raise Exception
コード例 #20
0
    train_shift = train.copy()
    train_shift['month_num'] = train_shift['month_num'] + month_shift
    train_shift = train_shift.rename(columns={"amount" : \
                                             'amount_{0}'.format(month_shift)})
    train_shift = train_shift[['year_num', 'month_num', 'customer_id',\
                               'mcc_code', 'amount_{0}'.format(month_shift)]]

    train = pd.merge(train, train_shift,
                                  on=['year_num', 'month_num',\
                            'customer_id', 'mcc_code'], how='left').fillna(0)
    test = pd.merge(test, train_shift,
                                 on=['year_num', 'month_num', \
                            'customer_id', 'mcc_code'], how='left').fillna(0)
hasher = FeatureHasher(n_features=10000, input_type='string')
train_sparse = \
    hasher.fit_transform(train[['year_num', 'month_num', \
                        'customer_id', 'mcc_code']].astype(str).as_matrix())

test_sparse = \
    hasher.transform(test[['year_num', 'month_num', 'customer_id',\
                                       'mcc_code']].astype(str).as_matrix())

train_sparse = sparse.hstack([
    train_sparse,
    np.log(np.abs(train[['amount_1', 'amount_2']]) + 1).as_matrix()
])

test_sparse = sparse.hstack([
    test_sparse,
    np.log(np.abs(test[['amount_1', 'amount_2']]) + 1).as_matrix()
])
コード例 #21
0
ファイル: LogisticReg.py プロジェクト: brsm11/root
#Implement Random-Under sampling

#First, shuffle dataframe
df = df.sample(frac=1)

#Create a balanced dataset
number_of_clicks = len(df.loc[df['clicks'] == 1])

df_clicks = df.loc[df['clicks'] == 1]
df_non_clicks = df.loc[df['clicks'] == 0][:number_of_clicks]
df_balanced = pd.concat([df_clicks, df_non_clicks])
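
# Quick sanity check (an added sketch, not part of the original script): after
# random under-sampling both classes should be equally represented.
print(df_balanced['clicks'].value_counts())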
#%%
#Encoding categorical data using the "hashing trick"

vectorizer = FeatureHasher(n_features=2**25, input_type='string')
invent_src = vectorizer.fit_transform(df_balanced.inventory_source)
#geo_zip = vectorizer.fit_transform(df_balanced.geo_zip)
screen_size = vectorizer.fit_transform(df_balanced.platform_device_screen_size)
carrier = vectorizer.fit_transform(df_balanced.platform_carrier)
bandwidth = vectorizer.fit_transform(df_balanced.platform_bandwidth)
maker = vectorizer.fit_transform(df_balanced.platform_device_make)
model = vectorizer.fit_transform(df_balanced.platform_device_model)
day_of_week = vectorizer.fit_transform(df_balanced.day_of_week)
scaler = RobustScaler()  #StandardScaler()
# reshape to (n_samples, 1) so the scaler standardizes the bids across rows
# rather than treating the whole column as a single sample
bid_floor = csr_matrix(
    scaler.fit_transform(df_balanced.bid_floor.values.reshape(-1, 1)))
#spend = np.transpose(csr_matrix(scaler.fit_transform([df_balanced.spend.values])))

#%%
y = df_balanced['clicks']
X = hstack([
コード例 #22
0
    def preprocess_training_data(self, dataframe):
        """
             Method Name: preprocess_training_data
             Description: Preprocess the training data.
             Input: dataframe
             Input Type: dataframe
             Output: Returns a processed dataframe.

             Written By: Vaishnavi Ambati
             Version: 1.0
             Revisions: None
        """

        try:

            self.log_file = self.loggerObj.write_log(self.log_file, 'Entered preprocess_training_data of dataProcessor class')
            self.log_file = self.loggerObj.write_log(self.log_file, 'Data preprocessing has been initiated.')

            # renaming the columns replacing ' ' with '_'.
            dataframe.rename(
                columns={'Name': 'name', 'City': 'city', 'Cuisine Style': 'cuisine_style', 'Ranking': 'ranking',
                         'Price Range': 'price_range', 'Number of Reviews': 'no_of_reviews',
                         'Reviews': 'reviews', 'URL_TA': 'url_ta', 'ID_TA': 'id_ta', 'Rating': 'rating'},
                inplace=True)
            # processing cuisine_style
            dataframe['cuisine_style'] = dataframe.apply(lambda row: self.cuisine_process(row['cuisine_style']), axis=1)
            # adding a new column to the dataframe
            dataframe['no_of_cuisines'] = dataframe.apply(
                lambda row: len(row['cuisine_style']) if row['cuisine_style'] != 'Not Available' else 0, axis=1)
            # processing price_range
            dataframe['price_range'] = dataframe['price_range'].map({'$': 'cheap',
                                                                     '$$ - $$$': 'medium',
                                                                     '$$$$': 'high',
                                                                     })

            dataframe['price_range'].fillna('medium', inplace=True)

            # dropping rows in rating with values -1
            drop_index = list(dataframe[dataframe['rating'] == -1].index)
            dataframe.drop(drop_index, inplace=True)

            dataframe['rating'] = dataframe['rating'].map(
                {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 1.5: 6, 2.5: 7, 3.5: 8, 4.5: 9})
            # converting reviews to list of reviews
            dataframe['reviews'] = dataframe.apply(lambda row: self.review_to_words(row['reviews']), axis=1)

            # considering only relevant features
            features = ['city', 'ranking', 'price_range', 'no_of_reviews', 'no_of_cuisines', 'rating','reviews']
            df_features = dataframe[features]
            # dropping the null values
            df_features = df_features.dropna()

            # onehotencoding price_range
            df_feat = pd.concat([df_features.drop(['price_range'], axis=1), pd.get_dummies(df_features['price_range'])],
                                axis=1)

            df_feat.reset_index(inplace=True, drop=True)


            # feature hashing city column
            fh = FeatureHasher(n_features=7, input_type='string')
            hashed_features = fh.fit_transform(df_feat['city'])
            hashed_features = hashed_features.toarray()
            hashed_df = pd.DataFrame(hashed_features,
                                     columns=['city_1', 'city_2', 'city_3', 'city_4', 'city_5', 'city_6', 'city_7'])
            df_hashed = pd.concat([df_feat.drop('city', axis=1), hashed_df],
                                  axis=1)

            df_hashed['review1'] = df_hashed.apply(lambda row: row['reviews'][0], axis=1)
            df_hashed['review2'] = df_hashed.apply(lambda row: row['reviews'][1] if len(row['reviews']) == 2 else np.NaN, axis=1)
            df_hashed = df_hashed.dropna()

            df_hashed['review1_sentiment'] = df_hashed.apply(lambda row: self.sentiment_analyzer(row['review1']), axis=1)
            df_hashed['review2_sentiment'] = df_hashed.apply(lambda row: self.sentiment_analyzer(row['review2']), axis=1)

            df_hashed.drop(['review1', 'review2', 'reviews'], axis=1, inplace=True)

            self.log_file = self.loggerObj.write_log(self.log_file,'Data Preprocessing has completed. Exiting the preprocess_training_data method of dataPreprocessor class.')

            return df_hashed

        except Exception as e:

            self.log_file = self.loggerObj.write_log(self.log_file,'An error occurred in the preprocess_training_data method of dataPreprocessor class. The exception is ' + str(e))
            self.log_file = self.loggerObj.write_log(self.log_file,'Exiting the preprocess_training_data method of dataPreprocessor class.')
            self.log_file.to_csv("Logs\\Prediction Logs\\prediction_logs.csv")

            raise Exception
コード例 #23
0
# +
races = df.race.value_counts()
races = races[races < 2].index

df.race.replace(to_replace=races, value='other').value_counts()

# Note: this would be stored back into the dataframe by doing
# df['race'] = df.race.replace(to_replace = races, value='other')
# -

# Another option is to produce a hash with a lower dimension than the one-hot columns

# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.FeatureHasher.html
fh = FeatureHasher(n_features=3, input_type='string')
hashed_features = fh.fit_transform(df['race'].astype(str)).todense()
hashed_features = pd.DataFrame(hashed_features).add_prefix('race_')
pd.concat([df[['race']], hashed_features], ignore_index=True, axis=1)

# ## Numeric variables

# In the dataset we have two numeric variables, *Weight* and *Height*; let's look at their distribution


# +
def plot_weight_vs_height(df, title=""):
    fig = px.scatter(
        df.dropna(),
        x="weight",
        y="height",
        color="alignment",
コード例 #24
0
if __name__=='__main__':

    data_frame = read_training_file('/Users/prateek.jain/work/datasets/kaggle-competition/sf-crime/train.csv')

    labels =  data_frame['Category']
    pd_frame = data_frame['PdDistrict']
    resolution = data_frame['Resolution']
    data_frame.drop(['Category'],inplace=True,axis=1)
    training_data = pd.concat([pd_frame,resolution], axis=1)
    training_data = data_frame.as_matrix(['PdDistrict','Address'])
    regr = linear_model.LinearRegression()
    #gnb = LinearSVC()

    print 'Made it till here-1'
    fh = FeatureHasher(input_type='string',non_negative=True)
    X=fh.fit_transform(training_data)

    fhy = FeatureHasher(input_type='string',non_negative=True)
    Y = fhy.fit_transform(labels)


    knn_prediction = regr.fit(X,Y)
    print(regr.coef_)
    prediction = regr.predict(X)
    print regr.score(X, prediction)
    print 'Made it till here-2'
    print prediction

    #print X.toarray()
    #print 'Made it till here-3'
コード例 #25
0
ファイル: encode.py プロジェクト: SarangPratap/Encoding
temp_dict = {'Cold': 1, 'Warm': 2, 'Hot': 3}
dnew['Ord_2_encod'] = dnew.ord_2.map(temp_dict)
dnew = dnew.drop(['ord_2'], axis=1)

#Binary encoding
from category_encoders import BinaryEncoder
encoder = BinaryEncoder(cols=['ord_2'])
newdata = encoder.fit_transform(df['ord_2'])
df = pd.concat([df, newdata], axis=1)
df = df.drop(['ord_2'], axis=1)
df.head(10)

#Hash encoding
from sklearn.feature_extraction import FeatureHasher
h = FeatureHasher(n_features=3, input_type='string')
hashed_Feature = h.fit_transform(df['nom_0'])
hashed_Feature = hashed_Feature.toarray()
df = pd.concat([df, pd.DataFrame(hashed_Feature)], axis=1)
df.head(10)

df.insert(6, "Target", [0, 1, 1, 0, 0, 1, 0, 0, 0, 1], True)

#mean encoding / target encoding
#('col' stands for the name of the categorical column being encoded;
# 'train' is a dataframe with a binary 'target' column)
mean = train['target'].mean()
agg = train.groupby(col)['target'].agg(['count', 'mean'])
counts = agg['count']
means = agg['mean']
weight = 100
smooth = ((counts * means) + (weight * mean)) / (counts + weight)
train.loc[:, "{}_mean_encode".format(col)] = train[col].map(smooth)
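
# A minimal, self-contained sketch of the smoothing formula above on a made-up
# frame (the names 'col' and 'target' mirror the recipe and are assumptions):
import pandas as pd

toy = pd.DataFrame({'col':    ['a', 'a', 'a', 'b', 'b', 'c'],
                    'target': [1,   0,   1,   0,   0,   1]})
global_mean = toy['target'].mean()
agg = toy.groupby('col')['target'].agg(['count', 'mean'])
weight = 100
smooth = (agg['count'] * agg['mean'] + weight * global_mean) / (agg['count'] + weight)
# rare categories are pulled toward the global mean; frequent ones keep more of their own mean
toy['col_mean_encode'] = toy['col'].map(smooth)
print(toy)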
コード例 #26
0
        feature["FromTimestamp"] = click[1]
        feature["ToTimestamp"] = 0
        feature["ItemId"] = click[2]
        feature["Category"] = click[3]
        feature["Price"] = 0
        feature["Quantitiy"] = 0
        X.append(feature)
    sys.stderr.write("\rProgress:%.2f%%" % (100. * i / len(clicks)))

# make dictvect
print "make dict vect"
v = DictVectorizer()
X_dict_sparse = v.fit_transform(X)
X_dict = [zip(map(str, row.indices), row.data) for row in X_dict_sparse]

# Feature Hashing
print "Feature Hashing"
n_features = 2**24
hasher = FeatureHasher(n_features=n_features, input_type='pair')
X_hash_sparse = hasher.fit_transform(X_dict)
X_hash = [zip(row.indices, row.data) for row in X_hash_sparse]

# make libsvm data
with open("./data/yoochoose-train.dat", "w") as f:
    for val, features in zip(c, X_hash):
        features_list = []
        for feature in features:
            features_list.append(str(feature[0]) + ":" + str(feature[1]))
        features_line = " ".join(features_list)
        f.write(str(val) + " " + features_line + "\n")
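
# A minimal sketch of the FeatureHasher 'pair' input format used above, on made-up
# data: each sample is a list of (feature_name, value) pairs, which is exactly what
# zip(map(str, row.indices), row.data) produces.
from sklearn.feature_extraction import FeatureHasher

toy_pairs = [
    [("0", 1.0), ("3", 2.0)],
    [("1", 1.0), ("2", 5.0)],
]
toy_hasher = FeatureHasher(n_features=2 ** 10, input_type='pair')
toy_hashed = toy_hasher.fit_transform(toy_pairs)
print(toy_hashed.shape)  # (2, 1024), one sparse row per sample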
コード例 #27
0
# In[18]:

gen_onehot_features = pd.get_dummies(poke_df['Generation'])
gen_effect_features = gen_onehot_features.iloc[:, :-1]
gen_effect_features.loc[np.all(gen_effect_features == 0, axis=1)] = -1.
pd.concat([poke_df[['Name', 'Generation']], gen_effect_features],
          axis=1).iloc[4:10]

# ## Feature Hashing scheme

# In[19]:

unique_genres = np.unique(vg_df[['Genre']])
print("Total game genres:", len(unique_genres))
print(unique_genres)

# In[20]:

from sklearn.feature_extraction import FeatureHasher

fh = FeatureHasher(n_features=6, input_type='string')
hashed_features = fh.fit_transform(vg_df['Genre'])
hashed_features = hashed_features.toarray()
pd.concat([vg_df[['Name', 'Genre']],
           pd.DataFrame(hashed_features)], axis=1).iloc[1:7]

# In[21]:

fh.get_params()
コード例 #28
0
ファイル: sim_test.py プロジェクト: emanlapponi/storting
def main():
    storting_csv = sys.argv[1]
    annotations_path = sys.argv[2]

    loc = os.path.dirname(os.path.abspath(__file__))
    stopwords = [w for w
                 in codecs.open(os.path.join(loc, 'stop.txt'),
                                'r', 'utf8').read().split()
                 if not w.startswith('|')]

    csv_reader = csv.DictReader(open(storting_csv))

    examples = []

    #v = DictVectorizer(sparse=False)
    v = FeatureHasher()

    print 'Reading speeches and extracting features...'
    for speech in csv_reader:
        if speech['title'] == 'Representant':
            sys.stdout.write(speech['id'])
            sys.stdout.write("\b" * len(speech['id']))
            metadata = {}
            for name in csv_reader.fieldnames:
                if name != 'text':
                    metadata[name] = speech[name]

            label = metadata['party_id']
            example = Example(label, metadata=metadata)

            annotations = codecs.open(os.path.join(annotations_path,
                                                    '%s.tsv' % (speech['id'])),
                                                                'r',
                                                                'utf8').read()

            sentlengths = []
            for sentence in parse_conll(annotations):
                sentlengths.append(float(len(sentence)))
                for token in sentence:
                    if token[1] not in stopwords:
                        #example.add_feature('#token:' + token[1])
                        example.add_feature('#lemma-pos:%s-%s' % (token[2], token[3]))

            average_sent_length = sum(sentlengths) / len(sentlengths)
            example.add_feature('#avg-s-length:%s' % (average_sent_length))
            examples.append(example)

    print
    print 'Done!'
    print 'Vectorizing...'
    X = v.fit_transform([e.features for e in examples])
    print 'Done!'
    print 'Tfidf weighting...'
    t = TfidfTransformer()
    X = t.fit_transform(X)
    print 'Done!'

    print 'Binning vectors...'
    parties = {}
    for e, x in zip(examples, X):
        if e.label not in parties:
            parties[e.label] = {}
        year = int(e.metadata['date'].split('-')[0])
        if year not in parties[e.label]:
            parties[e.label][year] = []
        parties[e.label][year].append(x)
    print 'Done!'

    # for p in parties:
    #     print sorted(parties[p].keys())

    results = {}

    for p in tqdm(parties, desc='Computing similarities:'):
        results[p] = {}
        for year in tqdm(parties[p], desc=p):
            results[p][year] = []
            for i, x in enumerate(tqdm(parties[p][year], desc=str(year))):
                for j, y in enumerate(parties[p][year]):
                    if j != i:
                        score = cosine_similarity(x, y)[0][0]
                        results[p][year].append(score)
    print 'Done!'

    print 'Saving results...'
    na_counter = 0
    for p in results:
        if not p:
            out = open('na_%s' % (na_counter) + '.out', 'w')
            na_counter += 1
        else:
            out = open(p + '.out', 'w')
        years = sorted(results[p].keys())
        for y in years:
            try:
                avg = sum(results[p][y]) / len(results[p][y])
            except ZeroDivisionError:
                avg = 0
            out.write("%s\t%s\n" % (y, avg))
        out.close()
    print 'All done!'

    # for i, x in enumerate(X):
    #     for j, y in enumerate(X):
    #         if j != i:
    #             #print cosine_similarity(x.reshape(1, -1), y.reshape(1, -1))[0][0]
    #             print cosine_similarity(x, y)[0][0]

    print 'done'
コード例 #29
0
    count = len(Counter(df[c]))
    print("%s: %i" % (c, count))
    if count > N_FEATURES:
        cols_to_hash.append(c)

#use the hash encoder to reduce the final number of features,
#since the following features have too many labels:
#block: 6747
#apartment_number: 3834
cols_not_hash = [c for c in cols_categorical if c not in cols_to_hash]
print("one hot encode %s" % cols_not_hash)
df = pd.get_dummies(data=df, drop_first=True, columns=cols_not_hash)

y = df['sale_price']
for col in cols_to_hash:
    print("hash encode %s" % col)
    encoder = FeatureHasher(n_features=N_FEATURES, input_type='string')
    encoded = encoder.fit_transform([str(v) for v in df[col].values], y)
    df_encoded = pd.DataFrame(
        encoded.toarray(),
        columns=["%s_hash_%i" % (col, i) for i in range(N_FEATURES)])
    df = pd.concat([df, df_encoded], axis=1)
    df.drop(col, axis=1, inplace=True)

#move sale_price to last column for processing's sake
cols = list(df)
col_y = cols.pop(cols.index('sale_price'))
cols.append(col_y)
df = df.ix[:, cols]

df.to_csv("../data/encoded.csv", index=False)
コード例 #31
0
    def cluster(self, dataset):
        """
		clusters the data provided into the number of 
		clusters set by self.numberOfClusters

		dataset: dict where 
		dict['data'] = data in a 2d array
		dict['labels'] = labels for each array

		returns
		-------
		a list of clusters, where a cluster is a
		list of station names
		"""
        outputlabels = []  # the set of stations per cluster
        outputdata = []  # list of set of artists per cluster
        finaloutputdata = []

        hasher = FeatureHasher(input_type="string")
        transformer = TfidfTransformer()
        km = KMeans(n_clusters=self.numberOfClusters, init="k-means++", max_iter=10, n_init=1, verbose=0)

        # edit the dataset so that it contains only artist name and not
        # artist popularity
        artistdataset = dataset["data"]

        newartistdataset = []
        for i in range(0, len(artistdataset)):
            if len(artistdataset[i]) != 0:
                newartistdataset.append(artistdataset[i][0][0])

                # if the number of artists is not enough, get more artists
                # here!!!
        print "clustering " + str(len(artistdataset)) + " artists"

        if len(artistdataset) < self.maximumArtistsToCluster:

            print "we need more artists to cluster"
            self.getMoreArtists(artistdataset)

        datacounts = hasher.fit_transform(newartistdataset)
        # tfidfcounts = transformer.fit_transform(datacounts)

        # disabled tf-idf because too slow
        # km.fit(tfidfcounts)
        km.fit(datacounts)

        labeleddata = km.labels_

        # init output array
        for i in range(0, len(set(labeleddata))):
            outputlabels.append([])
            outputdata.append([])

            # add items to output array
        for i in range(0, len(labeleddata)):
            currentcluster = labeleddata[i]
            outputlabels[currentcluster].append(dataset["labels"][i])
            outputdata[currentcluster].append(dataset["data"][i])

            # change the artist list to artist sets
        for item in outputdata:
            listofartists = []

            for artistlist in item:
                for artist in artistlist:
                    listofartists.append(artist)

            finaloutputdata.append(list(set(listofartists)))

        return {"labels": outputlabels, "data": finaloutputdata}
コード例 #32
0
#        [0., 0., 0., ..., 0., 0., 0.],

if True and FIT:
    est = LogisticRegression(multi_class='auto', solver='liblinear')
    t1 = time.time()
    est.fit(X_train, y_train)

    print(f"Score {est.score(X_test, y_test)} in {time.time()-t1}")

print("\nFeatureHasher")
print("FeatureHasher on frequency dicts")
n_features=1048576
#n_features=int(1048576 / 2)
hasher = FeatureHasher(n_features=n_features)
t1 = time.time()
X_train = hasher.fit_transform(token_freqs(d) for d in X_train_text)
X_test = hasher.transform(token_freqs(d) for d in X_test_text)
print(f"FeatureHasher XX shape {X_train.shape} with {X_train.data.nbytes:,} bytes and nnz {X_train.nnz:,} in {time.time()-t1}")

if FIT:
    est = LogisticRegression(multi_class='auto', solver='liblinear')
    t1 = time.time()
    est.fit(X_train, y_train)
    print(f"Score {est.score(X_test, y_test)} in {time.time()-t1}")


#NGRAM_MAX 1

#CountVectorizer
#CountVectorizer shape (8485, 112359) with 10,723,592 bytes and nnz 1,340,449
#Vocab length 112359
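
# token_freqs above is assumed to be defined earlier in this script; a plausible
# minimal version (an assumption, not the original implementation) simply counts
# whitespace-separated tokens per document:
from collections import defaultdict

def token_freqs(doc):
    """Map each token of the document to its number of occurrences."""
    freq = defaultdict(int)
    for tok in doc.split():
        freq[tok] += 1
    return freq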
コード例 #33
0
del orltest
feature=orldata.columns.values.tolist()
orldata.astype(object)
orldata.dtypes.value_counts()

sample=orldata.iloc[0:100,:]



from sklearn.feature_extraction import FeatureHasher

bin_columns_name=['pkgname','ver','adunitshowid','mediashowid','apptype','city','reqrealip','idfamd5','openudidmd5','model','make','osv']
for i in bin_columns_name:
    fh = FeatureHasher(n_features=5, input_type='string')
    orldata[i]=orldata[i].astype('str')
    hashed_features = fh.fit_transform(orldata[i])
    hashed_features = hashed_features.toarray()
    hashed_features=pd.DataFrame(hashed_features)
    hashed_features.columns=[i+'0',i+'1',i+'2',i+'3',i+'4']
    orldata=orldata.join(hashed_features)
    orldata=orldata.drop(columns=i)
    
oh_columns=['os','lan']
orldata_oh=pd.get_dummies(orldata[oh_columns].astype('object'))
orldata_oh=orldata_oh.reset_index(drop=True)
orldata=orldata.join(orldata_oh)
#
#
orldata=orldata.drop(columns=oh_columns)
orldata=orldata.drop(columns='sid')
label=orldata['label']
コード例 #34
0
# Hashing
# The hashing encoder uses the md5 hashing algorithm. A feature with 5 categories can be represented using N new features; similarly, a feature with 100 categories can also be transformed into N new features.
import category_encoders as ce
import pandas as pd
encoder=ce.HashingEncoder(cols='Var',n_components=6)
encoder.fit_transform(Df)
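
# A small runnable sketch of the claim above, reusing the ce/pd imports; the toy
# frame and column names are made up for illustration: a 5-category column and a
# 100-category column both come out as the same fixed number of hashed columns.
toy = pd.DataFrame({
    'few_levels': ['a', 'b', 'c', 'd', 'e'] * 20,       # 5 distinct categories
    'many_levels': ['cat_%d' % i for i in range(100)],  # 100 distinct categories
})
enc_few = ce.HashingEncoder(cols=['few_levels'], n_components=6)
enc_many = ce.HashingEncoder(cols=['many_levels'], n_components=6)
print(enc_few.fit_transform(toy[['few_levels']]).shape)    # (100, 6)
print(enc_many.fit_transform(toy[['many_levels']]).shape)  # (100, 6)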


# 11. Feature Hashing
# sklearn's FeatureHasher defaults to 2**20 output columns; n_features is set explicitly below
# If a categorical feature has many levels (say 12), feature hashing captures that information in a smaller, fixed number of columns
# 6 in the example below
from sklearn.feature_extraction import FeatureHasher
fh = FeatureHasher(n_features=6, input_type='string')
hashed_features = fh.fit_transform(df['StringVar'])
hashed_features = hashed_features.toarray()
pd.DataFrame(hashed_features) # Hashed features 
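
# Hedged aside, not part of the original recipe: with input_type='string' each sample
# is expected to be an iterable of strings, so passing a bare column of strings makes
# the hasher hash the individual characters of every value. Wrapping each value in a
# one-element list hashes the whole category value instead (reusing fh and df above):
hashed_whole = fh.fit_transform([[v] for v in df['StringVar']])
pd.DataFrame(hashed_whole.toarray())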


# 12. M-Estimate Encoder
%%time
MEE_encoder = MEstimateEncoder()
train_mee = MEE_encoder.fit_transform(train[feature_list], target)
test_mee = MEE_encoder.transform(test[feature_list])

# 13. Target Encoder
%%time
TE_encoder = TargetEncoder()
train_te = TE_encoder.fit_transform(train[feature_list], target)
test_te = TE_encoder.transform(test[feature_list])
コード例 #35
0
def make_regression_data(num_examples=100,
                         train_test_ratio=0.5,
                         num_features=2,
                         sd_noise=1.0,
                         use_feature_hashing=False,
                         feature_bins=4,
                         start_feature_num=1,
                         random_state=1234567890):

    # if we are doing feature hashing and we have asked for more
    # feature bins than number of total features, we need to
    # handle that because `make_regression()` doesn't know
    # about hashing
    if use_feature_hashing and num_features < feature_bins:
        num_features = feature_bins

    # use sklearn's make_regression to generate the data for us
    X, y, weights = make_regression(n_samples=num_examples,
                                    n_features=num_features,
                                    noise=sd_noise,
                                    random_state=random_state,
                                    coef=True)

    # since we want to use SKLL's FeatureSet class, we need to
    # create a list of IDs
    ids = ['EXAMPLE_{}'.format(n) for n in range(1, num_examples + 1)]

    # create a list of dictionaries as the features
    index_width_for_feature_name = int(floor(log10(num_features))) + 1
    feature_names = []
    for n in range(start_feature_num, start_feature_num + num_features):
        index_str = str(n).zfill(index_width_for_feature_name)
        feature_name = 'f{}'.format(index_str)
        feature_names.append(feature_name)
    features = [dict(zip(feature_names, row)) for row in X]

    # At this point the labels are generated using unhashed features
    # even if we want to do feature hashing. `make_regression()` from
    # sklearn doesn't know anything about feature hashing, so we need
    # a hack here to compute the updated labels ourselves
    # using the same command that sklearn uses inside `make_regression()`
    # which is to generate the X and the weights and then compute the
    # y as the dot product of the two. This y will then be used as our
    # labels instead of the original y we got from `make_regression()`.
    # Note that we only want to use the number of weights that are
    # equal to the number of feature bins for the hashing
    if use_feature_hashing:
        feature_hasher = FeatureHasher(n_features=feature_bins)
        hashed_X = feature_hasher.fit_transform(features)
        y = hashed_X.dot(weights[:feature_bins])

    # convert the weights array into a dictionary for convenience
    # if we are using feature hashing, we need to use the names
    # that would be output by `model_params()` instead of the
    # original names since that's what we would get from SKLL
    if use_feature_hashing:
        index_width_for_feature_name = int(floor(log10(feature_bins))) + 1
        hashed_feature_names = []
        for i in range(feature_bins):
            index_str = str(i + 1).zfill(index_width_for_feature_name)
            feature_name = 'hashed_feature_{}'.format(index_str)
            hashed_feature_names.append(feature_name)
        weightdict = dict(zip(hashed_feature_names, weights[:feature_bins]))
    else:
        weightdict = dict(zip(feature_names, weights))

    # split everything into training and testing portions
    num_train_examples = int(round(train_test_ratio * num_examples))
    train_features, test_features = (features[:num_train_examples],
                                     features[num_train_examples:])
    train_y, test_y = y[:num_train_examples], y[num_train_examples:]
    train_ids, test_ids = ids[:num_train_examples], ids[num_train_examples:]

    # create a FeatureHasher if we are asked to use feature hashing
    # with the specified number of feature bins
    vectorizer = (FeatureHasher(
        n_features=feature_bins) if use_feature_hashing else None)
    train_fs = FeatureSet('regression_train',
                          train_ids,
                          labels=train_y,
                          features=train_features,
                          vectorizer=vectorizer)
    test_fs = FeatureSet('regression_test',
                         test_ids,
                         labels=test_y,
                         features=test_features,
                         vectorizer=vectorizer)

    return (train_fs, test_fs, weightdict)
コード例 #36
0
                    item['browser_family'] = parsed_string['user_agent']['family'] if parsed_string['user_agent'][
                        'family'] else 'N/a'
                    item['os_family'] = parsed_string['os']['family'] if parsed_string['os']['family'] else 'N/a'
                    del item['http_user_agent']
                    del item['http_referer']
                    del item['time_local']
                    del item['request']
                    del item['version']
                    yield item


tic = time.time()
vec = FeatureHasher()
items = list(load_data())
# trains, tests = train_test_split(items, train_size=0.8)
X_train = vec.fit_transform(items)
print("Total", len(items))
# print("Train", len(trains))
# print("Test", len(tests))
print("Done fit train")
# X_test = vec.transform(tests)
print("Done fit test")

# fit the model
# clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)

rng = np.random.RandomState(42)
clf = IsolationForest(random_state=rng, n_estimators=10)
print("Start Fit Model")
clf.fit(X_train)
print("Done Fit Model")
コード例 #37
0
    'feature_4': 10.0
}]

# use DictVectorizer
dv = DictVectorizer()
Ydict = dv.fit_transform(data)
print('the encode array by DictVectorizer is:')
print(Ydict)
print('after to dense array:')
print(Ydict.todense())
print('dv class `vocabulary_` is:')
print(dv.vocabulary_)

# use FeatureHasher
dh = FeatureHasher()
Yhash = dh.fit_transform(data)
print('the encode array by FeatureHasher is:')
print(Yhash)
print('after to dense array:')
YhashArray = Yhash.todense()
print(YhashArray)
print('the shape of dense array by FeatureHasher is:')
print(YhashArray.shape)

# use one-hot encoder to extend
data = [[0, 10], [1, 11], [2, 8], [3, 12], [0, 15]]
oh = OneHotEncoder(categorical_features=[0])
Yoh = oh.fit_transform(data)
print('the encode array by OneHotEncoder is:')
print(Yoh)
print('the to dense array is:')
コード例 #38
0
def hash_tweets(tweetlist):
    """hash tweetlist inputs and outputs a sparse matrix representation"""
    hasher = FeatureHasher(input_type = "string")
    hashed_tweets = hasher.fit_transform(map(lambda tweet: tweet['text'], tweetlist))
    return hashed_tweets
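
# A hypothetical usage sketch (the tweets below are made up):
if __name__ == "__main__":
    toy_tweets = [{"text": "feature hashing keeps memory use flat"},
                  {"text": "sparse matrices are compact"}]
    hashed = hash_tweets(toy_tweets)
    print(hashed.shape)  # (2, 1048576) -- FeatureHasher's default n_features is 2**20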