def _make_dummies(data, variables):
    # keep only the expected categorical variables, as strings
    filtered_data = data.loc[:, variables].astype(str)
    data_dict = [dict(r.items()) for _, r in filtered_data.iterrows()]  # Series.iteritems was removed in pandas 2.0
    vectorizer = DV(sparse=False)
    vec_x_cat = vectorizer.fit_transform(data_dict)
    return vec_x_cat, vectorizer
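
# A minimal usage sketch (not from the original source; the toy DataFrame
# and the printed output below are illustrative):
import pandas as pd
from sklearn.feature_extraction import DictVectorizer as DV

toy = pd.DataFrame({'color': ['red', 'blue'], 'size': ['S', 'L']})
vec_x_cat, vectorizer = _make_dummies(toy, ['color', 'size'])
print(vec_x_cat)
# columns follow sorted feature names: color=blue, color=red, size=L, size=S
# [[0. 1. 0. 1.]
#  [1. 0. 1. 0.]]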
Example #2
def pres_drug_dv():
    """
    2.5 min on 2% dataset
    """
    df_base = _load_base()
    df_base['idx'] = list(range(len(df_base)))
    print("load trans")
    df = pd.read_csv(g.FILE_PRES_OUT, usecols=['patient_id', 'drug_id'])
    print("load trans ... done")

    patient_list = []
    df_feat = []
    for patient_id, df_part in df.groupby('patient_id'):
        df_feat.append(Counter(df_part.drug_id))
        patient_list.append(patient_id)
    X = DV(sparse=True).fit_transform(df_feat)
    print(X.shape)

    df_feat_order = pd.DataFrame({
        'patient_id': patient_list,
        'feat_idx': list(range(len(patient_list))),
    })
    df_feat_order = df_feat_order.merge(df_base, how='left', on='patient_id')
    print(df_feat_order.head())

    # Re-ordering
    X_ordered = ss.lil_matrix((len(df_base), X.shape[1]))
    for idx, row in df_feat_order.iterrows():
        X_ordered[row['idx'], :] = X[row['feat_idx'], :]

    X_ordered = ss.csr_matrix(X_ordered)
    print(X_ordered.shape)
    return X_ordered
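
# Hedged aside (not the original author's code): when every base row has a
# matching feature row, the lil_matrix re-ordering loop above collapses into
# one CSR fancy-indexing call. A self-contained toy demo of the idea:
import numpy as np
import scipy.sparse as ss

X_demo = ss.csr_matrix(np.arange(6).reshape(3, 2))  # feature rows 0, 1, 2
order = np.array([2, 0, 1])                         # order[i] = feature row for base row i
X_reordered = X_demo[order, :]                      # rows re-ordered in one call
print(X_reordered.toarray())
# [[4 5]
#  [0 1]
#  [2 3]]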
Example #3
def train_rfc():
    out = open('rfc', 'wb')
    #data = pandas.read_csv('res.csv')
    df = pandas.read_csv('res.csv').fillna('nan')
    #print df.shape
    vectorizer = DV(sparse=False)
    df_dict = df.drop(['0', '11', '12'], axis=1).T.to_dict().values()
    #df_dict = df_dict.fillna( 'NA' )
    #print df_dict
    X = vectorizer.fit_transform(df_dict)
    #print X.shape
    X = np.hstack([X, df[['0', '11']]])
    y = df.iloc[:, -1].to_numpy()  # .ix and .as_matrix() were removed from pandas
    #scale = StandardScaler()
    #X = scale.fit_transform(X)
    clf = RandomForestClassifier()
    #clf.fit(X,y)
    #pickle.dump(clf,out)
    #grid = {'C': np.power(10.0, np.arange(-5, 6))}
    #gs = grid_search.GridSearchCV(clf, grid)
    res = cross_val_score(clf, X, y, cv=5, scoring='roc_auc')  # sklearn.model_selection.cross_val_score; the cross_validation module was removed
    print(res, res.mean())
    clf = GradientBoostingClassifier()
    clf.fit(X, y)
    pickle.dump(clf, out)
    res = cross_val_score(clf, X, y, cv=5, scoring='roc_auc')
    clf.fit(X, y)
    print(res, res.mean(), clf.feature_importances_)
Example #4
 def categorical_2_dummy(self, df):
     """docstring for categorical_2_dummy"""
     df = df.applymap(str)
     ch_dict = df.T.to_dict().values() # This creates huge memory ~ 10Gb
     vec = DV(sparse=True) 
     ch_array = vec.fit_transform(ch_dict)
     '''
     ch_array = ch_array.astype('float16')
     # This step kills everything, only 9862926 of 57083328046 have values
     # Calling toarray(), it will have 2byte*57083328046=106Gb
     df_after = pd.DataFrame(ch_array, dtype='float16') 
     '''
     # One drawback of this is that SparseDataFrame doesn't support float32 or float16, which is a shame
     '''
     # TODO: wondering about the cheapest way to get this into a pandas.DataFrame; maybe just wait for Stack Overflow.
     # TODO: simply csr_matrix -> SparseDataFrame without calling toarray(); is this the most space-efficient way?
     # TODO issue on github
     df_after = pd.SparseDataFrame(index=df.index, columns=vec.get_feature_names())
     for i in np.arange(ch_array.shape[0]):
         elem = pd.SparseSeries(ch_array[i].toarray().ravel())
         df_after.loc[[2]] = [elem]  # raises NotImplementedError
     '''
     # New method (relies on the old DataFrame.to_sparse API)
     df_after = pd.DataFrame(ch_array[:, 0].toarray().ravel()).to_sparse(0)
     for i in range(1, ch_array.shape[1]):
         df_after[i] = ch_array[:, i].toarray().ravel()
         if i % 1000 == 0:
             print('Finish: ' + str(i))
     return df_after
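
# Hedged note: on newer pandas (0.25+) the column-by-column loop above is
# unnecessary; a scipy sparse matrix can be wrapped in sparse extension
# columns without ever densifying. A self-contained sketch of that API:
import pandas as pd
import scipy.sparse as ss

csr = ss.random(5, 4, density=0.3, format='csr')
df_sparse = pd.DataFrame.sparse.from_spmatrix(csr)  # no toarray() involved
print(df_sparse.sparse.density)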
Example #5
def sklearn_tree(frame, x, y):
    vectorize = DV(sparse=False)
    X = frame.loc[:, x]  # .ix was removed from pandas
    Y = frame.loc[:, y]
    del frame
    X_transform = vectorize.fit_transform(X.to_dict(orient="records"))  # the old outtype= keyword is now orient=
    dtree = sktree.DecisionTreeRegressor(max_depth=10, min_samples_split=2000)
    dtree = dtree.fit(X_transform, Y)
    return dtree
Example #6
def category_transformation(train_categoric,
                            test_categoric,
                            labels,
                            type='std'):

    if type == 'freq':
        print("Encoding categories by freqency rank...")
        for c in train_categoric.columns:
            # Series.append and pd.match were removed from pandas;
            # pd.concat + Index.get_indexer give the same positions (-1 if absent)
            freqs = pd.concat([train_categoric[c], test_categoric[c]]).value_counts()
            train_categoric[c] = freqs[0:91].index.get_indexer(train_categoric[c].values)
            test_categoric[c] = freqs[0:91].index.get_indexer(test_categoric[c].values)

    if type == 'std':
        print("Encoding categories by sklearn label encoder...")
        for c in train_categoric.columns:
            lbl = LabelEncoder()
            lbl.fit(
                list(train_categoric.loc[:, c]) + list(test_categoric.loc[:, c]))
            train_categoric.loc[:, c] = lbl.transform(train_categoric.loc[:, c])
            test_categoric.loc[:, c] = lbl.transform(test_categoric.loc[:, c])

    if type == 'tgtrate':
        print("Encoding categories by target rate...")
        for c in train_categoric.columns:
            train_categoric[c], test_categoric[c] = category_to_prob_weight(
                train_categoric, test_categoric, c, labels)

    if type == 'rank':
        print("Encoding categories by rank transformation...")
        for c in train_categoric.columns:
            rank = pd.concat([train_categoric[c], labels],
                             axis=1).groupby(c).mean().sort_values(
                                 by='target', ascending=False)
            # pd.match was removed from pandas; Index.get_indexer is the equivalent
            train_categoric[c] = rank[0:20000].index.get_indexer(train_categoric[c].values)
            test_categoric[c] = rank[0:20000].index.get_indexer(test_categoric[c].values)

    if type == 'onehot':
        print("One hot... ")
        for c in train_categoric.columns:
            uniques = np.unique(train_categoric[c])
            if len(uniques) > 100:
                train_categoric.drop(c, axis=1, inplace=True)
                test_categoric.drop(c, axis=1, inplace=True)
        x_cat_train = train_categoric.T.to_dict().values()
        x_cat_test = test_categoric.T.to_dict().values()

        # vectorize
        vectorizer = DV(sparse=False)
        train_categoric = pd.DataFrame(vectorizer.fit_transform(x_cat_train))
        test_categoric = pd.DataFrame(vectorizer.transform(x_cat_test))

    return train_categoric, test_categoric
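
# Hypothetical call for the 'onehot' path (toy frames made up for
# illustration; assumes the function above and its DV import are in scope):
import numpy as np
import pandas as pd

train_cat = pd.DataFrame({'c1': ['a', 'b', 'a']})
test_cat = pd.DataFrame({'c1': ['b', 'b', 'c']})
labels = pd.Series([0, 1, 0], name='target')
tr, te = category_transformation(train_cat, test_cat, labels, type='onehot')
# 'c1' has only two levels, so it is kept and expanded into columns c1=a, c1=b;
# the level 'c', unseen during fit, encodes to an all-zero row in the test frame.
print(tr.shape, te.shape)  # (3, 2) (3, 2)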
Example #7
    def _transfer_data_to_model(self, data, animal, total_info, logger):
        '''extract data from DataFrame'''
        total_breed = total_info[0]
        total_color = total_info[1]
        intake_df = total_info[2]

        # encode y
        (encode_y, le_y) = self._encode_y(data['OutcomeType'].values, logger)
        #print encode_y

        # encode x
        #if animal in ('Dog', 'All'):
        if True:
        #if False:
            new_age_info = self._transfer_age_infos(data['AgeuponOutcome'])
            data['EncodeAgeuponOutcome'] = new_age_info

            (year, month, weekday, hour) = self._transfer_time_infos(data['DateTime'])
            data['EncodeYear'] = year
            data['EncodeMonth'] = month
            data['EncodeWeekday'] = weekday
            data['EncodeHour'] = hour
            drop_list = ['AnimalID', 'Name', 'DateTime', 'OutcomeType', 'OutcomeSubtype', 'AgeuponOutcome', 'SexuponOutcome', 'Breed', 'Color']
            #drop_list = ['AnimalID', 'Name', 'DateTime', 'OutcomeType', 'OutcomeSubtype', 'AgeuponOutcome', 'SexuponOutcome', 'Breed']
            #drop_list = ['AnimalID', 'Name', 'DateTime', 'OutcomeType', 'OutcomeSubtype', 'AgeuponOutcome', 'SexuponOutcome']

        data['HasName'] = self._transfer_name_infos(data['Name'])
        data['Sex'] = self._transfer_sex_infos(data['SexuponOutcome'])
        data['Intact'] = self._transfer_intact_infos(data['SexuponOutcome'])
        data['IsMix'] = self._transfer_mix_infos(data['Breed'])
        #data['NewBreed'] = self._transfer_breed_infos(data['Breed'])
        #data['Species'] = self._transfer_species_infos(data['Color'])
        #data['NewColor'] = self._transfer_color_infos(data['Color'])
        data['ColorMix'] = self._transfer_color_count_infos(data['Color'])
        for breed_type in total_breed:
            data['Breed%s' % breed_type] = self._transfer_breed_type_infos(data['Breed'], breed_type)
        for color_type in total_color:
            data['Color%s' % color_type] = self._transfer_color_type_infos(data['Color'], color_type)
        (found_location, intake_type, intake_condition) = self._transfer_intake_infos(data['AnimalID'], intake_df)
        #data['FoundLocation'] = found_location
        data['IntakeType'] = intake_type
        data['IntakeCondition'] = intake_condition

        #print np.isnan(data.any())
        #print np.isfinite(data.all())
        df = data.drop(drop_list, axis=1)
        #print df.isnull().sum()
        #print pd.isnull(df).any(1).nonzero()[0]

        x = df.T.to_dict().values()
        #print x
        vectorizer_x = DV(sparse=False)
        encode_x = vectorizer_x.fit_transform(x)
        #print encode_x
        return (encode_x, encode_y, vectorizer_x, le_y)
def load_data():
    train = pd.read_csv('../input/train.csv')
    test = pd.read_csv('../input/test.csv')

    train.drop(['v22', 'v91'], axis=1, inplace=True)
    test.drop(['v22', 'v91'], axis=1, inplace=True)

    nas = {}
    for colname in cat_cols:
        nas[colname] = impute_most_freq_value(train, colname)

    for colname in cat_cols:
        train[colname].fillna(nas[colname], inplace=True)

    for colname in cat_cols:
        test[colname].fillna(nas[colname], inplace=True)

    cat_train = train[cat_cols]
    cat_test = test[cat_cols]

    # keep only the numerical columns in train/test
    train.drop(cat_cols, axis=1, inplace=True)
    test.drop(cat_cols, axis=1, inplace=True)

    print(cat_train.describe())

    # transform the categorical to dict
    dict_train_data = cat_train.T.to_dict().values()
    dict_test_data = cat_test.T.to_dict().values()

    # vectorize
    vectorizer = DV(sparse=False)
    features = vectorizer.fit_transform(dict_train_data)
    vec_data = pd.DataFrame(features)
    vec_data.columns = vectorizer.get_feature_names()
    # vec_data.rename(columns={'changed': 'vec_changed'}, inplace=True)
    # vec_data.rename(columns={'id': 'vec_id'}, inplace=True)
    vec_data.index = train.index
    train = train.join(vec_data)

    features = vectorizer.transform(dict_test_data)
    vec_data = pd.DataFrame(features)
    vec_data.columns = vectorizer.get_feature_names()
    vec_data.index = test.index
    test = test.join(vec_data)

    # split off a validation set (the last 25% of the training rows)
    trainend = int(0.75 * len(train))
    valid_inds = list(train[trainend:].index.values)
    train_inds = list(train.loc[~train.index.isin(valid_inds)].index.values)

    train.fillna(-100, inplace=True)
    test.fillna(-100, inplace=True)

    return train, test, train_inds, valid_inds
Example #9
def matrix(wordlength, length, instance):
	di = {'i': 0, 'o': 0, 'P': 1, 'L':0}
	damino = {'A': 1,'R': 2,'D': 3,'N': 4,'C': 5,'E': 6,'Q': 7,'G': 8,'H': 9,'I': 10,'L': 11,'K': 12,'M': 13,'F': 14,'P': 15,'S': 16,'T': 17,'W': 18,'Y': 19,'V': 20,'J': 21}
	word_list = []
	word_list_w = []
	toplogy_list = []
	toplogy_w = []
	tempd = ''
	z = wordpro(wordlength)
	filein = open('prototext.txt','r')
	for line in filein:
			temp_line = line.rstrip()
			#Adding characters at the beginning and end for windows
			temporary_string = ("J" * z)+(temp_line)+("J" * z)
			for each in window(temporary_string, wordlength):
				temp = ''.join(each)
				temp_c = []
				for c in temp:
					g = damino[c]
					temp_c.append(g)
				word_list.append(temp)
			temporary_topology = next(filein)
			temporary_topology = temporary_topology.rstrip()
			for c in temporary_topology:
				k = di[c]
				toplogy_list.append(k)
				toplogy_w.append(c)
	#http://stackoverflow.com/questions/30522724/take-multiple-lists-into-dataframe
	#http://stackoverflow.com/questions/20970279/how-to-do-a-left-right-and-mid-of-a-string-in-a-pandas-dataframe
	dftemp = pd.DataFrame({'word_list':word_list})
	dwtemp = pd.DataFrame({'word_list':word_list_w})
	if(length == 21):
		df = pd.read_csv("finaltest.csv")
	train_dict = df.T.to_dict().values()
	#print (train_dict)
	vectorizer = DV( sparse = False )
	vec_train = vectorizer.fit_transform( train_dict )
	max_abs_scaler = preprocessing.MaxAbsScaler()
	vec_train = max_abs_scaler.fit_transform(vec_train)
	print (vectorizer.get_feature_names())
	target = np.asarray(toplogy_list)
	X_train, X_test, y_train, y_test = train_test_split(vec_train, target, test_size=0.2, random_state=0)
	estimator = svm.SVC(kernel='rbf')
	cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)  # modern sklearn.model_selection signature
	gammas = np.logspace(-6, -1, 10)
	classifier = GridSearchCV(estimator=estimator, cv=cv, param_grid=dict(gamma=gammas))	
	X_train, X_test, y_train, y_test = train_test_split(vec_train, target, test_size=0.33, random_state=0)
	classifier.fit(vec_train, target)
	classifier_prediction = classifier.predict(X_test)
	print('\nClassification report P:\n', classification_report(y_test, classifier_prediction))
	plot_classification_report(classification_report(y_test, classifier_prediction))
	plt.savefig('P_plot_classif_report.pdf', dpi=200, format='pdf', bbox_inches='tight')
	plt.close()
	joblib.dump(classifier, 'P.pkl', compress=9)
Example #10
def diag_physician_specialty_description_dv():
    df_base = _load_base()
    df_base['idx'] = list(range(len(df_base)))
    print("load trans")
    df = pd.read_csv(
        g.FILE_DIAG_OUT,
        usecols=['patient_id', 'primary_practitioner_id'],
        dtype={'primary_practitioner_id': str},
    ).rename(columns={
        'primary_practitioner_id': 'practitioner_id'})

    df_phy = pd.read_csv(
        g.FILE_PHYS,
        usecols=['specialty_description', 'practitioner_id'],
        dtype={
            'practitioner_id': str,
            'specialty_description': str})

    df['practitioner_id'].fillna('NA', inplace=True)
    df_phy['practitioner_id'].fillna('NA', inplace=True)
    df_phy['specialty_description'].fillna('NA', inplace=True)
    df = df.merge(df_phy, how='left', on='practitioner_id')

    print("load trans ... done")

    patient_list = []
    df_feat = []
    for patient_id, df_part in df.groupby('patient_id'):
        df_feat.append(Counter(df_part.specialty_description))
        patient_list.append(patient_id)
    X = DV(sparse=True).fit_transform(df_feat)
    print(X.shape)

    df_feat_order = pd.DataFrame({
        'patient_id': patient_list,
        'feat_idx': list(range(len(patient_list))),
    })
    df_feat_order = df_feat_order.merge(df_base, how='left', on='patient_id')
    print(df_feat_order.head())

    # Re-ordering
    print("re-ordering")
    sz = len(df_feat_order)
    X_ordered = ss.lil_matrix((len(df_base), X.shape[1]))
    for idx, row in df_feat_order.iterrows():
        if idx % 1000 == 0:
            print(str(datetime.datetime.now()), idx, sz)
        X_ordered[row['idx'], :] = X[row['feat_idx'], :]
    print("re-ordering ... done")

    X_ordered = ss.csr_matrix(X_ordered)
    print(X_ordered.shape)
    return X_ordered
def encode_cat_test(X):
    print('\nSource data:\n')
    print(X.shape)
    print(X[:10])
    encoder = DV(sparse=False)
    X_cat = encoder.fit_transform(X.T.to_dict().values())
    print('\nEncoded data:\n')
    print(X_cat.shape)
    print(X_cat[:10])
    print('\nVocabulary:\n')
    print(encoder.vocabulary_)
    print(encoder.feature_names_)
    return X_cat
Example #12
def simple_mod_v1():
    data = pd.read_csv('./mushrooms.csv')
    data = data.drop_duplicates()  # drop_duplicates returns a new frame; the result was previously discarded

    #STEP ONE: PREPARE DATA#
    features = ['stalk-color-above-ring', 'spore-print-color', "gill-color"]
    print(features)
    data_x = data[features]
    data_y = data['class']
    le = preprocessing.LabelEncoder()
    le.fit(data_y)
    data_y = le.transform(data_y)
    data_x_dict = data_x.to_dict(orient='records')
    v = DV(sparse=False)
    data_x_dict = v.fit_transform(data_x_dict)

    #STEP TWO: SPLIT THE DATA#
    x_train, x_test, y_train, y_test = train_test_split(data_x_dict,
                                                        data_y,
                                                        test_size=0.3)

    #STEP THREE: CREATE MODEL#
    print('----------- DTREE WITH GINI IMPURITY CRITERION ------------------')
    dtree_gini_mod = tree.DecisionTreeClassifier(criterion='gini')
    dtree_gini_mod.fit(x_train, y_train)
    preds_gini = dtree_gini_mod.predict(x_test)
    print_multiclass_classif_error_report(y_test, preds_gini)

    #STEP FOUR: VALIDATE MODEL#

    print(
        '----------- VALIDATE: DTREE WITH GINI IMPURITY CRITERION ------------------'
    )
    data_v = pd.read_csv('./m_v.csv')

    features_v = list(data)
    features_v.remove('class')
    data_x_v = data_v[features]
    data_y_v = data_v['class']

    data_y_v = le.transform(data_y_v)
    data_x_dict_v = data_x_v.to_dict(orient='records')
    data_x_dict_v = v.transform(data_x_dict_v)

    preds_gini_v = dtree_gini_mod.predict(data_x_dict_v)
    print_multiclass_classif_error_report(data_y_v, preds_gini_v)

    return (dtree_gini_mod, le, v)
Example #13
def preprocessData(data):
    # extract categorical columns
    header_not_cat = list(data.columns.values)
    header_not_cat.remove('property_type')
    X_cat = data.drop(header_not_cat, axis=1)

    # convert categorical data to dict
    X_cat.fillna('NA', inplace=True)
    X_cat = X_cat.T.to_dict().values()

    # vectorize categorical feature
    vec = DV(sparse=False)
    X_cat = vec.fit_transform(X_cat)
    #print vec.get_feature_names()

    # extract numerical columns
    header_not_num = list(data.columns.values)
    header_not_num.remove('property_type')
    header_not_num.remove('price')
    X_num = data.drop(['property_type', 'price'], axis=1)
    X_num = X_num.values  # pandas dataframe to numpy array

    # impute n/a value (replace it with mean value)
    imp = SimpleImputer(strategy='mean')  # sklearn's Imputer was removed; NaN is the default missing marker
    X_num = imp.fit_transform(X_num)

    # scale the data
    #X_num = preprocessing.scale(X_num)
    X_scaler = preprocessing.StandardScaler().fit(X_num)
    X_num = X_scaler.transform(X_num)

    # combine numerical data and vectorized categorical data
    X = np.hstack((X_num, X_cat))

    # extract label column (predicted values: price)
    header_features = list(data.columns.values)
    header_features.remove('price')
    Y = data.drop(header_features, axis=1)
    Y = Y.price

    # standardize label column
    Y_scaler = preprocessing.StandardScaler().fit(Y.values.reshape(-1, 1))  # scalers expect 2-D input
    Y = Y_scaler.transform(Y.values.reshape(-1, 1)).ravel()

    header = header_not_num + vec.get_feature_names()
    header = np.array(header)

    return X, Y, X_scaler, Y_scaler, header, imp, vec
def oneHotEncoding(train, numeric_cols):
    # receives the clean train and test data
    # in: train and test numpy matrix
    x_num_train = train[numeric_cols].to_numpy()  # .as_matrix() was removed from pandas
    #x_num_test = test[numeric_cols].as_matrix()
    cat_train = train.drop(numeric_cols, axis=1)
    #cat_test = test.drop(numeric_cols, axis=1)
    x_cat_train = cat_train.T.to_dict().values()
    #x_cat_test = cat_test.T.to_dict().values()
    # 5.1 vectorize
    vectorizer = DV(sparse=False)
    vec_x_cat_train = vectorizer.fit_transform(x_cat_train)
    #vec_x_cat_test = vectorizer.transform(x_cat_test)
    # complete x
    x_train = np.hstack((x_num_train, vec_x_cat_train))
    #x_test = np.hstack((x_num_test, vec_x_cat_test))
    return x_train
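
# Hypothetical usage sketch (the toy frame and printed output are
# illustrative; assumes the function above and its imports are in scope):
import pandas as pd

toy = pd.DataFrame({'age': [30, 40], 'city': ['NY', 'LA']})
x_train = oneHotEncoding(toy, ['age'])
print(x_train)  # numeric column first, then one-hot columns city=LA, city=NY
# [[30.  0.  1.]
#  [40.  1.  0.]]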
Example #15
def cleanData(traindf: pd.DataFrame,
              testdf: pd.DataFrame,
              describe=False) -> (pd.DataFrame, pd.DataFrame):

    traindf.drop(['v22', 'v91'], axis=1, inplace=True)
    testdf.drop(['v22', 'v91'], axis=1, inplace=True)

    nas = {}
    for colname in objectCols:
        nas[colname] = compute_most_freq_value(traindf, colname)

    for colname in objectCols:
        traindf[colname].fillna(nas[colname], inplace=True)

    for colname in objectCols:
        testdf[colname].fillna(nas[colname], inplace=True)

    cat_train = traindf[objectCols]
    cat_test = testdf[objectCols]

    traindf.drop(objectCols, axis=1, inplace=True)
    testdf.drop(objectCols, axis=1, inplace=True)

    dict_train_data = cat_train.T.to_dict().values()
    dict_test_data = cat_test.T.to_dict().values()

    #vectorize
    vectorizer = DV(sparse=False)
    features = vectorizer.fit_transform(dict_train_data)
    vec_data = pd.DataFrame(features)
    vec_data.columns = vectorizer.get_feature_names()
    vec_data.index = traindf.index
    traindf = traindf.join(vec_data)

    features = vectorizer.transform(dict_test_data)
    vec_data = pd.DataFrame(features)
    vec_data.columns = vectorizer.get_feature_names()
    vec_data.index = testdf.index
    testdf = testdf.join(vec_data)

    traindf.fillna(traindf.mean(), inplace=True)
    testdf.fillna(testdf.mean(), inplace=True)

    if describe: describeDataframe(traindf)
    return traindf, testdf
Example #16
def pres_drug_bb_usc_code_dv():
    df_base = _load_base()
    df_base['idx'] = list(range(len(df_base)))
    print("load trans")
    df = pd.read_csv(
        g.FILE_PRES_OUT,
        usecols=['patient_id', 'drug_id'],
        dtype={'drug_id': str})
    print("load trans ... done")

    df['drug_id'].fillna('NA', inplace=True)

    # Drug
    df_drug = pd.read_csv(
        g.FILE_DRUG,
        usecols=['drug_id', 'BB_USC_code'],
        dtype={'drug_id': str, 'BB_USC_code': str})
    df_drug['drug_id'].fillna('NA', inplace=True)
    df_drug['BB_USC_code'].fillna('NA', inplace=True)

    df = df.merge(df_drug, how='left', on='drug_id')

    patient_list = []
    df_feat = []
    for patient_id, df_part in df.groupby('patient_id'):
        df_feat.append(Counter(df_part.BB_USC_code))
        patient_list.append(patient_id)
    X = DV(sparse=True).fit_transform(df_feat)
    print(X.shape)

    df_feat_order = pd.DataFrame({
        'patient_id': patient_list,
        'feat_idx': list(range(len(patient_list))),
    })
    df_feat_order = df_feat_order.merge(df_base, how='left', on='patient_id')
    print(df_feat_order.head())

    # Re-ordering
    X_ordered = ss.lil_matrix((len(df_base), X.shape[1]))
    for idx, row in df_feat_order.iterrows():
        X_ordered[row['idx'], :] = X[row['feat_idx'], :]

    X_ordered = ss.csr_matrix(X_ordered)
    print(X_ordered.shape)
    return X_ordered
Example #17
def proc_procedure_code_dv():
    df_base = _load_base()
    df_base['idx'] = list(range(len(df_base)))
    print("load trans")
    df = pd.read_csv(
        g.FILE_PROC_OUT,
        usecols=['patient_id', 'procedure_code'],
        dtype={'procedure_code': str},
    )

    df['procedure_code'].fillna('NA', inplace=True)
    print("load trans ... done")

    patient_list = []
    df_feat = []
    for patient_id, df_part in df.groupby('patient_id'):
        df_feat.append(Counter(df_part.procedure_code))
        patient_list.append(patient_id)
    X = DV(sparse=True).fit_transform(df_feat)
    print(X.shape)

    df_feat_order = pd.DataFrame({
        'patient_id': patient_list,
        'feat_idx': list(range(len(patient_list))),
    })
    df_feat_order = df_feat_order.merge(df_base, how='left', on='patient_id')
    print(df_feat_order.head())

    # Re-ordering
    print("re-ordering")
    sz = len(df_feat_order)
    X_ordered = ss.lil_matrix((len(df_base), X.shape[1]))
    for idx, row in df_feat_order.iterrows():
        if idx % 1000 == 0:
            print(str(datetime.datetime.now()), idx, sz)
        X_ordered[row['idx'], :] = X[row['feat_idx'], :]
    print("re-ordering ... done")

    X_ordered = ss.csr_matrix(X_ordered)
    print(X_ordered.shape)
    return X_ordered
Example #18
def fitData(folds, regressor, features):

    num_features = features.select_dtypes(exclude=['object'])
    num_features.fillna(0, inplace=True)
    obj_features = features.select_dtypes(include=['object'])
    obj_features.fillna('empty', inplace=True)

    encoder = DV(sparse=False)
    encoded_data = encoder.fit_transform(obj_features.T.to_dict().values())
    newFeatures = np.hstack([num_features, encoded_data])
    score = np.empty([1, 2])

    for [trainInds, testInds] in folds:
        regressor.fit(newFeatures[trainInds, :], price[trainInds])
        y_pr = regressor.predict(newFeatures[testInds, :])
        pr = myScore(price[testInds], y_pr)
        print(pr)
        score = np.append(score, pr, axis=0)

    score = np.delete(score, 0, 0)
    return score
Example #19
    def preprocess(self):
        logging.debug("Pre-processing data...")

        # Same train and test - union of train/test features.
        x_train_cols = self.train_data.columns
        x_test_cols = self.test_data.columns
        x_intersect_cols = [x for x in x_train_cols if x in x_test_cols]
        only_train_cols = [x for x in x_train_cols if x not in x_test_cols]
        only_test_cols = [x for x in x_test_cols if x not in x_train_cols]
        logging.debug("Train columns: %s Test columns: %s",
                      len(self.train_data.columns),
                      len(self.test_data.columns))
        logging.info("Only train: %s", only_train_cols)
        logging.info("Only test: %s", only_test_cols)

        self.test_data = self.test_data[x_intersect_cols]
        self.train_data = self.train_data[x_intersect_cols]
        logging.debug("Train columns: %s Test columns: %s",
                      len(self.train_data.columns),
                      len(self.test_data.columns))

        drop_cols = [
            self.problem_definition.y_column,
            self.problem_definition.id_column,
            self.problem_definition.grouping_column
        ]

        x_num_train = self.train_data.select_dtypes(
            include=['int64', 'float']).drop(drop_cols, axis=1)
        col_names_num = x_num_train.columns.values
        x_num_train = x_num_train.to_numpy()  # .as_matrix() was removed from pandas

        x_num_test = self.test_data.select_dtypes(
            include=['int64', 'float']).drop(drop_cols, axis=1).to_numpy()
        imp = SimpleImputer(strategy='mean')  # sklearn's Imputer was removed; NaN is the default missing marker
        x_num_train = imp.fit_transform(x_num_train)
        x_num_test = imp.transform(x_num_test)  # transform only: reuse the means fitted on train

        # # scale to <0,1>
        # max_train = np.amax(x_num_train, 0)
        # print(max_train)
        # print("IsNaN train:")
        # print(np.any(np.isnan(x_num_train)))

        min_max_scaler = preprocessing.MinMaxScaler()
        x_num_train = min_max_scaler.fit_transform(x_num_train)
        x_num_test = min_max_scaler.transform(
            x_num_test)  # scale test by the ranges fitted on train

        cat_train = self.train_data.select_dtypes(include=['object'])
        drop_cols = np.intersect1d(drop_cols, cat_train.columns.values)
        cat_train = cat_train.drop(drop_cols, axis=1)

        cat_test = self.test_data.select_dtypes(include=['object'])
        cat_test = cat_test.drop(drop_cols, axis=1)

        cat_train.fillna('NA', inplace=True)
        cat_test.fillna('NA', inplace=True)

        x_cat_train = cat_train.T.to_dict().values()
        x_cat_test = cat_test.T.to_dict().values()

        # vectoring
        vectorizer = DV(sparse=False)
        vec_x_cat_train = vectorizer.fit_transform(x_cat_train)
        vec_x_cat_test = vectorizer.transform(x_cat_test)
        col_names_cat = np.asarray(vectorizer.get_feature_names())
        self.col_names = np.hstack((col_names_num, col_names_cat))

        self.x_train = np.hstack((x_num_train, vec_x_cat_train))
        self.x_test = np.hstack((x_num_test, vec_x_cat_test))

        # HACK: This should be treated in a better way!!!!
        self.y_train = 1. - self.train_data[self.label]
        self.y_test = 1. - self.test_data[self.label]

        logging.info('Train 0: %s', len(np.where(self.y_train < 1)[0]))
        logging.info('Train 1: %s', len(np.where(self.y_train > 0)[0]))
        logging.info('Test 0: %s', len(np.where(self.y_test < 1)[0]))
        logging.info('Test 1: %s', len(np.where(self.y_test > 0)[0]))
Example #20

# Import CSV
data = readCsvIntoPandasDataframe(csvfile)

# extract categorical columns (cat = categorical)
header_not_cat = list(data.columns.values)
header_not_cat.remove('property_type')
X_cat = data.drop(header_not_cat, axis=1)

# convert to dict
X_cat.fillna('NA', inplace=True)
X_cat = X_cat.T.to_dict().values()

# vectorize categorical feature
vec = DV(sparse=False)
X_cat = vec.fit_transform(X_cat)
print(vec.get_feature_names())

# extract numerical columns (num = numerical)
header_not_num = list(data.columns.values)
header_not_num.remove('property_type')
header_not_num.remove('price')
X_num = data.drop(['property_type', 'price'], axis=1)
X_num = X_num.values  # pandas dataframe to numpy array

# impute n/a value (replace it with mean value)
imp = SimpleImputer(strategy='mean')  # sklearn's Imputer was removed; NaN is the default missing marker
X_num = imp.fit_transform(X_num)

# scale the data
Example #21
from sklearn.feature_extraction import DictVectorizer as DV
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed
from skll import kappa
import os


#outputDf = pd.DataFrame({"first" : [],"second" : []})
#outputDf = outputDf.append(pd.DataFrame({"first" : [2],"second" : [3]}))
os.chdir("/Users/swapnil/work/Kaggle/out/PLIA")
print("hello")


df1Data = {"col1" : [1,2,3],"col2" : ["swap","kals","bang"]}
df1 = pd.DataFrame(data = df1Data)

d = DV(sparse = False)
d1 = d.fit_transform(df1.T.to_dict().values())
#print d1

df2Data = {"col1" : [1,20,30,40],"col2" : ["swap","kals1","bang","nag"],"col3":[22,33,44,55]}
df2 = pd.DataFrame(data = df2Data)
d2 = d.transform(df2.T.to_dict().values())
print(d2)



d3 = d2[0:2,0:d2.shape[1]]


#print d.get_feature_names()
Example #22
# (loop head reconstructed; the opening line was clipped from the snippet)
for column in (
    [column for column in train.columns
     if column not in skip_feature_columns]):
    if isinstance(train[column][0], str):  # was `type(... == str)`, which is always truthy
        try:
            train[column] = train[column].apply(dollar_to_numeric)
            test[column] = test[column].apply(dollar_to_numeric)
        except ValueError:
            pass
    if type(train[column][0]) in [numpy.float64, numpy.int64]:
        numeric.append(column)
    else:
        categorical.append(column)

    print(column, train[column].nunique(), type(
        train[column][0]), train[column][0])

numeric, categorical

vectorizer = DV(sparse=False)

X_train_cat = vectorizer.fit_transform(
    train[categorical].fillna('NA').T.to_dict().values())
X_test_cat = vectorizer.transform(
    test[categorical].fillna('NA').T.to_dict().values())

X_train = numpy.hstack([train[numeric].fillna(-999).values, X_train_cat])
y_train = train[target]

X_test = numpy.hstack([test[numeric].fillna(-999).values, X_test_cat])
test_ids = test[index]
Example #23
def matrix(wordlength, length, instance):
    di = {'i': 1, 'o': 2, 'P': 3, 'L': 4}
    damino = {
        'A': 1,
        'R': 2,
        'D': 3,
        'N': 4,
        'C': 5,
        'E': 6,
        'Q': 7,
        'G': 8,
        'H': 9,
        'I': 10,
        'L': 11,
        'K': 12,
        'M': 13,
        'F': 14,
        'P': 15,
        'S': 16,
        'T': 17,
        'W': 18,
        'Y': 19,
        'V': 20,
        'J': 21
    }
    word_list = []
    word_list_w = []
    toplogy_list = []
    toplogy_w = []
    tempd = ''
    z = wordpro(wordlength)
    filein = open('prototext.txt', 'r')
    for line in filein:
        temp_line = line.rstrip()
        #Adding characters at the beginning and end for windows
        temporary_string = ("J" * z) + (temp_line) + ("J" * z)
        for each in window(temporary_string, wordlength):
            temp = ''.join(each)
            temp_c = []
            for c in temp:
                g = damino[c]
                temp_c.append(g)
            word_list.append(temp)
        temporary_topology = next(filein)
        temporary_topology = temporary_topology.rstrip()
        for c in temporary_topology:
            k = di[c]
            toplogy_list.append(k)
            toplogy_w.append(c)
    #http://stackoverflow.com/questions/30522724/take-multiple-lists-into-dataframe
    #http://stackoverflow.com/questions/20970279/how-to-do-a-left-right-and-mid-of-a-string-in-a-pandas-dataframe
    dftemp = pd.DataFrame({'word_list': word_list})
    dwtemp = pd.DataFrame({'word_list': word_list_w})
    if (length == 3):
        df = pd.DataFrame({
            'p-1': dftemp['word_list'].str[0],
            'p': dftemp['word_list'].str[1],
            'p+1': dftemp['word_list'].str[2]
        })

    if (length == 5):
        df = pd.DataFrame({
            'p-2': dftemp['word_list'].str[0],
            'p-1': dftemp['word_list'].str[1],
            'p': dftemp['word_list'].str[2],
            'p+1': dftemp['word_list'].str[3],
            'p+2': dftemp['word_list'].str[4]
        })

    if (length == 7):
        df = pd.DataFrame({
            'p-3': dftemp['word_list'].str[0],
            'p-2': dftemp['word_list'].str[1],
            'p-1': dftemp['word_list'].str[2],
            'p': dftemp['word_list'].str[3],
            'p+1': dftemp['word_list'].str[4],
            'p+2': dftemp['word_list'].str[5],
            'p+3': dftemp['word_list'].str[6]
        })

    if (length == 9):
        df = pd.DataFrame({
            'p-4': dftemp['word_list'].str[0],
            'p-3': dftemp['word_list'].str[1],
            'p-2': dftemp['word_list'].str[2],
            'p-1': dftemp['word_list'].str[3],
            'p': dftemp['word_list'].str[4],
            'p+1': dftemp['word_list'].str[5],
            'p+2': dftemp['word_list'].str[6],
            'p+3': dftemp['word_list'].str[7],
            'p+4': dftemp['word_list'].str[8]
        })

    if (length == 11):
        df = pd.DataFrame({
            'p-5': dftemp['word_list'].str[0],
            'p-4': dftemp['word_list'].str[1],
            'p-3': dftemp['word_list'].str[2],
            'p-2': dftemp['word_list'].str[3],
            'p-1': dftemp['word_list'].str[4],
            'p': dftemp['word_list'].str[5],
            'p+1': dftemp['word_list'].str[6],
            'p+2': dftemp['word_list'].str[7],
            'p+3': dftemp['word_list'].str[8],
            'p+4': dftemp['word_list'].str[9],
            'p+5': dftemp['word_list'].str[10]
        })

    if (length == 13):
        df = pd.DataFrame({
            'p-6': dftemp['word_list'].str[0],
            'p-5': dftemp['word_list'].str[1],
            'p-4': dftemp['word_list'].str[2],
            'p-3': dftemp['word_list'].str[3],
            'p-2': dftemp['word_list'].str[4],
            'p-1': dftemp['word_list'].str[5],
            'p': dftemp['word_list'].str[6],
            'p+1': dftemp['word_list'].str[7],
            'p+2': dftemp['word_list'].str[8],
            'p+3': dftemp['word_list'].str[9],
            'p+4': dftemp['word_list'].str[10],
            'p+5': dftemp['word_list'].str[11],
            'p+6': dftemp['word_list'].str[12]
        })

    if (length == 15):
        df = pd.DataFrame({
            'p-7': dftemp['word_list'].str[0],
            'p-6': dftemp['word_list'].str[1],
            'p-5': dftemp['word_list'].str[2],
            'p-4': dftemp['word_list'].str[3],
            'p-3': dftemp['word_list'].str[4],
            'p-2': dftemp['word_list'].str[5],
            'p-1': dftemp['word_list'].str[6],
            'p': dftemp['word_list'].str[7],
            'p+1': dftemp['word_list'].str[8],
            'p+2': dftemp['word_list'].str[9],
            'p+3': dftemp['word_list'].str[10],
            'p+4': dftemp['word_list'].str[11],
            'p+5': dftemp['word_list'].str[12],
            'p+6': dftemp['word_list'].str[13],
            'p+7': dftemp['word_list'].str[14]
        })

    if (length == 17):
        df = pd.DataFrame({
            'p-8': dftemp['word_list'].str[0],
            'p-7': dftemp['word_list'].str[1],
            'p-6': dftemp['word_list'].str[2],
            'p-5': dftemp['word_list'].str[3],
            'p-4': dftemp['word_list'].str[4],
            'p-3': dftemp['word_list'].str[5],
            'p-2': dftemp['word_list'].str[6],
            'p-1': dftemp['word_list'].str[7],
            'p': dftemp['word_list'].str[8],
            'p+1': dftemp['word_list'].str[9],
            'p+2': dftemp['word_list'].str[10],
            'p+3': dftemp['word_list'].str[11],
            'p+4': dftemp['word_list'].str[12],
            'p+5': dftemp['word_list'].str[13],
            'p+6': dftemp['word_list'].str[14],
            'p+7': dftemp['word_list'].str[15],
            'p+8': dftemp['word_list'].str[16]
        })

    if (length == 19):
        df = pd.DataFrame({
            'p-9': dftemp['word_list'].str[0],
            'p-8': dftemp['word_list'].str[1],
            'p-7': dftemp['word_list'].str[2],
            'p-6': dftemp['word_list'].str[3],
            'p-5': dftemp['word_list'].str[4],
            'p-4': dftemp['word_list'].str[5],
            'p-3': dftemp['word_list'].str[6],
            'p-2': dftemp['word_list'].str[7],
            'p-1': dftemp['word_list'].str[8],
            'p': dftemp['word_list'].str[9],
            'p+1': dftemp['word_list'].str[10],
            'p+2': dftemp['word_list'].str[11],
            'p+3': dftemp['word_list'].str[12],
            'p+4': dftemp['word_list'].str[13],
            'p+5': dftemp['word_list'].str[14],
            'p+6': dftemp['word_list'].str[15],
            'p+7': dftemp['word_list'].str[16],
            'p+8': dftemp['word_list'].str[17],
            'p+9': dftemp['word_list'].str[18]
        })

    if (length == 21):
        df = pd.DataFrame({
            'p-10': dftemp['word_list'].str[0],
            'p-9': dftemp['word_list'].str[1],
            'p-8': dftemp['word_list'].str[2],
            'p-7': dftemp['word_list'].str[3],
            'p-6': dftemp['word_list'].str[4],
            'p-5': dftemp['word_list'].str[5],
            'p-4': dftemp['word_list'].str[6],
            'p-3': dftemp['word_list'].str[7],
            'p-2': dftemp['word_list'].str[8],
            'p-1': dftemp['word_list'].str[9],
            'p': dftemp['word_list'].str[10],
            'p+1': dftemp['word_list'].str[11],
            'p+2': dftemp['word_list'].str[12],
            'p+3': dftemp['word_list'].str[13],
            'p+4': dftemp['word_list'].str[14],
            'p+5': dftemp['word_list'].str[15],
            'p+6': dftemp['word_list'].str[16],
            'p+7': dftemp['word_list'].str[17],
            'p+8': dftemp['word_list'].str[18],
            'p+9': dftemp['word_list'].str[19],
            'p+10': dftemp['word_list'].str[20]
        })

    train_dict = df.T.to_dict().values()
    #print (train_dict)
    vectorizer = DV(sparse=False)
    vec_train = vectorizer.fit_transform(train_dict)
    print(vectorizer.get_feature_names())
    target = np.asarray(toplogy_list)
    X_train, X_test, y_train, y_test = train_test_split(vec_train,
                                                        target,
                                                        test_size=0.2,
                                                        random_state=0)
    estimator = svm.SVC(kernel='rbf')
    cv = ShuffleSplit(n_splits=10,
                      test_size=0.2,
                      random_state=0)  # modern sklearn.model_selection signature
    gammas = np.logspace(-6, -1, 10)
    classifier = GridSearchCV(estimator=estimator,
                              cv=cv,
                              param_grid=dict(gamma=gammas))
    classifier.fit(X_train, y_train)
    title = r'Learning Curves (SVM, rbf kernel, $\gamma=%.6f$)' % classifier.best_estimator_.gamma  # raw string avoids the invalid \g escape
    estimator = svm.SVC(kernel='rbf', gamma=classifier.best_estimator_.gamma)
    plot_learning_curve(estimator, title, X_train, y_train, cv=cv)
    plt.savefig('rbf-word-%04d.pdf' % instance)
    print(classifier.score(X_test, y_test))
Example #24
    def pre_encode(self, X):
        # encode static cols
        if self.label_col not in self.static_cols:
            self.static_cols.append(self.label_col)
        if self.case_id_col not in self.static_cols:
            self.static_cols.append(self.case_id_col)
        data_final = X[X[self.event_nr_col] == 1][self.static_cols]

        # encode dynamic cols
        for i in range(1, self.max_events + 1):
            data_selected = X[X[self.event_nr_col] == i][[self.case_id_col] +
                                                         self.dynamic_cols]
            data_selected.columns = [self.case_id_col] + [
                "%s_%s%s" % (col, self.dyn_event_marker, i)
                for col in self.dynamic_cols
            ]
            data_final = pd.merge(data_final,
                                  data_selected,
                                  on=self.case_id_col,
                                  how="left")

        # encode last state cols
        for i in range(1, self.max_events + 1):
            data_selected = X[X[self.event_nr_col] == i][[self.case_id_col] +
                                                         self.last_state_cols]
            data_selected.columns = [self.case_id_col] + [
                "%s_%s%s" % (col, self.last_event_marker, i)
                for col in self.last_state_cols
            ]
            data_final = pd.merge(data_final,
                                  data_selected,
                                  on=self.case_id_col,
                                  how="left")
            if i > 1:
                for col in self.last_state_cols:
                    colname = "%s_%s%s" % (col, self.last_event_marker, i)
                    prev_col = "%s_%s%s" % (col, self.last_event_marker, i - 1)
                    missing = pd.isnull(data_final[colname])
                    # assign via .loc on the frame itself; the original
                    # chained assignment would not reliably write back
                    data_final.loc[missing, colname] = data_final.loc[missing, prev_col]

        # make categorical
        dynamic_cat_cols = [
            col for col in self.cat_cols if col in self.dynamic_cols
        ]
        static_cat_cols = [
            col for col in self.cat_cols if col in self.static_cols
        ]
        categorical_cols = [
            "%s_%s%s" % (col, self.dyn_event_marker, i)
            for i in range(1, self.max_events + 1) for col in dynamic_cat_cols
        ] + static_cat_cols
        cat_df = data_final[categorical_cols]
        cat_dict = cat_df.T.to_dict().values()
        vectorizer = DV(sparse=False)
        vec_cat_dict = vectorizer.fit_transform(cat_dict)
        cat_data = pd.DataFrame(vec_cat_dict,
                                columns=vectorizer.feature_names_)
        data_final = pd.concat(
            [data_final.drop(categorical_cols, axis=1), cat_data], axis=1)

        # dict-of-renaming .agg was removed from pandas; named aggregation replaces it
        data_final = pd.merge(data_final,
                              X.groupby(self.case_id_col)[self.event_nr_col]
                              .agg(case_length="max").reset_index(),
                              on=self.case_id_col,
                              how="left")

        # fill NA
        if self.fillna:
            for col in data_final:
                dt = data_final[col].dtype
                if dt == int or dt == float:
                    data_final[col].fillna(0, inplace=True)
                else:
                    data_final[col].fillna("", inplace=True)

        return data_final
Example #25
    def _complex_encode(self, X):  # he gives one
        # encode static cols
        if self.label_col not in self.static_cols:
            self.static_cols.append(self.label_col)
        if self.case_id_col not in self.static_cols:
            self.static_cols.append(self.case_id_col)
        data_final = X[X[self.event_nr_col] == 1][self.static_cols]

        # encode dynamic cols
        print(self.nr_events)
        for i in range(1, self.nr_events + 1):
            data_selected = X[X[self.event_nr_col] == i][[self.case_id_col] +
                                                         self.dynamic_cols]
            data_selected.columns = [self.case_id_col] + [
                "%s_%s" % (col, i) for col in self.dynamic_cols
            ]
            data_final = pd.merge(data_final,
                                  data_selected,
                                  on=self.case_id_col,
                                  how="right")

        print(data_final.columns)

        # encode last state cols
        for col in self.last_state_cols:
            data_final = pd.merge(data_final,
                                  X[X[self.event_nr_col] == self.nr_events][[
                                      self.case_id_col, col
                                  ]],
                                  on=self.case_id_col,
                                  how="right")
            for idx, row in data_final.iterrows():
                current_nr_events = self.nr_events - 1
                while pd.isnull(data_final.loc[idx,
                                               col]) and current_nr_events > 0:
                    data_final.loc[idx, col] = X[
                        (X[self.case_id_col] == row[self.case_id_col])
                        & (X[self.event_nr_col] == current_nr_events
                           )].iloc[0][col]
                    current_nr_events -= 1

        # make categorical
        dynamic_cat_cols = [
            col for col in self.cat_cols if col in self.dynamic_cols
        ]
        static_cat_cols = [
            col for col in self.cat_cols if col in self.static_cols
        ]
        categorical_cols = [
            "%s_%s" % (col, i) for i in range(1, self.nr_events + 1)
            for col in dynamic_cat_cols
        ] + static_cat_cols
        cat_df = data_final[categorical_cols]
        cat_dict = cat_df.T.to_dict().values()
        vectorizer = DV(sparse=False)
        vec_cat_dict = vectorizer.fit_transform(cat_dict)
        cat_data = pd.DataFrame(vec_cat_dict,
                                columns=vectorizer.feature_names_)
        data_final = pd.concat(
            [data_final.drop(categorical_cols, axis=1), cat_data], axis=1)

        if self.fitted_columns is not None:
            missing_cols = self.fitted_columns[~self.fitted_columns.
                                               isin(data_final.columns)]
            for col in missing_cols:
                data_final[col] = 0
            data_final = data_final[self.fitted_columns]
        else:
            self.fitted_columns = data_final.columns

        # fill NA
        if self.fillna:
            for col in data_final:
                dt = data_final[col].dtype
                if dt == int or dt == float:
                    data_final[col].fillna(0, inplace=True)
                else:
                    data_final[col].fillna("", inplace=True)
        print('data_final    ', data_final)
        return data_final
Example #26
def matrix(wordlength, length, instance):
    di = {'i': 0, 'o': 1, 'P': 2, 'L': 3}
    damino = {
        'A': 1,
        'R': 2,
        'D': 3,
        'N': 4,
        'C': 5,
        'E': 6,
        'Q': 7,
        'G': 8,
        'H': 9,
        'I': 10,
        'L': 11,
        'K': 12,
        'M': 13,
        'F': 14,
        'P': 15,
        'S': 16,
        'T': 17,
        'W': 18,
        'Y': 19,
        'V': 20,
        'J': 21
    }
    word_list = []
    word_list_w = []
    toplogy_list = []
    toplogy_w = []
    tempd = ''
    counter = 0
    z = wordpro(wordlength)
    filein = open('prototext.txt', 'r')
    for line in filein:
        temp_line = line.rstrip()
        #Adding characters at the beginning and end for windows
        temporary_string = ("J" * z) + (temp_line) + ("J" * z)
        for each in window(temporary_string, wordlength):
            temp = ''.join(each)
            temp_c = []
            for c in temp:
                g = damino[c]
                temp_c.append(g)
            word_list.append(temp)
        temporary_topology = next(filein)
        temporary_topology = temporary_topology.rstrip()
        for c in temporary_topology:
            k = di[c]
            toplogy_list.append(k)
            toplogy_w.append(c)
    #http://stackoverflow.com/questions/30522724/take-multiple-lists-into-dataframe
    #http://stackoverflow.com/questions/20970279/how-to-do-a-left-right-and-mid-of-a-string-in-a-pandas-dataframe
    dftemp = pd.DataFrame({'word_list': word_list})
    dwtemp = pd.DataFrame({'word_list': word_list_w})
    if (length == 21):
        df = pd.read_csv("finaltest.csv")
    train_dict = df.T.to_dict().values()
    #print (train_dict)
    vectorizer = DV(sparse=False)
    vec_train = vectorizer.fit_transform(train_dict)
    max_abs_scaler = preprocessing.MaxAbsScaler()
    vec_train = max_abs_scaler.fit_transform(vec_train)
    print(vectorizer.get_feature_names())
    target = np.asarray(toplogy_list)
    estimator = svm.SVC(kernel='rbf', gamma=0.027826, class_weight="balanced")
    classifier = estimator
    # the old cross_validation.KFold(n, n_folds=...) API was removed from
    # sklearn; model_selection.KFold derives n from the data given to split()
    cv = KFold(n_splits=10, shuffle=False)
    class_names = ['i', 'o', 'P', 'L']
    for train_index, test_index in cv.split(vec_train):
        counter = counter + 1
        X_tr, X_tes = vec_train[train_index], vec_train[test_index]
        y_tr, y_tes = target[train_index], target[test_index]
        clf = classifier.fit(X_tr, y_tr)

        y_pred = clf.predict(X_tes)
        cnf_matrix = confusion_matrix(y_tes, y_pred)
        np.set_printoptions(precision=2)
        print(cnf_matrix)

        # Plot non-normalized confusion matrix
        plt.figure()
        plot_confusion_matrix(cnf_matrix,
                              classes=class_names,
                              title='Confusion matrix, without normalization')
        fileName = "Plot_wnr-%04d.pdf" % counter
        plt.savefig(fileName, dpi=200, format='pdf', bbox_inches='tight')
        # Plot normalized confusion matrix
        plt.figure()
        plot_confusion_matrix(cnf_matrix,
                              classes=class_names,
                              normalize=True,
                              title='Normalized confusion matrix')
        fileName = "Plot_nr-%04d.pdf" % counter
        plt.savefig(fileName, dpi=200, format='pdf', bbox_inches='tight')
        fileName = "Plot_cr_w-%04d.pdf" % counter
        plot_classification_report(classification_report(y_tes, y_pred))  # y_true first
        plt.savefig(fileName, dpi=200, format='pdf', bbox_inches='tight')
        plt.close()

    counter = 0
    estimator = svm.SVC(kernel='rbf', gamma=0.027826)
    classifier = estimator
    cv = KFold(n_splits=10, shuffle=False)  # modern sklearn.model_selection API
    class_names = ['i', 'o', 'P', 'L']
    for train_index, test_index in cv.split(vec_train):
        counter = counter + 1
        X_tr, X_tes = vec_train[train_index], vec_train[test_index]
        y_tr, y_tes = target[train_index], target[test_index]
        clf = classifier.fit(X_tr, y_tr)

        y_pred = clf.predict(X_tes)
        cnf_matrix = confusion_matrix(y_tes, y_pred)
        np.set_printoptions(precision=2)
        print(cnf_matrix)

        # Plot non-normalized confusion matrix
        plt.figure()
        plot_confusion_matrix(cnf_matrix,
                              classes=class_names,
                              title='Confusion matrix, without normalization')
        fileName = "uw_Plot_wnr-%04d.pdf" % counter
        plt.savefig(fileName, dpi=200, format='pdf', bbox_inches='tight')
        # Plot normalized confusion matrix
        plt.figure()
        plot_confusion_matrix(cnf_matrix,
                              classes=class_names,
                              normalize=True,
                              title='Normalized confusion matrix')
        fileName = "uw_Plot_nr-%04d.pdf" % counter
        plt.savefig(fileName, dpi=200, format='pdf', bbox_inches='tight')
        fileName = "Plot_cr_uw-%04d.pdf" % counter
        plot_classification_report(classification_report(y_tes, y_pred))  # y_true first
        plt.savefig(fileName, dpi=200, format='pdf', bbox_inches='tight')
        plt.close()
Example #27
# In the previous cell we split our dataset into two further parts: one holds only the numerical features, the other only the categorical ones. We will need this to process the two kinds of data separately, and to compare the quality of different methods.
# 
# To use a regression model, the categorical features have to be converted into numerical ones. Let us look at the main way of doing so: one-hot encoding. The idea is to represent a categorical feature with a binary code: each category is mapped to its own pattern of zeros and ones.
# 
# Let us see how this method works on a simple dataset.

# In[ ]:

from sklearn.linear_model import LogisticRegression as LR
from sklearn.feature_extraction import DictVectorizer as DV

categorial_data = pd.DataFrame({'sex': ['male', 'female', 'male', 'female'], 
                                'nationality': ['American', 'European', 'Asian', 'European']})
print('Source data:\n')
print(categorial_data)
encoder = DV(sparse = False)
encoded_data = encoder.fit_transform(categorial_data.T.to_dict().values())
print('\nEncoded data:\n')
print(encoded_data)


# As you can see, the first three columns encode the nationality and the last two the sex. Identical samples get identical rows. The example also shows that the encoding greatly increases the number of features while fully preserving the information, including the presence of missing values (a missing value simply becomes one more binary feature in the transformed data).
# 
# Now let us apply one-hot encoding to the categorical features of the original dataset. Note the interface shared by all the preprocessing methods. The call
# 
#     encoder.fit_transform(X)
# 
# computes the parameters of the transformation; new data can then be transformed with
# 
#     encoder.transform(X)
# 
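# For instance, reusing the `encoder` fitted above (the extra row is made up for illustration):

# In[ ]:

new_data = pd.DataFrame({'sex': ['female'], 'nationality': ['Asian']})
print(encoder.transform(new_data.T.to_dict().values()))
# [[0. 1. 0. 1. 0.]] -- the same five columns learned during fit; a category
# never seen during fit would encode to all zeros.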
Example #28
    def _encode_feature(self, splited_key, train_data, test_data,
                        external_data, logger):
        """ feature transfer and encoding """

        # encode y
        logger.debug('splited_key[%s] encode y' % splited_key)
        (train_y, le_y) = self._encode_y(train_data['OutcomeType'].values,
                                         logger)

        (total_breed,
         total_color) = self._generate_combine_data(train_data, test_data,
                                                    logger)

        test_data.rename(columns={'ID': 'AnimalID'}, inplace=True)
        feature_columns = test_data.columns
        feature_train_data = train_data[feature_columns]
        feature_train_data.loc[:, 'data_type'] = 'train'
        feature_test_data = test_data[feature_columns]
        feature_test_data.loc[:, 'data_type'] = 'test'
        logger.debug('feature_train_data columns %s' %
                     str(feature_train_data.columns))
        logger.debug('feature_test_data columns %s' %
                     str(feature_test_data.columns))

        data = pd.concat([feature_train_data, feature_test_data])
        logger.debug('feature_train_data shape %s' %
                     str(feature_train_data.shape))
        logger.debug('feature_test_data shape %s' %
                     str(feature_test_data.shape))
        logger.debug('data shape %s' % str(data.shape))

        logger.debug('splited_key[%s] encode x' % splited_key)
        data['EncodeYear'] = data['DateTime'].apply(self._transfer_year_info)
        data['EncodeMonth'] = data['DateTime'].apply(self._transfer_month_info)
        data['EncodeWeekday'] = data['DateTime'].apply(
            self._transfer_weekday_info)
        data['EncodeHour'] = data['DateTime'].apply(self._transfer_hour_info)
        data['UnixDateTime'] = data['DateTime'].apply(
            self._transfer_unix_datetime_info)

        data['EncodeAgeuponOutcome'] = data['AgeuponOutcome'].apply(
            self._transfer_age_info)

        data = data[data['SexuponOutcome'] != '']

        data['NameLen'] = data['Name'].apply(self._transfer_name_len)

        if self._encode_type == 'dv':
            for breed_type in total_breed:
                data[breed_type] = data['Breed'].apply(
                    self._transfer_breed_type_info, args=(breed_type, ))
            for color_type in total_color:
                data[color_type] = data['Color'].apply(
                    self._transfer_color_type_info, args=(color_type, ))
        data['BreedMix'] = data['Breed'].apply(self._transfer_breed_mix_info)
        data['ColorCount'] = data['Color'].apply(
            self._transfer_color_count_info)
        data['Sex'] = data['SexuponOutcome'].apply(self._transfer_sex_info)
        data['Intact'] = data['SexuponOutcome'].apply(
            self._transfer_intact_info)

        logger.debug('transfer feature_train_data shape %s' %
                     str(feature_train_data.shape))
        logger.debug('transfer feature_test_data shape %s' %
                     str(feature_test_data.shape))

        drop_list = [
            'AnimalID', 'Name', 'DateTime', 'AgeuponOutcome', 'SexuponOutcome'
        ]
        data = data.drop(drop_list, axis=1)
        transfer_train_data = data[data['data_type'] == 'train']
        transfer_test_data = data[data['data_type'] == 'test']
        type_drop_list = ['data_type']
        transfer_train_data = transfer_train_data.drop(type_drop_list, axis=1)
        transfer_test_data = transfer_test_data.drop(type_drop_list, axis=1)
        data = data.drop(type_drop_list, axis=1)

        if self._encode_type == 'dv':  # one-hot encoder
            x_all = data.T.to_dict().values()
            vectorizer_x = DV(sparse=False)
            vectorizer_x.fit(x_all)

            x1 = transfer_train_data.T.to_dict().values()
            train_x = pd.DataFrame(vectorizer_x.transform(x1))  # transform, not fit_transform: keep the vocabulary fitted on the combined data above
            x2 = transfer_test_data.T.to_dict().values()
            test_x = pd.DataFrame(vectorizer_x.transform(x2))

            model_infos = {'vectorizer_x': vectorizer_x, 'le_y': le_y}

        elif self._encode_type == 'label':  # label encode
            col_le_dict = self._fit(data, logger)
            train_x = self._transform(transfer_train_data, col_le_dict, logger)
            test_x = self._transform(transfer_test_data, col_le_dict, logger)
            model_infos = {'col_le_dict': col_le_dict, 'le_y': le_y}
        else:
            raise ValueError("encode_type not valid, [label, dv] supported")

        logger.debug('splited_key[%s] train_x shape %s' %
                     (splited_key, str(train_x.shape)))
        logger.debug('splited_key[%s] train_y shape %s' %
                     (splited_key, str(train_y.shape)))
        logger.debug('splited_key[%s] test_x shape %s' %
                     (splited_key, str(test_x.shape)))

        return (train_x, train_y, test_x, model_infos)
Example #29
#activity_category one hot encoding
ac_cnt = train_input.activity_category.value_counts(normalize=True)
ac_cat_d = defaultdict(lambda: 'a0')
ac_cols = ac_cnt.index.values
ac_cols.sort()
for k, v in enumerate(ac_cols):
    ac_cat_d[v] = 'a' + str(k + 1)

print(ac_cat_d)
train_input['activity_category'].replace(ac_cat_d, inplace=True)
# train_input['activity_category'].head()

# DictVectorizer - string to one-hot encoding
train_dict = train_input[['activity_category', 'char_10']].T.to_dict().values()
train_input = train_input.drop(['activity_category', 'char_10'], axis=1)
train_vectorizer = DV(sparse=False)
vec_train_feat = train_vectorizer.fit_transform(train_dict)
print(type(vec_train_feat))
print(vec_train_feat[0:5, :])
train_df = pd.concat([
    train_input[['outcome', 'people_id']],
    pd.DataFrame(vec_train_feat, dtype=int)
],
                     axis=1)
print(train_df.shape)
print(train_df.head())

# test dataset - one-hot encoding
test_input = pd.read_csv('dataset/act_test_t.csv',
                         keep_default_na=True).fillna("-1")
test_input['char_10'].replace(c10_cat_d, inplace=True)
Example #30

def adjustResponse(resp):
    if resp < 1:
        return 1
    elif resp > 8:
        return 8
    else:
        return int(round(resp))


os.chdir("/Users/swapnil/work/Kaggle/out/PLIA")
print("Hello")
data = pd.read_csv("trans_train_sumMK.csv")
test = pd.read_csv("trans_train_sumMK.csv", na_values="NA")
d = DV(sparse=True)

data = data.fillna(-9999)
test = test.fillna(-9999)

trainData, cvData, yTrain, yCv = train_test_split(data,
                                                  data["Response"],
                                                  test_size=0.2,
                                                  random_state=42)

trainData = trainData.drop("Response", axis=1)
trainData = trainData.drop("Id", axis=1)

cvData = cvData.drop("Response", axis=1)
cvData = cvData.drop("Id", axis=1)