Example No. 1
def oneHotEncoding():
    print("-----------Try to ONE HOT ENCODING-----------------")
    setToCompare = 'abcdefghijklmnopqrstuvwxyz '
    ctoi = dict((c, i) for i, c in enumerate(setToCompare))
    itoc = dict((i, c) for i, c in enumerate(setToCompare))
    # integer encode input data
    integer_encoded = [ctoi[char] for char in musicdata]  # musicdata: input string assumed to be defined elsewhere
    print(integer_encoded)
    # one hot encode
    onehot = list()
    for value in integer_encoded:
        letter = [0 for _ in range(len(setToCompare))]
        letter[value] = 1
        onehot.append(letter)
    print(onehot)
    # invert encoding
    inverted = itoc[np.argmax(onehot[0])]
    print(inverted)

    print(
        "--------------------------ENCODING IN PROGRESS----------------------")
    labelencoder = LE()
    X1 = X
    Y1 = Y
    X1[:, 0] = labelencoder.fit_transform(X1[:, 0])
    onehotencoder = OHE(categorical_features=[0])  # older scikit-learn API, as in the other snippets here
    X1 = onehotencoder.fit_transform(X1).toarray()

    labelencoderY = LE()
    Y1 = labelencoderY.fit_transform(Y1)
    print(X1)
    print(Y1)
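# A minimal sketch (not part of the original example) of the same one-hot round trip with plain
# numpy; the character set matches the snippet above and `musicdata` is a hypothetical input string.
import numpy as np

charset = 'abcdefghijklmnopqrstuvwxyz '
musicdata = 'hello world'  # hypothetical: any string made of characters from charset
ctoi = {c: i for i, c in enumerate(charset)}

integer_encoded = np.array([ctoi[ch] for ch in musicdata])
onehot = np.eye(len(charset), dtype=int)[integer_encoded]  # row k of the identity is the one-hot vector for code k
inverted = ''.join(charset[i] for i in onehot.argmax(axis=1))  # argmax undoes the encoding
assert inverted == musicdata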
Example No. 2
    def __init__(self, target_building, method, load_from_file=1):

        self.time_range = (None, None)

        if not load_from_file:
            #data features
            ids, self.fd = get_data_features(target_building,
                                             self.time_range[0],
                                             self.time_range[1])
            print('%d data streams loaded' % len(ids))

            #labels
            res = get_labels(target_building)
            label = [res[srcid] for srcid in ids]
            le = LE()
            self.label = le.fit_transform(label)

            res = [self.fd, self.label]
            with open('./%s_fs.pkl' % (target_building), 'wb') as wf:
                pk.dump(res, wf)
        else:
            with open('./%s_fs.pkl' % (target_building), 'rb') as rf:
                res = pk.load(rf)
            self.fd, self.label = res[0], res[1]

        print('# of classes:', len(set(self.label)))
        print('data features for %s with dim:' % target_building,
              np.concatenate(self.fd, axis=1).shape)

        self.method = method
        self.building = target_building
Example No. 3
 def partition_data(self,args):
     method, j = args
     if method== "vi":
         dp = BayesianGaussianMixture(n_components=self.K,
                                      weight_concentration_prior=self.alpha,
                                      max_iter=1,
                                      init_params='kmeans',
                                      weight_concentration_prior_type='dirichlet_process')
         dp.fit(self.X[self.U[j]])
         Z = dp.predict(self.X[self.U[j]]).astype(int)
         Z_star = dp.predict(self.X_star).astype(int)
     if method=="gmm":
         Z,Z_star= self.uncollapsed_dp_partition_alt(j)
     elif method=="kmean":
         km = KMeans(n_clusters=self.K)
         Z = km.fit_predict(self.X[self.U[j]]).astype(int)
         Z_star = km.predict(self.X_star[self.U[j]]).astype(int)
     else:
         Z = np.random.choice(self.K,size = self.N_minibatch,replace=True)
         Z_star = np.random.choice(np.unique(Z),size = self.N_star,replace=True)
     le = LE()
     le.fit(np.hstack((Z,Z_star)))
     Z = le.transform(Z)
     Z_star = le.transform(Z_star)
     if (method=="vi"): #& (self.vi_partition):
         Z_diff = np.setdiff1d(Z_star,Z)
         if Z_diff.size > 0:
             idx = np.concatenate([np.where(Z_star == k)[0] for k in Z_diff])
             unique_Z = np.unique(Z)
             post_Z = dp.predict_proba(self.X_star[idx])[:,unique_Z]
             Z_star[idx] = [np.random.choice(unique_Z,p = post_Z_i / post_Z_i.sum() ) for post_Z_i in post_Z]
             assert(np.setdiff1d(Z_star,Z).size == 0)
     return(Z,Z_star)
Example No. 4
 def __init__(self, articles_folder=None, labels_file=None):
     super().__init__(articles_folder, labels_file)
     self.clean()
     self.sentences = [
         "[CLS] " + sentence + " [SEP]" for sentence in self.sentences
     ]
     self.le = LE()
     self.labels = self.le.fit_transform(self.gold_labels)
Example No. 5
def labele(tbl, cols='all'):
    from sklearn.preprocessing import LabelEncoder as LE
    if cols == 'all': cols = tbl.columns
    le = LE()
    for ac in cols:  # iterate over the requested columns, not unconditionally over all of them
        tbl.loc[:, ac] = le.fit_transform(tbl[ac])  # might have to return le
    return tbl
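# A variant sketch (illustrative, not from the original) that keeps one fitted encoder per column
# and returns them, so the integer codes can later be undone with inverse_transform.
def labele_keep_encoders(tbl, cols='all'):
    from sklearn.preprocessing import LabelEncoder as LE
    if cols == 'all':
        cols = tbl.columns
    encoders = {}
    for ac in cols:
        encoders[ac] = LE()  # a fresh encoder per column so classes_ are not overwritten
        tbl.loc[:, ac] = encoders[ac].fit_transform(tbl[ac])
    return tbl, encoders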
Example No. 6
def main():
    features = [
        'time1', 'id1', 'ppid1', 'pid1', 'exe1', 'time2', 'id2', 'ppid2',
        'pid2', 'exe2', 'time3', 'id3', 'ppid3', 'pid3', 'exe3'
    ]
    try:
        fmatrix = pd.read_csv('feature_matrix.csv', header=0)
    except FileNotFoundError:  # build the feature matrix only if the cached CSV is missing
        fmatrix = pd.DataFrame({})
        for ftype in ['benign', 'malicious']:
            for i in range(1, 50):
                fvfile = 'fv_' + ftype + str(i) + '.out'
                timefile = 'times_' + ftype + str(i) + '.txt'
                fv = pd.read_csv(fvfile, header=0)
                with open(timefile) as t:
                    times = re.findall(r'(start|end)=(\d+)', t.read())
                    start = int(times[0][1])
                    end = int(times[1][1])
                if ftype == 'benign':
                    fv['label'] = [0] * len(fv)
                else:
                    fv['label'] = fv['time1'].apply(payload, args=(start, end))
                fmatrix = pd.concat((fmatrix, fv), ignore_index=True)

        for f in features:
            fmatrix[f] = LE().fit_transform(fmatrix[f])
        fmatrix.to_csv('feature_matrix.csv', index=False)

    #clf = xgb.XGBClassifier()
    clf = RandomForestClassifier()
    le = {f: LE() for f in features}  # per-feature encoders, available for inverse mapping (unused below)
    train, test = train_test_split(fmatrix)
    clf.fit(train[features], train['label'])
    ypred = clf.predict_proba(test[features])
    fpr, tpr, _ = roc_curve(test['label'], ypred[:, 1])

    layout = Layout(title='ROC curve for random forest with all features',
                    xaxis=dict(title='False positive rate'),
                    yaxis=dict(title='True positive rate'),
                    showlegend=True)
    fig = Figure(data=[
        Scatter(x=fpr, y=tpr, mode='lines', name='AUC %f' % auc(fpr, tpr))
    ],
                 layout=layout)
    py.plot(fig, filename='rf feature roc', auto_open=False)
Example No. 7
 def partition_data(self,j):
     dp = BayesianGaussianMixture(n_components=int(self.alpha * np.log(self.N)),
                                  weight_concentration_prior=self.alpha,
                                  init_params='kmeans',
                                  weight_concentration_prior_type='dirichlet_process')
     Z = dp.fit_predict(self.X[self.U[j]])
     le = LE()
     Z = le.fit_transform(Z)
     Z_count = np.bincount(Z)
     assert(Z.max()+1 == Z_count.size)
     self.K[j] = int(Z_count.size)
     self.marginal_LL_k[j] = {k:0 for k in range(int(self.K[j])) }
     return(Z,Z_count)
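# Illustrative follow-up (not from the original): the LabelEncoder step above matters because a
# Dirichlet-process mixture can leave components unused, so the predicted cluster ids need not be
# contiguous; re-encoding maps them to 0..K-1 so the bincount/size assertion holds.
import numpy as np
from sklearn.preprocessing import LabelEncoder as LE

Z_raw = np.array([0, 3, 3, 7, 0])  # hypothetical non-contiguous component ids
Z = LE().fit_transform(Z_raw)      # -> array([0, 1, 1, 2, 0])
Z_count = np.bincount(Z)           # -> array([2, 2, 1]); Z.max() + 1 == Z_count.size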
Example No. 8
 def __init__(self, faqslist, type):
     self.faqslist = faqslist
     self.stemmer = LancasterStemmer()
     self.le = LE()
     self.vectorizers = {
         "tfidf": TfidfVectorGenerator(),
         "doc2vec": Doc2VecGenerator(),
         "bert": BertGenerator(),
         "sent2vec": Sent2VecGenerator()
     }
     self.build_model(type)
Example No. 9
    def __init__(self, faqslist):
        self.faqslist = faqslist
        self.stemmer = LancasterStemmer()
        self.le = LE()
        self.vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
        dataframeslist = [
            pd.read_csv(csvfile).dropna() for csvfile in self.faqslist
        ]
        self.data = pd.concat(dataframeslist, ignore_index=True)
        self.questions = self.data['Question'].values

        self.build_model()
Example No. 10
def test_data_prep(csv_file):
    train_data = pd.read_csv(os.path.abspath(csv_file))
    train_data = train_data.set_index('PassengerId')
    train_data = train_data.drop(['Cabin','Name','Ticket'],axis = 1)
    train_data['Age'] = train_data['Age'].fillna(value = train_data['Age'].mean())
    train_data['Fare'] = train_data['Fare'].fillna(value = train_data['Fare'].mean())
    train_data = train_data.dropna(axis=0)
    le = LE()
    train_data['Sex'] = le.fit_transform(train_data['Sex'])
    train_data['Embarked'] = le.fit_transform(train_data['Embarked'])
    train_data['Age'] = (tf.keras.utils.normalize(np.array(train_data['Age']),order=2)).reshape(-1,1)
    train_data['Fare'] = (tf.keras.utils.normalize(np.array(train_data['Fare']),order=2)).reshape(-1,1)
    return(train_data)
Example No. 11
def _load_dataset():
    df_train = pd.read_csv(ENROLL_TRAIN)
    df_train['dataset'] = 'train'
    df_test = pd.read_csv(ENROLL_TEST)
    df_test['dataset'] = 'test'
    df = pd.concat([df_train, df_test])
    truth_df = pd.read_csv(TRUTH_TRAIN, names=[
        'enrollment_id',
        'target',
    ]).replace(1.0, 'dropout').replace(0.0, 'continue')
    df = df.merge(truth_df, how='left', on='enrollment_id')
    df['target'] = df['target'].fillna('testcase')
    df['course_num'] = LE().fit_transform(df['course_id'])
    return df
Example No. 12
def _class_q(y_true=0, y_pred=0):
    labels_en = LE()
    labels_en.fit(np.hstack((y_true, y_pred)))  # fit on both so unseen predicted labels do not break transform
    y_true = labels_en.transform(y_true)
    y_pred = labels_en.transform(y_pred)
    try:
        a = accuracy_score(y_pred=y_pred, y_true=y_true)
    except:
        a = np.nan
    try:

        fpr, tpr, thresholds = roc_curve(y_true, y_pred)
        b = auc(fpr, tpr)
    except:
        b = np.nan
    try:
        c = average_precision_score(
            y_true,
            y_pred,
        )
    except:
        c = np.nan
    try:
        d = balanced_accuracy_score(y_pred=y_pred, y_true=y_true)
    except:
        d = np.nan

    #e = roc_auc_score(y_true,y_pred,'micro' )

    erg = np.zeros((1, 4))
    erg[0, 0] = a
    erg[0, 1] = b
    erg[0, 2] = c
    erg[0, 3] = d

    res_class = pd.DataFrame(data=erg,
                             columns=[
                                 'accuracy_score', 'auc',
                                 'average_precision_score',
                                 'balanced_accuracy_score'
                             ])

    return res_class
Example No. 13
def train_data_prep(csv_file):
    train_data = pd.read_csv(os.path.abspath(csv_file))
    train_data = train_data.set_index('PassengerId')
    train_data = train_data.drop(['Cabin','Name','Ticket'],axis = 1)
    train_data['Age'] = train_data['Age'].fillna(value = train_data['Age'].mean())
    train_data['Fare'] = train_data['Fare'].fillna(value = train_data['Fare'].mean())
    train_data = train_data.dropna(axis=0)
    le = LE()
    train_data['Sex'] = le.fit_transform(train_data['Sex'])
    train_data['Embarked'] = le.fit_transform(train_data['Embarked'])
    train_data['Age'] = (tf.keras.utils.normalize(np.array(train_data['Age']),order=2)).reshape(-1,1)
    train_data['Fare'] = (tf.keras.utils.normalize(np.array(train_data['Fare']),order=2)).reshape(-1,1)
#     train_data['Pclass'] = (tf.keras.utils.normalize(np.array(train_data['Pclass']),order=2)).reshape(-1,1)
#     train_data['Sex'] = (tf.keras.utils.normalize(np.array(train_data['Sex']),order=2)).reshape(-1,1)
#     train_data['Embarked'] = (tf.keras.utils.normalize(np.array(train_data['Embarked']),order=2)).reshape(-1,1)
#     train_data['Parch'] = (tf.keras.utils.normalize(np.array(train_data['Parch']),order=2)).reshape(-1,1)
    X = train_data.drop('Survived',axis = 1)
    y = train_data['Survived']
    return(train_test_split(X, y, test_size=0.10, random_state=42))
Example No. 14
    def train_model(self):
        le = LE()
        jobs = pd.read_excel("train.xls")
        jobs['Location'] = le.fit_transform(jobs['Location'].values)  # re-encode city names as integers

        jobs['Education'] = jobs['Education'].replace('中专', 0)  # re-encode education level: technical secondary school
        jobs['Education'] = jobs['Education'].replace('高中', 1)  # senior high school
        jobs['Education'] = jobs['Education'].replace('大专', 2)  # junior college
        jobs['Education'] = jobs['Education'].replace('本科', 3)  # bachelor's degree
        jobs['Education'] = jobs['Education'].replace('硕士', 4)  # master's degree
        ## feature selection
        X = jobs[['Location', 'Education', 'Experience']]
        ## target values
        y = jobs['Salary']

        ## model training
        model = LR()
        model.fit(X, y)
        joblib.dump(
            le, r"G:\practical-training-main\machine_learning\model\le.model")
        joblib.dump(
            model,
            r"G:\practical-training-main\machine_learning\model\lr.model")
Example No. 15
    def __init__(
        self,
        data_path,
        var_dict,
        n_bins_range=range(2, 21),
        batch_size=512,
        n_epoch=20,
        embedding_dim=16,
        lr=0.001,
        weight_decay=0.0,
        verbose=False,
        cv=10,
        n_init_bins_list=[5, 10, 15, 20],
        co_occur_cutoff=1,
    ):

        self.data = pd.read_csv(data_path)
        self.var_dict = var_dict

        self.n_bins_range = n_bins_range
        self.n_init_bins_list = n_init_bins_list

        self.semantic_binning = SemanticBinning(
            self.var_dict,
            batch_size=batch_size,
            n_epoch=n_epoch,
            embedding_dim=embedding_dim,
            lr=lr,
            weight_decay=weight_decay,
            verbose=verbose,
            co_occur_cutoff=co_occur_cutoff)
        self.cv = cv
        self.y = LE().fit_transform(self.data[var_dict['class_var']])

        self.lr_params = [0.5, 1.0]
        self.dt_params = [3, 4, 5]
        self.rf_params = [10, 20, 30]
Example No. 16
if EXPORT_MODELS:
    if not os.path.isdir(model_dir+'/feature_extractors'):
        os.makedirs(model_dir+'/feature_extractors')
    if not os.path.isdir(model_dir+'/pca'):
        os.makedirs(model_dir+'/pca')
    if not os.path.isdir(model_dir+'/models'):
        os.makedirs(model_dir+'/models')
if args.toy:
    data = pd.read_csv(train_path, sep=',', na_values=None, na_filter=False).sample(10000) # toy
else:
    data = pd.read_csv(train_path, sep=',', na_values=None, na_filter=False)
if not TESTING:
    if IMPORT_MODELS and os.path.exists(model_dir+"/feature_extractors/le.pkl"):
        label_le = load_model(model_dir+"/feature_extractors/le.pkl")
    else:
        label_le = LE().fit(data.click)
        if EXPORT_MODELS:
            with open(model_dir+"/feature_extractors/le.pkl", "wb") as f: # export pca transformer
                pickle.dump(label_le, f, pickle.HIGHEST_PROTOCOL)
    label = label_le.transform(data.click)
    del data['click'] # remember not to let the label become a feature, or the model would see the answer directly
# feature selection and dimensionality reduction are handed off to the SVD decomposition

selected_col = ['spaceType','spaceId','adType','os','deviceType','campaignId','advertiserId']
data = data[selected_col]

if IMPORT_MODELS and os.path.exists(model_dir+"/feature_extractors/dv.pkl"):
    dv = load_model(model_dir+"/feature_extractors/dv.pkl")
else:
    dv = DictVectorizer(sparse=False).fit(data.T.to_dict().values()) # this step needs plenty of RAM (>8 GB is certainly enough)
    if EXPORT_MODELS:
Example No. 17
# In[8]:

train_data = train_data.dropna(axis=0)

# In[9]:

pd.isna(train_data).sum()

# In[10]:

train_data[0:10]

# In[11]:

le = LE()
train_data['Sex'] = le.fit_transform(train_data['Sex'])
train_data['Embarked'] = le.fit_transform(train_data['Embarked'])

# In[12]:

train_data[0:10]

# In[13]:

train_data.describe()

# In[14]:

train_data['Age'] = (tf.keras.utils.normalize(np.array(train_data['Age']),
                                              order=2)).reshape(-1, 1)
Example No. 18
def data_transform(sl_data, tk_data, tt_data, us_data):
    '''
    Merge serviceline data with ticket data,
    take distinct values in tags, group names, and request types,
    and create dummy variables for each of them.
    '''
    #@local Variables
    #CATEG = ['
    le = LE()
    #CATEG = ['device_type', 'plan_var']
    try:
        db_conn = ms.connect("localhost", "root", "root", "churn_model")
        print('MySQL connection:\t\t\t[OK]')
    except:
        print('MySQL connection:\t\t\t[Fail]')

    try:
        db_curs = db_conn.cursor()
        print('Cursor:\t\t\t[OK]')
    except:
        print('Cursor:\t\t\t[Fail]')

    sl_data = sl_data[[
        'sl_uuid', 'churn_flag', 'device_switch', 'customer_life',
        'device_type', 'multiline_flag', 'plan_var', 'device_life'
    ]]
    tk_data = tk_data[['sl_uuid', 'num_tickets', 'first_time', 'full_time']]
    tt_data = tt_data[[
        'sl_uuid', 'group_name', 'request_type', 'ticket_class', 'ticket_type',
        'tags', 'diff_date'
    ]]
    us_data = us_data[['sl_uuid', 'voice_dt', 'data_dt']]

    #---------PROCESS STEP:- MERGE DATA:: service line + number of tickets
    model_data = pd.merge(sl_data, tk_data, on=['sl_uuid'], how='left')
    model_data = pd.merge(model_data, us_data, on=['sl_uuid'], how='left')
    model_data = model_data.dropna(subset=['voice_dt'], how='all')
    print(model_data.columns)
    #---------PROCESS STEP:- Get distinct values of dummy variables
    device_name = sl_data['device_type'].unique().tolist()
    plan_name = sl_data['plan_var'].unique().tolist()
    group_name = tt_data['group_name'].unique().tolist()
    request_type = tt_data['request_type'].unique().tolist()
    ticket_class = tt_data['ticket_class'].unique().tolist()
    ticket_type = tt_data['ticket_type'].unique().tolist()
    tags = tt_data['tags'].tolist()
    tags_distinct = []
    for i in tags:
        split_strings = i.split(' ')
        for j in split_strings:
            if j not in tags_distinct:
                tags_distinct.append(j)

    #@substep: write the unique values to files
    group_file = open('Dummy variables/Group_name.txt', 'w')
    request_file = open('Dummy variables/Request_type.txt', 'w')
    tclass_file = open('Dummy variables/Ticket_class.txt', 'w')
    tag_file = open('Dummy variables/Tag_name.txt', 'w')
    ttype_file = open('Dummy variables/Ticket_type.txt', 'w')

    for g in group_name:
        group_file.write(str(g) + '')
        group_file.write('\n')

    for r in request_type:
        request_file.write(str(r) + '')
        request_file.write('\n')

    for tc in ticket_class:
        tclass_file.write(str(tc) + '')
        tclass_file.write('\n')

    for t in tags_distinct:
        tag_file.write(str(t) + '')
        tag_file.write('\n')

    for tt in ticket_type:
        ttype_file.write(str(tt) + '')
        ttype_file.write('\n')
    print("Writing to files:\t\t\t[OK]")

    group_file.close()
    tclass_file.close()
    tag_file.close()
    ttype_file.close()
    request_file.close()
    dummy_var_header = group_name + request_type + ticket_class + ticket_type
    dummy_var_header2 = device_name + plan_name
    dummy = {}
    sl_uuid = model_data['sl_uuid'].unique()

    #Create dataframes for ticket type and then use that to merge with serviceline data
    group_df = pd.get_dummies(tt_data['group_name'])
    group_df['sl_uuid'] = tt_data['sl_uuid']
    group_df = group_df.groupby(['sl_uuid']).sum()
    group_df['sl_uuid'] = group_df.index.tolist()
    print(tt_data.head())
    print(group_df.head())
    request_df = pd.get_dummies(tt_data['request_type'])
    request_df['sl_uuid'] = tt_data['sl_uuid']
    request_df = request_df.groupby(['sl_uuid']).sum()
    request_df['sl_uuid'] = request_df.index.tolist()
    class_df = pd.get_dummies(tt_data['ticket_class'])
    class_df['sl_uuid'] = tt_data['sl_uuid']
    class_df = class_df.groupby(['sl_uuid']).sum()
    class_df['sl_uuid'] = class_df.index.tolist()
    type_df = pd.get_dummies(tt_data['ticket_type'])
    type_df['sl_uuid'] = tt_data['sl_uuid']
    type_df = type_df.groupby(['sl_uuid']).sum()
    type_df['sl_uuid'] = type_df.index.tolist()
    device_df = pd.get_dummies(sl_data['device_type'])
    device_df['sl_uuid'] = model_data['sl_uuid']
    plan_df = pd.get_dummies(sl_data['plan_var'])
    plan_df['sl_uuid'] = model_data['sl_uuid']
    print(len(model_data))
    #merge
    model_df = pd.merge(model_data[['sl_uuid']],
                        group_df,
                        on=['sl_uuid'],
                        how='left')
    model_df.fillna(0, inplace=True)

    model_df = pd.merge(model_df, request_df, on=['sl_uuid'], how='left')
    model_df.fillna(0, inplace=True)

    model_df = pd.merge(model_df, class_df, on=['sl_uuid'], how='left')
    model_df.fillna(0, inplace=True)

    model_df = pd.merge(model_df, type_df, on=['sl_uuid'], how='left')
    model_df.fillna(0, inplace=True)

    model_df = pd.merge(model_df, device_df, on=['sl_uuid'], how='left')
    print(model_df.columns.tolist())
    model_df = pd.merge(model_df, plan_df, on=['sl_uuid'], how='left')
    print(model_df.columns.tolist())
    model_df = model_df.dropna()
    print(len(model_df))
    '''
	for i in sl_uuid:
		subset = tt_data[tt_data['sl_uuid'] == i]
		subset2 = sl_data[sl_data['sl_uuid'] == i]
		dummy_indiv = []
		if len(subset) > 0:
			#@comment - has some tickets and can be worked on creating dummy
			#group_name

			for x in group_name:
				if x in subset['group_name'].tolist():
					dummy_indiv.append(subset['group_name'].tolist().count(x))
				else:
					dummy_indiv.append(0)
		
			for x in request_type:
				if x in subset['request_type'].tolist():
					dummy_indiv.append(subset['request_type'].tolist().count(x))
				else:
					dummy_indiv.append(0)

			for x in ticket_class:
				if x in subset['ticket_class'].tolist():
					dummy_indiv.append(subset['ticket_class'].tolist().count(x))
				else:
					dummy_indiv.append(0)
			
			for x in ticket_type:
				if x in subset['ticket_type'].tolist():
					dummy_indiv.append(subset['ticket_type'].tolist().count(x))
				else:
					dummy_indiv.append(0)

	'''
    '''		
			sl_tags = []
			temp = subset['tags'].tolist()
			for y in temp:
				sl_tags += y.split(' ')
			sl_tags = list(set(sl_tags))	
			for x in tags_distinct:
				if x in sl_tags:
					dummy_indiv.append(1)
				else:
					dummy_indiv.append(0)
	'''
    '''
			dummy[i] = dummy_indiv								
		else:
			dummy[i] = [0] * len(dummy_var_header)
		if len(subset2) > 0:
			print "entering subset2"
			dummy_indiv = []
			for x in device_name:
				if x in subset2['device_type'].tolist():
					dummy_indiv.append(subset2['device_type'].tolist().count(x))
				else:
					dummy_indiv.append(0)
			for x in plan_name:
				if x in subset2['plan_var'].tolist():
					dummy_indiv.append(subset2['plan_var'].tolist().count(x))
				else:
					dummy_indiv.append(0)
			dummy[i]+= dummy_indiv
		else:
			dummy[i] += [0] * len(dummy_var_header2)
		print sl_uuid.tolist().index(i)
		print dummy[i]	
	'''
    '''
	model_df = pd.DataFrame.from_dict(dummy, 'index')
	model_df.columns = dummy_var_header	
	'''
    model_df = pd.merge(model_df, model_data, on=['sl_uuid'], how='left')
    col = model_df.columns.tolist()
    col.remove('device_type')
    col.remove('plan_var')
    model_df = model_df[col].copy()
    model_df['num_tickets'] = model_df['num_tickets'].fillna(0)
    col = model_df.columns.tolist()
    #col.remove('customer_life')

    #@sql write: to local db -> model dataset
    try:
        db_curs.execute('DROP TABLE model_data;')
    except:
        print('Table not present')
    '''	
	model_df = model_data	
	model_df['num_tickets'] = model_df['num_tickets'].fillna(0)
	col = model_df.columns.tolist()
	col.remove('num_tickets')
	model_df = model_df[['device_switch', 'churn_flag', 'sl_uuid']]
	pk.dump(model_df, open('/home/rsrash1990/Ravi Files/Work Projects/Churn Model/Pickle data/model_data.pk','w'))
	'''
    #model_df.to_sql(con=db_conn, name='model_data', if_exists='append', flavor='mysql')
    #print model_df.head()
    #print model_df.describe()
    '''
	for i in model_df.columns:
		temp = model_df[i].tolist()
		print (i + '\t:' + str(sum(x is None for x in temp)))
	'''

    #model = model_df[['sl_uuid', 'churn_flag']]

    pk.dump(
        model_df,
        open(
            '/home/rsrash1990/Ravi Files/Work Projects/Churn Model/Pickle data/model_df.pk',
            'w'))
    return model_df
Example No. 19
def preprocessing(dataset):
    for col in dataset.columns:
        le = LE()
        dataset[col] = le.fit_transform(dataset[col])
    return dataset
Example No. 20
    def __init__(self,
                 target_building,
                 target_srcids,
                 fold,
                 rounds,
                 use_all_metadata=False,
                 source_building=None
                 ):
        super(ActiveLearningInterface, self).__init__(
            target_building=target_building,
            target_srcids=target_srcids
        )

        srcids = [point['srcid'] for point
                  in LabeledMetadata.objects(building=target_building)]
        pt_type = [LabeledMetadata.objects(srcid=srcid).first().point_tagset
                   for srcid in srcids]
        if use_all_metadata:
            pt_name = []
            for srcid in srcids:
                raw_metadata = RawMetadata.objects(srcid=srcid).first().metadata
                sentence = []
                sentence = '\n'.join([raw_metadata.get(metadata_type, '')
                                      for metadata_type
                                      in ['VendorGivenName',
                                          'BACnetName',
                                          'BACnetDescription']
                                      ])
                pt_name.append(sentence)
        else:
            pt_name = [RawMetadata.objects(srcid=srcid).first()
                       .metadata['VendorGivenName'] for srcid in srcids]

        fn = get_name_features(pt_name)

        le = LE()
        try:
            le.fit(pt_type)
        except:
            pdb.set_trace()

        transfer_fn = []
        transfer_label = []

        if source_building:
            srcids = [point['srcid'] for point
                      in LabeledMetadata.objects(building=source_building)]
            source_type = [LabeledMetadata.objects(srcid=srcid).first().point_tagset
                       for srcid in srcids]
            source_name = [RawMetadata.objects(srcid=srcid).first()\
                       .metadata['VendorGivenName'] for srcid in srcids]

            fn_all = get_name_features( pt_name + source_name )
            fn = fn_all[:len(pt_name), :]
            transfer_fn = fn_all[len(pt_name):, :]

            try:
                le.fit( pt_type + source_type )
                transfer_label = le.transform(source_type)
            except:
                pdb.set_trace()

            print ('%d instances loaded from transferred bldg: %s'%(len(transfer_label), source_building))

        try:
            label = le.transform(pt_type)
        except:
            pdb.set_trace()

        #print ('# of classes is %d'%len(np.unique(label)))
        print ('running active learning by Hong on building %s'%target_building)
        print ('%d instances loaded'%len(pt_name))


        self.learner = active_learning(
            fold,
            rounds,
            #2 * len( np.unique(label) ),
            28,
            fn,
            label,
            transfer_fn,
            transfer_label
        )
Example No. 21
q1, q3 = np.percentile(df['fractal_dimension_worst_log'], [25, 75])
iqr = q3 - q1
lower_bound = q1 - (1.5 * iqr)
upper_bound = q3 + (1.5 * iqr)
median = np.median(df['fractal_dimension_worst_log'])
count = 0
for i in range(len(df['fractal_dimension_worst_log'])):
    if (df.iloc[i]['fractal_dimension_worst_log'] > upper_bound) or (
            df.iloc[i]['fractal_dimension_worst_log'] < lower_bound):
        count = count + 1
# print((count/len(df['fractal_dimension_worst_log']))*100)

# sns.countplot(x='diagnosis',data=df)

from sklearn.preprocessing import LabelEncoder as LE
df['diagnosis_1'] = LE().fit_transform(df['diagnosis'])
# print ("\n\n", df.head(15))

df.drop([
    'radius_mean', 'Unnamed: 32', 'texture_mean', 'perimeter_mean',
    'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
    'concave points_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se',
    'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se',
    'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se',
    'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
    'smoothness_worst', 'compactness_worst', 'concavity_worst',
    'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst'
],
        axis=1,
        inplace=True)
Example No. 22
import re
import pandas as pd
import tensorflow as tf
from tensorflow.contrib import learn
import warnings
warnings.filterwarnings('ignore')
# Library for tensorflow logging
import logging
logging.getLogger().setLevel(logging.INFO)
df = pd.read_csv("../input/train.csv")
# Selecting features
features = ["bone_length", "rotting_flesh", "hair_length", "color", "has_soul"]
X = df[features]
y = df["type"]
# Encoding type (Ghost,Ghouls,Goblin) and color
from sklearn.preprocessing import LabelEncoder as LE
letype = LE()
y = letype.fit_transform(y)
lecolor = LE()
X["color"] = lecolor.fit_transform(X["color"])
# splitting function used for cross validation
from sklearn.cross_validation import train_test_split
# current test size = 0 to allow using the whole training set for fitting
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.0,
                                                    random_state=9)
# define a network with a single hidden RELU layer of 15 hidden units
tf_clf_dnn = learn.DNNClassifier(hidden_units=[15], n_classes=3)
tf_clf_dnn.fit(X_train, y_train, max_steps=5500)
from sklearn.metrics import accuracy_score as as_
# print(as_(y_test,tf_clf_dnn.predict(X_test)))
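# Illustrative follow-up (not in the original): because LabelEncoder keeps the class names,
# integer predictions can be mapped back to the monster types, e.g. when building a submission.
import numpy as np
example_preds = np.array([0, 2, 1])             # hypothetical integer predictions from a classifier
print(letype.inverse_transform(example_preds))  # original labels, e.g. ['Ghost' 'Goblin' 'Ghoul']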
Example No. 23
    data[i, 0] = data[i, 0].lower()
#print(data[:,0])
'''
# Encoding loop
#data_enc = LE()
data_enc = np.full(8, None)
for i in [0,5,6,7]:
    data_enc[i] = LE()
    data[:,i] = data_enc[i].fit_transform(data[:,i])
ohe_enc = OHE(categorical_features=[0])
#data = ohe_enc.fit_transform(data).toarray()
#data = data.astype(int)
print(data ,pd.DataFrame(data))
'''
# Encoding
level_enc = LE()
activitve_enc = LE()
study_enc = LE()
biography_enc = LE()
data[:, 0] = level_enc.fit_transform(data[:, 0])
data[:, 5] = activitve_enc.fit_transform(data[:, 5])
data[:, 6] = study_enc.fit_transform(data[:, 6])
data[:, 7] = biography_enc.fit_transform(data[:, 7])
print(data, pd.DataFrame(data))
# export the fitted encoders
pickle.dump(level_enc, open('model/level_enc_model.sav', 'wb'))
pickle.dump(activitve_enc, open('model/activitve_enc_model.sav', 'wb'))
pickle.dump(study_enc, open('model/study_enc_model.sav', 'wb'))
pickle.dump(biography_enc, open('model/biography_enc_model.sav', 'wb'))

# normalization
Example No. 24
 def __init__(self, faqslist, type='tfidf'):
     self.faqslist = faqslist
     self.stemmer = LancasterStemmer()
     self.le = LE()
     self.classifier = None
     self.build_model(type)
Example No. 25
#importing the dataset
dataset=pd.read_csv('Data.csv')
X=dataset.iloc[:,:-1].values
Y=dataset.iloc[:,3].values

#taking care of missing data
from sklearn.preprocessing import Imputer
imputer=Imputer(missing_values='NaN',strategy='mean',axis=0)
imputer=imputer.fit(X[:,1:3])
X[:,1:3]=imputer.transform(X[:,1:3])

#Encoding Categorical data
from sklearn.preprocessing import LabelEncoder as LE
from sklearn.preprocessing import OneHotEncoder as OE
labelencoder_x=LE()
X[:,0]=labelencoder_x.fit_transform(X[:,0])
onehotencoder=OE(categorical_features=[0])
X=onehotencoder.fit_transform(X).toarray()
labelencoder_y=LE()
Y=labelencoder_y.fit_transform(Y)

#splitting the dataset into train data set and test data set
from sklearn.model_selection import train_test_split
X_Train,X_Test,Y_Train,Y_Test=train_test_split(X,Y,test_size=0.2,random_state=0)

#feature scaling
from sklearn.preprocessing import StandardScaler
sc_x=StandardScaler()
X_Train=sc_x.fit_transform(X_Train)
X_Test=sc_x.transform(X_Test)
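# Hedged sketch (not part of the original): the same preprocessing with the current scikit-learn
# API, since Imputer and OneHotEncoder(categorical_features=...) were removed in later releases.
# Column positions follow the example above: column 0 is categorical, columns 1-2 are numeric.
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

numeric = Pipeline([('impute', SimpleImputer(strategy='mean')),
                    ('scale', StandardScaler())])
preprocess = ColumnTransformer([('cat', OneHotEncoder(), [0]),
                                ('num', numeric, [1, 2])])
X_processed = preprocess.fit_transform(dataset.iloc[:, :-1])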
Example No. 26
    def __init__(self,
                 target_building,
                 target_srcids,
                 source_buildings,
                 config={},
                 load_from_file=1):
        super(BuildingAdapterInterface,
              self).__init__(target_building=target_building,
                             source_buildings=source_buildings,
                             target_srcids=target_srcids)

        #gather the source/target data and name features, labels
        #TODO: handle multiple source buildings
        self.stop_predict_flag = False

        if 'source_time_ranges' in config:
            self.source_time_ranges = config['source_time_ranges']
            assert len(self.source_time_ranges) == len(source_buildings)
        else:
            self.source_time_ranges = [(None, None)]\
                * len(source_buildings)
        if 'target_time_range' in config:
            self.target_time_range = config['target_time_range']
        else:
            self.target_time_range = (None, None)

        if 'threshold' in config:
            self.threshold = config['threshold']
        else:
            self.threshold = 0.5

        source_building = source_buildings[0]

        if not load_from_file:
            #data features
            source_ids, train_fd = get_data_features(
                source_building, self.source_time_ranges[0][0],
                self.source_time_ranges[0][1])
            target_ids, test_fd = get_data_features(target_building,
                                                    self.target_time_range[0],
                                                    self.target_time_range[1])

            #name features, labels
            source_res = get_namefeatures_labels(source_building)
            train_label = [source_res[srcid][1] for srcid in source_ids]

            self.target_res = get_namefeatures_labels(target_building)
            test_fn = np.asarray(
                [self.target_res[tgtid][0] for tgtid in target_ids])
            test_label = [self.target_res[tgtid][1] for tgtid in target_ids]

            #find the label intersection
            intersect = list(set(test_label) & set(train_label))
            print('intersected tagsets:', intersect)

            #preserve the intersection, get ids for indexing data feature matrices
            if intersect:
                train_filtered = [[i, j] for i, j in enumerate(train_label)
                                  if j in intersect]
                train_id, train_label = [list(x) for x in zip(*train_filtered)]
                test_filtered = [[
                    i, j, k
                ] for i, (j, k) in enumerate(zip(test_label, target_ids))
                                 if j in intersect]
                self.test_id, test_label, self.test_srcids = [
                    list(x) for x in zip(*test_filtered)
                ]
            else:
                raise ValueError('no common labels!')

            self.train_fd = train_fd[train_id, :]
            self.test_fd = test_fd[self.test_id, :]
            self.test_fn = test_fn[self.test_id, :]

            print('%d training examples left' % len(self.train_fd))
            print('%d testing examples left' % len(self.test_fd))

            self.le = LE()
            self.le.fit(intersect)
            self.train_label = self.le.transform(train_label)
            self.test_label = self.le.transform(test_label)

            res = [
                self.train_fd, self.test_fd, self.train_label, self.test_label,
                self.test_fn, self.test_srcids, self.target_res, self.le
            ]
            with open('./%s-%s.pkl' % (source_building, target_building),
                      'wb') as wf:
                pk.dump(res, wf)

        else:
            print('loading from prestored file')
            with open('./%s-%s.pkl' % (source_building, target_building),
                      'rb') as rf:
                res = pk.load(rf)
            (self.train_fd, self.test_fd, self.train_label, self.test_label,
             self.test_fn, self.test_srcids, self.target_res, self.le) = res

        print('# of classes:', len(set(self.train_label)))
        print('data features for %s with dim:' % source_building,
              self.train_fd.shape)
        print('data features for %s with dim:' % target_building,
              self.test_fd.shape)

        self.learner = transfer_learning(self.train_fd,
                                         self.test_fd,
                                         self.train_label,
                                         self.test_label,
                                         self.test_fn,
                                         threshold=self.threshold)

        self.run_auto()
Example No. 27
# perform one hot encoding on categorical data of X
target_cols = [
    'workclass', 'education', 'marital_status', 'occupation', 'relationship',
    'race', 'sex', 'native_country'
]
print('\n dataset = pandas.get_dummies(dataset)')

X = pd.get_dummies(X, columns=target_cols)
print('\n X.shape: after 1 hot')
print(X.shape)

# perform categorical encoding of Y, since its values form a binary set
print('\n Y.head(10)')
print(Y.head(10))

encoder = LE()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)

new_Y = pd.DataFrame(encoded_Y)

print('\n new_Y.head(10)')
print(new_Y.head(10))

# convert X and new_Y to numpy arrays
X = X.values
Y = new_Y.values

# minmax X
scaler = MMS(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)
Example No. 28
def patient_education_level():
    df = pd.concat([
        pd.read_csv(g.FILE_TRAIN_OUT, usecols=['education_level']),
        pd.read_csv(g.FILE_TEST_OUT, usecols=['education_level']),
    ])
    return LE().fit_transform(df.education_level).reshape((-1, 1))
Example No. 29
    l = []
    l.append(np.round(item * 100, decimals=2))
print('Correlation between this group and attrition is {}%'.format(l[0]))

# Feature Engineering and Preparation for Machine Learning
# --------------------------------------------------------
#
# Now that I've got a feel for my data I'll engage in a little feature engineering and preparation for classification.
#
# NOTE: Subsequent to preparing this kernel I learned that the pandas.get_dummies() method accomplishes much the same thing and has the advantage of being native to pandas, which I always use anyway. I won't make the change to this kernel, though.

# In[16]:

from sklearn.preprocessing import LabelEncoder as LE

data['Attrition'] = LE().fit_transform(data['Attrition'])
data['Department'] = LE().fit_transform(data['Department'])
data['EducationField'] = LE().fit_transform(data['EducationField'])
data['Gender'] = LE().fit_transform(data['Gender'])
data['JobRole'] = LE().fit_transform(data['JobRole'])
data['MaritalStatus'] = LE().fit_transform(data['MaritalStatus'])
data['Over18'] = LE().fit_transform(data['Over18'])
data['OverTime'] = LE().fit_transform(data['OverTime'])
data['BusinessTravel'] = LE().fit_transform(data['BusinessTravel'])
del data['left']
del data['OT']
del data['EmployeeNumber']
del data['EmployeeCount']
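# Sketch of the pandas.get_dummies() alternative mentioned in the note above (illustrative only,
# not what this kernel runs); pd is assumed imported as elsewhere in the kernel.
categorical_cols = ['Department', 'EducationField', 'Gender', 'JobRole',
                    'MaritalStatus', 'BusinessTravel']  # column names taken from the encoding step above
data_dummies = pd.get_dummies(data, columns=categorical_cols)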

# Now let's see if we can extract some goodness with a clustering algorithm. Something I have noticed with HR datasets is that employee behavior seems to exhibit a fair amount of clustering.
#
Example No. 30
def show_course_activity(course_cd=9):
    with open(TEMPFILE, 'rb') as f:
        log_df = pickle.load(f)

    # Load logs
    enr_df = load_enroll()
    ref_course_df = pd.read_csv(REF_COURSE_CODE)
    truth_df = load_truth().fillna(-1)

    enr_df = enr_df.merge(truth_df, how='left', on='enrollment_id')
    enr_df = enr_df.merge(ref_course_df, how='left', on='course_id')
    enr_df = enr_df[enr_df['course_cd'] == course_cd]
    log_df = log_df[log_df['course_cd'] == course_cd]

    obj_df = pd.read_csv(OBJECT,
                         usecols=['module_id', 'category'
                                  ]).rename(columns={'module_id': 'object'})
    log_df = log_df.merge(obj_df, how='left', on='object')

    # Encode object ids
    log_df = log_df.sort_values('time')  # DataFrame.sort was removed; sort_values keeps the original intent
    obj_time = log_df.groupby('object').head(1).reset_index()[[
        'object', 'time'
    ]]
    obj_encoder = LE()
    obj_time_ls = obj_encoder.fit(obj_time['object'])
    uniq_obj = len(log_df['object'].unique())
    uniq_obj_names = sorted(obj_df['category'].unique())

    true_enr_id = enr_df[enr_df['target'] == 1].head(50)
    false_enr_id = enr_df[enr_df['target'] == 0].head(50)

    f, ax_list = plt.subplots(50, 2, figsize=(10, 13), sharex=True)

    # For top50 dropout enrollment
    for i, (idx, row) in enumerate(true_enr_id.iterrows()):
        enr_id = row['enrollment_id']
        df = log_df[log_df['enrollment_id'] == enr_id]
        ax = ax_list[i, 0]

        sns.set_palette('husl')
        for category_name in uniq_obj_names:
            selected_df = df[df['category'] == category_name]
            ax.plot(selected_df['time'].map(parse_date),
                    obj_encoder.transform(selected_df['object']), '.')

        ax.set_ylim((0, uniq_obj))
        _change_tick_fontsize(ax, 8)

        dateFmt = mpl.dates.DateFormatter('%Y-%m-%d')
        ax.xaxis.set_major_formatter(dateFmt)
        daysLoc = mpl.dates.DayLocator()
        hoursLoc = mpl.dates.HourLocator(interval=6)
        ax.xaxis.set_major_locator(daysLoc)
        ax.xaxis.set_minor_locator(hoursLoc)
        for ticklabel in ax.xaxis.get_ticklabels():
            ticklabel.set_rotation(80)

    # For top50 continue enrollment
    for i, (idx, row) in enumerate(false_enr_id.iterrows()):
        enr_id = row['enrollment_id']
        df = log_df[log_df['enrollment_id'] == enr_id]
        ax = ax_list[i, 1]

        sns.set_palette('husl')
        for category_name in uniq_obj_names:
            selected_df = df[df['category'] == category_name]
            ax.plot(selected_df['time'].map(parse_date),
                    obj_encoder.transform(selected_df['object']), '.')

        ax.set_ylim((0, uniq_obj))
        _change_tick_fontsize(ax, 8)

        dateFmt = mpl.dates.DateFormatter('%Y-%m-%d')
        ax.xaxis.set_major_formatter(dateFmt)
        daysLoc = mpl.dates.DayLocator()
        hoursLoc = mpl.dates.HourLocator(interval=6)
        ax.xaxis.set_major_locator(daysLoc)
        ax.xaxis.set_minor_locator(hoursLoc)
        for ticklabel in ax.xaxis.get_ticklabels():
            ticklabel.set_rotation(80)

    plt.tight_layout()
    plt.subplots_adjust(top=0.962, hspace=0.09)
    plt.savefig(OUTPUT_PATH)