def model():
    """Train a logistic-regression domain classifier and evaluate it."""
    import numpy as np
    import pandas as pd
    import joblib
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score

    data = dataset.load_data()
    print("all samples =", data.shape)
    print("dataY contains:", np.unique(data[:, 1]))

    # Drop duplicate domains before splitting.
    data = pd.DataFrame(data, columns=['domain', 'label'])
    data = data.drop_duplicates(subset='domain')
    data = np.array(data)

    trainX = data[:30000, 0]
    trainY = data[:30000, 1].astype(int)
    testX = data[30000:30500, 0]
    testY = data[30000:30500, 1].astype(int)
    print("trainY contains:", np.unique(trainY))
    print("testY contains:", np.unique(testY))

    # Stage 1: train and evaluate on the held-out slice.
    feature_table = get_feature(trainX)
    LR = LogisticRegression()
    LR = LR.fit(feature_table, trainY)
    pred = LR.predict(get_feature(testX))
    acc = accuracy_score(testY, pred)
    print("acc stage 1:", acc)
    joblib.dump(LR, './models/LR.pkl')

    # Stage 2: evaluate on algorithmically generated domains (all labeled 0).
    algorithm_domains = dataset.load_simple_data()
    algorithm_domains = list(set(algorithm_domains))
    algorithm_y = [0] * len(algorithm_domains)
    pred_feature = get_feature(algorithm_domains)
    pred = LR.predict(pred_feature)
    acc = accuracy_score(algorithm_y, pred)
    print("acc stage 2:", acc)

# if __name__ == '__main__':
#     model()
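# A minimal sketch of what `get_feature` might compute for a list of domains.
# This is an illustrative assumption, not the project's actual feature code:
# simple lexical statistics (length, digit ratio, character entropy) per domain.
import math
from collections import Counter

def get_feature_sketch(domains):
    rows = []
    for d in domains:
        counts = Counter(d)
        entropy = -sum((c / len(d)) * math.log2(c / len(d)) for c in counts.values())
        digit_ratio = sum(ch.isdigit() for ch in d) / len(d)
        rows.append([len(d), digit_ratio, entropy])
    return rows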
def getNextData(recalc=False, return_image_path=False, use_images_without_output=False):
    for image_path in images_path:
        # Legacy cropping pipeline, kept for reference (Python 2 era):
        """
        json_path = image_path.rsplit('.')[0] + '.json'
        if not os.path.isfile(json_path):
            continue
        with open(json_path) as data_file:
            data = json.load(data_file)
        for i in xrange(len(data)/6):
            image = cv2.imread(image_path)
            x = [float(p[0])/image.shape[1] for p in data[i*6:i*6+6]]
            y = [p[1] for p in data[i*6:i*6+6]]
            y = sum(y)/len(y)
            crop_image = crop(image, y)
            #cv2.imwrite("la.jpg", crop_image)
            image = np.array(crop_image).astype(float).flatten()/255.0
            #print "image shape:", image.shape, crop.shape
            yield (image, x)
        """
        input_data = feature.get_input(image_path)
        output_data = feature.get_feature(image_path, recalc=recalc)
        # Skip images without a precomputed output unless explicitly allowed.
        if output_data is None and not use_images_without_output:
            continue
        if return_image_path:
            yield (input_data, output_data, image_path)
        else:
            yield (input_data, output_data)
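# Usage sketch for the generator above; `images_path` and `feature` are
# module-level names assumed to be defined elsewhere in the project.
for input_data, output_data in getNextData():
    print(len(input_data), output_data)
    break  # inspect only the first sample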
import keras
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense

def train(mode):
    # mode 1: have/none classification; otherwise: pos/neg classification.
    if mode == 1:
        x, y = get_feature(mode, Have_none_path)
    else:
        x, y = get_feature(mode, Pos_neg_path)
    input_shape = x.shape[1:]  # independent of the number of samples

    # Hold out the last 10% of samples for validation.
    split_at = len(x) - len(x) // 10
    (x_train, x_val) = x[:split_at], x[split_at:]
    (y_train, y_val) = y[:split_at], y[split_at:]

    # Convert class vectors to binary class matrices.
    y_train = keras.utils.to_categorical(y_train, 2)
    y_val = keras.utils.to_categorical(y_val, 2)
    print(x_train, y_train)

    # Build the CNN model.
    batch_size = 128
    epochs = 3
    model = Sequential()
    model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape))
    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))
    # sgd = keras.optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss=keras.losses.binary_crossentropy, optimizer='Adam', metrics=['accuracy'])
    model.summary()

    model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1,
              validation_data=(x_val, y_val))
    score = model.evaluate(x_val, y_val, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])

    # Save the model locally.
    if mode == 1:
        model.save(Have_none_model)
    else:
        model.save(Pos_neg_model)
import os
import numpy as np
import joblib

def get_emotion_svm(filename):
    filePath = os.path.abspath(__file__)   # absolute path of this file
    dirPath = os.path.dirname(filePath)    # absolute path of its directory
    # Extract the feature vector.
    feature = np.array(get_feature(filename=filename))
    # Load the SVM classifier.
    svm = joblib.load(dirPath + "/svm2.m")
    # Load the dimensionality-reduction (PCA) model.
    pca = joblib.load(dirPath + "/pca.m")
    # Reduce dimensionality, then predict.
    feature_new = pca.transform(feature.reshape([1, -1]))
    result = svm.predict(feature_new)[0]
    return int(result)
from pybrain.tools.customxml.networkreader import NetworkReader

def get_emotion_net(filename):
    filePath = os.path.abspath(__file__)   # absolute path of this file
    dirPath = os.path.dirname(filePath)    # absolute path of its directory
    # Extract the feature vector.
    feature = np.array(get_feature(filename=filename))
    # Load the neural network.
    net = NetworkReader.readFrom(dirPath + '/net2.xml')
    # Load the dimensionality-reduction (PCA) model.
    pca = joblib.load(dirPath + "/pca.m")
    # Reduce dimensionality.
    feature_new = pca.transform(feature.reshape([1, -1]))[0]
    # The predicted class is the index of the highest activation.
    o = net.activate(feature_new).tolist()
    return o.index(max(o))
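# Usage sketch: both predictors take a path to an input file and return an
# integer emotion label; "sample.wav" is a hypothetical filename.
print("svm:", get_emotion_svm("sample.wav"), "net:", get_emotion_net("sample.wav"))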
df_es_train = pd.read_csv(spanish_train_path, sep='\t',
                          names=['es0', 'en0', 'es1', 'en1', 'label'])
# df_es2en = pd.read_csv(unlabel_spanish_train_path,
#                        sep='\t', names=['es', 'en'])
df_test = pd.read_csv(test_path, sep='\t', names=['es0', 'es1'])
# df_test = pd.read_pickle('../output/df_en_test.pkl')

if en:
    df_train = df_en_train
else:
    df_train = df_es_train

preprocess(df_train, df_test, en=en)
get_feature(df_train, en=en)
df_train.to_pickle('../output/df_en_train.pkl' if en else '../output/df_es_train.pkl')
# df_es_train = pd.read_pickle('../output/df_es_train.pkl')
print(len(df_train))

predictors = (
    ['dot']
    + ['minkowski_' + str(i) for i in range(1, 3)]
    + ['wmd']
    + ['ratio', 'partial_ratio', 'token_sort_ratio', 'token_set_ratio']
    + ['jaccard']
    + ['edit_distance']
    + ['stop_' + token for token in (es_stop_list if not en else en_stop_list)]
    + ['5w1h_' + token for token in (es_5w1h_list if not en else en_5w1h_list)]
)
target = 'label'
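# A minimal sketch of feeding `predictors`/`target` to an estimator; the
# LogisticRegression choice is an assumption, and it presumes get_feature was
# also run on df_test so the predictor columns exist there.
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(df_train[predictors], df_train[target])
test_prob = clf.predict_proba(df_test[predictors])[:, 1]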
    train_label_end_date, train_feature_begin_dates, train_feature_end_date = get_dates(train_label_begin_date)
else:
    test_label_begin_date = datetime(2017, 5, 1)
    test_label_end_date, test_feature_begin_dates, test_feature_end_date = get_dates(test_label_begin_date)
    train_label_begin_date = datetime(2017, 4, 1)
    train_label_end_date, train_feature_begin_dates, train_feature_end_date = get_dates(train_label_begin_date)

########## FEATURE EXTRACTION ##########
# Get training features and labels.
train_feature = get_feature(data, train_feature_begin_dates, train_feature_end_date, featured_month_periods)
train_label = get_label(data, train_label_begin_date, train_label_end_date)
# Get test features.
test_feature = get_feature(data, test_feature_begin_dates, test_feature_end_date, featured_month_periods)

########## MODEL TRAINING ##########
x_train = train_feature.drop('user_id', axis=1)
y_train = train_label.drop('user_id', axis=1)
x_test = test_feature.drop('user_id', axis=1)

model_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
def detector(domain):
    # Featurize the domains, predict with the loaded model, and sanity-check
    # that one prediction came back per input.
    feature_table = get_feature(domain)
    pred_y = model.predict(feature_table)
    _check(pred_y, len(domain))
    return pred_y
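# Usage sketch, assuming `model` was loaded from the LogisticRegression saved
# earlier (./models/LR.pkl); the domains below are hypothetical inputs.
# model = joblib.load('./models/LR.pkl')
print(detector(["example.com", "q7xkz01a.info"]))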
def get_feature(id):
    # Fetch the feature object by id and serialize it for the API response.
    feature_obj = feature.get_feature(id)
    serializer = FeatureGetSerializer(feature_obj)
    return serializer.data
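# A minimal sketch of serving the helper above through a Django REST Framework
# view; the class name and URL wiring are assumptions, not from the source.
from rest_framework.views import APIView
from rest_framework.response import Response

class FeatureDetail(APIView):
    def get(self, request, id):
        return Response(get_feature(id))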
# mode = 1
mode = 2  # pos/neg sentiment classification
train(mode)

# Test the model on data never seen during training or feature extraction.
from keras.models import load_model

mode = 2
x, y = get_feature(mode, Test_pos_neg_path)
model = load_model(Pos_neg_model)
probabilities = model.predict(x)
print(probabilities)
# Take the class with the higher predicted probability.
y_predict = np.array([1 if p[0] < p[1] else 0 for p in probabilities])
accuracy = np.mean(y_predict == np.array(y).reshape(-1))
print("Prediction Accuracy: %.2f%%" % (accuracy * 100))
name3 = ["User_id",'Merchant_id','Coupon_id','Discount_rate','Distance','Date_received'] dfoff = pd.read_csv('data/ccf_offline_stage1_train.csv',names=name1) dftest = pd.read_csv('data/ccf_offline_stage1_test_revised.csv',names=name3) dfoff = pre(dfoff) dfoff['date'] = pd.to_datetime(dfoff['Date'], format='%Y%m%d') dfoff['label'] = list(map(lambda x ,y : 1 if (x - y).total_seconds()/60*60*24 <= 15 else 0 ,dfoff['date'],dfoff['date_received'])) dftest = pre(dftest) train_field = dfoff[dfoff['date_received'].isin(pd.date_range('2016/3/2',periods= 60))] validate_field = dfoff[dfoff['date_received'].isin(pd.date_range('2016/1/16',periods= 60))] test_field = dfoff[dfoff['date_received'].isin(pd.date_range('2016/4/17',periods= 60))] #构造训练集、验证集、与测试集 train = get_feature(train_field)[["User_id", "Coupon_id", "Date_received","is_manjian","discount_rate", "min_pay_of_manjian","null_Distance","label","simple_User_id_received_cnt", "simple_User_id_Coupon_id_received_cnt","simple_User_id_Date_received_received_cnt", "simple_User_id_Coupon_id_Date_received_received_cnt", "simple_User_id_Coupon_id_Date_received_repeat_received"]] validate = get_feature(validate_field)[["User_id", "Coupon_id", "Date_received","is_manjian","discount_rate", "min_pay_of_manjian","null_Distance","label","simple_User_id_received_cnt", "simple_User_id_Coupon_id_received_cnt","simple_User_id_Date_received_received_cnt", "simple_User_id_Coupon_id_Date_received_received_cnt", "simple_User_id_Coupon_id_Date_received_repeat_received"]] test = get_feature(test_field)[["User_id", "Coupon_id", "Date_received","is_manjian","discount_rate", "min_pay_of_manjian","null_Distance","simple_User_id_received_cnt", "simple_User_id_Coupon_id_received_cnt","simple_User_id_Date_received_received_cnt", "simple_User_id_Coupon_id_Date_received_received_cnt", "simple_User_id_Coupon_id_Date_received_repeat_received"]] dtrain = xgb.DMatrix(train.drop(["User_id", "Coupon_id", "Date_received",'label'], axis= 1) ,label= train['label']) dtest = xgb.DMatrix(test.drop(["User_id", "Coupon_id", "Date_received"],axis= 1))