def model():
	"""Train a logistic-regression domain classifier and report accuracy.

	Loads the labelled domain dataset, de-duplicates by domain name,
	fits a LogisticRegression on the first 30 000 rows, evaluates on a
	500-row held-out slice, persists the model to ./models/LR.pkl, and
	finally scores it on a separate set of algorithmically generated
	domains whose expected label is 0.
	"""
	from sklearn.linear_model import LogisticRegression
	from sklearn.metrics import accuracy_score

	samples = dataset.load_data()
	print("all samples= ", samples.shape)
	print("dataY contains:", np.unique(samples[:, 1]))

	# Round-trip through a DataFrame to drop duplicate domains.
	frame = pd.DataFrame(samples, columns=['domain', 'label'])
	frame = frame.drop_duplicates(subset='domain')
	samples = np.array(frame)

	# Fixed split: first 30k rows train, next 500 rows test.
	trainX, trainY = samples[:30000, 0], samples[:30000, 1].astype(int)
	testX, testY = samples[30000:30500, 0], samples[30000:30500, 1].astype(int)

	#print(trainX)
	print("trainY contains: ", np.unique(trainY))
	#print(testX)
	print("testY contains: ", np.unique(testY))

	clf = LogisticRegression().fit(get_feature(trainX), trainY)

	stage1_pred = clf.predict(get_feature(testX))
	print("acc stage 1: ", accuracy_score(testY, stage1_pred))

	joblib.dump(clf, './models/LR.pkl')

	# Stage 2: algorithmically generated domains, all expected label 0.
	algorithm_domains = list(set(dataset.load_simple_data()))
	algorithm_y = [0] * len(algorithm_domains)

	stage2_pred = clf.predict(get_feature(algorithm_domains))
	print("acc stage 2: ", accuracy_score(algorithm_y, stage2_pred))



#if __name__ == '__main__':
#	model()
Exemple #2
0
def getNextData(recalc=False,
                return_image_path=False,
                use_images_without_output=False):
    """Yield (input_data, output_data[, image_path]) for each usable image.

    Iterates the module-level ``images_path`` collection.  The input
    vector comes from ``feature.get_input`` and the target from
    ``feature.get_feature``.

    Parameters:
        recalc: forwarded to ``feature.get_feature`` to force
            recomputation of cached features.
        return_image_path: when True, each yielded tuple carries the
            image's path as a third element.
        use_images_without_output: when True, images whose feature
            extraction returned None are still yielded (with
            output_data=None) instead of being skipped.

    Fix: removed a dead Python-2 code block (an old json/cv2 pipeline)
    that was left inside the loop as a triple-quoted string — a no-op
    string expression evaluated on every iteration.
    """
    for image_path in images_path:
        input_data = feature.get_input(image_path)
        output_data = feature.get_feature(image_path, recalc=recalc)
        # Skip images with no target unless explicitly requested.
        if output_data is None and not use_images_without_output:
            continue

        if return_image_path:
            yield (input_data, output_data, image_path)
        else:
            yield (input_data, output_data)
Exemple #3
0
def train(mode):
    """Train and save a small CNN binary classifier with Keras.

    Parameters:
        mode: 1 trains on ``Have_none_path`` and saves to
            ``Have_none_model``; any other value trains on
            ``Pos_neg_path`` (positive/negative sentiment) and saves to
            ``Pos_neg_model``.  Features and labels come from the
            module-level ``get_feature``.

    Bug fixed: the debug print used Python-2 statement syntax
    (``print x_train,y_train``), a SyntaxError under Python 3.
    """
    if mode == 1:
        x, y = get_feature(mode, Have_none_path)
    else:
        x, y = get_feature(mode, Pos_neg_path)

    # Per-sample input shape (samples axis excluded).
    input_shape = x.shape[1:]
    # Hold out the last ~10% of samples for validation.
    split_at = len(x) - len(x) // 10
    (x_train, x_val) = x[:split_at], x[split_at:]
    (y_train, y_val) = y[:split_at], y[split_at:]

    # Convert class vectors to one-hot binary class matrices.
    y_train = keras.utils.to_categorical(y_train, 2)
    y_val = keras.utils.to_categorical(y_val, 2)
    print(x_train, y_train)

    # Build the CNN model.
    batch_size = 128
    epochs = 3

    model = Sequential()
    model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape))
    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))
    #sgd = keras.optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)

    model.compile(loss=keras.losses.binary_crossentropy, optimizer='Adam', metrics=['accuracy'])
    model.summary()

    model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_data=(x_val, y_val))
    score = model.evaluate(x_val, y_val, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])

    # Save the fitted model locally; the path depends on the mode.
    if mode == 1:
        model.save(Have_none_model)
    else:
        model.save(Pos_neg_model)
Exemple #4
0
def get_emotion_svm(filename):
    """Predict an emotion class id for *filename* with the saved SVM.

    Loads the pickled SVM and PCA models stored next to this module,
    projects the extracted feature vector through the PCA transform,
    and returns the SVM's predicted class as an ``int``.
    """
    # Absolute directory containing this module — the model files
    # (svm2.m, pca.m) live alongside it.
    module_dir = os.path.dirname(os.path.abspath(__file__))

    # Raw feature vector for the input file.
    raw = np.array(get_feature(filename=filename))

    svm = joblib.load(module_dir + "/svm2.m")
    pca = joblib.load(module_dir + "/pca.m")

    # PCA expects a 2-D array: one sample row, many feature columns.
    reduced = pca.transform(raw.reshape([1, -1]))

    return int(svm.predict(reduced)[0])
Exemple #5
0
def get_emotion_net(filename):
    """Predict an emotion class id for *filename* with the saved network.

    Loads the PyBrain-style network and the pickled PCA model stored
    next to this module, projects the extracted feature vector, runs it
    through the network, and returns the index of the strongest output
    activation.
    """
    # Absolute directory containing this module — the model files
    # (net2.xml, pca.m) live alongside it.
    module_dir = os.path.dirname(os.path.abspath(__file__))

    # Raw feature vector for the input file.
    raw = np.array(get_feature(filename=filename))

    net = NetworkReader.readFrom(module_dir + '/net2.xml')
    pca = joblib.load(module_dir + "/pca.m")

    # Project to the reduced space; [0] takes the single sample row.
    reduced = pca.transform(raw.reshape([1, -1]))[0]

    activations = net.activate(reduced).tolist()
    # The predicted class is the index of the strongest activation.
    return activations.index(max(activations))
Exemple #6
0
    # NOTE(review): fragment — the enclosing function's header and the
    # definitions of ``en``, ``df_en_train`` and the *_path / *_list
    # variables lie above this chunk and are not visible here.
    df_es_train = pd.read_csv(spanish_train_path,
                              sep='\t',
                              names=['es0', 'en0', 'es1', 'en1', 'label'])

    # df_es2en = pd.read_csv(unlabel_spanish_train_path,
    #                        sep='\t', names=['es', 'en'])
    df_test = pd.read_csv(test_path, sep='\t', names=['es0', 'es1'])
    # df_test = pd.read_pickle('../output/df_en_test.pkl')

    # Select the English or Spanish training frame.
    if en:
        df_train = df_en_train
    else:
        df_train = df_es_train
    preprocess(df_train, df_test, en=en)

    # get_feature's return value is discarded — presumably it mutates
    # df_train in place (adds feature columns); verify against its def.
    get_feature(df_train, en=en)
    df_train.to_pickle(
        '../output/df_en_train.pkl' if en else '../output/df_es_train.pkl')

    # df_es_train = pd.read_pickle('../output/df_es_train.pkl')

    print(len(df_train))

    # Predictor column names: distance metrics, fuzzy string ratios,
    # and per-token stopword / 5W1H indicator columns for the chosen
    # language.
    predictors = ['dot'] + ['minkowski_' + str(i) for i in range(1, 3)] + ['wmd'] + \
                 ['ratio', 'partial_ratio', 'token_sort_ratio', 'token_set_ratio'] + ['jaccard'] +\
                 ['edit_distance'] + \
                 ['stop_' + token for token in (es_stop_list if not en else en_stop_list)] + \
                 ['5w1h_' +
                     token for token in (es_5w1h_list if not en else en_5w1h_list)]
    target = 'label'
Exemple #7
0
	# NOTE(review): fragment — the ``if`` branch this line belongs to,
	# and the definitions of ``get_dates``, ``data``, ``get_label`` and
	# ``featured_month_periods``, lie above this chunk.
	train_label_end_date, train_features_begin_dates, train_feature_end_date = get_dates(train_label_begin_date)

else:

	# Default split: test labels start May 2017, training labels start
	# April 2017; get_dates derives the matching feature/label windows.
	test_label_begin_date = datetime(2017, 5, 1)
	test_label_end_date, test_feature_begin_dates, test_feature_end_date = get_dates(test_label_begin_date)

	train_label_begin_date = datetime(2017, 4, 1)
	train_label_end_date, train_feature_begin_dates, train_feature_end_date = get_dates(train_label_begin_date)


########## FEATURE EXTRACTION ##########


# get training feature and label
train_feature = get_feature(data, train_feature_begin_dates, train_feature_end_date, featured_month_periods)
train_label = get_label(data, train_label_begin_date, train_label_end_date)

# get test feature
test_feature = get_feature(data, test_feature_begin_dates, test_feature_end_date, featured_month_periods)


########## MODEL TRAINING ##########

# Drop the id column so only feature columns reach the model.
x_train = train_feature.drop('user_id', axis=1)
y_train = train_label.drop('user_id', axis=1)
x_test = test_feature.drop('user_id', axis=1)

# Booster parameters — presumably LightGBM, given the key names; the
# dict literal continues past the end of this chunk.
model_params = {
  'task': 'train',
  'boosting_type': 'gbdt',
Exemple #8
0
def detector(domain):
    """Run the trained module-level ``model`` over *domain*.

    Builds the feature table for the given domains, predicts with
    ``model``, validates the prediction vector via ``_check`` against
    the number of input domains, and returns the predictions.
    """
    features = get_feature(domain)
    predictions = model.predict(features)
    _check(predictions, len(domain))
    return predictions
Exemple #9
0
def get_feature(id):
    """Return the serialized data for the feature with the given *id*.

    Looks the object up via ``feature.get_feature`` and serializes it
    with ``FeatureGetSerializer``.
    """
    # NOTE: the parameter name shadows the builtin ``id``; it is part
    # of the public signature and kept for keyword-caller compatibility.
    obj = feature.get_feature(id)
    return FeatureGetSerializer(obj).data
Exemple #10
0
    # NOTE(review): fragment — this is the tail of a ``train``-style
    # function whose header (and the x_train/y_train/x_val/y_val setup)
    # lies above this chunk.
    model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,verbose=1, validation_data=(x_val, y_val))
    score = model.evaluate(x_val, y_val, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])

    # Save the model locally; path depends on the classification mode.
    if(mode == 1):
        model.save(Have_none_model)
    else:
        model.save(Pos_neg_model)

# mode = 1
mode = 2  # 2 selects positive/negative sentiment classification
train(mode)

# Evaluate the saved model on data that was never seen during training
# or feature extraction.
# Bug fixed: ``print probabilities`` used Python-2 statement syntax —
# a SyntaxError under Python 3.
from keras.models import load_model
mode = 2
x, y = get_feature(mode, Test_pos_neg_path)
model = load_model(Pos_neg_model)
probabilities = model.predict(x)
y_predict = []
print(probabilities)
# Pick the class with the larger softmax probability for each sample.
for i in probabilities:
    if i[0] < i[1]:
        y_predict.append([1])
    else:
        y_predict.append([0])
# NOTE(review): y_predict has shape (n, 1); if ``y`` is a 1-D array of
# length n, ``y_predict == y`` broadcasts to (n, n) and the mean is
# wrong — confirm the label shape returned by get_feature.
accuracy = np.mean(y_predict == y)
print("Prediction Accuracy: %.2f%%" % (accuracy*100))
Exemple #11
0
name3 = ["User_id",'Merchant_id','Coupon_id','Discount_rate','Distance','Date_received']
dfoff = pd.read_csv('data/ccf_offline_stage1_train.csv',names=name1)
dftest = pd.read_csv('data/ccf_offline_stage1_test_revised.csv',names=name3)

dfoff = pre(dfoff)
dfoff['date'] = pd.to_datetime(dfoff['Date'], format='%Y%m%d')
# Label = 1 when the coupon was used within 15 days of being received.
# BUG FIX: the original wrote ``total_seconds()/60*60*24`` which, by
# operator precedence, computes seconds*24 — so the condition was
# effectively "within 0.625 seconds".  Elapsed seconds must be divided
# by the number of seconds in a day, (60*60*24), to get days.
dfoff['label'] = list(map(lambda x, y: 1 if (x - y).total_seconds() / (60 * 60 * 24) <= 15 else 0,
                          dfoff['date'], dfoff['date_received']))
dftest = pre(dftest)

# Build train / validate / test fields as 60-day windows of received
# dates.
train_field = dfoff[dfoff['date_received'].isin(pd.date_range('2016/3/2',periods= 60))]
validate_field = dfoff[dfoff['date_received'].isin(pd.date_range('2016/1/16',periods= 60))]
test_field = dfoff[dfoff['date_received'].isin(pd.date_range('2016/4/17',periods= 60))]
# Construct the training, validation and test sets.
train = get_feature(train_field)[["User_id", "Coupon_id", "Date_received","is_manjian","discount_rate",
                                  "min_pay_of_manjian","null_Distance","label","simple_User_id_received_cnt",
                                  "simple_User_id_Coupon_id_received_cnt","simple_User_id_Date_received_received_cnt",
                                  "simple_User_id_Coupon_id_Date_received_received_cnt",
                                  "simple_User_id_Coupon_id_Date_received_repeat_received"]]
validate = get_feature(validate_field)[["User_id", "Coupon_id", "Date_received","is_manjian","discount_rate",
                                  "min_pay_of_manjian","null_Distance","label","simple_User_id_received_cnt",
                                  "simple_User_id_Coupon_id_received_cnt","simple_User_id_Date_received_received_cnt",
                                  "simple_User_id_Coupon_id_Date_received_received_cnt",
                                  "simple_User_id_Coupon_id_Date_received_repeat_received"]]
test = get_feature(test_field)[["User_id", "Coupon_id", "Date_received","is_manjian","discount_rate",
                                  "min_pay_of_manjian","null_Distance","simple_User_id_received_cnt",
                                  "simple_User_id_Coupon_id_received_cnt","simple_User_id_Date_received_received_cnt",
                                  "simple_User_id_Coupon_id_Date_received_received_cnt",
                                  "simple_User_id_Coupon_id_Date_received_repeat_received"]]

# Id/date columns are dropped so only feature columns reach XGBoost.
dtrain = xgb.DMatrix(train.drop(["User_id", "Coupon_id", "Date_received",'label'], axis= 1) ,label= train['label'])
dtest = xgb.DMatrix(test.drop(["User_id", "Coupon_id", "Date_received"],axis= 1))