Ejemplo n.º 1
0
def x_2func(traindir, testdir):
    """Keep the 100 best-scoring tokens of each category and persist them.

    Reads the per-category token scores stored under 'token_x', ranks the
    tokens of each of the 10 category indices by descending score, and keeps
    the first 100. The per-category selections are written as 'x_category'
    and their union as 'tokens_all_x'.

    The scores are not computed here. A snippet that used to sit at the top
    of this function (disabled) built them with
    ``x_2(train, sta, category_tokens)`` and stored them as 'token_x'.

    Args:
        traindir: Not used by this function.
        testdir: Not used by this function.
    """
    category = nb.read('category_nb_eventmodel')
    tokens_x = nb.read('token_x')
    # The converted mapping is not used below; the call is kept as-is.
    category_convert = nb.convert(category)

    x_category = {}
    tokens_all_x = set()
    for label in range(10):
        scores = tokens_x[label]
        ranked = sorted(scores, key=scores.get, reverse=True)
        # A disabled alternative kept every token scoring above 10.83
        # instead of applying a fixed top-100 cut.
        x_category[label] = ranked[:100]
        print(len(x_category[label]))
        tokens_all_x |= set(x_category[label])

    print(len(tokens_all_x))
    nb.write(x_category, 'x_category')
    nb.write(tokens_all_x, 'tokens_all_x')
def logistic_l1():
    """Fit a LogisticRegression on the training vectors and evaluate on the test set.

    Training and test documents are vectorised by ``vec.func2`` /
    ``vec.func3`` over the token list stored under 'tokens'. Predictions are
    paired with the test file identifiers and handed to ``nb.sta_result``
    together with the output path './data/logistic_l1.csv'.
    """
    train_dir = './data/training'
    test_dir = './data/test'

    vocabulary = list(nb.read('tokens'))

    # The third value returned by func2 is not used here.
    features, labels, _ = vec.func2(train_dir, vocabulary)
    features = np.array(features)
    labels = np.array(labels)
    print(features.shape)

    # NOTE(review): the function name and output file say "l1" but the
    # penalty passed here is 'l2' -- confirm which one is intended.
    model = LogisticRegression(penalty='l2')
    model.fit(features, labels)

    stored_category = nb.read('category')
    # The first value returned by func3 is not used here.
    _, test_features, test_names = vec.func3(test_dir, vocabulary, stored_category)
    test_features = np.array(test_features)
    print(test_features.shape)

    predicted = np.array(model.predict(test_features))
    test_names = np.array(test_names)
    # One row per test file: (file identifier, predicted label).
    labelled = np.column_stack((test_names, predicted))

    category_convert = nb.convert(nb.read('category_nb_eventmodel'))
    result = nb.read('result')
    nb.sta_result(labelled, category_convert, result, './data/logistic_l1.csv')
def logistic_l1():
    """Train a LogisticRegression classifier and score it on the test documents.

    The token list stored under 'tokens' drives the vectorisation done by
    ``vec.func2`` (training) and ``vec.func3`` (test). The predicted labels
    are stacked next to the test file identifiers and passed to
    ``nb.sta_result`` with the path './data/logistic_l1.csv'.
    """
    token_list = list(nb.read('tokens'))

    # Only the feature rows and labels of func2's result are needed.
    x_rows, y_rows, _ = vec.func2('./data/training', token_list)
    x_train = np.array(x_rows)
    y_train = np.array(y_rows)
    print(x_train.shape)

    # NOTE(review): despite the function name, the penalty used is 'l2' --
    # confirm against the intended experiment.
    classifier = LogisticRegression(penalty='l2')
    classifier.fit(x_train, y_train)

    known_categories = nb.read('category')
    # func3's first return value is discarded.
    _, x_rows, file_names = vec.func3('./data/test', token_list, known_categories)
    x_test = np.array(x_rows)
    print(x_test.shape)

    guesses = np.array(classifier.predict(x_test))
    file_names = np.array(file_names)
    # Two columns: test file identifier, predicted label.
    paired = np.column_stack((file_names, guesses))

    event_categories = nb.read('category_nb_eventmodel')
    category_convert = nb.convert(event_categories)
    result = nb.read('result')
    out_path = './data/logistic_l1.csv'
    nb.sta_result(paired, category_convert, result, out_path)
def logistic_own():
    """Hand-rolled one-vs-rest logistic regression trained by batch gradient descent.

    Loads the stored 'train_x', 'train_y', 'test_x' and 'test_file' data,
    trains one binary classifier per class index 0..9 (100 full-batch
    gradient steps with step size 0.0001, weights initialised to ones), and
    labels each test row with the class whose classifier yields the highest
    raw score. The labelled predictions are passed to ``nb.sta_result`` with
    the path './data/logistic_l1.csv'.

    Earlier revisions assigned a row to the *last* class whose score was
    positive and left it at class 0 when no score was positive; taking the
    argmax over all class scores is the usual one-vs-rest decision rule and
    does not depend on class order.
    """
    train_x = nb.read('train_x')
    train_y = nb.read('train_y')
    test_x = nb.read('test_x')
    test_file = nb.read('test_file')

    num_classes = 10
    m, n = train_x.shape

    # Prepend a column of ones so that weight[0] plays the role of the bias.
    train_x = np.mat(np.column_stack((np.ones((m, 1)), train_x)))
    test_x = np.mat(np.column_stack((np.ones((len(test_x), 1)), test_x)))
    train_y = np.mat(train_y).transpose()

    alpha = 0.0001
    maxitem = 100
    # scores[r, c] is the raw score of test row r under the class-c classifier.
    scores = np.zeros((test_x.shape[0], num_classes))

    # Multi-class prediction is built from one binary classifier per class.
    for i in range(num_classes):
        # 1 where the training label equals the current class, 0 elsewhere.
        binary_y = np.mat((train_y == i).astype(int))
        weight = np.mat(np.ones((n + 1, 1)))
        for k in range(maxitem):
            h = sigmoid(train_x * weight)
            # The cost is evaluated through calj rather than the closed-form
            # expression; its value is not used by the update below.
            J = calj(binary_y, h, m)
            error = h - binary_y
            weight -= alpha * (train_x.transpose() * error)
        scores[:, i] = np.asarray(test_x * weight).ravel()

    # Highest-scoring class wins; ties resolve to the lowest class index.
    predict = scores.argmax(axis=1).reshape(-1, 1).astype(int)
    test_file = np.array(test_file)
    predict = np.column_stack((test_file, predict))

    category = nb.read('category_nb_eventmodel')
    category_convert = nb.convert(category)
    result = nb.read('result')
    path = './data/logistic_l1.csv'
    nb.sta_result(predict, category_convert, result, path)
def logistic_own():
    """Train one-vs-rest logistic regression with batch gradient descent and evaluate it.

    Reads 'train_x', 'train_y', 'test_x' and 'test_file' through ``nb``. For
    each class index 0..9 a binary classifier is fitted with 100 full-batch
    gradient steps (step size 0.0001, weights starting at one). Each test row
    is then labelled with the class that scores highest, and the labelled
    rows are handed to ``nb.sta_result`` with the path
    './data/logistic_l1.csv'.

    The decision rule is an argmax over the per-class scores. The previous
    rule (last class with a positive score wins, class 0 when none is
    positive) made the outcome depend on class ordering.
    """
    train_x = nb.read('train_x')
    train_y = nb.read('train_y')
    test_x = nb.read('test_x')
    test_file = nb.read('test_file')

    class_count = 10
    m, n = train_x.shape

    # A leading column of ones lets the first weight act as the intercept.
    bias_train = np.ones((m, 1))
    train_x = np.mat(np.column_stack((bias_train, train_x)))
    bias_test = np.ones((len(test_x), 1))
    test_x = np.mat(np.column_stack((bias_test, test_x)))
    train_y = np.mat(train_y).transpose()

    alpha = 0.0001
    maxitem = 100
    # One column of raw scores per class, one row per test sample.
    class_scores = np.zeros((test_x.shape[0], class_count))

    # Multi-class classification is realised as several binary classifiers.
    for i in range(class_count):
        # Indicator vector: 1 for samples of class i, 0 for all others.
        binary_y = np.mat((train_y == i).astype(int))
        weight = np.mat(np.ones((n + 1, 1)))
        for k in range(maxitem):
            h = sigmoid(train_x * weight)
            # calj computes the cost with the required case handling instead
            # of the plain formula; the value is not consumed afterwards.
            J = calj(binary_y, h, m)
            error = h - binary_y
            weight -= alpha * (train_x.transpose() * error)
        class_scores[:, i] = np.asarray(test_x * weight).ravel()

    # argmax returns the lowest index on ties.
    predict = class_scores.argmax(axis=1).reshape(-1, 1).astype(int)
    test_file = np.array(test_file)
    predict = np.column_stack((test_file, predict))

    category = nb.read('category_nb_eventmodel')
    category_convert = nb.convert(category)
    result = nb.read('result')
    path = './data/logistic_l1.csv'
    nb.sta_result(predict, category_convert, result, path)
def logistic_x():
    """Fit an L1-penalised LogisticRegression on preprocessed data and evaluate it.

    ``preprocess()`` supplies the training matrix and labels plus the test
    matrix and test file identifiers. Predictions are paired with the file
    identifiers and passed to ``nb.sta_result`` with the path
    './data/logistic_l1.csv'.
    """
    # The category and result values returned by preprocess() are not used;
    # both are re-read from storage further down.
    train_x, train_y, _, _, test_x, test_file = preprocess()

    # NOTE(review): some scikit-learn versions require an explicit solver
    # for penalty='l1' -- confirm against the installed version.
    model = LogisticRegression(penalty='l1')
    model.fit(train_x, train_y)

    predicted = np.array(model.predict(test_x))
    # Two columns: test file identifier, predicted label.
    labelled = np.column_stack((test_file, predicted))

    category_convert = nb.convert(nb.read('category_nb_eventmodel'))
    result = nb.read('result')
    nb.sta_result(labelled, category_convert, result, './data/logistic_l1.csv')
def logistic_x():
    """Train a LogisticRegression with an L1 penalty and score the test set.

    All inputs come from ``preprocess()``. The predicted labels are stacked
    beside the test file identifiers and sent to ``nb.sta_result`` together
    with the output path './data/logistic_l1.csv'.
    """
    prepared = preprocess()
    # preprocess() yields six values; the 3rd and 4th (category, result) are
    # ignored because fresh copies are loaded below.
    x_train, y_train, _, _, x_test, file_names = prepared

    classifier = LogisticRegression(penalty='l1')
    classifier.fit(x_train, y_train)

    guesses = classifier.predict(x_test)
    guesses = np.array(guesses)
    # One row per test file: (identifier, predicted label).
    paired = np.column_stack((file_names, guesses))

    event_categories = nb.read('category_nb_eventmodel')
    category_convert = nb.convert(event_categories)
    result = nb.read('result')
    out_path = './data/logistic_l1.csv'
    nb.sta_result(paired, category_convert, result, out_path)
def sto_logistic():
    """Hand-rolled one-vs-rest logistic regression trained by stochastic gradient descent.

    Loads the stored 'train_x', 'train_y', 'test_x' and 'test_file' data and
    trains one binary classifier per class index 0..9. Each classifier takes
    5000 single-sample gradient steps (step size 0.001, weights initialised
    to ones), drawing the sample uniformly at random every step. Each test
    row is labelled with the class whose classifier yields the highest raw
    score, and the result is passed to ``nb.sta_result`` with the path
    './data/logistic_l1.csv'.

    Earlier revisions assigned a row to the *last* class whose score was
    positive and left it at class 0 when no score was positive; the argmax
    used here is the usual one-vs-rest rule and is independent of class order.
    """
    train_x = nb.read('train_x')
    train_y = nb.read('train_y')
    test_x = nb.read('test_x')
    test_file = nb.read('test_file')

    num_classes = 10
    m, n = train_x.shape

    # Prepend a column of ones so that weight[0] plays the role of the bias.
    train_x = np.mat(np.column_stack((np.ones((m, 1)), train_x)))
    test_x = np.mat(np.column_stack((np.ones((len(test_x), 1)), test_x)))
    train_y = np.mat(train_y).transpose()

    alpha = 0.001
    maxitem = 5000
    # scores[r, c] is the raw score of test row r under the class-c classifier.
    scores = np.zeros((test_x.shape[0], num_classes))

    # Multi-class prediction is built from one binary classifier per class.
    for i in range(num_classes):
        # 1 where the training label equals the current class, 0 elsewhere.
        binary_y = np.mat((train_y == i).astype(int))
        weight = np.mat(np.ones((n + 1, 1)))
        for k in range(maxitem):
            # One randomly chosen training row per update.
            index = random.randrange(m)
            h = sigmoid(train_x[index] * weight)
            error = h - binary_y[index]
            weight -= alpha * (train_x[index].transpose() * error)
        scores[:, i] = np.asarray(test_x * weight).ravel()

    # Highest-scoring class wins; ties resolve to the lowest class index.
    predict = scores.argmax(axis=1).reshape(-1, 1).astype(int)
    test_file = np.array(test_file)
    predict = np.column_stack((test_file, predict))

    category = nb.read('category_nb_eventmodel')
    category_convert = nb.convert(category)
    result = nb.read('result')
    path = './data/logistic_l1.csv'
    nb.sta_result(predict, category_convert, result, path)
def sto_logistic():
    """Train one-vs-rest logistic regression with stochastic gradient descent and evaluate it.

    Reads 'train_x', 'train_y', 'test_x' and 'test_file' through ``nb``. For
    each class index 0..9 a binary classifier is fitted with 5000 updates,
    each computed from one uniformly drawn training row (step size 0.001,
    weights starting at one). Each test row is labelled with the class that
    scores highest, and the labelled rows are handed to ``nb.sta_result``
    with the path './data/logistic_l1.csv'.

    The decision rule is an argmax over the per-class scores. The previous
    rule (last class with a positive score wins, class 0 when none is
    positive) made the outcome depend on class ordering.
    """
    train_x = nb.read('train_x')
    train_y = nb.read('train_y')
    test_x = nb.read('test_x')
    test_file = nb.read('test_file')

    class_count = 10
    m, n = train_x.shape

    # A leading column of ones lets the first weight act as the intercept.
    bias_train = np.ones((m, 1))
    train_x = np.mat(np.column_stack((bias_train, train_x)))
    bias_test = np.ones((len(test_x), 1))
    test_x = np.mat(np.column_stack((bias_test, test_x)))
    train_y = np.mat(train_y).transpose()

    alpha = 0.001
    maxitem = 5000
    # One column of raw scores per class, one row per test sample.
    class_scores = np.zeros((test_x.shape[0], class_count))

    # Multi-class classification is realised as several binary classifiers.
    for i in range(class_count):
        # Indicator vector: 1 for samples of class i, 0 for all others.
        binary_y = np.mat((train_y == i).astype(int))
        weight = np.mat(np.ones((n + 1, 1)))
        for k in range(maxitem):
            # Each step uses a single training row picked at random.
            index = random.randrange(m)
            row = train_x[index]
            h = sigmoid(row * weight)
            error = h - binary_y[index]
            weight -= alpha * (row.transpose() * error)
        class_scores[:, i] = np.asarray(test_x * weight).ravel()

    # argmax returns the lowest index on ties.
    predict = class_scores.argmax(axis=1).reshape(-1, 1).astype(int)
    test_file = np.array(test_file)
    predict = np.column_stack((test_file, predict))

    category = nb.read('category_nb_eventmodel')
    category_convert = nb.convert(category)
    result = nb.read('result')
    path = './data/logistic_l1.csv'
    nb.sta_result(predict, category_convert, result, path)
Ejemplo n.º 10
0
def gifunc():
    """Score all tokens with ``gi`` and persist the 100 best for every category.

    The scores are computed from the stored 'train_nb_eventmodel' data,
    written as 'tokens_gi' and read back. The 100 highest-scoring tokens are
    then stored once per category index 0..9 under 'gi_category', and as a
    set under 'tokens_all_gi'.

    NOTE(review): 'tokens_gi' is used as a single token -> score mapping, so
    every category receives the same token list -- confirm this is intended.
    """
    train = nb.read('train_nb_eventmodel')
    sta = nb.sta_count(train)
    all_tokens = get_all_tokens()
    nb.write(gi(train, sta, all_tokens), 'tokens_gi')

    # Read the scores back from storage, as the original flow does.
    tokens_gi = nb.read('tokens_gi')

    # The ranking does not depend on the category, so sort once instead of
    # once per category.
    top_tokens = sorted(tokens_gi, key=tokens_gi.get, reverse=True)[:100]

    # Give each category its own list object so later mutation of one entry
    # cannot leak into the others.
    gi_category = {i: list(top_tokens) for i in range(10)}
    tokens_all_gi = set(top_tokens)

    print(tokens_all_gi)
    nb.write(gi_category, 'gi_category')
    nb.write(tokens_all_gi, 'tokens_all_gi')
Ejemplo n.º 11
0
def dffunc():
    """Persist the 200 tokens with the highest stored document-frequency score.

    Reads the token scores stored under 'tokens_df', ranks the tokens by
    descending score and keeps the first 200. That list is stored once per
    category index 0..9 under 'df_category', and as a set under
    'tokens_all_df'.

    The scores are not computed here. A disabled snippet built them with
    ``df(train, sta, all_tokens)`` and stored them as 'tokens_df'. The
    original author's note says document frequency is used first, which
    requires counting in how many documents of a category a term appears.

    NOTE(review): 'tokens_df' is used as a single token -> score mapping, so
    every category receives the same token list -- confirm this is intended.
    """
    tokens_df = nb.read('tokens_df')

    # The ranking does not depend on the category, so sort once instead of
    # once per category.
    top_tokens = sorted(tokens_df, key=tokens_df.get, reverse=True)[:200]

    # Give each category its own list object so later mutation of one entry
    # cannot leak into the others.
    df_category = {i: list(top_tokens) for i in range(10)}
    tokens_all_df = set(top_tokens)

    print(tokens_all_df)

    nb.write(df_category, 'df_category')
    nb.write(tokens_all_df, 'tokens_all_df')
Ejemplo n.º 12
0
def mi_func():
    '''
    train = nb.read('train_nb_eventmodel') 
    sta = nb.sta_count(train)
    category_tokens = get_category_tokens()
    token_mi = mi(train,sta,category_tokens)
    nb.write(token_mi,'token_mi')
    '''
    category = nb.read('category_nb_eventmodel')
    token_mi = nb.read('token_mi')
    category_convert = nb.convert(category)
    tokens_all_mi = []
    mi_category = {}
    for i in range(10):
        tokens = sorted(token_mi[i],key = token_mi[i].get,reverse = True)
        mi_category[i] = tokens[:500]
        tokens_all_mi = set(tokens_all_mi)|set(mi_category[i])
    print len(tokens_all_mi)
    nb.write(mi_category,'mi_category')
    nb.write(tokens_all_mi,'tokens_all_mi')
    
    '''