Example no. 1
def main():
    parser = OptionParser()
    parser.add_option("-f",
                      "--filename",
                      type="string",
                      dest="file",
                      default="DataBaseDB",
                      help="NEED CORRECT PARAMETERS")
    (options, args) = parser.parse_args()
    if options.file == "error":
        print "hello world"
        sys.exit(1)
    task_file = options.file

    t_path = config("../conf/dp.conf")
    data_path = t_path["data_path"]

    L_list = []

    task_list, c_dict = get_task(data_path)
    item_list, l_dict = c_to_l(c_dict, min_sup, 1)

    i_len = 1
    while l_dict != {}:
        L_list.append(l_dict)
        i_len += 1
        com_list = list(itertools.combinations(item_list, i_len))
        l_dict = get_ldict(task_list, com_list, min_sup, i_len)

    get_rules(L_list)
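
The while loop above looks like the candidate-generation step of an Apriori-style frequent-itemset search: the itemset size grows by one on each pass until no itemset of that size reaches the minimum support. Below is a minimal, self-contained sketch of the same idea, with a plain in-memory support count standing in for the project's c_to_l / get_ldict helpers; the toy transactions and min_sup are made up for illustration.

import itertools

transactions = [{"a", "b", "c"}, {"a", "b"}, {"a", "c"}, {"b", "c"}]
min_sup = 2
item_list = sorted(set.union(*transactions))

L_list = []      # one dict of frequent itemsets per itemset size
i_len = 1
l_dict = {c: sum(1 for t in transactions if set(c) <= t)
          for c in itertools.combinations(item_list, i_len)}
l_dict = {k: v for k, v in l_dict.items() if v >= min_sup}
while l_dict != {}:
    L_list.append(l_dict)
    i_len += 1
    com_list = list(itertools.combinations(item_list, i_len))
    l_dict = {c: sum(1 for t in transactions if set(c) <= t) for c in com_list}
    l_dict = {k: v for k, v in l_dict.items() if v >= min_sup}
# L_list now holds the frequent itemsets of every size, mirroring the loop above
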
Example no. 2
def main():
    parser = OptionParser()
    parser.add_option("-f","--filename",type="string",dest="file",default="DataBaseDB",help="NEED CORRECT PARAMETERS")
    (options,args) = parser.parse_args()
    if options.file == "error":
        print "hello world"
        sys.exit(1)
    task_file = options.file

    t_path = config("../conf/dp.conf")
    data_path = t_path["data_path"]

    L_list = []

    task_list,c_dict = get_task(data_path)
    item_list,l_dict = c_to_l(c_dict,min_sup,1)

    i_len = 1
    while l_dict != {}:
        L_list.append(l_dict)
        i_len += 1
        com_list = list(itertools.combinations(item_list,i_len))
        l_dict = get_ldict(task_list,com_list,min_sup,i_len)

    get_rules(L_list)
Example no. 3
def main(options):
    dp = config("../conf/dp.conf")        
    #1 merge
    if options.merge == True:
        print "合并训练测试集"
        merge(dp)
    elif options.split==True:
        print "将得到的数据分开"
        sp(dp,options.tp)
    else:
        print "error 没有这个选项"
        sys.exit(1)
def main():
    parser = OptionParser()  
    parser.add_option("-m", "--model", dest="model",  \
                      help=u"选择模型:可选择的有LR,RF,NB", metavar="your_model",default="LR")
    parser.add_option("-t","--tokenize",dest="tokenize",action="store_true",\
                      help=u"选择是否进行tokenize,tokenize会得到稍微高一点的准确率,但是效率会慢很多,默认是true",\
                      metavar="your_tokenize",default=False)

    parser.add_option("-n","--nontext",dest="nontext",action="store_true",\
                      help=u"选择是否利用非文本特征,默认是false",default=False)

    parser.add_option("-l","--LSA",dest="LSA",action="store_true",\
                      help=u"选择是否LSA,注意当选用非LR模型的时候,LSA是必须默认开着的,这个在后来我会强制一下逻辑,现在没写",\
                      default=False)
    parser.add_option("-s","--fselect",dest="fs",action="store_true",
                      help=u"选择是否进行特征选择,默认是否,加上-s后会进行选择",default=False)
    parser.add_option("-p","--topic",dest="topic",action="store_true",\
                      help=u"选择是否读取主题分布,默认是否,加上-tp后会进行读取",default=False)

    parser.add_option("-c","--combine",dest="combine",action="store_true",\
                      help=u"选择是否进行模型融合,默认是否",default=False)
    (options, args) = parser.parse_args()
    print options
    
    #read the config file
    dp = config("../conf/dp.conf")
    #read the data
    print "loading the data set"
    train,test,y,label,train_nontext,test_nontext = data(dp,options.tokenize)
    print "train size",len(train)
    print "test size",len(test)

    print "读取主题"
    total_topic = topic(dp)
    train_topic = total_topic[:len(train)]
    test_topic = total_topic[len(train):]
    print "train 大小",len(train_topic)
    print "test 大小",len(test_topic)

    if options.combine==False:
        result = train_model(train,test,y,options,train_topic,test_topic,train_nontext,test_nontext)
        print "产生结果"
        gen_submission(dp,result,label)

    else:
        combine_model(dp,train,test,y,label,train_topic,test_topic,train_nontext,test_nontext)
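
For reference, a minimal sketch of how optparse turns flags like the ones above into an options object; parsing an explicit argument list instead of sys.argv makes the behaviour easy to check, and the flag values below are made up.

from optparse import OptionParser

parser = OptionParser()
parser.add_option("-m", "--model", dest="model", default="LR")
parser.add_option("-c", "--combine", dest="combine", action="store_true", default=False)

# equivalent to passing "-m RF -c" on the command line
options, args = parser.parse_args(["-m", "RF", "-c"])
assert options.model == "RF"
assert options.combine is True
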
Example no. 5
#coding=utf-8

from word import word
from read_conf import config
from nlp import NLP
import numpy as np
import os
from sklearn import linear_model
from logistic_nd import LogisticRegression

data_conf = config('../conf/dp.conf')
tr_data_path = data_conf['train_path']
te_data_path = data_conf['test_path']

cat_dict = {'acq':0,'corn':1,'crude':2,'earn':3,'grain':4,'interest':5,'money-fx':6,'ship':7,'trade':8,'wheat':9}
nlp = NLP()

def get_doc_num(path):
    docs_dict = {'doc_sum':0}
    doc_dir = os.listdir(path)
    for doc_cat in doc_dir:
        file_list = os.listdir(path+doc_cat)
        docs_dict[cat_dict[doc_cat]] = len(file_list)
        docs_dict['doc_sum'] += docs_dict[cat_dict[doc_cat]]
    return docs_dict

def get_voc_set():
    word_dict = {}
    word_no = 0
    doc_dir = os.listdir(tr_data_path)
    for doc_cat in doc_dir:
Example no. 6
# Implementation of RAKE - Rapid Automatic Keyword Extraction algorithm
# as described in:
# Rose, S., D. Engel, N. Cramer, and W. Cowley (2010).
# Automatic keyword extraction from individual documents.
# In M. W. Berry and J. Kogan (Eds.), Text Mining: Applications and Theory. John Wiley and Sons, Ltd.

import re
import operator
import nltk

from read_conf import config
dp = config("../conf/dp.conf")

def is_number(s):
    try:
        float(s) if '.' in s else int(s)
        return True
    except ValueError:
        return False


def load_stop_words(stop_word_file):
    """
    Utility function to load stop words from a file and return as a list of words
    @param stop_word_file Path and file name of a file containing stop words.
    @return list A list of stop words.
    """
    stop_words = []
    for line in open(stop_word_file):
        if line.strip()[0:1] != "#":
            for word in line.split():  # in case more than one per line
                stop_words.append(word)
    return stop_words
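
For context, a rough sketch of how a stop-word list such as the one returned by load_stop_words is typically used in RAKE: the text is split into candidate keyword phrases at stop words and punctuation. This splitting step is not part of the excerpt above; the regex, stop words and sample sentence below are only illustrative.

import re

stop_words = ["of", "the", "and", "is", "a"]
stop_pattern = re.compile(r"\b(?:%s)\b" % "|".join(stop_words), re.IGNORECASE)

text = "Automatic keyword extraction is a method of finding the important phrases"
# replace stop words with a delimiter, then split on delimiters and punctuation
candidates = re.split(r"[|.,;:]+", stop_pattern.sub("|", text))
candidates = [c.strip().lower() for c in candidates if c.strip()]
# candidates == ['automatic keyword extraction', 'method', 'finding', 'important phrases']
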
Example no. 7
import csv
import os

from read_conf import config

cat_dic = {
    'acq': 0,
    'corn': 1,
    'crude': 2,
    'earn': 3,
    'grain': 4,
    'interest': 5,
    'money-fx': 6,
    'ship': 7,
    'trade': 8,
    'wheat': 9
}

t_path = config("../conf/dp.conf")
train_path = t_path["train_path"]
test_path = t_path["test_path"]
wordset_path = t_path["wordset_path"]

stopword = stop_set(t_path["stopword_path"])  # stop_set comes from elsewhere in the project (not shown in this excerpt)
pattern = r'''[a-zA-Z]+'''


def get_num():
    num_set = {}
    doc_num = 0
    doc_dir = os.listdir(train_path)
    for dd in doc_dir:
        f_list = os.listdir(train_path + dd)
        num_set[cat_dic[dd]] = len(f_list)
Example no. 8
#coding=utf-8

from read_conf import config
import jieba
import re
import os
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

t_path = config('../conf/dp.conf')

def get_stop_list(path):
    stop_list = []
    with open(path,'rb') as infile:
        lines = infile.readlines()
        line_num = 1
        for line in lines:
            if line_num < 278:
                stop_list.append(line.rstrip().decode('gbk'))
                line_num += 1
            else:
                break
        infile.close()
        stop_list.append(' ')
    return stop_list

def get_word_pinyin_dict(path):
    word_dict = {}
    with open(path,'rb') as infile:
        lines = infile.readlines()
Example no. 9
#coding=utf-8
'''
Let me wrap this up into a class as well
'''

from nltk import regexp_tokenize
from nltk.stem import WordNetLemmatizer
#import textblob
#from textblob.tokenizers import SentenceTokenizer as sent_tok
#from textblob.tokenizers import WordTokenizer as word_tok
from read_conf import config

stopwords = open(config('../conf/dp.conf')['stopword_path'])
stopwords = stopwords.readlines()
stopwords = [item.strip() for item in stopwords]

pattern = r'''[a-zA-Z]+'''


class NLP(object):
    def __init__(self):
        #self.__wordnetlem = WordNetLemmatizer()
        #self.__stokenizer = sent_tok()
        #self.__wtokenizer = word_tok()
        self.__stopwords = set(stopwords)

    def word_tokenize(self, document):
        tokens = regexp_tokenize(document, pattern)
        tokens = [item.lower() for item in tokens]
        tokens = [item for item in tokens if item not in stopwords]
        return tokens
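
A small illustration of what the word_tokenize method above produces, assuming NLTK is installed; the sentence and the tiny stop-word set are made up.

from nltk import regexp_tokenize

pattern = r'''[a-zA-Z]+'''
stop = set(["the", "of", "a"])

tokens = regexp_tokenize("The price of Corn rose 3% in a week", pattern)
tokens = [t.lower() for t in tokens]
tokens = [t for t in tokens if t not in stop]
# tokens == ['price', 'corn', 'rose', 'in', 'week']
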
Example no. 10
#coding=utf-8
'''
Let me wrap this up into a class as well
'''

from nltk import regexp_tokenize
from nltk.stem import WordNetLemmatizer
#import textblob
#from textblob.tokenizers import SentenceTokenizer as sent_tok
#from textblob.tokenizers import WordTokenizer as word_tok
from read_conf import config

stopwords = open(config('../conf/dp.conf')['stopword_path'])
stopwords = stopwords.readlines()
stopwords = [item.strip() for item in stopwords]

pattern = r'''[a-zA-Z]+'''

class NLP(object):
    def __init__(self):
        #self.__wordnetlem = WordNetLemmatizer()
        #self.__stokenizer = sent_tok()
        #self.__wtokenizer = word_tok()
        self.__stopwords = set(stopwords)

    def word_tokenize(self,document):
        tokens = regexp_tokenize(document,pattern)
        tokens = [item.lower() for item in tokens]
        tokens = [item for item in tokens if item not in stopwords]
        return tokens
Example no. 11
#coding: utf-8
from read_conf import config

def corpus(file_dir,name):
    f = open(file_dir)
    result = f.readlines()
    if name == "my":
        result = [i.split(":")[0] for i in result]
    else:
        result = [i.split()[0] for i in result]
    return set(result)

if __name__ == '__main__':
    conf = config("lda.conf")
    my_dir = conf["words_dir"]
    blei_dir = "/home/lavi/publishrepo/lda/ap/vocab.txt"

    my_corpus = corpus(my_dir,"my")
    blei_corpus = corpus(blei_dir,"blei")
    common = 0
    for word in my_corpus:
        if word in blei_corpus:
            common += 1
    print "my:%s"%(1.0*common/len(my_corpus))
    print "blei:%s"%(1.0*common/len(blei_corpus))
    
    
Example no. 12
#coding: utf-8
'''
author: yaoming
This file has nothing to do with the main algorithm; I only use it to count the number of distinct school IDs and teacher IDs, to see whether it is worth adding these two columns as features.
'''
from read_conf import config
import csv, sys
import numpy as np


def count_different(conf, col_num_list):
    teacher_id, school_id1, school_id2 = [], [], []
    with open(conf["project"], 'r') as pf:
        reader = csv.reader(pf)
        for line in reader:
            teacher_id.append(line[col_num_list[0]])
            school_id1.append(line[col_num_list[1]])
            school_id2.append(line[col_num_list[2]])
    print "teacher id"
    print len(set(teacher_id))
    print "school id1"
    print len(set(school_id1))
    print "school id2"
    print len(set(school_id2))


if __name__ == "__main__":
    dp_conf = config("../conf/dp.conf")
    col_list = [1, 2, 3]
    count_different(dp_conf, col_list)
Example no. 13
'''
    Loading of data and configuration
'''

from csv import DictReader
from read_conf import config
from item import item
from optparse import OptionParser
import unittest
import pickle
import sys

rawconf_dir = '../conf/raw_data.conf'
dbconf_dir = '../conf/db.conf'

raw_conf = config(rawconf_dir)
db_conf = config(dbconf_dir)


def get_raw_conf():
    return raw_conf


def get_db_conf():
    return db_conf


def get_one_item(op, data_path):
    infile = open(data_path, 'rb')

    for idx, row in enumerate(DictReader(infile)):
Example no. 15
    s = open(conf["trans_dir"])
    t = open(conf["reduction_trans_dir"],"w")
    reader = csv.reader(s)
    a = 0

    for line in reader:
        if a == 0:
            a += 1
            continue

        if line[3] in category or line[4] in company or line[5] in brand:
            write_str = ','.join(line)
            t.write(write_str+"\n")
            
        if a % 10000 == 0:
            print a    
        a += 1
    
    
if __name__ == '__main__':
    print "hello"
    data_position_conf = config("../conf/data_position.conf")

    offer = extract_offer(data_position_conf)
    reduct_transactions(data_position_conf,offer)



    

Example no. 16
        print "cross validation",np.mean(cross_validation.cross_val_score(clf,train,y,cv=3,scoring='roc_auc',n_jobs=3))
    elif ctype == "predict":
        clf.fit(train,y)
        predict = clf.predict_proba(test)[:,1]

        f = open(conf["result_essay"],"w")
        f.write("projectid,is_exciting\n")
        for it in range(len(test_id)):
            f.write("%s,%s\n"%(test_id[it],predict[it]))
    
    
if __name__ == '__main__':
    print "hello"

    #read the data conf file to get the paths
    dp = config("../conf/dp.conf")

    if len(sys.argv)!=2:
        print "usage python essay_bench.py <usage>"
        print "usage:split=> train test essay split"
        print "usage:get_y=> get well writen y"
        print "usage:train=> fit train and predict test"
        sys.exit(1)
        
    if sys.argv[1] == "split":
        #step 1: first split train and test
        
        #sub step 1: first get all the ids in the test file
        test_id = get_test_id(dp)
    
        #sub step 2: read all the essay files, split them here, and write them into the train and test files
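
The top of this example shows the usual scikit-learn pattern of scoring a classifier with cross-validation and then writing out predict_proba for the positive class. A self-contained sketch on synthetic data follows; note that sklearn.cross_validation was later replaced by sklearn.model_selection, and the shapes and classifier below are only illustrative.

import numpy as np
from sklearn import linear_model
from sklearn.model_selection import cross_val_score

rng = np.random.RandomState(0)
train = rng.rand(200, 5)
y = (train[:, 0] > 0.5).astype(int)      # synthetic binary labels
test = rng.rand(10, 5)

clf = linear_model.LogisticRegression()
cv_auc = np.mean(cross_val_score(clf, train, y, cv=3, scoring="roc_auc"))

clf.fit(train, y)
predict = clf.predict_proba(test)[:, 1]  # probability of the positive class
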
Example no. 17
import pickle
import os
import nltk
import csv
from read_conf import config
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.decomposition import PCA

cat_dic = {'acq':0,'corn':1,'crude':2,'earn':3,'grain':4,'interest':5,'money-fx':6,'ship':7,'trade':8,'wheat':9}

t_path = config("../conf/dp.conf")
train_path = t_path["train_path"]
test_path = t_path["test_path"]

def handle_doc(word_set,rs_path):
    doc_dir = os.listdir(rs_path)
    doc_matrix = []
    doc_cat = []
    for docs in doc_dir:
        files = os.listdir(rs_path+docs)
        print "start to handle the -->  "+docs
        for file_d in files:
            d_path = rs_path+docs+'/'+file_d
            #get the single file path
            with open(d_path,'rb') as text_file:
                str_tmp = ''
Example no. 18
#coding=utf-8

from word import word
from read_conf import config
from nlp import NLP
import numpy as np
import os
from sklearn import linear_model
from logistic_nd import LogisticRegression

data_conf = config('../conf/dp.conf')
tr_data_path = data_conf['train_path']
te_data_path = data_conf['test_path']

cat_dict = {
    'acq': 0,
    'corn': 1,
    'crude': 2,
    'earn': 3,
    'grain': 4,
    'interest': 5,
    'money-fx': 6,
    'ship': 7,
    'trade': 8,
    'wheat': 9
}
nlp = NLP()


def get_doc_num(path):
    docs_dict = {'doc_sum': 0}