Example 1
0
 def __init__(self, vocab=False, tfidf=False, max_feature=1000):
     """Build the text vectorizer used by this classifier.

     Args:
         vocab: if True, restrict the vocabulary to the chi-square
             selected features loaded from ``data\\feature_chi.txt``.
         tfidf: if True build a ``TfidfVectorizer``, otherwise a
             plain ``CountVectorizer``.
         max_feature: maximum number of features the vectorizer keeps.
     """
     # Optional fixed vocabulary produced by chi-square feature selection.
     lab_fea = None
     if vocab:
         print("select features...")
         lab_fea = select_feature('data\\feature_chi.txt', max_feature)["1"]

     # Both vectorizer flavours share the same configuration; build the
     # keyword set once instead of duplicating it in each branch.
     common = dict(analyzer="word",
                   tokenizer=None,
                   preprocessor=None,
                   stop_words=None,
                   vocabulary=lab_fea,
                   max_features=max_feature)
     self.vectorizer = TfidfVectorizer(**common) if tfidf else CountVectorizer(**common)
     # Logistic-regression model; presumably created later by a fit/train
     # method outside this view — TODO confirm.
     self.lr = None
Example 2
0
 def __init__(self, vocab=False, tfidf=False, max_feature=1000):
     """Build the text vectorizer used by this classifier.

     Args:
         vocab: if True, restrict the vocabulary to the chi-square
             selected features loaded from ``data\\feature_chi.txt``.
         tfidf: if True build a ``TfidfVectorizer``, otherwise a
             plain ``CountVectorizer``.
         max_feature: maximum number of features the vectorizer keeps.
     """
     # Optional fixed vocabulary produced by chi-square feature selection.
     lab_fea = None
     if vocab:
         # print() with a single argument works under both Python 2 and 3
         # (the original `print "..."` statement is Python-2-only).
         print("select features...")
         lab_fea = select_feature('data\\feature_chi.txt', max_feature)["1"]

     # Both vectorizer flavours share the same configuration; build the
     # keyword set once instead of duplicating it in each branch.
     common = dict(analyzer="word",
                   tokenizer=None,
                   preprocessor=None,
                   stop_words=None,
                   vocabulary=lab_fea,
                   max_features=max_feature)
     self.vectorizer = TfidfVectorizer(**common) if tfidf else CountVectorizer(**common)
     # Logistic-regression model; presumably created later by a fit/train
     # method outside this view — TODO confirm.
     self.lr = None
Example 3
0
for line in sentence_vector:
    # Each line looks like "<id-or-label> v1 v2 ..."; drop the leading
    # token and parse the remaining fields as floats.
    line = line.strip()
    strs = line.split(' ')[1:]
    vector = [float(item) for item in strs]
    if index == 1:
        # Debug aid: show the first parsed vector.  print() with one
        # argument is valid in both Python 2 and 3 (the original
        # `print vector` statement is Python-2-only).
        print(vector)

    if index > 50000:
        # Only the first 50k vectors are used.
        break
    elif index <= 25000:
        # First 25k vectors form the training split, the rest the test split.
        train_data_features_d2v.append(vector)
    else:
        test_data_features_d2v.append(vector)
    index += 1

# Chi-square selected vocabulary (top 19000 features); column "1" holds
# the feature names — presumably, verify against select_feature's output.
lab_fea = select_feature('data\\feature_chi.txt', 19000)["1"]

# Per-review prediction accumulator, one slot per test review.
# range() replaces the Python-2-only xrange() for py2/py3 compatibility.
result = [0.0 for i in range(num_reviews_test)]

# Number of ensemble training rounds.
max_iter = 5
for epoch in xrange(max_iter):
    print "epoch: " + str(epoch)

    l1_train_bow, l1_train_d2v, l2_train_bow, l2_train_d2v, l1_label, l2_label = sample(
        clean_train_reviews, train_data_features_d2v, list(train["sentiment"]))

    # train logistic regression ...
    print "training bow ..."
    vectorizer_bow = TfidfVectorizer(analyzer="word",
                                     tokenizer=None,
                                     preprocessor=None,
Example 4
0
                   header=0,
                   delimiter="\t",
                   quoting=3)

# Number of reviews in the test split.
num_reviews = len(test["review"])

print("Cleaning and parsing the test set movie reviews...")
# Normalise every raw review into a cleaned token string.
clean_test_reviews = [review_to_words(test["review"][idx])
                      for idx in range(num_reviews)]

# Pre-computed doc2vec feature matrices for the train and test splits.
train_data_features_d2v = np.loadtxt('../../data/train_feature_d2v.txt')
test_data_features_d2v = np.loadtxt('../../data/test_feature_d2v.txt')

# Chi-square selected vocabulary (top 1000 features).
lab_fea = select_feature('../../data/feature_chi.txt', 1000)['1']

# Per-review prediction accumulator, one slot per test review.
result = [0.0] * num_reviews

# Number of ensemble training rounds.
max_iter = 5
for epoch in range(max_iter):
    print("epoch: " + str(epoch))
    l1_train_bow, l1_train_d2v, l2_train_bow, l2_train_d2v, l1_label, l2_label = sample(
        clean_train_reviews, train_data_features_d2v,
        train["sentiment"].values)

    print("training bow ...")
    vectorizer_bow = TfidfVectorizer(analyzer="word",
                                     tokenizer=None,
                                     preprocessor=None,
                                     stop_words=None,
Example 5
0
for line in sentence_vector:
    # Each line looks like "<id-or-label> v1 v2 ..."; drop the leading
    # token and parse the remaining fields as floats.
    line = line.strip()
    strs = line.split(' ')[1:]
    vector = [float(item) for item in strs]
    if index == 1:
        # Debug aid: show the first parsed vector.  print() with one
        # argument is valid in both Python 2 and 3 (the original
        # `print vector` statement is Python-2-only).
        print(vector)

    if index > 50000:
        # Only the first 50k vectors are used.
        break
    elif index <= 25000:
        # First 25k vectors form the training split, the rest the test split.
        train_data_features_d2v.append(vector)
    else:
        test_data_features_d2v.append(vector)
    index += 1

# Chi-square selected vocabulary (top 19000 features); column "1" holds
# the feature names — presumably, verify against select_feature's output.
lab_fea = select_feature('data\\feature_chi.txt', 19000)["1"]

# Per-review prediction accumulator, one slot per test review.
# range() replaces the Python-2-only xrange() for py2/py3 compatibility.
result = [0.0 for i in range(num_reviews_test)]

# Number of ensemble training rounds.
max_iter = 5
for epoch in xrange(max_iter):
    print "epoch: " + str(epoch)
    
    l1_train_bow, l1_train_d2v, l2_train_bow, l2_train_d2v, l1_label, l2_label = sample(clean_train_reviews, train_data_features_d2v, 
                                                    list(train["sentiment"]))
    
    # train logistic regression ...
    print "training bow ..."
    vectorizer_bow = TfidfVectorizer(analyzer = "word",
                                     tokenizer = None,
                                     preprocessor = None,