def __init__(self, vocab=False, tfidf=False, max_feature=1000):
    """Build the text vectorizer for this model.

    Args:
        vocab: when True, restrict the vocabulary to the terms chosen by
            chi-square feature selection, read from data\\feature_chi.txt
            via select_feature.
        tfidf: when True use a TfidfVectorizer (tf-idf weights); otherwise
            a CountVectorizer (raw term counts).
        max_feature: cap on the vocabulary size.
    """
    lab_fea = None
    if vocab:  # idiomatic truth test instead of `== True`
        print("select features...")
        # select_feature returns a subscriptable result; key "1" holds the
        # selected terms (presumably a label->terms mapping — confirm).
        lab_fea = select_feature('data\\feature_chi.txt', max_feature)["1"]
    # Both branches used identical settings; only the weighting scheme
    # differs, so pick the class once instead of duplicating the kwargs.
    vectorizer_cls = TfidfVectorizer if tfidf else CountVectorizer
    self.vectorizer = vectorizer_cls(analyzer="word",
                                     tokenizer=None,
                                     preprocessor=None,
                                     stop_words=None,
                                     vocabulary=lab_fea,
                                     max_features=max_feature)
    # Classifier is fitted later; placeholder until then.
    self.lr = None
def __init__(self, vocab=False, tfidf=False, max_feature=1000):
    """Build the text vectorizer for this model.

    Args:
        vocab: when True, restrict the vocabulary to the terms chosen by
            chi-square feature selection, read from data\\feature_chi.txt
            via select_feature.
        tfidf: when True use a TfidfVectorizer (tf-idf weights); otherwise
            a CountVectorizer (raw term counts).
        max_feature: cap on the vocabulary size.
    """
    lab_fea = None
    if vocab:  # idiomatic truth test instead of `== True`
        # Fixed: was a Python 2 print statement (syntax error on Python 3,
        # and inconsistent with the print() calls elsewhere in this file).
        print("select features...")
        # select_feature returns a subscriptable result; key "1" holds the
        # selected terms (presumably a label->terms mapping — confirm).
        lab_fea = select_feature('data\\feature_chi.txt', max_feature)["1"]
    # Both branches used identical settings; only the weighting scheme
    # differs, so pick the class once instead of duplicating the kwargs.
    vectorizer_cls = TfidfVectorizer if tfidf else CountVectorizer
    self.vectorizer = vectorizer_cls(analyzer="word",
                                     tokenizer=None,
                                     preprocessor=None,
                                     stop_words=None,
                                     vocabulary=lab_fea,
                                     max_features=max_feature)
    # Classifier is fitted later; placeholder until then.
    self.lr = None
# NOTE(review): legacy Python 2 fragment (print statements, xrange). This
# chunk starts mid-script: sentence_vector, index and the *_d2v lists are
# defined before the visible region.
for line in sentence_vector:
    line = line.strip()
    # Skip the first whitespace-separated token (presumably an id — confirm)
    # and parse the remainder as a float vector.
    strs = line.split(' ')[1:]
    vector = [float(item) for item in strs]
    if index == 1:
        # Debug print of the first parsed vector.
        print vector
    if index > 50000:
        break
    elif index <= 25000:
        # First 25000 vectors -> training doc2vec features.
        train_data_features_d2v.append(vector)
    else:
        # Vectors 25001..50000 -> test doc2vec features.
        test_data_features_d2v.append(vector)
    index += 1
# Chi-square-selected vocabulary; key "1" holds the selected terms
# (same access pattern as the other select_feature call sites here).
lab_fea = select_feature('data\\feature_chi.txt', 19000)["1"]
# Presumably accumulates averaged predictions across epochs — the
# accumulation itself is outside this chunk.
result = [0.0 for i in xrange(num_reviews_test)]
max_iter = 5
for epoch in xrange(max_iter):
    print "epoch: " + str(epoch)
    # sample(...) is a project helper; returns bow/d2v features plus labels
    # for two sub-samples — confirm against its definition.
    l1_train_bow, l1_train_d2v, l2_train_bow, l2_train_d2v, l1_label, l2_label = sample(
        clean_train_reviews, train_data_features_d2v, list(train["sentiment"]))
    # train logistic regression ...
    print "training bow ..."
    # NOTE(review): this call continues past the end of the visible chunk —
    # the remaining TfidfVectorizer kwargs are outside this region.
    vectorizer_bow = TfidfVectorizer(analyzer="word", tokenizer=None, preprocessor=None,
# NOTE(review): this chunk begins mid-expression — the kwargs below close a
# call that starts before the visible region (presumably pd.read_csv on the
# test set, given `test["review"]` below — confirm).
header=0, delimiter="\t", quoting=3)
num_reviews = len(test["review"])
clean_test_reviews = []
print("Cleaning and parsing the test set movie reviews...")
for i in range(0, num_reviews):
    # review_to_words is a project helper; presumably returns one cleaned
    # text string per review — confirm against its definition.
    clean_review = review_to_words(test["review"][i])
    clean_test_reviews.append(clean_review)
# Pre-computed doc2vec feature matrices for the train/test reviews.
train_data_features_d2v = np.loadtxt('../../data/train_feature_d2v.txt')
test_data_features_d2v = np.loadtxt('../../data/test_feature_d2v.txt')
# Chi-square-selected vocabulary; key '1' holds the selected terms
# (same access pattern as the other select_feature call sites here).
lab_fea = select_feature('../../data/feature_chi.txt', 1000)['1']
# Presumably accumulates averaged predictions across epochs — the
# accumulation itself is outside this chunk.
result = [0.0 for i in range(num_reviews)]
max_iter = 5
for epoch in range(max_iter):
    print("epoch: " + str(epoch))
    # sample(...) is a project helper; returns bow/d2v features plus labels
    # for two sub-samples — confirm against its definition.
    l1_train_bow, l1_train_d2v, l2_train_bow, l2_train_d2v, l1_label, l2_label = sample(
        clean_train_reviews, train_data_features_d2v, train["sentiment"].values)
    print("training bow ...")
    # NOTE(review): this call continues past the end of the visible chunk —
    # the remaining TfidfVectorizer kwargs are outside this region.
    vectorizer_bow = TfidfVectorizer(analyzer="word", tokenizer=None, preprocessor=None, stop_words=None,
# NOTE(review): legacy Python 2 fragment (print statements, xrange), a
# near-duplicate of another chunk in this file. It starts mid-script:
# sentence_vector, index and the *_d2v lists are defined before the
# visible region.
for line in sentence_vector:
    line = line.strip()
    # Skip the first whitespace-separated token (presumably an id — confirm)
    # and parse the remainder as a float vector.
    strs = line.split(' ')[1:]
    vector = [float(item) for item in strs]
    if index == 1:
        # Debug print of the first parsed vector.
        print vector
    if index > 50000:
        break
    elif index <= 25000:
        # First 25000 vectors -> training doc2vec features.
        train_data_features_d2v.append(vector)
    else:
        # Vectors 25001..50000 -> test doc2vec features.
        test_data_features_d2v.append(vector)
    index += 1
# Chi-square-selected vocabulary; key "1" holds the selected terms
# (same access pattern as the other select_feature call sites here).
lab_fea = select_feature('data\\feature_chi.txt', 19000)["1"]
# Presumably accumulates averaged predictions across epochs — the
# accumulation itself is outside this chunk.
result = [0.0 for i in xrange(num_reviews_test)]
max_iter = 5
for epoch in xrange(max_iter):
    print "epoch: " + str(epoch)
    # sample(...) is a project helper; returns bow/d2v features plus labels
    # for two sub-samples — confirm against its definition.
    l1_train_bow, l1_train_d2v, l2_train_bow, l2_train_d2v, l1_label, l2_label = sample(clean_train_reviews, train_data_features_d2v, list(train["sentiment"]))
    # train logistic regression ...
    print "training bow ..."
    # NOTE(review): this call continues past the end of the visible chunk —
    # the remaining TfidfVectorizer kwargs are outside this region.
    vectorizer_bow = TfidfVectorizer(analyzer = "word", tokenizer = None, preprocessor = None,