def train(self, train_corp):
    """Fit the feature transformer and the classifier on a training corpus.

    Steps, in order:

    1. Outside debug mode the corpus is first passed through
       ``get_nice_data``; in both modes it is then passed through
       ``Solution._remove_differencies``.
    2. Element 0 of the result is used as the texts; element 1 is encoded
       into the target with ``self._encode_opinions``.
    3. Every text is tokenized with ``Solution._text_tokenize`` and every
       n-gram of every token (``Solution._get_ngrams``) is registered via
       ``self._ngr_add``.
    4. One feature row per text is built with
       ``self._get_features_from_tokens``, the rows are passed through
       ``self._feature_transformer.fit`` and ``self._clf`` is fitted on
       the result.

    In debug mode the number of features before and after the transformer
    step is printed (0 when there are no rows).

    :param train_corp: training corpus; after the preprocessing above it
        must be indexable as ``(texts, opinions)``.
    :return: None. The method works by updating ``self``'s n-gram state,
        feature transformer and classifier.
    """

    def _row_width(rows):
        # Width of the first feature row; 0 for an empty collection so the
        # debug output cannot raise IndexError on an empty corpus.
        return len(rows[0]) if len(rows) else 0

    if not self._debug:
        train_corp = get_nice_data(train_corp)
    train_corp = Solution._remove_differencies(train_corp)

    texts = train_corp[0]
    target = self._encode_opinions(train_corp[1])

    # First pass: tokenize each text once, keep the tokens for the second
    # pass, and register all n-grams.
    token_list = []
    for text in texts:
        tokens = list(Solution._text_tokenize(text))
        token_list.append(tokens)
        for token in tokens:
            for ngram in Solution._get_ngrams(token):
                self._ngr_add(ngram)

    # Second pass, deliberately after the loop above has finished:
    # presumably feature extraction depends on the complete set of
    # registered n-grams -- TODO confirm against _get_features_from_tokens.
    features_list = [
        self._get_features_from_tokens(tokens) for tokens in token_list
    ]

    if self._debug:
        # Single-argument print() emits the same text under Python 2 and 3.
        print('Initial number of features: %d' % _row_width(features_list))

    # NOTE(review): the return value of fit() is used as the feature rows.
    # scikit-learn transformers return ``self`` from fit(); this looks like
    # a project-specific transformer whose fit() returns the reduced rows
    # -- confirm.
    features_list = self._feature_transformer.fit(features_list, target)

    if self._debug:
        print('Reduced number of features: %d' % _row_width(features_list))

    self._clf.fit(features_list, target)
all_ops.update(set(op)) d = dict() for op in all_ops: d[op] = len(d) return d def transform(ops, tr): ret = [] for op in ops: ret.append(list(map(lambda x: tr[x], op))) return ret if True: train_data = get_nice_data(get_data('reviews.json')) train_data = list(map(lambda x: np.array(x), train_data)) scores = [] for train_idx, test_idx in KFold(len(train_data[0]), n_folds=7, \ shuffle=True): X_train = train_data[0][train_idx] Y_train = train_data[1][train_idx] X_test, Y_test = Solution._remove_differencies((train_data[0][test_idx],\ train_data[1][test_idx]), True) sol = Solution(True) sol.train((X_train, Y_train)) # sometimes it says "AttributeError: '_ConstantPredictor'