def _word_tfidf_dist(documents):
    words_tfidf = {}
    if len(documents) > 0:
        if _check_is_sentence(documents):
            # If the documents contain only 1-, 2- or 3-character tokens (acronyms),
            # the vectorizer may fail to build a vocabulary, hence the try/except.
            try:
                text_analyzer = Vectorizer(ngram_range=(1, 2), max_features=50)
                matrix = text_analyzer.fit_transform(documents).todense()
                for word, indice in text_analyzer.vocabulary_.items():
                    words_tfidf[word] = score_tfidf_freq(matrix[:, indice])
            except ValueError:
                return {}
        else:
            return _freqdist(documents)
    return words_tfidf
def vectorize_videos(fpath, use_idf=False):
    '''
    Converts a YouTube tag file to a weighted sparse matrix.
    Weights can be IDF-based if specified.

    Arguments
    ---------
    fpath: a path to a file
        Each line is a song; tags are separated by spaces
    use_idf: bool (optional, defaults to False)
        Indicates whether to use IDF.
    '''
    # Vectorize to TF-IDF
    vectorizer = Vectorizer(analyzer=NoopAnalyzer(), use_idf=use_idf)
    sparse_matrix = vectorizer.fit_transform(clean_up(fpath, bottom_filter=0))
    vocabulary = vectorizer.vocabulary
    return sparse_matrix, vocabulary
def test_vectorizer_inverse_transform():
    # raw documents
    data = ALL_FOOD_DOCS
    for vectorizer in (Vectorizer(), CountVectorizer()):
        transformed_data = vectorizer.fit_transform(data)
        inversed_data = vectorizer.inverse_transform(transformed_data)
        for i, doc in enumerate(data):
            data_vec = np.sort(np.unique(vectorizer.analyzer.analyze(doc)))
            inversed_data_vec = np.sort(np.unique(inversed_data[i]))
            assert (data_vec == inversed_data_vec).all()
def __init__(self, data, n_train):
    try:
        self.i_body = data.features.index('body')
    except ValueError:
        raise ValueError('please include body')
    comments, scores = it.tee(it.islice(data.get_pairs(), 0, n_train))
    comments = np.array(list(f[self.i_body] for (i, s, f) in comments))
    scores = np.array(list(s for (i, s, f) in scores))
    self.vectorizer = Vectorizer(ngram_range=(1, 2))
    X = self.vectorizer.fit_transform(comments)
    self.model = LinearRegression(normalize=True).fit(X, scores)
def __init__(self, data, n_train):
    try:
        self.i_body = data.features.index('body')
    except ValueError:
        raise ValueError('please include body')
    comments, scores = it.tee(it.islice(data.get_pairs(), 0, n_train))
    comments = np.array(list(f[self.i_body] for (i, s, f) in comments))
    scores = np.array(list(s for (i, s, f) in scores))
    self.vectorizer = Vectorizer()
    X = self.vectorizer.fit_transform(comments)
    self.model = Perc(max_iter=100, tol=None)
    self.model.fit(X, scores)
def main(args):
    scorer = make_scorer(my_scorer)
    vectorizer = Vectorizer()
    early_stopping = EarlyStopping(monitor='val_loss', patience=1)

    for cat1ind in range(len(cats) - 1):
        cat1 = cats[cat1ind]
        for cat2ind in range(cat1ind + 1, len(cats)):
            cat2 = cats[cat2ind]
            subcats = [cat1, cat2]
            newsgroups_train = fetch_20newsgroups(subset='train',
                                                  remove=('headers', 'footers', 'quotes'),
                                                  categories=subcats)
            vectors = vectorizer.fit_transform(newsgroups_train.data)
            #vectors = vectors.toarray()
            scaler = StandardScaler(with_mean=False)
            print(scaler.fit(vectors))  ## Doesn't seem to matter
            scaled_vectors = scaler.transform(vectors)

            ## Put targets in the range -1 to 1 instead of 0/1
            binary_targets = newsgroups_train.target
            class_targets = newsgroups_train.target * 2 - 1
            cat_targets = to_categorical(binary_targets)
            #targets = newsgroups_train.target * 2 - 1

            print("Classifying %s vs. %s" % (cat1, cat2))

            ## Get NN performance
            print("Classifying with svm-like (no hidden layer) neural network:")
            #model = get_model(vectors.shape[1])
            sp_model = KerasClassifier(build_fn=get_svmlike_model, input_dims=vectors.shape[1],
                                       l2_weight=0.01, epochs=50, validation_split=0.2,
                                       batch_size=32)
            #score = np.average(cross_val_score(sp_model, vectors.toarray(), newsgroups_train.target, scoring=scorer, n_jobs=1, fit_params=dict(verbose=1, callbacks=[early_stopping])))
            param_grid = {'l2_weight': [0.001], 'lr': [0.1]}
            clf = GridSearchCV(sp_model, param_grid, scoring=scorer)
            clf.fit(vectors.toarray(), class_targets)
            print("\nScore of nn cross-validation=%f with parameters=%s" % (clf.best_score_, clf.best_params_))

            ####################################################################
            # Below here is the actual svm
            ####################################################################
            print("Classifying with linear svm:")
            max_score = max_c = 0
            params = {'C': [0.01, 0.1, 1.0, 10.0, 100]}
            svc = svm.LinearSVC()
            clf = GridSearchCV(svc, params, scoring=scorer)
            clf.fit(vectors, binary_targets)
            print("Best SVM performance was %f with c=%f" % (clf.best_score_, clf.best_params_['C']))
            sys.exit(-1)
def tfidf(corpus, r=(1, 1), midf=3, madf=0.7, feats=10000, sublinear=True):
    vectorizer = Vectorizer(min_df=midf, max_df=madf, ngram_range=r,
                            max_features=feats, stop_words='english',
                            sublinear_tf=sublinear)
    X = vectorizer.fit_transform(corpus)
    print("n_samples: %d, n_features: %d" % X.shape)
    return X
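# Usage sketch (not part of the original source): it assumes the tfidf helper
# above is in scope and that Vectorizer is scikit-learn's TfidfVectorizer
# (sublinear_tf is a TfidfVectorizer parameter); the toy corpus and the
# bigram/df settings below are illustrative only.
from sklearn.feature_extraction.text import TfidfVectorizer as Vectorizer

docs = ["the quick brown fox jumps", "the lazy dog sleeps", "quick brown dogs jump"]
X = tfidf(docs, r=(1, 2), midf=1, madf=1.0, feats=1000)  # unigrams + bigrams, permissive df cutoffs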
def __init__(self, data, n_train):
    try:
        self.i_body = data.features.index('body')
    except ValueError:
        raise ValueError('please include body')
    comments, scores = it.tee(it.islice(data.get_pairs(), 0, n_train))
    comments = np.array(list(f[self.i_body] for (i, s, f) in comments))
    scores = np.array(list(s for (i, s, f) in scores))
    self.vectorizer = Vectorizer()
    X = self.vectorizer.fit_transform(comments)
    self.model = LogisticRegression(solver='lbfgs').fit(X, scores)
def test_vectorizer_inverse_transform():
    # raw documents
    data = ALL_FOOD_DOCS
    for vectorizer in (Vectorizer(), CountVectorizer()):
        transformed_data = vectorizer.fit_transform(data)
        inversed_data = vectorizer.inverse_transform(transformed_data)
        for i, doc in enumerate(data):
            data_vec = np.sort(np.unique(vectorizer.analyzer.analyze(doc)))
            inversed_data_vec = np.sort(np.unique(inversed_data[i]))
            assert (data_vec == inversed_data_vec).all()

        # Test that inverse_transform also works with numpy arrays
        transformed_data = np.asarray(transformed_data.todense())
        inversed_data2 = vectorizer.inverse_transform(transformed_data)
        for terms, terms2 in zip(inversed_data, inversed_data2):
            assert (np.sort(terms) == np.sort(terms2)).all()
def train_classifier(texts, y):
    '''
    Here is a perfect example of the "feel it ... func it" philosophy: the pype
    call uses the function arguments and function body to specify three
    variables: texts, a list of strings; y, a list of floats; and vectorizer, a
    scikit-learn object that vectorizes text.  This reiterates the advice that
    you should use the function body and function arguments to declare your
    scope whenever you can.

    Line-by-line, here we go:

    {'vectorizer': vectorizer.fit, 'X': vectorizer.transform},

    We build a dict, the first element of which is the fit vectorizer.
    Luckily, the 'fit' function returns an instance of the trained vectorizer,
    so we do not need to use _do.  This vectorizer is then assigned to
    'vectorizer'.  Because iterating through dictionaries in Python 3.6
    preserves the order in which the keys were declared, we can apply the fit
    function to the vectorizer on the texts and assign that to the
    'vectorizer' key.  We need this instance of the vectorizer to run the
    classifier on unknown texts.  After this, we apply 'transform' to convert
    the texts into a training matrix keyed by 'X', whose rows are texts and
    whose columns are words.

    _a('classifier', (Classifier().fit, _['X'], y)),

    Finally, we can build a classifier.  _a, or _assoc, means we are adding a
    key-value pair to the previous dictionary.  This will be a new instance of
    our Classifier, which is trained through the fit function on the text-word
    matrix 'X' and the labels vector y.

    _d('X'),

    Since we don't need the X matrix anymore, we delete it from the returned
    JSON, which now only contains 'vectorizer' and 'classifier', the two
    things we will need to classify unknown texts.
    '''
    vectorizer = Vectorizer()

    return p(
        texts,
        {'vectorizer': vectorizer.fit, 'X': vectorizer.transform},
        _a('classifier', (Classifier().fit, _['X'], y)),
        _d('X'),
    )
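# Usage sketch (not from the original source): how the dict returned by
# train_classifier above might be used to label unseen texts.  The helper
# name classify_texts is hypothetical and not part of the original code.
def classify_texts(model, new_texts):
    # model == {'vectorizer': <fitted vectorizer>, 'classifier': <fitted classifier>}
    X_new = model['vectorizer'].transform(new_texts)
    return model['classifier'].predict(X_new)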
def get_20newsgroups_data_info_for_categories(categories):
    data = fetch_20newsgroups(subset='all', categories=categories, shuffle=False)
    vectorizer = Vectorizer()
    t0 = time()
    tfidf = vectorizer.fit_transform(data.data)
    pairwise_similarity = (tfidf * tfidf.T).todense().tolist()
    print "done in %fs" % (time() - t0)

    labels = [data.target_names[i] for i in data.target]
    payloads = [os.sep.join(e.split(os.sep)[-3:]) for e in data.filenames]

    # Similarity is from zero to one - so (1 - s) gives distance from 0 to 1.
    distances = [[(1 - s) for s in row[:col_to + 1]]
                 for (col_to, row) in enumerate(pairwise_similarity)]

    # Fix the very slight off-ness involved in precision conversion
    for row in distances:
        row[-1] = 0

    pcd_tuples = zip(payloads, labels, distances)
    di = DataInfo.deserialize_pcd_tuples(pcd_tuples)
    return di
def test_vectorizer_inverse_transform():
    # raw documents
    data = ALL_FOOD_DOCS
    for vectorizer in (Vectorizer(), CountVectorizer()):
        transformed_data = vectorizer.fit_transform(data)
        inversed_data = vectorizer.inverse_transform(transformed_data)
        for i, doc in enumerate(data):
            terms = np.sort(np.unique(vectorizer.analyzer.analyze(doc)))
            inversed_terms = np.sort(np.unique(inversed_data[i]))
            assert_array_equal(terms, inversed_terms)

        # Test that inverse_transform also works with numpy arrays
        transformed_data = transformed_data.toarray()
        inversed_data2 = vectorizer.inverse_transform(transformed_data)
        for terms, terms2 in zip(inversed_data, inversed_data2):
            assert_array_equal(terms, terms2)
def make_vectorizer(top_words):
    return Vectorizer(
        # Split based on words
        analyzer="word",
        # Cap features
        max_features=top_words,
        # Only take words that appear in 2 or more headlines
        # (deciding based on headline-unique words is probably over-fitting)
        min_df=2,
        # Make the math simple for computing probabilities / info-gain
        # (word inclusion is either-or).  Shouldn't have a significant impact
        # because very few headlines contain duplicate words.
        binary=True)
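# Usage sketch (not from the original source): the headlines below are
# illustrative, and aliasing CountVectorizer as Vectorizer is an assumption
# about how Vectorizer is defined in the module containing make_vectorizer.
from sklearn.feature_extraction.text import CountVectorizer as Vectorizer

headlines = ["stocks rally on earnings",
             "earnings beat expectations",
             "stocks slide on weak earnings"]
vec = make_vectorizer(top_words=100)
X = vec.fit_transform(headlines)   # binary document-term matrix
print(X.toarray())
print(vec.vocabulary_)             # only words appearing in >= 2 headlines survive min_df=2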
def main(args):
    scorer = make_scorer(accuracy_score)
    vectorizer = Vectorizer()
    epochs = 50
    valid_pct = 0.2

    for cat1ind in range(len(cats) - 1):
        cat1 = cats[cat1ind]
        for cat2ind in range(cat1ind + 1, len(cats)):
            cat2 = cats[cat2ind]
            print("Classifying %s vs. %s" % (cat1, cat2))
            subcats = [cat1, cat2]
            newsgroups_train = fetch_20newsgroups(subset='train',
                                                  remove=('headers', 'footers', 'quotes'),
                                                  categories=subcats)
            vectors = vectorizer.fit_transform(newsgroups_train.data)
            #vectors = vectors.toarray()
            scaler = StandardScaler(with_mean=False)
            scaler.fit(vectors)  ## Doesn't seem to matter
            scaled_vectors = scaler.transform(vectors)

            ## Put targets in the range -1 to 1 instead of 0/1
            binary_targets = newsgroups_train.target
            class_targets = newsgroups_train.target * 2 - 1

            ####################################################################
            # Below here is the actual svm
            ####################################################################
            max_score = max_c = 0
            params = {'C': [0.01, 0.1, 1.0, 10.0, 100]}
            svc = svm.LinearSVC()
            clf = GridSearchCV(svc, params, scoring=scorer)
            clf.fit(vectors, binary_targets)
            print("Best SVM performance was acc=%f with c=%f" % (clf.best_score_, clf.best_params_['C']))
def test_pickle():
    for obj in (CountVectorizer(), TfidfTransformer(), Vectorizer()):
        s = pickle.dumps(obj)
        assert_equal(type(pickle.loads(s)), obj.__class__)
def test_vectorizer():
    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert_equal(counts_train[0, v1.vocabulary[u"pizza"]], 2)

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary)

    # check that the two vectorizers give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()
        assert_equal(counts_test[0, v.vocabulary[u"salad"]], 1)
        assert_equal(counts_test[0, v.vocabulary[u"tomato"]], 1)
        assert_equal(counts_test[0, v.vocabulary[u"water"]], 1)

        # stop word from the fixed list
        assert_false(u"the" in v.vocabulary)

        # stop word found automatically by the vectorizer DF thresholding:
        # words that are highly frequent across the complete corpus are likely
        # to be uninformative (either real stop words or extraction artifacts)
        assert_false(u"copyright" in v.vocabulary)

        # not present in the sample
        assert_equal(counts_test[0, v.vocabulary[u"coke"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"burger"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"beer"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"pizza"]], 0)

    # test tf-idf
    t1 = TfidfTransformer(norm='l1')
    tfidf = toarray(t1.fit(counts_train).transform(counts_train))
    assert_equal(len(t1.idf_), len(v1.vocabulary))
    assert_equal(tfidf.shape, (n_train, len(v1.vocabulary)))

    # test tf-idf with new data
    tfidf_test = toarray(t1.transform(counts_test))
    assert_equal(tfidf_test.shape, (len(test_data), len(v1.vocabulary)))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = toarray(t2.fit(counts_train).transform(counts_train))
    assert_equal(t2.idf_, None)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = Vectorizer(norm='l1')
    tv.tc.max_df = v1.max_df
    tfidf2 = toarray(tv.fit_transform(train_data))
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = toarray(tv.transform(test_data))
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # test empty vocabulary
    v3 = CountVectorizer(vocabulary=None)
    assert_raises(ValueError, v3.transform, train_data)
                                shuffle=True, random_state=42)

filenames = np.concatenate((data_train.filenames, data_test.filenames))
target_names = set(data_train.target_names + data_test.target_names)

print "%d documents" % len(filenames)
print "%d categories" % len(target_names)
print

# split a training set and a test set
labels = np.concatenate((data_train.target, data_test.target))
true_k = np.unique(labels).shape[0]

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_features=10000)
X = vectorizer.fit_transform((open(f).read() for f in filenames))
X = Normalizer(norm="l2", copy=False).transform(X)

print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X.shape
print

###############################################################################
# Now sparse MiniBatchKmeans

print "_" * 80
mbkm = MiniBatchKMeans(init="random", k=true_k, max_iter=10, random_state=13,
                                shuffle=True, random_state=42)

documents = data_train.data + data_test.data
target_names = set(data_train.target_names + data_test.target_names)

print "%d documents" % len(documents)
print "%d categories" % len(target_names)
print

# split a training set and a test set
labels = np.concatenate((data_train.target, data_test.target))
true_k = np.unique(labels).shape[0]

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_features=10000)
X = vectorizer.fit_transform(documents)
X = Normalizer(norm="l2", copy=False).transform(X)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X.shape
print

###############################################################################
# Now sparse MiniBatchKmeans

mbkm = MiniBatchKMeans(init="random", k=true_k, max_iter=10, random_state=13,
                       chunk_size=1000)
print "Clustering sparse data with %s" % str(mbkm)
t0 = time()
def main(tcu_fpath):
    data = tcu_io.load_untreated_csv_to_numpy(tcu_fpath)
    data = data[data['Situacao'] == 'Aceito e Habilitado']

    desc_column = data['Descricao']
    des_cmp_column = data['DescricaoComplementar']
    unidade_column = data['UnidadeFornecimento']
    qtd_column = [str(qtd) for qtd in data['Quantidade']]

    # Transforms descriptions to base strings
    as_docs = []
    for as_text in zip(desc_column, des_cmp_column, unidade_column, qtd_column):
        doc = " ".join(as_text)
        as_docs.append(doc)

    # Vectorizes to TF-IDF
    vectorizer = Vectorizer()
    doc_sparse_matrix = vectorizer.fit_transform(as_docs)

    # Compute clusters
    inter = {}
    intra = {}
    n_runs = 20
    k_vals = range(2, 16)
    for i in xrange(n_runs):
        for k in k_vals:
            # Each K has n_runs clusterings
            inter_array = inter.setdefault(k, np.zeros(n_runs))
            intra_array = intra.setdefault(k, np.zeros(n_runs))

            # Run K-Means
            mbkm = MiniBatchKMeans(k, init='random')
            mbkm.fit(doc_sparse_matrix)

            centers = mbkm.cluster_centers_
            labels = mbkm.labels_

            # Inter distance. We use min because the idea is to maximize this;
            # min serves as a penalty for the worst case.
            dist_centers = pairwise.euclidean_distances(centers)
            min_dist_between_centers = \
                np.min(dist_centers[dist_centers > 0])
            inter_array[i] = min_dist_between_centers

            # Intra distance
            dist_all_centers = mbkm.transform(doc_sparse_matrix)
            intra_dists = []
            for doc_id, cluster in enumerate(labels):
                dist = dist_all_centers[doc_id, cluster]
                intra_dists.append(dist)
            intra_array[i] = np.mean(intra_dists)

            # Prints num elements per cluster
            print('Run %d ; k = %d' % (i, k))
            counter = Counter(labels)
            for cluster, population in counter.items():
                print('\tK = %d; Pop = %d' % (cluster, population))
            print()

    x = inter.keys()
    y = []
    c = []
    for k in x:
        div = inter[k] / intra[k]
        y.append(np.mean(div))
        c.append(half_confidence_interval_size(div, 0.90))

    # hack for the zero to appear
    x = [0] + x
    y = [0] + y
    c = [0] + c

    ax = plt.gca()
    ax.set_yscale('log')
    ax.set_xticks(range(0, 16))
    plt.ylabel('InterCluster/IntraCluster Ratio')
    plt.xlabel('Number of clusters')
    plt.errorbar(x, y, yerr=c, fmt='bo', markersize=8, elinewidth=2)
    plt.show()
def __init__(self):
    self.__morph = MorphAnalyzer()
    self.__tokenizer = RegexpTokenizer(r'\w+')
    self.__vectorizer = Vectorizer()
                                shuffle=True, random_state=42)
print 'data loaded'

categories = data_train.target_names    # for case categories == None

print "%d documents (training set)" % len(data_train.data)
print "%d documents (testing set)" % len(data_test.data)
print "%d categories" % len(categories)
print

# split a training set and a test set
y_train, y_test = data_train.target, data_test.target

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(sublinear_tf=True, max_df=0.5)
X_train = vectorizer.fit_transform(data_train.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
print

print "Extracting features from the test dataset using the same vectorizer"
t0 = time()
X_test = vectorizer.transform(data_test.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_test.shape
print

if opts.select_chi2:
    print ("Extracting %d best features by a chi-squared test" %
           opts.select_chi2)
print categories if categories else "all"
data_set = load_files('Privacypolicy/raw', categories=categories,
                      shuffle=True, random_state=42)
print 'data loaded'
# print "%d documents" % len(data_set.data)
# print "%d categories" % len(data_set.target_names)
print

# load unlabeled data
data_set_unlabel = load_files('Privacypolicy/unlabeled', shuffle=True,
                              random_state=30)

# Extract features
print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_features=10000)
X = vectorizer.fit_transform(data_set.data)
X = Normalizer(norm="l2", copy=False).transform(X)
X = X.toarray()

X_unlabel = vectorizer.transform(data_set_unlabel.data)
X_unlabel = X_unlabel.toarray()

y = data_set.target

n_samples, n_features = X.shape
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % (n_samples, n_features)
print
    sys.exit(1)

input_data = csv.reader(open('descriptions_100.csv', 'rb'))
dataset_data = []
dataset_target = []
for row in input_data:
    dataset_data.append(row[1])
    dataset_target.append(row[0])

labels = dataset_target
true_k = np.unique(labels).shape[0]

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_df=0.95, max_features=10000)
X = vectorizer.fit_transform(dataset_data)
print X
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X.shape

###############################################################################
# Do the actual clustering

km = MiniBatchKMeans(k=true_k, init='k-means++', n_init=1, init_size=1000,
                     batch_size=1000, verbose=1)

print "Clustering with %s" % km
t0 = time()
km.fit(X)
def main(args):
    vectorizer = Vectorizer()
    epochs = 500
    valid_pct = 0.2
    default_lr = 10.0
    batch_size = 64
    MAX_TRIES = 5

    if torch.cuda.is_available():
        print("CUDA found so processing will be done on the GPU")

    for cat1ind in range(len(cats) - 1):
        cat1 = cats[cat1ind]
        for cat2ind in range(cat1ind + 1, len(cats)):
            cat2 = cats[cat2ind]
            subcats = [cat1, cat2]
            print("Classifying %s vs. %s" % (cat1, cat2))

            newsgroups_train = fetch_20newsgroups(subset='train',
                                                  remove=('headers', 'footers', 'quotes'),
                                                  categories=subcats)
            vectors = vectorizer.fit_transform(newsgroups_train.data)
            #vectors = vectors.toarray()
            scaler = StandardScaler(with_mean=False)
            scaler.fit(vectors)  ## Doesn't seem to matter
            scaled_vectors = scaler.transform(vectors)

            train_X_tensor = Tensor(scaled_vectors.toarray())
            if torch.cuda.is_available():
                pyt_data = TensorDataset(train_X_tensor.cuda(), train_X_tensor.cuda())
            else:
                pyt_data = TensorDataset(train_X_tensor, train_X_tensor)

            end_train_range = int((1 - valid_pct) * vectors.shape[0])
            iterations = 0
            valid_X = pyt_data[end_train_range:][0]
            valid_y = train_X_tensor[end_train_range:]

            my_lr = default_lr
            model = AutoEncoderModel(vectors.shape[1], 2, lr=my_lr)

            for try_num in range(MAX_TRIES):
                nan = False
                for epoch in range(epochs):
                    nan = False
                    epoch_loss = 0
                    for batch_ind in range(end_train_range // batch_size):
                        start_ind = batch_ind * batch_size
                        end_ind = min(start_ind + batch_size, end_train_range)
                        item = pyt_data[start_ind:end_ind]

                        model.train()
                        iterations += 1

                        answer = model(Variable(item[0]))
                        loss = model.criterion(answer, Variable(item[1]))
                        if np.isnan(loss.data[0]):
                            sys.stderr.write("Training batch %d at epoch %d has nan loss\n" %
                                             (batch_ind, epoch))
                            nan = True
                            break

                        epoch_loss += loss
                        loss.backward()
                        model.update()

                    if nan:
                        my_lr /= 2.
                        if try_num + 1 < MAX_TRIES:
                            print("Attempting another try (%d) with learning rate halved to %f" %
                                  (try_num + 1, my_lr))
                        else:
                            print("Every learning rate resulted in NaN. Quitting.")
                        break

                    ## Compute validation loss:
                    valid_batch = pyt_data[end_train_range:][0]
                    valid_answer = model(Variable(valid_batch))
                    valid_loss = model.criterion(valid_answer,
                                                 Variable(pyt_data[end_train_range:][1]))
                    #valid_f1 = f1_score(np.sign(valid_answer.data.numpy()), pyt_data[end_train_range:][1].numpy(), pos_label=-1)
                    #valid_acc = accuracy_score(np.sign(valid_answer.cpu().data.numpy()), valid_y.numpy())
                    if epoch % 10 == 0:
                        print("Epoch %d with training loss %f and validation loss %f" %
                              (epoch, epoch_loss.data[0], valid_loss.data[0]))
                    if nan:
                        break

                if not nan:
                    ## If we got through the epochs without nan then save the model
                    ## and don't take any more tries:
                    break
if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print "MLCOMP_DATASETS_HOME not set; please follow the above instructions"
    sys.exit(0)

# Load the training set
print "Loading 20 newsgroups training set... "
news_train = load_mlcomp('20news-18828', 'train')
print news_train.DESCR
print "%d documents" % len(news_train.filenames)
print "%d categories" % len(news_train.target_names)

print "Extracting features from the dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform((open(f).read() for f in news_train.filenames))
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
assert sp.issparse(X_train)
y_train = news_train.target

print "Loading 20 newsgroups test set... "
news_test = load_mlcomp('20news-18828', 'test')
t0 = time()
print "done in %fs" % (time() - t0)

print "Predicting the labels of the test set..."
print "%d documents" % len(news_test.filenames)
print "%d categories" % len(news_test.target_names)
def main(args):
    epochs = 100
    valid_pct = 0.2
    default_lr = 0.01
    default_c = 1.0
    default_decay = 0.0
    hidden_nodes = 128
    batch_size = 512

    if len(args) < 1:
        sys.stderr.write("One required argument: <train directory>\n")
        sys.exit(-1)

    gpu = False
    if torch.cuda.is_available():
        print("CUDA found so processing will be done on the GPU")
        gpu = True

    examples = []
    labels = []
    sys.stderr.write("Loading data\n")
    with open(join(args[0], 'train.csv')) as csvfile:
        csvreader = csv.reader(csvfile)
        for row in csvreader:
            labels.append(int(row[0]))
            examples.append(row[1])

    sys.stderr.write("Transforming data into feature vectors\n")
    scorer = make_scorer(my_scorer)
    vectorizer = Vectorizer(max_features=10000)
    vectors = vectorizer.fit_transform(examples)
    all_y = np.array(labels).astype('float32')
    ## These labels are 1-2 by definition, relabel to +1/-1 for hinge loss:
    binary_targets = all_y - 1
    class_targets = ((all_y - 1) * 2) - 1

    sys.stderr.write("Rescaling data\n")
    scaler = StandardScaler(with_mean=False)
    scaler.fit(vectors)  ## Doesn't seem to matter
    scaled_vectors = scaler.transform(vectors).astype('float32')

    num_valid_instances = min(1000, int(valid_pct * vectors.shape[0]))
    end_train_range = vectors.shape[0] - num_valid_instances

    ####################################################################
    # Here is the actual svm
    ####################################################################
    # print("Classifying with linear svm:")
    # max_score = max_c = 0
    # params = {'C':[0.001, 0.01, 0.1, 1.0]}
    # svc = svm.LinearSVC()
    # clf = GridSearchCV(svc, params, scoring=scorer)
    # clf.fit(vectors, binary_targets)
    # print("Best SVM performance was with c=%f" % (clf.best_params_['C']))
    # svc = svm.LinearSVC(C=clf.best_params_['C'])
    # svc.fit(scaled_vectors[:end_train_range], binary_targets[:end_train_range])
    # score = svc.score(scaled_vectors[end_train_range:], binary_targets[end_train_range:])
    # print("** SVM score with standard validation set is **%f**" % (score))

    ####################################################################
    # transform the data into pytorch format:
    ####################################################################
    yelp_train_data = YelpPolarityDataset(scaled_vectors[:end_train_range],
                                          class_targets[:end_train_range])
    train_loader = DataLoader(yelp_train_data, shuffle=True, batch_size=batch_size,
                              num_workers=1)
    yelp_valid_data = YelpPolarityDataset(scaled_vectors[end_train_range:],
                                          class_targets[end_train_range:])
    valid_loader = DataLoader(yelp_valid_data)

    ## Get NN performance
    iterations = 0
    my_lr = default_lr
    model = SvmlikeModel(vectors.shape[1], lr=my_lr, c=default_c, decay=default_decay)
    if gpu:
        model.cuda()

    for epoch in range(epochs):
        epoch_loss = 0
        epoch_start = time.time()
        for data in train_loader:
            inputs, labels = data
            if gpu:
                inputs = Variable(inputs.cuda())
                labels = Variable(labels.cuda())
            else:
                inputs, labels = Variable(inputs), Variable(labels)

            model.train()
            answer = model(inputs)
            loss = model.criterion(answer, labels)
            epoch_loss += loss
            loss.backward()
            model.update()

        train_time = time.time() - epoch_start
        #print("Training during epoch %d took %fs\n" % (epoch, train_time))

        valid_acc = 0.0
        valid_loss = 0.0
        for data in valid_loader:
            inputs, labels = data
            if gpu:
                inputs = Variable(inputs.cuda())
                labels = Variable(labels.cuda())
            else:
                inputs, labels = Variable(inputs), Variable(labels)
            valid_answer = model(inputs)
            valid_loss += model.criterion(valid_answer, labels)
            data_proportion = float(inputs.size()[0]) / num_valid_instances
            valid_acc += data_proportion * accuracy_score(
                np.sign(valid_answer.cpu().data.numpy()), labels.cpu().data.numpy())

        prev_valid_acc = valid_acc
        end_time = time.time()
        duration = end_time - epoch_start
        if epoch % 1 == 0:
            print("Epoch %d took %ds with training loss %f and validation loss %f, acc=%f" %
                  (epoch, duration, epoch_loss.data[0], valid_loss.data[0], prev_valid_acc))
def main(tcu_fpath):
    data = tcu_io.load_untreated_csv_to_numpy(tcu_fpath)

    # We only want accepted data
    data = data[data['Situacao'] == 'Aceito e Habilitado']

    # Get invalid lines
    invalids = invalid(data)

    # Transforms descriptions to base strings
    desc_column = data['Descricao']
    des_cmp_column = data['DescricaoComplementar']
    unidade_column = data['UnidadeFornecimento']
    qtd_column = [str(qtd) for qtd in data['Quantidade']]

    as_docs = []
    for as_text in zip(desc_column, des_cmp_column, unidade_column, qtd_column):
        doc = " ".join(as_text)
        as_docs.append(doc)

    # Vectorizes to TF-IDF
    vectorizer = Vectorizer()
    doc_sparse_matrix = vectorizer.fit_transform(as_docs)

    # Run K-Means
    num_clusters = 7
    mbkm = MiniBatchKMeans(num_clusters, init='random')
    mbkm.fit(doc_sparse_matrix)

    # New labels column, replaces both Descricao columns
    labels_column = mbkm.labels_

    # Old columns to keep
    chave_column = data['ChavePregao']
    uasg_column = data['UASG']
    pregoeiro_column = data['PregoeiroOficial']
    aceito_column = data['AceitoPara_CNPJ']
    lance_column = data['PeloMenorLance']
    ref_column = data['ValordeReferencia']
    ganho_column = data['GanhoPregao']

    # And a new column: Superfaturamento (overbilling)
    super_faturamento = np.ndarray(shape=len(labels_column), dtype='S12')
    for i, ganho in enumerate(ganho_column):
        if ganho >= -50:                      # up to 50% over the price is accepted
            super_faturamento[i] = 'OK'
        elif ganho < -50 and ganho > -500:    # more than that is overbilled
            super_faturamento[i] = 'Super'
        elif ganho < -500:                    # more than 5x the value is egregious
            super_faturamento[i] = 'SuperPlus'

    for i in xrange(len(labels_column)):
        if i not in invalids:
            print(labels_column[i], end=',')
            print(chave_column[i], end=',')
            print(uasg_column[i], end=',')
            print(pregoeiro_column[i], end=',')
            print(aceito_column[i], end=',')
            print(lance_column[i], end=',')
            print(ref_column[i], end=',')
            print(ganho_column[i], end=',')
            print(super_faturamento[i])
# -*- coding: utf-8 -*-

from sklearn.feature_extraction.text import CountVectorizer as Vectorizer
from sklearn.naive_bayes import BernoulliNB as Classifier
from sklearn.externals import joblib

from utils import *

vectorizer = Vectorizer(min_df=1)
classifier = Classifier()

males_train = read_lines('./data/male_train.txt')
females_train = read_lines('./data/female_train.txt')

train_set = males_train + females_train
labels = [MALE_LABEL] * len(males_train) + [FEMALE_LABEL] * len(females_train)

train_features = vectorizer.fit_transform(train_set)
classifier.fit(train_features, labels)

# save model
joblib.dump(vectorizer, './data/models/gender/vectorizer.pkl')
joblib.dump(classifier, './data/models/gender/classifier.pkl')

test_set = [
    u'Снегирев Роман',
    u'Ольга Лепорская',
    u'Саша Сидоров',
    u'Арнольд Шварценеггер',
    u'Женя Иванова',
    u'Вера Брежнева',
    u'Алексей Вальков',
    u'Yakov Malinov',
    u'Лука Мудищев',
    u'Пидор Мутный'
]

test_features = vectorizer.transform(test_set)
predictions = classifier.predict(test_features)
print predictions
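# Usage sketch (not from the original source): loading the pickled gender model
# back for prediction, assuming the same file paths as above.  Note that
# sklearn.externals.joblib was removed in newer scikit-learn releases; the
# standalone joblib package provides the same dump/load API.  The test name
# below is illustrative only.
import joblib

vectorizer = joblib.load('./data/models/gender/vectorizer.pkl')
classifier = joblib.load('./data/models/gender/classifier.pkl')
print(classifier.predict(vectorizer.transform([u'Иван Петров'])))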
data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42)
data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42)

categories = data_train.target_names

print "%d documents (training set)" % len(data_train.data)
print "%d documents (testing set)" % len(data_test.data)
print "%d categories" % len(categories)

# split a training set and a test set
y_train, y_test = data_train.target, data_test.target

# Extracting features from the training dataset using a sparse vectorizer
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform(data_train.data)

# Extracting features from the test dataset using the same vectorizer
X_test = vectorizer.transform(data_test.data)

###############################################################################
# Benchmark classifier

# sklearn.naive_bayes.MultinomialNB(alpha=310) gives 84.3%
# LogisticRegression(C=1.9) gives 85.9%
#clf = LogisticRegression(C=1.9, penalty='l2')

# LinearSVC(C=.17) gives 85.9%
from sklearn.svm.sparse import LinearSVC
clf = LinearSVC(C=.17)
    print i, data_train.target_names[i]

# A primary thought on implementing multi-label classifier
# Aborted later due to functions provided by most classifiers
# Method: Transform y to one-else and use loops to learn binary classifiers
# y_0 = y.copy()
# for i in range(len(y_0)):
#     if y_0[i] == 1:
#         y_0[i] = 2

# Extract features
print "Extracting features from Layer 1 training set using a sparse vectorizer..."
t0 = time()
vectorizer = Vectorizer()
X = vectorizer.fit_transform(data_train.data)
print "Done in %0.3fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X.shape
print

# to dense array for logistic regression which does not work on sparse
X_den = X.toarray()

# # Feature selection
# select_chi2 = 1000
# print ("Extracting %d best features by a chi-squared test" % select_chi2)
# t0 = time()
# ch2 = SelectKBest(chi2, k=select_chi2)
# X = ch2.fit_transform(X, y)
# print "Done in %fs" % (time() - t0)
                                shuffle=True, random_state=42)
print 'data loaded'

categories = data_train.target_names    # for case categories == None

print "%d documents (training set)" % len(data_train.data)
print "%d documents (testing set)" % len(data_test.data)
print "%d categories" % len(categories)
print

# split a training set and a test set
y_train, y_test = data_train.target, data_test.target

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform(data_train.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
print

print "Extracting features from the test dataset using the same vectorizer"
t0 = time()
X_test = vectorizer.transform(data_test.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_test.shape
print

if opts.select_chi2:
    print ("Extracting %d best features by a chi-squared test" %
           opts.select_chi2)
# transform the categorical features with one-hot, then concatenate with numeric features
X_cat_feat = pd.get_dummies(trainData[catFeatures]).fillna(0).as_matrix()
test_cat_feat = pd.get_dummies(testData[catFeatures]).fillna(0).as_matrix()
cat_feat_labels = list(pd.get_dummies(trainData[catFeatures]))
print(cat_feat_labels)


# In[ ]:

# transform the text features with SKLearn's TfidfVectorizer (binary mode)
from sklearn.feature_extraction.text import TfidfVectorizer as Vectorizer

X_text_feat = np.ndarray(shape=(len(trainData), 0))
test_text_feat = np.ndarray(shape=(len(testData), 0))
text_feat_labels = []
for column in textColumns:
    vectorizer = Vectorizer(binary=True)
    # fit to the training data, transform the test data
    X_column_feat = vectorizer.fit_transform(trainData[column]).toarray()
    test_column_feat = vectorizer.transform(testData[column]).toarray()
    # append to the text features arrays
    X_text_feat = np.append(X_text_feat, X_column_feat, axis=1)
    test_text_feat = np.append(test_text_feat, test_column_feat, axis=1)
    text_feat_labels += vectorizer.get_feature_names()

print(X_text_feat.shape)
print(len(text_feat_labels))


# In[ ]:

# Build the full feature matrices
X_full = np.append(np.append(X_num_feat, X_cat_feat, axis=1),
                                shuffle=True, random_state=42)
print 'data loaded'

categories = data_train.target_names    # for case categories == None

print "%d documents (training set)" % len(data_train.data)
print "%d documents (testing set)" % len(data_test.data)
print "%d categories" % len(categories)
print

# split a training set and a test set
y_train, y_test = data_train.target, data_test.target

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
X_train = vectorizer.fit_transform(data_train.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
print

print "Extracting features from the test dataset using the same vectorizer"
t0 = time()
X_test = vectorizer.transform(data_test.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_test.shape
print

if opts.select_chi2:
    print ("Extracting %d best features by a chi-squared test" %
           opts.select_chi2)
                                random_state=42)
print 'data loaded'

categories = data_train.target_names    # for case categories == None

print "%d documents (training set)" % len(data_train.data)
print "%d documents (testing set)" % len(data_test.data)
print "%d categories" % len(categories)
print

# split a training set and a test set
y_train, y_test = data_train.target, data_test.target

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform(data_train.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
print

print "Extracting features from the test dataset using the same vectorizer"
t0 = time()
X_test = vectorizer.transform(data_test.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_test.shape
print

if opts.select_chi2:
    print("Extracting %d best features by a chi-squared test" %
          opts.select_chi2)
                                random_state=42)
print 'data loaded'

categories = data_train.target_names    # for case categories == None

print "%d documents (training set)" % len(data_train.data)
print "%d documents (testing set)" % len(data_test.data)
print "%d categories" % len(categories)
print

# split a training set and a test set
y_train, y_test = data_train.target, data_test.target

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
X_train = vectorizer.fit_transform(data_train.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
print

print "Extracting features from the test dataset using the same vectorizer"
t0 = time()
X_test = vectorizer.transform(data_test.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_test.shape
print

if opts.select_chi2:
    print("Extracting %d best features by a chi-squared test" %
          opts.select_chi2)
from sklearn import metrics


# The training data folder must be passed as first argument
languages_data_folder = sys.argv[1]
dataset = load_files(languages_data_folder)

# Split the dataset in training and test set:
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_fraction=0.5)

# TASK: Build a vectorizer that splits strings into sequences of 1 to 3
# consecutive characters (1-grams, 2-grams and 3-grams of characters)
# with IDF weights disabled (normalized term frequencies only)
vectorizer = Vectorizer(analyzer='char', min_n=1, max_n=3, use_idf=False)

# TASK: Chain the vectorizer with a linear classifier into a Pipeline
# instance. Its variable should be named `pipeline`.
pipeline = Pipeline([
    ('vec', vectorizer),
    ('clf', Perceptron()),
])

# TASK: Fit the pipeline on the training set
pipeline.fit(docs_train, y_train)

# TASK: Predict the outcome on the testing set in a variable named y_predicted
y_predicted = pipeline.predict(docs_test)

# Print the classification report
print 'Data loaded!'
print

# Split datasets
y_L1 = data_train.target
y_L2_ca = ca_train.target
y_L2_collect = collect_train.target
y_L2_cookies = cookies_train.target
y_L2_share = share_train.target

# Extract features
print "Extracting features from Layer 1 training set using a sparse vectorizer..."
t0 = time()
vectorizer = Vectorizer()
X_L1 = vectorizer.fit_transform(data_train.data)
print "Done in %0.3fs" % (time() - t0)
print "L1: n_samples: %d, n_features: %d" % X_L1.shape
print

print "Extracting features from Layer 2 training sets using the same vectorizer..."
t0 = time()
X_L2_ca = vectorizer.transform(ca_train.data)
X_L2_collect = vectorizer.transform(collect_train.data)
X_L2_cookies = vectorizer.transform(cookies_train.data)
X_L2_share = vectorizer.transform(share_train.data)
print "Done in %0.3fs" % (time() - t0)
print "CA: n_samples: %d, n_features: %d" % X_L2_ca.shape
print "Collect: n_samples: %d, n_features: %d" % X_L2_collect.shape
print "Cookies: n_samples: %d, n_features: %d" % X_L2_cookies.shape
    # 'svc': [{'probability': True}],
}

# split a training set and a test set
iter = ShuffleSplit(num_posts, n_iterations=1, test_fraction=0.15, indices=False)

for (iter_no, (train_index, test_index)) in enumerate(iter):
    print 'Iteration no. %d' % (iter_no + 1)

    y_train = np.array([x for (x, y) in zip(all_data['target'], train_index) if y])
    y_test = np.array([x for (x, y) in zip(all_data['target'], test_index) if y])
    print 'Sampled %d training and %d test posts' % (len(y_train), len(y_test))

    print "Extracting features from the training dataset using a sparse vectorizer"
    t0 = time()
    title_vectorizer = Vectorizer(
        analyzer=WordNGramAnalyzer(
            charset='utf-8',
            stop_words=set(['a', 'an', 'and', 'in', 'is', 'of', 'on', 'the', 'to']),
        )
    )
    title_train = title_vectorizer.fit_transform(
        [x for (x, y) in zip(all_data['title'], train_index) if y])

    domain_vectorizer = extract.SimpleVectorizer()
    domain_train = domain_vectorizer.fit_transform(
        [x for (x, y) in zip(all_data['domain'], train_index) if y])

    X_train = title_train
    print "done in %fs" % (time() - t0)
    print "n_samples: %d, n_features: %d" % X_train.shape
    print

    print "Extracting features from the test dataset using the same vectorizer"
    t0 = time()
    title_test = title_vectorizer.transform(
        [x for (x, y) in zip(all_data['title'], test_index) if y])
    domain_test = domain_vectorizer.transform(
        [x for (x, y) in zip(all_data['domain'], test_index) if y])
print 'data loaded'

documents = data_train.data + data_test.data
target_names = set(data_train.target_names + data_test.target_names)

print "%d documents" % len(documents)
print "%d categories" % len(target_names)
print

# split a training set and a test set
labels = np.concatenate((data_train.target, data_test.target))
true_k = np.unique(labels).shape[0]

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_features=10000)
X = vectorizer.fit_transform(documents)
X = Normalizer(norm="l2", copy=False).transform(X)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X.shape
print

###############################################################################
# Now sparse MiniBatchKmeans

mbkm = MiniBatchKMeans(init="random", k=true_k, max_iter=10, random_state=13,
                       chunk_size=1000)
print "Clustering sparse data with %s" % str(mbkm)
t0 = time()
                        shuffle=True, random_state=42)
print 'Data loaded!'
print

# Split datasets
y_L1 = data_train.target
y_L2_ca = ca_train.target
y_L2_collect = collect_train.target
y_L2_cookies = cookies_train.target
y_L2_share = share_train.target

# Extract features
print "Extracting features from Layer 1 training set using a sparse vectorizer..."
t0 = time()
vectorizer = Vectorizer()
X_L1 = vectorizer.fit_transform(data_train.data)
print "Done in %0.3fs" % (time() - t0)
print "L1: n_samples: %d, n_features: %d" % X_L1.shape
print

print "Extracting features from Layer 2 training sets using the same vectorizer..."
t0 = time()
X_L2_ca = vectorizer.transform(ca_train.data)
X_L2_collect = vectorizer.transform(collect_train.data)
X_L2_cookies = vectorizer.transform(cookies_train.data)
X_L2_share = vectorizer.transform(share_train.data)
print "Done in %0.3fs" % (time() - t0)
print "CA: n_samples: %d, n_features: %d" % X_L2_ca.shape
print "Collect: n_samples: %d, n_features: %d" % X_L2_collect.shape
print "Cookies: n_samples: %d, n_features: %d" % X_L2_cookies.shape
dataset = fetch_20newsgroups(subset='all', categories=categories,
                             shuffle=True, random_state=42)

print "%d documents" % len(dataset.data)
print "%d categories" % len(dataset.target_names)
print

labels = dataset.target
true_k = np.unique(labels).shape[0]

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_df=0.95, max_features=10000)
X = vectorizer.fit_transform(dataset.data)

print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X.shape
print

###############################################################################
# Do the actual clustering

if opts.minibatch:
    km = MiniBatchKMeans(k=true_k, init='k-means++', n_init=1,
                         init_size=1000, batch_size=1000,
def main(args):
    scorer = make_scorer(my_scorer)
    vectorizer = Vectorizer()
    epochs = 100
    valid_pct = 0.2
    default_lr = 0.1
    default_c = 0.1
    default_decay = 0.0
    hidden_nodes = 128
    batch_size = 64

    if torch.cuda.is_available():
        print("CUDA found so processing will be done on the GPU")

    for cat1ind in range(len(cats) - 1):
        cat1 = cats[cat1ind]
        for cat2ind in range(cat1ind + 1, len(cats)):
            cat2 = cats[cat2ind]
            subcats = [cat1, cat2]
            newsgroups_train = fetch_20newsgroups(subset='train',
                                                  remove=('headers', 'footers', 'quotes'),
                                                  categories=subcats)
            vectors = vectorizer.fit_transform(newsgroups_train.data)
            #vectors = vectors.toarray()
            scaler = StandardScaler(with_mean=False)
            scaler.fit(vectors)  ## Doesn't seem to matter
            scaled_vectors = scaler.transform(vectors)

            ## Target vectors for the SVM:
            binary_targets = newsgroups_train.target
            ## Put targets in the range -1 to 1 instead of 0/1 for the nn hinge loss
            class_targets = newsgroups_train.target * 2 - 1

            train_X_tensor = Tensor(scaled_vectors.toarray())
            train_y_tensor = Tensor(class_targets)
            if torch.cuda.is_available():
                pyt_data = TensorDataset(train_X_tensor.cuda(), train_y_tensor.cuda())
            else:
                pyt_data = TensorDataset(train_X_tensor, train_y_tensor)

            print("Classifying %s vs. %s" % (cat1, cat2))
            end_train_range = int((1 - valid_pct) * vectors.shape[0])

            ## Get NN performance
            iterations = 0
            train_X = pyt_data[:end_train_range][0]
            train_y = train_y_tensor[:end_train_range]
            valid_X = pyt_data[end_train_range:][0]
            valid_y = train_y_tensor[end_train_range:]
            my_lr = default_lr

            ####################################################################
            # Here is the actual svm
            ####################################################################
            print("Classifying with linear svm:")
            max_score = max_c = 0
            params = {'C': [0.01, 0.1, 1.0, 10.0, 100]}
            svc = svm.LinearSVC()
            clf = GridSearchCV(svc, params, scoring=scorer)
            clf.fit(vectors, binary_targets)
            print("Best SVM performance was with c=%f" % (clf.best_params_['C']))

            svc = svm.LinearSVC(C=clf.best_params_['C'])
            svc.fit(scaled_vectors[:end_train_range], binary_targets[:end_train_range])
            score = svc.score(scaled_vectors[end_train_range:], binary_targets[end_train_range:])
            print("** SVM score with standard validation set is **%f**" % (score))

            ####################################################################
            # Here are the different NN models
            ####################################################################
            for model_ind in (SVM_LIKE,):
                if model_ind == SVM_LIKE:
                    print("Classifying with svm-like (no hidden layer) neural network:")
                elif model_ind == ONE_LAYER:
                    print("Classifying with one hidden layer neural network with %d hidden nodes"
                          % (hidden_nodes))
                elif model_ind == SVM_INIT:
                    print("Classifying with one hidden layer with %d hidden nodes initialized by previous system"
                          % (hidden_nodes))
                elif model_ind == SVM_REG:
                    print("Classifying with one hidden layer with %d hidden nodes initialized and regularized by svm-like system."
                          % (hidden_nodes))

                for try_num in range(5):
                    init_reg = False
                    weight_reg = L2VectorLoss()
                    if model_ind == SVM_LIKE:
                        model = SvmlikeModel(vectors.shape[1], lr=my_lr, c=default_c, decay=default_decay)
                        svmlike_model = model
                    elif model_ind == ONE_LAYER:
                        model = ExtendedModel(vectors.shape[1], hidden_nodes, lr=my_lr)
                    elif model_ind == SVM_INIT:
                        model = ExtendedModel(vectors.shape[1], hidden_nodes, lr=my_lr, init=saved_weights)
                    else:
                        model = ExtendedModel(vectors.shape[1], hidden_nodes, lr=my_lr, init=saved_weights)
                        init_reg = True
                        init_weight_reg = L2VectorLoss()

                    ## Move the model to the GPU:
                    if torch.cuda.is_available():
                        model.cuda()

                    valid_answer = model(Variable(valid_X)).cpu()
                    valid_acc = prev_valid_acc = accuracy_score(np.sign(valid_answer.data.numpy()),
                                                                valid_y.numpy())

                    nan = False
                    for epoch in range(epochs):
                        #print("Epoch %d" % (epoch))
                        # single instance at a time:
                        nan = False
                        epoch_loss = 0

                        ## Shuffle data:
                        shuffle = torch.randperm(end_train_range)
                        train_X = train_X[shuffle]
                        train_y = train_y[shuffle]

                        for batch_ind in range(end_train_range // batch_size):
                            start_ind = batch_ind * batch_size
                            end_ind = min(start_ind + batch_size, end_train_range)
                            item = (train_X[start_ind:end_ind], train_y[start_ind:end_ind])

                            model.train()
                            iterations += 1

                            answer = model(Variable(item[0]))
                            if epoch == 0 and model_ind == SVM_INIT:
                                svm_answer = svmlike_model(Variable(item[0]))

                            loss = model.criterion(answer[:, 0], Variable(item[1]))
                            if np.isnan(loss.data[0]):
                                sys.stderr.write("Training batch %d at epoch %d has nan loss\n" %
                                                 (batch_ind, epoch))
                                nan = True
                                break

                            epoch_loss += loss
                            loss.backward()

                            ## If we're on the model that regularizes towards the initial
                            ## conditions then run that loss function:
                            if init_reg and epoch > 0:
                                init_weight_reg_loss = init_weight_reg(model.fc1.weight[0, :], saved_weights)
                                init_weight_reg_loss.backward()

                            ## Always do L2 weight regularization:
                            #weight_reg_loss = weight_reg(model.fc1.weight, torch.zeros(model.fc1.weight.size()))
                            #weight_reg_loss.backward()

                            model.update()

                        #print("Epoch %d with loss %f and cumulative loss %f" % (epoch, loss.data[0], epoch_loss.data[0]))
                        if nan:
                            break

                        valid_batch = pyt_data[end_train_range:][0]
                        valid_answer = model(Variable(valid_batch))[:, 0]
                        valid_loss = model.criterion(valid_answer, Variable(pyt_data[end_train_range:][1]))
                        #valid_f1 = f1_score(np.sign(valid_answer.data.numpy()), pyt_data[end_train_range:][1].numpy(), pos_label=-1)
                        valid_acc = accuracy_score(np.sign(valid_answer.cpu().data.numpy()), valid_y.numpy())
                        if epoch % 10 == 0:
                            print("Epoch %d with training loss %f and validation loss %f, acc=%f" %
                                  (epoch, epoch_loss.data[0], valid_loss.data[0], prev_valid_acc))
                        prev_valid_acc = valid_acc

                    if not nan:
                        print("** Finished with validation accuracy **%f**" % (valid_acc))
                        if model_ind == 0:
                            saved_weights = Variable(model.fc1.weight[0, :].data)
                        break
                    elif try_num + 1 < 5:
                        my_lr /= 2.
                        print("Attempting another try (%d) with learning rate halved to %f" %
                              (try_num + 1, my_lr))
                    else:
                        print("Ran out of tries, giving up on this classification task.")
categories = ['Advertising', 'CA', 'Collect', 'Cookies', 'Security', 'Share',
              'SafeHarbor', 'Truste', 'Change', 'Location', 'Children',
              'Contact', 'Process', 'Retention']

# Load data
print "Loading privacy policy dataset for categories:"
print categories if categories else "all"
data_set = load_files('Privacypolicy_balance/raw', categories=categories,
                      shuffle=True, random_state=42)
print 'data loaded'
print

# Extract features
print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_features=10000)
X = vectorizer.fit_transform(data_set.data)
X = Normalizer(norm="l2", copy=False).transform(X)
y = data_set.target

# feature selection
ch2 = SelectKBest(chi2, k=1800)
X = ch2.fit_transform(X, y)
X = X.toarray()

n_samples, n_features = X.shape
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % (n_samples, n_features)
print
# The multiprocessing module is used as the backend of joblib.Parallel
# that is used when n_jobs != 1 in GridSearchCV

# the training data folder must be passed as first argument
movie_reviews_data_folder = sys.argv[1]
dataset = load_files(movie_reviews_data_folder, shuffle=False)
print "n_samples: %d" % len(dataset.data)

# split the dataset in training and test set:
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_fraction=0.25, random_state=None)

# TASK: Build a vectorizer / classifier pipeline using the previous analyzer
pipeline = Pipeline([
    ('vect', Vectorizer(max_features=100000, max_df=0.9)),
    ('clf', LinearSVC()),
])

# TASK: Define a parameters grid for searching whether extracting bi-grams
# is suited for this task, and which value of C in 1000 or 10000 is the
# best for LinearSVC on this dataset.
parameters = {
    'vect__max_n': (1, 2),
    'clf__C': (1000, 10000),
}

# TASK: Build a grid search to find out whether unigrams or bigrams are
# more useful.
# Fit the pipeline on the training set using grid search for the parameters
# To make this run faster, fit it only on the top first 200 documents of