import logging
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

from database import getdataset  # local helper module

logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s %(levelname)s %(message)s")


def nnalgo(X):
    # Fit a 5-nearest-neighbor index on X, then query it with X itself,
    # returning each sample's neighbor distances and indices.
    nbrs = NearestNeighbors(n_neighbors=5, algorithm="auto").fit(X)
    distances, indices = nbrs.kneighbors(X)
    return distances, indices


print("Loading data from database")
sql = "SELECT * FROM news ORDER BY RAND()"
dataset = getdataset(sql)

print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))
print()

print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()
n_features = 10000  # maximum number of features kept by the vectorizer
use_idf = True
vectorizer = TfidfVectorizer(max_df=0.6, max_features=n_features,
                             stop_words="english", use_idf=use_idf)
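# --- Sketch (not part of the original script): wiring the pieces together. ---
# vectorizer and nnalgo() are defined above but never invoked here; the lines
# below show one plausible way to combine them, assuming dataset.data holds
# raw document strings.
X = vectorizer.fit_transform(dataset.data)
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)

# Query the 5 nearest neighbors of every document in TF-IDF space;
# NearestNeighbors accepts the sparse matrix directly.
distances, indices = nnalgo(X)
print("nearest neighbors of document 0:", indices[0])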
print(categories)

# The 20 newsgroups data was previously loaded from disk:
# dataset = fetch_20newsgroups2(data_home=r'C:\ml_datasets\classification',
#                               subset='train', categories=categories,
#                               shuffle=True, random_state=42)
# dataset_test = fetch_20newsgroups2(data_home='C:/ml_datasets/classification/test',
#                                    subset='train', categories=categories,
#                                    shuffle=True, random_state=42)

from database import getdataset

# subset=0 marks training documents, subset=1 test documents;
# ORDER BY RAND() shuffles the rows on the database side.
sql = "SELECT * FROM news WHERE subset=0 ORDER BY RAND()"
# sql = "SELECT * FROM news ORDER BY RAND()"
dataset_train = getdataset(sql)

# sqlsub = "SELECT id FROM news WHERE subset=0 ORDER BY RAND() LIMIT 40"
sql2 = "SELECT * FROM news WHERE subset=1 ORDER BY RAND()"
dataset_test = getdataset(sql2)

from stemming import stems  # wildcard import narrowed to the one name used

# Stem both splits before feature extraction.
stems(dataset_train)
stems(dataset_test)
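# --- Sketch (not part of the original script). ---
# The local stemming module is not shown; a hypothetical stems() helper might
# rewrite each document in place with an NLTK stemmer, e.g.:
#
#     from nltk.stem import PorterStemmer
#     stemmer = PorterStemmer()
#
#     def stems(dataset):
#         dataset.data = [" ".join(stemmer.stem(w) for w in doc.split())
#                         for doc in dataset.data]
#
# A typical next step is to fit the vectorizer on the stemmed training split
# and reuse its fitted vocabulary on the test split:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=0.6, max_features=10000,
                             stop_words="english", use_idf=True)
X_train = vectorizer.fit_transform(dataset_train.data)
X_test = vectorizer.transform(dataset_test.data)
print("train: %d x %d, test: %d x %d" % (X_train.shape + X_test.shape))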