import logging
from time import time

from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")


def nnalgo(X):
    """Fit a 5-nearest-neighbours model on X and return, for every row,
    the distances to and the indices of its 5 nearest neighbours."""
    nbrs = NearestNeighbors(n_neighbors=5, algorithm="auto").fit(X)
    distances, indices = nbrs.kneighbors(X)
    return distances, indices
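
# Example usage (hypothetical): given a TF-IDF document-term matrix X,
#   distances, indices = nnalgo(X)
# indices[i] lists the 5 documents closest to document i (each point is its own
# nearest neighbour, so indices[i][0] == i), and distances[i] the matching distances.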


print("Loading Data from database")

from database import getdataset

sql = "SELECT * FROM news ORDER BY RAND()"
dataset = getdataset(sql)


print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))
print()

print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()

# Maximum number of features (vocabulary size) kept by the vectorizer.
n_features = 10000
use_idf = True

# Drop terms appearing in more than 60% of documents, plus English stop words.
vectorizer = TfidfVectorizer(max_df=0.6, max_features=n_features,
                             stop_words="english", use_idf=use_idf)
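
# A minimal sketch of the extraction step this block sets up (hypothetical,
# not in the original; the actual call may happen later in the script):
#
#   X = vectorizer.fit_transform(dataset.data)
#   print("done in %fs" % (time() - t0))
#   print("n_samples: %d, n_features: %d" % X.shape)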
categories = dataset.target_names
print(categories)
# Alternative loader kept for reference: read 20 newsgroups style data from disk
# instead of the database.
'''
dataset = fetch_20newsgroups2(data_home=r'C:\ml_datasets\classification', subset='train',
                              categories=categories, shuffle=True, random_state=42)

dataset_test = fetch_20newsgroups2(data_home='C:/ml_datasets/classification/test', subset='train',
                                   categories=categories, shuffle=True, random_state=42)
'''


sql="SELECT * from news where subset=0  order by RAND()"
#sql="SELECT * from news order by RAND()" 
dataset_train=getdataset(sql)

#sqlsub="SELECT id from news where subset=0  order by RAND() LIMIT 40 "

sql2="SELECT * from news where subset=1 order by RAND()"

dataset_test=getdataset(sql2)
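
# Quick sanity check on the split (assumes getdataset() returns objects with a
# .data list, as used above for the full corpus).
print("%d training / %d test documents" % (len(dataset_train.data), len(dataset_test.data)))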



from stemming import stems

# Stem both splits in place (stems() is assumed to modify dataset.data directly,
# since its return value is not used).
stems(dataset_train)
stems(dataset_test)
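
# For reference, a minimal sketch of what such a stems() helper might look like
# (hypothetical; the real implementation lives in stemming.py):
#
#   from nltk.stem.snowball import SnowballStemmer
#   _stemmer = SnowballStemmer("english")
#
#   def stems(dataset):
#       dataset.data = [" ".join(_stemmer.stem(w) for w in doc.split())
#                       for doc in dataset.data]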