import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import read_dataset as rd
from text_processor import Preprocessor
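# Assumption (local modules not shown here): read_dataset is expected to return the
# train/test splits plus helpers to pull out the raw documents and the fake/real
# labels; text_processor.Preprocessor applies the cleaning options passed below.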

# Read text
train, test = rd.read()

train_text = rd.get_text(train)
y = rd.get_target(train)

# Process text
processor = Preprocessor()
train_text = processor.process_dataset(train_text,
                                       n_gram=0,
                                       stem=False,
                                       tags=False,
                                       remove_stop_words=True,
                                       remove_punct=True,
                                       pos=False,
                                       dep=False,
                                       alpha=False,
                                       ent=True,
                                       vectorizer=None)
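
# Hedged sketch (not part of the original fragment): the imports above (TfidfVectorizer,
# ExtraTreesClassifier, matplotlib) suggest the cleaned text is vectorised and fed to a
# tree ensemble whose feature importances are then inspected. Assuming process_dataset
# returns one string per document, a minimal version could be:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(train_text)

forest = ExtraTreesClassifier(n_estimators=100, random_state=0)
forest.fit(X, y)

# Plot the 20 highest-ranked features (the cut-off is arbitrary, for illustration only).
importances = forest.feature_importances_
top = np.argsort(importances)[::-1][:20]
feature_names = np.asarray(vectorizer.get_feature_names_out())  # get_feature_names() on older sklearn
plt.bar(range(len(top)), importances[top])
plt.xticks(range(len(top)), feature_names[top], rotation=90)
plt.tight_layout()
plt.show()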

  # Append the macro-averaged scores (mean +/- two standard deviations) to the results file.
  file.write('Recall Macro: ' + str(recall_mean) + ' (+/-) ' + str(recall_std * 2) + '\n')
  file.write('Precision Macro: ' + str(precision_mean) + ' (+/-) ' + str(precision_std * 2) + '\n')
  file.write('F1 Macro: ' + str(f1_mean) + ' (+/-) ' + str(f1_std * 2) + '\n')
  file.write('Accuracy: ' + str(accuracy_mean) + ' (+/-) ' + str(accuracy_std * 2) + '\n')

  file.write('\n\n#############################################\n\n')
  file.close()



## READING DATASET      ######################
train, test = rd.read()
categories = ['fake', 'real']

train_text = rd.get_text(train)
train_target = rd.get_target(train)

# test_text = rd.get_text(test)
# test_target = rd.get_target(test)
#################################################

combinations = get_combinations()
# combinations = use_custom()
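# Assumed shape (illustration only): each combination is expected to be a dict of
# preprocessing flags consumed below, e.g.
# {'stem': True, 'remove_stop_words': True, 'remove_punct': False, 'n_gram': 1}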


for combination in combinations:
  analisar_features(train_text,
                    stem=combination['stem'],
                    remove_stop_words=combination['remove_stop_words'],
                    remove_punct=combination['remove_punct'],
                    n_gram=combination['n_gram'])

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
import numpy as np
from sklearn.datasets import fetch_20newsgroups

import read_dataset as rd

## READING DATASET      ######################
train, test = rd.read(percent_train=.5)
categories = ['fake', 'real']

train_text = rd.get_text(train)

train_target = rd.get_target(train)

test_text = rd.get_text(test)

test_target = rd.get_target(test)
#################################################

##            TRAINING NAIVE BAYES           ##

print('Training Naive Bayes model...')
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
text_clf.fit(train_text, train_target)
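
# Hedged continuation (not in the original fragment): evaluate the fitted pipeline on
# the held-out split loaded above; `categories` is assumed to match the label order.
predicted = text_clf.predict(test_text)
print('Naive Bayes accuracy:', metrics.accuracy_score(test_target, predicted))
print(metrics.classification_report(test_target, predicted, target_names=categories))

# The unused SGDClassifier import suggests a linear baseline was also intended; a
# minimal, assumed version of that pipeline:
sgd_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42)),
])
sgd_clf.fit(train_text, train_target)
print('SGD accuracy:', metrics.accuracy_score(test_target, sgd_clf.predict(test_text)))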