def print_classification(estimator, reduction=False):
    """Cross-validate *estimator* and print a classification report per fold.

    Builds a pipeline via the module-level ``create_pipeline`` (optionally
    with dimensionality reduction), fits it on each train split produced by
    ``CorpusLoader``, and prints sklearn's report for the test split.

    NOTE(review): relies on module-level ``reader`` and ``labels`` defined
    elsewhere in this file.
    """
    from sklearn.metrics import classification_report

    folds = CorpusLoader(reader, 2, shuffle=True, categories=labels)
    pipeline = create_pipeline(estimator, reduction)
    for X_train, X_test, y_train, y_test in folds:
        pipeline.fit(X_train, y_train)
        predictions = pipeline.predict(X_test)
        print(classification_report(y_test, predictions, labels=labels))
# NOTE(review): word2vec training script collapsed onto one physical line and
# truncated mid-call — the config dict passed to Word2VecController is never
# closed in the visible portion, so the code is kept verbatim (only the
# Chinese comments are translated to English). It reads every file under
# ./corpus, samples 10k examples (8k train / 2k test) via CorpusLoader, and
# sizes the Word2VecModel vocabulary from the training matrix width.
import os from typing import List from loader import CorpusLoader from model import Word2VecModel from controller import Word2VecController # Read the corpus from files corpus_dir = "corpus" corpus: List[str] = [] for path in os.listdir(corpus_dir): with open(os.path.join(corpus_dir, path), "r") as f: corpus.append(f.read()) # Sample training and test data loader = CorpusLoader(corpus) n_train = 8000 X, Y = loader.sample_data(n=10000) x_train, y_train = X[:n_train], Y[:n_train] x_test, y_test = X[n_train:], Y[n_train:] model = Word2VecModel({ "vocabulary_size": x_train.shape[1], "hidden_layers": [32], }) controller = Word2VecController( model, { "epochs": 1000, "batch_size": 1024, "shuffle_buffer_size": 100000, "save_path": "./kite_word2vec.h5"
from reader import TweetsCorpusReader
from loader import CorpusLoader
from build import models
from build import score_models
import nltk
import os
import json
import logging
import re

# Silence the noisy readability package logger.
log = logging.getLogger("readability.readability")
log.setLevel('WARNING')

# NOTE(review): machine-specific absolute path — consider moving to config/env.
ROOT = 'C:\\Users\\niti.mishra\\Documents\\Personal\\cyberbullying\\'
CORPUS = os.path.join(ROOT, 'data\\labelled_tweets')
RESULTS = os.path.join(ROOT, 'results')
DOC_PATTERN = r'.*\.json'

if __name__ == '__main__':
    # Score every candidate model over 12 folds of the labelled tweets,
    # using the 'bullying_trace' field as the target label.
    corpus = TweetsCorpusReader(CORPUS, DOC_PATTERN)
    loader = CorpusLoader(corpus, 12, label='bullying_trace')
    for scores in score_models(models, loader):
        # Fixed: build the results path with os.path.join (consistent with
        # CORPUS/RESULTS above) instead of string concatenation.
        # Append one JSON record per model so partial runs are preserved.
        with open(os.path.join(RESULTS, 'results.json'), 'a') as f:
            f.write(json.dumps(scores) + "\n")
# NOTE(review): fragment collapsed onto one physical line and truncated at
# both ends — the opening `table.append(row)` is the tail of a score-tabulation
# routine whose header is outside this view, and the final
# `for scores in score_models(...):` loop has no visible body. Kept verbatim
# apart from two comment typo fixes. The visible __main__ section builds
# LogisticRegression/SGDClassifier pipelines (with and without reduction) plus
# MultinomialNB and GaussianNB, over 5 shuffled folds of two event categories.
table.append(row) table.sort(key=lambda r: r[-1], reverse=True) print(tabulate.tabulate(table, headers=fields)) if __name__ == '__main__': results_file = "results.json" labels = [ "artistic_event", "other_event", ] # Initializing corpus reader and loader (generates K-Folds) reader = PickledCorpusReader('./pickle_corpus') loader = CorpusLoader(reader, 5, shuffle=True, categories=labels) txt = TextNormalizer() txt.lemmatize("qu'", "") # Initializing models models = [] for form in (LogisticRegression, SGDClassifier): models.append(create_pipeline(form(), True)) models.append(create_pipeline(form(), False)) models.append(create_pipeline(MultinomialNB(), False)) models.append(create_pipeline(GaussianNB(), True)) # Running all models for scores in score_models(models, loader):
# NOTE(review): model-comparison script collapsed onto one physical line and
# apparently truncated — the inner fold loop ends right after `model.predict`,
# and the `scores`/`fold`/`scores_table` accumulators set up earlier are never
# filled in the visible portion. Kept verbatim.
# create_pipeline(estimator): wraps TextNormalizer -> TfidfVectorizer ->
# classifier in a sklearn Pipeline; the script then cross-validates
# LogisticRegression and MultinomialNB pipelines over 10 shuffled folds.
def create_pipeline(estimator): steps = [ ('normalize', TextNormalizer()), ('vectorize', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)), ('classifier', estimator) ] return Pipeline(steps) models = [] for estimator in (LogisticRegression, MultinomialNB): models.append(create_pipeline(estimator())) # get data reader = CorpusReader() loader = CorpusLoader(reader, folds=10, shuffle=True) scores_table = [] scores_table_fields = ['model', 'precision', 'recall', 'accuracy', 'f1'] for model in models: scores = defaultdict(list) fold = 1 model_name = model.named_steps['classifier'].__class__.__name__ print('Model: {}'.format(model_name)) for X_train, X_test, y_train, y_test in loader: model.fit(X_train, y_train) y_pred = model.predict(X_test)
# NOTE(review): fragment of a gensim-dictionary transformer class — the
# opening `self.id2word.save(self.path)` is the tail of a save() method whose
# header is outside this view, so the code is kept verbatim.
# fit(): builds a gensim Dictionary from the tokenized documents and persists
# it via save(). transform(): yields one dense bag-of-words vector per
# document (doc2bow converted by sparse2full to dictionary length).
# The __main__ demo chains TextNormalizer -> GensimVectorizer over the first
# test fold and prints the first transformed vector.
self.id2word.save(self.path) def fit(self, documents, labels=None): self.id2word = gensim.corpora.Dictionary(documents) self.save() def transform(self, documents): for document in documents: docvec = self.id2word.doc2bow(document) yield sparse2full(docvec, len(self.id2word)) if __name__ == '__main__': from loader import CorpusLoader from reader import PickledCorpusReader corpus = PickledCorpusReader('../corpus') loader = CorpusLoader(corpus, 12) docs = loader.documents(0, test=True) labels = loader.labels(0, test=True) # print(next(docs)[0][0][0]) normal = TextNormalizer() normal.fit(docs, labels) docs = list(normal.transform(docs)) vect = GensimVectorizer('lexicon.pkl') vect.fit(docs) docs = vect.transform(docs) print(next(docs))
# NOTE(review): fragment of a binary-classification experiment — it opens
# mid-block (the enclosing `if __name__ == '__main__':` guard and the
# CORPUS/RESULTS/target definitions are outside this view) and trails off
# into commented-out multi-class code; kept verbatim. The live portion scores
# `binary_models` on corpus slices growing by 500 documents and appends each
# score record to TRACE_results<i>.json under RESULTS.
corpus = TweetsCorpusReader(CORPUS.__str__(), DOC_PATTERN, bullying_trace=target) # processed_tweets = corpus.process_tweet() # from transformers import TextNormalizer # normalize = TextNormalizer(lemma=True) # normalized_tweets = list(normalize.fit_transform(processed_tweets)) # print(len(normalized_tweets)) # # X = [' '.join(doc) for doc in normalized_tweets] # y = list(corpus.fields(fields=target)) # perform classification with increasing training set size # idx = (np.linspace(1, 1.0, 1)*len(corpus.docs())).astype(int) idx = [i for i in range(500, len(corpus.docs()), 500)] for i in idx: loader = CorpusLoader(corpus, 6, label=target, size=i) for scores in score_models(binary_models, loader): print(scores) result_filename = 'TRACE_results' + str(i) + '.json' with open(Path.joinpath(RESULTS, result_filename), 'a') as f: f.write(json.dumps(scores) + '\n') # ########################################################################## # ## multi-classification # ########################################################################## # target = 'bullying_role' # if __name__ == '__main__': # corpus = TweetsCorpusReader(CORPUS.__str__(), DOC_PATTERN, bullying_trace=target) # idx = (np.linspace(1, 1.0, 1)*len(corpus.docs())).astype(int) # # idx = [i for i in range(100, len(corpus.docs()), 100) ]
# NOTE(review): fragment of a tabular model-training script — the first
# visible tokens `X[i].isnull(), 1.0, 0.0)` are the tail of an expression
# (presumably np.where creating a missing-value dummy column — TODO confirm
# upstream) whose start is outside this view; kept verbatim. The visible
# portion fills missing values with column means (persisted to
# column_means.pkl for reuse at prediction time), flattens X to row lists,
# chooses a predefined validation split or CV for the CorpusLoader, and
# appends each model's scores to results.json under RESULTS.
X[i].isnull(), 1.0, 0.0) # create new dummy column, 1=missing in original X[i] = X[i].fillna(mu) # fill missing with mean with open(Path.joinpath(RESULTS, 'column_means.pkl'), 'wb') as f: pickle.dump(col_means, f) # save column means for use in prediction.py records = X.to_dict('records') final_cols = list(records[0].keys()) X = [list(i.values()) for i in records] y = list(data.fields(data_info['target'])) # will label encode in build.py # 3. determine how to split test, train and validation set if data_info.get('validation'): split_idx = True split_set = [str(i) for i in list(data.fields('Validation'))] loader = CorpusLoader(X, y, idx=split_set) # using predefined split else: split_idx = False loader = CorpusLoader(X, y, idx=None) # using cv # 4. train models and save models and its scores for scores in score_models(binary_models, loader, split_idx=split_idx, k=5, features=final_cols, outpath=RESULTS): print(scores) result_filename = 'results.json' with open(Path.joinpath(RESULTS, result_filename), 'a') as f: f.write(json.dumps(scores) + '\n')