Example #1
def print_classification(estimator, reduction=False):
    from sklearn.metrics import classification_report

    # 2-fold cross-validation over the labelled corpus
    loader = CorpusLoader(reader, 2, shuffle=True, categories=labels)
    model = create_pipeline(estimator, reduction)
    for X_train, X_test, y_train, y_test in loader:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred, labels=labels))
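A hypothetical call site for the function above (reader, labels and create_pipeline are assumed to be defined at module level, as in the later examples):

from sklearn.linear_model import LogisticRegression

print_classification(LogisticRegression())                  # plain pipeline
print_classification(LogisticRegression(), reduction=True)  # with dimensionality reduction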
Example #2
import os

from typing import List
from loader import CorpusLoader
from model import Word2VecModel
from controller import Word2VecController

# Read the corpus from files
corpus_dir = "corpus"
corpus: List[str] = []
for path in os.listdir(corpus_dir):
    with open(os.path.join(corpus_dir, path), "r") as f:
        corpus.append(f.read())

# Sample training and test data
loader = CorpusLoader(corpus)
n_train = 8000
X, Y = loader.sample_data(n=10000)
x_train, y_train = X[:n_train], Y[:n_train]
x_test, y_test = X[n_train:], Y[n_train:]

model = Word2VecModel({
    "vocabulary_size": x_train.shape[1],
    "hidden_layers": [32],
})
controller = Word2VecController(
    model, {
        "epochs": 1000,
        "batch_size": 1024,
        "shuffle_buffer_size": 100000,
        "save_path": "./kite_word2vec.h5",
    })
Example #3
from reader import TweetsCorpusReader
from loader import CorpusLoader
from build import models
from build import score_models

import nltk
import os
import json
import logging
import re

log = logging.getLogger("readability.readability")
log.setLevel('WARNING')

ROOT = 'C:\\Users\\niti.mishra\\Documents\\Personal\\cyberbullying\\'
CORPUS = os.path.join(ROOT, 'data\\labelled_tweets')
RESULTS = os.path.join(ROOT, 'results')

DOC_PATTERN = r'.*\.json' 

# file = CORPUS+'\\random_tweets_2b.json'


if __name__ == '__main__':
    corpus = TweetsCorpusReader(CORPUS, DOC_PATTERN)
    loader = CorpusLoader(corpus, 12, label='bullying_trace')
    for scores in score_models(models, loader):
        with open(RESULTS+'\\results.json', 'a') as f:
            f.write(json.dumps(scores) + "\n")
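Since each model's scores are appended as one JSON object per line, the results file can be read back as JSON lines; a minimal sketch, assuming the script above has already run:

import json

with open(RESULTS + '\\results.json') as f:
    results = [json.loads(line) for line in f]
print(results[0])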
Example #4
            table.append(row)

    table.sort(key=lambda r: r[-1], reverse=True)
    print(tabulate.tabulate(table, headers=fields))


if __name__ == '__main__':
    results_file = "results.json"
    labels = [
        "artistic_event",
        "other_event",
    ]

    # Initializing corpus reader and loader (generates K-folds)
    reader = PickledCorpusReader('./pickle_corpus')
    loader = CorpusLoader(reader, 5, shuffle=True, categories=labels)

    txt = TextNormalizer()
    txt.lemmatize("qu'", "")

    # Initializing models
    models = []
    for form in (LogisticRegression, SGDClassifier):
        models.append(create_pipeline(form(), True))
        models.append(create_pipeline(form(), False))

    models.append(create_pipeline(MultinomialNB(), False))
    models.append(create_pipeline(GaussianNB(), True))

    # Running all models
    for scores in score_models(models, loader):
        with open(results_file, 'a') as f:
            f.write(json.dumps(scores) + "\n")
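The truncated top of this example sorts a score table and prints it with tabulate; the same pattern in isolation, with dummy rows for illustration only:

import tabulate

fields = ['model', 'precision', 'recall', 'accuracy', 'f1']
table = [
    ['MultinomialNB', 0.81, 0.79, 0.80, 0.80],       # dummy scores
    ['LogisticRegression', 0.92, 0.90, 0.91, 0.91],  # dummy scores
]
table.sort(key=lambda r: r[-1], reverse=True)  # best F1 first
print(tabulate.tabulate(table, headers=fields))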
Example #5
from collections import defaultdict

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# TextNormalizer, CorpusReader and CorpusLoader come from the project's own
# modules (see the loader/reader imports in the other examples on this page).


def identity(words):
    # tokenizer passthrough: documents are already tokenized upstream
    return words


def create_pipeline(estimator):
    steps = [
        ('normalize', TextNormalizer()),
        ('vectorize', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)),
        ('classifier', estimator)
    ]   
    return Pipeline(steps)

models = []
for estimator in (LogisticRegression, MultinomialNB):
    models.append(create_pipeline(estimator()))


# get data
reader = CorpusReader()
loader = CorpusLoader(reader, folds=10, shuffle=True)

scores_table = []
scores_table_fields = ['model', 'precision', 'recall', 'accuracy', 'f1']

for model in models:
    scores = defaultdict(list) 

    fold = 1
    model_name = model.named_steps['classifier'].__class__.__name__     

    print('Model: {}'.format(model_name))
    for X_train, X_test, y_train, y_test in loader:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
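The snippet breaks off before the fold metrics are collected. One plausible continuation (an assumption, not the original code), using scikit-learn's metric functions to fill the scores dict declared above:

# at the top of the script:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# inside the fold loop, right after y_pred (hypothetical continuation):
scores['precision'].append(precision_score(y_test, y_pred, average='weighted'))
scores['recall'].append(recall_score(y_test, y_pred, average='weighted'))
scores['accuracy'].append(accuracy_score(y_test, y_pred))
scores['f1'].append(f1_score(y_test, y_pred, average='weighted'))
fold += 1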
Example #6
    def save(self):
        self.id2word.save(self.path)

    def fit(self, documents, labels=None):
        self.id2word = gensim.corpora.Dictionary(documents)
        self.save()
        return self  # scikit-learn convention: fit returns the transformer

    def transform(self, documents):
        for document in documents:
            docvec = self.id2word.doc2bow(document)
            yield sparse2full(docvec, len(self.id2word))

if __name__ == '__main__':
    from loader import CorpusLoader
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('../corpus')
    loader = CorpusLoader(corpus, 12)

    docs   = loader.documents(0, test=True)
    labels = loader.labels(0, test=True)
    # print(next(docs)[0][0][0])
    normal = TextNormalizer()
    normal.fit(docs, labels)

    docs   = list(normal.transform(docs))

    vect = GensimVectorizer('lexicon.pkl')
    vect.fit(docs)
    docs = vect.transform(docs)
    print(next(docs))
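For reference, sparse2full used in transform() comes from gensim.matutils; a standalone sketch of the doc2bow-to-dense round trip on toy documents:

import gensim
from gensim.matutils import sparse2full

docs = [['the', 'cat', 'sat'], ['the', 'dog', 'sat']]
id2word = gensim.corpora.Dictionary(docs)
bow = id2word.doc2bow(docs[0])          # sparse form: [(token_id, count), ...]
dense = sparse2full(bow, len(id2word))  # dense numpy vector of vocabulary length
print(dense)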
Example #7
    corpus = TweetsCorpusReader(str(CORPUS),
                                DOC_PATTERN,
                                bullying_trace=target)
    # processed_tweets = corpus.process_tweet()
    # from transformers import TextNormalizer
    # normalize  = TextNormalizer(lemma=True)
    # normalized_tweets = list(normalize.fit_transform(processed_tweets))
    # print(len(normalized_tweets))
    # # X = [' '.join(doc) for doc in normalized_tweets]
    # y = list(corpus.fields(fields=target))

    # perform classification with increasing training set size
    # idx = (np.linspace(1, 1.0, 1)*len(corpus.docs())).astype(int)
    idx = list(range(500, len(corpus.docs()), 500))
    for i in idx:
        loader = CorpusLoader(corpus, 6, label=target, size=i)
        for scores in score_models(binary_models, loader):
            print(scores)
            result_filename = 'TRACE_results' + str(i) + '.json'
            with open(Path.joinpath(RESULTS, result_filename), 'a') as f:
                f.write(json.dumps(scores) + '\n')

# ##########################################################################
# ## multi-classification
# ##########################################################################

# target = 'bullying_role'
# if __name__ == '__main__':
#     corpus = TweetsCorpusReader(CORPUS.__str__(), DOC_PATTERN, bullying_trace=target)
#     idx = (np.linspace(1, 1.0, 1)*len(corpus.docs())).astype(int)
#     # idx = [i for i in range(100, len(corpus.docs()), 100) ]
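The size-stepping loop above is a hand-rolled learning curve. For estimators that take array inputs directly, scikit-learn's learning_curve expresses the same idea; a standalone sketch on synthetic data:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve

X = np.random.rand(2000, 20)       # synthetic features, illustration only
y = np.random.randint(0, 2, 2000)  # synthetic binary labels
sizes, train_scores, test_scores = learning_curve(
    LogisticRegression(), X, y, train_sizes=np.linspace(0.1, 1.0, 5), cv=5)
print(sizes, test_scores.mean(axis=1))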
Example #8
                X[i].isnull(), 1.0,
                0.0)  # create new dummy column, 1=missing in original
            X[i] = X[i].fillna(mu)  # fill missing with mean
    with open(Path.joinpath(RESULTS, 'column_means.pkl'), 'wb') as f:
        pickle.dump(col_means, f)  # save column means for use in prediction.py

    records = X.to_dict('records')
    final_cols = list(records[0].keys())
    X = [list(i.values()) for i in records]
    y = list(data.fields(data_info['target']))  # will label encode in build.py

    # 3. determine how to split test, train and validation set
    if data_info.get('validation'):
        split_idx = True
        split_set = [str(i) for i in list(data.fields('Validation'))]
        loader = CorpusLoader(X, y, idx=split_set)  # using predefined split
    else:
        split_idx = False
        loader = CorpusLoader(X, y, idx=None)  # using cv

    # 4. train models and save models and its scores
    for scores in score_models(binary_models,
                               loader,
                               split_idx=split_idx,
                               k=5,
                               features=final_cols,
                               outpath=RESULTS):
        print(scores)
        result_filename = 'results.json'
        with open(Path.joinpath(RESULTS, result_filename), 'a') as f:
            f.write(json.dumps(scores) + '\n')
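The truncated top of this example mean-imputes each column and records a missingness indicator before filling; the pattern in isolation on a toy DataFrame (the column name is illustrative):

import numpy as np
import pandas as pd

X = pd.DataFrame({'age': [25.0, np.nan, 40.0]})
col_means = {}
for i in list(X.columns):
    mu = X[i].mean()
    col_means[i] = mu
    X[i + '_missing'] = np.where(X[i].isnull(), 1.0, 0.0)  # 1 = missing in original
    X[i] = X[i].fillna(mu)                                 # fill missing with the mean
print(X)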