Beispiel #1
0
def performance_report(y_test, y_pred):
    """Print a classification report for a set of predictions.

    The report is first built with `target_names=categories`. If that
    raises a `ValueError`, the problem is logged and a report without
    explicit target names is printed instead.

    Parameters
    ----------
    y_test
        The expected labels, passed through to `classification_report`.

    y_pred
        The predicted labels, passed through to `classification_report`.

    Returns
    -------
    None
    """
    try:
        named_report = classification_report(
            y_test, y_pred, target_names=categories, zero_division=0)
        print(named_report)
    except ValueError as err:
        logger.critical('Classification report: Invalid param `target_names`.')
        logger.error(err)
        # Fall back to the labels classification_report derives on its own.
        plain_report = classification_report(y_test, y_pred, zero_division=0)
        print(plain_report)
Beispiel #2
0
def save_obj(obj, filepath):
    """Pickle `obj` into the file at `filepath`.

    This is best-effort: any exception raised while opening the file or
    pickling is logged (critical message plus the error itself) and then
    swallowed, so the caller is never interrupted.

    Parameters
    ----------
    obj
        The object to serialize with `pickle.dump`.

    filepath
        Destination path; the file is opened in binary write mode.

    Returns
    -------
    None
    """
    try:
        with open(filepath, 'wb') as handle:
            pickle.dump(obj, handle)
        saved_msg = f'Pickled object `{filepath}` saved to disk.'
        logger.debug(saved_msg)
    except Exception as err:
        failure_msg = f'An unexpected error occurred while saving the object `{filepath}`.'
        logger.critical(failure_msg)
        logger.error(err)
Beispiel #3
0
def get_all_tags():
    """Fetch every row of the `tags` table from the database at `DBFILE`.

    Returns
    -------
    list
        The rows returned by `fetchall()` for the columns
        `filepath, tag, text, hosts`.

    Notes
    -----
    On any error a critical message and the error are logged and the
    process exits with status 2. The connection is always closed before
    returning or exiting.
    """
    conn = None
    try:
        conn = sqlite3.connect(DBFILE)
        c = conn.cursor()
        c.execute('SELECT filepath, tag, text, hosts FROM tags')
        return c.fetchall()
    except Exception as e:
        logger.critical('Unable to fetch tags from database.')
        logger.error(e)
        sys.exit(2)
    finally:
        # Runs on normal return and on sys.exit alike, so the connection
        # is never leaked.
        if conn is not None:
            conn.close()
Beispiel #4
0
def load_obj(filepath):
    """Load and return a pickled object from `filepath`.

    Parameters
    ----------
    filepath
        Path of the pickle file; opened in binary read mode and decoded
        with `encoding='latin1'`.

    Returns
    -------
    object
        Whatever `pickle.load` produced.

    Notes
    -----
    Any failure is logged and ends the process with exit status 2. A
    missing file gets its own message; every other error also has the
    exception logged.
    """
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use this on files the application wrote itself.
    try:
        with open(filepath, 'rb') as handle:
            loaded = pickle.load(handle, encoding='latin1')
        logger.debug(f'Pickled object loaded from `{filepath}`.')
        return loaded
    except Exception as err:
        if isinstance(err, FileNotFoundError):
            logger.critical(f'Pickled object `{filepath}` was not found. Exiting.')
        else:
            logger.critical(f'An unexpected error occurred while loading the object `{filepath}`.')
            logger.error(err)
        sys.exit(2)
Beispiel #5
0
def trainv2(std=False, algo='mnb', n=None):
    """Train, save and evaluate a Naive Bayes model (v2 pipeline).

    The data comes from `create_dataframe` / `process_dataframe`, is split
    70/30 with a fixed random state, fitted, saved via `save_model` with
    `version='v2'`, and then evaluated on the held-out part.

    Parameters
    ----------
    std : bool
        When true, the train/test features are passed through
        `standardize` before fitting.

    algo : str
        `cnb` selects `ComplementNB`, `mnb` selects `MultinomialNB`. Any
        other value logs a critical message and uses `MultinomialNB`; the
        value is still passed unchanged to `process_dataframe` and
        `save_model` and used in the plot title.

    n : int
        Forwarded to `create_dataframe(n=n)`; intended as the number of
        samples to select from each category (None presumably means all;
        confirm in `create_dataframe`).

    Returns
    -------
    None
    """
    frame = create_dataframe(n=n)
    counts, frame = process_dataframe(frame, algo=algo)

    # TODO: remove these dumps of the intermediate data.
    save_obj(frame, 'v2_dataframe.p')
    save_obj(counts, 'v2_counts.p')

    split = train_test_split(counts, frame['label'], test_size=0.3, random_state=69)
    x_train, x_test, y_train, y_test = split

    if std:
        x_train, x_test = standardize(x_train, x_test)

    # Anything that is not a known algorithm name falls back to MultinomialNB.
    if algo not in ('cnb', 'mnb'):
        logger.critical(
            'Parameter `algo` specifies unknown algorithm. Defaulting to `mnb`.'
        )
    model = ComplementNB() if algo == 'cnb' else MultinomialNB()

    model.fit(x_train, y_train)
    save_model(model, version='v2', algo=algo)

    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy}')

    measure.evaluate(model, x_test, y_test)
    measure.performance_report(y_test, y_pred)
    measure.plot_confusion_mat(model, x_test, y_test)
    learning_curve.plot(model, x_test, y_test,
                        title=f'Learning Curves ({algo.upper()})')
Beispiel #6
0
def warn_failed(failed):
    """Report files that could not be processed and write them to `failed.out`.

    Logs the number of failures, then a per-category breakdown based on
    the filename prefix (`legit`, `spam`, `phish`, `malware`, `fraud`),
    and finally writes all names, one per line, to `failed.out` in the
    current working directory.

    Parameters
    ----------
    failed : list of str
        Names of the files that failed.

    Returns
    -------
    None
    """
    logger.critical(f'An error occurred with {len(failed)} files. See `failed.out` for filenames.')
    prefixes = ('legit', 'spam', 'phish', 'malware', 'fraud')
    try:
        fail_cat = {
            prefix: sum(1 for fn in failed if fn.startswith(prefix))
            for prefix in prefixes
        }
        logger.critical(f'Failed:\n{fail_cat}')
    except Exception as e:
        # The breakdown is only informational: keep going so the list of
        # failed files is still written, but do not hide why it failed.
        logger.error(e)
    with open('failed.out', 'w') as f:
        f.write('\n'.join(failed))
Beispiel #7
0
def train(std=False, algo='mnb'):
    """Train, save and evaluate a Naive Bayes model (v1 pipeline).

    Features and labels come from `make_dictionary` / `make_dataset`, are
    split 70/30 with a fixed random state, fitted, saved via `save_model`
    with `version='v1'`, and then evaluated on the held-out part.

    Parameters
    ----------
    std : bool
        When true, the train/test features are passed through
        `standardize` before fitting.

    algo : str
        `cnb` selects `ComplementNB`, `mnb` selects `MultinomialNB`. Any
        other value logs a critical message and uses `MultinomialNB`; the
        value is still passed unchanged to `save_model` and used in the
        plot title.

    Returns
    -------
    None
    """
    vocabulary = make_dictionary()
    features, labels = make_dataset(vocabulary)

    # TODO: remove these dumps of the intermediate data.
    save_obj(features, 'v1_features.p')
    save_obj(labels, 'v1_labels.p')

    split = train_test_split(features, labels, test_size=0.3, random_state=69)
    x_train, x_test, y_train, y_test = split

    if std:
        x_train, x_test = standardize(x_train, x_test)

    # Anything that is not a known algorithm name falls back to MultinomialNB.
    if algo not in ('cnb', 'mnb'):
        logger.critical(
            'Parameter `algo` specifies unknown algorithm. Defaulting to `mnb`.'
        )
    model = ComplementNB() if algo == 'cnb' else MultinomialNB()

    model.fit(x_train, y_train)
    save_model(model, version='v1', algo=algo)

    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy}')

    measure.evaluate(model, x_test, y_test)
    measure.performance_report(y_test, y_pred)
    measure.plot_confusion_mat(model, x_test, y_test)
    learning_curve.plot(model, x_test, y_test,
                        title=f'Learning Curves ({algo.upper()})')
Beispiel #8
0
def get_n_tags(n):
    """Fetch up to `n` randomly chosen rows per category from the `tags` table.

    For each tag value 0..4 all matching rows are read and `n` of them are
    sampled at random. If a category has fewer than `n` rows, a warning is
    logged and all of its rows are used (in random order).

    Parameters
    ----------
    n : int
        Number of rows to sample from each category.

    Returns
    -------
    list
        The sampled rows (`filepath, tag, text, hosts`) of all categories,
        concatenated in category order.

    Notes
    -----
    A failure within one category is logged and that category is skipped.
    A failure to connect to the database is logged and the process exits
    with status 2. The connection is always closed before returning or
    exiting.
    """
    cats = [0, 1, 2, 3, 4]
    res = []
    conn = None
    try:
        conn = sqlite3.connect(DBFILE)
        c = conn.cursor()
        for cat in cats:
            try:
                c.execute('SELECT filepath, tag, text, hosts FROM tags WHERE tag=?', (cat,))
                tags = c.fetchall()
                if n <= len(tags):
                    res += random.sample(tags, n)
                else:
                    # `Logger.warn` is a deprecated alias of `warning`.
                    logger.warning(f'n={n} is higher than the number of samples in {cat}. Selecting {len(tags)} (all) samples.')
                    res += random.sample(tags, len(tags))
            except Exception as e:
                logger.critical(f'Unable to fetch {n} tags for category {cat}.')
                logger.error(e)
        return res
    except Exception as e:
        logger.critical('Unable to fetch tags from database.')
        logger.error(e)
        sys.exit(2)
    finally:
        # Runs on normal return and on sys.exit alike, so the connection
        # is never leaked.
        if conn is not None:
            conn.close()
Beispiel #9
0
import sys

from katatasso.helpers.const import CATEGORIES
from katatasso.helpers.extraction import (get_tfidf_counts, make_dictionary,
                                          process_dataframe)
from katatasso.helpers.logger import rootLogger as logger
from katatasso.helpers.utils import load_model

# Guarded imports: if any of these modules is missing, log which one it is
# and exit with status 2 instead of surfacing a raw traceback.
try:
    from sklearn.metrics import accuracy_score
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    import juicer
    import numpy as np
    import pandas as pd
except ModuleNotFoundError as e:
    logger.critical(f'Module `{e.name}` not found. Please install before proceeding.')
    sys.exit(2)


def classify(text, algo='mnb'):
    """Classify the text using a Naive Bayes model with
        word vector counts

        Parameters
        ----------
        text : str
            The text input to classify

        algo : str
            The algorithm to use
            `mnb` for Multinomial Naïve Bayes,
Beispiel #10
0
def main():
    """Command-line entry point.

    Options are handled in the order given on the command line, so input
    (`-f`/`-s`) and configuration (`-n`, `-a`, `-l`) must come before the
    action (`-t`/`-c`) they are meant to affect.

    Options
    -------
    -h, --help            Print the help message and exit.
    -f, --infile FILE     Read the text to classify from FILE.
    -s, --stdin           Read the text to classify from standard input.
    -n, --std             Standardize the data when training.
    -a, --algo ALGO       Algorithm, `mnb` or `cnb`.
    -l, --limit N         Use N samples (forwarded to `trainv2` as `n`).
    -t, --train VERSION   Train a model, VERSION is `v1` or `v2`.
    -c, --classify VERSION
                          Classify the input, VERSION is `v1` or `v2`.
    -o, --outfile NAME    Write the result to NAME.txt / NAME.json.
    -d, --format FORMAT   Output file format, `plain` (default) or `json`.
    -v, --verbose         Increase the log level (counted up to 4 times).
    --log-file            Also log to a file (only with at least one -v).

    Exits with status 2 on invalid usage or I/O errors and with status 0
    after printing help or emitting a classification result. Returns
    normally when no result was produced (e.g. after training only).
    """
    TEXT = None
    CONFIG = {}

    result = {}

    if len(sys.argv) < 2:
        print(HELPMSG)
        logger.critical('No input specified')
        sys.exit(2)

    argv = sys.argv[1:]

    try:
        # Long option names are listed without the leading `--`; a trailing
        # `=` marks options that require an argument.
        opts, _args = getopt.getopt(argv, 'hf:st:c:na:l:o:d:v', [
            'help', 'infile=', 'stdin', 'std', 'algo=', 'limit=', 'train=',
            'classify=', 'outfile=', 'format=', 'verbose', 'log-file'
        ])
    except getopt.GetoptError:
        print(HELPMSG)
        sys.exit(2)

    if not opts:
        print(HELPMSG)
        sys.exit(0)

    # Increase verbosity: one log-level step per -v / --verbose, at most 4.
    verbosity = min(
        sum(1 for opt, _ in opts if opt in ('-v', '--verbose')), 4)
    for _ in range(verbosity):
        increase_log_level()

    # Log to file, only when running verbosely. Compare the option name
    # exactly: a substring test against '--log-file' would also match
    # '-f' and '-l'.
    if verbosity > 0 and any(opt == '--log-file' for opt, _ in opts):
        log_to_file()

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(HELPMSG)
            sys.exit(0)
        elif opt in ('-f', '--infile'):
            file_path = arg
            logger.debug(f'Using input file {file_path}')
            try:
                with open(file_path, 'r') as f:
                    TEXT = f.read()
            except FileNotFoundError:
                logger.critical(
                    f'The specified file {file_path} does not exist.')
                sys.exit(2)
            except Exception as e:
                logger.critical(
                    f'An error occurred while reading the file `{file_path}`.')
                logger.error(e)
                sys.exit(2)
        elif opt in ('-s', '--stdin'):
            try:
                logger.debug('Using input from STDIN')
                TEXT = sys.stdin.read()
            except Exception as e:
                logger.critical('An error occurred while reading from stdin.')
                logger.error(e)
                sys.exit(2)
        elif opt in ('-n', '--std'):
            logger.debug('OPTION: Standardizing data.')
            CONFIG['std'] = True
        elif opt in ('-a', '--algo'):
            if arg not in ['mnb', 'cnb']:
                print(HELPMSG)
                logger.critical(
                    f'The specified algorithm `{arg}` is not available.')
                sys.exit(2)
            logger.debug(f'OPTION: Using algorithm `{arg}`')
            CONFIG['algo'] = arg
        elif opt in ('-l', '--limit'):
            # isdecimal() guarantees that int(arg) succeeds.
            if arg.isdecimal():
                logger.debug(f'OPTION: Using n={arg} samples.')
                CONFIG['n'] = int(arg)
            else:
                print(HELPMSG)
                logger.critical(f'n={arg} is non-numeric.')
                sys.exit(2)
        elif opt in ('-t', '--train'):
            logger.debug('ACTION: Creating model from dataset')
            if arg == 'v1':
                katatasso.train(std=CONFIG.get('std', False),
                                algo=CONFIG.get('algo', 'mnb'))
            elif arg == 'v2':
                katatasso.trainv2(std=CONFIG.get('std', False),
                                  algo=CONFIG.get('algo', 'mnb'),
                                  n=CONFIG.get('n', None))
            else:
                logger.critical(
                    'Please specify either `v1` or `v2`. E.g. `katatasso -t v2`'
                )
                sys.exit(2)
        elif opt in ('-c', '--classify'):
            if TEXT:
                logger.debug('ACTION: Classifying input')
                # The algorithm chosen with -a/--algo is stored under 'algo'.
                algo = CONFIG.get('algo', 'mnb')
                if arg == 'v1':
                    category = katatasso.classify(TEXT, algo=algo)
                elif arg == 'v2':
                    category = katatasso.classifyv2(TEXT, algo=algo)
                else:
                    logger.critical(
                        'Please specify either `v1` or `v2`. E.g. `katatasso -c v2`'
                    )
                    sys.exit(2)
                result = {
                    'category': category,
                    'accuracy': 'n/a',
                    'alias': CATEGORIES.get(category)
                }
            else:
                logger.critical('Missing input (specify using -f or -s)')
                sys.exit(2)
        elif opt in ('-o', '--outfile'):
            logger.debug(f'CONFIG: Setting output file to {arg}')
            CONFIG['outfile'] = arg
        elif opt in ('-d', '--format'):
            if arg in ['plain', 'json']:
                logger.debug(f'CONFIG: Setting output file format to {arg}')
                CONFIG['format'] = arg
            else:
                logger.critical('Invalid format. Must be one of [plain, json]')
                sys.exit(2)

    if result:
        # Default to plain text so that -o without -d still writes a file.
        outformat = CONFIG.get('format', 'plain')
        outfile = CONFIG.get('outfile')
        if outfile:
            if outformat == 'json':
                import json
                fname = f'{outfile}.json'
                with open(fname, 'w', encoding='utf-8') as f:
                    json.dump(result, f, ensure_ascii=False, indent=4)
            else:
                fname = f'{outfile}.txt'
                with open(fname, 'w') as f:
                    # Values are not guaranteed to be strings (the alias
                    # may be None), so convert before joining.
                    f.write('\n'.join(str(value) for value in result.values()))
            logger.debug(f'Results saved to file `{fname}`')
            sys.exit(0)
        else:
            for key, value in result.items():
                print(f'{key}: {value}')
            sys.exit(0)