Exemple #1
0
def __create_dataframe():
    """Build the labelled training DataFrame from the tagged e-mail files.

    Each row returned by ``get_all_tags()`` is loaded from disk with
    ``emailyzer.from_file``; its ``html_as_text`` is run through
    ``juicer.extract_stanford`` and the result is stored next to the row's
    tag. Files that raise ``AttributeError`` while being processed are
    skipped and reported through ``warn_failed``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``label`` and ``message``,
        or ``None`` when ``get_all_tags()`` returns nothing.
    """
    failed = []
    labels = []
    contents = []
    tags = get_all_tags()
    # Created once and reused for every extraction call in the loop.
    tagger = juicer.initStanfordNERTagger()

    if not tags:
        logger.error('No tags were found in the database.')
        return None

    # Only the first two columns (file path and tag) are needed here; rows
    # may carry additional columns, which are ignored instead of breaking
    # the unpacking with a ValueError.
    for filepath, tag, *_ in progress_bar(tags):
        try:
            email = emailyzer.from_file(filepath)
            content = email.html_as_text
            # Preprocess, extract entities
            words = juicer.extract_stanford(content, named_only=False, stemming=False, tagger=tagger)
        except AttributeError:
            # Remember the path relative to the training-data directory.
            failed.append(filepath.replace(CLF_TRAININGDATA_PATH, ''))
        else:
            # Appended together so labels and contents always stay aligned.
            contents.append(words)
            labels.append(tag)

    df = pd.DataFrame(list(zip(labels, contents)), columns=['label', 'message'])

    if failed:
        warn_failed(failed)

    return df
Exemple #2
0
def make_dictionary():
    """Count the alphabetic words found in the text of every tagged row.

    Rows whose text has no ``split`` method (``AttributeError``) are
    skipped and reported through ``warn_failed``.

    Returns:
        The ``CLF_DICT_NUM`` most common ``(word, count)`` pairs, or
        ``None`` when ``get_all_tags()`` returns nothing.
    """
    skipped = []
    rows = get_all_tags()
    tokens = []
    logger.debug('Creating dictionary..')

    if not rows:
        logger.error('No tags were found in the database.')
        return None

    for path, _label, body, _hosts in progress_bar(rows):
        try:
            tokens.extend(body.split())
        except AttributeError:
            skipped.append(path.replace(CLF_TRAININGDATA_PATH, ''))

    # Keep purely alphabetic tokens only, then tally them.
    counts = Counter(token for token in tokens if token.isalpha())

    if skipped:
        warn_failed(skipped)

    return counts.most_common(CLF_DICT_NUM)
Exemple #3
0
def performance_report(y_test, y_pred):
    """Print a classification report for the given true/predicted labels.

    The report is first requested with ``target_names=categories``; if that
    raises ``ValueError`` the problem is logged and the report is printed
    again without target names.
    """
    try:
        labelled_report = classification_report(
            y_test,
            y_pred,
            target_names=categories,
            zero_division=0,
        )
        print(labelled_report)
    except ValueError as err:
        logger.critical('Classification report: Invalid param `target_names`.')
        logger.error(err)
        plain_report = classification_report(y_test, y_pred, zero_division=0)
        print(plain_report)
Exemple #4
0
def save_obj(obj, filepath):
    """Pickle ``obj`` into the file at ``filepath``.

    Saving is best effort: any ``Exception`` raised while writing is logged
    and swallowed, so the function always returns ``None``.
    """
    try:
        with open(filepath, 'wb') as sink:
            pickle.dump(obj, sink)
        logger.debug(f'Pickled object `{filepath}` saved to disk.')
    except Exception as err:
        logger.critical(
            f'An unexpected error occurred while saving the object `{filepath}`.'
        )
        logger.error(err)
Exemple #5
0
def get_all_tags():
    """Fetch every row of the ``tags`` table from the SQLite file ``DBFILE``.

    Returns:
        A list of ``(filepath, tag, text, hosts)`` tuples (empty when the
        table has no rows).

    Any error while connecting or querying is logged and terminates the
    process with exit status 2.
    """
    conn = None
    try:
        conn = sqlite3.connect(DBFILE)
        c = conn.cursor()
        c.execute('SELECT filepath, tag, text, hosts FROM tags')
        res = c.fetchall()
        return res
    except Exception as e:
        logger.critical('Unable to fetch tags from database.')
        logger.error(e)
        sys.exit(2)
    finally:
        # Release the database handle on every path, including the
        # sys.exit() above; the original left the connection open.
        if conn is not None:
            conn.close()
Exemple #6
0
def load_obj(filepath):
    """Load and return the pickled object stored at ``filepath``.

    A missing file, or any other error while reading or unpickling, is
    logged and terminates the process with exit status 2.
    """
    try:
        with open(filepath, 'rb') as source:
            # NOTE: unpickling can execute arbitrary code, so this must
            # only ever be pointed at trusted files.
            restored = pickle.load(source, encoding='latin1')
        logger.debug(f'Pickled object loaded from `{filepath}`.')
        return restored
    except FileNotFoundError:
        logger.critical(f'Pickled object `{filepath}` was not found. Exiting.')
        sys.exit(2)
    except Exception as err:
        logger.critical(
            f'An unexpected error occurred while loading the object `{filepath}`.'
        )
        logger.error(err)
        sys.exit(2)
Exemple #7
0
def create_dataframe(n=None):
    """Build a ``label`` / ``message`` DataFrame from the stored tag rows.

    Args:
        n: When truthy, rows come from ``get_n_tags(n)``; otherwise all
            rows are taken from ``get_all_tags()``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``label`` (the row's tag)
        and ``message`` (the row's text), or ``None`` when no rows were
        found.
    """
    rows = get_n_tags(n) if n else get_all_tags()

    if not rows:
        logger.error('No tags were found in the database.')
        return None

    # Each row is (filepath, tag, text, hosts); keep only tag and text.
    pairs = [(label, body) for _path, label, body, _hosts in progress_bar(rows)]
    return pd.DataFrame(pairs, columns=['label', 'message'])
Exemple #8
0
def get_n_tags(n):
    """Fetch up to ``n`` randomly chosen rows for each tag category.

    The categories ``0`` to ``4`` are queried one after another. When a
    category holds fewer than ``n`` rows, a warning is logged and all of
    its rows are used (in random order).

    Args:
        n: Number of rows to sample per category.

    Returns:
        A list of ``(filepath, tag, text, hosts)`` tuples.

    A failure for a single category is logged and that category is
    skipped; a failure to open the database is logged and terminates the
    process with exit status 2.
    """
    cats = [0, 1, 2, 3, 4]
    res = []
    conn = None
    try:
        conn = sqlite3.connect(DBFILE)
        c = conn.cursor()
        for cat in cats:
            try:
                c.execute('SELECT filepath, tag, text, hosts FROM tags WHERE tag=?', (cat,))
                tags = c.fetchall()
                if n > len(tags):
                    # logger.warning() replaces the deprecated warn() alias.
                    logger.warning(f'n={n} is higher than the number of samples in {cat}. Selecting {len(tags)} (all) samples.')
                res += random.sample(tags, min(n, len(tags)))
            except Exception as e:
                logger.critical(f'Unable to fetch {n} tags for category {cat}.')
                logger.error(e)
        return res
    except Exception as e:
        logger.critical('Unable to fetch tags from database.')
        logger.error(e)
        sys.exit(2)
    finally:
        # Release the database handle on every path; the original left the
        # connection open.
        if conn is not None:
            conn.close()
Exemple #9
0
def main():
    """Command-line entry point: parse ``sys.argv``, then train or classify.

    Options are handled in the order they appear on the command line, so
    the settings ``-n``, ``-a`` and ``-l`` must precede ``-t``/``-c`` and
    the input (``-f`` or ``-s``) must precede ``-c`` to take effect. The
    output options ``-o`` and ``-d`` are only consulted after all options
    have been processed.

    Exits with status 2 on invalid usage or input errors, and with status
    0 after printing the help text or after emitting a classification
    result. Returns ``None`` when no result was produced (e.g. after
    training).
    """
    TEXT = None
    CONFIG = {}

    result = {}

    if len(sys.argv) < 2:
        print(HELPMSG)
        logger.critical('No input specified')
        sys.exit(2)

    argv = sys.argv[1:]

    try:
        # getopt expects long option names WITHOUT the leading dashes; a
        # trailing `=` marks an option that takes an argument.
        opts, _ = getopt.getopt(argv, 'hf:st:c:na:l:o:d:v', [
            'help', 'infile=', 'stdin', 'std', 'algo=', 'limit=', 'train=',
            'classify=', 'outfile=', 'format=', 'verbose', 'log-file'
        ])
    except getopt.GetoptError:
        print(HELPMSG)
        sys.exit(2)

    if not opts:
        print(HELPMSG)
        sys.exit(0)

    # Increase verbosity: one log level per -v/--verbose, capped at four.
    verbosity = min(
        sum(1 for opt, _ in opts if opt in ('-v', '--verbose')), 4)
    for _ in range(verbosity):
        increase_log_level()

    # Log to file (only considered when verbosity was raised).
    if verbosity > 0 and any(opt == '--log-file' for opt, _ in opts):
        log_to_file()

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(HELPMSG)
            sys.exit(0)
        elif opt in ('-f', '--infile'):
            file_path = arg
            logger.debug(f'Using input file {file_path}')
            try:
                with open(file_path, 'r') as f:
                    TEXT = f.read()
            except FileNotFoundError:
                logger.critical(
                    f'The specified file {file_path} does not exist.')
                sys.exit(2)
            except Exception as e:
                logger.critical(
                    f'An error occurred while reading the file `{file_path}`.')
                logger.error(e)
                sys.exit(2)
        elif opt in ('-s', '--stdin'):
            try:
                logger.debug('Using input from STDIN')
                TEXT = sys.stdin.read()
            except Exception as e:
                logger.critical('An error occurred while reading from stdin.')
                logger.error(e)
                sys.exit(2)
        elif opt in ('-n', '--std'):
            logger.debug('OPTION: Standardizing data.')
            CONFIG['std'] = True
        elif opt in ('-a', '--algo'):
            if arg not in ['mnb', 'cnb']:
                print(HELPMSG)
                logger.critical(
                    f'The specified algorithm `{arg}` is not available.')
                sys.exit(2)
            # Report the algorithm that was actually selected.
            logger.debug(f'OPTION: Using algorithm `{arg}`')
            CONFIG['algo'] = arg
        elif opt in ('-l', '--limit'):
            # isnumeric must be *called*; the bare method object is always
            # truthy and would let int() crash on bad input.
            if arg.isnumeric():
                logger.debug(f'OPTION: Using n={arg} samples.')
                CONFIG['n'] = int(arg)
            else:
                print(HELPMSG)
                logger.critical(f'n={arg} is non-numeric.')
                sys.exit(2)
        elif opt in ('-t', '--train'):
            logger.debug('ACTION: Creating model from dataset')
            if arg == 'v1':
                katatasso.train(std=CONFIG.get('std', False),
                                algo=CONFIG.get('algo', 'mnb'))
            elif arg == 'v2':
                katatasso.trainv2(std=CONFIG.get('std', False),
                                  algo=CONFIG.get('algo', 'mnb'),
                                  n=CONFIG.get('n', None))
            else:
                logger.critical(
                    'Please specify either `v1` or `v2`. E.g. `katatasso -t v2`'
                )
                sys.exit(2)
        elif opt in ('-c', '--classify'):
            if not TEXT:
                logger.critical('Missing input (specify using -f or -s)')
                sys.exit(2)
            logger.debug('ACTION: Classifying input')
            # The algorithm chosen with -a is stored under the key 'algo'.
            algo = CONFIG.get('algo', 'mnb')
            if arg == 'v1':
                category = katatasso.classify(TEXT, algo=algo)
            elif arg == 'v2':
                category = katatasso.classifyv2(TEXT, algo=algo)
            else:
                logger.critical(
                    'Please specify either `v1` or `v2`. E.g. `katatasso -c v2`'
                )
                sys.exit(2)
            result = {
                'category': category,
                'accuracy': 'n/a',
                'alias': CATEGORIES.get(category)
            }
        elif opt in ('-o', '--outfile'):
            logger.debug(f'CONFIG: Setting output file to {arg}')
            CONFIG['outfile'] = arg
        elif opt in ('-d', '--format'):
            if arg in ['plain', 'json']:
                logger.debug(f'CONFIG: Setting output file format to {arg}')
                CONFIG['format'] = arg
            else:
                logger.critical('Invalid format. Must be one of [plain, json]')
                sys.exit(2)

    if result:
        # Without -d the file is written as plain text, matching the
        # `.txt` extension chosen below.
        outformat = CONFIG.get('format', 'plain')
        outfile = CONFIG.get('outfile')
        if outfile:
            ext = 'json' if outformat == 'json' else 'txt'
            fname = f'{outfile}.{ext}'
            if outformat == 'json':
                import json
                with open(fname, 'w', encoding='utf-8') as f:
                    json.dump(result, f, ensure_ascii=False, indent=4)
            else:
                with open(fname, 'w') as f:
                    # Values are not guaranteed to be strings (the alias
                    # may be None), so convert before joining.
                    f.write('\n'.join(str(value) for value in result.values()))
            logger.debug(f'Results saved to file `{fname}`')
        else:
            for key, value in result.items():
                print(f'{key}: {value}')
        sys.exit(0)