Beispiel #1
0
def main_validacao():
    """Validation run: train on the training folder, predict the validation folder.

    Seeds ``random`` and ``np.random`` with ``SEED``, builds the feature/label
    sets from ``PATH_TREINAMENTO`` and ``PATH_VALIDACAO`` through
    ``create_set``, fits a 75-tree ``RandomForestClassifier`` and prints:

    * the per-class classification report,
    * the labeled confusion matrix,
    * the captcha-level accuracy, computed after regrouping the labels with
      ``rearrange(..., AUDIOS_POR_ARQUIVO)``.

    Returns nothing; all results are written to standard output.
    """
    # Seed both random number generators before anything else runs.
    random.seed(SEED)
    np.random.seed(SEED)

    # Build the datasets from the audio files.
    train_features, train_labels = create_set(PATH_TREINAMENTO)
    val_features, val_labels = create_set(PATH_VALIDACAO)

    # Feature normalisation is currently disabled:
    #   normalize(<features>, axis=0, norm='l2')

    model = RandomForestClassifier(n_estimators=75)
    model.fit(train_features, train_labels)

    # Final classification and evaluation of the results.
    print("\nAvaliação dos Resultados\n")
    predicted_labels = model.predict(val_features)

    print("\nPerformance do Modelo para cada classe de Caractér\n")
    per_class_report = classification_report(val_labels, predicted_labels)
    print(per_class_report)

    print("\nMatriz de confusão\n")
    confusion = labeled_confusion_matrix(val_labels, predicted_labels)
    print(confusion)

    # Regroup the individual labels so that each item is one whole captcha.
    expected_captchas = rearrange(val_labels, AUDIOS_POR_ARQUIVO)
    predicted_captchas = rearrange(predicted_labels, AUDIOS_POR_ARQUIVO)

    # Accuracy over regrouped captchas, expressed as a percentage.
    captcha_accuracy_pct = accuracy_score(expected_captchas, predicted_captchas) * 100
    summary = (
        "\nA acurácia obtida com o modelo para a predição dos captchas no conjunto de validação foi de %2.2f%%.\n"
        % (captcha_accuracy_pct))
    print(summary)
import re
import os
import util

# Collects every e-mail address found in the xls or csv files of a folder
# whose names match a user-supplied regex, then copies the sorted result to
# the clipboard. Written for Python 2 (uses raw_input and unicode).

folderName = raw_input('Folder: ')
filePattern = raw_input('Regex: ')
# The pattern's own suffix selects the parsing mode: a regex ending in '.csv'
# means the matched files are read as plain text, anything else as workbooks.
is_csv = filePattern.endswith('.csv')
mails = []
read = 1

if not is_csv:
    # xlrd (third-party) is only needed for workbooks. It was used below
    # without ever being imported, which raised NameError on the first file.
    import xlrd

# Compile the address pattern once instead of re-parsing it per row/cell.
mail_regex = re.compile(r'[\w\.-]+@[\w\.-]+')

for file_name in os.listdir(folderName):
    # re.match anchors at the start of the file name only.
    if not re.match(filePattern, file_name):
        continue
    file_path = os.path.join(folderName, file_name)
    if is_csv:
        with open(file_path) as csvfile:
            for row in csvfile:
                mails.extend(mail_regex.findall(row))
    else:
        book = xlrd.open_workbook(file_path)
        for sheet in book.sheets():
            for row in sheet.get_rows():
                for cell in row:
                    # Only text cells are scanned; other cell values
                    # (numbers, dates, ...) are skipped.
                    if isinstance(cell.value, unicode):
                        mails.extend(mail_regex.findall(cell.value))
    # `read` is a 1-based counter of the files processed so far.
    print('Files read: ' + repr(read))
    read += 1
print('Filtering double mails (total mails ' + repr(len(mails)) + ')')
# util.create_set is keyed on the lower-cased address (presumably to drop
# case-insensitive duplicates -- confirm in util). The sort key must match the
# string type produced by the chosen mode: str for csv rows, unicode for cells.
util.copy_to_clipboard(sorted(util.create_set(mails, lambda s: s.lower()),
                              key=(str.lower if is_csv else unicode.lower)))
__author__ = 'jens'

import util

# Reads two multi-line inputs, removes from the first ("base") every entry
# that ends, case-insensitively, with one of the entries of the second, and
# copies what is left to the clipboard, sorted case-insensitively.
# Both inputs go through util.create_set keyed on the lower-cased text
# (presumably dropping case-insensitive duplicates -- confirm in util).

print('Base set')
base = util.create_set(util.raw_multi_line_input(), lambda s: s.lower())
print('Extensions to remove from base')
to_del = util.create_set(util.raw_multi_line_input(), lambda s: s.lower())
for suffix in to_del:
    lowered_suffix = suffix.lower()
    # Scan a snapshot so that `base` can be mutated safely afterwards.
    matching = [candidate for candidate in set(base)
                if candidate.lower().endswith(lowered_suffix)]
    for candidate in matching:
        base.remove(candidate)
util.copy_to_clipboard(sorted(base, key=str.lower))
Beispiel #4
0
__author__ = 'jens'

import util

# Reads a multi-line input and copies its unique lines to the clipboard,
# sorted case-insensitively.
# The sorted lines are passed through util.trim before copying (per the
# original note this strips leading and trailing spaces -- confirm in util).

multiLineInput = util.raw_multi_line_input()
# Keyed on the lower-cased line; presumably collapses case-insensitive
# duplicates -- confirm in util.create_set.
_unique_lines = util.create_set(multiLineInput, lambda s: s.lower())
_ordered_lines = sorted(_unique_lines, key=str.lower)
uniqueSorted = util.trim(_ordered_lines)
util.copy_to_clipboard(uniqueSorted)
Beispiel #5
0
__author__ = 'jens'

import util

# Takes a multi-line input, keeps one copy of each line (keyed on its
# lower-cased form), orders the survivors case-insensitively, runs them
# through util.trim and puts the result on the clipboard.
# NOTE(review): util.trim is said to strip leading/trailing spaces; since it
# runs after de-duplication, lines differing only in padding may both
# survive -- confirm against util.

multiLineInput = util.raw_multi_line_input()
_deduplicated = util.create_set(multiLineInput, lambda s: s.lower())
_deduplicated_sorted = sorted(_deduplicated, key=str.lower)
uniqueSorted = util.trim(_deduplicated_sorted)
util.copy_to_clipboard(uniqueSorted)