Example #1
class WittenBellInterpolatedTrigramTests(unittest.TestCase):
    def setUp(self):
        vocab, training_text = _prepare_test_data(3)
        self.model = WittenBellInterpolated(3, vocabulary=vocab)
        self.model.fit(training_text)

    score_tests = [
        # Unigram scores revert to MLE by default
        # Total unigrams: 18
        # count('c'): 1
        ("c", None, 1.0 / 18),
        # in vocabulary but unseen
        # count("z") = 0
        ("z", None, 0.0 / 18),
        # out of vocabulary should use "UNK" score
        # count("<UNK>") = 3
        ("y", None, 3.0 / 18),
        # gamma(['b']) = 0.1111
        # mle.score('c', ['b']) = 0.5
        # (1 - gamma) * mle + gamma * mle('c') ~= 0.444 + 0.1111 / 18
        ("c", ["b"], (1 - 0.1111) * 0.5 + 0.1111 * 1 / 18),
        # building on that, let's try 'a b c' as the trigram
        # gamma(['a', 'b']) = 0.0667
        # mle("c", ["a", "b"]) = 1
        ("c", ["a", "b"],
         (1 - 0.0667) + 0.0667 * ((1 - 0.1111) * 0.5 + 0.1111 / 18)),
    ]
Example #2
class WittenBellInterpolatedTrigramTests(unittest.TestCase, metaclass=ParametrizeTestsMeta):
    def setUp(self):
        vocab, training_text = _prepare_test_data(3)
        self.model = WittenBellInterpolated(3, vocabulary=vocab)
        self.model.fit(training_text)

    score_tests = [
        # Unigram scores revert to regular MLE by default
        # Total unigrams: 18
        # Vocab Size = 7
        # count('c'): 1
        ("c", None, 1.0 / 18),
        # in vocabulary but unseen
        # count("z") = 0
        ("z", None, 0 / 18),
        # out of vocabulary should use "UNK" score
        # count("<UNK>") = 3
        ("y", None, 3.0 / 18),
        # 2 distinct words follow 'b' and 'b' occurred a total of 2 times
        # gamma(['b']) = 2/(2+2) = 0.5
        # mle.score('c', ['b']) = 0.5
        # mle('c') = 1/18 = 0.0556
        # (1 - gamma) * mle + gamma * mle('c') = 0.25 + 0.5 * 0.0556 ~= 0.28
        ("c", ["b"], (1 - 0.5) * 0.5 + 0.5 * 1 / 18),
        # building on that, let's try 'a b c' as the trigram
        # 1 distinct word follows 'a b' and 'a b' occurred 1 time
        # gamma(['a', 'b']) = 1/(1+1) = 0.5
        # mle("c", ["a", "b"]) = 1
        ("c", ["a", "b"], (1 - 0.5) + 0.5 * ((1 - 0.5) * 0.5 + 0.5 * 1 / 18)),
        # The ngram 'z b c' was not seen, so we should simply revert to
        # the score of the ngram 'b c'. See issue #2332.
        ("c", ["z", "b"], ((1 - 0.5) * 0.5 + 0.5 * 1 / 18)),
    ]
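Each entry in score_tests is a (word, context, expected_score) triple; the ParametrizeTestsMeta metaclass (not shown in this listing) turns each triple into its own test method. A minimal sketch of the assertion each triple amounts to, assuming the standard nltk.lm score API and the _prepare_test_data helper used in setUp above:

# illustrative check, not the metaclass itself
vocab, training_text = _prepare_test_data(3)
model = WittenBellInterpolated(3, vocabulary=vocab)
model.fit(training_text)
for word, context, expected_score in WittenBellInterpolatedTrigramTests.score_tests:
    # score(word, context) is P(word | context) under the interpolated model
    assert abs(model.score(word, context) - expected_score) < 1e-4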
Example #3
def train_texts(train_files, exclude, extension, n_ngram):
    # Training data file
    # train_data_file = "./train/treino.txt"

    # read training data
    #train_data_files = glob.glob('./train/*' + extension)
    train_data_files = train_files.copy()

    if exclude:
        print("Arquivos no diretorio do treino antes de remover o item do test: ", train_data_files)
        train_data_files.remove(exclude)

    print("Arquivos utilizados no treino: ", train_data_files)

    train_texts = ''

    for train_data_file in train_data_files:

        try:
            with open(os.path.join("./train", train_data_file), encoding='utf-8') as f:
                train_text = f.read().lower()
        except OSError:
            print("Não foi possível acessar os arquivos de treino com a extensão ." + extension + " no diretório train.")
            # skip files that cannot be read instead of reusing stale data
            continue

        # apply preprocessing (remove text inside square and curly brackets and strip punctuation)
        train_text = re.sub(r"\[.*\]|\{.*\}", "", train_text)
        train_text = re.sub(r'[^\w\s]', "", train_text)
        train_texts += train_text

    # pad the text and tokenize
    training_data = list(pad_sequence(word_tokenize(train_texts), n_ngram,
                                      pad_left=True,
                                      left_pad_symbol="<s>"))

    print("training_data", training_data)

    # generate ngrams
    ngrams = list(everygrams(training_data, max_len=n_ngram))
    print("Number of ngrams:", len(ngrams))

    # build ngram language models
    model = WittenBellInterpolated(n_ngram)
    model.fit([ngrams], vocabulary_text=training_data)
    print(model.vocab)

    return model
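A hedged usage sketch for train_texts: the file names, the ngram order and the test sentence below are made up for illustration, but pad_sequence and model.score are the same nltk.util / nltk.lm calls the function itself relies on.

# illustrative only: assumes a.txt and b.txt exist under ./train
model = train_texts(["a.txt", "b.txt"], exclude=None, extension="txt", n_ngram=4)

test_tokens = list(pad_sequence(word_tokenize("some held out sentence"),
                                4, pad_left=True, left_pad_symbol="<s>"))
# score each token given the n-1 preceding tokens
for i, word in enumerate(test_tokens[3:]):
    print(word, model.score(word, test_tokens[i:i + 3]))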
Example #4
def train():
    model = WittenBellInterpolated(n)
    data = open("wikiData.txt", 'r', encoding="utf-8")
    i = 0
    line = ''

    vocab = []
    for c in CHARS:
        vocab.append(c)

    for c in DIGITS:
        vocab.append(c)
    vocab.append(' ')
    vocab.append('<s>')
    vocab.append('</s>')

    try:
        while True:
            linep = data.readline()
            if linep == '':
                break
            line = line + linep
            i += 1
            if i % 100 == 0:
                print(i)
            if i % INTERVAL_OF_SAVING == 0:
                tokens = word_tokenize(line)
                for j in range(len(tokens)):
                    tokens[j] = ' ' + tokens[j] + ' '
                # padded_everygram_pipeline returns (ngram generator, vocab iterator);
                # materialize the ngrams so printing them does not exhaust the generator
                train_ngrams, _ = padded_everygram_pipeline(n, tokens)
                train_ngrams = [list(sent) for sent in train_ngrams]
                for t in train_ngrams:
                    print(t)
                model.fit(train_ngrams, vocab)
                print(len(model.vocab))
                line = ''
                try:
                    with open(
                            '(char&int)kilgariff_ngram_model_' + str(i) +
                            '.pkl', 'wb') as fout:
                        pickle.dump(model, fout)
                except IOError:
                    continue
    finally:
        with open('kilgariff_ngram_model_final_.pkl', 'wb') as fout:
            pickle.dump(model, fout)
        data.close()
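The loop above pickles intermediate models as it goes; a minimal sketch of loading the final pickle back and inspecting it (the file name mirrors the one written in the finally block):

import pickle

with open('kilgariff_ngram_model_final_.pkl', 'rb') as fin:
    loaded_model = pickle.load(fin)

# the restored model keeps its vocabulary and ngram counts
print(len(loaded_model.vocab))
print(loaded_model.counts)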
Example #5
def wittenbell_trigram_model(trigram_training_data, vocabulary):
    model = WittenBellInterpolated(3, vocabulary=vocabulary)
    model.fit(trigram_training_data)
    return model
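A hedged sketch of building the two arguments with padded_everygram_pipeline and calling the helper; the toy sentences are made up for illustration.

from nltk.lm import Vocabulary
from nltk.lm.preprocessing import padded_everygram_pipeline

# toy corpus, illustrative only
sentences = [["a", "b", "c"], ["a", "c", "d", "c", "e", "f"]]
trigram_training_data, vocab_text = padded_everygram_pipeline(3, sentences)
vocabulary = Vocabulary(vocab_text, unk_cutoff=1)

model = wittenbell_trigram_model(trigram_training_data, vocabulary)
print(model.score("c", ["a", "b"]))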
Example #6
def success():
    if request.method == 'POST':
        # grab the uploaded file and save it to the upload folder
        a = request.form.getlist('ano')
        f = request.files['file']
        f.save(os.path.join(app.config['UPLOAD_FOLDER'], f.filename))

        # analyze the files:
        ano_salvo = str(a[0])
        resultado_analise = next(
            os.walk("/home/{nome_de_usuario}/mysite/resultados/"))
        path, dirs, files = next(
            os.walk("/home/{nome_de_usuario}/mysite/projetos/" + ano_salvo +
                    "/"))
        file_count = len(files)
        texto = "/home/{nome_de_usuario}/mysite/uploads/" + f.filename
        valores_maximos = []
        valores_medios = []
        valores_arquivo = []

        j = 0
        while j < file_count:
            # Variables:
            texto = "/home/{nome_de_usuario}/mysite/uploads/" + f.filename
            texto_salvo = parser.from_file(
                "/home/{nome_de_usuario}/mysite/projetos/" + ano_salvo + "/" +
                files[j])
            texto_fornecido = parser.from_file(texto)

            # Load "train_text" with the content of "texto_salvo" so it can be analyzed later
            train_text = texto_salvo['content']
            # apply preprocessing (remove text inside square and curly brackets and strip punctuation)
            train_text = re.sub(r"\[.*\]|\{.*\}", "", train_text)
            train_text = re.sub(r'[^\w\s]', "", train_text)

            # set the ngram order
            n = 5

            # pad the text and tokenize
            training_data = list(
                pad_sequence(word_tokenize(train_text),
                             n,
                             pad_left=True,
                             left_pad_symbol="<s>"))

            # generate ngrams
            ngrams = list(everygrams(training_data, max_len=n))

            # build ngram language models
            model = WittenBellInterpolated(n)
            model.fit([ngrams], vocabulary_text=training_data)

            # Load "test_text" with the content of "texto_fornecido" so it can be compared with the training file
            test_text = texto_fornecido['content']
            test_text = re.sub(r'[^\w\s]', "", test_text)

            # Tokenize and pad the text
            testing_data = list(
                pad_sequence(word_tokenize(test_text),
                             n,
                             pad_left=True,
                             left_pad_symbol="<s>"))

            # assign scores
            scores = []
            for i, item in enumerate(testing_data[n - 1:]):
                s = model.score(item, testing_data[i:i + n - 1])
                scores.append(s)

            scores_np = np.array(scores)

            # set width and height
            width = 8
            height = np.ceil(len(testing_data) / width).astype("int64")

            # copy scores into a blank rectangular array
            a = np.zeros(width * height)
            a[:len(scores_np)] = scores_np
            diff = len(a) - len(scores_np)

            # apply Gaussian smoothing for aesthetics
            a = gaussian_filter(a, sigma=1.0)

            # reshape to fit the rectangle
            a = a.reshape(-1, width)

            # format the labels
            labels = [
                " ".join(testing_data[i:i + width])
                for i in range(n - 1, len(testing_data), width)
            ]
            labels_individual = [x.split() for x in labels]
            labels_individual[-1] += [""] * diff
            labels = [f"{x:60.60}" for x in labels]

            # create a heatmap for the visual result
            fig = go.Figure(data=go.Heatmap(
                z=a,
                x0=0,
                dx=1,
                y=labels,
                zmin=0,
                zmax=1,
                customdata=labels_individual,
                hovertemplate=
                '%{customdata} <br><b>Pontuacao:%{z:.3f}<extra></extra>',
                colorscale="burg"))
            fig.update_layout({
                "height": height * 40,
                "width": 1000,
                "font": {
                    "family": "Courier New"
                }
            })
            # create the visual result:
            #plotly.offline.plot(fig, filename='/home/Allberson/mysite/resultados/resultado.html', auto_open=False)

            # store the score data to display later
            valores_scores = np.array(scores)

            # thresholds for the score conditions:
            buscar_max = 0.9000000000000000  # high plagiarism level

            buscar_med = 0.8000000000000000  # above-average level

            # collect the highest copy scores
            maximo = np.where(valores_scores > buscar_max)[0]
            medio = np.where(
                valores_scores > buscar_med)[0]  # not used at the moment
            valores_maximos.insert(j, len(maximo))
            valores_medios.insert(j, len(medio))  # not used at the moment
            valores_arquivo.insert(j, files[j])

            j = j + 1

        # find the file with the highest similarity:
        val_maximo = np.array(valores_maximos)
        val_medio = np.array(valores_medios)
        busc_val_max = 1090
        busc_val_med = 500
        maxx = np.where(val_maximo > busc_val_max)[0]
        medd = np.where(val_medio > busc_val_med)[0]

        # render the results page
        if len(maxx) == 0:
            ano = ano_salvo
            resultado_false = "Nenhum arquivo encontrado que se iguale com o seu"
            os.remove('/home/{nome_de_usuario}/mysite/uploads/' +
                      f.filename)  # remove the file uploaded by the user
            return render_template("resultado_page.html",
                                   name=f.filename,
                                   resultado_neg=resultado_false,
                                   valor_ano=ano)
        elif len(maxx) > 0:
            ano = ano_salvo
            tot_projetos = file_count
            resultado_mensagem = 'Encontramos um projeto com uma grande similaridade.'
            valor = "80%"
            enc = "Encontramos alguns projetos tiveram resultados positivos no momento de nossa análise. Veja a tabela abaixo"
            projetos_nomes_ok = files[int(maxx)]
            mens = "O(s) projeto(s) analisado(s) pode/podem ter um valor igual ou superior ao mostrado na coluna 'valor de cópia' : "
            os.remove('/home/{nome_de_usuario}/mysite/uploads/' + f.filename)
            return render_template("resultado_page.html",
                                   name=f.filename,
                                   mensagem=mens,
                                   resultado_men=resultado_mensagem,
                                   resultado_proj=projetos_nomes_ok,
                                   resultado_max=valor,
                                   encontrado=enc,
                                   valor_ano=ano,
                                   tot_proj=tot_projetos)
# Imports needed by the standalone script below
import io
import os
import requests
from nltk import sent_tokenize
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.tokenize import ToktokTokenizer

# Use the toktok tokenizer that requires no dependencies.
toktok = ToktokTokenizer()
word_tokenize = toktok.tokenize

# Text version of https://kilgarriff.co.uk/Publications/2005-K-lineer.pdf
if os.path.isfile('language-never-random.txt'):
    with io.open('language-never-random.txt', encoding='utf8') as fin:
        text = fin.read()
else:
    url = "https://gist.githubusercontent.com/alvations/53b01e4076573fea47c6057120bb017a/raw/b01ff96a5f76848450e648f35da6497ca9454e4a/language-never-random.txt"
    text = requests.get(url).content.decode('utf8')
    with io.open('language-never-random.txt', 'w', encoding='utf8') as fout:
        fout.write(text)

tokenized_text = [
    list(map(str.lower, word_tokenize(sent))) for sent in sent_tokenize(text)
]

n = 3
train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

from nltk.lm import WittenBellInterpolated
model = WittenBellInterpolated(n)

model.fit(train_data, padded_sents)

print(model.logscore("never", "language is".split()))
print(model.logscore("am", "language is".split()))
print(model.logscore("a", "language is".split()))
print(model.logscore("the", "language is".split()))
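logscore returns a base-2 log probability, so the values printed above can be mapped back to probabilities, or summarized as the perplexity of a short test segment; a minimal sketch (the test trigrams below are made up from the document's title):

# convert a base-2 log score back to a probability
print(2 ** model.logscore("never", "language is".split()))

# perplexity over a short, illustrative list of trigram tuples
test_ngrams = [("language", "is", "never"), ("is", "never", "random")]
print(model.perplexity(test_ngrams))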