Example #1
 def test_tok_006(self):
     soluce = ['echo', ' ', '\\$', 'PATH', ' ', '||', ' ', 'echo',
               ' ', '$path', '\n']
     command = ''.join(soluce)
     tokens = []
     tk.tokenize(command, tokens)
     self.assertListEqual(tokens, soluce)
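This test and the similar ones below all follow the same round-trip pattern: build the expected token list (soluce), join it into a single command string, tokenize it, and compare. A self-contained sketch of that pattern, using a stand-in regex splitter rather than the project's tk.tokenize:

import re

def toy_tokenize(command, tokens):
    # Stand-in splitter: split on runs of whitespace and a few shell
    # operators, keeping both kinds of separators as tokens.
    pattern = r'(\s+|&&|\|\||<<|>>|[;&|<>])'
    tokens.extend(t for t in re.split(pattern, command) if t)

soluce = ['echo', ' ', 'hi', ' ', '&&', ' ', 'ls', '\n']
command = ''.join(soluce)
tokens = []
toy_tokenize(command, tokens)
assert tokens == soluce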
Example #2
 def test_tok_007(self):
     soluce = ['echo', ' ', '"', '\n', 'ewline',
               '\n', 'ewline', '"', '\n']
     command = ''.join(soluce)
     tokens = []
     tk.tokenize(command, tokens)
     self.assertListEqual(tokens, soluce)
Example #3
 def test_tok_008(self):
     command = 'echo ${var}_$var${var2}_'
     tokens = []
     tk.tokenize(command, tokens)
     self.assertListEqual(
         tokens, ['echo', ' ', '${', 'var', '}', '_', '$var',
                  '${', 'var2', '}', '_'])
Example #4
 def test_tok_004(self):
     soluce = ['ls', ' ', '-l', ' ', '\\\n', '/',
               ' ', ';', '\n']
     command = ''.join(soluce)
     tokens = []
     tk.tokenize(command, tokens)
     self.assertListEqual(tokens, soluce)
Example #5
 def test_tok_014(self):
     soluce = ['echo', ' ', 'text', '||',
               'file', ' ', '<<', ' ', 'file2', '\n']
     command = ''.join(soluce)
     tokens = []
     tk.tokenize(command, tokens)
     self.assertListEqual(tokens, soluce)
Example #6
def get_morph_analyzes(line, lang="turkish"):
    """
    Run the external morphological analyzer on one sentence.

    :param lang: key into analyzer_command / analyzer_paths (default "turkish")
    :param line: a sentence on a line (untokenized), as str or UTF-8 bytes
    :return: the analyzer's raw output, as returned by subprocess.check_output
    """
    if isinstance(line, str):
        tokens = tokenizer.tokenize(line)
    else:
        tokens = tokenizer.tokenize(line.decode("utf8"))
    fd, f_path = tempfile.mkstemp()
    os.close(fd)  # only the path is needed; reopen it with an explicit encoding
    with open(f_path, "w", encoding="iso-8859-9") as f:
        for token in tokens:
            f.write(token + "\n")
    with codecs.open(f_path, "r", encoding="iso-8859-9") as f, open(os.devnull, "w") as devnull:
        string_output = subprocess.check_output(analyzer_command[lang],
                                                stdin=f,
                                                cwd=analyzer_paths[lang],
                                                stderr=devnull)
    return string_output
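A hypothetical call site, assuming the module-level tokenizer, analyzer_command, and analyzer_paths referenced above are configured for Turkish:

# Hypothetical usage; the sentence and the language key are only illustrative.
raw = get_morph_analyzes("Bu bir deneme cümlesidir.", lang="turkish")
print(raw.decode("iso-8859-9"))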
Example #7
 def test_tok_017(self):
     soluce = ['VAR', ' ', '=', ' ', '0', '\n',
               'VAR', '=', '1', '\n', 'VAR', ' ',
               '+=', '1', '\n']
     command = ''.join(soluce)
     tokens = []
     tk.tokenize(command, tokens)
     self.assertListEqual(tokens, soluce)
Example #8
 def test_tok_009(self):
     soluce = ['echo', ' ', '1', '&&', '\n',
               '<(', 'LOL', ')', '||', '\n',
               'LOL', '\n']
     command = ''.join(soluce)
     tokens = []
     tk.tokenize(command, tokens)
     self.assertListEqual(tokens, soluce)
Example #9
 def test_tok_002(self):
     command = '1 && 0 || $PATH += "~/tmpsh/bin"; $VAR=0 &\necho LOL'
     tokens = []
     tk.tokenize(command, tokens)
     self.assertListEqual(tokens, ['1', ' ', '&&', ' ', '0', ' ', '||', ' ',
                                   '$PATH', ' ', '+=', ' ',
                                   '"', '~/tmpsh/bin', '"', ';', ' ',
                                   '$VAR', '=', '0', ' ', '&', '\n',
                                   'echo', ' ', 'LOL'])
Example #10
 def test_tok_003(self):
     command = '''>(<(echo test > file.txt)) && 		$VAR_TEST| cat'''
     command += ' << HERE\ntestHERE\nHERE'
     tokens = []
     tk.tokenize(command, tokens)
     self.assertListEqual(tokens, ['>(', '<(', 'echo', ' ', 'test',
                                   ' ', '>', ' ', 'file.txt', ')', ')', ' ',
                                   '&&', ' \t\t', '$VAR_TEST', '|', ' ',
                                   'cat', ' ', '<<', ' ', 'HERE', '\n',
                                   'testHERE', '\n', 'HERE'])
Example #11
 def test_tok_015(self):
     soluce = ['echo', ' ', '\\f', '\\i', '\\l', '\\e', '\n', 'VAR',
               ' ', '=', ' ', '0', '\n', 'VAR', '=',
               '1', '\n', 'VAR', ' ', '+=', '1', '\n',
               'echo', ' ', '"', '\\$', 'VAR', ' ', '\\=',
               ' ', '$VAR', '"', '\n']
     command = ''.join(soluce)
     tokens = []
     tk.tokenize(command, tokens)
     self.assertListEqual(tokens, soluce)
Example #12
def get_all_content_words(sentences, N, stem):
    all_words = []
    for s in sentences:
        if stem:
            all_words.extend([stemmer.stem(r) for r in tokenizer.tokenize(s)])
        else:
            all_words.extend(tokenizer.tokenize(s))

    # map() is lazy in Python 3; materialize it so callers get a reusable list.
    normalized_content_words = [normalize_word(w) for w in all_words]
    return normalized_content_words
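One possible wiring of the module-level helpers this function expects (stemmer, tokenizer, normalize_word); the original project may configure them differently:

from nltk.stem.porter import PorterStemmer
from nltk.tokenize import TreebankWordTokenizer

stemmer = PorterStemmer()
tokenizer = TreebankWordTokenizer()
normalize_word = str.lower  # assumed; the real helper may do more

# Prints the normalized stems of the tokens in the input sentences.
print(get_all_content_words(["The cats are running."], N=1, stem=True))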
Example #13
 def test_tok_011(self):
     soluce = ['echo', ' ', '"', '\\"', '"', ' ', '&',
               ';', ' ', '$(', 'echo', ' ', '-n', ' ',
               '1', ')', '||',
               '${', 'echo', '\n', '-n', ' ', '1', '}',
               ';', ' ', '2', '>&', '-', ' ', '>>',
               'file', ' ', '>>', ' ', 'file2', '\n']
     command = ''.join(soluce)
     tokens = []
     tk.tokenize(command, tokens)
     self.assertListEqual(tokens, soluce)
Example #14
 def test_tok_010(self):
     soluce = [' ', '<(', 'LOL', ')', '\n', '"', ' ',
               'LOL', ' ', '"', '\n', '>(', '   ',
               'LOL',
               ')', '\n', "'", 'LOL', '    ', "'", '\n',
               '$(', ' ', 'LOL', ')', '\n', '${', ' ',
               'LOL', '}', '\n', '{', 'LOL', '}', '\n',
               '(', 'LOL', ')', '\n']
     command = ''.join(soluce)
     tokens = []
     tk.tokenize(command, tokens)
     self.assertListEqual(tokens, soluce)
Example #15
    def _build_doc_vocabulary(self):
        # Instantiate BeautifulSoup's HTML parser
        parser = bs4.BeautifulSoup(self.raw_doc, 'html.parser')

        # Tags to remove from the document
        # We drop both script (js) and style (css), since they add no useful information
        clean_up_tags = ['script', 'style']

        for tag_name in clean_up_tags:
            for tag in parser(tag_name):
                # Remove a tag from the document
                tag.extract()

        # Remove anchors but keep their text
        anchor_texts = []
        non_breaking_space = '\xa0'
        for anchor in parser('a'):
            anchor_text = anchor.string
            if anchor_text is not None and len(
                    anchor_text) > 1 and anchor_text != non_breaking_space:
                anchor_texts.append(anchor_text)
            anchor.extract()

        # Get the text of the document after the operations above
        doc_text = parser.get_text()

        # Tokenize (see the tokenize function)
        doc_words = tokenize(doc_text)

        # Tokenize the text of each removed anchor
        anchor_words = [tokenize(anchor_text) for anchor_text in anchor_texts]

        # Flatten into a single list; it was a 2D structure before
        # (several anchors, each tokenized, hence a 2D matrix)
        flatten_words = [item for sublist in anchor_words for item in sublist]

        # Merge the document's tokens with the tokens of the removed anchors
        doc_words = doc_words + flatten_words

        # Initialize the document's vocabulary
        # (a mapping from each word to its frequency in the document)
        self.vocabulary = {}

        stemmer = nltk.stem.porter.PorterStemmer()
        # Count word frequencies
        for word in doc_words:
            token = stemmer.stem(word).lower()
            word_freq = self.vocabulary.get(token, 0)
            self.vocabulary[token] = word_freq + 1
Example #16
 def tokenize_data(self):
     print('Tokenizing data...', flush=True)
     data_sequences = []
     for idx, seq in enumerate(self.driving_data):
         # TODO: experiment with splitting the list instead; far less training data, but possibly less redundant
         data_sequences += tokenize(seq, self.seq_len)
     self.driving_data = data_sequences
Example #17
def render_parser(G, algorithm: str, parser, is_ll1=False):
    """
    Render Parser Subsection
    """
    st.title('Parsear Cadena')
    w = st.text_input("Inserte la cadena a parsear")

    if st.button("comenzar"):
        st.subheader(f'Aplicando: {algorithm}')

        tokens = tokenize(G, w)

        if isinstance(tokens, list):
            productions = parser(tokens)
            if not productions:
                st.error("Error en parsing.\
                    La cadena no pertenece al lenguaje.")
            else:
                st.success("OK")
                if is_ll1:
                    tree = LLDerivationTree(productions)
                else:
                    tree = LRDerivationTree(productions)
                st.graphviz_chart(str(tree.graph()))
        else:
            st.error("Error en tokenize: " + tokens)
Example #18
    def search_query(self,
                     searcher,
                     query,
                     output_fn,
                     collection='robust04',
                     K=1000):
        output_dir = os.path.dirname(output_fn)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        with open(output_fn, 'w', encoding="utf-8") as out:
            sentid2text = {}
            hits = searcher.search(self.JString(query), K)
            for i in range(len(hits)):
                sim = hits[i].score
                docno = hits[i].docid
                content = hits[i].content
                if collection == 'core18':
                    content_json = json.loads(content)
                    content = ''
                    for each in content_json['contents']:
                        if each is not None and 'content' in each.keys():
                            content += '{}\n'.format(each['content'])
                if collection == 'robust04':
                    content = parse_doc_from_index(content)
                clean_content = clean_html(content, collection=collection)
                tokenized_content = tokenizer.tokenize(clean_content)
                sentid = 0
                for sent in tokenized_content:
                    # Split sentence if it's longer than BERT's maximum input length
                    if len(sent.strip().split()) > MAX_INPUT_LENGTH:
                        seq_list = chunk_sent(sent, MAX_INPUT_LENGTH)
                        for seq in seq_list:
                            sentno = docno + '_' + str(sentid)
                            out.write(
                                '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                                    0, round(float(sim), 16), query, seq, 0,
                                    sentno, 0, self.didx))
                            out.flush()
                            sentid += 1
                            self.didx += 1
                            sentid2text[sentno] = seq
                    else:
                        sentno = docno + '_' + str(sentid)
                        out.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                            0, round(float(sim), 16), query, sent, 0, sentno,
                            0, self.didx))
                        out.flush()
                        sentid += 1
                        self.didx += 1
                        sentid2text[sentno] = sent

        return sentid2text
Example #19
    def transform(self, raw_tweets):
        occurrence_list = []
        for tweet in raw_tweets:
            for preprocessor in self.preprocessors:
                tweet = preprocessor(tweet)

            occurrences = self.cluster_dict.copy()
            for token in tokenizer.tokenize(tweet):
                if token in ClusterTransformer.dictionary:
                    occurrences[ClusterTransformer.dictionary[token]] += 1
            occurrence_list.append(occurrences)
        vectorized = self.vectorizer.transform(occurrence_list)
        return normalize(vectorized, axis=0) if self.normalize else vectorized
Example #20
def get_all_content_words(sentences, N):
    all_words = []
    for s in sentences:
        all_words.extend([stemmer.stem(r) for r in tokenizer.tokenize(s)])

    if N == 1:
        content_words = [w for w in all_words if w not in stopset]
    else:
        content_words = all_words

    # map() is lazy in Python 3; materialize it so it can be returned
    # or fed to ngrams() safely.
    normalized_content_words = [normalize_word(w) for w in content_words]
    if N > 1:
        return [gram for gram in ngrams(normalized_content_words, N) if is_ngram_content(gram)]
    return normalized_content_words
Example #21
    def get_documents_for_query(self,
                                field: str,
                                query: str,
                                max_size: int = 10,
                                tf_idf: bool = True) -> [IndexDocument]:
        # Split the query into individual terms
        terms = tokenize(query.lower(), True)

        # Remove stopwords from the query
        if field != 'foot':
            terms = list(filter(lambda term: term not in stopwords, terms))

        # Fetch the documents for each query term
        docs = [self.find_documents(field, term) for term in terms]

        # Flatten the document lists into a single list (it was a matrix before)
        docs = reduce(lambda acc, v: acc + v, docs, [])

        # Remove duplicate documents
        docs = list(set(docs))

        # Turn each document into a vector
        docs_vectors = [DocumentVector(doc, self) for doc in docs]

        # Turn the query terms into a query document
        query_doc = QueryDocument(None, terms)

        # Turn the query document into a vector in the query space
        query_vector = DocumentVector(query_doc, self)
        query_vector.project(query_vector, tf_idf)

        # Project each document vector into the query space
        for doc_vector in docs_vectors:
            doc_vector.project(query_vector, tf_idf)

        # Compute each document's similarity to the query vector and keep the pair so the documents can be retrieved later
        docs_score_vectors = [(query_vector.similarity(doc_vector), doc_vector)
                              for doc_vector in docs_vectors]

        # Sort by score (ascending)
        docs_score_vectors.sort()

        # Reverse so documents with the highest scores come first
        docs_score_vectors.reverse()

        # Drop the scores and keep only the documents
        return [d[1].doc for d in docs_score_vectors][:max_size]
Example #22
def render_parser(G, algorithm: str, parser):
    """
    Render Parser Subsection
    """
    st.title('Parsear Cadena')
    w = st.text_input("Inserte la cadena a parsear")

    if st.button("comenzar"):
        st.subheader(f'Aplicando: {algorithm}')

        tokens = tokenize(G, w)

        if isinstance(tokens, list):
            left_parse = parser(tokens)
            if not left_parse:
                st.error("Error en parsing.\
                    La cadena no pertenece al lenguaje.")
            else:
                st.success("OK")
                st.subheader("Producciones a aplicar:")
                for production in left_parse:
                    st.text(f'{production.Left} -> {production.Right}')
        else:
            st.error("Error en tokenize: " + tokens)
Example #23
def process_questions(questions, include_blacklisted=True):

    # Make a list
    if not isinstance(questions, list):
        questions = [questions]

    # Clean and tokenize
    prepared_questions = []
    for question in questions:
        question = question.strip()
        prepared_questions.append(
            tokenize(question) if question else '##emptyquestion##')

    # Run inference
    answers_list = inference_helper(prepared_questions)
    print("Num of Answer list:" + str(answers_list[0]))
    # Process answers
    prepared_answers_list = []
    for index, answers in enumerate(answers_list):
        # answers = detokenize(answers)
        answers = replace_in_answers(answers, 'answers')
        answers_score = score_answers(answers, 'answers')
        best_index, best_score = get_best_score(answers_score,
                                                include_blacklisted)

        if prepared_questions[index] == '##emptyquestion##':
            prepared_answers_list.append(None)
        else:
            prepared_answers_list.append({
                'answers': answers,
                'scores': answers_score,
                'best_index': best_index,
                'best_score': best_score
            })

    return prepared_answers_list
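A hypothetical call, assuming inference_helper, replace_in_answers, score_answers, and get_best_score are provided by the surrounding module as referenced above:

results = process_questions(["How are you today?"], include_blacklisted=False)
for result in results:
    if result is not None:
        print(result['answers'][result['best_index']], result['best_score'])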
Example #24
def FilterByNLTK():
    dirname = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    data_dir = os.path.join(dirname, 'data')
    data_from_filename = 'data.from'
    data_to_filename = 'data.to'
    train_from_filename = 'train.from'
    train_to_filename = 'train.to'
    test_from_filename = 'test.from'
    test_to_filename = 'test.to'
    dev_from_filename = 'dev.from'
    dev_to_filename = 'dev.to'
    vocab_from_filename = 'vocab.from'
    vocab_to_filename = 'vocab.to'
    vocab_from = []
    vocab_to = []
    tokenized_lines_from = []
    tokenized_lines_to = []
    with open('{}/{}'.format(data_dir, data_from_filename),
              'r',
              encoding='utf-8',
              buffering=131072) as data_from_in:
        lines = ReadLines(data_from_in)
        for line in lines:
            tokenized_lines_from.append(tokenize(line))

    with open('{}/{}'.format(data_dir, data_to_filename),
              'r',
              encoding='utf-8',
              buffering=131072) as data_to_in:
        lines = ReadLines(data_to_in)
        for line in lines:
            tokenized_lines_to.append(tokenize(line))

    # Create vocab from
    vocab_from = GetVocab(tokenized_lines_from)
    # Create vocab to
    vocab_to = GetVocab(tokenized_lines_to)

    count_data = len(tokenized_lines_from)
    count_train = int(count_data * 0.7)  # 70% to train
    count_dev = int(count_data * 0.2)  # 20% to dev
    count_test = count_data - count_train - count_dev  # 10% to test
    # Create train data from and to
    train_from = tokenized_lines_from[:count_train]
    train_to = tokenized_lines_to[:count_train]
    # Create dev data from and to
    dev_from = tokenized_lines_from[count_train:count_train + count_dev]
    dev_to = tokenized_lines_to[count_train:count_train + count_dev]
    # Create test data from and to
    test_from = tokenized_lines_from[count_train + count_dev:]
    test_to = tokenized_lines_to[count_train + count_dev:]

    with open('{}/{}'.format(data_dir, train_from_filename),
              'w',
              encoding='utf-8',
              buffering=131072) as train_from_out:
        WriteLines(train_from_out, train_from)

    with open('{}/{}'.format(data_dir, train_to_filename),
              'w',
              encoding='utf-8',
              buffering=131072) as train_to_out:
        WriteLines(train_to_out, train_to)

    with open('{}/{}'.format(data_dir, dev_from_filename),
              'w',
              encoding='utf-8',
              buffering=131072) as dev_from_out:
        WriteLines(dev_from_out, dev_from)

    with open('{}/{}'.format(data_dir, dev_to_filename),
              'w',
              encoding='utf-8',
              buffering=131072) as dev_to_out:
        WriteLines(dev_to_out, dev_to)

    with open('{}/{}'.format(data_dir, test_from_filename),
              'w',
              encoding='utf-8',
              buffering=131072) as test_from_out:
        WriteLines(test_from_out, test_from)

    with open('{}/{}'.format(data_dir, test_to_filename),
              'w',
              encoding='utf-8',
              buffering=131072) as test_to_out:
        WriteLines(test_to_out, test_to)

    with open('{}/{}'.format(data_dir, vocab_from_filename),
              'w',
              encoding='utf-8',
              buffering=131072) as vocab_from_out:
        WriteLines(vocab_from_out, vocab_from)

    with open('{}/{}'.format(data_dir, vocab_to_filename),
              'w',
              encoding='utf-8',
              buffering=131072) as vocab_to_out:
        WriteLines(vocab_to_out, vocab_to)


# if __name__ == "__main__":
#     # prepare()
#     # Prepare()
#     p = "Parts 3-month inspection 3-month periodic inspection set vehicle collection fee Vehicle delivery charge"
#     words = nltk.tokenize.word_tokenize(p)
#     print(words)
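The 70/20/10 split used above (and in Prepare below) is plain list slicing; a minimal self-contained sketch of the same arithmetic:

data = list(range(10))
count_train = int(len(data) * 0.7)  # 7
count_dev = int(len(data) * 0.2)    # 2
train = data[:count_train]                       # [0, 1, 2, 3, 4, 5, 6]
dev = data[count_train:count_train + count_dev]  # [7, 8]
test = data[count_train + count_dev:]            # [9]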
Example #25
def Prepare():
    dirname = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    data_dir = os.path.join(dirname, 'data')
    data_from_filename = 'data.from'
    data_to_filename = 'data.to'
    train_from_filename = 'train.from'
    train_to_filename = 'train.to'
    test_from_filename = 'test.from'
    test_to_filename = 'test.to'
    dev_from_filename = 'dev.from'
    dev_to_filename = 'dev.to'
    vocab_from_filename = 'vocab.from'
    vocab_to_filename = 'vocab.to'
    vocab_from = []
    vocab_to = []
    tokenized_lines_from = []
    tokenized_lines_to = []
    with open('{}/{}'.format(data_dir, data_from_filename),
              'r',
              encoding='utf-8',
              buffering=131072) as data_from_in:
        lines = ReadLines(data_from_in)
        for line in lines:
            tokenized_lines_from.append(tokenize(line))

    with open('{}/{}'.format(data_dir, data_to_filename),
              'r',
              encoding='utf-8',
              buffering=131072) as data_to_in:
        lines = ReadLines(data_to_in)
        for line in lines:
            words = line.split()
            sentence = " ".join(words)
            sentence = sentence.strip()
            tokenized_lines_to.append(sentence)

    # Create vocab from
    vocab_from = GetVocab(tokenized_lines_from)
    # Create vocab to
    vocab_to = GetVocab(tokenized_lines_to)

    count_data = len(tokenized_lines_from)
    count_train = int(count_data * 0.7)  # 70% to train
    count_dev = int(count_data * 0.2)  # 20% to dev
    count_test = count_data - count_train - count_dev  # 10% to test
    # Create train data from and to
    train_from = tokenized_lines_from[:count_train]
    train_to = tokenized_lines_to[:count_train]
    # Create dev data from and to
    dev_from = tokenized_lines_from[count_train:count_train + count_dev]
    dev_to = tokenized_lines_to[count_train:count_train + count_dev]
    # Create test data from and to
    test_from = tokenized_lines_from[count_train + count_dev:]
    test_to = tokenized_lines_to[count_train + count_dev:]

    with open('{}/{}'.format(data_dir, train_from_filename),
              'w',
              encoding='utf-8',
              buffering=131072) as train_from_out:
        WriteLines(train_from_out, train_from)

    with open('{}/{}'.format(data_dir, train_to_filename),
              'w',
              encoding='utf-8',
              buffering=131072) as train_to_out:
        WriteLines(train_to_out, train_to)

    with open('{}/{}'.format(data_dir, dev_from_filename),
              'w',
              encoding='utf-8',
              buffering=131072) as dev_from_out:
        WriteLines(dev_from_out, dev_from)

    with open('{}/{}'.format(data_dir, dev_to_filename),
              'w',
              encoding='utf-8',
              buffering=131072) as dev_to_out:
        WriteLines(dev_to_out, dev_to)

    with open('{}/{}'.format(data_dir, test_from_filename),
              'w',
              encoding='utf-8',
              buffering=131072) as test_from_out:
        WriteLines(test_from_out, test_from)

    with open('{}/{}'.format(data_dir, test_to_filename),
              'w',
              encoding='utf-8',
              buffering=131072) as test_to_out:
        WriteLines(test_to_out, test_to)

    with open('{}/{}'.format(data_dir, vocab_from_filename),
              'w',
              encoding='utf-8',
              buffering=131072) as vocab_from_out:
        WriteLines(vocab_from_out, vocab_from)

    with open('{}/{}'.format(data_dir, vocab_to_filename),
              'w',
              encoding='utf-8',
              buffering=131072) as vocab_to_out:
        WriteLines(vocab_to_out, vocab_to)
Example #26
 def get_terms(self) -> [str]:
     return [
         token.lower() for token in tokenizer.tokenize(self.query, True)
     ]
Example #27
def training(languages, EMBEDDING, train, test, type_model, pre):

    for lang in languages:
        print('Training ', lang)
        # train_new = train[train["language"] == lang]
        # test_new = test[test["language"] == lang]
        train_new = train
        test_new = test

        train_new['title'] = train_new['title'].str.lower()
        test_new['title'] = test_new['title'].str.lower()

        if type_model == 'three':
            train_new = build_features(train_new)
            test_new = build_features(test_new)

        # train_new["title"] = train_new["title"].progress_apply(lambda x: clean_numbers(x))
        train_new["title"] = train_new["title"].progress_apply(
            lambda x: replace_typical_misspell(x, lang))
        train_new["title"] = train_new["title"].progress_apply(
            lambda x: clean_text(x))
        train_new["title"] = train_new["title"].progress_apply(
            lambda x: normalize_title(x))

        # test_new["title"] = test_new["title"].progress_apply(lambda x: clean_numbers(x))
        test_new["title"] = test_new["title"].progress_apply(
            lambda x: replace_typical_misspell(x, lang))
        test_new["title"] = test_new["title"].progress_apply(
            lambda x: clean_text(x))
        test_new["title"] = test_new["title"].progress_apply(
            lambda x: normalize_title(x))

        X_train = train_new['title']

        Y_train = train_new['category'].values

        classes = train_new["category"].unique()

        X_test = test_new["title"]

        max_features = 20000
        maxlen = 20
        embed_size = 300
        batch_size = 32

        # Generate char embedding without preprocess
        text = (train_new['title'].tolist() + test_new["title"].tolist())
        char_vectorizer = CharVectorizer(max_features, text)
        char_embed_size = char_vectorizer.embed_size

        tok, X_train = tokenize(X_train, X_test, max_features, maxlen, lang)
        glove_embedding_matrix = meta_embedding(tok, EMBEDDING[lang][0],
                                                max_features, embed_size, lang)
        fast_embedding_matrix = meta_embedding(tok, EMBEDDING[lang][1],
                                               max_features, embed_size, lang)

        char_embedding = char_vectorizer.get_char_embedding(tok)

        # embedding_matrix = np.mean([glove_embedding_matrix, fast_embedding_matrix], axis=0)

        embedding_matrix = np.concatenate(
            (glove_embedding_matrix, fast_embedding_matrix, char_embedding),
            axis=1)

        if type_model == 'three':
            # X_train_2 = train_new[train_new['label_quality'] == 'reliable']['small_title']
            X_train_3 = train_new[[
                'n_words', 'length', 'n_chars_word', 'n_capital_letters',
                'n_numbers', 'small_length', 'small_n_chars_word',
                'small_n_capital_letters', 'small_n_numbers'
            ]].values

            # X_train_2 = tok.texts_to_sequences(X_train_2)
            # X_train_2 = sequence.pad_sequences(X_train_2, maxlen=6)

            X_train, X_val, X_train_3, X_val_3, Y_train, Y_val = train_test_split(
                X_train, X_train_3, Y_train, train_size=0.9, random_state=233)

            train_generator = DataGenerator([X_train, X_train_3],
                                            Y_train,
                                            classes,
                                            batch_size=batch_size,
                                            mode=type_model,
                                            train=False)
            val_generator = DataGenerator([X_val, X_val_3],
                                          Y_val,
                                          classes,
                                          batch_size=batch_size,
                                          mode=type_model,
                                          train=False)

        else:

            X_train, X_val, Y_train, Y_val = train_test_split(X_train,
                                                              Y_train,
                                                              train_size=0.9,
                                                              random_state=233)

            train_generator = DataGenerator(X_train,
                                            Y_train,
                                            classes,
                                            batch_size=batch_size,
                                            train=False)
            val_generator = DataGenerator(X_val,
                                          Y_val,
                                          classes,
                                          batch_size=batch_size,
                                          train=False)

        if type_model == 'small':
            model = get_small_model(maxlen, max_features,
                                    2 * embed_size + char_embed_size,
                                    embedding_matrix, len(classes))

        elif type_model == 'three':
            model = get_three_entrys_model(maxlen, max_features,
                                           2 * embed_size + char_embed_size,
                                           embedding_matrix, len(classes))

        else:
            model = get_model(maxlen, max_features,
                              2 * embed_size + char_embed_size,
                              embedding_matrix, len(classes))

            # embedding_matrix = np.mean([glove_embedding_matrix, fast_embedding_matrix], axis=0)

        class_weights = class_weight.compute_class_weight(
            'balanced', classes, Y_train)
        opt = Adam(lr=0.001)

        model.compile(loss=label_smooth_loss,
                      optimizer=opt,
                      metrics=['accuracy'])

        filepath = '../models/' + lang + '_model_{epoch:02d}_{val_acc:.4f}.h5'
        checkpoint = ModelCheckpoint(filepath,
                                     monitor='val_acc',
                                     verbose=1,
                                     save_best_only=False,
                                     mode='max',
                                     save_weights_only=False)
        early = EarlyStopping(monitor="val_loss", mode="min", patience=3)

        # clr = CyclicLR(base_lr=0.0003, max_lr=0.001,
        #                step_size=35000, reduce_on_plateau=1, monitor='val_loss', reduce_factor=10)

        reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                      factor=0.3,
                                      patience=1,
                                      verbose=1,
                                      mode='auto',
                                      epsilon=0.0001,
                                      cooldown=0,
                                      min_lr=0)

        callbacks_list = [checkpoint, early, reduce_lr]

        # lookahead = Lookahead(k=5, alpha=0.5)  # Initialize Lookahead
        # lookahead.inject(model)

        print("Treinando")

        model.fit_generator(generator=train_generator,
                            validation_data=val_generator,
                            callbacks=callbacks_list,
                            class_weight=class_weights,
                            epochs=50,
                            use_multiprocessing=True,
                            workers=42)
Example #28
 def search_document(self,
                     searcher,
                     qid2docid,
                     qid2text,
                     output_fn,
                     collection='robust04',
                     K=1000,
                     topics=None,
                     cv_fold=None):
     output_dir = os.path.dirname(output_fn)
     if not os.path.exists(output_dir):
         os.makedirs(output_dir)
     with open(output_fn, 'w', encoding="utf-8") as out:
         if 'core' in collection:
             # Robust04 provides CV topics
             topics = qid2text
         for qid in topics:
             text = qid2text[qid]
             hits = searcher.search(self.JString(text), K)
             for i in range(len(hits)):
                 sim = hits[i].score
                 docno = hits[i].docid
                 label = 1 if qid in qid2docid and docno in qid2docid[
                     qid] else 0
                 content = hits[i].content
                 if collection == 'core18':
                     content_json = json.loads(content)
                     content = ''
                     for each in content_json['contents']:
                         if each is not None and 'content' in each.keys():
                             content += '{}\n'.format(each['content'])
                 if collection == 'robust04':
                     content = parse_doc_from_index(content)
                 clean_content = clean_html(content, collection=collection)
                 tokenized_content = tokenizer.tokenize(clean_content)
                 sentid = 0
                 for sent in tokenized_content:
                     # Split sentence if it's longer than BERT's maximum input length
                     if len(sent.strip().split()) > MAX_INPUT_LENGTH:
                         seq_list = chunk_sent(sent, MAX_INPUT_LENGTH)
                         for seq in seq_list:
                             sentno = docno + '_' + str(sentid)
                             if cv_fold == '5':
                                 out.write(
                                     '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.
                                     format(label, round(float(sim), 11),
                                            text, seq, qid, sentno, qid,
                                            self.didx - 1))
                             else:
                                 out.write(
                                     '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.
                                     format(label, round(float(sim), 16),
                                            text, seq, qid, sentno,
                                            self.qidx, self.didx))
                             out.flush()
                             sentid += 1
                             self.didx += 1
                     else:
                         sentno = docno + '_' + str(sentid)
                         if cv_fold == '5':
                             out.write(
                                 '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                                     label, round(float(sim), 11), text,
                                     sent, qid, sentno, qid, self.didx - 1))
                         else:
                             out.write(
                                 '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                                     label, round(float(sim),
                                                  16), text, sent, qid,
                                     sentno, self.qidx, self.didx))
                         out.flush()
                         sentid += 1
                         self.didx += 1
             self.qidx += 1
Example #29
                    except KeyError:
                        matrix[-1].append(' ')

            st.subheader('Tabla de parsing')

            frame = pd.DataFrame(matrix, index=rows, columns=columns)
            st.write(frame)

            # Parsing
            st.subheader("Inserte la cadena a parsear")
            w = st.text_area('')

            if st.button("Parsear"):
                parser = metodo_predictivo_no_recursivo(G, M)

                tokens = tokenize(G, w)

                if isinstance(tokens, list):
                    left_parse = parser(tokens)
                    if not left_parse:
                        st.error("Error en parsing.\
                            La cadena no pertenece al lenguaje.")
                    else:
                        st.success("OK")
                        st.subheader("Producciones a aplicar:")
                        for production in left_parse:
                            s = str(production.Left) + ' -> ' + \
                                str(production.Right)
                            st.text(s)
                else:
                    st.error("Error en tokenize: " + tokens)
Example #30
 def test_tok_009(self):
     command = 'echo "yolo"&&ls'
     tokens = []
     tk.tokenize(command, tokens)
     self.assertListEqual(tokens, ['echo', ' ', '"', 'yolo',
                                   '"', '&&', 'ls'])
Example #31
 def test_tok_010(self):
     command = '$$ $? $- $!'
     tokens = []
     tk.tokenize(command, tokens)
     self.assertListEqual(tokens, ['$$', ' ', '$?', ' ', '$-', ' ', '$!'])
Example #32
def tokenize_and_stem(text):
    """
    Tokenize and stem English text
    """
    global stemmer, tokenizer
    return [stemmer.stem(token) for token in tokenizer.tokenize(text)]
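One possible wiring for the globals referenced above, using NLTK's Porter stemmer and a simple regexp tokenizer; this is an assumption, not necessarily the project's configuration:

import nltk

stemmer = nltk.stem.porter.PorterStemmer()
tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")

print(tokenize_and_stem("Tokenizing and stemming English text"))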