def test_tok_006(self):
    soluce = ['echo', ' ', '\\$', 'PATH', ' ', '||', ' ', 'echo', ' ',
              '$path', '\n']
    command = ''.join(soluce)
    tokens = []
    tk.tokenize(command, tokens)
    self.assertListEqual(tokens, soluce)

def test_tok_007(self):
    soluce = ['echo', ' ', '"', '\n', 'ewline', '\n', 'ewline', '"', '\n']
    command = ''.join(soluce)
    tokens = []
    tk.tokenize(command, tokens)
    self.assertListEqual(tokens, soluce)

def test_tok_008(self):
    command = 'echo ${var}_$var${var2}_'
    tokens = []
    tk.tokenize(command, tokens)
    self.assertListEqual(
        tokens,
        ['echo', ' ', '${', 'var', '}', '_', '$var', '${', 'var2', '}', '_'])

def test_tok_004(self):
    soluce = ['ls', ' ', '-l', ' ', '\\\n', '/', ' ', ';', '\n']
    command = ''.join(soluce)
    tokens = []
    tk.tokenize(command, tokens)
    self.assertListEqual(tokens, soluce)

def test_tok_014(self):
    soluce = ['echo', ' ', 'text', '||', 'file', ' ', '<<', ' ', 'file2', '\n']
    command = ''.join(soluce)
    tokens = []
    tk.tokenize(command, tokens)
    self.assertListEqual(tokens, soluce)

def get_morph_analyzes(line, lang="turkish"):
    """
    Run the external morphological analyzer on a single untokenized sentence.

    :param line: a sentence on a line (untokenized), str or utf-8 bytes
    :param lang: analyzer language key (default: "turkish")
    :return: raw analyzer output as iso-8859-9 encoded bytes
    """
    if isinstance(line, str):
        tokens = tokenizer.tokenize(line)
    else:
        tokens = tokenizer.tokenize(line.decode("utf8"))
    # Create a temp file; close the low-level handle and reopen by path below.
    fd, f_path = tempfile.mkstemp()
    os.close(fd)
    # Write one token per line in the analyzer's expected encoding.
    with codecs.open(f_path, "w", encoding="iso-8859-9") as f:
        for token in tokens:
            f.write(token + "\n")
    print(f_path)
    with codecs.open(f_path, "r", encoding="iso-8859-9") as f, \
            open(os.devnull, "w") as devnull:
        string_output = subprocess.check_output(analyzer_command[lang],
                                                stdin=f,
                                                cwd=analyzer_paths[lang],
                                                stderr=devnull)
    return string_output

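# A minimal usage sketch for get_morph_analyzes, assuming module-level
# `tokenizer`, `analyzer_command`, and `analyzer_paths` are configured
# (none of them are defined in this snippet):
#
#   raw = get_morph_analyzes("Bu bir deneme .")
#   print(raw.decode("iso-8859-9"))
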
def test_tok_017(self):
    soluce = ['VAR', ' ', '=', ' ', '0', '\n', 'VAR', '=', '1', '\n',
              'VAR', ' ', '+=', '1', '\n']
    command = ''.join(soluce)
    tokens = []
    tk.tokenize(command, tokens)
    self.assertListEqual(tokens, soluce)

def test_tok_009(self):
    soluce = ['echo', ' ', '1', '&&', '\n', '<(', 'LOL', ')', '||', '\n',
              'LOL', '\n']
    command = ''.join(soluce)
    tokens = []
    tk.tokenize(command, tokens)
    self.assertListEqual(tokens, soluce)

def test_tok_002(self):
    command = '1 && 0 || $PATH += "~/tmpsh/bin"; $VAR=0 &\necho LOL'
    tokens = []
    tk.tokenize(command, tokens)
    self.assertListEqual(tokens, [
        '1', ' ', '&&', ' ', '0', ' ', '||', ' ', '$PATH', ' ', '+=', ' ',
        '"', '~/tmpsh/bin', '"', ';', ' ', '$VAR', '=', '0', ' ', '&', '\n',
        'echo', ' ', 'LOL'
    ])

def test_tok_003(self):
    command = '>(<(echo test > file.txt)) && \t\t$VAR_TEST| cat'
    command += ' << HERE\ntestHERE\nHERE'
    tokens = []
    tk.tokenize(command, tokens)
    self.assertListEqual(tokens, [
        '>(', '<(', 'echo', ' ', 'test', ' ', '>', ' ', 'file.txt', ')', ')',
        ' ', '&&', ' \t\t', '$VAR_TEST', '|', ' ', 'cat', ' ', '<<', ' ',
        'HERE', '\n', 'testHERE', '\n', 'HERE'
    ])

def test_tok_015(self):
    soluce = ['echo', ' ', '\\f', '\\i', '\\l', '\\e', '\n', 'VAR', ' ', '=',
              ' ', '0', '\n', 'VAR', '=', '1', '\n', 'VAR', ' ', '+=', '1',
              '\n', 'echo', ' ', '"', '\\$', 'VAR', ' ', '\\=', ' ', '$VAR',
              '"', '\n']
    command = ''.join(soluce)
    tokens = []
    tk.tokenize(command, tokens)
    self.assertListEqual(tokens, soluce)

def get_all_content_words(sentences, N, stem):
    # N is accepted for interface symmetry with the n-gram variant of this
    # function but is not used here.
    all_words = []
    for s in sentences:
        if stem:
            all_words.extend([stemmer.stem(r) for r in tokenizer.tokenize(s)])
        else:
            all_words.extend(tokenizer.tokenize(s))
    normalized_content_words = map(normalize_word, all_words)
    return normalized_content_words

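# Note that map() returns a lazy iterator in Python 3, so a caller that needs
# to iterate the result more than once should materialize it first. A
# hypothetical call:
#
#   words = list(get_all_content_words(["A test sentence ."], N=1, stem=True))
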
def test_tok_011(self):
    soluce = ['echo', ' ', '"', '\\"', '"', ' ', '&', ';', ' ', '$(', 'echo',
              ' ', '-n', ' ', '1', ')', '||', '${', 'echo', '\n', '-n', ' ',
              '1', '}', ';', ' ', '2', '>&', '-', ' ', '>>', 'file', ' ',
              '>>', ' ', 'file2', '\n']
    command = ''.join(soluce)
    tokens = []
    tk.tokenize(command, tokens)
    self.assertListEqual(tokens, soluce)

def test_tok_010(self):
    soluce = [' ', '<(', 'LOL', ')', '\n', '"', ' ', 'LOL', ' ', '"', '\n',
              '>(', ' ', 'LOL', ')', '\n', "'", 'LOL', ' ', "'", '\n',
              '$(', ' ', 'LOL', ')', '\n', '${', ' ', 'LOL', '}', '\n',
              '{', 'LOL', '}', '\n', '(', 'LOL', ')', '\n']
    command = ''.join(soluce)
    tokens = []
    tk.tokenize(command, tokens)
    self.assertListEqual(tokens, soluce)

def _build_doc_vocabulary(self):
    # Instantiate BeautifulSoup's HTML parser
    parser = bs4.BeautifulSoup(self.raw_doc, 'html.parser')
    # Tags to be removed from the document: neither script (js) nor
    # style (css) carries useful textual information
    clean_up_tags = ['script', 'style']
    for tag_name in clean_up_tags:
        for tag in parser(tag_name):
            # Remove the tag from the document
            tag.extract()
    # Remove anchors but keep their texts
    anchor_texts = []
    non_breaking_space = '\xa0'
    for anchor in parser('a'):
        anchor_text = anchor.string
        if (anchor_text is not None and len(anchor_text) > 1
                and anchor_text != non_breaking_space):
            anchor_texts.append(anchor_text)
        anchor.extract()
    # Get the document text resulting from the operations above
    doc_text = parser.get_text()
    # Tokenize it (see the tokenize function)
    doc_words = tokenize(doc_text)
    # Tokenize the text of each removed anchor
    anchor_words = [tokenize(anchor_text) for anchor_text in anchor_texts]
    # Flatten into a single list; it was a 2D structure before
    # (several anchors, each tokenized separately)
    flatten_words = [item for sublist in anchor_words for item in sublist]
    # Join the document tokens with the tokens of the removed anchors
    doc_words = doc_words + flatten_words
    # Initialize the document vocabulary
    # (a mapping from words to their frequency in the document)
    self.vocabulary = {}
    stemmer = nltk.stem.porter.PorterStemmer()
    # Count frequencies
    for word in doc_words:
        token = stemmer.stem(word).lower()
        word_freq = self.vocabulary.get(token, 0)
        self.vocabulary[token] = word_freq + 1

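# A hypothetical usage sketch (the enclosing document class and tokenize()
# are defined elsewhere): after the call, self.vocabulary maps stemmed,
# lowercased tokens to their in-document frequencies.
#
#   doc._build_doc_vocabulary()
#   most_frequent = max(doc.vocabulary, key=doc.vocabulary.get)
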
def tokenize_data(self):
    print('Tokenizing data...', flush=True)
    data_sequences = []
    # TODO: experiment with splitting the list instead; a lot less training
    # data, but possibly less redundant data
    for seq in self.driving_data:
        data_sequences += tokenize(seq, self.seq_len)
    self.driving_data = data_sequences

def render_parser(G, algorithm: str, parser, is_ll1=False):
    """
    Render Parser Subsection
    """
    st.title('Parsear Cadena')
    w = st.text_input("Inserte la cadena a parsear")
    if st.button("comenzar"):
        st.subheader(f'Aplicando: {algorithm}')
        tokens = tokenize(G, w)
        if isinstance(tokens, list):
            productions = parser(tokens)
            if not productions:
                st.error("Error en parsing. "
                         "La cadena no pertenece al lenguaje.")
            else:
                st.success("OK")
                if is_ll1:
                    tree = LLDerivationTree(productions)
                else:
                    tree = LRDerivationTree(productions)
                st.graphviz_chart(str(tree.graph()))
        else:
            st.error("Error en tokenize: " + tokens)

def search_query(self, searcher, query, output_fn, collection='robust04',
                 K=1000):
    output_dir = os.path.dirname(output_fn)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    with open(output_fn, 'w', encoding="utf-8") as out:
        sentid2text = {}
        hits = searcher.search(self.JString(query), K)
        for i in range(len(hits)):
            sim = hits[i].score
            docno = hits[i].docid
            content = hits[i].content
            if collection == 'core18':
                content_json = json.loads(content)
                content = ''
                for each in content_json['contents']:
                    if each is not None and 'content' in each.keys():
                        content += '{}\n'.format(each['content'])
            if collection == 'robust04':
                content = parse_doc_from_index(content)
            clean_content = clean_html(content, collection=collection)
            tokenized_content = tokenizer.tokenize(clean_content)
            sentid = 0
            for sent in tokenized_content:
                # Split the sentence if it is longer than BERT's maximum
                # input length
                if len(sent.strip().split()) > MAX_INPUT_LENGTH:
                    seq_list = chunk_sent(sent, MAX_INPUT_LENGTH)
                    for seq in seq_list:
                        sentno = docno + '_' + str(sentid)
                        out.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                            0, round(float(sim), 16), query, seq, 0, sentno,
                            0, self.didx))
                        out.flush()
                        sentid += 1
                        self.didx += 1
                        sentid2text[sentno] = seq
                else:
                    sentno = docno + '_' + str(sentid)
                    out.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                        0, round(float(sim), 16), query, sent, 0, sentno, 0,
                        self.didx))
                    out.flush()
                    sentid += 1
                    self.didx += 1
                    sentid2text[sentno] = sent
    return sentid2text

def transform(self, raw_tweets):
    occurrence_list = []
    for tweet in raw_tweets:
        for preprocessor in self.preprocessors:
            tweet = preprocessor(tweet)
        occurrences = self.cluster_dict.copy()
        for token in tokenizer.tokenize(tweet):
            if token in ClusterTransformer.dictionary:
                occurrences[ClusterTransformer.dictionary[token]] += 1
        occurrence_list.append(occurrences)
    vectorized = self.vectorizer.transform(occurrence_list)
    return normalize(vectorized, axis=0) if self.normalize else vectorized

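# A hypothetical call, assuming the transformer was constructed with its
# preprocessors, cluster dictionary, and a fitted `vectorizer` (none of
# which are defined in this snippet):
#
#   features = cluster_transformer.transform(["some raw tweet text"])
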
def get_all_content_words(sentences, N):
    all_words = []
    for s in sentences:
        all_words.extend([stemmer.stem(r) for r in tokenizer.tokenize(s)])
    if N == 1:
        content_words = [w for w in all_words if w not in stopset]
    else:
        content_words = all_words
    normalized_content_words = map(normalize_word, content_words)
    if N > 1:
        return [gram for gram in ngrams(normalized_content_words, N)
                if is_ngram_content(gram)]
    return normalized_content_words

def get_documents_for_query(self, field: str, query: str, max_size: int = 10,
                            tf_idf: bool = True) -> [IndexDocument]:
    # Split the query into terms
    terms = tokenize(query.lower(), True)
    # Remove stopwords from the query
    if field != 'foot':
        terms = list(filter(lambda term: term not in stopwords, terms))
    # Look up all documents for each query term
    docs = [self.find_documents(field, term) for term in terms]
    # Flatten the per-term document lists into a single list
    docs = reduce(lambda acc, v: acc + v, docs, [])
    # Remove duplicate documents
    docs = list(set(docs))
    # Turn each document into a vector
    docs_vectors = [DocumentVector(doc, self) for doc in docs]
    # Turn the query terms into a query document
    query_doc = QueryDocument(None, terms)
    # Turn the query document into a vector in the query space
    query_vector = DocumentVector(query_doc, self)
    query_vector.project(query_vector, tf_idf)
    # Project each document vector into the query space
    for doc_vector in docs_vectors:
        doc_vector.project(query_vector, tf_idf)
    # Compute each document's similarity to the query vector and pair it
    # with the document so we can recover it afterwards
    docs_score_vectors = [(query_vector.similarity(doc_vector), doc_vector)
                          for doc_vector in docs_vectors]
    # Sort by score (ascending), then reverse so the highest-scoring
    # documents come first
    docs_score_vectors.sort()
    docs_score_vectors.reverse()
    # Discard the scores and keep only the documents
    return [d[1].doc for d in docs_score_vectors][:max_size]

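# A hypothetical call (the index object, stopwords, and the vector classes
# live elsewhere): returns at most `max_size` documents, best-scoring first.
#
#   top_docs = index.get_documents_for_query('body', 'vector space model',
#                                            max_size=5)
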
def render_parser(G, algorithm: str, parser):
    """
    Render Parser Subsection
    """
    st.title('Parsear Cadena')
    w = st.text_input("Inserte la cadena a parsear")
    if st.button("comenzar"):
        st.subheader(f'Aplicando: {algorithm}')
        tokens = tokenize(G, w)
        if isinstance(tokens, list):
            left_parse = parser(tokens)
            if not left_parse:
                st.error("Error en parsing. "
                         "La cadena no pertenece al lenguaje.")
            else:
                st.success("OK")
                st.subheader("Producciones a aplicar:")
                for production in left_parse:
                    st.text(f'{production.Left} -> {production.Right}')
        else:
            st.error("Error en tokenize: " + tokens)

def process_questions(questions, include_blacklisted=True):
    # Make a list
    if not isinstance(questions, list):
        questions = [questions]
    # Clean and tokenize
    prepared_questions = []
    for question in questions:
        question = question.strip()
        prepared_questions.append(
            tokenize(question) if question else '##emptyquestion##')
    # Run inference
    answers_list = inference_helper(prepared_questions)
    print("First answer list: " + str(answers_list[0]))
    # Process answers
    prepared_answers_list = []
    for index, answers in enumerate(answers_list):
        # answers = detokenize(answers)
        answers = replace_in_answers(answers, 'answers')
        answers_score = score_answers(answers, 'answers')
        best_index, best_score = get_best_score(answers_score,
                                                include_blacklisted)
        if prepared_questions[index] == '##emptyquestion##':
            prepared_answers_list.append(None)
        else:
            prepared_answers_list.append({
                'answers': answers,
                'scores': answers_score,
                'best_index': best_index,
                'best_score': best_score
            })
    return prepared_answers_list

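# A minimal usage sketch, assuming the helpers above (tokenize,
# inference_helper, replace_in_answers, score_answers, get_best_score) are
# wired up elsewhere:
#
#   results = process_questions("What time is it?")
#   if results[0] is not None:
#       best = results[0]['answers'][results[0]['best_index']]
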
def FilterByNLTK():
    dirname = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    data_dir = os.path.join(dirname, 'data')
    data_from_filename = 'data.from'
    data_to_filename = 'data.to'
    train_from_filename = 'train.from'
    train_to_filename = 'train.to'
    test_from_filename = 'test.from'
    test_to_filename = 'test.to'
    dev_from_filename = 'dev.from'
    dev_to_filename = 'dev.to'
    vocab_from_filename = 'vocab.from'
    vocab_to_filename = 'vocab.to'
    tokenized_lines_from = []
    tokenized_lines_to = []
    with open('{}/{}'.format(data_dir, data_from_filename), 'r',
              encoding='utf-8', buffering=131072) as data_from_in:
        lines = ReadLines(data_from_in)
        for line in lines:
            tokenized_lines_from.append(tokenize(line))
    with open('{}/{}'.format(data_dir, data_to_filename), 'r',
              encoding='utf-8', buffering=131072) as data_to_in:
        lines = ReadLines(data_to_in)
        for line in lines:
            tokenized_lines_to.append(tokenize(line))
    # Create the source and target vocabularies
    vocab_from = GetVocab(tokenized_lines_from)
    vocab_to = GetVocab(tokenized_lines_to)
    count_data = len(tokenized_lines_from)
    count_train = int(count_data * 0.7)  # 70% to train
    count_dev = int(count_data * 0.2)  # 20% to dev
    count_test = count_data - count_train - count_dev  # 10% to test
    # Create train data, from and to
    train_from = tokenized_lines_from[:count_train]
    train_to = tokenized_lines_to[:count_train]
    # Create dev data, from and to
    dev_from = tokenized_lines_from[count_train:count_train + count_dev]
    dev_to = tokenized_lines_to[count_train:count_train + count_dev]
    # Create test data, from and to
    test_from = tokenized_lines_from[count_train + count_dev:]
    test_to = tokenized_lines_to[count_train + count_dev:]
    with open('{}/{}'.format(data_dir, train_from_filename), 'w',
              encoding='utf-8', buffering=131072) as train_from_out:
        WriteLines(train_from_out, train_from)
    with open('{}/{}'.format(data_dir, train_to_filename), 'w',
              encoding='utf-8', buffering=131072) as train_to_out:
        WriteLines(train_to_out, train_to)
    with open('{}/{}'.format(data_dir, dev_from_filename), 'w',
              encoding='utf-8', buffering=131072) as dev_from_out:
        WriteLines(dev_from_out, dev_from)
    with open('{}/{}'.format(data_dir, dev_to_filename), 'w',
              encoding='utf-8', buffering=131072) as dev_to_out:
        WriteLines(dev_to_out, dev_to)
    with open('{}/{}'.format(data_dir, test_from_filename), 'w',
              encoding='utf-8', buffering=131072) as test_from_out:
        WriteLines(test_from_out, test_from)
    with open('{}/{}'.format(data_dir, test_to_filename), 'w',
              encoding='utf-8', buffering=131072) as test_to_out:
        WriteLines(test_to_out, test_to)
    with open('{}/{}'.format(data_dir, vocab_from_filename), 'w',
              encoding='utf-8', buffering=131072) as vocab_from_out:
        WriteLines(vocab_from_out, vocab_from)
    with open('{}/{}'.format(data_dir, vocab_to_filename), 'w',
              encoding='utf-8', buffering=131072) as vocab_to_out:
        WriteLines(vocab_to_out, vocab_to)


# if __name__ == "__main__":
#     # prepare()
#     # Prepare()
#     p = "Parts 3-month inspection 3-month periodic inspection set vehicle collection fee Vehicle delivery charge"
#     words = nltk.tokenize.word_tokenize(p)
#     print(words)

def Prepare():
    dirname = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    data_dir = os.path.join(dirname, 'data')
    data_from_filename = 'data.from'
    data_to_filename = 'data.to'
    train_from_filename = 'train.from'
    train_to_filename = 'train.to'
    test_from_filename = 'test.from'
    test_to_filename = 'test.to'
    dev_from_filename = 'dev.from'
    dev_to_filename = 'dev.to'
    vocab_from_filename = 'vocab.from'
    vocab_to_filename = 'vocab.to'
    tokenized_lines_from = []
    tokenized_lines_to = []
    with open('{}/{}'.format(data_dir, data_from_filename), 'r',
              encoding='utf-8', buffering=131072) as data_from_in:
        lines = ReadLines(data_from_in)
        for line in lines:
            tokenized_lines_from.append(tokenize(line))
    with open('{}/{}'.format(data_dir, data_to_filename), 'r',
              encoding='utf-8', buffering=131072) as data_to_in:
        lines = ReadLines(data_to_in)
        for line in lines:
            # The target side is only whitespace-normalized, not tokenized
            words = line.split()
            sentence = " ".join(words).strip()
            tokenized_lines_to.append(sentence)
    # Create the source and target vocabularies
    vocab_from = GetVocab(tokenized_lines_from)
    vocab_to = GetVocab(tokenized_lines_to)
    count_data = len(tokenized_lines_from)
    count_train = int(count_data * 0.7)  # 70% to train
    count_dev = int(count_data * 0.2)  # 20% to dev
    count_test = count_data - count_train - count_dev  # 10% to test
    # Create train data, from and to
    train_from = tokenized_lines_from[:count_train]
    train_to = tokenized_lines_to[:count_train]
    # Create dev data, from and to
    dev_from = tokenized_lines_from[count_train:count_train + count_dev]
    dev_to = tokenized_lines_to[count_train:count_train + count_dev]
    # Create test data, from and to
    test_from = tokenized_lines_from[count_train + count_dev:]
    test_to = tokenized_lines_to[count_train + count_dev:]
    with open('{}/{}'.format(data_dir, train_from_filename), 'w',
              encoding='utf-8', buffering=131072) as train_from_out:
        WriteLines(train_from_out, train_from)
    with open('{}/{}'.format(data_dir, train_to_filename), 'w',
              encoding='utf-8', buffering=131072) as train_to_out:
        WriteLines(train_to_out, train_to)
    with open('{}/{}'.format(data_dir, dev_from_filename), 'w',
              encoding='utf-8', buffering=131072) as dev_from_out:
        WriteLines(dev_from_out, dev_from)
    with open('{}/{}'.format(data_dir, dev_to_filename), 'w',
              encoding='utf-8', buffering=131072) as dev_to_out:
        WriteLines(dev_to_out, dev_to)
    with open('{}/{}'.format(data_dir, test_from_filename), 'w',
              encoding='utf-8', buffering=131072) as test_from_out:
        WriteLines(test_from_out, test_from)
    with open('{}/{}'.format(data_dir, test_to_filename), 'w',
              encoding='utf-8', buffering=131072) as test_to_out:
        WriteLines(test_to_out, test_to)
    with open('{}/{}'.format(data_dir, vocab_from_filename), 'w',
              encoding='utf-8', buffering=131072) as vocab_from_out:
        WriteLines(vocab_from_out, vocab_from)
    with open('{}/{}'.format(data_dir, vocab_to_filename), 'w',
              encoding='utf-8', buffering=131072) as vocab_to_out:
        WriteLines(vocab_to_out, vocab_to)

def get_terms(self) -> [str]:
    return [token.lower() for token in tokenizer.tokenize(self.query, True)]

def training(languages, EMBEDDING, train, test, type_model, pre):
    for lang in languages:
        print('Training ', lang)
        # train_new = train[train["language"] == lang]
        # test_new = test[test["language"] == lang]
        train_new = train
        test_new = test
        train_new['title'] = train_new['title'].str.lower()
        test_new['title'] = test_new['title'].str.lower()
        if type_model == 'three':
            train_new = build_features(train_new)
            test_new = build_features(test_new)
        # train_new["title"] = train_new["title"].progress_apply(lambda x: clean_numbers(x))
        train_new["title"] = train_new["title"].progress_apply(
            lambda x: replace_typical_misspell(x, lang))
        train_new["title"] = train_new["title"].progress_apply(
            lambda x: clean_text(x))
        train_new["title"] = train_new["title"].progress_apply(
            lambda x: normalize_title(x))
        # test_new["title"] = test_new["title"].progress_apply(lambda x: clean_numbers(x))
        test_new["title"] = test_new["title"].progress_apply(
            lambda x: replace_typical_misspell(x, lang))
        test_new["title"] = test_new["title"].progress_apply(
            lambda x: clean_text(x))
        test_new["title"] = test_new["title"].progress_apply(
            lambda x: normalize_title(x))
        X_train = train_new['title']
        Y_train = train_new['category'].values
        classes = train_new["category"].unique()
        X_test = test_new["title"]
        max_features = 20000
        maxlen = 20
        embed_size = 300
        batch_size = 32
        # Generate the char embedding without preprocessing
        text = (train_new['title'].tolist() + test_new["title"].tolist())
        char_vectorizer = CharVectorizer(max_features, text)
        char_embed_size = char_vectorizer.embed_size
        tok, X_train = tokenize(X_train, X_test, max_features, maxlen, lang)
        glove_embedding_matrix = meta_embedding(tok, EMBEDDING[lang][0],
                                                max_features, embed_size, lang)
        fast_embedding_matrix = meta_embedding(tok, EMBEDDING[lang][1],
                                               max_features, embed_size, lang)
        char_embedding = char_vectorizer.get_char_embedding(tok)
        # embedding_matrix = np.mean([glove_embedding_matrix, fast_embedding_matrix], axis=0)
        embedding_matrix = np.concatenate(
            (glove_embedding_matrix, fast_embedding_matrix, char_embedding),
            axis=1)
        if type_model == 'three':
            # X_train_2 = train_new[train_new['label_quality'] == 'reliable']['small_title']
            X_train_3 = train_new[[
                'n_words', 'length', 'n_chars_word', 'n_capital_letters',
                'n_numbers', 'small_length', 'small_n_chars_word',
                'small_n_capital_letters', 'small_n_numbers'
            ]].values
            # X_train_2 = tok.texts_to_sequences(X_train_2)
            # X_train_2 = sequence.pad_sequences(X_train_2, maxlen=6)
            X_train, X_val, X_train_3, X_val_3, Y_train, Y_val = train_test_split(
                X_train, X_train_3, Y_train, train_size=0.9, random_state=233)
            train_generator = DataGenerator([X_train, X_train_3], Y_train,
                                            classes, batch_size=batch_size,
                                            mode=type_model, train=False)
            val_generator = DataGenerator([X_val, X_val_3], Y_val, classes,
                                          batch_size=batch_size,
                                          mode=type_model, train=False)
        else:
            X_train, X_val, Y_train, Y_val = train_test_split(
                X_train, Y_train, train_size=0.9, random_state=233)
            train_generator = DataGenerator(X_train, Y_train, classes,
                                            batch_size=batch_size, train=False)
            val_generator = DataGenerator(X_val, Y_val, classes,
                                          batch_size=batch_size, train=False)
        if type_model == 'small':
            model = get_small_model(maxlen, max_features,
                                    2 * embed_size + char_embed_size,
                                    embedding_matrix, len(classes))
        elif type_model == 'three':
            model = get_three_entrys_model(maxlen, max_features,
                                           2 * embed_size + char_embed_size,
                                           embedding_matrix, len(classes))
        else:
            model = get_model(maxlen, max_features,
                              2 * embed_size + char_embed_size,
                              embedding_matrix, len(classes))
        # Note: newer scikit-learn versions require keyword arguments here
        class_weights = class_weight.compute_class_weight(
            'balanced', classes, Y_train)
        opt = Adam(lr=0.001)
        model.compile(loss=label_smooth_loss, optimizer=opt,
                      metrics=['accuracy'])
        filepath = '../models/' + lang + '_model_{epoch:02d}_{val_acc:.4f}.h5'
        checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
                                     save_best_only=False, mode='max',
                                     save_weights_only=False)
        early = EarlyStopping(monitor="val_loss", mode="min", patience=3)
        # clr = CyclicLR(base_lr=0.0003, max_lr=0.001, step_size=35000,
        #                reduce_on_plateau=1, monitor='val_loss',
        #                reduce_factor=10)
        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.3,
                                      patience=1, verbose=1, mode='auto',
                                      epsilon=0.0001, cooldown=0, min_lr=0)
        callbacks_list = [checkpoint, early, reduce_lr]
        # lookahead = Lookahead(k=5, alpha=0.5)  # Initialize Lookahead
        # lookahead.inject(model)
        print("Training")
        model.fit_generator(generator=train_generator,
                            validation_data=val_generator,
                            callbacks=callbacks_list,
                            class_weight=class_weights,
                            epochs=50,
                            use_multiprocessing=True,
                            workers=42)

def search_document(self, searcher, qid2docid, qid2text, output_fn,
                    collection='robust04', K=1000, topics=None, cv_fold=None):
    output_dir = os.path.dirname(output_fn)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    with open(output_fn, 'w', encoding="utf-8") as out:
        if 'core' in collection:
            # Robust04 provides CV topics; for core collections use all queries
            topics = qid2text
        for qid in topics:
            text = qid2text[qid]
            hits = searcher.search(self.JString(text), K)
            for i in range(len(hits)):
                sim = hits[i].score
                docno = hits[i].docid
                label = 1 if qid in qid2docid and docno in qid2docid[qid] else 0
                content = hits[i].content
                if collection == 'core18':
                    content_json = json.loads(content)
                    content = ''
                    for each in content_json['contents']:
                        if each is not None and 'content' in each.keys():
                            content += '{}\n'.format(each['content'])
                if collection == 'robust04':
                    content = parse_doc_from_index(content)
                clean_content = clean_html(content, collection=collection)
                tokenized_content = tokenizer.tokenize(clean_content)
                sentid = 0
                for sent in tokenized_content:
                    # Split the sentence if it is longer than BERT's maximum
                    # input length
                    if len(sent.strip().split()) > MAX_INPUT_LENGTH:
                        seq_list = chunk_sent(sent, MAX_INPUT_LENGTH)
                        for seq in seq_list:
                            sentno = docno + '_' + str(sentid)
                            if cv_fold == '5':
                                out.write(
                                    '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                                        label, round(float(sim), 11), text,
                                        seq, qid, sentno, qid, self.didx - 1))
                            else:
                                out.write(
                                    '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                                        label, round(float(sim), 16), text,
                                        seq, qid, sentno, self.qidx,
                                        self.didx))
                            out.flush()
                            sentid += 1
                            self.didx += 1
                    else:
                        sentno = docno + '_' + str(sentid)
                        if cv_fold == '5':
                            out.write(
                                '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                                    label, round(float(sim), 11), text, sent,
                                    qid, sentno, qid, self.didx - 1))
                        else:
                            out.write(
                                '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                                    label, round(float(sim), 16), text, sent,
                                    qid, sentno, self.qidx, self.didx))
                        out.flush()
                        sentid += 1
                        self.didx += 1
            self.qidx += 1

        except KeyError:
            matrix[-1].append(' ')

    st.subheader('Tabla de parsing')
    frame = pd.DataFrame(matrix, index=rows, columns=columns)
    st.write(frame)

    # Parsing
    st.subheader("Inserte la cadena a parsear")
    w = st.text_area('')
    if st.button("Parsear"):
        parser = metodo_predictivo_no_recursivo(G, M)
        tokens = tokenize(G, w)
        if isinstance(tokens, list):
            left_parse = parser(tokens)
            if not left_parse:
                st.error("Error en parsing. "
                         "La cadena no pertenece al lenguaje.")
            else:
                st.success("OK")
                st.subheader("Producciones a aplicar:")
                for production in left_parse:
                    s = str(production.Left) + ' -> ' + str(production.Right)
                    st.text(s)
        else:
            st.error("Error en tokenize: " + tokens)

def test_tok_009(self):
    command = 'echo "yolo"&&ls'
    tokens = []
    tk.tokenize(command, tokens)
    self.assertListEqual(tokens, ['echo', ' ', '"', 'yolo', '"', '&&', 'ls'])

def test_tok_010(self):
    command = '$$ $? $- $!'
    tokens = []
    tk.tokenize(command, tokens)
    self.assertListEqual(tokens, ['$$', ' ', '$?', ' ', '$-', ' ', '$!'])

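# The tokenizer API exercised by these tests: tk.tokenize(command, tokens)
# appends the tokens to the supplied list in place, and joining the expected
# tokens must reproduce the original command. A minimal round-trip sketch:
#
#   tokens = []
#   tk.tokenize('echo "yolo"&&ls', tokens)
#   assert ''.join(tokens) == 'echo "yolo"&&ls'
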
def tokenize_and_stem(text):
    """
    Tokenize and stem English text
    """
    return [stemmer.stem(token) for token in tokenizer.tokenize(text)]

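# A minimal usage sketch, assuming `stemmer` is an NLTK PorterStemmer and
# `tokenizer` a word tokenizer configured at module level:
#
#   tokenize_and_stem("running tests")  # e.g. ['run', 'test']
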