def main():
    dataset = data_loader.load_text_file("data_2.txt")
    tokenizer = Tokenizer()
    # Tokenize the raw text into sentences of (word, tag) pairs.
    separated = tokenizer.tokenize([dataset])
    morfeusz = MorfeuszWrapperLexeme()
    for sentence in separated:
        # Morfeusz expects plain word forms, so drop the tags.
        analysed = morfeusz.analyse([w for w, tag in sentence])
        print(analysed)
        for word, analysis in analysed.items():
            print("{}:".format(word))
            print_analysis(analysis)
            print()
def main():
    # Polish sample sentence (roughly: "It was characterized by arranging the players
    # in the shape of a pyramid - goalkeeper - 2 defenders - 3 midfielders - 5 forwards (1-2-3-5).").
    text = 'Charakteryzował się on ustawieniem zawodników w kształcie piramidy' \
           ' – bramkarz - 2 obrońców - 3 pomocników - 5 napastników (1-2-3-5).'
    morfeusz = MorfeuszWrapperLexeme()
    tokenizer = Tokenizer()
    text = tokenizer.tokenize([text])
    for sen in text:
        # Analyse the same sentence twice to compare plain and XML output.
        analysed = morfeusz.analyse([w for w, tag in sen], as_xml=False)
        print(analysed)
        analysed = morfeusz.analyse([w for w, tag in sen], as_xml=True)
        print(analysed)
        print()
class XlmEmbedding(TextEmbedding):

    def __init__(self):
        pass

    def get_embedding_fn(self, max_length=12):
        self.max_length = max_length
        self.s = Simplifier('tokenizer/zh_mapping.txt')
        self.t = Tokenizer(
            'tokenizer/spiece_all_bpe/spiece.all.bpe.130000.lower.model',
            'tokenizer/lg.all.voc',
            max_length)
        return self.embedding

    def embedding(self, text):
        # Pipeline: character simplification (zh_mapping) -> BPE tokenization
        # -> accent stripping -> vocabulary lookup to token ids.
        simple = self.s.simplify(text)
        tokens = self.t.tokenize(simple)
        accents = run_strip_accents(tokens)
        ids = self.t.token_to_id(accents)
        return ids

    def size(self):
        return self.t.dico.counts

    @classmethod
    def get_feeder(cls):
        return DenseDataFeeder
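# Minimal usage sketch for XlmEmbedding above; it assumes the tokenizer model and
# vocabulary files referenced in get_embedding_fn() are present on disk, and the
# sample sentence is purely illustrative.
emb = XlmEmbedding()
embed = emb.get_embedding_fn(max_length=32)
ids = embed("a short example sentence")  # sub-word token ids produced by the pipeline
print(ids)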
def predict(self, X, part_of_speech=None, tagger_preprocessed=False,
            sentence_level=False):
    if sentence_level:
        results = []
        for i, text in enumerate(X):
            # Split the document into sentences and classify each one separately.
            tokenizer = Tokenizer()
            sentences = tokenizer.tokenize([text])
            sentences = [
                " ".join([token[0] for token in sentence])
                for sentence in sentences
            ]
            preprocessed_sentences = self.preprocess_texts(
                sentences,
                part_of_speech=part_of_speech,
                tagger_preprocessed=tagger_preprocessed)
            features = self.vectorizer.transform(preprocessed_sentences).toarray()
            pred = self.nb_model.predict(features)
            # The document label is the rounded mean of its sentence predictions.
            results.append(int(round(np.mean(pred))))
            print(i)  # crude progress indicator
        return np.array(results)
    else:
        preprocessed = self.preprocess_texts(
            X,
            part_of_speech=part_of_speech,
            tagger_preprocessed=tagger_preprocessed)
        features = self.vectorizer.transform(preprocessed).toarray()
        return self.nb_model.predict(features)
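# Hedged usage sketch for predict() above. `clf` stands for an already fitted
# instance of the surrounding classifier (vectorizer and nb_model trained) and is
# an assumption, not something defined in this snippet.
texts = ["first example document. second sentence here.", "another document."]
doc_labels = clf.predict(texts, sentence_level=True)  # rounded mean over sentences
text_labels = clf.predict(texts)                      # one label per raw text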
s = Simplifier('tokenizer/zh_mapping.txt')
t = Tokenizer('tokenizer/spiece_all_bpe/spiece.all.bpe.130000.lower.model',
              'tokenizer/lg.all.voc',
              doc_max_length)
count = 0
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# f = ['a real test', 'b false test']
with open(infile, 'r', encoding='utf-8') as f, \
        open(outfile, 'w', encoding='utf-8') as fo:
    with tf.Session(config=config) as sess:
        time = datetime.datetime.now()
        for line in f:
            # Same preprocessing pipeline as the embedding class:
            # simplify -> tokenize -> strip accents -> token ids.
            simple = s.simplify(line)
            tokens = t.tokenize(simple)
            accents = run_strip_accents(tokens)
            ids = t.token_to_id(accents)
            l = len(ids)
            # Pad token ids, attention mask and segment type up to doc_max_length.
            _ids = ids + [0] * (doc_max_length - l)
            _mask = [1] * l + [0] * (doc_max_length - l)
            _type = [0] * doc_max_length
            feed_dict = {
                doc_ids: [_ids],
                doc_mask: [_mask],
                doc_type: [_type],
            }
            result = sess.run(doc_output, feed_dict=feed_dict)
            # Scale the float outputs to integers for compact tab-separated storage.
            result = [str(int(round(r * 65536))) for r in result[0]]
            fo.write(line.rstrip() + '\t' + '\t'.join(result) + '\n')
# Split on the characters commonly used as dimension separators
# (both Latin and Cyrillic forms of "x" are included).
splitters = '([%s])' % '|'.join([
    re.escape(el) for el in ["х", "*", "x", "#", "Х", "\\", "X", "×", "="]
])
splitters = re.compile(splitters)
report = []
for category in ['cable']:
    data = data_loader.load_data(category)
    data_count = len(data)
    iter_times = []
    for iteration in range(iterations):
        _data = [(' '.join([
            t.src_str for t in tokenizer.tokenize(text.offer_text, splitters)
        ]), text.category_name) for text in data]
        t1 = time.time()
        for t in _data:
            r = parsers[t[1]].parse_text(t[0])
            # print(t[1], t[0], r)
        t2 = time.time()
        iter_times.append(round(t2 - t1, 5))
    report.append([category, data_count, sum(iter_times) / iterations])
    del data
report = sorted(report, key=lambda x: x[-1])
print(
    tabulate(report,
             headers=['category', 'items', 'avg time, s']))  # header labels are assumed
def main():
    dataset = data_loader.load_text_file("data_2.txt")
    tokenizer = Tokenizer()
    output = tokenizer.tokenize([dataset])
    for sentence in output:
        print(sentence)