Example #1
def main():
    # data_loader, Tokenizer, MorfeuszWrapperLexeme and print_analysis are
    # assumed to be imported from the surrounding project.
    dataset = data_loader.load_text_file("data_2.txt")
    tokenizer = Tokenizer()
    separated = tokenizer.tokenize([dataset])
    morfeusz = MorfeuszWrapperLexeme()
    for sentence in separated:
        analysed = morfeusz.analyse([w for w, tag in sentence])
        print(analysed)
        for word, analysis in analysed.items():
            print("{}:".format(word))
            print_analysis(analysis)
        print()
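The loop in Example #1 assumes that Tokenizer.tokenize returns a list of sentences, each sentence being a list of (word, tag) pairs. A minimal sketch of that shape, with purely illustrative placeholder values:

# Illustrative shape only; the words and tags below are placeholders,
# not real tokenizer output.
separated = [
    [("Ala", "tag"), ("ma", "tag"), ("kota", "tag")],    # sentence 1
    [("To", "tag"), ("jest", "tag"), ("test", "tag")],   # sentence 2
]
words = [w for w, tag in separated[0]]  # -> ["Ala", "ma", "kota"]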
Example #2
def main():
    text = 'Charakteryzował się on ustawieniem zawodników w kształcie piramidy' \
          ' – bramkarz - 2 obrońców - 3 pomocników - 5 napastników (1-2-3-5).'
    morfeusz = MorfeuszWrapperLexeme()
    tokenizer = Tokenizer()
    sentences = tokenizer.tokenize([text])
    for sen in sentences:
        analysed = morfeusz.analyse([w for w, tag in sen], as_xml=False)
        print(analysed)

        analysed = morfeusz.analyse([w for w, tag in sen], as_xml=True)
        print(analysed)
        print()
Example #3
class XlmEmbedding(TextEmbedding):
    def __init__(self):
        pass
    def get_embedding_fn(self, max_length=12):
        self.max_length = max_length
        self.s = Simplifier('tokenizer/zh_mapping.txt')
        self.t = Tokenizer('tokenizer/spiece_all_bpe/spiece.all.bpe.130000.lower.model',
            'tokenizer/lg.all.voc',
            max_length
        )
        return self.embedding
    def embedding(self, text):
        simple = self.s.simplify(text)
        tokens = self.t.tokenize(simple)
        accents = run_strip_accents(tokens)
        ids = self.t.token_to_id(accents)
        return ids
    def size(self):
        return self.t.dico.counts
    @classmethod
    def get_feeder(cls):
        return DenseDataFeeder
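A minimal usage sketch for the XlmEmbedding class above, assuming the model and vocabulary files exist at the paths hard-coded in get_embedding_fn; the input string is a placeholder:

# Hypothetical usage; the input text is a placeholder.
embedder = XlmEmbedding()
embed = embedder.get_embedding_fn(max_length=12)
token_ids = embed("这是一个测试")  # simplify -> tokenize -> strip accents -> ids
print(token_ids)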
Example #4
    def predict(self,
                X,
                part_of_speech=None,
                tagger_preprocessed=False,
                sentence_level=False):

        if sentence_level:
            results = []
            tokenizer = Tokenizer()
            for i, text in enumerate(X):
                sentences = tokenizer.tokenize([text])
                sentences = [
                    " ".join([token[0] for token in sentence])
                    for sentence in sentences
                ]

                preprocessed_sentences = self.preprocess_texts(
                    sentences,
                    part_of_speech=part_of_speech,
                    tagger_preprocessed=tagger_preprocessed)

                # Vectorize under a new name so the input X is not shadowed
                # while it is still being iterated over.
                X_sent = self.vectorizer.transform(preprocessed_sentences).toarray()
                pred = self.nb_model.predict(X_sent)
                results.append(int(round(np.mean(pred))))
                print(i)  # progress indicator
            return np.array(results)

        else:
            preprocessed = self.preprocess_texts(
                X,
                part_of_speech=part_of_speech,
                tagger_preprocessed=tagger_preprocessed)
            X = self.vectorizer.transform(preprocessed).toarray()
            return self.nb_model.predict(X)
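A hedged usage sketch for predict, assuming clf is an already fitted instance of the surrounding classifier class (its name and training code are not shown in this snippet):

# Hypothetical call; texts and clf are assumptions, not part of the snippet.
texts = ["pierwszy dokument testowy", "drugi dokument testowy"]
doc_preds = clf.predict(texts)  # one prediction per document
# sentence_level=True averages per-sentence predictions into one label per text
sent_preds = clf.predict(texts, sentence_level=True)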
Example #5
import datetime

import tensorflow as tf

# Simplifier, Tokenizer and run_strip_accents are project helpers; infile,
# outfile and doc_max_length are defined elsewhere in the script.
s = Simplifier('tokenizer/zh_mapping.txt')
t = Tokenizer('tokenizer/spiece_all_bpe/spiece.all.bpe.130000.lower.model',
              'tokenizer/lg.all.voc', doc_max_length)
count = 0
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
#f = ['a real test', 'b false test']
with open(infile, 'r', encoding='utf-8') as f, open(outfile,
                                                    'w',
                                                    encoding='utf-8') as fo:
    with tf.Session(config=config) as sess:
        time = datetime.datetime.now()
        for line in f:
            simple = s.simplify(line)
            tokens = t.tokenize(simple)
            accents = run_strip_accents(tokens)
            ids = t.token_to_id(accents)

            # Pad (or truncate) to doc_max_length and build the attention mask.
            ids = ids[:doc_max_length]
            length = len(ids)
            _ids = ids + [0] * (doc_max_length - length)
            _mask = [1] * length + [0] * (doc_max_length - length)
            _type = [0] * doc_max_length
            feed_dict = {
                doc_ids: [_ids],
                doc_mask: [_mask],
                doc_type: [_type],
            }
            result = sess.run(doc_output, feed_dict=feed_dict)
            result = [str(int(round(r * 65536))) for r in result[0]]
            fo.write(line.rstrip() + '\t' + '\t'.join(result) + '\n')
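Example #5 assumes TF1-style graph tensors doc_ids, doc_mask, doc_type and doc_output defined earlier in the script; a minimal sketch of what the placeholder definitions could look like (shapes and values beyond the snippet are assumptions):

import tensorflow as tf

doc_max_length = 128  # assumed value; the original defines it elsewhere
doc_ids = tf.placeholder(tf.int32, [None, doc_max_length], name="doc_ids")
doc_mask = tf.placeholder(tf.int32, [None, doc_max_length], name="doc_mask")
doc_type = tf.placeholder(tf.int32, [None, doc_max_length], name="doc_type")
# doc_output would be the encoder output built on these inputs (not shown here).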
Example #6
    # Build a character class of split characters; inside [...] no '|' separator
    # is needed (joining with '|' would also split on a literal '|').
    splitters = '([%s])' % ''.join(
        re.escape(el) for el in ["х", "*", "x", "#", "Х", "\\", "X", "×", "="]
    )
    splitters = re.compile(splitters)

    for category in ['cable']:
        data = data_loader.load_data(category)
        data_count = len(data)

        iter_times = []
        for iteration in range(iterations):

            _data = [(' '.join([
                t.src_str
                for t in tokenizer.tokenize(text.offer_text, splitters)
            ]), text.category_name) for text in data]

            t1 = time.time()
            for t in _data:
                r = parsers[t[1]].parse_text(t[0])
                # print(t[1], t[0], r)
            t2 = time.time()

            iter_times.append(round(t2 - t1, 5))
        report.append([category, data_count, sum(iter_times) / iterations])
        del data

    report = sorted(report, key=lambda x: x[-1])
    print(
        tabulate(report,
                 headers=["category", "data_count", "avg_time"]))
Example #7
def main():
    dataset = data_loader.load_text_file("data_2.txt")
    tokenizer = Tokenizer()
    output = tokenizer.tokenize([dataset])
    for sentence in output:
        print(sentence)