Example #1
    def __init__(self, messages, tokenizer_type):

        self.messages = messages
        Tokens.TOKENIZER = Tokenizer(tokenizer_type,
                                     spacer_annotate=True,
                                     preserve_placeholders=True,
                                     spacer_new=True)
Example #2
def tokenize_message(message, tokenizer_type, spacer_annotate,
                     preserve_placeholders, spacer_new):
    tokenizer = Tokenizer(tokenizer_type,
                          spacer_annotate=spacer_annotate,
                          preserve_placeholders=preserve_placeholders,
                          spacer_new=spacer_new)
    return tokenizer.tokenize(message)[0]
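A minimal usage sketch for the helper above (the sample message and the "conservative" mode are illustrative; pyonmttok's tokenize() returns a (tokens, features) pair, which is why the helper keeps only element [0]):

tokens = tokenize_message("Failed to open /etc/hosts", "conservative",
                          spacer_annotate=True,
                          preserve_placeholders=True,
                          spacer_new=True)
# tokens is a plain list of strings; with spacer_annotate=True, whitespace
# boundaries are marked with spacer symbols instead of being discarded.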
Example #3
 def __init__(self, config_path):
     self.config = json.loads(jsonnet_evaluate_file(config_path))
     self.lang_detect_model_path = self.config["lang_detect_model_path"]
     self.cat_detect_model_path = self.config["cat_detect_model_path"]
     self.max_tokens = self.config.get("max_tokens")
     self.is_lower = self.config["is_lower"]
     self.languages = self.config.get("languages", ["ru", "en"])
     self.is_news_only = self.config.get("is_news_only", False)
     assert os.path.exists(
         self.lang_detect_model_path), "No language detection model found"
     assert os.path.exists(
         self.cat_detect_model_path), "No category detection model found"
     self.lang_detect_model = ft_load_model(self.lang_detect_model_path)
     self.cat_detect_model = ft_load_model(self.cat_detect_model_path)
     self.tokenizer = Tokenizer("conservative", joiner_annotate=False)
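The fastText models loaded above are typically queried with predict(), which returns parallel tuples of labels and probabilities. A hedged sketch of a language-detection helper (detect_language is hypothetical, not part of the original class):

 def detect_language(self, text):
     # fastText labels carry a "__label__" prefix, and predict() rejects
     # input containing newlines, so strip them first
     labels, probs = self.lang_detect_model.predict(text.replace("\n", " "))
     return labels[0].replace("__label__", ""), float(probs[0])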
Example #4
 def __init__(
     self,
     model_paths={
         "nn": "./models/classifier_3k_v3_4000.pth",
         "svm": "./models/svm.joblib",
         "knn": "./models/knn.joblib",
         "dt": "./models/dt.joblib"
     }):
     self.tokenizer = Tokenizer('conservative')
     self.smoothing = SmoothingFunction()
     self.classifiers = {
         key: self.load_sklearn_classifier(val)
         for key, val in model_paths.items() if key != "nn"
     }
     self.classifiers["nn"] = self.load_nn_classifier(model_paths["nn"])
     self.len_norm = self.load_sklearn_classifier(
         "./models/len_norm.joblib")
     self.src_norm = self.load_sklearn_classifier(
         "./models/src_norm.joblib")
Example #5
 def process(self):
     """
     The best tokenizer for error messages is TreebankWordTokenizer (nltk).
     It's good at tokenizing file paths.
     Alternative tokenizer. It performs much faster, but worse in tokenizing of paths.
     It splits all paths by "/".
     TODO: This method should be optimized to the same tokenization quality as TreebankWordTokenizer
     :return:
     """
     tokenized = []
     if self.type == 'nltk':
         for line in self.messages:
             tokenized.append(TreebankWordTokenizer().tokenize(line))
     elif self.type == 'pyonmttok':
         tokenizer = Tokenizer("space", joiner_annotate=False, segment_numbers=False)
         for line in self.messages:
             tokens, features = tokenizer.tokenize(line)
             tokenized.append(tokens)
     self.tokenized = self.clean_tokens(tokenized)
     return self.tokenized
Example #6
def extend_file(file, size):
    tokenizer = Tokenizer('conservative')
    sentences = file.read().split('\n')
    new_sentence = ""
    new_set = []
    while sentences:
        tok_sent, _ = tokenizer.tokenize(new_sentence)
        if len(tok_sent) < size:
            sent = sentences[0]
            sentences.remove(sent)
            if new_sentence != "":
                new_sentence += " "
            new_sentence += sent
        else:
            new_set.append(new_sentence)
            new_sentence = ""

    if new_sentence != "":
        new_set.append(new_sentence)

    return '\n'.join(new_set)
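A hedged usage sketch (the file name and size are illustrative): extend_file greedily merges consecutive sentences, so every output line except possibly the last contains at least "size" tokens.

with open("corpus.txt") as f:
    merged = extend_file(f, size=20)
# merged is a single newline-joined string of the regrouped sentences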
Example #7
def tokenize_list(lines):
    tokenizer = Tokenizer('conservative')
    return [tokenizer.tokenize(line)[0] for line in lines]
Example #8
 def __init__(self):
     self.patterns = None
     self.tokenizer = Tokenizer("conservative", spacer_annotate=True)
Example #9
 def __init__(self):
     self.word_map = load_dakshina_map()
     self.tokenizer = Tokenizer('aggressive')
Example #10
import string

from pyonmttok import Tokenizer
from pymorphy2 import MorphAnalyzer

tokenizer = Tokenizer("conservative", joiner_annotate=False)
morph = MorphAnalyzer()


def tokenize(text, lower=True):
    text = str(text).strip().replace("\n", " ").replace("\xa0", " ")
    if lower:
        text = text.lower()
    tokens, _ = tokenizer.tokenize(text)
    return tokens


def tokenize_to_lemmas(text):
    tokens = tokenize(text)
    tokens = filter(lambda x: x not in string.punctuation, tokens)
    tokens = filter(lambda x: not x.isnumeric(), tokens)
    tokens = filter(lambda x: len(x) >= 2, tokens)
    tokens = [morph.parse(token)[0].normal_form for token in tokens]
    return tokens
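A short usage sketch (the Russian sample sentences are illustrative; pymorphy2's MorphAnalyzer is a Russian morphological analyzer, which is why lemmatization happens after lowercasing and punctuation/number filtering):

print(tokenize("Привет, мир!"))
# lowercased tokens, with punctuation kept as separate tokens
print(tokenize_to_lemmas("Кошки любят молоко."))
# normal forms of the remaining words, roughly ['кошка', 'любить', 'молоко']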