class Preprocess:

    def __init__(self, lang, use_stemmer, stop_words):
        self.lang = lang
        self.use_stemmer = use_stemmer
        self.word_tokenizer = WordTokenizer(locale=lang)
        # As we currently have only one language, there is no need to
        # check whether it is supported.
        self.stemmer = Stemmer.Stemmer(self.lang)
        self.stop_words = stop_words

    def stem(self, tokens):
        return (self.stemmer.stemWords(tokens)
                if self.use_stemmer else tokens)

    def remove_accents(self, text):
        return (unicodedata.normalize('NFD', text)
                .encode('ascii', 'ignore')
                .decode("utf-8"))

    def lower(self, text):
        return text.lower()

    def clear_html(self, text):
        # The HTML cleaner raises an exception when it detects a malformed
        # tag, e.g. a prompt-like symbol such as '<- '.
        try:
            text = clean_html(html.fromstring(text)).text_content()
        except Exception:
            pass
        return text

    def is_token_stopword(self, token):
        return token in self.stop_words

    def is_alpha(self, token):
        return token.isalpha()

    def filtertokens(self, tokens):
        return filter(
            lambda t: self.is_alpha(t) and not self.is_token_stopword(t),
            tokens)

    def transform2words(self, text):
        return self.word_tokenizer.transform(Sequence(text)).tokens()

    def process(self, text):
        # compose applies right-to-left, so reversing fnlist runs the steps
        # in list order: remove_accents -> lower -> clear_html ->
        # transform2words -> filtertokens -> stem.
        fnlist = [
            self.remove_accents,
            self.lower,
            self.clear_html,
            self.transform2words,
            self.filtertokens,
            self.stem,
        ]
        return compose(*reversed(fnlist))(text)
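# A minimal usage sketch for the Preprocess pipeline above, not part of the
# original example. The import paths are assumptions (the original does not
# show them): WordTokenizer/Sequence as in polyglot, Stemmer from PyStemmer,
# clean_html/html from lxml, compose from toolz; the stop-word set is made up.
import unicodedata

import Stemmer
from lxml import html
from lxml.html.clean import clean_html
from polyglot.base import Sequence
from polyglot.tokenize import WordTokenizer
from toolz import compose

pre = Preprocess(lang='en', use_stemmer=True, stop_words={'the', 'and', 'were'})
tokens = pre.process(u'<p>The r\u00e9sum\u00e9s were reviewed and filed.</p>')
print(list(tokens))  # e.g. stemmed, accent-free tokens with stop words removed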
def main(input_fname, output_fname, lang, to_lower=True):
    en_tokenizer = WordTokenizer(locale='en')
    fr_tokenizer = WordTokenizer(locale=lang)

    def tokenizer(text, tokenizer_fn):
        seq = Sequence(text.strip())
        # Materialize the tokens: len() and ' '.join() are applied below.
        return [w for w in tokenizer_fn.transform(seq) if w != ' ']

    logging.info((lang, "counting pairs"))
    counter = Counter()
    for line_no, line in enumerate(io.open(input_fname, 'r', encoding='utf-8')):
        if to_lower:
            line = line.lower()
        parts = line.rstrip().split(' ||| ')
        if len(parts) != 4:
            continue
        source_lang, source_text, target_text, count = parts
        source_tokens = tokenizer(source_text, en_tokenizer)
        target_tokens = tokenizer(target_text, fr_tokenizer)
        if len(source_tokens) > 3 or len(target_tokens) > 3:
            continue
        count = int(count)
        if count > 1:
            # Skip pairs that start or end with punctuation; '\p{P}' needs the
            # third-party regex module rather than the stdlib re.
            if (re.sub(r'\p{P}', '', source_text[0]) == '' or
                    re.sub(r'\p{P}', '', target_text[0]) == '' or
                    re.sub(r'\p{P}', '', source_text[-1]) == '' or
                    re.sub(r'\p{P}', '', target_text[-1]) == ''):
                continue
            pair = ' ||| '.join([
                source_lang,
                ' '.join(source_tokens),
                ' '.join(target_tokens),
            ])
            counter[pair] += count
        if line_no % 100000 == 0:
            logging.info((lang, line_no))

    logging.info((lang, "writing pairs to {0}".format(output_fname)))
    with io.open(output_fname, 'w', encoding='utf-8') as out:
        for pair, count in counter.most_common():
            if count < 10:
                break
            out.write('{0} ||| {1}\n'.format(pair, count))
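# Illustrative invocation, not part of the original: the file names and the
# sample line are made up, and the module-level imports (io, logging, Counter,
# WordTokenizer, Sequence, and regex imported as re for '\p{P}') are assumed
# to be in place. Each input line carries four ' ||| '-separated fields:
# source_lang ||| source_text ||| target_text ||| count.
import io

with io.open('pairs.sample.txt', 'w', encoding='utf-8') as sample:
    sample.write(u'en ||| thank you ||| merci beaucoup ||| 42\n')

main('pairs.sample.txt', 'pairs.filtered.txt', lang='fr')
# pairs.filtered.txt now contains: en ||| thank you ||| merci beaucoup ||| 42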
def segment(args):
    lang = args.lang
    w_tokenizer = WordTokenizer(locale=lang)
    s_tokenizer = SentenceTokenizer(locale=lang)
    if args.only_sent:
        # Sentence segmentation only: one sentence per output line.
        for l in args.input:
            seq = Sequence(l)
            if not seq.empty():
                _print(u'\n'.join(s_tokenizer.transform(seq)))
    elif args.only_word:
        # Word tokenization only: space-joined tokens per input line.
        for l in args.input:
            seq = Sequence(l)
            if not seq.empty():
                _print(u' '.join(w_tokenizer.transform(seq)))
    else:
        # Default: word-tokenized sentences, one sentence per output line.
        for l in args.input:
            seq = Sequence(l)
            sents = s_tokenizer.transform(seq)
            words = w_tokenizer.transform(seq)
            for tokenized_sent in words.split(sents):
                if not tokenized_sent.empty():
                    _print(u' '.join(tokenized_sent.tokens()))
def segment(args):
    lang = args.lang
    w_tokenizer = WordTokenizer(locale=lang)
    s_tokenizer = SentenceTokenizer(locale=lang)
    if args.only_sent:
        for l in args.input:
            seq = Sequence(l)
            if not seq.empty():
                _print(s_tokenizer.transform(seq))
    elif args.only_word:
        for l in args.input:
            seq = Sequence(l)
            if not seq.empty():
                _print(w_tokenizer.transform(seq))
    else:
        for l in args.input:
            seq = Sequence(l)
            sents = s_tokenizer.transform(seq)
            words = w_tokenizer.transform(seq)
            for tokenized_sent in words.split(sents):
                if not tokenized_sent.empty():
                    _print(u' '.join(tokenized_sent.tokens()))
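# Usage sketch for the segment() variants above, not part of the original:
# segment() only needs an object with .lang, .input (an iterable of text
# lines), .only_sent and .only_word, plus a module-level _print helper.
# SimpleNamespace and this _print stub are illustrative stand-ins for the
# CLI's argparse namespace and its own print helper.
from types import SimpleNamespace


def _print(text):
    print(text)


args = SimpleNamespace(
    lang='en',
    input=[u'Hello world. How are you today?'],
    only_sent=False,
    only_word=False,
)
segment(args)  # prints each word-tokenized sentence on its own line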
def main(input_fname, lang, to_lower=True):
    en_tokenizer = WordTokenizer(locale='en')
    fr_tokenizer = WordTokenizer(locale=lang)
    for line_no, line in enumerate(smart_open(input_fname)):
        data = json.loads(line)
        en_text, pairs = data[0][0][1], data[5]
        if not isinstance(pairs, list):
            logging.error((input_fname, 'not list', pairs))
            continue
        for source, _, targets, _, _, _, _ in pairs:
            if not isinstance(targets, list):
                logging.error((input_fname, 'not list', targets))
                continue
            for target in targets:
                if source in en_text:
                    count = int(target[1]) or 1
                    # Keep the raw `source` intact so the `source in en_text`
                    # check still works for the remaining targets.
                    source_tokens = tokenizer(source, en_tokenizer, to_lower)
                    target_tokens = tokenizer(target[0], fr_tokenizer, to_lower)
                    if source_tokens and target_tokens:
                        print('{0} ||| {1} ||| {2}'.format(
                            source_tokens, target_tokens, count).encode('utf-8'))
        if line_no % 10000 == 0:
            logging.info((input_fname, line_no))
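# The tokenizer() helper used above is defined elsewhere in the original
# module; this sketch is only a guess at its shape, based on how it is called
# (text, word_tokenizer, to_lower) and on the similar inner helper in the
# earlier main(): it returns a space-joined token string.
def tokenizer(text, word_tokenizer, to_lower):
    if to_lower:
        text = text.lower()
    seq = Sequence(text.strip())
    tokens = [w for w in word_tokenizer.transform(seq) if w != ' ']
    return u' '.join(tokens)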
def word_tokenizer(self):
    word_tokenizer = WordTokenizer(locale=self.language.code)
    return word_tokenizer
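# Stand-alone sketch of what the returned tokenizer does, not part of the
# original. The polyglot import paths and the 'en' locale are assumptions;
# self.language.code above is just an ISO language code such as 'en' or 'fr'.
from polyglot.base import Sequence
from polyglot.tokenize import WordTokenizer

tok = WordTokenizer(locale='en')
words = tok.transform(Sequence(u"Don't panic, it is only tokenization."))
print(words.tokens())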