def moses_tokenize(self, text, lang):
    if lang not in self.cache_moses_tokenizer:
        moses_tokenizer = sm.MosesTokenizer(lang=lang)
        self.cache_moses_tokenizer[lang] = moses_tokenizer
    return self.cache_moses_tokenizer[lang].tokenize(
        text, aggressive_dash_splits=True, return_str=False, escape=True
    )
def moses_tokenize(self, text, lang):
    if lang not in self.cache_moses_tokenizer:
        moses_tokenizer = sm.MosesTokenizer(lang=lang)
        self.cache_moses_tokenizer[lang] = moses_tokenizer
    else:
        moses_tokenizer = self.cache_moses_tokenizer[lang]
    return moses_tokenizer.tokenize(text, return_str=False, escape=False)
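A minimal, self-contained sketch of the per-language caching pattern used in the two snippets above; the CachingTokenizer class name and the sample sentence are illustrative, not taken from the original code.

import sacremoses as sm

class CachingTokenizer:
    # Build one MosesTokenizer per language and reuse it; constructing the
    # tokenizer (and compiling its regexes) is the expensive part.
    def __init__(self):
        self.cache_moses_tokenizer = {}

    def moses_tokenize(self, text, lang):
        if lang not in self.cache_moses_tokenizer:
            self.cache_moses_tokenizer[lang] = sm.MosesTokenizer(lang=lang)
        return self.cache_moses_tokenizer[lang].tokenize(text, return_str=False, escape=False)

tok = CachingTokenizer()
print(tok.moses_tokenize("Hello, world!", "en"))  # typically ['Hello', ',', 'world', '!']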
def tokenize_raw(text, lang='en'):
    mt = sacremoses.MosesTokenizer(lang)
    text = mt.tokenize(text, return_str=True)
    # Un-escape the HTML entities that Moses inserts for quotes (escape=True by default)
    text = re.sub(r'&quot;', '"', text)
    text = re.sub(r'&apos;', "'", text)
    # Wikitext-style escaping of numbers and hyphens
    text = re.sub(r'(\d)\.(\d)', r'\1 @.@ \2', text)
    text = re.sub(r'(\d),(\d)', r'\1 @,@ \2', text)
    text = re.sub(r'(\w)-(\w)', r'\1 @-@ \2', text)
    return text
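A standalone check of the @.@ / @,@ / @-@ rewrites applied above (pure re, no Moses needed); the sample sentence is made up.

import re

text = "The file is 3.5 MB , about 1,000 lines of space-separated text ."
text = re.sub(r'(\d)\.(\d)', r'\1 @.@ \2', text)
text = re.sub(r'(\d),(\d)', r'\1 @,@ \2', text)
text = re.sub(r'(\w)-(\w)', r'\1 @-@ \2', text)
print(text)
# -> The file is 3 @.@ 5 MB , about 1 @,@ 000 lines of space @-@ separated text .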
def moses_tokenize(sents: List[str], lang: str) -> List[List[str]]:
    unsupported_langs = ['zh', 'ja', 'th']
    if lang.split('-')[0] in unsupported_langs:
        utils.Logging.warn(f"Moses does not support \"{lang}\" because it is not space-delimited. "
                           f"It will only split according to punctuation.")
    import sacremoses
    tok = sacremoses.MosesTokenizer(lang=lang)
    tok_sents = [tok.tokenize(sent.strip(), escape=False) for sent in sents]
    return tok_sents
def __init__(self, max_length=None):
    super().__init__(max_length)
    try:
        import sacremoses
        self.tokenize_fn = sacremoses.MosesTokenizer().tokenize
    except ImportError as e:
        import sys
        sys.stderr.write('ERROR: Please install sacremoses to use.')
        raise e
def main(args):
    """Tokenizes, preserving tabs"""
    mt = sacremoses.MosesTokenizer(lang=args.lang)

    def tok(s):
        return mt.tokenize(s, return_str=True)

    for line in sys.stdin:
        parts = list(map(tok, line.split("\t")))
        print(*parts, sep="\t", flush=True)
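A small illustration of the tab-preserving idea above without reading stdin; the sample line is invented, and the exact tokenization printed may vary by sacremoses version.

import sacremoses

mt = sacremoses.MosesTokenizer(lang="en")
line = "Hello, world!\tA second field."
# Tokenize each tab-separated field on its own, then re-join, so the tabs survive.
parts = [mt.tokenize(s, return_str=True) for s in line.split("\t")]
print("\t".join(parts))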
def __init__(self, embeddings):
    self.embeddings = embeddings
    self.word_list = list(embeddings)
    self.special_list = self.SPECIAL_LIST
    for special_token in self.special_list:
        assert special_token not in self.embeddings
    self.full_token_list = self.special_list + self.word_list
    self.id_token_map = self.full_token_list
    self.token_id_map = {
        token: i for i, token in enumerate(self.id_token_map)
    }
    self.tokenizer = sacremoses.MosesTokenizer(lang='en')
def normalize(sentence: str, lowercase: bool = True, tokenizer: str = "13a", return_str: bool = True):
    if lowercase:
        sentence = sentence.lower()

    if tokenizer in ["13a", "intl", "none"]:
        tokenizer_obj = _get_tokenizer(name=tokenizer)()
        normalized_sent = tokenizer_obj(sentence)
    elif tokenizer == "moses":
        normalized_sent = sacremoses.MosesTokenizer().tokenize(sentence, return_str=True, escape=False)
    elif tokenizer == "penn":
        normalized_sent = sacremoses.MosesTokenizer().penn_tokenize(sentence, return_str=True)

    if not return_str:
        normalized_sent = normalized_sent.split()

    return normalized_sent
def preprocessing_en_file(input_file_name, output_file_name):
    tokenizer = sacremoses.MosesTokenizer()
    output_file = open(output_file_name, 'w', encoding='utf-8')
    with open(input_file_name, 'r', encoding='utf-8') as file:
        for line in file:
            # print('line\n', line)
            token_list = tokenizer.tokenize(line)
            # print('token_list\n', token_list)
            output_line = ' '.join(token_list)
            # print('output_line\n', output_line)
            # Add the newline back: tokenize() drops it, so each input line
            # stays on its own line in the output file.
            output_file.write(output_line + '\n')
    output_file.close()
    return 0
def normalize(sentence, lowercase: bool = True, tokenizer: str = '13a', return_str: bool = True):
    if lowercase:
        sentence = sentence.lower()

    if tokenizer in ['13a', 'intl']:
        normalized_sent = sacrebleu.TOKENIZERS[tokenizer]()(sentence)
    elif tokenizer == 'moses':
        normalized_sent = sacremoses.MosesTokenizer().tokenize(sentence, return_str=True, escape=False)
    elif tokenizer == 'penn':
        normalized_sent = sacremoses.MosesTokenizer().penn_tokenize(sentence, return_str=True)
    else:
        normalized_sent = sentence

    if not return_str:
        normalized_sent = normalized_sent.split()

    return normalized_sent
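For reference, the 'moses' branch of the normalize() variants above run standalone; the output shown is what sacremoses typically produces with return_str=True and escape=False.

import sacremoses

# Contractions and punctuation are split off, and nothing is HTML-escaped.
print(sacremoses.MosesTokenizer().tokenize("It's a test.", return_str=True, escape=False))
# typically: It 's a test .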
def init_word_tokenizers(main, lang, word_tokenizer = 'default'):
    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]

    # NLTK
    if word_tokenizer.startswith('nltk_'):
        if word_tokenizer == 'nltk_nist':
            if 'nltk_nist_tokenizer' not in main.__dict__:
                main.nltk_nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()
        elif word_tokenizer == 'nltk_nltk':
            if 'nltk_nltk_tokenizer' not in main.__dict__:
                main.nltk_nltk_tokenizer = nltk.NLTKWordTokenizer()
        elif word_tokenizer == 'nltk_penn_treebank':
            if 'nltk_treebank_tokenizer' not in main.__dict__:
                main.nltk_treebank_tokenizer = nltk.TreebankWordTokenizer()
        elif word_tokenizer == 'nltk_tok_tok':
            if 'nltk_toktok_tokenizer' not in main.__dict__:
                main.nltk_toktok_tokenizer = nltk.ToktokTokenizer()
        elif word_tokenizer == 'nltk_twitter':
            if 'nltk_tweet_tokenizer' not in main.__dict__:
                main.nltk_tweet_tokenizer = nltk.TweetTokenizer()
    # Sacremoses
    elif word_tokenizer == 'sacremoses_moses':
        lang_sacremoses = wl_conversion.remove_lang_code_suffixes(main, wl_conversion.to_iso_639_1(main, lang))
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        if f'sacremoses_moses_tokenizer_{lang}' not in main.__dict__:
            main.__dict__[f'sacremoses_moses_tokenizer_{lang}'] = sacremoses.MosesTokenizer(lang = lang_sacremoses)
    # spaCy
    elif word_tokenizer.startswith('spacy_'):
        init_spacy_models(main, lang)
    # Chinese
    elif word_tokenizer == 'pkuseg_zho':
        if 'pkuseg_word_tokenizer' not in main.__dict__:
            main.pkuseg_word_tokenizer = pkuseg.pkuseg()
    # Chinese & Japanese
    elif word_tokenizer.startswith('wordless_'):
        init_spacy_models(main, 'eng_us')
        init_spacy_models(main, 'other')
    # Japanese
    elif word_tokenizer.startswith('sudachipy_jpn'):
        if 'sudachipy_word_tokenizer' not in main.__dict__:
            main.sudachipy_word_tokenizer = sudachipy.Dictionary().create()
    # Tibetan
    elif word_tokenizer == 'botok_bod':
        if 'botok_word_tokenizer' not in main.__dict__:
            main.botok_word_tokenizer = botok.WordTokenizer()
def __init__(self, lang: str = 'en', vocab: Optional[Vocab] = None):
    self._lang = lang
    self._vocab = vocab
    if lang == 'zh':
        warnings.warn(
            'You may not use MosesTokenizer for Chinese sentences because it is '
            'not accurate. Try to use JiebaTokenizer. You may also tokenize the '
            'chinese sentence to characters and learn a BPE.')
    self._tokenizer = sacremoses.MosesTokenizer(lang=lang)
    self._detokenizer = sacremoses.MosesDetokenizer(lang=lang)
    # Here, we need to warm-up the tokenizer to compile the regex
    # This will boost the performance in MacOS
    # For benchmarking results, see
    # https://gist.github.com/sxjscience/f59d2b88262fefd4fb08565c9dec6099
    self._warmup()
def loadAndTokenizeFile(lang, inputPath, outputPath, pattern, append=False):
    tok = sacremoses.MosesTokenizer(lang=lang)
    inputFile = open(inputPath, 'r')
    fileRights = 'a' if append else 'w'
    outputFile = open(outputPath, fileRights)
    p = re.compile(pattern)
    for line in inputFile:
        match = p.match(line)
        if match:
            outputFile.write(
                html.unescape(tok.tokenize(match.group(1), return_str=True)) + '\n')
    inputFile.close()
    outputFile.close()
def tokenize_data(data, token_type):
    # input: list of strings
    # return: list of list of tokens
    if token_type == "gru":
        tokenizer = sacremoses.MosesTokenizer()
        preprocessed_data = []
        print("Processing data into tokens......")
        for sent in tqdm(data):
            tokenized_sent = tokenizer.tokenize(sent.lower())
            preprocessed_data.append(tokenized_sent)
    elif token_type == "bert":
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
        preprocessed_data = []
        print("Processing data into tokens......")
        for sent in tqdm(data):
            tokenized_sent = ["[CLS]"] + tokenizer.tokenize(sent)
            # BERT only accepts sequences of max length 512
            preprocessed_data.append(tokenized_sent[:512])
    return preprocessed_data, tokenizer
def normalize(sentence, lowercase: bool = True, tokenizer: str = '13a', return_str: bool = True):
    if lowercase:
        sentence = sentence.lower()

    if tokenizer == "13a":
        normalized_sent = sacrebleu.tokenize_13a(sentence)
    elif tokenizer == "intl":
        normalized_sent = sacrebleu.tokenize_v14_international(sentence)
    elif tokenizer == "moses":
        normalized_sent = sacremoses.MosesTokenizer().tokenize(sentence, return_str=True)
    else:
        normalized_sent = sentence

    if not return_str:
        normalized_sent = normalized_sent.split()

    return normalized_sent
def split_en():
    # tokenize english text
    import sacremoses
    tokenizer = sacremoses.MosesTokenizer(lang="en")
    lines = 0
    contents = []
    filename = "/home/user_data55/wangdq/data/ccmt/zh-en/parallel/train.en"
    with open(filename, 'r') as f:
        with open(filename + '2', 'w') as f2:
            for line in f:
                tokens = tokenizer.tokenize(line, aggressive_dash_splits=True, return_str=True, escape=False)
                contents.append(tokens + '\n')
                lines += 1
                if lines == 500:
                    f2.writelines(contents)
                    contents = []
                    lines = 0
            if len(contents) > 0:
                f2.writelines(contents)
def wordless_word_tokenize(main, text, lang, word_tokenizer='default', keep_sentences=False):
    tokens_sentences = []

    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]

    wordless_text_utils.check_word_tokenizers(main, lang=lang, word_tokenizer=word_tokenizer)

    if 'NLTK' in word_tokenizer:
        sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('NLTK - Penn Treebank Tokenizer'):
            treebank_tokenizer = nltk.TreebankWordTokenizer()
            for sentence in sentences:
                tokens_sentences.append(treebank_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Twitter Tokenizer'):
            tweet_tokenizer = nltk.TweetTokenizer()
            for sentence in sentences:
                tokens_sentences.append(tweet_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - NIST Tokenizer'):
            nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()
            for sentence in sentences:
                tokens_sentences.append(nist_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'):
            toktok_tokenizer = nltk.ToktokTokenizer()
            for sentence in sentences:
                tokens_sentences.append(toktok_tokenizer.tokenize(sentence))

        if not keep_sentences:
            tokens_sentences = [itertools.chain.from_iterable(tokens_sentences)]
    elif 'Sacremoses' in word_tokenizer:
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang)
        else:
            sentences = [text]

        if word_tokenizer == main.tr('Sacremoses - Moses Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(lang=wordless_conversion.to_iso_639_1(main, lang))
            for sentence in sentences:
                tokens_sentences.append(moses_tokenizer.tokenize(sentence, escape=False))
        elif word_tokenizer == main.tr('Sacremoses - Penn Treebank Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(lang=wordless_conversion.to_iso_639_1(main, lang))
            for sentence in sentences:
                tokens_sentences.append(moses_tokenizer.penn_tokenize(sentence))
    elif 'spaCy' in word_tokenizer:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        if keep_sentences:
            for sentence in doc.sents:
                tokens_sentences.append([token.text for token in sentence.as_doc()])
        else:
            tokens_sentences.append([token.text for token in doc])
    # Chinese & Japanese
    elif ('jieba' in word_tokenizer or
          'nagisa' in word_tokenizer or
          'Wordless' in word_tokenizer):
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang=lang)
        else:
            sentences = [text]

        # Chinese
        if word_tokenizer == main.tr('jieba - Chinese Word Tokenizer'):
            for sentence in sentences:
                tokens_sentences.append(jieba.cut(sentence))
        elif word_tokenizer == main.tr('Wordless - Chinese Character Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)
                            non_han_start += 1
                        else:
                            # English
                            if wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or not wordless_checking_unicode.is_eng(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start:i + j + 1], lang='eng'))
                                        non_han_start = i + j + 1
                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or wordless_checking_unicode.is_han(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start:i + j + 1], lang='other'))
                                        non_han_start = i + j + 1
                                        break

                tokens_sentences.extend(tokens)
        # Japanese
        elif word_tokenizer == main.tr('nagisa - Japanese Word Tokenizer'):
            import nagisa
            for sentence in sentences:
                tokens_sentences.append(nagisa.tagging(str(sentence)).words)
        elif word_tokenizer == main.tr('Wordless - Japanese Kanji Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)
                            non_han_start += 1
                        else:
                            # Japanese Kana
                            if wordless_checking_unicode.is_kana(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or not wordless_checking_unicode.is_kana(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start:i + j + 1], lang='jpn'))
                                        non_han_start = i + j + 1
                                        break
                            # English
                            elif wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or not wordless_checking_unicode.is_eng(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start:i + j + 1], lang='eng'))
                                        non_han_start = i + j + 1
                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or wordless_checking_unicode.is_han(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start:i + j + 1], lang='other'))
                                        non_han_start = i + j + 1
                                        break

                tokens_sentences.extend(tokens)
    # Thai
    elif 'PyThaiNLP' in word_tokenizer:
        sentences = wordless_sentence_tokenize(main, text, lang='tha',
                                               sentence_tokenizer='PyThaiNLP - Thai Sentence Tokenizer')

        if word_tokenizer == main.tr('PyThaiNLP - Maximum Matching Algorithm + TCC'):
            for sentence in sentences:
                tokens_sentences.append(pythainlp.tokenize.word_tokenize(sentence, engine='newmm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Maximum Matching Algorithm'):
            for sentence in sentences:
                tokens_sentences.append(pythainlp.tokenize.word_tokenize(sentence, engine='mm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Longest Matching'):
            for sentence in sentences:
                tokens_sentences.append(pythainlp.tokenize.word_tokenize(sentence, engine='longest-matching'))
    # Tibetan
    elif 'pybo' in word_tokenizer:
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang='bod')
        else:
            sentences = [text]

        if word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (GMD)'):
            for sentence in sentences:
                tokens_sentences.append([token.text for token in main.pybo_tokenizer_gmd.tokenize(sentence)])
        elif word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (POS)'):
            for sentence in sentences:
                tokens_sentences.append([token.text for token in main.pybo_tokenizer_pos.tokenize(sentence)])
        elif word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (tsikchen)'):
            for sentence in sentences:
                tokens_sentences.append([token.text for token in main.pybo_tokenizer_tsikchen.tokenize(sentence)])
    # Vietnamese
    elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'):
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang='vie',
                                                   sentence_tokenizer='Underthesea - Vietnamese Sentence Tokenizer')
        else:
            sentences = [text]

        for sentence in sentences:
            tokens_sentences.append(underthesea.word_tokenize(str(sentence)))

    # Remove empty tokens and strip whitespace
    for i, tokens in enumerate(tokens_sentences):
        tokens_sentences[i] = [token.strip() for token in tokens if token.strip()]

    # Record token boundaries
    if lang in ['zho_cn', 'zho_tw', 'jpn']:
        for tokens in tokens_sentences:
            if tokens:
                tokens[-1] = wordless_text.Wordless_Token(tokens[-1], boundary='', sentence_ending=True)
    else:
        for tokens in tokens_sentences:
            if tokens:
                tokens[-1] = wordless_text.Wordless_Token(tokens[-1], boundary=' ', sentence_ending=True)

    return tokens_sentences
def __init__(self, lang: str):
    super().__init__()
    self.lang = lang
    self.moses = sacremoses.MosesTokenizer(lang)
    self.rm_accent = lang in self.LANG_WITHOUT_ACCENT
    self.ready = True
def processLanguagePair(lgpair, keyfile_prefix, rawtranslations_glob, lemtranslations_glob):
    # load sense keys from file
    sense_keys = []
    k = open(keyfile_prefix + ".key.txt", 'r', encoding='utf-8')
    for line in k:
        elements = line.strip().split("\t")
        t = (elements[0], elements[1], elements[2],
             tuple(elements[3].split(" ")), tuple(elements[4].split(" ")))
        sense_keys.append(t)
    k.close()

    # load domain keys from file
    indomain_keys = set()
    outdomain_keys = set()
    d = open(keyfile_prefix + ".domain.txt", 'r', encoding='utf-8')
    for line in d:
        elements = line.strip().split("\t")
        if elements[2] == "in":
            indomain_keys.add((elements[0], elements[1]))
        else:
            outdomain_keys.add((elements[0], elements[1]))
    d.close()

    # initialize tokenizer
    tokenizer = sacremoses.MosesTokenizer(lang=lgpair[-2:])

    # load and process submissions
    results = {}
    toksubmissions = sorted(glob.glob(rawtranslations_glob))
    lemsubmissions = sorted(glob.glob(lemtranslations_glob))
    for toksubmission, lemsubmission in zip(toksubmissions, lemsubmissions):
        if toksubmission.split("/")[-1] != lemsubmission.split("/")[-1].replace(".parsed.toklemma", ""):
            print("Mismatch in filenames")
            print(toksubmission)
            print(lemsubmission)
            return

        counts = {"pos_in": 0, "pos_out": 0, "neg_in": 0, "neg_out": 0, "unk_in": 0, "unk_out": 0}
        tokf = open(toksubmission, 'r', encoding='utf-8')
        lemf = open(lemsubmission, 'r', encoding='utf-8')
        for tokline, lemline, key in zip(tokf, lemf, sense_keys):
            if (key[2], " ".join(key[3])) in indomain_keys:
                suffix = "_in"
            elif (key[2], " ".join(key[3])) in outdomain_keys:
                suffix = "_out"
            else:
                print("Domain not found:", (key[2], " ".join(key[3])))

            # first look in tokenized data
            tokwords = [x.lower() for x in tokenizer.tokenize(tokline.strip(), escape=False)]
            posfound = any([posword in tokwords for posword in key[3]])
            negfound = any([negword in tokwords for negword in key[4]])

            # if not found, look in lemmatized data
            if (not posfound) and (not negfound):
                lemwords = lemline.strip().lower().split(" ")
                posfound = any([posword in lemwords for posword in key[3]])
                negfound = any([negword in lemwords for negword in key[4]])

            if posfound and not negfound:
                counts["pos" + suffix] += 1
            elif negfound:
                counts["neg" + suffix] += 1
            else:
                counts["unk" + suffix] += 1
        tokf.close()
        lemf.close()

        counts["cov_in"] = (counts["pos_in"] + counts["neg_in"]) / (counts["pos_in"] + counts["neg_in"] + counts["unk_in"])
        counts["cov_out"] = (counts["pos_out"] + counts["neg_out"]) / (counts["pos_out"] + counts["neg_out"] + counts["unk_out"])
        counts["cov_all"] = (counts["pos_in"] + counts["neg_in"] + counts["pos_out"] + counts["neg_out"]) / (
            counts["pos_in"] + counts["neg_in"] + counts["unk_in"] + counts["pos_out"] + counts["neg_out"] + counts["unk_out"])

        # Precision = pos / (pos+neg)
        counts["prec_in"] = 0 if counts["pos_in"] == 0 else counts["pos_in"] / (counts["pos_in"] + counts["neg_in"])
        counts["prec_out"] = 0 if counts["pos_out"] == 0 else counts["pos_out"] / (counts["pos_out"] + counts["neg_out"])
        counts["prec_all"] = 0 if (counts["pos_in"] + counts["pos_out"]) == 0 else (counts["pos_in"] + counts["pos_out"]) / (
            counts["pos_in"] + counts["neg_in"] + counts["pos_out"] + counts["neg_out"])

        # Recall = pos / (pos+unk)
        counts["rec_in"] = 0 if counts["pos_in"] == 0 else counts["pos_in"] / (counts["pos_in"] + counts["unk_in"])
        counts["rec_out"] = 0 if counts["pos_out"] == 0 else counts["pos_out"] / (counts["pos_out"] + counts["unk_out"])
        counts["rec_all"] = 0 if (counts["pos_in"] + counts["pos_out"]) == 0 else (counts["pos_in"] + counts["pos_out"]) / (
            counts["pos_in"] + counts["unk_in"] + counts["pos_out"] + counts["unk_out"])

        counts["f1_in"] = 0 if (counts["prec_in"] + counts["rec_in"]) == 0 else (
            2 * counts["prec_in"] * counts["rec_in"] / (counts["prec_in"] + counts["rec_in"]))
        counts["f1_out"] = 0 if (counts["prec_out"] + counts["rec_out"]) == 0 else (
            2 * counts["prec_out"] * counts["rec_out"] / (counts["prec_out"] + counts["rec_out"]))
        counts["f1_all"] = 0 if (counts["prec_all"] + counts["rec_all"]) == 0 else (
            2 * counts["prec_all"] * counts["rec_all"] / (counts["prec_all"] + counts["rec_all"]))

        submissionName = toksubmission.split("/")[-1]
        results[submissionName] = counts

    print(lgpair.upper())
    print()
    print("Submission\t\tInPos\tInNeg\tInUnk\tInCoverage\tInPrecision\tInRecall\tInFscore\t\tOutPos\tOutNeg\tOutUnk\tOutCoverage\tOutPrecision\tOutRecall\tOutFscore\t\tAllPos\tAllNeg\tAllUnk\tAllCoverage\tAllPrecision\tAllRecall\tAllFscore")
    for submission, result in sorted(results.items(), key=lambda x: x[1]["f1_all"], reverse=True):
        s = submission
        s += "\t\t{}\t{}\t{}\t{:.2f}%\t{:.2f}%\t{:.2f}%\t{:.2f}%".format(
            result["pos_in"], result["neg_in"], result["unk_in"],
            100 * result["cov_in"], 100 * result["prec_in"], 100 * result["rec_in"], 100 * result["f1_in"])
        s += "\t\t{}\t{}\t{}\t{:.2f}%\t{:.2f}%\t{:.2f}%\t{:.2f}%".format(
            result["pos_out"], result["neg_out"], result["unk_out"],
            100 * result["cov_out"], 100 * result["prec_out"], 100 * result["rec_out"], 100 * result["f1_out"])
        s += "\t\t{}\t{}\t{}\t{:.2f}%\t{:.2f}%\t{:.2f}%\t{:.2f}%".format(
            result["pos_in"] + result["pos_out"], result["neg_in"] + result["neg_out"],
            result["unk_in"] + result["unk_out"],
            100 * result["cov_all"], 100 * result["prec_all"], 100 * result["rec_all"], 100 * result["f1_all"])
        print(s)
    print()
def wordless_word_tokenize(main, text, lang, word_tokenizer='default', flat_tokens=True):
    tokens_hierarchical = []

    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]

    # Check initialization status of word (and sentence) tokenizers
    if flat_tokens:
        wordless_text_utils.check_word_tokenizers(main, lang=lang, word_tokenizer=word_tokenizer)
    else:
        wordless_text_utils.check_tokenizers(main, lang=lang, word_tokenizer=word_tokenizer)

    # NLTK
    if 'NLTK' in word_tokenizer:
        sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('NLTK - Penn Treebank Tokenizer'):
            treebank_tokenizer = nltk.TreebankWordTokenizer()
            for sentence in sentences:
                tokens_hierarchical.append(treebank_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Twitter Tokenizer'):
            tweet_tokenizer = nltk.TweetTokenizer()
            for sentence in sentences:
                tokens_hierarchical.append(tweet_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - NIST Tokenizer'):
            nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()
            for sentence in sentences:
                tokens_hierarchical.append(nist_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'):
            toktok_tokenizer = nltk.ToktokTokenizer()
            for sentence in sentences:
                tokens_hierarchical.append(toktok_tokenizer.tokenize(sentence))
    # Sacremoses
    elif 'Sacremoses' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('Sacremoses - Moses Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(lang=wordless_conversion.to_iso_639_1(main, lang))
            for sentence in sentences:
                tokens_hierarchical.append(moses_tokenizer.tokenize(sentence, escape=False))
        elif word_tokenizer == main.tr('Sacremoses - Penn Treebank Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(lang=wordless_conversion.to_iso_639_1(main, lang))
            for sentence in sentences:
                tokens_hierarchical.append(moses_tokenizer.penn_tokenize(sentence))
    # spaCy
    elif 'spaCy' in word_tokenizer:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        if flat_tokens:
            tokens_hierarchical.append([token.text for token in doc])
        else:
            for sentence in doc.sents:
                tokens_hierarchical.append([token.text for token in sentence.as_doc()])
    # syntok
    elif word_tokenizer == 'syntok - Word Tokenizer':
        syntok_tokenizer = syntok.tokenizer.Tokenizer()

        if flat_tokens:
            tokens_hierarchical.append([token.value for token in syntok_tokenizer.tokenize(text)])
        else:
            for para in syntok.segmenter.analyze(text):
                for sentence in para:
                    tokens_hierarchical.append([token.value for token in sentence])
    # Chinese & Japanese
    elif ('jieba' in word_tokenizer or
          'nagisa' in word_tokenizer or
          'Wordless' in word_tokenizer):
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang=lang)

        # Chinese
        if word_tokenizer == main.tr('jieba - Chinese Word Tokenizer'):
            for sentence in sentences:
                tokens_hierarchical.append(jieba.cut(sentence))
        elif word_tokenizer == main.tr('Wordless - Chinese Character Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)
                            non_han_start += 1
                        else:
                            # English
                            if wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or not wordless_checking_unicode.is_eng(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start:i + j + 1], lang='eng'))
                                        non_han_start = i + j + 1
                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or wordless_checking_unicode.is_han(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start:i + j + 1], lang='other'))
                                        non_han_start = i + j + 1
                                        break

                tokens_hierarchical.append(tokens)
        # Japanese
        elif word_tokenizer == main.tr('nagisa - Japanese Word Tokenizer'):
            import nagisa
            for sentence in sentences:
                tokens_hierarchical.append(nagisa.tagging(str(sentence)).words)
        elif word_tokenizer == main.tr('Wordless - Japanese Kanji Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)
                            non_han_start += 1
                        else:
                            # Japanese Kana
                            if wordless_checking_unicode.is_kana(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or not wordless_checking_unicode.is_kana(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start:i + j + 1], lang='jpn'))
                                        non_han_start = i + j + 1
                                        break
                            # English
                            elif wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or not wordless_checking_unicode.is_eng(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start:i + j + 1], lang='eng'))
                                        non_han_start = i + j + 1
                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or wordless_checking_unicode.is_han(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start:i + j + 1], lang='other'))
                                        non_han_start = i + j + 1
                                        break

                tokens_hierarchical.append(tokens)
    # Russian
    elif word_tokenizer == 'razdel - Russian Word Tokenizer':
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang='rus')

        for sentence in sentences:
            tokens_hierarchical.append([token.text for token in razdel.tokenize(sentence)])
    # Thai
    elif 'PyThaiNLP' in word_tokenizer:
        # Preserve sentence boundaries
        sentences = wordless_sentence_tokenize(main, text, lang='tha',
                                               sentence_tokenizer='PyThaiNLP - Thai Sentence Tokenizer')

        if word_tokenizer == main.tr('PyThaiNLP - Maximum Matching Algorithm + TCC'):
            for sentence in sentences:
                tokens_hierarchical.append(pythainlp.tokenize.word_tokenize(sentence, engine='newmm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Maximum Matching Algorithm'):
            for sentence in sentences:
                tokens_hierarchical.append(pythainlp.tokenize.word_tokenize(sentence, engine='mm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Longest Matching'):
            for sentence in sentences:
                tokens_hierarchical.append(pythainlp.tokenize.word_tokenize(sentence, engine='longest-matching'))
    # Tibetan
    elif 'botok' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang='bod')

        botok_tokenizer = wordless_text_utils.check_botok_tokenizers(main, word_tokenizer)
        for sentence in sentences:
            tokens_hierarchical.append([token.text for token in botok_tokenizer.tokenize(sentence)])
    # Vietnamese
    elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'):
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang='vie',
                                                   sentence_tokenizer='Underthesea - Vietnamese Sentence Tokenizer')

        for sentence in sentences:
            tokens_hierarchical.append(underthesea.word_tokenize(str(sentence)))

    # Remove empty tokens and strip whitespace
    for i, sentence in enumerate(tokens_hierarchical):
        tokens_hierarchical[i] = [token.strip() for token in sentence if token.strip()]

    # Record token boundaries
    if lang in ['zho_cn', 'zho_tw', 'jpn']:
        for sentence in tokens_hierarchical:
            if sentence:
                sentence[-1] = wordless_text.Wordless_Token(sentence[-1], boundary='', sentence_ending=True)
    else:
        for sentence in tokens_hierarchical:
            if sentence:
                sentence[-1] = wordless_text.Wordless_Token(sentence[-1], boundary=' ', sentence_ending=True)

    # Clause tokenization
    if not flat_tokens:
        for i, sentence in enumerate(tokens_hierarchical):
            tokens_hierarchical[i] = wordless_clause_tokenize(main, sentence, lang)

    # Flatten tokens
    tokens_flat = list(wordless_misc.flatten_list(tokens_hierarchical))

    if flat_tokens:
        return tokens_flat
    else:
        return tokens_hierarchical
        RegexpTokenizer(pattern=token_regexp).tokenize,
    ),
    (
        "UnicodeSegmentTokenizer(word_bounds=False)",
        UnicodeSegmentTokenizer(word_bounds=False).tokenize,
    ),
    (
        "UnicodeSegmentTokenizer(word_bounds=True)",
        UnicodeSegmentTokenizer(word_bounds=True).tokenize,
    ),
    ("VTextTokenizer('en')", VTextTokenizer("en").tokenize),
    ("CharacterTokenizer(4)", CharacterTokenizer(4).tokenize),
]

if sacremoses is not None:
    db.append(("MosesTokenizer()", sacremoses.MosesTokenizer().tokenize))

if spacy is not None:
    from spacy.lang.en import English

    db.append(("Spacy en", English().tokenizer))

if blingfire is not None:
    db.append(("BlingFire en", lambda x: blingfire.text_to_words(x).split(" ")))

for label, func in db:
    t0 = time()
    out = []
    for idx, doc in enumerate(data):
def __init__(self, special=None, min_freq=0, max_size=None, lower_case=False,
             delimiter=None, vocab_file=None, pretrained_vocab_file=None,
             never_split=None, unk_token="<unk>", eos_token="<eos>",
             additional_special_tokens=["<formula>"], language="en", **kwargs):
    super().__init__(unk_token=unk_token, eos_token=eos_token,
                     additional_special_tokens=additional_special_tokens, **kwargs)

    if never_split is None:
        never_split = self.all_special_tokens
    if special is None:
        special = []

    self.counter = Counter()
    self.special = special
    self.min_freq = min_freq
    self.max_size = max_size
    self.lower_case = lower_case
    self.delimiter = delimiter
    self.vocab_file = vocab_file
    self.never_split = never_split
    self.punctuation_symbols = '!"#$%&()*+,-./\\:;<=>?@[\\]^_`{|}~'
    self.punction_without_space_before_pattern = re.compile(
        r"[^\s][{}]".format(self.punctuation_symbols))
    self.punctuation_with_space_around_pattern = self._compile_space_around_punctuation_pattern()
    self.language = language
    self.moses_punct_normalizer = sm.MosesPunctNormalizer(language)
    self.moses_tokenizer = sm.MosesTokenizer(language)
    self.moses_detokenizer = sm.MosesDetokenizer(language)

    try:
        if pretrained_vocab_file is not None:
            # Hack because, honestly this tokenizer was not made to be used
            # in a library like ours, at all.
            vocab_dict = torch.load(pretrained_vocab_file)
            for key, value in vocab_dict.items():
                if key not in self.__dict__:
                    self.__dict__[key] = value

        if vocab_file is not None:
            self.build_vocab()
    except Exception:
        raise ValueError(
            "Unable to parse file {}. Unknown format. "
            "If you tried to load a model saved through TransfoXLTokenizerFast,"
            "please note they are not compatible.".format(pretrained_vocab_file))

    if vocab_file is not None:
        self.build_vocab()
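A hedged sketch of the three Moses components these __init__ variants set up (punctuation normalizer, tokenizer, detokenizer), chained in the order a normalize-then-tokenize pipeline would use them; the sample text and variable names are illustrative.

import sacremoses as sm

language = "en"
normalizer = sm.MosesPunctNormalizer(language)
tokenizer = sm.MosesTokenizer(language)
detokenizer = sm.MosesDetokenizer(language)

text = "This is a test ...  with   odd spacing !"
normalized = normalizer.normalize(text)               # clean up punctuation and spacing
tokens = tokenizer.tokenize(normalized, escape=False)  # split into Moses tokens
print(tokens)
print(detokenizer.detokenize(tokens))                  # re-join into a plain sentence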
def init_moses(self, lang):
    self.moses_tokenizer = sacremoses.MosesTokenizer(lang['src'])
    self.moses_detokenizer = sacremoses.MosesDetokenizer(lang['tgt'])
def __init__(self):
    self.question_generator = xlingqg.QuestionGenerator()
    self.translator = xlingqg.Translator()
    self.tokenizer = sacremoses.MosesTokenizer()
    self.detokenizer = sacremoses.MosesDetokenizer()
    self.answer_encoder = AnswerEncoder()
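An illustrative round trip with the tokenizer/detokenizer pair instantiated above (sacremoses defaults to English); the sample sentence is made up.

import sacremoses

tokenizer = sacremoses.MosesTokenizer()
detokenizer = sacremoses.MosesDetokenizer()

tokens = tokenizer.tokenize("Moses splits punctuation, then the detokenizer re-joins it.", escape=False)
print(tokens)
print(detokenizer.detokenize(tokens))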
def __init__(self, lang):
    self.tokenizer = sacremoses.MosesTokenizer(lang)
def whitespace_split(x):
    return x.split(" ")


tok_db = [
    # ("whitespace", lambda lang: whitespace_split),
    ("regexp", lambda lang: re.compile(r"\b\w\w+\b").findall),
    (
        "unicode-segmentation",
        lambda lang: UnicodeSegmentTokenizer(word_bounds=True).tokenize,
    ),
    ("vtext", lambda lang: VTextTokenizer(lang).tokenize),
]

if sacremoses is not None:
    tok_db.append(("MosesTokenizer", lambda lang: sacremoses.MosesTokenizer().tokenize))

if spacy is not None:

    def spacy_tokenizer(lang):
        if lang == "en":
            from spacy.lang.en import English as Nlp
        elif lang == "de":
            from spacy.lang.de import German as Nlp
        elif lang == "fr":
            from spacy.lang.fr import French as Nlp
        else:
            raise ValueError
        return Nlp().tokenizer

    tok_db.append(("spacy", spacy_tokenizer))
def __init__(self, special=None, min_freq=0, max_size=None, lower_case=False,
             delimiter=None, vocab_file=None, pretrained_vocab_file: str = None,
             never_split=None, unk_token="<unk>", eos_token="<eos>",
             additional_special_tokens=["<formula>"], language="en", **kwargs):
    super().__init__(unk_token=unk_token, eos_token=eos_token,
                     additional_special_tokens=additional_special_tokens, **kwargs)

    if never_split is None:
        never_split = self.all_special_tokens
    if special is None:
        special = []

    self.counter = Counter()
    self.special = special
    self.min_freq = min_freq
    self.max_size = max_size
    self.lower_case = lower_case
    self.delimiter = delimiter
    self.vocab_file = vocab_file
    self.never_split = never_split
    self.punctuation_symbols = '!"#$%&()*+,-./\\:;<=>?@[\\]^_`{|}~'
    self.punction_without_space_before_pattern = re.compile(
        r"[^\s][{}]".format(self.punctuation_symbols))
    self.punctuation_with_space_around_pattern = self._compile_space_around_punctuation_pattern()
    self.language = language
    self.moses_punct_normalizer = sm.MosesPunctNormalizer(language)
    self.moses_tokenizer = sm.MosesTokenizer(language)
    self.moses_detokenizer = sm.MosesDetokenizer(language)

    # This try... catch... is not beautiful but honestly this tokenizer was not made to be used
    # in a library like ours, at all.
    try:
        vocab_dict = None
        if pretrained_vocab_file is not None:
            # Priority on pickle files (support PyTorch and TF)
            with open(pretrained_vocab_file, "rb") as f:
                vocab_dict = pickle.load(f)

            # Loading a torch-saved transfo-xl vocab dict with pickle results in an integer
            # Entering this if statement means that we tried to load a torch-saved file with pickle, and we failed.
            # We therefore load it with torch, if it's available.
            if type(vocab_dict) == int:
                if not is_torch_available():
                    raise ImportError(
                        "Not trying to load dict with PyTorch as you need to install pytorch to load "
                        "from a PyTorch pretrained vocabulary, "
                        "or activate it with environment variables USE_TORCH=1 and USE_TF=0.")
                vocab_dict = torch.load(pretrained_vocab_file)

        if vocab_dict is not None:
            for key, value in vocab_dict.items():
                if key not in self.__dict__:
                    self.__dict__[key] = value
        elif vocab_file is not None:
            self.build_vocab()
    except Exception as e:
        raise ValueError(
            "Unable to parse file {}. Unknown format. "
            "If you tried to load a model saved through TransfoXLTokenizerFast,"
            "please note they are not compatible.".format(pretrained_vocab_file)) from e

    if vocab_file is not None:
        self.build_vocab()
def __init__(
    self,
    special=None,
    min_freq=0,
    max_size=None,
    lower_case=False,
    delimiter=None,
    vocab_file=None,
    pretrained_vocab_file=None,
    never_split=None,
    unk="<unk>",
    eos="<eos>",
    additional_special_tokens=["<formula>"],
    language="en",
    **kw,
):
    super().__init__(
        special=special,
        min_freq=min_freq,
        max_size=max_size,
        lower_case=lower_case,
        delimiter=delimiter,
        vocab_file=vocab_file,
        pretrained_vocab_file=pretrained_vocab_file,
        never_split=never_split,
        unk=unk,
        eos=eos,
        additional_special_tokens=additional_special_tokens,
        language=language,
        **kw,
    )
    if never_split is None:
        never_split = self.all_special_tokens
    if special is None:
        special = []
    self.counter = Counter()
    self.special = special
    self.min_freq = min_freq
    self.max_size = max_size
    self.lower_case = lower_case
    self.delimiter = delimiter
    self.vocab_file = vocab_file
    self.never_split = never_split
    self.punctuation_symbols = '!"#$%&()*+,-./\\:;<=>?@[\\]^_`{|}~'
    self.punction_without_space_before_pattern = re.compile(
        rf"[^\s][{self.punctuation_symbols}]"
    )
    self.punctuation_with_space_around_pattern = (
        self._compile_space_around_punctuation_pattern()
    )
    self.language = language
    self.moses_punct_normalizer = sm.MosesPunctNormalizer(language)
    self.moses_tokenizer = sm.MosesTokenizer(language)
    self.moses_detokenizer = sm.MosesDetokenizer(language)
    try:
        vocab_dict = None
        if pretrained_vocab_file is not None:
            with open(pretrained_vocab_file, "rb") as f:
                vocab_dict = pickle.load(f)
            if type(vocab_dict) == int:
                if not is_torch_available():
                    raise ImportError(
                        "Not trying to load dict with PyTorch as you need to install pytorch to load "
                        "from a PyTorch pretrained vocabulary, "
                        "or activate it with environment variables USE_TORCH=1 and USE_TF=0."
                    )
                vocab_dict = torch.load(pretrained_vocab_file)
        if vocab_dict is not None:
            for key, value in vocab_dict.items():
                if key not in self.__dict__:
                    self.__dict__[key] = value
        elif vocab_file is not None:
            self.build_vocab()
    except Exception as e:
        raise ValueError(
            f"Unable to parse file {pretrained_vocab_file}. Unknown format. "
            "If you tried to load a model saved through TokenizerFast, "
            "please note they are not compatible."
        ) from e
    if vocab_file is not None:
        self.build_vocab()
else:
    ## Load training data
    spam_train_name = ''
    train_df = pd.read_csv('/scratch/xl3119/' + spam_train_name + 'train.csv')
    val_df = pd.read_csv('/scratch/xl3119/' + spam_train_name + 'dev.csv')
    train_texts, train_labels, train_rating = list(train_df.review), list(train_df.label), list(train_df.rating)
    val_texts, val_labels, val_rating = list(val_df.review), list(val_df.label), list(val_df.rating)
    print(
        f"Train size: {len(train_labels)}\n"
        f"Val size: {len(val_labels)}\n"
    )

    ## Tokenize data
    tokenizer = sacremoses.MosesTokenizer()
    train_data_indices, train_labels = featurize(train_texts, train_labels, tokenizer, vocab)
    val_data_indices, val_labels = featurize(val_texts, val_labels, tokenizer, vocab)
    pickle_fake_news = {
        'train_indices': train_data_indices,
        'train_labels': train_labels,
        'train_rating': train_rating,
        'val_indices': val_data_indices,
        'val_labels': val_labels,
        'val_rating': val_rating,
    }
    pickle.dump(pickle_fake_news, open(tokens_save_dir, "wb"))
    print('Data has been saved')

## Build data loader
train_dataset = Fake_News_Dataset(train_data_indices, train_labels, max_sent_length)
return x.split(" ") tok_db = [ # ("whitespace", lambda lang: whitespace_split), ("regexp", lambda lang: re.compile(r"\b\w\w+\b").findall), ( "unicode-segmentation", lambda lang: UnicodeWordTokenizer(word_bounds=True).tokenize, ), ("vtext", lambda lang: VTextTokenizer(lang).tokenize), ] if sacremoses is not None: tok_db.append( ("MosesTokenizer", lambda lang: sacremoses.MosesTokenizer().tokenize)) if spacy is not None: def spacy_tokenizer(lang): if lang == "en": from spacy.lang.en import English as Nlp elif lang == "de": from spacy.lang.de import German as Nlp elif lang == "fr": from spacy.lang.fr import French as Nlp else: raise ValueError return Nlp().tokenizer tok_db.append(("spacy", spacy_tokenizer))