def transform(self, docs):
    # text_to_words is assumed here to return a list of tokens
    # (e.g. a project-local wrapper around blingfire)
    docvecs = np.zeros((len(docs), self.gram_length))
    print("making vectors")
    for index, doc in enumerate(tqdm_notebook(docs)):
        for word, count in Counter(text_to_words(doc)).items():
            # weight each word's vector by its term count, damped by
            # its document frequency (see fit below)
            v = (self[word] * count) / (1 + self.idf[word])
            docvecs[index] += v
    return docvecs

def fix_author_text(s):
    """Author text gets special treatment: no de-dashing, and periods
    are replaced by white space.
    """
    if pd.isnull(s):
        return ''
    s = unidecode(s)
    # fix cases when quotes are repeated
    s = re.sub('"+', '"', s)
    # no periods as those make author first-letter matching hard
    s = re.sub(r'\.', ' ', s)
    # project-local whitespace helpers (defined elsewhere)
    s = replace_special_whitespace_chars(s)
    s = standardize_whitespace_length(s)
    return text_to_words(s).lower().strip()

def encode_lines(self, lines):
    """
    Encode a set of lines. All lines will be encoded together.
    """
    enc_lines = []
    for line in lines:
        line = line.strip()
        if len(line) == 0 and not self.args.keep_empty:
            return ["EMPTY", None]
        if self.args.tokenizer == 'bpe':
            tokens = self.encode(line)
            enc_lines.append(" ".join(tokens))
        else:
            enc_lines.append(text_to_words(line))
    return ["PASS", enc_lines]

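# A minimal sketch of how an encoder like the one above is typically
# driven (assumptions: the method lives on a picklable encoder object,
# and each work item is a list of parallel lines; the names encoder,
# in_path and out_path are hypothetical, not from the original code).
from multiprocessing import Pool


def encode_file(encoder, in_path, out_path, chunksize=1000):
    with open(in_path, encoding="utf-8") as fin, \
            open(out_path, "w", encoding="utf-8") as fout:
        with Pool() as pool:
            results = pool.imap(encoder.encode_lines,
                                ([line] for line in fin), chunksize)
            for status, enc_lines in results:
                # "EMPTY" batches are skipped unless keep_empty is set
                if status == "PASS":
                    for enc_line in enc_lines:
                        print(enc_line, file=fout)
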
def fix_text(s):
    """General-purpose text fixing using the nlpre package and then
    tokenizing with blingfire.
    """
    if pd.isnull(s):
        return ''
    s = unidecode(s)
    # fix cases when quotes are repeated
    s = re.sub('"+', '"', s)
    # dashes make quote matching difficult
    s = re.sub('-', ' ', s)
    s = replace_special_whitespace_chars(s)
    # tokenize
    s = text_to_words(s).lower().strip()
    # note: removing single non-alphanumerics means that we will match
    # ngrams that are usually separated by e.g. commas in the text;
    # this improves the number of matches but also surfaces false
    # positives
    return remove_single_non_alphanumerics(s)

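# Illustrative behaviour (assumes the project-local helpers act as
# their names suggest; the exact output depends on their
# implementations):
#
#   fix_text('A "quoted" phrase -- with dashes.')
#   -> 'a quoted phrase with dashes'
#
# the dashes become whitespace, blingfire separates the quotes and the
# final period into single-character tokens, and those single
# non-alphanumeric tokens are then removed.
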
        UnicodeSegmentTokenizer(word_bounds=True).tokenize,
    ),
    ("VTextTokenizer('en')", VTextTokenizer("en").tokenize),
    ("CharacterTokenizer(4)", CharacterTokenizer(4).tokenize),
]

if sacremoses is not None:
    db.append(("MosesTokenizer()", sacremoses.MosesTokenizer().tokenize))

if spacy is not None:
    from spacy.lang.en import English

    db.append(("Spacy en", English().tokenizer))

if blingfire is not None:
    db.append(
        ("BlingFire en", lambda x: blingfire.text_to_words(x).split(" ")))

for label, func in db:
    t0 = time()
    out = []
    for idx, doc in enumerate(data):
        out.append(func(doc))
    dt = time() - t0
    n_tokens = sum(len(tok) for tok in out)
    print("{:>45}: {:.2f}s [{:.1f} MB/s, {:.0f} kWPS]".format(
        label, dt, dataset_size / dt, n_tokens * 1e-3 / dt))

def fit(self, docs):
    # despite the name, self.idf stores raw document frequencies;
    # transform (above) uses them as an IDF-style damping term
    self.idf = defaultdict(int)
    for doc in docs:
        for word in set(text_to_words(doc)):
            self.idf[word] += 1

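# Hypothetical usage of the fit/transform pair above (the class name
# and corpus are assumptions; self[word] implies the class also maps a
# word to a vector of length self.gram_length):
#
#   vectorizer = NgramVectorizer()
#   vectorizer.fit(corpus)            # builds document frequencies
#   X = vectorizer.transform(corpus)  # -> ndarray, shape (len(corpus), gram_length)
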
from blingfire import text_to_words


def blingf_tokenizer(s: str):
    return text_to_words(s)

import blingfire


def bling_tokenizer(lang):
    # blingfire's default model is language-agnostic, so lang is unused
    return lambda x: blingfire.text_to_words(x).split(" ")

import sys

from blingfire import text_to_words

for l in sys.stdin:
    if l.strip():
        print(text_to_words(l.strip()))
    else:
        print('')

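# Run as a stdin-to-stdout filter, e.g. (script name assumed):
#
#   python tokenize_stdin.py < corpus.txt > corpus.tok.txt
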
from blingfire import text_to_words


def word_tokenize(sent):
    return text_to_words(sent).split(' ')

from blingfire import text_to_words


def word_tokenize(string):
    """Tokenize a string with blingfire, splitting its space-delimited
    output."""
    return text_to_words(string).split(' ')
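
# Expected behaviour (illustrative): blingfire inserts spaces around
# punctuation, so punctuation comes back as separate tokens:
#
#   word_tokenize("Hello, world!")
#   -> ['Hello', ',', 'world', '!']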