def combine_stopwords(dataframe_in, stopword_dict):
    """
    Please use the stopwords() function and input that into the stopword_dict parameter.
    Returns filtered tokens.
    :param dataframe_in:
    :param stopword_dict:
    :return:
    """
    nlp = spacy.load("en_core_web_lg")

    # Tokenizer
    tokenizer = Tokenizer(nlp.vocab)

    tokens = []
    for doc in tokenizer.pipe(dataframe_in, batch_size=500):
        doc_tokens = []
        for token in doc:
            if token.text.lower() not in stopword_dict:
                doc_tokens.append(token.text.lower())
        tokens.append(doc_tokens)
    return tokens
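# Hedged usage sketch for combine_stopwords (not part of the original source).
# Assumes pandas and the en_core_web_lg model are installed; the texts and the
# stop-word set below are illustrative placeholders.
import pandas as pd
import spacy

example_texts = pd.Series(["This is a short example post",
                           "Another example sentence here"])
example_stops = spacy.load("en_core_web_lg").Defaults.stop_words
filtered = combine_stopwords(example_texts, example_stops)
# Each element is a list of lower-cased, non-stop-word tokens,
# e.g. ['short', 'example', 'post'] for the first text.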
def predict():
    # Define a prediction function.
    # `nlp`, `df`, `model`, and `user_input` are assumed to be defined at module level.
    tokenizer = Tokenizer(nlp.vocab)

    # Stop words
    STOP_WORDS = nlp.Defaults.stop_words.union(['', ' ', '-', 'reddit', 'post'])

    # Make them tokens
    tokens = []
    for doc in tokenizer.pipe(df['combo'], batch_size=500):
        doc_tokens = []
        for token in doc:
            if ((token.text.lower() not in STOP_WORDS)
                    and (token.is_stop == False)
                    and (token.is_punct == False)
                    and (token.pos_ != 'PRON')):
                doc_tokens.append(token.lemma_.lower())
        tokens.append(' '.join(doc_tokens))
    df['tokens'] = tokens

    # Fit the vectorizer on the tokenized corpus before transforming the user input.
    tfidf = TfidfVectorizer(min_df=0.025, max_df=.98, ngram_range=(1, 2))
    tfidf.fit(df['tokens'])
    vec_text = tfidf.transform(user_input)

    output = model.predict(vec_text.todense())

    # Give output to sender as a JSON-serialisable list.
    return jsonify({"response": output.tolist()})
def _tokenizer(df):
    nlp = English()
    tokenizer = Tokenizer(nlp.vocab)
    for doc in tokenizer.pipe(df.values.tolist(), batch_size=50):
        for token in doc:
            yield token
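# Hedged usage sketch for _tokenizer (not part of the original source).
# A pandas Series (or a single-column selection) is assumed as input, since the
# generator calls `.values.tolist()` and then pipes plain strings through spaCy.
import pandas as pd

example_col = pd.Series(["first tiny doc", "second tiny doc"])
print([token.text for token in _tokenizer(example_col)])
# ['first', 'tiny', 'doc', 'second', 'tiny', 'doc']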
def tokenize_data(input_data):
    nlp = spacy.load("en")
    tokenizer = Tokenizer(nlp.vocab)
    string_data = [str(data) for data in input_data]
    tokenized_data = [[str(w) for w in doc]
                      for doc in tokenizer.pipe(string_data, batch_size=50)]
    return tokenized_data
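# Hedged usage sketch for tokenize_data (not part of the original source).
# Assumes the legacy "en" shortcut link is available (spaCy v2); every element is
# stringified first, so mixed-type input is acceptable.
print(tokenize_data(["spaCy splits on whitespace here", 42]))
# [['spaCy', 'splits', 'on', 'whitespace', 'here'], ['42']]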
def train_tokenizer_mldoc(train_size=1000, datapath='../data/', savepath=None):
    # Training a tokenizer for MLDoc dataset: create a vocabulary for each language in MLDoc
    languages = [
        "english", "german", "spanish", "french", "italian", "russian",
        "chinese", "japanese"
    ]
    mldoc_folder = os.path.join(datapath, 'mldoc')
    savepath = os.path.join(mldoc_folder, 'vocab') if savepath is None else savepath

    for lang in languages:
        print("\n\n\t\t *** Training tokenizer for {} ***".format(lang))
        train_f = os.path.join(mldoc_folder, lang + '.train.{}'.format(train_size))
        print('loading data')
        train = pd.read_csv(train_f, delimiter='\t', header=None,
                            names=["label", "text"])
        print('data: {}'.format(train.shape))
        tokenizer = Tokenizer(language=lang,
                              train_list=train['text'].tolist(),
                              ngram_range=(1, 1),
                              min_freq=1,
                              max_freq_perc=1.0,
                              vocab_savefolder=savepath)
        print("creating new tokenizer")
        tokenizer2 = Tokenizer(language=lang, vocab_loadfolder=savepath)
        print('loaded vocab: {}'.format(len(tokenizer2.word2ind)))
def train_tokenizer_twitter_sent(datapath='../data/', savepath=None):
    languages = [
        'arabic', 'bulgarian', 'german', 'english', 'spanish', 'persian',
        'croatian', 'hungarian', 'polish', 'portuguese', 'russian', 'slovak',
        'slovenian', 'swedish', 'uyghur', 'chinese'
    ]
    savepath = os.path.join(datapath, 'twitter_sent/vocab') if savepath is None else savepath

    for lang in languages:
        print("\n\n\t\t *** Training tokenizer for {} ***".format(lang))
        train = load_df_twitter_sent(method='train', language=lang, print_fn=print)
        print('data: {}'.format(train.shape))
        # NOTE: We use tokenizer_method = 'clean' because data are already tokenized
        tokenizer = Tokenizer(language=lang,
                              train_list=train['text'].tolist(),
                              tokenizer_method='clean',
                              remove_stopwords=False,
                              ngram_range=(1, 1),
                              min_freq=5,
                              max_freq_perc=1.0,
                              vocab_savefolder=savepath)
        print("creating new tokenizer")
        tokenizer2 = Tokenizer(language=lang,
                               vocab_loadfolder=savepath,
                               tokenizer_method='clean',
                               remove_stopwords=False)
        print('loaded vocab: {}'.format(len(tokenizer2.word2ind)))
    return
def test_tokenizer_add_special_case(text, tokens):
    vocab = Vocab()
    tokenizer = Tokenizer(vocab, {}, None, None, None)
    tokenizer.add_special_case(text, tokens)
    doc = tokenizer(text)
    assert doc[0].text == tokens[0]["orth"]
    assert doc[0].norm_ == tokens[0]["norm"]
    assert doc[1].text == tokens[1]["orth"]
def __init__(self, rootDir='.cache', vectorPath='vectors', tokenizerPath='tokenizer'):
    self.vectorPath = Path.cwd() / rootDir / vectorPath
    self.tokenizerPath = Path.cwd() / rootDir / tokenizerPath
    self.tokenizer = Tokenizer(Vocab())
    self.vectors = Vectors(shape=(41299, 300))
def test_tokenizer_add_special_case_tag(text, tokens):
    vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}})
    tokenizer = Tokenizer(vocab, {}, None, None, None)
    tokenizer.add_special_case(text, tokens)
    doc = tokenizer(text)
    assert doc[0].text == tokens[0]["orth"]
    assert doc[0].tag_ == tokens[0]["tag"]
    assert doc[0].pos_ == "NOUN"
    assert doc[1].text == tokens[1]["orth"]
def test_tokenizer_flush_cache(en_vocab):
    suffix_re = re.compile(r"[\.]$")
    tokenizer = Tokenizer(
        en_vocab,
        suffix_search=suffix_re.search,
    )
    assert [t.text for t in tokenizer("a.")] == ["a", "."]
    tokenizer.suffix_search = None
    assert [t.text for t in tokenizer("a.")] == ["a."]
def transform(self, data):
    tokenizer = Tokenizer(nlp.vocab)
    return np.array([
        np.mean([
            self.model[w.text.lower()] * self.word2weight[w.text.lower()]
            for w in words if w.text.lower() in self.model
        ] or [np.zeros(self.dim)], axis=0)
        for words in tokenizer.pipe(data)
    ])
def get_lemmas(text):
    # nlp = spacy.load("en_core_web_sm-2.2.5", path="airbnb_api/")
    nlp = spacy.load("en_core_web_sm-2.2.5", path="./")
    # nlp = spacy.load("en_core_web_sm")
    # nlp = en_core_web_sm.load()
    tokenizer = Tokenizer(nlp.vocab)

    STOP_WORDS = nlp.Defaults.stop_words.union([
        ' ', 'und', '-', 'die', 'der', 'berlin', 'ein', 'das', 'mit', 'ist',
        'im', 'zu', 'eine', 'es', 'für', 'berlin.', 'zum', 'sind', 'Berlin.',
        '-pron-', 's', 'u', '', "'", ' ', '-PRON-'
    ])

    lemmas = []
    doc = nlp(text)
    for token in doc:
        lemmas.append(token.lemma_)

    lemma_summary = []
    working_set = ""
    for lemma in lemmas:
        working_set += lemma + ' '
    lemma_summary.append(working_set)

    description = [lemma_summary[0]]

    tokens = []
    for doc in tokenizer.pipe(description, batch_size=500):
        doc_tokens = []
        for token in doc:
            if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ != 'PRON'):
                if token.text.lower() not in STOP_WORDS:
                    doc_tokens.append(token.text.lower())
        tokens.append(doc_tokens)

    token_summary = []
    for set_of_tokens in tokens:
        working_set = ""
        for variable in set_of_tokens:
            working_set += variable + ' '
        token_summary.append(working_set)

    return token_summary[0]
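# Hedged usage sketch for get_lemmas (not part of the original source).
# Assumes the local en_core_web_sm-2.2.5 model directory referenced above exists.
# The German listing text is illustrative; the filler words added to STOP_WORDS
# ('und', 'mit', 'berlin', ...) are stripped from the returned string.
example_summary = get_lemmas("Schöne helle Wohnung mit Balkon in Berlin")
print(example_summary)  # space-joined, lower-cased, lemmatised content words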
def test_tokenizer_flush_specials(en_vocab):
    suffix_re = re.compile(r"[\.]$")
    rules = {"a a": [{"ORTH": "a a"}]}
    tokenizer1 = Tokenizer(
        en_vocab,
        suffix_search=suffix_re.search,
        rules=rules,
    )
    assert [t.text for t in tokenizer1("a a.")] == ["a a", "."]
    tokenizer1.rules = {}
    assert [t.text for t in tokenizer1("a a.")] == ["a", "a", "."]
def __init__(self, whitespace_tokenizer_for_coref=True,
             whitespace_tokenizer_for_tokenizer=False):
    # for tokenization
    self.tokenize_nlp = spacy.load('en')
    if whitespace_tokenizer_for_tokenizer:
        self.tokenize_nlp.tokenizer = Tokenizer(self.tokenize_nlp.vocab)

    # for coreference resolution
    self.whitespace_tokenizer_for_coref = whitespace_tokenizer_for_coref
    self.coref_nlp = spacy.load('en')
    if whitespace_tokenizer_for_coref:
        self.coref_nlp.tokenizer = Tokenizer(self.coref_nlp.vocab)
def test_tokenizer_initial_special_case_explain(en_vocab):
    tokenizer = Tokenizer(
        en_vocab,
        token_match=re.compile("^id$").match,
        rules={
            "id": [{"ORTH": "i"}, {"ORTH": "d"}],
        },
    )
    tokens = [t.text for t in tokenizer("id")]
    explain_tokens = [t[1] for t in tokenizer.explain("id")]
    assert tokens == explain_tokens
def test_tokenizer_explain_special_matcher(en_vocab):
    suffix_re = re.compile(r"[\.]$")
    infix_re = re.compile(r"[/]")
    rules = {"a.": [{"ORTH": "a."}]}
    tokenizer = Tokenizer(
        en_vocab,
        rules=rules,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
    )
    tokens = [t.text for t in tokenizer("a/a.")]
    explain_tokens = [t[1] for t in tokenizer.explain("a/a.")]
    assert tokens == explain_tokens
def test_spacy_tokenizer_pipe(nlp):
    tokenizer = Tokenizer(nlp.vocab)

    token_sets = []
    for doc in tokenizer.pipe(DOCUMENTS, batch_size=2):
        doc_tokens = [token.text for token in doc]
        token_sets.append(doc_tokens)

    assert token_sets == [
        ['all', 'the', 'kings', 'men'],
        ['ate', 'all', 'the', 'kings', 'hens'],
        ['until', 'they', 'all', 'got', 'tired', 'and', 'went', 'to', 'sleep', 'zzz'],
    ]
def test_tokenizer_infix_prefix(en_vocab):
    # the prefix and suffix matches overlap in the suffix lookbehind
    infixes = ["±"]
    suffixes = ["%"]
    infix_re = compile_infix_regex(infixes)
    suffix_re = compile_suffix_regex(suffixes)
    tokenizer = Tokenizer(
        en_vocab,
        infix_finditer=infix_re.finditer,
        suffix_search=suffix_re.search,
    )
    tokens = [t.text for t in tokenizer("±10%")]
    assert tokens == ["±10", "%"]
    explain_tokens = [t[1] for t in tokenizer.explain("±10%")]
    assert tokens == explain_tokens
def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab):
    # the prefix and suffix matches overlap in the suffix lookbehind
    prefixes = ["a(?=.)"]
    suffixes = [r"(?<=\w)\.", r"(?<=a)\d+\."]
    prefix_re = compile_prefix_regex(prefixes)
    suffix_re = compile_suffix_regex(suffixes)
    tokenizer = Tokenizer(
        en_vocab,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
    )
    tokens = [t.text for t in tokenizer("a10.")]
    assert tokens == ["a", "10", "."]
    explain_tokens = [t[1] for t in tokenizer.explain("a10.")]
    assert tokens == explain_tokens
def tokenize(df_in):
    """
    Tokenize by inputting a dataframe. Outputs a tokenized list.
    :param df_in:
    :return:
    """
    nlp = spacy.load("en_core_web_lg")

    # Tokenizer
    tokenizer = Tokenizer(nlp.vocab)

    tokens = []
    for doc in tokenizer.pipe(df_in, batch_size=500):
        doc_tokens = [token.text for token in doc]
        tokens.append(doc_tokens)
    return tokens
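# Hedged usage sketch for tokenize (not part of the original source).
# Assumes en_core_web_lg is installed; any iterable of strings (e.g. a pandas
# column) can be passed, and tokens are returned unmodified, stop words included.
print(tokenize(["Keep every token as-is", "No filtering happens here"]))
# [['Keep', 'every', 'token', 'as-is'], ['No', 'filtering', 'happens', 'here']]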
def __init__(self):
    """
    Copied from notebook at app/ml/Build_week_IsaacGrove.ipynb
    """
    self.PICKLE_PATH = path.join(path.dirname(__file__), '..', 'pickles', '')

    # For now I'm loading data from a static link; will try to pull live data
    # in future iterations.
    leafly = pd.read_csv(
        'https://raw.githubusercontent.com/Build-Week-Med-Cabinet-6/DS/mark-dev/data/cannabis.csv'
    )

    # Set up spacy tokenizer
    nlp = English()
    tokenizer = Tokenizer(nlp.vocab)

    # work around for pickle
    self.nlp = nlp

    # clean some missing info
    leafly.replace('None', np.NaN, inplace=True)
    leafly = leafly.dropna()

    # Make tokens out of descriptions
    tokens = []
    for desc in tokenizer.pipe(leafly['Description'], batch_size=500):
        desc_tokens = [token.text for token in desc]
        tokens.append(desc_tokens)
    leafly['tokens'] = tokens
    leafly['tokens'].head()

    # Instantiate vectorizer object
    tfidf = TfidfVectorizer(ngram_range=(1, 2),
                            max_df=.7,
                            min_df=.001,
                            tokenizer=self.tokenize)

    # Create a vocabulary and get word counts per listing
    dtm = tfidf.fit_transform(leafly['Description'])

    # Get feature names to use as dataframe column headers
    dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

    # Fit on dtm
    nn = NearestNeighbors(n_neighbors=20, algorithm='kd_tree')
    nn.fit(dtm)

    self.model = nn
    self.transform = tfidf
    return
def __init__(self, language):
    self.nlp = load_spacy(language)

    custom_infixes = ['/']
    all_prefixes_re = spacy.util.compile_prefix_regex(
        tuple(list(self.nlp.Defaults.prefixes) + custom_infixes))
    infix_re = spacy.util.compile_infix_regex(
        tuple(list(self.nlp.Defaults.infixes) + custom_infixes))
    suffix_re = spacy.util.compile_suffix_regex(
        tuple(list(self.nlp.Defaults.suffixes) + custom_infixes))

    self.nlp.tokenizer = Tokenizer(self.nlp.vocab,
                                   self.nlp.Defaults.tokenizer_exceptions,
                                   prefix_search=all_prefixes_re.search,
                                   infix_finditer=infix_re.finditer,
                                   suffix_search=suffix_re.search,
                                   token_match=None)

    self.matcher = Matcher(self.nlp.vocab)
    self.matcher.add('morphology', None, [
        {"TEXT": {"REGEX": r'^\d\d\d\d$'}},
        {"TEXT": {"REGEX": r'\s'}, "OP": "*"},
        {"TEXT": '/'},
        {"TEXT": {"REGEX": r'\s'}, "OP": "*"},
        {"TEXT": {"REGEX": r'\d'}},
    ])
def clean_mag_data(dataframe, save_path):
    samples = []

    # prepare tokenization functions
    nlp = spacy.load("en_core_web_lg")
    tokenizer = Tokenizer(nlp.vocab)

    # take samples with at least 10 words in citation context
    for index, row in dataframe.iterrows():
        context = row['context']
        text = re.sub("[" + re.escape(string.punctuation) + "]", " ", context)
        text = [token.lemma_ for token in tokenizer(text) if not token.like_num]
        text = [token for token in text if token.strip()]
        if len(text) < MIN_CONTEXT_LENGTH:
            continue

        # generate sample in correct format
        # "paper_id": row['paperid'],
        sample = {
            "context": context,
            "authors_citing": row['citingauthors'],
            "title_cited": row['citedtitle'],
            "authors_cited": row['citedauthors']
        }
        samples.append(pd.DataFrame(sample, index=[0]))

    logger.info("mag samples ready to load to file...")
    dataset = pd.concat(samples, axis=0)
    dataset.to_csv(save_path, compression=None, index=False, index_label=False)
def __init__(self, rollout_num, vocab):
    # self.new_net = copy.deepcopy(net)
    self.vocab = vocab
    self.tokenizer = Tokenizer(Vocab(strings=list(vocab.labelToIdx.keys())))
    self.rollout_num = rollout_num
    self.parser = StanfordParser(annots='tokenize')
def transform(self, data):
    tokenizer = Tokenizer(nlp.vocab)
    return np.array([
        np.mean([
            self.model[w.text.lower()]
            for w in words if w.text.lower() in self.model
        ] or [np.zeros(self.dim)], axis=0)
        for words in tokenizer.pipe(data)
    ])
def search_func(user_input, num_results=10):
    """
    Flexible function that searches for cannabis strains.

    ### Request Body
    - user_input str
    - num_results int: default 10

    ### Response
    - `strain_recommendations`: dictionary of strain recommendations
    """
    user_input = [user_input]

    nlp = English()
    tokenizer = Tokenizer(nlp.vocab)

    tf = TfidfVectorizer(stop_words='english')
    dtm = tf.fit_transform(df['search'])
    dtm = pd.DataFrame(dtm.todense(), columns=tf.get_feature_names())

    nr = num_results
    nn = NearestNeighbors(n_neighbors=nr, algorithm='ball_tree')
    nn.fit(dtm)

    dtf = tf.transform(user_input)
    _, output = nn.kneighbors(dtf.todense())

    recommendations = []
    for n in output:
        for row in n:
            recommendations.append(row)

    result = []
    for i in recommendations:
        data = df.loc[i, :]
        result.append(data)

    return {'strain_recommendations': result}
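# Hedged usage sketch for search_func (not part of the original source).
# Assumes the module-level DataFrame `df` with its 'search' column has been
# loaded; the query string is illustrative.
example = search_func("relaxing strain to help with sleep", num_results=3)
print(example['strain_recommendations'][0])  # the closest matching strain row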
def load_data(data_path: str, tokenize: bool = False,
              tokenizer_type: str = "just_spaces") -> List[str]:
    if tokenizer_type == "just_spaces":
        tokenizer = SpacyWordSplitter()
    elif tokenizer_type == "spacy":
        nlp = spacy.load('en')
        tokenizer = Tokenizer(nlp.vocab)
    tokenized_examples = []
    with tqdm(open(data_path, "r"), desc=f"loading {data_path}") as f:
        for line in f:
            if data_path.endswith(".jsonl") or data_path.endswith(".json"):
                example = json.loads(line)
            else:
                example = {"text": line}
            if tokenize:
                if tokenizer_type == 'just_spaces':
                    tokens = list(map(str, tokenizer.split_words(example['text'])))
                elif tokenizer_type == 'spacy':
                    tokens = list(map(str, tokenizer(example['text'])))
                text = ' '.join(tokens)
            else:
                text = example['text']
            tokenized_examples.append(text)
    return tokenized_examples
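# Hedged usage sketch for load_data (not part of the original source).
# The path is illustrative; for a .jsonl file each line is expected to be a JSON
# object with a "text" field, and tokenize=True re-joins the spaCy tokens with spaces.
examples = load_data("data/train.jsonl", tokenize=True, tokenizer_type="spacy")
print(examples[0])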
def _create_tokenizer(nlp):
    infix_re = spacy.util.compile_infix_regex(
        TOKENIZER_INFIXES + [
            # u'\w*[,-.–_—:;\(\)\[\]\{\}/]{1,3}\S\w*',
            # r'\w*[,\-.\-_:;\(\)\[\]\{\}\/]{1,3}\S\w*',
            # r'((?P<start_with_non_whitespace_and_one_or_more_punctation>\b\S+|[,.-_:;\(\)\[\]\{\}/\+])(
            # ?P<has_1_or_more_punctation>[,.-_:;\(\)\[\]\{\}/\+])+(
            # ?P<ends_with_non_whitespace_or_non_terminating_punctation>\S+\b[,.-_:;\(\)\[\]\{\}/\+]|[,.-_:;\(\)\[
            # \]\{\}/\+|\-]|\S+\b))',
            # r'\w*\S-\S*\w',
            # u'\w*\S–\S*\w',
            # u'\w*\S—\S*\w',
            # u'\w*[,-.–_—:;\(\)\[\]\{\}/]{1,3}\S\w*'
            ur'(?P<start_with_non_whitespace_and_one_or_more_punctation>\b\S*|[,.-_-:–;—\(\[\{/\+]?)('
            ur'?P<has_1_or_more_punctation>[,.-_-:–;—\(\)\[\]\{\}/\+])+('
            ur'?P<ends_with_non_whitespace_or_non_terminating_punctation>\S+\b[,.-_-:–;—\)\]\}/\+]|[,'
            ur'.-_-:–;—\)\]\}/\+}]|\S+\b)'
        ])

    # TODO: prefix and suffix raise TypeError: '_regex.Pattern' object is not callable
    # prefix_boundaries_to_keep = ur'\) \] \} \> , . - _ - : – ; — \+ -'.split()
    # suffix_boundaries_to_keep = ur'\( \[ \{ \< , . - _ - : – ; — \+ -'.split()
    # prefixe_re = spacy.util.compile_prefix_regex([i for i in TOKENIZER_PREFIXES if i not in
    #                                               prefix_boundaries_to_keep])
    # suffixe_re = spacy.util.compile_suffix_regex([i for i in TOKENIZER_SUFFIXES if i not in
    #                                               suffix_boundaries_to_keep])
    #
    # return Tokenizer(nlp.vocab, {}, prefixe_re.search, suffixe_re.search,
    #                  infix_re.finditer)
    return Tokenizer(nlp.vocab, {}, nlp.tokenizer.prefix_search,
                     nlp.tokenizer.suffix_search, infix_re.finditer)
def get_spacy_tokens(self, spacy_sentence, target_word, spacy_model):
    """
    A function to locate the target phrase spacy tokens in a spacy doc of a whole sentence.

    Args:
        spacy_sentence: spacy doc for the context sentence
        target_word: the target word/phrase (string) to look up in the sentence
        spacy_model: the loaded spacy model whose vocab is used for tokenisation

    Returns:
        spacy_token_list: a list of the spacy tokens for the target phrase,
            using the information from the context sentence.
    """
    # Create the tokeniser
    tokenizer = Tokenizer(spacy_model.vocab)

    spacy_token_list = []
    for target in tokenizer(target_word):
        for wd in spacy_sentence:
            if target.text == wd.text:
                spacy_token_list.append(wd)
                break
    return spacy_token_list
def test_create(self):
    vocab = Vocab()
    dummy_re = re.compile(r'sklfb;s')
    # Tokenizer expects callables (search/finditer), not compiled pattern objects.
    tokenizer = Tokenizer(vocab, {}, dummy_re.search, dummy_re.search, dummy_re.finditer)
    doc = tokenizer(u'I am a document.')
    self.assertEqual(len(doc), 4)
def custom_tokenizer(self):
    """
    Defines a custom tokenizer based on the spacy pipeline loaded in `self.nlp`.

    return: prepared Tokenizer
    """
    infixes = (
        LIST_ELLIPSES
        + LIST_ICONS
        + [
            r"(?<=[0-9])[+\-\*^](?=[0-9-])",
            r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES),
            r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
            # r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
            r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
        ])
    infix_re = compile_infix_regex(infixes)

    return Tokenizer(self.nlp.vocab,
                     prefix_search=self.nlp.tokenizer.prefix_search,
                     suffix_search=self.nlp.tokenizer.suffix_search,
                     infix_finditer=infix_re.finditer,
                     token_match=self.nlp.tokenizer.token_match,
                     rules=self.nlp.Defaults.tokenizer_exceptions)
def main(output_dir):
    ensure_dir(output_dir)
    ensure_dir(output_dir, "pos")
    ensure_dir(output_dir, "vocab")

    vocab = Vocab(tag_map=TAG_MAP)
    tokenizer = Tokenizer(vocab, {}, None, None, None)

    # The default_templates argument is where features are specified. See
    # spacy/tagger.pyx for the defaults.
    tagger = Tagger.blank(vocab, Tagger.default_templates())

    for i in range(5):
        for words, tags in DATA:
            tokens = tokenizer.tokens_from_list(words)
            tagger.train(tokens, tags)
        random.shuffle(DATA)

    tagger.model.end_training(path.join(output_dir, 'pos', 'model'))
    vocab.strings.dump(path.join(output_dir, 'vocab', 'strings.txt'))
def count_freqs(input_loc, output_loc):
    print(output_loc)
    vocab = English.default_vocab(get_lex_attr=None)
    tokenizer = Tokenizer.from_dir(vocab,
                                   path.join(English.default_data_dir(), 'tokenizer'))

    counts = PreshCounter()
    for json_comment in iter_comments(input_loc):
        doc = tokenizer(json_comment['body'])
        doc.count_by(ORTH, counts=counts)

    with io.open(output_loc, 'w', 'utf8') as file_:
        for orth, freq in counts:
            string = tokenizer.vocab.strings[orth]
            if not string.isspace():
                file_.write('%d\t%s\n' % (freq, string))
def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
    """Test that custom tokenizer with not all functions defined can be
    serialized and deserialized correctly (see #2494)."""
    tokenizer = Tokenizer(en_vocab, suffix_search=en_tokenizer.suffix_search)
    tokenizer_bytes = tokenizer.to_bytes()
    Tokenizer(en_vocab).from_bytes(tokenizer_bytes)
def test_load(self):
    data_dir = English.default_data_dir()
    vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
    tokenizer = Tokenizer.from_dir(vocab, path.join(data_dir, 'tokenizer'))