import re

# NOTE: `token_pattern` (a regex string), `stem_tokens`, and `english_stemmer`
# are assumed to be module-level helpers; see the sketch below this function.
def preprocess_data(line, token_pattern=token_pattern, encode_digit=False):
    # re.LOCALE cannot be combined with str patterns in Python 3, so only
    # re.UNICODE is used here.
    token_pattern = re.compile(token_pattern, flags=re.UNICODE)
    # tokenize (lowercasing every token)
    tokens = [x.lower() for x in token_pattern.findall(line)]
    # stem
    tokens_stemmed = stem_tokens(tokens, english_stemmer)
    # `encode_digit` is accepted for signature compatibility but unused here
    return tokens_stemmed
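# --- Hedged sketch of the assumed module-level helpers used above. The exact
# regex, stemmer, and stop word list are illustrative guesses (NLTK-based),
# not the original module's definitions. Requires nltk plus its "stopwords"
# corpus (nltk.download("stopwords")).
import nltk

token_pattern = r"(?u)\b\w\w+\b"                        # assumed: 2+ word chars
english_stemmer = nltk.stem.SnowballStemmer("english")  # assumed stemmer choice
stopwords = set(nltk.corpus.stopwords.words("english"))

def stem_tokens(tokens, stemmer):
    # apply the stemmer token by token
    return [stemmer.stem(token) for token in tokens]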
def preprocess_data(line):
    # tokenize
    tokens = token_pattern.findall(line)
    # stem
    tokens_stemmed = nlp_utils.stem_tokens(tokens, nlp_utils.english_stemmer)
    # stop word removal
    tokens_stemmed = [x for x in tokens_stemmed if x not in nlp_utils.stopwords]
    return tokens_stemmed
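# Note: unlike the other variants here, this one neither lowercases tokens nor
# compiles its own pattern; it expects a precompiled module-level
# `token_pattern` and an `nlp_utils` module, and its stop word filter is
# case-sensitive as written.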
def imojify_input(line, src_lang="en"):
    line = line.lower()
    # split into sentences, then stem and emojify each one
    sents = nlp_utils.tokenize(line)
    imojified = []
    for s in sents:
        imojified.append(imojify_sentence(nlp_utils.stem_tokens(s, src_lang), src_lang))
    return imojified
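# --- Hedged stub: `imojify_sentence` is external and its real behavior is
# unknown; the lookup-table version below is purely hypothetical, as is the
# EMOJI_TABLE it reads from.
EMOJI_TABLE = {"en": {"dog": "🐶", "cat": "🐱", "happi": "😊"}}  # hypothetical

def imojify_sentence(tokens, src_lang="en"):
    table = EMOJI_TABLE.get(src_lang, {})
    # substitute an emoji where a mapping exists, otherwise keep the token
    return [table.get(tok, tok) for tok in tokens]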
def preprocess_data(line, token_pattern=token_pattern,
                    exclude_stopword=config.cooccurrence_word_exclude_stopword,
                    encode_digit=False):
    # as above, re.LOCALE is dropped: it is invalid for str patterns in Python 3
    token_pattern = re.compile(token_pattern, flags=re.UNICODE)
    # tokenize (lowercased)
    tokens = [x.lower() for x in token_pattern.findall(line)]
    # stem
    tokens_stemmed = stem_tokens(tokens, english_stemmer)
    # optional stop word removal, gated by a config flag
    if exclude_stopword:
        tokens_stemmed = [x for x in tokens_stemmed if x not in stopwords]
    return tokens_stemmed
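# Example call, assuming the helper sketch above is in scope and the config
# flag is True; the exact stems depend on the stemmer, e.g. roughly:
#   preprocess_data("Accessories for the iPhone")
#   -> ['accessori', 'iphon']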