import re


def preprocess_text(text, remove_stopwords=True):
    # The module-level `tokenizer` (e.g. a spaCy pipeline) must be set before
    # this is called; see the usage sketch below.
    global tokenizer

    text = text.lower().strip()
    tokens = tokenizer(text)
    text = [tok.text for tok in tokens]

    # Optionally, remove stop words
    if remove_stopwords:
        # stops = set(stopwords.words("english"))
        own_stopword = ['the', 'on', 'a', 'an', 'it', 'be', 'has', 'some',
                        'my', 'me', 'i']
        stops = own_stopword
        text = [w for w in text if w not in stops]

    text = " ".join(text)

    # Detach a trailing '.' or '?' from the last token (guard against empty text).
    if text and text[-1] in ['.', '?']:
        text = text[:-1] + ' ' + text[-1]

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,:!.\/'+=@-]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r" n ", " and ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    # text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ", text)
    # text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    # text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    # text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(' +', ' ', text)

    return text
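# A minimal usage sketch for preprocess_text. Assumptions (not fixed by this
# module): spaCy is installed, and the module-level `tokenizer` is any callable
# that yields objects with a `.text` attribute; a blank English pipeline is
# used below only as an example.
#
#     import spacy
#     tokenizer = spacy.blank("en")
#     cleaned = preprocess_text("What's the plan? I'm on it!")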
def _spacy_tokenize(x, spacy):
    return [tok.text for tok in spacy.tokenizer(x)]
def tokenize_text(text):
    # Uses the module-level spaCy pipeline bound to the (shadowed) name `spacy`.
    return [token.text for token in spacy.tokenizer(text)]
def spacy_tokenize(x):
    # Tokenize with the module-level spaCy pipeline, dropping punctuation
    # and whitespace tokens.
    return [
        tok.text
        for tok in spacy.tokenizer(x)
        if not (tok.is_punct or tok.is_space)
    ]
def spacyTokenize(spacy, lines):
    # Tokenize each sentence in `lines` and re-join its tokens with spaces.
    lines_new = []
    for sent in lines:
        sent_new = ' '.join([tok.text for tok in spacy.tokenizer(sent)])
        lines_new.append(sent_new)
    return lines_new
def tokenize(text, spacy):
    return [tok.text for tok in spacy.tokenizer(text)]
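# A minimal, runnable demo of the tokenizer helpers above. Assumptions of this
# sketch: spaCy is installed, and a blank English pipeline stands in for
# whatever pipeline the rest of the project actually uses.
if __name__ == "__main__":
    import spacy as spacy_lib

    nlp = spacy_lib.blank("en")

    # Helpers that take the pipeline explicitly.
    print(tokenize("Hello, world!", nlp))         # ['Hello', ',', 'world', '!']
    print(_spacy_tokenize("Hello, world!", nlp))  # same as above
    print(spacyTokenize(nlp, ["First line.", "Second line."]))

    # tokenize_text and spacy_tokenize read a module-level pipeline bound to
    # the (shadowed) name `spacy`; bind it here just for the demo.
    spacy = nlp
    print(tokenize_text("Hello, world!"))
    print(spacy_tokenize("Hello, world!"))        # punctuation and spaces dropped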