import string
from string import digits

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


def tokenize_lemm(text):
    lemmatizer = WordNetLemmatizer()
    # Map every digit and punctuation character to a space.
    digits_tbl = {ord(d): u' ' for d in digits}
    punc_tbl = {ord(p): u' ' for p in string.punctuation}
    text = text.translate(digits_tbl)
    text = text.translate(punc_tbl)
    tokens = word_tokenize(text)
    lemmas = lemm_tokens(tokens, lemmatizer)
    return lemmas

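# A minimal sketch of the lemm_tokens helper the function above depends on,
# plus a usage example. Only the helper's name appears in the source, so its
# body is an assumption; NLTK's punkt and wordnet data must be downloaded.
def lemm_tokens(tokens, lemmatizer):
    # Lemmatize each token with WordNet's default (noun) part of speech.
    return [lemmatizer.lemmatize(token) for token in tokens]

# tokenize_lemm("3 engineers, 2 designers")  ->  ['engineer', 'designer']
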
import string
from string import digits

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Optional company/boilerplate stop words (deduplicated and lowercased) that
# could additionally be filtered out, kept here for reference:
# additional_stop_words = {
#     '--', 'accenture', 'age', 'amazone', 'bonus', 'ca', 'citizenship',
#     'city', 'color', 'committed', 'compliance', 'core', 'corporate',
#     'country', 'disabilities', 'disability', 'diverse', 'diversity',
#     'ebay', 'emc', 'employment', 'encouraged', 'environment', 'equal',
#     'fair', 'francisco', 'gender', 'genetic', 'genetics', 'google',
#     'hewlett', 'ibm', 'id', 'identity', 'immigration', 'intel',
#     'involvement', 'job', 'jobs', 'join', 'marital', 'microsoft', 'ms',
#     'national', 'opportunity', 'oracle', 'orientation', 'origin',
#     'packard', 'position', 'proud', 'race', 'regard', 'regarding',
#     'regardless', 'religion', 'required', 'rewarding', 'role', 'salary',
#     'san', 'sexual', 'status', 'travel', 'twitter', 'type', 'usa',
#     'veteran', 'work', 'workplace', 'yahoo'}


def tokenize_stem(text):
    stemmer = PorterStemmer()
    # Map every digit and punctuation character to a space.
    digits_tbl = {ord(d): u' ' for d in digits}
    punc_tbl = {ord(p): u' ' for p in string.punctuation}
    text = text.translate(digits_tbl)
    text = text.translate(punc_tbl)
    tokens = word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

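# A matching sketch of the stem_tokens helper assumed above; as with
# lemm_tokens, only the name appears in the source, so the body is an
# assumption.
def stem_tokens(tokens, stemmer):
    # Apply the Porter stemmer to each token.
    return [stemmer.stem(token) for token in tokens]

# tokenize_stem("running 3 engineering teams")  ->  ['run', 'engin', 'team']
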
import re


def preprocess(text):
    # Operates on bytes, not str.
    text = re.sub(b"<.*?>", b" ", text)  # strip HTML tags
    text = re.sub(b"\n", b" ", text)     # strip newlines
    text = re.sub(b"\r", b" ", text)     # strip carriage returns
    # Delete punctuation (bytes.translate with a None table) and lowercase.
    text = text.translate(None, b'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~').lower()
    # Drop the fixed-length footer shared by all the reviews.
    text = text[:-579]
    return text

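# Usage sketch. The input must be bytes, and the 579-byte footer length is
# specific to the original corpus; the padding below only stands in for it.
raw = b"<p>Great product!</p>\nWould buy again.\r" + b"x" * 579
preprocess(raw)  # b' great product  would buy again '
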
import re
import string


def sent_list(docs, splitStr='__label__'):
    sent_analysis = []
    for i in range(1, len(docs)):
        text = str(docs[i])
        splitText = text.split(splitStr)
        secHalf = splitText[1]
        sentiment = secHalf[0]                      # label digit after the prefix
        text = secHalf[2:len(secHalf) - 1].lower()  # body minus the trailing newline
        # Mask URLs while the punctuation they rely on is still present.
        if 'www.' in text or 'http:' in text or 'https:' in text or '.com' in text:
            text = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", text)
        text = re.sub(r'\d+', '', text)
        # Strip punctuation last; note this also reduces "<url>" to "url".
        table = str.maketrans(' ', ' ', string.punctuation)
        text = text.translate(table)
        sent_analysis.append([text, sentiment])
    return sent_analysis

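# Usage sketch on fastText-style labelled lines. The loop starts at index 1,
# so the first entry is treated as a header and skipped.
docs = [
    '__label__1 header\n',
    '__label__2 loved it, see https://example.com today\n',
    '__label__1 awful quality\n',
]
sent_list(docs)
# [['loved it see url today', '2'], ['awful quality', '1']]
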
import re
import string

import nltk


def tokenize_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)  # drop URLs
    # str.translate needs a translation table, not the raw punctuation string.
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = nltk.word_tokenize(text)
    words = [re.sub("[^A-Za-z0-9]", "", word) for word in words]
    stop_words = set(nltk.corpus.stopwords.words("english"))  # build once, not per token
    final_words = []
    for word in words:
        if not word:
            continue
        if word in stop_words:
            continue
        # Redundant once punctuation is stripped above; kept defensively.
        if word.startswith("@") or word.startswith("#"):
            continue
        if word.isnumeric():
            continue
        final_words.append(word)
    return final_words

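# Usage sketch; requires NLTK's punkt and stopwords corpora.
tokenize_text("Check https://example.com - it has 3 GREAT deals!!!")
# ['check', 'great', 'deals']
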
import re
import string

from nltk.stem import PorterStemmer


def clean_text(text, remove_stop_words=False):
    text = text.lower()
    # Replace every punctuation character with a space.
    replace_punctuation = str.maketrans(string.punctuation,
                                        ' ' * len(string.punctuation))
    text = text.translate(replace_punctuation)
    text = re.sub(r'\s+', ' ', text)  # collapse runs of whitespace
    text = re.sub('[\n\r]', '', text)
    if remove_stop_words:
        # STOPWORDS is expected to be defined at module level.
        stemmer = PorterStemmer()
        new_text = [stemmer.stem(word) for word in text.split()
                    if word not in STOPWORDS]
        text = ' '.join(new_text)
    return text

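# Usage sketch; STOPWORDS is a module-level set not shown in the source.
# One plausible definition:
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

clean_text("Running FAST,\nreally fast!", remove_stop_words=True)
# 'run fast realli fast'
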
import string
from functools import reduce


def clean_text(text, country):
    # Expand contractions first (contractions maps e.g. "don't" -> "do not").
    text = reduce(lambda a, kv: a.replace(*kv), contractions.items(), text.lower())
    text = text.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ')
    text = strip_accents(text)
    # Replace every punctuation character with a space.
    text = text.translate(
        str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    tokens = tk.tokenize(text)
    if country == 'USA':
        stopwords = usa_stopwords
    elif country == 'Canada':
        stopwords = canada_stopwords
    elif country == 'UK':
        stopwords = britain_stopwords
    else:
        raise ValueError("Country is invalid.")
    # Keep tokens that are not stop words, are longer than two characters,
    # and are not purely numeric.
    tokens = [
        w for w in tokens
        if w not in stopwords and len(w) > 2 and w != ' ' and not w.isdigit()
    ]
    return ' '.join(tokens)

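# Usage sketch. contractions, strip_accents, tk, and the per-country
# stop-word sets are module-level dependencies not shown in the source;
# everything below is an illustrative assumption.
import unicodedata
import nltk

contractions = {"don't": "do not"}
usa_stopwords = {'the', 'and', 'our'}
tk = nltk.tokenize.WhitespaceTokenizer()

def strip_accents(s):
    # Decompose accented characters and drop the combining marks.
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

clean_text("Don't miss the 2024 Sales-Event!", 'USA')
# 'not miss sales event'
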
import string


def cleanup(text):
    # textClean is an external helper assumed to be defined elsewhere.
    text = textClean(text)
    # Delete every punctuation character (str.maketrans replaces the
    # Python 2-only string.maketrans used originally).
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

import string


def remove_punc(text):
    # Delete every punctuation character from the text.
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)

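# Usage sketch:
remove_punc("well-known (and loved)")  # 'wellknown and loved'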