def expand_match(contraction):
    # Substitution callback: replace a regex match of a contraction with its
    # expanded form from CONTRACTION_MAP.
    match = contraction.group(0)
    first_char = match[0]
    # Try an exact lookup first, then fall back to a lowercase lookup.
    expanded_contraction = (CONTRACTION_MAP.get(match)
                            or CONTRACTION_MAP.get(match.lower()))
    # Preserve the casing of the original first character.
    expanded_contraction = first_char + expanded_contraction[1:]
    return expanded_contraction
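
# A minimal usage sketch for expand_match, assuming CONTRACTION_MAP is a
# lowercase-keyed dict such as the two-entry example below (the map and the
# sample sentence are illustrative, not part of the original code):
import re

CONTRACTION_MAP = {"can't": "cannot", "i'm": "i am"}
pattern = re.compile('({})'.format('|'.join(CONTRACTION_MAP.keys())),
                     flags=re.IGNORECASE | re.DOTALL)
print(pattern.sub(expand_match, "I'm sure we Can't stop now."))
# -> "I am sure we Cannot stop now."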
def get_cleaned_text_data():
    df = pd.read_sql('SELECT * FROM security_tweet_data', con=con)

    # --- preprocessing ---
    # Strip URLs.
    df['sql_tweet_text'] = df['sql_tweet_text'].replace(
        r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''',
        '', regex=True)
    # Strip @mentions.
    df['sql_tweet_text'] = df['sql_tweet_text'].replace(r'\B@\w+', '', regex=True)
    # Expand contractions token by token.
    df['sql_tweet_text'] = df['sql_tweet_text'].apply(lambda x: [
        CONTRACTION_MAP[item.lower()] if item.lower() in CONTRACTION_MAP
        else item for item in str(x).split()
    ])
    # Drop possessive "'s" endings that the contraction map does not cover.
    df['sql_tweet_text'] = df['sql_tweet_text'].apply(lambda x: [
        item[:-2] if item[-2:] in ("'s", "’s") else item for item in x
    ])
    df['sql_tweet_text'] = df['sql_tweet_text'].str.join(' ')
    # Keep alphanumeric characters and spaces only.
    df['sql_tweet_text'] = df['sql_tweet_text'].replace('[^a-zA-Z0-9 ]', '',
                                                        regex=True)
    # Lowercase, then drop pure numbers and stop words.
    df['sql_tweet_text'] = df['sql_tweet_text'].apply(lambda x: [
        item.lower() for item in str(x).split()
        if not item.isdigit() and item not in stop
    ])
    df['sql_tweet_text'] = df['sql_tweet_text'].str.join(' ')
    # df['sql_tweet_text'] = df['sql_tweet_text'].apply(
    #     lambda x: ''.join(item[0] for item in itertools.groupby(x)))
    # Lemmatize, keeping tokens of 3-10 characters that are not stop words.
    df['sql_tweet_text'] = df['sql_tweet_text'].apply(lambda x: ' '.join([
        lemmatizer.lemmatize(item.lower()) for item in str(x).split()
        if 2 < len(item) <= 10 and item not in stop
    ]))
    return df
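
# get_cleaned_text_data relies on module-level globals that are not defined
# here: a DB connection `con`, a stop-word collection `stop`, a WordNet
# `lemmatizer`, and `CONTRACTION_MAP`. A plausible setup sketch follows; the
# SQLite file name is a placeholder assumption:
import sqlite3

import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

con = sqlite3.connect('tweets.db')  # hypothetical database file
stop = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
CONTRACTION_MAP = {"can't": "cannot", "i'm": "i am"}  # truncated example map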
def processing_text(tweetobj):
    # Reduce a tweet to a list of stemmed content words: strip links and
    # @mentions, expand contractions, drop punctuation, stop words, and
    # numbers. Hashtag words are kept, minus the '#'.
    no_links = re.sub(
        r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''',
        '', tweetobj)
    no_hashtags_ats = re.sub(r'\B@\w+', '', no_links)
    expand_contractions = [
        CONTRACTION_MAP[item.lower()] if item.lower() in CONTRACTION_MAP
        else item for item in str(no_hashtags_ats).split()
    ]
    # Drop possessive "'s" endings that the contraction map does not cover.
    remove_extra_contractions = ' '.join([
        item[:-2] if item[-2:] in ("'s", "’s") else item
        for item in expand_contractions
    ])
    alphanumeric_only = re.sub(r'[^a-zA-Z0-9 ]', '', remove_extra_contractions)
    minus_stopw_and_hashes = ' '.join([
        item.lower() for item in str(alphanumeric_only).split()
        if not item.isdigit() and item not in stopwords
    ])
    # Collapse runs of repeated characters (e.g. "soooo" -> "so").
    minus_repeat_characters = ''.join(
        item[0] for item in itertools.groupby(minus_stopw_and_hashes))
    # Stem tokens of 4-10 characters.
    filtered_words = [
        snowballstemmer.stem(item.lower())
        for item in str(minus_repeat_characters).split()
        if 3 < len(item) <= 10
    ]
    return filtered_words
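
# A usage sketch for processing_text. The function reads several globals;
# note that `stopwords` must be a collection of words (not the nltk module)
# and `snowballstemmer` a stemmer instance. The sample tweet is invented:
import itertools
import re

from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import SnowballStemmer

stopwords = set(nltk_stopwords.words('english'))
snowballstemmer = SnowballStemmer('english')
CONTRACTION_MAP = {"can't": "cannot"}  # truncated example map

print(processing_text("We can't patch this!! https://example.com @user #infosec"))
# -> ['canot', 'patch', 'infosec']; the repeated-character collapse also
# squeezes legitimate double letters, e.g. "cannot" becomes "canot".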
def expand_contractions(text):
    # Build one alternation pattern over all contractions in the map.
    pattern = re.compile('({})'.format('|'.join(CONTRACTION_MAP.keys())),
                         flags=re.DOTALL | re.IGNORECASE)

    def replace_text(t):
        txt = t.group(0)
        if txt.lower() in CONTRACTION_MAP:
            return CONTRACTION_MAP[txt.lower()]
        # Return the match unchanged when it is not in the map; returning
        # None here would make re.sub raise a TypeError.
        return txt

    expand_text = pattern.sub(replace_text, text)
    return expand_text
def expand_contractions(text):
    contractions_pattern = re.compile('({})'.format('|'.join(
        CONTRACTION_MAP.keys())), flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        # Exact lookup first, then lowercase, preserving the original
        # casing of the first character.
        expanded_contraction = (CONTRACTION_MAP.get(match)
                                or CONTRACTION_MAP.get(match.lower()))
        expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    # Drop any apostrophes that remain after expansion.
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text
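
# A usage sketch for expand_contractions (either variant above, whichever is
# in scope), with an illustrative two-entry map; the real CONTRACTION_MAP is
# assumed to be defined elsewhere:
import re

CONTRACTION_MAP = {"can't": "cannot", "won't": "will not"}
print(expand_contractions("We Can't and won't give up."))
# -> "We Cannot and will not give up." with this second variant, which keeps
# the original casing of the first character and strips leftover apostrophes.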
def clean_text(text):
    '''Remove unwanted characters and stop words, expand contractions, and
    lemmatize the text so that word-embedding lookups produce fewer misses.'''
    # Convert words to lower case
    text = text.lower()

    # Expand contractions first: the expansion relies on apostrophes, which
    # the punctuation cleanup below strips out.
    contractions_pattern = re.compile('({})'.format('|'.join(
        CONTRACTION_MAP.keys())), flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = (CONTRACTION_MAP.get(match)
                                or CONTRACTION_MAP.get(match.lower()))
        expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction

    text = contractions_pattern.sub(expand_match, text)

    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r"'", '', text)

    # Remove stop words
    words = text.split()
    stops = set(stopwords.words('english'))
    words = [w for w in words if w not in stops]
    text = ' '.join(words)
    logging.info('sentence cleaned')

    # Lemmatization, using POS tags so verbs and adjectives lemmatize correctly
    wordnet_lemmatizer = WordNetLemmatizer()

    def get_tag(tag):
        # Map a Penn Treebank tag prefix to a WordNet POS constant.
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        else:
            return ''

    text_result = []
    tokens = word_tokenize(text)  # Generate list of tokens
    tagged = pos_tag(tokens)
    for t in tagged:
        try:
            text_result.append(
                wordnet_lemmatizer.lemmatize(t[0], get_tag(t[1][:2])))
        except KeyError:
            # get_tag returned '' for an unmapped tag; fall back to the
            # default noun lemmatization.
            text_result.append(wordnet_lemmatizer.lemmatize(t[0]))
    paragraph = ' '.join(str(x) for x in text_result)
    return paragraph
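
# clean_text depends on NLTK resources and a CONTRACTION_MAP global; a
# minimal driver sketch (the sample sentence and one-entry map are invented,
# and the downloads use the standard NLTK resource ids):
import logging
import re

from nltk import download, pos_tag, word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

for pkg in ('punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    download(pkg, quiet=True)

CONTRACTION_MAP = {"doesn't": "does not"}  # truncated example map
print(clean_text("It doesn't work. The cats were running!"))
# roughly -> "work cat run"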
def replace_text(t):
    # Substitution callback: swap a matched contraction for its expansion.
    txt = t.group(0)
    if txt.lower() in CONTRACTION_MAP:
        return CONTRACTION_MAP[txt.lower()]
    # Return the match unchanged when it is not in the map; returning None
    # would make re.sub raise a TypeError.
    return txt