def _clean_line(self, text):
    # Replace URLs and @-mentions with placeholder tokens, drop hashtags,
    # then lowercase and trim surrounding whitespace.
    text = self._re_sub(r"http\S+", "<url>", text)
    text = self._re_sub(r"@[A-Za-z0-9]+", "<user>", text)
    text = self._re_sub(r"#[A-Za-z0-9]+", "", text)
    text = text.lower()
    text = text.strip()
    return text

import re

# Assumed definition; the original module-level constant is not shown.
REPLACE_BY_SPACE = re.compile(r"[/(){}\[\]\|@,;]")

def preprocess_text(text):
    """
    Preprocess text: removes links, mentions, hashtags, punctuation,
    extra spaces, and short words, and expands common contractions.

    Parameters
    ----------
    text: str
        a string to be preprocessed

    Returns
    -------
    text: str
        a preprocessed string
    """
    text = text.lower()                                # lowercase
    text = re.sub(r"http\S+", "", text)                # remove links
    text = re.sub(r"@\S+", "", text)                   # remove mentions
    text = re.sub(r"#\S+", "", text)                   # remove hashtags
    text = re.sub(r"won't", "will not", text)          # expand contraction ("won't" -> "will not")
    text = re.sub(r"n't", " not", text)                # expand remaining "n't" contractions
    text = REPLACE_BY_SPACE.sub(' ', text)             # replace punctuation with space
    words = [word.strip() for word in text.split()]    # strip whitespace from tokens
    words = [word for word in words if len(word) > 2]  # drop words of 2 characters or fewer
    words = [word for word in words if word != 'amp']  # drop Twitter's HTML-escaped 'amp'
    return ' '.join(words)
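
# A minimal usage sketch for preprocess_text above; the sample tweet is made
# up for illustration, and the output assumes the placeholder REPLACE_BY_SPACE
# pattern defined above (which does not cover "!").
sample = "RT @user: I won't miss this!! https://t.co/abc #hype"
print(preprocess_text(sample))
# -> "will not miss this!!" ("rt" and "i" are dropped by the length filter)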

import re
import string

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

def preprocess(self, text):
    punctuation_edit = string.punctuation + "0123456789"
    text = text.lower()
    # remove numbers
    text = re.sub(r'\d+', '', text)
    # collapse extra whitespace
    text = " ".join(text.split())
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # remove stop words; the original remove() loop only dropped the first
    # occurrence of each stop word, so filter every token instead
    stop_words = set(stopwords.words('english'))
    word_tokens = [word for word in word_tokenize(text) if word not in stop_words]
    # lemmatize, treating each token as a verb
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(word, pos='v') for word in word_tokens]
    text = ' '.join(lemmas)
    # tokenize and pad to a fixed length of 100
    # NOTE: fitting a new Tokenizer per call gives every text its own
    # vocabulary; the tokenizer should be fit once on the training corpus
    tok = Tokenizer(num_words=20000, filters=punctuation_edit)
    tok.fit_on_texts([text])
    seq = tok.texts_to_sequences([text])
    pad = sequence.pad_sequences(seq, maxlen=100)
    return pad
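
# A minimal sketch of the fit-once pattern flagged in the NOTE above: fit the
# Tokenizer on the training corpus a single time, then reuse its vocabulary
# for every new text. The corpus and helper name here are illustrative
# assumptions, not part of the original code.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

corpus = ["first training tweet", "second training tweet"]
tok = Tokenizer(num_words=20000)
tok.fit_on_texts(corpus)                       # fit once on training data

def texts_to_padded(texts, maxlen=100):
    # reuse the shared vocabulary for any new batch of texts
    return sequence.pad_sequences(tok.texts_to_sequences(texts), maxlen=maxlen)

print(texts_to_padded(["a new tweet"]).shape)  # (1, 100)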

def _clean_line(self, text):
    # Strip URLs, @-mentions, hashtags, and the retweet marker,
    # then lowercase and trim surrounding whitespace.
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@[A-Za-z0-9]+', '', text)
    text = re.sub(r'#[A-Za-z0-9]+', '', text)
    text = text.replace('RT', '')
    text = text.lower()
    text = text.strip()
    return text
def _clean_line(self, text): text = re.sub(r"http\S+", "", text) text = re.sub(r"@[A-Za-z0-9]+", "", text) text = re.sub(r"#[A-Za-z0-9]+", "", text) text = text.replace("RT","") text = text.lower() text = text.strip() return text

from bs4 import BeautifulSoup

def clean_text(text):
    '''
    text: a string
    returns: a modified, shorter string
    '''
    # REPLACE_BY_SPACE_RE, BAD_SYMBOLS_RE, and STOPWORDS are assumed to be
    # module-level constants: two compiled regexes and a stop-word set.
    text = BeautifulSoup(text, "html.parser").text  # strip HTML markup
    text = text.lower()                             # convert text to all lowercase
    text = REPLACE_BY_SPACE_RE.sub(' ', text)       # replace REPLACE_BY_SPACE_RE symbols with a space
    text = BAD_SYMBOLS_RE.sub('', text)             # remove BAD_SYMBOLS_RE characters
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)  # delete stopwords
    return text                                     # return the cleaner comment_text
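
# Hypothetical definitions for the constants clean_text expects, plus a usage
# example; the exact patterns in the original codebase may differ.
import re
from nltk.corpus import stopwords

REPLACE_BY_SPACE_RE = re.compile(r"[/(){}\[\]\|@,;]")
BAD_SYMBOLS_RE = re.compile(r"[^0-9a-z #+_]")
STOPWORDS = set(stopwords.words('english'))

print(clean_text("<p>Check out the NEW docs, please!</p>"))
# -> "check new docs please" (with NLTK's English stop-word list)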

def lowercase_text(text):
    text = text.lower()
    return text

# Using the speech_recognition library in Python to convert speech to text
import time

import speech_recognition as sr
from sklearn.preprocessing import MultiLabelBinarizer

r = sr.Recognizer()
try:
    # use the microphone as the source for input
    with sr.Microphone() as source:
        # sample 1 second of background noise so the recognizer can calibrate
        # for a clearer and more effective speech-to-text conversion
        r.adjust_for_ambient_noise(source, duration=1)
        print("Getting an idea of your background noise")
        time.sleep(1.1)
        # listen for the user's input
        print("Speak now")
        audio = r.listen(source)
        # use Google's recognizer on the captured audio
        text = r.recognize_google(audio)
        # MyText = r.recognize_sphinx(audio)
        text = text.lower()
        print(text)
except sr.RequestError as e:
    text = ""
    print("Error")
except sr.UnknownValueError:
    text = ""
    print("unknown error")

test_requests = [text]

tag_encoder = MultiLabelBinarizer()
# tags_encoded = tag_encoder.fit_transform(sentiments_encoded)
# num_tags = len(tags_encoded[0])
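
# A hedged sketch of how the commented-out MultiLabelBinarizer lines above
# might be used: fit on a list of tag lists, then encode them as a binary
# indicator matrix. 'sentiments_encoded' here is an illustrative stand-in,
# not data from the original code.
from sklearn.preprocessing import MultiLabelBinarizer

sentiments_encoded = [["positive"], ["negative", "sarcastic"]]
tag_encoder = MultiLabelBinarizer()
tags_encoded = tag_encoder.fit_transform(sentiments_encoded)
num_tags = len(tags_encoded[0])
print(tag_encoder.classes_, num_tags)  # ['negative' 'positive' 'sarcastic'] 3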