def clean_tweets(tweet_list, class_label):
    """Clean a list of tweets and label the ones that are kept.

    Tweets containing the substring "RT" are skipped. Every other tweet is
    passed through ``clean_text`` and kept only when ``detect_language``
    reports ``'english'`` for the cleaned text.

    Returns a list of ``(cleaned_tweet, class_label)`` tuples, in input order.
    """
    labelled = []
    for raw_tweet in tweet_list:
        # NOTE: plain substring test -- it matches "RT" anywhere in the
        # tweet, not only a leading retweet marker.
        if "RT" in raw_tweet:
            continue
        cleaned = clean_text(raw_tweet)
        if detect_language(cleaned) == 'english':
            labelled.append((cleaned, class_label))
    return labelled
def get_website_description(url, lang1, lang2=None):
    """Fetch ``url`` and extract a short description and its language.

    Args:
        url: Address of the page to download.
        lang1: Preferred language code sent in ``Accept-Language`` and used
            as a fallback language; may be ``None``.
        lang2: Optional second language code appended to ``Accept-Language``
            (only used when ``lang1`` is not ``None``).

    Returns:
        A ``(lang, description)`` tuple. ``description`` comes from the first
        non-empty of: ``<meta name="description">``,
        ``<meta property="og:description">``, ``<title>``. ``lang`` is the
        detected language of the description, else the ``<html lang>``
        attribute, else ``lang1``, else ``'en'``, with any ``_``/``-`` region
        suffix removed. ``(None, None)`` is returned when the request or
        ``raise_for_status()`` raises.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'DNT': '1',
        'Upgrade-Insecure-Requests': '1',
        'Sec-GPC': '1',
        'Cache-Control': 'max-age=0',
    }
    if lang1 is not None:
        lang_list = [lang1]
        if lang2 is not None:
            lang_list.append(lang2)
        headers['Accept-Language'] = f'{",".join(lang_list)};q=0.8'

    # Best effort: any failure to fetch the page yields "no description".
    try:
        response = searx.network.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except Exception:
        return (None, None)

    # Parsing the decoded text can raise ValueError; retry with the raw bytes.
    try:
        html = fromstring(response.text)
    except ValueError:
        html = fromstring(response.content)

    description = extract_text(
        html.xpath('/html/head/meta[@name="description"]/@content'))
    if not description:
        description = extract_text(
            html.xpath('/html/head/meta[@property="og:description"]/@content'))
    if not description:
        description = extract_text(html.xpath('/html/head/title'))

    lang = extract_text(html.xpath('/html/@lang'))
    # Truthiness instead of len(): lang1 is allowed to be None (see the
    # Accept-Language handling above), and len(None) raises TypeError.
    if lang is None and lang1:
        lang = lang1

    # Without any description text there is nothing to detect a language
    # from, so rely on the page/requested language instead.
    detected = detect_language(description) if description else None
    lang = detected or lang or 'en'

    # Reduce locale-style codes such as "en_US" / "en-US" to "en".
    lang = lang.split('_')[0]
    lang = lang.split('-')[0]
    return (lang, description)
def clean_input_text(text):
    """Return the cleaned form of ``text`` if it is English, else ``None``.

    The language check (``detect_language``) runs on the raw input;
    ``clean_text`` is applied only when that check returns ``'english'``.
    """
    return clean_text(text) if detect_language(text) == 'english' else None