Ejemplo n.º 1
0
def clean_tweets(tweet_list, class_label):
    '''
    Build labelled (tweet, class_label) pairs from a list of raw tweets.

    Tweets containing the substring "RT" are skipped. Every other tweet is
    passed through clean_text() and kept only when detect_language()
    reports 'english' for the cleaned text.

    Returns a list of (cleaned_tweet, class_label) tuples, in input order.
    '''
    labelled = []
    for raw in tweet_list:
        # NOTE(review): this is a substring test, so a tweet containing
        # "RT" anywhere (e.g. inside "START") is dropped as well -- confirm
        # that this is the intended retweet filter.
        if "RT" in raw:
            continue

        text = clean_text(raw)
        if detect_language(text) != 'english':
            continue

        labelled.append((text, class_label))

    return labelled
Ejemplo n.º 2
0
def clean_tweets(tweet_list, class_label):
    '''
    Turn raw tweets into a list of (cleaned_tweet, class_label) tuples.

    A tweet is dropped when it contains the substring "RT", or when
    detect_language() does not return 'english' for its cleaned text.
    Cleaning is delegated to clean_text().
    '''
    # Lazy generator: each tweet is cleaned right before its language check,
    # so the helpers are called in the same interleaved order as a plain loop.
    cleaned = (clean_text(raw) for raw in tweet_list if "RT" not in raw)
    return [
        (text, class_label)
        for text in cleaned
        if detect_language(text) == 'english'
    ]
Ejemplo n.º 3
0
def get_website_description(url, lang1, lang2=None):
    '''
    Fetch *url* and extract a short description of the page plus its language.

    url   -- address of the page to download.
    lang1 -- preferred language, sent in the Accept-Language header and used
             as a fallback language; may be None.
    lang2 -- optional second language appended to Accept-Language.

    Returns a (lang, description) tuple. The description is taken from the
    first non-empty source among: <meta name="description">,
    <meta property="og:description">, and <title>. The language is the
    result of detect_language(description) if truthy, otherwise the page's
    <html lang> attribute, otherwise lang1, otherwise 'en'; it is then cut at
    the first '_' and the first '-'.

    Returns (None, None) when the request fails or the response status is
    an error.
    '''
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'DNT': '1',
        'Upgrade-Insecure-Requests': '1',
        'Sec-GPC': '1',
        'Cache-Control': 'max-age=0',
    }
    if lang1 is not None:
        lang_list = [lang1]
        if lang2 is not None:
            lang_list.append(lang2)
        headers['Accept-Language'] = f'{",".join(lang_list)};q=0.8'

    # Best effort: any failure while fetching is reported as "no description".
    try:
        response = searx.network.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except Exception:
        return (None, None)

    # Parsing the decoded text can raise ValueError; retry with the raw bytes.
    # NOTE(review): presumably lxml rejecting str input that carries an
    # encoding declaration -- confirm against the fromstring in use.
    try:
        html = fromstring(response.text)
    except ValueError:
        html = fromstring(response.content)

    description = extract_text(
        html.xpath('/html/head/meta[@name="description"]/@content'))
    if not description:
        description = extract_text(
            html.xpath('/html/head/meta[@property="og:description"]/@content'))
    if not description:
        description = extract_text(html.xpath('/html/head/title'))

    lang = extract_text(html.xpath('/html/@lang'))
    # Fall back to the requested language when the page declares none.
    # Truthiness checks cover both None and '' for the page language, and
    # avoid calling len() on lang1 when the caller passed None.
    if not lang and lang1:
        lang = lang1
    lang = detect_language(description) or lang or 'en'
    # Keep only the primary language subtag ("pt_BR" / "pt-BR" -> "pt").
    lang = lang.split('_')[0]
    lang = lang.split('-')[0]
    return (lang, description)
Ejemplo n.º 4
0
def clean_input_text(text):
    '''
    Return clean_text(text) when detect_language(text) is 'english'.

    Returns None for any other detected language.
    '''
    if detect_language(text) != 'english':
        return None
    return clean_text(text)
Ejemplo n.º 5
0
def clean_input_text(text):
    '''
    Clean *text* with clean_text() if detect_language() reports 'english'.

    Text in any other detected language yields None.
    '''
    is_english = detect_language(text) == 'english'
    return clean_text(text) if is_english else None