Example #1
 def curate_articles(self):
     for i in self.words:
         for t in self.api.search(q=i[0], rpp=10, lang='en'):
             t = t._json
             if t['entities'].get('urls'):
                 try:
                     url = t['entities']['urls'][0]['expanded_url']
                     a = Article(url)
                     a.download()
                     a.parse()
                     if (a.has_top_image() and a.meta_lang == 'en'
                             and self.is_article_url(a.canonical_link)):
                         a.nlp()
                         temp_data = {
                             "url": a.canonical_link,
                             "title": a.title,
                             "image": a.top_img,
                             "description": a.meta_description,
                             "keywords": a.keywords,
                             "summary": a.summary
                         }
                         print("{} | Saving | {}".format(
                             self.screen_name, a.title))
                         k = self.coll.insert_one(temp_data)
                         self.articles.append(temp_data)
                 except Exception as e:
                     print(e)
                     print("Continue...")
Example #2
def cleanup_blog_tweets(tweet_df, num_posts):
    """
    Collects the candidate tweets from Medium users and filters out the
    ones that don't contain a Medium link.

    num_posts: how many posts to extract; this can be scaled up as the
        capacity of the marketing channels improves.
    """
    ## First process the tweet df and remove any tweets that don't have links in them
    link_tweet_inds = []
    for i in range(len(tweet_df)):
        tweet_url_list = tweet_df['urls'].iloc[i]
        
        if len(tweet_url_list) > 0:
            link_tweet_inds.append(i)
    
    link_tweet_inds = list(set(link_tweet_inds))
    link_tweet_df = tweet_df.iloc[link_tweet_inds]
#     print('Number of link tweets -> %s' % len(link_tweet_df))
    
    ## For now we only keep and work with tweets that are in English
    english_tweet_df = link_tweet_df[link_tweet_df['language']=='en']
    english_tweet_df = english_tweet_df.sort_values(by=['nlikes'], ascending=False)
#     print('Number of english tweets -> %s' % len(english_tweet_df))
    
    ## Keep only the top and bottom num_posts/2 tweets by likes; this is all we process for now
    top_english_tweet_df = english_tweet_df.iloc[0:int(num_posts/2)]
    bottom_english_tweet_df = english_tweet_df.iloc[-int(num_posts/2):]
    tweet_df = pd.concat([top_english_tweet_df, bottom_english_tweet_df])
#     print('Number of processing tweets -> %s' % len(tweet_df))
    
    blog_tweet_inds = []
    for i in range(len(tweet_df)):

        tweet_url_list = tweet_df['urls'].iloc[i]
        
        # Process the link to check if it passes the parameters of what a blog post should be
        try:
            article = Article(tweet_url_list[0])
            article.download()
            article.parse()
            top_image = article.has_top_image()
            text_len = len(article.text)

            if top_image and (text_len > 1000):
                blog_tweet_inds.append(i)
                
        except Exception:
            # Skip URLs that fail to download or parse
            pass


    blog_tweet_inds = list(set(blog_tweet_inds))
    blog_tweet_df = tweet_df.iloc[blog_tweet_inds]

    # Sort them by number of likes
    blog_tweet_df = blog_tweet_df.sort_values(by=['nlikes'], ascending=False)
    
    return blog_tweet_df
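A sketch of how cleanup_blog_tweets might be called. The column names ('urls', 'language', 'nlikes') are taken from the function body; the DataFrame below is made-up sample data, and newspaper will fetch each surviving URL over the network when the function runs.

import pandas as pd

# Made-up sample data with the columns the function expects:
# 'urls' holds a list of links per tweet, 'language' an ISO code, 'nlikes' an int.
sample_df = pd.DataFrame({
    'urls': [['https://medium.com/@user/post-1'], [], ['https://example.com/page']],
    'language': ['en', 'en', 'fr'],
    'nlikes': [120, 5, 40],
})

blog_posts = cleanup_blog_tweets(sample_df, num_posts=2)
print(blog_posts[['urls', 'nlikes']])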
Example #3
 def extract_article_from(self, url):
     article = {}
     doc = Article(url)
     try:
         doc.download()
         doc.parse()
     except ArticleException:
         print("Exception getting article from url [{}]".format(url))
         return
     article["image"] = ""
     if doc.has_top_image():
         article["image"] = "<img src={}>".format(doc.top_image)
     article["title"] = doc.title
     article["source_title"] = "notYetSet"
     article["summary"] = article["image"] + doc.text[:300] + " ...</br>"
     article["href"]=url
     return article
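extract_article_from returns None when newspaper raises ArticleException, so callers need a guard before reading the dict. A hedged usage sketch, assuming the method lives on some scraper class and that fetcher below is a hypothetical instance of it:

article = fetcher.extract_article_from("https://example.com/some-post")
if article is None:
    print("Could not extract the article, skipping")
else:
    # 'summary' already embeds the top-image tag when one was found
    print(article["title"], article["href"])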
Example #4
def get_document_json(post):
    """
    Parameters
    -------------
    post: dict
        post data.
    Returns
    -------------
    dict: document data.
    """
    try:
        article = Article(post['url'])
        article.download()
        article.parse()
        article.nlp()
        if article.publish_date is None or isinstance(article.publish_date, str):
            date = None
        else:
            date = article.publish_date.strftime('%Y-%m-%d')
        if article.meta_lang != None and article.meta_lang != '':
            stopwords = safe_get_stop_words(article.meta_lang)
            keywords = [i for i in article.keywords if i not in stopwords]
        else:
            keywords = article.keywords
        keywords = list(set([slugify(i) for i in keywords]))
        json = {
            'title': article.title,
            'authors': article.authors,
            'created_on': date,
            'language': article.meta_lang,
            'keywords': keywords,
            'url': post['url'],
        }
        if article.has_top_image() and post['image'] == MISSING_IMAGE:
            post['image'] = article.top_image
    except ArticleException:
        json = {
            'url': post['url']
        }
    return json
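get_document_json expects a post dict with at least 'url' and 'image' keys, and MISSING_IMAGE is a placeholder constant defined elsewhere in the original project. A hedged usage sketch with made-up values:

# Made-up post record; MISSING_IMAGE is whatever placeholder the project uses.
post = {
    'url': 'https://example.com/some-article',
    'image': MISSING_IMAGE,
}

doc = get_document_json(post)
print(doc.get('title'), doc.get('keywords'))
# If the article had a top image, post['image'] now holds it instead of the placeholder.
print(post['image'])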
Example #5
def article_handler(url=None, nlp=False):
    response = {
        'publish_date': None,
        'html': None,
        'title': None,
        'top_image': None,
        'source_url': None,
        'images': None,
        'authors': None,
        'text': None,
        'canonical_link': None,
        'movies': None,
        'keywords': None,
        'summary': None
    }

    if not url:
        statsd.increment('url_analysis.empty')
        loggly.error("Cannot parse empty URL")
        return response
    ## if
    try:
        article = Article(url)
        if not article.is_downloaded:
            statsd.increment('url_analysis.download')
            loggly.info("Downloading article")
            article.download()
        ##if

        # response['html'] = article.html

        if not article.is_parsed:
            statsd.increment('url_analysis.parse')
            loggly.info("Parsing article")
            article.parse()
        ##if

        response['title'] = article.title

        if article.has_top_image() is True:
            statsd.increment('url_analysis.get_top_image')
            loggly.info("Extracting top_image")
            response['top_image'] = article.top_image
        ##if-else

        if nlp is True:
            statsd.increment('url_analysis.nlp_process')
            loggly.info("Doing NLP processing")
            article.nlp()
            response['summary'] = article.summary
            response['keywords'] = article.keywords
        ##if

        response['movies'] = article.movies
        response['images'] = article.images
        response['authors'] = article.authors
        response['text'] = article.text
        response['publish_date'] = article.publish_date
        response['source_url'] = article.source_url
        response['canonical_link'] = article.canonical_link

        statsd.increment('url_analysis.ok')
        return response
    except Exception as e:
        statsd.increment('url_analysis.error')
        loggly.error(e)
        return response
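article_handler always returns the same response dict, whether or not extraction succeeded, so callers can read its keys unconditionally. A hedged usage sketch (the URL is made up, and statsd/loggly are assumed to be configured at module level):

result = article_handler(url="https://example.com/some-story", nlp=True)

print(result['title'])
print(result['top_image'])   # None if has_top_image() was False
print(result['keywords'])    # populated only when nlp=True
print(result['summary'])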