def curate_articles(self):
    """Search Twitter for each tracked keyword, extract linked articles, and store them."""
    for i in self.words:
        for t in self.api.search(q=i[0], rpp=10, lang='en'):
            t = t._json
            if t['entities'].get('urls'):
                try:
                    url = t['entities']['urls'][0]['expanded_url']
                    a = Article(url)
                    a.download()
                    a.parse()
                    # Keep only English articles that have a top image and a canonical article URL
                    if a.has_top_image() and a.meta_lang == 'en' and self.is_article_url(a.canonical_link):
                        a.nlp()
                        temp_data = {
                            "url": a.canonical_link,
                            "title": a.title,
                            "image": a.top_img,
                            "description": a.meta_description,
                            "keywords": a.keywords,
                            "summary": a.summary
                        }
                        print("{} | Saving | {}".format(self.screen_name, a.title))
                        self.coll.insert_one(temp_data)
                        self.articles.append(temp_data)
                except Exception as e:
                    print(e)
                    print("Continue...")
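# A minimal standalone sketch (not part of the original class) of the newspaper3k
# pipeline that curate_articles() runs per tweet URL: download, parse, filter on
# top image / language, then nlp() for keywords and a summary. The function name
# and example URL below are illustrative only; article.nlp() needs NLTK data.
from newspaper import Article

def extract_article_record(url):
    a = Article(url)
    a.download()
    a.parse()
    if not (a.has_top_image() and a.meta_lang == 'en'):
        return None
    a.nlp()  # populates a.keywords and a.summary
    return {
        "url": a.canonical_link,
        "title": a.title,
        "image": a.top_img,
        "description": a.meta_description,
        "keywords": a.keywords,
        "summary": a.summary,
    }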
def cleanup_blog_tweets(tweet_df, num_posts):
    """
    Gets the potential tweets from Medium users and filters out the ones that
    don't have a Medium link in them.

    num_posts: how many posts we want to extract; we can scale this up as the
    capacity of the marketing channels gets better.
    """
    ## First process the tweet df and remove any tweets that don't have links in them
    link_tweet_inds = []
    for i in range(len(tweet_df)):
        tweet_url_list = tweet_df['urls'].iloc[i]
        if len(tweet_url_list) > 0:
            link_tweet_inds.append(i)
    link_tweet_inds = list(set(link_tweet_inds))
    link_tweet_df = tweet_df.iloc[link_tweet_inds]
    # print('Number of link tweets -> %s' % len(link_tweet_df))

    ## For now we only want to work with tweets that are in English
    english_tweet_df = link_tweet_df[link_tweet_df['language'] == 'en']
    english_tweet_df = english_tweet_df.sort_values(by=['nlikes'], ascending=False)
    # print('Number of english tweets -> %s' % len(english_tweet_df))

    ## Keep only the top and bottom num_posts/2 tweets by likes; this is all we process for now
    top_english_tweet_df = english_tweet_df.iloc[0:int(num_posts / 2)]
    bottom_english_tweet_df = english_tweet_df.iloc[-int(num_posts / 2):]
    tweet_df = pd.concat([top_english_tweet_df, bottom_english_tweet_df])
    # print('Number of processing tweets -> %s' % len(tweet_df))

    blog_tweet_inds = []
    for i in range(len(tweet_df)):
        tweet_url_list = tweet_df['urls'].iloc[i]
        # Process the link to check if it passes the parameters of what a blog post should be
        try:
            article = Article(tweet_url_list[0])
            article.download()
            article.parse()
            top_image = article.has_top_image()
            text_len = len(article.text)
            # A blog post must have a top image and a reasonable amount of text
            if top_image and (text_len > 1000):
                blog_tweet_inds.append(i)
        except Exception:
            pass
    blog_tweet_inds = list(set(blog_tweet_inds))
    blog_tweet_df = tweet_df.iloc[blog_tweet_inds]

    # Sort them by number of likes
    blog_tweet_df = blog_tweet_df.sort_values(by=['nlikes'], ascending=False)
    return blog_tweet_df
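# Usage sketch under assumptions: cleanup_blog_tweets() expects a pandas DataFrame
# with 'urls' (a list of strings per row), 'language', and 'nlikes' columns, the shape
# produced by tweet scrapers such as twint. The toy frame and URLs below are
# illustrative only; newspaper3k and network access are required for the link checks.
import pandas as pd

toy_tweets = pd.DataFrame({
    'urls': [['https://medium.com/@someone/post-a'], ['https://medium.com/@someone/post-b'], []],
    'language': ['en', 'en', 'en'],
    'nlikes': [42, 7, 3],
})
blog_posts = cleanup_blog_tweets(toy_tweets, num_posts=2)
print(blog_posts[['urls', 'nlikes']])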
def extract_article_from(self, url):
    article = {}
    doc = Article(url)
    try:
        doc.download()
        doc.parse()
    except ArticleException:
        print("Exception getting article from url [{}]".format(url))
        return

    # Embed the top image (if any) ahead of the text snippet in the summary HTML
    article["image"] = ""
    if doc.has_top_image():
        article["image"] = '<img src="{}">'.format(doc.top_image)
    article["title"] = doc.title
    article["source_title"] = "notYetSet"
    article["summary"] = article["image"] + doc.text[:300] + " ...<br/>"
    article["href"] = url
    return article
def get_document_json(post):
    """
    Parameters
    -------------
    post: dict
        post data.

    Returns
    -------------
    dict: document data.
    """
    try:
        article = Article(post['url'])
        article.download()
        article.parse()
        article.nlp()

        # Normalise the publish date; newspaper may return None or a raw string
        if article.publish_date is None or isinstance(article.publish_date, str):
            date = None
        else:
            date = article.publish_date.strftime('%Y-%m-%d')

        # Strip stopwords from the extracted keywords when the article language is known
        if article.meta_lang is not None and article.meta_lang != '':
            stopwords = safe_get_stop_words(article.meta_lang)
            keywords = [i for i in article.keywords if i not in stopwords]
        else:
            keywords = article.keywords
        keywords = list(set([slugify(i) for i in keywords]))

        json = {
            'title': article.title,
            'authors': article.authors,
            'created_on': date,
            'language': article.meta_lang,
            'keywords': keywords,
            'url': post['url'],
        }

        # Backfill the post image from the article's top image if it is missing
        if article.has_top_image() and post['image'] == MISSING_IMAGE:
            post['image'] = article.top_image
    except ArticleException:
        json = {
            'url': post['url']
        }
    return json
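# Usage sketch under assumptions: safe_get_stop_words comes from the `stop_words`
# package, slugify from `python-slugify`, and MISSING_IMAGE is a module-level
# sentinel in the original project. The sentinel value and URL below are placeholders.
from newspaper import Article, ArticleException
from slugify import slugify
from stop_words import safe_get_stop_words

MISSING_IMAGE = ''  # placeholder sentinel assumed by get_document_json

post = {'url': 'https://example.com/some-article', 'image': MISSING_IMAGE}
document = get_document_json(post)
print(document.get('title'), document.get('keywords'))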
def article_handler(url=None, nlp=False):
    response = {
        'publish_date': None,
        'html': None,
        'title': None,
        'top_image': None,
        'source_url': None,
        'images': None,
        'authors': None,
        'text': None,
        'canonical_link': None,
        'movies': None,
        'keywords': None,
        'summary': None
    }

    if not url:
        statsd.increment('url_analysis.empty')
        loggly.error("Cannot parse empty URL")
        return response

    try:
        article = Article(url)

        if not article.is_downloaded:
            statsd.increment('url_analysis.download')
            loggly.info("Downloading article")
            article.download()

        # response['html'] = article.html

        if not article.is_parsed:
            statsd.increment('url_analysis.parse')
            loggly.info("Parsing article")
            article.parse()

        response['title'] = article.title

        if article.has_top_image():
            statsd.increment('url_analysis.get_top_image')
            loggly.info("Extracting top_image")
            response['top_image'] = article.top_image

        if nlp:
            statsd.increment('url_analysis.nlp_process')
            loggly.info("Doing NLP processing")
            article.nlp()
            response['summary'] = article.summary
            response['keywords'] = article.keywords

        response['movies'] = article.movies
        response['images'] = article.images
        response['authors'] = article.authors
        response['text'] = article.text
        response['publish_date'] = article.publish_date
        response['source_url'] = article.source_url
        response['canonical_link'] = article.canonical_link

        statsd.increment('url_analysis.ok')
        return response
    except Exception as e:
        statsd.increment('url_analysis.error')
        loggly.error(e)
        return response
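# Usage sketch under assumptions: in the original service `statsd` and `loggly` are
# module-level metrics and logging clients; the stubs below stand in for them so the
# handler can be exercised locally. The URL is a placeholder.
import logging
from types import SimpleNamespace

from newspaper import Article

statsd = SimpleNamespace(increment=lambda metric: None)  # no-op metrics stub
loggly = logging.getLogger('url_analysis')                # plain logger in place of loggly

result = article_handler('https://example.com/some-article', nlp=False)
print(result['title'], result['canonical_link'])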