from newspaper import Article


# Class-level helper (note the `cls` parameter); presumably decorated with
# @classmethod in the original class body.
def __urlImageGenerator(cls, link):
    """Given a link, try to extract its images via the newspaper Article library."""
    try:
        a = Article(url=link)
        a.download()
        a.parse()
        a.fetch_images()
        for img in a.imgs:
            yield img
    except Exception:
        pass
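# A hedged usage sketch: the `cls` parameter suggests this is a @classmethod
# on some scraper class. `Scraper` and `first_image` below are illustrative
# names, not from the original code; the point is that the generator is lazy
# and silently yields nothing on failure, so callers can iterate it directly.
class Scraper:
    @classmethod
    def __urlImageGenerator(cls, link):
        yield from ()  # stand-in for the body shown above

    @classmethod
    def first_image(cls, link):
        # next() with a default avoids StopIteration when no image was found
        return next(cls.__urlImageGenerator(link), None)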
# Module-level imports assumed for this Django view method:
from django.shortcuts import render
from newspaper import Article


def post(self, request, *args, **kwargs):
    url = request.POST.get("url")
    context = {}
    a = Article(url, language='en')
    a.download()
    a.parse()
    context["title"] = a.title
    context["text"] = a.text
    context["authors"] = ", ".join(a.authors)
    context["top_image"] = a.top_image
    a.fetch_images()
    context["images"] = a.images
    context["publish_date"] = a.publish_date
    context["movies"] = a.movies
    a.nlp()
    context["keywords"] = ", ".join(a.keywords)
    context["summary"] = a.summary
    context["url"] = url
    context["method"] = "post"
    return render(request, self.template_name, context)
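# Hedged wiring sketch: post() reads like a method of a Django class-based
# view (it references self.template_name). A minimal, assumed setup; the view
# class, template, and route names are illustrative, not from the source.
from django.urls import path
from django.views.generic import TemplateView


class ArticleScrapeView(TemplateView):
    template_name = "article_detail.html"  # assumed template name
    post = post  # the handler defined above


urlpatterns = [
    path("scrape/", ArticleScrapeView.as_view(), name="article-scrape"),
]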
import json
import time

from newspaper import Article

# first filename will be i+1,
# so i should be the latest number already given to an article
i = 0
with open("links-sz-2020-07-13.txt", "r") as link_file:
    all_lines = link_file.readlines()

for link in all_lines[0:3]:  # for debugging, only run the first three links
    article = Article(link.strip())  # strip the trailing newline from the file
    print(i, ": ", link)
    article.download()
    print("title: ", article.title)
    time.sleep(2)
    article.parse()
    article.nlp()
    article.fetch_images()  # I am not working with images at the moment

    # generate a filename
    i = i + 1
    filename = f'{i:05}'
    # should check if the file already exists ...
    keep = article.meta_data['og']
    keep['authors'] = article.authors
    keep['text-link'] = filename
    keep['images-link'] = list(article.images)
    keep['publish-date'] = str(article.publish_date)
    keep['paper'] = 'sueddeutsche'
    # the original slice link[-1:-9] is always empty; link[-9:-1] (the last
    # eight characters before the newline) is the presumed intent
    keep['id'] = link[-9:-1]
    with open(filename + ".json", "w") as write_file:
        json.dump(keep, write_file)  # completed to match the parallel script below
# article.parse()
# print(article.authors)
# print(article.text)

import json
import time

from newspaper import Article

# first filename will be i+1
i = 65
with open("links-nytimes-2020-07-12-new.txt", "r") as link_file:
    all_lines = link_file.readlines()

for link in all_lines:
    article = Article(link.strip())  # strip the trailing newline from the file
    article.download()
    time.sleep(2)
    article.parse()
    article.nlp()
    article.fetch_images()
    i = i + 1
    filename = f'{i:05}'
    keep = article.meta_data['og']
    keep['authors'] = article.authors
    keep['text-link'] = filename
    keep['images-link'] = list(article.images)
    keep['publish-date'] = str(article.publish_date)
    keep['paper'] = 'nytimes'
    with open(filename + ".json", "w") as write_file:
        json.dump(keep, write_file)
    with open(filename + ".txt", "w") as write_file:
        write_file.write(article.text)  # presumably the article body; the original is cut off here
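# The two download scripts above differ only in the links file, the paper
# label, the starting index, and the debug slice. A hedged consolidation
# sketch; archive_links and its parameters are assumed names:
def archive_links(links_path, paper, start_index=0, delay=2):
    i = start_index
    with open(links_path, "r") as link_file:
        links = [line.strip() for line in link_file if line.strip()]
    for link in links:
        article = Article(link)
        article.download()
        time.sleep(delay)  # stay polite between requests
        article.parse()
        article.nlp()
        i += 1
        filename = f'{i:05}'
        keep = dict(article.meta_data.get('og', {}))
        keep.update({
            'authors': article.authors,
            'text-link': filename,
            'images-link': list(article.images),
            'publish-date': str(article.publish_date),
            'paper': paper,
        })
        with open(filename + ".json", "w") as f:
            json.dump(keep, f)
        with open(filename + ".txt", "w") as f:
            f.write(article.text)


# e.g. archive_links("links-nytimes-2020-07-12-new.txt", "nytimes", start_index=65)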
# Assumed module-level context for this Scrapy callback: import re, requests;
# from newspaper import Article; plus the project's own SocialMediaItem item
# class and IS_LOCAL_SERVER_ENV flag.
def parse_post(self, response):
    title = response.xpath(
        '//*[contains(concat( " ", @class, " " ), concat( " ", "_eYtD2XCVieq6emjKBH3m", " " ))]/text()'
    ).get(default='')
    upvoted = response.xpath(
        '//*[contains(concat( " ", @class, " " ), concat( " ", "t4Hq30BDzTeJ85vREX7_M", " " ))]//span/text()'
    ).get(default='')
    post_timestamp = response.xpath(
        '//*[@data-click-id="timestamp"]/text()'
    ).get(default='')
    post_images = response.xpath(
        '//img[@alt="Post image"]/@src'
    ).getall()
    post_videos = response.xpath(
        '//a[contains(concat(" ",normalize-space(@class)," ")," _13svhQIUZqD9PVzFcLwOKT ")]/@href'
    ).getall()
    post_screenshot = response.xpath(
        '//*[contains(concat(" ",normalize-space(@class)," ")," D3IL3FD0RFy_mkKLPwL4 ")]'
        '//div[contains(concat(" ",normalize-space(@class)," ")," _292iotee39Lmt0MkQZ2hPV ")]'
        '//pre[contains(concat(" ",normalize-space(@class)," ")," _3GnarIQX9tD_qsgXkfSDz1 ")]'
        '//code[contains(concat(" ",normalize-space(@class)," ")," _34q3PgLsx9zIU5BiSOjFoM ")]/text()'
    ).getall()
    post_text = response.xpath(
        '//*[contains(concat(" ",normalize-space(@class)," ")," D3IL3FD0RFy_mkKLPwL4 ")]'
        '//div[contains(concat(" ",normalize-space(@class)," ")," _292iotee39Lmt0MkQZ2hPV ")]'
        '//p[contains(concat(" ",normalize-space(@class)," ")," _1qeIAgB0cPwnLhDF9XSiJM ")]/text()'
    ).getall()
    post_tags = response.xpath(  # collected but currently unused below
        '//*[contains(concat(" ",normalize-space(@class)," ")," D3IL3FD0RFy_mkKLPwL4 ")]'
        '//div[contains(concat(" ",normalize-space(@class)," ")," _292iotee39Lmt0MkQZ2hPV ")]'
        '//p[contains(concat(" ",normalize-space(@class)," ")," _1qeIAgB0cPwnLhDF9XSiJM ")]'
        '//a[contains(concat(" ",normalize-space(@class)," ")," _3t5uN8xUmg0TOwRCOGQEcU ")]/text()'
    ).getall()
    comment_text = response.xpath(
        '//div[contains(concat(" ",normalize-space(@class)," ")," _3cjCphgls6DH-irkVaA0GM ")]'
        '//p[contains(concat(" ",normalize-space(@class)," ")," _1qeIAgB0cPwnLhDF9XSiJM ")]/text()'
    ).getall()
    comment_count = response.xpath(
        '//*[contains(concat( " ", @class, " " ), concat( " ", "FHCV02u6Cp2zYL0fhQPsO", " " ))]/text()'
    ).getall()
    # NOTE: this id is hard-coded to a single comment (t1_fxcpc90), so the
    # selector only matches on that one post.
    comment_timestamp = response.xpath(
        '//*[@id="CommentTopMeta--Created--t1_fxcpc90"]//span/text()'
    ).getall()
    read_more_comments = response.xpath(
        '//*[contains(concat(" ",normalize-space(@class)," ")," _23013peWUhznY89KuYPZKv ")]'
    ).getall()
    # NOTE: this loop re-runs the same selector once per "read more" stub; it
    # never actually expands additional comments.
    for comment in read_more_comments:
        comment_text = response.xpath(
            '//div[contains(concat(" ",normalize-space(@class)," ")," _3cjCphgls6DH-irkVaA0GM ")]'
            '//p[contains(concat(" ",normalize-space(@class)," ")," _1qeIAgB0cPwnLhDF9XSiJM ")]/text()'
        ).getall()
    article = Article(url=response.url)
    # Stub out fetch_images so parse() skips downloading images over the network.
    article.fetch_images = lambda: True
    article.set_html(response.text)
    article.parse()
    # Reddit paths look like /r/<subreddit>/comments/<id>/<slug>/, so after
    # rsplit('/', 5) index 1 is the subreddit and index 3 the post id.
    yield SocialMediaItem(
        url=response.url,
        keyword=requests.utils.urlparse(response.url).path.rsplit('/', 5)[1],
        post_id=requests.utils.urlparse(response.url).path.rsplit('/', 5)[3],
        title=title,
        title_addresses=re.findall(
            r'(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}|'
            r'0x[a-zA-Z0-9]{40}|'
            r'[LM3][a-km-zA-HJ-NP-Z1-9]{26,33}|'
            r'(?:bitcoincash:)?[qp][a-z0-9]{41}|'
            r'(?:BITCOINCASH:)?[QP][A-Z0-9]{41}|'
            r'r[0-9a-zA-Z]{24,34}',
            str(title)),
        post=post_text,
        mentions=' ',
        call_to_action=' ',
        bitcoin_addresses=re.findall(
            r'(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}', str(post_text)),
        ethereum_addresses=re.findall(r'0x[a-zA-Z0-9]{40}', str(post_text)),
        litecoin_addresses=re.findall(
            r'[LM3][a-km-zA-HJ-NP-Z1-9]{26,33}', str(post_text)),
        bitcoincash_addresses=re.findall(
            r'((bitcoincash:)?(q|p)[a-z0-9]{41})|'
            r'((BITCOINCASH:)?(Q|P)[A-Z0-9]{41})',
            str(post_text)),
        ripple_addresses=re.findall(r'r[0-9a-zA-Z]{24,34}', str(post_text)),
        author_username='',
        author_fullname='',
        post_sentiment={
            'likes': upvoted,
            'code_snippet': post_screenshot,
            'comment_count': comment_count,
        },
        post_timestamp=post_timestamp,
        post_images=post_images,
        post_videos=post_videos,
        comments=[dict(
            comment_id='',
            comment_text=comment_text,
            mentions=' ',
            call_to_action=' ',
            btc_addresses=re.findall(
                r'(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}', str(comment_text)),
            eth_addresses=re.findall(r'0x[a-zA-Z0-9]{40}', str(comment_text)),
            ltc_addresses=re.findall(
                r'[LM3][a-km-zA-HJ-NP-Z1-9]{26,33}', str(comment_text)),
            bch_addresses=re.findall(
                r'((?:bitcoincash:)?(q|p)[a-z0-9]{41})|'
                r'((?:BITCOINCASH:)?(Q|P)[A-Z0-9]{41})',
                str(comment_text)),
            xrp_addresses=re.findall(r'r[0-9a-zA-Z]{24,34}', str(comment_text)),
            sentiment={'retweets': '', 'likes': ''},
            timestamp=comment_timestamp,
            images='',
            videos='',
            author={'username': '', 'fullname': ''},
        )],
        network='clearweb',
        source='reddit',
        html='' if IS_LOCAL_SERVER_ENV else article.html,
        images=article.images,
        tags=list(article.tags),
        movies=article.movies,
        meta_description=article.meta_description,
        meta_keywords=article.meta_keywords,
        meta_lang=article.meta_lang,
    )
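# The crypto-address regexes in parse_post are repeated inline many times. A
# hedged consolidation sketch: compile each pattern once at module level and
# look them up by name (ADDRESS_PATTERNS and find_addresses are assumed names).
import re

ADDRESS_PATTERNS = {
    'btc': re.compile(r'(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}'),
    'eth': re.compile(r'0x[a-zA-Z0-9]{40}'),
    'ltc': re.compile(r'[LM3][a-km-zA-HJ-NP-Z1-9]{26,33}'),
    'bch': re.compile(r'(?:bitcoincash:)?[qp][a-z0-9]{41}|'
                      r'(?:BITCOINCASH:)?[QP][A-Z0-9]{41}'),
    'xrp': re.compile(r'r[0-9a-zA-Z]{24,34}'),
}


def find_addresses(text):
    """Return all address matches per currency for the given text."""
    return {name: pattern.findall(str(text))
            for name, pattern in ADDRESS_PATTERNS.items()}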
# Assumed module-level imports: import re; import pandas as pd;
# from newspaper import Article; from sqlalchemy.types import VARCHAR.
def scrape_reddit(reddit, engine, limit_, yest):
    try:
        i = 0
        for submission in reddit.subreddit('news').hot(limit=limit_):
            if submission.created > yest:
                query_comments = '''SELECT EXISTS(SELECT * FROM MemeNews.every_comment WHERE post_id LIKE '{0}' LIMIT 1)'''.format(submission.id)
                query_articles = '''SELECT EXISTS(SELECT * FROM MemeNews.Daily_Articles WHERE id LIKE '{0}' LIMIT 1)'''.format(submission.id)
                # skip submissions whose article is already stored
                if engine.execute(query_articles).fetchone()[0]:
                    continue
                submission.comment_sort = 'best'
                article = Article(submission.url)
                try:
                    article.download()
                    article.parse()
                    article.nlp()
                    article.fetch_images()
                except Exception:
                    continue
                articles_dict = {
                    "title": re.sub(r'[^\x00-\x7F]', '', submission.title.replace('"', "'")),
                    "score": submission.score,
                    "id": submission.id,
                    "url": submission.url,
                    "comms_num": submission.num_comments,
                    "created": submission.created,
                    "body": re.sub(r'[^\x00-\x7F]', '', article.text.replace('"', "'")),
                    "image": article.top_image,
                    "keywords": ', '.join(article.keywords).replace('"', "'"),
                    "summary": re.sub(r'[^\x00-\x7F]', '', article.summary.replace('"', "'")),
                }
                # add articles
                articles_data = pd.DataFrame(articles_dict, index=[i])
                articles_data.to_sql('Daily_Articles', con=engine,
                                     if_exists='append', dtype={'None': VARCHAR(5)})
                print("article added with url: ", submission.url)
                # skip submissions whose comments are already stored
                if engine.execute(query_comments).fetchone()[0]:
                    continue
                comment_dict = {
                    "post_id": [], 'post_title': [], "id": [], "author": [],
                    "body": [], "created": [], 'score': [], 'is_submitter': [],
                    'parent_id': []
                }
                for top_level_comment in submission.comments.list()[:100]:
                    try:
                        comment_dict['is_submitter'].append(top_level_comment.is_submitter)
                        comment_dict['post_id'].append(submission.id)
                        comment_dict['id'].append(top_level_comment.id)
                        comment_dict['author'].append(top_level_comment.author)
                        comment_dict['body'].append(
                            re.sub(r'[^\x00-\x7F]', '', top_level_comment.body))
                        comment_dict['score'].append(top_level_comment.score)
                        comment_dict['created'].append(top_level_comment.created_utc)
                        comment_dict['parent_id'].append(top_level_comment.parent_id)
                        comment_dict['post_title'].append(submission.title)
                    except Exception:
                        continue
                comment_data = pd.DataFrame(comment_dict)
                comment_data.to_sql('every_comment', con=engine,
                                    if_exists='append', dtype={'None': VARCHAR(5)})
                print("comments added")
                i += 1
        return 1
    # the original `except err:` would raise a NameError; bind the exception instead
    except Exception as err:
        print(err)
        return 0
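# Hedged invocation sketch for scrape_reddit(): the credentials, connection
# string, and "yesterday" cutoff below are illustrative assumptions.
import praw
from datetime import datetime, timedelta
from sqlalchemy import create_engine

reddit = praw.Reddit(client_id="...", client_secret="...",
                     user_agent="memenews-scraper")  # placeholder credentials
engine = create_engine("mysql+pymysql://user:password@localhost/MemeNews")
# submission.created is a Unix timestamp, so compare against one
yest = (datetime.utcnow() - timedelta(days=1)).timestamp()

scrape_reddit(reddit, engine, limit_=50, yest=yest)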
# Assumed module-level context for this Scrapy callback: import re, logging,
# requests; from newspaper import Article; plus the project's own
# SocialMediaItem item class and IS_LOCAL_SERVER_ENV flag.
def parse_comment(self, response):
    logging.info("Processing --> " + response.url)
    comment_url = response.request.url
    comment_username = response.xpath(
        '//*[@class="permalink-inner permalink-tweet-container"]'
        '//*[@class="username u-dir u-textTruncate"]/b/text()'
    ).get(default="")
    comment_full_name = response.xpath(
        '//*[@class="permalink-inner permalink-tweet-container"]'
        '//*[@class="FullNameGroup"]/strong/text()'
    ).get(default="")
    try:
        comment_text = (
            response.xpath("//title/text()").get(default="").split(":")[1].strip()
        )
    except IndexError:
        comment_text = " ".join(
            response.xpath(
                '//*[contains(@class,"permalink-inner permalink-tweet-container")]'
                '//*[@class="js-tweet-text-container"]/p//text()'
            ).getall()
        ).strip()
    comment_image_list = response.xpath(
        '//*[contains(@class,"permalink-inner permalink-tweet-container")]'
        '//*[@class="AdaptiveMediaOuterContainer"]//img/@src'
    ).getall()
    comment_video = response.xpath(
        '//*[contains(concat(" ",normalize-space(@class)," ")," js-tweet-text-container ")]'
        '//p[contains(concat(" ",normalize-space(@class)," ")," TweetTextSize--jumbo ")]'
        '//a[contains(concat(" ",normalize-space(@class)," ")," twitter-timeline-link ")]/@href'
    ).get(default="")
    comment_date_time = response.xpath(
        '//*[contains(@class,"permalink-inner permalink-tweet-container")]'
        '//*[@class="js-tweet-details-fixer tweet-details-fixer"]'
        '/div[@class="client-and-actions"]/span[@class="metadata"]/span/text()'
    ).get(default="")
    comment_retweets = response.xpath(
        '//*[contains(@class,"permalink-inner permalink-tweet-container")]'
        '//*[@class="js-tweet-details-fixer tweet-details-fixer"]'
        '/div[@class="js-tweet-stats-container tweet-stats-container"]'
        '//*[@class="js-stat-count js-stat-retweets stat-count"]/a/strong/text()'
    ).get(default="")
    comment_likes = response.xpath(
        '//*[contains(@class,"permalink-inner permalink-tweet-container")]'
        '//*[@class="js-tweet-details-fixer tweet-details-fixer"]'
        '/div[@class="js-tweet-stats-container tweet-stats-container"]'
        '//*[@class="js-stat-count js-stat-favorites stat-count"]/a/strong/text()'
    ).get(default="")
    comment_call_to_action = re.findall(r"(?P<url>https?://[^\s]+)", comment_text)
    comment_mentions = re.findall(r"(^|[^@\w])@(\w{1,15})", comment_text)
    if len(comment_mentions) != 0:
        comment_mentions = [i[1] for i in comment_mentions]
    username = response.meta["username"]
    full_name = response.meta["full_name"]
    tweet_text = response.meta["tweet_text"]
    tweet_time = response.meta["tweet_time"]
    likes = response.meta["number_of_likes"]
    retweets = response.meta["no_of_retweets"]
    replies = response.meta["no_of_replies"]
    image_url = response.meta["image_url"]
    post_video = response.meta["post_video"]
    current_url = response.meta["current_url"]
    call_to_action = response.meta["call_to_action"]
    mentions = response.meta["mentions"]
    article = Article(url=response.url)
    # Stub out fetch_images so parse() skips downloading images over the network.
    article.fetch_images = lambda: True
    article.set_html(response.text)
    article.parse()
    # Tweet permalinks look like /<user>/status/<id>, so after rsplit("/", 2)
    # the final path segment is the tweet id.
    yield SocialMediaItem(
        url=response.url,
        keyword=requests.utils.urlparse(current_url).path.rsplit("/", 2)[1],
        post_id=requests.utils.urlparse(response.url).path.rsplit("/", 2)[2],
        title="",
        title_addresses=re.findall(
            r"(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}|"
            r"0x[a-zA-Z0-9]{40}|"
            r"[LM3][a-km-zA-HJ-NP-Z1-9]{26,33}|"
            r"(?:bitcoincash:)?[qp][a-z0-9]{41}|"
            r"(?:BITCOINCASH:)?[QP][A-Z0-9]{41}|"
            r"r[0-9a-zA-Z]{24,34}",
            str(call_to_action),
        ),
        post=tweet_text,
        mentions=" ".join(mentions),
        call_to_action=" ".join(call_to_action),
        bitcoin_addresses=re.findall(
            r"(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}", str(tweet_text)
        ),
        ethereum_addresses=re.findall(r"0x[a-zA-Z0-9]{40}", str(tweet_text)),
        litecoin_addresses=re.findall(
            r"[LM3][a-km-zA-HJ-NP-Z1-9]{26,33}", str(tweet_text)
        ),
        bitcoincash_addresses=re.findall(
            r"((bitcoincash:)?(q|p)[a-z0-9]{41})|((BITCOINCASH:)?(Q|P)[A-Z0-9]{41})",
            str(tweet_text),
        ),
        ripple_addresses=re.findall(r"r[0-9a-zA-Z]{24,34}", str(tweet_text)),
        total_addresses=re.findall(
            r"(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}|"
            r"0x[a-zA-Z0-9]{40}|[LM3][a-km-zA-HJ-NP-Z1-9]{26,33}|"
            r"(?:bitcoincash:)?[qp][a-z0-9]{41}|"
            r"(?:BITCOINCASH:)?[QP][A-Z0-9]{41}|"
            r"r[0-9a-zA-Z]{24,34}",
            str(tweet_text),
        ),
        author_username=username,
        author_fullname=full_name,
        post_sentiment={
            "likes": str(likes),
            "retweets": str(retweets),
            "code_snippet": "",
            "comment_count": str(replies),
        },
        post_timestamp=str(tweet_time),
        post_images=image_url,
        post_videos=post_video,
        comments=[
            dict(
                comment_id=requests.utils.urlparse(comment_url).path.rsplit("/", 2)[2],
                comment_text=comment_text,
                mentions=" ".join(comment_mentions),
                call_to_action=" ".join(comment_call_to_action),
                cta_addresses=re.findall(
                    r"(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}|"
                    r"0x[a-zA-Z0-9]{40}|"
                    r"[LM3][a-km-zA-HJ-NP-Z1-9]{26,33}|"
                    r"(?:bitcoincash:)?[qp][a-z0-9]{41}|"
                    r"(?:BITCOINCASH:)?[QP][A-Z0-9]{41}|"
                    r"r[0-9a-zA-Z]{24,34}",
                    str(comment_call_to_action),
                ),
                mentions_addresses=re.findall(
                    r"(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}|"
                    r"0x[a-zA-Z0-9]{40}|"
                    r"[LM3][a-km-zA-HJ-NP-Z1-9]{26,33}|"
                    r"(?:bitcoincash:)?[qp][a-z0-9]{41}|"
                    r"(?:BITCOINCASH:)?[QP][A-Z0-9]{41}|"
                    r"r[0-9a-zA-Z]{24,34}",
                    str(mentions),
                ),
                btc_addresses=re.findall(
                    r"(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}", str(comment_text)
                ),
                eth_addresses=re.findall(r"0x[a-zA-Z0-9]{40}", str(comment_text)),
                ltc_addresses=re.findall(
                    r"[LM3][a-km-zA-HJ-NP-Z1-9]{26,33}", str(comment_text)
                ),
                bch_addresses=re.findall(
                    r"((?:bitcoincash:)?(q|p)[a-z0-9]{41})|"
                    r"((?:BITCOINCASH:)?(Q|P)[A-Z0-9]{41})",
                    str(comment_text),
                ),
                xrp_addresses=re.findall(r"r[0-9a-zA-Z]{24,34}", str(comment_text)),
                total_addresses=re.findall(
                    r"(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}|"
                    r"0x[a-zA-Z0-9]{40}|"
                    r"[LM3][a-km-zA-HJ-NP-Z1-9]{26,33}|"
                    r"(?:bitcoincash:)?[qp][a-z0-9]{41}|"
                    r"(?:BITCOINCASH:)?[QP][A-Z0-9]{41}|"
                    r"r[0-9a-zA-Z]{24,34}",
                    str(comment_text),
                ),
                sentiment={
                    "retweets": str(comment_retweets),
                    "likes": str(comment_likes),
                },
                timestamp=str(comment_date_time),
                images=comment_image_list,
                videos=comment_video,
                author={
                    "username": comment_username,
                    "fullname": comment_full_name,
                },
            )
        ],
        network="clearweb",
        source="twitter",
        html="" if IS_LOCAL_SERVER_ENV else article.html,
        images=article.images,
        tags=list(article.tags),
        movies=article.movies,
        meta_description=article.meta_description,
        meta_keywords=article.meta_keywords,
        meta_lang=article.meta_lang,
    )