def __urlImageGenerator(cls, link):
        """ given a link, try to get images from it by the Article Library
        """

        try:
            a = Article(url=link)
            a.download()
            a.parse()
            a.fetch_images()

            for img in a.imgs:
                yield img
        except Exception:
            pass
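A note on the explicit fetch_images() calls in these examples: with newspaper3k's default configuration, parse() already fetches images on its own (which is also why the Scrapy spiders further down replace article.fetch_images with a no-op lambda before calling parse()), so the extra call is usually redundant. If you want explicit control, the usual knob is the library's Config object; a minimal sketch, assuming newspaper3k:

from newspaper import Article, Config

config = Config()
config.fetch_images = False  # skip image fetching entirely during parse()
a = Article("https://example.com/some-article", config=config)  # hypothetical URL
a.download()
a.parse()  # with this config, no image requests are made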
Example #2
 def post(self, request, *args, **kwargs):
     url = request.POST.get("url")
     context = {}
     a = Article(url, language='en')
     a.download()
     a.parse()
     context["title"] = a.title
     context["text"] = a.text
     context["authors"] = ", ".join(a.authors)
     context["top_image"] = a.top_image
     a.fetch_images()
     context["images"] = a.images
     context["publish_date"] = a.publish_date
     context["movies"] = a.movies
     a.nlp()
     context["keywords"] = ", ".join(a.keywords)
     context["summary"] = a.summary
     context["url"] = url
     context["method"] = "post"
     return render(request, self.template_name, context)
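The nlp() step here (and in the link-scraping scripts below) depends on NLTK's punkt tokenizer, which newspaper3k uses for keyword and summary extraction. If nlp() raises an NLTK LookupError, a one-time download along these lines usually fixes it (a hedged sketch):

import nltk
nltk.download('punkt')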
Example #3
# first filename will be i+1
# so i should be the latest number given to an article
i = 0


with open("links-sz-2020-07-13.txt" , "r") as link_file :
	all_lines = link_file.readlines()
	for link in all_lines[0:3]: # for debugging, only run the first three links
		article = Article(link)
		print(i , ": ", link)
		article.download()
		print("title: " , article.title)
		time.sleep(2)
		article.parse()
		article.nlp()
		article.fetch_images() # I am not working with images at the moment

		## generate a filename
		i=i+1
		filename = f'{i:05}'
			# should check, if file exists ...

		keep = article.meta_data['og']
		keep['authors'] = article.authors
		keep['text-link'] = filename
		keep['images-link'] = list(article.images)
		keep['publish-date'] = str(article.publish_date)
		keep['paper'] = 'sueddeutsche'
		keep['id'] = link[-1:-9]

		with open(filename + ".json", "w") as write_file:
Example #4
#article.parse()
#print(article.authors)
#print(article.text)

# first filename will be i+1
i = 65

with open("links-nytimes-2020-07-12-new.txt", "r") as link_file:
    all_lines = link_file.readlines()
    for link in all_lines:
        link = link.strip()  # drop the trailing newline that readlines() keeps
        article = Article(link)
        article.download()
        time.sleep(2)
        article.parse()
        article.nlp()
        article.fetch_images()

        i = i + 1
        filename = f'{i:05}'

        keep = article.meta_data['og']
        keep['authors'] = article.authors
        keep['text-link'] = filename
        keep['images-link'] = list(article.images)
        keep['publish-date'] = str(article.publish_date)
        keep['paper'] = 'nytimes'

        with open(filename + ".json", "w") as write_file:
            json.dump(keep, write_file)

        with open(filename + ".txt", "w") as write_file:
 def parse_post(self, response):
     title = response.xpath(
         '//*[contains(concat( " ", @class, " " ), concat( " ", "_eYtD2XCVieq6emjKBH3m", " " ))]/text()'
         ).get(default='')
     upvoted = response.xpath(
         '//*[contains(concat( " ", @class, " " ), concat( " ", "t4Hq30BDzTeJ85vREX7_M", " " ))]//span/text()'
         ).get(default='') 
     post_timestamp = response.xpath(
         '//*[@data-click-id="timestamp"]/text()'
         ).get(default='')
     post_images = response.xpath(
         '//img[@alt="Post image"]/@src'
         ).getall()
     post_videos = response.xpath(
         '//a[contains(concat(" ",normalize-space(@class)," ")," _13svhQIUZqD9PVzFcLwOKT ")]/@href'
         ).getall()
     post_screenshot = response.xpath(
         '//*[contains(concat(" ",normalize-space(@class)," ")," D3IL3FD0RFy_mkKLPwL4 ")]'
         '//div[contains(concat(" ",normalize-space(@class)," ")," _292iotee39Lmt0MkQZ2hPV ")]'
         '//pre[contains(concat(" ",normalize-space(@class)," ")," _3GnarIQX9tD_qsgXkfSDz1 ")]'
         '//code[contains(concat(" ",normalize-space(@class)," ")," _34q3PgLsx9zIU5BiSOjFoM ")]/text()'
         ).getall()  
     post_text = response.xpath(
         '//*[contains(concat(" ",normalize-space(@class)," ")," D3IL3FD0RFy_mkKLPwL4 ")]'
         '//div[contains(concat(" ",normalize-space(@class)," ")," _292iotee39Lmt0MkQZ2hPV ")]'
         '//p[contains(concat(" ",normalize-space(@class)," ")," _1qeIAgB0cPwnLhDF9XSiJM ")]/text()').getall()
     post_tags = response.xpath(
         '//*[contains(concat(" ",normalize-space(@class)," ")," D3IL3FD0RFy_mkKLPwL4 ")]'
         '//div[contains(concat(" ",normalize-space(@class)," ")," _292iotee39Lmt0MkQZ2hPV ")]'
         '//p[contains(concat(" ",normalize-space(@class)," ")," _1qeIAgB0cPwnLhDF9XSiJM ")]'
         '//a[contains(concat(" ",normalize-space(@class)," ")," _3t5uN8xUmg0TOwRCOGQEcU ")]/text()').getall()  
     comment_text = response.xpath(
         '//div[contains(concat(" ",normalize-space(@class)," ")," _3cjCphgls6DH-irkVaA0GM ")]'
         '//p[contains(concat(" ",normalize-space(@class)," ")," _1qeIAgB0cPwnLhDF9XSiJM ")]/text()').getall()
     comment_count = response.xpath(
         '//*[contains(concat( " ", @class, " " ), concat( " ", "FHCV02u6Cp2zYL0fhQPsO", " " ))]/text()').getall()
     comment_timestamp = response.xpath(
         '//*[@id="CommentTopMeta--Created--t1_fxcpc90"]//span/text()').getall()
     read_more_comments = response.xpath(
         '//*[contains(concat(" ",normalize-space(@class)," ")," _23013peWUhznY89KuYPZKv ")]').getall()
     for comment in read_more_comments:
         comment_text = response.xpath(
             '//div[contains(concat(" ",normalize-space(@class)," ")," _3cjCphgls6DH-irkVaA0GM ")]'
             '//p[contains(concat(" ",normalize-space(@class)," ")," _1qeIAgB0cPwnLhDF9XSiJM ")]/text()').getall()
     article = Article(url=response.url)
     article.fetch_images = lambda: True
     article.set_html(response.text)
     article.parse()
 
     yield SocialMediaItem(
             url=response.url,
             keyword=requests.utils.urlparse(response.url).path.rsplit('/', 5)[1],
             post_id=requests.utils.urlparse(response.url).path.rsplit('/', 5)[3],
             title=title,
             title_addresses=re.findall(r'(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}|'
                 '0x[a-zA-Z0-9]{40}|'
                 '[LM3][a-km-zA-HJ-NP-Z1-9]{26,33}|'
                 '(?:bitcoincash\:)?[qp][a-z0-9]{41}|'
                 '(?:BITCOINCASH\:)?[QP][A-Z0-9]{41}|r[0-9a-zA-Z]{24,34}', str(title)),
             post=post_text,
             mentions=' ',
             call_to_action=' ',
             bitcoin_addresses=re.findall(r'(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}', str(post_text)),
             ethereum_addresses=re.findall(r'0x[a-zA-Z0-9]{40}', str(post_text)),
             litecoin_addresses=re.findall(r'[LM3][a-km-zA-HJ-NP-Z1-9]{26,33}', str(post_text)),
             bitcoincash_addresses=re.findall(r'((bitcoincash:)?(q|p)[a-z0-9]{41})|'
                 '((BITCOINCASH:)?(Q|P)[A-Z0-9]{41})', str(post_text)),
             ripple_addresses = re.findall(r'r[0-9a-zA-Z]{24,34}', str(post_text)),
             author_username='',
             author_fullname='',
             post_sentiment={
                                 'likes':upvoted,
                                 'code_snippet':post_screenshot,
                                 'comment_count': comment_count,
                             },
             post_timestamp=post_timestamp,
             post_images=post_images,
             post_videos=post_videos,
             comments=[dict(
                         comment_id='',
                         comment_text=comment_text,
                         mentions=' ',
                         call_to_action=' ',
                         btc_addresses=re.findall(r'(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}', str(comment_text)),
                         eth_addresses=re.findall(r'0x[a-zA-Z0-9]{40}', str(comment_text)),
                         ltc_addresses=re.findall(r'[LM3][a-km-zA-HJ-NP-Z1-9]{26,33}', str(comment_text)),
                         bch_addresses=re.findall(r'((?:bitcoincash:)?(q|p)[a-z0-9]{41})|'
                             '((?:BITCOINCASH:)?(Q|P)[A-Z0-9]{41})', str(comment_text)),
                         xrp_addresses = re.findall(r'r[0-9a-zA-Z]{24,34}', str(comment_text)),
                         sentiment={'retweets': '', 'likes': ''},
                         timestamp=comment_timestamp,
                         images='',
                         videos='',
                         author={'username': '', 'fullname': ''}
                         )],   
             network='clearweb',
             source='reddit',
             html='' if IS_LOCAL_SERVER_ENV else article.html,
             images=article.images,
             tags=list(article.tags),
             movies=article.movies,
             meta_description=article.meta_description,
             meta_keywords=article.meta_keywords,
             meta_lang=article.meta_lang,
         )
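The repeated contains(concat(" ", normalize-space(@class), " "), " token ") construct in these selectors is the standard XPath 1.0 idiom for matching one class token exactly, without also catching longer class names that merely start with it. A self-contained sketch with Scrapy's Selector (the HTML and the class name are made up):

from scrapy.selector import Selector

html = '<div class="post-body highlighted">hello</div><div class="post-body-extra">nope</div>'
sel = Selector(text=html)
# Matches the element whose class list contains exactly "post-body",
# but not the element whose class only starts with that prefix.
print(sel.xpath(
    '//*[contains(concat(" ", normalize-space(@class), " "), " post-body ")]/text()'
).getall())  # ['hello']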
Example #6
def scrape_reddit(reddit, engine, limit_, yest):
    try:
        i = 0
        for submission in reddit.subreddit('news').hot(limit=limit_):
            if (submission.created > yest):
                query_comments = '''SELECT EXISTS(SELECT * FROM MemeNews.every_comment  WHERE post_id LIKE '{0}' LIMIT 1)'''.format(
                    submission.id)
                query_articles = '''SELECT EXISTS(SELECT * FROM MemeNews.Daily_Articles  WHERE id LIKE '{0}' LIMIT 1)'''.format(
                    submission.id)
                if (engine.execute(query_articles).fetchone()[0]):
                    continue
                submission.comment_sort = 'best'
                article = Article(submission.url)
                try:
                    article.download()
                    article.parse()
                    article.nlp()
                    article.fetch_images()
                except Exception:
                    continue
                articles_dict = {
                    "title":
                    re.sub(r'[^\x00-\x7F]', '',
                           submission.title.replace('"', "'")),
                    "score":
                    submission.score,
                    "id":
                    submission.id,
                    "url":
                    submission.url,
                    "comms_num":
                    submission.num_comments,
                    "created":
                    submission.created,
                    "body":
                    re.sub(r'[^\x00-\x7F]', '', article.text.replace('"',
                                                                     "'")),
                    "image":
                    article.top_image,
                    "keywords":
                    ', '.join(article.keywords).replace('"', "'"),
                    "summary":
                    re.sub(r'[^\x00-\x7F]', '',
                           article.summary.replace('"', "'"))
                }
                #add articles
                articles_data = pd.DataFrame(articles_dict, index=[i])
                articles_data.to_sql('Daily_Articles',
                                     con=engine,
                                     if_exists='append',
                                     dtype={'None': VARCHAR(5)})
                print("article added with url: ", submission.url)
                if (engine.execute(query_comments).fetchone()[0]):
                    continue
                comment_dict = {
                    "post_id": [],
                    'post_title': [],
                    "id": [],
                    "author": [],
                    "body": [],
                    "created": [],
                    'score': [],
                    'is_submitter': [],
                    'parent_id': []
                }
                for top_level_comment in submission.comments.list()[:100]:
                    try:
                        comment_dict['is_submitter'].append(
                            top_level_comment.is_submitter)
                        comment_dict['post_id'].append(submission.id)
                        comment_dict['id'].append(top_level_comment.id)
                        comment_dict['author'].append(top_level_comment.author)
                        comment_dict['body'].append(
                            re.sub(r'[^\x00-\x7F]', '',
                                   top_level_comment.body))
                        comment_dict['score'].append(top_level_comment.score)
                        comment_dict['created'].append(
                            top_level_comment.created_utc)
                        comment_dict['parent_id'].append(
                            top_level_comment.parent_id)
                        comment_dict['post_title'].append(submission.title)
                    except Exception:
                        continue
                comment_data = pd.DataFrame(comment_dict)
                comment_data.to_sql('every_comment',
                                    con=engine,
                                    if_exists='append',
                                    dtype={'None': VARCHAR(5)})
                print("comments added")
                i += 1
        return 1
    except Exception as err:
        print(err)
        return 0
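One caveat in this snippet: the EXISTS queries are assembled with str.format, which works here because submission.id is a short Reddit-generated token, but bound parameters are the safer habit with SQLAlchemy. A hedged sketch of the same check with a parameter, keeping the 1.x-style engine.execute used above (table and column names are taken from the code):

from sqlalchemy import text

def already_stored(engine, post_id):
    # Same EXISTS check as above, but with a bound parameter instead of str.format.
    query = text(
        "SELECT EXISTS(SELECT * FROM MemeNews.Daily_Articles WHERE id LIKE :post_id LIMIT 1)")
    return bool(engine.execute(query, {"post_id": post_id}).fetchone()[0])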
Example #7
    def parse_comment(self, response):
        comment_url = response.request.url
        logging.info("Processing --> " + response.url)
        comment_username = response.xpath(
            '//*[@class="permalink-inner permalink-tweet-container"]'
            '//*[@class="username u-dir u-textTruncate"]/b/text()'
        ).get(default="")
        comment_full_name = response.xpath(
            '//*[@class="permalink-inner permalink-tweet-container"]'
            '//*[@class="FullNameGroup"]/strong/text()'
        ).get(default="")
        try:
            comment_text = (
                response.xpath("//title/text()").get(default="").split(":")[1].strip()
            )
        except Exception:
            comment_text = " ".join(
                response.xpath(
                    '//*[contains(@class,"permalink-inner permalink-tweet-container")]'
                    '//*[@class="js-tweet-text-container"]/p//text()'
                ).getall()
            ).strip()
        comment_image_list = response.xpath(
            '//*[contains(@class,"permalink-inner permalink-tweet-container")]'
            '//*[@class="AdaptiveMediaOuterContainer"]//img/@src'
        ).getall()
        comment_video = response.xpath(
            '//*[contains(concat(" ",normalize-space(@class)," ")," js-tweet-text-container ")]'
            '//p[contains(concat(" ",normalize-space(@class)," ")," TweetTextSize--jumbo ")]'
            '//a[contains(concat(" ",normalize-space(@class)," ")," twitter-timeline-link ")]/@href'
        ).get(default="")
        comment_date_time = response.xpath(
            '//*[contains(@class,"permalink-inner permalink-tweet-container")]'
            '//*[@class="js-tweet-details-fixer tweet-details-fixer"]'
            '/div[@class="client-and-actions"]/span[@class="metadata"]/span/text()'
        ).get(default="")
        comment_retweets = response.xpath(
            '//*[contains(@class,"permalink-inner permalink-tweet-container")]'
            '//*[@class="js-tweet-details-fixer tweet-details-fixer"]'
            '/div[@class="js-tweet-stats-container tweet-stats-container"]'
            '//*[@class="js-stat-count js-stat-retweets stat-count"]/a/strong/text()'
        ).get(default="")
        comment_likes = response.xpath(
            '//*[contains(@class,"permalink-inner permalink-tweet-container")]'
            '//*[@class="js-tweet-details-fixer tweet-details-fixer"]'
            '/div[@class="js-tweet-stats-container tweet-stats-container"]'
            '//*[@class="js-stat-count js-stat-favorites stat-count"]/a/strong/text()'
        ).get(default="")
        comment_call_to_action = re.findall(r"(?P<url>https?://[^\s]+)", comment_text)
        comment_mentions = re.findall("(^|[^@\w])@(\w{1,15})", comment_text)
        if len(comment_mentions) != 0:
            comment_mentions = [i[1] for i in comment_mentions]

        username = response.meta["username"]
        full_name = response.meta["full_name"]
        tweet_text = response.meta["tweet_text"]
        tweet_time = response.meta["tweet_time"]
        likes = response.meta["number_of_likes"]
        retweets = response.meta["no_of_retweets"]
        replies = response.meta["no_of_replies"]
        image_url = response.meta["image_url"]
        post_video = response.meta["post_video"]
        current_url = response.meta["current_url"]
        call_to_action = response.meta["call_to_action"]
        mentions = response.meta["mentions"]

        article = Article(url=response.url)
        article.fetch_images = lambda: True
        article.set_html(response.text)
        article.parse()

        yield SocialMediaItem(
            url=response.url,
            keyword=requests.utils.urlparse(current_url).path.rsplit("/", 2)[1],
            post_id=requests.utils.urlparse(response.url).path.rsplit("/", 2)[2],
            title="",
            title_addresses=re.findall(
                r"(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}|"
                "0x[a-zA-Z0-9]{40}|"
                "[LM3][a-km-zA-HJ-NP-Z1-9]{26,33}|"
                "(?:bitcoincash\:)?[qp][a-z0-9]{41}|"
                "(?:BITCOINCASH\:)?[QP][A-Z0-9]{41}|"
                "r[0-9a-zA-Z]{24,34}",
                str(call_to_action),
            ),
            post=tweet_text,
            mentions=" ".join(mentions),
            call_to_action=" ".join(call_to_action),
            bitcoin_addresses=re.findall(
                r"(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}", str(tweet_text)
            ),
            ethereum_addresses=re.findall(r"0x[a-zA-Z0-9]{40}", str(tweet_text)),
            litecoin_addresses=re.findall(
                r"[LM3][a-km-zA-HJ-NP-Z1-9]{26,33}", str(tweet_text)
            ),
            bitcoincash_addresses=re.findall(
                r"((bitcoincash:)?(q|p)[a-z0-9]{41})|((BITCOINCASH:)?(Q|P)[A-Z0-9]{41})",
                str(tweet_text),
            ),
            ripple_addresses=re.findall(r"r[0-9a-zA-Z]{24,34}", str(tweet_text)),
            total_addresses=re.findall(
                r"(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}|"
                "0x[a-zA-Z0-9]{40}|[LM3][a-km-zA-HJ-NP-Z1-9]{26,33}|"
                "(?:bitcoincash\:)?[qp][a-z0-9]{41}|"
                "(?:BITCOINCASH\:)?[QP][A-Z0-9]{41}|"
                "r[0-9a-zA-Z]{24,34}",
                str(tweet_text),
            ),
            author_username=username,
            author_fullname=full_name,
            post_sentiment={
                "likes": str(likes),
                "retweets": str(retweets),
                "code_snippet": "",
                "comment_count": str(replies),
            },
            post_timestamp=str(tweet_time),
            post_images=image_url,
            post_videos=post_video,
            comments=[
                dict(
                    comment_id=requests.utils.urlparse(comment_url).path.rsplit("/", 2)[
                        2
                    ],
                    comment_text=comment_text,
                    mentions=" ".join(comment_mentions),
                    call_to_action=" ".join(comment_call_to_action),
                    cta_addresses=re.findall(
                        r"(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}|"
                        "0x[a-zA-Z0-9]{40}|"
                        "[LM3][a-km-zA-HJ-NP-Z1-9]{26,33}|"
                        "(?:bitcoincash\:)?[qp][a-z0-9]{41}|"
                        "(?:BITCOINCASH\:)?[QP][A-Z0-9]{41}|"
                        "r[0-9a-zA-Z]{24,34}",
                        str(comment_call_to_action),
                    ),
                    mentions_addresses=re.findall(
                        r"(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}|"
                        "0x[a-zA-Z0-9]{40}|"
                        "[LM3][a-km-zA-HJ-NP-Z1-9]{26,33}|"
                        "(?:bitcoincash\:)?[qp][a-z0-9]{41}|"
                        "(?:BITCOINCASH\:)?[QP][A-Z0-9]{41}|"
                        "r[0-9a-zA-Z]{24,34}",
                        str(mentions),
                    ),
                    btc_addresses=re.findall(
                        r"(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}", str(comment_text)
                    ),
                    eth_addresses=re.findall(r"0x[a-zA-Z0-9]{40}", str(comment_text)),
                    ltc_addresses=re.findall(
                        r"[LM3][a-km-zA-HJ-NP-Z1-9]{26,33}", str(comment_text)
                    ),
                    bch_addresses=re.findall(
                        r"((?:bitcoincash:)?(q|p)[a-z0-9]{41})|"
                        "((?:BITCOINCASH:)?(Q|P)[A-Z0-9]{41})",
                        str(comment_text),
                    ),
                    xrp_addresses=re.findall(r"r[0-9a-zA-Z]{24,34}", str(comment_text)),
                    total_addresses=re.findall(
                        r"(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}|"
                        "0x[a-zA-Z0-9]{40}|"
                        "[LM3][a-km-zA-HJ-NP-Z1-9]{26,33}|"
                        "(?:bitcoincash\:)?[qp][a-z0-9]{41}|"
                        "(?:BITCOINCASH\:)?[QP][A-Z0-9]{41}|"
                        "r[0-9a-zA-Z]{24,34}",
                        str(comment_text),
                    ),
                    sentiment={
                        "retweets": str(comment_retweets),
                        "likes": str(comment_likes),
                    },
                    timestamp=str(comment_date_time),
                    images=comment_image_list,
                    videos=comment_video,
                    author={
                        "username": comment_username,
                        "fullname": comment_full_name,
                    },
                )
            ],
            network="clearweb",
            source="twitter",
            html="" if IS_LOCAL_SERVER_ENV else article.html,
            images=article.images,
            tags=list(article.tags),
            movies=article.movies,
            meta_description=article.meta_description,
            meta_keywords=article.meta_keywords,
            meta_lang=article.meta_lang,
        )
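The same cryptocurrency address patterns recur throughout these spiders; a quick self-contained check of the Bitcoin and Ethereum expressions used above (the samples are the well-known Bitcoin genesis address and a synthetic hex string, purely for illustration):

import re

BTC = r"(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}"
ETH = r"0x[a-zA-Z0-9]{40}"

sample = "send to 1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa or 0x" + "ab" * 20
print(re.findall(BTC, sample))  # ['1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa']
print(re.findall(ETH, sample))  # the synthetic 0xabab...ab address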