Example #1
    def caronaInfo(self):
        # Requires: import requests; from bs4 import BeautifulSoup
        resp = requests.get('https://www.worldometers.info/coronavirus/')
        soup = BeautifulSoup(resp.text, "html.parser")

        # Locate the main statistics table once, then find the "World" and
        # "India" rows by their label text. BeautifulSoup keeps a whitespace
        # text node between adjacent <td> cells, so each cell is two
        # next_sibling steps from the previous one.
        table = soup.find(id="main_table_countries_today")
        world_row = table.find("td", string="World").parent
        india_row = table.find("a", string="India").parent.parent

        world = world_row.td.next_sibling.next_sibling.string
        india = india_row.td.next_sibling.next_sibling.string
        newCases = (india_row.td.next_sibling.next_sibling
                    .next_sibling.next_sibling.string)
        deathCases = (india_row.td.next_sibling.next_sibling.next_sibling
                      .next_sibling.next_sibling.next_sibling.string)
        recCases = (india_row.td.next_sibling.next_sibling.next_sibling
                    .next_sibling.next_sibling.next_sibling.next_sibling
                    .next_sibling.next_sibling.next_sibling.string)

        # The global death and recovery totals are rendered in separate
        # "maincounter-wrap" counter blocks near the top of the page.
        counters = soup.find_all(id="maincounter-wrap")
        totalDeath = counters[1].span.string
        totalRec = counters[2].span.string

        return {
            'world': world,
            'india': india,
            'deathCases': deathCases,
            'recCases': recCases,
            'newCases': newCases,
            'totalDeath': totalDeath,
            'totalRec': totalRec
        }
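The paired next_sibling steps above exist only to skip the whitespace text nodes between <td> cells, which makes the long chains fragile to read. A minimal alternative sketch, reusing the india_row lookup from the method above; the cell indexes simply mirror the sibling counts in the original chains (two steps per cell) and otherwise assume the same table layout:

# Hedged sketch: index the row's <td> cells instead of chaining siblings.
# Each pair of next_sibling steps advances exactly one cell, so cells[n]
# corresponds to td + 2*n next_sibling steps in the code above.
cells = india_row.find_all("td")
india = cells[1].get_text(strip=True)       # td + 2 siblings above
newCases = cells[2].get_text(strip=True)    # td + 4 siblings
deathCases = cells[3].get_text(strip=True)  # td + 6 siblings
recCases = cells[5].get_text(strip=True)    # td + 10 siblings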
Example #2
    def parse(self, response):
        """
        Parse callback overriding scrapy.Spider.parse (see the Scrapy docs).

        :param response: the response generated for each crawled request
        :return: None; the extracted items are printed to stdout
        """
        # inspect_response(response, self)  # uncomment to debug interactively
        es = ElementSelectors()
        item_batch = response.css(es.sale_items_batch).getall()

        dict_items_num = {}

        for i, item_html in enumerate(item_batch):
            dict_item_info = {}
            # Re-wrap each HTML fragment in its own Selector so the remaining
            # CSS selectors are applied relative to that fragment only.
            item_link = Selector(text=item_html).css('body >' + es.item_link)
            item_link_sel = Selector(text=item_link.get())
            dict_item_info['item_link'] = item_link.attrib['href']
            dict_item_info['item_old_price'] = item_link_sel.css(
                es.item_old_cost).get()
            dict_item_info['item_sale_price'] = item_link_sel.css(
                es.item_current_cost).get()
            dict_items_num[i] = dict_item_info

        for i, info in dict_items_num.items():
            print("ITEM {}: {} {} {}".format(
                i, info['item_link'], info['item_old_price'],
                info['item_sale_price']))
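ElementSelectors is defined elsewhere in this project. A minimal sketch of the kind of selector container the code assumes; the CSS strings here are placeholders, not the target site's real markup:

class ElementSelectors:
    """Hypothetical selector container; the strings are placeholders."""
    sale_items_batch = 'div.sale-item'           # one fragment per listed item
    item_link = 'a.item-link'                    # anchor wrapping each item
    item_old_cost = 'span.old-price::text'       # pre-sale price text
    item_current_cost = 'span.sale-price::text'  # discounted price text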
Example #3
    def parse(self, response):
        # Extract the post number (id) from the "t=" query parameter
        try:
            match = re.search(r't=(\d*)', response.url)
            post_number = match.group(1)
        except Exception as e:
            print(response.url)
            print(e)
            return  # without a post number there is nothing to save

        folder_path = "download/" + str(post_number)

        # Keep track of all the visited pages
        if post_number not in visited:
            visited[post_number] = {
                "comments": [],
                "n_pages": 0
            }

        # Bail out if the board served its "topic does not exist" error page
        not_found = Selector(response).xpath("//table[@class='forumline']//td[@class='row1']/table/tr[2]/td/span[@class='gen']").extract()
        if not_found:
            print("ERROR PAGE")
            return

        # Create the folder if it doesn't exist (including the parent "download/")
        try:
            os.makedirs(folder_path, exist_ok=True)
        except OSError as e:
            print(e)

        # Save page N of the current post id
        with open(folder_path + "/" + str(visited[post_number]['n_pages']) + ".html", "wb") as f:
            f.write(response.body)

        # Start scraping the data
        row = Selector(response).xpath("//table[@class='forumline']/tr").extract()
        title = Selector(response).xpath("//a[@class='maintitle']/text()").get()
        next_page = Selector(response).xpath("//span[@class='nav']//a[contains(text(),'Next')]/@href")

        for item in row[2:]:
            # The author name sits either in a <span> inside <b>, or directly in <b>
            author_1 = Selector(text=item).xpath("//td[1]//b/span/text()").get()
            author_2 = Selector(text=item).xpath("//td[1]//b/text()").get()
            author = author_1 if author_1 is not None else author_2
            if author:
                date_text = Selector(text=item).xpath("//td[2]/table//span[@class='postdetails']/text()").get().replace("Posted: ", "").strip()
                comment_content = Selector(text=item).xpath("//td[2]/table/tr[3]/td").get()
                visited[post_number]['comments'].append(Comment(author, dateparser.parse(date_text), comment_content))

        # If the post has more comment pages, queue them up to be visited
        if len(next_page) > 0:
            visited[post_number]['n_pages'] += 1
            next_page_url = BASE_URL + next_page.get()
            yield scrapy.Request(next_page_url, callback=self.parse)

        # On the last page, store the accumulated JSON in a file and delete the comments to free memory
        else:
            post = visited[post_number]['comments'][0]
            new_post = Post(post_number, title, post.author, post.content, post.date, visited[post_number]['comments'][1:])
            with open(folder_path + "/items.json", "w") as f:
                f.write(json.dumps(new_post.toJson()))
            del visited[post_number]['comments']
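Comment, Post, visited, and BASE_URL live outside this snippet. A minimal sketch of module-level helpers consistent with the constructor and toJson() calls above; the field layout and the BASE_URL value are assumptions:

visited = {}  # module-level cache of posts seen so far (assumed)
BASE_URL = "https://forum.example.com/"  # placeholder base URL (assumed)

class Comment:
    """Hypothetical helper matching Comment(author, date, content) above."""
    def __init__(self, author, date, content):
        self.author = author
        self.date = date
        self.content = content

    def toJson(self):
        return {"author": self.author,
                "date": self.date.isoformat() if self.date else None,
                "content": self.content}

class Post:
    """Hypothetical helper matching Post(number, title, author, content, date, comments)."""
    def __init__(self, number, title, author, content, date, comments):
        self.number = number
        self.title = title
        self.author = author
        self.content = content
        self.date = date
        self.comments = comments

    def toJson(self):
        return {"number": self.number, "title": self.title,
                "author": self.author, "content": self.content,
                "date": self.date.isoformat() if self.date else None,
                "comments": [c.toJson() for c in self.comments]}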