Code Example #1
File: helper.py  Project: Kashyap10/NewsScrapping
def processNewsBasedOnTitle(news_collection, processed_collection, company):
    # Copy every unprocessed news row for the given provider into the
    # processed collection, marking each source row as used.
    isInserted = 0
    rowCount = 0
    query = {"is_used": {"$exists": False}, "news_provider": company}
    for row in DbOperations.GetData(news_collection, query, {}):
        try:
            DbOperations.InsertIntoMongo(processed_collection, row)
            isInserted = 1
            print('Success in inserting Process collection => [title: "' +
                  row['title'] + '"]')
            # Flag the original row so it is not picked up again.
            DbOperations.Update_oneMongo(
                news_collection, {"news_title_uid": row['news_title_uid']},
                {"$set": {"is_used": 1}})
            rowCount += 1
        except Exception as e:
            print('Error in inserting Process collection => [title: "' +
                  row['title'] + '"]', e)
    return isInserted, rowCount
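
Taken on its own, the function only needs two collection names and a provider string. A minimal usage sketch, assuming hypothetical collection names and that the project's DbOperations module is already configured against the target MongoDB:

# Illustrative call only: the collection names are assumptions; "pemex" is one
# of the providers used elsewhere in this project.
inserted, copied = processNewsBasedOnTitle("daily_news", "processed_news", "pemex")
print("any row inserted:", inserted, "- rows copied:", copied)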
Code Example #2
File: pemex.py  Project: Kashyap10/NewsScrapping
    def crawler(self):
        try:
            response = crawler.MakeRequest(self.url, "Get")
            soup = BeautifulSoup(response.content, "html.parser")
            data = []
            boxs = soup.find_all("div", {"class": 'news-box span3 left'})
            for box in boxs:
                datadict = Helper.get_news_dict()
                url = "https://www.pemex.com" + box.find("a")['href']

                # Skip articles that are already stored, keyed by URL hash
                unqUrl = hashlib.md5(url.encode()).hexdigest()
                chkIsExists = DbOperations.GetData(
                    self.news_collection, {"news_url_uid": str(unqUrl)}, {},
                    QueryType.one)
                if chkIsExists:
                    print("Already saved. url - ( " + url + " )")
                    continue

                description = self.fetchDescription(url)
                date_text = box.find("p", {"class": "news-meta news-date"}).text
                title = box.find("div", {"class": "ms-WPBody h2"}).text
                datadict.update({
                    "url": url,
                    "date": date_text,
                    "news_provider": "pemex",
                    "formatted_sub_header": title,
                    "publishedAt": Helper.parse_date(date_text),
                    "description": description,
                    "title": title,
                    "link": self.url,
                    "text": description,
                    "company_id": "pemex",
                    "news_url_uid": unqUrl,
                })
                data.append(datadict)

            DbOperations.InsertIntoMongo(self.news_collection, data)
        except Exception as e:
            self.logger.error(f"Error occurred: {e}", exc_info=True)
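
Each crawler in this listing repeats the same duplicate check: hash the article URL with MD5 and look it up under news_url_uid before scraping it again. A minimal sketch of that check as a standalone helper, assuming the project's DbOperations and QueryType are imported; the function name is ours, not part of the project:

import hashlib

def is_already_saved(news_collection, url):
    # Hypothetical helper mirroring the check above: True when an article
    # with this URL hash is already stored under "news_url_uid".
    uid = hashlib.md5(url.encode()).hexdigest()
    record = DbOperations.GetData(news_collection, {"news_url_uid": uid}, {},
                                  QueryType.one)
    return bool(record)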
Code Example #3
File: helper.py  Project: Kashyap10/NewsScrapping
def makeLog(newlogcollection, processedcollection, companyname):
    # Record a log entry for this scraping run in the log collection.
    print("Your Hourly Collection is - " + processedcollection)
    log = {
        'db_name': processedcollection,
        'processed_by_all_type_news': 1,
        'endt': datetime.datetime.now(),
        'script_scrapped_name': str(companyname) + '_daily_scrapping_python',
    }
    DbOperations.InsertIntoMongo(newlogcollection, log)
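
A hypothetical call, pairing makeLog with the processing helper from Code Example #1; the collection names are illustrative only:

# Illustrative only: the collection names are assumptions.
processNewsBasedOnTitle("daily_news", "processed_news", "pemex")
makeLog("scrapping_log", "processed_news", "pemex")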
Code Example #4
File: BD.py  Project: Kashyap10/NewsScrapping
    def crawler(self):
        try:
            counter = 1
            data = []
            while True:

                response = crawler.MakeRequest(self.url, "Get")
                soup = BeautifulSoup(response.content, "html.parser")
                if response.status_code == 200:

                    boxs = soup.find_all("div", {"class": 'item'})
                    for box in boxs:
                        date = Helper.parse_date(
                            box.find("p", {"class": "fade"}).text)
                        # Stop once articles from a previous year are reached
                        if date:
                            if date.year < datetime.datetime.now().year:
                                break

                        url = "https://www.bd.com/" + box.find("a")['href']

                        # Skip articles that are already stored, keyed by URL hash
                        unqUrl = hashlib.md5(url.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(
                            self.news_collection,
                            {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                        if chkIsExists:
                            print("Already saved. url - ( " + url + " )")
                            continue

                        datadict = Helper.get_news_dict()
                        description = self.fetchDescription(url)
                        title = box.find("a").text.strip()
                        datadict.update({
                            "url": url,
                            "date": date,
                            "news_provider": "Becton, Dickinson and Company",
                            "formatted_sub_header": title,
                            "publishedAt": date,
                            "description": description,
                            "title": title,
                            "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                            "link": url,
                            "text": description,
                            "ticker": "bd_scrapped",
                            "industry_name": "Becton, Dickinson and Company",
                            "company_id": "Becton, Dickinson and Company",
                            "news_url_uid": unqUrl,
                        })
                        data.append(datadict)
                else:
                    break
            DbOperations.InsertIntoMongo(self.news_collection, data)
        except Exception as e:
            self.logger.error(f"Error occurred: {e}", exc_info=True)
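
The BD crawler only keeps articles from the current calendar year: it parses the listing date and breaks out of the page as soon as an older article appears. The same cutoff in isolation, assuming Helper.parse_date returns a datetime or None:

import datetime

def is_current_year(parsed_date):
    # Hypothetical helper: parsed_date is assumed to be a datetime.datetime
    # (or None when the listing date could not be parsed).
    return (parsed_date is not None
            and parsed_date.year >= datetime.datetime.now().year)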
Code Example #5
File: panasonic.py  Project: Kashyap10/NewsScrapping
    def crawler(self):
        try:
            data = []
            counter = 1
            while True:
                response = crawler.MakeRequest(
                    self.url.format(counter=counter), "Get")
                if response.status_code == 200:
                    soup = BeautifulSoup(response.content, "html.parser")

                    boxs = soup.find_all("div",
                                         {"class": 'unicom-newsListItem'})
                    for box in boxs:
                        date_text = box.find(
                            "p", {"class": "unicom-listInformationDate"}).text
                        # Stop once articles from a previous year are reached
                        date = date_text
                        if date:
                            date = Helper.parse_date(date)
                            if date.year < datetime.datetime.now().year:
                                break

                        datadict = Helper.get_news_dict()
                        url = box.find("a")['href']
                        # Skip articles that are already stored, keyed by URL hash
                        unqUrl = hashlib.md5(url.encode()).hexdigest()
                        chkIsExists = DbOperations.GetData(
                            self.news_collection,
                            {"news_url_uid": str(unqUrl)}, {}, QueryType.one)
                        if chkIsExists:
                            print("Already saved. url - ( " + url + " )")
                            continue

                        datadict.update({"newsurl": url})
                        description = self.fetchDescription(url)
                        title = box.find(
                            "h3", {"class": "unicom-newsListTitleIn"}).text
                        datadict.update({
                            "url": url,
                            "link": url,
                            "news_url_uid": unqUrl,
                            "date": date_text,
                            "news_provider": "panasonic",
                            "formatted_sub_header": title,
                            "publishedAt": date,
                            "description": description,
                            "title": title,
                        })

                        data.append(datadict)
                    # Advance to the next listing page (all_1.html, all_2.html, ...)
                    counter += 1
                    self.url = "https://news.panasonic.com/global/all/all_{counter}.html"
                else:
                    break
            DbOperations.InsertIntoMongo(self.news_collection, data)
        except Exception as e:
            self.logger.error(f"Error occurred: {e}", exc_info=True)
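
The Panasonic crawler is the only paginated one in this listing: the URL is (re)assigned to a template with a {counter} placeholder, so each loop iteration requests the next listing page. A small sketch of how the template expands; the page count of 3 is illustrative:

# Pagination template taken from the example above; the range is illustrative.
url_template = "https://news.panasonic.com/global/all/all_{counter}.html"
for counter in range(1, 4):
    print(url_template.format(counter=counter))
# -> https://news.panasonic.com/global/all/all_1.html
#    https://news.panasonic.com/global/all/all_2.html
#    https://news.panasonic.com/global/all/all_3.html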