import datetime
import hashlib

from bs4 import BeautifulSoup

# DbOperations, Helper, QueryType and the shared crawler helper are
# project-internal modules; hedged sketches of the assumed interfaces
# appear between the functions below.


def processNewsBasedOnTitle(news_collection, processed_collection, company):
    """Copy unprocessed rows for `company` into the processed collection
    and flag them as used in the source collection."""
    isInserted = 0
    rowCount = 0
    for row in DbOperations.GetData(
            news_collection,
            {"is_used": {'$exists': False}, "news_provider": company},
            {}):
        try:
            DbOperations.InsertIntoMongo(processed_collection, row)
            isInserted = 1
            print('Success in inserting Process collection => [title: "' +
                  row['title'] + '"]')
            # Mark the source row as consumed so it is not picked up again.
            DbOperations.Update_oneMongo(
                news_collection,
                {"news_title_uid": row['news_title_uid']},
                {"$set": {"is_used": 1}})
            rowCount += 1
        except Exception as e:
            print('Error in inserting Process collection => [title: "' +
                  row['title'] + '"]', e)
    return isInserted, rowCount
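# A minimal sketch of the DbOperations interface assumed above, backed by
# pymongo. The real module is not shown in this section; the names and
# signatures are inferred from the call sites (GetData, InsertIntoMongo,
# Update_oneMongo, QueryType), so treat this as an illustration, not the
# actual implementation. The connection string is a placeholder.
import enum

from pymongo import MongoClient


class QueryType(enum.Enum):
    one = 1
    many = 2


class DbOperations:
    _db = MongoClient("mongodb://localhost:27017")["news"]  # assumed

    @staticmethod
    def GetData(collection, query, projection, query_type=QueryType.many):
        coll = DbOperations._db[collection]
        # An empty projection dict is treated as "all fields".
        if query_type == QueryType.one:
            return coll.find_one(query, projection or None)
        return list(coll.find(query, projection or None))

    @staticmethod
    def InsertIntoMongo(collection, docs):
        coll = DbOperations._db[collection]
        if isinstance(docs, list):
            if docs:  # insert_many rejects an empty list
                coll.insert_many(docs)
        else:
            coll.insert_one(docs)

    @staticmethod
    def Update_oneMongo(collection, query, update):
        DbOperations._db[collection].update_one(query, update)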
def crawler(self):
    try:
        response = crawler.MakeRequest(self.url, "Get")
        soup = BeautifulSoup(response.content, "html.parser")
        data = []
        boxs = soup.find_all("div", {"class": 'news-box span3 left'})
        for box in boxs:
            datadict = Helper.get_news_dict()
            url = "https://www.pemex.com" + box.find("a")['href']

            # Skip articles that were already saved in a previous run.
            unqUrl = hashlib.md5(url.encode()).hexdigest()
            chkIsExists = DbOperations.GetData(
                self.news_collection, {"news_url_uid": str(unqUrl)},
                {}, QueryType.one)
            if chkIsExists:
                print("Already saved. url - ( " + url + " )")
                continue

            description = self.fetchDescription(url)
            date = box.find("p", {"class": "news-meta news-date"}).text
            title = box.find("div", {"class": "ms-WPBody h2"}).text
            datadict.update({
                "url": url,
                "date": date,
                "news_provider": "pemex",
                "formatted_sub_header": title,
                "publishedAt": Helper.parse_date(date),
                "description": description,
                "title": title,
                "link": self.url,
                "text": description,
                "company_id": "pemex",
                "news_url_uid": unqUrl
            })
            data.append(datadict)
        DbOperations.InsertIntoMongo(self.news_collection, data)
    except Exception:
        self.logger.error("Error Occurred:\n", exc_info=True)
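# fetchDescription is called above but not defined in this section. A minimal
# sketch, assuming it downloads the article page and joins its paragraph
# text; the "ms-rtestate-field" container class is a guess at the Pemex page
# markup, not a confirmed selector.
def fetchDescription(self, url):
    response = crawler.MakeRequest(url, "Get")
    soup = BeautifulSoup(response.content, "html.parser")
    body = soup.find("div", {"class": "ms-rtestate-field"})  # assumed
    if body is None:
        return ""
    return " ".join(p.text.strip() for p in body.find_all("p"))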
def makeLog(newlogcollection, processedcollection, companyname):
    print("Your Hourly Collection is - " + processedcollection)
    log = {}
    log['db_name'] = processedcollection
    log['processed_by_all_type_news'] = 1
    log['endt'] = datetime.datetime.now()
    log['script_scrapped_name'] = str(companyname) + '_daily_scrapping_python'
    DbOperations.InsertIntoMongo(newlogcollection, log)
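# Example wiring of the pieces above into a single daily run. The scraper
# class name and the collection names are hypothetical; the real entry point
# is not shown in this section.
if __name__ == "__main__":
    scraper = PemexScraper()  # hypothetical wrapper exposing crawler()
    scraper.crawler()
    isInserted, rowCount = processNewsBasedOnTitle(
        "pemex_news", "pemex_processed", "pemex")
    if isInserted:
        makeLog("scraper_logs", "pemex_processed", "pemex")
    print("Processed " + str(rowCount) + " new articles")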
def crawler(self):
    try:
        data = []
        # self.url never changes here, so a single request suffices; the
        # original `while True` loop refetched the same page indefinitely.
        response = crawler.MakeRequest(self.url, "Get")
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")
            boxs = soup.find_all("div", {"class": 'item'})
            for box in boxs:
                date = Helper.parse_date(
                    box.find("p", {"class": "fade"}).text)
                # Stop once we reach articles from a previous year.
                if date and date.year < datetime.datetime.now().year:
                    break

                url = "https://www.bd.com/" + box.find("a")['href']

                # Skip articles that were already saved in a previous run.
                unqUrl = hashlib.md5(url.encode()).hexdigest()
                chkIsExists = DbOperations.GetData(
                    self.news_collection, {"news_url_uid": str(unqUrl)},
                    {}, QueryType.one)
                if chkIsExists:
                    print("Already saved. url - ( " + url + " )")
                    continue

                description = self.fetchDescription(url)
                title = box.find("a").text.strip()
                datadict = Helper.get_news_dict()
                datadict.update({
                    "url": url,
                    "date": date,
                    "news_provider": "Becton, Dickinson and Company",
                    "formatted_sub_header": title,
                    "publishedAt": date,
                    "description": description,
                    "title": title,
                    "news_title_uid": hashlib.md5(title.encode()).hexdigest(),
                    "link": url,
                    "text": description,
                    "ticker": "bd_scrapped",
                    "industry_name": "Becton, Dickinson and Company",
                    "company_id": "Becton, Dickinson and Company",
                    "news_url_uid": unqUrl
                })
                data.append(datadict)
        DbOperations.InsertIntoMongo(self.news_collection, data)
    except Exception:
        self.logger.error("Error Occurred:\n", exc_info=True)
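# Helper.parse_date and Helper.get_news_dict are used throughout but not
# shown. A minimal sketch, assuming parse_date wraps dateutil and returns
# None on unparseable input (inferred from the `if date` guards above), and
# get_news_dict returns a dict of shared default fields.
from dateutil import parser as date_parser


class Helper:
    @staticmethod
    def parse_date(text):
        try:
            return date_parser.parse(text.strip())
        except (ValueError, OverflowError):
            return None

    @staticmethod
    def get_news_dict():
        # The real default schema is unknown; an empty dict keeps the
        # sketch neutral, since each crawler fills fields via update().
        return {}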
def crawler(self):
    try:
        data = []
        counter = 1
        stop = False
        while not stop:
            response = crawler.MakeRequest(
                self.url.format(counter=counter), "Get")
            if response.status_code != 200:
                break
            soup = BeautifulSoup(response.content, "html.parser")
            boxs = soup.find_all("div", {"class": 'unicom-newsListItem'})
            for box in boxs:
                rawdate = box.find(
                    "p", {"class": "unicom-listInformationDate"}).text
                date = Helper.parse_date(rawdate) if rawdate else None
                # Stop paginating once we reach a previous year; the plain
                # `break` in the original only exited the inner loop.
                if date and date.year < datetime.datetime.now().year:
                    stop = True
                    break

                url = box.find("a")['href']

                # Skip articles that were already saved in a previous run.
                unqUrl = hashlib.md5(url.encode()).hexdigest()
                chkIsExists = DbOperations.GetData(
                    self.news_collection, {"news_url_uid": str(unqUrl)},
                    {}, QueryType.one)
                if chkIsExists:
                    print("Already saved. url - ( " + url + " )")
                    continue

                description = self.fetchDescription(url)
                title = box.find(
                    "h3", {"class": "unicom-newsListTitleIn"}).text
                datadict = Helper.get_news_dict()
                datadict.update({
                    "newsurl": url,
                    "url": url,
                    "link": url,
                    "news_url_uid": unqUrl,
                    "date": rawdate,
                    "news_provider": "panasonic",
                    "formatted_sub_header": title,
                    "publishedAt": date,
                    "description": description,
                    "title": title
                })
                data.append(datadict)
            # Advance one page; the original `counter += counter` doubled
            # the page number and skipped pages.
            counter += 1
            self.url = "https://news.panasonic.com/global/all/all_{counter}.html"
        DbOperations.InsertIntoMongo(self.news_collection, data)
    except Exception:
        self.logger.error("Error Occurred:\n", exc_info=True)
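# The "already saved" guard is duplicated in every crawler above. A small
# helper like this sketch (the name is hypothetical) would keep the MD5
# hashing and the Mongo lookup in one place:
def is_already_saved(news_collection, url):
    uid = hashlib.md5(url.encode()).hexdigest()
    return bool(DbOperations.GetData(
        news_collection, {"news_url_uid": uid}, {}, QueryType.one))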