def parse_conference_page_info(self, page: str, qlink):
    page_dom = BeautifulSoup(page, 'html.parser')
    title = page_dom.find(name="span", attrs={"property": "v:description"}).text
    url = page_dom.find(name="a", attrs={"target": "_newtab"})["href"]
    info = self.extract_info(page_dom)
    if "deadline" not in info:
        raise ValueError(
            "Deadline is a mandatory field, could not parse the page")
    categories = self.extract_categories(page_dom)
    bulk_text = ""
    qresult = None
    try:
        qresult = page_dom.select("div.cfp")
        bulk_text = qresult[0].text
    except Exception as e:
        self.logger.warning(
            "Failed to parse bulk text information, css query result: {} error: {}"
            .format(qresult, e))
    metadata = self.create_metadata(qlink, self.base_address, self.site_name)
    return Conference(
        **info,
        **{
            "title": title,
            "url": url,
            "categories": categories,
            "bulk_text": bulk_text,
            "metadata": metadata
        })
def test_insert_datetime(self):
    self.assertDictEqual.__self__.maxDiff = None
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        conf = Conference(title="Something",
                          url="www.something.com",
                          deadline=datetime.now(),
                          metadata=Metadata("something", datetime.now(),
                                            "www.something.com",
                                            "www.something.com", "something"),
                          dateRange=[
                              datetime.now(),
                              datetime.now() + timedelta(days=10)
                          ],
                          finalDue=datetime.now(),
                          location="something",
                          categories=["something"],
                          bulkText="something")
        is_inserted = self.mongo_db.put(conf)
        self.assertIsNotNone(is_inserted)
def get_conferenceobj(self):
    title = "title"
    url = "http://url.com"
    deadline = "deadline"
    metadata = self.get_metadata()
    conference_obj = Conference(title, url, deadline, metadata)
    return conference_obj
def get_valid_data(self):
    return Conference(title="something",
                      url="anything",
                      deadline=datetime.now(),
                      metadata=Metadata(__name__, datetime.now(),
                                        "something.com/something",
                                        "something.com", "anythingProd"))
def parse_action(self):
    meta = Metadata(__name__,
                    datetime.datetime.now(),
                    website_url="somename.co.in/link",
                    domain_url="somename.co.in",
                    domain_name="somename",
                    **{"extra": "info you want to keep"})
    data = Conference(
        **{
            "title": "",
            "url": "",
            "deadline": datetime.datetime.now(),
            "metadata": meta
        })
    ## There are other optional fields for Conference as well;
    ## check out the docstring.
    ## Once done you can call the db action.
    ## Use the methods already provided by the Scrapper class, like
    ## getDate, getPage etc.
    ## They are tested methods and have less chance of breaking your code.
    # self.getPage(" -- some page link --", " -- some debug message --")
    #
    # PARSE DATA
    #
    # self.push_todb(data)
    self.logger.info(
        "Yay!! data was put into db, hopefully!! Check error logs, i.e. run with log level error"
    )
def _parse_top_conference(self, link: str) -> Conference:
    """
    Parses an individual top conference page and returns a Conference object

    Args:
    ---
        link: str

    Returns:
    ---
        Conference: object
    """
    page = self.get_page(qlink=link,
                         debug_msg=f'Parsing {link}',
                         allow_redirects=True)
    try:
        content = page.content
    except Exception as e:
        raise PageParsingError(
            f"The following error occurred while parsing {link} Trace: {e}")
    soup = bs(content, "html5lib")
    post_div = soup.find("div", attrs={"class": "single_post"})
    post_tables = post_div.find_all("table")
    title = soup.h1.text
    conf_info = self._get_top_conf_info(name=title, table=post_tables[0])
    rating_info = self._get_top_conf_ranking(name=title, table=post_tables[1])
    categories = list(rating_info.keys())[1:-1] if rating_info else []
    bulk_text = self._get_top_conf_bulk_text(soup)
    url = conf_info.get("link")
    deadline = conf_info.get("deadline")
    metadata = Metadata(
        __name__,
        dt.now(),
        link,
        self.base_address,
        self.scrapper_name,
    )
    additional_data = {}
    additional_data["dateRange"] = conf_info.get("dateRange")
    additional_data["location"] = conf_info.get("location")
    if not deadline:
        self.logger.debug(
            f"{title} not added because deadline info is not available")
        return None
    else:
        self.logger.debug(f"{title} is now added to the database")
        return Conference(title=title,
                          url=url,
                          deadline=deadline,
                          metadata=metadata,
                          bulkText=bulk_text,
                          categories=categories,
                          rankings=rating_info,
                          **additional_data)
def __init__(self,
             logger: Logger,
             database_name: str,
             collection_name: str,
             host: str = 'localhost',
             port: int = 27017,
             maxPoolSize: int = None,
             **kwargs):
    """Mongo database object

    Arguments:
        logger {Logger} -- logger passed by the user
        database_name {str} -- name of the database to be used
        collection_name {str} -- collection for the mongodb db

    Keyword Arguments:
        host {str} -- host for mongodb (default: {'localhost'})
        port {int} -- port for mongodb (default: {27017})
        maxPoolSize {int} -- max pool size for mongodb (default: {None})

    Raises:
        e: error when the database connection fails. These are unhandled
           connection errors and the application must stop immediately in
           such cases
    """
    self.logger = logger
    try:
        self.logger.debug("Using Database name {}".format(database_name))
        self.logger.debug("Using address {}:{}".format(host, port))
        client = MongoClient(host, int(port), maxPoolSize=maxPoolSize)
        self.client = client
        db = client[database_name]  ## Creates a new database if not existing
        ## Quirk of the pymongo client: any error from the statement below
        ## leads to an unsupported operation for the database, whereas the
        ## intended structure is a collection. Should be addressed in pymongo.
        self.logger.debug("Using Collection name {}".format(collection_name))
        collection = db[collection_name]
        client.server_info()
        self.logger.info(
            "Successfully created mongodb client connection on host:{}, port:{}"
            .format(host, port))
        self.db = db
        self.collection = collection
        index_info = collection.index_information()
        possible_index = Conference.index()  # -> [(string, bool)]
        possible_index = filter(lambda x: (x[0] + "_1") not in index_info,
                                possible_index)
        for idx, unique in possible_index:
            collection.create_index([(idx, pymongo.ASCENDING)], unique=unique)
    except Exception as e:
        self.logger.error(
            "Failed to initiate mongodb client error: {}".format(e))
        raise e
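# A minimal usage sketch for the database wrapper above. Assumptions (not taken
# from this snippet): the class is exported as `MongoDB`, and `put` returns the
# inserted document id or None on failure, as the tests in this module suggest.
# Database and collection names here are illustrative only.
import logging
from datetime import datetime

logger = logging.getLogger(__name__)
db = MongoDB(logger,
             database_name="conference_db",
             collection_name="conferences")
conf = Conference(title="Example Conf",
                  url="http://example.com/cfp",
                  deadline=datetime.now(),
                  metadata=Metadata(__name__, datetime.now(),
                                    "example.com/cfp", "example.com",
                                    "exampleScrapper"))
if db.put(conf) is None:
    logger.error("Conference failed validation or was not inserted")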
def _parse_all_conference(self, link: str):
    """
    Parses an individual conference page and returns a Conference object

    Args:
    ---
        link: str

    Returns:
    ---
        Conference: object
    """
    page = self.get_page(qlink=link,
                         debug_msg=f'Parsing {link}',
                         allow_redirects=True)
    try:
        content = page.content
    except Exception as e:
        self.logger.error(
            f"The following error occurred while trying to parse {link} {e}")
        return None
    soup = bs(content, "html5lib")
    content_div = soup.find("div", attrs={"id": "content_box"})
    title = content_div.h1.text
    tables = soup.find_all('table')
    conf_info = self._get_all_conf_info(name=title, infotable=tables[0])
    bulk_text = self._get_all_conf_bulk(soup=soup)
    metadata = Metadata(
        __name__,
        dt.now(),
        link,
        self.base_address,
        self.scrapper_name,
    )
    additional_data = {}
    additional_data["bulkText"] = bulk_text
    additional_data["dateRange"] = conf_info.get("dateRange")
    additional_data["location"] = conf_info.get("location")
    deadline = conf_info.get("deadline")
    if not deadline:
        self.logger.debug(
            f"{title} not added because deadline info is not available")
        return None
    else:
        self.logger.debug(f"{title} is now added to the database")
        return Conference(title=title,
                          url=conf_info.get("link"),
                          deadline=deadline,
                          metadata=metadata,
                          **additional_data)
def test_invalid_datetime(self):
    self.assertDictEqual.__self__.maxDiff = None
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        conf = Conference(title="Something",
                          url="www.something.com",
                          deadline=datetime.now() - timedelta(days=10),
                          metadata=Metadata("something", datetime.now(),
                                            "www.something.com/something.html",
                                            "www.something.com", "something"),
                          dateRange=[
                              datetime.now(),
                              datetime.now() - timedelta(days=10)
                          ],
                          finalDue=datetime.now(),
                          location="something",
                          categories=["something"],
                          bulkText="something")
        is_inserted = self.mongo_db.put(conf)
        self.assertIsNone(is_inserted)
def get_invalid_data(self):
    return Conference(title="something",
                      url="anything",
                      deadline="anything",
                      metadata="anything")