Example #1
0
    def parse_conference_page_info(self, page: str, qlink):
        """Parse one conference page into a Conference object.

        Args:
            page: raw HTML of the conference page.
            qlink: the link the page was fetched from; recorded in metadata.

        Returns:
            Conference built from the parsed title, url, info fields,
            categories and bulk text.

        Raises:
            ValueError: when the mandatory "deadline" field cannot be parsed.
        """
        page_dom = BeautifulSoup(page, 'html.parser')
        title = page_dom.find(name="span", attrs={
            "property": "v:description"
        }).text
        url = page_dom.find(name="a", attrs={"target": "_newtab"})["href"]
        info = self.extract_info(page_dom)
        if "deadline" not in info:
            raise ValueError(
                "Deadline is a mandatory field, could not parse the page")
        categories = self.extract_categories(page_dom)
        bulk_text = ""
        # Pre-bind so the except clause can log it even when select() raises.
        qresult = None
        try:
            qresult = page_dom.select("div.cfp")
            bulk_text = qresult[0].text
        except Exception as e:
            # Best-effort field: log and continue with an empty bulk_text.
            # logger.warn is deprecated; use logger.warning.
            self.logger.warning(
                "Failed to parse bulk text information css query result: {} error : {} "
                .format(qresult, e))

        metadata = self.create_metadata(qlink, self.base_address,
                                        self.site_name)

        return Conference(
            **info, **{
                "title": title,
                "url": url,
                "categories": categories,
                "bulk_text": bulk_text,
                "metadata": metadata
            })
Example #2
0
    def test_insert_datetime(self):
        """Insert a fully-populated, valid conference and expect a non-None
        result from the database put()."""
        self.assertDictEqual.__self__.maxDiff = None
        with warnings.catch_warnings():
            # Validation warnings from Conference are irrelevant to this test.
            warnings.simplefilter("ignore")
            conf = Conference(title="Something",
                              url="www.something.com",
                              deadline=datetime.now(),
                              metadata=Metadata("something", datetime.now(),
                                                "www.something.com",
                                                "www.something.com",
                                                "something"),
                              dateRange=[
                                  datetime.now(),
                                  datetime.now() + timedelta(days=10)
                              ],
                              finalDue=datetime.now(),
                              location="something",
                              categories=["something"],
                              bulkText="somthing")

            # put() presumably returns an insert result on success and None
            # on failure; the flag/assertEqual(True, ...) dance collapses to
            # a direct non-None assertion.
            is_inserted = self.mongo_db.put(conf)
            self.assertIsNotNone(is_inserted)
 def get_conferenceobj(self):
     """Return a minimal Conference fixture.

     Note: deadline is the literal string "deadline" here, not a datetime.
     """
     return Conference("title", "http://url.com", "deadline",
                       self.get_metadata())
 def get_valid_data(self):
     """Return a Conference fixture populated with valid placeholder data."""
     due = datetime.now()
     meta = Metadata(__name__, datetime.now(),
                     "something.com/something",
                     "something.com", "anythingProd")
     return Conference(title="something",
                       url="anything",
                       deadline=due,
                       metadata=meta)
Example #5
0
    def parse_action(self):
        """Template action: builds example Metadata/Conference objects and
        sketches the scrape-parse-store flow for new scrapers."""

        # Extra keyword arguments beyond the positional/named ones are kept
        # by Metadata as additional info.
        meta = Metadata(__name__,
                        datetime.datetime.now(),
                        website_url="somename.co.in/link",
                        domain_url="somename.co.in",
                        domain_name="somename",
                        extra="info you want to keep")

        data = Conference(title="",
                          url="",
                          deadline=datetime.datetime.now(),
                          metadata=meta)

        ## There are other optional fields also for conference
        ## check out the docstring
        ## Once done you can call dbaction

        ## Use the already provided method from Scrapper class like
        ## getDate , getPage etc.
        ## They are tested methods and have lesser chance of breaking your code.

        # self.getPage(" -- some page link --" , " -- some debug message --")
        #
        # PARSE DATA
        #
        # self.push_todb(data)

        self.logger.info(
            "Yay !! data was put into db hopefully !! Check error logs , i.e run with log level error"
        )
Example #6
0
    def _parse_top_conference(self, link: str) -> Conference:
        """
        Parses individual top conference page and,
        return Conference object

        Args:
        ---
        link: str

        Returns:
        ---
        Conference: object, or None when no deadline is available

        Raises:
        ---
        PageParsingError: when the fetched page exposes no usable content
        """
        page = self.get_page(qlink=link,
                             debug_msg=f'Parsing {link}',
                             allow_redirects=True)
        try:
            content = page.content
        except Exception as e:
            # BUG FIX: the exception was constructed but never raised,
            # which left `content` unbound and crashed on the next line
            # with a misleading NameError.
            raise PageParsingError(
                f"The following error occured while parsing {link} Trace:{e}")
        soup = bs(content, "html5lib")
        post_div = soup.find("div", attrs={"class": "single_post"})
        post_tables = post_div.find_all("table")
        title = soup.h1.text
        conf_info = self._get_top_conf_info(name=title, table=post_tables[0])
        rating_info = self._get_top_conf_ranking(name=title,
                                                 table=post_tables[1])
        # First and last keys of the ranking dict are not category names.
        categories = list(rating_info.keys())[1:-1] if rating_info else []
        bulk_text = self._get_top_conf_bulk_text(soup)
        url = conf_info.get("link")
        deadline = conf_info.get("deadline")
        metadata = Metadata(
            __name__,
            dt.now(),
            link,
            self.base_address,
            self.scrapper_name,
        )
        additional_data = {}
        additional_data["dateRange"] = conf_info.get("dateRange")
        additional_data["location"] = conf_info.get("location")
        if not deadline:
            self.logger.debug(
                f"{title} not added, because, deadline info not available")
            return None
        # Log success only when a Conference is actually returned (the old
        # placement logged "added" even for skipped entries).
        self.logger.debug(f"{title} is now added to the database")
        return Conference(title=title,
                          url=url,
                          deadline=deadline,
                          metadata=metadata,
                          bulkText=bulk_text,
                          categories=categories,
                          rankings=rating_info,
                          **additional_data)
Example #7
0
    def __init__(self,
                 logger: Logger,
                 database_name: str,
                 collection_name: str,
                 host: str = 'localhost',
                 port: int = 27017,
                 maxPoolSize: int = None,
                 **kwargs):
        """Mongo database object

        Arguments:
            logger {[logging]} -- logger passed by user
            database_name {[type]} -- name of database to be used
            collection_name {[type]} -- collection for the mongodb db

        Keyword Arguments:
            host {str} -- host for mongodb (default: {'localhost'})
            port {int} -- port for mongodb (default: {27017})
            maxPoolSize {[type]} -- maxpoolsize for mongo db (default: {None})

        Raises:
            e:  error when database connection fails. These are unhandled connection
                and the application must stop immeditely in such cases
        """
        self.logger = logger
        try:
            self.logger.debug("Using Database name {}".format(database_name))
            self.logger.debug("Using address {}:{}".format(host, port))
            client = MongoClient(host, int(port), maxPoolSize=maxPoolSize)
            self.client = client
            db = client[
                database_name]  ## Create a new database if not existing
            ##
            ## Quirks of pymongo client , any error from this statement below
            ## leads to unsuported operation for database , where as intended
            ## strcuture is a collection. Should be addressed in the pymongo
            # BUG FIX: the log hard-coded "conferences" regardless of the
            # collection_name actually used.
            self.logger.debug(
                "Using Collection name {}".format(collection_name))
            collection = db[collection_name]
            # server_info() forces a round-trip so connection failures
            # surface here instead of on first use.
            client.server_info()
            self.logger.info(
                "Succefully created mongodb client connection on host:{} , port:{} "
                .format(host, port))
            self.db = db
            self.collection = collection
            # Create only the unique/plain indexes declared by Conference
            # that do not already exist on the collection.
            index_info = collection.index_information()
            possible_index = Conference.index()  #  -> [(string,bool)]
            possible_index = filter(lambda x: (x[0] + "_1") not in index_info,
                                    possible_index)
            for idx, unique in possible_index:
                collection.create_index([(idx, pymongo.ASCENDING)],
                                        unique=unique)

        except Exception as e:
            self.logger.error(
                "Failed to initiate mongodb client error: {}".format(e))
            raise e
Example #8
0
    def _parse_all_conference(self, link: str):
        """
        Parses individual conference page and,
        return Conference object

        Args:
        ---
        link: str

        Returns:
        ---
        Conference: object, or None when the page cannot be fetched or has
        no deadline
        """
        page = self.get_page(qlink=link,
                             debug_msg=f'Parsing {link}',
                             allow_redirects=True)
        try:
            content = page.content
        except Exception as e:
            self.logger.error(
                f"The following error occured while trying to parse {link} {e}"
            )
            # BUG FIX: without this return, execution fell through with
            # `content` unbound and crashed at bs() with a NameError.
            return None
        soup = bs(content, "html5lib")
        content_div = soup.find("div", attrs={"id": "content_box"})
        title = content_div.h1.text
        tables = soup.find_all('table')
        conf_info = self._get_all_conf_info(name=title, infotable=tables[0])
        bulk_text = self._get_all_conf_bulk(soup=soup)
        metadata = Metadata(
            __name__,
            dt.now(),
            link,
            self.base_address,
            self.scrapper_name,
        )
        additional_data = {}
        additional_data["bulkText"] = bulk_text
        additional_data["dateRange"] = conf_info.get("dateRange")
        additional_data["location"] = conf_info.get("location")
        deadline = conf_info.get("deadline")
        if not deadline:
            self.logger.debug(
                f"{title} not added because, deadline info not available")
            return None
        # Log success only when a Conference is actually returned (the old
        # placement logged "added" even for skipped entries).
        self.logger.debug(f"{title} is now added to database")
        return Conference(title=title,
                          url=conf_info.get("link"),
                          deadline=deadline,
                          metadata=metadata,
                          **additional_data)
Example #9
0
        def test_invalid_datetime(self):
            """Insert a conference whose deadline is in the past and expect
            put() to reject it (return None)."""
            self.assertDictEqual.__self__.maxDiff = None
            with warnings.catch_warnings():
                # Validation warnings from Conference are irrelevant here.
                warnings.simplefilter("ignore")
                conf = Conference(title="Something",
                                  url="www.something.com",
                                  deadline=datetime.now() - timedelta(days=10),
                                  metadata=Metadata(
                                      "something", datetime.now(),
                                      # BUG FIX: "\s" is an invalid escape
                                      # sequence (SyntaxWarning in 3.12+);
                                      # a raw string keeps the identical
                                      # runtime value without the warning.
                                      r"www.something.com\somthing.html",
                                      "www.something.com", "something"),
                                  dateRange=[
                                      datetime.now(),
                                      datetime.now() - timedelta(days=10)
                                  ],
                                  finalDue=datetime.now(),
                                  location="something",
                                  categories=["something"],
                                  bulkText="somthing")

                is_inserted = self.mongo_db.put(conf)
                self.assertEqual(None, is_inserted)
 def get_invalid_data(self):
     """Return a Conference fixture whose deadline and metadata are bogus
     plain strings (used to exercise validation failures)."""
     filler = "anything"
     return Conference(title="something",
                       url=filler,
                       deadline=filler,
                       metadata=filler)