def get_valid_data(self):
    """Return a Conference populated with placeholder values that are
    expected to pass the model's validation."""
    meta = Metadata(__name__, datetime.now(),
                    "something.com/something",
                    "something.com", "anythingProd")
    return Conference(title="something",
                      url="anything",
                      deadline=datetime.now(),
                      metadata=meta)
Example #2
0
    def test_insert_datetime(self):
        """A conference with valid, future-dated fields should be inserted;
        ``mongo_db.put`` returns a non-None value on success."""
        # Show full diffs if a dict comparison fails anywhere in this test.
        self.assertDictEqual.__self__.maxDiff = None
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            conf = Conference(title="Something",
                              url="www.something.com",
                              deadline=datetime.now(),
                              metadata=Metadata("something", datetime.now(),
                                                "www.something.com",
                                                "www.something.com",
                                                "something"),
                              dateRange=[
                                  datetime.now(),
                                  datetime.now() + timedelta(days=10)
                              ],
                              finalDue=datetime.now(),
                              location="something",
                              categories=["something"],
                              bulkText="somthing")

            # Fix: the original built a boolean `flag` via `!= None` and then
            # asserted on it; assert the insert result directly instead.
            self.assertIsNotNone(self.mongo_db.put(conf))
    def parse_action(self):
        """Template action: build a Metadata/Conference pair and log.

        Fill in the actual page parsing (see the commented hints below)
        before enabling the ``push_todb`` call.
        """
        meta = Metadata(__name__,
                        datetime.datetime.now(),
                        website_url="somename.co.in/link",
                        domain_url="somename.co.in",
                        domain_name="somename",
                        extra="info you want to keep")

        data = Conference(title="",
                          url="",
                          deadline=datetime.datetime.now(),
                          metadata=meta)

        ## There are other optional fields also for conference
        ## check out the docstring
        ## Once done you can call dbaction

        ## Use the already provided method from Scrapper class like
        ## getDate , getPage etc.
        ## They are tested methods and have lesser chance of breaking your code.

        # self.getPage(" -- some page link --" , " -- some debug message --")
        #
        # PARSE DATA
        #
        # self.push_todb(data)

        self.logger.info(
            "Yay !! data was put into db hopefully !! Check error logs , i.e run with log level error"
        )
 def get_metadata(self):
     """Return a Metadata object built entirely from placeholder strings.

     NOTE(review): the date field is the literal string "date_extracted",
     not a datetime — presumably intentional for this fixture; confirm.
     """
     return Metadata("worker",
                     "date_extracted",
                     "website_url",
                     "domain_url",
                     "domain_name")
Example #5
0
    def _parse_top_conference(self, link: str) -> Conference:
        """
        Parses individual top conference page and,
        return Conference object

        Args:
        ---
        link: str

        Returns:
        ---
        Conference: object, or None when no deadline info is available

        Raises:
        ---
        PageParsingError: if the fetched page body cannot be read
        """
        page = self.get_page(qlink=link,
                             debug_msg=f'Parsing {link}',
                             allow_redirects=True)
        try:
            content = page.content
        except Exception as e:
            # Fix: the exception was constructed but never raised, which left
            # `content` unbound and crashed at the bs() call below.
            raise PageParsingError(
                f"The following error occured while parsing {link} Trace:{e}")
        soup = bs(content, "html5lib")
        post_div = soup.find("div", attrs={"class": "single_post"})
        post_tables = post_div.find_all("table")
        title = soup.h1.text
        conf_info = self._get_top_conf_info(name=title, table=post_tables[0])
        rating_info = self._get_top_conf_ranking(name=title,
                                                 table=post_tables[1])
        # First/last rating keys are dropped; middle keys act as categories.
        categories = list(rating_info.keys())[1:-1] if rating_info else []
        bulk_text = self._get_top_conf_bulk_text(soup)
        url = conf_info.get("link")
        deadline = conf_info.get("deadline")
        metadata = Metadata(
            __name__,
            dt.now(),
            link,
            self.base_address,
            self.scrapper_name,
        )
        additional_data = {}
        additional_data["dateRange"] = conf_info.get("dateRange")
        additional_data["location"] = conf_info.get("location")
        if not deadline:
            self.logger.debug(
                f"{title} not added, because, deadline info not available")
            return None
        else:
            # Fix: this log previously fired before the deadline check, so it
            # claimed success even for conferences that were skipped.
            self.logger.debug(f"{title} is now added to the database")
            return Conference(title=title,
                              url=url,
                              deadline=deadline,
                              metadata=metadata,
                              bulkText=bulk_text,
                              categories=categories,
                              rankings=rating_info,
                              **additional_data)
Example #6
0
    def _parse_all_conference(self, link: str):
        """
        Parses individual conference page and,
        return Conference object

        Args:
        ---
        link: str

        Returns:
        ---
        Conference: object, or None when the page cannot be read or no
        deadline info is available
        """
        page = self.get_page(qlink=link,
                             debug_msg=f'Parsing {link}',
                             allow_redirects=True)
        try:
            content = page.content
        except Exception as e:
            self.logger.error(
                f"The following error occured while trying to parse {link} {e}"
            )
            # Fix: the original fell through after logging, leaving `content`
            # unbound and raising NameError at the bs() call below.
            return None
        soup = bs(content, "html5lib")
        content_div = soup.find("div", attrs={"id": "content_box"})
        title = content_div.h1.text
        tables = soup.find_all('table')
        conf_info = self._get_all_conf_info(name=title, infotable=tables[0])
        bulk_text = self._get_all_conf_bulk(soup=soup)
        metadata = Metadata(
            __name__,
            dt.now(),
            link,
            self.base_address,
            self.scrapper_name,
        )
        additional_data = {}
        additional_data["bulkText"] = bulk_text
        additional_data["dateRange"] = conf_info.get("dateRange")
        additional_data["location"] = conf_info.get("location")
        deadline = conf_info.get("deadline")
        if not deadline:
            self.logger.debug(
                f"{title} not added because, deadline info not available")
            return None
        else:
            # Fix: this log previously fired before the deadline check, so it
            # claimed success even for conferences that were skipped.
            self.logger.debug(f"{title} is now added to database")
            return Conference(title=title,
                              url=conf_info.get("link"),
                              deadline=deadline,
                              metadata=metadata,
                              **additional_data)
Example #7
0
        def test_invalid_datetime(self):
            """A conference whose deadline is already in the past should be
            rejected: ``mongo_db.put`` returns None."""
            # Show full diffs if a dict comparison fails anywhere in this test.
            self.assertDictEqual.__self__.maxDiff = None
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                # NOTE(review): the raw `\s` in the URL below emits an
                # invalid-escape warning on modern Python; it is kept verbatim
                # since this fixture deliberately carries malformed data.
                conf = Conference(title="Something",
                                  url="www.something.com",
                                  deadline=datetime.now() - timedelta(days=10),
                                  metadata=Metadata(
                                      "something", datetime.now(),
                                      "www.something.com\somthing.html",
                                      "www.something.com", "something"),
                                  dateRange=[
                                      datetime.now(),
                                      datetime.now() - timedelta(days=10)
                                  ],
                                  finalDue=datetime.now(),
                                  location="something",
                                  categories=["something"],
                                  bulkText="somthing")

                # Fix: assertEqual(None, x) replaced with the dedicated
                # assertIsNone, which gives a clearer failure message.
                self.assertIsNone(self.mongo_db.put(conf))
Example #8
0
 def create_metadata(self, website_url, domain_url, domain_name, **kwargs):
     """Build a Metadata record stamped with this module's name and the
     current time; extra keyword arguments are forwarded untouched."""
     extracted_at = datetime.datetime.now()
     return Metadata(__name__, extracted_at, website_url,
                     domain_url, domain_name, **kwargs)
 def get_invalid_data(self):
     """Return a deliberately malformed Metadata record — the date field is
     the plain string "anything", presumably to exercise validation failure
     paths (confirm against the consuming test)."""
     return Metadata(
         __name__,
         "anything",
         "something.com/something",
         "something.com",
         "anythingProd",
     )
 def get_valid_data(self):
     """Return a well-formed Metadata record with a real extraction time."""
     return Metadata(
         __name__,
         datetime.now(),
         "something.com/something",
         "something.com",
         "anythingProd",
     )