Beispiel #1
0
 def go_to_next_page(self, html_element, parent, predicted):
     """
     Execute the action of going to the next page.

     :param html_element: BeautifulSoup element leading to the next page —
         either a wrapper whose first ``<a>`` holds the link, or the link
         element itself.
     :param parent: parent entity propagated via the request ``meta``.
     :param predicted: class predicted for ``html_element`` by the rule
         provider.
     :return: generator yielding at most one ``scrapy.Request``.
     """
     if predicted == self.rule_provider.get_mapping(m.next_page):
         # The link is wrapped: unwrap the first <a> inside the element.
         try:
             first_a_html_element_inside_whole = html_element.findAll(
                 "a")[0]
             link = first_a_html_element_inside_whole['href']
             self.logger_dbg.info("Going to next page: " + str(parent) +
                                  " unwrapped url: " + link)
             yield scrapy.Request(url=build_link(self.base_domain, link),
                                  callback=self.parse,
                                  meta={'parent': parent})
         except Exception as e:
             # Was ``BaseException``; narrowed so KeyboardInterrupt and
             # SystemExit are no longer swallowed. Pagination is best
             # effort: log the failure and carry on with the crawl.
             self.logger_dbg.error("Couldn't go to next page of: " +
                                   str(parent) + " due to: " + str(e))
             self.logger_dbg.error("Element that caused the problem: " +
                                   str(html_element))
     elif predicted == self.rule_provider.get_mapping(m.next_page_link):
         # The element itself is the link to the next page.
         self.logger_dbg.info("Going to next page: " + str(parent) +
                              " url: " + html_element['href'])
         yield scrapy.Request(url=build_link(self.base_domain,
                                             html_element['href']),
                              callback=self.parse,
                              meta={'parent': parent})
Beispiel #2
0
    def test_given_domain_and_link_when_building_link_proper_build_invision(self):
        """An already-absolute link on the same domain is returned unchanged."""
        base = 'http://www.uk420.com/boards/'
        absolute_link = 'http://www.uk420.com/boards/index.php?/forum/103-outdoor-growing/'

        built = html_util.build_link(base, absolute_link)

        self.assertEqual(built, absolute_link)
Beispiel #3
0
    def test_given_url_from_without_domain_when_checking_true(self):
        """
        Building a link from a domain-less url yields a truthy, full url.

        NOTE(review): the original only asserted ``assertTrue(res)``, which
        any non-empty string satisfies. An exact-value assertion is added,
        matching the vbulletin test that uses the same inputs.
        """
        domain = 'https://www.forum.haszysz.com/'
        link = 'forumdisplay.php?97-Hodowla'

        res = html_util.build_link(domain, link)

        self.assertTrue(res)
        self.assertEqual(res, domain + link)
Beispiel #4
0
    def test_given_domain_and_link_when_building_link_proper_build_vbulletin(self):
        """A relative vbulletin link is appended to the domain."""
        base = 'https://www.forum.haszysz.com/'
        relative_link = 'forumdisplay.php?97-Hodowla'

        built = html_util.build_link(base, relative_link)

        self.assertEqual(built, base + relative_link)
Beispiel #5
0
    def test_given_domain_and_link_when_building_link_proper_build_phpbb(self):
        """A relative phpbb link (with ``./`` prefix) is appended to the domain."""
        base = 'https://forum.vwgolf.pl/'
        relative_link = './viewforum.php?f=157&sid=339596b98a9c27072f8ed07d68be22cd'

        built = html_util.build_link(base, relative_link)

        self.assertEqual(built, base + relative_link)
Beispiel #6
0
    def parse_categories(self, html_element, predicted, parent):
        """
        Execute the action of parsing the categories.

        Saves the category found in ``html_element`` and, when its link
        stays on the crawled domain, yields a request to parse it.

        :param html_element: BeautifulSoup element holding the category.
        :param predicted: class predicted for ``html_element`` by the rule
            provider.
        :param parent: parent entity the category is attached to.
        :return: generator yielding at most one ``scrapy.Request``.
        """
        category = None
        # Title found: the element itself is the category link.
        if predicted == self.rule_provider.get_mapping(m.category_title):
            link = html_element['href']
            title = str(html_element.contents[0])
            category = self.repository.save_category(title, link, parent,
                                                     self.forum)
            self.logger_dbg.info(title + " " + self.base_domain + link)
        # Unwrapping needed: the link is the first <a> inside the element.
        # (``elif``: an element cannot match both mappings at once.)
        elif predicted == self.rule_provider.get_mapping(m.category_whole):
            try:
                first_a_html_element_inside_whole = html_element.findAll(
                    "a")[0]
                link = first_a_html_element_inside_whole['href']
                title = str(first_a_html_element_inside_whole.contents[0])
                category = self.repository.save_category(
                    title, link, parent, self.forum)
                self.logger_dbg.info(title + " " + self.base_domain + link)
            except Exception as e:
                # Was ``BaseException``; narrowed so KeyboardInterrupt and
                # SystemExit are no longer swallowed.
                self.logger_dbg.error(str(e))
                self.logger_dbg.error("Can't find category inside: " +
                                      str(html_element))

        if category is not None and html_util.url_not_from_other_domain(
                category.link, self.base_domain):
            yield scrapy.Request(url=build_link(self.base_domain,
                                                category.link),
                                 callback=self.parse,
                                 meta={'parent': category})
Beispiel #7
0
 def prepare_strategy(self, spider):
     """
     Read all the categories to scrap from config file and yield one
     request per matching category from the repository.
     """
     wanted_ids = set(
         pd.read_csv("config/categories.csv", sep=';')['category_id'])
     categories = self.repository.get_categories(wanted_ids)
     base_link = self.forum.link
     self.strategy_initialized = True
     for category in categories:
         yield scrapy.Request(url=build_link(base_link, category.link),
                              callback=spider.parse,
                              meta={'parent': category})
Beispiel #8
0
    def parse_topics(self, html_element, parent):
        """
        Extract a topic (title, link, author, date) from ``html_element``,
        persist it and yield a request to parse its page.

        :param html_element: BeautifulSoup element holding one topic row.
        :param parent: parent entity the topic is attached to.
        :return: generator yielding at most one ``scrapy.Request``; yields
            nothing when no topic is found or it fails the filter criterion.
        """
        author = None
        date = None
        link = None
        title = None
        for tag in self.rule_provider.possible_tags_topics:
            elements_inside_tag = html_element.findAll(tag)
            for elem in elements_inside_tag:
                if html_util.element_has_css_class(elem):
                    predicted = self.rule_provider.predict(tag, elem["class"])
                    # The predicted mappings are mutually exclusive, so an
                    # elif chain avoids redundant comparisons.
                    if predicted == self.rule_provider.get_mapping(
                            m.topic_title):
                        title = elem.contents[0]
                        link = elem['href']
                        self.logger_dbg.info(title + " " + link)
                    elif predicted == self.rule_provider.get_mapping(m.author):
                        author = elem.contents[0]
                    elif predicted == self.rule_provider.get_mapping(
                            m.topic_date):
                        date = dpt.parse_date(elem.contents)
        # Additional check for english-speaking invision boards, which mark
        # the topic date with a <time> tag.
        time_tags = html_element.findAll("time")
        if time_tags:
            date = dpt.parse_english_date(time_tags[0].contents)
            # Look the first anchor up once instead of twice.
            first_anchor = html_element.findAll('a')[0]
            link = first_anchor['href']
            title = first_anchor['title']

        if title is None or link is None:
            self.logger_dbg.info("Can't find topic inside: " +
                                 str(html_element))
            return

        if not filtering.topic_meets_criterion(title, author, date):
            return
        topic = self.repository.save_topic(author, date, link, parent, title)
        self.logger_dbg.info("Scrapped topic: " + str(topic))
        yield scrapy.Request(dont_filter=True,
                             url=build_link(self.base_domain, topic.link),
                             callback=self.parse,
                             meta={'parent': topic})