Esempio n. 1
0
    def build_matcher(self, url, callback):
        """Build a robots matcher for ``url`` and pass it to ``callback``.

        Site-specific ``RobotRule`` records for the task's host are turned
        into ``(verdict, path, order)`` tuples, sorted by ``order`` and handed
        to ``RobotParser`` as extra rules.  The task is then run through
        ``self.fetch.process`` and ``self.store.process``; any fetched content
        is fed to the parser before the matcher is requested.

        This is a generator: it yields ``gen.Task`` objects and is presumably
        driven by a Tornado coroutine decorator applied outside this view
        (NOTE(review): confirm the decorator on the enclosing class).

        Args:
            url: Passed straight to ``Task``; judging by the comment below it
                is expected to be the site's robots.txt URL -- TODO confirm.
            callback: Called with the resulting matcher as its only argument.

        Raises:
            Exception: Anything raised by the ``RobotParser`` constructor is
                logged with its traceback and re-raised unchanged.
        """
        task = Task(url)

        # One (verdict, path, order) tuple per stored rule for this host.
        extra_rules = [
            ('allow' if rule.flag else 'deny', rule.path, rule.order)
            for rule in RobotRule.objects(site=task.url_host)
        ]
        # list.sort is stable, like sorted(): ties keep their query order.
        extra_rules.sort(key=lambda entry: entry[2])

        try:
            parser = RobotParser(useragent=self.settings.USER_AGENT, extra_rules=extra_rules)
        except Exception:
            # logging.exception logs at ERROR level and appends the traceback.
            # The previous call passed the exception as a %-argument without a
            # placeholder, which makes the logging module fail to format the
            # record instead of emitting the message.
            logging.exception("Exception building robot parser")
            # Bare raise keeps the original traceback intact.
            raise

        # The result is unpacked only to preserve the original 2-item
        # contract of the fetch step; neither value is used afterwards.
        _v, _t = yield gen.Task(self.fetch.process, task)

        # Save the robots.txt
        yield gen.Task(self.store.process, task)

        # Without fetched content the matcher relies on the extra rules alone.
        if task.content:
            parser.parse(task.content)

        matcher = parser.matcher(self.settings.ROBOT_NAME)

        callback(matcher)
Esempio n. 2
0
    def build_matcher(self, url, callback):
        """Build a robots matcher for ``url`` and pass it to ``callback``.

        Stored ``RobotRule`` records for the task's host become
        ``(verdict, path)`` tuples that are given to ``RobotParser`` as extra
        rules.  The task is run through ``self.fetch.process`` and then
        ``self.store.process``; if the task ends up with content, that content
        is parsed before the matcher for ``self.settings.ROBOT_NAME`` is
        requested.

        This is a generator yielding ``gen.Task`` objects; it is presumably
        driven by a Tornado coroutine decorator that is not visible here
        (NOTE(review): confirm).

        Args:
            url: Passed straight to ``Task``.
            callback: Called with the resulting matcher as its only argument.
        """
        robots_task = Task(url)

        # One (verdict, path) tuple per stored rule, in query order.
        site_rules = [
            ('allow' if entry.flag else 'deny', entry.path)
            for entry in RobotRule.objects(site=robots_task.url_host)
        ]

        robots_parser = RobotParser(
            useragent=self.settings.USER_AGENT,
            extra_rules=site_rules,
        )

        # The fetch result is unpacked into two values that are never read.
        _first, _second = yield gen.Task(self.fetch.process, robots_task)

        # Save the robots.txt
        yield gen.Task(self.store.process, robots_task)

        if robots_task.content:
            robots_parser.parse(robots_task.content)

        callback(robots_parser.matcher(self.settings.ROBOT_NAME))