Code example #1
    def __init__(self, bazgen=None, bufsize=None):
        super(LoggingCrawler, self).__init__(bazgen)
        Logging.__init__(self)

        if bufsize is None:
            sizearg = ()  # no argument: use LimitedBuffer's default capacity
        else:
            sizearg = (bufsize,)
        self.visited_pages = LimitedBuffer(*sizearg)
        self.crawled_concordances = LimitedBuffer(*sizearg)

        # Wrap the parent's hooks so every call is logged; the raw
        # (unwrapped) callables are kept for delegation.
        self._raw_filter_link = self.filter_link
        self.filter_link = self.filter_link_logwrapper

        self._raw_language_filter = self.visitor.language_filter
        self.visitor.language_filter = self.language_filter_log_wrapper

        self._raw_norm_encoding = self.visitor.norm_encoding
        self.visitor.norm_encoding = self.norm_encoding
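
The constructor only assumes that LimitedBuffer takes an optional capacity and exposes insert() and contains(). The real class is not shown here; a minimal hypothetical sketch matching that interface could look like this (the bounded FIFO semantics are an assumption, not the project's actual implementation):

from collections import deque

class LimitedBuffer:
    """Hypothetical sketch of the buffer interface used above:
    a FIFO of bounded capacity with membership testing."""

    def __init__(self, maxsize=None):
        # maxlen=None lets the buffer grow without limit
        self._items = deque(maxlen=maxsize)

    def insert(self, item):
        self._items.append(item)  # silently evicts the oldest item when full

    def contains(self, item):
        return item in self._items  # linear scan, sufficient for a sketch

Evicting old entries keeps memory bounded during long crawls, at the cost of occasionally re-visiting a page whose record has already been evicted.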
Code example #2
import re

# LimitedBuffer, is_row, extract_value and create_formatter are
# provided by the surrounding project.

def load_from_corpus(filename, format):
    """Reads an existing corpus file and returns the state needed to
    extend it: already crawled links and concordances, the highest id
    used so far, and a formatter writing to the same file."""
    assert format in ("xml", "json"), "unknown format"
    maxid = 0
    links = LimitedBuffer()
    concordances = LimitedBuffer()
    with open(filename, "r") as f:
        for line in f:
            if is_row('id', line):
                m = re.search(r"\d+", line)
                if m:  # guard against an id row without digits
                    maxid = max(int(m.group()), maxid)
            elif is_row('concordance', line):
                concordance = extract_value('concordance', line, format)
                concordances.insert(concordance)
            elif is_row('url', line):
                url = extract_value('url', line, format)
                links.insert(url)

    # an empty corpus (no ids found) is written from scratch,
    # otherwise new records are appended
    mode = "w" if maxid == 0 else "a"
    outstream = open(filename, mode)
    of = create_formatter(
        format=format,
        output_stream=outstream,
        extending=maxid != 0,
    )
    return {
        'links': links,
        'concordances': concordances,
        'maxid': maxid,
        'output': outstream,
        'formatter': of,
    }
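
A hypothetical call site, showing how the returned state might be fed back into the crawler from code example #3 (the file name and buffer size are made up for illustration):

state = load_from_corpus("corpus.xml", "xml")

crawler = LoggingCrawler(bufsize=10000)
crawler.visited_pages = state["links"]                # skip already visited URLs
crawler.crawled_concordances = state["concordances"]  # skip repeated concordances
crawler.num_concordances = state["maxid"]             # continue id numbering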
Code example #3
import re
import requests
from traceback import format_exc

# ConcordanceCrawler, Logging, LimitedBuffer and the exception classes
# SERPError, UrlRequestException and VisitTooLongException are provided
# by the surrounding project.

class LoggingCrawler(ConcordanceCrawler, Logging):
    """Crawls concordances, logs progress and statistics, and handles
    errors and exceptions.

    Most of its methods simply call the parent's methods and add logging
    and error handling.
    """

    # add allow_logging to configurable attributes
    attributes = ConcordanceCrawler.attributes + ["allow_logging"]

    # recommended logging format; it prepends a timestamp to each log message
    log_format = "%(asctime)-15s %(levelname)s: %(message)s"

    def __init__(self, bazgen=None, bufsize=None):
        super(LoggingCrawler, self).__init__(bazgen)
        Logging.__init__(self)

        if bufsize is None:
            sizearg = ()  # no argument: use LimitedBuffer's default capacity
        else:
            sizearg = (bufsize,)
        self.visited_pages = LimitedBuffer(*sizearg)
        self.crawled_concordances = LimitedBuffer(*sizearg)

        # Wrap the parent's hooks so every call is logged; the raw
        # (unwrapped) callables are kept for delegation.
        self._raw_filter_link = self.filter_link
        self.filter_link = self.filter_link_logwrapper

        self._raw_language_filter = self.visitor.language_filter
        self.visitor.language_filter = self.language_filter_log_wrapper

        self._raw_norm_encoding = self.visitor.norm_encoding
        self.visitor.norm_encoding = self.norm_encoding

    def norm_encoding(self, document, headers):
        res = self._raw_norm_encoding(document, headers)
        if not res:
            self.Logger.debug("page rejected by encoding filter")
            self.page_enc_filtered += 1
        return res

    def language_filter_log_wrapper(self, text):
        res = self._raw_language_filter(text)
        """
#		for manual testing of accuracy of language filter
#		import re
#		text = re.sub(r"\n+",r"\n",text,flags=re.M)
#		import random
#		f = open(("yes" if res else "no") + "/text"+str(random.random()),"w")
#		f.write(text)
#		f.close()
		"""
        if res:
            return True
        self.Logger.debug("page rejected by language filter")
        self.page_lan_filtered += 1
        return False

    def modify_concordance(self, con):
        """Adds an id to every concordance and normalizes whitespace:
        newlines become spaces and runs of whitespace collapse to one space.
        """
        con["id"] = self.num_concordances
        c = con["concordance"].strip()
        c = re.sub(r"\s", " ", c, flags=re.UNICODE | re.MULTILINE)
        c = re.sub(r" +", " ", c)
        con["concordance"] = c
        return con

    def yield_concordances(self, words):
        for con in super(LoggingCrawler, self).yield_concordances(words):
            con = self.modify_concordance(con)
            yield con

    def filter_link_logwrapper(self, link):
        res = self._raw_filter_link(link)
        if not res:
            self.Logger.debug("link {0} rejected because of format suffix".format(link))
            self.links_filtered += 1
            self.log_state()
            return False
        if self.visited_pages.contains(link):
            self.Logger.debug("link {0} rejected because it has already been visited".format(link))
            self.links_filtered_rep += 1
            self.log_state()
            return False
        return True

    def get_links(self, *args, **kwargs):
        links = []
        try:
            links = super(LoggingCrawler, self).get_links(*args, **kwargs)
        except SERPError:
            self.Logger.error("SERP error")
            self.serp_errors += 1
            self.log_state()
        else:
            self.num_serps += 1
            self.log_details("crawled SERP, parsed {0} links".format(len(links)))
            self.links_crawled += len(links)
            self.log_state()

        self.stopping_criterion()
        return links

    def _yield_concordances_from_link(self, link, words):
        self.Logger.debug("trying to download {0}".format(link))
        for c in super(LoggingCrawler, self)._yield_concordances_from_link(link, words):
            # repeatedly crawled concordances are filtered here
            if not self.crawled_concordances.contains(c["concordance"]):
                self.crawled_concordances.insert(c["concordance"])
                self.num_concordances += 1
                self.log_state()
                yield c
            else:
                self.log_details("following concordance crawled repeatedly {0}".format(c))
                self.repeated_concordances += 1
                self.log_state()

    def concordances_from_link(self, link, words):
        try:
            concordances = super(LoggingCrawler, self).concordances_from_link(link, words)
        except (requests.exceptions.RequestException, UrlRequestException) as e:
            self.Logger.error("'{}' occured during getting {}".format(e, link))
            self.page_errors += 1
        except VisitTooLongException:
            self.Logger.error("processing of '{}' took too long".format(link))
            self.page_errors += 1
        except KeyboardInterrupt:  # terminate whole application
            raise
        except Exception:
            self.Logger.error("!!! Unknown error occurred, {0}".format(format_exc()))
            self.page_errors += 1
        else:
            self.visited_pages.insert(link)
            self.log_details("page {0} visited, {1} concordances found".format(link, len(concordances)))
            self.num_pages += 1
            self.log_state()
            return concordances
        # on any handled error behave as if the page yielded no concordances
        return []

    def stopping_criterion(self):
        """If crawling looks unpromising or the search engine keeps
        failing, set self.crawling_allowed to False here and the crawler
        will be aborted soon.
        """
        if self.serp_errors > 10:
            self.Logger.critical("aborting crawler due to high number of serp errors")
            self.crawling_allowed = False
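
Finally, a minimal usage sketch of the class above. It assumes the Logging mixin builds on the standard logging module and that yield_concordances is the public entry point; both are assumptions based on the code shown here, not a documented API:

import logging

# use the recommended format from the class attribute
logging.basicConfig(format=LoggingCrawler.log_format, level=logging.DEBUG)

crawler = LoggingCrawler(bufsize=50000)
for con in crawler.yield_concordances(["example", "search", "words"]):
    print(con["id"], con["concordance"])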