Exemple #1
0
def collect(corpus, text, annis_server):
	"""Fetch the HTML visualizations of ``text`` and attach them to it.

	For every format in ``corpus.html_visualization_formats`` the page at
	``annis_server.url_html_visualization(...)`` is loaded in a Chrome
	WebDriver session. Once an element with class ``htmlvis`` is present,
	the body markup plus the page's ``<style>`` elements (with ``<script>``
	elements stripped) is saved as an ``HtmlVisualization`` and added to
	``text.html_visualizations``.

	Each format is attempted up to ``MAX_VIS_TRIES`` times; the browser is
	restarted after every failed attempt. Formats that cannot be fetched are
	logged and skipped.

	Args:
		corpus: object providing ``annis_corpus_name`` and
			``html_visualization_formats``.
		text: object providing ``title`` and ``html_visualizations``.
		annis_server: object providing ``url_html_visualization(corpus_name,
			text_title, format_slug)``.

	Returns:
		None. Returns early (after logging) if the browser cannot be started.
	"""
	corpus_name = corpus.annis_corpus_name
	formats = corpus.html_visualization_formats.all()
	logger.info('Fetching %d visualizations', len(formats))
	driver = None

	# try/finally so the browser process is released even when saving a
	# visualization (or anything else outside the retry loop) raises.
	try:
		for html_format in formats:
			html_vis_url = annis_server.url_html_visualization(corpus_name, text.title, html_format.slug)

			vis_tries_left = MAX_VIS_TRIES
			text_html = None

			while not text_html and vis_tries_left:
				try:
					if not driver:
						logger.debug("Starting browser")
						try:
							# NOTE(review): passing the driver path positionally is the
							# Selenium 3 calling convention - confirm the installed version.
							driver = webdriver.Chrome(os.environ.get('CHROMEDRIVER', '/usr/lib/chromium-browser/chromedriver'))
						except Exception as e:
							logger.error('Unable to start browser: %s', e)
							return
						logger.debug(driver)

					logger.info(html_format.title)
					retries_left = 5
					connection_accepted = False
					vis_fetch_start_time = time()
					while not connection_accepted and retries_left:
						try:
							driver.get(html_vis_url)
							connection_accepted = True
						except ConnectionRefusedError as cre:
							logger.warning(cre)
							retries_left -= 1
							# Only wait when another attempt will actually follow.
							if retries_left:
								sleep(15)

					if not connection_accepted:
						raise VisServerRefusingConn()

					logger.info('Calling WebDriverWait')
					WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, "htmlvis")))
					text_html = driver.find_element(By.XPATH, "/html/body").get_attribute("innerHTML")
					if not text_html:
						# Count an empty body as a failed try; otherwise the loop
						# would spin forever without consuming vis_tries_left.
						raise ValueError('Empty visualization body')
					logger.info(
						'WebDriverWait returned\t%s\t%s\t%s\t%d\t%d\t%f',
						corpus_name, text.title, html_format.slug, len(text_html),
						MAX_VIS_TRIES - vis_tries_left, time() - vis_fetch_start_time)
					driver.delete_all_cookies()

				except Exception as e:
					vis_tries_left -= 1
					# The browser is discarded below, so any HTML captured before
					# the failure can no longer be completed with the page styles.
					text_html = None
					logger.error('Error getting %s: %s', html_vis_url, e)
					if driver:
						# Diagnostics and shutdown are best-effort: a crashed browser
						# must not abort the remaining tries or formats.
						try:
							logger.error('Page source: %s', driver.page_source)
						except Exception as page_err:
							logger.error('Unable to read page source: %s', page_err)
						try:
							driver.quit()
						except Exception as quit_err:
							logger.error('Unable to quit browser: %s', quit_err)
						driver = None

			if not text_html:
				logger.error('Unable to get %s in %d tries.', html_vis_url, MAX_VIS_TRIES)
			else:
				# Add the styles
				text_html += "".join(
					"<style>" + style_elem.get_attribute("innerHTML") + "</style>"
					for style_elem in driver.find_elements(By.XPATH, "/html/head/style")
				)

				# Remove JavaScript elements. The match is non-greedy so that markup
				# sitting between two separate script elements is preserved.
				text_html = re.sub(r'<script\b.*?</script\s*>', '', text_html, flags=re.DOTALL | re.IGNORECASE)

				vis = HtmlVisualization()
				vis.visualization_format = html_format
				vis.html = text_html
				vis.save()

				text.html_visualizations.add(vis)

			self_max_mem, child_max_mem = [resource.getrusage(who).ru_maxrss for who in (resource.RUSAGE_SELF, resource.RUSAGE_CHILDREN)]
			logger.info('Max mem, self: {:,}, children: {:,}'.format(self_max_mem, child_max_mem))
	finally:
		if driver:
			driver.quit()