Exemple #1
0
def ingest_expired_text(text_id):
    """Re-ingest a single expired text from ANNIS.

    Looks up the first configured ``AnnisServer`` and the ``Text`` with the
    given id, replaces the text's metadata with the ``annotation`` elements
    returned by the server's document-metadata URL, then re-renders every
    HTML visualization format of the text's corpus in a Firefox instance
    running inside an Xvfb virtual display. ``<script>`` elements are removed
    from the captured markup and the page's ``<style>`` elements are appended
    to it. Finally ``is_expired`` is cleared and the text is saved.

    Args:
        text_id: Primary key of the ``Text`` to re-ingest.

    Returns:
        The id of the text on success, or ``False`` when no ANNIS server is
        configured or no text with ``text_id`` exists.
    """
    from texts.models import Text, TextMeta, HtmlVisualization
    from annis.models import AnnisServer

    # Set up an instance of the logger
    logger = logging.getLogger(__name__)

    # Validate everything *before* starting Xvfb/Firefox so that the early
    # returns below cannot leak a display or a browser process.
    annis_servers = AnnisServer.objects.all()[:1]
    if len(annis_servers) == 0:
        logger.error("Error with single text re-ingest, no ANNIS server found")
        return False
    annis_server = annis_servers[0]

    # Ensure trailing slash in the annis server base domain
    if not annis_server.base_domain.endswith("/"):
        annis_server.base_domain += "/"

    # Get the text
    try:
        text = Text.objects.get(id=text_id)
    except Text.DoesNotExist:
        # The id is passed as a lazy %-argument; without the placeholder the
        # logging module fails to format the record.
        logger.error("Error with single text re-ingest, no text found for text id %s", text_id)
        return False

    # Log the import information
    logger.info(" -- Importing " + text.corpus.title + " " + text.title + " " + str(text.id))

    corpus_name = text.corpus.annis_corpus_name

    # Query ANNIS for the metadata for the document
    meta_query_url = annis_server.base_domain + annis_server.document_metadata_url.replace(
        ":corpus_name", corpus_name).replace(":document_name", text.title)
    res = request.urlopen(meta_query_url)
    try:
        xml = res.read()
    finally:
        res.close()
    # NOTE(review): no parser is named, so BeautifulSoup picks whichever is
    # installed; an HTML parser may lower-case tag names, which could affect
    # the "corpusName" lookup below -- confirm.
    soup = BeautifulSoup(xml)
    meta_items = soup.find_all("annotation")

    # Remove old textmeta items
    # And save each meta item for the text document
    text.text_meta.all().delete()
    for meta_item in meta_items:
        text_meta = TextMeta()
        text_meta.name = meta_item.find("name").text
        text_meta.value = meta_item.find("value").text
        text_meta.pre = meta_item.find("pre").text
        meta_corpus_name = meta_item.find("corpusName")
        if meta_corpus_name:
            text_meta.corpus_name = meta_corpus_name.text
        text_meta.save()
        text.text_meta.add(text_meta)

    # Non-greedy so that each <script>...</script> pair is removed on its own;
    # a greedy match would also delete all markup between the first and the
    # last script element.
    script_pattern = re.compile(r'<script\b.*?</script>', re.DOTALL | re.IGNORECASE)

    vdisplay = Xvfb()
    vdisplay.start()
    driver = None
    try:
        driver = webdriver.Firefox()

        # Remove the html visualizations from the text
        # Query ANNIS for each HTML format of the documents
        text.html_visualizations.all().delete()
        for html_format in text.corpus.html_visualization_formats.all():

            # Add the corpus name, document name and format to the URL
            corpora_url = annis_server.base_domain + annis_server.html_visualization_url.replace(
                ":corpus_name", corpus_name).replace(
                ":document_name", text.title).replace(
                ":html_visualization_format", html_format.slug)

            # Fetch the HTML for the corpus/document/html_format from ANNIS
            driver.get(corpora_url)

            # Wait for visualization to load in browser
            try:
                WebDriverWait(driver, 20).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "htmlvis"))
                )
                driver.delete_all_cookies()

                body = driver.find_element_by_xpath("/html/body")
                text_html = body.get_attribute("innerHTML")
                styles = driver.find_elements_by_xpath("/html/head/style")

            except Exception:
                logger.exception(" -- Error rendering " + corpora_url)
                text_html = ""
                styles = []
                # Restart the browser. Drop our reference first so the outer
                # ``finally`` never calls quit() twice on the same driver if
                # the restart itself fails.
                stale_driver, driver = driver, None
                stale_driver.quit()
                driver = webdriver.Firefox()

            # Check to ensure there's html returned
            if "Client response status: 403" in text_html:
                logger.error(" -- Error fetching " + corpora_url)
                text_html = ""

            if text_html:
                # Add the styles
                for style_elem in styles:
                    style_css = style_elem.get_attribute("innerHTML")
                    text_html = text_html + "<style>" + style_css + "</style>"

                # Remove Javascript from the content
                text_html = script_pattern.sub("", text_html)

            # Create the new html_visualization (stored even when empty)
            html_visualization = HtmlVisualization()
            html_visualization.visualization_format = html_format
            html_visualization.html = text_html
            html_visualization.save()

            # Add the html visualization to the text
            text.html_visualizations.add(html_visualization)

        # Save the updated text
        text.is_expired = False
        text.save()
    finally:
        # Always release the browser and the virtual display, even when an
        # exception propagates out of the rendering loop.
        try:
            if driver is not None:
                driver.quit()
        finally:
            vdisplay.stop()

    return text.id
Exemple #2
0
def collect(corpus, text, annis_server):
	"""Fetch and store every HTML visualization of ``text`` for ``corpus``.

	For each format in ``corpus.html_visualization_formats`` the URL from
	``annis_server.url_html_visualization`` is loaded in a Chrome browser
	(started lazily, restarted after any failure) and the ``<body>`` markup
	is captured once an element with class ``htmlvis`` is present. Up to
	``MAX_VIS_TRIES`` attempts are made per format. On success the page's
	``<style>`` elements are appended, ``<script>`` elements are removed, and
	the result is saved as an ``HtmlVisualization`` attached to ``text``.
	Formats that could not be fetched are logged and skipped.

	Args:
		corpus: Object providing ``annis_corpus_name`` and
			``html_visualization_formats``.
		text: Object providing ``title`` and ``html_visualizations``.
		annis_server: Object providing ``url_html_visualization``.

	Returns:
		None. Returns early (after logging) if the browser cannot be started.
	"""
	corpus_name = corpus.annis_corpus_name
	formats = corpus.html_visualization_formats.all()
	logger.info('Fetching %d visualizations' % len(formats))
	driver = None

	# try/finally guarantees the browser is shut down even when an exception
	# (e.g. from saving the visualization) escapes the loop.
	try:
		for html_format in formats:
			html_vis_url = annis_server.url_html_visualization(corpus_name, text.title, html_format.slug)

			vis_tries_left = MAX_VIS_TRIES
			text_html = False

			while not text_html and vis_tries_left:
				try:
					if not driver:
						logger.debug("Starting browser")
						try:
							driver = webdriver.Chrome(os.environ.get('CHROMEDRIVER', '/usr/lib/chromium-browser/chromedriver'))
						except Exception as e:
							logger.error('Unable to start browser: %s' % e)
							return
						logger.debug(driver)

					logger.info(html_format.title)
					retries_left = 5
					connection_accepted = False
					vis_fetch_start_time = time()
					while not connection_accepted and retries_left:
						try:
							driver.get(html_vis_url)
							connection_accepted = True
						except ConnectionRefusedError as cre:
							logger.warning(cre)
							retries_left -= 1
							sleep(15)

					if not connection_accepted:
						raise VisServerRefusingConn()

					logger.info('Calling WebDriverWait')
					WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, "htmlvis")))
					text_html = driver.find_element_by_xpath("/html/body").get_attribute("innerHTML")
					logger.info('WebDriverWait returned\t%s\t%s\t%s\t%d\t%d\t%f' % (
						corpus_name, text.title, html_format.slug, len(text_html),
						MAX_VIS_TRIES - vis_tries_left, time() - vis_fetch_start_time))
					driver.delete_all_cookies()

				except Exception as e:
					vis_tries_left -= 1
					# Discard any markup captured before the failure: the
					# driver is torn down below, so the style lookup after
					# the loop would otherwise run against ``None``.
					text_html = False
					logger.error('Error getting %s: %s' % (html_vis_url, e))
					# Reading the page source or quitting can itself fail
					# when the browser has died; that must not abort the
					# retry loop or leave a stale driver behind.
					try:
						logger.error('Page source: ' + driver.page_source)
					except Exception as page_err:
						logger.error('Unable to read page source: %s' % page_err)
					stale_driver, driver = driver, None
					try:
						stale_driver.quit()
					except Exception as quit_err:
						logger.warning('Error quitting browser: %s' % quit_err)

			if not text_html:
				logger.error('Unable to get %s in %d tries.' % (html_vis_url, MAX_VIS_TRIES))
			else:
				# Add the styles
				for style_elem in driver.find_elements_by_xpath("/html/head/style"):
					text_html += "<style>" + style_elem.get_attribute("innerHTML") + "</style>"

				# Remove JavaScript elements. Non-greedy so each
				# <script>...</script> pair is removed on its own instead of
				# everything between the first and the last script element.
				text_html = re.sub(r'<script\b.*?</script>', "", text_html, flags=re.DOTALL | re.IGNORECASE)

				vis = HtmlVisualization()
				vis.visualization_format = html_format
				vis.html = text_html
				vis.save()

				text.html_visualizations.add(vis)

			# Log peak resident memory of this process and of its children
			self_max_mem, child_max_mem = [resource.getrusage(who).ru_maxrss for who in (resource.RUSAGE_SELF, resource.RUSAGE_CHILDREN)]
			logger.info('Max mem, self: {:,}, children: {:,}'.format(self_max_mem, child_max_mem))
	finally:
		if driver:
			driver.quit()
Exemple #3
0
def collect(corpus, text, annis_server, driver):
	"""Fetch and store every HTML visualization of ``text`` using ``driver``.

	For each format in ``corpus.html_visualization_formats`` the URL from
	``annis_server.url_html_visualization`` is loaded in the supplied
	browser, retrying up to five times (15 s apart) while the connection is
	refused. Once an element with class ``htmlvis`` is present (waiting up
	to two minutes) the ``<body>`` markup is captured, the page's ``<style>``
	elements are appended and ``<script>`` elements are removed. An
	``HtmlVisualization`` is saved and attached to ``text`` for every
	format; its ``html`` is empty when fetching failed or the page reported
	"Client response status: 403".

	Args:
		corpus: Object providing ``annis_corpus_name`` and
			``html_visualization_formats``.
		text: Object providing ``title`` and ``html_visualizations``.
		annis_server: Object providing ``url_html_visualization``.
		driver: Selenium WebDriver owned by the caller; it is not quit here.

	Returns:
		None.
	"""
	corpus_name = corpus.annis_corpus_name
	formats = corpus.html_visualization_formats.all()
	logger.info('Fetching %d visualizations' % len(formats))

	for html_format in formats:
		html_vis_url = annis_server.url_html_visualization(corpus_name, text.title, html_format.slug)

		try:
			logger.info(html_format.title)
			retries_left = 5
			connection_accepted = False
			while not connection_accepted and retries_left:
				try:
					driver.get(html_vis_url)
					connection_accepted = True
				except ConnectionRefusedError as cre:
					# Do NOT close the driver here: closing the window (or
					# failing to, since the connection is being refused)
					# would make every following retry fail as well.
					logger.warning(cre)
					retries_left -= 1
					sleep(15)

			if not connection_accepted:
				raise VisServerRefusingConn()

			WebDriverWait(driver, 60 * 2).until(EC.presence_of_element_located((By.CLASS_NAME, "htmlvis")))
			driver.delete_all_cookies()

			body = driver.find_element_by_xpath("/html/body")
			text_html = body.get_attribute("innerHTML")
			styles = driver.find_elements_by_xpath("/html/head/style")
		except Exception as e:
			logger.error('Error getting %s: %s' % (html_vis_url, e))
			# Reading the page source can itself fail when the browser is
			# unreachable; that must not abort the remaining formats.
			try:
				logger.error('Page source: ' + driver.page_source)
			except Exception as page_err:
				logger.error('Unable to read page source: %s' % page_err)
			text_html = ""
			styles = []

		if "Client response status: 403" in text_html:
			logger.error(" -- Error fetching " + html_vis_url)
			text_html = ""

		if text_html:
			# Add the styles
			for style_elem in styles:
				style_css = style_elem.get_attribute("innerHTML")
				text_html = text_html + "<style>" + style_css + "</style>"

			# Remove script elements. Non-greedy so each <script>...</script>
			# pair is removed on its own instead of everything between the
			# first and the last script element.
			text_html = re.sub(r'<script\b.*?</script>', "", text_html, flags=re.DOTALL | re.IGNORECASE)

		# A visualization record is stored even when no markup was obtained
		vis = HtmlVisualization()
		vis.visualization_format = html_format
		vis.html = text_html
		vis.save()

		text.html_visualizations.add(vis)

		# Log peak resident memory of this process and of its children
		self_max_mem, child_max_mem = [resource.getrusage(who).ru_maxrss for who in (resource.RUSAGE_SELF, resource.RUSAGE_CHILDREN)]
		logger.info('Max mem, self: {:,}, children: {:,}'.format(self_max_mem, child_max_mem))