def get_paper(self, paper_url=None, paper_id=None):
    """
    Load paper details for the paper given by detail page URL or numeric ID.
    """
    # NOTE: the detail URL is always rebuilt from paper_id; a passed-in paper_url is ignored.
    paper_url = '%svo020.asp?VOLFDNR=%s' % (self.config.BASE_URL, paper_id)
    logging.info("Getting paper %d from %s", paper_id, paper_url)

    # Simple retry loop, because AllRis sometimes omits the opening '<' of tags
    # on the first request.
    try_counter = 0
    while True:
        try:
            response = self.get_url(paper_url)
            if not response:
                return
            if "noauth" in response.url:
                logging.warning("Paper %s in %s seems to be private", paper_id, paper_url)
                return
            text = self.preprocess_text(response.text)
            doc = html.fromstring(text)
            data = {}

            # Check the Beratungsfolge (consultation sequence) table.
            table = self.table_css(doc)[0]  # let's hope this table is always present
            self.consultation_list_start = False
            last_headline = ''
            for line in table:
                if line.tag == 'tr':
                    headline = line[0].text
                elif line.tag == 'td':
                    headline = line.text
                else:
                    logging.error("Serious error in data table. Unable to parse.")
                if headline:
                    headline = headline.split(":")[0].lower()
                    if headline[-1] == ":":
                        headline = headline[:-1]
                    if headline == "betreff":
                        value = line[1].text_content().strip()
                        # An HTML comment containing a script tag precedes the text; remove it.
                        value = value.split("-->")[1]
                        # Collapse all runs of whitespace into single spaces.
                        data[headline] = " ".join(value.split())
                    elif headline in ['verfasser', u'federführend', 'drucksache-art']:
                        data[headline] = line[1].text.strip()
                    elif headline in ['status']:
                        data[headline] = line[1].text.strip()
                        # Related papers.
                        if len(line) > 2:
                            if len(line[3]):
                                # Grab the identifier; unclear whether anything else
                                # can appear at this position (would break parsing).
                                related_link = line[3][0][0][1][0]
                                data['paper'] = [{'paper': Paper(
                                    numeric_id=related_link.get('href').split('=')[1].split('&')[0],
                                    identifier=related_link.text)}]
                    elif headline == "beratungsfolge":
                        # The actual list is in the next row inside a table, so we only set a marker.
                        data = self.parse_consultation_list_headline(line, data)  # for parsers which have the consultation list here
                    elif self.consultation_list_start:
                        data = self.parse_consultation_list(line, data)  # for parsers which have the consultation list in the next tr
                        self.consultation_list_start = False  # reset the marker, we have read the list
                last_headline = headline
                # We simply ignore the rest (there might not be much more anyway).

            # The actual text comes after the table in a div, but it is not valid
            # XML or HTML, so we extract it with a regex.
            data['docs'] = self.body_re.findall(response.text)
            first_date = False
            for single_consultation in data['consultation']:
                if first_date:
                    if single_consultation['date'] < first_date:
                        first_date = single_consultation['date']
                else:
                    first_date = single_consultation['date']

            paper = Paper(numeric_id=paper_id)
            paper.original_url = paper_url
            paper.title = data['betreff']
            paper.description = data['docs']
            paper.type = data['drucksache-art']
            if first_date:
                paper.date = first_date.strftime("%Y-%m-%d")
            if 'identifier' in data:
                paper.identifier = data['identifier']

            paper.document = []
            # Attachments, step 1: the Drucksache itself.
            document_1 = self.attachment_1_css(doc)
            if len(document_1):
                if document_1[0].value:
                    href = '%sdo027.asp' % self.config.BASE_URL
                    identifier = document_1[0].value
                    title = 'Drucksache'
                    document = Document(
                        identifier=identifier,
                        numeric_id=int(identifier),
                        title=title)
                    document = self.get_document_file(document, href, True)
                    paper.document.append({'document': document, 'relation': 'misc'})

            # Attachments, step 2: additional attachments.
            documents = self.attachments_css(doc)
            if len(documents) > 0:
                if len(documents[0]) > 1:
                    if documents[0][1][0].text.strip() == "Anlagen:":
                        for tr in documents[0][2:]:
                            link = tr[0][0]
                            href = "%s%s" % (self.config.BASE_URL, link.attrib["href"])
                            title = link.text
                            identifier = str(int(link.attrib["href"].split('/')[4]))
                            document = Document(
                                identifier=identifier,
                                numeric_id=int(identifier),
                                title=title)
                            document = self.get_document_file(document, href)
                            paper.document.append({'document': document, 'relation': 'misc'})
            oid = self.db.save_paper(paper)
            return
        except (KeyError, IndexError):
            if try_counter < 3:
                logging.info("Try again: Getting paper %d from %s", paper_id, paper_url)
                try_counter += 1
            else:
                logging.error("Failed getting paper %d from %s", paper_id, paper_url)
                return
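# Usage sketch (an assumption for illustration, not part of the original module):
# get_paper() is normally driven by the scraper's main loop, but it can be called
# directly for a single paper. The class name `AllRisScraper` and the `config`/`db`
# arguments below are hypothetical.
#
#     scraper = AllRisScraper(config, db)
#     scraper.get_paper(paper_id=12345)   # the detail URL is rebuilt from the ID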
def get_paper(self, paper_url=None, paper_id=None):
    """
    Load paper details for the paper given by detail page URL or numeric ID.
    """
    # Derive paper_id from paper_url or vice versa, whichever was given.
    if paper_id is not None:
        paper_url = self.urls['SUBMISSION_DETAIL_PRINT_PATTERN'] % paper_id
    elif paper_url is not None:
        parsed = parse.search(self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], paper_url)
        paper_id = parsed['paper_id']
    logging.info("Getting paper %d from %s", paper_id, paper_url)

    paper = Paper(numeric_id=paper_id)

    try_until = 1
    try_counter = 0
    try_found = False

    while try_counter < try_until:
        try_counter += 1
        try_found = False
        time.sleep(self.config.WAIT_TIME)
        try:
            response = self.user_agent.open(paper_url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                sys.stderr.write("URL not found (HTTP 404) error caught: %s\n" % paper_url)
                sys.stderr.write("Please check BASE_URL in your configuration.\n")
                sys.exit(1)
            elif e.code == 502:
                try_until = 4
                try_found = True
                if try_until == try_counter:
                    logging.error("Permanent error in %s after 4 tries.", paper_url)
                    return
                else:
                    logging.info("Original RIS Server Bug, restart scraping paper %s", paper_url)
                    continue  # no usable response in this iteration, retry

        mechanize_forms = mechanize.ParseResponse(response, backwards_compat=False)
        response.seek(0)
        html = response.read()
        html = html.replace('&nbsp;', ' ')  # normalize non-breaking spaces
        parser = etree.HTMLParser()
        dom = etree.parse(StringIO(html), parser)

        # Fetch the page again if an unknown, randomly occurring error is returned
        # without an error message (observed in Duisburg, probably a broken server config).
        try:
            page_title = dom.xpath('//h1')[0].text
            if 'Fehler' in page_title:
                try_until = 4
                try_found = True
                if try_until == try_counter:
                    logging.error("Permanent error in %s after 4 tries, proceeding anyway.", paper_url)
                else:
                    logging.info("Original RIS Server Bug, restart scraping paper %s", paper_url)
        except:
            pass

        if not try_found:
            # Check for page errors.
            try:
                if 'Fehlermeldung' in page_title:
                    logging.info("Page %s cannot be accessed due to server error", paper_url)
                    return
                if 'Berechtigungsfehler' in page_title:
                    logging.info("Page %s cannot be accessed due to permissions", paper_url)
                    return
            except:
                pass

            paper.original_url = paper_url
            paper_related = []

            # Paper title
            try:
                stitle = dom.xpath(self.xpath['SUBMISSION_DETAIL_TITLE'])
                paper.title = stitle[0].text
            except:
                logging.critical('Cannot find paper title element using XPath SUBMISSION_DETAIL_TITLE')
                raise TemplateError('Cannot find paper title element using XPath SUBMISSION_DETAIL_TITLE')

            # Submission identifier, date, type etc.
            tds = dom.xpath(self.xpath['SUBMISSION_DETAIL_IDENTIFIER_TD'])
            if len(tds) == 0:
                logging.critical('Cannot find table fields using XPath SUBMISSION_DETAIL_IDENTIFIER_TD')
                logging.critical('HTML Dump:' + html)
                raise TemplateError('Cannot find table fields using XPath SUBMISSION_DETAIL_IDENTIFIER_TD')
            else:
                current_category = None
                for n in range(0, len(tds)):
                    try:
                        tdcontent = tds[n].text.strip()
                    except:
                        continue
                    if tdcontent == 'Name:':
                        paper.identifier = tds[n + 1].text.strip()
                    elif tdcontent == 'Art:':
                        paper.type = tds[n + 1].text.strip()
                    elif tdcontent == 'Datum:':
                        paper.date = tds[n + 1].text.strip()
                    elif tdcontent == 'Betreff:':
                        paper.subject = '; '.join(tds[n + 1].xpath('./text()'))
                    elif tdcontent == 'Aktenzeichen:':
                        paper.reference_number = tds[n + 1].text.strip()
                    elif tdcontent == 'Referenzvorlage:':
                        link = tds[n + 1].xpath('a')[0]
                        href = link.get('href')
                        parsed = parse.search(self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
                        superordinated_paper = Paper(
                            numeric_id=parsed['paper_id'],
                            identifier=link.text.strip())
                        paper_related.append({
                            'relation': 'superordinated',
                            'paper': superordinated_paper})
                        # Add the superordinated paper to the queue.
                        if hasattr(self, 'paper_queue'):
                            self.paper_queue.add(parsed['paper_id'])
                    # Subordinated papers are added to the queue as well.
                    elif tdcontent == 'Untergeordnete Vorlage(n):':
                        current_category = 'subordinates'
                        for link in tds[n + 1].xpath('a'):
                            href = link.get('href')
                            parsed = parse.search(self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
                            subordinated_paper = Paper(
                                numeric_id=parsed['paper_id'],
                                identifier=link.text.strip())
                            paper_related.append({
                                'relation': 'subordinated',
                                'paper': subordinated_paper})
                            if hasattr(self, 'paper_queue') and parsed is not None:
                                self.paper_queue.add(parsed['paper_id'])
                    else:
                        # Continuation rows of the subordinated papers list.
                        if current_category == 'subordinates' and len(tds) > n + 1:
                            for link in tds[n + 1].xpath('a'):
                                href = link.get('href')
                                parsed = parse.search(self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
                                subordinated_paper = Paper(
                                    numeric_id=parsed['paper_id'],
                                    identifier=link.text.strip())
                                paper_related.append({
                                    'relation': 'subordinated',
                                    'paper': subordinated_paper})
                                if hasattr(self, 'paper_queue') and parsed is not None:
                                    self.paper_queue.add(parsed['paper_id'])
                if len(paper_related):
                    paper.paper = paper_related
                if not hasattr(paper, 'identifier'):
                    logging.critical('Cannot find paper identifier using SESSION_DETAIL_IDENTIFIER_TD_XPATH')
                    raise TemplateError('Cannot find paper identifier using SESSION_DETAIL_IDENTIFIER_TD_XPATH')

            # "Beratungsfolge" (list of sessions for this paper).
            # This is currently not parsed for scraping, but only for gathering
            # session-document ids for later exclusion.
            found_documents = []
            rows = dom.xpath(self.xpath['SUBMISSION_DETAIL_AGENDA_ROWS'])
            for row in rows:
                # Find forms.
                formfields = row.xpath('.//input[@type="hidden"][@name="DT"]')
                for formfield in formfields:
                    document_id = formfield.get('value')
                    if document_id is not None:
                        found_documents.append(document_id)
                # Find links.
                links = row.xpath('.//a[contains(@href,"getfile.")]')
                for link in links:
                    if not link.xpath('.//img'):
                        file_link = self.config.BASE_URL + link.get('href')
                        document_id = file_link.split('id=')[1].split('&')[0]
                        found_documents.append(document_id)

            # Paper-related documents.
            paper.document = []
            containers = dom.xpath(self.xpath['SUBMISSION_DETAIL_ATTACHMENTS'])
            for container in containers:
                try:
                    classes = container.get('class').split(' ')
                except:
                    continue
                if self.xpath['SUBMISSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME'] not in classes:
                    continue
                rows = container.xpath('.//tr')
                for row in rows:
                    if not row.xpath('.//form'):
                        # Direct links, no form submission needed.
                        links = row.xpath('.//a')
                        for link in links:
                            # Ignore the additional PDF icon links.
                            if not link.xpath('.//img'):
                                title = ' '.join(link.xpath('./text()')).strip()
                                file_link = self.config.BASE_URL + link.get('href')
                                document_id = file_link.split('id=')[1].split('&')[0]
                                if document_id in found_documents:
                                    continue
                                document = Document(
                                    identifier=document_id,
                                    numeric_id=document_id,
                                    title=title,
                                    original_url=file_link)
                                document = self.get_document_file(document=document, link=file_link)
                                if 'Einladung' in title:
                                    document_type = 'invitation'
                                elif 'Niederschrift' in title:
                                    document_type = 'results_protocol'
                                else:
                                    document_type = 'misc'
                                paper.document.append({'relation': document_type, 'document': document})
                                found_documents.append(document_id)
                    else:
                        # No direct link, so we have to submit forms.
                        forms = row.xpath('.//form')
                        for form in forms:
                            title = " ".join(row.xpath('./td/text()')).strip()
                            for hidden_field in form.xpath('input[@name="DT"]'):
                                document_id = hidden_field.get('value')
                                if document_id in found_documents:
                                    continue
                                document = Document(
                                    identifier=document_id,
                                    numeric_id=document_id,
                                    title=title)
                                # Traverse the mechanize forms to find and submit this one.
                                for mform in mechanize_forms:
                                    for control in mform.controls:
                                        if control.name == 'DT' and control.value == document_id:
                                            document = self.get_document_file(document=document, form=mform)
                                if 'Einladung' in title:
                                    document_type = 'invitation'
                                elif 'Niederschrift' in title:
                                    document_type = 'results_protocol'
                                else:
                                    document_type = 'misc'
                                paper.document.append({'relation': document_type, 'document': document})
                                found_documents.append(document_id)

            # Save the paper (forcing overwrite=True here).
            oid = self.db.save_paper(paper)
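# Usage sketch (an assumption for illustration, not part of the original module):
# this variant accepts either a numeric ID or a detail-page URL and derives the
# missing one via SUBMISSION_DETAIL_PRINT_PATTERN / SUBMISSION_DETAIL_PARSE_PATTERN.
# The class name `SessionNetScraper` below is hypothetical.
#
#     scraper = SessionNetScraper(config, db)
#     scraper.get_paper(paper_id=12345)                  # URL is built from the ID
#     scraper.get_paper(paper_url=some_detail_page_url)  # ID is parsed back from the URL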