def get_paper(self, paper_url=None, paper_id=None):
    """
    Load paper details for the paper given by detail page URL or
    numeric ID
    """
    paper_url = '%svo020.asp?VOLFDNR=%s' % (self.config.BASE_URL, paper_id)
    logging.info("Getting paper %d from %s", paper_id, paper_url)

    # stupid re-try concept because AllRis sometimes misses the opening "<"
    # of tags at the first request
    try_counter = 0
    while True:
        try:
            response = self.get_url(paper_url)
            if not response:
                return
            if "noauth" in response.url:
                logging.warn("Paper %s in %s seems to be private" % (paper_id, paper_url))
                return
            text = self.preprocess_text(response.text)
            doc = html.fromstring(text)
            data = {}

            # check the Beratungsfolge table
            table = self.table_css(doc)[0]  # lets hope we always have this table
            self.consultation_list_start = False
            last_headline = ''
            for line in table:
                if line.tag == 'tr':
                    headline = line[0].text
                elif line.tag == 'td':
                    headline = line.text
                else:
                    logging.error("ERROR: Serious error in data table. Unable to parse.")
                if headline:
                    headline = headline.split(":")[0].lower()
                    if headline[-1] == ":":
                        headline = headline[:-1]
                    if headline == "betreff":
                        value = line[1].text_content().strip()
                        # there is some html comment with a script tag in
                        # front of the text which we remove
                        value = value.split("-->")[1]
                        # remove all multiple spaces from the string
                        data[headline] = " ".join(value.split())
                    elif headline in ['verfasser', u'federführend', 'drucksache-art']:
                        data[headline] = line[1].text.strip()
                    elif headline in ['status']:
                        data[headline] = line[1].text.strip()
                        # related papers
                        if len(line) > 2:
                            if len(line[3]):
                                # gets identifier. is there something else
                                # at this position? (will break)
                                data['paper'] = [{
                                    'paper': Paper(
                                        numeric_id=line[3][0][0][1][0].get('href').split('=')[1].split('&')[0],
                                        identifier=line[3][0][0][1][0].text)}]
                    elif headline == "beratungsfolge":
                        # the actual list will be in the next row inside a
                        # table, so we only set a marker
                        # (for parsers which have the consultation list here)
                        data = self.parse_consultation_list_headline(line, data)
                    elif self.consultation_list_start:
                        # for parsers which have the consultation list in the next tr
                        data = self.parse_consultation_list(line, data)
                        # set the marker to False again as we have read it
                        self.consultation_list_start = False
                last_headline = headline
                # we simply ignore the rest (there might not be much more actually)

            # the actual text comes after the table in a div, but it's not
            # valid XML or HTML, thus using regex
            data['docs'] = self.body_re.findall(response.text)
            first_date = False
            for single_consultation in data['consultation']:
                if first_date:
                    if single_consultation['date'] < first_date:
                        first_date = single_consultation['date']
                else:
                    first_date = single_consultation['date']

            paper = Paper(numeric_id=paper_id)
            paper.original_url = paper_url
            paper.title = data['betreff']
            paper.description = data['docs']
            paper.type = data['drucksache-art']
            if first_date:
                paper.date = first_date.strftime("%Y-%m-%d")
            if 'identifier' in data:
                paper.identifier = data['identifier']

            paper.document = []
            # get the attachments step 1 (Drucksache)
            document_1 = self.attachment_1_css(doc)
            if len(document_1):
                if document_1[0].value:
                    href = '%sdo027.asp' % self.config.BASE_URL
                    identifier = document_1[0].value
                    title = 'Drucksache'
                    document = Document(
                        identifier=identifier,
                        numeric_id=int(identifier),
                        title=title)
                    document = self.get_document_file(document, href, True)
                    paper.document.append({'document': document, 'relation': 'misc'})

            # get the attachments step 2 (additional attachments)
            documents = self.attachments_css(doc)
            if len(documents) > 0:
                if len(documents[0]) > 1:
                    if documents[0][1][0].text.strip() == "Anlagen:":
                        for tr in documents[0][2:]:
                            link = tr[0][0]
                            href = "%s%s" % (self.config.BASE_URL, link.attrib["href"])
                            title = link.text
                            identifier = str(int(link.attrib["href"].split('/')[4]))
                            document = Document(
                                identifier=identifier,
                                numeric_id=int(identifier),
                                title=title)
                            document = self.get_document_file(document, href)
                            paper.document.append({'document': document, 'relation': 'misc'})
            oid = self.db.save_paper(paper)
            return
        except (KeyError, IndexError):
            if try_counter < 3:
                logging.info("Try again: Getting paper %d from %s", paper_id, paper_url)
                try_counter += 1
            else:
                logging.error("Failed getting paper %d from %s", paper_id, paper_url)
                return
def get_paper(self, paper_url=None, paper_id=None):
    """
    Load paper details for the paper given by detail page URL or
    numeric ID
    """
    # Read either paper_id or paper_url from the opposite
    if paper_id is not None:
        paper_url = self.urls['SUBMISSION_DETAIL_PRINT_PATTERN'] % paper_id
    elif paper_url is not None:
        parsed = parse.search(self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], paper_url)
        paper_id = parsed['paper_id']

    logging.info("Getting paper %d from %s", paper_id, paper_url)

    paper = Paper(numeric_id=paper_id)

    try_until = 1
    try_counter = 0
    try_found = False

    while (try_counter < try_until):
        try_counter += 1
        try_found = False
        time.sleep(self.config.WAIT_TIME)
        try:
            response = self.user_agent.open(paper_url)
        except urllib2.HTTPError, e:
            if e.code == 404:
                sys.stderr.write("URL not found (HTTP 404) error caught: %s\n" % paper_url)
                sys.stderr.write("Please check BASE_URL in your configuration.\n")
                sys.exit(1)
            elif e.code == 502:
                try_until = 4
                try_found = True
                if try_until == try_counter:
                    logging.error("Permanent error in %s after 4 retries.", paper_url)
                    return
                else:
                    logging.info("Original RIS Server Bug, restart scraping paper %s", paper_url)

        mechanize_forms = mechanize.ParseResponse(response, backwards_compat=False)
        response.seek(0)
        html = response.read()
        # strip non-breaking spaces before parsing
        html = html.replace('&nbsp;', ' ')
        parser = etree.HTMLParser()
        dom = etree.parse(StringIO(html), parser)

        # Fetch the page again if an unknown, randomly occurring error is
        # returned without an error message (found in Duisburg, probably a
        # broken server config).
        try:
            page_title = dom.xpath('//h1')[0].text
            if 'Fehler' in page_title:
                try_until = 4
                try_found = True
                if try_until == try_counter:
                    logging.error("Permanent error in %s after 3 retries, proceed.", paper_url)
                else:
                    logging.info("Original RIS Server Bug, restart scraping paper %s", paper_url)
        except:
            pass

        if (try_found == False):
            # check for page errors
            try:
                if 'Fehlermeldung' in page_title:
                    logging.info("Page %s cannot be accessed due to server error", paper_url)
                    return
                if 'Berechtigungsfehler' in page_title:
                    logging.info("Page %s cannot be accessed due to permissions", paper_url)
                    return
            except:
                pass

            paper.original_url = paper_url
            paper_related = []

            # Paper title
            try:
                stitle = dom.xpath(self.xpath['SUBMISSION_DETAIL_TITLE'])
                paper.title = stitle[0].text
            except:
                logging.critical('Cannot find paper title element using XPath SUBMISSION_DETAIL_TITLE')
                raise TemplateError('Cannot find paper title element using XPath SUBMISSION_DETAIL_TITLE')

            # Submission identifier, date, type etc
            tds = dom.xpath(self.xpath['SUBMISSION_DETAIL_IDENTIFIER_TD'])
            if len(tds) == 0:
                logging.critical('Cannot find table fields using XPath SUBMISSION_DETAIL_IDENTIFIER_TD')
                logging.critical('HTML Dump:' + html)
                raise TemplateError('Cannot find table fields using XPath SUBMISSION_DETAIL_IDENTIFIER_TD')
            else:
                current_category = None
                for n in range(0, len(tds)):
                    try:
                        tdcontent = tds[n].text.strip()
                    except:
                        continue
                    if tdcontent == 'Name:':
                        paper.identifier = tds[n + 1].text.strip()
                    elif tdcontent == 'Art:':
                        paper.type = tds[n + 1].text.strip()
                    elif tdcontent == 'Datum:':
                        paper.date = tds[n + 1].text.strip()
                    elif tdcontent == 'Name:':
                        paper.identifier = tds[n + 1].text.strip()
                    elif tdcontent == 'Betreff:':
                        paper.subject = '; '.join(tds[n + 1].xpath('./text()'))
                    elif tdcontent == 'Aktenzeichen:':
                        paper.reference_number = tds[n + 1].text.strip()
                    elif tdcontent == 'Referenzvorlage:':
                        link = tds[n + 1].xpath('a')[0]
                        href = link.get('href')
                        parsed = parse.search(self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
                        superordinated_paper = Paper(numeric_id=parsed['paper_id'],
                                                     identifier=link.text.strip())
                        paper_related.append({
                            'relation': 'superordinated',
                            'paper': superordinated_paper})
                        # add superordinate paper to queue
                        if hasattr(self, 'paper_queue'):
                            self.paper_queue.add(parsed['paper_id'])
                    # subordinate papers are added to the queue
                    elif tdcontent == 'Untergeordnete Vorlage(n):':
                        current_category = 'subordinates'
                        for link in tds[n + 1].xpath('a'):
                            href = link.get('href')
                            parsed = parse.search(self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
                            subordinated_paper = Paper(numeric_id=parsed['paper_id'],
                                                       identifier=link.text.strip())
                            paper_related.append({
                                'relation': 'subordinated',
                                'paper': subordinated_paper})
                            if hasattr(self, 'paper_queue') and parsed is not None:
                                # add subordinate paper to queue
                                self.paper_queue.add(parsed['paper_id'])
                    else:
                        if current_category == 'subordinates' and len(tds) > n + 1:
                            for link in tds[n + 1].xpath('a'):
                                href = link.get('href')
                                parsed = parse.search(self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
                                subordinated_paper = Paper(numeric_id=parsed['paper_id'],
                                                           identifier=link.text.strip())
                                paper_related.append({
                                    'relation': 'subordinated',
                                    'paper': subordinated_paper})
                                if hasattr(self, 'paper_queue') and parsed is not None:
                                    self.paper_queue.add(parsed['paper_id'])

                if len(paper_related):
                    paper.paper = paper_related
                if not hasattr(paper, 'identifier'):
                    logging.critical('Cannot find paper identifier using SESSION_DETAIL_IDENTIFIER_TD_XPATH')
                    raise TemplateError('Cannot find paper identifier using SESSION_DETAIL_IDENTIFIER_TD_XPATH')

            # "Beratungsfolge" (list of sessions for this paper)
            # This is currently not parsed for scraping, but only for
            # gathering session-document ids for later exclusion
            found_documents = []
            rows = dom.xpath(self.xpath['SUBMISSION_DETAIL_AGENDA_ROWS'])
            for row in rows:
                # find forms
                formfields = row.xpath('.//input[@type="hidden"][@name="DT"]')
                for formfield in formfields:
                    document_id = formfield.get('value')
                    if document_id is not None:
                        found_documents.append(document_id)
                # find links
                links = row.xpath('.//a[contains(@href,"getfile.")]')
                for link in links:
                    if not link.xpath('.//img'):
                        file_link = self.config.BASE_URL + link.get('href')
                        document_id = file_link.split('id=')[1].split('&')[0]
                        found_documents.append(document_id)

            # paper-related documents
            documents = []
            paper.document = []
            containers = dom.xpath(self.xpath['SUBMISSION_DETAIL_ATTACHMENTS'])
            for container in containers:
                try:
                    classes = container.get('class').split(' ')
                except:
                    continue
                if self.xpath['SUBMISSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME'] not in classes:
                    continue
                rows = container.xpath('.//tr')
                for row in rows:
                    # seems that we have direct links
                    if not row.xpath('.//form'):
                        links = row.xpath('.//a')
                        for link in links:
                            # ignore additional pdf icon links
                            if not link.xpath('.//img'):
                                title = ' '.join(link.xpath('./text()')).strip()
                                file_link = self.config.BASE_URL + link.get('href')
                                document_id = file_link.split('id=')[1].split('&')[0]
                                if document_id in found_documents:
                                    continue
                                document = Document(
                                    identifier=document_id,
                                    numeric_id=document_id,
                                    title=title,
                                    original_url=file_link)
                                document = self.get_document_file(document=document, link=file_link)
                                if 'Einladung' in title:
                                    document_type = 'invitation'
                                elif 'Niederschrift' in title:
                                    document_type = 'results_protocol'
                                else:
                                    document_type = 'misc'
                                paper.document.append({'relation': document_type, 'document': document})
                                found_documents.append(document_id)
                    # no direct link, so we have to handle forms
                    else:
                        forms = row.xpath('.//form')
                        for form in forms:
                            title = " ".join(row.xpath('./td/text()')).strip()
                            for hidden_field in form.xpath('input[@name="DT"]'):
                                document_id = hidden_field.get('value')
                                if document_id in found_documents:
                                    continue
                                document = Document(
                                    identifier=document_id,
                                    numeric_id=document_id,
                                    title=title)
                                # Traversing the whole mechanize response to submit this form
                                for mform in mechanize_forms:
                                    for control in mform.controls:
                                        if control.name == 'DT' and control.value == document_id:
                                            document = self.get_document_file(document=document, form=mform)
                                if 'Einladung' in title:
                                    document_type = 'invitation'
                                elif 'Niederschrift' in title:
                                    document_type = 'results_protocol'
                                else:
                                    document_type = 'misc'
                                paper.document.append({'relation': document_type, 'document': document})
                                found_documents.append(document_id)
            if len(documents):
                paper.document = documents
            # forcing overwrite=True here
            oid = self.db.save_paper(paper)
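# Side note (sketch, not part of the scraper above): both branches recover the
# document id by slicing the getfile link with split('id=')[1].split('&')[0].
# Assuming those links carry the id as an ordinary query parameter
# (e.g. ".../getfile.asp?id=1234&type=do"), the same value can be read more
# defensively with the Python 2 stdlib; extract_file_id is a hypothetical helper.
from urlparse import urlparse, parse_qs


def extract_file_id(file_link):
    query = parse_qs(urlparse(file_link).query)
    # parse_qs maps each key to a list of values; take the first "id", if any
    return query.get('id', [None])[0]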
def get_paper(self, paper_url=None, paper_id=None):
    """
    Load paper details for the paper given by detail page URL or
    numeric ID
    """
    # Read either paper_id or paper_url from the opposite
    if paper_id is not None:
        paper_url = self.urls['PAPER_DETAIL_PRINT_PATTERN'] % (self.config["scraper"]["base_url"], paper_id)
    elif paper_url is not None:
        parsed = parse.search(self.urls['PAPER_DETAIL_PARSE_PATTERN'], paper_url)
        paper_id = parsed['paper_id']

    logging.info("Getting paper %d from %s", paper_id, paper_url)

    paper = Paper(originalId=paper_id)

    try_until = 1
    try_counter = 0
    try_found = False

    while (try_counter < try_until):
        try_counter += 1
        try_found = False
        time.sleep(self.config['scraper']['wait_time'])
        try:
            response = self.user_agent.open(paper_url)
        except urllib2.HTTPError, e:
            if e.code == 404:
                sys.stderr.write("URL not found (HTTP 404) error caught: %s\n" % paper_url)
                sys.stderr.write("Please check BASE_URL in your configuration.\n")
                sys.exit(1)
            elif e.code == 502 or e.code == 500:
                try_until = 4
                try_found = True
                if try_until == try_counter:
                    logging.error("Permanent error in %s after 4 retries.", paper_url)
                    return
                else:
                    logging.info("Original RIS Server Bug, restart fetching paper %s", paper_url)
        if not response:
            return

        mechanize_forms = mechanize.ParseResponse(response, backwards_compat=False)
        response.seek(0)
        html = response.read()
        # strip non-breaking spaces before parsing
        html = html.replace('&nbsp;', ' ')
        parser = etree.HTMLParser()
        dom = etree.parse(StringIO(html), parser)

        # Fetch the page again if an unknown, randomly occurring error is
        # returned without an error message (found in Duisburg, probably a
        # broken server config).
        try:
            page_title = dom.xpath('//h1')[0].text
            if 'Fehler' in page_title:
                try_until = 4
                try_found = True
                if try_until == try_counter:
                    logging.error("Permanent error in %s after 3 retries, proceed.", paper_url)
                else:
                    logging.info("Original RIS Server Bug, restart scraping paper %s", paper_url)
        except:
            pass

        if (try_found == False):
            # check for page errors
            try:
                if 'Fehlermeldung' in page_title:
                    logging.info("Page %s cannot be accessed due to server error", paper_url)
                    return
                if 'Berechtigungsfehler' in page_title:
                    logging.info("Page %s cannot be accessed due to permissions", paper_url)
                    return
            except:
                pass

            paper.originalUrl = paper_url
            superordinated_papers = []
            subordinated_papers = []

            # Paper title
            try:
                stitle = dom.xpath(self.xpath['PAPER_DETAIL_TITLE'])
                paper.title = stitle[0].text
            except:
                logging.critical('Cannot find paper title element using XPath PAPER_DETAIL_TITLE')
                raise TemplateError('Cannot find paper title element using XPath PAPER_DETAIL_TITLE')

            # Paper identifier, date, type etc
            tds = dom.xpath(self.xpath['PAPER_DETAIL_IDENTIFIER_TD'])
            if len(tds) == 0:
                logging.critical('Cannot find table fields using XPath PAPER_DETAIL_IDENTIFIER_TD')
                logging.critical('HTML Dump:' + html)
                raise TemplateError('Cannot find table fields using XPath PAPER_DETAIL_IDENTIFIER_TD')
            else:
                current_category = None
                for n in range(0, len(tds)):
                    try:
                        tdcontent = tds[n].text.strip()
                    except:
                        continue
                    if tdcontent == 'Name:':
                        paper.nameShort = tds[n + 1].text.strip()
                    # TODO: dereference paper type strings
                    elif tdcontent == 'Art:':
                        paper.paperType = tds[n + 1].text.strip()
                    elif tdcontent == 'Datum:':
                        paper.publishedDate = tds[n + 1].text.strip()
                    elif tdcontent == 'Betreff:':
                        paper.name = '; '.join(tds[n + 1].xpath('./text()'))
                    elif tdcontent == 'Aktenzeichen:':
                        paper.reference = tds[n + 1].text.strip()
                    elif tdcontent == 'Referenzvorlage:':
                        link = tds[n + 1].xpath('a')[0]
                        href = link.get('href')
                        parsed = parse.search(self.urls['PAPER_DETAIL_PARSE_PATTERN'], href)
                        superordinated_paper = Paper(originalId=parsed['paper_id'],
                                                     nameShort=link.text.strip())
                        superordinated_papers.append(superordinated_paper)
                        # add superordinate paper to queue
                        if hasattr(self, 'paper_queue'):
                            self.paper_queue.add(parsed['paper_id'])
                    # subordinate papers are added to the queue
                    elif tdcontent == 'Untergeordnete Vorlage(n):':
                        current_category = 'subordinates'
                        for link in tds[n + 1].xpath('a'):
                            href = link.get('href')
                            parsed = parse.search(self.urls['PAPER_DETAIL_PARSE_PATTERN'], href)
                            subordinated_paper = Paper(originalId=parsed['paper_id'],
                                                       nameShort=link.text.strip())
                            subordinated_papers.append(subordinated_paper)
                            if hasattr(self, 'paper_queue') and parsed is not None:
                                # add subordinate paper to queue
                                self.paper_queue.add(parsed['paper_id'])
                    elif tdcontent == u'Anträge zur Vorlage:':
                        current_category = 'todo'
                        pass  # TODO: WTF is this?
                    else:
                        if current_category == 'subordinates' and len(tds) > n + 1:
                            for link in tds[n + 1].xpath('a'):
                                href = link.get('href')
                                parsed = parse.search(self.urls['PAPER_DETAIL_PARSE_PATTERN'], href)
                                subordinated_paper = Paper(originalId=parsed['paper_id'],
                                                           nameShort=link.text.strip())
                                subordinated_papers.append(subordinated_paper)
                                if hasattr(self, 'paper_queue') and parsed is not None:
                                    self.paper_queue.add(parsed['paper_id'])

                if len(subordinated_papers):
                    paper.subordinatedPaper = subordinated_papers
                if len(superordinated_papers):
                    paper.superordinatedPaper = superordinated_papers
                if not hasattr(paper, 'originalId'):
                    logging.critical('Cannot find paper identifier using MEETING_DETAIL_IDENTIFIER_TD')
                    raise TemplateError('Cannot find paper identifier using MEETING_DETAIL_IDENTIFIER_TD')

            # "Beratungsfolge" (list of sessions for this paper)
            # This is currently not parsed for scraping, but only for
            # gathering session-document ids for later exclusion
            found_files = []  # already changed: found_files, files. todo: document_foo
            rows = dom.xpath(self.xpath['PAPER_DETAIL_AGENDA_ROWS'])
            for row in rows:
                # find forms
                formfields = row.xpath('.//input[@type="hidden"][@name="DT"]')
                for formfield in formfields:
                    file_id = formfield.get('value')
                    if file_id is not None:
                        found_files.append(file_id)
                # find links
                links = row.xpath('.//a[contains(@href,"getfile.")]')
                for link in links:
                    if not link.xpath('.//img'):
                        file_link = self.config['scraper']['base_url'] + link.get('href')
                        file_id = file_link.split('id=')[1].split('&')[0]
                        found_files.append(file_id)

            # paper-related documents
            files = []
            containers = dom.xpath(self.xpath['PAPER_DETAIL_FILES'])
            for container in containers:
                try:
                    classes = container.get('class').split(' ')
                except:
                    continue
                if self.xpath['PAPER_DETAIL_FILES_CONTAINER_CLASSNAME'] not in classes:
                    continue
                rows = container.xpath('.//tr')
                for row in rows:
                    # seems that we have direct links
                    if not row.xpath('.//form'):
                        links = row.xpath('.//a')
                        for link in links:
                            # ignore additional pdf icon links
                            if not link.xpath('.//img'):
                                name = ' '.join(link.xpath('./text()')).strip()
                                file_link = self.config['scraper']['base_url'] + link.get('href')
                                file_id = file_link.split('id=')[1].split('&')[0]
                                if file_id in found_files:
                                    continue
                                file = File(
                                    originalId=file_id,
                                    name=name,
                                    originalUrl=file_link,
                                    originalDownloadPossible=True)
                                file = self.get_file(file=file, link=file_link)
                                files.append(file)
                                found_files.append(file_id)
                    # no direct link, so we have to handle forms
                    else:
                        forms = row.xpath('.//form')
                        for form in forms:
                            name = " ".join(row.xpath('./td/text()')).strip()
                            for hidden_field in form.xpath('input[@name="DT"]'):
                                file_id = hidden_field.get('value')
                                if file_id in found_files:
                                    continue
                                file = File(
                                    originalId=file_id,
                                    name=name,
                                    originalDownloadPossible=False)
                                # Traversing the whole mechanize response to submit this form
                                for mform in mechanize_forms:
                                    for control in mform.controls:
                                        if control.name == 'DT' and control.value == file_id:
                                            file = self.get_file(file=file, form=mform)
                                files.append(file)
                                found_files.append(file_id)
            if len(files):
                paper.mainFile = files[0]
            if len(files) > 1:
                paper.auxiliaryFile = files[1:]
            oid = self.db.save_paper(paper)
def get_paper(self, paper_url=None, paper_id=None):
    """
    Load paper details for the paper given by detail page URL or
    numeric ID
    """
    paper_url = ('%svo020.asp?VOLFDNR=%s'
                 % (self.config['scraper']['base_url'], paper_id))
    logging.info("Getting paper %d from %s", paper_id, paper_url)

    # Stupid re-try concept because AllRis sometimes misses the opening "<"
    # of tags at the first request.
    try_counter = 0
    while True:
        try:
            response = self.get_url(paper_url)
            if not response:
                return
            if "noauth" in response.url:
                logging.warn("Paper %s in %s seems to be private",
                             paper_id, paper_url)
                return
            text = response.text
            doc = html.fromstring(text)
            data = {}

            # check the Beratungsfolge table
            # lets hope we always have this table
            table = self.table_css(doc)[0]
            self.consultation_list_start = False
            last_headline = ''
            for line in table:
                if line.tag == 'tr':
                    headline = line[0].text
                elif line.tag == 'td':
                    headline = line.text
                else:
                    logging.error("ERROR: Serious error in data table. "
                                  "Unable to parse.")
                if headline:
                    headline = headline.split(":")[0].lower()
                    if headline[-1] == ":":
                        headline = headline[:-1]
                    if headline == "betreff":
                        value = line[1].text_content().strip()
                        # There is some html comment with a script
                        # tag in front of the text which we remove.
                        value = value.split("-->")[1]
                        # remove all multiple spaces from the string
                        data[headline] = " ".join(value.split())
                    elif headline in ['verfasser', u'federführend',
                                      'drucksache-art']:
                        data[headline] = line[1].text.strip()
                    elif headline in ['status']:
                        data[headline] = line[1].text.strip()
                        # related papers
                        if len(line) > 2:
                            if len(line[3]):
                                # Gets originalId. is there something
                                # else at this position? (will break)
                                paper_id = line[3][0][0][1][0].get(
                                    'href').split('=')[1].split('&')[0]
                                data['relatedPaper'] = [
                                    Paper(originalId=paper_id)]
                    # Lots of scraping just because of the date (?)
                    elif headline == "beratungsfolge":
                        # The actual list will be in the next row
                        # inside a table, so we only set a marker.
                        self.consultation_list_start = True
                    elif self.consultation_list_start:
                        elem = line[0][0]
                        # The first line is pixel images, so skip
                        # it, then we need to jump in steps of two.
                        amount = (len(elem) - 1) / 2
                        consultations = []
                        date_list = []
                        i = 0
                        item = None
                        for elem_line in elem:
                            if i == 0:
                                i += 1
                                continue
                            """
                            Here we need to parse the actual list, which can
                            have different forms. A complex example can be
                            found at
                            http://ratsinfo.aachen.de/bi/vo020.asp?VOLFDNR=10822
                            The first line is some sort of headline with the
                            committee in question and the type of
                            consultation. After that, 0-n lines of detailed
                            information about meetings with a date,
                            transcript and decision. The first line has 3
                            columns (thanks to colspan) and the others have
                            7. Here we make every meeting a separate entry;
                            we can group them together later again if we
                            want to.
                            """
                            # now we need to parse the actual list
                            new_consultation = Consultation()
                            new_consultation.status = \
                                elem_line[0].attrib['title'].lower()
                            if len(elem_line) == 3:
                                # The order is "color/status", name of
                                # committee / link to TOP, more info. We
                                # define a head dict here which can be
                                # shared for the other lines; once we find
                                # another head line we will create a new
                                # one here.
                                new_consultation.role = \
                                    elem_line[2].text.strip()
                                # Name of committee, e.g.
                                # "Finanzausschuss", unfort. without id
                                #'committee' : elem_line[1].text.strip(),
                            # For some obscure reasons sometimes action
                            # is missing.
                            elif len(elem_line) == 2:
                                # The order is "color/status", name of
                                # committee / link to TOP, more info.
                                status = \
                                    elem_line[0].attrib['title'].lower()
                                # We define a head dict here which can be
                                # shared for the other lines; once we find
                                # another head line we will create a new
                                # one here.
                                # name of committee, e.g.
                                # "Finanzausschuss", unfort. without id
                                #'committee' : elem_line[1].text.strip(),
                            elif len(elem_line) == 7:
                                try:
                                    # This is about line 2 with lots of
                                    # more stuff to process.
                                    # Date can be text or a link with that
                                    # text.
                                    # We have a link (and ignore it).
                                    if len(elem_line[1]) == 1:
                                        date_text = elem_line[1][0].text
                                    else:
                                        date_text = elem_line[1].text
                                    date_list.append(
                                        datetime.datetime.strptime(
                                            date_text.strip(), "%d.%m.%Y"))
                                    if len(elem_line[2]):
                                        # Form with silfdnr and toplfdnr
                                        # but only in link (action=
                                        # "to010.asp?topSelected=57023")
                                        form = elem_line[2][0]
                                        meeting_id = form[0].attrib['value']
                                        new_consultation.meeting = [
                                            Meeting(originalId=meeting_id)]
                                        # Full name of meeting, e.g.
                                        # "A/31/WP.16 öffentliche/
                                        # nichtöffentliche Sitzung des
                                        # Finanzausschusses"
                                        #item['meeting'] = \
                                        #    elem_line[3][0].text.strip()
                                    else:
                                        # No link to TOP. Should not be
                                        # possible but happens.
                                        # (TODO: Bugreport?)
                                        # Here we have no link but the text
                                        # is in the TD directly - will be
                                        # scraped as meeting.
                                        #item['meeting'] = \
                                        #    elem_line[3].text.strip()
                                        logging.warn(
                                            "AgendaItem in consultation "
                                            "list on the web page does not "
                                            "contain a link to the actual "
                                            "meeting at paper %s",
                                            paper_url)
                                    toplfdnr = None
                                    if len(elem_line[6]) > 0:
                                        form = elem_line[6][0]
                                        toplfdnr = form[0].attrib['value']
                                    if toplfdnr:
                                        new_consultation.originalId = \
                                            "%s-%s" % (toplfdnr, paper_id)
                                        # actually the id of the transcript
                                        new_consultation.agendaItem = \
                                            AgendaItem(originalId=toplfdnr)
                                        # e.g. "ungeändert beschlossen"
                                        new_consultation.agendaItem.result \
                                            = elem_line[4].text.strip()
                                        consultations.append(
                                            new_consultation)
                                    else:
                                        logging.error(
                                            "missing agendaItem ID in "
                                            "consultation list at %s",
                                            paper_url)
                                except (IndexError, KeyError):
                                    logging.error(
                                        "Serious error in consultation "
                                        "list. Unable to parse.")
                                    return []
                            i += 1
                        # Theory: we don't need this at all, because it's
                        # scraped at meeting.
                        #data['consultations'] = consultations
                        # set the marker to False again as we have read it
                        self.consultation_list_start = False
                last_headline = headline
                # We simply ignore the rest (there might not be much more
                # actually).

            # The actual text comes after the table in a div, but it's not
            # valid XML or HTML, thus using regex.
            data['docs'] = self.body_re.findall(response.text)
            first_date = False
            for single_date in date_list:
                if first_date:
                    if single_date < first_date:
                        first_date = single_date
                else:
                    first_date = single_date

            paper = Paper(originalId=paper_id)
            paper.originalUrl = paper_url
            paper.name = data['betreff']
            paper.description = data['docs']
            if 'drucksache-art' in data:
                paper.paperType = data['drucksache-art']
            if first_date:
                paper.publishedDate = first_date.strftime("%d.%m.%Y")
            # see theory above
            #if 'consultations' in data:
            #    paper.consultation = data['consultations']
            paper.auxiliaryFile = []

            # get the attachments step 1 (Drucksache)
            file_1 = self.attachment_1_css(doc)
            if len(file_1):
                if file_1[0].value:
                    href = ('%sdo027.asp'
                            % self.config['scraper']['base_url'])
                    original_id = file_1[0].value
                    name = 'Drucksache'
                    main_file = File(originalId=original_id, name=name)
                    main_file = self.get_file(main_file, href, True)
                    paper.mainFile = main_file

            # get the attachments step 2 (additional attachments)
            files = self.attachments_css(doc)
            if len(files) > 0:
                if len(files[0]) > 1:
                    if files[0][1][0].text.strip() == "Anlagen:":
                        for tr in files[0][2:]:
                            link = tr[0][0]
                            href = ("%s%s"
                                    % (self.config['scraper']['base_url'],
                                       link.attrib["href"]))
                            name = link.text
                            path_tokens = link.attrib["href"].split('/')
                            original_id = "%d-%d" % (int(path_tokens[4]),
                                                     int(path_tokens[6]))
                            aux_file = File(originalId=original_id, name=name)
                            aux_file = self.get_file(aux_file, href)
                            paper.auxiliaryFile.append(aux_file)
            print paper.auxiliaryFile
            if not len(paper.auxiliaryFile):
                del paper.auxiliaryFile
            oid = self.db.save_paper(paper)
            return
        except (KeyError, IndexError):
            if try_counter < 3:
                logging.info("Try again: Getting paper %d from %s",
                             paper_id, paper_url)
                try_counter += 1
            else:
                logging.error("Failed getting paper %d from %s",
                              paper_id, paper_url)
                return
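# Design note (sketch only): the first_date loop above simply keeps the
# earliest entry of date_list; the built-in min gives the same result,
# guarded for an empty list.
first_date = min(date_list) if date_list else False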
def get_meeting(self, meeting_url=None, meeting_id=None):
    """
    Load meeting details (e.g. agendaitems) for the given detail page
    URL or numeric ID
    """
    meeting_url = ("%sto010.asp?selfaction=ws&template=xyz&SILFDNR=%s"
                   % (self.config['scraper']['base_url'], meeting_id))
    logging.info("Getting meeting %d from %s", meeting_id, meeting_url)

    r = self.get_url(meeting_url)
    if not r:
        return
    # If r.history has an item we have a problem
    if len(r.history):
        if r.history[0].status_code == 302:
            logging.info("Meeting %d from %s seems to be private",
                         meeting_id, meeting_url)
        else:
            logging.error(
                "Strange redirect %d from %s with status code %s",
                meeting_id, meeting_url, r.history[0].status_code)
        return
    h = HTMLParser.HTMLParser()
    xml = str(r.text.encode('ascii', 'xmlcharrefreplace'))
    parser = etree.XMLParser(recover=True)
    root = etree.fromstring(xml, parser=parser)

    meeting = Meeting(originalId=meeting_id)

    # special area
    special = {}
    for item in root[0].iterchildren():
        special[item.tag] = item.text
    # Where do we get the date from? Only via the overview page?
    #if 'sisb' in special:
    #if 'sise' in special:
    if 'saname' in special:
        meeting.type = special['saname']

    # head area
    head = {}
    for item in root[1].iterchildren():
        if item.text:
            head[item.tag] = h.unescape(item.text)
        else:
            head[item.text] = ''
    if 'sitext' in head:
        meeting.name = head['sitext']
    if 'raname' in head:
        meeting.room = head['raname']
    if 'raort' in head:
        meeting.address = head['raort']

    agendaitems = []
    for item in root[2].iterchildren():
        elem = {}
        for e in item.iterchildren():
            elem[e.tag] = e.text

        section = [elem['tofnum'], elem['tofunum'], elem['tofuunum']]
        section = [x for x in section if x != "0"]
        elem['section'] = ".".join(section)
        agendaitem = AgendaItem()

        agendaitem.originalId = int(elem['tolfdnr'])
        agendaitem.public = (elem['toostLang'] == u'öffentlich')
        #agendaitem.name = elem['totext1']
        # get agenda detail page
        # TODO: Own Queue
        time.sleep(self.config['scraper']['wait_time'])
        agendaitem_url = (
            '%sto020.asp?selfaction=ws&template=xyz&TOLFDNR=%s'
            % (self.config['scraper']['base_url'], agendaitem.originalId))
        logging.info("Getting agendaitem %d from %s",
                     agendaitem.originalId, agendaitem_url)

        agendaitem_r = self.get_url(agendaitem_url)
        if not agendaitem_r:
            return

        if len(agendaitem_r.history):
            logging.info("Agenda item %d from %s seems to be private",
                         meeting_id, meeting_url)
        else:
            agendaitem_xml = agendaitem_r.text.encode(
                'ascii', 'xmlcharrefreplace')
            # TODO: mixup of agendaitem_parser / parser below?
            agendaitem_parser = etree.XMLParser(recover=True)
            agendaitem_root = etree.fromstring(agendaitem_xml, parser=parser)
            add_agenda_item = {}
            for add_item in agendaitem_root[0].iterchildren():
                if add_item.tag == "rtfWP" and len(add_item) > 0:
                    try:
                        agendaitem.resolution_text = h.unescape(
                            etree.tostring(add_item[0][1][0]))
                    except:
                        logging.warn(
                            "Unable to parse resolution text at %s",
                            agendaitem_url)
                else:
                    if add_item.text:
                        add_agenda_item[add_item.tag] = h.unescape(
                            add_item.text)
            if 'toptext' in add_agenda_item:
                agendaitem.name = add_agenda_item['toptext']

            # there are papers with id = 0. we don't need them.
            if int(elem['volfdnr']):
                consult_id = (unicode(agendaitem.originalId)
                              + unicode(int(elem['volfdnr'])))
                consultation = Consultation(originalId=consult_id)
                paper_id = int(elem['volfdnr'])
                if 'voname' in add_agenda_item:
                    consultation.paper = Paper(
                        originalId=paper_id,
                        name=add_agenda_item['voname'])
                else:
                    consultation.paper = Paper(originalId=paper_id)
                agendaitem.consultation = [consultation]
                if 'vobetr' in add_agenda_item:
                    if add_agenda_item['vobetr'] != agendaitem.name:
                        logging.warn(
                            "different values for name: %s and %s",
                            agendaitem.name, add_agenda_item['vobetr'])
                if hasattr(self, 'paper_queue'):
                    self.paper_queue.add(int(elem['volfdnr']))
            if 'totyp' in add_agenda_item:
                agendaitem.result = add_agenda_item['totyp']
        agendaitems.append(agendaitem)
    meeting.agendaItem = agendaitems

    oid = self.db.save_meeting(meeting)
    logging.info("Meeting %d stored with _id %s", meeting_id, oid)
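# Worked example for the section numbering above (sketch only): the counters
# tofnum/tofunum/tofuunum are joined with dots after dropping "0" parts,
# so ("4", "1", "0") yields "4.1" and ("4", "0", "0") yields just "4".
section = [x for x in ("4", "1", "0") if x != "0"]
assert ".".join(section) == "4.1"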
class PaperPage(object):
    valid = False  # validity of the page data; if invalid, do not parse or store

    def __init__(self, _id, content=None, **kwargs):
        self._id = str(_id)
        self.paper = Paper()
        page_data = col_paper.find_one({"url_id": self._id})
        if page_data:
            # already in the database, return directly
            return
        if content is None:
            self.content = Downloader(host + self._id)()
            if self.content:
                self.valid = True
            else:
                logger.error("Current page is empty, cannot parse\turl_id:" + self._id)
                self.valid = False
                return
        else:
            self.valid = True
            self.content = content
        self.selector = etree.HTML(self.content)
        self.paper.url_id = self._id

    def run(self):
        if not self.valid:
            logger.info("Paper already exists\turl_id:" + self._id)
            return
        self.main_page()
        self.get_in_citation()
        if len(self.paper.in_citations) < 1:
            logger.info("Paper has fewer than 1 reference, excluded\turl_id:" + self._id)
            return
        self.get_citing_sentence()
        self.get_out_citation()
        self.paper.save()
        logger.info("Finished parsing\turl:%s\tpaper_id:%s" % (self._id, self.paper._id))

    def main_page(self):
        try:
            self.paper.year = to_int(
                deep_select(self.selector, 0, "//table//tr[5]/td/text()"))
            # if self.paper.year < 2013:
            #     # filter the data set
            #     return
            # list all authors
            authors = deep_select(self.selector, return_type="list",
                                  xpath="//table//tr[6]/td//a/@href")
            if not authors:
                # if self.paper has no authors, discard the record directly
                self.valid = False
                return
            authors_id = [to_num(x) for x in authors]
            self.paper.authors_full_name = deep_select(
                self.selector, return_type="list",
                xpath="//table//tr[6]/td//a/text()")
            self.paper.authors = authors_id
            self.paper._id = deep_select(self.selector, 0, "//table//tr[1]/td/text()")
            if not self.paper._id:
                self.valid = False
                return
            self.paper.title = deep_select(self.selector, 0, "//table//tr[2]/td/text()")
            self.paper.venue = deep_select(self.selector, 0, "//table//tr[3]/td/text()")
            self.paper.session = deep_select(self.selector, 0, "//table//tr[4]/td/text()")
            self.paper.abstract = clean(
                deep_select(self.selector, 0, '//div[@id="abstract"]/p/text()'))
        except Exception as e:
            logger.error("id:%s\t%s" % (self._id, e))

    def get_out_citation(self):
        if not self.valid:
            return
        self.selector = etree.HTML(
            Downloader('http://aan.how/browse/outgoing_citations/' + self._id)())
        out_citations = deep_select(self.selector, return_type="list", xpath='//a/@href')
        if out_citations:
            self.paper.out_citations = [to_num(x) for x in out_citations]

    def get_in_citation(self):
        if not self.valid:
            return
        self.selector = etree.HTML(
            Downloader('http://aan.how/browse/incoming_citations/' + self._id)())
        in_citations = deep_select(self.selector, return_type="list", xpath='//a/@href')
        if in_citations:
            self.paper.in_citations = [to_num(x) for x in in_citations]

    def get_citing_sentence(self):
        if not self.valid:
            return
        self.selector = etree.HTML(
            Downloader('http://aan.how/browse/citing_sentences/' + self._id)())
        paper_id = deep_select(self.selector, return_type="list", xpath='//a/text()')
        sentence = deep_select(self.selector, return_type="list",
                               xpath="//tr/td[4]/div/text()")
        line = deep_select(self.selector, return_type="list", xpath="//tr/td[3]/text()")
        if paper_id and sentence:
            for x in range(len(paper_id)):
                citing_sentences = {
                    "paper_id": paper_id[x],
                    "sentence": clean(sentence[x]),
                    "line": line[x]
                }
                self.paper.citing_sentences.append(citing_sentences)
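# Hypothetical driver for PaperPage (not part of the original module, sketch
# only): the constructor takes a numeric url id from aan.how and run() does
# the parsing and persistence. Assumes PaperPage and its dependencies
# (Downloader, col_paper, logger, ...) are importable from this module.
if __name__ == "__main__":
    for url_id in range(1, 11):
        PaperPage(url_id).run()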