コード例 #1
0
ファイル: scraperallris.py プロジェクト: OpenRuhr/ris-scraper
  def get_paper(self, paper_url=None, paper_id=None):
    """
    Load paper details for the paper given by detail page URL
    or numeric ID
    """
    paper_url = '%svo020.asp?VOLFDNR=%s' % (self.config.BASE_URL, paper_id)
    logging.info("Getting paper %d from %s", paper_id, paper_url)

    # stupid re-try concept because AllRis sometimes misses start < at tags at first request
    try_counter = 0
    while True:
      try:
        response = self.get_url(paper_url)
        if not response:
          return
        if "noauth" in response.url:
          logging.warn("Paper %s in %s seems to private" % (paper_id, paper_url))
          return
        text = self.preprocess_text(response.text)
        doc = html.fromstring(text)
        data = {}
        
        # Beratungsfolge-Table checken
        table = self.table_css(doc)[0] # lets hope we always have this table
        self.consultation_list_start = False
        last_headline = ''
        for line in table:
          if line.tag == 'tr':
            headline = line[0].text
          elif line.tag == 'td':
            headline = line.text
          else:
            logging.error("ERROR: Serious error in data table. Unable to parse.")
          if headline:
            headline = headline.split(":")[0].lower()
            if headline[-1]==":":
              headline = headline[:-1]
            if headline == "betreff":
              value = line[1].text_content().strip()
              value = value.split("-->")[1]         # there is some html comment with a script tag in front of the text which we remove
              data[headline] = " ".join(value.split())  # remove all multiple spaces from the string
            elif headline in ['verfasser', u'federführend', 'drucksache-art']:
              data[headline] = line[1].text.strip()
            elif headline in ['status']:
              data[headline] = line[1].text.strip()
              # related papers
              if len(line) > 2:
                if len(line[3]):
                  # gets identifier. is there something else at this position? (will break)
                  data['paper'] = [{'paper': Paper(numeric_id=line[3][0][0][1][0].get('href').split('=')[1].split('&')[0] , identifier=line[3][0][0][1][0].text)}]
                  
            elif headline == "beratungsfolge":
              # the actual list will be in the next row inside a table, so we only set a marker
              data = self.parse_consultation_list_headline(line, data) # for parser which have the consultation list here
            elif self.consultation_list_start:
              data = self.parse_consultation_list(line, data) # for parser which have the consultation list in the next tr
              self.consultation_list_start = False # set the marker to False again as we have read it
          last_headline = headline
          # we simply ignore the rest (there might not be much more actually)
        # the actual text comes after the table in a div but it's not valid XML or HTML this using regex
        data['docs'] = self.body_re.findall(response.text)
        first_date = False
        for single_consultation in data['consultation']:
          if first_date:
            if single_consultation['date'] < first_date:
              first_date = single_consultation['date']
          else:
            first_date = single_consultation['date']
        
        paper = Paper(numeric_id = paper_id)
        paper.original_url = paper_url
        paper.title = data['betreff']
        paper.description = data['docs']
        paper.type = data['drucksache-art']
        if first_date:
          paper.date = first_date.strftime("%Y-%m-%d")
        if 'identifier' in data:
          paper.identifier = data['identifier']
        
        paper.document = []
        # get the attachments step 1 (Drucksache)
        document_1 = self.attachment_1_css(doc)
        if len(document_1):
          if document_1[0].value:
            href = '%sdo027.asp' % self.config.BASE_URL
            identifier = document_1[0].value
            title = 'Drucksache'
            document = Document(
              identifier=identifier,
              numeric_id=int(identifier),
              title=title)
            document = self.get_document_file(document, href, True)
            paper.document.append({'document': document, 'relation': 'misc'})
        # get the attachments step 2 (additional attachments)
        documents = self.attachments_css(doc)
        if len(documents) > 0:
          if len(documents[0]) > 1:
            if documents[0][1][0].text.strip() == "Anlagen:":
              for tr in documents[0][2:]:
                link = tr[0][0]
                href = "%s%s" % (self.config.BASE_URL, link.attrib["href"])
                title = link.text
                identifier = str(int(link.attrib["href"].split('/')[4]))
                document = Document(
                  identifier=identifier,
                  numeric_id=int(identifier),
                  title=title)
                document = self.get_document_file(document, href)
                paper.document.append({'document': document, 'relation': 'misc'})
        oid = self.db.save_paper(paper)
        return
      except (KeyError, IndexError):
        if try_counter < 3:
          logging.info("Try again: Getting paper %d from %s", paper_id, paper_url)
          try_counter += 1
        else:
          logging.error("Failed getting paper %d from %s", paper_id, paper_url)
          return
コード例 #2
0
 def get_paper(self, paper_url=None, paper_id=None):
   """
   Load paper details for the paper given by detail page URL
   or numeric ID
   """
   # Read either paper_id or paper_url from the opposite
   if paper_id is not None:
     paper_url = self.urls['SUBMISSION_DETAIL_PRINT_PATTERN'] % paper_id
   elif paper_url is not None:
     parsed = parse.search(self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], paper_url)
     paper_id = parsed['paper_id']
 
   logging.info("Getting paper %d from %s", paper_id, paper_url)
   
   paper = Paper(numeric_id=paper_id)
   try_until = 1
   try_counter = 0
   try_found = False
   
   while (try_counter < try_until):
     try_counter += 1
     try_found = False
     time.sleep(self.config.WAIT_TIME)
     try:
       response = self.user_agent.open(paper_url)
     except urllib2.HTTPError, e:
       if e.code == 404:
         sys.stderr.write("URL not found (HTTP 404) error caught: %s\n" % paper_url)
         sys.stderr.write("Please check BASE_URL in your configuration.\n")
         sys.exit(1)
       elif e.code == 502:
         try_until = 4
         try_found = True
         if try_until == try_counter:
           logging.error("Permanent error in %s after 4 retrys.", paper_url)
           return
         else:
           logging.info("Original RIS Server Bug, restart scraping paper %s", paper_url)
     mechanize_forms = mechanize.ParseResponse(response, backwards_compat=False)
     response.seek(0)
     html = response.read()
     html = html.replace('&nbsp;', ' ')
     parser = etree.HTMLParser()
     dom = etree.parse(StringIO(html), parser)
     # Hole die Seite noch einmal wenn unbekannter zufällig auftretender Fehler ohne Fehlermeldung ausgegeben wird (gefunden in Duisburg, vermutlich kaputte Server Config)
     try:
       page_title = dom.xpath('//h1')[0].text
       if 'Fehler' in page_title:
         try_until = 4
         try_found = True
         if try_until == try_counter:
           logging.error("Permanent error in %s after 3 retrys, proceed.", paper_url)
         else:
           logging.info("Original RIS Server Bug, restart scraping paper %s", paper_url)
     except:
       pass
     if (try_found == False):
       # check for page errors
       try:
         if 'Fehlermeldung' in page_title:
           logging.info("Page %s cannot be accessed due to server error", paper_url)
           return
         if 'Berechtigungsfehler' in page_title:
           logging.info("Page %s cannot be accessed due to permissions", paper_url)
           return
       except:
         pass
   
       paper.original_url = paper_url
       paper_related = []
       # Paper title
       try:
         stitle = dom.xpath(self.xpath['SUBMISSION_DETAIL_TITLE'])
         paper.title = stitle[0].text
       except:
         logging.critical('Cannot find paper title element using XPath SUBMISSION_DETAIL_TITLE')
         raise TemplateError('Cannot find paper title element using XPath SUBMISSION_DETAIL_TITLE')
     
       # Submission identifier, date, type etc
       tds = dom.xpath(self.xpath['SUBMISSION_DETAIL_IDENTIFIER_TD'])
       if len(tds) == 0:
         logging.critical('Cannot find table fields using XPath SUBMISSION_DETAIL_IDENTIFIER_TD')
         logging.critical('HTML Dump:' + html)
         raise TemplateError('Cannot find table fields using XPath SUBMISSION_DETAIL_IDENTIFIER_TD')
       else:
         current_category = None
         for n in range(0, len(tds)):
           try:
             tdcontent = tds[n].text.strip()
           except:
             continue
           if tdcontent == 'Name:':
             paper.identifier = tds[n + 1].text.strip()
           elif tdcontent == 'Art:':
             paper.type = tds[n + 1].text.strip()
           elif tdcontent == 'Datum:':
             paper.date = tds[n + 1].text.strip()
           elif tdcontent == 'Name:':
             paper.identifier = tds[n + 1].text.strip()
           elif tdcontent == 'Betreff:':
             paper.subject = '; '.join(tds[n + 1].xpath('./text()'))
           elif tdcontent == 'Aktenzeichen:':
             paper.reference_number = tds[n + 1].text.strip()
           elif tdcontent == 'Referenzvorlage:':
             link = tds[n + 1].xpath('a')[0]
             href = link.get('href')
             parsed = parse.search(self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
             superordinated_paper = Paper(numeric_id=parsed['paper_id'], identifier=link.text.strip())
             paper_related.append({ 'relation': 'superordinated',
                             'paper':  superordinated_paper})
             # add superordinate paper to queue
             if hasattr(self, 'paper_queue'):
               self.paper_queue.add(parsed['paper_id'])
           # subordinate papers are added to the queue
           elif tdcontent == 'Untergeordnete Vorlage(n):':
             current_category = 'subordinates'
             for link in tds[n + 1].xpath('a'):
               href = link.get('href')
               parsed = parse.search(self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
               subordinated_paper = Paper(numeric_id=parsed['paper_id'], identifier=link.text.strip())
               paper_related.append({ 'relation': 'subordinated',
                             'paper':  subordinated_paper})
               if hasattr(self, 'paper_queue') and parsed is not None:
                 # add subordinate paper to queue
                 self.paper_queue.add(parsed['paper_id'])
           else:
             if current_category == 'subordinates' and len(tds) > n+1:
               for link in tds[n + 1].xpath('a'):
                 href = link.get('href')
                 parsed = parse.search(self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
                 subordinated_paper = Paper(numeric_id=parsed['paper_id'], identifier=link.text.strip())
                 paper_related.append({ 'relation': 'subordinated',
                               'paper':  subordinated_paper})
                 if hasattr(self, 'paper_queue') and parsed is not None:
                   self.paper_queue.add(parsed['paper_id'])
         if len(paper_related):
           paper.paper = paper_related
         if not hasattr(paper, 'identifier'):
           logging.critical('Cannot find paper identifier using SESSION_DETAIL_IDENTIFIER_TD_XPATH')
           raise TemplateError('Cannot find paper identifier using SESSION_DETAIL_IDENTIFIER_TD_XPATH')
     
       # "Beratungsfolge"(list of sessions for this paper)
       # This is currently not parsed for scraping, but only for
       # gathering session-document ids for later exclusion
       found_documents = []
       rows = dom.xpath(self.xpath['SUBMISSION_DETAIL_AGENDA_ROWS'])
       for row in rows:
         # find forms
         formfields = row.xpath('.//input[@type="hidden"][@name="DT"]')
         for formfield in formfields:
           document_id = formfield.get('value')
           if document_id is not None:
             found_documents.append(document_id)
         # find links
         links = row.xpath('.//a[contains(@href,"getfile.")]')
         for link in links:
           if not link.xpath('.//img'):
             file_link = self.config.BASE_URL + link.get('href')
             document_id = file_link.split('id=')[1].split('&')[0]
             found_documents.append(document_id)
       # paper-related documents
       documents = []
       paper.document = []
       containers = dom.xpath(self.xpath['SUBMISSION_DETAIL_ATTACHMENTS'])
       for container in containers:
         try:
           classes = container.get('class').split(' ')
         except:
           continue
         if self.xpath['SUBMISSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME'] not in classes:
           continue
         rows = container.xpath('.//tr')
         for row in rows:
           # seems that we have direct links
           if not row.xpath('.//form'):
             links = row.xpath('.//a')
             for link in links:
               # ignore additional pdf icon links
               if not link.xpath('.//img'):
                 title = ' '.join(link.xpath('./text()')).strip()
                 file_link = self.config.BASE_URL + link.get('href')
                 document_id = file_link.split('id=')[1].split('&')[0]
                 if document_id in found_documents:
                   continue
                 document = Document(
                   identifier=document_id,
                   numeric_id=document_id,
                   title=title,
                   original_url=file_link)
                 document = self.get_document_file(document=document, link=file_link)
                 if 'Einladung' in title:
                   document_type = 'invitation'
                 elif 'Niederschrift' in title:
                   document_type = 'results_protocol'
                 else:
                   document_type = 'misc'
                 paper.document.append({'relation': document_type, 'document': document})
                 found_documents.append(document_id)
           # no direct link, so we have to handle forms
           else:
             forms = row.xpath('.//form')
             for form in forms:
               title = " ".join(row.xpath('./td/text()')).strip()
               for hidden_field in form.xpath('input[@name="DT"]'):
                 document_id = hidden_field.get('value')
                 if document_id in found_documents:
                   continue
                 document = Document(
                   identifier=document_id,
                   numeric_id=document_id,
                   title=title)
                 # Traversing the whole mechanize response to submit this form
                 for mform in mechanize_forms:
                   for control in mform.controls:
                     if control.name == 'DT' and control.value == document_id:
                       document = self.get_document_file(document=document, form=mform)
                       if 'Einladung' in title:
                         document_type = 'invitation'
                       elif 'Niederschrift' in title:
                         document_type = 'results_protocol'
                       else:
                         document_type = 'misc'
                       paper.document.append({'relation': document_type, 'document': document})
                       found_documents.append(document_id)
       if len(documents):
         paper.document = documents
       # forcing overwrite=True here
       oid = self.db.save_paper(paper)