Example #1
 def __init__(self, _id, content=None, **kwargs):
     self._id = str(_id)
     self.paper = Paper()
     page_data = col_paper.find_one({"url_id": self._id})
     if page_data:
         # already in the database; return immediately
         return
     if content is None:
         self.content = Downloader(host + self._id)()
         if self.content:
             self.valid = True
         else:
             logger.error("当前网页为空,无法进行解析\turl_id:" + self._id)
             self.valid = False
             return
     else:
         self.valid = True
         self.content = content
     self.selector = etree.HTML(self.content)
     self.paper.url_id = self._id
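
A minimal sketch of the dedup lookup used above, assuming col_paper is a pymongo collection as in the snippet (the database and collection names here are hypothetical):

    from pymongo import MongoClient

    col_paper = MongoClient()["aan"]["papers"]  # hypothetical names
    if col_paper.find_one({"url_id": "12345"}):
        pass  # already scraped; the constructor above returns early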
Example #2
  def get_paper(self, paper_url=None, paper_id=None):
    """
    Load paper details for the paper given by detail page URL
    or numeric ID
    """
    # NOTE: the URL is always derived from paper_id; a passed paper_url is ignored
    paper_url = '%svo020.asp?VOLFDNR=%s' % (self.config.BASE_URL, paper_id)
    logging.info("Getting paper %d from %s", paper_id, paper_url)

    # stupid re-try concept because AllRis sometimes drops the opening '<' of tags on the first request
    try_counter = 0
    while True:
      try:
        response = self.get_url(paper_url)
        if not response:
          return
        if "noauth" in response.url:
          logging.warn("Paper %s in %s seems to private" % (paper_id, paper_url))
          return
        text = self.preprocess_text(response.text)
        doc = html.fromstring(text)
        data = {}
        
        # check the Beratungsfolge table
        table = self.table_css(doc)[0] # let's hope we always have this table
        self.consultation_list_start = False
        last_headline = ''
        for line in table:
          if line.tag == 'tr':
            headline = line[0].text
          elif line.tag == 'td':
            headline = line.text
          else:
            logging.error("ERROR: Serious error in data table. Unable to parse.")
          if headline:
            headline = headline.split(":")[0].lower()
            if headline[-1]==":":
              headline = headline[:-1]
            if headline == "betreff":
              value = line[1].text_content().strip()
              value = value.split("-->")[1]         # there is some html comment with a script tag in front of the text which we remove
              data[headline] = " ".join(value.split())  # remove all multiple spaces from the string
            elif headline in ['verfasser', u'federführend', 'drucksache-art']:
              data[headline] = line[1].text.strip()
            elif headline in ['status']:
              data[headline] = line[1].text.strip()
              # related papers
              if len(line) > 2:
                if len(line[3]):
                  # gets identifier. is there something else at this position? (will break)
                  data['paper'] = [{'paper': Paper(numeric_id=line[3][0][0][1][0].get('href').split('=')[1].split('&')[0] , identifier=line[3][0][0][1][0].text)}]
                  
            elif headline == "beratungsfolge":
              # the actual list will be in the next row inside a table, so we only set a marker
              data = self.parse_consultation_list_headline(line, data) # for parser which have the consultation list here
            elif self.consultation_list_start:
              data = self.parse_consultation_list(line, data) # for parser which have the consultation list in the next tr
              self.consultation_list_start = False # set the marker to False again as we have read it
          last_headline = headline
          # we simply ignore the rest (there might not be much more actually)
        # the actual text comes after the table in a div, but it's not valid XML or HTML, thus the regex
        data['docs'] = self.body_re.findall(response.text)
        first_date = False
        for single_consultation in data['consultation']:
          if first_date:
            if single_consultation['date'] < first_date:
              first_date = single_consultation['date']
          else:
            first_date = single_consultation['date']
        
        paper = Paper(numeric_id = paper_id)
        paper.original_url = paper_url
        paper.title = data['betreff']
        paper.description = data['docs']
        paper.type = data['drucksache-art']
        if first_date:
          paper.date = first_date.strftime("%Y-%m-%d")
        if 'identifier' in data:
          paper.identifier = data['identifier']
        
        paper.document = []
        # get the attachments step 1 (Drucksache)
        document_1 = self.attachment_1_css(doc)
        if len(document_1):
          if document_1[0].value:
            href = '%sdo027.asp' % self.config.BASE_URL
            identifier = document_1[0].value
            title = 'Drucksache'
            document = Document(
              identifier=identifier,
              numeric_id=int(identifier),
              title=title)
            document = self.get_document_file(document, href, True)
            paper.document.append({'document': document, 'relation': 'misc'})
        # get the attachments step 2 (additional attachments)
        documents = self.attachments_css(doc)
        if len(documents) > 0:
          if len(documents[0]) > 1:
            if documents[0][1][0].text.strip() == "Anlagen:":
              for tr in documents[0][2:]:
                link = tr[0][0]
                href = "%s%s" % (self.config.BASE_URL, link.attrib["href"])
                title = link.text
                identifier = str(int(link.attrib["href"].split('/')[4]))
                document = Document(
                  identifier=identifier,
                  numeric_id=int(identifier),
                  title=title)
                document = self.get_document_file(document, href)
                paper.document.append({'document': document, 'relation': 'misc'})
        oid = self.db.save_paper(paper)
        return
      except (KeyError, IndexError):
        if try_counter < 3:
          logging.info("Try again: Getting paper %d from %s", paper_id, paper_url)
          try_counter += 1
        else:
          logging.error("Failed getting paper %d from %s", paper_id, paper_url)
          return
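
The re-try loop above can be factored into a small helper; a minimal sketch, assuming the flaky AllRis responses surface as KeyError/IndexError exactly as in the example:

    import logging

    def fetch_with_retries(fn, attempts=3, errors=(KeyError, IndexError)):
        # Call fn until it succeeds or attempts run out; mirrors the
        # while-loop above.
        for attempt in range(1, attempts + 1):
            try:
                return fn()
            except errors:
                logging.info("Try again (%d/%d)", attempt, attempts)
        logging.error("Failed after %d attempts", attempts)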
Example #3
 def get_paper(self, paper_url=None, paper_id=None):
   """
   Load paper details for the paper given by detail page URL
   or numeric ID
   """
    # Derive paper_url from paper_id, or vice versa
   if paper_id is not None:
     paper_url = self.urls['SUBMISSION_DETAIL_PRINT_PATTERN'] % paper_id
   elif paper_url is not None:
     parsed = parse.search(self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], paper_url)
     paper_id = parsed['paper_id']
 
   logging.info("Getting paper %d from %s", paper_id, paper_url)
   
   paper = Paper(numeric_id=paper_id)
   try_until = 1
   try_counter = 0
   try_found = False
   
   while (try_counter < try_until):
     try_counter += 1
     try_found = False
     time.sleep(self.config.WAIT_TIME)
     try:
       response = self.user_agent.open(paper_url)
      except urllib2.HTTPError as e:
       if e.code == 404:
         sys.stderr.write("URL not found (HTTP 404) error caught: %s\n" % paper_url)
         sys.stderr.write("Please check BASE_URL in your configuration.\n")
         sys.exit(1)
       elif e.code == 502:
         try_until = 4
         try_found = True
         if try_until == try_counter:
           logging.error("Permanent error in %s after 4 retrys.", paper_url)
           return
         else:
           logging.info("Original RIS Server Bug, restart scraping paper %s", paper_url)
     mechanize_forms = mechanize.ParseResponse(response, backwards_compat=False)
     response.seek(0)
     html = response.read()
     html = html.replace('&nbsp;', ' ')
     parser = etree.HTMLParser()
     dom = etree.parse(StringIO(html), parser)
      # Fetch the page again if an unknown, randomly occurring error is returned without any error message (observed in Duisburg, presumably a broken server config)
     try:
       page_title = dom.xpath('//h1')[0].text
       if 'Fehler' in page_title:
         try_until = 4
         try_found = True
         if try_until == try_counter:
           logging.error("Permanent error in %s after 3 retrys, proceed.", paper_url)
         else:
           logging.info("Original RIS Server Bug, restart scraping paper %s", paper_url)
     except:
       pass
      if not try_found:
       # check for page errors
       try:
         if 'Fehlermeldung' in page_title:
           logging.info("Page %s cannot be accessed due to server error", paper_url)
           return
         if 'Berechtigungsfehler' in page_title:
           logging.info("Page %s cannot be accessed due to permissions", paper_url)
           return
       except:
         pass
   
       paper.original_url = paper_url
       paper_related = []
       # Paper title
       try:
         stitle = dom.xpath(self.xpath['SUBMISSION_DETAIL_TITLE'])
         paper.title = stitle[0].text
       except:
         logging.critical('Cannot find paper title element using XPath SUBMISSION_DETAIL_TITLE')
         raise TemplateError('Cannot find paper title element using XPath SUBMISSION_DETAIL_TITLE')
     
       # Submission identifier, date, type etc
       tds = dom.xpath(self.xpath['SUBMISSION_DETAIL_IDENTIFIER_TD'])
       if len(tds) == 0:
         logging.critical('Cannot find table fields using XPath SUBMISSION_DETAIL_IDENTIFIER_TD')
         logging.critical('HTML Dump:' + html)
         raise TemplateError('Cannot find table fields using XPath SUBMISSION_DETAIL_IDENTIFIER_TD')
       else:
         current_category = None
         for n in range(0, len(tds)):
           try:
             tdcontent = tds[n].text.strip()
           except:
             continue
           if tdcontent == 'Name:':
             paper.identifier = tds[n + 1].text.strip()
           elif tdcontent == 'Art:':
             paper.type = tds[n + 1].text.strip()
           elif tdcontent == 'Datum:':
             paper.date = tds[n + 1].text.strip()
           elif tdcontent == 'Betreff:':
             paper.subject = '; '.join(tds[n + 1].xpath('./text()'))
           elif tdcontent == 'Aktenzeichen:':
             paper.reference_number = tds[n + 1].text.strip()
           elif tdcontent == 'Referenzvorlage:':
             link = tds[n + 1].xpath('a')[0]
             href = link.get('href')
             parsed = parse.search(self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
             superordinated_paper = Paper(numeric_id=parsed['paper_id'], identifier=link.text.strip())
             paper_related.append({ 'relation': 'superordinated',
                             'paper':  superordinated_paper})
             # add superordinate paper to queue
             if hasattr(self, 'paper_queue'):
               self.paper_queue.add(parsed['paper_id'])
           # subordinate papers are added to the queue
           elif tdcontent == 'Untergeordnete Vorlage(n):':
             current_category = 'subordinates'
             for link in tds[n + 1].xpath('a'):
               href = link.get('href')
               parsed = parse.search(self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
               subordinated_paper = Paper(numeric_id=parsed['paper_id'], identifier=link.text.strip())
               paper_related.append({ 'relation': 'subordinated',
                             'paper':  subordinated_paper})
               if hasattr(self, 'paper_queue') and parsed is not None:
                 # add subordinate paper to queue
                 self.paper_queue.add(parsed['paper_id'])
           else:
             if current_category == 'subordinates' and len(tds) > n+1:
               for link in tds[n + 1].xpath('a'):
                 href = link.get('href')
                 parsed = parse.search(self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
                 subordinated_paper = Paper(numeric_id=parsed['paper_id'], identifier=link.text.strip())
                 paper_related.append({ 'relation': 'subordinated',
                               'paper':  subordinated_paper})
                 if hasattr(self, 'paper_queue') and parsed is not None:
                   self.paper_queue.add(parsed['paper_id'])
         if len(paper_related):
           paper.paper = paper_related
         if not hasattr(paper, 'identifier'):
            logging.critical('Cannot find paper identifier using SUBMISSION_DETAIL_IDENTIFIER_TD')
            raise TemplateError('Cannot find paper identifier using SUBMISSION_DETAIL_IDENTIFIER_TD')
     
       # "Beratungsfolge"(list of sessions for this paper)
       # This is currently not parsed for scraping, but only for
       # gathering session-document ids for later exclusion
       found_documents = []
       rows = dom.xpath(self.xpath['SUBMISSION_DETAIL_AGENDA_ROWS'])
       for row in rows:
         # find forms
         formfields = row.xpath('.//input[@type="hidden"][@name="DT"]')
         for formfield in formfields:
           document_id = formfield.get('value')
           if document_id is not None:
             found_documents.append(document_id)
         # find links
         links = row.xpath('.//a[contains(@href,"getfile.")]')
         for link in links:
           if not link.xpath('.//img'):
             file_link = self.config.BASE_URL + link.get('href')
             document_id = file_link.split('id=')[1].split('&')[0]
             found_documents.append(document_id)
        # paper-related documents
        paper.document = []
       containers = dom.xpath(self.xpath['SUBMISSION_DETAIL_ATTACHMENTS'])
       for container in containers:
         try:
           classes = container.get('class').split(' ')
         except:
           continue
         if self.xpath['SUBMISSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME'] not in classes:
           continue
         rows = container.xpath('.//tr')
         for row in rows:
           # seems that we have direct links
           if not row.xpath('.//form'):
             links = row.xpath('.//a')
             for link in links:
               # ignore additional pdf icon links
               if not link.xpath('.//img'):
                 title = ' '.join(link.xpath('./text()')).strip()
                 file_link = self.config.BASE_URL + link.get('href')
                 document_id = file_link.split('id=')[1].split('&')[0]
                 if document_id in found_documents:
                   continue
                 document = Document(
                   identifier=document_id,
                   numeric_id=document_id,
                   title=title,
                   original_url=file_link)
                 document = self.get_document_file(document=document, link=file_link)
                 if 'Einladung' in title:
                   document_type = 'invitation'
                 elif 'Niederschrift' in title:
                   document_type = 'results_protocol'
                 else:
                   document_type = 'misc'
                 paper.document.append({'relation': document_type, 'document': document})
                 found_documents.append(document_id)
           # no direct link, so we have to handle forms
           else:
             forms = row.xpath('.//form')
             for form in forms:
               title = " ".join(row.xpath('./td/text()')).strip()
               for hidden_field in form.xpath('input[@name="DT"]'):
                 document_id = hidden_field.get('value')
                 if document_id in found_documents:
                   continue
                 document = Document(
                   identifier=document_id,
                   numeric_id=document_id,
                   title=title)
                 # Traversing the whole mechanize response to submit this form
                 for mform in mechanize_forms:
                   for control in mform.controls:
                     if control.name == 'DT' and control.value == document_id:
                       document = self.get_document_file(document=document, form=mform)
                       if 'Einladung' in title:
                         document_type = 'invitation'
                       elif 'Niederschrift' in title:
                         document_type = 'results_protocol'
                       else:
                         document_type = 'misc'
                       paper.document.append({'relation': document_type, 'document': document})
                       found_documents.append(document_id)
       # forcing overwrite=True here
       oid = self.db.save_paper(paper)
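
The parse.search calls above extract the numeric ID from a detail URL; a minimal sketch with a hypothetical pattern standing in for SUBMISSION_DETAIL_PARSE_PATTERN:

    import parse

    pattern = "vo020.asp?VOLFDNR={paper_id:d}"  # hypothetical pattern
    result = parse.search(pattern, "http://example.org/bi/vo020.asp?VOLFDNR=10822")
    print(result["paper_id"])  # 10822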
Example #4
  def get_paper(self, paper_url=None, paper_id=None):
   """
   Load paper details for the paper given by detail page URL
   or numeric ID
   """
    # Derive paper_url from paper_id, or vice versa
   if paper_id is not None:
     paper_url = self.urls['PAPER_DETAIL_PRINT_PATTERN'] % (self.config["scraper"]["base_url"], paper_id)
   elif paper_url is not None:
     parsed = parse.search(self.urls['PAPER_DETAIL_PARSE_PATTERN'], paper_url)
     paper_id = parsed['paper_id']
 
   logging.info("Getting paper %d from %s", paper_id, paper_url)
   
   paper = Paper(originalId=paper_id)
   try_until = 1
   try_counter = 0
   try_found = False
   
   while (try_counter < try_until):
     try_counter += 1
     try_found = False
     time.sleep(self.config['scraper']['wait_time'])
     try:
       response = self.user_agent.open(paper_url)
      except urllib2.HTTPError as e:
       if e.code == 404:
         sys.stderr.write("URL not found (HTTP 404) error caught: %s\n" % paper_url)
         sys.stderr.write("Please check BASE_URL in your configuration.\n")
         sys.exit(1)
       elif e.code == 502 or e.code == 500:
         try_until = 4
         try_found = True
         if try_until == try_counter:
           logging.error("Permanent error in %s after 4 retrys.", paper_url)
           return
         else:
           logging.info("Original RIS Server Bug, restart fetching paper %s", paper_url)
     if not response:
       return
     mechanize_forms = mechanize.ParseResponse(response, backwards_compat=False)
     response.seek(0)
     html = response.read()
     html = html.replace('&nbsp;', ' ')
     parser = etree.HTMLParser()
     dom = etree.parse(StringIO(html), parser)
      # Fetch the page again if an unknown, randomly occurring error is returned without any error message (observed in Duisburg, presumably a broken server config)
     try:
       page_title = dom.xpath('//h1')[0].text
       if 'Fehler' in page_title:
         try_until = 4
         try_found = True
         if try_until == try_counter:
           logging.error("Permanent error in %s after 3 retrys, proceed.", paper_url)
         else:
           logging.info("Original RIS Server Bug, restart scraping paper %s", paper_url)
     except:
       pass
      if not try_found:
       # check for page errors
       try:
         if 'Fehlermeldung' in page_title:
           logging.info("Page %s cannot be accessed due to server error", paper_url)
           return
         if 'Berechtigungsfehler' in page_title:
           logging.info("Page %s cannot be accessed due to permissions", paper_url)
           return
       except:
         pass
   
       paper.originalUrl = paper_url
       superordinated_papers = []
       subordinated_papers = []
       
       # Paper title
       try:
         stitle = dom.xpath(self.xpath['PAPER_DETAIL_TITLE'])
         paper.title = stitle[0].text
       except:
         logging.critical('Cannot find paper title element using XPath PAPER_DETAIL_TITLE')
         raise TemplateError('Cannot find paper title element using XPath PAPER_DETAIL_TITLE')
     
       # Paper identifier, date, type etc
       tds = dom.xpath(self.xpath['PAPER_DETAIL_IDENTIFIER_TD'])
       if len(tds) == 0:
         logging.critical('Cannot find table fields using XPath PAPER_DETAIL_IDENTIFIER_TD')
         logging.critical('HTML Dump:' + html)
         raise TemplateError('Cannot find table fields using XPath PAPER_DETAIL_IDENTIFIER_TD')
       else:
         current_category = None
         for n in range(0, len(tds)):
           try:
             tdcontent = tds[n].text.strip()
           except:
             continue
           if tdcontent == 'Name:':
             paper.nameShort = tds[n + 1].text.strip()
            # TODO: dereference paper type strings
           elif tdcontent == 'Art:':
             paper.paperType = tds[n + 1].text.strip()
           elif tdcontent == 'Datum:':
             paper.publishedDate = tds[n + 1].text.strip()
           elif tdcontent == 'Betreff:':
             paper.name = '; '.join(tds[n + 1].xpath('./text()'))
           elif tdcontent == 'Aktenzeichen:':
             paper.reference = tds[n + 1].text.strip()
           elif tdcontent == 'Referenzvorlage:':
             link = tds[n + 1].xpath('a')[0]
             href = link.get('href')
             parsed = parse.search(self.urls['PAPER_DETAIL_PARSE_PATTERN'], href)
             superordinated_paper = Paper(originalId=parsed['paper_id'], nameShort=link.text.strip())
             superordinated_papers.append(superordinated_paper)
             # add superordinate paper to queue
             if hasattr(self, 'paper_queue'):
               self.paper_queue.add(parsed['paper_id'])
           # subordinate papers are added to the queue
           elif tdcontent == 'Untergeordnete Vorlage(n):':
             current_category = 'subordinates'
             for link in tds[n + 1].xpath('a'):
               href = link.get('href')
               parsed = parse.search(self.urls['PAPER_DETAIL_PARSE_PATTERN'], href)
               subordinated_paper = Paper(originalId=parsed['paper_id'], nameShort=link.text.strip())
               subordinated_papers.append(subordinated_paper)
               if hasattr(self, 'paper_queue') and parsed is not None:
                 # add subordinate paper to queue
                 self.paper_queue.add(parsed['paper_id'])
            elif tdcontent == u'Anträge zur Vorlage:':
              # TODO: unclear what this field holds
              current_category = 'todo'
           else:
             if current_category == 'subordinates' and len(tds) > n+1:
               for link in tds[n + 1].xpath('a'):
                 href = link.get('href')
                 parsed = parse.search(self.urls['PAPER_DETAIL_PARSE_PATTERN'], href)
                 subordinated_paper = Paper(originalId=parsed['paper_id'], nameShort=link.text.strip())
                 subordinated_papers.append(subordinated_paper)
                 if hasattr(self, 'paper_queue') and parsed is not None:
                   self.paper_queue.add(parsed['paper_id'])
         if len(subordinated_papers):
           paper.subordinatedPaper = subordinated_papers
         if len(superordinated_papers):
           paper.superordinatedPaper = superordinated_papers
         if not hasattr(paper, 'originalId'):
            logging.critical('Cannot find paper identifier using PAPER_DETAIL_IDENTIFIER_TD')
            raise TemplateError('Cannot find paper identifier using PAPER_DETAIL_IDENTIFIER_TD')
     
       # "Beratungsfolge"(list of sessions for this paper)
       # This is currently not parsed for scraping, but only for
       # gathering session-document ids for later exclusion
        found_files = []  # already renamed: found_files, files; TODO: rename remaining document_* variables
       rows = dom.xpath(self.xpath['PAPER_DETAIL_AGENDA_ROWS'])
       for row in rows:
         # find forms
         formfields = row.xpath('.//input[@type="hidden"][@name="DT"]')
         for formfield in formfields:
           file_id = formfield.get('value')
           if file_id is not None:
             found_files.append(file_id)
         # find links
         links = row.xpath('.//a[contains(@href,"getfile.")]')
         for link in links:
           if not link.xpath('.//img'):
             file_link = self.config['scraper']['base_url'] + link.get('href')
             file_id = file_link.split('id=')[1].split('&')[0]
             found_files.append(file_id)
       # paper-related documents
       files = []
       containers = dom.xpath(self.xpath['PAPER_DETAIL_FILES'])
       for container in containers:
         try:
           classes = container.get('class').split(' ')
         except:
           continue
         if self.xpath['PAPER_DETAIL_FILES_CONTAINER_CLASSNAME'] not in classes:
           continue
         rows = container.xpath('.//tr')
         for row in rows:
           # seems that we have direct links
           if not row.xpath('.//form'):
             links = row.xpath('.//a')
             for link in links:
               # ignore additional pdf icon links
               if not link.xpath('.//img'):
                 name = ' '.join(link.xpath('./text()')).strip()
                 file_link = self.config['scraper']['base_url'] + link.get('href')
                 file_id = file_link.split('id=')[1].split('&')[0]
                 if file_id in found_files:
                   continue
                 file = File(
                   originalId=file_id,
                   name=name,
                   originalUrl=file_link,
                   originalDownloadPossible = True)
                 file = self.get_file(file=file, link=file_link)
                 files.append(file)
                 found_files.append(file_id)
                 
           # no direct link, so we have to handle forms
           else:
             forms = row.xpath('.//form')
             for form in forms:
               name = " ".join(row.xpath('./td/text()')).strip()
               for hidden_field in form.xpath('input[@name="DT"]'):
                 file_id = hidden_field.get('value')
                 if file_id in found_files:
                   continue
                 file = File(
                   originalId=file_id,
                   name=name,
                   originalDownloadPossible = False)
                 # Traversing the whole mechanize response to submit this form
                 for mform in mechanize_forms:
                   for control in mform.controls:
                     if control.name == 'DT' and control.value == file_id:
                       file = self.get_file(file=file, form=mform)
                       files.append(file)
                       found_files.append(file_id)
       if len(files):
         paper.mainFile = files[0]
       if len(files) > 1:
         paper.auxiliaryFile = files[1:]
       oid = self.db.save_paper(paper)
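
The form handling above walks mechanize's parsed forms looking for the hidden DT field; a minimal sketch of that pattern (the URL is hypothetical):

    import logging
    import mechanize

    response = mechanize.urlopen("http://example.org/ris/vo020.asp")
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    for form in forms:
        for control in form.controls:
            if control.name == "DT":  # hidden field carrying the file id
                logging.info("found file id %s", control.value)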
Example #5
    def get_paper(self, paper_url=None, paper_id=None):
        """
        Load paper details for the paper given by detail page URL
        or numeric ID
        """
        # NOTE: the URL is always derived from paper_id; a passed
        # paper_url is ignored.
        paper_url = ('%svo020.asp?VOLFDNR=%s' %
                     (self.config['scraper']['base_url'], paper_id))
        logging.info("Getting paper %d from %s", paper_id, paper_url)

        # Stupid re-try concept because AllRis sometimes drops the
        # opening '<' of tags on the first request.
        try_counter = 0
        while True:
            try:
                response = self.get_url(paper_url)
                if not response:
                    return
                if "noauth" in response.url:
                    logging.warn("Paper %s in %s seems to private", paper_id,
                                 paper_url)
                    return
                text = response.text
                doc = html.fromstring(text)
                data = {}

                # check the Beratungsfolge table
                # let's hope we always have this table
                table = self.table_css(doc)[0]
                self.consultation_list_start = False
                last_headline = ''
                for line in table:
                    if line.tag == 'tr':
                        headline = line[0].text
                    elif line.tag == 'td':
                        headline = line.text
                    else:
                        logging.error("ERROR: Serious error in data table. "
                                      "Unable to parse.")
                    if headline:
                        headline = headline.split(":")[0].lower()
                        if headline[-1] == ":":
                            headline = headline[:-1]
                        if headline == "betreff":
                            value = line[1].text_content().strip()
                            # There is some html comment with a script
                            # tag in front of the text which we remove.
                            value = value.split("-->")[1]
                            # remove all multiple spaces from the string
                            data[headline] = " ".join(value.split())
                        elif headline in [
                                'verfasser', u'federführend', 'drucksache-art'
                        ]:
                            data[headline] = line[1].text.strip()
                        elif headline in ['status']:
                            data[headline] = line[1].text.strip()
                            # related papers
                            if len(line) > 2:
                                if len(line[3]):
                                    # Gets originalId. is there something
                                    # else at this position? (will break)
                                    paper_id = line[3][0][0][1][0].get(
                                        'href').split('=')[1].split('&')[0]
                                    data['relatedPaper'] = [
                                        Paper(originalId=paper_id)
                                    ]

                        # Lots of scraping just because of the date (?)
                        elif headline == "beratungsfolge":
                            # The actual list will be in the next row
                            # inside a table, so we only set a marker.
                            self.consultation_list_start = True
                        elif self.consultation_list_start:
                            elem = line[0][0]
                            # The first line is pixel images, so skip
                            # it, then we need to jump in steps of two.
                            amount = (len(elem) - 1) / 2
                            consultations = []
                            date_list = []
                            i = 0
                            item = None
                            for elem_line in elem:
                                if i == 0:
                                    i += 1
                                    continue
                                """
                                Here we need to parse the actual list which can have different forms. A complex example
                                can be found at http://ratsinfo.aachen.de/bi/vo020.asp?VOLFDNR=10822
                                The first line is some sort of headline with the committee in question and the type of consultation.
                                After that, 0-n lines of detailed information on meetings with a date, transcript and decision.
                                The first line has 3 columns (thanks to colspan) and the others have 7.

                                Here we make every meeting a separate entry, we can group them together later again if we want to.
                                """

                                # now we need to parse the actual list
                                # those lists
                                new_consultation = Consultation()
                                new_consultation.status = \
                                        elem_line[0].attrib['title'].lower()
                                if len(elem_line) == 3:
                                    # The order is "color/status", name of
                                    # committee / link to TOP, more info we
                                    # define a head dict here which can be
                                    # shared for the other lines once we find
                                    # another head line we will create a new
                                    # one here.
                                    new_consultation.role = \
                                            elem_line[2].text.strip()

                                    # Name of committee, e.g.
                                    # "Finanzausschuss", unfort. without id
                                    #'committee' : elem_line[1].text.strip(),
                                # For some obscure reasons sometimes action
                                # is missing.
                                elif len(elem_line) == 2:
                                    # The order is "color/status", name of
                                    # committee / link to TOP, more info.
                                    status = \
                                            elem_line[0].attrib['title'].lower()
                                    # We define a head dict here which can be
                                    # shared for the other lines once we find
                                    # another head line we will create a new
                                    # one here.
                                    # name of committee, e.g.
                                    # "Finanzausschuss", unfort. without id
                                    #'committee' : elem_line[1].text.strip(),
                                elif len(elem_line) == 7:
                                    try:
                                        # This is about line 2 with lots of
                                        # more stuff to process.
                                        # Date can be text or a link with that
                                        # text.
                                        # We have a link (and ignore it).
                                        if len(elem_line[1]) == 1:
                                            date_text = elem_line[1][0].text
                                        else:
                                            date_text = elem_line[1].text
                                        date_list.append(
                                            datetime.datetime.strptime(
                                                date_text.strip(), "%d.%m.%Y"))
                                        if len(elem_line[2]):
                                            # Form with silfdnr and toplfdnr
                                            # but only in link (action=
                                            #   "to010.asp?topSelected=57023")
                                            form = elem_line[2][0]
                                            meeting_id = form[0].attrib[
                                                'value']
                                            new_consultation.meeting = [
                                                Meeting(originalId=meeting_id)
                                            ]
                                            # Full name of meeting, e.g.
                                            # "A/31/WP.16 öffentliche/
                                            #   nichtöffentliche Sitzung des
                                            # Finanzausschusses"
                                            #item['meeting'] = \
                                            #    elem_line[3][0].text.strip()
                                        else:
                                            # No link to TOP. Should not be
                                            # possible but happens.
                                            #   (TODO: Bugreport?)
                                            # Here we have no link but the text
                                            # is in the TD directly - will be
                                            # scaped as meeting.
                                            #item['meeting'] = \
                                            #    elem_line[3].text.strip()
                                            logging.warn(
                                                "AgendaItem in consultation "
                                                "list on the web page does not "
                                                "contain a link to the actual "
                                                "meeting at paper %s",
                                                paper_url)
                                        toplfdnr = None
                                        if len(elem_line[6]) > 0:
                                            form = elem_line[6][0]
                                            toplfdnr = form[0].attrib['value']
                                        if toplfdnr:
                                            new_consultation.originalId = \
                                                    "%s-%s" % (toplfdnr,
                                                               paper_id)
                                            # actually the id of the transcript
                                            new_consultation.agendaItem = \
                                                    AgendaItem(
                                                        originalId=toplfdnr)
                                            # e.g. "ungeändert beschlossen"
                                            new_consultation.agendaItem.result \
                                                    = elem_line[4].text.strip()
                                            consultations.append(
                                                new_consultation)
                                        else:
                                            logging.error(
                                                "missing agendaItem ID in "
                                                "consultation list at %s",
                                                paper_url)
                                    except (IndexError, KeyError):
                                        logging.error(
                                            "Serious error in consultation "
                                            "list. Unable to parse.")
                                        return []
                                i += 1
                            # Theory: we don't need this at all, because it's
                            # scraped at meeting.
                            #data['consultations'] = consultations
                            # set the marker to False again as we have read it
                            self.consultation_list_start = False
                    last_headline = headline
                    # We simply ignore the rest (there might not be much more
                    # actually).
                # The actual text comes after the table in a div, but it's
                # not valid XML or HTML, thus the regex.
                data['docs'] = self.body_re.findall(response.text)
                first_date = False
                for single_date in date_list:
                    if first_date:
                        if single_date < first_date:
                            first_date = single_date
                    else:
                        first_date = single_date
                paper = Paper(originalId=paper_id)
                paper.originalUrl = paper_url
                paper.name = data['betreff']
                paper.description = data['docs']
                if 'drucksache-art' in data:
                    paper.paperType = data['drucksache-art']
                if first_date:
                    paper.publishedDate = first_date.strftime("%d.%m.%Y")
                # see theory above
                #if 'consultations' in data:
                #    paper.consultation = data['consultations']
                paper.auxiliaryFile = []
                # get the attachments step 1 (Drucksache)
                file_1 = self.attachment_1_css(doc)
                if len(file_1):
                    if file_1[0].value:
                        href = ('%sdo027.asp' %
                                self.config['scraper']['base_url'])
                        original_id = file_1[0].value
                        name = 'Drucksache'
                        main_file = File(originalId=original_id, name=name)
                        main_file = self.get_file(main_file, href, True)
                        paper.mainFile = main_file
                # get the attachments step 2 (additional attachments)
                files = self.attachments_css(doc)
                if len(files) > 0:
                    if len(files[0]) > 1:
                        if files[0][1][0].text.strip() == "Anlagen:":
                            for tr in files[0][2:]:
                                link = tr[0][0]
                                href = ("%s%s" %
                                        (self.config['scraper']['base_url'],
                                         link.attrib["href"]))
                                name = link.text
                                path_tokens = link.attrib["href"].split('/')
                                original_id = "%d-%d" % (int(
                                    path_tokens[4]), int(path_tokens[6]))
                                aux_file = File(originalId=original_id,
                                                name=name)
                                aux_file = self.get_file(aux_file, href)
                                paper.auxiliaryFile.append(aux_file)
                logging.debug("auxiliary files: %s", paper.auxiliaryFile)
                if not len(paper.auxiliaryFile):
                    del paper.auxiliaryFile
                oid = self.db.save_paper(paper)
                return
            except (KeyError, IndexError):
                if try_counter < 3:
                    logging.info("Try again: Getting paper %d from %s",
                                 paper_id, paper_url)
                    try_counter += 1
                else:
                    logging.error("Failed getting paper %d from %s", paper_id,
                                  paper_url)
                    return
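
The first_date scan above is equivalent to taking the minimum of the collected dates; a sketch assuming date_list holds datetime objects parsed with "%d.%m.%Y" as in the example:

    first_date = min(date_list) if date_list else None
    if first_date:
        paper.publishedDate = first_date.strftime("%d.%m.%Y")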
Example #6
    def get_meeting(self, meeting_url=None, meeting_id=None):
        """ Load meeting details (e.g. agendaitems) for the given detail page
        URL or numeric ID
        """
        meeting_url = ("%sto010.asp?selfaction=ws&template=xyz&SILFDNR=%s" %
                       (self.config['scraper']['base_url'], meeting_id))

        logging.info("Getting meeting %d from %s", meeting_id, meeting_url)

        r = self.get_url(meeting_url)
        if not r:
            return
        # If r.history has an item we have a problem
        if len(r.history):
            if r.history[0].status_code == 302:
                logging.info("Meeting %d from %s seems to be private",
                             meeting_id, meeting_url)
            else:
                logging.error(
                    "Strange redirect %d from %s with status code %s",
                    meeting_id, meeting_url, r.history[0].status_code)
            return
        h = HTMLParser.HTMLParser()
        xml = str(r.text.encode('ascii', 'xmlcharrefreplace'))
        parser = etree.XMLParser(recover=True)
        root = etree.fromstring(xml, parser=parser)

        meeting = Meeting(originalId=meeting_id)

        # special area
        special = {}
        for item in root[0].iterchildren():
            special[item.tag] = item.text
        # Where do we get the date from? Only via the overview?
        #if 'sisb' in special:
        #if 'sise' in special:
        if 'saname' in special:
            meeting.type = special['saname']
        # head area
        head = {}
        for item in root[1].iterchildren():
            if item.text:
                head[item.tag] = h.unescape(item.text)
            else:
                head[item.tag] = ''
        if 'sitext' in head:
            meeting.name = head['sitext']
        if 'raname' in head:
            meeting.room = head['raname']
        if 'raort' in head:
            meeting.address = head['raort']
        agendaitems = []

        for item in root[2].iterchildren():
            elem = {}
            for e in item.iterchildren():
                elem[e.tag] = e.text

            section = [elem['tofnum'], elem['tofunum'], elem['tofuunum']]
            section = [x for x in section if x != "0"]
            elem['section'] = ".".join(section)
            agendaitem = AgendaItem()

            agendaitem.originalId = int(elem['tolfdnr'])
            agendaitem.public = (elem['toostLang'] == u'öffentlich')
            #agendaitem.name = elem['totext1']
            # get agenda detail page
            # TODO: Own Queue
            time.sleep(self.config['scraper']['wait_time'])
            agendaitem_url = (
                '%sto020.asp?selfaction=ws&template=xyz&TOLFDNR=%s' %
                (self.config['scraper']['base_url'], agendaitem.originalId))
            logging.info("Getting agendaitem %d from %s",
                         agendaitem.originalId, agendaitem_url)

            agendaitem_r = self.get_url(agendaitem_url)
            if not agendaitem_r:
                return

            if len(agendaitem_r.history):
                logging.info("Agenda item %d from %s seems to be private",
                             meeting_id, meeting_url)
            else:
                agendaitem_xml = agendaitem_r.text.encode(
                    'ascii', 'xmlcharrefreplace')
                agendaitem_parser = etree.XMLParser(recover=True)
                agendaitem_root = etree.fromstring(agendaitem_xml,
                                                   parser=agendaitem_parser)
                add_agenda_item = {}
                for add_item in agendaitem_root[0].iterchildren():
                    if add_item.tag == "rtfWP" and len(add_item) > 0:
                        try:
                            agendaitem.resolution_text = h.unescape(
                                etree.tostring(add_item[0][1][0]))
                        except:
                            logging.warn(
                                "Unable to parse resolution text at "
                                "%s", agendaitem_url)
                    else:
                        if add_item.text:
                            add_agenda_item[add_item.tag] = h.unescape(
                                add_item.text)
                if 'toptext' in add_agenda_item:
                    agendaitem.name = add_agenda_item['toptext']

                # there are papers with id = 0. we don't need them.
                if int(elem['volfdnr']):
                    consult_id = (unicode(agendaitem.originalId) +
                                  unicode(int(elem['volfdnr'])))
                    consultation = Consultation(originalId=consult_id)
                    paper_id = int(elem['volfdnr'])
                    if 'voname' in add_agenda_item:
                        consultation.paper = Paper(
                            originalId=paper_id,
                            name=add_agenda_item['voname'])
                    else:
                        consultation.paper = Paper(originalId=paper_id)
                    agendaitem.consultation = [consultation]
                    if 'vobetr' in add_agenda_item:
                        if add_agenda_item['vobetr'] != agendaitem.name:
                            logging.warn(
                                "different values for name: %s and %s",
                                agendaitem.name, add_agenda_item['vobetr'])
                    if hasattr(self, 'paper_queue'):
                        self.paper_queue.add(int(elem['volfdnr']))
                if 'totyp' in add_agenda_item:
                    agendaitem.result = add_agenda_item['totyp']
                agendaitems.append(agendaitem)
        meeting.agendaItem = agendaitems

        oid = self.db.save_meeting(meeting)
        logging.info("Meeting %d stored with _id %s", meeting_id, oid)
Example #7
class PaperPage(object):
    valid = False  # validity of the page data; if invalid, skip parsing and storage

    def __init__(self, _id, content=None, **kwargs):
        self._id = str(_id)
        self.paper = Paper()
        page_data = col_paper.find_one({"url_id": self._id})
        if page_data:
            # already in the database; return immediately
            return
        if content is None:
            self.content = Downloader(host + self._id)()
            if self.content:
                self.valid = True
            else:
                logger.error("当前网页为空,无法进行解析\turl_id:" + self._id)
                self.valid = False
                return
        else:
            self.valid = True
            self.content = content
        self.selector = etree.HTML(self.content)
        self.paper.url_id = self._id

    def run(self):
        if not self.valid:
            logger.info("该paper已存在\turl_id:" + self._id)
            return
        self.main_page()
        self.get_in_citation()
        if len(self.paper.in_citations) < 1:
            logger.info("该paper参考文献小于1,已进行排除\turl_id:" + self._id)
            return
        self.get_citing_sentence()
        self.get_out_citation()
        self.paper.save()
        logger.info("完成解析\turl:%s\tpaper_id:%s" % (self._id, self.paper._id))

    def main_page(self):
        try:
            self.paper.year = to_int(
                deep_select(self.selector, 0, "//table//tr[5]/td/text()"))
            # if self.paper.year < 2013:
            #     # filter the dataset
            #     return
            # list all authors
            authors = deep_select(self.selector,
                                  return_type="list",
                                  xpath="//table//tr[6]/td//a/@href")
            if not authors:
                # if the paper has no authors, discard the record
                self.valid = False
                return
            authors_id = [to_num(x) for x in authors]
            self.paper.authors_full_name = deep_select(
                self.selector,
                return_type="list",
                xpath="//table//tr[6]/td//a/text()")
            self.paper.authors = authors_id
            self.paper._id = deep_select(self.selector, 0,
                                         "//table//tr[1]/td/text()")
            if not self.paper._id:
                self.valid = False
                return
            self.paper.title = deep_select(self.selector, 0,
                                           "//table//tr[2]/td/text()")
            self.paper.venue = deep_select(self.selector, 0,
                                           "//table//tr[3]/td/text()")
            self.paper.session = deep_select(self.selector, 0,
                                             "//table//tr[4]/td/text()")

            self.paper.abstract = clean(
                deep_select(self.selector, 0,
                            '//div[@id="abstract"]/p/text()'))
        except Exception as e:
            logger.error("id:%s\t%s" % (self._id, e))

    def get_out_citation(self):
        if not self.valid:
            return
        self.selector = etree.HTML(
            Downloader('http://aan.how/browse/outgoing_citations/' +
                       self._id)())
        out_citations = deep_select(self.selector,
                                    return_type="list",
                                    xpath='//a/@href')
        if out_citations:
            self.paper.out_citations = [to_num(x) for x in out_citations]

    def get_in_citation(self):
        if not self.valid:
            return
        self.selector = etree.HTML(
            Downloader('http://aan.how/browse/incoming_citations/' +
                       self._id)())
        in_citations = deep_select(self.selector,
                                   return_type="list",
                                   xpath='//a/@href')
        if in_citations:
            self.paper.in_citations = [to_num(x) for x in in_citations]

    def get_citing_sentence(self):
        if not self.valid:
            return
        self.selector = etree.HTML(
            Downloader('http://aan.how/browse/citing_sentences/' + self._id)())
        paper_id = deep_select(self.selector,
                               return_type="list",
                               xpath='//a/text()')
        sentence = deep_select(self.selector,
                               return_type="list",
                               xpath="//tr/td[4]/div/text()")
        line = deep_select(self.selector,
                           return_type="list",
                           xpath="//tr/td[3]/text()")
        if paper_id and sentence:
            for x in range(len(paper_id)):
                citing_sentences = {
                    "paper_id": paper_id[x],
                    "sentence": clean(sentence[x]),
                    "line": line[x]
                }
                self.paper.citing_sentences.append(citing_sentences)
    def get_paper(self, paper_url=None, paper_id=None):
        """
        Load paper details for the paper given by detail page URL
        or numeric ID
        """
        # NOTE: the URL is always derived from paper_id; a passed
        # paper_url is ignored.
        paper_url = ('%svo020.asp?VOLFDNR=%s'
                     % (self.config['scraper']['base_url'], paper_id))
        logging.info("Getting paper %d from %s", paper_id, paper_url)

        # Stupid re-try concept because AllRis sometimes drops the
        # opening '<' of tags on the first request.
        try_counter = 0
        while True:
            try:
                response = self.get_url(paper_url)
                if not response:
                    return
                if "noauth" in response.url:
                    logging.warn("Paper %s in %s seems to private",
                                 paper_id, paper_url)
                    return
                text = response.text
                doc = html.fromstring(text)
                data = {}

                # check the Beratungsfolge table
                # let's hope we always have this table
                table = self.table_css(doc)[0]
                self.consultation_list_start = False
                last_headline = ''
                for line in table:
                    if line.tag == 'tr':
                        headline = line[0].text
                    elif line.tag == 'td':
                        headline = line.text
                    else:
                        logging.error("ERROR: Serious error in data table. "
                                      "Unable to parse.")
                    if headline:
                        headline = headline.split(":")[0].lower()
                        if headline[-1] == ":":
                            headline = headline[:-1]
                        if headline == "betreff":
                            value = line[1].text_content().strip()
                            # There is some html comment with a script
                            # tag in front of the text which we remove.
                            value = value.split("-->")[1]
                            # remove all multiple spaces from the string
                            data[headline] = " ".join(value.split())
                        elif headline in ['verfasser', u'federführend',
                                          'drucksache-art']:
                            data[headline] = line[1].text.strip()
                        elif headline in ['status']:
                            data[headline] = line[1].text.strip()
                            # related papers
                            if len(line) > 2:
                                if len(line[3]):
                                    # Gets originalId. is there something
                                    # else at this position? (will break)
                                    paper_id = line[3][0][0][1][0].get(
                                        'href').split('=')[1].split('&')[0]
                                    data['relatedPaper'] = [Paper(
                                        originalId=paper_id)]

                        # Lots of scraping just because of the date (?)
                        elif headline == "beratungsfolge":
                            # The actual list will be in the next row
                            # inside a table, so we only set a marker.
                            self.consultation_list_start = True
                        elif self.consultation_list_start:
                            elem = line[0][0]
                            # The first child line is pixel images, so we
                            # skip it below.
                            consultations = []
                            i = 0
                            for elem_line in elem:
                                if i == 0:
                                    i += 1
                                    continue

                                """
                                Here we need to parse the actual list which can have different forms. A complex example
                                can be found at http://ratsinfo.aachen.de/bi/vo020.asp?VOLFDNR=10822
                                The first line is some sort of headline with the committee in question and the type of consultation.
                                After that 0-n lines of detailed information of meetings with a date, transscript and decision.
                                The first line has 3 columns (thanks to colspan) and the others have 7.

                                Here we make every meeting a separate entry, we can group them together later again if we want to.
                                """

                                # now we need to parse the actual list
                                # those lists
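                                # Row shapes (assumed from the column counts
                                # handled below):
                                #   3 cells: status | committee/link | role
                                #   7 cells: status | date | form(silfdnr) |
                                #            meeting name | result | ? |
                                #            form(toplfdnr)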
                                new_consultation = Consultation()
                                new_consultation.status = \
                                        elem_line[0].attrib['title'].lower()
                                if len(elem_line) == 3:
                                    # Headline row: the order is
                                    # "color/status", name of committee /
                                    # link to TOP, more info. Its values
                                    # apply to the detail lines that follow,
                                    # until the next headline starts a new
                                    # group.
                                    new_consultation.role = \
                                            elem_line[2].text.strip()

                                    # Name of the committee, e.g.
                                    # "Finanzausschuss", unfortunately
                                    # without an id:
                                    #'committee': elem_line[1].text.strip(),
                                # For some obscure reason the role cell is
                                # sometimes missing.
                                elif len(elem_line) == 2:
                                    # The order is "color/status", name of
                                    # committee / link to TOP. The status
                                    # was already read above; the committee
                                    # name, e.g. "Finanzausschuss", comes
                                    # unfortunately without an id:
                                    #'committee': elem_line[1].text.strip(),
                                    pass
                                elif len(elem_line) == 7:
                                    try:
                                        # A detail line with the meeting
                                        # information. The date can be plain
                                        # text or a link containing that
                                        # text.
                                        if len(elem_line[1]) == 1:
                                            # It is a link; we only keep its
                                            # text.
                                            date_text = elem_line[1][0].text
                                        else:
                                            date_text = elem_line[1].text
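                                        # German date strings, e.g.
                                        # "17.03.2013", parsed with
                                        # "%d.%m.%Y" below.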
                                        date_list.append(
                                            datetime.datetime.strptime(
                                                date_text.strip(), "%d.%m.%Y"))
                                        if len(elem_line[2]):
                                            # Form with silfdnr and toplfdnr
                                            # but only in link (action=
                                            #   "to010.asp?topSelected=57023")
                                            form = elem_line[2][0]
                                            meeting_id = form[0].attrib['value']
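                                            # The first input's value is
                                            # assumed to hold the silfdnr,
                                            # the meeting's numeric id.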
                                            new_consultation.meeting = [Meeting(
                                                originalId=meeting_id)]
                                            # Full name of meeting, e.g.
                                            # "A/31/WP.16 öffentliche/
                                            #   nichtöffentliche Sitzung des
                                            # Finanzausschusses"
                                            #item['meeting'] = \
                                            #    elem_line[3][0].text.strip()
                                        else:
                                            # No link to the TOP. This should
                                            # not be possible, but happens.
                                            #   (TODO: bug report?)
                                            # There is no link; the text sits
                                            # directly in the TD and would be
                                            # scraped as the meeting name.
                                            #item['meeting'] = \
                                            #    elem_line[3].text.strip()
                                            logging.warning(
                                                "AgendaItem in consultation "
                                                "list on the web page does "
                                                "not contain a link to the "
                                                "actual meeting at paper %s",
                                                paper_url)
                                        toplfdnr = None
                                        if len(elem_line[6]) > 0:
                                            form = elem_line[6][0]
                                            toplfdnr = form[0].attrib['value']
                                        if toplfdnr:
                                            new_consultation.originalId = \
                                                    "%s-%s" % (toplfdnr,
                                                               paper_id)
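                                            # The "toplfdnr-paperid" pair is
                                            # assumed to make consultation
                                            # ids unique per paper.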
                                            # actually the id of the transcript
                                            new_consultation.agendaItem = \
                                                    AgendaItem(
                                                        originalId=toplfdnr)
                                            # e.g. "ungeändert beschlossen"
                                            new_consultation.agendaItem.result \
                                                    = elem_line[4].text.strip()
                                            consultations.append(
                                                new_consultation)
                                        else:
                                            logging.error(
                                                "missing agendaItem ID in "
                                                "consultation list at %s",
                                                paper_url)
                                    except (IndexError, KeyError):
                                        logging.error(
                                            "Serious error in consultation "
                                            "list. Unable to parse.")
                                        return []
                                i += 1
                            # Theory: we don't need this at all, because it's
                            # scraped at meeting.
                            #data['consultations'] = consultations
                            # set the marker to False again as we have read it
                            self.consultation_list_start = False
                    last_headline = headline
                    # We simply ignore the rest (there might not be much more
                    # actually).
                # The actual text comes after the table in a div, but it is
                # not valid XML/HTML, so we extract it with a regex.
                data['docs'] = self.body_re.findall(response.text)
                first_date = min(date_list) if date_list else None
                paper = Paper(originalId=paper_id)
                paper.originalUrl = paper_url
                paper.name = data['betreff']
                paper.description = data['docs']
                if 'drucksache-art' in data:
                    paper.paperType = data['drucksache-art']
                if first_date:
                    paper.publishedDate = first_date.strftime("%d.%m.%Y")
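                    # Keeps the site's German date format, e.g. "17.03.2013".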
                # see theory above
                #if 'consultations' in data:
                #    paper.consultation = data['consultations']
                paper.auxiliaryFile = []
                # get the attachments step 1 (Drucksache)
                file_1 = self.attachment_1_css(doc)
                if len(file_1):
                    if file_1[0].value:
                        href = ('%sdo027.asp'
                                % self.config['scraper']['base_url'])
                        original_id = file_1[0].value
                        name = 'Drucksache'
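                        # "Drucksache" (printed paper) is the paper's main
                        # document.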
                        main_file = File(originalId=original_id, name=name)
                        main_file = self.get_file(main_file, href, True)
                        paper.mainFile = main_file
                # get the attachments step 2 (additional attachments)
                files = self.attachments_css(doc)
                if len(files) > 0:
                    if len(files[0]) > 1:
                        if files[0][1][0].text.strip() == "Anlagen:":
                            for tr in files[0][2:]:
                                link = tr[0][0]
                                href = ("%s%s"
                                        % (self.config['scraper']['base_url'],
                                           link.attrib["href"]))
                                name = link.text
                                path_tokens = link.attrib["href"].split('/')
                                original_id = "%d-%d" % (int(path_tokens[4]),
                                                         int(path_tokens[6]))
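                                # Assumes attachment hrefs whose 5th and 7th
                                # path segments are numeric ids.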
                                aux_file = File(originalId=original_id,
                                                name=name)
                                aux_file = self.get_file(aux_file, href)
                                paper.auxiliaryFile.append(aux_file)
                logging.debug("Auxiliary files: %s", paper.auxiliaryFile)
                if not len(paper.auxiliaryFile):
                    del paper.auxiliaryFile
                self.db.save_paper(paper)
                return
            except (KeyError, IndexError):
                if try_counter < 3:
                    logging.info("Try again: Getting paper %d from %s",
                                 paper_id, paper_url)
                    try_counter += 1
                else:
                    logging.error("Failed getting paper %d from %s",
                                  paper_id, paper_url)
                    return