Example #1
  def find_meeting(self, start_date=None, end_date=None):
    """
    Find meetings within a given time frame and add them to the meeting queue.
    """
    meeting_url = "%ssi010.asp?selfaction=ws&template=xyz&kaldatvon=%s&kaldatbis=%s" % (self.config.BASE_URL, start_date.strftime("%d.%m.%Y"), end_date.strftime("%d.%m.%Y"))
    logging.info("Getting meeting overview from %s", meeting_url)
    
    
    parser = etree.XMLParser(recover=True)
    
    r = self.get_url(meeting_url)
    if not r:
      return
    
    xml = r.text.encode('ascii','xmlcharrefreplace') 
    root = etree.fromstring(xml, parser=parser)

    for item in root[1].iterchildren():
      raw_meeting = {}
      for e in item.iterchildren():
        raw_meeting[e.tag] = e.text
      meeting = Meeting(numeric_id=int(raw_meeting['silfdnr']), identifier=int(raw_meeting['silfdnr']))
      meeting.date_start = self.parse_date(raw_meeting['sisbvcs'])
      meeting.date_end = self.parse_date(raw_meeting['sisevcs'])
      meeting.identifier = raw_meeting['siname']
      meeting.original_url = "%sto010.asp?SILFDNR=%s&options=4" % (self.config.BASE_URL, raw_meeting['silfdnr'])
      meeting.title = raw_meeting['sitext']
      meeting.committee_name = raw_meeting['grname']
      meeting.description = raw_meeting['sitext']
      oid = self.db.save_meeting(meeting)
      self.meeting_queue.add(meeting.numeric_id)
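Example #1 relies on lxml's recovering parser to cope with the not-quite-valid XML that ALLRIS returns. A minimal standalone sketch of that idea, with an invented XML snippet:

from lxml import etree

# recover=True makes lxml keep parsing past markup errors (here a missing
# closing </list> tag) instead of raising an exception
broken_xml = "<list><item><silfdnr>123</silfdnr><siname>A/16/2014</siname></item>"
root = etree.fromstring(broken_xml, parser=etree.XMLParser(recover=True))
for item in root.iterchildren():
    print({e.tag: e.text for e in item.iterchildren()})
# -> {'silfdnr': '123', 'siname': 'A/16/2014'}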
Example #2
    def get_meetings_list(self, raw_calendar_data):
        meetings_list = []
        for num, meeting in enumerate(raw_calendar_data, 1):
            summary = self._get_summary(meeting)
            start = parse(meeting['start'].get('dateTime',
                                               meeting['start'].get('date')))
            end = parse(meeting['end'].get('dateTime',
                                           meeting['end'].get('date')))
            duration = end - start
            num_attendees = self._get_num_attendees(meeting.get('attendees'))

            m = Meeting(num, summary, start, end, duration, num_attendees)
            meetings_list.append(m)
        return meetings_list
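The event dictionaries iterated in Example #2 follow the Google-Calendar-style shape in which timed events carry a dateTime key and all-day events only a date key, and dateutil's parse accepts both. A small sketch of that input with invented values and without the Meeting class:

from dateutil.parser import parse

raw_calendar_data = [
    # timed event
    {'summary': 'Sprint planning',
     'start': {'dateTime': '2014-06-24T10:00:00+02:00'},
     'end': {'dateTime': '2014-06-24T11:30:00+02:00'}},
    # all-day event: only a 'date' key is present
    {'summary': 'Offsite',
     'start': {'date': '2014-06-25'},
     'end': {'date': '2014-06-26'}},
]

for num, meeting in enumerate(raw_calendar_data, 1):
    start = parse(meeting['start'].get('dateTime', meeting['start'].get('date')))
    end = parse(meeting['end'].get('dateTime', meeting['end'].get('date')))
    print("%d. %s: %s" % (num, meeting['summary'], end - start))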
Example #3
    def find_meeting(self, start_date=None, end_date=None):
        """ Find meetings within a given time frame and add them to the meeting
        queue.
        """
        meeting_find_url = (
            self.config['scraper']['allris']['meeting_find_url'] %
            (self.config['scraper']['base_url'],
             start_date.strftime("%d.%m.%Y"), end_date.strftime("%d.%m.%Y")))
        logging.info("Getting meeting overview from %s", meeting_find_url)

        parser = etree.XMLParser(recover=True)
        h = HTMLParser.HTMLParser()

        r = self.get_url(meeting_find_url)
        if not r:
            return

        xml = r.text.encode('ascii', 'xmlcharrefreplace').replace('</a>', '')
        xml = re.sub(r'<a href="([^"]*)" target="_blank" ?>', r'\1', xml)
        root = etree.fromstring(xml, parser=parser)
        for item in root:
            if item.tag == 'list':
                root = item
                break
        for item in root.iterchildren():
            raw_meeting = {}
            for e in item.iterchildren():
                if e.text:
                    raw_meeting[e.tag] = h.unescape(e.text)
                else:
                    raw_meeting[e.tag] = ''
            meeting = Meeting(originalId=int(raw_meeting['silfdnr']))
            meeting.start = self.parse_date(raw_meeting['sisbvcs'])
            meeting.end = self.parse_date(raw_meeting['sisevcs'])
            meeting.name = raw_meeting['siname']
            meeting.originalUrl = (
                "%sto010.asp?SILFDNR=%s&options=4" %
                (self.config['scraper']['base_url'], raw_meeting['silfdnr']))
            meeting.name = raw_meeting['sitext']
            meeting.organization_name = raw_meeting['grname']
            # meeting.description = raw_meeting['sitext'] # WHAT TO DO WITH THIS
            self.db.save_meeting(meeting)
            self.meeting_queue.add(meeting.originalId)
Example #4
    def find_meeting(self, start_date=None, end_date=None):
        """ Find meetings within a given time frame and add them to the meeting
        queue.
        """
        meeting_find_url = (self.config['scraper']['allris']['meeting_find_url']
                            % (self.config['scraper']['base_url'],
                               start_date.strftime("%d.%m.%Y"),
                               end_date.strftime("%d.%m.%Y")))
        logging.info("Getting meeting overview from %s", meeting_find_url)

        parser = etree.XMLParser(recover=True)
        h = HTMLParser.HTMLParser()

        r = self.get_url(meeting_find_url)
        if not r:
            return

        xml = r.text.encode('ascii', 'xmlcharrefreplace').replace('</a>', '')
        xml = re.sub(r'<a href="([^"]*)" target="_blank" ?>', r'\1', xml)
        root = etree.fromstring(xml, parser=parser)
        for item in root:
            if item.tag == 'list':
                root = item
                break
        for item in root.iterchildren():
            raw_meeting = {}
            for e in item.iterchildren():
                if e.text:
                    raw_meeting[e.tag] = h.unescape(e.text)
                else:
                    raw_meeting[e.tag] = ''
            meeting = Meeting(originalId=int(raw_meeting['silfdnr']))
            meeting.start = self.parse_date(raw_meeting['sisbvcs'])
            meeting.end = self.parse_date(raw_meeting['sisevcs'])
            meeting.name = raw_meeting['siname']
            meeting.originalUrl = ("%sto010.asp?SILFDNR=%s&options=4"
                                   % (self.config['scraper']['base_url'],
                                      raw_meeting['silfdnr']))
            meeting.name = raw_meeting['sitext']
            meeting.organization_name = raw_meeting['grname']
            # meeting.description = raw_meeting['sitext'] # WHAT TO DO WITH THIS
            self.db.save_meeting(meeting)
            self.meeting_queue.add(meeting.originalId)
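Examples #3 and #4 strip the HTML anchors that ALLRIS embeds inside its XML payload before handing the text to lxml: closing tags are dropped and each opening tag is replaced by its href. A standalone sketch of that clean-up on an invented snippet:

import re

xml = '<silink><a href="http://example.org/to010.asp?SILFDNR=123" target="_blank" ></a></silink>'
xml = xml.replace('</a>', '')
xml = re.sub(r'<a href="([^"]*)" target="_blank" ?>', r'\1', xml)
print(xml)
# -> <silink>http://example.org/to010.asp?SILFDNR=123</silink>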
Example #5
  def get_meeting(self, meeting_url=None, meeting_id=None):
    """
    Load meeting details (e.g. agendaitems) for the given detail page URL or numeric ID
    """
    meeting_url = "%sto010.asp?selfaction=ws&template=xyz&SILFDNR=%s" % (self.config.BASE_URL, meeting_id)
    
    logging.info("Getting meeting %d from %s", meeting_id, meeting_url)
    
    r = self.get_url(meeting_url)
    if not r:
      return
    # If r.history has an item we have a problem
    if len(r.history):
      if r.history[0].status_code == 302:
        logging.info("Meeting %d from %s seems to be private", meeting_id, meeting_id)
      else:
        logging.error("Strange redirect %d from %s with status code %s", meeting_id, meeting_url, r.history[0].status_code)
      return
    xml = r.text.encode('ascii','xmlcharrefreplace') 
    parser = etree.XMLParser(recover=True)
    root = etree.fromstring(xml, parser=parser)
    
    meeting = Meeting(numeric_id=meeting_id)

    # special area
    special = {}
    for item in root[0].iterchildren():
      special[item.tag] = item.text
    # Where do we get the date? Only from the overview page?
    #if 'sisb' in special:
    #if 'sise' in special:
    if 'saname' in special:
      if special['saname'] in self.config.MEETING_TYPE:
        meeting.type = self.config.MEETING_TYPE[special['saname']]
      else:
        logging.warn("String '%s' not found in MEETING_TYPE", special['saname'])
    # head area
    head = {}
    for item in root[1].iterchildren():
      head[item.tag] = item.text
    if 'raname' in head:
      meeting.room = head['raname']
    if 'raort' in head:
      meeting.address = head['raort']
    agendaitems = []
    
    for item in root[2].iterchildren():
      elem = {}
      for e in item.iterchildren():
        elem[e.tag] = e.text

      section = [elem['tofnum'], elem['tofunum'], elem['tofuunum']]
      section = [x for x in section if x!="0"]
      elem['section'] = ".".join(section)
      agendaitem = Agendaitem()
      
      #agendaitem = elem['topnr']
      agendaitem.numeric_id = int(elem['tolfdnr'])
      if elem['toostLang'] == u'öffentlich':
        agendaitem.public = True
      else:
        agendaitem.public = False
      agendaitem.title = elem['totext1']
      # get agenda detail page
      # TODO: Own Queue
      time.sleep(self.config.WAIT_TIME)
      agendaitem_url = '%sto020.asp?selfaction=ws&template=xyz&TOLFDNR=%s' % (self.config.BASE_URL, agendaitem.numeric_id)
      logging.info("Getting agendaitem %d from %s", agendaitem.numeric_id, agendaitem_url)
      
      agendaitem_r = self.get_url(agendaitem_url)
      if not agendaitem_r:
        return
      
      if len(agendaitem_r.history):
        logging.info("Agenda item %d from %s seems to be private", meeting_id, meeting_url)
      else:
        agendaitem_xml = agendaitem_r.text.encode('ascii','xmlcharrefreplace') 
        agendaitem_parser = etree.XMLParser(recover=True)
        agendaitem_root = etree.fromstring(agendaitem_xml, parser=agendaitem_parser)
        add_agenda_item = {}
        for add_item in agendaitem_root[0].iterchildren():
          if add_item.tag == "rtfWP" and len(add_item) > 0:
            try:
              agendaitem.resolution_text = etree.tostring(add_item[0][1][0])
            except:
              logging.warn("Unable to parse resolution text at %s", agendaitem_url)
          else:
            add_agenda_item[add_item.tag] = add_item.text
        if 'voname' in add_agenda_item:
          # create paper with identifier
          agendaitem.paper = [Paper(numeric_id = int(elem['volfdnr']), title=add_agenda_item['voname'])]
          if add_agenda_item['vobetr'] != agendaitem.title:
            logging.warn("different values for title: %s and %s", agendaitem.title, add_agenda_item['vobetr'])
          if hasattr(self, 'paper_queue'):
            self.paper_queue.add(int(elem['volfdnr']))
        elif int(elem['volfdnr']) != 0:
          # create paper without identifier
          agendaitem.paper = [Paper(numeric_id = int(elem['volfdnr']))]
          if hasattr(self, 'paper_queue'):
            self.paper_queue.add(int(elem['volfdnr']))
        if "nowDate" not in add_agenda_item:
          # something is broken with this so we don't store it
          logging.warn("Skipping broken agenda at ", agendaitem_url)
        else:
          # dereference result
          if add_agenda_item['totyp'] in self.config.RESULT_STRINGS:
            agendaitem.result = self.config.RESULT_STRINGS[add_agenda_item['totyp']]
          else:
            logging.warn("String '%s' not found in configured RESULT_STRINGS", add_agenda_item['totyp'])
        agendaitems.append(agendaitem)
    meeting.agendaitem = agendaitems
    
    oid = self.db.save_meeting(meeting)
    logging.info("Meeting %d stored with _id %s", meeting_id, oid)
Example #6
  def get_meeting(self, meeting_url=None, meeting_id=None):
    """
    Load meeting details for the given detail page URL or numeric ID
    """
    # Derive meeting_url from meeting_id or meeting_id from meeting_url, whichever is missing
    if meeting_id is not None:
      meeting_url = self.urls['SESSION_DETAIL_PRINT_PATTERN'] % meeting_id
    elif meeting_url is not None:
      parsed = parse.search(self.urls['SESSION_DETAIL_PARSE_PATTERN'], meeting_url)
      meeting_id = parsed['meeting_id']
  
    logging.info("Getting meeting (session) %d from %s", meeting_id, meeting_url)
  
    meeting = Meeting(numeric_id=meeting_id)
    
    time.sleep(self.config.WAIT_TIME)
    response = self.get_url(meeting_url)
    if not response:
      return
    
    # forms for later document download
    mechanize_forms = mechanize.ParseResponse(response, backwards_compat=False)
    # seek(0) is necessary to reset response pointer.
    response.seek(0)
    html = response.read()
    html = html.replace('&nbsp;', ' ')
    parser = etree.HTMLParser()
    dom = etree.parse(StringIO(html), parser)
    # check for page errors
    try:
      page_title = dom.xpath('//h1')[0].text
      if 'Fehlermeldung' in page_title:
        logging.info("Page %s cannot be accessed due to server error", meeting_url)
        return
      if 'Berechtigungsfehler' in page_title:
        logging.info("Page %s cannot be accessed due to permissions", meeting_url)
        return
    except:
      pass
    try:
      error_h3 = dom.xpath('//h3[@class="smc_h3"]')[0].text.strip()
      if 'Keine Daten gefunden' in error_h3:
        logging.info("Page %s does not contain any agenda items", meeting_url)
        return
      if 'Fehlercode: 1104' in error_h3:
        logging.info("Page %s cannot be accessed due to permissions", meeting_url)
        return
    except:
      pass
  
    meeting.original_url = meeting_url
    # Session title
    try:
      meeting.title = dom.xpath(self.xpath['SESSION_DETAIL_TITLE'])[0].text
    except:
      logging.critical('Cannot find session title element using XPath SESSION_DETAIL_TITLE')
      raise TemplateError('Cannot find session title element using XPath SESSION_DETAIL_TITLE')
  
    # Committee link
    #try:
    #  links = dom.xpath(self.xpath['SESSION_DETAIL_COMMITTEE_LINK'])
    #  for link in links:
    #    href = link.get('href')
    #    parsed = parse.search(self.urls['COMMITTEE_DETAIL_PARSE_PATTERN'], href)
    #    if parsed is not None:
    #      meeting.committees = [Commitee(numeric_id=int(parsed['committee_id']))]
    #      if hasattr(self, 'committee_queue'):
    #        self.committee_queue.add(int(parsed['committee_id']))
    #except:
    #  logging.critical('Cannot find link to committee detail page using SESSION_DETAIL_COMMITTEE_LINK_XPATH')
    #  raise TemplateError('Cannot find link to committee detail page using SESSION_DETAIL_COMMITTEE_LINK_XPATH')
  
    # Meeting identifier, date, address etc
    tds = dom.xpath(self.xpath['SESSION_DETAIL_IDENTIFIER_TD'])
    if len(tds) == 0:
      logging.critical('Cannot find table fields using SESSION_DETAIL_IDENTIFIER_TD_XPATH at session ' + meeting_url)
      raise TemplateError('Cannot find table fields using SESSION_DETAIL_IDENTIFIER_TD_XPATH at session ' + meeting_url)
    else:
      for n in range(0, len(tds)):
        try:
          tdcontent = tds[n].text.strip()
          nextcontent = tds[n + 1].text.strip()
        except:
          continue
        if tdcontent == 'Sitzung:':
          meeting.identifier = nextcontent
        # We don't need this any more because it's scraped in committee detail page(?)
        #elif tdcontent == 'Gremium:':
        #  meeting.committee_name = nextcontent
        elif tdcontent == 'Datum:':
          start = nextcontent
          end = nextcontent
          if tds[n + 2].text == 'Zeit:':
            if tds[n + 3].text is not None:
              times = tds[n + 3].text.replace(' Uhr', '').split('-')
              start = start + ' ' + times[0]
              if len(times) > 1:
                end = end + ' ' + times[1]
              else:
                end = start
            meeting.start = start
            meeting.end = end
        elif tdcontent == 'Raum:':
          meeting.address = " ".join(tds[n + 1].xpath('./text()'))
        elif tdcontent == 'Bezeichnung:':
          meeting.description = nextcontent
        if not hasattr(meeting, 'identifier'):
          logging.critical('Cannot find session identifier using XPath SESSION_DETAIL_IDENTIFIER_TD')
          raise TemplateError('Cannot find session identifier using XPath SESSION_DETAIL_IDENTIFIER_TD')
  
    # Agendaitems
    found_documents = []
    rows = dom.xpath(self.xpath['SESSION_DETAIL_AGENDA_ROWS'])
    if len(rows) == 0:
      logging.critical('Cannot find agenda using XPath SESSION_DETAIL_AGENDA_ROWS')
      raise TemplateError('Cannot find agenda using XPath SESSION_DETAIL_AGENDA_ROWS')
    else:
      agendaitems = []
      agendaitem_id = None
      public = True
      agendaitem = None
      for row in rows:
        row_id = row.get('id')
        row_classes = row.get('class').split(' ')
        fields = row.xpath('td')
        number = fields[0].xpath('./text()')
        if len(number) > 0:
          number = number[0]
        else:
          # when there's an update notice there's an additional span
          number = fields[0].xpath('.//span/text()')
          if len(number) > 0:
            number = number[0]
        if number == []:
          number = None
        if row_id is not None:
          # Agendaitem main row
          # first: save agendaitem from before
          if agendaitem:
            agendaitems.append(agendaitem)
          # create new agendaitem
          agendaitem = Agendaitem(numeric_id=int(row_id.rsplit('_', 1)[1]))
          if number is not None:
            agendaitem.sequence_number = number
          # in some ris this is a link, sometimes not. test both.
          if len(fields[1].xpath('./a/text()')):
            agendaitem.title = "; ".join(fields[1].xpath('./a/text()'))
          elif len(fields[1].xpath('./text()')):
            agendaitem.title = "; ".join(fields[1].xpath('./text()'))
          # ignore no agendaitem information
          if agendaitem.title == 'keine Tagesordnungspunkte':
            agendaitem = None
            continue
          agendaitem.public = public
          # paper links
          links = row.xpath(self.xpath['SESSION_DETAIL_AGENDA_ROWS_SUBMISSION_LINK'])
          papers = []
          for link in links:
            href = link.get('href')
            if href is None:
              continue
            parsed = parse.search(self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
            if parsed is not None:
              paper = Paper(numeric_id=int(parsed['paper_id']), identifier=link.text)
              papers.append(paper)
              # Add paper to paper queue
              if hasattr(self, 'paper_queue'):
                self.paper_queue.add(int(parsed['paper_id']))
          if len(papers):
            agendaitem.paper = papers
          """
          Note: we don't scrape agendaitem-related documents for now,
          based on the assumption that they are all found via paper
          detail pages. All we do here is get a list of document IDs
          in found_documents
          """
          # find links
          links = row.xpath('.//a[contains(@href,"getfile.")]')
          for link in links:
            if not link.xpath('.//img'):
              file_link = self.config.BASE_URL + link.get('href')
              document_id = file_link.split('id=')[1].split('&')[0]
              found_documents.append(document_id)
          # find forms
          forms = row.xpath('.//form')
          for form in forms:
            for hidden_field in form.xpath('input'):
              if hidden_field.get('name') != 'DT':
                continue
              document_id = hidden_field.get('value')
              found_documents.append(document_id)
        # Alternative to smc_tophz because of version 4.3.5 bi (layout 3)
        elif ('smc_tophz' in row_classes) or (row.get('valign') == 'top' and row.get('debug') == '3'):
          # additional (optional) row for an agendaitem
          label = fields[1].text
          value = fields[2].text
          if label is not None and value is not None:
            label = label.strip()
            value = value.strip()
            if label in ['Ergebnis:', 'Beschluss:', 'Beratungsergebnis:']:
              if value in self.config.RESULT_STRINGS:
                agendaitem.result = self.config.RESULT_STRINGS[value]
              else:
                logging.warn("String '%s' not found in configured RESULT_STRINGS", value)
                # fall back to the raw value if it cannot be mapped
                agendaitem.result = value
            elif label in ['Bemerkung:', 'Abstimmung:']:
              agendaitem.result_details = value
            # What's this?
            #elif label == 'Abstimmung:':
            #  agendaitems[agendaitem_id]['voting'] = value
            else:
              logging.critical("Agendaitem info label '%s' is unknown", label)
              raise ValueError('Agendaitem info label "%s" is unknown' % label)
        elif 'smcrowh' in row_classes:
          # Subheading (public / nonpublic part)
          if fields[0].text is not None and "Nicht öffentlich" in fields[0].text.encode('utf-8'):
            public = False
      meeting.agendaitem = agendaitems

    # meeting-related documents
    containers = dom.xpath(self.xpath['SESSION_DETAIL_ATTACHMENTS'])
    for container in containers:
      classes = container.get('class')
      if classes is None:
        continue
      classes = classes.split(' ')
      if self.xpath['SESSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME'] not in classes:
        continue
      documents = []
      rows = container.xpath('.//tr')
      for row in rows:
        if not row.xpath('.//form'):
          links = row.xpath('.//a')
          for link in links:
            # ignore additional pdf icon links
            if not link.xpath('.//img'):
              title = ' '.join(link.xpath('./text()')).strip()
              file_link = self.config.BASE_URL + link.get('href')
              document_id = file_link.split('id=')[1].split('&')[0]
              if document_id in found_documents:
                continue
              document = Document(
                identifier=document_id,
                numeric_id=document_id,
                title=title,
                original_url=file_link)
              document = self.get_document_file(document=document, link=file_link)
              if 'Einladung' in title:
                document_type = 'invitation'
              elif 'Niederschrift' in title:
                document_type = 'results_protocol'
              else:
                document_type = 'misc'
              documents.append({'relation': document_type, 'document': document})
              found_documents.append(document_id)
        else:
          forms = row.xpath('.//form')
          for form in forms:
            title = " ".join(row.xpath('./td/text()')).strip()
            for hidden_field in form.xpath('input'):
              if hidden_field.get('name') != 'DT':
                continue
              document_id = hidden_field.get('value')
              # make sure to add only those which aren't agendaitem-related
              if document_id not in found_documents:
                document = Document(
                  identifier=document_id,
                  numeric_id=document_id,
                  title=title
                )
                # Traversing the whole mechanize response to submit this form
                for mform in mechanize_forms:
                  for control in mform.controls:
                    if control.name == 'DT' and control.value == document_id:
                      document = self.get_document_file(document, mform)
                if 'Einladung' in title:
                  document_type = 'invitation'
                elif 'Niederschrift' in title:
                  document_type = 'results_protocol'
                else:
                  document_type = 'misc'
                documents.append({'relation': document_type, 'document': document})
                found_documents.append(document_id)
      if len(documents):
        meeting.document = documents
    oid = self.db.save_meeting(meeting)
    logging.info("Meeting %d stored with _id %s", meeting_id, oid)
Example #7
    def get_paper(self, paper_url=None, paper_id=None):
        """
        Load paper details for the paper given by detail page URL
        or numeric ID
        """
        paper_url = ('%svo020.asp?VOLFDNR=%s' %
                     (self.config['scraper']['base_url'], paper_id))
        logging.info("Getting paper %d from %s", paper_id, paper_url)

        # Crude retry loop because AllRis sometimes drops the opening '<'
        # of tags on the first request.
        try_counter = 0
        while True:
            try:
                response = self.get_url(paper_url)
                if not response:
                    return
                if "noauth" in response.url:
                    logging.warn("Paper %s in %s seems to private", paper_id,
                                 paper_url)
                    return
                text = response.text
                doc = html.fromstring(text)
                data = {}
                # initialise here so the min-date loop below cannot hit a NameError
                date_list = []

                # check the Beratungsfolge (consultation sequence) table
                # let's hope we always have this table
                table = self.table_css(doc)[0]
                self.consultation_list_start = False
                last_headline = ''
                for line in table:
                    if line.tag == 'tr':
                        headline = line[0].text
                    elif line.tag == 'td':
                        headline = line.text
                    else:
                        logging.error("ERROR: Serious error in data table. "
                                      "Unable to parse.")
                    if headline:
                        headline = headline.split(":")[0].lower()
                        if headline[-1] == ":":
                            headline = headline[:-1]
                        if headline == "betreff":
                            value = line[1].text_content().strip()
                            # There is some html comment with a script
                            # tag in front of the text which we remove.
                            value = value.split("-->")[1]
                            # remove all multiple spaces from the string
                            data[headline] = " ".join(value.split())
                        elif headline in [
                                'verfasser', u'federführend', 'drucksache-art'
                        ]:
                            data[headline] = line[1].text.strip()
                        elif headline in ['status']:
                            data[headline] = line[1].text.strip()
                            # related papers
                            if len(line) > 2:
                                if len(line[3]):
                                    # Gets originalId. is there something
                                    # else at this position? (will break)
                                    paper_id = line[3][0][0][1][0].get(
                                        'href').split('=')[1].split('&')[0]
                                    data['relatedPaper'] = [
                                        Paper(originalId=paper_id)
                                    ]

                        # Lots of scraping just because of the date (?)
                        elif headline == "beratungsfolge":
                            # The actual list will be in the next row
                            # inside a table, so we only set a marker.
                            self.consultation_list_start = True
                        elif self.consultation_list_start:
                            elem = line[0][0]
                            # The first line is pixel images, so skip
                            # it, then we need to jump in steps of two.
                            amount = (len(elem) - 1) / 2
                            consultations = []
                            date_list = []
                            i = 0
                            item = None
                            for elem_line in elem:
                                if i == 0:
                                    i += 1
                                    continue
                                """
                                Here we need to parse the actual list which can have different forms. A complex example
                                can be found at http://ratsinfo.aachen.de/bi/vo020.asp?VOLFDNR=10822
                                The first line is some sort of headline with the committee in question and the type of consultation.
                                After that 0-n lines of detailed information of meetings with a date, transscript and decision.
                                The first line has 3 columns (thanks to colspan) and the others have 7.

                                Here we make every meeting a separate entry, we can group them together later again if we want to.
                                """

                                # now we need to parse the actual list
                                # those lists
                                new_consultation = Consultation()
                                new_consultation.status = \
                                        elem_line[0].attrib['title'].lower()
                                if len(elem_line) == 3:
                                    # The order is "color/status", name of
                                    # committee / link to TOP, more info we
                                    # define a head dict here which can be
                                    # shared for the other lines once we find
                                    # another head line we will create a new
                                    # one here.
                                    new_consultation.role = \
                                            elem_line[2].text.strip()

                                    # Name of committee, e.g.
                                    # "Finanzausschuss", unfort. without id
                                    #'committee' : elem_line[1].text.strip(),
                                # For some obscure reason the action is
                                # sometimes missing.
                                elif len(elem_line) == 2:
                                    # The order is "color/status", name of
                                    # committee / link to TOP, more info.
                                    status = \
                                            elem_line[0].attrib['title'].lower()
                                    # We define a head dict here which can be
                                    # shared for the other lines once we find
                                    # another head line we will create a new
                                    # one here.
                                    # name of committee, e.g.
                                    # "Finanzausschuss", unfort. without id
                                    #'committee' : elem_line[1].text.strip(),
                                elif len(elem_line) == 7:
                                    try:
                                        # This is about line 2 with lots of
                                        # more stuff to process.
                                        # Date can be text or a link with that
                                        # text.
                                        # We have a link (and ignore it).
                                        if len(elem_line[1]) == 1:
                                            date_text = elem_line[1][0].text
                                        else:
                                            date_text = elem_line[1].text
                                        date_list.append(
                                            datetime.datetime.strptime(
                                                date_text.strip(), "%d.%m.%Y"))
                                        if len(elem_line[2]):
                                            # Form with silfdnr and toplfdnr
                                            # but only in link (action=
                                            #   "to010.asp?topSelected=57023")
                                            form = elem_line[2][0]
                                            meeting_id = form[0].attrib[
                                                'value']
                                            new_consultation.meeting = [
                                                Meeting(originalId=meeting_id)
                                            ]
                                            # Full name of meeting, e.g.
                                            # "A/31/WP.16 öffentliche/
                                            #   nichtöffentliche Sitzung des
                                            # Finanzausschusses"
                                            #item['meeting'] = \
                                            #    elem_line[3][0].text.strip()
                                        else:
                                            # No link to TOP. Should not be
                                            # possible but happens.
                                            #   (TODO: Bugreport?)
                                            # Here we have no link but the text
                                            # is in the TD directly - will be
                                            # scraped as meeting.
                                            #item['meeting'] = \
                                            #    elem_line[3].text.strip()
                                            logging.warn(
                                                "AgendaItem in consultation "
                                                "list on the web page does not "
                                                "contain a link to the actual "
                                                "meeting at paper %s",
                                                paper_url)
                                        toplfdnr = None
                                        if len(elem_line[6]) > 0:
                                            form = elem_line[6][0]
                                            toplfdnr = form[0].attrib['value']
                                        if toplfdnr:
                                            new_consultation.originalId = \
                                                    "%s-%s" % (toplfdnr,
                                                               paper_id)
                                            # actually the id of the transcript
                                            new_consultation.agendaItem = \
                                                    AgendaItem(
                                                        originalId=toplfdnr)
                                            # e.g. "ungeändert beschlossen"
                                            new_consultation.agendaItem.result \
                                                    = elem_line[4].text.strip()
                                            consultations.append(
                                                new_consultation)
                                        else:
                                            logging.error(
                                                "missing agendaItem ID in "
                                                "consultation list at %s",
                                                paper_url)
                                    except (IndexError, KeyError):
                                        logging.error(
                                            "ERROR: Serious error in "
                                            "consultation list. Unable to "
                                            "parse.")
                                        logging.error(
                                            "Serious error in consultation "
                                            "list. Unable to parse.")
                                        return []
                                i += 1
                            # Theory: we don't need this at all, because it's
                            # scraped at meeting.
                            #data['consultations'] = consultations
                            # set the marker to False again as we have read it
                            self.consultation_list_start = False
                    last_headline = headline
                    # We simply ignore the rest (there might not be much more
                    # actually).
                # The actual text comes after the table in a div, but it's not
                # valid XML or HTML, so we extract it with a regex.
                data['docs'] = self.body_re.findall(response.text)
                first_date = False
                for single_date in date_list:
                    if first_date:
                        if single_date < first_date:
                            first_date = single_date
                    else:
                        first_date = single_date
                paper = Paper(originalId=paper_id)
                paper.originalUrl = paper_url
                paper.name = data['betreff']
                paper.description = data['docs']
                if 'drucksache-art' in data:
                    paper.paperType = data['drucksache-art']
                if first_date:
                    paper.publishedDate = first_date.strftime("%d.%m.%Y")
                # see theory above
                #if 'consultations' in data:
                #    paper.consultation = data['consultations']
                paper.auxiliaryFile = []
                # get the attachments step 1 (Drucksache)
                file_1 = self.attachment_1_css(doc)
                if len(file_1):
                    if file_1[0].value:
                        href = ('%sdo027.asp' %
                                self.config['scraper']['base_url'])
                        original_id = file_1[0].value
                        name = 'Drucksache'
                        main_file = File(originalId=original_id, name=name)
                        main_file = self.get_file(main_file, href, True)
                        paper.mainFile = main_file
                # get the attachments step 2 (additional attachments)
                files = self.attachments_css(doc)
                if len(files) > 0:
                    if len(files[0]) > 1:
                        if files[0][1][0].text.strip() == "Anlagen:":
                            for tr in files[0][2:]:
                                link = tr[0][0]
                                href = ("%s%s" %
                                        (self.config['scraper']['base_url'],
                                         link.attrib["href"]))
                                name = link.text
                                path_tokens = link.attrib["href"].split('/')
                                original_id = "%d-%d" % (int(
                                    path_tokens[4]), int(path_tokens[6]))
                                aux_file = File(originalId=original_id,
                                                name=name)
                                aux_file = self.get_file(aux_file, href)
                                paper.auxiliaryFile.append(aux_file)
                print paper.auxiliaryFile
                if not len(paper.auxiliaryFile):
                    del paper.auxiliaryFile
                oid = self.db.save_paper(paper)
                return
            except (KeyError, IndexError):
                if try_counter < 3:
                    logging.info("Try again: Getting paper %d from %s",
                                 paper_id, paper_url)
                    try_counter += 1
                else:
                    logging.error("Failed getting paper %d from %s", paper_id,
                                  paper_url)
                    return
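The first_date loop near the end of Example #7 simply determines the earliest date in date_list; for a non-empty list it is equivalent to min():

import datetime

date_list = [datetime.datetime(2014, 6, 24), datetime.datetime(2014, 5, 12)]

first_date = False
for single_date in date_list:
    if first_date:
        if single_date < first_date:
            first_date = single_date
    else:
        first_date = single_date

assert first_date == min(date_list) == datetime.datetime(2014, 5, 12)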
Example #8
    def get_meeting(self, meeting_url=None, meeting_id=None):
        """ Load meeting details (e.g. agendaitems) for the given detail page
        URL or numeric ID
        """
        meeting_url = ("%sto010.asp?selfaction=ws&template=xyz&SILFDNR=%s" %
                       (self.config['scraper']['base_url'], meeting_id))

        logging.info("Getting meeting %d from %s", meeting_id, meeting_url)

        r = self.get_url(meeting_url)
        if not r:
            return
        # If r.history has an item we have a problem
        if len(r.history):
            if r.history[0].status_code == 302:
                logging.info("Meeting %d from %s seems to be private",
                             meeting_id, meeting_id)
            else:
                logging.error(
                    "Strange redirect %d from %s with status code %s",
                    meeting_id, meeting_url, r.history[0].status_code)
            return
        h = HTMLParser.HTMLParser()
        xml = str(r.text.encode('ascii', 'xmlcharrefreplace'))
        parser = etree.XMLParser(recover=True)
        root = etree.fromstring(xml, parser=parser)

        meeting = Meeting(originalId=meeting_id)

        # special area
        special = {}
        for item in root[0].iterchildren():
            special[item.tag] = item.text
        # Where do we get the date? Only from the overview page?
        #if 'sisb' in special:
        #if 'sise' in special:
        if 'saname' in special:
            meeting.type = special['saname']
        # head area
        head = {}
        for item in root[1].iterchildren():
            if item.text:
                head[item.tag] = h.unescape(item.text)
            else:
                head[item.tag] = ''
        if 'sitext' in head:
            meeting.name = head['sitext']
        if 'raname' in head:
            meeting.room = head['raname']
        if 'raort' in head:
            meeting.address = head['raort']
        agendaitems = []

        for item in root[2].iterchildren():
            elem = {}
            for e in item.iterchildren():
                elem[e.tag] = e.text

            section = [elem['tofnum'], elem['tofunum'], elem['tofuunum']]
            section = [x for x in section if x != "0"]
            elem['section'] = ".".join(section)
            agendaitem = AgendaItem()

            agendaitem.originalId = int(elem['tolfdnr'])
            agendaitem.public = (elem['toostLang'] == u'öffentlich')
            #agendaitem.name = elem['totext1']
            # get agenda detail page
            # TODO: Own Queue
            time.sleep(self.config['scraper']['wait_time'])
            agendaitem_url = (
                '%sto020.asp?selfaction=ws&template=xyz&TOLFDNR=%s' %
                (self.config['scraper']['base_url'], agendaitem.originalId))
            logging.info("Getting agendaitem %d from %s",
                         agendaitem.originalId, agendaitem_url)

            agendaitem_r = self.get_url(agendaitem_url)
            if not agendaitem_r:
                return

            if len(agendaitem_r.history):
                logging.info("Agenda item %d from %s seems to be private",
                             meeting_id, meeting_url)
            else:
                agendaitem_xml = agendaitem_r.text.encode(
                    'ascii', 'xmlcharrefreplace')
                agendaitem_parser = etree.XMLParser(recover=True)
                agendaitem_root = etree.fromstring(agendaitem_xml,
                                                   parser=agendaitem_parser)
                add_agenda_item = {}
                for add_item in agendaitem_root[0].iterchildren():
                    if add_item.tag == "rtfWP" and len(add_item) > 0:
                        try:
                            agendaitem.resolution_text = h.unescape(
                                etree.tostring(add_item[0][1][0]))
                        except:
                            logging.warn(
                                "Unable to parse resolution text at "
                                "%s", agendaitem_url)
                    else:
                        if add_item.text:
                            add_agenda_item[add_item.tag] = h.unescape(
                                add_item.text)
                if 'toptext' in add_agenda_item:
                    agendaitem.name = add_agenda_item['toptext']

                # there are papers with id = 0. we don't need them.
                if int(elem['volfdnr']):
                    consult_id = (unicode(agendaitem.originalId) +
                                  unicode(int(elem['volfdnr'])))
                    consultation = Consultation(originalId=consult_id)
                    paper_id = int(elem['volfdnr'])
                    if 'voname' in add_agenda_item:
                        consultation.paper = Paper(
                            originalId=paper_id,
                            name=add_agenda_item['voname'])
                    else:
                        consultation.paper = Paper(originalId=paper_id)
                    agendaitem.consultation = [consultation]
                    if 'vobetr' in add_agenda_item:
                        if add_agenda_item['vobetr'] != agendaitem.name:
                            logging.warn(
                                "different values for name: %s and %s",
                                agendaitem.name, add_agenda_item['vobetr'])
                    if hasattr(self, 'paper_queue'):
                        self.paper_queue.add(int(elem['volfdnr']))
                if 'totyp' in add_agenda_item:
                    agendaitem.result = add_agenda_item['totyp']
                agendaitems.append(agendaitem)
        meeting.agendaItem = agendaitems

        oid = self.db.save_meeting(meeting)
        logging.info("Meeting %d stored with _id %s", meeting_id, oid)