def post(self):
    """Release a work item that a queue worker has finished with.

    Reads "queue_name" and "attachment_id" from the request and answers
    with a 404 when no queue of that name exists. The item is taken off
    the queue's work items unless the attachment's last status for this
    queue is a retry request; the active-work-item entry is expired in
    either case.
    """
    queue_name = self.request.get("queue_name")
    # FIXME: This queue lookup should be shared between handlers.
    queue = Queue.queue_with_name(queue_name)
    if not queue:
        self.error(404)
        return

    attachment_id = self._int_from_request("attachment_id")
    last_status = Attachment(attachment_id).status_for_queue(queue)

    # Ideally we should use a transaction for the calls to
    # WorkItems and ActiveWorkItems.

    # A retry request keeps the item queued. A missing status is handled
    # like a normal stop, which keeps the handler easy to test.
    if last_status and last_status.is_retry_request():
        RecordPatchEvent.retrying(attachment_id, queue_name)
    else:
        queue.work_items().remove_work_item(attachment_id)
        RecordPatchEvent.stopped(attachment_id, queue_name)

    # Always release the lock on the item.
    queue.active_work_items().expire_item(attachment_id)
def get(self, attachment_id):
    """Render the status bubble page for one attachment.

    Builds one bubble per entry in self._queues_to_display and writes the
    rendered "templates/statusbubble.html" template to the response.
    """
    attachment = Attachment(int(attachment_id))
    context = {
        "bubbles": [
            self._build_bubble(queue_pair, attachment)
            for queue_pair in self._queues_to_display
        ],
    }
    out = self.response.out
    out.write(template.render("templates/statusbubble.html", context))
def get(self, attachment_id_string):
    """Render the status bubble page for one attachment.

    The bubbles and the "show_submit_to_ews" flag both come from
    self._build_bubbles_for_attachment(); they are passed to the
    "templates/statusbubble.html" template together with the numeric
    attachment id.
    """
    attachment_id = int(attachment_id_string)
    bubbles, show_submit_to_ews = self._build_bubbles_for_attachment(
        Attachment(attachment_id))

    context = {}
    context["bubbles"] = bubbles
    context["attachment_id"] = attachment_id
    context["show_submit_to_ews"] = show_submit_to_ews

    out = self.response.out
    out.write(template.render("templates/statusbubble.html", context))
def get(self, attachment_id_string):
    """Render the status bubble page for one attachment.

    self._build_bubbles_for_attachment() yields the bubbles plus two
    flags. Only "show_failure_to_apply" is forwarded to the template; the
    submit-to-EWS flag it returns is deliberately ignored and the template
    always receives False for it.
    """
    attachment_id = int(attachment_id_string)
    bubble_info = self._build_bubbles_for_attachment(Attachment(attachment_id))
    bubbles, _ignored_show_submit, show_failure_to_apply = bubble_info

    context = {
        "bubbles": bubbles,
        "attachment_id": attachment_id,
        # Disabled Submit to old EWS button for now.
        "show_submit_to_ews": False,
        "show_failure_to_apply": show_failure_to_apply,
    }
    out = self.response.out
    out.write(template.render("templates/statusbubble.html", context))
def post(self):
    """Store an uploaded file and register it as an Attachment record.

    The raw request body is the file content. The "Content-Type" header
    selects the handling:

    * a type listed in options.allowImageFileType is stored twice: a PNG
      thumbnail (at most 260x180) under the generated name and the
      untouched upload under the generated name plus "origin";
    * a type listed in options.allowDocumentFileType is stored as-is;
    * anything else is rejected via writeFailedResult().

    The optional "X_filename" header carries the URL-quoted original file
    name. On success the new Attachment is committed and reported through
    writeSuccessResult().

    Raises:
        Whatever db.session.commit() raises; the session is rolled back
        before the exception propagates.
    """
    fileDescription = self.request.body
    if fileDescription is None:
        return

    # Read the owner first so that a missing user fails before anything
    # has been written to disk.
    ownerId = self.current_user.id

    contentType = self.request.headers.get("Content-Type")
    originalName = self.request.headers.get("X_filename")
    if originalName:
        originalName = urllib.parse.unquote(originalName)

    width = 0
    height = 0
    storedName = generateFileName()
    filePath = '%s/%s' % (options.attachmentPath, storedName)

    if contentType in options.allowImageFileType:
        fileType = '0'
        image = Image.open(BytesIO(fileDescription))
        # Record the original dimensions before thumbnail() shrinks the
        # image in place.
        width, height = image.size
        image.thumbnail((260, 180), Image.ANTIALIAS)
        image.save(filePath, 'PNG', optimize=True)
        # The thumbnail now owns the plain name; the original upload is
        # written next to it with an "origin" suffix.
        filePath += 'origin'
    elif contentType in options.allowDocumentFileType:
        fileType = '1'
    else:
        self.writeFailedResult()
        self.finish()
        return

    with open(filePath, 'wb') as out:
        out.write(fileDescription)

    attachment = Attachment(
        url=storedName,
        name=originalName,
        contentType=contentType,
        width=width,
        height=height,
        fileType=fileType,
        own_id=ownerId,
        createTime=datetime.datetime.now(),
    )
    try:
        db.session.add(attachment)
        db.session.commit()
    except:
        # Bare except is intentional: roll back on any failure, then
        # let the original exception propagate unchanged.
        db.session.rollback()
        raise
    self.writeSuccessResult(attachment)
def post(self):
    """Remove an attachment from a queue and release its active entry.

    Reads "queue_name" and "attachment_id" from the request and answers
    with a 404 when no queue of that name exists. Otherwise the item is
    removed from the queue's work items, a "stopped" patch event is
    recorded and the active-work-item entry is expired.
    """
    queue_name = self.request.get("queue_name")
    # FIXME: This queue lookup should be shared between handlers.
    queue = Queue.queue_with_name(queue_name)
    if not queue:
        self.error(404)
        return

    attachment_id = self._int_from_request("attachment_id")
    # NOTE(review): the looked-up status is never used below; the call is
    # kept only in case it has side effects -- confirm and drop if not.
    Attachment(attachment_id).status_for_queue(queue)

    # Ideally we should use a transaction for the calls to
    # WorkItems and ActiveWorkItems.
    pending_items = queue.work_items()
    pending_items.remove_work_item(attachment_id)
    RecordPatchEvent.stopped(attachment_id, queue_name)
    active_items = queue.active_work_items()
    active_items.expire_item(attachment_id)
def post(self):
    """Add an attachment to the EWS queues, optionally redirecting.

    Reads "attachment_id" from the request and hands the attachment to
    self._add_attachment_to_ews_queues(). When the request's "next_action"
    is "return_to_bubbles" the client is redirected to the attachment's
    status bubble page.
    """
    attachment_id = self._int_from_request("attachment_id")
    patch = Attachment(attachment_id)
    self._add_attachment_to_ews_queues(patch)

    if self.request.get("next_action") != "return_to_bubbles":
        return
    self.redirect("/status-bubble/%s" % attachment_id)
def get_session(self, session_url=None, session_id=None):
    """
    Load session details for the given detail page URL or numeric ID.

    Fetches the session detail page, extracts title, committee, identifier,
    date/time, address, description, agenda items and session-level
    attachments, and stores the result via self.db.save_session().

    Args:
        session_url: URL of the session detail page. Only used to derive
            the numeric ID when session_id is not given.
        session_id: Numeric session ID. Takes precedence over session_url.

    Returns:
        None. Returns early, without storing anything, when the page
        reports a server error, a permission error or "no data found".

    Raises:
        TemplateError: if the title, the committee link, the identifier
            table, the session identifier or the agenda rows cannot be
            found with the configured XPath expressions.
        ValueError: if an agenda item detail row carries an unknown label.
    """
    # Read either session_id or session_url from the opposite
    if session_id is not None:
        session_url = self.urls['SESSION_DETAIL_PRINT_PATTERN'] % session_id
    elif session_url is not None:
        parsed = parse.search(self.urls['SESSION_DETAIL_PARSE_PATTERN'],
                              session_url)
        session_id = parsed['session_id']

    logging.info("Getting session %d from %s", session_id, session_url)

    session = Session(numeric_id=session_id)

    time.sleep(self.config.WAIT_TIME)
    response = self.user_agent.open(session_url)
    # forms for later attachment download
    mechanize_forms = mechanize.ParseResponse(response,
                                              backwards_compat=False)
    # seek(0) is necessary to reset response pointer.
    response.seek(0)
    html = response.read()
    # NOTE(review): the previous code replaced a plain space with a plain
    # space, which is a no-op. Assumed intent: normalise non-breaking-space
    # entities before parsing -- confirm.
    html = html.replace('&nbsp;', ' ')
    parser = etree.HTMLParser()
    dom = etree.parse(StringIO(html), parser)

    # check for page errors (best effort: pages without an <h1>, or with an
    # empty one, are simply not error pages)
    try:
        page_title = dom.xpath('//h1')[0].text
        if 'Fehlermeldung' in page_title:
            logging.info("Page %s cannot be accessed due to server error",
                         session_url)
            if self.options.verbose:
                print("Page %s cannot be accessed due to server error" % session_url)
            return
        if 'Berechtigungsfehler' in page_title:
            logging.info("Page %s cannot be accessed due to permissions",
                         session_url)
            if self.options.verbose:
                print("Page %s cannot be accessed due to permissions" % session_url)
            return
    except Exception:
        pass
    try:
        error_h3 = dom.xpath('//h3[@class="smc_h3"]')[0].text.strip()
        if 'Keine Daten gefunden' in error_h3:
            logging.info("Page %s does not contain any agenda items",
                         session_url)
            if self.options.verbose:
                print("Page %s does not contain agenda items" % session_url)
            return
    except Exception:
        pass

    session.original_url = session_url

    # Session title
    try:
        session.title = dom.xpath(self.xpath['SESSION_DETAIL_TITLE'])[0].text
    except Exception:
        logging.critical(
            'Cannot find session title element using XPath SESSION_DETAIL_TITLE')
        raise TemplateError(
            'Cannot find session title element using XPath SESSION_DETAIL_TITLE')

    # Committee link
    try:
        links = dom.xpath(self.xpath['SESSION_DETAIL_COMMITTEE_LINK'])
        for link in links:
            href = link.get('href')
            parsed = parse.search(
                self.urls['COMMITTEE_DETAIL_PARSE_PATTERN'], href)
            if parsed is not None:
                session.committee_id = parsed['committee_id']
    except Exception:
        logging.critical(
            'Cannot find link to committee detail page using SESSION_DETAIL_COMMITTEE_LINK_XPATH')
        raise TemplateError(
            'Cannot find link to committee detail page using SESSION_DETAIL_COMMITTEE_LINK_XPATH')

    # Session identifier, date, address etc
    tds = dom.xpath(self.xpath['SESSION_DETAIL_IDENTIFIER_TD'])
    if len(tds) == 0:
        logging.critical(
            'Cannot find table fields using SESSION_DETAIL_IDENTIFIER_TD_XPATH')
        raise TemplateError(
            'Cannot find table fields using SESSION_DETAIL_IDENTIFIER_TD_XPATH')
    for n, td in enumerate(tds):
        try:
            tdcontent = td.text.strip()
            nextcontent = tds[n + 1].text.strip()
        except Exception:
            # Not a label/value pair: the cell or its successor has no
            # text, or this is the last cell.
            continue
        if tdcontent == 'Sitzung:':
            session.identifier = nextcontent
        elif tdcontent == 'Gremium:':
            session.committee_name = nextcontent
        elif tdcontent == 'Datum:':
            datestring = nextcontent
            # An optional 'Zeit:' label/value pair may follow the date.
            # Both cells must exist before they are read.
            if (n + 3 < len(tds)
                    and tds[n + 2].text == 'Zeit:'
                    and tds[n + 3].text is not None):
                datestring += ' ' + tds[n + 3].text
            session.date_start = datestring
        elif tdcontent == 'Raum:':
            session.address = " ".join(tds[n + 1].xpath('./text()'))
        elif tdcontent == 'Bezeichnung:':
            session.description = nextcontent
    if not hasattr(session, 'identifier'):
        logging.critical(
            'Cannot find session identifier using XPath SESSION_DETAIL_IDENTIFIER_TD')
        raise TemplateError(
            'Cannot find session identifier using XPath SESSION_DETAIL_IDENTIFIER_TD')

    # Agendaitems
    found_attachments = []
    rows = dom.xpath(self.xpath['SESSION_DETAIL_AGENDA_ROWS'])
    if len(rows) == 0:
        logging.critical(
            'Cannot find agenda using XPath SESSION_DETAIL_AGENDA_ROWS')
        raise TemplateError(
            'Cannot find agenda using XPath SESSION_DETAIL_AGENDA_ROWS')

    agendaitems = {}
    agendaitem_id = None
    # Flipped to False once the "non-public" subheading row is passed.
    public = True
    for row in rows:
        row_id = row.get('id')
        # Rows without a class attribute must not abort the whole scrape.
        row_classes = (row.get('class') or '').split(' ')
        fields = row.xpath('td')
        number_texts = fields[0].xpath('./text()')
        number = number_texts[0] if len(number_texts) > 0 else None
        if row_id is not None:
            # Agendaitem main row
            agendaitem_id = row_id.rsplit('_', 1)[1]
            item = {'id': int(agendaitem_id)}
            agendaitems[agendaitem_id] = item
            if number is not None:
                item['number'] = number
            item['subject'] = "; ".join(fields[1].xpath('./text()'))
            item['public'] = public
            # submission links
            links = row.xpath(
                self.xpath['SESSION_DETAIL_AGENDA_ROWS_SUBMISSION_LINK'])
            submissions = []
            for link in links:
                href = link.get('href')
                if href is None:
                    continue
                parsed = parse.search(
                    self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
                if parsed is not None:
                    submission = Submission(
                        numeric_id=int(parsed['submission_id']),
                        identifier=link.text)
                    submissions.append(submission)
                    # Add submission to submission queue
                    if hasattr(self, 'submission_queue'):
                        self.submission_queue.add(
                            int(parsed['submission_id']))
            if len(submissions):
                item['submissions'] = submissions
            # Note: we don't scrape agendaitem-related attachments for
            # now, based on the assumption that they are all found via
            # submission detail pages. All we do here is get a list of
            # attachment IDs in found_attachments.
            for form in row.xpath('.//form'):
                for hidden_field in form.xpath('input'):
                    if hidden_field.get('name') != 'DT':
                        continue
                    found_attachments.append(hidden_field.get('value'))
        elif 'smc_tophz' in row_classes:
            # additional (optional row for agendaitem)
            label = fields[1].text
            value = fields[2].text
            if label is not None and value is not None:
                label = label.strip()
                value = value.strip()
                if label in ['Ergebnis:', 'Beschluss:']:
                    if value in self.config.RESULT_STRINGS:
                        agendaitems[agendaitem_id]['result'] = \
                            self.config.RESULT_STRINGS[value]
                    else:
                        logging.warning(
                            "String '%s' not found in configured RESULT_STRINGS",
                            value)
                        if self.options.verbose:
                            print("WARNING: String '%s' not found in RESULT_STRINGS\n" % value)
                        # Fall back to the raw, unmapped string.
                        agendaitems[agendaitem_id]['result'] = value
                elif label == 'Bemerkung:':
                    agendaitems[agendaitem_id]['result_note'] = value
                elif label == 'Abstimmung:':
                    agendaitems[agendaitem_id]['voting'] = value
                else:
                    logging.critical(
                        "Agendaitem info label '%s' is unknown", label)
                    raise ValueError(
                        'Agendaitem info label "%s" is unknown' % label)
        elif 'smcrowh' in row_classes:
            # Subheading (public / nonpublic part)
            if fields[0].text is not None and "Nicht öffentlich" in fields[0].text.encode('utf-8'):
                public = False
    session.agendaitems = agendaitems.values()

    # session-related attachments
    containers = dom.xpath(self.xpath['SESSION_DETAIL_ATTACHMENTS'])
    for container in containers:
        classes = container.get('class')
        if classes is None:
            continue
        classes = classes.split(' ')
        if self.xpath['SESSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME'] not in classes:
            continue
        attachments = []
        for row in container.xpath('.//tr'):
            for form in row.xpath('.//form'):
                name = " ".join(row.xpath('./td/text()')).strip()
                for hidden_field in form.xpath('input'):
                    if hidden_field.get('name') != 'DT':
                        continue
                    attachment_id = hidden_field.get('value')
                    # make sure to add only those which aren't
                    # agendaitem-related
                    if attachment_id in found_attachments:
                        continue
                    attachment = Attachment(identifier=attachment_id,
                                            name=name)
                    # Traversing the whole mechanize response to submit
                    # this form
                    for mform in mechanize_forms:
                        for control in mform.controls:
                            if control.name == 'DT' and control.value == attachment_id:
                                attachment = self.get_attachment_file(
                                    attachment, mform)
                    attachments.append(attachment)
                    found_attachments.append(attachment_id)
        if len(attachments):
            session.attachments = attachments

    oid = self.db.save_session(session)
    if self.options.verbose:
        logging.info("Session %d stored with _id %s", session_id, oid)
classes = container.get('class').split(' ') except: continue if self.xpath[ 'SUBMISSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME'] not in classes: continue rows = container.xpath('.//tr') for row in rows: forms = row.xpath('.//form') for form in forms: name = " ".join(row.xpath('./td/text()')).strip() for hidden_field in form.xpath('input[@name="DT"]'): attachment_id = hidden_field.get('value') if attachment_id in found_attachments: continue attachment = Attachment(identifier=attachment_id, name=name) #print attachment_id # Traversing the whole mechanize response to submit this form for mform in mechanize_forms: #print "Form found: '%s'" % mform for control in mform.controls: if control.name == 'DT' and control.value == attachment_id: got_attachment = False try: attachment = self.get_attachment_file( attachment, mform) got_attachment = True except: # Second attempt in case of a stupid network error # (see #22) time.sleep(3)
def get(self, attachment_id):
    """Render the status bubble template with the attachment's summary.

    The only template value is "summary", taken from Attachment.summary().
    """
    patch = Attachment(int(attachment_id))
    context = {}
    context["summary"] = patch.summary()
    out = self.response.out
    out.write(template.render("templates/statusbubble.html", context))
def get_submission(self, submission_url=None, submission_id=None):
    """
    Load submission (Vorlage) details for the submission given by detail
    page URL or numeric ID.

    Fetches the detail page (up to three attempts when the server answers
    with an error page), extracts title, identifier, type, date, subject,
    superordinate/subordinate references and attachments, and stores the
    result via self.db.save_submission(). Referenced submissions are added
    to self.submission_queue when that attribute exists.

    Args:
        submission_url: URL of the submission detail page. Only used to
            derive the numeric ID when submission_id is not given.
        submission_id: Numeric submission ID. Takes precedence over
            submission_url.

    Returns:
        None. Returns early, without storing anything, when the page keeps
        reporting a server error or reports a permission error.

    Raises:
        TemplateError: if the title, the identifier table or the
            submission identifier cannot be found with the configured
            XPath expressions.
        SystemExit: on HTTP 404, after writing a hint to stderr.
        urllib2.HTTPError: for any other HTTP error while fetching.
    """
    # Read either submission_id or submission_url from the opposite
    if submission_id is not None:
        submission_url = self.urls['SUBMISSION_DETAIL_PRINT_PATTERN'] % submission_id
    elif submission_url is not None:
        parsed = parse.search(self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'],
                              submission_url)
        submission_id = parsed['submission_id']

    logging.info("Getting submission %d from %s", submission_id,
                 submission_url)

    submission = Submission(numeric_id=submission_id)

    try_until = 1
    try_counter = 0
    while try_counter < try_until:
        try_counter += 1
        time.sleep(self.config.WAIT_TIME)
        try:
            response = self.user_agent.open(submission_url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                sys.stderr.write(
                    "URL not found (HTTP 404) error caught: %s\n" % submission_url)
                sys.stderr.write(
                    "Please check BASE_URL in your configuration.\n")
                sys.exit(1)
            # Any other HTTP error leaves no usable response for this
            # attempt, so surface it instead of parsing a missing or
            # stale one.
            raise
        mechanize_forms = mechanize.ParseResponse(response,
                                                  backwards_compat=False)
        response.seek(0)
        html = response.read()
        # NOTE(review): the previous code replaced a plain space with a
        # plain space, which is a no-op. Assumed intent: normalise
        # non-breaking-space entities before parsing -- confirm.
        html = html.replace('&nbsp;', ' ')
        parser = etree.HTMLParser()
        dom = etree.parse(StringIO(html), parser)

        # Reset per attempt so a title from an earlier attempt is never
        # reused; pages without an <h1> are not error pages.
        page_title = None
        try:
            page_title = dom.xpath('//h1')[0].text
        except Exception:
            pass

        # Fetch the page again when the server returns an unknown, randomly
        # occurring error page without an error message (seen in Duisburg,
        # presumably a broken server configuration).
        if page_title is not None and 'Fehler' in page_title:
            try_until = 3
            logging.info(
                "Original RIS Server Bug, restart scraping submission %s",
                submission_url)
            continue
        # Usable page: stop fetching so it is processed exactly once.
        break
    else:
        # Every attempt ended on an error page.
        logging.info("Page %s cannot be accessed due to server error",
                     submission_url)
        if self.options.verbose:
            print("Page %s cannot be accessed due to server error" % submission_url)
        return

    # check for page errors
    if page_title is not None and 'Berechtigungsfehler' in page_title:
        logging.info("Page %s cannot be accessed due to permissions",
                     submission_url)
        if self.options.verbose:
            print("Page %s cannot be accessed due to permissions" % submission_url)
        return

    submission.original_url = submission_url

    # Submission title
    try:
        stitle = dom.xpath(self.xpath['SUBMISSION_DETAIL_TITLE'])
        submission.title = stitle[0].text
    except Exception:
        logging.critical(
            'Cannot find submission title element using XPath SUBMISSION_DETAIL_TITLE')
        raise TemplateError(
            'Cannot find submission title element using XPath SUBMISSION_DETAIL_TITLE')

    def queue_linked_submissions(cell):
        # Add every submission linked from the given table cell to the
        # submission queue, if this scraper has one.
        for link in cell.xpath('a'):
            href = link.get('href')
            if href is None:
                continue
            parsed_link = parse.search(
                self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
            if hasattr(self, 'submission_queue') and parsed_link is not None:
                self.submission_queue.add(parsed_link['submission_id'])

    # Submission identifier, date, type etc
    tds = dom.xpath(self.xpath['SUBMISSION_DETAIL_IDENTIFIER_TD'])
    if len(tds) == 0:
        logging.critical(
            'Cannot find table fields using XPath SUBMISSION_DETAIL_IDENTIFIER_TD')
        logging.critical('HTML Dump:' + html)
        raise TemplateError(
            'Cannot find table fields using XPath SUBMISSION_DETAIL_IDENTIFIER_TD')
    current_category = None
    for n, td in enumerate(tds):
        try:
            tdcontent = td.text.strip()
        except Exception:
            # Cell without text cannot be a label.
            continue
        if tdcontent == 'Name:':
            submission.identifier = tds[n + 1].text.strip()
        elif tdcontent == 'Art:':
            submission.type = tds[n + 1].text.strip()
        elif tdcontent == 'Datum:':
            submission.date = tds[n + 1].text.strip()
        elif tdcontent == 'Betreff:':
            submission.subject = '; '.join(tds[n + 1].xpath('./text()'))
        elif tdcontent == 'Referenzvorlage:':
            link = tds[n + 1].xpath('a')[0]
            href = link.get('href')
            parsed = parse.search(
                self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
            submission.superordinate = {
                'identifier': link.text.strip(),
                'numeric_id': parsed['submission_id']
            }
            # add superordinate submission to queue
            if hasattr(self, 'submission_queue'):
                self.submission_queue.add(parsed['submission_id'])
        elif tdcontent == 'Untergeordnete Vorlage(n):':
            # subordinate submissions are added to the queue
            current_category = 'subordinates'
            queue_linked_submissions(tds[n + 1])
        elif current_category == 'subordinates' and n + 1 < len(tds):
            # Cells after the subordinate label are scanned for further
            # subordinate links; the last cell has no successor.
            queue_linked_submissions(tds[n + 1])
    if not hasattr(submission, 'identifier'):
        logging.critical(
            'Cannot find submission identifier using XPath SUBMISSION_DETAIL_IDENTIFIER_TD')
        raise TemplateError(
            'Cannot find submission identifier using XPath SUBMISSION_DETAIL_IDENTIFIER_TD')

    # "Beratungsfolge" (list of sessions for this submission)
    # This is currently not parsed for scraping, but only for
    # gathering session-attachment ids for later exclusion
    found_attachments = set()
    rows = dom.xpath(self.xpath['SUBMISSION_DETAIL_AGENDA_ROWS'])
    for row in rows:
        formfields = row.xpath('.//input[@type="hidden"][@name="DT"]')
        if len(formfields):
            attachment_id = formfields[0].get('value')
            if attachment_id is not None:
                found_attachments.add(attachment_id)

    # submission-related attachments
    submission.attachments = []
    containers = dom.xpath(self.xpath['SUBMISSION_DETAIL_ATTACHMENTS'])
    for container in containers:
        classes = container.get('class')
        if classes is None:
            continue
        classes = classes.split(' ')
        if self.xpath['SUBMISSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME'] not in classes:
            continue
        for row in container.xpath('.//tr'):
            for form in row.xpath('.//form'):
                name = " ".join(row.xpath('./td/text()')).strip()
                for hidden_field in form.xpath('input[@name="DT"]'):
                    attachment_id = hidden_field.get('value')
                    if attachment_id in found_attachments:
                        continue
                    attachment = Attachment(identifier=attachment_id,
                                            name=name)
                    # Traversing the whole mechanize response to submit
                    # this form
                    for mform in mechanize_forms:
                        for control in mform.controls:
                            if control.name == 'DT' and control.value == attachment_id:
                                attachment = self.get_attachment_file(
                                    attachment, mform)
                                submission.attachments.append(attachment)

    # NOTE(review): an older comment here said "forcing overwrite=True",
    # but no such argument is passed -- confirm save_submission() defaults.
    self.db.save_submission(submission)
def post(self):
    """Add the attachment named by the request to the EWS queues.

    Reads "attachment_id" from the request and passes the corresponding
    Attachment to self._add_attachment_to_ews_queues().
    """
    requested_id = self._int_from_request("attachment_id")
    self._add_attachment_to_ews_queues(Attachment(requested_id))