def _fetch_xml(url):
    """Download `url` and return its content parsed as a minidom document.

    The HTTP response is closed as soon as parsing finishes, so a long
    import run does not accumulate open connections.
    """
    # Imported locally so this block does not depend on the file's
    # top-level import list.
    from contextlib import closing

    with closing(urllib.urlopen(url)) as response:
        return minidom.parse(response)


def _import_documents(issue, session_num):
    """Store the published documents of `issue` and find its main document.

    Fetches the issue's detail XML, creates a `Document` row for every
    published document that is not stored yet, and returns the lowest
    document number seen among the published documents. Returns None when
    the issue has no published documents.
    """
    issue_detail_xml = _fetch_xml(ISSUE_URL % (session_num, issue.issue_num))
    docs_container_xml = issue_detail_xml.getElementsByTagName(u'þingskjöl')[0]
    docs_xml = docs_container_xml.getElementsByTagName(u'þingskjal')

    # None means "no published document seen yet"; a real document number is
    # never confused with the sentinel this way.
    lowest_doc_num = None
    for doc_xml in docs_xml:
        # Make sure that this is indeed the correct issue.
        belongs_to_issue = (
            int(doc_xml.getAttribute(u'málsnúmer')) == issue.issue_num
            and int(doc_xml.getAttribute(u'þingnúmer')) == session_num
        )
        if not belongs_to_issue:
            continue

        doc_num = int(doc_xml.getAttribute(u'skjalsnúmer'))
        doc_type = doc_xml.getElementsByTagName(u'skjalategund')[0].firstChild.nodeValue
        # The feed's timestamp is stored with an explicit "+00:00" offset.
        time_published = doc_xml.getElementsByTagName(u'útbýting')[0].firstChild.nodeValue + "+00:00"

        paths_xml = doc_xml.getElementsByTagName(u'slóð')
        html_paths_xml = paths_xml[0].getElementsByTagName(u'html')
        pdf_paths_xml = paths_xml[0].getElementsByTagName(u'pdf')
        if not html_paths_xml:
            # No HTML path is treated as "not published"; such documents are
            # neither stored nor considered for the main document.
            print('Document not published: %d' % doc_num)
            continue

        path_html = html_paths_xml[0].firstChild.nodeValue
        # NOTE(review): assumes a PDF path is always present whenever an HTML
        # path is - confirm against the feed.
        path_pdf = pdf_paths_xml[0].firstChild.nodeValue

        if lowest_doc_num is None or doc_num < lowest_doc_num:
            lowest_doc_num = doc_num

        # One LIMIT 1 query instead of a COUNT followed by a second fetch.
        existing_docs = list(Document.objects.filter(doc_num=doc_num, issue=issue)[:1])
        if existing_docs:
            doc = existing_docs[0]
            print('Already have document: %s' % doc)
        else:
            doc = Document()
            doc.doc_num = doc_num
            doc.doc_type = doc_type
            doc.time_published = time_published
            doc.path_html = path_html
            doc.path_pdf = path_pdf
            doc.issue = issue
            doc.save()
            print('- Added document: %s' % doc)

    return lowest_doc_num


def update_issues():
    """Fetch a list of "recent" petitions on Althingi and update our database accordingly.

    For the session returned by `get_last_session_num()`:

    * the `Session` row is fetched or created,
    * every issue in the session's issue list is fetched or created,
    * every published document of each issue is fetched or created,
    * the published document with the lowest number is flagged as the
      issue's main document (`is_main = True`),
    * an issue without any published document is deleted again.

    Progress is reported on stdout. Returns None.
    """
    session_num = get_last_session_num()

    session, created = Session.objects.get_or_create(session_num=session_num)
    if created:
        print('Added session: %s' % session_num)
    else:
        print('Already have session: %s' % session_num)

    issue_list_xml = _fetch_xml(ISSUE_LIST_URL % session_num)

    for issue_xml in issue_list_xml.getElementsByTagName(u'mál'):
        name = issue_xml.getElementsByTagName(u'málsheiti')[0].firstChild.nodeValue

        # The description element may be empty; fall back to a fixed text.
        description_node = issue_xml.getElementsByTagName(u'efnisgreining')[0].firstChild
        if description_node is not None:
            description = description_node.nodeValue
        else:
            description = 'engin lýsing útgefin'

        issue_type = issue_xml.getElementsByTagName(u'málstegund')[0].getAttribute(u'málstegund')
        issue_num = int(issue_xml.getAttribute(u'málsnúmer'))

        # One LIMIT 1 query instead of a COUNT followed by a second fetch.
        existing_issues = list(Issue.objects.filter(issue_num=issue_num, session=session)[:1])
        if existing_issues:
            issue = existing_issues[0]
            print('Already have issue: %s' % issue)
        else:
            issue = Issue()
            issue.issue_num = issue_num
            issue.issue_type = issue_type
            issue.name = name
            issue.description = description
            issue.session = session
            issue.save()
            print('Added issue: %s' % issue)

        # Import the issue's documents.
        lowest_doc_num = _import_documents(issue, session_num)

        if lowest_doc_num is None:
            issue.delete()
            print('- Has no documents, being removed')
            continue

        # Lowest document number will always be the main document of the issue.
        main_doc = Document.objects.get(issue=issue, doc_num=lowest_doc_num)
        main_doc.is_main = True
        main_doc.save()

        print('- Main document determined to be: %s' % main_doc)