Example #1
0
    def handle(self, **options):
        self.verbosity = int(options['verbosity'])
        self.logger = logging.getLogger(__name__)
        self.options = options
        self.data_path = os.path.join(settings.PROJECT_ROOT, 'data')
        addr_fname = os.path.join(self.data_path, 'pks_osoite.csv')
        if os.path.isfile(addr_fname):
            addr_f = open(addr_fname, 'r')
            self.geocoder = AhjoGeocoder()
            self.geocoder.load_address_database(addr_f)
            addr_f.close()
        else:
            print "Address database not found; geocoder not available."
            self.geocoder = None

        self.import_committees()
        self.import_categories()
        self.scanner = AhjoScanner(verbosity=self.verbosity)
        doc_list = self.scanner.scan_documents(cached=options['cached'])
        media_dir = settings.MEDIA_ROOT
        self.scanner.doc_store_path = os.path.join(media_dir, settings.AHJO_PATHS['zip'])
        self.xml_path = os.path.join(media_dir, settings.AHJO_PATHS['xml'])
        self.attachment_path = os.path.join(media_dir, settings.AHJO_PATHS['attachment'])
        self.video_path = os.path.join(media_dir, settings.AHJO_PATHS['video'])
        for path in (self.xml_path, self.attachment_path, self.video_path):
            if not os.path.exists(path):
                os.makedirs(path)

        for info in doc_list:
            if options['meeting_id']:
                if info['origin_id'] == options['meeting_id']:
                    self.import_doc(info)
                    break
                else:
                    continue

            if options['start_from']:
                if options['start_from'] == info['origin_id']:
                    options['start_from'] = ''
                else:
                    continue

            if options['committee_id'] and info['committee_id'] != options['committee_id']:
                continue
            self.import_doc(info)
        else:
            if options['meeting_id']:
                print "No meeting document with id '%s' found" % options['meeting_id']
                exit(1)

        if self.geocoder and self.geocoder.no_match_addresses:
            print "No coordinate match found for addresses:"
            for adr in set(self.geocoder.no_match_addresses):
                print adr
        if self.failed_import_list:
            print "Importing failed for following documents:"
            for doc in self.failed_import_list:
                print doc
Example #2
0
    def handle(self, **options):
        self.verbosity = int(options['verbosity'])
        self.logger = logging.getLogger(__name__)
        self.options = options
        self.data_path = os.path.join(settings.PROJECT_ROOT, 'data')
        self.geocoder = AhjoGeocoder()

        self.import_policymakers()
        self.import_categories()
        self.scanner = AhjoScanner(verbosity=self.verbosity)
        doc_list = self.scanner.scan_documents(cached=options['cached'])
        media_dir = settings.MEDIA_ROOT
        self.scanner.doc_store_path = os.path.join(media_dir, settings.AHJO_PATHS['zip'])
        self.xml_path = os.path.join(media_dir, settings.AHJO_PATHS['xml'])
        self.attachment_path = os.path.join(media_dir, settings.AHJO_PATHS['attachment'])
        self.video_path = os.path.join(media_dir, settings.AHJO_PATHS['video'])
        for path in (self.xml_path, self.attachment_path, self.video_path):
            if not os.path.exists(path):
                os.makedirs(path)

        plan_path = os.path.join(self.data_path, 'plans')
        if os.path.isdir(plan_path) and not options['no_geocoding']:
            self.geocoder.load_plans(os.path.join(plan_path, 'Kaava_Vireilla.tab'))
            self.geocoder.load_plans(os.path.join(plan_path, 'Kaava_Voimassa.tab'))
            self.geocode_plans = True
        else:
            print "Plan database not found; plan geocoding not available."
            self.geocode_plans = False

        property_path = os.path.join(self.data_path, 'properties')
        if os.path.isdir(property_path) and not options['no_geocoding']:
            self.geocoder.load_plan_units(os.path.join(property_path, 'Kaava_kaavayksikko_Voimassa.tab'))
            self.geocoder.load_properties(os.path.join(property_path, 'GISestx.csv'))
            self.geocode_plan_units = True
        else:
            print "Plan unit database not found; plan unit geocoding not available."
            self.geocode_plan_units = False

        addr_fname = os.path.join(self.data_path, 'pks_osoite.csv')
        if os.path.isfile(addr_fname) and not options['no_geocoding']:
            addr_f = open(addr_fname, 'r')
            self.geocoder.load_address_database(addr_f)
            addr_f.close()
            self.geocode_addresses = True
        else:
            print "Address database not found; address geocoding not available."
            self.geocode_addresses = False

        for info in doc_list:
            if options['meeting_id']:
                if info['origin_id'] == options['meeting_id']:
                    self.import_doc(info)
                    break
                else:
                    continue

            if options['start_from']:
                if options['start_from'] == info['origin_id']:
                    options['start_from'] = ''
                else:
                    continue

            if options['policymaker_id'] and info['policymaker_id'] != options['policymaker_id']:
                continue
            self.import_doc(info)
        else:
            if options['meeting_id']:
                print "No meeting document with id '%s' found" % options['meeting_id']
                exit(1)

        if self.geocoder.no_match_addresses:
            s = u"No coordinate match found for addresses:\n"
            for adr in set(self.geocoder.no_match_addresses):
                s += adr.decode('utf8') + '\n'
            self.logger.info(s)
        if self.geocoder.no_match_plans:
            print "No coordinate match found for plans:"
            for plan in self.geocoder.no_match_plans:
                print plan
        if self.failed_import_list:
            print "Importing failed for following documents:"
            for doc in self.failed_import_list:
                print doc
Example #3
0
class Command(BaseCommand):
    help = "Import OpenAHJO documents"
    option_list = BaseCommand.option_list + (
        make_option('--cached', dest='cached', action='store_true', help='cache HTTP requests'),
        make_option('--meeting-id', dest='meeting_id', action='store', help='import one meeting'),
        make_option('--start-from', dest='start_from', action='store', help='start from provided meeting'),
        make_option('--policymaker-id', dest='policymaker_id', action='store', help='process only provided policymaker'),
        make_option('--full-update', dest='full_update', action='store_true', help='perform full update (i.e. replace existing elements)'),
        make_option('--no-attachments', dest='no_attachments', action='store_true', help='do not process document attachments'),
        make_option('--no-videos', dest='no_videos', action='store_true', help='do not import meeting videos'),
        make_option('--no-geocoding', dest='no_geocoding', action='store_true', help='do not perform geocoding'),
        make_option('--force-policymakers', dest='force_policymakers', action='store_true', help='force importing of policymakers'),
        make_option('--ignore-attachment-size', dest='ignore-attachment-size', action='store_true', help='disable attachment size checks')
    )

    def __init__(self):
        self.failed_import_list = []
        return super(Command, self).__init__()

    def geocode_issue(self, issue, info):
        # Attempt to geocode first from subject and keywords.
        # If no matches are found, attempt to geocode from content text.
        text_list = []
        text_list.append(info['subject'])
        for kw in info['keywords']:
            text_list.append(kw)
        geom_list = self.geocoder.geocode_from_text_list(text_list)
        matched_texts = set()

        districts = {}
        for g in geom_list:
            matched_texts.add(g['text'])
            del g['text']
            args = dict(type=g['type'], name=g['name'])
            try:
                igeom = IssueGeometry.objects.get(**args)
            except IssueGeometry.DoesNotExist:
                args['geometry'] = g['geometry']
                igeom = IssueGeometry(**args)
                igeom.save()
            issue.geometries.add(igeom)
            # Assume geometry doesn't change.
            #igeom.geometry = g['geometry']
            #igeom.save()
            if igeom.type == 'district':
                continue
            # workaround for invalid plan geometry
            if g['type'] == 'plan' and g['name'] == '12079':
                continue
            d_list = District.objects.filter(borders__contains=igeom.geometry)
            for d in d_list:
                districts[d.pk] = d

        issue.districts = districts.values()
        return matched_texts

    def store_keywords(self, issue, text_list):
        for kw in text_list:
            if kw in ['Valtuustoaloite',
                      'Toivomusponnet']:
                kw = kw.lower()
            keyword, _ = IssueKeyword.objects.get_or_create(name=kw)
            issue.keywords.add(keyword)

    def store_issue(self, meeting, meeting_doc, info, adoc):
        try:
            issue = Issue.objects.get(register_id=info['register_id'])
        except Issue.DoesNotExist:
            issue = Issue(register_id=info['register_id'])

        if not issue.subject:
            issue.subject = info['subject']
        else:
            issue.subject = issue.determine_subject()

        s = info['category']
        m = re.match(r"[\d\s]+", s)
        cat_id = s[0:m.end()].strip()
        category = Category.objects.get(origin_id=cat_id)
        issue.category = category
        issue.reference_text = info.get('reference_text')
        issue.save()

        geo_matches = self.geocode_issue(issue, info)
        text_list = [i for i in info['keywords'] if i not in geo_matches]
        self.store_keywords(issue, text_list)

        try:
            agenda_item = AgendaItem.objects.get(issue=issue, meeting=meeting)
        except AgendaItem.DoesNotExist:
            agenda_item = AgendaItem(issue=issue, meeting=meeting)
        agenda_item.subject = info['subject']
        agenda_item.index = info['number']
        agenda_item.from_minutes = meeting_doc.type == 'minutes'
        agenda_item.origin_last_modified_time = meeting_doc.last_modified_time
        agenda_item.resolution = info.get('resolution')
        agenda_item.preparer = info.get('preparer')
        agenda_item.introducer = info.get('introducer')
        agenda_item.classification_code = info.get('classification_code')
        agenda_item.classification_description = info.get('classification_description')
        agenda_item.save()

        latest_date = issue.determine_latest_decision_date()
        if latest_date != issue.latest_decision_date:
            issue.latest_decision_date = latest_date
            issue.save()

        for idx, p in enumerate(info['content']):
            args = {'agenda_item': agenda_item, 'index': idx}
            try:
                section = ContentSection.objects.get(**args)
            except ContentSection.DoesNotExist:
                section = ContentSection(**args)
            section.type = p[0]
            section.text = '\n'.join(p[1])
            section.save()

        if self.options['no_attachments']:
            return
        for att in info['attachments']:
            args = {'agenda_item': agenda_item, 'number': att['number']}
            try:
                obj = Attachment.objects.get(**args)
            except Attachment.DoesNotExist:
                obj = Attachment(**args)
            if not att['public']:
                obj.public = False
                obj.file = None
                obj.hash = None
                obj.save()
                continue
            adoc.extract_zip_attachment(att, self.attachment_path)
            obj.public = True
            obj.file = os.path.join(settings.AHJO_PATHS['attachment'], att['file'])
            obj.file_type = att['type']
            obj.hash = att['hash']
            obj.name = att['name']
            obj.save()

    @transaction.commit_on_success
    def import_doc(self, info):
        origin_id = info['origin_id']
        try:
            doc = MeetingDocument.objects.get(origin_id=origin_id)
            if not self.options['full_update'] and doc.last_modified_time >= info['last_modified']:
                if self.verbosity >= 2:
                    self.logger.info("Up-to-date document %s (last modified %s)" % (origin_id, info['last_modified']))
                return
            else:
                print "Re-importing document %s" % origin_id
        except MeetingDocument.DoesNotExist:
            print "Adding new document %s" % origin_id
            doc = MeetingDocument(origin_id=origin_id)

        d = [int(x) for x in info['date'].split('-')]
        doc_date = datetime.date(*d)

        policymaker = Policymaker.objects.get(origin_id=info['policymaker_id'])
        args = {'policymaker': policymaker, 'number': info['meeting_nr'],
                'year': doc_date.year}
        if not policymaker.abbreviation and 'policymaker_abbr' in info:
            self.logger.info("Saving abbreviation '%s' for %s" % (info['policymaker_abbr'], policymaker))
            policymaker.abbreviation = info['policymaker_abbr']
            policymaker.save()
        try:
            meeting = Meeting.objects.get(**args)
        except Meeting.DoesNotExist:
            meeting = Meeting(**args)
            meeting.minutes = False
            meeting.date = info['date']
            meeting.save()

        doc.meeting = meeting
        doc.organisation = info['org']
        doc.policymaker = info['policymaker']
        doc.date = doc_date
        if str(meeting.date) != str(doc.date):
            raise Exception("Date mismatch between doc and meeting (%s vs. %s)" % (meeting.date, doc.date))
        doc.meeting_nr = info['meeting_nr']
        doc.origin_url = info['url']

        adoc = AhjoDocument(verbosity=self.verbosity, options=self.options)
        zipf = self.scanner.download_document(info)
        try:
            adoc.import_from_zip(zipf)
        except ParseError as e:
            self.logger.error("Error importing document %s" % origin_id, exc_info=e)
            self.failed_import_list.append(origin_id)
            raise

        fname = info['origin_id'] + '.xml'
        print "Storing cleaned XML to %s" % fname
        xmlf = open(os.path.join(self.xml_path, fname), 'w')
        doc.type = adoc.type
        if doc.type == 'agenda':
            assert info['doc_type'] == 'agenda'
        elif doc.type == 'minutes':
            assert info['doc_type'] == 'minutes'
        adoc.output_cleaned_xml(xmlf)
        xmlf.close()
        doc.xml_file = os.path.join(settings.AHJO_PATHS['xml'], fname)
        doc.publish_time = adoc.publish_time
        doc.last_modified_time = info['last_modified']
        doc.save()

        if info['policymaker_id'] != adoc.policymaker_id:
            raise Exception("Policymaker id mismatch (%s vs. %s)" % (info['policymaker_id'], adoc.policymaker_id))

        if meeting.minutes and info['doc_type'] == 'agenda':
            self.logger.info("Skipping agenda doc because minutes already exists")
            return

        # Perform some sanity checks.
        existing_ais = AgendaItem.objects.filter(meeting=meeting).order_by('index')
        if existing_ais.count() > len(adoc.items):
            self.logger.warning("More agenda items in DB (%d) than in document (%d)" % (existing_ais.count(), len(adoc.items)))
            existing_ais.delete()
        for idx, ai in enumerate(existing_ais):
            adi = adoc.items[idx]
            if adi['register_id'] == ai.issue.register_id and adi['number'] == ai.index:
                continue
            self.logger.warning("Issue mismatch at index %d: %s vs. %s" % (idx, adi['register_id'], ai.issue.register_id))
            AgendaItem.objects.filter(meeting=meeting, index__gte=ai.index).delete()
            break

        for issue in adoc.items:
            self.store_issue(meeting, doc, issue, adoc)

        if doc.type == 'minutes':
            meeting.minutes = True
            meeting.save()

        if not self.options['no_videos']:
            self.import_videos(meeting)

    def get_video_screenshot(self, video, video_stream):
        meeting_id = '%d-%d' % (video.meeting.number, video.meeting.year)
        path = os.path.join(self.video_path, meeting_id)
        if not os.path.exists(path):
            os.makedirs(path)
        if not video.agenda_item:
            fname = 'meeting.jpg'
            # Take screenshot at 4 minutes
            pos = 240
        else:
            fname = 'item%d-%d.jpg' % (video.agenda_item.index, video.index)
            pos = video.start_pos + video.duration / 2.0

        self.logger.debug("Fetching screenshot as %s" % fname)
        ss_img = get_video_frame(video_stream, pos)
        ss_img.save(os.path.join(path, fname))
        video.screenshot = os.path.join(settings.AHJO_PATHS['video'], meeting_id, fname)

    def download_video(self, url):
        fname = url.split('/')[-1]
        path = os.path.join(self.video_path, fname)
        if not os.path.exists(path):
            self.logger.debug("Downloading video at %s" % url)
            download_file(url, path)
        return path

    def import_videos(self, meeting):
        # Only Kaupunginvaltuusto supported for now.
        if meeting.policymaker.origin_id != '02900':
            return
        self.logger.debug("Checking for videos for %s" % meeting)
        meeting_info = {'year': meeting.year, 'nr': meeting.number}
        video_info = get_videos_for_meeting(meeting_info)
        if not video_info:
            return
        try:
            video = Video.objects.get(meeting=meeting, agenda_item=None)
        except Video.DoesNotExist:
            video = Video(meeting=meeting, agenda_item=None)
        video.start_pos = 0
        video.speaker = None
        video.index = 0
        video.url = video_info['video']['http_url']

        video_fname = self.download_video(video.url)
        video_stream = open_video(video_fname)
        video.duration = video_stream.duration
        self.get_video_screenshot(video, video_stream)
        video.save()
        ai_list = AgendaItem.objects.filter(meeting=meeting).order_by('index')
        if self.verbosity >= 2:
            # DEBUG
            print "Video"
            titles = ["%s. %s" % (i['id'], i['title']) for i in video_info['issues']]
            for t in titles: print "\t" + t

            print "Ahjo"
            titles = ["%s. %s" % (i.index, i.subject) for i in ai_list]
            for t in titles: print "\t" + t

        for idx, issue in enumerate(video_info['issues']):
            agenda_index = issue['id']
            # Skip subsections (like question hour)
            if '.' in agenda_index:
                #agenda_index = agenda_index.split('.')[0]
                continue
            agenda_index = int(agenda_index)
            for ai in ai_list:
                if ai.index == agenda_index:
                    break
            else:
                self.logger.info(u"No agenda item found for issue: %s" % issue['title'])
                continue
            title = issue['title'].strip()
            # Remove leading 'Stj / '
            re.sub(r'^[\w]{2,4} / ?', '', title)

            if ai.subject != title:
                db_subj = ai.subject
                if len(title) > 100 and len(db_subj) != len(title):
                    min_len = min(len(db_subj), len(title))
                    title = title[0:min_len]
                    db_subj = db_subj[0:min_len]
                # Attempt a fuzzy match
                matcher = difflib.SequenceMatcher(None, db_subj, title)
                if matcher.ratio() < 0.90:
                    self.logger.error(u"Mismatch between titles: '%s' vs. '%s'" % (ai.subject, title))
                    raise Exception("Title mismatch")
            vid_list = [{'start_pos': issue['video_position'], 'speaker': None, 'party': None}]
            for statement in issue['statements']:
                vid = {'start_pos': statement['video_position'], 'duration': statement['duration']}
                vid['speaker'] = statement['participant']['name']
                vid['party'] = statement['participant']['party']
                vid_list.append(vid)
            for idx, vid_info in enumerate(vid_list):
                args = dict(meeting=meeting, agenda_item=ai, index=idx)
                try:
                    video = Video.objects.get(**args)
                except Video.DoesNotExist:
                    video = Video(**args)
                video.url = video.url
                video.speaker = vid_info['speaker']
                video.start_pos = vid_info['start_pos']
                video.party = vid_info['party']
                video.url = video_info['video']['http_url']
                if 'duration' in vid_info:
                    video.duration = vid_info['duration']
                else:
                    if idx < len(vid_list) - 1:
                        video.duration = vid_list[idx+1]['start_pos'] - video.start_pos
                    else:
                        video.duration = 0
                self.get_video_screenshot(video, video_stream)
                video.save()

    def import_categories(self):
        if Category.objects.count():
            return
        f = open(os.path.join(self.data_path, 'categories.csv'), 'r')
        reader = csv.reader(f)
        for row in reader:
            (cat_id, cat_name) = row
            classes = cat_id.split(' ')
            if len(classes) == 1:
                parent = None
            else:
                parent_id = ' '.join(classes[0:-1])
                parent = Category.objects.get(origin_id=parent_id)
            defaults = {'parent': parent, 'name': cat_name}
            cat, c = Category.objects.get_or_create(origin_id=cat_id, defaults=defaults)
            print "%-15s %s" % (cat_id, cat_name)

    def _import_pm_desc(self):
        f = open(os.path.join(self.data_path, 'policymaker.txt'), 'r')
        desc = {}
        active = None
        for l in f.readlines():
            l = l.decode('utf8')
            if l[0] == '[':
                l = l.strip('[]\n')
                desc[l] = []
                active = desc[l]
            else:
                active.append(l.strip())
        for name, lines in desc.items():
            content = '\n'.join(lines).strip()
            if not content:
                del desc[name]
                continue
            content = markdown.markdown(content)
            desc[name] = content
        return desc

    def import_policymakers(self):
        ORG_TYPES = {
            1: 'Valtuusto',
            10: 'Esittelijä',
            11: 'Esittelijä_toimiala',
            12: 'Viranhaltija',
            13: 'Kaupunki',
            2: 'Hallitus',
            3: 'Johtajisto',
            4: 'Jaosto',
            5: 'Lautakunta',
            6: 'Yleinen',
            7: 'Toimiala',
            8: 'Virasto',
            9: 'Osasto',
        }

        if not self.options['force_policymakers'] and Policymaker.objects.count():
            return

        desc = self._import_pm_desc()

        f = open(os.path.join(self.data_path, 'organisaatiokoodit.csv'), 'r')
        reader = csv.reader(f)
        # skip header
        reader.next()
        for row in reader:
            (org_id, org_name, org_name_swe, org_type) = row
            if len(org_id) == 3:
                org_id = '00' + org_id
            elif len(org_id) == 4:
                org_id = '0' + org_id
            org_type = int(org_type)
            # Only choose the political policymakers
            if org_type not in (1, 2, 3, 4, 5):
                continue
            org_name = org_name.decode('utf8')
            defaults = {'name': org_name}
            pm, c = Policymaker.objects.get_or_create(origin_id=org_id, defaults=defaults)
            if org_name in desc:
                pm.summary = desc[org_name]
                pm.save()
            print "%10s %55s %15s" % (org_id, org_name, ORG_TYPES[int(org_type)])

    def handle(self, **options):
        self.verbosity = int(options['verbosity'])
        self.logger = logging.getLogger(__name__)
        self.options = options
        self.data_path = os.path.join(settings.PROJECT_ROOT, 'data')
        self.geocoder = AhjoGeocoder()

        self.import_policymakers()
        self.import_categories()
        self.scanner = AhjoScanner(verbosity=self.verbosity)
        doc_list = self.scanner.scan_documents(cached=options['cached'])
        media_dir = settings.MEDIA_ROOT
        self.scanner.doc_store_path = os.path.join(media_dir, settings.AHJO_PATHS['zip'])
        self.xml_path = os.path.join(media_dir, settings.AHJO_PATHS['xml'])
        self.attachment_path = os.path.join(media_dir, settings.AHJO_PATHS['attachment'])
        self.video_path = os.path.join(media_dir, settings.AHJO_PATHS['video'])
        for path in (self.xml_path, self.attachment_path, self.video_path):
            if not os.path.exists(path):
                os.makedirs(path)

        plan_path = os.path.join(self.data_path, 'plans')
        if os.path.isdir(plan_path) and not options['no_geocoding']:
            self.geocoder.load_plans(os.path.join(plan_path, 'Kaava_Vireilla.tab'))
            self.geocoder.load_plans(os.path.join(plan_path, 'Kaava_Voimassa.tab'))
            self.geocode_plans = True
        else:
            print "Plan database not found; plan geocoding not available."
            self.geocode_plans = False

        property_path = os.path.join(self.data_path, 'properties')
        if os.path.isdir(property_path) and not options['no_geocoding']:
            self.geocoder.load_plan_units(os.path.join(property_path, 'Kaava_kaavayksikko_Voimassa.tab'))
            self.geocoder.load_properties(os.path.join(property_path, 'GISestx.csv'))
            self.geocode_plan_units = True
        else:
            print "Plan unit database not found; plan unit geocoding not available."
            self.geocode_plan_units = False

        addr_fname = os.path.join(self.data_path, 'pks_osoite.csv')
        if os.path.isfile(addr_fname) and not options['no_geocoding']:
            addr_f = open(addr_fname, 'r')
            self.geocoder.load_address_database(addr_f)
            addr_f.close()
            self.geocode_addresses = True
        else:
            print "Address database not found; address geocoding not available."
            self.geocode_addresses = False

        for info in doc_list:
            if options['meeting_id']:
                if info['origin_id'] == options['meeting_id']:
                    self.import_doc(info)
                    break
                else:
                    continue

            if options['start_from']:
                if options['start_from'] == info['origin_id']:
                    options['start_from'] = ''
                else:
                    continue

            if options['policymaker_id'] and info['policymaker_id'] != options['policymaker_id']:
                continue
            self.import_doc(info)
        else:
            if options['meeting_id']:
                print "No meeting document with id '%s' found" % options['meeting_id']
                exit(1)

        if self.geocoder.no_match_addresses:
            s = u"No coordinate match found for addresses:\n"
            for adr in set(self.geocoder.no_match_addresses):
                s += adr.decode('utf8') + '\n'
            self.logger.info(s)
        if self.geocoder.no_match_plans:
            print "No coordinate match found for plans:"
            for plan in self.geocoder.no_match_plans:
                print plan
        if self.failed_import_list:
            print "Importing failed for following documents:"
            for doc in self.failed_import_list:
                print doc
Example #4
0
    def handle(self, **options):
        self.verbosity = int(options['verbosity'])
        self.logger = logging.getLogger(__name__)
        self.options = options
        self.data_path = os.path.join(settings.PROJECT_ROOT, 'data')
        self.geocoder = AhjoGeocoder()

        self.import_policymakers()
        self.import_categories()
        self.scanner = AhjoScanner(verbosity=self.verbosity)
        doc_list = self.scanner.scan_documents(cached=options['cached'])
        media_dir = settings.MEDIA_ROOT
        self.scanner.doc_store_path = os.path.join(media_dir,
                                                   settings.AHJO_PATHS['zip'])
        self.xml_path = os.path.join(media_dir, settings.AHJO_PATHS['xml'])
        self.attachment_path = os.path.join(media_dir,
                                            settings.AHJO_PATHS['attachment'])
        self.video_path = os.path.join(media_dir, settings.AHJO_PATHS['video'])
        for path in (self.xml_path, self.attachment_path, self.video_path):
            if not os.path.exists(path):
                os.makedirs(path)

        plan_path = os.path.join(self.data_path, 'plans')
        if os.path.isdir(plan_path) and not options['no_geocoding']:
            self.geocoder.load_plans(os.path.join(plan_path,
                                                  'Kaava_Vireilla.tab'),
                                     in_effect=False)
            self.geocoder.load_plans(os.path.join(plan_path,
                                                  'Kaava_Voimassa.tab'),
                                     in_effect=True)
            self.geocode_plans = True
        else:
            print "Plan database not found; plan geocoding not available."
            self.geocode_plans = False

        property_path = os.path.join(self.data_path, 'properties')
        if os.path.isdir(property_path) and not options['no_geocoding']:
            self.geocoder.load_plan_units(
                os.path.join(property_path, 'Kaava_kaavayksikko_Voimassa.tab'))
            self.geocoder.load_properties(
                os.path.join(property_path, 'kiinteistoalueet.tab'))
            self.geocode_plan_units = True
        else:
            print "Plan unit database not found; plan unit geocoding not available."
            self.geocode_plan_units = False

        addr_fname = os.path.join(self.data_path, 'pks_osoite.csv')
        if os.path.isfile(addr_fname) and not options['no_geocoding']:
            addr_f = open(addr_fname, 'r')
            self.geocoder.load_address_database(addr_f)
            addr_f.close()
            self.geocode_addresses = True
        else:
            print "Address database not found; address geocoding not available."
            self.geocode_addresses = False

        for info in doc_list:
            if options['meeting_id']:
                if info['origin_id'] == options['meeting_id']:
                    self.import_doc(info)
                    break
                else:
                    continue

            if options['start_from']:
                if options['start_from'] == info['origin_id']:
                    options['start_from'] = ''
                else:
                    continue

            #if not 'VH' in info['policymaker_id']:
            #    continue

            if options['policymaker_id'] and \
               info['policymaker_id'].lower() != options['policymaker_id'].lower():
                continue
            self.import_doc(info)
        else:
            if options['meeting_id']:
                print "No meeting document with id '%s' found" % options[
                    'meeting_id']
                exit(1)

        if self.geocoder.no_match_addresses:
            s = u"No coordinate match found for addresses:\n"
            for adr in set(self.geocoder.no_match_addresses):
                s += adr.decode('utf8') + '\n'
            self.logger.info(s)
        if self.geocoder.no_match_plans:
            print "No coordinate match found for plans:"
            for plan in self.geocoder.no_match_plans:
                print plan
        if self.failed_import_list:
            print "Importing failed for following documents:"
            for doc in self.failed_import_list:
                print doc
Example #5
0
class Command(BaseCommand):
    help = "Import OpenAHJO documents"
    option_list = BaseCommand.option_list + (
        make_option('--cached',
                    dest='cached',
                    action='store_true',
                    help='cache HTTP requests'),
        make_option('--meeting-id',
                    dest='meeting_id',
                    action='store',
                    help='import one meeting'),
        make_option('--start-from',
                    dest='start_from',
                    action='store',
                    help='start from provided meeting'),
        make_option('--policymaker-id',
                    dest='policymaker_id',
                    action='store',
                    help='process only provided policymaker'),
        make_option(
            '--full-update',
            dest='full_update',
            action='store_true',
            help='perform full update (i.e. replace existing elements)'),
        make_option('--skip-existing-attachments',
                    dest='skip_existing_attachments',
                    action='store_true',
                    help='do not process existing document attachments'),
        make_option('--no-videos',
                    dest='no_videos',
                    action='store_true',
                    help='do not import meeting videos'),
        make_option('--no-geocoding',
                    dest='no_geocoding',
                    action='store_true',
                    help='do not perform geocoding'),
        make_option('--force-policymakers',
                    dest='force_policymakers',
                    action='store_true',
                    help='force importing of policymakers'),
        make_option('--ignore-attachment-size',
                    dest='ignore-attachment-size',
                    action='store_true',
                    help='disable attachment size checks'))

    def __init__(self):
        self.failed_import_list = []
        return super(Command, self).__init__()

    def geocode_issue(self, issue, info):
        # Attempt to geocode first from subject and keywords.
        # If no matches are found, attempt to geocode from content text.
        text_list = []
        text_list.append(info['subject'])
        for kw in info['keywords']:
            text_list.append(kw)
        geom_list = self.geocoder.geocode_from_text_list(text_list)
        matched_texts = set()

        districts = {}
        for g in geom_list:
            matched_texts.add(g['text'])
            del g['text']
            args = dict(type=g['type'], name=g['name'])
            try:
                igeom = IssueGeometry.objects.get(**args)
            except IssueGeometry.DoesNotExist:
                args['geometry'] = g['geometry']
                igeom = IssueGeometry(**args)
                igeom.save()
            issue.geometries.add(igeom)
            # Assume geometry doesn't change.
            #igeom.geometry = g['geometry']
            #igeom.save()
            if igeom.type == 'district':
                continue
            # workaround for invalid plan geometry
            if g['type'] == 'plan' and g['name'] == '12079':
                continue
            d_list = District.objects.filter(borders__contains=igeom.geometry)
            for d in d_list:
                districts[d.pk] = d

        issue.districts = districts.values()
        return matched_texts

    def store_keywords(self, issue, text_list):
        for kw in text_list:
            if kw in ['Valtuustoaloite', 'Toivomusponnet']:
                kw = kw.lower()
            keyword, _ = IssueKeyword.objects.get_or_create(name=kw)
            issue.keywords.add(keyword)

    def store_issue(self, meeting, meeting_doc, info, adoc):
        try:
            agenda_item = AgendaItem.objects.get(index=info['number'],
                                                 meeting=meeting)
        except AgendaItem.DoesNotExist:
            agenda_item = AgendaItem(index=info['number'], meeting=meeting)
        agenda_item.subject = info['subject']
        agenda_item.from_minutes = meeting_doc.type == 'minutes'
        agenda_item.origin_last_modified_time = meeting_doc.last_modified_time
        agenda_item.resolution = info.get('resolution')
        agenda_item.preparer = info.get('preparer')
        agenda_item.introducer = info.get('introducer')
        agenda_item.classification_code = info.get('classification_code')
        agenda_item.classification_description = info.get(
            'classification_description')
        agenda_item.issue = None
        agenda_item.save()

        for idx, p in enumerate(info['content']):
            args = {'agenda_item': agenda_item, 'index': idx}
            try:
                section = ContentSection.objects.get(**args)
            except ContentSection.DoesNotExist:
                section = ContentSection(**args)
            section.type = p[0]
            section.text = '\n'.join(p[1])
            section.save()

        att_list = Attachment.objects.filter(agenda_item=agenda_item)
        if att_list.count(
        ) == 0 or not self.options['skip_existing_attachments']:
            for att in info['attachments']:
                for obj in att_list:
                    if obj.number == att['number']:
                        obj._found = True
                        break
                else:
                    obj = Attachment(agenda_item=agenda_item,
                                     number=att['number'])
                    obj._found = True

                if not att['public']:
                    obj.public = False
                    obj.confidentiality_reason = att.get(
                        'confidentiality_reason', None)
                    obj.file = None
                    obj.hash = None
                    obj.save()
                    continue
                adoc.extract_zip_attachment(att, self.attachment_path)
                obj.public = True
                obj.file = os.path.join(settings.AHJO_PATHS['attachment'],
                                        att['file'])
                obj.file_type = att['type']
                obj.hash = att['hash']
                obj.name = att['name']
                obj.save()

            for obj in att_list:
                if not getattr(obj, '_found', False):
                    self.logger.info("Deleting attachment %s" % obj)
                    obj.delete()

        if not info['register_id']:
            return

        try:
            issue = Issue.objects.get(register_id=info['register_id'])
        except Issue.DoesNotExist:
            issue = Issue(register_id=info['register_id'])

        if not issue.subject:
            issue.subject = info['subject']
        else:
            issue.subject = issue.determine_subject()

        s = info['category']
        m = re.match(r"[\d\s]+", s)
        cat_id = s[0:m.end()].strip()
        category = Category.objects.get(origin_id=cat_id)
        issue.category = category
        issue.reference_text = info.get('reference_text')
        issue.save()

        if agenda_item.issue != issue:
            agenda_item.issue = issue
            agenda_item.save(update_fields=['issue'])

        geo_matches = self.geocode_issue(issue, info)
        text_list = [i for i in info['keywords'] if i not in geo_matches]
        self.store_keywords(issue, text_list)

        latest_date = issue.determine_latest_decision_date()
        if latest_date != issue.latest_decision_date:
            issue.latest_decision_date = latest_date
            issue.save(update_fields=['latest_decision_date'])

    @transaction.commit_on_success
    def import_doc(self, info):
        origin_id = info['origin_id']
        try:
            doc = MeetingDocument.objects.get(origin_id=origin_id)
            if not self.options[
                    'full_update'] and doc.last_modified_time >= info[
                        'last_modified']:
                if self.verbosity >= 2:
                    self.logger.info(
                        "Up-to-date document %s (last modified %s)" %
                        (origin_id, info['last_modified']))
                return
            else:
                print "Re-importing document %s" % origin_id
        except MeetingDocument.DoesNotExist:
            print "Adding new document %s" % origin_id
            doc = MeetingDocument(origin_id=origin_id)

        d = [int(x) for x in info['date'].split('-')]
        doc_date = datetime.date(*d)

        try:
            policymaker = Policymaker.objects.get(
                origin_id=info['policymaker_id'])
        except Policymaker.DoesNotExist:
            org = Organization.objects.get(origin_id=info['policymaker_id'])
            print "Creating new policymaker for %s" % org
            args = {
                'name': org.name_fi,
                'abbreviation': org.abbreviation,
                'type': org.type,
                'origin_id': info['policymaker_id']
            }
            policymaker = Policymaker(**args)
            policymaker.slug = org.slug
            policymaker.save()
            org.policymaker = policymaker
            org.save(update_fields=['policymaker'])

        if not policymaker.abbreviation and 'policymaker_abbr' in info:
            self.logger.info("Saving abbreviation '%s' for %s" %
                             (info['policymaker_abbr'], policymaker))
            policymaker.abbreviation = info['policymaker_abbr']
            policymaker.save()

        args = {
            'policymaker': policymaker,
            'number': info['meeting_nr'],
            'year': doc_date.year
        }
        try:
            meeting = Meeting.objects.get(**args)
        except Meeting.DoesNotExist:
            meeting = Meeting(**args)
            meeting.minutes = False
            meeting.date = info['date']
            meeting.save()

        doc.meeting = meeting
        doc.organisation = info['org']
        doc.policymaker = info['policymaker']
        doc.date = doc_date
        if str(meeting.date) != str(doc.date):
            # If the new meeting date comes from a document with the latest modification
            # time, assume the earlier meeting date is incorrect. Otherwise, bail out.
            latest_doc = meeting.meetingdocument_set.order_by(
                '-last_modified_time')[0]
            if info['last_modified'] > latest_doc.last_modified_time:
                self.logger.warning(
                    "Fixing date mismatch between doc and meeting (%s vs. %s)"
                    % (meeting.date, doc.date))
                meeting.date = doc.date
                meeting.save(update_fields=['date'])
            else:
                raise Exception(
                    "Date mismatch between doc and meeting (%s vs. %s)" %
                    (meeting.date, doc.date))
        doc.meeting_nr = info['meeting_nr']
        doc.origin_url = info['url']

        adoc = AhjoDocument(verbosity=self.verbosity, options=self.options)
        zipf = self.scanner.download_document(info)
        try:
            adoc.import_from_zip(zipf)
        except ParseError as e:
            self.logger.error("Error importing document %s" % origin_id,
                              exc_info=e)
            self.failed_import_list.append(origin_id)
            raise

        fname = info['origin_id'] + '.xml'
        print "Storing cleaned XML to %s" % fname
        xmlf = open(os.path.join(self.xml_path, fname), 'w')
        doc.type = adoc.type
        if doc.type == 'agenda':
            assert info['doc_type'] == 'agenda'
        elif doc.type == 'minutes':
            assert info['doc_type'] == 'minutes'
        adoc.output_cleaned_xml(xmlf)
        xmlf.close()
        doc.xml_file = os.path.join(settings.AHJO_PATHS['xml'], fname)
        doc.publish_time = adoc.publish_time
        doc.last_modified_time = info['last_modified']
        doc.save()

        if info['policymaker_id'] != adoc.policymaker_id:
            raise Exception("Policymaker id mismatch (%s vs. %s)" %
                            (info['policymaker_id'], adoc.policymaker_id))

        if meeting.minutes and info['doc_type'] == 'agenda':
            self.logger.info(
                "Skipping agenda doc because minutes already exists")
            return

        # Perform some sanity checks.
        existing_ais = AgendaItem.objects.filter(
            meeting=meeting).order_by('index')
        if existing_ais.count() > len(adoc.items):
            self.logger.warning(
                "More agenda items in DB (%d) than in document (%d)" %
                (existing_ais.count(), len(adoc.items)))
            existing_ais.delete()

        register_ids = set()
        for adi in adoc.items:
            register_id = adi.get('register_id', None)
            if register_id is None:
                continue
            if register_id in register_ids:
                self.logger.warning(
                    "Issue %s listed more than twice in a meeting" %
                    register_id)
            else:
                register_ids.add(register_id)

        for ai in existing_ais:
            for adi in adoc.items:
                if adi['number'] == ai.index:
                    break
            else:
                self.logger.warning(
                    "Agenda item %s not found in incoming items" % ai)
                ai.should_delete = True

            if ai.issue is not None:
                obj_register_id = ai.issue.register_id
            else:
                obj_register_id = None
            if adi.get('register_id', None) != obj_register_id:
                self.logger.warning(
                    "Issue mismatch at index %d: %s vs. %s" %
                    (ai.index, adi['register_id'], obj_register_id))
                AgendaItem.objects.filter(meeting=meeting,
                                          index__gte=ai.index).delete()
                break

        for ai in existing_ais:
            if getattr(ai, 'should_delete', False):
                self.logger.warning("Deleting stale agenda item %s" % ai)
                ai.delete()

        for issue in adoc.items:
            self.store_issue(meeting, doc, issue, adoc)

        if doc.type == 'minutes':
            meeting.minutes = True
            meeting.save()

        if not self.options['no_videos']:
            self.import_videos(meeting)

    def get_video_screenshot(self, video, video_file):
        meeting_id = '%d-%d' % (video.meeting.number, video.meeting.year)
        path = os.path.join(self.video_path, meeting_id)
        if not os.path.exists(path):
            os.makedirs(path)
        if not video.agenda_item:
            fname = 'meeting.jpg'
            # Take screenshot at 4 minutes
            pos = 240
        else:
            fname = 'item%d-%d.jpg' % (video.agenda_item.index, video.index)
            pos = video.start_pos + video.duration / 2.0

        self.logger.debug("Fetching screenshot as %s" % fname)
        video_file.take_screenshot(pos, os.path.join(path, fname))
        video.screenshot = os.path.join(settings.AHJO_PATHS['video'],
                                        meeting_id, fname)

    def download_video(self, url):
        fname = url.split('/')[-1]
        path = os.path.join(self.video_path, fname)
        if not os.path.exists(path):
            self.logger.debug("Downloading video at %s" % url)
            download_file(url, path)
        return path

    def import_videos(self, meeting):
        # Only Kaupunginvaltuusto supported for now.
        if meeting.policymaker.origin_id != '02900':
            return
        # FIXME: Broken in API
        if meeting.year == 2014 and meeting.number == 3:
            return
        if meeting.year == 2015 and meeting.number == 6:
            return
        if meeting.year == 2015 and meeting.number == 10:
            return

        self.logger.debug("Checking for videos for %s" % meeting)
        meeting_info = {'year': meeting.year, 'nr': meeting.number}
        video_info = get_videos_for_meeting(meeting_info)
        if not video_info:
            return
        try:
            video = Video.objects.get(meeting=meeting, agenda_item=None)
        except Video.DoesNotExist:
            video = Video(meeting=meeting, agenda_item=None)
        video.start_pos = 0
        video.speaker = None
        video.index = 0
        video.url = video_info['video']['http_url']

        video_fname = self.download_video(video.url)
        video_file = VideoFile(video_fname)
        video.duration = video_file.get_duration()
        self.get_video_screenshot(video, video_file)
        video.save()
        ai_list = AgendaItem.objects.filter(meeting=meeting).order_by('index')
        if self.verbosity >= 2:
            # DEBUG
            print "Video"
            titles = [
                "%s. %s" % (i['id'], i['title']) for i in video_info['issues']
            ]
            for t in titles:
                print "\t" + t

            print "Ahjo"
            titles = ["%s. %s" % (i.index, i.subject) for i in ai_list]
            for t in titles:
                print "\t" + t

        for idx, issue in enumerate(video_info['issues']):
            agenda_index = issue['id']
            # Skip subsections (like question hour)
            if '.' in agenda_index:
                #agenda_index = agenda_index.split('.')[0]
                continue
            agenda_index = int(agenda_index)
            for ai in ai_list:
                if ai.index == agenda_index:
                    break
            else:
                self.logger.info(u"No agenda item found for issue: %s" %
                                 issue['title'])
                continue
            title = issue['title'].strip()
            # Remove leading 'Stj / '
            re.sub(r'^[\w]{2,4} / ?', '', title)

            if ai.subject != title:
                db_subj = ai.subject
                if len(title) > 100 and len(db_subj) != len(title):
                    min_len = min(len(db_subj), len(title))
                    title = title[0:min_len]
                    db_subj = db_subj[0:min_len]
                # Attempt a fuzzy match
                matcher = difflib.SequenceMatcher(None, db_subj, title)
                if matcher.ratio() < 0.90:
                    self.logger.error(
                        u"Mismatch between titles: '%s' vs. '%s'" %
                        (ai.subject, title))
                    raise Exception("Title mismatch")
            vid_list = [{
                'start_pos': issue['video_position'],
                'speaker': None,
                'party': None
            }]
            for statement in issue['statements']:
                vid = {
                    'start_pos': statement['video_position'],
                    'duration': statement['duration']
                }
                vid['speaker'] = statement['participant']['name']
                vid['party'] = statement['participant']['party']
                vid_list.append(vid)
            for idx, vid_info in enumerate(vid_list):
                args = dict(meeting=meeting, agenda_item=ai, index=idx)
                try:
                    video = Video.objects.get(**args)
                except Video.DoesNotExist:
                    video = Video(**args)
                video.url = video.url
                video.speaker = vid_info['speaker']
                video.start_pos = vid_info['start_pos']
                video.party = vid_info['party']
                video.url = video_info['video']['http_url']
                if 'duration' in vid_info:
                    video.duration = vid_info['duration']
                else:
                    if idx < len(vid_list) - 1:
                        video.duration = vid_list[
                            idx + 1]['start_pos'] - video.start_pos
                    else:
                        video.duration = 0
                self.get_video_screenshot(video, video_file)
                video.save()

    def import_categories(self):
        if Category.objects.count():
            return
        f = open(os.path.join(self.data_path, 'categories.csv'), 'r')
        reader = csv.reader(f)
        for row in reader:
            (cat_id, cat_name) = row
            classes = cat_id.split(' ')
            if len(classes) == 1:
                parent = None
            else:
                parent_id = ' '.join(classes[0:-1])
                parent = Category.objects.get(origin_id=parent_id)
            defaults = {'parent': parent, 'name': cat_name}
            cat, c = Category.objects.get_or_create(origin_id=cat_id,
                                                    defaults=defaults)
            print "%-15s %s" % (cat_id, cat_name)

    def _import_pm_desc(self):
        f = open(os.path.join(self.data_path, 'policymaker.txt'), 'r')
        desc = {}
        active = None
        for l in f.readlines():
            l = l.decode('utf8')
            if l[0] == '[':
                l = l.strip('[]\n')
                desc[l] = []
                active = desc[l]
            else:
                active.append(l.strip())
        for name, lines in desc.items():
            content = '\n'.join(lines).strip()
            if not content:
                del desc[name]
                continue
            content = markdown.markdown(content)
            desc[name] = content
        return desc

    def import_policymakers(self):
        ORG_TYPES = {
            1: 'Valtuusto',
            10: 'Esittelijä',
            11: 'Esittelijä_toimiala',
            12: 'Viranhaltija',
            13: 'Kaupunki',
            2: 'Hallitus',
            3: 'Johtajisto',
            4: 'Jaosto',
            5: 'Lautakunta',
            6: 'Yleinen',
            7: 'Toimiala',
            8: 'Virasto',
            9: 'Osasto',
        }

        if not self.options[
                'force_policymakers'] and Policymaker.objects.count():
            return

        desc = self._import_pm_desc()

        f = open(os.path.join(self.data_path, 'organisaatiokoodit.csv'), 'r')
        reader = csv.reader(f)
        # skip header
        reader.next()
        for row in reader:
            (org_id, org_name, org_name_swe, org_type) = row
            if len(org_id) == 3:
                org_id = '00' + org_id
            elif len(org_id) == 4:
                org_id = '0' + org_id
            org_type = int(org_type)
            # Only choose the political policymakers
            if org_type not in (1, 2, 3, 4, 5):
                continue
            org_name = org_name.decode('utf8')
            defaults = {'name': org_name}
            pm, c = Policymaker.objects.get_or_create(origin_id=org_id,
                                                      defaults=defaults)
            if org_name in desc:
                pm.summary = desc[org_name]
                pm.save()
            print "%10s %55s %15s" % (org_id, org_name,
                                      ORG_TYPES[int(org_type)])

    def handle(self, **options):
        self.verbosity = int(options['verbosity'])
        self.logger = logging.getLogger(__name__)
        self.options = options
        self.data_path = os.path.join(settings.PROJECT_ROOT, 'data')
        self.geocoder = AhjoGeocoder()

        self.import_policymakers()
        self.import_categories()
        self.scanner = AhjoScanner(verbosity=self.verbosity)
        doc_list = self.scanner.scan_documents(cached=options['cached'])
        media_dir = settings.MEDIA_ROOT
        self.scanner.doc_store_path = os.path.join(media_dir,
                                                   settings.AHJO_PATHS['zip'])
        self.xml_path = os.path.join(media_dir, settings.AHJO_PATHS['xml'])
        self.attachment_path = os.path.join(media_dir,
                                            settings.AHJO_PATHS['attachment'])
        self.video_path = os.path.join(media_dir, settings.AHJO_PATHS['video'])
        for path in (self.xml_path, self.attachment_path, self.video_path):
            if not os.path.exists(path):
                os.makedirs(path)

        plan_path = os.path.join(self.data_path, 'plans')
        if os.path.isdir(plan_path) and not options['no_geocoding']:
            self.geocoder.load_plans(os.path.join(plan_path,
                                                  'Kaava_Vireilla.tab'),
                                     in_effect=False)
            self.geocoder.load_plans(os.path.join(plan_path,
                                                  'Kaava_Voimassa.tab'),
                                     in_effect=True)
            self.geocode_plans = True
        else:
            print "Plan database not found; plan geocoding not available."
            self.geocode_plans = False

        property_path = os.path.join(self.data_path, 'properties')
        if os.path.isdir(property_path) and not options['no_geocoding']:
            self.geocoder.load_plan_units(
                os.path.join(property_path, 'Kaava_kaavayksikko_Voimassa.tab'))
            self.geocoder.load_properties(
                os.path.join(property_path, 'kiinteistoalueet.tab'))
            self.geocode_plan_units = True
        else:
            print "Plan unit database not found; plan unit geocoding not available."
            self.geocode_plan_units = False

        addr_fname = os.path.join(self.data_path, 'pks_osoite.csv')
        if os.path.isfile(addr_fname) and not options['no_geocoding']:
            addr_f = open(addr_fname, 'r')
            self.geocoder.load_address_database(addr_f)
            addr_f.close()
            self.geocode_addresses = True
        else:
            print "Address database not found; address geocoding not available."
            self.geocode_addresses = False

        for info in doc_list:
            if options['meeting_id']:
                if info['origin_id'] == options['meeting_id']:
                    self.import_doc(info)
                    break
                else:
                    continue

            if options['start_from']:
                if options['start_from'] == info['origin_id']:
                    options['start_from'] = ''
                else:
                    continue

            #if not 'VH' in info['policymaker_id']:
            #    continue

            if options['policymaker_id'] and \
               info['policymaker_id'].lower() != options['policymaker_id'].lower():
                continue
            self.import_doc(info)
        else:
            if options['meeting_id']:
                print "No meeting document with id '%s' found" % options[
                    'meeting_id']
                exit(1)

        if self.geocoder.no_match_addresses:
            s = u"No coordinate match found for addresses:\n"
            for adr in set(self.geocoder.no_match_addresses):
                s += adr.decode('utf8') + '\n'
            self.logger.info(s)
        if self.geocoder.no_match_plans:
            print "No coordinate match found for plans:"
            for plan in self.geocoder.no_match_plans:
                print plan
        if self.failed_import_list:
            print "Importing failed for following documents:"
            for doc in self.failed_import_list:
                print doc
Example #6
0
class Command(BaseCommand):
    help = "Import OpenAHJO documents"
    option_list = BaseCommand.option_list + (
        make_option('--cached', dest='cached', action='store_true', help='cache HTTP requests'),
        make_option('--meeting-id', dest='meeting_id', action='store', help='import one meeting'),
        make_option('--start-from', dest='start_from', action='store', help='start from provided meeting'),
        make_option('--committee-id', dest='committee_id', action='store', help='process only provided committee'),
        make_option('--full-update', dest='full_update', action='store_true', help='perform full update (i.e. replace existing elements)'),
        make_option('--no-attachments', dest='no_attachments', action='store_true', help='do not process document attachments'),
        make_option('--no-videos', dest='no_videos', action='store_true', help='do not import meeting videos'),
        make_option('--force-committees', dest='force_committees', action='store_true', help='force importing of committees'),
    )

    def __init__(self):
        self.failed_import_list = []
        return super(Command, self).__init__()

    def geocode_issue(self, issue, info):
        if not self.geocoder:
            return
        # Attempt to geocode first from subject and keywords.
        # If no matches are found, attempt to geocode from content text.
        text_list = []
        text_list.append(info['subject'])
        for kw in info['keywords']:
            text_list.append(kw)
        markers = self.geocoder.geocode_from_text_list(text_list)
        if not markers:
            pass
        if markers:
            for m in markers:
                try:
                    igeom = IssueGeometry.objects.get(issue=issue, name=m['name'])
                except IssueGeometry.DoesNotExist:
                    igeom = IssueGeometry(issue=issue, name=m['name'])
                igeom.geometry = m['location']
                igeom.save()

    def store_issue(self, meeting, meeting_doc, info, adoc):
        try:
            issue = Issue.objects.get(register_id=info['register_id'])
        except Issue.DoesNotExist:
            issue = Issue(register_id=info['register_id'])

        issue.subject = info['subject']
        print issue.subject
        s = info['category']
        m = re.match(r"[\d\s]+", s)
        cat_id = s[0:m.end()].strip()
        category = Category.objects.get(origin_id=cat_id)
        issue.category = category
        issue.save()

        self.geocode_issue(issue, info)

        try:
            agenda_item = AgendaItem.objects.get(issue=issue, meeting=meeting)
        except AgendaItem.DoesNotExist:
            agenda_item = AgendaItem(issue=issue, meeting=meeting)
        agenda_item.subject = info['subject']
        agenda_item.index = info['number']
        agenda_item.from_minutes = meeting_doc.type == 'minutes'
        agenda_item.last_modified_time = meeting_doc.last_modified_time
        agenda_item.save()

        for idx, p in enumerate(info['content']):
            args = {'agenda_item': agenda_item, 'index': idx}
            try:
                section = ContentSection.objects.get(**args)
            except ContentSection.DoesNotExist:
                section = ContentSection(**args)
            section.type = p[0]
            section.text = '\n'.join(p[1])
            section.save()

        if self.options['no_attachments']:
            return
        for att in info['attachments']:
            args = {'agenda_item': agenda_item, 'number': att['number']}
            try:
                obj = Attachment.objects.get(**args)
            except Attachment.DoesNotExist:
                obj = Attachment(**args)
            if not att['public']:
                obj.public = False
                obj.file = None
                obj.hash = None
                obj.save()
                continue
            adoc.extract_zip_attachment(att, self.attachment_path)
            obj.public = True
            obj.file = os.path.join(settings.AHJO_PATHS['attachment'], att['file'])
            obj.file_type = att['type']
            obj.hash = att['hash']
            obj.name = att['name']
            obj.save()

    def import_doc(self, info):
        origin_id = info['origin_id']
        try:
            doc = MeetingDocument.objects.get(origin_id=origin_id)
            if not self.options['full_update'] and doc.last_modified_time >= info['last_modified']:
                self.logger.info("Up-to-date document %s (last modified %s)" % (origin_id, info['last_modified']))
                return
        except MeetingDocument.DoesNotExist:
            print "Adding new document %s" % origin_id
            doc = MeetingDocument(origin_id=origin_id)

        d = [int(x) for x in info['date'].split('-')]
        doc_date = datetime.date(*d)

        committee = Committee.objects.get(origin_id=info['committee_id'])
        args = {'committee': committee, 'number': info['meeting_nr'],
                'year': doc_date.year}
        try:
            meeting = Meeting.objects.get(**args)
        except Meeting.DoesNotExist:
            meeting = Meeting(**args)
            meeting.minutes = False
            meeting.date = info['date']
            meeting.save()

        doc.meeting = meeting
        doc.organisation = info['org']
        doc.committee = info['committee']
        doc.date = doc_date
        if str(meeting.date) != str(doc.date):
            raise Exception("Date mismatch between doc and meeting (%s vs. %s)" % (meeting.date, doc.date))
        doc.meeting_nr = info['meeting_nr']
        doc.origin_url = info['url']

        adoc = AhjoDocument(verbosity=self.verbosity)
        zipf = self.scanner.download_document(info)
        try:
            adoc.import_from_zip(zipf)
        except ParseError as e:
            self.logger.error("Error importing document %s" % origin_id, exc_info=e)
            self.failed_import_list.append(origin_id)
            return

        fname = info['origin_id'] + '.xml'
        print "Storing cleaned XML to %s" % fname
        xmlf = open(os.path.join(self.xml_path, fname), 'w')
        doc.type = adoc.type
        if doc.type == 'agenda':
            assert info['doc_type'] == 'agenda'
        elif doc.type == 'minutes':
            assert info['doc_type'] == 'minutes'
        adoc.output_cleaned_xml(xmlf)
        xmlf.close()
        doc.xml_file = os.path.join(settings.AHJO_PATHS['xml'], fname)
        doc.publish_time = adoc.publish_time
        doc.last_modified_time = info['last_modified']
        doc.save()

        if info['committee_id'] != adoc.committee_id:
            raise Exception("Committee id mismatch (%s vs. %s)" % (info['committee_id'], adoc.committee_id))

        if meeting.minutes and info['doc_type'] == 'agenda':
            self.logger.info("Skipping agenda doc because minutes already exists")
            return

        # Perform some sanity checks.
        existing_ais = AgendaItem.objects.filter(meeting=meeting).order_by('index')
        if existing_ais.count() > len(adoc.items):
            self.logger.warning("More agenda items in DB (%d) than in document (%d)" % (existing_ais.count(), len(adoc.items)))
            existing_ais.delete()
        for idx, ai in enumerate(existing_ais):
            adi = adoc.items[idx]
            if adi['register_id'] == ai.issue.register_id and adi['number'] == ai.index:
                continue
            self.logger.warning("Issue mismatch at index %d: %s vs. %s" % (idx, adi['register_id'], ai.issue.register_id))
            AgendaItem.objects.filter(meeting=meeting, index__gte=ai.index).delete()
            break

        for issue in adoc.items:
            self.store_issue(meeting, doc, issue, adoc)

        if doc.type == 'minutes':
            meeting.minutes = True
            meeting.save()

        if not self.options['no_videos']:
            self.import_videos(meeting)

    def get_video_screenshot(self, video, video_stream):
        meeting_id = '%d-%d' % (video.meeting.number, video.meeting.year)
        path = os.path.join(self.video_path, meeting_id)
        if not os.path.exists(path):
            os.makedirs(path)
        if not video.agenda_item:
            fname = 'meeting.jpg'
            # Take screenshot at 4 minutes
            pos = 240
        else:
            fname = 'item%d-%d.jpg' % (video.agenda_item.index, video.index)
            pos = video.start_pos + video.duration / 2.0

        self.logger.debug("Fetching screenshot as %s" % fname)
        ss_img = get_video_frame(video_stream, pos)
        ss_img.save(os.path.join(path, fname))
        video.screenshot = os.path.join(settings.AHJO_PATHS['video'], meeting_id, fname)

    def download_video(self, url):
        fname = url.split('/')[-1]
        path = os.path.join(self.video_path, fname)
        if not os.path.exists(path):
            self.logger.debug("Downloading video at %s" % url)
            download_file(url, path)
        return path

    def import_videos(self, meeting):
        # Only Kaupunginvaltuusto supported for now.
        if meeting.committee.origin_id != '02900':
            return
        self.logger.debug("Checking for videos for %s" % meeting)
        meeting_info = {'year': meeting.year, 'nr': meeting.number}
        video_info = get_videos_for_meeting(meeting_info)
        if not video_info:
            return
        try:
            video = Video.objects.get(meeting=meeting, agenda_item=None)
        except Video.DoesNotExist:
            video = Video(meeting=meeting, agenda_item=None)
        video.start_pos = 0
        video.speaker = None
        video.index = 0
        video.url = video_info['video']['http_url']

        video_fname = self.download_video(video.url)
        video_stream = open_video(video_fname)
        video.duration = video_stream.duration
        self.get_video_screenshot(video, video_stream)
        video.save()
        ai_list = AgendaItem.objects.filter(meeting=meeting).order_by('index')
        if self.verbosity >= 2:
            # DEBUG
            print "Video"
            titles = ["%s. %s" % (i['id'], i['title']) for i in video_info['issues']]
            for t in titles: print "\t" + t

            print "Ahjo"
            titles = ["%s. %s" % (i.index, i.subject) for i in ai_list]
            for t in titles: print "\t" + t

        for idx, issue in enumerate(video_info['issues']):
            agenda_index = issue['id']
            # Skip subsections (like question hour)
            if '.' in agenda_index:
                #agenda_index = agenda_index.split('.')[0]
                continue
            agenda_index = int(agenda_index)
            for ai in ai_list:
                if ai.index == agenda_index:
                    break
            else:
                self.logger.info(u"No agenda item found for issue: %s" % issue['title'])
                continue
            title = issue['title'].strip()
            if ai.subject != title:
                # Attempt a fuzzy match
                matcher = difflib.SequenceMatcher(None, ai.subject, title)
                if matcher.ratio() < 0.90:
                    self.logger.error(u"Mismatch between titles: '%s' vs. '%s'" % (ai.subject, title))
                    raise Exception("Title mismatch")
            vid_list = [{'start_pos': issue['video_position'], 'speaker': None, 'party': None}]
            for statement in issue['statements']:
                vid = {'start_pos': statement['video_position'], 'duration': statement['duration']}
                vid['speaker'] = statement['participant']['name']
                vid['party'] = statement['participant']['party']
                vid_list.append(vid)
            for idx, vid_info in enumerate(vid_list):
                args = dict(meeting=meeting, agenda_item=ai, index=idx)
                try:
                    video = Video.objects.get(**args)
                except Video.DoesNotExist:
                    video = Video(**args)
                video.url = video.url
                video.speaker = vid_info['speaker']
                video.start_pos = vid_info['start_pos']
                video.party = vid_info['party']
                video.url = video_info['video']['http_url']
                if 'duration' in vid_info:
                    video.duration = vid_info['duration']
                else:
                    if idx < len(vid_list) - 1:
                        video.duration = vid_list[idx+1]['start_pos'] - video.start_pos
                    else:
                        video.duration = 0
                self.get_video_screenshot(video, video_stream)
                video.save()

    def import_categories(self):
        if Category.objects.count():
            return
        f = open(os.path.join(self.data_path, 'categories.csv'), 'r')
        reader = csv.reader(f)
        for row in reader:
            (cat_id, cat_name) = row
            classes = cat_id.split(' ')
            if len(classes) == 1:
                parent = None
            else:
                parent_id = ' '.join(classes[0:-1])
                parent = Category.objects.get(origin_id=parent_id)
            defaults = {'parent': parent, 'name': cat_name}
            cat, c = Category.objects.get_or_create(origin_id=cat_id, defaults=defaults)
            print "%-15s %s" % (cat_id, cat_name)

    def import_committees(self):
        ORG_TYPES = {
            1: 'Valtuusto',
            10: 'Esittelijä',
            11: 'Esittelijä_toimiala',
            12: 'Viranhaltija',
            13: 'Kaupunki',
            2: 'Hallitus',
            3: 'Johtajisto',
            4: 'Jaosto',
            5: 'Lautakunta',
            6: 'Yleinen',
            7: 'Toimiala',
            8: 'Virasto',
            9: 'Osasto',
        }

        if not self.options['force_committees'] and Committee.objects.count():
            return
        f = open(os.path.join(self.data_path, 'organisaatiokoodit.csv'), 'r')
        reader = csv.reader(f)
        # skip header
        reader.next()
        for row in reader:
            (org_id, org_name, org_name_swe, org_type) = row
            if len(org_id) == 3:
                org_id = '00' + org_id
            elif len(org_id) == 4:
                org_id = '0' + org_id
            org_type = int(org_type)
            # Only choose the political committees
            if org_type not in (1, 2, 3, 4, 5):
                continue
            defaults = {'name': org_name}
            comm, c = Committee.objects.get_or_create(origin_id=org_id, defaults=defaults)
            print "%10s %55s %15s" % (org_id, org_name, ORG_TYPES[int(org_type)])

    def handle(self, **options):
        self.verbosity = int(options['verbosity'])
        self.logger = logging.getLogger(__name__)
        self.options = options
        self.data_path = os.path.join(settings.PROJECT_ROOT, 'data')
        addr_fname = os.path.join(self.data_path, 'pks_osoite.csv')
        if os.path.isfile(addr_fname):
            addr_f = open(addr_fname, 'r')
            self.geocoder = AhjoGeocoder()
            self.geocoder.load_address_database(addr_f)
            addr_f.close()
        else:
            print "Address database not found; geocoder not available."
            self.geocoder = None

        self.import_committees()
        self.import_categories()
        self.scanner = AhjoScanner(verbosity=self.verbosity)
        doc_list = self.scanner.scan_documents(cached=options['cached'])
        media_dir = settings.MEDIA_ROOT
        self.scanner.doc_store_path = os.path.join(media_dir, settings.AHJO_PATHS['zip'])
        self.xml_path = os.path.join(media_dir, settings.AHJO_PATHS['xml'])
        self.attachment_path = os.path.join(media_dir, settings.AHJO_PATHS['attachment'])
        self.video_path = os.path.join(media_dir, settings.AHJO_PATHS['video'])
        for path in (self.xml_path, self.attachment_path, self.video_path):
            if not os.path.exists(path):
                os.makedirs(path)

        for info in doc_list:
            if options['meeting_id']:
                if info['origin_id'] == options['meeting_id']:
                    self.import_doc(info)
                    break
                else:
                    continue

            if options['start_from']:
                if options['start_from'] == info['origin_id']:
                    options['start_from'] = ''
                else:
                    continue

            if options['committee_id'] and info['committee_id'] != options['committee_id']:
                continue
            self.import_doc(info)
        else:
            if options['meeting_id']:
                print "No meeting document with id '%s' found" % options['meeting_id']
                exit(1)

        if self.geocoder and self.geocoder.no_match_addresses:
            print "No coordinate match found for addresses:"
            for adr in set(self.geocoder.no_match_addresses):
                print adr
        if self.failed_import_list:
            print "Importing failed for following documents:"
            for doc in self.failed_import_list:
                print doc