Example #1
0
    def parse_document(self):
        """Register the document's people as speakers, read its date and
        visit the debate body.

        Every ``TLCPerson`` reference is matched to a ``Person`` by its
        ``href`` and to a ``Speaker`` for this instance; objects that do not
        exist yet are constructed and saved only when ``self.commit`` is set.
        The speaker is stored in ``self.speakers`` under the reference's
        ``id``.  ``self.start_date`` is set from the preface ``docDate`` when
        one is found, and ``self.visit`` is called on the debate body with no
        parent.
        """
        debate = self.xml.debate

        # Namespaced documents need the 'an' prefix on every path step.
        if not self.ns:
            people = self.tree.findall('debate/meta/references/TLCPerson')
        else:
            people = self.tree.findall(
                'an:debate/an:meta/an:references/an:TLCPerson',
                namespaces={'an': self.ns})
        if people is None:
            people = []

        for tlc_person in people:
            ref_id = tlc_person.get('id')
            href = tlc_person.get('href')

            try:
                popit = Person.objects.get(popit_id=href)
            except Person.DoesNotExist:
                popit = Person(popit_id=href, api_instance=self.ai)
                if self.commit:
                    popit.save()

            try:
                speaker = Speaker.objects.get(
                    instance=self.instance, person=popit)
            except Speaker.DoesNotExist:
                speaker = Speaker(
                    instance=self.instance,
                    name=tlc_person.get('showAs'),
                    person=popit)
                if self.commit:
                    speaker.save()

            self.speakers[ref_id] = speaker

        doc_date = (
            debate.find('an:preface//an:docDate', namespaces={'an': self.ns})
            if self.ns
            else debate.find('preface//docDate'))
        if doc_date is not None:
            self.start_date = dateutil.parse(doc_date.get('date'))

        self.visit(debate.debateBody, None)
Example #2
0
    def parse_document(self):
        """Import the speakers, date and title of the debate, then visit
        its body.

        Every ``TLCPerson`` reference is matched to a ``Person`` by its
        ``href`` and to a ``Speaker`` for this instance; missing objects are
        constructed and saved only when ``self.commit`` is set.  The speaker
        is stored in ``self.speakers`` under the reference's ``id``.

        ``self.start_date`` is set from the first ``docDate`` found in the
        cover page or preface.  When a ``docTitle`` is found there, the result
        of ``self.make(Section, parent=None, title=...)`` is passed to
        ``self.visit`` as the section for the debate body; otherwise ``None``
        is passed.
        """
        debate = self.xml.debate

        def find_front_matter(tag):
            # ElementPath, which find() uses, has no '|' union operator, so a
            # combined 'coverPage//x|preface//x' path never matches.  Search
            # the two containers one after the other instead.
            for container in ('coverPage', 'preface'):
                if self.ns:
                    found = debate.find(
                        'an:%s//an:%s' % (container, tag),
                        namespaces={'an': self.ns})
                else:
                    found = debate.find('%s//%s' % (container, tag))
                if found is not None:
                    return found
            return None

        if self.ns:
            people = debate.findall('an:meta/an:references/an:TLCPerson',
                                    namespaces={'an': self.ns})
        else:
            people = debate.findall('meta/references/TLCPerson')

        for person in people or []:
            person_id = person.get('id')
            href = person.get('href')
            try:
                p = Person.objects.get(popit_id=href)
            except Person.DoesNotExist:
                p = Person(popit_id=href, api_instance=self.ai)
                if self.commit:
                    p.save()

            try:
                speaker = Speaker.objects.get(instance=self.instance, person=p)
            except Speaker.DoesNotExist:
                speaker = Speaker(instance=self.instance,
                                  name=person.get('showAs'),
                                  person=p)
                if self.commit:
                    speaker.save()

            self.speakers[person_id] = speaker

        docDate = find_front_matter('docDate')
        if docDate is not None:
            self.start_date = dateutil.parse(docDate.get('date'))

        docTitle = find_front_matter('docTitle')
        if docTitle is None:
            section = None
        else:
            section = self.make(Section, parent=None, title=docTitle.text)

        self.visit(debate.debateBody, section)
Example #3
0
 def get_persons_data(self, dp):
     """Create and save a Speaker for each row of the 'persons-person'
     resource of ``dp``.

     Does nothing when ``dp.resources`` holds no resource with that name.
     """
     # Only the first resource named 'persons-person' is used.  The
     # 'persons-position' and 'persons-persons-to-positions' resources are
     # not read yet.
     persons = None
     for resource in dp.resources:
         if resource.descriptor['name'] == 'persons-person':
             persons = resource
             break

     if persons is None:
         return

     for row in persons.data:
         speaker = Speaker(instance=self.instance)
         speaker.given_name = row['first_name']
         speaker.family_name = row['last_name']
         # Display name is "<given> <family>".
         speaker.name = ' '.join((speaker.given_name, speaker.family_name))
         speaker.email = row['email']
         speaker.gender = row['gender_description']
         speaker.save()
Example #4
0
    def get_person(self, name):
        """Return the Speaker to use for ``name``, creating one if needed.

        A truthy entry in ``self.person_cache`` is returned straight away.
        Otherwise, for a non-empty name, ``self.resolver`` (when set) is asked
        for a matching person and the Speaker linked to that person is looked
        up.  If that yields nothing, the Speaker is looked up by display name
        (``'(narrative)'`` for an empty name) or constructed, and linked to
        the resolved person if there is one.  Saves happen only when
        ``self.commit`` is set.  The result is stored in the cache under
        ``name`` before being returned.
        """
        hit = self.person_cache.get(name, None)
        if hit:
            return hit

        display_name = name if name else '(narrative)'
        speaker = popit_person = None

        if name:
            # Counters: every named lookup, and every one the resolver matched.
            self.speakers_count += 1
        if name and self.resolver:
            popit_person = self.resolver.get_person(display_name)
            if not popit_person:
                logger.info(" - Failed to get user %s" % display_name)
            else:
                self.speakers_matched += 1
                try:
                    speaker = Speaker.objects.get(
                        instance=self.instance, person=popit_person)
                except Speaker.DoesNotExist:
                    # Fall through to the lookup by name below.
                    speaker = None

        if not speaker:
            lookup = {'instance': self.instance, 'name': display_name}
            try:
                speaker = Speaker.objects.get(**lookup)
            except Speaker.DoesNotExist:
                speaker = Speaker(**lookup)
                if self.commit:
                    speaker.save()

            if popit_person:
                speaker.person = popit_person
                if self.commit:
                    speaker.save()

        self.person_cache[name] = speaker
        return speaker
Example #5
0
    def get_person(self, name, party, pombola_person_slug=None):
        """Return the Speaker for ``name``, preferring a direct match on
        ``pombola_person_slug``.

        Lookup order: a Speaker carrying the given slug as a
        'pombola_person_slug' identifier, a truthy entry in
        ``self.person_cache``, the speaker of the person returned by
        ``self.resolver`` for a non-empty name, and finally a Speaker looked
        up by display name (``'(narrative)'`` for an empty name) or newly
        constructed and saved only when ``self.commit`` is set.  Results of
        the last two steps are cached under ``name``.
        """
        # A slug match wins over everything else: the Code4SA / PMG
        # identification of speakers seems to be better than that from
        # popolo_name_resolver.
        if pombola_person_slug is not None:
            by_slug = Speaker.objects.filter(
                identifiers__scheme='pombola_person_slug',
                identifiers__identifier=pombola_person_slug,
            ).first()
            if by_slug:
                return by_slug

        hit = self.person_cache.get(name, None)
        if hit:
            return hit

        display_name = name or '(narrative)'

        resolved = (
            self.resolver.get_person(display_name, party) if name else None)
        speaker = resolved.speaker if resolved else None

        if not speaker:
            lookup = {'instance': self.instance, 'name': display_name}
            try:
                speaker = Speaker.objects.get(**lookup)
            except Speaker.DoesNotExist:
                speaker = Speaker(**lookup)
                if self.commit:
                    speaker.save()

        self.person_cache[name] = speaker
        return speaker
Example #6
0
    def parse_document(self):
        """Import the speakers, date and title of the debate, then visit
        its body.

        Every ``TLCPerson`` reference is matched to a ``Speaker`` of this
        instance by its ``href`` identifier.  A missing speaker is constructed
        from the ``showAs`` attribute and, only when ``self.commit`` is set,
        saved together with an 'Akoma Ntoso import' identifier.  The speaker
        is stored in ``self.speakers`` under the reference's ``id``.

        ``self.start_date`` is set from the first ``docDate`` found in the
        cover page or preface.  When a ``docTitle`` is found there, the result
        of ``self.make(Section, parent=None, title=...)`` is passed to
        ``self.visit`` as the section for the debate body; otherwise ``None``
        is passed.
        """
        debate = self.xml.debate

        def find_front_matter(tag):
            # ElementPath, which find() uses, has no '|' union operator, so a
            # combined 'coverPage//x|preface//x' path never matches.  Search
            # the two containers one after the other instead.
            for container in ('coverPage', 'preface'):
                if self.ns:
                    found = debate.find(
                        'an:%s//an:%s' % (container, tag),
                        namespaces={'an': self.ns})
                else:
                    found = debate.find('%s//%s' % (container, tag))
                if found is not None:
                    return found
            return None

        if self.ns:
            people = debate.findall('an:meta/an:references/an:TLCPerson', namespaces={'an': self.ns})
        else:
            people = debate.findall('meta/references/TLCPerson')

        for person in people or []:
            person_id = person.get('id')
            href = person.get('href')
            try:
                speaker = Speaker.objects.get(instance=self.instance, identifiers__identifier=href)
            except Speaker.DoesNotExist:
                speaker = Speaker(instance=self.instance, name=person.get('showAs'))
                if self.commit:
                    speaker.save()
                    speaker.identifiers.create(identifier=href, scheme='Akoma Ntoso import')

            self.speakers[person_id] = speaker

        docDate = find_front_matter('docDate')
        if docDate is not None:
            self.start_date = dateutil.parse(docDate.get('date'))

        docTitle = find_front_matter('docTitle')
        if docTitle is None:
            section = None
        else:
            section = self.make(Section, parent=None, title=docTitle.text)

        self.visit(debate.debateBody, section)
Example #7
0
    def get_person(self, name, party, pombola_person_slug=None):
        """Find or create the Speaker for ``name``.

        A Speaker identified by ``pombola_person_slug`` is returned first if
        one exists, then a truthy ``self.person_cache`` entry.  Failing both,
        a non-empty name is handed to ``self.resolver`` together with
        ``party`` and the resolved person's speaker is used.  As a last
        resort the Speaker is fetched by display name (``'(narrative)'`` when
        the name is empty) or constructed, being saved only when
        ``self.commit`` is set.  The speaker is cached under ``name`` before
        it is returned.
        """
        if pombola_person_slug is not None:
            # Trust the slug over name resolution: the Code4SA / PMG
            # identification of speakers seems to be better than that from
            # popolo_name_resolver.
            slug_matches = Speaker.objects.filter(
                identifiers__scheme='pombola_person_slug',
                identifiers__identifier=pombola_person_slug)
            slug_speaker = slug_matches.first()
            if slug_speaker:
                return slug_speaker

        remembered = self.person_cache.get(name, None)
        if remembered:
            return remembered

        display_name = name if name else '(narrative)'
        speaker = None

        if name:
            match = self.resolver.get_person(display_name, party)
            if match:
                speaker = match.speaker

        if not speaker:
            try:
                speaker = Speaker.objects.get(
                    instance=self.instance,
                    name=display_name,
                )
            except Speaker.DoesNotExist:
                speaker = Speaker(
                    instance=self.instance,
                    name=display_name,
                )
                if self.commit:
                    speaker.save()

        self.person_cache[name] = speaker
        return speaker
Example #8
0
    def parse_document(self):
        """Import the speakers and preface metadata of the debate, then
        visit its body.

        Every ``TLCPerson`` reference is matched to a ``Speaker`` of this
        instance by its ``href`` identifier.  A missing speaker is constructed
        from ``showAs`` and counted in ``self.stats``; only when
        ``self.commit`` is set is it saved together with an
        'Akoma Ntoso import' identifier.  Speakers are stored in
        ``self.speakers`` under the reference's ``id``.

        ``self.start_date`` is set from the ``date`` attribute of the preface
        ``docDate``; a missing or unparsable date is logged as a warning.
        When the preface has a non-empty ``docTitle``, ``self.make_section``
        is called with the preface metadata and, if it returns nothing, the
        body is not visited.

        Returns:
            ``self.stats``, a dict whose ``Speaker`` entry counts the
            speakers constructed by this call.

        Raises:
            Exception: if a TLCPerson without an existing Speaker has no
                ``showAs`` attribute.
        """
        self.stats = {Speaker: 0}
        debate = self.xml.debate

        def preface_text(tag):
            # Compare with None explicitly: the truth value of an XML element
            # does not say whether it was found (in ElementTree a childless
            # element is falsy).
            element = self.get_preface_tag(debate, tag)
            return None if element is None else element.text

        if self.ns:
            people = debate.findall(
                'an:meta/an:references/an:TLCPerson',
                namespaces={'an': self.ns},
                )
        else:
            people = debate.findall('meta/references/TLCPerson')

        for person in people or []:
            person_id = person.get('id')
            href = person.get('href')
            try:
                speaker = Speaker.objects.get(
                    instance=self.instance, identifiers__identifier=href)
            except Speaker.DoesNotExist:
                name = person.get('showAs')
                if not name:
                    raise Exception("TLCPerson '%s' is missing showAs" % href)
                speaker = Speaker(instance=self.instance, name=name)
                self.stats[Speaker] += 1

                if self.commit:
                    speaker.save()
                    speaker.identifiers.create(
                        identifier=href, scheme='Akoma Ntoso import')

            self.speakers[person_id] = speaker

        docDate = self.get_preface_tag(debate, 'docDate')
        if docDate is not None:
            date = docDate.get('date')
            if date:
                try:
                    self.start_date = dateutil.parse(date)
                except ValueError:
                    # Logger.warn is deprecated; warning() is the supported name.
                    logger.warning("docDate element did not parse '%s'", date)
            else:
                logger.warning("docDate element missing required date attribute")

        docTitle = preface_text('docTitle')
        docNumber = preface_text('docNumber')
        legislature = preface_text('legislature')
        session = preface_text('session')

        link = self.get_preface_tag(debate, 'link')
        source_url = link.get('href') if link is not None else None

        self.imported_section_ids = set()

        section = None
        if docTitle:
            kwargs = {
                'parent': None,
                'heading': docTitle,
                'start_date': self.start_date,
                'number': docNumber or '',
                'legislature': legislature or '',
                'session': session or '',
            }

            section = self.make_section(source_url=source_url or '', **kwargs)

            # No section came back from make_section: stop without visiting
            # the body.
            if not section:
                return self.stats

        self.visit(debate.debateBody, section)
        return self.stats
Example #9
0
    def parse_document(self):
        """Import the speakers and preface metadata of the debate, then
        visit its body.

        Every ``TLCPerson`` reference is matched to a ``Speaker`` of this
        instance by its ``href`` identifier.  A missing speaker is constructed
        from ``showAs`` and, only when ``self.commit`` is set, saved together
        with an 'Akoma Ntoso import' identifier.  Speakers are stored in
        ``self.speakers`` under the reference's ``id``.

        ``self.start_date`` is set from the ``date`` attribute of the preface
        ``docDate``; a missing or unparsable date is logged as a warning and
        leaves ``self.start_date`` untouched.  When the preface has a
        non-empty ``docTitle``, ``self.make_section`` is called with the
        preface metadata and, if it returns nothing, the body is not visited.

        Returns:
            None.
        """
        # Local import: this block does not otherwise reference a logger.
        import logging

        log = logging.getLogger(__name__)
        debate = self.xml.debate

        def preface_text(tag):
            # Compare with None explicitly: the truth value of an XML element
            # does not say whether it was found (in ElementTree a childless
            # element is falsy).
            element = self.get_preface_tag(debate, tag)
            return None if element is None else element.text

        if self.ns:
            people = debate.findall(
                'an:meta/an:references/an:TLCPerson',
                namespaces={'an': self.ns},
            )
        else:
            people = debate.findall('meta/references/TLCPerson')

        for person in people or []:
            person_id = person.get('id')
            href = person.get('href')
            try:
                speaker = Speaker.objects.get(instance=self.instance,
                                              identifiers__identifier=href)
            except Speaker.DoesNotExist:
                speaker = Speaker(instance=self.instance,
                                  name=person.get('showAs'))
                if self.commit:
                    speaker.save()
                    speaker.identifiers.create(identifier=href,
                                               scheme='Akoma Ntoso import')

            self.speakers[person_id] = speaker

        # A docDate that carries only a date attribute (no children or text)
        # can be falsy, so test for presence rather than truth.
        docDate = self.get_preface_tag(debate, 'docDate')
        if docDate is not None:
            date = docDate.get('date')
            if date:
                try:
                    self.start_date = dateutil.parse(date)
                except ValueError:
                    log.warning("docDate element did not parse '%s'", date)
            else:
                log.warning("docDate element missing required date attribute")

        docTitle = preface_text('docTitle')
        docNumber = preface_text('docNumber')
        legislature = preface_text('legislature')
        session = preface_text('session')

        self.imported_section_ids = set()

        section = None
        if docTitle:
            kwargs = {
                'parent': None,
                'heading': docTitle,
                'start_date': self.start_date,
                'number': docNumber or '',
                'legislature': legislature or '',
                'session': session or '',
            }
            section = self.make_section(**kwargs)
            # No section came back from make_section: stop without visiting
            # the body.
            if not section:
                return

        self.visit(debate.debateBody, section)
Example #10
0
    def parse_document(self):
        """Import the speakers and preface metadata of the debate, then
        visit its body.

        Every ``TLCPerson`` reference is matched to a ``Speaker`` of this
        instance by its ``href`` identifier.  A missing speaker is constructed
        from ``showAs`` and, only when ``self.commit`` is set, saved together
        with an 'Akoma Ntoso import' identifier.  Speakers are stored in
        ``self.speakers`` under the reference's ``id``.

        ``self.start_date`` is set from the ``date`` attribute of the preface
        ``docDate``; a missing or unparsable date is logged as a warning and
        leaves ``self.start_date`` untouched.  When the preface has a
        non-empty ``docTitle``, a ``Section`` is made from the preface
        metadata via ``self.make`` and passed to ``self.visit`` as the section
        for the debate body; otherwise ``None`` is passed.
        """
        # Local import: this block does not otherwise reference a logger.
        import logging

        log = logging.getLogger(__name__)
        debate = self.xml.debate

        def preface_text(tag):
            # Compare with None explicitly: the truth value of an XML element
            # does not say whether it was found (in ElementTree a childless
            # element is falsy).
            element = self.get_preface_tag(debate, tag)
            return None if element is None else element.text

        if self.ns:
            people = debate.findall(
                'an:meta/an:references/an:TLCPerson',
                namespaces={'an': self.ns},
                )
        else:
            people = debate.findall('meta/references/TLCPerson')

        for person in people or []:
            person_id = person.get('id')
            href = person.get('href')
            try:
                speaker = Speaker.objects.get(
                    instance=self.instance, identifiers__identifier=href)
            except Speaker.DoesNotExist:
                speaker = Speaker(
                    instance=self.instance, name=person.get('showAs'))
                if self.commit:
                    speaker.save()
                    speaker.identifiers.create(
                        identifier=href, scheme='Akoma Ntoso import')

            self.speakers[person_id] = speaker

        # A docDate that carries only a date attribute (no children or text)
        # can be falsy, so test for presence rather than truth.
        docDate = self.get_preface_tag(debate, 'docDate')
        if docDate is not None:
            date = docDate.get('date')
            if date:
                try:
                    self.start_date = dateutil.parse(date)
                except ValueError:
                    log.warning("docDate element did not parse '%s'", date)
            else:
                log.warning("docDate element missing required date attribute")

        docTitle = preface_text('docTitle')
        docNumber = preface_text('docNumber')
        legislature = preface_text('legislature')
        session = preface_text('session')

        section = None
        if docTitle:
            section = self.make(
                Section,
                parent=None,
                heading=docTitle,
                start_date=self.start_date,
                number=docNumber or '',
                legislature=legislature or '',
                session=session or '',
            )

        self.visit(debate.debateBody, section)
    def parse_document(self):
        """Import the speakers and preface metadata of the debate, then
        visit its body.

        Every ``TLCPerson`` reference is matched to a ``Speaker`` of this
        instance by its ``href`` identifier.  A missing speaker is constructed
        from ``showAs`` and, only when ``self.commit`` is set, saved together
        with an "Akoma Ntoso import" identifier.  Speakers are stored in
        ``self.speakers`` under the reference's ``id``.

        ``self.start_date`` is set from the ``date`` attribute of the preface
        ``docDate``; a missing or unparsable date is logged as a warning and
        leaves ``self.start_date`` untouched.

        When the preface has a non-empty ``docTitle`` a ``Section`` is made
        for it.  If ``self.clobber`` is not ``None``, an existing section with
        the same metadata is looked up first: with a truthy ``self.clobber``
        it is deleted along with its descendant speeches, otherwise the import
        is skipped and the method returns without visiting the body.
        """
        debate = self.xml.debate

        def preface_text(tag):
            # Compare with None explicitly: the truth value of an XML element
            # does not say whether it was found (in ElementTree a childless
            # element is falsy).
            element = self.get_preface_tag(debate, tag)
            return None if element is None else element.text

        if self.ns:
            people = debate.findall("an:meta/an:references/an:TLCPerson", namespaces={"an": self.ns})
        else:
            people = debate.findall("meta/references/TLCPerson")

        for person in people or []:
            person_id = person.get("id")
            href = person.get("href")
            try:
                speaker = Speaker.objects.get(instance=self.instance, identifiers__identifier=href)
            except Speaker.DoesNotExist:
                speaker = Speaker(instance=self.instance, name=person.get("showAs"))
                if self.commit:
                    speaker.save()
                    speaker.identifiers.create(identifier=href, scheme="Akoma Ntoso import")

            self.speakers[person_id] = speaker

        # A docDate that carries only a date attribute (no children or text)
        # can be falsy, so test for presence rather than truth.
        docDate = self.get_preface_tag(debate, "docDate")
        if docDate is not None:
            date = docDate.get("date")
            if date:
                try:
                    self.start_date = dateutil.parse(date)
                except ValueError:
                    logger.warning("docDate element did not parse '%s'", date)
            else:
                logger.warning("docDate element missing required date attribute")

        docTitle = preface_text("docTitle")
        docNumber = preface_text("docNumber")
        legislature = preface_text("legislature")
        session = preface_text("session")

        # The link element is tested for presence, not truth, and a missing
        # href falls back to "" so that source_url is always a string.
        link = self.get_preface_tag(debate, "link")
        source_url = (link.get("href") if link is not None else None) or ""

        section = None
        if docTitle:
            kwargs = {
                "parent": None,
                "heading": docTitle,
                "start_date": self.start_date,
                "number": docNumber or "",
                "legislature": legislature or "",
                "session": session or "",
            }

            # If the importer has no opinion on clobbering, just import the section,
            # potentially creating a duplicate section.
            if self.clobber is not None:
                try:
                    existing = Section.objects.for_instance(self.instance).get(**kwargs)
                except Section.DoesNotExist:
                    logger.info("Importing %s", docTitle)
                else:
                    if not self.clobber:
                        logger.info("Skipping %s", docTitle)
                        return
                    logger.info("Clobbering %s", docTitle)
                    for speech in existing.descendant_speeches():
                        speech.delete()
                    existing.delete()

            section = self.make(Section, source_url=source_url, **kwargs)

        self.visit(debate.debateBody, section)
Example #12
0
    def parse_document(self):
        """Import the speakers and preface metadata of the debate, then
        visit its body.

        Every ``TLCPerson`` reference is matched to a ``Speaker`` of this
        instance by its ``href`` identifier.  A missing speaker is constructed
        from ``showAs`` and, only when ``self.commit`` is set, saved together
        with an 'Akoma Ntoso import' identifier.  Speakers are stored in
        ``self.speakers`` under the reference's ``id``.

        ``self.start_date`` is set from the ``date`` attribute of the preface
        ``docDate``; a missing or unparsable date is logged as a warning and
        leaves ``self.start_date`` untouched.

        When the preface has a non-empty ``docTitle`` a ``Section`` is made
        for it.  If ``self.clobber`` is not ``None``, an existing section with
        the same metadata is looked up first: with a truthy ``self.clobber``
        it is deleted along with its descendant speeches, otherwise the import
        is skipped and the method returns without visiting the body.
        """
        debate = self.xml.debate

        def preface_text(tag):
            # Compare with None explicitly: the truth value of an XML element
            # does not say whether it was found (in ElementTree a childless
            # element is falsy).
            element = self.get_preface_tag(debate, tag)
            return None if element is None else element.text

        if self.ns:
            people = debate.findall(
                'an:meta/an:references/an:TLCPerson',
                namespaces={'an': self.ns},
                )
        else:
            people = debate.findall('meta/references/TLCPerson')

        for person in people or []:
            person_id = person.get('id')
            href = person.get('href')
            try:
                speaker = Speaker.objects.get(
                    instance=self.instance, identifiers__identifier=href)
            except Speaker.DoesNotExist:
                speaker = Speaker(
                    instance=self.instance, name=person.get('showAs'))
                if self.commit:
                    speaker.save()
                    speaker.identifiers.create(
                        identifier=href, scheme='Akoma Ntoso import')

            self.speakers[person_id] = speaker

        # A docDate that carries only a date attribute (no children or text)
        # can be falsy, so test for presence rather than truth.
        docDate = self.get_preface_tag(debate, 'docDate')
        if docDate is not None:
            date = docDate.get('date')
            if date:
                try:
                    self.start_date = dateutil.parse(date)
                except ValueError:
                    logger.warning("docDate element did not parse '%s'", date)
            else:
                logger.warning("docDate element missing required date attribute")

        docTitle = preface_text('docTitle')
        docNumber = preface_text('docNumber')
        legislature = preface_text('legislature')
        session = preface_text('session')

        section = None
        if docTitle:
            kwargs = {
                'parent': None,
                'heading': docTitle,
                'start_date': self.start_date,
                'number': docNumber or '',
                'legislature': legislature or '',
                'session': session or '',
            }

            # If the importer has no opinion on clobbering, just import the section,
            # potentially creating a duplicate section.
            if self.clobber is not None:
                try:
                    existing = Section.objects.for_instance(self.instance).get(**kwargs)
                except Section.DoesNotExist:
                    logger.info('Importing %s', docTitle)
                else:
                    if not self.clobber:
                        logger.info('Skipping %s', docTitle)
                        return
                    logger.info('Clobbering %s', docTitle)
                    for speech in existing.descendant_speeches():
                        speech.delete()
                    existing.delete()

            section = self.make(Section, **kwargs)

        self.visit(debate.debateBody, section)
    def parse_document(self):
        """Import the speakers and preface metadata of the debate, then
        visit its body.

        Every ``TLCPerson`` reference is matched to a ``Speaker`` of this
        instance by its ``href`` identifier.  A missing speaker is constructed
        from ``showAs`` and counted in ``self.stats``; only when
        ``self.commit`` is set is it saved together with an
        'Akoma Ntoso import' identifier.  Speakers are stored in
        ``self.speakers`` under the reference's ``id``.

        ``self.start_date`` is set from the ``date`` attribute of the preface
        ``docDate``; a missing or unparsable date is logged as a warning.
        When the preface has a non-empty ``docTitle``, ``self.make_section``
        is called with the preface metadata and, if it returns nothing, the
        body is not visited.

        Returns:
            ``self.stats``, a dict whose ``Speaker`` entry counts the
            speakers constructed by this call.

        Raises:
            Exception: if a TLCPerson without an existing Speaker has no
                ``showAs`` attribute.
        """
        self.stats = {Speaker: 0}
        debate = self.xml.debate

        def preface_text(tag):
            # Compare with None explicitly: the truth value of an XML element
            # does not say whether it was found (in ElementTree a childless
            # element is falsy).
            element = self.get_preface_tag(debate, tag)
            return None if element is None else element.text

        if self.ns:
            people = debate.findall(
                'an:meta/an:references/an:TLCPerson',
                namespaces={'an': self.ns},
            )
        else:
            people = debate.findall('meta/references/TLCPerson')

        for person in people or []:
            person_id = person.get('id')
            href = person.get('href')
            try:
                speaker = Speaker.objects.get(instance=self.instance,
                                              identifiers__identifier=href)
            except Speaker.DoesNotExist:
                name = person.get('showAs')
                if not name:
                    raise Exception("TLCPerson '%s' is missing showAs" % href)
                speaker = Speaker(instance=self.instance, name=name)
                self.stats[Speaker] += 1

                if self.commit:
                    speaker.save()
                    speaker.identifiers.create(identifier=href,
                                               scheme='Akoma Ntoso import')

            self.speakers[person_id] = speaker

        docDate = self.get_preface_tag(debate, 'docDate')
        if docDate is not None:
            date = docDate.get('date')
            if date:
                try:
                    self.start_date = dateutil.parse(date)
                except ValueError:
                    # Logger.warn is deprecated; warning() is the supported name.
                    logger.warning("docDate element did not parse '%s'", date)
            else:
                logger.warning("docDate element missing required date attribute")

        docTitle = preface_text('docTitle')
        docNumber = preface_text('docNumber')
        legislature = preface_text('legislature')
        session = preface_text('session')

        link = self.get_preface_tag(debate, 'link')
        source_url = link.get('href') if link is not None else None

        self.imported_section_ids = set()

        section = None
        if docTitle:
            kwargs = {
                'parent': None,
                'heading': docTitle,
                'start_date': self.start_date,
                'number': docNumber or '',
                'legislature': legislature or '',
                'session': session or '',
            }

            section = self.make_section(source_url=source_url or '', **kwargs)

            # No section came back from make_section: stop without visiting
            # the body.
            if not section:
                return self.stats

        self.visit(debate.debateBody, section)
        return self.stats