Example #1
0
    def parse_document(self):
        """Import the speakers and top-level section of ``self.xml.debate``.

        For every ``TLCPerson`` reference a ``Person`` (looked up by
        ``popit_id`` = the element's ``href``) and a ``Speaker`` are fetched
        or created, and the speaker is stored in ``self.speakers`` under the
        element's ``id`` attribute.  Newly created objects are only saved
        when ``self.commit`` is true.

        ``self.start_date`` is set from the ``date`` attribute of the first
        ``docDate`` found under ``coverPage`` or ``preface``; a top-level
        ``Section`` is made from the first ``docTitle`` found there (no
        section when there is none).  Finally ``debate.debateBody`` is passed
        to ``self.visit`` together with that section.
        """
        debate = self.xml.debate

        # Namespaced and plain documents use the same queries; only the tag
        # prefix and the ``namespaces`` mapping differ.
        if self.ns:
            prefix = 'an:'
            find_kwargs = {'namespaces': {'an': self.ns}}
        else:
            prefix = ''
            find_kwargs = {}

        def find_first(tag):
            # ElementPath has no ``|`` union operator (the bar is read as
            # part of a tag name and the query silently matches nothing), so
            # each container is tried in turn: coverPage first, then preface.
            for container in ('coverPage', 'preface'):
                path = '%s%s//%s%s' % (prefix, container, prefix, tag)
                found = debate.find(path, **find_kwargs)
                if found is not None:
                    return found
            return None

        people_path = '%smeta/%sreferences/%sTLCPerson' % (
            prefix, prefix, prefix)
        people = debate.findall(people_path, **find_kwargs)
        for person in people or []:
            person_id = person.get('id')
            href = person.get('href')
            try:
                p = Person.objects.get(popit_id=href)
            except Person.DoesNotExist:
                p = Person(popit_id=href, api_instance=self.ai)
                if self.commit:
                    p.save()

            # NOTE(review): without commit, ``p`` may be unsaved when used in
            # this lookup - confirm the ORM version in use accepts that.
            try:
                speaker = Speaker.objects.get(instance=self.instance, person=p)
            except Speaker.DoesNotExist:
                speaker = Speaker(instance=self.instance,
                                  name=person.get('showAs'),
                                  person=p)
                if self.commit:
                    speaker.save()

            self.speakers[person_id] = speaker

        doc_date = find_first('docDate')
        if doc_date is not None:
            self.start_date = dateutil.parse(doc_date.get('date'))

        doc_title = find_first('docTitle')
        if doc_title is None:
            section = None
        else:
            section = self.make(Section, parent=None, title=doc_title.text)

        self.visit(debate.debateBody, section)
Example #2
0
 def get_persons_data(self, dp):
     """Save a ``Speaker`` for every row of the 'persons-person' resource.

     ``dp.resources`` is searched for the first resource whose
     ``descriptor['name']`` is ``'persons-person'``.  When there is none the
     method does nothing.  Otherwise each row of that resource's ``data``
     becomes a new ``Speaker`` on ``self.instance``, populated from the
     row's ``first_name``, ``last_name``, ``email`` and
     ``gender_description`` keys, and is saved immediately.
     """
     persons = None
     for resource in dp.resources:
         if resource.descriptor['name'] == 'persons-person':
             persons = resource
             break

     # The 'persons-position' and 'persons-persons-to-positions' resources
     # are intentionally not read here yet.
     if persons is None:
         return

     for row in persons.data:
         speaker = Speaker(instance=self.instance)
         speaker.given_name = row['first_name']
         speaker.family_name = row['last_name']
         # Full name is "<given> <family>", built from the values just set.
         speaker.name = ' '.join((speaker.given_name, speaker.family_name))
         speaker.email = row['email']
         speaker.gender = row['gender_description']
         speaker.save()
Example #3
0
    def get_person(self, name, party, pombola_person_slug=None):
        """Return the ``Speaker`` to use for *name*, creating one if needed.

        Lookup order:

        1. a speaker carrying a ``pombola_person_slug`` identifier equal to
           *pombola_person_slug* (returned directly, not cached);
        2. ``self.person_cache[name]``;
        3. ``self.resolver.get_person(name, party).speaker`` (only when
           *name* is non-empty);
        4. an existing speaker on ``self.instance`` with that name, where an
           empty *name* is looked up as ``'(narrative)'``;
        5. a new ``Speaker``, saved only when ``self.commit`` is true.

        The result of steps 3-5 is stored in ``self.person_cache`` under
        *name* before being returned.
        """
        # A direct slug match wins outright: the Code4SA / PMG
        # identification of speakers seems to be better than the one from
        # popolo_name_resolver.
        if pombola_person_slug is not None:
            by_slug = Speaker.objects.filter(
                identifiers__scheme='pombola_person_slug',
                identifiers__identifier=pombola_person_slug).first()
            if by_slug:
                return by_slug

        remembered = self.person_cache.get(name)
        if remembered:
            return remembered

        display_name = name or '(narrative)'

        # Only real names go through the resolver; narrative has no person.
        resolved = self.resolver.get_person(display_name, party) if name else None
        speaker = resolved.speaker if resolved else None

        if not speaker:
            try:
                speaker = Speaker.objects.get(
                    instance=self.instance, name=display_name)
            except Speaker.DoesNotExist:
                speaker = Speaker(instance=self.instance, name=display_name)
                if self.commit:
                    speaker.save()

        self.person_cache[name] = speaker
        return speaker
Example #4
0
    def parse_document(self):
        """Import the speakers and top-level section of ``self.xml.debate``.

        Every ``TLCPerson`` reference is mapped to a ``Speaker`` on
        ``self.instance`` (matched on an identifier equal to the element's
        ``href``, otherwise created from ``showAs``) and stored in
        ``self.speakers`` under the element's ``id``.  A new speaker and its
        'Akoma Ntoso import' identifier are only saved when ``self.commit``
        is true.

        The preface tags ``docDate``, ``docTitle``, ``docNumber``,
        ``legislature`` and ``session`` are then read through
        ``self.get_preface_tag``.  When a title is present a top-level
        section is made with ``self.make_section``; if that returns a falsy
        value the method stops without visiting the body.  Otherwise
        ``debate.debateBody`` is passed to ``self.visit`` with the section
        (``None`` when there is no title).
        """
        debate = self.xml.debate

        if self.ns:
            people = debate.findall(
                'an:meta/an:references/an:TLCPerson',
                namespaces={'an': self.ns},
            )
        else:
            people = debate.findall('meta/references/TLCPerson')

        for person in people or []:
            person_id = person.get('id')
            href = person.get('href')
            try:
                speaker = Speaker.objects.get(instance=self.instance,
                                              identifiers__identifier=href)
            except Speaker.DoesNotExist:
                speaker = Speaker(instance=self.instance,
                                  name=person.get('showAs'))
                if self.commit:
                    speaker.save()
                    speaker.identifiers.create(identifier=href,
                                               scheme='Akoma Ntoso import')

            self.speakers[person_id] = speaker

        def preface_text(tag):
            # Compare against None instead of testing truthiness: a
            # childless XML element can evaluate as false even though it
            # exists and carries text.
            element = self.get_preface_tag(debate, tag)
            return None if element is None else element.text

        doc_date = self.get_preface_tag(debate, 'docDate')
        if doc_date is not None:
            self.start_date = dateutil.parse(doc_date.get('date'))

        doc_title = preface_text('docTitle')
        doc_number = preface_text('docNumber')
        legislature = preface_text('legislature')
        session = preface_text('session')

        self.imported_section_ids = set()

        section = None
        if doc_title:
            section = self.make_section(
                parent=None,
                heading=doc_title,
                start_date=self.start_date,
                number=doc_number or '',
                legislature=legislature or '',
                session=session or '',
            )
            if not section:
                return

        self.visit(debate.debateBody, section)
    def handle(self, *args, **options):
        """Download one Shakespeare play and import it as sections/speeches.

        Expects exactly one positional argument, the play's title as listed
        in ``PLAYS`` (file name -> title).  With ``options['list']`` set, or
        with a wrong number of arguments, the available titles are printed
        instead; in the latter case a ``CommandError`` is raised afterwards.

        Reads ``options['instance']`` (label of the ``Instance`` to import
        into) and ``options['commit']``, which is stored on ``self.commit``
        and decides whether speakers are created in the database.

        Raises:
            CommandError: no play given, unknown play, or unknown instance.
        """
        if options['list'] or len(args) != 1:
            self.stdout.write('Plays:\n')
            for title in sorted(PLAYS.values()):
                self.stdout.write('* %s\n' % title)
            if not options['list']:
                raise CommandError("Please specify a play")
            return

        play = args[0]
        filename = next((f for f, p in PLAYS.items() if p == play), None)
        if not filename:
            raise CommandError("No matching play found")

        try:
            self.instance = Instance.objects.get(label=options['instance'])
        except Instance.DoesNotExist:
            # Only a missing instance is reported this way; anything else
            # (e.g. KeyboardInterrupt, database errors) must propagate.
            raise CommandError("Instance specified not found")

        self.commit = options['commit']

        response = urlopen(
            'http://www.ibiblio.org/xml/examples/shakespeare/%s' % filename)
        try:
            xml = response.read()
        finally:
            response.close()
        play_xml = etree.fromstring(xml)
        play_section = self.make(Section, heading=play)

        # Speakers already seen in this play, keyed by cleaned-up name.
        speakers = {}
        for act in play_xml:
            if act.tag != 'ACT':
                continue
            # First child of an act is read as its heading, the rest as scenes.
            act_section = self.make(Section,
                                    heading=act[0].text,
                                    parent=play_section)
            for scene in act[1:]:
                scene_section = self.make(Section,
                                          heading=scene[0].text,
                                          parent=act_section)
                for sp in scene[1:]:
                    if sp.tag in ('STAGEDIR', 'SUBHEAD', 'SUBTITLE'):
                        self.make(Speech,
                                  section=scene_section,
                                  text='<p><i>%s</i></p>' % sp.text,
                                  type='narrative')
                        continue

                    raw_name = sp[0].text
                    if not raw_name:
                        speaker = None
                    else:
                        # Strip brackets in both commit and dry-run mode so
                        # the same speaker name is used either way.
                        name = raw_name.replace('[', '').replace(']', '')
                        speaker = speakers.get(name)
                        if speaker is None:
                            if self.commit:
                                speaker = Speaker.objects.create(
                                    name=name, instance=self.instance)
                            else:
                                speaker = Speaker(name=name,
                                                  instance=self.instance)
                            speakers[name] = speaker

                    parts = []
                    for line in sp[1:]:
                        if len(line):
                            # Line with a child element: child text in
                            # italics, followed by the child's tail text.
                            part = '<i>%s</i>' % line[0].text
                            if line[0].tail:
                                part += ' %s' % line[0].tail.strip()
                            parts.append(part + '<br>\n')
                        elif line.tag == 'LINE':
                            parts.append('%s<br>\n' % line.text)
                        elif line.tag == 'STAGEDIR':
                            parts.append('<i>%s</i><br>\n' % line.text)

                    self.make(Speech,
                              speaker=speaker,
                              section=scene_section,
                              text='<p>%s</p>' % ''.join(parts),
                              type='speech')
    def parse_document(self):
        """Import the speakers and top-level section of ``self.xml.debate``.

        Every ``TLCPerson`` reference is mapped to a ``Speaker`` on
        ``self.instance`` (matched on an identifier equal to the element's
        ``href``, otherwise created from ``showAs``) and stored in
        ``self.speakers`` under the element's ``id``.  A new speaker and its
        'Akoma Ntoso import' identifier are only saved when ``self.commit``
        is true, but it is counted in the statistics either way.

        The preface tags ``docDate``, ``docTitle``, ``docNumber``,
        ``legislature``, ``session`` and ``link`` are then read through
        ``self.get_preface_tag``.  An unparsable or missing ``date``
        attribute on ``docDate`` is logged as a warning and otherwise
        ignored.  When a title is present a top-level section is made with
        ``self.make_section``; if that returns a falsy value the body is not
        visited.

        Returns:
            ``self.stats``: a dict mapping ``Speaker`` to the number of
            speakers created by this call.

        Raises:
            Exception: a ``TLCPerson`` without a matching speaker has no
            ``showAs`` attribute.
        """
        self.stats = {Speaker: 0}
        debate = self.xml.debate

        if self.ns:
            people = debate.findall(
                'an:meta/an:references/an:TLCPerson',
                namespaces={'an': self.ns},
            )
        else:
            people = debate.findall('meta/references/TLCPerson')

        for person in people or []:
            person_id = person.get('id')
            href = person.get('href')
            try:
                speaker = Speaker.objects.get(instance=self.instance,
                                              identifiers__identifier=href)
            except Speaker.DoesNotExist:
                name = person.get('showAs')
                if not name:
                    raise Exception("TLCPerson '%s' is missing showAs" % href)
                speaker = Speaker(instance=self.instance, name=name)
                self.stats[Speaker] += 1

                if self.commit:
                    speaker.save()
                    speaker.identifiers.create(identifier=href,
                                               scheme='Akoma Ntoso import')

            self.speakers[person_id] = speaker

        def preface_text(tag):
            # Compare against None instead of testing truthiness: a
            # childless XML element can evaluate as false even though it
            # exists and carries text.
            element = self.get_preface_tag(debate, tag)
            return None if element is None else element.text

        doc_date = self.get_preface_tag(debate, 'docDate')
        if doc_date is not None:
            date = doc_date.get('date')
            if date:
                try:
                    self.start_date = dateutil.parse(date)
                except ValueError:
                    logger.warning("docDate element did not parse '%s'", date)
            else:
                logger.warning(
                    "docDate element missing required date attribute")

        doc_title = preface_text('docTitle')
        doc_number = preface_text('docNumber')
        legislature = preface_text('legislature')
        session = preface_text('session')

        source_url = self.get_preface_tag(debate, 'link')
        if source_url is not None:
            source_url = source_url.get('href')

        self.imported_section_ids = set()

        section = None
        if doc_title:
            section = self.make_section(
                source_url=source_url or '',
                parent=None,
                heading=doc_title,
                start_date=self.start_date,
                number=doc_number or '',
                legislature=legislature or '',
                session=session or '',
            )

            if not section:
                return self.stats

        self.visit(debate.debateBody, section)
        return self.stats