def parse_document(self):
    """Register the referenced speakers, read the document date, walk the body.

    Every ``TLCPerson`` reference found under ``debate/meta/references`` is
    matched to a ``Person`` (by ``popit_id`` == the element's ``href``) and
    then to a ``Speaker`` for ``self.instance``.  Objects that do not exist
    yet are constructed, and saved only when ``self.commit`` is set.  The
    resulting speakers are stored in ``self.speakers`` keyed by the
    element's ``id`` attribute.

    If the preface contains a ``docDate`` element, its ``date`` attribute is
    parsed into ``self.start_date``.  Finally ``debate.debateBody`` is
    visited with no parent section.
    """
    debate = self.xml.debate

    # Namespaced documents need the 'an' prefix on every path step.
    if self.ns:
        person_nodes = self.tree.findall(
            'an:debate/an:meta/an:references/an:TLCPerson',
            namespaces={'an': self.ns})
    else:
        person_nodes = self.tree.findall('debate/meta/references/TLCPerson')

    for node in ([] if person_nodes is None else person_nodes):
        ref_id = node.get('id')
        href = node.get('href')

        try:
            popit_person = Person.objects.get(popit_id=href)
        except Person.DoesNotExist:
            popit_person = Person(popit_id=href, api_instance=self.ai)
            if self.commit:
                popit_person.save()

        try:
            found = Speaker.objects.get(
                instance=self.instance, person=popit_person)
        except Speaker.DoesNotExist:
            found = Speaker(
                instance=self.instance,
                name=node.get('showAs'),
                person=popit_person)
            if self.commit:
                found.save()

        self.speakers[ref_id] = found

    if self.ns:
        date_node = debate.find(
            'an:preface//an:docDate', namespaces={'an': self.ns})
    else:
        date_node = debate.find('preface//docDate')

    if date_node is not None:
        self.start_date = dateutil.parse(date_node.get('date'))

    self.visit(debate.debateBody, None)
def parse_document(self):
    """Register speakers, read date/title front matter and walk the body.

    Every ``TLCPerson`` under ``meta/references`` is matched to a ``Person``
    (by ``popit_id`` == ``href``) and a ``Speaker`` for ``self.instance``;
    missing objects are constructed and saved only when ``self.commit`` is
    set.  Speakers are stored in ``self.speakers`` keyed by the element's
    ``id``.

    ``docDate`` and ``docTitle`` are looked up in ``coverPage`` first and
    ``preface`` second.  A found ``docDate`` sets ``self.start_date``; a
    found ``docTitle`` produces a top-level ``Section`` via ``self.make``.
    ``debate.debateBody`` is then visited with that section (or ``None``).
    """
    debate = self.xml.debate

    def find_front_matter(tag):
        # ElementPath ``find()`` has no union operator, so a single
        # 'coverPage//x|preface//x' expression never matches.  Probe each
        # container separately, in the same order as the old expression.
        for container in ('coverPage', 'preface'):
            if self.ns:
                found = debate.find(
                    'an:%s//an:%s' % (container, tag),
                    namespaces={'an': self.ns})
            else:
                found = debate.find('%s//%s' % (container, tag))
            if found is not None:
                return found
        return None

    if self.ns:
        people = debate.findall(
            'an:meta/an:references/an:TLCPerson',
            namespaces={'an': self.ns})
    else:
        people = debate.findall('meta/references/TLCPerson')
    if people is None:
        people = []

    for person in people:
        person_id = person.get('id')
        href = person.get('href')

        try:
            p = Person.objects.get(popit_id=href)
        except Person.DoesNotExist:
            p = Person(popit_id=href, api_instance=self.ai)
            if self.commit:
                p.save()

        try:
            speaker = Speaker.objects.get(instance=self.instance, person=p)
        except Speaker.DoesNotExist:
            speaker = Speaker(
                instance=self.instance, name=person.get('showAs'), person=p)
            if self.commit:
                speaker.save()

        self.speakers[person_id] = speaker

    doc_date = find_front_matter('docDate')
    if doc_date is not None:
        self.start_date = dateutil.parse(doc_date.get('date'))

    doc_title = find_front_matter('docTitle')
    if doc_title is None:
        section = None
    else:
        section = self.make(Section, parent=None, title=doc_title.text)

    self.visit(debate.debateBody, section)
def get_person(self, name, party, pombola_person_slug=None):
    """Return the Speaker for ``name``, creating one if nothing matches.

    Lookup order:

    1. a Speaker carrying a ``pombola_person_slug`` identifier equal to
       ``pombola_person_slug`` (when one is given);
    2. ``self.person_cache``;
    3. the speaker of the person returned by ``self.resolver.get_person``
       (only attempted for a non-empty ``name``);
    4. a Speaker of ``self.instance`` named ``name`` (or ``'(narrative)'``
       for an empty name), constructed if missing and saved only when
       ``self.commit`` is set.

    Results of steps 3 and 4 are stored in ``self.person_cache`` under
    ``name``.
    """
    # The Code4SA / PMG identification of speakers seems to be better than
    # the one from popolo_name_resolver, so a direct slug hit wins outright.
    if pombola_person_slug is not None:
        slug_match = Speaker.objects.filter(
            identifiers__scheme='pombola_person_slug',
            identifiers__identifier=pombola_person_slug).first()
        if slug_match:
            return slug_match

    remembered = self.person_cache.get(name)
    if remembered:
        return remembered

    label = name if name else '(narrative)'
    result = None

    if name:
        resolved = self.resolver.get_person(label, party)
        if resolved:
            result = resolved.speaker

    if not result:
        try:
            result = Speaker.objects.get(
                instance=self.instance, name=label)
        except Speaker.DoesNotExist:
            result = Speaker(instance=self.instance, name=label)
            if self.commit:
                result.save()

    self.person_cache[name] = result
    return result
def parse_document(self):
    """Register speakers, read date/title front matter and walk the body.

    Every ``TLCPerson`` under ``meta/references`` is matched to a
    ``Speaker`` of ``self.instance`` through an identifier equal to the
    element's ``href``.  A missing speaker is constructed from ``showAs``;
    only when ``self.commit`` is set is it saved and given an
    ``'Akoma Ntoso import'`` identifier.  Speakers are stored in
    ``self.speakers`` keyed by the element's ``id``.

    ``docDate`` and ``docTitle`` are looked up in ``coverPage`` first and
    ``preface`` second.  A found ``docDate`` sets ``self.start_date``; a
    found ``docTitle`` produces a top-level ``Section`` via ``self.make``.
    ``debate.debateBody`` is then visited with that section (or ``None``).
    """
    debate = self.xml.debate
    namespaced = bool(self.ns)

    def locate(tag):
        # ``find()`` speaks ElementPath, which has no '|' union operator,
        # so the two containers have to be searched one after the other.
        for container in ('coverPage', 'preface'):
            if namespaced:
                element = debate.find(
                    'an:%s//an:%s' % (container, tag),
                    namespaces={'an': self.ns})
            else:
                element = debate.find('%s//%s' % (container, tag))
            if element is not None:
                return element
        return None

    if namespaced:
        people = debate.findall(
            'an:meta/an:references/an:TLCPerson',
            namespaces={'an': self.ns})
    else:
        people = debate.findall('meta/references/TLCPerson')
    if people is None:
        people = []

    for person in people:
        person_id = person.get('id')
        href = person.get('href')
        try:
            speaker = Speaker.objects.get(
                instance=self.instance, identifiers__identifier=href)
        except Speaker.DoesNotExist:
            speaker = Speaker(
                instance=self.instance, name=person.get('showAs'))
            # NOTE(review): nesting reconstructed from a flattened source;
            # the identifier is created only for a saved speaker.
            if self.commit:
                speaker.save()
                speaker.identifiers.create(
                    identifier=href, scheme='Akoma Ntoso import')
        self.speakers[person_id] = speaker

    doc_date = locate('docDate')
    if doc_date is not None:
        self.start_date = dateutil.parse(doc_date.get('date'))

    doc_title = locate('docTitle')
    if doc_title is None:
        section = None
    else:
        section = self.make(Section, parent=None, title=doc_title.text)

    self.visit(debate.debateBody, section)
def get_persons_data(self, dp):
    """Create and save one Speaker per row of the 'persons-person' resource.

    ``dp.resources`` is scanned for the first resource whose descriptor
    name is ``'persons-person'``.  When none exists nothing happens.
    Otherwise each row of that resource's ``data`` becomes a ``Speaker``
    of ``self.instance`` (names, email and gender copied from the row)
    which is saved immediately.
    """
    persons = None
    for candidate in dp.resources:
        if candidate.descriptor['name'] == 'persons-person':
            persons = candidate
            break

    # One day the 'persons-position' and 'persons-persons-to-positions'
    # resources will be useful as well, but they are not read for now.

    if persons is None:
        return

    for row in persons.data:
        speaker = Speaker(instance=self.instance)
        speaker.given_name = row['first_name']
        speaker.family_name = row['last_name']
        speaker.name = speaker.given_name + ' ' + speaker.family_name
        speaker.email = row['email']
        speaker.gender = row['gender_description']
        speaker.save()
def get_person(self, name, party, pombola_person_slug=None):
    """Resolve ``name`` (and ``party``) to a Speaker, creating it on a miss.

    A Speaker with a matching ``pombola_person_slug`` identifier is
    returned straight away when the slug is supplied and found.  Otherwise
    ``self.person_cache`` is consulted, then ``self.resolver`` (for a
    non-empty name), then the Speakers of ``self.instance`` by name.  An
    unknown speaker is constructed - under the name ``'(narrative)'`` if
    ``name`` is empty - and saved only when ``self.commit`` is set.  The
    outcome is cached under ``name`` before being returned.
    """
    # If we can directly find the person from the pombola_person_slug, use
    # that - the Code4SA / PMG identification of speakers seems to be
    # better than that from popolo_name_resolver.
    by_slug = None
    if pombola_person_slug is not None:
        slug_query = Speaker.objects.filter(
            identifiers__scheme='pombola_person_slug',
            identifiers__identifier=pombola_person_slug)
        by_slug = slug_query.first()
    if by_slug:
        return by_slug

    from_cache = self.person_cache.get(name, None)
    if from_cache:
        return from_cache

    shown_as = name or '(narrative)'

    matched = None
    if name:
        resolved_person = self.resolver.get_person(shown_as, party)
        if resolved_person:
            matched = resolved_person.speaker

    if matched:
        self.person_cache[name] = matched
        return matched

    try:
        matched = Speaker.objects.get(instance=self.instance, name=shown_as)
    except Speaker.DoesNotExist:
        matched = Speaker(instance=self.instance, name=shown_as)
        if self.commit:
            matched.save()

    self.person_cache[name] = matched
    return matched
def get_person(self, name):
    """Return the Speaker for ``name``, linking it to a resolved person.

    A hit in ``self.person_cache`` is returned directly.  For a non-empty
    name, ``self.speakers_count`` is incremented and ``self.resolver`` (if
    set) is asked for a person; a match increments
    ``self.speakers_matched`` and is used to look up an existing Speaker,
    while a miss is logged.  Failing that, the Speaker is looked up by name
    (``'(narrative)'`` for an empty name) within ``self.instance`` and
    constructed if absent.  Saving happens only when ``self.commit`` is
    set.  The result is cached under ``name``.
    """
    remembered = self.person_cache.get(name, None)
    if remembered:
        return remembered

    label = name or '(narrative)'
    result = None
    resolved = None

    if name:
        self.speakers_count += 1
        if self.resolver:
            resolved = self.resolver.get_person(label)

        if not resolved:
            logger.info(" - Failed to get user %s" % label)
        else:
            self.speakers_matched += 1
            try:
                result = Speaker.objects.get(
                    instance=self.instance,
                    person=resolved)
            except Speaker.DoesNotExist:
                # No speaker is linked to this person yet; fall back to
                # the name-based lookup below.
                pass

    if not result:
        try:
            result = Speaker.objects.get(instance=self.instance, name=label)
        except Speaker.DoesNotExist:
            result = Speaker(instance=self.instance, name=label)
            if self.commit:
                result.save()

        # NOTE(review): nesting reconstructed from a flattened source - the
        # person link is assumed to apply only to name-matched/new speakers.
        if resolved:
            result.person = resolved
            if self.commit:
                result.save()

    self.person_cache[name] = result
    return result
def parse_document(self):
    """Import the document's speakers and top-level section, then its body.

    Returns ``self.stats``, a dict that starts as ``{Speaker: 0}``; the
    ``Speaker`` entry counts speakers constructed here (counted whether or
    not they are saved).

    Every ``TLCPerson`` under ``meta/references`` is matched to a Speaker
    of ``self.instance`` through an identifier equal to its ``href``.  A
    missing speaker is built from ``showAs``; only when ``self.commit`` is
    set is it saved and given an ``'Akoma Ntoso import'`` identifier.

    Front matter is read through ``self.get_preface_tag``: ``docDate``
    (parsed into ``self.start_date``), ``docTitle``, ``docNumber``,
    ``legislature``, ``session`` and ``link`` (its ``href``).  With a title
    a section is requested from ``self.make_section``; if that yields
    nothing the import stops.  Otherwise ``debate.debateBody`` is visited.

    Raises:
        Exception: a ``TLCPerson`` without ``showAs`` needs a new speaker.
    """
    self.stats = {Speaker: 0}
    debate = self.xml.debate

    def preface_text(tag):
        # Same truthiness test as before: a falsy result (None or a falsy
        # element) is handed back untouched and later replaced by ''.
        element = self.get_preface_tag(debate, tag)
        return element.text if element else element

    if self.ns:
        people = debate.findall(
            'an:meta/an:references/an:TLCPerson',
            namespaces={'an': self.ns},
        )
    else:
        people = debate.findall('meta/references/TLCPerson')
    if people is None:
        people = []

    for person in people:
        person_id = person.get('id')
        href = person.get('href')
        try:
            speaker = Speaker.objects.get(
                instance=self.instance, identifiers__identifier=href)
        except Speaker.DoesNotExist:
            name = person.get('showAs')
            if not name:
                raise Exception("TLCPerson '%s' is missing showAs" % href)
            speaker = Speaker(instance=self.instance, name=name)
            self.stats[Speaker] += 1
            if self.commit:
                speaker.save()
                speaker.identifiers.create(
                    identifier=href, scheme='Akoma Ntoso import')
        self.speakers[person_id] = speaker

    doc_date = self.get_preface_tag(debate, 'docDate')
    if doc_date is not None:
        date = doc_date.get('date')
        if date:
            try:
                self.start_date = dateutil.parse(date)
            except ValueError:
                # Logger.warn is deprecated; warning() with lazy arguments
                # emits the same message.
                logger.warning("docDate element did not parse '%s'", date)
        else:
            logger.warning(
                "docDate element missing required date attribute")

    doc_title = preface_text('docTitle')
    doc_number = preface_text('docNumber')
    legislature = preface_text('legislature')
    session = preface_text('session')

    source_url = self.get_preface_tag(debate, 'link')
    if source_url is not None:
        source_url = source_url.get('href')

    self.imported_section_ids = set()
    section = None
    if doc_title:
        kwargs = {
            'parent': None,
            'heading': doc_title,
            'start_date': self.start_date,
            'number': doc_number or '',
            'legislature': legislature or '',
            'session': session or '',
        }
        section = self.make_section(source_url=source_url or '', **kwargs)
        # NOTE(review): nesting reconstructed from a flattened source - the
        # early return is assumed to apply only when a title was present.
        if not section:
            return self.stats

    self.visit(debate.debateBody, section)
    return self.stats
def parse_document(self):
    """Import the document's speakers and top-level section, then its body.

    Every ``TLCPerson`` under ``meta/references`` is matched to a Speaker
    of ``self.instance`` through an identifier equal to its ``href``.  A
    missing speaker is built from ``showAs``; only when ``self.commit`` is
    set is it saved and given an ``'Akoma Ntoso import'`` identifier.
    Speakers are stored in ``self.speakers`` keyed by the element's ``id``.

    ``docDate``, ``docTitle``, ``docNumber``, ``legislature`` and
    ``session`` are read through ``self.get_preface_tag`` (presumably
    returning an XML element or ``None`` - confirm against its definition).
    With a title, a section is requested from ``self.make_section``; if
    that yields nothing the method returns without visiting the body.
    """
    debate = self.xml.debate

    if self.ns:
        people = debate.findall(
            'an:meta/an:references/an:TLCPerson',
            namespaces={'an': self.ns},
        )
    else:
        people = debate.findall('meta/references/TLCPerson')
    if people is None:
        people = []

    for person in people:
        person_id = person.get('id')
        href = person.get('href')
        try:
            speaker = Speaker.objects.get(
                instance=self.instance, identifiers__identifier=href)
        except Speaker.DoesNotExist:
            speaker = Speaker(
                instance=self.instance, name=person.get('showAs'))
            if self.commit:
                speaker.save()
                speaker.identifiers.create(
                    identifier=href, scheme='Akoma Ntoso import')
        self.speakers[person_id] = speaker

    doc_date = self.get_preface_tag(debate, 'docDate')
    # Compare with None rather than testing truthiness: an XML element
    # that carries only a ``date`` attribute (no text, no children) can be
    # falsy, which used to make the date silently disappear.
    if doc_date is not None:
        date_text = doc_date.get('date')
        if date_text:
            self.start_date = dateutil.parse(date_text)

    docTitle = self.get_preface_tag(debate, 'docTitle')
    if docTitle:
        docTitle = docTitle.text
    docNumber = self.get_preface_tag(debate, 'docNumber')
    if docNumber:
        docNumber = docNumber.text
    legislature = self.get_preface_tag(debate, 'legislature')
    if legislature:
        legislature = legislature.text
    session = self.get_preface_tag(debate, 'session')
    if session:
        session = session.text

    self.imported_section_ids = set()
    section = None
    if docTitle:
        kwargs = {
            'parent': None,
            'heading': docTitle,
            'start_date': self.start_date,
            'number': docNumber or '',
            'legislature': legislature or '',
            'session': session or '',
        }
        section = self.make_section(**kwargs)
        # NOTE(review): nesting reconstructed from a flattened source - the
        # early return is assumed to apply only when a title was present.
        if not section:
            return

    self.visit(debate.debateBody, section)
def handle(self, *args, **options):
    """Import one Shakespeare play as nested sections and speeches.

    With ``options['list']`` set, or unless exactly one positional argument
    is given, the known plays (values of ``PLAYS``) are printed; in the
    latter case a ``CommandError`` follows.  Otherwise the play's XML is
    downloaded, and a section is made for the play, each ``ACT`` and each
    scene.  Stage directions and sub-headings become ``narrative``
    speeches, everything else a ``speech`` attributed to its speaker.
    Speakers are created in the database only when ``options['commit']``
    is set (one per distinct name); otherwise unsaved instances are used.

    Raises:
        CommandError: no play given, unknown play, or unknown instance.
    """
    if options['list'] or len(args) != 1:
        self.stdout.write('Plays:\n')
        for title in sorted(PLAYS.values()):
            self.stdout.write('* %s\n' % title)
        if not options['list']:
            raise CommandError("Please specify a play")
        return

    play = args[0]
    filename = None
    for candidate_file, candidate_title in PLAYS.items():
        if play == candidate_title:
            filename = candidate_file
            break
    if not filename:
        raise CommandError("No matching play found")

    try:
        self.instance = Instance.objects.get(label=options['instance'])
    except Instance.DoesNotExist:
        # Only a missing instance is reported this way; any other failure
        # (e.g. a database error) now surfaces instead of being mislabelled.
        raise CommandError("Instance specified not found")

    self.commit = options['commit']

    response = urlopen(
        'http://www.ibiblio.org/xml/examples/shakespeare/%s' % filename)
    try:
        xml = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    play_xml = etree.fromstring(xml)

    play_section = self.make(Section, heading=play)
    speakers = {}  # speaker name -> Speaker, so each is created once

    for act in play_xml:
        if act.tag != 'ACT':
            continue
        # The first child of an act/scene is its title element.
        act_section = self.make(
            Section, heading=act[0].text, parent=play_section)

        for scene in act[1:]:
            scene_section = self.make(
                Section, heading=scene[0].text, parent=act_section)

            for sp in scene[1:]:
                if sp.tag in ('STAGEDIR', 'SUBHEAD', 'SUBTITLE'):
                    self.make(
                        Speech, section=scene_section,
                        text='<p><i>%s</i></p>' % sp.text,
                        type='narrative')
                    continue

                raw_name = sp[0].text
                if not raw_name:
                    speaker = None
                else:
                    # Strip the brackets in both modes so a dry run shows
                    # the same speaker names a committed import would.
                    name = raw_name.replace('[', '').replace(']', '')
                    if not self.commit:
                        speaker = Speaker(name=name, instance=self.instance)
                    elif name in speakers:
                        speaker = speakers[name]
                    else:
                        speaker = Speaker.objects.create(
                            name=name, instance=self.instance)
                        speakers[name] = speaker

                parts = []
                for line in sp[1:]:
                    if len(line):
                        # A line with a child element: the child (an inline
                        # stage direction) is italicised, followed by any
                        # text trailing it.
                        inline = line[0]
                        parts.append('<i>%s</i>' % inline.text)
                        if inline.tail:
                            parts.append(' %s' % inline.tail.strip())
                        parts.append('<br>\n')
                    elif line.tag == 'LINE':
                        parts.append('%s<br>\n' % line.text)
                    elif line.tag == 'STAGEDIR':
                        parts.append('<i>%s</i><br>\n' % line.text)

                self.make(
                    Speech, speaker=speaker, section=scene_section,
                    text='<p>%s</p>' % ''.join(parts), type='speech')
def parse_document(self):
    """Import the document's speakers and top-level section, then its body.

    Every ``TLCPerson`` under ``meta/references`` is matched to a Speaker
    of ``self.instance`` through an identifier equal to its ``href``.  A
    missing speaker is built from ``showAs``; only when ``self.commit`` is
    set is it saved and given an ``'Akoma Ntoso import'`` identifier.
    Speakers are stored in ``self.speakers`` keyed by the element's ``id``.

    ``docDate``, ``docTitle``, ``docNumber``, ``legislature`` and
    ``session`` are read through ``self.get_preface_tag`` (presumably
    returning an XML element or ``None`` - confirm against its definition).
    With a title, a top-level ``Section`` is made via ``self.make``.
    ``debate.debateBody`` is then visited with that section (or ``None``).
    """
    debate = self.xml.debate

    if self.ns:
        people = debate.findall(
            'an:meta/an:references/an:TLCPerson',
            namespaces={'an': self.ns},
        )
    else:
        people = debate.findall('meta/references/TLCPerson')
    if people is None:
        people = []

    for person in people:
        person_id = person.get('id')
        href = person.get('href')
        try:
            speaker = Speaker.objects.get(
                instance=self.instance, identifiers__identifier=href)
        except Speaker.DoesNotExist:
            speaker = Speaker(
                instance=self.instance, name=person.get('showAs'))
            if self.commit:
                speaker.save()
                speaker.identifiers.create(
                    identifier=href, scheme='Akoma Ntoso import')
        self.speakers[person_id] = speaker

    doc_date = self.get_preface_tag(debate, 'docDate')
    # An element holding only a ``date`` attribute can evaluate as false,
    # so test for presence explicitly instead of relying on truthiness.
    if doc_date is not None:
        date_text = doc_date.get('date')
        if date_text:
            self.start_date = dateutil.parse(date_text)

    docTitle = self.get_preface_tag(debate, 'docTitle')
    if docTitle:
        docTitle = docTitle.text
    docNumber = self.get_preface_tag(debate, 'docNumber')
    if docNumber:
        docNumber = docNumber.text
    legislature = self.get_preface_tag(debate, 'legislature')
    if legislature:
        legislature = legislature.text
    session = self.get_preface_tag(debate, 'session')
    if session:
        session = session.text

    section = None
    if docTitle:
        section = self.make(
            Section,
            parent=None,
            heading=docTitle,
            start_date=self.start_date,
            number=docNumber or '',
            legislature=legislature or '',
            session=session or '',
        )

    self.visit(debate.debateBody, section)
def parse_document(self):
    """Import speakers and the top-level section (honouring clobber), then the body.

    Every ``TLCPerson`` under ``meta/references`` is matched to a Speaker
    of ``self.instance`` through an identifier equal to its ``href``.  A
    missing speaker is built from ``showAs``; only when ``self.commit`` is
    set is it saved and given an ``"Akoma Ntoso import"`` identifier.

    ``docDate``, ``docTitle``, ``docNumber``, ``legislature``, ``session``
    and ``link`` are read through ``self.get_preface_tag`` (presumably
    returning an XML element or ``None`` - confirm against its definition).

    With a title and ``self.clobber`` not ``None``, an existing section
    with the same attributes is either deleted together with its
    descendant speeches (``clobber`` true) or causes the import to be
    skipped (``clobber`` false).  A new section is then made and
    ``debate.debateBody`` is visited under it.
    """
    debate = self.xml.debate

    if self.ns:
        people = debate.findall(
            "an:meta/an:references/an:TLCPerson", namespaces={"an": self.ns})
    else:
        people = debate.findall("meta/references/TLCPerson")
    if people is None:
        people = []

    for person in people:
        person_id = person.get("id")
        href = person.get("href")
        try:
            speaker = Speaker.objects.get(
                instance=self.instance, identifiers__identifier=href)
        except Speaker.DoesNotExist:
            speaker = Speaker(
                instance=self.instance, name=person.get("showAs"))
            if self.commit:
                speaker.save()
                speaker.identifiers.create(
                    identifier=href, scheme="Akoma Ntoso import")
        self.speakers[person_id] = speaker

    doc_date = self.get_preface_tag(debate, "docDate")
    # Attribute-only elements can be falsy, so check presence explicitly.
    if doc_date is not None:
        date_text = doc_date.get("date")
        if date_text:
            self.start_date = dateutil.parse(date_text)

    docTitle = self.get_preface_tag(debate, "docTitle")
    if docTitle:
        docTitle = docTitle.text
    docNumber = self.get_preface_tag(debate, "docNumber")
    if docNumber:
        docNumber = docNumber.text
    legislature = self.get_preface_tag(debate, "legislature")
    if legislature:
        legislature = legislature.text
    session = self.get_preface_tag(debate, "session")
    if session:
        session = session.text

    # A <link href="..."/> element normally has neither text nor children,
    # so ``element or ""`` would discard it before its href could be read.
    link = self.get_preface_tag(debate, "link")
    source_url = ""
    if link is not None:
        source_url = link.get("href") or ""

    section = None
    if docTitle:
        kwargs = {
            "parent": None,
            "heading": docTitle,
            "start_date": self.start_date,
            "number": docNumber or "",
            "legislature": legislature or "",
            "session": session or "",
        }
        # If the importer has no opinion on clobbering, just import the
        # section, potentially creating a duplicate section.
        if self.clobber is not None:
            try:
                section = Section.objects.for_instance(
                    self.instance).get(**kwargs)
                if self.clobber:
                    logger.info("Clobbering %s" % docTitle)
                    for speech in section.descendant_speeches():
                        speech.delete()
                    section.delete()
                else:
                    logger.info("Skipping %s" % docTitle)
                    return
            except Section.DoesNotExist:
                logger.info("Importing %s" % docTitle)
        section = self.make(Section, source_url=source_url, **kwargs)

    self.visit(debate.debateBody, section)
def parse_document(self):
    """Import speakers and the top-level section (honouring clobber), then the body.

    Every ``TLCPerson`` under ``meta/references`` is matched to a Speaker
    of ``self.instance`` through an identifier equal to its ``href``.  A
    missing speaker is built from ``showAs``; only when ``self.commit`` is
    set is it saved and given an ``'Akoma Ntoso import'`` identifier.

    ``docDate``, ``docTitle``, ``docNumber``, ``legislature`` and
    ``session`` are read through ``self.get_preface_tag`` (presumably
    returning an XML element or ``None`` - confirm against its definition).

    With a title and ``self.clobber`` not ``None``, an existing section
    with the same attributes is either deleted together with its
    descendant speeches (``clobber`` true) or causes the import to be
    skipped (``clobber`` false).  A new section is then made and
    ``debate.debateBody`` is visited under it.
    """
    debate = self.xml.debate

    if self.ns:
        people = debate.findall(
            'an:meta/an:references/an:TLCPerson',
            namespaces={'an': self.ns},
        )
    else:
        people = debate.findall('meta/references/TLCPerson')
    if people is None:
        people = []

    for person in people:
        person_id = person.get('id')
        href = person.get('href')
        try:
            speaker = Speaker.objects.get(
                instance=self.instance, identifiers__identifier=href)
        except Speaker.DoesNotExist:
            speaker = Speaker(
                instance=self.instance, name=person.get('showAs'))
            if self.commit:
                speaker.save()
                speaker.identifiers.create(
                    identifier=href, scheme='Akoma Ntoso import')
        self.speakers[person_id] = speaker

    doc_date = self.get_preface_tag(debate, 'docDate')
    # Attribute-only elements can be falsy, so check presence explicitly
    # instead of relying on the element's truth value.
    if doc_date is not None:
        date_text = doc_date.get('date')
        if date_text:
            self.start_date = dateutil.parse(date_text)

    docTitle = self.get_preface_tag(debate, 'docTitle')
    if docTitle:
        docTitle = docTitle.text
    docNumber = self.get_preface_tag(debate, 'docNumber')
    if docNumber:
        docNumber = docNumber.text
    legislature = self.get_preface_tag(debate, 'legislature')
    if legislature:
        legislature = legislature.text
    session = self.get_preface_tag(debate, 'session')
    if session:
        session = session.text

    section = None
    if docTitle:
        kwargs = {
            'parent': None,
            'heading': docTitle,
            'start_date': self.start_date,
            'number': docNumber or '',
            'legislature': legislature or '',
            'session': session or '',
        }
        # If the importer has no opinion on clobbering, just import the
        # section, potentially creating a duplicate section.
        if self.clobber is not None:
            try:
                section = Section.objects.for_instance(
                    self.instance).get(**kwargs)
                if self.clobber:
                    logger.info('Clobbering %s' % docTitle)
                    for speech in section.descendant_speeches():
                        speech.delete()
                    section.delete()
                else:
                    logger.info('Skipping %s' % docTitle)
                    return
            except Section.DoesNotExist:
                logger.info('Importing %s' % docTitle)
        section = self.make(Section, **kwargs)

    self.visit(debate.debateBody, section)
def parse_document(self):
    """Import the document's speakers and top-level section, then its body.

    Returns ``self.stats``, initialised to ``{Speaker: 0}``; the ``Speaker``
    entry is incremented for every speaker constructed here, saved or not.

    Speakers come from the ``TLCPerson`` references under
    ``meta/references`` and are matched by an identifier equal to the
    element's ``href``; new ones are named after ``showAs`` and, when
    ``self.commit`` is set, saved with an ``'Akoma Ntoso import'``
    identifier.  They are stored in ``self.speakers`` keyed by ``id``.

    ``docDate`` (parsed into ``self.start_date``), ``docTitle``,
    ``docNumber``, ``legislature``, ``session`` and the ``href`` of
    ``link`` are read through ``self.get_preface_tag``.  With a title a
    section is requested from ``self.make_section``; when that yields
    nothing the import stops, otherwise ``debate.debateBody`` is visited.

    Raises:
        Exception: a ``TLCPerson`` without ``showAs`` needs a new speaker.
    """
    self.stats = {Speaker: 0}
    debate = self.xml.debate

    if self.ns:
        people = debate.findall(
            'an:meta/an:references/an:TLCPerson',
            namespaces={'an': self.ns},
        )
    else:
        people = debate.findall('meta/references/TLCPerson')
    if people is None:
        people = []

    for person in people:
        person_id = person.get('id')
        href = person.get('href')
        try:
            speaker = Speaker.objects.get(
                instance=self.instance, identifiers__identifier=href)
        except Speaker.DoesNotExist:
            name = person.get('showAs')
            if not name:
                raise Exception("TLCPerson '%s' is missing showAs" % href)
            speaker = Speaker(instance=self.instance, name=name)
            self.stats[Speaker] += 1
            if self.commit:
                speaker.save()
                speaker.identifiers.create(
                    identifier=href, scheme='Akoma Ntoso import')
        self.speakers[person_id] = speaker

    docDate = self.get_preface_tag(debate, 'docDate')
    if docDate is not None:
        date = docDate.get('date')
        if not date:
            # Logger.warn is deprecated in favour of Logger.warning.
            logger.warning(
                "docDate element missing required date attribute")
        else:
            try:
                self.start_date = dateutil.parse(date)
            except ValueError:
                logger.warning("docDate element did not parse '%s'", date)

    docTitle = self.get_preface_tag(debate, 'docTitle')
    if docTitle:
        docTitle = docTitle.text
    docNumber = self.get_preface_tag(debate, 'docNumber')
    if docNumber:
        docNumber = docNumber.text
    legislature = self.get_preface_tag(debate, 'legislature')
    if legislature:
        legislature = legislature.text
    session = self.get_preface_tag(debate, 'session')
    if session:
        session = session.text

    source_url = self.get_preface_tag(debate, 'link')
    if source_url is not None:
        source_url = source_url.get('href')

    self.imported_section_ids = set()
    section = None
    if docTitle:
        kwargs = {
            'parent': None,
            'heading': docTitle,
            'start_date': self.start_date,
            'number': docNumber or '',
            'legislature': legislature or '',
            'session': session or '',
        }
        section = self.make_section(source_url=source_url or '', **kwargs)
        # NOTE(review): nesting reconstructed from a flattened source - the
        # early return is assumed to apply only when a title was present.
        if not section:
            return self.stats

    self.visit(debate.debateBody, section)
    return self.stats