def import_document(document, interactive=True, reimport_preserving_sequence=False): old_statements = None if document.statement_set.all().exists(): if reimport_preserving_sequence: if OldSequenceMapping.objects.filter(document=document).exists(): logger.error("Sequence mapping already exits for %s" % document) return old_statements = list(document.statement_set.all()) document.statement_set.all().delete() else: if not interactive: return sys.stderr.write("Statements already exist for %r.\nDelete them? (y/n) " % document) if raw_input().strip() != 'y': return document.statement_set.all().delete() document.download() xml_en = document.get_cached_xml('en') pdoc_en = alpheus.parse_file(xml_en) xml_en.close() xml_fr = document.get_cached_xml('fr') pdoc_fr = alpheus.parse_file(xml_fr) xml_fr.close() if document.date and document.date != pdoc_en.meta['date']: # Sometimes they get the date wrong if document.date != pdoc_fr.meta['date']: logger.error("Date mismatch on document #%s: %s %s" % ( document.id, document.date, pdoc_en.meta['date'])) else: document.date = pdoc_en.meta['date'] document.number = pdoc_en.meta['document_number'] document.public = True statements = [] for pstate in pdoc_en.statements: s = Statement( document=document, sequence=len(statements), content_en=pstate.content, time=pstate.meta['timestamp']) s.source_id = pstate.meta['id'] s.h1 = pstate.meta.get('h1', '') s.h2 = pstate.meta.get('h2', '') s.h3 = pstate.meta.get('h3', '') if s.h3 and not s.h2: s.h2 = s.h3 s.h3 = '' s.who = pstate.meta.get('person_attribution', '') s.who_hocid = int(pstate.meta['person_id']) if pstate.meta.get('person_id') else None s.who_context = pstate.meta.get('person_context', '') s.statement_type = pstate.meta.get('intervention_type', '').lower() s.written_question = pstate.meta.get('written_question', '').upper()[:1] if s.who_hocid and not pstate.meta.get('person_type'): # At the moment. person_type is only set if we know the person # is a non-politician. This might change... try: s.politician = Politician.objects.get_by_parl_id(s.who_hocid, session=document.session) s.member = ElectedMember.objects.get_by_pol(s.politician, date=document.date) except Politician.DoesNotExist: logger.info("Could not resolve speaking politician ID %s for %r" % (s.who_hocid, s.who)) s._related_pols = set() s._related_bills = set() s.content_en = _process_related_links(s.content_en, s) statements.append(s) if len(statements) != len(pdoc_fr.statements): logger.info("French and English statement counts don't match for %r" % document) _r_paragraphs = re.compile(ur'<p[^>]* data-HoCid=.+?</p>') _r_paragraph_id = re.compile(ur'<p[^>]* data-HoCid="(?P<id>\d+)"') fr_paragraphs = dict() def _get_paragraph_id(p): return int(_r_paragraph_id.match(p).group('id')) for st in pdoc_fr.statements: for p in _r_paragraphs.findall(st.content): fr_paragraphs[_get_paragraph_id(p)] = p def _substitute_french_content(match): try: return fr_paragraphs[_get_paragraph_id(match.group(0))] except KeyError: logger.error("Paragraph ID %s not found in French for %s" % (match.group(0), document)) return match.group(0) for st in statements: st.content_fr = _process_related_links( _r_paragraphs.sub(_substitute_french_content, st.content_en), st ) document.multilingual = True Statement.set_slugs(statements) if old_statements: for mapping in _align_sequences(statements, old_statements): OldSequenceMapping.objects.create( document=document, sequence=mapping[0], slug=mapping[1] ) for s in statements: s.save() s.mentioned_politicians.add(*list(s._related_pols)) s.bills.add(*list(s._related_bills)) if getattr(s, '_related_vote', False): s._related_vote.context_statement = s s._related_vote.save() document.save() return document
def import_document(document, interactive=True, reimport_preserving_sequence=False): old_statements = None if document.statement_set.all().exists(): if reimport_preserving_sequence: if OldSequenceMapping.objects.filter(document=document).exists(): logger.error("Sequence mapping already exits for %s" % document) return old_statements = list(document.statement_set.all()) document.statement_set.all().delete() else: if not interactive: return sys.stderr.write("Statements already exist for %r.\nDelete them? (y/n) " % document) if raw_input().strip() != 'y': return document.statement_set.all().delete() if not document.downloaded: return False xml_en = document.get_cached_xml('en') pdoc_en = alpheus.parse_file(xml_en) xml_en.close() xml_fr = document.get_cached_xml('fr') pdoc_fr = alpheus.parse_file(xml_fr) xml_fr.close() if document.date and document.date != pdoc_en.meta['date']: # Sometimes they get the date wrong if document.date != pdoc_fr.meta['date']: logger.error("Date mismatch on document #%s: %s %s" % ( document.id, document.date, pdoc_en.meta['date'])) else: document.date = pdoc_en.meta['date'] document.number = pdoc_en.meta['document_number'] document.public = True statements = [] for pstate in pdoc_en.statements: s = Statement( document=document, sequence=len(statements), content_en=pstate.content, time=pstate.meta['timestamp']) s.source_id = pstate.meta['id'] s.h1_en = pstate.meta.get('h1', '') s.h2_en = pstate.meta.get('h2', '') s.h3_en = pstate.meta.get('h3', '') if s.h1_en and not s.h2_en: s.h2_en = s.h3_en s.h3_en = '' s.who_en = pstate.meta.get('person_attribution', '') s.who_hocid = int(pstate.meta['person_id']) if pstate.meta.get('person_id') else None s.who_context_en = pstate.meta.get('person_context', '') s.statement_type = pstate.meta.get('intervention_type', '').lower() s.written_question = pstate.meta.get('written_question', '').upper()[:1] if s.who_hocid and not pstate.meta.get('person_type'): # At the moment. person_type is only set if we know the person # is a non-politician. This might change... try: s.politician = Politician.objects.get_by_parl_id(s.who_hocid, session=document.session) s.member = ElectedMember.objects.get_by_pol(s.politician, date=document.date) except Politician.DoesNotExist: logger.info("Could not resolve speaking politician ID %s for %r" % (s.who_hocid, s.who)) s._related_pols = set() s._related_bills = set() s.content_en = _process_related_links(s.content_en, s) statements.append(s) if len(statements) != len(pdoc_fr.statements): logger.info("French and English statement counts don't match for %r" % document) _r_paragraphs = re.compile(ur'<p[^>]* data-HoCid=.+?</p>') _r_paragraph_id = re.compile(ur'<p[^>]* data-HoCid="(?P<id>\d+)"') fr_paragraphs = dict() fr_statements = dict() missing_id_count = 0 def _get_paragraph_id(p): return int(_r_paragraph_id.match(p).group('id')) def _get_paragraphs_and_ids(content): return [(p, _get_paragraph_id(p)) for p in _r_paragraphs.findall(content)] for st in pdoc_fr.statements: if st.meta['id']: fr_statements[st.meta['id']] = st for p, pid in _get_paragraphs_and_ids(st.content): if pid: fr_paragraphs[pid] = p else: missing_id_count += 1 def _substitute_french_content(match): try: pid = _get_paragraph_id(match.group(0)) if pid: return fr_paragraphs[pid] else: return match.group(0) except KeyError: logger.error("Paragraph ID %s not found in French for %s" % (match.group(0), document)) return match.group(0) if missing_id_count > float(len(fr_paragraphs)): logger.error("French paragraphs not available") document.multilingual = False else: document.multilingual = True for st in statements: fr_data = fr_statements.get(st.source_id) pids_en = [pid for p, pid in _get_paragraphs_and_ids(st.content_en)] pids_fr = [pid for p, pid in _get_paragraphs_and_ids(fr_data.content)] if fr_data else None if fr_data and pids_en == pids_fr: # Match by statement st.content_fr = _process_related_links(fr_data.content, st) elif all(pids_en): # Match by paragraph st.content_fr = _process_related_links( _r_paragraphs.sub(_substitute_french_content, st.content_en), st ) else: logger.warning("Could not do multilingual match of statement %s", st.source_id) document.multilingual = False if fr_data: st.h1_fr = fr_data.meta.get('h1', '') st.h2_fr = fr_data.meta.get('h2', '') st.h3_fr = fr_data.meta.get('h3', '') if st.h1_fr and not st.h2_fr: st.h2_fr = s.h3_fr st.h3_fr = '' st.who_fr = fr_data.meta.get('person_attribution', '') st.who_context_fr = fr_data.meta.get('person_context', '') Statement.set_slugs(statements) if old_statements: for mapping in _align_sequences(statements, old_statements): OldSequenceMapping.objects.create( document=document, sequence=mapping[0], slug=mapping[1] ) for s in statements: s.save() s.mentioned_politicians.add(*list(s._related_pols)) s.bills.add(*list(s._related_bills)) if getattr(s, '_related_vote', False): s._related_vote.context_statement = s s._related_vote.save() document.save() return document