def import_document(document, interactive=True, reimport_preserving_sequence=False):
    """Import the statements of ``document`` from its cached English/French XML.

    Statements already attached to the document are deleted before the
    import. With ``reimport_preserving_sequence`` they are first kept in
    memory so that an ``OldSequenceMapping`` row can be created for each item
    returned by ``_align_sequences``; otherwise the user is asked on
    stderr/stdin to confirm the deletion.

    Args:
        document: the document to import. It is downloaded, updated in place
            (``date``, ``number``, ``public``, ``multilingual``) and saved.
        interactive: when false, a document that already has statements is
            skipped instead of prompting. Not consulted when
            ``reimport_preserving_sequence`` is true.
        reimport_preserving_sequence: replace existing statements without
            prompting and record sequence mappings for the old ones.

    Returns:
        ``document`` once all statements are saved, or ``None`` when the
        import is abandoned (sequence mappings already exist, the run is
        non-interactive, or the user does not answer ``y``).
    """
    old_statements = None
    if document.statement_set.all().exists():
        if reimport_preserving_sequence:
            if OldSequenceMapping.objects.filter(document=document).exists():
                logger.error("Sequence mapping already exits for %s" % document)
                return
            # Keep the old rows in memory; they are needed for alignment
            # after the new statements have been built.
            old_statements = list(document.statement_set.all())
            document.statement_set.all().delete()
        else:
            if not interactive:
                return
            sys.stderr.write("Statements already exist for %r.\nDelete them? (y/n) " % document)
            if raw_input().strip() != 'y':
                return
            document.statement_set.all().delete()

    def _parse_cached_xml(lang):
        # Close the cached file even if parsing raises, so a malformed
        # document does not leak the open handle.
        xml_file = document.get_cached_xml(lang)
        try:
            return alpheus.parse_file(xml_file)
        finally:
            xml_file.close()

    document.download()
    pdoc_en = _parse_cached_xml('en')
    pdoc_fr = _parse_cached_xml('fr')

    if document.date and document.date != pdoc_en.meta['date']:
        # Sometimes they get the date wrong; only complain when the French
        # version disagrees with the stored date as well.
        if document.date != pdoc_fr.meta['date']:
            logger.error("Date mismatch on document #%s: %s %s" % (
                document.id, document.date, pdoc_en.meta['date']))
    else:
        document.date = pdoc_en.meta['date']
    document.number = pdoc_en.meta['document_number']
    document.public = True

    statements = []

    for pstate in pdoc_en.statements:
        s = Statement(
            document=document,
            sequence=len(statements),  # zero-based position within the document
            content_en=pstate.content,
            time=pstate.meta['timestamp'])
        s.source_id = pstate.meta['id']
        s.h1 = pstate.meta.get('h1', '')
        s.h2 = pstate.meta.get('h2', '')
        s.h3 = pstate.meta.get('h3', '')

        # Never leave a gap in the heading hierarchy: promote h3 to h2.
        if s.h3 and not s.h2:
            s.h2 = s.h3
            s.h3 = ''

        s.who = pstate.meta.get('person_attribution', '')
        s.who_hocid = int(pstate.meta['person_id']) if pstate.meta.get('person_id') else None
        s.who_context = pstate.meta.get('person_context', '')

        s.statement_type = pstate.meta.get('intervention_type', '').lower()
        s.written_question = pstate.meta.get('written_question', '').upper()[:1]

        if s.who_hocid and not pstate.meta.get('person_type'):
            # At the moment. person_type is only set if we know the person
            # is a non-politician. This might change...
            try:
                s.politician = Politician.objects.get_by_parl_id(s.who_hocid, session=document.session)
                s.member = ElectedMember.objects.get_by_pol(s.politician, date=document.date)
            except Politician.DoesNotExist:
                logger.info("Could not resolve speaking politician ID %s for %r" % (s.who_hocid, s.who))

        # Scratch sets read back after save() to populate the many-to-many
        # relations; they must exist before _process_related_links is called
        # with this statement.
        s._related_pols = set()
        s._related_bills = set()
        s.content_en = _process_related_links(s.content_en, s)

        statements.append(s)

    if len(statements) != len(pdoc_fr.statements):
        logger.info("French and English statement counts don't match for %r" % document)

    # Same patterns as before, spelled without the Python-2-only ``ur``
    # prefix so the literals are valid on both Python 2 and Python 3.
    _r_paragraphs = re.compile(u'<p[^>]* data-HoCid=.+?</p>')
    _r_paragraph_id = re.compile(u'<p[^>]* data-HoCid="(?P<id>\\d+)"')
    fr_paragraphs = dict()

    def _get_paragraph_id(p):
        return int(_r_paragraph_id.match(p).group('id'))

    # Index every French paragraph by its data-HoCid value.
    for st in pdoc_fr.statements:
        for p in _r_paragraphs.findall(st.content):
            fr_paragraphs[_get_paragraph_id(p)] = p

    def _substitute_french_content(match):
        # Swap an English paragraph for the French paragraph carrying the
        # same ID; keep the English text when no French counterpart exists.
        try:
            return fr_paragraphs[_get_paragraph_id(match.group(0))]
        except KeyError:
            logger.error("Paragraph ID %s not found in French for %s" % (match.group(0), document))
            return match.group(0)

    for st in statements:
        st.content_fr = _process_related_links(
            _r_paragraphs.sub(_substitute_french_content, st.content_en),
            st
        )
    document.multilingual = True

    Statement.set_slugs(statements)

    if old_statements:
        for mapping in _align_sequences(statements, old_statements):
            OldSequenceMapping.objects.create(
                document=document,
                sequence=mapping[0],
                slug=mapping[1]
            )

    for s in statements:
        # The statement has to be saved before its many-to-many relations
        # can be filled in.
        s.save()

        s.mentioned_politicians.add(*s._related_pols)
        s.bills.add(*s._related_bills)
        if getattr(s, '_related_vote', False):
            s._related_vote.context_statement = s
            s._related_vote.save()

    document.save()

    return document
Beispiel #2
0
def import_document(document, interactive=True, reimport_preserving_sequence=False):
    """Import the statements of ``document`` from its cached English/French XML.

    English statements are built from the parsed English XML. French content
    is then attached to each one, either by matching a whole French statement
    with the same source ID and the same paragraph IDs, or paragraph by
    paragraph through the ``data-HoCid`` attribute.

    Statements already attached to the document are deleted before the
    import. With ``reimport_preserving_sequence`` they are first kept in
    memory so that an ``OldSequenceMapping`` row can be created for each item
    returned by ``_align_sequences``; otherwise the user is asked on
    stderr/stdin to confirm the deletion.

    Args:
        document: the document to import. It is updated in place (``date``,
            ``number``, ``public``, ``multilingual``) and saved.
        interactive: when false, a document that already has statements is
            skipped instead of prompting. Not consulted when
            ``reimport_preserving_sequence`` is true.
        reimport_preserving_sequence: replace existing statements without
            prompting and record sequence mappings for the old ones.

    Returns:
        ``document`` once all statements are saved; ``False`` when
        ``document.downloaded`` is falsy; ``None`` when the import is
        abandoned (sequence mappings already exist, the run is
        non-interactive, or the user does not answer ``y``).
    """
    # Bail out before deleting anything: a document that is not flagged as
    # downloaded cannot be imported, so removing its existing statements
    # first would only lose them.
    if not document.downloaded:
        return False

    old_statements = None
    if document.statement_set.all().exists():
        if reimport_preserving_sequence:
            if OldSequenceMapping.objects.filter(document=document).exists():
                logger.error("Sequence mapping already exits for %s" % document)
                return
            # Keep the old rows in memory; they are needed for alignment
            # after the new statements have been built.
            old_statements = list(document.statement_set.all())
            document.statement_set.all().delete()
        else:
            if not interactive:
                return
            sys.stderr.write("Statements already exist for %r.\nDelete them? (y/n) " % document)
            if raw_input().strip() != 'y':
                return
            document.statement_set.all().delete()

    def _parse_cached_xml(lang):
        # Close the cached file even if parsing raises, so a malformed
        # document does not leak the open handle.
        xml_file = document.get_cached_xml(lang)
        try:
            return alpheus.parse_file(xml_file)
        finally:
            xml_file.close()

    pdoc_en = _parse_cached_xml('en')
    pdoc_fr = _parse_cached_xml('fr')

    if document.date and document.date != pdoc_en.meta['date']:
        # Sometimes they get the date wrong; only complain when the French
        # version disagrees with the stored date as well.
        if document.date != pdoc_fr.meta['date']:
            logger.error("Date mismatch on document #%s: %s %s" % (
                document.id, document.date, pdoc_en.meta['date']))
    else:
        document.date = pdoc_en.meta['date']
    document.number = pdoc_en.meta['document_number']
    document.public = True

    statements = []

    for pstate in pdoc_en.statements:
        s = Statement(
            document=document,
            sequence=len(statements),  # zero-based position within the document
            content_en=pstate.content,
            time=pstate.meta['timestamp'])
        s.source_id = pstate.meta['id']
        s.h1_en = pstate.meta.get('h1', '')
        s.h2_en = pstate.meta.get('h2', '')
        s.h3_en = pstate.meta.get('h3', '')

        # Never leave a gap in the heading hierarchy: promote h3 to h2.
        if s.h1_en and not s.h2_en:
            s.h2_en = s.h3_en
            s.h3_en = ''

        s.who_en = pstate.meta.get('person_attribution', '')
        s.who_hocid = int(pstate.meta['person_id']) if pstate.meta.get('person_id') else None
        s.who_context_en = pstate.meta.get('person_context', '')

        s.statement_type = pstate.meta.get('intervention_type', '').lower()
        s.written_question = pstate.meta.get('written_question', '').upper()[:1]

        if s.who_hocid and not pstate.meta.get('person_type'):
            # At the moment. person_type is only set if we know the person
            # is a non-politician. This might change...
            try:
                s.politician = Politician.objects.get_by_parl_id(s.who_hocid, session=document.session)
                s.member = ElectedMember.objects.get_by_pol(s.politician, date=document.date)
            except Politician.DoesNotExist:
                # Log the attribution assigned just above (who_en) rather
                # than an attribute this function never sets.
                logger.info("Could not resolve speaking politician ID %s for %r" % (s.who_hocid, s.who_en))

        # Scratch sets read back after save() to populate the many-to-many
        # relations; they must exist before _process_related_links is called
        # with this statement.
        s._related_pols = set()
        s._related_bills = set()
        s.content_en = _process_related_links(s.content_en, s)

        statements.append(s)

    if len(statements) != len(pdoc_fr.statements):
        logger.info("French and English statement counts don't match for %r" % document)

    # Same patterns as before, spelled without the Python-2-only ``ur``
    # prefix so the literals are valid on both Python 2 and Python 3.
    _r_paragraphs = re.compile(u'<p[^>]* data-HoCid=.+?</p>')
    _r_paragraph_id = re.compile(u'<p[^>]* data-HoCid="(?P<id>\\d+)"')
    fr_paragraphs = dict()   # paragraph ID -> French paragraph markup
    fr_statements = dict()   # statement source ID -> parsed French statement
    missing_id_count = 0     # French paragraphs whose ID is 0

    def _get_paragraph_id(p):
        return int(_r_paragraph_id.match(p).group('id'))

    def _get_paragraphs_and_ids(content):
        return [(p, _get_paragraph_id(p)) for p in _r_paragraphs.findall(content)]

    for st in pdoc_fr.statements:
        if st.meta['id']:
            fr_statements[st.meta['id']] = st
        for p, pid in _get_paragraphs_and_ids(st.content):
            if pid:
                fr_paragraphs[pid] = p
            else:
                missing_id_count += 1

    def _substitute_french_content(match):
        # Swap an English paragraph for the French paragraph carrying the
        # same ID; keep the English text when the ID is 0 or unknown.
        pid = _get_paragraph_id(match.group(0))
        if not pid:
            return match.group(0)
        try:
            return fr_paragraphs[pid]
        except KeyError:
            logger.error("Paragraph ID %s not found in French for %s" % (match.group(0), document))
            return match.group(0)

    if missing_id_count > len(fr_paragraphs):
        # More French paragraphs lack an ID than carry one: give up on
        # French content for this document.
        logger.error("French paragraphs not available")
        document.multilingual = False
    else:
        document.multilingual = True
        for st in statements:
            fr_data = fr_statements.get(st.source_id)
            pids_en = [pid for _p, pid in _get_paragraphs_and_ids(st.content_en)]
            if fr_data:
                pids_fr = [pid for _p, pid in _get_paragraphs_and_ids(fr_data.content)]
            else:
                pids_fr = None
            if fr_data and pids_en == pids_fr:
                # Match by statement
                st.content_fr = _process_related_links(fr_data.content, st)
            elif all(pids_en):
                # Match by paragraph
                st.content_fr = _process_related_links(
                    _r_paragraphs.sub(_substitute_french_content, st.content_en),
                    st
                )
            else:
                logger.warning("Could not do multilingual match of statement %s", st.source_id)
                document.multilingual = False
            if fr_data:
                st.h1_fr = fr_data.meta.get('h1', '')
                st.h2_fr = fr_data.meta.get('h2', '')
                st.h3_fr = fr_data.meta.get('h3', '')
                if st.h1_fr and not st.h2_fr:
                    # Promote this statement's own h3. ``s`` is the stale
                    # variable left over from the English loop and must not
                    # be read here.
                    st.h2_fr = st.h3_fr
                    st.h3_fr = ''
                st.who_fr = fr_data.meta.get('person_attribution', '')
                st.who_context_fr = fr_data.meta.get('person_context', '')

    Statement.set_slugs(statements)

    if old_statements:
        for mapping in _align_sequences(statements, old_statements):
            OldSequenceMapping.objects.create(
                document=document,
                sequence=mapping[0],
                slug=mapping[1]
            )

    for s in statements:
        # The statement has to be saved before its many-to-many relations
        # can be filled in.
        s.save()

        s.mentioned_politicians.add(*s._related_pols)
        s.bills.add(*s._related_bills)
        if getattr(s, '_related_vote', False):
            s._related_vote.context_statement = s
            s._related_vote.save()

    document.save()

    return document