Exemple #1
0
def get_questions(
        year='2015',
        reimport_existing=False,
        cache_name=None,
        throttle=None,
        autoanalyze=False,
        ):
    """Scrape one year of questions and sync them into the database.

    Every question yielded by ``QuestionScraper.run(int(year))`` is patched
    into ``models.Question`` (keyed on ``number`` and ``date``).  Its ``Ask``
    rows are then reconciled against the scraped person list, and its
    answer, when present, is patched into ``models.Answer`` (keyed on
    ``question_id``).  The database session is committed once, after the
    whole run.

    :param year: year to scrape; converted with ``int()``.
    :param reimport_existing: when true, the ``skip`` callback handed to the
        scraper never reports a URL as known; otherwise it reports the URLs
        already stored in ``models.Question.url``.
    :param cache_name: HTTP cache name; ``_get_config_cache_name()`` is used
        when this is falsy.
    :param throttle: converted with ``float()`` when truthy, otherwise
        passed to ``create_session`` unchanged.
    :param autoanalyze: when true, queue OCR and policy-domain jobs for the
        questions and answers that changed during this run.
    :returns: ``None``.
    """
    # NOTE(review): these imports are function-local; presumably that avoids
    # an import cycle -- confirm before hoisting them to module level.
    from mptracker.scraper.questions import QuestionScraper
    from mptracker.questions import ocr_question, ocr_answer
    from mptracker.policy import calculate_question

    if reimport_existing:
        # An empty set means skip_question() is false for every URL.
        known_urls = set()
    else:
        url_query = models.db.session.query(models.Question.url)
        known_urls = {row[0] for row in url_query}

    def skip_question(url):
        """Return True when ``url`` is already stored in the database."""
        return url in known_urls

    http_session = create_session(
        cache_name=cache_name or _get_config_cache_name(),
        throttle=throttle and float(throttle),
        counters=True,
    )
    questions_scraper = QuestionScraper(session=http_session,
                                        skip=skip_question)

    mandate_lookup = models.MandateLookup()

    question_patcher = TablePatcher(models.Question,
                                    models.db.session,
                                    key_columns=['number', 'date'])

    answer_patcher = TablePatcher(models.Answer,
                                  models.db.session,
                                  key_columns=['question_id'])

    new_ask_rows = 0

    # Rows whose patch result reported a change; only these get follow-up
    # jobs when ``autoanalyze`` is set.
    changed_questions = []
    changed_answers = []

    with question_patcher.process() as add, \
         answer_patcher.process() as add_answer:
        for question in questions_scraper.run(int(year)):
            # 'person' and 'answer' are removed from the scraped dict before
            # it is handed to the question patcher; they are handled below.
            person_list = question.pop('person')
            question['addressee'] = '; '.join(question['addressee'])
            answer_data = question.pop('answer', None)
            result = add(question)
            q = result.row

            # Start from every existing ask; whatever is still in this dict
            # after the loop was not matched by the scraped person list.
            old_asked = {ask.mandate_id: ask for ask in q.asked}
            for name, person_year, person_number in person_list:
                mandate = mandate_lookup.find(name, person_year, person_number)
                if mandate.id in old_asked:
                    # Existing ask for this mandate: keep the row as it is.
                    del old_asked[mandate.id]
                    continue

                ask = models.Ask(mandate=mandate)
                q.asked.append(ask)
                ask.set_meta('new', True)
                logger.info("Adding ask for %s: %s", q, mandate)
                new_ask_rows += 1

            if result.is_changed:
                changed_questions.append(q)

            if old_asked:
                # Logger.warn is a deprecated alias; use Logger.warning.
                logger.warning("Removing %d old 'ask' records",
                               len(old_asked))
                for ask in old_asked.values():
                    models.db.session.delete(ask)

            if answer_data:
                # The answer is keyed on the question's primary key, so the
                # question row must already have an id at this point.
                assert q.id is not None
                answer_data['question_id'] = q.id
                answer_result = add_answer(answer_data)
                if answer_result.is_changed:
                    changed_answers.append(answer_result.row)

    models.db.session.commit()

    if new_ask_rows:
        logger.info("Added %d ask records", new_ask_rows)

    counters = http_session.counters
    logger.info("HTTP: %d kb in %s requests, %.2f seconds",
                counters['bytes'] / 1024, counters['requests'],
                counters['download_time'].total_seconds())

    if autoanalyze:
        logger.info("Scheduling jobs for %d questions", len(changed_questions))
        for question in changed_questions:
            if question.pdf_url:
                ocr_question.delay(question.id, autoanalyze=True)

            if question.policy_domain_id is None:
                calculate_question.delay(question.id)

        logger.info("Scheduling jobs for %d answers", len(changed_answers))
        for answer in changed_answers:
            ocr_answer.delay(answer.id)