def questions(
    year='2013',
    reimport_existing=False,
    cache_name=None,
    throttle=None,
    autoanalyze=False,
):
    """Scrape parliamentary questions for *year* and sync them to the DB.

    :param year: legislative year to scrape (string or int-like).
    :param reimport_existing: when true, re-scrape questions whose URLs
        are already stored; otherwise known URLs are skipped.
    :param cache_name: optional HTTP cache name for ``create_session``.
    :param throttle: optional inter-request delay; converted with
        ``float`` when truthy.
    :param autoanalyze: when true, enqueue OCR and policy-domain jobs
        for every question that was added or changed.
    """
    from mptracker.scraper.questions import QuestionScraper
    from mptracker.questions import ocr_question
    from mptracker.policy import calculate_question

    if reimport_existing:
        known_urls = set()
    else:
        # Skip URLs that are already imported.
        known_urls = set(q.url for q in models.Question.query)

    def skip_question(url):
        return url in known_urls

    http_session = create_session(
        cache_name=cache_name,
        throttle=throttle and float(throttle),
        counters=True,
    )

    questions_scraper = QuestionScraper(
        session=http_session,
        skip=skip_question,
    )

    mandate_lookup = models.MandateLookup()

    question_patcher = TablePatcher(
        models.Question,
        models.db.session,
        key_columns=['number', 'date'],
    )

    new_ask_rows = 0
    changed = []
    with question_patcher.process() as add:
        for question in questions_scraper.run(int(year)):
            person_list = question.pop('person')
            question['addressee'] = '; '.join(question['addressee'])
            result = add(question)
            q = result.row

            # Reconcile askers: entries left in old_asked after the loop
            # would be mandates no longer attached to the question.
            old_asked = {ask.mandate_id: ask for ask in q.asked}

            for name, person_year, person_number in person_list:
                mandate = mandate_lookup.find(
                    name, person_year, person_number)
                if mandate.id in old_asked:
                    old_asked.pop(mandate.id)
                else:
                    ask = models.Ask(mandate=mandate)
                    q.asked.append(ask)
                    ask.set_meta('new', True)
                    logger.info("Adding ask for %s: %s", q, mandate)
                    new_ask_rows += 1

            if result.is_changed:
                changed.append(q)

            # An `assert` here would be stripped under `python -O`;
            # raise explicitly so stale Ask rows never slip through.
            if old_asked:
                raise RuntimeError(
                    "Stale Ask rows for %r: %r" % (q, old_asked))

    models.db.session.commit()

    if new_ask_rows:
        logger.info("Added %d ask records", new_ask_rows)

    counters = http_session.counters
    logger.info(
        "HTTP: %d kb in %s requests, %.2f seconds",
        counters['bytes'] / 1024,
        counters['requests'],
        counters['download_time'].total_seconds(),
    )

    if autoanalyze:
        logger.info("Scheduling jobs for %d questions", len(changed))
        for question in changed:
            if question.pdf_url:
                ocr_question.delay(question.id, autoanalyze=True)
            if question.policy_domain_id is None:
                calculate_question.delay(question.id)