def main_routine():
    t = time()
    with open("config.yaml", "r") as f:
        conf = yaml.safe_load(f)  # safe_load: no arbitrary YAML tags, works on PyYAML >= 5.1
    set_up_logging(conf[const.CONF_LOGGING])
    logger = logging.getLogger("Main")
    db = storage.MongoDatabase(conf=conf[const.CONF_MONGO])

    # Scrape the raw pages, then parse each collection into structured records.
    scrape.Hlasovanie(db, conf).store_all()
    html_parser.Hlasovanie(db, conf).parse_all()
    scrape.Poslanec(db, conf).store_all()
    html_parser.Poslanec(db, conf).parse_all()
    scrape.Zakon(db, conf).store_all()
    html_parser.Zakon(db, conf).parse_all()
    scrape.LegislativnaIniciativa(db, conf).store_all()
    html_parser.LegislativnaIniciativa(db, conf).parse_all()
    scrape.HlasovanieTlace(db, conf).store_all()
    html_parser.HlasovanieTlace(db, conf).parse_all()
    scrape.Zmena(db, conf).store_all()
    html_parser.Zmena(db, conf).parse_all()
    scrape.Rozprava(db, conf).store_all()
    html_parser.Rozprava(db, conf).parse_all()
    logger.info("Total elapsed time after scrape + parse: %f", time() - t)

    # Build graph nodes from the parsed collections.
    processing.NodesHlasovanie(db, conf).process_and_store_all()
    processing.NodesPoslanec(db, conf).process_and_store_all()
    processing.NodesKlub(db, conf).process_and_store_all()
    processing.NodesVybor(db, conf).process_and_store_all()
    processing.NodesDelegacia(db, conf).process_and_store_all()
    processing.NodesZakon(db, conf).process_and_store_all()
    processing.NodesSpektrum(db, conf).process_and_store_all()
    processing.NodesZmena(db, conf).process_and_store_all()
    processing.NodesRozprava(db, conf).process_and_store_all()
    logger.info("Total elapsed time after nodes insert: %f", time() - t)

    # Build graph edges between the nodes.
    processing.EdgesPoslanecKlubClen(db, conf).process_and_store_all()
    processing.EdgesPoslanecKlubBolClenom(db, conf).process_and_store_all()
    processing.EdgesPoslanecVyborClen(db, conf).process_and_store_all()
    processing.EdgesPoslanecDelegaciaClen(db, conf).process_and_store_all()
    processing.EdgesPoslanecHlasovanieHlasoval(db, conf).process_and_store_all()
    processing.EdgesVyborZakonNavrhnuty(db, conf).process_and_store_all()
    processing.EdgesVyborZakonGestorsky(db, conf).process_and_store_all()
    processing.EdgesPoslanecZakonNavrhol(db, conf).process_and_store_all()
    processing.EdgesKlubSpektrumClen(db, conf).process_and_store_all()
    processing.EdgesSpektrumZakonNavrhol(db, conf).process_and_store_all()
    processing.EdgesHlasovanieZakonHlasovaloO(db, conf).process_and_store_all()
    processing.EdgesPoslanecZmenaNavrhol(db, conf).process_and_store_all()
    processing.EdgesPoslanecZmenaPodpisal(db, conf).process_and_store_all()
    processing.EdgesZmenaZakonNavrhnuta(db, conf).process_and_store_all()
    processing.EdgesHlasovanieZmenaHlasovaloO(db, conf).process_and_store_all()
    processing.EdgesPoslanecRozpravaVystupil(db, conf).process_and_store_all()
    processing.EdgesRozpravaZakonTykalaSa(db, conf).process_and_store_all()
    logger.info("Total elapsed time after edges insert: %f", time() - t)
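# A minimal sketch of an entry point for the routine above; the guard itself is
# not shown in the original excerpt, and the module-level imports it relies on
# (yaml, logging, const, storage, scrape, html_parser, processing, time,
# set_up_logging) are assumed to exist elsewhere in this file.
if __name__ == "__main__":
    main_routine()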
import logging

from multiprocessing import Pool as ThreadPool

import validictory
from validictory import SchemaValidator

from schema.validate.sopr_html import transformed_ld1_schema, \
    transformed_ld2_schema
from utils.validate import validate_uuid, validate_url, validate_email
from utils import set_up_logging
from settings import TRANS_DIR

format_validators = {"uuid_hex": validate_uuid,
                     "url_http": validate_url,
                     "email": validate_email}

log = set_up_logging('validate', loglevel=logging.DEBUG)

# Validator configuration: every property is required, blank strings are
# rejected, unknown properties are disallowed, and schema defaults are not
# written back into the data.
required_by_default = True
blank_by_default = False
disallow_unknown_properties = True
apply_default_to_data = False

validator = SchemaValidator(format_validators, required_by_default,
                            blank_by_default, disallow_unknown_properties,
                            apply_default_to_data)


def log_result(result):
import os
import logging
import time

from multiprocessing.dummy import Pool as ThreadPool

from utils import set_up_logging

log = set_up_logging('download', loglevel=logging.DEBUG)


# GENERAL DOWNLOAD FUNCTIONS

def response_download(response, output_loc):
    if response.ok:
        try:
            with open(output_loc, 'wb') as output_file:
                for chunk in response.iter_content():
                    output_file.write(chunk)
            return response.headers.get('content-length', 'N/A')
        except Exception as e:
            log.error(e)
    else:
        log.error('response not okay: ' + response.reason)
        raise Exception("didn't work, trying again")


def log_result(result):
    if result[0] == 'success':
        url, loc, content_length = result[1:]
        log.info(
            'success: {source} => {dest}({size})'.format(
                source=url, dest=loc, size=content_length))
def configure_worker(sender=None, **extra):
    from utils import set_up_logging
    set_up_logging()
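# The (sender=None, **extra) signature matches a Celery signal handler, so one
# plausible way to register this hook is via the worker_process_init signal.
# This wiring is an assumption for illustration; the original excerpt does not
# show how configure_worker is connected.
from celery.signals import worker_process_init

worker_process_init.connect(configure_worker)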
import flask
from flask_script import Manager  # assumed: Manager comes from Flask-Script

import data
import viewer


def create_app():
    app = flask.Flask(__name__, instance_relative_config=True)
    app.config.from_pyfile('settings.py', silent=True)
    data.initialize(app)
    viewer.initialize(app)
    return app


manager = Manager(create_app)
data.register_commands(manager)


@manager.option('-s', '--socket')
def runfcgi(socket):
    from flup.server.fcgi import WSGIServer
    app = create_app()
    WSGIServer(app, debug=app.debug, bindAddress=socket, umask=0).run()


if __name__ == '__main__':
    from utils import set_up_logging
    set_up_logging()
    manager.run()
else:
    app = create_app()
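# Usage sketch (assuming this module is saved as app.py): Flask-Script exposes
# the registered commands on the command line, for example
#   python app.py runserver                  # Flask-Script's built-in dev server
#   python app.py runfcgi -s /tmp/app.sock   # FastCGI bound to the given socket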
import os
import re
import sys
import logging
import json
from glob import iglob
from collections import defaultdict

import numpy as np

from settings import CACHE_DIR, REF_DIR
from utils import set_up_logging
from utils import mkdir_p

log = set_up_logging('describe', loglevel=logging.DEBUG)


def describe_dos(options):
    if options.get('loglevel', None):
        log.setLevel(options['loglevel'])

    OUT_DIR = os.path.join(REF_DIR, 'dos')
    if not os.path.exists(OUT_DIR):
        mkdir_p(OUT_DIR)

    sql_to_dtype = {
        'VARCHAR': 'object',
        'INT': 'int64',
        'MONEY': 'float64'
    }
import os
import sys
import logging
import json
from collections import defaultdict
from glob import iglob

try:
    import pandas as pd
except ImportError:
    sys.stderr.write("python-pandas not installed.")

from settings import ORIG_DIR, TRANS_DIR
from utils import mkdir_p
from utils import set_up_logging
from utils import sqlize_colname

log = set_up_logging('transform', loglevel=logging.DEBUG)

with open('ref/field_codes.json', 'r') as fc_ref:
    FIELD_CODES = json.load(fc_ref)


def transform_cfo(options):
    if options.get('loglevel', None):
        log.setLevel(options['loglevel'])

    OUT_DIR = os.path.join(TRANS_DIR, 'cfo')
    if not os.path.exists(OUT_DIR):
        mkdir_p(OUT_DIR)

    CFO_ORIG = os.path.join(ORIG_DIR, 'cfo')
import logging
import zipfile
import json
from collections import defaultdict
from glob import glob
from multiprocessing import Pool as ThreadPool

from lxml import etree

from settings import CACHE_DIR, ORIG_DIR, TEST_CACHE_DIR, TEST_ORIG_DIR
from utils import mkdir_p, translate_dir
from utils import set_up_logging
from schema.scrape.sopr_html import ld1_schema, ld2_schema

log = set_up_logging('extract', loglevel=logging.DEBUG)

html_parser = etree.HTMLParser()


def log_result(result):
    if result[0] == 'success':
        src_dir, dest_dir, num_files = result[1:]
        log.info("successfully extracted " +
                 "{src_dir} => {dest_dir} ({num} files)".format(
                     src_dir=src_dir, dest_dir=dest_dir, num=num_files))
    elif result[0] == 'failure':
        loc, e = result[1:]
        log.error("extracting from {loc} failed: {exception}".format(
            loc=loc, exception=str(e)))
    elif result[0] == 'no_update':
import logging
from datetime import datetime
import locale

from pytz import timezone

from utils import set_up_logging

log = set_up_logging('schema', loglevel=logging.DEBUG)

REPLACE_MAP = {u' ': u'',
               u'\xa0': u'',
               u'\u200b': u'',
               u' ': u''}

locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

us_eastern = timezone('US/Eastern')

DATE_FORMATS = ['%m/%d/%Y',
                '%m/%d/%Y %I:%M:%S %p',
                '%m/%d/%y',
                '%Y/%m/%d',
                '%m-%d-%Y',
                '%m-%d-%y']


def checkbox_boolean(e):
    return 'checked' in e.attrib
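# A minimal sketch (not in the original module) showing how DATE_FORMATS and
# us_eastern could be combined to normalize a scraped date string; the helper
# name parse_datetime and the US/Eastern localization are assumptions.
def parse_datetime(raw):
    for fmt in DATE_FORMATS:
        try:
            # Interpret the naive timestamp as US Eastern time.
            return us_eastern.localize(datetime.strptime(raw.strip(), fmt))
        except ValueError:
            continue
    log.warning('unparseable date: %s', raw)
    return None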