def save_items(table_name, items, id_lv=None):
    """Save items into DB. Items are saved separately, because we want
    to store id for futher processing."""

    primary_key = get_item_primary_key_name(table_name)

    for item in items:
        is_change = not (item.get('zadna_zmena', False))
        if is_change:
            # exclude owners from saving item, we add it after save
            owners = item.pop('vlastnici', [])

            if 'cislo_zaznamu' not in item:
                item['cislo_zaznamu'] = 1

            if id_lv:
                item['id_lv'] = id_lv

            db = db_handler.get_dataset()
            try:
                item[primary_key] = db[table_name].insert(item)
            except Exception as e:
                pprint(e)

            item['vlastnici'] = owners
def prepare_batch():
    settings = get_project_settings()
    max_items = settings['MAX_STAVEBNI_OBJEKTY_IN_BATCH']
    max_processes = settings['MAX_STAVEBNI_OBJEKTY_PROCESSES']

    # check current running scrapyd processes
    active_processes = utils.sum_active_jobs(LandRegisterCrawler.project_name,
                                             LandRegisterCrawler.spider_name)

    max_processes -= active_processes
    if max_processes < 1:
        return []

    items_to_process = max_items * max_processes

    db = db_handler.get_dataset()
    results = db['stavebni_objekt_ref'].find(stav_scrapingu='W',
                                             _limit=items_to_process,
                                             order_by='id')

    batch = []
    single_list = []
    for r in results:
        if len(single_list) == max_items:
            batch.append(single_list.copy())
            single_list = []

        single_list.append(r['id'])

    if single_list:
        batch.append(single_list)

    return batch
def filter_lv_change(new_item):
    """Check for item change and filter (remove) unchanged fields."""

    primary_key = get_item_primary_key_name('lv')
    search_key = get_item_search_key('lv', new_item)

    db = db_handler.get_dataset()
    all_items = db['lv'].find(**search_key, order_by='cislo_zaznamu')

    current_item = {}
    for item in all_items:
        current_item.update((k,v) for k,v in item.items() if v is not None)

    # item is new => no changes have been made
    if not current_item:
        return

    get_changed_fields(current_item, new_item)

    new_item[primary_key] = current_item[primary_key]

    if is_changed(new_item, ignore_fields=['vlastnici', primary_key]):
        new_item['cislo_zaznamu'] = current_item['cislo_zaznamu'] + 1
    else:
        new_item['zadna_zmena'] = True
def process_seznam_nemovitosti(seznam_nemovitosti, id_rizeni):
    db = db_handler.get_dataset()

    for s in seznam_nemovitosti:
        s['id_rizeni'] = id_rizeni

        db['seznam_nemovitosti'].upsert(s, ['id_rizeni', 'poradove_cislo'])
def process_predmety_rizeni(predmety_rizeni, id_rizeni):
    db = db_handler.get_dataset()

    for p in predmety_rizeni:
        p['id_rizeni'] = id_rizeni

        db['predmety_rizeni'].upsert(p, ['id_rizeni', 'poradove_cislo'])
def create_scraping(name=None):
    """Creates new scraping process."""

    db = db_handler.get_dataset()
    fields = dict(nazev=name, datum_zacatku=datetime.now())
    scraping_id = db['log_scraping'].insert(fields)
    return scraping_id
def process_ucastnici_rizeni(ucastnici, id_rizeni):
    db = db_handler.get_dataset()

    for u in ucastnici:
        u['id_rizeni'] = id_rizeni

        db['ucastnici_rizeni'].upsert(u, ['id_rizeni', 'poradove_cislo'])
Example #8
0
def is_operation_updated(operation_number, wp, state):
    """Checks if operation has been updated already."""

    db = db_handler.get_dataset()
    result = db['rizeni'].find_one(cislo_pracoviste=wp,
                                   cislo_rizeni=operation_number,
                                   stav_rizeni=state)
    return True if result else False
def load_items(id_lv, table_name, additional_fields={}):
    """Loads all items (include historical changes) for single LV."""

    db = db_handler.get_dataset()
    return db[table_name].find(
        id_lv=id_lv,
        **additional_fields,
        order_by='cislo_zaznamu'
    )
def get_ku_jobs(scraping_id, status, limit=None):
    """Loads list of KU jobs by status.
    (R - Runinng, F - Finished, E - Error, W - Waiting)."""

    db = db_handler.get_dataset()
    results = db['log_uloha'].find(
        id_scrapingu=scraping_id, stav=status, _limit=limit
    )
    return [r for r in results]
def process_provedene_operace(provedene_operace, id_rizeni):
    db = db_handler.get_dataset()

    for p in provedene_operace:
        p['id_rizeni'] = id_rizeni
        datum = p.get('datum')
        p['datum'] = common.get_date_from_string(datum) if datum else datum

        db['provedene_operace'].upsert(p, ['id_rizeni', 'poradove_cislo'])
def generate_scraping_objects(ids):
    """Get next url."""
    db = db_handler.get_dataset()
    table = db['stavebni_objekt_ref'].table
    statement = table.select(table.c.id.in_(tuple(ids)))
    results = db.query(statement)

    for obj in results:
        yield obj
def get_id_lv(item):
    """Get LV's ID, if not exists, item will be saved."""

    db = db_handler.get_dataset()

    return db['lv'].upsert(
        dict(cislo_ku=item.get('cislo_ku'),
             cislo_lv=item.get('cislo_lv'),
             cislo_zaznamu=1), ['cislo_ku', 'cislo_lv'])
def get_id_rizeni(item):
    """Get ID of rizeni, if not exists, item will be saved."""

    db = db_handler.get_dataset()

    return db['rizeni'].upsert(
        dict(cislo_pracoviste=item.get('cislo_pracoviste'),
             cislo_rizeni=item.get('cislo_rizeni')),
        ['cislo_pracoviste', 'cislo_rizeni'])
def get_scraping_id():
    """Loads ID of scraping process."""

    db = db_handler.get_dataset()
    result = db.query("""
        SELECT MAX(id) as id
          FROM log_scraping"""
    )
    for r in result:
        return r['id'] if r['id'] else None
def update_job_log(job_id, job_hash, status):
    """Updates job's hash and status."""

    db = db_handler.get_dataset()
    fields = dict(id=job_id, hash_ulohy=job_hash, stav=status, datum_konce=None)

    # if status is R (running), set date of create
    if status == 'R':
        fields['datum_zacatku'] = datetime.now()

    db['log_uloha'].update(fields, ['id'])
def save_ku_csv_into_db(file):
    """Saves all KU items into DB."""

    with open(file, mode='r', encoding='utf8') as csv_file:
        csv_reader = csv.DictReader(csv_file, delimiter=';')
        items_to_save = []
        for row in csv_reader:
            item = prepare_ku_to_save(row)
            items_to_save.append(item)

    db = db_handler.get_dataset()
    db['ku'].insert_many(items_to_save)
def process_ref_rizeni(item, number, type):
    lv_item = item.get('lv_item')
    id_lv = get_id_lv(lv_item)
    id_rizeni = get_id_rizeni(item)

    if not id_rizeni:
        return

    db = db_handler.get_dataset()
    db['seznam_nemovitosti'].update(
        dict(id_lv=id_lv, id_rizeni=id_rizeni, typ=type, cislo=number),
        ['id_rizeni', 'typ', 'cislo'])
def get_last_processed_lv(job_id):
    """Loads last processed LV for job ID (job ID is connected) to KU,
    and with this connection we can distinguish data from different
    scrapings."""

    db = db_handler.get_dataset()
    result = db.query("""
        SELECT MAX(cislo_lv) as max_lv
          FROM log_lv
          WHERE id_ulohy = {}""".format(job_id)
    )
    for r in result:
        return r['max_lv'] if r['max_lv'] else 0
def delete_whole_lv_item(lv, ku):
    """Delete all objects of LV in specified KU."""

    db = db_handler.get_dataset()
    result = db['lv'].find_one(cislo_lv=lv, cislo_ku=ku)

    id_lv = result['id'] if result else None

    if not id_lv:
        return

    # delete object tables
    for t in ['pozemek', 'stavebni_objekt', 'stavba', 'jednotka', 'vlastnici']:
        db[t].delete(id_lv = id_lv)

    # delete main table
    db['lv'].delete(id=id_lv)
def process_rizeni(item):
    data = item.get('data')

    datum = data.get('datum_prijeti')
    data['datum_prijeti'] = common.get_datetime_from_string(
        datum) if datum else datum

    db = db_handler.get_dataset()
    id_rizeni = db['rizeni'].upsert(data, ['cislo_pracoviste', 'cislo_rizeni'])

    if item.get('ucastnici'):
        process_ucastnici_rizeni(item['ucastnici'], id_rizeni)
    if item.get('provedene_operace'):
        process_provedene_operace(item['provedene_operace'], id_rizeni)
    if item.get('predmety_rizeni'):
        process_predmety_rizeni(item['predmety_rizeni'], id_rizeni)
    if item.get('seznam_nemovitosti'):
        process_seznam_nemovitosti(item['seznam_nemovitosti'], id_rizeni)
def load_all_ku():
    """Loads all KU."""

    db = db_handler.get_dataset()
    results = db['ku'].all()

    today = date.today()
    ku_list = []
    for r in results:
        if r['plati_od']:
            if r['plati_od'] > today:
                continue
        if r['plati_do']:
            if r['plati_do'] < today:
                continue
        ku_list.append(r['cislo_ku'])

    return ku_list
def init_new_scraping():
    """Inits new scraping of cadaster. Prepare all ku codes and set
    waiting status to them."""

    scraping_id = create_scraping()
    ku_list = load_all_ku()
    ku_jobs = []

    for ku in ku_list:
        ku_jobs.append({
            'id_scrapingu': scraping_id,
            'cislo_ku': ku,
            'stav': 'W'
        })

    db = db_handler.get_dataset()
    db['log_uloha'].insert_many(ku_jobs)

    return scraping_id
Example #24
0
def check_completness(ku_code):
    scraping_id = get_scraping_id()

    db = db_handler.get_dataset()
    max_valid_lvs = db.query("""
        SELECT MAX(log_lv.cislo_lv), log_lv.cislo_ku
          FROM log_lv
          INNER JOIN log_uloha
            ON log_lv.id_ulohy = log_uloha.id
          WHERE log_uloha.id_scrapingu = 53
          GROUP BY log_lv.cislo_ku, log_lv.existuje
          ORDER BY log_lv.cislo_ku ASC""".format(scraping_id))
    """
        SELECT MAX(cislo_lv), cislo_ku, existuje
          FROM log_lv
          WHERE cislo_ku = '600016'
          GROUP BY cislo_ku, existuje
          ORDER BY cislo_ku ASC"""

    max_invalid_lvs = db.query("""
        SELECT MAX(cislo_lv) AS max_invalid_lv, cislo_ku
          FROM log_lv
          WHERE existuje = 0""")
Example #25
0
 def update_rizeni_log(self, status):
     db = db_handler.get_dataset()
     db['log_rizeni'].update(
         dict(id=self.job_id, stav=status, datum_konce=datetime.now()),
         ['id'])
def save_buildings_refs(id_lv, refs):
    refs = delete_duplicates(refs)
    db = db_handler.get_dataset()
    for r in refs:
        r['id_lv'] = id_lv
    db['stavebni_objekt_ref'].insert_many(refs)
def update_scraping_object(scraping_object, status):
    db = db_handler.get_dataset()
    db['stavebni_objekt_ref'].update(
        dict(id=scraping_object['id'], stav_scrapingu=status), ['id'])
def update_item_status(id, status):
    db = db_handler.get_dataset()
    db['stavebni_objekt_ref'].update(dict(id=id, stav_scrapingu=status),
                                     ['id'])
def delete_all_ku():
    """Deletes all ku data."""

    db = db_handler.get_dataset()
    db['ku'].delete()
Example #30
0
def update_job_status(job_id, status):
    db = db_handler.get_dataset()
    fields = dict(id=job_id, stav=status, datum_konce=datetime.now())
    db['log_uloha'].update(fields, ['id'])