Example 1
def cron_courts():

    try:
        res = get(ROOT_URL + LIST_COURTS)
        soup = BeautifulSoup(res.text, 'html.parser')
        Court.objects.get_or_create(id=SUPREME_COURT, name='Nejvyšší soud')
        Court.objects.get_or_create(id=SUPREME_ADMINISTRATIVE_COURT, name='Nejvyšší správní soud')
        upper = soup.find(id='kraj').find_all('option')[1:]
        lower = soup.find(id='soudy').find_all('option')[1:]
        for court in upper + lower:
            Court.objects.get_or_create(id=court['value'], name=court.string)
    except:  # pragma: no cover
        LOGGER.warning('Error importing courts')
    Court.objects.all().update(reports=None)
    for court in Court.objects.all():
        if isreg(court):
            try:
                sleep(1)
                res = get(ROOT_URL + LIST_REPORTS.format(court.pk))
                soup = BeautifulSoup(res.text, 'xml')
                for item in soup.find_all('okresniSoud'):
                    Court.objects.filter(pk=item.id.string).update(reports=court)
            except:  # pragma: no cover
                LOGGER.warning('Error setting hierarchy for {}'.format(court.id))
    LOGGER.info('Courts imported')
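
A minimal, self-contained sketch of the <option> scraping used above, with inline HTML standing in for the live court list (the real markup is an assumption):

from bs4 import BeautifulSoup

HTML = '''
<select id="kraj">
  <option value="">-- vyberte --</option>
  <option value="KSJIMBM">Krajský soud v Brně</option>
</select>
'''

soup = BeautifulSoup(HTML, 'html.parser')
# Skip the first, placeholder option, as cron_courts() does with [1:].
for option in soup.find(id='kraj').find_all('option')[1:]:
    print(option['value'], option.string)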
Example 2
def cron_publishers():

    def proc_publisher(tag, typ, high=False, subsidiary_region=False, subsidiary_county=False, reports=None):
        pubid = int(tag['href'].rpartition('=')[2])
        name = (
            tag.text.replace('  ', ' ')
            .replace('KS ', 'Krajský soud ')
            .replace('MS ', 'Městský soud ')
            .replace('OS Praha ', 'Obvodní soud Praha ')
            .replace('OS ', 'Okresní soud ')
            .replace('KSZ ', 'Krajské státní zastupitelství ')
            .replace('MSZ ', 'Městské státní zastupitelství ')
            .replace('OSZ Praha ', 'Obvodní státní zastupitelství Praha ')
            .replace('OSZ ', 'Okresní státní zastupitelství ')
        )
        return Publisher.objects.update_or_create(
            name=name,
            defaults={
                'type': typ,
                'pubid': pubid,
                'high': high,
                'subsidiary_region': subsidiary_region,
                'subsidiary_county': subsidiary_county,
                'reports': reports,
                'updated': datetime.now() - UPDATE_INTERVAL})[0]


    def proc_publishers(soup, typ, high=False):
        if high:
            for tag in soup.find_all('a'):
                proc_publisher(tag, typ, high=True)
        else:
            rep = proc_publisher(soup.select('dt a')[0], typ)
            for tag in soup.find_all('dd'):
                cls = tag.get('class', [])
                subsidiary_region = 'pobockakraj' in cls
                subsidiary_county = 'pobockaokres' in cls
                proc_publisher(
                    tag.find('a'),
                    typ,
                    subsidiary_region=subsidiary_region,
                    subsidiary_county=subsidiary_county,
                    reports=rep)

    for typ in TYPES:
        try:
            res = get(PUBLISHERS_URL.format(typ))
            soup = BeautifulSoup(res.text, 'html.parser')
            high = soup.find('div', 'bezlokality')
            lower = soup.find('div', 'slokalitou')
            proc_publishers(high, typ, high=True)
            for reg in lower.find_all('dl'):
                proc_publishers(reg, typ, high=False)
        except:
            pass

    LOGGER.info('Publishers imported')
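
A minimal sketch of the two parsing steps inside proc_publisher(): pulling the numeric pubid off the link target and expanding the abbreviated court names. The sample href and label are assumptions:

href = 'publisher.aspx?pubid=123'  # assumed href shape
pubid = int(href.rpartition('=')[2])

label = 'OS Praha 4'  # assumed link text
name = (
    label.replace('  ', ' ')
    .replace('KS ', 'Krajský soud ')
    .replace('MS ', 'Městský soud ')
    .replace('OS Praha ', 'Obvodní soud Praha ')
    .replace('OS ', 'Okresní soud ')
)
print(pubid, name)  # 123 Obvodní soud Praha 4

Order matters in the chain: the more specific 'OS Praha ' prefix must be rewritten before the bare 'OS ' prefix, which is why it comes first.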
Example 3
def cron_find():

    now = datetime.now()
    try:
        dec = Decision.objects.filter(anonfilename='', date__gte=(now - OBS)).earliest('updated')
        dec.updated = now
        dec.save()
        res = get(FIND_URL)
        soup = BeautifulSoup(res.text, 'html.parser')
        form = soup.find('form')
        dct = {i['name']: i['value'] for i in form.find_all('input') if i['type'] == 'hidden' and i.has_attr('value')}
        ref = ('{} '.format(dec.senate) if dec.senate else '')
        ref += '{0.register} {0.number:d}/{0.year:d}'.format(dec)
        dct['_ctl0:ContentPlaceMasterPage:_ctl0:txtDatumOd'] = dct['_ctl0:ContentPlaceMasterPage:_ctl0:txtDatumDo'] = \
            '{0.day:02d}.{0.month:02d}.{0.year:d}'.format(dec.date)
        dct['_ctl0:ContentPlaceMasterPage:_ctl0:txtSpisovaZnackaFull'] = ref
        dct['_ctl0_ContentPlaceMasterPage__ctl0_rbTypDatum_0'] = 'on'
        res = post(FIND_URL, dct)
        soup = BeautifulSoup(res.text, 'html.parser')
        for anchor in soup.select('table#_ctl0_ContentPlaceMasterPage__ctl0_grwA')[0].select('a[title^=Anonymizovan]'):
            fileurl = anchor['href']
            filename = fileurl.split('/')[-1]
            if not FRE.match(filename):
                continue
            res = get(ROOT_URL + fileurl)
            if not res.ok:
                continue
            LOGGER.info(
                'Writing anonymized decision "{}"'
                .format(composeref(dec.senate, dec.register, dec.number, dec.year)))
            with open(join(REPO_PREF, filename), 'wb') as outfile:
                if not outfile.write(res.content):  # pragma: no cover
                    LOGGER.error(
                        'Failed to write anonymized decision "{}"'
                        .format(composeref(dec.senate, dec.register, dec.number, dec.year)))
                    return
                adddoc(APP, filename, ROOT_URL + fileurl)
            dec.anonfilename = filename
            dec.save()
            return
    except:  # pragma: no cover
        LOGGER.warning('Find failed')
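
The hidden-field harvesting above is the standard trick for scripting ASP.NET pages: every <input type="hidden"> that carries a value (view state, event validation) must be echoed back in the POST. A minimal sketch against assumed markup:

from bs4 import BeautifulSoup

HTML = '''
<form>
  <input type="hidden" name="__VIEWSTATE" value="abc123"/>
  <input type="hidden" name="__EVENTVALIDATION" value="def456"/>
  <input type="text" name="query" value=""/>
</form>
'''

form = BeautifulSoup(HTML, 'html.parser').find('form')
dct = {i['name']: i['value'] for i in form.find_all('input')
       if i['type'] == 'hidden' and i.has_attr('value')}
print(dct)  # {'__VIEWSTATE': 'abc123', '__EVENTVALIDATION': 'def456'}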
Example 4
def cron_courtrooms():

    for court in Court.objects.exclude(id=SUPREME_ADMINISTRATIVE_COURT):
        try:
            sleep(1)
            res = get(LIST_COURTROOMS.format(court.pk))
            soup = BeautifulSoup(res.text, 'xml')
            for room in soup.find_all('jednaciSin'):
                croom, croomc = Courtroom.objects.get_or_create(
                    court=court, desc=room.nazev.string)
                if not croomc:
                    croom.save()
        except:  # pragma: no cover
            LOGGER.warning('Error downloading courtrooms')
    LOGGER.info('Courtrooms downloaded')
Example 5
def getauxid(senate, register, number, year):

    try:
        res = get(NSS_URL)
        soup = BeautifulSoup(res.text, 'html.parser')
        form = soup.find('form')
        dct = {i['name']: i['value'] for i in form.find_all('input') if i['type'] == 'hidden' and i.has_attr('value')}
        ref = composeref(senate, register, number, year)
        dct['_ctl0:ContentPlaceMasterPage:_ctl0:txtSpisovaZnackaFull'] = ref
        res = post(NSS_URL, dct)
        soup = BeautifulSoup(res.text, 'html.parser')
        oncl = (
            soup.select('table#_ctl0_ContentPlaceMasterPage__ctl0_grwA')[0]
            .select('img[src="/Image/infosoud.gif"]')[0]['onclick'])
        return int(oncl.split('=')[-1].split("'")[0])
    except:
        return 0
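
A minimal sketch of the onclick parsing in getauxid(); the attribute text below is an assumption modelled on the split logic above:

oncl = "window.open('/SoudniSpis.aspx?spis=12345')"  # assumed onclick text
auxid = int(oncl.split('=')[-1].split("'")[0])
print(auxid)  # 12345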
Example 6
def updateproc(proc):

    notnew = bool(proc.updated)
    proc.updated = datetime.now()
    proc.save()
    court = proc.court_id
    try:
        if court == SUPREME_ADMINISTRATIVE_COURT:
            addauxid(proc)
            if not proc.auxid:
                return
            url = NSS_GET_PROC.format(proc.auxid)
            res = get(url)
            soup = BeautifulSoup(res.text, 'html.parser')
            table = soup.find('table', 'frm')
        else:
            court_type = 'ns' if court == SUPREME_COURT else 'os'
            url = ROOT_URL + GET_PROC.format(
                court,
                proc.court.reports.id if proc.court.reports else proc.court.id,
                proc.senate,
                quote(proc.register.upper()),
                proc.number,
                proc.year,
                court_type)
            res = get(url)
            soup = BeautifulSoup(res.text, 'html.parser')
            table = soup.find('tr', 'AAAA')
        assert table
    except:  # pragma: no cover
        LOGGER.warning(
            'Failed to check proceedings "{0.desc}" ({1}) for user "{2}" ({0.uid_id:d})'
            .format(proc, p2s(proc), User.objects.get(pk=proc.uid_id).username))
        return False
    hsh = md5(str(table).encode()).hexdigest()
    if court != SUPREME_ADMINISTRATIVE_COURT:
        changed = None
        try:
            tbl = table.find_next_sibling().find_next_sibling().table.tr.td.find_next_sibling().text.split()
            if len(tbl) == 4:
                changed = datetime(*map(int, list(reversed(tbl[0].split('.'))) + tbl[1].split(':')))
        except:  # pragma: no cover
            LOGGER.warning(
                'Failed to check proceedings "{0.desc}" ({1}) for user "{2}" ({0.uid_id:d})'
                .format(proc, p2s(proc), User.objects.get(pk=proc.uid_id).username))
        if changed != proc.changed or hsh != proc.hash:
            proc.notify |= notnew
            if changed:
                proc.changed = changed
                LOGGER.info(
                    'Change detected in proceedings "{0.desc}" ({1}) for user "{2}" ({0.uid_id:d})'
                    .format(proc, p2s(proc), User.objects.get(pk=proc.uid_id).username))
    elif hsh != proc.hash:
        proc.notify |= notnew
        if notnew:
            proc.changed = proc.updated
            if proc.changed:
                LOGGER.info(
                    'Change detected in proceedings "{0.desc}" ({1}) for user "{2}" ({0.uid_id:d})'
                    .format(proc, p2s(proc), User.objects.get(pk=proc.uid_id).username))
    proc.hash = hsh
    LOGGER.debug(
        'Proceedings "{0.desc}" ({1}) updated for user "{2}" ({0.uid_id:d})'
        .format(proc, p2s(proc), User.objects.get(pk=proc.uid_id).username))
    return True
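
Two pieces of updateproc() worth isolating: the change detection hashes the rendered table and compares it with the stored hash, and the timestamp cell is parsed from 'DD.MM.YYYY HH:MM'. A runnable sketch with assumed sample values:

from datetime import datetime
from hashlib import md5

table = '<tr class="AAAA"><td>...</td></tr>'  # assumed rendered fragment
hsh = md5(str(table).encode()).hexdigest()

tbl = '12.03.2021 9:30 - -'.split()  # assumed cell text, len(tbl) == 4
changed = datetime(*map(int, list(reversed(tbl[0].split('.'))) + tbl[1].split(':')))
print(hsh[:8], changed)  # e.g. ... 2021-03-12 09:30:00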
Example 7
def cron_update():

    tasks = Task.objects.all()
    if not tasks.exists():
        return
    task = tasks.earliest('timestamp_update')
    task.save()
    court0 = 'os'
    if task.court.reports:
        court1 = task.court.reports.id
        court2 = task.court.id
    else:
        court1 = task.court.id
        court2 = ''
    tdate = str(task.date)
    try:
        for croom in Courtroom.objects.filter(court=task.court):
            query = QueryDict(mutable=True)
            query['type'] = 'jednani'
            query['typSoudu'] = court0
            query['krajOrg'] = court1
            query['org'] = court2
            query['sin'] = croom.desc
            query['datum'] = '{0.day:d}.{0.month:d}.{0.year:d}'.format(task.date)
            query['spamQuestion'] = '23'
            query['druhVec'] = ''
            url = ROOT_URL + GET_HEARINGS + query.urlencode()
            sleep(1)
            res = get(url)
            soup = BeautifulSoup(res.text, 'html.parser')
            sched = soup.select('table tr td + td table tr td table tr')[6]
            if sched.select('b'):
                continue
            for ttr in sched.td.table.children:  # includes text nodes; those fail below and are skipped by the except
                try:
                    ttd = ttr.td
                    ttm = ttd.text.split(':')
                    ttm = datetime(
                        task.date.year,
                        task.date.month,
                        task.date.day,
                        int(ttm[0]),
                        int(ttm[1]))
                    ttd = ttd.find_next_sibling('td')
                    senate, register, number, year = decomposeref(ttd.text.replace(' / ', '/'))
                    register = normreg(register)
                    ttd = ttd.find_next_sibling('td')
                    form = Form.objects.get_or_create(name=ttd.text.strip())[0]
                    ttd = ttd.find_next_sibling('td')
                    judge = Judge.objects.get_or_create(name=ttd.text.strip())[0]
                    ttd = ttd.find_next_sibling('td')
                    parties = ttd.select('td')
                    ttd = ttd.find_next_sibling('td')
                    closed = 'Ano' in ttd.text
                    ttd = ttd.find_next_sibling('td')
                    cancelled = 'Ano' in ttd.text
                    hearing = Hearing.objects.update_or_create(
                        courtroom=croom,
                        time=ttm,
                        senate=senate,
                        register=register,
                        number=number,
                        year=year,
                        form=form,
                        judge=judge,
                        defaults={
                            'closed': closed,
                            'cancelled': cancelled})
                    if hearing[1]:
                        for query in parties:
                            qts = query.text.strip()
                            if qts:
                                party = Party.objects.get_or_create(name=query.text.strip())[0]
                                hearing[0].parties.add(party)
                                sur_check(
                                    {'check_psj': True},
                                    qts,
                                    task.court,
                                    senate,
                                    register,
                                    number,
                                    year,
                                    HEARING_URL.format(
                                        task.court.id,
                                        senate,
                                        quote(register),
                                        number,
                                        year,
                                        tdate,
                                        tdate))
                except:
                    pass
        task.delete()
    except:
        LOGGER.warning(
            'Failed to download hearings for {0}, {1.year:d}-{1.month:02d}-{1.day:02d}'
            .format(task.court_id, task.date))
        return
    LOGGER.debug(
        'Downloaded hearings for {0}, {1.year:d}-{1.month:02d}-{1.day:02d}'.format(task.court_id, task.date))
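
A minimal sketch of the hearings query string built above, using the standard library in place of Django's QueryDict; the parameter names follow the example, the values are assumptions:

from datetime import date
from urllib.parse import urlencode

task_date = date(2021, 3, 12)
query = {
    'type': 'jednani',
    'typSoudu': 'os',
    'krajOrg': 'KSJIMBM',  # assumed regional court id
    'org': 'OSJIMBM',      # assumed district court id
    'sin': '25',           # assumed courtroom label
    'datum': '{0.day:d}.{0.month:d}.{0.year:d}'.format(task_date),
    'spamQuestion': '23',
    'druhVec': '',
}
print('?' + urlencode(query))  # ?type=jednani&typSoudu=os&...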
Example 8
def cron_update2():

    nss = Court.objects.get(pk=SUPREME_ADMINISTRATIVE_COURT)
    croom = Courtroom.objects.get_or_create(court=nss, desc='(neuvedeno)')[0]
    form = Form.objects.get_or_create(name='Veřejné jednání')[0]
    try:
        res = get(LIST_COURTROOMS2)
        soup = BeautifulSoup(res.text, 'html.parser')
        for item in soup.select('table.item'):
            for hearing in Hearing.objects.filter(courtroom__court=nss, auxid=0):
                hearing.auxid = getauxid(hearing.senate, hearing.register, hearing.number, hearing.year)
                hearing.save()
            try:
                senate = register = number = year = judge = ttm = None
                parties = []
                for trow in item.select('tr'):
                    ths = trow.th.text.strip()
                    tds = trow.td.text.strip()
                    if ths.startswith('Spisová značka:'):
                        senate, register, number, year = decomposeref(tds)
                    elif ths.startswith('Účastníci řízení:'):
                        for query in trow.td:
                            if 'strip' in dir(query):  # keep text nodes only; Tag objects lack .strip
                                party = Party.objects.get_or_create(name=query.strip())[0]
                                parties.append(party)
                    elif ths.startswith('Předseda senátu:'):
                        judge = Judge.objects.get_or_create(name=tds)[0]
                    elif ths.startswith('Datum jednání:'):
                        dtm = tds.split()
                        dat = list(map(int, dtm[0].split('.')))
                        tim = list(map(int, dtm[2].split(':')))
                        ttm = datetime(dat[2], dat[1], dat[0], tim[0], tim[1])
                auxid = getauxid(senate, register, number, year)
                hearing = Hearing.objects.update_or_create(
                    courtroom=croom,
                    time=ttm,
                    senate=senate,
                    register=register,
                    number=number,
                    year=year,
                    form=form,
                    judge=judge,
                    closed=False,
                    cancelled=False,
                    auxid=auxid)
                if hearing[1]:
                    for party in parties:
                        hearing[0].parties.add(party)
                        sur_check(
                            {'check_psj': True},
                            party.name,
                            nss,
                            senate,
                            register,
                            number,
                            year,
                            HEARING_URL.format(
                                nss.id,
                                senate,
                                quote(register),
                                number,
                                year,
                                ttm.date(),
                                ttm.date()))
            except:  # pragma: no cover
                pass
    except:  # pragma: no cover
        LOGGER.warning('Supreme Administrative Court update failed')
    LOGGER.debug('Downloaded Supreme Administrative Court hearings')
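
A minimal sketch of the 'Datum jednání:' cell parsing above; the sample text is an assumption in the 'DD.MM.YYYY od HH:MM' shape the code expects:

from datetime import datetime

tds = '12.03.2021 od 9:30'  # assumed cell text
dtm = tds.split()
dat = list(map(int, dtm[0].split('.')))
tim = list(map(int, dtm[2].split(':')))
ttm = datetime(dat[2], dat[1], dat[0], tim[0], tim[1])
print(ttm)  # 2021-03-12 09:30:00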
Example 9
def cron_update():

    nss = Court.objects.get(pk=SUPREME_ADMINISTRATIVE_COURT)
    try:
        res = get(FORM_URL)
        soup = BeautifulSoup(res.text, 'html.parser')
        form = soup.find('form')
        dct = {i['name']: i['value'] for i in form.find_all('input') if i['type'] == 'hidden' and i.has_attr('value')}
        while True:
            dct['_ctl0:ContentPlaceMasterPage:_ctl0:ddlSortName'] = '5'
            dct['_ctl0:ContentPlaceMasterPage:_ctl0:ddlSortDirection'] = '1'
            res = post(FORM_URL, dct)
            soup = BeautifulSoup(res.text, 'html.parser')
            for item in soup.select('table.item'):
                try:
                    ttr = item.select('tr')
                    senate, register, number, year, page = decomposeref(ttr[0].td.text.strip())
                    if Decision.objects.filter(
                            senate=senate,
                            register=register,
                            number=number,
                            year=year,
                            page=page).exists():
                        continue
                    fileurl = ttr[4].a['href']
                    filename = fileurl.split('/')[-1]
                    if not FRE.match(filename):
                        continue
                    res = get(ROOT_URL + fileurl)
                    if not res.ok:
                        continue
                    LOGGER.info('Writing abridged decision "{}"'.format(composeref(senate, register, number, year)))
                    with open(join(REPO_PREF, filename), 'wb') as outfile:
                        if not outfile.write(res.content):  # pragma: no cover
                            LOGGER.error(
                                'Failed to write abridged decision "{}"'
                                .format(composeref(senate, register, number, year)))
                            continue
                        adddoc(APP, filename, ROOT_URL + fileurl)
                    agenda = Agenda.objects.get_or_create(desc=ttr[2].td.text.strip())[0]
                    dat = date(*map(int, list(reversed(ttr[3].td.text.split('.')))))
                    dec = Decision(
                        senate=senate,
                        register=register,
                        number=number,
                        year=year,
                        page=page,
                        agenda=agenda,
                        date=dat,
                        filename=filename)
                    dec.save()
                    for query in ttr[1].td:
                        if 'strip' in dir(query):  # keep text nodes only; Tag objects lack .strip
                            qstrip = query.strip()
                            party = Party.objects.get_or_create(name=qstrip)[0]
                            dec.parties.add(party)
                            sur_check(
                                {'check_udn': True},
                                qstrip,
                                nss,
                                senate,
                                register,
                                number,
                                year,
                                DEC_URL.format(senate, quote(register), number, year, page))
                except:  # pragma: no cover
                    pass
            pagers = soup.select('div#PagingBox2')[0]
            cpag = int(pagers.b.text[1:-1])
            pager = pagers.select('a')
            if cpag > len(pager):
                break
            form = soup.find('form')
            dct = {i['name']: i['value'] for i in form.find_all('input')
                   if i['type'] == 'hidden' and i.has_attr('value')}
            dct['__EVENTTARGET'] = pager[cpag - 1]['href'][70:-34]
            dct['__EVENTARGUMENT'] = ''
    except:  # pragma: no cover
        LOGGER.warning('Update failed')
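
The paging loop above reads the current page number from the bold '[n]' marker in div#PagingBox2 and stops once it passes the last link. A sketch against assumed markup:

from bs4 import BeautifulSoup

HTML = '<div id="PagingBox2"><a href="#">1</a><b>[2]</b><a href="#">3</a></div>'
pagers = BeautifulSoup(HTML, 'html.parser').select('div#PagingBox2')[0]
cpag = int(pagers.b.text[1:-1])   # current page, rendered as '[2]'
pager = pagers.select('a')        # links to the other pages
print(cpag, len(pager))           # 2 2 -> keep going; break once cpag > len(pager)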
Example 10
def cron_update(*args):

    today = date.today()
    if args:
        dates = []
        for arg in args:
            string = arg.split('.')
            dates.append(datetime(*map(int, string[2::-1])))
    else:
        dates = [today + ODP]
    for dat in dates:
        flt = {'subsidiary_region': False, 'subsidiary_county': False}
        if not args:
            flt['updated__lt'] = datetime.now() - UPDATE_INTERVAL
        for publisher in Publisher.objects.filter(**flt).order_by('id'):
            try:
                sleep(1)
                res = get(LIST_URL.format(publisher.pubid))
                assert res.ok
                soup = BeautifulSoup(res.text, 'html.parser')
                rows = soup.find_all('tr')
                if not rows:
                    continue
                for row in rows:
                    cells = row.find_all('td')
                    if len(cells) < 5:
                        continue
                    links = cells[0].select('a[href]')
                    if not links:
                        continue
                    desc = ref = senate = register = number = year = page = agenda = posted = None
                    files = []
                    href = links[0].get('href')
                    if href and href.startswith('vyveseni.aspx?vyveseniid='):
                        try:
                            docid = int(href.partition('=')[2])
                        except ValueError:
                            continue
                        try:
                            posted = date(*map(int, cells[0].text.strip().split('.')[2::-1]))
                        except:
                            continue
                    else:
                        continue
                    if Document.objects.filter(publisher=publisher, posted=posted, docid=docid).exists():
                        continue
                    try:
                        desc = cells[1].text.strip()
                        ref = cells[2].text.strip()
                        senate, register, number, year, page = parse_ref(ref)
                        agenda = Agenda.objects.get_or_create(desc=cells[3].text.strip())[0]
                        anchors = cells[4].find_all('a')
                        if not anchors:
                            continue
                        for anchor in anchors:
                            if not (anchor and anchor.has_attr('href')
                                    and anchor['href'].startswith('soubor.aspx?souborid=')):
                                continue
                            fileid = int(anchor['href'].partition('=')[2])
                            span = anchor.find('span', 'zkraceno')
                            filename = span['title'].strip() if span else anchor.text.strip()
                            if not filename:
                                continue
                            if filename.endswith(')'):
                                filename = filename.rpartition(' (')[0]
                            filename = filename.replace(' ', '_')
                            if fileid not in [x[0] for x in files]:
                                files.append((fileid, filename))
                        doc = Document.objects.get_or_create(
                            docid=docid,
                            publisher=publisher,
                            desc=desc,
                            ref=ref,
                            senate=senate,
                            register=register,
                            number=number,
                            year=year,
                            page=page,
                            agenda=agenda,
                            posted=posted,
                        )[0]
                        for fileid, filename in files:
                            if File.objects.filter(fileid=fileid).exists():
                                File.objects.filter(fileid=fileid).update(document=doc)
                                continue
                            infile = get(FILE_URL.format(fileid))
                            assert infile.ok
                            content = infile.content
                            dirname = join(REPO_PREF, str(fileid))
                            makedirs(dirname, exist_ok=True)
                            pathname = join(dirname, filename)
                            with open(pathname, 'wb') as outfile:
                                outfile.write(content)
                                adddoc(APP, join(str(fileid), filename), FILE_URL.format(fileid))
                            try:
                                text = process(pathname).decode()
                                ocr = len(text) < 5
                                if ocr:
                                    text = process(pathname, method='tesseract', language='ces').decode()
                            except:
                                text = ''
                                ocr = False
                            File.objects.update_or_create(
                                fileid=fileid,
                                defaults={
                                    'document': doc,
                                    'name': filename,
                                    'text': text,
                                    'ocr': ocr,
                                }
                            )
                        update_index(doc)
                        if not args or TEST:
                            sleep(.2)
                            for party in Party.objects.filter(check_uds=True):
                                if DocumentIndex.objects.filter(id=doc.id, text__search='"{}"'.format(party.party)):
                                    Retrieved.objects.update_or_create(
                                        uid_id=party.uid_id,
                                        party=party,
                                        document=doc)
                                    if party.uid.email:
                                        Party.objects.filter(id=party.id).update(notify=True)
                                    LOGGER.info(
                                        'New party "{}" detected for user "{}" ({:d})'
                                        .format(
                                            party.party,
                                            User.objects.get(pk=party.uid_id).username,
                                            party.uid_id))
                    except:
                        continue
                LOGGER.debug('Updated "{}", {:%Y-%m-%d}'.format(publisher.name, dat))
                if not args:
                    Publisher.objects.filter(id=publisher.id).update(updated=datetime.now())
            except:
                LOGGER.info('Failed to update "{}", {:%Y-%m-%d}'.format(publisher.name, dat))
        LOGGER.debug('Updated all publishers, {:%Y-%m-%d}'.format(dat))
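
A minimal sketch of the text-extraction fallback above, assuming `process` is textract.process (the import is not shown in the example): try plain extraction first, and re-run with Tesseract OCR and Czech language data when almost nothing comes back:

from textract import process  # assumed source of `process`

pathname = 'decision.pdf'  # assumed sample file
try:
    text = process(pathname).decode()
    ocr = len(text) < 5  # effectively empty -> probably a scanned document
    if ocr:
        text = process(pathname, method='tesseract', language='ces').decode()
except Exception:
    text = ''
    ocr = False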