Example #1
0
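Both examples call expand_year(), a helper defined elsewhere in this scraper project. Judging from how it is applied to the two-digit year in references like "12/1234-5", it presumably widens such years to four digits; a minimal sketch under that assumption (the fixed 2000 offset is a guess, not the project's actual rule):

def expand_year(year):
    # Hypothetical reimplementation: treat all two-digit years as 20xx.
    year = int(year)
    if year < 100:
        year = year + 2000
    return year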
import datetime
import re
from collections import deque

import dateutil.parser
import lxml.html


def fetch_postjournal_day(parser, url, html, saver):
    root = lxml.html.fromstring(html)

    listdate = dateutil.parser.parse(
        root.cssselect("h2")[0].text_content().replace("Postlister for ", ""),
        dayfirst=True)
    print(listdate.date())

    entries = []
    for tr in root.cssselect("table.ui-corner-all tr"):
        tds = tr.cssselect("td")
        if not tds:
            continue  # skip header rows that only contain <th> cells
        line = tds[0].text_content()
        entries.append(line)


    # 9 or 12 lines per entry
    queue = deque(entries)
    datastore = []
    while queue:
        docdesc = (queue.popleft() + queue.popleft()).strip()

        casedesc = (queue.popleft() + queue.popleft()).replace(
            "Sakstittel:", "").strip()

        ref = queue.popleft().strip()
        # The "." in "L.penr" matches the ø of "Løpenr" regardless of how
        # the page encodes it.
        arkivsaksref = re.sub(r"L.penr.:.+$", "",
                              ref).replace("Arkivsaksnr.:", "").strip()

        caseyear = 0
        caseseqnr = 0
        casedocseq = 0
        doctype = '?'
        caseid = 'unknown'
        # Arkivsaksnr references look like "12/1234-5 U": year/seqnr-docseq
        # followed by the document type code.
        matchObj = re.match(r'(\d+)/(\d+)\s*-\s*(\d+) (.+)$', arkivsaksref,
                            re.M | re.I)
        if matchObj:
            caseyear = matchObj.group(1)
            caseseqnr = matchObj.group(2)
            casedocseq = matchObj.group(3)
            doctype = matchObj.group(4)
            caseyear = expand_year(caseyear)
            caseid = str(caseyear) + "/" + str(caseseqnr)
        else:
            print "error: invalid Arkivsaksnr: " + arkivsaksref
            matchObj = re.match(r'(\d+)/(\d+)\s*-', arkivsaksref, re.M | re.I)
            if matchObj:
                caseyear = expand_year(matchObj.group(1))
                caseseqnr = matchObj.group(2)
                caseid = str(caseyear) + "/" + str(caseseqnr)

        laapenr = re.sub(r"^.+L.penr.:", "", ref)
        journalseqnr = 0
        journalyear = 0
        journalid = 'unknown'
        # Avoid broken/empty values like a bare "/" or a missing separator.
        if -1 != laapenr.find('/') and "/" != laapenr:
            journalseqnr, journalyear = laapenr.split("/")
            journalyear = expand_year(journalyear)
            journalid = str(journalyear) + "/" + str(journalseqnr)
        else:
            print u"error: invalid Løpenr: " + laapenr

        if not parser.is_valid_doctype(doctype):
            # Remap agency-specific codes to standard ones; an unknown
            # code raises KeyError so new variants get noticed.
            doctype = {
                'S': 'N',
                'PLN': 'N',
                'Z': 'N',
            }[doctype]

        fratil = queue.popleft().replace("Fra/Til:", "").strip()
        if parser.is_sender_doctype(doctype):
            fratilfield = 'sender'
        elif parser.is_recipient_doctype(doctype):
            fratilfield = 'recipient'
        else:
            # Fail loudly here instead of with a NameError when the
            # data dictionary is built below.
            raise ValueError("unexpected doctype " + doctype)

        # "Saksbehandler: Some Name (UNIT)" -> name and unit.
        saksbehandler = queue.popleft().replace("Saksbehandler:", "").strip()
        saksansvarlig, rest = saksbehandler.split(" (", 1)
        saksansvarligenhet = rest.split(")", 1)[0]

        recorddate = dateutil.parser.parse(
            queue.popleft().replace("Datert:", "").strip(), dayfirst=True)

        requesturl = queue.popleft().strip()

        exemption = ""
        if -1 != requesturl.find("Gradering"):
            exemption = requesturl.replace("Gradering:", "").strip()
            requesturl = queue.popleft()
            fratil = ""

        data = {
            'agency': parser.agency,
            'recorddate': recorddate.date(),
            'docdesc': docdesc,
            'casedesc': casedesc,
            'caseyear': int(caseyear),
            'caseseqnr': int(caseseqnr),
            'casedocseq': int(casedocseq),
            'caseid': caseid,
            'doctype': doctype,
            'journalseqnr': int(journalseqnr),
            'journalyear': int(journalyear),
            'journalid': journalid,
            fratilfield: fratil,
            'saksbehandler': saksbehandler,
            'saksansvarlig': saksansvarlig.strip(),
            'saksansvarligenhet': saksansvarligenhet.strip(),
            'arkivsaksref': arkivsaksref,
            'laapenr': laapenr,
            'exemption': exemption,
            'scrapedurl': url,
            'scrapestamputc': datetime.datetime.utcnow()  # field is UTC
        }

        # print(data)
        parser.verify_entry(data)
        datastore.append(data)
    saver(unique_keys=['arkivsaksref'], data=datastore)
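For context, here is a minimal harness for driving the function above. Everything in it is hypothetical: the real parser object comes from the surrounding scraper framework and supplies agency, the doctype predicates and verify_entry(), while the saver normally writes to the scraper's datastore.

class DemoParser:
    # Hypothetical stand-in for the framework's parser object.
    agency = "Demo kommune"

    def is_valid_doctype(self, doctype):
        return doctype in ('I', 'U', 'N', 'X')

    def is_sender_doctype(self, doctype):
        return doctype in ('I', 'N', 'X')  # assumption made for this demo

    def is_recipient_doctype(self, doctype):
        return doctype == 'U'

    def verify_entry(self, data):
        pass  # the real implementation validates the entry fields


def print_saver(unique_keys, data):
    for entry in data:
        print(entry['caseid'], entry['doctype'], entry['docdesc'])


with open("postliste.html", "rb") as f:  # a saved copy of a journal page
    fetch_postjournal_day(DemoParser(), "http://example.com/postliste",
                          f.read(), print_saver)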
Example #3
0
import datetime
import urllib.parse

import dateutil.parser
import lxml.html

import postlistelib  # project-local module providing fetch_url_harder()


def fetch_postjournal_day(parser, url, html, saver):
    root = lxml.html.fromstring(html.decode('utf-8'))

    recorddate = None
    for div in root.cssselect('div'):
        divcontent = div.text_content()
        if 0 == divcontent.find("Offentlig postjournal for "):
            recorddate = dateutil.parser.parse(divcontent.replace(
                "Offentlig postjournal for ", ""),
                                               dayfirst=True)
    if recorddate is None:
        # Fail early rather than crash on recorddate.date() below.
        raise ValueError("no record date found on " + url)
    print(recorddate)

    # Collect every entry first so we save the entire day or nothing at all
    datastore = []
    for tr in root.cssselect('tr.yang'):
        tds = tr.cssselect("td")
        docidstr = tds[0].text_content().strip()
        docdate = tds[1].text_content().strip()
        doctype = tds[2].text_content().strip()
        docdesc = tds[3].text_content().strip()
        fratil = tds[4].text_content().strip()
        saksbehandler = tds[5].text_content().strip()
        # A "Bestill" (order) link means the document can be requested,
        # i.e. there is no exemption from disclosure.
        if -1 != tds[6].text_content().find("Bestill"):
            exemption = None
        else:
            exemption = tds[6].text_content().strip()

        docdate = dateutil.parser.parse(docdate, dayfirst=True)

        # print(doctype, docdesc)
        if not parser.is_valid_doctype(doctype):
            # Only the empty string is expected here; anything else
            # raises KeyError so new codes get noticed.
            doctype = {
                '': '?',
            }[doctype]
        if parser.is_sender_doctype(doctype):
            fratilfield = 'sender'
        elif parser.is_recipient_doctype(doctype):
            fratilfield = 'recipient'
        else:
            # Fail loudly here instead of with a NameError when the
            # data dictionary is built below.
            raise ValueError("unexpected doctype " + doctype)

        # docidstr looks like "12/1234-5": year/case seqnr - document seqnr.
        caseyear, caseseqnr = docidstr.split("/")
        caseyear = expand_year(caseyear)
        caseseqnr, casedocseq = caseseqnr.split("-")
        caseid = "%d/%d" % (int(caseyear), int(caseseqnr))

        data = {
            'agency': parser.agency,
            'recorddate': recorddate.date(),
            'docdate': docdate.date(),
            'docdesc': docdesc,
            'casedesc': docdesc,  # FIXME fake value
            'caseyear': int(caseyear),
            'caseseqnr': int(caseseqnr),
            'casedocseq': int(casedocseq),
            'caseid': caseid,
            'doctype': doctype,

            #        'journalseqnr' : int(journalseqnr),
            #        'journalyear' : int(journalyear),
            #        'journalid' : journalid,
            fratilfield: fratil,
            'saksbehandler': saksbehandler,
            #        'saksansvarlig' : saksansvarlig.strip(),
            #        'saksansvarligenhet' : saksansvarligenhet.strip(),
            'docidstr': docidstr,
            #        'laapenr' : laapenr,
            'exemption': exemption,
            'scrapedurl': url,
            'scrapestamputc': datetime.datetime.utcnow()  # field is UTC
        }

        # print(data)
        parser.verify_entry(data)
        datastore.append(data)

    seenurl = {}
    # Find the next page URL.  There are two "Neste" (next) links on
    # each page, hence the seenurl bookkeeping.
    for ahref in root.cssselect('a.next_page'):
        if 0 == ahref.text_content().find('Neste'):
            nexturl = urllib.parse.urljoin(url, ahref.attrib['href'])
            if nexturl not in seenurl:
                seenurl[nexturl] = True
                print('Fetching ' + nexturl)
                html = postlistelib.fetch_url_harder(nexturl)
                # Recurse with a saver that just appends to this call's
                # datastore, so the whole day is saved in one batch.
                mysaver = lambda unique_keys, data: datastore.extend(data)
                fetch_postjournal_day(parser=parser,
                                      url=nexturl,
                                      html=html,
                                      saver=mysaver)

    saver(unique_keys=['docidstr'], data=datastore)
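The saver callback is the persistence contract: it receives the unique-key list and the accumulated rows once the whole day (including all "Neste" pages) has been collected. As an illustration only, here is a sketch of a saver backed by sqlite3; the table name, schema and column subset are assumptions, not part of the original scraper, which stored rows through its own framework.

import sqlite3

def sqlite_saver(unique_keys, data):
    # Hypothetical schema; the PRIMARY KEY mirrors unique_keys=['docidstr'].
    conn = sqlite3.connect("postjournal.db")
    conn.execute("CREATE TABLE IF NOT EXISTS entries ("
                 "docidstr TEXT PRIMARY KEY, agency TEXT, recorddate TEXT, "
                 "doctype TEXT, docdesc TEXT)")
    for entry in data:
        conn.execute(
            "INSERT OR REPLACE INTO entries VALUES (?, ?, ?, ?, ?)",
            (entry['docidstr'], entry['agency'], str(entry['recorddate']),
             entry['doctype'], entry['docdesc']))
    conn.commit()
    conn.close()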