def fetch_postjournal_day(parser, url, html, saver):
    """Parse one day's "Postlister" HTML page and store its journal entries.

    parser -- site-specific helper: provides agency, the doctype predicates
              (is_valid_doctype/is_sender_doctype/is_recipient_doctype) and
              verify_entry().
    url    -- page URL, recorded with every entry as 'scrapedurl'.
    html   -- raw HTML of the day page.
    saver  -- callable(unique_keys=..., data=...) persisting the entries.

    NOTE(review): this body was reconstructed from whitespace-mangled
    source; statement grouping inside the "Gradering" branch is the most
    plausible reading and should be confirmed against upstream history.
    """
    root = lxml.html.fromstring(html)
    # The page heading carries the date, e.g. "Postlister for 01.02.2012".
    listdate = dateutil.parser.parse(
        root.cssselect("h2")[0].text_content().replace("Postlister for ", ""),
        dayfirst=True)
    print listdate.date()
    # Flatten the journal table into a list of text lines (first cell of
    # each row); each entry occupies a fixed run of consecutive lines.
    entries = []
    for tr in root.cssselect("table.ui-corner-all tr"):
        tds = tr.cssselect("td")
        line = tds[0].text_content()
        entries.append(line)  # 9 or 12 lines per entry
    queue = deque(entries)
    datastore = []
    while queue:
        # The two title fields span two lines each.
        docdesc = (queue.popleft() + queue.popleft()).strip()
        casedesc = (queue.popleft() + queue.popleft()).replace(
            "Sakstittel:", "").strip()
        # One line holds both "Arkivsaksnr.:" and "Løpenr.:"; split it in two.
        ref = queue.popleft().strip()
        arkivsaksref = re.sub(r"L.penr.:.+$", "",
                              ref).replace("Arkivsaksnr.:", "").strip()
        caseyear = 0
        caseseqnr = 0
        casedocseq = 0
        doctype = '?'
        caseid = 'unknown'
        # Expected shape: "YEAR/SEQ - DOCSEQ TYPE" (e.g. "11/123 - 4 I").
        matchObj = re.match(r'(\d+)/(\d+)\s*-\s*(\d+) (.+)$', arkivsaksref,
                            re.M | re.I)
        if matchObj:
            caseyear = matchObj.group(1)
            caseseqnr = matchObj.group(2)
            casedocseq = matchObj.group(3)
            doctype = matchObj.group(4)
            caseyear = expand_year(caseyear)
            caseid = str(caseyear) + "/" + str(caseseqnr)
        else:
            print "error: invalid Arkivsaksnr: " + arkivsaksref
            # Fall back to a looser pattern to at least salvage the case id.
            matchObj = re.match(r'(\d+)/(\d+)\s*-', arkivsaksref, re.M | re.I)
            if matchObj:
                caseyear = expand_year(matchObj.group(1))
                caseseqnr = matchObj.group(2)
                caseid = str(caseyear) + "/" + str(caseseqnr)
        laapenr = re.sub(r"^.+L.penr.:", "", ref)
        journalseqnr = 0
        journalyear = 0
        journalid = 'unknown'
        if -1 != laapenr.find(
                '/') and "/" != laapenr:  # Avoid broken/empty values
            journalseqnr, journalyear = laapenr.split("/")
            journalyear = expand_year(journalyear)
            journalid = str(journalyear) + "/" + str(journalseqnr)
        else:
            print u"error: invalid Løpenr: " + laapenr
        if not parser.is_valid_doctype(doctype):
            # Map the known odd codes to 'N'; any other unknown code raises
            # KeyError so new codes are noticed rather than silently stored.
            doctype = {
                'S': 'N',
                'PLN': 'N',
                'Z': 'N',
            }[doctype]
        fratil = queue.popleft().replace("Fra/Til:", "").strip()
        if parser.is_sender_doctype(doctype):
            fratilfield = 'sender'
        elif parser.is_recipient_doctype(doctype):
            fratilfield = 'recipient'
        # NOTE(review): fratilfield stays unbound if doctype is neither a
        # sender nor a recipient type, which makes the data dict below raise
        # NameError -- confirm the parser predicates cover every doctype.
        saksbehandler = queue.popleft().replace("Saksbehandler:", "").strip()
        # "Name (Unit)" -> responsible person and their unit.
        saksansvarlig, bar = saksbehandler.split(" (")
        saksansvarligenhet, foo = bar.split(")")
        #print saksansvarligenhet
        recorddate = dateutil.parser.parse(queue.popleft().replace(
            "Datert:", "").strip(), dayfirst=True)
        requesturl = queue.popleft().strip()
        exemption = ""
        if -1 != requesturl.find("Gradering"):
            # An exempt entry inserts a "Gradering:" line where the URL
            # normally is; the real URL follows on the next line and the
            # sender/recipient value is blanked.
            exemption = requesturl.replace("Gradering:", "").strip()
            requesturl = queue.popleft()
            fratil = ""
        data = {
            'agency': parser.agency,
            'recorddate': recorddate.date(),
            'docdesc': docdesc,
            'casedesc': casedesc,
            'caseyear': int(caseyear),
            'caseseqnr': int(caseseqnr),
            'casedocseq': int(casedocseq),
            'caseid': caseid,
            'doctype': doctype,
            'journalseqnr': int(journalseqnr),
            'journalyear': int(journalyear),
            'journalid': journalid,
            fratilfield: fratil,
            'saksbehandler': saksbehandler,
            'saksansvarlig': saksansvarlig.strip(),
            'saksansvarligenhet': saksansvarligenhet.strip(),
            'arkivsaksref': arkivsaksref,
            'laapenr': laapenr,
            'exemption': exemption,
            'scrapedurl': url,
            'scrapestamputc': datetime.datetime.now()
        }
        # print data
        parser.verify_entry(data)
        datastore.append(data)
    saver(unique_keys=['arkivsaksref'], data=datastore)
def fetch_postjournal_day(parser, url, html, saver):
    """Scrape one public postjournal page and follow its "Neste" links.

    Entries from this page and from every linked follow-up page are
    accumulated in one list and handed to saver() in a single call at the
    end, so a day is stored completely or not at all.

    parser -- site-specific helper (agency, doctype predicates,
              verify_entry).
    url    -- page URL; stored per entry and used to resolve relative
              next-page links.
    html   -- raw page bytes, decoded here as UTF-8.
    saver  -- callable(unique_keys=..., data=...) persisting the entries.
    """
    root = lxml.html.fromstring(html.decode('utf-8'))
    recorddate = None
    # The journal date is announced in a div reading
    # "Offentlig postjournal for <date>".
    for div in root.cssselect('div'):
        divcontent = div.text_content()
        if 0 == divcontent.find("Offentlig postjournal for "):
            recorddate = dateutil.parser.parse(
                divcontent.replace("Offentlig postjournal for ", ""),
                dayfirst=True)
    print recorddate
    # NOTE(review): recorddate stays None when no such div exists, making
    # recorddate.date() below raise AttributeError -- confirm every page
    # carries the header.

    # Make sure we save the entire URL or nothing at all
    datastore = []
    for tr in root.cssselect('tr.yang'):
        tds = tr.cssselect("td")
        docidstr = tds[0].text_content().strip()
        docdate = tds[1].text_content().strip()
        doctype = tds[2].text_content().strip()
        docdesc = tds[3].text_content().strip()
        fratil = tds[4].text_content().strip()
        saksbehandler = tds[5].text_content().strip()
        # A cell containing "Bestill" presumably holds an order link rather
        # than an exemption note, so no exemption is recorded for it.
        if -1 != tds[6].text_content().find("Bestill"):
            exemption = None
        else:
            exemption = tds[6].text_content().strip()
        docdate = dateutil.parser.parse(docdate, dayfirst=True)
        # print doctype, docdesc
        if not parser.is_valid_doctype(doctype):
            # Only the empty code has a known fallback; any other unknown
            # code raises KeyError on purpose so it gets noticed.
            doctype = {
                '' : '?',
                }[doctype]
        if parser.is_sender_doctype(doctype):
            fratilfield = 'sender'
        elif parser.is_recipient_doctype(doctype):
            fratilfield = 'recipient'
        # NOTE(review): fratilfield is unset when doctype is neither kind,
        # which makes the data dict below raise NameError -- confirm the
        # predicates cover every valid doctype.
        # docidstr has the shape "YY/SEQ-DOCSEQ".
        caseyear, caseseqnr = docidstr.split("/")
        caseyear = expand_year(caseyear)
        caseseqnr, casedocseq = caseseqnr.split("-")
        caseid = "%d/%d" % (int(caseyear), int(caseseqnr))
        data = {
            'agency' : parser.agency,
            'recorddate' : recorddate.date(),
            'docdate' : docdate.date(),
            'docdesc' : docdesc,
            'casedesc' : docdesc, # FIXME fake value
            'caseyear' : int(caseyear),
            'caseseqnr' : int(caseseqnr),
            'casedocseq' : int(casedocseq),
            'caseid' : caseid,
            'doctype' : doctype,
#            'journalseqnr' : int(journalseqnr),
#            'journalyear' : int(journalyear),
#            'journalid' : journalid,
            fratilfield : fratil,
            'saksbehandler' : saksbehandler,
#            'saksansvarlig' : saksansvarlig.strip(),
#            'saksansvarligenhet' : saksansvarligenhet.strip(),
            'docidstr' : docidstr,
#            'laapenr' : laapenr,
            'exemption' : exemption,
            'scrapedurl' : url,
            'scrapestamputc' : datetime.datetime.now()
            }
        # print data
        parser.verify_entry(data)
        datastore.append(data)
    seenurl = {}
    # Find next URL.  There are two on each page.
    for ahref in root.cssselect('a.next_page'):
        if 0 == ahref.text_content().find('Neste'):
            nexturl = urlparse.urljoin(url, ahref.attrib['href'])
            if nexturl not in seenurl:
                seenurl[nexturl] = True;
                print 'Fetching ' + nexturl
                html = postlistelib.fetch_url_harder(nexturl)
                # Recursive calls only collect into datastore; a single
                # saver() call below persists everything at once.
                mysaver = lambda unique_keys, data: datastore.extend(data)
                fetch_postjournal_day(parser=parser, url=nexturl,
                                      html=html, saver=mysaver)
    saver(unique_keys=['docidstr'], data=datastore)
def fetch_postjournal_day(parser, url, html, saver): root = lxml.html.fromstring(html.decode('utf-8')) recorddate = None for div in root.cssselect('div'): divcontent = div.text_content() if 0 == divcontent.find("Offentlig postjournal for "): recorddate = dateutil.parser.parse(divcontent.replace( "Offentlig postjournal for ", ""), dayfirst=True) print recorddate # Make sure we save the entire URL or nothing at all datastore = [] for tr in root.cssselect('tr.yang'): tds = tr.cssselect("td") docidstr = tds[0].text_content().strip() docdate = tds[1].text_content().strip() doctype = tds[2].text_content().strip() docdesc = tds[3].text_content().strip() fratil = tds[4].text_content().strip() saksbehandler = tds[5].text_content().strip() if -1 != tds[6].text_content().find("Bestill"): exemption = None else: exemption = tds[6].text_content().strip() docdate = dateutil.parser.parse(docdate, dayfirst=True) # print doctype, docdesc if not parser.is_valid_doctype(doctype): doctype = { '': '?', }[doctype] if parser.is_sender_doctype(doctype): fratilfield = 'sender' elif parser.is_recipient_doctype(doctype): fratilfield = 'recipient' caseyear, caseseqnr = docidstr.split("/") caseyear = expand_year(caseyear) caseseqnr, casedocseq = caseseqnr.split("-") caseid = "%d/%d" % (int(caseyear), int(caseseqnr)) data = { 'agency': parser.agency, 'recorddate': recorddate.date(), 'docdate': docdate.date(), 'docdesc': docdesc, 'casedesc': docdesc, # FIXME fake value 'caseyear': int(caseyear), 'caseseqnr': int(caseseqnr), 'casedocseq': int(casedocseq), 'caseid': caseid, 'doctype': doctype, # 'journalseqnr' : int(journalseqnr), # 'journalyear' : int(journalyear), # 'journalid' : journalid, fratilfield: fratil, 'saksbehandler': saksbehandler, # 'saksansvarlig' : saksansvarlig.strip(), # 'saksansvarligenhet' : saksansvarligenhet.strip(), 'docidstr': docidstr, # 'laapenr' : laapenr, 'exemption': exemption, 'scrapedurl': url, 'scrapestamputc': datetime.datetime.now() } # print data 
parser.verify_entry(data) datastore.append(data) seenurl = {} # Find next URL. There are two on each page. for ahref in root.cssselect('a.next_page'): if 0 == ahref.text_content().find('Neste'): nexturl = urlparse.urljoin(url, ahref.attrib['href']) if nexturl not in seenurl: seenurl[nexturl] = True print 'Fetching ' + nexturl html = postlistelib.fetch_url_harder(nexturl) mysaver = lambda unique_keys, data: datastore.extend(data) fetch_postjournal_day(parser=parser, url=nexturl, html=html, saver=mysaver) saver(unique_keys=['docidstr'], data=datastore)
def fetch_postjournal_day(parser, url, html, saver): root = lxml.html.fromstring(html) listdate = dateutil.parser.parse(root.cssselect("h2")[0].text_content().replace("Postlister for ",""), dayfirst=True) print listdate.date() entries = [] for tr in root.cssselect("table.ui-corner-all tr"): tds = tr.cssselect("td") line = tds[0].text_content() entries.append(line) # 9 or 12 lines per entry queue = deque(entries) datastore = [] while queue: docdesc = (queue.popleft() + queue.popleft()).strip() casedesc = (queue.popleft() + queue.popleft()).replace("Sakstittel:", "").strip() ref = queue.popleft().strip() arkivsaksref = re.sub(r"L.penr.:.+$", "", ref).replace("Arkivsaksnr.:","").strip() caseyear = 0 caseseqnr = 0 casedocseq = 0 doctype = '?' caseid = 'unknown' matchObj = re.match( r'(\d+)/(\d+)\s*-\s*(\d+) (.+)$', arkivsaksref, re.M|re.I) if matchObj: caseyear = matchObj.group(1) caseseqnr = matchObj.group(2) casedocseq = matchObj.group(3) doctype = matchObj.group(4) caseyear = expand_year(caseyear) caseid = str(caseyear) + "/" + str(caseseqnr) else: print "error: invalid Arkivsaksnr: " + arkivsaksref matchObj = re.match( r'(\d+)/(\d+)\s*-', arkivsaksref, re.M|re.I) if matchObj: caseyear = expand_year(matchObj.group(1)) caseseqnr = matchObj.group(2) caseid = str(caseyear) + "/" + str(caseseqnr) laapenr = re.sub(r"^.+L.penr.:", "", ref) journalseqnr = 0 journalyear = 0 journalid = 'unknown' if -1 != laapenr.find('/') and "/" != laapenr: # Avoid broken/empty values journalseqnr, journalyear = laapenr.split("/") journalyear = expand_year(journalyear) journalid = str(journalyear) + "/" + str(journalseqnr) else: print u"error: invalid Løpenr: " + laapenr if not parser.is_valid_doctype(doctype): doctype = { 'S' : 'N', 'PLN' : 'N', 'Z' : 'N', }[doctype] fratil = queue.popleft().replace("Fra/Til:", "").strip() if parser.is_sender_doctype(doctype): fratilfield = 'sender' elif parser.is_recipient_doctype(doctype): fratilfield = 'recipient' saksbehandler = 
queue.popleft().replace("Saksbehandler:","").strip() saksansvarlig, bar = saksbehandler.split(" (") saksansvarligenhet, foo = bar.split(")") #print saksansvarligenhet recorddate = dateutil.parser.parse(queue.popleft().replace("Datert:","").strip(), dayfirst=True) requesturl = queue.popleft().strip() exemption = "" if -1 != requesturl.find("Gradering"): exemption = requesturl.replace("Gradering:", "").strip() requesturl = queue.popleft() fratil = "" data = { 'agency' : parser.agency, 'recorddate' : recorddate.date(), 'docdesc' : docdesc, 'casedesc' : casedesc, 'caseyear' : int(caseyear), 'caseseqnr' : int(caseseqnr), 'casedocseq' : int(casedocseq), 'caseid' : caseid, 'doctype' : doctype, 'journalseqnr' : int(journalseqnr), 'journalyear' : int(journalyear), 'journalid' : journalid, fratilfield : fratil, 'saksbehandler' : saksbehandler, 'saksansvarlig' : saksansvarlig.strip(), 'saksansvarligenhet' : saksansvarligenhet.strip(), 'arkivsaksref' : arkivsaksref, 'laapenr' : laapenr, 'exemption' : exemption, 'scrapedurl' : url, 'scrapestamputc' : datetime.datetime.now() } # print data parser.verify_entry(data) datastore.append(data) saver(unique_keys=['arkivsaksref'], data=datastore)