# -*- coding: utf-8 -*-
#
# Scrapers for Norwegian public postal journals ("postlister"),
# ScraperWiki-style Python 2 code.

import datetime
import re
import urlparse
from collections import deque

import dateutil.parser
import lxml.html

import postlistelib
import scraperwiki


# Parser for journals published as one page per day ("Postlister for <date>"),
# with each entry spread over several table rows.
def fetch_postjournal_day(parser, url, html, saver):
    root = lxml.html.fromstring(html)
    listdate = dateutil.parser.parse(
        root.cssselect("h2")[0].text_content().replace("Postlister for ", ""),
        dayfirst=True)
    print listdate.date()

    # Collect the text of the first cell in every table row.  Each journal
    # entry spans 9 or 12 such lines, which are consumed from a queue below.
    entries = []
    for tr in root.cssselect("table.ui-corner-all tr"):
        tds = tr.cssselect("td")
        line = tds[0].text_content()
        entries.append(line)

    queue = deque(entries)
    datastore = []
    while queue:
        docdesc = (queue.popleft() + queue.popleft()).strip()
        casedesc = (queue.popleft() + queue.popleft()).replace(
            "Sakstittel:", "").strip()
        ref = queue.popleft().strip()
        arkivsaksref = re.sub(r"L.penr.:.+$", "", ref).replace(
            "Arkivsaksnr.:", "").strip()

        caseyear = 0
        caseseqnr = 0
        casedocseq = 0
        doctype = '?'
        caseid = 'unknown'
        matchObj = re.match(r'(\d+)/(\d+)\s*-\s*(\d+) (.+)$', arkivsaksref,
                            re.M | re.I)
        if matchObj:
            caseyear = matchObj.group(1)
            caseseqnr = matchObj.group(2)
            casedocseq = matchObj.group(3)
            doctype = matchObj.group(4)
            caseyear = expand_year(caseyear)
            caseid = str(caseyear) + "/" + str(caseseqnr)
        else:
            print "error: invalid Arkivsaksnr: " + arkivsaksref
            matchObj = re.match(r'(\d+)/(\d+)\s*-', arkivsaksref, re.M | re.I)
            if matchObj:
                caseyear = expand_year(matchObj.group(1))
                caseseqnr = matchObj.group(2)
                caseid = str(caseyear) + "/" + str(caseseqnr)

        laapenr = re.sub(r"^.+L.penr.:", "", ref)
        journalseqnr = 0
        journalyear = 0
        journalid = 'unknown'
        # Avoid broken/empty values
        if -1 != laapenr.find('/') and "/" != laapenr:
            journalseqnr, journalyear = laapenr.split("/")
            journalyear = expand_year(journalyear)
            journalid = str(journalyear) + "/" + str(journalseqnr)
        else:
            print u"error: invalid Løpenr: " + laapenr

        # Map agency specific document type codes to ones the parser accepts.
        if not parser.is_valid_doctype(doctype):
            doctype = {
                'S': 'N',
                'PLN': 'N',
                'Z': 'N',
            }[doctype]

        fratil = queue.popleft().replace("Fra/Til:", "").strip()
        if parser.is_sender_doctype(doctype):
            fratilfield = 'sender'
        elif parser.is_recipient_doctype(doctype):
            fratilfield = 'recipient'

        saksbehandler = queue.popleft().replace("Saksbehandler:", "").strip()
        saksansvarlig, bar = saksbehandler.split(" (")
        saksansvarligenhet, foo = bar.split(")")
        #print saksansvarligenhet

        recorddate = dateutil.parser.parse(
            queue.popleft().replace("Datert:", "").strip(), dayfirst=True)

        requesturl = queue.popleft().strip()
        exemption = ""
        if -1 != requesturl.find("Gradering"):
            exemption = requesturl.replace("Gradering:", "").strip()
            requesturl = queue.popleft()
            fratil = ""

        data = {
            'agency': parser.agency,
            'recorddate': recorddate.date(),
            'docdesc': docdesc,
            'casedesc': casedesc,
            'caseyear': int(caseyear),
            'caseseqnr': int(caseseqnr),
            'casedocseq': int(casedocseq),
            'caseid': caseid,
            'doctype': doctype,
            'journalseqnr': int(journalseqnr),
            'journalyear': int(journalyear),
            'journalid': journalid,
            fratilfield: fratil,
            'saksbehandler': saksbehandler,
            'saksansvarlig': saksansvarlig.strip(),
            'saksansvarligenhet': saksansvarligenhet.strip(),
            'arkivsaksref': arkivsaksref,
            'laapenr': laapenr,
            'exemption': exemption,
            'scrapedurl': url,
            'scrapestamputc': datetime.datetime.now()
        }
        # print data
        parser.verify_entry(data)
        datastore.append(data)

    saver(unique_keys=['arkivsaksref'], data=datastore)
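# The parsers in this file call an expand_year() helper that is not defined
# here.  The sketch below is only an assumption about what it does (turn
# two-digit years into four-digit ones); replace it with the real helper from
# the scraper library if one exists.
def expand_year(year):
    year = int(year)
    if year > 100:
        # Already a four-digit year.
        return year
    if year >= 50:
        # Assumed cutoff: 50-99 are interpreted as 1950-1999.
        return 1900 + year
    # Assumed cutoff: 00-49 are interpreted as 2000-2049.
    return 2000 + year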
# Variant for a different journal layout: entries sit in a paginated table
# with tr.yang rows, and "Neste" links are followed recursively.  It reuses
# the fetch_postjournal_day name, so it presumably lives in a separate
# scraper file in the original code.
def fetch_postjournal_day(parser, url, html, saver):
    root = lxml.html.fromstring(html.decode('utf-8'))

    # The record date is found in a div whose text starts with
    # "Offentlig postjournal for ".
    recorddate = None
    for div in root.cssselect('div'):
        divcontent = div.text_content()
        if 0 == divcontent.find("Offentlig postjournal for "):
            recorddate = dateutil.parser.parse(
                divcontent.replace("Offentlig postjournal for ", ""),
                dayfirst=True)
    print recorddate

    # Make sure we save the entire URL or nothing at all
    datastore = []
    for tr in root.cssselect('tr.yang'):
        tds = tr.cssselect("td")
        docidstr = tds[0].text_content().strip()
        docdate = tds[1].text_content().strip()
        doctype = tds[2].text_content().strip()
        docdesc = tds[3].text_content().strip()
        fratil = tds[4].text_content().strip()
        saksbehandler = tds[5].text_content().strip()
        if -1 != tds[6].text_content().find("Bestill"):
            exemption = None
        else:
            exemption = tds[6].text_content().strip()

        docdate = dateutil.parser.parse(docdate, dayfirst=True)
        # print doctype, docdesc
        if not parser.is_valid_doctype(doctype):
            doctype = {
                '': '?',
            }[doctype]
        if parser.is_sender_doctype(doctype):
            fratilfield = 'sender'
        elif parser.is_recipient_doctype(doctype):
            fratilfield = 'recipient'

        # The document id looks like "caseyear/caseseqnr-casedocseq".
        caseyear, caseseqnr = docidstr.split("/")
        caseyear = expand_year(caseyear)
        caseseqnr, casedocseq = caseseqnr.split("-")
        caseid = "%d/%d" % (int(caseyear), int(caseseqnr))

        data = {
            'agency': parser.agency,
            'recorddate': recorddate.date(),
            'docdate': docdate.date(),
            'docdesc': docdesc,
            'casedesc': docdesc,  # FIXME fake value
            'caseyear': int(caseyear),
            'caseseqnr': int(caseseqnr),
            'casedocseq': int(casedocseq),
            'caseid': caseid,
            'doctype': doctype,
            # 'journalseqnr' : int(journalseqnr),
            # 'journalyear' : int(journalyear),
            # 'journalid' : journalid,
            fratilfield: fratil,
            'saksbehandler': saksbehandler,
            # 'saksansvarlig' : saksansvarlig.strip(),
            # 'saksansvarligenhet' : saksansvarligenhet.strip(),
            'docidstr': docidstr,
            # 'laapenr' : laapenr,
            'exemption': exemption,
            'scrapedurl': url,
            'scrapestamputc': datetime.datetime.now()
        }
        # print data
        parser.verify_entry(data)
        datastore.append(data)

    seenurl = {}
    # Find next URL.  There are two on each page.
    for ahref in root.cssselect('a.next_page'):
        if 0 == ahref.text_content().find('Neste'):
            nexturl = urlparse.urljoin(url, ahref.attrib['href'])
            if nexturl not in seenurl:
                seenurl[nexturl] = True
                print 'Fetching ' + nexturl
                html = postlistelib.fetch_url_harder(nexturl)
                # Recurse into the next page, collecting its entries into
                # this call's datastore instead of saving them right away.
                mysaver = lambda unique_keys, data: datastore.extend(data)
                fetch_postjournal_day(parser=parser, url=nexturl,
                                      html=html, saver=mysaver)

    saver(unique_keys=['docidstr'], data=datastore)
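# Hedged usage sketch for the paginated fetch_postjournal_day() variant
# above: fetch one day page (dayurl is supplied by the calling scraper) and
# let scraperwiki.sqlite.save act as the saver callback, since it matches
# the saver(unique_keys=..., data=...) signature the function expects.
def scrape_day_example(parser, dayurl):
    html = postlistelib.fetch_url_harder(dayurl)
    fetch_postjournal_day(parser=parser, url=dayurl, html=html,
                          saver=scraperwiki.sqlite.save)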
# Parser for journals published as one table per document entry, keyed on
# the "Journaldato" given at the top of the page.  Entries are only saved if
# the page actually covers the requested date.
def save_date(parser, date, url, html):
    num_saved = 0
    root = lxml.html.fromstring(html)
    journal_date = dateutil.parser.parse(
        root.cssselect("p")[0].text_content().replace("Journaldato: ", ""),
        dayfirst=True)
    if date == journal_date.date():
        datastore = []
        for table in root.cssselect("table"):
            docid = table.cssselect("tr")[0].cssselect("p")[1].text.strip()
            datedesc = table.cssselect("tr")[0].cssselect("td")[3].cssselect(
                "p")[0].text.strip()
            exemption = table.cssselect("tr")[1].cssselect("td")[5].cssselect(
                "p")[0].text.strip()
            fratil_indicator = table.cssselect("tr")[2].cssselect(
                "td")[0].cssselect("p")[0].text.strip()
            doctype = ""
            if fratil_indicator.startswith("Til"):
                doctype = "U"
            elif fratil_indicator.startswith("Fra"):
                doctype = "I"
            elif fratil_indicator.startswith("Notat fra"):
                doctype = "N"
            else:
                raise ValueError("Fant ikke doctype %s" % fratil_indicator)
            fratil_agency = table.cssselect("tr")[2].cssselect(
                "td")[1].cssselect("p")[0].text.strip()
            casedesc = table.cssselect("tr")[4].cssselect("td")[1].cssselect(
                "p")[0].text.strip()
            docdesc = table.cssselect("tr")[5].cssselect("td")[1].cssselect(
                "p")[0].text.strip()
            saksb = table.cssselect("tr")[0].cssselect("p")[5].text.strip()
            docdate = dateutil.parser.parse(datedesc.strip(), dayfirst=True)

            matchObj = re.match(r'(\d+)/(\d+)\s*-\s*(\d+)$', docid,
                                re.M | re.I)
            if matchObj:
                caseyear = matchObj.group(1)
                caseseqnr = matchObj.group(2)
                casedocseq = matchObj.group(3)
                caseyear = expand_year(caseyear)
                caseid = str(caseyear) + "/" + str(caseseqnr)
            else:
                print "error: invalid Arkivsaksnr: " + docid
                matchObj = re.match(r'(\d+)/(\d+)\s*-', docid, re.M | re.I)
                if matchObj:
                    caseyear = expand_year(matchObj.group(1))
                    caseseqnr = matchObj.group(2)
                    caseid = str(caseyear) + "/" + str(caseseqnr)

            if parser.is_sender_doctype(doctype):
                fratilfield = 'sender'
            elif parser.is_recipient_doctype(doctype):
                fratilfield = 'recipient'

            data = {
                # 'agency' is expected to be defined at module level by the
                # scraper using this function.
                'agency': agency,
                'docdate': docdate.date(),
                'recorddate': journal_date.date(),
                'docdesc': docdesc,
                'casedesc': casedesc,
                'caseid': caseid,
                'docid': docid,
                'caseyear': caseyear,
                'caseseqnr': caseseqnr,
                'casedocseq': casedocseq,
                fratilfield: fratil_agency,
                'doctype': doctype,
                'saksbehandler': saksb,
                'exemption': exemption,
                'scrapedurl': url,
                'scrapestamputc': datetime.datetime.now()
            }
            parser.verify_entry(data)
            datastore.append(data)
            # Save each entry as soon as it is parsed.
            scraperwiki.sqlite.save(unique_keys=['docid'], data=datastore)
            num_saved += len(datastore)
            datastore = []
            #print "Saved %s" % data['caseid']
    else:
        # TODO: log error or exit?
        msg = "Tried to scrape %s but got %s" % (date, journal_date.date())
        #raise ValueError(msg)
        print msg
    return num_saved
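# Hedged usage sketch for save_date() above: build the URL for one day's
# journal page (the URL pattern here is a hypothetical placeholder), fetch
# it, and let save_date() verify that the page covers the requested date
# before saving.
def scrape_single_day_example(parser, day):
    dayurl = 'http://example.com/postjournal?dato=%s' % day.isoformat()  # hypothetical URL
    html = postlistelib.fetch_url_harder(dayurl)
    return save_date(parser, day, dayurl, html)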