Esempio n. 1
0
    def parse(self, response):
        """Build and save one ``DisclosureItem`` per NSW Health disclosure row.

        Every ``//tr/td`` cell of the response is converted to text with
        html2text.  Each ``<p>`` text starting with "PA" is treated as a
        reference number; the cell holding exactly that text is located and
        the four cells after it are read, in order, as disclosure date,
        description, release type and access instructions.

        Args:
            response: the downloaded page, handed to ``HtmlXPathSelector``.

        Returns:
            list: every item built, in the order the reference numbers were
            found.  Each item has already been saved via ``item.save()``.

        Raises:
            ValueError: if a reference number has no exactly matching cell
                (``list.index``).
            IndexError: if fewer than four cells follow a reference number.

        Errors raised by dateutil for an unreadable date propagate unchanged.
        """
        hxs = HtmlXPathSelector(response)
        cells = hxs.select('//tr/td')

        # The last two characters of each conversion are dropped (presumably
        # html2text's trailing blank line -- TODO confirm) so that a cell can
        # be matched against the raw <p> text below.
        rows_list = [
            html2text.html2text(cell.extract())[:-2] for cell in cells
        ]

        ref_nums = cells.select('p[starts-with(.,"PA")]/text()').extract()

        # A month token starting with S or N is rewritten to its three-letter
        # abbreviation before the date is handed to dateutil.
        month_fix = {'S': 'Sep', 'N': 'Nov'}

        items = []
        for ref_num in ref_nums:
            i = rows_list.index(ref_num)

            # Non-ASCII characters are encoded to '?', and both '?' and '-'
            # are then treated as separators.  A local copy is cleaned so
            # rows_list itself is never modified while it is being searched.
            date_text = rows_list[i + 1].encode('ascii', 'replace')
            date_text = date_text.replace('?', ' ').replace('-', ' ')

            # split() without an argument collapses runs of whitespace, so
            # the doubled spaces the replacements above can leave behind do
            # not break the day/month/year unpacking.
            parts = date_text.split()
            if len(parts) == 3:
                day, month, year = parts
                short_month = month_fix.get(month[0])
                if short_month is not None:
                    date_text = ' '.join((day, short_month, year))

            item = DisclosureItem()
            item['department'] = unicode('Department of Health')
            item['source'] = unicode(
                "http://www.health.nsw.gov.au/gipaa/disclosures.asp")
            item['ref_number'] = ref_num
            # Only the date part is kept, so the time fields of the stored
            # string are always 00:00:00.
            item['disclosure_date'] = dateutil.parser.parse(
                date_text).date().strftime("%Y-%m-%d %H:%M:%S")
            item['description'] = rows_list[i + 2]
            item['release_type'] = rows_list[i + 3]
            item['how_to_access'] = rows_list[i + 4]
            items.append(item)
            item.save()
        return items
Esempio n. 2
0
    def parse(self, response):
        """Scrape the NSW Health disclosure log into saved ``DisclosureItem``s.

        All ``//tr/td`` cells are flattened to text with html2text.  A
        ``<p>`` whose text starts with "PA" supplies a reference number; the
        cell whose text equals it is found, and the next four cells are taken
        as disclosure date, description, release type and access
        instructions respectively.

        Args:
            response: the downloaded page, handed to ``HtmlXPathSelector``.

        Returns:
            list: the items built, one per reference number, each already
            persisted with ``item.save()``.

        Raises:
            ValueError: if no cell text equals a reference number
                (``list.index``).
            IndexError: if a reference number is followed by fewer than four
                cells.

        Exceptions from dateutil for an unparseable date are not caught.
        """
        hxs = HtmlXPathSelector(response)
        cells = hxs.select('//tr/td')

        # Each conversion loses its last two characters (presumably the blank
        # line html2text appends -- TODO confirm), which lets the cell text
        # be compared with the <p> text extracted below.
        rows_list = []
        for cell in cells:
            rows_list.append(html2text.html2text(cell.extract())[:-2])

        ref_nums = cells.select('p[starts-with(.,"PA")]/text()').extract()

        items = []
        for ref_num in ref_nums:
            i = rows_list.index(ref_num)

            # Work on a copy of the date cell: characters outside ASCII turn
            # into '?', then '?' and '-' both become spaces.
            date_text = rows_list[i + 1].encode('ascii', 'replace')
            date_text = date_text.replace('?', ' ')
            date_text = date_text.replace('-', ' ')

            # Splitting on any whitespace run (rather than on single spaces)
            # keeps the three-way unpack from failing when the replacements
            # above produce consecutive spaces.
            tokens = date_text.split()
            if len(tokens) == 3:
                day, month, year = tokens
                # Month tokens starting with S or N are rewritten to their
                # three-letter abbreviations.
                if month[0] == 'S':
                    date_text = ''.join([day, ' Sep ', year])
                elif month[0] == 'N':
                    date_text = ''.join([day, ' Nov ', year])

            parsed = dateutil.parser.parse(date_text)

            item = DisclosureItem()
            item['department'] = unicode('Department of Health')
            item['source'] = unicode("http://www.health.nsw.gov.au/gipaa/disclosures.asp")
            item['ref_number'] = ref_num
            # .date() discards the time, so the formatted value always ends
            # in 00:00:00.
            item['disclosure_date'] = parsed.date().strftime("%Y-%m-%d %H:%M:%S")
            item['description'] = rows_list[i + 2]
            item['release_type'] = rows_list[i + 3]
            item['how_to_access'] = rows_list[i + 4]
            items.append(item)
            item.save()
        return items
Esempio n. 3
0
    def parse(self, response):
        """Build and save one ``DisclosureItem`` per Transport disclosure row.

        The ``<td>`` cells selected by the absolute XPath below are converted
        to text with html2text.  Each ``<p>`` text that starts with "1" and
        contains "-" is treated as a reference number; the cell holding
        exactly that text is located and the three cells after it are read,
        in order, as disclosure date, description and access instructions.
        ``release_type`` is always set to 'NA'.

        Args:
            response: the downloaded page, handed to ``HtmlXPathSelector``.

        Returns:
            list: every item built, each already saved via ``item.save()``.

        Raises:
            ValueError: if a reference number has no exactly matching cell
                (``list.index``).
            IndexError: if fewer than three cells follow a reference number.

        Errors raised by dateutil for an unreadable date propagate unchanged.
        """
        hxs = HtmlXPathSelector(response)
        cells = hxs.select(
            '/html/body/div/div/div/div/div/div/div/div/div/div/div/div/div/div/div/table/thead/tr/td'
        )

        # The last two characters of each conversion are dropped (presumably
        # html2text's trailing blank line -- TODO confirm) so that a cell can
        # be matched against the raw <p> text below.
        rows_list = [
            html2text.html2text(cell.extract())[:-2] for cell in cells
        ]

        ref_nums = cells.select(
            'p[starts-with(.,"1") and contains(.,"-")]/text()').extract()

        # A month token starting with S or N is rewritten to its three-letter
        # abbreviation before the date is handed to dateutil.
        month_fix = {'S': 'Sep', 'N': 'Nov'}

        items = []
        for ref_num in ref_nums:
            i = rows_list.index(ref_num)

            # Clean a local copy so rows_list is not modified while it is
            # still being searched.  split() without an argument tolerates
            # repeated or non-breaking whitespace between the date parts.
            date_text = rows_list[i + 1]
            parts = date_text.split()
            if len(parts) == 3:
                day, month, year = parts
                short_month = month_fix.get(month[0])
                if short_month is not None:
                    date_text = ' '.join((day, short_month, year))

            access = rows_list[i + 3]
            if "Email or written request to the address above." in access:
                # NOTE(review): this address looks redacted in this copy of
                # the source -- confirm the intended value.
                access = "*****@*****.**"

            item = DisclosureItem()
            item['department'] = unicode('Department of Transport')
            item['source'] = unicode(
                "http://www.transport.nsw.gov.au/content/disclosure-log")
            item['ref_number'] = ref_num
            # Only the date part is kept, so the time fields of the stored
            # string are always 00:00:00.
            item['disclosure_date'] = dateutil.parser.parse(
                date_text).date().strftime("%Y-%m-%d %H:%M:%S")
            item['description'] = rows_list[i + 2]
            item['how_to_access'] = access
            item['release_type'] = 'NA'
            items.append(item)
            item.save()
        return items
Esempio n. 4
0
    def parse(self, response):
        """Build and save one ``DisclosureItem`` per Communities disclosure row.

        Every ``//table[1]/tr/td`` cell is converted to text with html2text.
        Each ``<p>`` text that starts with "2" and contains "/" is treated as
        a reference number; the cell holding exactly that text is located and
        the four cells after it are read, in order, as disclosure date,
        release type, description and access instructions.

        Args:
            response: the downloaded page, handed to ``HtmlXPathSelector``.

        Returns:
            list: every item built, each already saved via ``item.save()``.

        Raises:
            ValueError: if a reference number has no exactly matching cell
                (``list.index``).
            IndexError: if fewer than four cells follow a reference number.

        Errors raised by dateutil for an unreadable date propagate unchanged.
        """
        hxs = HtmlXPathSelector(response)
        cells = hxs.select("//table[1]/tr/td")

        # The last two characters of each conversion are dropped (presumably
        # html2text's trailing blank line -- TODO confirm) so that a cell can
        # be matched against the raw <p> text below.
        rows_list = [
            html2text.html2text(cell.extract())[:-2] for cell in cells
        ]

        ref_nums = cells.select(
            'p[starts-with(.,"2") and contains(.,"/")]/text()').extract()

        # A month token starting with S or N is rewritten to its three-letter
        # abbreviation before the date is handed to dateutil.
        month_fix = {'S': 'Sep', 'N': 'Nov'}

        items = []
        for ref_num in ref_nums:
            i = rows_list.index(ref_num)

            # Clean a local copy so rows_list is not modified while it is
            # still being searched.  split() without an argument tolerates
            # repeated or non-breaking whitespace between the date parts.
            date_text = rows_list[i + 1]
            parts = date_text.split()
            if len(parts) == 3:
                day, month, year = parts
                short_month = month_fix.get(month[0])
                if short_month is not None:
                    date_text = ' '.join((day, short_month, year))

            item = DisclosureItem()
            item['department'] = unicode('Office of Communities')
            item['source'] = unicode(
                "http://www.communities.nsw.gov.au/gipainfo/gipa_disclosurelog.asp"
            )
            item['ref_number'] = ref_num
            # Only the date part is kept, so the time fields of the stored
            # string are always 00:00:00.
            item['disclosure_date'] = dateutil.parser.parse(
                date_text).date().strftime("%Y-%m-%d %H:%M:%S")
            # Column order on this page: date, release type, description,
            # access instructions.
            item['description'] = rows_list[i + 3]
            item['how_to_access'] = rows_list[i + 4]
            item['release_type'] = rows_list[i + 2]
            items.append(item)
            item.save()
        return items
Esempio n. 5
0
    def parse(self, response):
        """Scrape the Office of Communities disclosure log into saved items.

        All ``//table[1]/tr/td`` cells are flattened to text with html2text.
        A ``<p>`` whose text starts with "2" and contains "/" supplies a
        reference number; the cell whose text equals it is found, and the
        next four cells are taken as disclosure date, release type,
        description and access instructions respectively.

        Args:
            response: the downloaded page, handed to ``HtmlXPathSelector``.

        Returns:
            list: the ``DisclosureItem`` objects built, one per reference
            number, each already persisted with ``item.save()``.

        Raises:
            ValueError: if no cell text equals a reference number
                (``list.index``).
            IndexError: if a reference number is followed by fewer than four
                cells.

        Exceptions from dateutil for an unparseable date are not caught.
        """
        hxs = HtmlXPathSelector(response)
        cells = hxs.select("//table[1]/tr/td")

        # Each conversion loses its last two characters (presumably the blank
        # line html2text appends -- TODO confirm), which lets the cell text
        # be compared with the <p> text extracted below.
        rows_list = []
        for cell in cells:
            rows_list.append(html2text.html2text(cell.extract())[:-2])

        ref_nums = cells.select('p[starts-with(.,"2") and contains(.,"/")]/text()').extract()

        items = []
        for ref_num in ref_nums:
            i = rows_list.index(ref_num)

            # The date is cleaned in a local variable; rows_list stays
            # untouched.  Splitting on any whitespace run keeps the
            # three-way unpack from failing on doubled or non-breaking
            # spaces.
            date_text = rows_list[i + 1]
            tokens = date_text.split()
            if len(tokens) == 3:
                day, month, year = tokens
                # Month tokens starting with S or N are rewritten to their
                # three-letter abbreviations.
                if month[0] == 'S':
                    date_text = ''.join([day, ' Sep ', year])
                elif month[0] == 'N':
                    date_text = ''.join([day, ' Nov ', year])

            parsed = dateutil.parser.parse(date_text)

            item = DisclosureItem()
            item['department'] = unicode('Office of Communities')
            item['source'] = unicode("http://www.communities.nsw.gov.au/gipainfo/gipa_disclosurelog.asp")
            item['ref_number'] = ref_num
            # .date() discards the time, so the formatted value always ends
            # in 00:00:00.
            item['disclosure_date'] = parsed.date().strftime("%Y-%m-%d %H:%M:%S")
            item['release_type'] = rows_list[i + 2]
            item['description'] = rows_list[i + 3]
            item['how_to_access'] = rows_list[i + 4]
            items.append(item)
            item.save()
        return items
Esempio n. 6
0
    def parse(self, response):
        """Scrape the Department of Transport disclosure log into saved items.

        The ``<td>`` cells matched by the absolute XPath below are flattened
        to text with html2text.  A ``<p>`` whose text starts with "1" and
        contains "-" supplies a reference number; the cell whose text equals
        it is found, and the next three cells are taken as disclosure date,
        description and access instructions respectively.  ``release_type``
        is fixed to 'NA'.

        Args:
            response: the downloaded page, handed to ``HtmlXPathSelector``.

        Returns:
            list: the ``DisclosureItem`` objects built, one per reference
            number, each already persisted with ``item.save()``.

        Raises:
            ValueError: if no cell text equals a reference number
                (``list.index``).
            IndexError: if a reference number is followed by fewer than three
                cells.

        Exceptions from dateutil for an unparseable date are not caught.
        """
        hxs = HtmlXPathSelector(response)
        cells = hxs.select('/html/body/div/div/div/div/div/div/div/div/div/div/div/div/div/div/div/table/thead/tr/td')

        # Each conversion loses its last two characters (presumably the blank
        # line html2text appends -- TODO confirm), which lets the cell text
        # be compared with the <p> text extracted below.
        rows_list = []
        for cell in cells:
            rows_list.append(html2text.html2text(cell.extract())[:-2])

        ref_nums = cells.select('p[starts-with(.,"1") and contains(.,"-")]/text()').extract()

        items = []
        for ref_num in ref_nums:
            i = rows_list.index(ref_num)

            # The date is cleaned in a local variable; rows_list stays
            # untouched.  Splitting on any whitespace run keeps the
            # three-way unpack from failing on doubled or non-breaking
            # spaces.
            date_text = rows_list[i + 1]
            tokens = date_text.split()
            if len(tokens) == 3:
                day, month, year = tokens
                # Month tokens starting with S or N are rewritten to their
                # three-letter abbreviations.
                if month[0] == 'S':
                    date_text = ''.join([day, ' Sep ', year])
                elif month[0] == 'N':
                    date_text = ''.join([day, ' Nov ', year])

            parsed = dateutil.parser.parse(date_text)

            # NOTE(review): the substituted address looks redacted in this
            # copy of the source -- confirm the intended value.
            if "Email or written request to the address above." in rows_list[i + 3]:
                how_to_access = "*****@*****.**"
            else:
                how_to_access = rows_list[i + 3]

            item = DisclosureItem()
            item['department'] = unicode('Department of Transport')
            item['source'] = unicode("http://www.transport.nsw.gov.au/content/disclosure-log")
            item['ref_number'] = ref_num
            # .date() discards the time, so the formatted value always ends
            # in 00:00:00.
            item['disclosure_date'] = parsed.date().strftime("%Y-%m-%d %H:%M:%S")
            item['description'] = rows_list[i + 2]
            item['how_to_access'] = how_to_access
            item['release_type'] = 'NA'
            items.append(item)
            item.save()
        return items