Ejemplo n.º 1
0
def getNewItems(root):
    # Walk every third-level listing link on the index page, fetch the
    # dossier page behind it, and scrape any dossier whose source URL is
    # not yet recorded in the database.
    listing_links = root.xpath('//td[@class="listlevelthree"]/../td/a')
    for link in listing_links:
        dossier_page = fetch((URL + link.attrib['href']).encode('utf8'))
        for anchor in dossier_page.xpath('//a[@class="com_acronym"]'):
            source_url = URL + anchor.attrib['href']
            # Skip dossiers we have already stored; truthiness check kept
            # identical to the original (find_one returns None or a doc).
            if db.dossiers.find_one({'meta.source': source_url}):
                continue
            oeil_scrape(source_url)
Ejemplo n.º 2
0
def getNewItems(root):
    # For each link under a third-level listing cell, load the linked
    # dossier page and scrape all dossiers not already in the database.
    for row_link in root.xpath('//td[@class="listlevelthree"]/../td/a'):
        href = row_link.attrib['href']
        page = fetch((URL + href).encode('utf8'))
        acronym_links = page.xpath('//a[@class="com_acronym"]')
        for acronym in acronym_links:
            candidate = URL + acronym.attrib['href']
            known = db.dossiers.find_one({'meta.source': candidate})
            # Only scrape dossiers whose source URL is new to us.
            if not known:
                oeil_scrape(candidate)
Ejemplo n.º 3
0
def scrape(url):
    """Scrape the site root page.

    If the site's "Data updated on" stamp differs from the cached copy,
    re-scrape every dossier whose procedure stage is still in STAGES and
    refresh the cache file; then always search for new items.

    Returns True on completion.
    """
    root = fetch(url)
    # Fix: compute the site's update stamp once (the original ran the same
    # xpath query twice, at the comparison and again at the cache write).
    site_stamp = strip(root.xpath('//div[text()="Data updated on :"]/span/text()')[0])
    # Fix: the original `open(...).read()` leaked the file handle; read the
    # cached stamp through `with` so the file is always closed.
    cached_stamp = None
    if exists(LAST_UPDATED_CACHE):
        with open(LAST_UPDATED_CACHE) as cache:
            cached_stamp = cache.read()
    # None (no cache file) never equals the string stamp, matching the
    # original `not exists(...) or ... != ...` condition.
    if cached_stamp != site_stamp:
        print >>sys.stderr, '[!] Site modification found, scraping unfinished dossiers....'
        # timeout=False keeps the MongoDB cursor alive for this long-running loop.
        for d in db.dossiers.find({'procedure.stage_reached': {'$in': STAGES}}, timeout=False):
            oeil_scrape(d['meta']['source'])
            print >>sys.stderr, '\t%s, %s' % (d['procedure']['reference'].encode('utf8'), d['procedure']['title'].encode('utf8'))
        with open(LAST_UPDATED_CACHE, "w+") as cache:
            cache.write(site_stamp)
    print >>sys.stderr, '\n[!] Searching/scraping new items..'
    getNewItems(root)
    return True
Ejemplo n.º 4
0
def scrape(url):
    """Scrape the site root page.

    Compares the site's "Data updated on" stamp against a cached copy;
    on change, re-scrapes all dossiers still at a stage in STAGES and
    rewrites the cache. Always finishes by scanning for new items.

    Returns True on completion.
    """
    root = fetch(url)
    # Fix: hoist the update-stamp xpath so it runs once instead of twice
    # (the original evaluated it at the comparison and at the cache write).
    current_stamp = strip(
        root.xpath('//div[text()="Data updated on :"]/span/text()')[0])
    # Fix: the original `open(...).read()` never closed the cache file;
    # read it via `with` so the handle is released.
    previous_stamp = None
    if exists(LAST_UPDATED_CACHE):
        with open(LAST_UPDATED_CACHE) as cache_file:
            previous_stamp = cache_file.read()
    # A missing cache file yields None, which never equals the string
    # stamp — equivalent to the original `not exists(...) or ...` test.
    if previous_stamp != current_stamp:
        print >> sys.stderr, '[!] Site modification found, scraping unfinished dossiers....'
        # timeout=False keeps the cursor from expiring during the long loop.
        for d in db.dossiers.find({'procedure.stage_reached': {
                '$in': STAGES
        }},
                                  timeout=False):
            oeil_scrape(d['meta']['source'])
            print >> sys.stderr, '\t%s, %s' % (
                d['procedure']['reference'].encode('utf8'),
                d['procedure']['title'].encode('utf8'))
        with open(LAST_UPDATED_CACHE, "w+") as cache_file:
            cache_file.write(current_stamp)
    print >> sys.stderr, '\n[!] Searching/scraping new items..'
    getNewItems(root)
    return True