Example #1
def session_list():
    import lxml.html
    from scrapelib import urlopen
    from datetime import date
    import string

    # Start from City Clerk page
    city_clerk_url = 'http://sanjoseca.gov/index.aspx?NID=145'
    city_clerk_doc = lxml.html.fromstring(urlopen(city_clerk_url))
    city_clerk_doc.make_links_absolute(city_clerk_url)

    # Find current year
    current_year_url = city_clerk_doc.xpath(
        '//td[//span]//a[contains(text(),"Council Agendas 2")]/@href')[0]
    current_year_doc = lxml.html.fromstring(urlopen(current_year_url))
    current_year_doc.make_links_absolute(current_year_url)

    current_year_text = current_year_doc.xpath(
        '//tr[contains(@class, "telerik-reTableHeaderRow")]//td[contains(text(),"COUNCIL AGENDAS")]/text()'
    )[0]
    current_year = string.split(current_year_text)[0]

    # Find agenda years
    council_agendas = map(
        string.strip,
        current_year_doc.xpath(
            '//a[contains(text(),"Council Agendas 2")]/text()'))
    agenda_years = map(strip_council_agendas_prefix, council_agendas)

    # Find old archived years
    archives_url = current_year_doc.xpath(
        "//a[contains(text(),'Archived Agendas')]/@href")[0]
    archives_doc = lxml.html.fromstring(urlopen(archives_url))
    archives_doc.make_links_absolute(archives_url)

    archived_council_agendas = map(
        string.strip,
        archives_doc.xpath(
            '//table[./tr/td/div/strong[text()="Council Agendas/Synopses"]]//a/text()'
        ))
    while archived_council_agendas.count('') > 0:
        archived_council_agendas.remove('')

    archived_council_minutes = map(
        string.strip,
        archives_doc.xpath(
            '//table[./tr/td/div/strong[text()="Council Meeting Minutes"]]//a/text()'
        ))
    while archived_council_minutes.count('') > 0:
        archived_council_minutes.remove('')

    aggregated_years = [
        current_year
    ] + agenda_years + archived_council_agendas + archived_council_minutes
    unique_years = list(set(aggregated_years))
    int_years = map(int, unique_years)
    int_years.sort()
    session_years = map(str, int_years)

    return session_years
Example #2
def s3_get(abbr, doc):
    if settings.AWS_BUCKET:
        k = boto.s3.key.Key(s3bucket)
        k.key = 'documents/{0}/{1}'.format(abbr, doc['doc_id'])

        # try to get the object; if it doesn't exist, pull it down
        try:
            return k.get_contents_as_string()
        except:
            data = scrapelib.urlopen(doc['url'].replace(' ', '%20'))
            content_type = data.response.headers.get('content-type')
            if not content_type:
                url = doc['url'].lower()
                if url.endswith('htm') or doc['url'].endswith('html'):
                    content_type = 'text/html'
                elif url.endswith('pdf'):
                    content_type = 'application/pdf'
            headers = {
                'x-amz-acl': 'public-read',
                'Content-Type': content_type
            }
            k.set_contents_from_string(data.bytes, headers=headers)
            _log.debug('pushed %s to s3 as %s', doc['url'], doc['doc_id'])
            return data.bytes
    else:
        return scrapelib.urlopen(doc['url'].replace(' ', '%20')).bytes
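Both s3_get examples in this listing rely on a module-level s3bucket handle that is not shown. Below is a minimal sketch of how such a handle is typically created with boto 2; the credential strings and bucket name are placeholders, not values taken from these examples.

import boto

# Sketch only: build the s3bucket object that s3_get() expects (boto 2).
# The original presumably reads these values from its settings module;
# the strings below are placeholders.
connection = boto.connect_s3('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')
s3bucket = connection.get_bucket('document-cache-bucket')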
Example #3
def validate_api(abbr, schema_dir=None):
    metadata_schema = get_json_schema("metadata", schema_dir)
    path = "metadata/%s" % abbr
    url = api_url(path)
    json_response = scrapelib.urlopen(url)
    validictory.validate(json.loads(json_response),
                         metadata_schema,
                         validator_cls=APIValidator)

    bill_schema = get_json_schema("bill", schema_dir)

    level = metadata(abbr)['level']
    spec = {'level': level, level: abbr}
    total_bills = db.bills.find(spec).count()

    for i in xrange(0, 100):
        bill = db.bills.find(spec)[random.randint(0, total_bills - 1)]
        path = "bills/%s/%s/%s/%s" % (abbr, bill['session'], bill['chamber'],
                                      bill['bill_id'])
        url = api_url(path)

        json_response = scrapelib.urlopen(url)
        validictory.validate(json.loads(json_response),
                             bill_schema,
                             validator_cls=APIValidator)

    legislator_schema = get_json_schema("legislator", schema_dir)
    for legislator in db.legislators.find(spec):
        path = 'legislators/%s' % legislator['_id']
        url = api_url(path)

        json_response = scrapelib.urlopen(url)
        validictory.validate(json.loads(json_response),
                             legislator_schema,
                             validator_cls=APIValidator)

    committee_schema = get_json_schema("committee", schema_dir)
    for committee in db.committees.find(spec):
        path = "committees/%s" % committee['_id']
        url = api_url(path)

        json_response = scrapelib.urlopen(url)
        validictory.validate(json.loads(json_response),
                             committee_schema,
                             validator_cls=APIValidator)

    event_schema = get_json_schema("event", schema_dir)
    total_events = db.events.find(spec).count()

    if total_events:
        for i in xrange(0, 10):
            event = db.events.find(spec)[random.randint(0, total_events - 1)]
            path = "events/%s" % event['_id']
            url = api_url(path)

            json_response = scrapelib.urlopen(url)
            validictory.validate(json.loads(json_response),
                                 event_schema,
                                 validator_cls=APIValidator)
Example #4
def validate_api(abbr, schema_dir=None):
    metadata_schema = get_json_schema("metadata", schema_dir)
    path = "metadata/%s" % abbr
    url = api_url(path)
    json_response = scrapelib.urlopen(url)
    validictory.validate(json.loads(json_response), metadata_schema,
                         validator_cls=APIValidator)

    bill_schema = get_json_schema("bill", schema_dir)

    level = metadata(abbr)['level']
    spec = {'level': level, level: abbr}
    total_bills = db.bills.find(spec).count()

    for i in xrange(0, 100):
        bill = db.bills.find(spec)[random.randint(0, total_bills - 1)]
        path = "bills/%s/%s/%s/%s" % (abbr, bill['session'],
                                      bill['chamber'], bill['bill_id'])
        url = api_url(path)

        json_response = scrapelib.urlopen(url)
        validictory.validate(json.loads(json_response), bill_schema,
                             validator_cls=APIValidator)

    legislator_schema = get_json_schema("legislator", schema_dir)
    for legislator in db.legislators.find(spec):
        path = 'legislators/%s' % legislator['_id']
        url = api_url(path)

        json_response = scrapelib.urlopen(url)
        validictory.validate(json.loads(json_response), legislator_schema,
                             validator_cls=APIValidator)

    committee_schema = get_json_schema("committee", schema_dir)
    for committee in db.committees.find(spec):
        path = "committees/%s" % committee['_id']
        url = api_url(path)

        json_response = scrapelib.urlopen(url)
        validictory.validate(json.loads(json_response), committee_schema,
                             validator_cls=APIValidator)

    event_schema = get_json_schema("event", schema_dir)
    total_events = db.events.find(spec).count()

    if total_events:
        for i in xrange(0, 10):
            event = db.events.find(spec)[random.randint(0, total_events - 1)]
            path = "events/%s" % event['_id']
            url = api_url(path)

            json_response = scrapelib.urlopen(url)
            validictory.validate(json.loads(json_response), event_schema,
                                 validator_cls=APIValidator)
Example #5
def session_list():
    import scrapelib
    import lxml.html
    # httplib2 has a compression issue on this page
    html = scrapelib.urlopen('http://legis.sd.gov/Legislative_Session/Menu.aspx')
    doc = lxml.html.fromstring(html)
    return doc.xpath('//div[@id="ContentPlaceHolder1_BlueBoxLeft"]//ul/li/a/div/text()')
Example #6
def scrape_committees_html(year, chamber, doc):
    name_dict = defaultdict(set)
    tds = doc.xpath('//td[@valign="top"]')[3:]

    cache = []
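    # Note: url, used below, is not defined in this excerpt; compare
    # Example #22, where it comes from committee_urls[chamber][year].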
    for td in tds:
        for name_dict, c in _committees_td(td, chamber, url, name_dict):
            if c not in cache:
                cache.append(c)
                yield name_dict, c

    # Get the joint approps subcommittees during the upper scrape.
    if chamber == 'upper':
        url = committee_urls['joint'][year]
        html = scrapelib.urlopen(url)

        name_dict = defaultdict(set)
        doc = lxml.html.fromstring(html)
        tds = doc.xpath('//td[@valign="top"]')[3:]

        cache = []
        for td in tds:
            for name_dict, c in _committees_td(td, 'joint', url, name_dict):
                if c not in cache:
                    cache.append(c)

                    # These are subcommittees, so a quick switcheroo of the names:
                    c['subcommittee'] = c['committee']
                    c['committee'] = 'Appropriations'
                    yield name_dict, c
Example #7
def scrape_committees_html(year, chamber, doc):
    name_dict = defaultdict(set)
    tds = doc.xpath('//td[@valign="top"]')[3:]

    cache = []
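    # Note: url, used below, is not defined in this excerpt; compare
    # Example #22, where it comes from committee_urls[chamber][year].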
    for td in tds:
        for name_dict, c in _committees_td(td, chamber, url, name_dict):
            if c not in cache:
                cache.append(c)
                yield name_dict, c

    # Get the joint approps subcommittees during the upper scrape.
    if chamber == 'upper':
        url = committee_urls['joint'][year]
        html = scrapelib.urlopen(url)

        name_dict = defaultdict(set)
        doc = lxml.html.fromstring(html)
        tds = doc.xpath('//td[@valign="top"]')[3:]

        cache = []
        for td in tds:
            for name_dict, c in _committees_td(td, 'joint', url, name_dict):
                if c not in cache:
                    cache.append(c)

                    # These are subcommittees, so a quick switcheroo of the
                    # names:
                    c['subcommittee'] = c['committee']
                    c['committee'] = 'Appropriations'
                    yield name_dict, c
Example #8
def main():
    conn = pymongo.Connection(settings.MONGO_HOST, settings.MONGO_PORT)
    tweets = conn['openstates_web']['tweets']
    data = urlopen('http://api.twitter.com/1/statuses/user_timeline.json?screen_name=openstates&count=1&trim_user=1')
    data = json.loads(data)
    tweets.drop()
    tweets.insert(data, safe=True)
Example #9
def session_list():
    import scrapelib
    from billy.scrape.utils import url_xpath
    data = scrapelib.urlopen(
        'http://www.azleg.gov/xml/sessions.asp?sort=SessionID')
    doc = lxml.html.fromstring(data.bytes)
    return doc.xpath('//session/@session_full_name')
Example #10
def session_list():
    import scrapelib
    text = scrapelib.urlopen('ftp://ftp.cga.ct.gov')
    sessions = [line.split()[-1] for line in text.splitlines()]
    sessions.remove('incoming')
    sessions.remove('pub')
    return sessions
Example #11
def session_list():
    import scrapelib
    text = scrapelib.urlopen('ftp://ftp.cga.ct.gov')
    sessions = [line.split()[-1] for line in text.splitlines()]
    sessions.remove('incoming')
    sessions.remove('pub')
    return sessions
Example #12
def session_list():
    import scrapelib
    from billy.scrape.utils import url_xpath

    data = scrapelib.urlopen("http://www.azleg.gov/xml/sessions.asp?sort=SessionID")
    doc = lxml.html.fromstring(data.bytes)
    return doc.xpath("//session/@session_full_name")
Example #13
def session_list():
    import scrapelib
    text = scrapelib.urlopen('ftp://ftp.cga.ct.gov')
    sessions = [line.split()[-1] for line in text.splitlines()]
    
    for not_session_name in ('incoming', 'pub', 'CGAAudio', 'rba', 'NCSL'):
        sessions.remove(not_session_name)
    return sessions
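This example prunes a fixed blacklist of non-session entries from the FTP listing. A sketch of the inverse approach follows, keeping only names that look like four-digit years; that session directories on ftp.cga.ct.gov are named by year is an assumption, not something the examples confirm.

import re
import scrapelib

# Sketch only: whitelist session directories by pattern instead of removing
# known non-session names. Assumes session directories are four-digit years.
text = scrapelib.urlopen('ftp://ftp.cga.ct.gov')
names = [line.split()[-1] for line in text.splitlines()]
sessions = [name for name in names if re.match(r'\d{4}$', name)]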
Example #14
def session_list():
    import scrapelib

    text = scrapelib.urlopen("ftp://ftp.cga.ct.gov")
    sessions = [line.split()[-1] for line in text.splitlines()]
    sessions.remove("incoming")
    sessions.remove("pub")
    return sessions
Example #15
def session_list():
    import scrapelib
    import lxml.html

    url = 'http://www.legis.nd.gov/assembly/'
    sessions = []

    html = scrapelib.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    # go through links and look for pages that have an active Legislation: link
    for a in doc.xpath("//div[@class='linkblockassembly']/div/span/a"):
        ahtml = scrapelib.urlopen(a.get('href'))
        adoc = lxml.html.fromstring(ahtml)
        if adoc.xpath('//a[contains(@href, "leginfo")]'):
            sessions.append(a.text)
    return sessions
Example #16
def main():
    conn = pymongo.Connection(settings.MONGO_HOST, settings.MONGO_PORT)
    tweets = conn['openstates_web']['tweets']
    data = urlopen(
        'http://api.twitter.com/1/statuses/user_timeline.json?screen_name=openstates&count=1&trim_user=1'
    )
    data = json.loads(data)
    tweets.drop()
    tweets.insert(data, safe=True)
Example #17
def session_list():
    import scrapelib
    import lxml.html
    # httplib2 has a compression issue on this page
    html = scrapelib.urlopen(
        'http://legis.sd.gov/Legislative_Session/Menu.aspx')
    doc = lxml.html.fromstring(html)
    return doc.xpath(
        '//div[@id="ContentPlaceHolder1_BlueBoxLeft"]//ul/li/a/div/text()')
Example #18
def session_list():
    import lxml.html
    from scrapelib import urlopen
    from datetime import date
    import string

    # Start from City Clerk page
    city_clerk_url = 'http://sanjoseca.gov/index.aspx?NID=145'
    city_clerk_doc = lxml.html.fromstring(urlopen(city_clerk_url))
    city_clerk_doc.make_links_absolute(city_clerk_url)

    # Find current year
    current_year_url = city_clerk_doc.xpath('//td[//span]//a[contains(text(),"Council Agendas 2")]/@href')[0]
    current_year_doc = lxml.html.fromstring(urlopen(current_year_url))
    current_year_doc.make_links_absolute(current_year_url)

    current_year_text = current_year_doc.xpath('//tr[contains(@class, "telerik-reTableHeaderRow")]//td[contains(text(),"COUNCIL AGENDAS")]/text()')[0]
    current_year = string.split(current_year_text)[0]

    # Find agenda years
    council_agendas = map(string.strip, current_year_doc.xpath('//a[contains(text(),"Council Agendas 2")]/text()'))
    agenda_years = map(strip_council_agendas_prefix, council_agendas)

    # Find old archived years
    archives_url = current_year_doc.xpath("//a[contains(text(),'Archived Agendas')]/@href")[0]
    archives_doc = lxml.html.fromstring(urlopen(archives_url))
    archives_doc.make_links_absolute(archives_url)

    archived_council_agendas = map(string.strip, archives_doc.xpath('//table[./tr/td/div/strong[text()="Council Agendas/Synopses"]]//a/text()'))
    while archived_council_agendas.count('') > 0:
        archived_council_agendas.remove('')

    archived_council_minutes = map(string.strip, archives_doc.xpath('//table[./tr/td/div/strong[text()="Council Meeting Minutes"]]//a/text()'))
    while archived_council_minutes.count('') > 0:
        archived_council_minutes.remove('')

    aggregated_years = [current_year] + agenda_years + archived_council_agendas + archived_council_minutes
    unique_years     = list(set(aggregated_years))
    int_years        = map(int, unique_years)
    int_years.sort()
    session_years    = map(str, int_years)

    return session_years
Example #19
def session_list():
    import scrapelib
    import lxml.html

    url = 'http://www.legis.nd.gov/assembly/'
    sessions = []

    html = scrapelib.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    return doc.xpath("//div[@class='view-content']//a/text()")
Example #20
def session_list():
    import scrapelib
    import lxml.html

    url = 'http://www.legis.nd.gov/assembly/'
    sessions = []

    html = scrapelib.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    return doc.xpath("//div[@class='view-content']//a/text()")
Example #21
def session_list():
    import lxml.html
    from scrapelib import urlopen
    from datetime import date

    url = 'http://www3.sanjoseca.gov/clerk/agenda.asp'
    doc = lxml.html.fromstring(urlopen(url))
    doc.make_links_absolute(url)

    timespan = next(text for text in doc.xpath('//text()[contains(.,"Meeting")][contains(.,"Minutes")]/following::text()') if text.strip())
    start = int(timespan.split('-', 1)[0])
    return map(str, range(start, date.today().year))
Example #22
def scrape_committees(year, chamber):
    '''Since the legislator names aren't properly capitalized in the
    csv file, scrape the committee page and use the names listed there
    instead.
    '''
    url = committee_urls[chamber][year]
    html = scrapelib.urlopen(url)

    name_dict = defaultdict(set)
    doc = lxml.html.fromstring(html)
    tds = doc.xpath('//td[@valign="top"]')[3:]

    cache = []
    for td in tds:
        for name_dict, c in _committees_td(td, chamber, url, name_dict):
            if c not in cache:
                cache.append(c)
                yield name_dict, c

    # Get the joint approps subcommittees during the upper scrape.
    if chamber == 'upper':
        url = committee_urls['joint'][year]
        html = scrapelib.urlopen(url)

        name_dict = defaultdict(set)
        doc = lxml.html.fromstring(html)
        tds = doc.xpath('//td[@valign="top"]')[3:]

        cache = []
        for td in tds:
            for name_dict, c in _committees_td(td, 'joint', url, name_dict):
                if c not in cache:
                    cache.append(c)

                    # These are subcommittees, so a quick switcheroo of the names:
                    c['subcommittee'] = c['committee']
                    c['committee'] = 'Appropriations'
                    yield name_dict, c
Example #23
def scrape_committees(year, chamber):
    '''Since the legislator names aren't properly capitalized in the
    csv file, scrape the committee page and use the names listed there
    instead.
    '''
    url = committee_urls[chamber][year]
    html = scrapelib.urlopen(url)

    name_dict = defaultdict(set)
    doc = lxml.html.fromstring(html)
    tds = doc.xpath('//td[@valign="top"]')[3:]

    cache = []
    for td in tds:
        for name_dict, c in _committees_td(td, chamber, url, name_dict):
            if c not in cache:
                cache.append(c)
                yield name_dict, c

    # Get the joint approps subcommittees during the upper scrape.
    if chamber == 'upper':
        url = committee_urls['joint'][year]
        html = scrapelib.urlopen(url)

        name_dict = defaultdict(set)
        doc = lxml.html.fromstring(html)
        tds = doc.xpath('//td[@valign="top"]')[3:]

        cache = []
        for td in tds:
            for name_dict, c in _committees_td(td, 'joint', url, name_dict):
                if c not in cache:
                    cache.append(c)

                    # These are subcommittees, so a quick switcheroo of the names:
                    c['subcommittee'] = c['committee']
                    c['committee'] = 'Appropriations'
                    yield name_dict, c
Example #24
def scrape_story(url, cache_dir):
    story_doc = lxml.html.fromstring(scrapelib.urlopen(url))
    text = story_doc.xpath('//div[@class="storytext xcontrast_txt"]')[0].text_content()

    with open(cache_dir + '/' + hashlib.sha1(url).hexdigest(), 'w') as file:
        file.write(text.encode('utf8', 'ignore'))

    print url

    # check for next page
    base_url = 'http://www.fanfiction.net'
    next = story_doc.xpath('//input[contains(@value, "Next")]/@onclick')
    if next:
        url = base_url + next[0].replace("self.location='", '').strip("'")
        scrape_story(url, cache_dir)
Example #25
def s3_get(abbr, doc):
    if settings.AWS_BUCKET:
        k = boto.s3.key.Key(s3bucket)
        k.key = 'documents/{0}/{1}'.format(abbr, doc['doc_id'])

        # try to get the object; if it doesn't exist, pull it down
        try:
            return k.get_contents_as_string()
        except:
            data = scrapelib.urlopen(doc['url'].replace(' ', '%20'))
            content_type = data.response.headers.get('content-type')
            if not content_type:
                url = doc['url'].lower()
                if url.endswith('htm') or doc['url'].endswith('html'):
                    content_type = 'text/html'
                elif url.endswith('pdf'):
                    content_type = 'application/pdf'
            headers = {'x-amz-acl': 'public-read',
                       'Content-Type': content_type}
            k.set_contents_from_string(data.bytes, headers=headers)
            _log.debug('pushed %s to s3 as %s', doc['url'], doc['doc_id'])
            return data.bytes
    else:
        return scrapelib.urlopen(doc['url'].replace(' ', '%20')).bytes
Example #26
def s3_get(id):
    k = boto.s3.key.Key(s3bucket)
    k.key = 'documents/{0}/{1}'.format(id[0:2].lower(), id)

    # try to get the object; if it doesn't exist, pull it down
    try:
        return k.get_contents_as_string()
    except:
        doc = db.tracked_versions.find_one(id)
        if not doc:
            return None
        data = scrapelib.urlopen(doc['url'].replace(' ', '%20'))
        content_type = data.response.headers['content-type']
        headers = {'x-amz-acl': 'public-read', 'Content-Type': content_type}
        k.set_contents_from_string(data.bytes, headers=headers)
        log.debug('pushed %s to s3 as %s', doc['url'], id)
        return data.bytes
Example #27
def scrape_committees(year, chamber):
    '''Since the legislator names aren't properly capitalized in the
    csv file, scrape the committee page and use the names listed there
    instead.
    '''
    url = committee_urls[chamber][year]
    html = scrapelib.urlopen(url)

    name_dict = defaultdict(set)
    doc = lxml.html.fromstring(html)
    tds = doc.xpath('//td[@valign="top"]')[3:]

    cache = []
    for td in tds:
        for name_dict, c in _committees_td(td, chamber, url, name_dict):
            if c not in cache:
                cache.append(c)
                yield name_dict, c
Example #28
    def handle_noargs(self, **options):
        url = 'http://www.wwe.com/superstars'
        data = scrapelib.urlopen(url)
        doc = lxml.html.fromstring(data)
        doc.make_links_absolute(url)

        for div in doc.xpath('//div[starts-with(@class, "star ")]'):
            cssclass = div.get('class')
            if 'letter-champion' in cssclass:
                continue
            # get division
            divisions = ('divas', 'raw', 'smackdown')
            for division in divisions:
                if division in cssclass:
                    break
            else:
                division = 'other'
            name = div.xpath('h2')[0].text_content().strip()
            url = div.xpath('a/@href')[0]
            id = url.rsplit('/', 1)[-1]
            photo_url = 'http://wwe.com' + div.xpath('a/img/@data-fullsrc')[0]

            if Star.objects.filter(id=id).count():
                star = Star.objects.get(id=id)
                if star.name != name:
                    print('updating {0} name to {1}'.format(star.name, name))
                    star.name = name
                if star.division != division:
                    print('updating {0} division to {1}'.format(star.name, division))
                    star.division = division
                if star.photo_url != photo_url:
                    print('updating {0} photo to {1}'.format(star.name.encode('utf8'), photo_url))
                    star.photo_url = photo_url
                star.save()
            else:
                print('adding {0}'.format(name))
                Star.objects.create(id=id, name=name, division=division,
                                    photo_url=photo_url)
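The lookup-compare-save block above can also be written as a single upsert. A sketch using update_or_create, assuming Django 1.7 or newer (the original command may predate that API) and reusing the id, name, division and photo_url values scraped in the loop:

# Sketch only: equivalent upsert via update_or_create (Django 1.7+), using
# the values scraped for each div in the loop above.
star, created = Star.objects.update_or_create(
    id=id,
    defaults={'name': name, 'division': division, 'photo_url': photo_url})
if created:
    print('adding {0}'.format(name))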
Example #29
def url_xpath(url, path):

    data = scrapelib.urlopen(url)

    if (data is None):
        print "Could not get data from url:%s" % (url)
        raise NoData(url)

    doc = lxml.html.fromstring(data)

    if (doc is None):
        print "Could not decode XML Doc:%s" % (data)
        raise NoDoc(data)

#    print etree.tostring(doc)
#    print doc
#    print "Check  path:%s Doc:%s" % (path,doc)

    result = doc.xpath(path)

#    print result
#    print len(result)

    if (result is None):
#        print doc
#        print doc.tag
#        print etree.tostring(doc)
#        print "Xpath failed path:%s Doc:%s" % (path,doc)
        print "Xpath failed"
        raise NoXpath(data)

#    print "url_xpath %s" % result
#    exc_type, exc_value, exc_traceback = sys.exc_info()
#    traceback.print_exc()
#    traceback.print_tb(exc_traceback, file=sys.stdout)
    return result
Example #30
def _get_url(url):
    return lxml.html.fromstring(scrapelib.urlopen(url))
Example #31
def ct_session_info():
    html = scrapelib.urlopen("ftp://ftp.cga.ct.gov")
    sessions = [line.split()[-1] for line in html.splitlines()]
    sessions.pop()  # remove pub/
    return sessions, sessions[-1]
Example #32
def lxmlize(self, url, encoding='utf-8'):
    entry = urlopen(url).encode(encoding)
    return lxml.html.fromstring(entry)
Example #33
def scrape_stories(url, cache_dir):
    os.path.exists(cache_dir) or os.makedirs(cache_dir)
    doc = lxml.html.fromstring(scrapelib.urlopen(url))
    doc.make_links_absolute(url)
    for link in doc.xpath('//a[@class="stitle"]/@href'):
        scrape_story(link, cache_dir)
Example #34
def url_xpath(url):
    html = scrapelib.urlopen(url)
    html = html.decode('latin-1')
    doc = lxml.html.fromstring(html)
    return doc
Example #35
def url_xpath(url, path):
    import scrapelib
    import lxml.html
    doc = lxml.html.fromstring(scrapelib.urlopen(url))
    return doc.xpath(path)
Example #36
def url2lxml(url):
    html = urlopen(url).decode('latin-1')
    return lxml.html.fromstring(html)
Example #37
def validate_xml(url, schema):
    response = scrapelib.urlopen(url + "&format=xml")
    xml = lxml.etree.fromstring(response)
    for child in xml.xpath("/results/*"):
        schema.assertValid(child)
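validate_xml expects a pre-built schema object. A sketch of constructing one with lxml and calling the helper; the schema file name and API URL are placeholders, not taken from these examples.

import lxml.etree

# Sketch only: build an lxml XMLSchema and validate one API response with the
# validate_xml() helper above. File name and URL are placeholders.
with open('api_results.xsd') as schema_file:
    xml_schema = lxml.etree.XMLSchema(lxml.etree.parse(schema_file))

validate_xml('http://example.com/api/metadata/ca/?apikey=KEY', xml_schema)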
Example #38
def validate_api(state):
    cwd = os.path.split(__file__)[0]
    schema_dir = os.path.join(cwd, "../schemas/api/")

    xml_schema = get_xml_schema()

    with open(os.path.join(schema_dir, "metadata.json")) as f:
        metadata_schema = json.load(f)

    path = "metadata/%s" % state
    url = api_url(path)
    json_response = scrapelib.urlopen(url)
    validictory.validate(json.loads(json_response), metadata_schema,
                         validator_cls=APIValidator)
    validate_xml(url, xml_schema)

    with open(os.path.join(schema_dir, "bill.json")) as f:
        bill_schema = json.load(f)

    bill_spec = {'state': state}
    total_bills = db.bills.find(bill_spec).count()

    for i in xrange(0, 100):
        bill = db.bills.find(bill_spec)[random.randint(0, total_bills - 1)]
        path = "bills/%s/%s/%s/%s" % (state, bill['session'],
                                      bill['chamber'],
                                      bill['bill_id'])
        url = api_url(path)

        json_response = scrapelib.urlopen(url)
        validictory.validate(json.loads(json_response), bill_schema,
                             validator_cls=APIValidator)

        validate_xml(url, xml_schema)

    with open(os.path.join(schema_dir, "legislator.json")) as f:
        legislator_schema = json.load(f)

    for legislator in db.legislators.find({'state': state}):
        path = 'legislators/%s' % legislator['_id']
        url = api_url(path)

        json_response = scrapelib.urlopen(url)
        validictory.validate(json.loads(json_response), legislator_schema,
                             validator_cls=APIValidator)

        validate_xml(url, xml_schema)

    with open(os.path.join(schema_dir, "committee.json")) as f:
        committee_schema = json.load(f)

    for committee in db.committees.find({'state': state}):
        path = "committees/%s" % committee['_id']
        url = api_url(path)

        json_response = scrapelib.urlopen(url)
        validictory.validate(json.loads(json_response), committee_schema,
                             validator_cls=APIValidator)

        validate_xml(url, xml_schema)

    with open(os.path.join(schema_dir, "event.json")) as f:
        event_schema = json.load(f)

    total_events = db.events.find({'state': state}).count()

    if total_events:
        for i in xrange(0, 10):
            event = db.events.find({'state': state})[
                random.randint(0, total_events - 1)]
            path = "events/%s" % event['_id']
            url = api_url(path)

            json_response = scrapelib.urlopen(url)
            validictory.validate(json.loads(json_response), event_schema,
                                 validator_cls=APIValidator)

            validate_xml(url, xml_schema)
Example #39
def ct_session_info():
    html = scrapelib.urlopen("ftp://ftp.cga.ct.gov")
    sessions = [line.split()[-1] for line in html.splitlines()]
    sessions.pop()    # remove pub/
    return sessions, sessions[-1]
Example #40
def url2lxml(url):
    html = urlopen(url)
    return lxml.html.fromstring(html)
Example #41
def url_xpath(url):
    html = scrapelib.urlopen(url)
    doc = lxml.html.fromstring(html)
    return doc
Example #42
def _get_url(url):
    return lxml.html.fromstring(scrapelib.urlopen(url))
Example #43
def url_xpath(url):
    html = scrapelib.urlopen(url)
    doc = lxml.html.fromstring(html)
    return doc
Example #44
def url_xpath(url, path):
    import scrapelib
    import lxml.html
    doc = lxml.html.fromstring(scrapelib.urlopen(url))
    return doc.xpath(path)
Example #45
def url2lxml(url):
    html = urlopen(url)
    return lxml.html.fromstring(html)
Example #46
def validate_xml(url, schema):
    response = scrapelib.urlopen(url + "&format=xml")
    xml = lxml.etree.fromstring(response)
    for child in xml.xpath("/results/*"):
        schema.assertValid(child)