Python scrape Examples, scrapemark.scrape Python Examples

Example #1

0

Show file

File: main.py Project: DrMavenRebe/scrappy-scraper

  def scrape_nyt(self):
    urls = scrapemark.scrape("""
          <body>    
              {*
                <div class='element2'>
                  <h3> <a href='{{ [links].url }}'></a> </h3>
                </div>
              *}
          </body>
        """,
        url='http://www.nytimes.com/most-popular-emailed')['links']

    urls += scrapemark.scrape("""
          <body>    
              {*
                <div class='element2'>
                  <h3> <a href='{{ [links] }}'></a> </h3>
                </div>
              *}
          </body>
        """,
        url='http://www.nytimes.com/most-popular-viewed')['links']

    urls += scrapemark.scrape("""
          <body>    
              {*
                <div class='element2'>
                  <h3> <a href='{{ [links] }}'></a> </h3>
                </div>
              *}
          </body>
        """,
        url='http://www.nytimes.com/most-popular-blogged')['links']

    return urls

Example #2

0

Show file

File: new_zealand_bills.py Project: pombredanne/scraperwiki-scraper-vault

def main():
    links = scrape(PATTERN, url=URL)
    print links
    #done= set([res['nzgls_identifier'] for res in sw.sqlite.select('nzgls_identifier FROM bills')])
    #print done
    for link in links['sources']:
        bills = scrape(PATTERN2, url=link)['bills']
        print bills
        for bill in bills:
            print bill
            try:
                bill = scrape(INDIVIDUAL_BILL, url=bill)
            except Exception, e:
                print "DEBUG: %s" % e
                continue
            bill['link'] = link
            do_details(bill)
            do_meta(bill)
            do_related(bill)
            for related_doc in bill['related']:
                related_doc['nzgls_identifier']=bill['nzgls_identifier']
                related_doc['bill']=bill['title']
            sw.sqlite.save(['link'], data=bill['related'], table_name='related_docs')
            cleanup(bill)
            sw.sqlite.save(['link', 'valid_from'], data=bill, table_name='bills')

Example #3

0

Show file

File: swift_codes.py Project: rayassch/scraperwiki-scraper-vault

def parse_swift_code_page(url, country_name, queue=Q):
    if url in DONE:
        return None
    print 'downloading', country_name
    raw = get_country_html(url)

    banks = scrape(SWIFT_CODE_PATTERN, html=raw)['banks']
    for bank in banks:
        bank['address'] = cleanup_address(bank['address'])
        bank['country_name'] = country_name
        bank['source'] = url
    sqlite.save(['swift_code'], data=banks, table_name='swift_codes')

    if 'page=' not in url:
        try:
            n_pages = max(
                int(link.split('=')[-1])
                for link in scrape(PAGINATION_PATTERN, html=raw))
            pages = [
                BASE_URL +
                '/swift-code/search-swift-complete.php?country=%s&page=%d' %
                (country_name.replace(' ', '%20'), n) for n in xrange(n_pages)
            ]
        except ValueError:  #no more pages
            pages = []
        for newurl in pages:
            queue.push((parse_swift_code_page, newurl, country_name))
    DONE.add(url)
    sqlite.save(['url'], table_name='_done', data=dict(url=url))

Example #4

0

Show file

File: main.py Project: DrMavenRebe/scrappy-scraper

  def scrape_topsy(self):
    urls = scrapemark.scrape("""
          <body>
            <div class="list">
              {*
                  <h3 class="title">
                  <a href='{{ [links].url }}'></a>
                  </h3>
              *}
            </div>
          </body>
        """,
        url='http://topsy.com/top100')['links']

    for page, offset in enumerate([15,30,45,60,75,90,105,120,135]):
      urls += scrapemark.scrape("""
          <body>
            <div class="list">
              {*
                  <h3 class="title">
                  <a href='{{ [links].url }}'></a>
                  </h3>
              *}
            </div>
          </body>
        """,
        url='http://topsy.com/top100?offset='+str(offset)+'&om=f&page='+str(page+1)+'&thresh=top100')['links']

    return urls

Example #5

0

Show file

File: new_zealand_historic_places.py Project: pombredanne/scraperwiki-scraper-vault

def extract_construction_profs(details):
    blanket = "<h6>Construction Professionals</h6>{{ pros|html }}" ""
    targetted = """<h6>Construction Professionals</h6>{* <a href="javascript:viewBio('{{ [pros.id] }}');">{{ [pros].name }} </a> *}"""
    pros = scrape(targetted, html=details)['pros']
    if not pros:
        pros = scrape(blanket, html=details)['pros']
    return 'pros', pros

Example #6

0

Show file

File: new_zealand_historic_places.py Project: flyeven/scraperwiki-scraper-vault

def extract_construction_profs(details):
    blanket = "<h6>Construction Professionals</h6>{{ pros|html }}"""
    targetted = """<h6>Construction Professionals</h6>{* <a href="javascript:viewBio('{{ [pros.id] }}');">{{ [pros].name }} </a> *}"""
    pros = scrape(targetted, html=details)['pros']
    if not pros:
        pros = scrape(blanket, html=details)['pros']
    return 'pros', pros

Example #7

0

Show file

def corporation_registration(page):
    pattern = """Corporation:{{ name }}Name change history
Responsible Officer:{{ responsible_officer_name }}
Position Title:    {{ responsible_officer_name }}
Version:{{ registration_id }}
Type:{{ registration_type }}
Active from:{{ registration_active_from_date }}
Activity last confirmed:{{ registration_last_confirmed_date }}

A. Information about Responsible Officer and Corporation
Corporation:{{ corporation_name }}
Telephone number:{{ corporation_phone }}
Fax number:{{ corporation_fax }}
Description of the corporation's business activities: {{ corporation_business_activities }}
 
Parent:{{ parent|html }}
Subsidiary:{{ subsidiary|html }}
Was the corporation funded in whole or in part by any domestic or foreign government institution in the last completed financial year, or does the client expect funding in the current financial year?{{ is_government_funded }}

B. Lobbyists Employed by the Corporation
List of Senior Officers whose lobbying activities represent less than 20% of their Duties
{*
Name:{{ [lobbyists].name }}
Position title:{{ [lobbyists].title }}
Public offices held:{{ [lobbyists].public_offices_held }}
Designated public office holder:{{ [lobbyists].is_public_officer }}Name
*}{*
Name:{{ [lobbyists].name }}
Position title:{{ [lobbyists].title }}
Public offices held:{{ [lobbyists].public_offices_held }}
Designated public office holder:{{ [lobbyists].is_public_officer }}
*}

C. Lobbying Activity Information
Federal departments or organizations which have been or will be communicated with during the course of the undertaking: {{ agencies_talked_to }}
Communication techniques that have been used or are expected to be used in the course of the undertaking: 
{{ lobbying_activities }}
Information about Subject matter:{{ lobbying_subject_matter }}
 
Details Regarding the Identified Subject Matter
"""
    subject_matter_pattern = """Details Regarding the Identified Subject Matter
{* <tr><td>{{ [topics].category }}</td><td>{{ [topics].description }}</td></tr> *} 
"""
    page = GET(url)
    registration = scrape(pattern,
                          html=html.tostring(html.fromstring(page),
                                             encoding='utf-8',
                                             method='text'))
    registration['lobbyists'] = [
        l for l in registration['lobbyists']
        if len(l['is_public_officer'].split()) == 1
    ]
    registration['topics'] = scrape(subject_matter_pattern, html=page)
    registration['parent'] = registration['parent'].strip()
    registration['parent_name'] = registration['parent'].split('\n')[0]
    registration['subsidiary'] = registration['subsidiary'].strip()
    registration['subsidiary_name'] = registration['subsidiary'].split('\n')[0]

Example #8

0

Show file

File: art.py Project: rayassch/scraperwiki-scraper-vault

def main():
    movements = scrape(MOVEMENTS_INDEX, html=get_page(URL))
    print movements
    for m in movements['movements']:
        if 'artcyclopedia.com' in m['link']:
            movement = scrape(MOVEMENTS_INDIVIDUAL, html=get_page(m['link']))
            print m['title']
            if not movement:
                movement = scrape(MOVEMENTS_INDIVIDUAL2,
                                  html=get_page(m['link']))

            relations = []
            for relation in movement['related']:
                r = dict(movement=m['title'], related_to=relation['topic'])
                if '/artists/' in relation['link']:
                    r['topic'] = 'artist'
                else:
                    r['topic'] = 'movement'
                relations.append(r)

            artists = []
            for artist in movement['artists']:
                artist['movement'] = m['title']
                dates = artist['alive'].split('-')
                try:
                    artist['birth_year'] = int(dates[0])
                    artist['death_year'] = int(dates[1])
                except ValueError:
                    if 'Born' in dates:
                        artist['birth_year'] = int(dates.split()[1])
                        artist['death_year'] = None
                except:
                    print >> sys.stderr, "ERROR: Can't parse dates for %s: %s" % (
                        artist['name'], artist['alive'])
                    artist['birth_year'] = None
                    artist['death_year'] = None
                artist['profile_link'] = URL + artist['profile_link'][3:]
                try:
                    artist['nationality'], artist['profession'] = artist[
                        'artist_type'].split(' ', 1)
                except ValueError:
                    artist['nationality'] = artist['artist_type']
                    artist['profession'] = 'unknown'

                artists.append(artist)
            datastore.save(['name'],
                           table_name="movements",
                           data=dict(name=m['title'], link=m['link']))
            datastore.save(['movement', 'related_to'],
                           table_name="relations",
                           data=relations)
            datastore.save(['name', 'nationality'],
                           table_name="artists",
                           data=artists)

Example #9

0

Show file

File: canadian_lobbyists.py Project: flyeven/scraperwiki-scraper-vault

def corporation_registration(page):
    pattern = """Corporation:{{ name }}Name change history
Responsible Officer:{{ responsible_officer_name }}
Position Title:    {{ responsible_officer_name }}
Version:{{ registration_id }}
Type:{{ registration_type }}
Active from:{{ registration_active_from_date }}
Activity last confirmed:{{ registration_last_confirmed_date }}

A. Information about Responsible Officer and Corporation
Corporation:{{ corporation_name }}
Telephone number:{{ corporation_phone }}
Fax number:{{ corporation_fax }}
Description of the corporation's business activities: {{ corporation_business_activities }}
 
Parent:{{ parent|html }}
Subsidiary:{{ subsidiary|html }}
Was the corporation funded in whole or in part by any domestic or foreign government institution in the last completed financial year, or does the client expect funding in the current financial year?{{ is_government_funded }}

B. Lobbyists Employed by the Corporation
List of Senior Officers whose lobbying activities represent less than 20% of their Duties
{*
Name:{{ [lobbyists].name }}
Position title:{{ [lobbyists].title }}
Public offices held:{{ [lobbyists].public_offices_held }}
Designated public office holder:{{ [lobbyists].is_public_officer }}Name
*}{*
Name:{{ [lobbyists].name }}
Position title:{{ [lobbyists].title }}
Public offices held:{{ [lobbyists].public_offices_held }}
Designated public office holder:{{ [lobbyists].is_public_officer }}
*}

C. Lobbying Activity Information
Federal departments or organizations which have been or will be communicated with during the course of the undertaking: {{ agencies_talked_to }}
Communication techniques that have been used or are expected to be used in the course of the undertaking: 
{{ lobbying_activities }}
Information about Subject matter:{{ lobbying_subject_matter }}
 
Details Regarding the Identified Subject Matter
"""
    subject_matter_pattern = """Details Regarding the Identified Subject Matter
{* <tr><td>{{ [topics].category }}</td><td>{{ [topics].description }}</td></tr> *} 
"""
    page = GET(url)
    registration = scrape(pattern, html=html.tostring(html.fromstring(page), encoding='utf-8', method='text'))
    registration['lobbyists'] = [l for l in registration['lobbyists'] if len(l['is_public_officer'].split()) == 1]
    registration['topics'] = scrape(subject_matter_pattern, html=page)
    registration['parent'] = registration['parent'].strip()
    registration['parent_name'] = registration['parent'].split('\n')[0]
    registration['subsidiary'] = registration['subsidiary'].strip()
    registration['subsidiary_name'] = registration['subsidiary'].split('\n')[0]

Example #10

0

Show file

File: scraper.py Project: backgroundcheck/new_zealand_earthquakes2

def iter_mailing_list_quake_refs():
    base = "http://lists.geonet.org.nz/pipermail/eqnews/"
    index_urls = scrape(PIPERMAIL_INDEX_PATTERN, url=base)
    index_urls.reverse()
    index_urls= index_urls[96:]  ### REMEMBER TO DELETE WHEN FIRST COMPELETE RUN WORKS
    print index_urls
    for month in index_urls:
        print month
        messages = scrape(PIPERMAIL_MONTH_PATTERN, url = base + month)
        messages = [base + month.replace('date.html', link) for link in messages if '0' in link]
        print messages
        for message in messages:
            yield scrape(PIPERMAIL_MESSAGE_PATTERN, url=message)

Example #11

0

Show file

File: scraper.py Project: HengeSense/ALEC_api

	def get(self):
		# page ALEC_Corporations
		html = urllib2.urlopen("http://www.sourcewatch.org/index.php?title=ALEC_Corporations").read()
		
		# get for-profit corporation citations
		references = scrape("""<ol class="references"> {* <li> {{ []|html }} </li> *} </ol>""",html)
		self._add_citations(references,'ALEC_Corporations')

		# get for-profit corporations
		letters = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']
		corporations = []
		for letter in letters:
			corps = scrape("""<h3> <span class="mw-headline">"""+letter+""" </span> </h3> <ul> {* <li> {{ []|html }} </li> *} </ul> """,html)
			corpdata = []
			for each in corps:
				refs = scrape(""" {* <sup class="reference">[ {{ []|int }} ]</sup> *}""",each)
				datas = scrape(""" {* {{ [] }} <sup class="reference"> </sup> *} """,each)
				name = ''
				if len(datas) > 0:
					name = datas[0]

				info = ''
				if len(datas) > 2:
					for d in datas[0:]:
						info = info+d
				elif len(datas) > 1:
					info = datas[1]
				corpdata.append((name,info,refs))
			corporations.extend(corpdata)
		self._add_corporations(corporations,1)
		
		# page ALEC_Non-Profits
		html2 = urllib2.urlopen("http://www.sourcewatch.org/index.php?title=ALEC_Non-Profits").read()
		
		# get non-profit corporation citations
		references = scrape("""<ol class="references"> {* <li> {{ []|html }} </li> *} </ol>""",html2)
		self._add_citations(npcitations,'ALEC_Non-Profits')

		# get non-profit corporations
		nonprofits = []
		for letter in letters:
			np = scrape("""<h3> <span class="mw-headline">"""+letter+"""</span> </h3> <ul> {* <li> {{ []|html }} </li> *} </ul> """,html2)
			npdata = []
			for each in np:
				refs = scrape(""" {* <sup class="reference">[ {{ []|int }} ]</sup> *}""",each)
				datas = scrape(""" {* {{ [] }} <sup class="reference"> </sup> *} """,each)
				name = ''
				if len(datas) > 0:
					name = datas[0]
				info = ''
				if len(datas) > 2:
					for d in datas[0:]:
						info = info+d
				elif len(datas) > 1:
					info = datas[1]
				npdata.append((name,info,refs))
			nonprofits.extend(npdata)
		self._add_corporations(nonprofits,0)

Example #12

0

Show file

File: alextemp1.py Project: pombredanne/scraperwiki-scraper-vault

def StartUp():

    #go to homepage in order to set session cookie
    start_url = "https://delecorp.delaware.gov/tin/GINameSearch.jsp"
    p = ""
    g = {"x": str(time())}
    html = ""
    uastart = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 6.1; rv:9.0) Gecko/20100101 Firefox/9.0"
    }
    try:
        html = scrape('''
<html>
{{ [y].html }}
</html>
''',
                      url=start_url,
                      get=g,
                      headers=uastart,
                      cookie_jar=myjar)

    except BadStatusLine:
        #hmmm... will skip this check for now..
        return 0
    except Exception, e:
        debug(repr(e))
        debug("scrape problem")
        return 1

Example #13

0

Show file

File: boarding_school_usa_canada.py Project: yuandra/scraperwiki-scraper-vault

def getbdschools(url):
    return scrapemark.scrape("""
            {*
                  <table>
                        <tr></tr>
                        <tr></tr>
                        {*
                           <tr>
                               <td><a href='{@{*<div class="contactinfonew"></div><div id="heading3"></div>
                                      <div class="addressbar contactinfonew">
                                      <div><div>{{[saddress]}}</div>
                                                <div>tel:{{[sphone]}}</div>
                                      </div>
                                      <div>
                                                <div><a></a><a href={{[sweb]}} target="new"></a></div>
                                                <div><a></a></div>
                                      </div>
                                      <div><a></a><a></a></div>
                                      <div><br>{{[sbrief]}}</div>
                                </div>*} @}'>{{[sname]}}</a></td>
                               <td><div>{{[stype]}}</div></td>
                               <td><div>{{[sgrade]}}</div></td>
                               <td><div>{{[scity]}}</div></td>
                           </tr>
                        *}
               </table>
            *}
            """,
                             url=url)

Example #14

0

Show file

File: scrape.py Project: stefanw/cologne-ris-scraper

def get_session_attendants(id):
	"""
		Get list of people who have attended a session
	"""
	global db
	url = BASEURL + 'to0045.asp?__ctext=0&__ksinr=' + str(id)
	print "Lade Anwesenheitsliste", url
	html = urllib2.urlopen(url).read()
	data = scrape("""
	{*
		<tr>
			<td><a href="kp0050.asp?__kpenr={{ [attendee].id|int }}&amp;grnr={{ [attendee].grnr|int }}">{{ [attendee].name }}</a></td>
			<td>{{ [attendee].organization }}</td>
			<td>{{ [attendee].function }}</td>
		</tr>
	*}
	""", html)
	persons = []
	attendants = []
	for row in data['attendee']:
		persons.append({
			'person_id': row['id'],
			'person_name': row['name'],
			'person_organization': row['organization']
		})
		attendants.append({
			'session_id': id,
			'person_id': row['id'],
			'attendance_function': row['function']
		})
	db.save_rows('people', persons, ['person_id'])
	db.save_rows('attendance', attendants, ['session_id', 'person_id'])

Example #15

0

Show file

File: scrape.py Project: CatoTH/cologne-ris-scraper

def get_session_attendants(id):
    """
    Scrapet die Liste der (eingeladenen) Teilnehmer einer Sitzung
    """
    global db
    url = config.BASEURL + (config.URI_ATTENDANTS % id)
    print "Lade Anwesenheitsliste", url
    html = urllib2.urlopen(url).read()
    data = scrape("""
    {*
        <tr>
            <td><a href="kp0050.asp?__kpenr={{ [attendee].id|int }}&amp;grnr={{ [attendee].grnr|int }}">{{ [attendee].name }}</a></td>
            <td>{{ [attendee].organization }}</td>
            <td>{{ [attendee].function }}</td>
        </tr>
    *}
    """, html)
    persons = []
    attendants = []
    for row in data['attendee']:
        persons.append({
            'person_id': row['id'],
            'person_name': row['name'],
            'person_organization': row['organization']
        })
        attendants.append({
            'session_id': id,
            'person_id': row['id'],
            'attendance_function': row['function']
        })
    if not options.simulate:
        db.save_rows('people', persons, ['person_id'])
        db.save_rows('attendance', attendants, ['session_id', 'person_id'])

Example #16

0

Show file

File: alextemp1.py Project: pombredanne/scraperwiki-scraper-vault

def GetPage(fileid):

    debug("GetPage:fileid: " + str(fileid))

    #search for a known company:
    params = {
        "JSPName": "GINAMESEARCH",
        "action": "Search",
        "frmFileNumber": fileid,
        "frmEntityName": ""
    }
    html = ""
    try:
        html = scrape('''
<html>
{{ [y].html }}
</html>
''',
                      url=base_url,
                      post=params,
                      headers=ua,
                      cookie_jar=myjar)

    except Exception, e:
        debug(repr(e))
        debug("scrape problem")
        return 1

Example #17

0

Show file

def GetPage(fileid):

    try:
        terms = (scrape("""
            {*
    <h2>Full Details</h2>  </div>  <div class='page_summary_3col'></div>  <div class='page_content_3col'><table width='60%'><tr><td colspan='2' class='line'><font size='2'><b>English</b></font></td></tr><tr><td class='line'><font size='2'>Term</font></td><td class='line'><font size='2'>{{ [y].en_term }}</font></td></tr><tr><td class='line'><font size='2'>Definition</font></td><td class='line'><font size='2'>{{ [y].en_definition }}</font></td></tr><tr><td class='line'><font size='2'>Context</font></td><td class='line'><font size='2'>{{ [y].en_context }}</font></td></tr></table><br><table width='60%'><tr><td colspan='2' class='line'><font size='2'><b>Welsh</b></font></td></tr><tr><td class='line'><font size='2'>Term</font></td><td class='line'><font size='2'>{{ [y].cy_term }}</font></td></tr><tr><td class='line'><font size='2'>Definition</font></td><td class='line'><font size='2'>{{ [y].cy_definition }}</font></td></tr><tr><td class='line'><font size='2'>Status</font></td><td class='line'><font size='2'>{{ [y].cy_status }}</font></td></tr><tr><td class='line'><font size='2'>Part of Speech</font></td><td class='line'><font size='2'>{{ [y].cy_part_of_speech }}</font></td></tr><tr><td class='line'><font size='2'>Gender</font></td><td class='line'><font size='2'>{{ [y].cy_gender }}</font></td></tr><tr><td class='line'><font size='2'>Number</font></td><td class='line'><font size='2'>{{ [y].cy_number }}</font></td></tr><tr><td class='line'><font size='2'>Context</font></td><td class='line'><font size='2'>{{ [y].cy_context }}</font></td></tr><tr><td class='line'><font size='2'>Subject :&nbsp;</font></td><td class='line'><font size='2'>{{ [y].cy_subject }}</font></td></tr></table></div></div></div>            
            *}
            """,
                        url=base_url + fileid))

        debug((len(terms['y']), "items found"))
        debug(terms['y'])

        for k in terms['y']:
            k['id'] = fileid
            scraperwiki.sqlite.execute("""
                INSERT OR REPLACE INTO swdata (id, en_term, en_definition, en_context, cy_term, cy_definition, cy_status, cy_part_of_Speech, cy_gender, cy_number, cy_context, cy_subject) values (:id, :en_term, :en_definition, :en_context, :cy_term, :cy_definition, :cy_status, :cy_part_of_speech, :cy_gender, :cy_number, :cy_context, :cy_subject)
            """,
                                       k,
                                       verbose=0)
            scraperwiki.sqlite.commit()
            #scraperwiki.sqlite.save(unique_keys=fileid, data=k, table_name="swdata")
    except Exception, e:
        print e
        return

Example #18

0

Show file

File: test.py Project: motord/coconut

def process():
    url='http://www.tianya.cn/publicforum/content/develop/1/905898.shtml'
    template=Template(u"""
        {*
        <table id="firstAuthor">
		    <tr>
                <td>
                    <a>${author}</a> &nbsp;发表日期：{{ [stanzas].datetime }}
		        </td>
		    </tr>
	    </table>
	    <div id="pContentDiv">
	        <div class="post">
	        {{ [stanzas].content }}
	        </div>
	    </div>
        *}
        {*
        <table>
		    <tr>
                <td>
                    <a>${author}</a>　回复日期：{{ [stanzas].datetime }}
		        </td>
		    </tr>
	    </table>
        <div class="post">
        {{ [stanzas].content }}
        </div>
        *}
    """)
    pattern=template.substitute(author=u'flp713')
    pattern=scrapemark.compile(pattern)
    stanzas=scrapemark.scrape(pattern, url=url, encoding=encoding)['stanzas']
    return stanzas

Example #19

0

Show file

def pagefetch(p_url, debug=False):
    html = urllib2.urlopen(p_url).read()
    results = scrapemark.scrape(
        """{*
              <div id="srp">
              <ul id="results">
               {*
               <li>
                <a><img alt="" src={{[thumbs]}}/> </a>
                <div class="result-info">
                    <h3><a href="speaker.php?{{[links]}}">{{[names]}}</a></h3>
                </div>
               </li>
               *}</ul>
               <p class="pagination">
               <a href="results.php?{{[nxurl]}}">Next</a></p>
              </div>
            *}""", html)
    if debug:
        print "Fetched Names:", len(results['names'])
        print "Fetched Relinks:", len(results['links'])
        print "Current Page:", p_url
        print "Next Page:", results['nxurl']
        return results
    else:
        return results

Example #20

0

Show file

    def post(self):
        logging.debug('ItemHandler.post')
        url = self.request.get('url')

        detail = scrapemark.scrape("""
                        {* <tr><td><font>{{ name }}</font></td></tr>  *}
                        {* <tr><th>Specialty</th><td>{{ specialty }}</td></tr>  *}
                        {* <tr><th>Facility</th><td>{{ facility }}</td></tr>  *}
                        {* <tr><th>Address</th><td>{{ address|html }}</td></tr>  *}
                        {* <tr><th>Phone</th><td>{{ phone }}</td></tr>  *}
                        {* <tr><th>Certification</th><td>{{ certification }}</td></tr>  *}
                        {* <tr><th>Medical School</th><td>{{ school }}</td></tr>  *}
                        {* <tr><th>Residency</th><td>{{ residence }}</td></tr>  *}
                        {* <tr><th>Gender</th><td>{{ gender }}</td></tr>  *}
                        """,
                                   url=url)

        address = detail['address'].replace('<br>', '\n').replace(
            '\t', '').replace('\r', '').replace('\n\n', '\n')
        office = models.Office.getOrCreate(detail['facility'], address,
                                           detail['phone'])

        detail['specialties'] = [
            i.strip() for i in detail['specialty'].split(';')
        ]
        doc = models.Doc(**detail)
        doc.office = office
        doc.put()

Example #21

0

Show file

File: spine_direct_to_xmlrdf.py Project: yuandra/scraperwiki-scraper-vault

def getIO(name, urlz):
    ios = scrapemark.scrape("""
        {*

    <td><font>{{ [io].direction }}</font></td>
    <td><font>{{ [io].ft }}</font></td>
    <td><font>{{ [io].substance }}</font></td>
    <td>{{ [io].value }}</font></td>
    <td>{{ [io].min }}</td>
    <td>{{ [io].max }}</td>
    <td>{{ [io].std }}</td>
    <td><font>{{ [io].unit }}</font></td>
    <td><font>{{ [io].environment }}</font></td>
    <td><font>{{ [io].geo }}</font></td>
    </tr>
        *}
        """,
        url=urlz)
    inventorystr = ""
    for flow in ios['io']:
        if flow['direction'] == "Input" or flow['direction'] == "Output":
            inventorystr = inventorystr + "<eco:hasUnallocatedExchange>";
            inventorystr = inventorystr + '<eco:hasEffect><rdfs:type rdf:resource="eco:' + flow['direction'] + '" /><eco:hasTransferable><eco:Substance><rdfs:label>' + flow['substance'] + '</rdfs:label></eco:Substance></eco:hasTransferable></eco:hasEffect>'
            inventorystr = inventorystr + "<eco:hasQuantity><eco:hasUnitOfMeasure>" + flow["unit"] + "</eco:hasUnitOfMeasure><eco:hasMagnitude>" + flow["value"] + "</eco:hasMagnitude><ecoUD:maxValue>" + flow["max"] + "</ecoUD:maxValue><ecoUD:minValue>" + flow["min"] + "</ecoUD:minValue><ecoUD:maxValue>" + flow["max"] + "</ecoUD:maxValue><ecoUD:ecoUD:standardDeviation95>" + flow["std"] + "</ecoUS:ecoUD:standardDeviation95></eco:hasQuantity>";
            inventorystr = inventorystr + '</eco:hasUnallocatedExchange>';
    return inventorystr

Example #22

0

Show file

File: parser.py Project: bloodpet/metromanilatraffic.com

def parse_section(section):
    section_data = None
    for section_pattern in section_patterns:
        test_section_data = scrape(section_pattern, section)
        if test_section_data is not None:
            section_data = test_section_data
    if section_data is None:
        #print section
        return {}
        #return section
    recheck = False
    try:
        section_data['start']
    except KeyError:
        pass
    else:
        if ' to ' in section_data['start']:
            section_data['start'], section_data['end'] = section_data['start'].split(' to ')
        #TODO section_patterns: Fix the patterns above to avoid doing this hack
        if 'end' in section_data and \
                section_data['start'].lower().endswith('san') and \
                section_data['end'].lower().startswith('lan to '):
            section_data['start'] = 'Santolan'
            section_data['end'] = section_data['end'].lower().replace('lan to ', '')
    if isinstance(section_data['stat'], list):
        section_data['stat'] = '-'.join(section_data['stat'])
    is_saved = False
    if 'stat' not in section_data:
        #print section
        return {}
        #return section
    return section_data

Example #23

0

Show file

File: california_sex_offenders_2.py Project: flyeven/scraperwiki-scraper-vault

def fetchresultpage(sessionid,pagenumber,county):
    try:
        result = scrapemark.scrape("""
        <tr nowrap="" align="left" valign="top"></tr>
        {*
                <tr align='left'>
                {*
                <td align='center'></td>
                <td align='center'></td>

                <td>
                    <a href="javascript: OpenDetail('{{ [offenders].uniqueid }}')">
                        {{ [offenders].name }}
                    </a>
                </td>
{#
                <td>
                    {{ [offenders].address }}
                </td>

                <td>{{ [offenders].city }}</td>

                <td align='center'>{{ [offenders].zip }}</td>

                <td>{{ [offenders].county }}</td>

#}                    
                *}
                </tr>
        *}
        """,
        url='http://www.meganslaw.ca.gov/cgi/prosoma.dll?w6='+sessionid+'&searchby=CountyList&SelectCounty='+county+'&SB=0&PageNo='+str(pagenumber))
    except:
        return "Error"
    return result

Example #24

0

Show file

File: aisex.py Project: motord/lemonade

def harvest():
    squeezed=memcache.get('Squeezed::lemons')
    if squeezed is None:
        squeezed=Squeezed.get_by_key_name('squeezed')
        if squeezed is None:
            fresh=(baseurl+lemon['url'] for lemon in lemons())
        else:
            fresh=(baseurl+lemon['url'] for lemon in lemons() if lemon['url'] not in squeezed.lemons)
    else:
        fresh=(baseurl+lemon['url'] for lemon in lemons() if lemon['url'] not in squeezed.lemons)
    bucket=[]
    for lemon in fresh:
        logging.info('squeezing '+lemon)
        juices = scrapemark.scrape("""
            <span class='tpc_title'></span>
            {*
            <img src='{{ [juices].image }}' border=0>
            <a href='{{ [juices].download }}' target=_blank></a>
            *}
            """, url=lemon)['juices']
        logging.info(juices)
        for juice in juices:
            try:
                juice=Juice(key_name=lemon, image=juice['image'], download=juice['download'])
                juice.put()
            except BadValueError:
                logging.info(juice)
        bucket.append(lemon)
    if squeezed is None:
        squeezed=Squeezed(key_name='squeezed', lemons=bucket)
    else:
        squeezed.lemons.extend(bucket)
    squeezed.put()
    memcache.set('Squeezed::lemons', squeezed)

Example #25

0

Show file

File: tianyajingji.py Project: motord/coconut

def process(thread):
    for url in pages(thread):
        stanza_template=Template(u"""
        {*
        <table id="firstAuthor">
		    <tr>
                <td>
                    <a>${author}</a> &nbsp;发表日期：{{ [stanzas].datetime }}
		        </td>
		    </tr>
	    </table>
	    <div id="pContentDiv">
	        <div class="post">
	        {{ [stanzas].content|html }}
	        </div>
	    </div>
        *}
        {*
        <table>
		    <tr>
                <td>
                    <a>${author}</a>　回复日期：{{ [stanzas].datetime }}
		        </td>
		    </tr>
	    </table>
        <div class="post">
        {{ [stanzas].content|html }}
        </div>
        *}
        """)
        logging.info(thread['author'])
        pattern=scrapemark.compile(stanza_template.substitute(author=thread['author']))
        logging.info(pattern)
        thread['stanzas'][url]=scrapemark.scrape(pattern, url=url, encoding=encoding)['stanzas']
        logging.info(thread['stanzas'][url])

Example #26

0

Show file

File: boarding_school_usa_canada.py Project: carriercomm/scraperwiki-scraper-vault

def getbdschools(url):
    return scrapemark.scrape("""
            {*
                  <table>
                        <tr></tr>
                        <tr></tr>
                        {*
                           <tr>
                               <td><a href='{@{*<div class="contactinfonew"></div><div id="heading3"></div>
                                      <div class="addressbar contactinfonew">
                                      <div><div>{{[saddress]}}</div>
                                                <div>tel:{{[sphone]}}</div>
                                      </div>
                                      <div>
                                                <div><a></a><a href={{[sweb]}} target="new"></a></div>
                                                <div><a></a></div>
                                      </div>
                                      <div><a></a><a></a></div>
                                      <div><br>{{[sbrief]}}</div>
                                </div>*} @}'>{{[sname]}}</a></td>
                               <td><div>{{[stype]}}</div></td>
                               <td><div>{{[sgrade]}}</div></td>
                               <td><div>{{[scity]}}</div></td>
                           </tr>
                        *}
               </table>
            *}
            """,url=url)

Example #27

0

Show file

File: spine_to_rdfxml.py Project: yuandra/scraperwiki-scraper-vault

def getIO(name, urlz):
    #print url
    #html = scraperwiki.scrape(url)
    #soup = BeautifulSoup(html)
    ios = scrapemark.scrape("""
        {*

    <td><font>{{ [io].direction }}</font></td>
    <td><font>{{ [io].ft }}</font></td>
    <td><font>{{ [io].substance }}</font></td>
    <td>{{ [io].value }}</font></td>
    <td>{{ [io].min }}</td>
    <td>{{ [io].max }}</td>
    <td>{{ [io].std }}</td>
    <td><font>{{ [io].unit }}</font></td>
    <td><font>{{ [io].environment }}</font></td>
    <td><font>{{ [io].geo }}</font></td>
    </tr>
        *}
        """,
        url=urlz)
    for flow in ios['io']:
        if flow['direction'] == "Input" or flow['direction'] == "Output":
            scraperwiki.sqlite.execute("insert into SPINEIO values (?,?,?,?,?,?,?,?,?,?,?)", (name,flow['direction'],flow['ft'],flow['substance'],flow['value'],flow['min'],flow['max'],flow['std'],flow['unit'],flow['environment'],flow['geo']))
            scraperwiki.sqlite.commit()

Example #28

0

Show file

File: nz_auckland_food_grades.py Project: yuandra/scraperwiki-scraper-vault

def main():
    #  Fetch last page index
    last_page = scraperwiki.sqlite.get_var('last_page', default=0)
    #Scrape initial list
    p = scrape(PAGE_LIST_PATTERN, url=LIST_URL)
    # print p
    print 'starting from ' + str(last_page)
    
    # 
    if last_page == 0:
        print 'first page? '
        # Scrape the first list page
        scrape_list(LIST_URL)
    
    # slice from last index
    p = p[last_page:]
    # print p
        
    # Scrape each list page
    for page in p:
        # print 'scraping page : ' + str(page)
        url = "%s&intPageNumber=%d" % (LIST_URL, page)
        # print url
        scrape_list(url)
        # save page index
        scraperwiki.sqlite.save_var('last_page', page-1)
        
    # reset page index to 0
    scraperwiki.sqlite.save_var('last_page', 0)

Example #29

0

Show file

File: parser.py Project: bloodpet/metromanilatraffic.com

def parse_entry(entry):
    updated_at = entry.updated_at
    # Add 8 hours to consider Asia/Manila timezone
    #updated_at = updated_at + datetime.timedelta(0, 8 * 60 * 60)
    now = datetime.datetime.now()
    if updated_at.day > now.day:
        updated_at = updated_at - datetime.timedelta(1)
    text = entry.text
    text = re.sub('%s[, ]?' % entry.road.name, '', text, flags=re.IGNORECASE)
    text = re.sub('http://twitpic.com/[A-Za-z0-9] ?', '', text, flags=re.IGNORECASE)
    data = None
    # Figure out if the data would make sense.
    for main_pattern in main_patterns:
        test_data = scrape(main_pattern, text)
        if test_data is not None:
            data = test_data
            break
    if data is None:
        return
    # Get the time
    #print entry.road, updated_at.strftime('%d-%H:%M'),
    stat_time = data.get('time', None)
    if stat_time:
        if 'pm' in stat_time.lower():
            add12 = True
        else:
            add12 = False
        try:
            stat_time = datetime.datetime.strptime(stat_time.replace(' ', ''), '%H:%M%p')
        except KeyError, e:
            stat_time = updated_at
        except ValueError, e:
            #print stat_time.replace(' ', ''), e
            stat_time = updated_at

Example #30

0

Show file

File: substitutes.py Project: doublea/gobtrans-api

def parse_list(resp):
    html = BeautifulSoup(resp.body).prettify()

    members = scrape(
        """{* 
            <tr>
                <td>
                    <a href='{{ [res].idlink }}'>{{ [res].name }}</a>
                    {* <strong>({{ [res].ref }})</strong> *}
                </td>
                <td>
                    <font>partido {{ [res].party }}</font>
                </td>
            </tr>
        *}""",
        html=html)['res']

    # TODO: The president of the chamber may appear only in a footer. Add him
    #       to the members list.

    sel = HtmlXPathSelector(resp)
    trs = sel.select('//tr/td[@align="RIGHT" and @valign="TOP" and @width="5%"]/font/strong/../../..')
    refs = {}
    for tr in trs:
        ref = tr.select('.//strong[starts-with(text(), "(")]/text()')[0].extract()[1:-1]
        sub_info = "".join(tr.select('.//td[2]/font/descendant-or-self::*/text()').extract())
        refs[ref] = sub_info

    items = []
    for info in members:
        #since = None
        #to = None
        why = None
        #substitutes = None
        if 'ref' in info and info['ref'] is not None:
            why = refs[info['ref']]

            #substitutes = sub_info['name']
            #range = get_substitution_range(sub_info['why'])
            #why = get_substitution_reason(sub_info['why'])

            #if len(range) > 0:
            #    since = range[0]
            #if len(range) > 1:
            #    to = range[1]

        date = resp.meta['date']
        id = extract_id_link(info['idlink']) + date.strftime(DATE_FMT)
        items.append(SubstitutesItem(id=id,
                                     date=date,
                                     name=info['name'],
                                     party=info['party'], 
                                     chamber=resp.url[-1],
                                     #substitutes=substitutes,
                                     #substitutes_from=since,
                                     #substitutes_to=to,
                                     substitutes_line=why))

    return items

Example #31

0

Show file

File: sg-1_and_stargate_atlantis_viewing_order.py Project: carriercomm/scraperwiki-scraper-vault

def scrapeEpisodes(url):
    return scrapemark.scrape("""
        {*
        <td class="summary">"<b>{{ [episode].name }}</b>"</td>
        <span class="bday dtstart published updated">{{ [episode].date }}</span>
        *}
        """,
        url=url)

Example #32

0

Show file

def iter_mailing_list_quake_refs():
    base = "http://lists.geonet.org.nz/pipermail/eqnews/"
    index_urls = scrape(PIPERMAIL_INDEX_PATTERN, url=base)
    index_urls.reverse()
    index_urls = index_urls[
        96:]  ### REMEMBER TO DELETE WHEN FIRST COMPELETE RUN WORKS
    print index_urls
    for month in index_urls:
        print month
        messages = scrape(PIPERMAIL_MONTH_PATTERN, url=base + month)
        messages = [
            base + month.replace('date.html', link) for link in messages
            if '0' in link
        ]
        print messages
        for message in messages:
            yield scrape(PIPERMAIL_MESSAGE_PATTERN, url=message)

Example #33

0

Show file

File: sg-1_and_stargate_atlantis_viewing_order.py Project: yuandra/scraperwiki-scraper-vault

def scrapeEpisodes(url):
    return scrapemark.scrape("""
        {*
        <td class="summary">"<b>{{ [episode].name }}</b>"</td>
        <span class="bday dtstart published updated">{{ [episode].date }}</span>
        *}
        """,
                             url=url)

Example #34

0

Show file

def get_values_for_station_and_day(station, date):
    datestring = date.strftime('%d.%m.%Y')
    now = datetime.today()
    url = 'http://luadb.lds.nrw.de/LUA/wiski/pegel.php?stationsname_n=' + station + '&meindatum=' + datestring + '&tabellet=Tabelle'
    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.open(url)
    assert br.viewing_html()
    data = scrapemark.scrape(
        """
            {*
            <td class='messwerte'>{{ [values].datetime }}</td> 
            <td class='messwerte'>{{ [values].value|float }}&nbsp;</td>
            *}
        """,
        br.response().read())
    if 'values' in data:
        datasets = []
        #print data['values']
        for row in data['values']:
            #print station, row['datetime'], ("%.2f" % row['value'])
            # datetime string can be "DD.MM HH:MM" or "HH:MM"
            match1 = re.match(
                r"([0-9]{2})\.([0-9]{2})\s+([0-9]{2}):([0-9]{2})",
                row['datetime'])
            match2 = re.match(r"([0-9]{2}):([0-9]{2})", row['datetime'])
            year = None
            if match1 is not None:
                day = match1.group(1)
                month = match1.group(2)
                year = now.year
                hour = match1.group(3)
                minute = match1.group(4)
                if now.day == 1 and now.month == 1 and day == 31 and month == 12:
                    year = year - 1
            elif match2 is not None:
                day = date.day
                month = date.month
                year = date.year
                hour = match2.group(1)
                minute = match2.group(2)
            if year is not None:
                mez_timestamp = int(
                    datetime(int(year), int(month), int(day), int(hour),
                             int(minute)).strftime('%s'))
                utc_timestamp = mez_timestamp - 3600
                utcdate = datetime.fromtimestamp(utc_timestamp)
                datasets.append({
                    'station':
                    station,
                    'datetime_utc':
                    utcdate.strftime('%Y-%m-%d %H:%S'),
                    'value': ("%.2f" % row['value'])
                })
        scraperwiki.sqlite.save(unique_keys=['datetime_utc', 'station'],
                                data=datasets,
                                table_name="raindata")
        return len(datasets)

Example #35

0

Show file

File: art.py Project: flyeven/scraperwiki-scraper-vault

def main():
    movements = scrape(MOVEMENTS_INDEX, html=get_page(URL))
    print movements
    for m in movements["movements"]:
        if "artcyclopedia.com" in m["link"]:
            movement = scrape(MOVEMENTS_INDIVIDUAL, html=get_page(m["link"]))
            print m["title"]
            if not movement:
                movement = scrape(MOVEMENTS_INDIVIDUAL2, html=get_page(m["link"]))

            relations = []
            for relation in movement["related"]:
                r = dict(movement=m["title"], related_to=relation["topic"])
                if "/artists/" in relation["link"]:
                    r["topic"] = "artist"
                else:
                    r["topic"] = "movement"
                relations.append(r)

            artists = []
            for artist in movement["artists"]:
                artist["movement"] = m["title"]
                dates = artist["alive"].split("-")
                try:
                    artist["birth_year"] = int(dates[0])
                    artist["death_year"] = int(dates[1])
                except ValueError:
                    if "Born" in dates:
                        artist["birth_year"] = int(dates.split()[1])
                        artist["death_year"] = None
                except:
                    print >> sys.stderr, "ERROR: Can't parse dates for %s: %s" % (artist["name"], artist["alive"])
                    artist["birth_year"] = None
                    artist["death_year"] = None
                artist["profile_link"] = URL + artist["profile_link"][3:]
                try:
                    artist["nationality"], artist["profession"] = artist["artist_type"].split(" ", 1)
                except ValueError:
                    artist["nationality"] = artist["artist_type"]
                    artist["profession"] = "unknown"

                artists.append(artist)
            datastore.save(["name"], table_name="movements", data=dict(name=m["title"], link=m["link"]))
            datastore.save(["movement", "related_to"], table_name="relations", data=relations)
            datastore.save(["name", "nationality"], table_name="artists", data=artists)

Example #36

0

Show file

File: scraper.py Project: backgroundcheck/new_zealand_earthquakes2

def iter_recent_quakes():
    for quake in scrape(pattern=RECENT_QUAKES_PATTERN,url=URL)['quakes']:
        quake['url'] = 'http://www.geonet.org.nz' + quake['url']
        quake['shaking_map_url'] = 'http://www.geonet.org.nz' + quake['shaking_map_url']
        quake['maps_url'] = 'http://www.geonet.org.nz' + quake['maps_url']
        quake['img_of_quake_location_url'] = 'http://www.geonet.org.nz' + quake['img_of_quake_location_url']
        for k, val in fetch_quake_data(quake['geonet_ref']).iteritems():
            quake[k] = val
        yield quake

Example #37

0

Show file

File: nz_auckland_food_grades.py Project: yuandra/scraperwiki-scraper-vault

def scrape_list(url):
    #html = mech_scrape(url)
    p = scrape(EST_PATTERN, url=url)
    print p
    for e in p:
        est_url = "%s%s%d" % (BASE_URL, DETAIL_URL, e)
        print 'scraping: ' + est_url
        print 'scraping id: ' + str(e)
        scrape_detail(est_url, e)

Example #38

0

Show file

File: models.py Project: brelig/urlchat

 def fetch_load_url(self):
     pattern = '''
         <title>{{ pagetitle }}</title>
         '''
     
     dict = scrapemark.scrape(pattern, url=self.url)
     
     self.html_title = dict['pagetitle']
     self.fetched_url = True

Example #39

0

Show file

File: substitutes.py Project: GobTrans/gobtrans-api

def parse_list(resp):
    html = BeautifulSoup(resp.body).prettify()

    members = scrape(
        """{* 
            <tr>
                <td>
                    <a href='{{ [res].idlink }}'>{{ [res].name }}</a>
                    {* <strong>({{ [res].ref }})</strong> *}
                </td>
                <td>
                    <font>partido {{ [res].party }}</font>
                </td>
            </tr>
        *}""",
        html=html)['res']

    # TODO: The president of the chamber may appear only in a footer. Add him
    #       to the members list.

    sel = HtmlXPathSelector(resp)
    trs = sel.select('//tr/td[@align="RIGHT" and @valign="TOP" and @width="5%"]/font/strong/../../..')
    refs = {}

    for tr in trs:
        ref = tr.select('.//strong[starts-with(text(), "(")]/text()')[0].extract()[1:-1]
        refs[ref] = tr

    items = []
    for info in members:
        since = None
        to = None
        line = None
        substitutes_name = None
        substitutes_oid = None
        if 'ref' in info and info['ref'] is not None:
            try:
                tr = refs[info['ref']]
            except KeyError:
                logger.warning('Couldnt find reference %s in substitutes table.' % \
                               info['ref'], exc_info=sys.exc_info())
            line = "".join(tr.select('.//td[2]/font/descendant-or-self::*/text()').extract())
            links = tr.select('.//a')
            if links:
                substitutes_oid = extract_id_link(links[0].select('.//@href').extract()[0])[2:]
                substitutes_name = links[0].select('.//text()').extract()[0]
            range = get_substitution_range(line)
            if len(range) > 0:
                try:
                    since = datetime.strptime(range[0], PAGE_DATE_FMT).date()
                except ValueError, e:
                    logger.warning("Unable to parse substitute 'since' date", exc_info=sys.exc_info())
            if len(range) > 1:
                try:
                    to = datetime.strptime(range[1], PAGE_DATE_FMT).date()
                except ValueError, e:
                    logger.warning("Unable to parse substitute 'to' date", exc_info=sys.exc_info())

Example #40

0

Show file

File: nz_auckland_food_grades.py Project: yuandra/scraperwiki-scraper-vault

def scrape_detail(est_url, id):
    
    html = scraperwiki.scrape(est_url)
    est_details = scrape(DETAIL_PATTERN, html)

    if not est_details:
        #Try the exempt pattern
        est_details = scrape(EXEMPT_PATTERN, html)
        
        if not est_details:
            # it's either changed hands and will turn up soon, or it's new
            return
    else:
        # print est_details['inspection_date']
        est_details['inspection_date'] =  datetime.strftime(datetime.strptime(est_details['inspection_date'], '%d/%m/%Y'), '%Y-%m-%d')
        # parser.parse(est_details['inspection_date'])
        # print est_details['inspection_date']        

    # Locate
    # Attempt to find
    sql = 'lat, lng FROM swdata WHERE address = "%s" AND lat IS NOT NULL LIMIT 0,1' % est_details['address']
    latlng = scraperwiki.sqlite.select(sql)
    
    #Avoid multiple google lookups
    if latlng:
        # print 'DB Geo'
        # print latlng
        est_details['lat'] = latlng[0]['lat']
        est_details['lng'] = latlng[0]['lng']
        # print est_details['lat']
    else:
        # print 'Goog lookup'
        location = locate(est_details['address'] + ', Auckland, NZ')
        if location:
            est_details['lat'], est_details['lng'] = location 
        

    #est_details['fg_id'] = id  # Gah! id aint unique??
    #est_details['url'] = est_url # URLs are useless - the IDs float!!?? WTF!?
    
    
    # Save
    scraperwiki.sqlite.save(unique_keys=['name','address','grade','inspection_date'], data=est_details)
    print 'saved'

Example #41

0

Show file

File: spine_to_rdfxml.py Project: yuandra/scraperwiki-scraper-vault

def getEachRecord(name, urlz):
    #print url
    #html = scraperwiki.scrape(url)
    #soup = BeautifulSoup(html)
    #date = soup.find(text="Date Completed").parent.parent.parent.nextSibling.nextSibling.text
    #print date
    inventory = {}
    temp = scrapemark.scrape("""
        {*
    <th align="left" valign="top" width="257" bgcolor="#DCDEE0"><font style="font-size: 11px;" face="Verdana"><em>Date Completed</em></font></th>
    <td valign="top" width="477" bgcolor="#FFFFFF"><font style="font-size: 11px;">{{ date }}</font>
        *}
        """,
        url=urlz)
    inventory['date'] = temp['date']
    temp = scrapemark.scrape("""
        {*
    <th align="left" valign="top" width="257" bgcolor="#DCDEE0"><font style="font-size: 11px;" face="Verdana"><em>Copyright</em></font></th>
    <td valign="top" width="477" bgcolor="#FFFFFF"><font style="font-size: 11px;">{{ copyright }}</font>
        *}
        """,
        url=urlz)
    inventory['copyright'] = temp['copyright']
    temp = scrapemark.scrape("""
        {*
    <th align="left" valign="top" width="257" bgcolor="#DCDEE0"><font style="font-size: 11px;"><em>Process Type</em></font></th>
    <td valign="top" width="477" bgcolor="#FFFFFF"><font style="font-size: 11px;">{{ desc }}</font>

        *}
        """,
        url=urlz)
    inventory['description'] = temp['desc']
    temp = scrapemark.scrape("""
        {*
    <th align="left" valign="top" width="257" bgcolor="#DCDEE0"><font style="font-size: 11px;"><em>Function</em></font></th>
    <td valign="top" width="477" bgcolor="#FFFFFF"><font style="font-size: 11px;">{{ desc }}</font>

        *}
        """,
        url=urlz)
    inventory['description'] = inventory['description'] + ". " + temp['desc']
    scraperwiki.sqlite.execute("insert into SPINE values (?,?,?,?)", (name,inventory['date'],inventory['description'],inventory['copyright']))
    scraperwiki.sqlite.commit()

Example #42

0

Show file

File: nz-early-education-directory.py Project: yuandra/scraperwiki-scraper-vault

def get_csv_link():
    """ Return up-to-date csv link """
    csv_pattern = """{*<a href="{{ href }}" title="Click here to download this file." target="blank">
    ECE Directory .csv*}</a>"""
    det = scrape(
        csv_pattern,
        url=
        'http://www.educationcounts.govt.nz/directories/early-childhood-services'
    )
    return 'http://www.educationcounts.govt.nz%s' % det['href']

Example #43

0

Show file

File: swift_codes.py Project: rayassch/scraperwiki-scraper-vault

def swift_codes(queue=Q):
    print 'Getting countries'
    raw = GET(SWIFT_URL)
    print raw
    countries = scrape(COUNTRY_PATTERN, html=raw, headers=HEADERS)['countries']
    print countries
    for country in countries:
        print country
        country['link'] = BASE_URL + country['link']
        queue.push((parse_swift_code_page, country['link'], country['name']))

Example #44

0

Show file

File: swift_codes.py Project: flyeven/scraperwiki-scraper-vault

def swift_codes(queue=Q):
    print 'Getting countries'
    raw = GET(SWIFT_URL)
    print raw
    countries = scrape(COUNTRY_PATTERN, html=raw, headers=HEADERS)['countries']
    print countries
    for country in countries:
        print country
        country['link'] = BASE_URL + country['link']
        queue.push((parse_swift_code_page, country['link'], country['name']))

Example #45

0

Show file

File: niederschlag-hessen.py Project: flyeven/scraperwiki-scraper-vault

def get_values():
    """Get actual readings from the stations"""
    baseurl = 'http://www.hlug.de/static/pegel/static/'
    listpageurl = baseurl + "list_N_0.htm?entryparakey=N"
    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.open(listpageurl)
    assert br.viewing_html()
    links = []
    for link in br.links(url_regex=".*stat_[0-9]+.htm\?entryparakey=N"):
        links.append(link.url)
    links = shuffle(links)
    for link in links:
        subpageurl = baseurl + link
        print "Fetching", subpageurl
        br.open(subpageurl)
        html = br.response().read()
        station = scrapemark.scrape("""
                <table class="wwp_sdheader" cellpadding=4 width=720>
                    <tr>
                        <td class="wwp_sdheader" colspan=6>Station</td>
                    </tr>
                    <tr>
                        <td class="head">Name</td><td class="td1">{{ name }}</td>
                        <td class="head">Messstellen-Nr.</td><td class="td1">{{ id|int }}</td>
                        <td class="head">Flussgebiet</td><td class="td1">{{ river }}</td>
                    </tr>
                </table>
                <a target="_blank" class="graphlink" href="data_{{ linkid }}_N_WEEK.xls">4-Tage</a>
            """,
            html)
        #print station
        if station is not None and 'linkid' in station:
            excelurl = baseurl + 'data_'+ station['linkid'] +'_N_WEEK.xls'
            print excelurl
            book = xlrd.open_workbook(file_contents=urllib.urlopen(excelurl).read())
            if book:
                sheet = book.sheets()[0]
                if sheet.ncols == 2 and sheet.nrows > 0:
                    values = []
                    for rownumber in range(3, sheet.nrows): # skip first 3 rows
                        (datecell, numcell) = [ sheet.cell(rownumber, j)  for j in range(sheet.ncols) ]
                        #print "%s, %.1f" % (datecell.value, numcell.value)
                        match = re.match(r"([0-9]{2})\.([0-9]{2})\.([0-9]{4})\s([0-9]{2}:[0-9]{2})", datecell.value)
                        if match is not None:
                            values.append({
                                'datetime': match.group(3) + '-' + match.group(2) + '-' + match.group(1) + ' ' + match.group(4),
                                'station_id': station['id'],
                                'rain_mm': ("%.1f" % numcell.value),
                                
                            })
                        #print values
                    scraperwiki.sqlite.save(unique_keys=['datetime', 'station_id'], data=values, table_name="raindata")
        else:
            print "WARN: No workable data found."

Example #46

0

Show file

    def post(self):
        logging.debug('SyncHandler.post')
        scrape = scrapemark.scrape("""
                    {* <tr class='metalist'><td><a href='{{[details]}}'></a></td></tr> *}
                """,
                                   url=LIST_URL)

        for detailurl in scrape['details']:
            taskqueue.add(url='/tasks/item', params={'url': detailurl})

        self.redirect('/')

Example #47

0

Show file

def GetPage(fileid):

    try:
        fin = urllib2.urlopen(base_url + fileid)
        text = fin.read()
        fin.close()

        pprint(text)

        #test for no match
        no_match = (scrape("""
<hr>There are {{ }} that match your search criteria.<br>
            """,
                           html=text))
        print no_match
        #TODO: Save no match
        #if no_match == "no entries":

        #basic details:
        basic_details = (scrape("""
<span class=detailstext>Registration Number: {{ [y].reg_no }}</span><P><span class=detailstext>Date Registered:&nbsp;</span>{{ [y].reg_date }}&nbsp;&nbsp;&nbsp;&nbsp;<span class=detailstext>Registration Expires:&nbsp;</span>{{ [y].reg_expiry }}<br><br><span class=detailstext>Data Controller:&nbsp;</span>{{ [y].data_controller }}<P><div class=detailstext>Address:</div><Blockquote>{{ [y].reg_address|html }}</BlockQuote><hr>
            """,
                                html=text))
        print basic_details

        debug((len(basic_details['y']), "items found"))
        debug(basic_details['y'])

        #foi:
        foi = (scrape("""
<P ALIGN=center class=detailstext>{{ }} or a Scottish public authority
            """,
                      html=text))
        print foi
        #if foi == "Freedom of Information Act 2000":

#<P class=detailstext>Other Names:</P><BlockQuote>FIRST MONEY DIRECT<br>FIRSTMONEYDIRECT.CO.UK<br></BlockQuote></BlockQuote><hr>

    except Exception, e:
        print e
        return

Example #48

0

Show file

def parse_search_results(url, first=False):
    pattern = """{*
<td>{{ [lobbyists]].type }}:<strong>{{ [lobbyists]].name }}</strong>

{{ [lobbyists].lobbyist_details|html }}<a href="{{ [lobbyists].communication_reports_link|abs }}">View communication reports</a>
</td>
          <td class="tableTop">          
            <a href="{{ [lobbyists].registration_link|abs }}>
              {{ [lobbyists].registration_begining }}to{{ [lobbyists].registration_ending }}
            </a>
          </td>
*}

{* <a href="{{ next|abs }}">Next</a> *}
"""
    if first:
        res = scrape(pattern=pattern, url=url, post=params, cookie_jar=CJ)
    else:
        res = scrape(pattern=pattern, url=url, cookie_jar=CJ)
    print res
    lobbyists = res['lobbyists']
    next_page_url = res['next']
    print next_page_url
    for lobbyist in lobbyists:
        details = html.fromstring(lobbyist['lobbyist_details'])
        if lobbyist['type'] == u'Consultant':
            lobbyist['consulting_firm'] = details[1].text
            lobbyist['client'] = details[3].text
            lobbyist['lobbyist_id'] = details[4].tail.strip()
        elif lobbyist['type'] == u'In-house Organization' or lobbyist[
                'type'] == u'In-house Corporation':
            lobbyist['responsible_officer'] = ' '.join(
                part.strip() for part in details[1].text.split())
            lobbyist['lobbyist_id'] = details[2].tail.strip()
        else:
            print 'CRAZINESS: new type found: ', lobbyist['type'],
            print lobbyist
            raise ValueError
        del lobbyist['lobbyist_details']
        Q.put((comms_report_index, lobbyist['communication_reports_link']))
        Q.put((registration, lobbyist['registration_link']))

Example #49

0

Show file

File: yahoo.py Project: bombpersons/WaveDict

	def getDefinition(self, html):
		definition  = scrape(""" 
		<table border=0 cellspacing=10 cellpadding=0 width=100%> 
			<tr> 
				<td> 			
					{{ }}
				</td> 
			</tr> 
		</table> 
		""", html)
		
		return definition

Example #50

0

Show file

def iter_recent_quakes():
    for quake in scrape(pattern=RECENT_QUAKES_PATTERN, url=URL)['quakes']:
        quake['url'] = 'http://www.geonet.org.nz' + quake['url']
        quake['shaking_map_url'] = 'http://www.geonet.org.nz' + quake[
            'shaking_map_url']
        quake['maps_url'] = 'http://www.geonet.org.nz' + quake['maps_url']
        quake[
            'img_of_quake_location_url'] = 'http://www.geonet.org.nz' + quake[
                'img_of_quake_location_url']
        for k, val in fetch_quake_data(quake['geonet_ref']).iteritems():
            quake[k] = val
        yield quake

Example #51

0

Show file

def fetchsession():
    global fetchnumber
    global sessionid
    if (fetchnumber > 50) or (sessionid == ""):
        sessionurl = 'http://www.meganslaw.ca.gov/cgi/prosoma.dll?searchby=curno'
        result = scrapemark.scrape("{{ page.text }}", url=sessionurl)
        sessionid = str(result['page']['text'])
    if fetchnumber <= 50:
        fetchnumber += 1
    else:
        fetchnumber = 0
    return sessionid

Example #52

0

Show file

File: ico_lines_to_take.py Project: rayassch/scraperwiki-scraper-vault

def GetListOfLtt():

    ltt = (scrape("""
        <table>
        {*
            <td>{{ [y].ltt_id }} withdrawn</td>
        *}
        </table>
        """,
                  url=base_url))

    if ltt != None:
        if 'y' in ltt:
            debug((len(ltt['y']), "items found"))
            debug(ltt['y'])
            for k in ltt['y']:
                k['ltt_status'] = "WITHDRAWN"
                k['date_scraped'] = ''
                scraperwiki.sqlite.save(unique_keys=["ltt_id"],
                                        data=k,
                                        table_name="ltt_data")

    ltt = (scrape("""
        <table>
        {*
            <td><a href='{{ [y].ltt_url|abs }}'>{{ [y].ltt_id }}</a></td>
        *}
        </table>
        """,
                  url=base_url))

    if ltt != None:
        if 'y' in ltt:
            debug((len(ltt['y']), "items found"))
            debug(ltt['y'])
            for k in ltt['y']:
                k['ltt_status'] = "ACTIVE"
                k['date_scraped'] = ''
                GetLtt(k['ltt_url'])