def extract_construction_profs(details):
    blanket = """<h6>Construction Professionals</h6>{{ pros|html }}"""
    targeted = """<h6>Construction Professionals</h6>{* <a href="javascript:viewBio('{{ [pros].id }}');">{{ [pros].name }} </a> *}"""
    pros = scrape(targeted, html=details)["pros"]
    if not pros:
        pros = scrape(blanket, html=details)["pros"]
    return "pros", pros
def parse_swift_code_page(url, country_name, queue=Q):
    if url in DONE:
        return None
    print "downloading", country_name
    raw = get_country_html(url)

    banks = scrape(SWIFT_CODE_PATTERN, html=raw)["banks"]
    for bank in banks:
        bank["address"] = cleanup_address(bank["address"])
        bank["country_name"] = country_name
        bank["source"] = url
    sqlite.save(["swift_code"], data=banks, table_name="swift_codes")

    if "page=" not in url:
        try:
            n_pages = max(int(link.split("=")[-1]) for link in scrape(PAGINATION_PATTERN, html=raw))
            pages = [
                BASE_URL
                + "/swift-code/search-swift-complete.php?country=%s&page=%d" % (country_name.replace(" ", "%20"), n)
                for n in xrange(n_pages)
            ]
        except ValueError:  # no more pages
            pages = []
        for newurl in pages:
            queue.push((parse_swift_code_page, newurl, country_name))
    DONE.add(url)
    sqlite.save(["url"], table_name="_done", data=dict(url=url))
def main():
    links = scrape(PATTERN, url=URL)
    print links
    # done= set([res['nzgls_identifier'] for res in sw.sqlite.select('nzgls_identifier FROM bills')])
    # print done
    for link in links["sources"]:
        bills = scrape(PATTERN2, url=link)["bills"]
        print bills
        for bill in bills:
            print bill
            try:
                bill = scrape(INDIVIDUAL_BILL, url=bill)
            except Exception, e:
                print "DEBUG: %s" % e
                continue
            bill["link"] = link
            do_details(bill)
            do_meta(bill)
            do_related(bill)
            for related_doc in bill["related"]:
                related_doc["nzgls_identifier"] = bill["nzgls_identifier"]
                related_doc["bill"] = bill["title"]
            sw.sqlite.save(["link"], data=bill["related"], table_name="related_docs")
            cleanup(bill)
            sw.sqlite.save(["link", "valid_from"], data=bill, table_name="bills")
def corporation_registration(url):
    pattern = """Corporation:{{ name }}Name change history
Responsible Officer:{{ responsible_officer_name }}
Position Title:    {{ responsible_officer_title }}
Version:{{ registration_id }}
Type:{{ registration_type }}
Active from:{{ registration_active_from_date }}
Activity last confirmed:{{ registration_last_confirmed_date }}

A. Information about Responsible Officer and Corporation
Corporation:{{ corporation_name }}
Telephone number:{{ corporation_phone }}
Fax number:{{ corporation_fax }}
Description of the corporation's business activities: {{ corporation_business_activities }}
 
Parent:{{ parent|html }}
Subsidiary:{{ subsidiary|html }}
Was the corporation funded in whole or in part by any domestic or foreign government institution in the last completed financial year, or does the client expect funding in the current financial year?{{ is_government_funded }}

B. Lobbyists Employed by the Corporation
List of Senior Officers whose lobbying activities represent less than 20% of their Duties
{*
Name:{{ [lobbyists].name }}
Position title:{{ [lobbyists].title }}
Public offices held:{{ [lobbyists].public_offices_held }}
Designated public office holder:{{ [lobbyists].is_public_officer }}Name
*}{*
Name:{{ [lobbyists].name }}
Position title:{{ [lobbyists].title }}
Public offices held:{{ [lobbyists].public_offices_held }}
Designated public office holder:{{ [lobbyists].is_public_officer }}
*}

C. Lobbying Activity Information
Federal departments or organizations which have been or will be communicated with during the course of the undertaking: {{ agencies_talked_to }}
Communication techniques that have been used or are expected to be used in the course of the undertaking: 
{{ lobbying_activities }}
Information about Subject matter:{{ lobbying_subject_matter }}
 
Details Regarding the Identified Subject Matter
"""
    subject_matter_pattern = """Details Regarding the Identified Subject Matter
{* <tr><td>{{ [topics].category }}</td><td>{{ [topics].description }}</td></tr> *} 
"""
    page = GET(url)
    registration = scrape(pattern, html=html.tostring(html.fromstring(page), encoding="utf-8", method="text"))
    registration["lobbyists"] = [l for l in registration["lobbyists"] if len(l["is_public_officer"].split()) == 1]
    registration["topics"] = scrape(subject_matter_pattern, html=page)
    registration["parent"] = registration["parent"].strip()
    registration["parent_name"] = registration["parent"].split("\n")[0]
    registration["subsidiary"] = registration["subsidiary"].strip()
    registration["subsidiary_name"] = registration["subsidiary"].split("\n")[0]
def StartUp():

    # go to homepage in order to set session cookie
    start_url = "https://delecorp.delaware.gov/tin/GINameSearch.jsp"
    p = ""
    g = {"x": str(time())}
    html = ""
    uastart = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:9.0) Gecko/20100101 Firefox/9.0"}
    try:
        html = scrape(
            """
<html>
{{ [y].html }}
</html>
""",
            url=start_url,
            get=g,
            headers=uastart,
            cookie_jar=myjar,
        )

    except BadStatusLine:
        # hmmm... will skip this check for now..
        return 0
    except Exception, e:
        debug(repr(e))
        debug("scrape problem")
        return 1
def get_session_attendants(id):
    """
		Get list of people who have attended a session
	"""
    global db
    url = BASEURL + "to0045.asp?__ctext=0&__ksinr=" + str(id)
    print "Lade Anwesenheitsliste", url
    html = urllib2.urlopen(url).read()
    data = scrape(
        """
	{*
		<tr>
			<td><a href="kp0050.asp?__kpenr={{ [attendee].id|int }}&amp;grnr={{ [attendee].grnr|int }}">{{ [attendee].name }}</a></td>
			<td>{{ [attendee].organization }}</td>
			<td>{{ [attendee].function }}</td>
		</tr>
	*}
	""",
        html,
    )
    persons = []
    attendants = []
    for row in data["attendee"]:
        persons.append({"person_id": row["id"], "person_name": row["name"], "person_organization": row["organization"]})
        attendants.append({"session_id": id, "person_id": row["id"], "attendance_function": row["function"]})
    db.save_rows("people", persons, ["person_id"])
    db.save_rows("attendance", attendants, ["session_id", "person_id"])
    def post(self):
        logging.debug("ItemHandler.post")
        url = self.request.get("url")

        detail = scrapemark.scrape(
            """
                        {* <tr><td><font>{{ name }}</font></td></tr>  *}
                        {* <tr><th>Specialty</th><td>{{ specialty }}</td></tr>  *}
                        {* <tr><th>Facility</th><td>{{ facility }}</td></tr>  *}
                        {* <tr><th>Address</th><td>{{ address|html }}</td></tr>  *}
                        {* <tr><th>Phone</th><td>{{ phone }}</td></tr>  *}
                        {* <tr><th>Certification</th><td>{{ certification }}</td></tr>  *}
                        {* <tr><th>Medical School</th><td>{{ school }}</td></tr>  *}
                        {* <tr><th>Residency</th><td>{{ residence }}</td></tr>  *}
                        {* <tr><th>Gender</th><td>{{ gender }}</td></tr>  *}
                        """,
            url=url,
        )

        address = detail["address"].replace("<br>", "\n").replace("\t", "").replace("\r", "").replace("\n\n", "\n")
        office = models.Office.getOrCreate(detail["facility"], address, detail["phone"])

        detail["specialties"] = [i.strip() for i in detail["specialty"].split(";")]
        doc = models.Doc(**detail)
        doc.office = office
        doc.put()
def pages(thread):
    centipede = Centipede.get_by_key_name(thread["url"])
    d = {}
    urls = []
    logging.info(thread["url"])
    centipede_url_components = urlparse.urlparse(thread["url"])
    centipede_url_netloc_path = centipede_url_components.netloc + centipede_url_components.path
    for page in scrapemark.scrape(
        """
        <div class="pages" id="pageDivTop">
        {*
        <a href="{{ [pages] }}"></a>
        *}
        <span></span>
        </div>
        """,
        url=thread["url"],
    )["pages"][:-1]:
        d[page] = 1
    if centipede is None:
        centipede = Centipede(
            key_name=thread["url"],
            species=db.Category(u"天涯经济"),
            author=thread["author"],
            title=thread["title"],
            comments=thread["comments"],
            views=thread["views"],
            pedes=[],
        )
        urls = [db.Link(thread["url"])]
        urls.extend([db.Link(key) for key in d.keys()[:-2]])
        qr_key = centipede_url_components.netloc + ".".join([centipede_url_components.path.split(".")[0], "png"])
        img = urlfetch.fetch(
            "http://chart.apis.google.com/chart?cht=qr&chs=200x200&chl="
            + urllib2.quote(host_url + centipede_url_netloc_path)
        )
        qr_content = StaticContent(key_name=qr_key, body=img.content, content_type="image/png")
        qr_content.put()
    else:
        urls = [db.Link(centipede.next)]
        urls.extend([db.Link(url) for url in d.keys()[:-2] if url not in centipede.pedes])
    logging.info(urls)
    centipede.pedes.extend(urls)
    centipede.next = db.Link(d.keys()[-2])
    for url in urls:
        yield url
    centipede.put()
    content = StaticContent.get_by_key_name(centipede_url_netloc_path)
    stanzas = [stanza for stanza in new_stanzas(thread, centipede)]
    if content is None:
        content = StaticContent(
            key_name=centipede_url_netloc_path,
            template=db.Text(template("centipede.html", centipede=centipede, stanzas=stanzas, template_next=True)),
            content_type="text/html",
        )
    else:
        content.template = db.Text(template(content.template, centipede=centipede, stanzas=stanzas, template_next=True))
    #    db.put(stanzas)
    content.put()
    memcache.delete(content.key().name())
def get_session_attendants(id):
    """
    Scrape the list of (invited) attendees of a session
    """
    global db
    url = config.BASEURL + (config.URI_ATTENDANTS % id)
    print "Lade Anwesenheitsliste", url
    html = urllib2.urlopen(url).read()
    data = scrape(
        """
    {*
        <tr>
            <td><a href="kp0050.asp?__kpenr={{ [attendee].id|int }}&amp;grnr={{ [attendee].grnr|int }}">{{ [attendee].name }}</a></td>
            <td>{{ [attendee].organization }}</td>
            <td>{{ [attendee].function }}</td>
        </tr>
    *}
    """,
        html,
    )
    persons = []
    attendants = []
    for row in data["attendee"]:
        persons.append({"person_id": row["id"], "person_name": row["name"], "person_organization": row["organization"]})
        attendants.append({"session_id": id, "person_id": row["id"], "attendance_function": row["function"]})
    if not options.simulate:
        db.save_rows("people", persons, ["person_id"])
        db.save_rows("attendance", attendants, ["session_id", "person_id"])
def getIO(name, urlz):
    #print url
    #html = scraperwiki.scrape(url)
    #soup = BeautifulSoup(html)
    ios = scrapemark.scrape("""
        {*

    <td><font>{{ [io].direction }}</font></td>
    <td><font>{{ [io].ft }}</font></td>
    <td><font>{{ [io].substance }}</font></td>
    <td>{{ [io].value }}</font></td>
    <td>{{ [io].min }}</td>
    <td>{{ [io].max }}</td>
    <td>{{ [io].std }}</td>
    <td><font>{{ [io].unit }}</font></td>
    <td><font>{{ [io].environment }}</font></td>
    <td><font>{{ [io].geo }}</font></td>
    </tr>
        *}
        """,
        url=urlz)
    for flow in ios["io"]:
        if flow["direction"] == "Input" or flow["direction"] == "Output":
            scraperwiki.sqlite.execute(
                "insert into SPINEIO values (?,?,?,?,?,?,?,?,?,?,?)",
                (
                    name,
                    flow["direction"],
                    flow["ft"],
                    flow["substance"],
                    flow["value"],
                    flow["min"],
                    flow["max"],
                    flow["std"],
                    flow["unit"],
                    flow["environment"],
                    flow["geo"],
                ),
            )
            scraperwiki.sqlite.commit()
def main():
    #  Fetch last page index
    last_page = scraperwiki.sqlite.get_var("last_page", default=0)
    # Scrape initial list
    p = scrape(PAGE_LIST_PATTERN, url=LIST_URL)
    # print p
    print "starting from " + str(last_page)

    #
    if last_page == 0:
        print "first page? "
        # Scrape the first list page
        scrape_list(LIST_URL)

    # slice from last index
    p = p[last_page:]
    # print p

    # Scrape each list page
    for page in p:
        # print 'scraping page : ' + str(page)
        url = "%s&intPageNumber=%d" % (LIST_URL, page)
        # print url
        scrape_list(url)
        # save page index
        scraperwiki.sqlite.save_var("last_page", page - 1)

    # reset page index to 0
    scraperwiki.sqlite.save_var("last_page", 0)
def getbdschools(url):
    return scrapemark.scrape(
        """
            {*
                  <table>
                        <tr></tr>
                        <tr></tr>
                        {*
                           <tr>
                               <td><a href='{@{*<div class="addressbar contactinfonew">
                                      <div><div>{{[saddress]}}<a></a></div>
                                                <div>{{[sphone]}}</div>
                                      </div>
                                      <div><div style="float:left"><a href={{[sweb]}} target="new"></a></div></div>
                                </div>*} @}'>{{[sname]}}</a></td>
                               <td><div>{{[stype]}}</div></td>
                               <td><div>{{[sgrade]}}</div></td>
                               <td><div>{{[scity]}}</div></td>
                           </tr>
                        *}
               </table>
            *}
            """,
        url=url,
    )
def process():
    proposition = scrapemark.scrape(
        """
        <div class="sfc_news_txt">
            <a>{{ deal.from_shop}}</a>
            <a>{{ deal.to_shop}}</a>
            2、指定车型:<span id="modeName1">{{ deal.car_spec }}</span>   {{ deal.inventory }}
            3、可租区间: {{ deal.available_date }}至 {{ deal.expire_date }};
        </div>
        <table class="sfc_tab">
          <tr class="sfc_news_text">
            <td>{{ deal.from_city }}—{{ deal.to_city }}</td>
            <td>{{ deal.days_allowed|int }}天</td>
            <td>{{ deal.mileages_allowed|int }}公里</td>
            <td>{{ deal.gps }}</td>
            <td>{{ deal.baby_seat }}</td>
            <td>{{ deal.listed_price|int }}元</td>
            <td>{{ deal.discount_price|int }}元</td>
            <td>{{ deal.toll }}</td>
          </tr>
        </table>
        """,
        url="http://www.zuche.com//order/ConvenienceInfoControl.do_?convenienceId=10392&mid=81171&cid=81167",
    )["deal"]
    logging.warning(proposition)
def getbdschools(url):
    return scrapemark.scrape(
        """
            {*
                  <table>
                        <tr></tr>
                        <tr></tr>
                        {*
                           <tr>
                               <td><a href='{@{*<div class="contactinfonew"></div><div id="heading3"></div>
                                      <div class="addressbar contactinfonew">
                                      <div><div>{{[saddress]}}</div>
                                                <div>tel:{{[sphone]}}</div>
                                      </div>
                                      <div>
                                                <div><a></a><a href={{[sweb]}} target="new"></a></div>
                                                <div><a></a></div>
                                      </div>
                                      <div><a></a><a></a></div>
                                      <div><br>{{[sbrief]}}</div>
                                </div>*} @}'>{{[sname]}}</a></td>
                               <td><div>{{[stype]}}</div></td>
                               <td><div>{{[sgrade]}}</div></td>
                               <td><div>{{[scity]}}</div></td>
                           </tr>
                        *}
               </table>
            *}
            """,
        url=url,
    )
def GetPage(fileid):

    try:
        terms = scrape(
            """
            {*
    <h2>Full Details</h2>  </div>  <div class='page_summary_3col'></div>  <div class='page_content_3col'><table width='60%'><tr><td colspan='2' class='line'><font size='2'><b>English</b></font></td></tr><tr><td class='line'><font size='2'>Term</font></td><td class='line'><font size='2'>{{ [y].en_term }}</font></td></tr><tr><td class='line'><font size='2'>Definition</font></td><td class='line'><font size='2'>{{ [y].en_definition }}</font></td></tr><tr><td class='line'><font size='2'>Context</font></td><td class='line'><font size='2'>{{ [y].en_context }}</font></td></tr></table><br><table width='60%'><tr><td colspan='2' class='line'><font size='2'><b>Welsh</b></font></td></tr><tr><td class='line'><font size='2'>Term</font></td><td class='line'><font size='2'>{{ [y].cy_term }}</font></td></tr><tr><td class='line'><font size='2'>Definition</font></td><td class='line'><font size='2'>{{ [y].cy_definition }}</font></td></tr><tr><td class='line'><font size='2'>Status</font></td><td class='line'><font size='2'>{{ [y].cy_status }}</font></td></tr><tr><td class='line'><font size='2'>Part of Speech</font></td><td class='line'><font size='2'>{{ [y].cy_part_of_speech }}</font></td></tr><tr><td class='line'><font size='2'>Gender</font></td><td class='line'><font size='2'>{{ [y].cy_gender }}</font></td></tr><tr><td class='line'><font size='2'>Number</font></td><td class='line'><font size='2'>{{ [y].cy_number }}</font></td></tr><tr><td class='line'><font size='2'>Context</font></td><td class='line'><font size='2'>{{ [y].cy_context }}</font></td></tr><tr><td class='line'><font size='2'>Subject :&nbsp;</font></td><td class='line'><font size='2'>{{ [y].cy_subject }}</font></td></tr></table></div></div></div>            
            *}
            """,
            url=base_url + fileid,
        )

        debug((len(terms["y"]), "items found"))
        debug(terms["y"])

        for k in terms["y"]:
            k["id"] = fileid
            scraperwiki.sqlite.execute(
                """
                INSERT OR REPLACE INTO swdata (id, en_term, en_definition, en_context, cy_term, cy_definition, cy_status, cy_part_of_speech, cy_gender, cy_number, cy_context, cy_subject) values (:id, :en_term, :en_definition, :en_context, :cy_term, :cy_definition, :cy_status, :cy_part_of_speech, :cy_gender, :cy_number, :cy_context, :cy_subject)
            """,
                k,
                verbose=0,
            )
            scraperwiki.sqlite.commit()
            # scraperwiki.sqlite.save(unique_keys=fileid, data=k, table_name="swdata")
    except Exception, e:
        print e
        return
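# The commented-out save() call above hints at a simpler route: scraperwiki's
# save() creates the table and upserts on the given keys, so the hand-written
# INSERT OR REPLACE is not strictly needed. The unique-key choice below is an
# assumption for illustration.
def save_term(term):
    scraperwiki.sqlite.save(unique_keys=["id", "en_term"], data=term, table_name="swdata")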
def parse_section(section):
    section_data = None
    for section_pattern in section_patterns:
        test_section_data = scrape(section_pattern, section)
        if test_section_data is not None:
            section_data = test_section_data
    if section_data is None:
        # print section
        return {}
        # return section
    recheck = False
    try:
        section_data["start"]
    except KeyError:
        pass
    else:
        if " to " in section_data["start"]:
            section_data["start"], section_data["end"] = section_data["start"].split(" to ")
        # TODO section_patterns: Fix the patterns above to avoid doing this hack
        if (
            "end" in section_data
            and section_data["start"].lower().endswith("san")
            and section_data["end"].lower().startswith("lan to ")
        ):
            section_data["start"] = "Santolan"
            section_data["end"] = section_data["end"].lower().replace("lan to ", "")
    if "stat" not in section_data:
        # print section
        return {}
        # return section
    if isinstance(section_data["stat"], list):
        section_data["stat"] = "-".join(section_data["stat"])
    return section_data
def parse_entry(entry):
    updated_at = entry.updated_at
    # Add 8 hours to consider Asia/Manila timezone
    # updated_at = updated_at + datetime.timedelta(0, 8 * 60 * 60)
    now = datetime.datetime.now()
    if updated_at.day > now.day:
        updated_at = updated_at - datetime.timedelta(1)
    text = entry.text
    text = re.sub("%s[, ]?" % entry.road.name, "", text, flags=re.IGNORECASE)
    text = re.sub("http://twitpic.com/[A-Za-z0-9] ?", "", text, flags=re.IGNORECASE)
    data = None
    # Figure out if the data would make sense.
    for main_pattern in main_patterns:
        test_data = scrape(main_pattern, text)
        if test_data is not None:
            data = test_data
            break
    if data is None:
        return
    # Get the time
    # print entry.road, updated_at.strftime('%d-%H:%M'),
    stat_time = data.get("time", None)
    if stat_time:
        if "pm" in stat_time.lower():
            add12 = True
        else:
            add12 = False
        try:
            stat_time = datetime.datetime.strptime(stat_time.replace(" ", ""), "%H:%M%p")
        except KeyError, e:
            stat_time = updated_at
        except ValueError, e:
            # print stat_time.replace(' ', ''), e
            stat_time = updated_at
def getIO(name, urlz):
    ios = scrapemark.scrape("""
        {*

    <td><font>{{ [io].direction }}</font></td>
    <td><font>{{ [io].ft }}</font></td>
    <td><font>{{ [io].substance }}</font></td>
    <td>{{ [io].value }}</font></td>
    <td>{{ [io].min }}</td>
    <td>{{ [io].max }}</td>
    <td>{{ [io].std }}</td>
    <td><font>{{ [io].unit }}</font></td>
    <td><font>{{ [io].environment }}</font></td>
    <td><font>{{ [io].geo }}</font></td>
    </tr>
        *}
        """,
        url=urlz)
    inventorystr = ""
    for flow in ios["io"]:
        if flow["direction"] == "Input" or flow["direction"] == "Output":
            inventorystr = inventorystr + "<eco:hasUnallocatedExchange>"
            inventorystr = inventorystr + '<eco:hasEffect><rdfs:type rdf:resource="eco:' + flow["direction"] + '" /><eco:hasTransferable><eco:Substance><rdfs:label>' + flow["substance"] + "</rdfs:label></eco:Substance></eco:hasTransferable></eco:hasEffect>"
            inventorystr = inventorystr + "<eco:hasQuantity><eco:hasUnitOfMeasure>" + flow["unit"] + "</eco:hasUnitOfMeasure><eco:hasMagnitude>" + flow["value"] + "</eco:hasMagnitude><ecoUD:minValue>" + flow["min"] + "</ecoUD:minValue><ecoUD:maxValue>" + flow["max"] + "</ecoUD:maxValue><ecoUD:standardDeviation95>" + flow["std"] + "</ecoUD:standardDeviation95></eco:hasQuantity>"
            inventorystr = inventorystr + "</eco:hasUnallocatedExchange>"
    return inventorystr
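# String concatenation like the above breaks as soon as a field contains '<',
# '&' or a quote. A hedged alternative that escapes each value before it is
# interpolated; the element names mirror the snippet above, everything else is
# illustrative.
from xml.sax.saxutils import escape

def exchange_xml(flow):
    # escape every field we are about to interpolate into the XML
    values = dict(
        (key, escape(flow.get(key, "")))
        for key in ("direction", "substance", "unit", "value", "min", "max", "std")
    )
    return (
        "<eco:hasUnallocatedExchange>"
        '<eco:hasEffect><rdfs:type rdf:resource="eco:%(direction)s" />'
        "<eco:hasTransferable><eco:Substance><rdfs:label>%(substance)s</rdfs:label>"
        "</eco:Substance></eco:hasTransferable></eco:hasEffect>"
        "<eco:hasQuantity><eco:hasUnitOfMeasure>%(unit)s</eco:hasUnitOfMeasure>"
        "<eco:hasMagnitude>%(value)s</eco:hasMagnitude>"
        "<ecoUD:minValue>%(min)s</ecoUD:minValue>"
        "<ecoUD:maxValue>%(max)s</ecoUD:maxValue>"
        "<ecoUD:standardDeviation95>%(std)s</ecoUD:standardDeviation95>"
        "</eco:hasQuantity>"
        "</eco:hasUnallocatedExchange>"
    ) % values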
def pagefetch(p_url, debug=False):
    html = urllib2.urlopen(p_url).read()
    results = scrapemark.scrape(
        """{*
              <div id="srp">
              <ul id="results">
               {*
               <li>
                <a><img alt="" src={{[thumbs]}}/> </a>
                <div class="result-info">
                    <h3><a href="speaker.php?{{[links]}}">{{[names]}}</a></h3>
                </div>
               </li>
               *}</ul>
               <p class="pagination">
               <a href="results.php?{{[nxurl]}}">Next</a></p>
              </div>
            *}""",
        html,
    )
    if debug:
        print "Fetched Names:", len(results["names"])
        print "Fetched Relinks:", len(results["links"])
        print "Current Page:", p_url
        print "Next Page:", results["nxurl"]
    return results
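# A hedged driver for pagefetch(): keep following the captured "Next" link
# until the pattern stops producing one. The base prefix for results.php is
# not shown in the original, so it is a placeholder argument here.
def crawl_all(first_url, base="http://www.example.com/results.php?"):
    url, all_names = first_url, []
    while url:
        results = pagefetch(url)
        if not results:
            break
        all_names.extend(results.get("names", []))
        nxurl = results.get("nxurl")
        url = base + nxurl if nxurl else None
    return all_names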
def GetPage(fileid):

    try:
        fin = urllib2.urlopen(base_url + fileid)
        text = fin.read()
        fin.close()

        pprint(text)

        # test for no match
        no_match = scrape(
            """
<hr>There are {{ }} that match your search criteria.<br>
            """,
            html=text,
        )
        print no_match
        # TODO: Save no match
        # if no_match == "no entries":

        # basic details:
        basic_details = scrape(
            """
<span class=detailstext>Registration Number: {{ [y].reg_no }}</span><P><span class=detailstext>Date Registered:&nbsp;</span>{{ [y].reg_date }}&nbsp;&nbsp;&nbsp;&nbsp;<span class=detailstext>Registration Expires:&nbsp;</span>{{ [y].reg_expiry }}<br><br><span class=detailstext>Data Controller:&nbsp;</span>{{ [y].data_controller }}<P><div class=detailstext>Address:</div><Blockquote>{{ [y].reg_address|html }}</BlockQuote><hr>
            """,
            html=text,
        )
        print basic_details

        debug((len(basic_details["y"]), "items found"))
        debug(basic_details["y"])

        # foi:
        foi = scrape(
            """
<P ALIGN=center class=detailstext>{{ }} or a Scottish public authority
            """,
            html=text,
        )
        print foi
        # if foi == "Freedom of Information Act 2000":

    # <P class=detailstext>Other Names:</P><BlockQuote>FIRST MONEY DIRECT<br>FIRSTMONEYDIRECT.CO.UK<br></BlockQuote></BlockQuote><hr>

    except Exception, e:
        print e
        return
def get_values_for_station_and_day(station, date):
    datestring = date.strftime("%d.%m.%Y")
    now = datetime.today()
    url = (
        "http://luadb.lds.nrw.de/LUA/wiski/pegel.php?stationsname_n="
        + station
        + "&meindatum="
        + datestring
        + "&tabellet=Tabelle"
    )
    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.open(url)
    assert br.viewing_html()
    data = scrapemark.scrape(
        """
            {*
            <td class='messwerte'>{{ [values].datetime }}</td> 
            <td class='messwerte'>{{ [values].value|float }}&nbsp;</td>
            *}
        """,
        br.response().read(),
    )
    if "values" in data:
        datasets = []
        # print data['values']
        for row in data["values"]:
            # print station, row['datetime'], ("%.2f" % row['value'])
            # datetime string can be "DD.MM HH:MM" or "HH:MM"
            match1 = re.match(r"([0-9]{2})\.([0-9]{2})\s+([0-9]{2}):([0-9]{2})", row["datetime"])
            match2 = re.match(r"([0-9]{2}):([0-9]{2})", row["datetime"])
            year = None
            if match1 is not None:
                day = match1.group(1)
                month = match1.group(2)
                year = now.year
                hour = match1.group(3)
                minute = match1.group(4)
                if now.day == 1 and now.month == 1 and int(day) == 31 and int(month) == 12:
                    year = year - 1
            elif match2 is not None:
                day = date.day
                month = date.month
                year = date.year
                hour = match2.group(1)
                minute = match2.group(2)
            if year is not None:
                mez_timestamp = int(datetime(int(year), int(month), int(day), int(hour), int(minute)).strftime("%s"))
                utc_timestamp = mez_timestamp - 3600
                utcdate = datetime.fromtimestamp(utc_timestamp)
                datasets.append(
                    {
                        "station": station,
                        "datetime_utc": utcdate.strftime("%Y-%m-%d %H:%S"),
                        "value": ("%.2f" % row["value"]),
                    }
                )
        scraperwiki.sqlite.save(unique_keys=["datetime_utc", "station"], data=datasets, table_name="raindata")
        return len(datasets)
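# The strftime("%s") trick above is not portable and silently assumes the
# local timezone is MEZ. The same MEZ (UTC+1, no DST handling, like the
# original) to UTC shift, written explicitly with timedelta:
from datetime import datetime, timedelta

def mez_to_utc(year, month, day, hour, minute):
    return datetime(int(year), int(month), int(day), int(hour), int(minute)) - timedelta(hours=1)

print mez_to_utc(2013, 1, 31, 12, 30).strftime("%Y-%m-%d %H:%M")  # 2013-01-31 11:30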
def main():
    movements = scrape(MOVEMENTS_INDEX, html=get_page(URL))
    print movements
    for m in movements["movements"]:
        if "artcyclopedia.com" in m["link"]:
            movement = scrape(MOVEMENTS_INDIVIDUAL, html=get_page(m["link"]))
            print m["title"]
            if not movement:
                movement = scrape(MOVEMENTS_INDIVIDUAL2, html=get_page(m["link"]))

            relations = []
            for relation in movement["related"]:
                r = dict(movement=m["title"], related_to=relation["topic"])
                if "/artists/" in relation["link"]:
                    r["topic"] = "artist"
                else:
                    r["topic"] = "movement"
                relations.append(r)

            artists = []
            for artist in movement["artists"]:
                artist["movement"] = m["title"]
                dates = artist["alive"].split("-")
                try:
                    artist["birth_year"] = int(dates[0])
                    artist["death_year"] = int(dates[1])
                except ValueError:
                    if "Born" in dates:
                        artist["birth_year"] = int(dates.split()[1])
                        artist["death_year"] = None
                except:
                    print >> sys.stderr, "ERROR: Can't parse dates for %s: %s" % (artist["name"], artist["alive"])
                    artist["birth_year"] = None
                    artist["death_year"] = None
                artist["profile_link"] = URL + artist["profile_link"][3:]
                try:
                    artist["nationality"], artist["profession"] = artist["artist_type"].split(" ", 1)
                except ValueError:
                    artist["nationality"] = artist["artist_type"]
                    artist["profession"] = "unknown"

                artists.append(artist)
            datastore.save(["name"], table_name="movements", data=dict(name=m["title"], link=m["link"]))
            datastore.save(["movement", "related_to"], table_name="relations", data=relations)
            datastore.save(["name", "nationality"], table_name="artists", data=artists)
def parse_list(resp):
    html = BeautifulSoup(resp.body).prettify()

    members = scrape(
        """{* 
            <tr>
                <td>
                    <a href='{{ [res].idlink }}'>{{ [res].name }}</a>
                    {* <strong>({{ [res].ref }})</strong> *}
                </td>
                <td>
                    <font>partido {{ [res].party }}</font>
                </td>
            </tr>
        *}""",
        html=html,
    )["res"]

    # TODO: The president of the chamber may appear only in a footer. Add him
    #       to the members list.

    sel = HtmlXPathSelector(resp)
    trs = sel.select('//tr/td[@align="RIGHT" and @valign="TOP" and @width="5%"]/font/strong/../../..')
    refs = {}

    for tr in trs:
        ref = tr.select('.//strong[starts-with(text(), "(")]/text()')[0].extract()[1:-1]
        refs[ref] = tr

    items = []
    for info in members:
        since = None
        to = None
        line = None
        substitutes_name = None
        substitutes_oid = None
        if "ref" in info and info["ref"] is not None:
            try:
                tr = refs[info["ref"]]
            except KeyError:
                logger.warning("Couldn't find reference %s in substitutes table." % info["ref"], exc_info=sys.exc_info())
                continue
            line = "".join(tr.select(".//td[2]/font/descendant-or-self::*/text()").extract())
            links = tr.select(".//a")
            if links:
                substitutes_oid = extract_id_link(links[0].select(".//@href").extract()[0])[2:]
                substitutes_name = links[0].select(".//text()").extract()[0]
            range = get_substitution_range(line)
            if len(range) > 0:
                try:
                    since = datetime.strptime(range[0], PAGE_DATE_FMT).date()
                except ValueError, e:
                    logger.warning("Unable to parse substitute 'since' date", exc_info=sys.exc_info())
            if len(range) > 1:
                try:
                    to = datetime.strptime(range[1], PAGE_DATE_FMT).date()
                except ValueError, e:
                    logger.warning("Unable to parse substitute 'to' date", exc_info=sys.exc_info())
def scrape_list(url):
    # html = mech_scrape(url)
    p = scrape(EST_PATTERN, url=url)
    print p
    for e in p:
        est_url = "%s%s%d" % (BASE_URL, DETAIL_URL, e)
        print "scraping: " + est_url
        print "scraping id: " + str(e)
        scrape_detail(est_url, e)
    def fetch_load_url(self):
        pattern = """
            <title>{{ pagetitle }}</title>
            """

        result = scrapemark.scrape(pattern, url=self.url)

        self.html_title = result["pagetitle"]
        self.fetched_url = True
def scrape_detail(est_url, id):

    html = scraperwiki.scrape(est_url)
    est_details = scrape(DETAIL_PATTERN, html)

    if not est_details:
        # Try the exempt pattern
        est_details = scrape(EXEMPT_PATTERN, html)

        if not est_details:
            # it's either changed hands and will turn up soon, or it's new
            return
    else:
        # print est_details['inspection_date']
        est_details["inspection_date"] = datetime.strftime(
            datetime.strptime(est_details["inspection_date"], "%d/%m/%Y"), "%Y-%m-%d"
        )
        # parser.parse(est_details['inspection_date'])
        # print est_details['inspection_date']

    # Locate
    # Attempt to find
    sql = 'lat, lng FROM swdata WHERE address = "%s" AND lat IS NOT NULL LIMIT 0,1' % est_details["address"]
    latlng = scraperwiki.sqlite.select(sql)

    # Avoid multiple google lookups
    if latlng:
        # print 'DB Geo'
        # print latlng
        est_details["lat"] = latlng[0]["lat"]
        est_details["lng"] = latlng[0]["lng"]
        # print est_details['lat']
    else:
        # print 'Goog lookup'
        location = locate(est_details["address"] + ", Auckland, NZ")
        if location:
            est_details["lat"], est_details["lng"] = location

    # est_details['fg_id'] = id  # Gah! id aint unique??
    # est_details['url'] = est_url # URLs are useless - the IDs float!!?? WTF!?

    # Save
    scraperwiki.sqlite.save(unique_keys=["name", "address", "grade", "inspection_date"], data=est_details)
    print "saved"
def swift_codes(queue=Q):
    print "Getting countries"
    raw = GET(SWIFT_URL)
    print raw
    countries = scrape(COUNTRY_PATTERN, html=raw, headers=HEADERS)["countries"]
    print countries
    for country in countries:
        print country
        country["link"] = BASE_URL + country["link"]
        queue.push((parse_swift_code_page, country["link"], country["name"]))
def scrapeEpisodes(url):
    return scrapemark.scrape(
        """
        {*
        <td class="summary">"<b>{{ [episode].name }}</b>"</td>
        <span class="bday dtstart published updated">{{ [episode].date }}</span>
        *}
        """,
        url=url,
    )
def getEachRecord(name, urlz):
    #print url
    #html = scraperwiki.scrape(url)
    #soup = BeautifulSoup(html)
    #date = soup.find(text="Date Completed").parent.parent.parent.nextSibling.nextSibling.text
    #print date
    inventory = {}
    temp = scrapemark.scrape("""
        {*
    <th align="left" valign="top" width="257" bgcolor="#DCDEE0"><font style="font-size: 11px;" face="Verdana"><em>Date Completed</em></font></th>
    <td valign="top" width="477" bgcolor="#FFFFFF"><font style="font-size: 11px;">{{ date }}</font>
        *}
        """,
        url=urlz)
    inventory['date'] = temp['date']
    temp = scrapemark.scrape("""
        {*
    <th align="left" valign="top" width="257" bgcolor="#DCDEE0"><font style="font-size: 11px;" face="Verdana"><em>Copyright</em></font></th>
    <td valign="top" width="477" bgcolor="#FFFFFF"><font style="font-size: 11px;">{{ copyright }}</font>
        *}
        """,
        url=urlz)
    inventory['copyright'] = temp['copyright']
    temp = scrapemark.scrape("""
        {*
    <th align="left" valign="top" width="257" bgcolor="#DCDEE0"><font style="font-size: 11px;"><em>Process Type</em></font></th>
    <td valign="top" width="477" bgcolor="#FFFFFF"><font style="font-size: 11px;">{{ desc }}</font>

        *}
        """,
        url=urlz)
    inventory['description'] = temp['desc']
    temp = scrapemark.scrape("""
        {*
    <th align="left" valign="top" width="257" bgcolor="#DCDEE0"><font style="font-size: 11px;"><em>Function</em></font></th>
    <td valign="top" width="477" bgcolor="#FFFFFF"><font style="font-size: 11px;">{{ desc }}</font>

        *}
        """,
        url=urlz)
    inventory['description'] = inventory['description'] + ". " + temp['desc']
    scraperwiki.sqlite.execute("insert into SPINE values (?,?,?,?)", (name,inventory['date'],inventory['description'],inventory['copyright']))
    scraperwiki.sqlite.commit()     
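# The bare INSERT above assumes the SPINE table already exists. A hedged
# one-time setup in the same style; the column names are guesses taken from
# the insert statement, not from the original scraper.
def ensure_spine_table():
    scraperwiki.sqlite.execute(
        "create table if not exists SPINE (name text, date text, description text, copyright text)"
    )
    scraperwiki.sqlite.commit()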
def GetListOfLtt():

    ltt = scrape(
        """
        <table>
        {*
            <td>{{ [y].ltt_id }} withdrawn</td>
        *}
        </table>
        """,
        url=base_url,
    )

    if ltt is not None:
        if "y" in ltt:
            debug((len(ltt["y"]), "items found"))
            debug(ltt["y"])
            for k in ltt["y"]:
                k["ltt_status"] = "WITHDRAWN"
                k["date_scraped"] = ""
                scraperwiki.sqlite.save(unique_keys=["ltt_id"], data=k, table_name="ltt_data")

    ltt = scrape(
        """
        <table>
        {*
            <td><a href='{{ [y].ltt_url|abs }}'>{{ [y].ltt_id }}</a></td>
        *}
        </table>
        """,
        url=base_url,
    )

    if ltt is not None:
        if "y" in ltt:
            debug((len(ltt["y"]), "items found"))
            debug(ltt["y"])
            for k in ltt["y"]:
                k["ltt_status"] = "ACTIVE"
                k["date_scraped"] = ""
                GetLtt(k["ltt_url"])