def login(username, password):
    """Log in and return the session cookie string needed by later requests."""
    _, headers = scraper.fetch(LOGIN_FORM, data={
        'UserName': username,
        'Password': password,
        'SUBMIT1': 'Login',
    })
    if 'set-cookie' not in headers:
        raise RuntimeError("login failed: no session cookie in response")
    cookies = headers['set-cookie']
    # Hit the confirmation page so the session becomes valid server-side.
    scraper.fetch(LOGIN_CONFIRM, headers={'Cookie': cookies})
    return cookies

def scrape_register(cookies, module, year_id):
    """Fetch the class register for a module and return its title and students."""
    page, _ = scraper.fetch(
        "%s?%s" % (SEARCH_URL, urllib.urlencode({
            'form_id': 3,
            'exclude': '',
            'year_id': year_id,
            'mnem': module,
        })),
        headers={'Cookie': cookies},
    )
    doc = scraper.parse(page)
    title = doc.find(scraper.path("h4", "a")).text
    for table in doc.findall(scraper.path("table")):
        # The register is the only table carrying a 'bordercolor' attribute.
        if 'bordercolor' in table.keys():
            headings = [
                t.text for t in table.findall(scraper.path("th", "font", "b"))
            ]
            if headings != ['Name', 'Category', 'Course', 'Misc']:
                raise ValueError("unexpected register headings: %r" % (headings,))
            students = [
                dict(zip(headings[:-1],
                         (c.text.strip()
                          for c in row.findall(scraper.path("td", "font"))
                          if c.text)))
                for row in table.findall(scraper.path("tr"))[1:]
            ]
            return {'title': title, 'students': students}

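## Hedged usage sketch (not from the original source): the username, password
## and module mnemonic below are illustrative placeholders; the year_id '000110'
## follows the value used elsewhere in this code.
def example_dump_register():
    cookies = login('s0123456', 'hunter2')
    register = scrape_register(cookies, 'inf1op', '000110')
    print register['title']
    for student in register['students']:
        print student['Name'], student['Category'], student['Course']
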
def derp():
    """Handle incoming Slack event callbacks."""
    slack_event = json.loads(request.data)

    # Slack's URL verification handshake: echo the challenge back.
    if "challenge" in slack_event:
        return make_response(slack_event["challenge"], 200,
                             {"Content-Type": "application/json"})

    # Reject requests that don't carry our verification token.
    if verification_token != slack_event.get("token"):
        message = "Invalid Slack verification token: %s\npyBot has: %s\n\n" % (
            slack_event["token"], verification_token)
        return make_response(message, 403, {"X-Slack-No-Retry": 1})

    if "event" in slack_event:
        event = slack_event["event"]
        # "!last N" posts the N most recent entries (default 1).
        if "text" in event and event["text"].startswith("!last"):
            number = 1
            split = event["text"].split(" ")
            if len(split) > 1:
                number = int(split[1])
            last = list(fetch())[:number]
            for message in entries_to_messages(last):
                send_message(message)
        return make_response("OK", 200, {})

    return make_response("[NO EVENT IN SLACK REQUEST] These are not the droids "
                         "you're looking for.", 404, {"X-Slack-No-Retry": 1})

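## Hedged wiring sketch (assumption, not from the original source): derp() reads
## flask.request directly, so it presumably backs a Flask view on the app defined
## in app.py. The route path "/slack/events" is an illustrative placeholder.
# @app.route("/slack/events", methods=["POST"])
# def slack_events():
#     return derp()
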
def on_add_node(msg):
    # Fetch the node's metadata in a greenlet and wait for it to finish.
    g = gevent.spawn(scraper.fetch, msg)
    g.join()
    node = g.value

    graph.add_node(node['id'])
    emit('add_node', {'id': node['id'],
                      'abstract': node['abstract'],
                      'authors': node['authors'],
                      'date': node['date'],
                      'title': node['title']})

    # Outgoing edges: papers this node cites.
    for c in node['citations']:
        gevent.sleep(0)  # yield so other greenlets can run
        graph.add_edge(node['id'], c)
        if c[:4] == 'doi:':
            cinfo = scraper.fetch(c[4:])
            emit('add_node', {'id': cinfo['id'],
                              'abstract': cinfo['abstract'],
                              'authors': cinfo['authors'],
                              'date': cinfo['date'],
                              'title': cinfo['title']})
        else:
            emit('add_node', {'id': c})
        emit('add_link', {'source': node['id'], 'target': c, 'value': 1})

    # Incoming edges: papers that cite this node.
    for c in node['cited by']:
        gevent.sleep(0)
        graph.add_edge(c, node['id'])
        if c[:4] == 'doi:':
            cinfo = scraper.fetch(c[4:])
            emit('add_node', {'id': cinfo['id'],
                              'abstract': cinfo['abstract'],
                              'authors': cinfo['authors'],
                              'date': cinfo['date'],
                              'title': cinfo['title']})
        else:
            emit('add_node', {'id': c})
        emit('add_link', {'source': c, 'target': node['id'], 'value': 1})

if not (dump_ascii or dump_json):
    dump_ascii = True

if "".join(map(lambda s: s.lower(), args)) in Courses:
    courses = "%0D%0A".join(map(urllib.quote_plus, Courses[args[0]]))
elif specify_courses:
    courses = "%0D%0A".join(map(urllib.quote_plus, args))

if courses:
    url = "%s;%s" % (TT_URL, COURSES_URL % {"courses": courses})
else:
    modules = "%0D%0A".join(args)
    url = "%s;%s" % (TT_URL, MODULES_URL % {"modules": modules})

if not (courses or modules):
    die_with_usage("", 1)

modules = scrape_timetable(scraper.parse(scraper.fetch(url)[0]))

if module_detail:
    for m in modules:
        data = {'year_id': '000110', 'mnem': m['code']}
        page, hdrs = scraper.fetch(MODULE_DETAIL_URL, data)
        m['detail'] = scrape_module_details(scraper.parse(page))

## dump scraped data
if dump_json:
    print json.dumps(modules)
elif dump_ascii:
    for module in modules:
        print "\x1b[0;1m%s\x1b[0m" % module['code'], "--", module['title']
        for (act, data) in sorted(module['acts'].items()):
            print "\t%-13s" % (act,), \
def test_fetch(self):
    soup = scraper.fetch('https://instagram.com/jawkneelin', False)
    self.assertEqual(type(soup), BeautifulSoup)

def post_new_entries():
    """Re-fetch the feed and post any entries that were not seen last time."""
    global new_entries
    existing_entries = new_entries[:]
    new_entries = fetch()
    for msg in get_new_entries(existing_entries, new_entries):
        send_message(msg)

import json

from flask import Flask, request, make_response
from flask_apscheduler import APScheduler

from client import send_message, verification_token
from scraper import fetch, get_new_entries, entries_to_messages

app = Flask(__name__)
new_entries = fetch()


class Config(object):
    JOBS = [
        {
            'id': 'example',
            'func': 'app:post_new_entries',
            'trigger': 'cron',
            'second': '0',
        }
    ]
    SCHEDULER_API_ENABLED = True


def post_new_entries():
    """Re-fetch the feed and post any entries that were not seen last time."""
    global new_entries
    existing_entries = new_entries[:]
    new_entries = fetch()
    for msg in get_new_entries(existing_entries, new_entries):
        send_message(msg)

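## Hedged wiring sketch (assumption, not from the original source): the snippet
## imports APScheduler and defines Config but never starts the scheduler, so
## something like the standard flask_apscheduler setup below is presumably
## needed for the cron job to fire.
app.config.from_object(Config())
scheduler = APScheduler()
scheduler.init_app(app)
scheduler.start()

if __name__ == '__main__':
    app.run()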