def scrape_user(username):
    user = {
        'type': 'user',
        'username': username,
    }
    url = "/{username}".format(username=username)
    html = urldb.get(PREFIX + url)
    soup = BeautifulSoup(html)
    # user orgs
    orgs = []
    for org in soup.find_all('a', {'itemprop': 'follows'}):
        orgs.append(org.attrs['aria-label'])
    user['orgs'] = orgs
    return user
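# A minimal usage sketch for scrape_user, assuming urldb, PREFIX and
# BeautifulSoup are already set up as in the surrounding scripts.
# The username below is a made-up example, not from the source.
profile = scrape_user('some-user')
print(profile['username'], '->', profile['orgs'])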
def fetch_event_list(url_group):
    url = "{group_url}events/past/?page={page}&__fragment=centerColMeetupList"
    page = 0
    events = set()
    while True:
        r = urldb.get(url.format(page=page, group_url=url_group))
        j = json.loads(r)
        h = BeautifulSoup(j[0], 'html.parser')
        a = h.find_all(class_='event-title')
        print(len(a))
        if len(a) == 0:
            break
        for el in a:
            events.add(el.attrs['href'])
        page += 1
    return list(events)
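# Usage sketch: fetch_event_list appends the past-events fragment path
# directly to the group URL, so the argument should end with a slash.
# The group URL below is hypothetical.
past = fetch_event_list('http://www.meetup.com/Some-Group/')
print(len(past), 'past event URLs')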
def scrape_org_people(orgname): members = [] page = 1 while True: url = "/orgs/{orgname}/people?page={page}".format( orgname=orgname, page=page) html = urldb.get(PREFIX+url) soup = BeautifulSoup(html) prev_len = len(members) for member in soup.find_all('div',{'class':'member-info'}): username = member.find('strong').text.strip() members.append(username) if len(members) == prev_len: break page += 1 return members
def scrape_org_people(orgname): members = [] page = 1 while True: url = "/orgs/{orgname}/people?page={page}".format(orgname=orgname, page=page) html = urldb.get(PREFIX + url) soup = BeautifulSoup(html) prev_len = len(members) for member in soup.find_all('div', {'class': 'member-info'}): username = member.find('strong').text.strip() members.append(username) if len(members) == prev_len: break page += 1 return members
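# Sketch: the two helpers compose naturally; scrape_org_people lists
# usernames, scrape_user fills in each profile. The org name is hypothetical.
members = scrape_org_people('example-org')
users = [scrape_user(name) for name in members]
print(len(users), 'profiles collected')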
def fetch_group_members(url_group):
    all_members = set()
    offset = 0
    while True:
        url = "members/?offset={offset}&sort=join_date&desc=0"
        html = urldb.get(url_group + url.format(offset=offset))
        soup = BeautifulSoup(html, "html.parser")
        members = soup.find_all(class_="memberInfo")
        prev_uniq_url = len(all_members)
        for member in members:
            infos = {}
            name = member.find(class_="memName")
            #infos['name'] = name.text.strip()
            all_members.add(name.attrs['href'])
        offset += 20
        print(prev_uniq_url, '=>', len(all_members))
        # stop once a page adds no new member URLs
        if len(all_members) <= prev_uniq_url:
            break
    return list(all_members)
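# Sketch: like fetch_event_list, fetch_group_members appends its members/
# path to the group URL, so the argument should end with a slash.
# The group URL below is a made-up example.
member_urls = fetch_group_members('http://www.meetup.com/Some-Group/')
print(len(member_urls), 'member profile URLs')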
import urldb
import json
from bs4 import BeautifulSoup

URL = "http://www.tendaysinparis.com/events/list/?action=tribe_list&tribe_paged={page}&tribe_event_display=list"

ALL = []
i = 0
while True:
    i += 1
    html = urldb.get(URL.format(page=i))
    soup = BeautifulSoup(html)
    vevent = soup.find_all(class_='vevent')
    for event in vevent:
        infos = {}
        a = event.find(class_='url')
        infos['url'] = a.attrs['href']
        infos['title'] = a.text.strip()
        infos['date-start'] = event.find('span', {'class': 'value-title'}).attrs['title']
        date_end = event.find('span', {'class': 'date-end dtend'})
        if date_end:
            date_end = date_end.find('span')
            infos['date-end'] = date_end.attrs['title']
        infos['organizer'] = event.find(class_='author fn org').text.strip()
        loc = event.find(class_='tribe-events-address')
        if loc:
            infos['location'] = loc.text.strip()
        img = event.find('img')
import urldb
import json
from bs4 import BeautifulSoup

groups = json.load(open('data/raw.live.json'))
url = "{group_url}events/past/?page={page}&__fragment=centerColMeetupList"
group_events = {}

def save():
    with open("data/events.live.json", 'w') as f:
        json.dump(group_events, f, indent=2)

for group in groups:
    page = 0
    events = []
    while True:
        r = urldb.get(url.format(page=page, group_url=group['url']))
        j = json.loads(r)
        h = BeautifulSoup(j[0], 'html.parser')
        a = h.find_all(class_='event-title')
        print(len(a))
        if len(a) == 0:
            break
        for el in a:
            infos = {
                'url': el.attrs['href'],
                'title': el.text.strip()
            }
            events.append(infos)
        page += 1
    group_events[group['url']] = events
    save()
import urldb
import json
from pprint import pprint as pp
from bs4 import BeautifulSoup

links = sorted(set(x['link'] for x in json.load(open('data/pages.json'))))
print(len(links), 'links')

ALL = []
for link in links:
    print(link)
    infos = {'url': link}
    html = urldb.get(link)
    if html is None:
        continue
    e = BeautifulSoup(html, 'lxml')
    metas = e.find_all('meta')
    for el in metas:
        if 'property' in el.attrs and 'blabla' in el.attrs['property']:
            content = el.attrs['content'].strip()
            attr_name = el.attrs['property'].split(':', 1)[1].replace(':', '_')
            infos[attr_name] = content
    #pp(infos)
    ALL.append(infos)

json.dump(ALL, open('data/details.json', 'w'), indent=2)
print(len(ALL))
import urldb
import json
from pprint import pprint as pp
import itertools

URL = "http://paris.demosphere.eu/events.ics"

ALL = []
ics = urldb.get(URL)
for line in ics.split('\n'):
    if line.startswith('URL'):
        infos = {}
        infos['url'] = line.split(':', 1)[1].strip()
        ALL.append(infos)

json.dump(ALL, open('data/index.json', 'w'), indent=2)
print(len(ALL))
def detect_type(url):
    if '/events/' in url:
        return EVENT_PAGE
    if '/members/' in url:
        return MEMBER_PAGE
    return GROUP_PAGE

if __name__ == "__main__":
    while True:
        url = something_to_explore()
        if url is None:
            break
        try:
            html = urldb.get(url)
        except Exception as e:
            print('error:', e)
            continue
        page_type = detect_type(url)
        if page_type == EVENT_PAGE:
            event = parse_event(html, url)
            add_event_explored(event)
        if page_type == GROUP_PAGE:
            group = parse_group(html, url)
            add_group_explored(group)
        print()
        print()
        print(len(explored_groups()), 'groups', len(explored_events()), 'events')
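# Sketch of the crawl frontier helper used above. The source only shows its
# final line, return random.choice(list(to_explore)); the empty-frontier
# handling and the to_explore set itself are assumptions, chosen so that the
# main loop's "if url is None: break" check works.
import random

to_explore = set()   # URLs discovered but not yet crawled (assumed)

def something_to_explore():
    if not to_explore:
        return None
    return random.choice(list(to_explore))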
import urldb
import json
from pprint import pprint as pp
from bs4 import BeautifulSoup
import itertools

events = json.load(open("data/index.json"))
for event in events:
    print(event["url"])
    html = urldb.get(event["url"])
    e = BeautifulSoup(html)
    elements = itertools.chain(e.find_all("meta"), e.find_all("span"),
                               e.find_all("a"), e.find_all("div"))
    for el in elements:
        if "itemprop" in el.attrs:
            if el.attrs["itemprop"] not in ("url", "address", "geo"):
                content = el.text.strip()
                if "content" in el.attrs:
                    content = el.attrs["content"]
                attr_name = el.attrs["itemprop"]
                if attr_name == "name":
                    attr_name = "location"
                    if attr_name in event:
                        attr_name = "title"
                event[attr_name] = content
    print(event["title"])

json.dump(events, open("data/details.json", "w"), indent=2)
import urldb
import json
from bs4 import BeautifulSoup
from pprint import pprint as pp

url = "http://www.meetup.com/find/events/?pageToken=225320454%7C2015-09-19T04%3A00%3A00.000-04%3A00%7C{offset}%7CFORWARD&allMeetups=true&keywords=&radius=Infinity&userFreeform=Palo+Alto%2C+CA&mcId=z94303&mcName=Palo+Alto%2C+CA&sort=recommended&eventFilter=all&__fragment=simple_search&op=&on_home=true"

DATA = []

def save():
    with open("data/events_coming.json", 'w') as f:
        json.dump(DATA, f, indent=2)

offset = 0
while True:
    html = json.loads(urldb.get(url.format(offset=offset)))[0]
    soup = BeautifulSoup(html, 'html.parser')
    events = soup.find_all(class_='event-listing')
    print(offset, ':', len(events))
    for e in events:
        infos = {}
        event_link = e.find(class_='event-title')
        infos['title'] = event_link.text.strip()
        infos['url'] = event_link.attrs['href']
        infos['attendees_count'] = int(e.find(class_='attendee-count').text.strip().split()[0])
        group_link = e.find(class_='chapter-name')
        infos['group_name'] = group_link.text.strip()
        infos['group_url'] = group_link.attrs['href']
        infos['start'] = e.find('time').attrs['datetime']
        DATA.append(infos)
    if len(events) == 0:
        break
    # advance pagination; stepping by the number of results on the page is an
    # assumption, the original increment is not shown
    offset += len(events)

save()
import urldb
import json
from pprint import pprint as pp
from bs4 import BeautifulSoup
import itertools

BASE_URL = "http://quefaire.paris.fr"
URL = BASE_URL + "/all/0/{count}"

ALL = []
i = 0
while True:
    html = urldb.get(URL.format(count=i))
    soup = BeautifulSoup(html)
    events = soup.find_all(class_='result-section')
    for e in events:
        infos = {}
        link = e.find('h1')
        if link:
            link = link.find('a')
            infos['url'] = link.attrs['href']
            infos['title'] = link.text.strip()
        img = e.find('img')
        if img:
            infos['img'] = img.attrs['src']
        infos['description'] = e.find(class_='first-intro').text.strip()
        price = e.find(class_='prix')
        if price:
            infos['price'] = price.text.strip()
import urldb
import json
from pprint import pprint as pp
from bs4 import BeautifulSoup
import itertools

BASE_URL = "https://www.blablacar.co.uk"
URL = BASE_URL + "/search_xhr?fn=Paris&fc=48.856614%7C2.352222&fcc=FR&tn=&sort=trip_date&order=asc&limit=100&page={page}&_=1439842190324"

ALL = []
i = 0
while True:
    i += 1
    resp = urldb.get(URL.format(page=i))
    html = json.loads(resp)['html']['results']
    soup = BeautifulSoup(html, 'lxml')
    elements = soup.find_all(class_='trip')
    for e in elements:
        infos = {}
        infos['link'] = BASE_URL + e.find('a').attrs['href']
        pp(infos)
        ALL.append(infos)
    print(len(elements))
    json.dump(ALL, open('data/pages.json', 'w'), indent=2)
    if len(elements) < 10:
        break
import urldb
import json
from bs4 import BeautifulSoup

bootstrap = "http://google.meetup.com/all/"
html = urldb.get(bootstrap)
soup = BeautifulSoup(html, "html.parser")

DATA = []

def save():
    with open("data/raw.live.json", 'w') as f:
        json.dump(DATA, f, indent=2)

def parse_group(html):
    interesting_keys = ('description', 'locality', 'country-name',
                        'region', 'postal-code', 'latitude', 'longitude',
                        'image', 'placename')
    meetup_infos = {}
    soup = BeautifulSoup(html, "html.parser")
    metas = soup.find_all('meta')
    print('\n'.join(list(map(str, metas))))
    for meta in metas:
        keyname = 'name' if 'name' in meta.attrs else 'property'
        if keyname in meta.attrs:
            key = meta.attrs[keyname]
            key = key.replace('og:', '').replace('geo.', '')
            if key in interesting_keys:
                value = meta.attrs['content']
                if 'itude' in key:
                    value = float(value)
                meetup_infos[key] = value