Example 1
def scrape_user(username):
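    # urldb.get() and the PREFIX constant are defined elsewhere in the module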
    user = {
        'type': 'user',
        'username': username,
    }
    url = "/{username}".format(username=username)
    html = urldb.get(PREFIX + url)
    soup = BeautifulSoup(html, 'html.parser')

    # collect the organizations the user follows
    orgs = []
    for org in soup.find_all('a', {'itemprop': 'follows'}):
        orgs.append(org.attrs['aria-label'])
    user['orgs'] = orgs
    return user
Example 2
def fetch_event_list(url_group):
    url = "{group_url}events/past/?page={page}&__fragment=centerColMeetupList"
    page = 0
    events = set()
    while True:
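        # each page arrives as JSON; element 0 is the rendered HTML fragment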
        r = urldb.get(url.format(page=page, group_url=url_group))
        j = json.loads(r)
        h = BeautifulSoup(j[0], 'html.parser')
        a = h.find_all(class_='event-title')
        print(len(a))  # progress: event links found on this page
        if len(a) == 0:
            break
        for el in a:
            events.add(el.attrs['href'])
        page += 1
    return list(events)
Example 3
def scrape_org_people(orgname):
    members = []
    page = 1
    while True:
        url = "/orgs/{orgname}/people?page={page}".format(orgname=orgname,
                                                          page=page)
        html = urldb.get(PREFIX + url)
        soup = BeautifulSoup(html, 'html.parser')
        prev_len = len(members)
        for member in soup.find_all('div', {'class': 'member-info'}):
            username = member.find('strong').text.strip()
            members.append(username)
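        # a page that adds no new members means we've paginated past the end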
        if len(members) == prev_len:
            break
        page += 1
    return members
Example 4
def fetch_group_members(url_group):
    all_members = set()
    offset = 0
    while True:
        url = "members/?offset={offset}&sort=join_date&desc=0"
        html = urldb.get(url_group + url.format(offset=offset))
        soup = BeautifulSoup(html, "html.parser")
        members = soup.find_all(class_="memberInfo")
        prev_uniq_url = len(all_members)
        for member in members:
            name = member.find(class_="memName")
            all_members.add(name.attrs['href'])
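        # the member list advances 20 profiles per page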
        offset += 20
        print(prev_uniq_url, '=>', len(all_members))
        if len(all_members) <= prev_uniq_url:
            break
    return list(all_members)
Example 5
import urldb
import json
from bs4 import BeautifulSoup

URL = "http://www.tendaysinparis.com/events/list/?action=tribe_list&tribe_paged={page}&tribe_event_display=list"

ALL = []

i = 0
while True:
    i += 1
    html = urldb.get(URL.format(page=i))
    soup = BeautifulSoup(html, 'html.parser')
    vevent = soup.find_all(class_='vevent')
    for event in vevent:
        infos = {}
        a = event.find(class_='url')
        infos['url'] = a.attrs['href']
        infos['title'] = a.text.strip()
        infos['date-start'] = event.find('span',
                {'class':'value-title'}).attrs['title']
        date_end = event.find('span',
                {'class':'date-end dtend'})
        if date_end:
            date_end = date_end.find('span')
            infos['date-end'] = date_end.attrs['title']
        infos['organizer'] = event.find(class_='author fn org').text.strip()
        loc = event.find(class_='tribe-events-address')
        if loc:
            infos['location'] = loc.text.strip()
        img = event.find('img')
        if img:
            infos['img'] = img.attrs['src']
        ALL.append(infos)
    if len(vevent) == 0:  # no events on this page: stop paginating
        break
Example 6
import urldb
import json
from bs4 import BeautifulSoup

groups = json.load(open('data/raw.live.json'))

url = "{group_url}events/past/?page={page}&__fragment=centerColMeetupList"

group_events = {}

def save():
    with open("data/events.live.json",'w') as f:
        json.dump(group_events, f, indent=2)

for group in groups:
    page = 0
    events = []
    while True:
        r = urldb.get(url.format(page=page, group_url=group['url']))
        j = json.loads(r)
        h = BeautifulSoup(j[0], 'html.parser')
        a = h.find_all(class_='event-title')
        print(len(a))
        if len(a) == 0:
            break
        for el in a:
            infos = {
                'url': el.attrs['href'],
                'title': el.text.strip()
            }
            events.append(infos)
        page += 1
    group_events[group['url']] = events
    save()  # checkpoint after every group so a crash loses little work
Example 7
import urldb
import json
from pprint import pprint as pp
from bs4 import BeautifulSoup

links = sorted(set(x['link'] for x in json.load(open('data/pages.json'))))

print(len(links), 'links')

ALL = []

for link in links:
    print(link)
    infos = {'url': link}
    html = urldb.get(link)
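    # urldb.get returns None for pages that could not be fetched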
    if html is None:
        continue
    e = BeautifulSoup(html, 'lxml')
    metas = e.find_all('meta')
    for el in metas:
        if 'property' in el.attrs and 'blabla' in el.attrs['property']:
            content = el.attrs['content'].strip()
            attr_name = el.attrs['property'].split(':', 1)[1].replace(':', '_')
            infos[attr_name] = content
    #pp(infos)
    ALL.append(infos)
    with open('data/details.json', 'w') as f:
        json.dump(ALL, f, indent=2)
    print(len(ALL))
Example 8
import urldb
import json

URL = "http://paris.demosphere.eu/events.ics"

ALL = []

ics = urldb.get(URL)

# naive line-oriented scan of the iCalendar feed: keep only URL properties
for line in ics.split('\n'):
    if line.startswith('URL'):
        infos = {}
        infos['url'] = line.split(':',1)[1].strip()
        ALL.append(infos)

with open('data/index.json', 'w') as f:
    json.dump(ALL, f, indent=2)

print(len(ALL))
Example 9
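# Fragment of a crawler driver. EVENT_PAGE, MEMBER_PAGE, GROUP_PAGE and the
# something_to_explore / parse_* / add_*_explored / explored_* helpers are
# defined elsewhere in the script.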

def detect_type(url):
    if '/events/' in url:
        return EVENT_PAGE
    if '/members/' in url:
        return MEMBER_PAGE
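    # anything that is neither an event nor a member URL counts as a group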
    return GROUP_PAGE


if __name__ == "__main__":
    while True:
        url = something_to_explore()
        if url is None:
            break
        try:
            html = urldb.get(url)
        except Exception as e:
            print('error:', e)
            continue
        page_type = detect_type(url)
        if page_type == EVENT_PAGE:
            event = parse_event(html, url)
            add_event_explored(event)
        elif page_type == GROUP_PAGE:
            group = parse_group(html, url)
            add_group_explored(group)

        print()
        print()
        print(len(explored_groups()), 'groups', len(explored_events()),
              'events')
Example 10
import urldb
import json
from bs4 import BeautifulSoup
import itertools

events = json.load(open("data/index.json"))

for event in events:
    print(event["url"])
    html = urldb.get(event["url"])
    e = BeautifulSoup(html, 'html.parser')
    elements = itertools.chain(e.find_all("meta"), e.find_all("span"),
                               e.find_all("a"), e.find_all("div"))
    for el in elements:
        if "itemprop" in el.attrs:
            if el.attrs["itemprop"] not in ("url", "address", "geo"):
                content = el.text.strip()
                if "content" in el.attrs:
                    content = el.attrs["content"]
                attr_name = el.attrs["itemprop"]
                if attr_name == "name":
                    attr_name = "location"
                    if attr_name in event:
                        attr_name = "title"
                event[attr_name] = content
    print(event["title"])

json.dump(events, open("data/details.json", "w"), indent=2)
Example 11
import urldb, json
from bs4 import BeautifulSoup
from pprint import pprint as pp

url = "http://www.meetup.com/find/events/?pageToken=225320454%7C2015-09-19T04%3A00%3A00.000-04%3A00%7C{offset}%7CFORWARD&allMeetups=true&keywords=&radius=Infinity&userFreeform=Palo+Alto%2C+CA&mcId=z94303&mcName=Palo+Alto%2C+CA&sort=recommended&eventFilter=all&__fragment=simple_search&op=&on_home=true"

DATA = []

def save():
    with open("data/events_coming.json",'w') as f:
        json.dump(DATA, f, indent=2)

offset = 0
while True:
    html = json.loads(urldb.get(url.format(offset=offset)))[0]
    soup = BeautifulSoup(html, 'html.parser')
    events = soup.find_all(class_='event-listing')
    print(offset, ':', len(events))
    for e in events:
        infos = {}
        event_link = e.find(class_='event-title')
        infos['title'] = event_link.text.strip()
        infos['url'] = event_link.attrs['href']
        infos['attendees_count'] = int(
            e.find(class_='attendee-count').text.strip().split()[0])
        group_link = e.find(class_='chapter-name')
        infos['group_name'] = group_link.text.strip()
        infos['group_url'] = group_link.attrs['href']
        infos['start'] = e.find('time').attrs['datetime']
        DATA.append(infos)
    if len(events) == 0:
        break
    # the original never advanced the token; assume the offset counts events
    offset += len(events)

save()
Example 12
import urldb
import json
from bs4 import BeautifulSoup

BASE_URL = "http://quefaire.paris.fr"
URL = BASE_URL+"/all/0/{count}"

ALL = []

i = 0
while True:
    html = urldb.get(URL.format(count=i))
    soup = BeautifulSoup(html, 'html.parser')
    events = soup.find_all(class_='result-section')
    for e in events:
        infos = {}
        link = e.find('h1')
        if link:
            link = link.find('a')
            infos['url'] = link.attrs['href']
            infos['title'] = link.text.strip()
            img = e.find('img')
            if img:
                infos['img'] = img.attrs['src']
            infos['description'] = e.find(class_='first-intro').text.strip()
            price = e.find(class_='prix')
            if price:
                infos['price'] = price.text.strip()
            ALL.append(infos)
    if len(events) == 0:  # an empty page means we've run past the listing
        break
    # assumed: the last URL segment is an offset, so advance it by page size
    i += len(events)
Example 13
import urldb
import json
from pprint import pprint as pp
from bs4 import BeautifulSoup

BASE_URL = "https://www.blablacar.co.uk"
URL = BASE_URL + "/search_xhr?fn=Paris&fc=48.856614%7C2.352222&fcc=FR&tn=&sort=trip_date&order=asc&limit=100&page={page}&_=1439842190324"

ALL = []

i = 0
while True:
    i += 1
    resp = urldb.get(URL.format(page=i))
    html = json.loads(resp)['html']['results']
    soup = BeautifulSoup(html, 'lxml')
    elements = soup.find_all(class_='trip')
    for e in elements:
        infos = {}
        infos['link'] = BASE_URL + e.find('a').attrs['href']
        pp(infos)
        ALL.append(infos)

    print(len(elements))

    with open('data/pages.json', 'w') as f:
        json.dump(ALL, f, indent=2)

    if len(elements) < 10:  # far fewer results than requested: last page
        break
Example 14
import urldb, json
from bs4 import BeautifulSoup

bootstrap = "http://google.meetup.com/all/"
html = urldb.get(bootstrap)
soup = BeautifulSoup(html, "html.parser")

DATA = []

def save():
    with open("data/raw.live.json",'w') as f:
        json.dump(DATA, f, indent=2)

def parse_group(html):
    interesting_keys = ('description', 'locality', 'country-name',
                        'region', 'postal-code', 'latitude', 'longitude',
                        'image', 'placename')
    meetup_infos = {}
    soup = BeautifulSoup(html, "html.parser")
    metas = soup.find_all('meta')
    print('\n'.join(map(str, metas)))  # debug: dump every meta tag

    for meta in metas:
        # the data lives in either a name= or a property= attribute
        keyname = 'name' if 'name' in meta.attrs else 'property'
        if keyname in meta.attrs:
            key = meta.attrs[keyname]
            key = key.replace('og:', '').replace('geo.', '')
            if key in interesting_keys:
                value = meta.attrs['content']
                if 'itude' in key:  # latitude/longitude arrive as strings
                    value = float(value)
                meetup_infos[key] = value
    return meetup_infos