import zillabyte
from bs4 import BeautifulSoup

def execute_find_links(controller, tup):
    # Setup assumed here; the original excerpt starts at the BeautifulSoup call.
    # Pull the domain from the incoming tuple and fetch its homepage.
    domain = tup["domain"]
    page = open_url(domain)
    if page is None:
        return
    # Emit every link on the homepage that stays on the same domain.
    soup = BeautifulSoup(page)
    links = soup.findAll('a', href=True)
    same_domain_links = filter(lambda link: domain in str(link["href"]), links)
    for link in same_domain_links:
        controller.emit({"domain": tup["domain"], "url": link["href"]})
    return

def execute_crawl(controller, tup):
    # Fetch the page behind each link and pass its HTML downstream.
    url = tup["url"]
    page = open_url(url)
    if page is not None:
        controller.emit({"domain": tup["domain"], "url": url, "html": page})
    return

app = zillabyte.app(name="python_crawler")

# Create a stream from all the domains we have
domains = app.source(matches="select * from domains")

# For each homepage of the domain, fetch all the links
inner_links = domains.each(execute=execute_find_links)

# For each link, fetch the page
first_level_pages = inner_links.each(execute=execute_crawl)

# Finally, save these pages
first_level_pages.sink(name="domain_pages",
                       columns=[{"domain": "string"}, {"url": "string"}, {"html": "string"}])
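Both execute steps call an open_url helper that isn't shown in this excerpt. A minimal sketch of it, using Python 2's urllib2 to match the rest of the post and assuming it just fetches the page and returns the HTML (or None on any failure):

import urllib2

def open_url(url):
    # Hypothetical helper, not part of the original snippet: fetch a page and
    # return its HTML, or None if anything goes wrong.
    if not url.startswith("http"):
        url = "http://" + url  # bare domains from the source table need a scheme
    try:
        return urllib2.urlopen(url, timeout=10).read()
    except Exception:
        return None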
import zillabyte

# This is run after all tuples have been received for the cycle.
# We emit the "domain" and the number of times it was seen.
def domain_count_end_group(controller):
    global domain_word
    global domain_count
    controller.emit({"domain": domain_word, "count": domain_count})

# This is the heart of your algorithm. It's processed on every
# web page. This algorithm is run in parallel on possibly hundreds
# of machines. (`domains` is the list of domain names to look for,
# defined earlier in the post.)
def domain_count(controller, tup):
    for domain in domains:
        if domain in tup["html"]:
            controller.emit({"domain": domain})

app = zillabyte.app(name="hello_world")
app.source(matches="sample_homepages") \
   .each(execute=domain_count) \
   .group_by(name="domain_count",
             fields=["domain"],
             begin_group=domain_count_begin_group,
             aggregate=domain_count_aggregate_group,
             end_group=domain_count_end_group) \
   .sink(name="domain_names", columns=[{"domain": "string"}, {"count": "integer"}])
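The group_by step also references domain_count_begin_group and domain_count_aggregate_group, which aren't reproduced in this excerpt. A hedged sketch of what they might look like, assuming they take the same (controller, tup) arguments as the other functions and simply maintain the two globals that the end_group function emits (note that the domain_count counter shares its name with the domain_count function above):

# Called when a new group of tuples sharing the same "domain" begins:
# remember the domain and reset its counter.
def domain_count_begin_group(controller, tup):
    global domain_word
    global domain_count
    domain_word = tup["domain"]
    domain_count = 0

# Called for every tuple in the group: bump the counter.
def domain_count_aggregate_group(controller, tup):
    global domain_count
    domain_count += 1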
import zillabyte

def buildGraph(controller, tup):
    # Extract song url and artist name.
    song = tup["song"]
    # Artist name used only for debugging purposes.
    artist = tup["artist"]
    #print song, artist
    # Make a request to rap.genius.com to get and emit artist, feature, and
    # producer info. setSong is defined earlier in the original post.
    songData = setSong(song)
    controller.emit({"song": song,
                     "artist": songData[0],
                     "featuredArtists": songData[1],
                     "producers": songData[2]})

def nt(controller):
    # Function to handle the custom source from my seed list of artists.
    with open("rapperlist.csv") as rl:
        for line in rl:
            controller.emit({"artist": line.strip()})  # strip the trailing newline
    controller.end_cycle()

# Initialize the app, use the custom source and two Each steps with previously
# defined functions, and a sink.
app = zillabyte.app(name="pygenius")
app.source(name="raplist", next_tuple=nt, end_cycle_policy="explicit")\
   .each(execute=getsongs)\
   .each(execute=buildGraph)\
   .sink(name="rapsink", columns=[{"song": "string"},
                                  {"artist": "string"},
                                  {"featuredArtists": "array"},
                                  {"producers": "array"}])
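The getsongs step (and the setSong helper buildGraph calls) is defined earlier in the original post and isn't reproduced here. Purely to illustrate the data flow, a hypothetical stub of getsongs might look like the following; fetch_song_urls_for stands in for the author's rap.genius.com scraping and is not a real API:

def fetch_song_urls_for(artist):
    # Placeholder for the author's Genius scraping; returns an empty list so
    # the stub stays runnable without network access.
    return []

def getsongs(controller, tup):
    # Each emitted tuple must carry "song" and "artist" so that buildGraph,
    # the next step in the pipeline, can consume it.
    artist = tup["artist"]
    for song_url in fetch_song_urls_for(artist):
        controller.emit({"song": song_url, "artist": artist})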