Example #1
from bs4 import BeautifulSoup
import zillabyte

def execute_find_links(controller, tup):
  # Find every link on the domain's homepage that points back to the same domain.
  domain = tup["domain"]
  page = open_url("http://" + domain)  # assumption: the homepage URL is built from the domain name

  if page is not None:
    soup = BeautifulSoup(page)
    links = soup.findAll('a', href=True)
    same_domain_links = filter(lambda link: domain in str(link["href"]), links)

    for link in same_domain_links:
      controller.emit({"domain":tup["domain"], "url":link["href"]})
  return

def execute_crawl(controller, tup):
  url = tup["url"]
  page = open_url(url)

  if page is not None:
    controller.emit({"domain":tup["domain"], "url":url, "html":page})
  return

app = zillabyte.app(name="python_crawler")

#Create a stream from all the domains we have
domains           = app.source(matches="select * from domains")

# For each homepage of the domain, fetch all the links
inner_links       = domains.each(execute=execute_find_links)

# For each link, fetch the page
first_level_pages = inner_links.each(execute=execute_crawl)

# Finally, save these pages 
first_level_pages.sink(name="domain_pages", columns=[{"domain":"string"}, {"url":"string"}, {"html":"string"}])
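
Both steps above call an open_url helper that is not part of this excerpt. A minimal sketch of what such a helper could look like, assuming it should return the page HTML or None when the fetch fails (only the name and the None-on-failure behavior come from the calls above; the requests-based implementation is an assumption):

import requests

def open_url(url):
  # Hypothetical helper: fetch a page and return its HTML, or None on any failure,
  # so the calling steps can simply skip pages that could not be loaded.
  try:
    response = requests.get(url, timeout=10)
    return response.text if response.status_code == 200 else None
  except requests.RequestException:
    return None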

Example #2
def execute_find_links(controller, tup):
    # Find every link on the domain's homepage that points back to the same domain.
    domain = tup["domain"]
    page = open_url("http://" + domain)  # assumption: the homepage URL is built from the domain name

    if page is not None:
        soup = BeautifulSoup(page)
        links = soup.findAll('a', href=True)
        same_domain_links = filter(lambda link: domain in str(link["href"]), links)

        for link in same_domain_links:
            controller.emit({"domain": tup["domain"], "url": link["href"]})
    return


def execute_crawl(controller, tup):
    url = tup["url"]
    page = open_url(url)

    if page is not None:
        controller.emit({"domain": tup["domain"], "url": url, "html": page})
    return


app = zillabyte.app(name="python_crawler")

#Create a stream from all the domains we have
domains = app.source(matches="select * from domains")

# For each homepage of the domain, fetch all the links
inner_links = domains.each(execute=execute_find_links)

# For each link, fetch the page
first_level_pages = inner_links.each(execute=execute_crawl)

# Finally, save these pages
first_level_pages.sink(name="domain_pages",
                       columns=[{"domain": "string"},
                                {"url": "string"},
                                {"html": "string"}])
Example #3

# This is run after all tuples have been received for the cycle.
# We emit the "domain" and how many times it was seen across the group.
def domain_count_end_group(controller):
    global domain_word
    global domain_count
    controller.emit({"domain": domain_word, "count": domain_count})


# This is the heart of your algorithm.  It's processed on every
# web page.  This algorithm is run in parallel on possibly hundreds
# of machines.
def domain_count(controller, tup):
    for domain in domains:
        if (domain in tup["html"]):
            controller.emit({"domain": domain})


app = zillabyte.app(name="hello_world")
app.source(matches="sample_homepages") \
    .each(execute=domain_count) \
    .group_by(name="domain_count",
              fields=["domain"],
              begin_group=domain_count_begin_group,
              aggregate=domain_count_aggregate_group,
              end_group=domain_count_end_group) \
    .sink(name="domain_names", columns=[{"domain": "string"}, {"count": "integer"}])
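
The group_by step references domain_count_begin_group and domain_count_aggregate_group, which are not part of this excerpt. A sketch of what they could look like, assuming both receive the controller and the current tuple and maintain the two globals read by domain_count_end_group:

# Hypothetical reconstruction of the two missing callbacks.
def domain_count_begin_group(controller, tup):
    # Remember which domain this group is for and reset its counter.
    global domain_word
    global domain_count
    domain_word = tup["domain"]
    domain_count = 0


def domain_count_aggregate_group(controller, tup):
    # Count every tuple that arrives for the current domain.
    global domain_count
    domain_count += 1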
Example #4

def buildGraph(controller, tup):
  #extract song url and artist name  
  song = tup["song"]

  #artist name used only for debugging purposes
  artist = tup["artist"]
  #print song, artist

  #make a request to rap.genius.com to get and emit artist, featured-artist, and producer info
  songData = setSong(song)
  controller.emit({"song" : song, "artist": songData[0], "featuredArtists": songData[1], "producers": songData[2]})


def nt(controller):
  #function to handle custom source from my seed list of artists
  with open("rapperlist.csv") as rl:
    for line in rl:
      controller.emit({"artist" : line.strip()})  # strip the trailing newline so artist names are clean
  controller.end_cycle()



#initialize app, use custom source and two Each steps with previously defined functions, and a sink
app = zillabyte.app(name="pygenius")
app.source(name="raplist", next_tuple=nt, end_cycle_policy="explicit") \
   .each(execute=getsongs) \
   .each(execute=buildGraph) \
   .sink(name="rapsink", columns=[{"song": "string"}, {"artist": "string"}, {"featuredArtists": "array"}, {"producers": "array"}])
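
The pipeline also relies on a getsongs step and a setSong helper that are not part of this excerpt. Hypothetical stubs showing only the shapes the rest of the code expects (the song lookup and page scraping are placeholders, not the original implementation):

def getsongs(controller, tup):
  # Hypothetical stub: for a seed artist, emit one tuple per song page URL.
  # The real step would look the artist's songs up on rap.genius.com.
  artist = tup["artist"]
  for song_url in []:  # placeholder: discovered song URLs would go here
    controller.emit({"song": song_url, "artist": artist})


def setSong(song_url):
  # Hypothetical stub: scrape the song page and return
  # [primary artist, list of featured artists, list of producers].
  return ["unknown", [], []]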