def scrape_website(seeds, base_url, force_download=False):
    # keep scraping and indexing until we run out of links to find

    # bookkeeping: site-relative paths still to visit (scheme and host stripped
    # from each seed), paths already visited, and the index being built
    links_to_follow = set(
        ["/" + "/".join(seed.split("/")[3:]) for seed in seeds])
    visited_links = set()
    index = {}

    # repeat until we run out of links
    while links_to_follow:
        print("Number of pages in queue: {}".format(len(links_to_follow)))
        new_url = links_to_follow.pop()
        visited_links.add(new_url)

        maybe_save_url(base_url + new_url, force_download=force_download)

        links_to_follow = (links_to_follow | get_links_from_url(
            base_url + new_url, force_download=force_download)) - visited_links
        add_page_to_index(base_url + new_url, index)

    # save the sorted list of pages we visited, one full URL per line
    with open(get_filename(base_url + "/scraped_pages.txt"), "w") as f:
        f.write("\n".join(base_url + link for link in sorted(visited_links)))

    sort_and_store_index(index, base_url)
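The crawler above relies on get_links_from_url returning site-relative paths that can be unioned into links_to_follow. That helper is not shown here; the sketch below is one plausible shape for it, assuming requests and BeautifulSoup are available, and it ignores the force_download caching behaviour of the original.

import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup


def get_links_from_url(url, force_download=False):
    # Hypothetical stand-in: return the set of same-site paths linked from `url`.
    html = requests.get(url).text
    soup = BeautifulSoup(html, "html.parser")
    site = urlparse(url).netloc
    links = set()
    for anchor in soup.find_all("a", href=True):
        parsed = urlparse(urljoin(url, anchor["href"]))
        if parsed.netloc == site:  # stay on the same site
            links.add(parsed.path or "/")
    return links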
Example #2
def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    index = []
    
    while tocrawl:
        page = tocrawl.pop()
        if page not in crawled:
            # only fetch and index pages we have not crawled yet
            content = get_page(page)
            indexer.add_page_to_index(index, page, content)
            union(tocrawl, get_all_links(content))
            crawled.append(page)

    return index
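Both crawl_web variants call a union helper that is not defined in these listings. A minimal sketch consistent with how it is used here (merging new links into the work list in place, without duplicates) could be:

def union(a, b):
    # Append to list `a` every element of `b` that is not already present.
    for e in b:
        if e not in a:
            a.append(e)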
Example #3
def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    graph = {}  # <url>: [list of pages it links to]
    index = {}
    while tocrawl:
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            indexer.add_page_to_index(index, page, content)
            outlinks = get_all_links(content)
            graph[page] = outlinks
            union(tocrawl, outlinks)
            crawled.append(page)

    return index, graph
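get_page and get_all_links are likewise assumed to exist elsewhere. As a rough stand-in under that assumption, get_page only needs to return the raw HTML of a URL (or an empty string on failure):

import urllib.request


def get_page(url):
    # Hypothetical fetcher: return the page body as text, or "" if the request fails.
    try:
        with urllib.request.urlopen(url) as response:
            return response.read().decode("utf-8", errors="replace")
    except Exception:
        return ""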
Example #4
def crawl(user_id, type=INITIAL_CRAWL, url=None):
    # retrieve user info
    user = User.get_by_key_name(user_id)

    if type == INITIAL_CRAWL:
        # already indexed, ignore
        if user.indexed:
            logging.info('Already crawled %d for %s' % (user.num_indexed, user_id))
            task_crawl_previous(user_id, user.previous_url)
            return

        # Initial crawl url
        url = 'https://graph.facebook.com/%s/posts?access_token=%s' % (user.id, user.access_token)

    logging.info('start crawl: type=%d user_id=%s url=%s' % (type, user_id, url))

    try:
        result = urllib2.urlopen(url)
        data = result.read()
        logging.info(data)
        json_data = json.loads(data)
        posts = []
        index = {}
        for entity in json_data['data']:
            if 'message' in entity:
                post = Post(parent=user, key_name=entity['id'], id=entity['id'],
                            from_name=entity['from']['name'], from_id=entity['from']['id'])
                post.message = entity['message']
                post.type = entity['type']
                post.created_time = entity['created_time']
                post.likes_count = 0
                if 'likes' in entity:
                    post.likes_count = entity['likes']['count']
                post.comments_count = 0
                if 'comments' in entity:
                    post.comments_count = entity['comments']['count']
                add_page_to_index(index, entity['id'], entity['message'])
                posts.append(post)
        if len(posts):
            # store posts
            db.put(posts)
            store_index_in_db(index, user)

        # store previous url in queue
        if type == INITIAL_CRAWL:
            if 'paging' in json_data:
                if 'previous' in json_data['paging']:
                    previous = json_data['paging']['previous']
                    logging.info(previous)
                    if len(previous) > 0:
                        user.previous_url = previous

        # update user
        count = 0
        if type != INITIAL_CRAWL:
            count = user.num_indexed
        user.num_indexed = count + len(posts)
        user.last_indexed = datetime.datetime.now()
        user.indexed = True
        user.put()

        # create task for next or previous page
        if 'paging' in json_data:
            if type == INITIAL_CRAWL or type == NEXT_CRAWL:
                # store next url in queue
                if 'next' in json_data['paging']:
                    next = json_data['paging']['next']
                    logging.info(next)
                    if len(next) > 0:
                        task_crawl_next(user_id, next)
            elif type == PREVIOUS_CRAWL:
                # store previous url in queue
                if 'previous' in json_data['paging']:
                    previous = json_data['paging']['previous']
                    logging.info(previous)
                    if len(previous) > 0:
                        task_crawl_previous(user_id, previous)
    except urllib2.HTTPError, e:
        logging.error(e)
        logging.error(e.read())  # only HTTPError carries a readable response body
    except urllib2.URLError, e:
        logging.error(e)
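add_page_to_index and store_index_in_db are defined elsewhere in this project. Judging from the call add_page_to_index(index, entity['id'], entity['message']), the index dict most likely maps keywords to the ids of posts whose message contains them; a hypothetical minimal version could be:

def add_page_to_index(index, post_id, text):
    # Hypothetical helper: map each whitespace-separated keyword in `text`
    # to the list of post ids whose messages contain it.
    for keyword in text.split():
        ids = index.setdefault(keyword, [])
        if post_id not in ids:
            ids.append(post_id)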