def get_all_friends(user1):
    friends = []
    page = get_4sq_page('users', user1)
    if page is None:
        return friends
    user = json.loads(page)['response']['user']
    # pull the check-in count if the API returned one
    count = 0
    if 'checkins' in user:
        checkins = user['checkins']
        if 'count' in checkins:
            count = checkins['count']
    #print user, count
    # fall back to placeholder strings for missing profile fields
    firstName = user['firstName'] if 'firstName' in user else 'firstName'
    lastName = user['lastName'] if 'lastName' in user else 'lastName'
    homeCity = user['homeCity'] if 'homeCity' in user else 'homeCity'
    write_dat('user', [user1, firstName, lastName, homeCity.replace(' ', '_'), count])
    # record every friend and, where a Twitter handle is listed, follow the
    # expanded URLs in their tweets to collect venues
    for group in user['friends']['groups']:
        for item in group['items']:
            user2 = item['id'] if 'id' in item else 'user2'
            friends.append(user2)
            write_dat('friendship', [user1, user2])
            if 'contact' in item:
                contact = item['contact']
                if 'twitter' in contact:
                    for t_urls in get_4sq(contact['twitter']):
                        for t_url in t_urls:
                            #print t_url['expanded_url']
                            page = get_page(t_url['expanded_url'])
                            if page is not None:
                                get_venue(user2, page)
    return friends
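# write_dat is not defined in these snippets. Judging from the calls above (and the
# homeCity.replace(' ', '_') that keeps fields space-free), it appears to append one
# space-separated record per call to a .dat file. A hedged stand-in only; the file
# name and layout are assumptions, not the original helper:
def write_dat(name, fields):
    with open(name + '.dat', 'a') as f:
        f.write(' '.join(str(field) for field in fields) + '\n')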
def crawl_web(tocrawl, keyword, n=1000):  # returns index, graph of inlinks
    crawled = set([])
    num_404 = 0
    total_size = 0
    min_score = -tocrawl[0][0]
    fout = open('result.txt', 'w+')
    start = time.clock()
    while tocrawl:
        url = heappop(tocrawl)  # changed page to url - clearer name
        filehandle = get_page(url[1])
        if filehandle is None:
            continue
        code = filehandle.code
        if code == 404:
            num_404 += 1
        if code == 401:
            continue
        if filehandle.headers.type != 'text/html':
            continue
        new_url = filehandle.geturl()
        if new_url not in crawled:
            #corpus.add_page(url, new_url, outlinks, tocrawl, count)
            #tocrawl += outlinks
            page = filehandle.read()
            outlinks, count = get_all_link_keyword(page, new_url, keyword)
            if count == 0:
                continue
            for outlink in outlinks:
                # if the outlink is already queued, bump its priority;
                # otherwise push it, subject to the queue-size / score cutoff
                is_new_link = True
                for i in range(len(tocrawl)):
                    target = tocrawl[i]
                    if target[1] == outlink:
                        is_new_link = False
                        tocrawl.pop(i)
                        heappush(tocrawl, (target[0] - count, outlink))
                        break
                #if is_new_link and len(tocrawl) < n:
                if is_new_link:
                    if len(tocrawl) > n:
                        if count > min_score:
                            heappush(tocrawl, (-count, outlink))
                    else:
                        heappush(tocrawl, (-count, outlink))
                        if count < min_score:
                            min_score = count
            crawled.add(new_url)
            urllib.urlretrieve(new_url, os.path.join('downloads', str(n) + ".html"))
            n -= 1
            if n < 0:
                break
            size = len(page)
            total_size += size
            fout.write(new_url + ' time:' + str(time.clock()) + ' size:' + str(size)
                       + ' return_code:' + str(code) + ' score:' + str(-url[0])
                       + ' actually:' + str(count) + '\n')
    fout.write('number_of_files:' + str(len(crawled)) + ' total_size:' + str(total_size)
               + ' total_time:' + str(time.clock() - start)
               + ' number_of_404_errors:' + str(num_404))
    fout.close()
    return crawled
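# The two keyword crawlers (this one and the binary-search variant at the end)
# expect get_page to return the open response object itself, since they read
# .code, .headers.type, .geturl() and .read(). A minimal Python 2 sketch built on
# urllib2; the timeout, the HTTPError-as-response trick and the exceptions caught
# are assumptions, not the original helper:
import urllib2

def get_page(url):
    try:
        return urllib2.urlopen(url, timeout=10)
    except urllib2.HTTPError as e:
        return e   # still file-like; the caller inspects .code (404/401) itself
    except (urllib2.URLError, ValueError):
        return None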
def crawl(self, seed):
    links = set(seed)
    crawled = []
    while len(crawled) < self.depth and links:
        url = links.pop()
        if url not in crawled:
            content = get_page(url)
            scraped_links = self.__get_links(content, url)
            links.update(scraped_links)
            crawled.append(url)
    return crawled
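# This method and the simpler crawl_web variants below treat get_page as returning
# the page text itself (the content is split into words and scanned for links). A
# minimal Python 2 sketch; returning '' on failure is an assumption that keeps the
# callers' content.split() and link extraction from blowing up:
import urllib2

def get_page(url):
    try:
        return urllib2.urlopen(url).read()
    except Exception:
        return ''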
def crawl_web(seed, max_depth):
    tocrawl = [[seed, 0]]
    crawled = []
    while tocrawl:
        myData = tocrawl.pop()   # [url, depth]
        # only fetch pages we have not crawled yet and that are within the depth limit
        if myData[0] not in crawled and myData[1] <= max_depth:
            union(tocrawl, get_all_links(getpage.get_page(myData[0]), myData[1] + 1))
            crawled.append(myData[0])
    return crawled
def crawl_web(seed):
    tocrawl = set([seed])
    crawled = []
    corpus = WebCorpus()
    while tocrawl:
        user_id = tocrawl.pop()
        if user_id not in crawled:
            content = get_page(user_id)
            friends = get_all_friends(content)
            corpus.add_friend(user_id, friends)
            tocrawl.update(friends)
            crawled.append(user_id)
    return crawled
def crawl_web(seed):  # returns index, graph of inlinks
    tocrawl = set([seed])
    crawled = []
    corpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()  # changed page to url - clearer name
        if url not in crawled:
            content = get_page(url)
            outlinks = get_all_links(content)
            corpus.add_page(url, content, outlinks)
            tocrawl.update(outlinks)
            crawled.append(url)
    corpus.finish_crawl()
    return corpus
def crawl_web(seed):  # returns index, graph of inlinks
    tocrawl = set([seed])
    crawled = []
    wcorpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()  # changed page to url - clearer name
        if url not in crawled:
            content = get_page(url)
            add_page_to_index(wcorpus.index, url, content)
            outlinks = get_all_links(content)
            wcorpus.graph[url] = outlinks
            tocrawl.update(outlinks)
            crawled.append(url)
    return wcorpus
def crawl_web(seed):  # returns index, graph of inlinks
    tocrawl = set([seed])
    crawled = []
    graph = {}  # <url>, [list of pages it links to]
    index = {}
    while tocrawl:
        url = tocrawl.pop()  # changed page to url - clearer name
        if url not in crawled:
            content = get_page(url)
            add_page_to_index(index, url, content)
            outlinks = get_all_links(content)
            graph[url] = outlinks
            tocrawl.update(outlinks)
            crawled.append(url)
    return index, graph
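# A hedged usage sketch for the index/graph variant above. It assumes
# add_page_to_index fills index as a word -> [list of urls] mapping, the usual
# shape in this family of crawlers; the seed URL is made up for illustration.
index, graph = crawl_web('http://www.example.com/index.html')
print index.get('python', [])                               # pages containing the word
print graph.get('http://www.example.com/index.html', [])    # that page's outlinks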
def crawl_web(seed):  # returns index, graph of inlinks
    tocrawl = set([seed])
    crawled = []
    wcorpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()  # changed page to url - clearer name
        if url not in crawled:
            content = get_page(url)
            outlinks = get_all_links(content)
            for outlink in outlinks:
                wcorpus.add_link(url, outlink)
            for word in content.split():
                wcorpus.add_word_occurrence(url, word)
            tocrawl.update(outlinks)
            crawled.append(url)
    return wcorpus
def crawlWeb(seed):
    toCrawl = set([seed])  # start with a seed page
    crawled = []           # record sites already crawled to prevent repeat visits
    wcorpus = WebCorpus()
    while toCrawl:
        url = toCrawl.pop()
        if url not in crawled:                # check whether already crawled
            content = get_page(url)           # read in all of the page's HTML text
            outlinks = getAllLinks(content)   # keep outlinks for building the graph
            for outlink in outlinks:
                wcorpus.add_link(url, outlink)
            for word in content.split():
                wcorpus.add_word_occurrence(url, word)
            toCrawl.update(outlinks)          # queue outlinks we haven't crawled yet
            crawled.append(url)               # store the page we just popped
    return wcorpus
def crawl_web(seed):  # returns webcorpus (includes index, graph)
    tocrawl = set([seed])
    crawled = []
    corpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()
        if url not in crawled:
            content = get_page(url)
            add_page_to_index(corpus, url, content)
            outlinks = get_all_links(content)
            for outlink in outlinks:
                corpus.add_link(url, outlink)
            tocrawl.update(outlinks)
            crawled.append(url)
    return corpus
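# WebCorpus itself is not shown in any of these snippets. Below is a minimal
# stand-in that satisfies the calls made above (add_link, add_word_occurrence, and
# the .index / .graph attributes some variants touch directly); the real class
# presumably does more, so treat this only as a sketch of the assumed interface.
class WebCorpus(object):
    def __init__(self):
        self.graph = {}   # url -> [outlinks]
        self.index = {}   # word -> [urls containing it]

    def add_link(self, url, outlink):
        self.graph.setdefault(url, []).append(outlink)

    def add_word_occurrence(self, url, word):
        urls = self.index.setdefault(word, [])
        if url not in urls:
            urls.append(url)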
def get_all_friends(user1):
    friends = []
    page = get_4sq_page('users', user1)
    if page is None:   # guard against a failed API call, as in the earlier variant
        return friends
    user = json.loads(page)['response']['user']
    count = 0
    if 'checkins' in user:
        checkins = user['checkins']
        if 'count' in checkins:
            count = checkins['count']
    #print user, count
    firstName = user['firstName'] if 'firstName' in user else 'firstName'
    lastName = user['lastName'] if 'lastName' in user else 'lastName'
    homeCity = user['homeCity'] if 'homeCity' in user else 'homeCity'
    write_dat('user', [user1, firstName, lastName, homeCity.replace(' ', '_'), count])
    for group in user['friends']['groups']:
        for item in group['items']:
            user2 = item['id'] if 'id' in item else 'user2'
            friends.append(user2)
            write_dat('friendship', [user1, user2])
            if 'contact' in item:
                contact = item['contact']
                if 'twitter' in contact:
                    try:
                        for t_urls in get_4sq(contact['twitter']):
                            for t_url in t_urls:
                                #print t_url['expanded_url']
                                get_venue(user2, get_page(t_url['expanded_url']))
                    except tweepy.TweepError:
                        time.sleep(60 * 2)   # back off when Twitter rate-limits us
                        continue
                    except StopIteration:
                        break
                    except:
                        pass
    return friends
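# get_4sq is not shown either. The tweepy.TweepError / StopIteration handling above
# suggests it pages through the friend's Twitter timeline and yields each tweet's
# URL entities (each carrying an 'expanded_url'). A speculative sketch using
# tweepy's Cursor; the 'api' object and the 200-tweet cap are assumptions:
def get_4sq(twitter_handle):
    for status in tweepy.Cursor(api.user_timeline, screen_name=twitter_handle).items(200):
        yield status.entities.get('urls', [])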
def crawl_web(tocrawl, keyword, n=1000):  # returns index, graph of inlinks
    crawled = set([])
    url_finder = {}   # url -> score currently stored for it in the heap
    num_404 = 0
    total_size = 0
    fout = open('result.txt', 'w+')
    start = time.clock()
    while tocrawl:
        url = heappop(tocrawl)  # changed page to url - clearer name
        filehandle = get_page(url[1])
        if filehandle is None:
            continue
        code = filehandle.code
        if code == 404:
            num_404 += 1
        if code == 401:
            continue
        if filehandle.headers.type != 'text/html':
            continue
        new_url = filehandle.geturl()
        if new_url not in crawled:
            #corpus.add_page(url, new_url, outlinks, tocrawl, count)
            #tocrawl += outlinks
            page = filehandle.read()
            outlinks, count = get_all_link_keyword(page, new_url, keyword)
            if count == 0:
                continue
            len_tocrawl = len(tocrawl)
            for outlink in outlinks:
                is_new_link = True
                min_index = 0
                max_index = len_tocrawl - 1
                if outlink in url_finder:
                    # the outlink is already queued: binary-search the heap list for
                    # its stored score, then bump its priority by the new count
                    key_score = url_finder[outlink]
                    while True:
                        if max_index < min_index:
                            break
                        m = (max_index + min_index) / 2
                        target = tocrawl[m]
                        if target[0] < key_score:
                            min_index = m + 1
                        elif target[0] > key_score:
                            max_index = m - 1
                        else:
                            for j in range(min_index, max_index):
                                target = tocrawl[j]
                                if target[1] == outlink:
                                    is_new_link = False
                                    tocrawl.pop(j)
                                    url_score = target[0] - count
                                    heappush(tocrawl, (url_score, outlink))
                                    url_finder[outlink] = url_score
                                    break
                            break
                    # for i in range(len_tocrawl):
                    #     target = tocrawl[i]
                    #     if target[1] == outlink:
                    #         is_new_link = False
                    #         tocrawl.pop(i)
                    #         heappush(tocrawl, (target[0] - count, outlink))
                    #         break
                    #if is_new_link and len(tocrawl) < n:
                else:
                    heappush(tocrawl, (-count, outlink))
                    url_finder[outlink] = -count
            tocrawl = nsmallest(n, tocrawl)   # keep only the n best-scoring links
            crawled.add(new_url)
            urllib.urlretrieve(new_url, os.path.join('downloads', str(n) + ".html"))
            n -= 1
            if n < 0:
                break
            size = len(page)
            total_size += size
            fout.write(new_url + ' time:' + str(time.clock()) + ' size:' + str(size)
                       + ' return_code:' + str(code) + ' score:' + str(-url[0])
                       + ' actually:' + str(count) + '\n')
    fout.write('number_of_files:' + str(len(crawled)) + ' total_size:' + str(total_size)
               + ' total_time:' + str(time.clock() - start)
               + ' number_of_404_errors:' + str(num_404))
    fout.close()
    return crawled
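# Hedged usage sketch for the two keyword-focused crawlers: tocrawl is a heapq list
# of (-score, url) pairs, so the most promising page is popped first, and downloads
# are written into a 'downloads' directory that must already exist. The seed URL,
# keyword, initial score and limit below are made up for illustration.
from heapq import heapify
import os

if not os.path.isdir('downloads'):
    os.mkdir('downloads')
seeds = [(-1, 'http://www.example.com/')]
heapify(seeds)
crawled = crawl_web(seeds, 'python', n=100)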