import re
import sys
import urllib2

import requests

import crawler  # project module that scrapes profile data from a fetched page


def fetchProfiles(initURL, maxcount):
    """Given the URL from which to initiate the crawling, fetch the web page
    and send it to the crawler for scraping. Also read all the public profile
    URLs present in the current page and add them to the link set; subsequent
    iterations fetch the LinkedIn profiles behind those URLs. The iteration
    continues for the number of times specified by maxcount."""
    count = 0
    links = set([initURL])
    waitinglist = list()
    while count < maxcount:
        count += 1
        while links:
            newreq = links.pop()
            if newreq not in waitinglist:
                # If the URL hasn't been used already, add it to the waiting list
                waitinglist.append(newreq)
                break
        else:
            break  # No unvisited URLs left to crawl
        try:
            # Fetch the web page from the URL just appended
            page = urllib2.urlopen(waitinglist[-1]).read()
        except urllib2.URLError:
            break
        # Send the page and its URL for scraping
        crawler.contentExtractor(page, waitinglist[-1])
        # Collect all the public profile URLs present in this web page
        links.update(re.findall(r'http://.*linkedin.com/pub/(?:[a-z]*[-]?)*(?:/?[0-9]?[a-z]?)*\?trk=pub-pbmap', page))
        links = set([link.strip('"') for link in links])
        # Progress bar
        percentage = int(count * 100.0 / maxcount)
        sys.stdout.write('\r' + '=' * percentage + '>' + ' ' * (101 - percentage) + str(percentage) + '%')
        sys.stdout.flush()
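
# A minimal usage sketch (not part of the original module): the seed URL below
# is hypothetical and only illustrates the public-profile shape the regex above
# matches, i.e. ".../pub/<name-slug>/<ids>?trk=pub-pbmap".
def _demoCrawl():
    seed = 'http://www.linkedin.com/pub/john-doe/1/2a/345?trk=pub-pbmap'
    fetchProfiles(seed, 25)  # scrape at most 25 profiles starting from the seed
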
def google(params):
    """Google for LinkedIn profiles matching the given parameters."""
    print 'Googling with params', params
    # btnI=1 triggers Google's "I'm Feeling Lucky" search, which redirects
    # straight to the top hit for "<params> linkedin"
    url = 'http://google.com/search?btnI=1&q=' + '+'.join(params) + '+linkedin'
    try:
        page = requests.get(url, allow_redirects=True)
        if re.match(r'http://.*linkedin.com/pub/dir/*', page.url):
            # Landed on a LinkedIn directory listing, not a single profile
            return False
        else:
            crawler.contentExtractor(page.content, page.url)
            return True
    except requests.RequestException:
        return False
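
# A hedged driver sketch, assuming the module is run directly; the name tokens
# below are hypothetical. google() returns True when the "I'm Feeling Lucky"
# redirect lands on a single profile that contentExtractor could scrape, and
# False when it hits a directory page or the request fails.
if __name__ == '__main__':
    scraped = google(['John', 'Doe'])  # hypothetical query terms
    print 'Profile scraped?', scraped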