def run(self):
    print 'Started thread ...'

    def get_all_friends(url):
        # Load the profile's friends page and keep scrolling until the
        # page source stops changing.
        self.driver.get(url + '/friends_all')
        last_source = ''
        while True:
            self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            html_source = self.driver.page_source
            count = 0
            while True:
                count += 1
                time.sleep(1)
                if html_source != last_source:
                    break
                if count > 7:
                    # If page content doesn't change after 7 seconds, stop
                    return html_source.encode('utf-8')
                last_source = html_source
                html_source = self.driver.page_source
            last_source = html_source

    # Each thread works on its own half-open slice [i, j) of the shared url list.
    for idx in range(self.i, self.j):
        url = urls[idx]
        print 'Thread :', self.name, 'Index :', idx, 'URL :', url
        data = get_all_friends(url)
        soup = bs.BeautifulSoup(data)
        anchors = soup.findAll('a', attrs={'href': re.compile(conf.url_prefix)})
        links = [a.get('href') for a in anchors]
        profile_urls = parse_url.filter_profile_urls(links)
        print 'Number of unique profiles found :', len(profile_urls), 'URL :', url
        # Append this thread's finds to its own output file.
        with open(self.name + '.txt', 'a+') as f:
            for pr in profile_urls:
                f.write(pr + '\n')
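The run method above belongs to a threading.Thread subclass whose definition isn't shown in this snippet. A minimal sketch of how such a worker could be wired up, assuming the class name CrawlerThread, a Firefox driver, and constructor arguments for the slice bounds (all of which are illustrative assumptions, not the original code):

import threading

from selenium import webdriver


class CrawlerThread(threading.Thread):
    # Hypothetical skeleton: each worker owns its own WebDriver and
    # crawls the half-open slice urls[i:j] of the shared url list.
    def __init__(self, name, i, j):
        threading.Thread.__init__(self)
        self.name = name          # also used as the output file name
        self.i = i
        self.j = j
        self.driver = webdriver.Firefox()  # assumed; any WebDriver would do

    # run(self) as defined above goes here.

Splitting urls across two workers would then look like:

mid = len(urls) // 2
workers = [CrawlerThread('worker-1', 0, mid),
           CrawlerThread('worker-2', mid, len(urls))]
for t in workers:
    t.start()
for t in workers:
    t.join()

Giving each worker its own WebDriver sidesteps sharing one browser session across threads, since WebDriver instances aren't thread-safe.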
# Single-threaded variant: same scroll-and-wait logic, but with a
# module-level driver, and the url list doubles as the crawl frontier.
def get_all_friends(url):
    driver.get(url + '/friends_all')
    last_source = ''
    while True:
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        html_source = driver.page_source
        count = 0
        while True:
            count += 1
            time.sleep(1)
            if html_source != last_source:
                break
            if count > 7:
                # If page content doesn't change after 7 seconds, stop
                return html_source.encode('utf-8')
            last_source = html_source
            html_source = driver.page_source
        last_source = html_source

try:
    # Breadth-first crawl: iterating over urls while appending to it keeps
    # the loop running until no new profiles turn up (or the user hits Ctrl-C).
    for url in urls:
        print "Started with url :", url
        data = get_all_friends(url)
        soup = bs.BeautifulSoup(data)
        anchors = soup.findAll('a', attrs={'href': re.compile(conf.url_prefix)})
        links = [a.get("href") for a in anchors]
        profile_urls = parse_url.filter_profile_urls(links)
        print "Number of unique profiles found :", len(profile_urls)
        for i in profile_urls:
            if i not in urls:
                urls.append(i)
        print "Total number of profile urls :", len(urls)
except KeyboardInterrupt:
    # On Ctrl-C, dump everything collected so far to a timestamped file.
    print "Finished fetching profile urls."
    print "Total number of profile urls :", len(urls)
    # Note: time.ctime() embeds spaces and colons in the file name.
    with open("profile_urls" + time.ctime() + ".csv", "w") as wr:
        for url in urls:
            wr.write(url + "\n")
    print "Finished ...."
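Both snippets lean on module-level state that isn't shown here. A minimal sketch of that setup, assuming BeautifulSoup 4 imported as bs, a Firefox WebDriver, and credentials plus a seed profile kept in the project's conf module (conf.email, conf.password, conf.seed_profile_url, the login URL, and the 'email'/'pass' field ids are all assumptions for illustration; conf.url_prefix and parse_url.filter_profile_urls are referenced by the original code):

import re
import time

import bs4 as bs               # BeautifulSoup 4, imported as `bs` (assumed)
from selenium import webdriver

import conf                    # project config; conf.url_prefix is used above
import parse_url               # project helper; filter_profile_urls is used above

driver = webdriver.Firefox()

# Hypothetical login step; the field ids and login URL are assumptions,
# not taken from the original script.
driver.get('https://www.facebook.com/login')
driver.find_element_by_id('email').send_keys(conf.email)
driver.find_element_by_id('pass').send_keys(conf.password)
driver.find_element_by_id('pass').submit()

# Crawl frontier, seeded with one profile and grown by the loop above.
urls = [conf.seed_profile_url]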