def run(self):
    print 'Started thread ...'

    def get_all_friends(url):
        # Load the profile's friends page and keep scrolling until the
        # page source stops changing.
        self.driver.get(url + '/friends_all')
        last_source = ''
        while True:
            self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            html_source = self.driver.page_source
            count = 0
            while True:
                count += 1
                time.sleep(1)
                if html_source != last_source:
                    break
                if count > 7:
                    # If page content doesn't change after 7 seconds, stop
                    return html_source.encode('utf-8')
                last_source = html_source
                html_source = self.driver.page_source
            last_source = html_source

    # Each thread works on its own half-open slice [i, j) of the shared url list.
    for idx in range(self.i, self.j):
        url = urls[idx]
        print 'Thread :', self.name, 'Index :', idx, 'URL :', url
        data = get_all_friends(url)
        soup = bs.BeautifulSoup(data)
        anchors = soup.findAll('a', attrs={'href': re.compile(conf.url_prefix)})
        links = [a.get('href') for a in anchors]
        profile_urls = parse_url.filter_profile_urls(links)
        print 'Number of unique profiles found :', len(profile_urls), 'URL :', url
        # Append this thread's finds to its own output file.
        with open(self.name + '.txt', 'a+') as f:
            for pr in profile_urls:
                f.write(pr + '\n')
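The run method above belongs to a threading.Thread subclass whose definition isn't shown in this snippet. A minimal sketch of how such a worker could be wired up, assuming the class name CrawlerThread, a Firefox driver, and constructor arguments for the slice bounds (all of which are illustrative assumptions, not the original code):

import threading

from selenium import webdriver


class CrawlerThread(threading.Thread):
    # Hypothetical skeleton: each worker owns its own WebDriver and
    # crawls the half-open slice urls[i:j] of the shared url list.
    def __init__(self, name, i, j):
        threading.Thread.__init__(self)
        self.name = name          # also used as the output file name
        self.i = i
        self.j = j
        self.driver = webdriver.Firefox()  # assumed; any WebDriver would do

    # run(self) as defined above goes here.

Splitting urls across two workers would then look like:

mid = len(urls) // 2
workers = [CrawlerThread('worker-1', 0, mid),
           CrawlerThread('worker-2', mid, len(urls))]
for t in workers:
    t.start()
for t in workers:
    t.join()

Giving each worker its own WebDriver sidesteps sharing one browser session across threads, since WebDriver instances aren't thread-safe.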
# Single-threaded variant: same scroll-and-wait logic, but with a
# module-level driver, and the url list doubles as the crawl frontier.
def get_all_friends(url):
    driver.get(url + '/friends_all')
    last_source = ''
    while True:
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        html_source = driver.page_source
        count = 0
        while True:
            count += 1
            time.sleep(1)
            if html_source != last_source:
                break
            if count > 7:
                # If page content doesn't change after 7 seconds, stop
                return html_source.encode('utf-8')
            last_source = html_source
            html_source = driver.page_source
        last_source = html_source

try:
    # Breadth-first crawl: iterating over urls while appending to it keeps
    # the loop running until no new profiles turn up (or the user hits Ctrl-C).
    for url in urls:
        print "Started with url :", url
        data = get_all_friends(url)
        soup = bs.BeautifulSoup(data)
        anchors = soup.findAll('a', attrs={'href': re.compile(conf.url_prefix)})
        links = [a.get("href") for a in anchors]
        profile_urls = parse_url.filter_profile_urls(links)
        print "Number of unique profiles found :", len(profile_urls)
        for i in profile_urls:
            if i not in urls:
                urls.append(i)
        print "Total number of profile urls :", len(urls)
except KeyboardInterrupt:
    # On Ctrl-C, dump everything collected so far to a timestamped file.
    print "Finished fetching profile urls."
    print "Total number of profile urls :", len(urls)
    # Note: time.ctime() embeds spaces and colons in the file name.
    with open("profile_urls" + time.ctime() + ".csv", "w") as wr:
        for url in urls:
            wr.write(url + "\n")
    print "Finished ...."
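Both snippets lean on module-level state that isn't shown here. A minimal sketch of that setup, assuming BeautifulSoup 4 imported as bs, a Firefox WebDriver, and credentials plus a seed profile kept in the project's conf module (conf.email, conf.password, conf.seed_profile_url, the login URL, and the 'email'/'pass' field ids are all assumptions for illustration; conf.url_prefix and parse_url.filter_profile_urls are referenced by the original code):

import re
import time

import bs4 as bs               # BeautifulSoup 4, imported as `bs` (assumed)
from selenium import webdriver

import conf                    # project config; conf.url_prefix is used above
import parse_url               # project helper; filter_profile_urls is used above

driver = webdriver.Firefox()

# Hypothetical login step; the field ids and login URL are assumptions,
# not taken from the original script.
driver.get('https://www.facebook.com/login')
driver.find_element_by_id('email').send_keys(conf.email)
driver.find_element_by_id('pass').send_keys(conf.password)
driver.find_element_by_id('pass').submit()

# Crawl frontier, seeded with one profile and grown by the loop above.
urls = [conf.seed_profile_url]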