Example #1
def __init__(self, logging_name):

    #self.interval = WATCH_INTERVAL
    self.crawl_pool = Pool(size=CRWAL_POOL_SIZE)
    self.logger = get_logger(logging_name)
    self.page_queue = Queue()
    self.info_queue = Queue()
    self.parm_queue = Queue()
    self.proxy_manager = ProxyManager("{}/1.txt".format(project_dir), self.logger)
    #self.timer = Timer(random.randint(0,2), self.interval)
    self.proxys = self.proxy_manager.get_proxy()
    self.count = 0
Example #2
    def _search(self, retries=0):
        payload = {}
        try:
            payload['q'] = self.query.encode('utf8')  # query to lookup
        except:
            payload['q'] = self.query  # query to lookup
        #payload['sclient'] = 'psy-ab'

        payload['gl'] = self.country  # query from country
        payload['hl'] = self.language  # user query language
        payload['lr'] = 'lang_%s' % self.language  # restrict language pages
        payload['pws'] = 0  # no custom
        payload['gws_rd'] = 'cr'  # country select

        # old API
        payload['client'] = 'firefox'  # --> json result

        pool = Urllib3PoolFactory.getProxyPool()

        # Getting the response in an Object r
        time.sleep(random.uniform(0.0, 0.2))
        try:
            r = pool.request('GET',
                             self.googleHost,
                             fields=payload,
                             headers={
                                 "User-Agent": UserAgent.old,
                                 "Accept": "text/html"
                             })
        except Exception as ex:
            app_logger.error(u"_requestError %s" % ex)
            raise ex

        # ['repelente', [['repelente<b> mosquitos</b>', 0, [131]], ['repelente<b> para gatos</b>', 0, [131]], ['repelente<b> para perros</b>', 0], ['repelente', 0]], ...
        results = []

        data = json.loads(r.data)
        if len(data) > 1:
            for itemsData in data[1]:
                ##related_query = BeautifulSoup(itemsData[0], "lxml").getText()
                related_query = itemsData
                if related_query != self.query:
                    results.append(related_query)

        if not data and not results:
            ProxyManager.invalidateProxy()
            if retries > 0:
                print 'Retrying (%s)... %s' % (retries, self.query)
                return self._search(retries - 1)

        return results
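The client=firefox parameter above makes the old Google endpoint return JSON in the shape shown in the comment (data[0] is the query, data[1] the suggestion list). A minimal standalone sketch of that parsing step, assuming the response shape from the comment; parse_suggestions is a hypothetical helper, not part of the original code:

import json

def parse_suggestions(raw, query):
    # data[0] is the original query, data[1] the list of suggestions;
    # each suggestion may be a plain string or a [text, type, ...] list.
    data = json.loads(raw)
    results = []
    if len(data) > 1:
        for item in data[1]:
            text = item[0] if isinstance(item, list) else item
            if text != query:
                results.append(text)
    return results

# e.g. parse_suggestions('["repelente", [["repelente mosquitos", 0], "repelente"]]', 'repelente')
# returns ['repelente mosquitos']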
Example #3
class Taskmanager(object):
    
    def __init__(self, logging_name):
        
        #self.interval = WATCH_INTERVAL
        self.crawl_pool = Pool(size=CRWAL_POOL_SIZE)
        self.logger = get_logger(logging_name)
        self.page_queue = Queue()
        self.info_queue = Queue()
        self.parm_queue = Queue()
        self.proxy_manager = ProxyManager("{}/1.txt".format(project_dir), self.logger)
        #self.timer = Timer(random.randint(0,2),self.interval)
        self.proxys = self.proxy_manager.get_proxy()
        self.count = 0

        
    def run(self):
        pass
    
    def reload_proxies(self):
        self.proxy_manager.reload_proxies()
        
    def _feed_page_queue(self,base_url):
        pass
    
    def _page_loop(self):
        while 1:
            page_url=self.page_queue.get(block=True)
            gevent.sleep(2)
            self.crawl_pool.spawn(self._feed_info_queue, page_url)
    
    def _feed_info_queue(self,url):
        pass
    
    def _item_loop(self):
        while 1:
            item_url=self.info_queue.get(block=True)
            gevent.sleep(2)
            self.crawl_pool.spawn(self._crawl_info, item_url)
            
    def _crawl_info(self,item_url):
        pass
    
    def _db_save_loop(self):
        while 1:
            parm = self.parm_queue.get(block=True)
            gevent.sleep(0.1)
            self.count = self.count+1
            S = SqlHelper(logger=self.logger)
            self.crawl_pool.spawn(S.insert_scholar, **parm)
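Taskmanager above is an abstract base: subclasses fill in _feed_page_queue, _feed_info_queue and _crawl_info, while the three *_loop methods drain the queues through the shared gevent pool. A sketch of how a subclass might start the pipeline; the run body and the seed URL are assumptions, not part of the original:

import gevent

class ScholarTaskmanager(Taskmanager):

    def run(self):
        # Seed the page queue, then run the three consumer loops
        # concurrently; each loop spawns workers on self.crawl_pool.
        self._feed_page_queue('http://example.com/listing')  # hypothetical seed URL
        gevent.joinall([
            gevent.spawn(self._page_loop),
            gevent.spawn(self._item_loop),
            gevent.spawn(self._db_save_loop),
        ])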
Example #4
    def getProxyPool():
        # Note: despite the name, this variant returns a configured
        # requests.Session rather than a urllib3 pool.

        from utils.proxy_manager import ProxyManager
        import urllib3
        urllib3.disable_warnings()  #@UndefinedVariable

        nextProxy = ProxyManager.getNextProxy()

        if nextProxy.proxy_basic_auth:
            proxy_url = 'http://%s@%s:%s' % (nextProxy.proxy_basic_auth,
                                             nextProxy.host, nextProxy.port)
        else:
            proxy_url = 'http://%s:%s' % (nextProxy.host, nextProxy.port)

        proxies = {
            'http': proxy_url,
            'https': proxy_url,
        }

        import requests

        session = requests.Session()
        session.proxies = proxies
        session.max_redirects = 2

        return session
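A sketch of how the returned session might be used, mirroring the proxy-rotation pattern of the _search examples; the URL and timeout are illustrative, not from the original:

import requests
from utils.proxy_manager import ProxyManager

session = getProxyPool()
try:
    r = session.get('http://example.com', timeout=10)  # illustrative URL
except requests.RequestException:
    # Rotate away from a bad proxy, as the _search examples do.
    ProxyManager.invalidateProxy()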
Example #5
def _feed_journal_loop(self, subject, subject_url):

    self.logging = get_logger(name=subject)
    self.proxy_manager = ProxyManager("{}/1.txt".format(project_dir), self.logging)

    self.logging.info("Processing journal {}".format(subject_url))
    html_source = fetch(subject_url, requests_session=self.requests_session)

    journal_item = iter(extract("//li[@class='browseimpBrowseRow']/ul/li/span/a/@href", html_source, multi=True))
    try:
        while True:
            self.journal_queue.put_nowait(
                'http://www.sciencedirect.com{}'.format(next(journal_item)))
    except StopIteration:
        self.logging.info("Journal_Queue got {} seeds".format(self.journal_queue._qsize()))
    def getProxyPool():
        from utils.proxy_manager import ProxyManager
        import urllib3
        from urllib3.util.retry import Retry
        urllib3.disable_warnings()  #@UndefinedVariable

        nextProxy = ProxyManager.getNextProxy()
        if nextProxy.proxy_basic_auth:
            headers = urllib3.make_headers(
                proxy_basic_auth=nextProxy.proxy_basic_auth)
        else:
            headers = None
        proxy_url = 'http://%s:%s' % (nextProxy.host, nextProxy.port)
        proxy = urllib3.ProxyManager(proxy_url,
                                     proxy_headers=headers,
                                     retries=Retry(total=None,
                                                   connect=2,
                                                   read=2,
                                                   redirect=2,
                                                   backoff_factor=0.1))
        return proxy
Example #7
    def _search(self, start, visible=0):
        payload = {}
        try:
            payload['q'] = self.query.encode('utf8')  # query to lookup
        except:
            payload['q'] = self.query  # query to lookup
        payload['start'] = start  # start point
        payload['gl'] = self.country  # query from country
        payload['hl'] = self.language  # user query language
        payload['lr'] = 'lang_%s' % self.language  # restrict language pages
        payload['num'] = GoogleSeleniumPlus.PAGE_LIMIT
        payload['safe'] = 'off'

        params = urllib.urlencode(payload)

        display = Display(visible=visible, size=(800, 600))
        display.start()
        try:
            proxyInfo = ProxyManager.getNextProxy()

            myProxy = '%s:%s' % (proxyInfo.host, proxyInfo.port)

            proxy = Proxy({
                'proxyType': ProxyType.MANUAL,
                'httpProxy': myProxy,
                'ftpProxy': myProxy,
                'sslProxy': myProxy,
                'noProxy': ''  # set this value as desired
            })

            browser = webdriver.Firefox(proxy=proxy)
            browser.set_page_load_timeout(30)
            try:
                browser.implicitly_wait(10)

                browser.get('%s#%s' % (self.googleHost, params))

                app_logger.info(u"%s" % browser.current_url)

                h3List = browser.find_elements_by_xpath("//h3[@class='r']")

                results = []

                for h3 in h3List:
                    link = h3.find_element_by_tag_name('a')
                    results.append(link.get_attribute("href"))

                box = browser.find_element_by_id('lst-ib')

                partialQuery = ' '.join(self.query.split()[1:])

                for _letter in partialQuery:
                    box.send_keys(Keys.BACKSPACE)
                    randomSleep(0.03, 0.05)

                typeQuery(box, partialQuery)

                randomSleep(0.05, 0.25)
                print('-' * 80)

            finally:
                browser.close()

        except Exception as ex:
            raise ex

        finally:
            display.stop()

        if not results:
            ProxyManager.invalidateProxy()

        return results
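Several of the Selenium examples call typeQuery and randomSleep helpers that are not shown on this page. A plausible minimal sketch of what they do, inferred from how they are called; the originals may differ:

import random
import time

def randomSleep(low, high):
    # Sleep for a random interval so request timing is less uniform.
    time.sleep(random.uniform(low, high))

def typeQuery(box, query):
    # Send the query one character at a time with small pauses,
    # mimicking a human typing into the search box.
    for letter in query:
        box.send_keys(letter)
        randomSleep(0.03, 0.05)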
Example #8
    def _search(self, start, retries=0, visible=0):
        
        display = None
        results = []
        
        try:
            display = Display(visible=visible, size=(800, 600))
            display.start()
        except Exception:
            # If the virtual display cannot start, continue without it.
            pass
        try:
            proxyInfo = ProxyManager.getNextProxy()
            
            myProxy = '%s:%s' % (proxyInfo.host,proxyInfo.port)
    
            proxy = Proxy({
                'proxyType': ProxyType.MANUAL,
                'httpProxy': myProxy,
                'ftpProxy': myProxy,
                'sslProxy': myProxy,
                'noProxy': '' # set this value as desired
                })
    
            browser = webdriver.Firefox(proxy=proxy)
            browser.set_page_load_timeout(30)
            try:
                #params = urllib.urlencode(payload)
    
                browser.implicitly_wait(10)
    
                browser.get('%s' % (self.googleHost,))
                
                box = browser.find_element_by_id('lst-ib')
                
                app_logger.info(u"%s" % self.googleHost)
                
                typeQuery(box, self.query)  
                
                # <p class="_e4b"><a href="..">elefantes marinos <b>videos</b></a></p>
                paragraphList = browser.find_elements_by_xpath('//*[@class="_e4b"]')
                
                for p in paragraphList:
                    link = p.find_element_by_tag_name('a')
                    results.append(link.text)

                # fake typing    
                partialQuery = ' '.join(self.query.split()[1:])
                
                for _letter in partialQuery: 
                    box.send_keys(Keys.BACKSPACE)
                    randomSleep(0.03, 0.05)
                
                typeQuery(box, partialQuery)
    
                randomSleep(0.05, 0.25)
            
            finally:
                browser.close()
        
        except Exception as ex:
            raise ex
        
        finally:
            if display:
                display.stop()
        
        if not results:
            ProxyManager.invalidateProxy()
            if retries > 0:
                print 'Retrying... %s' % self.query
                return self._search(start, retries=retries-1, visible=visible)
        
        return results
Example #9
    def _search(self, start):
        payload = {}
        try:
            payload['q'] = self.query.encode('utf8')  # query to lookup
        except:
            payload['q'] = self.query  # query to lookup
        payload['start'] = start  # start point
        payload['gl'] = self.country  # query from country
        payload['hl'] = self.language  # user query language
        payload['lr'] = 'lang_%s' % self.language  # restrict language pages
        payload['num'] = GoogleSelenium.PAGE_LIMIT
        payload['safe'] = 'off'

        params = urllib.urlencode(payload)

        display = Display(visible=0, size=(800, 600))
        try:
            display.start()

            proxyInfo = ProxyManager.getNextProxy()

            myProxy = '%s:%s' % (proxyInfo.host, proxyInfo.port)

            proxy = Proxy({
                'proxyType': ProxyType.MANUAL,
                'httpProxy': myProxy,
                'ftpProxy': myProxy,
                'sslProxy': myProxy,
                'noProxy': ''  # set this value as desired
            })

            driver = webdriver.Firefox(proxy=proxy)

            try:
                '''
                If anything fails here, check that the IP is
                registered with buyproxies.com
                '''
                driver.implicitly_wait(10)

                driver.get('%s?%s' % (self.googleHost, params))

                app_error_logger.info(u"%s" % driver.current_url)

                results = []

                h3List = driver.find_elements_by_xpath("//h3[@class='r']")

                for h3 in h3List:
                    link = h3.find_element_by_tag_name('a')
                    results.append(link.get_attribute("href"))

            except Exception as ex:
                raise ex

            finally:
                driver.close()

        except Exception as ex:
            raise ex

        finally:
            display.stop()

        if not results:
            ProxyManager.invalidateProxy()

        return results
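All of the _search variants share the same recovery pattern: when a request comes back empty, the current proxy is discarded with ProxyManager.invalidateProxy() and, where a retries budget exists, the call recurses. A generic sketch of that pattern in isolation; search_with_retries and search_fn are hypothetical, not part of the original code:

from utils.proxy_manager import ProxyManager

def search_with_retries(search_fn, query, retries=2):
    # Retry a search through fresh proxies until results arrive
    # or the retry budget is exhausted.
    results = search_fn(query)
    if not results:
        ProxyManager.invalidateProxy()  # drop the proxy that returned nothing
        if retries > 0:
            return search_with_retries(search_fn, query, retries - 1)
    return results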