def get_data(self, argv):
    # argv = {"urls" : [], "worker" : , }
    content = None
    error_code = None
    self.logger.debug("start fetch " + argv["url"])
    try:
        url = argv["url"]
        try:
            with eventlet.Timeout(self.timeout, False):
                headers = {
                    "User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"
                }
                if self.proxy is None:
                    req = urllib2.Request(url, headers=headers)
                    res = urllib2.urlopen(req)
                    content = res.read()
                else:
                    proxy_handler = urllib2.ProxyHandler(self.proxy)
                    opener = urllib2.build_opener(proxy_handler)
                    header_list = []
                    for header in headers:
                        header_list.append((header, headers[header]))
                    opener.addheaders = header_list
                    res = opener.open(url)
                    content = res.read()
        except urllib2.HTTPError, e:
            raise Exception(e.code)
        except urllib2.URLError, e:
            raise Exception("URLError")
    except Exception, e:
        # The outer handler is truncated in the original snippet; record the
        # error so the initialised error_code is used (reconstructed).
        error_code = e
    return content, error_code
def get_sync(self, url, data, type=None,
             content_type="application/x-www-form-urlencoded"):
    try:
        data = urlencode(data)
    except:
        # data is probably a string to be sent directly.
        pass
    headers = {"Content-Type": content_type}
    if type and type.upper() not in ("POST", "GET"):
        from restlib import RestfulRequest  # @UnresolvedImport
        req = RestfulRequest(url, data=data, method=type.upper())
    else:
        req = urllib2.Request(url, data, headers=headers)
    opener = urllib2.build_opener(self)
    eventlet.greenthread.sleep()
    try:
        f = opener.open(req, data=data)
        if f.code is None or str(f.code)[0] == "2":
            dispatcher.send(UrlGetter.HTTP_RESULT, self, result=f.read(),
                            source=url, code=f.code)
        else:
            e = urllib2.HTTPError(
                url, f.code,
                "A code %s HTTP error has occurred when trying to send to target %s"
                % (f.code, url), req.headers, f)
            dispatcher.send(UrlGetter.HTTP_ERROR, self, exception=e)
    # TODO: make sure we're supposed to listen to URLErrors
    except (urllib2.URLError, ValueError), e:
        dispatcher.send(UrlGetter.URL_ERROR, self, exception=e, url=url)
def handle404(self, reqorig, url, container, obj):
    """
    Return a webob.Response which fetches the thumbnail from the thumb
    host, potentially writes it out to Swift so we don't 404 next time,
    and returns it. Note also that the thumb host might write it out
    to Swift so we don't have to.
    """
    # go to the thumb media store for unknown files
    reqorig.host = self.thumbhost
    # upload doesn't like our User-agent, otherwise we could call it
    # using urllib2.url()
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', self.user_agent)]
    # At least in theory, we shouldn't be handing out links to originals
    # that we don't have (or in the case of thumbs, can't generate).
    # However, someone may have a formerly valid link to a file, so we
    # should do them the favor of giving them a 404.
    try:
        upcopy = opener.open(reqorig.url)
    except urllib2.HTTPError, status:
        if status.code == 404:
            resp = webob.exc.HTTPNotFound('Expected original file not found')
            return resp
        else:
            resp = webob.exc.HTTPNotFound('Unexpected error %s' % status)
            return resp
def get_plugin_status(args): plugin, host, request = args url = "%s/plugins/%s/%d/_s/status" % (host, plugin.plugin_name, plugin.id) json = None jail_status = notifier().pluginjail_running(pjail=plugin.plugin_jail) if not jail_status: return plugin, json, jail_status try: opener = urllib2.build_opener() opener.addheaders = [ ('Cookie', 'sessionid=%s' % (request.COOKIES.get("sessionid", ''), )) ] #TODO: Increase timeout based on number of plugins response = opener.open(url, None, 5).read() json = simplejson.loads(response) except Exception, e: log.warn( _("Couldn't retrieve %(url)s: %(error)s") % { 'url': url, 'error': e, })
def plugin_fetch(args): plugin, host, request = args data = None url = "%s/plugins/%s/%d/_s/treemenu" % ( host, plugin.plugin_name, plugin.id ) try: opener = urllib2.build_opener() opener.addheaders = [( 'Cookie', 'sessionid=%s' % ( request.COOKIES.get("sessionid", ''), ) )] # TODO: Increase timeout based on number of plugins response = opener.open(url, None, 5) data = response.read() if not data: log.warn(_("Empty data returned from %s") % (url,)) except Exception, e: log.warn(_("Couldn't retrieve %(url)s: %(error)s") % { 'url': url, 'error': e, })
def _getAuthOpener_LXP150(self, http_user, http_pass): ''' Create an authenticated opener for the LXPx50 series. The LXPx50 HTTP authentication is again weird. First, a request must be sent to the phone with a Cookie with a SessionId set to a random number between 0 and 99999. Sending 0 works just as well. The first request must be a GET that asks the phone to calculate a hash for a specified username and password. The hash is embedded inside a HTML fragment in the response. Next the hash must be sent as a new Cookie in a POST request that also includes the original SessionId number and the UserName as cookies. A successful login returns a response with the phone status, including the phone model. Additionally, the response after a successful login includes a brand new SessionId that must be replaced in the opener cookie. ''' cookiejar = cookielib.CookieJar(cookielib.DefaultCookiePolicy(rfc2965=True)) sesscookie = cookielib.Cookie(None, 'SessionId', '0', None, False, self._ip, False, False, '/', False, False, str((int)(time.time() + 3600)), False, 'SessionId', None, None) cookiejar.set_cookie(sesscookie) opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar)) response = opener.open('http://' + self._ip + '/fcgi/do?' + urllib.urlencode({ 'action': 'Encrypt', 'UserName' : http_user, 'Password' : http_pass})) body = response.read() m = re.search(r"id=hcSingleResult type=hidden value='(.+?)'", body) if m is None: return (None, None) encrypted_password = m.group(1) sesscookie = cookielib.Cookie(None, 'UserName', http_user, None, False, self._ip, False, False, '/', False, False, str((int)(time.time() + 3600)), False, 'UserName', None, None) cookiejar.set_cookie(sesscookie) sesscookie = cookielib.Cookie(None, 'Password', encrypted_password, None, False, self._ip, False, False, '/', False, False, str((int)(time.time() + 3600)), False, 'Password', None, None) cookiejar.set_cookie(sesscookie) response = opener.open('http://' + self._ip + '/fcgi/do?id=1', 'SubmitData=begin%26Operation%3DCreateSession%26DestURL%3Did%6021%26SubmitData%3Dend') # Find new SessionId value. What, no Set-Cookie header? body = response.read() m = re.search(r"id=hcSessionIdNow type=hidden value='(.+?)'", body) if m != None: sesscookie = cookielib.Cookie(None, 'SessionId', m.group(1), None, False, self._ip, False, False, '/', False, False, str((int)(time.time() + 3600)), False, 'SessionId', None, None) cookiejar.set_cookie(sesscookie) else: logging.error('Endpoint %s@%s LXPx50 failed to authenticate - new session ID not found in response' % (self._vendorname, self._ip)) return (None, None) # Subsequent requests must NOT have the UserName/Password cookies cookiejar.clear(self._ip, '/', 'UserName') cookiejar.clear(self._ip, '/', 'Password') return (opener, body)
def build_fetcher_builder(log, client_number):
    """
    Generates a function that generates a function that takes a url,
    reports statistics, and returns a soup.
    TODO: Some of this is routine-dependent. Refactor.
    """
    # This logic happens once per client
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(CookieJar()))

    def build_fetcher(routine_name):
        # This logic happens once per routine
        def fetcher(url, params=None, step=""):
            # This logic happens once per page load
            try:
                time_before = time.time()
                response_text = opener.open(routine.base_url + url, params).read()
                time_after = time.time()
                soup = BeautifulSoup(response_text)
                if response_text is not None:
                    reported_time = response_text[:10].split("|")[0][:-2]
                    log([time_before, time_after, reported_time, client_number,
                         current_processes, url, routine_name, step])
                    return soup
                else:
                    raise Exception()
            except Exception:
                log([time_before, time_after, "", client_number,
                     current_processes, url, routine_name, step])
                return None
        return fetcher
    return build_fetcher
def __init__ (self, manager, useCookies=False): super(URLFetchWorker, self).__init__(manager) self.timeoutTask = 0 if useCookies: self.setCookies() else: self.opener = urllib2.build_opener()
def handle404(self, reqorig, url, container, obj):
    """
    Return a webob.Response which fetches the thumbnail from the thumb
    host, potentially writes it out to Swift so we don't 404 next time,
    and returns it. Note also that the thumb host might write it out
    to Swift so we don't have to.
    """
    # go to the thumb media store for unknown files
    reqorig.host = self.thumbhost
    # upload doesn't like our User-agent, otherwise we could call it
    # using urllib2.url()
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', self.user_agent)]
    # At least in theory, we shouldn't be handing out links to originals
    # that we don't have (or in the case of thumbs, can't generate).
    # However, someone may have a formerly valid link to a file, so we
    # should do them the favor of giving them a 404.
    try:
        upcopy = opener.open(reqorig.url)
    except urllib2.HTTPError, status:
        if status.code == 404:
            resp = webob.exc.HTTPNotFound(
                'Expected original file not found')
            return resp
        else:
            resp = webob.exc.HTTPNotFound('Unexpected error %s' % status)
            return resp
def get_plugin_status(args): plugin, host, request = args url = "%s/plugins/%s/%d/_s/status" % ( host, plugin.plugin_name, plugin.id) json = None jail_status = notifier().pluginjail_running(pjail=plugin.plugin_jail) if not jail_status: return plugin, json, jail_status try: opener = urllib2.build_opener() opener.addheaders = [ ('Cookie', 'sessionid=%s' % ( request.COOKIES.get("sessionid", ''), )) ] #TODO: Increase timeout based on number of plugins response = opener.open(url, None, 5).read() json = simplejson.loads(response) except Exception, e: log.warn(_("Couldn't retrieve %(url)s: %(error)s") % { 'url': url, 'error': e, })
def base_request(self, method, container=None, name=None, prefix=None,
                 headers={}, proxy=None, contents=None, full_listing=None):
    # Common request method
    url = self.url
    if self.token:
        headers['X-Auth-Token'] = self.token
    if container:
        url = '%s/%s' % (url.rstrip('/'), quote(container))
    if name:
        url = '%s/%s' % (url.rstrip('/'), quote(name))
    url += '?format=json'
    if prefix:
        url += '&prefix=%s' % prefix
    if proxy:
        proxy = urlparse.urlparse(proxy)
        proxy = urllib2.ProxyHandler({proxy.scheme: proxy.netloc})
        opener = urllib2.build_opener(proxy)
        urllib2.install_opener(opener)
    req = urllib2.Request(url, headers=headers, data=contents)
    req.get_method = lambda: method
    conn = urllib2.urlopen(req)
    body = conn.read()
    try:
        body_data = json.loads(body)
    except ValueError:
        body_data = None
    return [None, body_data]
def probeModel(self):
    '''Probe specific model of the Hanlong phone

    To probe for the specific model, a http session is tried. After
    authentication, the status page reveals the phone model.
    '''
    sModel = None

    # Try detecting Hanlong with updated firmware
    try:
        password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
        password_manager.add_password(None, 'http://' + self._ip + '/',
                                      'admin', 'admin')
        basic_auth_handler = urllib2.HTTPBasicAuthHandler(password_manager)
        opener = urllib2.build_opener(basic_auth_handler)
        response = opener.open('http://' + self._ip + '/')
        htmlbody = response.read()

        # <TR>
        # <td width="220"><script> document.write(jscs.product_type);</script></td>
        # <td width="250">UC862</td>
        # <TR>
        m = re.search(r'product_type\);</script></TD>.*?<TD.*?>(\w+)',
                      htmlbody, re.IGNORECASE | re.DOTALL)
        if m != None:
            sModel = m.group(1)
    except Exception, e:
        pass
def __init__(self, concurrency=10): # a green pool is a pool of greenthreads - you're pushing # tasks to it and they get executed when eventlet's loop is # active self.pool = eventlet.GreenPool(concurrency) # the queue receives URLs to visit self.queue = eventlet.Queue() # our root URL, the first to be fetched self.queue.put("https://play.google.com/store/apps") # after a fetch of an app is finished, results get pushed in # this queue self.results = eventlet.Queue() # we need to make sure we don't fetch the same URL more than # once, otherwise the script might never finish self.seen = set() # `seen_app_ids` cuts down on fetching apps that have been # fetched before; it is necessary in addition to `seen` self.seen_app_ids = set() # just a counter for statistics self.failed = 0 self.cnt = 0 # our opener self.browser = urllib2.build_opener() self.browser.addheaders.append(('Cookie', 'hlSession2=en'))
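# The constructor above only wires up the queues, the pool, and the opener;
# the crawl loop itself is not part of the snippet. Below is a minimal,
# hedged sketch of how such a loop could drive those structures. The method
# names `run` and `fetch`, and the choice to stop once the queue stays empty
# for a few seconds, are assumptions rather than the original code.
import eventlet
from eventlet.queue import Empty

def run(self):
    while True:
        try:
            url = self.queue.get(timeout=5)   # wait briefly for more work
        except Empty:
            break
        if url in self.seen:
            continue                          # never fetch the same URL twice
        self.seen.add(url)
        self.pool.spawn_n(self.fetch, url)    # self.fetch is assumed to exist
    self.pool.waitall()                       # let in-flight fetches finish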
def _enableStaticProvisioning_GXP1450(self, vars): try: # Login into interface opener = urllib2.build_opener(urllib2.HTTPCookieProcessor()) response = opener.open( 'http://' + self._ip + '/cgi-bin/dologin', urllib.urlencode({ 'Login': '******', 'P2': self._http_password, 'gnkey': '0b82' })) body = response.read() if 'dologin' in body: logging.error('Endpoint %s@%s GXP1450 - dologin failed login' % (self._vendorname, self._ip)) return False response = opener.open('http://' + self._ip + '/cgi-bin/update', urllib.urlencode(vars) + '&gnkey=0b82') body = response.read() if 'dologin' in body: logging.error( 'Endpoint %s@%s GXP1450 - dologin failed to keep session' % (self._vendorname, self._ip)) return False return True except socket.error, e: logging.error('Endpoint %s@%s GXP1450 failed to connect - %s' % (self._vendorname, self._ip, str(e))) return False
def main(self, start_url, block_extensions=['.pdf','.gif','.jpg','.JPG','.PNG','.png','.wav','.mp3','.wma'], max_urls = 100): # Set user agent string opener = urllib2.build_opener() opener.addheaders = [ ('User-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.220 Safari/535.1'), ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'), ('Accept-Charset', 'utf-8,gbk;q=0.7,*;q=0.3'), #('Accept-Encoding', 'gzip,deflate,sdch'), ('Accept-Language', 'en-US,en,en-zh;q=0.8'), #('Cache-Control', 'max-age=0'), #('Connection', 'keep-alive') ] urllib2.install_opener(opener) # Get base info (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(start_url) fragments = (scheme, netloc, '', '', '', '') base_url = urlparse.urlunparse(fragments) #print "base_url -> ", base_url mainLink = LinkInfo(None,base_url,u'Main',0,u'first page') self.assignID(mainLink) urls_queue = set([mainLink]) urls_crawled = set() urls_crawled2 = set() pool = eventlet.GreenPool(20) counter = 0 tmpC = 0 while True: #Infinite loop sanity check counter +=1 if counter > max_urls: break for url, body in pool.imap(self.fetch, urls_queue): # Remove this url from the queue set urls_queue = urls_queue - set([url]) # Add url to crawled set urls_crawled = urls_crawled.union(set([url])) urls_crawled2 = urls_crawled2.union(set([url])) # Extract links links = self.extract_links(url, body, block_extensions) if ( links == None ):return urls_crawled if tmpC == 100000 : return urls_crawled tmpC += 1 for link in links: if link not in urls_queue and link not in urls_crawled: # Add link to queue urls_queue = urls_queue.union(set([link])) print u"[valid]: link -> ", link.link return urls_crawled
def _install_opener(self): if has_valid_attr(self.settings,'PROXY_HOST') and has_valid_attr(self.settings,'PROXY_PORT'): proxy_info = { #proxy information 'user' : getattr(self.settings, 'PROXY_USER', ''), 'pass' : getattr(self.settings, 'PROXY_PASS', ''), 'host' : getattr(self.settings, 'PROXY_HOST', ''), #localhost 'port' : getattr(self.settings, 'PROXY_PORT', 80) } # build a new opener that uses a proxy requiring authorization proxy = urllib2.ProxyHandler({"http" :"http://%(user)s:%(pass)s@%(host)s:%(port)d" % proxy_info}) self.opener = urllib2.build_opener(proxy, self.cookie_handler) else: self.opener = urllib2.build_opener(self.cookie_handler)
def vector_sort(wiki_titles, terms_counts_dict, verbose=False): # wiki_titles is a list of wiki titles # terms_counts_dict is a dictionary with term tuples keys and counts values wiki = wikiapi.WikiApi() opener = urllib2.build_opener() opener.addheaders = [('User-agent', 'Mozilla/5.0')] def title_to_article_url(title): spaces_to_underscores = '_'.join(title.split()) utf8_encoded_title = spaces_to_underscores.encode('utf8') url_title = urllib2.quote(utf8_encoded_title) # url escape article_url = wiki.get_article_url(url_title) return article_url def fetch_content(wiki_title): # takes a wiki title and gets the page content # returns a tuple of the wiki title and the cosine value content = opener.open(title_to_article_url(wiki_title)).read() return wiki_title, content term_array = np.array(terms_counts_dict.values()) pool = eventlet.GreenPool(250) # some fail if more than 250 titles_cosines_dict = {} i = 1 leng = len(wiki_titles) for wiki_title, content in pool.imap(fetch_content, wiki_titles[:100]): if verbose: print wiki_title if len(content) < 10: assert False if verbose: print "tokenizing..." tokens = tokenize_article_content(wiki.get_article(content).content) if verbose: print "counting ngrams for tokens..." tokens_counter = Counter(ngram for sent in tokens for i in xrange(1, 6) for ngram in ngrams(sent, i)) if verbose: print "calculating array..." wiki_array = np.array([tokens_counter[term]*math.log(len(term)+1,2)\ for term in terms_counts_dict.iterkeys()]) #*len(term)*len(term) full_wiki_array = np.array(tokens_counter.values()) if verbose: print "calculating cosine..." cosine_value = np.dot(term_array,wiki_array)/ \ (np.linalg.norm(term_array) * np.linalg.norm(full_wiki_array)) titles_cosines_dict[wiki_title] = cosine_value if verbose: print cosine_value if verbose: print i, '/', leng i += 1 if verbose: print titles_cosines_dict if verbose: print sorted_titles_cosines = sorted(titles_cosines_dict, key=titles_cosines_dict.get, reverse=True) if verbose: print sorted_titles_cosines return sorted_titles_cosines
def vector_sort(wiki_titles, terms_counts_dict, verbose=False): # wiki_titles is a list of wiki titles # terms_counts_dict is a dictionary with term tuples keys and counts values wiki = wikiapi.WikiApi() opener = urllib2.build_opener() opener.addheaders = [('User-agent', 'Mozilla/5.0')] def title_to_article_url(title): spaces_to_underscores = '_'.join(title.split()) utf8_encoded_title = spaces_to_underscores.encode('utf8') url_title = urllib2.quote(utf8_encoded_title) # url escape article_url = wiki.get_article_url(url_title) return article_url def fetch_content(wiki_title): # takes a wiki title and gets the page content # returns a tuple of the wiki title and the cosine value content = opener.open(title_to_article_url(wiki_title)).read() return wiki_title, content term_array = np.array(terms_counts_dict.values()) pool = eventlet.GreenPool(250) # some fail if more than 250 titles_cosines_dict = {} i = 1 leng = len(wiki_titles) for wiki_title, content in pool.imap(fetch_content,wiki_titles[:100]): if verbose: print wiki_title if len(content) < 10: assert False if verbose: print "tokenizing..." tokens = tokenize_article_content(wiki.get_article(content).content) if verbose: print "counting ngrams for tokens..." tokens_counter = Counter(ngram for sent in tokens for i in xrange(1,6) for ngram in ngrams(sent,i)) if verbose: print "calculating array..." wiki_array = np.array([tokens_counter[term]*math.log(len(term)+1,2)\ for term in terms_counts_dict.iterkeys()]) #*len(term)*len(term) full_wiki_array = np.array(tokens_counter.values()) if verbose: print "calculating cosine..." cosine_value = np.dot(term_array,wiki_array)/ \ (np.linalg.norm(term_array) * np.linalg.norm(full_wiki_array)) titles_cosines_dict[wiki_title] = cosine_value if verbose: print cosine_value if verbose: print i, '/', leng i += 1 if verbose: print titles_cosines_dict if verbose: print sorted_titles_cosines = sorted(titles_cosines_dict, key=titles_cosines_dict.get, reverse = True) if verbose: print sorted_titles_cosines return sorted_titles_cosines
def _enableStaticProvisioning_GXP140x(self, vars):
    try:
        # Login into interface and get SID. Check proper Content-Type
        cookiejar = cookielib.CookieJar(
            cookielib.DefaultCookiePolicy(rfc2965=True))
        opener = urllib2.build_opener(
            urllib2.HTTPCookieProcessor(cookiejar))
        # response = urllib2.urlopen('http://' + self._ip + '/cgi-bin/dologin',
        response = opener.open(
            'http://' + self._ip + '/cgi-bin/dologin',
            urllib.urlencode({'password': self._http_password}))
        body = response.read()
        content_type = response.info()['Content-Type'].rsplit(';', 1)[0]
        if content_type != 'application/json':
            logging.error(
                'Endpoint %s@%s GXP140x - dologin answered not application/json but %s'
                % (self._vendorname, self._ip, response.info()['Content-Type']))
            return False

        # Check successful login and get sid
        jsonvars = cjson.decode(body)
        if not ('body' in jsonvars and 'sid' in jsonvars['body']):
            logging.error('Endpoint %s@%s GXP140x - dologin failed login'
                          % (self._vendorname, self._ip))
            return False
        sid = jsonvars['body']['sid']

        # Post vars with sid
        vars.update({'sid': sid})
        # response = urllib2.urlopen('http://' + self._ip + '/cgi-bin/api.values.post',
        response = opener.open(
            'http://' + self._ip + '/cgi-bin/api.values.post',
            urllib.urlencode(vars))
        jsonvars = self._parseBotchedJSONResponse(response)
        if jsonvars == None:
            # jsonvars is None here, so do not try to index it in the log message
            logging.error(
                'Endpoint %s@%s GXP140x - empty JSON response for vars - %s - sid %s'
                % (self._vendorname, self._ip, urllib.urlencode(vars), sid))
            return False
        if not ('response' in jsonvars and jsonvars['response'] == 'success'
                and 'body' in jsonvars and 'status' in jsonvars['body']
                and jsonvars['body']['status'] == 'right'):
            logging.error(
                'Endpoint %s@%s GXP140x - vars rejected by interface - %s - %s - %s'
                % (self._vendorname, self._ip, urllib.urlencode(vars),
                   jsonvars['body'], sid))
            return False
        return True
    except cjson.DecodeError, e:
        logging.error('Endpoint %s@%s GXP140x received invalid JSON - %s'
                      % (self._vendorname, self._ip, str(e)))
        return False
def build_urllib2_opener(config): kwargs = {'key_file': config.ssl_client_key_path, 'cert_file': config.ssl_client_cert_path, 'ca_cert_file': config.ssl_ca_cert_path, 'verify_host': bool(config.ssl_validation), # None -> False 'proxy_url': config.proxy_url, 'proxy_port': config.proxy_port,} handler = PulpHandler(**kwargs) return urllib2.build_opener(handler)
def setUp(self): FakeProxyHandler.digest_auth_handler.set_users( {self.USER: self.PASSWD}) FakeProxyHandler.digest_auth_handler.set_realm(self.REALM) self.server = LoopbackHttpServerThread(self.PORT, FakeProxyHandler) self.server.start() self.server.ready.wait() handler = urllib2.ProxyHandler({"http": self.PROXY_URL}) self._digest_auth_handler = urllib2.ProxyDigestAuthHandler() self.opener = urllib2.build_opener(handler, self._digest_auth_handler)
def _install_opener(self): if has_valid_attr(self.settings, 'PROXY_HOST') and has_valid_attr( self.settings, 'PROXY_PORT'): proxy_info = { #proxy information 'user': getattr(self.settings, 'PROXY_USER', ''), 'pass': getattr(self.settings, 'PROXY_PASS', ''), 'host': getattr(self.settings, 'PROXY_HOST', ''), #localhost 'port': getattr(self.settings, 'PROXY_PORT', 80) } # build a new opener that uses a proxy requiring authorization proxy = urllib2.ProxyHandler({ "http": "http://%(user)s:%(pass)s@%(host)s:%(port)d" % proxy_info }) self.opener = urllib2.build_opener(proxy, self.cookie_handler) else: self.opener = urllib2.build_opener(self.cookie_handler)
def _rebootbyhttp(self): opener = urllib2.build_opener(urllib2.HTTPCookieProcessor()) response = opener.open('http://' + self._ip + '/cgi-bin//api-sys_operation?passcode=' + self._http_password + '&request=REBOOT') jsonvars = self._parseBotchedJSONResponse(response) if jsonvars == None: return False if not ('response' in jsonvars and jsonvars['response'] == 'success'): logging.error('Endpoint %s@%s unimplemented reboot by HTTP' % (self._vendorname, self._ip)) return False return True
def setUp(self): FakeProxyHandler.digest_auth_handler.set_users({ self.USER : self.PASSWD }) FakeProxyHandler.digest_auth_handler.set_realm(self.REALM) self.server = LoopbackHttpServerThread(self.PORT, FakeProxyHandler) self.server.start() self.server.ready.wait() handler = urllib2.ProxyHandler({"http" : self.PROXY_URL}) self._digest_auth_handler = urllib2.ProxyDigestAuthHandler() self.opener = urllib2.build_opener(handler, self._digest_auth_handler)
def pull(title): """pull all the infobox goodies from wikipedia""" url = "http://en.wikipedia.org/w/index.php?action=render&title=%s" % title opener = urllib2.build_opener() opener.addheaders = [('User-agent', 'Mozilla/5.0 (user-agent-restrictions-are-silly)')] try: html = opener.open(url.encode("utf-8")).read() except: print(u" Could not fetch %s" % url).encode('utf-8') return None try: soup = BeautifulSoup.BeautifulSoup(html) except: print(u" Could not parse %s" % url).encode('utf-8') return None # Extract information infobox = soup.find("table", {'class': re.compile(r'\binfobox\b')}) if not infobox: print(u" No infobox found in %s" % url).encode('utf-8') return None information = {} name = infobox.find("th", {'class': 'fn org'}) if name: information['name'] = extract_text(name) def grab(info, name=None): if name is None: name = info.lower() text = infobox.find("text", text=info) if text: information[name] = extract_text(text.parent.findNext("td")) grab("Capital") grab("Admission to Union", "admitted") pop = infobox.find("text", text="Population") if pop: text = pop.findNext("text", text=re.compile("Total$")) if text: information['population'] = extract_text( text.parent.findNext("td")) grab(re.compile("Latitude$"), "latitude") grab(re.compile("Longitude$"), "longitude") text = infobox.find("text", text=re.compile("Motto")) if text: information["motto"] = extract_text(text.findNext("i")) information["description"] = extract_text(infobox.findNext("p")) return information
def get_class(patent_id, cluster_id):
    url = BASE_URL + fix_patent_number(patent_id)
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    try:
        html = opener.open(url).read()
        soup = bs(html)
        toptd = soup.find('td', text='U.S. Classification')
        classstring = toptd.findNext('td').findNext('span').text
        mainclass, subclass = classstring.split('/')
        return patent_id, cluster_id, mainclass, subclass
    except:
        print "Couldn't get", patent_id
        return ('', '', '', '')
def openAnything(source, referrer=None, etag=None, lastmodified=None, agent=USER_AGENT): """URL, filename, or string --> stream This function lets you define parsers that take any input source (URL, pathname to local or network file, or actual data as a string) and deal with it in a uniform manner. Returned object is guaranteed to have all the basic stdio read methods (read, readline, readlines). Just .close() the object when you're done with it. If the etag argument is supplied, it will be used as the value of an If-None-Match request header. If the lastmodified argument is supplied, it must be a formatted date/time string in GMT (as returned in the Last-Modified header of a previous request). The formatted date/time will be used as the value of an If-Modified-Since request header. If the agent argument is supplied, it will be used as the value of a User-Agent request header. """ if hasattr(source, 'read'): return source if source == '-': return sys.stdin if urlparse.urlparse(source)[0] == 'http': # open URL with urllib2 request = urllib2.Request(source) request.add_header('User-Agent', agent) if referrer: print "Adding referrer %s" % referrer request.add_header('Referer', referrer) if lastmodified: request.add_header('If-Modified-Since', lastmodified) if etag: request.add_header('If-None-Match', etag) request.add_header('Accept-encoding', 'gzip') opener = urllib2.build_opener(SmartRedirectHandler(), DefaultErrorHandler()) return opener.open(request) # try to open with native open function (if source is a filename) try: return open(source) except (IOError, OSError): pass # treat source as string return StringIO(str(source))
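# A hedged usage sketch for the conditional-GET behaviour described in the
# docstring of openAnything above. The URL, etag and lastmodified values are
# placeholders; the .status attribute is assumed to be set by the
# SmartRedirectHandler/DefaultErrorHandler classes the function installs, and
# the body may be gzip-compressed because of the Accept-encoding header.
source = openAnything('http://example.com/feed.xml',
                      etag='"abc123"',
                      lastmodified='Mon, 06 Jan 2014 00:00:00 GMT')
data = source.read()
source.close()
if getattr(source, 'status', None) == 304:
    print 'Not modified since the last fetch; reuse the cached copy'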
def get_class(patent_id, cluster_id):
    url = BASE_URL + fix_patent_number(patent_id)
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    try:
        html = opener.open(url).read()
        soup = bs(html)
        toptd = soup.find('td', text='U.S. Classification')
        classstring = toptd.findNext('td').findNext('span').text
        mainclass, subclass = classstring.split('/')
        return patent_id, cluster_id, mainclass, subclass
    except:
        print "Couldn't get", patent_id
        return ('', '', '', '')
def probeModel(self): '''Probe specific model of Aastra phone The Aastra web admin interface uses Basic authentication for access control. The authentication realm exposes the phone model like this: HTTP/1.1 401 Unauthorized Server: Aragorn WWW-Authenticate: Basic realm="Aastra 6757i" Connection: close Content-Length: 745 Content-Type: text/html ''' sModel = None try: # Do not expect this to succeed. Only interested in exception. urllib2.urlopen('http://' + self._ip + '/') except urllib2.HTTPError, e: if e.code == 401 and 'WWW-Authenticate' in e.headers: m = re.search(r'realm="Aastra (.+)"', e.headers['WWW-Authenticate']) if m != None: sModel = m.group(1) else: self._http_username = '******' self._http_password = '******' password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm( ) password_manager.add_password(None, 'http://' + self._ip + '/', self._http_username, self._http_password) basic_auth_handler = urllib2.HTTPBasicAuthHandler( password_manager) opener = urllib2.build_opener(basic_auth_handler) try: response = opener.open('http://' + self._ip + '/sysinfo.html') htmlbody = response.read() # <TR> # <TD style="BORDER-BOTTOM: 1px dashed">Platform</TD> # <TD style="BORDER-BOTTOM: 1px dashed">9112i Revision 0</TD></TR> # <TR> m = re.search(r'Platform</TD>.*?<TD.*?>(\w+)', htmlbody, re.IGNORECASE | re.DOTALL) if m != None: sModel = m.group(1) except Exception, e: pass
def pull(title): """pull all the infobox goodies from wikipedia""" url = "http://en.wikipedia.org/w/index.php?action=render&title=%s" % title opener = urllib2.build_opener() opener.addheaders = [('User-agent', 'Mozilla/5.0 (user-agent-restrictions-are-silly)')] try: html = opener.open(url.encode("utf-8")).read() except: print (u" Could not fetch %s" % url).encode('utf-8') return None try: soup = BeautifulSoup.BeautifulSoup(html) except: print (u" Could not parse %s" % url).encode('utf-8') return None # Extract information infobox = soup.find("table", { 'class': re.compile(r'\binfobox\b') }) if not infobox: print (u" No infobox found in %s" % url).encode('utf-8') return None information = {} name = infobox.find("th", { 'class': 'fn org' }) if name: information['name'] = extract_text(name) def grab(info, name=None): if name is None: name = info.lower() text = infobox.find("text", text=info) if text: information[name] = extract_text(text.parent.findNext("td")) grab("Capital") grab("Admission to Union", "admitted") pop = infobox.find("text", text="Population") if pop: text = pop.findNext("text", text=re.compile("Total$")) if text: information['population'] = extract_text(text.parent.findNext("td")) grab(re.compile("Latitude$"), "latitude") grab(re.compile("Longitude$"), "longitude") text = infobox.find("text", text=re.compile("Motto")) if text: information["motto"] = extract_text(text.findNext("i")) information["description"] = extract_text(infobox.findNext("p")) return information
def fetchFiles(name, url): if not exists(name): time.sleep(random.random() * wait) opener = urllib2.build_opener() opener.addheaders = [ ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'), ('Accept-Encoding', 'gzip, deflate'), ('Connection', 'close'), ('Proxy-Authorization', randomHexString()), ('User-agent', randomUA()) ] r = opener.open(url) open(name, 'wb').write(r.read()) else: stderr.write(name + ' already exists [' + url + ']\n') return name
def _request(self, statement, timeout=0): """ Builds the query string, then opens a connection to the endpoint and returns the file descriptor. """ query = self._queryString(statement) buf = tempfile.NamedTemporaryFile() opener = urllib2.build_opener(RedirectHandler) opener.addheaders = self.headers().items() try: response = self._build_response(query, opener, buf, timeout) except SparqlException, error: self.endpoint = error.message response = self._build_response(query, opener, buf, timeout)
def main(self, start_url, block_extensions=['.pdf'], max_urls = 100): # Set user agent string opener = urllib2.build_opener() opener.addheaders = [ ('User-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.220 Safari/535.1'), ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'), ('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.3'), ('Accept-Encoding', 'gzip,deflate,sdch'), ('Accept-Language', 'en-US,en;q=0.8'), ('Cache-Control', 'max-age=0'), ('Connection', 'keep-alive') ] urllib2.install_opener(opener) # Get base info (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(start_url) fragments = (scheme, netloc, '', '', '', '') base_url = urlparse.urlunparse(fragments) urls_queue = set([base_url]) urls_crawled = set() pool = eventlet.GreenPool(20) counter = 0 while True: #Infinite loop sanity check counter +=1 if counter > max_urls: break for url, body in pool.imap(self.fetch, urls_queue): # Remove this url from the queue set urls_queue = urls_queue - set([url]) # Add url to crawled set urls_crawled = urls_crawled.union(set([url])) # Extract links links = self.extract_links(url, body, block_extensions) for link in links: if link not in urls_queue and link not in urls_crawled: # Add link to queue urls_queue = urls_queue.union(set([link])) return urls_crawled
def request(self, host, handler, request_body, verbose): '''Send xml-rpc request using proxy''' # We get a traceback if we don't have this attribute: self.verbose = verbose url = 'http://' + host + handler request = urllib2.Request(url) request.add_data(request_body) # Note: 'Host' and 'Content-Length' are added automatically base64string = base64.encodestring( '%s:%s' % (self._username, self._password)).replace('\n', '') request.add_header("Authorization", "Basic %s" % base64string) request.add_header('User-Agent', self.user_agent) request.add_header('Content-Type', 'text/xml') proxy_handler = urllib2.ProxyHandler() opener = urllib2.build_opener(proxy_handler) fhandle = opener.open(request) return(self.parse_response(fhandle))
def request(self, host, handler, request_body, verbose): '''Send xml-rpc request using proxy''' # We get a traceback if we don't have this attribute: self.verbose = verbose url = 'http://' + host + handler request = urllib2.Request(url) request.add_data(request_body) # Note: 'Host' and 'Content-Length' are added automatically base64string = base64.encodestring( '%s:%s' % (self._username, self._password)).replace('\n', '') request.add_header("Authorization", "Basic %s" % base64string) request.add_header('User-Agent', self.user_agent) request.add_header('Content-Type', 'text/xml') proxy_handler = urllib2.ProxyHandler() opener = urllib2.build_opener(proxy_handler) fhandle = opener.open(request) return (self.parse_response(fhandle))
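# A hedged sketch of how a proxy-aware transport like the one above is
# typically plugged into xmlrpclib. `ProxiedTransport` is a placeholder name
# for whatever class defines the request() method; its constructor arguments
# (if any) are not shown in the snippet and are assumed here.
import xmlrpclib

transport = ProxiedTransport()
server = xmlrpclib.ServerProxy('http://bugzilla.example.org/xmlrpc.cgi',
                               transport=transport)
print server.system.listMethods()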
def coordinate_solver(id): global coordinates global coordinates_id global commune global departement mycid = 0 s = eventlet.semaphore.Semaphore(1) cj = CookieJar() opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) body = opener.open("http://www.cadastre.gouv.fr/scpc/accueil.do") page = body.read() data = {'ville': commune, 'codeDepartement': departement} body = opener.open("http://www.cadastre.gouv.fr/scpc/rechercherPlan.do", urllib.urlencode(data)) page = body.read() m = re.search("afficherCarteCommune.do.c=([^']+)'", page) codeCommune = None if m: codeCommune = m.group(1) else: m = re.search('select name="codeCommune" id="codeCommune" class="long erreur"><option value="">Choisir</option><option value="([^"]+)"', page) if m: codeCommune = m.group(1) data = {"codeCommune": codeCommune, 'codeDepartement': departement, 'nbResultatParPage': 10, 'x':153, 'y':6} body = opener.open("http://www.cadastre.gouv.fr/scpc/rechercherPlan.do", urllib.urlencode(data)) page = body.read() if codeCommune == None: print "commune: ERROR" return "ERROR" body = opener.open("http://www.cadastre.gouv.fr/scpc/afficherCarteCommune.do?c="+codeCommune) page = body.read() if re.search("Impossible d'initialiser", page): print "carte: ERROR" return "ERROR" while True: with s: mycid = coordinates_id coordinates_id += 1 if mycid >= len(coordinates): break coordargs = coordinates[mycid] m = re.search('^([^ ]+) ([^ ]+) ([^ ]+)$', coordargs) if coordinate_xml_solver(opener, codeCommune, m.group(2), m.group(1)) == False: print "New tests for coords" coordinate_xml_solver(opener, codeCommune, m.group(3), m.group(1))
def create_objects(data): # TODO: handle errors url = data.get('url') token = data.get('token') delete_at = data.get('delete_at') opener = urllib2.build_opener(urllib2.HTTPHandler) request = urllib2.Request(url, data='') request.add_header('Content-Type', 'text/plain') request.add_header('X-Auth-Token', token) request.add_header('X-Delete-At', delete_at) request.get_method = lambda: 'PUT' result = opener.open(request) txid = result.headers.getheader('X-Trans-Id') status = result.getcode() return url, txid, status
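# Hedged usage sketch for create_objects above: a PUT with X-Delete-At makes
# Swift expire the object automatically. The URL and token are placeholders.
import time

url, txid, status = create_objects({
    'url': 'http://swift.example.org/v1/AUTH_test/container/object',
    'token': 'AUTH_tk_0123456789abcdef',
    'delete_at': str(int(time.time()) + 3600),  # expire in one hour
})
print url, txid, status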
def get_response(self, data=None): """ Returns the response object from a request. Cookies are supported via a CookieHandler object """ self._normalize_url() request = urllib2.Request(self.url, data, self.headers) opener = urllib2.build_opener(self.cookie_handler) if REQUEST_TIMEOUT is not None: response = opener.open(request, timeout=REQUEST_TIMEOUT) else: response = opener.open(request) self.cookie_handler.save_cookies() return response
def openurl(url): headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36', 'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.6,en;q=0.4', 'Accept-Encoding': 'gzip,deflate', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Cache-Control': 'max-age=0' } opener = urllib2.build_opener() opener.addheaders = headers.items() response = opener.open(ext.iriToUri(url)) if response.info().get('Content-Encoding') == 'gzip': buf = StringIO(response.read()) response = gzip.GzipFile(fileobj=buf) page = response.read() return page
def _request(self, statement, timeout=0): """ Builds the query string, then opens a connection to the endpoint and returns the file descriptor. """ query = self._queryString(statement) buf = tempfile.NamedTemporaryFile() opener = urllib2.build_opener() opener.addheaders = self.headers().items() request = self._build_request(query) response = self._get_response(opener, request, buf, timeout if timeout > 0 else None) self._read_response(response, buf, timeout) buf.seek(0) return buf
def get_wiki_content(title): # title is in unicode (utf-8) format with spaces, without underscores and # url escape characters wiki = wikiapi.WikiApi() spaces_to_underscores = '_'.join(title.split()) utf8_encoded_title = spaces_to_underscores.encode('utf8') url_title = urllib2.quote(utf8_encoded_title) # url escape article_url = wiki.get_article_url(url_title) # print repr(article_url) opener = urllib2.build_opener() opener.addheaders = [('User-agent', 'Mozilla/5.0')] content = opener.open(article_url).read() art = wiki.get_article(content) # print "Got article: ", art.heading # print "Content: ", art.content # print return art.content
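# Minimal, hedged example call for get_wiki_content above; the title is
# arbitrary and wikiapi must be importable for the function itself to work.
text = get_wiki_content(u'Alan Turing')
print text[:200]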
def handle404(self, reqorig, url, container, obj):
    """
    Return a webob.Response which fetches the thumbnail from the thumb
    host, potentially writes it out to Swift so we don't 404 next time,
    and returns it. Note also that the thumb host might write it out
    to Swift so we don't have to.
    """
    # go to the thumb media store for unknown files
    reqorig.host = self.thumbhost
    # upload doesn't like our User-agent, otherwise we could call it
    # using urllib2.url()
    opener = urllib2.build_opener()
    # Pass on certain headers from the caller squid to the scalers
    opener.addheaders = []
    if reqorig.headers.get('User-Agent') != None:
        opener.addheaders.append(('User-Agent', reqorig.headers.get('User-Agent')))
    else:
        opener.addheaders.append(('User-Agent', self.user_agent))
    for header_to_pass in ['X-Forwarded-For', 'X-Original-URI']:
        if reqorig.headers.get(header_to_pass) != None:
            opener.addheaders.append((header_to_pass, reqorig.headers.get(header_to_pass)))
    # At least in theory, we shouldn't be handing out links to originals
    # that we don't have (or in the case of thumbs, can't generate).
    # However, someone may have a formerly valid link to a file, so we
    # should do them the favor of giving them a 404.
    try:
        # break apart the url, url-encode it, and put it back together
        urlobj = list(urlparse.urlsplit(reqorig.url))
        urlobj[2] = urllib2.quote(urlobj[2], '%/')
        encodedurl = urlparse.urlunsplit(urlobj)
        # ok, call the encoded url
        upcopy = opener.open(encodedurl)
    except urllib2.HTTPError, status:
        if status.code == 404:
            resp = webob.exc.HTTPNotFound('Expected original file not found')
            return resp
        else:
            resp = webob.exc.HTTPNotFound('Unexpected error %s' % status)
            resp.status = status.code
            return resp
def _request(self, statement, timeout=0): """ Builds the query string, then opens a connection to the endpoint and returns the file descriptor. """ resultsType = 'xml' query = self._queryString(statement) buf = tempfile.NamedTemporaryFile() opener = urllib2.build_opener() opener.addheaders = self.headers().items() request = self._build_request(query) response = self._get_response(opener, request, buf) self._read_response(response, buf, timeout) buf.seek(0) return buf
def _request(self, statement, timeout=0): """ Builds the query string, then opens a connection to the endpoint and returns the file descriptor. """ query = self._queryString(statement) buf = tempfile.NamedTemporaryFile() opener = ev_request.build_opener(RedirectHandler) opener.addheaders = list(self.headers().items()) try: if type(query) is not bytes and not six.PY2: query = query.encode() response = self._build_response(query, opener, buf, timeout) except SparqlException as error: self.endpoint = error.message response = self._build_response(query, opener, buf, timeout) self._read_response(response, buf, timeout) buf.seek(0) return buf
def proxyTest(self, row):
    proxy = row[0] + ":" + row[1]
    if 'HTTPS' in row[3]:
        proxies = {"https": "https://" + proxy}
    else:
        proxies = {"http": "http://" + proxy}
    ip = row[0]
    port = row[1]
    theProxy = urllib2.ProxyHandler(proxies)
    opener = urllib2.build_opener(theProxy)
    urllib2.install_opener(opener)
    testResult = 'ok!'
    try:
        webcode = urllib2.urlopen("https://www.fliggy.com/", timeout=10).getcode()
        #logger.info("Proxy %s is ok" % proxy)
    except Exception, e:
        #logger.warn("Proxy %s is no longer ok" % proxy)
        self.clean(ip=ip, port=port)
        testResult = 'no longer ok!'
def _doAuthPost(self, urlpath, postvars): '''Perform an HTTP POST on a particular URL using the HTTP credentials This method is frequently used to make the phone use the Elastix server as the TFTP source for autoprovisioning. ''' password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm() password_manager.add_password(None, 'http://' + self._ip + '/', self._http_username, self._http_password) basic_auth_handler = urllib2.HTTPBasicAuthHandler(password_manager) digest_auth_handler = urllib2.HTTPDigestAuthHandler(password_manager) opener = urllib2.build_opener(basic_auth_handler, digest_auth_handler) if postvars != None: opener.addheaders = [('Content-Type', 'application/x-www-form-urlencoded')] if not isinstance(postvars, str): postvars = urllib.urlencode(postvars) try: opener.open('http://' + self._ip + urlpath, postvars) except urllib2.HTTPError, e: logging.error('Endpoint %s@%s failed to authenticate - %s' % (self._vendorname, self._ip, str(e))) return False
def get_sync(self, url, data, type=None,
             content_type="application/x-www-form-urlencoded"):
    try:
        data = urlencode(data)
    except:
        # data is probably a string to be sent directly.
        pass
    headers = {"Content-Type": content_type}
    if type and type.upper() not in ("POST", "GET"):
        from restlib import RestfulRequest  # @UnresolvedImport
        req = RestfulRequest(url, data=data, method=type.upper())
    else:
        req = urllib2.Request(url, data, headers=headers)
    opener = urllib2.build_opener(self)
    try:
        f = opener.open(req, data=data)
        if f.code is None or str(f.code)[0] == "2":
            dispatcher.send(UrlGetter.HTTP_RESULT, self, result=f.read(),
                            source=url, code=f.code)
        else:
            e = urllib2.HTTPError(
                url, f.code,
                "A code %s HTTP error has occurred when trying to send to target %s"
                % (f.code, url), req.headers, f)
            dispatcher.send(UrlGetter.HTTP_ERROR, self, exception=e)
    except urllib2.URLError, e:
        dispatcher.send(UrlGetter.URL_ERROR, self, exception=e, url=url)
def _enableStaticProvisioning_BT200(self, vars): try: # Login into interface cookiejar = cookielib.CookieJar( cookielib.DefaultCookiePolicy(rfc2965=True)) opener = urllib2.build_opener( urllib2.HTTPCookieProcessor(cookiejar)) response = opener.open( 'http://' + self._ip + '/dologin.htm', urllib.urlencode({ 'Login': '******', 'P2': self._http_password, 'gnkey': '0b82' })) body = response.read() if 'dologin.htm' in body: logging.error('Endpoint %s@%s BT200 - dologin failed login' % (self._vendorname, self._ip)) return False # Force cookie version to 0 for cookie in cookiejar: cookie.version = 0 response = opener.open('http://' + self._ip + '/update.htm', urllib.urlencode(vars) + '&gnkey=0b82') body = response.read() if 'dologin.htm' in body: logging.error( 'Endpoint %s@%s BT200 - dologin failed to keep session' % (self._vendorname, self._ip)) return False return True except urllib2.HTTPError, e: logging.error( 'Endpoint %s@%s BT200 failed to send vars to interface - %s' % (self._vendorname, self._ip, str(e))) return False
def setCookies(self): cookie_file_path = os.path.join(os.environ['LOCALAPPDATA'], r'Google\Chrome\User Data\Default\Cookies') if not os.path.exists(cookie_file_path): raise Exception('Cookies file not exist!') #fetch domain from website, domain = "" tmpList = re.compile(r'http://(.*?)/').findall(self.manager.webSite) if tmpList: domain = tmpList[0] else: #webSite is a root domain = re.compile(r'http://(.*?)').findall(self.manager.webSite) sql = 'select host_key, name, encrypted_value, path from cookies' sql += ' where host_key like "%{}%"'.format(domain) with sqlite3.connect(cookie_file_path) as conn: rows = conn.execute(sql) cookiejar = cookielib.CookieJar() for row in rows: #get encrypted value pwdHash = str(row[2]) try: ret = win32crypt.CryptUnprotectData(pwdHash, None, None, None, 0) except: print 'Fail to decrypt chrome cookies' sys.exit(-1) cookie_item = cookielib.Cookie(version=0, name=row[1], value=ret[1], port=None, port_specified=None,domain=row[0], domain_specified=None, domain_initial_dot=None,path=row[3], path_specified=None,secure=None,expires=None, discard=None,comment=None,comment_url=None,rest=None,rfc2109=False, ) cookiejar.set_cookie(cookie_item) # Apply each cookie_item to cookiejar self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
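# Hedged usage sketch: once setCookies() above has loaded the Chrome cookies
# into self.opener, requests made through that opener carry them
# automatically. `worker` stands for an instance of the surrounding class.
worker.setCookies()
response = worker.opener.open(worker.manager.webSite)
html = response.read()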
def probeModel(self): '''Probe specific model of the Snom phone The Snom phone displays the phone model in the title screen, which is unsecured by default. ''' self._loadCustomCredentials() sModel = None try: password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm() if self._http_password != None: password_manager.add_password(None, 'http://' + self._ip + '/', self._http_username, self._http_password) basic_auth_handler = urllib2.HTTPBasicAuthHandler(password_manager) opener = urllib2.build_opener(basic_auth_handler) response = opener.open('http://' + self._ip + '/') htmlbody = response.read() if response.code == 200: # <TITLE>snom 320</TITLE> m = re.search(r'<TITLE>snom (\w+)</TITLE>', htmlbody, re.IGNORECASE) if m != None: sModel = m.group(1) else: # M300, M700 m = re.search(r'<TITLE>(M\d+)</TITLE>', htmlbody, re.IGNORECASE) if m != None: sModel = m.group(1) #except urllib2.HTTPError, e: # if e.code == 401 and 'WWW-Authenticate' in e.headers: # m = re.search(r'realm="Aastra (.+)"', e.headers['WWW-Authenticate']) # if m != None: sModel = m.group(1) except Exception, e: pass
def __init__(self, concurrency=10): # a green pool is a pool of greenthreads - you're pushing # tasks to it and they get executed when eventlet's loop is # active self.pool = eventlet.GreenPool(concurrency) # the queue receives URLs to visit self.queue = eventlet.Queue() # our root URL, the first to be fetched self.queue.put("https://market.android.com/") # after a fetch of an app is finished, results get pushed in # this queue self.results = eventlet.Queue() # we need to make sure we don't fetch the same URL more than # once, otherwise the script might never finish self.seen = set() # `seen_app_ids` cuts down on fetching apps that have been # fetched before; it is necessary in addition to `seen` self.seen_app_ids = set() # just a counter for statistics self.failed = 0 # our opener self.browser = urllib2.build_opener() self.browser.addheaders.append(('Cookie', 'hlSession2=en'))
def handle404(self, reqorig, url, container, obj): """ Return a swob.Response which fetches the thumbnail from the thumb host and returns it. Note also that the thumb host might write it out to Swift so it won't 404 next time. """ # upload doesn't like our User-agent, otherwise we could call it # using urllib2.url() thumbor_opener = urllib2.build_opener(DumbRedirectHandler()) # Pass on certain headers from Varnish to Thumbor thumbor_opener.addheaders = [] if reqorig.headers.get('User-Agent') is not None: thumbor_opener.addheaders.append( ('User-Agent', reqorig.headers.get('User-Agent'))) else: thumbor_opener.addheaders.append(('User-Agent', self.user_agent)) for header_to_pass in [ 'X-Forwarded-For', 'X-Forwarded-Proto', 'Accept', 'Accept-Encoding', 'X-Original-URI' ]: if reqorig.headers.get(header_to_pass) is not None: header = (header_to_pass, reqorig.headers.get(header_to_pass)) thumbor_opener.addheaders.append(header) # At least in theory, we shouldn't be handing out links to originals # that we don't have (or in the case of thumbs, can't generate). # However, someone may have a formerly valid link to a file, so we # should do them the favor of giving them a 404. try: thumbor_encodedurl = self.thumborify_url(reqorig, self.thumborhost) upcopy = thumbor_opener.open(thumbor_encodedurl) except urllib2.HTTPError as error: # Wrap the urllib2 HTTPError into a swob HTTPException status = error.code body = error.fp.read() headers = error.hdrs.items() if status not in swob.RESPONSE_REASONS: # Generic status description in case of unknown status reasons. status = "%s Error" % status return swob.HTTPException(status=status, body=body, headers=headers) except urllib2.URLError as error: msg = 'There was a problem while contacting the thumbnailing service: %s' % \ error.reason return swob.HTTPServiceUnavailable(msg) # We successfully generated a thumbnail on the active DC, send the same request # blindly to the inactive DC to populate Swift there, not waiting for the response inactivedc_encodedurl = self.thumborify_url( reqorig, self.inactivedc_thumborhost) eventlet.spawn(self.inactivedc_request, thumbor_opener, inactivedc_encodedurl) # get the Content-Type. uinfo = upcopy.info() c_t = uinfo.gettype() resp = swob.Response(app_iter=upcopy, content_type=c_t) headers_whitelist = [ 'Content-Length', 'Content-Disposition', 'Last-Modified', 'Accept-Ranges', 'XKey', 'Thumbor-Engine', 'Server', 'Nginx-Request-Date', 'Nginx-Response-Date', 'Thumbor-Processing-Time', 'Thumbor-Processing-Utime', 'Thumbor-Request-Id', 'Thumbor-Request-Date' ] # add in the headers if we've got them for header in headers_whitelist: if (uinfo.getheader(header) != ''): resp.headers[header] = uinfo.getheader(header) # also add CORS; see also our CORS middleware resp.headers['Access-Control-Allow-Origin'] = '*' return resp
def handle404(self, reqorig, url, container, obj):
    """
    Return a webob.Response which fetches the thumbnail from the thumb
    host and returns it. Note also that the thumb host might write it out
    to Swift so it won't 404 next time.
    """
    # go to the thumb media store for unknown files
    reqorig.host = self.thumbhost
    # upload doesn't like our User-agent, otherwise we could call it
    # using urllib2.url()
    proxy_handler = urllib2.ProxyHandler({'http': self.thumbhost})
    redirect_handler = DumbRedirectHandler()
    opener = urllib2.build_opener(redirect_handler, proxy_handler)
    # Thumbor doesn't need (and doesn't like) the proxy
    thumbor_opener = urllib2.build_opener(redirect_handler)

    # Pass on certain headers from the caller squid to the scalers
    opener.addheaders = []
    if reqorig.headers.get('User-Agent') is not None:
        opener.addheaders.append(('User-Agent', reqorig.headers.get('User-Agent')))
    else:
        opener.addheaders.append(('User-Agent', self.user_agent))
    for header_to_pass in ['X-Forwarded-For', 'X-Forwarded-Proto',
                           'Accept', 'Accept-Encoding', 'X-Original-URI']:
        if reqorig.headers.get(header_to_pass) is not None:
            opener.addheaders.append((header_to_pass, reqorig.headers.get(header_to_pass)))
    thumbor_opener.addheaders = opener.addheaders

    # At least in theory, we shouldn't be handing out links to originals
    # that we don't have (or in the case of thumbs, can't generate).
    # However, someone may have a formerly valid link to a file, so we
    # should do them the favor of giving them a 404.
    try:
        # break apart the url, url-encode it, and put it back together
        urlobj = list(urlparse.urlsplit(reqorig.url))
        # encode the URL but don't encode %s and /s
        urlobj[2] = urllib2.quote(urlobj[2], '%/')
        encodedurl = urlparse.urlunsplit(urlobj)

        # Thumbor never needs URL mangling and it needs a different host
        if self.thumborhost:
            thumbor_reqorig = reqorig.copy()
            thumbor_reqorig.host = self.thumborhost
            thumbor_urlobj = list(urlparse.urlsplit(thumbor_reqorig.url))
            thumbor_urlobj[2] = urllib2.quote(thumbor_urlobj[2], '%/')
            thumbor_encodedurl = urlparse.urlunsplit(thumbor_urlobj)

        # if sitelang, we're supposed to mangle the URL so that
        # http://upload.wm.o/wikipedia/commons/thumb/a/a2/Foo_.jpg/330px-Foo_.jpg
        # changes to
        # http://commons.wp.o/w/thumb_handler.php/a/a2/Foo_.jpg/330px-Foo_.jpg
        if self.backend_url_format == 'sitelang':
            match = re.match(
                r'^http://(?P<host>[^/]+)/(?P<proj>[^-/]+)/(?P<lang>[^/]+)/thumb/(?P<path>.+)',
                encodedurl)
            if match:
                proj = match.group('proj')
                lang = match.group('lang')
                # and here are all the legacy special cases, imported from thumb_handler.php
                if(proj == 'wikipedia'):
                    if(lang in ['meta', 'commons', 'internal', 'grants']):
                        proj = 'wikimedia'
                    if(lang in ['mediawiki']):
                        lang = 'www'
                        proj = 'mediawiki'
                hostname = '%s.%s.%s' % (lang, proj, self.tld)
                if(proj == 'wikipedia' and lang == 'sources'):
                    # yay special case
                    hostname = 'wikisource.%s' % self.tld
                # ok, replace the URL with just the part starting with thumb/
                # take off the first two parts of the path
                # (eg /wikipedia/commons/); make sure the string starts
                # with a /
                encodedurl = 'http://%s/w/thumb_handler.php/%s' % (
                    hostname, match.group('path'))
                # add in the X-Original-URI with the swift got (minus the hostname)
                opener.addheaders.append(
                    ('X-Original-URI', list(urlparse.urlsplit(reqorig.url))[2]))
            else:
                # ASSERT this code should never be hit since only thumbs
                # should call the 404 handler
                self.logger.warn("non-thumb in 404 handler! encodedurl = %s" % encodedurl)
                resp = webob.exc.HTTPNotFound('Unexpected error')
                return resp
        else:
            # log the result of the match here to test and make sure it's
            # sane before enabling the config
            match = re.match(
                r'^http://(?P<host>[^/]+)/(?P<proj>[^-/]+)/(?P<lang>[^/]+)/thumb/(?P<path>.+)',
                encodedurl)
            if match:
                proj = match.group('proj')
                lang = match.group('lang')
                self.logger.warn(
                    "sitelang match has proj %s lang %s encodedurl %s" % (
                        proj, lang, encodedurl))
            else:
                self.logger.warn("no sitelang match on encodedurl: %s" % encodedurl)

        # To turn thumbor off and have thumbnail traffic served by image scalers,
        # replace the line below with this one:
        # upcopy = opener.open(encodedurl)
        upcopy = thumbor_opener.open(thumbor_encodedurl)
    except urllib2.HTTPError, error:
        # copy the urllib2 HTTPError into a webob HTTPError class as-is
        class CopiedHTTPError(webob.exc.HTTPError):
            code = error.code
            title = error.msg

            def html_body(self, environ):
                return self.detail

            def __init__(self):
                super(CopiedHTTPError, self).__init__(
                    detail="".join(error.readlines()),
                    headers=error.hdrs.items())

        return CopiedHTTPError()
def base_request(self, method, container=None, name=None, prefix=None, headers=None, proxy=None, contents=None, full_listing=None, logger=None, additional_info=None): # Common request method trans_start = time() url = self.url if headers is None: headers = {} if self.token: headers['X-Auth-Token'] = self.token if container: url = '%s/%s' % (url.rstrip('/'), quote(container)) if name: url = '%s/%s' % (url.rstrip('/'), quote(name)) else: url += '?format=json' if prefix: url += '&prefix=%s' % prefix if proxy: proxy = urlparse.urlparse(proxy) proxy = urllib2.ProxyHandler({proxy.scheme: proxy.netloc}) opener = urllib2.build_opener(proxy) urllib2.install_opener(opener) req = urllib2.Request(url, headers=headers, data=contents) req.get_method = lambda: method conn = urllib2.urlopen(req) body = conn.read() try: body_data = json.loads(body) except ValueError: body_data = None trans_stop = time() if logger: sent_content_length = 0 for n, v in headers.items(): nl = n.lower() if nl == 'content-length': try: sent_content_length = int(v) break except ValueError: pass logger.debug("-> " + " ".join( quote(str(x) if x else "-", ":/") for x in ( strftime('%Y-%m-%dT%H:%M:%S', gmtime(trans_stop)), method, url, conn.getcode(), sent_content_length, conn.info()['content-length'], trans_start, trans_stop, trans_stop - trans_start, additional_info ))) return [None, body_data]
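# Hedged usage sketch for base_request above: a container GET with
# format=json returns the object listing as parsed JSON. `client` stands
# for an instance of the class this method belongs to, already holding
# .url and .token from authentication.
listing = client.base_request('GET', container='backups', prefix='2014/')[1]
for entry in listing or []:
    print entry.get('name')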
def requestPage(self,etag,set_cookie,rank,site,server): try: global num_errors print "Checking site: " + str(urlHeader+site) sys.stdout.flush() robot_url = urlHeader+site+"/robots.txt" rp = robotparser.RobotFileParser() rp.set_url(robot_url) rp.read() if rp.errcode == 404 or (rp.can_fetch("*", urlHeader+site+'/') and rp.errcode == 200): try: time.sleep(60) opener = urllib2.build_opener(openanything.DefaultErrorHandler()) req = urllib2.Request(urlHeader+site) res = opener.open(req) etag1 = res.info()['ETag'] set_cookie1 = res.info()['Set-Cookie'] print str(etag1) + " and " + str(set_cookie1) print "Trying case 1" + str(urlHeader+site) time.sleep(60) # 1. Use set-cookie given by initial connection req = urllib2.Request(urlHeader+site, headers={"Cookie" : set_cookie1}) res = opener.open(req) etag2 = res.info()['ETag'] if etag1 == etag2: print "ETags are same." self.same_etag.append((site,etag1,set_cookie1)) self.etag_unchanged += 1 else: print "ETags are different." self.etag_changed += 1 time.sleep(60) print "Trying case 2" + str(urlHeader+site) # 2. Use ETag given by initial connection req = urllib2.Request(urlHeader+site, headers={"If-None-Match" : etag1}) res = opener.open(req) if res.getcode() == 304: print "Returned 304 Not Modified" self.cookie_304 += 1 else: try: set_cookie2 = res.info()['Set-Cookie'] if set_cookie1 == set_cookie2: print "Set the same coookie!" self.same_set_cookie.append((site,etag1,set_cookie1)) self.cookie_unchanged += 1 else: print "Receieved new cookie." self.cookie_changed += 1 except: print "Does not have Set-Cookie header." self.error_site.append(site) num_errors += 1 except: print "Problem acquiring either ETag or Set-Cookie in response packet." self.error_site.append(site) num_errors += 1 else: print "No permissions given to robot." self.error_site.append(site) self.err_permission += 1 except: print "Error connecting to robot." self.error_site.append(site) self.err_robot += 1 return site