Code example #1
def searchcrawler(url, keyword=''):
    """
    Taobao (tb) search-page crawler
    """
    html = get_html(url)
    #print html
    if html:
        soup = BeautifulSoup(html, fromEncoding='gbk')
        items_row = soup.findAll('div', {'class': 'row item icon-datalink'})
        if items_row:
            print '=======================row search row=========================='
            #print items
            for item in items_row:
                item_info = item.find('div', {'class': 'col title'}).h3.a
                item_url = item_info['href']
                url_info = urlparse.urlparse(item_url)
                item_id = urlparse.parse_qs(url_info.query, True)['id'][0]
                print item_url
                print item_id
                judge_site(item_url, keyword)
        items_col = soup.findAll('div', {'class': 'col item icon-datalink'})
        if items_col:
            print '=======================row search col=========================='
            #print items
            for item in items_col:
                item_info = item.find('div', {'class': 'item-box'}).h3.a
                item_url = item_info['href']
                url_info = urlparse.urlparse(item_url)
                item_id = urlparse.parse_qs(url_info.query, True)['id'][0]
                print item_url
                print item_id
                judge_site(item_url, keyword)
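For reference, a minimal sketch of the id-extraction step used above; the item URL is hypothetical (Python 2):

import urlparse  # Python 2 module; urllib.parse in Python 3

item_url = 'http://item.taobao.com/item.htm?id=12345&spm=abc'  # hypothetical URL
url_info = urlparse.urlparse(item_url)
# the second argument (keep_blank_values=True) mirrors the call in the crawler above
item_id = urlparse.parse_qs(url_info.query, True)['id'][0]
print item_id  # -> '12345'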
Code example #2
File: crlf.py  Project: lucmichalski/astra
def crlf_get_uri_method(uri, method, headers, scanid=None):
    # This function checks CRLF through GET URI method.
    par_key = {}
    url_query = urlparse.urlparse(uri)
    parsed_query = urlparse.parse_qs(url_query.query)
    for key, value in parsed_query.items():
        crlf_payloads = fetch_crlf_payload()
        for payload in crlf_payloads:
            par_key.update(parsed_query)
            par_key[key] = payload
            parsed_uri = urlparse.urlparse(
                uri).scheme + "://" + urlparse.urlparse(
                    uri).netloc + urlparse.urlparse(
                        uri).path + "?" + urlparse.urlparse(uri).query.replace(
                            value[0], payload)
            crlf_get_method = req.api_request(parsed_uri, "GET", headers)
            for name in crlf_get_method.headers:
                if "CRLF-Test" in name:
                    attack_result = {
                        "id": 13,
                        "scanid": scanid,
                        "url": parsed_uri,
                        "alert": "CRLF injection",
                        "impact": "High",
                        "req_headers": headers,
                        "req_body": "NA",
                        "res_headers": crlf_get_method.headers,
                        "res_body": crlf_get_method.text
                    }
                    dbupdate.insert_record(attack_result)
                    print "[+]{0} is vulnerable to CRLF injection".format(
                        parsed_uri)
                    return
Code example #3
def searchcrawler(url,keyword=''):
    """
    Taobao (tb) search-page crawler
    """
    html=get_html(url)
    #print html
    if html:
        soup = BeautifulSoup(html,fromEncoding='gbk')
        items_row = soup.findAll('div',{'class':'row item icon-datalink'})
        if items_row:
            print '=======================row search row=========================='
            #print items
            for item in items_row:
                item_info = item.find('div',{'class':'col title'}).h3.a
                item_url = item_info['href']
                url_info = urlparse.urlparse(item_url)
                item_id = urlparse.parse_qs(url_info.query,True)['id'][0]
                print item_url
                print item_id
                judge_site(item_url,keyword)
        items_col = soup.findAll('div',{'class':'col item icon-datalink'})
        if items_col:
            print '=======================row search col=========================='
            #print items
            for item in items_col:
                item_info = item.find('div',{'class':'item-box'}).h3.a
                item_url = item_info['href']
                url_info = urlparse.urlparse(item_url)
                item_id = urlparse.parse_qs(url_info.query,True)['id'][0]
                print item_url
                print item_id
                judge_site(item_url,keyword)
Code example #4
File: views.py  Project: meletakis/collato
def rendered_wall_posts( wall_posts ):
	for wall_post in wall_posts:
		title = ''
		desc = ''
		site_image = ''
		article_title = ''
		urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', wall_post.data['post_content'])
		for url in urls: 
			parse_obj = urlparse.urlparse(url)
			site = parse_obj.netloc
			path = parse_obj.path
			conn = httplib.HTTPConnection(site)
			conn.request('HEAD',path)
			response = conn.getresponse()
			conn.close()
			ctype = response.getheader('Content-Type')
			if response.status < 400 and ctype.startswith('image'):
				wall_post.data['post_content'] = wall_post.data['post_content']+"<br/><a href='"+url+"' target='_blank'><img width=300 src='"+url+"' target = '_blank'/></a>"
			else:
				og = opengraph.OpenGraph(url)
				if not len(og.items()) == 2:
					for x,y in og.items():
						if x == 'type' and y == 'video':
							for k,l in og.items():
								if k == 'site_name' and l == 'YouTube':
							
									url_data = urlparse.urlparse(url)
									query = urlparse.parse_qs(url_data.query)
									video = query["v"][0]
									wall_post.data['post_content'] = wall_post.data['post_content'].replace(url,"")+"<br/><iframe width='300' height='200' src='//www.youtube.com/embed/"+video+"' frameborder='0' allowfullscreen></iframe>"
								elif k == 'site_name' and l == 'Vimeo':
									url_data = urlparse.urlparse(url)
									video = url_data.path
									wall_post.data['post_content'] = wall_post.data['post_content'].replace(url,"")+"<br/><iframe src='//player.vimeo.com/video"+video+"' width='300' height='200' frameborder='0' webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe> <p></p>"
						elif x == 'type' and y == 'article':
							for k,l in og.items():
								if k == 'title':
									article_title = l
								elif k == 'site_name':
									title = l
								elif k=='description':
									desc = l
								elif k=='image':
									site_image = l
							wall_post.data['post_content'] = wall_post.data['post_content'] +"<br/><table><tr><td><img width='50' src='"+site_image+"'</td><td><a href='"+url+"' target='_blank'/>"+article_title+"</a><br/>"+title+"</td></td></table>"
						elif x=='type':
							for k,l in og.items():
								if k == 'site_name':
									title = l
								elif k=='description':
									desc = l
								elif k=='image':
									site_image = l
							wall_post.data['post_content'] = wall_post.data['post_content'].replace(url, "<table><tr><td><img width='50' src='"+site_image+"'</td><td><a href='"+url+"' target='_blank'/>"+title+"</a><br/>"+desc+"</td></td></table>")
				else:
					wall_post.data['post_content'] = wall_post.data['post_content'].replace(url, "<a href='"+url+"' target='_blank'>"+url+"</a>")	
	return wall_posts	
Code example #5
File: tb.py  Project: rich678s/wangw
def searchcrawler(url):

    html = get_html(url)
    #     print url
    if html:
        soup = BeautifulSoup(html, fromEncoding='gbk')
        items_row = soup.findAll('div', {'class': 'item-box st-itembox'})
        if items_row:
            print '=======================row search row=========================='
            for item in items_row:
                #                 print item
                item_info = item.find('h3', {'class': 'summary'}).a
                item_url = item_info['href']
                #                 print item_url

                sid_info = item.find('div', {
                    'class': 'col seller feature-dsi-tgr'
                }).a
                print sid_info
                sid_item_url = sid_info['href']
                sid_url_info = urlparse.urlparse(sid_item_url)
                sid_id = urlparse.parse_qs(sid_url_info.query,
                                           True)['user_number_id'][0]
                print sid_id

                judge_site(item_url, sid_id)

#                 logging.warning(item_id)
#
#                 download_reply_by_id(item_id)

        items_col = soup.findAll('div',
                                 {'class': 'product-item row icon-datalink'})
        if items_col:

            print '=======================row search col=========================='
            #print items
            for item in items_col:
                item_info = item.find('div', {'class': 'title'}).a
                item_url = item_info['href']
                #                 url_info = urlparse.urlparse(item_url)
                #                 item_id = urlparse.parse_qs(url_info.query,True)['id'][0]
                print item_url
                #                 print item_id

                sid_info = item.find('div', {'class': 'seller'}).a
                print sid_info
                sid_item_url = sid_info['href']
                sid_url_info = urlparse.urlparse(sid_item_url)
                sid_id = urlparse.parse_qs(sid_url_info.query,
                                           True)['user_number_id'][0]
                print sid_id

                judge_site(item_url, sid_id)
Code example #6
File: tb.py  Project: fubendong/wangw
def searchcrawler(url):
    
    html=get_html(url)
#     print url
    if html:
        soup = BeautifulSoup(html,fromEncoding='gbk')
        items_row = soup.findAll('div',{'class':'item-box st-itembox'})
        if items_row:
            print '=======================row search row=========================='
            for item in items_row:
#                 print item
                item_info = item.find('h3',{'class':'summary'}).a
                item_url = item_info['href']
#                 print item_url
                
                
                sid_info = item.find('div',{'class':'col seller feature-dsi-tgr'}).a
                print sid_info
                sid_item_url = sid_info['href']
                sid_url_info = urlparse.urlparse(sid_item_url)
                sid_id = urlparse.parse_qs(sid_url_info.query,True)['user_number_id'][0]
                print sid_id
                
                judge_site(item_url, sid_id)
                
#                 logging.warning(item_id)
#                 
#                 download_reply_by_id(item_id)
                
        items_col = soup.findAll('div',{'class':'product-item row icon-datalink'})       
        if items_col:
            
            print '=======================row search col=========================='
            #print items
            for item in items_col:
                item_info = item.find('div',{'class':'title'}).a
                item_url = item_info['href']
#                 url_info = urlparse.urlparse(item_url)
#                 item_id = urlparse.parse_qs(url_info.query,True)['id'][0]
                print item_url
#                 print item_id

                sid_info = item.find('div',{'class':'seller'}).a
                print sid_info
                sid_item_url = sid_info['href']
                sid_url_info = urlparse.urlparse(sid_item_url)
                sid_id = urlparse.parse_qs(sid_url_info.query,True)['user_number_id'][0]
                print sid_id
                
                judge_site(item_url, sid_id)
Code example #7
File: __init__.py  Project: bbockelm/globus-toolkit
def is_local_service(name):
    """
    Determine if a service definition describes a service running on
    the local node. This is true if the service URL is for localhost,
    matches the machine's name, or ec2 public name
    """
    if name is None:
        return False
    if "://" in name:
        url = urlparse.urlparse(name)
        if ":" in url.netloc:
            name = url.netloc.split(":")[0]
        else:
            name = url.netloc
    elif ":" in name:
        name = name.split(":")[0]

    if name == "localhost":
        return True

    if '.' in name:
        name = name.split('.')[0]
    node = platform.node()
    if '.' in node:
        node = node.split('.')[0]

    if name == node:
        return True
    pn = public_name()
    if pn is not None and pn.split(".")[0] == name:
        return True
    return False
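As a quick illustration of the hostname normalization performed above (hypothetical inputs, Python 2):

import urlparse

# URL form: strip the scheme and any port from the netloc
print urlparse.urlparse('https://myhost.example.com:8443/service').netloc.split(':')[0]  # -> 'myhost.example.com'
# bare host:port form: only the part before the colon is compared
print 'myhost:50000'.split(':')[0]  # -> 'myhost'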
Code example #8
File: index.py  Project: kultus/foofind-web
def gensitemap(server, urlformat):
    '''
    Builds the sitemap index path for the given file server.
    Connects to the second-level indexes and obtains their modification date.

    @type server: dict-like
    @param server: Server document exactly as it comes from MongoDB

    @rtype tuple (str, datetime) or None
    @return tuple with the url and its modification date, or None if the url
            cannot be obtained.
    '''
    subdomain = server["ip"].split(".")[0]
    serverno = int(subdomain[6:])
    url = urlformat % serverno
    domain = urlparse.urlparse(url)[1]
    con = httplib.HTTPConnection(domain)
    con.request("HEAD", url)
    response =  con.getresponse()

    if response.status == 200:
        mtime = time.mktime(time.strptime(
           response.getheader("last-Modified"),
            "%a, %d %b %Y %H:%M:%S %Z"))
        return (url, datetime.datetime.fromtimestamp(mtime))

    return None
Code example #9
File: web.py  Project: schandrika/volttron
    def startupagent(self, sender, **kwargs):

        if not self.bind_web_address:
            _log.info('Web server not started.')
            return
        import urlparse
        parsed = urlparse.urlparse(self.bind_web_address)
        hostname = parsed.hostname
        port = parsed.port

        _log.info('Starting web server binding to {}:{}.' \
                   .format(hostname, port))
        self.registeredroutes.append((re.compile('^/discovery/$'), 'callable',
                                      self._get_discovery))
        self.registeredroutes.append((re.compile('^/discovery/allow$'),
                                      'callable',
                                      self._allow))
        self.registeredroutes.append((re.compile('^/$'), 'callable',
                                      self._redirect_index))
        port = int(port)
        vhome = os.environ.get('VOLTTRON_HOME')
        logdir = os.path.join(vhome, "log")
        if not os.path.exists(logdir):
            os.makedirs(logdir)

        self.appContainer = WebApplicationWrapper(self, hostname, port)
        svr = WSGIServer((hostname, port), self.appContainer)
        self._server_greenlet = gevent.spawn(svr.serve_forever)
Code example #10
File: __init__.py  Project: ysvenkat/globus-toolkit
def is_local_service(name):
    """
    Determine if a service definition describes a service running on
    the local node. This is true if the service URL is for localhost,
    matches the machine's name, or ec2 public name
    """
    if name is None:
        return False
    if "://" in name:
        url = urlparse.urlparse(name)
        if ":" in url.netloc:
            name = url.netloc.split(":")[0]
        else:
            name = url.netloc
    elif ":" in name:
        name = name.split(":")[0]

    if name == "localhost":
        return True

    if '.' in name:
        name = name.split('.')[0]
    node = platform.node()
    if '.' in node:
        node = node.split('.')[0]

    if name == node:
        return True
    pn = public_name()
    if pn is not None and pn.split(".")[0] == name:
        return True
    return False
Code example #11
    def fps_ipn_handler(self, request):
        uri = request.build_absolute_uri()
        parsed_url = urlparse.urlparse(uri)
        resp = self.fps_connection.verify_signature(UrlEndPoint="%s://%s%s" % (parsed_url.scheme,
                                                                  parsed_url.netloc,
                                                                  parsed_url.path),
                                                    HttpParameters=request.body)
        if not resp.VerifySignatureResult.VerificationStatus == "Success":
            return HttpResponseForbidden()

        data = dict(map(lambda x: x.split("="), request.body.split("&")))
        for (key, val) in data.items():
            data[key] = urllib.unquote_plus(val)
        if AmazonFPSResponse.objects.filter(transactionId=data["transactionId"]).count():
            resp = AmazonFPSResponse.objects.get(transactionId=data["transactionId"])
        else:
            resp = AmazonFPSResponse()
        for (key, val) in data.items():
            attr_exists = hasattr(resp, key)
            if attr_exists and not callable(getattr(resp, key, None)):
                if key == "transactionDate":
                    val = datetime.datetime(*time.localtime(float(val))[:6])
                setattr(resp, key, val)
        resp.save()
        if resp.statusCode == "Success":
            transaction_was_successful.send(sender=self.__class__,
                                            type=data["operation"],
                                            response=resp)
        else:
            if not "Pending" in resp.statusCode:
                transaction_was_unsuccessful.send(sender=self.__class__,
                                                  type=data["operation"],
                                                  response=resp)
        # Return a HttpResponse to prevent django from complaining
        return HttpResponse(resp.statusCode)
Code example #12
File: web.py  Project: cbs-iiith/volttron
    def startupagent(self, sender, **kwargs):

        if not self.bind_web_address:
            _log.info('Web server not started.')
            return
        import urlparse
        parsed = urlparse.urlparse(self.bind_web_address)
        hostname = parsed.hostname
        port = parsed.port

        _log.info('Starting web server binding to {}:{}.' \
                   .format(hostname, port))
        self.registeredroutes.append((re.compile('^/discovery/$'), 'callable',
                                      self._get_discovery))
        self.registeredroutes.append((re.compile('^/discovery/allow$'),
                                      'callable',
                                      self._allow))
        self.registeredroutes.append((re.compile('^/$'), 'callable',
                                      self._redirect_index))
        port = int(port)
        vhome = os.environ.get('VOLTTRON_HOME')
        logdir = os.path.join(vhome, "log")
        if not os.path.exists(logdir):
            os.makedirs(logdir)
        with open(os.path.join(logdir, 'web.access.log'), 'wb') as accesslog:
            with open(os.path.join(logdir, 'web.error.log'), 'wb') as errlog:
                server = pywsgi.WSGIServer((hostname, port), self.app_routing,
                                       log=accesslog, error_log=errlog)
                server.serve_forever()
Code example #13
 def iriToUri(self, iri):
     import urlparse
     parts = urlparse.urlparse(iri.decode('utf-8'))
     return urlparse.urlunparse(
         part.encode('idna') if parti ==
         1 else self.urlEncodeNonAscii(part.encode('utf-8'))
         for parti, part in enumerate(parts))
Code example #14
def judge_site(url,keyword=''):
    """
    Determine whether the item is from Taobao (tb) or Tmall (tm)
    """
    url_info = urlparse.urlparse(url)
    urlkey = urlparse.parse_qs(url_info.query,True)
    iid = int(urlkey['id'][0])
    #print 'url_info:',url_info[1]
    try:
        if url_info[1] == 'detail.tmall.com':
            print 'it is a tm item'
            if check_item_update_time(iid,'tm'):
                return
            data = getTmallItemInfo(iid,keyword)
        elif urlkey.get('cm_id'):
            print 'it is a tm item'
            if check_item_update_time(iid,'tm'):
                return
            data = getTmallItemInfo(iid,keyword)
        else:
            print 'it is a tb item'
            if check_item_update_time(iid,'tb'):
                return
            data = getTaobaoItemInfo(iid,keyword)
    except Exception, e:
        print traceback.print_exc()
        return
Code example #15
File: webhooks.py  Project: zh/ReaTiWe
 def post(self):
   try:
     name = self.request.POST['name']
     topic = MicroTopic.all().filter('name =', name).get()
     if not topic:
       raise ReatiweError("Topic %s does not exists." % name)
     if self.request.POST['mode']:
       mode = self.request.POST['mode']
     else:
       mode = "subscribe"
     form_fields = { "hub.mode": mode,
                     "hub.callback": "%s/callback/%s" % (settings.SITE_URL, topic.name),
                     "hub.topic": topic.url,
                     "hub.verify": "sync",
                     "hub.verify_token": topic.name }
     result = 200
     url = self.request.POST['hub']
     req = urllib2.Request(url, urllib.urlencode(form_fields))
     o = urlparse.urlparse(url)
     # superfeedr support
     if o.username and o.password:
       base64string = base64.encodestring('%s:%s' % (o.username, o.password))[:-1]
       authheader =  "Basic %s" % base64string
       new_url = "%s://%s%s" % (o.scheme, o.hostname, o.path)
       req = urllib2.Request(new_url, urllib.urlencode(form_fields))
       req.add_header("Authorization", authheader)
     urllib2.urlopen(req)
   except DownloadError, e:
     logging.error('DownloadError: %s' % repr(e))
     pass
Code example #16
File: cdapl.py  Project: ricardofcf/filmkodi
 def listsCategoriesMenu(self, url):
     query_data = {
         'url': url,
         'use_host': False,
         'use_cookie': False,
         'use_post': False,
         'return_data': True
     }
     link = self.cm.getURLRequestData(query_data)
     # how many videos are there?
     match = re.compile(
         '<li class="active"id="mVid"><a href="#" onclick="moreVideo\(\);return false;">Video \((.*?)\)</a></li>',
         re.DOTALL).findall(link)
     ilejest = int(match[0])
     policz = int(ilejest / o_filmow_na_stronie) + 1
     max_stron = policz
     parsed = urlparse.urlparse(url)
     typ = urlparse.parse_qs(parsed.query)['s'][0]
     for i in range(0, (policz)):
         purl = 'http://www.cda.pl/video/show/ca%C5%82e_filmy_or_ca%C5%82y_film/p' + str(
             i + 1) + '?s=' + typ
         self.add('cdapl', 'categories-menu', 'Strona ' + str(i + 1),
                  'None', 'None', purl, 'None', 'None', True, False,
                  str(i + 1))
     xbmcplugin.endOfDirectory(int(sys.argv[1]))
Code example #17
 def _extracturls(self):
     #print "Extract URLs"
     urls = []
     htmlsrc, charset, parenturl = self.htmlSrcTuple
     if htmlsrc != None:
         resulturls = []
         urlExtractor = ExtractLinks(resulturls)
         try:
             if charset == None:
                 urlExtractor.feed(htmlsrc)
             else:
                 urlExtractor.feed(htmlsrc.decode(charset))
         except HTMLParser.HTMLParseError:
             pass
         try:
             urlExtractor.reset() # I think close needs special treatment .close()
         except HTMLParser.HTMLParseError:
             urlExtractor.reset()
         #this piece of code forms the URIs to full URLs by joining the
         #parenturl with the network location free URLs extracted
         for i in xrange(len(resulturls)): #replacing range() for performance reasons
             urlres = urlparse.urlparse(resulturls[i], "http")
             if urlres.netloc == "":
                 resulturls[i] = urlparse.urljoin(parenturl, resulturls[i])
         urls.extend(resulturls)  # collect the resolved URLs once, after the loop has fixed them up
     return urls
Code example #18
File: service.py  Project: zptime/netdisk
def get_token_from_url(url):
    # extract the token from the url
    import urlparse
    result = urlparse.urlparse(url)
    param_dict = urlparse.parse_qs(result.query)
    tk = param_dict['tk'][0]
    return tk
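A minimal usage sketch, assuming a hypothetical share URL that carries a tk parameter (Python 2):

import urlparse

url = 'http://disk.example.com/share?tk=abc123&uid=7'  # hypothetical URL
result = urlparse.urlparse(url)
print urlparse.parse_qs(result.query)['tk'][0]  # -> 'abc123'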
Code example #19
def feature_extract(url_input):

    Feature = {}
    tokens_words = re.split('\W+', url_input)

    host = urlparse.urljoin(url_input, '/')
    path = urlparse.urlparse(url_input).path

    Feature['URL'] = url_input

    Feature['rank_host'], Feature['rank_country'] = sitepopularity(host)

    Feature['host'] = host

    Feature['Length_of_url'] = len(url_input)
    Feature['Length_of_host'] = len(host)
    Feature['No_of_dots'] = url_input.count('.')

    Feature['sec_sen_word_cnt'] = Security_sensitive(tokens_words)
    Feature['IPaddress_presence'] = Check_IPaddress(tokens_words)
    Feature['avg_token_length'], Feature['token_count'], Feature[
        'largest_token'] = Tokenise(url_input)
    Feature['avg_domain_token_length'], Feature['domain_token_count'], Feature[
        'largest_domain'] = Tokenise(host)
    Feature['avg_path_token'], Feature['path_token_count'], Feature[
        'largest_path'] = Tokenise(path)

    Feature['ASNno'] = getASN(host)
    Feature['safebrowsing'] = safebrowsing(url_input)
    Feature['numTld'] = numTld(url_input)
    Feature['numPunctuation'] = numPunctuation(url_input)
    return Feature
Code example #20
File: phone_get.py  Project: rich678s/wangw
def searchcrawler(url):
    
    html=get_html(url)
#     print url
    if html:
        soup = BeautifulSoup(html,fromEncoding='gbk')
        items_row = soup.findAll('div',{'class':'product-iWrap'})
        #items_row = soup.find('div',{'class':'item-box st-itembox'})
#         print items_row
        if items_row:
            print '=======================row search row=========================='
            for item in items_row:
#                 print item
                try:
                    item_info = item.find('p',{'class':'productTitle'}).a
                except:
                    item_info = item.find('div',{'class':'productTitle productTitle-spu'}).a
                
#                 print item_info
                item_url = item_info['href']
#                 print item_url
                
                url_info = urlparse.urlparse(item_url)
                item_id = urlparse.parse_qs(url_info.query,True)['id'][0]
                print item_id
                logging.warning(item_id)
                
#                 item_id = 16862466992
                download_reply_by_id(item_id)
Code example #21
def wait_for_servers(urls, timeout):
    import time, urlparse, httplib
    from ssl import SSLError
    
    for u in urls:
        parsed = urlparse.urlparse(u.lower(), "https")
        netloc = parsed.hostname
        if parsed.port: netloc = "%s:%s" % (netloc, parsed.port)
        if parsed.scheme == "http":
            cnxn = httplib.HTTPConnection(netloc)
        elif parsed.scheme == "https":
            cnxn = httplib.HTTPSConnection(netloc)
        else:
            raise Exception("Don't know how to handle scheme %s" % parsed.scheme)
        i = 0
        while(i < timeout):
            try:
                cnxn.connect()
            except SSLError:
                break
            except Exception as e:
                if "Connection refused" in str(e):
                    time.sleep(1)
                    i = i + 1  # count this retry so the loop honors the timeout
                elif "SSL" in str(e):
                    break
                else:
                    raise
            else:
                break
Code example #22
def judge_site(url, keyword=''):
    """
    Determine whether the item is from Taobao (tb) or Tmall (tm)
    """
    url_info = urlparse.urlparse(url)
    urlkey = urlparse.parse_qs(url_info.query, True)
    iid = int(urlkey['id'][0])
    #print 'url_info:',url_info[1]
    try:
        if url_info[1] == 'detail.tmall.com':
            print 'it is a tm item'
            if check_item_update_time(iid, 'tm'):
                return
            data = getTmallItemInfo(iid, keyword)
        elif urlkey.get('cm_id'):
            print 'it is a tm item'
            if check_item_update_time(iid, 'tm'):
                return
            data = getTmallItemInfo(iid, keyword)
        else:
            print 'it is a tb item'
            if check_item_update_time(iid, 'tb'):
                return
            data = getTaobaoItemInfo(iid, keyword)
    except Exception, e:
        print traceback.print_exc()
        return
Code example #23
File: tb.py  Project: rich678s/wangw
def judge_site(url, sid_id):
    """
    Determine whether the item is from Taobao (tb) or Tmall (tm)
    """
    url_info = urlparse.urlparse(url)
    urlkey = urlparse.parse_qs(url_info.query, True)
    iid = int(urlkey['id'][0])
    print iid
    #     print 'url_info:',url_info[1]
    try:
        if url_info[1] == 'detail.tmall.com':
            print 'it is a tm item'

#             data = download_tm_reply_by_id(iid)
        elif urlkey.get('cm_id'):
            print 'it is a tm item cm_id'

#             data = download_tm_reply_by_id(iid)
        else:
            print 'it is a tb item'

            data = download_tb_reply_by_id(iid, sid_id)
    except Exception, e:
        print traceback.print_exc()
        return
Code example #24
File: tb.py  Project: fubendong/wangw
def judge_site(url, sid_id):
    """
    Determine whether the item is from Taobao (tb) or Tmall (tm)
    """
    url_info = urlparse.urlparse(url)
    urlkey = urlparse.parse_qs(url_info.query,True)
    iid = int(urlkey['id'][0])
    print iid
#     print 'url_info:',url_info[1]
    try:
        if url_info[1] == 'detail.tmall.com':
            print 'it is a tm item'
            
#             data = download_tm_reply_by_id(iid)
        elif urlkey.get('cm_id'):
            print 'it is a tm item cm_id'
            
#             data = download_tm_reply_by_id(iid)
        else:
            print 'it is a tb item'
            
            data = download_tb_reply_by_id(iid, sid_id)
    except Exception, e:
        print traceback.print_exc()
        return
Code example #25
File: web.py  Project: rlutes/volttron
    def startupagent(self, sender, **kwargs):

        if not self.bind_web_address:
            _log.info('Web server not started.')
            return
        import urlparse
        parsed = urlparse.urlparse(self.bind_web_address)
        hostname = parsed.hostname
        port = parsed.port

        _log.info('Starting web server binding to {}:{}.' \
                   .format(hostname, port))
        self.registeredroutes.append(
            (re.compile('^/discovery/$'), 'callable', self._get_discovery))
        self.registeredroutes.append(
            (re.compile('^/discovery/allow$'), 'callable', self._allow))
        self.registeredroutes.append(
            (re.compile('^/$'), 'callable', self._redirect_index))
        port = int(port)
        vhome = os.environ.get('VOLTTRON_HOME')
        logdir = os.path.join(vhome, "log")
        if not os.path.exists(logdir):
            os.makedirs(logdir)
        with open(os.path.join(logdir, 'web.access.log'), 'wb') as accesslog:
            with open(os.path.join(logdir, 'web.error.log'), 'wb') as errlog:
                server = pywsgi.WSGIServer((hostname, port),
                                           self.app_routing,
                                           log=accesslog,
                                           error_log=errlog)
                server.serve_forever()
Code example #26
File: web.py  Project: yizenrg/volttron
    def startupagent(self, sender, **kwargs):

        if not self.bind_web_address:
            _log.info('Web server not started.')
            return
        import urlparse
        parsed = urlparse.urlparse(self.bind_web_address)
        hostname = parsed.hostname
        port = parsed.port

        _log.info('Starting web server binding to {}:{}.' \
                   .format(hostname, port))
        self.registeredroutes.append(
            (re.compile('^/discovery/$'), 'callable', self._get_discovery))
        self.registeredroutes.append(
            (re.compile('^/discovery/allow$'), 'callable', self._allow))
        self.registeredroutes.append(
            (re.compile('^/$'), 'callable', self._redirect_index))
        port = int(port)
        vhome = os.environ.get('VOLTTRON_HOME')
        logdir = os.path.join(vhome, "log")
        if not os.path.exists(logdir):
            os.makedirs(logdir)

        self.appContainer = WebApplicationWrapper(self, hostname, port)
        svr = WSGIServer((hostname, port), self.appContainer)
        self._server_greenlet = gevent.spawn(svr.serve_forever)
Code example #27
def getParams(path):
    query = urlparse.urlparse(path).query
    queryDict = dict([x.split('=') for x in query.split('&')])

    width = queryDict['WIDTH']
    height = queryDict['HEIGHT']
    bbox = queryDict['BBOX']
    return Params(int(width), int(height), map(float, bbox.split(',')))
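For comparison, the manual split above can also be expressed with urlparse.parse_qs, which additionally copes with repeated or blank values; a small sketch with a hypothetical WMS query (Python 2):

import urlparse

query = urlparse.urlparse('/wms?WIDTH=256&HEIGHT=256&BBOX=0,0,10,10').query  # hypothetical request path
print dict([x.split('=') for x in query.split('&')])  # manual split, as in getParams above
print urlparse.parse_qs(query)                        # e.g. {'WIDTH': ['256'], 'HEIGHT': ['256'], 'BBOX': ['0,0,10,10']}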
Code example #28
File: conf2.py  Project: bluedazzle/smart_screen
    def parse_failover(self, conf_service):
        """ Parse manual service values.
        consul:alpha#ip=192.168.1.1&port=80&ip=192.168.1.1&port=22 => 192.168.1.1:80,192.168.1.1:22
        consul:bravo?ip#ip=192.168.1.10 => 192.168.1.10
        consul:charlie?user++passwd#user=sa&passwd=guest => sa++guest
        consul:delta?user#passwd=guest => ""
        consul:echo?port#port=80&ip=localhost&port=22 => 80,22
        consul:foxtrot => consul:foxtrot
        consul:golf?user=passwd#user=sa&passwd=guest&user=root => sa=guest,root=
        """
        # Manual query string.
        _, _, man_qs = conf_service.partition("#")
        if not man_qs:
            # There isn't a manual value.
            return conf_service

        # Parse manual configures.
        man_qs = "?%s" % man_qs
        if is_python3:
            qs_dic = parse.parse_qs(urlparse.urlparse(man_qs).query)
        else:
            qs_dic = urlparse.parse_qs(urlparse.urlparse(man_qs).query)
        if not qs_dic:
            # There isn't a manual value.
            return conf_service

        _, _, seg = conf_service.partition("?")
        keys = self.parse_qs_keys(seg)
        if not keys:
            keys = ["ip", ":", "port"]

        def parse():
            max_l = max(len(lst) for lst in qs_dic.values())
            for i in range(max_l):
                for j in range(len(keys)):
                    k = keys[j]
                    if k[0] in string.punctuation:
                        yield k
                        continue
                    lst = qs_dic.get(k, [])
                    yield lst[i] if i < len(lst) else ""

                if i < max_l - 1:
                    yield ","

        return "".join(v for v in parse()).strip(",")
Code example #29
    def _get_store(self, uri):
        if os.path.isabs(uri):  # to support win32 paths like: C:\\some\dir
            scheme = 'file'
        else:
            scheme = urlparse.urlparse(uri).scheme

        store_cls = self.STORE_SCHEMES[scheme]
        return store_cls(uri)
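The scheme dispatch above reduces to reading urlparse.urlparse(uri).scheme; a short sketch with hypothetical URIs (Python 2):

import urlparse

print urlparse.urlparse('s3://bucket/key').scheme   # -> 's3'
print urlparse.urlparse('/var/data/file').scheme    # -> '' (no scheme; treated as a plain path)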
Code example #30
File: raw2http.py  Project: rangeme/GourdScanV2
 def sethash(self):
     request = 'http://' + self.host + self.url.split('?')[0]
     dic = urlparse.urlparse('http://'+self.host+self.url).query.split('&')
     for d in dic:
         request += d.split('=')[0]+'=&'
     request += "|"
     for d in self.headers['postdata'].split('&'):
         request += d.split('=')[0]+'=&'
     self.hash=md5(request).hexdigest()
Code example #31
File: crlf.py  Project: jsfan/Astra
def crlf_get_url_method(uri, headers, scanid=None):
    # This function checks CRLF through GET URL method.
    crlf_payloads = fetch_crlf_payload()
    for payload in crlf_payloads:
        parsed_uri = urlparse.urlparse(uri).scheme + "://" + urlparse.urlparse(
            uri).netloc + urlparse.urlparse(uri).path + "/" + payload
        crlf_get_method = req.api_request(parsed_uri, "GET", headers)
        for name in crlf_get_method.headers:
            if "CRLF-Test" in name:
                attack_result = {"id": 13, "scanid": scanid, "url": parsed_uri,
                                 "alert": "CRLF injection", "impact": "High",
                                 "req_headers": headers, "req_body": "NA",
                                 "res_headers": crlf_get_method.headers,
                                 "res_body": crlf_get_method.text}
                dbupdate.insert_record(attack_result)
                print "[+]{0} is vulnerable to CRLF injection".format(
                    parsed_uri)
                return
Code example #32
File: utils.py  Project: lantianlz/cheka
def get_sub_domain_from_http_host(http_host):
    '''
    @note: get the subdomain prefix from the http host
    '''
    import urlparse
    if http_host:
        http_host = ('http://%s' % http_host) if not http_host.startswith('http') else http_host
        prefix = urlparse.urlparse(http_host)[1].split('.', 1)[0]
        return prefix
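A minimal usage sketch, assuming the helper above is importable; the host values are hypothetical (Python 2):

print get_sub_domain_from_http_host('shop.example.com:8080')   # -> 'shop'
print get_sub_domain_from_http_host('http://blog.example.com')  # -> 'blog'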
Code example #33
File: utils.py  Project: cash2one/eqcj
def get_sub_domain_from_http_host(http_host):
    '''
    @note: get the subdomain prefix from the http host
    '''
    import urlparse
    if http_host:
        http_host = ('http://%s' % http_host) if not http_host.startswith('http') else http_host
        prefix = urlparse.urlparse(http_host)[1].split('.', 1)[0]
        return prefix
Code example #34
File: compat.py  Project: biancini/TeamsLogExporter
def parse_query_string(url, key):
    if is_py2:
        import urlparse
        parsed_url = urlparse.urlparse(url)
        return urlparse.parse_qs(parsed_url.query)[key][0]
    else:
        from urllib.parse import urlparse
        from urllib.parse import parse_qs
        parsed_url = urlparse(url)
        return parse_qs(parsed_url.query)[key][0]
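A minimal usage sketch, assuming the compat helper above is importable; the URL and key are hypothetical:

# works the same under Python 2 and Python 3 thanks to the is_py2 branch above
print(parse_query_string('https://example.com/page?session=xyz&lang=en', 'session'))  # -> 'xyz'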
Code example #35
File: cmd.py  Project: ericmoritz/static-ld
def relative_uri(base, target):
    """
    >>> relative_uri(u"http://example.com/foo/", u"http://example.com/foo/bar")
    u'bar'

    >>> relative_uri(u"http://example.com/baz/", u"http://example.com/foo/bar")
    u'../foo/bar'

    >>> relative_uri(u"http://example2.com/baz/", u"http://example.com/foo/bar")
    u'http://example.com/foo/bar'

    """
    base_bits=urlparse.urlparse(base)
    target_bits=urlparse.urlparse(target)
    if base_bits.netloc != target_bits.netloc:
        return target
    base_dir='.'+posixpath.dirname(base_bits.path)
    target='.'+target_bits.path
    return posixpath.relpath(target,start=base_dir)
Code example #36
File: admin.py  Project: glukagen/EnglishQualityGame
 def get(self):
     self.response.headers['Content-Type'] = 'text/html'
     path = os.path.join(os.path.dirname(__file__), 'admin.html')
     u = urlparse.urlparse(self.request.url)
     dashboard = ""
     if u.netloc.startswith("localhost"):
         dashboard = "/_ah/admin"
     else:
         appname = u.netloc[:u.netloc.find(".")]
         dashboard = "https://appengine.google.com/dashboard?&app_id=s~" + appname
     self.response.out.write(template.render(path, {"dashboard" : dashboard}))
Code example #37
File: authshim.py  Project: rcoh/oauth-shim
def add_params(url, params):
    import urllib
    import urlparse

    url_parts = list(urlparse.urlparse(url))
    query = dict(urlparse.parse_qsl(url_parts[4]))
    query.update(params)

    url_parts[4] = urllib.urlencode(query)

    return urlparse.urlunparse(url_parts)
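A minimal usage sketch for the helper above, with a hypothetical callback URL and parameter (Python 2):

# hypothetical values; the resulting parameter order may vary because a dict is re-encoded
print add_params('http://example.com/cb?state=1', {'code': 'abc'})
# -> 'http://example.com/cb?state=1&code=abc' (order not guaranteed)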
Code example #38
File: utils.py  Project: beforebeta/dealfu
def extract_query_params(url, *names):
    """
    Extracts names in the list from url
    @param url:
    @param names:
    @return: dict
    """
    parsed_res = urlparse.urlparse(url)
    d = urlparse.parse_qs(parsed_res.query)

    return {key:value[0] for (key, value) in d.iteritems() if key in names}
Code example #39
File: main.py  Project: murtuzasaleh/odoo-addons
 def qs(url):
     query = urlparse.urlparse(url).query
     res = dict(
         [(k, v[0]) for k, v in urlparse.parse_qs(query).items()])
     res1 = {}
     if res.get('redirect', {}):
         res1 = qs(res.get('redirect', {}))
     res.update(res1)
     # todo if one ticket in redirect
     # and one ticket in normal path, deal ticket
     return res
Code example #40
File: atomformat.py  Project: goetzk/tendenci
def get_tag_uri(url, date):
    "Creates a TagURI. See http://diveintomark.org/archives/2004/05/28/howto-atom-id"
    parts = urlparse.urlparse(url)
    date_part = ""
    if date is not None:
        date_part = ",%s:" % date.strftime("%Y-%m-%d")
    return "tag:%s%s%s/%s" % (
        parts.hostname,
        date_part,
        parts.path,
        parts.fragment,
    )
Code example #41
def get_tag_uri(url, date):
    "Creates a TagURI. See http://diveintomark.org/archives/2004/05/28/howto-atom-id"
    parts = urlparse.urlparse(url)
    date_part = ""
    if date is not None:
        date_part = ",%s:" % date.strftime("%Y-%m-%d")
    return "tag:%s%s%s/%s" % (
        parts.hostname,
        date_part,
        parts.path,
        parts.fragment,
    )
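A small worked example of the tag URI built above, assuming the function is importable; the post URL and date are hypothetical (Python 2):

import datetime

print get_tag_uri('http://example.com/blog/post#frag', datetime.date(2010, 5, 28))
# -> 'tag:example.com,2010-05-28:/blog/post/frag'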
Code example #42
def getTmallItemInfo(iid,keyword=''):
    """
    Fetch the item information for a Tmall (tm) item
    """
    temp = {'site':'tm','itemid':iid,'keyword':keyword}
    patt_list = {
                #r""""sellerNickName"\s*:\s*(.*)'\s*,'isEcardAuction'""",
                'sellerid':r"'userId'\s*:\s*'(\w*)',",
                'shopid':r'rstShopId:(\w*),',
                'brand':r"'brand'\s*:\s*(.*)'\s*,'brandId'",
                'brandid':r"'brandId'\s*:\s*'(\w*)'", 
                'total_count':r'totalSQ=(\w*)', 
    }
    html = get_html("http://detail.tmall.com/item.htm?id=%s"%iid)
    #print 'html:',html
    htmlutf = html.replace('\r\n','').replace('\t','')
    soup = BeautifulSoup(html,fromEncoding='gbk')
    temp['shopurl'] = urlparse.urlparse(soup.find('span',{'class':'slogo'}).a['href']).netloc
    temp['itemname'] = soup.find('input',{'name':'title'})['value']
    temp['region'] = soup.find('input',{'name':'region'})['value']
    temp['sellername'] = soup.find('input',{'name':'seller_nickname'})['value']
    for k in patt_list:
        patt = patt_list[k]
        temp[k] = re.findall(patt,htmlutf)[0]
    url = "http://mdskip.taobao.com/core/initItemDetail.htm?tmallBuySupport=true&itemId=%s&service3C=true"%(iid)
    data = get_html(url,referer="http://detail.tmall.com/item.htm?id=%s"%iid).decode('gbk')#.replace('\r\n','').replace('\t','')
    patt = '"priceInfo":(\{.*\}),"promType"'
    price_info = re.findall(patt,data)
    if price_info:
        price_info = json.loads(price_info[0])
        #print 'price_info:',price_info
        if price_info.get('def'):
            temp['price'] = float(price_info['def']['price'])
            if price_info['def']['promotionList']:
                temp['realprice'] = float(price_info['def']['promotionList'][0]['price'])
            else:
                if price_info['def'].get('tagPrice'):
                    temp['realprice'] = float(price_info['def']['tagPrice'])
                else:
                    temp['realprice'] = float(price_info['def']['price'])
                    
        else:
            temp['price'] = float(price_info[price_info.keys()[0]]['price'])
            temp['realprice'] = float(price_info[price_info.keys()[0]]['price'])
            
    patt = '"sellCountDO":(\{.*\}),"serviceDO"'
    quantity_info = re.findall(patt,data)
    if quantity_info:
        quantity = re.findall(r'"sellCount":(\w*)',quantity_info[0])[0]
        print 'quantity :',quantity
        temp['quantity'] = float(quantity)
    return temp
Code example #43
def getTmallItemInfo(iid,keyword=''):
    """
    Fetch the item information for a Tmall (tm) item
    """
    temp = {'site':'tm','itemid':iid,'keyword':keyword}
    patt_list = {
                #r""""sellerNickName"\s*:\s*(.*)'\s*,'isEcardAuction'""",
                'sellerid':r"'userId'\s*:\s*'(\w*)',",
                'shopid':r'rstShopId:(\w*),',
                'brand':r"'brand'\s*:\s*(.*)'\s*,'brandId'",
                'brandid':r"'brandId'\s*:\s*'(\w*)'", 
                'total_count':r'totalSQ=(\w*)', 
    }
    html = get_html("http://detail.tmall.com/item.htm?id=%s"%iid)
    #print 'html:',html
    htmlutf = html.replace('\r\n','').replace('\t','')
    soup = BeautifulSoup(html,fromEncoding='gbk')
    temp['shopurl'] = urlparse.urlparse(soup.find('span',{'class':'slogo'}).a['href']).netloc
    temp['itemname'] = soup.find('input',{'name':'title'})['value']
    temp['region'] = soup.find('input',{'name':'region'})['value']
    temp['sellername'] = soup.find('input',{'name':'seller_nickname'})['value']
    for k in patt_list:
        patt = patt_list[k]
        temp[k] = re.findall(patt,htmlutf)[0]
    url = "http://mdskip.taobao.com/core/initItemDetail.htm?tmallBuySupport=true&itemId=%s&service3C=true"%(iid)
    data = get_html(url,referer="http://detail.tmall.com/item.htm?id=%s"%iid).decode('gbk')#.replace('\r\n','').replace('\t','')
    patt = '"priceInfo":(\{.*\}),"promType"'
    price_info = re.findall(patt,data)
    if price_info:
        price_info = json.loads(price_info[0])
        #print 'price_info:',price_info
        if price_info.get('def'):
            temp['price'] = float(price_info['def']['price'])
            if price_info['def']['promotionList']:
                temp['realprice'] = float(price_info['def']['promotionList'][0]['price'])
            else:
                if price_info['def'].get('tagPrice'):
                    temp['realprice'] = float(price_info['def']['tagPrice'])
                else:
                    temp['realprice'] = float(price_info['def']['price'])
                    
        else:
            temp['price'] = float(price_info[price_info.keys()[0]]['price'])
            temp['realprice'] = float(price_info[price_info.keys()[0]]['price'])
            
    patt = '"sellCountDO":(\{.*\}),"serviceDO"'
    quantity_info = re.findall(patt,data)
    if quantity_info:
        quantity = re.findall(r'"sellCount":(\w*)',quantity_info[0])[0]
        print 'quantity :',quantity
        temp['quantity'] = float(quantity)
    return temp
Code example #44
File: loader.py  Project: jaywhy13/mapstream
	def __init__(self, data_src):
		self.url = data_src.src_id
		self.source_node = data_src
		self.parameters = data_src.get_parameters()

		# set the src type
		self.source_type = DataSourceType.objects.get(name='SiteLinkLoader')
		
		# check for crucial parameters
		self.article_link_selector = self.parameters.get('article-link-selector','')
		self.article_css_selector = self.parameters.get('article-css-selector','')
		self.fetch_limit = self.parameters.get('fetch-limit',50)
		self.hostname = urlparse.urlparse(self.url).hostname
Code example #45
File: config.py  Project: albertoconti/astropysics
 def getUrl(self):
     import urllib2,urlparse,os
     dlurl = 'http://www.stsci.edu/resources/software_hardware/pyfits/Download'
     uf = urllib2.urlopen(dlurl)
     try:
         self.feed(uf.read())
     finally:
         uf.close()
         
     url = self.dlurl
     fn= os.path.split(urlparse.urlparse(url).path)[-1]
         
     return url,fn
Code example #46
File: config.py  Project: sargas/astropysics
    def getUrl(self):
        import urllib2, urlparse, os
        dlurl = 'http://www.stsci.edu/resources/software_hardware/pyfits/Download'
        uf = urllib2.urlopen(dlurl)
        try:
            self.feed(uf.read())
        finally:
            uf.close()

        url = self.dlurl
        fn = os.path.split(urlparse.urlparse(url).path)[-1]

        return url, fn
Code example #47
def get_url_parts(url):
    '''
        Extract the individual parts of a url
    :param url: the full url
    :return: the six parts (Scheme, Hostname, Domain, Path, Param, Query)
    '''
    result = urlparse.urlparse(url)
    scheme = result.scheme
    hostname = result.netloc
    try:
        domain = get_tld(url)
    except Exception, e:
        domain = hostname
Code example #48
File: payment.py  Project: russelljk/flexpay
 def verify_signature(self, url):
     '''
     http://docs.aws.amazon.com/AmazonFPS/latest/FPSAPIReference/VerifySignatureAPI.html
     '''        
     p = urlparse.urlparse(url)
     http_parameters = p.query
     url_end_point = '{0.scheme}://{0.netloc}{0.path}'.format(p)
     
     params = {
         'Action': 'VerifySignature',
         'UrlEndPoint': url_end_point,
         'HttpParameters': http_parameters,
     }
     return params
Code example #49
File: payment.py  Project: russelljk/flexpay
    def verify_signature(self, url):
        '''
        http://docs.aws.amazon.com/AmazonFPS/latest/FPSAPIReference/VerifySignatureAPI.html
        '''
        p = urlparse.urlparse(url)
        http_parameters = p.query
        url_end_point = '{0.scheme}://{0.netloc}{0.path}'.format(p)

        params = {
            'Action': 'VerifySignature',
            'UrlEndPoint': url_end_point,
            'HttpParameters': http_parameters,
        }
        return params
Code example #50
File: cdapl.py  Project: gosiaiunia1/KODI_PRIV
 def listsCategoriesMenu(self,url):
     query_data = { 'url': url, 'use_host': False, 'use_cookie': False, 'use_post': False, 'return_data': True }
     link = self.cm.getURLRequestData(query_data)
     # how many videos are there?
     match = re.compile('<li class="active"id="mVid"><a href="#" onclick="moreVideo\(\);return false;">Video \((.*?)\)</a></li>', re.DOTALL).findall(link)
     ilejest = int(match[0])
     policz = int(ilejest/o_filmow_na_stronie) +1
     max_stron = policz
     parsed = urlparse.urlparse(url)
     typ = urlparse.parse_qs(parsed.query)['s'][0]
     for i in range(0, (policz)):
         purl = 'http://www.cda.pl/video/show/ca%C5%82e_filmy_or_ca%C5%82y_film/p'+str(i+1)+'?s='+typ
         self.add('cdapl', 'categories-menu', 'Strona '+str(i+1), 'None', 'None', purl, 'None', 'None', True, False,str(i+1))
     xbmcplugin.endOfDirectory(int(sys.argv[1]))
Code example #51
File: compute.py  Project: 3fs/libcloud-interoute
    def __init__(self,
                 key,
                 secret=None,
                 secure=True,
                 host=None,
                 path=None,
                 port=None,
                 url=None,
                 *args,
                 **kwargs):
        if url:
            parsed = urlparse.urlparse(url)

            path = parsed.path

            scheme = parsed.scheme
            split = parsed.netloc.split(':')

            if len(split) == 1:
                # No port provided, use the default one
                host = parsed.netloc
                port = 443 if scheme == 'https' else 80
            else:
                host = split[0]
                port = int(split[1])
        else:
            host = host if host else self.host
            path = path if path else self.path

        if path is not None:
            self.path = path

        if host is not None:
            self.host = host

        if (self.type == Provider.CLOUDSTACK) and (not host or not path):
            raise Exception('When instantiating CloudStack driver directly '
                            'you also need to provide url or host and path '
                            'argument')

        region = kwargs.get('region', None)
        if region is not None:
            self.region = region

        super(InterouteNodeDriver, self).__init__(key=key,
                                                  secret=secret,
                                                  secure=secure,
                                                  host=host,
                                                  port=port)
Code example #52
 def wget(self, url, download, content):
     import uuid
     import urlparse, os
     local = download['local_path']
     pathX = urlparse.urlparse(url).path
     ext = os.path.splitext(pathX)[1]
     if ext.lower() not in [".png", ".jpg", ".gif", ".jpeg"]:
         print "Create {}".format(url)
         fileName = "{}{}".format(uuid.uuid4(), ext)
         fileUrl = open("{}\{}".format(local, fileName), "wb")
         fileUrl.write(url)
         fileUrl.write("\n")
         fileUrl.write(content)
         fileUrl.close()
         pass
Code example #53
def _parseURL(url):
    if url:
        import urlparse
        url = urlparse.urlparse(url)
        protocol = url.scheme
        host = url.hostname
        if url.port:
            try:
                port = int(url.port)
            except ValueError:
                message = "Invalid port number %s in URL" % url.port
                raise SessionArgumentException(message)
        else:
            port = (protocol == "http") and 80 or 443
        path = url.path
        return host, port, protocol, path
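A minimal usage sketch, assuming the function above is importable and using a hypothetical HTTPS endpoint; note how the port falls back to 443 when the URL does not specify one:

print _parseURL('https://api.example.com/v1/things')
# -> ('api.example.com', 443, 'https', '/v1/things')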
Code example #54
File: link.py  Project: Daroth/collectr
    def clean_url(self, url=None):
        import cgi
        import urlparse
        import urllib

        if not url:
            url = self.url

        u = urlparse.urlparse(url)
        qs = cgi.parse_qs(u[4])
        qs = dict((k, v) for k, v in qs.iteritems() if not k.startswith('utm_'))
        u = u._replace(query=urllib.urlencode(qs, True))
        url = urlparse.urlunparse(u)
        url = url.replace('#!', '?_escaped_fragment_=')
        self.logger.info("cleaned url : %s" % url)
        return url
Code example #55
File: util.py  Project: rjstreet/bgglookup-glass
def openAnything(source, etag=None, lastmodified=None, agent=USER_AGENT):
    '''URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner.  Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the lastmodified argument is supplied, it must be a formatted
    date/time string in GMT (as returned in the Last-Modified header of
    a previous request).  The formatted date/time will be used
    as the value of an If-Modified-Since request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.
    '''

    if hasattr(source, 'read'):
        return source

    if source == '-':
        return sys.stdin

    if urlparse.urlparse(source)[0] == 'http':                                      
        # open URL with urllib2                                                     
        request = urllib2.Request(source)                                           
        request.add_header('User-Agent', agent)                                     
        if etag:                                                                    
            request.add_header('If-None-Match', etag)                               
        if lastmodified:                                                            
            request.add_header('If-Modified-Since', lastmodified)                   
        request.add_header('Accept-encoding', 'gzip')                               
        opener = urllib2.build_opener(SmartRedirectHandler(), DefaultErrorHandler())
        return opener.open(request)                                                 
    
    # try to open with native open function (if source is a filename)
    try:
        return open(source)
    except (IOError, OSError):
        pass

    # treat source as string
    return StringIO(str(source))
Code example #56
def is_valid(url):
    '''
    Function returns True or False based on whether the url has to be
    downloaded or not.
    Robot rules and duplication rules are checked separately.
    This is a great place to filter out crawler traps.
    '''
    global invalid_links
    if invalid_links == "":
        invalid_links = 0
    parsed = urlparse.urlparse(url)
    if parsed.scheme not in set(["http", "https"]):
        invalid_links += 1
        write_invalid_file = open("invalid.txt", "w")
        write_invalid_file.write(str(invalid_links))
        return False

    try:
        r = requests.get(url)
        if r.status_code != 200:
            invalid_links += 1
            write_invalid_file = open("invalid.txt", "w")
            write_invalid_file.write(str(invalid_links))
            return False

        lower_ = ".ics.uci.edu" in parsed.hostname and not re.match(
            ".*\.(css|js|bmp|gif|jpe?g|ico" + "|png|tiff?|mid|mp2|mp3|mp4" +
            "|wav|avi|mov|mpeg|ram|m4v|mkv|ogg|ogv|pdf" +
            "|ps|eps|tex|ppt|pptx|doc|docx|xls|xlsx|names|data|dat|exe|bz2|tar|msi|bin|7z|psd|dmg|iso|epub|dll|cnf|tgz|sha1"
            + "|thmx|mso|arff|rtf|jar|csv" +
            "|rm|smil|wmv|swf|wma|zip|rar|gz)$", parsed.path.lower())

        if not lower_:
            invalid_links += 1
            write_invalid_file = open("invalid.txt", "w")
            write_invalid_file.write(str(invalid_links))
        return lower_

    except TypeError:
        print("TypeError for ", parsed)
        return False

    except requests.exceptions.ConnectionError:
        invalid_links += 1
        write_invalid_file = open("invalid.txt", "w")
        write_invalid_file.write(str(invalid_links))
        return False
Code example #57
def vimeo_video_id(value):
    """ 
    Examples:
    - https://vimeo.com/11111111
    - http://vimeo.com/11111111
    - https://www.vimeo.com/11111111
    - http://www.vimeo.com/11111111
    - https://vimeo.com/channels/11111111
    - http://vimeo.com/channels/11111111
    - https://vimeo.com/groups/name/videos/11111111
    """

    import urlparse

    parsed_url = urlparse.urlparse(value)
    parsed_url_path_parts = parsed_url.path.lstrip('/').split('/')

    return parsed_url_path_parts[-1]
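A quick usage sketch with one of the URL shapes listed in the docstring above, assuming the function is importable (Python 2):

print vimeo_video_id('https://vimeo.com/groups/name/videos/11111111')  # -> '11111111'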
Code example #58
def get_params(url, specify_params=False):
    """
    Returns the URL's query parameters as a list of (name, value) tuples.
    If specify_params is a comma-separated string of parameter names, those
    names are filtered out of the result.
    """
    params = []
    parsed = urlparse.urlparse(url)
    fun_params = urlparse.parse_qsl(parsed.query)
    if specify_params:
        excluded = set(specify_params.split(','))
        fun_params = [p for p in fun_params if p[0] not in excluded]
    for param in fun_params:
        params.append(param)
    return params