def searchcrawler(url, keyword=''):
    """
    Taobao search-page crawler
    """
    html = get_html(url)
    #print html
    if html:
        soup = BeautifulSoup(html, fromEncoding='gbk')
        items_row = soup.findAll('div', {'class': 'row item icon-datalink'})
        if items_row:
            print '=======================row search row=========================='
            #print items
            for item in items_row:
                item_info = item.find('div', {'class': 'col title'}).h3.a
                item_url = item_info['href']
                url_info = urlparse.urlparse(item_url)
                item_id = urlparse.parse_qs(url_info.query, True)['id'][0]
                print item_url
                print item_id
                judge_site(item_url, keyword)
        items_col = soup.findAll('div', {'class': 'col item icon-datalink'})
        if items_col:
            print '=======================row search col=========================='
            #print items
            for item in items_col:
                item_info = item.find('div', {'class': 'item-box'}).h3.a
                item_url = item_info['href']
                url_info = urlparse.urlparse(item_url)
                item_id = urlparse.parse_qs(url_info.query, True)['id'][0]
                print item_url
                print item_id
                judge_site(item_url, keyword)
def crlf_get_uri_method(uri, method, headers, scanid=None):
    # This function checks CRLF through GET URI method.
    par_key = {}
    url_query = urlparse.urlparse(uri)
    parsed_query = urlparse.parse_qs(url_query.query)
    for key, value in parsed_query.items():
        crlf_payloads = fetch_crlf_payload()
        for payload in crlf_payloads:
            par_key.update(parsed_query)
            par_key[key] = payload
            parsed_uri = urlparse.urlparse(
                uri).scheme + "://" + urlparse.urlparse(
                    uri).netloc + urlparse.urlparse(
                        uri).path + "?" + urlparse.urlparse(uri).query.replace(
                            value[0], payload)
            crlf_get_method = req.api_request(parsed_uri, "GET", headers)
            for name in crlf_get_method.headers:
                if "CRLF-Test" in name:
                    attack_result = {
                        "id": 13,
                        "scanid": scanid,
                        "url": parsed_uri,
                        "alert": "CRLF injection",
                        "impact": "High",
                        "req_headers": headers,
                        "req_body": "NA",
                        "res_headers": crlf_get_method.headers,
                        "res_body": crlf_get_method.text
                    }
                    dbupdate.insert_record(attack_result)
                    print "[+]{0} is vulnerable to CRLF injection".format(
                        parsed_uri)
                    return
def rendered_wall_posts(wall_posts):
    for wall_post in wall_posts:
        title = ''
        desc = ''
        site_image = ''
        article_title = ''
        urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', wall_post.data['post_content'])
        for url in urls:
            parse_obj = urlparse.urlparse(url)
            site = parse_obj.netloc
            path = parse_obj.path
            conn = httplib.HTTPConnection(site)
            conn.request('HEAD', path)
            response = conn.getresponse()
            conn.close()
            ctype = response.getheader('Content-Type')
            if response.status < 400 and ctype.startswith('image'):
                wall_post.data['post_content'] = wall_post.data['post_content'] + "<br/><a href='" + url + "' target='_blank'><img width=300 src='" + url + "' target = '_blank'/></a>"
            else:
                og = opengraph.OpenGraph(url)
                if not len(og.items()) == 2:
                    for x, y in og.items():
                        if x == 'type' and y == 'video':
                            for k, l in og.items():
                                if k == 'site_name' and l == 'YouTube':
                                    url_data = urlparse.urlparse(url)
                                    query = urlparse.parse_qs(url_data.query)
                                    video = query["v"][0]
                                    wall_post.data['post_content'] = wall_post.data['post_content'].replace(url, "") + "<br/><iframe width='300' height='200' src='//www.youtube.com/embed/" + video + "' frameborder='0' allowfullscreen></iframe>"
                                elif k == 'site_name' and l == 'Vimeo':
                                    url_data = urlparse.urlparse(url)
                                    video = url_data.path
                                    wall_post.data['post_content'] = wall_post.data['post_content'].replace(url, "") + "<br/><iframe src='//player.vimeo.com/video" + video + "' width='300' height='200' frameborder='0' webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe> <p></p>"
                        elif x == 'type' and y == 'article':
                            for k, l in og.items():
                                if k == 'title':
                                    article_title = l
                                elif k == 'site_name':
                                    title = l
                                elif k == 'description':
                                    desc = l
                                elif k == 'image':
                                    site_image = l
                            wall_post.data['post_content'] = wall_post.data['post_content'] + "<br/><table><tr><td><img width='50' src='" + site_image + "'</td><td><a href='" + url + "' target='_blank'/>" + article_title + "</a><br/>" + title + "</td></td></table>"
                        elif x == 'type':
                            for k, l in og.items():
                                if k == 'site_name':
                                    title = l
                                elif k == 'description':
                                    desc = l
                                elif k == 'image':
                                    site_image = l
                            wall_post.data['post_content'] = wall_post.data['post_content'].replace(url, "<table><tr><td><img width='50' src='" + site_image + "'</td><td><a href='" + url + "' target='_blank'/>" + title + "</a><br/>" + desc + "</td></td></table>")
                else:
                    wall_post.data['post_content'] = wall_post.data['post_content'].replace(url, "<a href='" + url + "' target='_blank'>" + url + "</a>")
    return wall_posts
def searchcrawler(url):
    html = get_html(url)
    # print url
    if html:
        soup = BeautifulSoup(html, fromEncoding='gbk')
        items_row = soup.findAll('div', {'class': 'item-box st-itembox'})
        if items_row:
            print '=======================row search row=========================='
            for item in items_row:
                # print item
                item_info = item.find('h3', {'class': 'summary'}).a
                item_url = item_info['href']
                # print item_url
                sid_info = item.find('div', {'class': 'col seller feature-dsi-tgr'}).a
                print sid_info
                sid_item_url = sid_info['href']
                sid_url_info = urlparse.urlparse(sid_item_url)
                sid_id = urlparse.parse_qs(sid_url_info.query, True)['user_number_id'][0]
                print sid_id
                judge_site(item_url, sid_id)
                # logging.warning(item_id)
                # # download_reply_by_id(item_id)
        items_col = soup.findAll('div', {'class': 'product-item row icon-datalink'})
        if items_col:
            print '=======================row search col=========================='
            #print items
            for item in items_col:
                item_info = item.find('div', {'class': 'title'}).a
                item_url = item_info['href']
                # url_info = urlparse.urlparse(item_url)
                # item_id = urlparse.parse_qs(url_info.query,True)['id'][0]
                print item_url
                # print item_id
                sid_info = item.find('div', {'class': 'seller'}).a
                print sid_info
                sid_item_url = sid_info['href']
                sid_url_info = urlparse.urlparse(sid_item_url)
                sid_id = urlparse.parse_qs(sid_url_info.query, True)['user_number_id'][0]
                print sid_id
                judge_site(item_url, sid_id)
def is_local_service(name):
    """
    Determine if a service definition describes a service running on the
    local node. This is true if the service URL is for localhost, matches
    the machine's name, or ec2 public name
    """
    if name is None:
        return False
    if "://" in name:
        url = urlparse.urlparse(name)
        if ":" in url.netloc:
            name = url.netloc.split(":")[0]
        else:
            name = url.netloc
    elif ":" in name:
        name = name.split(":")[0]
    if name == "localhost":
        return True
    if '.' in name:
        name = name.split('.')[0]
    node = platform.node()
    if '.' in node:
        node = node.split('.')[0]
    if name == node:
        return True
    pn = public_name()
    if pn is not None and pn.split(".")[0] == name:
        return True
    return False
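
# Minimal usage sketch for is_local_service above (hypothetical inputs, not from the
# original source); a URL whose host is localhost is reported as local, None is not.
print(is_local_service("http://localhost:8080/api"))  # True
print(is_local_service(None))                          # False
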
def gensitemap(server, urlformat):
    '''
    Builds the sitemap index URL for the given file server. Connects to the
    second-level indexes and retrieves their modification date.

    @type server: dict-like
    @param server: server document exactly as it comes from MongoDB

    @rtype: tuple (str, datetime) or None
    @return: tuple with the url and its modification date, or None if the
             url cannot be obtained.
    '''
    subdomain = server["ip"].split(".")[0]
    serverno = int(subdomain[6:])
    url = urlformat % serverno
    domain = urlparse.urlparse(url)[1]
    con = httplib.HTTPConnection(domain)
    con.request("HEAD", url)
    response = con.getresponse()
    if response.status == 200:
        mtime = time.mktime(time.strptime(
            response.getheader("last-Modified"),
            "%a, %d %b %Y %H:%M:%S %Z"))
        return (url, datetime.datetime.fromtimestamp(mtime))
    return None
def startupagent(self, sender, **kwargs):
    if not self.bind_web_address:
        _log.info('Web server not started.')
        return
    import urlparse
    parsed = urlparse.urlparse(self.bind_web_address)
    hostname = parsed.hostname
    port = parsed.port
    _log.info('Starting web server binding to {}:{}.' \
              .format(hostname, port))
    self.registeredroutes.append((re.compile('^/discovery/$'), 'callable',
                                  self._get_discovery))
    self.registeredroutes.append((re.compile('^/discovery/allow$'),
                                  'callable', self._allow))
    self.registeredroutes.append((re.compile('^/$'), 'callable',
                                  self._redirect_index))
    port = int(port)
    vhome = os.environ.get('VOLTTRON_HOME')
    logdir = os.path.join(vhome, "log")
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    self.appContainer = WebApplicationWrapper(self, hostname, port)
    svr = WSGIServer((hostname, port), self.appContainer)
    self._server_greenlet = gevent.spawn(svr.serve_forever)
def fps_ipn_handler(self, request):
    uri = request.build_absolute_uri()
    parsed_url = urlparse.urlparse(uri)
    resp = self.fps_connection.verify_signature(
        UrlEndPoint="%s://%s%s" % (parsed_url.scheme,
                                   parsed_url.netloc,
                                   parsed_url.path),
        HttpParameters=request.body)
    if not resp.VerifySignatureResult.VerificationStatus == "Success":
        return HttpResponseForbidden()

    data = dict(map(lambda x: x.split("="), request.body.split("&")))
    for (key, val) in data.items():
        data[key] = urllib.unquote_plus(val)
    if AmazonFPSResponse.objects.filter(transactionId=data["transactionId"]).count():
        resp = AmazonFPSResponse.objects.get(transactionId=data["transactionId"])
    else:
        resp = AmazonFPSResponse()
    for (key, val) in data.items():
        attr_exists = hasattr(resp, key)
        if attr_exists and not callable(getattr(resp, key, None)):
            if key == "transactionDate":
                val = datetime.datetime(*time.localtime(float(val))[:6])
            setattr(resp, key, val)
    resp.save()

    if resp.statusCode == "Success":
        transaction_was_successful.send(sender=self.__class__,
                                        type=data["operation"],
                                        response=resp)
    else:
        if not "Pending" in resp.statusCode:
            transaction_was_unsuccessful.send(sender=self.__class__,
                                              type=data["operation"],
                                              response=resp)
    # Return a HttpResponse to prevent django from complaining
    return HttpResponse(resp.statusCode)
def startupagent(self, sender, **kwargs):
    if not self.bind_web_address:
        _log.info('Web server not started.')
        return
    import urlparse
    parsed = urlparse.urlparse(self.bind_web_address)
    hostname = parsed.hostname
    port = parsed.port
    _log.info('Starting web server binding to {}:{}.' \
              .format(hostname, port))
    self.registeredroutes.append((re.compile('^/discovery/$'), 'callable',
                                  self._get_discovery))
    self.registeredroutes.append((re.compile('^/discovery/allow$'),
                                  'callable', self._allow))
    self.registeredroutes.append((re.compile('^/$'), 'callable',
                                  self._redirect_index))
    port = int(port)
    vhome = os.environ.get('VOLTTRON_HOME')
    logdir = os.path.join(vhome, "log")
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    with open(os.path.join(logdir, 'web.access.log'), 'wb') as accesslog:
        with open(os.path.join(logdir, 'web.error.log'), 'wb') as errlog:
            server = pywsgi.WSGIServer((hostname, port), self.app_routing,
                                       log=accesslog, error_log=errlog)
            server.serve_forever()
def iriToUri(self, iri):
    import urlparse
    parts = urlparse.urlparse(iri.decode('utf-8'))
    return urlparse.urlunparse(
        part.encode('idna') if parti == 1 else self.urlEncodeNonAscii(part.encode('utf-8'))
        for parti, part in enumerate(parts))
def judge_site(url, keyword=''):
    """
    Determine whether an item is a Taobao (tb) or Tmall (tm) item
    """
    url_info = urlparse.urlparse(url)
    urlkey = urlparse.parse_qs(url_info.query, True)
    iid = int(urlkey['id'][0])
    #print 'url_info:',url_info[1]
    try:
        if url_info[1] == 'detail.tmall.com':
            print 'it is a tm item'
            if check_item_update_time(iid, 'tm'):
                return
            data = getTmallItemInfo(iid, keyword)
        elif urlkey.get('cm_id'):
            print 'it is a tm item'
            if check_item_update_time(iid, 'tm'):
                return
            data = getTmallItemInfo(iid, keyword)
        else:
            print 'it is a tb item'
            if check_item_update_time(iid, 'tb'):
                return
            data = getTaobaoItemInfo(iid, keyword)
    except Exception, e:
        traceback.print_exc()
        return
def post(self):
    try:
        name = self.request.POST['name']
        topic = MicroTopic.all().filter('name =', name).get()
        if not topic:
            raise ReatiweError("Topic %s does not exist." % name)
        if self.request.POST['mode']:
            mode = self.request.POST['mode']
        else:
            mode = "subscribe"
        form_fields = {
            "hub.mode": mode,
            "hub.callback": "%s/callback/%s" % (settings.SITE_URL, topic.name),
            "hub.topic": topic.url,
            "hub.verify": "sync",
            "hub.verify_token": topic.name
        }
        result = 200
        url = self.request.POST['hub']
        req = urllib2.Request(url, urllib.urlencode(form_fields))
        o = urlparse.urlparse(url)
        # superfeedr support
        if o.username and o.password:
            base64string = base64.encodestring('%s:%s' % (o.username, o.password))[:-1]
            authheader = "Basic %s" % base64string
            new_url = "%s://%s%s" % (o.scheme, o.hostname, o.path)
            req = urllib2.Request(new_url, urllib.urlencode(form_fields))
            req.add_header("Authorization", authheader)
        urllib2.urlopen(req)
    except DownloadError, e:
        logging.error('DownloadError: %s' % repr(e))
        pass
def listsCategoriesMenu(self, url):
    query_data = {
        'url': url,
        'use_host': False,
        'use_cookie': False,
        'use_post': False,
        'return_data': True
    }
    link = self.cm.getURLRequestData(query_data)
    # how many videos are there?
    match = re.compile(
        '<li class="active"id="mVid"><a href="#" onclick="moreVideo\(\);return false;">Video \((.*?)\)</a></li>',
        re.DOTALL).findall(link)
    ilejest = int(match[0])
    policz = int(ilejest / o_filmow_na_stronie) + 1
    max_stron = policz
    parsed = urlparse.urlparse(url)
    typ = urlparse.parse_qs(parsed.query)['s'][0]
    for i in range(0, (policz)):
        purl = 'http://www.cda.pl/video/show/ca%C5%82e_filmy_or_ca%C5%82y_film/p' + str(i + 1) + '?s=' + typ
        self.add('cdapl', 'categories-menu', 'Strona ' + str(i + 1), 'None',
                 'None', purl, 'None', 'None', True, False, str(i + 1))
    xbmcplugin.endOfDirectory(int(sys.argv[1]))
def _extracturls(self):
    #print "Extract URLs"
    urls = []
    htmlsrc, charset, parenturl = self.htmlSrcTuple
    if htmlsrc != None:
        resulturls = []
        urlExtractor = ExtractLinks(resulturls)
        try:
            if charset == None:
                urlExtractor.feed(htmlsrc)
            else:
                urlExtractor.feed(htmlsrc.decode(charset))
        except HTMLParser.HTMLParseError:
            pass
        try:
            urlExtractor.reset()  # I think close needs special treatment .close()
        except HTMLParser.HTMLParseError:
            urlExtractor.reset()
        #this piece of code forms the URIs to full URLs by joining the
        #parenturl with the network location free URLs extracted
        for i in xrange(len(resulturls)):  #replacing range() for performance reasons
            urlres = urlparse.urlparse(resulturls[i], "http")
            if urlres.netloc == "":
                resulturls[i] = urlparse.urljoin(parenturl, resulturls[i])
        urls.extend(resulturls)
    return urls
def get_token_from_url(url):
    # Extract the token from the url
    import urlparse
    result = urlparse.urlparse(url)
    param_dict = urlparse.parse_qs(result.query)
    tk = param_dict['tk'][0]
    return tk
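
# Minimal usage sketch for get_token_from_url above (hypothetical URL, not from the
# original source); parse_qs returns a list per key, so the first 'tk' value is used.
print(get_token_from_url('http://example.com/auth?tk=abc123&uid=7'))  # abc123
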
def feature_extract(url_input):
    Feature = {}
    tokens_words = re.split('\W+', url_input)
    host = urlparse.urljoin(url_input, '/')
    path = urlparse.urlparse(url_input).path
    Feature['URL'] = url_input
    Feature['rank_host'], Feature['rank_country'] = sitepopularity(host)
    Feature['host'] = host
    Feature['Length_of_url'] = len(url_input)
    Feature['Length_of_host'] = len(host)
    Feature['No_of_dots'] = url_input.count('.')
    Feature['sec_sen_word_cnt'] = Security_sensitive(tokens_words)
    Feature['IPaddress_presence'] = Check_IPaddress(tokens_words)
    Feature['avg_token_length'], Feature['token_count'], Feature['largest_token'] = Tokenise(url_input)
    Feature['avg_domain_token_length'], Feature['domain_token_count'], Feature['largest_domain'] = Tokenise(host)
    Feature['avg_path_token'], Feature['path_token_count'], Feature['largest_path'] = Tokenise(path)
    Feature['ASNno'] = getASN(host)
    Feature['safebrowsing'] = safebrowsing(url_input)
    Feature['numTld'] = numTld(url_input)
    Feature['numPunctuation'] = numPunctuation(url_input)
    return Feature
def searchcrawler(url):
    html = get_html(url)
    # print url
    if html:
        soup = BeautifulSoup(html, fromEncoding='gbk')
        items_row = soup.findAll('div', {'class': 'product-iWrap'})
        #items_row = soup.find('div',{'class':'item-box st-itembox'})
        # print items_row
        if items_row:
            print '=======================row search row=========================='
            for item in items_row:
                # print item
                try:
                    item_info = item.find('p', {'class': 'productTitle'}).a
                except:
                    item_info = item.find('div', {'class': 'productTitle productTitle-spu'}).a
                # print item_info
                item_url = item_info['href']
                # print item_url
                url_info = urlparse.urlparse(item_url)
                item_id = urlparse.parse_qs(url_info.query, True)['id'][0]
                print item_id
                logging.warning(item_id)
                # item_id = 16862466992
                download_reply_by_id(item_id)
def wait_for_servers(urls, timeout):
    import time, urlparse, httplib
    from ssl import SSLError
    for u in urls:
        parsed = urlparse.urlparse(u.lower(), "https")
        netloc = parsed.hostname
        if parsed.port:
            netloc = "%s:%s" % (netloc, parsed.port)
        if parsed.scheme == "http":
            cnxn = httplib.HTTPConnection(netloc)
        elif parsed.scheme == "https":
            cnxn = httplib.HTTPSConnection(netloc)
        else:
            raise Exception("Don't know how to handle scheme %s" % parsed.scheme)
        i = 0
        while i < timeout:
            try:
                cnxn.connect()
            except SSLError:
                break
            except Exception as e:
                if "Connection refused" in str(e):
                    time.sleep(1)
                    i += 1  # count the retry so the loop respects the timeout
                elif "SSL" in str(e):
                    break
                else:
                    raise
            else:
                break
def judge_site(url, sid_id):
    """
    Determine whether an item is a Taobao (tb) or Tmall (tm) item
    """
    url_info = urlparse.urlparse(url)
    urlkey = urlparse.parse_qs(url_info.query, True)
    iid = int(urlkey['id'][0])
    print iid
    # print 'url_info:',url_info[1]
    try:
        if url_info[1] == 'detail.tmall.com':
            print 'it is a tm item'
            # data = download_tm_reply_by_id(iid)
        elif urlkey.get('cm_id'):
            print 'it is a tm item cm_id'
            # data = download_tm_reply_by_id(iid)
        else:
            print 'it is a tb item'
            data = download_tb_reply_by_id(iid, sid_id)
    except Exception, e:
        traceback.print_exc()
        return
def getParams(path):
    query = urlparse.urlparse(path).query
    queryDict = dict([x.split('=') for x in query.split('&')])
    width = queryDict['WIDTH']
    height = queryDict['HEIGHT']
    bbox = queryDict['BBOX']
    return Params(int(width), int(height), map(float, bbox.split(',')))
def parse_failover(self, conf_service):
    """ Parse manual service values.

    consul:alpha#ip=192.168.1.1&port=80&ip=192.168.1.1&port=22 => 192.168.1.1:80,192.168.1.1:22
    consul:bravo?ip#ip=192.168.1.10 => 192.168.1.10
    consul:charlie?user++passwd#user=sa&passwd=guest => sa++guest
    consul:delta?user#passwd=guest => ""
    consul:echo?port#port=80&ip=localhost&port=22 => 80,22
    consul:foxtrot => consul:foxtrot
    consul:golf?user=passwd#user=sa&passwd=guest&user=root => sa=guest,root=
    """
    # Manual query string.
    _, _, man_qs = conf_service.partition("#")
    if not man_qs:
        # There isn't a manual value.
        return conf_service

    # Parse manual configures.
    man_qs = "?%s" % man_qs
    if is_python3:
        qs_dic = parse.parse_qs(urlparse.urlparse(man_qs).query)
    else:
        qs_dic = urlparse.parse_qs(urlparse.urlparse(man_qs).query)
    if not qs_dic:
        # There isn't a manual value.
        return conf_service

    _, _, seg = conf_service.partition("?")
    keys = self.parse_qs_keys(seg)
    if not keys:
        keys = ["ip", ":", "port"]

    def parse():
        max_l = max(len(lst) for lst in qs_dic.values())
        for i in range(max_l):
            for j in range(len(keys)):
                k = keys[j]
                if k[0] in string.punctuation:
                    yield k
                    continue
                lst = qs_dic.get(k, [])
                yield lst[i] if i < len(lst) else ""
            if i < max_l - 1:
                yield ","

    return "".join(v for v in parse()).strip(",")
def _get_store(self, uri):
    if os.path.isabs(uri):  # to support win32 paths like: C:\\some\dir
        scheme = 'file'
    else:
        scheme = urlparse.urlparse(uri).scheme
    store_cls = self.STORE_SCHEMES[scheme]
    return store_cls(uri)
def sethash(self):
    request = 'http://' + self.host + self.url.split('?')[0]
    dic = urlparse.urlparse('http://' + self.host + self.url).query.split('&')
    for d in dic:
        request += d.split('=')[0] + '=&'
    request += "|"
    for d in self.headers['postdata'].split('&'):
        request += d.split('=')[0] + '=&'
    self.hash = md5(request).hexdigest()
def crlf_get_url_method(uri, headers, scanid=None):
    # This function checks CRLF through GET URL method.
    crlf_payloads = fetch_crlf_payload()
    for payload in crlf_payloads:
        parsed_uri = urlparse.urlparse(uri).scheme + "://" + urlparse.urlparse(
            uri).netloc + urlparse.urlparse(uri).path + "/" + payload
        crlf_get_method = req.api_request(parsed_uri, "GET", headers)
        for name in crlf_get_method.headers:
            if "CRLF-Test" in name:
                attack_result = {"id": 13,
                                 "scanid": scanid,
                                 "url": parsed_uri,
                                 "alert": "CRLF injection",
                                 "impact": "High",
                                 "req_headers": headers,
                                 "req_body": "NA",
                                 "res_headers": crlf_get_method.headers,
                                 "res_body": crlf_get_method.text}
                dbupdate.insert_record(attack_result)
                print "[+]{0} is vulnerable to CRLF injection".format(
                    parsed_uri)
                return
def get_sub_domain_from_http_host(http_host):
    '''
    @note: get the subdomain prefix from the http host
    '''
    import urlparse
    if http_host:
        http_host = ('http://%s' % http_host) if not http_host.startswith('http') else http_host
        prefix = urlparse.urlparse(http_host)[1].split('.', 1)[0]
        return prefix
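
# Minimal usage sketch for get_sub_domain_from_http_host above (hypothetical host
# values, not from the original source); a bare host and a full URL both work.
print(get_sub_domain_from_http_host('blog.example.com'))           # blog
print(get_sub_domain_from_http_host('http://shop.example.com/x'))  # shop
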
def parse_query_string(url, key):
    if is_py2:
        import urlparse
        parsed_url = urlparse.urlparse(url)
        return urlparse.parse_qs(parsed_url.query)[key][0]
    else:
        from urllib.parse import urlparse
        from urllib.parse import parse_qs
        parsed_url = urlparse(url)
        return parse_qs(parsed_url.query)[key][0]
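
# Minimal usage sketch for parse_query_string above (hypothetical URL; assumes the
# module's is_py2 flag is set): returns the first value of the named query parameter.
print(parse_query_string('http://example.com/page?item=42&lang=en', 'item'))  # 42
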
def relative_uri(base, target):
    """
    >>> relative_uri(u"http://example.com/foo/", u"http://example.com/foo/bar")
    u'bar'
    >>> relative_uri(u"http://example.com/baz/", u"http://example.com/foo/bar")
    u'../foo/bar'
    >>> relative_uri(u"http://example2.com/baz/", u"http://example.com/foo/bar")
    u'http://example.com/foo/bar'
    """
    base_bits = urlparse.urlparse(base)
    target_bits = urlparse.urlparse(target)
    if base_bits.netloc != target_bits.netloc:
        return target
    base_dir = '.' + posixpath.dirname(base_bits.path)
    target = '.' + target_bits.path
    return posixpath.relpath(target, start=base_dir)
def get(self):
    self.response.headers['Content-Type'] = 'text/html'
    path = os.path.join(os.path.dirname(__file__), 'admin.html')
    u = urlparse.urlparse(self.request.url)
    dashboard = ""
    if u.netloc.startswith("localhost"):
        dashboard = "/_ah/admin"
    else:
        appname = u.netloc[:u.netloc.find(".")]
        dashboard = "https://appengine.google.com/dashboard?&app_id=s~" + appname
    self.response.out.write(template.render(path, {"dashboard": dashboard}))
def add_params(url, params):
    import urllib
    import urlparse
    url_parts = list(urlparse.urlparse(url))
    query = dict(urlparse.parse_qsl(url_parts[4]))
    query.update(params)
    url_parts[4] = urllib.urlencode(query)
    return urlparse.urlunparse(url_parts)
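
# Minimal usage sketch for add_params above (hypothetical URL, not from the original
# source); existing query parameters are kept and the new ones merged in (order may vary).
print(add_params('http://example.com/search?q=python', {'page': '2'}))
# e.g. http://example.com/search?q=python&page=2
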
def extract_query_params(url, *names):
    """
    Extracts names in the list from url
    @param url:
    @param names:
    @return: dict
    """
    parsed_res = urlparse.urlparse(url)
    d = urlparse.parse_qs(parsed_res.query)
    return {key: value[0] for (key, value) in d.iteritems() if key in names}
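
# Minimal usage sketch for extract_query_params above (hypothetical URL; assumes
# urlparse is imported at module scope): only the requested keys are returned,
# first value each.
print(extract_query_params('http://example.com/?a=1&b=2&c=3', 'a', 'c'))  # {'a': '1', 'c': '3'}
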
def qs(url):
    query = urlparse.urlparse(url).query
    res = dict([(k, v[0]) for k, v in urlparse.parse_qs(query).items()])
    res1 = {}
    if res.get('redirect', {}):
        res1 = qs(res.get('redirect', {}))
    res.update(res1)
    # todo if one ticket in redirect
    # and one ticket in normal path, deal ticket
    return res
def get_tag_uri(url, date):
    "Creates a TagURI. See http://diveintomark.org/archives/2004/05/28/howto-atom-id"
    parts = urlparse.urlparse(url)
    date_part = ""
    if date is not None:
        date_part = ",%s:" % date.strftime("%Y-%m-%d")
    return "tag:%s%s%s/%s" % (
        parts.hostname,
        date_part,
        parts.path,
        parts.fragment,
    )
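
# Minimal usage sketch for get_tag_uri above (hypothetical inputs; assumes urlparse
# is imported at module scope).
import datetime
print(get_tag_uri('http://example.com/posts/1#intro', datetime.date(2004, 5, 28)))
# tag:example.com,2004-05-28:/posts/1/intro
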
def getTmallItemInfo(iid, keyword=''):
    """
    Fetch Tmall item information
    """
    temp = {'site': 'tm', 'itemid': iid, 'keyword': keyword}
    patt_list = {
        #r""""sellerNickName"\s*:\s*(.*)'\s*,'isEcardAuction'""",
        'sellerid': r"'userId'\s*:\s*'(\w*)',",
        'shopid': r'rstShopId:(\w*),',
        'brand': r"'brand'\s*:\s*(.*)'\s*,'brandId'",
        'brandid': r"'brandId'\s*:\s*'(\w*)'",
        'total_count': r'totalSQ=(\w*)',
    }
    html = get_html("http://detail.tmall.com/item.htm?id=%s" % iid)
    #print 'html:',html
    htmlutf = html.replace('\r\n', '').replace('\t', '')
    soup = BeautifulSoup(html, fromEncoding='gbk')
    temp['shopurl'] = urlparse.urlparse(soup.find('span', {'class': 'slogo'}).a['href']).netloc
    temp['itemname'] = soup.find('input', {'name': 'title'})['value']
    temp['region'] = soup.find('input', {'name': 'region'})['value']
    temp['sellername'] = soup.find('input', {'name': 'seller_nickname'})['value']
    for k in patt_list:
        patt = patt_list[k]
        temp[k] = re.findall(patt, htmlutf)[0]
    url = "http://mdskip.taobao.com/core/initItemDetail.htm?tmallBuySupport=true&itemId=%s&service3C=true" % (iid)
    data = get_html(url, referer="http://detail.tmall.com/item.htm?id=%s" % iid).decode('gbk')  #.replace('\r\n','').replace('\t','')
    patt = '"priceInfo":(\{.*\}),"promType"'
    price_info = re.findall(patt, data)
    if price_info:
        price_info = json.loads(price_info[0])
        #print 'price_info:',price_info
        if price_info.get('def'):
            temp['price'] = float(price_info['def']['price'])
            if price_info['def']['promotionList']:
                temp['realprice'] = float(price_info['def']['promotionList'][0]['price'])
            else:
                if price_info['def'].get('tagPrice'):
                    temp['realprice'] = float(price_info['def']['tagPrice'])
                else:
                    temp['realprice'] = float(price_info['def']['price'])
        else:
            temp['price'] = float(price_info[price_info.keys()[0]]['price'])
            temp['realprice'] = float(price_info[price_info.keys()[0]]['price'])
    patt = '"sellCountDO":(\{.*\}),"serviceDO"'
    quantity_info = re.findall(patt, data)
    if quantity_info:
        quantity = re.findall(r'"sellCount":(\w*)', quantity_info[0])[0]
        print 'quantity :', quantity
        temp['quantity'] = float(quantity)
    return temp
def __init__(self, data_src):
    self.url = data_src.src_id
    self.source_node = data_src
    self.parameters = data_src.get_parameters()
    # set the src type
    self.source_type = DataSourceType.objects.get(name='SiteLinkLoader')
    # check for crucial parameters
    self.article_link_selector = self.parameters.get('article-link-selector', '')
    self.article_css_selector = self.parameters.get('article-css-selector', '')
    self.fetch_limit = self.parameters.get('fetch-limit', 50)
    self.hostname = urlparse.urlparse(self.url).hostname
def getUrl(self):
    import urllib2, urlparse, os
    dlurl = 'http://www.stsci.edu/resources/software_hardware/pyfits/Download'
    uf = urllib2.urlopen(dlurl)
    try:
        self.feed(uf.read())
    finally:
        uf.close()
    url = self.dlurl
    fn = os.path.split(urlparse.urlparse(url).path)[-1]
    return url, fn
def get_url_parts(url):
    '''
    Extract the parts of a url
    :param url: full url
    :return: the six parts (Scheme, Hostname, Domain, Path, Param, Query)
    '''
    result = urlparse.urlparse(url)
    scheme = result.scheme
    hostname = result.netloc
    try:
        domain = get_tld(url)
    except Exception, e:
        domain = hostname
    # Assumed completion (the snippet ends here): return the six parts named in the docstring.
    return scheme, hostname, domain, result.path, result.params, result.query
def verify_signature(self, url):
    '''
    http://docs.aws.amazon.com/AmazonFPS/latest/FPSAPIReference/VerifySignatureAPI.html
    '''
    p = urlparse.urlparse(url)
    http_parameters = p.query
    url_end_point = '{0.scheme}://{0.netloc}{0.path}'.format(p)
    params = {
        'Action': 'VerifySignature',
        'UrlEndPoint': url_end_point,
        'HttpParameters': http_parameters,
    }
    return params
def __init__(self, key, secret=None, secure=True, host=None, path=None,
             port=None, url=None, *args, **kwargs):
    if url:
        parsed = urlparse.urlparse(url)
        path = parsed.path
        scheme = parsed.scheme
        split = parsed.netloc.split(':')
        if len(split) == 1:
            # No port provided, use the default one
            host = parsed.netloc
            port = 443 if scheme == 'https' else 80
        else:
            host = split[0]
            port = int(split[1])
    else:
        host = host if host else self.host
        path = path if path else self.path

    if path is not None:
        self.path = path
    if host is not None:
        self.host = host

    if (self.type == Provider.CLOUDSTACK) and (not host or not path):
        raise Exception('When instantiating CloudStack driver directly '
                        'you also need to provide url or host and path '
                        'argument')

    region = kwargs.get('region', None)
    if region is not None:
        self.region = region

    super(InterouteNodeDriver, self).__init__(key=key, secret=secret,
                                              secure=secure, host=host,
                                              port=port)
def wget(self, url, download, content):
    import uuid
    import urlparse, os
    local = download['local_path']
    pathX = urlparse.urlparse(url).path
    ext = os.path.splitext(pathX)[1]
    if ext.lower() not in [".png", ".jpg", ".gif", ".jpeg"]:
        print "Create {}".format(url)
        fileName = "{}{}".format(uuid.uuid4(), ext)
        fileUrl = open("{}\{}".format(local, fileName), "wb")
        fileUrl.write(url)
        fileUrl.write("\n")
        fileUrl.write(content)
        fileUrl.close()
    pass
def _parseURL(url):
    if url:
        import urlparse
        url = urlparse.urlparse(url)
        protocol = url.scheme
        host = url.hostname
        if url.port:
            try:
                port = int(url.port)
            except ValueError:
                message = "Invalid port number %s in URL" % url.port
                raise SessionArgumentException(message)
        else:
            port = (protocol == "http") and 80 or 443
        path = url.path
        return host, port, protocol, path
def clean_url(self, url=None):
    import cgi
    import urlparse
    import urllib
    if not url:
        url = self.url
    u = urlparse.urlparse(url)
    qs = cgi.parse_qs(u[4])
    qs = dict((k, v) for k, v in qs.iteritems() if not k.startswith('utm_'))
    u = u._replace(query=urllib.urlencode(qs, True))
    url = urlparse.urlunparse(u)
    url = url.replace('#!', '?_escaped_fragment_=')
    self.logger.info("cleaned url : %s" % url)
    return url
def openAnything(source, etag=None, lastmodified=None, agent=USER_AGENT):
    '''URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner. Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the lastmodified argument is supplied, it must be a formatted
    date/time string in GMT (as returned in the Last-Modified header of a
    previous request). The formatted date/time will be used as the value
    of an If-Modified-Since request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.
    '''
    if hasattr(source, 'read'):
        return source

    if source == '-':
        return sys.stdin

    if urlparse.urlparse(source)[0] == 'http':
        # open URL with urllib2
        request = urllib2.Request(source)
        request.add_header('User-Agent', agent)
        if etag:
            request.add_header('If-None-Match', etag)
        if lastmodified:
            request.add_header('If-Modified-Since', lastmodified)
        request.add_header('Accept-encoding', 'gzip')
        opener = urllib2.build_opener(SmartRedirectHandler(), DefaultErrorHandler())
        return opener.open(request)

    # try to open with native open function (if source is a filename)
    try:
        return open(source)
    except (IOError, OSError):
        pass

    # treat source as string
    return StringIO(str(source))
def is_valid(url):
    '''
    Function returns True or False based on whether the url has to be
    downloaded or not. Robot rules and duplication rules are checked
    separately. This is a great place to filter out crawler traps.
    '''
    global invalid_links
    if invalid_links == "":
        invalid_links = 0
    parsed = urlparse.urlparse(url)
    if parsed.scheme not in set(["http", "https"]):
        invalid_links += 1
        write_invalid_file = open("invalid.txt", "w")
        write_invalid_file.write(str(invalid_links))
        return False
    try:
        r = requests.get(url)
        if r.status_code != 200:
            invalid_links += 1
            write_invalid_file = open("invalid.txt", "w")
            write_invalid_file.write(str(invalid_links))
            return False
        lower_ = ".ics.uci.edu" in parsed.hostname and not re.match(
            ".*\.(css|js|bmp|gif|jpe?g|ico" + "|png|tiff?|mid|mp2|mp3|mp4" +
            "|wav|avi|mov|mpeg|ram|m4v|mkv|ogg|ogv|pdf" +
            "|ps|eps|tex|ppt|pptx|doc|docx|xls|xlsx|names|data|dat|exe|bz2|tar|msi|bin|7z|psd|dmg|iso|epub|dll|cnf|tgz|sha1" +
            "|thmx|mso|arff|rtf|jar|csv" +
            "|rm|smil|wmv|swf|wma|zip|rar|gz)$", parsed.path.lower())
        if not lower_:
            invalid_links += 1
            write_invalid_file = open("invalid.txt", "w")
            write_invalid_file.write(str(invalid_links))
        # return the validity flag in both cases, not only when the url is invalid
        return lower_
    except TypeError:
        print("TypeError for ", parsed)
        return False
    except requests.exceptions.ConnectionError:
        invalid_links += 1
        write_invalid_file = open("invalid.txt", "w")
        write_invalid_file.write(str(invalid_links))
        return False
def vimeo_video_id(value):
    """
    Examples:
    - https://vimeo.com/11111111
    - http://vimeo.com/11111111
    - https://www.vimeo.com/11111111
    - http://www.vimeo.com/11111111
    - https://vimeo.com/channels/11111111
    - http://vimeo.com/channels/11111111
    - https://vimeo.com/groups/name/videos/11111111
    """
    import urlparse
    parsed_url = urlparse.urlparse(value)
    parsed_url_path_parts = parsed_url.path.lstrip('/').split('/')
    return parsed_url_path_parts[-1]
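
# Minimal usage sketch for vimeo_video_id above (hypothetical URLs, not from the
# original source): the video id is always the last path segment.
print(vimeo_video_id('https://vimeo.com/11111111'))                     # 11111111
print(vimeo_video_id('https://vimeo.com/groups/name/videos/22222222'))  # 22222222
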
def get_params(url, specify_params=False):
    """ Returns the base URL, URL parameters and the given URL """
    if specify_params:
        params = specify_params.split(',')
        check_params = parseurl.split('&')
        x = set(check_params)
        y = set(params)
        requ_param = list(x.difference(y))
    else:
        params = []
    parsed = urlparse.urlparse(url)
    fun_params = urlparse.parse_qsl(parsed.query)
    for param in fun_params:
        params.append(param)
    return params