Code example #1
def searchcrawler(url,keyword=''):
    """
    Crawler for the Taobao (tb) search results page
    """
    html=get_html(url)
    #print html
    if html:
        soup = BeautifulSoup(html,fromEncoding='gbk')
        items_row = soup.findAll('div',{'class':'row item icon-datalink'})
        if items_row:
            print '=======================row search row=========================='
            #print items
            for item in items_row:
                item_info = item.find('div',{'class':'col title'}).h3.a
                item_url = item_info['href']
                url_info = urlparse.urlparse(item_url)
                item_id = urlparse.parse_qs(url_info.query,True)['id'][0]
                print item_url
                print item_id
                judge_site(item_url,keyword)
        items_col = soup.findAll('div',{'class':'col item icon-datalink'})
        if items_col:
            print '=======================row search col=========================='
            #print items
            for item in items_col:
                item_info = item.find('div',{'class':'item-box'}).h3.a
                item_url = item_info['href']
                url_info = urlparse.urlparse(item_url)
                item_id = urlparse.parse_qs(url_info.query,True)['id'][0]
                print item_url
                print item_id
                judge_site(item_url,keyword)
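
The idiom shared by these crawler examples is pulling the id parameter out of an item URL. A minimal, self-contained sketch of just that step (the URL below is made up for illustration; keep_blank_values=True mirrors the parse_qs(query, True) calls above):

import urlparse  # Python 2 module; in Python 3 this lives in urllib.parse

item_url = 'http://item.example.com/item.htm?spm=a230r.1&id=16862466992'  # hypothetical URL
url_info = urlparse.urlparse(item_url)
item_id = urlparse.parse_qs(url_info.query, True)['id'][0]
print item_id  # -> 16862466992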
Code example #2
File: tb.py Project: fubendong/wangw
def searchcrawler(url):
    
    html=get_html(url)
#     print url
    if html:
        soup = BeautifulSoup(html,fromEncoding='gbk')
        items_row = soup.findAll('div',{'class':'item-box st-itembox'})
        if items_row:
            print '=======================row search row=========================='
            for item in items_row:
#                 print item
                item_info = item.find('h3',{'class':'summary'}).a
                item_url = item_info['href']
#                 print item_url
                
                
                sid_info = item.find('div',{'class':'col seller feature-dsi-tgr'}).a
                print sid_info
                sid_item_url = sid_info['href']
                sid_url_info = urlparse.urlparse(sid_item_url)
                sid_id = urlparse.parse_qs(sid_url_info.query,True)['user_number_id'][0]
                print sid_id
                
                judge_site(item_url, sid_id)
                
#                 logging.warning(item_id)
#                 
#                 download_reply_by_id(item_id)
                
        items_col = soup.findAll('div',{'class':'product-item row icon-datalink'})       
        if items_col:
            
            print '=======================row search col=========================='
            #print items
            for item in items_col:
                item_info = item.find('div',{'class':'title'}).a
                item_url = item_info['href']
#                 url_info = urlparse.urlparse(item_url)
#                 item_id = urlparse.parse_qs(url_info.query,True)['id'][0]
                print item_url
#                 print item_id

                sid_info = item.find('div',{'class':'seller'}).a
                print sid_info
                sid_item_url = sid_info['href']
                sid_url_info = urlparse.urlparse(sid_item_url)
                sid_id = urlparse.parse_qs(sid_url_info.query,True)['user_number_id'][0]
                print sid_id
                
                judge_site(item_url, sid_id)
Code example #3
def judge_site(url,keyword=''):
    """
    Decide whether the item is a Taobao (tb) or a Tmall (tm) listing
    """
    url_info = urlparse.urlparse(url)
    urlkey = urlparse.parse_qs(url_info.query,True)
    iid = int(urlkey['id'][0])
    #print 'url_info:',url_info[1]
    try:
        if url_info[1] == 'detail.tmall.com':
            print 'it is a tm item'
            if check_item_update_time(iid,'tm'):
                return
            data = getTmallItemInfo(iid,keyword)
        elif urlkey.get('cm_id'):
            print 'it is a tm item'
            if check_item_update_time(iid,'tm'):
                return
            data = getTmallItemInfo(iid,keyword)
        else:
            print 'it is a tb item'
            if check_item_update_time(iid,'tb'):
                return
            data = getTaobaoItemInfo(iid,keyword)
    except Exception, e:
        traceback.print_exc()
        return
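
The tb/tm decision above keys off the URL's network location (url_info[1], i.e. the netloc) plus the cm_id parameter. A minimal sketch of just that check, using hypothetical URLs:

import urlparse

for url in ('http://detail.tmall.com/item.htm?id=123',   # hypothetical Tmall URL
            'http://item.taobao.com/item.htm?id=456'):   # hypothetical Taobao URL
    url_info = urlparse.urlparse(url)
    urlkey = urlparse.parse_qs(url_info.query, True)
    iid = int(urlkey['id'][0])
    if url_info[1] == 'detail.tmall.com' or urlkey.get('cm_id'):
        print 'tm item', iid
    else:
        print 'tb item', iid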
Code example #4
File: tb.py Project: fubendong/wangw
def judge_site(url, sid_id):
    """
    Decide whether the item is a Taobao (tb) or a Tmall (tm) listing
    """
    url_info = urlparse.urlparse(url)
    urlkey = urlparse.parse_qs(url_info.query,True)
    iid = int(urlkey['id'][0])
    print iid
#     print 'url_info:',url_info[1]
    try:
        if url_info[1] == 'detail.tmall.com':
            print 'it is a tm item'
            
#             data = download_tm_reply_by_id(iid)
        elif urlkey.get('cm_id'):
            print 'it is a tm item cm_id'
            
#             data = download_tm_reply_by_id(iid)
        else:
            print 'it is a tb item'
            
            data = download_tb_reply_by_id(iid, sid_id)
    except Exception, e:
        traceback.print_exc()
        return
Code example #5
File: phone_get.py Project: fubendong/test
def searchcrawler(url):
    
    html=get_html(url)
#     print url
    if html:
        soup = BeautifulSoup(html,fromEncoding='gbk')
        items_row = soup.findAll('div',{'class':'product-iWrap'})
        #items_row = soup.find('div',{'class':'item-box st-itembox'})
#         print items_row
        if items_row:
            print '=======================row search row=========================='
            for item in items_row:
#                 print item
                try:
                    item_info = item.find('p',{'class':'productTitle'}).a
                except:
                    item_info = item.find('div',{'class':'productTitle productTitle-spu'}).a
                
#                 print item_info
                item_url = item_info['href']
#                 print item_url
                
                url_info = urlparse.urlparse(item_url)
                item_id = urlparse.parse_qs(url_info.query,True)['id'][0]
                print item_id
                logging.warning(item_id)
                
#                 item_id = 16862466992
                download_reply_by_id(item_id)
Code example #6
 def do_GET(self):
     """Implementa el manejo de peticiones GET al server.
     Maneja una web raiz, las respuestas a la consulta (Tanto para rss
         validos como invalidos, y para el caso de otro tipo de consulta
         no valida dentro de las pre-establecidas, devuelve un error 404.
     """
     if self.path == '/':
         # Request to the server root
         self.send_response(200)
         self.send_header("Content-type", "text/html")
         self.end_headers()
         self.wfile.write(self._get_root_page())
     else:
         rss = urlparse.parse_qs(self.path[2:]) # drop the leading /?
         if rss.has_key('p'):
             if self.es_valido(rss):
             # Fetch the RSS feed
                 self.send_response(200)
                 self.send_header("Content-type", "application/xml")
                 self.end_headers()
                 self.wfile.write(self._get_valid_feed_page(rss))
             else:
             # Invalid RSS
                 self.send_response(200)
                 self.send_header("Content-type", "text/html")
                 self.end_headers()
                 self.wfile.write(self._get_invalid_feed_page(rss))
         else:
             # If the request is malformed, return a 404 error
             self.send_response(404)
             self.send_header("Content-type", "text/html")
             self.end_headers()
             self.wfile.write(self._get_error_page())
Code example #7
File: otherplugins.py Project: simonm3/mim
def auth(sender):
    """ log userids and passwords from get and post requests
        NOTE this will not work with most sites """

    include = ["log", "login", "logon", "user", "username", "key", "name", "email", \
             "password", "pass", "passwd", "pwd", "psw", "passw", "auth"]

    query = sender.data if sender.data else sender.query
    if not query:
        return

    # split into auth and noauth
    q = urlparse.parse_qs(query)
    auth = dict()
    noauth = dict()
    for k, v in q.items():
        if k in include:
            auth[k] = v
        else:
            noauth[k] = v

    # output auth
    auth = '\n'.join(["%s=%s"%(k,v) for k, v in auth.items()])
    log.info("query strings===>\n%s"% auth)

    # output noauth truncating the values
    noauth = {k: v[:15]+"..." if len(v)>15 else v for k, v in noauth.items()}
    noauth = '\n'.join(["%s=%s"%(k,v) for k, v in noauth.items()])
    log.info("auth strings===> ***************************************\n%s" % noauth)
Code example #8
File: Functions.py Project: luongvancong/sat8spider
def getVGProductId(link):
    # id passed as a query parameter, e.g. ...?record_id=123
    if 'record_id' in link:
        parsed = urlparse.urlparse(link)
        # parse_qs returns a list per key, so take the first value
        record_id = urlparse.parse_qs(parsed.query)['record_id'][0]
        return int(record_id)

    # otherwise the id is the second path segment: /category/<id>/...
    parsed = urlparse.urlparse(link)
    path = parsed.path
    return int(path.split('/')[2])
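
A short usage sketch of the helper's behaviour, with an invented link; parse_qs returns a list per key, which is why the [0] index is needed before converting to int:

import urlparse

link = 'http://example.com/product/view?record_id=4940444'  # hypothetical link
parsed = urlparse.urlparse(link)
print urlparse.parse_qs(parsed.query)                        # -> {'record_id': ['4940444']}
print int(urlparse.parse_qs(parsed.query)['record_id'][0])   # -> 4940444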
Code example #9
File: views.py Project: dithua/collato
def rendered_wall_posts( wall_posts ):
	for wall_post in wall_posts:
		title = ''
		desc = ''
		site_image = ''
		article_title = ''
		urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', wall_post.data['post_content'])
		for url in urls: 
			parse_obj = urlparse.urlparse(url)
			site = parse_obj.netloc
			path = parse_obj.path
			conn = httplib.HTTPConnection(site)
			conn.request('HEAD',path)
			response = conn.getresponse()
			conn.close()
			ctype = response.getheader('Content-Type')
			if response.status < 400 and ctype.startswith('image'):
				wall_post.data['post_content'] = wall_post.data['post_content']+"<br/><a href='"+url+"' target='_blank'><img width=300 src='"+url+"' target = '_blank'/></a>"
			else:
				og = opengraph.OpenGraph(url)
				if not len(og.items()) == 2:
					for x,y in og.items():
						if x == 'type' and y == 'video':
							for k,l in og.items():
								if k == 'site_name' and l == 'YouTube':
							
									url_data = urlparse.urlparse(url)
									query = urlparse.parse_qs(url_data.query)
									video = query["v"][0]
									wall_post.data['post_content'] = wall_post.data['post_content'].replace(url,"")+"<br/><iframe width='300' height='200' src='//www.youtube.com/embed/"+video+"' frameborder='0' allowfullscreen></iframe>"
								elif k == 'site_name' and l == 'Vimeo':
									url_data = urlparse.urlparse(url)
									video = url_data.path
									wall_post.data['post_content'] = wall_post.data['post_content'].replace(url,"")+"<br/><iframe src='//player.vimeo.com/video"+video+"' width='300' height='200' frameborder='0' webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe> <p></p>"
						elif x == 'type' and y == 'article':
							for k,l in og.items():
								if k == 'title':
									article_title = l
								elif k == 'site_name':
									title = l
								elif k=='description':
									desc = l
								elif k=='image':
									site_image = l
							wall_post.data['post_content'] = wall_post.data['post_content'] +"<br/><table><tr><td><img width='50' src='"+site_image+"'</td><td><a href='"+url+"' target='_blank'/>"+article_title+"</a><br/>"+title+"</td></td></table>"
						elif x=='type':
							for k,l in og.items():
								if k == 'site_name':
									title = l
								elif k=='description':
									desc = l
								elif k=='image':
									site_image = l
							wall_post.data['post_content'] = wall_post.data['post_content'].replace(url, "<table><tr><td><img width='50' src='"+site_image+"'</td><td><a href='"+url+"' target='_blank'/>"+title+"</a><br/>"+desc+"</td></td></table>")
				else:
					wall_post.data['post_content'] = wall_post.data['post_content'].replace(url, "<a href='"+url+"' target='_blank'>"+url+"</a>")	
	return wall_posts	
Code example #10
File: utils.py Project: beforebeta/dealfu
def extract_query_params(url, *names):
    """
    Extracts names in the list from url
    @param url:
    @param names:
    @return: dict
    """
    parsed_res = urlparse.urlparse(url)
    d = urlparse.parse_qs(parsed_res.query)

    return {key:value[0] for (key, value) in d.iteritems() if key in names}
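
A self-contained sketch of the same filtering idiom, with an invented URL and parameter names; only the first value of each requested key is kept, as in extract_query_params above:

import urlparse

url = 'http://example.com/deals?page=3&sort=price&utm_source=mail'  # hypothetical URL
d = urlparse.parse_qs(urlparse.urlparse(url).query)
print {key: value[0] for (key, value) in d.iteritems() if key in ('page', 'sort')}
# -> {'page': '3', 'sort': 'price'}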
Code example #11
File: taleoapitest.py Project: darrodan/TaleoTest
def a_taleo():
    
    # get the query string        
    qsdata= request.query_string
    qs = urlparse.parse_qs(qsdata)
    # write log entry
    neclogger(qsdata,True, True)

    data = qsdata
    js = json.dumps(data)
    resp = Response(js, status=200, mimetype='application/json')
    
    return resp
Code example #12
File: cdapl.py Project: gosiaiunia1/KODI_PRIV
 def listsCategoriesMenu(self,url):
     query_data = { 'url': url, 'use_host': False, 'use_cookie': False, 'use_post': False, 'return_data': True }
     link = self.cm.getURLRequestData(query_data)
     # how many videos are there?
     match = re.compile('<li class="active"id="mVid"><a href="#" onclick="moreVideo\(\);return false;">Video \((.*?)\)</a></li>', re.DOTALL).findall(link)
     ilejest = int(match[0])
     policz = int(ilejest/o_filmow_na_stronie) +1
     max_stron = policz
     parsed = urlparse.urlparse(url)
     typ = urlparse.parse_qs(parsed.query)['s'][0]
     for i in range(0, (policz)):
         purl = 'http://www.cda.pl/video/show/ca%C5%82e_filmy_or_ca%C5%82y_film/p'+str(i+1)+'?s='+typ
         self.add('cdapl', 'categories-menu', 'Strona '+str(i+1), 'None', 'None', purl, 'None', 'None', True, False,str(i+1))
     xbmcplugin.endOfDirectory(int(sys.argv[1]))
Code example #13
def searchcrawler(url):
    
    html=get_html(url)
#     print url
    if html:
        soup = BeautifulSoup(html,fromEncoding='gbk')
        items_row = soup.findAll('div',{'class':'product-iWrap'})
        #items_row = soup.find('div',{'class':'item-box st-itembox'})
#         print items_row
        if items_row:
            print '=======================row search row=========================='
            for item in items_row:
#                 print item
                item_info = item.find('p',{'class':'productTitle'}).a
                item_url = item_info['href']
#                 print item_url
                
                url_info = urlparse.urlparse(item_url)
                item_id = urlparse.parse_qs(url_info.query,True)['id'][0]
                print item_id
#                 item_id = 16862466992
                download_reply_by_id(item_id)
Code example #14
File: tx.py Project: fubendong/test
def searchcrawler(url):

    html = get_html(url)
    #     print url
    if html:
        soup = BeautifulSoup(html, fromEncoding="gbk")
        items_row = soup.findAll("div", {"class": "product-iWrap"})
        # items_row = soup.find('div',{'class':'item-box st-itembox'})
        #         print items_row
        if items_row:
            print "=======================row search row=========================="
            for item in items_row:
                #                 print item
                item_info = item.find("p", {"class": "productTitle"}).a
                item_url = item_info["href"]
                #                 print item_url

                url_info = urlparse.urlparse(item_url)
                item_id = urlparse.parse_qs(url_info.query, True)["id"][0]
                print item_id
                #                 item_id = 16862466992
                download_reply_by_id(item_id)
Code example #15
 def do_GET(self):
     """Implementa el manejo de peticiones GET al server."""
     if self.path == '/':
         self.send_response(200)
         self.send_header("Content-type", "text/html")
         self.end_headers()
         self.wfile.write(self._get_root_page())
     else:
         peticion = urlparse.parse_qs(self.path[2:]) # drop the leading /?
         if peticion.has_key('consultar'):
             # If the request is valid, pick the next server, forward the request and return its response to the client
             server = self.next_server()
             url = 'http://'+server['host']+':'+server['port']+'/'
             open(LOGFILE, "a").write("Accediendo a %s" % url)
             self.send_response(200)
             self.send_header("Content-type", "text/html")
             self.end_headers()
             self.wfile.write(self._get_node_response(url))
         else:
             self.send_response(404)
             self.send_header("Content-type", "text/html")
             self.end_headers()
             self.wfile.write(self._get_error_page())
Code example #16
def oembed(oembed_url):
    width = 400
    height = 300
    services = {
        'youtube.com': 'youtube',
        'youtu.be': 'yoube',
        'vimeo.com': 'vimeo',
        'vine.co': 'vine',
        'facebook.com': 'facebook',
        #'imgur.com': 'imgur'
    }

    parsedUrl = urlparse(oembed_url)

    url = parsedUrl.netloc
    url = url.replace("www.", "", 1)

    if url not in services:
        # Check if url is image
        images = ['.jpg', '.jpeg', '.gif', '.png']
        disassembled = urlparse(oembed_url)
        filename, file_ext = splitext(basename(disassembled.path))

        print file_ext
        if file_ext in images:
            embedHtml = "<a href='{image}' ><img src='{image}' height=300 alt='{name}' /></a>".format(
                image=oembed_url, name=filename)
            return embedHtml
        else:
            return ""
    else:
        provider = services[url]
        try:
            # Youtube
            if provider == 'youtube':
                videoCode = parsedUrl.query[-11:]
                embedHtml = "<iframe width=\"{width}\" height=\"{height}\" src=\"//www.youtube.com/embed/{video}\" frameborder=\"0\" allowfullscreen></iframe>".format(
                    width=width, height=height, video=videoCode)
                embedHtml = re.sub('http', 'https', embedHtml)
                return embedHtml

            # Vimeo
            elif provider == 'vimeo':
                videoCode = parsedUrl.path[-8:]
                embedHtml = "<iframe src=\"//player.vimeo.com/video/{video}\" width=\"{width}\" height=\"{height}\" frameborder=\"0\" webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>".format(
                    width=width, height=height, video=videoCode)
                embedHtml = re.sub('http', 'https', embedHtml)
                return embedHtml

            # Youtu.be
            elif provider == 'yoube':
                videoCode = parsedUrl.path[-11:]
                embedHtml = "<iframe width=\"{width}\" height=\"{height}\" src=\"//www.youtube.com/embed/{video}\" frameborder=\"0\" allowfullscreen></iframe>".format(
                    width=width, height=height, video=videoCode)
                embedHtml = re.sub('http', 'https', embedHtml)
                return embedHtml

            # Vine
            elif provider == 'vine':
                videoCode = parsedUrl.path[-11:]
                embedHtml = "<iframe class=\"vine-embed\" src=\"https://vine.co/v/{video}/embed/simple\" width=\"{width}\" height=\"{height}\" frameborder=\"0\"></iframe>".format(
                    width=width, height=height, video=videoCode)
                if parsedUrl.scheme is "http":
                    embedHtml = re.sub('http', 'https', embedHtml)
                return embedHtml

            # Facebook
            elif provider == 'facebook':
                # parse the v parameter from the original URL (url here holds only the netloc)
                par = urlparse.parse_qs(urlparse.urlparse(oembed_url).query)
                videoCode = par['v'][0]
                print videoCode
                embedHtml = "<object width=\"{width}\" height=\"{height}\" ><param name=\"allowfullscreen\" value=\"true\" /><param name=\"allowscriptaccess\" value=\"always\" /><param name=\"movie\" value=\"http://www.facebook.com/v/{video}\" /><embed src=\"http://www.facebook.com/v/{video}\" type=\"application/x-shockwave-flash\" allowscriptaccess=\"always\" allowfullscreen=\"true\" width=\"{width}\" height=\"{height}\"></embed></object>".format(
                    width=width, height=height, video=videoCode)
                if parsedUrl.scheme is "http":
                    embedHtml = re.sub('http', 'https', embedHtml)
                return embedHtml

            # Imgur TODO
            '''
            elif provider is 'imgur':
                # Check if url is image
                images = ['.jpg', '.jpeg', '.gif', '.png']
                disassembled = urlparse(oembed_url)
                filename, file_ext = splitext(basename(disassembled.path))

                print file_ext
                if file_ext in images:
                    embedHtml = "<a href='{image}' ><img src='{image}' height=300 alt='{name}' /></a>".format(image=oembed_url, name=filename)
                    return embedHtml
                else:
                    # Try to show picture anyway

             '''
        except Exception:
            return ""
Code example #17
File: __init__.py Project: timxi/TileStache
        #
        # WSGI behavior is different from CGI behavior, because we may not want
        # to return a chatty rummy for likely-deployed WSGI vs. testing CGI.
        #
#         if layer and layer not in self.config.layers:
#             return self._response(start_response, 404)
# 
#         path_info = environ.get('PATH_INFO', None)
#         query_string = environ.get('QUERY_STRING', None)
#         script_name = environ.get('SCRIPT_NAME', None)

        logging.debug("self is %s",self)
        path_info = environ.get('PATH_INFO', None)
        query_string = environ.get('QUERY_STRING', None)
        script_name = environ.get('SCRIPT_NAME', None)
        version_str = urlparse.parse_qs(query_string,True).get('version')
        
        if version_str is not None:
            version  = long(version_str[0])
            logging.debug("layer %s version %d", layer,version)
        else:
            version = None
        
        if version is not None and layer is None:
            return self._response(start_response, 404)
        
        layer_obj = self.config.layers[layer]
        
        if layer_obj is None:
            return self._response(start_response, 404)
        
Code example #18
    def get_product_price(self, url):

        page = MarketBrowser.get_html(url)

        xpath = Directory.flat_xpath

        name_str = xpath(page, '//h3[@class="trade_Name"]/text()')

        intro_str = xpath(page, '//dd[@class="introduction"]/text()')

        content_origin_str = xpath(
            page,
            '//div[@class="product_content"]//tr[contains(string(), "產地")]/td[2]//text()'
        )

        content_unit_str = xpath(
            page,
            '//div[@class="product_content"]//tr[contains(string(), "數量")]/td[2]//text()'
        )

        price_str = xpath(page, '//dd[@class="list_price"]/text()')

        try:

            # e.g. 大成去骨雞腿1盒 => 大成去骨雞腿 (strip the trailing pack count from the name)
            name = GeantBrowser.NAME_RE.findall(name_str)[0]

            # try to find origin in introduction
            try:
                origin_str = GeantBrowser.ORIGIN_RE.findall(intro_str)[0]

            # try content table, could be ''
            except IndexError:
                origin_str = content_origin_str

            origin = self.get_origin(origin_str)

            # try to find count in introduction
            try:
                count_str = GeantBrowser.COUNT_RE.findall(intro_str)[0]
                count = Directory.get_count(count_str)

            # try to find count in title, or 1
            except IndexError:
                count = Directory.get_count(name_str)

            # try to find spec in introduction
            try:
                spec_str = GeantBrowser.WEIGHT_RE.findall(intro_str)[0]
                weight = self.get_weight(spec_str)

                # test weight with title weight
                test_weight = self.get_weight(name_str)
                if test_weight and weight != test_weight:
                    weight = test_weight

            # try to find spec in title
            except IndexError:
                weight = self.get_weight(name_str)

            # &pid=4940444 => 4940444
            pid = urlparse.parse_qs(url)['pid'][0]

            price = int(price_str)

            # try to find unit in title, introduction, content table
            try:
                unit_str = GeantBrowser.COUNT_RE.findall(intro_str)[0]
                unit_str += name_str
                unit_str += content_unit_str
            except IndexError:
                unit_str = name_str + content_unit_str

            unit = self.get_unit(unit_str)

        except:
            log.error(Directory.ERROR_MAP[3] % (name_str, url))
            return None, None

        product = Product(source=url,
                          name=name,
                          origin=origin,
                          market_id=self.market.id,
                          pid=pid,
                          weight=weight,
                          count=count,
                          unit=unit)

        price = Price(price=price, date=self.date)

        return product, price
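
The pid lookup above calls parse_qs on the whole URL rather than on its query component; that happens to work because the string still contains '&pid=...', but parsing the query explicitly is the safer equivalent. A small sketch with a hypothetical product URL:

import urlparse

url = 'http://www.example.com/prod/show.php?cat=food&pid=4940444'   # hypothetical URL
print urlparse.parse_qs(url)['pid'][0]                              # works by accident -> '4940444'
print urlparse.parse_qs(urlparse.urlparse(url).query)['pid'][0]     # conventional form  -> '4940444'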
Code example #19
File: taleoapitest.py Project: darrodan/TaleoTest
def a_nec_log():
    
    # timer code 
    if debug == True:
        start = time()       
    
    domainMatch = None

    global kinveyInitialized
    global whitelistCache

    # check whitelist if not already initialized
    
    if not kinveyInitialized:
        kinveyURL = kinveyBaseURL + '/appdata/' + kinveyAppKey +  '/' +'whitelist/'
        #app.logger.info(kinveyURL)
        r = requests.get(kinveyURL, auth=(kinveyUsername, kinveyPassword))
        neclogger(r.text, debug, True)
        kinveyInitialized = True
        
        s = r.text
        entries = json.loads(s)
        for entry in entries:
            whitelistEntry = entry.get('url', None)
            if whitelistEntry:
                whitelistCache.append(whitelistEntry)
                neclogger("Entry = " + whitelistEntry, debug, True)
                            
    rurl = request.url
    o = urlparse.urlparse(rurl)
    requestDomain = None
    requestDomain = o.netloc
    neclogger(requestDomain, debug, True)

    for entry in whitelistCache:
        if entry == requestDomain:
            domainMatch = True
            break
        
    if domainOverride:
        msg = "Overriding domain blocking"
        neclogger(msg, debug, False)
    else:
        if not domainMatch:
            msg = "No match for requesting domain: " + requestDomain
            neclogger(msg,debug, False)
            #app.logger.info(msg)
            rc = 'NYMBLE202'
            data = rc
            js = json.dumps(data)
            result = js
            resp = Response(result, status=202, mimetype='application/javascript')
            return resp
            
 
    if debug == True:
        end = time()
        t = "kinvey lookup elapsed time = " + str(end - start)
        neclogger(t,debug, True)
        #app.logger.info(t)
  
    # get the query string        
    qsdata= request.query_string
    qs = urlparse.parse_qs(qsdata)
    
    # process the user-agent info
    ua = request.headers.get('User-Agent')    
    neclogger("Useragent is:=== " + ua,debug, True)
    platform = None
    browser = None
    version = None
    if ua:
        useragent = UserAgent(ua)
        if useragent.platform:
            platform = useragent.platform
        if useragent.browser:
            browser = useragent.browser
        if useragent.version:
            version = useragent.version
    s_a = "platform," + platform + ",browser," + browser + ",version," + version + ","
    js_a = '"platform": "' + platform + '", "browser": "' + browser + '", "version": "' + version + '"'
              
    # get the client IP address        
    ip = request.remote_addr 
    if ip and 'X-Forwarded-For' in request.headers:
        ip_adds = request.headers['X-Forwarded-For'].split(",")   
        ip = ip_adds[0]
    else:
        ip = "0.0.0.0"
    
    # add ip and user-agent data to logging record
    s = "ip," + ip + ","
    json_s = '"ip": "' + ip + '"'
    
    s = s + s_a
    json_s = json_s + ', ' + js_a
    
    s1 = ""
    json_s1 = ""
    cb = ""
    
    # process the query string, return the callback function if provided as a jsonp convenience
    
    if qs:
        keys = qs.keys()
        i = 0
        
        for k in keys:
            # print k, "..."
            # print qs.get(k)
            
            v = qs.get(k)
            if k == 'callback':
                # print k + " = " + v[0]
                cb = v[0]
            s1 = s1 + k + "," + v[i] + ","
            json_s1 = json_s1 + ', "' + k + '": "' + v[i] + '"'
        s = s + s1
        json_s = json_s + json_s1
    st = dt.datetime.now().strftime("date,%Y-%m-%d,time,%H:%M:%S.%f,")
    json_st = dt.datetime.now().strftime('"date": "%Y-%m-%d", "time": "%H:%M:%S.%f", ')
    s = "NECLog: " + st + s
    json_s = 'JSON_NECLog:' + ' { ' + json_st + json_s + ' }'
    
    # write log entry
    neclogger(s,debug, False)
    neclogger(json_s,debug, False)
    
    
    rc = 'NYMBLE200'
    data = rc
    js = json.dumps(data)
    if cb != '':
        result = cb + '(' + js + ')'
        resp = Response(result, status=200, mimetype='application/javascript')
    else:
        result = js
        resp = Response(result, status=200, mimetype='application/json')
    
    if debug == True:
        end = time()
        t = "final elapsed time = ", end - start
        neclogger(t,debug, True)
    return resp
Code example #20
    def get_product_price(self, url):

        page = MarketBrowser.get_html(url)

        xpath = Directory.flat_xpath

        name_str = xpath(
            page,
            '//div[@class="pro_rightbox"]/h2[@class="product_Titlename"]/span/text()'
        )

        price_str = xpath(
            page,
            '//div[@class="product_PRICEBOX"]//span[@class="price_num"]/text()'
        )

        intro_str = xpath(page,
                          '//table[@class="title_word"]//table/tr/td/text()')

        try:

            # e.g. 紅蘿蔔約500g => 紅蘿蔔 (strip the weight suffix from the name)
            name = RtmartBrowser.NAME_RE.findall(name_str)[0]

            # try to find spec in introduction
            try:
                spec_str = RtmartBrowser.WEIGHT_RE.findall(intro_str)[0]
                weight = self.get_weight(spec_str)

                # test spec with weight in title
                test_weight = self.get_weight(name_str)

                if test_weight and test_weight != weight:
                    weight = test_weight

            # try to find spec in title
            except IndexError:
                weight = self.get_weight(name_str)

            # &prod_no=12345 => 12345
            pid = urlparse.parse_qs(url)['prod_no'][0]

            # try to find origin in introduction
            try:
                origin_str = RtmartBrowser.ORIGIN_RE.findall(intro_str)[0]

            # try to find origin in title
            except IndexError:
                origin_str = name_str

            origin = self.get_origin(origin_str)

            # try to find count in title
            count = self.get_count(name_str)

            price_str = Directory.NUM_RE.findall(price_str)[0]
            price = int(price_str)

            # try to find unit in title, introduction
            unit = self.get_unit(name_str + intro_str)

        except:
            log.error(Directory.ERROR_MAP[3] % (name_str, url))
            return None, None

        product = Product(source=url,
                          name=name,
                          origin=origin,
                          market_id=self.market.id,
                          pid=pid,
                          weight=weight,
                          count=count,
                          unit=unit)

        price = Price(price=price, date=self.date)

        return product, price
Code example #21
File: views.py Project: meletakis/collato
def rendered_content(content, request):
    #for wall_post in wall_posts:
    title = ''
    desc = ''
    site_image = ''
    article_title = ''
    urls = re.findall(
        'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        content)
    mentions = re.findall('\@\w+', content)
    r = re.compile('###uploaded_image###(.*?)##!uploaded_image!##')
    m = r.search(content)
    if m:
        content = content.replace(m.group(1), "").replace(
            "###uploaded_image###", ""
        ).replace(
            "##!uploaded_image!##", ""
        ) + "<br/><div class='row'><div class='col-sm-6 col-md-3'><a href='" + m.group(
            1
        ) + "' target='_blank' class='thumbnail'><img data-src='holder.js/300' src='" + m.group(
            1) + "'/></a></div></div>"

    for mention in mentions:
        mentioned_username = mention.replace('@', '')
        mentioned_user = User.objects.get(username=mentioned_username)
        if mentioned_user:
            notify.send(request.user,
                        recipient=mentioned_user,
                        verb='post_mention')
            content = content.replace(
                mention, '<a href="/user/profile/' + mentioned_username +
                '">' + mention + '</a>')
    for url in urls:
        parse_obj = urlparse.urlparse(url)
        site = parse_obj.netloc
        path = parse_obj.path
        conn = httplib.HTTPConnection(site)
        conn.request('HEAD', path)
        response = conn.getresponse()
        conn.close()
        ctype = response.getheader('Content-Type')
        if response.status < 400 and ctype.startswith('image'):
            content = content + "<br/><div class='row'><div class='col-sm-6 col-md-3'><a href='" + url + "' target='_blank' class='thumbnail'><img data-src='holder.js/300' src='" + url + "'/></a></div></div>"
        else:
            og = opengraph.OpenGraph(url)
            if not len(og.items()) == 2:
                for x, y in og.items():
                    if x == 'type' and y == 'video':
                        for k, l in og.items():
                            if k == 'site_name' and l == 'YouTube':

                                url_data = urlparse.urlparse(url)
                                query = urlparse.parse_qs(url_data.query)
                                video = query["v"][0]
                                content = content.replace(
                                    url, "<a href='" + url +
                                    "' target='_blank'>" + url + "</a>"
                                ) + "<br/><br/><iframe width='300' height='200' src='//www.youtube.com/embed/" + video + "' frameborder='0' allowfullscreen></iframe>"
                            elif k == 'site_name' and l == 'Vimeo':
                                url_data = urlparse.urlparse(url)
                                video = url_data.path
                                content = content.replace(
                                    url, "<a href='" + url +
                                    "' target='_blank'>" + url + "</a>"
                                ) + "<br/><br/><iframe src='//player.vimeo.com/video" + video + "' width='300' height='200' frameborder='0' webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe> <p></p>"
                    elif x == 'type' and y == 'article':
                        for k, l in og.items():
                            if k == 'title':
                                article_title = l
                            elif k == 'site_name':
                                title = l
                            elif k == 'description':
                                desc = l
                            elif k == 'image':
                                site_image = l
                        content = content + "<br/><table><tr><td><img width='50' src='" + site_image + "'</td><td><a href='" + url + "' target='_blank'/>" + article_title + "</a><br/>" + title + "</td></td></table>"
                    elif x == 'type':
                        for k, l in og.items():
                            if k == 'site_name':
                                title = l
                            elif k == 'description':
                                desc = l
                            elif k == 'image':
                                site_image = l
                        content = content.replace(
                            url, "<table><tr><td><img width='50' src='" +
                            site_image + "'</td><td><a href='" + url +
                            "' target='_blank'/>" + title + "</a><br/>" +
                            desc + "</td></td></table>")
            else:
                content = content.replace(
                    url,
                    "<a href='" + url + "' target='_blank'>" + url + "</a>")

    return content
Code example #22
def key_value_pairs(url):
    return dict(urlparse.parse_qs(urlparse.urlsplit(url).query))
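
parse_qs already returns a plain dict, so the dict() wrapper is a no-op; either way each value comes back as a list. A quick illustration using the helper defined above with a made-up URL:

print key_value_pairs('http://example.com/search?q=tea&page=2&page=3')
# -> {'q': ['tea'], 'page': ['2', '3']}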
Code example #23
File: views.py Project: dithua/collato
def rendered_content( content,request ):
#for wall_post in wall_posts:
	title = ''
	desc = ''
	site_image = ''
	article_title = ''
	urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', content)
	mentions = re.findall('\@\w+', content)
	r = re.compile('###uploaded_image###(.*?)##!uploaded_image!##')
	m = r.search(content)
	if m:
		content = content.replace(m.group(1), "").replace("###uploaded_image###", "").replace("##!uploaded_image!##", "") +"<br/><div class='row'><div class='col-sm-6 col-md-3'><a href='"+m.group(1)+"' target='_blank' class='thumbnail'><img data-src='holder.js/300' src='"+m.group(1)+"'/></a></div></div>"
	
	for mention in mentions:
		mentioned_username= mention.replace('@','')
		mentioned_user = User.objects.get(username=mentioned_username)
		if mentioned_user:
			notify.send(request.user, recipient=mentioned_user, verb='post_mention' )
			content=content.replace(mention, '<a href="/user/profile/'+mentioned_username+'">'+mention+'</a>')	
	for url in urls: 
		parse_obj = urlparse.urlparse(url)
		site = parse_obj.netloc
		path = parse_obj.path
		conn = httplib.HTTPConnection(site)
		conn.request('HEAD',path)
		response = conn.getresponse()
		conn.close()
		ctype = response.getheader('Content-Type')
		if response.status < 400 and ctype.startswith('image'):
			content = content+"<br/><div class='row'><div class='col-sm-6 col-md-3'><a href='"+url+"' target='_blank' class='thumbnail'><img data-src='holder.js/300' src='"+url+"'/></a></div></div>"
		else:
			og = opengraph.OpenGraph(url)
			if not len(og.items()) == 2:
				for x,y in og.items():
					if x == 'type' and y == 'video':
						for k,l in og.items():
							if k == 'site_name' and l == 'YouTube':
						
								url_data = urlparse.urlparse(url)
								query = urlparse.parse_qs(url_data.query)
								video = query["v"][0]
								content = content.replace(url,"<a href='"+url+"' target='_blank'>"+url+"</a>")+"<br/><br/><iframe width='300' height='200' src='//www.youtube.com/embed/"+video+"' frameborder='0' allowfullscreen></iframe>"
							elif k == 'site_name' and l == 'Vimeo':
								url_data = urlparse.urlparse(url)
								video = url_data.path
								content = content.replace(url,"<a href='"+url+"' target='_blank'>"+url+"</a>")+"<br/><br/><iframe src='//player.vimeo.com/video"+video+"' width='300' height='200' frameborder='0' webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe> <p></p>"
					elif x == 'type' and y == 'article':
						for k,l in og.items():
							if k == 'title':
								article_title = l
							elif k == 'site_name':
								title = l
							elif k=='description':
								desc = l
							elif k=='image':
								site_image = l
						content = content +"<br/><table><tr><td><img width='50' src='"+site_image+"'</td><td><a href='"+url+"' target='_blank'/>"+article_title+"</a><br/>"+title+"</td></td></table>"
					elif x=='type':
						for k,l in og.items():
							if k == 'site_name':
								title = l
							elif k=='description':
								desc = l
							elif k=='image':
								site_image = l
						content = content.replace(url, "<table><tr><td><img width='50' src='"+site_image+"'</td><td><a href='"+url+"' target='_blank'/>"+title+"</a><br/>"+desc+"</td></td></table>")
			else:
				content = content.replace(url, "<a href='"+url+"' target='_blank'>"+url+"</a>")	

	return content