def data_web_update():
    # Refresh product records by scraping the vskprofi.cz SKU search results.
    # Expects `db` (the SQLAlchemy session) to be available at module level.
    import re
    from webb import webb
    from aplikace.models import Product
    from html_table_parser import HTMLTableParser
    # from locale import atof

    id = 0
    for polozka in Product.notKL():  # polozka = "item"
        url = "http://www.vskprofi.cz/vyhledavani?type=sku&search=" + polozka.Obj + "&sku=OK"
        page = webb.download_page(url)
        p = HTMLTableParser()
        p.feed(page.decode('utf-8'))
        # print(p.tables)
        ar = p.tables
        try:
            data = Product.find_by_Obj(polozka.Obj)
            # Cells 6-9 of the second row of the first parsed table hold links;
            # identify them by URL fragment.
            for i in range(6, 10):
                if re.search('technical-data', ar[0:1][0][1][i]):
                    data.TL = ar[0:1][0][1][i]     # technical data sheet link
                    # print ar[0:1][0][1][i]
                if re.search('product-lists', ar[0:1][0][1][i]):
                    data.KL = ar[0:1][0][1][i]     # product list link
                    # print ar[0:1][0][1][i]
                if re.search('pics', ar[0:1][0][1][i]):
                    data.Foto = ar[0:1][0][1][i]   # photo link
                    # print ar[0:1][0][1][i]
            data.sklad = ar[0:1][0][1][3]          # stock level
            if ar[0:1][0][1][11]:
                data.Poznamka = ar[0:1][0][1][11]  # note
            # print data.Obj
            data.update(commit=False)
            id = id + 1
            if id % 100 == 0:
                print "aktualizuji data"  # "updating data"
                db.session.commit()
            # for i in ar[0:1][0][1]:
            #     print(i)
            # print(float(re.split(" ", ar[0:1][0][1][4])[0].replace(".", "").replace(",", ".")))
        except:
            print "Chyba " + str(id) + " " + polozka.Obj  # "Chyba" = "error"
            db.session.commit()
    # data_web_update.delay()
    db.session.commit()
    return True
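# For context: a minimal sketch, using hypothetical markup rather than the real
# vskprofi.cz table, of the structure html_table_parser exposes. p.tables is a
# list of tables, each table a list of rows, each row a list of cell strings,
# so ar[0:1][0][1][i] above reads cell i of the second row of the first table.
from html_table_parser import HTMLTableParser

sample_html = """
<table>
  <tr><th>sku</th><th>stock</th><th>link</th></tr>
  <tr><td>ABC-123</td><td>5 ks</td><td>/technical-data/abc-123.pdf</td></tr>
</table>
"""

parser = HTMLTableParser()
parser.feed(sample_html)

first_table = parser.tables[0]   # list of rows
second_row = first_table[1]      # ['ABC-123', '5 ks', '/technical-data/abc-123.pdf']
print(second_row[2])             # the kind of cell re.search('technical-data', ...) matches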
def update_page_active_time(request, format=None):
    # DRF view: records the time a user spent on a page and mirrors the data
    # into a full-text store ("tart") reachable over the MySQL protocol.
    # Module-level imports/constants assumed by this excerpt: MySQLdb, re, webb,
    # BeautifulSoup, Response, status, the PageActiveTime/BlackListedPages models,
    # PageActiveTimeSerializer, and the invalidAttrs/invalidTags/invalidTagsToReplace lists.
    if request.method == 'POST':
        data = request.data
        db = MySQLdb.connect(host="127.0.0.1", port=9306, passwd="", db="")
        cur = db.cursor()
        new_item = False
        query = ""

        if data['user_id'] is None or data['user_id'] == '':
            resp = "Invalid Details"
            print(resp)
            return Response(resp, status=status.HTTP_202_ACCEPTED)

        if data['page_title'] == '' or data['page_title'] == 'new tab' or data['page_id'] == '' or data['page_id'].startswith('chrome://'):
            resp = "Invalid Page"
            print(resp)
            return Response(resp, status=status.HTTP_202_ACCEPTED)

        if data['icon_url'] is None or data['icon_url'] == '':
            data['icon_url'] = "http://52.26.203.91:80/icon.png"
        else:
            data['icon_url'] = "http://www.google.com/s2/favicons?domain_url=" + data['icon_url']

        # Reduce the page URL to its bare domain for the blacklist check.
        baseUrl = data['page_id']
        if baseUrl.startswith("https://"):
            baseUrl = baseUrl.replace("https://", "", 1)
            position = baseUrl.find("/")
            if position != -1:
                baseUrl = baseUrl[0:position]
        elif baseUrl.startswith("http://"):
            baseUrl = baseUrl.replace("http://", "", 1)
            position = baseUrl.find("/")
            if position != -1:
                baseUrl = baseUrl[0:position]
        data['base_url'] = baseUrl

        isBlackListed = BlackListedPages.objects.filter(user_id=data['user_id'], base_url=data['base_url']).exists()
        if isBlackListed:
            print("Page with url :" + data['page_id'] + " is black listed for user: " + data['user_id'] + "! ")
            response = "Domain is blacklisted --- Timer not Updated"
            return Response(response, status=status.HTTP_202_ACCEPTED)
        else:
            print("Page " + data['page_id'] + " is not on Blacklist for user " + data['user_id'] + "! ")

        # Download the page, strip unwanted tags/attributes and keep plain text.
        page_content = webb.download_page(data['page_id'])
        soup = BeautifulSoup(page_content, "html5lib")
        soup = BeautifulSoup(soup.html.body.encode_contents())
        [tag.decompose() for tag in soup.find_all(attrs={'id': re.compile(r'^MathJax_')})]
        html_exception = 0
        for tag in soup():
            for attribute in invalidAttrs:
                try:
                    del tag[attribute]
                except:
                    html_exception += 1
            if tag.name in invalidTags:
                tag.decompose()
            if tag.name in invalidTagsToReplace:
                tag.replaceWithChildren()
        print("html parsing exceptions :" + str(html_exception))
        page_content = str(soup.prettify().encode('utf-8'))
        page_content = re.sub(r'[^a-zA-Z0-9\.]', ' ', page_content)
        data['page_content'] = page_content

        pageItem = PageActiveTime.objects.filter(user_id=data['user_id'], page_id=data['page_id'], is_active=1, is_deleted=0)[:1]
        # print(pageItem)
        if len(pageItem) == 0:
            new_item = True
            serializer = PageActiveTimeSerializer(data=data)
            # query = "INSERT INTO tart (page_id, user_id, page_title, cumulative_time, icon_url, base_url, is_active) VALUES ('" + data['page_id'] + "','" + data['user_id'] + "','" + data['page_title'] + "','" + str(data['cumulative_time']) + "','" + data['icon_url'] + "','" + data['base_url'] + "','1')"
        else:
            print("Already exists")
            data['cumulative_time'] = pageItem[0].cumulative_time + int(data['cumulative_time'])
            id = pageItem[0].id
            print(id)
            serializer = PageActiveTimeSerializer(pageItem[0], data=data)
            # query = "REPLACE INTO tart (id, cumulative_time) VALUES ('" + str(id) + "','" + str(data['cumulative_time']) + "')"
            query = "UPDATE tart SET cumulative_time = " + str(data['cumulative_time']) + " WHERE id =" + str(id) + " "
            cur.execute(query)

        if serializer.is_valid():
            serializer.save()
            if new_item:
                pageItem = PageActiveTime.objects.filter(user_id=data['user_id'], page_id=data['page_id'], is_active=1, is_deleted=0)[:1]
                id = pageItem[0].id
                query = ("INSERT INTO tart (id, page_id, user_id, page_title, cumulative_time,"
                         " icon_url, base_url, is_active, page_content) VALUES ('"
                         + str(id) + "','" + data['page_id'] + "','" + data['user_id'] + "','"
                         + data['page_title'] + "','" + str(data['cumulative_time']) + "','"
                         + data['icon_url'] + "','" + data['base_url'] + "','1','"
                         + page_content + "')")
                print("here")
                cur.execute(query)
            return Response(serializer.data["page_id"], status=status.HTTP_201_CREATED)

        print("invalid serializer")
        return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
# pip install webb
from webb import webb

http_address = "http://mail.ru"

webb.get_ip(http_address)
webb.get_whois_data(http_address)
webb.ping(http_address)
webb.traceroute(http_address)
webb.clean_page(webb.download_page(http_address))
# webb.web_crawl(http_address)
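# A small follow-up sketch, assuming only webb.download_page as used in the
# examples above. data_web_update() decodes its result, so the return value is
# treated here as bytes; the <title> extraction is purely illustrative.
import re
from webb import webb

def page_title(url):
    raw = webb.download_page(url)
    html = raw.decode('utf-8', errors='replace')
    match = re.search(r'<title[^>]*>(.*?)</title>', html, re.IGNORECASE | re.DOTALL)
    return match.group(1).strip() if match else None

print(page_title("http://mail.ru"))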