def __init__(self, app, username, password, debug_dir=None):
    self.app = app
    self.username = username
    self.password = password
    self.debug_dir = debug_dir
    self.session = dryscrape.Session()
    self.session_sub_cat = dryscrape.Session()
    if self.debug_dir:
        if not os.path.exists(self.debug_dir):
            os.mkdir(self.debug_dir)

def readDIs(self, sleeptime, session_time):
    session = dryscrape.Session()
    session.set_timeout(session_time)
    url = 'http://' + self.ip + '/login.cgi?webpwd=Admin&Submit=Submit'
    try:
        session.visit(url)
        time.sleep(sleeptime)
        self.link = True
        self.timestamp = time.time()
    except:
        newError = 'Destination Host Unreachable ' + self.name + ' IP: ' + self.ip
        if newError not in self.ERRORS:
            logger.warning(newError)
            self.ERRORS[newError] = time.time()
        return False
    response = session.body()
    soup = BeautifulSoup(response, 'lxml')
    cDIs = soup.findAll(id=re.compile(r'DI\d+'))
    self.currentDIs = []
    for DI in cDIs:
        if DI.text == 'ON':
            self.currentDIs.append(1)
        else:
            self.currentDIs.append(0)
    return True

def findImdbUrl(movie_title, movie_writer):
    dryscrape.start_xvfb()
    session = dryscrape.Session()
    link = "http://www.imdb.com/find?q=" + urllib.quote(movie_title) + "&s=all"
    session.visit(link)
    response = session.body()
    soup = BeautifulSoup(response)
    div = soup.find(lambda tag: tag.name == 'div' and tag.has_key('class') and tag['class'] == 'findSection')
    if div:
        div_content = "".join([unicode(x) for x in div.contents])
        title_search = re.compile(r'/title/tt\d+')
        search_results = re.findall(title_search, div_content)
        for movie_url in search_results:
            try:
                names = extractPeopleBehind("http://www.imdb.com" + movie_url + "/")
                if not set(movie_writer).isdisjoint(names):
                    return "http://www.imdb.com" + movie_url + "/"
                # soup_search = BeautifulSoup(resp_search)
                # people_behind = soup_search.findall(lambda tag: tag.name=='div' and tag.has_key('class') and tag['class']=='credit_summary_item')
                # for people in people_behind: print people.text
            except:
                pass
    return None

def get_many_usernames(artist_urls, verbose=0):
    session = dryscrape.Session()
    session.set_attribute('auto_load_images', False)
    t1 = time.time()
    count = 0
    for url in artist_urls:
        username_urls = []
        session.visit(url)
        time.sleep(1)
        page = session.body()
        soup = BeautifulSoup(page)
        usernames = soup.find_all(class_="username")
        for un in usernames:
            username_urls.append(un.a.get('href'))
        slidernames = soup.find_all(class_="nowslider_name")
        for sn in slidernames:
            username_urls.append(sn.get('href'))
        # slightly minimize repeats
        # (not all of them, though, so do this again later!)
        username_urls = set(username_urls)
        with open("username_urls2.txt", 'a') as uuf:
            for uu in username_urls:
                uuf.write(uu + "\n")
        if verbose > 0:
            print count,
            print "elapsed seconds:", time.time() - t1
            sys.stdout.flush()
        count += 1

def read_DIs(ip, name, normal_state):
    session = dryscrape.Session()
    url = 'http://' + ip + '/login.cgi?webpwd=Admin&Submit=Submit'
    print("")
    print(name + ' - ' + time.ctime())
    session.visit(url)
    time.sleep(0.1)
    response = session.body()
    soup = BeautifulSoup(response, 'lxml')
    DIs = soup.findAll(id=re.compile(r'DI\d+'))
    sensors = ["220V", "SMOK", "MUXL", "DOOR", "", ""]
    i = 0
    for DI in DIs:
        if DI.text != '-':
            if DI.text == 'ON':
                DIT = 1
            else:
                DIT = 0
            if str(DIT) != normal_state[i]:
                STATE = "ALARM"
            else:
                STATE = "OK"
            if i < 4:
                print('DI' + str(i) + ' ' + sensors[i] + ' ' + str(DIT) + ' ' + normal_state[i] + ' ' + STATE)
        i = i + 1
    return

def getAd(url):
    url = "https:" + url
    session = dryscrape.Session()
    session.visit(url)
    res = session.body()
    print(url)
    return BeautifulSoup(res, "html.parser").find(id="adview")

def get_jobs(url):
    ret = {}
    jobs = []
    rake_object = rake.Rake("/root/freshack/Jobscraper/freshdeskhack/SmartStoplist.txt", 3, 2, 1)
    dryscrape.start_xvfb()
    session = dryscrape.Session()
    session.visit(url)
    html_page = session.body()
    soup = BeautifulSoup(html_page, 'lxml')
    master_tag = soup.find_all("div", class_="fd-posdesc")
    for tag in master_tag:
        job = {}
        job["title"] = tag.h3.string
        div_list = tag.find_all("div")
        job_desc = ""
        for childdiv in div_list:
            text = childdiv.string
            if text:
                job_desc = job_desc + text
        keywords = rake_object.run(job_desc)
        words = []
        for word in keywords:
            if "year" not in word[0]:
                words.append(word[0])
            else:
                job["experience"] = word[0]
        job["keywords"] = words
        jobs.append(job)
    ret["jobs"] = jobs
    return json.dumps(ret)

def hasPopUpWindow(self):
    """
    This method uses dryscrape, which implements WebKit and can scrape a web
    page for JavaScript as well as HTML. JavaScript has the alert, confirm,
    prompt and window.open methods.
    """
    from bs4 import BeautifulSoup
    retVal = 0
    try:
        sess = dryscrape.Session()
        sess.visit(self.url)
        response = sess.body()
        soup = BeautifulSoup(response)
        data = soup.find('script')
        for tag in soup.findAll('script'):
            stringTag = str(tag)
            # look for alert, confirm, prompt, open
            matchObj = re.search(r'.*open\(|alert\(|confirm\(|prompt\(.*', stringTag)
            if matchObj:
                retVal = 1
            else:
                retVal = -1
    except:
        printFormat("exc", "hasPopUpWindow", "Pop up window exception")
    self.phishScore['popUpWindow'] = retVal
    return retVal

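# A minimal standalone sketch of the same popup-detection technique described in the
# hasPopUpWindow() docstring above: render the page with dryscrape and flag <script>
# tags that appear to call alert(), confirm(), prompt() or window.open(). The function
# name and the "html.parser" choice are illustrative assumptions, not part of the
# original class.
import re
import dryscrape
from bs4 import BeautifulSoup

def page_opens_popups(url):
    """Return True if any <script> tag on the rendered page seems to open a popup."""
    sess = dryscrape.Session()
    sess.visit(url)
    soup = BeautifulSoup(sess.body(), "html.parser")
    popup_call = re.compile(r'(window\.open|alert|confirm|prompt)\s*\(')
    return any(popup_call.search(str(tag)) for tag in soup.find_all('script'))
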
def __init__(self, uri, soup=True):
    if not uri.startswith('http'):
        uri = 'http://{0}'.format(uri)
    data_path = os.path.join(os.path.dirname(__file__), 'data')
    url_parts = urlparse(uri)
    local_page = url_parts.netloc
    page = url_parts.path if url_parts.path else '/'
    base_url = '{0}://{1}'.format(url_parts.scheme, url_parts.netloc)
    if url_parts.path:
        local_page += '-' + os.path.basename(url_parts.path)
    if url_parts.query:
        local_page += '-' + url_parts.query.replace('=', '-')
        page = '{0}?{1}'.format(page, url_parts.query)
    try:
        f = open(os.path.join(data_path, local_page), 'r')
        data = f.read()
        f.close()
    except IOError, e:
        sess = dryscrape.Session(base_url=base_url)
        sess.set_attribute('auto_load_images', True)
        sess.visit(page)
        data = sess.driver.body()
        local_file = os.path.join(data_path, local_page)
        with open(local_file, 'w') as data_file:
            data_file.write(data.encode('utf-8'))

def scrape_Taobao_dev(url, alertPrice, getName, getIsPriceUnder):
    session = dryscrape.Session()
    session.visit(url)
    response = session.body()
    soup = bs4.BeautifulSoup(response, "html.parser")
    try:
        price = float(soup.find(id='J_PromoPriceNum').getText())
    except AttributeError as error:
        price = float(soup.find('input', {'name': 'current_price'}).get('value'))
    # print(price)
    if getName and (not getIsPriceUnder):
        name = str.strip(soup.find("h3", {"class": "tb-main-title"}).getText())
        return name
    elif getIsPriceUnder and (not getName):
        if price <= alertPrice:
            return True
        else:
            return False
    elif getName and getIsPriceUnder:
        name = str.strip(soup.find("h3", {"class": "tb-main-title"}).getText())
        if price <= alertPrice:
            return (name, True)
        else:
            return (name, False)

def createSession(crawlingWebsite):
    print "\nCrawler Initiated. Searching in '" + crawlingWebsite + "' for domains.\n\n"
    dryscrape.start_xvfb()
    # Begin new session with loaded scripts
    try:
        session = dryscrape.Session()
        session.visit(crawlingWebsite)
        response = session.body()
        # Reset session for handling memory issues
        session.reset()
    except InvalidResponseError as e:
        print "Cannot open " + crawlingWebsite + "\n"
        print 'InvalidResponseError:', e
        quit()
    soup = BeautifulSoup(response, "html.parser")
    # Searches for hyperrefs in a page. This is the hardcoded bit.
    # Searching for items is webpage-specific. For a different website, please refer to its HTML content
    # to find out which tags are needed to obtain a list of domains, if any.
    tableFound = soup.findAll("a", {"target": "_blank"})
    if len(tableFound) == 0:
        print "Nothing found. Terminating crawler."
        quit()
    else:
        for row in tableFound:
            # Add found domains to the list of sites
            siteList.append(row.get('href'))

def postear_en_twitter(mensaje):
    if 'linux' in sys.platform:
        dryscrape.start_xvfb()
    sess = dryscrape.Session(base_url='https://www.twitter.com')
    sess.set_header("User-Agent", "Mozilla/5.0 (Windows NT 5.1; rv:41.0) Gecko/20100101 Firefox/41.0")
    # True = load images
    # False = hide images
    sess.set_attribute('auto_load_images', False)
    email = 'Enter your Twitter e-mail here'  # Must be quoted
    password = '******'  # Must be quoted
    try:
        sess.visit('/')
        q = sess.at_xpath('//*[@id="doc"]/div/div[1]/div[1]/div[1]/form/div[1]/input')
        q.set(email)
        q = sess.at_xpath('//*[@id="doc"]/div/div[1]/div[1]/div[1]/form/div[2]/input')
        q.set(password)
        q.form().submit()
        q = sess.at_xpath('//*[@id="tweet-box-home-timeline"]')
        q.click()
        q = sess.at_xpath('/html/body/div[2]/div[3]/div/div[2]/div[2]/div/form/div[2]/textarea')
        q.set(mensaje)
        q = sess.at_xpath('//*[@id="timeline"]/div[2]/div/form/div[3]/div[2]/button')
        q.click()
        sleep(1)
        # sess.render('twitter.png')
    except Exception as e:
        print(e)

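# Hypothetical usage sketch for postear_en_twitter() above; the message text is an
# arbitrary example, and the e-mail/password placeholders inside the function must be
# filled in before this would actually post.
if __name__ == '__main__':
    postear_en_twitter('Posting from dryscrape')
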
class Nykaa():
    url = "https://www.makeupalley.com/product/showreview.asp/ItemId=192371/Daily-Superfoliant/Dermalogica/Scrubs"
    session = dryscrape.Session()
    session.visit(url)
    page = session.body()
    soup = BeautifulSoup(page, "html.parser")
    data = soup.find("div", {"id": "reviews-wrapper"})
    xyz = []
    for item in data.findAll("div", {"class": "comments"}):
        dict = {}
        dict['author'] = item.find("div", {"class": "user-name"}).find("p").find(text=True).strip('\t')
        dict['date'] = item.find("div", {"class": "date"}).find("p").text.strip('on')
        dict['user_details'] = item.find("div", {"class": "important"}).find("p").find(text=True)
        dict['comment'] = item.find("div", {"class": "comment-content"}).find("p").find(text=True)
        xyz.append(dict)
    print(xyz)

class Adorebeauty(object):
    url = "https://www.adorebeauty.com.au/dermalogica/dermalogica-daily-superfoliant.html"
    session = dryscrape.Session()
    session.visit(url)
    page = session.body()
    soup = BeautifulSoup(page, "html.parser")
    analyzer = SentimentIntensityAnalyzer()
    data = soup.find("div", {"id": "customer-reviews"})
    xyz = []
    for item in data.findAll("div", {"itemprop": "review"}):
        dict = {}
        dict['user'] = item.find("span", {"itemprop": "author"}).find(text=True)
        dict['review_title'] = item.find("span", {"class": "review-title"}).find(text=True)
        dict['date'] = item.find("span", {"class": "date"}).find(text=True)
        dict['comments'] = item.find("p", {"itemprop": "description"}).find(text=True)
        vs = analyzer.polarity_scores(dict['comments'])
        # print(str(vs))
        dict['sentiment'] = str(vs)
        xyz.append(dict)
    print(xyz)

def fetch_page(self, url):
    corrected_url = url
    if not url.startswith("http://") and not url.startswith("https://"):
        corrected_url = "http://{u}".format(u=url)
    self.logger.debug("Fetching page: {u}".format(u=corrected_url))
    cache = get_page_cache()
    cached_content = cache.get_cached_content(corrected_url)
    if cached_content:
        self.logger.debug("Page served from cache")
        return cached_content
    if not self._can_fetch(corrected_url):
        self.logger.warn("Unable to fetch, disallowed by robots.txt")
        raise FetchException("Disallowed by robots.txt")
    try:
        parsed_url = urlparse.urlparse(corrected_url)
        base_url = parsed_url.scheme + "://" + parsed_url.hostname
        path = parsed_url.path
        sess = dryscrape.Session(base_url=base_url)
        sess.set_attribute('auto_load_images', False)
        sess.visit(path)
        content = sess.body()
        cache.save_content(corrected_url, content)
        return content
    except Exception as e:
        raise FetchException("Failed to load the page", e)

def __init__(self, p_from=0, p_to=10000):
    self.__session = dryscrape.Session()
    self.__main = "http://auto.ria.com"
    self.db = sqlite3.connect('cars.db', timeout=1).cursor()
    try:
        self.db.execute('''CREATE TABLE cars
                           (name text, price integer, currency text, url text)''')
    except:
        print "Couldn't create new table"
    params = {
        'target': 'search',
        'event': 'little',
        'category_id': 1,
        'bodystyle[0]': 3,
        'bodystyle[1]': 4,
        'bodystyle[2]': 6,
        'chooseTypeSearchAuto': 'oldAutos',
        'marka': 58,
        'model': 0,
        'state': 0,
        's_yers': 2004,
        'po_yers': 0,
        'price_ot': p_from,
        'price_do': p_to,
        'currency': 1
    }
    self.__params = urllib.urlencode(params)

def get_pgo_events(request):
    # events:
    # events-list__event__date__day   : class for the days (span)
    # events-list__event__date__month : class for the month
    # events-list__event__content     : content, split into title and text
    # events-list__event__title       : class for the event title
    # events-list__event__body        : event body / description
    sess = dryscrape.Session()
    sess.visit('https://pokemongolive.com/en/events/')
    body = sess.body()
    soup = BeautifulSoup(body, features="lxml")
    dias = list(soup.find_all('span', class_='events-list__event__date__day'))
    meses = list(soup.find_all('span', class_='events-list__event__date__month'))
    titulos = list(soup.find_all('div', class_='events-list__event__title'))
    descripciones = list(soup.find_all('div', class_='events-list__event__body'))
    if dias and meses and titulos and descripciones:
        for i in range(5):
            if not Event.objects.filter(days=dias[i].text, title=titulos[i].text).exists():
                Event.objects.create(title=titulos[i].text,
                                     description=descripciones[i].text,
                                     days=dias[i].text + ' ' + meses[i].text)
    return HttpResponse('Got them!')

def getResumen(soup, url):
    # session = dryscrape.Session()
    # session.visit(url)
    # response = session.body()
    # soup = BeautifulSoup(response)
    htmlResumen = soup.find_all(class_='adp-summary')
    while len(htmlResumen) == 0:
        print 'Error: the summary is empty. Waiting 15 seconds before continuing'
        time.sleep(15)
        session = dryscrape.Session()
        session.visit(url)
        response = session.body()
        soupinterno = BeautifulSoup(response)
        htmlResumen = soupinterno.find_all(class_='adp-summary')
        del soupinterno
        del response
        del session
    resumenParseado = parseaResumen(str(htmlResumen[0]))
    resumenParseadaJSON = {
        'distancia': resumenParseado[0],  # TODO store this distance in km so the averages can be computed without dying in the attempt
        'tiempo': resumenParseado[1]      # TODO store the time in minutes so the averages can be computed without dying in the attempt
    }
    return json.dumps(resumenParseadaJSON)

def onMouseOver(self):
    """
    This method looks for the onmouseover re-writing of links in the status
    bar. This type of ruse has become less effective as browsers usually
    ignore it.
    """
    from bs4 import BeautifulSoup
    retVal = 0
    try:
        sess = dryscrape.Session()
        sess.visit(self.url)
        response = sess.body()
        soup = BeautifulSoup(response)
        for tag in soup.findAll('a'):
            if tag.has_attr('onmouseover'):
                match = re.search(r'window.status', tag['onmouseover'])
                if match:
                    retVal = 1
                else:
                    retVal = -1
            if tag.has_attr('href'):
                # matches the href=javascript tag
                hrefMatch = re.search(r'javascript', tag['href'])
                if hrefMatch:
                    retVal = 1
                else:
                    retVal = -1
    except:
        printFormat("exc", "onMouseOver", "On mouse over exception")
    self.phishScore['onMouseOver'] = retVal
    return retVal

def getRuta(soup, url):
    # session = dryscrape.Session()
    # session.visit(url)
    # response = session.body()
    # soup = BeautifulSoup(response)
    htmlRuta = soup.find_all(class_='adp-directions')
    while len(htmlRuta) == 0:
        print 'Error: the route is empty. Waiting 15 seconds before continuing and retrying'
        time.sleep(15)
        session = dryscrape.Session()
        session.visit(url)
        response = session.body()
        soupinterno = BeautifulSoup(response)
        htmlRuta = soupinterno.find_all(class_='adp-directions')
        del soupinterno
        del response
        del session
    rutaParseada = ruta(str(htmlRuta[0]))
    rutaPorPasosJSON = []
    for paso in rutaParseada:
        if 'El destino' in paso[1]:
            paso[1] = paso[1].replace('El destino', '. El destino')
        pasoJSON = {
            'numero_paso': paso[0].replace('.', ''),
            'descripcion_recorrido': paso[1],
            'distancia_recorrida': paso[2]
        }
        rutaPorPasosJSON.append(pasoJSON)
    return json.dumps(rutaPorPasosJSON)

def find(self, string):
    url = "https://duckduckgo.com/?q=procon.org"
    for word in string.split(" "):
        url += "+" + word
    url += "&t=h_&ia=web"
    website_string = requests.get(url).text
    session = dryscrape.Session()
    session.visit(url)
    website_string = session.body()
    search_results = re.findall('<span class="result__url__domain">(.*?)</span>', website_string, flags=re.DOTALL)
    base_url = search_results[0]
    links = re.findall("<a.*?newblue-get-started-(.*?)'", requests.get(base_url).text)
    self.background_url = base_url
    if links != []:
        self.argument_url = base_url + links[1]
    else:
        self.argument_url = base_url

def listMovieScripts():
    dryscrape.start_xvfb()
    session = dryscrape.Session()
    imsdbLink = "http://www.imsdb.com/all scripts/"
    session.visit(imsdbLink)
    webContent = session.body()
    bs = BeautifulSoup(webContent)
    movies = bs.findAll(lambda tag: tag.name == 'p')
    links = {}
    writers = {}
    for movie in movies:
        # <p><a href="/Movie Scripts/Boyhood Script.html" title="Boyhood Script">Boyhood</a> (Undated Draft)<br><i>Written by Richard Linklater</i><br></p>
        movie_title = movie.find(lambda tag: tag.name == 'a').text
        if movie_title.endswith(", The"):
            movie_title = "The " + movie_title.replace(", The", "")
        movie_url = "http://www.imsdb.com" + urllib.quote(movie.find(lambda tag: tag.name == 'a').get("href"))
        movie_writer = movie.find(lambda tag: tag.name == 'i').text
        movie_writer = movie_writer.replace("Written by ", "")
        movie_writer_list = getlastNames(movie_writer.split(","))
        # print movie_title, movie_url, movie_writer_list
        links[movie_title] = movie_url
        writers[movie_title] = movie_writer_list
    return (links, writers)

def get_dryscape_session(self):
    if self.session:
        self.session.reset()
        return self.session
    else:
        self.session = dryscrape.Session()
        return self.session

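# A minimal standalone sketch of the reuse-and-reset pattern behind
# get_dryscape_session() above: keep one dryscrape session alive and call reset()
# between visits instead of spawning a new WebKit process per page. The function
# name is illustrative only.
import dryscrape

def fetch_all(urls):
    session = dryscrape.Session()
    pages = []
    for url in urls:
        session.visit(url)
        pages.append(session.body())
        session.reset()  # clear page state before the next visit
    return pages
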
def __init__(self, url):
    self.__url = url
    self.__haveResult = True
    self.__pageNum = 0
    self.__dataList = []
    sess = dryscrape.Session()
    sess.set_attribute('auto_load_images', False)
    sess.visit(url)
    time.sleep(.5)
    content = sess.body()
    self.soup = bs4.BeautifulSoup(content, "lxml")
    # set mode by looking for a substring in the url: 'delivery' or 'pickup' (different html format)
    if 'delivery' in self.__url:
        self.__mode = 'delivery'
    else:
        self.__mode = 'pickup'
    # check if there is any restaurant data in this area
    resultCheck = self.soup.select('.no-results-h3')
    if len(resultCheck):
        # no result
        self.__haveResult = False
    else:
        # got a result, then find the total page number
        self.__pageNum = int(
            self.soup.select('.searchResults-footer-stats')[0].find_all('span')[3].string)
        # print '----total items: ' + str(self.soup.select('.searchResultsSnippet-text')[1].string)
    print '----' + self.__mode + ': total pages number:' + str(self.__pageNum)

def get_premarket_volume():
    session = dryscrape.Session()
    session.visit(url)
    response = session.body()
    soup = BeautifulSoup(response, 'html.parser')
    right_table = soup.find('table', {'id': 'preOpenNiftyTab'})
    rows = []
    for row in right_table.find_all('tr'):
        rows.append([
            " ".join(val.text.encode('utf8').replace(" NFO", "").replace(" NSE", "").replace(" MCX", "").replace(",", "").split())
            for val in row.find_all('td')
        ])
    sorted_rows = sorted(rows[2:], key=lambda x: float(x[7]), reverse=True)
    top_stocks = []
    for row in sorted_rows[:10]:
        top_stocks.append(row[0])
        sc.api_call("chat.postMessage",
                    channel="@dexter",
                    text='Volume - Stock:{} Volume:{} Change%:{}'.format(row[0], row[7], row[5]),
                    as_user=False)
    return top_stocks

def getClothes():
    """Fetch a list of clothes from a Saga Falabella URL.

    Args:
        url: Base men's-section URL address

    Returns:
        This script creates a JSON filter with the clothes' categories

    Example:
        python3 firebase_script_clothes.py <url>
    """
    sess = dryscrape.Session()
    sess.set_attribute('auto_load_images', False)
    sess.visit(sys.argv[1])
    source = sess.body()
    soup = bs.BeautifulSoup(source, 'lxml')
    b = soup.select('div.fb-pod__item')  # vertical-filters-custom
    anchors = [a for item in b for a in item.findAll(name='a')]
    for index, clothe in enumerate(anchors):
        try:
            print(clothe['href'])
            sess.visit(clothe['href'])  # dryscrape
            source = sess.body()  # dryscrape
            soup = bs.BeautifulSoup(source, 'lxml')
            # x = soup.select('div.row.transcripts.video-transcripts')  # principal parent tag
            # for subtitle_block in x[0].findAll(name='span'):
            #     saveFile(name_clothe, subtitle_block)
        except:
            print("error")

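# Hypothetical entry point matching the Example in the getClothes() docstring above
# ("python3 firebase_script_clothes.py <url>"); it only adds a guard for a missing
# URL argument, since getClothes() itself reads sys.argv[1].
if __name__ == '__main__':
    import sys
    if len(sys.argv) < 2:
        print("usage: python3 firebase_script_clothes.py <url>")
        sys.exit(1)
    getClothes()
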
def testURL(section_url):
    # # Put the stuff you see when using Inspect Element in a variable called html.
    # html = urlopen(section_url).read()
    # # Parse the stuff.
    # print html[:100]
    # soup = BeautifulSoup(html, "lxml")
    # # The next two lines will change depending on what you're looking for. This
    # # line is looking for <dl class="boccat">.
    # boccat = soup.find("dl", "ba-Eb-ba")
    # # This line organizes what is found in the above line into a list of
    # # hrefs (i.e. links).
    # print boccat
    # category_links = [BASE_URL + dd.a["href"] for dd in boccat.findAll("dd")]
    # return category_links
    session = dryscrape.Session(base_url=BASE_URL)
    session.visit(section_url)
    print(type(session))
    response = session.body()
    print(type(response))
    print response
    soup = BeautifulSoup(response, 'lxml')
    # results = soup.findAll("div", {"class": "ba-Eb-ba"})
    results = soup.findAll("div", class_="ba-Eb-ba")
    # results = soup.findAll(text='ga:type=\"Comment\"')
    print results
    print len(results)
    # print(soup)
    # print len(soup)
    return 0

def main():
    url = input('URL: ')
    name = input('Name to save images: ')
    session = dryscrape.Session()
    session.visit(url)
    response = session.body()
    soup = BeautifulSoup(response, "html.parser")
    if not os.path.isdir('img'):
        os.mkdir('img')
    # Find images in links
    links = soup.findAll('a', {'href': True})
    for link in links:
        if link.findAll('img'):
            img_url = link['href']
            queue.put(img_url)
    # Find images in <img> tags
    imgs = soup.findAll('img', {'src': True})
    for img in imgs:
        img_url = img['src']
        queue.put(img_url)
    create_workers(url, name)

def scrape():
    session = dryscrape.Session()
    session.set_attribute('auto_load_images', False)
    session.visit(URL)
    response = session.body()
    global_tree = html.fromstring(response)
    products_links = global_tree.xpath('//div[@id="J_goodsList"]/ul/li/div/div[1]/a/@href')
    output = []
    validator = Validator(ScrapeStrategy(scrape_first_type_object),
                          ScrapeStrategy(scrape_second_type_object))
    for i, products_link in enumerate(products_links):
        t0 = datetime.now()
        session.visit('http:' + products_link)
        response = session.body()
        tree = html.fromstring(response)
        strategy = validator.choose_strategy(tree)
        output.append(strategy.scrape(tree, products_link))
        print datetime.now() - t0
    with open('products.csv', 'w') as csvfile:
        fieldnames = ['Brand', 'MPN', 'URL', 'Name', 'Price', 'Stock']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for obj in output:
            writer.writerow(obj.as_dict())

def process_sites(self):
    # Determine if a dryscrape session is required for a js-reliant site
    need_session = any([x.require_js for x in self.sites])
    if need_session:
        print "Dryscrape session required, creating now.."
        session = dryscrape.Session()
        print "Dryscrape session created"
    for site in self.sites:
        # If it requires JS, use dryscrape
        if site.require_js:
            session.visit(site.url)
            page = session.body()
        # Otherwise just use requests
        else:
            page = requests.get(site.url).text
        # Processing is done the same regardless
        soup = BeautifulSoup(page, "lxml")
        processed_data = site.process_page(soup)
        self.processed_auctions[site.base_url] = processed_data
    print ""
    print "Done processing sites"
    PP.pprint(self.processed_auctions)

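# A standalone sketch of the decision process_sites() makes above: pages flagged as
# JavaScript-reliant are rendered with dryscrape, everything else is fetched with
# plain requests. The function name and the require_js flag are illustrative
# assumptions, not part of the original class.
import dryscrape
import requests

def fetch_html(url, require_js=False):
    if require_js:
        session = dryscrape.Session()
        session.visit(url)
        return session.body()
    return requests.get(url).text
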