def parse_review(url, df):
    x_rating = '//span[@itemprop="ratingValue"]'
    x_text = '//div[@itemprop="reviewBody"]'
    x_pro = '//span[@class="plus"]'
    x_con = '//span[@class="minus"]'
    g = grab.Grab()
    ua = random.choice(user_agents)
    print(ua)
    g.go(url, user_agent=ua)
    rating = g.doc.select(x_rating).text()
    text = " ".join(g.doc.select(x_text).text_list())
    text = text.replace(".", ". ")
    text = text.replace(",", ", ")
    text = text.replace("!", "! ")
    try:
        pro = " ".join(g.doc.select(x_pro).text_list())
        con = " ".join(g.doc.select(x_con).text_list())
    except Exception:
        pro = float('NaN')
        con = float('NaN')
    # time.sleep(120)
    df.loc[len(df)] = [rating, text, pro, con]
    return df
def parse_playlist_time(url: str) -> Tuple[int, List[Tuple[str, str]]]:
    """Parse a playlist page and sum the durations of its videos."""
    import grab
    g = grab.Grab()
    if PROXY:
        g.setup(proxy=PROXY, proxy_type=PROXY_TYPE)
    # Send an invalid User-Agent so YouTube returns a pre-rendered page
    # (the data is present in the HTML) rather than a page that embeds the
    # data in a JavaScript object.
    g.setup(headers={'User-Agent': 'null'})
    g.go(url)
    video_list = g.doc.select('//*[@class="pl-video yt-uix-tile "]')
    time_list = g.doc.select('//*[@class="timestamp"]')
    total_seconds = 0
    items = []
    for title, time in zip(video_list, time_list):
        title = title.attr('data-title')
        time_str = time.text()
        items.append((title, time_str))
        total_seconds += time_to_seconds(time_str)
    return total_seconds, items
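The time_to_seconds helper called above is not defined in this snippet; a minimal sketch, assuming it follows the same hh:mm:ss / mm:ss / plain-seconds conversion that count_total_playlist_time below performs inline:

# Hypothetical helper (not part of the original snippet): convert a YouTube
# timestamp such as '1:02:37', '4:05' or '37' into a number of seconds.
def time_to_seconds(time_str):
    parts = [int(p) for p in time_str.split(':')]
    if len(parts) == 3:
        h, m, s = parts
        return h * 3600 + m * 60 + s
    if len(parts) == 2:
        m, s = parts
        return m * 60 + s
    return parts[0]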
def count_total_playlist_time(url, proxy=None, proxy_type='http'):
    """Parse a playlist page and sum the durations of its videos."""
    import grab
    g = grab.Grab()
    if proxy:
        g.setup(proxy=proxy, proxy_type=proxy_type)
    g.go(url)
    video_list = g.doc.select('//*[@class="pl-video yt-uix-tile "]')
    time_list = g.doc.select('//*[@class="timestamp"]')
    total_seconds = 0
    print('Playlist:')
    for i, (video, time) in enumerate(zip(video_list, time_list), 1):
        time_str = time.text()
        print('{}. {} ({})'.format(i, video.attr('data-title'), time_str))
        time_split = time_str.split(':')
        if len(time_split) == 3:
            h, m, s = map(int, time_split)
            total_seconds += h * 60 * 60 + m * 60 + s
        elif len(time_split) == 2:
            m, s = map(int, time_split)
            total_seconds += m * 60 + s
        else:
            total_seconds += int(time_split[0])
    return total_seconds
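A minimal usage sketch for the function above; the playlist URL is a placeholder, not taken from the source:

# Hypothetical usage: print the total playlist length as H:MM:SS.
if __name__ == '__main__':
    playlist_url = 'https://www.youtube.com/playlist?list=...'  # placeholder
    seconds = count_total_playlist_time(playlist_url)
    hours, remainder = divmod(seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    print('Total: {}:{:02d}:{:02d}'.format(hours, minutes, secs))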
def gather_models(url):
    g = grab.Grab()
    g.go(url)
    links = g.doc.select(
        "//*[@id='content']/div[2]/div[2]/div[2]/div/div[3]/a")
    print(links)
def __init__(self, user, password, base, useragent=None, transport='pycurl'):
    self.base = base
    self.user = user
    self.password = password
    self.cookie_set = False
    # Browser instance
    self.browser = grab.Grab(timeout=DEFAULT_TIMEOUT)
    self.browser.setup_transport(transport)
    if transport == "urllib3":
        import urllib3
        import certifi
        self.browser.transport.pool = urllib3.PoolManager(
            cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
    # Grab automatically handles cookies.
    # Are we anonymous?
    self.anonymous = (user == '')
    # Identify ourselves
    if useragent is not None:
        self.browser.setup(headers={'User-agent': useragent})
def start_button(self):
    link = self.ui.lineEdit.text()
    if re.search('https://www.supremenewyork.com/shop/(.+?)', link):
        # Get all the values from the database to fill in the delivery
        # information for the bot:
        db_data = DB.execute(
            'select * from USER_DATA where id=2').fetchall()
        DATA['url_1'] = link
        DATA['name'] = db_data[0][1]
        DATA['email'] = db_data[0][2]
        DATA['tel'] = db_data[0][3]
        DATA['address'] = db_data[0][4]
        DATA['city'] = db_data[0][5]
        DATA['post_code'] = db_data[0][6]
        DATA['country'] = db_data[0][7]
        DATA['card'] = db_data[0][8]
        DATA['card_number'] = db_data[0][9]
        DATA['valid_month'] = db_data[0][10]
        DATA['valid_year'] = db_data[0][11]
        DATA['cvv'] = db_data[0][12]
        if self.ui.radioButton_8.isChecked():
            DATA['size'] = 'Medium'
        elif self.ui.radioButton_9.isChecked():
            DATA['size'] = 'Large'
        elif self.ui.radioButton_10.isChecked():
            DATA['size'] = 'XLarge'
        if self.ui.radioButton.isChecked():
            DATA['img'] = 'on'
        elif self.ui.radioButton_2.isChecked():
            DATA['img'] = 'off'
        # Check that the proxy is valid before starting the order thread:
        if self.ui.lineEdit_12.text():
            checker = grab.Grab()
            checker.setup(proxy=self.ui.lineEdit_12.text().strip(),
                          proxy_type='http', connect_timeout=5, timeout=5)
            try:
                checker.go('https://www.supremenewyork.com/')
                DATA['proxy'] = self.ui.lineEdit_12.text().strip()
                self.bot_thread = OrderThread()
                self.bot_thread.start()
            except grab.GrabError:
                self.proxy_error = ProxyError()
                self.proxy_error.show()
        else:
            self.bot_thread = OrderThread()
            self.bot_thread.start()
def __getPage(self, url, hammer_mode=True):
    g = grab.Grab()
    g.setup(connect_timeout=1, timeout=3)
    g.setup(hammer_mode=hammer_mode,
            hammer_timeouts=((3, 5), (5, 7), (7, 9), (15, 20), (50, 60)))
    g.go(url)
    return g.response
def __init__(self, config):
    self.order = config['order']
    self.grab_connect_timeout = int(config['connect_timeout'])
    self.grab_download_timeout = int(config['download_timeout'])
    self.grab = grab.Grab()
    self.grab.setup(connect_timeout=self.grab_connect_timeout,
                    timeout=self.grab_download_timeout)
    self.proxy_list_filename = ''
def __init__(self, workdir):
    self.workdir = workdir.rstrip(os.path.sep) + os.path.sep
    self.g = grab.Grab()
    self.g.cookies.set(name='over18', value='yeah', domain='.imgsrc.ru',
                       path='/', expires=time.time() + 3600 * 24)
    self.go(self.host)
def _auth_and_get_api_key(self):
    g = grab.Grab()
    g.go(url='https://home.openweathermap.org/users/sign_in')
    g.doc.set_input('user[email]', self.user)
    g.doc.set_input('user[password]', self.password)
    g.doc.submit()
    g.go(url='https://home.openweathermap.org/api_keys')
    for i in g.doc(".//div").text_list():
        if re.match('Key Name [a-zA-Z0-9]*.*', i) is not None:
            return i.split()[2]
def get_articles(): g = grab.Grab() parser.setup_grab(g) g.go("http://it.toolbox.com") css_path = ".tile .tileContent div .floatleft a" posts = parser.get_articles(g, css_path, css_path, "ittoolbox") return posts
def test():
    g = grab.Grab()
    g.go('http://habrahabr.ru/post/266293/')
    root_node = g.doc.tree.cssselect('.post_show')[0]
    text = hr.html_to_readable(root_node)
    path = 'out'
    if not os.path.exists(path):
        os.mkdir(path)
    outpath = os.path.join(path, 'out.log')
    with codecs.open(outpath, 'w', encoding='utf-8') as fh:
        fh.write(text)
def get_data(url):
    df = pd.DataFrame(columns=['Time', 'Parking', 'Level', 'Option', 'Spaces'])
    update_time = datetime.datetime.now()
    html_doc = grab.Grab().go(url).body
    soup = BeautifulSoup(html_doc, 'lxml')
    structures = soup.find_all('table')
    for structure in structures:
        structure_df = get_parking(structure, update_time)
        df = df.append(structure_df, ignore_index=True)
    return df
def login(user, password):
    g = grab.Grab()
    g.setup(post={
        "user[email]": user,
        "user[password]": password,
        "grant_type": "password",
        "authenticity_token": "undefined"
    }, timeout=60000)
    g.go(SIGN_IN_URL)
    return g
def gather_models(url):
    g = grab.Grab()
    g.go(url)
    links = []
    flinks = g.doc.select("//*[@id='content']/div[2]/div[2]/div/div/div[3]/a")
    for fl in flinks:
        links.append("http://irecommend.ru" + fl.attr("href"))
    # print(links)
    return links
def search(self, search_term="", journal_title_issn="", volume_year="", issue="", pages="", number_results=25): g = grab.Grab() request = { "s": search_term, "journalid": journal_title_issn, "v": volume_year, "i": issue, "p": pages, "redirect": "0" } if sys.version_info[0] < 3: url = self.url+"?"+ \ urllib.urlencode(request) else: url = self.url+"?"+ \ urllib.parse.urlencode(request) g.go(url) search_result = [] #body > font:nth-child(7) Displayed first 100 results #body > font:nth-child(7) Found 1 results nresults = re.search( r'([0-9]*) results', g.doc.select("/html/body/font[1]").one().text()) nresults = int(nresults.group(1)) pages_to_load = int(math.ceil(number_results / 25.0)) # Pages needed to be loaded # Check if the pages needed to be loaded are more than the pages available if pages_to_load > int(math.ceil(nresults / 25.0)): pages_to_load = int(math.ceil(nresults / 25.0)) for page in range(1, pages_to_load + 1): if len(search_result ) > number_results: # Check if we got all the results break url = "" request.update({"page": page}) if sys.version_info[0] < 3: url = self.url+"?"+ \ urllib.urlencode(request) else: url = self.url+"?"+ \ urllib.parse.urlencode(request) g.go(url) search_result += self.__parse(g) if page != pages_to_load: # Random delay because if you ask a lot of pages,your ip might get blocked. time.sleep(random.randint(250, 1000) / 1000.0) return search_result[:number_results]
def __init__(self, **kwargs):
    self.result = None
    self.g = grab.Grab(timeout=5, connect_timeout=5, user_agent='METASCAN')
    page_type = self.determine_page_type(self.search_url.format(**kwargs))
    logger.info(page_type)
    if page_type == 'error':
        self.result = None
    elif page_type == 'search_page':
        self.search_page(**kwargs)
        self.result = self.make_json_from_page()
    elif page_type == 'vulns_page':
        self.vulns_page()
        self.result = self.make_json_from_page()
def setup_grab(self):
    self.grab = grab.Grab()
    self.current_proxy = random.choice(self.proxy_list)
    # The proxy '_id' is a URL of the form 'scheme://host:port'; split it
    # into scheme and address.
    first_slash_index = self.current_proxy['_id'].find('/')
    proxy_type = self.current_proxy['_id'][:first_slash_index - 1]
    proxy_address = self.current_proxy['_id'][first_slash_index + 2:]
    proxy_lag = self.current_proxy['latency']
    self.grab.setup(connect_timeout=self.grab_connect_timeout + proxy_lag,
                    timeout=self.grab_download_timeout + proxy_lag,
                    proxy=proxy_address,
                    proxy_type=proxy_type)
def process_url(url):
    logger.info('Processing %s...' % url)
    g = grab.Grab()
    resp = g.go(url)
    links = []
    for found_url in parser.parse_urls(resp.body.decode()):
        logger.info('Found URL: %s' % found_url)
        links.append(found_url)
    if links:
        io.save_links(url, links)
def del4flash():
    g = grab.Grab()
    g.go('http://172.25.63.1/myconnect/')
    text = g.css_text('body')
    text = text.split()
    # Skip everything up to and including the 'Size' token, then issue a
    # DELETE request for every entry listed after it.
    already = 0
    for i in range(len(text)):
        if text[i] == 'Size' or already == 1:
            if already == 0:
                already = 1
                continue
            else:
                r = requests.delete('http://172.25.63.1/myconnect/' + text[i])
def gather_reviews(url, proxy):
    g = grab.Grab()
    # g.setup(proxy=proxy)
    g.go(url)
    links = []
    flinks = g.doc.select(
        '//*[@id="quicktabs_tabpage_12388_myreviewinfo"]/div/div/div/ul/li/div/div/p/nobr/a'
    )
    # Alternative XPath (left over from debugging):
    # //*[@id="quicktabs_tabpage_12388_myreviewinfo"]/div/div[1]/div/ul/li/div/div/p/nobr/a
    for fl in flinks:
        print(fl)
        links.append("http://irecommend.ru" + fl.attr("href"))
    return links
def main():
    settings.init()
    settings.logger = log.Log(settings.log_file_name)
    settings.logger.daemon = True
    settings.logger.start()
    settings.logger.log(
        'Starting grabber {name}'.format(name=settings.irc_nick))
    tools.create_dir(settings.dir_ready)
    tools.create_dir(settings.dir_new_lists)
    tools.create_dir(settings.dir_old_lists)
    if not os.path.isfile(settings.target_main):
        raise Exception(
            "Please add an rsync target to file '{}'.".format(
                settings.target_main))
    settings.irc_bot = irc.IRC()
    settings.irc_bot.daemon = True
    settings.irc_bot.start()
    time.sleep(30)
    settings.upload = upload.Upload()
    settings.upload.daemon = True
    settings.upload.start()
    settings.grab = grab.Grab()
    settings.grab.daemon = True
    settings.grab.start()
    while settings.running:
        # if not settings.logger.isAlive():
        #     print('The logger stopped running...')
        #     settings.irc_bot.send('PRIVMSG', 'The logger stopped running...',
        #                           settings.irc_channel_bot)
        #     settings.running = False
        # if not settings.irc_bot.isAlive():
        #     print('The IRC bot stopped running...')
        #     settings.running = False
        # if not settings.upload.isAlive():
        #     print('The uploader stopped running...')
        #     settings.irc_bot.send('PRIVMSG', 'The uploader stopped running...',
        #                           settings.irc_channel_bot)
        #     settings.running = False
        # if not settings.grab.isAlive():
        #     print('The grabber stopped running...')
        #     settings.irc_bot.send('PRIVMSG', 'The grabber stopped working...',
        #                           settings.irc_channel_bot)
        #     settings.running = False
        time.sleep(1)
def _get(self, url, **kwargs):
    grabber = grab.Grab()
    grabber.reset()
    grabber.setup(
        connect_timeout=5,
        timeout=300,
        hammer_mode=True,
        hammer_timeouts=((300, 360), (360, 420), (420, 480)),
    )
    if kwargs:
        grabber.setup(**kwargs)
    if self.proxy_enabled:
        if hasattr(self, 'proxy'):
            grabber.setup(proxy=self.proxy, proxy_type='http')
        if hasattr(self, 'proxy_auth'):
            grabber.setup(proxy_userpwd=self.proxy_auth)
    grabber.go(url)
    return grabber.response.body
async def prx_srv(proxies):
    while True:
        proxy_for_parser = await proxies.get()
        print('start_proxy_for_parser')
        print('Found proxy: %s' % proxy_for_parser)
        g = grab.Grab()
        print('1')
        g.setup(proxy=proxy_for_parser.host + ':' + str(proxy_for_parser.port),
                proxy_type='http')
        print('2')
        try:
            g.go('http://www.google.com/search?q=Spam')
        except Exception:
            # Ignore fetch errors through this proxy.
            pass
        print('2.1')
        print(g.doc.url)
        print('3')
        proxy_for_parser = await proxies.get()
def parse_article_image(article, site_url=''):
    try:
        img = article.cssselect('img:first-child')[0]
        img.set('class', '')
        img.set('id', '')
        img.set('align', '')
        img.set('src', absolutize_link(img.get('src', ''), site_url))
        return tostring(img).strip()
    except IndexError:
        return b''
    except AttributeError:
        try:
            img = grab.Grab(article).css_one('img:first-child')
        except GrabError:
            return b''
        img.set('class', '')
        img.set('id', '')
        img.set('align', '')
        img.set('src', absolutize_link(img.get('src'), site_url))
        return tostring(img).strip()
def get_articles():
    g = grab.Grab()
    parser.setup_grab(g)
    g.go('http://planet.clojure.in')
    css_path = '.entry .article > h2 a'
    summary_texts = []
    for elem in g.css_list(".entry .article"):
        text = ''
        for children in elem.getchildren()[1:-1]:
            text += parser.remove_bad_tags(tostring(children).decode())
        summary_texts.append(text)
    posts = parser.get_articles(g, css_path, css_path, 'planetclojure',
                                'planet.clojure.in')
    for (post, summary_text) in zip(posts, summary_texts):
        post['summary'] = summary_text
    return posts
def search(self, search_term="", pages="", number_results=25): # TODO: Add Batch search for comics. g = grab.Grab() request = {"s": search_term, "p": pages} if sys.version_info[0] < 3: url = self.url+"?"+ \ urllib.urlencode(request) else: url = self.url+"?"+ \ urllib.parse.urlencode(request) g.go(url) search_result = [] nresults = re.search( r'([0-9]*) results', g.doc.select("/html/body/font[1]").one().text()) nresults = int(nresults.group(1)) pages_to_load = int(math.ceil(number_results / 25.0)) # Pages needed to be loaded # Check if the pages needed to be loaded are more than the pages available if pages_to_load > int(math.ceil(nresults / 25.0)): pages_to_load = int(math.ceil(nresults / 25.0)) for page in range(1, pages_to_load + 1): if len(search_result ) > number_results: # Check if we got all the results break url = "" request.update({"page": page}) if sys.version_info[0] < 3: url = self.url+"?"+ \ urllib.urlencode(request) else: url = self.url+"?"+ \ urllib.parse.urlencode(request) g.go(url) search_result += self.__parse(g) if page != pages_to_load: # Random delay because if you ask a lot of pages,your ip might get blocked. time.sleep(random.randint(250, 1000) / 1000.0) return search_result[:number_results]
def __choose_mirror(self):
    g = grab.Grab()
    if self.mirrors is None:
        raise MissingMirrorsError("There are no mirrors!")
    if isinstance(self.mirrors, str):
        self.mirrors = [self.mirrors]
    last = len(self.mirrors) - 1
    for i, mirror in enumerate(self.mirrors):
        try:
            g.go(mirror)
            self.__selected_mirror = mirror
            categories = g.doc(
                "//input[contains(@name,'lg_topic')]").node_list()
            for category in categories:
                if category.attrib["value"] == "libgen":
                    self.libgen = self.__Libgen(
                        g.make_url_absolute(
                            category.getnext().attrib["href"]))
                elif category.attrib["value"] == "scimag":
                    self.scimag = self.__Scimag(
                        g.make_url_absolute(
                            category.getnext().attrib["href"]))
                elif category.attrib["value"] == "fiction":
                    self.fiction = self.__Fiction(
                        g.make_url_absolute(
                            category.getnext().attrib["href"]))
                elif category.attrib["value"] == "comics":
                    self.comics = self.__Comics(
                        g.make_url_absolute(
                            category.getnext().attrib["href"]))
            break
        except grab.GrabError:
            if i == last:
                raise MirrorsNotResolvingError(
                    "None of the mirrors are resolving; check that they are "
                    "correct and that you have a connection!")
def search(self, search_term, column="title", number_results=25): g = grab.Grab() request = {"req": search_term, "column": column} if sys.version_info[0] < 3: url = self.url+"/search.php?"+ \ urllib.urlencode(request) else: url = self.url+"/search.php?"+ \ urllib.parse.urlencode(request) g.go(url) search_result = [] nbooks = re.search( r'([0-9]*) (books|files)', g.doc.select("/html/body/table[2]/tr/td[1]/font").text()) nbooks = int(nbooks.group(1)) pages_to_load = int(math.ceil(number_results / 25.0)) # Pages needed to be loaded # Check if the pages needed to be loaded are more than the pages available if pages_to_load > int(math.ceil(nbooks / 25.0)): pages_to_load = int(math.ceil(nbooks / 25.0)) for page in range(1, pages_to_load + 1): if len(search_result ) > number_results: # Check if we got all the results break url = "" request.update({"page": page}) if sys.version_info[0] < 3: url = self.url+"/search.php?"+ \ urllib.urlencode(request) else: url = self.url+"/search.php?"+ \ urllib.parse.urlencode(request) g.go(url) search_result += self.__parse(g.doc) if page != pages_to_load: # Random delay because if you ask a lot of pages,your ip might get blocked. time.sleep(random.randint(250, 1000) / 1000.0) return search_result[:number_results]
def login(request): uname = request.POST.get("username") pwd = request.POST.get("password") g = grab.Grab(timeout=30,user_agent="Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)") g.go("https://login.uci.edu/ucinetid/webauth") g.doc.set_input('ucinetid', uname) g.doc.set_input('password', pwd) resp = g.doc.submit() mySoup = BeautifulSoup(resp.body, "html.parser") s = mySoup.find("div", {"id": "error-message"}) try: return render(request,"login.html",{"data":[s.get_text()]}) except: f = open("encript.txt", 'w') f.write(uname) f.write(",") f.write(pwd) f.close() return render(request, "search.html");