def parse_home(self):
    session = HTMLSession()
    # Retry transient network failures up to 3 times.
    session.mount('http://', HTTPAdapter(max_retries=3))
    session.mount('https://', HTTPAdapter(max_retries=3))
    print("parsing: " + self.homepage + "/read-the-story/")
    try:
        with session.get(self.homepage + "/read-the-story/", timeout=(5, 10)) as buf:
            chapters = buf.html.find('#chapters', first=True)
            if chapters is None:
                return
            chapter_list = chapters.find('.chapter__box')
            for chapter in chapter_list:
                url = chapter.links.pop()
                name = re.sub(r'Chapter [\d]*', '', chapter.full_text.strip()).strip()
                index = re.search(r'Chapter [\d]*', chapter.full_text.strip()).group()
                self.chapters.append(BookChapter(name, index, url))
    except Exception as e:
        print(e)
    finally:
        # Always log completion and release the session, even on early return.
        print("finish: " + self.homepage + "/read-the-story/")
        session.close()
def getNowPlaying():
    session = HTMLSession()
    try:
        r = session.get(
            'http://composer.nprstations.org/widgets/iframe/now.html?station=50e451b6a93e91ee0a00028e'
        )
        # Execute the page's JavaScript so the "now playing" widget is populated.
        r.html.render()
    except Exception:
        print("Error encountered with page render.\n")
        session.close()
        return {'song': "", 'artist': "", 'program': ""}

    s = r.html.find('li.whatson-songTitle', first=True)
    a = r.html.find('li.whatson-songArtist', first=True)
    p = r.html.find('a.whatson-programName', first=True)
    s = s.text if s else ""
    a = a.text if a else ""
    p = p.text if p else ""
    session.close()
    return {'song': s, 'artist': a, 'program': p}
def _get_html_page(self, url):
    '''_get_html_page: load the web page that holds the data to scrape.

    args:
        url (string)
    return:
        html_page (requests.Response or None, see the Python requests docs)

    (1) Create the object that will fetch the HTML.
    (2) Try up to 3 times to load the page at the given URL, in case of network problems.
    (3) If the load succeeds, keep the page; otherwise treat it as missing.
    '''
    # (1)
    html_page = None
    session = HTMLSession()
    i = 0
    # (2)
    while html_page is None and i < 3:
        if i > 0:
            print("\t retrying ...")
        try:
            html_page = session.get(url)
        except Exception:
            # Swallow network errors so the loop can retry.
            html_page = None
        i += 1
    # (3)
    if html_page is not None and html_page.status_code == 200:
        html_page.html.render(sleep=self._waiting, keep_page=True, scrolldown=1)
    else:
        html_page = None
    session.close()
    return html_page
def scrape_urls(urls):
    relpath = os.path.dirname(__file__)
    outname = os.path.join(relpath, 'data/vodinfos.csv')
    with open(outname, 'w+') as outfile:
        outfile.write('vodID,Streamer,Category,Views,Length\n')  # header
        for link in urls:
            session = HTMLSession()
            jpage = session.get(link)
            jpage.html.render()
            vodid = link.split('/')[-1]
            info = jpage.html.text
            finfo = info.split('\n')
            streamer = finfo[0].split(' ', 1)[0]
            cat, views, length = None, None, None
            for i, line in enumerate(finfo):
                if line.startswith('Category'):
                    cat = finfo[i + 1]
                elif line.startswith('Total Views'):
                    views = finfo[i - 1].replace(',', '')
                elif line.startswith('00:00:00'):
                    length = finfo[i + 1]
                if cat is not None and views is not None and length is not None:
                    break
            output = f'{vodid},{streamer},{cat},{views},{length}\n'
            print(output)
            outfile.write(output)
            session.close()
            time.sleep(10)  # prevents rate limit
def get_direct_link(link):
    session = HTMLSession()
    session.headers["Accept-Language"] = "zh-CN"
    # HEAD requests are not redirected, so the target URL sits in the Location header.
    response = session.head(link)
    url = response.headers["Location"]
    session.close()
    return url
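# Hedged usage sketch (not part of the original source): get_direct_link relies on the
# HEAD request answering with a redirect so the target URL appears in the "Location"
# header. The short URL below is a hypothetical placeholder.
def _example_get_direct_link_usage():
    short_url = "https://example.com/r/abc123"  # hypothetical redirecting URL
    print(get_direct_link(short_url))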
def mercari(request):
    driver_set()
    file = finders.find('Fake/log/mercari.csv')
    csv_file = open(file, 'w')
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['Product', 'Link'])
    session = HTMLSession()
    r = session.get(
        'https://www.mercari.com/search/?keyword=clocky%20alarm%20clock')
    r.html.render(timeout=20)
    damn = ["$39.99", "$45.99"]
    article = r.html.find('div.Grid2__Col-mpt2p4-0')
    for i in article:
        aa = i.find('a', first=True)
        title = aa.attrs['alt']
        link = aa.absolute_links
        price = i.find('span', first=True).text
        if price not in damn:
            for li in link:
                csv_writer.writerow([title, li])
    csv_file.close()
    session.close()
    return render(request, 'Fake/mercari.html')
def getParkingData():
    session = HTMLSession()
    url = 'https://parkingapps.ucr.edu/spaces/'
    r = session.get(url)
    r.html.render(sleep=1, keep_page=True, scrolldown=1)
    lots = r.html.find('.col-sm-6')
    parking_lots = []
    for i in range(0, len(lots) - 1):
        # All info for a single lot comes back as one string with fields separated by newlines,
        # e.g. 'Lot 24-\n3:40pm\nCanyon Crest Drive\nFree Spaces\n405\nOccupancy\n0%\n'
        split_string = lots[i].text.split('\n')
        parking_lot = {
            'Parking Lot': split_string[0].replace('-', ''),
            'Time': split_string[1],
            'Free Spaces': split_string[4],
            'Occupancy': split_string[6],
        }
        print(parking_lot)
        parking_lots.append(parking_lot)
    session.close()
    return parking_lots
def getResult(parametro, anno):
    filename = parametro + 'tutte' + anno + '.csv'
    filename = os.path.join(path_result, filename)
    url = "https://www.arpa.veneto.it/bollettini/storico/Mappa_" + anno + "_" + parametro + ".htm?t=RG"
    print(url)
    main_page = request.urlopen(url)
    main_page_html = main_page.read()
    main_page.close()
    soup = BeautifulSoup(main_page_html, 'html.parser')
    # Build the links and keep the corresponding HTML.
    mappa = soup.find(id='STAZIONI')
    links = [link['href'] for link in mappa.find_all('area')]
    # print(links)
    html_list = []
    session = HTMLSession()
    for link in links:
        link_result = url_base + link
        # Fetch the data page.
        result = session.get(link_result)
        html_list.append(result.html.html)
    session.close()
    # Parse and write to file.
    final_parsing(html_list, anno, parametro, 'tutte', filename)
def alibaba(request):
    driver_set()
    file = finders.find('Fake/log/alibaba.csv')
    csv_file = open(file, 'w')
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['Product', 'Link'])
    session = HTMLSession()
    r = session.get(
        'https://www.alibaba.com/trade/search?fsb=y&IndexArea=product_en&CatId=&SearchText=clocky+alarm+clock+on+wheels'
    )
    r.html.render(timeout=20)
    damn = ["$39.99-$45.99", "$39.99", "$45.99"]
    article = r.html.find('div.organic-gallery-offer-inner')
    for i in article:
        haha = i.find('a.elements-title-normal', first=True)
        title = haha.attrs['title']
        link = haha.attrs['href']
        price = i.find('p.elements-offer-price-normal', first=True)
        price = price.attrs['title']
        if price not in damn:
            csv_writer.writerow([title, link])
    csv_file.close()
    session.close()
    return render(request, 'Fake/alibaba.html')
def list_events():
    """Get the latest events for Summoners War."""
    s = HTMLSession()
    r = s.get(
        'https://forum.com2us.com/forum/main-forum/summoner-s-war/events-ab')
    s.close()
    return "\n".join([x.text for x in r.html.find("a.topic-title")[:5]])
def get_page(url, recursion=0, retry_after=5):
    if recursion > 2:
        raise GetPageException("Get page method failed too many times.")
    try:
        # Create a new session
        session = HTMLSession()
        # Load the page
        page = session.get(url).html
        session.close()
        return page
    except MaxRetries:
        sleep(retry_after)
        return get_page(url, recursion + 1)
    except ConnectionError:
        sleep(retry_after)
        return get_page(url, recursion + 1)
    except Exception as err:
        # Skip this page and log the error so the run can continue.
        with open("simple_get_page_log.txt", "a", encoding="utf-8") as lf:
            print("[%s] Error message: %s" % (dt.now().strftime("%Y-%m-%d, %H:%M:%S"), err),
                  end="\n%s\n" % ("-" * 20), file=lf)
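# Hedged usage sketch (not part of the original source): shows how get_page's return
# value and GetPageException might be consumed. The URL and the selector are made-up
# placeholders; GetPageException is assumed to be defined elsewhere in this module.
def _example_get_page_usage():
    try:
        page = get_page("https://example.com/listing")  # hypothetical URL
    except GetPageException as exc:
        print("giving up:", exc)
        return
    if page is not None:
        # `page` is a requests_html.HTML object, so find() works on it directly.
        title = page.find("title", first=True)
        print(title.text if title else "no <title> found")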
def parsing_tradingview(parsing_asset, parsing_exchange):
    url = "https://www.tradingview.com/symbols/" + parsing_exchange + '-' + parsing_asset
    print(url)
    session = HTMLSession()
    r = session.get(url)
    # Render the page so the JavaScript-driven quote is present in the DOM.
    r.html.render(timeout=30)
    selector = 'span.tv-symbol-header-quote__value.tv-symbol-header-quote__value--large.js-symbol-last'
    price = r.html.find(selector)[0].text
    r.close()
    session.close()
    try:
        price = float(price)
    except ValueError:
        price = 0
    return price
def get_dept_in_csv(dept, wordlist, csv_filename, transiscope, trans_wordlist):
    twitter_api = twt.get_twitter_api(twitter_auth)
    session_list = HTMLSession()
    liste_communes = scrp.get_dept(session_list, dept)
    session_list.close()
    for commune in liste_communes:
        get_commune_csv(csv_filename, commune, dept, wordlist, trans_wordlist,
                        transiscope, twitter_api)
def scraping_for_amazon_asins(asins: list) -> pd.DataFrame:
    columns = ["timestamp", "asin", "price", "title"]
    scraped_articles = pd.DataFrame(columns=columns)
    s = HTMLSession()
    for asin in asins:
        url = "https://www.amazon.de/dp/" + asin
        print(url)
        r = s.get(url=url)
        r.html.render(sleep=2)
        price = r.html.find("#priceblock_ourprice")[0].text
        print(price)
        title = r.html.find("#productTitle")[0].text
        print(title)
        # create a new row and append it to the dataframe
        # (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
        new_row = {
            "timestamp": str(datetime.now()),
            "asin": asin,
            "price": price,
            "title": title,
        }
        scraped_articles = pd.concat(
            [scraped_articles, pd.DataFrame([new_row])], ignore_index=True)
    s.close()
    return scraped_articles
def _login(self):
    # auth = HTTPBasicAuth(email, password)
    # s = requests.Session()
    # r = s.options(login_url, data=payload, headers=headers)
    from requests_html import HTMLSession

    login_url = r'https://my.rightmove.co.uk/login'
    page_url = r'https://www.rightmove.co.uk/user/shortlist.html'
    email = '*****@*****.**'
    password = '******'
    data = {'email': email, 'password': password, 'keepMeLoggedIn': 'true'}
    headers = {
        'Content-type': 'application/json',
        'Accept': 'application/json'
    }
    session = HTMLSession()
    s = session.options(url=login_url, data=data, headers=headers)
    r = session.get(url=page_url)
    r.html.render()
    tags = r.html.find('div')
    for t in tags:
        print(t)
        print()
    session.close()
def dataExtractor(url):
    keys = [
        'Membership Level', 'Organization', 'First Name', 'Last Name', 'Address',
        'City', 'Province', 'Country', 'Phone', 'Website', 'Type Of Service',
        'Postal Code', 'Description', 'Data_url', 'Email'
    ]
    dictionary = dict.fromkeys(keys)
    session = HTMLSession()
    source = session.get(url)
    source.html.render(sleep=1, timeout=20)
    div_block = source.html.xpath('//*[@class="fieldContainer simpleTextContainer"]')
    # elements = data_block.html.xpath('//*[@class="fieldSubContainer labeledTextContainer"]')
    for div in div_block:
        split_data = div.text.splitlines()
        label = split_data[0].title()
        data = ' '.join([str(elem) for elem in split_data[1:]])
        # print('LABEL:{}'.format(label))
        # print('DATA:{}'.format(data))
        dictionary[label] = data
    dictionary['Data_url'] = url
    session.close()
    return dictionary
def get_top_crypto():
    '''Gets the top 100 Cryptocurrencies by Market Cap'''
    session = HTMLSession()
    resp = session.get(
        "https://finance.yahoo.com/cryptocurrencies?offset=0&count=100")
    tables = pd.read_html(resp.html.raw_html)
    df = tables[0].copy()
    df["% Change"] = df["% Change"].map(
        lambda x: float(x.strip("%").strip("+").replace(",", "")))
    del df["52 Week Range"]
    del df["1 Day Chart"]
    fields_to_change = [x for x in df.columns.tolist() if "Volume" in x
                        or x == "Market Cap" or x == "Circulating Supply"]
    for field in fields_to_change:
        if type(df[field][0]) == str:
            # Values ending in "B" are scaled to billions, values ending in "M" to millions.
            df[field] = df[field].str.strip("B").map(force_float)
            df[field] = df[field].map(
                lambda x: x if type(x) == str else x * 1000000000)
            df[field] = df[field].map(
                lambda x: x if type(x) == float else force_float(x.strip("M")) * 1000000)
    session.close()
    return df
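# Hedged usage sketch (not part of the original source): one way to consume the
# DataFrame returned by get_top_crypto. The "Symbol", "Name" and "Market Cap" column
# names are assumptions about the Yahoo Finance table layout.
def _example_top_crypto_usage():
    df = get_top_crypto()
    top5 = df.sort_values("Market Cap", ascending=False).head(5)
    print(top5[["Symbol", "Name", "Market Cap"]])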
def _raw_get_daily_info(site):
    session = HTMLSession()
    resp = session.get(site)
    tables = pd.read_html(resp.html.raw_html)
    df = tables[0].copy()
    df.columns = tables[0].columns
    del df["52 Week Range"]
    df["% Change"] = df["% Change"].map(
        lambda x: float(x.strip("%+").replace(",", "")))
    fields_to_change = [x for x in df.columns.tolist() if "Vol" in x
                        or x == "Market Cap"]
    for field in fields_to_change:
        if type(df[field][0]) == str:
            df[field] = df[field].str.strip("B").map(force_float)
            df[field] = df[field].map(
                lambda x: x if type(x) == str else x * 1000000000)
            df[field] = df[field].map(
                lambda x: x if type(x) == float else force_float(x.strip("M")) * 1000000)
    session.close()
    return df
def pageChangeCheck(df, skipUrl=0):
    """Has the page changed? Returns a tuple of Boolean and r.html object"""
    fetchUrl = f"{baseUrl}{paramUrl}{skipUrl}"
    # skipUrl += 10
    session = HTMLSession()
    r = session.get(fetchUrl)
    logStr = f"Fetch: {r} - {fetchUrl}"
    logger.info(logStr)
    r.html.render(keep_page=True)
    logStr = f"Render: {r} - {fetchUrl}"
    logger.info(logStr)
    # raise Exception("Halt and Catch Fire")
    session.close()
    # Unused: reads the headline number of readings stored by the site.
    # obj = r.html.find('span.h1.reforma-medium.xs-me-10.dark-blue-txt.ng-binding')
    # countOfReadings = obj[0].text
    obj2 = r.html.find("bdi")
    source = obj2[0].html
    soup = BeautifulSoup(source, "lxml")
    first_date = soup.find("bdi")
    dDate = datetime.strptime(first_date.next, webDateFormat)
    filt = df.index == dDate
    changed = len(df[filt]) == 0
    return changed, r
def getProvinciaResult(parametro, provincia, anno):
    regioniDict = getDictProvince(parametro, provincia, anno)
    stazioni_list = []
    filename = parametro + provincia + anno + '.csv'
    filename = os.path.join(path_result, filename)
    if provincia == 'tutte':
        provincia = ""
        valuesList = regioniDict.values()
        for codice_list in valuesList:
            for codice in codice_list:
                stazioni_list.append(codice)
    else:
        stazioni_list = regioniDict[provincia]
    html_list = []
    # Build the links and keep the corresponding HTML.
    session = HTMLSession()
    for e in stazioni_list:
        link = anno + '/' + e + '_' + anno + '_' + parametro + '.htm'
        link_result = url_base + link
        # Fetch the data page.
        result = session.get(link_result)
        html_list.append(result.html.html)
    session.close()
    # Parse and write to file.
    final_parsing(html_list, anno, parametro, provincia, filename)
def get_all_images(url):
    """Returns all image URLs found on a single `url`."""
    # initialize the session
    session = HTMLSession()
    # make the HTTP request and retrieve the response
    response = session.get(url)
    # execute JavaScript
    response.html.render()
    # construct the soup parser
    soup = bs(response.html.html, "html.parser")
    urls = []
    for img in tqdm(soup.find_all("img"), "Extracting images"):
        img_url = img.attrs.get("src") or img.attrs.get("data-src")
        if not img_url:
            # if the img has no src attribute, just skip it
            continue
        # make the URL absolute by joining the domain with the extracted URL
        img_url = urljoin(url, img_url)
        # remove query strings like '/hsts-pixel.gif?c=3.2.5'
        try:
            pos = img_url.index("?")
            img_url = img_url[:pos]
        except ValueError:
            pass
        # finally, keep the URL only if it is valid
        if is_valid(img_url):
            urls.append(img_url)
    # close the session to end the browser process
    session.close()
    return urls
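# Hedged usage sketch (not part of the original source): collect the image URLs from a
# page and write them to a text file. The page URL and output filename are hypothetical
# placeholders.
def _example_get_all_images_usage():
    image_urls = get_all_images("https://example.com/gallery")
    with open("image_urls.txt", "w", encoding="utf-8") as fh:
        for image_url in image_urls:
            fh.write(image_url + "\n")
    print(f"saved {len(image_urls)} image URLs")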
def run():
    global pages
    try:
        while True:
            # webUrl = urllib.request.urlopen('https://keys.lol/ethereum/1')
            pageNum = random.randrange(
                1, 904625697166532776746648320380374280100293470930272690489102837043110636675)
            fullurl = 'https://keys.lol/ethereum/' + str(pageNum)
            session = HTMLSession()
            webUrl = session.get(fullurl)
            # time.sleep(10)
            webUrl.html.render()
            # html = webUrl.content
            data = webUrl.html.html
            result = re.findall(r"[+-]?\d+\.\d+", str(data))
            for i in result:
                if str(i) + " eth" in str(data) and float(i) > 0:
                    with open('ValidWalletsETH.txt', 'a') as appendFile:
                        appendFile.write('{} eth\n'.format(str(i)))
                        appendFile.write('{}\n'.format(fullurl))
            pages = pages + 1
            sys.stdout.write("\rPages read: {}".format(str(pages)))
            sys.stdout.flush()
            session.close()
            webUrl.close()
    except Exception:
        run()
def mangalist():
    options = {'type': 'latest', 'category': 'all', 'state': 'all'}
    url = base + '/manga_list'
    session = HTMLSession()
    data = session.get(url, params=options)
    sel = ('body > div.container > div.main-wrapper > div.leftCol.listCol > div > '
           'div.panel_page_number > div.group_page > a.page_blue.page_last')
    list_range = data.html.find(sel, first=True).search('Last({})')[0]
    manga_data = {}
    for item in range(2):
        try:
            data = session.get(url, params=options)
            data.html.render()
        except Exception as e:
            print(e)
            print('Going to sleep')
            time.sleep(5)
        sel = 'body > div.container > div.main-wrapper > div.leftCol.listCol > div'
        manga_list = data.html.find(sel, first=True)
        for manga in manga_list.find('div.list-truyen-item-wrap'):
            detail = manga.find('a', first=True)
            manga_name = detail.attrs['title']
            print('Finding the details of ' + manga_name)
            manga_data[manga_name] = {}
            manga_data['link'] = detail.attrs['href']
            manga_data['icon'] = detail.find('img', first=True).attrs['src']
            scraperwiki.sql.save(manga_data, table_name='manga_data')
    # Close the session only after all pages have been fetched.
    session.close()
def _download_house(url_house):
    session = HTMLSession()
    response = session.get(URL_BASE + url_house)
    response.html.render()
    response = BeautifulSoup(response.text, features="html.parser")
    session.close()
    return response
def _parse_table(url):
    session = HTMLSession()
    r = session.get(url)
    rows = r.html.find("div[data-test='fin-row']")
    info = [row.text.split("\n") for row in rows]
    clean = [[inner.replace(",", "") for inner in outer] for outer in info]
    indexes = [[ix for ix, elt in enumerate(row) if re.search("[a-z]", elt)]
               for row in clean]
    fixed = []
    for ix_list, nums in zip(indexes, clean):
        if len(ix_list) == 1:
            fixed.append(nums)
        else:
            actual_ix = ix_list[1:]
            to_add = [nums[actual_ix[i]:actual_ix[i + 1]]
                      for i in range(len(actual_ix) - 1)]
            # for ix in range(len(to_add)):
            #     to_add[ix][0] = nums[0] + "-" + to_add[ix][0]
            fixed.extend(to_add)
    table = pd.DataFrame(fixed).drop_duplicates().reset_index(drop=True)
    headers = [span.text for span in r.html.find("div[class='D(tbhg)'] span")]
    table.columns = headers
    session.close()
    return table
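# Hedged usage sketch (not part of the original source): _parse_table targets a Yahoo
# Finance financials page; the AAPL URL below is an assumed example of the kind of page
# its selectors expect.
def _example_parse_table_usage():
    table = _parse_table("https://finance.yahoo.com/quote/AAPL/financials")
    print(table.head())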
def get_imicrobe_acc_metadata(pacc):
    """
    Get the iMicrobe sample accession numbers belonging to a particular project.

    Takes a project accession number `pacc` and returns a list of iMicrobe
    sample accession numbers.
    """
    # Check accession format
    pacc = pacc.lower()
    if pacc.startswith("p"):
        pacc = pacc[1:]
    elif pacc.startswith("s"):
        return [pacc]
    else:
        raise Exception("iMicrobe accession numbers should be prefixed with 'p' (project) or 's' (sample)")
    # Grab sample info
    session = HTMLSession()
    r = session.get('https://www.imicrobe.us/#/projects/' + pacc)
    r.html.render(sleep=1)
    sample_list = []
    for l in r.html.element("a"):
        i = l.items()
        try:
            if i[0][1].startswith("#/samples/"):
                sample_list.append(i[0][1][10:])  # add the sample ID only
        except IndexError:
            continue
    session.close()
    # Format and return sample accession numbers
    return ["s" + sID for sID in sample_list]
class Quoter:
    def __init__(self):
        self.start_session()
        self.loggedin = False

    def start_session(self):
        self.session = HTMLSession()
        self.active = True

    def close_session(self):
        self.session.close()
        self.active = False

    async def part(self, pn: str):
        info = quoter_queue.enqueue(atparts.get_part, self.session, pn,
                                    self.loggedin, True)
        job = Job.fetch(info.id, connection=redis_conn)
        job_status = job.get_status()
        # Poll the queued job until it finishes or fails.
        while job_status != 'finished':
            if job_status == 'failed':
                return None
            job = Job.fetch(info.id, connection=redis_conn)
            job_status = job.get_status()
        info = job.result
        if info:
            info = info._asdict()
            info['vendor'] = 'Air Tractor'
        else:
            info = {'pn': pn, 'desc': 'Not Found'}
        return info
def test_browser_session():
    """
    Test that a browser instance is created and properly closed when the session is closed.

    Note: the session.close method needs to be tested together with browser creation,
    since not doing so would leave the browser running.
    """
    session = HTMLSession()
    assert isinstance(session.browser, Browser)
    assert hasattr(session, "loop") == True
    session.close()
def scrap_calendar_page(self):
    print('Getting calendar page')
    session = HTMLSession()
    r = session.get('http://titania.saeima.lv/LIVS13/SaeimaLIVS2_DK.nsf/DK?ReadForm&calendar=1')
    print('Rendering calendar page')
    r.html.render(timeout=80000)  # this call executes the js in the page
    self.scrapped_calendar_page = r
    session.close()
def get_img_link(link):
    session = HTMLSession()
    res = session.get(link)
    res.html.render(timeout=30 * 100, keep_page=True)
    images = res.html.find("img.fotorama__img")
    image_link = images[0].attrs['src']
    session.close()
    return image_link