def download_linke(coords, proxy, port, saveFile, saveMode):
    print(proxy, port)
    print(proxy != "")
    url = ("http://www.soda-is.com/eng/services/service_invoke/gui.php?"
           + "xml_descript=soda_tl.xml&Submit2=Month")
    session = Session()
    session.verify = False
    if proxy != "":
        proxies = {proxy: port}
        session.proxies = proxies
    br = RoboBrowser(session=session, parser="lxml")
    br.open(url)
    linke_form = br.get_forms()[1]
    num = len(coords)
    index = 0
    with open(saveFile, saveMode) as f:
        try:
            for coord in coords:
                inlon, inlat = coord
                linke_form["lat"].value = inlat
                linke_form["lon"].value = inlon
                sf = linke_form.submit_fields.getlist("execute")
                br.submit_form(linke_form, submit=sf[0])
                linke_table = br.find("table", {"cellspacing": "0", "cellpadding": "2"})
                linkes = get_monthly_linke_str(get_linke_values(linke_table))
                s = "%s,%s,%s\n" % (format(inlon, "0.5f"), format(inlat, "0.5f"), linkes)
                if len(s) > 48:
                    f.write(s)
                print("Done with point %i of %i: (%s, %s)" % (
                    index + 1, num, format(inlon, "0.5f"), format(inlat, "0.5f")))
                index += 1
                br.back()
            print("DONE!")
        except Exception as e:
            not_dl = list(coords[index:])
            with open(saveFile + "_notdownloaded.txt", "w") as nd:
                for c in not_dl:
                    nd.write("%s,%s\n" % (str(c[0]), str(c[1])))
            print(e)
def get_video_url(url):
    br = RoboBrowser(history=True, parser='lxml')
    br.open(url)
    cn = input('Convert the page to Simplified Chinese? (y/n) ')
    if not cn:
        cn = 'y'
    if cn == 'y':
        # shift to simplified chinese
        lang = br.get_forms()[0]
        lang['session_language'].options = ['cn_CN']
        lang['session_language'].value = 'cn_CN'
        br.submit_form(lang)
    # get video title
    vid_title = br.find('div', {'id': 'viewvideo-title'}).text.strip()
    print('the video you want to download is: {0}'.format(vid_title))
    print('-----------------------------------------------------------')
    # get video id
    vid_id = re.findall(
        r'\d{6}',
        br.find('a', {'href': '#featureVideo'}).attrs['onclick'])[0]
    # get real video link
    vid_real_url = 'http://192.240.120.34//mp43/{}.mp4'.format(vid_id)
    return vid_real_url, re.sub(
        """[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。|?、~@#¥%……&*():]+""",
        " ", vid_title).strip()
def test_calc_interface(self):
    operation = "5,+,2"
    expected_result = 7
    # Add some result to DB
    requests.post('/'.join((TEST_URL, 'calc')), data={'operation': '998,-,888'})
    # Init object
    browser = RoboBrowser(history=True, parser='html.parser')
    browser.open(TEST_URL)
    # Fill calc form
    calc_form = browser.get_form(action='/calc')
    calc_form['operation'] = operation
    browser.submit_form(calc_form)
    # Get result
    result_raw = browser.find(id="result").text
    self.assertEqual(int(result_raw), expected_result)
    # Check result link
    browser.follow_link(browser.find(id='result_link'))
    self.assertEqual((operation, expected_result),
                     (browser.find(id="operation").text,
                      int(browser.find(id="result").text)))
def main():
    url = 'http://www.porncomix.info/milky-milk-2-dragon-ball-z-english/'
    browser = RoboBrowser(history=True, parser='html.parser',
                          user_agent='Chrome/41.0.2228.0')
    browser.open(url)
    wrapper = browser.find('div', {'id': 'gallery-1'})
    imgs = wrapper.find_all('a', href=True)
    img_list = []
    for line in imgs:
        img_list.append(line['href'])
    name = 1
    for line in img_list:
        browser.open(line)
        wrapper_div = browser.find('div', {'class': 'attachment-image'})
        my_img = wrapper_div.find('img', src=True)
        img_data = requests.get(my_img['src']).content
        with open(str(name) + '.jpg', 'wb') as handler:
            handler.write(img_data)
        name += 1
        with open('Walao.txt', 'a') as f:
            f.write(my_img['src'] + '\n')
        print(my_img['src'])
def parse_answer_page(page_url):
    ans_browser = RoboBrowser(history=True, user_agent='nemo1')
    ans_browser.open(page_url)
    title = ans_browser.find(class_="zm-item-title").a
    # pdb.set_trace()
    title_text = title.get_text()
    print(title_text.encode('utf-8'))
    title_id = title["href"].split("/")[-1]
    directory = "/Users/nemo/Pictures/zhihu/" + title_text
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(directory + "/url_record.txt", "a") as output:
        output.write(page_url + "\n")
    content_div = ans_browser.find("div", class_="zm-editable-content clearfix")
    count = 0
    for img_tag in content_div.find_all("img"):
        count = count + 1
        try:
            img_src_url = img_tag["data-original"]
            print(count, img_src_url)
        except:
            pdb.set_trace()
            print("No data original " + str(img_tag))
            continue
class Answer(object):
    """ Zhihu parser, answer obj"""

    def __init__(self, page_url):
        self.url = page_url
        self.ans_browser = RoboBrowser(history=True, user_agent='nemo1')
        self.ans_browser.open(self.url)

    def get_related_question_url(self):
        h2_tag = self.ans_browser.find("h2", class_="zm-item-title")
        return "https://www.zhihu.com" + h2_tag.a["href"]

    def get_related_question_title(self):
        h2_tag = self.ans_browser.find("h2", class_="zm-item-title")
        return h2_tag.a.get_text()

    def get_img_url_list(self):
        content_div = self.ans_browser.find("div", class_="zm-editable-content clearfix")
        results = []
        for img_tag in content_div.find_all("img"):
            # check the tag's attributes, not its children
            if img_tag.has_attr("data-original"):
                results.append(img_tag["data-original"])
            elif img_tag.has_attr("src"):
                results.append(img_tag["src"])
        return results

    def get_thumbs_up_count(self):
        div = self.ans_browser.find("div", class_="zm-item-vote-info")
        if div:
            return int(div["data-votecount"])
def run(wait):
    """Starts the scraping process.
    Opens a teamstats page and gathers all the form inputs,
    then sends these inputs to parseSeason, which opens a new page for every possible option in the form.
    If you get an error at the start, with role.find_all, just try again; nfl.com returns weird pages sometimes.
    """
    logger = makeLogger('main', r'./logs_nflteamStat/')

    startTime = datetime.now()
    logger.debug('start time: ' + str(startTime))
    logger.debug('waiting %d seconds', wait)
    time.sleep(wait)

    pool = Pool(processes=int(get_proxy_count() / 2.5))

    # html5lib parser required for broken html on gameSplits
    browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    startingUrl = "http://www.nfl.com/stats/categorystats?tabSeq=2&offensiveStatisticCategory=GAME_STATS&conference=ALL&role=TM&season=2015&seasonType=REG"
    browser = open_or_follow_link(logger, browser, 'open', startingUrl)

    role = browser.find(id="role")
    roles = role.find_all("option")
    offensiveCategory = browser.find(id="offensive-category")
    offensiveCategories = offensiveCategory.find_all("option")
    defensiveCategory = browser.find(id="defensive-category")
    defensiveCategories = defensiveCategory.find_all("option")
    season = browser.find(id="season-dropdown")
    seasons = season.find_all("option")
    seasonType = browser.find(id="season-type")
    seasonTypes = seasonType.find_all("option")

    for role in roles:
        availableCategories = None
        if role.text == "Offense":
            availableCategories = offensiveCategories
        elif role.text == "Defense":
            availableCategories = defensiveCategories
        else:
            print("unknown role")
        for category in availableCategories:
            if category.text == "Category...":
                continue
            for season in seasons:
                if season.text == "Season..." or convertToNumber(removeNewLine(season.text)) < 1960:
                    continue
                #parseSeason(role, category, season, seasonTypes)
                pool.apply_async(parseSeason, (role, category, season, seasonTypes,))

    pool.close()  # Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join()  # Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug('run time: ' + str(datetime.now() - startTime))
    closeLogger('main')
def build_cache(): """ Get current data from the website http://www.lfd.uci.edu/~gohlke/pythonlibs/ Returns ------- Dictionary containing package details """ data = {} soup = RoboBrowser() soup.open(MAIN_URL) # We mock out a little javascript environment within which to run Gohlke's obfuscation code context = js2py.EvalJs() context.execute(""" top = {location: {href: ''}}; location = {href: ''}; function setTimeout(f, t) { f(); }; """) # We grab Gohlke's code and evaluate it within py2js context.execute(soup.find("script").text) links = soup.find(class_="pylibs").find_all("a") for link in links: if link.get("onclick") is not None: # Evaluate the obfuscation javascript, store the result (squirreled away within location.href) into url context.execute(link.get("onclick").split("javascript:")[-1]) url = MAIN_URL + context.location.href # Details = [package, version, pyversion, --, arch] details = url.split("/")[-1].split("-") pkg = details[0].lower().replace("_", "-") # Not using EXEs and ZIPs if len(details) != 5: continue # arch = win32 / win_amd64 / any arch = details[4] arch = arch.split(".")[0] # ver = cpXX / pyX / pyXXx pkg_ver = details[1] py_ver = details[2] py_ver_key = py_ver + "-" + arch if pkg in data.keys(): if py_ver_key in data[pkg].keys(): data[pkg][py_ver_key].update({pkg_ver: url}) else: data[pkg][py_ver_key] = {pkg_ver: url} else: data[pkg] = {py_ver_key: {pkg_ver: url}} return data
class Question(object):
    """ Zhihu parser, question obj"""

    def __init__(self, page_url):
        self.url = page_url
        self.browser = RoboBrowser(history=True, user_agent='nemo1')
        self.browser.open(self.url)

    def get_answer_count(self):
        if self.browser.find("h3", id="zh-question-answer-num") is not None:
            return int(self.browser.find("h3", id="zh-question-answer-num")["data-num"])

    def get_all_answer_url_list(self):
        results = []
        if self.get_answer_count() <= 10:
            for answer_div in self.browser.find_all("div", class_="zm-item-answer zm-item-expanded"):
                results.append(URL_PREFIX + answer_div.find("link")["href"])
        else:
            # integer division so range() gets an int
            for i in range(0, (self.get_answer_count() // 10) + 1):
                offset = i * 10
                if i == 0:
                    for answer_div in self.browser.find_all("div", class_="zm-item-answer zm-item-expanded"):
                        results.append(URL_PREFIX + answer_div.find("link")["href"])
                    # print(results)
                else:
                    post_url = "http://www.zhihu.com/node/QuestionAnswerListV2"
                    _xsrf = self.browser.find("input", attrs={'name': '_xsrf'})["value"]
                    params = json.dumps({
                        "url_token": int(self.url[-8:-1] + self.url[-1]),
                        "pagesize": 10,
                        "offset": offset
                    })
                    data = {'_xsrf': _xsrf, 'method': "next", 'params': params}
                    header = {
                        'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
                        'Host': "www.zhihu.com",
                        'Referer': self.url
                    }
                    r = requests.post(post_url, data=data, headers=header, verify=False)
                    answers = r.json()["msg"]
                    # print(len(answers))
                    # pdb.set_trace()
                    for ans in answers:
                        soup = BeautifulSoup(ans, 'html.parser')
                        results.append(URL_PREFIX + soup.find("link")["href"])
        return results
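# Usage sketch (an assumption, not part of the original source): the Question and Answer
# classes above are presumably driven like this; the question URL below is hypothetical.
if __name__ == "__main__":
    question = Question("https://www.zhihu.com/question/12345678")  # hypothetical URL
    print(question.get_answer_count())
    for answer_url in question.get_all_answer_url_list():
        answer = Answer(answer_url)
        print(answer.get_related_question_title(), answer.get_thumbs_up_count())
        print(answer.get_img_url_list())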
def scrape(q):
    query = q
    ph = re.compile('(\(\d{3}\)\ \d{3}-\d{4})')
    ad = re.compile('[A-Z]{2}\ (\d{5})')
    site = re.compile('(?<=\?q=).*(?=&sa)')
    result = {
        'name': '!NO DATA!',
        'address': '!NO DATA!',
        'phone': '!NO DATA!',
        'website': '!NO DATA!',
        'blurb': '!NO DATA!'
    }
    # uses RoboBrowser to submit a google search
    browser = RoboBrowser(user_agent='Firefox', parser='html.parser')
    browser.open('http://google.com/')
    # Search for Porcupine Tree
    form = browser.get_form(action='/search')
    form  # <RoboForm q=>
    form['q'].value = query
    browser.submit_form(form, form.submit_fields['btnG'])
    result['query'] = query
    if browser.find("div", {"class": "_B5d"}):
        result['name'] = browser.find("div", {"class": "_B5d"}).text.encode('utf-8')
        stuff = browser.find("div", {"class": "_uXc"})
        address = stuff.find(text=ad)
        if address:
            result['address'] = address.encode('utf-8')
        phone = stuff.find(text=ph)
        if phone:
            result['phone'] = phone.encode('utf-8')
        blurb = stuff.find("span")
        if blurb:
            result['blurb'] = blurb.text.encode('utf-8')
        website = stuff.find("a", string="Website")
        if website:
            website = website.get('href').encode('utf-8')
            result['website'] = site.search(website).group()
    print(result)
    delay = random.randint(5, 10)
    print("Waiting " + str(delay) + " seconds...")
    time.sleep(delay)
    return result
def ExtractAllComicImages(url):
    with open('HenRUniqueComic.txt', 'w') as f:
        print(f)
    browser = RoboBrowser(history=True, parser='html.parser', user_agent='Chrome/41.0.2228.0')
    browser.open(url)
    read_button = browser.find('div', {'class': 'read-now'})
    link = read_button.find('a', href=True)
    ComicFirstPage = link['href']
    browser.open(ComicFirstPage)
    Select_element = browser.find('select', {'class': 'cbo_wpm_pag'})
    options = Select_element.find_all('option')
    page_numbers = options[-1].text
    All_Pages = CraftAllComicPages(url, page_numbers)
    '''figure out how many segments we need to split all the pages,
    35 pages is one block download'''
    print(len(All_Pages))
    Segments = int(math.ceil(len(All_Pages) / 35))
    print(Segments)
    '''split the list'''
    Segment_list_container = GeneralSplitList(All_Pages, Segments)
    for unique_segment in Segment_list_container:
        jobs = []
        for page in unique_segment:
            print('Starting job for page ' + str(page))
            p = multiprocessing.Process(target=ExtractONEPAGE, args=(page,))
            jobs.append(p)
            p.start()
        for proc in jobs:
            proc.join()
        time.sleep(2)
    print('Finished all jobs. Now returning them all as unsorted list..')
    with open('HenRUniqueComic.txt', 'r') as f:
        data = f.read().splitlines()
    print('Now sorting them and returning that..')
    sorted_result = SortHenRUniqueComic(data)
    return sorted_result
class BKBrowser(object):
    def __init__(self):
        # Browse url :
        self.result = None
        self.browser = RoboBrowser(parser="html.parser")
        self.browser.session.headers = config.headers
        # Mount with custom SSL Adapter
        self.browser.session.mount('https://', HTTPSAdapter())

    def _connect(self):
        # Get to website
        print("- Connecting to url ...")
        self.browser.open(config.url)

    def _skip_first_page(self):
        button = self.browser.get_forms()[0]
        self.browser.submit_form(button)

    # Let's fill in the proper form !
    def _fill_form(self):
        while not self.browser.find('p', {'class': 'ValCode'}):
            inputs_map = max_radio_map(self.browser)
            f = self.browser.get_forms()[0]
            for i in f.keys():
                if f[i].value == '':
                    answers_list = inputs_map.get(i, ['1'])
                    f[i].value = random.choice(answers_list)
            f.serialize()
            self.browser.submit_form(f)

    def _fill_date_form(self):
        # Fill in Date/Time form and start the Questionnaire
        print("- Filling Forms Randomly ...")
        form = self.browser.get_forms()[0]
        form['JavaScriptEnabled'].value = '1'
        form['SurveyCode'].value = config.ID
        form['InputMonth'].value = config.date[0]
        form['InputDay'].value = config.date[1]
        form['InputHour'].value = config.time[0]
        form['InputMinute'].value = config.time[1]
        form.serialize()
        self.browser.submit_form(form)

    def get_validation_code(self):
        self._connect()
        self._skip_first_page()
        self._fill_date_form()
        self._fill_form()
        self.result = self.browser.find('p', {'class': 'ValCode'}).text
        return self.result

    def return_result(self):
        return self.result
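# Usage sketch (an assumption, not in the original source): get_validation_code() runs
# the whole flow (connect, skip intro page, fill date form, answer randomly), so a
# caller presumably only needs something like this.
if __name__ == "__main__":
    bk = BKBrowser()
    print("Validation code:", bk.get_validation_code())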
def gettab(keyword):
    browser = RoboBrowser(history=True, parser='html5lib')
    browser.open('https://www.tabs4acoustic.com/')
    form = browser.get_form(action=re.compile('recherche'))
    form['FindMe'].value = keyword
    browser.submit_form(form)
    div_resultat = browser.find('div', id='page_content')
    browser.follow_link(div_resultat.find('a'))
    tab = browser.find('div', id='tab_zone')
    return tab.find('pre').text
class Downloader():
    def __init__(self, proxy=None, worker_num=0):
        self.worker_num = worker_num
        session = Session()
        if proxy is not None:
            session.proxies = {'http': proxy, 'https': proxy}
        self.browser = RoboBrowser(history=True, parser='html.parser', session=session)

    def get_download_link(self, book_url):
        self.browser.open(book_url)
        for link in self.browser.find_all("a"):
            if "download.php?t=1" in str(link):
                return f"https://www.lectulandia.cc{link['href']}"

    def download_book(self, download_url):
        self.browser.open(download_url)
        pattern = re.compile("var linkCode = \"(.*?)\";")
        section = pattern.findall(str(self.browser.parsed))
        bee_url = f'https://www.beeupload.net/file/{section[0]}'
        self.browser.open(bee_url)
        try:
            filename = self.browser.find(
                "div", id="fileDescription").find_all("p")[1].text.replace("Name: ", "")
            size = self.browser.find(
                "div", id="fileDescription").find_all("p")[2].text
            file_url = self.browser.find("a", id="downloadB")
            time.sleep(2)
            self.browser.follow_link(file_url)
            with open(f"books/{filename}", "wb") as epub_file:
                epub_file.write(self.browser.response.content)
            return filename, size
        except:
            print(self.browser.parsed)

    def get_book_page_list(self, page):
        self.browser.open(f'https://www.lectulandia.cc/book/page/{page}/')
        return [
            f"https://www.lectulandia.cc{book['href']}"
            for book in self.browser.find_all("a", class_="card-click-target")
        ]

    def download_full_page(self, page):
        print(f"Downloading page: {page} ")
        books = self.get_book_page_list(page)
        for book in books:
            time.sleep(2)
            download_url = self.get_download_link(book)
            print(f"Worker: {self.worker_num} on page: {page}", self.download_book(download_url))
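# Usage sketch (an assumption, not in the original source): each Downloader instance
# works through one catalogue page; the proxy and page number here are placeholders.
if __name__ == "__main__":
    downloader = Downloader(proxy=None, worker_num=0)
    downloader.download_full_page(1)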
def get_source_code(commitId, project):
    import random
    import requests
    from robobrowser import RoboBrowser

    HEADERS_LIST = [
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
        'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
        'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
        'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre'
    ]
    link = []
    session = requests.Session()
    browser = RoboBrowser(session=session, user_agent=random.choice(HEADERS_LIST), parser="lxml")
    url = "https://github.com/" + project.replace("-", "/") + "/commit/" + commitId
    browser.open(url + "?diff=unified")
    results = browser.find_all("a")
    for item in results:
        if ".java" in str(item):
            second_url = ("https://raw.githubusercontent.com/" + project.replace("-", "/")
                          + "/" + commitId + "/" + item.string)
            browser.open(second_url)
            return browser.find().text
def get_cookies(self):
    """ opens a fake browser to get the cookies needed """
    from robobrowser import RoboBrowser
    browser = RoboBrowser(
        user_agent='Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1b3) Gecko/20090305 Firefox/3.1b3 GTB5',
        parser='html.parser')
    browser.open('https://battlemap.deltatgame.com/home#')
    link = browser.find('a')
    browser.follow_link(link)
    form = browser.get_form(0)
    with open('battlecreds.json') as credentialfile:
        credentials = json.load(credentialfile)
    form['Email'] = credentials['email']
    browser.submit_form(form)
    form = browser.get_form(0)
    form['Passwd'] = credentials['password']
    browser.submit_form(form)
    browser.open('https://battlemap.deltatgame.com/home')
    self.battlemap_token = browser.session.cookies.get('battlemap_session')
    self.xsrf = browser.session.cookies.get('XSRF-TOKEN')
    self.cookietimeout = time.time() + 60 * 60 * 1.95
    # GET csrf-token META HERE
    self.csrf = ''
    self.brow = browser
    from bs4 import BeautifulSoup
    # browser.parsed is a property, not a method
    soup = BeautifulSoup(str(browser.parsed), "html.parser")
    for tag in soup.find_all('meta'):
        if 'name' in tag.attrs and tag.attrs['name'] == 'csrf-token':
            self.csrf = tag.attrs['content']
def parseWeek(year, week):
    """
    parses a specific week on http://rotoguru1.com/cgi-bin/fyday.pl?week={}&year={}&game=fd&scsv=1
    which contains a csv of the fan duel player prices
    stores this info in fanduel_prices collection
    """
    logger = makeLogger(str(year) + '_' + str(week), r'./logs_RotoFDStats/')

    startTime = datetime.now()
    logger.debug('Starting %d', year)

    client = MongoClient('localhost', 27017)
    db = client['nfl_data']
    col_fanduel_prices = db['fanduel_prices']

    if col_fanduel_prices.find({'year': year, 'week': week}).count():
        logger.debug('Already parsed %d %d', year, week)
        closeLogger(logger)
        return None

    wait = random.uniform(1.5, 3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False, parser='html.parser', user_agent=get_user_agent(logger), timeout=10)
    url = "http://rotoguru1.com/cgi-bin/fyday.pl?week={}&year={}&game=fd&scsv=1".format(week, year)
    browser = open_or_follow_link(logger, browser, 'open', url)

    docs = []
    try:
        data = browser.find('pre').text
        lines = data.split('\n')
        header = lines[0]
        header = header.split(';')
        lines = lines[1:]
        for line in lines:
            doc = {}
            if not line:
                continue
            for index, each in enumerate(line.split(';')):
                doc[cleanKey(header[index])] = convertToNumber(each)
            docs.append(doc)
    except:
        logger.exception("Parse fail: %s", url)

    try:
        logger.debug('Bulk Creating docs')
        col_fanduel_prices.insert_many(docs)
    except:
        logger.exception('insert_many error')

    logger.debug('parseWeek time elapsed: ' + str(datetime.now() - startTime))
    closeLogger(str(year) + '_' + str(week))
def praca_shopping():
    from robobrowser import RoboBrowser
    browser = RoboBrowser(parser="html.parser")
    not_finded = 0
    n = 0
    names = set()
    while not_finded < 20:
        # print(f'Page {n}')
        finded = False
        url = f"http://www.pracauberabashopping.com.br/filtro_loja_tipo.asp?tipo=vlojas.asp?busca1={n}"
        browser.open(url)
        item = browser.find("strong")
        if item:
            name = item.text
            if name != "Busca sem resultado.":
                names.add(fixed(name))
                finded = True
        else:
            items = browser.find_all("a")
            if len(items) > 1:
                for item in items[1:]:
                    if item.text != "Resultado da Busca":
                        names.add(fixed(item.text))
                        finded = True
        if not finded:
            not_finded += 1
        n += 1
    return names
def Pururin():
    Front_Page_URLS = []
    Front_Page_Img = []
    base_URL = 'http://pururin.us'
    browser = RoboBrowser(history=True, parser='html.parser', user_agent='Chrome/41.0.2228.0')
    browser.open(base_URL)
    gallery = browser.find('ul', {'class': 'gallery-list'})
    all_links = gallery.find_all('a', href=True)
    all_image = gallery.find_all('img', src=True)
    count = 0
    while count < len(all_links):
        if 'gallery' in all_links[count]['href']:
            Front_Page_URLS.append(base_URL + all_links[count]['href'])
            Front_Page_Img.append(base_URL + all_image[count]['src'])
        count += 1
    return (Front_Page_URLS, Front_Page_Img)
def GameUpdatesForum():
    ''' This will return as a list the latest aries updates from the main forums '''
    MyUpdateList = []
    base_url = 'http://elluel.net/'
    browser = RoboBrowser(history=True, parser='html.parser', user_agent='Chrome/41.0.2228.0')
    browser.open(base_url)
    form = browser.get_form(id='navbar_loginform')
    form["vb_login_username"] = '******'
    form["vb_login_password"] = '******'
    browser.submit_form(form)
    browser.open('http://elluel.net/showthread.php?15877-AriesMS-Official-Update-Fix-Log')
    MyLinks = browser.find('div', {'class': 'spoiler'}).find_all('li')
    for line in MyLinks:
        MyUpdateList.append(line.text)
    return MyUpdateList
async def AlertWhenServerUp(MyVar2):
    ''' This will run as a background task and can be called multiple times by different people.
    It accepts the channel id and the user name who called it through on_message.
    It will crawl the aries homepage and check to see if it is online every minute.
    If it is, then the infinite loop will exit and a message will be sent to the user who
    called this, saying that the servers are up '''
    await client.wait_until_ready()
    while True:
        ''' This loop will keep scraping the aries front website to see if the server is online.
        If it is, then it will break the loop and alert the person who called this func'''
        url = 'http://aries.elluel.net/'
        browser = RoboBrowser(history=True, parser='html.parser', user_agent='Chrome/41.0.2228.0')
        browser.open(url)
        MyButton = browser.find('button')
        if 'ONLINE' in MyButton.text:
            break
        await asyncio.sleep(60)  # task runs every 60 seconds or the duration provided
    ''' Sending the alert message to the author since servers are up now'''
    await client.send_message(
        MyVar2,
        'The servers are up now. Hope I was useful to you. If you have any other useful alert ideas then DM them to me. Enjoy!'
    )
def main():
    # Browse to Rap Genius
    browser = RoboBrowser(history=True)
    browser = RoboBrowser(parser="html.parser")  # will get a warning if parser not declared
    browser.open('http://rapgenius.com/')

    # Search for Queen
    form = browser.get_form(action='/search')
    form  # <RoboForm q=>
    form['q'].value = 'queen'
    browser.submit_form(form)

    # Look up the first song
    songs = browser.select('.song_name')
    try:
        browser.follow_link(songs[0])
    except IndexError:
        print("Songs Index doesn't exist!")
        return
    lyrics = browser.select('.lyrics')
    try:
        lyrics[0].text  # \n[Intro]\nIs this the real life...
    except IndexError:
        print("Lyrics Index doesn't exist!")

    # Back to results page
    browser.back()

    # Look up my favorite song
    browser.follow_link('death on two legs')

    # Can also search HTML using regex patterns
    lyrics = browser.find(class_=re.compile(r'\blyrics\b'))
    print(lyrics.text)  # \n[Verse 1]\nYou suck my blood like a leech...
def FenHen(keyword):
    result_urls = []
    base_URL = 'http://fenhentai.blogspot.co.uk/search?q=' + keyword
    browser = RoboBrowser(history=True, parser='html.parser', user_agent='Chrome/41.0.2228.0')
    while True:
        browser.open(base_URL)
        post_body_list = browser.find_all('div', {'class': 'post-body entry-content'})
        for post in post_body_list:
            this_image = post.find('img', src=True)
            print(this_image['src'])
            result_urls.append(this_image['src'])
        Next_Post_Link = browser.find('a', {'class': 'blog-pager-older-link'}, href=True)
        if Next_Post_Link is None:
            break
        else:
            base_URL = Next_Post_Link['href']
    return result_urls
def sign_in(username, password):
    """
    Signs into the DOH website and sets the global session to allow other browser instances to access the cookies
    :param username: the username to login with
    :param password: the password to login with
    """
    # If already logged in, don't log in again
    global global_session
    if global_session is not None:
        return True

    # Create Non-JS browser
    browser = RoboBrowser(parser='html.parser')
    # Open login page
    browser.open('https://doh.arcabc.ca/user/login')
    # Get the login form
    form = browser.get_form(id='user-login')
    # Set the username & password
    form['name'].value = username
    form['pass'].value = password
    # Submit the form
    browser.submit_form(form)

    # If successfully signed in
    h1 = browser.find(class_='page__title')
    if h1.text == username:
        # Set the global session
        global_session = browser.session
        return True
    else:
        return False
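# Usage sketch (an assumption, not in the original source): sign_in() returns a boolean
# and caches the session in global_session; the credentials below are placeholders.
if sign_in("demo_user", "demo_password"):
    print("Logged in; cookies available via global_session")
else:
    print("Login failed")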
def find_download_page(podcast, episode):
    download_base = 'https://www.trancepodcasts.com/download/'
    browser = RoboBrowser(history=True)
    browser.open('https://www.trancepodcasts.com/download/{:s}-{:d}/'.format(podcast, episode))
    link = browser.find('a', attrs={'rel': 'nofollow', 'class': 'btn'})
    browser.follow_link(link)
    return browser.response
def usingurllib():
    url = 'https://www.screener.in/'
    rb = RoboBrowser(history=True, parser="html.parser")
    rb.open(url)
    f = {rb.find(placeholder="Company search...").value: 'PCPL'}
    post_args = urllib.parse.urlencode(f)
    fp = urllib.request.urlopen(url, post_args)
    soup = BeautifulSoup(fp)
    print(soup)
def get_digitised_pages(self, entity_id=None):
    '''
    Returns the number of pages (images) in a digitised file.
    Note that you don't need a session id to access these pages,
    so there's no need to go through get_url().
    '''
    # url = 'http://recordsearch.naa.gov.au/scripts/Imagine.asp?B={}&I=1&SE=1'.format(entity_id)
    url = 'https://recordsearch.naa.gov.au/SearchNRetrieve/Interface/ViewImage.aspx?B={}'.format(entity_id)
    br = RoboBrowser(parser='lxml')
    br.open(url)
    try:
        pages = int(br.find('span', attrs={'id': "lblEndPage"}).string)
    except AttributeError:
        if br.find('span', attrs={'id': "lblCitation"}):
            pages = 1
        else:
            pages = 0
    return pages
def FetchFirstImage(url):
    domain = 'https://luscious.net'
    browser = RoboBrowser(history=True, parser='html.parser', user_agent='Chrome/41.0.2228.0')
    browser.open(url)
    wrapper = browser.find('div', {'class': 'album_cover_item'})
    link = domain + wrapper.find('a', href=True)['href']
    browser.open(link)
    container = browser.find('div', {'class': 'ic_container'})
    img_link = container.find('img', src=True)
    return img_link['src']
def solved_captcha(page):
    soup = BeautifulSoup(page, 'lxml')
    if soup.find('form', action='/captcha-form') is None:
        return True
    browser = RoboBrowser()
    browser.open(BASE_URL)
    form = browser.get_form(action='/captcha-form')
    captcha_url = '%s%s' % (BASE_URL, browser.find('img').get('src'))
    answer = get_captcha_answer(captcha_url)
    form['captcha[input]'].value = answer
    browser.submit_form(form)
    return False
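# Usage sketch (an assumption, not in the original source): solved_captcha() returns True
# once the fetched page no longer contains the captcha form, so a caller might retry in a
# loop. Assumes requests is available in this module.
page = requests.get(BASE_URL).text
while not solved_captcha(page):
    page = requests.get(BASE_URL).text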
def test_cleanup_interface(self):
    # Init object
    browser = RoboBrowser(history=True, parser='html.parser')
    browser.open(TEST_URL)
    # Find cleanup form
    cleanup_form = browser.get_form(action='/cleanup')
    self.assertTrue(cleanup_form)
    # Cleanup DB using form
    browser.submit_form(cleanup_form)
    self.assertTrue(browser.find(text="Database cleared"))
def browser_stuff():
    browser = RoboBrowser(parser='html.parser')
    browser.open('http://www-wfau.roe.ac.uk/sss/pixel.html')
    form = browser.get_form()
    form['coords'].value = "00 05 53.9 -34 45 08"
    form['size'].value = "15"
    form['equinox'].value = "1"
    print(form['waveband'].options)
    browser.submit_form(form)
    download_link = str(browser.find("a"))
    download_link = download_link.split(" ")[1].split("\"")[1]
    return download_link
def scrape_revigo_csv(input_GOstats_tsv, out_file, pvalue_cutoff=0.05, fdr_cutoff=1.0):
    """ """
    oh = open(out_file, "w")
    # get input goterms from GOstats result
    goterms = GOstats2Revigo(input_GOstats_tsv, pvalue_cutoff=pvalue_cutoff,
                             fdr_cutoff=fdr_cutoff, output_column=3)
    if goterms:
        br = RoboBrowser(parser="lxml")
        br.open("http://revigo.irb.hr/")
        form = br.get_form()
        # print(form)
        form["goList"].value = goterms
        br.submit_form(form)
        download_rsc_link = br.find("a", href=re.compile("toR.jsp"))
        br.follow_link(download_rsc_link)
        # r_code = br.response.content.decode("utf-8")
        # print(r_code)
        br.back()
        download_csv_link = br.find("a", href=re.compile("export.jsp"))
        br.follow_link(download_csv_link)
        csv_content = br.response.content.decode("utf-8")
        oh.write(csv_content)
    else:
        oh.write(
            "term_ID,description,frequency,plot_X,plot_Y,plot_size,log10 p-value,userVal_2,uniqueness,dispensability,representative,eliminated"
        )
    oh.close()
class RoboBrowserTestCase(StaticLiveServerTestCase, base.AbstractBrowser):
    def setUp(self):
        super().setUp()
        self.browser = RoboBrowser(history=True, parser='html.parser')

    def load(self, url):
        self.browser.open(self.live_server_url + url)

    def get_title(self):
        return self.browser.find('title').text

    def get_form(self, selector):
        return RoboBrowserForm(self.browser, selector)
def get_webdav_urls(username, password):
    # log in
    browser = RoboBrowser(history=True)
    browser.open('http://ctools.umich.edu')
    browser.follow_link(browser.find(id='ctoolsLogin'))
    login_form = browser.get_form()
    login_form['login'].value = username
    login_form['password'].value = password
    browser.submit_form(login_form)

    # get the results
    browser.follow_link(browser.find(
        class_='toolMenuLink ',
        title='For creating, revising, and deleting course and project sites'
    ))
    browser.open(browser.find(class_='portletMainIframe').attrs['src'])

    results = []
    course_links = browser.select('#sitesForm td h4 a[target="_top"]')
    for course_link in course_links:
        if not course_link.attrs:
            continue
        href = course_link.attrs['href']
        if '~' in href:
            continue
        results.append(
            'https://ctools.umich.edu/dav' + findall('\/[^\/]+$', href)[0]
        )
    return results
class FakeMail(object):
    def __init__(self):
        self.browser = RoboBrowser(history=True)
        self.browser.open('http://10minutemail.com/')
        with open('10minmail.txt', 'w') as f:
            f.write(str(self.browser.parsed))
        if self.browser.get_link('Blocked'):
            raise BlockedException('too many login attempts')

    def get_address(self):
        address = self.browser.find("div", {"id": "copyAddress"})
        print(address)

    def read_mail(self):
        pass
def build_cache(): """ Get current data from the website http://www.lfd.uci.edu/~gohlke/pythonlibs/ Returns ------- Dictionary containing package details """ data = {} soup = RoboBrowser() soup.open(MAIN_URL) links = soup.find(class_="pylibs").find_all("a") for link in links: if link.get("onclick") is not None: jsfun = link.get("onclick").split('"') mlstr = jsfun[0].split("(")[1].strip()[1:-2] ml = list(map(int, mlstr.split(","))) mi = jsfun[1] url = parse_url(ml, mi) # Details = [package, version, pyversion, --, arch] details = url.split("/")[-1].split("-") pkg = details[0].lower().replace("_", "-") # Not using EXEs and ZIPs if len(details) != 5: continue # arch = win32 / win_amd64 / any arch = details[4] arch = arch.split(".")[0] # ver = cpXX / pyX / pyXXx pkg_ver = details[1] py_ver = details[2] py_ver_key = py_ver + "-" + arch if pkg in data.keys(): if py_ver_key in data[pkg].keys(): data[pkg][py_ver_key].update({pkg_ver: url}) else: data[pkg][py_ver_key] = {pkg_ver: url} else: data[pkg] = {py_ver_key: {pkg_ver: url}} return data
class Tracker(object): def __init__(self): self.browser = RoboBrowser(history=True, parser='html.parser') self.login() def login(self): self.browser.open(TRACKER_LOGIN) login_form = self.browser.get_form(id='new_member') login_form["member[login]"] = settings.TRACKER_LOGIN login_form["member[password]"] = settings.TRACKER_PASSWORD self.browser.session.headers['Referer'] = TRACKER self.browser.submit_form(login_form) def create_event(self, event, description): try: self.browser.open(TRACKER_EVENT) event_form = self.browser.get_form(id='new_event') event_form["event[title]"] = event["event_name"] event_form["event[contact_name]"] = event["contact"] event_form["event[contactemail]"] = event["email"] event_form[EVENT_DATE + "[description]"] = "Show" event_form[EVENT_DATE + "[location_ids][]"] = "70" start_date = event["start_date"] event_form[EVENT_DATE + "[startdate(1i)]"] = str(start_date.year) event_form[EVENT_DATE + "[startdate(2i)]"] = str(start_date.month) event_form[EVENT_DATE + "[startdate(3i)]"] = str(start_date.day) event_form[EVENT_DATE + "[startdate(5i)]"] = "00" event_form[EVENT_DATE + "[enddate(1i)]"] = str(start_date.year) event_form[EVENT_DATE + "[enddate(2i)]"] = str(start_date.month) event_form[EVENT_DATE + "[enddate(3i)]"] = str(start_date.day) event_form[EVENT_DATE + "[enddate(5i)]"] = "05" event_form["event[notes]"] = description self.browser.submit_form(event_form) if "errors prohibited" in str(self.browser.parsed): return self.browser.find(id="errorExplanation") else: return self.browser.url except Exception as e: return "EXCEPTION! " + str(e)
def get_email_by_cin(cin):
    url = 'http://www.mca.gov.in/mcafoportal/viewCompanyMasterData.do'
    browser = RoboBrowser()
    browser.session.headers['User-Agent'] = random.choice(user_agents)
    browser.open(url)
    form = browser.get_forms()[-1]
    form['companyID'].value = cin
    browser.submit_form(form)
    table = browser.find('table', attrs={'class': 'result-forms'})
    if not table:
        return None
    email_header = table.find('td', text='Email Id')
    if not email_header:
        return None
    email_row = email_header.findNext('td')
    email = email_row.text.strip()
    return email.lower()
def main(): args = docopt(__doc__, version="dailyprogrammer-dl v{}".format(__version__)) # Configure logging logLevel = logging.INFO #default if args['--verbose']: logLevel = logging.DEBUG elif args['--quiet']: logLevel = logging.ERROR logging.basicConfig(format='%(levelname)s: %(message)s', level=logLevel) logging.debug(args) # Process command line arguments challengeURL = args['<challengeurl>'] # Parse project page for title and description logging.info("Parsing daily challenge: {}".format(challengeURL)) browser = RoboBrowser() browser.session.headers['User-Agent'] = "dailyprogrammer-dl v{} by /u/zod77".format(__version__) browser.open(challengeURL) title = browser.find('a',class_='title').string description = browser.find_all('div',class_="md") description = description[1] descriptionHTML = "".join(str(t) for t in description.contents) # remove outer <div> projectName = generateProjectName(title) # Init project skeleton logging.info("Generating project") projectPath = os.path.abspath(projectName) os.mkdir(projectPath) # Write out project files pyTemplate = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"boilerplate.txt")) shutil.copy(pyTemplate, os.path.join(projectPath,"{}.py".format(projectName))) # Generate README.md h = html2text.HTML2Text() descriptionMD = h.handle(descriptionHTML) readme = os.path.join(projectPath,"README.md") with open(readme, "w") as f: f.write(descriptionMD) return
def testProxy(proxy):
    """
    Tests a proxy with api.ipify.org
    If the proxy fails, it retries 20 more times.
    This is because free proxies are unreliable at times.
    """
    tries = 0
    browser = RoboBrowser(history=False, parser='html5lib', timeout=10)
    while True:
        try:
            tries += 1
            browser.open("http://api.ipify.org", proxies={'http': proxy})
            if browser.find('body').text != row['IP Address']:
                raise Exception('Failed')
            return row
        except:
            if tries > 20:
                return None
def gatherData(user, password):
    baseURL = 'https://sigarra.up.pt/feup/pt/'
    browser = RoboBrowser(history=True, parser='html.parser')
    browser.open(baseURL + 'web_page.Inicial')

    # Gets the login form
    form = browser.get_form(action=re.compile(r'validacao'))

    # Updates the login form with the user credentials
    form['p_user'].value = 'up' + user
    form['p_pass'].value = password
    browser.submit_form(form)

    # Goes to the user profile
    browser.open(baseURL + 'fest_geral.cursos_list?pv_num_unico=' + user)

    # Opens the extended view
    extended = browser.find(title='Visualizar informações no contexto do curso')
    browser.follow_link(extended)

    credits = []
    grades = []

    # For each html class containing grades ("i", "p" and "o"), gather data
    for i in browser.find_all(class_='i'):
        if i.find(class_='n aprovado'):
            credits.append(i.find(class_='k n').text)
            grades.append(i.find(class_='n aprovado').text)
    for j in browser.find_all(class_='p'):
        if j.find(class_='n aprovado'):
            credits.append(j.find(class_='k n').text)
            grades.append(j.find(class_='n aprovado').text)
    for k in browser.find_all(class_='o'):
        if k.find(class_='n aprovado'):
            credits.append(k.find(class_='k n').text)
            grades.append(k.find(class_='n aprovado').text)
    return credits, grades
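# Usage sketch (an assumption, not in the original source): gatherData returns parallel
# lists of credits and grades for approved courses; the student number and password
# below are hypothetical placeholders.
credits, grades = gatherData("201800000", "password123")
for ects, grade in zip(credits, grades):
    print(ects, grade)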
def parseTeam(team_url, team_name):
    """
    parses a team's page
    returns a list of year urls
    there is some data on this page that would be useful to scrape in the future
    """
    logger = makeLogger(cleanKey(team_name), r"./logs_pfrTeamStats/")

    startTime = datetime.now()
    logger.debug("Starting %s", team_name)

    wait = random.uniform(1.5, 3.5)
    logger.debug("Waiting %f", wait)
    time.sleep(wait)

    logger.debug("Opening main page")
    browser = RoboBrowser(history=False, parser="html5lib", user_agent=get_user_agent(logger), timeout=10)
    browser = open_or_follow_link(logger, browser, "open", team_url)
    table = browser.find(id="team_index").find("tbody")
    year_columns = table.find_all("th")

    year_url_tups = []
    for index, year_column in enumerate(year_columns):
        logger.debug("Row %d of %d", index, len(year_columns))
        try:
            year_link = year_column.find("a")
            if year_link:
                year_url = "http://www.pro-football-reference.com" + year_link["href"]
                year = convertToNumber(year_link.text)
                if not isinstance(year, int):
                    continue
                year_url_tups.append((team_name, year_url, year))
        except:
            logger.exception(year_column)

    logger.debug("parseTeam time elapsed: " + str(datetime.now() - startTime))
    closeLogger(logger)
    return year_url_tups
def run(wait):
    """ """
    logger = makeLogger('main', r'./logs_pfrPlayerStats/')

    startTime = datetime.now()
    logger.debug('start time: ' + str(startTime))

    browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    player_tuples = []
    for letter in list(string.ascii_uppercase):
        wait = random.uniform(.5, 1.5)
        logger.debug('Waiting %f', wait)
        time.sleep(wait)

        logger.debug('Opening players %s', letter)
        browser = open_or_follow_link(logger, browser, 'open',
                                      "http://www.pro-football-reference.com/players/{}/".format(letter))
        players = browser.find(id="div_players")
        for player in players.find_all('p'):
            player = player.find('a')
            player_tuples.append((player.text, player['href']))

    pool = Pool(processes=int(get_proxy_count() / 2.5))
    logger.debug('Processing %d Players', len(player_tuples))
    for player_tuple in player_tuples:
        #parsePlayer(player_tuple[0], player_tuple[1])
        pool.apply_async(parsePlayer, (player_tuple[0], player_tuple[1],))

    pool.close()  # Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join()  # Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug('run time elapsed: ' + str(datetime.now() - startTime))
    closeLogger(logger)
def scrap(url):
    browser = RoboBrowser(user_agent='i am tool')
    browser.open(url)
    a = browser.find(class_='captcha')  # machine learning would be great for class prediction
    fullsrc = url[:-1] + a['src']
    request.urlretrieve(fullsrc, "captcha.jpg")
    # tesseract OCR would go here
    # (the captcha answer would be filled in right here)
    form = browser.get_form(action=re.compile(r'.'))
    # Fill it out
    form['name'].value = 'namaaaeee'
    form['password'].value = '*****@*****.**'
    form['password2'].value = 'teambeaver'
    form['captcha_1'].value = '1234'
    # Submit the form
    browser.submit_form(form)
    print(browser.response)
def connect(request, mmg):
    """
    Login to MyMedicare.gov using RoboBrowser
    :param request:
    :param username:
    :param password:
    :return:
    """
    mmg_back = mmg
    mmg_back['status'] = "FAIL"

    PARSER = BS_PARSER
    if not PARSER:
        logger.debug('Default Parser for BeautifulSoup:', 'lxml')
        PARSER = 'lxml'

    login_url = 'https://www.mymedicare.gov/default.aspx'

    # This is for testing. Next step is to receive as parameters
    username = mmg['mmg_user']  # 'MBPUSER202A'
    # password = '******'  # 'CMSPWD2USE'
    password = mmg['mmg_pwd']  # 'CMSPWD2USE'

    # Call the default page
    # We will then want to get the Viewstate and eventvalidation entries
    # we need to submit them with the form
    rb = RoboBrowser()
    mmg_back['robobrowser'] = rb

    # Set the default parser (lxml)
    # This avoids BeautifulSoup reporting an issue in the console/log
    rb.parser = PARSER

    # Open the form to start the login
    rb.open(login_url)

    # Get the form content
    form = rb.get_form()
    # if settings.DEBUG:
    #     print("Page:", rb)

    # We will be working with these form fields.
    # Set them as variables for easier re-use
    form_pwd = "ctl00$ContentPlaceHolder1$ctl00$HomePage$SWEPassword"
    form_usr = "******"
    form_agree = "ctl00$ContentPlaceHolder1$ctl00$HomePage$Agree"
    # sign_in = "ctl00$ContentPlaceHolder1$ctl00$HomePage$SignIn"
    # EVENTTARGET = "ctl00$ContentPlaceHolder1$ctl00$HomePage$SignIn"
    form_create_acc = "ctl00$ContentPlaceHolder1$ctl00$HomePage$lnk" \
                      "CreateAccount"

    # Set the form field values
    form.fields[form_usr].value = username
    form.fields[form_pwd].value = password

    # There is a javascript popup after hitting submit
    # It seems to set the following field to "True"
    # Default in form is "False"
    form.fields[form_agree].value = "True"

    # Remove the CreateAccount field. It seems to drive the form
    # to the registration page.
    form.fields.pop(form_create_acc)

    # Capture the dynamic elements from these damned aspnetForms
    # We need to feed them back to allow the form to validate
    VIEWSTATEGENERATOR = form.fields['__VIEWSTATEGENERATOR']._value
    EVENTVALIDATION = form.fields['__EVENTVALIDATION']._value
    VIEWSTATE = form.fields['__VIEWSTATE']._value
    # if settings.DEBUG:
    #     print("EventValidation:", EVENTVALIDATION)
    #     print("ViewStateGenerator:", VIEWSTATEGENERATOR)

    # Set the validator fields back in to the form
    form.fields['__VIEWSTATEGENERATOR'].value = VIEWSTATEGENERATOR
    form.fields['__VIEWSTATE'].value = VIEWSTATE
    form.fields['__EVENTVALIDATION'].value = EVENTVALIDATION

    # Prepare the form for submission
    form.serialize()
    # logger.debug("serialized form:", form)

    # submit the form
    rb.submit_form(form)
    # logger.debug("RB:", rb, "\nRB:", rb.__str__())

    browser = RoboBrowser(history=True)
    if browser:
        pass
    # browser.parser = PARSER
    # logger.debug("Browser History:", browser.history,
    #              "\nBrowser parser:", browser.parser,
    #              # "\nPage html:", rb.parsed
    #              )

    if not rb.url == "https://www.mymedicare.gov/dashboard.aspx":
        err_msg = rb.find("span", {"id": "ctl00_ContentPlaceHolder1"
                                         "_ctl00_HomePage_lblError"})
        if err_msg:
            err_msg = err_msg.contents
            messages.error(request, err_msg)
        messages.error(request, "We had a problem connecting to your "
                                "Medicare account")
        mmg_back['status'] = "FAIL"
        mmg_back['url'] = rb.url
        return mmg_back

    # <ul id="headertoolbarright">
    # <li class="welcometxt" id="welcomeli">Welcome, JOHN A DOE </li>
    my_name = rb.find("li", {"id": "welcomeli"})
    if my_name:
        my_name = my_name.contents[0].replace("Welcome, ", "")

    my_account = rb.find("div", {"id": "RightContent"})
    if my_account:
        my_account = my_account.prettify()
        my_account = my_account.replace('href="/',
                                        'target="_blank" '
                                        'href="https://www.mymedicare.gov/')
        # my_account = my_account.contents
        # href="/mymessages.aspx"
        # href="/myaccount.aspx"
        # href="/plansandcoverage.aspx"
        # my_account.str('href="/mymessages.aspx',
        #                'href="https://www.mymedicare.gov/mymessages.apsx')
        # my_account.str('href="/myaccount.aspx',
        #                'href="https://www.mymedicare.gov/myaccount.aspx')
        # my_account.str('href="/plansandcoverage.aspx',
        #                'href="https://www.mymedicare.gov/plansandcoverage.aspx')

    # if settings.DEBUG:
    #     print("\nMyAccount:", len(my_account), "|", my_account)

    # Need to pass data to context and then render to different
    # template with some data retrieved from MyMedicare.gov
    # If successfully logged in, Or return an error message.
    mmg_back['status'] = "OK"
    mmg_back['url'] = rb.url
    mmg_back['mmg_account'] = my_account
    mmg_back['mmg_name'] = my_name
    mmg_back['robobrowser'] = rb
    # logger.debug("RB post sign-in:", rb,
    #              "rb url:", rb.url)
    return mmg_back
class Submitter: def __init__(self): self.browser = RoboBrowser(parser='html5lib') def login(self): self.browser.open(MTI_BATCH_URL) form = self.browser.get_form('fm1') form['username'] = config.MTI_USERNAME form['password'] = config.MTI_PASSWORD self.browser.submit_form(form) def submit(self, batch_id): logger.info('Submitting MTI batch {:04}'.format(batch_id)) batch = session.query(MtiBatch).filter_by(id=batch_id).one() path = get_batch_file(batch_id, 'abstracts') self.browser.open(MTI_BATCH_URL) form = self.browser.get_form() form['Email_Address'] = config.MTI_EMAIL form['BatchNotes'] = config.MTI_EMAIL form['UpLoad_File'] = open(path) form['Filtering'] = '' form['SingLinePMID'] = 'Yes' form['Output'] = 'detail' self.browser.submit_form(form) # Confirm submit js = self.browser.find('script').text param = MTI_SCHEDULE_RE.search(js).groups()[0] self.browser.open(MTI_CONFIRM_URL + param) batch.submitted = True batch.path = param session.commit() def fetch(self, path, batch_id): session = self.browser.session path = '{}/text.out'.format(path.replace(MTI_PATH_PREFIX, '')) url = urljoin(MTI_BASE_URL, path) resp = session.get(url, stream=True) resp.raise_for_status() with open(get_batch_file(batch_id, 'terms'), 'wb') as fp: for chunk in resp.iter_content(chunk_size=1024): fp.write(chunk) @classmethod def batch_submit(cls): rows = session.query( MtiBatch ).filter_by( submitted=False, done=False, ).limit( MAX_SUBMIT ) if rows: submitter = cls() submitter.login() for batch in rows: submitter.submit(batch.id) @classmethod def batch_fetch(cls): submitter = cls() submitter.login() rows = session.query( MtiBatch ).filter( MtiBatch.submitted == True, # nqa MtiBatch.done == False, # noqa MtiBatch.path != None, # noqa ) for row in rows: try: submitter.fetch(row.path, row.id) load_batch(row.id) except HTTPError as exc: logger.exception(exc)
def authenticate(self, username=None, password=None): login_url = 'https://www.mymedicare.gov/default.aspx' rb = RoboBrowser() rb.parser = 'lxml' rb.open(login_url) # Get the form content form = rb.get_form() if settings.DEBUG: print("Page:", rb) # We will be working with these form fields. # Set them as variables for easier re-use form_pwd = "ctl00$ContentPlaceHolder1$ctl00$HomePage$SWEPassword" form_usr = "******" form_agree = "ctl00$ContentPlaceHolder1$ctl00$HomePage$Agree" form_create_acc = "ctl00$ContentPlaceHolder1$ctl00$HomePage$lnk" \ "CreateAccount" # Set the form field values form.fields[form_usr].value = username form.fields[form_pwd].value = password # There is a javascript popup after hitting submit # that the form_agree to "True" # Default in form is "False" form.fields[form_agree].value = "True" # Remove the CreateAccount field. It seems to drive the form # to the registration page. form.fields.pop(form_create_acc) # Capture the dynamic elements from these damned aspnetForms # We need to feed them back to allow the form to validate VIEWSTATEGENERATOR = form.fields['__VIEWSTATEGENERATOR']._value EVENTVALIDATION = form.fields['__EVENTVALIDATION']._value VIEWSTATE = form.fields['__VIEWSTATE']._value # Set the validator fields back in to the form form.fields['__VIEWSTATEGENERATOR'].value = VIEWSTATEGENERATOR form.fields['__VIEWSTATE'].value = VIEWSTATE form.fields['__EVENTVALIDATION'].value = EVENTVALIDATION # Prepare the form for submission form.serialize() # submit the form rb.submit_form(form) # If the login was successful then we would be redirected to the dashboard. if rb.url == "https://www.mymedicare.gov/dashboard.aspx": """The login worked.""" # Get the name my_name = rb.find("li", {"id": "welcomeli"}) if my_name: my_name = my_name.contents[0].replace("Welcome, ", "") split_name = my_name.split(' ') first_name = split_name[0] last_name = split_name[-1] if not last_name: last_name = split_name[-2] try: user = User.objects.get(username=username) except User.DoesNotExist: # Create a new user. Note that we can set password # to anything, because it won't be checked; the password # from the external backend is checked (coming from settings). user = User(username=username, password='******', first_name=first_name, last_name=last_name) user.save() up, created = UserProfile.objects.get_or_create(user=user, user_type='BEN') group = Group.objects.get(name='BlueButton') user.groups.add(group) return user # The MyMedicare login failed. return None
def parseYear(year): """ parses a schedule for a specific year on http://www.pro-football-reference.com/years/{YEAR}/games.htm follows all the "boxscore" links (column[3]) to get stadium and weather conditions (game_info) stores schedule info in nfl_data.schedule stores game_info in nfl_data.game_info with schedule ids """ logger = makeLogger(year, r'./logs_pfrSchedule/') startTime = datetime.now() logger.debug('Starting %d', year) schedule_list = [] gameInfo_list = [] client = MongoClient('localhost', 27017) db = client['nfl_data'] col_schedule = db['schedule'] col_game_info = db['game_info'] col_failed_game_info = db['failed_game_info'] if col_schedule.find({'year': year}).count(): logger.debug('Already parsed %s', year) closeLogger(logger) return None wait = random.uniform(1.5,3.5) logger.debug('Waiting %f', wait) time.sleep(wait) logger.debug('Opening main page') browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10) browser = open_or_follow_link(logger, browser, 'open', "http://www.pro-football-reference.com/years/{}/games.htm".format(year)) table = browser.find(id='games') rows = table.find_all('tr') for index, row in enumerate(rows): logger.debug('Row %d of %d', index, len(rows)) try: schedule_dict = {} gameInfo_dict = {} columns = row.find_all('td') if columns: schedule_dict['week'] = convertToNumber(columns[0].text) schedule_dict['day'] = columns[1].text schedule_dict['date'] = columns[2].text schedule_dict['year'] = convertToNumber(year) homeIndicator = columns[5].text if homeIndicator == '@': schedule_dict['homeTeam'] = columns[6].text schedule_dict['awayTeam'] = columns[4].text schedule_dict['homeTeamScore'] = convertToNumber(columns[8].text) schedule_dict['awayTeamScore'] = convertToNumber(columns[7].text) else: schedule_dict['homeTeam'] = columns[4].text schedule_dict['awayTeam'] = columns[6].text schedule_dict['homeTeamScore'] = convertToNumber(columns[7].text) schedule_dict['awayTeamScore'] = convertToNumber(columns[8].text) gameInfo_dict['week'] = convertToNumber(columns[0].text) gameInfo_dict['year'] = convertToNumber(year) wait = random.uniform(.5, 2.5) logger.debug('Waiting to follow_link %f', wait) time.sleep(wait) logger.debug('Following link') url = columns[3].find('a') if url: url = 'http://www.pro-football-reference.com' + url['href'] failed_game_info = True browser = open_or_follow_link(logger, browser, 'open', url) game_info = browser.find(id="game_info") if game_info: for each in game_info.find_all('tr'): pair = each.find_all('td') if pair: failed_game_info = False key = pair[0].text value = convertToNumber(pair[1].text) gameInfo_dict[cleanKey(key)] = convertToNumber(value) if failed_game_info: failed_dict = schedule_dict failed_dict['row'] = index failed_dict['href'] = url['href'] col_failed_game_info.insert(failed_dict) gameInfo_dict['FAIL'] = True schedule_list.append(schedule_dict) gameInfo_list.append(gameInfo_dict) except: logger.exception(row) logger.debug('nfl_schedule.inert_many') schedule_ids = col_schedule.insert_many(schedule_list).inserted_ids logger.debug('mapping nfl_schedule.id to gameInfo_list') for index, schedule_id in enumerate(schedule_ids): if len(gameInfo_list[index].keys()) <= 2: logger.debug('Empty game_info: %s', schedule_id) gameInfo_list[index]['schedule_id'] = schedule_id logger.debug('game_info.insert_many') col_game_info.insert_many(gameInfo_list) logger.debug('parseYear time elapsed: ' + str(datetime.now() - startTime)) closeLogger(year)
# coding: utf-8
import re
from robobrowser import RoboBrowser

url = "http://www.qq.com/"
b = RoboBrowser(history=True)
b.open(url)

# get the "today's topic" link
today_top = b.find(id="todaytop").a
print(today_top["href"])

b.follow_link(today_top)

# at this point the browser has already jumped to the topic's own page
# print the title
title = b.select(".hd h1")[0]
print("*************************************")
print(title.text)
print("*************************************")

# print the article body
print(b.find(id="articleContent").text)
import re
from robobrowser import RoboBrowser

url = 'http://itest.info/courses/2'
b = RoboBrowser(history=True)
b.open(url)

class_name = b.select('.headline h2')
print(class_name[0].text)

class_desc = b.select('.tag-box')
print(class_desc[0].text)

class_time = b.select('h4')
print(class_time[0].text)

teacher = b.select('.thumbnail-style h3')
print(teacher[0].text)

qq = b.find(text=re.compile('QQ'))
print(qq)

qq_group = b.find(text=re.compile('\+selenium'))
print(qq_group)
USERAGENTS = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:37.0) Gecko/20100101 Firefox/37.0'

session = req_session()
session.headers.update({'Referer': 'https://www.deviantart.com'})
browser = RoboBrowser(history=False, session=session, tries=2, user_agent=USERAGENTS)

print("Attempting to log in to deviantArt...")
browser.open('https://www.deviantart.com/users/login?ref=https%3A%2F%2Fwww.deviantart.com%2F&remember_me=1')
form = browser.get_forms()[1]
form['username'] = USERNAME
form['password'] = PSWD
# print(form)

if browser.find(text=re.compile("Login")):
    print('Compiled login fields form...')
    browser.submit_form(form)

if browser.find(text=re.compile("The password you entered was incorrect")):
    print("Wrong password or username. Attempting to download anyway.")
    exit()
elif browser.find(text=re.compile("\"loggedIn\":true")):
    print("Logged in!")
else:
    print("Login unsuccessful. Attempting to download anyway.")
    exit()

browser.open('https://www.deviantart.com/messages/#view=deviantwatch')
page = browser.select('body')
def parseSeason(role, category, season, seasonTypes): """Parses every seasonType in a season at http://www.nfl.com/stats/categorystats for a given role/category/season doesnt follow any links some years dont have any info, but still return a page. These are loged with Exception('No teams found %s' % url) All data is stored in team_stats """ logger = makeLogger(role.text + '_' + category.text + '_' + season.text, r'./logs_nflteamStat/') startTime = datetime.now() logger.debug('Starting %s %s %s', role.text, category.text, season.text) teamStat_list = [] for seasonType in seasonTypes: if seasonType.text == "Season Type...": continue team_stats_query = {'year': convertToNumber(removeNewLine(season.text)), 'seasonType': removeNewLine(seasonType.text), 'role': removeNewLine(role.text), 'category': removeNewLine(category.text) } if col_team_stats.find(team_stats_query).count(): logger.debug('Already parsed %s', team_stats_query) continue wait = random.uniform(1.5,3.5) logger.debug('Waiting %f', wait) time.sleep(wait) logger.debug('Starting: %s', team_stats_query) url = 'http://www.nfl.com/stats/categorystats?' + 'archive=true&conference=null' + '&role=' + role['value'] try: if role.text == "Offense": categoryUrl = '&offensiveStatisticCategory=' + category['value'] + '&defensiveStatisticCategory=null' elif role.text == "Defense": categoryUrl = '&offensiveStatisticCategory=null&defensiveStatisticCategory=' + category['value'] else: raise Exception('Unsupported role: %s', role.text) url += categoryUrl url += '&season=' + season['value'] + '&seasonType=' + seasonType['value'] + '&tabSeq=2&qualified=false&Submit=Go' logger.debug('Opening: %s', url) browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10) browser = open_or_follow_link(logger, browser, 'open', url) result = browser.find(id="result") tries = 0 # sometimes when using slow proxies nfl.com returns 200 without the whole page being loaded while not result: if tries > 10: raise Exception('No teams found %s' % url) elif tries > 0: time.sleep(random.uniform(5, 7)) tries += 1 logger.debug('No result-tries: %d', tries) browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10) browser = open_or_follow_link(logger, browser, 'open', url) result = browser.find(id="result") tbodies = result.find_all("tbody") if len(tbodies) != 2: raise Exception("error parsing result") tableKey = tbodies[0] tableKeyRows = tableKey.find_all("tr") topTableKeys = [] if len(tableKeyRows) == 1: tableKey = tableKey.find_all("th") elif len(tableKeyRows) == 2: topTableColumns = tableKeyRows[0].find_all("th") for topTableColumn in topTableColumns: for _ in range(int(topTableColumn['colspan'])): topTableKeys.append(topTableColumn.text) tableKey = tableKeyRows[1].find_all("th") else: raise Exception('To many header rows found') tableItems = tbodies[1] tableItems = tableItems.find_all("td") tableColumn = 0 teamStatDict = {} for tableIndex, tableItem in enumerate(tableItems): if tableColumn == 0: logger.debug('Row %d of %d', tableIndex, len(tableItems)) tableColumn += 1 continue if tableColumn == 1: teamStatDict['team'] = removeNewLine(tableItem.text) teamStatDict['year'] = int(removeNewLine(season.text)) teamStatDict['seasonType'] = removeNewLine(seasonType.text) teamStatDict['role'] = removeNewLine(role.text) teamStatDict['category'] = removeNewLine(category.text) tableColumn += 1 continue if topTableKeys and topTableKeys[tableColumn]: key = topTableKeys[tableColumn] + '_' + 
tableKey[tableColumn].text else: key = tableKey[tableColumn].text key = cleanKey(removeNewLine(key)) value = convertToNumber(removeNewLine(tableItem.text)) teamStatDict[key] = value tableColumn += 1 if tableColumn >= len(tableKey): teamStat_list.append(teamStatDict) teamStatDict = {} tableColumn = 0 except: logger.exception('row fail') try: if teamStat_list: logger.debug('Bulk Creating teamStat_list') col_team_stats.insert_many(teamStat_list) except: logger.exception('insert_many error') logger.debug('parseSeason time elapsed: ' + str(datetime.now() - startTime)) closeLogger(role.text + '_' + category.text)
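parseSeason relies on several small helpers (removeNewLine, convertToNumber, cleanKey) that are not shown in this section. The versions below are hypothetical stand-ins that match how the function uses them, not the project's actual implementations:

import re

def removeNewLine(text):
    """Collapse newlines and runs of whitespace into single spaces."""
    return ' '.join(text.split())

def convertToNumber(value):
    """Return an int or float if the string parses as one, otherwise the string itself."""
    for cast in (int, float):
        try:
            return cast(value)
        except (TypeError, ValueError):
            continue
    return value

def cleanKey(key):
    """Replace characters MongoDB disallows in field names ('.', '$', whitespace) with underscores."""
    return re.sub(r'[.$\s]+', '_', key).strip('_')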
class RSClient(): def __init__(self): self._create_browser() def _create_browser(self): url = 'http://recordsearch.naa.gov.au/scripts/Logon.asp?N=guest' self.br = RoboBrowser(parser='lxml', user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.81 Safari/537.36') self.br.open(url) form = self.br.get_form(id='t') self.br.submit_form(form) def _open_url(self, url): ''' RecordSearch inserts a page that needs to have an embedded form automatically submitted before you get what you actually want. ''' self.br.open(url) form = self.br.get_form(id='t') self.br.submit_form(form) def _get_details(self, entity_id): ''' Given an id retrieve the element containing the item details. ''' if (not entity_id and self.entity_id) or (entity_id == self.entity_id): details = self.details else: url = '{}{}'.format(RS_URLS[self.entity_type], quote_plus(entity_id)) self._open_url(url) details = self.br.find('div', 'detailsTable') if details: self.entity_id = entity_id self.details = details else: raise UsageError('No details found for {}'.format(entity_id)) return details def _get_cell(self, label, entity_id): details = self._get_details(entity_id) try: cell = ( details.find(text=re.compile(label)) .parent.parent.findNextSiblings('td')[0] ) except (IndexError, AttributeError): # Sometimes the cell labels are inside an enclosing div, # but sometimes not. Try again assuming no div. try: cell = ( details.find(text=re.compile(label)) .parent.findNextSiblings('td')[0] ) except (IndexError, AttributeError): cell = None return cell def _get_value(self, label, entity_id): cell = self._get_cell(label, entity_id) try: value = ' '.join([string for string in cell.stripped_strings]) except AttributeError: value = None return value def _get_formatted_dates(self, label, entity_id, date_format): try: date_str = self._get_value(label, entity_id) except AttributeError: dates = {'date_str': date_str, 'start_date': None, 'end_date': None} else: if date_str: dates = utilities.process_date_string(date_str) if date_format == 'iso': formatted_dates = { 'date_str': date_str, 'start_date': utilities.convert_date_to_iso(dates['start_date']), 'end_date': utilities.convert_date_to_iso(dates['end_date']), } elif date_format == 'obj': formatted_dates = dates else: formatted_dates = {'date_str': None, 'start_date': None, 'end_date': None} return formatted_dates def _get_relations(self, label, entity_id, date_format): cell = self._get_cell(label, entity_id) relations = [] if cell is not None: for relation in cell.findAll('li'): try: date_str = relation.find('div', 'dates').string.strip() except AttributeError: date_str = '' dates = {'date_str': '', 'start_date': None, 'end_date': None} else: dates = utilities.process_date_string(date_str) if date_format == 'iso': formatted_dates = { 'date_str': date_str, 'start_date': utilities.convert_date_to_iso(dates['start_date']), 'end_date': utilities.convert_date_to_iso(dates['end_date']), } elif date_format == 'obj': formatted_dates = dates details = [string for string in relation.find('div', 'linkagesInfo').stripped_strings] try: identifier = details[0] title = details[1][2:] except IndexError: identifier = details[0] title = details[0] relations.append({ 'date_str': formatted_dates['date_str'], 'start_date': formatted_dates['start_date'], 'end_date': formatted_dates['end_date'], 'identifier': identifier, 'title': title }) else: relations = None return relations def get_digitised_pages(self, entity_id=None): ''' Returns the number of pages 
(images) in a digitised file. Note that you don't need a session id to access these pages, so there's no need to go through get_url(). ''' # url = 'http://recordsearch.naa.gov.au/scripts/Imagine.asp?B={}&I=1&SE=1'.format(entity_id) url = 'http://recordsearch.naa.gov.au/SearchNRetrieve/Interface/ViewImage.aspx?B={}'.format(entity_id) br = RoboBrowser(parser='lxml') br.open(url) try: pages = int(br.find('span', attrs={'id': "lblEndPage"}).string) except AttributeError: pages = 0 return pages def _get_advanced_search_form(self): # Added header 10 June 2015 -- otherwise causes error self.br.session.headers.update({'Referer': 'http://recordsearch.naa.gov.au/SearchNRetrieve/Interface/SearchScreens/BasicSearch.aspx'}) self.br.open('http://recordsearch.naa.gov.au/SearchNRetrieve/Interface/SearchScreens/AdvSearchItems.aspx') search_form = self.br.get_form(id="formSNRMaster") return search_form
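A hypothetical way to exercise RSClient, assuming RS_URLS, UsageError and the utilities module are defined elsewhere in the same file as the class implies; the barcode passed to get_digitised_pages is a placeholder:

if __name__ == '__main__':
    client = RSClient()                               # logs on as a guest and keeps the session
    pages = client.get_digitised_pages('3445411')     # placeholder item barcode
    print('Digitised pages found: {}'.format(pages))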
def parseWeek(year, week): """Parses a specific week at http://nflweather.com/week/{}/Week-{}. Follows all detail links, which is where most of the data is scraped. Scrapes weather and stadium info per week, and stores them in their respective collections. """ logger = makeLogger(str(year) + '_' + str(week), r'./logs_nflWeather/') startTime = datetime.now() logger.debug('Starting %d %d', year, week) weather_list = [] stadium_list = [] if col_weather_info.find({'year': year, 'week': week}).count(): logger.debug('Already parsed %d %d', year, week) return None wait = random.uniform(1.5,3.5) logger.debug('Waiting %f', wait) time.sleep(wait) logger.debug('Opening main page') browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10) browser = open_or_follow_link(logger, browser, 'open', "http://nflweather.com/week/{}/Week-{}".format(year, week)) data = browser.find(class_="footable") rows = data.find_all('tr') for index, row in enumerate(rows): logger.debug('Row %d of %d', index, len(rows)) weatherInfo = {'year': year, 'week': week} stadiumInfo = {'year': year, 'week': week} try: columns = row.find_all('td') if columns: weatherInfo['weatherPicAlt'] = columns[8].find('img')['alt'] weatherInfo['weatherText'] = columns[9].text.strip() weatherInfo['shortWind'] = columns[10].text details = columns[12] detailsLink = 'http://nflweather.com' + details.find('a')['href'] wait = random.uniform(.5, 2.5) logger.debug('Waiting to follow_link %f', wait) time.sleep(wait) logger.debug('Following link') browser = open_or_follow_link(logger, browser, 'open', detailsLink) gameTime = browser.find('strong').text.split('-')[0].split(':', 1)[1].strip() awayTeam = browser.find_all(class_='g-away')[1].find('a').text.replace(u'\xa0', ' ').strip() homeTeam = browser.find_all(class_='g-home')[1].find('a').text.replace(u'\xa0', ' ').strip() spans = browser.find_all(class_='span5') if len(spans) != 2: raise Exception('too many spans') weatherItems = spans[0].find_all('p') stadiumItems = spans[1].find_all('p') index = spans[0].text.find('Temperature:') weatherCondition = spans[0].text[:index].strip() for each in weatherItems: split = each.text.strip().split(':') if len(split) == 2: weatherInfo[cleanKey(split[0].strip())] = convertToNumber(split[1].strip()) for index, each in enumerate(stadiumItems): split = each.text.strip().split(':') if len(split) == 2: if split[0] == 'Surface': stadiumInfo['stadium'] = stadiumItems[index-1].text.strip() stadiumInfo[cleanKey(split[0].strip())] = convertToNumber(split[1].strip()) # find nfl_schedule, update gameTime, hopefully result as id, insert id into both info dicts, append to _list schedule_query = {'year': year, 'week': week, 'homeTeam': homeTeam, 'awayTeam': awayTeam} schedule_doc = col_schedule.find(schedule_query) if schedule_doc.count() != 1: error_docs = str(schedule_query) + ' | ' + str(weatherInfo) + ' | ' + str(stadiumInfo) raise Exception("nfl_schedule doc not found " + error_docs) result = col_schedule.update_one(schedule_query, {'$set': {'dateTime': gameTime}}) schedule_id = schedule_doc[0]['_id'] weatherInfo['schedule_id'] = schedule_id stadiumInfo['schedule_id'] = schedule_id weather_list.append(weatherInfo) stadium_list.append(stadiumInfo) except: logger.exception(row) try: logger.debug('Bulk Creating weather_list') col_weather_info.insert_many(weather_list) logger.debug('Bulk Creating stadium_list') col_stadium_info.insert_many(stadium_list) except: logger.exception('insert_many error') logger.debug('parseWeek time elapsed: ' + 
str(datetime.now() - startTime)) closeLogger(str(year) + '_' + str(week))
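Both parseSeason and parseWeek call an open_or_follow_link helper that is not defined in this section. A plausible minimal stand-in, assuming it simply retries the requested RoboBrowser action and returns the browser, could be:

import time
import random

def open_or_follow_link(logger, browser, action, target, retries=3):
    """Call browser.open(target) or browser.follow_link(target), retrying a few times on failure."""
    for attempt in range(1, retries + 1):
        try:
            getattr(browser, action)(target)
            return browser
        except Exception:
            logger.exception('%s failed (attempt %d of %d)', action, attempt, retries)
            time.sleep(random.uniform(2, 5))
    raise Exception('{} failed after {} attempts: {}'.format(action, retries, target))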
class Dagr: """deviantArt gallery ripper class""" NAME = basename(__file__) __version__="0.60" MAX_DEVIATIONS = 1000000 # max deviations def __init__(self): # Internals self.browser = None self.errors_count = dict() # Configuration self.username = "" self.password = "" self.overwrite = False self.reverse = False self.testOnly = False self.verbose = False # Current status self.deviant = "" def start(self): if not self.browser: # Set up fake browser self.set_browser() # Always run login self.login() def set_browser(self): USERAGENTS = ( 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)', 'Opera/9.99 (Windows NT 5.1; U; pl) Presto/9.9.9', 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/ Safari/530.5', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/6.0', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; pl; rv:1.9.1) Gecko/20090624 Firefox/3.5 (.NET CLR 3.5.30729)' ) session = req_session() session.headers.update({'Referer': 'http://www.deviantart.com/'}) self.browser = RoboBrowser(history=False, session=session, tries=3, user_agent=random.choice(USERAGENTS)) def login(self): if not (self.username and self.password): return print("Attempting to log in to deviantArt...") self.browser.open('https://www.deviantart.com/users/login?ref=http%3A%2F%2Fwww.deviantart.com%2F&remember_me=1') form = self.browser.get_forms()[1] form['username'] = self.username form['password'] = self.password self.browser.submit_form(form) if self.browser.find(text=re.compile("The password you entered was incorrect")): print("Wrong password or username. Attempting to download anyway.") elif self.browser.find(text=re.compile("\"loggedIn\":true")): print("Logged in!") else: print("Login unsuccessful. 
Attempting to download anyway.") def get(self, url, file_name = None): if file_name is not None and (self.overwrite == False) and (path_exists(file_name)): print(file_name + " exists - skipping") return #TODO Test robobrowser retries and exceptions self.browser.open(url) if file_name is None: return str(self.browser.parsed) else: # Open our local file for writing local_file = open(file_name, "wb") #Write to our local file local_file.write(self.browser.response.content) local_file.close() def find_link(self, link): filelink = None mature_error = False self.browser.open(link) # Full image link (via download link) img_link = self.browser.get_link(text=re.compile("Download( (Image|File))?")) if img_link and img_link.get("href"): self.browser.follow_link(img_link) filelink = self.browser.url else: if self.verbose: print("Download link not found, falling back to direct image") # Fallback 1: try meta (filtering blocked meta) filesearch = self.browser.find("meta", {"name":"og:image"}) if filesearch: filelink = filesearch['content'] if basename(filelink).startswith("noentrythumb-"): filelink = None mature_error = True if not filelink: # Fallback 2: try collect_rid, full filesearch = self.browser.find("img", {"collect_rid":True, "class":re.compile(".*full")}) if not filesearch: # Fallback 3: try collect_rid, normal filesearch = self.browser.find("img", {"collect_rid":True, "class":re.compile(".*normal")}) if filesearch: filelink = filesearch['src'] if not filelink: if mature_error: raise DagrException("probably a mature deviation") else: raise DagrException("all attemps to find a link failed") filename = basename(filelink) return (filename, filelink) def handle_download_error(self, link, e): error_string = str(e) print("Download error (" + link + ") : " + error_string) if error_string in self.errors_count: self.errors_count[error_string] += 1 else: self.errors_count[error_string] = 1 def deviant_get(self, mode): print("Ripping " + self.deviant + "'s " + mode + "...") pat = "http://[a-zA-Z0-9_-]*\.deviantart\.com/art/[a-zA-Z0-9_-]*" modeArg = '_' if mode.find(':') != -1: mode = mode.split(':',1) modeArg = mode[1] mode = mode[0] #DEPTH 1 pages = [] for i in range(0,int(Dagr.MAX_DEVIATIONS/24),24): html = "" url = "" if mode == "favs": url = "http://" + self.deviant.lower() + ".deviantart.com/favourites/?catpath=/&offset=" + str(i) elif mode == "collection": url = "http://" + self.deviant.lower() + ".deviantart.com/favourites/" + modeArg + "?offset=" + str(i) elif mode == "scraps": url = "http://" + self.deviant.lower() + ".deviantart.com/gallery/?catpath=scraps&offset=" + str(i) elif mode == "gallery": url = "http://" + self.deviant.lower() + ".deviantart.com/gallery/?catpath=/&offset=" + str(i) elif mode == "album": url = "http://" + self.deviant.lower() + ".deviantart.com/gallery/" + modeArg + "?offset=" + str(i) elif mode == "query": url = "http://" + self.deviant.lower() + ".deviantart.com/gallery/?q=" + modeArg + "&offset=" + str(i) else: continue html = self.get(url) prelim = re.findall(pat, html, re.IGNORECASE|re.DOTALL) c = len(prelim) for match in prelim: if match in pages: c -= 1 else: pages.append(match) done = re.findall("(This section has no deviations yet!|This collection has no items yet!)", html, re.IGNORECASE|re.S) if len(done) >= 1 or c <= 0: break print(self.deviant + "'s " + mode + " page " + str(int((i/24)+1)) + " crawled...") if not self.reverse: pages.reverse() if len(pages) == 0: print(self.deviant + "'s " + mode + " had no deviations.") return 0 else: try: 
da_make_dirs(self.deviant + "/" + mode) if (mode == "query") or (mode == "album") or (mode == "collection"): da_make_dirs(self.deviant + "/" + mode + "/" + modeArg) except Exception as e: print(str(e)) print("Total deviations in " + self.deviant + "'s gallery found: " + str(len(pages))) ##DEPTH 2 counter2 = 0 for link in pages: counter2 += 1 if self.verbose: print("Downloading " + str(counter2) + " of " + str(len(pages)) + " ( " + link + " )") filename = "" filelink = "" try: filename,filelink = self.find_link(link) except (KeyboardInterrupt, SystemExit): raise except Exception as e: self.handle_download_error(link, e) continue if self.testOnly == False: if (mode == "query") or (mode=="album") or (mode == "collection"): self.get(filelink, self.deviant + "/" + mode + "/" + modeArg + "/" + filename) else: self.get(filelink, self.deviant + "/" + mode + "/" + filename) else: print(filelink) print(self.deviant + "'s gallery successfully ripped.") def group_get(self, mode): if mode == "favs": strmode = "favby" strmode2 = "favourites" strmode3 = "favs gallery" elif mode == "gallery": strmode = "gallery" strmode2 = "gallery" strmode3 = "gallery" else: print("?") sys.exit() print("Ripping " + self.deviant + "'s " + strmode2 + "...") folders = [] insideFolder = False #are we inside a gallery folder? html = self.get('http://' + self.deviant + '.deviantart.com/' + strmode2 + '/') if re.search(strmode2 + "/\?set=.+&offset=", html, re.IGNORECASE|re.S): insideFolder = True folders = re.findall(strmode + ":.+ label=\"[^\"]*\"", html, re.IGNORECASE) #no repeats folders = list(set(folders)) i = 0 while not insideFolder: html = self.get('http://' + self.deviant + '.deviantart.com/' + strmode2 + '/?offset=' + str(i)) k = re.findall(strmode + ":" + self.deviant + "/\d+\"\ +label=\"[^\"]*\"", html, re.IGNORECASE) if k == []: break flag = False for match in k: if match in folders: flag = True else: folders+=k if self.verbose: print("Gallery page " + str(int((i/10) + 1)) + " crawled...") if flag: break i += 10 #no repeats folders = list(set(folders)) if len(folders) == 0: print(self.deviant + "'s " + strmode3 + " is empty.") return 0 else: print("Total folders in " + self.deviant + "'s " + strmode3 + " found: " + str(len(folders))) if self.reverse: folders.reverse() pat = "http:\\/\\/[a-zA-Z0-9_-]*\.deviantart\.com\\/art\\/[a-zA-Z0-9_-]*" pages = [] for folder in folders: try: folderid = re.search("[0-9]+",folder,re.IGNORECASE).group(0) label = re.search("label=\"([^\"]*)",folder,re.IGNORECASE).group(1) except: continue for i in range(0,int(Dagr.MAX_DEVIATIONS/24),24): html = self.get("http://" + self.deviant.lower() + ".deviantart.com/" + strmode2 + "/?set=" + folderid + "&offset=" + str(i - 24)) prelim = re.findall(pat, html, re.IGNORECASE) if not prelim: break for x in prelim: p = str(re.sub(r'\\/','/',x)) if p not in pages: pages.append(p) if self.verbose: print("Page " + str(int((i/24) + 1)) + " in folder " + label + " crawled...") if not self.reverse: pages.reverse() try: if mode == "favs": da_make_dirs(self.deviant + "/favs/" + label) elif mode == "gallery": da_make_dirs(self.deviant + "/" + label) except Exception as err: print(err) counter = 0 for link in pages: counter += 1 if self.verbose: print("Downloading " + str(counter) + " of " + str(len(pages)) + " ( " + link + " )") filename = "" filelink = "" try: filename,filelink = self.find_link(link) except (KeyboardInterrupt, SystemExit): raise except Exception as e: self.handle_download_error(link, e) continue if self.testOnly == False: if mode == 
"favs": self.get(filelink, self.devianti + "/favs/" + label + "/" + filename) elif mode == "gallery": self.get(filelink, self.deviant + "/" + label + "/" + filename) else: print(filelink) print(self.deviant + "'s " + strmode3 + " successfully ripped.") def print_errors(self): if len(self.errors_count): print("Download errors count:") for error, count in self.errors_count.iteritems(): print("* " + error + " : " + str(count))
def get_medicare_email(request, mmg): """ Scrape the account email address from the MyMedicare.gov My Account page. :param request: :param mmg: :return: updated mmg dict with status, url and mmg_email """ mmg_back = mmg mmg_back['status'] = "FAIL" mmg_back['mmg_email'] = "" PARSER = settings.BS_PARSER if not PARSER: if settings.DEBUG: print('Default Parser for BeautifulSoup:', 'lxml') PARSER = 'lxml' # Open the default page rb = RoboBrowser() # Set the default parser (lxml) # This avoids BeautifulSoup reporting an issue in the console/log rb.parser = PARSER target_page = "https://www.mymedicare.gov/myaccount.aspx" # Open the account page rb.open(target_page) # Get the page content page = rb.parsed if settings.DEBUG: print("===============================") print("on page:", rb.url) print("MyAccount:", page) my_email = rb.find("div", attrs={"class": "ctl00_ctl00_ContentPlaceHolder1_ctl00_ctl00_ctl00_ctl01_UserInfo_pnlEmailSettings"}) if settings.DEBUG: print("What email information:", my_email) if my_email: mail_addr = my_email.find("div", attrs={"class": "myaccount-data"}) if mail_addr: mmg_back['mmg_email'] = mail_addr.text if rb.url == target_page: mmg_back['url'] = rb.url mmg_back['status'] = "OK" if settings.DEBUG: print("Email:", mmg_back['mmg_email']) print("url:", rb.url) return mmg_back
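One thing to note about get_medicare_email: mmg_back = mmg binds a second name to the caller's dict rather than copying it, so the caller's mmg is mutated as well. A small, self-contained illustration of aliasing versus a shallow copy (dict(mmg)):

# Illustration only: aliasing versus copying a dict.
mmg = {'status': '', 'mmg_email': ''}
alias = mmg
alias['status'] = 'FAIL'
assert mmg['status'] == 'FAIL'   # the original changed as well

safe = dict(mmg)
safe['status'] = 'OK'
assert mmg['status'] == 'FAIL'   # the original is untouched this time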