def fetch(url):
    browser = RoboBrowser(history=True, parser="html.parser")
    browser.open(url)
    votes = browser.select('.moderatorenSlider a.beitrag')
    followed_links = set()
    total_scores = {}
    for v in votes:
        if v["href"] in followed_links:
            continue
        followed_links.add(v["href"])
        print(v["href"])
        browser.follow_link(v)
        try:
            scores = extractVotes(browser)
            print(scores)
            for title, score in scores.items():
                if title not in total_scores:
                    total_scores[title] = (score, 1)
                else:
                    score_, num = total_scores[title]
                    total_scores[title] = (score_ + score, num + 1)
        except Exception as e:
            print(e)
        browser.back()
    return total_scores
def scrape_cosmo_exam(url, email, password):
    browser = RoboBrowser()
    browser.open(url)
    search = browser.get_form()
    search['user[email]'] = email
    search['user[password]'] = password
    browser.submit_form(search, submit=search.submit_fields['commit'])

    # main page
    browser.follow_link(browser.find_all('a')[2])

    all_links = browser.find_all('a')
    announcements_key = list(filter(lambda x: 'Announcements' in x, all_links))[0]
    announcement_ind = all_links.index(announcements_key)
    browser.follow_link(browser.find_all('a')[announcement_ind])

    # helper function 2: pull the second child (the date) out of every <h2> tag
    def date_extract(ind):
        return list(mapper(lambda x: list(x.children)[1], browser.find_all('h2')))

    # helper function 3: keep only entries matching any of the given keywords
    def matcher(lst, *matches):
        if not matches:
            matches = ['exam', 'reminder']
        else:
            matches = matches[0]
        return filterer(
            lambda x: any(string.lower() in str(x).lower() for string in matches),
            lst)

    # obtaining title objects - tags (see the mapper/filterer sketch below)
    titles = mapper(lambda x: date_extract(1), browser.find_all('h2'))[0]
    return titles
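Several of these snippets (scrape_cosmo_exam, scrape_cs2040s, scrape_cosmo) call `mapper` and `filterer` helpers that are not defined in the excerpts. Presumably they are list-returning wrappers around the built-in map() and filter(); a minimal sketch under that assumption:

def mapper(func, iterable):
    """Hypothetical helper: apply func to every item and return a list."""
    return list(map(func, iterable))


def filterer(func, iterable):
    """Hypothetical helper: keep only items for which func is truthy, as a list."""
    return list(filter(func, iterable))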
def _download_rib(dir, date):
    url = "http://archive.routeviews.org/route-views.wide/bgpdata/"
    dt_web = date.strftime("%Y") + "." + date.strftime("%m") + "/"
    print("Looking for RIB file...")
    br = RoboBrowser()
    br.open(url)
    link_date = br.get_link(dt_web)
    br.follow_link(link_date)
    link_rib = br.get_link("RIBS/")
    br.follow_link(link_rib)
    elem = "rib." + date.strftime("%Y") + date.strftime("%m") + date.strftime("%d")
    _dt_web = date.strftime("%Y") + "." + date.strftime("%m") + "/"
    links = br.get_links(elem)
    one_link = links[0]
    file = (str(one_link).split('"'))[1]
    url_dw = "http://archive.routeviews.org/route-views.wide/bgpdata/" + _dt_web + "RIBS/" + file
    filename = dir + file
    r = requests.get(url_dw)
    with open(filename, "wb") as code:
        code.write(r.content)
    rib = _decompress_rib(filename)
    return rib
def run(url):
    """ start crawler """
    # first open novel url with normal browser
    browser = RoboBrowser(parser='html.parser', history=True, timeout=30, tries=5)
    browser.open(url)
    # look all page in novels
    while not is_end_page(browser):
        novels = get_all_novel_links(browser)
        # process each novel
        for novel in novels:
            novel_id = get_id(novel)
            title = get_title(novel)
            author = get_author(novel)
            date = get_date(novel)
            novel_type = get_type(novel)
            print('*' * 75)
            print(title)
            content = get_content(novel, author)
            output(title, novel_id=novel_id, author=author,
                   novel_type=novel_type, content=content, date=date)
            time.sleep(random.randint(R_START, R_END))
        next_page_link = next_page(browser)
        if next_page_link is None:
            break
        browser.follow_link(next_page_link)
def pushedbutton(self, b):
    account = self.lineEdit.text()
    pasw = self.lineEdit_3.text()
    # use robobrowser module to manipulate web page
    browser = RoboBrowser(history=True)
    browser.open('http://web1.cmu.edu.tw/stdinfo/login.asp')
    form1 = browser.get_form(id='form1')
    form1['f_id'].value = account
    form1['f_pwd'].value = pasw
    browser.submit_form(form1)
    if browser.state.url == "http://web1.cmu.edu.tw/stdinfo/loginerr.asp":
        self.lineEdit_2.setText('帳號密碼錯了?')  # "wrong account or password?"
    else:
        link_one = browser.get_link(text=re.compile('.意見調查'))
        browser.follow_link(link_one)
        links = []
        for l in browser.get_links(text=re.compile('.填寫.')):
            links.append(l)
        links.pop(0)
        for li in links:
            browser.follow_link(li)
            form2 = browser.get_form(id='thisform')
            form2['Cos_Q1'].value = '1'
            browser.submit_form(form2)
        self.lineEdit_2.setText('Done!')
def main():
    # Browse to Rap Genius; declare a parser to avoid a bs4 warning
    browser = RoboBrowser(history=True, parser="html.parser")
    browser.open('http://rapgenius.com/')

    # Search for Queen
    form = browser.get_form(action='/search')  # <RoboForm q=>
    form['q'].value = 'queen'
    browser.submit_form(form)

    # Look up the first song
    songs = browser.select('.song_name')
    try:
        browser.follow_link(songs[0])
    except IndexError:
        print("Songs Index doesn't exist!")
        return
    lyrics = browser.select('.lyrics')
    try:
        lyrics[0].text  # \n[Intro]\nIs this the real life...
    except IndexError:
        print("Lyrics Index doesn't exist!")

    # Back to results page
    browser.back()

    # Look up my favorite song
    browser.follow_link('death on two legs')

    # Can also search HTML using regex patterns
    lyrics = browser.find(class_=re.compile(r'\blyrics\b'))
    print(lyrics.text)  # \n[Verse 1]\nYou suck my blood like a leech...
def get_cookies(self):
    """ opens a fake browser to get the cookies needed """
    from robobrowser import RoboBrowser
    browser = RoboBrowser(
        user_agent='Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1b3) Gecko/20090305 Firefox/3.1b3 GTB5',
        parser='html.parser')
    browser.open('https://battlemap.deltatgame.com/home#')
    link = browser.find('a')
    browser.follow_link(link)

    # two-step login form: email first, then password
    form = browser.get_form(0)
    with open('battlecreds.json') as credentialfile:
        credentials = json.load(credentialfile)
    form['Email'] = credentials['email']
    browser.submit_form(form)
    form = browser.get_form(0)
    form['Passwd'] = credentials['password']
    browser.submit_form(form)

    browser.open('https://battlemap.deltatgame.com/home')
    self.battlemap_token = browser.session.cookies.get('battlemap_session')
    self.xsrf = browser.session.cookies.get('XSRF-TOKEN')
    self.cookietimeout = time.time() + 60 * 60 * 1.95

    # GET csrf-token META HERE
    self.csrf = ''
    self.brow = browser
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(str(browser.parsed), "html.parser")
    for tag in soup.find_all('meta'):
        if 'name' in tag.attrs and tag.attrs['name'] == 'csrf-token':
            self.csrf = tag.attrs['content']
def test_calc_interface(self):
    operation = "5,+,2"
    expected_result = 7
    # Add some result to DB
    requests.post('/'.join((TEST_URL, 'calc')), data={'operation': '998,-,888'})
    # Init object
    browser = RoboBrowser(history=True, parser='html.parser')
    browser.open(TEST_URL)
    # Fill calc form
    calc_form = browser.get_form(action='/calc')
    calc_form['operation'] = operation
    browser.submit_form(calc_form)
    # Get result
    result_raw = browser.find(id="result").text
    self.assertEqual(int(result_raw), expected_result)
    # Check result link
    browser.follow_link(browser.find(id='result_link'))
    self.assertEqual((operation, expected_result),
                     (browser.find(id="operation").text,
                      int(browser.find(id="result").text)))
def find_download_page(podcast, episode):
    download_base = 'https://www.trancepodcasts.com/download/'
    browser = RoboBrowser(history=True)
    browser.open('https://www.trancepodcasts.com/download/{:s}-{:d}/'.format(
        podcast, episode))
    link = browser.find('a', attrs={'rel': 'nofollow', 'class': 'btn'})
    browser.follow_link(link)
    return browser.response
def dirty_get_mp3_url(yt_url):
    browser = RoboBrowser(history=True)
    browser.open("http://www.youtubeinmp3.com/")
    form = browser.get_form(id="form")
    form["video"].value = yt_url
    browser.submit_form(form)
    a = browser.get_link(id="download")
    browser.follow_link(a)
    return (browser.url, unquote(browser.url.split("t=")[-1]) + ".mp3")
def gettab(keyword):
    browser = RoboBrowser(history=True, parser='html5lib')
    browser.open('https://www.tabs4acoustic.com/')
    form = browser.get_form(action=re.compile('recherche'))
    form['FindMe'].value = keyword
    browser.submit_form(form)
    div_resultat = browser.find('div', id='page_content')
    browser.follow_link(div_resultat.find('a'))
    tab = browser.find('div', id='tab_zone')
    return tab.find('pre').text
class Downloader():
    def __init__(self, proxy=None, worker_num=0):
        self.worker_num = worker_num
        session = Session()
        if proxy is not None:
            session.proxies = {'http': proxy, 'https': proxy}
        self.browser = RoboBrowser(history=True, parser='html.parser', session=session)

    def get_download_link(self, book_url):
        self.browser.open(book_url)
        for link in self.browser.find_all("a"):
            if "download.php?t=1" in str(link):
                return f"https://www.lectulandia.cc{link['href']}"

    def download_book(self, download_url):
        self.browser.open(download_url)
        pattern = re.compile("var linkCode = \"(.*?)\";")
        section = pattern.findall(str(self.browser.parsed))
        bee_url = f'https://www.beeupload.net/file/{section[0]}'
        self.browser.open(bee_url)
        try:
            filename = self.browser.find(
                "div", id="fileDescription").find_all("p")[1].text.replace("Name: ", "")
            size = self.browser.find(
                "div", id="fileDescription").find_all("p")[2].text
            file_url = self.browser.find("a", id="downloadB")
            time.sleep(2)
            self.browser.follow_link(file_url)
            with open(f"books/{filename}", "wb") as epub_file:
                epub_file.write(self.browser.response.content)
            return filename, size
        except:
            print(self.browser.parsed)

    def get_book_page_list(self, page):
        self.browser.open(f'https://www.lectulandia.cc/book/page/{page}/')
        return [
            f"https://www.lectulandia.cc{book['href']}"
            for book in self.browser.find_all("a", class_="card-click-target")
        ]

    def download_full_page(self, page):
        print(f"Downloading page: {page} ")
        books = self.get_book_page_list(page)
        for book in books:
            time.sleep(2)
            download_url = self.get_download_link(book)
            print(f"Worker: {self.worker_num} on page: {page}",
                  self.download_book(download_url))
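The class above exposes a per-worker `worker_num` and a `download_full_page(page)` entry point but no driver. A minimal sketch of how it could be driven with a few threads; the page range and worker count here are illustrative, not from the source:

from concurrent.futures import ThreadPoolExecutor


def run_downloads(first_page=1, last_page=4, workers=2):
    # One Downloader per worker so each thread keeps its own RoboBrowser session.
    downloaders = [Downloader(worker_num=i) for i in range(workers)]
    with ThreadPoolExecutor(max_workers=workers) as pool:
        for i, page in enumerate(range(first_page, last_page + 1)):
            pool.submit(downloaders[i % workers].download_full_page, page)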
def scrape_cs2040s(url, email, password):
    browser = RoboBrowser(parser='html.parser')
    browser.open(url)
    search = browser.get_form()
    search['user[email]'] = str(email)
    search['user[password]'] = str(password)
    browser.submit_form(search, submit=search.submit_fields['commit'])

    # main page
    browser.follow_link(browser.find_all('a')[2])
    # missions
    browser.follow_link(browser.find_all('a')[11])

    # find names
    reduced = filterer(lambda x: len(list(x.children)) >= 1, browser.find_all('th'))
    reduced = filterer(lambda x: 'colspan' in x.attrs, reduced)
    # unsure of object structure so convert to list type and assess last element
    names = mapper(lambda x: list(list(x.children)[-1])[-1], reduced)

    # find deadlines
    deadlines_tags = list(
        filter(lambda x: x['class'] == ['table-end-at'], browser.find_all('td')))
    deadlines = list(
        map(lambda x: (list(x))[0] if list(x) else 'not yet', deadlines_tags))
    curr_yr = datetime.now().year
    # returns a list of datetime strings
    dates = mapper(
        lambda x: str(datetime.strptime(f"{curr_yr} {x}", '%Y %d %b %H:%M'))
        if x != 'not yet' else 'Not yet', deadlines)

    array = []
    for n, d in zip(names, dates):
        dic1 = {}
        dic1['title'] = n
        dic1['datetime'] = d
        array.append(dic1)
    dic = {}
    dic['data'] = array

    # scrape exam details
    with open('/Users/sherrywu1999/Desktop/untitled/callie/python/deadlines/data.json',
              'w') as json_file:
        json.dump(dic, json_file)
def pdbfixretrieve(joblink):
    browser = RoboBrowser(history=True)
    browser.open(joblink)
    stdout = browser.get_links('stdout')
    # poll until the job output links appear
    while not stdout:
        time.sleep(5)
        browser.open(joblink)
        stdout = browser.get_links('stdout')
    pdbout = browser.get_links('outpdb')
    browser.follow_link(stdout[0])
    stdcontent = browser.response.content
    browser.follow_link(pdbout[0])
    pdbcontent = browser.response.content
    pdb.set_trace()  # debugging breakpoint left in by the author
    return stdcontent, pdbcontent
class infs_brsr:
    """This browser will have functions useful to someone browsing the
    Infusionsoft front end programatically.
    """
    def __init__(self, appname, username, password, *args, **kwargs):
        self.loggedin = False
        self.browser = RoboBrowser(history=True)
        self.appname = appname
        self.username = username
        self.password = password
        self.baseurl = 'https://' + self.appname + '.infusionsoft.com'

    def openbase(self):
        self.browser.open(self.baseurl)

    def login(self):
        self.openbase()
        loginform = self.browser.get_form()
        loginform.fields['username'].value = self.username
        loginform.fields['password'].value = self.password
        self.browser.submit_form(loginform)
        # This next step is probably a bad idea. It needs
        # some form of control
        self.browser.follow_link(self.browser.get_links()[1])
        self.loggedin = True

    def getapikey(self):
        if not self.loggedin:
            self.login()
        # baseurl has no trailing slash, so the path needs a leading one
        self.browser.open(self.baseurl + '/app/miscSetting/itemWrapper?systemId=nav.admin&settingModuleName=Application&settingTabName=Application')
        pageSoup = BeautifulSoup(self.browser.response.content, 'html.parser')
        self.apikey = pageSoup.findAll(id='Application_Encrypted_Key:_data')[0].text
        return self.apikey

    def importContactCSV(self, pathToCSV='/home/jlmarks/importme.csv'):
        if not self.loggedin:
            self.login()
        importURL = ("https://" + self.appname + ".infusionsoft.com/Import/jumpToWizard.jsp"
                     "?update=false&profileClass=com.infusion.crm.db.importer.profiles.ContactProfile")
        self.browser.open(importURL)
        frms = self.browser.get_forms()
        for eachform in frms:
            if 'id' in eachform.fields.keys():
                self.thisimportid = eachform['id'].value
                correctform = eachform
                correctform.fields.pop('Back')
                correctform.fields['importFile'].value = open(pathToCSV, 'rb')
                self.browser.submit_form(correctform)
def fetch():
    USERNAME = '******'
    PASSWORD = '******'
    result_no = 0
    br = RoboBrowser()
    br.open(LOGIN_URL)
    print(br)

    # Log in
    form = br.get_form(id="fm1")
    form['username'].value = USERNAME
    form['password'].value = PASSWORD
    br.submit_form(form)

    # Automatic redirect sometimes fails, follow manually when needed
    if 'Redirecting' in br.parsed.title.text:
        br.follow_link(br.get_link(text=re.compile('click here')))

    # Loop through the searches, keeping fixed query parameters
    for actor in VARIABLE_QUERY:
        # I like to watch what's happening in the console
        print('***', actor, file=sys.stderr)
        # Lets do the actual query now
        br.open(SEARCH_URL + FIXED_QUERY + actor)
        # The query actually gives us links to the content pages we like,
        # but there are some other links on the page that we ignore
        nice_links = [
            l for l in br.get_links()
            if 'good_path' in l.get('href', '') and 'credential' in l.get('href', '')
        ]
        if not nice_links:
            # Maybe the relevant results are empty
            break
        for link in nice_links:
            br.follow_link(link)
            # More console reporting on title of followed link page
            print(br.parsed.title.text, file=sys.stderr)
            # Increment output filenames, open and write the file
            result_no += 1
            with open('result%d' % result_no, 'w') as out:
                out.write(br.response.text)
        # Nothing ever goes perfectly, ignore if we do not get page
        # except RoboBrowser:
        #     print(sys.stderr, "Response error (probably 404)")
        # Let's not hammer the site too much between fetches
        time.sleep(1)
def gather(self):
    browser = RoboBrowser()
    page = 0
    browser.open(self.url)
    while page < self.max_pages:
        links = browser.get_links()
        if page == 0:
            for link in links:
                self.list.append(link)
        else:
            for link in links:
                if self.isInTheList(link):
                    self.list.append(link)
        browser.follow_link(self.list[page])
        page += 1
def _login():
    username = input('User ID: ')
    password = getpass('Password: ')
    browser = RoboBrowser(parser='html.parser')
    browser.open('http://online.lloydsbank.co.uk/personal/logon.login.jsp')
    form = browser.get_form('frmLogin')
    form['frmLogin:strCustomerLogin_userID'] = username
    form['frmLogin:strCustomerLogin_pwd'] = password
    browser.submit_form(form)

    mem_info = getpass('Memorable information: ').lower()
    form_name = 'frmentermemorableinformation1'
    option_name = ':strEnterMemorableInformation_memInfo{}'
    form = browser.get_form(form_name)
    indices = re.findall(r'Character (\d+) :', form.parsed.text)
    indices = [int(x) for x in indices]
    for i, idx in enumerate(indices):
        form[form_name + option_name.format(i + 1)] = ' ' + mem_info[idx - 1]
    browser.submit_form(form)
    assert 'Lloyds Bank - Personal Account Overview' in browser.parsed.title

    accounts = {}
    for link in browser.get_links():
        if 'lnkAccName' in link.attrs.get('id', ''):
            accounts[link.text] = link
    print('Accounts:', list(accounts))
    account = input('Account: ')
    browser.follow_link(accounts[account])
    export_link = browser.get_link(title='Export')
    browser.follow_link(export_link)
    return browser
def test_add_valid_link__when_the_link_is_invalid__then_the_link_is_not_in_list(self):
    links_finder = LinksFinder(ANY_NOT_SECURED_URL)
    link = MagicMock()
    browser = RoboBrowser(parser=PARSER, history=True)
    browser.follow_link = MagicMock(side_effect=RoboError)

    links_finder.add_valid_link(browser, link)

    actual_url_list = links_finder.url_list
    self.assertTrue(link not in actual_url_list)
def download_internal(user_id, from_date, to_date):
    """Download the csv files for the transaction between the given dates"""
    # Create the browser and open the lloyds login page
    browser = RoboBrowser(parser='html5lib')
    browser.open('https://online.lloydsbank.co.uk/personal/logon/login.jsp?WT.ac=hpIBlogon')

    while 'Enter Memorable Information' not in browser.parsed.title.text:
        print(browser.parsed.title.text)
        form = browser.get_form(id='frmLogin')
        form['frmLogin:strCustomerLogin_userID'] = str(user_id)
        form['frmLogin:strCustomerLogin_pwd'] = prompt('Enter password: ')
        browser.submit_form(form)

    # we're logged in, now enter memorable information
    print(browser.parsed.title.text)
    form = browser.get_form(id='frmentermemorableinformation1')
    field = 'frmentermemorableinformation1:strEnterMemorableInformation_memInfo{0}'
    for i in range(1, 4):
        label = browser.find("label", {"for": field.format(i)})
        form[field.format(i)] = ' ' + prompt(label.text.strip())
    browser.submit_form(form)

    # hopefully now we're logged in...
    print(browser.parsed.title.text)
    links = []
    for link in browser.get_links("View statement"):
        if link.text == "View statement":
            links.append(link)

    # loop through all accounts
    for index, link in enumerate(links):
        acc_name = link['data-wt-ac'].split(" resource")[0]
        print(acc_name)
        print(browser.parsed.title)
        browser.follow_link(link)
        yield acc_name, download_account_internal(browser, from_date, to_date)
        browser.back()
def gatherData(user, password):
    baseURL = 'https://sigarra.up.pt/feup/pt/'
    browser = RoboBrowser(history=True, parser='html.parser')
    browser.open(baseURL + 'web_page.Inicial')

    # Gets the login form
    form = browser.get_form(action=re.compile(r'validacao'))
    # Updates the login form with the user credentials
    form['p_user'].value = 'up' + user
    form['p_pass'].value = password
    browser.submit_form(form)

    # Goes to the user profile
    browser.open(baseURL + 'fest_geral.cursos_list?pv_num_unico=' + user)
    # Opens the extended view
    extended = browser.find(title='Visualizar informações no contexto do curso')
    browser.follow_link(extended)

    credits = []
    grades = []
    # For each html class containing grades ("i", "p" and "o"), gather data
    for row_class in ('i', 'p', 'o'):
        for row in browser.find_all(class_=row_class):
            if row.find(class_='n aprovado'):
                credits.append(row.find(class_='k n').text)
                grades.append(row.find(class_='n aprovado').text)
    return credits, grades
def Revigo_ana(Go_string):
    from robobrowser import RoboBrowser
    import re
    import lxml

    br = RoboBrowser(parser="lxml")
    br.open("http://revigo.irb.hr/")
    form = br.get_form()
    form["goList"].value = Go_string
    br.submit_form(form)

    # follow the CSV export link and save the result
    download_csv_link = br.find("a", href=re.compile("export.jsp"))
    br.follow_link(download_csv_link)
    csv_content = br.response.content.decode("utf-8")

    # write results to file
    f = open('Revigo_Analysis_results.csv', 'w')
    f.write(csv_content)
    f.close()
    print("Revigo results written to file: Revigo_Analysis_results")
def scrape_revigo_csv(input_GOstats_tsv, out_file, pvalue_cutoff=0.05, fdr_cutoff=1.0):
    """ """
    oh = open(out_file, "w")
    # get input goterms from GOstats result
    goterms = GOstats2Revigo(input_GOstats_tsv,
                             pvalue_cutoff=pvalue_cutoff,
                             fdr_cutoff=fdr_cutoff,
                             output_column=3)
    if goterms:
        br = RoboBrowser(parser="lxml")
        br.open("http://revigo.irb.hr/")
        form = br.get_form()
        #print(form)
        form["goList"].value = goterms
        br.submit_form(form)
        download_rsc_link = br.find("a", href=re.compile("toR.jsp"))
        br.follow_link(download_rsc_link)
        #r_code = br.response.content.decode("utf-8")
        #print(r_code)
        br.back()
        download_csv_link = br.find("a", href=re.compile("export.jsp"))
        br.follow_link(download_csv_link)
        csv_content = br.response.content.decode("utf-8")
        oh.write(csv_content)
    else:
        oh.write("term_ID,description,frequency,plot_X,plot_Y,plot_size,log10 p-value,"
                 "userVal_2,uniqueness,dispensability,representative,eliminated")
    oh.close()
def scrape_cosmo(url, email, password):
    browser = RoboBrowser()
    browser.open(url)
    search = browser.get_form()
    search['user[email]'] = str(email)
    search['user[password]'] = str(password)
    browser.submit_form(search, submit=search.submit_fields['commit'])

    # main page
    browser.follow_link(browser.find_all('a')[2])
    # missions
    browser.follow_link(browser.find_all('a')[17])

    # find deadlines
    deadlines_tags = list(
        filter(lambda x: x['class'] == ['table-end-at'], browser.find_all('td')))
    deadlines = list(
        map(lambda x: (list(x))[0] if list(x) else 'not yet', deadlines_tags))
    curr_yr = datetime.now().year
    # returns a list of datetime objects
    return mapper(
        lambda x: datetime.strptime(f"{curr_yr} {x}", '%Y %d %b %H:%M')
        if x != 'not yet' else 'Not yet', deadlines)
def get_content(novel, author):
    """ get novel all content return content string """
    browser = RoboBrowser(parser="html.parser", history=True, timeout=30, tries=5)
    novel_link = novel.find('td', class_='tal').a
    link = host + novel_link['href']
    time.sleep(random.randint(R_START, R_END))
    # browser.follow_link(novel_link)
    try:
        browser.open(link)
    except:
        print('link failed', link)
        return ''
    else:
        print('novel link', browser.url)

    contents = list()
    # look all page in a novel
    while True:
        content = get_cell_content(browser, author)
        contents.append(content)
        if is_end_page(browser):
            break
        time.sleep(random.randint(R_START, R_END))
        next_page_link = next_page(browser)
        if next_page_link is None:
            break
        try:
            browser.follow_link(next_page_link)
        except:
            print('link failed', browser.url)
            continue
        else:
            print('page link', browser.url)
    return "\n".join(contents)
def GeneOntology(name):
    print("Starting GeneOntology for " + name)
    br = RoboBrowser(parser="html.parser")
    br.open("http://geneontology.org/")
    form = br.get_forms()[1]
    geneinput = form["input"]
    species = form["species"]
    form["species"].value = "IXOSC"

    os.chdir("/home/david/Documents/blast/Blastfiles/outputfiles/Genelists")
    os.listdir(".")
    file = open(name, "r")
    string = ""
    for line in file.readlines():
        #print(line)
        string = string + line
    form["input"] = string
    #print(form)
    br.submit_form(form)
    #print(br.find_all())
    #DebugHtml(str(br.parsed))

    table_link = br.find("a", href=re.compile("/tools/compareToRefListTxt.jsp"))
    br.follow_link(table_link)
    csv_content = br.response.content.decode("utf-8")
    savefile = open("GOoutput/" + name, "w")
    savefile.write(csv_content)
    savefile.close()
    print("finished")
def get_webdav_urls(username, password):
    # log in
    browser = RoboBrowser(history=True)
    browser.open('http://ctools.umich.edu')
    browser.follow_link(browser.find(id='ctoolsLogin'))
    login_form = browser.get_form()
    login_form['login'].value = username
    login_form['password'].value = password
    browser.submit_form(login_form)

    # get the results
    browser.follow_link(browser.find(
        class_='toolMenuLink ',
        title='For creating, revising, and deleting course and project sites'))
    browser.open(browser.find(class_='portletMainIframe').attrs['src'])

    results = []
    course_links = browser.select('#sitesForm td h4 a[target="_top"]')
    for course_link in course_links:
        if not course_link.attrs:
            continue
        href = course_link.attrs['href']
        if '~' in href:
            continue
        results.append(
            'https://ctools.umich.edu/dav' + findall(r'\/[^\/]+$', href)[0]
        )
    return results
def autoRevigo(name):
    os.chdir("/home/david/Documents/BenoitLab/RNA-seq/Gprofiler/")
    os.listdir(".")
    file = open(name, "r")
    string = ""
    for line in file.readlines():
        # print(line)
        string = string + line + "\n"
    goterms = string

    br = RoboBrowser(parser="html")
    br.open("http://revigo.irb.hr/")
    form = br.get_form()
    form["goList"].value = goterms
    br.submit_form(form)

    # grab the R script export
    download_rsc_link = br.find("a", href=re.compile("toR.jsp"))
    br.follow_link(download_rsc_link)
    r_code = br.response.content.decode("utf-8")
    print(r_code)
    br.back()

    # grab the CSV export
    download_csv_link = br.find("a", href=re.compile("export.jsp"))
    br.follow_link(download_csv_link)
    csv_content = br.response.content.decode("utf-8")
    writefile = open("/home/david/Documents/BenoitLab/RNA-seq/Revigo/" + name, "w")
    writefile.write(csv_content)
    writefile.close()
def pushedbutton(self, b):
    account = self.lineEdit.text()
    pasw = self.lineEdit_3.text()
    # use robobrowser module to manipulate web page
    browser = RoboBrowser(history=True)
    browser.open('http://web1.cmu.edu.tw/stdinfo/login.asp')
    form1 = browser.get_form(id='form1')
    form1['f_id'].value = account
    form1['f_pwd'].value = pasw
    browser.submit_form(form1)
    if browser.state.url == "http://web1.cmu.edu.tw/stdinfo/loginerr.asp":
        self.lineEdit_2.setText('帳號密碼錯了?')  # "wrong account or password?"
    else:
        self.lineEdit_2.setText('成功登入,填寫中....')  # "logged in, filling in surveys..."
        link_one = browser.get_link(text='教師教學意見調查')
        browser.follow_link(link_one)
        links = []
        for l in browser.get_links(text='填寫'):
            links.append(l)
        links.pop(0)
        for li in links:
            browser.follow_link(li)
            form2 = browser.get_form(id='thisform')
            # answer '3' to all ten questions
            for i in range(1, 11):
                form2['CH_{}'.format(i)].value = '3'
            browser.submit_form(form2)
        self.lineEdit_2.setText('Done!')
def getExcelFromWoS(self, url, mark, totalMarked, outputLocationPath):
    mark_from = str(mark)
    sres = self.sres
    if mark + 499 > int(totalMarked):
        mark_to = totalMarked
    else:
        mark_to = str(mark + 499)
    sres.print(command='log', msg='[%s-%s레코드] 엑셀을 받을 준비를 합니다.' % (mark_from, mark_to))  # "[records %s-%s] preparing the Excel download"

    excelBrowser = RoboBrowser(history=True, parser='lxml')
    excelBrowser.open(url)
    reportLink = excelBrowser.select('a.citation-report-summary-link')
    if len(reportLink) == 0:
        sres.print(command='err', msg='[%s-%s레코드] 요약 보고서가 없습니다.' % (mark_from, mark_to))  # "no summary report"
        return None
    sres.print(command='log', msg='[%s-%s레코드] 요약 보고서를 엽니다.' % (mark_from, mark_to))  # "opening the summary report"
    excelBrowser.follow_link(reportLink[0])

    summary_records_form = excelBrowser.get_form(id='summary_records_form')
    qid = summary_records_form['qid'].value
    filters = summary_records_form['filters'].value
    sortBy = summary_records_form['sortBy'].value
    timeSpan = summary_records_form['timeSpan'].value
    endYear = summary_records_form['endYear'].value
    startYear = summary_records_form['startYear'].value
    rurl = summary_records_form['rurl'].value
    piChart = summary_records_form['piChart'].value
    toChart = summary_records_form['piChart'].value

    makeExcelURL = "http://apps.webofknowledge.com/OutboundService.do?"
    makeExcelParam = ""
    makeExcelParam += "action=go"
    makeExcelParam += "&save_options=xls"
    makeExcelURL += makeExcelParam
    sres.print(command='log', msg='[%s-%s레코드] 엑셀 데이터 제작을 요청합니다.' % (mark_from, mark_to))  # "requesting Excel data generation"
    excelBrowser.session.post(makeExcelURL, data={
        "selectedIds": "", "displayCitedRefs": "", "displayTimesCited": "",
        "displayUsageInfo": "true", "viewType": "summary", "product": "WOS",
        "rurl": rurl, "mark_id": "WOS", "colName": "WOS",
        "search_mode": "CitationReport", "view_name": "WOS-CitationReport-summary",
        "sortBy": sortBy, "mode": "OpenOutputService", "qid": qid, "SID": self.SID,
        "format": "crsaveToFile", "mark_to": mark_to, "mark_from": mark_from,
        "queryNatural": "", "count_new_items_marked": "0", "use_two_ets": "false",
        "IncitesEntitled": "no", "value(record_select_type)": "range",
        "markFrom": mark_from, "markTo": mark_to, "action": "recalulate",
        "start_year_val": "1900", "end_year_val": "2019", "viewAbstractUrl": "",
        "LinksAreAllowedRightClick": "full_record.do", "filters": filters,
        "timeSpan": timeSpan, "db_editions": "",
        "additional_qoutput_params": "cr_qid=" + qid, "print_opt": "Html",
        "include_mark_from_in_url": "true", "endYear": endYear,
        "startYear": startYear, "piChart": piChart, "toChart": toChart,
        "fields": "DUMMY_VALUE"
    })

    ExcelActionURL = "https://ets.webofknowledge.com"
    ExcelAction = "/ETS/ets.do?"
    ExcelParam = "mark_from=1"
    ExcelParam += "&product=UA"
    ExcelParam += "&colName=WOS"
    ExcelParam += "&displayUsageInfo=true"
    ExcelParam += "&parentQid=" + qid
    ExcelParam += "&rurl=" + requests.utils.quote(rurl)
    ExcelParam += "&startYear=" + startYear
    ExcelParam += "&mark_to=" + mark_to
    ExcelParam += "&filters=" + requests.utils.quote(filters)
    ExcelParam += "&qid=" + str(int(qid) + 1)
    ExcelParam += "&endYear=" + endYear
    ExcelParam += "&SID=" + self.SID
    ExcelParam += "&totalMarked=" + totalMarked
    ExcelParam += "&action=crsaveToFile"
    ExcelParam += "&timeSpan=" + requests.utils.quote(timeSpan)
    ExcelParam += "&sortBy=" + sortBy
    ExcelParam += "&displayTimesCited=false"
    ExcelParam += "&displayCitedRefs=true"
    ExcelParam += "&fileOpt=xls"
    ExcelParam += "&UserIDForSaveToRID=null"
    ExcelActionURL += ExcelAction
    ExcelActionURL += ExcelParam

    sres.print(command='log', msg='[%s-%s레코드] 엑셀 데이터를 다운로드 받습니다.' % (mark_from, mark_to))  # "downloading the Excel data"
    res = requests.get(ExcelActionURL)
    if res.text.find("<html>") > 0 or res.text.find("Error report</title>") > 0:
        sres.print(command='err', msg='%s-%s 레코드, 서버가 에러를 반환' % (mark_from, mark_to))  # "server returned an error"

    ofileName = "%X" % random.getrandbits(128)
    with open(ofileName, 'wb') as rsFile:
        rsFile.write(res.content)
    resPD = pd.read_excel(ofileName, header=26)
    os.remove(ofileName)
    return resPD
class MarketPoster:
    def __init__(self):
        self.login_url = ('http://forums.zybez.net/index.php?'
                          'app=curseauth&module=global&section=login')
        self.browser = RoboBrowser(history=False, parser='html.parser')
        self.logged_in = False

    def login(self, login_name, login_password):
        self.browser.open(self.login_url)
        sign_in_form = self.browser.get_form(class_='authentication-box')
        sign_in_form['ips_username'].value = login_name
        sign_in_form['ips_password'].value = login_password
        self.browser.submit_form(sign_in_form)
        correct_url = 'http://forums.zybez.net/index.php'
        if self.browser.url == correct_url:
            self.logged_in = True
            return True
        else:
            return False

    def deleteItemPosts(self, post):
        item_url = self.getItemURL(post.item_name)
        self.browser.open(item_url)
        items_to_delete = self.browser.get_links(href=re.compile("do=trade-delete"))
        for i in items_to_delete:
            self.browser.follow_link(i)
            self.browser.open(item_url)

    def deleteAllPosts(self):
        self.browser.open('http://forums.zybez.net/runescape-2007-prices')
        delete_button = self.browser.get_link('Remove all active offers')
        if delete_button is not None:
            self.browser.follow_link(delete_button)

    def getItemURL(self, item_name):
        item_name = '+'.join(item_name.split())
        item_data_url = ('http://forums.zybez.net'
                         '/runescape-2007-prices/api/item/' + item_name)
        item_data_dict = self.browser.session.get(item_data_url).json()
        item_id = item_data_dict['id']
        item_url = ('http://forums.zybez.net/runescape-2007-prices/'
                    + str(item_id) + '-' + item_name)
        return item_url

    def postItem(self, post):
        price = post.price
        quantity = post.quantity
        note = post.note
        offer_type = post.offer_type
        contact_method = post.contact_method
        self.deleteItemPosts(post)
        post_item_form = self.browser.find(
            action='http://forums.zybez.net/index.php?app=priceguide&module='
                   'public&section=action&do=trade-add')
        post_item_form = self.browser.get_form(post_item_form)
        # Fill out and submit form
        post_item_form['type'].value = str(int(offer_type))
        post_item_form['qty'].value = quantity
        post_item_form['price'].value = price
        post_item_form['notes'].value = note
        post_item_form['contact'].value = str(int(contact_method))
        self.browser.submit_form(post_item_form)
def attack(self):
    user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0'
    accept = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    accept_language = 'en-US,en;q=0.5'
    s = requests.Session()
    s.headers['User-Agent'] = user_agent
    s.headers['Accept'] = accept
    s.headers['Accept-Language'] = accept_language
    robo = RoboBrowser(session=s, history=True, parser='html.parser')
    robo.open(self.url)

    # locate the login form by the configured attribute
    if self.atr_form == "id":
        form = robo.get_form(id=self.atr_value)
    elif self.atr_form == "class":
        form = robo.get_form(class_=self.atr_value)
    elif self.atr_form == "name":
        form = robo.get_form(name=self.atr_value)
    elif self.atr_form == "action":
        form = robo.get_form(action=self.atr_value)
    else:
        self.log.append("Tidak Menemukan Form Login")  # login form not found
        return None

    # submit a known-bad login so we know what a failed attempt looks like
    form[self.name_input[0]].value = "xxxxx"
    form[self.name_input[1]].value = "xxxxx"
    robo.submit_form(form)
    urlFailed = str(robo.url)

    # run the brute-force attack
    for username in self.user_list:
        for password in self.pass_list:
            robo.open(self.url)
            form[self.name_input[0]].value = username
            form[self.name_input[1]].value = password
            robo.submit_form(form)
            url = str(robo.url)
            if url != urlFailed:
                self.log_csv.append(["Brute Force", self.url])
                self.log.append("login is success or you has been locked out of attempts")
                self.log.append("Url after login : " + url)
                self.log.append("Username : " + username)
                self.log.append("Password : " + password)
                self.log_csv.append(["Sensitive Data Exposed", self.url])
                self.log.append("url contain sensitive data maybe have vulnerability")
                try:
                    # look for a PHPSESSID session cookie
                    sess = robo.session.cookies['PHPSESSID']
                    self.log_csv.append(["Session ID Exposed", self.url])
                    self.log.append("found PHPSESSID maybe have vulnerability fixation attack")
                    self.log.append("PHPSESSID : " + sess)
                    # try to log out and navigate back
                    urlLog = robo.url
                    linkLogout = robo.get_link(text="logout")
                    if linkLogout is not None:
                        robo.follow_link(linkLogout)
                        robo.back(n=1)
                        if robo.url == urlLog:
                            self.log.append("session not destroyed maybe have vulnerabilty")
                except:
                    pass
                return
            time.sleep(5)
    self.log.append("Brute Force failed - Login not successfull")
    return
class tat:
    global pw

    def __init__(self, appname=None):
        self.startingpath = os.path.abspath(os.curdir)
        if not appname:
            self.appname = self.getappname()
        else:
            self.appname = appname
        self.apppath = os.path.join(self.startingpath, self.appname)
        if not os.path.exists(self.apppath):
            os.mkdir(self.apppath)
        os.chdir(self.apppath)
        self.mapping = {}
        self.mapping["Contact"] = -1
        self.mapping["Affiliate"] = -3
        self.mapping["ContactAction"] = -5
        self.mapping["Company"] = -6
        self.mapping["OrderItem"] = -9
        self.menu()

    def menu(self, context="initial"):
        if context == "initial":
            self.baseurl = "https://" + self.appname + ".infusionsoft.com/"
            self.apikey = self.getapikey()
            self.svr = ISServer.ISServer(self.appname, self.apikey)
        if not os.path.exists(self.apppath):
            os.mkdir(self.apppath)
        os.chdir(self.apppath)
        if not os.path.exists("files"):
            os.mkdir("files")
        os.chdir("files")
        self.usermenu = {}
        self.usermenu["downloadAPITables"] = "apit"
        self.usermenu["play"] = "play"
        self.usermenu["reports"] = "rpts"
        # for eachitem in self.usermenu.keys():
        #     print eachitem + ":\t" + self.usermenu[eachitem]
        # thisChoice = raw_input('please make a choice: ').strip(' \n\t')
        thisChoice = "play"
        if thisChoice == "apit":
            self.handleAPItables()
        elif thisChoice == "play":
            self.play()
        elif thisChoice == "rpts":
            self.downloadAllReports()
        else:
            self.inchandlefiles()

    def handlefiles(self):
        os.chdir(self.startingpath)
        if not os.path.exists("files"):
            os.mkdir("files")
        os.chdir("files")
        allfiles = self.svr.getAllRecords("FileBox")
        for eachfile in allfiles:
            downloadurl = self.baseurl + "Download?Id=" + str(eachfile["Id"])
            self.browser.open(downloadurl)
            fileoutpath = os.path.join(self.startingpath, "files",
                                       eachfile["ContactId"], eachfile["FileName"])
            if not os.path.exists(os.path.dirname(fileoutpath)):
                os.makedirs(fileoutpath)
            fout = open(fileoutpath, "wb")
            fout.write(self.browser.response.content)
            fout.close()

    def inchandleAPItables(self):
        apidata = {}
        self.customfields = self.svr.getAllRecords("DataFormField")
        for eachtable in ISServer.tables.keys():
            if eachtable not in [
                    "LeadSourceExpense", "DataFormTab", "GroupAssign", "AffResource",
                    "InvoiceItem", "UserGroup", "CProgram", "ActionSequence", "Template",
                    "LeadSource", "Status", "Campaignee", "DataFormField", "OrderItem",
                    "DataFormGroup", "ProductOptValue", "ContactGroup", "Company",
                    "TicketStage", "ProductCategoryAssign", "ContactGroupAssign"]:
                print "starting " + eachtable
                if eachtable not in self.mapping.keys():
                    self.mapping[eachtable] = 99
                fields = ISServer.tables[eachtable] + [
                    "_" + fld["Name"] for fld in self.customfields
                    if fld["FormId"] == self.mapping[eachtable]]
                self.svr.incrementlyGetRecords(eachtable, interestingData=fields)
                print "done writing " + eachtable
            else:
                print "already completed " + eachtable
        self.apidata = apidata

    def inchandleAPItable(self, tablename):
        self.customfields = self.svr.getAllRecords("DataFormField")
        if tablename not in self.mapping.keys():
            self.mapping[tablename] = 99
        fields = ISServer.tables[tablename] + [
            "_" + fld["Name"] for fld in self.customfields
            if fld["FormId"] == self.mapping[tablename]]
        self.svr.incrementlyGetRecords(tablename, interestingData=fields)
        print "done writing " + tablename

    def inchandlefiles(self):
        os.chdir(self.startingpath)
        self.svr.incgetfiles(self.browser)

    def downloadContact0files(self, numberofmostrecentfilestodownload):
        thesefiles = self.svr.getAllRecords("FileBox", searchCriteria={"ContactId": 0})
        for eachfile in thesefiles[-int(numberofmostrecentfilestodownload):]:
            print "doing " + str(eachfile)
            self.svr.getfile(self.browser, eachfile)

    def play(self):
        print "she's all yours captain!"

    def downloadAReport(self, reportname):
        self.browser.open(self.baseurl + "Reports/exportResults.jsp?reportClass=" + reportname)
        reportForm = [eachform for eachform in self.browser.get_forms()
                      if eachform.action == "qbExport.jsp"]
        if len(reportForm) > 0:
            self.browser.submit_form(reportForm[0],
                                     submit=reportForm[0].submit_fields["process"])
            with open(reportname + ".csv", "wb") as outfile:
                outfile.write(self.browser.response.content)
        else:
            print "no " + reportname

    def downloadAllReports(self):
        for reportname in [
                "AffiliateActivitySummary", "AffiliateLedger", "AffiliateRedirectActivity",
                "AffiliateReferral", "AffPayout", "AllOrders", "AllSales", "AllSalesItemized",
                "ARAgingReport", "CampaigneeBasic", "CampaigneeByDay", "CampaignProductConversion",
                "ClickThroughPercentage", "ClickThroughPercentageByEmail", "ContactDistributed",
                "CProgramRevenueSummary", "CreditCard", "CreditsIssued", "CustomerLifetimeValue",
                "DailyPayments", "DailyReceivables", "DailySalesTotals", "DashboardCampaign",
                "DashboardEmail", "DashboardLeads", "DashboardOrders", "DashboardUsers",
                "DigitalProductKey", "EmailBatchSearch", "EmailBroadcastConversionReport",
                "EmailConversion", "EmailSentSearch", "FailedCharge", "FaxBatchSearch",
                "FollowUpSequenceConversionReport", "FunnelFlowRecipient",
                "FunnelFlowRecipientWaiting", "FunnelGoalAchieved", "FunnelQueuedFlowItem",
                "FunnelUniqueContacts", "GroupAdds", "HabeasDetail", "InvoiceNetIncome",
                "LeadSourceConversion", "LeadSourceIncome", "LeadSourceROI",
                "LeadSourceROIByCategory", "MonthlyPayments", "MonthlyReceivables",
                "MonthlySalesTotals", "MonthlySalesTotalsByProduct", "OptOutSearch",
                "PaymentsReport", "PieceResponse", "ProductNetIncome", "Receivables",
                "RevenueForecastReport", "TaskSearch", "VoiceBatchSearch", "VoiceOptOutSearch",
                "WebformActivitySummary", "WebFormTracking"]:
            self.downloadAReport(reportname)

    def getFilePath(self):
        return tkFileDialog.askopenfilename()

    def getFolderPath(self):
        return tkFileDialog.askdirectory()

    def getappname(self):
        return raw_input("Please enter appname:").strip("\n \t")

    def getapikey(self):
        global pw
        username = pw["username"]
        password = pw["password"]
        # Basically:
        # Add username and password to your global variables.
        self.browser = RoboBrowser(history=True)
        self.browser.open(self.baseurl)
        logform = self.browser.get_form()
        logform.fields["username"].value = username
        logform.fields["password"].value = password
        self.browser.submit_form(logform)
        self.browser.follow_link(self.browser.get_links()[1])
        self.browser.open(
            self.baseurl
            + "app/miscSetting/itemWrapper?systemId=nav.admin&settingModuleName=Application&settingTabName=Application"
        )
        pageSoup = BeautifulSoup(self.browser.response.content, "html.parser")
        return pageSoup.findAll(id="Application_Encrypted_Key:_data")[0].text

    def handleAPItables(self):
        apidata = {}
        self.customfields = self.svr.getAllRecords("DataFormField")
        for eachtable in ISServer.tables.keys():
            print "starting " + eachtable
            if eachtable not in self.mapping.keys():
                self.mapping[eachtable] = 99
            fields = ISServer.tables[eachtable] + [
                "_" + fld["Name"] for fld in self.customfields
                if fld["FormId"] == self.mapping[eachtable]]
            apidata[eachtable] = self.svr.getAllRecords(eachtable, interestingData=fields)
            with open(eachtable + ".csv", "wb") as outfile:
                writer = csv.DictWriter(outfile, fields)
                writer.writeheader()
                writer.writerows(apidata[eachtable])
            print "done writing " + eachtable
        self.apidata = apidata

    def handlewebforms(self):
        # for eachid
        # webformsubmissionpath = "https://" + self.appname + ".infusionsoft.com/app/webformSubmission/contactTabDetails?customFormWebResultId=" + str(x)
        pass

    def creditCardsToCSV(self):
        ccs = self.svr.getAllRecords(
            "CreditCard",
            interestingData=["Id", "ContactId", "CardType", "Last4", "ExpirationMonth",
                             "ExpirationYear", "Email", "StartDateMonth", "StartDateYear",
                             "Status"],
        )
        os.chdir(self.startingpath)
        if not os.path.exists("pyDatas"):
            os.mkdir("pyDatas")
        os.chdir("pyDatas")
        with open("ccs.csv", "wb") as outfile:
            thiswriter = csv.DictWriter(outfile, ccs[0].keys())
            thiswriter.writeheader()
            thiswriter.writerows(ccs)
        print "File written to " + str(os.path.abspath(os.curdir))
        os.chdir(self.startingpath)

    def contactsToCSV(self):
        os.chdir(self.startingpath)
        self.customfields = self.svr.getAllRecords("DataFormField")
        fields = ISServer.tables["Contact"] + [
            "_" + fld["Name"] for fld in self.customfields if fld["FormId"] == -1]
        cons = self.svr.getAllRecords("Contact", interestingData=fields)
        if not os.path.exists("pyDatas"):
            os.mkdir("pyDatas")
        os.chdir("pyDatas")
        with open("contacts.csv", "wb") as outfile:
            thiswriter = csv.DictWriter(outfile, cons[0].keys())
            thiswriter.writeheader()
            thiswriter.writerows(cons)
print "Repository: " + link.select('td.repo')[0].text.encode("utf-8").strip()
print "User: " + link.select('td.user')[0].text.encode("utf-8").strip()  # selector redacted in source; 'td.user' is a guess
print "Title: " + link.select('td.title')[0].select('a.execute')[0].text.encode("utf-8").strip()
print "Updated " + link.select('td.date')[0].text.encode("utf-8").strip()
print "\n----------------------"

# obtain links with beautifulSoup
links = browser.find_all('a')
for link in links:
    try:
        #print(link.get('href'))
        if not link['href'].startswith("https"):
            link['href'] = 'https://bitbucket.org' + link['href'].encode("utf-8").strip()
        #link['href']='/odigeoteam/frontend-html5'
        print link['href']
        #print link
        browser.follow_link(link)
        branches = browser.select('li.branches')
        if len(branches) > 0:
            print 'branches ' + branches[0].select('span.value')[0].text
        tags = browser.select('li.tags')
        if len(tags) > 0:
            print 'tags' + tags[0].select('span.value')[0].text
        enlaces = browser.find_all('a')
        #print enlaces
        for enlace in enlaces:
            if enlace.get('href') == '#forks':
                print 'forks ' + enlace.select('span.value')[0].text
            if enlace.get('href') == '#tags':
                print 'tags ' + enlace.select('span.value')[0].text
fp = br.parsed
#f0 = open('f1.html', 'w')
#f0.write(str(fp))

# login
form = br.get_form(id='mod_loginform')
form['username'].value = 'pygather'
form['passwd'].value = '1324354657687980'
br.submit_form(form)
sp = br.parsed
#f2 = open('f2.html','w')
#f2.write(str(sp))

# navigate to quick submit
for a in br.find_all('a', href=True, text=re.compile('Quick Submit')):
    br.follow_link(a)
tp = br.parsed

form = br.get_form(action=re.compile('Itemid=25'))
# print(form)
#form.new_control('text','code',{'value':''})
#form.fixup()
form['localid'].value = str(curProgram)
form['language'].value = '2'
form['code'].value = ('import java.util.*;class Main{public static void main(String[]args) throws Exception{'
                      'Scanner in = new Scanner(System.in);StringBuilder sb = new StringBuilder();'
                      'while(in.hasNextLine()){sb.append(in.nextLine());}'
                      'byte b=(byte)sb.charAt(' + str(curByte) + ');'
                      'if((b>>' + str(shift) + '&0x01)==0){throw new Exception("Error");}}}')
br.submit_form(form)
#f3 = open('f3.html','w')
#f3.write(str(tp))
#print(tp)
courseTitle = "".join([x if x.isalnum() else "_" for x in courseTitle])
print('Course Url: ' + courseModulesUrl)
print('Course Title: ' + courseTitle)
print('Finding file links of type: ' + args.downloadOnly)

# Make output dir
outputDir = os.path.join('output/', courseTitle)
make_path(outputDir)

# Get modules links with lecture in title
moduleLinks = browser.find_all("a", {"class": "for-nvda"})
print('Found ' + str(len(moduleLinks)) + ' links, (not all will be valid)')

# Process each lecture link
for moduleLink in moduleLinks:
    print('Opening: ' + moduleLink['aria-label'])
    browser.follow_link(moduleLink)
    try:
        # Find link - containing words "download"
        downloadLinkRel = browser.find('a', href=re.compile(r'.*download*'))
        # If failed, find link - containing reference to file "****.XXX"
        if downloadLinkRel is None:
            downloadLinkRel = browser.find('a', href=re.compile(r'.*\.[a-z]{3,4}$'))
        fileNameWithExtension = downloadLinkRel.text.strip()

        # Check the link is the right filetype
        if args.downloadOnly != 'all' and not fileNameWithExtension.endswith(args.downloadOnly):
            print(' not processing (wrong extension): ' + fileNameWithExtension)
            continue

        downloadLinkAbsolute = urlparse.urljoin(courseModulesUrl, downloadLinkRel['href'])
import re
from robobrowser import RoboBrowser

# Browse to Genius
browser = RoboBrowser(history=True)
browser.open('http://www.genius.com')

# Search for Porcupine Tree
#form = browser.get_form(action='/search')
#form = browser.get_form(class_='global_search global_search--giant')
form = browser.get_forms()[0]
print(form)
form['q'].value = 'porcupine tree'
response = browser.submit_form(form)
print(response)

# Look up the first song
songs = browser.select('.song_link')
browser.follow_link(songs[0])
lyrics = browser.select('.lyrics')
lyrics[0].text

# Back to results page
browser.back()

# Look up my favorite song
song_link = browser.get_link('trains')
browser.follow_link(song_link)

# Can also search HTML using regex patterns
lyrics = browser.find(class_=re.compile(r'\blyrics\b'))
lyrics.text
class StitchBot(object):
    def __init__(self, output_path=None, username=None, password=None):
        self.browser = RoboBrowser(history=True)
        self.output_path = output_path or tempfile.TemporaryDirectory().name
        self.username = username or os.environ['STITCHBOT_USERNAME']
        self.password = password or os.environ['STITCHBOT_PASSWORD']
        self.logger = logger.getChild('StitchBot')

    def log(self, level, method_name, message, *args, **kwargs):
        child_logger = self.logger.getChild(method_name)
        child_logger.log(level, message, *args, **kwargs)

    def scrape(self):
        self.log(logging.INFO, 'scrape', 'Starting scrape')
        self.log_in()
        self.navigate_to_free_pattern()
        scraped_filenames = self.download_pattern()
        self.log(logging.INFO, 'scrape', 'Scrape complete')
        return scraped_filenames

    def log_in(self):
        self.log(logging.INFO, 'log_in', 'Logging in')
        self.browser.open('http://dailycrossstitch.com/my-account/')
        form = self.browser.get_form(class_='login')
        form['username'] = self.username
        form['password'] = self.password
        self.browser.submit_form(form)
        self.log(logging.INFO, 'log_in', 'Logged in')

    def navigate_to_free_pattern(self):
        self.log(logging.INFO, 'navigate_to_free_pattern', 'Finding free pattern')
        self.browser.open('http://dailycrossstitch.com/')
        free_button = self.browser.find('a', class_='button', string='FREE')
        self.browser.follow_link(free_button)
        self.log(logging.INFO, 'navigate_to_free_pattern', 'Found free pattern')

    def download_pattern(self):
        self.log(logging.INFO, 'download_pattern', 'Downloading pattern')
        download_buttons = self.browser.find_all('a', class_='single_add_to_cart_button')
        download_urls = list(map(itemgetter('href'), download_buttons))
        local_filenames = [self.download_pattern_file(url) for url in download_urls]
        self.log(logging.INFO, 'download_pattern', 'Downloaded pattern')
        return local_filenames

    def download_pattern_file(self, url):
        self.log(logging.INFO, 'download_pattern_file',
                 'Downloading pattern file at {0}'.format(url))
        self.browser.open(url)
        download_script = self.browser.find(
            'script', string=re.compile(r'^\s*function startDownload'))
        if not download_script:
            return
        pdf_url_match = re.search(r'(http.+\.pdf)', download_script.string)
        if not pdf_url_match:
            return
        pdf_url = pdf_url_match.group(1)
        self.browser.open(pdf_url)
        output_filename = self.save_pattern(self.browser.response)
        self.log(logging.INFO, 'download_pattern_file',
                 'Downloaded pattern file at {0}'.format(url))
        return output_filename

    def save_pattern(self, response):
        self.log(logging.INFO, 'save_pattern', 'Saving pattern')
        try:
            os.makedirs(self.output_path)
        except OSError:
            pass
        filename = self.get_filename(response.headers)
        output_filename = os.path.join(self.output_path, filename)
        with open(output_filename, 'wb') as output_file:
            output_file.write(response.content)
        self.log(logging.INFO, 'save_pattern',
                 'Saved pattern to {0}'.format(output_filename))
        return output_filename

    def get_filename(self, headers, default_filename='pattern.pdf'):
        filename_match = re.search(
            r'filename="?([^"]+)"?', headers.get('Content-Disposition', ''))
        if not filename_match:
            return default_filename
        return filename_match.group(1)
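A minimal sketch of how StitchBot could be invoked; the output directory here is illustrative, and the environment variables STITCHBOT_USERNAME / STITCHBOT_PASSWORD are the ones the constructor above falls back to:

if __name__ == '__main__':
    bot = StitchBot(output_path='patterns')
    for path in bot.scrape():
        # download_pattern_file() may return None for entries without a PDF link
        if path:
            print('saved', path)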
class Robot(object):
    """This robot has two functions: grabbing matakuliah data and grabbing
    the KRS of each mahasiswa. It needs a username and password for
    authorization.

    :param str username: username for login
    :param str password: password for login
    """
    def __init__(self, username, password):
        self.browser = RoboBrowser()
        self.username = username
        self.password = password
        self.matakuliah = []

    def update_matakuliah(self):
        self.matakuliah = self._get_matakuliah()
        for obj in self.matakuliah:
            detail = self._get_matakuliah_detail(obj['link_detail'])
            obj['jadwal_kuliah'] = detail['jadwal_kuliah']
            # obj['jadwal_uts'] = detail['jadwal_uts']
            # obj['jadwal_uas'] = detail['jadwal_uas']
        self._persist_matakuliah()

    def _persist_matakuliah(self):
        for obj in self.matakuliah:
            try:
                kelas = (Kelas.select()
                         .where(Kelas.nama == obj['nama_kelas']).get())
            except Kelas.DoesNotExist:
                kelas = Kelas()
            kelas.kode_mk = obj['kode_mk']
            kelas.nama = obj['nama_kelas']
            kelas.matakuliah = obj['matakuliah']
            kelas.dosen = obj['dosen']
            kelas.sks = obj['sks']
            kelas.tipe = obj['tipe']
            kelas.jadwal_kuliah = obj['jadwal_kuliah']
            kelas.save()

    def __login(self):
        self.browser.open('http://akademika.ugm.ac.id')
        login_form = self.browser.get_form(id='form-login')
        login_form['username'].value = self.username
        login_form['password'].value = self.password
        self.browser.submit_form(login_form)

    def _get_matakuliah(self):
        self.__login()
        # go to 'informasi matakuliah' page
        link_matakuliah = self.browser.select('#navigation li a')[3]
        self.browser.follow_link(link_matakuliah)
        marshal = []
        matakuliah_raw = self.browser.select('.table-common > tr')[1:]
        for raw in matakuliah_raw:
            data = raw.select('td')
            obj = {}
            obj['kode_mk'] = data[1].contents[0]
            obj['matakuliah'] = data[2].contents[0]
            obj['dosen'] = data[3].contents[0]
            obj['link_detail'] = data[4].contents[0]
            obj['nama_kelas'] = data[4].contents[0].get_text()
            obj['tipe'] = data[5].contents[0]
            obj['sks'] = data[6].contents[0]
            marshal.append(obj)
        return marshal

    def _get_matakuliah_detail(self, link):
        self.browser.follow_link(link)
        jadwal_row = self.browser.select('table > tr')
        # for brevity
        obj = {}
        obj['jadwal_kuliah'] = ""
        obj['jadwal_uts'] = ""
        obj['jadwal_uas'] = ""
        jadwal_kuliah_row = jadwal_row[0].select('table tr')[1:]
        for row in jadwal_kuliah_row:
            contents = [x.contents[0] for x in row.select('td')]
            data_string = "$".join(contents)
            obj['jadwal_kuliah'] = "|".join([data_string])
        # TODO: find a way to get 'tanggal'
        # jadwal_uts_row = jadwal_row[1].select('table tr')[1:]
        # jadwal_uas_row = jadwal_row[2].select('table tr')[1:]
        return obj
password = getpass()
challenge_count = 1

# loop forever
while True:
    browser = RoboBrowser(parser='lxml')
    browser.open(SITE_URL)

    # try/catch this
    signin_form = browser.get_forms()[0]
    signin_form['login'].value = username
    signin_form['password'].value = password
    browser.submit_form(signin_form)

    # get the leaderboard list
    browser.follow_link(browser.get_link(text='Leaderboard'))
    bot_name_tags = browser.find_all('div', {'class': 'bot-name'})
    bot_name_extracter = lambda tag: tag.string.replace('\t', '').replace('\n', '').lower()
    bot_names = list(map(bot_name_extracter, bot_name_tags))  # list() so it can be indexed below
    no_bots = len(bot_names)
    our_rank = bot_names.index('cbteamname') + 1
    print("[INFO] CBTeamName is ranked " + str(our_rank))

    random.seed(os.urandom(8))
    opponent_queue = []
    # three bots with lower rank
    opponent_queue += [bot_names[random.randint(our_rank + 1, no_bots - 1)],
                       bot_names[random.randint(our_rank + 1, no_bots - 1)],
                       bot_names[random.randint(our_rank + 1, no_bots - 1)]]
    # one bot with a higher rank
class Dagr:
    """deviantArt gallery ripper class"""

    NAME = basename(__file__)
    __version__ = "0.60"
    MAX_DEVIATIONS = 1000000  # max deviations

    def __init__(self):
        # Internals
        self.browser = None
        self.errors_count = dict()
        # Configuration
        self.username = ""
        self.password = ""
        self.overwrite = False
        self.reverse = False
        self.testOnly = False
        self.verbose = False
        # Current status
        self.deviant = ""

    def start(self):
        if not self.browser:
            # Set up fake browser
            self.set_browser()
        # Always run login
        self.login()

    def set_browser(self):
        USERAGENTS = (
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
            'Opera/9.99 (Windows NT 5.1; U; pl) Presto/9.9.9',
            'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/ Safari/530.5',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/6.0',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; pl; rv:1.9.1) Gecko/20090624 Firefox/3.5 (.NET CLR 3.5.30729)'
        )
        session = req_session()
        session.headers.update({'Referer': 'http://www.deviantart.com/'})
        self.browser = RoboBrowser(history=False, session=session, tries=3,
                                   user_agent=random.choice(USERAGENTS))

    def login(self):
        if not (self.username and self.password):
            return
        print("Attempting to log in to deviantArt...")
        self.browser.open('https://www.deviantart.com/users/login?ref=http%3A%2F%2Fwww.deviantart.com%2F&remember_me=1')
        form = self.browser.get_forms()[1]
        form['username'] = self.username
        form['password'] = self.password
        self.browser.submit_form(form)
        if self.browser.find(text=re.compile("The password you entered was incorrect")):
            print("Wrong password or username. Attempting to download anyway.")
        elif self.browser.find(text=re.compile("\"loggedIn\":true")):
            print("Logged in!")
        else:
            print("Login unsuccessful. Attempting to download anyway.")

    def get(self, url, file_name=None):
        if file_name is not None and (self.overwrite == False) and (path_exists(file_name)):
            print(file_name + " exists - skipping")
            return
        #TODO Test robobrowser retries and exceptions
        self.browser.open(url)
        if file_name is None:
            return str(self.browser.parsed)
        else:
            # Open our local file for writing
            local_file = open(file_name, "wb")
            # Write to our local file
            local_file.write(self.browser.response.content)
            local_file.close()

    def find_link(self, link):
        filelink = None
        mature_error = False
        self.browser.open(link)
        # Full image link (via download link)
        img_link = self.browser.get_link(text=re.compile("Download( (Image|File))?"))
        if img_link and img_link.get("href"):
            self.browser.follow_link(img_link)
            filelink = self.browser.url
        else:
            if self.verbose:
                print("Download link not found, falling back to direct image")
            # Fallback 1: try meta (filtering blocked meta)
            filesearch = self.browser.find("meta", {"name": "og:image"})
            if filesearch:
                filelink = filesearch['content']
                if basename(filelink).startswith("noentrythumb-"):
                    filelink = None
                    mature_error = True
            if not filelink:
                # Fallback 2: try collect_rid, full
                filesearch = self.browser.find("img", {"collect_rid": True,
                                                       "class": re.compile(".*full")})
                if not filesearch:
                    # Fallback 3: try collect_rid, normal
                    filesearch = self.browser.find("img", {"collect_rid": True,
                                                           "class": re.compile(".*normal")})
                if filesearch:
                    filelink = filesearch['src']
            if not filelink:
                if mature_error:
                    raise DagrException("probably a mature deviation")
                else:
                    raise DagrException("all attemps to find a link failed")
        filename = basename(filelink)
        return (filename, filelink)

    def handle_download_error(self, link, e):
        error_string = str(e)
        print("Download error (" + link + ") : " + error_string)
        if error_string in self.errors_count:
            self.errors_count[error_string] += 1
        else:
            self.errors_count[error_string] = 1

    def deviant_get(self, mode):
        print("Ripping " + self.deviant + "'s " + mode + "...")
        pat = "http://[a-zA-Z0-9_-]*\.deviantart\.com/art/[a-zA-Z0-9_-]*"
        modeArg = '_'
        if mode.find(':') != -1:
            mode = mode.split(':', 1)
            modeArg = mode[1]
            mode = mode[0]
        #DEPTH 1
        pages = []
        for i in range(0, int(Dagr.MAX_DEVIATIONS / 24), 24):
            html = ""
            url = ""
            if mode == "favs":
                url = "http://" + self.deviant.lower() + ".deviantart.com/favourites/?catpath=/&offset=" + str(i)
            elif mode == "collection":
                url = "http://" + self.deviant.lower() + ".deviantart.com/favourites/" + modeArg + "?offset=" + str(i)
            elif mode == "scraps":
                url = "http://" + self.deviant.lower() + ".deviantart.com/gallery/?catpath=scraps&offset=" + str(i)
            elif mode == "gallery":
                url = "http://" + self.deviant.lower() + ".deviantart.com/gallery/?catpath=/&offset=" + str(i)
            elif mode == "album":
                url = "http://" + self.deviant.lower() + ".deviantart.com/gallery/" + modeArg + "?offset=" + str(i)
            elif mode == "query":
                url = "http://" + self.deviant.lower() + ".deviantart.com/gallery/?q=" + modeArg + "&offset=" + str(i)
            else:
                continue
            html = self.get(url)
            prelim = re.findall(pat, html, re.IGNORECASE | re.DOTALL)
            c = len(prelim)
            for match in prelim:
                if match in pages:
                    c -= 1
                else:
                    pages.append(match)
            done = re.findall("(This section has no deviations yet!|This collection has no items yet!)",
                              html, re.IGNORECASE | re.S)
            if len(done) >= 1 or c <= 0:
                break
            print(self.deviant + "'s " + mode + " page " + str(int((i / 24) + 1)) + " crawled...")
        if not self.reverse:
            pages.reverse()
        if len(pages) == 0:
            print(self.deviant + "'s " + mode + " had no deviations.")
            return 0
        else:
            try:
                da_make_dirs(self.deviant + "/" + mode)
                if (mode == "query") or (mode == "album") or (mode == "collection"):
                    da_make_dirs(self.deviant + "/" + mode + "/" + modeArg)
            except Exception as e:
                print(str(e))
        print("Total deviations in " + self.deviant + "'s gallery found: " + str(len(pages)))
        ##DEPTH 2
        counter2 = 0
        for link in pages:
            counter2 += 1
            if self.verbose:
                print("Downloading " + str(counter2) + " of " + str(len(pages)) + " ( " + link + " )")
            filename = ""
            filelink = ""
            try:
                filename, filelink = self.find_link(link)
            except (KeyboardInterrupt, SystemExit):
                raise
            except Exception as e:
                self.handle_download_error(link, e)
                continue
            if self.testOnly == False:
                if (mode == "query") or (mode == "album") or (mode == "collection"):
                    self.get(filelink, self.deviant + "/" + mode + "/" + modeArg + "/" + filename)
                else:
                    self.get(filelink, self.deviant + "/" + mode + "/" + filename)
            else:
                print(filelink)
        print(self.deviant + "'s gallery successfully ripped.")

    def group_get(self, mode):
        if mode == "favs":
            strmode = "favby"
            strmode2 = "favourites"
            strmode3 = "favs gallery"
        elif mode == "gallery":
            strmode = "gallery"
            strmode2 = "gallery"
            strmode3 = "gallery"
        else:
            print("?")
            sys.exit()
        print("Ripping " + self.deviant + "'s " + strmode2 + "...")
        folders = []
        insideFolder = False  # are we inside a gallery folder?
        html = self.get('http://' + self.deviant + '.deviantart.com/' + strmode2 + '/')
        if re.search(strmode2 + "/\?set=.+&offset=", html, re.IGNORECASE | re.S):
            insideFolder = True
            folders = re.findall(strmode + ":.+ label=\"[^\"]*\"", html, re.IGNORECASE)
        # no repeats
        folders = list(set(folders))
        i = 0
        while not insideFolder:
            html = self.get('http://' + self.deviant + '.deviantart.com/' + strmode2 + '/?offset=' + str(i))
            k = re.findall(strmode + ":" + self.deviant + "/\d+\"\ +label=\"[^\"]*\"", html, re.IGNORECASE)
            if k == []:
                break
            flag = False
            for match in k:
                if match in folders:
                    flag = True
                else:
                    folders += k
            if self.verbose:
                print("Gallery page " + str(int((i / 10) + 1)) + " crawled...")
            if flag:
                break
            i += 10
        # no repeats
        folders = list(set(folders))
        if len(folders) == 0:
            print(self.deviant + "'s " + strmode3 + " is empty.")
            return 0
        else:
            print("Total folders in " + self.deviant + "'s " + strmode3 + " found: " + str(len(folders)))
        if self.reverse:
            folders.reverse()
        pat = "http:\\/\\/[a-zA-Z0-9_-]*\.deviantart\.com\\/art\\/[a-zA-Z0-9_-]*"
        pages = []
        for folder in folders:
            try:
                folderid = re.search("[0-9]+", folder, re.IGNORECASE).group(0)
                label = re.search("label=\"([^\"]*)", folder, re.IGNORECASE).group(1)
            except:
                continue
            for i in range(0, int(Dagr.MAX_DEVIATIONS / 24), 24):
                html = self.get("http://" + self.deviant.lower() + ".deviantart.com/" + strmode2 +
                                "/?set=" + folderid + "&offset=" + str(i - 24))
                prelim = re.findall(pat, html, re.IGNORECASE)
                if not prelim:
                    break
                for x in prelim:
                    p = str(re.sub(r'\\/', '/', x))
                    if p not in pages:
                        pages.append(p)
                if self.verbose:
                    print("Page " + str(int((i / 24) + 1)) + " in folder " + label + " crawled...")
            if not self.reverse:
                pages.reverse()
            try:
                if mode == "favs":
                    da_make_dirs(self.deviant + "/favs/" + label)
                elif mode == "gallery":
                    da_make_dirs(self.deviant + "/" + label)
            except Exception as err:
                print(err)
            counter = 0
            for link in pages:
                counter += 1
                if self.verbose:
                    print("Downloading " + str(counter) + " of " + str(len(pages)) + " ( " + link + " )")
                filename = ""
                filelink = ""
                try:
                    filename, filelink = self.find_link(link)
                except (KeyboardInterrupt, SystemExit):
                    raise
                except Exception as e:
                    self.handle_download_error(link, e)
                    continue
                if self.testOnly == False:
                    if mode == "favs":
                        self.get(filelink, self.deviant + "/favs/" + label + "/" + filename)
                    elif mode == "gallery":
                        self.get(filelink, self.deviant + "/" + label + "/" + filename)
                else:
                    print(filelink)
"favs": self.get(filelink, self.devianti + "/favs/" + label + "/" + filename) elif mode == "gallery": self.get(filelink, self.deviant + "/" + label + "/" + filename) else: print(filelink) print(self.deviant + "'s " + strmode3 + " successfully ripped.") def print_errors(self): if len(self.errors_count): print("Download errors count:") for error, count in self.errors_count.iteritems(): print("* " + error + " : " + str(count))
class ISServer: def __init__(self): global pw self.pw = pw self.startingpath = os.path.abspath(os.curdir) self.infusionsoftapp=self.getappname() self.baseurl = 'https://' + self.infusionsoftapp + '.infusionsoft.com/' self.infusionsoftAPIKey=self.getapikey() self.appurl = "https://" + self.infusionsoftapp + ".infusionsoft.com:443/api/xmlrpc" self.connection = xmlrpclib.ServerProxy(self.appurl) def getappname(self): return raw_input("Please enter appname:").strip('\n \t') def getapikey(self): global pw username = self.pw['username'] password = self.pw['password'] #Basically: # #Add username and password to your global variables. self.browser = RoboBrowser(history=True) self.browser.open(self.baseurl) logform = self.browser.get_form() logform.fields['username'].value = username logform.fields['password'].value = password self.browser.submit_form(logform) self.browser.follow_link(self.browser.get_links()[1]) self.browser.open(self.baseurl + 'app/miscSetting/itemWrapper?systemId=nav.admin&settingModuleName=Application&settingTabName=Application') pageSoup = BeautifulSoup(self.browser.response.content, 'html.parser') return pageSoup.findAll(id='Application_Encrypted_Key:_data')[0].text ######################################################## ## Methods to get records from various tables ## ## def getMatchingRecords(self, tableName, criteria, desiredFields=None, orderedBy=None): """Search at table by criteria """ return self.getAllRecords(tableName, searchCriteria=criteria, interestingData=desiredFields, orderedBy=orderedBy) def getTagCats(self): return self.getAllRecords("ContactGroupCategory") def getAllTags(self): return self.getAllRecords("ContactGroup") def getAllProductCats(self): return self.getAllRecords("ProductCategory") def getAllProducts(self): return self.getAllRecords("Product") def getAllRecords(self, tableName, interestingData=None, searchCriteria=None, orderedBy=None): if interestingData is None: interestingData = tables[tableName] if searchCriteria is None: searchCriteria={} if orderedBy is None: orderedBy = interestingData[0] records = [] p=0 while True: listOfDicts = self.connection.DataService.query(self.infusionsoftAPIKey, tableName, 1000, p, searchCriteria, interestingData, orderedBy, True) for each in listOfDicts: thisRecord={} for eachbit in interestingData: # this should be records.append(zip(interestingData, each)) perhaps if not each.has_key(eachbit): # TODO: research THIS each[eachbit]=None thisRecord[eachbit] = each[eachbit] records.append(thisRecord) if not(len(listOfDicts)==1000): break p+=1 return records def incrementlyGetRecords(self, tableName, interestingData=None, searchCriteria=None, orderedBy=None): if interestingData is None: interestingData = tables[tableName] if searchCriteria is None: searchCriteria={} if orderedBy is None: orderedBy = interestingData[0] records = [] p=0 while True: print tableName, p print "trying!" 
try: listOfDicts = self.connection.DataService.query(self.infusionsoftAPIKey, tableName, 1000, p, searchCriteria, interestingData, orderedBy, True) except Exception, e: print e ,p for each in listOfDicts: thisRecord={} for eachbit in interestingData: # this should be records.append(zip(interestingData, each)) perhaps if not each.has_key(eachbit): # TODO: research THIS each[eachbit]=None thisRecord[eachbit] = each[eachbit] records.append(thisRecord) if not(len(listOfDicts)==1000): break p+=1 if p%10==0: fname = tableName + "%010d" %(p) + ".csv" print 'writing', p, fname with open(fname, 'wb') as outfile: thisWriter = csv.DictWriter(outfile, records[0]) thisWriter.writeheader() thisWriter.writerows(records) records=[] fname = tableName + "%010d" %(p) + ".csv" print 'writing', p, fname with open(fname, 'wb') as outfile: thisWriter = csv.DictWriter(outfile, records[0]) thisWriter.writeheader() thisWriter.writerows(records)
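# Hedged sketch of the paging loop used above: Infusionsoft's DataService.query
# returns at most 1000 rows per call, so successive pages are requested until a
# short page comes back. 'connection' and 'api_key' stand in for the
# xmlrpclib ServerProxy and key held by ISServer; argument order mirrors the
# calls above.
def iter_records(connection, api_key, table, fields, criteria=None, page_size=1000):
    criteria = criteria or {}
    page = 0
    while True:
        rows = connection.DataService.query(
            api_key, table, page_size, page, criteria, fields, fields[0], True)
        for row in rows:
            # normalise missing fields to None, as the code above does
            yield {field: row.get(field) for field in fields}
        if len(rows) < page_size:
            break
        page += 1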
# -*- coding: utf-8 -*-
import re
import sys
from robobrowser import RoboBrowser
from getpass import getpass

account = input('account:')
password = getpass('password:')
browser = RoboBrowser(history=True)
browser.open('http://web1.cmu.edu.tw/stdinfo/login.asp')
form1 = browser.get_form(id='form1')
form1['f_id'].value = account
form1['f_pwd'].value = password
browser.submit_form(form1)

link_one = browser.get_link(text='期中網路教學意見調查')
browser.follow_link(link_one)

links = []
for l in browser.get_links(text=re.compile('填寫')):
    links.append(l)
links.pop(0)

for li in links:
    browser.follow_link(li)
    form2 = browser.get_form(id='thisform')
    form2['Cos_Q1'].value = '1'
    browser.submit_form(form2)
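# Hedged sketch: the script above only sets 'Cos_Q1'. If the survey page has
# several Cos_Q* radio groups, they can be filled in one pass by iterating the
# form's field names. The field-name prefix is an assumption about the page,
# not something verified here.
def fill_survey(form, answer='1', prefix='Cos_Q'):
    for name in list(form.keys()):
        if name.startswith(prefix):
            form[name].value = answer
    return form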
# coding: utf-8
import re
from robobrowser import RoboBrowser

url = "http://www.qq.com/"
b = RoboBrowser(history=True)
b.open(url)

# grab the "today's topic" (今日话题) link
today_top = b.find(id="todaytop").a
print today_top["href"]

b.follow_link(today_top)
# the browser has now navigated to the topic's own page

# print the title
title = b.select(".hd h1")[0]
print "*************************************"
print title.text
print "*************************************"

# print the article body
print b.find(id="articleContent").text
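# Hedged sketch: the qq.com front page changes often, so guard the lookups the
# demo above does blindly. The ids and selectors are the same assumptions as
# above; the function returns (title, body) or None values when parts are missing.
def read_today_top(browser):
    node = browser.find(id="todaytop")
    if node is None or node.a is None:
        return None, None
    browser.follow_link(node.a)
    titles = browser.select(".hd h1")
    body = browser.find(id="articleContent")
    return (titles[0].text if titles else None,
            body.text if body is not None else None)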
class fullexporter(): global pw def __init__(self): self.startingpath = os.path.abspath(os.curdir) self.appname=self.getappname() self.mapping={} self.mapping['Contact']=-1 self.mapping['Affiliate']=-3 self.mapping['ContactAction']=-5 self.mapping['Company']=-6 self.mapping['OrderItem']=-9 self.menu() def menu(self, context="initial"): if context is "initial": self.baseurl = 'https://' + self.appname + '.infusionsoft.com/' self.apikey=self.getapikey() self.svr = ISServer.ISServer(self.appname, self.apikey) self.apppath = os.path.join(self.startingpath, self.appname) if not os.path.exists(self.apppath): os.mkdir(self.apppath) os.chdir(self.apppath) if not os.path.exists('files'): os.mkdir('files') os.chdir('files') self.usermenu={} self.usermenu['downloadAPITables'] = 'apit' self.usermenu['play'] = 'play' self.usermenu['reports'] = 'rpts' for eachitem in self.usermenu.keys(): print eachitem + ":\t" + self.usermenu[eachitem] thisChoice = raw_input('please make a choice: ').strip(' \n\t') if thisChoice == 'apit': self.handleAPItables() elif thisChoice == 'play': self.play() elif thisChoice == 'rpts': self.downloadAllReports() else: self.menu() def handlefiles(self): os.chdir(self.startingpath) if not os.path.exists('files'): os.mkdir('files') os.chdir('files') allfiles = self.svr.getAllRecords('FileBox') for eachfile in allfiles: downloadurl = self.baseurl+"Download?Id="+str(eachfile['Id']) self.browser.open(downloadurl) fileoutpath = os.path.join(self.startingpath, 'files', eachfile['ContactId'], eachfile['FileName']) if not os.path.exists(os.path.dirname(fileoutpath)): os.makedirs(fileoutpath) fout = open(fileoutpath, 'wb') fout.write(self.browser.response.content) fout.close() def play(self): print "she's all yours captain!" def downloadAReport(self, reportname): self.browser.open(self.baseurl + "Reports/exportResults.jsp?reportClass=" + reportname) reportForm = [eachform for eachform in self.browser.get_forms() if eachform.action == 'qbExport.jsp'] if len(reportForm) > 0: self.browser.submit_form(reportForm[0], submit=reportForm[0].submit_fields['process']) with open(reportname+".csv", 'wb') as outfile: outfile.write(self.browser.response.content) else: print "no " + reportname def downloadAllReports(self): for reportname in [ "AffiliateActivitySummary", "AffiliateLedger", "AffiliateRedirectActivity", "AffiliateReferral", "AffPayout", "AllOrders", "AllSales", "AllSalesItemized", "ARAgingReport", "CampaigneeBasic", "CampaigneeByDay", "CampaignProductConversion", "ClickThroughPercentage", "ClickThroughPercentageByEmail", "ContactDistributed", "CProgramRevenueSummary", "CreditCard", "CreditsIssued", "CustomerLifetimeValue", "DailyPayments", "DailyReceivables", "DailySalesTotals", "DashboardCampaign", "DashboardEmail", "DashboardLeads", "DashboardOrders", "DashboardUsers", "DigitalProductKey", "EmailBatchSearch", "EmailBroadcastConversionReport", "EmailConversion", "EmailSentSearch", "FailedCharge", "FaxBatchSearch", "FollowUpSequenceConversionReport", "FunnelFlowRecipient", "FunnelFlowRecipientWaiting", "FunnelGoalAchieved", "FunnelQueuedFlowItem", "FunnelUniqueContacts", "GroupAdds", "HabeasDetail", "InvoiceNetIncome", "LeadSourceConversion", "LeadSourceIncome", "LeadSourceROI", "LeadSourceROIByCategory", "MonthlyPayments", "MonthlyReceivables", "MonthlySalesTotals", "MonthlySalesTotalsByProduct", "OptOutSearch", "PaymentsReport", "PieceResponse", "ProductNetIncome", "Receivables", "RevenueForecastReport", "TaskSearch", "VoiceBatchSearch", "VoiceOptOutSearch", "WebformActivitySummary", 
"WebFormTracking" ]: self.downloadAReport(reportname) def getFilePath(self): return tkFileDialog.askopenfilename() def getFolderPath(self): return tkFileDialog.askdirectory() def getappname(self): return raw_input("Please enter appname:").strip('\n \t') def getapikey(self): global pw username = pw['username'] password = pw['password'] #Basically: # #Add username and password to your global variables. self.browser = RoboBrowser(history=True) self.browser.open(self.baseurl) logform = self.browser.get_form() logform.fields['username'].value = username logform.fields['password'].value = password self.browser.submit_form(logform) self.browser.follow_link(self.browser.get_links()[1]) self.browser.open(self.baseurl + 'app/miscSetting/itemWrapper?systemId=nav.admin&settingModuleName=Application&settingTabName=Application') pageSoup = BeautifulSoup(self.browser.response.content, 'html.parser') return pageSoup.findAll(id='Application_Encrypted_Key:_data')[0].text def handleAPItables(self): apidata={} self.customfields=self.svr.getAllRecords('DataFormField') for eachtable in ISServer.tables.keys(): print "starting " + eachtable if eachtable not in self.mapping.keys(): self.mapping[eachtable]=99 fields = ISServer.tables[eachtable] + ['_'+fld['Name'] for fld in self.customfields if fld['FormId'] is self.mapping[eachtable]] apidata[eachtable] = self.svr.getAllRecords(eachtable, interestingData=fields) with open(eachtable+".csv", 'wb') as outfile: writer=csv.DictWriter(outfile, fields) writer.writeheader() writer.writerows(apidata[eachtable]) print "done writing " + eachtable self.apidata = apidata def handlewebforms(self): # for eachid # webformsubmissionpath="https://" + self.appname + ".infusionsoft.com/app/webformSubmission/contactTabDetails?customFormWebResultId=" + str(x) pass
# Browser
#br = mechanize.Browser()
br = RoboBrowser(history=True,
                 user_agent='Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2')

# The site we will navigate into, handling its session
br.open('http://heroes-wow.com/wotlk/index.php?page=login')
login_form = br.get_form(action="http://heroes-wow.com/wotlk/execute.php?take=login")
login_form['username'].value = 'anathk2'
login_form['password'].value = 'wow123456'
login_form['rememberme'].value = '1'
br.submit_form(login_form)

br.open('http://topg.org/server-heroes-wow-id347987')
links = br.find_all('a', href=True)
br.follow_link(links[22])
result = br.parsed

new_links = br.find_all('a', href=True)
br.follow_link(new_links[1])
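# Hedged sketch: following links by position (links[22], new_links[1]) breaks as
# soon as the page layout shifts. Matching the href (or link text) with a regex
# is usually sturdier; the pattern argument is an example, not the real
# topg.org markup.
import re

def follow_first_matching(browser, href_pattern):
    link = browser.get_link(href=re.compile(href_pattern))
    if link is None:
        raise ValueError("no link matching %r on %s" % (href_pattern, browser.url))
    browser.follow_link(link)
    return browser.url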
import re
from robobrowser import RoboBrowser

# Browse to Rap Genius
browser = RoboBrowser(history=True)
browser.open('http://rapgenius.com/')

# Search for Queen
form = browser.get_form(action='/search')
form                 # <RoboForm q=>
form['q'].value = 'queen'
browser.submit_form(form)

# Look up the first song
songs = browser.select('.song_name')
browser.follow_link(songs[0])
lyrics = browser.select('.lyrics')
lyrics[0].text       # \n[Intro]\nIs this the real life...

# Back to results page
browser.back()

# Look up my favorite song
browser.follow_link('death on two legs')

# Can also search HTML using regex patterns
lyrics = browser.find(class_=re.compile(r'\blyrics\b'))
lyrics.text
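# Hedged sketch: the same search-and-follow flow as above wrapped in a reusable
# function. Selectors mirror the example and may not match the current
# rapgenius.com markup.
from robobrowser import RoboBrowser

def first_song_lyrics(query):
    browser = RoboBrowser(history=True)
    browser.open('http://rapgenius.com/')
    form = browser.get_form(action='/search')
    form['q'].value = query
    browser.submit_form(form)
    songs = browser.select('.song_name')
    if not songs:
        return None
    browser.follow_link(songs[0])
    lyrics = browser.select('.lyrics')
    return lyrics[0].text if lyrics else None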
courseTitle = "".join([x if x.isalnum() else "_" for x in courseTitle])
print('Course Url: ' + courseModulesUrl)
print('Course Title: ' + courseTitle)
print('Finding file links of type: ' + args.downloadOnly)

# Make output dir
outputDir = os.path.join('output/', courseTitle)
make_path(outputDir)

# Get module links with lecture in title
moduleLinks = browser.find_all("a", {"class": "for-nvda"})
print('Found ' + str(len(moduleLinks)) + ' links, (not all will be valid)')

# Process each lecture link
for moduleLink in moduleLinks:
    print('Opening: ' + moduleLink['aria-label'])
    browser.follow_link(moduleLink)
    try:
        # Find a link containing the word "download"
        downloadLinkRel = browser.find('a', href=re.compile(r'.*download*'))

        # If that failed, find a link referencing a file "****.XXX"
        if downloadLinkRel is None:
            downloadLinkRel = browser.find('a', href=re.compile(r'.*\.[a-z]{3,4}$'))

        fileNameWithExtension = downloadLinkRel.text.strip()

        # Check the link is the right filetype
        if args.downloadOnly != 'all' and not fileNameWithExtension.endswith(args.downloadOnly):
            print(' not processing (wrong extension): ' + fileNameWithExtension)
            continue

        downloadLinkAbsolute = urlparse.urljoin(courseModulesUrl, downloadLinkRel['href'])
        pdfOutputPath = os.path.join(outputDir, fileNameWithExtension)

        # Check if the file was already downloaded (in case the tool was interrupted)
        if os.path.isfile(pdfOutputPath):
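# Hedged sketch of the download step the fragment above is building toward:
# skip files that already exist, otherwise stream the resolved absolute URL to
# the output path. requests is assumed to be available; the names are
# illustrative, not part of the original tool.
import os
import requests

def download_if_missing(url, output_path):
    if os.path.isfile(output_path):
        print('    already downloaded: ' + output_path)
        return False
    response = requests.get(url, stream=True)
    response.raise_for_status()
    with open(output_path, 'wb') as out:
        for chunk in response.iter_content(chunk_size=8192):
            out.write(chunk)
    print('    saved: ' + output_path)
    return True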