import re
import sys

from mechanize import Browser


def grablinks(pageurl):
    dllinks = []
    br = Browser()
    br2 = Browser()
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.addheaders = [('User-agent',
                      'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    br2.set_handle_referer(True)
    br2.set_handle_robots(False)
    br2.addheaders = [('User-agent',
                       'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    # Open the page that was passed in (the original opened sys.argv[1],
    # ignoring the pageurl parameter).
    br.open(pageurl)
    grabbed = 0
    for link in br.links(url_regex='/download/'):
        print "Working..."
        # Build a request for the download page and open it in the second browser.
        req = br.click_link(url=link.url)
        br2.open(req)
        dlpagetext = br2.response().read()
        # Pull the URL out of the "var hqurl = '...'" JavaScript assignment,
        # stripping the quotes and the leading space left by the replacement.
        dllinks.append(
            re.search("var hqurl = '.*'", dlpagetext)
            .group(0)
            .replace("var hqurl =", "")
            .replace("'", "")
            .strip())
        print "Grabbed link " + str(grabbed + 1)
        grabbed += 1
    return dllinks
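# A minimal sketch of driving grablinks() from the command line. The
# argument handling and output format are assumptions, not part of the
# original snippet.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        print "Usage: python grablinks.py <page-url>"
        sys.exit(1)
    for dlurl in grablinks(sys.argv[1]):
        print dlurl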
def delete_module(br: Browser, args: Args, config: Config) -> None:
    module_name = get_module_name(args, config)
    print(f'Deleting module with name {module_name}...')
    # Try clicking on the link to the module's config page
    try:
        res = br.follow_link(text=module_name)
    except Exception:
        print('Cannot delete module; does not exist.')
        print()
        return
    # Get the URL used to delete the module
    req = br.click_link(text='Delete Module')
    url = req.get_full_url()
    # Search for the CSRF token
    token = find_csrf_token(res)
    params = {'_method': 'delete', 'authenticity_token': token}
    data = urllib.parse.urlencode(params)
    # POST the deletion request
    try:
        br.open(url, data)
        print('Module deleted')
    except Exception:
        print('Module could not be deleted')
    print()
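# find_csrf_token() is referenced above but not defined in this snippet.
# A minimal sketch, assuming the token lives in a Rails-style hidden input
# named 'authenticity_token' on the module's config page; the helper name,
# the field name, and the use of BeautifulSoup are assumptions.
from bs4 import BeautifulSoup


def find_csrf_token(response) -> str:
    soup = BeautifulSoup(response.read(), 'html.parser')
    field = soup.find('input', attrs={'name': 'authenticity_token'})
    if field is None:
        raise ValueError('No CSRF token found on page')
    return field['value']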
def DoAuth(address, password):
    br = Browser()
    br.open(address)
    # Find the first form
    br.select_form(nr=0)
    # Fill in the login fields
    br["user_name"] = "user"
    br["user_pass"] = password
    br.submit()
    # Find all links on the page matching the regexp
    for link in br.links(url_regex="side"):
        link_url = ''
        link_text = ''
        page_text = ''
        req = br.click_link(link)
        link_url = link.url
        print('........... ............ ............')
        print("IN " + link_url)
        time.sleep(1)
        link_text = link.text
        print("Link text: " + link.text)
        # Create object for parsing the page
        soup = BeautifulSoup(br.open(req))
        cols = soup.findAll('iframe')
        if cols:
            # Create a separate browser for the frame body
            fr = Browser()
            # Remove the trailing 'index.php' from the address
            s = address[:-9]
            soupframe = BeautifulSoup(fr.open(s + cols[0]['src']))
            cols = soupframe.findAll('h3')
            if cols:
                page_text = cols[0].renderContents()
                print('page text: ' + page_text)
        RecordToFile(link_url, link_text, page_text)
    f1.close()
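# RecordToFile() and the file object f1 are referenced above but not defined
# in this snippet. A minimal sketch, assuming f1 is a module-level results
# file and each record is written as one tab-separated line; the file name
# and the output format are assumptions.
f1 = open('results.txt', 'w')


def RecordToFile(link_url, link_text, page_text):
    f1.write(link_url + '\t' + link_text + '\t' + page_text + '\n')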
from mechanize import Browser
import webbrowser

br = Browser()
br.open('http://www.city-data.com/')
br.find_link(text='CT')
req = br.click_link(text='CT')
br.open(req)
print br.geturl()
clickedurl = br.geturl()
#br.open(clickedurl).read()
#webbrowser.open(clickedurl, new=1)
#browser.select_form(nr=0)
# retrieve() downloads the image to a temporary file and returns its path.
f = br.retrieve('http://pics3.city-data.com/images/data-stats-new.png')[0]
print f
fh = open(f)
#print br.response().read()
class Gmtkn24(object):
    """
    Interact with the online web pages of GMTKN24.
    """
    BASE_URL = 'http://toc.uni-muenster.de/GMTKN/GMTKN24/'

    def __init__(self):
        # initialization
        self._browser = Browser()
        self._browser.set_handle_robots(False)
        self._subsets = self._list_subsets()

    _subset_link_re = re.compile("The (.+) subset")
    #_subset_link_re = re.compile("here")

    def _list_subsets(self):
        """Return dictionary mapping GMTKN24 subset names to download URLs."""
        html = BeautifulSoup(
            self._browser.open(Gmtkn24.BASE_URL + 'GMTKN24main.html'))
        links = html.findAll(name="a")
        result = {}
        for a in links:
            if a.string is not None:
                match = Gmtkn24._subset_link_re.match(a.string)
                if match is not None:
                    print a
                    # if a subset has several names, add all of them
                    for name in match.group(1).split(' '):
                        if name == 'and':
                            continue
                        result[name] = Gmtkn24.BASE_URL + a['href']
        print result
        #result = ['google.com', 'cnn.com']
        return result

    def list(self):
        """Return dictionary mapping GMTKN24 subset names to download URLs."""
        return self._subsets

    def get_geometries(self, subset, output_dir='geometries'):
        """
        Download geometry files for the specified GMTKN24 subset, and save
        them into the 'geometries/' subdirectory of the current working
        directory.  Return list of extracted molecules/filenames.
        """
        subset_url = self._subsets[subset]
        page = self._browser.open(subset_url)
        # must download the zip to a local file -- zipfiles are not stream-friendly ...
        geometries_url = self._browser.click_link(
            text_regex=re.compile("^Geometries"))
        (filename, headers) = self._browser.retrieve(geometries_url)
        logger.info("%s geometries downloaded into file '%s'", subset, filename)
        geometries_zip = ZipFile(filename, 'r')
        if not os.path.exists(output_dir):
            os.mkdir(output_dir)
        molecules = self.get_molecule_names(subset)
        extracted = list()
        names = geometries_zip.namelist()
        for name in names:
            basename = os.path.basename(name)
            if basename not in molecules and basename != 'README':
                continue
            # zipfile's `extract` method preserves full pathname,
            # so let's get the data from the archive and write
            # it in the file WE want...
            content = geometries_zip.read(name)
            output_path = os.path.join(output_dir, basename)
            output = open(output_path, 'w')
            output.write(content)
            output.close()
            if basename != 'README':
                extracted.append(basename)
            logger.info("Extracted '%s' into '%s'", basename, output_path)
        geometries_zip.close()
        return extracted

    def get_reference_data(self, subset):
        """
        Iterate over stoichiometry reference data in a given GMTKN24 subset.
        Each returned value is a pair `(r, d)`, where `r` is a dictionary
        mapping compound names (string) to their stoichiometric coefficient
        (integer), and `d` is a (float) number representing the total energy.
        """
        subset_url = self._subsets[subset]
        subset_page = self._browser.open(subset_url)
        if subset in ['BH76', 'BH76RC']:
            # special case
            self._browser.follow_link(text=("Go to the %s subset" % subset))
        refdata_page = self._browser.follow_link(text="Reference data")
        table = HtmlTable(refdata_page.read())
        for row in table.rows_as_dict():
            if subset == 'W4-08woMR':
                # The 16 entries marked with an asterisk (*) are not
                # part of the W4-08woMR subset.
                if row['#'] and row['#'][0].endswith('*'):
                    continue
            reactants = row['Systems']
            if len(reactants) == 0:
                continue  # ignore null rows
            qtys = row['Stoichiometry']
            refdata = float(row['Ref.'][0])
            reaction = {}
            for n, sy in enumerate(reactants):
                if qtys[n] == '':
                    continue  # skip null fields
                reaction[sy] = int(qtys[n])
            yield (reaction, refdata)

    def get_molecule_names(self, subset):
        """Return set of molecule names belonging in the specified subset."""
        # The only generic way to list molecule names seems to be:
        # take the systems names from the ref.data table.
        molecules = set()
        for reaction, data in self.get_reference_data(subset):
            for molecule in reaction:
                molecules.add(molecule)
        return molecules
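# A minimal sketch of driving the Gmtkn24 class above: list the available
# subsets, then download the geometries for one of them. The 'AL2X' subset
# name is only an illustration and may not match an actual subset key.
if __name__ == '__main__':
    gmtkn24 = Gmtkn24()
    for name, url in gmtkn24.list().items():
        print name, url
    extracted = gmtkn24.get_geometries('AL2X')
    print "Extracted %d geometry files" % len(extracted)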
    [
    1) Get response from inputting workspace url
    2) Get response from inputting email and password
    ]
    """
# workspace url
browser.open(url)
browser.select_form(nr=0)
browser.form['domain'] = workspace
response_workspace = browser.submit()
print(response_workspace.read())

## email and password
browser.select_form(nr=1)
browser.form['email'] = email
browser.form['password'] = password
response_login = browser.submit()
print(response_login.read())

"""Choose Channel to add to [Ie. twittercode_snippets] """
browser.click_link(text=channel)
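# The fragment above assumes a mechanize browser plus workspace/login
# variables defined elsewhere. A minimal sketch of that setup; the
# Slack-style sign-in URL and all of the credential values below are
# assumptions, not part of the original snippet.
import mechanize

browser = mechanize.Browser()
browser.set_handle_robots(False)

url = 'https://slack.com/signin'
workspace = 'myworkspace'
email = 'user@example.com'
password = 'secret'
channel = 'twittercode_snippets'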
        interpreter.process_page(page)
        outfp.write("END PAGE %d\n" % i)
    device.close()
    fp.close()
    return outfp.getvalue()


db = cdb.cdbmake('omega/cdb/pdfurl', 'pdfurl.tmp')
b = Browser()
for round in range(1, 4):
    url = 'http://taxreview.treasury.gov.au/content/submission.aspx?round=' + str(round)
    b.open(url)
    for link in b.links(url_regex='pdf$'):
        u = b.click_link(link).get_full_url()
        #print "link: ", u
        try:
            f = urllib2.urlopen(u)
        except:
            continue
        remotefile = re.search('[^/]+$', u).group(0)
        remotetime = time.mktime(f.info().getdate('Last-Modified'))
        #base = re.search('[^\.]+', remotefile).group(0)
        base = re.search('(.+)\.pdf$', remotefile).group(1)
        print base
        localhtml = 'www/html/' + str(round) + '/' + base + '.html'
        localpdf = 'pdf/' + str(round) + '/' + base + '.pdf'
        localtime = 0