Example #1
import re

from mechanize import Browser


def grablinks(pageurl):
    """Collect the 'hqurl' download links found on the given listing page."""
    dllinks = []
    user_agent = ('User-agent',
                  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) '
                  'Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')
    # One browser walks the listing page; a second fetches each download
    # page so the first never loses its place in the link list.
    br = Browser()
    br2 = Browser()
    for b in (br, br2):
        b.set_handle_referer(True)
        b.set_handle_robots(False)
        b.addheaders = [user_agent]

    br.open(pageurl)
    grabbed = 0
    for link in br.links(url_regex='/download/'):
        print("Working...")
        # Build the request for the link without navigating br away from
        # the listing, then open it in the second browser.
        req = br.click_link(url=link.url)
        br2.open(req)
        dlpagetext = br2.response().read().decode('utf-8', 'replace')
        # The download page embeds the real link as: var hqurl = '...'
        dllinks.append(re.search("var hqurl = '(.*)'", dlpagetext).group(1))
        grabbed += 1
        print("Grabbed link " + str(grabbed))
    return dllinks
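A minimal command-line driver for `grablinks`, matching the script-style usage of the original (the invocation shown in the comment is illustrative):

if __name__ == '__main__':
    import sys
    # e.g.:  python grablinks.py http://example.com/listing-page
    for hqurl in grablinks(sys.argv[1]):
        print(hqurl)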
Example #2
def delete_module(br: Browser, args: Args, config: Config) -> None:
    module_name = get_module_name(args, config)

    print(f'Deleting module with name {module_name}...')
    # Try clicking on link to module config page
    try:
        res = br.follow_link(text=module_name)
    except LinkNotFoundError:  # from mechanize; raised when no matching link exists
        print('Cannot delete module; does not exist.')
        print()
        return

    # Get URL to delete module
    req = br.click_link(text='Delete Module')
    url = req.get_full_url()

    # Search for the stupid CSRF token
    token = find_csrf_token(res)
    params = {'_method': 'delete', 'authenticity_token': token}
    data = urllib.parse.urlencode(params)

    # POST the deletion request
    try:
        br.open(url, data)
        print('Module deleted')
    except Exception:
        print('Module could not be deleted')

    print()
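`delete_module` calls a `find_csrf_token` helper that this snippet does not define. A minimal sketch of what it could look like, assuming the token is carried in a Rails-style hidden `authenticity_token` input (the regex is a guess, not the original implementation):

import re

def find_csrf_token(res):
    # Scan the page for a hidden authenticity_token input and return its value.
    html = res.read().decode('utf-8', 'replace')
    match = re.search(r'authenticity_token"[^>]*value="([^"]+)"', html)
    if match is None:
        raise ValueError('no authenticity_token found on page')
    return match.group(1)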
Example #3
import time

from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3; `from bs4 import BeautifulSoup` also works
from mechanize import Browser


def DoAuth(address, password):
	br = Browser()
	br.open(address)
	# select the first form on the page
	br.select_form(nr=0)
	# fill in the login fields
	br["user_name"] = "user"
	br["user_pass"] = password
	br.submit()
	# find all links on the page whose URL matches the regexp
	for link in br.links(url_regex="side"):
		page_text = ''
		req = br.click_link(link)
		link_url = link.url
		print('........... ............ ............')
		print("IN " + link_url)

		time.sleep(1)
		link_text = link.text
		print("Link text: " + link.text)
		# parse the linked page
		soup = BeautifulSoup(br.open(req))
		cols = soup.findAll('iframe')
		if cols:
			# fetch the body of the first frame with a separate browser;
			# strip the trailing 'index.php' (9 characters) from the address
			fr = Browser()
			s = address[:-9]
			soupframe = BeautifulSoup(fr.open(s + cols[0]['src']))
			cols = soupframe.findAll('h3')
			if cols:
				page_text = cols[0].renderContents()
				print('page text: ' + page_text)
		RecordToFile(link_url, link_text, page_text)
	f1.close()
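`RecordToFile` and the `f1` handle come from the surrounding module and are not shown here. A minimal sketch consistent with how they are used above (the file name and tab-separated format are assumptions):

# Hypothetical module-level output file; the f1.close() above refers to this.
f1 = open('links.txt', 'w')

def RecordToFile(link_url, link_text, page_text):
	# One tab-separated record per crawled link.
	f1.write('\t'.join([link_url, link_text, page_text]) + '\n')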
Example #4
from mechanize import Browser

br = Browser()
br.open('http://www.city-data.com/')

# follow the 'CT' link (click_link raises LinkNotFoundError if it is absent)
req = br.click_link(text='CT')
br.open(req)
print(br.geturl())
clickedurl = br.geturl()

# download an image to a local temporary file;
# retrieve() returns a (filename, headers) pair
f = br.retrieve('http://pics3.city-data.com/images/data-stats-new.png')[0]
print(f)
fh = open(f, 'rb')  # the retrieved file is binary (PNG)
Example #5
import logging
import os
import re
from zipfile import ZipFile

from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3; `from bs4 import BeautifulSoup` also works
from mechanize import Browser

# `HtmlTable` (used in get_reference_data) is a helper class defined
# elsewhere in the original script and is not reproduced here.

logger = logging.getLogger(__name__)


class Gmtkn24(object):
    """
    Interact with the online web pages of GMTKN24.
    """
    BASE_URL = 'http://toc.uni-muenster.de/GMTKN/GMTKN24/'

    def __init__(self):
        # initialization
        self._browser = Browser()
        self._browser.set_handle_robots(False)
        self._subsets = self._list_subsets()

    _subset_link_re = re.compile("The (.+) subset")

    def _list_subsets(self):
        """Return dictionary mapping GMTKN24 subset names to download URLs."""
        html = BeautifulSoup(
            self._browser.open(Gmtkn24.BASE_URL + 'GMTKN24main.html'))
        links = html.findAll(name="a")
        result = {}
        for a in links:
            if a.string is None:
                continue
            match = Gmtkn24._subset_link_re.match(a.string)
            if match is None:
                continue
            # if a subset has several names, add all of them
            for name in match.group(1).split(' '):
                if name == 'and':
                    continue
                result[name] = Gmtkn24.BASE_URL + a['href']
        return result

    def list(self):
        """Return dictionary mapping GMTKN24 subset names to download URLs."""
        return self._subsets

    def get_geometries(self, subset, output_dir='geometries'):
        """
        Download geometry files for the specified GMTKN24 subset,
        and save them into the 'geometries/' subdirectory of the
        current working directory.

        Return list of extracted molecules/filenames.
        """
        subset_url = self._subsets[subset]
        page = self._browser.open(subset_url)
        # must download the zip to a local file -- zipfiles are not stream-friendly ...
        geometries_url = self._browser.click_link(
            text_regex=re.compile("^Geometries"))
        (filename, headers) = self._browser.retrieve(geometries_url)
        logger.info("%s geometries downloaded into file '%s'", subset,
                    filename)
        geometries_zip = ZipFile(filename, 'r')
        if not os.path.exists(output_dir):
            os.mkdir(output_dir)
        molecules = self.get_molecule_names(subset)
        extracted = list()
        names = geometries_zip.namelist()
        for name in names:
            basename = os.path.basename(name)
            if basename not in molecules and basename != 'README':
                continue
            # zipfile's `extract` method preserves full pathname,
            # so let's get the data from the archive and write
            # it in the file WE want...
            content = geometries_zip.read(name)
            output_path = os.path.join(output_dir, basename)
            output = open(output_path, 'wb')  # ZipFile.read() returns raw bytes
            output.write(content)
            output.close()
            if basename != 'README':
                extracted.append(basename)
            logger.info("Extracted '%s' into '%s'", basename, output_path)
        geometries_zip.close()
        return extracted

    def get_reference_data(self, subset):
        """
        Iterate over stoichiometry reference data in a given GMTKN24
        subset.  Each returned value is a pair `(r, d)`, where `r` is
        a dictionary mapping compound names (string) to their
        stoichiometric coefficient (integer), and `d` is a (float)
        number representing the total energy.
        """
        subset_url = self._subsets[subset]
        subset_page = self._browser.open(subset_url)
        if subset in ['BH76', 'BH76RC']:
            # special case
            self._browser.follow_link(text=("Go to the %s subset" % subset))
        refdata_page = self._browser.follow_link(text="Reference data")
        table = HtmlTable(refdata_page.read())
        for row in table.rows_as_dict():
            if subset == 'W4-08woMR':
                # The 16 entries marked with an asterisk (*) are not
                # part of the W4-08woMR subset.
                if row['#'] and row['#'][0].endswith('*'):
                    continue
            reactants = row['Systems']
            if len(reactants) == 0:
                continue  # ignore null rows
            qtys = row['Stoichiometry']
            refdata = float(row['Ref.'][0])
            reaction = {}
            for n, sy in enumerate(reactants):
                if qtys[n] == '':
                    continue  # skip null fields
                reaction[sy] = int(qtys[n])
            yield (reaction, refdata)

    def get_molecule_names(self, subset):
        """Return set of molecule names belonging in the specified subset."""
        # The only generic way to list molecule names seems to be:
        # take the systems names from the ref.data table.
        molecules = set()
        for reaction, data in self.get_reference_data(subset):
            for molecule in reaction:
                molecules.add(molecule)
        return molecules
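A short sketch of how the class above might be driven (the 'BH76' subset name appears in the class's own special-casing; everything else is illustrative):

gmtkn24 = Gmtkn24()

# Show every subset and its download URL.
for name, url in gmtkn24.list().items():
    print("%s -> %s" % (name, url))

# Fetch one subset's geometry files into ./geometries/ and walk its
# stoichiometry reference data.
extracted = gmtkn24.get_geometries('BH76')
for reaction, energy in gmtkn24.get_reference_data('BH76'):
    print("%s: %s" % (reaction, energy))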
Example #6
"""Log in

[
    1) Get response from inputting workspace url
    2) Get response from inputting email and password
]
"""

# workspace url
browser.open(url)
browser.select_form(nr=0)
browser.form['domain'] = workspace

response_workspace = browser.submit()
print(response_workspace.read())

# email and password
browser.select_form(nr=1)
browser.form['email'] = email
browser.form['password'] = password

response_login = browser.submit()
print(response_login.read())


"""Choose Channel to add to

[i.e. twittercode_snippets]
"""

browser.click_link(text=channel)
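This snippet assumes a `browser` object and the `url`, `workspace`, `email`, `password`, and `channel` values were set up earlier. A minimal setup consistent with how they are used here (all concrete values below are placeholders):

import mechanize

browser = mechanize.Browser()
browser.set_handle_robots(False)  # login pages often disallow robots

url = 'https://example.slack.com/'  # hypothetical workspace sign-in page
workspace = 'myworkspace'           # value for the form's 'domain' field
email = 'me@example.com'
password = 'hunter2'
channel = 'twittercode_snippets'    # channel name from the docstring above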
Example #7
        interpreter.process_page(page)
        outfp.write("END PAGE %d\n" % i)

    device.close()
    fp.close()

    return outfp.getvalue()

db = cdb.cdbmake('omega/cdb/pdfurl', 'pdfurl.tmp')
b = Browser()
for round_no in range(1, 4):
    url = 'http://taxreview.treasury.gov.au/content/submission.aspx?round=' + str(round_no)
    b.open(url)
    for link in b.links(url_regex='pdf$'):
        # resolve the link to its full URL without downloading it yet
        u = b.click_link(link).get_full_url()
        try:
            f = urllib2.urlopen(u)
        except urllib2.URLError:  # covers HTTPError as well
            continue

        remotefile = re.search(r'[^/]+$', u).group(0)
        remotetime = time.mktime(f.info().getdate('Last-Modified'))

        base = re.search(r'(.+)\.pdf$', remotefile).group(1)
        print(base)
        localhtml = 'www/html/' + str(round_no) + '/' + base + '.html'
        localpdf = 'pdf/' + str(round_no) + '/' + base + '.pdf'
        localtime = 0