def main(page, regex, path):
    """Download every file linked from ``page`` whose URL matches ``regex``.

    The page is opened with a ``Browser`` (robots.txt handling disabled), all
    links whose URL matches ``regex`` from the start of the string are listed
    on stderr, and the user is asked for confirmation.  Each confirmed file is
    then saved into ``path`` under the basename of its URL; files that already
    exist there are skipped.

    Example values previously used with this function:
        page:  'http://storage.googleapis.com/books/ngrams/books/datasetsv2.html'
        regex: '.*googlebooks-eng-all.*20120701.*'

    Args:
        page: URL of the page that contains the download links.
        regex: Regular expression applied with ``match`` to each link URL.
        path: Directory the files are written to.

    Raises:
        SystemExit: with status 0 when the user answers 'N'/'n' at the prompt,
            and with status 1 when a download fails (the partially written
            file, if any, is removed first).
    """
    start_time = time.time()
    br = Browser()
    br.set_handle_robots(False)
    br.open(page)

    eng_all = re.compile(regex)

    link_list = []
    for link in br.links():
        if eng_all.match(link.url):
            link_list.append(link.url)
            sys.stderr.write('Found Link: %s\n' % link.url)

    n = len(link_list)
    # Widest file name, used to right-align names in the progress output.
    maxlen = max([len(os.path.basename(url)) for url in link_list] + [0])

    answer = raw_input("\n\nAre you sure you want to download the above %i file(s)? (Y/N):  " % n)
    if answer in ('N', 'n'):
        sys.exit(0)

    sys.stderr.write('\n\nDownloading files to: %s\n' % path)

    digits = len('%d' % n)
    downloaded = 0

    for i, link in enumerate(link_list):
        download_start = time.time()
        file_name = os.path.basename(link)
        full_path = os.path.join(path, file_name)
        if os.path.exists(full_path):
            sys.stderr.write('%s exists, not downloading\n' % full_path)
            continue
        try:
            sys.stderr.write('[%s] Downloading(%-*i of %i): %*s' % (
                datetime.datetime.now().strftime('%H:%M:%S'),
                digits, i + 1, n, maxlen + 2, file_name))
            # The progress line has no trailing newline; flush so it is
            # visible while the (possibly long) download is running.
            sys.stderr.flush()
            br.retrieve(link, filename=full_path)
        except (Exception, KeyboardInterrupt) as err:
            # KeyboardInterrupt is caught on purpose so that Ctrl-C does not
            # leave a truncated file behind that a later run would skip.
            sys.stderr.write('\n\nSomething happened, deleting last file: %s\n' % full_path)
            sys.stderr.write('Reason: %r\n' % (err,))
            # The file may never have been created if the request failed early;
            # removing unconditionally would raise and hide the real error.
            if os.path.exists(full_path):
                os.remove(full_path)
            # Non-zero status so calling scripts can detect the failure.
            sys.exit(1)
        size_mb = os.path.getsize(full_path) / 1000000.0
        sys.stderr.write(' of size %s MB in %5.2f min\n' % (
            '%7.2f' % size_mb, (time.time() - download_start) / 60))
        downloaded += 1
        br.clear_history()

    # Report the files actually fetched, not the number of matched links
    # (some of which may have been skipped because they already existed).
    sys.stderr.write('\ndownloaded %i files to %s directory in %15f seconds\n' % (
        downloaded, path, time.time() - start_time))
# Example #2
# 0
# Script fragment: reads the file named by options.wikifile, logs in to the
# wiki with a Browser, and opens the page's text editor form.
optsparser.add_option('-p',
                      '--password',
                      default=None,
                      dest='password',
                      help='Password')
options, args = optsparser.parse_args()

# Pause (in seconds) passed to time.sleep() between requests.
z = 1
# Capture wiki tracker data
print('Capturing wiki data...')
# Read inside a context manager so the file handle is closed right away.
with open(options.wikifile, 'r') as wikifile:
    wikidata = wikifile.read()

# Post it
br = Browser()
br.clear_history()
br.set_handle_robots(False)  # don't pay attention to robots.txt
print('Going to wiki...')
response = br.open(
    'http://www.openhpi.org/Status/OpenhpiBugsFeatures?action=login')
br.select_form(nr=2)  # The form we want is the third one
# Prompt interactively only for credentials that options does not provide.
br['name'] = options.user or raw_input('Enter your username: ')
br['password'] = options.password or \
    getpass('Enter password for \'%s\': ' % br['name'])
print('Loggin in...')
time.sleep(z)  # Be nice to the website
response = br.submit()
time.sleep(z)
print('Editing page...')
# Follow the link whose URL matches 'editor=text', then pick its first form.
response = br.follow_link(url_regex='editor=text')
br.select_form(nr=0)
# Example #3
# 0
# Script fragment: reads the file named by options.wikifile, logs in to the
# wiki with a Browser, and places the file's contents in the editor form's
# 'savetext' field.
optsparser.add_option('-p',
                      '--password',
                      default=None,
                      dest='password',
                      help='Password')
options, args = optsparser.parse_args()

# Pause (in seconds) passed to time.sleep() between requests.
z = 1
# Capture wiki tracker data
print('Capturing wiki data...')
# Read inside a context manager so the file handle is closed right away.
with open(options.wikifile, 'r') as wikifile:
    wikidata = wikifile.read()

# Post it
br = Browser()
br.clear_history()
br.set_handle_robots(False) # don't pay attention to robots.txt
print('Going to wiki...')
response = br.open('http://www.openhpi.org/Status/OpenhpiBugsFeatures?action=login')
br.select_form(nr=2) # The form we want is the third one
# Prompt interactively only for credentials that options does not provide.
br['name'] = options.user or raw_input('Enter your username: ')
br['password'] = options.password or \
    getpass('Enter password for \'%s\': ' % br['name'])
print('Loggin in...')
time.sleep(z) # Be nice to the website
response = br.submit()
time.sleep(z)
print('Editing page...')
# Follow the link whose URL matches 'editor=text', then pick its first form.
response = br.follow_link(url_regex='editor=text')
br.select_form(nr=0)
# Fill the edit form's text field with the data read from the wiki file.
br['savetext'] = wikidata