def download_images_from_webpage(self, href, out_path=None, img_name=None):
    ret = False
    print href
    sub_html = utils.wget(href)
    if not sub_html:
        print 'WARNING: request to %s failed.' % href
    else:
        ret = True
        # Get the images in the page: capture the src of every <img> tag.
        image_urls = re.findall(ur'<img [^>]*src\s*=\s*"([^"]*?)"[^>]*?>', sub_html)
        print image_urls
        for image_url in image_urls:
            # Resolve the src against the page URL: a relative path replaces
            # the last path segment; a root-relative path replaces everything
            # after the scheme and host.
            if not image_url.startswith('/'):
                image_url = re.sub(ur'/[^/]*$', '/' + image_url, href)
            else:
                image_url = re.sub(ur'^(.*?//.*?/).*$',
                                   r'\1' + image_url.lstrip('/'), href)
            print image_url
            # Get the image.
            image = utils.wget(image_url)
            if not image:
                print 'WARNING: request to %s failed.' % image_url
            else:
                # Save it under the requested name, or the basename of its URL.
                image_path = os.path.join(
                    out_path, img_name or re.sub(ur'^.*/', '', image_url))
                print image_path
                utils.write_file(image_path, image)
    return ret
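# The snippets in this collection call a small `wget` helper rather than the
# command-line tool. Its implementation isn't shown here; the sketch below is
# a hypothetical reconstruction, assuming (as the call sites suggest) that it
# returns the response body when no destination is given and otherwise saves
# to a file, returning a falsy value on failure. Written for Python 2 to
# match the print-statement snippets.
import urllib2

def wget(url, dest_fname=None):
    try:
        body = urllib2.urlopen(url, timeout=30).read()
    except (urllib2.URLError, IOError):
        return None  # call sites above treat a falsy result as failure
    if dest_fname is None:
        return body
    with open(dest_fname, 'wb') as f:
        f.write(body)
    return dest_fname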
def install_Windows(self):
    # ctags
    ctags_zip = os.path.join(DOWNLOADS, CTAGS_VER + '.zip')
    if not isfile(ctags_zip):
        wget('http://sourceforge.net/projects/ctags/files/ctags/5.8/{}.zip'
             .format(CTAGS_VER), ctags_zip)
    unzip(ctags_zip, CTAGS_VER + '/ctags.exe')
def import_orders():
    url = 'https://mouser.com/OrderHistory/OrdersView.aspx'
    file = '/tmp/orders.html'
    wget(url, file)
    orders = parse_orders(url, file)
    for order in orders:
        (cols, url) = order
        import_order(url)
def import_orders():
    url = 'https://www.digikey.com/classic/registereduser/WebOrderHistory.aspx'
    file = '/tmp/orders.html'
    wget(url, file)
    orders = parse_orders(url, file)
    for order in orders:
        (cols, url) = order
        import_order(url)
def install(should_identify=True):
    # Based on ideas from https://github.com/harvimt/quamash/blob/master/.travis.yml
    if should_identify:
        system_identify()
    td = Travis_Dispatcher()
    xqt(
        # Cached downloads
        'sudo mkdir -p /downloads',
        'sudo chmod a+rw /downloads')
    sip_ver = 'sip-4.17'
    if not isfile('/downloads/sip.tar.gz'):
        wget('http://downloads.sourceforge.net/project/pyqt/sip/{}/{}'
             .format(sip_ver, _gz(sip_ver)), '/downloads/sip.tar.gz')
    # _`pyqt_ver`: Select a PyQt version. See also qt5_Linux_ and qt5_OS_X_.
    pyqt_ver = '5.5.1'
    pyqt_gpl_ver = 'PyQt-gpl-' + pyqt_ver
    if not isfile('/downloads/pyqt5.tar.gz'):
        wget('http://downloads.sourceforge.net/project/pyqt/PyQt5/PyQt-{}/{}'
             .format(pyqt_ver, _gz(pyqt_gpl_ver)), '/downloads/pyqt5.tar.gz')
    # Builds
    xqt('sudo mkdir -p /builds',
        'sudo chmod a+rw /builds')
    # Qt5
    td.qt5()
    # SIP. On Linux or OS X, don't use the package manager to install these,
    # since they're installed for the system Python, not the pyenv version
    # we're testing with.
    with pushd('/builds'):
        xqt('tar xzf /downloads/sip.tar.gz --keep-newer-files')
        chdir(sip_ver)
        xqt('python configure.py',
            'make',
            'sudo make install')
    # PyQt5
    with pushd('/builds'):
        xqt('tar xzf /downloads/pyqt5.tar.gz --keep-newer-files')
        chdir(pyqt_gpl_ver)
        td.pyqt5_configure()
        xqt('make',
            'sudo make install')
    # PCRE
    td.pcre()
    # Qutepart
    if build_os == 'Linux':
        set_display()
        xqt('sh -e /etc/init.d/xvfb start')
    # Install, which also builds Python C extensions. Use this instead of
    # ``build_ext`` so that Enki will have an already-installed qutepart,
    # rather than needing to regenerate the command below.
    xqt('python setup.py install')
def import_order(url):
    mouser = populate_distributor('Mouser', 'http://mouser.com/')
    file = '/tmp/order.html'
    wget(url, file)
    (sales_order_num, web_order_num, order_date, items, price) = \
        parse_order(url, file)
    try:
        return get('/order/find/', {'dist': mouser['id'],
                                    'order_number': sales_order_num})
    except Http404:
        # The order date is %m/%d/%Y; store it as ISO YYYY-MM-DD.
        m = re.search(r'([0-9]+)/([0-9]+)/([0-9]+)', order_date)
        o = create_order(dist=mouser, order_number=sales_order_num, url=url,
                         date=m.group(3) + '-' + m.group(1) + '-' + m.group(2),
                         price=price)
        create_order_property(order=o, name='Sales Order Number',
                              value=sales_order_num)
        create_order_property(order=o, name='Web Order Number',
                              value=web_order_num)
        import_failures = []
        for item in items:
            (part_url, dist_part_num) = item[0]
            quantity = item[2]
            price = item[3].replace('$', '')
            ext_price = item[4].replace('$', '')
            try:
                print 'importing ' + dist_part_num
                dp = import_mouser_part(dist_part_num)
                p = get_part(dp['part'])
                create_part_history(order=o, part=p, quantity=quantity,
                                    ext_price=ext_price)
            except Exception, e:
                print 'import failed:', repr(e)
                import_failures.append(dist_part_num)
def import_contours():
    wget(env.contours_fichier)
    run(u'unzip communes-20150101-5m-shp.zip')
    table_name = 'communes-20150101-5m'
    drop_table_communes(env.conf_api.SQLALCHEMY_DATABASE_URI, table_name)
    run(u'shp2pgsql {} > communes.sql'.format(table_name))
    run(u'psql {} -f communes.sql'.format(env.conf_api.SQLALCHEMY_DATABASE_URI))
    run(u"""psql {} -c 'INSERT INTO "ZUPC" (nom, insee, shape, active)
        SELECT nom, insee, geom, false FROM "{}";'
        """.format(env.conf_api.SQLALCHEMY_DATABASE_URI, table_name))
    require.files.file('sql_update', contents="""UPDATE "ZUPC"
        SET departement_id = sub.id
        FROM (SELECT id, numero FROM departement) AS sub
        WHERE insee LIKE sub.numero||'%';""")
    run('psql {} -f /tmp/zupc/sql_update'.format(env.conf_api.SQLALCHEMY_DATABASE_URI))
def download_osmesa():
    import os, re, zipfile
    from utils import wget
    mesa_dir = os.path.join(context_dir, 'OSMesa')
    if not os.path.exists(mesa_dir):
        # Pick the prebuilt OSMesa archive matching this OS and machine.
        sysinfo = platform.uname()
        osmesa_fname = 'OSMesa.%s.%s.zip' % (sysinfo[0], sysinfo[-2])
        zip_fname = os.path.join(context_dir, osmesa_fname)
        if not os.path.exists(zip_fname):
            print "Downloading %s" % osmesa_fname
            # MPI url: http://files.is.tue.mpg.de/mloper/opendr/osmesa/%s
            # BL url: https://s3.amazonaws.com/bodylabs-assets/public/osmesa/%s
            wget('http://files.is.tue.mpg.de/mloper/opendr/osmesa/%s' % (osmesa_fname,),
                 dest_fname=zip_fname)
        assert os.path.exists(zip_fname)
        # Extract only the headers and static libraries (*.h, *.a).
        with zipfile.ZipFile(zip_fname, 'r') as z:
            for f in filter(lambda x: re.search('[ah]$', x), z.namelist()):
                z.extract(f, path=context_dir)
    assert os.path.exists(mesa_dir)
def install(should_identify=True):
    if should_identify:
        system_identify()
    # Create a place to store downloads.
    if not isdir(DOWNLOADS):
        mkdir(DOWNLOADS)
    # Download and install PyQt5. Only download if we don't have a cached
    # copy available.
    install_PyQt5 = os.path.join(DOWNLOADS, 'install-PyQt5.exe')
    if not isfile(install_PyQt5):
        wget('http://downloads.sourceforge.net/project/pyqt/PyQt5/PyQt-5.5.1/'
             'PyQt5-5.5.1-gpl-Py3.4-Qt5.5.1-x32.exe',
             install_PyQt5)
    # See https://github.com/appveyor/ci/issues/363#issuecomment-148915001.
    xqt('REG ADD HKCU\\Software\\Python\\PythonCore\\3.4\\InstallPath /f /ve '
        '/t REG_SZ /d C:\\Python34',
        install_PyQt5 + ' /S')
    # Download and compile PCRE.
    pcre_ver = 'pcre-8.38'
    pcre_zip = pcre_ver + '.zip'
    pcre_zip_path = os.path.join(DOWNLOADS, pcre_zip)
    if not isfile(pcre_zip_path):
        # Note: Don't use ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/,
        # because this sometimes hangs during download, causing the build to
        # fail. Instead, use the more reliable SourceForge mirror.
        wget('http://downloads.sourceforge.net/project/pcre/pcre/8.38/' +
             pcre_zip, pcre_zip_path)
    # See https://sevenzip.osdn.jp/chm/cmdline/commands/extract_full.htm.
    xqt('7z x {} > nul'.format(pcre_zip_path))
    with pushd(pcre_ver):
        mkdir('build')
        chdir('build')
        xqt('cmake .. -DBUILD_SHARED_LIBS:BOOL=OFF -DPCRE_SUPPORT_UTF:BOOL=ON '
            '-DPCRE_SUPPORT_JIT:BOOL=ON -G "Visual Studio 10 2010"',
            'cmake --build . --config Release')
    # Install, which also builds Python C extensions. Use this instead of
    # ``build_ext`` so that Enki will have an already-installed qutepart,
    # rather than needing to regenerate the command below.
    xqt('python setup.py install --include-dir={}/build '
        '--lib-dir={}/build/Release --force'.format(pcre_ver, pcre_ver))
def install(should_identify=True):
    if should_identify:
        system_identify()
    # Create a place to store downloads.
    if not isdir(DOWNLOADS):
        mkdir(DOWNLOADS)
    # Download and compile PCRE.
    pcre_raw_ver = '8.42'
    pcre_ver = 'pcre-' + pcre_raw_ver
    pcre_zip = pcre_ver + '.zip'
    pcre_zip_path = os.path.join(DOWNLOADS, pcre_zip)
    if not isfile(pcre_zip_path):
        # Note: Don't use ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/,
        # because this sometimes hangs during download, causing the build to
        # fail. Instead, use the more reliable SourceForge mirror.
        wget('http://downloads.sourceforge.net/project/pcre/pcre/{}/{}'.format(
                 pcre_raw_ver, pcre_zip),
             pcre_zip_path)
    # See https://sevenzip.osdn.jp/chm/cmdline/commands/extract_full.htm.
    xqt('7z x {} > nul'.format(pcre_zip_path))
    with pushd(pcre_ver):
        mkdir('build')
        chdir('build')
        # Per https://cmake.org/cmake/help/latest/generator/Visual%20Studio%2014%202015.html,
        # add the Win64 string for 64-bit Python.
        use_Win64 = ' Win64' if is_64bits else ''
        xqt('cmake .. -DBUILD_SHARED_LIBS:BOOL=OFF -DPCRE_SUPPORT_UTF:BOOL=ON '
            '-DPCRE_SUPPORT_JIT:BOOL=ON -G "Visual Studio 14 2015{}"'.format(
                use_Win64),
            'cmake --build . --config Release')
    # First, build the Python C extensions. Use this instead of
    # ``build_ext`` so that Enki will have an already-installed qutepart,
    # rather than needing to regenerate the command below.
    xqt('python setup.py build_ext --include-dir={}/build '
        '--lib-dir={}/build/Release --force'.format(pcre_ver, pcre_ver))
    # Next, install it along with its dependencies. See comments at
    # ``install_requires`` on why this is necessary.
    xqt('python -m pip install -e .')
def import_zupc(import_='True'):
    require.files.directory('/tmp/zupc')
    with cd('/tmp/zupc/'):
        for f in list_dir():
            if f == '*' or f.endswith('zip'):
                continue
            run('rm -f {}'.format(f))
    if import_ == 'True':
        import_contours()
    # Find the most recent deployment directory.
    base_dir = ''
    with cd(env.deploy_dir):
        for f in list_dir():
            if files.is_dir(f) and 'deployment' in f and f > base_dir:
                base_dir = f
    api_dir = env.deploy_dir + '/' + base_dir
    with cd('/tmp/zupc'):
        wget(env.zupc_fichier)
    with python.virtualenv(base_dir + '/venvAPITaxi'), \
            cd(base_dir + '/APITaxi-master'):
        with shell_env(APITAXI_CONFIG_FILE='prod_settings.py'):
            run('python manage.py load_zupc /tmp/zupc/zupc.geojson')
def download_osmesa():
    import os, re, zipfile
    from utils import wget
    mesa_dir = os.path.join(context_dir, 'OSMesa')
    if not os.path.exists(mesa_dir):
        sysinfo = platform.uname()
        osmesa_fname = 'OSMesa.%s.%s.zip' % (sysinfo[0], sysinfo[-2])
        zip_fname = os.path.join(context_dir, osmesa_fname)
        if not os.path.exists(zip_fname):
            # Try each mirror in turn until one serves the file.
            for base_url in osmesa_mirrors:
                print "Downloading %s" % (base_url + osmesa_fname,)
                try:
                    wget(base_url + osmesa_fname, dest_fname=zip_fname)
                    break
                except Exception:
                    print "File not found, trying mirrors"
        assert os.path.exists(zip_fname)
        # Extract only the headers and static libraries (*.h, *.a).
        with zipfile.ZipFile(zip_fname, 'r') as z:
            for f in filter(lambda x: re.search('[ah]$', x), z.namelist()):
                z.extract(f, path=context_dir)
    assert os.path.exists(mesa_dir)
def ldhost(host, write=0):
    # Generate a new host index based on existing thread indexes.
    meta = ["head.txt", "list.txt"]
    tdir = "/".join(["./threads", host])
    indpath = "/".join([tdir, meta[1]])
    threads = [x.path for x in os.scandir(tdir) if x.is_dir()]
    bind = []  # first, last, local, total, title
    for thread in threads:
        info = "/".join([thread, meta[0]])
        replies = "/".join([thread, meta[1]])
        if not os.path.isfile(info):
            # Fetch the thread header from the friend host if it's missing.
            t = thread.split("/")[-1]
            orig = "/".join([friends[host], "raw", "local", t, "head.txt"])
            u.wget(orig, info)
        with open(info, "r") as info:
            info = info.read().strip()
        if len(info) == 0:
            continue
        info = info.splitlines()[0]
        with open(replies, "r") as replies:
            replies = replies.read().splitlines()
        replies = [r.split(" ") for r in replies]
        breps = [r[0] for r in replies]
        # Skip threads whose first or last reply has a malformed timestamp.
        try:
            int(replies[0][1])
            int(replies[-1][1])
        except (ValueError, IndexError):
            continue
        tline = [
            replies[0][1],
            replies[-1][1],
            str(breps.count("local")),
            str(len(replies)),
            info
        ]
        bind.append(tline)
    # Sort threads by last-reply time, newest first.
    bind.sort(key=lambda x: x[1], reverse=True)
    if not write:
        return bind
    bind = "\n".join([" ".join(t) for t in bind])
    with open(indpath, "w") as ind:
        ind.write(bind)
def install(should_identify=True):
    if should_identify:
        system_identify()
    # Create a place to store downloads.
    if not isdir(DOWNLOADS):
        mkdir(DOWNLOADS)
    # Download and compile PCRE.
    pcre_raw_ver = '8.39'
    pcre_ver = 'pcre-' + pcre_raw_ver
    pcre_zip = pcre_ver + '.zip'
    pcre_zip_path = os.path.join(DOWNLOADS, pcre_zip)
    if not isfile(pcre_zip_path):
        # Note: Don't use ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/,
        # because this sometimes hangs during download, causing the build to
        # fail. Instead, use the more reliable SourceForge mirror.
        wget('http://downloads.sourceforge.net/project/pcre/pcre/{}/{}'
             .format(pcre_raw_ver, pcre_zip), pcre_zip_path)
    # See https://sevenzip.osdn.jp/chm/cmdline/commands/extract_full.htm.
    xqt('7z x {} > nul'.format(pcre_zip_path))
    with pushd(pcre_ver):
        mkdir('build')
        chdir('build')
        # Per https://cmake.org/cmake/help/latest/generator/Visual%20Studio%2014%202015.html,
        # add the Win64 string for 64-bit Python.
        use_Win64 = ' Win64' if is_64bits else ''
        xqt('cmake .. -DBUILD_SHARED_LIBS:BOOL=OFF -DPCRE_SUPPORT_UTF:BOOL=ON '
            '-DPCRE_SUPPORT_JIT:BOOL=ON -G "Visual Studio 14 2015{}"'
            .format(use_Win64),
            'cmake --build . --config Release')
    # First, build the Python C extensions. Use this instead of
    # ``build_ext`` so that Enki will have an already-installed qutepart,
    # rather than needing to regenerate the command below.
    xqt('python setup.py build_ext --include-dir={}/build '
        '--lib-dir={}/build/Release --force'.format(pcre_ver, pcre_ver))
    # Next, install it along with its dependencies. See comments at
    # ``install_requires`` on why this is necessary.
    xqt('python -m pip install -e .')
def fetch_old(self, *args, **options):
    '''
    fetch http://zzz//P3.html --links-file "bible1" --op=img1

    Will save all the jpg images found at that address into a directory
    called img1. We first download the index from that address, then follow
    each link with a name listed in the bible1 file and download all the jpg
    images found in those sub-pages.
    '''
    out_path = options['out_path']
    if len(args) > 1:
        url = args[1]
        print url
        if options['links']:
            links = options['links'].split(' ')
        if options['links_file']:
            f = open(options['links_file'], 'rb')
            links = f.readlines()
            f.close()
            links = [link.strip() for link in links]
        if links:
            html = utils.wget(url)
            if not html:
                print 'ERROR: request to %s failed.' % url
            else:
                for link in links:
                    print link
                    # Find the <a> tag whose text matches this link name.
                    href = re.findall(ur'<a [^>]*href="([^"]*?)"[^>]*>\s*' +
                                      re.escape(link) + ur'\s*<', html)
                    if href:
                        href = href[0]
                        href = re.sub(ur'/[^/]*$', '/' + href, url)
                        print href
                        self.download_images_from_webpage(href, out_path)
#!/usr/bin/env python
import re
import string

import utils

url = "http://www.pythonchallenge.com/pc/def/ocr.html"

if __name__ == "__main__":
    text_pattern = re.compile(
        r'<!--\nfind rare characters in the mess below:\n-->\n\n<!--\n(.*?)\n-->',
        re.DOTALL)
    text = re.search(text_pattern, utils.wget(url)).group(1)
    # Keep only the rare alphabetic characters; they spell the next page name.
    new_filename = ''
    for char in text:
        if char in string.letters:
            new_filename += char
    print utils.update_url(url, utils.return_this, new_filename)
import re

import utils

url = "http://www.pythonchallenge.com/pc/def/linkedlist.html"
url = "http://www.pythonchallenge.com/pc/def/linkedlist.php"
url = "http://www.pythonchallenge.com/pc/def/linkedlist.php?nothing=12345"

if __name__ == "__main__":
    value_pattern = re.compile(r'and the next nothing is (\d+)')
    this_value = url[-5:]
    count = 0
    while count <= 400:
        source = utils.wget(url)
        mo = re.search(value_pattern, source)
        if mo:
            next_value = mo.group(1)
        elif source.find('Divide by two') >= 0:
            # The page occasionally asks to halve the current number instead.
            next_value = str(int(this_value) / 2)
        else:
            print "%d: %s" % (count, source)
            break
        url = url.replace(this_value, next_value)
        this_value = next_value
        count += 1
    print utils.update_url(url, utils.return_this, source)
#! /usr/bin/env python
################################################################################
# RelMon: a tool for automatic Release Comparison
# https://twiki.cern.ch/twiki/bin/view/CMSPublic/RelMon
#
# Danilo Piparo CERN - [email protected]
################################################################################

from __future__ import print_function
from sys import argv, exit

from utils import wget

if __name__ == "__main__":
    argc = len(argv)
    if argc != 2:
        print("Usage %prog url-to-fetch")
        exit(1)
    url = argv[1]
    wget(url)
def post(self):
    url = self.get_argument("url", default=None, strip=False)
    if not url.startswith('http://'):
        url = 'http://' + url
    response = 'Nothing yet'
    # Fetch URL and extract content
    try:
        response = wget(url)
    except Exception:
        self.render("timeout.html")
        return
    # Remove newlines
    response = response.replace('\r\n', '').replace('\n', '')
    # Extract the content of the body HTML tag
    regex = re.compile('(?<=body).*?(?=<\/body>)')
    response = regex.findall(response)[0]
    # Remove HTML tags
    response = re.sub(r'<.*?>', ' ', response)
    # Remove prepositions and articles. (Note: plain substring replacement,
    # so these letter sequences are also stripped inside longer words.)
    prep_arr = [
        'on', 'in', 'at', 'since', 'for', 'ago', 'before', 'to', 'until',
        'till', 'by', 'off', 'about', 'from', 'onto', 'unto', 'into',
        'through', 'across', 'above', 'below', 'over', 'under', 'beside',
        'next', 'a', 'an', 'the', 'some', 'few', 'this', 'that', 'those',
        'these', 'how', 'why', 'what', 'who', 'when', 'there',
    ]
    for i in prep_arr:
        response = response.replace(i, '')
    # Count word frequencies (a word is 3+ letters or hyphens).
    wordcount = {}
    regex = re.compile('[a-zA-Z\-]{3,}')
    words = regex.findall(response)
    for word in words:
        if word not in wordcount:
            wordcount[word] = 1
        else:
            wordcount[word] += 1
    # Keep the top 100 words.
    tuples = sorted(wordcount.iteritems(), key=lambda (k, v): (v, k))
    tuples.reverse()
    count = 0
    wordcount = {}
    for i in tuples:
        wordcount[i[0]] = {'freq': i[1], 'fontSize': 1}
        count += 1
        if count >= 100:
            break
    # Find the highest frequency.
    high = 0
    for word in wordcount:
        high = max(high, wordcount[word]['freq'])
    # Scale each font size relative to the highest frequency.
    for word in wordcount:
        wordcount[word]['fontSize'] = \
            float(wordcount[word]['freq']) / float(high) * float(400)
    # Insert into the DB.
    db = MySQLdb.connect(
        unix_socket='/cloudsql/{}:{}'.format(CLOUDSQL_PROJECT, CLOUDSQL_INSTANCE),
        user='******',
        passwd='octo stuff is being setup',
        db='words')
    cursor = db.cursor()
    for word in wordcount:
        hashed_word = hashlib.sha512(word + SALT).hexdigest()
        # Encrypt the word itself; only its hash is stored in the clear.
        cipher_text = rsa.encrypt(word, rsa.PublicKey.load_pkcs1(PUBLIC_KEY))
        # Update or insert the row for this word hash.
        cursor.execute('select count(1) from entries where wordhash=%s',
                       (hashed_word,))
        if cursor.rowcount > 0:
            cursor.execute(
                'update entries set wordencrypt=%s, wordfreq=%s where wordhash=%s',
                (cipher_text, wordcount[word]['freq'], hashed_word))
        else:
            cursor.execute(
                'insert into entries (wordhash, wordencrypt, wordfreq) '
                'values (%s, %s, %s)',
                (hashed_word, cipher_text, wordcount[word]['freq']))
    db.commit()
    # Render HTML
    self.render("word.html", url=url, results=wordcount)
def wg(url):
    dest = join('/tmp', split(url)[1])
    if not exists(dest):
        wget(url, dest)
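# Example use of the cached-download helper above (hypothetical URL): the
# first call fetches to /tmp/archive.zip; later calls find the file already
# cached and skip the network entirely:
#
#     wg('http://example.com/data/archive.zip')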
def linksites():
    mkfriends()
    furls = {friends[f]: f for f in friends}
    for f in friends:
        if f == "local":
            continue
        # furl - remote friendslist url
        # lurl - remote thread index url
        # ffn - friendslist filename (friends.host)
        # nffn - new friendslist filename (friends.host.new)
        # lfn - thread index filename (list.host)
        # nlfn - new thread index filename (list.host.new)
        # changes - threads with new replies from self
        # hosts - hosts that need their index rewritten
        furl = "/".join([friends[f], "raw", "friends.txt"])
        # Legacy: rename /raw/ -> /api/
        lurl = "/".join([friends[f], "raw", "list.txt"])
        ffn = arc + "friends." + f
        if not os.path.exists(ffn):
            with open(ffn, "w") as fi:
                fi.write("")
        nffn = ffn + ".new"
        lfn = arc + "list." + f
        if not os.path.exists(lfn):
            with open(lfn, "w") as fi:
                fi.write("")
        nlfn = lfn + ".new"
        u.wget(furl, nffn)
        u.wget(lurl, nlfn)
        # Ideally, a list of [name, op] localreplies
        # is compared against the older version, and
        # if a difference is found, {common}/{thread}/{friend}
        # is downloaded, {common}/{thread} & {common} are then
        # rebuilt. This is contingent on {common} being a common
        # host between client and server.
        with open(nffn, "r") as nf:
            nf = [x.split() for x in nf.read().splitlines()]
        if len(nf) < 1:
            continue
        if len(nf[0][1]) < 6:
            continue  # This breaks if a friend URL is blank
        nfurls = {x[1]: x[0] for x in nf if len(x) > 1}
        common = {nfurls[x]: furls[x] for x in nfurls if x in furls}
        common2 = {common[f]: f for f in common}
        with open(lfn, "r") as oldl:
            oldl = [o.split() for o in oldl.read().splitlines()]
        with open(nlfn, "r") as newl:
            newl = [n.split() for n in newl.read().splitlines()]
        changes = []
        for n in newl:
            if n[0] not in common.keys():
                continue
            if not int(n[3]):
                continue
            if n in oldl:
                continue
            n = [common[n[0]], n[1], n[3]]
            changes.append(n)
        for c in changes:
            url = "/".join([friends[f], "raw", common2[c[0]], c[1], "local.txt"])
            ldir = "/".join(["./threads", c[0], c[1]])
            local = "/".join([ldir, f + ".txt"])
            if not os.path.isdir(ldir):
                os.mkdir(ldir)
            u.wget(url, local)
            mkthread(c[0], c[1])
        hosts = set([c[0] for c in changes])
        for b in hosts:
            mkhost(b)
        os.rename(nffn, ffn)
        os.rename(nlfn, lfn)
    mksite()
def snif(url, locate="tmp", async_=True, condition=is_type):
    """Main function to download files."""
    # (`async` was renamed to `async_` here; it became a reserved word in
    # Python 3.7.)
    if type(async_) == int and async_ > 2:
        # An integer above 2 sets the thread pool size explicitly.
        session = FuturesSession(executor=ThreadPoolExecutor(max_workers=async_))
    elif async_:
        session = FuturesSession()
    links = filter(condition, get_all_http_func(url))
    number_links = 0
    for link in links:
        number_links += 1
        try:
            if async_:
                session.get(link).add_done_callback(
                    lambda future: wget_future(future, subfolder=locate))
            else:
                wget(link, subfolder=locate)
        except ConnectionError:
            print("Problem with the link: " + link)
            continue
    print(number_links, "downloadable items.")


if __name__ == "__main__":
    if len(sys.argv) > 2:
        cd(pwd())
        snif(sys.argv[1], sys.argv[2])
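# Example invocation of `snif` above (hypothetical URL): download every file
# matching the default condition linked from the page into ./tmp, with an
# explicit pool of 8 worker threads:
#
#     snif('http://example.com/downloads/', locate='tmp', async_=8)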
#!/usr/bin/env python
url = "http://www.pythonchallenge.com/pc/def/integrity.html"
title = "working hard?"
auth_popup_message = "inflate"
url2 = "http://www.pythonchallenge.com/pc/return/good.html"

if __name__ == "__main__":
    import bz2
    import re
    import utils

    pattern = re.compile(r"<!--\nun: '(.+)'\s+pw: '(.+)'")
    un, pw = re.findall(pattern, utils.wget(url))[0]
    # The credentials are bz2-compressed, string-escaped byte strings.
    print "un: %s" % bz2.decompress(un.decode('string_escape'))
    print "pw: %s" % bz2.decompress(pw.decode('string_escape'))
#! /usr/bin/env python
################################################################################
# RelMon: a tool for automatic Release Comparison
# https://twiki.cern.ch/twiki/bin/view/CMSPublic/RelMon
#
# Danilo Piparo CERN - [email protected]
################################################################################

from sys import argv, exit
from utils import wget

if __name__ == "__main__":
    argc = len(argv)
    if argc != 2:
        print "Usage %prog url-to-fetch"
        exit(1)
    url = argv[1]
    wget(url)
    exit(-1)

dirtolist = ""
if lenargs == 1:
    dirtolist = args[0]

mode = "relval"
if options.online:
    mode = "online"
if options.development:
    mode = "dev"

directory = "%s/dqm/%s/data/browse/%s" % (server, mode, dirtolist)
print "peeping ", directory
contents = extract_list(get_page(directory), server, options.show_url)

if len(contents) == 0:
    print "No contents found!"

for content in contents:
    if not options.get and search(options.path, content):
        print content
    if options.get and options.show_url and len(options.path) > 0 \
            and search(options.path, content):
        if not search('pre', options.path) and search('pre', content):
            continue
        bcontent = basename(content)
        print "Getting %s" % bcontent
        wget(content)
        print "Got %s!!" % bcontent
def import_order(url):
    digikey = find_distributor('Digi-Key')
    file = '/tmp/order.html'
    wget(url, file)
    soup = BeautifulSoup(open(file))
    web_id_tag = soup.find(is_span_lblWebID)
    web_id = web_id_tag.get_text(strip=True)
    salesorder_number_tag = soup.find(is_span_lblSalesorderNumber)
    salesorder_number = salesorder_number_tag.get_text(strip=True)
    submitted_tag = soup.find(is_span_lblSubmitted)
    submitted = submitted_tag.get_text(strip=True)
    order = parse_order(url, soup)
    items = [item for item in order if len(item) == 9]
    print web_id, salesorder_number, submitted, items
    # FIX: replace by populate_order
    try:
        return get('/order/find/', {'dist': digikey['id'],
                                    'order_number': salesorder_number})
    except Http404:
        price = None
        for item in order:
            if len(item) == 3:
                if item[1] == 'Subtotal':
                    subtotal = item[2].replace('$', '')
                elif item[1] == 'Total':
                    m = re.match(r'\$([0-9]+.[0-9]+)', item[2])
                    if m:
                        price = m.group(1)
        # The submitted date is %m/%d/%Y; store it as ISO YYYY-MM-DD.
        m = re.search(r'([0-9]+)/([0-9]+)/([0-9]+)', submitted)
        o = create_order(dist=digikey, order_number=salesorder_number, url=url,
                         date=m.group(3) + '-' + m.group(1) + '-' + m.group(2),
                         price=price if price else subtotal)
        if not price:
            create_order_property(order=o, name='Import Note',
                                  value='Total missing, used Subtotal for Order Price')
        create_order_property(order=o, name='Salesorder Number',
                              value=salesorder_number)
        create_order_property(order=o, name='Web ID', value=web_id)
        import_failures = []
        for item in items:
            (index, quantity, dist_part_num, desc,
             _, _, _, unit_price, ext_price) = item
            try:
                unit_price = Decimal(unit_price.replace('$', ''))
                ext_price = Decimal(ext_price.replace('$', ''))
                quantity = int(quantity.replace('NCNR', ''))
                print 'importing ' + dist_part_num
                dp = import_digikey_part(dist_part_num)
                p = get_part(dp['part'])
                create_part_history(order=o, part=p, quantity=quantity,
                                    ext_price=ext_price)
            except Exception, e:
                print 'import failed:', repr(e)
                import_failures.append(dist_part_num)
dirtolist = ""
if lenargs == 1:
    dirtolist = args[0]

mode = "relval"
if options.online:
    mode = "online"
if options.development:
    mode = "dev"

directory = "%s/dqm/%s/data/browse/%s" % (server, mode, dirtolist)
print("peeping ", directory)
contents = extract_list(get_page(directory), server, options.show_url)

if len(contents) == 0:
    print("No contents found!")

for content in contents:
    if not options.get and search(options.path, content):
        print(content)
    if options.get and options.show_url and len(options.path) > 0 \
            and search(options.path, content):
        if not search('pre', options.path) and search('pre', content):
            continue
        bcontent = basename(content)
        print("Getting %s" % bcontent)
        wget(content)
        print("Got %s!!" % bcontent)
#!/usr/bin/env python
url = "http://www.pythonchallenge.com/pc/def/peak.html"
url = "http://www.pythonchallenge.com/pc/def/banner.p"

if __name__ == "__main__":
    import pickle
    import utils

    data = pickle.loads(utils.wget(url))
    # Got help from below in recognising data as a run-length encoding!
    # http://unixwars.com/2007/09/11/python-challenge-level-5-peak-hell/
    for line in data:
        print ''.join(map(lambda pair: pair[0] * pair[1], line))
#!/usr/bin/env python
url = "http://www.pythonchallenge.com/pc/def/channel.zip"

if __name__ == "__main__":
    import re
    import StringIO
    import zipfile
    import utils

    zip_data = utils.wget(url)
    filelike = StringIO.StringIO(zip_data)
    zf = zipfile.ZipFile(filelike, 'r')
    readme_text = zf.open('readme.txt').read()
    start_filename_pattern = re.compile(r'start from (\d+)')
    start_filename = re.search(start_filename_pattern, readme_text).group(1)
    comments = ''
    this_filename = start_filename
    count = 0
    next_filename_pattern = re.compile(r'Next nothing is (\d+)')
    while count <= len(zf.namelist()):
        # Collect each archive member's comment while following the chain.
        comments += zf.getinfo('%s.txt' % this_filename).comment
        text = zf.open('%s.txt' % this_filename).read()
        print "%d: %s.txt: %s" % (count, this_filename, text)
        mo = re.search(next_filename_pattern, text)
        if mo:
            next_filename = mo.group(1)
        else:
            # End of the chain; the collected comments spell out the answer.
            # (The snippet was truncated here; this loop tail is reconstructed
            # to match the chain-following pattern of the earlier scripts.)
            break
        this_filename = next_filename
        count += 1
    print comments