Example #1
    def download_images_from_webpage(self, href, out_path=None, img_name=None):
        ret = False
        print href
        sub_html = utils.wget(href)

        if not sub_html:
            print 'WARNING: request to %s failed.' % href
        else:
            ret = True
            # get the jpg image in the page
            #image_urls = re.findall(ur'<img [^>]*src="([^"]*?\.jpg)"[^>]*>', sub_html)
            #print sub_html
            image_urls = re.findall(ur'<img [^>]*src\s*=\s*"([^"]*?)"[^>]*?>', sub_html)
            print image_urls
            for image_url in image_urls:
                if not image_url.startswith('/'):
                    image_url = re.sub(ur'/[^/]*$', '/' + image_url, href)
                else:
                    image_url = re.sub(ur'^(.*?//.*?/).*$', r'\1' + image_url, href)
                print image_url

                # get the image
                image = utils.wget(image_url)

                if not image:
                    print 'WARNING: request to %s failed.' % image_url
                else:
                    # save it
                    image_path = os.path.join(out_path, img_name or re.sub(ur'^.*/', '', image_url)) + ''
                    print image_path
                    utils.write_file(image_path, image)

        return ret
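
Note: these examples call a project-local wget helper (utils.wget here), not the wget command-line tool. Its implementation is not shown on this page; in Example #1 it is called with a single URL and evidently returns the page body on success and something falsy on failure. A minimal sketch of such a helper, assuming Python 3 and the standard-library urllib.request (the actual projects may implement it differently):

# Hypothetical sketch only -- not the actual utils.wget from these projects.
import urllib.request
import urllib.error

def wget(url, timeout=30):
    """Fetch url and return the response body as bytes, or None on failure."""
    try:
        with urllib.request.urlopen(url, timeout=timeout) as response:
            return response.read()
    except (urllib.error.URLError, OSError):
        return None
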
Example #2
File: ci.py Project: rapgro/enki
 def install_Windows(self):
     # ctags
     ctags_zip = os.path.join(DOWNLOADS, CTAGS_VER + '.zip')
     if not isfile(ctags_zip):
         wget('http://sourceforge.net/projects/ctags/files/ctags/5.8/{}.zip'.
              format(CTAGS_VER), ctags_zip)
     unzip(ctags_zip, CTAGS_VER + '/ctags.exe')
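
Most of the later examples use this two-argument form, wget(url, dest), which saves the response to a local file and is usually guarded by an isfile()/os.path.exists() check so that cached downloads are reused (Examples #8, #10 and #13 pass the destination as dest_fname=...). A minimal sketch of that variant, under the same assumptions as the sketch above:

# Hypothetical sketch only -- not the actual wget helper from these projects.
import urllib.request

def wget(url, dest_fname):
    """Download url and write the response body to dest_fname."""
    with urllib.request.urlopen(url) as response, \
            open(dest_fname, 'wb') as out:
        out.write(response.read())
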
Example #3
def import_orders():
    url = 'https://mouser.com/OrderHistory/OrdersView.aspx'
    file = '/tmp/orders.html'
    wget(url, file)
    
    orders = parse_orders(url, file)
    
    for order in orders:
        (cols, url) = order
        import_order(url)
Example #4
def import_orders():
    url = 'https://www.digikey.com/classic/registereduser/WebOrderHistory.aspx'
    file = '/tmp/orders.html'
    wget(url, file)
    
    orders = parse_orders(url, file)
    
    for order in orders:
        (cols, url) = order
        import_order(url)
Example #5
def install(should_identify=True):
    # Based on ideas from https://github.com/harvimt/quamash/blob/master/.travis.yml
    if should_identify:
        system_identify()
    td = Travis_Dispatcher()
    xqt(
      # Cached Downloads
      'sudo mkdir -p /downloads',
      'sudo chmod a+rw /downloads')
    sip_ver = 'sip-4.17'
    if not isfile('/downloads/sip.tar.gz'):
        wget('http://downloads.sourceforge.net/project/pyqt/sip/{}/{}'.
             format(sip_ver, _gz(sip_ver)), '/downloads/sip.tar.gz')
    # _`pyqt_ver`: Select a PyQt version. See also qt5_Linux_ and qt5_OS_X_.
    pyqt_ver = '5.5.1'
    pyqt_gpl_ver = 'PyQt-gpl-' + pyqt_ver
    if not isfile('/downloads/pyqt5.tar.gz'):
        wget('http://downloads.sourceforge.net/project/pyqt/PyQt5/PyQt-{}/{}'.
             format(pyqt_ver, _gz(pyqt_gpl_ver)), '/downloads/pyqt5.tar.gz')
    # Builds
    xqt('sudo mkdir -p /builds',
      'sudo chmod a+rw /builds')

    # Qt5
    td.qt5()

    # SIP. With Linux or OS_X, don't use the package manager to install these,
    # since they're installed for the system python, not the pyenv version
    # we're testing with.
    with pushd('/builds'):
        xqt('tar xzf /downloads/sip.tar.gz --keep-newer-files')
        chdir(sip_ver)
        xqt('python configure.py',
          'make',
          'sudo make install')

    # PyQt5
    with pushd('/builds'):
        xqt('tar xzf /downloads/pyqt5.tar.gz --keep-newer-files')
        chdir(pyqt_gpl_ver)
        td.pyqt5_configure()
        xqt('make',
          'sudo make install')

    # PCRE
    td.pcre()

    # Qutepart
    if build_os == 'Linux':
        set_display()
        xqt('sh -e /etc/init.d/xvfb start')
    # Install, which also builds Python C extensions. Use this instead of
    # ``build_ext`` so that Enki will have an already-installed qutepart,
    # rather than needing to regenerate the command below.
    xqt('python setup.py install')
Example #6
def import_order(url):
    mouser = populate_distributor('Mouser',
                                  'http://mouser.com/')
    
    file = '/tmp/order.html'
    wget(url, file)
    
    (sales_order_num, web_order_num, order_date, items, price) = parse_order(url, file)
    
    try:
        return get('/order/find/',
                   {'dist': mouser['id'],
                    'order_number': sales_order_num})
    except Http404:
        # %m/%d/%Y
        m = re.search(r'([0-9]+)/([0-9]+)/([0-9]+)', order_date)
        
        o = create_order(dist=mouser,
                         order_number=sales_order_num,
                         url=url,
                         date=m.group(3) + '-' + m.group(1) + '-' + m.group(2),
                         price=price)
    
    create_order_property(order=o,
                          name='Sales Order Number',
                          value=sales_order_num)
    create_order_property(order=o,
                          name='Web Order Number',
                          value=web_order_num)
    
    import_failures = []
    for item in items:
        (part_url, dist_part_num) = item[0]
        
        quantity = item[2]
        price = item[3].replace('$', '')
        ext_price = item[4].replace('$', '')

        try:
            print 'importing ' + dist_part_num
            dp = import_mouser_part(dist_part_num)
            
            p = get_part(dp['part'])
            
            create_part_history(order=o,
                                part=p,
                                quantity=quantity,
                                ext_price=ext_price)
        except Exception, e:
            print 'import failed:', repr(e)
            import_failures.append (dist_part_num)
Example #7
def import_contours():
    wget(env.contours_fichier)
    run(u'unzip communes-20150101-5m-shp.zip')
    table_name = 'communes-20150101-5m'
    drop_table_communes(env.conf_api.SQLALCHEMY_DATABASE_URI, table_name)
    run(u'shp2pgsql {} > communes.sql'.format(table_name))
    run(u'psql {} -f communes.sql'.format(env.conf_api.SQLALCHEMY_DATABASE_URI))
    run(u"""psql {} -c 'INSERT INTO \"ZUPC\" (nom, insee, shape,active)
            SELECT nom, insee, geom, false FROM \"{}\";'
            """.format(env.conf_api.SQLALCHEMY_DATABASE_URI, table_name))
    require.files.file('sql_update',
            contents="""UPDATE "ZUPC" SET departement_id = sub.id FROM
                (SELECT id, numero FROM departement) AS sub
                WHERE insee LIKE sub.numero||\'%\';""")
    run('psql {} -f /tmp/zupc/sql_update '.format(env.conf_api.SQLALCHEMY_DATABASE_URI))
Example #8
def download_osmesa():
    import os, re, zipfile
    from utils import wget
    mesa_dir = os.path.join(context_dir,'OSMesa')
    if not os.path.exists(mesa_dir):
        sysinfo = platform.uname()
        osmesa_fname = 'OSMesa.%s.%s.zip' % (sysinfo[0], sysinfo[-2])
        zip_fname = os.path.join(context_dir, osmesa_fname)
        if not os.path.exists(zip_fname):
            print "Downloading %s" % osmesa_fname
            # MPI url: http://files.is.tue.mpg.de/mloper/opendr/osmesa/%s
            # BL url: https://s3.amazonaws.com/bodylabs-assets/public/osmesa/%s
            wget('http://files.is.tue.mpg.de/mloper/opendr/osmesa/%s' % (osmesa_fname,), dest_fname=zip_fname)
        assert(os.path.exists(zip_fname))
        with zipfile.ZipFile(zip_fname, 'r') as z:
            for f in filter(lambda x: re.search('[ah]$', x), z.namelist()):
                z.extract(f, path=context_dir)
        assert(os.path.exists(mesa_dir))
Example #9
def install(should_identify=True):
    if should_identify:
        system_identify()

    # Create a place to store downloads.
    if not isdir(DOWNLOADS):
        mkdir(DOWNLOADS)

    # Download and install PyQt5. Only download if we don't have a cached copy
    # available.
    install_PyQt5 = os.path.join(DOWNLOADS, 'install-PyQt5.exe')
    if not isfile(install_PyQt5):
        wget('http://downloads.sourceforge.net/project/pyqt/PyQt5/PyQt-5.5.1/'
             'PyQt5-5.5.1-gpl-Py3.4-Qt5.5.1-x32.exe',
              install_PyQt5)
    # See https://github.com/appveyor/ci/issues/363#issuecomment-148915001.
    xqt('REG ADD HKCU\\Software\\Python\\PythonCore\\3.4\\InstallPath /f /ve '
        '/t REG_SZ /d C:\\Python34',
      install_PyQt5 + ' /S')

    # Download and compile PCRE.
    pcre_ver = 'pcre-8.38'
    pcre_zip = pcre_ver + '.zip'
    pcre_zip_path = os.path.join(DOWNLOADS, pcre_zip)
    if not isfile(pcre_zip_path):
        # Note: Don't use ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/,
        # because this sometimes hangs during download, causing the build to
        # fail. Instead, use the more reliable SourceForge mirror.
        wget('http://downloads.sourceforge.net/project/pcre/pcre/8.38/' +
             pcre_zip, pcre_zip_path)
    # See https://sevenzip.osdn.jp/chm/cmdline/commands/extract_full.htm.
    xqt('7z x {} > nul'.format(pcre_zip_path))
    with pushd(pcre_ver):
        mkdir('build')
        chdir('build')
        xqt('cmake .. -DBUILD_SHARED_LIBS:BOOL=OFF -DPCRE_SUPPORT_UTF:BOOL=ON '
            '-DPCRE_SUPPORT_JIT:BOOL=ON -G "Visual Studio 10 2010"',
          'cmake --build . --config Release')

    # Install, which also builds Python C extensions. Use this instead of
    # ``build_ext`` so that Enki will have an already-installed qutepart,
    # rather than needing to regenerate the command below.
    xqt('python setup.py install --include-dir={}/build '
        '--lib-dir={}/build/Release --force'.format(pcre_ver, pcre_ver))
Example #10
def download_osmesa():
    import os, re, zipfile
    from utils import wget
    mesa_dir = os.path.join(context_dir, 'OSMesa')
    if not os.path.exists(mesa_dir):
        sysinfo = platform.uname()
        osmesa_fname = 'OSMesa.%s.%s.zip' % (sysinfo[0], sysinfo[-2])
        zip_fname = os.path.join(context_dir, osmesa_fname)
        if not os.path.exists(zip_fname):
            print "Downloading %s" % osmesa_fname
            # MPI url: http://files.is.tue.mpg.de/mloper/opendr/osmesa/%s
            # BL url: https://s3.amazonaws.com/bodylabs-assets/public/osmesa/%s
            wget('http://files.is.tue.mpg.de/mloper/opendr/osmesa/%s' %
                 (osmesa_fname, ),
                 dest_fname=zip_fname)
        assert (os.path.exists(zip_fname))
        with zipfile.ZipFile(zip_fname, 'r') as z:
            for f in filter(lambda x: re.search('[ah]$', x), z.namelist()):
                z.extract(f, path=context_dir)
        assert (os.path.exists(mesa_dir))
Example #11
def install(should_identify=True):
    if should_identify:
        system_identify()

    # Create a place to store downloads.
    if not isdir(DOWNLOADS):
        mkdir(DOWNLOADS)

    # Download and compile PCRE.
    pcre_raw_ver = '8.42'
    pcre_ver = 'pcre-' + pcre_raw_ver
    pcre_zip = pcre_ver + '.zip'
    pcre_zip_path = os.path.join(DOWNLOADS, pcre_zip)
    if not isfile(pcre_zip_path):
        # Note: Don't use ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/,
        # because this sometimes hangs during download, causing the build to
        # fail. Instead, use the more reliable SourceForge mirror.
        wget(
            'http://downloads.sourceforge.net/project/pcre/pcre/{}/{}'.format(
                pcre_raw_ver, pcre_zip), pcre_zip_path)
    # See https://sevenzip.osdn.jp/chm/cmdline/commands/extract_full.htm.
    xqt('7z x {} > nul'.format(pcre_zip_path))
    with pushd(pcre_ver):
        mkdir('build')
        chdir('build')
        # Per https://cmake.org/cmake/help/latest/generator/Visual%20Studio%2014%202015.html,
        # add the Win64 string for 64-bit Python.
        use_Win64 = ' Win64' if is_64bits else ''
        xqt(
            'cmake .. -DBUILD_SHARED_LIBS:BOOL=OFF -DPCRE_SUPPORT_UTF:BOOL=ON '
            '-DPCRE_SUPPORT_JIT:BOOL=ON -G "Visual Studio 14 2015{}"'.format(
                use_Win64), 'cmake --build . --config Release')

    # First, build Python C extensions. Use this instead of
    # ``build_ext`` so that Enki will have an already-installed qutepart,
    # rather than needing to regenerate the command below.
    xqt('python setup.py build_ext --include-dir={}/build '
        '--lib-dir={}/build/Release --force'.format(pcre_ver, pcre_ver))
    # Next, install it along with its dependencies. See comments at
    # ``install_requires`` on why this is necessary.
    xqt('python -m pip install -e .')
Example #12
def import_zupc(import_='True'):
    require.files.directory('/tmp/zupc')
    with cd('/tmp/zupc/'):
        for f in list_dir():
            if f == '*' or f.endswith('zip'):
                continue
            run('rm -f {}'.format(f))
        if import_=='True':
            import_contours()

    base_dir = ''
    with cd(env.deploy_dir):
        for f in list_dir():
            if files.is_dir(f) and 'deployment' in f and f > base_dir:
                base_dir = f
    api_dir = env.deploy_dir+'/' + base_dir
    with cd('/tmp/zupc'):
        wget(env.zupc_fichier)
    with python.virtualenv(base_dir + '/venvAPITaxi'), cd(base_dir+'/APITaxi-master'):
        with shell_env(APITAXI_CONFIG_FILE='prod_settings.py'):
            run('python manage.py load_zupc /tmp/zupc/zupc.geojson')
Example #13
def download_osmesa():
    import os, re, zipfile
    from utils import wget
    mesa_dir = os.path.join(context_dir,'OSMesa')
    if not os.path.exists(mesa_dir):
        sysinfo = platform.uname()
        osmesa_fname = 'OSMesa.%s.%s.zip' % (sysinfo[0], sysinfo[-2])
        zip_fname = os.path.join(context_dir, osmesa_fname)
        if not os.path.exists(zip_fname):
            for base_url in osmesa_mirrors:
                print "Downloading %s" % (base_url + osmesa_fname, )
                try:
                    wget(base_url + osmesa_fname, dest_fname=zip_fname)
                    break
                except Exception:
                    print "File not found, trying mirrors"
        assert(os.path.exists(zip_fname))
        with zipfile.ZipFile(zip_fname, 'r') as z:
            for f in filter(lambda x: re.search('[ah]$', x), z.namelist()):
                z.extract(f, path=context_dir)
        assert(os.path.exists(mesa_dir))
Example #14
def ldhost(host, write=0):
    # Generate new host index based on existing thread indexes.
    meta = ["head.txt", "list.txt"]
    tdir = "/".join(["./threads", host])
    indpath = "/".join([tdir, meta[1]])
    threads = [x.path for x in os.scandir(tdir) if x.is_dir()]
    bind = []  # first, last, local, total, title
    for thread in threads:
        info = "/".join([thread, meta[0]])
        replies = "/".join([thread, meta[1]])
        if not os.path.isfile(info):
            t = thread.split("/")[-1]
            orig = "/".join([friends[host], "raw", "local", t, "head.txt"])
            u.wget(orig, info)
        with open(info, "r") as info:
            info = info.read().strip()
        if len(info) == 0:
            continue
        info = info.splitlines()[0]
        with open(replies, "r") as replies:
            replies = replies.read().splitlines()
        replies = [r.split(" ") for r in replies]
        breps = [r[0] for r in replies]
        try:
            int(replies[0][1])
            int(replies[-1][1])
        except:
            continue
        tline = [
            replies[0][1], replies[-1][1],
            str(breps.count("local")),
            str(len(replies)), info
        ]
        bind.append(tline)
    bind.sort(key=lambda x: x[1], reverse=1)
    if not write:
        return bind
    bind = "\n".join([" ".join(t) for t in bind])
    with open(indpath, "w") as ind:
        ind.write(bind)
Example #15
def install(should_identify=True):
    if should_identify:
        system_identify()

    # Create a place to store downloads.
    if not isdir(DOWNLOADS):
        mkdir(DOWNLOADS)

    # Download and compile PCRE.
    pcre_raw_ver = '8.39'
    pcre_ver = 'pcre-' + pcre_raw_ver
    pcre_zip = pcre_ver + '.zip'
    pcre_zip_path = os.path.join(DOWNLOADS, pcre_zip)
    if not isfile(pcre_zip_path):
        # Note: Don't use ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/,
        # because this sometimes hangs during download, causing the build to
        # fail. Instead, use the more reliable SourceForge mirror.
        wget('http://downloads.sourceforge.net/project/pcre/pcre/{}/{}'.
            format(pcre_raw_ver, pcre_zip), pcre_zip_path)
    # See https://sevenzip.osdn.jp/chm/cmdline/commands/extract_full.htm.
    xqt('7z x {} > nul'.format(pcre_zip_path))
    with pushd(pcre_ver):
        mkdir('build')
        chdir('build')
        # Per https://cmake.org/cmake/help/latest/generator/Visual%20Studio%2014%202015.html,
        # add the Win64 string for 64-bit Python.
        use_Win64 = ' Win64' if is_64bits else ''
        xqt('cmake .. -DBUILD_SHARED_LIBS:BOOL=OFF -DPCRE_SUPPORT_UTF:BOOL=ON '
            '-DPCRE_SUPPORT_JIT:BOOL=ON -G "Visual Studio 14 2015{}"'.
            format(use_Win64),
          'cmake --build . --config Release')

    # First, build Python C extensions. Use this instead of
    # ``build_ext`` so that Enki will have an already-installed qutepart,
    # rather than needing to regenerate the command below.
    xqt('python setup.py build_ext --include-dir={}/build '
        '--lib-dir={}/build/Release --force'.format(pcre_ver, pcre_ver))
    # Next, install it along with its dependencies. See comments at
    # ``install_requires`` on why this is necessary.
    xqt('python -m pip install -e .')
Example #16
    def download_images_from_webpage(self, href, out_path=None, img_name=None):
        ret = False
        print href
        sub_html = utils.wget(href)

        if not sub_html:
            print 'WARNING: request to %s failed.' % href
        else:
            ret = True
            # get the jpg image in the page
            #image_urls = re.findall(ur'<img [^>]*src="([^"]*?\.jpg)"[^>]*>', sub_html)
            #print sub_html
            image_urls = re.findall(ur'<img [^>]*src\s*=\s*"([^"]*?)"[^>]*?>',
                                    sub_html)
            print image_urls
            for image_url in image_urls:
                if not image_url.startswith('/'):
                    image_url = re.sub(ur'/[^/]*$', '/' + image_url, href)
                else:
                    image_url = re.sub(ur'^(.*?//.*?/).*$', r'\1' + image_url,
                                       href)
                print image_url

                # get the image
                image = utils.wget(image_url)

                if not image:
                    print 'WARNING: request to %s failed.' % image_url
                else:
                    # save it
                    image_path = os.path.join(
                        out_path, img_name
                        or re.sub(ur'^.*/', '', image_url)) + ''
                    print image_path
                    utils.write_file(image_path, image)

        return ret
Example #17
    def fetch_old(self, *args, **options):
        '''
            fetch http://zzz//P3.html --links-file "bible1" --op=img1

            Will save all the jpg images found at that address into a directory called img1.
            We first download the index from that address, then follow each link whose name is listed in the bible1 file.
            Download all the jpg images found in those sub-pages.
        '''
        out_path = options['out_path']

        if len(args) > 1:
            url = args[1]
            print url

            if options['links']:
                links = options['links'].split(' ')

            if options['links_file']:
                f = open(options['links_file'], 'rb')
                links = f.readlines()
                f.close()
                links = [link.strip() for link in links]

            if links:
                html = utils.wget(url)
                if not html:
                    print 'ERROR: request to %s failed.' % url
                else:
                    for link in links:
                        print link
                        href = re.findall(
                            ur'<a [^>]*href="([^"]*?)"[^>]*>\s*' +
                            re.escape(link) + '\s*<', html)
                        if href:
                            href = href[0]
                            href = re.sub(ur'/[^/]*$', '/' + href, url)
                            print href

                            self.download_images_from_webpage(href, out_path)
Example #18
    def fetch_old(self, *args, **options):
        '''
            fetch http://zzz//P3.html --links-file "bible1" --op=img1

            Will save all the jpg images found at that address into a directory called img1.
            We first download the index from that address, then follow each link whose name is listed in the bible1 file.
            Download all the jpg images found in those sub-pages.
        '''
        out_path = options['out_path']

        if len(args) > 1:
            url = args[1]
            print url

            if options['links']:
                links = options['links'].split(' ')

            if options['links_file']:
                f = open(options['links_file'], 'rb')
                links = f.readlines()
                f.close()
                links = [link.strip() for link in links]

            if links:
                html = utils.wget(url)
                if not html:
                    print 'ERROR: request to %s failed.' % url
                else:
                    for link in links:
                        print link
                        href = re.findall(ur'<a [^>]*href="([^"]*?)"[^>]*>\s*' + re.escape(link) + '\s*<', html)
                        if href:
                            href = href[0]
                            href = re.sub(ur'/[^/]*$', '/' + href, url)
                            print href

                            self.download_images_from_webpage(href, out_path)
Example #19
#!/usr/bin/env python

import re
import string

import utils

url = "http://www.pythonchallenge.com/pc/def/ocr.html"

if __name__ == "__main__":
    text_pattern = re.compile(
            r'<!--\nfind rare characters in the mess below:\n-->\n\n<!--\n(.*?)\n-->', 
            re.DOTALL)
    text = re.search(text_pattern, utils.wget(url)).group(1)

    new_filename = ''
    for char in text:
        if char in string.letters:
            new_filename += char

    print utils.update_url(url, utils.return_this, new_filename)

Example #20
import re
import string

import utils

url = "http://www.pythonchallenge.com/pc/def/linkedlist.html"
url = "http://www.pythonchallenge.com/pc/def/linkedlist.php"
url = "http://www.pythonchallenge.com/pc/def/linkedlist.php?nothing=12345"

if __name__ == "__main__":
    value_pattern = re.compile(r'and the next nothing is (\d+)')

    this_value = url[-5:]
    count = 0;
    while count <= 400:
        source = utils.wget(url)
        #print "%d: %s" % (count, source)
        mo = re.search(value_pattern, source)
        if mo:
            next_value = mo.group(1)
        elif source.find('Divide by two') >= 0:
            next_value = str(int(this_value) / 2)
        else:
            print "%d: %s" % (count, source)
            break
        url = url.replace(this_value, next_value)
        this_value = next_value
        count += 1

    print utils.update_url(url, utils.return_this, source)
Example #21
#! /usr/bin/env python
################################################################################
# RelMon: a tool for automatic Release Comparison
# https://twiki.cern.ch/twiki/bin/view/CMSPublic/RelMon
#
#
#
# Danilo Piparo CERN - [email protected]
#
################################################################################

from __future__ import print_function
from sys import argv, exit
from utils import wget

if __name__ == "__main__":
    argc = len(argv)
    if argc != 2:
        print("Usage %prog url-to-fetch")
        exit(1)

    url = argv[1]
    wget(url)
Example #22
    def post(self):
        url = self.get_argument("url", default=None, strip=False)
        if not url.startswith('http://'):
            url = 'http://' + url
        response = 'Nothing yet'

        # Fetch URL and extract content
        try:
            response = wget(url)
        except:
            self.render("timeout.html", )
            return

        response = response.replace('\r\n', '').replace('\n',
                                                        '')  # Remove newlines
        regex = re.compile(
            '(?<=body).*?(?=<\/body>)')  # Extract content of body HTML tag
        response = regex.findall(response)[0]
        response = re.sub(r'<.*?>', ' ', response)  # Remove HTML tags

        # Remove prepositions and articles
        prep_arr = [
            'on',
            'in',
            'at',
            'since',
            'for',
            'ago',
            'before',
            'to',
            'until',
            'till',
            'by',
            'off',
            'about',
            'from',
            'onto',
            'unto',
            'into',
            'through',
            'across',
            'above',
            'below',
            'over',
            'under',
            'beside',
            'next',
            'a',
            'an',
            'the',
            'some',
            'few',
            'this',
            'that',
            'those',
            'these',
            'how',
            'why',
            'what',
            'who',
            'when',
            'there',
        ]
        for i in prep_arr:
            response = response.replace(i, '')

        # Create word dictionary and frequency
        wordcount = {}
        regex = re.compile('[a-zA-Z\-]{3,}')
        words = regex.findall(response)
        for word in words:
            if word not in wordcount:
                #wordcount[word] = {'freq': 1, 'fontSize': 1}
                wordcount[word] = 1
            else:
                wordcount[word] += 1

        # Find top 100 words
        tuples = sorted(wordcount.iteritems(), key=lambda (k, v): (v, k))
        tuples.reverse()
        count = 0
        wordcount = {}
        for i in tuples:
            wordcount[i[0]] = {'freq': i[1], 'fontSize': 1}
            count += 1
            if count >= 100:
                break

        # Find highest value
        high = 0
        for word in wordcount:
            high = max(high, wordcount[word]['freq'])

        # Define font size relative to the highest value
        for word in wordcount:
            wordcount[word]['fontSize'] = float(
                wordcount[word]['freq']) / float(high) * float(400)

        # Insert into DB
        db = MySQLdb.connect(unix_socket='/cloudsql/{}:{}'.format(
            CLOUDSQL_PROJECT, CLOUDSQL_INSTANCE),
                             user='******',
                             passwd='octo stuff is being setup',
                             db='words')
        cursor = db.cursor()
        for word in wordcount:
            #salt = uuid.uuid4().hex
            hashed_word = hashlib.sha512(word + SALT).hexdigest()

            # Encryption
            cipher_text = rsa.encrypt(word,
                                      rsa.PublicKey.load_pkcs1(PUBLIC_KEY))

            # Update/insert rows
            cursor.execute('select count(1) from entries where wordhash=%s',
                           (hashed_word, ))
            if cursor.rowcount > 0:
                cursor.execute(
                    'update entries set wordencrypt=%s, wordfreq=%s where wordhash=%s',
                    (cipher_text, wordcount[word]['freq'], hashed_word))
            else:
                cursor.execute(
                    'insert into entries (wordhash, wordencrypt, wordfreq) values (%s, %s, %s)',
                    (hashed_word, cipher_text, wordcount[word]['freq']))

            db.commit()

        # Render HTML
        self.render("word.html", url=url, results=wordcount)
Example #23
 def wg(url):
     dest = join('/tmp', split(url)[1])
     if not exists(dest):
         wget(url, dest)
Example #24
 def wg(url):
     dest = join("/tmp", split(url)[1])
     if not exists(dest):
         wget(url, dest)
Example #25
    def post(self):
        url = self.get_argument("url", default=None, strip=False)
        if not url.startswith('http://'):
            url = 'http://' + url
        response = 'Nothing yet'

        # Fetch URL and extract content
        try:
            response = wget(url)
        except:
            self.render("timeout.html", )
            return

        response = response.replace('\r\n','').replace('\n', '')    # Remove newlines
        regex = re.compile('(?<=body).*?(?=<\/body>)')              # Extract content of body HTML tag
        response = regex.findall(response)[0]                       
        response = re.sub(r'<.*?>', ' ', response)                  # Remove HTML tags

        # Remove prepositions and articles
        prep_arr = [
                'on', 'in', 'at', 'since', 'for', 'ago', 'before', 'to', 'until', 'till', 'by',
                'off', 'about', 'from', 'onto', 'unto', 'into', 'through', 'across', 
                'above', 'below', 'over', 'under', 'beside', 'next',
                'a', 'an', 'the', 'some', 'few', 'this', 'that', 'those', 'these',
                'how', 'why', 'what', 'who', 'when', 'there', ]
        for i in prep_arr:
            response = response.replace(i, '')

        # Create word dictionary and frequency
        wordcount = {}
        regex = re.compile('[a-zA-Z\-]{3,}')
        words = regex.findall(response)
        for word in words:
            if word not in wordcount:
                #wordcount[word] = {'freq': 1, 'fontSize': 1}
                wordcount[word] = 1
            else:
                wordcount[word] += 1

        # Find top 100 words
        tuples = sorted(wordcount.iteritems(), key=lambda (k,v):(v,k))
        tuples.reverse()
        count = 0
        wordcount = {}
        for i in tuples:
            wordcount[i[0]] = {'freq': i[1], 'fontSize': 1}
            count += 1
            if count >= 100:
                break

        # Find highest value
        high = 0
        for word in wordcount:
            high = max(high, wordcount[word]['freq'])
        
        # Define font size relative to the highest value
        for word in wordcount:
            wordcount[word]['fontSize'] = float(wordcount[word]['freq']) / float(high) * float(400)

        # Insert into DB
        db = MySQLdb.connect(
                unix_socket='/cloudsql/{}:{}'.format(CLOUDSQL_PROJECT, CLOUDSQL_INSTANCE),
                user='******',
                passwd='octo stuff is being setup',
                db='words')
        cursor = db.cursor()
        for word in wordcount:
            #salt = uuid.uuid4().hex
            hashed_word = hashlib.sha512(word + SALT).hexdigest()

            # Encryption
            cipher_text = rsa.encrypt(word, rsa.PublicKey.load_pkcs1(PUBLIC_KEY))

            # Update/insert rows
            cursor.execute('select count(1) from entries where wordhash=%s', (hashed_word,))
            if cursor.rowcount > 0:
                cursor.execute(
                        'update entries set wordencrypt=%s, wordfreq=%s where wordhash=%s', 
                        (cipher_text, wordcount[word]['freq'], hashed_word))
            else:
                cursor.execute(
                        'insert into entries (wordhash, wordencrypt, wordfreq) values (%s, %s, %s)', 
                        (hashed_word, cipher_text, wordcount[word]['freq']))

            db.commit()
            
        # Render HTML
        self.render("word.html", url=url, results=wordcount)
Example #26
def linksites():
    mkfriends()
    furls = {friends[f]: f for f in friends}
    for f in friends:
        if f == "local":
            continue

        # furl - remote friendslist url
        # lurl - remote thread index url
        # ffn - friendslist filename (friends.host)
        # nffn - new friendslist filename (friends.host.new)
        # lfn - thread index filename (list.host)
        # nlfn - new thread index filename (list.host.new)
        # changes - threads with new replies from self
        # hosts - hosts that need their index rewritten

        furl = "/".join([friends[f], "raw",
                         "friends.txt"])  # Legacy: rename /raw/ -> /api/
        lurl = "/".join([friends[f], "raw", "list.txt"])
        ffn = arc + "friends." + f
        if not os.path.exists(ffn):
            with open(ffn, "w") as fi:
                fi.write("")
        nffn = ffn + ".new"
        lfn = arc + "list." + f
        if not os.path.exists(lfn):
            with open(lfn, "w") as fi:
                fi.write("")
        nlfn = lfn + ".new"
        u.wget(furl, nffn)
        u.wget(lurl, nlfn)

        # Ideally, a list of [name, op] localreplies
        # is compared against the older version, and
        # if a difference is found, {common}/{thread}/{friend}
        # is downloaded, {common}/{thread} & {common} are then
        # rebuilt. This is contingent on {common} being a common
        # host between client and server.
        with open(nffn, "r") as nf:
            nf = [x.split() for x in nf.read().splitlines()]
        if len(nf) < 1:
            continue
        if len(nf[0][1]) < 6:
            continue
        # This breaks if a friend URL is blank
        nfurls = {x[1]: x[0] for x in nf if len(x) > 1}
        common = {nfurls[x]: furls[x] for x in nfurls if x in furls}
        common2 = {common[f]: f for f in common}
        with open(lfn, "r") as oldl:
            oldl = [o.split() for o in oldl.read().splitlines()]
        with open(nlfn, "r") as newl:
            newl = [n.split() for n in newl.read().splitlines()]
        changes = []
        for n in newl:
            if n[0] not in common.keys():
                continue
            if not int(n[3]):
                continue
            if n in oldl:
                continue
            n = [common[n[0]], n[1], n[3]]
            changes.append(n)

        for c in changes:
            url = "/".join(
                [friends[f], "raw", common2[c[0]], c[1], "local.txt"])
            ldir = "/".join(["./threads", c[0], c[1]])
            local = "/".join([ldir, f + ".txt"])
            if not os.path.isdir(ldir):
                os.mkdir(ldir)
            u.wget(url, local)
            mkthread(c[0], c[1])
        hosts = set([c[0] for c in changes])
        for b in hosts:
            mkhost(b)
        os.rename(nffn, ffn)
        os.rename(nlfn, lfn)
    mksite()
Example #27

def snif(url, locate="tmp", async=True, condition=is_type):
    """Main function to download files."""
    if type(async) == int and async > 2:
        session = FuturesSession(executor=ThreadPoolExecutor(
            max_workers=async))
    elif async:
        session = FuturesSession()
    links = filter(condition, get_all_http_func(url))
    number_links = 0
    for link in links:
        number_links += 1
        try:
            if async:
                session.get(link).add_done_callback(
                    lambda future: wget_future(future, subfolder=locate))
                # wget_async(link, subfolder=locate, async=40)
            else:
                wget(link, subfolder=locate)
        except ConnectionError:
            print("Problème avec le lien:" + link)
            continue
    print(number_links, " éléments téléchargeables.")


if __name__ == "__main__":
    if len(sys.argv) > 2:
        cd(pwd())
        snif(sys.argv[1], sys.argv[2])
Example #28
#!/usr/bin/env python

url = "http://www.pythonchallenge.com/pc/def/integrity.html"
title = "working hard?"
auth_popup_message = "inflate"
url2 = "http://www.pythonchallenge.com/pc/return/good.html"

if __name__ == "__main__":
    import bz2
    import re

    import utils

    pattern = re.compile(r"<!--\nun: '(.+)'\s+pw: '(.+)'")
    un, pw = re.findall(pattern, utils.wget(url))[0]
    #print un
    #print pw

    print "un: %s" % bz2.decompress(un.decode('string_escape'))
    print "pw: %s" % bz2.decompress(pw.decode('string_escape'))

Example #29
#! /usr/bin/env python
################################################################################
# RelMon: a tool for automatic Release Comparison                              
# https://twiki.cern.ch/twiki/bin/view/CMSPublic/RelMon
#
#
#                                                                              
# Danilo Piparo CERN - [email protected]                                   
#                                                                              
################################################################################

from sys import argv,exit
from utils import wget

if __name__=="__main__":
  argc=len(argv)
  if argc!=2:
    print "Usage %prog url-to-fetch"
    exit(1)
  
  url=argv[1]
  wget(url)
Example #30
        exit(-1)

    dirtolist = ""
    if lenargs == 1:
        dirtolist = args[0]

    mode = "relval"
    if options.online:
        mode = "online"
    if options.development:
        mode = "dev"

    directory = "%s/dqm/%s/data/browse/%s" % (server, mode, dirtolist)
    print "peeping ", directory
    contents = extract_list(get_page(directory), server, options.show_url)

    if len(contents) == 0:
        print "No contents found!"

    for content in contents:
        if not options.get and search(options.path, content):
            print content
        if options.get and options.show_url and len(
                options.path) > 0 and search(options.path, content):
            if not search('pre', options.path) and search('pre', content):
                continue
            bcontent = basename(content)
            print "Getting %s" % bcontent
            wget(content)
            print "Got %s!!" % bcontent
Example #31
def import_order(url):
    digikey = find_distributor('Digi-Key')
    
    file = '/tmp/order.html'
    wget(url, file)
    
    soup = BeautifulSoup(open(file))
    
    web_id_tag = soup.find(is_span_lblWebID)
    web_id = web_id_tag.get_text(strip=True)
    
    salesorder_number_tag = soup.find(is_span_lblSalesorderNumber)
    salesorder_number = salesorder_number_tag.get_text(strip=True)
    
    submitted_tag = soup.find(is_span_lblSubmitted)
    submitted = submitted_tag.get_text(strip=True)
    
    order = parse_order(url, soup)
    items = [item for item in order if len(item) == 9]
    
    print web_id, salesorder_number, submitted, items
    
    # FIX replace by populate_order
    try:
        return get('/order/find/',
                   {'dist': digikey['id'],
                    'order_number': salesorder_number})
    except Http404:
        price = None
        for item in order:
            if len(item) == 3:
                if item[1] == 'Subtotal':
                    subtotal = item[2].replace ('$', '')
                elif item[1] == 'Total':
                    m = re.match(r'\$([0-9]+.[0-9]+)', item[2])
                    if m:
                        price = m.group(1)
        
        # %m/%d/%Y
        m = re.search(r'([0-9]+)/([0-9]+)/([0-9]+)', submitted)
        
        o = create_order(dist=digikey,
                         order_number=salesorder_number,
                         url=url,
                         date=m.group(3) + '-' + m.group(1) + '-' + m.group(2),
                         price=price if price else subtotal)

        if not price:
            create_order_property(order=o,
                                  name='Import Note',
                                  value='Total missing, used Subtotal for Order Price')
    
    create_order_property(order=o,
                          name='Salesorder Number',
                          value=salesorder_number)
                
    create_order_property(order=o, 
                          name='Web ID',
                          value=web_id)

    import_failures = []
    for item in items:
        (index, quantity, dist_part_num, desc, _, _, _, unit_price, ext_price) = item
        
        try:
            unit_price = Decimal(unit_price.replace('$', ''))
            ext_price = Decimal(ext_price.replace('$', ''))
            quantity = int(quantity.replace('NCNR', ''))
            
            print 'importing ' + dist_part_num
            dp = import_digikey_part(dist_part_num)
            
            p = get_part(dp['part'])
            
            create_part_history(order=o,
                                part=p,
                                quantity=quantity,
                                ext_price=ext_price)
        except Exception, e:
            print 'import failed:', repr(e)
            import_failures.append(dist_part_num)
Example #32
    except:
        print("Error with file: " + filename)


def snif(url, locate="tmp", async=True, condition=is_type):
    """Main function to download files."""
    if type(async) == int and async > 2:
        session = FuturesSession(executor=ThreadPoolExecutor(max_workers=async))
    elif async:
        session = FuturesSession()
    links = filter(condition, get_all_http_func(url))
    number_links = 0
    for link in links:
        number_links += 1
        try:
            if async:
                session.get(link).add_done_callback(lambda future: wget_future(future, subfolder=locate))
                # wget_async(link, subfolder=locate, async=40)
            else:
                wget(link, subfolder=locate)
        except ConnectionError:
            print("Problème avec le lien:" + link)
            continue
    print(number_links, " éléments téléchargeables.")


if __name__ == "__main__":
    if len(sys.argv) > 2:
        cd(pwd())
        snif(sys.argv[1], sys.argv[2])
Example #33
  dirtolist=""
  if lenargs==1:
    dirtolist=args[0]
  
  mode="relval"
  if options.online:
    mode="online"
  if options.development:
    mode="dev"
  
    
  directory="%s/dqm/%s/data/browse/%s" %(server,mode,dirtolist)
  print("peeping ",directory)  
  contents=extract_list(get_page(directory),server,options.show_url)
  
  if len(contents)==0:
    print("No contents found!")
  
  for content in contents:
    if not options.get and search(options.path,content):
      print(content)
    if options.get and options.show_url and len(options.path)>0 and search(options.path,content):
      if not search('pre',options.path) and search('pre',content):
        continue
      bcontent=basename(content)
      print("Getting %s" %bcontent)
      wget(content)
      print("Got %s!!" %bcontent)
  
  
Example #34
#!/usr/bin/env python

url = "http://www.pythonchallenge.com/pc/def/peak.html"
url = "http://www.pythonchallenge.com/pc/def/banner.p"

if __name__ == "__main__":
    import pickle
    import sys
    import utils

    data = pickle.loads(utils.wget(url))
    # Got help from below in recognising data as a run-length encoding!
    # http://unixwars.com/2007/09/11/python-challenge-level-5-peak-hell/
    for line in data:
        print ''.join(map(lambda pair: pair[0]*pair[1], line))

Example #35
#!/usr/bin/env python

url = "http://www.pythonchallenge.com/pc/def/channel.zip"

if __name__ == "__main__":
    import re
    import StringIO
    import zipfile

    import utils

    zip_data = utils.wget(url)
    filelike = StringIO.StringIO(zip_data)

    #print zipfile.is_zipfile(filelike)
    zf = zipfile.ZipFile(filelike, 'r')
    readme_text = zf.open('readme.txt').read()
    start_filename_pattern = re.compile(r'start from (\d+)')
    start_filename = re.search(start_filename_pattern, readme_text).group(1)

    comments = ''
    this_filename = start_filename
    count = 0
    next_filename_pattern = re.compile(r'Next nothing is (\d+)')
    while count <= len(zf.namelist()):
        comments += zf.getinfo('%s.txt' % this_filename).comment
        text = zf.open("%s.txt" % this_filename).read()
        print "%d: %s.txt: %s" % (count, this_filename, text)
        mo = re.search(next_filename_pattern, text)
        if mo:
            next_filename = mo.group(1)