Example #1
0
def fill_file(result):
    """Fetch the page at ``result`` and return its content as plain text.

    ``result`` is the URL handed to ``get_page``.  The downloaded markup is
    first passed through ``html_to_text``; if the outcome decodes as UTF-8 it
    is then rendered with ``html2text``, and the rendering is collected by
    temporarily replacing ``sys.stdout`` around ``html2text.wrapwrite``.

    Returns:
        ``"a"`` when ``get_page`` yields an empty string, the undecoded
        ``html_to_text`` output when decoding fails, and the captured
        ``html2text`` rendering otherwise.
    """
    url = result
    page = get_page(url)
    if page == "":
        return "a"
    # Strip the tags from the raw page before attempting to decode it.
    stripped = html_to_text(page.read())
    try:
        ustr = stripped.decode('utf-8')
    except Exception:
        # Best effort: hand back the undecoded text.  Catching Exception
        # rather than everything lets KeyboardInterrupt/SystemExit propagate.
        return stripped
    captured = StringIO()
    saved_stdout = sys.stdout
    sys.stdout = captured
    try:
        html2text.wrapwrite(html2text.html2text(ustr, url))
    finally:
        # Always restore the real stdout, even if the conversion fails.
        sys.stdout = saved_stdout
    text = captured.getvalue()
    captured.close()
    return text
Example #2
0
def fill_file(result):
    """Fetch the page at ``result`` and return its content as plain text.

    ``result`` is the URL handed to ``get_page``.  The downloaded markup is
    first passed through ``html_to_text``; if the outcome decodes as UTF-8 it
    is then rendered with ``html2text``, and the rendering is collected by
    temporarily replacing ``sys.stdout`` around ``html2text.wrapwrite``.

    Returns:
        ``"a"`` when ``get_page`` yields an empty string, the undecoded
        ``html_to_text`` output when decoding fails, and the captured
        ``html2text`` rendering otherwise.
    """
    url = result
    page = get_page(url)
    if page == "":
        return "a"
    # Strip the tags from the raw page before attempting to decode it.
    stripped = html_to_text(page.read())
    try:
        ustr = stripped.decode('utf-8')
    except Exception:
        # Best effort: hand back the undecoded text.  Catching Exception
        # rather than everything lets KeyboardInterrupt/SystemExit propagate.
        return stripped
    captured = StringIO()
    saved_stdout = sys.stdout
    sys.stdout = captured
    try:
        html2text.wrapwrite(html2text.html2text(ustr, url))
    finally:
        # Always restore the real stdout, even if the conversion fails.
        sys.stdout = saved_stdout
    text = captured.getvalue()
    captured.close()
    return text
Example #3
0
def prep(dataset):
    """Retrieve information from CMS ReqMgr data-service.

    The second '/'-separated component of ``dataset`` is sent as the ``dsn``
    query argument.  The request is first made through the SSO entry point;
    when the SSO reply carries a form action, that form is posted, the
    name/value pair found in the reply's ``setCookie(...)`` line is added to
    the opener's cookie jar, and the request page is fetched again with the
    same opener.  The final page is converted with ``html2text`` and emitted
    through ``wrapwrite``; nothing is returned.
    """
    dsn = dataset.split('/')[1]
    purl = 'http://cms.cern.ch/iCMS/jsp/mcprod/admin/requestmanagement.jsp'
    query = urllib.urlencode({'dsn': dsn, 'campid': 'any'})
    sso = 'https://cms.cern.ch/test/env.cgi?url='
    url = sso + purl + '?' + query
    cert = os.path.join(os.environ['HOME'], '.globus/usercert.pem')
    data = ''
    with working_pem(PEMMGR.pem) as key:
        data = get_data_sso(url, key, cert).read()
        params_dict, action = parse_sso_output(data)
        params = urllib.urlencode(params_dict)
        if action:
            opener = create_https_opener(key, cert)
            data = opener.open(action, params).read()
            # Name/value of the cookie announced by the reply; the last
            # matching row wins.  Stays None when no row mentions setCookie.
            ctup = None
            for row in data.split('\n'):
                if 'setCookie' in row:
                    ctup = row.split('(')[-1].replace('"', '').replace(
                        "'", '').split(',')[:2]
            # Only inject the cookie when a complete name/value pair was
            # found; otherwise there is nothing meaningful to store.
            if ctup is not None and len(ctup) == 2:
                for hdl in opener.handlers:
                    if 'urllib2.HTTPCookieProcessor' not in repr(hdl):
                        continue
                    jar = hdl.__dict__['cookiejar']
                    # Borrow domain and path from the first cookie already
                    # in the jar, then stop: one injected cookie per jar.
                    for ccc in jar:
                        cookie = cookielib.Cookie(
                            port=None, port_specified=False,
                            domain=ccc.domain, domain_specified=False,
                            domain_initial_dot=False,
                            path=ccc.path, path_specified=False,
                            secure=None, expires=None, discard=True,
                            comment=None, comment_url=None, rest=None,
                            version=0, name=ctup[0], value=ctup[1])
                        jar.set_cookie(cookie)
                        break
            data = opener.open(purl + '?' + query).read()
    wrapwrite(html2text(data, ''))
Example #4
0
def prep(dataset):
    """Retrieve information from CMS ReqMgr data-service.

    The second '/'-separated component of ``dataset`` is sent as the ``dsn``
    query argument.  The request is first made through the SSO entry point;
    when the SSO reply carries a form action, that form is posted, the
    name/value pair found in the reply's ``setCookie(...)`` line is added to
    the opener's cookie jar, and the request page is fetched again with the
    same opener.  The final page is converted with ``html2text`` and emitted
    through ``wrapwrite``; nothing is returned.
    """
    dsn = dataset.split('/')[1]
    purl = 'http://cms.cern.ch/iCMS/jsp/mcprod/admin/requestmanagement.jsp'
    query = urllib.urlencode({'dsn': dsn, 'campid': 'any'})
    sso = 'https://cms.cern.ch/test/env.cgi?url='
    url = sso + purl + '?' + query
    cert = os.path.join(os.environ['HOME'], '.globus/usercert.pem')
    data = ''
    with working_pem(PEMMGR.pem) as key:
        data = get_data_sso(url, key, cert).read()
        params_dict, action = parse_sso_output(data)
        params = urllib.urlencode(params_dict)
        if action:
            opener = create_https_opener(key, cert)
            data = opener.open(action, params).read()
            # Name/value of the cookie announced by the reply; the last
            # matching row wins.  Stays None when no row mentions setCookie.
            ctup = None
            for row in data.split('\n'):
                if 'setCookie' in row:
                    ctup = row.split('(')[-1].replace('"', '').replace(
                        "'", '').split(',')[:2]
            # Only inject the cookie when a complete name/value pair was
            # found; otherwise there is nothing meaningful to store.
            if ctup is not None and len(ctup) == 2:
                for hdl in opener.handlers:
                    if 'urllib2.HTTPCookieProcessor' not in repr(hdl):
                        continue
                    jar = hdl.__dict__['cookiejar']
                    # Borrow domain and path from the first cookie already
                    # in the jar, then stop: one injected cookie per jar.
                    for ccc in jar:
                        cookie = cookielib.Cookie(
                            port=None, port_specified=False,
                            domain=ccc.domain, domain_specified=False,
                            domain_initial_dot=False,
                            path=ccc.path, path_specified=False,
                            secure=None, expires=None, discard=True,
                            comment=None, comment_url=None, rest=None,
                            version=0, name=ctup[0], value=ctup[1])
                        jar.set_cookie(cookie)
                        break
            data = opener.open(purl + '?' + query).read()
    wrapwrite(html2text(data, ''))
Example #5
0
def read(url, output=None, debug=0):
    """Get run information from RunSummary data-service.

    ``url`` is either a path to an existing local file, whose content is
    simply paged, or a URL.  URLs are fetched differently depending on the
    host: ``cmsweb.cern.ch`` through ``get_data`` (no decoding afterwards),
    ``mcdb.cern.ch`` and non-CERN hosts through ``urllib.urlopen``, and any
    other ``cern.ch`` host through ``get_data_sso`` with a working PEM key.

    When ``output`` is given the fetched page is written to that file;
    otherwise it is shown through ``pydoc.pager`` if the ``CMSSH_PAGER``
    environment variable is set, or printed.  Pages with a known encoding
    are converted with ``html2text`` before being displayed.  ``debug`` is
    forwarded to ``get_data_sso``.  Nothing is returned.
    """
    cert = os.path.join(os.environ['HOME'], '.globus/usercert.pem')
    if os.path.isfile(url):
        with open(url, 'r') as stream:
            context = stream.read()
        # print(x) with a single argument behaves the same under the
        # Python 2 print statement, so this stays version-neutral.
        try:
            pydoc.pager(context)
        except Exception:
            print(context)
        return

    if 'cmsweb.cern.ch' in url:
        html = get_data(url, decoder=None)
        encoding = None
    elif 'mcdb.cern.ch' in url:
        data = urllib.urlopen(url)
        html = data.read().replace('&nbsp_place_holder;', '')
        encoding = enc(data.headers, html)[0]
    elif 'cern.ch' not in url:
        data = urllib.urlopen(url)
        html = data.read()
        encoding = enc(data.headers, html)[0]
    else:
        with working_pem(PEMMGR.pem) as key:
            data = get_data_sso(url, key, cert, debug)
            html = data.read()
            encoding = enc(data.headers, html)[0]
    if encoding == 'us-ascii':
        encoding = 'utf-8'
    if not html:
        return

    if int(os.environ.get('HTTPDEBUG', 0)):
        print_info('read data')
        print(html)

    res = None
    if encoding:
        # Convert once up front; a decoding failure surfaces before any
        # output file is touched.
        res = html2text(html.decode(encoding), '')

    if output:
        # NOTE(review): the fetched page as-is, not the html2text
        # conversion, is what gets saved -- confirm this is intended.
        with open(output, 'w') as stream:
            stream.write(html)
        return

    pager = os.environ.get('CMSSH_PAGER', None)
    if encoding:
        try:
            if pager:
                pydoc.pager(res.encode('utf-8'))
            else:
                wrapwrite(res)
        except Exception:
            # Fall back to plain output if the pager or first write fails.
            wrapwrite(res)
    else:
        try:
            if pager:
                pydoc.pager(html)
            else:
                print(html)
        except Exception:
            print(html)
Example #6
0
#!/usr/bin/env python
import sys
sys.path.append('..')

import html2text

if __name__ == "__main__":
    # Usage: <script> [FILE [ENCODING]]
    # With no FILE the HTML is read from standard input.  The converted
    # text is written through html2text.wrapwrite.
    args = sys.argv[1:]
    if len(args) > 2:
        # There is no option parser in this script, so report the usage
        # error directly and exit with the conventional status 2.
        sys.stderr.write('Too many arguments\n')
        sys.exit(2)

    if args:
        file_ = args[0]
        encoding = args[1] if len(args) == 2 else None

        with open(file_, 'rb') as stream:
            data = stream.read()
        if encoding is None:
            try:
                from chardet import detect
            except ImportError:
                detect = None
            if detect is not None:
                encoding = detect(data)['encoding']
            # Fall back to UTF-8 when chardet is unavailable or could not
            # determine an encoding (it reports None in that case).
            if not encoding:
                encoding = 'utf-8'
        data = data.decode(encoding)
    else:
        data = sys.stdin.read()
    html2text.wrapwrite(html2text.html2text(data, ''))
Example #7
0
def read(url, output=None, debug=0):
    """Get run information from RunSummary data-service.

    ``url`` is either a path to an existing local file, whose content is
    simply paged, or a URL.  URLs are fetched differently depending on the
    host: ``cmsweb.cern.ch`` through ``get_data`` (no decoding afterwards),
    ``mcdb.cern.ch`` and non-CERN hosts through ``urllib.urlopen``, and any
    other ``cern.ch`` host through ``get_data_sso`` with a working PEM key.

    When ``output`` is given the fetched page is written to that file;
    otherwise it is shown through ``pydoc.pager`` if the ``CMSSH_PAGER``
    environment variable is set, or printed.  Pages with a known encoding
    are converted with ``html2text`` before being displayed.  ``debug`` is
    forwarded to ``get_data_sso``.  Nothing is returned.
    """
    cert = os.path.join(os.environ['HOME'], '.globus/usercert.pem')
    if os.path.isfile(url):
        with open(url, 'r') as stream:
            context = stream.read()
        # print(x) with a single argument behaves the same under the
        # Python 2 print statement, so this stays version-neutral.
        try:
            pydoc.pager(context)
        except Exception:
            print(context)
        return

    if 'cmsweb.cern.ch' in url:
        html = get_data(url, decoder=None)
        encoding = None
    elif 'mcdb.cern.ch' in url:
        data = urllib.urlopen(url)
        html = data.read().replace('&nbsp_place_holder;', '')
        encoding = enc(data.headers, html)[0]
    elif 'cern.ch' not in url:
        data = urllib.urlopen(url)
        html = data.read()
        encoding = enc(data.headers, html)[0]
    else:
        with working_pem(PEMMGR.pem) as key:
            data = get_data_sso(url, key, cert, debug)
            html = data.read()
            encoding = enc(data.headers, html)[0]
    if encoding == 'us-ascii':
        encoding = 'utf-8'
    if not html:
        return

    if int(os.environ.get('HTTPDEBUG', 0)):
        print_info('read data')
        print(html)

    res = None
    if encoding:
        # Convert once up front; a decoding failure surfaces before any
        # output file is touched.
        res = html2text(html.decode(encoding), '')

    if output:
        # NOTE(review): the fetched page as-is, not the html2text
        # conversion, is what gets saved -- confirm this is intended.
        with open(output, 'w') as stream:
            stream.write(html)
        return

    pager = os.environ.get('CMSSH_PAGER', None)
    if encoding:
        try:
            if pager:
                pydoc.pager(res.encode('utf-8'))
            else:
                wrapwrite(res)
        except Exception:
            # Fall back to plain output if the pager or first write fails.
            wrapwrite(res)
    else:
        try:
            if pager:
                pydoc.pager(html)
            else:
                print(html)
        except Exception:
            print(html)