Esempio n. 1
0
    def __init__(self,
                 server_url,
                 proxy=None,
                 timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
                 ca_cert=None,
                 client_cert=None,
                 client_cert_pass=None):
        """
        Set up the transport for ``server_url``.

        The scheme of ``server_url`` decides whether HTTPS is used. When no
        ``proxy`` is passed, the ``https_proxy`` or ``http_proxy`` environment
        variable (picked by that scheme) is used instead. ``client_cert`` is
        unpacked as a ``(certfile, keyfile)`` pair; a falsy value means no
        client certificate.
        """
        xmlrpclib.Transport.__init__(self)

        cert_pair = client_cert if client_cert else (None, None)
        self.disable_ssl_validation = False
        self.scheme = urllib.splittype(server_url)[0]
        self.https = self.scheme == 'https'
        self.proxy = None
        self.timeout = timeout
        self._certfile, self._keyfile = cert_pair
        self.ca_cert = ca_cert
        self.client_cert_pass = client_cert_pass

        # No proxy forced by the caller: fall back to the environment
        # variable that matches the server scheme.
        if not proxy:
            env_name = 'https_proxy' if self.https else 'http_proxy'
            proxy = os.environ.get(env_name, None)

        if not proxy:
            return

        proxy_scheme, proxy_rest = urllib.splittype(proxy)
        self.proxy = urllib.splithost(proxy_rest)[0]

        # From here on the HTTPS flag follows the proxy's own scheme.
        self.https = proxy_scheme == 'https'
Esempio n. 2
0
 def start(self, ssh, url, port=16789, rpcport=6789, cmd=None, mpcactor=None, vcactor=None) -> bool:
     """
     Start a node over SSH and report whether its process can be found.

     :param ssh: SSH connection object handed to ``self.run_ssh``
     :param url: node URL from the configuration file; its scheme (``ws`` or
         ``http``) selects the RPC flavour of the generated command
     :param port: p2p port (also part of the node directory name)
     :param rpcport: RPC port (http) or websocket port (ws)
     :param cmd: complete start command; when given it is run unchanged and
         ``url``, ``mpcactor`` and ``vcactor`` are not consulted
     :param mpcactor: MPC account without the ``0x`` prefix; takes precedence
         over ``vcactor``
     :param vcactor: VC account without the ``0x`` prefix
     :return: True when the follow-up ``ps`` lookup for ``rpcport`` returns
         something, False otherwise
     :raises Exception: when ``cmd`` is not given and the URL scheme is
         neither ``ws`` nor ``http``
     """
     if not cmd:
         # Options shared by both transports.
         common = ' --syncmode "{}"'.format(self.syncmode)
         if self.net_type:
             common = common + " --" + self.net_type

         scheme = parse.splittype(url)[0]
         if scheme == "ws":
             base = '--identity "platon" --verbosity 4 --debug --ws --wsorigins "*" --txpool.nolocals --wsapi "db,eth,net,web3,miner,admin,personal" --wsaddr 0.0.0.0'
             port_flag = "--wsport"
         elif scheme == "http":
             base = '--identity "platon" --verbosity 4 --debug --rpc --txpool.nolocals --rpcapi "db,eth,net,web3,miner,admin,personal" --rpcaddr 0.0.0.0'
             port_flag = "--rpcport"
         else:
             raise Exception("url连接类型不正确")
         base += common

         # Optional actor switches; the MPC actor wins when both are given.
         if mpcactor:
             actor = " --mpc --mpc.actor " + "0x" + str(mpcactor)
         elif vcactor:
             actor = " --vc --vc.actor " + "0x" + str(vcactor) + " --vc.password 88888888"
         else:
             actor = ""

         node_dir = "{}/node-{}".format(self.deploy_path, port)
         cmd = "nohup {node}/platon {base} --datadir {node}/data --port {port} {port_flag} {rpcport}{actor} > {node}/nohup.out 2>&1 &".format(
             node=node_dir, base=base, port=port, port_flag=port_flag,
             rpcport=rpcport, actor=actor)
     self.run_ssh(ssh, cmd)
     # Look for a platon process whose command line mentions the RPC port.
     result = self.run_ssh(
         ssh, "ps -ef|grep platon|grep %s|grep -v grep|awk {'print $2'}" % str(
             rpcport))
     return bool(result)
Esempio n. 3
0
    def post(self, query):
        """
        POST ``query`` to the institution's URL over HTTPS and return the
        response body.

        :param query: request body as text; it is encoded with
            ``str.encode()`` before sending
        :return: response body decoded as ASCII, undecodable bytes dropped
        """
        i = self.institution
        logging.debug('posting data to %s' % i.url)
        logging.debug('---- request ----')
        logging.debug(query)
        garbage, path = splittype(i.url)
        host, selector = splithost(path)
        # Encode once so Content-Length counts the bytes actually sent, not
        # the number of characters in the text.
        body = query.encode()
        h = HTTPSConnection(host, timeout=60)
        try:
            # Discover requires a particular ordering of headers, so send the
            # request step by step.
            h.putrequest('POST',
                         selector,
                         skip_host=True,
                         skip_accept_encoding=True)
            h.putheader('Content-Type', 'application/x-ofx')
            h.putheader('Host', host)
            h.putheader('Content-Length', len(body))
            h.putheader('Connection', 'Keep-Alive')
            if self.accept:
                h.putheader('Accept', self.accept)
            if self.user_agent:
                h.putheader('User-Agent', self.user_agent)
            h.endheaders(body)
            res = h.getresponse()
            try:
                response = res.read().decode('ascii', 'ignore')
                logging.debug('---- response ----')
                logging.debug(res.__dict__)
                logging.debug(response)
            finally:
                res.close()
        finally:
            # The connection is never reused, so release the socket even when
            # the request fails half-way.
            h.close()

        return response
def here(modal, string):
    """Check whether "click here"-style links point away from ``modal``.

    Every ``<a ... href=...>...</a>`` fragment of ``string`` whose text
    contains one of the trigger words ("here"/"click" in a few casings) is
    inspected; the host of the first http(s) URL inside it is compared with
    ``modal`` by substring test.

    :param modal: reference host; a link host contained in it counts as same
    :param string: HTML text to scan
    :return: ``(flag, host)`` as strings. ``flag`` is ``'1'`` when at least
        one trigger link has a host not contained in ``modal``, else ``'0'``.
        ``host`` is the last such foreign host if there is one, otherwise the
        host of the last trigger link, otherwise ``modal`` itself.
    """
    ban_words = ('here', 'click', 'Here', 'Click', 'CLICK', 'HERE')
    here_num = 0
    stand_host = modal
    foreign_host = ''

    anchors = re.findall(r'<[Aa].*?href=.*?</[Aa]>', string, re.S)
    for anchor in anchors:
        # The trigger-word test is evaluated per anchor; a flag shared across
        # iterations would mark every anchor after the first hit.
        if not any(word in anchor for word in ban_words):
            continue

        http_url = re.findall(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            str(anchor))
        if not http_url:
            continue

        _proto, rest = splittype(http_url[0])
        host, _path = splithost(rest)
        host, _port = splitport(host)
        stand_host = host

        if host not in modal:
            # Different domain -> 1
            foreign_host = host
            here_num = 1

    if foreign_host:
        stand_host = foreign_host
    return str(here_num), str(stand_host)
Esempio n. 5
0
def load_timestream(file_path):
    """Load a time stream from either a text file, HDF5 file, or URL

    The argument "file_path" can be one of the following:

    1. A path to a text file;

    2. A path to an HDF5 file;

    3. An URL pointing to the JSON record of a test;

    4. An URL pointing to an HDF5 file.

    Return a pair consisting of a dictionary containing the metadata and a
    Timestream object. For a text file the metadata is None. For an URL
    pointing to a JSON record the metadata is the decoded record; for an URL
    pointing directly to an HDF5 file it is whatever ``load_hdf5_file``
    reports for the downloaded file.

    Raise ValueError if the URL serves an unsupported content type."""

    if isinstance(file_path, Path) or (not urlparse.splittype(file_path)[0]):
        # Local path
        ext = os.path.splitext(file_path)[1]
        if ext.lower() == ".txt":
            return None, load_text_file(file_path)
        return load_hdf5_file(file_path)

    # URL
    url = file_path
    metadata = None
    result = None
    cache_path = None  # set only when the download must be copied to the cache
    h5_file_name = None
    try:
        with urlreq.urlopen(url) as req:
            content_type = req.info().get_content_type()

            # We are *forced* to create a named temporary file and close it
            # before reading, because h5py does not support reading from
            # file-like objects like BytesIO or an already opened
            # TemporaryFile
            with NamedTemporaryFile(suffix=".h5", delete=False) as h5_file:
                h5_file_name = h5_file.name
                if content_type == "application/json":
                    metadata = json.loads(req.read().decode("utf-8"))
                    cached_file = default_hdf5_file_name(metadata)
                    if cached_file.is_file():
                        log.info(f'Test {url} found in cache "{cached_file}"')
                        _, result = load_timestream(cached_file)
                    else:
                        cache_path = cached_file
                        download_test(url, metadata, h5_file)
                elif content_type == "application/hdf5":
                    copyfileobj(req, h5_file)
                else:
                    raise ValueError(
                        'unknown content type: "{0}"'.format(content_type))

        if result is None:
            loaded = load_hdf5_file(h5_file_name)
            result = loaded[1]
            if metadata is None:
                # Direct HDF5 download: there is no JSON record, so use the
                # metadata stored in the file, as for a local HDF5 path.
                metadata = loaded[0]

        if cache_path is not None:
            log.info(f'Copy file downloaded from {url} to cache "{cache_path}"')
            copyfile(src=h5_file_name, dst=cache_path)
    finally:
        # Always remove the temporary file, also when loading fails.
        if h5_file_name is not None:
            os.remove(h5_file_name)

    return metadata, result
Esempio n. 6
0
    def work(self, task):
        """
        Start a non-blocking connection for ``task`` and register it with the
        engine.

        ``task[0]`` is the URL to contact and ``task[2]``, when truthy, is a
        callback that is invoked with ``None`` if the request times out. The
        path part and the host of the URL are appended to ``task``. On any
        error the task is put back on ``self._tasks``, the error is logged
        and the socket is closed.
        """
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            pro_, rest = splittype(task[0])
            host, rest = splithost(rest)
            host, port = splitport(host)
            task.append(rest)
            task.append(host)
            sock.setblocking(0)
            # Non-blocking connect: completion is reported through the write
            # file-descriptor callback registered below.
            sock.connect_ex((host, int(port) if port else 80))

            def timeout_cb():
                if not sock._closed:
                    KBEngine.deregisterWriteFileDescriptor(sock.fileno())
                    sock.close()
                if task and task[2]:
                    task[2](None)

            self._write_timer[sock.fileno()] = self.add_timer(
                REQUEST_TIMEOUT, timeout_cb)
            KBEngine.registerWriteFileDescriptor(
                sock.fileno(), Functor(self.onSend, task, sock))
        except Exception:
            # Only ordinary errors re-queue the task; KeyboardInterrupt and
            # SystemExit are allowed to propagate.
            self._tasks.append(task)
            self.logsError()
            if not sock._closed:
                sock.close()
Esempio n. 7
0
    def __init__(self,
                 uri,
                 username=None,
                 password=None,
                 verify=False,
                 sp=None,
                 sp_kwargs=None):
        """
        Store the connection settings and choose the server-proxy class.

        An explicit ``sp`` wins; otherwise the class is selected from the
        scheme of ``uri`` (``http``/``https`` or ``scgi``). Any other scheme
        raises NotImplementedError. When ``verify`` is exactly ``True`` the
        connection is verified right away.
        """
        self.uri = uri  # : From X{__init__(self, url)}
        self.username = username
        self.password = password
        self.schema = urlparser.splittype(uri)[0]

        # Explicit override first, then selection by URI scheme.
        if sp:
            proxy_cls = sp
        elif self.schema == 'scgi':
            proxy_cls = SCGIServerProxy
        elif self.schema in ('http', 'https'):
            proxy_cls = HTTPServerProxy
        else:
            raise NotImplementedError()
        self.sp = proxy_cls

        self.sp_kwargs = sp_kwargs if sp_kwargs else {}

        self.torrents = []  # : List of L{Torrent} instances
        self._rpc_methods = []  # : List of rTorrent RPC methods
        self._torrent_cache = []
        self._client_version_tuple = ()

        if verify is True:
            self._verify_conn()
Esempio n. 8
0
 def __init__(self, username=None, password=None, serverurl=None):
     """
     Remember the credentials and build the connection factory for
     ``serverurl``.

     ``http://`` URLs get a plain HTTP connection to the parsed host and
     port (80 when the URL names none); ``unix://`` URLs get a
     UnixStreamHTTPConnection bound to the socket path after the prefix.
     Anything else raises ValueError.
     """
     xmlrpclib.Transport.__init__(self)
     self.username = username
     self.password = password
     self.verbose = False
     self.serverurl = serverurl

     if serverurl.startswith('http://'):
         _scheme, remainder = urllib.splittype(serverurl)
         netloc, _path = urllib.splithost(remainder)
         hostname, portspec = urllib.splitport(netloc)
         portnum = 80 if portspec is None else int(portspec)

         def get_connection(host=hostname, port=portnum):
             return httplib.HTTPConnection(host, port)
     elif serverurl.startswith('unix://'):
         def get_connection(serverurl=serverurl):
             # we use 'localhost' here because domain names must be
             # < 64 chars (or we'd use the serverurl filename)
             conn = UnixStreamHTTPConnection('localhost')
             conn.socketfile = serverurl[len('unix://'):]
             return conn
     else:
         raise ValueError('Unknown protocol for serverurl %s' % serverurl)

     self._get_connection = get_connection
Esempio n. 9
0
    def __init__(self, uri, username=None, password=None,
                 verify=False, sp=None, sp_kwargs=None):
        """
        Record the URI and credentials and resolve the server-proxy class.

        ``sp`` overrides the automatic choice, which maps ``http``/``https``
        to HTTPServerProxy and ``scgi`` to SCGIServerProxy; other schemes
        raise NotImplementedError. The connection is verified immediately
        only when ``verify`` is exactly ``True``.
        """
        self.uri = uri  # : From X{__init__(self, url)}

        self.username = username
        self.password = password

        self.schema = urlparser.splittype(uri)[0]

        if not sp:
            # No explicit proxy class: derive it from the URI scheme.
            if self.schema in ('http', 'https'):
                sp = HTTPServerProxy
            elif self.schema == 'scgi':
                sp = SCGIServerProxy
            else:
                raise NotImplementedError()
        self.sp = sp

        self.sp_kwargs = {} if not sp_kwargs else sp_kwargs

        self.torrents = []  # : List of L{Torrent} instances
        self._rpc_methods = []  # : List of rTorrent RPC methods
        self._torrent_cache = []
        self._client_version_tuple = ()

        if verify is True:
            self._verify_conn()
Esempio n. 10
0
    def __init__(self,
                 timeout=ExistDB.DEFAULT_TIMEOUT,
                 session=None,
                 url=None,
                 encoding='UTF-8',
                 *args,
                 **kwargs):
        """
        Configure the transport: timeout, HTTP session, HTTPS flag, encoding.

        ``timeout`` left at ``ExistDB.DEFAULT_TIMEOUT`` resolves to the global
        socket default. A ``session`` passed in is reused (and its headers
        are updated); otherwise a new ``requests.Session`` is created.
        ``use_https`` is set only when ``url`` is given.
        """
        # The sentinel default means "use the global socket default".
        if timeout is ExistDB.DEFAULT_TIMEOUT:
            timeout = socket.getdefaulttimeout()
        xmlrpc.client.Transport.__init__(self, *args, **kwargs)
        self.timeout = timeout

        # NOTE: assumes that if basic auth is needed, it is set
        # on the session that is passed in
        self.session = session if session else requests.Session()
        default_headers = {
            'User-Agent': self.user_agent,
            'Content-Type': 'application/xml'
        }
        self.session.headers.update(default_headers)

        # The URL scheme tells whether https is needed.
        if url is not None:
            scheme = splittype(url)[0]
            self.use_https = scheme == 'https'

        self.encoding = encoding
Esempio n. 11
0
    def _get_site_meta(self, article):
        """
        Collect site-level metadata for ``article``: the source URL without
        scheme and surrounding slashes, a site name and a favicon URL.

        The site name comes from ``article.meta_site_name``, then from the
        first META_SITE_NAME_EX lookup that yields a value, and finally from
        the capitalised domain of the source URL (with a warning logged).
        Favicons starting with ``//`` get the source scheme prepended; those
        starting with a single ``/`` are appended to the source URL.

        Returns a dict with the keys ``url``, ``name`` and ``icon_url``.
        """

        source_url = article.source_url
        scheme_part, rest_part = urlparse.splittype(source_url)
        proto = scheme_part.strip('/')
        url = rest_part.strip('/')

        name = article.meta_site_name
        if not name:
            # Lazy lookups: stop at the first expression that yields a name.
            lookups = (
                article.extractor.get_meta_content(article.clean_doc, xpath)
                for xpath in META_SITE_NAME_EX)
            name = next((found for found in lookups if found), None)
            if not name:
                self.log.warning(
                    f'{article.url} did not have a meta_site_name')
                name = tldextract.extract(source_url).domain.capitalize()

        favicon = article.meta_favicon
        if favicon:
            if favicon.startswith('//'):
                # protocol-relative URL
                favicon = f'{proto}:{favicon}'
            elif favicon.startswith('/'):
                # relative URL to site base
                favicon = source_url + favicon

        return {'url': url, 'name': name, 'icon_url': favicon}
Esempio n. 12
0
    def download(self, url, localpath, username=None, passwd=None, overwrite=True):
        '''Download a url to a file or a directory, supported protocols: http, https, ftp, file
        @param url: URL to download from
        @type url: string
        @param localpath: filename or directory to download the url to; pass - to return the data instead
        @type localpath: string
        @param username: username for the url if it requires authentication
        @type username: string
        @param passwd: password for the url if it requires authentication
        @type passwd: string
        @param overwrite: when False, an existing target file is left untouched
        @type overwrite: bool
        @return: the downloaded data when localpath is -, otherwise None
        @raise ValueError: if url or localpath is empty, or localpath is not usable
        '''
        if not url:
            raise ValueError('URL can not be None or empty string')
        if not localpath:
            raise ValueError('Local path to download the url to can not be None or empty string')
        # '-' is a marker, not a path: it must be decided before the
        # directory checks, which would otherwise overwrite or reject it.
        if localpath == '-':
            filename = '-'
        elif j.sal.fs.isDir(localpath):
            filename = j.sal.fs.joinPaths(localpath, j.sal.fs.getBaseName(url))
        elif j.sal.fs.isDir(j.sal.fs.getDirName(localpath)):
            filename = localpath
        else:
            raise ValueError('Local path is an invalid path')
        self.logger.debug('Downloading url %s to local path %s'%(url, filename))
        from urllib.request import FancyURLopener
        from urllib.parse import splittype
        class myURLOpener(FancyURLopener):
            # read a URL, with automatic HTTP authentication
            def __init__(self, user, passwd):
                self._user = user
                self._passwd = passwd
                self._promptcalled = False
                FancyURLopener.__init__(self)

            def prompt_user_passwd(self, host, realm):
                if not self._user or not self._passwd:
                    raise j.exceptions.RuntimeError('Server requested authentication but nothing was given')
                # Hand out the credentials once; a second prompt means they
                # were rejected.
                if not self._promptcalled:
                    self._promptcalled = True
                    return self._user, self._passwd
                # NOTE(review): this message exposes the password to whoever
                # sees the exception - consider dropping it.
                raise j.exceptions.RuntimeError('Could not authenticate with the given authentication user:%s and password:%s'%(self._user, self._passwd))

        urlopener = myURLOpener(username, passwd)

        if not j.sal.fs.exists(filename):
            overwrite = True

        if overwrite:
            if username and passwd and splittype(url)[0] == 'ftp':
                # FTP credentials travel inside the URL itself.
                url = url.split('://')[0]+'://%s:%s@'%(username,passwd)+url.split('://')[1]
            if filename != '-':
                urlopener.retrieve(url, filename, None, None)
                self.logger.debug('URL %s is downloaded to local path %s'%(url, filename))
                return
            else:
                return urlopener.open(url).read()
        return print("!!! File already exists did not overwrite")
def find_modal(list):
    """Return the host that occurs most often among the given items.

    Each item is converted to ``str`` and the first http(s) URL found in it
    contributes its host (port removed). Items without such a URL are
    ignored. Returns ``'-'`` when no host was found at all.
    """
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    hosts = []
    for candidate in list:
        found = url_pattern.findall(str(candidate))
        if not found:
            continue
        _scheme, remainder = splittype(found[0])
        netloc, _path = splithost(remainder)
        hostname, _port = splitport(netloc)
        hosts.append(hostname)

    # Only the single most frequent host is of interest.
    ranking = collections.Counter(hosts).most_common(1)
    return ranking[0][0] if ranking else '-'
def get_info_by_url(url):
    """Split ``url`` into ``(protocol, host, path, port)``.

    ``port`` is returned as a string. When the URL names no port, the
    default is ``'443'`` for https and ``'80'`` for every other protocol.
    """
    protocol, rest = parse.splittype(url)
    host, path = parse.splithost(rest)
    host, port = parse.splitport(host)
    if port is None:
        # Port 80 is only the right default for plain http.
        port = '443' if protocol == 'https' else '80'
    return protocol, host, path, port
Esempio n. 15
0
    def __init__(self, username=None, password=None, serverurl=None):
        """
        Store the credentials and install ``_get_connection`` for
        ``serverurl``.

        Supported URL forms are ``http://host[:port]`` (port defaults to 80)
        and ``unix://<socket path>``; any other form raises ValueError.
        """
        xmlrpclib.Transport.__init__(self)
        self.username = username
        self.password = password
        self.verbose = False
        self.serverurl = serverurl

        over_http = serverurl.startswith('http://')
        if not over_http and not serverurl.startswith('unix://'):
            raise ValueError('Unknown protocol for serverurl %s' % serverurl)

        if over_http:
            _scheme, remainder = urllib.splittype(serverurl)
            netloc, _path = urllib.splithost(remainder)
            hostname, portspec = urllib.splitport(netloc)
            if portspec is None:
                portnum = 80
            else:
                portnum = int(portspec)

            def get_connection(host=hostname, port=portnum):
                return httplib.HTTPConnection(host, port)
        else:

            def get_connection(serverurl=serverurl):
                # we use 'localhost' here because domain names must be
                # < 64 chars (or we'd use the serverurl filename)
                conn = UnixStreamHTTPConnection('localhost')
                # Everything after the 'unix://' prefix is the socket path.
                conn.socketfile = serverurl[7:]
                return conn

        self._get_connection = get_connection
 def handleData(self, response):
     """
     Build the list of chapters found in ``response``.

     Every field of ``chapter_fields`` is extracted from the response text
     with the regex rule for that field, or with its xpath rule when no
     regex rule is set. Site-relative chapter URLs (leading single ``/``)
     are completed with the scheme and host of the site's menu URL. Each
     chapter gets its URL and title with spaces removed and a placeholder
     content text.

     :param response: object whose ``text`` attribute holds the page source
     :return: list of Chapter objects, one per extracted URL
     """
     s = response.text
     chapters_dict = dict()
     for field in chapter_fields:
         if self.re_rule.get(field):
             chapters_dict[field] = getRe(s, self.re_rule[field])
         elif self.xpath_rule.get(field):
             chapters_dict[field] = self.getXpath(s, self.xpath_rule[field])
     urls = chapters_dict['url']
     # The first URL decides for the whole list; an empty list or a bare
     # '/' must not raise.
     first = urls[0] if urls else ''
     if first.startswith('/') and not first.startswith('//'):
         menu_url = WEB_SETTINGS[self.web]['menu'].format('')
         proto, rest = splittype(menu_url)
         host, rest = splithost(rest)
         chapters_dict['url'] = [proto + '://' + host + url for url in urls]
     print(chapters_dict)
     chapters = []
     for i, url in enumerate(chapters_dict['url']):
         chapter = Chapter()
         chapter.url = url.replace(' ', '')
         chapter.title = chapters_dict['title'][i].replace(' ', '')
         chapter.content = '该章节下载失败'
         chapters.append(chapter)
     return chapters
Esempio n. 17
0
 def get_sha_and_start_download(self, download_result):
     """
     Read the checksum from the finished sums download and start the real
     download.

     The checksum is the first whitespace-separated token of the downloaded
     text. The download URL is derived from ``self.new_download_url`` by
     swapping ``sums.php`` for ``download.php`` and appending ``&r=1``.
     When ``self.https`` is false, ``self.new_download_url`` is afterwards
     rewritten to use https.
     """
     res = download_result[self.new_download_url]
     checksum = res.buffer.getvalue().decode('utf-8').split()[0]
     url = self.new_download_url.replace('sums.php', 'download.php') + '&r=1'
     if not self.https:
         rest = parse.splittype(self.new_download_url)[1]
         # For a URL with a scheme the remainder already starts with '//';
         # adding the slashes again would give 'https:////host'.
         if rest.startswith('//'):
             self.new_download_url = 'https:' + rest
         else:
             self.new_download_url = 'https://' + rest
     self.check_data_and_start_download(url, checksum)
Esempio n. 18
0
def get_source_meta(article):
    """
    Extract additional metadata about the article and its source, beyond
    what newspaper does for us by default.

    Returns a tuple ``(key, name, publish_date, favicon)``:

    * ``key``: the source URL without scheme and surrounding slashes;
    * ``name``: ``article.meta_site_name``, else the first META_SITE_NAME_EX
      lookup that yields a value, else the capitalised domain of the source
      URL (a warning is issued in that case);
    * ``publish_date``: ISO-8601 string converted to UTC, or the original
      falsy value (with a warning) when the article has none;
    * ``favicon``: ``article.meta_favicon`` with protocol-relative
      (``//host/...``) and site-relative (``/...``) values made absolute.
    """

    source_url = article.source_url
    proto, rest = urlparse.splittype(source_url)
    key = rest.strip('/')
    name = article.meta_site_name
    if not name:
        for xpath in META_SITE_NAME_EX:
            name = article.extractor.get_meta_content(article.clean_doc, xpath)
            if name:
                break
        else:
            warnings.warn(f'{article.url} did not have a meta_site_name')
            name = tldextract.extract(source_url).domain.capitalize()
    publish_date = article.publish_date
    if not publish_date:
        warnings.warn(f'{article.url} did not have a publish_date')
    else:
        publish_date = publish_date.astimezone(timezone.utc).isoformat()
    favicon = article.meta_favicon
    if favicon:
        if favicon.startswith('//'):
            # protocol-relative URL: it only lacks the scheme, so it must
            # not be glued onto the source URL
            if proto:
                favicon = f'{proto}:{favicon}'
        elif favicon.startswith('/'):
            # relative URL to site base
            favicon = source_url + favicon
    return (key, name, publish_date, favicon)
Esempio n. 19
0
 def __init__(self, url, headers=None):
     """
     Keep the URL and headers and derive the request type and host.

     ``type`` is the URL scheme; ``host`` is the network location of the
     URL, unquoted when present.
     """
     self.url = url
     self.headers = headers
     self.origin_req_host = cookielib.request_host(self)
     scheme, after_scheme = splittype(url)
     self.type = scheme
     netloc, _selector = splithost(after_scheme)
     self.host = netloc
     if netloc:
         self.host = unquote(netloc)
Esempio n. 20
0
    def _do_post(self, query, extra_headers=None):
        """
        Do a POST to the Institution.

        :param query: Body content to POST (OFX Query)
        :type query: str
        :param extra_headers: Extra headers to send with the request, as a list
          of (Name, Value) header 2-tuples. ``None`` means no extra headers.
        :type extra_headers: list
        :return: 2-tuple of (HTTPResponse, str response body)
        :rtype: tuple
        """
        i = self.institution
        logging.debug('posting data to %s' % i.url)
        garbage, path = splittype(i.url)
        host, selector = splithost(path)
        # Encode once so Content-Length counts the bytes actually sent, not
        # the number of characters in the text.
        body = query.encode()
        try:
            h = HTTPSConnection(host, timeout=60)
            h.connect()
        except ssl.SSLError as ex:
            # Retry with TLSv1 only for servers that reject the default
            # protocol negotiation.
            if (ex.reason == "UNSUPPORTED_PROTOCOL"):
                h = HTTPSConnection(host,
                                    timeout=60,
                                    context=ssl.SSLContext(ssl.PROTOCOL_TLSv1))
                h.connect()
            else:
                raise
        try:
            # Discover requires a particular ordering of headers, so send the
            # request step by step.
            h.putrequest('POST',
                         selector,
                         skip_host=True,
                         skip_accept_encoding=True)
            headers = [('Content-Type', 'application/x-ofx'), ('Host', host),
                       ('Content-Length', len(body)),
                       ('Connection', 'Keep-Alive')]
            if self.accept:
                headers.append(('Accept', self.accept))
            if self.user_agent:
                headers.append(('User-Agent', self.user_agent))
            headers.extend(extra_headers or [])
            logging.debug('---- request headers ----')
            for hname, hval in headers:
                logging.debug('%s: %s', hname, hval)
                h.putheader(hname, hval)
            logging.debug('---- request body (query) ----')
            logging.debug(query)
            h.endheaders(body)
            res = h.getresponse()
            try:
                response = res.read().decode('ascii', 'ignore')
                logging.debug('---- response ----')
                logging.debug(res.__dict__)
                logging.debug('Headers: %s', res.getheaders())
                logging.debug(response)
            finally:
                res.close()
        finally:
            # The connection is not reused; release the socket even when the
            # request fails half-way.
            h.close()
        return res, response
Esempio n. 21
0
def get_parser_from_url(url):
    """Return the PARSER entry whose key occurs in the host of ``url``.

    The first key (in PARSER's iteration order) that is a substring of the
    URL's host wins. Returns None when no key matches or when the URL has
    no ``//host`` part.
    """
    _protocol, rest = splittype(url)
    host, _path = splithost(rest)
    if host is None:
        # No network location (e.g. the scheme is missing): nothing to match.
        return None
    for fragment, parser in PARSER.items():
        if fragment in host:
            return parser

    return None
Esempio n. 22
0
    def _do_post(self, query, extra_headers=None):
        """
        Do a POST to the Institution.

        :param query: Body content to POST (OFX Query)
        :type query: str
        :param extra_headers: Extra headers to send with the request, as a list
          of (Name, Value) header 2-tuples. ``None`` means no extra headers.
        :type extra_headers: list
        :return: 2-tuple of (HTTPResponse, str response body)
        :rtype: tuple
        """
        i = self.institution
        logging.debug('posting data to %s' % i.url)
        garbage, path = splittype(i.url)
        host, selector = splithost(path)
        # Content-Length must describe the encoded bytes, which can be longer
        # than the text when it contains non-ASCII characters.
        body = query.encode()
        try:
            h = HTTPSConnection(host, timeout=60)
            h.connect()
        except ssl.SSLError as ex:
            # Only an unsupported-protocol failure is retried, with TLSv1.
            if (ex.reason == "UNSUPPORTED_PROTOCOL"):
                h = HTTPSConnection(host, timeout=60, context=ssl.SSLContext(ssl.PROTOCOL_TLSv1))
                h.connect()
            else:
                raise
        try:
            # Discover requires a particular ordering of headers, so send the
            # request step by step.
            h.putrequest('POST', selector, skip_host=True,
                         skip_accept_encoding=True)
            headers = [
                ('Content-Type', 'application/x-ofx'),
                ('Host', host),
                ('Content-Length', len(body)),
                ('Connection', 'Keep-Alive')
            ]
            if self.accept:
                headers.append(('Accept', self.accept))
            if self.user_agent:
                headers.append(('User-Agent', self.user_agent))
            headers.extend(extra_headers or [])
            logging.debug('---- request headers ----')
            for hname, hval in headers:
                logging.debug('%s: %s', hname, hval)
                h.putheader(hname, hval)
            logging.debug('---- request body (query) ----')
            logging.debug(query)
            h.endheaders(body)
            res = h.getresponse()
            try:
                response = res.read().decode('ascii', 'ignore')
                logging.debug('---- response ----')
                logging.debug(res.__dict__)
                logging.debug('Headers: %s', res.getheaders())
                logging.debug(response)
            finally:
                res.close()
        finally:
            # Close the connection in every case so the socket is not leaked.
            h.close()
        return res, response
    def __init__(self,
                 uri,
                 transport=None,
                 encoding=None,
                 verbose=0,
                 version=None,
                 headers=None,
                 history=None,
                 config=jsonrpclib.config.DEFAULT,
                 context=None,
                 timeout=None):
        """
        Sets up the server proxy

        :param uri: Request URI (http or https only)
        :param transport: Custom transport handler; when omitted one is
                          created to match the URI scheme
        :param encoding: Specified encoding
        :param verbose: Log verbosity level
        :param version: JSON-RPC specification version (defaults to the
                        version of ``config``)
        :param headers: Custom additional headers for each request
        :param history: History object (for tests)
        :param config: A JSONRPClib Config instance
        :param context: The optional SSLContext to use
        :param timeout: Timeout handed to the transport created here
        :raise IOError: The URI scheme is neither http nor https
        """
        # Store the configuration
        self._config = config
        self.__version = version or config.version

        schema, remainder = splittype(uri)
        if schema not in ('http', 'https'):
            _logger.error("jsonrpclib only support http(s) URIs, not %s",
                          schema)
            raise IOError('Unsupported JSON-RPC protocol.')

        host, handler = splithost(remainder)
        self.__host = host
        # Not sure if this is in the JSON spec?
        self.__handler = handler if handler else '/'

        # Build a default transport matching the scheme when none is given.
        if transport is None:
            if schema == 'https':
                transport = SafeTransport(config=config,
                                          context=context,
                                          timeout=timeout)
            else:
                transport = Transport(config=config, timeout=timeout)
        self.__transport = transport

        self.__encoding = encoding
        self.__verbose = verbose
        self.__history = history

        # Global custom headers are injected into Transport
        self.__transport.push_headers(headers or {})
Esempio n. 24
0
def parsePage(spider, url, response):
    """Extract the configured page properties from ``response`` and persist
    them.

    ``spider.args['PagePropertyRegularExpression']`` is a JSON object that
    maps a property name to an extraction rule. A rule starting with ``$``
    is a regular expression (the ``$`` is dropped) searched in the UTF-8
    decoded page; on a match the property becomes capture group 1. Any
    other rule is evaluated as an XPath expression and its first result is
    used.

    When ``spider.args['DataPersistenceType']`` is ``'WPRPC'`` the result is
    published as a WordPress post; ``'MYSQL'`` is a no-op.

    ``url`` is not used inside this function.
    """
    html = response.content
    selector = etree.HTML(html)
    html = html.decode('utf-8')
    propertys = json.loads(spider.args['PagePropertyRegularExpression'])
    for key in propertys:
        item = propertys[key]
        if item.startswith('$'):
            # Regex rule: everything after the leading '$' is the pattern.
            p1 = r'%s' % item[1:]
            pattern = re.compile(p1)
            match = pattern.search(html)

            if match:
                propertys[key] = match.group(1)
            # Special handling for the article body: extract the images.
            # NOTE(review): this also runs when the pattern did not match, in
            # which case propertys[key] still holds the rule text - confirm
            # that this is intended.
            if key == 'content_raw':
                contentselector = etree.HTML(propertys[key])
                etree.strip_elements(contentselector, 'script')
                etree.strip_tags(contentselector, 'a')
                propertys[key] = etree.tostring(contentselector).decode(
                    'utf-8')
                if spider.args['DownLoadImg'] == 1:
                    for imgsrc in contentselector.xpath("//img/@src"):
                        if imgsrc is not None and len(imgsrc) > 0:
                            # Queue the original image link, then point the
                            # stored content at the image host instead.
                            cache.rpush('link-img', imgsrc)
                            proto, rest = splittype(imgsrc)
                            res, rest = splithost(rest)
                            # NOTE(review): res looks like it would be None
                            # for an src without a '//host' part, which would
                            # make replace() fail - verify.
                            propertys[key] = propertys[key].replace(
                                imgsrc, imgsrc.replace(res, 'img.zyai.top'))
                            logging.info('push a img link to queue %s .' %
                                         imgsrc)

        else:
            # XPath rule: keep the first node/value it selects.
            item = selector.xpath(item)[0]
            propertys[key] = item

    dataPersistenceType = spider.args['DataPersistenceType']

    if dataPersistenceType == 'WPRPC':
        # NOTE(review): endpoint and credentials are hard-coded here.
        wp = Client('http://tech.cocopass.com/xmlrpc.php', 'admin', '19841204')
        # The string statement below is a no-op marker meaning
        # "publish the blog post".
        """
		发表博文
		"""
        post = WordPressPost()
        post.title = propertys['title'].encode('utf-8')
        post.content = propertys['content_raw'].encode('utf-8')
        post.post_status = 'publish'
        post.terms_names = {'post_tag': [post.title], 'category': ['爱好']}
        wp.call(NewPost(post))
        logging.info('successfully post one article: %s .' %
                     propertys['title'])

    elif dataPersistenceType == 'MYSQL':
        pass
Esempio n. 25
0
def get_db_engine():
    """Create a database engine for the URI configured in ``settings.DB_URI``.

    Returns:
        The object returned by ``create_engine``. When the URI scheme starts
        with ``sqlite`` the engine is created with
        ``connect_args={"check_same_thread": False}``.

    Raises:
        EnvironmentError: if ``settings.DB_URI`` is ``None``.
    """
    if settings.DB_URI is None:
        raise EnvironmentError('需要配置"DB_URI"变量!')
    scheme, _ = splittype(settings.DB_URI)
    # splittype() returns None as the scheme when the URI has no "scheme:"
    # prefix; guard against it so such a URI is reported by create_engine
    # instead of failing here with an AttributeError.
    if scheme and scheme.startswith('sqlite'):
        # Only SQLite needs this option, other databases do not: SQLite only
        # allows a single thread to communicate with a connection.
        return create_engine(settings.DB_URI,
                             connect_args={"check_same_thread": False})
    return create_engine(settings.DB_URI)
Esempio n. 26
0
    def _fix_url(self, url):
        """Resolve a relative ``url`` against the scheme and host of ``self.pre_url``.

        :param url: link to normalise
        :return: ``url`` unchanged when it is empty or already starts with
            ``http://`` / ``https://``; otherwise an absolute URL built from
            the scheme and host of ``self.pre_url``
        """
        # Test the prefix, not mere containment: a relative path such as
        # "/docs/http-guide.html" contains "http" but still has to be resolved.
        if not url or url.lower().startswith(("http://", "https://")):
            return url

        scheme, remainder = parse.splittype(self.pre_url)
        host, _ = parse.splithost(remainder)

        if url.startswith("//"):
            # Protocol-relative link: it already names its host, only the
            # scheme is missing.
            return scheme + ":" + url
        if url.startswith("/"):
            return scheme + "://" + host + url
        return scheme + "://" + host + "/" + url
Esempio n. 27
0
def domain_name(list):
    """Return the number of distinct hosts found in an iterable of URLs.

    The port is stripped before comparing, so ``http://a.com`` and
    ``http://a.com:8080`` count as the same host.

    :param list: iterable of URL strings (the name shadows the builtin but is
        kept so keyword callers keep working)
    :return: count of unique host names
    """
    # A set gives O(1) membership instead of rescanning a list for every URL.
    hosts = set()
    for url in list:
        _, rest = splittype(url)
        host, _ = splithost(rest)
        host, _ = splitport(host)
        hosts.add(host)
    return len(hosts)
Esempio n. 28
0
 def parse_download_link(self, line, in_download):
     """Parse Eclipse download links.

     A line is a download line when it contains ``self.download_keyword``,
     ``self.bits`` and ``'linux'``. When such a line also carries a matching
     ``href``, ``self.new_download_url`` is set to the https ``sums.php``
     variant of that link and ``self.https`` records whether it is https.

     :param line: line of the page being parsed
     :param in_download: incoming state; it is recomputed from ``line``
     :return: ``((None, None), in_download)``
     """
     in_download = (self.download_keyword in line and self.bits in line
                    and 'linux' in line)
     if in_download:
         p = re.search(r"href='(http://www\.eclipse\.org\/downloads/download\.php\?file=.*\.tar\.gz)'", line)
         if p is not None:
             self.new_download_url = p.group(1).replace('download.php', 'sums.php').replace('http://', 'https://')
             # Compare by value: `is "https"` tests object identity and is
             # not reliable for strings.
             self.https = parse.splittype(self.new_download_url)[0] == "https"
     return ((None, None), in_download)
Esempio n. 29
0
def add_proxies():
    """Build the proxies mapping for a request session from ``app.PROXY_SETTING``.

    :return: ``{"http": address, "https": address}`` when a proxy is
        configured, otherwise ``None``. ``address`` is the setting itself when
        it already carries a ``scheme://`` prefix, else the setting prefixed
        with ``http://``.
    """
    proxy = app.PROXY_SETTING
    if not proxy:
        return None

    log.debug(u"Using global proxy: " + proxy)
    # splittype() treats whatever precedes the first ':' as a scheme, so a
    # bare "host:8080" was mistaken for a URL with scheme "host" and never
    # received the http:// prefix. Look for an explicit "://" instead.
    address = proxy if '://' in proxy else 'http://' + proxy
    return {
        "http": address,
        "https": address,
    }
Esempio n. 30
0
def url_split(url):
    """Break ``url`` into ``(scheme, hostname, port, document)``.

    The hostname, when present, is lowercased. The port starts out as the
    ``default_ports`` entry for the scheme (0 when there is none) and is then
    passed to ``splitport`` as its default.
    Precondition: url is a syntactically correct URI (e.g. has no whitespace).
    """
    scheme, remainder = parse.splittype(url)
    host, document = parse.splithost(remainder)
    port = default_ports.get(scheme, 0)
    if not host:
        return scheme, host, port, document
    host, port = splitport(host.lower(), port=port)
    return scheme, host, port, document
Esempio n. 31
0
def getp():
    """Fetch the URL given in the ``path`` request argument and return its text.

    The response is decoded as GBK when the final host is ``m.zwdu.com`` or
    ``m.biqubao.com`` and as UTF-8 otherwise.
    """
    # NOTE(review): the target URL comes straight from the request arguments;
    # confirm callers are trusted or restrict the hosts that may be fetched.
    target = request.args.get('path')
    resp = requests.get(target, headers=headers, timeout=5)
    _, remainder = UrlPase.splittype(resp.url)
    host = UrlPase.splithost(remainder)[0]
    resp.encoding = "GBK" if host in ('m.zwdu.com', 'm.biqubao.com') else "utf-8"
    return resp.text
Esempio n. 32
0
    def load(self, url):
        """Split ``url`` and store its parts on the instance.

        Sets ``self.url``, ``self.protocol``, ``self.host``, ``self.path`` and
        ``self.port``. The port is always an ``int`` (or ``None`` when it is
        neither given nor implied by an ``http`` / ``https`` scheme).

        :param url: URL to load
        """
        self.url = url

        self.protocol, s1 = urllib_parse.splittype(self.url)
        s2, self.path = urllib_parse.splithost(s1)
        self.host, port = urllib_parse.splitport(s2)
        # splitport() hands back the port as a string; convert it so explicit
        # and default ports share one type.
        self.port = int(port) if port is not None else None

        if not self.port:
            if self.protocol == 'http':
                self.port = 80
            elif self.protocol == 'https':
                self.port = 443
Esempio n. 33
0
    def __init__(
        self,
        uri,
        transport=None,
        encoding=None,
        verbose=0,
        version=None,
        headers=None,
        history=None,
        config=jsonrpclib.config.DEFAULT,
    ):
        """
        Prepares the server proxy for the given URI

        :param uri: Request URI (http or https)
        :param transport: Custom transport handler
        :param encoding: Specified encoding
        :param verbose: Log verbosity level
        :param version: JSON-RPC specification version
        :param headers: Custom additional headers for each request
        :param history: History object (for tests)
        :param config: A JSONRPClib Config instance
        :raise IOError: The URI scheme is neither http nor https
        """
        # Keep the configuration; it also provides the fallback version
        self._config = config
        self.__version = version if version else config.version

        schema, uri = splittype(uri)
        if schema != "http" and schema != "https":
            raise IOError("Unsupported JSON-RPC protocol.")

        host, handler = splithost(uri)
        self.__host = host
        # An empty path falls back to the root handler
        self.__handler = handler if handler else "/"

        if transport is None:
            # Choose the transport class matching the URI scheme
            transport_class = SafeTransport if schema == "https" else Transport
            transport = transport_class(config=config)
        self.__transport = transport

        self.__encoding = encoding
        self.__verbose = verbose
        self.__history = history

        # Global custom headers are handed to the transport
        self.__transport.push_headers(headers or {})
Esempio n. 34
0
    def load(self, url):
        """Split ``url`` and store its parts on the instance.

        Sets ``self.url``, ``self.protocol``, ``self.path``, ``self.host`` and
        ``self.port``. An explicit port is stored as an ``int``; a missing
        port falls back to 80 for ``http`` and 443 for ``https``.

        :param url: URL to load
        """
        self.url = url

        self.protocol, after_scheme = splittype(self.url)
        netloc, self.path = splithost(after_scheme)
        self.host, raw_port = splitport(netloc)
        self.port = None if raw_port is None else int(raw_port)

        if self.port:
            return
        # No usable port in the URL: fall back to the scheme's well-known one.
        default_port = {'http': 80, 'https': 443}.get(self.protocol)
        if default_port is not None:
            self.port = default_port
Esempio n. 35
0
    def _load_workflow(self, fname_snapshot):
        """Load a workflow snapshot from an ODBC source, a URL or a file.

        ``odbc://`` sources are imported through ``SnapshotterToDB``; ``http``
        and ``https`` sources are downloaded first and then, like any other
        value, imported from file through ``SnapshotterToFile``.

        :param fname_snapshot: snapshot location (ODBC spec, URL or file name)
        :return: the imported object, or ``None`` when loading fails
        """
        stype = splittype(fname_snapshot)[0]
        if stype == "odbc":
            import pyodbc

            # Drop the leading "odbc://"
            addr = fname_snapshot[7:]
            parsed = addr.split('&')
            try:
                odbc, table, id_, log_id = parsed[:4]
            except ValueError:
                # Unpacking fewer than four fields raises ValueError (not
                # TypeError), so this is the branch that reports bad specs.
                self.warning("Invalid ODBC source format. Here is the "
                             "template: odbc://<odbc data source spec>&"
                             "<table>&<id>&<log id>[&<optional name>]\n"
                             "<table> and <log id> may be empty (\"veles\" and"
                             " <id> value will be used).")
                return None
            if not table:
                table = "veles"
            if not log_id:
                log_id = id_
            if len(parsed) > 4:
                if len(parsed) > 5:
                    self.warning("Invalid ODBC source format")
                    return None
                name = parsed[-1]
            else:
                name = None
            try:
                return SnapshotterToDB.import_(odbc, table, id_, log_id, name)
            except pyodbc.Error as e:
                self.warning(
                    "Failed to load the snapshot from ODBC source: %s", e)
                return None
        elif stype in ("http", "https"):
            try:
                self.info("Downloading %s...", fname_snapshot)
                fname_snapshot = self.snapshot_file_name = wget.download(
                    fname_snapshot, root.common.dirs.snapshots)
                print()
                sys.stdout.flush()
            except Exception:
                # Exception rather than a bare except, so KeyboardInterrupt
                # and SystemExit still propagate.
                self.exception("Failed to fetch the snapshot at \"%s\"",
                               fname_snapshot)
                return None
        try:
            return SnapshotterToFile.import_(fname_snapshot)
        except FileNotFoundError:
            if fname_snapshot.strip() != "":
                self.warning("Workflow snapshot %s does not exist",
                             fname_snapshot)
            return None
Esempio n. 36
0
    def _load_workflow(self, fname_snapshot):
        """Load a workflow snapshot from an ODBC source, a URL or a file.

        ``odbc://`` sources are imported through ``SnapshotterToDB``; ``http``
        and ``https`` sources are downloaded first and then, like any other
        value, imported from file through ``SnapshotterToFile``.

        :param fname_snapshot: snapshot location (ODBC spec, URL or file name)
        :return: the imported object, or ``None`` when loading fails
        """
        stype = splittype(fname_snapshot)[0]
        if stype == "odbc":
            import pyodbc

            # Drop the leading "odbc://"
            addr = fname_snapshot[7:]
            parsed = addr.split('&')
            try:
                odbc, table, id_, log_id = parsed[:4]
            except ValueError:
                # Unpacking fewer than four fields raises ValueError (not
                # TypeError), so this is the branch that reports bad specs.
                self.warning("Invalid ODBC source format. Here is the "
                             "template: odbc://<odbc data source spec>&"
                             "<table>&<id>&<log id>[&<optional name>]\n"
                             "<table> and <log id> may be empty (\"veles\" and"
                             " <id> value will be used).")
                return None
            if not table:
                table = "veles"
            if not log_id:
                log_id = id_
            if len(parsed) > 4:
                if len(parsed) > 5:
                    self.warning("Invalid ODBC source format")
                    return None
                name = parsed[-1]
            else:
                name = None
            try:
                return SnapshotterToDB.import_(odbc, table, id_, log_id, name)
            except pyodbc.Error as e:
                self.warning(
                    "Failed to load the snapshot from ODBC source: %s", e)
                return None
        elif stype in ("http", "https"):
            try:
                self.info("Downloading %s...", fname_snapshot)
                fname_snapshot = self.snapshot_file_name = wget.download(
                    fname_snapshot, root.common.dirs.snapshots)
                print()
                sys.stdout.flush()
            except Exception:
                # Exception rather than a bare except, so KeyboardInterrupt
                # and SystemExit still propagate.
                self.exception("Failed to fetch the snapshot at \"%s\"",
                               fname_snapshot)
                return None
        try:
            return SnapshotterToFile.import_(fname_snapshot)
        except FileNotFoundError:
            if fname_snapshot.strip() != "":
                self.warning("Workflow snapshot %s does not exist",
                             fname_snapshot)
            return None
Esempio n. 37
0
def fetchurl(url, query=None):
    """Fetch ``url`` over plain HTTP and return the response body.

    :param url: ``http://`` URL; must not contain ``?`` when ``query`` is given
    :param query: optional query parameters, url-encoded and appended to ``url``
    :return: ``resp.body`` when the status ends with ``200 OK``
    :raises RuntimeError: if the URL scheme is not ``http``
    :raises RequestError: for any other response status
    """
    if query is not None:
        assert '?' not in url, ("Either include query in url "
                                "or pass as parameter, but not both")
        url += '?' + urlencode(query)
    proto, tail = splittype(url)
    if proto != 'http':
        # Name the offending protocol; only http is supported here.
        raise RuntimeError("Unsupported protocol %r" % (proto,))
    host, tail = splithost(tail)
    cli = HTTPClient(host)
    resp = cli.request(tail, headers={'Host': host})
    if resp.status.endswith('200 OK'):
        return resp.body
    raise RequestError(resp.status, resp)
Esempio n. 38
0
 def stat_page(self):
    """Issue a HEAD request for this page and return its parsed Last-Modified.

    The URL comes from ``self.get_url()``; the raw ``last-modified`` header
    value is converted by ``self._parse_http_dt``.
    """
    from urllib.parse import splittype, splithost
    from http.client import HTTPConnection

    url = self.get_url()
    self.log(20, 'Statting page {!r} at {!r}.'.format(self.name, url))

    (_, dp) = splittype(url)
    (host, path) = splithost(dp)
    conn = HTTPConnection(host)
    try:
        conn.request('HEAD', path)
        res = conn.getresponse()
        lmt_raw = res.getheader('last-modified')
    finally:
        # Always release the socket, even when the request fails.
        conn.close()

    return self._parse_http_dt(lmt_raw)
Esempio n. 39
0
    def __init__(self, uri, transport=None, encoding=None, verbose=False,
                 allow_none=False, use_datetime=False):
        """Set up a server proxy for an ``scgi://`` URI.

        :param uri: server URI; its scheme must be ``scgi``
        :param transport: transport to use; an ``SCGITransport`` is created
            when omitted
        :param encoding: stored on the instance
        :param verbose: stored on the instance
        :param allow_none: stored on the instance
        :param use_datetime: passed to the default ``SCGITransport``
        :raises IOError: if the URI scheme is not ``scgi``
        """
        scheme, uri = urlparser.splittype(uri)
        # ('scgi') is a plain string, so `not in ('scgi')` was a substring
        # test: it accepted e.g. 's' and raised TypeError for a missing scheme.
        if scheme != 'scgi':
            raise IOError('unsupported XML-RPC protocol')
        self.__host, self.__handler = urlparser.splithost(uri)
        if not self.__handler:
            self.__handler = '/'

        if transport is None:
            transport = SCGITransport(use_datetime=use_datetime)
        self.__transport = transport

        self.__encoding = encoding
        self.__verbose = verbose
        self.__allow_none = allow_none
Esempio n. 40
0
    def request(self, uri, method="GET", body='', headers=None, 
        redirections=httplib2.DEFAULT_MAX_REDIRECTS, connection_type=None):
        """Sign the request with ``self.method`` and send it via ``httplib2.Http``.

        The signed request is placed in the body for form-encoded requests,
        in the URL for other GET requests, and in the headers otherwise.
        """
        form_content_type = 'application/x-www-form-urlencoded'

        if not isinstance(headers, dict):
            headers = {}

        # POST requests default to a form-encoded content type.
        if method == "POST":
            headers.setdefault('Content-Type', form_content_type)

        is_form_encoded = headers.get('Content-Type') == form_content_type
        parameters = parse_qs(body) if (is_form_encoded and body) else None

        req = Request.from_consumer_and_token(
            self.consumer,
            token=self.token,
            http_method=method,
            http_url=uri,
            parameters=parameters,
            body=body,
            is_form_encoded=is_form_encoded,
        )
        req.sign_request(self.method, self.consumer, self.token)

        # The realm is the scheme plus authority of the target URI.
        schema, rest = splittype(uri)
        hierpart = '//' if rest.startswith('//') else ''
        host = splithost(rest)[0]
        realm = schema + ':' + hierpart + host

        if is_form_encoded:
            body = req.to_postdata()
        elif method == "GET":
            uri = req.to_url()
        else:
            headers.update(req.to_header(realm=realm))

        return httplib2.Http.request(
            self, uri, method=method, body=body, headers=headers,
            redirections=redirections, connection_type=connection_type)
Esempio n. 41
0
    def __init__(self, uri, transport=None, encoding=None,
                 verbose=0, version=None, headers=None, history=None,
                 config=jsonrpclib.config.DEFAULT, context=None):
        """
        Prepares the server proxy for the given URI

        :param uri: Request URI (http or https)
        :param transport: Custom transport handler
        :param encoding: Specified encoding
        :param verbose: Log verbosity level
        :param version: JSON-RPC specification version
        :param headers: Custom additional headers for each request
        :param history: History object (for tests)
        :param config: A JSONRPClib Config instance
        :param context: The optional SSLContext to use
        :raise IOError: The URI scheme is neither http nor https
        """
        # Keep the configuration; it also provides the fallback version
        self._config = config
        self.__version = version if version else config.version

        schema, uri = splittype(uri)
        if schema != 'http' and schema != 'https':
            _logger.error("jsonrpclib only support http(s) URIs, not %s",
                          schema)
            raise IOError('Unsupported JSON-RPC protocol.')

        host, handler = splithost(uri)
        self.__host = host
        # An empty path falls back to the root handler
        self.__handler = handler if handler else '/'

        if transport is None:
            # Only the HTTPS transport receives the SSL context
            transport = (
                SafeTransport(config=config, context=context)
                if schema == 'https'
                else Transport(config=config)
            )
        self.__transport = transport

        self.__encoding = encoding
        self.__verbose = verbose
        self.__history = history

        # Global custom headers are handed to the transport
        self.__transport.push_headers(headers or {})
Esempio n. 42
0
 def proxy_open(self, req, proxy, type):
     """Route ``req`` through ``proxy``.

     Credentials embedded in the proxy URL (``user:pass@host``) are sent as a
     Basic ``Proxy-Authorization`` header.

     :param req: request being opened
     :param proxy: proxy URL
     :param type: overwritten by the scheme of ``proxy`` before use
     :return: ``None`` when the request's original type matches the proxy
         scheme, otherwise the result of ``self.parent.open(req)``
     """
     orig_type = req.get_type()
     type, r_type = splittype(proxy)
     host, _ = splithost(r_type)
     if '@' in host:
         user_pass, host = host.split('@', 1)
         # b64encode() emits a single line. encodestring() inserts a newline
         # after every 76 output characters, which puts line breaks inside
         # the header value for long credentials.
         user_pass = base64.b64encode(unquote(user_pass))
         req.add_header('Proxy-Authorization', 'Basic '+user_pass)
     host = unquote(host)
     req.set_proxy(host, type)
     if orig_type == type:
         # let other handlers take care of it
         # XXX this only makes sense if the proxy is before the
         # other handlers
         return None
     else:
         # need to start over, because the other handlers don't
         # grok the proxy's URL type
         return self.parent.open(req)
Esempio n. 43
0
 def putrequest(self, method, url, skip_host=0, skip_accept_encoding=0):
     """Record the real target host/port from ``url``, then delegate.

     putrequest is called before connect, so the URL is parsed here to find
     the host and port for the CONNECT request sent to the proxy. They are
     stored in ``self._real_host`` and ``self._real_port``.

     Raises ValueError when the URL has no scheme, or has no explicit port
     and a scheme missing from ``self._ports``.
     """
     proto, remainder = splittype(url)
     if proto is None:
         raise ValueError("unknown URL type: %s" % url)

     netloc = splithost(remainder)[0]
     host, port = splitport(netloc)

     # Without an explicit port, use the one registered for the scheme.
     if port is None:
         try:
             port = self._ports[proto]
         except KeyError:
             raise ValueError("unknown protocol for: %s" % url)

     self._real_host = host
     self._real_port = int(port)
     M2Crypto.httpslib.HTTPSConnection.putrequest(
         self, method, url, skip_host, skip_accept_encoding)
Esempio n. 44
0
    def __init__(self, uri, transport=None, encoding=None,
                 verbose=None, allow_none=0):
        """Set up a JSON-RPC server proxy for an http(s) URI.

        The handler defaults to ``/RPC2`` when the URI has no path, and the
        transport defaults to ``SafeTransport`` for https and ``Transport``
        for http. Raises IOError for any other scheme.
        """
        utype, uri = splittype(uri)
        if utype != "http" and utype != "https":
            raise IOError("Unsupported JSONRPC protocol")

        host, handler = splithost(uri)
        self.__host = host
        self.__handler = handler or "/RPC2"

        if transport is None:
            transport = SafeTransport() if utype == "https" else Transport()
        self.__transport = transport

        self.__encoding = encoding
        self.__verbose = verbose
        self.__allow_none = allow_none
Esempio n. 45
0
    def post(self, query):
        """POST ``query`` to the institution URL over HTTPS and return the reply.

        :param query: request body, sent with content type ``application/x-ofx``
        :return: response body decoded as ASCII, undecodable bytes ignored
        """
        i = self.institution
        logging.debug('posting data to %s', i.url)
        logging.debug('---- request ----')
        logging.debug(query)
        garbage, path = splittype(i.url)
        host, selector = splithost(path)
        h = HTTPSConnection(host)
        try:
            h.request('POST', selector, query,
                      {
                          "Content-type": "application/x-ofx",
                          "Accept": "*/*, application/x-ofx"
                      })
            res = h.getresponse()
            try:
                response = res.read().decode('ascii', 'ignore')
                logging.debug('---- response ----')
                logging.debug(res.__dict__)
                logging.debug(response)
            finally:
                res.close()
        finally:
            # Release the connection even when the request or read fails.
            h.close()

        return response
Esempio n. 46
0
def parse_proxy(proxy):
    """Return ``(scheme, user, password, host/port)`` for a proxy spec.

    Mirrors ``_parse_proxy`` from urllib: ``proxy`` is either a bare
    authority (``host[:port]``, optionally with ``user:password@``) or a URL
    with an authority. ``scheme`` is None for a bare authority.

    Raises ValueError for a URL without an authority (single leading slash).
    """
    scheme, remainder = splittype(proxy)
    if remainder.startswith("//"):
        # URL with an authority: for RFC 3986-compliant URLs the path is
        # empty or starts with '/', so the authority ends at the next slash.
        stop = remainder.find("/", 2)
        authority = remainder[2:] if stop == -1 else remainder[2:stop]
    elif remainder.startswith("/"):
        raise ValueError("proxy URL with no authority: %r" % proxy)
    else:
        # Bare authority: whatever splittype took for a scheme is part of it.
        scheme = None
        authority = proxy

    userinfo, hostport = splituser(authority)
    if userinfo is None:
        user = password = None
    else:
        user, password = splitpasswd(userinfo)
    return scheme, user, password, hostport
Esempio n. 47
0
    def __init__(self, uri, transport=None, encoding=None,
                 verbose=0, version=None, headers=None, history=None,
                 config=jsonrpclib.config.DEFAULT, context=None):
        """
        Sets up the server proxy

        :param uri: Request URI (http, https or unix+http)
        :param transport: Custom transport handler
        :param encoding: Specified encoding
        :param verbose: Log verbosity level
        :param version: JSON-RPC specification version
        :param headers: Custom additional headers for each request
        :param history: History object (for tests)
        :param config: A JSONRPClib Config instance
        :param context: The optional SSLContext to use
        :raise IOError: Unsupported scheme, or no transport available for the
                        scheme/UNIX combination
        """
        # Store the configuration
        self._config = config
        self.__version = version or config.version

        schema, uri = splittype(uri)
        # splittype() returns None when the URI has no scheme: test for it so
        # that case ends in the IOError below instead of an AttributeError
        use_unix = bool(schema) and schema.startswith("unix+")
        if use_unix:
            schema = schema[len("unix+"):]

        if schema not in ('http', 'https'):
            _logger.error("jsonrpclib only support http(s) URIs, not %s",
                          schema)
            raise IOError('Unsupported JSON-RPC protocol.')

        self.__host, self.__handler = splithost(uri)
        if use_unix:
            unix_path = self.__handler
            self.__handler = '/'
        elif not self.__handler:
            # Not sure if this is in the JSON spec?
            self.__handler = '/'

        if transport is None:
            if use_unix:
                if schema == "http":
                    # In Unix mode, we use the path part of the URL (handler)
                    # as the path to the socket file
                    transport = UnixTransport(
                        config=config, path=unix_path
                    )
            elif schema == 'https':
                transport = SafeTransport(config=config, context=context)
            else:
                transport = Transport(config=config)

            if transport is None:
                raise IOError(
                    "Unhandled combination: UNIX={}, protocol={}"
                    .format(use_unix, schema)
                )

        self.__transport = transport

        self.__encoding = encoding
        self.__verbose = verbose
        self.__history = history

        # Global custom headers are injected into Transport
        self.__transport.push_headers(headers or {})
Esempio n. 48
0
def _parse_proxy(proxy):
    """Return (scheme, user, password, host/port) given a URL or an authority.

    If a URL is supplied, it must have an authority (host:port) component.
    According to RFC 3986, having an authority component means the URL must
    have two slashes after the scheme:

    >>> _parse_proxy('file:/ftp.example.com/')
    Traceback (most recent call last):
    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'

    The first three items of the returned tuple may be None.

    Examples of authority parsing:

    >>> _parse_proxy('proxy.example.com')
    (None, None, None, 'proxy.example.com')
    >>> _parse_proxy('proxy.example.com:3128')
    (None, None, None, 'proxy.example.com:3128')

    The authority component may optionally include userinfo (assumed to be
    username:password):

    >>> _parse_proxy('joe:password@proxy.example.com')
    (None, 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('joe:password@proxy.example.com:3128')
    (None, 'joe', 'password', 'proxy.example.com:3128')

    Same examples, but with URLs instead:

    >>> _parse_proxy('http://proxy.example.com/')
    ('http', None, None, 'proxy.example.com')
    >>> _parse_proxy('http://proxy.example.com:3128/')
    ('http', None, None, 'proxy.example.com:3128')
    >>> _parse_proxy('http://joe:password@proxy.example.com/')
    ('http', 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    ('http', 'joe', 'password', 'proxy.example.com:3128')

    Everything after the authority is ignored:

    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    ('ftp', 'joe', 'password', 'proxy.example.com')

    Test for no trailing '/' case:

    >>> _parse_proxy('http://joe:password@proxy.example.com')
    ('http', 'joe', 'password', 'proxy.example.com')

    """
    scheme, rest = splittype(proxy)
    is_url = rest.startswith("/")
    if is_url and not rest.startswith("//"):
        raise ValueError("proxy URL with no authority: %r" % proxy)

    if is_url:
        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
        # and 3.3.), path is empty or starts with '/'
        slash = rest.find("/", 2)
        authority = rest[2:slash] if slash != -1 else rest[2:]
    else:
        # Plain authority: there is no real scheme
        scheme = None
        authority = proxy

    userinfo, hostport = splituser(authority)
    user, password = (None, None) if userinfo is None else splitpasswd(userinfo)
    return scheme, user, password, hostport
Esempio n. 49
0
 def get_type(self):
     """Return the URL type, computing and caching it on first use.

     Raises ValueError when the original URL has no type.
     """
     if self.type is not None:
         return self.type
     # First call: split the original URL and remember both parts.
     self.type, self.__r_type = splittype(self.__original)
     if self.type is None:
         raise ValueError("unknown url type: %s" % self.__original)
     return self.type