def run(self):
    """Download this worker's byte range of ``self.url`` into ``self.filename``.

    The download resumes from whatever is already on disk: the current size
    of ``self.filename`` is stored in ``self.downloaded`` and the request
    starts at ``self.ranges[0] + self.downloaded`` (``self.start_pointer``).
    If that position has already reached ``self.ranges[1]`` a message is
    printed and nothing is requested.

    Otherwise a ranged GET is issued and the body is read
    ``self.once_buffer`` bytes at a time; every chunk is appended to the
    file and added to ``self.downloaded``.  The connection is closed on
    every exit path, including errors.
    """
    try:
        self.downloaded = os.path.getsize(self.filename)
    except OSError:
        # The size could not be read (e.g. no partial file yet): start at 0.
        self.downloaded = 0

    self.start_pointer = self.ranges[0] + self.downloaded
    if self.start_pointer >= self.ranges[1]:
        print('%s has been over.' % self.filename)
        return

    url_parts = urllib.splittype(self.url)
    host, res = urllib.splithost(url_parts[1])

    h = HTTP()
    try:
        h.connect(host)
        h.putrequest('GET', res)
        h.putheader('Host', host)
        h.putheader('Range', "bytes=%d-%d" % (self.start_pointer, self.ranges[1]))
        h.endheaders()
        # NOTE(review): this reaches into the private ``_conn`` attribute to
        # get a response object with a sized read(); the status code is not
        # checked, so a server that ignores the Range header is not detected.
        response = h._conn.getresponse()
        data = response.read(self.once_buffer)
        while data:
            # The file is opened and closed around each chunk, so the chunk
            # is flushed to disk before ``self.downloaded`` is advanced.
            with open(self.filename, 'ab+') as file_opener:
                file_opener.write(data)
            self.downloaded += len(data)
            data = response.read(self.once_buffer)
    finally:
        # Release the socket even when connecting, reading or writing fails.
        h.close()
def get_redirect_url(url):
    """Return the ``location`` header of the reply to a HEAD request for *url*.

    The host and resource path are split out of *url*, a HEAD request is
    sent, and the value stored under ``'location'`` in the reply headers is
    returned.  The lookup is unguarded: a reply without that header (or
    without headers at all) propagates the error raised by the lookup.
    """
    url_parts = urllib.splittype(url)
    host, res = urllib.splithost(url_parts[1])

    h = HTTP()
    try:
        h.connect(host)
        h.putrequest('HEAD', res)
        # Send the Host header explicitly, as the other request helpers in
        # this file do; name-based virtual hosts cannot route without it.
        h.putheader('Host', host)
        h.endheaders()
        status, reason, headers = h.getreply()
        return headers['location']
    finally:
        # The value is extracted above, so the socket can always be released.
        h.close()
def get_file_size(url):
    """Return the ``Content-Length`` reported for *url*, as a float.

    A HEAD request is sent to the host named in *url* and the
    ``Content-Length`` reply header is converted with ``float()``.  The
    lookup is unguarded: a reply without that header propagates the error
    raised by the lookup or the conversion.
    """
    url_parts = urllib.splittype(url)
    host, res = urllib.splithost(url_parts[1])

    h = HTTP()
    try:
        h.connect(host)
        h.putrequest('HEAD', res)
        h.putheader('Host', host)
        h.endheaders()
        status, reason, headers = h.getreply()
        return float(headers['Content-Length'])
    finally:
        # Release the socket on success and on failure alike.
        h.close()
def main():
    """Run the threaded crawl from the start category and save the results.

    Seeds the ``qcs`` queue with the start URL and its alias, starts ten
    threads running ``worker`` (threads with an odd index are given ``qcs``
    as their first argument, the others ``qns``), waits for all of them,
    then pickles the two shared result dicts to ``CTS.db`` and ``ITS.db``
    and prints them together with the elapsed time.
    """
    cur_url = "http://zh.wikipedia.org/wiki/Category:%E9%A0%81%E9%9D%A2%E5%88%86%E9%A1%9E"
    # Alternative start page:
    # "http://zh.wikipedia.org/wiki/Wikipedia:%E5%88%86%E9%A1%9E%E7%B4%A2%E5%BC%95"
    cur_alias = "分类"

    cts = {}
    # ItemName(Wiki Item URL) :
    #   ([Alias1, Alias2, Alias3, ...],
    #    [InItemName1, InItemName, ..],
    #    [OutItemName1, OutItemName2, ..])
    its = {}

    # This connection is never used for a request.  It is kept so that an
    # unreachable host still fails here, before any thread is started, and
    # it is closed straight away instead of being left open for the whole
    # crawl.
    h = HTTP()
    try:
        h.connect("zh.wikipedia.org")
    finally:
        h.close()

    qcs = Queue.Queue()
    qns = Queue.Queue()
    qcs.put({"url": cur_url, "alias": cur_alias})

    tds = [
        Thread(target=worker, args=(qcs if i % 2 else qns, qcs, qns, cts, its))
        for i in range(10)
    ]

    t_start = time.time()
    for td in tds:
        td.start()
        # NOTE(review): the pause is assumed to belong inside this loop
        # (staggered start-up, 5 s per thread) -- confirm against the
        # intended layout.
        time.sleep(5)
    for td in tds:
        td.join()

    with open("CTS.db", "wb") as f:
        pickle.dump(cts, f)
    with open("ITS.db", "wb") as f:
        pickle.dump(its, f)

    print("CTS: %s" % (cts,))
    print("___________________________")
    print("ITEMS: %s" % (its,))
    print("Coust Time: %s" % (time.time() - t_start))
def __call__(self, *args, **kw): method = self.method if method == 'PUT' and len(args) == 1 and not kw: query = [args[0]] args = () else: query = [] for i in range(len(args)): try: k = self.args[i] if kw.has_key(k): raise TypeError, 'Keyword arg redefined' kw[k] = args[i] except IndexError: raise TypeError, 'Too many arguments' headers = {} for k, v in self.headers.items(): headers[translate(k, dashtrans)] = v method = self.method if headers.has_key('Content-Type'): content_type = headers['Content-Type'] if content_type == 'multipart/form-data': return self._mp_call(kw) else: content_type = None if not method or method == 'POST': for v in kw.values(): if hasattr(v, 'read'): return self._mp_call(kw) can_marshal = type2marshal.has_key for k, v in kw.items(): t = type(v) if can_marshal(t): q = type2marshal[t](k, v) else: q = '%s=%s' % (k, quote(v)) query.append(q) url = self.rurl if query: query = join(query, '&') method = method or 'POST' if method == 'PUT': headers['Content-Length'] = str(len(query)) if method != 'POST': url = "%s?%s" % (url, query) query = '' elif not content_type: headers['Content-Type'] = 'application/x-www-form-urlencoded' headers['Content-Length'] = str(len(query)) else: method = method or 'GET' if (self.username and self.password and not headers.has_key('Authorization')): headers['Authorization'] = ("Basic %s" % replace( encodestring('%s:%s' % (self.username, self.password)), '\012', '')) try: h = HTTP() h.connect(self.host, self.port) h.putrequest(method, self.rurl) for hn, hv in headers.items(): h.putheader(translate(hn, dashtrans), hv) h.endheaders() if query: h.send(query) ec, em, headers = h.getreply() response = h.getfile().read() except: raise NotAvailable, RemoteException(NotAvailable, sys.exc_info()[1], self.url, query) if (ec - (ec % 100)) == 200: return (headers, response) self.handleError(query, ec, em, headers, response)
def __call__(self,*args,**kw): method=self.method if method=='PUT' and len(args)==1 and not kw: query=[args[0]] args=() else: query=[] for i in range(len(args)): try: k=self.args[i] if kw.has_key(k): raise TypeError, 'Keyword arg redefined' kw[k]=args[i] except IndexError: raise TypeError, 'Too many arguments' headers={} for k, v in self.headers.items(): headers[translate(k,dashtrans)]=v method=self.method if headers.has_key('Content-Type'): content_type=headers['Content-Type'] if content_type=='multipart/form-data': return self._mp_call(kw) else: content_type=None if not method or method=='POST': for v in kw.values(): if hasattr(v,'read'): return self._mp_call(kw) can_marshal=type2marshal.has_key for k,v in kw.items(): t=type(v) if can_marshal(t): q=type2marshal[t](k,v) else: q='%s=%s' % (k,quote(v)) query.append(q) url=self.rurl if query: query=join(query,'&') method=method or 'POST' if method == 'PUT': headers['Content-Length']=str(len(query)) if method != 'POST': url="%s?%s" % (url,query) query='' elif not content_type: headers['Content-Type']='application/x-www-form-urlencoded' headers['Content-Length']=str(len(query)) else: method=method or 'GET' if (self.username and self.password and not headers.has_key('Authorization')): headers['Authorization']=( "Basic %s" % replace(encodestring('%s:%s' % (self.username,self.password)), '\012','')) try: h=HTTP() h.connect(self.host, self.port) h.putrequest(method, self.rurl) for hn,hv in headers.items(): h.putheader(translate(hn,dashtrans),hv) h.endheaders() if query: h.send(query) ec,em,headers=h.getreply() response =h.getfile().read() except: raise NotAvailable, RemoteException( NotAvailable,sys.exc_info()[1],self.url,query) if (ec - (ec % 100)) == 200: return (headers,response) self.handleError(query, ec, em, headers, response)
def __call__(self, *args, **kw): method = self.method if method == "PUT" and len(args) == 1 and not kw: query = [args[0]] args = () else: query = [] for i in range(len(args)): try: k = self.args[i] if kw.has_key(k): raise TypeError, "Keyword arg redefined" kw[k] = args[i] except IndexError: raise TypeError, "Too many arguments" headers = {} for k, v in self.headers.items(): headers[translate(k, dashtrans)] = v method = self.method if headers.has_key("Content-Type"): content_type = headers["Content-Type"] if content_type == "multipart/form-data": return self._mp_call(kw) else: content_type = None if not method or method == "POST": for v in kw.values(): if hasattr(v, "read"): return self._mp_call(kw) can_marshal = type2marshal.has_key for k, v in kw.items(): t = type(v) if can_marshal(t): q = type2marshal[t](k, v) else: q = "%s=%s" % (k, quote(v)) query.append(q) url = self.rurl if query: query = join(query, "&") method = method or "POST" if method == "PUT": headers["Content-Length"] = str(len(query)) if method != "POST": url = "%s?%s" % (url, query) query = "" elif not content_type: headers["Content-Type"] = "application/x-www-form-urlencoded" headers["Content-Length"] = str(len(query)) else: method = method or "GET" if self.username and self.password and not headers.has_key("Authorization"): headers["Authorization"] = "Basic %s" % gsub( "\012", "", encodestring("%s:%s" % (self.username, self.password)) ) try: h = HTTP() h.connect(self.host, self.port) h.putrequest(method, self.rurl) for hn, hv in headers.items(): h.putheader(translate(hn, dashtrans), hv) h.endheaders() if query: h.send(query) ec, em, headers = h.getreply() response = h.getfile().read() except: raise NotAvailable, RemoteException(NotAvailable, sys.exc_value, self.url, query) if ec == 200: return (headers, response) self.handleError(query, ec, em, headers, response)