def test_PartialWrite(self):
    """Test recovery from previous crash w/ partial write"""
    q = Queue(self.path)
    for i in range(100):
        q.put('var%d' % i)
    del q
    # Append garbage to the tail chunk file to simulate data left behind by
    # a crash mid-write ('文字化け' is Japanese for "mojibake", i.e. garbled text).
    with open(os.path.join(self.path, 'q00000'), 'ab') as f:
        pickle.dump('文字化け', f)
    q = Queue(self.path)
    self.assertEqual(100, q.qsize())
    for i in range(100):
        self.assertEqual('var%d' % i, q.get())
        q.task_done()
    with self.assertRaises(Empty):
        q.get_nowait()
def test_RandomReadWrite(self):
    """Test random read/write"""
    q = Queue(self.path)
    n = 0
    for i in range(1000):
        if random.random() < 0.5:
            if n > 0:
                q.get_nowait()
                q.task_done()
                n -= 1
            else:
                with self.assertRaises(Empty):
                    q.get_nowait()
        else:
            q.put('var%d' % random.getrandbits(16))
            n += 1
def test_OpenCloseOneHundred(self):
    """Write 1000 items, close, reopen checking if all items are there"""
    q = Queue(self.path)
    for i in range(1000):
        q.put('var%d' % i)
    del q
    q = Queue(self.path)
    self.assertEqual(1000, q.qsize())
    for i in range(1000):
        data = q.get()
        self.assertEqual('var%d' % i, data)
        q.task_done()
    with self.assertRaises(Empty):
        q.get_nowait()
    # assert adding another one still works
    q.put('foobar')
    data = q.get()
def test_MultiThreaded(self):
    """Create consumer and producer threads, check parallelism"""
    q = Queue(self.path)

    def producer():
        for i in range(1000):
            q.put('var%d' % i)

    def consumer():
        for i in range(1000):
            q.get()
            q.task_done()

    c = Thread(target=consumer)
    c.start()
    p = Thread(target=producer)
    p.start()
    c.join()
    p.join()
    with self.assertRaises(Empty):
        q.get_nowait()
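
# --- Usage sketch (not part of the test suite) ---
# A minimal sketch of the file-backed Queue API the tests above exercise,
# assuming a persist-queue style interface: Queue(path) persists items on
# disk, task_done() acknowledges delivery, and get_nowait() raises Empty.
# The path 'demo_queue' and this function are hypothetical.

def queue_usage_sketch():
    q = Queue('demo_queue')
    q.put('hello')
    del q                      # simulate a crash: no explicit close

    q = Queue('demo_queue')    # reopen: items put before the "crash" survive
    item = q.get()
    q.task_done()              # acknowledge, so the item is not re-delivered
    try:
        q.get_nowait()
    except Empty:
        pass                   # queue drained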
class Crawler(object):
    def __init__(self, site):
        '''
        (Crawler, str) -> Crawler
        creates a Crawler with a given origin_url
        '''
        self.site = site
        self.filters = site.referringsitefilter_set.all()
        self.domain = urlparse(site.url).netloc
        # http://alexeyvishnevsky.com/2013/11/tips-on-optimizing-scrapy-for-a-high-performance/
        # fork of pybloom: https://github.com/joseph-fox/python-bloomfilter
        self.ignore_filter = ScalableBloomFilter(
            initial_capacity=10000000,
            error_rate=0.00001)

        ignore_filter_dir = '../ignore_filter/'
        if not os.path.exists(ignore_filter_dir):
            os.makedirs(ignore_filter_dir)

        # Make sure the per-site ignore file exists, then load any previously
        # seen urls back into the bloom filter.
        self.ignore_file_path = ignore_filter_dir + self.site.name + '_ignore_file.txt'
        if not os.path.exists(self.ignore_file_path):
            open(self.ignore_file_path, 'w+').close()

        with open(self.ignore_file_path, 'r+', buffering=False) as ignore_filter_file:
            try:
                for line in ignore_filter_file:
                    self.ignore_filter.add(line.decode('utf8').rstrip())
            except Exception as e:
                logging.info(str(e))
        self.visited_count = 0

        tmpqueuetmp_dir = '../tmpqueue/tmp/'
        if not os.path.exists(tmpqueuetmp_dir):
            os.makedirs(tmpqueuetmp_dir)

        slugified_name = slugify(unicode(site.name))
        tmpqueue_dir = '../tmpqueue/{}'.format(slugified_name)
        if not os.path.exists(tmpqueue_dir):
            os.makedirs(tmpqueue_dir)

        self.to_visit = Queue(tmpqueue_dir, tempdir=tmpqueuetmp_dir)

        # Initial url
        if not self.site.is_shallow:
            self.to_visit.put(site.url)
        else:
            self.to_visit.put((site.url, str(0)))

        # Limit
        self.limit = common.get_config()["crawler"]["limit"]
        # Specifies how deep the shallow crawler should go; "1" is the lowest option for this
        self.level = common.get_config()["crawler"]["level"]

        """
        self.probabilistic_n = common.get_config()["crawler"]["n"]
        self.probabilistic_k = common.get_config()["crawler"]["k"]

        self.db = psycopg2.connect(host='localhost',
                                   database=common.get_config()["crawler"]["postgresql"]["name"],
                                   user=common.get_config()["crawler"]["postgresql"]["user"],
                                   password=common.get_config()["crawler"]["postgresql"]["password"])
        self.cursor = self.db.cursor()
        self.already_added_urls = set()
        self.visited_table = "visited_" + str(site.id)
        self.tovisit_table = "tovisit_" + str(site.id)

        #self.cursor.execute("DROP TABLE IF EXISTS " + self.visited_table)
        #self.cursor.execute("CREATE TABLE " + self.visited_table + " (url VARCHAR(1024) PRIMARY KEY)")
        self.cursor.execute("DROP TABLE IF EXISTS " + self.tovisit_table)
        self.cursor.execute(u"CREATE TABLE " + self.tovisit_table + " (id SERIAL PRIMARY KEY, url VARCHAR(1024))")
        #self.cursor.execute(u"INSERT INTO " + self.visited_table + " VALUES (%s)", (site.url,))
        self.cursor.execute(u"INSERT INTO " + self.tovisit_table + " VALUES (DEFAULT, %s)", (site.url,))

        self.db.commit()
        """

    def __iter__(self):
        return self

    def next(self):
        '''
        (Crawler) -> newspaper.Article
        returns the next article in the sequence
        '''
        # standard non-recursive tree iteration
        with open(self.ignore_file_path, 'a') as ignore_filter_file:
            current_level = 0
            while True:
                if self.limit > 0 and self.visited_count > self.limit:
                    raise StopIteration('Limit reached: {:d}'.format(self.limit))
                # if(self.pages_visited > self.probabilistic_n):
                #     raise StopIteration
                # self.cursor.execute("SELECT * FROM " + self.tovisit_table + " ORDER BY id LIMIT 1")
                # row = self.cursor.fetchone()
                # if(row):
                #     row_id = row[0]
                #     current_url = row[1]
                #     self.cursor.execute("DELETE FROM " + self.tovisit_table + " WHERE id=%s", (row_id,))
                # else:
                #     raise StopIteration
                # if(self._should_skip()):
                #     logging.info(u"skipping {0} randomly".format(current_url))
                #     continue
                try:
                    if self.site.is_shallow:
                        current = self.to_visit.get_nowait()
                        current_url = current[0]
                        current_level = current[1]
                        logging.info(u"Shallow on level {0} {1}".format(current_level, current_url))
                    else:
                        current_url = self.to_visit.get_nowait()
                except Empty:
                    self.site.is_shallow = True  # On line 26 the site gets set TO DELETE
                    self.to_visit.put((self.site.url, str(0)))
                    self.ignore_filter = ScalableBloomFilter(
                        initial_capacity=10000000,
                        error_rate=0.00001)
                    ignore_filter_file.close()
                    os.remove(self.ignore_file_path)
                    logging.info("stopped iteration")
                    logging.info(u"{0}".format(self.site.url))
                    # Deliberate sentinel: tells the caller the queue was
                    # exhausted and the crawl restarted in shallow mode.
                    raise ZeroDivisionError

                logging.info(u"visiting {0}".format(current_url))
                self.visited_count += 1

                # use newspaper to download and parse the article
                article = ExplorerArticle(current_url)
                article.download()

                if self.site.is_shallow:
                    if int(current_level) > self.level:
                        continue

                # get urls from the article
                for link in article.get_links():
                    url = urljoin(current_url, link.href, False)
                    if self.url_in_filter(url, self.filters):
                        logging.info(u"skipping url \"{0}\" because it matches filter".format(url))
                        continue
                    try:
                        parsed_url = urlparse(url)
                        parsed_as_list = list(parsed_url)
                        if parsed_url.scheme != u"http" and parsed_url.scheme != u"https":
                            logging.info(u"skipping url with invalid scheme: {0}".format(url))
                            continue
                        parsed_as_list[5] = ''
                        url = urlunparse(urlnorm.norm_tuple(*parsed_as_list))
                    except Exception as e:
                        logging.info(u"skipping malformed url {0}. Error: {1}".format(url, str(e)))
                        continue
                    if not parsed_url.netloc.endswith(self.domain):
                        continue
                    # If the url has been added to the ignore list, skip
                    if url in self.ignore_filter:
                        continue
                    # Ignore the subscribe links for many domains, but allow
                    # urls where "subscribe" is only part of a larger token
                    # ("-subscribe" or "subscribe-")
                    if (u"subscribe" in url
                            and u"-subscribe" not in url
                            and u"subscribe-" not in url):
                        continue

                    # Append the url to the to_visit queue
                    if self.site.is_shallow:
                        self.to_visit.put((url, str(int(current_level) + 1)))
                        logging.info(u"added {0} to the to_visit as well as the level {1}".format(
                            url, str(int(current_level) + 1)))
                    else:
                        self.to_visit.put(url)
                        logging.info(u"added {0} to the to_visit".format(url))
                    # Append the url to visited to remove duplicates
                    self.ignore_filter.add(url)
                    ignore_filter_file.write(url.encode('utf8') + "\n")

                # Update the Queue
                self.to_visit.task_done()

                return article

    def url_in_filter(self, url, filters):
        """
        Checks if any of the filters matches the url.
        Filters can be in regex search or normal string comparison.
        """
        for filt in filters:
            if ((filt.regex and re.search(filt.pattern, url, re.IGNORECASE))
                    or (not filt.regex and filt.pattern in url)):
                return True
        return False
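
# --- Hypothetical driver (illustration only) ---
# Sketch of how the iterator above might be consumed. `site` is an assumed
# Django-style model instance with url/name/is_shallow fields, and
# `process` is a hypothetical downstream handler. The ZeroDivisionError
# raised in next() is treated here as the "queue exhausted, restarting in
# shallow mode" sentinel that the class itself raises.

def crawl_site(site):
    crawler = Crawler(site)
    while True:
        try:
            article = crawler.next()
        except StopIteration:
            break          # crawl limit reached
        except ZeroDivisionError:
            continue       # queue drained; crawler reseeded itself shallow
        process(article)   # hypothetical downstream handler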
class MyApp(QMainWindow):
    def __init__(self):
        super().__init__()
        self.ui = Ui_MainWindow()
        self.ui.setupUi(self)

        #self.Ctl_timer = QTimer()
        ##self.Ctl_timer.setSingleShot(True)
        #self.Ctl_timer.setInterval(70)
        #self.Ctl_timer.timeout.connect(self.Ctl_loop)
        #self.Ctl_timer.start()

        self.N = 1024 * 8
        self.fmt_r = 'Q' * self.N
        self.ip = "192.168.1.2"
        self.port_s = 8889
        self.port_r = 8888
        self.Int = 500        # unit us
        self.Scan = 5000000   # unit ms, setting trigger
        self.Sim = 50         # unit ns, setting simulation
        self.counter = 0

        # UNIT and DATA initiation
        self._socket_s = None
        self._socket_t = None
        self._unit_connected_to = None
        self._unit_connected = False

        self.q = Queue('tmp', maxsize=self.N * 2)
        #self.q = Queue('tmp')
        self.para_changed()
        self.threadpool = QThreadPool()

        self.line, = self.ui.widget.canvas.ax.plot([1], [1], 'b.', markersize=0.3)
        self.ui.widget.canvas.ax.set_ylim(-1, 1)
        self.ui.widget.canvas.ax.set_xlim(0, 100)
        self.ui.widget.canvas.draw()
        self.ui.statusbar.showMessage("Software started")

    def para_changed(self):
        self.ip = self.ui.IP_line.text()
        self.port_s = int(self.ui.Port_s_line.text())
        self.port_r = int(self.ui.Port_r_line.text())
        self.Int = int(self.ui.Int_line.text()) * 50
        self.Scan = int(self.ui.Scan_line.text()) * 50000
        self.Sim = int(int(self.ui.Sim_line.text()) / 20)
        if self._unit_connected:
            self.Pls_control()

    def Pls_control(self):
        self.Pls_control_handler()

    def Pls_control_handler(self):
        fmt_tran = 'I' * 3
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as self._socket_s:
            tran_raw = struct.pack(fmt_tran, self.Int, self.Scan, self.Sim)
            try:
                self._socket_s.connect((self.ip, self.port_s))
                self._socket_s.sendall(tran_raw)
                self._socket_s.settimeout(1.0)
                mark = self._socket_s.recv(4)
                if mark == b'':
                    # empty read: peer closed before acknowledging
                    return 0
            except Exception:
                self.ui.statusbar.showMessage("control data send failed")
                return 0
            self.ui.statusbar.showMessage("control data sent")
            return 1

    def server_connect(self):
        if not self._unit_connected:
            try:
                self._socket_r = socket.socket(socket.AF_INET, socket.SOCK_STREAM)  # instantiate
                self._socket_r.connect((self.ip, self.port_r))                      # connect to the server
                #self._socket_s = socket.socket()                # instantiate
                #self._socket_s.connect((self.ip, self.port_s))  # connect to the server
                self.ui.IP_line.setEnabled(False)
                self.ui.Port_s_line.setEnabled(False)
                self.ui.Port_r_line.setEnabled(False)
                self.ui.Connect_button.setText("Disconnect")
                self._unit_connected_to = True
                self._unit_connected = True
                self.ui.statusbar.showMessage("server connected")
                self.Pls_control()
            except Exception as ex:
                self._unit_connected_to = False
                self._unit_connected = False
                self.ui.statusbar.showMessage(str(ex))
                #self.ui.statusbar.showMessage("server connect error!")
                self.ui.IP_line.setEnabled(True)
                self.ui.Port_s_line.setEnabled(True)
                self.ui.Port_r_line.setEnabled(True)
                self.ui.Connect_button.setText("Connect")
                return  # do not start the workers if the connect failed

            # Any other args, kwargs are passed to the run function
            worker_recv = Worker(self.Ctl_loop)
            worker_plot = Worker(self.plot_loop)
            #worker.signals.result.connect(self.print_output)
            #worker.signals.finished.connect(self.thread_complete)
            #worker.signals.progress.connect(self.progress_fn)
            self.threadpool.start(worker_recv)
            self.threadpool.start(worker_plot)
        else:
            self.counter = 0
            self.threadpool.releaseThread()
            self.threadpool.releaseThread()
            #self._socket_r.close()
            #self._socket_s.close()
            self.ui.IP_line.setEnabled(True)
            self.ui.Port_s_line.setEnabled(True)
            self.ui.Port_r_line.setEnabled(True)
            self.ui.Connect_button.setText("Connect")
            self._unit_connected_to = False
            self._unit_connected = False
            self.ui.statusbar.showMessage("server disconnected")

    def plot_loop(self):
        si = 2147483648
        while self._unit_connected:
            while self._unit_connected and self.q.qsize() > 1:
                x = []
                y = []
                tmp = 1
                # collect samples until the marker bit (>= si) shows up
                while tmp <= si:
                    try:
                        raw = self.q.get_nowait()
                    except Exception as e:
                        self.ui.statusbar.showMessage(str(e))
                        break
                    ar = struct.unpack('II', struct.pack('Q', raw))
                    x_tmp = ar[0]
                    tmp = ar[1]
                    x.append(x_tmp)
                    y.append(tmp)
                if not y:
                    continue
                y[-1] = y[-1] - si
                x = np.array(x) * 20e-6
                self.ui.widget.canvas.ax.set_ylim(0, np.mean(y) * 1.2)
                self.ui.widget.canvas.ax.set_xlim(0, self.Scan * 20e-6)
                self.line.set_ydata(y)
                self.line.set_xdata(x)
                self.ui.widget.canvas.draw()
                time.sleep((self.Scan * 20e-9) * 0.5)
                self.ui.widget.canvas.flush_events()
                self.ui.statusbar.showMessage("data plotting, " + str(self.counter))
                self.counter = self.counter + 1
        #self.Plot_timer.start()

    def Ctl_loop(self):
        while self._unit_connected:
            if self._unit_connected_to:
                self.ui.statusbar.showMessage("data receiving")
                data_raw = self._socket_r.recv(8 * self.N, socket.MSG_WAITALL)  # receive response
            else:
                self._socket_r = socket.socket(socket.AF_INET, socket.SOCK_STREAM)  # instantiate
                self._socket_r.connect((self.ip, self.port_r))                      # reconnect to the server
                self.ui.statusbar.showMessage("data receiving")
                data_raw = self._socket_r.recv(8 * self.N, socket.MSG_WAITALL)  # receive response
            self.ui.statusbar.showMessage("data received")
            data = struct.unpack(self.fmt_r, data_raw)
            #self.plot_data(data)
            try:
                list(map(self.q.put_nowait, data))
            except Exception as e:
                self.ui.statusbar.showMessage(str(e))
        self._socket_r.close()
        self._unit_connected_to = False
        #self.Ctl_timer.start()

    def plot_data(self, data):
        si = 2147483648
        x = []
        y = []
        for i in range(self.N):
            x.append(data[i * 2])
            y.append(data[i * 2 + 1])
        y = np.array(y)
        I = y > si
        y[I] = y[I] - si
        x = np.array(x) * 20e-6
        ##self.ui.statusbar.showMessage("data plotting, " + str(self.counter))
        ##self.counter = self.counter + 1
        self.ui.widget.canvas.ax.set_ylim(0, np.max(y) * 1.2)
        self.ui.widget.canvas.ax.set_xlim(0, self.Scan * 20e-6)
        ##self.line.set_ydata(y)
        ##self.line.set_xdata(x)
        ##self.ui.widget.canvas.draw()
        ##self.ui.widget.canvas.flush_events()
        II = np.where(np.diff(x) < 0)[0]
        III = [0]
        for i in II:
            III.append(i)
        III.append(len(y) - 1)
        sl = len(II) + 1
        print(II)
        for i in range(sl):
            xx = x[III[i]:III[i + 1]]
            yy = y[III[i]:III[i + 1]]
            self.line.set_ydata(y)
            self.line.set_xdata(x)
            self.ui.widget.canvas.draw()
            self.ui.widget.canvas.flush_events()
        self.ui.statusbar.showMessage("data plotting, " + str(self.counter))
        self.counter = self.counter + 1
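
# --- Minimal Worker sketch ---
# MyApp above schedules Ctl_loop and plot_loop through a Worker class that
# is not shown in this snippet. A common minimal QRunnable wrapper looks
# like this (assuming PyQt5; the signals machinery referenced in the
# commented-out lines is omitted here).

from PyQt5.QtCore import QRunnable

class Worker(QRunnable):
    def __init__(self, fn, *args, **kwargs):
        super().__init__()
        self.fn = fn
        self.args = args
        self.kwargs = kwargs

    def run(self):
        # executed on a QThreadPool thread
        self.fn(*self.args, **self.kwargs)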
class Rotator(object):
    def __init__(self, config):
        self.config = config
        self.dateformat = config['dateformat']
        self.keep_files = int(config['rotate'])
        self.now = datetime.datetime.now()
        self.dateext = self.now.strftime(self.dateformat)
        self.mode = config['mode']
        self.compress = config['compress']
        self.user = config['user']
        self.group = config['group']
        self.sharedscripts = config['sharedscripts']
        self.destext = config['destext']
        self.copy = config['copy']
        self.copytohdfs = config['copytohdfs']
        self.prerotates = config['prerotate']
        self.postrotates = config['postrotate']
        self.hdfs_config = config['hdfs']
        self.queuepath = config['queuepath']
        self.queue_chunksize = 1000
        self.queue_block_timeout = 30
        self.queue = Queue(self.queuepath, self.queue_chunksize)
        self.client = None
        if self.hdfs_config:
            self.client = hdfs.InsecureClient(**self.hdfs_config)

    def get_rotated_dir(self, path):
        destext = self.now.strftime(self.destext)
        dest_dir = '{}-{}'.format(path, destext)
        return dest_dir

    def get_rotated_time(self, dest_path):
        dateext = dest_path.rsplit('-', 1)[-1]
        # remove gz ext
        dateext = dateext.split('.')[0]
        return datetime.datetime.strptime('-{}'.format(dateext), self.dateformat)

    def is_rotated_file(self, dest_path):
        try:
            t = self.get_rotated_time(dest_path)
            return bool(t)
        except:
            return False

    def get_dest_path(self, path):
        rotated_dir = self.get_rotated_dir(path)
        filename = os.path.split(path)[-1]
        dest_path = os.path.join(rotated_dir, '{}{}'.format(filename, self.dateext))
        return dest_path

    def remove_old_files(self, path):
        rotated_dir = self.get_rotated_dir(path)
        filename = os.path.split(path)[-1]
        path = os.path.join(rotated_dir, filename)
        glob_path = '{}-*'.format(path)
        files = [f for f in glob.glob(glob_path) if self.is_rotated_file(f)]
        files.sort(key=self.get_rotated_time, reverse=True)
        for f in files[self.keep_files:]:
            os.remove(f)

    def create_rotated_dir(self, path):
        rotated_dir = self.get_rotated_dir(path)
        makedirs(rotated_dir, 0755)
        chown(rotated_dir, self.user, self.group)

    def rename_file(self, path):
        self.create_rotated_dir(path)
        dest_path = self.get_dest_path(path)
        shutil.move(path, dest_path)
        self.queue.put((path, dest_path), timeout=self.queue_block_timeout)
        os.chmod(dest_path, self.mode)
        chown(dest_path, self.user, self.group)
        return dest_path

    def compress_file(self, dest_path):
        gzip(dest_path)
        return '{}.gz'.format(dest_path)

    def _copy_file(self, path, from_, to):
        if not to:
            return
        dest = os.path.normpath(path.replace(from_, to))
        dest_dir = os.path.dirname(dest)
        if not os.path.exists(dest_dir):
            makedirs(dest_dir, 0755)
            chown(dest_dir, self.user, self.group)
        if path.startswith(from_):
            shutil.copy(path, dest)

    def copy_file(self, dest_path):
        if isinstance(self.copy, dict):
            self.copy = [self.copy]
        for item in self.copy:
            to = item.get('to')
            from_ = item.get('from', '')
            self._copy_file(dest_path, from_, to)

    def _copy_to_hdfs(self, client, path, from_, to):
        if not to:
            return
        dest = os.path.normpath(path.replace(from_, to))
        if path.startswith(from_):
            client.upload(dest, path, overwrite=True, cleanup=True)

    def copy_to_hdfs(self, path):
        if not (self.copytohdfs and self.hdfs_config):
            return
        for item in self.copytohdfs:
            to = item.get('to')
            from_ = item.get('from', '')
            self._copy_to_hdfs(self.client, path, from_, to)

    def secure_copy(self):
        to_be_clean = set()
        while True:
            try:
                path, rotated_path = self.queue.get_nowait()
                rotated_path_before = rotated_path
                if not os.path.exists(rotated_path):
                    self.queue.task_done()
                    continue
                if self.compress:
                    rotated_path = self.compress_file(rotated_path)
                if self.copy:
                    self.copy_file(rotated_path)
                if self.copytohdfs:
                    self.copy_to_hdfs(rotated_path)
                self.queue.task_done()
                if self.compress:
                    os.remove(rotated_path_before)
                to_be_clean.add(path)
            except Empty:
                break
            except Exception as e:
                print e
        for path in to_be_clean:
            self.remove_old_files(path)

    def rotate(self):
        if self.sharedscripts:
            self.prerotate()
        for f in iterate_log_paths(self.config['paths']):
            if not self.sharedscripts:
                self.prerotate()
            self.rename_file(f)
            if not self.sharedscripts:
                self.postrotate()
        if self.sharedscripts:
            self.postrotate()
        self.secure_copy()

    def prerotate(self):
        for cmd in self.prerotates:
            run(cmd)

    def postrotate(self):
        for cmd in self.postrotates:
            run(cmd)
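
# --- Hypothetical helper sketch ---
# Both this Rotator and the revised version below call makedirs, chown,
# gzip, run, iterate_log_paths and is_empty_file, none of which are defined
# in these snippets. Minimal Python 3 stand-ins with the signatures the call
# sites imply; the names are real but every implementation detail here is an
# assumption.

import glob
import grp
import os
import pwd
import shutil
import subprocess


def makedirs(path, mode):
    # mkdir -p that tolerates an already existing directory
    os.makedirs(path, mode, exist_ok=True)

def chown(path, user, group):
    os.chown(path, pwd.getpwnam(user).pw_uid, grp.getgrnam(group).gr_gid)

def gzip(path):
    # compress path to path.gz and remove the original
    import gzip as gzip_mod
    with open(path, 'rb') as src, gzip_mod.open(path + '.gz', 'wb') as dst:
        shutil.copyfileobj(src, dst)
    os.remove(path)

def run(cmd):
    subprocess.check_call(cmd, shell=True)

def iterate_log_paths(paths):
    for pattern in paths:
        for path in glob.glob(pattern):
            yield path

def is_empty_file(path):
    return os.path.getsize(path) == 0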
class Rotator(object):
    def __init__(self, config):
        self.paths = config['paths']
        self.mode = int(config['mode'], 8)
        self.user = config['user']
        self.group = config['group']
        # FIXME: Handle rotated files keeping correctly
        # self.keep_files = int(config['rotate'])
        self.compress = config['compress']
        self.copy = config['copy']
        self.copytohdfs = config['copytohdfs']
        self.hdfs_config = config['hdfs']
        self.hdfs_client = None
        if self.hdfs_config:
            self.hdfs_client = hdfs.InsecureClient(**self.hdfs_config)
        self.dateformat = config['dateformat']
        self.now = datetime.datetime.now()
        self.timestamp = self.now.strftime(self.dateformat)
        self.destext = config['destext']
        self.fnformat = config['fnformat']
        if not self.fnformat:
            raise ValueError("'fnformat' cannot be empty")
        self.sharedscripts = config['sharedscripts']
        self.prerotates = config['prerotate']
        self.postrotates = config['postrotate']
        self.queuepath = config['queuepath']
        self.queue_chunksize = 1000
        self.queue_block_timeout = 30
        self.queue = Queue(self.queuepath, self.queue_chunksize)

    def get_rotated_dir(self, path):
        destext = self.now.strftime(self.destext)
        dest_dir = '{}-{}'.format(path, destext)
        return dest_dir

    def get_dest_path(self, path):
        rotated_dir = self.get_rotated_dir(path)
        logname = os.path.basename(path)
        dest_path = os.path.join(
            rotated_dir,
            self.fnformat.format(logname=logname,
                                 timestamp=self.timestamp,
                                 hostname=socket.gethostname()))
        return dest_path

    def create_rotated_dir(self, path):
        rotated_dir = self.get_rotated_dir(path)
        makedirs(rotated_dir, 0o755)
        chown(rotated_dir, self.user, self.group)

    def rename_file(self, path):
        self.create_rotated_dir(path)
        dest_path = self.get_dest_path(path)
        shutil.move(path, dest_path)
        self.queue.put((path, dest_path), timeout=self.queue_block_timeout)
        os.chmod(dest_path, self.mode)
        chown(dest_path, self.user, self.group)
        return dest_path

    def compress_file(self, dest_path):
        gzip(dest_path)
        return '{}.gz'.format(dest_path)

    def _copy_file(self, path, from_, to):
        if not to:
            return
        dest = os.path.normpath(path.replace(from_, to))
        dest_dir = os.path.dirname(dest)
        if not os.path.exists(dest_dir):
            makedirs(dest_dir, 0o755)
            chown(dest_dir, self.user, self.group)
        if path.startswith(from_):
            shutil.copy2(path, dest)

    def copy_file(self, dest_path):
        if isinstance(self.copy, dict):
            self.copy = [self.copy]
        for item in self.copy:
            to = item.get('to')
            from_ = item.get('from', '')
            self._copy_file(dest_path, from_, to)

    def _copy_to_hdfs(self, client, path, from_, to):
        if not to:
            return
        dest = os.path.normpath(path.replace(from_, to))
        if path.startswith(from_):
            client.upload(dest, path, overwrite=True, cleanup=True)

    def copy_to_hdfs(self, path):
        if not (self.copytohdfs and self.hdfs_config):
            return
        for item in self.copytohdfs:
            to = item.get('to')
            from_ = item.get('from', '')
            self._copy_to_hdfs(self.hdfs_client, path, from_, to)

    def secure_copy(self):
        while True:
            try:
                path, rotated_path = self.queue.get_nowait()
                rotated_path_before = rotated_path
                if not os.path.exists(rotated_path):
                    self.queue.task_done()
                    continue
                if self.compress:
                    rotated_path = self.compress_file(rotated_path)
                if self.copy:
                    self.copy_file(rotated_path)
                if self.copytohdfs:
                    self.copy_to_hdfs(rotated_path)
                if self.compress:
                    os.remove(rotated_path_before)
                self.queue.task_done()
            except Empty:
                break
            except Exception as e:
                print(e)
                raise

    def rotate(self):
        if self.sharedscripts:
            self.prerotate()
        for f in iterate_log_paths(self.paths):
            if is_empty_file(f):
                continue
            if not self.sharedscripts:
                self.prerotate()
            self.rename_file(f)
            if not self.sharedscripts:
                self.postrotate()
        if self.sharedscripts:
            self.postrotate()
        self.secure_copy()

    def prerotate(self):
        for cmd in self.prerotates:
            run(cmd)

    def postrotate(self):
        for cmd in self.postrotates:
            run(cmd)
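
# --- Hypothetical configuration / invocation sketch ---
# The key set below mirrors exactly what the second Rotator.__init__ reads;
# every concrete value is illustrative only, not a documented default.

example_config = {
    'paths': ['/var/log/myapp/*.log'],
    'mode': '0644',                 # parsed with int(mode, 8)
    'user': 'logrotate',
    'group': 'logrotate',
    'compress': True,
    'copy': [],                     # list of {'from': ..., 'to': ...} dicts
    'copytohdfs': [],
    'hdfs': None,                   # dict of hdfs.InsecureClient kwargs, or None
    'dateformat': '-%Y%m%d',
    'destext': 'rotated',
    'fnformat': '{logname}{timestamp}.{hostname}',
    'sharedscripts': True,
    'prerotate': [],
    'postrotate': [],
    'queuepath': '/var/lib/logrotator/queue',
}

# Rotator(example_config).rotate()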