def test_PartialWrite(self):
        """Test recovery from previous crash w/ partial write"""

        q = Queue(self.path)
        for i in range(100):
            q.put('var%d' % i)
        del q
        with open(os.path.join(self.path, 'q00000'), 'ab') as f:
            pickle.dump('文字化け', f)
        q = Queue(self.path)
        self.assertEqual(100, q.qsize())
        for i in range(100):
            self.assertEqual('var%d' % i, q.get())
            q.task_done()
        with self.assertRaises(Empty):
            q.get_nowait()
    def test_RandomReadWrite(self):
        """Test random read/write"""

        q = Queue(self.path)
        n = 0
        for i in range(1000):
            if random.random() < 0.5:
                if n > 0:
                    q.get_nowait()
                    q.task_done()
                    n -= 1
                else:
                    with self.assertRaises(Empty):
                        q.get_nowait()
            else:
                q.put('var%d' % random.getrandbits(16))
                n += 1
    def test_OpenCloseOneHundred(self):
        """Write 1000 items, close, reopen checking if all items are there"""

        q = Queue(self.path)
        for i in range(1000):
            q.put('var%d' % i)
        del q
        q = Queue(self.path)
        self.assertEqual(1000, q.qsize())
        for i in range(1000):
            data = q.get()
            self.assertEqual('var%d' % i, data)
            q.task_done()
        with self.assertRaises(Empty):
            q.get_nowait()
        # assert adding another one still works
        q.put('foobar')
        data = q.get()
    def test_MultiThreaded(self):
        """Create consumer and producer threads, check parallelism"""

        q = Queue(self.path)
        def producer():
            for i in range(1000):
                q.put('var%d' % i)

        def consumer():
            for i in range(1000):
                q.get()
                q.task_done()

        c = Thread(target = consumer)
        c.start()
        p = Thread(target = producer)
        p.start()
        c.join()
        p.join()
        with self.assertRaises(Empty):
            q.get_nowait()
Exemple #5
0
class Crawler(object):
    def __init__(self, site):
        '''
        (Crawler, str) -> Crawler
        creates a Crawler with a given origin_url
        '''
        self.site = site
        self.filters = site.referringsitefilter_set.all()
        self.domain = urlparse(site.url).netloc
        # http://alexeyvishnevsky.com/2013/11/tips-on-optimizing-scrapy-for-a-high-performance/
        # fork of pybloom: https://github.com/joseph-fox/python-bloomfilter
        self.ignore_filter = ScalableBloomFilter(initial_capacity=10000000,
                                                 error_rate=0.00001)
        ignore_filter_dir = '../ignore_filter/'
        if not os.path.exists(ignore_filter_dir):
            os.makedirs(ignore_filter_dir)
            self.ignore_filter = ScalableBloomFilter(initial_capacity=10000000,
                                                     error_rate=0.00001)
            try:
                f = open(
                    '../ignore_filter/' + self.site.name + '_ignore_file.txt',
                    'r+')
                f.write(self.ignore_filter)
            except IOError:
                f = open(
                    '../ignore_filter/' + self.site.name + '_ignore_file.txt',
                    'w+')
            f.close()
        else:
            if (not (os.path.exists('../ignore_filter/' + self.site.name +
                                    '_ignore_file.txt'))):
                f = open(
                    '../ignore_filter/' + self.site.name + '_ignore_file.txt',
                    'w+')
                f.close()

            with open('../ignore_filter/' + self.site.name +
                      '_ignore_file.txt',
                      'r+',
                      buffering=False) as ignore_filter_file:
                try:
                    for line in ignore_filter_file:
                        self.ignore_filter.add(line.decode('utf8').rstrip())
                except Exception as e:
                    logging.info(str(e))
            ignore_filter_file.close()
        self.visited_count = 0

        tmpqueuetmp_dir = '../tmpqueue/tmp/'
        if not os.path.exists(tmpqueuetmp_dir):
            os.makedirs(tmpqueuetmp_dir)

        slugified_name = slugify(unicode(site.name))
        tmpqueue_dir = '../tmpqueue/{}'.format(slugified_name)
        if not os.path.exists(tmpqueue_dir):
            os.makedirs(tmpqueue_dir)

        self.to_visit = Queue(tmpqueue_dir, tempdir=tmpqueuetmp_dir)

        # Initial url
        if (self.site.is_shallow == False):
            self.to_visit.put(site.url)
        else:
            self.to_visit.put((site.url, str(0)))

        # Limit
        self.limit = common.get_config()["crawler"]["limit"]
        # Specifies how deep the shallow crawler should go; "1" is the lowest option for this
        self.level = common.get_config()["crawler"]["level"]
        """
        self.probabilistic_n = common.get_config()["crawler"]["n"]
        self.probabilistic_k = common.get_config()["crawler"]["k"]

        self.db = psycopg2.connect(host='localhost',
                                   database=common.get_config()["crawler"]["postgresql"]["name"],
                                   user=common.get_config()["crawler"]["postgresql"]["user"],
                                   password=common.get_config()["crawler"]["postgresql"]["password"])

        self.cursor = self.db.cursor()
        self.already_added_urls = set()
        self.visited_table = "visited_" + str(site.id)
        self.tovisit_table = "tovisit_" + str(site.id)

        #self.cursor.execute("DROP TABLE IF EXISTS " + self.visited_table)
        #self.cursor.execute("CREATE TABLE " + self.visited_table + " (url VARCHAR(1024) PRIMARY KEY)")
        self.cursor.execute("DROP TABLE IF EXISTS " + self.tovisit_table)
        self.cursor.execute(u"CREATE TABLE " + self.tovisit_table + " (id SERIAL PRIMARY KEY, url VARCHAR(1024))")

        #self.cursor.execute(u"INSERT INTO " + self.visited_table + " VALUES (%s)", (site.url,))
        self.cursor.execute(u"INSERT INTO " + self.tovisit_table + " VALUES (DEFAULT, %s)", (site.url,))

        self.db.commit()
        """

    def __iter__(self):
        return self

    def next(self):
        '''
        (Crawler) -> newspaper.Article
        returns the next article in the sequence
        '''

        #standard non-recursive tree iteration
        with open('../ignore_filter/' + self.site.name + '_ignore_file.txt',
                  'a') as ignore_filter_file:
            try:
                current_level = 0
                while (True):
                    if (self.limit > 0 and self.visited_count > self.limit):
                        raise StopIteration('Limit reached: {:d}'.format(
                            self.limit))
                    # if(self.pages_visited > self.probabilistic_n):
                    #     raise StopIteration
                    # self.cursor.execute("SELECT * FROM " + self.tovisit_table + " ORDER BY id LIMIT 1")
                    # row = self.cursor.fetchone()
                    # if(row):
                    #     row_id = row[0]
                    #     current_url = row[1]
                    #     self.cursor.execute("DELETE FROM " + self.tovisit_table + " WHERE id=%s", (row_id,))
                    # else:
                    #     raise StopIteration

                    # if(self._should_skip()):
                    #     logging.info(u"skipping {0} randomly".format(current_url))
                    #     continue
                    try:
                        if (self.site.is_shallow):
                            current = self.to_visit.get_nowait()
                            current_url = current[0]
                            current_level = current[1]
                            logging.info(u"Shallow on level {0} {1}".format(
                                current_level, current_url))
                        else:
                            current_url = self.to_visit.get_nowait()
                    except Empty:
                        self.site.is_shallow = True  # On line 26 the site gets set TO DELETE
                        self.to_visit.put((self.site.url, str(0)))
                        self.ignore_filter = ScalableBloomFilter(
                            initial_capacity=10000000, error_rate=0.00001)
                        ignore_filter_file.close()
                        os.remove('../ignore_filter/' + self.site.name +
                                  '_ignore_file.txt')
                        logging.info("stopped iteration")
                        logging.info(u"{0}".format(self.site.url))
                        raise ZeroDivisionError

                    logging.info(u"visiting {0}".format(current_url))
                    self.visited_count += 1
                    #use newspaper to download and parse the article
                    article = ExplorerArticle(current_url)
                    article.download()
                    if (self.site.is_shallow):
                        if (int(current_level) > self.level):
                            continue
                    # get urls from the article
                    for link in article.get_links():
                        url = urljoin(current_url, link.href, False)
                        if self.url_in_filter(url, self.filters):
                            logging.info(
                                u"skipping url \"{0}\" because it matches filter"
                                .format(url))
                            continue
                        try:
                            parsed_url = urlparse(url)
                            parsed_as_list = list(parsed_url)

                            if (parsed_url.scheme != u"http"
                                    and parsed_url.scheme != u"https"):
                                logging.info(
                                    u"skipping url with invalid scheme: {0}".
                                    format(url))
                                continue
                            parsed_as_list[5] = ''
                            url = urlunparse(
                                urlnorm.norm_tuple(*parsed_as_list))
                        except Exception as e:
                            logging.info(
                                u"skipping malformed url {0}. Error: {1}".
                                format(url, str(e)))
                            continue
                        if (not parsed_url.netloc.endswith(self.domain)):
                            continue
                        # If the url have been added to ignore list, skip
                        if (url in self.ignore_filter):
                            continue
                        # Ignores the subscribe links for many domains
                        if (u"subscribe" in url or "subscribe" in url and
                                not (u"-subscribe" in url or "-subscribe"
                                     or u"subscribe-" in url or "subscribe-")):
                            continue

                        # Append the url to to_visit queue
                        if (self.site.is_shallow):
                            self.to_visit.put(
                                (url, str(int(current_level) + 1)))
                            logging.info(
                                u"added {0} to the to_visit as well as the level {1}"
                                .format(url, str(int(current_level) + 1)))

                            # Append the url to visited to remove duplicates
                            self.ignore_filter.add(url)
                            ignore_filter_file.write(url.encode('utf8') + "\n")
                        else:
                            self.to_visit.put(url)
                            logging.info(
                                u"added {0} to the to_visit".format(url))

                            # Append the url to visited to remove duplicates
                            self.ignore_filter.add(url)
                            ignore_filter_file.write(url.encode('utf8') + "\n")

                    # Update the Queue
                    self.to_visit.task_done()

                    return article

            except StopIteration as e:
                raise e
            except ValueError as e:
                raise ValueError
            except Exception as e:
                raise e

    def url_in_filter(self, url, filters):
        """
        Checks if any of the filters matches the url.
        Filters can be in regex search or normal string comparison.
        """
        for filt in filters:
            if ((filt.regex and re.search(filt.pattern, url, re.IGNORECASE))
                    or (not filt.regex and filt.pattern in url)):
                return True
        return False
Exemple #6
0
class MyApp(QMainWindow):
    def __init__(self):
        super().__init__()
        self.ui=Ui_MainWindow()
        self.ui.setupUi(self)

        #self.Ctl_timer = QTimer()
        ##self.Ctl_timer.setSingleShot(True)
        #self.Ctl_timer.setInterval(70)
        #self.Ctl_timer.timeout.connect(self.Ctl_loop)
        #self.Ctl_timer.start()
        
        self.N=1024*8
        self.fmt_r='Q'*self.N

        self.ip="192.168.1.2"
        self.port_s=8889
        self.port_r=8888
        self.Int=500 #unit us
        self.Scan=5000000 #unit ms, setting trigger
        self.Sim=50 #unit ns, setting simulation
        self.counter=0
        #UNIT and DATA initiation
        self._socket_s = None
        self._socket_t = None
        self._unit_connected_to = None
        self._unit_connected= False
        self.q=Queue('tmp',maxsize=self.N*2)
        #self.q=Queue('tmp')
        self.para_changed() 
        self.threadpool = QThreadPool()

        self.line,=self.ui.widget.canvas.ax.plot([1],[1],'b.',markersize=0.3)
        self.ui.widget.canvas.ax.set_ylim(-1,1)
        self.ui.widget.canvas.ax.set_xlim(0,100)

        self.ui.widget.canvas.draw()
        self.ui.statusbar.showMessage(f"Software started")
    def para_changed(self):
        self.ip=self.ui.IP_line.text()
        self.port_s=int(self.ui.Port_s_line.text())
        self.port_r=int(self.ui.Port_r_line.text())
        self.Int=int(self.ui.Int_line.text())*50
        self.Scan=int(self.ui.Scan_line.text())*50000
        self.Sim=int(int(self.ui.Sim_line.text())/20)
        if self._unit_connected:
            self.Pls_control()
    
        

    def Pls_control(self):
        i=0
        if i!=1:
            i=self.Pls_control_handler()

    def Pls_control_handler(self):
        fmt_tran='I'*3
        mark=0
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as self._socket_s:
            tran_raw=struct.pack(fmt_tran,self.Int,self.Scan,self.Sim)
            try:
                self._socket_s.connect((self.ip, self.port_s))
                self._socket_s.sendall(tran_raw)
                self._socket_s.settimeout(1.0)
                mark=self._socket_s.recv(4)
                if mark==b'':
                    mark=self._socket_s.recv(4)
                return 0
            except Exception as e:
                self.ui.statusbar.showMessage(f"control data send failed")
                self._socket_s.close()
                return 0
            self.ui.statusbar.showMessage(f"control data sended")
        self._socket_s.close()
        return 1

    def server_connect(self):
        if self._unit_connected==False:
            try:
                self._socket_r= socket.socket(socket.AF_INET, socket.SOCK_STREAM)  # instantiate
                self._socket_r.connect((self.ip, self.port_r))  # connect to the server
                #self._socket_s= socket.socket()  # instantiate
                #self._socket_s.connect((self.ip, self.port_s))  # connect to the server
                self.ui.IP_line.setEnabled(False)
                self.ui.Port_s_line.setEnabled(False)
                self.ui.Port_r_line.setEnabled(False)
                self.ui.Connect_button.setText("Disconnect")
                self._unit_connected_to = True
                self._unit_connected = True 
                self.ui.statusbar.showMessage(f"server connected")
                self.Pls_control()
            except Exception as ex:
                self._unit_connected_to =False
                self._unit_connected =False 

                self.ui.statusbar.showMessage(str(ex))
                #self.ui.statusbar.showMessage(f"server connect error!")
                self.ui.IP_line.setEnabled(True)
                self.ui.Port_s_line.setEnabled(True)
                self.ui.Port_r_line.setEnabled(True)
                self.ui.Connect_button.setText("Connect")
            worker_recv = Worker(self.Ctl_loop) # Any other args, kwargs are passed to the run function
            worker_plot = Worker(self.plot_loop) # Any other args, kwargs are passed to the run function
            #worker.signals.result.connect(self.print_output)
            #worker.signals.finished.connect(self.thread_complete)
            #worker.signals.progress.connect(self.progress_fn)
            self.threadpool.start(worker_recv)
            self.threadpool.start(worker_plot)
        else:
            self.counter=0
            self.threadpool.releaseThread()
            self.threadpool.releaseThread()
            #self._socket_r.close()  # instantiate
            #self._socket_s.close()  # instantiate
            self.ui.IP_line.setEnabled(True)
            self.ui.Port_s_line.setEnabled(True)
            self.ui.Port_r_line.setEnabled(True)
            self.ui.Connect_button.setText("Connect")
            self._unit_connected_to = False
            self._unit_connected = False 
            self.ui.statusbar.showMessage(f"server disconnectted")

    def plot_loop(self):
        si=2147483648;
        while self._unit_connected:
            while self._unit_connected & (self.q.qsize()>1):
                x=[]
                y=[]
                tmp=1
                while tmp<=si:
                    try:
                        raw=self.q.get_nowait()
                    except Exception as e:
                        self.ui.statusbar.showMessage(str(e))
                    ar=struct.unpack('II',struct.pack('Q',raw))
                    x_tmp=ar[0]
                    tmp=ar[1]
                    x.append(x_tmp)
                    y.append(tmp)
                y[-1]=y[-1]-si
                x=np.array(x)*20e-6
                self.ui.widget.canvas.ax.set_ylim(0,np.mean(y)*1.2)
                self.ui.widget.canvas.ax.set_xlim(0,self.Scan*20e-6)
                self.line.set_ydata(y)
                self.line.set_xdata(x)
                self.ui.widget.canvas.draw()
                time.sleep((self.Scan*20e-9)*0.5)
                self.ui.widget.canvas.flush_events()
                self.ui.statusbar.showMessage(f"data plotting, "+str(self.counter))
                self.counter=self.counter+1

        #self.Plot_timer.start()

    def Ctl_loop(self):
        while self._unit_connected:
            if self._unit_connected_to:
                self.ui.statusbar.showMessage(f"data receiving")
                data_raw = self._socket_r.recv(8*self.N,socket.MSG_WAITALL)  # receive response
            else:
                self._socket_r= socket.socket(socket.AF_INET, socket.SOCK_STREAM)  # instantiate
                self._socket_r.connect((self.ip, self.port_r))  # connect to the server
                self.ui.statusbar.showMessage(f"data receiving")
                data_raw = self._socket_r.recv(8*self.N,socket.MSG_WAITALL)  # receive response
            self.ui.statusbar.showMessage(f"data received")
            data=struct.unpack(self.fmt_r,data_raw)
            #self.plot_data(data)
            try:
                list(map(self.q.put_nowait,data))
            except Exception as e:
                self.ui.statusbar.showMessage(str(e))
            self._socket_r.close()
            self._unit_connected_to=False
        #self.Ctl_timer.start()
        
    def plot_data(self,data):
        si=2147483648;
        x=[]
        y=[]
        for i in range(self.N):
            x.append(data[i*2])
            y.append(data[i*2+1])
        y=np.array(y)
        I=y>si
        y[I]=y[I]-si
        x=np.array(x)*20e-6
        ##self.ui.statusbar.showMessage(f"data plotting, "+str(self.counter))
        ##self.counter=self.counter+1
        self.ui.widget.canvas.ax.set_ylim(0,np.max(y)*1.2)
        self.ui.widget.canvas.ax.set_xlim(0,self.Scan*20e-6)
        ##self.line.set_ydata(y)
        ##self.line.set_xdata(x)
        ##self.ui.widget.canvas.draw()
        ##self.ui.widget.canvas.flush_events()
        II=np.where(np.diff(x)<0)[0]
        III=[0]
        for i in II:
            III.append(i)
        III.append(len(y)-1)
        sl=len(II)+1;
        print(II)
        for i in range(sl): 
            xx=x[III[i]:III[i+1]]
            yy=y[III[i]:III[i+1]]
            self.line.set_ydata(y)
            self.line.set_xdata(x)
            self.ui.widget.canvas.draw()
            self.ui.widget.canvas.flush_events()
            self.ui.statusbar.showMessage(f"data plotting, "+str(self.counter))
            self.counter=self.counter+1
Exemple #7
0
class Crawler(object):
    def __init__(self, site):
        '''
        (Crawler, str) -> Crawler
        creates a Crawler with a given origin_url
        '''
        self.site = site
        self.filters = site.referringsitefilter_set.all()
        self.domain = urlparse(site.url).netloc
        # http://alexeyvishnevsky.com/2013/11/tips-on-optimizing-scrapy-for-a-high-performance/
        # fork of pybloom: https://github.com/joseph-fox/python-bloomfilter
        self.ignore_filter = ScalableBloomFilter(
                initial_capacity=10000000,
                error_rate=0.00001)
        ignore_filter_dir='../ignore_filter/'
        if not os.path.exists(ignore_filter_dir):
            os.makedirs(ignore_filter_dir)
            self.ignore_filter = ScalableBloomFilter(
                initial_capacity=10000000,
                error_rate=0.00001)
            try:
            	f = open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'r+')
            	f.write(self.ignore_filter)
            except IOError:
            	f = open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'w+')
            f.close()
        else:
            if (not(os.path.exists('../ignore_filter/' + self.site.name + '_ignore_file.txt'))):
                f = open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'w+')
                f.close()

            with open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'r+', buffering=False) as ignore_filter_file:
                try:
                    for line in ignore_filter_file:
                        self.ignore_filter.add(line.decode('utf8').rstrip())
                except Exception as e:
                    logging.info(str(e))
            ignore_filter_file.close()
        self.visited_count = 0

        tmpqueuetmp_dir='../tmpqueue/tmp/'
        if not os.path.exists(tmpqueuetmp_dir):
            os.makedirs(tmpqueuetmp_dir)

        slugified_name = slugify(unicode(site.name))
        tmpqueue_dir = '../tmpqueue/{}'.format(slugified_name)
        if not os.path.exists(tmpqueue_dir):
            os.makedirs(tmpqueue_dir)

        self.to_visit = Queue(tmpqueue_dir, tempdir=tmpqueuetmp_dir)

        # Initial url
        if (self.site.is_shallow == False):
            self.to_visit.put(site.url)
        else:
            self.to_visit.put((site.url, str(0)))

        # Limit
        self.limit = common.get_config()["crawler"]["limit"]
        # Specifies how deep the shallow crawler should go; "1" is the lowest option for this
        self.level = common.get_config()["crawler"]["level"]
        """
        self.probabilistic_n = common.get_config()["crawler"]["n"]
        self.probabilistic_k = common.get_config()["crawler"]["k"]

        self.db = psycopg2.connect(host='localhost',
                                   database=common.get_config()["crawler"]["postgresql"]["name"],
                                   user=common.get_config()["crawler"]["postgresql"]["user"],
                                   password=common.get_config()["crawler"]["postgresql"]["password"])

        self.cursor = self.db.cursor()
        self.already_added_urls = set()
        self.visited_table = "visited_" + str(site.id)
        self.tovisit_table = "tovisit_" + str(site.id)

        #self.cursor.execute("DROP TABLE IF EXISTS " + self.visited_table)
        #self.cursor.execute("CREATE TABLE " + self.visited_table + " (url VARCHAR(1024) PRIMARY KEY)")
        self.cursor.execute("DROP TABLE IF EXISTS " + self.tovisit_table)
        self.cursor.execute(u"CREATE TABLE " + self.tovisit_table + " (id SERIAL PRIMARY KEY, url VARCHAR(1024))")

        #self.cursor.execute(u"INSERT INTO " + self.visited_table + " VALUES (%s)", (site.url,))
        self.cursor.execute(u"INSERT INTO " + self.tovisit_table + " VALUES (DEFAULT, %s)", (site.url,))

        self.db.commit()
        """

    def __iter__(self):
        return self

    def next(self):
        '''
        (Crawler) -> newspaper.Article
        returns the next article in the sequence
        '''

        #standard non-recursive tree iteration
        with open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'a') as ignore_filter_file:
            try:
                current_level = 0
                while(True):
                    if (self.limit > 0 and self.visited_count > self.limit):
                        raise StopIteration('Limit reached: {:d}'.format(self.limit))
                    # if(self.pages_visited > self.probabilistic_n):
                    #     raise StopIteration
                    # self.cursor.execute("SELECT * FROM " + self.tovisit_table + " ORDER BY id LIMIT 1")
                    # row = self.cursor.fetchone()
                    # if(row):
                    #     row_id = row[0]
                    #     current_url = row[1]
                    #     self.cursor.execute("DELETE FROM " + self.tovisit_table + " WHERE id=%s", (row_id,))
                    # else:
                    #     raise StopIteration

                    # if(self._should_skip()):
                    #     logging.info(u"skipping {0} randomly".format(current_url))
                    #     continue
                    try:
                        if (self.site.is_shallow):
                            current = self.to_visit.get_nowait()
                            current_url = current[0]
                            current_level = current[1]
                            logging.info(u"Shallow on level {0} {1}".format(current_level, current_url))
                        else:
                            current_url = self.to_visit.get_nowait()
                    except Empty:
                        self.site.is_shallow = True # On line 26 the site gets set TO DELETE
                        self.to_visit.put((self.site.url, str(0)))
                        self.ignore_filter = ScalableBloomFilter(
                        initial_capacity=10000000,
                        error_rate=0.00001)
                        ignore_filter_file.close()
                        os.remove('../ignore_filter/' + self.site.name + '_ignore_file.txt')
                        logging.info("stopped iteration")
                        logging.info(u"{0}".format(self.site.url))
                        raise ZeroDivisionError


                    logging.info(u"visiting {0}".format(current_url))
                    self.visited_count += 1
                    #use newspaper to download and parse the article
                    article = ExplorerArticle(current_url)
                    article.download()
                    if (self.site.is_shallow):
                        if (int(current_level) > self.level):
                            continue
                    # get urls from the article
                    for link in article.get_links():
                        url = urljoin(current_url, link.href, False)
                        if self.url_in_filter(url, self.filters):
                            logging.info(u"skipping url \"{0}\" because it matches filter".format(url))
                            continue
                        try:
                            parsed_url = urlparse(url)
                            parsed_as_list = list(parsed_url)

                            if(parsed_url.scheme != u"http" and parsed_url.scheme != u"https"):
                                logging.info(u"skipping url with invalid scheme: {0}".format(url))
                                continue
                            parsed_as_list[5] = ''
                            url = urlunparse(urlnorm.norm_tuple(*parsed_as_list))
                        except Exception as e:
                            logging.info(u"skipping malformed url {0}. Error: {1}".format(url, str(e)))
                            continue
                        if(not parsed_url.netloc.endswith(self.domain)):
                            continue
                        # If the url have been added to ignore list, skip
                        if (url in self.ignore_filter):
                            continue
                        # Ignores the subscribe links for many domains
                        if (u"subscribe" in url or "subscribe" in url and not(u"-subscribe" in url or "-subscribe" or u"subscribe-" in url or "subscribe-")):
                        	continue

                        # Append the url to to_visit queue
                        if (self.site.is_shallow):
                            self.to_visit.put((url, str(int(current_level) + 1)))
                            logging.info(u"added {0} to the to_visit as well as the level {1}".format(url, str(int(current_level) + 1)))

                            # Append the url to visited to remove duplicates
                            self.ignore_filter.add(url)
                            ignore_filter_file.write(url.encode('utf8') + "\n")
                        else:
                            self.to_visit.put(url)
                            logging.info(u"added {0} to the to_visit".format(url))

                            # Append the url to visited to remove duplicates
                            self.ignore_filter.add(url)
                            ignore_filter_file.write(url.encode('utf8') + "\n")

                    # Update the Queue
                    self.to_visit.task_done()


                    return article


            except StopIteration as e:
                raise e
            except ValueError as e:
                raise ValueError
            except Exception as e:
                raise e

    def url_in_filter(self, url, filters):
        """
        Checks if any of the filters matches the url.
        Filters can be in regex search or normal string comparison.
        """
        for filt in filters:
            if ((filt.regex and re.search(filt.pattern, url, re.IGNORECASE)) or
                (not filt.regex and filt.pattern in url)):
                return True
        return False
Exemple #8
0
class Rotator(object):
    def __init__(self, config):
        self.config = config
        self.dateformat = config['dateformat']
        self.keep_files = int(config['rotate'])
        self.now = datetime.datetime.now()
        self.dateext = self.now.strftime(self.dateformat)
        self.mode = config['mode']
        self.compress = config['compress']
        self.user = config['user']
        self.group = config['group']
        self.sharedscripts = config['sharedscripts']
        self.destext = config['destext']
        self.copy = config['copy']
        self.copytohdfs = config['copytohdfs']
        self.prerotates = config['prerotate']
        self.postrotates = config['postrotate']
        self.hdfs_config = config['hdfs']
        self.queuepath = config['queuepath']
        self.queue_chunksize = 1000
        self.queue_block_timeout = 30
        self.queue = Queue(self.queuepath, self.queue_chunksize)
        self.client = None
        if self.hdfs_config:
            self.client = hdfs.InsecureClient(**self.hdfs_config)

    def get_rotated_dir(self, path):
        destext = self.now.strftime(self.destext)
        dest_dir = '{}-{}'.format(path, destext)
        return dest_dir

    def get_rotated_time(self, dest_path):
        dateext = dest_path.rsplit('-', 1)[-1]
        # remove gz ext
        dateext = dateext.split('.')[0]
        return datetime.datetime.strptime('-{}'.format(dateext),
                                          self.dateformat)

    def is_rotated_file(self, dest_path):
        try:
            t = self.get_rotated_time(dest_path)
            return bool(t)
        except:
            return False

    def get_dest_path(self, path):
        rotated_dir = self.get_rotated_dir(path)
        filename = os.path.split(path)[-1]
        dest_path = os.path.join(rotated_dir,
                                 '{}{}'.format(filename, self.dateext))
        return dest_path

    def remove_old_files(self, path):
        rotated_dir = self.get_rotated_dir(path)
        filename = os.path.split(path)[-1]
        path = os.path.join(rotated_dir, filename)
        glob_path = '{}-*'.format(path)
        files = [f for f in glob.glob(glob_path) if self.is_rotated_file(f)]
        files.sort(key=self.get_rotated_time, reverse=True)
        for f in files[self.keep_files:]:
            os.remove(f)

    def create_rotated_dir(self, path):
        rotated_dir = self.get_rotated_dir(path)
        makedirs(rotated_dir, 0755)
        chown(rotated_dir, self.user, self.group)

    def rename_file(self, path):
        self.create_rotated_dir(path)
        dest_path = self.get_dest_path(path)
        shutil.move(path, dest_path)

        self.queue.put((path, dest_path), timeout=self.queue_block_timeout)

        os.chmod(dest_path, self.mode)
        chown(dest_path, self.user, self.group)
        return dest_path

    def compress_file(self, dest_path):
        gzip(dest_path)
        return '{}.gz'.format(dest_path)

    def _copy_file(self, path, from_, to):
        if not to:
            return
        dest = os.path.normpath(path.replace(from_, to))
        dest_dir = os.path.dirname(dest)
        if not os.path.exists(dest_dir):
            makedirs(dest_dir, 0755)
            chown(dest_dir, self.user, self.group)
        if path.startswith(from_):
            shutil.copy(path, dest)

    def copy_file(self, dest_path):
        if isinstance(self.copy, dict):
            self.copy = [self.copy]

        for item in self.copy:
            to = item.get('to')
            from_ = item.get('from', '')
            self._copy_file(dest_path, from_, to)

    def _copy_to_hdfs(self, client, path, from_, to):
        if not to:
            return
        dest = os.path.normpath(path.replace(from_, to))
        if path.startswith(from_):
            client.upload(dest, path, overwrite=True, cleanup=True)

    def copy_to_hdfs(self, path):
        if not (self.copytohdfs and self.hdfs_config):
            return
        for item in self.copytohdfs:
            to = item.get('to')
            from_ = item.get('from', '')
            self._copy_to_hdfs(self.client, path, from_, to)

    def secure_copy(self):
        to_be_clean = set()
        while True:
            try:
                path, rotated_path = self.queue.get_nowait()
                rotated_path_before = rotated_path
                if not os.path.exists(rotated_path):
                    self.queue.task_done()
                    continue

                if self.compress:
                    rotated_path = self.compress_file(rotated_path)
                if self.copy:
                    self.copy_file(rotated_path)

                if self.copytohdfs:
                    self.copy_to_hdfs(rotated_path)

                self.queue.task_done()

                if self.compress:
                    os.remove(rotated_path_before)

                to_be_clean.add(path)
            except Empty:
                break
            except Exception as e:
                print e

        for path in to_be_clean:
            self.remove_old_files(path)

    def rotate(self):
        if self.sharedscripts:
            self.prerotate()

        for f in iterate_log_paths(self.config['paths']):
            if not self.sharedscripts:
                self.prerotate()

            self.rename_file(f)

            if not self.sharedscripts:
                self.postrotate()

        if self.sharedscripts:
            self.postrotate()

        self.secure_copy()

    def prerotate(self):
        for cmd in self.prerotates:
            run(cmd)

    def postrotate(self):
        for cmd in self.postrotates:
            run(cmd)
Exemple #9
0
class Rotator(object):
    def __init__(self, config):
        self.paths = config['paths']

        self.mode = int(config['mode'], 8)
        self.user = config['user']
        self.group = config['group']

        # FIXME: Handle rotated files keeping correctly
        # self.keep_files = int(config['rotate'])
        self.compress = config['compress']

        self.copy = config['copy']
        self.copytohdfs = config['copytohdfs']
        self.hdfs_config = config['hdfs']
        self.hdfs_client = None
        if self.hdfs_config:
            self.hdfs_client = hdfs.InsecureClient(**self.hdfs_config)

        self.dateformat = config['dateformat']
        self.now = datetime.datetime.now()
        self.timestamp = self.now.strftime(self.dateformat)
        self.destext = config['destext']

        self.fnformat = config['fnformat']
        if not self.fnformat:
            raise ValueError("'fnformat' cannot be empty")

        self.sharedscripts = config['sharedscripts']
        self.prerotates = config['prerotate']
        self.postrotates = config['postrotate']

        self.queuepath = config['queuepath']
        self.queue_chunksize = 1000
        self.queue_block_timeout = 30
        self.queue = Queue(self.queuepath, self.queue_chunksize)

    def get_rotated_dir(self, path):
        destext = self.now.strftime(self.destext)
        dest_dir = '{}-{}'.format(path, destext)
        return dest_dir

    def get_dest_path(self, path):
        rotated_dir = self.get_rotated_dir(path)
        logname = os.path.basename(path)
        dest_path = os.path.join(
            rotated_dir,
            self.fnformat.format(logname=logname,
                                 timestamp=self.timestamp,
                                 hostname=socket.gethostname()))
        return dest_path

    def create_rotated_dir(self, path):
        rotated_dir = self.get_rotated_dir(path)
        makedirs(rotated_dir, 0o755)
        chown(rotated_dir, self.user, self.group)

    def rename_file(self, path):
        self.create_rotated_dir(path)

        dest_path = self.get_dest_path(path)
        shutil.move(path, dest_path)

        self.queue.put((path, dest_path), timeout=self.queue_block_timeout)

        os.chmod(dest_path, self.mode)
        chown(dest_path, self.user, self.group)
        return dest_path

    def compress_file(self, dest_path):
        gzip(dest_path)
        return '{}.gz'.format(dest_path)

    def _copy_file(self, path, from_, to):
        if not to:
            return
        dest = os.path.normpath(path.replace(from_, to))
        dest_dir = os.path.dirname(dest)
        if not os.path.exists(dest_dir):
            makedirs(dest_dir, 0o755)
            chown(dest_dir, self.user, self.group)
        if path.startswith(from_):
            shutil.copy2(path, dest)

    def copy_file(self, dest_path):
        if isinstance(self.copy, dict):
            self.copy = [self.copy]

        for item in self.copy:
            to = item.get('to')
            from_ = item.get('from', '')
            self._copy_file(dest_path, from_, to)

    def _copy_to_hdfs(self, client, path, from_, to):
        if not to:
            return
        dest = os.path.normpath(path.replace(from_, to))
        if path.startswith(from_):
            client.upload(dest, path, overwrite=True, cleanup=True)

    def copy_to_hdfs(self, path):
        if not (self.copytohdfs and self.hdfs_config):
            return
        for item in self.copytohdfs:
            to = item.get('to')
            from_ = item.get('from', '')
            self._copy_to_hdfs(self.hdfs_client, path, from_, to)

    def secure_copy(self):
        while True:
            try:
                path, rotated_path = self.queue.get_nowait()
                rotated_path_before = rotated_path
                if not os.path.exists(rotated_path):
                    self.queue.task_done()
                    continue

                if self.compress:
                    rotated_path = self.compress_file(rotated_path)

                if self.copy:
                    self.copy_file(rotated_path)

                if self.copytohdfs:
                    self.copy_to_hdfs(rotated_path)

                if self.compress:
                    os.remove(rotated_path_before)

                self.queue.task_done()

            except Empty:
                break
            except Exception as e:
                print(e)
                raise

    def rotate(self):
        if self.sharedscripts:
            self.prerotate()

        for f in iterate_log_paths(self.paths):
            if is_empty_file(f):
                continue

            if not self.sharedscripts:
                self.prerotate()

            self.rename_file(f)

            if not self.sharedscripts:
                self.postrotate()

        if self.sharedscripts:
            self.postrotate()

        self.secure_copy()

    def prerotate(self):
        for cmd in self.prerotates:
            run(cmd)

    def postrotate(self):
        for cmd in self.postrotates:
            run(cmd)