Ejemplo n.º 1
0
class Ingester(threading.Thread):
    """Worker thread that takes file paths off a queue and ingests them.

    For each path the worker either extracts the file's content and stores
    it in the database, or bumps one of the "skipped" counters.
    """

    def __init__(self, queue, log, counter):
        """Set up the worker.

        :param queue: object providing ``get()`` and ``task_done()``; every
            item is treated as a file path.
        :param log: log object, also passed to ``OpenSyllabusDb`` and
            ``TextExtractor``.
        :param counter: statistics object providing the ``inc_*`` methods
            called while processing.
        """
        # The threading documentation requires the base constructor to run
        # before anything else is done to the thread object.
        threading.Thread.__init__(self)
        self.log = log
        self.queue = queue
        self.counter = counter
        self.db = OpenSyllabusDb(log)
        self.extractor = TextExtractor(log)

    def run(self):
        """Process queue items forever.

        ``task_done()`` is reported for every item taken from the queue,
        including one whose processing raised, so a caller blocked in
        ``queue.join()`` is not left waiting on it. The exception itself is
        not swallowed and still terminates this thread.
        """
        while True:
            data_file = self.queue.get()
            try:
                self._ingest_one(data_file)
            finally:
                self.queue.task_done()

    def _ingest_one(self, data_file):
        """Classify one file, ingest it if appropriate, update the counters."""
        ext = get_file_ext(data_file)
        # An extension counts as supported when the extractor's class itself
        # defines an attribute with that name.
        handlers = self.extractor.__class__.__dict__
        if not ext:
            self.counter.inc_wrong()
        elif ext not in handlers:
            self.counter.inc_unsupp()
        elif not self.db.is_new(data_file):
            self.counter.inc_ignore()
        else:
            self.counter.inc_ing()
            file_type = get_file_type(data_file)
            # NOTE(review): only ``ext`` was checked against the extractor;
            # a non-empty ``file_type`` is looked up unchecked -- confirm
            # that get_file_type() only returns names the extractor defines.
            extract = getattr(self.extractor, file_type or ext)
            data = extract(data_file)
            self.db.insert_data(data_file,
                                os.path.split(data_file)[1], data)
        # Counted only when the item was handled without raising.
        self.counter.inc_proc()
Ejemplo n.º 2
0
class Ingester(threading.Thread):
    """Worker thread that takes file paths off a queue and ingests them.

    For each path the worker either extracts the file's content and stores
    it in the database, or bumps one of the "skipped" counters.
    """

    def __init__(self, queue, log, counter):
        """Set up the worker.

        :param queue: object providing ``get()`` and ``task_done()``; every
            item is treated as a file path.
        :param log: log object, also passed to ``OpenSyllabusDb`` and
            ``TextExtractor``.
        :param counter: statistics object providing the ``inc_*`` methods
            called while processing.
        """
        # The threading documentation requires the base constructor to run
        # before anything else is done to the thread object.
        threading.Thread.__init__(self)
        self.log = log
        self.queue = queue
        self.counter = counter
        self.db = OpenSyllabusDb(log)
        self.extractor = TextExtractor(log)

    def run(self):
        """Process queue items forever.

        ``task_done()`` is reported for every item taken from the queue,
        including one whose processing raised, so a caller blocked in
        ``queue.join()`` is not left waiting on it. The exception itself is
        not swallowed and still terminates this thread.
        """
        while True:
            data_file = self.queue.get()
            try:
                self._ingest_one(data_file)
            finally:
                self.queue.task_done()

    def _ingest_one(self, data_file):
        """Classify one file, ingest it if appropriate, update the counters."""
        ext = get_file_ext(data_file)
        # An extension counts as supported when the extractor's class itself
        # defines an attribute with that name.
        handlers = self.extractor.__class__.__dict__
        if not ext:
            # NOTE(review): this previously called ``inc_wrang()``, assumed
            # to be a typo for ``inc_wrong()`` -- confirm against the
            # counter class.
            self.counter.inc_wrong()
        elif ext not in handlers:
            self.counter.inc_unsupp()
        elif not self.db.is_new(data_file):
            self.counter.inc_ignore()
        else:
            self.counter.inc_ing()
            extract = getattr(self.extractor, ext)
            data = extract(data_file)
            self.db.insert_data(data_file, os.path.split(data_file)[1], data)
        # Counted only when the item was handled without raising.
        self.counter.inc_proc()
Ejemplo n.º 3
0
 def __init__(self, queue, log, counter):
     """Initialise the thread base class, then store the collaborators.

     :param queue: stored as ``self.queue``.
     :param log: stored as ``self.log`` and also passed to
         ``OpenSyllabusDb`` and ``TextExtractor``.
     :param counter: stored as ``self.counter``.
     """
     # The threading documentation requires the base constructor to run
     # before anything else is done to the thread object.
     threading.Thread.__init__(self)
     self.log = log
     self.queue = queue
     self.counter = counter
     self.db = OpenSyllabusDb(log)
     self.extractor = TextExtractor(log)
Ejemplo n.º 4
0
class BrokenDocsStats(object):
    """Tally, per filename extension, documents whose 'text' field is
    empty or null, and print a summary."""

    def __init__(self, log):
        """Start with an empty tally and create the database handle.

        :param log: log object passed through to ``OpenSyllabusDb``.
        """
        # Maps an extension (last element of ``splitext(filename)``) to the
        # number of broken documents seen with it.
        self.broken_ext_count = {}
        self.db = OpenSyllabusDb(log)

    def _tally_broken_docs(self, text_value):
        """Print the path of every matching document and count it by
        extension.

        :param text_value: value handed to ``db.get_empty_docs``.
        """
        for doc in self.db.get_empty_docs(text_value):
            ext = splitext(doc['filename'])[-1]
            self.broken_ext_count[ext] = self.broken_ext_count.get(ext, 0) + 1
            # A parenthesised single argument prints identically as a
            # Python 2 print statement and as the print function.
            print(doc['path'])

    def _get_broken_docs_1(self):
        """
        Get documents with empty 'text' field
        """
        self._tally_broken_docs('')

    def _get_broken_docs_2(self):
        """
        Get documents with null 'text' field
        """
        self._tally_broken_docs(None)

    def show_result(self):
        """
        Show calculated statistics
        """
        lines = ['%s: %s\n' % (ext, count)
                 for ext, count in self.broken_ext_count.items()]
        report = '\nBroken Docs Stats:\n' + ''.join(lines)
        print('=' * 80 + report + '=' * 80)

    def get_broken_doc(self, doc_type):
        """Tally the requested kind of broken documents, then print the
        summary.

        :param doc_type: ``'empty'`` selects documents with an empty 'text'
            field; any other value selects documents with a null one.
        """
        if doc_type == 'empty':
            self._get_broken_docs_1()
        else:
            self._get_broken_docs_2()
        self.show_result()
Ejemplo n.º 5
0
 def __init__(self, queue, log, counter):
     """Run the base ``Thread`` constructor, then keep the arguments.

     :param queue: kept on the instance as ``queue``.
     :param log: kept on the instance as ``log``; the same object is
         given to ``OpenSyllabusDb`` and ``TextExtractor``.
     :param counter: kept on the instance as ``counter``.
     """
     # Per the threading documentation, an overriding constructor must
     # invoke Thread.__init__() before doing anything else to the thread.
     threading.Thread.__init__(self)
     self.log = log
     self.queue = queue
     self.counter = counter
     self.db = OpenSyllabusDb(log)
     self.extractor = TextExtractor(log)
Ejemplo n.º 6
0
class BrokenDocsStats(object):
    """Count documents with an empty or null 'text' field by filename
    extension and print the totals."""

    def __init__(self, log):
        """Create the empty per-extension tally and the database handle.

        :param log: log object passed through to ``OpenSyllabusDb``.
        """
        # extension (last element of ``splitext(filename)``) -> count
        self.broken_ext_count = {}
        self.db = OpenSyllabusDb(log)

    def _count_by_extension(self, text_value):
        """Shared loop behind both ``_get_broken_docs_*`` methods.

        Prints each matching document's path and increments the tally for
        its extension.

        :param text_value: value handed to ``db.get_empty_docs``.
        """
        tally = self.broken_ext_count
        for doc in self.db.get_empty_docs(text_value):
            ext = splitext(doc['filename'])[-1]
            tally[ext] = tally.get(ext, 0) + 1
            # Single parenthesised argument: same output whether ``print``
            # is the Python 2 statement or the function.
            print(doc['path'])

    def _get_broken_docs_1(self):
        """
        Get documents with empty 'text' field
        """
        self._count_by_extension('')

    def _get_broken_docs_2(self):
        """
        Get documents with null 'text' field
        """
        self._count_by_extension(None)

    def show_result(self):
        """
        Show calculated statistics
        """
        report = '\nBroken Docs Stats:\n' + ''.join(
            '%s: %s\n' % (ext, count)
            for ext, count in self.broken_ext_count.items())
        print('=' * 80 + report + '=' * 80)

    def get_broken_doc(self, doc_type):
        """Collect one kind of broken document and print the statistics.

        :param doc_type: ``'empty'`` for documents with an empty 'text'
            field; every other value is treated as a request for documents
            with a null 'text' field.
        """
        if doc_type == 'empty':
            self._get_broken_docs_1()
        else:
            self._get_broken_docs_2()
        self.show_result()
Ejemplo n.º 7
0
 def __init__(self, log):
     """Start with an empty extension tally and create the database handle.

     ``log`` is passed straight through to ``OpenSyllabusDb``.
     """
     self.broken_ext_count = dict()
     self.db = OpenSyllabusDb(log)
Ejemplo n.º 8
0
 def __init__(self, log):
     """Prepare an instance with nothing counted yet.

     :param log: forwarded unchanged to the ``OpenSyllabusDb`` constructor.
     """
     # No extensions have been tallied at construction time.
     self.broken_ext_count = dict()
     # The database handle is built from the caller's log object.
     self.db = OpenSyllabusDb(log)