class Ingester(threading.Thread):
    """Worker thread that takes file paths from a queue and ingests them.

    A file is ingested when it has an extension, the extractor's class
    defines an attribute named after that extension, and the database
    reports the file as new. Every outcome is tallied on ``counter``.
    """

    def __init__(self, queue, log, counter):
        """Create the worker.

        queue   -- source of file paths; ``get()`` and ``task_done()`` are used.
        log     -- logger object passed to ``OpenSyllabusDb`` and
                   ``TextExtractor``.
        counter -- statistics object providing ``inc_ing``, ``inc_wrong``,
                   ``inc_unsupp``, ``inc_ignore`` and ``inc_proc``.
        """
        # Initialise the Thread machinery before touching the instance.
        threading.Thread.__init__(self)
        self.log = log
        self.queue = queue
        self.counter = counter
        self.db = OpenSyllabusDb(log)
        self.extractor = TextExtractor(log)

    def run(self):
        """Process queued files in an endless loop."""
        while True:
            data_file = self.queue.get()
            try:
                self._process(data_file)
            finally:
                # Acknowledge the item even when processing raised, so that
                # a caller waiting on the queue's join() is not left blocked.
                self.queue.task_done()

    def _process(self, data_file):
        """Ingest or skip a single file and update the matching counters."""
        ext = get_file_ext(data_file)
        if not ext:
            # No usable extension.
            self.counter.inc_wrong()
        elif ext not in self.extractor.__class__.__dict__:
            # Only names defined directly on the extractor's class count;
            # inherited attributes are not considered by this check.
            self.counter.inc_unsupp()
        elif not self.db.is_new(data_file):
            # The database does not report this file as new.
            self.counter.inc_ignore()
        else:
            self.counter.inc_ing()
            file_type = get_file_type(data_file)
            # Prefer the detected file type as the extractor method name and
            # fall back to the extension when no type was returned.
            extract = getattr(self.extractor, file_type or ext)
            data = extract(data_file)
            self.db.insert_data(data_file, os.path.basename(data_file), data)
        self.counter.inc_proc()
class Ingester(threading.Thread):
    """Worker thread that takes file paths from a queue and ingests them.

    A file is ingested when it has an extension, the extractor's class
    defines an attribute named after that extension, and the database
    reports the file as new. Every outcome is tallied on ``counter``.
    """

    def __init__(self, queue, log, counter):
        """Create the worker.

        queue   -- source of file paths; ``get()`` and ``task_done()`` are used.
        log     -- logger object passed to ``OpenSyllabusDb`` and
                   ``TextExtractor``.
        counter -- statistics object providing ``inc_ing``, ``inc_wrong``,
                   ``inc_unsupp``, ``inc_ignore`` and ``inc_proc``.
        """
        # Initialise the Thread machinery before touching the instance.
        threading.Thread.__init__(self)
        self.log = log
        self.queue = queue
        self.counter = counter
        self.db = OpenSyllabusDb(log)
        self.extractor = TextExtractor(log)

    def run(self):
        """Process queued files in an endless loop."""
        while True:
            data_file = self.queue.get()
            try:
                ext = get_file_ext(data_file)
                if not ext:
                    # No usable extension.
                    self.counter.inc_wrong()
                elif ext not in self.extractor.__class__.__dict__:
                    # Only names defined directly on the extractor's class
                    # count; inherited attributes are not considered.
                    self.counter.inc_unsupp()
                elif not self.db.is_new(data_file):
                    # The database does not report this file as new.
                    self.counter.inc_ignore()
                else:
                    self.counter.inc_ing()
                    # The extractor method is selected by extension name.
                    data = getattr(self.extractor, ext)(data_file)
                    self.db.insert_data(
                        data_file, os.path.basename(data_file), data)
                self.counter.inc_proc()
            finally:
                # Acknowledge the item even when processing raised, so that
                # a caller waiting on the queue's join() is not left blocked.
                self.queue.task_done()
def __init__(self, queue, log, counter):
    """Initialise the thread and attach its collaborators.

    queue   -- stored on the instance as ``self.queue``.
    log     -- stored as ``self.log`` and passed to ``OpenSyllabusDb`` and
               ``TextExtractor``.
    counter -- stored on the instance as ``self.counter``.
    """
    threading.Thread.__init__(self)

    # Plain references handed in by the caller.
    self.log, self.queue, self.counter = log, queue, counter

    # Helpers built per worker, each given the same logger.
    self.db = OpenSyllabusDb(log)
    self.extractor = TextExtractor(log)
class BrokenDocsStats(object):
    """Collect and print per-extension counts of documents lacking text."""

    def __init__(self, log):
        """Start with empty statistics and open the database with ``log``."""
        # Maps file extension (as returned by splitext) -> number of
        # broken documents seen.
        self.broken_ext_count = {}
        self.db = OpenSyllabusDb(log)

    def _count_broken_docs(self, text_value):
        """Tally and print every document returned for ``text_value``.

        ``text_value`` is passed unchanged to ``self.db.get_empty_docs``.
        Each returned document is read through its 'filename' and 'path'
        keys: the extension of 'filename' is counted, 'path' is printed.
        """
        for doc in self.db.get_empty_docs(text_value):
            ext = splitext(doc['filename'])[-1]
            self.broken_ext_count[ext] = self.broken_ext_count.get(ext, 0) + 1
            print(doc['path'])

    def _get_broken_docs_1(self):
        """Get documents with empty 'text' field."""
        self._count_broken_docs('')

    def _get_broken_docs_2(self):
        """Get documents with null 'text' field."""
        self._count_broken_docs(None)

    def show_result(self):
        """Show calculated statistics."""
        rows = ['%s: %s\n' % (ext, count)
                for ext, count in self.broken_ext_count.items()]
        report = '\nBroken Docs Stats:\n' + ''.join(rows)
        print('=' * 80 + report + '=' * 80)

    def get_broken_doc(self, doc_type):
        """Collect broken documents of ``doc_type`` and print the statistics.

        ``doc_type == 'empty'`` selects documents with an empty 'text'
        field; any other value selects documents with a null 'text' field.
        """
        if doc_type == 'empty':
            self._get_broken_docs_1()
        else:
            self._get_broken_docs_2()
        self.show_result()
def __init__(self, queue, log, counter):
    """Store the caller's objects, build the helpers, then set up the thread.

    queue   -- kept as ``self.queue``.
    log     -- kept as ``self.log``; also given to ``OpenSyllabusDb`` and
               ``TextExtractor``.
    counter -- kept as ``self.counter``.
    """
    self.log, self.queue = log, queue
    self.counter = counter

    # Database handle first, text extractor second; both share the logger.
    self.db, self.extractor = OpenSyllabusDb(log), TextExtractor(log)

    threading.Thread.__init__(self)
class BrokenDocsStats(object):
    """Collect and print per-extension counts of documents lacking text."""

    def __init__(self, log):
        """Start with empty statistics and open the database with ``log``."""
        # Maps file extension (as returned by splitext) -> number of
        # broken documents seen.
        self.broken_ext_count = {}
        self.db = OpenSyllabusDb(log)

    def _count_broken_docs(self, text_value):
        """Tally and print every document returned for ``text_value``.

        ``text_value`` is passed unchanged to ``self.db.get_empty_docs``.
        Each returned document is read through its 'filename' and 'path'
        keys: the extension of 'filename' is counted, 'path' is printed.
        """
        counts = self.broken_ext_count
        for doc in self.db.get_empty_docs(text_value):
            ext = splitext(doc['filename'])[-1]
            counts[ext] = counts.get(ext, 0) + 1
            print(doc['path'])

    def _get_broken_docs_1(self):
        """Get documents with empty 'text' field."""
        self._count_broken_docs('')

    def _get_broken_docs_2(self):
        """Get documents with null 'text' field."""
        self._count_broken_docs(None)

    def show_result(self):
        """Show calculated statistics."""
        body = ''.join(
            '%s: %s\n' % (ext, count)
            for ext, count in self.broken_ext_count.items()
        )
        report = '\nBroken Docs Stats:\n' + body
        print('=' * 80 + report + '=' * 80)

    def get_broken_doc(self, doc_type):
        """Collect broken documents of ``doc_type`` and print the statistics.

        ``doc_type == 'empty'`` selects documents with an empty 'text'
        field; any other value selects documents with a null 'text' field.
        """
        if doc_type == 'empty':
            self._get_broken_docs_1()
        else:
            self._get_broken_docs_2()
        self.show_result()
def __init__(self, log):
    """Open the database with ``log`` and start with no statistics."""
    self.db = OpenSyllabusDb(log)
    # Per-extension tally; starts out empty.
    self.broken_ext_count = dict()