def extract(file_path):
    """Parse one file and store the result; the unit of work of a worker.

    A file is processed only when ``ExtractorFactory.get_extractor`` returns
    an extractor for it and its extension is ``.txt``.

    Returns a ``(parsed, inc)`` tuple:

    * ``parsed`` -- True only when ``parse_document()`` reported success and
      ``update()`` returned without raising.
    * ``inc`` -- the value returned by ``extractor.update(ignore_parsed=True)``
      (presumably the number of newly added documents -- confirm against the
      extractor implementation); 0 on every failure or skip path.

    A ``DuplicateKeyError`` raised while parsing or updating is logged and
    reported as ``(False, 0)`` instead of being propagated.
    """
    extractor = ExtractorFactory.get_extractor(file_path)
    extension = os.path.splitext(file_path)[-1]

    # Guard clause: nothing to do without an extractor or for non-.txt files.
    if extractor is None or extension not in ('.txt',):
        return False, 0

    # Single-argument print() produces the same output on Python 2 and 3.
    print('Parsing %s' % (file_path,))
    try:
        if not extractor.parse_document():
            # Lazy logging args: the message is only formatted if emitted.
            logging.error('Failed to parse %s, continue', file_path)
            return False, 0
        inc = extractor.update(ignore_parsed=True)
        return True, inc
    except DuplicateKeyError:
        logging.error('document existed, continue...')
        return False, 0
def extract(file_path):
    """Parse one file and store the result; the unit of work of a worker.

    A file is processed only when ``ExtractorFactory.get_extractor`` returns
    an extractor for it and its extension is ``.txt``.

    Returns a ``(parsed, inc)`` tuple:

    * ``parsed`` -- True only when ``parse_document()`` reported success and
      ``update()`` returned without raising.
    * ``inc`` -- the value returned by ``extractor.update(ignore_parsed=True)``
      (presumably the number of newly added documents -- confirm against the
      extractor implementation); 0 on every failure or skip path.

    A ``DuplicateKeyError`` raised while parsing or updating is logged and
    reported as ``(False, 0)`` instead of being propagated.
    """
    extractor = ExtractorFactory.get_extractor(file_path)
    extension = os.path.splitext(file_path)[-1]

    # Guard clause: nothing to do without an extractor or for non-.txt files.
    if extractor is None or extension not in ('.txt',):
        return False, 0

    # Single-argument print() produces the same output on Python 2 and 3.
    print('Parsing %s' % (file_path,))
    try:
        if not extractor.parse_document():
            # Lazy logging args: the message is only formatted if emitted.
            logging.error('Failed to parse %s, continue', file_path)
            return False, 0
        inc = extractor.update(ignore_parsed=True)
        return True, inc
    except DuplicateKeyError:
        logging.error('document existed, continue...')
        return False, 0
def run(self):
    """Drain ``self.queue``, parsing and inserting every ``.txt`` file.

    Loops until ``self.queue.empty()`` reports True. For each file path taken
    from the queue an extractor is requested from ``ExtractorFactory`` while
    ``extractor_lock`` is held. Paths without an extractor, or whose
    extension is not ``.txt``, are skipped. For the rest,
    ``parse_document()`` is called and, on success, ``insert()``.

    A parse failure or a ``DuplicateKeyError`` is reported with ``print`` and
    the loop moves on to the next path. Returns None.
    """
    # NOTE(review): empty() followed by a blocking get() can hang if another
    # consumer takes the last item in between -- confirm how many consumers
    # share this queue before relying on this loop condition.
    while not self.queue.empty():
        file_path = self.queue.get()
        extension = os.path.splitext(file_path)[-1]

        # Context manager guarantees the lock is released even when the
        # factory raises; a bare acquire()/release() pair would leave it held.
        with extractor_lock:
            extractor = ExtractorFactory.get_extractor(file_path)

        # Only .txt files with a matching extractor are parsed.
        if extractor is None or extension not in ('.txt',):
            continue

        # Single-argument print() produces the same output on Python 2 and 3.
        print('worker %s is parsing %s' % (os.getpid(), file_path))
        try:
            if extractor.parse_document():
                extractor.insert()
            else:
                print('parser error for %s, continue' % (file_path,))
        except DuplicateKeyError:
            print('document existed, continue')
def run(self):
    """Drain ``self.queue``, parsing and inserting every ``.txt`` file.

    Loops until ``self.queue.empty()`` reports True. For each file path taken
    from the queue an extractor is requested from ``ExtractorFactory`` while
    ``extractor_lock`` is held. Paths without an extractor, or whose
    extension is not ``.txt``, are skipped. For the rest,
    ``parse_document()`` is called and, on success, ``insert()``.

    A parse failure or a ``DuplicateKeyError`` is reported with ``print`` and
    the loop moves on to the next path. Returns None.
    """
    # NOTE(review): empty() followed by a blocking get() can hang if another
    # consumer takes the last item in between -- confirm how many consumers
    # share this queue before relying on this loop condition.
    while not self.queue.empty():
        file_path = self.queue.get()
        extension = os.path.splitext(file_path)[-1]

        # Context manager guarantees the lock is released even when the
        # factory raises; a bare acquire()/release() pair would leave it held.
        with extractor_lock:
            extractor = ExtractorFactory.get_extractor(file_path)

        # Only .txt files with a matching extractor are parsed.
        if extractor is None or extension not in ('.txt',):
            continue

        # Single-argument print() produces the same output on Python 2 and 3.
        print('worker %s is parsing %s' % (os.getpid(), file_path))
        try:
            if extractor.parse_document():
                extractor.insert()
            else:
                print('parser error for %s, continue' % (file_path,))
        except DuplicateKeyError:
            print('document existed, continue')