Ejemplo n.º 1
0
def extract(file_path):
    '''
    Parse one file and store the result; the unit of work a worker performs.

    Only files whose extension is ``.txt`` and for which
    ``ExtractorFactory.get_extractor`` returns an extractor are processed;
    anything else is skipped without being parsed.

    :param file_path: path of the file to process.
    :return: a ``(parsed, inc)`` tuple. ``parsed`` is True only when
        ``parse_document()`` reported success. ``inc`` is the value returned
        by ``extractor.update(ignore_parsed=True)`` (presumably the number of
        newly added documents -- confirm against the extractor), or 0 when
        the file was skipped, failed to parse, or hit a duplicate key.
    '''
    extractor = ExtractorFactory.get_extractor(file_path)
    extension = os.path.splitext(file_path)[-1]

    # only parse the txt file
    if extractor is None or extension != '.txt':
        return False, 0

    # A single pre-formatted argument prints the same text with the
    # Python 2 print statement and the Python 3 print function.
    print('Parsing %s' % (file_path,))

    try:
        if not extractor.parse_document():
            # Pass the path as a logging argument so the message is only
            # formatted when the record is actually emitted.
            logging.error('Failed to parse %s, continue', file_path)
            return False, 0
        return True, extractor.update(ignore_parsed=True)
    except DuplicateKeyError:
        # Raised while parsing or storing; treated as "nothing new added".
        logging.error('document existed, continue...')
        return False, 0
Ejemplo n.º 2
0
def extract(file_path):
    '''
    Run the parse-and-store step for a single file (one worker task).

    The file is handled only when ``ExtractorFactory.get_extractor`` yields
    an extractor and the path ends in ``.txt``.

    :param file_path: path of the candidate file.
    :return: ``(True, inc)`` when ``parse_document()`` succeeds, where ``inc``
        is the result of ``extractor.update(ignore_parsed=True)`` (looks like
        a count of added documents -- TODO confirm); ``(False, 0)`` when the
        file is skipped, cannot be parsed, or a ``DuplicateKeyError`` occurs.
    '''
    not_stored = (False, 0)

    extractor = ExtractorFactory.get_extractor(file_path)
    extension = os.path.splitext(file_path)[-1]
    if extractor is None or extension != '.txt':  # only parse the txt file
        return not_stored

    # One formatted string keeps the output identical whether ``print`` is
    # the Python 2 statement or the Python 3 function.
    print('Parsing %s' % (file_path,))

    try:
        parsed = extractor.parse_document()
        if parsed:
            inc = extractor.update(ignore_parsed=True)
            return True, inc
        # Lazy logging arguments: formatting happens only if the record is
        # emitted by a handler.
        logging.error('Failed to parse %s, continue', file_path)
    except DuplicateKeyError:
        logging.error('document existed, continue...')
    return not_stored
Ejemplo n.º 3
0
    def run(self):
        '''
        Drain ``self.queue``, parsing and inserting every ``.txt`` file found.

        Each queue item is treated as a file path. The extractor is obtained
        from ``ExtractorFactory`` while holding ``extractor_lock``; files with
        no extractor or a non-``.txt`` extension are skipped. The loop ends
        as soon as the queue reports itself empty.
        '''
        # NOTE(review): empty() followed by a blocking get() can stall when
        # several consumers share the queue (another consumer may take the
        # last item in between). Left unchanged because switching to a
        # non-blocking get could make workers exit early on some queue types.
        while not self.queue.empty():
            # get the file_path from the Q
            file_path = self.queue.get()
            extension = os.path.splitext(file_path)[-1]

            # get the extractor from unique factory; release the lock even
            # if the factory raises, otherwise every other worker would
            # block on acquire() forever.
            extractor_lock.acquire()
            try:
                extractor = ExtractorFactory.get_extractor(file_path)
            finally:
                extractor_lock.release()

            # only parse the txt file
            if extractor is None or extension != '.txt':
                continue

            # Single pre-formatted argument: same output under the Python 2
            # print statement and the Python 3 print function.
            print('worker %s is parsing %s' % (os.getpid(), file_path))
            try:
                if extractor.parse_document():
                    extractor.insert()
                else:
                    print('parser error for %s, continue' % (file_path,))
            except DuplicateKeyError:
                # Move on to the next document.
                print('document existed, continue')
Ejemplo n.º 4
0
    def run(self):
        '''
        Worker loop: take file paths off ``self.queue`` until it is empty.

        For each path an extractor is requested from ``ExtractorFactory``
        under ``extractor_lock``. When an extractor exists and the path ends
        in ``.txt`` the document is parsed and, on success, inserted.
        Parse failures and ``DuplicateKeyError`` are reported on stdout and
        the loop proceeds to the next path.
        '''
        while True:
            # NOTE(review): with multiple consumers, empty() can be stale by
            # the time get() runs, in which case get() blocks. Kept as-is to
            # preserve the existing termination behaviour.
            if self.queue.empty():
                break

            # get the file_path from the Q
            file_path = self.queue.get()
            extension = os.path.splitext(file_path)[-1]

            # get the extractor from unique factory
            extractor_lock.acquire()
            try:
                extractor = ExtractorFactory.get_extractor(file_path)
            finally:
                # Always give the lock back, including when the factory
                # raises, so other workers are never left waiting on it.
                extractor_lock.release()

            is_txt = extension == '.txt'  # only parse the txt file
            if extractor is not None and is_txt:
                # Pre-formatted so the printed text is the same on
                # Python 2 and Python 3.
                print('worker %s is parsing %s' % (os.getpid(), file_path))
                try:
                    if extractor.parse_document():
                        extractor.insert()
                    else:
                        print('parser error for %s, continue' % (file_path,))
                except DuplicateKeyError:
                    print('document existed, continue')