Example no. 1
0
 def dump_project_settings(self, project):
     """Serialize the project's settings into its .meta.json.xz file."""
     meta_name = '{0}.meta.json.xz'.format(project.name)
     target = os.path.join(self.output_dir, project.name, meta_name)
     self.fp = self.get_fp(target)
     self.writer = ProjectSettingsWriter(self.fp)
     self.writer.write_project(project)
     self.close_fp()
Example no. 2
0
 def dump_project_settings(self, project):
     """Write project metadata to <output_dir>/<name>/<name>.meta.json.xz."""
     settings_path = os.path.join(
         self.output_dir,
         project.name,
         '{0}.meta.json.xz'.format(project.name))
     self.fp = self.get_fp(settings_path)
     self.writer = ProjectSettingsWriter(self.fp)
     self.writer.write_project(project)
     self.close_fp()
Example no. 3
0
class Exporter:
    """Export shortcode results into per-project BEACON-style dump files.

    Pipeline (driven by :meth:`dump`):

    1. ``_drain_to_working_set`` streams ``Result`` rows from the database
       into a flat working-set file (one base64-encoded pickle per line).
    2. ``_feed_input_sorters`` feeds the working set into one external
       sorter per project.
    3. Each project's sorted results are written into sharded, optionally
       LZMA-compressed files, and the project directory may be zipped.
    """

    def __init__(self, output_dir, format="beacon", settings=None):
        """Create an exporter writing under *output_dir*.

        :param output_dir: destination directory for all exported files.
        :param format: key into the writer ``registry``.
        :param settings: export options; required keys include ``after``,
            ``max_items``, ``dir_length``, ``max_right``, ``file_length``,
            ``include_settings``, ``zip`` and ``delete``.
        """
        super().__init__()

        # Fix: avoid the shared mutable-default-argument pitfall; ``None``
        # keeps the call signature backward compatible.
        if settings is None:
            settings = {}

        self.setup_format(format)
        self.output_dir = output_dir
        self.settings = settings
        # Only export results newer than this datetime (falsy disables).
        self.after = self.settings['after']
        # Stop draining once this many results were read (falsy disables).
        self.max_items = self.settings['max_items']

        self.projects_count = 0
        self.items_count = 0
        # Most recent Result.datetime seen during the dump.
        self.last_date = None

        # Write LZMA-compressed output files.
        self.lzma = True
        self.extension = 'txt.xz'

        # Length of directory name
        self.dir_length = settings['dir_length']
        # Number of characters from the right are not used in directory name
        # in other words, number of _
        self.max_right = settings['max_right']
        # Number of characters from the left that are used in file name
        # in other words, number of characters that are not in directory name and not _
        self.file_length = settings['file_length']

        # Example of settings:
        # dir_length = 2
        # max_right = 4
        # file_length = 2
        # output: projectname/00/01/000100____.txt, projectname/01/01__.txt

        # Currently open output file and writer (managed by get_fp/close_fp).
        self.fp = None
        self.writer = None
        # Maps project name -> external sorter instance.
        self.project_result_sorters = {}

        self.working_set_filename = os.path.join(output_dir,
                                                 'current_working_set.txt')

    def setup_format(self, format):
        """Resolve *format* to a writer class via the registry."""
        self.format = registry[format]

    def make_output_dir(self):
        """Create the output directory if it does not exist yet."""
        if not os.path.isdir(self.output_dir):
            os.makedirs(self.output_dir)

    def dump(self):
        """Run the full export pipeline and remove the working set file."""
        self.make_output_dir()

        # Presence of this file signals other processes that the database
        # is busy while we drain it.
        database_busy_file = self.settings.get('database_busy_file')

        if database_busy_file:
            with open(database_busy_file, 'w'):
                pass

        self._drain_to_working_set()

        if database_busy_file:
            os.remove(database_busy_file)

        self._feed_input_sorters()

        with new_session() as session:
            for project_id, sorter in self.project_result_sorters.items():
                project = session.query(Project).filter_by(
                    name=project_id).first()

                if self.settings['include_settings']:
                    self.dump_project_settings(project)

                self.dump_project(project, sorter)

                if self.settings['zip']:
                    self.zip_project(project)

        os.remove(self.working_set_filename)

    def _drain_to_working_set(self, size=1000):
        """Stream results from the database into the working-set file.

        Rows are fetched in id-ordered batches of *size* and written one per
        line as base64-encoded pickles.  When ``settings['delete']`` is set,
        drained rows are also deleted from the database.
        """
        logger.info('Draining to working set %s', self.working_set_filename)

        assert not os.path.exists(self.working_set_filename)

        with new_session() as session:
            query = session.query(Result)

            if self.after:
                query = query.filter(Result.datetime > self.after)

            with open(self.working_set_filename, 'wb') as work_file:
                last_id = -1
                num_results = 0
                running = True

                while running:
                    # Optimized for SQLite scrolling window
                    rows = query.filter(Result.id > last_id).limit(size).all()

                    if not rows:
                        break

                    # Bug fix: advance the scrolling window.  Previously
                    # last_id was never updated, so progress relied on
                    # settings['delete'] removing the drained rows; with
                    # deletion disabled the same batch was re-read forever.
                    last_id = rows[-1].id

                    delete_ids = []

                    for result in rows:
                        line = base64.b64encode(
                            pickle.dumps({
                                'id': result.id,
                                'project_id': result.project_id,
                                'shortcode': result.shortcode,
                                'url': result.url,
                                'encoding': result.encoding,
                                'datetime': result.datetime,
                            }))
                        work_file.write(line)
                        work_file.write(b'\n')

                        num_results += 1
                        self.items_count += 1

                        delete_ids.append(result.id)

                        if num_results % 10000 == 0:
                            logger.info('Drain progress: %d', num_results)

                        if num_results % 100000 == 0:
                            # Risky, but need to do this since WAL
                            # performance is low on large transactions
                            logger.info(
                                "Checkpoint. (Don't delete stray files if program crashes!)"
                            )
                            work_file.flush()
                            session.commit()

                        if self.max_items and num_results >= self.max_items:
                            logger.info('Reached max items %d.',
                                        self.max_items)
                            running = False
                            break

                    if self.settings['delete']:
                        delete_query = delete(Result).where(
                            Result.id == bindparam('id'))
                        session.execute(delete_query, [{
                            'id': result_id
                        } for result_id in delete_ids])

    def _feed_input_sorters(self):
        """Feed the working set into one external sorter per project."""
        num_results = 0

        with open(self.working_set_filename, 'rb') as work_file:
            for line in work_file:
                # Working-set lines are trusted local data produced by
                # _drain_to_working_set, so unpickling is acceptable here.
                result = pickle.loads(base64.b64decode(line))

                if result['project_id'] not in self.project_result_sorters:
                    self.project_result_sorters[result['project_id']] = \
                        GNUExternalSort(temp_dir=self.output_dir,
                                        temp_prefix='tott-{0}-'.format(
                                            result['project_id']
                                            )
                                        )
                    self.projects_count += 1

                sorter = self.project_result_sorters[result['project_id']]
                sorter.input(result['shortcode'],
                             (result['id'], result['url'], result['encoding'],
                              result['datetime']))
                num_results += 1

                if num_results % 10000 == 0:
                    logger.info('Sort progress: %d', num_results)

    def dump_project(self, project, sorter):
        """Write one project's sorted results into sharded output files."""
        logger.info('Looking in project %s', project.name)

        if project.url_template.endswith('{shortcode}'):
            site = project.url_template.replace('{shortcode}', '')
        else:
            site = project.url_template

        last_filename = None

        for i, (key, value) in enumerate(sorter.sort()):
            if i % 10000 == 0:
                logger.info('Format progress: %d/%d', i, sorter.rows)

            id_, url, encoding, datetime_ = value
            result = ResultContainer(id_, key, url, encoding, datetime_)

            # we can do this as the query is sorted
            # so that item that would end up together
            # would returned together
            filename = self.get_filename(project, result)
            if filename != last_filename:
                self.close_fp()

                logger.info('Writing results to file %s.', filename)
                assert not os.path.isfile(
                    filename), 'Target file %s already exists' % (filename)

                self.fp = self.get_fp(filename)
                self.writer = self.format(self.fp)
                self.writer.write_header(site)

                last_filename = filename

            # Try the stored encoding first, then common fallbacks.
            # Fix: LookupError is caught too, so an unknown codec name
            # stored in the database falls through to the fallback
            # encodings instead of aborting the whole dump.
            for encoding in (result.encoding, 'latin-1', 'cp437', 'utf-8'):
                try:
                    result.url.encode(encoding)
                except (UnicodeError, LookupError):
                    logger.warning('Encoding failed %s|%s %s.',
                                   result.shortcode,
                                   repr(result.url),
                                   encoding,
                                   exc_info=True)
                    continue
                else:
                    self.writer.write_shortcode(result.shortcode, result.url,
                                                encoding)
                    break
            else:
                raise Exception('Unable to encode {}|{} {}'.format(
                    result.shortcode, repr(result.url), result.encoding))

            if not self.last_date or result.datetime > self.last_date:
                self.last_date = result.datetime

        self.close_fp()

    def dump_project_settings(self, project):
        """Write the project's settings to <name>/<name>.meta.json.xz."""
        path = os.path.join(self.output_dir, project.name,
                            '{0}.meta.json.xz'.format(project.name))
        self.fp = self.get_fp(path)
        self.writer = ProjectSettingsWriter(self.fp)
        self.writer.write_project(project)
        self.close_fp()

    def zip_project(self, project):
        """Zip the project's output directory, then remove the originals."""
        project_path = os.path.join(self.output_dir, project.name)

        filename = project.name

        if self.settings.get('zip_filename_infix'):
            filename += self.settings['zip_filename_infix']

        zip_path = os.path.join(self.output_dir, '{0}.zip'.format(filename))

        assert not os.path.isfile(
            zip_path), 'Target file %s already exists' % (zip_path)

        with zipfile.ZipFile(zip_path,
                             mode='w',
                             compression=zipfile.ZIP_STORED) as zip_file:
            for root, dirs, files in os.walk(project_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    arc_filename = os.path.relpath(file_path, self.output_dir)
                    zip_file.write(file_path, arc_filename)

        shutil.rmtree(project_path)

    def get_fp(self, filename):
        """Open *filename* for binary writing, creating parent directories."""
        dirname = os.path.dirname(filename)
        if not os.path.isdir(dirname):
            os.makedirs(dirname)

        if self.lzma:
            return lzma.open(filename, 'wb')
        else:
            return open(filename, 'wb')

    def close_fp(self):
        """Write the footer and close the current output file, if any."""
        if not self.fp or not self.writer:
            return
        self.writer.write_footer()
        self.fp.close()
        # Bug fix: drop the stale references so a later close_fp() cannot
        # write a second footer to an already-closed file (both
        # dump_project_settings and dump_project funnel through here).
        self.fp = None
        self.writer = None

    def get_filename(self, project, item):
        """Return the sharded output path for *item* within *project*."""
        path = os.path.join(self.output_dir, project.name)

        dirs, prefix, underscores = self.split_shortcode(
            item.shortcode, self.dir_length, self.max_right, self.file_length)

        dirs = [quote(dirname.encode('ascii')) for dirname in dirs]
        path = os.path.join(path, *dirs)

        path = os.path.join(
            path, '%s%s.%s' % (quote(prefix.encode('ascii')),
                               '_' * len(underscores), self.extension))

        return path

    @classmethod
    def split_shortcode(cls,
                        shortcode,
                        dir_length=2,
                        max_right=4,
                        file_length=2):
        """Split *shortcode* into (directory parts, file prefix, masked tail).

        The tail is rendered as underscores in the final file name.  E.g.
        '0001asdf' with the defaults yields (['00'], '0001', 'asdf').
        """
        assert dir_length >= 0
        assert max_right >= 0
        assert file_length >= 0
        # 0001asdf
        # dir_length max_right file_length

        dirs = []

        # create directories until we left only max_right or less characters
        length = 0
        shortcode_temp = shortcode
        while dir_length and len(shortcode_temp) > max_right + file_length:
            dirname = shortcode_temp[:dir_length]
            dirs.append(dirname)
            length += len(dirname)
            shortcode_temp = shortcode_temp[dir_length:]

        # name the file
        code_length = len(shortcode)
        length_left = code_length - length
        underscores = min(length_left, max_right)

        return dirs, shortcode[:code_length -
                               underscores], shortcode[code_length -
                                                       underscores:]
Example no. 4
0
class Exporter:
    """Export shortcode results into per-project dump files.

    Pipeline (driven by :meth:`dump`): drain ``Result`` rows into a flat
    working-set file, feed them into one external sorter per project, then
    write sorted, sharded (optionally LZMA-compressed) output files and
    optionally zip each project directory.
    """

    def __init__(self, output_dir, format="beacon", settings={}):
        """Create an exporter writing under *output_dir*.

        ``settings`` must provide ``after``, ``max_items``, ``dir_length``,
        ``max_right``, ``file_length``, ``include_settings``, ``zip`` and
        ``delete``.

        NOTE(review): ``settings={}`` is a mutable default argument; it is
        only read here, but ``settings=None`` plus an in-body default would
        be the safer idiom.
        """
        super().__init__()

        self.setup_format(format)
        self.output_dir = output_dir
        self.settings = settings
        # Only export results newer than this datetime (falsy disables).
        self.after = self.settings["after"]
        # Stop draining once this many results were read (falsy disables).
        self.max_items = self.settings["max_items"]

        self.projects_count = 0
        self.items_count = 0
        # Most recent Result.datetime seen during the dump.
        self.last_date = None

        # Write LZMA-compressed output files.
        self.lzma = True
        self.extension = "txt.xz"

        # Length of directory name
        self.dir_length = settings["dir_length"]
        # Number of characters from the right are not used in directory name
        # in other words, number of _
        self.max_right = settings["max_right"]
        # Number of characters from the left that are used in file name
        # in other words, number of characters that are not in directory name and not _
        self.file_length = settings["file_length"]

        # Example of settings:
        # dir_length = 2
        # max_right = 4
        # file_length = 2
        # output: projectname/00/01/000100____.txt, projectname/01/01__.txt

        # Currently open output file and writer (managed by get_fp/close_fp).
        self.fp = None
        self.writer = None
        # Maps project name -> external sorter instance.
        self.project_result_sorters = {}

        self.working_set_filename = os.path.join(output_dir, "current_working_set.txt")

    def setup_format(self, format):
        """Resolve *format* to a writer class via the registry."""
        self.format = registry[format]

    def make_output_dir(self):
        """Create the output directory if it does not exist yet."""
        if not os.path.isdir(self.output_dir):
            os.makedirs(self.output_dir)

    def dump(self):
        """Run the full export pipeline and remove the working set file."""
        self.make_output_dir()

        # Presence of this file signals other processes that the database
        # is busy while we drain it.
        database_busy_file = self.settings.get("database_busy_file")

        if database_busy_file:
            with open(database_busy_file, "w"):
                pass

        self._drain_to_working_set()

        if database_busy_file:
            os.remove(database_busy_file)

        self._feed_input_sorters()

        with new_session() as session:
            for project_id, sorter in self.project_result_sorters.items():
                project = session.query(Project).filter_by(name=project_id).first()

                if self.settings["include_settings"]:
                    self.dump_project_settings(project)

                self.dump_project(project, sorter)

                if self.settings["zip"]:
                    self.zip_project(project)

        os.remove(self.working_set_filename)

    def _drain_to_working_set(self, size=1000):
        """Stream results from the database into the working-set file.

        Rows are fetched in id-ordered batches of *size* and written one per
        line as base64-encoded pickles.  When ``settings['delete']`` is set,
        drained rows are also deleted from the database.
        """
        logger.info("Draining to working set %s", self.working_set_filename)

        assert not os.path.exists(self.working_set_filename)

        with new_session() as session:
            query = session.query(Result)

            if self.after:
                query = query.filter(Result.datetime > self.after)

            with open(self.working_set_filename, "wb") as work_file:
                # NOTE(review): last_id is never advanced inside the loop,
                # so the "scrolling window" below only makes progress when
                # settings['delete'] removes the drained rows; with deletion
                # disabled the same batch is re-read forever.  Consider
                # setting last_id = rows[-1].id after each fetch.
                last_id = -1
                num_results = 0
                running = True

                while running:
                    # Optimized for SQLite scrolling window
                    rows = query.filter(Result.id > last_id).limit(size).all()

                    if not rows:
                        break

                    delete_ids = []

                    for result in rows:
                        line = base64.b64encode(
                            pickle.dumps(
                                {
                                    "id": result.id,
                                    "project_id": result.project_id,
                                    "shortcode": result.shortcode,
                                    "url": result.url,
                                    "encoding": result.encoding,
                                    "datetime": result.datetime,
                                }
                            )
                        )
                        work_file.write(line)
                        work_file.write(b"\n")

                        num_results += 1
                        self.items_count += 1

                        delete_ids.append(result.id)

                        if num_results % 10000 == 0:
                            logger.info("Drain progress: %d", num_results)

                        if num_results % 100000 == 0:
                            # Risky, but need to do this since WAL
                            # performance is low on large transactions
                            logger.info("Checkpoint. (Don't delete stray files if program crashes!)")
                            work_file.flush()
                            session.commit()

                        if self.max_items and num_results >= self.max_items:
                            logger.info("Reached max items %d.", self.max_items)
                            running = False
                            break

                    if self.settings["delete"]:
                        delete_query = delete(Result).where(Result.id == bindparam("id"))
                        session.execute(delete_query, [{"id": result_id} for result_id in delete_ids])

    def _feed_input_sorters(self):
        """Feed the working set into one external sorter per project."""
        num_results = 0

        with open(self.working_set_filename, "rb") as work_file:
            for line in work_file:
                # Working-set lines are trusted local data produced by
                # _drain_to_working_set, so unpickling is acceptable here.
                result = pickle.loads(base64.b64decode(line))

                if result["project_id"] not in self.project_result_sorters:
                    self.project_result_sorters[result["project_id"]] = GNUExternalSort(
                        temp_dir=self.output_dir, temp_prefix="tott-{0}-".format(result["project_id"])
                    )
                    self.projects_count += 1

                sorter = self.project_result_sorters[result["project_id"]]
                sorter.input(result["shortcode"], (result["id"], result["url"], result["encoding"], result["datetime"]))
                num_results += 1

                if num_results % 10000 == 0:
                    logger.info("Sort progress: %d", num_results)

    def dump_project(self, project, sorter):
        """Write one project's sorted results into sharded output files."""
        logger.info("Looking in project %s", project.name)

        if project.url_template.endswith("{shortcode}"):
            site = project.url_template.replace("{shortcode}", "")
        else:
            site = project.url_template

        last_filename = None

        for i, (key, value) in enumerate(sorter.sort()):
            if i % 10000 == 0:
                logger.info("Format progress: %d/%d", i, sorter.rows)

            id_, url, encoding, datetime_ = value
            result = ResultContainer(id_, key, url, encoding, datetime_)

            # we can do this as the query is sorted
            # so that item that would end up together
            # would returned together
            filename = self.get_filename(project, result)
            if filename != last_filename:
                self.close_fp()

                logger.info("Writing results to file %s.", filename)
                assert not os.path.isfile(filename), "Target file %s already exists" % (filename)

                self.fp = self.get_fp(filename)
                self.writer = self.format(self.fp)
                self.writer.write_header(site)

                last_filename = filename

            # Try the stored encoding first, then common fallbacks; the
            # for/else raises only if every candidate encoding fails.
            for encoding in (result.encoding, "latin-1", "cp437", "utf-8"):
                try:
                    result.url.encode(encoding)
                except UnicodeError:
                    logger.warning(
                        "Encoding failed %s|%s %s.", result.shortcode, repr(result.url), encoding, exc_info=True
                    )
                    continue
                else:
                    self.writer.write_shortcode(result.shortcode, result.url, encoding)
                    break
            else:
                raise Exception("Unable to encode {}|{} {}".format(result.shortcode, repr(result.url), result.encoding))

            if not self.last_date or result.datetime > self.last_date:
                self.last_date = result.datetime

        self.close_fp()

    def dump_project_settings(self, project):
        """Write the project's settings to <name>/<name>.meta.json.xz."""
        path = os.path.join(self.output_dir, project.name, "{0}.meta.json.xz".format(project.name))
        self.fp = self.get_fp(path)
        self.writer = ProjectSettingsWriter(self.fp)
        self.writer.write_project(project)
        self.close_fp()

    def zip_project(self, project):
        """Zip the project's output directory, then remove the originals."""
        project_path = os.path.join(self.output_dir, project.name)

        filename = project.name

        if self.settings.get("zip_filename_infix"):
            filename += self.settings["zip_filename_infix"]

        zip_path = os.path.join(self.output_dir, "{0}.zip".format(filename))

        assert not os.path.isfile(zip_path), "Target file %s already exists" % (zip_path)

        with zipfile.ZipFile(zip_path, mode="w", compression=zipfile.ZIP_STORED) as zip_file:
            for root, dirs, files in os.walk(project_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    arc_filename = os.path.relpath(file_path, self.output_dir)
                    zip_file.write(file_path, arc_filename)

        shutil.rmtree(project_path)

    def get_fp(self, filename):
        """Open *filename* for binary writing, creating parent directories."""
        dirname = os.path.dirname(filename)
        if not os.path.isdir(dirname):
            os.makedirs(dirname)

        if self.lzma:
            return lzma.open(filename, "wb")
        else:
            return open(filename, "wb")

    def close_fp(self):
        """Write the footer and close the current output file, if any.

        NOTE(review): self.fp/self.writer are left pointing at the closed
        file.  dump() calls dump_project_settings before dump_project, and
        dump_project's first close_fp() then calls write_footer on that
        stale writer, which may write to a closed stream.  Consider
        resetting both attributes to None after closing.
        """
        if not self.fp or not self.writer:
            return
        self.writer.write_footer()
        self.fp.close()

    def get_filename(self, project, item):
        """Return the sharded output path for *item* within *project*."""
        path = os.path.join(self.output_dir, project.name)

        dirs, prefix, underscores = self.split_shortcode(
            item.shortcode, self.dir_length, self.max_right, self.file_length
        )

        dirs = [quote(dirname.encode("ascii")) for dirname in dirs]
        path = os.path.join(path, *dirs)

        path = os.path.join(path, "%s%s.%s" % (quote(prefix.encode("ascii")), "_" * len(underscores), self.extension))

        return path

    @classmethod
    def split_shortcode(cls, shortcode, dir_length=2, max_right=4, file_length=2):
        """Split *shortcode* into (directory parts, file prefix, masked tail).

        The tail is rendered as underscores in the final file name.  E.g.
        '0001asdf' with the defaults yields (['00'], '0001', 'asdf').
        """
        assert dir_length >= 0
        assert max_right >= 0
        assert file_length >= 0
        # 0001asdf
        # dir_length max_right file_length

        dirs = []

        # create directories until we left only max_right or less characters
        length = 0
        shortcode_temp = shortcode
        while dir_length and len(shortcode_temp) > max_right + file_length:
            dirname = shortcode_temp[:dir_length]
            dirs.append(dirname)
            length += len(dirname)
            shortcode_temp = shortcode_temp[dir_length:]

        # name the file
        code_length = len(shortcode)
        length_left = code_length - length
        underscores = min(length_left, max_right)

        return dirs, shortcode[: code_length - underscores], shortcode[code_length - underscores :]
Example no. 5
0
class Exporter:
    """Export shortcode results from the database into per-project files.

    Unlike the working-set variant, this exporter queries each project's
    results directly (sorted by shortcode length, then shortcode) and
    streams them into sharded, optionally LZMA-compressed output files.
    """

    def __init__(self, output_dir, format="beacon", settings=None):
        """Create an exporter writing under *output_dir*.

        :param output_dir: destination directory for all exported files.
        :param format: key into the writer ``registry``.
        :param settings: export options; required keys include ``after``,
            ``dir_length``, ``max_right``, ``file_length``,
            ``include_settings``, ``zip`` and ``delete``.
        """
        super().__init__()

        # Fix: avoid the shared mutable-default-argument pitfall; ``None``
        # keeps the call signature backward compatible.
        if settings is None:
            settings = {}

        self.setup_format(format)
        self.output_dir = output_dir
        self.settings = settings
        # Only export results newer than this datetime (falsy disables).
        self.after = self.settings['after']

        self.projects_count = 0
        self.items_count = 0
        # Most recent Result.datetime seen during the dump.
        self.last_date = None

        # Write LZMA-compressed output files.
        self.lzma = True
        self.extension = 'txt.xz'

        # Length of directory name
        self.dir_length = settings['dir_length']
        # Number of characters from the right are not used in directory name
        # in other words, number of _
        self.max_right = settings['max_right']
        # Number of characters from the left that are used in file name
        # in other words, number of characters that are not in directory name and not _
        self.file_length = settings['file_length']

        # Example of settings:
        # dir_length = 2
        # max_right = 4
        # file_length = 2
        # output: projectname/00/01/000100____.txt, projectname/01/01__.txt

        # Fix: define the output-file state up front so close_fp() cannot
        # raise AttributeError if called before any dump method ran.
        self.fp = None
        self.writer = None

    def setup_format(self, format):
        """Resolve *format* to a writer class via the registry."""
        self.format = registry[format]

    def make_output_dir(self):
        """Create the output directory if it does not exist yet."""
        if not os.path.isdir(self.output_dir):
            os.makedirs(self.output_dir)

    def dump(self):
        """Export every project in the database."""
        self.make_output_dir()

        with new_session() as session:
            for project in session.query(Project):
                if self.settings['include_settings']:
                    self.dump_project_settings(project)

                self.dump_project(project, session)

                if self.settings['zip']:
                    self.zip_project(project)

    def dump_project(self, project, session):
        """Write one project's results into sharded output files."""
        print('Looking in project %s' % (project.name))
        # Sorting by (length, shortcode) groups items that share an output
        # file, so files can be written sequentially.
        query = session.query(Result) \
            .filter_by(project=project) \
            .order_by(func.char_length(Result.shortcode), Result.shortcode)

        if self.after:
            query = query.filter(Result.datetime > self.after)

        count = query.count()
        if count == 0:
            return

        self.projects_count += 1

        assert project.url_template.endswith('{shortcode}'), \
            'Writer only supports URL with prefix'

        # XXX: Use regex \{shortcode\}$ instead?
        site = project.url_template.replace('{shortcode}', '')

        self.fp = None
        self.writer = None
        last_filename = ''
        i = 0

        for item in query:
            self.items_count += 1
            i += 1

            if i % 1000 == 0:
                print('%d/%d' % (i, count))

            # we can do this as the query is sorted
            # so that item that would end up together
            # would returned together
            filename = self.get_filename(project, item)
            if filename != last_filename:
                self.close_fp()

                assert not os.path.isfile(filename), 'Target file %s already exists' % (filename)

                self.fp = self.get_fp(filename)
                self.writer = self.format(self.fp)
                self.writer.write_header(site)

                last_filename = filename

            self.writer.write_shortcode(item.shortcode, item.url, item.encoding)

            if not self.last_date or item.datetime > self.last_date:
                self.last_date = item.datetime

            if self.settings['delete']:
                session.delete(item)

        self.close_fp()

    def dump_project_settings(self, project):
        """Write the project's settings to <name>/<name>.meta.json.xz."""
        path = os.path.join(self.output_dir, project.name,
                            '{0}.meta.json.xz'.format(project.name))
        self.fp = self.get_fp(path)
        self.writer = ProjectSettingsWriter(self.fp)
        self.writer.write_project(project)
        self.close_fp()

    def zip_project(self, project):
        """Zip the project's output directory, then remove the originals."""
        project_path = os.path.join(self.output_dir, project.name)
        zip_path = os.path.join(self.output_dir,
                                '{0}.zip'.format(project.name))

        assert not os.path.isfile(zip_path), 'Target file %s already exists' % (zip_path)

        with zipfile.ZipFile(zip_path, mode='w',
                             compression=zipfile.ZIP_STORED) as zip_file:
            for root, dirs, files in os.walk(project_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    arc_filename = os.path.relpath(file_path, self.output_dir)
                    zip_file.write(file_path, arc_filename)

        shutil.rmtree(project_path)

    def get_fp(self, filename):
        """Open *filename* for binary writing, creating parent directories."""
        dirname = os.path.dirname(filename)
        if not os.path.isdir(dirname):
            os.makedirs(dirname)

        if self.lzma:
            return lzma.open(filename, 'wb')
        else:
            return open(filename, 'wb')

    def close_fp(self):
        """Write the footer and close the current output file, if any."""
        if not self.fp or not self.writer:
            return
        self.writer.write_footer()
        self.fp.close()
        # Fix: drop the stale references so a later close_fp() cannot write
        # a second footer to an already-closed file.
        self.fp = None
        self.writer = None

    def get_filename(self, project, item):
        """Return the sharded output path for *item* within *project*."""
        path = os.path.join(self.output_dir, project.name)

        dirs, prefix, underscores = self.split_shortcode(
            item.shortcode, self.dir_length, self.max_right, self.file_length)

        # NOTE(review): path components are percent-quoted using the item's
        # own encoding here, whereas the newer exporter uses 'ascii';
        # presumably shortcodes are ASCII-safe — confirm before unifying.
        dirs = [quote(dirname.encode(item.encoding)) for dirname in dirs]
        path = os.path.join(path, *dirs)

        path = os.path.join(path, '%s%s.%s' % (
            quote(prefix.encode(item.encoding)),
            '_' * len(underscores),
            self.extension
        ))

        return path

    @classmethod
    def split_shortcode(cls, shortcode, dir_length=2, max_right=4,
                        file_length=2):
        """Split *shortcode* into (directory parts, file prefix, masked tail).

        The tail is rendered as underscores in the final file name.  E.g.
        '0001asdf' with the defaults yields (['00'], '0001', 'asdf').
        """
        assert dir_length >= 0
        assert max_right >= 0
        assert file_length >= 0
        # 0001asdf
        # dir_length max_right file_length

        dirs = []

        # create directories until we left only max_right or less characters
        length = 0
        shortcode_temp = shortcode
        while dir_length and len(shortcode_temp) > max_right + file_length:
            dirname = shortcode_temp[:dir_length]
            dirs.append(dirname)
            length += len(dirname)
            shortcode_temp = shortcode_temp[dir_length:]

        # name the file
        code_length = len(shortcode)
        length_left = code_length - length
        underscores = min(length_left, max_right)

        return dirs, shortcode[:code_length - underscores], shortcode[code_length - underscores:]