Example #1
def sql_functions(temp_db_conn, def_config, src_dir):
    # Point the configuration at the SQL sources in the source tree,
    # install the utility and ICU tokenizer functions, then restore
    # the original library path.
    orig_sql = def_config.lib_dir.sql
    def_config.lib_dir.sql = src_dir / 'lib-sql'
    sqlproc = SQLPreprocessor(temp_db_conn, def_config)
    sqlproc.run_sql_file(temp_db_conn, 'functions/utils.sql')
    sqlproc.run_sql_file(temp_db_conn, 'tokenizer/icu_tokenizer.sql')
    def_config.lib_dir.sql = orig_sql
Example #2
    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()
Example #3
def create_functions(conn, config, enable_diff_updates=True, enable_debug=False):
    """ (Re)create the PL/pgSQL functions.
    """
    sql = SQLPreprocessor(conn, config)

    sql.run_sql_file(conn, 'functions.sql',
                     disable_diff_updates=not enable_diff_updates,
                     debug=enable_debug)
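The extra keyword arguments given to run_sql_file are presumably made available to the SQL template, so callers can switch optional code paths on or off when (re)creating the functions. A minimal, hypothetical call-site sketch follows; the connect() helper and the dsn and config values are assumptions borrowed from the neighbouring examples.

# Hypothetical call site: recreate the functions with debug output
# enabled and differential-update support switched off.
with connect(dsn) as conn:
    create_functions(conn, config,
                     enable_diff_updates=False,
                     enable_debug=True)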
Example #4
    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

        LOG.warning("Precomputing word tokens")
        db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
Example #5
    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
            modulepath = config.DATABASE_MODULE_PATH or \
                         str((config.project_dir / 'module').resolve())
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn,
                              'tokenizer/legacy_tokenizer.sql',
                              max_word_freq=max_word_freq,
                              modulepath=modulepath)
Example #6
    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = Counter()
            with self.name_analyzer() as analyzer:
                with conn.cursor(name="words") as cur:
                    cur.execute(
                        "SELECT svals(name) as v, count(*) FROM place GROUP BY v"
                    )

                    for name, cnt in cur:
                        term = analyzer.make_standard_word(name)
                        if term:
                            for word in term.split():
                                words[word] += cnt

            # copy them back into the word table
            copystr = io.StringIO(''.join(
                ('{}\t{}\n'.format(*args) for args in words.items())))

            with conn.cursor() as cur:
                copystr.seek(0)
                cur.copy_from(copystr,
                              'word',
                              columns=['word_token', 'search_name_count'])
                cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                               WHERE word_id is null""")

            conn.commit()
Example #7
def add_tiger_data(data_dir, config, threads, tokenizer):
    """ Import tiger data from directory or tar file `data dir`.
    """
    dsn = config.get_libpq_dsn()
    files, tar = handle_tarfile_or_directory(data_dir)

    if not files:
        return

    with connect(dsn) as conn:
        sql = SQLPreprocessor(conn, config)
        sql.run_sql_file(conn, 'tiger_import_start.sql')

    # Read the files and, for each input line, hand the generated SQL
    # statements to a pool of <threads - 1> worker connections.
    place_threads = max(1, threads - 1)

    with WorkerPool(dsn, place_threads, ignore_sql_errors=True) as pool:
        with tokenizer.name_analyzer() as analyzer:
            for fname in files:
                if not tar:
                    fd = open(fname)
                else:
                    fd = io.TextIOWrapper(tar.extractfile(fname))

                handle_threaded_sql_statements(pool, fd, analyzer)

                fd.close()

    if tar:
        tar.close()
    print('\n')
    LOG.warning("Creating indexes on Tiger data")
    with connect(dsn) as conn:
        sql = SQLPreprocessor(conn, config)
        sql.run_sql_file(conn, 'tiger_import_finish.sql')
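A hedged call-site sketch for the Tiger import above; the file name and thread count are placeholders, and the config and tokenizer objects are assumed to come from the surrounding setup code.

# Hypothetical invocation: import TIGER address data from a tarball
# using four threads and an already initialised tokenizer.
add_tiger_data('tiger-data.tar.gz', config, 4, tokenizer)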
Example #8
    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
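All of the examples above follow the same basic pattern: open a database connection, build an SQLPreprocessor from the loaded configuration, and run one or more SQL template files against that connection. The sketch below restates that pattern in one place; the import paths are assumptions based on the Nominatim source layout and may differ between versions.

from nominatim.db.connection import connect                 # assumed module path
from nominatim.db.sql_preprocessor import SQLPreprocessor   # assumed module path

def install_sql(dsn, config, sql_file):
    # Open a connection, build the preprocessor from the configuration,
    # then render and execute the given SQL template file.
    with connect(dsn) as conn:
        sqlp = SQLPreprocessor(conn, config)
        sqlp.run_sql_file(conn, sql_file)
        conn.commit()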