Example #1
def test_file_with_env_value():
    random_env_name = random_string(length=16)
    random_env_value = random_string(length=16)

    os.environ[random_env_name] = random_env_value

    env_file = file_with_env_value(name=random_env_name)
    assert os.path.exists(env_file)

    with open(env_file, mode='r') as f:
        assert random_env_value == f.read()

    env_file_2 = file_with_env_value(name=random_env_name)
    assert env_file == env_file_2, "Helper doesn't recreate file on identical value."

    random_env_value = random_string(length=16)

    # Try changing value
    os.environ[random_env_name] = random_env_value

    env_file_3 = file_with_env_value(name=random_env_name)

    with open(env_file_3, mode='r') as f:
        assert random_env_value == f.read()

    assert env_file != env_file_3, "Helper recreates file on different value."
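
Taken together with Example #12 below, these tests pin down the helper's contract: the file's content mirrors the variable's (optionally base64-decoded) value, and the file's path changes whenever the value changes. Here is a minimal sketch that would satisfy both tests; the content-addressed path under the system temporary directory and the reuse of env_value() are assumptions, not the actual implementation:

import base64
import hashlib
import os
import tempfile


def file_with_env_value(name: str, encoded_with_base64: bool = False) -> str:
    """Write the environment variable's value to a file, return the file's path.

    Hypothetical sketch; the real helper's storage location and naming scheme
    are not shown on this page.
    """
    value = env_value(name=name)  # assumed to be the helper from Example #2

    data = base64.b64decode(value) if encoded_with_base64 else value.encode('utf-8')

    # Content-addressed filename: identical values map to the same path,
    # different values to a different one, as Example #1 asserts.
    digest = hashlib.sha256(data).hexdigest()
    path = os.path.join(tempfile.gettempdir(), 'env-%s-%s' % (name, digest))

    if not os.path.exists(path):
        with open(path, mode='wb') as f:
            f.write(data)

    return path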
Example #2
def test_env_value():
    random_env_name = random_string(length=16)
    random_env_value = random_string(length=16)

    os.environ[random_env_name] = random_env_value

    assert env_value(name=random_env_name) == random_env_value
Example #3
    def get_temporary_ids_table(self,
                                ids: List[int],
                                ordered: bool = False) -> str:
        """Get the name of a temporary table that contains all of the IDs in "ids" as an "id BIGINT" field.

        The database connection must be within a transaction. The temporary table is set up to be dropped at the end of
        the current transaction. If "ordered" is True, include a "<...>_pkey SERIAL PRIMARY KEY" field in the table."""

        table_name = '_tmp_ids_%s' % random_string(length=16)

        l.debug("Temporary IDs table: %s" % table_name)

        primary_key_clause = ""
        if ordered:
            primary_key_clause = "%s_pkey SERIAL PRIMARY KEY," % table_name

        sql = """CREATE TEMPORARY TABLE %s (""" % table_name
        sql += primary_key_clause
        sql += "id BIGINT)"
        self.query(sql)

        copy = self.copy_from("COPY %s (id) FROM STDIN" % table_name)
        for single_id in ids:
            copy.put_line("%d\n" % single_id)
        copy.end()

        self.query("ANALYZE %s" % table_name)

        return table_name
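
A hedged usage sketch: within a manually started transaction, the returned name can be joined against like any other relation, and the "<...>_pkey" column preserves insertion order when "ordered" is True (the "stories" table below is illustrative, not from this page):

db.begin()
try:
    ids_table = db.get_temporary_ids_table(ids=[1, 2, 3], ordered=True)

    # Join the temporary table against an illustrative "stories" table; the
    # SERIAL primary key preserves the order in which the IDs were COPYied in.
    stories = db.query("""
        SELECT stories.*
        FROM stories
            INNER JOIN %s AS ids
                ON stories.stories_id = ids.id
        ORDER BY ids.%s_pkey
    """ % (ids_table, ids_table)).hashes()
finally:
    # The temporary table is dropped together with the transaction.
    db.commit()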
Example #4
def test_env_value_required():
    nonexistent_env_name = random_string(length=16)

    with pytest.raises(McConfigEnvironmentVariableUnsetException):
        env_value(name=nonexistent_env_name)

    assert env_value(name=nonexistent_env_name, required=False) is None
Example #5
def test_validate_new_password():
    # noinspection PyTypeChecker
    assert len(validate_new_password(email=None, password=None, password_repeat=None)) > 0

    assert len(validate_new_password(email='', password='', password_repeat='')) > 0

    assert len(validate_new_password(email='*****@*****.**', password='', password_repeat='')) > 0

    # Passwords do not match
    assert len(validate_new_password(email='*****@*****.**', password='******', password_repeat='abcdefghX')) > 0

    # Too short
    assert len(validate_new_password(email='*****@*****.**', password='******', password_repeat='abc')) > 0

    too_long_password = random_string(length=200)
    assert len(validate_new_password(email='*****@*****.**',
                                     password=too_long_password,
                                     password_repeat=too_long_password)) > 0

    # Email == password
    email = '*****@*****.**'
    assert len(validate_new_password(email=email, password=email, password_repeat=email)) > 0

    # All good
    password = '******'
    assert len(validate_new_password(email='*****@*****.**',
                                     password=password,
                                     password_repeat=password)) == 0
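
The assertions above imply the validator's rules: the email and both password fields must be present, the passwords must match and be neither too short nor too long, and the password must not equal the email address. A sketch consistent with those assertions follows; the exact length bounds and message texts are guesses (the test data itself is redacted):

def validate_new_password(email: str, password: str, password_repeat: str) -> str:
    """Return an error message for an invalid password, or an empty string if valid.

    Hypothetical sketch; the real length limits are not shown on this page
    (the test only proves that a 200-character password is rejected).
    """
    if not email:
        return 'Email address is empty.'
    if not password or not password_repeat:
        return 'Password or its repeat is empty.'
    if password != password_repeat:
        return 'Passwords do not match.'
    if not (8 <= len(password) <= 120):
        return 'Password is too short or too long.'
    if password == email:
        return 'Password must not be the same as the email address.'
    return ''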
Example #6
def get_test_s3_credentials() -> Union[dict, None]:
    """Return test Amazon S3 credentials as a dictionary or None if credentials are not configured."""

    config = py_get_config()

    credentials = None

    # Environment variables
    if os.getenv('MC_AMAZON_S3_TEST_ACCESS_KEY_ID') is not None:
        credentials = {
            'access_key_id':
            os.getenv('MC_AMAZON_S3_TEST_ACCESS_KEY_ID', None),
            'secret_access_key':
            os.getenv('MC_AMAZON_S3_TEST_SECRET_ACCESS_KEY', None),
            'bucket_name':
            os.getenv('MC_AMAZON_S3_TEST_BUCKET_NAME', None),
            'directory_name':
            os.getenv('MC_AMAZON_S3_TEST_DIRECTORY_NAME', None),
        }

    # mediawords.yml
    elif 'amazon_s3' in config and 'test' in config['amazon_s3']:
        credentials = copy.deepcopy(config['amazon_s3']['test'])

    # We want to be able to run S3 tests in parallel
    if credentials is not None:
        credentials['directory_name'] = credentials[
            'directory_name'] + '-' + random_string(64)

    return credentials
Example #7
    def get_temporary_ids_table(self, ids: List[int], ordered: bool = False) -> str:
        """Get the name of a temporary table that contains all of the IDs in "ids" as an "id BIGINT" field.

        The database connection must be within a transaction. The temporary table is set up to be dropped at the end of
        the current transaction. If "ordered" is True, include a "<...>_pkey SERIAL PRIMARY KEY" field in the table."""

        table_name = '_tmp_ids_%s' % random_string(length=16)

        log.debug("Temporary IDs table: %s" % table_name)

        primary_key_clause = ""
        if ordered:
            primary_key_clause = "%s_pkey SERIAL PRIMARY KEY," % table_name

        sql = """CREATE TEMPORARY TABLE %s (""" % table_name
        sql += primary_key_clause
        sql += "id BIGINT)"
        self.query(sql)

        copy = self.copy_from("COPY %s (id) FROM STDIN" % table_name)
        for single_id in ids:
            copy.put_line("%d\n" % int(single_id))
        copy.end()

        self.query("ANALYZE %s" % table_name)

        return table_name
Example #8
def get_test_s3_credentials() -> Union[dict, None]:
    """Return test Amazon S3 credentials as a dictionary or None if credentials are not configured."""

    config = py_get_config()

    credentials = None

    # Environment variables
    if os.getenv('MC_AMAZON_S3_TEST_ACCESS_KEY_ID') is not None:
        credentials = {
            'access_key_id': os.getenv('MC_AMAZON_S3_TEST_ACCESS_KEY_ID', None),
            'secret_access_key': os.getenv('MC_AMAZON_S3_TEST_SECRET_ACCESS_KEY', None),
            'bucket_name': os.getenv('MC_AMAZON_S3_TEST_BUCKET_NAME', None),
            'directory_name': os.getenv('MC_AMAZON_S3_TEST_DIRECTORY_NAME', None),
        }

    # mediawords.yml
    elif 'amazon_s3' in config and 'test' in config['amazon_s3']:
        credentials = copy.deepcopy(config['amazon_s3']['test'])

    # We want to be able to run S3 tests in parallel
    if credentials is not None:
        credentials['directory_name'] = credentials['directory_name'] + '-' + random_string(64)

    return credentials
Example #9
    def _initialize_store(self) -> CachedAmazonS3Store:
        return CachedAmazonS3Store(
            access_key_id=test_credentials.access_key_id(),
            secret_access_key=test_credentials.secret_access_key(),
            bucket_name=test_credentials.bucket_name(),
            directory_name=test_credentials.directory_name() + '/' + random_string(16),
            cache_table='cache.s3_raw_downloads_cache',
        )
Example #10
def test_env_value_empty_string():
    empty_env_name = random_string(length=16)

    os.environ[empty_env_name] = ''

    with pytest.raises(McConfigEnvironmentVariableUnsetException):
        env_value(name=empty_env_name)

    assert env_value(name=empty_env_name, allow_empty_string=True) == ''
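
Examples #2, #4 and #10 jointly specify env_value(): it returns the variable's value, raises McConfigEnvironmentVariableUnsetException when the variable is unset (or empty, unless allow_empty_string=True), and returns None instead of raising when required=False. A sketch under those assumptions:

import os
from typing import Optional


class McConfigEnvironmentVariableUnsetException(Exception):
    """Stub standing in for the module's own exception class used in the tests."""
    pass


def env_value(name: str, required: bool = True, allow_empty_string: bool = False) -> Optional[str]:
    """Return the environment variable's value; hypothetical sketch."""
    value = os.environ.get(name, None)

    # Treat an empty string like an unset variable unless explicitly allowed.
    if value == '' and not allow_empty_string:
        value = None

    if value is None:
        if required:
            raise McConfigEnvironmentVariableUnsetException("Environment variable '%s' is unset." % name)
        return None

    return value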
Example #11
def create_password_reset_token(db: DatabaseHandler,
                                email: str) -> Optional[str]:
    """Generate password reset token used for both activating newly registered users and resetting passwords.

    Returns non-hashed password reset token or None if user was not found.
    """

    email = decode_object_from_bytes_if_needed(email)

    if not email:
        raise McAuthProfileException('Email address is empty.')

    # Check if the email address exists in the user table; if not, pretend that we sent the activation link with a
    # "success" message. That way the adversary would not be able to find out which email addresses are active users.
    #
    # (Possible improvement: make the script work for the exact same amount of time in both cases to avoid timing
    # attacks)
    user_exists = db.query(
        """
        SELECT auth_users_id,
               email
        FROM auth_users
        WHERE email = %(email)s
        LIMIT 1
    """, {
            'email': email
        }).hash()
    if user_exists is None or len(user_exists) == 0:
        # User was not found, so set the email address to an empty string, but don't return just yet; continue with the
        # rather slow process of generating an activation token (in order to reduce the risk of timing attacks)
        email = ''

    # Generate the activation token
    password_reset_token = random_string(length=64)
    if len(password_reset_token) == 0:
        raise McAuthProfileException('Unable to generate an activation token.')

    # Hash + validate the activation token
    password_reset_token_hash = generate_secure_hash(
        password=password_reset_token)
    if not password_reset_token_hash:
        raise McAuthProfileException("Unable to hash an activation token.")

    # Set the activation token hash in the database (if the email address doesn't exist, this query will do nothing)
    db.query(
        """
        UPDATE auth_users
        SET password_reset_token_hash = %(password_reset_token_hash)s
        WHERE email = %(email)s
          AND email != ''
    """, {
            'email': email,
            'password_reset_token_hash': password_reset_token_hash,
        })

    return password_reset_token
Example #12
def test_file_with_env_value_base64():
    random_env_name = random_string(length=16)
    random_env_value = secrets.token_bytes(16)
    random_env_value_b64 = base64.b64encode(random_env_value).decode('utf-8')

    os.environ[random_env_name] = random_env_value_b64

    env_file = file_with_env_value(name=random_env_name, encoded_with_base64=True)
    assert os.path.exists(env_file)

    with open(env_file, mode='rb') as f:
        assert random_env_value == f.read()
Example #13
def create_password_reset_token(db: DatabaseHandler, email: str) -> Optional[str]:
    """Generate password reset token used for both activating newly registered users and resetting passwords.

    Returns non-hashed password reset token or None if user was not found.
    """

    email = decode_object_from_bytes_if_needed(email)

    if not email:
        raise McAuthProfileException('Email address is empty.')

    # Check if the email address exists in the user table; if not, pretend that we sent the activation link with a
    # "success" message. That way the adversary would not be able to find out which email addresses are active users.
    #
    # (Possible improvement: make the script work for the exact same amount of time in both cases to avoid timing
    # attacks)
    user_exists = db.query("""
        SELECT auth_users_id,
               email
        FROM auth_users
        WHERE email = %(email)s
        LIMIT 1
    """, {'email': email}).hash()
    if user_exists is None or len(user_exists) == 0:
        # User was not found, so set the email address to an empty string, but don't return just yet; continue with the
        # rather slow process of generating an activation token (in order to reduce the risk of timing attacks)
        email = ''

    # Generate the activation token
    password_reset_token = random_string(length=64)
    if len(password_reset_token) == 0:
        raise McAuthProfileException('Unable to generate an activation token.')

    # Hash + validate the activation token
    password_reset_token_hash = generate_secure_hash(password=password_reset_token)
    if not password_reset_token_hash:
        raise McAuthProfileException("Unable to hash an activation token.")

    # Set the activation token hash in the database (if the email address doesn't exist, this query will do nothing)
    db.query("""
        UPDATE auth_users
        SET password_reset_token_hash = %(password_reset_token_hash)s
        WHERE email = %(email)s
          AND email != ''
    """, {
        'email': email,
        'password_reset_token_hash': password_reset_token_hash,
    })

    return password_reset_token
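
A hedged usage sketch: only the non-hashed token leaves the system (e.g. inside an emailed link), while the database keeps just its generate_secure_hash() digest; the mailer helper named below is hypothetical:

db = connect_to_db()
token = create_password_reset_token(db=db, email='user@example.com')
if token:
    # Hypothetical helper; verification would later hash the submitted token
    # and compare it against auth_users.password_reset_token_hash.
    send_password_reset_email(email='user@example.com', token=token)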
Example #14
    def __init__(self, db: DatabaseHandler, snapshots_id: int):
        super().__init__()

        snapshots_id = int(snapshots_id)

        # Verify that topic exists
        if db.find_by_id(table='snapshots', object_id=snapshots_id) is None:
            raise McWord2vecException("Snapshot with ID %d does not exist." %
                                      snapshots_id)

        self.__snapshots_id = snapshots_id
        self.__sentence_counter = 0

        # Subselect such as:
        #
        #     SELECT sentence
        #     FROM story_sentences
        #     WHERE stories_id IN (
        #         SELECT stories_id
        #         FROM snap.snapshots
        #         WHERE snapshots_id = ...
        #     )
        #
        # or its variants (e.g. INNER JOIN) makes the query planner decide on a sequential scan on "story_sentences",
        # so we create a temporary table with snapshot's "stories_id" first.
        log.info("Creating a temporary table with snapshot's stories_id...")
        snapshots_stories_id_temp_table_name = 'snapshot_stories_ids_{}'.format(
            random_string(32))
        db.query(
            """
            CREATE TEMPORARY TABLE {} AS
                SELECT stories_id
                FROM snap.stories
                WHERE snapshots_id = %(snapshots_id)s
        """.format(snapshots_stories_id_temp_table_name),
            {'snapshots_id': snapshots_id})

        # "INNER JOIN" instead of "WHERE stories_id IN (SELECT ...)" here because then database doesn't have to compute
        # distinct "stories_id" to SELECT sentence FROM story_sentences against, i.e. it doesn't have to
        # Group + HashAggregate on the temporary table.
        log.info("Creating COPY TO object...")
        self.__copy_to = db.copy_to("""
            COPY (
                SELECT story_sentences.sentence
                FROM {} AS snapshot_stories_ids
                    INNER JOIN story_sentences
                        ON snapshot_stories_ids.stories_id = story_sentences.stories_id
            ) TO STDOUT WITH CSV
        """.format(snapshots_stories_id_temp_table_name))
Example #15
def test_random_string():
    with pytest.raises(McRandomStringException):
        random_string(0)
    with pytest.raises(McRandomStringException):
        random_string(-1)

    length = 16
    string_1 = random_string(length=length)
    string_2 = random_string(length=length)

    assert string_1 != string_2
    assert len(string_1) == length
    assert len(string_2) == length
    assert string_1.isalnum()
    assert string_2.isalnum()
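
The test fixes random_string()'s contract: it raises McRandomStringException for non-positive lengths and returns an alphanumeric string of exactly the requested length. A minimal sketch using the standard library's CSPRNG; the real implementation may differ:

import random
import string


class McRandomStringException(Exception):
    """Stub standing in for the module's own exception class used in the test."""
    pass


def random_string(length: int) -> str:
    """Return a random alphanumeric string of the given length; hypothetical sketch."""
    if length < 1:
        raise McRandomStringException("Length must be positive.")

    # SystemRandom draws from os.urandom(), which matters for values like the
    # password reset token in Example #13.
    rng = random.SystemRandom()
    chars = string.ascii_letters + string.digits
    return ''.join(rng.choice(chars) for _ in range(length))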
Example #16
def test_random_string():
    with pytest.raises(McRandomStringException):
        random_string(0)
    with pytest.raises(McRandomStringException):
        random_string(-1)

    length = 16
    string_1 = random_string(length=length)
    string_2 = random_string(length=length)

    assert string_1 != string_2
    assert len(string_1) == length
    assert len(string_2) == length
    assert string_1.isalnum()
    assert string_2.isalnum()
Example #17
def test_stories_checksum_matches_feed():
    db = connect_to_db()

    rand = random_string(length=8)

    medium = db.create(table='media',
                       insert_hash={
                           'name': f"test feed checksum {rand}",
                           'url': f"url://test/feed/checksum/{rand}",
                       })
    feed = db.create(table='feeds',
                     insert_hash={
                         'name': 'feed',
                         'url': medium['url'],
                         'media_id': medium['media_id'],
                     })
    feeds_id = feed['feeds_id']

    urls_a = [
        "http://www.bzf.ro/rezultate-liga-a-v-a-zona-fagaras-20.html",
        "http://www.mehrnews.com/detail/News/2027821",
        "http://www.chip.de/news/Parallels-Zwei-Android-Systeme-auf-einem-Handy_61383826.html",
        "http://www.inn.co.il/News/Flash.aspx/401095",
        ("http://www.moheet.com/2013/04/07/%d9%85%d8%ad%d8%b3%d9%88%d8%a8-%d8%a3%d8%ad%d8%af%d8%a7%d8%ab-%d8%a7%d9"
         "%84%d9%83%d8%a7%d8%aa%d8%af%d8%b1%d8.%a7%d8%a6%d9%8a%d8%a9-%d9%88%d8%a7%d8%ad%d8%af%d8%a9-%d9%85%d9%86-%d9"
         "%85%d9%88%d8%b1%d9%88/"),
        "http://twitter.com/radiationn/statuses/320948496549154816",
        "http://news.chinatimes.com/realtime/110105/112013040700840.html",
        "http://www.northkoreannews.net/index.php/sid/213669147/scat/08aysdf7tga9s7f7",
        "http://twitter.com/NastyaaPatrick/statuses/320956956149948417",
        "http://life.chinatimes.com/life/11051801/112013040800054.html",
        "http://www.enet.gr/?i=news.el.article&id=355553",
        ("http://www.ibtimes.co.uk/articles/454410/20130407/portugal-government-sticks-to-bailout-goals-despite-"
         "court-ruling.htm"),
        "http://www.egynews.net:80/wps/portal/news?params=223267",
        ("http://www.merkur-online.de:80/sport/fussball/hannover-trostlose-nullnummer-gegen-stuttgart-zr-"
         "2838522.html?cmp=defrss"),
        "http://www.farsnews.com/newstext.php?nn=13920118001322",
    ]

    urls_b = [
        "http://www.guardian.co.uk/football/blog/2013/apr/07/sunderland-chelsea-tactics-match",
        ("http://www.nicematin.com/monde/egypte-un-mort-dans-des-violences-apres-les-funerailles-de-coptes-tues."
         "1206791.html"),
        ("http://www.mercurynews.com/breaking-news/ci_22965002/immigration-talks-between-california-farm-groups-"
         "hit-impasse?source=rss_emailed"),
        "http://www.belfasttelegraph.co.uk/sport/racing/cut-too-sharp-for-gladness-rivals-29179755.html",
        "http://www.vz.ru/news/2013/4/7/627732.html",
        "http://www.thehindu.com/sport/ipl2013/fleming-unhappy-with-csk-batsmen/article4591746.ece",
        ("http://www.dallasnews.com/entertainment/music/headlines/20130407-academy-of-country-music-awards-7-p.m.-"
         "burleson-s-kelly-clarkson-set-to-perform.ece"),
        "http://feedproxy.google.com/~r/OTB/~3/TNKm_R0dEKo/",
        ("http://rss.feedsportal.com/c/266/f/3492/s/2a6f8876/l/0L0Sindependent0O0Cnews0Cworld0Cmiddle0Eeast0Cisraels"
         "0Enew0Estrategic0Eaffairs0Eminister0Ewest0Emust0Ethreaten0Eiran0Eover0Enuclear0Eplans0E85635150Bhtml/"
         "story01.htm"),
        "http://news.chinatimes.com/focus/11050105/112013040800090.html",
        "http://blogi.newsweek.pl/Tekst/naluzie/669783,marzenie-przyziemne.html#comment-168169",
        "http://jamaica-gleaner.com/gleaner/20130407/ent/ent6.html",
        "http://www.wboc.com/story/21901967/timeline-of-the-whereabouts-of-suspected-strangler",
        "http://www.cadenaser.com/internacional/articulo/feminismo-islamico-femen/csrcsrpor/20130407csrcsrint_6/Tes",
        "http://thehimalayantimes.com/rssReference.php?id=MzcyMDQw",
        ("http://au.ibtimes.com/articles/454410/20130408/portugal-government-sticks-to-bailout-goals-despite-court-"
         "ruling.htm"),
        "http://www.ziar.com/articol-din-ziar?id_syndic_article=5566035",
        "http://www.bellinghamherald.com/2013/04/07/2955579/hardwood-to-trading-floor-stocks.html#storylink=rss",
    ]

    stories_a = [{'url': url} for url in urls_a]
    stories_b = [{'url': url} for url in urls_b]

    # First check should fail since feed checksum should be empty
    assert stories_checksum_matches_feed(
        db=db, feeds_id=feeds_id,
        stories=stories_a) is False, "Empty checksum."

    # Next check with same stories should be a match
    assert stories_checksum_matches_feed(
        db=db, feeds_id=feeds_id, stories=stories_a) is True, "Match 1."

    # And another match
    assert stories_checksum_matches_feed(
        db=db, feeds_id=feeds_id, stories=stories_a) is True, "Match 2."

    # And now try with different set of stories
    assert stories_checksum_matches_feed(
        db=db, feeds_id=feeds_id, stories=stories_b) is False, "Fail 1."

    # And now with the same B stories
    assert stories_checksum_matches_feed(
        db=db, feeds_id=feeds_id, stories=stories_b) is True, "Match 3."

    # And now add one story
    stories_b.append({'url': 'http://foo.bar.com'})
    assert stories_checksum_matches_feed(
        db=db, feeds_id=feeds_id, stories=stories_b) is False, "Fail 2."
    assert stories_checksum_matches_feed(
        db=db, feeds_id=feeds_id, stories=stories_b) is True, "Match 4."

    # And now with no stories
    assert stories_checksum_matches_feed(db=db, feeds_id=feeds_id,
                                         stories=[]) is False, "Fail 3."

    # And now with B again
    assert stories_checksum_matches_feed(
        db=db, feeds_id=feeds_id, stories=stories_a) is False, "Fail 4."
    assert stories_checksum_matches_feed(
        db=db, feeds_id=feeds_id, stories=stories_a) is True, "Match 5."
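
The assertions suggest how stories_checksum_matches_feed() behaves: it derives a checksum from the batch's URLs, compares it with the one stored for the feed, stores the new checksum, and treats an empty batch as a non-match. A sketch under those assumptions; the "last_checksum" column name and the hashing scheme are guesses:

import hashlib


def stories_checksum_matches_feed(db, feeds_id: int, stories: list) -> bool:
    """Return True if the stories' URL checksum matches the feed's stored one.

    Hypothetical sketch, reconstructed from the test above.
    """
    # An empty batch produces an empty checksum, which never matches.
    urls = ''.join(story['url'] for story in stories)
    checksum = hashlib.md5(urls.encode('utf-8')).hexdigest() if stories else ''

    feed = db.require_by_id(table='feeds', object_id=feeds_id)
    matches = bool(checksum) and feed.get('last_checksum') == checksum

    db.update_by_id(table='feeds', object_id=feeds_id, update_hash={'last_checksum': checksum})

    return matches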
Example #18
class DatabaseHandler(object):
    """PostgreSQL middleware (imitates DBIx::Simple's interface)."""

    # Min. "deadlock_timeout" to not cause problems under load (in seconds)
    __MIN_DEADLOCK_TIMEOUT = 5

    # "Double percentage sign" marker (see handler's quote() for explanation)
    __DOUBLE_PERCENTAGE_SIGN_MARKER = "<DOUBLE PERCENTAGE SIGN: " + random_string(length=16) + ">"

    # Whether or not "deadlock_timeout" was checked
    # * lowercase because it's not a constant
    # * class variable because we don't need to do it on every connect_to_db()
    __deadlock_timeout_checked = False

    __slots__ = [

        # Cache of table primary key columns ([schema][table])
        '__primary_key_columns',

        # Whether or not to print PostgreSQL warnings
        '__print_warnings',

        # Debugging variable to test whether we're in a transaction
        '__in_manual_transaction',

        # psycopg2 connection and cursor
        '__conn',
        '__db',

    ]

    def __init__(self,
                 host: str,
                 port: int,
                 username: str,
                 password: str,
                 database: str):
        """Database handler constructor; connects to PostgreSQL too."""

        host = decode_object_from_bytes_if_needed(host)
        # noinspection PyTypeChecker
        port = int(decode_object_from_bytes_if_needed(port))
        username = decode_object_from_bytes_if_needed(username)
        password = decode_object_from_bytes_if_needed(password)
        database = decode_object_from_bytes_if_needed(database)

        self.__primary_key_columns = {}
        self.__print_warnings = True
        self.__in_manual_transaction = False
        self.__conn = None
        self.__db = None

        self.__connect(
            host=host,
            port=port,
            username=username,
            password=password,
            database=database,
        )

    def __connect(self,
                  host: str,
                  port: int,
                  username: str,
                  password: str,
                  database: str) -> None:
        """Connect to PostgreSQL."""

        host = decode_object_from_bytes_if_needed(host)
        # noinspection PyTypeChecker
        port = int(decode_object_from_bytes_if_needed(port))
        username = decode_object_from_bytes_if_needed(username)
        password = decode_object_from_bytes_if_needed(password)
        database = decode_object_from_bytes_if_needed(database)

        if not (host and username and password and database):
            raise McConnectException("Database connection credentials are not set.")

        if not port:
            port = 5432

        application_name = '%s %d' % (socket.gethostname(), os.getpid())

        self.__conn = psycopg2.connect(
            host=host,
            port=port,
            user=username,
            password=password,
            database=database,
            application_name=application_name
        )

        # Magic bits for psycopg2 to start supporting UTF-8
        psycopg2.extensions.register_type(psycopg2.extensions.UNICODE, self.__conn)
        psycopg2.extensions.register_type(psycopg2.extensions.UNICODEARRAY, self.__conn)
        self.__conn.set_client_encoding(psycopg2.extensions.encodings['UTF8'])

        # Don't automatically decode JSON, just like DBD::Pg doesn't
        # MC_REWRITE_TO_PYTHON: (probably) remove after porting
        psycopg2.extras.register_default_json(loads=lambda x: x)

        # psycopg2.extras.DictCursor factory enables server-side query prepares so all result data does not get fetched
        # at once
        cursor_factory = psycopg2.extras.DictCursor
        self.__db = self.__conn.cursor(cursor_factory=cursor_factory)

        # Queries to have immediate effect by default
        self.__conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)

        # Check deadlock_timeout
        if not DatabaseHandler.__deadlock_timeout_checked:
            (deadlock_timeout,) = self.query("SHOW deadlock_timeout").flat()
            deadlock_timeout = re.sub(r'\s*s$', '', deadlock_timeout, flags=re.I)
            deadlock_timeout = int(deadlock_timeout)
            if deadlock_timeout == 0:
                raise McConnectException("'deadlock_timeout' is 0, probably unable to read it")
            if deadlock_timeout < DatabaseHandler.__MIN_DEADLOCK_TIMEOUT:
                log.warning(
                    '"deadlock_timeout" is less than "{}", expect deadlocks on high extractor load.'.format(
                        DatabaseHandler.__MIN_DEADLOCK_TIMEOUT
                    )
                )

            DatabaseHandler.__deadlock_timeout_checked = True

    def disconnect(self) -> None:
        """Disconnect from the database."""
        self.__db.close()
        self.__db = None

        self.__conn.close()
        self.__conn = None

    # noinspection PyMethodMayBeStatic
    def dbh(self) -> None:
        raise McDatabaseHandlerException("Please don't use internal database handler directly")

    def query(self, *query_params) -> DatabaseResult:
        """Run the query, return instance of DatabaseResult for accessing the result.

        Accepts either (preferred) psycopg2-style query and parameters:

            # Dictionary parameters (preferred):
            db.query('SELECT * FROM foo WHERE bar = %(bar)s AND baz = %(baz)s', {'bar': bar, 'baz': baz})

            # Dictionary parameters with tuple:
            db.query('SELECT * FROM foo WHERE bar IN %(bar)s', {'bar': tuple(['a', 'b', 'c'])})

            # Tuple parameters:
            db.query('SELECT * FROM foo WHERE bar = %s AND baz = %s', (bar, baz,))

        ...or DBD::Pg (DBIx::Simple) form of query and parameters:

            db.query('SELECT * FROM foo WHERE bar = ? AND baz = ?', bar, baz)
        """

        # MC_REWRITE_TO_PYTHON: remove after porting queries to named parameter style
        query_params = convert_dbd_pg_arguments_to_psycopg2_format(*query_params)

        if len(query_params) == 0:
            raise McQueryException("Query is unset.")
        if len(query_params) > 2:
            raise McQueryException("psycopg2's execute() accepts at most 2 parameters.")

        return DatabaseResult(cursor=self.__db,
                              query_args=query_params,
                              double_percentage_sign_marker=DatabaseHandler.__DOUBLE_PERCENTAGE_SIGN_MARKER,
                              print_warnings=self.__print_warnings)

    def primary_key_column(self, object_name: str) -> str:
        """Get INT / BIGINT primary key column name for a table or a view.

        If the table has a composite primary key, return the first INT / BIGINT column name.
        """

        object_name = decode_object_from_bytes_if_needed(object_name)

        if '.' in object_name:
            schema_name, object_name = object_name.split('.', maxsplit=1)
        else:
            schema_name = 'public'

        if schema_name not in self.__primary_key_columns:
            self.__primary_key_columns[schema_name] = {}

        if object_name not in self.__primary_key_columns[schema_name]:

            # noinspection SpellCheckingInspection,SqlResolve
            columns = self.query("""
                SELECT
                    n.nspname AS schema_name,
                    c.relname AS object_name,
                    c.relkind AS object_type,
                    a.attname AS column_name,
                    i.indisprimary AS is_primary_index,
                    t.typname AS column_type,
                    t.typcategory AS column_type_category

                FROM pg_namespace AS n
                    INNER JOIN pg_class AS c
                        ON n.oid = c.relnamespace
                    INNER JOIN pg_attribute AS a
                        ON a.attrelid = c.oid
                        AND NOT a.attisdropped
                    INNER JOIN pg_type AS t
                      ON a.atttypid = t.oid

                    -- Object might be a view, so LEFT JOIN
                    LEFT JOIN pg_index AS i
                        ON c.oid = i.indrelid
                        AND a.attnum = ANY(i.indkey)

                WHERE

                  -- No xid, cid, ...
                  a.attnum > 0

                  -- Live column
                  AND NOT attisdropped

                  -- Numeric (INT or BIGINT)
                  AND t.typcategory = 'N'

                  AND n.nspname = %(schema_name)s
                  AND c.relname = %(object_name)s

                -- In case of a composite PK, select the first numeric column
                ORDER BY a.attnum
            """, {
                'schema_name': schema_name,
                'object_name': object_name,
            }).hashes()
            if not columns:
                raise McPrimaryKeyColumnException(
                    "Object '{}' in schema '{} was not found.".format(schema_name, object_name)
                )

            primary_key_column = None

            for column in columns:

                column_name = column['column_name']

                if column['object_type'] in ['r', 'p']:
                    # Table
                    if column['is_primary_index']:
                        primary_key_column = column_name
                        break

                elif column['object_type'] in ['v', 'm']:
                    # (Materialized) view
                    if column['column_name'] == 'id' or column['column_name'] == '{}_id'.format(object_name):
                        primary_key_column = column_name
                        break

            if not primary_key_column:
                raise McPrimaryKeyColumnException(
                    "Primary key for schema '%s', object '%s' was not found" % (schema_name, object_name,)
                )

            self.__primary_key_columns[schema_name][object_name] = primary_key_column

        return self.__primary_key_columns[schema_name][object_name]

    def find_by_id(self, table: str, object_id: int) -> Union[Dict[str, Any], None]:
        """Do an ID lookup on the table and return a single row match if found."""

        # MC_REWRITE_TO_PYTHON: some IDs get passed as 'str' / 'bytes'; remove after getting rid of Catalyst
        # noinspection PyTypeChecker
        object_id = decode_object_from_bytes_if_needed(object_id)
        object_id = int(object_id)

        table = decode_object_from_bytes_if_needed(table)

        primary_key_column = self.primary_key_column(table)
        if not primary_key_column:
            raise McFindByIDException("Primary key for table '%s' was not found" % table)

        # Python substitution
        find_by_id_query = "SELECT * FROM %(table)s WHERE %(id_column)s" % {
            "table": table,
            "id_column": primary_key_column,
        }

        # psycopg2 substitution
        result = self.query(find_by_id_query + " = %(id_value)s", {'id_value': object_id})
        if result.rows() > 1:
            raise McFindByIDException("More than one row was found for ID '%d' from table '%s'" % (object_id, table))
        elif result.rows() == 1:
            return result.hash()
        else:
            return None

    def require_by_id(self, table: str, object_id: int) -> Dict[str, Any]:
        """find_by_id() or raise exception if not found."""

        # MC_REWRITE_TO_PYTHON: some IDs get passed as 'str' / 'bytes'; remove after getting rid of Catalyst
        # noinspection PyTypeChecker
        object_id = decode_object_from_bytes_if_needed(object_id)
        object_id = int(object_id)

        table = decode_object_from_bytes_if_needed(table)

        row = self.find_by_id(table, object_id)
        if row is None:
            raise McRequireByIDException("Unable to find ID '%d' in table '%s'" % (object_id, table))
        return row

    def update_by_id(self, table: str, object_id: int, update_hash: dict) -> Union[Dict[str, Any], None]:
        """Update the row in the table with the given ID. Ignore any fields that start with '_'."""

        # MC_REWRITE_TO_PYTHON: some IDs get passed as 'str' / 'bytes'; remove after getting rid of Catalyst
        # noinspection PyTypeChecker
        object_id = decode_object_from_bytes_if_needed(object_id)
        object_id = int(object_id)

        table = decode_object_from_bytes_if_needed(table)
        update_hash = decode_object_from_bytes_if_needed(update_hash)

        update_hash = update_hash.copy()  # To be able to safely modify it

        # MC_REWRITE_TO_PYTHON: remove after getting rid of Catalyst
        if "submit" in update_hash:
            del update_hash["submit"]

        update_hash = {k: v for k, v in update_hash.items() if not k.startswith("_")}

        if len(update_hash) == 0:
            raise McUpdateByIDException("Hash to UPDATE is empty.")

        primary_key_column = self.primary_key_column(table)
        if not primary_key_column:
            raise McUpdateByIDException("Primary key for table '%s' was not found" % table)

        keys = []
        for key, value in update_hash.items():
            key_value = key

            # Cast Inline::Python's booleans to Python's booleans
            # MC_REWRITE_TO_PYTHON: remove after porting
            if type(value).__name__ == '_perl_obj':
                value = bool(value)
                update_hash[key] = value

            key_value += " = %(" + key + ")s"  # "%(key)s" to be resolved by psycopg2, not Python

            keys.append(key_value)

        update_hash['__object_id'] = object_id

        sql = "UPDATE %s " % table
        sql += "SET %s " % ", ".join(keys)
        sql += "WHERE %s = " % primary_key_column
        sql += "%(__object_id)s"  # "%(__object_id)s" to be resolved by psycopg2, not Python

        self.query(sql, update_hash)

        updated_row = self.find_by_id(table=table, object_id=object_id)

        return updated_row

    def delete_by_id(self, table: str, object_id: int) -> None:
        """Delete the row in the table with the given ID."""

        # MC_REWRITE_TO_PYTHON: some IDs get passed as 'str' / 'bytes'; remove after getting rid of Catalyst
        # noinspection PyTypeChecker
        object_id = decode_object_from_bytes_if_needed(object_id)
        object_id = int(object_id)

        table = decode_object_from_bytes_if_needed(table)

        primary_key_column = self.primary_key_column(table)
        if not primary_key_column:
            raise McDeleteByIDException("Primary key for table '%s' was not found" % table)

        # noinspection SqlWithoutWhere
        sql = "DELETE FROM %s " % table
        sql += "WHERE %s = " % primary_key_column
        sql += "%(__object_id)s"  # "%(object_id)s" to be resolved by psycopg2, not Python

        self.query(sql, {"__object_id": object_id})

    def insert(self, table: str, insert_hash: dict) -> Dict[str, Any]:
        """Alias for create()."""
        table = decode_object_from_bytes_if_needed(table)
        insert_hash = decode_object_from_bytes_if_needed(insert_hash)

        return self.create(table=table, insert_hash=insert_hash)

    def create(self, table: str, insert_hash: dict) -> Dict[str, Any]:
        """Insert a row into the database for the given table with the given hash values and return the created row."""

        table = decode_object_from_bytes_if_needed(table)
        insert_hash = decode_object_from_bytes_if_needed(insert_hash)

        insert_hash = insert_hash.copy()  # To be able to safely modify it

        # MC_REWRITE_TO_PYTHON: remove after getting rid of Catalyst
        if "submit" in insert_hash:
            del insert_hash["submit"]

        if len(insert_hash) == 0:
            raise McCreateException("Hash to INSERT is empty")

        primary_key_column = self.primary_key_column(table)
        if not primary_key_column:
            raise McCreateException("Primary key for table '%s' was not found" % table)

        keys = []
        values = []
        for key, value in insert_hash.items():
            keys.append(key)
            values.append("%(" + key + ")s")  # "%(key)s" to be resolved by psycopg2, not Python

            # Cast Inline::Python's booleans to Python's booleans
            # MC_REWRITE_TO_PYTHON: remove after porting
            if type(value).__name__ == '_perl_obj':
                value = bool(value)
                insert_hash[key] = value

        sql = "INSERT INTO %s " % table
        sql += "(%s) " % ", ".join(keys)
        sql += "VALUES (%s) " % ", ".join(values)
        sql += "RETURNING %s" % primary_key_column

        try:
            last_inserted_id = self.query(sql, insert_hash).flat()
        except Exception as ex:
            if 'duplicate key value violates unique constraint' in str(ex):
                raise McUniqueConstraintException("Unable to INSERT into '%(table)s' data '%(data)s': %(exception)s" % {
                    'table': table,
                    'data': str(insert_hash),
                    'exception': str(ex),
                })
            else:
                raise ex

        if last_inserted_id is None or len(last_inserted_id) == 0:
            raise McCreateException("Last inserted ID was not found")
        last_inserted_id = last_inserted_id[0]

        inserted_row = self.find_by_id(table=table, object_id=last_inserted_id)
        if inserted_row is None:
            raise McCreateException("Could not find new ID %d in table '%s'" % (last_inserted_id, table))

        return inserted_row

    def select(self, table: str, what_to_select: str, condition_hash: dict = None) -> DatabaseResult:
        """SELECT chosen columns from the table that match given conditions."""

        table = decode_object_from_bytes_if_needed(table)
        what_to_select = decode_object_from_bytes_if_needed(what_to_select)
        condition_hash = decode_object_from_bytes_if_needed(condition_hash)

        if condition_hash is None:
            condition_hash = {}

        condition_hash = condition_hash.copy()  # To be able to safely modify it

        # MC_REWRITE_TO_PYTHON: remove after getting rid of Catalyst
        if "submit" in condition_hash:
            del condition_hash["submit"]

        sql_conditions = []

        for key, value in condition_hash.items():
            condition = key
            condition += " = %(" + key + ")s"  # "%(key)s" to be resolved by psycopg2, not Python
            sql_conditions.append(condition)

            # Cast Inline::Python's booleans to Python's booleans
            # MC_REWRITE_TO_PYTHON: remove after porting
            if type(value).__name__ == '_perl_obj':
                value = bool(value)
                condition_hash[key] = value

        sql = "SELECT %s " % what_to_select
        sql += "FROM %s " % table
        if len(sql_conditions) > 0:
            sql += "WHERE %s" % " AND ".join(sql_conditions)

        return self.query(sql, condition_hash)

    def find_or_create(self, table: str, insert_hash: dict) -> Dict[str, Any]:
        """Select a single row from the database matching the hash or insert a row with the hash values and return the
        inserted row as a hash."""

        # FIXME probably do this in a serialized transaction?

        table = decode_object_from_bytes_if_needed(table)
        insert_hash = decode_object_from_bytes_if_needed(insert_hash)

        insert_hash = insert_hash.copy()  # To be able to safely modify it

        if len(insert_hash) == 0:
            raise McFindOrCreateException("Hash to INSERT or SELECT is empty")

        # MC_REWRITE_TO_PYTHON: remove after getting rid of Catalyst
        if "submit" in insert_hash:
            del insert_hash["submit"]

        row = self.select(table=table, what_to_select='*', condition_hash=insert_hash)
        if row is not None and row.rows() > 0:
            return row.hash()
        else:
            # try to create it, but if some other process has created it because we don't have a lock, just use that one
            try:
                return self.create(table=table, insert_hash=insert_hash)
            except McUniqueConstraintException:
                return self.select(table=table, what_to_select='*', condition_hash=insert_hash).hash()

    # noinspection PyMethodMayBeStatic
    def show_error_statement(self) -> bool:
        """Return whether failed SQL statement will be included into thrown exception."""
        # FIXME I suppose psycopg2 always returns failed statement?
        # MC_REWRITE_TO_PYTHON remove after porting
        return True

    # noinspection PyMethodMayBeStatic
    def set_show_error_statement(self, show_error_statement: bool) -> None:
        """Set whether failed SQL statement will be included into thrown exception."""
        # FIXME I suppose psycopg2 always returns failed statement?
        # MC_REWRITE_TO_PYTHON remove after porting
        pass

    def print_warn(self) -> bool:
        """Return whether PostgreSQL warnings will be printed."""
        return self.__print_warnings

    def set_print_warn(self, print_warn: bool) -> None:
        """Set whether PostgreSQL warnings will be printed."""
        self.__print_warnings = print_warn

    def in_transaction(self) -> bool:
        """Return True if we're within a manually started transaction."""
        return self.__in_manual_transaction

    def __set_in_transaction(self, in_transaction: bool) -> None:
        if self.__in_manual_transaction == in_transaction:
            log.warning("Setting self.__in_manual_transaction to the same value (%s)" % str(in_transaction))
        self.__in_manual_transaction = in_transaction

    def begin(self) -> None:
        """Begin a transaction."""
        if self.in_transaction():
            raise McBeginException("Already in transaction, can't BEGIN.")

        self.query('BEGIN')
        self.__set_in_transaction(True)

    def begin_work(self) -> None:
        """Begin a transaction."""
        return self.begin()

    def commit(self) -> None:
        """Commit a transaction."""
        if not self.in_transaction():
            log.debug("Not in transaction, nothing to COMMIT.")
        else:
            self.query('COMMIT')
            self.__set_in_transaction(False)

    def rollback(self) -> None:
        """Rollback a transaction."""
        if not self.in_transaction():
            log.warning("Not in transaction, nothing to ROLLBACK.")
        else:
            self.query('ROLLBACK')
            self.__set_in_transaction(False)

    # noinspection PyMethodMayBeStatic
    def quote(self, value: Union[bool, int, float, str, None]) -> str:
        """Quote a string for being passed as a literal in a query.

        Also, replace all cases of a percentage sign ('%') with a random string shared within database handler's
        instance which will be later replaced back into double percentage sign ('%%') when executing the query."""

        value = decode_object_from_bytes_if_needed(value)

        quoted_obj = None
        try:
            # Docs say that: "While the original adapt() takes 3 arguments, psycopg2's one only takes 1: the bound
            # variable to be adapted", so:
            #
            # noinspection PyArgumentList
            quoted_obj = psycopg2_adapt(value)

            if hasattr(quoted_obj, 'encoding'):  # integer adaptors don't support encoding for example
                # Otherwise string gets treated as Latin-1:
                quoted_obj.encoding = psycopg2.extensions.encodings['UTF8']

        except Exception as ex:
            raise McQuoteException("psycopg2_adapt() failed while quoting '%s': %s" % (value, str(ex)))
        if quoted_obj is None:
            raise McQuoteException("psycopg2_adapt() returned None while quoting '%s'" % value)

        try:
            quoted_value = quoted_obj.getquoted()
        except Exception as ex:
            raise McQuoteException("getquoted() failed while quoting '%s': %s" % (quoted_obj, str(ex)))
        if quoted_value is None:
            raise McQuoteException("getquoted() returned None while quoting '%s'" % quoted_obj)

        if isinstance(quoted_value, bytes):
            quoted_value = quoted_value.decode(encoding='utf-8', errors='replace')

        if not isinstance(quoted_value, str):
            # Maybe overly paranoid, but better than returning random stuff for a string that will go into the database
            raise McQuoteException("Quoted value is not 'str' after quoting '%s'" % quoted_obj)

        # Replace percentage signs with a randomly generated marker that will be replaced back into '%%' when executing
        # the query.
        quoted_value = quoted_value.replace('%', DatabaseHandler.__DOUBLE_PERCENTAGE_SIGN_MARKER)

        return quoted_value
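
    # Illustrative use of quote() (an assumption, not from this page): build a
    # literal that can be interpolated into SQL Python-side:
    #
    #     literal = db.quote("100% O'Reilly")
    #     db.query("INSERT INTO notes (body) VALUES (" + literal + ")")
    #
    # The marker embedded in "literal" is swapped back to '%%' just before the
    # statement is executed, so the percentage sign survives psycopg2's own
    # parameter expansion.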

    def quote_bool(self, value: bool) -> str:
        """Quote a boolean value for being passed as a literal in a query."""
        # MC_REWRITE_TO_PYTHON: remove after starting to use Python's boolean type everywhere

        if isinstance(value, bool):
            pass
        elif isinstance(value, int):
            if value == 0:
                value = False
            elif value == 1:
                value = True
            else:
                raise McQuoteException("Value '%s' is neither 0 nor 1" % str(value))
        elif isinstance(value, str) or isinstance(value, bytes):
            value = decode_object_from_bytes_if_needed(value)
            if value.lower() in ['t', 'true', 'y', 'yes', 'on', '1']:
                value = True
            elif value.lower() in ['f', 'false', 'n', 'no', 'off', '0']:
                value = False
            else:
                raise McQuoteException("Value '%s' is string but neither of supported values" % str(value))
        else:
            raise McQuoteException("Value '%s' is unsupported" % str(value))

        return self.quote(value=value)

    def quote_varchar(self, value: str) -> str:
        """Quote VARCHAR for being passed as a literal in a query."""
        # MC_REWRITE_TO_PYTHON: remove after starting to use Python's boolean type everywhere
        value = decode_object_from_bytes_if_needed(value)

        return self.quote(value=value)

    def quote_date(self, value: str) -> str:
        """Quote DATE for being passed as a literal in a query."""
        value = decode_object_from_bytes_if_needed(value)

        return '%s::date' % self.quote(value=value)

    def quote_timestamp(self, value: str) -> str:
        """Quote TIMESTAMP for being passed as a literal in a query."""
        value = decode_object_from_bytes_if_needed(value)

        return '%s::timestamp' % self.quote(value=value)

    def copy_from(self, sql: str) -> CopyFrom:
        """Return COPY FROM helper object."""
        sql = decode_object_from_bytes_if_needed(sql)

        return CopyFrom(cursor=self.__db, sql=sql)

    def copy_to(self, sql: str) -> CopyTo:
        """Return COPY TO helper object."""
        sql = decode_object_from_bytes_if_needed(sql)

        return CopyTo(cursor=self.__db, sql=sql)

    def get_temporary_ids_table(self, ids: List[int], ordered: bool = False) -> str:
        """Get the name of a temporary table that contains all of the IDs in "ids" as an "id BIGINT" field.

        The database connection must be within a transaction. The temporary table is set up to be dropped at the end of
        the current transaction. If "ordered" is True, include a "<...>_pkey SERIAL PRIMARY KEY" field in the table."""

        table_name = '_tmp_ids_%s' % random_string(length=16)

        log.debug("Temporary IDs table: %s" % table_name)

        primary_key_clause = ""
        if ordered:
            primary_key_clause = "%s_pkey SERIAL PRIMARY KEY," % table_name

        sql = """CREATE TEMPORARY TABLE %s (""" % table_name
        sql += primary_key_clause
        sql += "id BIGINT)"
        self.query(sql)

        copy = self.copy_from("COPY %s (id) FROM STDIN" % table_name)
        for single_id in ids:
            copy.put_line("%d\n" % int(single_id))
        copy.end()

        self.query("ANALYZE %s" % table_name)

        return table_name

    def attach_child_query(self,
                           data: List[Dict[str, Any]],
                           child_query: str,
                           child_field: str,
                           id_column: str,
                           single: bool = False) -> List[Dict[str, Any]]:
        """For each row in "data", attach all results in the child query that match a JOIN with the "id_column" field in
        each row of "data".

        Then, attach to "row[child_field]":

        * If "single" is True, the "child_field" column in the corresponding row in "data";

        * If "single" is False, a list of values for each row in "data".

        For an example on how this works, see test_attach_child_query() in test_handler.py."""

        # FIXME get rid of this hard to understand reimplementation of JOIN which is here due to the sole reason that
        # _add_nested_data() is hard to refactor out and no one bothered to do it.
        # HMR: the point of this thing is to be able to add nested data in only a single query, which vastly increases
        # performance over performing one query per row for the nested data

        data = decode_object_from_bytes_if_needed(data)
        if not isinstance(data, list):
            raise McDecodeObjectFromBytesIfNeededException(
                "'data' is not a list anymore after converting: %s" % str(data)
            )
        data = list(data)  # get rid of return type warning by enforcing that 'data' is still a list
        child_query = decode_object_from_bytes_if_needed(child_query)
        child_field = decode_object_from_bytes_if_needed(child_field)
        id_column = decode_object_from_bytes_if_needed(id_column)

        parent_lookup = {}
        ids = []
        for parent in data:
            parent_id = parent[id_column]

            parent_lookup[parent_id] = parent
            ids.append(parent_id)

        ids_table = self.get_temporary_ids_table(ids=ids)
        sql = """
            -- noinspection SqlResolve
            SELECT q.*
            FROM ( %(child_query)s ) AS q
                -- Limit rows returned by "child_query" to only IDs from "ids"
                INNER JOIN %(ids_table)s AS ids
                    ON q.%(id_column)s = ids.id
        """ % {
            'child_query': child_query,
            'ids_table': ids_table,
            'id_column': id_column,
        }
        children = self.query(sql).hashes()

        # if we're appending lists, make sure each parent row has an empty list
        if not single:
            for parent in data:
                if child_field not in parent:
                    parent[child_field] = []

        for child in children:
            child_id = child[id_column]
            parent = parent_lookup[child_id]

            if single:
                parent[child_field] = child[child_field]
            else:
                parent[child_field].append(child)

        return data
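
A hedged usage sketch of attach_child_query(): attach every story's tags in a single query instead of issuing one query per row (the "stories", "stories_tags_map" and "tags" tables are illustrative, not from this page):

stories = db.query("SELECT stories_id, title FROM stories LIMIT 10").hashes()

stories = db.attach_child_query(
    data=stories,
    child_query="""
        SELECT stories_tags_map.stories_id,
               tags.tag
        FROM stories_tags_map
            INNER JOIN tags
                ON stories_tags_map.tags_id = tags.tags_id
    """,
    child_field='tags',
    id_column='stories_id',
    single=False,
)

# Each row in "stories" now carries a "tags" list with its matching child rows.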
Example #19
    def directory_name():
        return '{}-{}'.format(AmazonS3DownloadsConfig.directory_name(), random_string(64))
Example #20
class DatabaseHandler(object):
    """PostgreSQL middleware (imitates DBIx::Simple's interface)."""

    # Environment variable which, when set, will make us ignore the schema version
    __IGNORE_SCHEMA_VERSION_ENV_VARIABLE = 'MEDIACLOUD_IGNORE_DB_SCHEMA_VERSION'

    # Min. "deadlock_timeout" to not cause problems under load (in seconds)
    __MIN_DEADLOCK_TIMEOUT = 5

    # cache of table primary key columns
    __primary_key_columns = {}

    # PIDs for which the schema version has been checked
    __schema_version_check_pids = {}

    # Whether or not to print PostgreSQL warnings
    __print_warnings = True

    # "Double percentage sign" marker (see handler's quote() for explanation)
    __double_percentage_sign_marker = "<DOUBLE PERCENTAGE SIGN: " + random_string(
        length=16) + ">"

    # Debugging variable to test whether we're in a transaction
    __in_manual_transaction = False

    # psycopg2 connection and cursor
    __conn = None
    __db = None

    def __init__(self,
                 host: str,
                 port: int,
                 username: str,
                 password: str,
                 database: str,
                 do_not_check_schema_version: bool = False):
        """Database handler constructor; connects to PostgreSQL too."""

        host = decode_object_from_bytes_if_needed(host)
        # noinspection PyTypeChecker
        port = int(decode_object_from_bytes_if_needed(port))
        username = decode_object_from_bytes_if_needed(username)
        password = decode_object_from_bytes_if_needed(password)
        database = decode_object_from_bytes_if_needed(database)

        self.__connect(host=host,
                       port=port,
                       username=username,
                       password=password,
                       database=database,
                       do_not_check_schema_version=do_not_check_schema_version)

    def __connect(self,
                  host: str,
                  port: int,
                  username: str,
                  password: str,
                  database: str,
                  do_not_check_schema_version: bool = False) -> None:
        """Connect to PostgreSQL."""

        host = decode_object_from_bytes_if_needed(host)
        # noinspection PyTypeChecker
        port = int(decode_object_from_bytes_if_needed(port))
        username = decode_object_from_bytes_if_needed(username)
        password = decode_object_from_bytes_if_needed(password)
        database = decode_object_from_bytes_if_needed(database)

        # If the user didn't clearly (via 'true' or 'false') state whether or not
        # to check schema version, check it once per PID
        pid = os.getpid()

        if not (host and username and password and database):
            raise McConnectException(
                "Database connection credentials are not set.")

        if not port:
            port = 5432

        if not do_not_check_schema_version:
            if pid in self.__schema_version_check_pids:
                do_not_check_schema_version = True
            else:
                do_not_check_schema_version = False

        self.__conn = psycopg2.connect(host=host,
                                       port=port,
                                       user=username,
                                       password=password,
                                       database=database)

        # Magic bits for psycopg2 to start supporting UTF-8
        psycopg2.extensions.register_type(psycopg2.extensions.UNICODE,
                                          self.__conn)
        psycopg2.extensions.register_type(psycopg2.extensions.UNICODEARRAY,
                                          self.__conn)
        self.__conn.set_client_encoding(psycopg2.extensions.encodings['UTF8'])

        # Don't automatically decode JSON, just like DBD::Pg doesn't
        # MC_REWRITE_TO_PYTHON: (probably) remove after porting
        psycopg2.extras.register_default_json(loads=lambda x: x)

        # psycopg2.extras.DictCursor factory enables server-side query prepares so all result data does not get fetched
        # at once
        cursor_factory = psycopg2.extras.DictCursor
        self.__db = self.__conn.cursor(cursor_factory=cursor_factory)

        # Make queries take effect immediately by default (autocommit)
        self.__conn.set_isolation_level(
            psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)

        if not do_not_check_schema_version:
            if not self.schema_is_up_to_date():
                # It would make sense to check the MEDIACLOUD_IGNORE_DB_SCHEMA_VERSION environment variable
                # at this particular point too, but schema_is_up_to_date() warns the user about schema being
                # too old on every run, and that's supposedly a good thing.
                raise McConnectException("Database schema is not up-to-date.")

        # If the schema had not been up-to-date, we would have raised above and never recorded the PID here
        self.__schema_version_check_pids[pid] = True

        # Check deadlock_timeout
        (deadlock_timeout, ) = self.query("SHOW deadlock_timeout").flat()
        deadlock_timeout = re.sub(r'\s*s$', '', deadlock_timeout, flags=re.I)
        deadlock_timeout = int(deadlock_timeout)
        if deadlock_timeout == 0:
            raise McConnectException(
                "'deadlock_timeout' is 0, probably unable to read it")
        if deadlock_timeout < self.__MIN_DEADLOCK_TIMEOUT:
            l.warning(
                '"deadlock_timeout" is less than "%ds", expect deadlocks on high extractor load'
                % self.__MIN_DEADLOCK_TIMEOUT)
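
    # A minimal usage sketch (hypothetical credentials; not part of the
    # original class): construct the handler and run a trivial query:
    #
    #     db = DatabaseHandler(host='localhost', port=5432, username='mediacloud',
    #                          password='secret', database='mediacloud')
    #     (one,) = db.query("SELECT 1").flat()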

    def disconnect(self) -> None:
        """Disconnect from the database."""
        self.__db.close()
        self.__db = None

        self.__conn.close()
        self.__conn = None

    # noinspection PyMethodMayBeStatic
    def dbh(self) -> None:
        raise McDatabaseHandlerException(
            "Please don't use internal database handler directly")

    def __should_continue_with_outdated_schema(
            self, current_schema_version: int,
            target_schema_version: int) -> bool:
        """Schema is outdated / too new; returns 1 if MC should continue nevertheless, 0 otherwise"""
        config = py_get_config()

        config_ignore_schema_version = False
        if 'ignore_schema_version' in config['mediawords']:
            config_ignore_schema_version = config["mediawords"][
                "ignore_schema_version"]

        if config_ignore_schema_version and self.__IGNORE_SCHEMA_VERSION_ENV_VARIABLE in os.environ:
            l.warning("""
                The current Media Cloud database schema is older than the schema present in mediawords.sql,
                but %s is set so continuing anyway.
            """ % self.__IGNORE_SCHEMA_VERSION_ENV_VARIABLE)
            return True
        else:
            l.warning(
                """
                ################################

                The current Media Cloud database schema is not the same as the schema present in mediawords.sql.

                The database schema currently running in the database is %(current_schema_version)s,
                and the schema version in the mediawords.sql is %(target_schema_version)s.

                Please run:

                    ./script/mediawords_upgrade_db.py --import

                to automatically upgrade the database schema to the latest version.

                If you want to connect to the Media Cloud database anyway (ignoring the schema version),
                set the %(IGNORE_SCHEMA_VERSION_ENV_VARIABLE)s environment variable as such:

                    %(IGNORE_SCHEMA_VERSION_ENV_VARIABLE)s=1 ./script/your_script.py

                ################################

            """ % {
                    "current_schema_version":
                    current_schema_version,
                    "target_schema_version":
                    target_schema_version,
                    "IGNORE_SCHEMA_VERSION_ENV_VARIABLE":
                    self.__IGNORE_SCHEMA_VERSION_ENV_VARIABLE,
                })
            return False

    def schema_is_up_to_date(self) -> bool:
        """Checks if the database schema is up-to-date"""
        root_dir = mc_root_path()

        # Check if the database is empty
        db_vars_table_exists = len(
            self.query("""
            -- noinspection SqlResolve
            SELECT *
            FROM information_schema.tables
            WHERE table_name = 'database_variables'
        """).flat()) > 0
        if not db_vars_table_exists:
            l.info(
                "Database table 'database_variables' does not exist, probably the database is empty at this point."
            )
            return True

        # Current schema version
        (current_schema_version, ) = self.query("""
            SELECT value AS schema_version
            FROM database_variables
            WHERE name = 'database-schema-version'
            LIMIT 1
        """).flat()
        current_schema_version = int(current_schema_version)
        if current_schema_version == 0:
            raise McSchemaIsUpToDateException("Current schema version is 0")

        # Target schema version
        with open(os.path.join(root_dir, 'schema', 'mediawords.sql'), 'r') as f:
            sql = f.read()
        target_schema_version = schema_version_from_lines(sql)
        if not target_schema_version:
            raise McSchemaIsUpToDateException("Invalid target schema version.")

        # Check if the current schema is up-to-date
        if current_schema_version != target_schema_version:
            return self.__should_continue_with_outdated_schema(
                current_schema_version, target_schema_version)
        else:
            # Things are fine at this point.
            return True

    def query(self, *query_params) -> DatabaseResult:
        """Run the query, return instance of DatabaseResult for accessing the result.

        Accepts either (preferred) psycopg2-style query and parameters:

            # Dictionary parameters (preferred):
            db.query('SELECT * FROM foo WHERE bar = %(bar)s AND baz = %(baz)s', {'bar': bar, 'baz': baz})

            # Dictionary parameters with tuple:
            db.query('SELECT * FROM foo WHERE bar IN %(bar)s', {'bar': tuple(['a', 'b', 'c'])})

            # Tuple parameters:
            db.query('SELECT * FROM foo WHERE bar = %s AND baz = %s', (bar, baz,))

        ...or DBD::Pg (DBIx::Simple) form of query and parameters:

            db.query('SELECT * FROM foo WHERE bar = ? AND baz = ?', bar, baz)
        """

        # MC_REWRITE_TO_PYTHON: remove after porting queries to named parameter style
        query_params = convert_dbd_pg_arguments_to_psycopg2_format(
            *query_params)

        if len(query_params) == 0:
            raise McQueryException("Query is unset.")
        if len(query_params) > 2:
            raise McQueryException(
                "psycopg2's execute() accepts at most 2 parameters.")

        return DatabaseResult(
            cursor=self.__db,
            query_args=query_params,
            double_percentage_sign_marker=self.__double_percentage_sign_marker,
            print_warnings=self.__print_warnings)

    def prepare(self, sql: str) -> DatabaseStatement:
        """Return a prepared statement."""
        # MC_REWRITE_TO_PYTHON get rid of it because it was useful only for writing BYTEA cells; psycopg2 can just
        # use 'bytes' arguments

        sql = decode_object_from_bytes_if_needed(sql)

        return DatabaseStatement(
            cursor=self.__db,
            sql=sql,
            double_percentage_sign_marker=self.__double_percentage_sign_marker)

    def __get_current_work_mem(self) -> str:
        current_work_mem = self.query("SHOW work_mem").flat()[0]
        return current_work_mem

    def __get_large_work_mem(self) -> str:
        config = py_get_config()
        if 'large_work_mem' in config['mediawords']:
            work_mem = config['mediawords']['large_work_mem']
        else:
            work_mem = self.__get_current_work_mem()
        return work_mem

    def __set_work_mem(self, new_work_mem: str) -> None:
        new_work_mem = decode_object_from_bytes_if_needed(new_work_mem)
        self.query("SET work_mem TO %s", (new_work_mem, ))

    def execute_with_large_work_mem(self, *query_args) -> None:
        """Execute query with large 'work_mem' setting; does *not* return a result of any kind."""
        def __execute_with_large_work_mem_subquery():
            self.query(*query_args)

        exception = None
        try:
            self.run_block_with_large_work_mem(
                __execute_with_large_work_mem_subquery)
        except Exception as ex:
            l.error("Error while running query with large work memory: %s" %
                    str(ex))
            exception = ex

        if exception is not None:
            raise exception  # pass further

    def run_block_with_large_work_mem(self, block: Callable[[], None]) -> None:
        """Run a block (function) with a large 'work_mem' setting set; does *not* return a result of any kind."""
        l.debug("starting run_block_with_large_work_mem")

        large_work_mem = self.__get_large_work_mem()
        old_work_mem = self.__get_current_work_mem()

        if large_work_mem is not None:
            self.__set_work_mem(large_work_mem)
        else:
            l.warning("Large work memory is unset, using default 'work_mem'")

        exception = None
        try:
            block()
        except Exception as ex:
            l.error("Error while running block with large work memory: %s" %
                    str(ex))
            exception = ex

        self.__set_work_mem(old_work_mem)

        l.debug("exiting run_block_with_large_work_mem")

        if exception is not None:
            raise exception  # pass further
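
    # A minimal usage sketch (hypothetical query; assumes a connected handler
    # instance "db"): any zero-argument callable can be run under the larger
    # 'work_mem' setting:
    #
    #     def _create_title_index():
    #         db.query("CREATE INDEX stories_title ON stories (title)")
    #
    #     db.run_block_with_large_work_mem(_create_title_index)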

    def primary_key_column(self, table: str) -> str:
        """Get the primary key column for the table."""

        table = decode_object_from_bytes_if_needed(table)

        if table not in self.__primary_key_columns:
            # noinspection SqlResolve,SqlCheckUsingColumns
            primary_key_column = self.query(
                """
                SELECT column_name
                FROM information_schema.table_constraints
                     JOIN information_schema.key_column_usage
                         USING (constraint_catalog, constraint_schema, constraint_name,
                                table_catalog, table_schema, table_name)
                WHERE constraint_type = 'PRIMARY KEY'
                  AND table_name = %(table_name)s
                ORDER BY ordinal_position
            """, {
                    'table_name': table
                }).flat()
            if primary_key_column is None or len(primary_key_column) == 0:
                raise McPrimaryKeyColumnException(
                    "Primary key for table '%s' was not found" % table)
            if len(primary_key_column) > 1:
                raise McPrimaryKeyColumnException(
                    "More than one primary key column was found for table '%(table)s': %(primary_key_columns)s"
                    % {
                        'table': table,
                        'primary_key_columns': str(primary_key_column)
                    })
            primary_key_column = primary_key_column[0]

            self.__primary_key_columns[table] = primary_key_column

        return self.__primary_key_columns[table]

    def find_by_id(self, table: str,
                   object_id: int) -> Union[Dict[str, Any], None]:
        """Do an ID lookup on the table and return a single row match if found."""

        # MC_REWRITE_TO_PYTHON: some IDs get passed as 'str' / 'bytes'; remove after getting rid of Catalyst
        # noinspection PyTypeChecker
        object_id = decode_object_from_bytes_if_needed(object_id)
        object_id = int(object_id)

        table = decode_object_from_bytes_if_needed(table)

        primary_key_column = self.primary_key_column(table)
        if not primary_key_column:
            raise McFindByIDException(
                "Primary key for table '%s' was not found" % table)

        # Python substitution
        find_by_id_query = "SELECT * FROM %(table)s WHERE %(id_column)s" % {
            "table": table,
            "id_column": primary_key_column,
        }

        # psycopg2 substitution
        result = self.query(find_by_id_query + " = %(id_value)s",
                            {'id_value': object_id})
        if result.rows() > 1:
            raise McFindByIDException(
                "More than one row was found for ID '%d' from table '%s'" %
                (object_id, table))
        elif result.rows() == 1:
            return result.hash()
        else:
            return None

    def require_by_id(self, table: str, object_id: int) -> Dict[str, Any]:
        """find_by_id() or raise exception if not found."""

        # MC_REWRITE_TO_PYTHON: some IDs get passed as 'str' / 'bytes'; remove after getting rid of Catalyst
        # noinspection PyTypeChecker
        object_id = decode_object_from_bytes_if_needed(object_id)
        object_id = int(object_id)

        table = decode_object_from_bytes_if_needed(table)

        row = self.find_by_id(table, object_id)
        if row is None:
            raise McRequireByIDException(
                "Unable to find ID '%d' in table '%s'" % (object_id, table))
        return row

    def update_by_id(self, table: str, object_id: int,
                     update_hash: dict) -> Union[Dict[str, Any], None]:
        """Update the row in the table with the given ID. Ignore any fields that start with '_'."""

        # MC_REWRITE_TO_PYTHON: some IDs get passed as 'str' / 'bytes'; remove after getting rid of Catalyst
        # noinspection PyTypeChecker
        object_id = decode_object_from_bytes_if_needed(object_id)
        object_id = int(object_id)

        table = decode_object_from_bytes_if_needed(table)
        update_hash = decode_object_from_bytes_if_needed(update_hash)

        update_hash = update_hash.copy()  # To be able to safely modify it

        # MC_REWRITE_TO_PYTHON: remove after getting rid of Catalyst
        if "submit" in update_hash:
            del update_hash["submit"]

        update_hash = {
            k: v
            for k, v in update_hash.items() if not k.startswith("_")
        }

        if len(update_hash) == 0:
            raise McUpdateByIDException("Hash to UPDATE is empty.")

        primary_key_column = self.primary_key_column(table)
        if not primary_key_column:
            raise McUpdateByIDException(
                "Primary key for table '%s' was not found" % table)

        keys = []
        for key, value in update_hash.items():
            key_value = key

            # Cast Inline::Python's booleans to Python's booleans
            # MC_REWRITE_TO_PYTHON: remove after porting
            if type(value).__name__ == '_perl_obj':
                value = bool(value)
                update_hash[key] = value

            key_value += " = %(" + key + ")s"  # "%(key)s" to be resolved by psycopg2, not Python

            keys.append(key_value)

        update_hash['__object_id'] = object_id

        sql = "UPDATE %s " % table
        sql += "SET %s " % ", ".join(keys)
        sql += "WHERE %s = " % primary_key_column
        sql += "%(__object_id)s"  # "%(__object_id)s" to be resolved by psycopg2, not Python

        try:
            self.query(sql, update_hash)
        except Exception as ex:
            raise McUpdateByIDException("Unable to UPDATE with hash '%s': %s" %
                                        (str(update_hash), str(ex)))

        updated_row = self.find_by_id(table=table, object_id=object_id)

        return updated_row

    def delete_by_id(self, table: str, object_id: int) -> None:
        """Delete the row in the table with the given ID."""

        # MC_REWRITE_TO_PYTHON: some IDs get passed as 'str' / 'bytes'; remove after getting rid of Catalyst
        # noinspection PyTypeChecker
        object_id = decode_object_from_bytes_if_needed(object_id)
        object_id = int(object_id)

        table = decode_object_from_bytes_if_needed(table)

        primary_key_column = self.primary_key_column(table)
        if not primary_key_column:
            raise McDeleteByIDException(
                "Primary key for table '%s' was not found" % table)

        sql = "DELETE FROM %s " % table
        sql += "WHERE %s = " % primary_key_column
        sql += "%(__object_id)s"  # "%(object_id)s" to be resolved by psycopg2, not Python

        self.query(sql, {"__object_id": object_id})

    def insert(self, table: str, insert_hash: dict) -> Dict[str, Any]:
        """Alias for create()."""
        table = decode_object_from_bytes_if_needed(table)
        insert_hash = decode_object_from_bytes_if_needed(insert_hash)

        return self.create(table=table, insert_hash=insert_hash)

    def create(self, table: str, insert_hash: dict) -> Dict[str, Any]:
        """Insert a row into the database for the given table with the given hash values and return the created row."""

        table = decode_object_from_bytes_if_needed(table)
        insert_hash = decode_object_from_bytes_if_needed(insert_hash)

        insert_hash = insert_hash.copy()  # To be able to safely modify it

        # MC_REWRITE_TO_PYTHON: remove after getting rid of Catalyst
        if "submit" in insert_hash:
            del insert_hash["submit"]

        if len(insert_hash) == 0:
            raise McCreateException("Hash to INSERT is empty")

        primary_key_column = self.primary_key_column(table)
        if not primary_key_column:
            raise McCreateException(
                "Primary key for table '%s' was not found" % table)

        keys = []
        values = []
        for key, value in insert_hash.items():
            keys.append(key)
            values.append("%(" + key + ")s")  # "%(key)s" to be resolved by psycopg2, not Python

            # Cast Inline::Python's booleans to Python's booleans
            # MC_REWRITE_TO_PYTHON: remove after porting
            if type(value).__name__ == '_perl_obj':
                value = bool(value)
                insert_hash[key] = value

        sql = "INSERT INTO %s " % table
        sql += "(%s) " % ", ".join(keys)
        sql += "VALUES (%s) " % ", ".join(values)
        sql += "RETURNING %s" % primary_key_column

        try:
            last_inserted_id = self.query(sql, insert_hash).flat()
        except Exception as ex:
            raise McCreateException(
                "Unable to INSERT into '%(table)s' data '%(data)s': %(exception)s"
                % {
                    'table': table,
                    'data': str(insert_hash),
                    'exception': str(ex),
                })

        if last_inserted_id is None or len(last_inserted_id) == 0:
            raise McCreateException("Last inserted ID was not found")
        last_inserted_id = last_inserted_id[0]

        inserted_row = self.find_by_id(table=table, object_id=last_inserted_id)
        if inserted_row is None:
            raise McCreateException("Could not find new ID %d in table '%s'" %
                                    (last_inserted_id, table))

        return inserted_row
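
    # A minimal usage sketch (hypothetical table and columns; assumes a
    # connected handler instance "db") of the create() / update_by_id() /
    # delete_by_id() round trip, all keyed on primary_key_column():
    #
    #     medium = db.create(table='media', insert_hash={'name': 'ACME Daily', 'url': 'http://daily.acme.example/'})
    #     medium = db.update_by_id(table='media', object_id=medium['media_id'], update_hash={'name': 'ACME Weekly'})
    #     db.delete_by_id(table='media', object_id=medium['media_id'])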

    def select(self,
               table: str,
               what_to_select: str,
               condition_hash: dict = None) -> DatabaseResult:
        """SELECT chosen columns from the table that match given conditions."""

        table = decode_object_from_bytes_if_needed(table)
        what_to_select = decode_object_from_bytes_if_needed(what_to_select)
        condition_hash = decode_object_from_bytes_if_needed(condition_hash)

        if condition_hash is None:
            condition_hash = {}

        condition_hash = condition_hash.copy()  # To be able to safely modify it

        # MC_REWRITE_TO_PYTHON: remove after getting rid of Catalyst
        if "submit" in condition_hash:
            del condition_hash["submit"]

        sql_conditions = []

        for key, value in condition_hash.items():
            condition = key
            condition += " = %(" + key + ")s"  # "%(key)s" to be resolved by psycopg2, not Python
            sql_conditions.append(condition)

            # Cast Inline::Python's booleans to Python's booleans
            # MC_REWRITE_TO_PYTHON: remove after porting
            if type(value).__name__ == '_perl_obj':
                value = bool(value)
                condition_hash[key] = value

        sql = "SELECT %s " % what_to_select
        sql += "FROM %s " % table
        if len(sql_conditions) > 0:
            sql += "WHERE %s" % " AND ".join(sql_conditions)

        return self.query(sql, condition_hash)

    def find_or_create(self, table: str, insert_hash: dict) -> Dict[str, Any]:
        """Select a single row from the database matching the hash or insert a row with the hash values and return the
        inserted row as a hash."""

        table = decode_object_from_bytes_if_needed(table)
        insert_hash = decode_object_from_bytes_if_needed(insert_hash)

        insert_hash = insert_hash.copy()  # To be able to safely modify it

        if len(insert_hash) == 0:
            raise McFindOrCreateException("Hash to INSERT or SELECT is empty")

        # MC_REWRITE_TO_PYTHON: remove after getting rid of Catalyst
        if "submit" in insert_hash:
            del insert_hash["submit"]

        row = self.select(table=table,
                          what_to_select='*',
                          condition_hash=insert_hash)
        if row is not None and row.rows() > 0:
            return row.hash()
        else:
            return self.create(table=table, insert_hash=insert_hash)
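
    # A minimal usage sketch (hypothetical table; assumes a connected handler
    # instance "db"): calling find_or_create() twice with the same hash
    # inserts at most one row, so it behaves as an idempotent "get or insert":
    #
    #     tag = db.find_or_create(table='tags', insert_hash={'tag': 'politics'})
    #     same_tag = db.find_or_create(table='tags', insert_hash={'tag': 'politics'})
    #     assert tag['tags_id'] == same_tag['tags_id']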

    # noinspection PyMethodMayBeStatic
    def show_error_statement(self) -> bool:
        """Return whether failed SQL statement will be included into thrown exception."""
        # FIXME I suppose psycopg2 always returns failed statement?
        # MC_REWRITE_TO_PYTHON remove after porting
        return True

    # noinspection PyMethodMayBeStatic
    def set_show_error_statement(self, show_error_statement: bool) -> None:
        """Set whether failed SQL statement will be included into thrown exception."""
        # FIXME I suppose psycopg2 always returns failed statement?
        # MC_REWRITE_TO_PYTHON remove after porting
        pass

    def print_warn(self) -> bool:
        """Return whether PostgreSQL warnings will be printed."""
        return self.__print_warnings

    def set_print_warn(self, print_warn: bool) -> None:
        """Set whether PostgreSQL warnings will be printed."""
        self.__print_warnings = print_warn

    def in_transaction(self) -> bool:
        """Return True if we're within a manually started transaction."""
        return self.__in_manual_transaction

    def __set_in_transaction(self, in_transaction: bool) -> None:
        if self.__in_manual_transaction == in_transaction:
            l.warning(
                "Setting self.__in_manual_transaction to the same value (%s)" %
                str(in_transaction))
        self.__in_manual_transaction = in_transaction

    def begin(self) -> None:
        """Begin a transaction."""
        if self.in_transaction():
            raise McBeginException("Already in transaction, can't BEGIN.")

        self.query('BEGIN')
        self.__set_in_transaction(True)

    def begin_work(self) -> None:
        """Begin a transaction."""
        return self.begin()

    def commit(self) -> None:
        """Commit a transaction."""
        if not self.in_transaction():
            l.debug("Not in transaction, nothing to COMMIT.")
        else:
            self.query('COMMIT')
            self.__set_in_transaction(False)

    def rollback(self) -> None:
        """Rollback a transaction."""
        if not self.in_transaction():
            l.warning("Not in transaction, nothing to ROLLBACK.")
        else:
            self.query('ROLLBACK')
            self.__set_in_transaction(False)
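
    # A minimal usage sketch (hypothetical query; assumes a connected handler
    # instance "db"): group related statements into a manual transaction and
    # roll back on failure:
    #
    #     db.begin()
    #     try:
    #         db.query("UPDATE stories SET title = %(title)s WHERE stories_id = %(id)s", {'title': 'New', 'id': 1})
    #         db.commit()
    #     except Exception:
    #         db.rollback()
    #         raise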

    def quote(self, value: Union[bool, int, float, str, None]) -> str:
        """Quote a string for being passed as a literal in a query.

        Also, replace every percentage sign ('%') with a random marker string shared within the database handler's
        instance; the marker is replaced back with a double percentage sign ('%%') when the query is executed."""

        value = decode_object_from_bytes_if_needed(value)

        quoted_obj = None
        try:
            # Docs say that: "While the original adapt() takes 3 arguments, psycopg2's one only takes 1: the bound
            # variable to be adapted", so:
            #
            # noinspection PyArgumentList
            quoted_obj = psycopg2_adapt(value)

            # Integer adapters, for example, don't support encoding
            if hasattr(quoted_obj, 'encoding'):
                # Otherwise the string gets treated as Latin-1:
                quoted_obj.encoding = psycopg2.extensions.encodings['UTF8']

        except Exception as ex:
            raise McQuoteException(
                "psycopg2_adapt() failed while quoting '%s': %s" %
                (value, str(ex)))
        if quoted_obj is None:
            raise McQuoteException(
                "psycopg2_adapt() returned None while quoting '%s'" % value)

        try:
            quoted_value = quoted_obj.getquoted()
        except Exception as ex:
            raise McQuoteException(
                "getquoted() failed while quoting '%s': %s" %
                (quoted_obj, str(ex)))
        if quoted_value is None:
            raise McQuoteException(
                "getquoted() returned None while quoting '%s'" % quoted_obj)

        if isinstance(quoted_value, bytes):
            quoted_value = quoted_value.decode(encoding='utf-8',
                                               errors='replace')

        if not isinstance(quoted_value, str):
            # Maybe overly paranoid, but better than returning random stuff for a string that will go into the database
            raise McQuoteException(
                "Quoted value is not 'str' after quoting '%s'" % quoted_obj)

        # Replace percentage signs with a randomly generated marker that will be replaced back into '%%' when executing
        # the query.
        quoted_value = quoted_value.replace(
            '%', self.__double_percentage_sign_marker)

        return quoted_value
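
    # A minimal usage sketch (assumes a connected handler instance "db" and
    # that the built query is executed through this handler's query(), which
    # converts the marker back for psycopg2): quote() is meant for inlining
    # literals into dynamically assembled SQL rather than binding parameters:
    #
    #     title_literal = db.quote("50% off sale")
    #     rows = db.query("SELECT * FROM stories WHERE title = " + title_literal).hashes()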

    def quote_bool(self, value: bool) -> str:
        """Quote a boolean value for being passed as a literal in a query."""
        # MC_REWRITE_TO_PYTHON: remove after starting to use Python's boolean type everywhere

        if isinstance(value, bool):
            pass
        elif isinstance(value, int):
            if value == 0:
                value = False
            elif value == 1:
                value = True
            else:
                raise McQuoteException("Value '%s' is neither 0 nor 1" %
                                       str(value))
        elif isinstance(value, str) or isinstance(value, bytes):
            value = decode_object_from_bytes_if_needed(value)
            if value.lower() in ['t', 'true', 'y', 'yes', 'on', '1']:
                value = True
            elif value.lower() in ['f', 'false', 'n', 'no', 'off', '0']:
                value = False
            else:
                raise McQuoteException(
                    "Value '%s' is a string but not one of the supported values" %
                    str(value))
        else:
            raise McQuoteException("Value '%s' is unsupported" % str(value))

        return self.quote(value=value)

    def quote_varchar(self, value: str) -> str:
        """Quote VARCHAR for being passed as a literal in a query."""
        # MC_REWRITE_TO_PYTHON: remove after starting to use Python's 'str' type everywhere
        value = decode_object_from_bytes_if_needed(value)

        return self.quote(value=value)

    def quote_date(self, value: str) -> str:
        """Quote DATE for being passed as a literal in a query."""
        value = decode_object_from_bytes_if_needed(value)

        return '%s::date' % self.quote(value=value)

    def quote_timestamp(self, value: str) -> str:
        """Quote TIMESTAMP for being passed as a literal in a query."""
        value = decode_object_from_bytes_if_needed(value)

        return '%s::timestamp' % self.quote(value=value)

    def copy_from(self, sql: str) -> CopyFrom:
        """Return COPY FROM helper object."""
        sql = decode_object_from_bytes_if_needed(sql)

        return CopyFrom(cursor=self.__db, sql=sql)

    def copy_to(self, sql: str) -> CopyTo:
        """Return COPY TO helper object."""
        sql = decode_object_from_bytes_if_needed(sql)

        return CopyTo(cursor=self.__db, sql=sql)

    def get_temporary_ids_table(self,
                                ids: List[int],
                                ordered: bool = False) -> str:
        """Get the name of a temporary table that contains all of the IDs in "ids" as an "id BIGINT" field.

        The database connection must be within a transaction. The temporary table is setup to be dropped at the end of
        the current transaction. If "ordered" is True, include an "<...>_id SERIAL PRIMARY KEY" field in the table."""

        table_name = '_tmp_ids_%s' % random_string(length=16)

        l.debug("Temporary IDs table: %s" % table_name)

        primary_key_clause = ""
        if ordered:
            primary_key_clause = "%s_pkey SERIAL PRIMARY KEY," % table_name

        sql = """CREATE TEMPORARY TABLE %s (""" % table_name
        sql += primary_key_clause
        sql += "id BIGINT)"
        self.query(sql)

        copy = self.copy_from("COPY %s (id) FROM STDIN" % table_name)
        for single_id in ids:
            copy.put_line("%d\n" % single_id)
        copy.end()

        self.query("ANALYZE %s" % table_name)

        return table_name
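
    # A minimal usage sketch (hypothetical query; assumes a connected handler
    # instance "db"): per the docstring, the helper must run inside a
    # transaction, so wrap it in begin() / commit():
    #
    #     db.begin()
    #     ids_table = db.get_temporary_ids_table(ids=[1, 2, 3])
    #     stories = db.query("SELECT * FROM stories WHERE stories_id IN (SELECT id FROM %s)" % ids_table).hashes()
    #     db.commit()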

    def attach_child_query(self,
                           data: List[Dict[str, Any]],
                           child_query: str,
                           child_field: str,
                           id_column: str,
                           single: bool = False) -> List[Dict[str, Any]]:
        """For each row in "data", attach all results in the child query that match a JOIN with the "id_column" field in
        each row of "data".

        Then, attach to "row[child_field]":

        * If "single" is True, the "child_field" column in the corresponding row in "data";

        * If "single" is False, a list of values for each row in "data".

        For an example on how this works, see test_attach_child_query() in test_handler.py."""

        # FIXME get rid of this hard to understand reimplementation of JOIN which is here due to the sole reason that
        # _add_nested_data() is hard to refactor out and no one bothered to do it.

        data = decode_object_from_bytes_if_needed(data)
        if not isinstance(data, list):
            raise McDecodeObjectFromBytesIfNeededException(
                "'data' is not a list anymore after converting: %s" %
                str(data))
        # Get rid of the return type warning by enforcing that 'data' is still a list
        data = list(data)
        child_query = decode_object_from_bytes_if_needed(child_query)
        child_field = decode_object_from_bytes_if_needed(child_field)
        id_column = decode_object_from_bytes_if_needed(id_column)

        parent_lookup = {}
        ids = []
        for parent in data:
            parent_id = parent[id_column]

            parent_lookup[parent_id] = parent
            ids.append(parent_id)

        ids_table = self.get_temporary_ids_table(ids=ids)
        sql = """
            -- noinspection SqlResolve
            SELECT q.*
            FROM ( %(child_query)s ) AS q
                -- Limit rows returned by "child_query" to only IDs from "ids"
                INNER JOIN %(ids_table)s AS ids
                    ON q.%(id_column)s = ids.id
        """ % {
            'child_query': child_query,
            'ids_table': ids_table,
            'id_column': id_column,
        }
        children = self.query(sql).hashes()

        for child in children:
            child_id = child[id_column]
            parent = parent_lookup[child_id]

            if single:
                parent[child_field] = child[child_field]
            else:
                if child_field not in parent:
                    parent[child_field] = []
                parent[child_field].append(child)

        return data
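
    # A minimal usage sketch (hypothetical tables and columns; assumes a
    # connected handler instance "db"): attach each medium's tags as a list
    # under row['tags']:
    #
    #     media = db.query("SELECT * FROM media").hashes()
    #     media = db.attach_child_query(
    #         data=media,
    #         child_query="SELECT media_id, tag FROM media_tags",
    #         child_field='tags',
    #         id_column='media_id',
    #     )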

    def query_paged_hashes(self, query: str, page: int,
                           rows_per_page: int) -> DatabasePages:
        """Execute the query and return a list of pages hashes."""

        # MC_REWRITE_TO_PYTHON: some IDs get passed as 'str' / 'bytes'; remove after getting rid of Catalyst
        # noinspection PyTypeChecker
        page = decode_object_from_bytes_if_needed(page)
        page = int(page)

        query = decode_object_from_bytes_if_needed(query)

        return DatabasePages(
            cursor=self.__db,
            query=query,
            page=page,
            rows_per_page=rows_per_page,
            double_percentage_sign_marker=self.__double_percentage_sign_marker)

def _test_story(db: DatabaseHandler, story: dict, num: int) -> None:
    assert _find_dup_stories(
        db=db,
        story=story,
    ) == [story], f"{num} identical"

    assert _find_dup_stories(
        db=db,
        story={**story, **{
            'media_id': story['media_id'] + 1,
        }},
    ) == [], f"{num} media_id diff"

    assert _find_dup_stories(
        db=db,
        story={**story, **{
            'url': random_string(16),
            'guid': random_string(16),
        }},
    ) == [story], f"{num} URL + GUID diff, title same"

    assert _find_dup_stories(
        db=db,
        story={**story, **{
            'url': random_string(16),
            'title': random_string(16),
        }},
    ) == [story], f"{num} title + URL diff, GUID same"

    assert _find_dup_stories(
        db=db,
        story={**story, **{
            'guid': random_string(16),
            'title': random_string(16),
        }},
    ) == [story], f"{num} title + GUID diff, URL same"

    assert _find_dup_stories(
        db=db,
        story={**story, **{
            'url': story['url'].upper(),
            'guid': random_string(16),
            'title': random_string(16),
        }},
    ) == [story], f"{num} title + GUID diff, normalized url same"

    assert _find_dup_stories(
        db=db,
        story={**story, **{
            'url': random_string(16),
            'guid': random_string(16),
            'publish_date': increment_day(date=story['publish_date'], days=2),
        }},
    ) == [], f"{num} date + 2 days"

    assert _find_dup_stories(
        db=db,
        story={**story, **{
            'url': random_string(16),
            'guid': random_string(16),
            'publish_date': increment_day(date=story['publish_date'], days=-2),
        }},
    ) == [], f"{num} date - 2 days"

    # Verify that we can find a dup story by the URL or GUID of a previously dup'd story
    dup_url = random_string(16)
    dup_guid = random_string(16)

    nondup_url = random_string(16)
    nondup_guid = 'bogus unique guid'
    nondup_title = 'bogus unique title'

    dup_stories = _find_dup_stories(db, {**story, **{'url': dup_url, 'guid': dup_guid}})
    assert dup_stories == [story]

    assert _find_dup_stories(db, {**story, **{'url': dup_url, 'title': nondup_title}}) == [story]
    assert _find_dup_stories(db, {**story, **{'guid': dup_guid, 'title': nondup_title}}) == [story]

    nondup_story = {**story, **{'url': nondup_url, 'guid': nondup_guid, 'title': nondup_title}}
    assert _find_dup_stories(db, nondup_story) == []
Example #22
0
class JapaneseLanguage(StopWordsFromFileMixIn):
    """Japanese language support module."""

    # Paths where mecab-ipadic-neologd might be located
    __MECAB_DICTIONARY_PATHS = [

        # Ubuntu / Debian
        '/var/lib/mecab/dic/ipadic-neologd',

        # CentOS / Fedora
        '/usr/lib64/mecab/dic/ipadic-neologd/',

        # OS X
        '/usr/local/opt/mecab-ipadic-neologd/lib/mecab/dic/ipadic-neologd/',
    ]

    __MECAB_TOKEN_POS_SEPARATOR = random_string(
        length=16)  # for whatever reason tab doesn't work
    __MECAB_EOS_MARK = 'EOS'

    __slots__ = [
        # MeCab instance
        '__mecab',

        # Text -> sentence tokenizer for Japanese text
        '__japanese_sentence_tokenizer',

        # English language instance for tokenizing non-Chinese (e.g. English) text
        '__english_language',
    ]

    @staticmethod
    def _mecab_ipadic_neologd_path() -> str:
        # (protected and not private because it's used by the unit test)
        """Return path to mecab-ipadic-neologd dictionary installed on system."""
        mecab_dictionary_path = None
        candidate_paths = JapaneseLanguage.__MECAB_DICTIONARY_PATHS

        for candidate_path in candidate_paths:
            if os.path.isdir(candidate_path):
                if os.path.isfile(os.path.join(candidate_path, 'sys.dic')):
                    mecab_dictionary_path = candidate_path
                    break

        if mecab_dictionary_path is None:
            raise McLanguageException(
                "mecab-ipadic-neologd was not found in paths: %s" %
                str(candidate_paths))

        return mecab_dictionary_path

    @staticmethod
    def _mecab_allowed_pos_ids() -> Dict[int, str]:
        """Return allowed MeCab part-of-speech IDs and their definitions from pos-id.def.

        Definitions don't do much in the language module itself, they're used by unit tests to verify that pos-id.def
        didn't change in some unexpected way and we're not missing out on newly defined POSes.
        """
        return {
            36: '名詞,サ変接続,*,*',  # noun-verbal
            38: '名詞,一般,*,*',  # noun
            40: '名詞,形容動詞語幹,*,*',  # adjectival nouns or quasi-adjectives
            41: '名詞,固有名詞,一般,*',  # proper nouns
            42: '名詞,固有名詞,人名,一般',  # proper noun, names of people
            43: '名詞,固有名詞,人名,姓',  # proper noun, first name
            44: '名詞,固有名詞,人名,名',  # proper noun, last name
            45: '名詞,固有名詞,組織,*',  # proper noun, organization
            46: '名詞,固有名詞,地域,一般',  # proper noun in general
            47: '名詞,固有名詞,地域,国',  # proper noun, country name
        }

    def __init__(self):
        """Constructor."""
        super().__init__()

        self.__japanese_sentence_tokenizer = RegexpTokenizer(
            r'([^!?。]*[!?。])',
            gaps=True,  # don't discard non-Japanese text
            discard_empty=True,
        )

        self.__english_language = EnglishLanguage()

        mecab_dictionary_path = JapaneseLanguage._mecab_ipadic_neologd_path()

        try:
            self.__mecab = MeCab.Tagger(
                '--dicdir=%(dictionary_path)s '
                '--node-format=%%m%(token_pos_separator)s%%h\\n '
                '--eos-format=%(eos_mark)s\\n' % {
                    'token_pos_separator': self.__MECAB_TOKEN_POS_SEPARATOR,
                    'eos_mark': self.__MECAB_EOS_MARK,
                    'dictionary_path': mecab_dictionary_path,
                })
        except Exception as ex:
            raise McLanguageException("Unable to initialize MeCab: %s" %
                                      str(ex))

        # Quick self-test to make sure that MeCab, its dictionaries and Python class are installed and working
        mecab_exc_message = "MeCab self-test failed; make sure that MeCab is built and dictionaries are accessible."
        try:
            test_words = self.split_sentence_to_words('pythonが大好きです')
        except Exception as _:
            raise McLanguageException(mecab_exc_message)
        else:
            if len(test_words) < 2 or test_words[1] != '大好き':
                raise McLanguageException(mecab_exc_message)

    @staticmethod
    def language_code() -> str:
        return "ja"

    @staticmethod
    def sample_sentence() -> str:
        return "いろはにほへと ちりぬるを わかよたれそ つねならむ うゐのおくやま けふこえて あさきゆめみし ゑひもせす(ん)。"

    # noinspection PyMethodMayBeStatic
    def stem_words(self, words: List[str]) -> List[str]:
        words = decode_object_from_bytes_if_needed(words)

        # MeCab's sentence -> word tokenizer already returns "base forms" of every word
        return words

    def split_text_to_sentences(self, text: str) -> List[str]:
        """Tokenize Japanese text into sentences."""
        text = decode_object_from_bytes_if_needed(text)
        if text is None:
            log.warning("Text is None.")
            return []

        text = text.strip()

        if len(text) == 0:
            return []

        # First split Japanese text
        japanese_sentences = self.__japanese_sentence_tokenizer.tokenize(text)
        sentences = []
        for sentence in japanese_sentences:

            # Split paragraphs separated by two line breaks denoting a list
            paragraphs = re.split(r"\n\s*?\n", sentence)
            for paragraph in paragraphs:

                # Split lists separated by "* "
                list_items = re.split(r"\n\s*?(?=\* )", paragraph)
                for list_item in list_items:
                    # Split non-Japanese text
                    non_japanese_sentences = self.__english_language.split_text_to_sentences(
                        list_item)

                    sentences += non_japanese_sentences

        # Trim whitespace
        sentences = [sentence.strip() for sentence in sentences]

        return sentences

    def split_sentence_to_words(self, sentence: str) -> List[str]:
        """Tokenize Japanese sentence into words.

        Removes punctuation and words that don't belong to part-of-speech whitelist."""

        sentence = decode_object_from_bytes_if_needed(sentence)
        if sentence is None:
            log.warning("Sentence is None.")
            return []

        sentence = sentence.strip()

        if len(sentence) == 0:
            return []

        parsed_text = self.__mecab.parse(sentence).strip()
        parsed_tokens = parsed_text.split("\n")

        allowed_pos_ids = self._mecab_allowed_pos_ids()

        words = []
        for parsed_token_line in parsed_tokens:
            if self.__MECAB_TOKEN_POS_SEPARATOR in parsed_token_line:

                primary_form_and_pos_number = parsed_token_line.split(
                    self.__MECAB_TOKEN_POS_SEPARATOR)

                primary_form = primary_form_and_pos_number[0]
                pos_number = primary_form_and_pos_number[1]

                if pos_number.isdigit():
                    pos_number = int(pos_number)

                    if pos_number in allowed_pos_ids:
                        words.append(primary_form)

            else:
                # Ignore all the "EOS" stuff
                pass

        return words
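
A minimal usage sketch of the class above, assuming MeCab and the mecab-ipadic-neologd dictionary are installed in one of the listed paths (the sample text is hypothetical):

language = JapaneseLanguage()

text = '東京は日本の首都です。Python is fun. 私は猫が好きです。'
for sentence in language.split_text_to_sentences(text):
    # Only words whose MeCab POS ID is in _mecab_allowed_pos_ids() survive
    print(language.split_sentence_to_words(sentence))
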
Example #23
0
class McJapaneseTokenizer(object):
    """Japanese language tokenizer that uses MeCab."""

    # Paths where mecab-ipadic-neologd might be located
    __MECAB_DICTIONARY_PATHS = [

        # Ubuntu / Debian
        '/var/lib/mecab/dic/ipadic-neologd',

        # CentOS / Fedora
        '/usr/lib64/mecab/dic/ipadic-neologd/',

        # OS X
        '/usr/local/opt/mecab-ipadic-neologd/lib/mecab/dic/ipadic-neologd/',
    ]

    # MeCab instance
    __mecab = None

    # Text -> sentence tokenizer for Japanese text
    __japanese_sentence_tokenizer = RegexpTokenizer(
        r'([^!?。]*[!?。])',
        gaps=True,  # don't discard non-Japanese text
        discard_empty=True,
    )

    # Text -> sentence tokenizer for non-Japanese (e.g. English) text
    __non_japanese_sentence_tokenizer = PunktSentenceTokenizer()

    __MECAB_TOKEN_POS_SEPARATOR = random_string(
        length=16)  # for whatever reason tab doesn't work
    __MECAB_EOS_MARK = 'EOS'

    def __init__(self):
        """Initialize MeCab tokenizer."""

        mecab_dictionary_path = McJapaneseTokenizer._mecab_ipadic_neologd_path()

        try:
            self.__mecab = MeCab.Tagger(
                '--dicdir=%(dictionary_path)s '
                '--node-format=%%m%(token_pos_separator)s%%h\\n '
                '--eos-format=%(eos_mark)s\\n' % {
                    'token_pos_separator': self.__MECAB_TOKEN_POS_SEPARATOR,
                    'eos_mark': self.__MECAB_EOS_MARK,
                    'dictionary_path': mecab_dictionary_path,
                })
        except Exception as ex:
            raise McJapaneseTokenizerException(
                "Unable to initialize MeCab: %s" % str(ex))

    @staticmethod
    def _mecab_ipadic_neologd_path() -> str:
        # (protected and not private because it's used by the unit test)
        """Return path to mecab-ipadic-neologd dictionary installed on system."""
        mecab_dictionary_path = None
        candidate_paths = McJapaneseTokenizer.__MECAB_DICTIONARY_PATHS

        for candidate_path in candidate_paths:
            if os.path.isdir(candidate_path):
                if os.path.isfile(os.path.join(candidate_path, 'sys.dic')):
                    mecab_dictionary_path = candidate_path
                    break

        if mecab_dictionary_path is None:
            raise McJapaneseTokenizerException(
                "mecab-ipadic-neologd was not found in paths: %s" %
                str(candidate_paths))

        return mecab_dictionary_path

    def tokenize_text_to_sentences(self, text: str) -> list:
        """Tokenize Japanese text into sentences."""

        text = decode_object_from_bytes_if_needed(text)

        if text is None:
            log.warning("Text to tokenize into sentences is None.")
            return []

        text = text.strip()

        if len(text) == 0:
            return []

        # First split Japanese text
        japanese_sentences = self.__japanese_sentence_tokenizer.tokenize(text)
        sentences = []
        for sentence in japanese_sentences:

            # Split paragraphs separated by two line breaks denoting a list
            paragraphs = re.split(r"\n\s*?\n", sentence)
            for paragraph in paragraphs:

                # Split lists separated by "* "
                list_items = re.split(r"\n\s*?(?=\* )", paragraph)
                for list_item in list_items:
                    # Split non-Japanese text
                    non_japanese_sentences = self.__non_japanese_sentence_tokenizer.tokenize(
                        list_item)

                    sentences += non_japanese_sentences

        # Trim whitespace
        sentences = [sentence.strip() for sentence in sentences]

        return sentences

    @staticmethod
    def _mecab_allowed_pos_ids() -> Dict[int, str]:
        """Return allowed MeCab part-of-speech IDs and their definitions from pos-id.def.
        
        Definitions don't do much in the language module itself, they're used by unit tests to verify that pos-id.def
        didn't change in some unexpected way and we're not missing out on newly defined POSes.
        """
        return {
            36: '名詞,サ変接続,*,*',  # noun-verbal
            38: '名詞,一般,*,*',  # noun
            40: '名詞,形容動詞語幹,*,*',  # adjectival nouns or quasi-adjectives
            41: '名詞,固有名詞,一般,*',  # proper nouns
            42: '名詞,固有名詞,人名,一般',  # proper noun, names of people
            43: '名詞,固有名詞,人名,姓',  # proper noun, first name
            44: '名詞,固有名詞,人名,名',  # proper noun, last name
            45: '名詞,固有名詞,組織,*',  # proper noun, organization
            46: '名詞,固有名詞,地域,一般',  # proper noun in general
            47: '名詞,固有名詞,地域,国',  # proper noun, country name
        }

    def tokenize_sentence_to_words(self, sentence: str) -> list:
        """Tokenize Japanese sentence into words.
        
        Removes punctuation and words that don't belong to part-of-speech whitelist."""

        sentence = decode_object_from_bytes_if_needed(sentence)

        if sentence is None:
            log.warning("Sentence to tokenize into words is None.")
            return []

        sentence = sentence.strip()

        if len(sentence) == 0:
            return []

        parsed_text = self.__mecab.parse(sentence).strip()
        parsed_tokens = parsed_text.split("\n")

        allowed_pos_ids = self._mecab_allowed_pos_ids()

        words = []
        for parsed_token_line in parsed_tokens:
            if self.__MECAB_TOKEN_POS_SEPARATOR in parsed_token_line:

                primary_form_and_pos_number = parsed_token_line.split(
                    self.__MECAB_TOKEN_POS_SEPARATOR)

                primary_form = primary_form_and_pos_number[0]
                pos_number = primary_form_and_pos_number[1]

                if pos_number.isdigit():
                    pos_number = int(pos_number)

                    if pos_number in allowed_pos_ids:
                        words.append(primary_form)

            else:
                # Ignore all the "EOS" stuff
                pass

        return words
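
A minimal usage sketch for this tokenizer, under the same installation assumptions as above (the sample text is hypothetical):

tokenizer = McJapaneseTokenizer()

for sentence in tokenizer.tokenize_text_to_sentences('私は猫が好きです。犬も好きです。'):
    # Punctuation and non-whitelisted parts of speech are dropped
    print(tokenizer.tokenize_sentence_to_words(sentence))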