Example #1
# Imports assumed by this snippet; "misc" and "psycopg_execute_batch"
# are project-local helper modules.
import pprint
import traceback

import psycopg2

import misc
import psycopg_execute_batch

def do_link_batch_update_sess(logger, interface, link_batch):
	if not link_batch:
		return

	expected_keys = set([
			'url',
			'starturl',
			'netloc',
			'distance',
			'priority',
			'state',
			'addtime',
			'epoch',
		])


	for item in link_batch:
		try:
			assert 'url'              in item
			assert 'starturl'         in item
			assert 'netloc'           in item
			assert 'distance'         in item
			assert 'priority'         in item
			assert 'state'            in item
			assert 'addtime'          in item
			assert 'epoch'            in item

		except AssertionError:
			logger.error("Missing key from raw entry: ")
			item_str = pprint.pformat(item)
			for line in item_str.split("\n"):
				logger.error("	%s", line.rstrip())
			raise

		item_keys = set(item.keys())
		excess_keys = item_keys - expected_keys
		try:
			assert not excess_keys
		except AssertionError:
			logger.error("Excess key(s) in raw entry: '%s'", excess_keys)
			item_str = pprint.pformat(item)
			for line in item_str.split("\n"):
				logger.error("	%s", line.rstrip())
			raise


	logger.info("Inserting %s items into DB in batch.", len(link_batch))
	# This is kind of horrible.
	# Reach down through sqlalchemy and pull out the raw cursor directly.
	raw_cur = interface.connection().connection.cursor()

	per_cmd = """
	SELECT upsert_link_raw(
			%(url)s,
			%(starturl)s,
			%(netloc)s,
			%(distance)s,
			%(priority)s,
			%(addtime)s,
			%(state)s,
			%(epoch)s
			);
			""".replace("	", " ")

	per_cmd = per_cmd.replace("\n", " ")

	while "  " in per_cmd:
		per_cmd = per_cmd.replace("  ", " ")

	# Somehow we're getting here with an open transaction. I have no idea what's opening them.
	# Something something DBAPI
	raw_cur.execute("COMMIT;")


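	# Insert strategy is a three-stage fallback ladder:
	#   1. execute_batch() over chunks of 50 rows,
	#   2. execute_batch() over chunks of 5 rows,
	#   3. per-row execute(), escalating to a commit after every row
	#      if the single batched transaction keeps failing.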
	rowcnt = 0
	try:
		for subc in misc.batch(link_batch, 50):
			# We don't care about isolation for these operations, as each operation
			# is functionally independent.
			raw_cur.execute("BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;")

			# We use a statement timeout context of 2500 ms, so we don't get wedged on a lock.
			raw_cur.execute("SET statement_timeout TO 2500;")

			# We try the bulk insert command first.
			psycopg_execute_batch.execute_batch(raw_cur, per_cmd, subc)
			rowcnt += raw_cur.rowcount
			raw_cur.execute("COMMIT;")
			raw_cur.execute("RESET statement_timeout;")
		link_batch = []
		logger.info("Touched AT LEAST %s rows", rowcnt)
		return rowcnt

	except psycopg2.Error:
		logger.error("psycopg2.Error - Failure on bulk insert.")
		for line in traceback.format_exc().split("\n"):
			logger.error(line)
		raw_cur.execute("ROLLBACK;")
		logger.error("Retrying.")

	rowcnt = 0
	try:
		for subc in misc.batch(link_batch, 5):
			# We don't care about isolation for these operations, as each operation
			# is functionally independent.
			raw_cur.execute("BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;")

			# We use a statement timeout context of 2500 ms, so we don't get wedged on a lock.
			raw_cur.execute("SET statement_timeout TO 2500;")

			# We try the bulk insert command first.
			psycopg_execute_batch.execute_batch(raw_cur, per_cmd, subc)
			rowcnt += raw_cur.rowcount
			raw_cur.execute("COMMIT;")
			raw_cur.execute("RESET statement_timeout;")
		link_batch = []
		logger.info("Touched AT LEAST %s rows", rowcnt)
		return rowcnt

	except psycopg2.Error:
		logger.error("psycopg2.Error - Failure on bulk insert.")
		for line in traceback.format_exc().split("\n"):
			logger.error(line)
		raw_cur.execute("ROLLBACK;")
		logger.error("Retrying with per upsert commit.")

	# If the bulk insert failed, we then try a per-URL upsert.
	# We only commit per-URL if we've tried the per-URL update in batch, and it failed.
	commit_each = False
	while 1:
		rowcnt = 0
		try:
			raw_cur.execute("BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;")

			# We use a statement timeout context of 2500 ms, so we don't get wedged on a lock.
			raw_cur.execute("SET statement_timeout TO 2500;")

			for paramset in link_batch:
				assert isinstance(paramset['starturl'], str)
				if len(paramset['url']) > 2000:
					logger.error("URL Is too long to insert into the database!")
					logger.error("URL: '%s'", paramset['url'])

				else:
					# Forward-date the next walk time, rather than using the now-value for the threshold.
					raw_cur.execute(per_cmd, paramset)
					rowcnt += raw_cur.rowcount

					if commit_each:
						raw_cur.execute("COMMIT;")
						raw_cur.execute("BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;")
						# We use a statement timeout context of 2500 ms, so we don't get wedged on a lock.
						raw_cur.execute("SET statement_timeout TO 2500;")

			raw_cur.execute("COMMIT;")
			break

		except psycopg2.Error:
			if commit_each is False:
				logger.warning("psycopg2.Error - Retrying with commit each.")
			else:
				logger.warning("psycopg2.Error - Retrying.")
				traceback.print_exc()

			raw_cur.execute("ROLLBACK;")
			commit_each = True

	raw_cur.execute("RESET statement_timeout;")

	logger.info("Changed %s rows", rowcnt)

	return rowcnt
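
The two project-local helpers Example #1 leans on are defined elsewhere in the repository: psycopg_execute_batch.execute_batch appears to mirror psycopg2.extras.execute_batch, and misc.batch chunks an iterable. A minimal sketch of the chunker, under those assumptions (the real implementation may differ, e.g. in how show_progress is rendered):

def batch(iterable, chunk_size, show_progress=False):
	# Sketch of the assumed misc.batch helper: yield successive
	# chunk_size-sized lists from iterable. show_progress is accepted
	# (Example #2 passes it) but ignored here.
	chunk = []
	for item in iterable:
		chunk.append(item)
		if len(chunk) == chunk_size:
			yield chunk
			chunk = []
	if chunk:
		yield chunk
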
Example #2
# Imports assumed by this snippet; "misc", "psycopg_execute_batch" and
# "db" are project-local modules.
import pprint
import traceback

import psycopg2
import sqlalchemy.exc

import misc
import psycopg_execute_batch
import db

def do_link_batch_update_sess(logger,
                              interface,
                              link_batch,
                              max_pri=None,
                              show_progress=False):
    if not link_batch:
        return

    expected_keys = set([
        'url',
        'starturl',
        'netloc',
        'distance',
        'is_text',
        'priority',
        'type',
        'addtime',
        'state',
        'epoch',
        'maximum_priority',  # Optional
    ])

    for item in link_batch:
        try:
            assert 'url' in item
            assert 'starturl' in item
            assert 'netloc' in item
            assert 'distance' in item
            assert 'is_text' in item
            assert 'priority' in item
            assert 'type' in item
            assert 'addtime' in item
            assert 'state' in item
            assert 'epoch' in item

            if 'maximum_priority' not in item:
                item['maximum_priority'] = item['priority']

            if item['distance'] < item['maximum_priority']:
                item['distance'] = item['maximum_priority']

            assert 'maximum_priority' in item

            # psycopg2cffi._impl.exceptions.OperationalError: index row size 3192 exceeds maximum 2712 for index "ix_web_pages_url"
            assert len(
                item['url']
            ) < 2712, "URL Too long for postgres. Length %s for url '%s'" % (
                len(item['url']), item['url'])

            if max_pri is None:
                max_pri = db.DB_LOW_PRIORITY

            if item['maximum_priority'] < max_pri:
                item['maximum_priority'] = max_pri

            if item['distance'] < item['maximum_priority']:
                item['distance'] = item['maximum_priority']

        except AssertionError:
            logger.error("Missing key from entry: ")
            item_str = pprint.pformat(item)
            for line in item_str.split("\n"):
                logger.error("	%s", line.rstrip())
            raise

        item_keys = set(item.keys())
        excess_keys = item_keys - expected_keys
        try:
            assert not excess_keys
        except AssertionError:
            logger.error("Excess key(s) in entry: '%s'", excess_keys)
            item_str = pprint.pformat(item)
            for line in item_str.split("\n"):
                logger.error("	%s", line.rstrip())
            raise
        # print("item:", item)

    logger.info("Inserting %s items into DB in batch.", len(link_batch))
    # This is kind of horrible.
    # Reach down through sqlalchemy and pull out the raw cursor directly.
    try:
        raw_cur = interface.connection().connection.cursor()
    except sqlalchemy.exc.InvalidRequestError:
        interface.rollback()
        raw_cur = interface.connection().connection.cursor()

    per_cmd = """
	SELECT upsert_link(
			%(url)s,
			%(starturl)s,
			%(netloc)s,
			%(distance)s,
			%(is_text)s,
			%(priority)s,
			%(type)s,
			%(addtime)s,
			%(state)s,
			%(maximum_priority)s,
			%(epoch)s
			);
			""".replace("	", " ")

    per_cmd = per_cmd.replace("\n", " ")

    while "  " in per_cmd:
        per_cmd = per_cmd.replace("  ", " ")

    # Somehow we're getting here with an open transaction. I have no idea what's opening them.
    # Something something DBAPI
    raw_cur.execute("COMMIT;")

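    # Same three-stage fallback ladder as in Example #1: chunks of 50,
    # then chunks of 5, then per-row execution with an optional commit
    # after every row.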
    rowcnt = 0
    try:
        for subc in misc.batch(link_batch, 50, show_progress=show_progress):
            # We don't care about isolation for these operations, as each operation
            # is functionally independent.
            raw_cur.execute(
                "BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;")

            # We use a statement timeout context of 5000 ms, so we don't get wedged on a lock.
            raw_cur.execute("SET statement_timeout TO 5000;")

            # We try the bulk insert command first.
            psycopg_execute_batch.execute_batch(raw_cur, per_cmd, subc)
            rowcnt += raw_cur.rowcount
            raw_cur.execute("COMMIT;")
            raw_cur.execute("RESET statement_timeout;")
        link_batch = []
        logger.info("Touched AT LEAST %s rows", rowcnt)
        return rowcnt

    except psycopg2.Error:
        logger.error("psycopg2.Error - Failure on bulk insert.")
        for line in traceback.format_exc().split("\n"):
            logger.error(line)
        raw_cur.execute("ROLLBACK;")
        logger.error("Retrying.")

    rowcnt = 0
    try:
        for subc in misc.batch(link_batch, 5, show_progress=show_progress):
            # We don't care about isolation for these operations, as each operation
            # is functionally independent.
            raw_cur.execute(
                "BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;")

            # We use a statement timeout context of 2500 ms, so we don't get wedged on a lock.
            raw_cur.execute("SET statement_timeout TO 2500;")

            # We try the bulk insert command first.
            psycopg_execute_batch.execute_batch(raw_cur, per_cmd, subc)
            rowcnt += raw_cur.rowcount
            raw_cur.execute("COMMIT;")
            raw_cur.execute("RESET statement_timeout;")
        link_batch = []
        logger.info("Touched AT LEAST %s rows", rowcnt)
        return rowcnt

    except psycopg2.Error:
        logger.error("psycopg2.Error - Failure on bulk insert.")
        for line in traceback.format_exc().split("\n"):
            logger.error(line)
        raw_cur.execute("ROLLBACK;")
        logger.error("Retrying with per upsert commit.")

    # If the bulk insert failed, we then try a per-URL upsert.
    # We only commit per-URL if we've tried the per-URL update in batch, and it failed.
    commit_each = False
    while 1:
        rowcnt = 0
        try:
            raw_cur.execute(
                "BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;")

            # We use a statement timeout context of 2500 ms, so we don't get wedged on a lock.
            raw_cur.execute("SET statement_timeout TO 2500;")

            for paramset in link_batch:
                assert isinstance(paramset['starturl'], str)
                if len(paramset['url']) > 2000:
                    logger.error(
                        "URL is too long to insert into the database!")
                    logger.error("URL: '%s'", paramset['url'])

                else:
                    # Forward-date the next walk time, rather than using the now-value for the threshold.
                    raw_cur.execute(per_cmd, paramset)
                    rowcnt += raw_cur.rowcount

                    if commit_each:
                        raw_cur.execute("COMMIT;")
                        raw_cur.execute(
                            "BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;"
                        )
                        # We use a statement timeout context of 2500 ms, so we don't get wedged on a lock.
                        raw_cur.execute("SET statement_timeout TO 2500;")

            raw_cur.execute("COMMIT;")
            break

        except psycopg2.Error:
            if commit_each is False:
                logger.warning("psycopg2.Error - Retrying with commit each.")
            else:
                logger.warning("psycopg2.Error - Retrying.")
                traceback.print_exc()

            raw_cur.execute("ROLLBACK;")
            commit_each = True

    raw_cur.execute("RESET statement_timeout;")

    logger.info("Changed %s rows", rowcnt)

    return rowcnt
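
For context, a hypothetical call site for Example #2 might look like the sketch below. The session object, logger name, and all field values are illustrative assumptions, not taken from the source:

import datetime
import logging

# Hypothetical usage sketch: "session" is assumed to be a SQLAlchemy
# session exposing .connection(); every value below is illustrative.
log = logging.getLogger("link_batch_demo")
batch = [{
    'url'      : 'https://example.com/page-1',
    'starturl' : 'https://example.com/',
    'netloc'   : 'example.com',
    'distance' : 0,
    'is_text'  : True,
    'priority' : 100,
    'type'     : 'unknown',
    'addtime'  : datetime.datetime.now(),
    'state'    : 'new',
    'epoch'    : 0,
    # 'maximum_priority' is optional; it defaults to item['priority'].
}]
do_link_batch_update_sess(log, session, batch, show_progress=True)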