Exemple #1
0
def prune_expired_unverified_user_records(connection):
    """Delete expired unverified user records and log each deletion.

    The expiry delay is read from the WWW2PNG_UNVERIFIED_USER_PRUNE_DELAY
    environment variable and handed to the db layer as an int. No commit is
    issued here.

    Args:
        connection: Database connection passed through to the ``db`` helpers.
    """
    prune_delay = int(os.getenv("WWW2PNG_UNVERIFIED_USER_PRUNE_DELAY"))
    expired_users = db.get_expired_unverified_user_records(
        connection, prune_delay)
    for user in expired_users:
        user_id = user["id"]
        db.delete_unverified_user_record(connection, user_id)
        log_message(
            f'Pruned unverified user record: {user_id} {user["email"]}')
Exemple #2
0
def manage():
	"""Handle the moderation form and render the pending-submissions page.

	On a valid form submission the "accepted" and "rejected" fields are parsed
	as JSON lists of submission ids. Each accepted submission whose URL is not
	already known gets a url record; every accepted or rejected submission is
	then deleted together with its local preview image. All changes are
	committed once, after both lists have been processed.

	Returns:
		The rendered ``manage.html`` template; ``data["pending"]`` holds up to
		100 pending submission records.
	"""
	with db.connect() as connection:
		data = {}
		form = v.ManageForm()

		def discard_submission(item):
			# Shared cleanup for accepted and rejected items: drop the
			# submission row, then its preview file if one is on disk.
			db.delete_submission_record(connection, item)
			misc.log_message(f"""Deleted submission record: {item}""")
			file_path_screenshot = os.path.join(os.getcwd(), os.getenv("ARCHIVE3_SCREENSHOT_DIR"), "preview-" + str(item) + ".jpg")
			if os.path.exists(file_path_screenshot):
				os.remove(file_path_screenshot)
				# Only report a deletion that actually happened.
				misc.log_message(f"""Deleted local preview screenshot: {file_path_screenshot}""")

		if form.validate_on_submit():
			for item in json.loads(form.data["accepted"]):
				record = db.get_submission_record(connection, item)
				if record is not None and not db.check_url_exists(connection, record["url"]):
					# Use a separate dict so the template context `data` does
					# not pick up a stray "url" key from the last accepted item.
					db.create_url_record(connection, {"url": record["url"]})
					misc.log_message(f"""Created url record: {record["url"]}""")
				discard_submission(item)
			for item in json.loads(form.data["rejected"]):
				discard_submission(item)
			connection.commit()
		data["pending"] = db.get_pending_submission_records(connection, 100)
		return render_template("manage.html", page_title=misc.page_title("manage"), data=data)
Exemple #3
0
def start_processing_thread():
    """Worker loop: reserve jobs from the queue and process them one by one.

    Each job body is a JSON payload whose ``type`` is either ``"screenshot"``
    or ``"preview"``. A fresh database connection is opened per job and always
    closed afterwards. On success the job is deleted from the queue. On any
    failure the job is deleted as well, the failure is recorded in the
    database (see ``_record_job_failure``), and the exception is re-raised,
    which ends this worker.

    Raises:
        Exception: Whatever the failed job raised, after cleanup.
    """
    # The client is used as a context manager so the queue socket is closed
    # when a failed job terminates this worker.
    with greenstalk.Client(host=os.getenv("GREENSTALK_HOST"),
                           port=os.getenv("GREENSTALK_PORT"),
                           watch=[os.getenv("GREENSTALK_TUBE_QUEUE")
                                  ]) as queue:
        while True:
            job = queue.reserve()

            # Reset per job so the error path never sees a stale (or unbound)
            # connection/payload when connecting or JSON parsing fails.
            connection = None
            payload = None
            try:
                connection = psycopg2.connect(
                    dbname=os.getenv("POSTGRESQL_DB"),
                    host=os.getenv("POSTGRESQL_HOST"),
                    port=os.getenv("POSTGRESQL_PORT"),
                    user=os.getenv("POSTGRESQL_USER"),
                    password=os.getenv("POSTGRESQL_PASS"))

                payload = json.loads(job.body)
                misc.log_message(f"Processing job: {payload}")

                # Process screenshot or preview.
                if payload["type"] == "screenshot":
                    process_screenshot(connection, payload)
                elif payload["type"] == "preview":
                    process_preview(connection, payload)
                else:
                    raise Exception(
                        f"""Invalid payload type: {payload["type"]}""")

                # Delete job.
                queue.delete(job)
                misc.log_message(f"Deleted job: {payload}")
            except Exception:
                # Delete job.
                queue.delete(job)
                misc.log_message(f"Deleting failed job: {payload}")
                # Database bookkeeping is only possible when both the
                # connection and a well-formed payload were obtained.
                if connection is not None and isinstance(payload, dict):
                    _record_job_failure(connection, payload)
                raise
            finally:
                if connection is not None:
                    connection.close()


def _record_job_failure(connection, payload):
    """Persist the outcome of a failed job for the given payload.

    A failed preview job has its submission record erased; a failed
    screenshot job has the error counter of its url record incremented.
    Payloads of any other type leave the database untouched.
    """
    # The failed job may have left the transaction aborted; roll back so the
    # statements below can run.
    connection.rollback()

    job_type = payload.get("type")
    if job_type == "preview":
        # Erase submission record if it exists. For preview jobs the id is a
        # submission id, so no url record is updated here.
        db.delete_submission_record(connection, payload["id"])
        connection.commit()
        misc.log_message(f"""Deleted submission record: {payload["id"]}""")
    elif job_type == "screenshot":
        # Update error count for url_id.
        record = db.get_url_record(connection, payload["id"])
        if record is None:
            return
        error_count = record["errors"] + 1
        db.update_url_record(connection, payload["id"],
                             {"errors": error_count})
        connection.commit()
        misc.log_message(
            f"""Updating errors for url_id {payload["id"]}: {error_count}""")
Exemple #4
0
def prune_expired_data_records(connection):
    """Mark expired data records as pruned and delete their screenshot files.

    The expiry delay is read from the WWW2PNG_SCREENSHOT_PRUNE_DELAY
    environment variable. For each expired record the ``pruned`` flag is set
    first, then the screenshot file is removed; a missing file is logged and
    does not stop the loop. No commit is issued here.

    Args:
        connection: Database connection passed through to the ``db`` helpers.
    """
    prune_delay = int(os.getenv("WWW2PNG_SCREENSHOT_PRUNE_DELAY"))
    for row in db.get_expired_data_records(connection, prune_delay):
        try:
            db.update_data_record(connection, row["id"], {"pruned": "true"})
            log_message(f'Pruned data record: {row["id"]}')
            screenshot_filename = ss.determine_screenshot_filename(
                row["request_id"])
            os.remove(screenshot_filename)
            log_message(f'Deleted screenshot: {screenshot_filename}')
        except FileNotFoundError:
            log_message(f'Failed to delete screenshot: {screenshot_filename}')
Exemple #5
0
        except FileNotFoundError:
            log_message(f'Failed to delete screenshot: {screenshot_filename}')


def prune_expired_unverified_user_records(connection):
    """Remove unverified user records that have passed their expiry delay.

    The delay comes from the WWW2PNG_UNVERIFIED_USER_PRUNE_DELAY environment
    variable (converted to int). Every deleted record is logged with its id
    and email. This function does not commit.

    Args:
        connection: Database connection passed through to the ``db`` helpers.
    """
    delay = int(os.getenv("WWW2PNG_UNVERIFIED_USER_PRUNE_DELAY"))
    for stale in db.get_expired_unverified_user_records(connection, delay):
        stale_id = stale["id"]
        db.delete_unverified_user_record(connection, stale_id)
        log_message(
            f'Pruned unverified user record: {stale_id} {stale["email"]}')


##### ENTRY POINT #####

if __name__ == "__main__":
    # Prune loop: run both prune passes, committing after each, then sleep
    # for WWW2PNG_PRUNE_LOOP_DELAY seconds before the next round.
    load_dotenv()
    with db.connect() as connection:
        while True:
            try:
                prune_expired_data_records(connection)
                connection.commit()
                prune_expired_unverified_user_records(connection)
                connection.commit()
            except Exception as err:
                # Nothing is buried in this script, so log what actually
                # failed instead of the former "Buried." message.
                log_message(f"Prune loop failed: {err!r}")
                # Bare raise keeps the original traceback intact.
                raise
            finally:
                # Flush on every pass so log output is not held in the buffer.
                sys.stdout.flush()
            time.sleep(int(os.getenv("WWW2PNG_PRUNE_LOOP_DELAY")))
Exemple #6
0
              ttr=int(os.getenv("ARCHIVE3_PROCESSING_TTR")))


# Scheduler loop. Each pass queues screenshot jobs for expired URL records
# (or one random URL when none expired and random queueing is enabled) and
# then walks the unprocessed submission records.
while True:
    # NOTE(review): a new connection is opened on every pass; no matching
    # close is visible in this part of the loop — confirm it is released.
    connection = db.connect()

    # Expiration age and error-retry limit both come from the environment.
    expired = db.get_expired_active_url_records(
        connection, int(os.getenv("ARCHIVE3_URL_EXPIRATION")),
        int(os.getenv("ARCHIVE3_URL_ERROR_RETRIES")))

    # Queue one random screenshot if nothing was expired, but only when
    # ARCHIVE3_QUEUE_RANDOM_URLS is set to "true".
    if len(expired) == 0 and os.getenv("ARCHIVE3_QUEUE_RANDOM_URLS") == "true":
        # NOTE(review): assumes a record is always returned; if the helper can
        # return None the subscripts below would fail — confirm.
        e = db.get_random_url_record(
            connection, int(os.getenv("ARCHIVE3_URL_ERROR_RETRIES")))
        queue_screenshot(connection, e["id"], e["url"])
        misc.log_message(f"""Queued Random Screenshot {e["id"]}: {e["url"]}""")

    # Queue all expired.
    for e in expired:
        queue_screenshot(connection, e["id"], e["url"])
        misc.log_message(f"""Queued Screenshot {e["id"]}: {e["url"]}""")

    # Process all unprocessed submission records.
    for p in db.get_unprocessed_submission_records(connection):
        # Stop once the processed-submission count reaches
        # ARCHIVE3_PROCESSING_READY_LIMIT; the count is re-queried on every
        # iteration.
        if db.get_processed_submission_record_count(
                connection)["count"] >= int(
                    os.getenv("ARCHIVE3_PROCESSING_READY_LIMIT")):
            break
        # A submission whose URL already exists is deleted right away.
        if db.check_url_exists(connection, p["url"]):
            db.delete_submission_record(connection, p["id"])
            connection.commit()
Exemple #7
0
def process_screenshot(connection, payload):
    """Capture, store and record a full screenshot for one URL.

    Steps, in order: fetch the URL once to confirm it answers with content,
    render the screenshot, build its block, move the image to the path
    derived from its data hash, create a thumbnail, optionally upload both
    files to S3, update the url record, create the data record, and
    optionally delete the local files.

    Args:
        connection: Open database connection; committed after each write.
        payload: Job payload; ``payload["url"]`` and ``payload["id"]`` are
            read.

    Raises:
        Exception: If the URL cannot be fetched or returns an empty body.
        requests.HTTPError: If the response carries an error status code.
    """
    # Validate URL
    misc.log_message(f"""Validating URL: {payload["url"]}""")
    try:
        headers = {"User-Agent": os.getenv("ARCHIVE3_USER_AGENT")}
        response = requests.get(payload["url"],
                                headers=headers,
                                timeout=int(os.getenv("ARCHIVE3_URL_TIMEOUT")))
        if len(response.content) < 1:
            raise Exception("No content received.")
    except Exception as err:
        # Chain the cause so the underlying failure stays visible.
        raise Exception(
            f"""Failure while loading URL: {payload["url"]}""") from err
    response.raise_for_status()

    # Generate Screenshot
    file_path_temp = ss.generate_screenshot({"url": payload["url"]})
    misc.log_message(f"Generated screenshot: {file_path_temp}")

    # Generate Block
    block = b.generate_screenshot_block(file_path_temp)
    data_hash = block["data_hash"]
    misc.log_message(f"Block created: {block['id']}")

    # Rename screenshot.
    target_dir = misc.hash_to_path(data_hash)
    file_path_screenshot = os.path.join(target_dir, data_hash + ".png")
    os.makedirs(os.path.dirname(file_path_screenshot), exist_ok=True)
    os.rename(file_path_temp, file_path_screenshot)
    misc.log_message(
        f"""Screenshot moved to destination: {file_path_screenshot}""")

    # Generate thumbnail.
    file_path_thumb = os.path.join(target_dir, data_hash + "_thumb.jpg")
    ss.generate_thumbnail(file_path_screenshot, file_path_thumb)
    misc.log_message(f"""Thumbnail created: {file_path_thumb}""")

    # Upload to S3.
    if os.getenv("AWS_S3_UPLOAD_ENABLED") == "true":
        s3 = boto3.resource(
            "s3",
            aws_access_key_id=os.getenv("AWS_S3_ACCESS_KEY_ID"),
            aws_secret_access_key=os.getenv("AWS_S3_SECRET_ACCESS_KEY"))
        bucket = s3.Bucket(os.getenv("AWS_S3_BUCKET_ID"))
        _upload_public_file(bucket, file_path_screenshot)
        _upload_public_file(bucket, file_path_thumb)

    # Update url record.
    data = {"timestamp_updated": "NOW()"}
    db.update_url_record(connection, payload["id"], data)
    connection.commit()
    misc.log_message(f"""Updated url record: {payload["id"]}""")

    # Create data record.
    data = {
        "url_id": payload["id"],
        "hash": data_hash,
        "block_id": block["id"],
    }
    data_record_id = db.create_data_record(connection, data)
    connection.commit()
    misc.log_message(f"""Created data record: {data_record_id}""")

    # Delete screenshots.
    if os.getenv("ARCHIVE3_DELETE_SCREENSHOTS") == "true":
        os.remove(file_path_screenshot)
        misc.log_message(
            f"""Deleted local screenshot: {file_path_screenshot}""")
        os.remove(file_path_thumb)
        misc.log_message(f"""Deleted local screenshot: {file_path_thumb}""")


def _upload_public_file(bucket, file_path):
    """Upload *file_path* to *bucket* under its basename, publicly readable.

    The content type is guessed from the uploaded file's own name, so the
    JPEG thumbnail is no longer labelled with the PNG screenshot's type. The
    file handle is closed as soon as the upload call returns.
    """
    key = os.path.basename(file_path)
    with open(file_path, "rb") as body:
        bucket.put_object(
            Key=key,
            Body=body,
            StorageClass=os.getenv("AWS_S3_STORAGE_CLASS"),
            ACL="public-read",
            ContentType=mimetypes.guess_type(file_path)[0])
    misc.log_message(
        f"""Uploaded to S3: {os.getenv("AWS_S3_CDN_BASE_URL")}{key}""")
Exemple #8
0
def process_preview(connection, payload):
    """Create the compressed preview image for a submitted URL.

    If the URL already exists, the submission record is deleted and nothing
    else happens. Otherwise the URL is fetched once to confirm it loads, a
    1280x720 screenshot is rendered, compressed to
    ``preview-<id>.jpg`` inside ARCHIVE3_SCREENSHOT_DIR, and the submission
    record is marked ready.

    Args:
        connection: Open database connection; committed after each write.
        payload: Job payload; ``payload["url"]`` and ``payload["id"]`` are
            read.

    Raises:
        Exception: If the URL cannot be fetched.
        requests.HTTPError: If the response carries an error status code.
    """
    # Check if URL already exists.
    if db.check_url_exists(connection, payload["url"]):
        db.delete_submission_record(connection, payload["id"])
        connection.commit()
        misc.log_message(f"""URL already exists: {payload["url"]}""")
        return

    # Validate URL
    misc.log_message(f"""Validating URL: {payload["url"]}""")
    try:
        headers = {"User-Agent": os.getenv("ARCHIVE3_USER_AGENT")}
        response = requests.get(payload["url"],
                                headers=headers,
                                timeout=int(os.getenv("ARCHIVE3_URL_TIMEOUT")))
    except Exception as err:
        # Chain the cause so the underlying failure stays visible.
        raise Exception(
            f"""Failure while loading URL: {payload["url"]}""") from err
    response.raise_for_status()

    # Generate screenshot.
    file_path_temp = ss.generate_screenshot({
        "url": payload["url"],
        "width": 1280,
        "height": 720,
        "delay": 2
    })
    misc.log_message(f"Generated screenshot: {file_path_temp}")

    # Compress screenshot.
    file_path_screenshot = os.path.join(
        os.getcwd(), os.getenv("ARCHIVE3_SCREENSHOT_DIR"),
        "preview-" + str(payload["id"]) + ".jpg")
    try:
        ss.compress_preview(file_path_temp, file_path_screenshot)
        misc.log_message(
            f"""Compressed screenshot created: {file_path_screenshot}""")
    finally:
        # Remove temp file, also when compression fails, so failed jobs do
        # not leave temp screenshots behind.
        os.remove(file_path_temp)
        misc.log_message(f"""Temp file deleted: {file_path_temp}""")

    # Update submission record.
    data = {"ready": "true"}
    db.update_submission_record(connection, payload["id"], data)
    connection.commit()
    misc.log_message(f"""Updated submission record: {payload["id"]}""")
Exemple #9
0
            error_count = record["errors"] + 1
            db.update_url_record(connection, payload["id"],
                                 {"errors": error_count})
            connection.commit()
            misc.log_message(
                f"""Updating errors for url_id {payload["id"]}: {error_count}"""
            )
            raise
        finally:
            connection.close()


##### ENTRY POINT #####

load_dotenv()

# Supervisor loop: keep up to ARCHIVE3_PROCESSING_THREADS daemon workers
# running. Each one-second pass starts at most one new worker and forgets at
# most one dead worker.
thread_count = int(os.getenv("ARCHIVE3_PROCESSING_THREADS"))
threads = []
while True:
    if thread_count > len(threads):
        thread = threading.Thread(target=start_processing_thread, daemon=True)
        thread.start()
        threads.append(thread)
        misc.log_message(f"Spawned thread: {thread.name}")

    # Pick the first worker that has exited, if there is one.
    dead_thread = next(
        (worker for worker in threads if not worker.is_alive()), None)
    if dead_thread is not None:
        misc.log_message(f"Removing dead thread: {dead_thread.name}")
        threads.remove(dead_thread)

    time.sleep(1)
from misc import log_message
import emails as e

##### ENTRY POINT #####

if __name__ == "__main__":
    # Action worker: reserve jobs from the actions tube and dispatch on the
    # payload's "action" field. A job that fails is buried and the exception
    # is re-raised, which ends the worker.
    load_dotenv()
    with greenstalk.Client(host=os.getenv("GREENSTALK_HOST"),
                           port=os.getenv("GREENSTALK_PORT"),
                           watch=[os.getenv("GREENSTALK_TUBE_ACTIONS")
                                  ]) as action_queue:
        while True:
            job = action_queue.reserve()
            try:
                payload = json.loads(job.body)
                if payload["action"] == "send_api_request_email":
                    email = payload["data"]["email"]
                    challenge = payload["data"]["challenge"]
                    e.send_api_request_email(email, challenge)
                    log_message(f"Send API Request Email to: {email}")
                else:
                    raise Exception("Invalid payload action")
                action_queue.delete(job)
            except Exception:
                # The exception is deliberately not bound to a name: the
                # former `as e` rebound (and then deleted) the module alias
                # `e` that refers to the emails module.
                action_queue.bury(job)
                log_message("Buried.")
                # Bare raise keeps the original traceback intact.
                raise
            finally:
                # Flush on every job so log output is not held in the buffer.
                sys.stdout.flush()