def report_file_processing_locked_and_exit():
    """ Creates a useful error report with information about the run time. """
    timedelta_since_last_run = FileProcessLock.get_time_since_locked()
    print("timedelta %s" % timedelta_since_last_run.total_seconds())
    if timedelta_since_last_run.total_seconds() > CELERY_ERROR_REPORT_TIMEOUT_SECONDS:
        error_msg = (
            "Data processing has overlapped with a prior data index run that started more than "
            "%s minutes ago.\nThat prior run has been going for %s hour(s), %s minute(s)"
        )
        error_msg = error_msg % (CELERY_ERROR_REPORT_TIMEOUT_SECONDS / 60,
                                 str(int(timedelta_since_last_run.total_seconds() / 60 / 60)),
                                 str(int(timedelta_since_last_run.total_seconds() / 60 % 60)))
        
        if timedelta_since_last_run.total_seconds() > CELERY_ERROR_REPORT_TIMEOUT_SECONDS * 4:
            error_msg = "DATA PROCESSING OVERLOADED, CHECK SERVER.\n" + error_msg
            email_system_administrators(error_msg, "DATA PROCESSING OVERLOADED, CHECK SERVER")
        raise ProcessingOverlapError(error_msg)
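
For context, a minimal sketch of how this check might be wired into a periodic processing task. It assumes FileProcessLock also exposes is_locked(), lock() and unlock() (method names assumed here; only get_time_since_locked appears above) and that process_file_chunks() does the actual work:

def data_processing_task():
    # Sketch only: if a previous run still holds the lock, report (and possibly email), then bail.
    if FileProcessLock.is_locked():
        report_file_processing_locked_and_exit()
    FileProcessLock.lock()
    try:
        process_file_chunks()
    finally:
        FileProcessLock.unlock()
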
Example #2
def do_email(study):
    email_system_administrators(
        study.name + " blew up while reindexing",
        "Go check on reindex operation.",
        source_email="*****@*****.**")
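
The email_system_administrators helper is project code; a hypothetical stand-in with the signature implied by the call sites on this page (message body, subject line, optional source_email keyword) could be used to run these snippets in isolation:

def email_system_administrators(message, subject, source_email=None):
    # Hypothetical stand-in only; the real helper presumably sends mail.
    print("[admin email] from=%s, subject=%r\n%s" % (source_email, subject, message))
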
Example #3
        "Go check on reindex operation.",
        source_email="*****@*****.**")


for study_id in studies:
    if isinstance(study_id, (str, unicode)):
        study_id = ObjectId(study_id)
    study = Study(study_id)
    print "============================================================="
    print "============================================================="
    print "============================================================="
    print "starting on %s, study id: %s" % (study.name, str(study_id))
    print "============================================================="
    print "============================================================="
    print "============================================================="

    try:
        reindex_study(study_id)
    except Exception:
        # Retry directly; process_file_chunks will raise an error if things fail on the second attempt.
        process_file_chunks()

    if FilesToProcess.count() != 0:
        do_email(study)
        raise Exception("stopped on " + str(study_id))

email_system_administrators("OMG IT FINISHED AND EVERYTHING IS DONE.",
                            "Go git checkout .; touch wsgi.py",
                            source_email="*****@*****.**")
Example #4
# Standard-library imports used by this snippet; batch_retrieve_s3, determine_file_name,
# StreamingBytesIO and email_system_administrators are project helpers defined elsewhere.
import json
from multiprocessing.pool import ThreadPool
from zipfile import ZipFile, ZIP_STORED


def zip_generator(files_list, construct_registry=False):
    """ Pulls in data from S3 in a multithreaded network operation and constructs a zip file of
    that data.  This is a generator; the advantage is that it starts returning data (file by
    file, but wrapped in zip compression) almost immediately. """

    processed_files = set()
    duplicate_files = set()
    pool = ThreadPool(3)
    # Three threads has been heuristically determined to be a good value: it does not overload
    # the server and provides more-or-less the maximum data download speed.  This was tested
    # on an m4.large instance (dual core, 8GB of RAM).
    file_registry = {}

    zip_output = StreamingBytesIO()
    zip_input = ZipFile(zip_output,
                        mode="w",
                        compression=ZIP_STORED,
                        allowZip64=True)
    # random_id = generate_random_string()[:32]
    # print "returning data for query %s" % random_id
    try:
        # chunks_and_content is an iterator of tuples of the chunk and the content of the file.
        # chunksize (which is a keyword argument of imap, not to be confused with Beiwe Chunks)
        # is the size of the batches that are handed to the pool.  We always want to add the next
        # file to retrieve to the pool asap, so we want a chunk size of 1.
        # (The documentation mentions a timeout; it is irrelevant under this construction.)
        chunks_and_content = pool.imap_unordered(batch_retrieve_s3,
                                                 files_list,
                                                 chunksize=1)
        total_size = 0
        for chunk, file_contents in chunks_and_content:
            if construct_registry:
                file_registry[chunk['chunk_path']] = chunk["chunk_hash"]
            file_name = determine_file_name(chunk)
            if file_name in processed_files:
                duplicate_files.add((file_name, chunk['chunk_path']))
                continue
            processed_files.add(file_name)
            zip_input.writestr(file_name, file_contents)
            # These can be large, and we don't want them sticking around in memory as we wait for the yield
            del file_contents, chunk
            # print len(zip_output)
            x = zip_output.getvalue()
            total_size += len(x)
            # print "%s: %sK, %sM" % (random_id, total_size / 1024, total_size / 1024 / 1024)
            yield x  # yield the (compressed) file information
            del x
            zip_output.empty()

        if construct_registry:
            zip_input.writestr("registry", json.dumps(file_registry))

        # close, then yield all remaining data in the zip.
        zip_input.close()
        yield zip_output.getvalue()

    except Exception:
        # The try-except-finally block is here to guarantee the ThreadPool is closed and terminated.
        # We don't handle any errors, we just re-raise anything that shows up.
        # (A with statement does not work here.)
        raise
    finally:
        # We rely on the finally block to ensure that the threadpool will be closed and terminated,
        # and also to report (via email) any duplicate files that were encountered.
        pool.close()
        pool.terminate()
        if duplicate_files:
            duplicate_file_message = "encountered duplicate files: %s" % ",".join(
                str(name_path) for name_path in duplicate_files)
            email_system_administrators(
                duplicate_file_message,
                "encountered duplicate files in a data download")