Example #1
def update_run_report(date_folders):
    """
    List of date folders in the form MM_DD_YY that you want to update.

    @param date_folders:
    """
    # fetch utags in run report collection
    db_utags = _DB_CONNECTOR.distinct(RUN_REPORT_COLLECTION, UTAG)

    if os.path.isdir(RUN_REPORT_PATH):

        reports = list()
        for folder in date_folders:
            path = os.path.join(RUN_REPORT_PATH, folder)
            if not os.path.isdir(path):
                continue

            date_obj = datetime.strptime(folder, '%m_%d_%y')

            for sf in os.listdir(path):
                report_file_path = get_run_info_path(path, sf)
                if report_file_path is None: continue

                utag = set_utag(date_obj, sf)
                if utag not in db_utags: # if it does not exist, insert it into the collection
                    log_data = read_report_file(report_file_path, date_obj, utag)
                    if log_data is None:
                        log_data = {DATETIME: date_obj, UTAG: utag}
                    if IMAGE_STACKS in log_data:
                        hdf5_datasets = get_hdf5_datasets(log_data, folder, sf)

                        log_data[IMAGE_STACKS].extend(hdf5_datasets)

                    reports.append(log_data)
                    print report_file_path
                else: # if it exists, check the HDF5 collection for new datasets
                    log_data = _DB_CONNECTOR.find_one(RUN_REPORT_COLLECTION, UTAG, utag)

                    # If previously a run report was missing or had the wrong format,
                    # the mongo document has only three fields: _id, datetime, and
                    # unique_tag. If this occurs, try reading the run report again.
                    if len(log_data.keys()) == 3:
                        log_data = read_report_file(report_file_path, date_obj, utag)

                    if log_data is not None and IMAGE_STACKS in log_data:
                        hdf5_datasets = get_hdf5_datasets(log_data, folder, sf)
                        exist_datasets = log_data[IMAGE_STACKS]

                        if set(hdf5_datasets) - set(exist_datasets):
                            updated_datasets = list(set(hdf5_datasets) | set(exist_datasets))
                            _DB_CONNECTOR.update(
                                    RUN_REPORT_COLLECTION,
                                    {UTAG: utag},
                                    {"$set": {IMAGE_STACKS: updated_datasets}})

        APP_LOGGER.info("Found %d run reports" % (len(reports)))
        if len(reports) > 0:
            # There is a possible race condition here. Ideally these operations
            # would be performed in concert atomically
            _DB_CONNECTOR.insert(RUN_REPORT_COLLECTION, reports)
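
The MM_DD_YY folder names above are parsed with datetime.strptime; a minimal standalone sketch of that parsing (the folder name is made up):

from datetime import datetime

# '03_27_19' is a hypothetical folder name following the MM_DD_YY convention
# expected by update_run_report(); strptime turns it into a datetime object.
date_obj = datetime.strptime('03_27_19', '%m_%d_%y')
print(date_obj)  # 2019-03-27 00:00:00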
Example #2
def parse_pa_data_src(pa_data_src_name):
    """
    Determine the primary analysis data source type (HDF5 or image stack) and
    return a list of (data source name, is_hdf5) tuples.

    @param pa_data_src_name:    String, name of data source, could be either
                                the HDF5 dataset name or a folder name containing
                                image stacks
    @return:                    A list of tuples, each tuple contains the primary analysis
                                datasource name and a bool indicating whether or not it is HDF5.
    """
    # archives is a list of (data source name, is_hdf5) tuples
    archives = list()
    if is_hdf5_archive(pa_data_src_name):
        archives.append((pa_data_src_name, True))
        APP_LOGGER.info('%s is an HDF5 file.' % pa_data_src_name)
    elif is_image_archive(pa_data_src_name):
        image_archive_paths = io_utilities.get_archive_dirs(
            pa_data_src_name, min_num_images=PA_MIN_NUM_IMAGES)
        for img_src_name in image_archive_paths:
            archives.append((
                img_src_name,
                False,
            ))
        APP_LOGGER.info('%s is an image stack.' % pa_data_src_name)
    else:
        raise Exception(
            'Unable to determine if %s is an image stack or HDF5 file.' %
            pa_data_src_name)

    return archives
Example #3
def update_image_stacks(log_data, data_folder):
    """
    Check whether the image_stacks in a run report document exist in archive collection.
    If not, add them to database.

    @param log_data:            run report document parsed from the YAML file
    @param data_folder:         folder where the data is located
    """
    if log_data is None or IMAGE_STACKS not in log_data: return

    new_records = list()
    for image_stack in log_data[IMAGE_STACKS]:
        exist_record = _DB_CONNECTOR.find_one(ARCHIVES_COLLECTION, ARCHIVE,
                                              image_stack)
        if not exist_record:
            for folder in [ARCHIVES_PATH, data_folder]:
                archive_path = os.path.join(folder, image_stack)
                if os.path.isdir(archive_path):
                    new_records.append({
                        ARCHIVE: image_stack,
                        ARCHIVE_PATH: remove_disk_directory(archive_path),
                    })
                    break

    if new_records:
        APP_LOGGER.info('Found %d image stacks: %s' %
                        (len(new_records), new_records))
        _DB_CONNECTOR.insert(ARCHIVES_COLLECTION, new_records)
Example #4
    def process_request(cls, params_dict):
        dataset = params_dict[cls.dataset_parameter][0]
        report_uuid = params_dict[cls.report_uuid_parameter][0]

        http_status_code = 200
        json_response = {RUN_REPORT_UUID: report_uuid, HDF5_DATASET: dataset}

        try:
            cls._DB_CONNECTOR.update(
                RUN_REPORT_COLLECTION, {UUID: report_uuid},
                {'$pull': {
                    IMAGE_STACKS: {
                        'name': dataset,
                        'upload': True
                    }
                }})
            cls._DB_CONNECTOR.remove(HDF5_COLLECTION, {HDF5_DATASET: dataset})
            json_response.update({"unassociate": True})
            APP_LOGGER.info("Removed dataset name=%s from run report uuid=%s" %
                            (dataset, report_uuid))
        except:
            APP_LOGGER.exception(traceback.format_exc())
            json_response[ERROR] = str(sys.exc_info()[1])
            http_status_code = 500

        return json_response, http_status_code
Example #5
    def process_request(cls, params_dict):
        tags = [t for t in params_dict[cls.tags_parameter] if t]
        report_uuid = params_dict[cls.report_uuid_parameter][0]

        http_status_code = 200
        json_response = {RUN_REPORT_UUID: report_uuid, TAGS: tags}

        try:
            cls._DB_CONNECTOR.update(RUN_REPORT_COLLECTION,
                                     {UUID: report_uuid},
                                     {'$addToSet': {
                                         TAGS: {
                                             '$each': tags
                                         }
                                     }})
            APP_LOGGER.info("Updated run report uuid=%s with tags %s." %
                            (report_uuid, tags))

            json_response[STATUS] = SUCCEEDED
        except:
            APP_LOGGER.exception(traceback.format_exc())
            json_response[STATUS] = FAILED
            json_response[ERROR] = str(sys.exc_info()[1])
            http_status_code = 500

        return make_clean_response(json_response, http_status_code)
Example #6
    def _generate(self, ndyes, nchoose=5):
        """
        @param ndyes:   Integer, number of dyes to use per solution
        @param nchoose: Integer, maximum number of combinations that will be further optimized
        """
        # check whether the requested number of barcodes is achievable given the minimum and maximum dye levels
        min_nbarcodes = numpy.product(self._barcode_min_nlvls[numpy.argsort(self._barcode_min_nlvls)[:ndyes]])
        max_nbarcodes = numpy.product(self._barcode_max_nlvls[numpy.argsort(self._barcode_max_nlvls)[-ndyes:]])

        # too many dyes were selected
        if min_nbarcodes > self._requested_nbarcodes:
            APP_LOGGER.info('Cannot generate requested number of barcodes (%d).  '
                            'Smallest library would have %d barcodes.' %
                            (self._requested_nbarcodes, min_nbarcodes))
            return

        # too few dyes were selected
        if max_nbarcodes < self._requested_nbarcodes:
            APP_LOGGER.info('Cannot generate requested number of barcodes (%d).  '
                            'Largest library would have %d barcodes.' %
                            (self._requested_nbarcodes, max_nbarcodes))
            return

        # find the optimal number of levels for each dye combination
        requested_dye_idxs = set(range(len(self._requested_dye_lots)))
        optimal_nlvls = list()
        for dye_idxs in itertools.combinations(xrange(len(self._barcode_profiles)), ndyes):
            dye_idxs = numpy.array(dye_idxs)

            # ignore combinations that do not include requested dyes
            if self.need_additional_db_dyes and \
                    self._requested_dye_lots and \
                    not requested_dye_idxs.issubset(dye_idxs):
                continue

            # ignore combinations in which the peaks are too close
            peaks = numpy.concatenate((self._barcode_peaks[dye_idxs], self._non_barcode_peaks))
            if numpy.any(numpy.diff(numpy.sort(peaks)) < self._min_peak_difference):
                continue

            try:
                candidate_nlvls, candidate_lowest_peak = self._calc_optimal_nlvls(dye_idxs)
                optimal_nlvls.append((candidate_lowest_peak, dye_idxs, candidate_nlvls))
            except Exception as e:
                APP_LOGGER.exception(e)

        optimal_nlvls.sort(key=lambda x: x[0])

        for _, dye_idxs, nlvls in optimal_nlvls[: nchoose]:
            try:
                self._make_design(nlvls, dye_idxs)
            except Exception as e:
                APP_LOGGER.exception(e)
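
The loop above discards any dye combination whose sorted emission peaks are closer together than the minimum separation. A small self-contained sketch of that spacing check (peak positions and threshold are made up):

import itertools
import numpy

# Hypothetical peak positions and minimum peak separation.
peaks = numpy.array([10.0, 22.0, 25.0, 40.0, 55.0])
min_peak_difference = 5.0

# Keep only 3-dye combinations whose sorted peaks are all far enough apart.
for dye_idxs in itertools.combinations(range(len(peaks)), 3):
    selected = numpy.sort(peaks[numpy.array(dye_idxs)])
    if numpy.any(numpy.diff(selected) < min_peak_difference):
        continue  # peaks too close, skip this combination
    print(dye_idxs)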
Example #7
def get_hdf5_datasets(log_data, data_folder):
    """
    Fetch the HDF5 archives associated with a run report.

    @param log_data:            run report document parsed from the YAML file
    @param data_folder:         folder where the data is located
    @return:                    set of HDF5 dataset names associated with the report
    """
    if log_data is None or RUN_ID not in log_data: return set()

    run_id = log_data[RUN_ID]
    hdf5_paths = [
        os.path.join(data_folder, f + '.h5')
        for f in [run_id, run_id + '-baseline']
        if os.path.isfile(os.path.join(data_folder, f + '.h5'))
    ]
    all_datasets = set()

    for path in hdf5_paths:
        exist_records = _DB_CONNECTOR.find(
            HDF5_COLLECTION, {HDF5_PATH: remove_disk_directory(path)})
        if exist_records:
            all_datasets.update(set(r[HDF5_DATASET] for r in exist_records))
            continue

        new_records = list()
        try:
            with h5py.File(path) as h5_file:
                dataset_names = h5_file.keys()
            for dsname in dataset_names:
                if re.match(r'^\d{4}-\d{2}-\d{2}_\d{4}\.\d{2}', dsname):
                    new_records.append({
                        HDF5_PATH: remove_disk_directory(path),
                        HDF5_DATASET: dsname,
                    })
        except:
            APP_LOGGER.exception(
                'Unable to get dataset information from HDF5 file: %s' % path)

        if new_records:
            APP_LOGGER.info('Found %d datasets from HDF5 file: %s' %
                            (len(new_records), path))
            _DB_CONNECTOR.insert(HDF5_COLLECTION, new_records)
            all_datasets.update(set(r[HDF5_DATASET] for r in new_records))

    return all_datasets
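
Dataset names are filtered with the timestamp regex shown above; a quick standalone check of which names match (the sample names are made up):

import re

PATTERN = r'^\d{4}-\d{2}-\d{2}_\d{4}\.\d{2}'

# The first two hypothetical names match the timestamp pattern, the last does not.
for name in ['2019-03-27_1412.05', '2019-03-27_1412.05-baseline', 'calibration']:
    print('%s -> %s' % (name, bool(re.match(PATTERN, name))))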
Example #8
def get_variants(exp_def_name):
    """
    Return a list of variants in the experiment definition file.
    """
    APP_LOGGER.info("Retrieving list of variants from %s" % (exp_def_name, ))

    exp_def_doc = _DB_CONNECTOR.find_one(EXP_DEF_COLLECTION, NAME,
                                         exp_def_name)
    if exp_def_doc is not None:
        APP_LOGGER.info(
            "Experiment definition %s found in EXP_DEF_COLLECTION." %
            (exp_def_name, ))
        return exp_def_doc[VARIANTS]

    APP_LOGGER.debug(
        "Failed to find experiment definition %s from EXP_DEF_COLLECTION." %
        (exp_def_name, ))
    return []
Example #9
def get_experiment_defintions():
    """
    Retrieve experiment definitions from EXP_DEF_COLLECTION.
    """
    columns = OrderedDict()
    columns[ID] = 0
    columns[UUID] = 1
    columns[NAME] = 1
    columns[VARIANTS] = 1
    columns[DYES] = 1
    columns[TYPE] = 1

    column_names = columns.keys()
    column_names.remove(ID)

    exp_defs = _DB_CONNECTOR.find(EXP_DEF_COLLECTION, {}, columns)
    APP_LOGGER.info('Retrieved %d experiment definitions.' \
                    % (len(exp_defs), ))
    return (exp_defs, column_names, None)
Example #10
    def process_request(cls, params_dict):
        tag = params_dict[cls.tag_parameter][0]
        report_uuid = params_dict[cls.report_uuid_parameter][0]

        http_status_code = 200
        json_response = {RUN_REPORT_UUID: report_uuid, TAGS: [tag]}

        try:
            cls._DB_CONNECTOR.update(RUN_REPORT_COLLECTION,
                                     {UUID: report_uuid},
                                     {'$pull': {
                                         TAGS: tag
                                     }})
            json_response[STATUS] = SUCCEEDED
            APP_LOGGER.info("Removed tag name=%s from run report uuid=%s" %
                            (tag, report_uuid))
        except:
            APP_LOGGER.exception(traceback.format_exc())
            json_response[ERROR] = str(sys.exc_info()[1])
            json_response[STATUS] = FAILED
            http_status_code = 500

        return json_response, http_status_code
Example #11
 def process_request(cls, params_dict, del_file_keys=(RESULT,)):
     response         = {}
     http_status_code = 200
     
     uuids     = params_dict[ParameterFactory.job_uuid(cls.get_collection())]
     criteria  = {UUID: {"$in": uuids}}
     
     APP_LOGGER.info("Deleting the following jobs: %s" % ",".join(uuids))
     records = cls._DB_CONNECTOR.find(cls.get_collection(), criteria, 
                                      {ID:0})
     response["deleted"] = {}
     if len(records) > 0:
         # Keep a copy of each record being deleted
         for record in records:
             response["deleted"][record[UUID]] = record
         
         # Delete records from database
         result = cls._DB_CONNECTOR.remove(cls.get_collection(), 
                                           criteria)
         
         # Delete files from disk only if removal from DB was successful
         if result and result['n'] == len(response["deleted"]):
             for _,record in response["deleted"].iteritems():
                 for key in del_file_keys:
                     file_path = record.get(key, None)
                     if file_path is not None and os.path.isfile(file_path):
                         os.remove(file_path)
         else:
             del response["deleted"]
             raise Exception("Error deleting records from the " \
                             "database: %s" % result)
         APP_LOGGER.info("Successfully deleted the following jobs: %s" \
                         % ",".join(uuids))
     else:
         http_status_code = 404
         
     return response, http_status_code
Example #12
        def gen_dye_scatterplot(dyes, sys_listener_path):
            try:
                analysis_df = pandas.read_table(self.analysis_file,
                                                sep=sniff_delimiter(
                                                    self.analysis_file))
                ac_df = pandas.read_table(self.tmp_outfile_path,
                                          sep=sniff_delimiter(
                                              self.tmp_outfile_path))
                analysis_df['assay'] = False
                analysis_df.loc[analysis_df['identity'].notnull(),
                                'assay'] = ac_df['assay'].values

                # System listener inputs
                dyn_align_offsets = {}
                temps = {}
                steps = {}
                if sys_listener_path is not None:
                    sys_listener_dir = os.path.dirname(sys_listener_path)
                    clamp_temp_tp = ClampTempTopicParser()
                    old_channel_offset_tp = OldChannelOffsetTopicParser()
                    channel_offset_tp = ChannelOffsetTopicParser()
                    dyn_align_steps_tp = DynamicAlignStepsParser()
                    topic_parsers = [
                        clamp_temp_tp, old_channel_offset_tp,
                        channel_offset_tp, dyn_align_steps_tp
                    ]
                    sys_listener_parser = SystemListenerParser(
                        sys_listener_dir, topic_parsers=topic_parsers)
                    temps = sys_listener_parser.get_topic_results(
                        clamp_temp_tp.topic)
                    dyn_align_offsets = sys_listener_parser.get_topic_results(
                        channel_offset_tp.topic)
                    if len(dyn_align_offsets) < 1:
                        APP_LOGGER.info("Using old channel offset parser...")
                        dyn_align_offsets = sys_listener_parser.get_topic_results(
                            old_channel_offset_tp.topic)
                    else:
                        APP_LOGGER.info("Using new channel offset parser...")
                    steps = sys_listener_parser.get_topic_results(
                        dyn_align_steps_tp.topic)

                generate_dye_scatterplots(analysis_df,
                                          dyes,
                                          self.tmp_dyes_plot_path,
                                          self.job_name,
                                          self.pico1_dye,
                                          dyn_align_offsets=dyn_align_offsets,
                                          temps=temps,
                                          steps=steps)
                shutil.copy(self.tmp_dyes_plot_path, self.dyes_plot_path)
                APP_LOGGER.info("Dyes scatter plot generated for %s." % \
                    self.job_name)
            except:
                APP_LOGGER.exception("Dyes scatter plot generation failed.")
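
sniff_delimiter above is a project helper that is not shown here; assuming it simply guesses the column separator of a text file, a rough stdlib equivalent based on csv.Sniffer might look like this sketch:

import csv

def sniff_delimiter_sketch(path, candidates=',\t'):
    # Guess the column delimiter from a sample of the file; fall back to tab,
    # which matches pandas.read_table's default separator.
    with open(path, 'r') as fh:
        sample = fh.read(4096)
    try:
        return csv.Sniffer().sniff(sample, delimiters=candidates).delimiter
    except csv.Error:
        return '\t'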
Example #13
def update_archives():
    '''
    Update the database with available primary analysis archives.  It is not
    an error if zero archives are available at this moment.

    @return True if database is successfully updated, False otherwise
    '''
    APP_LOGGER.info("Updating database with available archives...")
    exist_archives = _DB_CONNECTOR.distinct(ARCHIVES_COLLECTION, ARCHIVE)
    if os.path.isdir(ARCHIVES_PATH):
        # Remove archives named similarly (same name, different capitalization)
        archives = io_utilities.get_subfolders(ARCHIVES_PATH)

        # Check yyyy_mm/dd/HHMM_pilotX location
        run_folders = get_run_folders()
        for folder in run_folders:
            archives.extend(io_utilities.get_subfolders(folder))

        new_archives = [
            x for x in archives if os.path.basename(x) not in exist_archives
        ]
        records = [{
            ARCHIVE: os.path.basename(archive),
            ARCHIVE_PATH: remove_disk_directory(archive)
        } for archive in new_archives]

        APP_LOGGER.info("Found %d archives" % (len(records)))
        if len(records) > 0:
            # There is a possible race condition here. Ideally these operations
            # would be performed in concert atomically
            _DB_CONNECTOR.insert(ARCHIVES_COLLECTION, records)
    else:
        APP_LOGGER.error(
            "Couldn't locate archives path '%s', to update database." %
            ARCHIVES_PATH)
        return False

    APP_LOGGER.info("Database successfully updated with available archives.")
    return True
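
Only archives whose basename is not already in the collection get inserted; a toy version of that filter with made-up names:

import os

exist_archives = ['run_A', 'run_B']                # names already in ARCHIVES_COLLECTION
archives = ['/archives/run_A', '/archives/run_C']  # hypothetical archive paths

new_archives = [x for x in archives
                if os.path.basename(x) not in exist_archives]
print(new_archives)  # ['/archives/run_C']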
Example #14
def update_dyes():
    '''
    Update the database with available dyes.

    @return True if database is successfully updated, False otherwise
    '''
    APP_LOGGER.info("Updating database with available dyes...")
    try:
        records = [{DYE: dye} for dye in _DATASTORE.dyes()]

        assert len(records) > 0, "Internal error: No dyes found"
        # There is a possible race condition here. Ideally these operations
        # would be performed in concert atomically
        _DB_CONNECTOR.remove(DYES_COLLECTION, {})
        _DB_CONNECTOR.insert(DYES_COLLECTION, records)
    except:
        APP_LOGGER.info("Failed to update database with available dyes: %s",
                        str(sys.exc_info()))
        raise

    APP_LOGGER.info("Database successfully updated with available dyes.")
    return True
Example #15
    def process_request(cls, params_dict):
        filenames = params_dict[cls.filenames_parameter]
        report_uuid = params_dict[cls.report_uuid_parameter][0]

        http_status_code = 200
        json_response = {RUN_REPORT_UUID: report_uuid, FILENAMES: filenames}

        filepaths = [
            os.path.join(MODIFIED_ARCHIVES_PATH, secure_filename(fn))
            for fn in filenames
        ]
        if not filenames or not report_uuid or not all(
                allowed_file(fp) for fp in filepaths):
            http_status_code = 400
        elif any(
                cls._DB_CONNECTOR.find_one(HDF5_COLLECTION, HDF5_PATH,
                                           {'$regex': fn + '$'}) is not None
                for fn in filenames):
            http_status_code = 403
        else:
            try:
                fp_to_datasets, duplicate = get_datasets_from_files(filepaths)
                if not fp_to_datasets or duplicate:
                    http_status_code = 403
                else:
                    new_hdf5_records = [
                        {HDF5_PATH: fp, HDF5_DATASET: dsname, "upload": True}
                        for fp in fp_to_datasets
                        for dsname in fp_to_datasets[fp]
                    ]
                    cls._DB_CONNECTOR.insert(HDF5_COLLECTION, new_hdf5_records)
                    APP_LOGGER.info('Updated database with %d new HDF5 files' %
                                    len(new_hdf5_records))

                    run_report = cls._DB_CONNECTOR.find_one(
                        RUN_REPORT_COLLECTION, UUID, report_uuid)
                    if run_report:
                        exist_datasets = set([
                            d for d in run_report[IMAGE_STACKS]
                            if isinstance(d, str) or isinstance(d, unicode)
                        ])
                        new_datasets = set()
                        for datasets in fp_to_datasets.values():
                            new_datasets = new_datasets | datasets
                        new_datasets = list(new_datasets - exist_datasets)
                        if new_datasets:
                            cls._DB_CONNECTOR.update(
                                RUN_REPORT_COLLECTION, {UUID: report_uuid}, {
                                    '$addToSet': {
                                        IMAGE_STACKS: {
                                            '$each': [{
                                                'name': d,
                                                'upload': True
                                            } for d in new_datasets]
                                        }
                                    }
                                })
                            APP_LOGGER.info(
                                "Updated run report uuid=%s with %d HDF5 datasets."
                                % (report_uuid, len(new_datasets)))

                        del run_report[ID]
                        json_response.update({
                            "run_report": run_report,
                            "uploaded": new_datasets
                        })
                    else:
                        json_response.update({
                            "error":
                            "Run report uuid=%s does not exist." % report_uuid
                        })
            except:
                APP_LOGGER.exception(traceback.format_exc())
                json_response[ERROR] = str(sys.exc_info()[1])
                http_status_code = 500

        return make_clean_response(json_response, http_status_code)
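
The dataset bookkeeping above is plain set arithmetic: union the per-file datasets, then subtract the names already attached to the report. A toy run with made-up names:

fp_to_datasets = {'a.h5': set(['2019-03-27_1412.05']),
                  'b.h5': set(['2019-03-27_1415.10', '2019-03-27_1412.05'])}
exist_datasets = set(['2019-03-27_1412.05'])

new_datasets = set()
for datasets in fp_to_datasets.values():
    new_datasets = new_datasets | datasets
new_datasets = list(new_datasets - exist_datasets)
print(new_datasets)  # ['2019-03-27_1415.10']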
Example #16
def get_run_reports(cartridge_sn=None):
    """
    Retrieve a list of run reports.
    """
    columns = OrderedDict()
    columns[ID] = 0
    columns[UUID] = 1
    columns[DATETIME] = 1
    columns[DEVICE_NAME] = 1
    columns[EXP_DEF_NAME] = 1
    columns[RUN_DESCRIPTION] = 1
    columns[SAMPLE_NAME] = 1
    columns[CARTRIDGE_SN] = 1
    columns[CARTRIDGE_SN_OLD] = 1
    columns[CARTRIDGE_BC] = 1
    columns[IMAGE_STACKS] = 1
    columns[PICO1_DYE] = 1
    columns[EXPERIMENT_CONFIGS] = 1
    columns[TAGS] = 1

    column_names = columns.keys()
    column_names.remove(ID)

    query = {
        UUID: {
            '$exists': True
        },
        DEVICE_NAME: {
            '$ne': ''
        },
        EXP_DEF_NAME: {
            '$ne': None
        },
        IMAGE_STACKS: {
            '$ne': None,
            '$not': {
                '$size': 0
            }
        }
    }
    if cartridge_sn is not None:
        query.update({
            '$or': [{
                CARTRIDGE_SN: cartridge_sn
            }, {
                CARTRIDGE_SN_OLD: cartridge_sn
            }, {
                '{0}.{1}'.format(CARTRIDGE_BC, 'serial_num'): cartridge_sn
            }]
        })

    reports = _DB_CONNECTOR.find(RUN_REPORT_COLLECTION, query, columns)
    APP_LOGGER.info('Retrieved %d run reports with image stack(s)' \
                    % (len(reports), ))

    if reports:
        all_jobs = _DB_CONNECTOR.find(FA_PROCESS_COLLECTION, {})
        job_map = defaultdict(list)
        for job in all_jobs:
            job_map[job[ARCHIVE]].append(job)

        for report in reports:
            report[DATA_TO_JOBS] = dict()
            for archive in report[IMAGE_STACKS]:
                archive_name = (archive['name']
                                if isinstance(archive, dict) else archive)

                job_status = {STATUS: 'not processed', 'job_uuids': list()}
                jobs = job_map.get(archive_name, list())
                if jobs:
                    if any(j[STATUS] == RUNNING for j in jobs):
                        job_status[STATUS] = RUNNING
                    elif any(j[STATUS] == SUBMITTED for j in jobs):
                        job_status[STATUS] = SUBMITTED
                    elif any(j[STATUS] == SUCCEEDED for j in jobs):
                        job_status[STATUS] = SUCCEEDED
                    else:
                        job_status[STATUS] = FAILED
                    job_status['job_uuids'] = [j[UUID] for j in jobs]
                report[DATA_TO_JOBS][archive_name] = job_status
    return (reports, column_names, None)
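
The per-archive rollup above reduces a list of job documents to one status using the precedence running > submitted > succeeded > failed. A standalone sketch of that reduction, with plain strings standing in for the STATUS/RUNNING/SUBMITTED/SUCCEEDED/FAILED constants:

def rollup_status(jobs):
    # jobs is a list of dicts such as [{'status': 'failed'}, ...]
    statuses = [j['status'] for j in jobs]
    if not statuses:
        return 'not processed'
    for status in ('running', 'submitted', 'succeeded'):
        if status in statuses:
            return status
    return 'failed'

print(rollup_status([{'status': 'failed'}, {'status': 'submitted'}]))  # submitted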
Example #17
def update_run_reports(date_folders=None):
    '''
    Update the database with available run reports.  It is not an error
    if zero reports are available at this moment.

    @return True if database is successfully updated, False otherwise
    '''
    APP_LOGGER.info("Updating database with available run reports...")

    # fetch utags from run report collection
    db_utags = _DB_CONNECTOR.distinct(RUN_REPORT_COLLECTION, UTAG)

    if os.path.isdir(RUN_REPORT_PATH):
        if date_folders is None:
            try:
                latest_date = _DB_CONNECTOR.find_max(RUN_REPORT_COLLECTION,
                                                     DATETIME)[DATETIME]
            except TypeError:
                latest_date = datetime.now()

            def valid_date(folder):
                date_obj = get_date_object(folder)
                return date_obj >= latest_date - timedelta(days=6)

            date_folders = [
                folder for folder in os.listdir(RUN_REPORT_PATH)
                if re.match(r'\d{2}_\d{2}_\d{2}', folder) and valid_date(folder)
            ]

            # New file location
            new_date_folders = get_date_folders()
            date_folders.extend(f for f in new_date_folders if valid_date(f))

        date_folders = [os.path.join(RUN_REPORT_PATH, f) for f in date_folders]
        date_folders = [f for f in date_folders if os.path.isdir(f)]

        reports = list()
        for folder in date_folders:
            for sf in os.listdir(folder):
                report_file_path = get_run_info_path(folder, sf)
                if report_file_path is None: continue

                date_obj = get_date_object(folder)
                data_folder = os.path.join(RUN_REPORT_PATH, folder, sf)

                utag = set_utag(date_obj, sf)
                if utag not in db_utags:  # if it does not exist, insert it into the collection
                    log_data = read_report_file(report_file_path, date_obj,
                                                utag)
                    if log_data is None or all(
                            not log_data[DEVICE_NAME].lower().startswith(x)
                            for x in ['pilot', 'beta']):
                        log_data = {DATETIME: date_obj, UTAG: utag}
                    if IMAGE_STACKS in log_data:
                        # add image stacks to archive collection
                        update_image_stacks(log_data, data_folder)
                        # find HDF5 datasets and add them to HDF5 collection
                        hdf5_datasets = get_hdf5_datasets(
                            log_data, data_folder)
                        log_data[IMAGE_STACKS].extend(hdf5_datasets)
                    # add report directory path
                    log_data[DIR_PATH] = remove_disk_directory(
                        os.path.dirname(report_file_path))
                    reports.append(log_data)
                else:  # if it exists, check the HDF5 collection for new datasets
                    log_data = _DB_CONNECTOR.find_one(RUN_REPORT_COLLECTION,
                                                      UTAG, utag)

                    # If previously a run report was missing or had the wrong format,
                    # the mongo document has only three or four fields: _id, datetime,
                    # unique_tag, and possibly dir_path. If this occurs, try reading the
                    # run report again.
                    if not set(log_data.keys()) - set(
                        [ID, DATETIME, UTAG, DIR_PATH]):
                        log_data = read_report_file(report_file_path, date_obj,
                                                    utag)
                        if log_data is None or all(
                                not log_data[DEVICE_NAME].lower().startswith(x)
                                for x in ['pilot', 'beta']):
                            continue
                        # add report directory path
                        log_data[DIR_PATH] = remove_disk_directory(
                            os.path.dirname(report_file_path))
                        # add image stacks to archive collection
                        update_image_stacks(log_data, data_folder)

                    if IMAGE_STACKS in log_data:
                        # find HDF5 datasets and add new records to HDF5 collection
                        new_datasets = set(
                            get_hdf5_datasets(log_data, data_folder))
                        if new_datasets:
                            # exclude uploaded HDF5 datasets
                            exist_datasets = set([
                                d for d in log_data[IMAGE_STACKS] if
                                isinstance(d, str) or isinstance(d, unicode)
                            ])
                            new_datasets = list(new_datasets - exist_datasets)
                            if new_datasets:
                                _DB_CONNECTOR.update(
                                    RUN_REPORT_COLLECTION, {UTAG: utag}, {
                                        "$addToSet": {
                                            IMAGE_STACKS: {
                                                '$each': new_datasets
                                            }
                                        }
                                    })
                                APP_LOGGER.info(
                                    'Updated run report utag=%s with %d datasets'
                                    % (utag, len(new_datasets)))

        APP_LOGGER.info("Found %d run reports" % (len(reports)))
        if len(reports) > 0:
            # There is a possible race condition here. Ideally these operations
            # would be performed in concert atomically
            _DB_CONNECTOR.insert(RUN_REPORT_COLLECTION, reports)
    else:
        APP_LOGGER.error(
            "Couldn't locate run report path '%s', to update database." %
            RUN_REPORT_PATH)
        return False

    APP_LOGGER.info(
        "Database successfully updated with available run reports.")
    return True
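
The default date_folders selection above keeps only folders whose MM_DD_YY name falls within six days of the most recent report in the database. A standalone sketch of that filter (folder names and the reference date are made up, and strptime stands in for get_date_object):

import re
from datetime import datetime, timedelta

latest_date = datetime(2019, 3, 27)  # stand-in for find_max(...)[DATETIME]
folders = ['03_27_19', '03_22_19', '03_01_19', 'notes']  # hypothetical listing

def valid_date(folder):
    date_obj = datetime.strptime(folder, '%m_%d_%y')
    return date_obj >= latest_date - timedelta(days=6)

recent = [f for f in folders
          if re.match(r'\d{2}_\d{2}_\d{2}', f) and valid_date(f)]
print(recent)  # ['03_27_19', '03_22_19']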
Example #18
def update_hdf5s():
    '''
    Update the database with available HDF5 files.  It is not an error
    if zero HDF5 files are available at this moment.

    @return True if database is successfully updated, False otherwise
    '''
    APP_LOGGER.info("Updating database with available HDF5 files...")

    # check if run report path exists
    if not os.path.isdir(RUN_REPORT_PATH):
        APP_LOGGER.error(
            "Couldn't locate run report path '%s', to update database." %
            RUN_REPORT_PATH)
        return False

    # Find new HDF5 files using nested listdir calls, which is much faster here
    # than glob, os.walk, or scandir. Only two subdirectory levels within the run
    # report folder are searched; each HDF5 file is assumed to live in a subfolder
    # of a run report folder.
    database_paths = set(
        _DB_CONNECTOR.distinct_sorted(HDF5_COLLECTION, HDF5_PATH))
    current_paths = set()
    for par_ in os.listdir(RUN_REPORT_PATH):
        report_dir = os.path.join(RUN_REPORT_PATH, par_)
        if os.path.isdir(report_dir):
            for sub_ in os.listdir(report_dir):
                subdir = os.path.join(report_dir, sub_)
                if os.path.isdir(subdir):
                    hdf5s = [
                        f for f in os.listdir(subdir)
                        if os.path.splitext(f)[-1] in VALID_HDF5_EXTENSIONS
                    ]
                    hdf5_paths = [os.path.join(subdir, f) for f in hdf5s]
                    current_paths.update(hdf5_paths)

    # Check yyyy_mm/dd/HHMM_pilotX location
    run_folders = get_run_folders()
    for folder in run_folders:
        hdf5s = [
            f for f in os.listdir(folder)
            if os.path.splitext(f)[-1] in VALID_HDF5_EXTENSIONS
        ]
        hdf5_paths = [os.path.join(folder, f) for f in hdf5s]
        current_paths.update(hdf5_paths)

    # update database with any new files
    new_hdf5_paths = current_paths - database_paths
    new_records = list()
    for hdf5_path in new_hdf5_paths:
        try:
            with h5py.File(hdf5_path) as h5_file:
                dataset_names = h5_file.keys()
            for dsname in dataset_names:
                if any(
                        re.match(pat, dsname) for pat in [
                            r'^\d{4}-\d{2}-\d{2}_\d{4}\.\d{2}',
                            r'^Pilot\d+_\d{4}-\d{2}-\d{2}_\d{4}\.\d{2}'
                        ]):
                    new_records.append({
                        HDF5_PATH: remove_disk_directory(hdf5_path),
                        HDF5_DATASET: dsname,
                    })
        except:
            APP_LOGGER.exception(
                'Unable to get dataset information from HDF5 file: %s' %
                hdf5_path)

    if new_records:
        # There is a possible race condition here. Ideally these operations
        # would be performed in concert atomically
        _DB_CONNECTOR.insert(HDF5_COLLECTION, new_records)
        APP_LOGGER.info('Updated database with %s new HDF5 files' %
                        len(new_records))
    else:
        APP_LOGGER.info('Unable to find any new HDF5 files')

    return True
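
The two-level listdir scan above can be pulled out into a small standalone helper. This sketch assumes a '.h5'-only extension tuple in place of VALID_HDF5_EXTENSIONS and a hypothetical root path:

import os

HDF5_EXTENSIONS = ('.h5',)  # stand-in for VALID_HDF5_EXTENSIONS

def scan_two_levels(root):
    # Collect HDF5 files that sit two directory levels below root.
    found = set()
    if not os.path.isdir(root):
        return found
    for par in os.listdir(root):
        report_dir = os.path.join(root, par)
        if not os.path.isdir(report_dir):
            continue
        for sub in os.listdir(report_dir):
            subdir = os.path.join(report_dir, sub)
            if not os.path.isdir(subdir):
                continue
            for name in os.listdir(subdir):
                if os.path.splitext(name)[-1] in HDF5_EXTENSIONS:
                    found.add(os.path.join(subdir, name))
    return found

print(scan_two_levels('/path/to/run_reports'))  # hypothetical root folder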