Example #1
def submit_extractions_by_dataset(connector,
                                  host,
                                  key,
                                  datasetid,
                                  extractorname,
                                  ext=False):
    """Manually trigger an extraction on all files in a dataset.

        This will iterate through all files in the given dataset and submit them to
        the provided extractor.

        Keyword arguments:
        connector -- connector information, used to get missing parameters and send status updates
        host -- the clowder host, including http and port, should end with a /
        key -- the secret key to login to clowder
        datasetid -- the dataset UUID to submit
        extractorname -- registered name of extractor to trigger
        ext -- extension to filter. e.g. 'tif' will only submit TIFF files for extraction.
    """

    filelist = get_file_list(connector, host, key, datasetid)

    for f in filelist:
        # Only submit files that end with given extension, if specified
        if ext and not f['filename'].endswith(ext):
            continue

        submit_extraction(connector, host, key, f['id'], extractorname)
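
A minimal usage sketch, assuming the connector, host, and key are the ones handed to a pyclowder extractor (as in Example #5); the dataset UUID and extractor name below are placeholders, not real identifiers:

# Hypothetical values for illustration only.
dataset_uuid = "5c3e0f1a-example-dataset-id"
target_extractor = "ncsa.example.extractor"

# Submit only the GeoTIFFs in the dataset to the extractor.
submit_extractions_by_dataset(connector, host, key, dataset_uuid,
                              target_extractor, ext="tif")
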
Example #2
def check_file_in_dataset(connector,
                          host,
                          secret_key,
                          dsid,
                          filepath,
                          remove=False,
                          forcepath=False,
                          replacements=[]):
    """Check whether a file matching filepath already exists in the given dataset.

    Matches on filename, or on the full filepath when forcepath is True. If remove is
    True, any matching file is deleted from the dataset. Returns True if a match was found.

    replacements -- list of tuples, e.g. [("L2", "L1")]; each tuple's first element is
    replaced with its second element in filepath before checking for an existing file.
    """
    dest_files = get_file_list(connector, host, secret_key, dsid)

    if len(replacements) > 0:
        for r in replacements:
            filepath = filepath.replace(r[0], r[1])

    for source_path in connector.mounted_paths:
        if filepath.startswith(connector.mounted_paths[source_path]):
            filepath = filepath.replace(connector.mounted_paths[source_path],
                                        source_path)

    filename = os.path.basename(filepath)

    found_file = False
    for f in dest_files:
        if (not forcepath and f['filename']
                == filename) or (forcepath and f['filepath'] == filepath):
            if remove:
                delete_file(host, secret_key, f['id'])
            found_file = True

    return found_file
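
A sketch of pairing this check with upload_to_dataset (Example #3), assuming the connector/host/secret_key from an extractor context; the output path is hypothetical, and the ("L2", "L1") tuple mirrors the replacements example above:

# Hypothetical output file produced locally by an extractor.
output_path = "/home/extractor/sites/example/scan_L2_output.nc"

# Skip the upload if the corresponding "scan_L1_output.nc" is already in the dataset.
if not check_file_in_dataset(connector, host, secret_key, dsid, output_path,
                             replacements=[("L2", "L1")]):
    upload_to_dataset(connector, host, secret_key, dsid, output_path)
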
Example #3
def upload_to_dataset(connector,
                      host,
                      key,
                      datasetid,
                      filepath,
                      check_duplicate=False):
    """Upload file to existing Clowder dataset.

    Keyword arguments:
    connector -- connector information, used to get missing parameters and send status updates
    host -- the clowder host, including http and port, should end with a /
    key -- the secret key to login to clowder
    datasetid -- the dataset that the file should be associated with
    filepath -- path to file
    check_duplicate -- check if filename already exists in dataset and skip upload if so
    """

    logger = logging.getLogger(__name__)

    if check_duplicate:
        ds_files = get_file_list(connector, host, key, datasetid)
        for f in ds_files:
            if f['filename'] == os.path.basename(filepath):
                logger.debug("found %s in dataset %s; not re-uploading" %
                             (f['filename'], datasetid))
                return None

    for source_path in connector.mounted_paths:
        if filepath.startswith(connector.mounted_paths[source_path]):
            return _upload_to_dataset_local(connector, host, key, datasetid,
                                            filepath)

    url = '%sapi/uploadToDataset/%s?key=%s' % (host, datasetid, key)

    if os.path.exists(filepath):
        # Open with a context manager so the file handle is closed after the upload
        with open(filepath, 'rb') as infile:
            result = connector.post(
                url,
                files={"File": infile},
                verify=connector.ssl_verify if connector else True)

        uploadedfileid = result.json()['id']
        logger.debug("uploaded file id = [%s]", uploadedfileid)

        return uploadedfileid
    else:
        logger.error("unable to upload file %s (not found)", filepath)
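
A short sketch of the duplicate-safe upload pattern this function enables, followed by triggering an extractor on the new file via submit_extraction (used in Example #1); the path and extractor name are placeholders:

# Placeholder local file; in practice this comes from an extractor's output.
new_file = "/home/extractor/outputs/2017-05-01_stats.csv"

fileid = upload_to_dataset(connector, host, key, datasetid, new_file,
                           check_duplicate=True)
if fileid:
    # upload_to_dataset returns None when the file is already in the dataset
    # or missing on disk, so only trigger the extractor for a fresh upload.
    submit_extraction(connector, host, key, fileid, "ncsa.example.extractor")
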
Example #4
    def submit_missing_regex(sensor_name, target, date):
        sensordef = count_defs[sensor_name]
        targetdef = sensordef[target]
        extractorname = targetdef["extractor"]
        submitted = []
        notfound = []

        if "parent" in targetdef:
            # Count expected parent counts from filesystem
            parentdef = sensordef[targetdef["parent"]]
            parent_dir = os.path.join(parentdef["path"], date)

            if parentdef["type"] == "regex" and parentdef["path"] == targetdef["path"]:
                for file in os.listdir(parent_dir):
                    if re.match(parentdef["regex"], file):
                        expected_output = file.replace(targetdef["parent_replacer_check"][1],
                                                       targetdef["parent_replacer_check"][0])
                        if not os.path.isfile(os.path.join(parent_dir, expected_output)):
                            # Find the file ID of the parent file and submit it
                            dataset_name = parentdef["dispname"]+" - "+date
                            dsid = get_dsid_by_name(dataset_name)
                            if dsid:
                                parent_id = None
                                dsfiles = get_file_list(CONN, CLOWDER_HOST, CLOWDER_KEY, dsid)
                                matchfile = file.replace("_thumb.tif", ".tif")
                                for dsfile in dsfiles:
                                    if dsfile["filename"] == matchfile:
                                        parent_id = dsfile["id"]
                                        break
                                if parent_id:
                                    submit_file_extraction(CONN, CLOWDER_HOST, CLOWDER_KEY, parent_id, extractorname)
                                    submitted.append({"name": matchfile, "id": parent_id})
                                else:
                                    notfound.append({"name": matchfile})
                            else:
                                notfound.append({"name": dataset_name})

        return json.dumps({
            "extractor": extractorname,
            "datasets submitted": submitted,
            "datasets not found": notfound
        })
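
The count_defs structure this function reads is not shown above; the fragment below is a purely hypothetical illustration of the fields it expects (extractor, parent, path, type, regex, dispname, parent_replacer_check), not the real configuration:

# Hypothetical count_defs entry shaped the way submit_missing_regex reads it.
count_defs = {
    "sensorA": {
        "target_product": {
            "extractor": "example.fullfield.extractor",  # assumed extractor name
            "parent": "parent_product",
            "path": "/data/example/parent_product",
            # expected_output = parent filename with element [1] replaced by element [0]
            "parent_replacer_check": ["_output.nc", "_thumb.tif"],
        },
        "parent_product": {
            "type": "regex",
            "regex": r".*_thumb\.tif",
            "path": "/data/example/parent_product",
            "dispname": "Example Parent Product",
        },
    },
}

With that shape, submit_missing_regex("sensorA", "target_product", "2017-05-01") would scan /data/example/parent_product/2017-05-01, and for each *_thumb.tif whose *_output.nc is missing, look up the dataset "Example Parent Product - 2017-05-01" in Clowder and resubmit the matching .tif to the extractor.
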
Example #5
    def process_message(self, connector, host, secret_key, resource,
                        parameters):
        self.start_message(resource)

        # Build list of JSON files
        json_files = []
        for f in resource['files']:
            if f['filename'].endswith("_environmentlogger.json"):
                if f['filepath'].startswith("/home/clowder"):
                    json_files.append(f['filepath'].replace(
                        "/home/clowder", "/home/extractor"))
                else:
                    json_files.append(f['filepath'])
        json_files.sort()

        # Determine full output path
        timestamp = resource['name'].split(" - ")[1]
        out_fullday_netcdf = self.sensors.create_sensor_path(timestamp)
        temp_out_full = os.path.join(os.path.dirname(out_fullday_netcdf),
                                     "temp_full.nc")
        temp_out_single = temp_out_full.replace("_full.nc", "_single.nc")
        geo_csv = out_fullday_netcdf.replace(".nc", "_geo.csv")

        if not file_exists(temp_out_full):
            for json_file in json_files:
                self.log_info(
                    resource, "converting %s to netCDF & appending" %
                    os.path.basename(json_file))
                ela.mainProgramTrigger(json_file, temp_out_single)
                cmd = "ncrcat --record_append %s %s" % (temp_out_single,
                                                        temp_out_full)
                subprocess.call([cmd], shell=True)
                os.remove(temp_out_single)

            shutil.move(temp_out_full, out_fullday_netcdf)
            self.created += 1
            self.bytes += os.path.getsize(out_fullday_netcdf)

        # Write out geostreams.csv
        if not file_exists(geo_csv):
            self.log_info(resource, "writing geostreams CSV")
            geo_file = open(geo_csv, 'w')
            geo_file.write(','.join([
                'site', 'trait', 'lat', 'lon', 'dp_time', 'source', 'value',
                'timestamp'
            ]) + '\n')
            with Dataset(out_fullday_netcdf, "r") as ncdf:
                streams = set([
                    sensor_info.name
                    for sensor_info in ncdf.variables.values()
                    if sensor_info.name.startswith('sensor')
                ])
                for stream in streams:
                    if stream != "sensor_spectrum":
                        try:
                            memberlist = ncdf.get_variables_by_attributes(
                                sensor=stream)
                            for members in memberlist:
                                data_points = _produce_attr_dict(members)
                                for index in range(len(data_points)):
                                    dp_obj = data_points[index]
                                    if dp_obj["sensor"] == stream:
                                        time_format = "%Y-%m-%dT%H:%M:%S-07:00"
                                        time_point = (datetime.datetime(year=1970, month=1, day=1) + \
                                                      datetime.timedelta(days=ncdf.variables["time"][index])).strftime(time_format)

                                        geo_file.write(','.join([
                                            "Full Field - Environmental Logger",
                                            "(EL) %s" % stream,
                                            str(33.075576),
                                            str(-111.974304), time_point,
                                            host +
                                            ("" if host.endswith("/") else "/"
                                             ) + "datasets/" + resource['id'],
                                            '"%s"' % json.dumps(dp_obj).
                                            replace('"', '""'), timestamp
                                        ]) + '\n')

                        except Exception:
                            self.log_error(
                                resource,
                                "NetCDF attribute not found: %s" % stream)
            # Close the CSV so the upload below sees the flushed contents
            geo_file.close()

        # Fetch dataset ID by dataset name if not provided
        target_dsid = build_dataset_hierarchy_crawl(
            host,
            secret_key,
            self.clowder_user,
            self.clowder_pass,
            self.clowderspace,
            None,
            None,
            self.sensors.get_display_name(),
            timestamp[:4],
            timestamp[5:7],
            timestamp[8:10],
            leaf_ds_name=self.sensors.get_display_name() + ' - ' + timestamp)
        ds_files = get_file_list(connector, host, secret_key, target_dsid)
        found_full = False
        found_csv = False
        for f in ds_files:
            if f['filename'] == os.path.basename(out_fullday_netcdf):
                found_full = True
            if f['filename'] == os.path.basename(geo_csv):
                found_csv = True
        if not found_full:
            upload_to_dataset(connector, host, secret_key, target_dsid,
                              out_fullday_netcdf)
        if not found_csv:
            geoid = upload_to_dataset(connector, host, secret_key, target_dsid,
                                      geo_csv)
            self.log_info(resource,
                          "triggering geostreams extractor on %s" % geoid)
            submit_extraction(connector, host, secret_key, geoid,
                              "terra.geostreams")

        # Tell Clowder this is completed so subsequent file updates don't daisy-chain
        ext_meta = build_metadata(host, self.extractor_info, resource['id'],
                                  {"output_dataset": target_dsid}, 'dataset')
        upload_metadata(connector, host, secret_key, resource['id'], ext_meta)

        self.end_message(resource)
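
The closing section of process_message follows an "upload only if missing, then trigger a follow-up extractor" pattern. Below is a condensed, standalone sketch of that pattern, reusing the helpers from the earlier examples (get_file_list, upload_to_dataset, submit_extraction); the helper name and arguments here are illustrative, not part of the original code:

import os

def upload_if_missing(connector, host, secret_key, dsid, local_path,
                      followup_extractor=None):
    # Skip the upload when a file with the same name is already in the dataset.
    existing = {f['filename'] for f in get_file_list(connector, host, secret_key, dsid)}
    if os.path.basename(local_path) in existing:
        return None
    fileid = upload_to_dataset(connector, host, secret_key, dsid, local_path)
    if fileid and followup_extractor:
        # e.g. "terra.geostreams" for the geostreams CSV, as in the method above
        submit_extraction(connector, host, secret_key, fileid, followup_extractor)
    return fileid
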