def get_file_list(self):
    logger.debug("fetching files")

    backfill = timedelta(days=int(tutil.get_env_var("GINA_BACKFILL_DAYS")))
    end_date = datetime.utcnow() + timedelta(days=1)
    start_date = end_date - backfill

    url = GINA_URL
    url += "&start_date=" + start_date.strftime("%Y-%m-%d")
    url += "&end_date=" + end_date.strftime("%Y-%m-%d")
    url += "&sensors[]=viirs"
    url += "&processing_levels[]=level1"
    url += "&facilities[]=" + tutil.get_env_var("VIIRS_FACILITY")
    url += "&satellites[]=" + SATELLITE
    logger.debug("URL: %s", url)

    buf = BytesIO()
    c = pycurl.Curl()
    c.setopt(c.URL, url)
    c.setopt(c.WRITEFUNCTION, buf.write)
    c.perform()
    c.close()

    files = [Viirs(file["url"], file["md5sum"]) for file in json.loads(buf.getvalue())]
    buf.close()

    logger.info("Found %s files", len(files))
    return files
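
# get_file_list() wraps each listing entry in a Viirs object, which is not
# shown here. A minimal sketch of what it plausibly provides (url, md5, and
# orbit/basename parsed from the URL); the "_bNNNNN_" orbit regex and the
# ordering rule are assumptions, not the project's actual implementation:

import os
import re
from urllib.parse import urlparse


class Viirs:
    """Container for one GINA file-listing entry (sketch)."""

    def __init__(self, url, md5):
        self.url = url
        self.md5 = md5
        self.basename = os.path.basename(urlparse(url).path)
        # Assumed: VIIRS product names embed the orbit number as "_bNNNNN_".
        match = re.search(r"_b(\d+)_", self.basename)
        self.orbit = match.group(1) if match else None

    def __lt__(self, other):
        # fetch_files() sorts its queue; ordering by basename keeps
        # geolocation (G*) files ahead of band (SV*) files.
        return self.basename < other.basename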
def main():
    # Let ctrl-c work as it should.
    signal.signal(signal.SIGINT, signal.SIG_DFL)

    # Exit quickly if the queue is already being processed.
    (gotlock, lock) = aquire_lock()
    if not gotlock:
        tutil.exit_with_error(
            "Queue {} locked, skipping".format(SATELLITE + "-".join(CHANNELS)))
        return

    try:
        mirror_gina = MirrorGina()
        mirror_gina.fetch_files()
    finally:
        logger.info("All done with queue.")
        if gotlock:
            try:
                lock.unlock()
            except AttributeError:
                pass

    logger.debug("That's all for now, bye.")
    logging.shutdown()
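
# aquire_lock() is defined elsewhere; main() only relies on it returning a
# (bool, lock) tuple whose lock exposes unlock(). A minimal sketch using a
# non-blocking flock; the lock-file path is hypothetical:

import fcntl


class _FlockLock:
    def __init__(self, path):
        self.fp = open(path, "w")

    def trylock(self):
        try:
            fcntl.flock(self.fp, fcntl.LOCK_EX | fcntl.LOCK_NB)
            return True
        except OSError:
            return False

    def unlock(self):
        fcntl.flock(self.fp, fcntl.LOCK_UN)
        self.fp.close()


def aquire_lock():
    # Hypothetical per-queue lock file; one process per satellite queue.
    lock = _FlockLock("/tmp/mirror-gina-{}.lock".format(SATELLITE))
    return (lock.trylock(), lock)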
def fetch_files(self):
    file_list = self.get_file_list()
    file_queue = self.file_store.queue_files(file_list, CHANNELS)

    # Sort so geolocation files are retrieved first. This should run
    # frequently enough that getting stuck while retrieving several
    # orbits shouldn't be a problem.
    file_queue.sort()

    for file in file_queue:
        url = file.url
        tmp_file = path_from_url(self.tmp_path, url)
        logger.debug("Fetching %s from %s", tmp_file, url)

        dl = Downloader(max_con=self.connection_count)
        dl.fetch(url, tmp_file)

        with open(tmp_file, "rb") as f:
            file_md5 = hashlib.md5(f.read()).hexdigest()
        logger.debug("MD5 %s : %s", file.md5, file_md5)

        if file.md5 == file_md5:
            # Checksum matches; verify the file is readable HDF5 before
            # placing it in the store.
            try:
                check = h5py.File(tmp_file, "r")
                check.close()
            except Exception as e:
                logger.info("Bad HDF5 file %s", tmp_file)
                logger.info(e)
                os.unlink(tmp_file)
            else:
                self.file_store.place_file(file, tmp_file)
        else:
            size = os.path.getsize(tmp_file)
            logger.info("Bad checksum: %s != %s (%d bytes)", file_md5, file.md5, size)
            os.unlink(tmp_file)
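
# path_from_url() is used here and in queue_files() below to map a remote URL
# onto a local path. A minimal sketch, assuming it simply joins the base
# directory with the URL's basename:

import os
from urllib.parse import urlparse


def path_from_url(base, url):
    return os.path.join(base, os.path.basename(urlparse(url).path))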
def queue_files(file_list, channels):
    orbits = {}
    for new_file in file_list:
        orbit = new_file.orbit
        if orbit not in orbits:
            try:
                orbits[orbit] = list_files(orbit)
            except Exception:
                # If the listing fails, treat the orbit as empty so the
                # membership test below still works.
                logger.exception("Unable to list files for orbit %s", orbit)
                orbits[orbit] = []

    queue = []
    pattern = re.compile("/({})_".format("|".join(channels)))
    for new_file in file_list:
        orbit = new_file.orbit
        filename = f"{SATELLITE}/{orbit}/{new_file.basename}"
        if pattern.search(filename) and filename not in orbits[orbit]:
            logger.debug("Queueing %s", new_file.url)
            queue.append(new_file)
        else:
            logger.debug("Skipping %s", new_file.url)

    logger.info("%d files after pruning", len(queue))
    return queue
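
# list_files() is not shown in this listing. Given that place_file() below
# uploads to f"{SATELLITE}/{orbit}/{filename}" keys, a plausible sketch is a
# listing of the existing keys under that orbit's prefix, so already-mirrored
# files are pruned:

import boto3


def list_files(orbit):
    s3 = boto3.resource("s3", verify=VERIFY)
    bucket = s3.Bucket(BUCKET_NAME)
    prefix = f"{SATELLITE}/{orbit}/"
    return [obj.key for obj in bucket.objects.filter(Prefix=prefix)]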
def place_file(file, tmp_file):
    filename = file.basename
    orbit = file.orbit
    logger.debug("Uploading %s to S3 bucket %s", tmp_file, BUCKET_NAME)

    key = f"{SATELLITE}/{orbit}/{filename}"
    try:
        s3 = boto3.resource("s3", verify=VERIFY)
        bucket = s3.Bucket(BUCKET_NAME)
        bucket.upload_file(tmp_file, key)
    except botocore.exceptions.SSLError as e:
        # Exceptions have no .message attribute in Python 3; log the
        # exception itself.
        logger.error("SSL error uploading %s: %s", key, e)
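
# Transient SSL failures could also be retried at the botocore level rather
# than only logged. A sketch, with the retry count chosen arbitrarily:

import boto3
from botocore.config import Config


def make_s3():
    return boto3.resource(
        "s3",
        verify=VERIFY,
        config=Config(retries={"max_attempts": 5}),
    )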
def create_multi(self):
    m = pycurl.CurlMulti()
    m.handles = []
    for _ in range(self._num_conn):
        logger.debug("creating curl object")
        c = pycurl.Curl()
        c.fp = None
        c.setopt(pycurl.FOLLOWLOCATION, 1)
        c.setopt(pycurl.MAXREDIRS, 5)
        c.setopt(pycurl.CONNECTTIMEOUT, 30)
        c.setopt(pycurl.TIMEOUT, 600)
        c.setopt(pycurl.NOSIGNAL, 1)
        m.handles.append(c)
    return m
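
# create_multi() only builds the handle pool. A sketch of how a single
# transfer could be driven through pycurl's multi interface; multi_fetch()
# and its free-list bookkeeping are illustrative, not the project's actual
# download loop:

import pycurl


def multi_fetch(m, url, out_path):
    c = m.handles.pop()
    c.fp = open(out_path, "wb")
    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.WRITEDATA, c.fp)
    m.add_handle(c)

    # Standard pycurl multi loop: perform until the transfer completes.
    while True:
        ret, num_handles = m.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break
    while num_handles:
        m.select(1.0)
        while True:
            ret, num_handles = m.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break

    m.remove_handle(c)
    c.fp.close()
    c.fp = None
    m.handles.append(c)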
def queue_files(file_list, channels):
    queue = []
    pattern = re.compile("/({})_".format("|".join(channels)))
    logger.debug("%d files before pruning", len(file_list))

    for new_file in file_list:
        out_file = path_from_url(OUT_PATH, new_file.url)
        if pattern.search(out_file) and not os.path.exists(out_file):
            logger.debug("Queueing %s", new_file.url)
            queue.append(new_file)
        else:
            logger.debug("Skipping %s", new_file.url)

    logger.info("%d files after pruning", len(queue))
    return queue
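
# For example, with channels = ("GITCO", "SVI04") the compiled pattern is
# "/(GITCO|SVI04)_": requested channels are queued, others are skipped.
# The file names below are made up for illustration:

import re

pattern = re.compile("/({})_".format("|".join(("GITCO", "SVI04"))))
assert pattern.search("/data/out/GITCO_npp_d20190101_t0001.h5")
assert not pattern.search("/data/out/SVM09_npp_d20190101_t0001.h5")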