Example #1
import logging
import os
import zipfile
from datetime import datetime, timezone, timedelta
from typing import List

from dateutil import parser as date_parser

# USGSCrawler, TiffExtractor, PRISMDumper, Connection, Runnable and logger are
# project-internal names, assumed to be importable from the surrounding package.


class DataFromUSGS(Runnable):
    def __init__(self):
        self.crawler = USGSCrawler()
        self.extractor = TiffExtractor()
        self.dumper = PRISMDumper()
        self.buffer: List[bytes] = list()

    def run(self, end_clause: int = 210):
        """
        crawling routine: fetch, extract and dump the weekly USGS NDVI data
        :param end_clause: number of days to crawl back from today, default=210
        :return: None
        """

        current_date = datetime.now(timezone.utc).date()
        end_date = current_date - timedelta(days=end_clause)

        # TODO: stop and continue
        with Connection() as conn:
            cur = conn.cursor()
            cur.execute('select date from usgs_info')
            exist_list = cur.fetchall()
            cur.close()

        date = current_date - timedelta(days=7)  # the website updates weekly
        # align to the site's 7-day release cycle, anchored at 2019-07-30
        diff_date = date - date_parser.parse('20190730').date()
        date = date - timedelta(days=diff_date.days % 7)

        while date >= end_date:
            logger.info(f'[fetch]{date}')
            # skip dates that already exist in the database
            if (date,) in exist_list:
                logger.info(f'skip: {date}')
                date = date - timedelta(days=7)
                continue

            saved_zip_path = self.crawler.crawl(date)
            if saved_zip_path is None:
                logger.info(f'{date} not found, skipped')

            else:
                zf = zipfile.ZipFile(saved_zip_path)
                tif_file_name = None
                for file in zf.namelist():
                    # pick the NDVI GeoTIFF out of the weekly archive
                    if file.split('.')[-4] == 'VI_NDVI' and file.split('.')[-1] == 'tif':
                        zf.extract(file, os.path.split(saved_zip_path)[0])
                        tif_file_name = file
                zf.close()
                if tif_file_name is not None:
                    tif_path = os.path.join(os.path.split(saved_zip_path)[0], tif_file_name)
                    unflattened = self.extractor.extract(tif_path)
                    if unflattened is not None:
                        self.dumper.insert(date, unflattened, 'usgs')

                    # clean up
                    os.remove(saved_zip_path)
                    os.remove(tif_path)

            # record that this date has been crawled, then step back one week
            with Connection() as conn:
                cur = conn.cursor()
                # var_type 'usgs' matches the dumper.insert call above
                cur.execute(PRISMDumper.INSERT_INFOS['usgs'], (date, 1))
                conn.commit()
                cur.close()
            date = date - timedelta(days=7)

    @staticmethod
    def record_generator(date: datetime.date, _data):
        for gid, val in enumerate(_data.tolist()):
            yield (date, gid, val)


if __name__ == '__main__':
    logger.setLevel(logging.INFO)
    logger.addHandler(logging.StreamHandler())

    crawler = USGSCrawler()
    extractor = TiffExtractor()
    dumper = PRISMDumper()
    target_time = "20190806"

    zip_file_path = crawler.crawl(datetime.strptime(target_time, '%Y%m%d'))
    if zip_file_path is not None:
        zf = zipfile.ZipFile(zip_file_path)
        tif_file_name = None
        for file in zf.namelist():
            if file.split('.')[-4] == 'VI_NDVI' and file.split('.')[-1] == 'tif':
                zf.extract(file, os.path.split(zip_file_path)[0])
                tif_file_name = file
        zf.close()

        if tif_file_name is not None:
            tif_path = os.path.join(os.path.split(zip_file_path)[0], tif_file_name)
            data = extractor.extract(tif_path)
            dumper.insert(datetime.strptime(target_time, '%Y%m%d'), data, var_type='usgs')
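
The least obvious step in Example #1 is snapping the crawl date onto the site's weekly release cycle. Below is a minimal stand-alone sketch of that alignment, assuming the same 2019-07-30 anchor date the example uses; align_to_release_week is an illustrative name, not part of the project.

from datetime import date, timedelta

ANCHOR = date(2019, 7, 30)  # a known release date, used as the weekly anchor

def align_to_release_week(day: date) -> date:
    """Snap `day` back to the nearest release date on or before it."""
    offset = (day - ANCHOR).days % 7
    return day - timedelta(days=offset)

print(align_to_release_week(date(2019, 8, 8)))  # 2019-08-06, one cycle after the anchor

Once aligned, stepping backwards in 7-day increments keeps every subsequent date on the release schedule.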
Example #3
import glob
import os
from datetime import datetime, timedelta

# SoilMoisCrawler, TiffExtractor, SoilMoisDumper, SOIL_MOIS_DATA_DIR, Runnable
# and logger are project-internal names, assumed to be importable from the
# surrounding package.


class DataFromNASAGrace(Runnable):
    """
    This class is responsible for collecting data from NASAGrace
    """

    def __init__(self):
        self.crawler = SoilMoisCrawler()
        self.extractor = TiffExtractor()
        self.dumper = SoilMoisDumper()
        self.end_time = datetime.strptime('20160104', '%Y%m%d')

    def run(self, begin_time_str=None) -> None:
        """
        The entry point referenced by the task manager:
        crawl, extract and dump data from NASAGrace.
        :param begin_time_str: date string ('%Y%m%d') to start crawling from,
                               moving backwards in time; defaults to today
        :return: None
        """
        if begin_time_str is None:
            # evaluate "today" at call time, not once at function definition
            begin_time_str = datetime.today().strftime('%Y%m%d')
        # make the date string a datetime object
        begin_time = datetime.strptime(begin_time_str, '%Y%m%d')
        exists_set = self.crawler.get_exists()
        # crawl each day's data backwards from begin_time to end_time
        current_time = begin_time
        found_week_start = False
        while current_time > self.end_time:
            formatted_date_stamp = current_time.strftime('%Y%m%d')
            logger.info(f'start crawling for date {formatted_date_stamp}')

            try:
                if not found_week_start:
                    # to detect whether this is the last day with data
                    file_path = self.crawler.crawl(current_time) if (current_time,) not in exists_set else None
                    if file_path is not None:
                        self.extract_and_dump(file_path)
                        found_week_start = True
                    else:
                        current_time -= timedelta(days=1)
                else:
                    # start crawling every 7 days
                    current_time -= timedelta(days=7)
                    if (current_time,) not in exists_set:
                        file_path = self.crawler.crawl(current_time)
                        if file_path is not None:
                            self.extract_and_dump(file_path)
                    else:
                        logger.info(f'{formatted_date_stamp} existed, skipped')
            finally:
                for tif_file in glob.glob(os.path.join(SOIL_MOIS_DATA_DIR, "*.tif")):
                    if 'res' not in tif_file and 'masked' not in tif_file:
                        os.remove(tif_file)
                        logger.info(f"file: {tif_file} removed")

        # if there are no files left, delete the directory
        for root, dirs, files in os.walk(SOIL_MOIS_DATA_DIR, topdown=False):
            if not files and not dirs:
                os.rmdir(root)
        logger.info(f'processing of all data from {begin_time_str} back to {self.end_time.strftime("%Y%m%d")} finished')

    def extract_and_dump(self, file_path: str) -> None:
        """
        Use the provided file_path to extract the needed information and dump it into the database
        :param file_path: the data to be processed
        :return: None
        """
        data = self.extractor.extract(file_path)
        formatted_date_stamp = os.path.basename(file_path).split('.')[0]
        logger.info(f'{formatted_date_stamp} extraction finished')
        self.dumper.insert(formatted_date_stamp, data)
        logger.info(f'{formatted_date_stamp} dumping finished')
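
The run loop above is a two-phase backward scan: probe one day at a time until the most recent day with data is found, then stride a week at a time. Here is a condensed sketch of just that control flow, with a hypothetical has_data callback standing in for the crawler and the exists_set check.

from datetime import datetime, timedelta

def scan_weekly(begin: datetime, end: datetime, has_data) -> list:
    """Collect dates with data, newest first: daily probes until the first
    hit, then fixed 7-day strides (mirrors DataFromNASAGrace.run)."""
    hits, current, found_week_start = [], begin, False
    while current > end:
        if not found_week_start:
            if has_data(current):            # phase 1: find the week start
                hits.append(current)
                found_week_start = True
            else:
                current -= timedelta(days=1)
        else:
            current -= timedelta(days=7)     # phase 2: weekly strides
            if has_data(current):
                hits.append(current)
    return hits

# e.g. with data released every Monday, scanning back from 2016-02-01:
print(scan_weekly(datetime(2016, 2, 1), datetime(2016, 1, 4),
                  lambda d: d.weekday() == 0))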
Example #6
import datetime
import logging
import traceback

# Connection, SoilMoisCrawler, TiffExtractor and logger are project-internal
# names, assumed to be importable from the surrounding package.


class SoilMoisDumper:
    # The class and method scaffolding here is reconstructed from the
    # fragment's indentation and the surrounding examples; INSERT_SOIL_MOISTURE
    # (the SQL insert statement) is defined elsewhere in the project.
    TIME_FORMAT = "%Y%m%d"
    inserted_count = 0

    def insert(self, date_str: str, data) -> None:
        with Connection() as conn:
            cur = conn.cursor()
            for gid, val in enumerate(data.tolist()):
                # normalize the raster's no-data sentinels to NaN
                val = float('NaN') if val in [-999, -9999] else val
                try:
                    cur.execute(
                        self.INSERT_SOIL_MOISTURE,
                        (gid, datetime.datetime.strptime(date_str,
                                                         "%Y%m%d"), val))
                    self.inserted_count += cur.rowcount
                    conn.commit()
                except Exception:
                    logger.error("error: " + traceback.format_exc())

            logger.info(
                f'{date_str} finished, total inserted {self.inserted_count}')
            cur.close()


if __name__ == '__main__':
    logger.setLevel(logging.INFO)
    logger.addHandler(logging.StreamHandler())

    crawler = SoilMoisCrawler()
    extractor = TiffExtractor()
    dumper = SoilMoisDumper()
    target_time = "20131230"

    crawled_file_path = crawler.crawl(
        datetime.datetime.strptime(target_time, SoilMoisDumper.TIME_FORMAT))
    if crawled_file_path is not None:
        data = extractor.extract(crawled_file_path)
        dumper.insert(target_time, data)
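
The data-cleaning step Example #6 performs inline is mapping the raster's no-data sentinels (-999 and -9999) to NaN so they are stored as missing values rather than mistaken for real readings. A minimal illustration on plain Python floats; normalize is an illustrative helper, not project code.

SENTINELS = {-999, -9999}  # no-data markers used by the source rasters

def normalize(values):
    """Replace sentinel no-data values with NaN; keep real readings as-is."""
    return [float('nan') if v in SENTINELS else v for v in values]

print(normalize([0.42, -9999, 0.38, -999]))  # [0.42, nan, 0.38, nan]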