Example #1
0
def output_state_data(dpdb: DataPointsDB):
    """
    Crawl all state-level data sources, record their metadata into
    SOURCE_INFO, and store the de-duplicated datapoints into `dpdb`.

    Returns the per-source status dict from StateDataSources.
    """
    sources = StateDataSources()

    for src_id, src_url, src_desc, points in sources.iter_data_sources():
        SOURCE_INFO.append([src_id, src_url, src_desc])
        dpdb.extend(src_id, _rem_dupes(points), is_derived=False)

    return sources.get_status_dict()
Example #2
0
def copy_failed_from_previous_revision(status: dict, dpdb: DataPointsDB):
    """
    If any of them failed, copy them across from the previous revision.
    Note the previous revision might have failed too, but should have
    copied the values from the previous revision before that, etc
    (assuming the crawler worked in the past)
    """
    failed_ids = []
    for key, info in status.items():
        if info['status'] != 'ERROR':
            continue
        print("ERROR OCCURRED, reverting to previous source ID data:",
              key)
        failed_ids.append(key)

    revisions = SQLiteDataRevisions()
    rev_date, rev_subid, _dt = revisions.get_revisions()[0]
    dpdb.migrate_source_ids(
        revisions.get_revision_path(rev_date, rev_subid), failed_ids
    )
Example #3
0
def main():
    """
    Run the full pipeline for the latest revision: crawl all sources into
    a new SQLite DB, recover failed sources from the previous revision,
    derive extra datapoints, then publish (JSON status, zip, remote
    upload, CSV, source info, GeoJSON) and push the results to GitHub.
    """
    status = {}

    # Open the new output SQLite database
    db_path = RevisionIDs.get_path_from_id(
        TIME_FORMAT, LATEST_REVISION_ID, 'sqlite'
    )
    dpdb = DataPointsDB(db_path)
    run_crawlers(status, dpdb)
    dpdb.create_indexes()
    copy_failed_from_previous_revision(status, dpdb)

    # Derive "new cases" from "total cases" when
    # they aren't explicitly specified, etc
    DerivedData(dpdb).add_derived()

    # Commit and close the DB
    print("Derived data outputted OK: committing and closing")
    dpdb.commit()
    dpdb.close()

    # Output basic status info to a .json info
    # This also signifies to the web
    # interface that the import went OK
    print("Writing status JSON file")
    json_path = RevisionIDs.get_path_from_id(
        TIME_FORMAT, LATEST_REVISION_ID, 'json'
    )
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump({'status': status}, f, indent=4)

    # Output datapoints to zip
    print("Outputting datapoints to zip...")
    zip_path = (get_output_dir() / 'output' /
                f'{TIME_FORMAT}-{LATEST_REVISION_ID}.zip')
    with open(zip_path, 'wb') as f:
        output_revision_datapoints_to_zip(f, TIME_FORMAT, LATEST_REVISION_ID)

    # Upload them to remote AWS instance
    print("Uploading zip file to remote server...")
    system('/usr/bin/env bash /home/david/upload_to_remote.sh '
           f'{TIME_FORMAT}-{LATEST_REVISION_ID}')

    # Clean up old DBs to save on space
    print("Deleting older DBs to save space..")
    delete_old_dbs()

    # Update the csv output
    print("Outputting CSV files:")
    output_csv_data(TIME_FORMAT, LATEST_REVISION_ID)
    print('CSV write done')

    # Output information about the sources to a markdown table/csv file
    print("Outputting source info...")
    output_source_info(SOURCE_INFO)

    # Output GeoJSON
    print("Outputting geojson...")
    output_geojson()

    # Commit to GitHub
    print("Pushing to GitHub...")
    push_to_github()
    print("Push to GitHub done!")

    print("[end of script]")
Example #4
0
 def _read_sqlite(self):
     """
     Open this revision's datapoints DB from
     OUTPUT_DIR/'{period}-{subperiod_id}.sqlite' and store it on
     self._datapoints_db (presumably called lazily on first access —
     the caller is outside this view, confirm against the class).
     """
     self._datapoints_db = DataPointsDB(
         OUTPUT_DIR / f'{self.period}-{self.subperiod_id}.sqlite'
     )
Example #5
0
class SQLiteDataRevision:
    """
    Read-only view over a single crawl revision, stored as
    ``{period}-{subperiod_id}.sqlite`` (datapoints) and
    ``{period}-{subperiod_id}.json`` (status) under ``OUTPUT_DIR``.

    The sqlite DB is opened lazily: accessors decorated with
    ``@needsdatapoints`` trigger ``_read_sqlite()`` on first use, unless
    an already-open ``datapoints_db`` was supplied to the constructor.
    """

    def __init__(self, period, subperiod_id, datapoints_db=None):
        """
        period: date string in ``dd_mm_yyyy`` form (validated below)
        subperiod_id: integer sub-revision counter within the period
        datapoints_db: optional already-open DataPointsDB instance
        """
        self.__check_period(period)
        subperiod_id = int(subperiod_id)
        self.period = period
        self.subperiod_id = subperiod_id
        self._datapoints_db = datapoints_db

    def _read_sqlite(self):
        # Lazily open this revision's sqlite datapoints DB
        self._datapoints_db = DataPointsDB(
            OUTPUT_DIR / f'{self.period}-{self.subperiod_id}.sqlite'
        )

    def get_status_dict(self):
        """
        Return the 'status' mapping from this revision's JSON status file.
        """
        with open(OUTPUT_DIR / f'{self.period}-{self.subperiod_id}.json',
                  'r', encoding='utf-8', errors='replace') as f:
            return json.loads(f.read())['status']

    @needsdatapoints
    def __getitem__(self, item):
        return self._datapoints_db[item]

    @needsdatapoints
    def __iter__(self):
        for i in self._datapoints_db:
            yield i

    @needsdatapoints
    def __len__(self):
        return len(self._datapoints_db)

    @needsdatapoints
    def get_datapoints(self):
        """Return every datapoint in this revision as a list."""
        return self._datapoints_db[:]

    @needsdatapoints
    def get_updated_dates(self, region_schema, region_parent, region_child):
        """Return the ``date_updated`` of each datapoint for the region."""
        return [i.date_updated for i in self._datapoints_db.select_many(
            region_schema=region_schema,
            region_parent=region_parent,
            region_child=region_child
        )]

    @needsdatapoints
    def get_time_series(self, datatypes,
                        region_schema,
                        region_parent,
                        region_child,
                        after_date=None):
        """
        Return datapoints for the given datatypes/region grouped as
        {(region_child, agerange): {date_updated: [datapoint, ...]}}.

        datatypes: iterable of enum members (their ``.value`` is queried)
        after_date: if given, only datapoints strictly newer than it
        """
        datatypes = [i.value for i in datatypes]
        datapoints = self._datapoints_db.select_many(
            region_schema=['= ?', [region_schema]],
            region_parent=['= ?', [region_parent]] if region_parent else None,
            region_child=['= ?', [region_child]] if region_child else None,
            datatype=[f"IN ({','.join('?' for _ in datatypes)})", datatypes],
            date_updated=['> ?', [after_date]] if after_date else None,
        )

        r = {}
        for datapoint in datapoints:
            r.setdefault(
                (datapoint.region_child, datapoint.agerange), {}
            ).setdefault(
                datapoint.date_updated, []
            ).append(datapoint)
        return r

    @needsdatapoints
    def get_source_ids(self):
        """Return the source IDs present in this revision's DB."""
        return self._datapoints_db.get_source_ids()

    @needsdatapoints
    def get_datapoints_by_source_id(self, source_id, datatype=None, add_source_urls=True):
        """Return datapoints for ``source_id`` (optionally one datatype)."""
        return self._datapoints_db.get_datapoints_by_source_id(
            source_id, datatype=datatype, add_source_urls=add_source_urls
        )

    @needsdatapoints
    def get_region_schemas(self):
        """Return the region schemas present in this revision's DB."""
        return self._datapoints_db.get_region_schemas()

    @needsdatapoints
    def get_datatypes_by_region_schema(self, region_schema):
        """Return the datatypes available for ``region_schema``."""
        return self._datapoints_db.get_datatypes_by_region_schema(region_schema)

    @needsdatapoints
    def get_region_parents(self, region_schema):
        """Return the region parents available for ``region_schema``."""
        return self._datapoints_db.get_region_parents(region_schema)

    #=============================================================#
    #                       Utility Functions                     #
    #=============================================================#

    def __check_period(self, path):
        """
        Validate that ``path`` is a plain ``dd_mm_yyyy`` date string with
        no path-traversal characters (it is interpolated into file paths).

        Raises ValueError on bad input — an ``assert`` would be silently
        stripped under ``python -O``, defeating the traversal check.
        """
        if '..' in path or '/' in path or '\\' in path:
            raise ValueError(f"invalid period (path traversal?): {path!r}")

        # Must parse as a dd_mm_yyyy date (raises ValueError otherwise)
        dd, mm, yyyy = path.split('_')
        int(yyyy), int(mm), int(dd)

    def get_revision_time_string(self):
        """
        Return the sqlite file's creation time as a 'YYYY-MM-DD HH:MM:SS'
        string in the Australia/Melbourne timezone.
        """
        rev_time = getctime(OUTPUT_DIR / f'{self.period}-{self.subperiod_id}.sqlite')
        dt = str(datetime.datetime.fromtimestamp(rev_time) \
                 .astimezone(timezone('Australia/Melbourne'))).split('.')[0]
        return dt

    def __date_updated_sort_key(self, x):
        """
        Sort so that the most recent dates come first,
        then sort by state, datatype and name
        """
        def sortable_date(i):
            # Invert each component so ascending sort yields newest first
            yyyy, mm, dd = i.split('_')
            return (
                str(9999 - int(yyyy)) + '_' +
                str(99 - int(mm)) + '_' +
                str(99 - int(dd))
            )

        return (
            sortable_date(x.date_updated),
            x.region_parent,
            x.region_child,
            x.datatype,
            x.agerange
        )

    def __generic_sort_key(self, x):
        """
        Sort only by state, datatype and name, ignoring date
        """
        return (
            x.region_parent,
            x.region_child,
            x.datatype,
            x.agerange
        )

    @needsdatapoints
    def get_datatypes_by_source_id(self, source_id):
        """
        Return the datatypes available for ``source_id``.

        Decorated with @needsdatapoints (like every other DB accessor)
        so the sqlite DB is lazily opened before use.
        """
        return self._datapoints_db.get_datatypes_by_source_id(source_id)

    def iter_rows(self, source_id, datatype):
        """
        Yield ``(row, values)`` pairs for ``source_id``/``datatype``:
        ``row`` identifies the region/agerange, ``values`` is a sorted
        list of ``(date_updated, value)`` tuples (dates in 'YYYY-MM-DD').
        """
        datapoints = self.get_datapoints_by_source_id(
            source_id, datatype, add_source_urls=False
        )

        by_unique_key = {}
        for datapoint in datapoints:
            unique_key = (
                datapoint.region_schema,
                datapoint.region_parent,
                datapoint.region_child,
                datapoint.agerange
            )
            by_unique_key.setdefault(unique_key, {})[
                datapoint.date_updated.replace('_', '-')
            ] = datapoint.value

        for (region_schema, region_parent, region_child, agerange), values in sorted(by_unique_key.items()):
            row = {
                'region_schema': region_schema,
                'region_parent': region_parent,
                'region_child': region_child,
                'agerange': agerange
            }
            yield row, list(sorted(values.items()))

    #=============================================================#
    #                       Get DataPoints                        #
    #=============================================================#

    def get_combined_values_by_datatype(self, region_schema, datatypes,
                                        from_date=None,
                                        region_parent=None, region_child=None):
        """
        Returns as a combined dict,
        e.g. if datatypes a list of ((datatype, name/None), ...) is (
            "DataTypes.AGE",
            "DataTypes.AGE_FEMALE",
        )
        it will output as [{
            'name': (e.g.) '70+',
            'date_updated': ...,
            'DataTypes.AGE': ...,
            'DataTypes.AGE_FEMALE': ...
        }, ...]
        """
        if region_parent:
            region_parent = region_parent.lower()
        if region_child:
            region_child = region_child.lower()

        combined = {}
        for datatype in datatypes:
            for datapoint in self.get_combined_value(region_schema, datatype,
                                                     from_date=from_date,
                                                     region_parent=region_parent,
                                                     region_child=region_child):

                # Key entries by agerange and/or region_child
                if datapoint.agerange and datapoint.region_child:
                    k = f"{datapoint.agerange} {datapoint.region_child}"
                elif datapoint.agerange:
                    k = datapoint.agerange or ''
                else:
                    k = datapoint.region_child or ''

                i_combined = combined.setdefault(datapoint.region_parent, {}) \
                                     .setdefault(k, {})

                if (
                    not 'date_updated' in i_combined or
                    datapoint.date_updated < i_combined['date_updated']
                ):
                    # Use the least recent date
                    i_combined['date_updated'] = datapoint.date_updated
                    i_combined['date_today'] = datetime.datetime.now() \
                        .strftime('%Y_%m_%d')

                i_combined['agerange'] = datapoint.agerange
                i_combined['region_child'] = datapoint.region_child
                i_combined['region_parent'] = datapoint.region_parent
                i_combined['region_schema'] = datapoint.region_schema

                # Only take the first (most relevant) value per datatype
                if not datatype.value in i_combined:
                    i_combined[datatype.value] = datapoint.value
                    i_combined[f'{datatype.value} date_updated'] = datapoint.date_updated
                    i_combined[f'{datatype.value} source_url'] = datapoint.source_url

        out = []
        for i_combined in combined.values():
            for add_me in i_combined.values():
                out.append(add_me)
        return out

    def get_combined_values(self, filters, from_date=None):
        """
        Returns as a combined dict,
        e.g. if filters (a list of ((region_schema, datatype, region_parent/None), ...) is (
            (DataTypes.PATIENT_STATUS, "Recovered"),
            (DataTypes.PATIENT_STATUS, "ICU")
        )
        it will output as [{
            'date_updated': ...,
            'DataTypes.PATIENT_STATUS (Recovered)': ...,
            'DataTypes.PATIENT_STATUS (ICU)': ...
        }, ...]
        """

        combined = {}
        for region_schema, datatype, region_parent in filters:
            if region_parent:
                region_parent = region_parent.lower()

            for datapoint in self.get_combined_value(region_schema,
                                                     datatype,
                                                     region_parent,
                                                     from_date=from_date):

                i_combined = combined.setdefault(
                    (datapoint.region_parent, datapoint.region_child), {}
                )

                if (
                    not 'date_updated' in i_combined or
                    datapoint.date_updated < i_combined['date_updated']
                ):
                    # Use the least recent date
                    i_combined['date_updated'] = datapoint.date_updated
                    i_combined['date_today'] = datetime.datetime.now() \
                        .strftime('%Y_%m_%d')

                k = datatype
                if datapoint.agerange:
                    k = f"{k} ({datapoint.agerange})"

                i_combined['region_parent'] = datapoint.region_parent
                i_combined['region_child'] = datapoint.region_child
                i_combined['agerange'] = datapoint.agerange
                i_combined['region_schema'] = datapoint.region_schema

                # Only take the first (most relevant) value per key
                if not k in i_combined:
                    i_combined[k] = datapoint.value
                    i_combined[f'{k} date_updated'] = datapoint.date_updated
                    i_combined[f'{k} source_url'] = datapoint.source_url
                    i_combined[f'{k} text_match'] = datapoint.text_match or ''

        out = []
        for i_combined in combined.values():
            out.append(i_combined)
        return out

    @needsdatapoints
    def get_combined_value(self, region_schema, datatype,
                           region_parent=None, region_child=None,
                           from_date=None):
        """
        Filter `datapoints` to have only `datatype` (e.g. "DataTypes.PATIENT_STATUS"),
        and optionally only have `name` (e.g. "Recovered" or "None" as a string)

        Returns only the most recent value (optionally from `from_date`)
        """

        if region_child: region_child = region_child.lower()
        if region_parent: region_parent = region_parent.lower()

        datapoints = self._datapoints_db.select_many(
            region_schema=['= ?', [region_schema]] if region_schema is not None else None,
            region_parent=['= ?', [region_parent]] if region_parent is not None else None,
            region_child=['= ?', [region_child]] if region_child is not None else None,
            date_updated=['<= ?', [from_date]] if from_date is not None else None,
            datatype=['= ?', [datatype]] if datatype is not None else None,
            order_by='date_updated DESC',
            add_source_url=True
        )

        if not datapoints:
            print(f"WARNING: not found for {region_parent}, {region_schema}, {datatype}")

        r = {}
        for datapoint in datapoints:
            # Note we're restricting to only `datatype` already,
            # so no need to include it in the key.
            # Rows arrive newest-first, so the first datapoint seen per
            # key is the most recent one — keep it, skip the rest.
            unique_k = (
                datapoint.region_parent,
                datapoint.agerange,
                datapoint.region_child
            )
            if unique_k in r:
                continue
            r[unique_k] = datapoint

        r = list(r.values())
        r.sort(key=self.__generic_sort_key)
        return r
Example #6
0
                    DataPoint(region_schema=region_schema,
                              region_parent=region_parent,
                              region_child=region_child,
                              datatype=datatype,
                              date_updated=date_updated,
                              agerange=None,
                              value=value,
                              source_url='DERIVED'))

        self.datapoints_db.extend(source_id,
                                  append_datapoints,
                                  is_derived=True)


if __name__ == '__main__':
    from covid_db.SQLiteDataRevisions import SQLiteDataRevisions
    from covid_db.DataPointsDB import DataPointsDB
    from _utility.get_package_dir import get_output_dir

    OUTPUT_DIR = get_output_dir() / 'output'

    # Derive extra datapoints in-place for the most recent revision
    latest = SQLiteDataRevisions().get_revisions()[0]
    period, subperiod_id = latest[0], latest[1]

    dpdb = DataPointsDB(OUTPUT_DIR / f'{period}-{subperiod_id}.sqlite')
    DerivedData(dpdb).add_derived()
    dpdb.close()