Code example #1
def addtoQuilt(df_new, name):
    import quilt
    from quilt.data.nmduarte import gdelt3

    if name=="data_with_news":
        d = gdelt3.data.data_with_news()
    else:
        d = gdelt3.data.events()

    #print("Original data has :", original_data.count())

    df_new2= df_new.toPandas()

    print("Appending:", df_new2.count())

    print("original: ",type(d))
    print("new: ", type(df_new2))

    d = d.append(df_new2)
    print("TOTAL:", d.count())

    # gdelt3._set(["data","data_with_news"],df)

    # data_with_news['new_column'] = "aaaaaa"
    # data_with_news['new_column2'] = "bbbbb"

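    # Rebuild the subpackage with the appended data and push it to the registry.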
    quilt.build("nmduarte/gdelt3/data/"+name, d)
    quilt.push("nmduarte/gdelt3/data/"+name, is_public=True, is_team=False)
Code example #2
    def _build_file_as_package(self, filepath: Union[str, pathlib.Path],
                               package_name: str) -> str:
        # enforce types
        checks.check_types(filepath, [str, pathlib.Path])
        checks.check_types(package_name, str)
        checks.check_file_exists(filepath)

        # convert types
        filepath = pathlib.Path(filepath)
        filepath = filepath.expanduser()
        filepath = filepath.resolve()

        # construct manifest
        load = {}
        load["file"] = str(filepath)
        load["transform"] = "id"
        contents = {"load": load}
        node = {"contents": contents}

        # write temporary manifest
        temp_write_loc = pathlib.Path(os.getcwd())
        temp_write_loc /= "single_file.yml"
        with open(temp_write_loc, "w") as write_out:
            yaml.dump(node, write_out, default_flow_style=False)

        # create quilt node
        full_package_name = self.storage_user + "/" + package_name
        quilt.build(full_package_name, str(temp_write_loc))

        # remove the temp file
        os.remove(temp_write_loc)

        return full_package_name
Code example #3
import tempfile

import fastparquet
import quilt


def df_to_quilt(df, path):
    parts = path.split('/')
    assert len(parts) > 2

    root_pkg = '/'.join(parts[0:2])
    try:
        quilt.install(root_pkg, force=True)
    except Exception:
        pass

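    # Cast nullable integer columns (Int8/Int32) to object and encode them as int32 for fastparquet.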
    object_encoding = {}
    df = df.copy()
    for col, dtype in df.dtypes.iteritems():
        if dtype.name in ('Int8', 'Int32'):
            object_encoding[col] = 'int32'
            df[col] = df[col].astype(object)
        else:
            object_encoding[col] = 'infer'

    with tempfile.NamedTemporaryFile(suffix='.parquet') as f:
        print('writing to %s' % f.name)
        fastparquet.write(f.name,
                          df,
                          compression='snappy',
                          object_encoding=object_encoding)
        print('build')
        quilt.build(path, f.name)
        print('push')
        quilt.push(root_pkg, is_public=True)
Code example #4
def update_quilt_datasets():
    import quilt
    from quilt.data.jyrjola import karttahel

    df = get_buildings()
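    # Replace the 'buildings' subnode on the in-memory package node, then rebuild and push it.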
    karttahel._set(['buildings'], df)
    quilt.build('jyrjola/karttahel', karttahel)
    quilt.push('jyrjola/karttahel', is_public=True)
Code example #5
    def _get_root_node(self):
        store, package = self.find_package()
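        # If the package is not in the local store yet, build an empty one and look it up again.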
        if not package:
            quilt.build(self.package_name)
            store, package = self.find_package()

        root_node = _from_core_node(store, package)
        return root_node
Code example #6
    def update(self, df):
        root_node = self._get_root_node()

        df, meta = pint_df_to_quilt(df)
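        # Store the plain DataFrame under sub_path and attach the unit metadata to the new data node.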
        root_node._set([self.sub_path], df)
        data_node = getattr(root_node, self.sub_path)
        data_node._meta.update(meta)

        quilt.build(self.package_name, root_node)
Code example #7
def update_quilt_datasets():
    QUILT_TARGET = 'jyrjola/statfi'
    from quilt.data.jyrjola import statfi as node
    import quilt
    import requests_cache
    requests_cache.install_cache()

    df = get_fuel_classification()
    df.to_csv('fuel_classification.csv')
    print(df)
    exit()
    node._set(['fuel_classification'], df)
    quilt.build(QUILT_TARGET, node)
    quilt.push(QUILT_TARGET, is_public=True)
Code example #8
File: traficom.py Project: kausaltech/ghg-notebooks
def refresh_pxweb_datasets():
    import requests_cache
    requests_cache.install_cache()

    api = PXWebAPI('http://trafi2.stat.fi/PXWeb', 'fi')
    # print(api.list_topics('TraFi/Ensirekisteroinnit'))
    # exit()

    for path, table in PXWEB_TABLES:
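        # Fetch each PX table and attach it to the package as a 'tf'-prefixed subnode.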
        print(path, table)
        pxf = api.get_table('%s/%s.px' % (path, table))
        table = 'tf%s' % table
        root_node = update_node_from_pcaxis(QUILT_DATASET, table, pxf)

    quilt.build(QUILT_DATASET, root_node)
    quilt.push(QUILT_DATASET, is_public=True)
Code example #9
    def merge(self, df):
        root_node = self._get_root_node()

        data_node = getattr(root_node, self.sub_path, None)
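        # Nothing to merge into yet; fall back to a plain update.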
        if data_node is None:
            return self.update(df)

        old_df = quilt_to_pint_df(data_node)
        merged = old_df.append(df).sort_index()
        # Remove duplicate rows
        merged = merged[~merged.index.duplicated(keep='first')]

        merged, meta = pint_df_to_quilt(merged)

        root_node._set([self.sub_path], merged)
        data_node = getattr(root_node, self.sub_path)
        data_node._meta.update(meta)

        quilt.build(self.package_name, root_node)

        return merged
Code example #10
File: quilt.py Project: jalabort/nucleus
def update_pkg(
    df: pd.DataFrame,
    user: str,
    package: str,
    readme: Optional[str] = None,
    hash_key=None,
):
    r"""

    Parameters
    ----------
    df
        DataFrame stored under the package's ``df`` node.
    user
        Quilt user (namespace) that owns the package.
    package
        Name of the package to build and push.
    readme
        Optional README contents attached to the package.
    hash_key
        Optional hash passed to ``quilt.push``.

    Returns
    -------
    None

    """
    pkg_path = f'{user}/{package}'
    quilt.build(pkg_path, quilt.nodes.GroupNode(dict(author='@hudlrd')))

    quilt.build(f'{pkg_path}/df', quilt.nodes.DataNode(None, None, df, {}))

    # TODO: warn the user if readme is not provided
    if readme is not None:
        with NamedTemporaryFile() as tmp:
            tmp.write(readme.encode('UTF-8'))
            tmp.flush()
            quilt.build(f'{pkg_path}/README', tmp.name)

    quilt.login()
    quilt.push(pkg_path, is_public=True, hash=hash_key)
Code example #11
File: gdelt.py Project: nmduarteus/insight-gdelt
def upload_to_quilt(spark, schemas_dic):
    """
    Upload data to quilt and append it to the already existing data.
    :param spark: Spark session
    :param schemas_dic: dictionary of dataset schemas
    :return: None
    """

    # remove old data and get new one
    logging.info("Installing quilt gdelt data...")
    quilt.rm("nmduarte/gdelt", force=True)
    quilt.install("nmduarte/gdelt", force=True)
    from quilt.data.nmduarte import gdelt

    # get the old data from quilt
    logging.info("getting data from quilt...")
    events_from_quilt = gdelt.events()
    mentions_from_quilt = gdelt.mentions()
    news_from_quilt = gdelt.news()

    # transform the data into dataframes so it can be appended
    logging.info("Creating dataframes from quilt data...")
    events_from_quilt_df = spark.createDataFrame(events_from_quilt,
                                                 schema=schemas_dic['events2'])
    mentions_from_quilt_df = spark.createDataFrame(
        mentions_from_quilt, schema=schemas_dic['mentions'])
    news_from_quilt_df = spark.createDataFrame(news_from_quilt,
                                               schema=schemas_dic['news'])

    # mentions data - new data
    logging.info("Reading last 15min data from S3...")
    mentions_df = tools.read_from_s3_enriched(spark, "mentions",
                                              schemas_dic['mentions'],
                                              cmd_opts.date)
    events_df = tools.read_from_s3_enriched(spark, "events",
                                            schemas_dic['events2'],
                                            cmd_opts.date)
    news_df = tools.read_from_s3_enriched(spark, "news", schemas_dic['news'],
                                          cmd_opts.date)

    # concatenate already existing data with new data
    logging.info("Appending data to old quilt data...")
    mentions_concat = mentions_from_quilt_df.union(mentions_df)
    events_concat = events_from_quilt_df.union(events_df)
    news_concat = news_from_quilt_df.union(news_df)

    # build the 3 packages
    logging.info("Building quilt packages...")
    quilt.build("nmduarte/gdelt/mentions", mentions_concat.toPandas())
    quilt.build("nmduarte/gdelt/events", events_concat.toPandas())
    quilt.build("nmduarte/gdelt/news", news_concat.toPandas())

    # push the 3 packages
    logging.info("Pushing quilt info...")
    quilt.push("nmduarte/gdelt/mentions", is_public=True, is_team=False)
    quilt.push("nmduarte/gdelt/events", is_public=True, is_team=False)
    quilt.push("nmduarte/gdelt/news", is_public=True, is_team=False)
Code example #12
def update_node_from_pcaxis(root_node_or_path, sub_path, px_file):
    assert '/' not in sub_path

    if isinstance(root_node_or_path, str):
        root_path = root_node_or_path
        root_mod_path = root_path.replace('/', '.')
        try:
            root_node = importlib.import_module('quilt.data.%s' %
                                                root_mod_path)
        except ImportError:
            quilt.build(root_path)
            root_node = importlib.import_module('quilt.data.%s' %
                                                root_mod_path)
    else:
        root_node = root_node_or_path

    df = px_file.to_df(melt=True, dropna=True)
    root_node._set([sub_path], df)
    meta = dict(px_file.meta)
    for key, val in meta.items():
        if isinstance(val, collections.OrderedDict):
            meta[key] = dict(val)
        elif isinstance(val, datetime):
            meta[key] = val.isoformat()

    try:
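        # Make sure the metadata is JSON-serializable; print it and re-raise if it is not.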
        import json
        json.dumps(meta, sort_keys=True)
    except Exception:
        from pprint import pprint
        pprint(meta)
        raise

    getattr(root_node, sub_path)._meta['pxmeta'] = meta

    return root_node
Code example #13
def uploadToQuilt(spark):
    #downloads the data from s3
    print("Getting schemas..")
    events_schema, mentions_schema, news_schema, events_schema2 = set_schemas()

    # quilt.install("nmduarte/gdelt3")

    # mentions data
    print("Getting mention data..")
    mentions_df = read_from_s3_enriched(spark, "mentions", mentions_schema, cmd_opts.date)
    mentions_df.show()
    mentions_df.write.csv("tmp_data/mentions", header="true", mode="overwrite")

    events_df = read_from_s3_enriched(spark, "events", events_schema2, cmd_opts.date)
    events_df.write.csv("tmp_data/events", header="true", mode="overwrite")

    news_df = read_from_s3_enriched(spark, "news", news_schema, cmd_opts.date)
    news_df.write.csv("tmp_data/news", header="true", mode="overwrite")

    #news_df.write.csv("hdfs://10.0.0.13/ubuntu/hdfs/data/example.csv")

    #news_df.show()

    #quilt.build("nmduarte/gdelt8_news")
    #from quilt.data.nmduarte import gdelt8_news
    #news1 = pd.read_csv("tmp_data/news/part-00000-0f8595b0-2bd0-4156-9254-78e7b5cfa5c9-c000.csv", engine='python', escapechar="\\")
    #gdelt8_news._set(['bar'], news1)
    #print(gdelt8_news.bar())
    #quilt.push("nmduarte/gdelt8_news", is_public=True)

    #quilt.build("nmduarte/gdelt_news","tmp_data/news")
    # put some data in it
    #from quilt.data.nmduarte import gdelt9_news
    #df = pd.DataFrame(data=[1, 2, 3])
    #gdelt9_news._set(['bar'], df)
    #print(gdelt9_news.bar())
    #quilt.push("nmduarte/gdelt_news", is_public=True)

    #print(news1.head())

    # build the 3 packages
    quilt.build("nmduarte/gdelt_mentions", "tmp_data/mentions")
    quilt.build("nmduarte/gdelt_events", "tmp_data/events")
    quilt.build("nmduarte/gdelt_news", "tmp_data/news")

    # push the 3 packages
    quilt.push("nmduarte/gdelt_mentions", is_public=True, is_team=False)
    quilt.push("nmduarte/gdelt_events", is_public=True, is_team=False)
    quilt.push("nmduarte/gdelt_news", is_public=True, is_team=False)
Code example #14
File: data.py Project: emorrow3/geosnap
sys.path.insert(0,
                os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from util import adjust_inflation, convert_gdf

try:
    from quilt.data.spatialucr import census
except ImportError:
    warn("Fetching data. This should only happen once")
    quilt.install("spatialucr/census")
    quilt.install("spatialucr/census_cartographic")
    from quilt.data.spatialucr import census
try:
    from quilt.data.geosnap_data import data_store
except ImportError:
    quilt.build("geosnap_data/data_store")
    from quilt.data.geosnap_data import data_store


class Bunch(dict):
    """A dict with attribute-access."""
    def __getattr__(self, key):
        try:
            return self.__getitem__(key)
        except KeyError:
            raise AttributeError(key)

    def __setattr__(self, key, value):
        self.__setitem__(key, value)

    def __dir__(self):
Code example #15
t = time.time()
CopyImages(folds, allImageIDs, dataSource, packagePath, logPath, imagePrefix)
copyTime = time.time() - t

print('########Copy Completed#########')

FilterAndSavePandasTable(folds, allImageIDs, annList, packagePath, logPath)
print('######## Panda Table Generated#########')

GenerateREADME(packagePath + 'README.md', classDescription)
if (os.path.exists(packagePath + 'build.yml')):
    os.remove(packagePath + 'build.yml')

t = time.time()
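# Generate a build.yml from the package directory and build an initial package from it.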
quilt.generate(packagePath)
quilt.build(quiltUser + '/' + classDescription, packagePath + 'build.yml')

pkgNode = quilt.load(quiltUser + '/' + classDescription)

pkgNode._meta['trainable'] = classID in oi.classes_trainable().values
pkgNode._meta['labelName'] = classDescription
numImages = GetNumImages(folds, allImageIDs)
pkgNode._meta['image_count'] = numImages

GenerateImageMetadata(folds, allImageIDs, pkgNode, annList, logPath,
                      imagePrefix)
print('######## Image Metadata Generated#########')

quilt.build(quiltUser + '/' + classDescription, pkgNode)
print('######## New Package Generated#########')
Code example #16
File: data_prep.py Project: jtaquia/deepbedmap
np.save(file="model/train/W2_data.npy", arr=measuresvelocity)
np.save(file="model/train/W3_data.npy", arr=accumulation)
np.save(file="model/train/X_data.npy", arr=lores)
np.save(file="model/train/Y_data.npy", arr=hires)

# %% [markdown]
# ### Quilt
#
# Login -> Build -> Push

# %%
quilt.login()

# %%
# Tiled datasets for training neural network
quilt.build(package="weiji14/deepbedmap/model/train/W1_data", path=rema)
quilt.build(package="weiji14/deepbedmap/model/train/W2_data",
            path=measuresvelocity)
quilt.build(package="weiji14/deepbedmap/model/train/W3_data",
            path=accumulation)
quilt.build(package="weiji14/deepbedmap/model/train/X_data", path=lores)
quilt.build(package="weiji14/deepbedmap/model/train/Y_data", path=hires)

# %%
# Original datasets for neural network predictions on bigger area
quilt.build(package="weiji14/deepbedmap/lowres/bedmap2_bed",
            path="lowres/bedmap2_bed.tif")
quilt.build(
    package="weiji14/deepbedmap/misc/REMA_100m_dem_filled",
    path="misc/REMA_100m_dem_filled.tif",
)
Code example #17
File: deepbedmap.py Project: next-mooon/deepbedmap
            print("Done!")

    return X_tile, W1_tile, W2_tile, W3_tile


# %%
X_tile, W1_tile, W2_tile, W3_tile = get_deepbedmap_model_inputs(
    window_bound=window_bound)
print(X_tile.shape, W1_tile.shape, W2_tile.shape, W3_tile.shape)

# Build quilt package for datasets covering our test region
reupload = False
if reupload:
    bounds_str = "_".join(str(int(b)) for b in window_bound).replace("-", "m")
    quilt.build(package=f"weiji14/deepbedmap/model/test/{bounds_str}/W1_tile",
                path=W1_tile)
    quilt.build(package=f"weiji14/deepbedmap/model/test/{bounds_str}/W2_tile",
                path=W2_tile)
    quilt.build(package=f"weiji14/deepbedmap/model/test/{bounds_str}/W3_tile",
                path=W3_tile)
    quilt.build(package=f"weiji14/deepbedmap/model/test/{bounds_str}/X_tile",
                path=X_tile)
    quilt.push(package=f"weiji14/deepbedmap/model/test/{bounds_str}",
               is_public=True)


# %%
def subplot(directive: str, row: int = None, col: int = None, **kwargs):
    """Thin wrapper around https://docs.generic-mapping-tools.org/latest/subplot.html"""
    with gmt.clib.Session() as lib:
        rowcol = ""  # default is blank, e.g. when directive == "end"
Code example #18
File: osm.py Project: kausaltech/ghg-notebooks
          ST_Length(ST_Intersection(way, (SELECT way FROM munigeom))) AS length,
          ST_Transform(ST_SetSRID(way, 3067), 4326) AS way
        FROM planet_osm_line
        WHERE (highway='cycleway' OR tags ? 'cycleway') AND
          ST_Intersects(way, (SELECT way FROM munigeom))""" % (muni_sql,
                                                               col_sql)

    df = gpd.GeoDataFrame.from_postgis(sql, con, geom_col='way')
    return df


if __name__ == '__main__':
    import sys
    import quilt
    from quilt.data.jyrjola import osm

    data_date = datetime.strptime(sys.argv[1], '%Y-%m-%d').date()
    print("Executing SQL...")
    df = get_bike_lanes('helsinki')

    df['date'] = data_date
    # Quilt is unable to store geometry data, so drop the geometry
    # column for now.
    df.drop('way', inplace=True, axis=1)
    print("%d rows received, total length %d km" %
          (len(df), df['length'].sum() / 1000))

    old_df = osm.helsinki_bike_lanes()
    quilt.build('jyrjola/osm/helsinki_bike_lanes', old_df.append(df))
    #quilt.push('jyrjola/osm', is_public=True)
Code example #19
def update_quilt(quilt_path):
    import os
    import glob
    import settings

    def upload_px_dataset(root_node, file):
        fname = os.path.splitext(os.path.basename(file))[0]
        if ('hginseutu' not in fname.lower() and 'UM' not in fname
                and 'hki' not in fname.lower()):
            return

        print(fname)
        if re.match('^[0-9]', fname):
            # If the name begins with a number, prefix it with a letter
            # to make it a legal Python identifier.
            fname = 'z' + fname

        fname = fname.replace('-', '_').lower()

        content = open(file, 'r', encoding='windows-1252').read()
        parser = PxParser()
        try:
            file = parser.parse(content)
        except Exception as e:
            print(e)
            return

        now = datetime.now()

        parser = PxParser()
        file = parser.parse(content)
        now = datetime.now()
        from pprint import pprint
        #if 'last_updated' not in file.meta or (now - file.meta['last_updated']) > timedelta(days=2 * 365):
        #    return

        print("\t%s" % file.meta['contents'])

        if root_node:
            quilt_target = root_node
        else:
            quilt_target = quilt_path

        node = update_node_from_pcaxis(quilt_target, fname, file)
        return node

    SKIP_FILES = []

    data_dir = os.path.join(settings.DATA_DIR, 'aluesarjat')
    files = glob.glob('%s/*.px' % data_dir)
    skip_until = None

    root_node = None
    for file in files:
        if 'A01S_HKI_Rak' not in file:
            continue
        if skip_until:
            if skip_until not in file:
                continue
            skip_until = None

        skip = False
        for sf in SKIP_FILES:
            if sf in file:
                skip = True
                break
        if skip:
            continue

        ret = upload_px_dataset(root_node, file)
        if ret:
            root_node = ret

    assert root_node
    quilt.build(quilt_path, root_node)
    quilt.push(quilt_path, is_public=True)
Code example #20
File: traficom.py Project: kausaltech/ghg-notebooks
if __name__ == '__main__':
    refresh_pxweb_datasets()
    exit()

    import quilt
    try:
        pass
        #quilt.install('jyrjola/traficom', force=True)
    except Exception:
        pass

    FORCE_QUARTERS = ['2019q3']

    # quilt.push('jyrjola/traficom', is_public=True)
    from quilt.data.jyrjola import traficom  # noqa

    for url, quarter in VEHICLE_URLS:
        print(url)
        dataset_name = 'vehicle_register_%s' % quarter
        if dataset_name in traficom._keys() and quarter not in FORCE_QUARTERS:
            print('skipping')
            continue
        outfn = '/tmp/out-%s.pq' % quarter
        if not os.path.exists(outfn):
            fetch_road_vehicle_register(url.split('/')[-1], quarter, outfn)
        print('build')
        quilt.build('jyrjola/traficom/%s' % dataset_name,
                    path='/tmp/out-%s.pq' % quarter)
        print('push')
        quilt.push('jyrjola/traficom', is_public=True)
Code example #21
File: data.py Project: emorrow3/geosnap
def read_ncdb(filepath):
    """
    Read & store data from Geolytics's Neighborhood Change Database.

    Parameters
    ----------
    filepath : str
        location of the input CSV file extracted from your Geolytics DVD

    Returns
    -------
    pandas.DataFrame

    """
    ncdb_vars = dictionary["ncdb"].dropna()[1:].values

    names = []
    for name in ncdb_vars:
        for suffix in ['7', '8', '9', '0', '1', '2']:
            names.append(name + suffix)
    names.append('GEO2010')

    c = pd.read_csv(filepath, nrows=1).columns
    c = pd.Series(c.values)

    keep = []
    for i, col in c.items():
        for name in names:
            if col.startswith(name):
                keep.append(col)

    df = pd.read_csv(
        filepath,
        usecols=keep,
        engine='c',
        na_values=["", " ", 99999, -999],
        converters={
            "GEO2010": str,
            "COUNTY": str,
            "COUSUB": str,
            "DIVISION": str,
            "REGION": str,
            "STATE": str,
        },
    )

    cols = df.columns
    fixed = []
    for col in cols:
        if col.endswith("D"):
            fixed.append("D" + col[:-1])
        elif col.endswith("N"):
            fixed.append("N" + col[:-1])
        elif col.endswith("1A"):
            fixed.append(col[:-2] + "2")

    orig = []
    for col in cols:
        if col.endswith("D"):
            orig.append(col)
        elif col.endswith("N"):
            orig.append(col)
        elif col.endswith("1A"):
            orig.append(col)

    renamer = dict(zip(orig, fixed))
    df.rename(renamer, axis="columns", inplace=True)

    df = df[df.columns[df.columns.isin(names)]]

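    # Reshape from wide (variable + year-suffix columns) to long format with an explicit year column.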
    df = pd.wide_to_long(df,
                         stubnames=ncdb_vars,
                         i="GEO2010",
                         j="year",
                         suffix="(7|8|9|0|1|2)").reset_index()

    df["year"] = df["year"].replace({
        7: 1970,
        8: 1980,
        9: 1990,
        0: 2000,
        1: 2010,
        2: 2010
    })
    df = df.groupby(["GEO2010", "year"]).first()

    mapper = dict(zip(dictionary.ncdb, dictionary.variable))

    df.reset_index(inplace=True)

    df = df.rename(mapper, axis="columns")

    df = df.set_index("geoid")

    for row in dictionary['formula'].dropna().tolist():
        try:
            df.eval(row, inplace=True)
        except Exception:
            warn('Unable to compute ' + str(row))

    df = df.round(0)

    keeps = df.columns[df.columns.isin(dictionary['variable'].tolist() +
                                       ['year'])]

    df = df[keeps]

    df = df.loc[df.n_total_pop != 0]

    data_store._set(['ncdb'], df)
    quilt.build("geosnap_data/data_store", data_store)
Code example #22
File: deepbedmap.py Project: jtaquia/deepbedmap
                )
            print("Done!")

    return X_tile, W1_tile, W2_tile, W3_tile


# %%
X_tile, W1_tile, W2_tile, W3_tile = get_deepbedmap_model_inputs(
    window_bound=window_bound
)
print(X_tile.shape, W1_tile.shape, W2_tile.shape, W3_tile.shape)

# Build quilt package for datasets covering our test region
reupload = False
if reupload:
    quilt.build(package="weiji14/deepbedmap/model/test/W1_tile", path=W1_tile)
    quilt.build(package="weiji14/deepbedmap/model/test/W2_tile", path=W2_tile)
    quilt.build(package="weiji14/deepbedmap/model/test/W3_tile", path=W3_tile)
    quilt.build(package="weiji14/deepbedmap/model/test/X_tile", path=X_tile)
    quilt.push(package="weiji14/deepbedmap/model/test", is_public=True)


# %%
def plot_3d_view(
    img: np.ndarray,
    ax: matplotlib.axes._subplots.Axes,
    elev: int = 60,
    azim: int = 330,
    z_minmax: tuple = None,
    title: str = None,
    zlabel: str = None,
Code example #23
File: data.py Project: emorrow3/geosnap
def read_ltdb(sample, fullcount):
    """
    Read & store data from Brown's Longitudinal Tract Database (LTDB).

    Parameters
    ----------
    sample : str
        file path of the zip file containing the standard Sample CSV files
        downloaded from
        https://s4.ad.brown.edu/projects/diversity/Researcher/LTBDDload/Default.aspx

    fullcount: str
        file path of the zip file containing the standard Fullcount CSV files
        downloaded from
        https://s4.ad.brown.edu/projects/diversity/Researcher/LTBDDload/Default.aspx

    Returns
    -------
    pandas.DataFrame

    """
    sample_zip = zipfile.ZipFile(sample)
    fullcount_zip = zipfile.ZipFile(fullcount)

    def _ltdb_reader(path, file, year, dropcols=None):

        df = pd.read_csv(
            path.open(file),
            na_values=["", " ", 99999, -999],
            converters={
                0: str,
                "placefp10": str
            },
            low_memory=False,
            encoding="latin1",
        )

        if dropcols:
            df.drop(dropcols, axis=1, inplace=True)
        df.columns = df.columns.str.lower()
        names = df.columns.values.tolist()
        names[0] = "geoid"
        newlist = []

        # ignoring the first 4 columns, remove year suffix from column names
        for name in names[4:]:
            newlist.append(name[:-2])
        colnames = names[:4] + newlist
        df.columns = colnames

        # prepend a 0 when FIPS is too short
        df["geoid"] = df["geoid"].str.rjust(11, "0")
        df.set_index("geoid", inplace=True)

        df["year"] = year

        inflate_cols = [
            "mhmval", "mrent", "incpc", "hinc", "hincw", "hincb", "hinch",
            "hinca"
        ]

        inflate_available = list(
            set(df.columns).intersection(set(inflate_cols)))

        if len(inflate_available):
            # try:
            df = adjust_inflation(df, inflate_available, year)
        # except KeyError:  # half the dfs don't have these variables
        #     pass
        return df

    # Read in Brown's LTDB data, both the sample and fullcount files, for each
    # year. Population, housing units & occupied housing units appear in both
    # "sample" and "fullcount" files -- currently drop sample and keep fullcount.

    sample70 = _ltdb_reader(
        sample_zip,
        "ltdb_std_all_sample/ltdb_std_1970_sample.csv",
        dropcols=["POP70SP1", "HU70SP", "OHU70SP"],
        year=1970,
    )

    fullcount70 = _ltdb_reader(fullcount_zip,
                               "LTDB_Std_1970_fullcount.csv",
                               year=1970)

    sample80 = _ltdb_reader(
        sample_zip,
        "ltdb_std_all_sample/ltdb_std_1980_sample.csv",
        dropcols=["pop80sf3", "pop80sf4", "hu80sp", "ohu80sp"],
        year=1980,
    )

    fullcount80 = _ltdb_reader(fullcount_zip,
                               "LTDB_Std_1980_fullcount.csv",
                               year=1980)

    sample90 = _ltdb_reader(
        sample_zip,
        "ltdb_std_all_sample/ltdb_std_1990_sample.csv",
        dropcols=["POP90SF3", "POP90SF4", "HU90SP", "OHU90SP"],
        year=1990,
    )

    fullcount90 = _ltdb_reader(fullcount_zip,
                               "LTDB_Std_1990_fullcount.csv",
                               year=1990)

    sample00 = _ltdb_reader(
        sample_zip,
        "ltdb_std_all_sample/ltdb_std_2000_sample.csv",
        dropcols=["POP00SF3", "HU00SP", "OHU00SP"],
        year=2000,
    )

    fullcount00 = _ltdb_reader(fullcount_zip,
                               "LTDB_Std_2000_fullcount.csv",
                               year=2000)

    sample10 = _ltdb_reader(sample_zip,
                            "ltdb_std_all_sample/ltdb_std_2010_sample.csv",
                            year=2010)

    # join the sample and fullcount variables into a single df for the year
    ltdb_1970 = sample70.drop(columns=['year']).join(fullcount70.iloc[:, 7:],
                                                     how="left")
    ltdb_1980 = sample80.drop(columns=['year']).join(fullcount80.iloc[:, 7:],
                                                     how="left")
    ltdb_1990 = sample90.drop(columns=['year']).join(fullcount90.iloc[:, 7:],
                                                     how="left")
    ltdb_2000 = sample00.drop(columns=['year']).join(fullcount00.iloc[:, 7:],
                                                     how="left")
    ltdb_2010 = sample10

    df = pd.concat([ltdb_1970, ltdb_1980, ltdb_1990, ltdb_2000, ltdb_2010],
                   sort=True)

    renamer = dict(
        zip(dictionary['ltdb'].tolist(), dictionary['variable'].tolist()))

    df.rename(renamer, axis="columns", inplace=True)

    # compute additional variables from lookup table
    for row in dictionary['formula'].dropna().tolist():
        df.eval(row, inplace=True)

    keeps = df.columns[df.columns.isin(dictionary['variable'].tolist() +
                                       ['year'])]
    df = df[keeps]

    data_store._set(['ltdb'], df)
    quilt.build("geosnap_data/data_store", data_store)
Code example #24
File: fmi.py Project: jtuomist/ghg-notebooks
def build_and_push(package, df):
    quilt.build('%s/%s/%s' % (USER, PACKAGE_BASE, package), df)
    quilt.push('%s/%s/%s' % (USER, PACKAGE_BASE, package), is_public=True)