Example #1
def addtoQuilt(df_new, name):
    import quilt
    from quilt.data.nmduarte import gdelt3

    # Pick the existing dataset node to append to.
    if name == "data_with_news":
        d = gdelt3.data.data_with_news()
    else:
        d = gdelt3.data.events()

    # Convert the incoming Spark DataFrame to pandas so it can be appended.
    df_new2 = df_new.toPandas()

    print("Appending:", len(df_new2))
    print("original:", type(d))
    print("new:", type(df_new2))

    d = d.append(df_new2)
    print("TOTAL:", len(d))

    # Rebuild the subpackage with the combined data and push it.
    quilt.build("nmduarte/gdelt3/data/" + name, d)
    quilt.push("nmduarte/gdelt3/data/" + name, is_public=True, is_team=False)
Example #2
import tempfile

import fastparquet
import quilt


def df_to_quilt(df, path):
    # Expect a path of the form 'user/package/node...' (at least three parts).
    parts = path.split('/')
    assert len(parts) > 2

    root_pkg = '/'.join(parts[0:2])
    try:
        quilt.install(root_pkg, force=True)
    except Exception:
        pass

    # Nullable integer columns must be materialized as objects with an
    # explicit encoding so fastparquet can serialize them.
    object_encoding = {}
    df = df.copy()
    for col, dtype in df.dtypes.items():
        if dtype.name in ('Int8', 'Int32'):
            object_encoding[col] = 'int32'
            df[col] = df[col].astype(object)
        else:
            object_encoding[col] = 'infer'

    with tempfile.NamedTemporaryFile(suffix='.parquet') as f:
        print('writing to %s' % f.name)
        fastparquet.write(f.name,
                          df,
                          compression='snappy',
                          object_encoding=object_encoding)
        print('build')
        quilt.build(path, f.name)
        print('push')
        quilt.push(root_pkg, is_public=True)
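A usage sketch, assuming quilt credentials are configured; the package path is a placeholder. Note how the nullable `Int32` column triggers the explicit object encoding:

import pandas as pd

df = pd.DataFrame({
    "id": pd.array([1, None, 3], dtype="Int32"),  # nullable integer column
    "name": ["a", "b", "c"],
})
df_to_quilt(df, "someuser/somepackage/tables/demo")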
Example #3
from tempfile import NamedTemporaryFile
from typing import Optional

import pandas as pd
import quilt


def update_pkg(
    df: pd.DataFrame,
    user: str,
    package: str,
    readme: Optional[str] = None,
    hash_key=None,
):
    r"""Build and push a quilt package containing a single DataFrame.

    Parameters
    ----------
    df
        Data to store under the package's ``df`` node.
    user
        Quilt user name.
    package
        Package name.
    readme
        Optional README contents for the package.
    hash_key
        Optional hash to push.
    """
    pkg_path = f'{user}/{package}'
    quilt.build(pkg_path, quilt.nodes.GroupNode(dict(author='@hudlrd')))

    quilt.build(f'{pkg_path}/df', quilt.nodes.DataNode(None, None, df, {}))

    # TODO: warn the user if readme is not provided
    if readme is not None:
        with NamedTemporaryFile() as tmp:
            tmp.write(readme.encode('UTF-8'))
            tmp.flush()
            quilt.build(f'{pkg_path}/README', tmp.name)

    quilt.login()
    quilt.push(pkg_path, is_public=True, hash=hash_key)
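A call sketch with placeholder user and package names; `quilt.login()` inside the function will prompt for credentials:

import pandas as pd

df = pd.DataFrame({"x": [1, 2, 3]})
update_pkg(df, user="someuser", package="demo", readme="# Demo package")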
Example #4
def update_quilt_datasets():
    import quilt
    from quilt.data.jyrjola import karttahel

    # Replace the 'buildings' node on the local package, then rebuild and push.
    df = get_buildings()
    karttahel._set(['buildings'], df)
    quilt.build('jyrjola/karttahel', karttahel)
    quilt.push('jyrjola/karttahel', is_public=True)
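The snippet assumes a module-level `get_buildings()` helper. A hypothetical stub, only to make the example self-contained:

import pandas as pd

def get_buildings() -> pd.DataFrame:
    # Placeholder: the real helper fetches the building dataset.
    return pd.DataFrame({"id": [1], "name": ["example"]})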
Example #5
def update_quilt_datasets():
    QUILT_TARGET = 'jyrjola/statfi'
    from quilt.data.jyrjola import statfi as node
    import quilt
    import requests_cache
    requests_cache.install_cache()

    df = get_fuel_classification()
    df.to_csv('fuel_classification.csv')
    node._set(['fuel_classification'], df)
    quilt.build(QUILT_TARGET, node)
    quilt.push(QUILT_TARGET, is_public=True)
Example #6
def refresh_pxweb_datasets():
    import quilt
    import requests_cache
    requests_cache.install_cache()

    api = PXWebAPI('http://trafi2.stat.fi/PXWeb', 'fi')

    root_node = None
    for path, table in PXWEB_TABLES:
        print(path, table)
        pxf = api.get_table('%s/%s.px' % (path, table))
        table = 'tf%s' % table
        root_node = update_node_from_pcaxis(QUILT_DATASET, table, pxf)

    quilt.build(QUILT_DATASET, root_node)
    quilt.push(QUILT_DATASET, is_public=True)
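`QUILT_DATASET` and `PXWEB_TABLES` are module-level constants; a hypothetical shape, inferred from how the loop consumes them:

QUILT_DATASET = 'jyrjola/traficom'  # placeholder target package
PXWEB_TABLES = [
    # (PXWeb topic path, table id) -- hypothetical values
    ('TraFi/Ensirekisteroinnit', '010'),
]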
Example #7
def upload_to_quilt(spark, schemas_dic):
    """
    Function to upload data to quilt and to append it to already existing data
    :param spark: Spark Sessuin
    :return: None
    """

    import quilt

    # remove the old data and fetch a fresh copy
    logging.info("Installing quilt gdelt data...")
    quilt.rm("nmduarte/gdelt", force=True)
    quilt.install("nmduarte/gdelt", force=True)
    from quilt.data.nmduarte import gdelt

    # get the old data from quilt
    logging.info("getting data from quilt...")
    events_from_quilt = gdelt.events()
    mentions_from_quilt = gdelt.mentions()
    news_from_quilt = gdelt.news()

    # transform the data into dataframes so it can be appended
    logging.info("Creating dataframes from quilt data...")
    events_from_quilt_df = spark.createDataFrame(events_from_quilt,
                                                 schema=schemas_dic['events2'])
    mentions_from_quilt_df = spark.createDataFrame(
        mentions_from_quilt, schema=schemas_dic['mentions'])
    news_from_quilt_df = spark.createDataFrame(news_from_quilt,
                                               schema=schemas_dic['news'])

    # mentions data - new data
    logging.info("Reading last 15min data from S3...")
    mentions_df = tools.read_from_s3_enriched(spark, "mentions",
                                              schemas_dic['mentions'],
                                              cmd_opts.date)
    events_df = tools.read_from_s3_enriched(spark, "events",
                                            schemas_dic['events2'],
                                            cmd_opts.date)
    news_df = tools.read_from_s3_enriched(spark, "news", schemas_dic['news'],
                                          cmd_opts.date)

    # concatenate already existing data with new data
    logging.info("Appending data to old quilt data...")
    mentions_concat = mentions_from_quilt_df.union(mentions_df)
    events_concat = events_from_quilt_df.union(events_df)
    news_concat = news_from_quilt_df.union(news_df)

    # build the 3 packages
    logging.info("Building quilt packages...")
    quilt.build("nmduarte/gdelt/mentions", mentions_concat.toPandas())
    quilt.build("nmduarte/gdelt/events", events_concat.toPandas())
    quilt.build("nmduarte/gdelt/news", news_concat.toPandas())

    # push the 3 packages
    logging.info("Pushing quilt info...")
    quilt.push("nmduarte/gdelt/mentions", is_public=True, is_team=False)
    quilt.push("nmduarte/gdelt/events", is_public=True, is_team=False)
    quilt.push("nmduarte/gdelt/news", is_public=True, is_team=False)
Example #8
def uploadToQuilt(spark):
    # Read the enriched data from S3 and stage it as local CSV files.
    print("Getting schemas..")
    events_schema, mentions_schema, news_schema, events_schema2 = set_schemas()

    # mentions data
    print("Getting mention data..")
    mentions_df = read_from_s3_enriched(spark, "mentions", mentions_schema, cmd_opts.date)
    mentions_df.show()
    mentions_df.write.csv("tmp_data/mentions", header="true", mode="overwrite")

    events_df = read_from_s3_enriched(spark, "events", events_schema2, cmd_opts.date)
    events_df.write.csv("tmp_data/events", header="true", mode="overwrite")

    news_df = read_from_s3_enriched(spark, "news", news_schema, cmd_opts.date)
    news_df.write.csv("tmp_data/news", header="true", mode="overwrite")

    # build the 3 packages
    quilt.build("nmduarte/gdelt_mentions","tmp_data/mentions")
    quilt.build("nmduarte/gdelt_events", "tmp_data/events")
    quilt.build("nmduarte/gdelt_news", "tmp_data/news")

    # push the 3 packages
    quilt.push("nmduarte/gdelt_mentions", is_public=True, is_team=False)
    quilt.push("nmduarte/gdelt_events", is_public=True, is_team=False)
    quilt.push("nmduarte/gdelt_news", is_public=True, is_team=False)
Example #9

# %%
X_tile, W1_tile, W2_tile, W3_tile = get_deepbedmap_model_inputs(
    window_bound=window_bound
)
print(X_tile.shape, W1_tile.shape, W2_tile.shape, W3_tile.shape)

# Build quilt package for datasets covering our test region
reupload = False
if reupload:
    quilt.build(package="weiji14/deepbedmap/model/test/W1_tile", path=W1_tile)
    quilt.build(package="weiji14/deepbedmap/model/test/W2_tile", path=W2_tile)
    quilt.build(package="weiji14/deepbedmap/model/test/W3_tile", path=W3_tile)
    quilt.build(package="weiji14/deepbedmap/model/test/X_tile", path=X_tile)
    quilt.push(package="weiji14/deepbedmap/model/test", is_public=True)


# %%
def plot_3d_view(
    img: np.ndarray,
    ax: matplotlib.axes._subplots.Axes,
    elev: int = 60,
    azim: int = 330,
    z_minmax: tuple = None,
    title: str = None,
    zlabel: str = None,
):
    """
    Creates a 3D perspective view plot of an elevation surface using matplotlib 3D.
    The elevation (elev) and azimuth (azim) angle will need to be set accordingly,
Example #10
            path=measuresvelocity)
quilt.build(package="weiji14/deepbedmap/model/train/W3_data",
            path=accumulation)
quilt.build(package="weiji14/deepbedmap/model/train/X_data", path=lores)
quilt.build(package="weiji14/deepbedmap/model/train/Y_data", path=hires)

# %%
# Original datasets for neural network predictions on bigger area
quilt.build(package="weiji14/deepbedmap/lowres/bedmap2_bed",
            path="lowres/bedmap2_bed.tif")
quilt.build(
    package="weiji14/deepbedmap/misc/REMA_100m_dem_filled",
    path="misc/REMA_100m_dem_filled.tif",
)
with xr.open_dataset("misc/antarctic_ice_vel_phase_map_v01.nc") as ds:
    with tempfile.NamedTemporaryFile(suffix=".nc") as tmpfile:
        ds[["VX",
            "VY"]].to_netcdf(path=tmpfile.name)  # save only VX, VY variables
        quilt.build(
            package="weiji14/deepbedmap/misc/antarctic_ice_vel_phase_map_v01_VX_VY",
            path=tmpfile.name,
        )
quilt.build(
    package="weiji14/deepbedmap/misc/Arthern_accumulation_bedmap2_grid1",
    path="misc/Arthern_accumulation_bedmap2_grid1.tif",
)

# %%
quilt.push(package="weiji14/deepbedmap", is_public=True)
Example #11
pkgNode = quilt.load(quiltUser + '/' + classDescription)

pkgNode._meta['trainable'] = classID in oi.classes_trainable().values
pkgNode._meta['labelName'] = classDescription
numImages = GetNumImages(folds, allImageIDs)
pkgNode._meta['image_count'] = numImages

GenerateImageMetadata(folds, allImageIDs, pkgNode, annList, logPath,
                      imagePrefix)
print('######## Image Metadata Generated ########')

quilt.build(quiltUser + '/' + classDescription, pkgNode)
print('######## New Package Generated ########')

buildPkgTime = time.time() - t

t = time.time()
quilt.push(quiltUser + '/' + classDescription, is_public=True)
pushTime = time.time() - t

pkgSize = GetPkgSize(packagePath)

with open(statsPath, 'a+') as myFile:
    myFile.write('{} {} {} {} {} {}\n'.format(classDescription, numImages,
                                              pkgSize, copyTime, buildPkgTime,
                                              pushTime))

# clean up: remove the local package and the staging directory
quilt.rm(quiltUser + '/' + classDescription, force=True)
shutil.rmtree(packagePath)
Example #12
def build_and_push(package, df):
    quilt.build('%s/%s/%s' % (USER, PACKAGE_BASE, package), df)
    quilt.push('%s/%s/%s' % (USER, PACKAGE_BASE, package), is_public=True)
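The helper assumes module-level `USER` and `PACKAGE_BASE` constants, e.g.:

import pandas as pd
import quilt

USER = 'someuser'          # placeholder quilt user
PACKAGE_BASE = 'examples'  # placeholder package root

build_and_push('daily', pd.DataFrame({'x': [1, 2]}))
# builds and pushes 'someuser/examples/daily'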
Example #13
def push(self):
    quilt.push(self.package_name, is_public=True)
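A minimal class sketch around this method, assuming `package_name` is set in the constructor:

import quilt

class PackagePusher:
    def __init__(self, package_name: str):
        self.package_name = package_name  # e.g. 'someuser/somepkg'

    def push(self):
        quilt.push(self.package_name, is_public=True)

# PackagePusher('someuser/somepkg').push()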
Example #14
if __name__ == '__main__':
    refresh_pxweb_datasets()

    import quilt
    try:
        quilt.install('jyrjola/traficom', force=True)
    except Exception:
        pass

    FORCE_QUARTERS = ['2019q3']

    from quilt.data.jyrjola import traficom  # noqa

    for url, quarter in VEHICLE_URLS:
        print(url)
        dataset_name = 'vehicle_register_%s' % quarter
        if dataset_name in traficom._keys() and quarter not in FORCE_QUARTERS:
            print('skipping')
            continue
        outfn = '/tmp/out-%s.pq' % quarter
        if not os.path.exists(outfn):
            fetch_road_vehicle_register(url.split('/')[-1], quarter, outfn)
        print('build')
        quilt.build('jyrjola/traficom/%s' % dataset_name, path=outfn)
        print('push')
        quilt.push('jyrjola/traficom', is_public=True)
Example #15
def update_quilt(quilt_path):
    import os
    import glob
    import re
    import settings

    def upload_px_dataset(root_node, file):
        fname = os.path.splitext(os.path.basename(file))[0]
        lower = fname.lower()
        if 'hginseutu' not in lower and 'UM' not in fname and 'hki' not in lower:
            return

        print(fname)
        if re.match('^[0-9]', fname):
            # If the name begins with a number, prefix it with a letter
            # to make it a legal Python identifier.
            fname = 'z' + fname

        fname = fname.replace('-', '_').lower()

        content = open(file, 'r', encoding='windows-1252').read()
        parser = PxParser()
        try:
            file = parser.parse(content)
        except Exception as e:
            print(e)
            return

        print("\t%s" % file.meta['contents'])

        if root_node:
            quilt_target = root_node
        else:
            quilt_target = quilt_path

        node = update_node_from_pcaxis(quilt_target, fname, file)
        return node

    SKIP_FILES = []

    data_dir = os.path.join(settings.DATA_DIR, 'aluesarjat')
    files = glob.glob('%s/*.px' % data_dir)
    skip_until = None

    root_node = None
    for file in files:
        if 'A01S_HKI_Rak' not in file:
            continue
        if skip_until:
            if skip_until not in file:
                continue
            skip_until = None

        skip = False
        for sf in SKIP_FILES:
            if sf in file:
                skip = True
                break
        if skip:
            continue

        ret = upload_px_dataset(root_node, file)
        if ret:
            root_node = ret

    assert root_node
    quilt.build(quilt_path, root_node)
    quilt.push(quilt_path, is_public=True)
Example #16
import pandas as pd
import quilt
from quilt.data.nmduarte import gdelt3

# get all the data from quilt
events = gdelt3.data.events()
data_with_news = gdelt3.data.data_with_news()
data_master = gdelt3.data.data_with_news_master()

# generate new data for the day
print(data_with_news)

# initialize a list of lists with some placeholder rows
data = [['boi', 'do ', 'mato'], ['eu', 'sou', 'o ze'],
        ['querias', 'nao', 'era']]

# create the pandas DataFrame
df = pd.DataFrame(data, columns=['SOURCEURL', 'NewsText', 'HashURL'])

# append the new rows to the existing data
final = data_with_news.append(df)

# rebuild the subpackage with the appended data and push it
quilt.build("nmduarte/gdelt3/data/data_with_news", final)
quilt.push("nmduarte/gdelt3/data/data_with_news",
           is_public=True,
           is_team=False)
Example #17
print(X_tile.shape, W1_tile.shape, W2_tile.shape, W3_tile.shape)

# Build quilt package for datasets covering our test region
reupload = False
if reupload:
    bounds_str = "_".join(str(int(b)) for b in window_bound).replace("-", "m")
    quilt.build(package=f"weiji14/deepbedmap/model/test/{bounds_str}/W1_tile",
                path=W1_tile)
    quilt.build(package=f"weiji14/deepbedmap/model/test/{bounds_str}/W2_tile",
                path=W2_tile)
    quilt.build(package=f"weiji14/deepbedmap/model/test/{bounds_str}/W3_tile",
                path=W3_tile)
    quilt.build(package=f"weiji14/deepbedmap/model/test/{bounds_str}/X_tile",
                path=X_tile)
    quilt.push(package=f"weiji14/deepbedmap/model/test/{bounds_str}",
               is_public=True)


# %%
def subplot(directive: str, row: int = None, col: int = None, **kwargs):
    """Thin wrapper around https://docs.generic-mapping-tools.org/latest/subplot.html"""
    with gmt.clib.Session() as lib:
        rowcol = ""  # default is blank, e.g. when directive == "end"
        if row is not None and col is not None:
            if directive == "begin":
                rowcol = f"{row}x{col}"
            elif directive == "set":
                rowcol = f"{row},{col}"
        arg_str = " ".join(
            a
            for a in [directive, rowcol,