def addtoQuilt(df_new, name):
    import quilt
    from quilt.data.nmduarte import gdelt3

    # pick the existing package node to append to
    if name == "data_with_news":
        d = gdelt3.data.data_with_news()
    else:
        d = gdelt3.data.events()

    # convert the Spark DataFrame to pandas so it can be appended
    df_new2 = df_new.toPandas()
    print("Appending:", df_new2.count())
    print("original:", type(d))
    print("new:", type(df_new2))

    d = d.append(df_new2)
    print("TOTAL:", d.count())

    quilt.build("nmduarte/gdelt3/data/" + name, d)
    quilt.push("nmduarte/gdelt3/data/" + name, is_public=True, is_team=False)
def df_to_quilt(df, path):
    import tempfile

    import fastparquet
    import quilt

    parts = path.split('/')
    assert len(parts) > 2
    root_pkg = '/'.join(parts[0:2])
    try:
        quilt.install(root_pkg, force=True)
    except Exception:
        pass

    object_encoding = {}
    df = df.copy()
    for col, dtype in df.dtypes.iteritems():
        if dtype.name in ('Int8', 'Int32'):
            # fastparquet cannot write pandas nullable integers directly,
            # so store them as objects with an explicit int32 encoding
            object_encoding[col] = 'int32'
            df[col] = df[col].astype(object)
        else:
            object_encoding[col] = 'infer'

    with tempfile.NamedTemporaryFile(suffix='.parquet') as f:
        print('writing to %s' % f.name)
        fastparquet.write(f.name, df, compression='snappy',
                          object_encoding=object_encoding)
        print('build')
        quilt.build(path, f.name)
        print('push')
        quilt.push(root_pkg, is_public=True)
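# A minimal usage sketch for df_to_quilt above, assuming the legacy quilt 2.x
# API and an installed fastparquet; the package path "example_user/example_pkg/df"
# is a placeholder.
import pandas as pd

sample = pd.DataFrame({'a': pd.array([1, None, 3], dtype='Int32'),
                       'b': ['x', 'y', 'z']})
df_to_quilt(sample, 'example_user/example_pkg/df')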
from tempfile import NamedTemporaryFile
from typing import Optional

import pandas as pd
import quilt


def update_pkg(
    df: pd.DataFrame,
    user: str,
    package: str,
    readme: Optional[str] = None,
    hash_key=None,
):
    r"""Build and push a quilt package containing ``df`` and an optional README.

    Parameters
    ----------
    df
        Data to store under the package's ``df`` node.
    user
        Quilt user name.
    package
        Quilt package name.
    readme
        README contents; skipped if ``None``.
    hash_key
        Optional hash passed through to ``quilt.push``.
    """
    pkg_path = f'{user}/{package}'
    quilt.build(pkg_path, quilt.nodes.GroupNode(dict(author='@hudlrd')))
    quilt.build(f'{pkg_path}/df', quilt.nodes.DataNode(None, None, df, {}))

    # TODO: warn the user if readme is not provided
    if readme is not None:
        with NamedTemporaryFile() as tmp:
            tmp.write(readme.encode('UTF-8'))
            tmp.flush()
            quilt.build(f'{pkg_path}/README', tmp.name)

    quilt.login()
    quilt.push(pkg_path, is_public=True, hash=hash_key)
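# Hypothetical call to update_pkg above, assuming quilt.login() can complete
# interactively; the user and package names are placeholders.
update_pkg(
    df=pd.DataFrame({'value': [1.0, 2.0]}),
    user='example_user',
    package='example_pkg',
    readme='Example package built with update_pkg.',
)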
def update_quilt_datasets():
    import quilt
    from quilt.data.jyrjola import karttahel

    df = get_buildings()
    karttahel._set(['buildings'], df)
    quilt.build('jyrjola/karttahel', karttahel)
    quilt.push('jyrjola/karttahel', is_public=True)
def update_quilt_datasets():
    QUILT_TARGET = 'jyrjola/statfi'
    from quilt.data.jyrjola import statfi as node
    import quilt
    import requests_cache
    requests_cache.install_cache()

    df = get_fuel_classification()
    df.to_csv('fuel_classification.csv')

    node._set(['fuel_classification'], df)
    quilt.build(QUILT_TARGET, node)
    quilt.push(QUILT_TARGET, is_public=True)
def refresh_pxweb_datasets():
    import requests_cache
    requests_cache.install_cache()

    api = PXWebAPI('http://trafi2.stat.fi/PXWeb', 'fi')

    for path, table in PXWEB_TABLES:
        print(path, table)
        pxf = api.get_table('%s/%s.px' % (path, table))
        table = 'tf%s' % table
        root_node = update_node_from_pcaxis(QUILT_DATASET, table, pxf)

    quilt.build(QUILT_DATASET, root_node)
    quilt.push(QUILT_DATASET, is_public=True)
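# For reference, refresh_pxweb_datasets() assumes two module-level settings:
# PXWEB_TABLES, an iterable of (path, table) pairs, and QUILT_DATASET, a
# "user/package" string. A hypothetical shape, with a made-up table code:
#
# QUILT_DATASET = 'jyrjola/traficom'
# PXWEB_TABLES = [
#     ('TraFi/Ensirekisteroinnit', '010_ensirek_tau_101'),
# ]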
def upload_to_quilt(spark, schemas_dic):
    """
    Upload data to quilt, appending it to already existing data.

    :param spark: Spark session
    :param schemas_dic: dict of Spark schemas keyed by dataset name
    :return: None
    """
    # remove old data and get new one
    logging.info("Installing quilt gdelt data...")
    quilt.rm("nmduarte/gdelt", force=True)
    quilt.install("nmduarte/gdelt", force=True)

    from quilt.data.nmduarte import gdelt

    # get the old data from quilt
    logging.info("getting data from quilt...")
    events_from_quilt = gdelt.events()
    mentions_from_quilt = gdelt.mentions()
    news_from_quilt = gdelt.news()

    # transform the data into dataframes so it can be appended
    logging.info("Creating dataframes from quilt data...")
    events_from_quilt_df = spark.createDataFrame(events_from_quilt, schema=schemas_dic['events2'])
    mentions_from_quilt_df = spark.createDataFrame(mentions_from_quilt, schema=schemas_dic['mentions'])
    news_from_quilt_df = spark.createDataFrame(news_from_quilt, schema=schemas_dic['news'])

    # mentions data - new data
    logging.info("Reading last 15min data from S3...")
    mentions_df = tools.read_from_s3_enriched(spark, "mentions", schemas_dic['mentions'], cmd_opts.date)
    events_df = tools.read_from_s3_enriched(spark, "events", schemas_dic['events2'], cmd_opts.date)
    news_df = tools.read_from_s3_enriched(spark, "news", schemas_dic['news'], cmd_opts.date)

    # concatenate already existing data with new data
    logging.info("Appending data to old quilt data...")
    mentions_concat = mentions_from_quilt_df.union(mentions_df)
    events_concat = events_from_quilt_df.union(events_df)
    news_concat = news_from_quilt_df.union(news_df)

    # build the 3 packages
    logging.info("Building quilt packages...")
    quilt.build("nmduarte/gdelt/mentions", mentions_concat.toPandas())
    quilt.build("nmduarte/gdelt/events", events_concat.toPandas())
    quilt.build("nmduarte/gdelt/news", news_concat.toPandas())

    # push the 3 packages
    logging.info("Pushing quilt info...")
    quilt.push("nmduarte/gdelt/mentions", is_public=True, is_team=False)
    quilt.push("nmduarte/gdelt/events", is_public=True, is_team=False)
    quilt.push("nmduarte/gdelt/news", is_public=True, is_team=False)
def uploadToQuilt(spark):
    # downloads the data from s3
    print("Getting schemas..")
    events_schema, mentions_schema, news_schema, events_schema2 = set_schemas()

    # mentions data
    print("Getting mention data..")
    mentions_df = read_from_s3_enriched(spark, "mentions", mentions_schema, cmd_opts.date)
    mentions_df.show()
    mentions_df.write.csv("tmp_data/mentions", header="true", mode="overwrite")

    events_df = read_from_s3_enriched(spark, "events", events_schema2, cmd_opts.date)
    events_df.write.csv("tmp_data/events", header="true", mode="overwrite")

    news_df = read_from_s3_enriched(spark, "news", news_schema, cmd_opts.date)
    news_df.write.csv("tmp_data/news", header="true", mode="overwrite")

    # build the 3 packages from the CSV directories
    quilt.build("nmduarte/gdelt_mentions", "tmp_data/mentions")
    quilt.build("nmduarte/gdelt_events", "tmp_data/events")
    quilt.build("nmduarte/gdelt_news", "tmp_data/news")

    # push the 3 packages
    quilt.push("nmduarte/gdelt_mentions", is_public=True, is_team=False)
    quilt.push("nmduarte/gdelt_events", is_public=True, is_team=False)
    quilt.push("nmduarte/gdelt_news", is_public=True, is_team=False)
# %%
X_tile, W1_tile, W2_tile, W3_tile = get_deepbedmap_model_inputs(
    window_bound=window_bound
)
print(X_tile.shape, W1_tile.shape, W2_tile.shape, W3_tile.shape)

# Build quilt package for datasets covering our test region
reupload = False
if reupload:
    quilt.build(package="weiji14/deepbedmap/model/test/W1_tile", path=W1_tile)
    quilt.build(package="weiji14/deepbedmap/model/test/W2_tile", path=W2_tile)
    quilt.build(package="weiji14/deepbedmap/model/test/W3_tile", path=W3_tile)
    quilt.build(package="weiji14/deepbedmap/model/test/X_tile", path=X_tile)
    quilt.push(package="weiji14/deepbedmap/model/test", is_public=True)


# %%
def plot_3d_view(
    img: np.ndarray,
    ax: matplotlib.axes._subplots.Axes,
    elev: int = 60,
    azim: int = 330,
    z_minmax: tuple = None,
    title: str = None,
    zlabel: str = None,
):
    """
    Creates a 3D perspective view plot of an elevation surface using matplotlib 3D.
    The elevation (elev) and azimuth (azim) angle will need to be set accordingly,
    path=measuresvelocity)
quilt.build(package="weiji14/deepbedmap/model/train/W3_data", path=accumulation)
quilt.build(package="weiji14/deepbedmap/model/train/X_data", path=lores)
quilt.build(package="weiji14/deepbedmap/model/train/Y_data", path=hires)

# %%
# Original datasets for neural network predictions on bigger area
quilt.build(
    package="weiji14/deepbedmap/lowres/bedmap2_bed", path="lowres/bedmap2_bed.tif"
)
quilt.build(
    package="weiji14/deepbedmap/misc/REMA_100m_dem_filled",
    path="misc/REMA_100m_dem_filled.tif",
)
with xr.open_dataset("misc/antarctic_ice_vel_phase_map_v01.nc") as ds:
    with tempfile.NamedTemporaryFile(suffix=".nc") as tmpfile:
        ds[["VX", "VY"]].to_netcdf(path=tmpfile.name)  # save only VX, VY variables
        quilt.build(
            package="weiji14/deepbedmap/misc/antarctic_ice_vel_phase_map_v01_VX_VY",
            path=tmpfile.name,
        )
quilt.build(
    package="weiji14/deepbedmap/misc/Arthern_accumulation_bedmap2_grid1",
    path="misc/Arthern_accumulation_bedmap2_grid1.tif",
)

# %%
quilt.push(package="weiji14/deepbedmap", is_public=True)
pkgNode = quilt.load(quiltUser + '/' + classDescription)
pkgNode._meta['trainable'] = classID in oi.classes_trainable().values
pkgNode._meta['labelName'] = classDescription
numImages = GetNumImages(folds, allImageIDs)
pkgNode._meta['image_count'] = numImages

GenerateImageMetadata(folds, allImageIDs, pkgNode, annList, logPath, imagePrefix)
print('######## Image Metadata Generated #########')

quilt.build(quiltUser + '/' + classDescription, pkgNode)
print('######## New Package Generated #########')
buildPkgTime = time.time() - t

t = time.time()
quilt.push(quiltUser + '/' + classDescription, is_public=True)
pushTime = time.time() - t

pkgSize = GetPkgSize(packagePath)
with open(statsPath, 'a+') as myFile:
    myFile.write('{} {} {} {} {} {}\n'.format(classDescription, numImages, pkgSize,
                                              copyTime, buildPkgTime, pushTime))

# cleaning
quilt.rm(quiltUser + '/' + classDescription, force=True)
shutil.rmtree(packagePath)
def build_and_push(package, df):
    quilt.build('%s/%s/%s' % (USER, PACKAGE_BASE, package), df)
    quilt.push('%s/%s/%s' % (USER, PACKAGE_BASE, package), is_public=True)
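# Example invocation of build_and_push above, assuming USER and PACKAGE_BASE
# are module-level constants (e.g. USER = 'example_user', PACKAGE_BASE =
# 'datasets') and df is a pandas DataFrame:
#
# build_and_push('my_table', df)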
def push(self):
    quilt.push(self.package_name, is_public=True)
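# Minimal sketch of a wrapper class that push() above could belong to; only
# the package_name attribute is implied by the original, the rest is
# illustrative.
import quilt


class PackageUploader:
    def __init__(self, package_name):
        # package_name is a "user/package" string, e.g. 'example_user/example_pkg'
        self.package_name = package_name

    def push(self):
        quilt.push(self.package_name, is_public=True)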
if __name__ == '__main__':
    refresh_pxweb_datasets()
    exit()

    import quilt

    try:
        pass
        # quilt.install('jyrjola/traficom', force=True)
    except Exception:
        pass

    FORCE_QUARTERS = ['2019q3']

    from quilt.data.jyrjola import traficom  # noqa

    for url, quarter in VEHICLE_URLS:
        print(url)
        dataset_name = 'vehicle_register_%s' % quarter
        if dataset_name in traficom._keys() and quarter not in FORCE_QUARTERS:
            print('skipping')
            continue
        outfn = '/tmp/out-%s.pq' % quarter
        if not os.path.exists(outfn):
            fetch_road_vehicle_register(url.split('/')[-1], quarter, outfn)
        print('build')
        quilt.build('jyrjola/traficom/%s' % dataset_name, path=outfn)
        print('push')
        quilt.push('jyrjola/traficom', is_public=True)
def update_quilt(quilt_path):
    import os
    import glob
    import settings

    def upload_px_dataset(root_node, file):
        fname = os.path.splitext(os.path.basename(file))[0]
        if 'hginseutu' not in fname.lower() and 'UM' not in fname and 'hki' not in fname.lower():
            return

        print(fname)
        if re.match('^[0-9]', fname):
            # If the name begins with a number, prefix it with a letter
            # to make it a legal Python identifier.
            fname = 'z' + fname

        fname = fname.replace('-', '_').lower()

        content = open(file, 'r', encoding='windows-1252').read()
        parser = PxParser()
        try:
            file = parser.parse(content)
        except Exception as e:
            print(e)
            return

        now = datetime.now()
        # if 'last_updated' not in file.meta or (now - file.meta['last_updated']) > timedelta(days=2 * 365):
        #     return

        print("\t%s" % file.meta['contents'])

        if root_node:
            quilt_target = root_node
        else:
            quilt_target = quilt_path

        node = update_node_from_pcaxis(quilt_target, fname, file)
        return node

    SKIP_FILES = []

    data_dir = os.path.join(settings.DATA_DIR, 'aluesarjat')
    files = glob.glob('%s/*.px' % data_dir)
    skip_until = None
    root_node = None
    for file in files:
        if 'A01S_HKI_Rak' not in file:
            continue
        if skip_until:
            if skip_until not in file:
                continue
            skip_until = None
        skip = False
        for sf in SKIP_FILES:
            if sf in file:
                skip = True
                break
        if skip:
            continue
        ret = upload_px_dataset(root_node, file)
        if ret:
            root_node = ret

    assert root_node
    quilt.build(quilt_path, root_node)
    quilt.push(quilt_path, is_public=True)
# get all the data from quilt
events = gdelt3.data.events()
data_with_news = gdelt3.data.data_with_news()
data_master = gdelt3.data.data_with_news_master()

# generate new data for the day
print(data_with_news)

# initialize list of lists
data = [['boi', 'do ', 'mato'], ['eu', 'sou', 'o ze'], ['querias', 'nao', 'era']]

# Create the pandas DataFrame
df = pd.DataFrame(data, columns=['SOURCEURL', 'NewsText', 'HashURL'])

# append the new rows to the existing data, then store the combined frame
final = data_with_news.append(df)

gdelt3._set(["data", "data_with_news"], final)
quilt.build("nmduarte/gdelt3/data/data_with_news", final)
quilt.push("nmduarte/gdelt3/data/data_with_news", is_public=True, is_team=False)
print(X_tile.shape, W1_tile.shape, W2_tile.shape, W3_tile.shape)

# Build quilt package for datasets covering our test region
reupload = False
if reupload:
    bounds_str = "_".join(str(int(b)) for b in window_bound).replace("-", "m")
    quilt.build(package=f"weiji14/deepbedmap/model/test/{bounds_str}/W1_tile", path=W1_tile)
    quilt.build(package=f"weiji14/deepbedmap/model/test/{bounds_str}/W2_tile", path=W2_tile)
    quilt.build(package=f"weiji14/deepbedmap/model/test/{bounds_str}/W3_tile", path=W3_tile)
    quilt.build(package=f"weiji14/deepbedmap/model/test/{bounds_str}/X_tile", path=X_tile)
    quilt.push(package=f"weiji14/deepbedmap/model/test/{bounds_str}", is_public=True)


# %%
def subplot(directive: str, row: int = None, col: int = None, **kwargs):
    """Thin wrapper around https://docs.generic-mapping-tools.org/latest/subplot.html"""
    with gmt.clib.Session() as lib:
        rowcol = ""  # default is blank, e.g. when directive == "end"
        if row is not None and col is not None:
            if directive == "begin":
                rowcol = f"{row}x{col}"
            elif directive == "set":
                rowcol = f"{row},{col}"
        arg_str = " ".join(
            a for a in [directive, rowcol,