from hdfs import Config
from hdfs.util import HdfsError


class HdfsClient:
    def __init__(self):
        self.client = Config().get_client('dev')
        # Make sure the 'datasets' directory exists before it is used.
        try:
            self.client.list('datasets')
        except HdfsError:
            self.client.makedirs('datasets')
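# Config().get_client('dev') resolves the 'dev' alias from an HDFSCLI
# configuration file (by default ~/.hdfscli.cfg). A minimal example of such a
# file is sketched below; the namenode URL and user are placeholders, not
# values taken from this code base.
#
# [global]
# default.alias = dev
#
# [dev.alias]
# url = http://namenode:50070
# user = hdfs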
import os
import pickle

from hdfs import Config

# BarrierTuple is the project's marker tuple type that closes a version; it is
# assumed to be importable from the surrounding code base.


class PendingWindow(object):
    """Persist a node's output tuples on HDFS, grouped into versions, so that
    acknowledged versions can be truncated and unacknowledged ones replayed."""

    def __init__(self, backup_dir, node):
        # TODO: not cut
        # each pending window (or node) only has a single downstream cut,
        # otherwise inconsistency occurs during truncating
        self.backup_dir = backup_dir
        self.node = node

        self.hdfs_client = Config().get_client('dev')
        self.hdfs_client.makedirs(self.backup_dir)

        # each backup file is named by its ending version, so the one currently
        # being written has a temporary name
        self.current_backup_path = os.path.join(self.backup_dir, 'current')
        # touch the file for later appending
        self.hdfs_client.write(self.current_backup_path, data='')

        # the version that the last truncation was conducted against
        self.safe_version_path = os.path.join(self.backup_dir, 'safe_version')
        # special case for the initial version
        self.hdfs_client.write(self.safe_version_path, data=str(0))

        # the latest integral version
        self.latest_version_path = os.path.join(self.backup_dir, 'latest_version')
        # special case for the initial version
        self.hdfs_client.write(self.latest_version_path, data=str(0))

        if self.node.type != 'sink':
            self.version_acks = dict()
            for n in self.node.downstream_connectors:
                self.version_acks[n] = 0

    def append(self, tuple_):
        """Make an output tuple persistent, and complete a version if necessary."""
        self.hdfs_client.write(self.current_backup_path,
                               data=pickle.dumps(tuple_), append=True)

        if isinstance(tuple_, BarrierTuple):
            self.hdfs_client.rename(
                self.current_backup_path,
                os.path.join(self.backup_dir, str(tuple_.version)))
            self.hdfs_client.write(self.latest_version_path,
                                   data=str(tuple_.version), overwrite=True)
            self.hdfs_client.write(self.current_backup_path, data='')

    def extend(self, tuples):
        # TODO: can be improved
        with self.hdfs_client.write(self.current_backup_path, append=True) as f:
            for t in tuples:
                pickle.dump(t, f)

        if isinstance(tuples[-1], BarrierTuple):
            self.hdfs_client.rename(
                self.current_backup_path,
                os.path.join(self.backup_dir, str(tuples[-1].version)))
            self.hdfs_client.write(self.latest_version_path,
                                   data=str(tuples[-1].version), overwrite=True)
            self.hdfs_client.write(self.current_backup_path, data='')

    def truncate(self, version):
        """Delete files with filename <= version."""
        # with self.hdfs_client.read(self.safe_version_path) as f:
        #     safe_version = int(f.read())
        #
        # # only the == condition can occur
        # if version <= safe_version:
        #     return

        for f in self.hdfs_client.list(self.backup_dir):
            if f.isdigit() and int(f) <= version:
                self.hdfs_client.delete(os.path.join(self.backup_dir, f))

        # self.node.LOGGER.info('truncated version %d' % version)

    def handle_version_ack(self, version_ack):
        old_safe_version = min(self.version_acks.values())
        self.version_acks[version_ack.sent_from] = version_ack.version
        new_safe_version = min(self.version_acks.values())

        if new_safe_version > old_safe_version:
            self.hdfs_client.write(self.safe_version_path,
                                   data=str(new_safe_version), overwrite=True)
            self.truncate(new_safe_version)

    def get_latest_version(self):
        with self.hdfs_client.read(self.latest_version_path) as f:
            latest_version = int(f.read())
        return latest_version

    def rewind(self, version=None):
        """Delete files with filename > version (including the current file)."""
        if version is None:
            self.hdfs_client.write(self.current_backup_path, data='', overwrite=True)
            return

        # TODO: underflow
        # assert version == 0 or ...

        for f in self.hdfs_client.list(self.backup_dir):
            if f.isdigit() and int(f) > version:
                self.hdfs_client.delete(os.path.join(self.backup_dir, f))

        self.hdfs_client.write(self.current_backup_path, data='', overwrite=True)
        self.hdfs_client.write(self.latest_version_path, data=str(version), overwrite=True)

    def replay(self):
        """When both the node and the pending window state are ready, replay the
        pending window before resuming."""
        for v in sorted(
                map(int,
                    filter(unicode.isdigit, self.hdfs_client.list(self.backup_dir)))):
            # filter out the faster nodes
            tuples = []
            with self.hdfs_client.read(os.path.join(self.backup_dir, str(v))) as f:
                while True:
                    try:
                        t = pickle.load(f)
                        tuples.append(t)
                    except EOFError:
                        self.node.LOGGER.debug('reached EOF, send this version')
                        break
                    # Spout needs the version too, so that the data source can
                    # resend from a version
                    # except pickle.UnpicklingError:
                    #     self.node.LOGGER.debug('spout reached partial dump location, send this incomplete version')
                    #     break

            self.node.multicast(self.node.downstream_nodes, tuples)
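# A minimal driver sketch for the class above. The `node`, `ack`, and
# BarrierTuple constructor arguments are hypothetical stand-ins for the
# project's own types; only the PendingWindow calls themselves come from the
# code above.
pending = PendingWindow('/backup/node-1', node)     # `node` built elsewhere
for t in output_tuples:                             # regular data tuples
    pending.append(t)
pending.append(BarrierTuple(version=1))             # assumed ctor: closes version 1
pending.handle_version_ack(ack)                     # ack.sent_from / ack.version
pending.replay()                                    # resend un-truncated versions downstream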
import os
import sys

from hdfs import Config
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

# APP_NAME and HDFS_RAWFILE_DIR are assumed to be defined earlier in the script.
HDFS_BASE_URL = "hdfs://bdrenfdludcf01:9000"

if __name__ == "__main__":
    # Folder creation for placing all the Spark data
    cmd_a = "mkdir -p " + "/tmp/SPARK_PROCESS/"
    os.system(cmd_a)

    # Configure Spark
    conf = SparkConf().setAppName(APP_NAME).set("spark.local.dir", "/tmp/SPARK_PROCESS/")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    client = Config().get_client('bdrenhdfs')
    files = client.list(HDFS_RAWFILE_DIR)
    totalfilecount = len(files)

    if totalfilecount == 0:
        print("There are no files to be processed, application exiting...")
        sys.exit(0)

    filecount = 0
    for filename in files:
        print(filename)
        if filename.find("Covid_Analysis_DataSet.csv") >= 0:
            filecount = filecount + 1
            df_covid = sqlContext.read.format("csv").option(
                "delimiter", ":").option("header", 'true').load(HDFS_BASE_URL +
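# The snippet above is cut off in the middle of the load() call. For reference
# only, a completed call of that shape is sketched below; the HDFS path is a
# hypothetical placeholder, not the original one.
df_example = sqlContext.read.format("csv") \
    .option("delimiter", ":") \
    .option("header", "true") \
    .load(HDFS_BASE_URL + "/raw/Covid_Analysis_DataSet.csv")  # placeholder path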
import os

from pyspark.streaming import StreamingContext

# sc, client, nn_address, extract_segments, recognize_finish and update_count
# are defined in earlier cells of the notebook.

top_all = None


def print_rdd(rdd):
    global top_all
    top_all = rdd.take(10)  # 3 in fact
    # for row in top_all:
    #     print('{}\t{}'.format(*row))


# In[3]:

# Emulate real life, where the data arrives in chunks at regular intervals
DATA_PATH = "/data/course4/uid_ua_100k_splitted_by_5k"
# build the batches from the dataset files
batches = [sc.textFile(os.path.join(*[nn_address, DATA_PATH, path]))
           for path in client.list(DATA_PATH)]
#batches = batches[:2]

BATCH_TIMEOUT = 1  # send the batches as RDDs once every 5 s
ssc = StreamingContext(sc, BATCH_TIMEOUT)
ssc.checkpoint("./checkpoints")
dstream = ssc.queueStream(rdds=batches)

result = (dstream
          .flatMap(extract_segments)
          )

#result.foreachRDD(print_rdd)
result.foreachRDD(recognize_finish)

(result
 .updateStateByKey(update_count)
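# `update_count` is referenced above but not shown. Below is a minimal sketch of
# the state-update function signature that updateStateByKey expects in PySpark;
# the exact logic of the original notebook is an assumption.
def update_count(new_values, state):
    # new_values: values that arrived for this key in the current batch
    # state: running total so far (None on the first batch for this key)
    return (state or 0) + sum(new_values)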
from json import dump, load

# `client` is an hdfs client obtained earlier, e.g. Config().get_client('dev');
# the opening of the `model` dict was cut off above and is restored here.
model = {
    'first_feature': 2.,
    'second_feature': 12.,
}

# First, we delete any existing `models/` folder on HDFS.
client.delete('models', recursive=True)

# We can now upload the data, first as CSV.
with client.write('models/1.csv', encoding='utf-8') as writer:
    for item in model.items():
        writer.write(u'%s,%s\n' % item)

# We can also serialize it to JSON and directly upload it.
with client.write('models/1.json', encoding='utf-8') as writer:
    dump(model, writer)

# We can check that the files exist and get their properties.
assert client.list('models') == ['1.csv', '1.json']
status = client.status('models/1.csv')
content = client.content('models/1.json')

# Later, we can download the files back. The `delimiter` option makes it
# convenient to read CSV files.
with client.read('models/1.csv', delimiter='\n', encoding='utf-8') as reader:
    items = (line.split(',') for line in reader if line)
    assert dict((name, float(value)) for name, value in items) == model

# Loading JSON directly from HDFS is even simpler.
with client.read('models/1.json', encoding='utf-8') as reader:
    assert load(reader) == model
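# The snippet above streams the files back with client.read(); the same library
# can also copy whole paths between HDFS and the local filesystem. A small
# sketch (the local paths are placeholders):
client.download('models', 'local_models', overwrite=True)       # HDFS -> local dir
client.upload('models_backup', 'local_models', overwrite=True)  # local dir -> HDFS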
import os
import pickle
from hashlib import sha256

import networkx as nx
from hdfs import Config
from pyspark.ml import PipelineModel


class Pickler:
    def __init__(self, sc, spark_session, uri, port):
        self.sc = sc
        self.spark_session = spark_session
        self.df = []
        self.models = []
        self.graphs = []
        self.base_path = uri + ":" + port
        self.local_pickle_path = os.path.dirname(
            os.path.realpath(__file__)) + '/../pickles/'
        self.pickle_path = '/user/hadoop/pickles/'
        self.model_path = '/user/hadoop/pickles/models/'
        self.dataset_path = self.pickle_path + "dataset/"
        self.private_release_path = self.dataset_path + "private/"
        self.anon_release_path = self.dataset_path + "github/"
        self.prod_release_path = self.dataset_path + "prod/"
        self.df_path = self.pickle_path + 'df/'
        self.graph_path = self.local_pickle_path + 'graphs/'
        self.labelled_df_path = self.df_path + 'labelled/'
        self.hdfs_client = Config().get_client('dev')
        self.load_df()
        self.load_models()
        self.load_graphs()

    # TODO: Implement generic methods for read dataset / model ONLY
    def read(self):
        pass

    def save(self):
        pass

    def getLabelledFiles(self):
        return self.hdfs_client.list(self.prod_release_path)

    def readCSVToDF(self, date, folder):
        return self.spark_session.read.option(
            "header", True).csv(self.base_path + self.dataset_path + folder + "/" + date)

    def getLabelledTelemetry(self):
        return self.hdfs_client.list(self.private_release_path)

    def existsModel(self, name):
        res = self.hdfs_client.list(self.model_path)
        file_extension = '.model'
        if name + file_extension in res:
            return True
        return False

    def getModel(self, name):
        return PipelineModel.load(self.base_path + self.model_path + name + ".model")

    def isDateLabelled(self, date):
        res = self.hdfs_client.list(self.prod_release_path)
        file_extension = ".csv"
        if date + file_extension in res:
            return True
        return False

    def load_graphs(self):
        for file in os.listdir(self.graph_path):
            if file.endswith(".pickle"):
                self.graphs.append(file[:-7])

    def existsGraph(self, date):
        if date in self.graphs:
            return True
        return False

    def getGraph(self, date):
        if date in self.graphs:
            with open(self.graph_path + date + ".pickle", 'rb') as pickle_file:
                content = pickle.load(pickle_file)
            return content

    def saveGraph(self, G, date):
        if date in self.graphs:
            return False
        nx.write_gpickle(G, self.graph_path + date + ".pickle")
        self.graphs.append(date)

    def existsDF(self, date, source):
        # key format, e.g. 2020.03.01_joy
        hash = self.getHash(date, source)
        if hash in self.df:
            return True
        return False

    def load_df(self):
        # Load Joy data
        res = self.hdfs_client.list(self.df_path + 'joy')
        # print(f"Joy Items in directory: {res}")
        for file in res:
            if file.endswith(".parquet"):
                self.df.append(sha256(file[:-8].encode('utf-8')).hexdigest())

        # Load graph-features DF
        res = self.hdfs_client.list(self.df_path + 'graph')
        # print(f"Graph DF Items in directory: {res}")
        for file in res:
            if file.endswith(".parquet"):
                self.df.append(sha256(file[:-8].encode('utf-8')).hexdigest())

        # Load labelled DF
        res = self.hdfs_client.list(self.df_path + 'labelled')
        # print(f"Labelled Items in directory: {res}")
        for file in res:
            if file.endswith(".parquet"):
                self.df.append(sha256(file[:-8].encode('utf-8')).hexdigest())

        # TODO: Load others?

    def saveModel(self, model, name):
        model.save(self.base_path + self.model_path + name + ".model")

    def load_models(self):
        res = self.hdfs_client.list(self.model_path)
        for file in res:
            if file.endswith(".model"):
                self.models.append(file.split('.')[0])

    def saveDFToCSV(self, df, date, folder, coalesced=False):
        if coalesced:
            df.coalesce(1).write.csv(self.base_path + self.pickle_path +
                                     "dataset/" + folder + '/' + date + '.csv',
                                     header=True)
        else:
            df.write.csv(self.base_path + self.pickle_path + "dataset/" +
                         folder + '/' + date + '.csv', header=True)
            df.write.parquet(self.base_path + self.pickle_path + "dataset/" +
                             folder + '/' + date + '.parquet')

    def saveDF(self, df, date, source):
        hash = self.getHash(date, source)
        if hash in self.df:
            return False
        else:
            df.write.parquet(self.base_path + self.df_path + source + '/' +
                             date + "_" + source + '.parquet')
            self.df.append(hash)

    def getDF(self, date, source):
        hash = self.getHash(date, source)
        if hash in self.df:
            df = self.spark_session.read.parquet(self.base_path + self.df_path +
                                                 source + '/' + date + "_" +
                                                 source + '.parquet')
            return df
        return False

    def getHash(self, date, source):
        id = date + "_" + source
        hash = sha256(id.encode('utf-8')).hexdigest()
        return hash
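# A minimal usage sketch for the Pickler above. The SparkSession settings and
# the HDFS namenode URI/port are placeholders, and `some_df` stands in for a
# DataFrame built elsewhere; only the Pickler methods come from the class above.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("pickler-demo").getOrCreate()
pickler = Pickler(spark.sparkContext, spark, "hdfs://namenode", "9000")

if not pickler.existsDF("2020.03.01", "joy"):
    pickler.saveDF(some_df, "2020.03.01", "joy")
df = pickler.getDF("2020.03.01", "joy")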
import os
from json import load

from hdfs import Config

# DataSourceType is the project's own enum of supported source formats and is
# assumed to be importable from the surrounding code base.


class SparkHDFSClient(object):
    def __init__(self, datasource):
        self.datasource = datasource
        self.client = Config().get_client("dev")

    def get_file_list(self, folder):
        files = self.client.list(folder.strip())
        files = [folder + '/' + file for file in files]
        return files

    def list_collections(self):
        results = []
        status = self.client.status(self.datasource.url, strict=False)
        print(status, self.datasource.url)
        if status is not None:
            if status['type'] == "DIRECTORY":
                files = self.get_file_list(self.datasource.url)
                while len(files) > 0:
                    file = files.pop()
                    status = self.client.status(os.path.join(self.datasource.url, file),
                                                strict=False)
                    if status is None:
                        continue
                    if status['type'] == "DIRECTORY":
                        subfiles = self.get_file_list(os.path.join(self.datasource.url, file))
                        files.extend(subfiles)
                        continue
                    else:
                        # skip files whose extension does not match the datasource type
                        if self.datasource.dstype == DataSourceType.SPARK_CSV and file[-2:] != 'sv' \
                                or self.datasource.dstype == DataSourceType.SPARK_TSV and file[-2:] != 'sv' \
                                or self.datasource.dstype == DataSourceType.SPARK_XML and file[-3:] != 'xml' \
                                or self.datasource.dstype == DataSourceType.SPARK_JSON and file[-4:] != 'json':
                            continue
                        row = {
                            "db": file[:file.rfind('/')] if '/' in file else self.datasource.url,
                            "document": file[file.rfind('/') + 1:] if '/' in file else file,
                            "count": -1
                        }
                        results.append(row)
                return results
            else:
                return [{
                    "db": self.datasource.url,
                    "document": self.datasource.url,
                    "count": -1
                }]
        else:
            return results

    def get_documents(self, filename, limit=10):
        results = []
        delimiter = "\n"
        header = None
        rows = 0
        if self.datasource.dstype == DataSourceType.SPARK_CSV or \
                self.datasource.dstype == DataSourceType.SPARK_TSV:
            delimiter = "\n"
            with self.client.read(filename, encoding='utf-8', delimiter=delimiter) as reader:
                for line in reader:
                    if len(line.strip()) == 0 or line[0] == '#':
                        continue
                    if filename[-3:] == "csv":
                        line = line.split(',')
                    else:
                        line = line.split('\t')
                    if header is None:
                        header = line
                        continue
                    res = {
                        header[i]: line[i]
                        for i in range(len(line)) if i < len(header)
                    }
                    results.append(res)
                    rows += 1
                    if rows > limit + 1:
                        break
        elif self.datasource.dstype == DataSourceType.SPARK_XML:
            with self.client.read(filename, encoding='utf-8', chunk_size=2048) as reader:
                header = ['content']
                for chunk in reader:
                    res = {'content': str(chunk)}
                    results.append(res)
                    print(results)
                    break
        elif self.datasource.dstype == DataSourceType.SPARK_JSON:
            with self.client.read(filename, encoding='utf-8') as reader:
                model = load(reader)
                if isinstance(model, list):
                    model = [{
                        p: str(list(md[p][0].keys()))
                        if isinstance(md[p], list) and isinstance(md[p][0], dict)
                        else str(md[p]) if isinstance(md[p], list)
                        else str(list(md[p].keys())) if isinstance(md[p], dict)
                        else md[p]
                        for p in md
                    } for md in model]
                    results.extend(model)
                else:
                    model = {
                        p: str(list(model[p][0].keys()))
                        if isinstance(model[p], list) and isinstance(model[p][0], dict)
                        else model[p] if isinstance(model[p], list)
                        else str(list(model[p].keys())) if isinstance(model[p], dict)
                        else model[p]
                        for p in model
                    }
                    results.append(model)
        return results[:limit], limit
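# A small driver sketch for SparkHDFSClient. The datasource object below is a
# hypothetical stand-in; the real project supplies its own datasource model, and
# only the `url`/`dstype` attribute names and the client methods come from the
# class above.
class _FakeDataSource:
    url = "/data/raw"                      # HDFS directory to scan (placeholder)
    dstype = DataSourceType.SPARK_CSV      # project enum, imported elsewhere

hdfs_source = SparkHDFSClient(_FakeDataSource())
collections = hdfs_source.list_collections()
if collections:
    docs, limit = hdfs_source.get_documents(
        collections[0]["db"] + "/" + collections[0]["document"], limit=5)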
import re

from hdfs import Config

###############
### Setting up File Paths and Lists
###############

client = Config().get_client('dev')

workingFolder_Indian = "SgIndian_vcf/dataFreeze_Feb2013/SNP/biAllele/"
workingFolder_Malay = "SgMalay_vcf/2012_05/snps/"
workingFolder_Chinese = "1000G_CDX/Phase3/integrated/"

# Finding the number of unique samples found in each working folder...
freqFiles_Indian = [
    f for f in client.list(workingFolder_Indian)
    if re.match(r'chr\d+_analysis_exome\.frq', f)
]
rsIDFiles_Indian = [
    f for f in client.list(workingFolder_Indian)
    if re.match(r'chr\d+_rsID', f)
]
freqFiles_Malay = [
    f for f in client.list(workingFolder_Malay)
    if re.match(r'chr\d+_analysis_exome\.frq', f)
]
rsIDFiles_Malay = [
    f for f in client.list(workingFolder_Malay)
    if re.match(r'chr\d+_rsID', f)
]
freqFiles_Chinese = [
    f for f in client.list(workingFolder_Chinese)
    if re.match(r'chr\d+_analysis_exome\.frq', f)
]
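# A minimal follow-up sketch: stream one of the listed .frq files back from HDFS
# line by line. The column handling is an assumption about a whitespace-separated
# frequency table, not something taken from the original script.
if freqFiles_Indian:
    frq_path = workingFolder_Indian + freqFiles_Indian[0]
    with client.read(frq_path, encoding='utf-8', delimiter='\n') as reader:
        for line in reader:
            fields = line.split()
            if fields:
                print(fields[:2])   # e.g. chromosome and SNP identifier columns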