def train_partition(idx, iterator):
    port = 50000 + idx % 256
    main = SparkFiles.get("main.py")
    architecture = SparkFiles.get("train_val.prototxt")
    model = SparkFiles.get("deepq16.caffemodel")
    solver = SparkFiles.get("solver.prototxt")
    root = SparkFiles.getRootDirectory()
    dset = os.path.join(root, "dset-%02d.hdf5" % idx)

    flag_file = "flags/__BARISTA_READY__.%d" % port
    if os.path.isfile(flag_file):
        os.remove(flag_file)

    # out = open(os.path.join(root, "barista.log"), 'w')
    subprocess.Popen(["python", main, architecture, model,
                      "--dataset", dset,
                      "--solver", solver,
                      "--dset-size", "30000",
                      "--initial-replay", "20000",
                      "--debug",
                      "--overwrite",
                      "--port", str(port)])

    while not os.path.isfile(flag_file):
        pass

    for step in iterator:
        dc = DummyClient("127.0.0.1", port)
        dc.send(barista.GRAD_UPDATE)
        response = dc.recv()
        yield response
def test_transform_data(self):
    """
    Function that checks all the main process functionality based on check:
    :num rows
    :num of columns
    :column names
    """
    spark, sc = init_spark()
    file_input_path = getcwd() + "/test_data.csv"
    file_compare_path = getcwd() + "/output_test"

    sc.addFile(file_input_path, True)
    df = spark.read.option("header", "true").option("delimiter", ";") \
        .schema(get_schema()).load(SparkFiles.get(file_input_path), format="csv")
    df = df.withColumn("country", lit("country_test"))
    df = process_logic(df)

    sc.addFile(file_compare_path, True)
    df_compare = spark.read.parquet(SparkFiles.get(file_compare_path))

    expected_cols = len(df_compare.columns)
    expected_rows = df_compare.count()
    cols = len(df.columns)
    rows = df.count()

    self.assertEqual(expected_cols, cols)
    self.assertEqual(expected_rows, rows)
    # Use all(...) rather than a bare list comprehension, which is always truthy
    # and would never fail the assertion.
    self.assertTrue(all(col in df.columns for col in df_compare.columns))
def predict(self, X):
    """ Assumes X is an RDD or a list of (data, label) minibatch tuples."""

    if isinstance(X, RDD):
        # Distribute files
        X.context.addFile(self._solver_filename)
        X.context.addFile(self._architecture_filename)
        X.mapPartitions(self.predict)

    solver_filename = \
        SparkFiles.get(self._solver_filename.rsplit('/', 1)[-1])
    architecture_filename = \
        SparkFiles.get(self._architecture_filename.rsplit('/', 1)[-1])

    # Might need to modify path to architecture file inside solver file.
    # Maybe we should do this before shipping the file since all Spark
    # tmp directories will be identically named.

    net = SGDSolver(solver_filename).net

    for minibatch_data, minibatch_label in X:
        # TODO: update function call for latest Caffe
        net.set_input_arrays(minibatch_data,
                             minibatch_label,
                             self.input_index)
        output = net.forward(end=self.score_blob)
        scores = output[self.score_blob]
        pred = np.argmax(scores, axis=1).squeeze()
        yield pred
def start_spark(app_name='my_spark_app', master='local[*]', files=[],
                spark_config={}):
    spark_builder = (SparkSession.builder.master(master).config(
        "spark.driver.extraClassPath",
        "C:\\Users\\Saumya.Sahu\\Downloads\\Microsoft JDBC Driver 6.0 for SQL Server\\sqljdbc_6.0\\enu\\jre8\\sqljdbc42.jar"
    ).appName(app_name))

    spark_files = ','.join(list(files))
    spark_builder.config('spark.files', spark_files)

    for key, val in spark_config.items():
        spark_builder.config(key, val)

    spark_sess = spark_builder.getOrCreate()

    spark_files_dir = SparkFiles.getRootDirectory()
    config_f = SparkFiles.get('Config/etl_config.json')

    config_files = [
        filename for filename in listdir(spark_files_dir)
        if filename.endswith('config.json')
    ]

    if config_files:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_dict = json.load(config_file)
        print(config_file)
    else:
        config_dict = None

    return spark_sess, config_dict
def spark_mapper(current_range):
    """
    Gets the paths to the file(s) in the current executor, then
    declares the headers found.

    Args:
        current_range (tuple): A pair that contains the starting and ending
            values of the current range.

    Returns:
        function: The map function to be executed on each executor,
        complete with all headers needed for the analysis.
    """
    # Get and declare headers on each worker
    headers_on_executor = [
        SparkFiles.get(ntpath.basename(filepath))
        for filepath in includes_headers
    ]
    Utils.declare_headers(headers_on_executor)

    # Get and declare shared libraries on each worker
    shared_libs_on_ex = [
        SparkFiles.get(ntpath.basename(filepath))
        for filepath in includes_shared_libraries
    ]
    Utils.declare_shared_libraries(shared_libs_on_ex)

    return mapper(current_range)
def processTweets(_, lines):
    import re
    import pyproj
    import rtree
    import shapely.geometry as geom

    separator = re.compile(r'\W+')
    proj = pyproj.Proj(init="epsg:5070", preserve_units=True)
    index, zones = createIndex(SparkFiles.get('500cities_tracts.geojson'))
    counts = {}

    drugs1 = SparkFiles.get('drug_illegal.txt')
    drugs2 = SparkFiles.get('drug_sched2.txt')
    terms = set(map(lambda x: x.strip(),
                    (open(drugs1, 'r').readlines() +
                     open(drugs2, 'r').readlines())))

    for line in lines:
        fields = line.split('|')
        lat, lon, body = fields[1], fields[2], fields[-1]
        words = set(separator.split(body.lower()))
        # A tweet matches a drug term when every word of that term appears in it
        # (the term variable no longer shadows the tweet's own word set).
        for term in terms:
            term_words = set(term.split())
            if len(term_words.intersection(words)) >= len(term_words):
                p = geom.Point(proj(lon, lat))
                match = None
                try:
                    zone = findZone(p, index, zones)
                except:
                    continue
                if zone:
                    if zone[1] > 0:
                        counts[zone[0]] = counts.get(zone[0], 0) + (1.0 / zone[1])
    return counts.items()
def crfexec(sc, inputFilename, outputDirectory,
            limit=LIMIT, location='hdfs', outputFormat="text", partitions=None):
    crfConfigDir = os.path.join(os.path.dirname(__file__), "data/config")
    crfExecutable = "/usr/local/bin/crf_test"
    crfModelFilename = os.path.join(crfConfigDir, "dig-hair-eye-train.model")

    rdd_pipeinput = sc.textFile(inputFilename)
    rdd_pipeinput.setName('rdd_pipeinput')
    # rdd_pipeinput.persist()

    # DON'T USE SparkFiles.get to fetch the crf_test or model
    # This only works with local Spark (--master local[*])
    if location == 'hdfs':
        cmd = "%s -m %s" % (os.path.basename(crfExecutable),
                            os.path.basename(crfModelFilename))
    elif location == 'local':
        cmd = "%s -m %s" % (SparkFiles.get(os.path.basename(crfExecutable)),
                            SparkFiles.get(os.path.basename(crfModelFilename)))
    print "### %s" % cmd

    rdd_crf = rdd_pipeinput.pipe(cmd)

    rdd_final = rdd_crf
    if outputFormat == "sequence":
        rdd_final.saveAsSequenceFile(outputDirectory)
    elif outputFormat == "text":
        rdd_final.saveAsTextFile(outputDirectory)
    else:
        raise RuntimeError("Unrecognized output format: %s" % outputFormat)
def main( toxcast: str, output: str, adverse_events: str, safety_risk: str, log_file: Optional[str] = None, ): """ This module puts together data from different sources that describe target safety liabilities. Args: adverse_events: Input TSV containing adverse events associated with targets that have been collected from relevant publications. Fetched from GitHub. safety_risk: Input TSV containing cardiovascular safety liabilities associated with targets that have been collected from relevant publications. Fetched from GitHub. toxcast: Input table containing biological processes associated with relevant targets that have been observed in toxicity assays. output: Output gzipped json file following the target safety liabilities data model. log_file: Destination of the logs generated by this script. Defaults to None. """ # Logger initializer. If no log_file is specified, logs are written to stderr logging.basicConfig( level=logging.INFO, format= '%(asctime)s %(levelname)s %(module)s - %(funcName)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', ) if log_file: logging.config.fileConfig(filename=log_file) else: logging.StreamHandler(sys.stderr) # Initialize spark context global spark spark = initialize_spark() spark.sparkContext.addFile(adverse_events) spark.sparkContext.addFile(safety_risk) logging.info('Remote files successfully added to the Spark Context.') # Load and process the input files into dataframes ae_df = process_adverse_events( SparkFiles.get(adverse_events.split('/')[-1])) sr_df = process_safety_risk(SparkFiles.get(safety_risk.split('/')[-1])) toxcast_df = process_toxcast(toxcast) logging.info('Data has been processed. Merging...') # Combine dfs and group evidence safety_df = ( # dfs are combined; unionByName is used instead of union to address for the differences in the schemas ae_df.unionByName(sr_df, allowMissingColumns=True).unionByName( toxcast_df, allowMissingColumns=True)) # Write output logging.info('Evidence strings have been processed. Saving...') write_evidence_strings(safety_df, output) logging.info( f'{safety_df.count()} evidence of safety liabilities have been saved to {output}. Exiting.' ) return 0
def ship_prototxt_to_data(self, rdd):
    rdd.context.addFile(self._solver_filename)
    rdd.context.addFile(self._architecture_filename)
    solver_filename = \
        SparkFiles.get(self._solver_filename.rsplit('/', 1)[-1])
    architecture_filename = \
        SparkFiles.get(self._architecture_filename.rsplit('/', 1)[-1])

    return solver_filename, architecture_filename
def start_spark(app_name='my_spark_app', master='local[*]', jar_packages=[],
                files=[], spark_config={}):
    # detect execution environment
    flag_repl = not(hasattr(__main__, '__file__'))
    flag_debug = 'DEBUG' in environ.keys()

    if not (flag_repl or flag_debug):
        spark_builder = (
            SparkSession
            .builder
            .appName(app_name))
    else:
        spark_builder = (
            SparkSession
            .builder
            .master(master)
            .appName(app_name))

    spark_files = ','.join(list(files))
    spark_builder.config('spark.files', spark_files)

    # add other config params
    for key, val in spark_config.items():
        spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark_sess = spark_builder.getOrCreate()
    spark_logger = logging.Log4j(spark_sess)

    # get config file if sent to cluster with --files
    spark_files_dir = SparkFiles.getRootDirectory()
    spark_logger.warn('spark_files_dir' + str(spark_files_dir))
    for filename in listdir(spark_files_dir):
        spark_logger.warn('filename' + str(filename))
    config_f = SparkFiles.get('configs/etl_config.json')
    spark_logger.warn('config_f' + str(config_f))
    config_files = [filename
                    for filename in listdir(spark_files_dir)
                    if filename.endswith('config.json')]
    spark_logger.warn('config_files' + str(config_files))

    if config_files:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_dict = json.load(config_file)
        spark_logger.warn('loaded config from ' + config_files[0])
    else:
        spark_logger.warn('no config file found')
        config_dict = None

    return spark_sess, spark_logger, config_dict
def compute_buried_area(pdb_complex): chZ = "chZ" sasa_complex = -1.0 sasa_rec = -1.0 sasa_lig = -1.0 buried_total = -1.0 base_name = get_name_model_pdb(pdb_complex) ligand_name = get_ligand_from_receptor_ligand_model(base_name) f_pdb_ligand_no_docking = os.path.join(pdb_ligand_path.value,ligand_name+".pdb") f_ndx = os.path.join(path_analysis_pdb_complex_b.value,base_name+".ndx") f_temp_sasa_complex = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_complex.xvg") f_temp_sasa_rec = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_rec.xvg") f_temp_sasa_lig = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_lig.xvg") # Makes the index file with the ligand (chain z) and the rest (non chain z) script_make_ndx = SparkFiles.get("make_ndx_buried_area_total.sh") #Getting bash script that was copied by addFile command command = script_make_ndx + " " + gromacs_path.value + " "+ pdb_complex + " "+ f_ndx process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE) stdout, stderr = process.communicate() command = gromacs_path.value +"gmx sasa -f " + pdb_complex + " -s " + pdb_complex + " -nopbc " + " -n " + f_ndx + " -surface System " + " -output System "+ " -xvg none " + " -o " + f_temp_sasa_complex process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE) stdout, stderr = process.communicate() # Makes f_temp_sasa_rec file script_make_sasa_rec = SparkFiles.get("make_sasa_rec_buried_area_total.sh") #Getting bash script that was copied by addFile command command = script_make_sasa_rec + " " + gromacs_path.value + " "+ pdb_complex + " "+ f_ndx + " " + f_temp_sasa_rec process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE) stdout, stderr = process.communicate() command = gromacs_path.value +"gmx sasa -f " + pdb_complex + " -s " + pdb_complex + " -nopbc " + " -n " + f_ndx + " -surface chZ " + " -output chZ "+ " -xvg none " + " -o " + f_temp_sasa_lig process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE) stdout, stderr = process.communicate() sasa_complex = get_value_from_xvg_sasa(f_temp_sasa_complex) sasa_rec = get_value_from_xvg_sasa(f_temp_sasa_rec) sasa_lig = get_value_from_xvg_sasa(f_temp_sasa_lig) buried_total = sasa_rec + sasa_lig - sasa_complex #Generating result - See column sorting because resultaed file will be created based on this sorting returned_list = (base_name, buried_total) #Deleting files os.remove(f_ndx) os.remove(f_temp_sasa_complex) os.remove(f_temp_sasa_rec) os.remove(f_temp_sasa_lig) return returned_list
class Dictionary(object):
    automaton = Automaton()

    with open(SparkFiles.get('dict_token.txt')) as f:
        token_dict = f.read().split()
        token_dictionary = [x.strip() for x in token_dict]
        automaton.add_all(token_dictionary)

    with open(SparkFiles.get('dict_garbage.txt')) as f:
        garbage_dict = f.readlines()
        garbage_dictionary = [x.strip() for x in garbage_dict]
        garbage_dictionary.sort(key=lambda item: (-len(item), item))
def SparkFiles(self, file_path):
    u""" Access files shipped with the Spark job """
    # Get the local path of an added file
    SparkFiles.get(file_path)
    # Get the root directory that contains the added files
    SparkFiles.getRootDirectory()

    with open(SparkFiles.get(file_path)) as f:
        rows = f.readlines()
        for row in rows:
            print row
    return rows
def start_spark(app_name="my_spark_app", master="local[*]", files=['etl_conf.json']): flag_repl = not (hasattr(__main__, '__file__')) flag_debug = 'DEBUG' in environ.keys() if not (flag_repl or flag_debug): spark_builder = (SparkSession.builder.appName(app_name)) else: spark_builder = SparkSession.builder.appName(app_name).master(master) spark_files = '.'.join(list(files)) spark_builder.config('spark.files', spark_files) spark_builder.config(conf=SparkConf()) spark_sess = spark_builder.getOrCreate() #spark_logger=logger.Log4j(spark_sess) spark_files_dir = SparkFiles.getRootDirectory() config_files = [ x for x in listdir(spark_files_dir) if x.endswith('conf.json') ] if config_files: path_to_config_file = path.join(spark_files_dir, config_files[0]) with open(path_to_config_file, 'r') as f: config_dict = json.load(f) else: config_dict = None return spark_sess, config_dict
def read_gene_burden_curation(curated_data: str, spark_instance: SparkSession) -> DataFrame:
    """Read manual gene burden curation from remote to a Spark DataFrame."""
    schema = StructType(
        [
            StructField('projectId', StringType(), True),
            StructField('targetFromSource', StringType(), True),
            StructField('targetFromSourceId', StringType(), True),
            StructField('diseaseFromSource', StringType(), True),
            StructField('diseaseFromSourceMappedId', StringType(), True),
            StructField('resourceScore', DoubleType(), True),
            StructField('pValueMantissa', DoubleType(), True),
            StructField('pValueExponent', IntegerType(), True),
            StructField('oddsRatio', DoubleType(), True),
            StructField('ConfidenceIntervalLower', DoubleType(), True),
            StructField('ConfidenceIntervalUpper', DoubleType(), True),
            StructField('beta', DoubleType(), True),
            StructField('ancestry', StringType(), True),
            StructField('ancestryId', StringType(), True),
            StructField('cohortId', StringType(), True),
            StructField('studyId', StringType(), True),
            StructField('studySampleSize', IntegerType(), True),
            StructField('studyCases', IntegerType(), True),
            StructField('studyCasesWithQualifyingVariants', IntegerType(), True),
            StructField('allelicRequirements', StringType(), True),
            StructField('statisticalMethod', StringType(), True),
            StructField('statisticalMethodOverview', StringType(), True),
            StructField('literature', StringType(), True),
            StructField('url', StringType(), True),
        ]
    )
    # Use the SparkSession passed in rather than relying on a global `spark`
    spark_instance.sparkContext.addFile(curated_data)

    return spark_instance.read.csv(
        SparkFiles.get(curated_data.split('/')[-1]), sep='\t', header=True, schema=schema
    )
def read_source_data(self):
    print("Log Step 1: Reading Source Data")
    try:
        if "json" in self.filename:
            with open(self.filename, newline=''):
                file_path = dir_path + "/" + self.filename
                disticts_df = spark.read.json(SparkFiles.get(file_path))
                return (disticts_df)
        elif "log" in self.filename:
            list_final = []
            with open(self.filename) as f:
                for line in f:
                    list_a = json.loads(line)
                    list_final.append(list_a)
            drivers_df = pd.DataFrame(list_final)
            drivers_df = spark.createDataFrame(drivers_df, schema=driverSchema)
            return (drivers_df)
    except TestFailed as message:
        print("Entered exception")
        print(message)
        email = Email(message)
        email.send_email()
def transformAndMask(spark, df_valid, sensitive_words):
    with open(SparkFiles.get(sensitive_words)) as f:
        fieldnames = f.read().split('\n')
    for fieldname in fieldnames:
        if (fieldname):
            df_valid = func1(fieldname, df_valid)
    return df_valid
def get_cached_dataset():
    """
    Returns a dataset of missense mutations in dbSNP mapped to UniProt and PDB residue positions.

    The dataset contains the following columns:

    +---+---------+---------+------------+---------+----------+----------+----------+---------+-------+-------+ ...
    |chr|      pos|   snp_id|  master_acc|master_gi|master_pos|master_res|master_var|   pdb_gi|pdb_res|pdb_pos| ...
    +---+---------+---------+------------+---------+----------+----------+----------+---------+-------+-------+ ...
    |  4| 79525461|764726341|   NP_005130|  4826643|       274|         R|         *|157829892|      R|    274|
    |  4| 79525462|771966889|   NP_005130|  4826643|       274|         R|         P|157829892|      R|    274|
    ...

    +-----------+--------------------+----------+------+---------+---------+----------+ ...
    |blast_ident|             clinsig|pdbChainId|tax_id|pdbResNum|uniprotId|uniprotNum| ...
    +-----------+--------------------+----------+------+---------+---------+----------+ ...
    |      100.0|                null|    1AII.A|  9606|      275|   P12429|       274| ...
    |      100.0|                null|    1AII.A|  9606|      275|   P12429|       274|

    Reference: dbSNP: https://www.ncbi.nlm.nih.gov/projects/SNP/

    :return: dataset of missense mutations in dbSNP
    """
    spark = SparkSession.builder.getOrCreate()

    # download cached dataset
    spark.sparkContext.addFile(CACHED_FILE_URL)

    # read dataset
    spark.conf.set("spark.sql.orc.impl", "native")
    return spark.read.orc(SparkFiles.get(FILENAME))
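# A hedged usage sketch for get_cached_dataset() above: the filter and the selected
# columns are illustrative assumptions; only the column names come from the docstring table.
if __name__ == "__main__":
    ds = get_cached_dataset()
    # Inspect the PDB/UniProt mapping for rows with an annotated clinical significance.
    ds.filter("clinsig IS NOT NULL") \
      .select("pdbChainId", "pdbResNum", "uniprotId", "uniprotNum") \
      .show(10, truncate=False)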
def test_add_file_locally(self):
    path = os.path.join(SPARK_HOME, "python/test_support/hello/hello.txt")
    self.sc.addFile(path)
    download_path = SparkFiles.get("hello.txt")
    self.assertNotEqual(path, download_path)
    with open(download_path) as test_file:
        self.assertEqual("Hello World!\n", test_file.readline())
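# A minimal, self-contained sketch (not part of the test suite above) of the same
# addFile / SparkFiles.get round trip, exercised from inside a task where it matters
# most; the temp file name hello.txt mirrors the test but is otherwise an assumption.
import os
import tempfile

from pyspark import SparkContext, SparkFiles

if __name__ == "__main__":
    sc = SparkContext("local[2]", "sparkfiles_sketch")

    # Create a small local file and ship it to every executor.
    path = os.path.join(tempfile.mkdtemp(), "hello.txt")
    with open(path, "w") as f:
        f.write("Hello World!\n")
    sc.addFile(path)

    def read_first_line(_):
        # Executors resolve the shipped copy by basename only.
        with open(SparkFiles.get("hello.txt")) as shipped:
            return [shipped.readline().strip()]

    print(sc.parallelize(range(2), 2).mapPartitions(read_first_line).collect())
    sc.stop()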
def start_spark(app, files=[], pyfiles=[]): ## Spark Context ## #conf = SparkConf().setAppName("AppSparkContext").set("spark.files","etl_config.json").toDebugString() #sc = SparkContext(conf=conf) #warehouse_location = abspath('hdfs://localhost:9001/lake/files') ## Spark Session ## spark_builder = SparkSession.builder.appName(app).master('local[*]') spark_builder.config( 'spark.files', "SparkFinal/configs/etl_config.json,/usr/local/Cellar/hive/2.1.0/libexec/conf/hive-site.xml" ) #spark_builder.config('spark.logConf','true') #spark_builder.config('spark.jars.repositories','/Users/kkartikgoel/dev/spark-2.1.0-bin-hadoop2.7/jars') #spark_builder.config('spark.jars.packages','com.databricks:spark-avro_2.10:1.0.0') #spark_builder.config('spark.jars.packages','com.databricks:spark-avro_2.10:1.0.0') #spark_builder.config('hive.metastore.uris','thrift://localhost:9083') spark_sess = spark_builder.enableHiveSupport().getOrCreate() ##properties spark_conf_list = spark_sess.sparkContext.getConf().getAll() for key, val in spark_conf_list: print key + "=" + val #spark_sess.sparkContext.getConf().contains("spark.files") #spark_sess.conf.get("spark.files") print "Spark WebURL= %s" % spark_sess.sparkContext.uiWebUrl ## Spark Files ## spark_files_dir = SparkFiles.getRootDirectory() print "spark_files_dir= %s" % spark_files_dir print "file_in_Spark_dir= %s" % os.listdir(spark_files_dir) spark_sess.sql("SET -v").show() spark_logger = logging.Log4j(spark_sess) return spark_sess, spark_files_dir, spark_logger
def run_etl(source, output_path, spark=None): """ Run Spark ETL of source file. :params source (string) - name of source type (should be module in intake/sources/) :param output_path (string) - where to write parquet output :params spark - spark context """ if not spark: spark = SparkSession.builder.getOrCreate() config = yaml.safe_load( pkg_resources.resource_stream(f'intake.sources.{source}', f'{source}_config.yml')) file_path = config['source'] src_type = file_path.split('.')[-1] header_keys = config['header_keys'] ignore_symbol = config['ignore_symbol'] spark.sparkContext.addFile(file_path) data_path = SparkFiles.get(file_path.split('/')[-1]) rdd = spark.sparkContext.textFile(data_path) # Use mapPartitions for structuring rows to only load # keys once per partition. Alternatively, we can consider # broadcasting the header_keys to workers... # TODO - refactor column renames/yyyymmdd index creation as add more data sources... df = rdd.mapPartitions(lambda partition: filter_helper(partition, header=','.join(list(header_keys.keys())), ignore_symbol=ignore_symbol)) \ .mapPartitions(lambda partition: structure_as_row(partition, header_keys, src_type)) \ .map(lambda Row: create_yyyymmdd_index(Row.asDict())).toDF() \ df = column_rename_factory(df, source) df.write.mode("overwrite").parquet( output_path) # Always overwrite with latest dataset
def privatize_example(example, local_vocab, local_embedding_dims, local_epsilon):
    from annoy import AnnoyIndex

    # Load files
    local_index = AnnoyIndex(local_embedding_dims, 'euclidean')
    local_index.load(SparkFiles.get("index.ann"))

    sensitive_phrases = [
        x.strip() for x in clean_example(example) if x.strip()
    ]

    privatized_phrases = []
    for sensitive_phrase in sensitive_phrases:
        privatized_words = []
        for sensitive_word in sensitive_phrase.split(' '):
            privatized_word = replace_word(sensitive_word, local_vocab,
                                           local_epsilon, local_index,
                                           local_embedding_dims)
            privatized_words.append(privatized_word)

        # Flatten nested list of words
        privatized_phrases.append(itertools.chain(*[privatized_words]))

    privatized_review = " ".join(list(itertools.chain(*privatized_phrases)))
    privatized_row = "\"{}\",{}".format(privatized_review, example.sentiment)

    return privatized_row
def partition_handle(mons):
    from geoip2 import database

    etcd_key = get_etcd_key(cfg)

    def ip2subdivision(monitor):
        try:
            province_name = reader.city(
                monitor['pubip']).subdivisions.most_specific.name
            monitor['province'] = province_name or 'None'
            identity_code = IDENTITY_CODE.get(province_name)
            # Look up the region group for this identity code
            monitor['group'] = etcd_key.get(identity_code, 'default')
        except:
            monitor['province'] = 'None'
            monitor['group'] = 'default'
        return monitor

    def rtt2level(monitor):
        for info in monitor.get('req_rtt', []):
            info['rtt_level'] = rtt_level_map.get(
                bisect.bisect_right(rtt_level_list, info['rtt']),
                rtt_level_map[1])
        return monitor

    reader = database.Reader(SparkFiles.get(geo_db_path))
    ip_res = [ip2subdivision(mon) for mon in mons]
    rtt_level_res = [rtt2level(mon) for mon in ip_res]
    return rtt_level_res
def get_spark_session(keyfile=None, chdir=False, gcs_temp_bucket=None) -> SparkSession: conf = SparkConf().setAppName("Metric Engine").set("spark.scheduler.mode", "FAIR") if gcs_temp_bucket is not None: conf = conf.set('temporaryGcsBucket', gcs_temp_bucket) conf = conf.set( "spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem") conf = conf.set("spark.hadoop.fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS") conf = conf.set('spark.hadoop.fs.gs.auth.service.account.enable', 'true') if keyfile is not None: conf = conf.set( 'spark.hadoop.google.cloud.auth.service.account.json.keyfile', keyfile) spark = SparkSession.builder.config( conf=conf).enableHiveSupport().getOrCreate() if chdir: os.chdir(SparkFiles.getRootDirectory()) return spark
def collect_docs(p, lang_detection_model_name=None, lang='en'):
    # Initialize so the `if model:` check below cannot raise NameError
    # when no language-detection model is supplied.
    model = None
    if lang_detection_model_name != None:
        from pyfasttext import FastText
        model_path = SparkFiles.get(lang_detection_model_name)
        model = FastText(model_path)

    regex = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)

    result = []
    lines = list(p)
    indices = [i for i, line in enumerate(lines) if regex.search(line.strip())]
    for i in range(0, len(indices)):
        idx = indices[i]
        content = lines[idx + 1]
        paras = re.findall('<PAR>(.*?)</PAR>', content, re.DOTALL)
        if model:  # filter only english paras
            langs = model.predict(paras)
            en_paras = list(filter(lambda p: lang in p[1], zip(paras, langs)))
            paras = list(map(lambda pair: pair[0], en_paras))
        if paras:
            url = lines[idx].strip()
            result.append((url, paras))

    return result
def load_params():
    try:
        import yaml
        params = yaml.load(open(SparkFiles.get('params.yaml')))
    except ImportError:
        from params import params
    return params
def _install_libs(run_id):
    current_dir = os.getcwd()
    base_dir = os.path.join(current_dir, run_id)
    lib_dir = os.path.join(base_dir, 'python_libs')
    lib_zip = SparkFiles.get("lib.zip")
    lock_name = os.path.join(base_dir, '__lock__')

    os.makedirs(base_dir, exist_ok=True)

    for i in range(0, 100):
        try:
            lock_fh = os.open(lock_name, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
            os.close(lock_fh)
            try:
                if not os.path.isdir(lib_dir):
                    print("_install_libs: install lib starts")
                    os.makedirs(lib_dir)
                    subprocess.check_call(['unzip', "-qq", lib_zip, "-d", lib_dir])
                    print("_install_libs: install lib done")
                if lib_dir not in sys.path:
                    print(f"_install_libs: add {lib_dir} path")
                    sys.path.insert(0, lib_dir)
                return
            finally:
                os.remove(lock_name)
        except OSError as e:
            if e.errno == errno.EEXIST:
                time.sleep(random.randint(1, 10))
                continue
            raise

    raise Exception("Failed to install libraries!")
def get_sentences(s, prop): import nltk nltk.data.path.append(SparkFiles.get("nltk_data")) from nltk import ne_chunk, pos_tag, word_tokenize, sent_tokenize from nltk.corpus import stopwords text = s[prop] if "_____" in text: text = text.split("_____")[0] if "Hello, users of CMV!" in text: text = text.split("Hello, users of CMV!")[0] sentences = sent_tokenize(text) sentences_processed = [] for se in sentences: stops = stopwords.words('english') + list(string.punctuation) wo_stops = " ".join( [word for word in re.findall('\w+', se) if word not in stops]) sentences_processed.append(wo_stops) s['sentences'] = sentences_processed entities = [] tokenized = word_tokenize(text) chunks = ne_chunk(pos_tag(tokenized)) for c in chunks.subtrees(): if c.label() in ('PERSON', 'GPE'): entities.append(" ".join(w for w, t in c)) s['ne'] = entities return s
def send2monit(data): """ Helper function which wraps StompAMQ and incoming dataframe into notification message. Then it sends it to AMQ end-point provided by credentials file. """ if not StompAMQ: return # main function logic with open(SparkFiles.get('amq_broker.json')) as istream: creds = json.load(istream) host, port = creds['host_and_ports'].split(':') port = int(port) amq = StompAMQ(creds['username'], creds['password'], \ creds['producer'], creds['topic'], \ validation_schema=None, \ host_and_ports=[(host, port)]) arr = [] for idx, row in enumerate(data): # if not idx: # print("### row", row, type(row)) doc = json.loads(row) hid = doc.get("hash", 1) arr.append(amq.make_notification(doc, hid)) amq.send(arr) print("### Send %s docs to CERN MONIT" % len(arr))
def process(records): import csv reader = csv.reader(records) counts = {} streets_list = dict() with open(SparkFiles.get("nyc_cscl.csv")) as csv_file: tmp = csv.DictReader(csv_file, delimiter=',') for item in tmp: # print(item) if item['FULL_STREE'] not in streets_list.keys(): streets_list[item['FULL_STREE']] = [] streets_list[item['FULL_STREE']].append([ item['PHYSICALID'], item['FULL_STREE'], item['ST_NAME'], item['L_LOW_HN'], item['L_HIGH_HN'], item['R_LOW_HN'], item['R_HIGH_HN'] ]) print("!!!!!!!!!!!!!!") print("!!!!!!!!!!!!!!") print("!!!!!!!!!!!!!!") print("!!!!!!!!!!!!!!") for row in reader: county = row[0] num = row[1] st = row[2] zoneid = find_id(num, st, streets_list) if zoneid: counts[zoneid] = counts.get(zoneid, 0) + 1 return counts.items()
def start_spark(app_name='my_spark_app', master='local[*]', jar_packages=[],
                files=[], spark_config={}):
    spark_builder = (SparkSession.builder.appName(app_name))

    # create Spark JAR packages string
    spark_jars_packages = ','.join(list(jar_packages))
    spark_builder.config('spark.jars.packages', spark_jars_packages)

    spark_files = ','.join(list(files))
    spark_builder.config('spark.files', spark_files)

    # add other config params
    for key, val in spark_config.items():
        spark_builder.config(key, val)

    # create session and retrieve Spark logger object
    spark_sess = spark_builder.getOrCreate()
    spark_logger = log.Log4j(spark_sess)

    # get config file if sent to cluster with --files
    spark_files_dir = SparkFiles.getRootDirectory()
    config_files = [filename
                    for filename in listdir(spark_files_dir)
                    if filename.endswith('config.json')]

    if config_files:
        path_to_config_file = path.join(spark_files_dir, config_files[0])
        with open(path_to_config_file, 'r') as config_file:
            config_dict = json.load(config_file)
        spark_logger.warn('loaded config from ' + config_files[0])
    else:
        spark_logger.warn('no config file found')
        config_dict = None

    return spark_sess, spark_logger, config_dict
def get_sentences(s, prop): import nltk nltk.data.path.append(SparkFiles.get("nltk_data")) from nltk import ne_chunk, pos_tag, word_tokenize, sent_tokenize from nltk.corpus import stopwords try: text = s[prop] except Exception as e: print(s) s['sentences'] = [] return s if "_____" in text: text = text.split("_____")[0] if "Hello, users of CMV!" in text: text = text.split("Hello, users of CMV!")[0] sentences = sent_tokenize(text) sentences_processed = [] for se in sentences: stops = stopwords.words('english') + list(string.punctuation) wo_stops = " ".join( [word for word in re.findall('\w+', se) if word not in stops]) sentences_processed.append(wo_stops) s['sentences'] = sentences_processed return s
def load_timestep(timestep):
    path = data_path
    if download or config.copy_local:
        path = SparkFiles.get('pr_amon_BCSD_rcp26_r1i1p1_CONUS_bcc-csm1-1_202101-202512.nc')
    data = Dataset(path)
    pr = data.variables['pr']
    step = pr[timestep]
    # Return valid values
    return (timestep, step[~step.mask])
def spawn_barista(partition):
    main = SparkFiles.get("main.py")
    architecture = SparkFiles.get("train_val.prototxt")
    model = SparkFiles.get("deepq16.caffemodel")
    solver = SparkFiles.get("solver.prototxt")
    root = SparkFiles.getRootDirectory()
    dset = os.path.join(root, "dset.hdf5")

    flag_file = "flags/__BARISTA_READY__"
    if os.path.isfile(flag_file):
        os.remove("flags/__BARISTA_READY__")

    out = open(os.path.join(root, "barista.log"), 'w')
    subprocess.Popen(["python", main, architecture, model,
                      "--dataset", dset,
                      "--solver", solver],
                     stdout=out,
                     stderr=subprocess.STDOUT)

    while not os.path.isfile("flags/__BARISTA_READY__"):
        pass
def partitionIp2city(iter):
    from geoip2 import database

    def ip2city(ip):
        try:
            city = reader.city(ip).city.name
        except:
            city = 'not found'
        return city

    reader = database.Reader(SparkFiles.get(geoDBpath))
    # return [ip2city(ip) for ip in iter]
    return ip2city(iter)
def main(sc): sqlContext = SQLContext(sc) df = sqlContext.jsonFile(DATA_PATH) #add the filter file sc.addFile(FILTER_TERMS_FILE_PATH) filter_terms = sc.textFile(SparkFiles.get("freebase-symptoms-just-terms.txt")) global filter_terms_set_bc filter_terms_set_bc = sc.broadcast(Set(filter_terms.collect())) # Register the DataFrame as a table. df.registerTempTable("tweet") results = sqlContext.sql("SELECT id,user.id,user.lang,created_at, coordinates,text FROM tweet where user.lang='en'") #filter tweets to find health related tweets filter_health_tweets = results.rdd.filter(healthFilter) filter_health_tweets.mapPartitions(writeRecords).saveAsTextFile("output/")
def compute_buried_area_ligand(pdb_complex): chZ = "chZ" buried_lig_rec_perc = -1.0 buried_lig_rec = -1.0 buried_lig_lig = -1.0 buried_lig_lig_perc = -1.0 base_name = get_name_model_pdb(pdb_complex) ligand_name = get_ligand_from_receptor_ligand_model(base_name) receptor_name = get_receptor_from_receptor_ligand_model(base_name) pose = get_model_from_receptor_ligand_model(base_name) pdb_before_vs = os.path.join(pdb_ligand_path.value,ligand_name+".pdb") #ndx files f_ndx = os.path.join(path_analysis_pdb_complex_b.value,base_name+".ndx") #xvg files xvg_temp_sasa_lig_pose = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_lig_pose"+".xvg") xvg_temp_sasa_lig_complex = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_lig_complex"+".xvg") xvg_temp_sasa_lig_min = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_lig_min"+".xvg") # Creates a selection with the residues that are closer than 6A to the ligand script_make_ndx_buried_area_ligand = SparkFiles.get("make_ndx_buried_area_ligand.sh") #Getting bash script that was copied by addFile command command = script_make_ndx_buried_area_ligand + " " + gromacs_path.value + " "+ pdb_complex + " "+ f_ndx + " "+ xvg_temp_sasa_lig_pose + " "+ str(probe.value) + " "+ str(ndots.value) + " "+ xvg_temp_sasa_lig_complex + " "+ pdb_before_vs + " "+ xvg_temp_sasa_lig_min process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE) stdout, stderr = process.communicate() try: # SASA of the isolated ligand in the pose conformation sasa_lig_pose = get_value_from_xvg_sasa(xvg_temp_sasa_lig_pose) # SASA of the complexed ligand in the pose conformation sasa_lig_complex = get_value_from_xvg_sasa(xvg_temp_sasa_lig_complex) # SASA of the isolated ligand in its energy-minimized conformation. Only for carbohydrates! sasa_lig_min = get_value_from_xvg_sasa(xvg_temp_sasa_lig_min) # Area of the ligand which is buried in the receptor buried_lig_rec = sasa_lig_pose - sasa_lig_complex buried_lig_rec_perc = buried_lig_rec / sasa_lig_pose # Area of the ligand in the pose conformation which is buried in itself when compared to the energy-minimized conformation buried_lig_lig = sasa_lig_min - sasa_lig_pose buried_lig_lig_perc = buried_lig_lig / sasa_lig_min returned_list = (base_name, buried_lig_rec, buried_lig_rec_perc, buried_lig_lig, buried_lig_lig_perc) #Deleting files os.remove(f_ndx) os.remove(xvg_temp_sasa_lig_pose) os.remove(xvg_temp_sasa_lig_complex) os.remove(xvg_temp_sasa_lig_min) return returned_list except: return (base_name, float(0.0), float(0.0), float(0.0), float(0.0))
def partition_processor(partitionlinechunks): """ Partition logic for pyspark parallel processing """ model_pipe_object = joblib.load(SparkFiles.get("mmp_phase1_D2.clf")) def set_predictions(x): segment = model_pipe_object.predict_proba(x) return segment df_with_nan = build_dataframe(partitionlinechunks) df_with_newline = df_with_nan.replace(u"NULL", pd.np.nan) behaviour_df = df_with_newline.replace(u"\\N", pd.np.nan) predictions_ser = set_predictions(behaviour_df) predictions_list = [value for value in [zip(predictions_ser.index, predictions_ser.loc[:,'A'], predictions_ser.loc[:,'Y'], predictions_ser.loc[:,'segment'], predictions_ser.loc[:,'model_version'])]] return iter(predictions_list)
def load_matrix( filename, sc, num_users=NUM_USER, num_items=NUM_SONG ): global alpha global total global num_zeros print 'Start to load matrix...' t0 = time.time() counts = np.zeros((num_users, num_items)) total = 0.0 num_zeros = num_users * num_items url = "s3n://spark-mllib/fastcode/data/" + filename # url = "hdfs://localhost:9000/data/" + filename print 'loading... ' + url # data = sc.textFile(url) # data.map(lambda l: fill_maxtrix(l, counts)) sc.addFile(url) with open(SparkFiles.get(filename)) as f: for line in f: fill_maxtrix(line, counts) alpha = num_zeros / total print 'alpha %.2f' % alpha counts *= alpha t1 = time.time() print 'Finished loading matrix in %f seconds\n' % (t1 - t0) print 'Total entry:', num_users * num_items print 'Non-zeros:', num_users * num_items - num_zeros counts = sparse.csr_matrix(counts) return counts, num_users * num_items - num_zeros
distScript = os.getcwd()+"/src/R/finddistance.R" distScriptName = "finddistance.R" sc.addFile(distScript) def hasDistInfo(call): """Verify that a call has the fields required to compute the distance""" requiredFields = ["mylat", "mylong", "contactlat", "contactlong"] return all(map(lambda f: call[f], requiredFields)) def formatCall(call): """Format a call so that it can be parsed by our R program""" return "{0},{1},{2},{3}".format( call["mylat"], call["mylong"], call["contactlat"], call["contactlong"]) pipeInputs = contactsContactList.values().flatMap( lambda calls: map(formatCall, filter(hasDistInfo, calls))) distances = pipeInputs.pipe(SparkFiles.get(distScriptName)) print distances.collect() # Convert our RDD of strings to numeric data so we can compute stats and # remove the outliers. distanceNumerics = distances.map(lambda string: float(string)) stats = distanceNumerics.stats() stddev = stats.stdev() mean = stats.mean() reasonableDistances = distanceNumerics.filter( lambda x: math.fabs(x - mean) < 3 * stddev) print reasonableDistances.collect()
def driver(sc, inputFilename, outputDirectory,
           crfExecutable, crfScript,
           featureListFilename, crfModelFilename,
           eyeColorRef, eyeColorConfig, hairRef, hairConfig,
           limit=limit, location='hdfs', outputFormat="text", partitions=None):
    dump = False
    partitions = None

    # Program to compute CRF++
    c = crf_features.CrfFeatures(featureListFilename)
    # Add files to be downloaded with this Spark job on every node.
    sc.addFile(crfExecutable)
    sc.addFile(crfScript)
    sc.addFile(crfModelFilename)

    # Map to reference sets
    smEyeColor = HybridJaccard(ref_path=eyeColorRef, config_path=eyeColorConfig)
    smHairColor = HybridJaccard(ref_path=hairRef, config_path=hairConfig)

    if location == "hdfs":
        print "We want to do hdfs dfs -rm -r %s" % outputDirectory
    elif location == "local":
        try:
            shutil.rmtree(outputDirectory)
            print "rmtree %s" % outputDirectory
        except:
            pass
    else:
        raise RuntimeError("No such location: %s" % location)

    rdd_sequence_file_input = sc.sequenceFile(inputFilename)
    rdd_sequence_file_input.setName('rdd_sequence_file_input')
    # rdd_sequence_file_input.persist()
    origSize = rdd_sequence_file_input.count()
    # if limit:
    #     rdd = sc.parallelize(rdd_sequence_file_input.take(limit))
    if partitions:
        rdd_sequence_file_input = rdd_sequence_file_input.repartition(partitions)
    print "### input %s: %d ads (orig %s, limit was %s), %d partitions" % (inputFilename, rdd_sequence_file_input.count(), origSize, limit, rdd_sequence_file_input.getNumPartitions())

    rdd_json = rdd_sequence_file_input.mapValues(lambda x: json.loads(x))
    rdd_json.setName('rdd_json')
    # rdd_json.persist()

    # all below should also be done for title
    rdd_body = rdd_json.mapValues(lambda x: extract_body(x))
    rdd_body.setName('rdd_body')
    # rdd_body.persist()
    if dump:
        rdd_body.saveAsTextFile(ff("body"))

    rdd_body_tokens = rdd_body.mapValues(lambda x: textTokens(x))
    rdd_body_tokens.setName('rdd_body_tokens')
    # rdd_body_tokens.persist()
    if dump:
        rdd_body_tokens.saveAsTextFile(ff("body_tokens"))

    rdd_features = rdd_body_tokens.map(lambda x: (x[0], c.computeFeatMatrix(x[1], False, addLabels=[x[0]], addIndex=True)))
    rdd_features.setName('rdd_features')
    # rdd_features.persist()
    if dump:
        rdd_features.saveAsTextFile(ff("features"))

    # rdd_pipeinput = rdd_features.mapValues(lambda x: base64.b64encode(vectorToString(x)))
    rdd_pipeinput = rdd_features.mapValues(lambda x: vectorToString(x))
    rdd_pipeinput.setName('rdd_pipeinput')
    # rdd_pipeinput.persist()
    if dump:
        rdd_pipeinput.values().saveAsTextFile(ff("pi"))
    # This caused a cannot concatenate string + None error
    # rdd_pipeinput.saveAsTextFile(outputDirectory + "-pipeinput")

    # DON'T USE SparkFiles.get to fetch the crf_test or model
    # This only works with local Spark (--master local[*])
    if location == 'hdfs':
        cmd = "%s %s" % (os.path.basename(crfScript), os.path.basename(crfModelFilename))
    elif location == 'local':
        cmd = "%s %s" % (SparkFiles.get(os.path.basename(crfScript)), SparkFiles.get(os.path.basename(crfModelFilename)))
    print "### %s" % cmd

    rdd_pipeinput.saveAsTextFile(ff("before"))
    exit(0)

    rdd_crf_b64 = rdd_pipeinput.values().pipe(cmd)
    rdd_crf_b64.setName('rdd_crf_b64')
    # rdd_crf_b64.persist()
    if dump:
        rdd_crf_b64.saveAsTextFile(ff("po"))

    # Go directly from base64 output to a reconstructed tuple format mapping URI to vector of vectors,
    # with empty string suffix indicating blank line
    # This is key for avoiding the groupBy step
    rdd_restore = rdd_crf_b64.map(lambda x: restore(x))
    rdd_restore.setName('rdd_restore')
    # rdd_restore.persist()
    if dump:
        rdd_restore.saveAsTextFile(ff("restore"))

    # ### WE NO LONGER HAVE TO GROUPBY
    # ### BUT WE MUST TREAT EACH LINE INDIVIDUALLY NOW
    # rdd_withuri = sc.parallelize(rdd_withuri.take(10))

    rdd_harvested = rdd_restore.mapValues(lambda x: computeSpans(x, indexed=True)).filter(lambda p: p[1])
    rdd_harvested.setName('rdd_harvested')
    # rdd_harvested.persist()
    if dump:
        rdd_harvested.saveAsTextFile(ff("harvested"))

    # This has the effect of generating 0, 1, 2, ... lines according to the number of spans
    rdd_controlled = rdd_harvested.flatMapValues(lambda x: list(x))
    rdd_controlled.setName('rdd_controlled')
    # rdd_controlled.persist()

    # map any eyeColor spans using smEyeColor, hairType spans using smHairColor
    rdd_aligned = rdd_controlled.mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": smEyeColor, "hairType": smHairColor}))
    rdd_aligned.setName('rdd_aligned')
    # rdd_aligned.persist()
    if dump:
        rdd_aligned.saveAsTextFile(ff("aligned"))

    rdd_aligned_json = rdd_aligned.mapValues(lambda x: json.dumps(x))
    rdd_aligned_json.setName('rdd_aligned_json')
    # rdd_aligned_json.persist()
    if dump:
        rdd_aligned_json.saveAsTextFile(ff("aligned_json"))

    rdd_final = rdd_aligned_json
    empty = rdd_final.isEmpty()
    if not empty:
        l = "unknown>1"
        print "### writing %s output (%s records) to %s" % (outputFormat, l, outputDirectory)
        # print len(rdd_final.collect())
        if outputFormat == "sequence":
            rdd_final.saveAsSequenceFile(outputDirectory)
        elif outputFormat == "text":
            rdd_final.saveAsTextFile(outputDirectory)
        else:
            raise RuntimeError("Unrecognized output format: %s" % outputFormat)
    else:
        print "### No records: no output into %s" % (outputDirectory)
#X = sys.argv[1] #normal normalFilePath = '/home/worker/workspace/DeepDefense_dataStatistics' + '/csv' + '/topXraw.csv' normalPath = os.path.join(normalFilePath) sc.addFile(normalPath); #attack attackFilePath = '/home/worker/workspace/DeepDefense_dataStatistics' + '/csv' + '/topXraw.csv' attackPath = os.path.join(attackFilePath) sc.addFile(attackPath); from pyspark import SparkFiles normalRdd = sc.textFile(SparkFiles.get(normalFilePath)).cache() attackRdd = sc.textFile(SparkFiles.get(attackFilePath)).cache() # src, dst, data_length, protocol_name, protocol_number, arrival_time (len = 6) normalRaw = normalRdd.map(lambda x: x.split(',')).filter(lambda x: len(x) == 6).cache() attackRaw = attackRdd.map(lambda x: x.split(',')).filter(lambda x: len(x) == 6).cache() #(ip, count) normalTopXSrcIP = normalRaw.map(lambda x:(x[0], 1)).groupByKey().map(lambda (k,v):(k, sum(v))).takeOrdered(10, key = lambda (k,v): -v) attackTopXSrcIP = attackRaw.map(lambda x:(x[0], 1)).groupByKey().map(lambda (k,v):(k, sum(v))).takeOrdered(10, key = lambda (k,v): -v) #(ip, count) normalTopXDstIP = normalRaw.map(lambda x:(x[1], 1)).groupByKey().map(lambda (k,v):(k, sum(v))).takeOrdered(10, key = lambda (k,v): -v) attackTopXDstIP = attackRaw.map(lambda x:(x[1], 1)).groupByKey().map(lambda (k,v):(k, sum(v))).takeOrdered(10, key = lambda (k,v): -v) #(ip, data_length)
def upload_file_job(context):
    from pyspark import SparkFiles

    with open(SparkFiles.get(upload_file_name)) as testFile:
        file_val = testFile.readline()
    return file_val
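# Hedged driver-side wiring for upload_file_job above: upload_file_name is assumed
# to be a module-level constant naming a small text file shipped with sc.addFile.
#   sc.addFile(upload_file_name)
#   first_line = sc.parallelize([0], 1).map(lambda _: upload_file_job(None)).first()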
sc = SparkContext('local', 'testGeoSpark') #X = sys.argv[1] #normal normalFilePath = '/home/worker/workspace/DeepDefense_dataStatistics' + '/csv' + '/topXraw.csv' normalPath = os.path.join(normalFilePath) sc.addFile(normalPath); #attack attackFilePath = '/home/worker/workspace/DeepDefense_dataStatistics' + '/csv' + '/topXraw.csv' attackPath = os.path.join(attackFilePath) sc.addFile(attackPath) from pyspark import SparkFiles normalRdd = sc.textFile(SparkFiles.get(normalFilePath)) attackRdd = sc.textFile(SparkFiles.get(attackFilePath)) import geoip2.database geoDBpath = '/home/worker/workspace/geoDB/GeoLite2-City.mmdb' geoPath = os.path.join(geoDBpath) sc.addFile(geoPath) #reader = geoip2.database.Reader(SparkFiles.get(geoPath)) #reader = geoip2.database.Reader('GeoLite2-City.mmdb') # def ip2city(ip): # try: # city = reader.city(ip).city.name # except: # city = 'not found'
def crfprep(sc, inputFilename, outputDirectory, limit=LIMIT, location='hdfs', outputFormat="text", partitions=None): crfConfigDir = os.path.join(os.path.dirname(__file__), "data/config") featureListFilename = os.path.join(crfConfigDir, "features.hair-eye") crfConfigDir = os.path.join(os.path.dirname(__file__), "data/config") crfExecutable = "/usr/local/bin/crf_test_filter.sh" crfModelFilename = os.path.join(crfConfigDir, "dig-hair-eye-train.model") rdd_sequence_file_input = sc.sequenceFile(inputFilename) rdd_sequence_file_input.setName('rdd_sequence_file_input') # rdd_sequence_file_input.persist() if limit: rdd_sequence_file_input = sc.parallelize(rdd_sequence_file_input.take(limit)) if partitions: rdd_sequence_file_input = rdd_sequence_file_input.repartition(partitions) rdd_json = rdd_sequence_file_input.mapValues(lambda x: json.loads(x)) rdd_json.setName('rdd_json') # rdd_json.persist() rdd_texts = rdd_json.mapValues(lambda x: (textTokens(extract_body(x)), textTokens(extract_title(x)))) rdd_texts.setName('rdd_texts') # data format issue? # rdd_texts.saveAsSequenceFile(outputDirectory + "_texts") # This separator could have appeared in original text, and should serve to cleanly delimit the body from the title # Not perfect, it could have appeared between real tokens # Needs to have single labels+index feature # former code was lost c = crf_features.CrfFeatures(featureListFilename) SEPARATOR = '&nbsp;', def makeMatrix(c, uri, bodyTokens, titleTokens): b = c.computeFeatMatrix(bodyTokens, False, addLabels=False, addIndex=False) s = c.computeFeatMatrix([SEPARATOR, ""], False, addLabels=False, addIndex=False) t = c.computeFeatMatrix(titleTokens, False, addLabels=False, addIndex=False) idx = 1 for row in b: if row == u"": pass else: label = uri + "/%05d/%05d" % (0, idx) row.append(label) idx += 1 idx = 1 for row in s: if row == u"": pass else: label = uri + "/%05d/%05d" % (1, idx) row.append(label) idx += 1 idx = 1 for row in t: if row == u"": pass else: label = uri + "/%05d/%05d" % (2, idx) row.append(label) idx += 1 # might be b[0:-1] + s[0:-1] + t? return b[0:-1] + s[0:-1] + t rdd_features = rdd_texts.map(lambda x: (x[0], makeMatrix(c, x[0], x[1][0], x[1][1]))) rdd_features.setName('rdd_features') # rdd_features.persist() rdd_pipeinput = rdd_features.mapValues(lambda x: vectorToUTF8(x)).values() rdd_pipeinput.setName('rdd_pipeinput') if location == 'hdfs': cmd = "%s %s" % (os.path.basename(crfExecutable), os.path.basename(crfModelFilename)) elif location == 'local': cmd = "%s %s" % (SparkFiles.get(os.path.basename(crfExecutable)), SparkFiles.get(os.path.basename(crfModelFilename))) print "###CMD %s" % cmd rdd_crfoutput = rdd_pipeinput.pipe(cmd) rdd_crfoutput.setName('rdd_crfoutput') # rdd_features.persist() rdd_final = rdd_crfoutput if outputFormat == "sequence": rdd_final.saveAsSequenceFile(outputDirectory) elif outputFormat == "text": rdd_final.saveAsTextFile(outputDirectory) else: raise RuntimeError("Unrecognized output format: %s" % outputFormat)
h2oContext = H2OContext(sc).start() # Define file names chicagoAllWeather = "chicagoAllWeather.csv" chicagoCensus = "chicagoCensus.csv" chicagoCrimes10k = "chicagoCrimes10k.csv" # Add files to Spark Cluster sc.addFile(_locate(chicagoAllWeather)) sc.addFile(_locate(chicagoCensus)) sc.addFile(_locate(chicagoCrimes10k)) # Since we have already loaded files into spark, we have to use h2o.upload_file instead of h2o.import_file since # h2o.import_file expects cluster-relative path (ie. the file on this path can be accessed from all the machines on the cluster) # but SparkFiles.get(..) already give us relative path to the file on a current node which h2o.upload_file can handle ( it uploads file # located on current node and distributes it to the H2O cluster) f_weather = h2o.upload_file(SparkFiles.get(chicagoAllWeather)) f_census = h2o.upload_file(SparkFiles.get(chicagoCensus)) f_crimes = h2o.upload_file(SparkFiles.get(chicagoCrimes10k)) # Transform weather table # Remove 1st column (date) f_weather = f_weather[1:] # Transform census table # Remove all spaces from column names (causing problems in Spark SQL) col_names = map(lambda s: s.strip().replace(' ', '_').replace('+', '_'), f_census.col_names) # Update column names in the table # f_weather.names = col_names f_census.names = col_names
rdd_json = rdd.mapValues(lambda x: json.loads(x))

rdd_body = rdd_json.mapValues(lambda x: extract_body(x))
rdd_body_tokens = rdd_body.mapValues(lambda x: textTokens(x))

# TBD
# rdd_title = rdd_json.mapValues(lambda x: extract_title(x))
# rdd_title_tokens = rdd.title.mapValues(lambda x: textTokens(x))
# all below should also be done for title

# not a pair RDD?
rdd_features = rdd_body_tokens.map(lambda x: (x[0], c.computeFeatMatrix(x[1], False, addLabels=[x[0]], addIndex=True)))

rdd_pipeinput = rdd_features.mapValues(lambda x: vectorToString(x))

cmd = SparkFiles.get("crf_test") + " -m " + SparkFiles.get(crfModelFilename)
rdd_crf = rdd_pipeinput.values().pipe(cmd)

# not a pair RDD
# but we have the URI in the -3 position
# and the index in the -2 position
rdd_withuri = rdd_crf.map(lambda x: reconstructTuple(x))

rdd_grouped = rdd_withuri.groupByKey()
rdd_flat = rdd_grouped.mapValues(lambda x: [l[1:] for l in sorted(x, key=lambda r: int(r[0]))])
rdd_harvested = rdd_flat.mapValues(lambda x: computeSpans(x, indexed=True))

# This has the effect of generating 0, 1, 2, ... lines according to the number of spans
rdd_controlled = rdd_harvested.flatMapValues(lambda x: list(x))

# map any eyeColor spans using smEye, hairType spans using smHair
rdd_aligned = rdd_controlled.mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": smEye, "hairType": smHair}))
def sparkFilePathMapper(self, path):
    """When Spark forwards files from the driver to worker nodes, it may be
    necessary to map the filename path on a per-worker node basis."""
    # Note the implication in this code that the feature list file and
    # model file must have unique basenames.
    return SparkFiles.get(os.path.basename(path))
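# Hedged usage note for the path mapper above (paths are illustrative): files are
# shipped from the driver with sc.addFile(...) and then resolved on the workers by
# basename only, which is why the feature-list and model files need unique basenames.
#   sc.addFile("/opt/crf/features.hair-eye")
#   local_path = self.sparkFilePathMapper("/opt/crf/features.hair-eye")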
def mapper(line, title, secfile, idsec): post = mdb.posts tokens = word_tokenize(line) tagged = pos_tag(tokens) ntities = chunk.ne_chunk(tagged) newline = line.encode('utf-8') posting = {"securitynow_id": idsec, "episode": secfile[3:6], "speaker": title, "original": line, "tokens": tokens, "entities": ntities, "sentiment": classifier.classify(dict([(word, True) for word in newline]))} post_id = post.insert(posting) sc.addFile("/home/th3m4d0n3/NetBeansProjects/twAppDemo/data_dir/allSentimentData") with open(SparkFiles.get("allSentimentData")) as f: reader = csv.reader(f, delimiter=" ", quotechar='"') jobs = bg.BackgroundJobManager() map(parseForNltk, reader) print("chezdata type DATA: {0} COUNT: {1}".format(type(chezdata), len(chezdata))) map(getHighest, chezdata) chezdataP = sc.parallelize(chezdata) lowRatedP = sc.parallelize(lowRated) highlyRatedP = sc.parallelize(highlyRated) print("chezdataP type DATA: {0} COUNT: {1}".format(type(chezdataP), chezdataP.count())) print("lowRatedP type DATA: {0} COUNT: {1}".format(type(lowRatedP), lowRatedP.count()))
def __ls(broadcast_vars, iterator):
    """
    Get the list of files in the worker-local directory
    """
    return [__get_hostname(), os.listdir(SparkFiles.getRootDirectory())]
# Can I persist a Caffe network object?
import copy

from pyspark import SparkContext, SparkConf
from pyspark import SparkFiles
from pyspark import StorageLevel

conf = SparkConf().setAppName("SparkCaffe Test")
conf.set("spark.executor.memory", "1g")
sc = SparkContext(conf=conf)

sc.addFile("models/solver.prototxt")
sc.addFile("models/train_val.prototxt")

solver = SparkFiles.get("solver.prototxt")
architecture = SparkFiles.get("train_val.prototxt")


def create_net(solver_filename):
    from caffe import SGDSolver
    net = SGDSolver(str(solver_filename)).net
    return net


netRDD = sc.parallelize([solver] * 2, 2).map(create_net)
netRDD.persist(StorageLevel.MEMORY_ONLY)


def extract_unique_val(net):
    return net.params["conv1"][0].data[0, 0, 0, 0]
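# Hedged continuation of the experiment above (illustrative only): map the probe
# over the persisted RDD to confirm each partition holds a live net object.
#   unique_vals = netRDD.map(extract_unique_val).collect()
#   print unique_vals  # Python 2 print, matching the snippet above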
def main(): conf = (SparkConf() .setMaster("local[*]") .setAppName("compare_engine")) sc = SparkContext(conf = conf) sc.setLogLevel('INFO') sc.addFile(primary) # rdd_primary = sc.textFile(primary, minPartitions=4, use_unicode=True).distinct() rdd_primary = sc.textFile(SparkFiles.get(primary), minPartitions=4, use_unicode=True).distinct() rdd_primary.partitionBy(10).cache() os.system('rm -Rf collects_*') os.system('rm -Rf holder.txt') rdd_secondary = sc.textFile(secondary, minPartitions=4, use_unicode=True).distinct() rdd_secondary.partitionBy(10).cache() primary_count = rdd_primary.count() primary_report['count'] = primary_count print(primary_report) secondary_count = rdd_secondary.count() secondary_report['count'] = secondary_count print(secondary_report) # Return each Primary file line/record not contained in Secondary not_in_primary = rdd_primary.subtract(rdd_secondary) primary_diff = not_in_primary.count() primary_report['diff'] = primary_diff os.system('rm -Rf collects_*.csv') primary_dir = 'collects_{}_primary'.format(run_date) primary_report_name = 'collects_{}_primary_report.csv'.format(run_date) not_in_primary.coalesce(1, True).saveAsTextFile(primary_dir) # os.system('cat collects_{}_primary/part-0000* >> collects_{}_primary_report.csv'.format(run_date, run_date)) os.system('cat {}/part-0000* >> {}'.format(primary_dir, primary_report_name)) os.system('wc -l collects_{}_primary_report.csv'.format(run_date)) # Flip Primary Vs Secondary # Return each Secondary file line/record not contained in Primary not_in_secondary = rdd_secondary.subtract(rdd_primary) secondary_diff = not_in_secondary.count() secondary_report['diff'] = secondary_diff not_in_secondary.coalesce(1,True).saveAsTextFile('collects_{}_secondary'.format(run_date)) os.system('cat collects_{}_secondary/part-0000* >> collects_{}_secondary_report.csv'.format(run_date, run_date)) os.system('wc -l collects_{}_secondary_report.csv'.format(run_date)) process_report['primary'] = primary_report process_report['secondary'] = secondary_report print("=" * 100) print('\n') print(process_report) print('\n') print("=" * 100) spark_details(sc) sc.stop()
def _getCountryByIP(ip):
    citydb = geoIP.Reader(SparkFiles.get('GeoLite2-City.mmdb'))
    return (citydb.city(ip).country.name or u'Unknown').encode()
def update_data(self):
    self.arrays['data'][:] = np.random.randn(*self.arrays['data'].shape)
    self.arrays['label'][:] = np.random.choice(
        xrange(10), size=self.arrays['label'].shape)

def process_model(self):
    pass

# Create some dummy data
dataRDD = sc.parallelize(xrange(100))

# Create some barista instances
num_baristas = 2
start_script = 'python -m barista.start'
solver = SparkFiles.get("solver.prototxt")
interfaces = sc.parallelize([solver]*num_baristas, num_baristas) \
               .pipe(start_script) \
               .collect()

# Join the data
def train(interface, data):
    solver_filename, pid = interface.split(',')
    customer = MyCustomer(solver_filename)
    customer.run_transaction()
    grad_norm = np.linalg.norm(customer.arrays['conv1_dW'])
    return grad_norm

grad_norms = dataRDD.map(lambda x: train(interfaces[0], x)).collect()
print grad_norms
def start_spark(app_name='my_spark_app', master='local[*]', jar_packages=[], files=[], spark_config={}): """Start Spark session, get the Spark logger and load config files. Start a Spark session on the worker node and register the Spark application with the cluster. NOTE - only the app_name argument will apply when this is called from a script sent to spark-submit (i.e. when __name__ = '__main__'). All other arguments exist solely for testing the script from within an interactive Python console. This function also looks for a file ending in 'config.json' that can be sent with the Spark job. If it is found, it is opened, the contents parsed (assuming it contains valid JSON for the ETL job configuration), into a dict of ETL job configuration parameters, which are returned as the last element in the tuple returned by this function. If the file cannot be found then the return tuple only contains the Spark session and Spark logger objects. :param app_name: Name of Spark app. :param master: Cluster connection details (defaults to local[*]. :param jar_packages: List of Spark JAR package names. :param files: List of files to send to Spark cluster (master and workers). :param spark_config: Dictionary of config key-value pairs. :return: A tuple of references to the Spark session, logger and config dict (only if available). """ if __name__ == '__main__': # get Spark session factory spark_builder = ( SparkSession .builder .appName(app_name)) else: # get Spark session factory spark_builder = ( SparkSession .builder .master(master) .appName(app_name)) # create Spark JAR packages string spark_jars_packages = ','.join(list(jar_packages)) spark_builder.config('spark.jars.packages', spark_jars_packages) spark_files = ','.join(list(files)) spark_builder.config('spark.files', spark_files) # add other config params for key, val in spark_config.items(): spark_builder.config(key, val) # create session and retrieve Spark logger object spark_sess = spark_builder.getOrCreate() spark_logger = logging.Log4j(spark_sess) # get config file if sent to cluster with --files spark_files_dir = SparkFiles.getRootDirectory() config_files = [filename for filename in listdir(spark_files_dir) if filename.endswith('config.json')] if len(config_files) != 0: path_to_config_file = path.join(spark_files_dir, config_files[0]) with open(path_to_config_file, 'r') as config_file: config_json = config_file.read().replace('\n', '') config_dict = loads(config_json) spark_logger.warn('loaded config from ' + config_files[0]) else: config_dict = None # build return tuple conditional on presence of config if config_dict is not None: return_tup = spark_sess, spark_logger, config_dict else: return_tup = spark_sess, spark_logger return return_tup
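# Hedged usage note for start_spark() above, following its docstring: the config is
# shipped with --files and read back from SparkFiles.getRootDirectory(). File names
# and the invocation details are illustrative assumptions, not taken from the original.
#
#   spark-submit --master yarn --files configs/etl_config.json etl_job.py
#
# etl_job.py (sketch):
#   spark, log, config = start_spark(app_name='my_etl_job',
#                                    files=['configs/etl_config.json'])
#   log.warn('loaded config: ' + str(config))
#   spark.stop()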
def compute_buried_area_all_residues_and_receptor_area(pdb_complex):
    chZ = "chZ"
    res_buried_area_perc = -1
    res_buried_area = -1
    buried_receptor_system = -1
    buried_receptor_res = -1
    base_name = get_name_model_pdb(pdb_complex)
    ligand_name = get_ligand_from_receptor_ligand_model(base_name)
    receptor_name = get_receptor_from_receptor_ligand_model(base_name)
    pose = get_model_from_receptor_ligand_model(base_name)

    # output receptor area file
    f_output_receptor_buried_area = os.path.join(path_analysis_pdb_complex_b.value, base_name + ".outAreaRecep")

    # ndx files
    # f_ndx = os.path.join(path_analysis_pdb_complex_b.value, base_name + ".ndx")
    f_ndx_temporary_index_z = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_temporary_index_z" + ".ndx")
    f_ndx_temporary = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_temporary" + ".ndx")
    f_ndx_temporary_sasa = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_temporary_sasa" + ".ndx")

    # xvg files
    f_xvg_temporary_sasa_res_lig = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_temporary_sasa_res-lig" + ".xvg")
    f_xvg_temporary_sasa_res = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_temporary_sasa_res" + ".xvg")
    f_xvg_temporary_sasa_rec_lig = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_temporary_sasa_rec_lig" + ".xvg")
    f_xvg_temporary_sasa_rec = os.path.join(path_analysis_pdb_complex_b.value, base_name + "_temporary_sasa_rec" + ".xvg")

    # Create a selection with the residues that are closer than 6A to the ligand.
    # The bash script was distributed to the workers with the addFile command.
    script_make_ndx_buried_area_receptor = SparkFiles.get("make_ndx_buried_area_receptor.sh")
    command = script_make_ndx_buried_area_receptor + " " + gromacs_path.value + " " + pdb_complex + " " + f_ndx_temporary_index_z + " " + f_ndx_temporary
    process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate()

    # copying file
    if os.path.exists(f_ndx_temporary):
        shutil.copy(f_ndx_temporary, f_ndx_temporary_sasa)

        # Get all residues for computing the receptor area
        all_res = get_residues_receptor_from_ndx_files(f_ndx_temporary)
        returned_list = []
        for res in all_res:
            # Bash script distributed with the addFile command
            script_make_ndx_buried_area_receptor_res = SparkFiles.get("make_ndx_buried_area_receptor_res.sh")
            command = script_make_ndx_buried_area_receptor_res + " " + gromacs_path.value + " " + pdb_complex + " " + f_ndx_temporary_sasa + " " + str(res)
            process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
            stdout, stderr = process.communicate()

            # compute surface of system - saved to xvg
            command = gromacs_path.value + "gmx sasa -surface complex -output rec_" + str(res) + " -o " + f_xvg_temporary_sasa_res_lig + " -xvg none -f " + pdb_complex + " -s " + pdb_complex + " -n " + f_ndx_temporary + " -nopbc "
            process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
            stdout, stderr = process.communicate()

            # compute surface of receptor - saved to xvg
            command = gromacs_path.value + "gmx sasa -surface rec -output rec_" + str(res) + " -o " + f_xvg_temporary_sasa_res + " -xvg none -f " + pdb_complex + " -s " + pdb_complex + " -n " + f_ndx_temporary + " -nopbc "
            process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
            stdout, stderr = process.communicate()

            # calculate area
            if os.path.exists(f_xvg_temporary_sasa_res_lig):
                buried_receptor_system = get_value_from_xvg_sasa(f_xvg_temporary_sasa_res_lig)
            else:
                buried_receptor_system = 0

            if os.path.exists(f_xvg_temporary_sasa_res):
                buried_receptor_res = get_value_from_xvg_sasa(f_xvg_temporary_sasa_res)
            else:
                buried_receptor_res = 0

            res_buried_area = buried_receptor_res - buried_receptor_system
            if (res_buried_area > 0) and (buried_receptor_res > 0):
                res_buried_area_perc = res_buried_area / buried_receptor_res
                # Generating result
                result = (base_name, res, res_buried_area, res_buried_area_perc)
                returned_list.append(result)

            # Deleting per-residue files
            if os.path.exists(f_xvg_temporary_sasa_res_lig):
                os.remove(f_xvg_temporary_sasa_res_lig)
            if os.path.exists(f_xvg_temporary_sasa_res):
                os.remove(f_xvg_temporary_sasa_res)

        # Computing receptor area
        command = gromacs_path.value + "gmx sasa -surface complex -output rec" + " -o " + f_xvg_temporary_sasa_rec_lig + " -xvg none -f " + pdb_complex + " -s " + pdb_complex + " -n " + f_ndx_temporary + " -nopbc "
        process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
        stdout, stderr = process.communicate()

        command = gromacs_path.value + "gmx sasa -surface rec -output rec" + " -o " + f_xvg_temporary_sasa_rec + " -xvg none -f " + pdb_complex + " -s " + pdb_complex + " -n " + f_ndx_temporary + " -nopbc "
        process = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
        stdout, stderr = process.communicate()

        if os.path.exists(f_xvg_temporary_sasa_rec_lig):
            sasa_rec_lig = get_value_from_xvg_sasa(f_xvg_temporary_sasa_rec_lig)
        else:
            sasa_rec_lig = 0

        if os.path.exists(f_xvg_temporary_sasa_rec):
            sasa_rec = get_value_from_xvg_sasa(f_xvg_temporary_sasa_rec)
        else:
            sasa_rec = 0

        receptor_area = sasa_rec - sasa_rec_lig

        # Saving result file
        output_receptor_buried_area = open(f_output_receptor_buried_area, "w")
        output_receptor_buried_area.write(str(base_name) + " " + str(receptor_area) + "\n")
        output_receptor_buried_area.close()

        # Deleting all temporary files
        if os.path.exists(f_xvg_temporary_sasa_rec_lig):
            os.remove(f_xvg_temporary_sasa_rec_lig)
        if os.path.exists(f_xvg_temporary_sasa_rec):
            os.remove(f_xvg_temporary_sasa_rec)
        if os.path.exists(f_ndx_temporary):
            os.remove(f_ndx_temporary)
        if os.path.exists(f_ndx_temporary_sasa):
            os.remove(f_ndx_temporary_sasa)
        if os.path.exists(f_ndx_temporary_index_z):
            os.remove(f_ndx_temporary_index_z)

        return returned_list
    else:
        # The index file was not created, so the area could not be computed
        return (base_name, "NAN", float(0), float(0))
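A hedged driver-side sketch of how compute_buried_area_all_residues_and_receptor_area might be wired up: the function reads two helper scripts via SparkFiles.get and the broadcast variables gromacs_path and path_analysis_pdb_complex_b, so those must be prepared on the driver. The GROMACS path, output directory, and the list of complex PDB files are placeholders, not taken from the original project.

# Hypothetical driver-side wiring (paths and file list are assumptions).
from pyspark import SparkContext

sc = SparkContext(appName='buried_area_receptor')
sc.addFile('make_ndx_buried_area_receptor.sh')       # helper scripts resolved with SparkFiles.get
sc.addFile('make_ndx_buried_area_receptor_res.sh')

# Broadcast variables referenced inside the function's closure
gromacs_path = sc.broadcast('/usr/local/gromacs/bin/')         # assumed GROMACS install prefix
path_analysis_pdb_complex_b = sc.broadcast('/tmp/analysis/')   # assumed output directory

pdb_complex_files = ['/tmp/analysis/complex_0001.pdb']         # placeholder list of complex PDBs
results = (sc.parallelize(pdb_complex_files)
             .map(compute_buried_area_all_residues_and_receptor_area)
             .collect())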