Example #1
def getSC(appName='fea'):
    sconf = SparkConf().set("spark.hadoop.validateOutputSpecs", "false") \
        .set("spark.akka.frameSize", "1000") \
        .set("spark.kryoserializer.buffer.max", "1000")
    sc = SparkContext(appName=appName, conf=sconf)
    sc.addPyFile("fea.py")
    return sc
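A minimal usage sketch for the helper above, assuming the pyspark imports that the snippet omits and a local fea.py next to the driver script:

from pyspark import SparkConf, SparkContext

sc = getSC(appName='fea')           # builds the configured context and ships fea.py to executors
rdd = sc.parallelize(range(10))
print(rdd.sum())                    # 45; tasks can now `import fea`
sc.stop()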
def init_spark_context(details=[]):
    global spark_context
    if spark_context:
        return
    build_type = yb_dist_tests.global_conf.build_type
    from pyspark import SparkContext
    # We sometimes fail tasks due to unsynchronized clocks, so we should tolerate a fair number of
    # retries.
    # https://stackoverflow.com/questions/26260006/are-failed-tasks-resubmitted-in-apache-spark
    # NOTE: we never retry failed tests to avoid hiding bugs. This failure tolerance mechanism
    #       is just for the resilience of the test framework itself.
    SparkContext.setSystemProperty('spark.task.maxFailures', str(SPARK_TASK_MAX_FAILURES))
    if yb_dist_tests.global_conf.build_type == 'tsan':
        logging.info("Using a separate default Spark cluster for TSAN tests")
        default_spark_master_url = DEFAULT_SPARK_MASTER_URL_TSAN
    else:
        logging.info("Using the regular default Spark cluster for non-TSAN tests")
        default_spark_master_url = DEFAULT_SPARK_MASTER_URL

    spark_master_url = os.environ.get('YB_SPARK_MASTER_URL', default_spark_master_url)
    details += [
        'user: {}'.format(getpass.getuser()),
        'build type: {}'.format(build_type)
        ]

    if 'BUILD_URL' in os.environ:
        details.append('URL: {}'.format(os.environ['BUILD_URL']))

    spark_context = SparkContext(spark_master_url, "YB tests ({})".format(', '.join(details)))
    spark_context.addPyFile(yb_dist_tests.__file__)
def main():
	
	sc = SparkContext()
	sqlCtx = SQLContext(sc)
	config = configparser.ConfigParser()
	config.read('config.ini')

	#Path where docking list file will be saved
	path_to_save = str(sys.argv[1])

	#Path for drugdesign project
	path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')

	sc.addPyFile(os.path.join(path_spark_drugdesign,"database_crud.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"database_io.py"))


#**************** Loading Ligand Database
	ligand_database = config.get('DEFAULT', 'ligand_database_path_file')
	rdd_database = load_database(sc, ligand_database)	
	#Creating Dataframe
	database_table = sqlCtx.createDataFrame(rdd_database)	
	database_table.registerTempTable("database")
#**************** Finish 

	#Creating input files for performing virtual screening
	creating_docking_list(path_to_save, config, sqlCtx)
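The example reads two keys from config.ini; a hypothetical way to generate such a file with configparser (the paths are placeholders, not taken from the original project):

import configparser

config = configparser.ConfigParser()
config['DEFAULT'] = {'ligand_database_path_file': '/data/ligand_database.txt'}   # hypothetical path
config['DRUGDESIGN'] = {'path_spark_drugdesign': '/opt/spark-drugdesign'}        # hypothetical path
with open('config.ini', 'w') as ini_file:
    config.write(ini_file)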
Example #4
def getSC(appName='aux'):
    sconf = SparkConf().set("spark.hadoop.validateOutputSpecs", "false") \
        .set("spark.akka.frameSize", "2000") \
        .set("spark.kryoserializer.buffer.max", "2000")
    sc = SparkContext(appName=appName, conf=sconf)
    sc.addPyFile("src/data_loader.py")
    sc.addPyFile("src/common.py")
    return sc
def main():

	sc = SparkContext()
	sqlCtx = SQLContext(sc)

	config = configparser.ConfigParser()
	config.read('config.ini')

	#Path that contains all files for analysis
	path_analysis = config.get('DEFAULT', 'path_analysis')	
	#Ligand Database file
	ligand_database  = config.get('DEFAULT', 'ligand_database_path_file')
	#Path for drugdesign project
	path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')

	#Adding Python Source file
	sc.addPyFile(os.path.join(path_spark_drugdesign,"vina_utils.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"json_utils.py"))

	start_time = datetime.now()

#**************** Loading file that contains all scores
	score_file_name = os.path.join(path_analysis,get_file_name_sorted_energy())
	text_file = sc.textFile(score_file_name)

	#Splitting score file by \t
	header = text_file.first() #extract header
	rdd_vs_score_sorted_split = text_file.filter(lambda x:x !=header)    #filter out header
	rdd_vs_score_sorted_split = rdd_vs_score_sorted_split.map(lambda line: line.split("\t"))
	rdd_vs_score_sorted = rdd_vs_score_sorted_split.map(lambda p: Row(energy=float(p[0]), pose=str(p[1]), ligand=get_ligand_from_receptor_ligand_model(p[1]) )) 
	#Creating Vina Dataframe based on score file
	vina_table = sqlCtx.createDataFrame(rdd_vs_score_sorted)	
	vina_table.registerTempTable("vina")	
#**************** Finish 

#**************** Loading Ligand Database
	rdd_database = load_database(sc, ligand_database)	
	#Creating Dataframe
	database_table = sqlCtx.createDataFrame(rdd_database)	
	database_table.registerTempTable("database")
#**************** Finish 
	
	#Computing ligand efficiency
	ligand_efficiencyRDD = sqlCtx.sql("SELECT vina.pose, vina.energy as affinity, (vina.energy / database.heavyAtom) as lig_efficiency FROM database JOIN  vina ON vina.ligand = database.ligand ORDER BY vina.energy") 
	ligand_efficiencyRDD = ligand_efficiencyRDD.map(lambda p: (p.pose, p.affinity, p.lig_efficiency) ).collect()

	#Saving ligand efficiency file
	save_ligand_efficiency(path_analysis, ligand_efficiencyRDD)

	finish_time = datetime.now()

	save_ligand_efficiency_log(finish_time, start_time)
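The header-skip and tab-split pattern used above, shown on a tiny in-memory RDD so it runs without the score file (a sketch; the real input comes from get_file_name_sorted_energy()):

from pyspark import SparkContext
from pyspark.sql import Row, SQLContext

sc = SparkContext(appName="header-split-sketch")
sqlCtx = SQLContext(sc)

lines = sc.parallelize(["energy\tpose",
                        "-9.1\treceptor_lig01_model1",
                        "-8.4\treceptor_lig02_model3"])
header = lines.first()                                   # "energy\tpose"
rows = (lines.filter(lambda x: x != header)              # drop the header line
             .map(lambda line: line.split("\t"))
             .map(lambda p: Row(energy=float(p[0]), pose=p[1])))
sqlCtx.createDataFrame(rows).show()
sc.stop()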
Example #6
def functionToCreateContext():
    # new context
    conf = SparkConf()
    conf = conf.setAppName(APP_NAME)
    sc   = SparkContext(conf=conf)
    
    # http://stackoverflow.com/questions/24686474/shipping-python-modules-in-pyspark-to-other-nodes
    sc.addPyFile("common.py")
    
    # As argument Spark Context and batch retention
    ssc = StreamingContext(sc, 10)
    
    # set checkpoint directory
    ssc.checkpoint(CHECKPOINT_DIR)
    
    # return streaming spark context
    return ssc
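functionToCreateContext has the shape expected by StreamingContext.getOrCreate, which rebuilds the context from the checkpoint on restart; a usage sketch, with CHECKPOINT_DIR assumed to be defined as in the example:

from pyspark.streaming import StreamingContext

ssc = StreamingContext.getOrCreate(CHECKPOINT_DIR, functionToCreateContext)
ssc.start()
ssc.awaitTermination()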
Example #7
    def sc(self):  # noqa
        if not self._spark_context:
            spark_context = SparkContext(conf=self.spark_config)

            assert self.spex_conf.spex_file is not None, "The spex builder must be broken; I do not know my spex conf!"
            spark_context.addFile(self.spex_conf.spex_file)

            for py_file in self.spex_conf.spark_config.py_files:
                spark_context.addPyFile(py_file)

            for file in self.spex_conf.spark_config.files:  # noqa
                spark_context.addFile(file)

            for jar in self.spex_conf.spark_config.jars:  # noqa
                spark_context.addFile(jar)

            self._spark_context = spark_context
            print_banner(self)
        return self._spark_context
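Files shipped with addFile above (the spex file, plain files and jars) are resolved on executors through SparkFiles; a small sketch, where 'lookup.csv' stands in for one of the distributed file names:

from pyspark import SparkFiles

def count_lines_in_shipped_file(_):
    # SparkFiles.get returns the local path of a file distributed with SparkContext.addFile()
    with open(SparkFiles.get("lookup.csv")) as handle:   # hypothetical file name
        return sum(1 for _ in handle)

# e.g. (inside the owning class): self.sc.parallelize(range(4)).map(count_lines_in_shipped_file).collect()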
def main():
    # master = 'local[2]'
    master = 'spark://192.168.9.164:7077'
    app_name = 'test-broadcast'
    # spark_home = '/data01/app/bigdata/spark'  # local
    spark_home = '/home/hadoop/app/spark'  # test

    pyFiles = ['mysql_utils.py']
    spark_conf = SparkConf()
    spark_conf.setMaster(master).setAppName(app_name).setSparkHome(spark_home)
    sc = SparkContext(conf=spark_conf)
    for path in (pyFiles or []):
        sc.addPyFile(path)

    external_cache = get_api_deviceinfo()

    deviceinfo_b = sc.broadcast(external_cache)


    sc.stop()
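The broadcast created above only pays off once tasks read deviceinfo_b.value; a sketch of typical consumption (run before sc.stop(); the device ids and the dict-shaped cache are assumptions):

sample_ids = sc.parallelize(["dev-001", "dev-002", "dev-003"])     # hypothetical device ids
enriched = sample_ids.map(lambda dev_id: (dev_id, deviceinfo_b.value.get(dev_id)))
print(enriched.collect())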
def main():
	
	sc = SparkContext()

	config = configparser.ConfigParser()
	config.read('config.ini')

	#Broadcast
	path_analysis = config.get('DEFAULT', 'path_analysis')
	path_save_log = config.get('DEFAULT', 'path_save_log')
	path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')

	#Adding Python Source file
	sc.addPyFile(os.path.join(path_spark_drugdesign,"vina_utils.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"json_utils.py"))


	#Checking path_analysis
	if not os.path.exists(path_analysis):
		os.makedirs(path_analysis)
	else:
		if len(os.listdir(path_analysis)) > 0:
			raise EnvironmentError("Analysis directory contains files ")

	#preparing log list
	list_obj_log = []
	log_files = get_files_log(path_save_log)
	for flog in log_files:
		list_obj_log.append(flog)

	#applying map and collect
	logRDD = sc.parallelize(list_obj_log)	
	all_lines_dic = logRDD.map(build_log_lines).collect()

	#creating a dictionary from the returned rdd
	dict_from_rdd = create_dictionary_from_rdd(all_lines_dic)
	#sorting dictionary
	sorted_dict_list = sorted(dict_from_rdd.items(), key=operator.itemgetter(1))

	#saving energy file
	create_file_by_sorted_energy(path_analysis, sorted_dict_list)
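The final sort orders (key, energy) items by value; the same operator.itemgetter(1) idiom on a self-contained dictionary:

import operator

energies = {"lig_2_model1": -7.4, "lig_1_model3": -9.1, "lig_5_model2": -8.2}
print(sorted(energies.items(), key=operator.itemgetter(1)))
# [('lig_1_model3', -9.1), ('lig_5_model2', -8.2), ('lig_2_model1', -7.4)]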
Example #10
def getSC(master, name):
    conf = (SparkConf()
             .setMaster(master)
             .setAppName(name)
             #.set("spark.executor.memory", "1g")
             .set("spark.akka.frameSize", "512")
             )
    sc = SparkContext(conf = conf)
    
    sc.addPyFile('default.py')
    sc.addPyFile('segment.py')
    sc.addPyFile('radix.py')
    sc.addPyFile('partition.py')
    sc.addPyFile('bwt.py')

    return sc
Example #11
def main():
    try:
        partitions_num = int(sys.argv[1])
        csv_filename = sys.argv[2]
        base_dir = sys.argv[3]
        attrs_to_save = sys.argv[4:]
    except (IndexError, ValueError):
        logger.error("Usage: ./mmsongsdb_to_csv.py <partitions_num> <csv_filename> <directory> [<attrs_to_save>]")
        sys.exit(1)
        return
    sc = SparkContext(appName="mmSongtoCSV")
    sc.addPyFile("/root/mm-songs-db-tools-master2/hdf5_getters.py")
    sc.addPyFile("/root/mm-songs-db-tools-master2/mmsongsdbtocsvconverter.py")
    converter = MMSongsDbToCsvConverter(csv_filename, attrs_to_save)

    file_list = filter(lambda s: s.endswith(".h5"), ["%s%s%s" %(root, os.sep, file)
                                                    for root, dirs, files in
                                                    os.walk(base_dir)
                                                    for file in files])
    
    file_partitions = sc.parallelize(file_list, partitions_num)
    rdd = file_partitions.map(converter._handle_h5_file)
    #print rdd.count()
    rdd.saveAsTextFile(csv_filename)
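saveAsTextFile writes a directory of part-* files rather than a single CSV; a sketch for merging them on the driver afterwards (only sensible for small outputs; merge_parts is a hypothetical helper):

import glob

def merge_parts(output_dir, merged_csv):
    # Concatenate Spark's part-* files into one local CSV file.
    with open(merged_csv, "w") as out:
        for part in sorted(glob.glob(output_dir + "/part-*")):
            with open(part) as part_file:
                out.write(part_file.read())

# merge_parts(csv_filename, csv_filename + ".merged.csv")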
def main():
	config = configparser.ConfigParser()
	config.read('config.ini')

	#Number of poses to select by buried area
	number_poses_to_select_mult_obj = int(config.get('DRUGDESIGN', 'number_poses_to_select_mult_obj') )
	#Path that contains all files for analysis
	path_analysis = config.get('DEFAULT', 'path_analysis')	
	#File for saving the filtered buried area
	result_file_to_select_buried_area = config.get('DRUGDESIGN', 'result_file_to_select_buried_area')	
	#File for saving the filtered buried area only poses
	result_file_to_select_buried_area_only_pose = config.get('DRUGDESIGN', 'result_file_to_select_buried_area_only_pose')
	result_file_to_select_normalized_buried_area_only_pose = config.get('DRUGDESIGN', 'result_file_to_select_normalized_buried_area_only_pose')	
	#Ligand Database file
	ligand_database  = config.get('DEFAULT', 'ligand_database_path_file')	
	#Path where all pdb receptor are
	path_receptor = config.get('DEFAULT', 'pdb_path')	
	#Path for saving pdb files of models generated by VS
	path_ligand = get_directory_pdb_analysis(path_analysis)	
	#Path where the selected complexes are saved
	path_to_save = os.path.join(path_analysis, "mult_objective")
	if not os.path.exists(path_to_save):
		os.makedirs(path_to_save)

	# Create SPARK config
	maxResultSize = str(config.get('SPARK', 'maxResultSize'))
	conf = (SparkConf().set("spark.driver.maxResultSize", maxResultSize))

	# Create context
	sc = SparkContext(conf=conf)
	sqlCtx = SQLContext(sc)

	#Adding Python Source file
	#Path for drugdesign project
	path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')	
	sc.addPyFile(os.path.join(path_spark_drugdesign,"vina_utils.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"pdb_io.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"database_io.py"))

	start_time = datetime.now()


	finish_time = datetime.now()

	save_log(finish_time, start_time)
def main():

	sc = SparkContext()
	sqlCtx = SQLContext(sc)

	config = configparser.ConfigParser()
	config.read('config.ini')

	#Path that contains all files for analysis
	path_analysis = config.get('DEFAULT', 'path_analysis')	
	#Ligand Database file
	ligand_database  = config.get('DEFAULT', 'ligand_database_path_file')
	#Path for drugdesign project
	path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')

	#Adding Python Source file
	sc.addPyFile(os.path.join(path_spark_drugdesign,"vina_utils.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"database_io.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"hydrogen_bond_io.py"))	
	sc.addPyFile(os.path.join(path_spark_drugdesign,"hydrogen_bond_crud.py"))

	#Suffix of the full data file name
	full_data_file_name = config.get('DRUGDESIGN', 'full_data_file_name')

	start_time = datetime.now()

#**************** Loading file that contains all scores and ligand efficiency
	score_file_name = os.path.join(path_analysis, "summary_energies.dat")
	text_file = sc.textFile(score_file_name)
	header = text_file.first() #extract header		

	#Splitting score file by \t
	rdd_vs_score_sorted_split = text_file.filter(lambda x:x !=header).map(lambda line: line.split("\t"))
	#rdd_vs_score_sorted = rdd_vs_score_sorted_split.map(lambda p: Row(receptor=str(p[0]), ligand=str(p[1]), mode=int(p[2]), energy=float(p[3]) ))
	rdd_vs_score_sorted = rdd_vs_score_sorted_split.map(lambda p: Row(affinity=float(p[0]), ligand_efficiency=float(p[1]), pose=str(p[2]) ))	
	#Creating Vina Dataframe based on score file
	vina_table = sqlCtx.createDataFrame(rdd_vs_score_sorted)	
	vina_table.registerTempTable("vina_lig_efficiency")
#**************** Finish 

#**************** Loading Ligand Database

	rdd_database = load_database(sc, ligand_database)
	#Creating Dataframe
	database_table = sqlCtx.createDataFrame(rdd_database)	
	database_table.registerTempTable("database")
#**************** Finish 

#**************** Loading Buried Area total
	buried_area_file_name = os.path.join(path_analysis,"summary_buried_areas_total.dat")
	buried_area_file = sc.textFile(buried_area_file_name)

	#Splitting file by \t
	header = buried_area_file.first() #extract header		
	rdd_buried_area_split = buried_area_file.filter(lambda x:x !=header).map(lambda line: line.split("\t"))
	#rdd_buried_area = rdd_buried_area_split.map(lambda p: Row( receptor=str(p[0]), ligand=str(p[1]), mode=int(p[2]), buried_lig_rec=float(p[3]), buried_lig_rec_perc=float(p[4]), buried_lig_lig_perc=float(p[5]) ))
	rdd_buried_area = rdd_buried_area_split.map(lambda p: Row( buried_area_total=float(p[0]), pose=str(p[1]) ))

	#Creating buried Dataframe
	buried_table = sqlCtx.createDataFrame(rdd_buried_area)	
	buried_table.registerTempTable("buriedArea_total")
#**************** Finish	

#**************** Loading Buried Area receptor
	buried_area_file_name = os.path.join(path_analysis,"summary_buried_areas_receptor.dat")
	buried_area_file_receptor = sc.textFile(buried_area_file_name)
	header = buried_area_file_receptor.first() #extract header	

	#Splitting file by \t
	buried_area_file_receptor_split = buried_area_file_receptor.filter(lambda x:x !=header).map(lambda line: line.split("\t"))
	buried_area_file_receptor = buried_area_file_receptor_split.map(lambda p: Row( buried_area_receptor=float(p[0]), pose=str(p[1]) ))

	#Creating buried Dataframe
	buried_area_file_receptor_table = sqlCtx.createDataFrame(buried_area_file_receptor)	
	buried_area_file_receptor_table.registerTempTable("buried_area_receptor")
#**************** Finish	

#**************** Loading Buried Area ligand
	buried_area_file_name = os.path.join(path_analysis,"summary_buried_area_ligand.dat")
	buried_area_file_ligand = sc.textFile(buried_area_file_name)
	header = buried_area_file_ligand.first() #extract header	

	#Splitting file by \t
	buried_area_file_ligand_split = buried_area_file_ligand.filter(lambda x:x !=header).map(lambda line: line.split("\t"))
	buried_area_file_ligand = buried_area_file_ligand_split.map(lambda p: Row( buried_area_lig=float(p[0]), buried_area_lig_perc=float(p[1]), buried_area_lig_lig_perc=float(p[2]), pose=str(p[3]) ))

	#Creating buried Dataframe
	buried_area_file_ligand_table = sqlCtx.createDataFrame(buried_area_file_ligand)	
	buried_area_file_ligand_table.registerTempTable("buried_area_ligand")
#**************** Finish	

#**************** Loading Hydrogen Bond 
	hydrogen_bond_num_pose_file_name = os.path.join(path_analysis,"summary_hbonds_4.0A_30.0deg.dat")
	rdd_hydrogen_bond = load_file_summary_hbonds(sc, hydrogen_bond_num_pose_file_name)
	#Creating hydrogen bond Dataframe
	hydrogen_bond_table = create_df_hydrogen_bond(sqlCtx, rdd_hydrogen_bond)
	
#**************** Finish	

	#Creating SQL command
	sql = ""
	sql = "SELECT vina_lig_efficiency.pose, vina_lig_efficiency.affinity, vina_lig_efficiency.ligand_efficiency"
	sql +=" ,buriedArea_total.buried_area_total"
	sql +=" ,buried_area_receptor.buried_area_receptor"
	sql +=" ,buried_area_ligand.buried_area_lig, buried_area_ligand.buried_area_lig_perc, buried_area_ligand.buried_area_lig_lig_perc "
	sql +=" ,hydrogenbond.numHydroBond	"
	sql +=" FROM vina_lig_efficiency"
	sql +=" JOIN buriedArea_total ON buriedArea_total.pose = vina_lig_efficiency.pose"	
	sql +=" JOIN buried_area_receptor ON buried_area_receptor.pose = vina_lig_efficiency.pose"	
	sql +=" JOIN buried_area_ligand ON buried_area_ligand.pose = vina_lig_efficiency.pose"	
	sql +=" LEFT OUTER	"	
	sql +=" JOIN hydrogenbond ON hydrogenbond.pose = vina_lig_efficiency.pose"		
	sql +=" ORDER BY vina_lig_efficiency.pose"

	#Getting all data
	full_dataRDD = sqlCtx.sql(sql) 
	full_dataRDD = full_dataRDD.map(lambda p: (p.affinity, p.ligand_efficiency, p.numHydroBond, p.buried_area_lig, p.buried_area_lig_perc, p.buried_area_lig_lig_perc, p.buried_area_total, p.buried_area_receptor, p.pose) ).collect()

	#Saving file
	save_vs_full_data(path_analysis, full_dataRDD, full_data_file_name)	

	finish_time = datetime.now()

	save_vs_full_data_analysis_log(finish_time, start_time)
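For reference, the string concatenation above assembles the following query (same statement, reflowed into a single readable block):

sql = """
SELECT vina_lig_efficiency.pose, vina_lig_efficiency.affinity, vina_lig_efficiency.ligand_efficiency,
       buriedArea_total.buried_area_total,
       buried_area_receptor.buried_area_receptor,
       buried_area_ligand.buried_area_lig, buried_area_ligand.buried_area_lig_perc, buried_area_ligand.buried_area_lig_lig_perc,
       hydrogenbond.numHydroBond
FROM vina_lig_efficiency
JOIN buriedArea_total ON buriedArea_total.pose = vina_lig_efficiency.pose
JOIN buried_area_receptor ON buried_area_receptor.pose = vina_lig_efficiency.pose
JOIN buried_area_ligand ON buried_area_ligand.pose = vina_lig_efficiency.pose
LEFT OUTER JOIN hydrogenbond ON hydrogenbond.pose = vina_lig_efficiency.pose
ORDER BY vina_lig_efficiency.pose
"""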
Example #14
cf = ConfRead(conf_path)


def getPara(paraName, section="PARA"):
    try:
        return cf.get(section, paraName)
    except Exception as e:
        logging.error("Fail to get para[%s]: %s" % (paraName, e))
        return None
        # sys.exit(1)


code_path = getPara("code_path")
logging.info(code_path)
sc.addPyFile(code_path + '/preprocess/FeatureManager.py')
sc.addPyFile(code_path + '/preprocess/warm_start.py')
sc.addPyFile(code_path + '/preprocess/__init__.py')
sc.addPyFile(code_path + '/optimize/olbfgs.py')
sc.addPyFile(code_path + '/optimize/__init__.py')
sc.addPyFile(code_path + '/trainer.py')
sc.addPyFile(code_path + '/__init__.py')

from FeatureManager import *
from olbfgs import *
from warm_start import set_first_intercept, set_first_intercept_spark

#####################################
data_path = cf.get("PARA", "data_path")

max_iter = int(getPara('max_iter'))
def main():
    # Parse configuration
    app_id = int(sys.argv[1])
    master = sys.argv[2]
    app_name = sys.argv[3]

    # Application configuration
    assert APP_CONFIG.get(app_id) is not None, \
        '[myapp streaming_app_main.main()] configuration error invalid APP_CONFIG with app.id = ' + str(app_id)
    app_conf = map_conf_properties(APP_CONFIG.get(app_id), 'app.id')[app_id]
    spark_home = app_conf['sparkHome']
    pyFiles = app_conf['pyFiles.list']
    di_id = app_conf.get('app.interfaceId')

    # Data interface configuration
    di_in_conf_with_ds_conf = get_di_conf_with_ds_conf(
        di_id, DATAINTERFACE_CONFIG, DATASOURCE_CONFIG,
        di_key='interface.id', di_ds_key='interface.sourceId', ds_key='source.id', merge_key_name='interface.id'
    )[di_id]
    print('= = ' * 20, type(di_in_conf_with_ds_conf), 'di_in_conf_with_ds_conf = ')
    pprint(di_in_conf_with_ds_conf)

    schema_conf_string = di_in_conf_with_ds_conf['schema']
    struct_type = generate_df_schmea(schema_conf_string)
    # schema_field_list = [x.name for x in struct_type.fields]
    di_in_conf_with_ds_conf['struct.type'] = struct_type
    # di_in_conf_with_ds_conf['struct.field.list'] = schema_field_list

    di_out_confs = [kv for kv in DATAINTERFACE_CONFIG.iteritems() if kv[1].get('interface.type', '') == 'output']
    print('= = ' * 20, type(di_out_confs), 'di_out_confs = ')
    pprint(di_out_confs)

    di_out_confs_with_ds_conf = list_dict_merge(
        [get_di_conf_with_ds_conf(
            kv[0], DATAINTERFACE_CONFIG, DATASOURCE_CONFIG,
            di_key='interface.id', di_ds_key='interface.sourceId', ds_key='source.id', merge_key_name='interface.id')
         for kv in DATAINTERFACE_CONFIG.iteritems() if kv[1].get('interface.type', '') == 'output']
    )

    print('= = ' * 20, type(di_out_confs_with_ds_conf), 'di_out_confs_with_ds_conf = ')
    pprint(di_out_confs_with_ds_conf)

    # External cache configuration
    cache_confs_with_ds_conf = list_dict_merge(
        [get_di_conf_with_ds_conf(
            kv[0], CACHE_CONFIG, DATASOURCE_CONFIG,
            di_key='cache.id', di_ds_key='cache.sourceId', ds_key='source.id', merge_key_name='cache.id')
         for kv in CACHE_CONFIG.iteritems()]
    )
    print('= = ' * 20, type(cache_confs_with_ds_conf), 'cache_confs_with_ds_conf = ')
    pprint(cache_confs_with_ds_conf)

    # Configuration of the prepare stage for the given input interface
    # Configuration of the enabled steps within the prepare stage
    # Note: filtering a dict passes the dict's keys to the filter function
    prepares_config_active = PREPARES_CONFIG[di_id] \
        if PREPARES_CONFIG.get(di_id, {}).get('prepares.enabled', False) else {}
    # print('= = ' * 20, type(prepares_config_active), 'prepares_config_active = ')
    # pprint(prepares_config_active)

    # TODO: the results of the two approaches compared with == tested False; remove the commented-out code
    # prepares_config_active_steps = filter(
    # lambda step_conf: step_conf[1].get('step.enabled', False),
    #     map(lambda step_conf: (step_conf[0], map_conf_properties(step_conf[1])),
    #         prepares_config_active.get('steps', {}).iteritems()
    #     )
    # )
    prepares_config_active_steps = \
        [(k, map_conf_properties(v)) for k, v in prepares_config_active.get('steps', {}).iteritems()
         if v.get('step.enabled', False)]

    print('= = ' * 20, type(prepares_config_active_steps), 'prepares_config_active_steps = ')
    pprint(prepares_config_active_steps)

    # Configuration of the compute stage for the given input interface
    # After filter this becomes a list whose elements are tuples (computeStatistics.id, computeStatistics.conf_dict)
    computes_config_active = COMPUTES_CONFIG[di_id] \
        if COMPUTES_CONFIG.get(di_id, {}).get('computeStatistics.enabled', False) else {}

    # list[{computeStatistic.id: {conf}}, ...]
    # # TODO: the results of the two approaches compared with == tested False; remove the commented-out code
    # compute_computeStatistics_config_active = filter(
    #     lambda computeStatistic_conf: computeStatistic_conf[1].get('computeStatistic.enabled', False),
    #     computes_config_active.get('computeStatistics', {}).iteritems())

    compute_computeStatistics_config_active = [
        kv for kv in computes_config_active.get('computeStatistics', {}).iteritems()
        if kv[1].get('computeStatistic.enabled', False)]
    print('= = ' * 20, type(compute_computeStatistics_config_active), 'compute_computeStatistics_config_active = ')
    pprint(compute_computeStatistics_config_active)

    # {computeStatistic.id -> list[step_conf_tuple]}, where step_conf_tuple = (step_id, step_conf_dict)
    compute_prepares_config_active = dict(map(
        lambda computeStatistic_conf: (computeStatistic_conf[0],
                                       sorted(list_dict_merge(
                                           map(lambda step_conf: map_conf_properties(step_conf[1], 'step.id'),
                                               filter(
                                                   lambda step_conf: step_conf[1].get('step.enabled', False),
                                                   computeStatistic_conf[1].get('prepares.steps', {}).iteritems())
                                           )).iteritems())
        ), compute_computeStatistics_config_active))
    # print('= = ' * 30, compute_prepares_config_active2 == compute_prepares_config_active)

    print('= = ' * 20, type(compute_prepares_config_active), 'compute_prepares_config_active = ')
    pprint(compute_prepares_config_active)

    compute_computes_config_active = dict(map(
        lambda computeStatistic_conf: (computeStatistic_conf[0],
                                       sorted(list_dict_merge(
                                           map(lambda step_conf: map_conf_properties(step_conf[1], 'step.id'),
                                               filter(lambda step_conf: step_conf[1].get('step.enabled', False),
                                                      computeStatistic_conf[1].get('computes.steps', {}).iteritems())
                                           )).iteritems())
        ), compute_computeStatistics_config_active))
    print('= = ' * 20, type(compute_computes_config_active), 'compute_computes_config_active = ')
    pprint(compute_computes_config_active)

    test_flag = False
    if not test_flag:
        # Initialization
        # Serializer experiments
        # serializer defaults to PickleSerializer()  # UnpicklingError: invalid load key, '{'.
        # serializer=MarshalSerializer()  # ValueError: bad marshal data
        # serializer=AutoSerializer()  # ValueError: invalid serialization type: {
        # serializer=CompressedSerializer(PickleSerializer())  # error: Error -3 while decompressing data: incorrect header check

        # sc = SparkContext(master, app_name, sparkHome = spark_home, pyFiles=pyFiles)
        # sc = SparkContext(master, app_name, sparkHome = sparkHome, pyFiles=pyFiles, serializer=MarshalSerializer())
        # sc = SparkContext(master, app_name, sparkHome = sparkHome, pyFiles=pyFiles, serializer=AutoSerializer())
        # sc = SparkContext(master, app_name, sparkHome = sparkHome, pyFiles=pyFiles, serializer=CompressedSerializer(PickleSerializer()))

        spark_conf = SparkConf()
        spark_conf.setMaster(master).setAppName(app_name).setSparkHome(spark_home)

        # Spark Streaming tuning configuration
        spark_streaming_blockInterval = str(app_conf.get('spark.streaming.blockInterval', '')).strip()
        if spark_streaming_blockInterval:
            spark_conf.set('spark.streaming.blockInterval', spark_streaming_blockInterval)

        spark_streaming_kafka_maxRatePerPartition = str(
            app_conf.get('spark.streaming.kafka.maxRatePerPartition', '')).strip()
        if spark_streaming_kafka_maxRatePerPartition:
            spark_conf.set('spark.streaming.kafka.maxRatePerPartition', spark_streaming_kafka_maxRatePerPartition)

        spark_streaming_receiver_maxRate = str(app_conf.get('spark.streaming.receiver.maxRate', '')).strip()
        if spark_streaming_receiver_maxRate:
            spark_conf.set('spark.streaming.receiver.maxRate', spark_streaming_receiver_maxRate)

        spark_streaming_concurrentJobs = str(app_conf.get('spark.streaming.concurrentJobs', '')).strip()
        if spark_streaming_concurrentJobs:
            spark_conf.set('spark.streaming.concurrentJobs', spark_streaming_concurrentJobs)

        # Spark SQL tuning configuration
        spark_sql_shuffle_partitions = str(app_conf.get('spark.sql.shuffle.partitions', '')).strip()
        if spark_sql_shuffle_partitions:
            spark_conf.set('spark.sql.shuffle.partitions', spark_sql_shuffle_partitions)

        sc = SparkContext(conf=spark_conf)
        for path in (pyFiles or []):
            sc.addPyFile(path)

        # External cache optimization: distribute via broadcast
        cache_manager = CacheManager()
        cache_broadcast_list = \
            [(cache_id, cache_manager.cache_dataset(sc, cache_conf))
             for cache_id, cache_conf in cache_confs_with_ds_conf.iteritems()
             if cache_conf.get('broadcast.enabled', False)]

        for cache_id, cache_broadcast in cache_broadcast_list:
            cache_confs_with_ds_conf[cache_id]['broadcast'] = cache_broadcast

        batchDurationSeconds = app_conf['batchDuration.seconds']
        ssc = StreamingContext(sc, batchDurationSeconds)
        sqlc = SQLContext(sc)

        # Read the data source
        stream = StreamingReader.readSource(ssc, di_in_conf_with_ds_conf, app_conf)
        # Stream processing: 1) instantiate the handler class for the given data interface from the configuration,
        #                    2) call that handler instance's stream-processing method
        # kafka_wordcount test
        # counts = stream.flatMap(lambda line: line.split(" ")) \
        # .map(lambda word: (word, 1)) \
        # .reduceByKey(lambda a, b: a+b)
        # counts.pprint()
        StreamingApp.process(
            stream, sc, sqlc,
            di_in_conf_with_ds_conf, di_out_confs_with_ds_conf, cache_confs_with_ds_conf,
            prepares_config_active_steps, compute_prepares_config_active, compute_computes_config_active)

        ssc.start()
        ssc.awaitTermination()
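The repeated "read an optional tuning key and set it only if non-empty" blocks above can be collapsed into one loop over the same keys; a sketch assuming app_conf and spark_conf as defined in the example:

tuning_keys = [
    'spark.streaming.blockInterval',
    'spark.streaming.kafka.maxRatePerPartition',
    'spark.streaming.receiver.maxRate',
    'spark.streaming.concurrentJobs',
    'spark.sql.shuffle.partitions',
]
for key in tuning_keys:
    value = str(app_conf.get(key, '')).strip()
    if value:
        spark_conf.set(key, value)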
#Average daily taxi speed
from pyspark import SparkContext, StorageLevel, SparkConf
import os
conf = (SparkConf()
         .setMaster("local")
         .setAppName("My nyc taxi app")
         .set("spark.executor.memory", "1g"))

sc = SparkContext(conf=conf)
root_dir = ''

sc.addPyFile(root_dir + 'utils.py')

from utils import parse_taxi_record_avg_speed

# Load Data
raw_data_url = "data/trips-subset.csv"
raw_data = sc.textFile(raw_data_url)

trips = raw_data.map(parse_taxi_record_avg_speed).reduceByKey( lambda a, b: a + b )
trips.persist(StorageLevel.MEMORY_AND_DISK)
#Number of trips
print trips.count()

trips_avg_speed_grouped = trips.map(lambda ((r,c,t),s): (t[6:8],s)).groupByKey()
trips_avg_speed_dialy = trips_avg_speed_grouped.map(lambda x: (x[0], round(sum(x[1])/len(x[1]))))
#average daily taxi speed
print trips_avg_speed_dialy.sortByKey().collect()
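parse_taxi_record_avg_speed lives in utils.py and is not shown; for the reduceByKey/groupByKey pipeline above to work it must map each CSV line to a ((region_row, region_col, compact_timestamp), speed) pair, roughly as in this hypothetical sketch:

def parse_taxi_record_avg_speed(line):
    # Hypothetical CSV layout: compact timestamp (e.g. "20130115091200"), region row, region column, speed.
    t, r, c, speed = line.split(',')[:4]
    return ((r, c, t), float(speed))   # key shape matches the lambda ((r, c, t), s) used above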

Example #17
from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors

sc = SparkContext()
sc.addPyFile("./dbSparkFinalClass.py")
from dbSparkFinalClass import *

def main() :

    data = sc.textFile("./inputData.csv")

    parsedData = data.map(lambda s : Vectors.dense([float(i) for i in s.split(',')])).cache()
 

    dbScan = ParallelDBScan()
    trainResult = dbScan.train(parsedData, 3, 3, 4)

    for clus in trainResult:
        print(str(clus[0][0]) + "," + str(clus[0][1]) + "," + clus[1])

    sc.stop()



main()
def main():
    sc = SparkContext()
    sqlCtx = SQLContext(sc)

    config = configparser.ConfigParser()
    config.read('config.ini')

    # Path for gromacs spark project
    path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')

    # Adding Python Source file
    sc.addPyFile(os.path.join(path_spark_drugdesign, "gromacs_utils.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "os_utils.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "basic_analysis.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "md_description.py"))

    # Path for gromacs program
    gromacs_path = preparing_path(config.get('DRUGDESIGN', 'gromacs_path'))

    time_dt = int(config.get('GROMACS_ANALYSIS', 'time_dt'))
    time_dt_pdb = int(config.get('GROMACS_ANALYSIS', 'time_dt_pdb'))
    water_layer_thickness = int(config.get('GROMACS_ANALYSIS',
                                           'water_layer_thickness'))

    # File that lists all MD runs used to create the trajectory
    file_of_md_analysis = sys.argv[1]
    check_file_exists(file_of_md_analysis)

    start_time = datetime.now()

    # Broadcast
    gromacs_path = sc.broadcast(gromacs_path)
    time_dt = sc.broadcast(time_dt)
    time_dt_pdb = sc.broadcast(time_dt_pdb)
    water_layer_thickness = sc.broadcast(water_layer_thickness)

# ********************* STARTING FUNCTION ***************************
    def run_trajectory(md_obj):
        ana_dir = os.path.join(md_obj.get_path(), "analysis")
        make_directory(ana_dir)

        # Original file names from the simulation
        reference_xtc = os.path.join(md_obj.get_path(),
                                     md_obj.get_simulation_prefix() + ".xtc")
        reference_tpr = os.path.join(md_obj.get_path(),
                                     md_obj.get_simulation_prefix() + ".tpr")

        # File names after trajectory treatment.
        allatom_xtc = os.path.join(ana_dir, "".join([md_obj.get_prefix_ref(),
                                                     "_fit.",
                                                     str(md_obj.get_repetion_number()),
                                                     ".xtc"]))
        allatom_tpr = reference_tpr
        nonwater_xtc = os.path.join(ana_dir,"".join([md_obj.get_prefix_ref(),
                                                     "_non-water.",
                                                     str(md_obj.get_repetion_number()),
                                                     ".xtc"]))
        nonwater_tpr = os.path.join(ana_dir, "".join([md_obj.get_prefix_ref(),
                                                      "_non-water.",
                                                      str(md_obj.get_repetion_number()),
                                                      ".tpr"]))
        nonwater_pdb = os.path.join(ana_dir, "".join([md_obj.get_prefix_ref(),
                                                      "_non-water.",
                                                      str(md_obj.get_repetion_number()),
                                                      ".pdb"]))
        waterlayer_pdb = os.path.join(ana_dir, "".join([md_obj.get_prefix_ref(),
                                                        "_water-",
                                                        str(water_layer_thickness.value),
                                                        "A-layer.",
                                                        str(md_obj.get_repetion_number()),
                                                        ".pdb"]))

        # Trajectory treatment to remove PBC artifacts
        xtc_whole = os.path.join(ana_dir, "".join([md_obj.get_prefix_ref(),
                                                   "_whole.",
                                                   str(md_obj.get_repetion_number()),
                                                   ".xtc"]))

        command = "".join(["echo System | ",
                           gromacs_path.value,
                           "./gmx trjconv ",
                           "-f ",
                           reference_xtc,
                           " -s ",
                           reference_tpr,
                           " -pbc whole",
                           " -o ",
                           xtc_whole,
                           " >/dev/null 2>/dev/null"])
        proc = Popen(command, shell=True, stdout=PIPE)
        proc.communicate()

        # Extracting first frame
        gro_first_frame = os.path.join(ana_dir, "".join(["0.",
                                                         str(md_obj.get_repetion_number()),
                                                         ".gro"]))
        command = "".join(["echo System | ",
                           gromacs_path.value,
                           "./gmx trjconv ",
                           "-f ",
                           xtc_whole,
                           " -s ",
                           reference_tpr,
                           " -e 0.1 ",
                           " -o ",
                           gro_first_frame,
                           " >/dev/null 2>/dev/null"])
        proc = Popen(command, shell=True, stdout=PIPE)
        proc.communicate()

        # Removing jumps
        xtc_nojump = os.path.join(ana_dir,
                                  "".join([md_obj.get_prefix_ref(),
                                           "_nojump.",
                                           str(md_obj.get_repetion_number()),
                                           ".xtc"]))
        command = "".join(["echo System | ",
                           gromacs_path.value,
                           "./gmx trjconv ",
                           "-f ",
                           xtc_whole,
                           " -s ",
                           gro_first_frame,
                           " -pbc nojump ",
                           " -o ",
                           xtc_nojump,
                           " >/dev/null 2>/dev/null"])
        proc = Popen(command, shell=True, stdout=PIPE)
        proc.communicate()

        # Centering the protein
        xtc_center_protein = os.path.join(ana_dir, "".join([md_obj.get_prefix_ref(),
                                                            "_center.",
                                                            str(md_obj.get_repetion_number()),
                                                            ".xtc"]))
        command = "".join(["echo C-alpha System | ",
                           gromacs_path.value,
                           "./gmx trjconv ",
                           "-f ",
                           xtc_whole,
                           " -s ",
                           gro_first_frame,
                           " -center ",
                           " -o ",
                           xtc_center_protein,
                           " >/dev/null 2>/dev/null"])
        proc = Popen(command, shell=True, stdout=PIPE)
        proc.communicate()

        # Putting all atoms in a compact box
        xtc_atoms_box = os.path.join(ana_dir, "".join([md_obj.get_prefix_ref(),
                                                       "_atom.",
                                                       str(md_obj.get_repetion_number()),
                                                       ".xtc"]))
        command = "".join(["echo System | ",
                           gromacs_path.value,
                           "./gmx trjconv ",
                           "-f ",
                           xtc_center_protein,
                           " -s ",
                           gro_first_frame,
                           " -ur compact ",
                           " -pbc atom ",
                           " -o ",
                           xtc_atoms_box,
                           " >/dev/null 2>/dev/null"])
        proc = Popen(command, shell=True, stdout=PIPE)
        proc.communicate()

        # Fitting the protein
        command = "".join(["echo C-alpha System | ",
                           gromacs_path.value,
                           "./gmx trjconv ",
                           "-f ",
                           xtc_atoms_box,
                           " -s ",
                           gro_first_frame,
                           " -fit rot+trans ",
                           " -o ",
                           allatom_xtc,
                           " >/dev/null 2>/dev/null"])
        proc = Popen(command, shell=True, stdout=PIPE)
        proc.communicate()

        # Creating water-free trajectory
        command = "".join(["echo non-water | ",
                           gromacs_path.value,
                           "./gmx convert-tpr ",
                           " -s ",
                           reference_tpr,
                           " -o ",
                           nonwater_tpr,
                           " >/dev/null 2>/dev/null"])
        proc = Popen(command, shell=True, stdout=PIPE)
        proc.communicate()
        command = "".join(["echo non-water | ",
                           gromacs_path.value,
                           "./gmx trjconv ",
                           "-f ",
                           allatom_xtc,
                           " -s ",
                           gro_first_frame,
                           " -o ",
                           nonwater_xtc,
                           " >/dev/null 2>/dev/null"])
        proc = Popen(command, shell=True, stdout=PIPE)
        proc.communicate()
        command = "".join(["echo system | ",
                           gromacs_path.value,
                           "./gmx trjconv ",
                           " -f ",
                           nonwater_xtc,
                           " -s ",
                           nonwater_tpr,
                           " -o ",
                           nonwater_pdb,
                           " -dt ",
                           str(time_dt_pdb.value),
                           " >/dev/null 2>/dev/null"])
        proc = Popen(command, shell=True, stdout=PIPE)
        proc.communicate()

        # Creating a water-layer pdb trajectory with layer thickness water_layer_thickness (in A)
        t = 0
        frame = 0
        ndx_water_layer = os.path.join(ana_dir, "".join([md_obj.get_prefix_ref(),
                                                         "_water-layer.",
                                                         str(md_obj.get_repetion_number()),
                                                         ".ndx"]))
        ndx_temporary = os.path.join(ana_dir, "".join([md_obj.get_prefix_ref(),
                                                       "_temporary_",
                                                       str(md_obj.get_repetion_number()),
                                                       ".ndx"]))
        if os.path.isfile(waterlayer_pdb):
            os.remove(waterlayer_pdb)
        if os.path.isfile(ndx_water_layer):
            os.remove(ndx_water_layer)
        select_string = ('\'"water_layer" (same residue as ((resname SOL and within 0.'"$water_layer_thickness"' of group "Protein"))) or\
                        (group "Ion" and within 0.'"$water_layer_thickness"' of group "Protein") \
                         or (group "Protein") \'')
        select_string = select_string.replace("$water_layer_thickness",
                                              str(water_layer_thickness.value))
        # Running make_ndx
        command = "".join(["echo -e ",
                           "'chain z'\"\\n\"'q'\"\\n\" | ",
                           gromacs_path.value,
                           "gmx make_ndx ",
                           "-f ",
                           reference_tpr,
                           " -o ",
                           ndx_temporary,
                           " >/dev/null 2>/dev/null"])
        proc = Popen(command, shell=True, stdout=PIPE)
        proc.communicate()

        # Are there ligands?
        if search_for_ligand_ndx_file(ndx_temporary) is True:
            select_string = (select_string
                             + '\'or (same residue as ((resname SOL and within 0.'"$water_layer_thickness"' of group "Other"))) \
                             or (group "Ion" and within 0.'"$water_layer_thickness"' of group "Other") \
                             or (group "Other")\'')
        select_string = select_string.replace("$water_layer_thickness",
                                              str(water_layer_thickness.value))
        command = "".join([gromacs_path.value,
                           "gmx select -f ",
                           allatom_xtc,
                           " -s ",
                           allatom_tpr,
                           " -on ",
                           ndx_water_layer,
                           " -select ",
                           select_string,
                           " -dt ",
                           str(time_dt_pdb.value),
                           " >/dev/null 2>/dev/null"])
        proc = Popen(command, shell=True, stdout=PIPE)
        proc.communicate()

        # Creating pdb files
        command = "".join(["echo ",
                           str(frame),
                           " | ",
                           gromacs_path.value,
                           "./gmx trjconv ",
                           "-f ",
                           allatom_xtc,
                           " -s ",
                           allatom_tpr,
                           " -n ",
                           ndx_water_layer,
                           " -o ",
                           "frame_",
                           str(frame),
                           ".pdb ",
                           "-b ",
                           str(t),
                           " -e ",
                           str(t),
                           " >/dev/null 2>/dev/null"])
        proc = Popen(command, shell=True, stdout=PIPE)
        proc.communicate()
        command = "".join(["echo MODEL ", str(frame), " >> ", waterlayer_pdb])
        proc = Popen(command, shell=True, stdout=PIPE)
        proc.communicate()
        command = "".join(["grep ATOM ",
                           "frame_",
                           str(frame),
                           ".pdb ",
                           ">> ",
                           waterlayer_pdb])
        proc = Popen(command, shell=True, stdout=PIPE)
        proc.communicate()
        command = "".join(["echo ENDML", ">> ", waterlayer_pdb])
        proc = Popen(command, shell=True, stdout=PIPE)
        proc.communicate()

        # Removing temporary files
        command = "".join(["rm frame_", str(frame), ".pdb"])
        proc = Popen(command, shell=True, stdout=PIPE)
        proc.communicate()
        frame = frame + 1
        t = t + int(time_dt_pdb.value)

        if os.path.isfile(xtc_whole):
            os.remove(xtc_whole)
        if os.path.isfile(xtc_nojump):
            os.remove(xtc_nojump)
        if os.path.isfile(xtc_center_protein):
            os.remove(xtc_center_protein)
        if os.path.isfile(xtc_atoms_box):
            os.remove(xtc_atoms_box)
        if os.path.isfile(ndx_water_layer):
            os.remove(ndx_water_layer)
        if os.path.isfile(gro_first_frame):
            os.remove(gro_first_frame)
        command = "rm \#* 2>/dev/null"
        proc = Popen(command, shell=True, stdout=PIPE)
        proc.communicate()

        # Basic Analysis
        basic_an_data = (gromacs_path.value,
                         nonwater_xtc,
                         nonwater_tpr,
                         md_obj.get_simulation_prefix(),
                         ana_dir,
                         time_dt.value)
        run_basic_analysis(basic_an_data)

# ************************** END FUNCTION **********************************

    list_obj_md = load_md_traj(file_of_md_analysis)

    md_trajRDD = sc.parallelize(list_obj_md)

    md_trajRDD.foreach(run_trajectory)

    finish_time = datetime.now()

    time_execution_log(finish_time, start_time, "gromacs_trajectory.log")
    config_vina = config.get('VINA', 'config_file')
    vina_path = config.get('VINA', 'vina_program')
    pdbqt_ligand_path = config.get('DEFAULT', 'pdbqt_ligand_path')
    pdbqt_receptor_path = config.get('DEFAULT', 'pdbqt_receptor_path')
    path_save_output = config.get('DEFAULT', 'path_save_structure')
    path_save_log = config.get('DEFAULT', 'path_save_log')
    path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')

    path_save_log = preparing_path(path_save_log)
    make_directory(path_save_log)

    path_save_output = preparing_path(path_save_output)
    make_directory(path_save_output)

    # Adding Python Source file
    sc.addPyFile(os.path.join(path_spark_drugdesign, "docking_description.py"))

    # Broadcast
    vina_path = sc.broadcast(vina_path)
    pdbqt_ligand_path = sc.broadcast(pdbqt_ligand_path)
    pdbqt_receptor_path = sc.broadcast(pdbqt_receptor_path)
    path_save_output = sc.broadcast(path_save_output)
    path_save_log = sc.broadcast(path_save_log)
    sc.addFile(config_vina)

    file_of_vina_docking = sys.argv[1]
    check_file_exists(file_of_vina_docking)
    start_time = datetime.now()

    def run_vina_docking(vd_obj):
            activations = sparkRun(sess, image_filenames)
            return activations, y


start_time = time.time()

cat_image_names_train = os.listdir(str(train_dir) + 'cat/')
dog_image_names_train = os.listdir(str(train_dir) + 'dog/')
random.shuffle(cat_image_names_train)
random.shuffle(dog_image_names_train)

image_filenames = getTrainBatchImages(cat_image_names_train[0:20],
                                      dog_image_names_train[0:20])

imageNames = sc.parallelize(image_filenames)
sc.addPyFile("vgg16_cat_and_dog_svm.py")
sc.addPyFile("utils_svm.py")
sc.addFile("vgg16.npy")

train_activations = imageNames.map(activationRun)
train_activations_collect = train_activations.collect()
print('##### train_activations_len: ', len(train_activations_collect))
print('##### train_activations_collect 1: ',
      train_activations_collect[0][0].shape)
# print('##### train_activations_collect 2: ', train_activations_collect[0][1])
# print('##### train_activations_collect 2: ', train_activations_collect[1][1])
# print('##### train_activations_collect 2: ', train_activations_collect[2][1])
print('########duration: ' + str((time.time() - start_time)))

train_activations_rdd = sc.parallelize(train_activations_collect)
parsedData = train_activations_rdd.map(parsePoint)
Example #21
        if (longitude > 140 and longitude <= 180):
            result.append("node_7")

        result.append("node_9")

    return result


if __name__ == '__main__':
    import happybase
    # configure the spark environment
    sparkConf = SparkConf().setAppName("Simulating Streamline")
    sparkConf.set("spark.serializer",
                  "org.apache.spark.serializer.KryoSerializer")
    sc = SparkContext(conf=sparkConf)
    sc.addPyFile("module.zip")
    #     from pywebhdfs.webhdfs import PyWebHdfsClient;
    distributed_dataset = sc.textFile(
        "hdfs:/user/uacharya/110_Stations_Data_Combined.txt",
        use_unicode=False,
        minPartitions=24)
    print("this is the driver container")
    # getting the header of the whole dataset
    header = distributed_dataset.first()
    # filtering the header out of the data
    distributed_dataset = distributed_dataset.filter(lambda d: d != header)
    # mapping the data to prepare for processing
    data_in_required_format = distributed_dataset.map(
        create_required_datewise_data)
    data_in_required_format.cache()
    #collecting keys to do batch processing based on keys
Example #22
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from movie import Movie, clean_movie

es_write_conf = {"es.nodes": 'localhost',
                 "es.port": '9200',
                 "es.resource": 'movies-index/movie',
                 "es.nodes.wan.only": "true"
                 }

conf = SparkConf().setAppName("PythonStreamingDirectKafkaWordCount") \
    .set("es.nodes", "localhost:9200") \
    .set("es.index.auto.create", "true")

sc = SparkContext(appName="PythonStreamingDirectKafkaWordCount", conf=conf)
sc.addPyFile("./Spark/movie.py")
ssc = StreamingContext(sc, 5)
# brokers, topic = sys.argv[1:]
kvs = KafkaUtils.createDirectStream(ssc, ["movies"], {"metadata.broker.list": "localhost:9092"})

x = kvs.map(lambda row: row[1]) \
    .map(lambda row: row.split("||")) \
    .map(lambda row: Movie(row[0], float(row[1]), row[2], float(row[3]), float(row[4]), row[5], row[6],
                           row[7], row[8], row[9], row[10], row[11])) \
    .map(lambda obj: clean_movie(obj)) \
    .map(lambda obj: obj.__dict__) \
    .map(lambda obj: (None, obj)) \
    .foreachRDD(lambda rdd: rdd.saveAsNewAPIHadoopFile(
                                path='-',
                                outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
                                keyClass="org.apache.hadoop.io.NullWritable",
Example #23
    return rdd


# Add the streaming package and initialize
findspark.add_packages(
    ["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.2.2"])
findspark.init()
TOPICS = ['taiwan']
BROKERS = "localhost:9092"
PERIOD = 10
APP_NAME = 'sentiment'
COMPANY = 'taiwan'

sc = SparkContext(appName="PythonStreamingDirectKafkaWordCount")
sc.addPyFile(
    os.path.join(os.path.dirname(os.path.realpath(__file__)),
                 'stanfordNLP.py'))

# except:
#     conf = SparkConf().set("spark.default.paralleism", 1)
#     spark = pyspark.sql.SparkSession.builder \
#                                     .master("local[4]") \
#                                     .appName(APP_NAME) \
#                                     .config(conf=conf)  \
#                                     .getOrCreate()
# sc = spark.sparkContext
#create a streaming context with batch interval 10 sec
ssc = StreamingContext(sc, PERIOD)
directKafkaStream = KafkaUtils.createDirectStream(
    ssc, TOPICS, {"metadata.broker.list": BROKERS})
Example #24
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

from pyspark.sql import SparkSession
from pyspark import SparkContext

sc = SparkContext("spark://vm1:7077", "StreamProcessing")
sc.addPyFile('/home/tom/Spark-Recommendation-System/db_connector.py')
sc.setLogLevel("ERROR")

from db_connector import DBConnector

TRIGGER_INTERVAL = 30  # in seconds
TOPIC_NAME = 'spark_streaming'
KAFKA_PORT = 'vm1:2181'
db = DBConnector('streaming_db')


def fit_model(df):
    als = ALS(maxIter=10,
              regParam=0.01,
              userCol="userId",
              itemCol="movieId",
              ratingCol="rating",
              coldStartStrategy="drop")
    model = als.fit(df)
    return model

if __name__ == "__main__":
    start_time = time.time()
    if len(sys.argv) != 3:
        print("Number of arguments not valid!")
        sys.exit(1)

    with open('./config.json') as config:
        parameters = json.load(config)["configuration"][0]

    INPUT_PATH = str(sys.argv[1])
    OUTPUT_PATH = str(sys.argv[2])

    sc = SparkContext("yarn", "Kmeans")
    sc.setLogLevel("ERROR")
    sc.addPyFile(
        "./point.py"
    )  ## It's necessary, otherwise the spark framework doesn't see point.py

    print("\n***START****\n")

    points = sc.textFile(INPUT_PATH).map(Point).cache()
    initial_centroids = init_centroids(points, k=parameters["k"])
    distance_broadcast = sc.broadcast(parameters["distance"])
    centroids_broadcast = sc.broadcast(initial_centroids)

    stop, n = False, 0
    while True:
        print("--Iteration n. {itr:d}".format(itr=n + 1), end="\r", flush=True)
        cluster_assignment_rdd = points.map(assign_centroids)
        sum_rdd = cluster_assignment_rdd.reduceByKey(lambda x, y: x.sum(y))
        centroids_rdd = sum_rdd.mapValues(
def main():

    config = configparser.ConfigParser()
    config.read('config.ini')

    #Number of poses to select by hydrogen bond
    number_poses_to_select_hydrogen_bond = int(
        config.get('DRUGDESIGN', 'number_poses_to_select_hydrogen_bond'))
    # list of residues used to select hydrogen bonds
    file_select_hydrogen_bond = config.get(
        'DRUGDESIGN', 'file_residue_to_select_hydrogen_bond')
    #Path that contains all files for analysis
    path_analysis = config.get('DEFAULT', 'path_analysis')
    #Path where all pdb receptor are
    path_receptor = config.get('DEFAULT', 'pdb_path')
    #Ligand Database file
    ligand_database = config.get('DEFAULT', 'ligand_database_path_file')
    #Path for saving pdb files of models generated by VS
    path_ligand = get_directory_pdb_analysis(path_analysis)
    #File for saving the filtered hydrogen bonds
    result_file_to_select_hydrogen_bond = config.get(
        'DRUGDESIGN', 'result_file_to_select_hydrogen_bond')
    #File for saving only the poses of the filtered hydrogen bonds
    result_file_to_select_hydrogen_bond_only_pose = config.get(
        'DRUGDESIGN', 'result_file_to_select_hydrogen_bond_only_pose')
    result_file_to_select_normalized_hydrogen_bond_only_pose = config.get(
        'DRUGDESIGN',
        'result_file_to_select_normalized_hydrogen_bond_only_pose')
    result_file_to_select_normalized_heavy_atom_hydrogen_bond_only_pose = config.get(
        'DRUGDESIGN',
        'result_file_to_select_normalized_heavy_atom_hydrogen_bond_only_pose')
    #Path where the selected complexes are saved
    path_to_save = os.path.join("selected_complexo", "hydrogen_bond")
    path_to_save = os.path.join(path_analysis, path_to_save)
    if not os.path.exists(path_to_save):
        os.makedirs(path_to_save)
    #Path where the normalized selected complexes are saved
    path_to_save_normalized_da = os.path.join(
        "selected_complexo", "normalized_hydrogen_bond_donors_acceptors")
    path_to_save_normalized_da = os.path.join(path_analysis,
                                              path_to_save_normalized_da)
    if not os.path.exists(path_to_save_normalized_da):
        os.makedirs(path_to_save_normalized_da)
    path_to_save_normalized_heavyAtom = os.path.join(
        "selected_complexo", "normalized_hydrogen_bond_heavyAtom")
    path_to_save_normalized_heavyAtom = os.path.join(
        path_analysis, path_to_save_normalized_heavyAtom)
    if not os.path.exists(path_to_save_normalized_heavyAtom):
        os.makedirs(path_to_save_normalized_heavyAtom)
    #Path where the complexes selected by residue-list-normalized hydrogen bonds are saved
    path_to_save_normalized_residue = os.path.join(
        "selected_complexo",
        "normalized_hydrogen_bond_residue_donors_acceptors")
    path_to_save_normalized_residue = os.path.join(
        path_analysis, path_to_save_normalized_residue)
    if not os.path.exists(path_to_save_normalized_residue):
        os.makedirs(path_to_save_normalized_residue)

    path_to_save_normalized_residue_heavyAtoms = os.path.join(
        "selected_complexo", "normalized_hydrogen_bond_residue_heavyAtoms")
    path_to_save_normalized_residue_heavyAtoms = os.path.join(
        path_analysis, path_to_save_normalized_residue_heavyAtoms)
    if not os.path.exists(path_to_save_normalized_residue_heavyAtoms):
        os.makedirs(path_to_save_normalized_residue_heavyAtoms)

    # Create SPARK config
    maxResultSize = str(config.get('SPARK', 'maxResultSize'))
    conf = (SparkConf().set("spark.driver.maxResultSize", maxResultSize))

    # Create context
    sc = SparkContext(conf=conf)
    sqlCtx = SQLContext(sc)

    start_time = datetime.now()

    #Broadcast
    path_to_save_b = sc.broadcast(path_to_save)
    path_receptor_b = sc.broadcast(path_receptor)
    path_ligand_b = sc.broadcast(path_ligand)

    #Adding Python Source file
    #Path for drugdesign project
    path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')
    sc.addPyFile(os.path.join(path_spark_drugdesign, "vina_utils.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "database_io.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "hydrogen_bond_io.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "hydrogen_bond_crud.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "json_utils.py"))

    #load all-residue_hbonds_4.0A_30.0deg.dat file
    path_file_hydrogen_bond = os.path.join(
        path_analysis, "all-residue_hbonds_4.0A_30.0deg.dat")
    all_residue_split = load_file_all_residue_hbonds(sc,
                                                     path_file_hydrogen_bond)

    #Creating all_residue Dataframe
    df_all_residue = create_df_all_residue(sqlCtx, all_residue_split)

    if os.path.isfile(file_select_hydrogen_bond):
        #Creating residue list as Dataframe
        residue_listRDD = load_file_select_hydrogen_bond(
            sc, file_select_hydrogen_bond)
        df_residue_list = create_df_residue_list(sqlCtx, residue_listRDD)

        df_result = create_df_all_residue_filtered_by_res_list(sqlCtx)
        #Saving result
        path_file_result_file = os.path.join(
            path_analysis, result_file_to_select_hydrogen_bond)
        save_result(path_file_result_file, df_result)

        #Grouping by poses
        df_result = get_group_by_poses_all_residue_filtered_by_res_list(sqlCtx)

        #Saving result only pose
        path_file_result_file_only_pose = os.path.join(
            path_analysis, result_file_to_select_hydrogen_bond_only_pose)
        save_result_only_pose(path_file_result_file_only_pose, df_result)

        #Loading all poses group by poses
        only_poseRDD = load_only_poses_file_hydrogen_bond(
            sc, path_file_result_file_only_pose)
        only_pose_takeRDD = only_poseRDD.take(
            number_poses_to_select_hydrogen_bond)

        #Calculating normalized hydrogen bond

        #Loading database
        rdd_database = load_database(sc, ligand_database)
        #Creating Dataframe
        database_table = sqlCtx.createDataFrame(rdd_database)
        database_table.registerTempTable("database")

        #Creating Dataframe normalized_by_donors_acceptors
        df_result = create_df_normalized_by_donors_acceptors(sqlCtx, df_result)
        #Saving result only pose by normalized hydrogen bond
        path_file_result_file_only_pose = os.path.join(
            path_analysis,
            result_file_to_select_normalized_hydrogen_bond_only_pose)
        save_result_only_pose_normalized_by_residue_list(
            path_file_result_file_only_pose, df_result)

        #Loading poses - normalized_residues_filtered_by_list
        only_pose_normalizedRDD = load_only_poses_file_hydrogen_bond_normalized_by_residues(
            sc, path_file_result_file_only_pose)
        only_pose_normalizedRDD = only_pose_normalizedRDD.take(
            number_poses_to_select_hydrogen_bond)

        # Normalized Hydrogen Bond by heavy atoms
        df_result = create_df_normalized_by_heavy_atoms(sqlCtx)

        #Saving result only pose by normalized hydrogen bond
        path_file_result_file_only_pose = os.path.join(
            path_analysis,
            result_file_to_select_normalized_heavy_atom_hydrogen_bond_only_pose
        )
        save_result_only_pose_normalized_by_residue_list_heavy_atoms(
            path_file_result_file_only_pose, df_result)

        #Loading poses - normalized_residues_filtered_by_list
        only_pose_normalized_heavyAtomsRDD = load_only_poses_file_hydrogen_bond_normalized_by_residues(
            sc, path_file_result_file_only_pose)
        only_pose_normalized_heavyAtomsRDD = only_pose_normalized_heavyAtomsRDD.take(
            number_poses_to_select_hydrogen_bond)

    #************** END OF RESIDUE LIST

    #Loading normalized poses by donors and acceptors
    path_file_normalized_pose = os.path.join(
        path_analysis,
        "summary_normalized_hbonds_donors_acceptors_4.0A_30.0deg.dat")
    normalized_poseRDD = load_file_summary_normalized_hbonds(
        sc, path_file_normalized_pose)

    normalized_poseRDD = normalized_poseRDD.take(
        number_poses_to_select_hydrogen_bond)

    #Loading normalized poses by heavy atoms
    path_file_normalized_pose = os.path.join(
        path_analysis, "summary_normalized_hbonds_heavyAtom_4.0A_30.0deg.dat")
    normalized_pose_heavyAtomsRDD = load_file_summary_normalized_hbonds(
        sc, path_file_normalized_pose)

    normalized_pose_heavyAtomsRDD = normalized_pose_heavyAtomsRDD.take(
        number_poses_to_select_hydrogen_bond)

    # ******************** STARTED FUNCTION ********************************
    def build_complex_from_pose_file_name(p_name):
        from vina_utils import get_receptor_from_receptor_ligand_model, get_ligand_from_receptor_ligand_model, get_model_from_receptor_ligand_model, get_separator_filename_mode
        #Broadcast
        path_to_save = path_to_save_b.value
        path_receptor = path_receptor_b.value
        path_ligand = path_ligand_b.value
        #Based on row value from dataframe
        pose_file_name = p_name.pose

        #Receptor
        receptor_file_name = get_receptor_from_receptor_ligand_model(
            pose_file_name)
        receptor_file = os.path.join(path_receptor,
                                     receptor_file_name + ".pdb")
        f_receptor_file = open(receptor_file, "r")
        #ligand file name
        ligand_file_name = os.path.join(path_ligand, pose_file_name + ".pdb")
        f_ligand_file_name = open(ligand_file_name, "r")

        #Open file for writing the complex
        full_path_for_save_complex = os.path.join(path_to_save,
                                                  p_name.f_name + ".pdb")
        f_compl = open(full_path_for_save_complex, "w")
        #Insert lines of receptor
        for item in f_receptor_file:
            if str(item).find("END") == -1:
                f_compl.write(item)
        #Insert lines of model
        for item in f_ligand_file_name:
            if str(item).find("REMARK") == -1:
                f_compl.write(item)
        #Closing files
        f_compl.close()
        f_ligand_file_name.close()
        f_receptor_file.close()


# ******************** FINISHED FUNCTION ********************************

    if os.path.isfile(file_select_hydrogen_bond):
        #Selecting poses by residues filtered
        sc.parallelize(only_pose_takeRDD).foreach(
            build_complex_from_pose_file_name)
        #Updated path to save complex
        path_to_save_b = sc.broadcast(path_to_save_normalized_residue)
        sc.parallelize(only_pose_normalizedRDD).foreach(
            build_complex_from_pose_file_name)
        #Updated path to save complex
        path_to_save_b = sc.broadcast(path_to_save_normalized_residue_heavyAtoms)
        sc.parallelize(only_pose_normalized_heavyAtomsRDD).foreach(
            build_complex_from_pose_file_name)

    #Selecting poses by normalized donors and acceptors
    #Broadcast
    path_to_save_b = sc.broadcast(
        path_to_save_normalized_da)  #Updated path to save complex
    sc.parallelize(normalized_poseRDD).foreach(
        build_complex_from_pose_file_name)

    #Selecting poses by normalized heavy atoms
    #Broadcast
    path_to_save_b = sc.broadcast(
        path_to_save_normalized_heavyAtom)  #Updated path to save complex
    sc.parallelize(normalized_pose_heavyAtomsRDD).foreach(
        build_complex_from_pose_file_name)

    finish_time = datetime.now()

    save_log(finish_time, start_time)
#todo: visualization

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

conf = SparkConf().setMaster('local').setAppName('InternationalStudentsByCountry')
sc = SparkContext(conf = conf)
sqlContext = SQLContext(sc)

import pyspark_csv as pycsv
sc.addPyFile('pyspark_csv.py')

def extract_row(row):
   country = row[2]
   year = row[13]
   internationalStudents = 0
   if (row[9] != None and row[11] != None):
      numStudents = float(str(row[9]).replace(',',''))
      internationalPercentage = float(str(row[11])[:-1])
      internationalStudents = int(numStudents * internationalPercentage / 100.0)
   return ((year, country), internationalStudents)

plaintext_rdd = sc.textFile('file:///Users/Wik/Documents/Kuliah/BigData/Tugas-2/WorldRankUniversity-Mining/data/timesData.csv')
rdd = pycsv.csvToDataFrame(sqlContext, plaintext_rdd).rdd
mapped = rdd.map(extract_row)
reduced = mapped.reduceByKey(lambda a, b : a + b)
sorted = reduced.sortByKey()
result = sorted.collect()
for item in result:
   print str(item[0][0]) + ' - ' + str(item[0][1]) + ': ' + str(item[1])
   
#spark-submit <name of job py file>.py /data/movie-ratings/ratings.dat /data/movie-ratings/movies.dat 1 0.97 20 1000 COSINE
import findspark
findspark.init()


import pyspark
import sys
import re
import random
#import numpy

from pyspark import SparkConf, SparkContext
sc = SparkContext(appName = "MovieLens")
from math import sqrt
#sc.addPyFile("similarity.py")
sc.addPyFile("movielensfcn.py")


from movielensfcn import parseMovies, removeDuplicates, itemItem
#from similarity import cosine_similarity, jaccard_similarity


if __name__=="__main__":
    if len(sys.argv)< 3:
        print >> sys.stderr, "Usage: MovieLens ratings movies"
        exit(-1)
    ratings_file = sys.argv[1]
    movies_file = sys.argv[2]
    if len(sys.argv)>6:
        movie_id = int(sys.argv[3])
        threshold = float(sys.argv[4])
Beispiel #29
0
from sklearn import cross_validation 
from sklearn.metrics import precision_recall_curve
from sklearn.cross_validation import train_test_split 
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm 
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import cross_val_score



#add spark content 
sc = SparkContext(appName="sidgan")
sc.addPyFile('/home/spark-1.4.1-bin-hadoop2.4/pyspark-csv/pyspark_csv.py')

def transform_csv():
    global data
    global target
    #make target column for classification
    #target = data.map(convert)
    #this gives RDD
    #target should be float and not RDD

    target = filter(convert)
    #should give target as a float 

def merge_csv():
    global data
    global week 
Beispiel #30
0
			returns.append([ arr[x] for x in stations[val] ] )
	return returns
if __name__ == "__main__":
	import os
	#if not os.path.exists("step" + str(step_size) + "/window_" + str(window_size) + "_dataset" ):
	#	Path( "step" + str(step_size) + "/window_" + str(window_size) + "_dataset").mkdir(parents=True, exist_ok=True)
	conf = SparkConf().setAppName("features")
	# Run the above function and store its results in a variable.   
	full_file_paths = get_filepaths("inputs/")
	#random.seed(100)
	paths = [ x for x in full_file_paths if x.split("/")[-1][:2] =="sp" ]#[ : int( 0.3 * len(full_file_paths) )]
	random.shuffle(paths)
	
	sc = SparkContext(master='spark://137.30.125.208:7077', appName='spark_features')
	#local files to import 
	sc.addPyFile('bfi.py')
	sc.addPyFile('features.py')
	for f in paths:
		fil = open(f).readlines()
		val = gen_data(fil)
		depth = calculate_depth(val[1]) 
		val = sc.parallelize(val)
		features = val.map( lambda x: separate_into_energy_features(x) )#.map(lambda a: window(a) )
		llist = features.collect()
		val = [ a for a in window(llist, n = window_size) if a != [] and a[0] != [] ] 
		my_val = sc.parallelize(val).map(lambda a : multiply(a) )
		vals = my_val.collect()
		bfis =  features.map(lambda a : calculate_bfi(a) ).collect()
		with open("feature_files/" + f.split("/")[-1], "w" ) as myfile:
			#since we loop over outputs rather than inputs, it is implicitly - window_size too
			inputs = [x for x in vals ][1:][:len(vals ) - step_size]# - window_size  ] # first element is [0] by some thing and window already starts like that
Beispiel #31
0
            iter += 1

        return KMeansModel(centers)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="do kmeans clustering")
    parser.add_argument("master", type=str)
    parser.add_argument("datafile", type=str)
    parser.add_argument("outputdir", type=str)
    parser.add_argument("k", type=int)
    parser.add_argument("--maxiter", type=float, default=20, required=False)
    parser.add_argument("--tol", type=float, default=0.001, required=False)
    parser.add_argument("--preprocess", choices=("raw", "dff", "dff-highpass", "sub"), default="raw", required=False)

    args = parser.parse_args()

    sc = SparkContext(args.master, "kmeans")

    if args.master != "local":
        egg = glob.glob(os.path.join(os.environ['THUNDER_EGG'], "*.egg"))
        sc.addPyFile(egg[0])

    data = load(sc, args.datafile, args.preprocess).cache()
    model = KMeans(k=args.k, maxiter=args.maxiter, tol=args.tol).train(data)
    labels = model.predict(data)

    outputdir = args.outputdir + "-kmeans"
    save(model.centers, outputdir, "centers", "matlab")
    save(labels, outputdir, "labels", "matlab")
from pyspark.sql import SparkSession
import time
from pyspark import SparkContext
sc = SparkContext('yarn')
sc.addPyFile("s3a://rogerzhuo/graphframes-0.6.0-spark2.3-s_2.11.jar")
from pyspark.sql.functions import *
from graphframes import *

spark = SparkSession.builder.appName("Prime_algorithm").getOrCreate()

# Prepare data.
v1 = spark.createDataFrame([
    (0,),
    (1,),
    (2,),
    (3,),
    (4,),
    (5,),
    (6,),
    (7,),
    (8,),
    (9,)], ["id"])

# Edges DataFrame
e1 = spark.createDataFrame([
    (1, 2, 1),
    (2, 3, 7),
    (1, 9, 5),
    (1, 8, 10),
    (9, 0, 2),
    (9, 5, 6),
Beispiel #33
0
			printOnConsole('Nothing to process')
	
	except Exception, ex:
		printOnConsole('There was an error...')
		print ex			
	
if __name__ == "__main__":
	#_conf = new SparkConf(true)
	conf = (SparkConf()
		.setAppName(SPARK_APPNAME)
		.set("spark.serializer", SPARK_SERIALIZER))

	sc = SparkContext(conf=conf)
	ssc = StreamingContext(sc, SPARK_STREAM_BATCH)

	sc.addPyFile(CODE_PATH + '/pyspark_csv.py')
	sc.addPyFile(CODE_PATH + '/constants.py')

	sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", S3ACCESSID)
	sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", S3SECRETKEY)

	sqlContext = SQLContext(sc)
	registerUDF(sqlContext)

	printOnConsole('Streaming started')

	
	kinesisStream = [KinesisUtils.createStream(ssc, APPLICATION_NAME, STREAM_NAME, ENDPOINT, REGION_NAME, INITIAL_POS, CHECKPOINT_INTERVAL, awsAccessKeyId =AWSACCESSID, awsSecretKey=AWSSECRETKEY, storageLevel=STORAGE_LEVEL) for _ in range (NUM_STREAMS)]
	
	unifiedStream = ssc.union(*kinesisStream)
		
import os

from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import SQLContext
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

conf = (SparkConf()
        .setMaster("spark://sparkmaster:7077")
        .setAppName("HUDI_EXERCISE"))

sc = SparkContext(conf=conf)
spark = SparkSession(sc)

os.system("echo 'PROCESSING DATA...'")

sc.addPyFile("/var/hoodie/ws/spark-job/utils.py")
data = sc.wholeTextFiles("hdfs://namenode:8020/wiki/extra/delete")

pages = data.flatMap(lambda x: (x[1].split('</doc>'))).map(lambda x: (Utils.get_title(x), Utils.get_date_timestamp(
    x), Utils.get_content(x))).filter(lambda x: ((len(x[0]) != 0) or (len(x[1]) != 0))).filter(lambda x: Utils.check_if_person(x[1]))
df = pages.toDF(["title", "date", "content"])
df = df.select('title', to_date(
    df.date, 'MM/dd/yyyy').alias('date'), "content")

tableName = "hudi_celebrities"
basePath = "hdfs://namenode:8020/wiki/hudi_celebrities"

hudi_delete_options = {
    'hoodie.table.name': tableName,
    'hoodie.datasource.write.table.type': 'COPY_ON_WRITE',
    'hoodie.datasource.write.operation': 'delete',

    sc.stop()

if __name__ == '__main__':
    main()


import sys
sys.path.append('/usr/local/lib/python2.7/site-packages')
sys.path.append('/home/hadoop/app/spark/python')
sys.path.append('/home/hadoop/app/spark/python/lib/py4j-0.8.2.1-src.zip')
from pyspark import SparkContext, SparkConf
from mysql_utils import MySQLUtils
master = 'local[2]'
app_name = 'test-broadcast'
# spark_home = '/data01/app/bigdata/spark'  # local
spark_home = '/home/hadoop/app/spark'  # test

pyFiles = ['mysql_utils.py']
spark_conf = SparkConf()
spark_conf.setMaster(master).setAppName(app_name).setSparkHome(spark_home)
sc = SparkContext(conf=spark_conf)
for path in (pyFiles or []):
    sc.addPyFile(path)

external_cache = get_api_deviceinfo()

deviceinfo_b = sc.broadcast(external_cache)
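# (Illustrative sketch, not part of the original snippet.) A typical use of the broadcast
# above: executors read the cached device-info dict instead of calling the API per record.
# The "device_id"/"model" keys and the RDD name "log_rdd" are hypothetical.
def attach_device_info(record):
    info = deviceinfo_b.value.get(record.get("device_id"), {})
    record["device_model"] = info.get("model")
    return record

# enriched_rdd = log_rdd.map(attach_device_info)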

Beispiel #36
0
from pyspark.sql import SQLContext
from pyspark.sql.types import StringType, DateType, IntegerType
from pyspark import SparkConf, SparkContext
from pyspark.sql.functions import UserDefinedFunction
from pyspark.storagelevel import StorageLevel
from flask import Flask, jsonify
from utils import crossdomain
app = Flask(__name__)

conf = SparkConf() \
    .setMaster("spark://172.21.0.14:7077") \
    .setAppName("tv-scenes") \
    .set("spark.executor.memory", "1g") \
    .set("spark.ui.port", 4040)

sc = SparkContext(conf=conf)
sc.addPyFile("foo.py")
sqlContext = SQLContext(sc)

CSV_PATH = 'file:///home/ubuntu/DWDB/'


def read_csv_into_temptable(table_name):
    filename = "{}/{}.csv".format(CSV_PATH, table_name)
    df = sqlContext.read.csv(filename, header=True, inferSchema=True)
    df.registerTempTable(table_name)


# load data

read_csv_into_temptable('EventClientChannelTune')
read_csv_into_temptable('Channels')
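
# (Illustrative sketch, added here; the column names are assumptions about the CSV schema,
# not taken from the original data.) Once the CSVs are registered as temp tables, the same
# SQLContext can answer aggregate queries, e.g. the most-tuned channels:
def top_channels_by_tunes():
    return sqlContext.sql("""
        SELECT c.ChannelName, COUNT(*) AS tunes
        FROM EventClientChannelTune t
        JOIN Channels c ON t.ChannelId = c.ChannelId
        GROUP BY c.ChannelName
        ORDER BY tunes DESC
    """)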
        for x in range(100)
    ]
    rdd = run(image_collection).collect()
    rdd.sort(key=lambda x: x[0])
    rdd = [str(x[0]) + ": " + str(x[1]) + "\n" for x in rdd]
    with open("test/test_output.txt", 'w') as f:
        f.writelines(rdd)
else:
    #connecting mysql
    # db = mysql.connector.connect(user='******', password='******',
    #                           host=os.environ['mySQLHost'],
    #                           database='my_db')
    # cursor=db.cursor()
    # db.commit()
    sc = SparkContext()
    sc.addPyFile("./helper-functions.py")
    sc.addPyFile("./constants.py")
    sc.addPyFile("./spark_image_compressor.py")
    while True:
        # sql1='select * from people'
        # cursor.execute(sql1)
        # data=cursor.fetchall()
        # if len(data) % 3 == 0:

        # file_like=cStringIO.StringIO(data[0][0])
        # img=PIL.Image.open(file_like)
        # this is the line that gets the images
        image = cv2.imread(args.input, cv2.IMREAD_UNCHANGED)
        image_collection = [(x, image) for x in range(10)]
        rdd = run(image_collection, sc).collect()
        cv2.imwrite(args.output, rdd[0][1])
Beispiel #38
0
def main():
    sc = SparkContext(conf=SparkConf().setAppName("wil_hot_sku_calc_online"))
    hc = HiveContext(sc)
    sc.addPyFile(sys.argv[1])
    from core.common import common
    param = common.init_params(sys.argv, p)
    # date, pid, ts, dc_id = param["date"], param["pid"], param["ts"], param["dc_id"]
    date = '2018-10-19'
    pid = '201810190031002'
    ts = '1539915102918'
    today = dt.datetime.strptime(date, "%Y-%m-%d").date()
    someday = dt.datetime.strptime('2018-10-24', "%Y-%m-%d").date()
    yesterday = today - dt.timedelta(1)
    three_days_ago = today - dt.timedelta(3)
    # thirty_days = today - dt.timedelta(30)
    # 2.1. Join the order data
    # In the future, add a dc_id column to tables a and b and join on dc_id
    sql_hot_sku_data_all = """
        select a.sku_id as sku_id
               ,a.future_source_store_id as future_source_store_id
               ,to_date(b.out_wh_tm) as sub_dt
               ,b.ord_id as ord_id
               ,b.sale_qtty as sale_qtty
        from
            (select sku_id
                   ,current_source_store_id
                   ,future_source_store_id
            from app.app_wil_hot_sku_all
            where dt='""" + str(today) + """'
            and pid='""" + pid + """'
            and ts='""" + ts + """'
            ) a
        left join
            (select sku_id
                   ,store_id
                   ,parent_sale_ord_id as ord_id
                   ,sale_qtty
                   ,out_wh_tm
            from app.app_wil_hot_sale_store_sku_ord_sale
            where dt = '""" + str(someday) + """'
                and sale_qtty >= 0
               
            ) b
        on a.current_source_store_id = b.store_id
            and a.sku_id = b.sku_id
        where  to_date(out_wh_tm) is not null
    """
    hc.sql(sql_hot_sku_data_all).createOrReplaceTempView("tb_sku_data_all")

    # 2.2. Get the SKU flag information
    # When a SKU has multiple current source stores, it is considered selectable as long as it could possibly be picked: white_flag = 1
    # When a SKU has multiple current source stores, it is kept fixed in the hot-item store as soon as it appears in set1: unset1_flag = 0

    sql_hot_sku_all_flag = """
        select sku_id
                ,future_source_store_id
                ,max(white_flag) as white_flag
                ,min(unset1_flag) as unset1_flag
        from
            (select sku_id
                   ,white_flag
                   ,future_source_store_id
                   ,case when future_source_store_id <> hot_sku_target_store_id then 1 else 0 end as unset1_flag
            from app.app_wil_hot_sku_all
            ) a
        group by sku_id,future_source_store_id

    """
    hc.sql(sql_hot_sku_all_flag).createOrReplaceTempView("tb_sku_all_flag")

    # At the order level assign each ord-sku pair a weight ord_weight: if an order contains 3 SKUs,
    # that order contributes 1/3 to each of those SKUs' total order counts
    sql_hot_ord_cnt = """
        select aa.sub_dt
           ,aa.sku_id
           ,round(sum(case when aa.ord_weight is not null then aa.ord_weight else 1.0 end),2) as ord_cnt
        from
        (select a.*
               ,b.ord_weight
        from tb_sku_data_all a
        left join
            (
            select ord_id
                    ,cast(1/count(distinct sku_id) as float) as ord_weight
            from tb_sku_data_all
            group by ord_id
            ) b
        on a.ord_id = b.ord_id
        ) aa
    group by sub_dt,sku_id
    """
    hc.sql(sql_hot_ord_cnt).createOrReplaceTempView("tb_hot_ord_cnt")

    # -- If the target ratio is a, select a min(0.15 + a, 1) share of each day's orders.
    # -- Different SKUs have different hot_sku_out_store_rate values depending on their
    #    future_source_store_id, so the required share is computed per source store.
    # ---------- Computing the stores separately effectively truncates the order list, so even
    #            with identical ratio values the separate results will not match the combined result.
    # ---------- Therefore check whether the configuration really contains two distinct values:
    #            if it does, select per store; if there is only one value, combine the stores and
    #            use the other selection algorithm.
    sql_sku_default_ratio = """
        select future_source_store_id
           ,round(avg(hot_sku_out_store_rate),2) as default_ratio
        from app.app_wil_hot_sku_all
        where future_source_store_id != hot_sku_target_store_id and future_source_store_id is not null
        group by future_source_store_id
    """
    value = hc.sql(sql_sku_default_ratio).rdd.map(list).filter(
        lambda x: x[1] is not None).collect()
    if len(value) == 2 and value[0][1] != value[1][1]:
        v_future_source_store_id_1 = value[0][0]
        v_future_source_store_id_2 = value[1][0]
        v_param_1 = min(0.15 + value[0][1], 1)
        v_param_2 = min(0.15 + value[1][1], 1)
        #     Check whether different future_source_store_id values map to different default_ratio
        #     values; if so, take the two default_ratio values and run <<Algorithm 1>> on each.
        #     Example parameters for the SQL below:
        #                         38 -> 0.50  (18960 SKUs in the background set)
        #                         39 -> 0.50  (2710 SKUs in the background set)
        # 2.4.2.1 Algorithm 1 (select separately for the two source stores)
        sql_hot_sku_list_1 = """
               select c.sub_dt,
                       c.sku_id
                FROM
                (
                select e.sub_dt
                ,e.sku_id
                from
                (select a.sub_dt
                       ,a.sku_id
                       ,cast(SUM(a.ord_cnt * b.canchoose_flag) OVER (PARTITION BY a.sub_dt ORDER BY a.ord_cnt desc ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as float) AS CumulativeTotal
                       ,cast(SUM(a.ord_cnt * b.background_flag) OVER (PARTITION BY a.sub_dt) * CAST ('""" + str(
            v_param_1) + """' as float) as float) AS TotalOrd
                from tb_hot_ord_cnt a
                left join
                    (select sku_id
                            ,case when white_flag = 1 and unset1_flag = 1 then 1 else 0 end as canchoose_flag
                            ,case when future_source_store_id = '""" + str(
                v_future_source_store_id_1
            ) + """' then 1 else 0 end as background_flag
                    from tb_sku_all_flag
                    ) b
                on a.sku_id = b.sku_id
                ) e
                where e.CumulativeTotal <= e.TotalOrd
                ) c 
                join
                (
                    select distinct sku_id
                    from 
                    tb_sku_all_flag 
                    where white_flag = 1 and unset1_flag = 1
                )d
                on c.sku_id = d.sku_id          
                
        """
        hc.sql(sql_hot_sku_list_1).createOrReplaceTempView("tb_hot_sku_list_1")
        sql_select_result_1 = """
            select a.sku_id
               ,sum(a.cnt) as re_times
            from
            (select sku_id
                    ,1 as cnt
            from tb_hot_sku_list_1
            union all
            select sku_id
                   ,4 as cnt
            from tb_hot_sku_list_1
            where sub_dt between '""" + str(
            three_days_ago) + """' and '""" + str(yesterday) + """'
            ) a
          group by a.sku_id
          having re_times > 18
        """
        hc.sql(sql_select_result_1).createOrReplaceTempView(
            "tb_select_result_1")
        sql_hot_sku_list_2 = """
        select c.sub_dt,
                       c.sku_id
                       FROM 
                (
                select a.sub_dt
                ,a.sku_id
                from
                (select a.sub_dt
                       ,a.sku_id
                       ,cast(SUM(a.ord_cnt * b.canchoose_flag) OVER (PARTITION BY a.sub_dt ORDER BY a.ord_cnt desc ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as float) AS CumulativeTotal
                       ,cast(SUM(a.ord_cnt * b.background_flag) OVER (PARTITION BY a.sub_dt) * CAST ('""" + str(
            v_param_2) + """' as float) as float) AS TotalOrd
                from tb_hot_ord_cnt a
                left join
                    (select sku_id
                            ,case when white_flag = 1 and unset1_flag = 1 then 1 else 0 end as canchoose_flag
                            ,case when future_source_store_id = '""" + str(
                v_future_source_store_id_2
            ) + """' then 1 else 0 end as background_flag
                    from tb_sku_all_flag
                    ) b
                on a.sku_id = b.sku_id
                ) a
                where a.CumulativeTotal <= a.TotalOrd
                ) c 
                join
                (
                    select distinct sku_id
                    from 
                    tb_sku_all_flag 
                    where white_flag = 1 and unset1_flag = 1
                )d
                on c.sku_id = d.sku_id          
                  
                    
            """
        hc.sql(sql_hot_sku_list_2).createOrReplaceTempView("tb_hot_sku_list_2")

        sql_select_result_2 = """
                select a.sku_id
                   ,sum(a.cnt) as re_times
                from
                (select sku_id
                        ,1 as cnt
                from tb_hot_sku_list_2
                union all
                select sku_id
                       ,4 as cnt
                from tb_hot_sku_list_2
                where sub_dt between '""" + str(
            three_days_ago) + """' and '""" + str(yesterday) + """'
                ) a
              group by a.sku_id
              having re_times > 16
            """
        hc.sql(sql_select_result_2).createOrReplaceTempView(
            "tb_select_result_2")
        #
        # sql_result = """
        #     insert overwrite table dev.dev_ipc_ioa_hot_select_result
        #     select * from tb_select_result_1
        #     union
        #     select * from tb_select_result_2
        # """
        # hc.sql(sql_result)
        # Final selection result
        partition = """dt='""" + str(today) + """',pid='""" + pid + """',ts='""" + ts + """'"""
        sql_select_result = """
                        insert overwrite table app.app_wil_hot_sku_selected partition(""" + partition + """)
                        select a.sku_id
                        from 
                       ( select * from tb_select_result_1
                        union
                        select * from tb_select_result_2) a
                    """
        hc.sql(sql_select_result)
    else:
        v_param = min(0.15 + value[0][1], 1)
        sql_hot_sku_list = """
                    select c.sub_dt,
                       c.sku_id
                       FROM 
                (
                select e.sub_dt
                ,e.sku_id
                from
                (select a.sub_dt
                       ,a.sku_id
                       ,cast(SUM(a.ord_cnt * b.canchoose_flag) OVER (PARTITION BY a.sub_dt ORDER BY a.ord_cnt desc ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as float) AS CumulativeTotal
                       ,cast(SUM(a.ord_cnt) OVER (PARTITION BY a.sub_dt) * CAST ('""" + str(
            v_param) + """' as float) as float) AS TotalOrd
                from tb_hot_ord_cnt a
                left join
                    (select sku_id
                            ,case when white_flag = 1 and unset1_flag = 1 then 1 else 0 end as canchoose_flag
                    from tb_sku_all_flag
                    ) b
                on a.sku_id = b.sku_id
                ) e
                where e.CumulativeTotal <= e.TotalOrd
                ) c 
                join
                (
                    select distinct sku_id
                    from 
                    tb_sku_all_flag 
                    where white_flag = 1 and unset1_flag = 1
                )d
                on c.sku_id = d.sku_id          
            """
        hc.sql(sql_hot_sku_list).createOrReplaceTempView("tb_hot_sku_list")
        partition = """dt='""" + str(
            today) + """',pid='""" + pid + """',ts='""" + ts + """'"""
        sql_select_result = """
                    insert overwrite table app.app_wil_hot_sku_selected partition(""" + partition + """)
                    select b.sku_id
                    from 
                   ( select a.sku_id
                       ,sum(a.cnt) as re_times
                    from
                    (
                    select sku_id
                            ,1 as cnt
                    from tb_hot_sku_list
                    union all
                    
                    select sku_id
                           ,4 as cnt
                    from tb_hot_sku_list
                    where sub_dt between '""" + str(
            three_days_ago) + """' and '""" + str(yesterday) + """') a
                  group by a.sku_id
                  having re_times > 0) b
                """
        hc.sql(sql_select_result)
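
# (Illustrative sketch, not part of the original job.) The window-function queries above keep,
# per day, the highest-volume SKUs whose running order total stays within min(0.15 + ratio, 1)
# of that day's total. The same cut in plain Python, with made-up numbers and ignoring the
# white_flag/unset1_flag filtering for brevity:
def select_hot_skus(ord_cnt_by_sku, ratio):
    # ord_cnt_by_sku: {sku_id: weighted order count} for a single day
    budget = sum(ord_cnt_by_sku.values()) * min(0.15 + ratio, 1)
    selected, running = [], 0.0
    for sku, cnt in sorted(ord_cnt_by_sku.items(), key=lambda kv: kv[1], reverse=True):
        running += cnt
        if running > budget:
            break
        selected.append(sku)
    return selected

# select_hot_skus({"a": 40, "b": 25, "c": 20, "d": 15}, ratio=0.5)  ->  ["a", "b"]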
Beispiel #39
0
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("LogisticRegressionWithSGD").setExecutorEnv("PYTHON_EGG_CACHE","/tmp/geap")
sc = SparkContext(conf = conf)
sc.addPyFile("hdfs://nameservice1/user/geap/warehouse/lib/numpy.egg")

from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD

import numpy
import logging

# retain items etype=="pv", chkout==50 and total_price<=100000
# cks, ckp, acc, aid, chkout, ua, res, ip, genre, igenre, itemid, ni, price, ts
def filterPoint(line):
	try:
		value = line.split("\t")
	except Exception, e:
		logging.exception(e)
	#etype = value[1]
	chkout = ""
	try:
		chkout = value[4]
	except Exception, e:
		logging.exception(e)
	if chkout == "50":
		try:
			prices = [int(i) for i in eval(value[12])]
			num = [int(i) for i in eval(value[11])]
			if len(prices) == len(num):
Beispiel #40
0
		totalLocations = totalLocations.union(busyLocations)
		totalRoutes = totalRoutes.union(busyRoutes)

	totalTimes.reduceByKey(add).sortBy(lambda x:x[1], False).saveAsTextFile("/zitong/output/Busy_Times")
	totalLocations.reduceByKey(add).sortBy(lambda x:x[1], False).saveAsTextFile("/zitong/output/Busy_Locations")
	totalRoutes.reduceByKey(add).sortBy(lambda x:x[1], False).saveAsTextFile("/zitong/output/Busy_Routes")

if __name__ == "__main__":
	conf = SparkConf()
	conf.setAppName(APP_NAME)
	conf.setMaster('yarn-client')
	conf.set('spark.executor.memory', '1g')
	conf.set('spark.executor.cores', '1')
	conf.set('spark.executor.instances', '5')
	sc = SparkContext(conf=conf)
	sc.addPyFile("shapefile.py")
	COUNTIES = ['Albany', 'Allegany', 'Bronx', 'Broome', 'Cattaraugus', 'Cayuga', 
			'Chautauqua', 'Chemung', 'Chenango', 'Clinton', 'Columbia', 'Cortland', 
			'Delaware', 'Dutchess', 'Erie', 'Essex', 'Franklin', 'Fulton', 'Genesee', 
			'Greene', 'Hamilton', 'Herkimer', 'Jefferson', 'Kings', 'Lewis', 
			'Livingston', 'Madison', 'Monroe', 'Montgomery', 'Nassau', 'New York', 
			'Niagara', 'Oneida', 'Onondaga', 'Ontario', 'Orange', 'Orleans', 'Oswego', 
			'Otsego', 'Putnam', 'Queens', 'Rensselaer', 'Richmond', 'Rockland', 'Saratoga', 
			'Schenectady', 'Schoharie', 'Schuyler', 'Seneca', 'St. Lawrence', 'Steuben', 'Suffolk', 
			'Sullivan', 'Tioga', 'Tompkins', 'Ulster', 'Warren', 'Washington', 'Wayne', 
			'Westchester', 'Wyoming', 'Yates']
	spatialIdx = readSpatialIndex("spatialIdx.csv")
	spatialIdx = sc.broadcast(spatialIdx)
	couties = sc.broadcast(COUNTIES)
	main(sc)
        if(longitude<-140 and longitude>=-180):
            result.append("node_3")
        elif (longitude >140 and longitude<=180):
            result.append("node_1");
            
    return result;
        
    
                
if __name__ == '__main__':
    import happybase;
    # configure the spark environment
    sparkConf = SparkConf().setAppName("Simulating Streamline");
    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sc = SparkContext(conf=sparkConf);
    sc.addPyFile("module.zip");      
#     from pywebhdfs.webhdfs import PyWebHdfsClient;
    distributed_dataset = sc.textFile("hdfs:/user/uacharya/subset_dataset_1934.txt",use_unicode=False,minPartitions=24);
    print("this is the driver container");
    # getting the header of the whole dataset
    header = distributed_dataset.first();
    # filtering the header out of the data 
    distributed_dataset = distributed_dataset.filter(lambda d: d != header);
    # mapping the data to prepare for processing
    data_in_required_format = distributed_dataset.map(create_required_datewise_data);
    data_in_required_format.cache();
    #collecting keys to do batch processing based on keys
    temp = set(data_in_required_format.keys().collect());
    print("total keys "+str(len(temp)));
    #sorting keys to create data in chronological order based on date
    sorted_keys = sorted(temp,key=int);
Beispiel #42
0
def main():
	
	config = configparser.ConfigParser()
	config.read('config.ini')

	#Path for Gromacs project
	gromacs_path = preparing_path(config.get('DRUGDESIGN', 'gromacs_path'))
	#Path where PDB ligand are - They are NOT participated in docking
	pdb_ligand_path = config.get('DEFAULT', 'pdb_ligand_path')
	#Path that contains all files for analysis
	path_analysis = config.get('DEFAULT', 'path_analysis')	
	#Path where all pdb receptor are
	path_receptor_pdb = config.get('DEFAULT', 'pdb_path')	
	#Path for saving pdb files of models generated by VS
	path_analysis_pdb = get_directory_pdb_analysis(path_analysis)
	
	# Create SPARK config
	maxResultSize = str(config.get('SPARK', 'maxResultSize'))
	conf = (SparkConf().set("spark.driver.maxResultSize", maxResultSize))

	# Create context
	sc = SparkContext(conf=conf)

	#Adding Python Source file
	#Path for drugdesign project
	path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')	
	sc.addPyFile(os.path.join(path_spark_drugdesign,"vina_utils.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"os_utils.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"gromacs_utils.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"pdb_io.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"json_utils.py"))

	#Adding bash scripts	
	sc.addFile(os.path.join(path_spark_drugdesign,"make_ndx_buried_area_ligand.sh"))	

	#Parameters form command line
	#Indicates probe. Example: 0.14
	probe = float(sys.argv[1])
	#Indicates ndots. Example: 24
	ndots = int(sys.argv[2])

	#Broadcast
	path_analysis_pdb_complex_b = sc.broadcast(path_analysis_pdb)
	gromacs_path = sc.broadcast(gromacs_path)	 
	pdb_ligand_path = sc.broadcast(pdb_ligand_path)
	probe = sc.broadcast(probe)
	ndots = sc.broadcast(ndots)

	start_time = datetime.now()

	os.environ["GMX_MAXBACKUP"]="-1"

	#Loading all PDB receptor files into memory
	list_all_pdb_receptor_files_path = []
	all_receptor_for_complex = get_files_pdb(path_receptor_pdb)
	for receptor in all_receptor_for_complex:
		list_all_pdb_receptor_files_path.append(loading_pdb_2_list(receptor))

	for pdb_receptor_files in list_all_pdb_receptor_files_path:
		#Getting receptor name by fully path
		base_file_name_receptor = get_name_receptor_pdb(str(pdb_receptor_files[0]))
		#PDB file loaded into memory is sent by broadcast
		pdb_file_receptor = pdb_receptor_files[1]
		pdb_file_receptor = sc.broadcast(pdb_file_receptor)
		#Loading PDB model files based on receptor into memory
		base_file_name_receptor_for_filter = base_file_name_receptor+"_-_"
		all_model_for_complex = get_files_pdb_filter(path_analysis_pdb,base_file_name_receptor_for_filter)
		all_model_for_complexRDD = sc.parallelize(all_model_for_complex)
		all_model_filesRDD = all_model_for_complexRDD.map(loading_pdb_2_list).collect()

# ********** Starting function **********************************************************		
		def save_model_receptor(list_receptor_model_file):
			receptor_file = pdb_file_receptor.value #Obtained from broadcast
			model_file = list_receptor_model_file[0]			
			full_path_for_save_complex = list_receptor_model_file[1]
			#Open file for writting the complex
			f_compl = open(full_path_for_save_complex, "w")
			#Insert lines of receptor
			for item in  receptor_file:
				f_compl.write(item)
			#Insert lines of model and insert Z chain
			for item in model_file:
				item = replace_chain_atom_line(item,"d","z")
				f_compl.write(item)
			f_compl.close()
# ********** Finish function **********************************************************					

# ********** Starting function **********************************************************		
		def compute_buried_area_ligand(pdb_complex):
			chZ = "chZ"
			buried_lig_rec_perc = -1.0
			buried_lig_rec = -1.0
			buried_lig_lig = -1.0
			buried_lig_lig_perc = -1.0
			base_name = get_name_model_pdb(pdb_complex)		
			ligand_name = get_ligand_from_receptor_ligand_model(base_name)
			receptor_name = get_receptor_from_receptor_ligand_model(base_name)
			pose = get_model_from_receptor_ligand_model(base_name)						
			pdb_before_vs = os.path.join(pdb_ligand_path.value,ligand_name+".pdb")			
			#ndx files					
			f_ndx = os.path.join(path_analysis_pdb_complex_b.value,base_name+".ndx")			
			#xvg files
			xvg_temp_sasa_lig_pose = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_lig_pose"+".xvg")
			xvg_temp_sasa_lig_complex = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_lig_complex"+".xvg")
			xvg_temp_sasa_lig_min = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_lig_min"+".xvg")
			# Creates a selection with the residues that are closer than 6A to the ligand
			script_make_ndx_buried_area_ligand = SparkFiles.get("make_ndx_buried_area_ligand.sh") #Getting bash script that was copied by addFile command
			command = script_make_ndx_buried_area_ligand + " " + gromacs_path.value + " "+ pdb_complex + " "+ f_ndx + " "+  xvg_temp_sasa_lig_pose + " "+ str(probe.value)  + " "+ str(ndots.value)  + " "+  xvg_temp_sasa_lig_complex  + " "+ pdb_before_vs  + " "+  xvg_temp_sasa_lig_min
			process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
			stdout, stderr = process.communicate()			
			try:
				# SASA of the isolated ligand in the pose conformation			
				sasa_lig_pose = get_value_from_xvg_sasa(xvg_temp_sasa_lig_pose)
				# SASA of the complexed ligand in the pose conformation
				sasa_lig_complex = get_value_from_xvg_sasa(xvg_temp_sasa_lig_complex)
				# SASA of the isolated ligand in its energy-minimized conformation. Only for carbohydrates!
				sasa_lig_min = get_value_from_xvg_sasa(xvg_temp_sasa_lig_min)
				# Area of the ligand which is buried in the receptor
				buried_lig_rec = sasa_lig_pose - sasa_lig_complex
				buried_lig_rec_perc = buried_lig_rec / sasa_lig_pose
				# Area of the ligand in the pose conformation which is buried in itself when compared to the energy-minimized conformation
				buried_lig_lig = sasa_lig_min - sasa_lig_pose
				buried_lig_lig_perc = buried_lig_lig / sasa_lig_min
				returned_list = (base_name, buried_lig_rec, buried_lig_rec_perc, buried_lig_lig, buried_lig_lig_perc)

				#Deleting files
				os.remove(f_ndx)			
				os.remove(xvg_temp_sasa_lig_pose)
				os.remove(xvg_temp_sasa_lig_complex)
				os.remove(xvg_temp_sasa_lig_min)

				return returned_list
			except:
				return (base_name, float(0.0), float(0.0), float(0.0), float(0.0))
# ********** Finish function **********************************************************					

# ********** Starting function **********************************************************		
		def build_list_model_for_complex(model):
			full_path_model = model[0]
			model_file = model[1]
			path_pdb_complex = path_analysis_pdb_complex_b.value #Obtained from broadcast
			#Building complex file based on model file name
			base_name_model = get_name_model_pdb(full_path_model)
			complex_name = "compl_"+base_name_model+".pdb"
			full_path_for_save_complex = os.path.join(path_pdb_complex,complex_name)
			list_receptor_model_file = (model_file, full_path_for_save_complex)						
			save_model_receptor(list_receptor_model_file)			
			list_ret = compute_buried_area_ligand(full_path_for_save_complex)			
			os.remove(full_path_for_save_complex)
			return list_ret
# ********** Finish function **********************************************************	

		all_model_filesRDD = sc.parallelize(all_model_filesRDD)
		all_model_filesRDD = all_model_filesRDD.map(build_list_model_for_complex).collect()	
		#Saving buried area of residue receptor
		full_area_file  = os.path.join(path_analysis,base_file_name_receptor+".ligandArea")
		save_buried_area_ligand(full_area_file, all_model_filesRDD)

	#Loading all area file 
	all_area_file = os.path.join(path_analysis,"*.ligandArea")		
	buried_areaRDD = sc.textFile(all_area_file).map(loading_lines_from_ligandArea_files).collect()	

	#Sorting by buried_lig_lig column
	buried_area_sorted_by_buried_lig_rec = sorting_buried_area_ligand(sc, buried_areaRDD)
	buried_area_sorted_by_buried_lig_rec = buried_area_sorted_by_buried_lig_rec.map(lambda p: (p.pose, p.buried_lig_rec, p.buried_lig_rec_perc, p.buried_lig_lig, p.buried_lig_lig_perc) ).collect() #p.receptor, p.ligand, p.model

	#Saving buried area ligand file
	path_file_buried_area = os.path.join(path_analysis, "summary_buried_area_ligand.dat")
	save_buried_area_ligand_sort(path_file_buried_area, buried_area_sorted_by_buried_lig_rec)	

	#Removing all area files
	all_area_files = get_files_ligandArea(path_analysis)
	for area_file in all_area_files:
		os.remove(area_file)

	finish_time = datetime.now()

	save_log(finish_time, start_time)
Beispiel #43
0
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("LogisticRegressionWithSGD").setExecutorEnv(["PYTHON_EGG_CACHE","/tmp/geap"),("SPARK_LIBRARY_PATH", "$SPARK_LIBRARY_PATH:$HADOOP_HOME/lib/native")])
sc = SparkContext(conf = conf)
sc.addPyFile("hdfs://nameservice1/tmp/geap/numpy.egg")

from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD

import numpy
import json

# retain items etype=="pv", chkout==50 and total_price<=100000
# cks, ckp, acc, aid, chkout, ua, res, ip, genre, igenre, itemid, ni, price, ts
def filterPoint(line):
	value = json.loads(line[1])
	etype = value.get("etype")
	chkout = value.get("chkout")
	if chkout == "50":
		prices = [int(i) for i in value.get("price")]
		num = [int(i) for i in value.get("ni")]
		if len(prices) == len(num):
			total_price = sum([a*b for a, b in zip(prices, num)])
		else:
			return False
		if total_price <= 100000:
			return True
	return False

def parsePoint(line):
    topics_reduced = {}
    for sc, v in shortest_paths.items():
        for id, topic, count in v:
            if topic not in topics_reduced:
                topics_reduced[topic] = 1
            else:
                topics_reduced[topic] += 1

    return (p_id, topics_reduced)


conf = SparkConf().setAppName("entity_topics")
sc = SparkContext(conf=conf)
pp = pprint.PrettyPrinter(width=100)

sc.addPyFile('/home/username/src/pyspark/dist/libs.zip')
sys.path.insert(0, SparkFiles.get('/home/username/src/pyspark/dist/libs.zip'))
import networkx

_topics = [(693763, 'Academic disciplines'), (4892515, 'Arts'),
           (771152, 'Business'), (24980271, 'Concepts'), (694861, 'Culture'),
           (696763, 'Education'), (693016, 'Entertainment'),
           (2766046, 'Events'), (693800, 'Geography'), (751381, 'Health'),
           (693555, 'History'), (1004110, 'Humanities'), (8017451, 'Language'),
           (691928, 'Law'), (2389032, 'Life'), (690747, 'Mathematics'),
           (696603, 'Nature'), (691008, 'People'), (691810, 'Philosophy'),
           (695027, 'Politics'), (722196, 'Reference'), (692694, 'Religion'),
           (691182, 'Science'), (1633936, 'Society'), (693708, 'Sports'),
           (696648, 'Technology'), (48005914, 'Universe'), (3260154, 'World')]
topics = sc.broadcast(_topics)
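
# (Illustrative sketch, not from the original job.) With libs.zip shipped via addPyFile,
# worker-side functions can import networkx lazily and read the broadcast topic list.
# The edge-list input and the RDD name "graph_edges_rdd" are hypothetical.
def count_topic_nodes(edges):
    import networkx as nx  # resolved from the libs.zip distributed above
    g = nx.DiGraph()
    g.add_edges_from(edges)
    topic_ids = {topic_id for topic_id, _ in topics.value}
    return sum(1 for topic_id in topic_ids if topic_id in g)

# topic_counts = graph_edges_rdd.map(count_topic_nodes)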
def main():
	
	sc = SparkContext()
	config = configparser.ConfigParser()
	config.read('config.ini')

	#Broadcast - global
	path_pdbqt     = config.get('DEFAULT', 'pdbqt_ligand_path')
	pythonsh       = config.get('VINA', 'pythonsh')
	script_ligand4 = config.get('VINA', 'script_ligand4')
	database_comp  = config.get('DEFAULT', 'ligand_database_path_file')
	pdb_ligand_path = config.get('DEFAULT', 'pdb_ligand_path')
	script_pdbqt_to_pdb = config.get('VINA', 'script_pdbqt_to_pdb')
	path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')

	#Adding Python Source file
	sc.addPyFile(os.path.join(path_spark_drugdesign,"vina_utils.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"pdbqt_io.py"))

	#creating pdbqt path
	if not os.path.isdir(path_pdbqt):
		os.mkdir(path_pdbqt)

	#creating PDB path
	if not os.path.isdir(pdb_ligand_path):
		os.mkdir(pdb_ligand_path)

	start_time = datetime.now()

	#preparing compound list
	list_obj_lig_vina = []
	mol2_files = vina_utils.get_files_mol2(config.get('DEFAULT', 'mol2_path'))
	for fmol2 in mol2_files:
		obj_lig_vina = (path_pdbqt, pythonsh,script_ligand4, fmol2)
		list_obj_lig_vina.append(obj_lig_vina)

	molRDD = sc.parallelize(list_obj_lig_vina)	
	molRDD.foreach(prepare_ligand)

	# *** Preparation of compound list finished. Now, it is able to create the database

	#preparing environment for creating database
	prepare_for_creating_database(database_comp, path_pdbqt)

	#preparing pdbqt list
	list_obj_pdbqt = []
	pdbqt_files = vina_utils.get_files_pdbqt(path_pdbqt)
	for fpdbqt in pdbqt_files:
		list_obj_pdbqt.append(fpdbqt)

	#applying map and collect
	pdbqtRDD = sc.parallelize(list_obj_pdbqt)
	all_lines = pdbqtRDD.map(build_compound_database).collect()
	
	#creating database file
	save_database(database_comp, all_lines)

	#converting ligand pdbqt to pdb
	list_pdbqt_files_lig = []
	all_pdbqt_files_lig = vina_utils.get_files_pdbqt(path_pdbqt)
	for pdbqt_files_lig in all_pdbqt_files_lig:
		list_pdbqt_files_lig.append( (pdbqt_files_lig, pdb_ligand_path, pythonsh, script_pdbqt_to_pdb) )
	pdbqt_files_ligRDD = sc.parallelize(list_pdbqt_files_lig)
	pdbqt_files_ligRDD.foreach(pdbqt2pdb)

	finish_time = datetime.now()

	save_log(finish_time, start_time)
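
# (Illustrative sketch, not the project's actual implementation.) prepare_ligand is defined
# elsewhere in this project; a worker of this shape would unpack the tuple built above and
# run AutoDockTools' prepare_ligand4 script through pythonsh. The command-line flags and
# output naming are assumptions.
def prepare_ligand_sketch(obj_lig_vina):
	import os
	from subprocess import Popen, PIPE
	path_pdbqt, pythonsh, script_ligand4, fmol2 = obj_lig_vina
	name = os.path.splitext(os.path.basename(fmol2))[0]
	out_pdbqt = os.path.join(path_pdbqt, name + ".pdbqt")
	command = " ".join([pythonsh, script_ligand4, "-l", fmol2, "-o", out_pdbqt])
	proc = Popen(command, shell=True, stdout=PIPE, stderr=PIPE)
	proc.communicate()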
def main():

	sc = SparkContext()
	sqlCtx = SQLContext(sc)

	config = configparser.ConfigParser()
	config.read('config.ini')

	#Path for drugdesign project
	path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')
	#Detect interactions program
	detect_hbonds_program = config.get('DRUGDESIGN', 'detect_hbonds_program') 	
	#Path where all pdb receptor are
	path_receptor_pdbqt = config.get('DEFAULT', 'pdbqt_receptor_path')
	#Path that contains all files for analysis
	path_analysis = config.get('DEFAULT', 'path_analysis') 
	#Ligand Database file
	ligand_database  = config.get('DEFAULT', 'ligand_database_path_file')	
	#Path of pdbqt model
	path_analysis_pdbqt_model = get_directory_pdbqt_analysis(path_analysis)
	#Path analysis temp
	path_analysis_temp = get_directory_temp_analysis(path_analysis)

	#Getting parameters
	# cutoff for hydrogen bind
	distance_cutoff = float(sys.argv[1])
	angle_cutoff = float(sys.argv[2])

	#Adding Python Source file
	sc.addPyFile(os.path.join(path_spark_drugdesign,"vina_utils.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"pdbqt_io.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"database_io.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"json_utils.py"))

	start_time = datetime.now()

	#broadcast
	path_analysis_temp_b = sc.broadcast(path_analysis_temp)
	detect_hbonds_program_b = sc.broadcast(detect_hbonds_program)
	distance_cutoff_b = sc.broadcast(distance_cutoff)
	angle_cutoff_b = sc.broadcast(angle_cutoff)
#******************* start function ************************************************
	def get_hydrogen_bind(ligand_pdbqt):

		#getting base name
		base_name = get_name_model_pdb(ligand_pdbqt)

		#temporary_lig_no
		temporary_lig_no = base_name+"_temporary_lig_no"
		list_param = ["C", "O", "N", "HD", "HS"]
		list_atom_pdbqt = get_atom_section_from_atom_list(ligand_pdbqt, list_param)	
		list_ref = get_lig_values_from_atom_list_2_hydrogen_bind(list_atom_pdbqt)
		path_file_lig_no = os.path.join(path_analysis_temp_b.value, temporary_lig_no)
		save_text_file_from_list(path_file_lig_no, list_ref)
		total_lig_no = int(get_line_number(path_file_lig_no)) 

		#temporary_rec_no
		temporary_rec_no = base_name+"_temporary_rec_no"
		list_param = ["C", "OA", "N", "HD", "HS", "SA", "A"]
		list_atom_pdbqt = get_atom_section_from_atom_list(receptor_b.value, list_param)	
		list_ref = get_receptor_values_from_atom_list_2_hydrogen_bind(list_atom_pdbqt)
		path_file_rec_no = os.path.join(path_analysis_temp_b.value, temporary_rec_no)
		save_text_file_from_list(path_file_rec_no, list_ref)
		total_rec_no = int(get_line_number(path_file_rec_no)) 

		#temporary_rec_h
		temporary_rec_h = base_name+"_temporary_rec_h"
		list_param = ["HD", "HS"]
		list_atom_pdbqt = get_atom_section_from_atom_list(receptor_b.value, list_param)	
		list_ref = get_receptor_values_from_atom_list_2_hydrogen_bind(list_atom_pdbqt)
		path_file_rec_h = os.path.join(path_analysis_temp_b.value, temporary_rec_h)
		save_text_file_from_list(path_file_rec_h, list_ref)
		total_rec_h = int(get_line_number(path_file_rec_h)) 
		
		#preparing file for saving	
		file_for_saving = base_name+".saving"
		path_file_for_saving = os.path.join(path_analysis_temp_b.value, file_for_saving)		
		if total_lig_no > 0:		
			#print detect_hbonds_program_b.value+" "+ receptor_b.value+" "+ str(total_rec_no)+" "+ ligand_pdbqt+" "+ str(total_lig_no)+" "+ str(distance_cutoff_b.value)+" "+ str(angle_cutoff_b.value)+" "+ path_file_for_saving+" "+ path_file_rec_no+" "+ path_file_lig_no+" "+ path_file_rec_h+" "+ path_file_rec_no			
			process = Popen( [detect_hbonds_program_b.value, receptor_b.value, str(total_rec_no), ligand_pdbqt, str(total_lig_no), str(distance_cutoff_b.value), str(angle_cutoff_b.value), path_file_for_saving, path_file_rec_no, path_file_lig_no, path_file_rec_h, path_file_rec_no ], stdout=PIPE, stderr=PIPE)
			stdout, stderr = process.communicate()

		os.remove(path_file_rec_no)
		os.remove(path_file_lig_no)
		os.remove(path_file_rec_h)

#******************* finish function ************************************************

	#Getting all receptors
	all_receptores = get_files_pdbqt(path_receptor_pdbqt)

	#Getting all pdbqt models
	all_pdbqt_models = get_files_pdbqt(path_analysis_pdbqt_model)
	all_pdbqt_modelsRDD = sc.parallelize(all_pdbqt_models)

	for receptor in all_receptores:
		check_temp_directory(path_analysis_temp)
		receptor_b = sc.broadcast(receptor)
		base_name_receptor = get_name_receptor_pdbqt(receptor)
		base_name_receptor = base_name_receptor+"_-_"		
		models_by_receptorRDD = all_pdbqt_modelsRDD.filter(lambda m : base_name_receptor in m).collect()
		models_by_receptorRDD = sc.parallelize(models_by_receptorRDD)
		models_by_receptorRDD.foreach(get_hydrogen_bind)

		#Getting all saving files that have lines > 0
		all_saving_files_by_receptor = get_saving_files_with_lines(path_analysis_temp, base_name_receptor)
		#Creating file based on all saving files
		create_file_receptor_all_saving_files(all_saving_files_by_receptor,base_name_receptor,path_analysis)		
		
		#Getting all saving files that have lines equal 0		
		all_saving_files_no_lines = get_saving_files_no_lines(path_analysis_temp, base_name_receptor)		
		#Creating file based on all saving files
		create_file_receptor_no_hydrogen_bonds(all_saving_files_no_lines,base_name_receptor,path_analysis)		

		#Removing temp directory
		shutil.rmtree(path_analysis_temp)

	#Starting the final analysis
	all_hydrogen_bind = get_hydrogen_bind_files(path_analysis)

	if len(all_hydrogen_bind) > 0:		
		#No Hydrogen bind
		all_NOT_hydrogen_bind = get_NOT_hydrogen_bind_files(path_analysis)
		all_NOT_hydrogen_bindRDD = sc.parallelize(all_NOT_hydrogen_bind)
		#loading from files
		all_NOT_hydrogen_bindRDD = all_NOT_hydrogen_bindRDD.flatMap(loading_from_files_NOT_hydrogen_bind).collect()		
		#loading all values from list
		all_NOT_hydrogen_bindRDD = loading_from_all_lists_NOT_hydrogen_bind(sc, all_NOT_hydrogen_bindRDD, sqlCtx)
		all_NOT_hydrogen_bindRDD.cache()		

		#Working with Hydrogen bind
		all_hydrogen_bindRDD = sc.parallelize(all_hydrogen_bind)
		#loading from files
		all_hydrogen_bindRDD = all_hydrogen_bindRDD.flatMap(loading_from_files).collect()
		#loading all values from list
		all_hydrogen_bindRDD = loading_from_all_lists(sc, all_hydrogen_bindRDD, sqlCtx)
		all_hydrogen_bindRDD.cache()
		#saving all_bonds_file	
		save_all_bonds_file(path_analysis, distance_cutoff, angle_cutoff, all_hydrogen_bindRDD)

		#number hydrogen binds of poses
		number_poseRDD = get_hbonds_number_pose(sqlCtx)
		number_poseRDD.cache()
		save_number_pose(path_analysis, distance_cutoff, angle_cutoff, number_poseRDD, all_NOT_hydrogen_bindRDD)

		#Calculating Normalized Hydrogen Bond 
		#Loading database
		rdd_database = load_database(sc, ligand_database)
		#Creating Dataframe
		database_table = sqlCtx.createDataFrame(rdd_database)	
		database_table.registerTempTable("database")

		number_pose_ligandRDD = number_poseRDD.map(lambda p: Row(numPose=int(p.numPose), ligand=get_ligand_from_receptor_ligand_model(p.pose), pose=str(p.pose) ) ).collect()
		number_pose_ligand_table = sqlCtx.createDataFrame(number_pose_ligandRDD)	
		number_pose_ligand_table.registerTempTable("pose_ligand_hb")

		#Calculating normalized Hydrogen Bond by donors_acceptors
		sql = """
				SELECT pose, (b.numPose / a.hb_donors_acceptors) as normalized_hb
				FROM database a 
				JOIN pose_ligand_hb b ON b.ligand = a.ligand
				ORDER BY normalized_hb DESC 
		      """
		#Getting all data
		full_dataRDD = sqlCtx.sql(sql) 		
		#Saving file
		save_number_pose_normalized_donors_acceptors(path_analysis, distance_cutoff, angle_cutoff, full_dataRDD)

		#Calculating normalized Hydrogen Bond by heavy atoms
		sql = """
				SELECT pose, (b.numPose / a.heavyAtom) as normalized_hb
				FROM database a 
				JOIN pose_ligand_hb b ON b.ligand = a.ligand
				ORDER BY normalized_hb DESC 
		      """
		#Getting all data
		full_dataRDD = sqlCtx.sql(sql) 		
		#Saving file
		save_number_pose_normalized_heavyAtom(path_analysis, distance_cutoff, angle_cutoff, full_dataRDD)

		#Number of hydrogen bonds per ligand (disabled)
#		number_ligandRDD = get_hbonds_number_ligand(sc, number_poseRDD, sqlCtx)
#		save_number_ligand(path_analysis, distance_cutoff, angle_cutoff, number_ligandRDD)

		#Removing all hydrogen bond files
		remove_all_hydrogen_files(all_hydrogen_bind)

	else:
		save_all_bonds_file_with_mensage(path_analysis, cutoff)

	finish_time = datetime.now()

	save_vs_hydrogen_bind_log(finish_time, start_time)
Beispiel #47
0
# Dummy Spark App demo
from pyspark import SparkContext, SparkConf
from pyspark import SparkFiles

import numpy as np
from barista.customer import Customer

conf = SparkConf().setAppName("Dummy Demo")
sc = SparkContext(conf=conf)

# Add prototxt files to Spark Context
sc.addFile("models/solver.prototxt")
sc.addFile("models/train_val.prototxt")

# Add barista module
sc.addPyFile("barista.zip")
sc.addPyFile("barista/start.py")


# Subclass generic barista Customer
class MyCustomer(Customer):
    def __init__(self, filename):
        compute_semaphore, model_semaphore, handles = \
            Customer.parse_ipc_interface_file(filename)
        Customer.__init__(self, compute_semaphore, model_semaphore, handles)

    def update_data(self):
        self.arrays['data'][:] = np.random.randn(*self.arrays['data'].shape)
        self.arrays['label'][:] = np.random.choice(
                                      xrange(10),
                                      size=self.arrays['label'].shape)
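
# Hedged continuation sketch (not part of the original example): on each executor
# the files shipped with addFile() above are resolved through SparkFiles, and a
# MyCustomer is driven for one step.  The IPC interface file path below is a
# hypothetical placeholder; the real one is produced by the barista server process.
def run_customer(_):
    solver_path = SparkFiles.get("solver.prototxt")       # local path on the worker
    customer = MyCustomer("/tmp/barista_interface.txt")   # hypothetical path
    customer.update_data()
    return solver_path

# Illustrative driver call (commented out because the interface file is hypothetical):
# print sc.parallelize(range(4), 4).map(run_customer).collect()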
Beispiel #48
0

import argparse
import glob
import os

from pyspark import SparkContext

# `load`, `stats`, and `save` are assumed to be provided by the surrounding
# thunder package; they are not part of this excerpt.

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="compute summary statistics on time series data")
    parser.add_argument("master", type=str)
    parser.add_argument("datafile", type=str)
    parser.add_argument("outputdir", type=str)
    parser.add_argument("mode",
                        choices=("mean", "median", "std", "norm"),
                        help="which summary statistic")
    parser.add_argument("--preprocess",
                        choices=("raw", "dff", "dff-highpass", "sub"),
                        default="raw",
                        required=False)

    args = parser.parse_args()

    sc = SparkContext(args.master, "stats")

    if args.master != "local":
        egg = glob.glob(os.path.join(os.environ['THUNDER_EGG'], "*.egg"))
        sc.addPyFile(egg[0])

    data = load(sc, args.datafile, args.preprocess).cache()

    vals = stats(data, args.mode)

    outputdir = args.outputdir + "-stats"

    save(vals, outputdir, "stats_" + args.mode, "matlab")
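
# Possible invocation (illustrative only; the script name and paths are assumptions):
#
#   python stats.py spark://master:7077 hdfs:///data/series.txt hdfs:///results mean --preprocess dff
#
# For a non-local master the script also expects THUNDER_EGG to point at the
# directory holding the thunder .egg so it can be shipped to the workers via addPyFile.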
Beispiel #49
0
import os
import sys

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer

from plotly.offline import download_plotlyjs, init_notebook_mode, iplot, plot
import plotly
import plotly.graph_objs as go
import pandas as pd

# init
sc = SparkContext("local", "Test_PSAML")

# Get parent directory of the tests directory
parent_dir = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
sys.path.append(os.path.join(parent_dir, "psaml"))
import psaml

sc.addPyFile(os.path.join(parent_dir, "psaml/psaml.py"))

sql_context = SQLContext(sc)

# header=false so the columns aren't named after the first row values
# inferSchema=true so that data is read in as correct data type, not just strings
data = sql_context.read.load(
    "tests/resources/iris.csv", format="com.databricks.spark.csv", header="false", inferSchema="true"
)

# now we create a vector of the input columns so they can be one column
ignore = ["C4"]  # ignore the output column
assembler = VectorAssembler(inputCols=[x for x in data.columns if x not in ignore], outputCol="features")

# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
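# Hedged continuation (the excerpt stops just before this step): a VectorIndexer
# matching the comment above might look like the following; the output column
# name is an assumption.
feature_indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=4)
# assembled = assembler.transform(data)
# indexed = feature_indexer.fit(assembled).transform(assembled)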
Beispiel #50
0
    # Top stories and map data over time
    for tag in tags:
        analysis.top_stories(df_full, df_sub, context, tag)
        analysis.top_stories(df_full, df_sub, context, tag, 10)
        map_wrap_to_pandas(df_full, context, tag)

    # Scatter, sentiment, map data
    for t in [["demP", "demN"], ["gopP", "gopN"], ["djtP", "djtN"]]:
        map_wrap_to_pandas(df_full, context, t[0], t[1])
        analysis.sentiment_over_time(df_full, context, t[0], t[1])
        analysis.scatter(df_full, df_sub, context, t[0], t[1], 1)
        analysis.scatter(df_full, df_sub, context, t[0], t[1], 100)

    # Total Republican Scatter
    analysis.total_scatter(df_full, df_sub, context)




# ---------------- END ----------------
if __name__ == "__main__":
    conf = SparkConf().setAppName("CS143 Project 2B")
    conf = conf.setMaster("local[*]")
    sc   = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    sc.addPyFile("cleantext.py")
    sc.setLogLevel("ERROR")

    main(sqlContext)
def main():

	config = configparser.ConfigParser()
	config.read('config.ini')

	#Number of poses to select by buried area
	number_poses_to_select_buried_area = int(config.get('DRUGDESIGN', 'number_poses_to_select_buried_area') )
	# list of residues to select buried area
	file_select_buried_area = config.get('DRUGDESIGN', 'file_residue_to_select_buried_area')
	#Path that contains all files for analysis
	path_analysis = config.get('DEFAULT', 'path_analysis')	
	#File for saving the filtered buried area
	result_file_to_select_buried_area = config.get('DRUGDESIGN', 'result_file_to_select_buried_area')
	#File for saving the filtered buried area only poses
	result_file_to_select_buried_area_only_pose = config.get('DRUGDESIGN', 'result_file_to_select_buried_area_only_pose')
	#Path where all pdb receptors are
	path_receptor = config.get('DEFAULT', 'pdb_path')	
	#Path for saving pdb files of models generated by VS
	path_ligand = get_directory_pdb_analysis(path_analysis)	
	#Path where the selected complexes are saved
	path_to_save = os.path.join("selected_complexo", "buried_area_residue")
	path_to_save = os.path.join(path_analysis, path_to_save)
	if not os.path.exists(path_to_save):
		os.makedirs(path_to_save)

	# Create SPARK config
	maxResultSize = str(config.get('SPARK', 'maxResultSize'))
	conf = (SparkConf().set("spark.driver.maxResultSize", maxResultSize))

	# Create context
	sc = SparkContext(conf=conf)
	sqlCtx = SQLContext(sc)

	start_time = datetime.now()

	#Broadcast
	path_to_save_b = sc.broadcast(path_to_save) 
	path_receptor_b = sc.broadcast(path_receptor) 
	path_ligand_b = sc.broadcast(path_ligand) 

	#Adding Python Source file
	#Path for drugdesign project
	path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')	
	sc.addPyFile(os.path.join(path_spark_drugdesign,"vina_utils.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"pdb_io.py"))

	#load all-residue_buried_areas.dat file
	path_file_buried_area = os.path.join(path_analysis, "all-residue_buried_areas.dat")
	all_residue = sc.textFile(path_file_buried_area)
	header = all_residue.first() #extract header	

	#Splitting file by \t
	all_residue_split = all_residue.filter(lambda x:x !=header).map(lambda line: line.split("\t"))
	all_residue_split = all_residue_split.map(lambda p: Row( residue=str(p[0]), buried_area_residue=float(p[1]), residue_sasa_buried_perc=float(p[2]), pose=str(p[3]) ))

	#Creating all_residue Dataframe
	df_all_residue = sqlCtx.createDataFrame(all_residue_split)	
	df_all_residue.registerTempTable("all_residue")

	#Creating residue list as Dataframe
	residue_list = sc.textFile(file_select_buried_area)	
	header = residue_list.first() #extract header		
	#Splitting file by \t
	residue_listRDD = residue_list.filter(lambda x:x !=header).map(lambda line: line)
	residue_listRDD = residue_listRDD.map(lambda p: Row( residue=str(p).strip() ))

	df_residue_list = sqlCtx.createDataFrame(residue_listRDD)	
	df_residue_list.registerTempTable("residue_list")

	#Getting all information based on list of residues
	sql = """
	       SELECT all_residue.*
	       FROM all_residue 
	       JOIN residue_list ON residue_list.residue = all_residue.residue	       
	      """
	df_result = sqlCtx.sql(sql)
	df_result.registerTempTable("residues_filtered_by_list")	

	#Saving result
	path_file_result_file = os.path.join(path_analysis, result_file_to_select_buried_area)
	save_result(path_file_result_file, df_result)	

	#Grouping
	sql = """
	       SELECT pose, count(*) as num_res
	       FROM residues_filtered_by_list 
	       GROUP BY pose
	       ORDER BY num_res DESC 
	      """	
	df_result = sqlCtx.sql(sql)	

	#Saving result only pose
	path_file_result_file_only_pose = os.path.join(path_analysis, result_file_to_select_buried_area_only_pose)
	save_result_only_pose(path_file_result_file_only_pose, df_result)	

	#Loading poses
	only_poseRDD = sc.textFile(path_file_result_file_only_pose)
	header = only_poseRDD.first() #extract header		
	#Splitting file by \t
	only_poseRDD = only_poseRDD.filter(lambda x:x !=header).map(lambda line: line.split("\t"))
	only_poseRDD = only_poseRDD.map(lambda p: Row( pose=str(p[0]).strip(), num_res=int(str(p[1]).strip() ) ))

	only_pose_takeRDD = only_poseRDD.take(number_poses_to_select_buried_area)

# ******************** STARTED FUNCTION ********************************
	def build_complex_from_pose_file_name(p_name):
		from vina_utils import get_receptor_from_receptor_ligand_model, get_ligand_from_receptor_ligand_model, get_model_from_receptor_ligand_model, get_separator_filename_mode
		#Broadcast
		path_to_save = path_to_save_b.value
		path_receptor = path_receptor_b.value
		path_ligand = path_ligand_b.value
		#Based on row value from dataframe
		pose_file_name = p_name.pose

		#Receptor
		receptor_file_name = get_receptor_from_receptor_ligand_model(pose_file_name)				
		receptor_file = os.path.join(path_receptor, receptor_file_name+".pdb")
		f_receptor_file = open(receptor_file,"r")
		#ligand file name
		ligand_file_name = os.path.join(path_ligand, pose_file_name+".pdb")
		f_ligand_file_name = open(ligand_file_name,"r")

		#Open file for writing the complex
		full_path_for_save_complex = os.path.join(path_to_save, pose_file_name+".pdb")
		f_compl = open(full_path_for_save_complex, "w")
		#Insert lines of receptor
		for item in f_receptor_file:
			f_compl.write(item)
		#Insert lines of model
		for item in f_ligand_file_name:		
			f_compl.write(item)
		#Closing files
		f_compl.close()
		f_ligand_file_name.close()
		f_receptor_file.close()
# ******************** FINISHED FUNCTION ********************************

	sc.parallelize(only_pose_takeRDD).foreach(build_complex_from_pose_file_name)

	finish_time = datetime.now()

	save_log(finish_time, start_time)
from __future__ import print_function
from pyspark import SparkContext
import csv
from label import *

def save_label_count(rdd, col_index, count_file_pattern, basic_type, semantic_type, label_func):
    """ Group by label column and count and save """ 
    rdd.map(lambda row: (label_func(row[col_index - 7].strip()), 1)) \
        .reduceByKey(lambda x, y: x + y) \
        .map(lambda row: '%s,%s,%s,%d' % (basic_type, semantic_type, row[0], row[1])) \
        .coalesce(1) \
        .saveAsTextFile(count_file_pattern.format(col_index))


sc = SparkContext()
sc.addPyFile('label.py')
data = sc.textFile('./NYPD_Complaint_Data_Historic.csv')

# Header
header = data.first()

# Extract column 7 - 13
rdd = data.filter(lambda row: row != header) \
    .mapPartitions(lambda row: csv.reader(row)) \
    .map(lambda row: (row[6], row[7], row[8], row[9], row[10], row[11], row[12])).cache()


count_file_pattern = 'result/label_count/col{}.out' # Save for each row a tuple (basic type, semantic type, label, count) for each column 

## Aggregate and count the label for each column. 
#Each row is assigned the count (last column) for the label (second last column): (basic_type, semantic_type, label, count)
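# Hedged sketch of the truncated aggregation step: one save_label_count() call per
# extracted column (7-13).  The basic type, semantic type and the inline label
# function below are placeholders; the real label helpers come from label.py,
# which is not part of this excerpt.
save_label_count(rdd, 7, count_file_pattern, 'TEXT', 'unspecified',
                 lambda value: 'NULL' if value == '' else 'VALID')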
Beispiel #53
0
import os
import sys

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer

from plotly.offline import download_plotlyjs, init_notebook_mode, iplot, plot
import plotly
import plotly.graph_objs as go
import pandas as pd

# init
sc = SparkContext('local', 'PSAML_Titanic')

# Get parent directory of the tests directory
parent_dir = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
sys.path.append(os.path.join(parent_dir, 'psaml'))
import psaml

sc.addPyFile(os.path.join(parent_dir, 'psaml/psaml.py'))

sql_context = SQLContext(sc)

# header=true so the columns are named from the first row values
# inferSchema=true so that data is read in as correct data type, not just strings
data = sql_context.read.load('tests/resources/titanic/train.csv', format='com.databricks.spark.csv', header='true', inferSchema='true')

# now we create a vector of the input columns so they can be one column
ignore = ['Survived', 'Name', 'Ticket', 'Cabin']  # ignore the output column and nonquantifiable data
assembler = VectorAssembler(inputCols=[x for x in data.columns if x not in ignore], outputCol='features')

# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
# (maxCategories is not set at the moment, however)
#  feature_indexer = VectorIndexer(inputCol="features", outputCol="indexed")
def main():

    sc = SparkContext()
    sqlCtx = SQLContext(sc)

    config = configparser.ConfigParser()
    config.read('config.ini')

    #Path that contains all files for analysis
    path_analysis = config.get('DEFAULT', 'path_analysis')
    #Ligand Database file
    ligand_database = config.get('DEFAULT', 'ligand_database_path_file')
    #Path for drugdesign project
    path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')

    #Adding Python Source file
    sc.addPyFile(os.path.join(path_spark_drugdesign, "vina_utils.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "database_io.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "hydrogen_bond_io.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "hydrogen_bond_crud.py"))

    #Suffix of the complete data file name
    full_data_file_name = config.get('DRUGDESIGN', 'full_data_file_name')

    start_time = datetime.now()

    #**************** Loading file that contains all scores and ligand efficiency
    score_file_name = os.path.join(path_analysis, "summary_energies.dat")
    text_file = sc.textFile(score_file_name)
    header = text_file.first()  #extract header

    #Splitting score file by \t
    rdd_vs_score_sorted_split = text_file.filter(lambda x: x != header).map(
        lambda line: line.split("\t"))
    #rdd_vs_score_sorted = rdd_vs_score_sorted_split.map(lambda p: Row(receptor=str(p[0]), ligand=str(p[1]), mode=int(p[2]), energy=float(p[3]) ))
    rdd_vs_score_sorted = rdd_vs_score_sorted_split.map(lambda p: Row(
        affinity=float(p[0]), ligand_efficiency=float(p[1]), pose=str(p[2])))
    #Creating Vina Dataframe based on the score file
    vina_table = sqlCtx.createDataFrame(rdd_vs_score_sorted)
    vina_table.registerTempTable("vina_lig_efficiency")
    #**************** Finish

    #**************** Loading Ligand Database

    rdd_database = load_database(sc, ligand_database)
    #Creating Dataframe
    database_table = sqlCtx.createDataFrame(rdd_database)
    database_table.registerTempTable("database")
    #**************** Finish

    #**************** Loading Buried Area total
    buried_area_file_name = os.path.join(path_analysis,
                                         "summary_buried_areas_total.dat")
    buried_area_file = sc.textFile(buried_area_file_name)

    #Splitting file by \t
    header = buried_area_file.first()  #extract header
    rdd_buried_area_split = buried_area_file.filter(lambda x: x != header).map(
        lambda line: line.split("\t"))
    #rdd_buried_area = rdd_buried_area_split.map(lambda p: Row( receptor=str(p[0]), ligand=str(p[1]), mode=int(p[2]), buried_lig_rec=float(p[3]), buried_lig_rec_perc=float(p[4]), buried_lig_lig_perc=float(p[5]) ))
    rdd_buried_area = rdd_buried_area_split.map(
        lambda p: Row(buried_area_total=float(p[0]), pose=str(p[1])))

    #Creating buried Dataframe
    buried_table = sqlCtx.createDataFrame(rdd_buried_area)
    buried_table.registerTempTable("buriedArea_total")
    #**************** Finish

    #**************** Loading Buried Area receptor
    buried_area_file_name = os.path.join(path_analysis,
                                         "summary_buried_areas_receptor.dat")
    buried_area_file_receptor = sc.textFile(buried_area_file_name)
    header = buried_area_file_receptor.first()  #extract header

    #Splitting file by \t
    buried_area_file_receptor_split = buried_area_file_receptor.filter(
        lambda x: x != header).map(lambda line: line.split("\t"))
    buried_area_file_receptor = buried_area_file_receptor_split.map(
        lambda p: Row(buried_area_receptor=float(p[0]), pose=str(p[1])))

    #Creating buried Dataframe
    buried_area_file_receptor_table = sqlCtx.createDataFrame(
        buried_area_file_receptor)
    buried_area_file_receptor_table.registerTempTable("buried_area_receptor")
    #**************** Finish

    #**************** Loading Buried Area ligand
    buried_area_file_name = os.path.join(path_analysis,
                                         "summary_buried_area_ligand.dat")
    buried_area_file_ligand = sc.textFile(buried_area_file_name)
    header = buried_area_file_ligand.first()  #extract header

    #Splitting file by \t
    buried_area_file_ligand_split = buried_area_file_ligand.filter(
        lambda x: x != header).map(lambda line: line.split("\t"))
    buried_area_file_ligand = buried_area_file_ligand_split.map(
        lambda p: Row(buried_area_lig=float(p[0]),
                      buried_area_lig_perc=float(p[1]),
                      buried_area_lig_lig_perc=float(p[2]),
                      pose=str(p[3])))

    #Creating buried Dataframe
    buried_area_file_ligand_table = sqlCtx.createDataFrame(
        buried_area_file_ligand)
    buried_area_file_ligand_table.registerTempTable("buried_area_ligand")
    #**************** Finish

    #**************** Loading Hydrogen Bond
    hydrogen_bond_num_pose_file_name = os.path.join(
        path_analysis, "summary_hbonds_4.0A_30.0deg.dat")
    rdd_hydrogen_bond = load_file_summary_hbonds(
        sc, hydrogen_bond_num_pose_file_name)
    #Creating hydrogen bond Dataframe
    hydrogen_bond_table = create_df_hydrogen_bond(sqlCtx, rdd_hydrogen_bond)

    #**************** Finish

    #Creating SQL command
    sql = ""
    sql = "SELECT vina_lig_efficiency.pose, vina_lig_efficiency.affinity, vina_lig_efficiency.ligand_efficiency"
    sql += " ,buriedArea_total.buried_area_total"
    sql += " ,buried_area_receptor.buried_area_receptor"
    sql += " ,buried_area_ligand.buried_area_lig, buried_area_ligand.buried_area_lig_perc, buried_area_ligand.buried_area_lig_lig_perc "
    sql += " ,hydrogenbond.numHydroBond	"
    sql += " FROM vina_lig_efficiency"
    sql += " JOIN buriedArea_total ON buriedArea_total.pose = vina_lig_efficiency.pose"
    sql += " JOIN buried_area_receptor ON buried_area_receptor.pose = vina_lig_efficiency.pose"
    sql += " JOIN buried_area_ligand ON buried_area_ligand.pose = vina_lig_efficiency.pose"
    sql += " LEFT OUTER	"
    sql += " JOIN hydrogenbond ON hydrogenbond.pose = vina_lig_efficiency.pose"
    sql += " ORDER BY vina_lig_efficiency.pose"

    #Getting all data
    full_dataRDD = sqlCtx.sql(sql)
    full_dataRDD = full_dataRDD.map(lambda p: (
        p.affinity, p.ligand_efficiency, p.numHydroBond, p.buried_area_lig, p.
        buried_area_lig_perc, p.buried_area_lig_lig_perc, p.buried_area_total,
        p.buried_area_receptor, p.pose)).collect()

    #Saving file
    save_vs_full_data(path_analysis, full_dataRDD, full_data_file_name)

    finish_time = datetime.now()

    save_vs_full_data_analysis_log(finish_time, start_time)
class CommonSparkContext(object):
    def __init__(self):
        """
        Create a spark context.

        The spark configuration is taken from $XPATTERNS_HOME/config.ini or from
        the values set in SparkInitContext.set().

        Notes
        -----
        cluster_url : str, optional
            The url of the spark cluster to use.  To use the local spark, give
            'local'.  To use a spark cluster with its master on a specific IP address,
            give the IP address or the hostname as in the following example:
            cluster_url=spark://my_spark_host:7077

        app_name : str, optional
            The app name is used on the job monitoring server, and for logging.

        cores_max : str, optional
            The maximum number of cores to use for execution.

        executor_memory : str, optional
            The amount of main memory to allocate to executors.  For example, '2g'.
        """

        env = Environment.create_default()
        config_context = {'cluster_url': env.get_config('spark', 'cluster_url', default='local'),
                          'cores_max': env.get_config('spark', 'cores_max', default='8'),
                          'executor_memory': env.get_config('spark', 'executor_memory', default='8g'),
                          'app_name': env.get_config('spark', 'app_name', 'xFrame')}
        config_context.update(SparkInitContext.context)
        config_pairs = [(k, v) for k, v in config_context.iteritems()]
        conf = (SparkConf().setAll(config_pairs))
        self._sc = SparkContext(conf=conf)
        self._sqlc = SQLContext(self._sc)

        self.zip_path = self.build_zip()
        if self.zip_path:
            self._sc.addPyFile(self.zip_path)
        atexit.register(self.close_context)

    def close_context(self):
        if self._sc:
            self._sc.stop()
            self._sc = None
            if self.zip_path:
                os.remove(self.zip_path)

    def sc(self):
        return self._sc

    def sqlc(self):
        return self._sqlc

    @staticmethod
    def build_zip():
        if 'XPATTERNS_HOME' not in os.environ:
            return None
        # This can fail at writepy if there is something wrong with the files
        #  in xpatterns.  Go ahead anyway, but things will probably fail if this job is
        #  distributed
        try:
            tf = NamedTemporaryFile(suffix='.zip', delete=False)
            z = PyZipFile(tf, 'w')
            z.writepy(os.path.join(os.environ['XPATTERNS_HOME'], 'xpatterns'))
            z.close()
            return tf.name
        except:
            print 'Zip file distribution failed -- workers will not get xpatterns code.'
            print 'Check for unexpected files in XPATTERNS_HOME/xpatterns.'
            return None
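
# Hedged usage sketch (illustration only, not part of the original xpatterns module):
if __name__ == '__main__':
    ctx = CommonSparkContext()
    print 'Spark master: {}'.format(ctx.sc().master)
    print 'SQL context ready: {}'.format(ctx.sqlc() is not None)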
import datetime
from myUtils import *
from validation_utils import *


def getDateHour(date_text):
    given_date = datetime.datetime.strptime(date_text, '%Y-%m-%d %H:%M:%S')
    year = given_date.year
    month = given_date.month
    hour = given_date.hour
    day_of_the_week = given_date.isoweekday()
    return (given_date.date(), hour ,"Valid")



sc = SparkContext()
sc.addPyFile("myUtils.py")
sc.addPyFile("validation_utils.py")

(taxi_data,prefix) = readFiles2({2016:range(1,7),2015:range(1,13),2014:range(1,13),2013:range(1,13)},sc)
	
field = taxi_data.map(lambda entry: (entry[1],checkPickUpDateValid(entry[1])))
filtered_valid_records = field.filter(lambda x: x[1] == "Valid").map(lambda x: (getDateHour(x[0]))).map(lambda x: (str(x[0])+"\t"+str(x[1]),1)).reduceByKey(lambda x,y: x+y)
	
tabSeparated =  filtered_valid_records.map(lambda x: x[0]+"\t"+str(x[1])) 
tabSeparated.saveAsTextFile("pickup_date_and_time_frequency.out")
	
sc.stop()
	

Beispiel #57
0
import json

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

# microbatch_size, topic and brokers are assumed to be defined earlier in the
# original script; they are not part of this excerpt.


def print_RDD_contents(rdd):
    """
    Print the contents of an RDD on the driver.

    Example usage: digests.foreachRDD(print_RDD_contents)
    """
    for x in rdd.collect():
        print x


##########################################
# Spark job
##########################################

# set Spark context
sc = SparkContext(appName="Latency")
sc.setLogLevel("WARN")

sc.addPyFile("../tdigest/tdigest_altered.py")  # import custom tdigest class
from tdigest_altered import TDigest

ssc = StreamingContext(sc, microbatch_size)

# create D-Stream from Kafka topic
kafka_stream = KafkaUtils.createDirectStream(ssc, [topic],
                                             {"metadata.broker.list": brokers})

# extract latency data (combined across devices)
# json schema: {u'device': u'type2', u'latency': 2.487, u'message_num': 189}
latencies = kafka_stream.map(lambda row: row[1])\
                        .map(json.loads)\
                        .map(lambda x: x["latency"])

# compute tdigest of each partition and write to redis
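# Hedged sketch of the truncated step: fold each partition of the latency stream
# into a TDigest and report a percentile.  It assumes tdigest_altered keeps the
# stock update()/percentile() interface; the redis write mentioned above is left
# out because the connection details are not part of this excerpt.
def digest_partition(values):
    digest = TDigest()
    count = 0
    for v in values:
        digest.update(v)
        count += 1
    if count:
        yield digest.percentile(95)  # e.g. p95 latency for this partition

def report_partition_p95(rdd):
    for p95 in rdd.mapPartitions(digest_partition).collect():
        print p95

latencies.foreachRDD(report_partition_p95)

# Any streaming job still needs to be started and kept alive:
ssc.start()
ssc.awaitTermination()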
class CommonSparkContext(object):
    __metaclass__ = Singleton

    def __init__(self):
        """
        Create a spark context.

        The spark configuration is taken from xframes/config.ini and from
        the values set in SparkInitContext.set() if this has been called.
        """

        # This is placed here because otherwise it causes an error when used in a spark slave.
        from pyspark import SparkConf, SparkContext, SQLContext, HiveContext
        # This reads from default.ini and then xframes/config.ini
        # if they exist.
        self._env = Environment.create()
        context = create_spark_config(self._env)
        verbose = self._env.get_config('xframes', 'verbose',
                                       'false').lower() == 'true'
        hdfs_user_name = self._env.get_config('webhdfs', 'user', 'hdfs')
        os.environ['HADOOP_USER_NAME'] = hdfs_user_name
        config_pairs = [(k, v) for k, v in context.iteritems()]
        self._config = (SparkConf().setAll(config_pairs))
        if verbose:
            print 'Spark Config: {}'.format(config_pairs)

        self._sc = SparkContext(conf=self._config)
        self._sqlc = SQLContext(self._sc)
        self._hivec = HiveContext(self._sc)
        self.zip_path = []
        version = [int(n) for n in self._sc.version.split('.')]
        self.status_tracker = self._sc.statusTracker()
        if cmp(version, [1, 4, 1]) >= 0:
            self.application_id = self._sc.applicationId
        else:
            self.application_id = None

        if verbose:
            print 'Spark Version: {}'.format(self._sc.version)
            if self.application_id:
                print 'Application Id: {}'.format(self.application_id)

        if not context['spark.master'].startswith('local'):
            zip_path = self.build_zip(get_xframes_home())
            if zip_path:
                self._sc.addPyFile(zip_path)
                self.zip_path.append(zip_path)

        trace_flag = self._env.get_config('xframes', 'rdd-trace',
                                          'false').lower() == 'true'
        XRdd.set_trace(trace_flag)
        atexit.register(self.close_context)

    def spark_add_files(self, dirs):
        """
        Adds python files in the given directory or directories.

        Parameters
        ----------
        dirs: str or list(str)
            If a str, the pathname to a directory containing a python module.
            If a list, then it is a list of such directories.

            The python files in each directory are compiled, packed into a zip, distributed to each
            spark slave, and placed in PYTHONPATH.

            This is only done if spark is deployed on a cluster.
        """
        props = self.config()
        if props.get('spark.master', 'local').startswith('local'):
            return
        if isinstance(dirs, basestring):
            dirs = [dirs]
        for path in dirs:
            zip_path = self.build_zip(path)
            if zip_path:
                self._sc.addPyFile(zip_path)
                self.zip_path.append(zip_path)

    def close_context(self):
        if self._sc:
            self._sc.stop()
            self._sc = None
            for zip_path in self.zip_path:
                os.remove(zip_path)

    def config(self):
        """
        Gets the configuration parameters used to initialize the spark context.

        Returns
        -------
        out : dict
            A dict of the properties used to initialize the spark context.
        """
        props = self._config.getAll()
        return {prop[0]: prop[1] for prop in props}

    def env(self):
        """
        Gets the config environment.

        Returns
        -------
        out : Environment
            The environment.  This contains all the values from the configuration file(s).
        """

        return self._env

    def sc(self):
        """
        Gets the spark context.

        Returns
        -------
        out : SparkContext
            The spark context.  There is a single spark context per process.
        """
        return self._sc

    def sqlc(self):
        """
        Gets the spark sql context.

        Returns
        -------
        out : sql.SqlContext
            The spark sql context.
        """
        return self._sqlc

    def hivec(self):
        """
        Gets the hive context.

        Returns
        -------
        out : sql.HiveContext
            The hive context.
        """
        return self._hivec

    def version(self):
        """
        Gets the spark version.

        Returns
        -------
        out: lst[int]
            The spark version, as a list of integers.
        """
        return [int(n) for n in self._sc.version.split('.')]

    def jobs(self):
        """
        Get the spark job ID and info for the active jobs.

        This method would normally be called by another thread from the executing job.

        Returns
        -------
        out: dict {job_id: job_info}
            A map of the active job IDs and their corresponding job info.
        """
        return {
            job_id: self.status_tracker.getJobInfo(job_id)
            for job_id in self.status_tracker.getActiveJobIds()
        }

    def cluster_mode(self):
        """
        Get the cluster mode of the spark cluster.

        Returns
        -------
        out: boolean
            True if spark is running in cluster mode.  Cluster mode means that spark is running on a platform separate
            from the program.  In practice, cluster mode means that file arguments must be located on
            a network filesystem such as HDFS or NFS.
        """
        return not self._config.get('spark.master').startswith('local')

    # noinspection PyBroadException
    @staticmethod
    def build_zip(module_dir):
        # This can fail at writepy if there is something wrong with the files
        #  in xframes.  Go ahead anyway, but things will probably fail if this job is
        #  distributed
        try:
            tf = NamedTemporaryFile(suffix='.zip', delete=False)
            z = PyZipFile(tf, 'w')
            z.writepy(module_dir)
            z.close()
            return tf.name
        except:
            logging.warn(
                'Zip file distribution failed -- workers will not get xframes code.'
            )
            logging.warn('Check for unexpected files in xframes directory.')
            return None

    @staticmethod
    def spark_context():
        """
        Returns the spark context.

        Returns
        -------
        out : pyspark.SparkContext
            The SparkContext object from spark.
        """
        return CommonSparkContext().sc()

    @staticmethod
    def spark_config():
        """
        Returns the spark config parameters.

        Returns
        -------
        out : list
            A list of the key-value pairs stored as tuples, used to initialize the spark context.
        """
        return CommonSparkContext().config()

    @staticmethod
    def spark_sql_context():
        """
        Returns the spark sql context.

        Returns
        -------
        out : pyspark.sql.SQLContext
            The SQLContext object from spark.
        """
        return CommonSparkContext().sqlc()

    @staticmethod
    def hive_context():
        """
        Returns the hive context.

        Returns
        -------
        out : pyspark.sql.HiveContext
            The Hive object from spark.
        """
        return CommonSparkContext().hivec()

    @staticmethod
    def spark_version():
        """
        Gets the spark version.

        Returns
        -------
        out: list[int]
            The spark version, as a list of integers.
        """
        return CommonSparkContext().version()

    @staticmethod
    def spark_cluster_mode():
        """
        Gets the cluster mode

        Returns
        -------
        out: boolean
            True if spark is running in cluster mode.  Cluster mode means that spark is running on a platform separate
            from the program.  In practice, cluster mode means that file arguments must be located on
            a network filesystem such as HDFS or NFS.
        """
        env = Environment.create()
        config = create_spark_config(env)
        return not config.get('spark.master').startswith('local')
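
# Hedged usage sketch (illustration only): the static helpers above expose the
# per-process context without keeping a CommonSparkContext instance around.
if __name__ == '__main__':
    sc = CommonSparkContext.spark_context()
    print 'Spark version: {}'.format(CommonSparkContext.spark_version())
    print 'Default parallelism: {}'.format(sc.defaultParallelism)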
from __future__ import print_function
from pyspark import SparkContext
import sys

if __name__ == "__main__":
    print("Hello")
    sc = SparkContext()
    sc.addPyFile("classes.zip")
    from HelperTransformations import HelperTransformations
    text_file = sc.textFile(
        's3://torstar-datateam-workspace/data/raw/samples/textfile')
    counts = text_file.map(
        lambda x: HelperTransformations.removeStringSpecialCharacters(
            x)).flatMap(lambda line: line.split(" ")).map(
                lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
    print(counts.take(5))
    counts.saveAsTextFile(
        's3://torstar-datateam-workspace/data/transformed/samples/textfile_output'
    )
    sc.stop()
Beispiel #60
0
import numpy as np
import pandas as pd

from pyspark import SparkConf, SparkContext

from .ProteinModels import xSeriesWLCe
from collections import OrderedDict
from multiprocessing import Pool
from functools import partial

#create a spark context
conf = SparkConf().setAppName("App")
conf = (conf.setMaster('local[*]').set('spark.executor.memory', '2G').set(
    'spark.driver.memory', '8G').set('spark.driver.maxResultSize', '15G'))
sc = SparkContext(conf=conf)

### NOTE: you will need to re-create this zip file every time you want to run this code. This is super annoying. We should look into using Dask instead of Spark.
print(
    'NOTE: you will need to re-create this zip file every time you want to run this code. This is super annoying. We should look into using Dask instead of Spark.'
)
sc.addPyFile("/home/tbartsch/source/repos/single_molecule_mechanics.zip")


class TimeSeriesLoader(object):
    '''Provides data structures and methods to analyze single-molecule data.'''
    def __init__(self):
        #define some default values
        data = np.empty((3, 2))
        data.fill(np.nan)
        self.properties_mephisto = pd.DataFrame(data, columns=['k', 'tau'])

        data = np.empty((6, 3))
        data.fill(np.nan)
        self.nonlin_correction = pd.DataFrame(
            data, columns=['coeff_x', 'coeff_y', 'coeff_z'])