Example #1
def ops_main(opt, config):
    """
    SSC main function
    :param opt: 
    :param config: 
    :return: 
    """
    # Initialization
    sc = SparkContext(appName="bops-ssc")
    print(sc.version)

    # Load the IP database file
    if config.IP_FILE_PATH:
        sc.addFile(config.IP_FILE_PATH)
    else:
        sc.addFile(geo_db_path)

    # Create the Spark Streaming Context; process one batch of data every 3 minutes
    step_num = 3 * 60
    ssc = StreamingContext(sc, step_num)

    # Monitor Kafka message processing
    monitor_main(opt, config, sc, ssc, step_num)

    opt_main(opt, config, sc, ssc, step_num)

    error_logs_main(opt, config, sc, ssc, step_num)

    # Start streaming processing
    ssc.start()
    # Wait for termination; any exception raised during processing will be re-raised in this thread
    ssc.awaitTermination()
def main():
    ''' Main function
        ST_AUTH - Object storage auth string where fna containers are found
        ST_USER - Object storage user token
        ST_KEY - Object storage secret token
        MAX_FILE_SIZE - Maximum file set parameter for makeblastdb
        TASKS - Number of tasks to launch, db partition factor
        MAKEBLASTDB - Location of makeblastdb executable
        OBJECT_STORES - list of source containers that built the blast db
    '''
    # Set the context
    conf = SparkConf()
    sc = SparkContext(conf=conf)
    # getAll() returns a list of (key, value) pairs, so convert to a dict before indexing
    all_config = dict(sc._conf.getAll())
    fasta_files = all_config['dirs'].split(",")

    OBJECT_STORES = ['geba']
    TASKS = 3

    # Quiet the logs
    sc.setLogLevel("WARN")

    # Set our spark database creation script and add all the files that are needed to be on the
    # remote hosts to the shell script
    ShellScript = "hdfs:///exec/spark_blast/spark_blastdb.bash"
    sc.addFile(ShellScript)

    sc.addFile("hdfs:///exec/spark_blast/makeblastdb")
    # this will be our root name for our DB names
    db_container = "blastdb_" + "-".join(
        sorted(OBJECT_STORES)) + "_" + str(TASKS)

    # Get the list of objects we are going to need, i.e. .fna files
    # Distribute our data, shuffle it in case there is any size ordering going on
    shuffle(fasta_files)
    distData = sc.parallelize(fasta_files, TASKS)

    # Pass our bash script our parameters, ideally we would like to pass the executor ID/Task ID, but
    # this doesn't appear to be available in ver 2.1.1
    pipeRDD = distData.pipe(ShellScript)

    # Now let the bash script do its work.  This will assemble and store the results of all the list of
    # fna files collected from each Object Store
    #
    # It has done its work--I toss it carelessly to fall where it may
    #   -- Walt Whitman: Leaves of Grass, Book 4 - Children of Adam, Spontaneous Me
    print("Starting to create %d blast database 'partitions'" % TASKS)
    for line in pipeRDD.collect():
        print(line)

    print("Complete")
Example #3
def spark_tob_ats_parse(input_path, output_path1, output_path2):
    sc = SparkContext(appName="tob_ats_parse")
    sc.addFile("industries.csv")
    sc.addFile("function_taxonomy.txt")
    sc.addFile("dedup_majors_v1.jsonl")
    rdd = sc.textFile(input_path) \
        .mapPartitions(tob_ats_extract_feature_mappattion)

    rdd.map(lambda line: filter_fea(line)) \
        .map(lambda line: json.dumps(line, ensure_ascii=False)) \
        .saveAsTextFile(output_path1)

    rdd.map(lambda line: json.dumps(line, ensure_ascii=False)) \
        .saveAsTextFile(output_path2)
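For illustration only (not part of the original): the reference files added with sc.addFile above are visible to executors through SparkFiles.get; the helper below is an assumed sketch of how a partition function might open one of them.

import csv
from pyspark import SparkFiles

def load_industries():
    # Hypothetical helper: read the lookup table shipped as "industries.csv"
    with open(SparkFiles.get("industries.csv")) as f:
        return list(csv.reader(f))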
class mapreduce:
    def __init__(self, path):
        conf = SparkConf()
        self.sc = SparkContext(conf=conf)
        filelist = []
        for filename in os.listdir(path):
            if filename != '.DS_Store':  #for testing locally on Mac
                filelist.append(path + filename)
        self.doc = self.sc.textFile(','.join(filelist))

    def wordCount(self):
        self.counts = self.doc.flatMap(lambda line: line.split()).map(
            lambda word: (word, 1)).reduceByKey(
                lambda x, y: x + y).sortByKey()
        return self.counts

    def doubleWordCount(self):

        # Create list of double words
        def doubleWords(line):
            line = line.split()
            doubleWords = ()
            for i in range(len(line) - 1):
                doubleWords += (line[i] + ' ' + line[i + 1], )
            return doubleWords

        self.double_counts = self.doc.flatMap(doubleWords).map(
            lambda doubleWord: (doubleWord, 1)).reduceByKey(
                lambda x, y: x + y).sortByKey()
        return self.double_counts

    def findFreq(self, filepath, filename):
        self.sc.addFile(filepath)

        # Check if the word is in the target list
        def isTarget(word):
            targetList = []
            with open(SparkFiles.get(filepath.split('/')[-1])) as publicF:
                targetList = publicF.read().split()
            if word[0] in targetList:
                return True
            else:
                return False

        self.find_freq_counts = self.doc.flatMap(
            lambda line: line.split()).map(lambda word: (word, 1)).reduceByKey(
                lambda x, y: x + y).filter(isTarget).sortByKey()
        return self.find_freq_counts
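A brief usage sketch (assumption, not from the original); the input directory and target-word file are placeholder paths.

# Hypothetical driver script exercising the class above
mr = mapreduce('/path/to/text/dir/')
print(mr.wordCount().take(10))        # first few (word, count) pairs, sorted by word
print(mr.doubleWordCount().take(10))  # first few bigram counts
print(mr.findFreq('/path/to/targets.txt', 'targets.txt').collect())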
def save_data_to_db():
    from pyspark import SparkContext, SparkConf
    from pyspark.streaming import StreamingContext

    conf = SparkConf().setMaster("localhost")
    sc = SparkContext("local[*]", "tikcket_mechine_gen")
    sc.setLogLevel("WARN")
    sc.addFile(lib_dir+'/getDistance.py')

    data_used_by_ticket_mechine_gen.drop()
    path = '/3/2014-10-15'
    for s in stations:
        full_path = data_dir_path+'v0/'+s+path
        print(full_path)
        data_to_save = getDistance.get_one_day_group_by_time(full_path, sc)
        for item in data_to_save:
            data_used_by_ticket_mechine_gen.insert({'station_name':s, 'time':item[0], 'data':item[1]})
def main():

    # Configure Spark
    conf = SparkConf()
    conf.setAppName("Application name")  # Specify the application name
    conf.set("spark.jars", "file:/shared_data/spark_jars/hadoop-openstack-3.0.0-SNAPSHOT.jar")  # Don't modify
    sc = SparkContext(conf=conf)  # Spark Context variable that will be used for all operations running on the cluster

    parser = argparse.ArgumentParser()
    parser.add_argument("backend", type=str)
    parser.add_argument("helperpath", type=str)
    parser.add_argument("shuffle_partitions", type=str)
    parser.add_argument("params", type=str)
    parser.add_argument("inputs", type=str)
    parser.add_argument("features", type=str, nargs='?')

    args = parser.parse_args()

    # Swift Connection
    if(args.backend == 'swift'):
        hadoopConf = sc._jsc.hadoopConfiguration()
        hadoopConf.set("fs.swift.impl", "org.apache.hadoop.fs.swift.snative.SwiftNativeFileSystem")
        hadoopConf.set("fs.swift.service.SparkTest.auth.url", os.environ['OS_AUTH_URL'] + "/tokens")
        hadoopConf.set("fs.swift.service.SparkTest.http.port", "8443")
        hadoopConf.set("fs.swift.service.SparkTest.auth.endpoint.prefix", "/")
        hadoopConf.set("fs.swift.service.SparkTest.region", os.environ['OS_REGION_NAME'])
        hadoopConf.set("fs.swift.service.SparkTest.public", "false")
        hadoopConf.set("fs.swift.service.SparkTest.tenant", os.environ['OS_TENANT_ID'])
        hadoopConf.set("fs.swift.service.SparkTest.username", os.environ['OS_USERNAME'])
        hadoopConf.set("fs.swift.service.SparkTest.password", os.environ['OS_PASSWORD'])

    helperpath = str(args.helperpath)  # This is passed by default
    sc.addFile(helperpath + "/utils/helper.py")  # To import custom modules
    shuffle_partitions = args.shuffle_partitions

    # Create a dict and pass it in your_module_implementation
    params = json.loads(args.params)
    inputs = json.loads(args.inputs)
    features = json.loads(args.features) if args.features else None  # Only used when you want to create a feature set

    sqlContext = SQLContext(sc)  # Create SQLContext var from SparkContext, To work with our default format of datasets i.e. Parquet
    sqlContext.setConf("spark.sql.shuffle.partitions", shuffle_partitions)  # Don't change, required for controlling parallelism

    # Pass the sc (Spark Context) and sqlContext along with the different paramters and inputs.
    module_implementation(sc, sqlContext, params=params, inputs=inputs, features=features)
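A hedged illustration (the container name and object path are hypothetical): with the fs.swift.service.SparkTest.* settings above, datasets can be addressed through swift:// URIs that reference the "SparkTest" service.

# Hypothetical read from the configured Swift service
df = sqlContext.read.parquet("swift://mycontainer.SparkTest/path/to/dataset.parquet")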
Example #7
def main():
    APP_NAME = "CS179G"

    conf = SparkConf().setAppName(APP_NAME)
    conf = conf.setMaster("spark://spark53.cs.ucr.edu:7077")
    sc = SparkContext(conf=conf)
    sc.addFile("/home/cs179g/artistData.py")
    #sc.addFile("/home/cs179g/info3.txt")
    for i in range(1, len(sys.argv)):

        textRDD = sc.textFile(sys.argv[i])

        words = textRDD.map(lambda x: x.split(
            "--------------------------------------")).map(lambda x: (x, 1))

        artistRDD = words.map(lambda x: get_artist_stats(x)).distinct().filter(
            lambda x: x is not None).filter(lambda x: x != "").filter(
                lambda row: row.artist_name != "" and row.artist_name != None)
        temp = artistRDD.map(lambda row: {'name': row.artist_name,\
            'albums': row.artist_album_num,\
            'avg_album': row.artist_avg_album_len,\
            'avg_song': row.artist_avg_song_len,\
            'cont': row.artist_content,\
            'followers': row.artist_followers,\
            'genres': row.artist_genres,\
            'pop': row.artist_pop,\
            'songs': row.artist_song_num,\
            'sum': row.artist_sum,\
            'ref_count': row.artist_ref_count,\
            'duration': row.artist_duration,\
            'sum_word': row.artist_sum_word,\
            'cont_word': row.artist_content_word,\
            'sum_count': row.artist_sum_count,\
            'cont_count': row.artist_content_count,\
            'artist_albums': row.artist_albums,\
            'album_avg_song': row.album_avg_song,\
            'album_duration': row.album_duration,\
            'album_popularity': row.album_popularity,\
            'album_release': row.album_release ,\
            'album_tracks': row.album_tracks,\
            'artist_track': row.artist_track,\
            'track_popularity': row.track_popularity,\
            'track_duration': row.track_duration})
        temp.saveToCassandra(keyspace='data', table='all_data')
Example #8
def main():
    APP_NAME = "CS179G"

    conf = SparkConf().setAppName(APP_NAME)
    conf = conf.setMaster("spark://spark53.cs.ucr.edu:7077")
    #conf = conf.setMaster("local[*]")
    sc = SparkContext(conf=conf)
    sc.addFile("/home/cs179g/artistData.py")
    sqlContext = SQLContext(sc)
    test = sqlContext.read.json("fixed_info.json")
    test = test.map(lambda x: get_artist_stats(x))
    test = test.distinct().filter(lambda x: x is not None).filter(
        lambda x: x != "").filter(
            lambda row: row.artist_name != "" and row.artist_name != None)
    temp = test.map(lambda row: {'name': row.artist_name,\
         'albums': row.artist_album_num,\
         'avg_album': row.artist_avg_album_len,\
         'avg_song': row.artist_avg_song_len,\
         'cont': row.artist_content,\
         'followers': row.artist_followers,\
         'genres': row.artist_genres,\
         'pop': row.artist_pop,\
         'songs': row.artist_song_num,\
         'sum': row.artist_sum,\
         'ref_count': row.artist_ref_count,\
         'duration': row.artist_duration,\
         'sum_word': row.artist_sum_word,\
         'cont_word': row.artist_content_word,\
         'sum_count': row.artist_sum_count,\
         'cont_count': row.artist_content_count,\
         'artist_albums': row.artist_albums,\
         'album_avg_song': row.album_avg_song,\
         'album_duration': row.album_duration,\
         'album_popularity': row.album_popularity,\
         'album_release': row.album_release ,\
         'album_tracks': row.album_tracks,\
         'artist_track': row.artist_track,\
         'track_popularity': row.track_popularity,\
         'track_duration': row.track_duration})
    temp.saveToCassandra(keyspace='data', table='all_data')
Example #9
class SparkDriver:
    def __init__(self, config):
        self.config = config
        log.debug('SPARK_CONFIG: {0}'.format(config))

        spark_conf = SparkConf().setMaster(self.config['master']).setAppName(
            datetime.now().strftime('%Y%m%d%H%M%S'))

        self.sc = SparkContext(conf=spark_conf)
        self.sqlContext = SQLContext(self.sc)

        # Test code
        if True:
            import os
            path = os.path.join('./', "test.txt")
            with open(path, "w") as testFile:
                _ = testFile.write("100")
            self.sc.addFile(path)

            result = self.sc.parallelize([1, 2, 3,
                                          4]).mapPartitions(func).collect()
            print(">>>>>>>>>>>>>>>>>", result)
Example #10
class SimplePySparkSubmit:
    """ ... """

    sc = None

    def __init__(self, master="local"):
        ''' ... '''
        from pyspark import SparkConf, SparkContext
        conf = (SparkConf().setMaster(master).setAppName("My app").set(
            "spark.executor.memory", "1g"))
        try:
            self.sc = SparkContext(conf=conf)
        except Exception as err:
            print(err)

    def calculate_iterator(self, iterator):
        ''' ... '''
        from pyspark import SparkFiles

        path = "tests/data/test.txt"
        with open(path, "w") as test_file:
            _ = test_file.write("100")
        self.sc.addFile(path)

        with open(SparkFiles.get("test.txt")) as test_file:
            file_val = int(test_file.readline())
        return [x * file_val for x in iterator]

    def test_map_reduct(self):
        ''' ... '''
        try:
            stringRDD = self.sc.parallelize(
                ['Apple', 'Orange', 'Grape', 'Banana', 'Apple'])
            print(
                stringRDD.map(lambda f: (f, 1)).reduceByKey(
                    lambda a, b: a + b).collect())
        except:
            print("Sorry")
Example #11
def main():

    spark_conf = SparkConf().setAppName("Spark Streaming MinHash")

    global sc
    sc = SparkContext(conf=spark_conf)
    sc.addFile(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
        "/lib/min_hash.py")
    sc.addFile(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
        "/lib/locality_sensitive_hash.py")
    sc.addFile(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
        "/lib/util.py")
    sc.addFile(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
        "/config/config.py")

    global ssc
    ssc = StreamingContext(sc, config.SPARK_STREAMING_MINI_BATCH_WINDOW)
    ssc.checkpoint("_spark_streaming_checkpoint")

    kafka_stream = KafkaUtils.createDirectStream(
        ssc, [config.KAFKA_TOPIC],
        {"metadata.broker.list": config.KAFKA_SERVERS})

    # Create and save MinHash and LSH or load them from file
    if (not os.path.isfile(config.MIN_HASH_PICKLE)
            or not os.path.isfile(config.LSH_PICKLE)):
        mh = MinHash(config.MIN_HASH_K_VALUE)
        lsh = LSH(config.LSH_NUM_BANDS, config.LSH_BAND_WIDTH,
                  config.LSH_NUM_BUCKETS)

        util.save_pickle_file(mh, config.MIN_HASH_PICKLE)
        util.save_pickle_file(lsh, config.LSH_PICKLE)
    else:
        mh = util.load_pickle_file(config.MIN_HASH_PICKLE)
        lsh = util.load_pickle_file(config.LSH_PICKLE)

    # Process stream
    kafka_stream.map(lambda kafka_response: json.loads(kafka_response[1]))\
        .map(lambda json_body: extract_data(json_body))\
        .foreachRDD(lambda rdd: rdd.foreachPartition(lambda question: process_mini_batch(question, mh, lsh)))

    ssc.start()
    ssc.awaitTermination()
Example #12
def main():
    ### Initialize the SparkConf and SparkContext

    ### Locations of Python files.
    sheets_loc = "/root/IdeaNets/Synapsify/Synapsify/loadCleanly/sheets.py"
    lstm_class_loc = "/root/IdeaNets/IdeaNets/models/lstm/scode/lstm_class.py"
    load_params_loc = "/root/IdeaNets/IdeaNets/models/lstm/scode/load_params.py"
    preprocess_loc = "/root/IdeaNets/IdeaNets/models/lstm/scode/synapsify_preprocess.py"

    ### Pass Python files to Spark.
    pyFiles = []
    pyFiles.append(sheets_loc)
    pyFiles.append(lstm_class_loc)
    pyFiles.append(load_params_loc)
    pyFiles.append(preprocess_loc)

    ### Automatically get the master node url from AWS, normally it is fixed.
    cmd = ["./../../spark/ec2/spark-ec2", "-r", "us-east-1", "get-master", "ruofan-cluster"]
    hostname = (
        subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()[0].split("\n")[2]
    )  ### host name of the master node.
    master_url = ""
    master_url += "spark://"
    master_url += hostname
    master_url += ":7077"
    # print master_url
    ### Initialize the spark configuration.
    conf = SparkConf().setAppName("ruofan").setMaster(master_url)
    sc = SparkContext(conf=conf, pyFiles=pyFiles)

    ### Add non-python files passing to Spark.
    sc.addFile("/root/spark/bin/nonbreaking_prefix.en")
    sc.addFile("/root/IdeaNets/IdeaNets/models/lstm/scode/tokenizer.perl")
    sc.addFile("/root/IdeaNets/Synapsify/Synapsify/loadCleanly/stopwords.txt")
    sc.addFile("/root/IdeaNets/Synapsify/Synapsify/loadCleanly/prepositions.txt")

    datafile = sc.wholeTextFiles(
        "s3n://synapsify-lstm/Synapsify_data1", use_unicode=False
    )  ### Read data directory from S3 storage.

    ### Sent the application in each of the slave node
    datafile.foreach(lambda (path, content): lstm_test(path, content))
Example #13
def init_spark_context():

    global predictionModel

    # load spark context
    conf = SparkConf().setAppName("movie_recommendation-server")

    # IMPORTANT: pass aditional Python modules to each worker
    sc = SparkContext(conf=conf, pyFiles=['webapp.py', 'service_func.py'])

    # absolute path in hdfs
    # to run locally, remove first slash '/' i.e my_model1, not /my_model1

    predictionModel = DecisionTreeModel.load(sc, '/my_model1')
    sc.addFile('conv/6.p')
    sc.addFile('conv/7.p')
    sc.addFile('conv/8.p')
    sc.addFile('conv/10.p')
    sc.addFile('conv/12.p')
    sc.addFile('conv/36.p')

    return sc
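An illustrative sketch (assumed, not from the original) of how a task could load one of the pickles distributed above by its basename.

import pickle
from pyspark import SparkFiles

def load_converter(name):
    # e.g. load_converter('6.p') returns whatever object was pickled into conv/6.p
    with open(SparkFiles.get(name), 'rb') as fh:
        return pickle.load(fh)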
Example #14
    def sc(self):  # noqa
        if not self._spark_context:
            spark_context = SparkContext(conf=self.spark_config)

            assert self.spex_conf.spex_file is not None, "The spex builder must be broken I do not know my spex conf!"
            spark_context.addFile(self.spex_conf.spex_file)

            for py_file in self.spex_conf.spark_config.py_files:
                spark_context.addPyFile(py_file)

            for file in self.spex_conf.spark_config.files:  # noqa
                spark_context.addFile(file)

            for jar in self.spex_conf.spark_config.jars:  # noqa
                spark_context.addFile(jar)

            self._spark_context = spark_context
            print_banner(self)
        return self._spark_context
def trial_case(results,
               seed=180555,
               context='wstack',
               nworkers=8,
               threads_per_worker=1,
               processes=True,
               order='frequency',
               nfreqwin=7,
               ntimes=3,
               rmax=750.0,
               facets=1,
               wprojection_planes=1,
               parallelism=16):
    npol = 1

    if parallelism == -1:
        parallelism = None

    np.random.seed(seed)
    results['seed'] = seed

    start_all = time.time()

    results['context'] = context
    results['hostname'] = socket.gethostname()
    results['git_hash'] = git_hash()
    results['epoch'] = time.strftime("%Y-%m-%d %H:%M:%S")

    zerow = False
    print("Context is %s" % context)

    results['nworkers'] = nworkers
    results['threads_per_worker'] = threads_per_worker
    results['processes'] = processes
    results['order'] = order
    results['nfreqwin'] = nfreqwin
    results['ntimes'] = ntimes
    results['rmax'] = rmax
    results['facets'] = facets
    results['wprojection_planes'] = wprojection_planes

    print("At start, configuration is {0!r}".format(results))

    conf = SparkConf().setMaster("local[4]")
    sc = SparkContext(conf=conf)
    sc.addFile("./LOWBD2.csv")
    sc.addFile("./sc256")
    sc.addFile("./SKA1_LOW_beam.fits")
    # sc.addFile("./GLEAM_EGC.fits")

    frequency = np.linspace(0.8e8, 1.2e8, nfreqwin)
    if nfreqwin > 1:
        channel_bandwidth = np.array(nfreqwin * [frequency[1] - frequency[0]])
    else:
        channel_bandwidth = np.array([1e6])
    times = np.linspace(-np.pi / 3.0, np.pi / 3.0, ntimes)

    phasecentre = SkyCoord(ra=+30.0 * u.deg,
                           dec=-60.0 * u.deg,
                           frame='icrs',
                           equinox='J2000')
    config = 'LOWBD2'
    polarisation_frame = PolarisationFrame("stokesI")
    #add broadcast value for telescope_management_data
    telescope_management = telescope_management_handle_locality(
        sc, config, rmax)
    telescope_management_data = telescope_data_generate_locality(
        telescope_management,
        times=times,
        frequencys=frequency,
        channel_bandwidth=channel_bandwidth,
        weight=1.0,
        phasecentre=phasecentre,
        polarisation_frame=polarisation_frame,
        order=order)
    key, meta = next(telescope_management_data)
    print(key)
    print(meta["frequencys"])
    broadcast_tele = sc.broadcast(telescope_management_data)

    vis_graph_list = create_simulate_vis_graph(
        sc,
        'LOWBD2',
        frequency=frequency,
        channel_bandwidth=channel_bandwidth,
        times=times,
        phasecentre=phasecentre,
        order=order,
        format='blockvis',
        rmax=rmax)

    print("****** Visibility creation ******")

    wprojection_planes = 1
    vis = None
    for v in vis_graph_list.collect():
        if v[0][2] == 0:
            vis = v[1]
            break

    advice = advise_wide_field(convert_blockvisibility_to_visibility(vis),
                               guard_band_image=6.0,
                               delA=0.02,
                               facets=facets,
                               wprojection_planes=wprojection_planes,
                               oversampling_synthesised_beam=4.0)

    kernel = advice['kernel']

    npixel = advice['npixels2']
    cellsize = advice['cellsize']
    print(cellsize)
    print(npixel)

    if context == 'timeslice' or context == 'facets_timeslice':
        vis_slices = ntimes
    elif context == '2d' or context == 'facets':
        vis_slices = 1
        kernel = '2d'
    else:
        vis_slices = advice['vis_slices']

    # vis_slices = 4
    results['vis_slices'] = vis_slices
    results['cellsize'] = cellsize
    results['npixel'] = npixel
    print(vis_slices)

    gleam_model_graph = create_low_test_image_from_gleam_spark(
        sc=sc,
        npixel=npixel,
        frequency=frequency,
        channel_bandwidth=channel_bandwidth,
        cellsize=cellsize,
        phasecentre=phasecentre,
        polarisation_frame=PolarisationFrame("stokesI"),
        flux_limit=0.1,
        applybeam=False)

    start = time.time()
    print("****** Starting GLEAM model creation ******")
    # gleam_model_graph.cache()
    # gleam_model_graph.collect()

    print("****** Finishing GLEAM model creation *****")
    end = time.time()
    results['time create gleam'] = end - start
    print("Creating GLEAM model took %.2f seconds" % (end - start))

    vis_graph_list = create_predict_graph_first(gleam_model_graph,
                                                broadcast_tele,
                                                vis_slices=vis_slices,
                                                facets=facets,
                                                context=context,
                                                kernel=kernel,
                                                nfrequency=nfreqwin)
    start = time.time()
    print("****** Starting GLEAM model visibility prediction ******")
    # vis_graph_list.cache()
    # vis_graph_list.collect()
    end = time.time()
    results['time predict'] = end - start
    print("GLEAM model Visibility prediction took %.2f seconds" %
          (end - start))

    # Correct the visibility for the GLEAM model
    print("****** Visibility corruption ******")
    vis_graph_list = create_corrupt_vis_graph(vis_graph_list, phase_error=1.0)
    start = time.time()
    vis_graph_list.cache()
    vis_graph_list.collect()
    end = time.time()
    results['time corrupt'] = end - start
    print("Visibility corruption took %.2f seconds" % (end - start))

    # Create an empty model image
    model_graph = create_empty_image(
        vis_graph_list,
        npixel=npixel,
        cellsize=cellsize,
        frequency=frequency,
        channel_bandwidth=channel_bandwidth,
        polarisation_frame=PolarisationFrame("stokesI"))

    model_graph.cache()
    model_graph.collect()

    # psf_graph = create_invert_graph(vis_graph_list, model_graph, vis_slices=vis_slices, context=context, facets=facets,
    #                                 dopsf=True, kernel=kernel)
    #
    # start = time.time()
    # print("****** Starting PSF calculation ******")
    # psfs = psf_graph.collect()
    # psf = None
    # for i in psfs:
    #     if i[0][2] == 0:
    #         psf = i[1][0]
    # end = time.time()
    # results['time psf invert'] = end - start
    # print("PSF invert took %.2f seconds" % (end - start))
    #
    # results['psf_max'] = qa_image(psf).data['max']
    # results['psf_min'] = qa_image(psf).data['min']
    #
    # print(results['psf_max'])
    # print(results['psf_min'])
    #
    #
    # dirty_graph = create_invert_graph(vis_graph_list, model_graph, vis_slices=vis_slices, context=context, facets=facets,
    #                                 kernel=kernel)
    #

    # start = time.time()
    # print("****** Starting dirty image calculation ******")
    # dirtys  = dirty_graph.collect()
    # dirty, sumwt = (None, None)
    # for i in dirtys:
    #     if i[0][2] == 0:
    #         dirty, sumwt = i[1]
    #
    # print(psf.shape)
    # print(dirty.shape)
    # end = time.time()
    # results['time invert'] = end - start
    # print("Dirty image invert took %.2f seconds" % (end - start))
    # print("Maximum in dirty image is ", numpy.max(numpy.abs(dirty.data)), ", sumwt is ", sumwt)
    # qa = qa_image(dirty)
    # results['dirty_max'] = qa.data['max']
    # results['dirty_min'] = qa.data['min']
    #
    # start = time.time()
    # print("***** write data to file *****")
    # export_images_to_fits(psfs, nfreqwin, "psf.fits")
    # export_images_to_fits(dirtys, nfreqwin, "dirty.fits")
    # end = time.time()
    # results['time write'] = end - start

    print("****** Starting ICAL ******" + " parallelism = " + str(parallelism))
    start = time.time()
    residual_graph, deconvolve_graph, restore_graph = create_ical_graph_locality(
        sc,
        vis_graph_list,
        model_graph,
        nchan=nfreqwin,
        context=context,
        vis_slices=vis_slices,
        facets=facets,
        first_selfcal=1,
        algorithm='msclean',
        nmoments=3,
        niter=1000,
        fractional_threshold=0.1,
        scales=[0, 3, 10],
        threshold=0.1,
        nmajor=5,
        gain=0.7,
        timeslice='auto',
        global_solution=True,
        window_shape='quarter',
        parallelism=parallelism)

    deconvolveds = deconvolve_graph.collect()
    residuals = residual_graph.collect()
    restores = restore_graph.collect()

    end = time.time()
    results['time ICAL'] = end - start
    print("ICAL graph execution took %.2f seconds" % (end - start))

    residual = None
    for i in residuals:
        if i[0][2] == 0:
            residual = i[1][0]
    print(residual)
    qa = qa_image(residual)
    results['residual_max'] = qa.data['max']
    results['residual_min'] = qa.data['min']
    export_images_to_fits(residuals, nfreqwin,
                          "pipelines-timings-delayed-ical_residual.fits")

    deconvolve = None
    for i in deconvolveds:
        if i[0][2] == 0:
            deconvolve = i[1]
    print(deconvolve)
    qa = qa_image(deconvolve)
    results['deconvolved_max'] = qa.data['max']
    results['deconvolved_min'] = qa.data['min']
    export_images_to_fits(deconvolveds,
                          nfreqwin,
                          "pipelines-timings-delayed-deconvolved.fits",
                          has_sumwt=False)

    restore = None
    for i in restores:
        if i[0][2] == 0:
            restore = i[1]
    print(restore)
    qa = qa_image(restore)
    results['restored_max'] = qa.data['max']
    results['restored_min'] = qa.data['min']
    export_images_to_fits(restores,
                          nfreqwin,
                          "pipelines-timings-delayed-restored.fits",
                          has_sumwt=False)

    end_all = time.time()
    results['time overall'] = end_all - start_all

    print("At end, results are {0!r}".format(results))

    sc.stop()

    return results
Example #17
# Dummy Spark App demo
from pyspark import SparkContext, SparkConf
from pyspark import SparkFiles

import numpy as np
from barista.customer import Customer

conf = SparkConf().setAppName("Dummy Demo")
sc = SparkContext(conf=conf)

# Add prototxt files to Spark Context
sc.addFile("models/solver.prototxt")
sc.addFile("models/train_val.prototxt")

# Add barista module
sc.addPyFile("barista.zip")
sc.addPyFile("barista/start.py")


# Subclass generic barista Customer
class MyCustomer(Customer):
    def __init__(self, filename):
        compute_semaphore, model_semaphore, handles = \
            Customer.parse_ipc_interface_file(filename)
        Customer.__init__(self, compute_semaphore, model_semaphore, handles)

    def update_data(self):
        self.arrays['data'][:] = np.random.randn(*self.arrays['data'].shape)
        self.arrays['label'][:] = np.random.choice(
                                      xrange(10),
                                      size=self.arrays['label'].shape)
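A hypothetical usage sketch (not from the original); the interface filename is a placeholder and only the barista API shown above is assumed.

# Hypothetical driver-side usage of the customer defined above
customer = MyCustomer("ipc_interface.txt")  # placeholder path to a barista IPC interface file
customer.update_data()                      # fills the shared 'data' and 'label' arrays with random values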
Example #18
import os
from pyspark import SparkFiles, SparkConf, SparkContext

# sparkConf = SparkConf().setAppName("cz").setMaster("local[2]")
# sc = SparkContext(sparkConf)
sc = SparkContext('local[1]', 'pyspark')

tempdir = "D:\panrui\我的桌面\learning file\data\\"
path = os.path.join(tempdir, "test.txt")
with open(path, "w") as TextFile:
    _ = TextFile.write("100")
sc.addFile(path)


def func(iterator):
    with open(SparkFiles.get("test.txt")) as textFile:
        fileVal = int(textFile.readline())
        return [x * fileVal for x in iterator]


if __name__ == '__main__':
    sc.parallelize([1, 2, 3, 4]).mapPartitions(func).collect()
Example #19
from pyspark.sql.types import *
from pyspark.ml.clustering import *
from pyspark.ml.feature import *
from pyspark.ml.linalg import *
from pyspark.ml.linalg import SparseVector, DenseVector, VectorUDT
from pyspark.mllib.linalg.distributed import IndexedRowMatrix
from operator import add
from distribute_riak import *
from sklearn.neighbors import LSHForest
import numpy as np
import yaml

from pyspark import SparkContext
from pyspark.sql import SQLContext

#create spark context and SQL context
sc = SparkContext(appName="Recommend")
sqlContext = SQLContext(sc)

sc.addFile("settings.yaml")
sc.addPyFile("distribute_riak.py")

#load settings.yaml
with open("settings.yaml", 'r') as stream:
    try:
        settings = yaml.load(stream)
    except yaml.YAMLError as exc:
        print(exc)

#read in vector data from S3
subreddit_vectors_df = sqlContext.read.parquet(settings['subreddit-vectors'])
author_vectors_df = sqlContext.read.parquet(settings['author-vectors'])

#filter out inactive subs
inactive_subs = sqlContext.read.parquet(
import sys

from pyspark import SparkConf, SparkContext


def runFPGrowth(data, minSupport):
    freqItems = getFrequentItems(data, minSupport)
    freqItemsets = getFrequentItemsets(data, minSupport, freqItems)
    return freqItemsets


if __name__ == "__main__":

    APP_NAME = "FPGrowth"

    conf = SparkConf().setAppName(APP_NAME)
    conf = conf.setMaster("local[*]")  ##comment this if working on server

    sc = SparkContext(conf=conf)
    sc.addFile("fpTree.py")
    # sc.setLogLevel("ERROR")

    finput = sys.argv[1]
    foutput = sys.argv[2]
    numPartitions = int(sys.argv[4])
    # file = open("output.txt",'w+')
    threshold = float(sys.argv[3])

    data = sc.textFile(
        finput,
        numPartitions).map(lambda x: [int(y) for y in x.strip().split(' ')])

    minSupport = data.count() * threshold / 100
    freqItems = getFrequentItems(data, minSupport)
    rank = dict([(index, item) for (item, index) in enumerate(freqItems)])
Example #21
def main():
    conf = (SparkConf()
            .setMaster("local[*]")
            .setAppName("compare_engine"))

    sc = SparkContext(conf=conf)
    sc.setLogLevel('INFO')

    sc.addFile(primary)

    # rdd_primary = sc.textFile(primary, minPartitions=4, use_unicode=True).distinct() 
    rdd_primary = sc.textFile(SparkFiles.get(primary), minPartitions=4, use_unicode=True).distinct() 
    rdd_primary.partitionBy(10).cache()

    os.system('rm -Rf collects_*')
    os.system('rm -Rf holder.txt')
       
    rdd_secondary = sc.textFile(secondary, minPartitions=4, use_unicode=True).distinct()
    rdd_secondary.partitionBy(10).cache()

    primary_count = rdd_primary.count()
    primary_report['count'] = primary_count
    print(primary_report)

    secondary_count = rdd_secondary.count()
    secondary_report['count'] = secondary_count
    print(secondary_report)

    # Return each Primary file line/record not contained in Secondary
    not_in_primary  = rdd_primary.subtract(rdd_secondary)
    primary_diff = not_in_primary.count()
    primary_report['diff'] = primary_diff
    
    os.system('rm -Rf collects_*.csv')

    primary_dir = 'collects_{}_primary'.format(run_date)
    primary_report_name = 'collects_{}_primary_report.csv'.format(run_date)

    not_in_primary.coalesce(1, True).saveAsTextFile(primary_dir)

    # os.system('cat collects_{}_primary/part-0000* >> collects_{}_primary_report.csv'.format(run_date, run_date))
    os.system('cat {}/part-0000* >> {}'.format(primary_dir, primary_report_name))
    os.system('wc -l collects_{}_primary_report.csv'.format(run_date))

    # Flip Primary Vs Secondary
    # Return each Secondary file line/record not contained in Primary
    not_in_secondary  = rdd_secondary.subtract(rdd_primary)
    secondary_diff = not_in_secondary.count()
    secondary_report['diff'] = secondary_diff

    not_in_secondary.coalesce(1,True).saveAsTextFile('collects_{}_secondary'.format(run_date))
    os.system('cat collects_{}_secondary/part-0000* >> collects_{}_secondary_report.csv'.format(run_date, run_date))
    os.system('wc -l collects_{}_secondary_report.csv'.format(run_date))

    process_report['primary'] = primary_report
    process_report['secondary'] =  secondary_report

    print("=" * 100)
    print('\n')
    print(process_report)
    print('\n')
    print("=" * 100)
    spark_details(sc)
    

    sc.stop()
    # fetch the results
    result = map(lambda x: (x[0], json.loads(x[1].data)), requests)
    # remove any empty results and return
    return filter(lambda x: x[1] is not None, result)


def fetchCallSigns(input):
    """Fetch call signs"""
    return input.mapPartitions(lambda callSigns: processCallSigns(callSigns))

contactsContactList = fetchCallSigns(validSigns)

# Compute the distance of each call using an external R program
distScript = os.getcwd()+"/src/R/finddistance.R"
distScriptName = "finddistance.R"
sc.addFile(distScript)


def hasDistInfo(call):
    """Verify that a call has the fields required to compute the distance"""
    requiredFields = ["mylat", "mylong", "contactlat", "contactlong"]
    return all(map(lambda f: call[f], requiredFields))


def formatCall(call):
    """Format a call so that it can be parsed by our R program"""
    return "{0},{1},{2},{3}".format(
        call["mylat"], call["mylong"],
        call["contactlat"], call["contactlong"])

pipeInputs = contactsContactList.values().flatMap(
import itertools
import sys


def _getCountryByIP(ip):
    citydb = geoIP.Reader(SparkFiles.get('GeoLite2-City.mmdb'))
    return (citydb.city(ip).country.name or u'Unknown').encode()

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print >> sys.stderr, "Usage: forgeInternationalAccess <date> <hour>"
        exit(-1)

    spark = SparkContext(appName='ForgeGeoAccess')
    spark.addPyFile('hdfs://digiledap/user/spark/share/lib/accessLogParser.py')
    spark.addFile('hdfs://digiledap/user/spark/share/lib/GeoLite2-City.mmdb')

    from accessLogParser import *
    from snakebite.client import Client

    hdfsHandle = Client('hmaster01')
    hosts = spark.parallelize(hdfsHandle.ls(['/flume/events/apache_access_combined/']))\
                 .filter(lambda dirs: dirs['file_type'] == 'd')\
                 .map(lambda directory: 'hdfs://digiledap%s' % directory['path'])\
                 .collect()

    rdds = {
        item.split('/')[-1]: spark.textFile('%s/%s/%s' % (item, sys.argv[1], sys.argv[2])) for item in hosts
    }

    results = {
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from  pyspark.mllib.regression import LabeledPoint
from random import randint
from  pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.linalg import SparseVector
from pyspark import SparkContext
from pyspark import SparkFiles
from functools import partial

trainF="./data/train" #the path to where the train data is

sc = SparkContext(appName="Simple App")  #initialize the spark context
#since we are not in the command line interface we need to add to the spark context
#some of our classes so that they are available to the workers
sc.addFile("/home/christos.giatsidis/data_camp_2015_dec/helpers.py") 
sc.addFile("/home/christos.giatsidis/data_camp_2015_dec/exctract_terms.py")
#now if we import these files they will also be available to the workers
from helpers import *
import exctract_terms as et



# load data : data is a list with the text per doc in each cell. 
#Y is the respective class value
#1 :positive , 0 negative
print "loading local data"
data,Y=lf.loadLabeled(trainF) 

print "preprocessing"
pp.proc(data) #clean up the data from  number, html tags, punctuations (except for "?!." ...."?!" are replaced by "."
import re
import sys

from pyspark import SparkContext

#Create Spark Context with the master details and the application name
sc = SparkContext("spark://localhost:7077", "max_temperature")

#Add a file to be downloaded with this Spark job on every node.
sc.addFile("/home/bigdatavm/Code/Spark/filter_weather_records.rb")

#Create an RDD from the input data in HDFS
weatherData = sc.textFile("hdfs://localhost:9000/user/bigdatavm/input")

#Transform the data to extract/filter and then find the max temperature
max_temperature_per_year = weatherData.pipe("../ruby/filter_weather_records.rb").map(lambda x: (x.split("\t")[0], x.split("\t")[1])).reduceByKey(lambda a,b : a if int(a) > int(b) else b).coalesce(1)

#Save the RDD back into HDFS
max_temperature_per_year.saveAsTextFile("hdfs://localhost:9000/user/bigdatavm/output")
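A hedged aside (not the author's code): because the Ruby script is shipped with sc.addFile above, the pipe call could also resolve it through SparkFiles instead of a relative path; the lines below are illustrative only.

from pyspark import SparkFiles

# Illustrative alternative: resolve the distributed script by basename
weatherData.pipe(SparkFiles.get("filter_weather_records.rb"))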
def main():

	config = configparser.ConfigParser()
	config.read('config.ini')

	#Path for Gromacs project
	gromacs_path = preparing_path(config.get('DRUGDESIGN', 'gromacs_path'))
	#Path where PDB ligand are - They are NOT participated in docking
	pdb_ligand_path = config.get('DEFAULT', 'pdb_ligand_path')
	#Path that contains all files for analysis
	path_analysis = config.get('DEFAULT', 'path_analysis')
	#Ligand Database file
	ligand_database  = config.get('DEFAULT', 'ligand_database_path_file')
	#Path where all pdb receptor are
	path_receptor_pdb = config.get('DEFAULT', 'pdb_path')
	#Path for saving pdb files of models generated by VS
	path_analysis_pdb = get_directory_pdb_analysis(path_analysis)

	# Create SPARK config
	maxResultSize = str(config.get('SPARK', 'maxResultSize'))
	conf = (SparkConf().set("spark.driver.maxResultSize", maxResultSize))

	# Create context
	sc = SparkContext(conf=conf)
	sqlCtx = SQLContext(sc)

	#Adding Python Source file
	#Path for drugdesign project
	path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')
	sc.addPyFile(os.path.join(path_spark_drugdesign,"vina_utils.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"os_utils.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"gromacs_utils.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"pdb_io.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"database_io.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"json_utils.py"))

	#Adding bash scripts
	sc.addFile(os.path.join(path_spark_drugdesign,"make_ndx_buried_area_total.sh"))
	sc.addFile(os.path.join(path_spark_drugdesign,"make_sasa_rec_buried_area_total.sh"))

	#Parameters from command line
	#Indicates probe. Example: 0.14
	probe = float(sys.argv[1])
	#Indicates ndots. Example: 24
	ndots = int(sys.argv[2])

	#Broadcast
	path_analysis_pdb_complex_b = sc.broadcast(path_analysis_pdb)
	gromacs_path = sc.broadcast(gromacs_path)
	pdb_ligand_path = sc.broadcast(pdb_ligand_path)
	probe = sc.broadcast(probe)
	ndots = sc.broadcast(ndots)

	start_time = datetime.now()

	os.environ["GMX_MAXBACKUP"]="-1"

	#Loading all PDB receptor files into memory
	list_all_pdb_receptor_files_path = []
	all_receptor_for_complex = get_files_pdb(path_receptor_pdb)
	for receptor in all_receptor_for_complex:
		list_all_pdb_receptor_files_path.append(loading_pdb_2_list(receptor))

	#Computing Buried areas
	for pdb_receptor_files in list_all_pdb_receptor_files_path:
		#Getting receptor name by fully path
		base_file_name_receptor = get_name_receptor_pdb(str(pdb_receptor_files[0]))
		#PDB file loaded into memory is sent by broadcast
		pdb_file_receptor = pdb_receptor_files[1]
		pdb_file_receptor = sc.broadcast(pdb_file_receptor)
		#Loading PDB model files based on receptor into memory
		base_file_name_receptor_for_filter = base_file_name_receptor+"_-_"
		all_model_for_complex = get_files_pdb_filter(path_analysis_pdb,base_file_name_receptor_for_filter)
		all_model_for_complexRDD = sc.parallelize(all_model_for_complex)
		all_model_filesRDD = all_model_for_complexRDD.map(loading_pdb_2_list).collect()
# ********** Starting function **********************************************************
		def compute_buried_area(pdb_complex):
			chZ = "chZ"

			sasa_complex = -1.0
			sasa_rec = -1.0
			sasa_lig = -1.0
			buried_total = -1.0

			returned_list = []

			try:
				base_name = get_name_model_pdb(pdb_complex)
				ligand_name = get_ligand_from_receptor_ligand_model(base_name)
				f_pdb_ligand_no_docking = os.path.join(pdb_ligand_path.value,ligand_name+".pdb")
				f_ndx = os.path.join(path_analysis_pdb_complex_b.value,base_name+".ndx")

				f_temp_sasa_complex = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_complex.xvg")
				f_temp_sasa_rec = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_rec.xvg")
				f_temp_sasa_lig = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_lig.xvg")

				# Makes the index file with the ligand (chain z) and the rest (non chain z)
				script_make_ndx = SparkFiles.get("make_ndx_buried_area_total.sh") #Getting bash script that was copied by addFile command
				command = script_make_ndx + " " + gromacs_path.value + " "+ pdb_complex + " "+ f_ndx
				process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
				stdout, stderr = process.communicate()

				command = gromacs_path.value +"gmx sasa -f " + pdb_complex + " -s " + pdb_complex + " -nopbc " + " -n " + f_ndx + " -surface System " + " -output System "+ " -xvg none " + " -o " + f_temp_sasa_complex
				process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
				stdout, stderr = process.communicate()

				# Makes f_temp_sasa_rec file
				script_make_sasa_rec = SparkFiles.get("make_sasa_rec_buried_area_total.sh") #Getting bash script that was copied by addFile command
				command = script_make_sasa_rec + " " + gromacs_path.value + " "+ pdb_complex + " "+ f_ndx + " " + f_temp_sasa_rec
				process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
				stdout, stderr = process.communicate()

				command = gromacs_path.value +"gmx sasa -f " + pdb_complex + " -s " + pdb_complex + " -nopbc " + " -n " + f_ndx + " -surface chZ " + " -output chZ "+ " -xvg none " + " -o " +  f_temp_sasa_lig
				process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
				stdout, stderr = process.communicate()

				sasa_complex = get_value_from_xvg_sasa(f_temp_sasa_complex)
				sasa_rec = get_value_from_xvg_sasa(f_temp_sasa_rec)
				sasa_lig = get_value_from_xvg_sasa(f_temp_sasa_lig)

				buried_total = sasa_rec + sasa_lig - sasa_complex

				#Generating result - see column sorting because the resulting file will be created based on this sorting
				returned_list = (base_name, buried_total)

			except:
				returned_list = (base_name, float(0))

			#Deleting files
			if os.path.exists(f_ndx):
				os.remove(f_ndx)
			if os.path.exists(f_temp_sasa_complex):
				os.remove(f_temp_sasa_complex)
			if os.path.exists(f_temp_sasa_rec):
				os.remove(f_temp_sasa_rec)
			if os.path.exists(f_temp_sasa_lig):
				os.remove(f_temp_sasa_lig)

			return returned_list
# ********** Finish function **********************************************************

# ********** Starting function **********************************************************
		def save_model_receptor(list_receptor_model_file):
			receptor_file = pdb_file_receptor.value #Obtained from broadcast
			model_file = list_receptor_model_file[0]
			full_path_for_save_complex = list_receptor_model_file[1]
			#Open file for writing the complex
			f_compl = open(full_path_for_save_complex, "w")
			#Insert lines of receptor
			for item in  receptor_file:
				f_compl.write(item)
			#Insert lines of model and insert Z chain
			for item in model_file:
				item = replace_chain_atom_line(item,"d","z")
				f_compl.write(item)
			f_compl.close()
# ********** Finish function **********************************************************

# ********** Starting function **********************************************************
		def build_list_model_for_complex(model):
			full_path_model = model[0]
			model_file = model[1]
			path_pdb_complex = path_analysis_pdb_complex_b.value #Obtained from broadcast
			#Building complex file based on model file name
			base_name_model = get_name_model_pdb(full_path_model)
			complex_name = "compl_"+base_name_model+".pdb"
			full_path_for_save_complex = os.path.join(path_pdb_complex,complex_name)
			list_receptor_model_file = (model_file, full_path_for_save_complex)
			save_model_receptor(list_receptor_model_file)
			list_ret = compute_buried_area(full_path_for_save_complex)
			os.remove(full_path_for_save_complex)
			return list_ret
# ********** Finish function **********************************************************

		all_model_filesRDD = sc.parallelize(all_model_filesRDD)
		all_model_filesRDD = all_model_filesRDD.map(build_list_model_for_complex).collect()
		#Saving buried area of receptor
		full_area_file  = os.path.join(path_analysis,base_file_name_receptor+".area")
		save_receptor_buried_area(full_area_file, all_model_filesRDD)

	#Loading all area file
	all_area_file = os.path.join(path_analysis,"*.area")
	buried_areaRDD = sc.textFile(all_area_file).map(loading_lines_from_area_files).collect()

	#Sorting by buried_total column
	buried_area_sorted_by_buried_total = sorting_buried_area(sc, buried_areaRDD)
	buried_area_sorted_by_buried_total.cache()
	buried_area_sorted_by_buried_total_LIST = buried_area_sorted_by_buried_total.map(lambda p: (p.pose, p.buried_total) ).collect()

	#Saving buried area file
	path_file_buried_area = os.path.join(path_analysis, "summary_buried_areas_total.dat")
	save_buried_area(path_file_buried_area, buried_area_sorted_by_buried_total_LIST)

	#Calculating normalized buried area
	#Loading database
	rdd_database = load_database(sc, ligand_database)
	#Creating Dataframe
	database_table = sqlCtx.createDataFrame(rdd_database)
	database_table.registerTempTable("database")

	number_pose_ligandRDD = buried_area_sorted_by_buried_total.map(lambda p: Row(buried_total=int(p.buried_total), ligand=get_ligand_from_receptor_ligand_model(p.pose), pose=str(p.pose) ) ).collect()
	number_pose_ligand_table = sqlCtx.createDataFrame(number_pose_ligandRDD)
	number_pose_ligand_table.registerTempTable("buried_area_total_sort")

	sql = """
			SELECT pose, (b.buried_total / a.heavyAtom) as normalized_buried_area
			FROM database a
			JOIN buried_area_total_sort b ON b.ligand = a.ligand
			ORDER BY normalized_buried_area DESC
	      """
	#Getting all data
	full_dataRDD = sqlCtx.sql(sql)

	#Saving normalized buried area file
	path_file_buried_area = os.path.join(path_analysis, "summary_normalized_buried_areas.dat")
	save_normalized_buried_area(path_file_buried_area, full_dataRDD)

	#Removing all area files
	all_area_files = get_files_area(path_analysis)
	for area_file in all_area_files:
		os.remove(area_file)

	finish_time = datetime.now()

	save_log(finish_time, start_time)
def main():
	
	config = configparser.ConfigParser()
	config.read('config.ini')

	#Path for Gromacs project
	gromacs_path = preparing_path(config.get('DRUGDESIGN', 'gromacs_path'))
	#Path where PDB ligand are - They are NOT participated in docking
	pdb_ligand_path = config.get('DEFAULT', 'pdb_ligand_path')
	#Path that contains all files for analysis
	path_analysis = config.get('DEFAULT', 'path_analysis')	
	#Path where all pdb receptor are
	path_receptor_pdb = config.get('DEFAULT', 'pdb_path')	
	#Path for saving pdb files of models generated by VS
	path_analysis_pdb = get_directory_pdb_analysis(path_analysis)
	
	# Create SPARK config
	maxResultSize = str(config.get('SPARK', 'maxResultSize'))
	conf = (SparkConf().set("spark.driver.maxResultSize", maxResultSize))

	# Create context
	sc = SparkContext(conf=conf)

	#Adding Python Source file
	#Path for drugdesign project
	path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')	
	sc.addPyFile(os.path.join(path_spark_drugdesign,"vina_utils.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"os_util.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"gromacs_utils.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"pdb_io.py"))

	#Adding bash scripts	
	sc.addFile(os.path.join(path_spark_drugdesign,"make_ndx_buried_area_ligand.sh"))	

	#Parameters from command line
	#Indicates probe. Example: 0.14
	probe = float(sys.argv[1])
	#Indicates ndots. Example: 24
	ndots = int(sys.argv[2])

	#Broadcast
	path_analysis_pdb_complex_b = sc.broadcast(path_analysis_pdb)
	gromacs_path = sc.broadcast(gromacs_path)	 
	pdb_ligand_path = sc.broadcast(pdb_ligand_path)
	probe = sc.broadcast(probe)
	ndots = sc.broadcast(ndots)

	start_time = datetime.now()

	os.environ["GMX_MAXBACKUP"]="-1"

	#Loading all PDB receptor files into memory
	list_all_pdb_receptor_files_path = []
	all_receptor_for_complex = get_files_pdb(path_receptor_pdb)
	for receptor in all_receptor_for_complex:
		list_all_pdb_receptor_files_path.append(loading_pdb_2_list(receptor))

	for pdb_receptor_files in list_all_pdb_receptor_files_path:
		#Getting receptor name by fully path
		base_file_name_receptor = get_name_receptor_pdb(str(pdb_receptor_files[0]))
		#PDB file loaded into memory is sent by broadcast
		pdb_file_receptor = pdb_receptor_files[1]
		pdb_file_receptor = sc.broadcast(pdb_file_receptor)
		#Loading PDB model files based on receptor into memory
		base_file_name_receptor_for_filter = base_file_name_receptor+"_-_"
		all_model_for_complex = get_files_pdb_filter(path_analysis_pdb,base_file_name_receptor_for_filter)
		all_model_for_complexRDD = sc.parallelize(all_model_for_complex)
		all_model_filesRDD = all_model_for_complexRDD.map(loading_pdb_2_list).collect()

# ********** Starting function **********************************************************		
		def save_model_receptor(list_receptor_model_file):
			receptor_file = pdb_file_receptor.value #Obtained from broadcast
			model_file = list_receptor_model_file[0]			
			full_path_for_save_complex = list_receptor_model_file[1]
			#Open file for writing the complex
			f_compl = open(full_path_for_save_complex, "w")
			#Insert lines of receptor
			for item in  receptor_file:
				f_compl.write(item)
			#Insert lines of model and insert Z chain
			for item in model_file:
				item = replace_chain_atom_line(item,"d","z")
				f_compl.write(item)
			f_compl.close()
# ********** Finish function **********************************************************					

# ********** Starting function **********************************************************		
		def compute_buried_area_ligand(pdb_complex):
			chZ = "chZ"
			buried_lig_rec_perc = -1.0
			buried_lig_rec = -1.0
			buried_lig_lig = -1.0
			buried_lig_lig_perc = -1.0
			base_name = get_name_model_pdb(pdb_complex)		
			ligand_name = get_ligand_from_receptor_ligand_model(base_name)
			receptor_name = get_receptor_from_receptor_ligand_model(base_name)
			pose = get_model_from_receptor_ligand_model(base_name)						
			pdb_before_vs = os.path.join(pdb_ligand_path.value,ligand_name+".pdb")			
			#ndx files					
			f_ndx = os.path.join(path_analysis_pdb_complex_b.value,base_name+".ndx")			
			#xvg files
			xvg_temp_sasa_lig_pose = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_lig_pose"+".xvg")
			xvg_temp_sasa_lig_complex = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_lig_complex"+".xvg")
			xvg_temp_sasa_lig_min = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_sasa_lig_min"+".xvg")
			# Creates a selection with the residues that are closer than 6A to the ligand
			script_make_ndx_buried_area_ligand = SparkFiles.get("make_ndx_buried_area_ligand.sh") #Getting bash script that was copied by addFile command
			command = script_make_ndx_buried_area_ligand + " " + gromacs_path.value + " "+ pdb_complex + " "+ f_ndx + " "+  xvg_temp_sasa_lig_pose + " "+ str(probe.value)  + " "+ str(ndots.value)  + " "+  xvg_temp_sasa_lig_complex  + " "+ pdb_before_vs  + " "+  xvg_temp_sasa_lig_min
			process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
			stdout, stderr = process.communicate()			
			# SASA of the isolated ligand in the pose conformation			
			sasa_lig_pose = get_value_from_xvg_sasa(xvg_temp_sasa_lig_pose)
			# SASA of the complexed ligand in the pose conformation
			sasa_lig_complex = get_value_from_xvg_sasa(xvg_temp_sasa_lig_complex)
			# SASA of the isolated ligand in its energy-minimized conformation. Only for carbohydrates!
			sasa_lig_min = get_value_from_xvg_sasa(xvg_temp_sasa_lig_min)
			# Area of the ligand which is buried in the receptor
			buried_lig_rec = sasa_lig_pose - sasa_lig_complex
			buried_lig_rec_perc = buried_lig_rec / sasa_lig_pose
			# Area of the ligand in the pose conformation which is buried in itself when compared to the energy-minimized conformation
			buried_lig_lig = sasa_lig_min - sasa_lig_pose
			buried_lig_lig_perc = buried_lig_lig / sasa_lig_min
			returned_list = (base_name, buried_lig_rec, buried_lig_rec_perc, buried_lig_lig, buried_lig_lig_perc)

			#Deleting files
			os.remove(f_ndx)			
			os.remove(xvg_temp_sasa_lig_pose)
			os.remove(xvg_temp_sasa_lig_complex)
			os.remove(xvg_temp_sasa_lig_min)

			return returned_list
			
# ********** Finish function **********************************************************					

# ********** Starting function **********************************************************		
		def build_list_model_for_complex(model):
			full_path_model = model[0]
			model_file = model[1]
			path_pdb_complex = path_analysis_pdb_complex_b.value #Obtained from broadcast
			#Building complex file based on model file name
			base_name_model = get_name_model_pdb(full_path_model)
			complex_name = "compl_"+base_name_model+".pdb"
			full_path_for_save_complex = os.path.join(path_pdb_complex,complex_name)
			list_receptor_model_file = (model_file, full_path_for_save_complex)						
			save_model_receptor(list_receptor_model_file)			
			list_ret = compute_buried_area_ligand(full_path_for_save_complex)			
			os.remove(full_path_for_save_complex)
			return list_ret
# ********** Finish function **********************************************************	

		all_model_filesRDD = sc.parallelize(all_model_filesRDD)
		all_model_filesRDD = all_model_filesRDD.map(build_list_model_for_complex).collect()	
		#Saving buried area of the ligand
		full_area_file  = os.path.join(path_analysis,base_file_name_receptor+".ligandArea")
		save_buried_area_ligand(full_area_file, all_model_filesRDD)

	#Loading all area files
	all_area_file = os.path.join(path_analysis,"*.ligandArea")		
	buried_areaRDD = sc.textFile(all_area_file).map(loading_lines_from_ligandArea_files).collect()	

	#Sorting by buried_lig_rec column
	buried_area_sorted_by_buried_lig_rec = sorting_buried_area_ligand(sc, buried_areaRDD)
	buried_area_sorted_by_buried_lig_rec = buried_area_sorted_by_buried_lig_rec.map(lambda p: (p.pose, p.buried_lig_rec, p.buried_lig_rec_perc, p.buried_lig_lig, p.buried_lig_lig_perc) ).collect() #p.receptor, p.ligand, p.model

	#Saving buried area ligand file
	path_file_buried_area = os.path.join(path_analysis, "summary_buried_area_ligand.dat")
	save_buried_area_ligand_sort(path_file_buried_area, buried_area_sorted_by_buried_lig_rec)	

	#Removing all area files
	all_area_files = get_files_ligandArea(path_analysis)
	for area_file in all_area_files:
		os.remove(area_file)

	finish_time = datetime.now()

	save_log(finish_time, start_time)
def _locate(example_name):
    return "../examples/smalldata/" + example_name

conf = SparkConf().setAppName("ChicagoCrimeTest").setIfMissing("spark.master", os.getenv("spark.master", "local[*]"))
sc = SparkContext(conf=conf)
# SQL support
sqlContext = SQLContext.getOrCreate(sc)
# Start H2O services
h2oContext = H2OContext(sc).start()
# Define file names
chicagoAllWeather = "chicagoAllWeather.csv"
chicagoCensus = "chicagoCensus.csv"
chicagoCrimes10k = "chicagoCrimes10k.csv"

# Add files to Spark Cluster
sc.addFile(_locate(chicagoAllWeather))
sc.addFile(_locate(chicagoCensus))
sc.addFile(_locate(chicagoCrimes10k))

# Since we have already distributed the files with Spark, we have to use h2o.upload_file instead of h2o.import_file:
# h2o.import_file expects a cluster-relative path (i.e. a path that can be accessed from all the machines in the cluster),
# whereas SparkFiles.get(..) returns the local path to the file on the current node, which h2o.upload_file can handle
# (it uploads the file located on the current node and distributes it to the H2O cluster).
f_weather = h2o.upload_file(SparkFiles.get(chicagoAllWeather))
f_census = h2o.upload_file(SparkFiles.get(chicagoCensus))
f_crimes = h2o.upload_file(SparkFiles.get(chicagoCrimes10k))
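
# A minimal alternative sketch (an assumption, not part of the original example): if the
# three CSVs lived on a cluster-visible path such as HDFS, h2o.import_file could read them
# directly and the sc.addFile / SparkFiles.get step would not be needed. The HDFS paths
# below are placeholders.
# f_weather = h2o.import_file("hdfs:///data/chicago/" + chicagoAllWeather)
# f_census = h2o.import_file("hdfs:///data/chicago/" + chicagoCensus)
# f_crimes = h2o.import_file("hdfs:///data/chicago/" + chicagoCrimes10k)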


# Transform weather table
# Remove 1st column (date)
f_weather = f_weather[1:]
Beispiel #29
0
    # np.random.seed(1337) # To match with MATLAB
    # ---------------------------LOGGING----------------------------------------------
    logfname = SPARK_HOME + 'log_size_' + str(GRAPH_NODES) + '_' + \
        datetime.now().strftime('%Y-%m-%d-%H:%M:%S') + '.log'
    logging.basicConfig(filename=logfname, filemode='w', level= logging.INFO, \
        format='%(asctime)s:%(levelname)s:%(message)s', \
        datefmt='%m/%d/%Y %I:%M:%S %p')
    logging.warn(sys.argv[0] + '\n SPARK_HOME = ' + SPARK_HOME \
                 + '\n p = ' + str(SQUARE_BLOCK_SIZE))
    # ----------------Create new Spark config---------------------------------------------
    # 0 means unlimited; if driver fails, set some value like 16g
    conf = SparkConf().set("spark.driver.maxResultSize", "32g")
    conf.set("spark.akka.frameSize", "2040")
    sc = SparkContext(conf=conf, appName="Commute time distances ")
    sqlContext = SQLContext(sc)
    sc.addFile(SPARK_HOME + "construct_graphs.py")

    # ----------------------------------------------------------------------------------
    n, p = GRAPH_NODES, SQUARE_BLOCK_SIZE
    zfile1, zfile2 = RESULTS_DIR + 'elections-12-'+ str(n) + '-Z.mat', \
                 RESULTS_DIR + 'elections-16-'+ str(n) + '-Z.mat'

    if not os.path.exists(zfile1):
        RESULTS_dict = {}
        A1 = constructGraphs.createAdjMat(n, 12, SPARSE_GRAPH, p, sc)
        Z1 = commuteTimeDistancesEmbed(A1, tol, epsilon, d)
        RESULTS_dict['Z'] = Z1
        if not os.path.exists(RESULTS_DIR):
            os.makedirs(RESULTS_DIR)
        sio.savemat(zfile1, RESULTS_dict)
def main():

	config = configparser.ConfigParser()
	config.read('config.ini')

	#Path for Gromacs project
	gromacs_path = preparing_path(config.get('DRUGDESIGN', 'gromacs_path'))
	#Path where the PDB ligands are - they did NOT take part in docking
	pdb_ligand_path = config.get('DEFAULT', 'pdb_ligand_path')
	#Path that contains all files for analysis
	path_analysis = config.get('DEFAULT', 'path_analysis')
	#Path where all pdb receptors are
	path_receptor_pdb = config.get('DEFAULT', 'pdb_path')
	#Path for saving pdb files of models generated by VS
	path_analysis_pdb = get_directory_pdb_analysis(path_analysis)

	# Create SPARK config
	maxResultSize = str(config.get('SPARK', 'maxResultSize'))
	conf = (SparkConf().set("spark.driver.maxResultSize", maxResultSize))

	# Create context
	sc = SparkContext(conf=conf)

	#Adding Python Source file
	#Path for drugdesign project
	path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')
	sc.addPyFile(os.path.join(path_spark_drugdesign,"vina_utils.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"os_utils.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"gromacs_utils.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"pdb_io.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"json_utils.py"))

	#Adding bash scripts
	sc.addFile(os.path.join(path_spark_drugdesign,"make_ndx_buried_area_receptor.sh"))
	sc.addFile(os.path.join(path_spark_drugdesign,"make_ndx_buried_area_receptor_res.sh"))

	#Parameters from the command line
	#Indicates probe. Example: 0.14
	#probe = float(sys.argv[1])
	#Indicates ndots. Example: 24
	#ndots = int(sys.argv[2])

	#Broadcast
	path_analysis_pdb_complex_b = sc.broadcast(path_analysis_pdb)
	gromacs_path = sc.broadcast(gromacs_path)
	pdb_ligand_path = sc.broadcast(pdb_ligand_path)
	#probe = sc.broadcast(probe)
	#ndots = sc.broadcast(ndots)

	start_time = datetime.now()

	os.environ["GMX_MAXBACKUP"]="-1"

	#Loading all PDB receptor files into memory
	list_all_pdb_receptor_files_path = []
	all_receptor_for_complex = get_files_pdb(path_receptor_pdb)
	for receptor in all_receptor_for_complex:
		list_all_pdb_receptor_files_path.append(loading_pdb_2_list(receptor))

	for pdb_receptor_files in list_all_pdb_receptor_files_path:
		#Getting receptor name from its full path
		base_file_name_receptor = get_name_receptor_pdb(str(pdb_receptor_files[0]))
		#PDB file loaded into memory is sent by broadcast
		pdb_file_receptor = pdb_receptor_files[1]
		pdb_file_receptor = sc.broadcast(pdb_file_receptor)
		#Loading PDB model files based on receptor into memory
		base_file_name_receptor_for_filter = base_file_name_receptor+"_-_"
		all_model_for_complex = get_files_pdb_filter(path_analysis_pdb,base_file_name_receptor_for_filter)
		all_model_for_complexRDD = sc.parallelize(all_model_for_complex)
		all_model_filesRDD = all_model_for_complexRDD.map(loading_pdb_2_list).collect()

# ********** Starting function **********************************************************
		def save_model_receptor(list_receptor_model_file):
			receptor_file = pdb_file_receptor.value #Obtained from broadcast
			model_file = list_receptor_model_file[0]
			full_path_for_save_complex = list_receptor_model_file[1]
			#Open file for writing the complex
			f_compl = open(full_path_for_save_complex, "w")
			#Insert lines of receptor
			for item in  receptor_file:
				f_compl.write(item)
			#Insert lines of model and insert Z chain
			for item in model_file:
				item = replace_chain_atom_line(item,"d","z")
				f_compl.write(item)
			f_compl.close()
# ********** Finish function **********************************************************

# ********** Starting function **********************************************************
		def compute_buried_area_all_residues_and_receptor_area(pdb_complex):
			chZ = "chZ"
			res_buried_area_perc = -1
			res_buried_area = -1
			buried_receptor_system = -1
			buried_receptor_res = -1
			base_name = get_name_model_pdb(pdb_complex)
			ligand_name = get_ligand_from_receptor_ligand_model(base_name)
			receptor_name = get_receptor_from_receptor_ligand_model(base_name)
			pose = get_model_from_receptor_ligand_model(base_name)

			#output area receptor file
			f_output_receptor_buried_area = os.path.join(path_analysis_pdb_complex_b.value,base_name+".outAreaRecep")
			#ndx files
			#f_ndx = os.path.join(path_analysis_pdb_complex_b.value,base_name+".ndx")
			f_ndx_temporary_index_z = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_temporary_index_z"+".ndx")
			f_ndx_temporary = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_temporary"+".ndx")
			f_ndx_temporary_sasa = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_temporary_sasa"+".ndx")

			#xvg files
			f_xvg_temporary_sasa_res_lig = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_temporary_sasa_res-lig"+".xvg")
			f_xvg_temporary_sasa_res  = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_temporary_sasa_res"+".xvg")
			f_xvg_temporary_sasa_rec_lig  = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_temporary_sasa_rec_lig"+".xvg")
			f_xvg_temporary_sasa_rec  = os.path.join(path_analysis_pdb_complex_b.value,base_name+"_temporary_sasa_rec"+".xvg")

			# Creates a selection with the residues that are closer than 6A to the ligand
			script_make_ndx_buried_area_receptor = SparkFiles.get("make_ndx_buried_area_receptor.sh") #Getting bash script that was copied by addFile command
			command = script_make_ndx_buried_area_receptor + " " + gromacs_path.value + " "+ pdb_complex + " "+ f_ndx_temporary_index_z + " "+ f_ndx_temporary
			process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
			stdout, stderr = process.communicate()
			#copying file
			if os.path.exists(f_ndx_temporary):
				shutil.copy(f_ndx_temporary, f_ndx_temporary_sasa)
				#Get all residues for computing area receptor
				all_res = get_residues_receptor_from_ndx_files(f_ndx_temporary)
				returned_list = []
				for res in all_res:
					script_make_ndx_buried_area_receptor_res = SparkFiles.get("make_ndx_buried_area_receptor_res.sh") #Getting bash script that was copied by addFile command
					command = script_make_ndx_buried_area_receptor_res + " " + gromacs_path.value + " "+ pdb_complex + " "+ f_ndx_temporary_sasa + " "+ str(res)
					process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
					stdout, stderr = process.communicate()
					# compute surface of system - saved on xvg
					command = gromacs_path.value +"gmx sasa -surface complex -output rec_"+str(res)+ " -o "+ f_xvg_temporary_sasa_res_lig + " -xvg none -f " + pdb_complex +" -s " + pdb_complex + " -n "+ f_ndx_temporary + " -nopbc "
					process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
					stdout, stderr = process.communicate()
					# compute surface of receptor - save on xvg
					command = gromacs_path.value +"gmx sasa -surface rec -output rec_"+str(res)+ " -o "+ f_xvg_temporary_sasa_res + " -xvg none -f " + pdb_complex +" -s " + pdb_complex + " -n "+ f_ndx_temporary + " -nopbc "
					process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
					stdout, stderr = process.communicate()
					#calculate area
					if os.path.exists(f_xvg_temporary_sasa_res_lig):
						buried_receptor_system = get_value_from_xvg_sasa(f_xvg_temporary_sasa_res_lig)
					else:
						buried_receptor_system = 0
					if os.path.exists(f_xvg_temporary_sasa_res):
						buried_receptor_res  = get_value_from_xvg_sasa(f_xvg_temporary_sasa_res)
					else:
						buried_receptor_res = 0
					res_buried_area = buried_receptor_res - buried_receptor_system
					if (res_buried_area > 0) and (buried_receptor_res > 0):
						res_buried_area_perc = res_buried_area/buried_receptor_res
						#Generating result
						result = (base_name, res, res_buried_area,  res_buried_area_perc)
						returned_list.append(result)
					#Deleting files
					if os.path.exists(f_xvg_temporary_sasa_res_lig):
						os.remove(f_xvg_temporary_sasa_res_lig)
					if os.path.exists(f_xvg_temporary_sasa_res):
						os.remove(f_xvg_temporary_sasa_res)

					#Computing Receptor Area
					command = gromacs_path.value +"gmx sasa -surface complex -output rec"+ " -o "+ f_xvg_temporary_sasa_rec_lig + " -xvg none -f " + pdb_complex +" -s " + pdb_complex + " -n "+ f_ndx_temporary + " -nopbc "
					process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
					stdout, stderr = process.communicate()

					command = gromacs_path.value +"gmx sasa -surface rec -output rec"+ " -o "+ f_xvg_temporary_sasa_rec + " -xvg none -f " + pdb_complex +" -s " + pdb_complex + " -n "+ f_ndx_temporary + " -nopbc "
					process = Popen(command,shell=True, stdout=PIPE, stderr=PIPE)
					stdout, stderr = process.communicate()

					if os.path.exists(f_xvg_temporary_sasa_rec_lig):
						sasa_rec_lig = get_value_from_xvg_sasa(f_xvg_temporary_sasa_rec_lig)
					else:
						sasa_rec_lig = 0

					if os.path.exists(f_xvg_temporary_sasa_rec):
						sasa_rec = get_value_from_xvg_sasa(f_xvg_temporary_sasa_rec)
					else:
						sasa_rec = 0

					receptor_area = sasa_rec - sasa_rec_lig

					#Saving result file
					output_receptor_buried_area = open(f_output_receptor_buried_area, "w")
					output_receptor_buried_area.write(str(base_name)+" "+str(receptor_area) +"\n")
					output_receptor_buried_area.close()

					#Deleting all files
					if os.path.exists(f_xvg_temporary_sasa_rec_lig):
						os.remove(f_xvg_temporary_sasa_rec_lig)
					if os.path.exists(f_xvg_temporary_sasa_rec):
						os.remove(f_xvg_temporary_sasa_rec)
					if os.path.exists(f_ndx_temporary):
						os.remove(f_ndx_temporary)
					if os.path.exists(f_ndx_temporary_sasa):
						os.remove(f_ndx_temporary_sasa)
					if os.path.exists(f_ndx_temporary_index_z):
						os.remove(f_ndx_temporary_index_z)

					return returned_list
			else:
				#Reaching this point means there was a problem computing the area
				return (base_name, "NAN", float(0),  float(0))

# ********** Finish function **********************************************************

# ********** Starting function **********************************************************
		def build_list_model_for_complex(model):
			full_path_model = model[0]
			model_file = model[1]
			path_pdb_complex = path_analysis_pdb_complex_b.value #Obtained from broadcast
			#Building complex file based on model file name
			base_name_model = get_name_model_pdb(full_path_model)
			complex_name = "compl_"+base_name_model+".pdb"
			full_path_for_save_complex = os.path.join(path_pdb_complex,complex_name)
			list_receptor_model_file = (model_file, full_path_for_save_complex)
			save_model_receptor(list_receptor_model_file)
			list_ret = compute_buried_area_all_residues_and_receptor_area(full_path_for_save_complex)
			if os.path.exists(full_path_for_save_complex):
				os.remove(full_path_for_save_complex)
			return list_ret
# ********** Finish function **********************************************************

		#Computing buried area of All-residues and receptor
		all_model_filesRDD = sc.parallelize(all_model_filesRDD)
		all_model_filesRDD = all_model_filesRDD.map(build_list_model_for_complex).collect()
		full_area_file  = os.path.join(path_analysis,base_file_name_receptor+".recepArea")
		save_receptor_buried_area_receptor(full_area_file, all_model_filesRDD)

# ***************** Starting ******************************************/
	#Loading All-residues files
	all_area_file = os.path.join(path_analysis,"*.recepArea")
	buried_areaRDD = sc.textFile(all_area_file).map(loading_lines_from_recepArea_files).collect()
	#Sorting by res_buried_area_perc column
	buried_area_sorted_by_res_buried_area_perc = sorting_buried_area_all_residues(sc, buried_areaRDD)
	buried_area_sorted_by_res_buried_area_perc = buried_area_sorted_by_res_buried_area_perc.map(lambda p: (p.res, p.res_buried_area, p.res_buried_area_perc, p.pose) ).collect() #p.receptor, p.ligand, p.model,

	#Saving buried area file
	path_file_buried_area = os.path.join(path_analysis, "all-residue_buried_areas.dat")
	save_buried_area_recep(path_file_buried_area, buried_area_sorted_by_res_buried_area_perc)

	#Removing all area files
	all_area_files = get_files_recepArea(path_analysis)
	for area_file in all_area_files:
		os.remove(area_file)
# ***************** Finish ******************************************/

# ***************** Starting ******************************************/

	#Loading outAreaRecep files
	all_outAreaRecep_file = os.path.join(path_analysis_pdb,"*.outAreaRecep")
	buried_outAreaRecepRDD = sc.textFile(all_outAreaRecep_file).map(loading_lines_from_outAreaRecep_files).collect()

	buried_outAreaRecepRDD_sort_by_buried_lig_rec = sorting_by_buried_lig_rec(sc, buried_outAreaRecepRDD)
	buried_outAreaRecepRDD_sort_by_buried_lig_rec = buried_outAreaRecepRDD_sort_by_buried_lig_rec.map(lambda p: (p.pose, p.buried_lig_rec) ).collect()

	#Saving buried area receptor file
	path_file_buried_area_rec = os.path.join(path_analysis, "summary_buried_areas_receptor.dat")
	save_buried_area_receptor_sort(path_file_buried_area_rec, buried_outAreaRecepRDD_sort_by_buried_lig_rec)

	#Removing all outAreaRecep files
	all_outAreaRecep = get_files_outAreaRecep(path_analysis)
	for outAreaRecep in all_outAreaRecep:
		os.remove(outAreaRecep)
# ***************** Finish ******************************************/


	finish_time = datetime.now()

	save_log(finish_time, start_time)
Beispiel #31
0
            "longest_values": longest,
            "average_length": "%.f2" % average
        }
        res.append(result)
        typeCount[2] = 1

    return res, typeCount


if __name__ == "__main__":
    config = pyspark.SparkConf().setAll([('spark.executor.memory', '8g'),
                                         ('spark.executor.cores', '5'),
                                         ('spark.cores.max', '5'),
                                         ('spark.driver.memory', '8g')])
    sc = SparkContext(conf=config)
    sc.addFile("FileInputManager.py")
    sc.addFile("task1_coinflippers.py")
    sc.addFile("task2_coinflippers.py")

    spark = SparkSession \
        .builder \
        .appName("hw2sql") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    sqlContext = SQLContext(spark)
    fm.iterate_files_from_file_for_task1(
        sc, spark, sqlContext, "/user/yy3090/input/task1_filename.txt", 0,
        output_path)

    sc.stop()
    
    try:
        c_options = parser.parse_args()
        print "Got options:", c_options
    except Exception as inst:
        print inst
        parser.print_help()
    
    es_ts_start, es_ts_end, ingestion_id = get_ingestion_start_end_id(c_options)


    # Setup SparkContext    
    sc = SparkContext(appName="extract-features-"+ingestion_id+job_suffix)
    sc.addPyFile('hdfs://memex/user/skaraman/extract-features/network.py')
    sc.addPyFile('hdfs://memex/user/skaraman/extract-features/tfdeepsentibank.py')
    sc.addFile('hdfs://memex/user/skaraman/extract-features/imagenet_mean.npy')
    sc.addFile('hdfs://memex/user/skaraman/extract-features/tfdeepsentibank.npy')
    conf = SparkConf()
    log4j = sc._jvm.org.apache.log4j
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.ERROR)
    
    # Setup HBase managers
    # just to be sure we will be able to write out to the table
    get_create_table(c_options.tab_sha1_infos_name, c_options)
    get_create_table(c_options.tab_update_name, c_options)
    # hbase managers
    hbase_fullhost = c_options.hbase_host+':'+str(c_options.hbase_port)
    hbase_man_sha1infos_out = HbaseManager(sc, conf, hbase_fullhost, c_options.tab_sha1_infos_name)
    hbase_man_update_out = HbaseManager(sc, conf, hbase_fullhost, c_options.tab_update_name)
    
    # Run extraction
from pyspark import SparkContext
from pyspark import SparkFiles
finddistance = "/home/srimlcloud/temp_dat/pySpark_projects/finddistance.R"
finddistancename = "finddistance.R"
sc = SparkContext("local", "SparkFile App")
sc.addFile(finddistance)
print "Absolute Path -> %s" % SparkFiles.get(finddistancename)
    path_save_log = preparing_path(path_save_log)
    make_directory(path_save_log)

    path_save_output = preparing_path(path_save_output)
    make_directory(path_save_output)

    # Adding Python Source file
    sc.addPyFile(os.path.join(path_spark_drugdesign, "docking_description.py"))

    # Broadcast
    vina_path = sc.broadcast(vina_path)
    pdbqt_ligand_path = sc.broadcast(pdbqt_ligand_path)
    pdbqt_receptor_path = sc.broadcast(pdbqt_receptor_path)
    path_save_output = sc.broadcast(path_save_output)
    path_save_log = sc.broadcast(path_save_log)
    sc.addFile(config_vina)

    file_of_vina_docking = sys.argv[1]
    check_file_exists(file_of_vina_docking)
    start_time = datetime.now()

    def run_vina_docking(vd_obj):

        receptor = ''.join([pdbqt_receptor_path.value,
                            vd_obj.get_receptor(),
                            ".pdbqt"])
        ligand = ''.join([pdbqt_ligand_path.value,
                          vd_obj.get_ligand(),
                          ".pdbqt"])
        output_save = ''.join([path_save_output.value,
                               vd_obj.get_receptor(),
from hdfs_paths import hdfs_path, make_hdfs_dirs


# load yaml config from this dir
config_path = os.path.join(os.path.dirname(__file__), "config.yaml")
config = yaml.load(open(config_path))

# set up Spark
conf = SparkConf()
conf.set("spark.executor.instances", 8)
sc = SparkContext("yarn-client", "pyspark-demo", conf=conf)

# keys output in each dictionary for map_each_image.  The values are np.arrays
RESULT_KEYS = ["cen", "histo", "ward", "pca_fac", "pca_var", "phash"]
# Do addFile so remote workers have python code
sc.addFile(os.path.join(os.path.dirname(__file__), "hdfs_paths.py"))
sc.addFile(os.path.join(os.path.dirname(__file__), "map_each_image.py"))
sc.addFile(config_path)
sc.addFile(os.path.join(os.path.dirname(__file__), "search.py"))
sc.addFile(os.path.join(os.path.dirname(__file__), "fuzzify_training.py"))

# These are options to the flat_map_indicators function
# which can do these mappings.
options_template = {
    "cluster_to_flattened": True,
    "cluster_to_key": True,
    "cluster_to_phash": True,
    # TODO it would be more efficient to combine
    # cluster_to_phash with cluster_to_ward
    "cluster_to_ward": True,
    "flattened_to_cluster": True,
import numpy as np
from csv import reader
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import StringType, StructType, StructField
from pyspark.sql.functions import col
from pyspark import SparkFiles
import datetime
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("building a warehouse")
sc = SparkContext(conf=conf)
sqlCtx = SQLContext(sc)

sc.addFile(
    "/home/hk2451/project/Cool_name_pending/codes/cleaning/cleaning_io.py")
sc.addFile(
    "/home/hk2451/project/Cool_name_pending/codes/mutual_information.py")

import cleaning_io as clean
import mutual_information as mi


##################################
# Inner Join and filter by year
##################################
# find common name column and change it
def process(df1, df2, year):
    colA = df1.columns
    colB = df2.columns
    colAB = list(set(colA).intersection(colB))
Beispiel #37
0
import time
from tqdm import tqdm
import re
import string
import os
from pyspark import SparkConf, SparkContext
from collections import Counter

ROOT = '/data0/lucy/ingroup_lang/'
DATA = ROOT + 'data/'
LOG_DIR = ROOT + 'logs/'
SUBREDDITS = DATA + 'subreddit_list.txt'
SR_FOLDER_MONTH = ROOT + 'subreddits_month/'
conf = SparkConf()
sc = SparkContext(conf=conf)
sc.addFile('/data0/lucy/langid.py/langid/langid.py')
import langid

reddits = set()

def get_language(text): 
    return langid.classify(text)[0]

def id_langs():
    lang_dict = {}
    log_file = open(LOG_DIR + 'language_id.temp', 'w')
    for sr in os.listdir(SR_FOLDER_MONTH): 
        log_file.write(sr + '\n') 
        path = SR_FOLDER_MONTH + sr + '/RC_sample'
        data = sc.textFile(path)
        data = data.filter(lambda line: not line.startswith('@@#USER#@@_'))
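        # A hedged sketch (an assumption, not the original code) of how this loop might
        # continue: classify each remaining comment line with langid and tally the labels
        # for this subreddit.
        lang_counts = data.map(get_language).countByValue()
        lang_dict[sr] = dict(lang_counts)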
def test5():
    sc = SparkContext('local', 'first app')
    file = '/home/Edison/project/chatbot/README.md'
    sc.addFile(file)
    print('------file: %s---------' % SparkFiles.get('README.md'))
Beispiel #39
0
# You can also add an HDFS directory path. Suppose your images are saved in HDFS in the directory '/user/maddy/my_images/'.
# Then the same HDFS path can be given here.
img_dir = './my_images/'

# This is the path of the directory where the images will be stored after a face is detected.
# After a face is detected in an image, we draw a rectangle around the face and store that image in the directory below.
rect_img_dir = './face_detected/'

# Haar Cascade Classifier (from the OpenCV library)
# This classifier will be used to detect frontal faces in the images.
# Give the path of the classifier below.
distCascade = "./haarcascade_frontalface_default.xml"

# This distributes the cascade file to every node in the Spark cluster.
# This is necessary if you run this Spark code on a multi-node Spark cluster.
sc.addFile(distCascade)

# Converting the images into RDD
images_RDD = sc.binaryFiles(img_dir)

# For more details about this function. You can do help(sc.binaryFiles)

# If you have a large number of images to process (like a million), Spark will by default create a lot of partitions.
# To repartition your image data into fewer partitions, uncomment the command below and change the number of partitions to what you want.
#images_RDD = images_RDD.repartition(20000)


# Face Detection function
def face_detect(an_img_rdd_element):
    x = an_img_rdd_element[0]
    img = an_img_rdd_element[1]
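
# A hedged sketch (an assumption, not the original author's code) of how the truncated
# face_detect function above might continue: decode the image bytes with OpenCV, load the
# Haar cascade that sc.addFile distributed, detect faces and save an annotated copy.
# Requires cv2 (opencv-python) and numpy; it reuses rect_img_dir defined above.
import os
import numpy as np
import cv2
from pyspark import SparkFiles

def face_detect_sketch(an_img_rdd_element):
    name, img_bytes = an_img_rdd_element
    # Decode the raw bytes produced by sc.binaryFiles into a BGR image
    img = cv2.imdecode(np.frombuffer(img_bytes, dtype=np.uint8), cv2.IMREAD_COLOR)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Load the cascade from the copy that addFile placed on this worker node
    cascade = cv2.CascadeClassifier(SparkFiles.get("haarcascade_frontalface_default.xml"))
    faces = cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5)
    for (x, y, w, h) in faces:
        cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
    cv2.imwrite(os.path.join(rect_img_dir, os.path.basename(name)), img)
    return (name, len(faces))

# Possible usage (assumption): images_RDD.map(face_detect_sketch).collect()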
Beispiel #40
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 14 06:54:46 2020
@author: joshua
"""

from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct
from pyspark.sql.types import BooleanType
from pyspark import SparkContext
import computedistance

sc = SparkContext("local", "SparkFile App")
sc.addFile("/home/ubuntu/Housing-Insight/process_datasets/computedistance.py")

boroughs = ["BRONX", "BROOKLYN", "STATEN ISLAND", "QUEENS", "MANHATTAN"]


def handle_building(building, _311_service):
    if _311_service.city in boroughs:
        if building.borough != _311_service.city:
            return False
    latlong = [building.longitude, building.latitude]
    latlong2 = [_311_service.longitude, _311_service.latitude]
    if computedistance.computeDistance(latlong, latlong2) < 1.5:
        return True
    else:
        return False
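
# A hedged usage sketch (an assumption, not part of the original example): given two
# DataFrames with the columns accessed in handle_building, the pure-Python distance test
# can be applied over an RDD cartesian product of their rows.
def nearby_pairs(buildings_df, services_df):
    # Keep only the (building, 311 request) pairs that handle_building accepts
    return buildings_df.rdd.cartesian(services_df.rdd) \
                           .filter(lambda pair: handle_building(pair[0], pair[1]))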
Beispiel #41
0
def main(InfoAppName="consumer",
         InfoTopic="chpart1",
         InfoAddPyFile="/home/test/CoreProcessAType__.py",
         InfoTotalCore=4,
         SysPath=""):
    global importFile, sysPath

    prop = configparser.RawConfigParser()
    prop.read('SparkConfig.properties')

    importFile = InfoAddPyFile

    sysPath = SysPath
    conf = SparkConf()
    conf.setMaster(prop.get('SparkConfig', 'spark.master'))
    conf.setAppName(InfoAppName)

    ##
    conf.set("spark.cores.max", InfoTotalCore)
    conf.set("spark.streaming.backpressure.enabled",
             prop.get('SparkConfig', 'spark.streaming.backpressure.enabled'))
    conf.set("spark.executor.memory",
             prop.get('SparkConfig', 'spark.executor.memory'))
    conf.set("spark.python.worker.memory",
             prop.get('SparkConfig', 'spark.python.worker.memory'))
    conf.set("spark.streaming.concurrentJobs",
             prop.get('SparkConfig', 'spark.streaming.concurrentJobs'))
    conf.set("spark.executor.cores",
             prop.get('SparkConfig', 'spark.executor.cores'))
    conf.set("spark.task.cpus", prop.get('SparkConfig', 'spark.task.cpus'))
    conf.set("spark.executor.extraLibraryPath",
             prop.get('SparkConfig', 'spark.executor.extraLibraryPath'))
    conf.set("spark.locality.wait",
             prop.get('SparkConfig', 'spark.locality.wait'))
    conf.set("spark.scheduler.mode",
             prop.get('SparkConfig', 'spark.scheduler.mode'))
    conf.set("spark.streaming.blockInterval",
             prop.get('SparkConfig', 'spark.streaming.blockInterval'))
    conf.set("spark.serializer", prop.get('SparkConfig', 'spark.serializer'))

    kafkaParams = {
        "metadata.broker.list": prop.get('KafkaConfig',
                                         'metadata.broker.list'),
        "group.id": prop.get('KafkaConfig', 'group.id')
    }

    sc = SparkContext(conf=conf)
    sc.addPyFile(InfoAddPyFile)
    sc.addFile('c_count.txt')
    sc.addFile('p_count.txt')
    ssc = StreamingContext(sc, 1)
    topic1 = [InfoTopic]

    dstream = KafkaUtils.createDirectStream(ssc,
                                            topic1,
                                            kafkaParams,
                                            valueDecoder=none_decoder)

    parse_rdd(dstream)
    ssc.start()
    ssc.awaitTermination()
    classifier.show_most_informative_features()


def mapper(line, title, secfile, idsec):
    post = mdb.posts
    tokens = word_tokenize(line)
    tagged = pos_tag(tokens)
    ntities = chunk.ne_chunk(tagged)
    newline = line.encode('utf-8')

    posting = {"securitynow_id": idsec, "episode": secfile[3:6], "speaker": title, "original": line, "tokens": tokens,
               "entities": ntities, "sentiment": classifier.classify(dict([(word, True) for word in newline]))}
    post_id = post.insert(posting)


sc.addFile("/home/th3m4d0n3/NetBeansProjects/twAppDemo/data_dir/allSentimentData")
with open(SparkFiles.get("allSentimentData")) as f:
    reader = csv.reader(f, delimiter=" ", quotechar='"')

    jobs = bg.BackgroundJobManager()
    map(parseForNltk, reader)

    print("chezdata type DATA: {0} COUNT: {1}".format(type(chezdata), len(chezdata)))

    map(getHighest, chezdata)

    chezdataP = sc.parallelize(chezdata)
    lowRatedP = sc.parallelize(lowRated)
    highlyRatedP = sc.parallelize(highlyRated)

    print("chezdataP type DATA: {0} COUNT: {1}".format(type(chezdataP), chezdataP.count()))
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
from numpy import array
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.classification import SVMWithSGD, SVMModel


#### TODO change to the cluster directory
trainF="/home/xavier.callens/DataCamp/train" #the path to where the train data is

sc = SparkContext(appName="Simple App")  #initialize the spark context
#since we are not in the command line interface we need to add to the spark context
#some of our classes so that they are available to the workers
sc.addFile("helpers.py") 
sc.addFile("exctract_terms.py")
#now if we import these files they will also be available to the workers
from helpers import *
import extract_terms as et



# load data : data is a list with the text per doc in each cell. Y is the respective class value
#1 :positive , 0 negative
print "loading local data"
data,Y=lf.loadLabeled(trainF) 

print "preprocessing"
pp.proc(data) #clean up the data: remove numbers, html tags and punctuation (except for "?", "!" and "."; "?!" is replaced by ".")
m = TfidfVectorizer(analyzer=et.terms) # m is a compressed matrix with the tfidf matrix the terms are extracted with our own custom function 
Beispiel #44
0
CONF = {
    'spark.driver.extraClassPath':
    os.environ['HOME'] + 'mongo-hadoop/spark/build/libs/mongo-hadoop-spark.jar'
}

STANFORD_SEGMENTER = APP_HOME + '/stanford_segmenter'
STANFORD_POSTAGGER = APP_HOME + '/stanford-postagger'
STANFORD_MODELS = STANFORD_POSTAGGER + '/models'

LOG_DIR = 'log'

MONGO_SERVER = 'localhost'
MONGO_PORT = 27017
DB = 'tweets_data'

### Prepare SparkContext ###
conf = SparkConf().setAppName(APP_NAME)

for prop, val in CONF.items():  #set configuration properties
    conf.set(prop, val)

sc = SparkContext(conf=conf, environment=ENV_VARS)

for f in PY_FILES:  #add dependencies
    sc.addPyFile('%s://%s/%s' % (FILESYSTEM, APP_HOME, f))

for f in FILES:  #add required files
    sc.addFile('%s://%s/%s' % (FILESYSTEM, APP_HOME, f))

pymongo_spark.activate()
sys.path.append(SPARK_HOME_PYTHON)

from pyspark import SparkContext
from pyspark import SparkConf

sc = SparkContext(appName = 'topXIp')

#test local speed: only around 75s, much faster
#sc = SparkContext('local' , 'topXIp')

#X = sys.argv[1]

#normal
normalFilePath = '/home/worker/workspace/DeepDefense_dataStatistics' + '/csv' + '/topXraw.csv'
normalPath = os.path.join(normalFilePath)
sc.addFile(normalPath)

#attack
attackFilePath = '/home/worker/workspace/DeepDefense_dataStatistics' + '/csv' + '/topXraw.csv'
attackPath = os.path.join(attackFilePath)
sc.addFile(attackPath)


from pyspark import SparkFiles
normalRdd = sc.textFile(SparkFiles.get(normalFilePath)).cache()
attackRdd = sc.textFile(SparkFiles.get(attackFilePath)).cache()

# src, dst, data_length, protocol_name, protocol_number, arrival_time (len = 6)
normalRaw = normalRdd.map(lambda x: x.split(',')).filter(lambda x: len(x) == 6).cache()
attackRaw = attackRdd.map(lambda x: x.split(',')).filter(lambda x: len(x) == 6).cache()
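
# A hedged sketch (an assumption about what "topXIp" computes, not the original code):
# count packets per source IP in the parsed normal traffic and keep the X most frequent.
# X is a placeholder here; the commented-out line above suggests it originally came from sys.argv.
X = 10
top_normal_src = normalRaw.map(lambda fields: (fields[0], 1)) \
                          .reduceByKey(lambda a, b: a + b) \
                          .takeOrdered(X, key=lambda kv: -kv[1])
print(top_normal_src)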
def predict(row_coord,cSize,model):
	vector_dict={}
	for w in row_coord[1]:
		vector_dict[int(w[1])]=w[0]
	return (row_coord[0], model.value.predict(SparseVector(cSize.value,vector_dict)))

trainF="./data/train" #the path to where the train data is
testF="./data/test" # the path to the unlabelled data 
saveF="./predictions.txt" #where to save the predictions

sc = SparkContext(appName=" \--(o_o)--/ ")  #initialize the spark context


#since we are not in the command line interface we need to add to the spark context
#some of our classes so that they are available to the workers
sc.addFile("/home/julien.hamilius/datacamp/code/helpers.py") 
sc.addFile("/home/julien.hamilius/datacamp/code/extract_terms.py")
#now if we import these files they will also be available to the workers
from helpers import *
import extract_terms as et


# load data : data is a list with the text per doc in each cell. Y is the respective class value
#1 :positive , 0 negative
print "loading local data"
data,Y=lf.loadLabeled(trainF) 


print "preprocessing"
pp.proc(data) #clean up the data: remove numbers, html tags and punctuation (except for "?", "!" and "."; "?!" is replaced by ".")
m = TfidfVectorizer(analyzer=et.terms) # m is a compressed matrix with the tfidf matrix the terms are extracted with our own custom function 
from pyspark import SparkContext as SC
from pyspark.sql import SQLContext
from pyspark import SparkFiles
from pyspark.sql.types import *
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

sc = SC()
url = "https://raw.githubusercontent.com/guru99-edu/R-Programming/master/adult_data.csv"
sc.addFile(url)
sqlContext = SQLContext(sc)

# df = sqlContext.read.csv(SparkFiles.get("adult_data.csv"), header=True, inferSchema=True) # with inferSchema=False all values are considered string

df_string = sqlContext.read.csv(SparkFiles.get("adult_data.csv"),
                                header=True,
                                inferSchema=False)


def convertColumn(df, names, newType):
    for name in names:
        df = df.withColumn(name, df[name].cast(newType))
    return df


CONTI_FEATURES = [
    'age', 'fnlwgt', 'capital-gain', 'educational-num', 'capital-loss',
    'hours-per-week'
]
df_string = convertColumn(df_string, CONTI_FEATURES,
                          FloatType())  # schema casted
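
# A hedged sketch (an assumption about the dataset's columns, not part of the tutorial):
# the feature classes imported above are typically chained like this. 'workclass' is assumed
# to be one of the categorical columns in adult_data.csv; OneHotEncoder could be added
# between the two steps, but its constructor signature differs between Spark versions.
indexer = StringIndexer(inputCol='workclass', outputCol='workclass_idx')
df_indexed = indexer.fit(df_string).transform(df_string)
assembler = VectorAssembler(inputCols=CONTI_FEATURES + ['workclass_idx'],
                            outputCol='features')
df_features = assembler.transform(df_indexed)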
import os
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark import SparkFiles

conf = SparkConf().setMaster('local')
sc = SparkContext(conf=conf, appName='DemoPipeR')

contactsContactList = sc.parallelize([('null', '45.4,34.2,90.3,66.1'),
			              ('null', '49.3,31.6,42.3,76.7'),
				      ('null', '40.9,36.2,99.8,16.0')])

# Compute the distance of each call using an external R program
distScript = os.getcwd() + "/find_distance.R"
distScriptName = "find_distance.R"
sc.addFile(distScript)

def hasDistInfo(call):
    """Verify that a call has the fields required to compute the distance"""
    requiredFields = ["mylat", "mylong", "contactlat", "contactlong"]
    return all(map(lambda f: call[f], requiredFields))

def formatCall(call):
    """Format a call so that it can be parsed by our R program"""
    return "{0},{1},{2},{3}".format(
        call["mylat"], call["mylong"],
        call["contactlat"], call["contactlong"])

# here we do not bother with storing dictionaries in contactsContactList
#pipeInputs = contactsContactList.values().flatMap(lambda calls: map(formatCall, filter(hasDistInfo, calls)))
pipeInputs = contactsContactList.values()
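
# A short sketch of the usual next step (mirroring the rdd.pipe pattern used elsewhere in
# these examples): pipe each formatted record through the distributed R script. collect()
# is only for demonstration on this tiny data set.
distances = pipeInputs.pipe(SparkFiles.get(distScriptName))
print(distances.collect())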
Beispiel #49
0
#!/usr/bin/python

# ./spark-submit spark.py fmriData.csv

import sys
import os

from pyspark import SparkContext
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
from numpy import array

if __name__ == "__main__":

    csvName = sys.argv[1]
    path = os.path.realpath(csvName)

    context = SparkContext('local', 'fmri analysis')
    context.addFile(path)
Beispiel #50
0
limit = 50

if __name__ == "__main__":
    sc = SparkContext(appName="MTurk")
    inputFilename = sys.argv[1]
    outputDirectory = sys.argv[2]
    featureListFilename = sys.argv[3]
    crfModelFilename = sys.argv[4]
    eyeRef = sys.argv[5]
    eyeConfig = sys.argv[6]
    hairRef = sys.argv[7]
    hairConfig = sys.argv[8]
    # Program to compute CRF++
    c = crf_features.CrfFeatures(featureListFilename)
    # Add files to be downloaded with this Spark job on every node.
    sc.addFile("/usr/local/bin/crf_test")
    sc.addFile(crfModelFilename)

    # Map to reference sets
    smEye = HybridJaccard(ref_path=eyeRef, config_path=eyeConfig)
    smHair = HybridJaccard(ref_path=hairRef, config_path=hairConfig)

    rdd = sc.sequenceFile(inputFilename)
    if limit:
        rdd = sc.parallelize(rdd.take(limit))

    rdd_json = rdd.mapValues(lambda x: json.loads(x))

    rdd_body = rdd_json.mapValues(lambda x: extract_body(x))
    rdd_body_tokens = rdd_body.mapValues(lambda x: textTokens(x))
Beispiel #51
0
PY_FILES = ['settings.py', 'stanford_segmenter.py', 'pos_tag.py', 'logger.py', 'pymongo_spark.py']
FILES = ['NER.model']
CONF = {'spark.driver.extraClassPath':
        os.environ['HOME'] + 'mongo-hadoop/spark/build/libs/mongo-hadoop-spark.jar'}

STANFORD_SEGMENTER = APP_HOME + '/stanford_segmenter'
STANFORD_POSTAGGER = APP_HOME + '/stanford-postagger'
STANFORD_MODELS = STANFORD_POSTAGGER + '/models'

LOG_DIR = 'log'

MONGO_SERVER = 'localhost'
MONGO_PORT = 27017
DB = 'tweets_data'

### Prepare SparkContext ###
conf = SparkConf().setAppName(APP_NAME)

for prop, val in CONF.items():          #set configuration properties
    conf.set(prop, val)         

sc = SparkContext(conf=conf, environment=ENV_VARS)

for f in PY_FILES:          #add dependencies
    sc.addPyFile('%s://%s/%s' %(FILESYSTEM, APP_HOME, f))

for f in FILES:          #add required files
    sc.addFile('%s://%s/%s' %(FILESYSTEM, APP_HOME, f))
    
pymongo_spark.activate()
Beispiel #52
0
    spark_config.setExecutorEnv('AWS_ACCESS_KEY_ID', config.access_key)
    spark_config.setExecutorEnv('AWS_SECRET_ACCESS_KEY', config.secret_access_key)


# Build up the context, using the master URL
sc = SparkContext('spark://ulex:7077', 'mean', conf=spark_config)
local_data_path = '/media/bitbucket/pr_amon_BCSD_rcp26_r1i1p1_CONUS_bcc-csm1-1_202101-202512.nc'
data_path = local_data_path
data_url = 'https://nasanex.s3.amazonaws.com/NEX-DCP30/BCSD/rcp26/mon/atmos/pr/r1i1p1/v1.0/CONUS/pr_amon_BCSD_rcp26_r1i1p1_CONUS_bcc-csm1-1_202101-202512.nc'

if download:
    data_path = data_url

# Download the file onto each node
if download or config.copy_local:
    sc.addFile(data_path)

# We still need to open the dataset on the master node to get the number of timesteps. For
# some reason the master node doesn't seem to be able to access the downloaded
# version; this may be a bug in addFile(...)
data = Dataset(local_data_path)
pr = data.variables['pr']

# Get the number of timesteps
num_timesteps = data.variables['time'].size

data.close()

# Now partition timesteps across the cluster
timesteps = sc.parallelize(range(0, num_timesteps), 30)
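
# A hedged sketch (an assumption, not the original code) of how each worker could compute a
# per-timestep mean from the file distributed with sc.addFile: the file is opened locally
# through SparkFiles.get using its base name.
import os
from pyspark import SparkFiles
from netCDF4 import Dataset as NCDataset

def mean_for_timestep(t):
    local_path = SparkFiles.get(os.path.basename(data_path))
    ds = NCDataset(local_path)
    try:
        # Mean precipitation over the spatial grid for this single timestep
        return float(ds.variables['pr'][t, :, :].mean())
    finally:
        ds.close()

means = timesteps.map(mean_for_timestep).collect()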
Beispiel #53
0
if __name__ == '__main__':
    conf = SparkConf()

    sc = SparkContext(conf=conf)

    datadir = "/YOUR/DATA/DIR/"

    # sudo dpkg --configure -a
    # sudo apt-get install python-setuptools
    # sudo easy_install dateutils
    # Download pyspark_csv.py from https://github.com/seahboonsiew/pyspark-csv
    sys.path.append('/YOUR/PYSPARK_LIBS/DIR')  # replace as necessary
    import pyspark_csv

    sc.addFile('/YOUR/PYSPARK_LIBS/DIR/pyspark_csv.py')  # ditto
    sqlContext = SQLContext(sc)

    # Task 1: load the prop-prices.csv file as an RDD, and use the csvToDataFrame function from the pyspark_csv module
    # to create a DataFrame and register it as a temporary table so that you can run SQL queries:
    print("------- ******* Task 1 ******* -------")

    # Task 2: let's do some basic analysis on the data.
    # Find how many records we have per year, and print them out sorted by year.
    print("------- ******* Task 2 ******* -------")

    # Task 3: Everyone knows that properties in London are expensive.
    # Find the average property price by county,
    # and print the top 10 most expensive counties
    print("------- ******* Task 3 ******* -------")
if __name__ == '__main__':
    conf = SparkConf()

    sc = SparkContext(conf=conf)

    datadir = "/Users/eyalbenivri/Developer/projects/spark-workshop/data/"

    # sudo dpkg --configure -a
    # sudo apt-get install python-setuptools
    # sudo easy_install dateutils
    # Download pyspark_csv.py from https://github.com/seahboonsiew/pyspark-csv
    sys.path.append('/Users/eyalbenivri/Developer/libs/pyspark_libs')  # replace as necessary
    import pyspark_csv

    sc.addFile('/Users/eyalbenivri/Developer/libs/pyspark_libs/pyspark_csv.py')  # ditto
    sqlContext = SQLContext(sc)

    # Task 1: load the prop-prices.csv file as an RDD, and use the csvToDataFrame function from the pyspark_csv module
    # to create a DataFrame and register it as a temporary table so that you can run SQL queries:
    print("------- ******* Task 1 ******* -------")
    columns = ['id', 'price', 'date', 'zip', 'type', 'new', 'duration', 'PAON',
               'SAON', 'street', 'locality', 'town', 'district', 'county', 'ppd',
               'status']

    rdd = sc.textFile(datadir + "prop-prices.csv")
    df = pyspark_csv.csvToDataFrame(sqlContext, rdd, columns=columns)
    df.registerTempTable("properties")
    df.persist()

    # Task 2: let's do some basic analysis on the data.
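    # A hedged sketch for Task 2 (an assumption, not the workshop's solution): this relies on
    # the 'properties' temp table registered above and assumes the 'date' column was parsed
    # as a timestamp so that Spark SQL's year() applies to it.
    per_year = sqlContext.sql(
        "SELECT year(date) AS yr, COUNT(*) AS cnt "
        "FROM properties GROUP BY year(date) ORDER BY yr")
    for row in per_year.collect():
        print(row.yr, row.cnt)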
Beispiel #55
0
class SparkUtils:
    def __init__(self, master, app_name):
        if os.environ["pfe_env"] != "dev":
            self.sc = SparkContext(appName=app_name)
            self.sc.addFile('/FileProcessor.py')
            self.sc.addFile('/FileIndexProducer.py')
            self.sc.addFile('/FileIndexRepository.py')
            self.sc.addFile('/FileUrlProcessor.py')
            self.sc.addFile('/LdaTopicsDescriptionProducer.py')
            self.sc.addFile('/LdaTopicsDescriptionRepository.py')
            self.sc.addFile('/Parser.py')
            self.sc.addFile('/SparkProcessor.py')
            self.sc.addFile('/SparkUtils.py')
            self.sc.addFile('/TextMostCommonWordsExtractor.py')
            self.sc.addFile('/TextPreProcessor.py')
            self.sc.addFile('/TextSummarizer.py')
            self.sc.addFile('/thumbnail_temp.py')
            self.sc.addFile('/ThumbnailGenerator.py')
            self.sc.addFile('/NotificationConstants.py')
            self.sc.addFile('/RabbitMqConstants.py')
        else:
            self.sc = SparkContext(master=master, appName=app_name)
        self.sql_context = SQLContext(self.sc)

    # output rdd:(url, b'content")
    def read_files(self, path):
        return self.sc.binaryFiles(path)

    def rdd_to_df(self, rdd, schema):
        df = self.sql_context.createDataFrame(rdd, schema)
        return df

    def join_df(self, df0, df1, join_col, df0_selected_cols, df1_selected_cols):
        df0_selected_cols = ["df0."+x for x in df0_selected_cols]
        df1_selected_cols = ["df1."+x for x in df1_selected_cols]
        df0 = df0.alias('df0')
        df1 = df1.alias('df1')
        df = df0.join(df1, col("df0."+join_col) == col("df1."+join_col))\
            .select(df0_selected_cols + df1_selected_cols)
        return df
Beispiel #56
0
    return (numFailedPredictions, expectedFailedPredictions, numFalseAlarms,
            numGoodRecords)


# Prepare desired columns
desiredsmartnos = [1, 3, 5, 7, 9, 194, 197]
desiredcolumns = ['date', 'serial_number', 'model', 'failure']
for sno in desiredsmartnos:
    desiredcolumns.append('smart_' + str(sno) + '_normalized')
    desiredcolumns.append('smart_' + str(sno) + '_raw')

if __name__ == "__main__":
    sparkconf = SparkConf().setAppName('hddpredict')
    sparkcontext = SparkContext(conf=sparkconf)
    sparkcontext.addFile(
        'hdfs://ec2-34-204-54-226.compute-1.amazonaws.com:9000/libsvm-322',
        True)
    sparksql = SparkSession.builder.master('local').appName(
        'hddpredict').getOrCreate()

    # Load the entire dataset and project the wanted columns.
    # Then partition by individual hard disk and sort by date so we can
    # model each partition as a time series and compute the rate of change of attributes.
    # drivedatadf = sparksql.read.csv('/user/zixian/project/input/*.csv', inferSchema = True, header = True)
    drivedatadf = sparksql.read.csv(
        'hdfs://ec2-34-204-54-226.compute-1.amazonaws.com:9000/data/*.csv',
        inferSchema=True,
        header=True)
    drivedatadf = drivedatadf.select(desiredcolumns).fillna(0)
    drivedatadf.cache()
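
    # A hedged sketch (assumption) of the per-disk time ordering described in the comment above:
    # partition the rows by serial_number and order them by date with a window function, e.g. to
    # compute the day-over-day change of a SMART attribute.
    from pyspark.sql import functions as F
    from pyspark.sql.window import Window

    disk_window = Window.partitionBy('serial_number').orderBy('date')
    drivedatadf = drivedatadf.withColumn(
        'smart_5_raw_delta',
        F.col('smart_5_raw') - F.lag('smart_5_raw', 1).over(disk_window))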
'''
Source of school list:
http://schools.nyc.gov/schoolsearch/
'''

from __future__ import print_function

import sys
import os
from operator import add
from pyspark import SparkContext
from csv import reader

sc = SparkContext()
sc.addFile("src/helper/assign_basetype.py")
from assign_basetype import *
school_lines = sc.textFile("/user/ac5901/school_number.csv", 1)
school_numbers = school_lines.map(lambda x: x).collect()


def check_school(val):
    basetype = get_basetype(val)
    if basetype == 'TEXT' or basetype == 'INT':
        if val is None or len(str(val).strip()) == 0 or str(val) in [
                'Unspecified', 'NA', 'N/A', 'N?A', 'NA/'
        ]:
            return 'NULL'
        elif str(val) in school_numbers:
            return 'VALID'
        else:
            return 'INVALID'
Beispiel #58
0
# -*- coding: utf-8 -*-

from __future__ import absolute_import, print_function, division, unicode_literals

from pyspark import SparkConf, SparkContext, SparkFiles


if __name__ == '__main__':
    conf = SparkConf().setAppName('Pipe')
    sc = SparkContext(conf=conf)

    column_count_script = './scripts/columncount.py'
    column_count_script_name = 'columncount.py'
    sc.addFile(column_count_script)

    lines = sc.parallelize(['1,2,3', '4,5', '6', '7,8,9,10'])
    print(lines.pipe(SparkFiles.get(column_count_script_name)).collect())

    sc.stop()