Example #1
def main():
    from pyspark import SparkContext
    from pyspark.sql import SQLContext
    sc = SparkContext()
    sqlCtx = SQLContext(sc)
    csv2df_events(sqlCtx)
    sc.stop()
def main(input_path, output_path):

    sc = SparkContext(appName='Data_Analysis')

    # load raw dataset
    raw_rdd = sc.textFile(input_path).map(lambda x: x.split('|'))
    # validate whether the data fulfills the definition of the data dictionary
    validate_rdd = raw_rdd

    # load fixed cell master file for cgi and bbc mapping
    cell_master_dict = {}
    with open(CELL_MASTER_FILE, 'r') as f:
        for line in f:
            line = line.strip()
            line = line.split('|')
            cell_master_dict[line[0]] = line[9]

    transform_rdd = validate_rdd.map(lambda x: data_tranform(x, cell_master_dict))

    '''
    Filter out records for JABODETABEK
    '''
    Jabodetabek_rdd = transform_rdd.filter(lambda x: x[-1] == 'JABODETABEK')

    bbc_ci_number = Jabodetabek_rdd.map(lambda x: (x[-2], 1)).reduceByKey(lambda x,y : x+y).count()
    print 'Number of cell towers for Jabodetabek: %d' % bbc_ci_number

    Jabodetabek_rdd.saveAsTextFile(output_path)
class ZeppelinReporterTest(unittest.TestCase):
    def setUp(self):
        self.sc = SparkContext()
        self.sql = SQLContext(self.sc)
        self.df = self.sql.createDataFrame([(1, "a"), (1, None), (3, "c")])

    def tearDown(self):
        self.sc.stop()

    def test_output(self):
        with patch("pyddq.reporters.get_field") as get_field:
            baos = ByteArrayOutputStream()
            baos.jvm = self.df._sc._jvm

            get_field.return_value = baos.jvm_obj
            check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
            z = Mock()
            reporter = ZeppelinReporter(z)
            check.run([reporter])
            expected_output = """
%html
</p>
<h4>Checking [_1: bigint, _2: string]</h4>
<h5>It has a total number of 2 columns and 3 rows.</h5>
<table>
<tr><td style="padding:3px">&#10060;</td><td style="padding:3px">Column _1 is not a key (1 non-unique tuple).</td></tr>
<tr><td style="padding:3px">&#9989;</td><td style="padding:3px">Columns _1, _2 are a key.</td></tr>
</table>
<p hidden>
""".strip()
            self.assertEqual(baos.get_output(), expected_output)
Example #4
 def run(self):
     sc = SparkContext("local", "gender")
     sqlContext = SQLContext(sc)
     #StringType =(str, unicode)
     _out = self.output().open('w')
     #lines = sc.textFile("myUser.csv")
     #fobj = self.input().open("r")
     #lines = sc.textFile(fobj.name)
     print(type(self.required_tasks['insert_source'].output()))
     print(self.required_tasks['insert_source'])
     #print(self.input()['insert_source'].input())
     lines = sc.textFile("myUser.csv")
     parts = lines.map(lambda l: l.split(","))
     users = parts.map(lambda p: (p[0], p[1],p[2],p[3],p[4],p[5],p[6],p[7],
         p[8],p[9],p[10],p[11],p[12],p[13],p[14],p[15],p[16],p[17],p[18],p[19]))
     schemaString = "userId lmsUserId lmsName orgName name gender registrationDate emailId mothertounge highestEduDegree goals city state active firstAccesDate lastAccessDate allowCert yearOfBirth pincode aadharId"
     print(schemaString)
     _out.write(schemaString )
     fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
     schema = StructType(fields)
     #schemaUser = sqlContext.createDataFrame(users, schema)
     schemaUser = sqlContext.applySchema(users, schema)
     schemaUser.registerTempTable("users")
     results = sqlContext.sql("SELECT gender FROM users")
     genders = results.map(lambda p : (p,1))
     counts = genders.reduceByKey(lambda a, b: a + b) #.map(lambda t : ("Gender " + t(0) + " No " + t(1))).collect()
     for name in counts.collect():
         _out.write(str(name))
     _out.close()
Example #5
def do_all(f_path,out_name):
	sc = SparkContext()
	data = sc.textFile(f_path)

	data = data.map(parseKeepD).filter(lambda p: p[0] != None)

	# Scale Features
	features = data.map(lambda x: x[0].features)
	summary = Statistics.colStats(features)
	global means
	global varis
	means = summary.mean()
	varis = summary.variance()

	#scale the points
	data = data.map(lambda y: (conv_label_pt(y[0]),y[1]))

	#train model
	model = LinearRegressionWithSGD().train(data.map(lambda x: x[0]), intercept=True, regType='none')

	#calculate disparity
	disparity = data.map(lambda p: (p[0].label, model.predict(p[0].features), p[1]))  

	#calculate SSR for later
	ssr = disparity.map(lambda x: (x[0] - x[1])**2).sum()

	#keep N
	N = disparity.count()
	#compute mean squared error
	MSE = ssr/float(N)
	se = std_errors(data,MSE,N)
	disparity.saveAsTextFile(out_loc + out_name)

	sc.stop()
	return model.intercept,model.weights,se,disparity, ssr, N
Example #6
def query12_input(query_name, conf=None, output_persist=False):
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)

    # SQL statements can be run by using the sql methods provided by sqlContext
    sql = "use tpcds_text_db_1_50"
    _ = sqlContext.sql(sql)

#    web_sales_sql = "select * from web_sales"
#    web_sales = sqlContext.sql(web_sales_sql)
#    web_sales.persist()
#    web_sales.registerAsTable("web_sales")
#    item_sql = "select * from item"
#    item = sqlContext.sql(item_sql)
#    item.persist()
#    item.registerAsTable("item")
#    date_dim_sql = "select * from date_dim"
#    date_dim = sqlContext.sql(date_dim_sql)
#    date_dim.persist()
#    date_dim.registerAsTable("date_dim")
    sqlContext.cacheTable("web_sales")
    sqlContext.cacheTable("item")
    sqlContext.cacheTable("date_dim")

    # discard the first query
    output = execute_sql(query_name, sqlContext, output_persist)
    # check the re-run statistics
    output = execute_sql(query_name, sqlContext)
    output['describe'] = output['output'].describe().show()

    sc.stop()
    return output
Example #7
def main():
	inputs = sys.argv[1]
	output = sys.argv[2] 

	conf = SparkConf().setAppName('scalable multiplication')
	sc = SparkContext(conf=conf)
	assert sc.version >= '1.5.1'

	text = sc.textFile(inputs)

	# sbaronia - Split the row to get individual numbers
	row = text.map(lambda line: line.split())
	
	# sbaronia - calling element_wise_product on individual line 
	# and then adding all the returned 10x10 matrix to get
	# final matrix
	sub = row.map(element_wise_product).reduce(add_tuples)

	# sbaronia - writing formatted output to a file in 
	# a 10x10 matrix
	result = open(output, 'w')

	count = 0
	for i in range(len(sub)):
		result.write(str(sub[i]) + " ")
		count += 1
		if (count == 10):
			result.write("\n")
			count = 0

	result.close()
Example #8
def main(argList):	
	# Process command line args
	if len(argList) >= 2:
		pass
	else:
		print ("no input file specified and or output")
		usage()
		sys.exit()
		
	if '-inputPartition' in argList:
		inp = int(argList[argList.index('-inputPartition') + 1])
	else:
		inp = 1

	if '-outputPartition' in argList:
		onp = int(argList[argList.index('-outputPartition') + 1])
	else:
		onp = inp
		
	
	# Create SparkContext (non-local mode)
	sc = SparkContext() 
	
	
	irdd = sc.textFile(argList[0], inp, use_unicode=True).map(lambda x: (x[0:10],x[10:]))
	ordd = irdd.sortByKey(True, onp).map(lambda x: (x[0] + x[1].strip('\n')) + '\r')
	ordd.saveAsTextFile(argList[1]+'/output')
def main():
    cleanup()

    sc = SparkContext()
    spark = SparkSession(sc)
    path = os.path.join(mysql_export_dir, "name_string_indices.tsv")

    df = spark.read.csv(path, header=True, inferSchema=True, sep='\t', nullValue='NULL')

    names = df.select('name').rdd.map(lambda r: r['name'])
    names_json = parse_spark(sc, names) \
        .map(json.loads) \
        .zip(df.rdd)

    synonym_names = names_json.filter(lambda n: is_synonym(n))
    accepted_names = names_json.filter(lambda n: not is_synonym(n))

    synonym_names_with_accepted_columns = synonym_names \
        .map(to_key_value) \
        .leftOuterJoin(accepted_names.map(to_key_value)) \
        .map(add_accepted_data_to_synonym_name)
    accepted_names_with_accepted_columns = accepted_names \
        .map(add_accepted_data_accepted_name)
    sc.union([synonym_names_with_accepted_columns, accepted_names_with_accepted_columns]) \
        .map(join_fields) \
        .saveAsTextFile(output_dir_name_string_indices)
Example #10
def main():
    sc = SparkContext( appName="Transforming Eff Care" )
    src = sc.textFile(utils.data_home + "/measure_dates.csv")

    transformed = src.map(utils.to_row_sep).map(transform_row).map(utils.to_row_string)

    transformed.saveAsTextFile(utils.data_home + "/measures_data")
Example #11
def KMeansModel(dataPath, label, k, character, master):
    sc = SparkContext(master)
    data = sc.textFile(dataPath).map(lambda line: line.replace(character, ','))

    if label == 0:
        label_sum = data.map(lambda line: line.split(',')).map(lambda data: (float(data[0]), 1)).reduceByKey(add).collect()
        label = data.map(lambda line: line.split(',')).map(lambda data: float(data[0])).collect()        
        train_data = data.map(lambda line: line.split(',')).map(lambda x: map(lambda part: float(part), x[1:len(x)]))
    else:
        label_sum = data.map(lambda line: line.split(',')).map(lambda data: (float(data[-1]), 1)).reduceByKey(add).collect()
        label = data.map(lambda line: line.split(',')).map(lambda data: float(data[-1])).collect()        
        train_data = data.map(lambda line: line.split(',')).map(lambda x: map(lambda part: float(part) if part is not None else '', x[:len(x) - 1]))
    model = km.train(train_data, k)
    predict_data = train_data.collect()
    train = len(predict_data)
    acc = 0
    
    for i in range(len(label_sum)):
        ksum = np.zeros(k, dtype = int)
        cur_label = label_sum[i][0]
        for j in range(train):
            if label[j] == cur_label:
                ksum[model.predict(predict_data[j])] += 1
        acc += max(ksum)

    string = "KMeans Result: \n"
    center = model.centers
    for i in range(k):
        cur = str(i) + ":" + str(center[i]) + '\n'
        string += cur  
    string = string + "Acc: " + str((float(acc)/train) * 100) + "%"    
    sc.stop()
    return string
def main():

	
	input = sys.argv[1]
	output = sys.argv[2]
	
	
	conf = SparkConf().setAppName('Matrix Multiplication')
	sc = SparkContext(conf=conf)
	assert sc.version >= '1.5.1'
	
	row = sc.textFile(input).map(lambda row : row.split(' ')).cache()
	ncol = len(row.take(1)[0])
	intermediateResult = row.map(permutation).reduce(add_tuples)
	
	outputFile = open(output, 'w')

	# reshape the flat result into rows of ncol elements each
	result = [intermediateResult[x:x+ncol] for x in range(0, len(intermediateResult), ncol)]
	
	
	for row in result:
		for element in row:
			outputFile.write(str(element) + ' ')
		outputFile.write('\n')
		
	outputFile.close()
Example #13
class TestWordCounter(unittest.TestCase):

    def setUp(self):
        conf = SparkConf().setAppName("appTest").setMaster("local[*]")
        self.sc = SparkContext(conf=conf)
        self.counter = WordCounter()

    def tearDown(self):
        self.sc.stop()

    def test_when_exist_one_movie_and_counter(self):
        movieList = ["1993::Toy Story Toy (1995)::Animation|Children's|Comedy",
                     "1993::ToyA StoryA ToyA (1995)::Animation|Children's|Comedy"]
        result = (('ToyA', ['::ToyA StoryA ToyA (1995)::']),
                  ('Toy', ['::Toy Story Toy (1995)::']))
        movies = self.sc.parallelize(movieList)
        self.assertEqual(self.counter.getMaxValues(movies), result)

    def test_when_exist_one_movie_and_counter_moreMovies(self):
        movieList = ["1993::Toy Story Toy (1995)::Animation|Children's|Comedy",
                     "1993::ToyA StoryB ToyA (1995)::Animation|Children's|Comedy",
                     "1993::ToyA StoryA ToyA (1995)::Animation|Children's|Comedy"]
        result = ('ToyA', ['::ToyA StoryB ToyA (1995)::', '::ToyA StoryA ToyA (1995)::'])
        movies = self.sc.parallelize(movieList)
        self.assertEqual(self.counter.getMaxValues(movies), result)
Example #14
def recom(matrix_file_name, user_file_name, output="re.out"):
    sc = SparkContext("local[8]", "Recommendation")
    """ Reads in a sequence file FILE_NAME to be manipulated """
    matrix = sc.sequenceFile(matrix_file_name)
    user = sc.sequenceFile(user_file_name)

    """
    - flatMap takes in a function that will take one input and outputs 0 or more
      items
    - map takes in a function that will take one input and outputs a single item
    - reduceByKey takes in a function, groups the dataset by keys and aggregates
      the values of each key
    """
    user_tuples = user.flatMap(flat_user) \
                 .map(map_user) \
                 .sortByKey(keyfunc=lambda k: int(k))

    keys = user_tuples.keys().collect()

    matrix_tuples = matrix.flatMap(flat_matrix) \
                          .map(map_matrix) \
                          .filter(lambda x: x[0] in keys)
    global mt 
    mt = matrix_tuples.collectAsMap()

    recm = user_tuples.flatMap(flat_recom) \
                      .reduceByKey(reduce_recom) \
                      .filter(lambda x: x[0] not in keys) \
                      .sortBy(lambda (key, value): int(value))
 
    """ Takes the dataset stored in counts and writes everything out to OUTPUT """
    recm.coalesce(1).saveAsTextFile(output)
Example #15
def stackexchange_json_spark_job():
    """
    Spark job to convert json data from hdfs into ques and ans.
    Result is written into elasticsearch for text based search from user.
    """
    server = bluebook_conf.HDFS_FQDN
    conf = SparkConf().setAppName("stackexchange_json_spark_job")
    spark_context = SparkContext(conf=conf)    
    json_ques_folder_address = "hdfs://" + server + "/" +\
                              bluebook_conf.STACKEXCHANGE_JSON_QUES_FOLDER_NAME +\
                              "/part-*"
    json_ans_folder_address = "hdfs://" + server + "/" +\
                              bluebook_conf.STACKEXCHANGE_JSON_ANS_FOLDER_NAME +\
                              "/part-*"
    
    # Ques and ans files are read separately from hdfs
    ques_file = spark_context.textFile(json_ques_folder_address)
    ans_file = spark_context.textFile(json_ans_folder_address)
    ques_tups = ques_file.map(lambda line: stackexchange_json_mapper(line, 'ques'))
    ans_tups = ans_file.map(lambda line: stackexchange_json_mapper(line, 'ans'))

    # Join accepted answers with their respective questions
    ques_ans = ques_tups.join(ans_tups).map(lambda x: (x[0], {'ques': x[1][0], 'ans': x[1][1]}))
    ques_ans.saveAsNewAPIHadoopFile(
        path='-', 
        outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
        keyClass="org.apache.hadoop.io.NullWritable", 
        valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable", 
        conf=stackoverflow_es_write_conf)
def main(argv):
    
    ''' matrixDirectory: the hdfs directory where we find users profile matrix. It is assumed to be compressed 
                        and split in several files.
        streamFiles: the files used to update the matrix. In userId|country|artistId|trackId format
        outputFile: optional output directory for the updated matrix. By default, we simply overwrite the current one'''
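    # Illustrative record formats inferred from the parsing code below (assumed, not part
    # of the original):
    #   matrix line : "<userId> <trackId> <count>"        e.g. "42 1337 3"
    #   stream line : "userId|country|artistId|trackId"   e.g. "42|FR|7|1337"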
    matrixDirectory, streamFiles, outputFile = getArguments(argv)

    sc = SparkContext(appName="usersProfile")
    
    # open both matrix and non processed stream_xxxxxxxx files
    # Turn into (key, value) pair, where key = (user, track), to prepare the join
    matrix = (sc.textFile(matrixDirectory + "*.gz")
                .map(lambda line: map(int, line.split(" ")))
                .map(lambda t: ((t[0], t[1]), t[2])))

    streamData = (sc.textFile(streamFiles)
                    .map(lambda line: line.split("|"))
                    .map(lambda t: ((int(t[0]), int(t[3])), 1)))

    outData = (matrix.join(streamData)  # here each entry looks like ((user, track), (count, 1))
                     .map(lambda t: (t[0], sum(t[1])))  # compute new count => ((user, track), new_count)
                     .sortByKey()
                     .map(lambda t: " ".join(map(str, (t[0][0], t[0][1], t[1])))))  # prepare output file

    saveAsTextFile(outData, path = outputFile, overwrite = True)
Example #17
def run():
#if __name__ == "__main__":
    sc = SparkContext(master = spark_addr, appName= app_name)
    rdd = sc.textFile(hdfs_addr + file_path, 2).map(lambda line:format_list(line)).cache()
#    rdd = sc.parallelize(test_list,4).cache()
#********create rules************
    supp = float(rdd.count())*supp_rate
    item = create_item(rdd)   #create one item
    item = freq(rdd,item,supp)
    one_item = item
    freq_items = item
    while item.count() > 0:
        more_item = item_plus(sc,item)
        item = freq(rdd,more_item,supp)
        freq_items = freq_items.union(item)

    # the resulting freq_items is a key-value RDD; the keys are frozensets
  #  rules = produce_rule(freq_items,one_item)
#    rule_result = rules.collect()
    freq_result = freq_items.collect()
#    one_result = one_item.keys().collect()
    one_result = one_item.keys().collect()
    dict_rule = produce_rule(freq_result,one_result)
    out,total = probability(rdd,dict_rule,0.5)
    out1 =out.collect()
    print "$$$$$$$$$$$$$$$$$$$$$$$out=",out1,"all=",total

#****************************
    
    sc.stop()
    return freq_result,dict_rule
Example #18
class SparkContextFactory:
  def __init__(self):
    # not sure why windows environment variable can't be read, I set it 
    ##os.environ["SPARK_HOME"] = "C:\Spark"
    # not sure why windows environment variable can't be read, I set it 
    ##os.environ["HADOOP_CONF_DIR"] = "C:\hdp\bin"
    ##sys.path.append("C:\Spark\python")
    ##sys.path.append("C:\Spark\bin")

    # specify spark home
    os.environ["SPARK_HOME"] = "/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p0.4/lib/spark"
    # specify pyspark path so its libraries can be accessed by this application
    sys.path.append("/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p0.4/lib/spark/python")
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SQLContext

    self.conf = SparkConf().setMaster("yarn-client")
    self.conf.setAppName("MrT")
    self.conf.set("spark.executor.memory", "5g")
    self.conf.set("spark.driver.memory", "10g")

    self.sc = SparkContext(conf = self.conf, pyFiles =
    ["ComputeCovHistory.py", "go.py", "risk_DSconvert.py", "ewstats.py", "ewstatsRDD.py", "ewstatswrap.py"])

    """
    toDF method is a monkey patch executed inside SQLContext constructor
    so to be able to use it you have to create a SQLContext first
    """
    self.sqlContextInstance = SQLContext(self.sc)
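    # Minimal usage sketch (assumed, not part of the original): once the SQLContext
    # exists, RDD.toDF() becomes available, e.g.
    #   df = self.sc.parallelize([(1, "a")]).toDF(["id", "val"])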


  def disconnect(self):
    self.sc.stop()
Example #19
def init_spark_context(details=[]):
    global spark_context
    if spark_context:
        return
    build_type = yb_dist_tests.global_conf.build_type
    from pyspark import SparkContext
    # We sometimes fail tasks due to unsynchronized clocks, so we should tolerate a fair number of
    # retries.
    # https://stackoverflow.com/questions/26260006/are-failed-tasks-resubmitted-in-apache-spark
    # NOTE: we never retry failed tests to avoid hiding bugs. This failure tolerance mechanism
    #       is just for the resilience of the test framework itself.
    SparkContext.setSystemProperty('spark.task.maxFailures', str(SPARK_TASK_MAX_FAILURES))
    if yb_dist_tests.global_conf.build_type == 'tsan':
        logging.info("Using a separate default Spark cluster for TSAN tests")
        default_spark_master_url = DEFAULT_SPARK_MASTER_URL_TSAN
    else:
        logging.info("Using the regular default Spark cluster for non-TSAN tests")
        default_spark_master_url = DEFAULT_SPARK_MASTER_URL

    spark_master_url = os.environ.get('YB_SPARK_MASTER_URL', default_spark_master_url)
    details += [
        'user: {}'.format(getpass.getuser()),
        'build type: {}'.format(build_type)
        ]

    if 'BUILD_URL' in os.environ:
        details.append('URL: {}'.format(os.environ['BUILD_URL']))

    spark_context = SparkContext(spark_master_url, "YB tests ({})".format(', '.join(details)))
    spark_context.addPyFile(yb_dist_tests.__file__)
Example #20
def solve_puzzle(master, output, height, width, slaves):
    global HEIGHT, WIDTH, level
    HEIGHT=height
    WIDTH=width
    level = 0

    sc = SparkContext(master, "python")

    """ YOUR CODE HERE """
    NUM_WORKERS = slaves

    sol = Sliding.solution(WIDTH, HEIGHT)
    """ MAP REDUCE PROCESSING CODE HERE """
    level_pos = sc.parallelize((make_state(level, sol),))
    prev_size, size = 0, 1

    while prev_size != size:
        level += 1
        if level % 10 == 0:
            level_pos = level_pos.partitionBy(PARTITION_COUNT)
        level_pos = level_pos.flatMap(bfs_flat_map).reduceByKey(bfs_reduce)
        prev_size = size
        size = level_pos.count()

    """ OUTPUT CODE HERE """
    level_pos = level_pos.map(unhash_board)
    level_pos.coalesce(NUM_WORKERS).saveAsTextFile(output)

    sc.stop()
Example #21
def load_cut_to_rdd(input_file, result_file):
    sc = SparkContext(appName='PythonKMeans',master="mesos://219.224.135.91:5050")
    lines = sc.textFile(input_file)
    data = lines.map(parseKV).cache()

    doc_term_tf = data.reduceByKey(add).cache()

    num_doc = doc_term_tf.map(lambda ((tid, term), tf): tid).distinct().count()
    terms_list = doc_term_tf.map(lambda ((tid, term), tf): term).distinct().collect()
    num_term = len(terms_list)

    term_idf = doc_term_tf.map(
            lambda ((tid, term), tf): (term, 1.0)
            ).reduceByKey(add).mapValues(lambda idf: math.log(float(num_doc) / (idf+1)))
    tfidf_join = doc_term_tf.map(
            lambda ((tid, term), tf): (term, (tid, tf))).join(term_idf)
    tfidf = tfidf_join.map(lambda (term, ((tid, tf), idf)): (tid, (terms_list.index(term), tf*idf)))

    doc_vec = tfidf.groupByKey().mapValues(lambda feature : Vectors.sparse(num_term, feature).toArray()).cache()

    nonzero_count = 0
    f = open(result_file,'w')
    f.write('%s %s\r\n'%(num_doc, num_term))
    for (tid, feature) in doc_vec.collect():
        for num in feature:
            f.write(str(num)+"\t")
        f.write("\n")
    f.close()
    sc.stop()


    return
Example #22
def solve_puzzle(master, output, height, width, slaves):
    global HEIGHT, WIDTH, level
    HEIGHT=height
    WIDTH=width
    level = 0

    sc = SparkContext(master, "python")

    """ YOUR CODE HERE """
    """ YOUR MAP REDUCE PROCESSING CODE HERE """
    solution=Sliding.solution(WIDTH, HEIGHT)
    sol = Sliding.board_to_hash(WIDTH, HEIGHT, solution)
    data = sc.parallelize([(sol,level),])
    counter = 0
    curLen = 1 
    while(counter < curLen):
        level += 1
        data = data.flatMap(bfs_flat_map)
        

        if (level% 12 == 0):
            data = data.partitionBy(PARTITION_COUNT)
        data = data.reduceByKey(bfs_reduce)
        if (level% 6 == 0):
            counter = curLen
            curLen = data.count()
        
        
    """ YOUR OUTPUT CODE HERE """
    data.coalesce(slaves).saveAsTextFile(output)
    sc.stop()
def main(name, divide):

    """
    old_g = pickle.load(open("/net/data/facebook/facebook-ucsb/Facebook_2008/"+name +"/original_pickles/"+name +".pickle", 'r'))
    new_g = networkx.Graph()
    for node, friends in old_g.adj.iteritems():
        if node not in new_g.nodes():
            new_g.add_node(node)
        for friend in friends.iterkeys():
            new_g.add_node(friend)
            new_g.add_edge(node, friend)
            """
    # serialize the networkx graph as text files of edgelist
    # into a text file for workers to read

    #   networkx.write_edgelist(new_g, "edgelist/"+name, data=False)
    #   subprocess.check_call("hdfs dfs -put edgelist/"+name+ " edgelist/", shell=True)

    new_g = networkx.read_adjlist(name + "_list.txt")  # Egypt_list is an edge list
    sc = SparkContext(appName="Sorted_removal")

    dataG = json_graph.node_link_data(new_g)
    stringG = json.dumps(dataG)
    originalG = sc.broadcast(stringG)
    edges = sc.textFile("hdfs://scrapper/user/xiaofeng/edgelist/" + name, 192 * 4 * int(divide))
    costs = edges.map(lambda line: line.split(" ")).map(lambda edge: edge_to_cost(edge, originalG.value))
    costs.saveAsTextFile("hdfs://scrapper/user/xiaofeng/costs_" + name)
    sc.stop()
    subprocess.check_call("hdfs dfs -get costs_" + name + " /home/xiaofeng/facebook/FacebookProject/costs/", shell=True)
    Reformat("/home/xiaofeng/facebook/FacebookProject/costs/costs_" + name + "/", name)
def count_triangles(data, master="local[2]"):
    """
    @brief: Count triangles using Spark
    @param data: The data location for the input files
    @param master: The master URL as defined at
    https://spark.apache.org/docs/1.1.0/submitting-applications.html#master-urls
    """

    #################  NO EDITS HERE ###################
    assert not os.path.exists("triangles.out"), "File: triangles.out \
    already exists"
    sc = SparkContext(master, "Triangle Count")
    start = time()
    ###############  END NO EDITS HERE  ################
    # TODO: Your code goes here!
    people = sc.textFile(data)
    triad = people.flatMap(GetTriad).reduceByKey(add).filter(lambda x: x[1]>1)
    #triadCount = triad.map(lambda x: (x,1))
    #triadSum = triadCount.reduceByKey(add)
    #triangles = triadSum.filter(lambda x: x[1]>1)
    #output = triangles.collect()
    output = triad.collect()
    #triangles.saveAsTextFile("test1")
    #################  NO EDITS HERE  ###################
    print "\n\n*****************************************"
    print "\nTotal algorithm time: %.4f sec \n" % (time()-start)
    print "*****************************************\n\n""" 
    ###############  END NO EDITS HERE ################
    with open("triangles.out", "wb") as f:
        for friends in output:
            f.write(friends[0]+"\n") # TODO: Loop with f to write your result to file serially
        pass
Example #25
    def bmRun(self):
        """
        Connect DB from Spark and Run/Profile Query
        """
        #create output file for results
        print "Create benchmark output file for recoring..."
        file_out = open("/Users/mira67/Downloads/benchmark_output.txt", "w")
        print "start query evaluation, load tables from DB and register tables in Spark..."

        #load data with Spark
        with Timer() as tm:
            sc = SparkContext("local","penguin")
            #sc = SparkContext(master=local[2])
            sqlContext = SQLContext(sc)
             
            #queries test here, depends on queries to load table in memory
            df1 =sqlContext.read.jdbc(url=self.url, table = self.tbName[0],lowerBound = 0, upperBound = 350, numPartitions=200)#dbtable is variable
            df1.registerTempTable(self.tbName[0])

            df2 =sqlContext.read.jdbc(url=self.url, table = self.tbName[1],lowerBound = 0, upperBound = 350, numPartitions=200)#dbtable is variable
            df2.registerTempTable(self.tbName[1])

            #register helper functions for SQL
            sqlContext.registerFunction("MONTH", lambda x: x[5:7], StringType())#grab Month
            sqlContext.registerFunction("YEAR", lambda x: x[0:4], StringType())
            sqlContext.registerFunction("DAY", lambda x: x[8:10], StringType())

            rdf1 = sqlContext.sql("SELECT * FROM "+self.tbName[0])
            rdf2 = sqlContext.sql("SELECT * FROM " + self.tbName[1])
            sqlContext.registerDataFrameAsTable(rdf1, self.mtb[0])
            sqlContext.registerDataFrameAsTable(rdf2, self.mtb[1])

        mem_use = self.memory_usage_psutil()
        print "memory_use_load %s" %mem_use
        print "=> elasped load data: %s ms" % (tm.secs * 1000)

        #Query with Spark
        with Timer() as tm:
            #query (the source tables must be registered first, as above)
            rdf = sqlContext.sql(self.sqlStm)
            print "Data schema from query:"
            rdf.printSchema()
            #hist of BT values
            #Todo
        mem_use = self.memory_usage_psutil()
        print "memory_use_load %s" %mem_use
        print "=> elasped: %s ms" % (tm.secs * 1000)

        file_out.write("Query Time %s Memory %s\n" % (str(tm.secs * 1000),str(mem_use))) 
                
        #example enabled
        day1 = sqlContext.sql("SELECT * FROM ssmi t1, map t2 WHERE t1.DATE BETWEEN '1990-01-01' AND '1990-01-01' AND t1.LOCID = t2.ID ORDER BY t1.LOCID")
        #call plot
        demoplt = qplt.queryPlot()
        demoplt.qMapDemo(day1)

        
        #stop sparkcontext
        sc.stop()
Example #26
def main(arglist):

    with open("log_file_v.txt", "a") as f:
        f.write("Start time of validation...... %s\n" % datetime.datetime.now())

    print("Start time of validation...... %s" % datetime.datetime.now())

    # mapreduce params
    output = arglist[0]
    minPartitions = int(arglist[1])

    # initialize
    sc = SparkContext(appName="PythonValidate")

    # rdd = sc.textFile(output_file_name, minPartitions=minPartitions)
    rdd = sc.wholeTextFiles(output, minPartitions=minPartitions)
    print('partitions', rdd.getNumPartitions())
    error_count = rdd.mapPartitions(separateBlocks).sum()

    sc.stop()

    print("End time of validation...... %s" % datetime.datetime.now())
    with open("log_file_v.txt", "a") as f:
        f.write("End time of validation...... %s\n" % datetime.datetime.now())
        f.write("Error count of sorted file...... %s" % error_count)

    f.close()
Example #27
def main(argv=None):
    '''this is called if run from command line'''

    parser = argparse.ArgumentParser()
    parser.add_argument('-i','--input', help="Seq input file on cluster.", required=True)
    parser.add_argument('-o','--output', help="UTF-8 output file on cluster.", required=False)
    parser.add_argument('-p','--printToLog', help="Print results to log.", required=False, action='store_true')
    args = parser.parse_args()

    sc = SparkContext()
    global goodJsonRecords, badJsonRecords
    goodJsonRecords = sc.accumulator(0)
    badJsonRecords = sc.accumulator(0)
    data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
    tagCounts = data.values().flatMap(getTokens).countByValue()

    # So far, this code isn't useful.  The output file is written by the
    # master node into an isolated folder, and I don't know of a way to
    # retrieve it.
    if args.output != None:
        with codecs.open(args.output, 'wb', 'utf-8') as f:
            for k in sorted(tagCounts):
                f.write(k + " " + str(tagCounts[k]) + "\n")

    print "========================================"
    print "goodJsonRecords = %d" % goodJsonRecords.value
    print "badJsonRecords = %d" % badJsonRecords.value
    if args.printToLog:
        for k in sorted(tagCounts):
            print json.dumps(k), tagCounts[k]
    print "========================================"
Example #28
def stackexchange_xml_spark_job():
    server = bluebook_conf.HDFS_FQDN
    conf = SparkConf()

    xml_file_address = "hdfs://" + server + "/" +\
                       bluebook_conf.STACKEXCHANGE_XML_FOLDER_NAME +\
                       bluebook_conf.STACKEXCHANGE_XML_FILE_NAME
                         
    json_ques_folder_address = "hdfs://" + server + "/" +\
                               bluebook_conf.STACKEXCHANGE_JSON_QUES_FOLDER_NAME
    json_ans_folder_address = "hdfs://" + server + "/" +\
                              bluebook_conf.STACKEXCHANGE_JSON_ANS_FOLDER_NAME
        
    conf.setAppName('stackexchange_xml_spark_job')
    spark_context = SparkContext(conf=conf)
        
    file = spark_context.textFile(xml_file_address)

    # Ques and Ans files are stored separately depending on their 'posttypeid'
    # Ques -> posttypeid == 1
    # Ans -> posttypeid == 2
    ques = file.map(stackexchange_xml_mapper)\
               .filter(lambda dic: 'posttypeid' in dic.keys())\
               .filter(lambda dic: dic['posttypeid'] == '1')\
               .map(lambda d: jsoner(d))
    ans = file.map(stackexchange_xml_mapper)\
               .filter(lambda dic: 'posttypeid' in dic.keys())\
               .filter(lambda dic: dic['posttypeid'] == '2')\
               .map(lambda d: jsoner(d))
    ques.saveAsTextFile(json_ques_folder_address)
    ans.saveAsTextFile(json_ans_folder_address)
Example #29
    def __init__(self, file_path, train_file, test_file, real_file=None):
        """
        file_path: the folder where data files reside
        train_file: (user, item, rating) quote records
        test_file: (user, item) records, preferences to be predicted
        real_file: (user, option, value) real purchase records, can be none if it doesn't exist
        For this specific project:
        item here is the combination of options with their values,
            e.g. item 10 denotes option A with choice 0; item 21 denotes option B with choice 1
        rating is the number of quotes for a certain item by a user
        """
        self.file_path = file_path
        config = SparkConf().setMaster("local").setAppName("Kaggle")\
            .set("spark.executor.memory", "2g")\
            .set("spark.storage.memoryFraction", "1")

        sc = SparkContext(conf=config)

        self.train_data = sc.textFile("file:" + self.file_path + train_file).cache()\
            .map(lambda line: array([float(x) for x in line.split(',')]))

        self.test_data = sc.textFile("file:" + self.file_path + test_file).cache()\
            .map(lambda line: [float(x) for x in line.split(',')])

        if real_file:
            self.real_data = sc.textFile("file:" + self.file_path + real_file).cache()\
                .map(lambda line: [float(x) for x in line.split(',')]).map(lambda r: ((r[0], r[1]), r[2]))
Example #30
def main():
    conf = SparkConf().set("spark.ui.showConsoleProgress", "false")
    sc = SparkContext(appName="PythonStatusAPIDemo", conf=conf)

    def run():
        rdd = sc.parallelize(range(10), 10).map(delayed(2))
        reduced = rdd.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
        return reduced.map(delayed(2)).collect()

    result = call_in_background(run)
    status = sc.statusTracker()
    while result.empty():
        ids = status.getJobIdsForGroup()
        for id in ids:
            job = status.getJobInfo(id)
            print "Job", id, "status: ", job.status
            for sid in job.stageIds:
                info = status.getStageInfo(sid)
                if info:
                    print "Stage %d: %d tasks total (%d active, %d complete)" % \
                          (sid, info.numTasks, info.numActiveTasks, info.numCompletedTasks)
        time.sleep(1)

    print "Job results are:", result.get()
    sc.stop()
Example #31
# input: a tuple (x, y) (longitude, latitude) and a list of (id, (longitude, latitude)) of centers
def closestCenter(xy, centers):
    min_dist = haversine(xy, centers[0][1])
    min_center_id = centers[0][0]
    for i in range(1, len(centers)): # lengths should be 5
        dist = haversine(xy, centers[i][1])
        if dist < min_dist:
            min_dist = dist
            min_center_id = centers[i][0] # id
    return ( min_center_id, (xy[0], xy[1], 1) )

#-------- program --------#
appName = "Kmeans App"
conf = SparkConf().setAppName(appName)
sc = SparkContext(conf=conf)

HDFS = "hdfs://master:9000/"
rides = sc.textFile(HDFS + "yellow_tripdata_1m.csv") # pointer to the file
filter_rides = rides.filter(myfilter)  # remove incorrect data lines (contain 0.0 coordinate)
coords = filter_rides.map(lambda line: (float(line.split(",")[3]), float(line.split(",")[4]))) # map data to correct format
#TODO sc.cache() for extra grade
centers = [(idx, tup) for idx, tup in enumerate(coords.take(5))] # list type, not rdd

#print("\n\n-------- {} --------\n{}\n\n".format('Beginning', centers))

for i in range(0, MAX_ITER):
    mapped = coords.map(lambda tup: closestCenter(tup, centers)) # emit (id of center,  )
    avg = mapped \
            .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1], a[2] + b[2])) \
            .mapValues(lambda v: (v[0]/v[2], v[1]/v[2])) # reduce by key (id of center) and transform only the value
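    # The snippet ends here; a minimal sketch of the remaining step (assumed, not part of
    # the original): pull the averaged coordinates back to the driver as the new centers.
    centers = avg.collect()  # list of (center_id, (longitude, latitude)) for the next iteration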
Example #32
import os
import math
import numpy as np
import matplotlib.pyplot as plt
from skimage import io
from skimage.color import rgb2gray
from skimage.color.adapt_rgb import adapt_rgb
from skimage.exposure import rescale_intensity
from skimage.morphology import skeletonize
from scipy import ndimage as ndi
from django.views.decorators.http import condition
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.ml.linalg import Vectors

os.system("export _JAVA_OPTIONS='-Xms1g -Xmx40g'")
conf = (SparkConf().set("spark.driver.maxResultSize", "5g"))
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)


def as_gray(image_filter, image, *args, **kwargs):
    gray_image = rgb2gray(image)
    return image_filter(gray_image, *args, **kwargs)

@adapt_rgb(as_gray)
def original_gray(image):
    return image

@adapt_rgb(as_gray)
def skeleton_gray(image):
    return skeletonize(image)
Example #33
			if not bf[nOffset + k]:
				bHit = False
				break
			nOffset += bits_per_slice
		if bHit == True:
			yield t

	


if __name__ == '__main__':

	sApp = 'spark'
	nPart = 38*14*4
	#sRef = op.join(sHdfsDir, 'hg38.fa.nb.enc.gzip')
	sRef = op.join(sHdfsDir, 'chr21.fa.nb.enc.gzip')
	sInput = op.join(sHdfsDir, 'first1M.fa.nb.enc')
	sSeeds = op.join(sHdfsDir, 'seed.enc')

	# print default SparkConf
	sf = SparkConf()
	print sf.toDebugString()
	sc = SparkContext(appName=sApp)

	rdd = sc.textFile(op.join(sHdfsDir,'half.enc'), use_unicode=False)
	nTotal = rdd.count()

	sc.stop()

	print nTotal
Example #34
"""
Created on Mon Dec 14 16:13:29 2020

@author: prach
"""

import re
from pyspark import SparkConf, SparkContext


def normalizeWords(text):
    return re.compile(r'\W+', re.UNICODE).split(text.lower())


conf = SparkConf().setMaster("local").setAppName("WordCount")
sc = SparkContext(conf=conf)

input = sc.textFile("file:///sparkcourse/book.txt")

rdd = input.flatMap(normalizeWords)

rdd1 = rdd.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)

rdd2 = rdd1.map(lambda x: (x[1], x[0])).sortByKey()

rdd3 = rdd2.sortByKey()

results = rdd3.collect()

for result in results:
    count = str(result[0])
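    # The example is cut off here; a likely continuation (assumed, not in the original)
    # prints each word alongside its count:
    word = result[1]
    if word:
        print(word + ":\t" + count)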
Example #35
class HailContext(object):
    """The main entrypoint for Hail functionality.

    :param sc: spark context, will be auto-generated if None
    :type sc: :class:`.pyspark.SparkContext`

    :param appName: Spark application identifier

    :param master: Spark cluster master

    :param local: local resources to use

    :param log: log path

    :param quiet: suppress log messages

    :param append: write to end of log file instead of overwriting

    :param parquet_compression: level of on-disk annotation compression

    :param min_block_size: minimum file split size in MB

    :param branching_factor: branching factor for tree aggregation

    :param tmp_dir: temporary directory for file merging

    :ivar sc: Spark context
    :vartype sc: :class:`.pyspark.SparkContext`
    """

    def __init__(self, sc=None, appName="Hail", master=None, local='local[*]',
                 log='hail.log', quiet=False, append=False, parquet_compression='uncompressed',
                 min_block_size=1, branching_factor=50, tmp_dir='/tmp'):
        from pyspark import SparkContext
        SparkContext._ensure_initialized()

        self._gateway = SparkContext._gateway
        self._jvm = SparkContext._jvm

        Env._jvm = self._jvm
        Env._gateway = self._gateway

        # hail package
        self._hail = getattr(self._jvm, 'is').hail

        driver = scala_package_object(self._hail.driver)

        if not sc:
            self._jsc = driver.configureAndCreateSparkContext(
                appName, joption(master), local, parquet_compression, min_block_size)
            self.sc = SparkContext(gateway=self._gateway, jsc=self._jvm.JavaSparkContext(self._jsc))
        else:
            self.sc = sc
            # sc._jsc is a JavaSparkContext
            self._jsc = sc._jsc.sc()

        driver.configureHail(branching_factor, tmp_dir)
        driver.configureLogging(log, quiet, append)

        self._jsql_context = driver.createSQLContext(self._jsc)
        self._sql_context = SQLContext(self.sc, self._jsql_context)

    def _jstate(self, jvds):
        return self._hail.driver.State(
            self._jsc, self._jsql_context, jvds, scala_object(self._jvm.scala.collection.immutable, 'Map').empty())

    def _run_command(self, vds, pargs):
        jargs = jarray(self._jvm.java.lang.String, pargs)
        t = self._hail.driver.ToplevelCommands.lookup(jargs)
        cmd = t._1()
        cmd_args = t._2()
        jstate = self._jstate(vds._jvds if vds != None else None)

        try:
            result = cmd.run(jstate, cmd_args)
        except Py4JJavaError as e:
            raise_py4j_exception(e)

        return VariantDataset(self, result.vds())

    def grep(self, regex, path, max_count=100):
        """Grep big files, like, really fast.

        **Examples**

        Print all lines containing the string ``hello`` in *file.txt*:

        >>> hc.grep('hello','data/file.txt')

        Print all lines containing digits in *file1.txt* and *file2.txt*:

        >>> hc.grep('\d', ['data/file1.txt','data/file2.txt'])

        **Background**

        :py:meth:`~hail.HailContext.grep` mimics the basic functionality of Unix ``grep`` in parallel, printing results to screen. This command is provided as a convenience to those in the statistical genetics community who often search enormous text files like VCFs. Find background on regular expressions at `RegExr <http://regexr.com/>`_.

        :param str regex: The regular expression to match.

        :param path: The files to search.
        :type path: str or list of str

        :param int max_count: The maximum number of matches to return.
        """

        pargs = ["grep", regex]
        if isinstance(path, str):
            pargs.append(path)
        else:
            for p in path:
                pargs.append(p)

        pargs.append('--max-count')
        pargs.append(str(max_count))

        self._run_command(None, pargs)

    def import_annotations_table(self, path, variant_expr, code=None, npartitions=None, config=None):
        """Import variants and variant annotations from a delimited text file
        (text table) as a sites-only VariantDataset.

        :param path: The files to import.
        :type path: str or list of str

        :param str variant_expr: Expression to construct a variant
            from a row of the text table.  Must have type Variant.

        :param code: Expression to build the variant annotations.
        :type code: str or None

        :param npartitions: Number of partitions.
        :type npartitions: int or None

        :param config: Configuration options for importing text files
        :type config: :class:`.TextTableConfig` or None

        :rtype: :class:`.VariantDataset`
        """

        pargs = ['importannotations', 'table']
        if isinstance(path, str):
            pargs.append(path)
        else:
            for p in path:
                pargs.append(p)

        pargs.append('--variant-expr')
        pargs.append(variant_expr)

        if code:
            pargs.append('--code')
            pargs.append(code)

        if npartitions:
            pargs.append('--npartition')
            pargs.append(npartitions)

        if not config:
            config = TextTableConfig()

        pargs.extend(config._as_pargs())

        return self._run_command(None, pargs)

    def import_bgen(self, path, tolerance=0.2, sample_file=None, npartitions=None):
        """Import .bgen files as VariantDataset

        :param path: .bgen files to import.
        :type path: str or list of str

        :param float tolerance: If the sum of the dosages for a
            genotype differ from 1.0 by more than the tolerance, set
            the genotype to missing.

        :param sample_file: The sample file.
        :type sample_file: str or None

        :param npartitions: Number of partitions.
        :type npartitions: int or None

        :return: A dataset imported from the bgen file.
        :rtype: :class:`.VariantDataset`
        """

        pargs = ["importbgen"]

        if isinstance(path, str):
            pargs.append(path)
        else:
            for p in path:
                pargs.append(p)

        if sample_file:
            pargs.append('--samplefile')
            pargs.append(sample_file)

        if npartitions:
            pargs.append('--npartition')
            pargs.append(str(npartitions))

        pargs.append('--tolerance')
        pargs.append(str(tolerance))

        return self._run_command(None, pargs)

    def import_gen(self, path, sample_file=None, tolerance=0.02, npartitions=None, chromosome=None):
        """Import .gen files as VariantDataset.

        **Examples**

        Read a .gen file and a .sample file and write to a .vds file::

        >>> (hc.import_gen('data/example.gen', sample_file='data/example.sample')
        >>>  .write('data/example.vds'))

        Load multiple files at the same time with `Hadoop glob patterns <../reference.html#hadoopglob>`_::

        >>> (hc.import_gen('data/example.chr*.gen', sample_file='data/example.sample')
        >>>  .write('data/example.vds'))

        **Notes**

        For more information on the .gen file format, see `here <http://www.stats.ox.ac.uk/%7Emarchini/software/gwas/file_format.html#mozTocId40300>`_.

        To ensure that the .gen file(s) and .sample file are correctly prepared for import:

        - If there are only 5 columns before the start of the dosage data (chromosome field is missing), you must specify the chromosome using the ``chromosome`` parameter

        - No duplicate sample IDs are allowed

        The first column in the .sample file is used as the sample ID ``s.id``.

        .. _dosagefilters:

        **Dosage representation**

        Since dosages are understood as genotype probabilities, :py:meth:`~hail.HailContext.import_gen` automatically sets to missing those genotypes for which the sum of the dosages is a distance greater than the ``tolerance`` parameter from 1.0.  The default tolerance is 0.02, so a genotype with sum .97 or 1.03 is filtered out, whereas a genotype with sum .98 or 1.02 remains.

        :py:meth:`~hail.HailContext.import_gen` normalizes all dosages to sum to 1.0. Therefore, an input dosage of (0.98, 0.0, 0.0) will be stored as (1.0, 0.0, 0.0) in Hail.

        Even when the dosages sum to 1.0, Hail may store slightly different values than the original GEN file (maximum observed difference is 3E-4).

        **Annotations**

        :py:meth:`~hail.HailContext.import_gen` adds the following variant annotations:

         - **va.varid** (*String*) -- 2nd column of .gen file if chromosome present, otherwise 1st column.

         - **va.rsid** (*String*) -- 3rd column of .gen file if chromosome present, otherwise 2nd column.

        :param path: .gen files to import.
        :type path: str or list of str

        :param sample_file: The sample file.
        :type sample_file: str or None

        :param float tolerance: If the sum of the dosages for a genotype differ from 1.0 by more than the tolerance, set the genotype to missing.

        :param npartitions: Number of partitions.
        :type npartitions: int or None

        :param chromosome: Chromosome if not listed in the .gen file.
        :type chromosome: str or None

        :rtype: :class:`.VariantDataset`
        :return: A dataset imported from a .gen and .sample file.
        """

        pargs = ["importgen"]

        if isinstance(path, str):
            pargs.append(path)
        else:
            for p in path:
                pargs.append(p)

        if sample_file:
            pargs.append('--samplefile')
            pargs.append(sample_file)

        if chromosome:
            pargs.append('--chromosome')
            pargs.append(chromosome)

        if npartitions:
            pargs.append('--npartition')
            pargs.append(str(npartitions))

        if tolerance:
            pargs.append('--tolerance')
            pargs.append(str(tolerance))

        return self._run_command(None, pargs)

    def import_keytable(self, path, key_names, npartitions=None, config=None):
        """Import delimited text file (text table) as KeyTable.

        :param path: files to import.
        :type path: str or list of str

        :param key_names: The name(s) of fields to be considered keys
        :type key_names: str or list of str

        :param npartitions: Number of partitions.
        :type npartitions: int or None

        :param config: Configuration options for importing text files
        :type config: :class:`.TextTableConfig` or None

        :rtype: :class:`.KeyTable`
        """

        path_args = []
        if isinstance(path, str):
            path_args.append(path)
        else:
            for p in path:
                path_args.append(p)

        if not isinstance(key_names, str):
            key_names = ','.join(key_names)

        if not npartitions:
            npartitions = self.sc.defaultMinPartitions

        if not config:
            config = TextTableConfig()

        return KeyTable(self, self._hail.keytable.KeyTable.importTextTable(
            self._jsc, jarray(self._jvm.java.lang.String, path_args), key_names, npartitions,
            config._to_java()))

    def import_plink(self, bed, bim, fam, npartitions=None, delimiter='\\\\s+', missing='NA', quantpheno=False):
        """Import PLINK binary file (BED, BIM, FAM) as VariantDataset

        **Examples**

        Import data from a PLINK binary file:

        >>> vds = (hc.import_plink(bed="data/test.bed",
        >>>                        bim="data/test.bim",
        >>>                        fam="data/test.fam"))

        **Implementation Details**

        Only binary SNP-major mode files can be read into Hail. To convert your file from individual-major mode to SNP-major mode, use PLINK to read in your fileset and use the ``--make-bed`` option.

        The centiMorgan position is not currently used in Hail (Column 3 in BIM file).

        The ID (``s.id``) used by Hail is the individual ID (column 2 in FAM file).

        .. warning::

            No duplicate individual IDs are allowed.

        Chromosome names (Column 1) are automatically converted in the following cases:
        
          - 23 => "X"
          - 24 => "Y"
          - 25 => "X"
          - 26 => "MT"

        **Annotations**

        :py:meth:`~hail.HailContext.import_plink` adds the following annotations:

         - **va.rsid** (*String*) -- Column 2 in the BIM file.
         - **sa.famID** (*String*) -- Column 1 in the FAM file. Set to missing if ID equals "0".
         - **sa.patID** (*String*) -- Column 3 in the FAM file. Set to missing if ID equals "0".
         - **sa.matID** (*String*) -- Column 4 in the FAM file. Set to missing if ID equals "0".
         - **sa.isFemale** (*String*) -- Column 5 in the FAM file. Set to missing if value equals "-9", "0", or "N/A".
           Set to true if value equals "2". Set to false if value equals "1".
         - **sa.isCase** (*String*) -- Column 6 in the FAM file. Only present if ``quantpheno`` equals False.
           Set to missing if value equals "-9", "0", "N/A", or the value specified by ``missing``.
           Set to true if value equals "2". Set to false if value equals "1".
         - **sa.qPheno** (*String*) -- Column 6 in the FAM file. Only present if ``quantpheno`` equals True.
           Set to missing if value equals ``missing``.

        :param str bed: PLINK BED file.

        :param str bim: PLINK BIM file.

        :param str fam: PLINK FAM file.

        :param npartitions: Number of partitions.
        :type npartitions: int or None

        :param str missing: The string used to denote missing values **only** for the phenotype field. This is in addition to "-9", "0", and "N/A" for case-control phenotypes.

        :param str delimiter: FAM file field delimiter regex.

        :param bool quantpheno: If True, FAM phenotype is interpreted as quantitative.

        :return: A dataset imported from a PLINK binary file.

        :rtype: :class:`.VariantDataset`
        """

        pargs = ["importplink"]

        pargs.append('--bed')
        pargs.append(bed)

        pargs.append('--bim')
        pargs.append(bim)

        pargs.append('--fam')
        pargs.append(fam)

        if npartitions:
            pargs.append('--npartition')
            pargs.append(npartitions)

        if quantpheno:
            pargs.append('--quantpheno')

        pargs.append('--missing')
        pargs.append(missing)

        pargs.append('--delimiter')
        pargs.append(delimiter)

        return self._run_command(None, pargs)

    def read(self, path, sites_only=False):
        """Read .vds files as VariantDataset

        When loading multiple .vds files, they must have the same
        sample IDs, split status and variant metadata.

        :param path: .vds files to read.
        :type path: str or list of str

        :param bool sites_only: If True, create sites-only
          VariantDataset.  Don't load sample ids, sample annotations
          or genotypes.

        :return: A dataset read from disk
        :rtype: :class:`.VariantDataset`
        """

        pargs = ["read"]

        if isinstance(path, str):
            pargs.append(path)
        else:
            for p in path:
                pargs.append(p)

        if sites_only:
            pargs.append("--skip-genotypes")
        return self._run_command(None, pargs)

    def write_partitioning(self, path):
        """Write partitioning.json.gz file for legacy VDS file.

        :param str path: path to VDS file.
        """

        self._hail.variant.VariantSampleMatrix.writePartitioning(self._jsql_context, path)

    def import_vcf(self, path, force=False, force_bgz=False, header_file=None, npartitions=None,
                   sites_only=False, store_gq=False, pp_as_pl=False, skip_bad_ad=False):
        """Import .vcf files as VariantDataset

        :param path: .vcf files to read.
        :type path: str or list of str

        :param bool force: If True, load .gz files serially.

        :param bool force_bgz: If True, load .gz files as blocked gzip files (BGZF)

        :param header_file: File to load VCF header from.  If not specified, the first file in path is used.
        :type header_file: str or None

        :param npartitions: Number of partitions.
        :type npartitions: int or None

        :param bool sites_only: If True, create sites-only
            VariantDataset.  Don't load sample ids, sample annotations
            or genotypes.

        :param bool store_gq: If True, store GQ FORMAT field instead of computing from PL.

        :param bool pp_as_pl: If True, store PP FORMAT field as PL.  EXPERIMENTAL.

        :param bool skip_bad_ad: If True, set AD FORMAT field with
            wrong number of elements to missing, rather than setting
            the entire genotype to missing.

        :return: A dataset imported from the VCF file
        :rtype: :class:`.VariantDataset`

        """

        pargs = ["importvcf"]

        if isinstance(path, str):
            pargs.append(path)
        else:
            for p in path:
                pargs.append(p)

        if force:
            pargs.append('--force')

        if force_bgz:
            pargs.append('--force-bgz')

        if header_file:
            pargs.append('--header-file')
            pargs.append(header_file)

        if npartitions:
            pargs.append('--npartition')
            pargs.append(str(npartitions))

        if pp_as_pl:
            pargs.append('--pp-as-pl')

        if skip_bad_ad:
            pargs.append('--skip-bad-ad')

        if sites_only:
            pargs.append('--skip-genotypes')

        if store_gq:
            pargs.append('--store-gq')

        return self._run_command(None, pargs)

    def index_bgen(self, path):
        """Index .bgen files.  import_bgen cannot run with these indicies.

        :param path: .bgen files to index.
        :type path: str or list of str

        """

        pargs = ["indexbgen"]

        if isinstance(path, str):
            pargs.append(path)
        else:
            for p in path:
                pargs.append(p)

        self._run_command(None, pargs)

    def balding_nichols_model(self, populations, samples, variants, partitions=None,
                              pop_dist=None,
                              fst=None,
                              af_dist = UniformDist(0.1, 0.9),
                              seed=0):
        """Generate a VariantDataset using the Balding-Nichols model.

        **Examples**

        To generate a VDS with 3 populations, 100 samples in total, and 1000 variants:

        >>> vds = hc.balding_nichols_model(3, 100, 1000)

        To generate a VDS with 4 populations, 40 samples, 150 variants, 10 partitions, population distribution [0.1, 0.2, 0.3, 0.4], :math:`F_{ST}` values [.02, .06, .04, .12], ancestral allele frequencies drawn from a truncated beta distribution with a = 0.01 and b = 2.0 over the interval [0.05, 1], and random seed 1:

        >>> vds = hc.balding_nichols_model(4, 40, 150, 10,
        ...     pop_dist=[0.1, 0.2, 0.3, 0.4],
        ...     fst=[.02, .06, .04, .12],
        ...     af_dist=hail.stats.TruncatedBetaDist(a=0.01, b=2.0, minVal=0.05, maxVal=1.0),
        ...     seed=1)

        **Notes**

        Hail is able to randomly generate a VDS using the Balding-Nichols model.

        - :math:`K` populations are labeled by integers 0, 1, ..., K - 1
        - :math:`N` samples are named by strings 0, 1, ..., N - 1
        - :math:`M` variants are defined as ``1:1:A:C``, ``1:2:A:C``, ..., ``1:M:A:C``
        - The default ancestral frequency distribution :math:`P_0` is uniform on [0.1, 0.9]. Options are UniformDist(minVal, maxVal), BetaDist(a, b), and TruncatedBetaDist(a, b, minVal, maxVal). All three classes are located in hail.stats.
        - The population distribution :math:`\pi` defaults to uniform
        - The :math:`F_{st}` values default to 0.1
        - The number of partitions defaults to one partition per million genotypes (i.e., samples * variants / 10^6) or 8, whichever is larger

        The Balding-Nichols model models genotypes of individuals from a structured population comprising :math:`K` homogeneous subpopulations
        that have each diverged from a single ancestral population (a `star phylogeny`). We take :math:`N` samples and :math:`M` bi-allelic variants in perfect
        linkage equilibrium. The relative sizes of the subpopulations are given by a probability vector :math:`\pi`; the ancestral allele frequencies are
        drawn independently from a frequency spectrum :math:`P_0`; the subpopulations have diverged with possibly different :math:`F_{ST}` parameters :math:`F_k`
        (here and below, lowercase indices run over a range bounded by the corresponding uppercase parameter, e.g. :math:`k = 1, \ldots, K`).
        For each variant, the subpopulation allele frequencies are drawn from a `beta distribution <https://en.wikipedia.org/wiki/Beta_distribution>`_, a useful continuous approximation of
        the effect of genetic drift. We denote the individual subpopulation memberships by :math:`k_n`, the ancestral allele frequencies by :math:`p_{0, m}`,
        the subpopulation allele frequencies by :math:`p_{k, m}`, and the genotypes by :math:`g_{n, m}`. The generative model is then given by:

        .. math::
            k_n \,&\sim\, \pi

            p_{0,m}\,&\sim\, P_0

            p_{k,m}\mid p_{0,m}\,&\sim\, \mathrm{Beta}(\mu = p_{0,m},\, \sigma^2 = F_k p_{0,m}(1 - p_{0,m}))

            g_{n,m}\mid k_n, p_{k, m} \,&\sim\, \mathrm{Binomial}(2, p_{k_n, m})

        We have parametrized the beta distribution by its mean and variance; the usual shape parameters are :math:`a = p(1-F)/F,\; b = (1 - p)(1-F)/F` with :math:`F = F_k,\; p = p_{0,m}`.

        **Annotations**

        :py:meth:`~hail.HailContext.balding_nichols_model` adds the following global, sample, and variant annotations:

         - **global.nPops** (*Int*) -- Number of populations
         - **global.nSamples** (*Int*) -- Number of samples
         - **global.nVariants** (*Int*) -- Number of variants
         - **global.popDist** (*Array[Double]*) -- Normalized population distribution indexed by population
         - **global.Fst** (*Array[Double]*) -- F_st values indexed by population
         - **global.seed** (*Int*) -- Random seed
         - **global.ancestralAFDist** (*Struct*) -- Information about ancestral allele frequency distribution
         - **sa.pop** (*Int*) -- Population of sample
         - **va.ancestralAF** (*Double*) -- Ancestral allele frequency
         - **va.AF** (*Array[Double]*) -- Allele frequency indexed by population

        :param int populations: Number of populations.

        :param int samples: Number of samples.

        :param int variants: Number of variants.

        :param int partitions: Number of partitions.

        :param pop_dist: Unnormalized population distribution
        :type pop_dist: array of float or None

        :param fst: F_st values
        :type fst: array of float or None

        :param af_dist: Ancestral allele frequency distribution
        :type af_dist: :class:`.UniformDist` or :class:`.BetaDist` or :class:`.TruncatedBetaDist`

        :param int seed: Random seed.

        :rtype: :class:`.VariantDataset`
        :return: A VariantDataset generated by the Balding-Nichols model.
        """

        if pop_dist is None:
            jvm_pop_dist_opt = joption(pop_dist)
        else:
            jvm_pop_dist_opt = joption(jarray(self._jvm.double, pop_dist))

        if fst is None:
            jvm_fst_opt = joption(fst)
        else:
            jvm_fst_opt = joption(jarray(self._jvm.double, fst))

        return VariantDataset(self, self._hail.stats.BaldingNicholsModel.apply(self._jsc,  populations, samples, variants,
                            jvm_pop_dist_opt,
                            jvm_fst_opt,
                            seed,
                            joption(partitions), af_dist._jrep()))

    def dataframe_to_keytable(self, df, keys=[]):
        """Convert Spark SQL DataFrame to KeyTable.

        Spark SQL data types are converted to Hail types in the obvious way as follows:

        .. code-block:: text

          BooleanType => Boolean
          IntegerType => Int
          LongType => Long
          FloatType => Float
          DoubleType => Double
          StringType => String
          BinaryType => Binary
          ArrayType => Array
          StructType => Struct

        Unlisted Spark SQL data types are currently unsupported.

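        **Examples**

        A minimal sketch; ``df`` is assumed to be an existing Spark SQL DataFrame with a column named ``id``:

        >>> kt = hc.dataframe_to_keytable(df, keys=['id'])
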
        :param df: The Spark SQL DataFrame to convert.

        :param keys: List of key column names.
        :type keys: list of string

        :return: The DataFrame as a KeyTable.
        :rtype: :class:`.KeyTable`
        """

        jkeys = jarray(self._jvm.java.lang.String, keys)
        return KeyTable(self, self._hail.keytable.KeyTable.fromDF(df._jdf, jkeys))

    def stop(self):
        """ Shut down the Hail Context """
        self.sc.stop()
        self.sc = None
Ejemplo n.º 36
0
#Author: Andre Foote
#Date: 27th May 2018

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import Row
import numpy as np
from pyspark.sql.types import *
import matplotlib.pyplot as plt
import sys

if __name__ == "__main__":

    sc = SparkContext(appName="SparkProblem")

    sqlContext = SQLContext(sc)

    #Load csv into dataframe
    df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('DataSample.csv')

    #Remove suspicious requests (ie records with identical geoinfo and timest fields)
    df_no_dupes = df.dropDuplicates(['TimeSt','Latitude','Longitude'])
    no_dupes_length = df_no_dupes.count()-1

    #The points of interest (poi)
    poiList = [(53.546167000000004, -113.48573400000001), (45.521629, -73.566024), (45.22483, -63.232729000000006)]

    #The following three functions return a single rdd containing the distance between each row in 
    #DataSample.csv and one of the POI coordinates. distance1 for POI1, distance2 for POI2, distance3 for POI3.
    def distance1(row):
        return float(np.sqrt((row.Latitude-poiList[0][0])**2+(row.Longitude-poiList[0][1])**2))
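
    # A possible generalization of the three distance functions described above (a sketch,
    # not part of the original snippet): Euclidean distance from a row to the POI at index i.
    def distance_to_poi(row, i):
        return float(np.sqrt((row.Latitude - poiList[i][0])**2 +
                             (row.Longitude - poiList[i][1])**2))

    # Usage sketch: tag each record with the index of its nearest POI.
    # nearest_poi = df_no_dupes.rdd.map(lambda row: min(range(len(poiList)), key=lambda i: distance_to_poi(row, i)))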
Ejemplo n.º 37
0
import sys
import pyspark
import string

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
from pyspark.sql.window import Window
from pyspark.sql.functions import *

if __name__ == "__main__":

    sc = SparkContext()

    spark = SparkSession \
        .builder \
        .appName("sql") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    sqlContext = SQLContext(spark)

    # get command-line arguments
    inFile = sys.argv[1]
    supp = sys.argv[2]
    conf = sys.argv[3]
    prot = sys.argv[4]
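
    # A possible next step (sketch, not part of the original snippet; the reader options are
    # assumptions about the input file's format):
    # df = sqlContext.read.csv(inFile, header=True, inferSchema=True)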
from pyspark import SparkContext
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import SQLContext
import os
import csv
import time
start_time = time.time()
filename = '/home/fieldtest1/CATT_Intern/TripRecords/TripRecordsFebruary1.csv'
sc = SparkContext()

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

def combine_csv(path_to_csv, path_to_save):
    list_csv = os.listdir(path_to_csv)
    all_csv = []
    for csv_ in list_csv:
        if csv_[-1] == 'v':
            path = path_to_csv + '/' + csv_
            all_csv.append(open(path, mode='r', newline=''))
    with open(path_to_save, 'w', newline='', encoding='utf-8') as g:
        writer = csv.writer(g)
        for csv_ in all_csv:
            for row in csv_:
                writer.writerow(row.rstrip(',,\r\n').split(','))

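# Usage sketch (paths are illustrative, not taken from the original script):
# combine_csv('/home/fieldtest1/CATT_Intern/TripRecords', 'TripRecordsCombined.csv')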

def extract_crossed_trips(crossed_trips_path, trip_records_name, month):
Ejemplo n.º 39
0
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark import SparkContext

sc = SparkContext("local", "Simple App")

sqlContext = SQLContext(sc)

surveys = sc.textFile('/user/w205/hospital_compare/surveys_responses.csv')
surveyfiltered = surveys.filter(lambda x: "Not Available" not in x)

surveyparts1 = surveyfiltered.map(lambda l: l.split(','))
surveyparts = surveyparts1.filter(lambda l: len(l) == 33)

surveys_table = surveyparts.map(
    lambda l: ('hcahps', l[0], int(l[31].strip('"')), int(l[32].strip('"'))))

schemaString = 'sid hid base_score consistency_score'

surveyfields = [
    StructField(field_name, StringType(), True)
    for field_name in schemaString.split()
]
surveyfields[2].dataType = IntegerType()
surveyfields[3].dataType = IntegerType()

surveyschema = StructType(surveyfields)
schemasurveys = sqlContext.createDataFrame(surveys_table, surveyschema)
schemasurveys.registerTempTable('surveys_table')

# save files
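# One possible way to persist the cleaned survey table (a sketch; the output path is an assumption):
# schemasurveys.write.mode('overwrite').parquet('/user/w205/hospital_compare/surveys_clean')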
Ejemplo n.º 40
0
    return ((str(flightDate), yDest), (flight[11], flight[18], flight[6],
                                       flight[10], flight[25],
                                       float(flight[38].strip('\"'))))


# Origin = 11
# Dest  = 18
#Airline = 6
# Flight Number = 10
# CRSDepTime = 25
# ArrDelay = 38
# year = 0
# Month = 2
# DayofMonth = 3
conf = SparkConf()
sc = SparkContext(conf=conf)

allFiles = []
allFiles = getFileNames()
rdd = sc.textFile(','.join(allFiles))

runningFlights = rdd.map(lambda line: line.split(',')) \
                  .filter(notCancelled) \
                  .filter(isFloat)

flightXY = runningFlights.filter(
    lambda x: float(x[25].strip('\"')) < 1200).map(extractInfo)

flightYZ = runningFlights.filter(lambda x: float(x[25].strip(
    '\"')) > 1200).map(lambda flight: extractInfo(flight, True))
Ejemplo n.º 41
0
    return failures

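# Kupiec's proportion-of-failures (POF) test: the statistic computed below is the likelihood
# ratio -2 * ln[ (1 - p)^(T - N) * p^N / ((1 - N/T)^(T - N) * (N/T)^N) ], with T = total
# observations, N = number of VaR exceedances, and p = confidenceLevel. Under the null
# hypothesis it is asymptotically chi-squared with 1 degree of freedom, which is why
# kupiecTestPValue calls stats.chi2.cdf(testStatistic, 1).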
def kupiecTestStatistic(total, failures, confidenceLevel):
    failureRatio = float(failures)/ total
    logNumer = (total - failures) * math.log1p(-confidenceLevel) + failures * math.log(confidenceLevel)
    logDenom = (total - failures) * math.log1p(-failureRatio) + failures * math.log(failureRatio)
    return -2 * (logNumer - logDenom)

def kupiecTestPValue(stocksReturns, valueAtRisk, confidenceLevel):
    failures = countFailures(stocksReturns, valueAtRisk)
    total = len(stocksReturns[0])
    testStatistic = kupiecTestStatistic(total, failures, confidenceLevel)
    return 1 - stats.chi2.cdf(testStatistic, 1)

if __name__ == "__main__":
    sc = SparkContext(appName="VaR")
    (stocksReturns, factorsReturns) = readStocksAndFactors("/Users/Karim/Downloads/VaR-Data/")
    plotDistribution(factorsReturns[2])
    plotDistribution(factorsReturns[3])
    numTrials = 10000000
    parallelism = 1000
    baseSeed = 1001L
    trials = computeTrialReturns(stocksReturns, factorsReturns, sc, baseSeed, numTrials,parallelism)
    trials.cache()
    valueAtRisk = fivePercentVaR(trials)
    conditionalValueAtRisk = fivePercentCVaR(trials)
    print("VaR 5%: " + str(valueAtRisk))
    print("CVaR 5%: " + str(conditionalValueAtRisk))
    varConfidenceInterval = bootstrappedConfidenceInterval(trials, fivePercentVaR, 100, 0.05)
    cvarConfidenceInterval = bootstrappedConfidenceInterval(trials, fivePercentCVaR, 100, 0.05)
    print("VaR confidence interval: " + str(varConfidenceInterval))
import os
import sys

from pyspark import SparkConf, SparkContext, SQLContext

import logger

MODULE_NAME = os.path.basename(sys.modules['__main__'].__file__)
TEST_NAME = os.path.splitext(MODULE_NAME)[0]
LOGGER = logger.get_logger(TEST_NAME)

URLPATH = "s3a://dask-avro-data/application-data/app-100*.avro"

# Start
LOGGER.info('START: Creating spark conf')
Sconf = SparkConf()
sc = SparkContext(appName="my_test", conf=Sconf)
sqlContext = SQLContext(sparkContext=sc)
LOGGER.info('FINISH: Finished creating spark conf')

LOGGER.info('START: Creating spark dataframe')
df = sqlContext.read.format("com.databricks.spark.avro").load(URLPATH)
LOGGER.info('FINISH: Spark dataframe created')

LOGGER.info('START: Starting filtered count')
cnt = df.filter(df.payload.originationCountryCode == 'CAN').count()
LOGGER.info('FINISH: Count is %s', cnt)

sc.stop()
Ejemplo n.º 43
0
# import numpy as np
#import pandas as pd
# d=pd.read_csv("cnt_51000000_7_20_20_1_relu_no_header.csv", delim_whitespace=True)

# d["HC"] = d[0].map(lambda x: hamming_comp(inputs_str,x,2))

# d.to_csv("cnt_51000000_7_20_20_1_relu.csv", sep="\t")

input_dim = 7
inputs = [[int(l) for l in "{0:07b}".format(i)]
          for i in range(0, 2**input_dim)]
inputs_str = ["{0:07b}".format(i) for i in range(0, 2**input_dim)]

from pyspark import SparkContext
sc = SparkContext.getOrCreate()
print(sc._jsc.sc().getExecutorMemoryStatus())
print(sc)
print("Ready to go!")

# data = sc.textFile("cnt_51000000_7_20_20_1_relu_no_header.csv")
#data = sc.textFile("test.txt")
filename = "cnt_51000000_7_20_20_1_relu_no_header.csv"  # same file as in the commented-out line above
data = sc.textFile(filename)

#data.take(15)
data = data.map(lambda x: x.split("\t"))
# data2 = data.map(lambda x: x[0]).take(15)

# rdd = sc.parallelize(data2)

# rdd.map(lambda x: hamming_comp(inputs_str,x,2)).collect()
Ejemplo n.º 44
0
stopWordsPath = sys.argv[1]
delimitersPath = sys.argv[2]
delimiters = ""

with open(stopWordsPath) as f:
    data = f.read()
    stopwords = re.split("\\n|\\s", data)
#TODO

with open(delimitersPath) as f:
    delimiters = ",|;|\.|\?|!|-|:|@|\[|\]|\(|\)|\{|\}|_|\*|\/|\\n| "
    #TODO

conf = SparkConf().setMaster("local").setAppName("TitleCount")
conf.set("spark.driver.bindAddress", "127.0.0.1")
sc = SparkContext(conf=conf)

lines = sc.textFile(sys.argv[3], 1)

outputFile = open(sys.argv[4], "w")
sys.stdout = outputFile


def titlecountmap(line):
    retval = list()
    words = re.split(delimiters, line.lower())
    for word in words:
        if word not in stopwords and word != '':
            retval.append(word)
    return retval
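
# A possible continuation (sketch, not part of the original snippet): count the filtered
# words and write the totals to the redirected output file.
# counts = lines.flatMap(titlecountmap).map(lambda w: (w, 1)).reduceByKey(lambda a, b: a + b)
# for word, count in counts.collect():
#     print(word + "\t" + str(count))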
Ejemplo n.º 45
0
from pyspark import SparkConf, SparkContext
from pyspark.mllib.classification import LogisticRegressionWithSGD
import numpy as np
from math import log
from math import exp #  exp(-t) = e^-t
from operator import add
import sys

# input_file = sys.argv[1]
# input_file = "/data/scratch/vw/criteo-display-advertising-dataset/train.txt"  # Should be some file on your system
input_file = "/tmp/datasets/train.txt"  # Should be some file on your system


print ("--------------------creating context.. ------------")
conf = SparkConf().setAppName('Click Prediction')
conf.set("spark.storage.memoryFraction", "0.40")
sc = SparkContext(conf=conf)
# sc = SparkContext("local[4]", "ClickRatePrediction") ##run on local with 4 cores, named it "ClickRatePrediction"
print ("-------------------Finished creating context..------------")

print ("--------------------Creating parse text file-----------")
# input_file = open(input_file)
# dacData = [unicode(x.replace('\n', '').replace('\t', ',')) for x in input_file]
dacData = sc.textFile(input_file).map(lambda x: unicode(x.replace('\n', '').replace('\t', ',')))


print ("-------------------Parse text was created!-----------")

print ("-------------------Creating RDD!! ------------------------")
rawData  = (dacData
            .repartition(4)  # spread the data over 4 partitions
            .zipWithIndex()  # Enumerate lines
'''
Source of school list:
http://schools.nyc.gov/schoolsearch/
'''

from __future__ import print_function

import sys
import os
from operator import add
from pyspark import SparkContext
from csv import reader

sc = SparkContext()
sc.addFile("src/helper/assign_basetype.py")
from assign_basetype import *

school_lines = sc.textFile("/user/ac5901/school_name.csv", 1)
schools = school_lines.collect()


def check_school(val):
    basetype = get_basetype(val)
    if basetype == 'TEXT':
        if val is None or len(val.strip()) == 0 or val in [
                'Unspecified', 'NA', 'N/A', 'N?A', 'NA/'
        ]:
            return 'NULL'
        elif val in schools:
            return 'VALID'
        else:
Ejemplo n.º 47
0
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import to_timestamp
import sys
import time
import math
from operator import add
import logging
sys.path.append('/app/htm')
import settings
import htmCircle

conf = SparkConf()
conf.setMaster('spark://spark-master:7077')
conf.setAppName('spark-basic')
sc = SparkContext(conf=conf)
sc.addPyFile("/app/htm/htmCircle.py")
sc.addPyFile("/app/htm/_htmCircle.so")

point = [336.14, 0.13]

properties = {
    'user': settings.DB_USER,
    'password': settings.DB_PASS,
    'host': 'jdbc:mysql://' + settings.DB_HOST + ':3306',
    'database': settings.DB_NAME,
    'driver': 'com.mysql.jdbc.Driver',
    'url': 'jdbc:mysql://' + settings.DB_HOST + ':3306/' + settings.DB_NAME
}

working_directory = '/app/'
Ejemplo n.º 48
0
import numpy as np
import pandas as pd
import pyspark
from pyspark import SparkContext
from pyspark.sql import SQLContext

from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

sc = SparkContext(master="local", appName="first app")

df_rdd = sc.textFile('./data/ml-1m/ratings.dat').map(lambda x: x.split("::"))

ratings = df_rdd.map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
X_train, X_test = ratings.randomSplit([0.8, 0.2])

rank = 10
numIterations = 10
model = ALS.train(X_train, rank, numIterations)

testdata = X_test.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))

ratesAndPreds = X_test.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)

MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error = " + str(MSE))
Ejemplo n.º 49
0
from pyspark import SparkContext
import sys
import time
from itertools import combinations
from collections import defaultdict
import queue as Q  # For BFS Implementation #
import copy
sc = SparkContext("local[*]", "Task1")
sc.setLogLevel("OFF")
start = time.time()
input_file_path = sys.argv[1]
RDD_inter = sc.textFile(input_file_path)
result_RDD1 = RDD_inter.map(lambda a: a.split(" "))
result_RDD2 = RDD_inter.map(lambda a: a.split(" ")[::-1])
result_RDD = result_RDD1.union(result_RDD2)

#print(user_pairs_RDD.take(5))
nodes_RDD = result_RDD.flatMap(lambda a: [(a[0]), (a[1])]).distinct()
nodes_list = nodes_RDD.collect()
print(len(nodes_list))
edges_RDD = result_RDD.map(lambda a: (a[0], a[1]))
edges_list = edges_RDD.collect()
print(len(edges_list))


# Edges between users based on threshold #
def user_edges(user):
    user_edge_list = []
    for i in edges_list:
        if (i[1] == user):
            user_edge_list.append(i[0])
# In[2]:

get_ipython().system('pip install pyspark==2.4.5')

# In[3]:

get_ipython().system('pip install systemml')

# In[4]:

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, StringType

sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# In[5]:

get_ipython().system('mkdir -p /home/dsxuser/work/systemml')

# In[6]:

from systemml import MLContext, dml
import numpy as np
import time

ml = MLContext(spark)
Ejemplo n.º 51
0
# Spark RDD functions 2

import sys, re
from pyspark import SparkConf, SparkContext, RDD

conf = SparkConf().setAppName('RDD Func')
sc = SparkContext(conf=conf)

############################################################
# station-id:       station ID number
# name:             station name
# lat:              latitude
# long:             longitude
# dockcount:        number of docks which embeds station
# landmark:         city
# installation:     date station was embedded
# bikes_available:  number of available bicycle
# docks_available:  number of available docks
# time:             time and date, PST
############################################################
stations = sc.textFile('/opt/spark/data/bike-share/stations')
status = sc.textFile('/opt/spark/data/bike-share/status')

status2 = status.map(lambda x: x.split(',')) \
                .map(lambda x: (x[0], x[1], x[2], x[3].replace('"', ''))) \
                .map(lambda x: (x[0], x[1], x[2], x[3].split(' '))) \
                .map(lambda x: (x[0], x[1], x[2], x[3][0].split('-'), x[3][1].split(':'))) \
                .map(lambda x: (int(x[0]), int(x[1]), int(x[3][0]), int(x[3][1]), int(x[3][2]), int(x[4][0])))
status2.first()

status3 = status2.filter(lambda x: x[2] == 2015 and x[3] == 2 and x[4] >= 22) \
                 .map(lambda x: (x[0], x[1], x[5]))

def print(*arg):
    mystring = ""
    for argument in arg:
        mystring += str(argument)
    f = open('log.txt', 'a')
    f.write(mystring + "\n")
    f.close()


# Initialize SparkContext
import sys
from pyspark import SparkContext
from pyspark import SparkConf
sc = SparkContext()
import os
import sys
import re
from pyspark import SparkContext
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
from pyspark.sql import types
from pyspark.sql import Row
from pyspark.sql import functions
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import pandas as pd
import numpy as np
Ejemplo n.º 53
0
# and then run the example
#    `$ bin/spark-submit examples/src/main/python/streaming/stateful_network_wordcount.py \
#        localhost 9999`
###

from __future__ import print_function
import sys
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: stateful_network_wordcount.py <hostname> <port>",
              file=sys.stderr)
        exit(-1)
    sc = SparkContext(appName="PythonStreamingStatefulNetworkWordCount")
    ssc = StreamingContext(sc, 3)
    ssc.checkpoint("checkpoint")

    # RDD with initial state (key, value) pairs
    # initialStateRDD = sc.parallelize([(u'hello', 1), (u'world', 1)])


    def updateFunc(new_values, last_sum):
        return sum(new_values) + (last_sum or 0)

    lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))
    running_counts = lines.flatMap(lambda line: line.split(" "))\
                          .map(lambda word: (word, 1))\
                          .updateStateByKey(updateFunc)
    #.updateStateByKey(updateFunc,initialStateRDD)
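
    # The standard stateful word count example finishes by printing the running counts and
    # starting the streaming context; added here as a sketch to round off the snippet.
    running_counts.pprint()
    ssc.start()
    ssc.awaitTermination()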
Ejemplo n.º 54
0
from __future__ import print_function

import sys
from datetime import timedelta, datetime, tzinfo
import numpy as np

from pyspark import SparkContext

import sparkmpi

sc = SparkContext(appName="SparkMPI")
print("\nHello SparkMPI\n")

partitions = 2

srv = sparkmpi.AddressServer.createServer()
addr = srv.start(partitions)

print("address: ", addr)


def f(args):

    comm = sparkmpi.Communicator.createCommunicator(args['rank'], 2)

    imageSize = 2 * 1000000
    comm.allocate(imageSize * 4)

    comm.connect(args['addr'])

    a = np.zeros(imageSize, dtype=np.float32)
# -*- coding: utf-8 -*-

import time
import csv
import sys
import json
from pyspark import SparkConf, SparkContext

#%%
conf = SparkConf().setAppName("Task-1-ground-truth-generator").set("spark.executor.memory", "4g")
sc = SparkContext(conf=conf)

#%%
input_file = 'data/test_review.json'
reqd_jaccard_similarity = 0.05
output_file = 'data/true_similarity_pairs_small.csv'

#%%
input_data = sc.textFile(input_file)
input_rdd = input_data.map(json.loads).map(lambda row: (row["business_id"], row["user_id"])).cache()
input_rdd_grouped = input_rdd.groupByKey().map(lambda x: (x[0], set(x[1])))

input_rdd_grouped = input_rdd_grouped.repartition(1)
num_partitions = input_rdd_grouped.getNumPartitions()

business_bucket = input_rdd_grouped.collect()
business_bucket = sorted(business_bucket)



#%%
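# A possible continuation (sketch, not part of the original snippet): compare every pair of
# businesses, keep pairs whose Jaccard similarity meets the threshold, and write them to
# output_file using the csv module imported above.
# with open(output_file, 'w') as f:
#     writer = csv.writer(f)
#     for i in range(len(business_bucket)):
#         for j in range(i + 1, len(business_bucket)):
#             b1, users1 = business_bucket[i]
#             b2, users2 = business_bucket[j]
#             sim = len(users1 & users2) / float(len(users1 | users2))
#             if sim >= reqd_jaccard_similarity:
#                 writer.writerow([b1, b2, sim])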
Ejemplo n.º 56
0
from pyspark import SparkConf, SparkContext
import math


def format0(rec):
    Record = rec.split("|")
    return (Record)


def format1(rec):
    Record = rec.split("\t")
    return (Record)


con = SparkConf()
sc = SparkContext(conf=con)

movie = sc.textFile("file:///home/cloudera/imdb/Movies.item", use_unicode=True)
rating = sc.textFile("file:///home/cloudera/imdb/Movie-Ratings-Done.data")

movieFormatted = movie.map(format0)
ratingFormatted = rating.map(format1)
dataM = movieFormatted.take(movieFormatted.count())
dataR = ratingFormatted.collect()
#golden = movieFormatted.filter(findMovie)
#match = rdd.union
#out = dataM.collect()
movietitle = "GoldenEye (1995)"

movieID = movieFormatted.filter(lambda n: n[1] == movietitle).map(
    lambda x: x[0]).collect()
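
# A possible continuation (sketch, not part of the original snippet; assumes the ratings
# file is tab-separated as userId, movieId, rating, timestamp):
# if movieID:
#     target = movieID[0]
#     avg_rating = ratingFormatted.filter(lambda r: r[1] == target) \
#                                 .map(lambda r: float(r[2])) \
#                                 .mean()
#     print("Average rating for %s: %.2f" % (movietitle, avg_rating))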
Ejemplo n.º 57
0

# function that parses file to put movie names in python dictionary
# maps movie IDs to names
def loadMovieNames():
    movieNames = {}
    with open("ml-100k/u.ITEM") as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames


# boilerplate
conf = SparkConf().setMaster("local").setAppName("PopularMovies")
sc = SparkContext(conf=conf)

# object that returns the broadcast on cluster
nameDict = sc.broadcast(loadMovieNames())

# import the data, map movie IDs and reduce by key while counting occurence of each movie
lines = sc.textFile("file:///SparkCourse/ml-100k/u.data")
movies = lines.map(lambda x: (int(x.split()[1]), 1))
movieCounts = movies.reduceByKey(lambda x, y: x + y)

# flipp the tuple from (id, count) to (count, id) and sort
flipped = movieCounts.map(lambda x: (x[1], x[0]))
sortedMovies = flipped.sortByKey()

# use the broadcast object nameDict to transform each line to (name, count)
#sortedMoviesWithNames = sortedMovies.map(lambda (count, movie) : (nameDict.value[movie], count))
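# A Python 3-compatible version of the commented line above (a sketch), followed by printing
# the results:
# sortedMoviesWithNames = sortedMovies.map(lambda cm: (nameDict.value[cm[1]], cm[0]))
# for movie, count in sortedMoviesWithNames.collect():
#     print(movie, count)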
Ejemplo n.º 58
0
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import SparkSession

conf = SparkConf().setMaster("spark://Masteru:7077").setAppName("My App")
sc = SparkContext(conf=conf)
hivectx = HiveContext(sc)
a = sc.textFile('hdfs://Masteru:9000/RLCPP.csv')
print(a.collect())
from pyspark import SparkContext, SparkConf, SQLContext
import logging, sys
import numpy as np

# spark-submit --packages com.databricks:spark-csv_2.10:1.4.0 --py-files master/hadoop/stemmer.py,master/hadoop/filter.py --master yarn --deploy-mode cluster  master/hadoop/distances.py

logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
logger = logging.getLogger(__name__)

conf = SparkConf()\
    .set("spark.driver.maxResultSize", "0")\
	.set("spark.driver.memory", "12g")\
	.set("spark.executor.memory", "12g")\
	.set("spark.executor.instances", "400")

sc = SparkContext(appName='distances', conf=conf)


def write_data(path):
	import filter
	from pyspark.mllib.feature import Word2Vec, Word2VecModel

	# load data
	loc = '/user/rmusters/text/2015/01/*'
	text_file = sc.textFile(loc)
	data = text_file.map(lambda line: filter.filter(line).split(" "))

	# load model
	word2vec = Word2Vec()
	model = Word2VecModel.load(sc, '/user/rmusters/2015model99')
Ejemplo n.º 60
0
    def setUp(self):
        conf = SparkConf().setMaster("local[*]").setAppName(
            'read_sequence_file')
        self.sc = SparkContext(conf=conf)