Example #1
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext


def main():
    appName = "langPopCount;zl"

    conf = (SparkConf()
            .setAppName(appName)
            .set("spark.executor.memory", "5g")
            .set("spark.executor.cores","3")
            .set("spark.executor.instance", "3")
            )
    sc = SparkContext(conf = conf)
    hc = HiveContext(sc)

    langTagList = ['<java>', '<javascript>', '<c>', '<c++>', '<c#>', '<python>', '<php>', '<css>', '<html>', '<objective-c>']
    resultrdd = sc.emptyRDD()

    for tag in langTagList:
        postCountdf = hc.sql("select creationdate, 1 as c from questionpost where tags like '%{tag}%' ".format(tag=tag))
        postCountOnYearrdd = postCountdf \
                                 .filter(postCountdf.creationdate != '__none__') \
                                 .withColumn('year', postCountdf.creationdate.substr(1, 4)) \
                                 .drop('creationdate') \
                                 .groupBy('year').count() \
                                 .withColumnRenamed('count', 'c') \
                                 .repartition(1) \
                                 .sort('year', ascending=True) \
                                 .map(lambda _: "{tag} {year} {cnt}".format(tag=tag.strip('<>'), year=_.year, cnt=_.c))
        resultrdd = resultrdd.union(postCountOnYearrdd)

    resultrdd = resultrdd.repartition(1)
    resultrdd.saveAsTextFile('/sshomework_zl/popCount')

    sc.stop()
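Most of the snippets on this page follow the same pattern as Example #1: start from sc.emptyRDD() and accumulate partial results with union() inside a loop before acting on the combined RDD. A minimal sketch of just that pattern, assuming only an already-created SparkContext named sc and using made-up input lists:

# Minimal emptyRDD + union accumulation sketch; `sc` is an assumed,
# pre-existing SparkContext and the chunks below are illustrative data only.
combined = sc.emptyRDD()
for chunk in ([1, 2], [3, 4], [5]):
    combined = combined.union(sc.parallelize(chunk))
print(combined.collect())  # -> [1, 2, 3, 4, 5]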
Example #2
def main():
    appName = "langPopCount;zl"

    conf = (SparkConf().setAppName(appName).set(
        "spark.executor.memory", "5g").set("spark.executor.cores",
                                           "3").set("spark.executor.instance",
                                                    "3"))
    sc = SparkContext(conf=conf)
    hc = HiveContext(sc)

    langTagList = [
        '<java>', '<javascript>', '<c>', '<c++>', '<c#>', '<python>', '<php>',
        '<css>', '<html>', '<objective-c>'
    ]
    resultrdd = sc.emptyRDD()

    for tag in langTagList:
        postCountdf = hc.sql(
            "select creationdate, 1 as c from questionpost where tags like '%{tag}%' "
            .format(tag=tag))
        postCountOnYearrdd = postCountdf \
                                 .filter(postCountdf.creationdate != '__none__') \
                                 .withColumn('year', postCountdf.creationdate.substr(1, 4)) \
                                 .drop('creationdate') \
                                 .groupBy('year').count() \
                                 .withColumnRenamed('count', 'c') \
                                 .repartition(1) \
                                 .sort('year', ascending=True) \
                                 .map(lambda _: "{tag} {year} {cnt}".format(tag=tag.strip('<>'), year=_.year, cnt=_.c))
        resultrdd = resultrdd.union(postCountOnYearrdd)

    resultrdd = resultrdd.repartition(1)
    resultrdd.saveAsTextFile('/sshomework_zl/popCount')

    sc.stop()
Example #3
    def run(self, inputFile):
        sc = SparkContext("local[8]", "ratings")
        text = sc.textFile(inputFile)
        header = text.first() #extract header
        text = text.filter(lambda row : row != header)

        mapping = text.map(lambda line: line.split(',')).map(lambda x:  (int(x[0]),int(x[1]))).groupByKey()
        movieInput = text.map(lambda line: line.split(',')).map(lambda x:  (int(x[1]),int(x[0]))).groupByKey().map(lambda x: (x[0],list(x[1]))).sortByKey()
        self.totalUsers = len(mapping.collect())
        print(self.totalUsers)
        self.generate_hash_functions()

        #for i in range(10):
        #   print(movieInput.collect()[i])

        Signature = movieInput.map(lambda x : (x[0],self.create_signature(x[1])))
        #print("Signature")
        #print(Signature.take(10))


        bandSize = self.numHash//self.numBand
        unique = sc.emptyRDD()
        self.movieDict = dict(movieInput.collect())
        start = time.time()
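        # LSH banding: hash each band of every signature into a bucket, then emit
        # all pairs of movies that share a bucket in any band as candidate pairs.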
        for i in range(self.numBand):
            bands = Signature.map(lambda x : (x[0], x[1][(i * bandSize) : ((i+1) * bandSize) ]))
            a = random.randint(1,1500)
            b = random.randint(1,1000)
            bands = bands.map(lambda x:(x[0], self.hashBucket(x[1],a,b))).map(lambda x: (x[1],x[0])).groupByKey().map(lambda x:sorted(list(x[1]))).flatMap(lambda x: list(combinations(x,2)))
            unique = unique.union(bands).distinct()
        print("flatmap",time.time()- start)
        unique = unique.distinct().map(lambda x :(x[0], (x[1] ,self.computeJacardSimilarity(x[0],x[1])))).filter(lambda x : x[1][1] >= 0.5)
        output = unique.groupByKey().sortByKey().map(lambda x: (x[0], sorted(list(x[1]),key=lambda tup: tup[0]))).collect()
        print("done",time.time()- start)

        with open('Tuhina_Kumar_SimilarMovie_Jaccard.txt','w') as f:
            for i in range(len(output)):
                str1 =''
                for j in range(len(output[i][1])):
                    str1 = str1 + str(output[i][0]) +', ' + str(output[i][1][j][0])+', '+ str(output[i][1][j][1])+'\n'
                f.write(str1)
        #print("str done",time.time()- start)


        ##### precision and recall ########
Example #4
    def bandHash(self):
        sc = SparkContext(appName='Inf553')
        input_file = sys.argv[1]
        output_file = sys.argv[2]
        #        rdd = sc.textFile('Input.txt')
        rdd = sc.textFile(input_file)
        input_data_rdd = rdd.map(self.inputData)
        input_rdd = input_data_rdd.map(self.signature)
        self.inputDict = dict(input_data_rdd.collect())
        counter = 0
        bandVal = self.numOfSig // self.numOfBands  # integer band size so the slice bounds stay ints
        candidatePairs = sc.emptyRDD()
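        # For each band, group users whose band of the signature matches exactly
        # and emit every pair within a group as a candidate pair.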

        for band in range(self.numOfBands):
            bandRDD = input_rdd.map(lambda x:
                                    (x[0], x[1][counter:counter + bandVal]))
            bandCandidatePairs = bandRDD.map(lambda umDict: (tuple(umDict[
                1]), umDict[0])).groupByKey().map(lambda x: list(x[
                    1])).flatMap(lambda x: list(combinations(x, 2)))
            candidatePairs = candidatePairs.union(bandCandidatePairs)
            counter = counter + bandVal

        pairwiseJaccSim = candidatePairs.distinct().map(lambda x: (
            x[0], x[1],
            self.calculationOfJaccard(self.inputDict[x[0]], self.inputDict[x[
                1]]))).flatMap(lambda x: ((x[0], ([(x[1], x[2])])), ((x[1], [(
                    x[0], x[2])])))).reduceByKey(lambda x, y: x + y).sortByKey(
                        ascending=True)

        outputRDD = pairwiseJaccSim.map(lambda x: (x[0], dict(x[1]))).map(
            lambda x: (x[0], sorted(x[1].items(), key=lambda k: (-k[1], k[0])))
        ).map(lambda x: (x[0], x[1][:5])).map(lambda x: ''.join(s for s in [
            'U' + str(x[0]), ':', ','.join('U' + str(movieName) for movieName
                                           in sorted([a[0] for a in x[1]]))
        ])).collect()

        #        outfile = open('Test.txt', 'w')
        outfile = open(output_file, 'w')
        for line in outputRDD:
            outfile.write(line)
            outfile.write('\n')
        outfile.close()
        sc.stop()
Example #5
def main(args):
    arguments = parseArguments(args)
    if arguments.debug:
        sys.stdout.write("Arguments: %s\n"%arguments)
    sc = SparkContext(appName=arguments.app)
    items = sc.textFile(arguments.input)
    items = items.map(lambda x: load_item(x)).cache()
    distances = sc.emptyRDD()
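    # Repeat the randomized repartition several times; each pass computes
    # approximate per-partition recommendations that are merged by union below.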
    for i in xrange(0, arguments.repetition):
        curr_distances = items.repartitionAndSortWithinPartitions(\
                          arguments.repartition, \
                          lambda x: randint(1, arguments.repartition)).\
                          mapPartitions(lambda x:
                          approx_distance_user(x, arguments.num_reco))
        distances = distances.union(curr_distances)
    distances = distances.reduceByKey(lambda a, b : a + b)
    distances = distances.map(lambda x: (x[0],
                              sorted(set(x[1]), key = lambda a: -a[1])))
    distances = distances.collect()
    for item_id, rec in distances:
        print ("%s\t%s"%(item_id, rec))
Example #6
class TFIDF():

	def __init__(self,input_path,output_path):
		self.input = input_path
		self.output = output_path
		self.texts = glob(self.input + '/*.txt')
		self.conf = SparkConf().setAppName('tfidf')\
							   .setMaster('local')\
							   .set('spark.executor.memory','1g')
		self.sc = SparkContext(conf=self.conf)

	def writeToCSVFile(self,rdd):
		with open(self.output + '/tfidf-scores.csv','wb') as csvfile:
			writer = csv.writer(csvfile)
			writer.writerow(['docID','word','score'])
			writer.writerows(rdd)


	def run(self):
		# Job 1: Word Frequency in Documents.
		tfilter = TextFilter().filter
		wcRDD = self.sc.emptyRDD()
		for dkey,textfile in enumerate(self.texts):
			tf = self.sc.textFile(textfile)\
					 .filter(lambda line: len(line.strip()) > 0)\
				     .flatMap(lambda line: tfilter(line))\
				     .map(lambda word: ((word,dkey),1))\
				     .reduceByKey(operator.add)
			N = tf.map(lambda ((w,d),y): y).sum()
			tf = tf.map(lambda ((w,d),y): ((w,d),(y,N)))
			wcRDD = self.sc.union([wcRDD,tf])

		# Job 2: Word Frequency in Corpus & Calculate TF-IDF.
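		# wcRDD rows are ((word, doc), (count, doc_length)) and wfRDD counts the
		# documents containing each word; the score -count/doc_length * log(D/doc_freq)
		# is negated so sortByKey puts the highest tf-idf first (true division assumed).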
		D = self.sc.broadcast(len(self.texts))
		wcRDD = wcRDD.map(lambda ((w,d),(a,b)): (w,(d,a,b)))
		wfRDD = wcRDD.map(lambda (w,(d,a,b)): (w,1)).reduceByKey(operator.add)
		tfidf = wcRDD.join(wfRDD).map(lambda (w,((d,a,b),c)): ((d,-a/b * np.log(D.value/c),w),1))\
					 .sortByKey(True).map(lambda ((d,z,w),a): (d,w,-z))
		self.writeToCSVFile(tfidf.collect())
Example #7
months = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october']
for m in months:
    file_list = []
    for path in all_paths:
        if(fnmatch.fnmatch(path,'*/' + m + twitter_files)):
            file_list.append(path)

    # Define Schema so that we can define an empty df
    schema = StructType([StructField('id',StringType(),True),
                         StructField('num_followers',IntegerType(),True),
                         StructField('num_following',IntegerType(),True),
                         StructField('mentions',ArrayType(StringType(), True),True)])

    # Empty df
    cleaned_df = sqlContext.createDataFrame(sc.emptyRDD(), schema)
    for f in file_list[:]:
        df_temp = sqlContext.read.json(f)
        s_temp = getCols(df_temp)
        cleaned_df = cleaned_df.unionAll(s_temp)

    cleaned_df.write.parquet('file:///home/fang/twitter_user_info' + '_' + m[:3]+'.parquet')

# for path in all_paths:
#     if(fnmatch.fnmatch(path, twitter_files)):
#         file_list.append(path)
Example #8
                    .set_extractor(cve_regex_extractor)

                msid_regex = re.compile('(ms[0-9]{2}-[0-9]{3})', re.IGNORECASE)
                msid_regex_extractor = RegexExtractor() \
                    .set_regex(msid_regex) \
                    .set_metadata({'extractor': 'msid-regex'}) \
                    .set_include_context(True) \
                    .set_renamed_input_fields('text')

                msid_regex_extractor_processor = ExtractorProcessor() \
                    .set_name('msid_from_extracted_text-regex') \
                    .set_input_fields('raw_content') \
                    .set_output_field('extractions.msid') \
                    .set_extractor(msid_regex_extractor)

                cdr_extractions_isi_rdd = sc.emptyRDD()
                extraction_source_names = []
                for source in source_extraction_fields:
                    extraction_source_names.append(source)
                    extraction_fields = source_extraction_fields[source]

                    cve_process_source = ExtractorProcessor() \
                                        .set_name('cve_from_extracted_text-regex') \
                                        .set_input_fields(extraction_fields) \
                                        .set_output_field('extractions.cve') \
                                        .set_extractor(cve_regex_extractor)
                    msid_process_source = ExtractorProcessor() \
                                        .set_name('msid_from_extracted_text-regex') \
                                        .set_input_fields(extraction_fields) \
                                        .set_output_field('extractions.msid') \
                                        .set_extractor(msid_regex_extractor)
Example #9
    if len(sys.argv) != 1:
        print("Usage: Report")
        exit(-1)

    conf = SparkConf().set('spark.local.dir', '/data/store/tmp')
    conf.set('spark.storage.memoryFraction', '0.5')
    conf.set('spark.akka.frameSize', '256')

    # Connect to Spark
    sc = SparkContext(appName="Big Data Report 2015", conf=conf)

    stopwords = get_stopwords(sc)

    # Part 1
    # Load and parse non-empty files into RDD
    rawDataRDD = sc.emptyRDD()
    rdds = []
    batchSize = 1000
    i = 0

    dirs = []
    for root, dir, files in os.walk('/data/store/gutenberg/text-full/'):
        if len(files) == 0:
            continue  # skip empty directories
        if os.stat(os.path.join(root, files[0])).st_size > 1000000:
            continue  # skip files bigger than 1 megabyte
        dirs.append(root)

    print("Got {} dirs - {} chunks".format(len(dirs), int(len(dirs) / batchSize)))

    countWords = []
Example #10
    try:
        correlation = (n * s_xy - s_x * s_y) / (sqrt(
            (n * s_x2 - s_x**2) * (n * s_y2 - s_y**2)))
    except:
        pass
    return correlation

input = sc\
    .textFile(args.input)\
    .map(lambda line: line.split(','))\
    .filter(lambda splits: len(splits) == 8 and splits[0][0] != '#')\
    .map(lambda x: (x[0], x[2], int(x[3]), float(x[4])))\
    .filter(filter_moment)\
    .map(lambda x: (x[0], int(datetime.strptime(x[1], '%Y%m%d%H%M%S%f').timestamp()), x[2], x[3]))\
    .cache()

result = sc.emptyRDD()
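# For every candle width, bucket the ticks into candles, then correlate each
# candle series with copies of itself shifted by the configured offsets.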

for width in args.candle_widths:
    candles = input\
        .map(lambda x: moment_to_candle_start(x, width))\
        .reduceByKey(lambda x, y: x if x[0] > y[0] else y) \
        .map(lambda x: (x[0][0], (x[0][1], x[1][1])))\
        .groupByKey()\
        .cache()

    for shift in args.candle_shifts:
        shifted_candles = candles\
            .map(lambda x: (x[0] + shift * width, x[1])).cache()

        correlations = candles\
            .join(shifted_candles)\
Example #11
    sqlContext = SQLContext(sc)

    tableName = sc.broadcast(createUniqueTableName('NBM'))
    MYSQL_CONNECTION_URL = sc.broadcast(
        'jdbc:mysql://localhost:3306/' + db + '?user='******'&password='******'&useUnicode=true&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=UTC&useSSL=false'
    )

    # CREATE TABLE SCHEMA
    schema = StructType([
        StructField("gt", IntegerType(), True),
        StructField("predicted", IntegerType(), True)
    ])

    dfTableSchema = sqlContext.createDataFrame(sc.emptyRDD(), schema)
    dfTableSchema.write.jdbc(MYSQL_CONNECTION_URL.value,
                             tableName.value,
                             mode='error')

    # LOAD JDBC PROPERTIES
    df = sqlContext.read.format('jdbc')\
                        .options(url = MYSQL_CONNECTION_URL.value,
                                 dbtable = db+'.'+tableName.value
                                ).load()

    # CREATE STREAMING CONTEXT
    ssc = StreamingContext(sc, int(spark_batch_duration))

    # setting checkpoint
    # ssc.checkpoint(".")
Example #12
    # print(sum(res))
    return sum(res)


# delete_edge = betweenness.sortBy(lambda x:x[1],False).map(lambda x:(x[0][0],x[0][1])).first()
# print(delete_edge)
# sets = graph.filter(lambda x: x == delete_edge).flatMap(lambda x: [x[0], x[1]]).map(lambda x:findSet(x, parent_child))
# Q = sets.map(modularity).sum() / (2 * m)
# print(len(sets.take(1)[0]))
# print(Q)

increase = Decimal(1)
delete_edge = betweenness.sortBy(
    lambda x: x[1], False).map(lambda x: (x[0][0], x[0][1])).first()
delete_edges = [delete_edge]
default_sets = sc.emptyRDD()
Q_max = Decimal(0)
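# Girvan-Newman style loop: keep removing the highest-betweenness edge, rebuild
# the adjacency lists, and recompute the resulting connected components.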
while increase >= 0.0:
    new_graph = graph.filter(lambda x: x not in delete_edges)
    # print(new_graph.count())
    children = new_graph.flatMap(lambda x: [(x[0], [x[1]]), (x[1], [x[0]])]
                                 ).reduceByKey(lambda x, y: x + y).collect()
    new_parent_child = dict()
    for node in children:
        new_parent_child[node[0]] = node[1]

    sets = graph.filter(lambda x: x in delete_edges).flatMap(
        lambda x: [x[0], x[1]]).distinct().map(
            lambda x: findSet(x, new_parent_child)).distinct()
    # print(sets.filter(lambda x:len(x)==1).collect())
    # print(nodes.subtract(sets.flatMap(lambda x:[x])))
Example #13
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: PySparkForumPostTraining.py <classifiers_file> <test_file>", file=sys.stderr)
        exit(-1)
    print("Started Classification")
    sc = SparkContext(appName="PySparkForumPostClassify", pyFiles=['./classifier.py'])
    
    classifiers_file_path = sys.argv[1]
    load_classifiers(classifiers_file_path)
    test_data_file = sys.argv[2]

    
    lines = sc.textFile(test_data_file, 1).zipWithUniqueId()
    lines.cache()
    
    stage0 = sc.emptyRDD()
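    # Tag every test line once per loaded classifier so each document is scored
    # by all models; the union of these copies feeds the scoring pipeline below.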
    for classifier_key, classifier in classifiers.items():
        tmp_rdd = lines.map(lambda x: map_add_classifier(classifier.name,x))
        stage0 = stage0.union(tmp_rdd)
    
    #print(stage0.first())
    
    # map by value each word in documents.
    stage1 = stage0.flatMapValues(lambda x: x).filter(filter_inexistent_words)  \
                   .map(map_log_of_probability) \
                   .reduceByKey(reducer_add) \
                   .map(map_get_class_prob) \
                   .reduceByKey(reducer_get_classification) \
                   .map(map_accuracy_classification) \
                   .reduceByKey(reducer_add)
    #print(stage1.take(20))
Example #14
else:
    cursor = conn.execute("SELECT count(*) FROM casos");
    log("Tabela 'casos' encontrada com sucesso. " + str(len(cursor.fetchall())) + " registros existentes.")


sc = SparkContext()

sqlContext = SQLContext(sc)

field = [StructField("DT_NOTIFIC", StringType(), True),
         StructField("NU_ANO", IntegerType(), True),
         StructField("Long_WGS84", StringType(), True),
         StructField("Lat_WGS84", StringType(), True), ]
schema = StructType(field)

df = sqlContext.createDataFrame(sc.emptyRDD(), schema)

log("Iniciando geração dicionário bairros...")
bairrosDict = getBairrosDict(sc)
log("Sucesso\n")

cols = ['DT_NOTIFIC', 'NU_ANO', 'Long_WGS84', 'Lat_WGS84']

log("Lendo dados chamados dengue...")
for y in range(2010, time.localtime()[0]):
    for m in range(1, 13):

        fileN = ("Casos_Notificados_Dengue_mes_ano.csv").replace("ano", str(y)).replace("mes", "%02d" % m)
        if fileN not in arquivosImportados:
            log("Processando: " + fileN)
            path = "./" + fileN
Example #15
    return res
if __name__ == "__main__":
    
    os.system ("hadoop fs -mkdir -p features_selection")
    os.system ("hadoop fs -mkdir -p features_selection/input_mat")
    
    iters = 30
    if (len (sys.argv[1].split ('/')) > 1):
        input_mat = sys.argv[1].split ('/')[1] .split ('.')[0]
    else:
        input_mat = sys.argv[1]. split ('.')[0]
 
    df = sqlcontext.read. parquet ('hdfs://master:9000/user/hduser/matrix_of_depend/' + input_mat)
    targets = df. select ("P1"). distinct ().  rdd. map (lambda x: x[0]). collect ()
    
    features = sc.emptyRDD () 
    
    for target in targets:
        hubs = select_k_variables (df, target, iters = iters, k=3)
        features = features.union (hubs. map (lambda x:(x[0], [x[1]])))

    features = features. reduceByKey (lambda x,y: x + y ).\
    map (lambda x: (x[0].split ('_')[0], x[0].split ('_')[1], x[1])).\
    map (lambda x: (x[0], x[1], list_to_str (x[2]))).\
    toDF (["Col", "NbV", "Hubs"])
    
    features.write.format("com.databricks.spark.csv").\
    mode("overwrite").\
    option("header", "true").\
    save("/user/hduser/features_selection/" + input_mat) 
Example #16
def aggregate_tags_count(new_values, total_sum):
    return sum(new_values) + (total_sum or 0)

# Helper to get a singleton SQLContext instance
def get_sql_context_instance(spark_context):

    if ('sqlContextSingletonInstance' not in globals()):
        globals()['sqlContextSingletonInstance'] = SQLContext(spark_context)

    return globals()['sqlContextSingletonInstance']


from pyspark.sql.types import *
schema = StructType([])
sql_context = HiveContext(sc)
empty = sql_context.createDataFrame(sc.emptyRDD(), schema)

# Helper to compute the count results
def process_rdd(_, rdd):
    try:
        # Get the SQLContext singleton
        sql_context = get_sql_context_instance(rdd.context)

        # Convert the RDD to a Row RDD
        row_rdd = rdd.map(lambda w: Row(hashtag=w[0], hashtag_count=w[1]))
        # Create a DataFrame from the Row RDD
        hashtags_df = sql_context.createDataFrame(row_rdd)
        # Register the DataFrame as a table
        hashtags_df.registerTempTable("hashtags")
        # Get the 20 hashtags with the highest frequency
Example #17
from pyspark.sql import Row
from pyspark.sql import functions as F
from pyspark.ml.feature import RegexTokenizer, StringIndexer

from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, LongType, BooleanType, ArrayType
from pyspark.ml.clustering import KMeans

sc = SparkContext()
spark = SparkSession.builder.appName("task2").config(
    "spark.some.config.option", "some-value").getOrCreate()
sqlContext = SQLContext(spark)
resub = F.udf(lambda string: re.sub(r'[^\w\s]', '', string), StringType())

#collect data to rdd
cluster = sc.textFile("cluster2.txt").collect()
inp = sc.emptyRDD()
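# Read every file listed in cluster2.txt, split each row on tabs into
# (value, count) pairs, and union them before aggregating counts per value.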
for file in cluster:
    filePath = "/user/hm74/NYCColumns/" + file.replace("'", "")
    tmp = sc.textFile(filePath).map(lambda row: row.split("\t")).map(
        lambda x: (str(x[0]), x[1]))
    inp = sc.union([inp, tmp])

inp = inp.reduceByKey(lambda x, y: int(x) + int(y))

df = sqlContext.createDataFrame(inp, ['inp', 'count'])
df = df.withColumn("sentence", resub(df.inp))

#tokenized words
regexTokenized = RegexTokenizer(inputCol="sentence",
                                outputCol="words").transform(df)
regexTokenized = regexTokenized.select("sentence", "words", "count")
Example #18
sql_cart = "select uid,pid,0 as rating from data_cart"
sql_favorites = "select uid,pid,0 as rating from data_favorites"
cur = conn.cursor()
cur.execute(sql_payment)
rdd_payment = sc.parallelize(cur.fetchall())
print "rdd_payment.count() = %s" % rdd_payment.count()
cur.execute(sql_order)
rdd_order = sc.parallelize(cur.fetchall())
print "rdd_order.count() = %s" % rdd_order.count()
data_cart = cur.execute(sql_cart)
rdd_cart = sc.parallelize(cur.fetchall())
print "rdd_cart.count() = %s" % rdd_cart.count()
data_favorites = cur.execute(sql_favorites)
rdd_favorites = sc.parallelize(cur.fetchall())
print "rdd_favorites.count() = %s" % rdd_favorites.count()
rdd_rating = sc.emptyRDD()
rdd_temp = rdd_payment.map(lambda x:(x[0],x[1],10.0))
rdd_rating = rdd_rating.union(rdd_temp)
print "rdd_temp_payment.count() = %s" % rdd_temp.count()
rdd_temp = rdd_order.subtract(rdd_payment).map(lambda x:(x[0],x[1],8.0))
rdd_rating = rdd_rating.union(rdd_temp)
print "rdd_temp_order.count() = %s" % rdd_temp.count()
rdd_temp = rdd_cart.subtract(rdd_order).subtract(rdd_payment).map(lambda x:(x[0],x[1],7.0))
rdd_rating = rdd_rating.union(rdd_temp)
print "rdd_temp_cart.count() = %s" % rdd_temp.count()
rdd_temp = rdd_favorites.subtract(rdd_cart).subtract(rdd_order).subtract(rdd_payment).map(lambda x:(x[0],x[1],5.0))
rdd_rating = rdd_rating.union(rdd_temp)
print "rdd_temp_favorites.count() = %s" % rdd_temp.count()
print "rdd_rating.count() = %s" % rdd_rating.count()

collect = rdd_rating.collect()
Example #19
    try:
        correlation = (n * s_xy - s_x * s_y) / (sqrt( (n * s_x2 - s_x ** 2) * (n * s_y2 - s_y ** 2) ))
    except:
        pass
    return correlation

input = sc\
    .textFile(args.input)\
    .map(lambda line: line.split(','))\
    .filter(lambda splits: len(splits) == 8 and splits[0][0] != '#')\
    .map(lambda x: (x[0], x[2], int(x[3]), float(x[4])))\
    .filter(filter_moment)\
    .map(lambda x: (x[0], int(datetime.strptime(x[1], '%Y%m%d%H%M%S%f').timestamp()), x[2], x[3]))\
    .cache()

result = sc.emptyRDD()

for width in args.candle_widths:
    candles = input\
        .map(lambda x: moment_to_candle_start(x, width))\
        .reduceByKey(lambda x, y: x if x[0] > y[0] else y) \
        .map(lambda x: (x[0][0], (x[0][1], x[1][1])))\
        .groupByKey()\
        .cache()

    for shift in args.candle_shifts:
        shifted_candles = candles\
            .map(lambda x: (x[0] + shift * width, x[1])).cache()

        correlations = candles\
            .join(shifted_candles)\
Example #20
    parser.add_argument("-s","--sigma",type=float,default=5,help="sigma to use for gaussing smoothing")
    parser.add_argument("-d","--dst",default="/tmp",help="destination path")
    parser.add_argument("-n","--num_partitions",type=int,default=16,help="number of partitions to create, each with num_files/num_partitions records")
    parser.add_argument('--nocache', dest='nocache',default=False,action='store_true',help="cache image stack before thresholding")
    parser.add_argument('--granular', dest='granular',default=False,action='store_true',help="granular image processing operations (vs grouped)")
    args = parser.parse_args()

    threshold_percent=args.percent
    gaussian_sigma=args.sigma

    sc = SparkContext(appName="APS_Thresholder")

    import time
    t0=tbegin=time.time()

    files=sc.emptyRDD()
    if args.path != None:
        filelist = genfilelist(args.path, args.ext)     #note: occasional problem here when worker tries to read unsynced nfs file, use filelist instead
        files = sc.textFile(filelist)
    else:
        files=sc.textFile(args.filelist)

    # threshold_stack.foreach(noop)  #useful to force pipeline to execute for debugging
    # tmark=time.time()
    # print("generate and read files: %0.6f"%(tmark-to))
    # t0=tmark

    slice_count=files.count()
    files=files.repartition(args.num_partitions)  # Or maybe just slice_count, but I suspect file size plays a significant role in how many records per partition is optimal.

    stack=files.map(readTiff)
Example #21
print(rdd.getNumPartitions())

##creating signature matrix
x = rdd.mapPartitions(lambda iterator: create_local_signature_matrix(
    iterator, user_map)).reduceByKey(
        lambda x, y: getMinForBusinessFronPartitions(x, y))
signature_matrix = dict(x.collect())

## creating bands
y = x.flatMap(lambda x: divideIntoBands(x)).groupByKey().collect()

## get candidate pairs
#z = y.mapValues(lambda x: getCandiatesForBands(x)).collect()
#print(len(z))
candidates = sc.emptyRDD()
for i in y:
    band = sc.parallelize(i[1])
    c = band.groupByKey().map(lambda x: list(x[1])).flatMap(
        lambda x: list(combinations(x, 2)))
    candidates = candidates.union(c)

candidates = candidates.distinct().persist()
singleCandidates = candidates.flatMap(
    lambda x: [x[0], x[1]]).distinct().collect()
candidates = candidates.collect()

#pairs = candidates.distinct().map(lambda x: (x,calculateSimilarity(signature_matrix[x[0]],signature_matrix[x[1]]))).filter(lambda x: x[1]>=0.5).collect()

businesses_copy = rdd.map(lambda x: (x[1], [x[0]])).reduceByKey(
    lambda x, y: x + y)
Example #22
def run(cfg):

    global hive_context

    sc = SparkContext()
    hive_context = HiveContext(sc)
    sc.setLogLevel('WARN')

    # ESClient requires host ip

    es_client_booking = ESClient(cfg['es_host'], cfg['es_port'],
                                 cfg['es_booking_index'], cfg['es_booking_type'])
    bookings = es_client_booking.search({})  # get at most 1000 results for now
    bookings = optimizer.util.filter_valid_bookings(bookings)
    # adjust dates in bookings
    bookings = optimizer.util.adjust_booking_dates(bookings)
    bookings_map = optimizer.util.get_bookings_map(bookings)

    df = hive_context.createDataFrame(sc.emptyRDD(), optimizer.util.get_common_pyspark_schema())
    today = cfg['today']  # YYYY-MM-DD
    days = optimizer.util.get_days_from_bookings(today, bookings)

    df = generate_resources(cfg, df, bookings_map, days, bookings, hive_context)
    # Row(day='2018-04-02', ands=['b1', 'b3', 'b2'], minus=[], allocated={}, amount=43562)
    print('defining resources')
    df.cache()
    print(df.take(1))

    # run the allocation
    df = hwm_allocation(df, bookings, days)

    # Row(day='2018-04-02', ands=['b1', 'b3', 'b2'], minus=[], amount=43562, allocated={'b2': 800, 'b3': 1000, 'b1': 500})
    print('bb-bookings allocation')
    df.cache()
    print(df.take(1))

    # lock bookings
    lock_booking(es_client_booking, True)

    # remove bbs
    remove_booking_buckets(cfg, days)

    # save new booking-buckets into es
    df = save_booking_buckets_in_es(cfg, df)
    print('bbs saved')
    df.cache()
    print(df.take(1))

    # unlock bookings
    lock_booking(es_client_booking, False)
    day = days[-1]
    tomorrow = optimizer.util.get_next_date(day)

    # use only tomorrow to create the allocation plan
    df = df.filter(df.day == tomorrow)

    # this method add the bbs ucdocs allocation_map with their values
    df = add_ucdoc_bb_allocation_map(cfg, df, bookings_map)

    # [Row(day='2018-04-02', ands=['b1', 'b3', 'b2'], minus=[], amount=43562, allocated={'b2': 800, 'b3': 1000, 'b1': 500}, allocation_map={'minusonepage,3,5G,g_x,2,pt,1002,icc': {'b2': 1, 'b3': 2, 'b1': 1}, 'magazinelock,2,3G,g_x,3,pt,1005,icc': {'b2': 56, 'b3': 70, 'b1': 35}, 'magazinelock,2,4G,g_x,3,pt,1005,icc': {'b2': 56, 'b3': 70, 'b1': 35}, 'minusonepage,3,5G,g_x,2,pt,1003,icc': {'b2': 6, 'b3': 8, 'b1': 4}, 'minusonepage,1,4G,g_x,2,pt,1003,icc': {'b2': 16, 'b3': 20, 'b1': 10}, 'minusonepage,2,4G,g_f,4,pt,1002,icc': {'b2': 12, 'b3': 15, 'b1': 8}, 'cloudFolder,2,5G,g_x,3,pt,1005,icc': {'b2': 57, 'b3': 72, 'b1': 36}, 'minusonepage,2,3G,g_x,3,pt,1002,icc': {'b2': 3, 'b3': 4, 'b1': 2}, 'minusonepage,1,3G,g_x,1,pt,1005,icc': {'b2': 27, 'b3': 33, 'b1': 17}, 'minusonepage,1,3G,g_x,4,pt,1004,icc': {'b2': 72, 'b3': 90, 'b1': 45}, 'magazinelock,2,5G,g_x,4,pt,1004,icc': {'b2': 32, 'b3': 40, 'b1': 20}, 'cloudFolder,2,3G,g_f,3,pt,1002,icc': {'b2': 16, 'b3': 20, 'b1': 10}, 'cloudFolder,3,5G,g_f,2,pt,1004,icc': {'b2': 27, 'b3': 34, 'b1': 17}})]
    print('ucdocs-bookings allocation')
    df.cache()
    print(df.take(1))

    # at this point we have a df which is a allocation of bookings to bbs
    df = df.select(df.day, explode(df.allocation_map))

    # Row(day='2018-04-02', key='magazinelock,3,5G,g_x,2,pt,1004,icc', value={'b2': 14, 'b3': 18, 'b1': 9})
    print('exploded')
    df.cache()
    print(df.take(1))

    # agg all the allocation maps for a ucdoc
    _map_type = MapType(StringType(), IntegerType())
    _audf = udf(agg_allocation_maps, _map_type)
    df = df.groupBy('key').agg(_audf(collect_list('value')).alias('allmap'))

    # [Row(key='cloudFolder,3,5G,g_f,2,pt,1004,icc', allmap={'b2': 27, 'b3': 34, 'b1': 17})]
    print('final aggregation')
    df.cache()
    print(df.take(1))

    # writing into hdfs
    filename = 'allmap-{}-{}'.format(
        optimizer.util.convert_date_remove_dash(day), str(int(time.time())))
    df.write.save(filename, format='json')
Example #23
    return when(col(x) != "", col(x)).otherwise(0)


# Importing and processing our dataset
records_rdd = raw_records.map(pre_process)
records_df = records_rdd.toDF(schema=["Timestamp", "LineID", "Direction", "JourneyPatternID", "Timeframe",\
                                      "VehicleJourneyID", "Operator", "Congestion", "Lon", "Lat", "Delay",\
                                      "BlockID", "VehicleID", "StopID", "AtStop"])
records_df_without_empty = records_df.withColumn("LineID",
                                                 blank_as_null("LineID"))

# Creating an empty data-frame for storing coordinates of all the stops within all lineID
relevant_fields = [StructField("LineID",StringType(), True),StructField("StopID", StringType(), True),\
            StructField("Lon", StringType(), True), StructField("Lat", StringType(), True)]
schema = StructType(relevant_fields)
all_coordinates_df = sqc.createDataFrame(sc.emptyRDD(), schema)

# Remapping records into an RDD by LineID as Key
filtered_data_rdd = records_df_without_empty.rdd.map(lambda x: (str(x["LineID"]), (str(x["LineID"]),\
                                                                        time.ctime(int(str(x["Timestamp"]))/1000000),\
                                                                        str(x["JourneyPatternID"]),\
                                                                        int(str(x["VehicleID"])),\
                                                                        int(str(x["VehicleJourneyID"])),\
                                                                        int(str(x["Delay"])),\
                                                                        str(x["Lon"]), str(x["Lat"]),\
                                                                        str(x["StopID"]),\
                                                                        int(str(x["AtStop"])))))

# Grouping those records on LineID
grouped_by_lineID = filtered_data_rdd.groupByKey().mapValues(list)
results_1 = grouped_by_lineID.collect()
Example #24
    header = patient.first()
    patient = patient.filter(lambda row: row != header)
    patient = patient.flatMap(pair_patient_to_disease).filter(patient_filter)

    support_pct = args.min_sup
    patient_cnt = patient.keys().distinct().count()
    min_support = round(patient_cnt * support_pct)

    joined_set = geo.join(patient).map(lambda row:
                                       (row[0], row[1][0][0], row[1][0][1]))

    transaction_set = joined_set.map(lambda s: (s[0], s[1])).groupByKey().map(
        lambda s: (s[0], set(s[1])))

    items = joined_set.map(lambda row:
                           (row[1], row[2])).keys().distinct().collect()
    #items = set(items)

    final_set = sc.emptyRDD()
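    # Apriori-style search: on each pass generate candidate itemsets of size i+1
    # and keep only those whose patient count reaches the minimum support.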

    for i in range(args.num_iter):
        itemset_combs = generate_item_combs(items, i + 1)
        retitems = transaction_set.values().flatMap(lambda row: returnItems(itemset_combs, row)).reduceByKey(sum_patient_count) \
            .map(lambda row: (row[0], row[1], min_support)).filter(min_support_filter).map(lambda row: (row[1], row[0]))

        final_set = final_set.union(retitems)

    final_set = final_set.sortBy(lambda x: x[0], ascending=False)
    final_set.map(lambda x: str(x[0]) + '\t' + "\t".join(y for y in x[1])
                  ).coalesce(1).saveAsTextFile(args.output)
Example #25
# NOTE: This code requires you to have downloaded simulation snapshots
# to an EBS storage attached to your cluster.

# If you are interested in getting access to the Caterpillar particle data,
# please contact the Caterpillar team / email [email protected]

conf = SparkConf().setAppName('project_spark')
sc = SparkContext(conf=conf)

# getting snapshot number from command line
snap = sys.argv[1]

# downsampling factor
down_max = 0.1

allpos_rdd = sc.emptyRDD()
snap3char = str(snap).zfill(3)
for i in range(64):
    # read the file into an numpy array
    newfile = h5py.File(
        '/mnt/s3/snapdir_' + snap3char + '/snap_' + snap3char + '.' + str(i) +
        '.hdf5', 'r')
    particletypes = newfile.keys()[1:]
    # loop through all particle types
    for newtype in particletypes:
        if newtype == 'PartType1':
            # load the coordinates of the high-resolution type into an rdd
            positions = newfile[newtype]['Coordinates'][:]
            positions_rdd = sc.parallelize(positions)
            # downsample the rdd
            typeindex = int(newtype[-1]) - 1
Example #26
def date_and_day(df):
    first_timestamp = int(float(df.collect()[0]["Timestamp"]))/1000000
    readable_first_timestamp = t.ctime(first_timestamp)
    day = readable_first_timestamp[0:3]
    date = readable_first_timestamp[4:10]
    return day, date


# Creating an empty data-frame for storing busy lines data
relevant_fields = [StructField("LineID",IntegerType(), True), \
                   StructField("Number of times at stops", IntegerType(), True), \
                   StructField("Date",StringType(), True), \
                   StructField("Day", StringType(), True), \
                   ]
schema = StructType(relevant_fields)
busy_lines_df = sqc.createDataFrame(sc.emptyRDD(), schema)

# Importing and cleaning our data-set
records_rdd = raw_records.map(pre_process)
records_df = records_rdd.toDF(schema=["Timestamp", "LineID", "Direction", "JourneyPatternID", "Timeframe", \
                                      "VehicleJourneyID", "Operator", "Congestion", "Lon", "Lat", "Delay", \
                                      "BlockID", "VehicleID", "StopID", "AtStop"])
records_df = cleaning(records_df)

# Getting day and date for this set of records
day, date = date_and_day(records_df)

# Remapping rdd as a PairRDD with LineID as key
records_keyLineID_rdd = records_df.rdd.map(lambda x: (int(str(x["LineID"])), [(int(str(x["LineID"])), \
                                                                               int(str(x["StopID"])), \
                                                                               int(float(str(x["Timestamp"]))), \
Example #27
# If this is the main program
if __name__ == "__main__":
    # Make sure we have all arguments we need
    if len(sys.argv) != 1:
        print("Usage: Report")
        exit(-1)

    conf = SparkConf().set('spark.local.dir', '/data/store/tmp')
    # Connect to Spark
    sc = SparkContext(appName="Big Data Report 2015", conf=conf)

    stopwords = get_stopwords(sc)

    # Part 1
    # Load and parse non-empty files into RDD
    rawDataRDD = sc.emptyRDD()
    rdds = []
    batchSize = 1000
    i = 0

    dirs = []
    for root, dir, files in os.walk('/data/store/gutenberg/text-full/'):
        if len(files) == 0:
            continue  # skip empty directories
        if os.stat(os.path.join(root, files[0])).st_size > 1000000:
            continue  # skip files bigger than 1 megabyte
        dirs.append(root)

    print("Got {} dirs - {} chunks".format(len(dirs),
                                           int(len(dirs) / batchSize)))
Example #28
    # treatment_day = datetime.strptime(sys.argv[1], '%Y-%m-%d').date()
    # source_root = '/home/vlepot/dev/navitia-stat-logger/tmp'
    # source_root = 'gs://hdp_test'

    source_root = sys.argv[1]
    treatment_day_start = datetime.strptime(sys.argv[2], '%Y-%m-%d').date()
    treatment_day_end = datetime.strptime(sys.argv[3], '%Y-%m-%d').date()

    print "Go for dates: " + treatment_day_start.strftime('%Y-%m-%d') + " -> " + treatment_day_end.strftime('%Y-%m-%d')
    print "Source root dir: " + source_root

    conf = SparkConf().setAppName("coverage_journeys_compiler")
    sc = SparkContext(conf=conf)

    statsLines = sc.emptyRDD()
    treatment_day = treatment_day_start
    while treatment_day <= treatment_day_end:
        if source_root.startswith("/") and \
                        len(glob(source_root + '/' + treatment_day.strftime('%Y/%m/%d') + '/*.json.log*')) > 0:
            statsLines = statsLines.union(sc.textFile(
                source_root + '/' + treatment_day.strftime('%Y/%m/%d') + '/*.json.log*')
            )
        treatment_day += timedelta(days=1)

    dayStats = statsLines.map(
        lambda stat: json.loads(stat)
    ).filter(
        lambda line: line["api"] == 'v1.journeys'
    )
Example #29
import argparse
import json
import sys

from datetime import datetime
from pyspark import SparkContext

'''
parser = argparse.ArgumentParser(description='Process an availability report')
parser.add_argument('--in', dest='input')
args = parser.parse_args()
'''

if __name__ == "__main__":
    sc = SparkContext(appName="AvailabilityReport")

    rdd = sc.emptyRDD()

    for path in sys.argv[1:]:
        rdd = rdd.union(sc.wholeTextFiles(path))
    
    availabilityTuples = rdd.values() \
        .flatMap(lambda fz: fz.split("\n")) \
        .filter(lambda u: len(u)>0) \
        .map(lambda line: line.split("\t")[2]) \
        .distinct() \
        .keyBy(lambda u: int(json.loads(u)['sequence']/1E4)) \
        .mapValues(lambda u: \
            (1, datetime.strptime(json.loads(u)['time'], '%Y-%m-%dT%H:%M:%S.%fZ'), datetime.strptime(json.loads(u)['time'], '%Y-%m-%dT%H:%M:%S.%fZ')) \
        ) \
        .reduceByKey(lambda a, b: \
            (a[0]+b[0], min(a[1], b[1]), max(a[2], b[2]))
Example #30
class StravaLoader(object):

    def __init__(self, 
                 data_source='local', 
                 activity_directory='strava-activities-subset',
                 s3bucket='larsbk',
                 athletes=None,
                 activity_types=[
                    'Ride',
                    'Run',
                    'NordicSki'
                 ],
                 sc=None,
                 hiveContext=None,
                 conf=(SparkConf().setAppName('Strava analysis')),
                 filter_bug_inducing_rows=True
                 ):

        ''' Initialize Strava Analysis object'''


        # INPUT PARAMETERS

        self.athletes = athletes # Athletes to analyze (optional)
        self.activity_types = activity_types # Activity_types to consider (default)
        self.filter_bug_inducing_rows = filter_bug_inducing_rows


        # CONFIGURE SPARK

        if sc != None and hiveContext != None: # Both contexts were supplied by user
            print 'Info: Using supplied SparkContext and HiveContext'
            self.sc = sc
            self.hiveContext = hiveContext

        else: # Initialize new contexts
            print 'Info: Initializing SparkContext and hiveContext from (default) conf'
            self.sc = SparkContext(conf=conf)
            self.hiveContext = HiveContext(self.sc)

        self.schema = pickle.load(open('./schema.p', 'rb')) # The pre-defined schema
        self.df = None # Empty DataFrame to be populated later


        # CONFIGURE DATA SOURCE

        data_root_path = {
                's3': 's3n://%s/%s/' % (s3bucket, activity_directory), 
                'local': './%s/' % activity_directory
        }
        
        if data_source not in data_root_path.keys(): # Check if data source is valid 
            raise Exception(('Unrecognized data source %s. '
                             'Supported sources: "%s".')
                            % (data_source, '", "'.join(data_root_path.keys())))
        
        self.data_source = data_source # This is a valid data source
        self.path = data_root_path[data_source] # This is the path to the data


        # (S3 SPECIFIC STUFF)

        if data_source == 's3':

            # Get a list of files in the activity_directory
            bucket = boto3.resource('s3').Bucket(s3bucket) 
            objects = bucket.objects.filter(Prefix='%s/gpx/' % activity_directory)
            files = [obj.key for obj in objects] 

            # Make set of observed combinations of athlete and activity_type
            athlete_and_type = set([]) # Empty set to populate
            fpattern = '\/([\w]+)\/(?:[\w-]+)-([\w]+)\.gpx' # File name pattern
            for fname in files:
                match = re.match(activity_directory+'/gpx'+fpattern, fname)
                if match:
                    athlete_and_type.add((match.group(1), match.group(2)))

            self.s3_athlete_and_type = athlete_and_type # Save set for later use

        pass


    def _get_athlete_directories(self):
        '''
        Look for athlete directories in data_root_path \
        and update self.athletes
        '''

        if self.data_source in ['local']:

            self.athletes = [
                directory for directory in os.listdir(self.path+'gpx/')
                if re.match('^[\w-]+$', directory)
            ]

        else:
            print ('Warning: Automatic directory/athlete detection not yet supported for '
                   'data source %s. Using: "akrogvig", "lkrogvig", "brustad"') \
                   % self.data_source

            self.athletes = ['akrogvig', 'lkrogvig', 'brustad']

        pass


    def _activities_exist(self, athlete, activity_type):
        '''
        Checks if there exists activities of type <activity_type> for athlete <athlete>, 
        returns a boolean value
        '''

        # Check local directory with glob
        if self.data_source == 'local':
            return glob.glob(self.path+'gpx/%s/*%s.gpx' % (athlete, activity_type))

        # Check if combination exists by using previously compiled sets
        elif self.data_source == 's3':
            return ((athlete, activity_type) in self.s3_athlete_and_type)

    def _load_dataset(self):
        '''
        Loads strava activities from source to DataFrame self.df
        '''

        # Get athlete list if not already set
        if not self.athletes:
            self._get_athlete_directories()

        # Initialize empty dataset
        self.df = self.hiveContext.createDataFrame(
            self.sc.emptyRDD(),
            self.schema
        )

        for athlete in self.athletes:
            for activity_type in self.activity_types:
        
                # Check that there are files of that type (or else .load fails)
                if self._activities_exist(athlete, activity_type):

                    # Read data
                    dfadd = self.hiveContext.read.format('com.databricks.spark.xml') \
                                    .options(rowTag='trkpt', treatEmptyValuesAsNulls=False) \
                                    .schema(self.schema) \
                                    .load(self.path+'gpx/%s/*%s.gpx' % (athlete, activity_type))
                
                    dfadd = dfadd.withColumn('athlete', lit(athlete)) \
                                 .withColumn('activity_type', lit(activity_type))
                
                    self.df = self.df.unionAll(dfadd)

        if self.filter_bug_inducing_rows:
            self.df = self.df.filter(self.df['extensions.gpxtpx:TrackPointExtension.#VALUE'].isNull())

        pass


    def derive_schema(self):
        '''
        Loads all data in self.path and derives the schema, saves with pickle to "schema.p"
        '''

        df = self.hiveContext.read.format('com.databricks.spark.xml') \
                    .options(rowTag='trkpt') \
                    .load(self.path+'gpx/*')

        df = df.withColumn('athlete',lit(None).cast(StringType())) \
               .withColumn('activity_type',lit(None).cast(StringType()))

        df.printSchema()
        pickle.dump(df.schema, open("schema.p", "wb"))

        pass


    def get_dataset(self):
        '''
        Returns strava activity dataset
        '''
        if self.df is None:
            self._load_dataset()
        
        return self.df
Example #31
solEstaciones = dfEstaciones.rdd
solEpocas = dfEpocas.rdd

# we now have solMeses = (MONTH, GENRE), COUNT
solMeses = solMeses.map(lambda x: ((x[1], x[0]), x[2]))
solMeses = solMeses.reduceByKey(lambda a, b: a + b)

# solEstaciones = (SEASON, GENRE), COUNT
solEstaciones = solEstaciones.map(lambda x: ((x[1], x[0]), x[2]))
solEstaciones = solEstaciones.reduceByKey(lambda a, b: a + b)

# solEpocas = (EPOCH, GENRE), COUNT
solEpocas = solEpocas.map(lambda x: ((x[1], x[0]), x[2]))
solEpocas = solEpocas.reduceByKey(lambda a, b: a + b)

maximo = sc.emptyRDD() # maximo will hold the top 5 genres per month, epoch and season
minimo = sc.emptyRDD() # minimo will hold the worst genre per month, epoch and season

for mes in meses:

	rdd4 = solMeses.filter(lambda x: mes == x[0][0]) # keep the rows for the month we want
	rdd4 = rdd4.map(lambda x: (x[1], x[0][1]))
	rdd4 = rdd4.sortByKey(False) # sort from most to fewest views
	rdd5 = rdd4.sortByKey(True) # sort from fewest to most views

	if rdd4.count() > 0:
		rdd4 = rdd4.take(5) # take the 5 largest
		rdd5 = rdd5.take(1) # and the smallest
		rdd4 = sc.parallelize(rdd4) # turn them back into RDDs
		rdd5 = sc.parallelize(rdd5)
		rdd4 = rdd4.map(lambda x: (mes, x[1], x[0]))
Example #32
        block = 3
    else:
        block = 4

    return (grid, block)


grid_block_udf = udf(grid_block, grid_schema)
schema = StructType([StructField("memsn", LongType(), True),
            StructField("utc", StringType(), True),
            StructField("meter", LongType(), True),
            StructField("busy", LongType(), True),
            StructField("acc", BooleanType(), True),
            StructField("grid", IntegerType(), True),
            StructField("block", IntegerType(), True)])
last_df = spark.createDataFrame(sc.emptyRDD(), schema)

def process(time, rowRdd):
    print("========= %s =========" % str(time))
    if rowRdd.isEmpty():
        print("Rdd is empty")
        return
    tw = pendulum.timezone("Asia/Taipei")
    time = tw.convert(time)
    utc = pendulum.timezone("UTC")
    end = pendulum.instance(utc.convert(time)).subtract(minutes=2)
    start = end.subtract(minutes=3)
    end.set_to_string_format("%Y-%m-%dT%H:%M:%SZ")
    start.set_to_string_format("%Y-%m-%dT%H:%M:%SZ")

    taxi_df = spark.createDataFrame(rowRdd)
Example #33
        for a in ans:
            print(a)
#question 7
if int(question) == 7:
    print("users who started a session on all hosts ")
    print(" + : ", end="")
    #empty RDD to hold intersections
    fu = uniqueuser(readhost(hosts[0]))
    for i in hosts[1:]:
        fu = fu.intersection(uniqueuser(readhost(i)))
    print(fu.collect())
#question 8
if int(question) == 8:
    print("users who started a session on exactly one host, with host name ")
    print(" + : ", end="")
    fu = sc.emptyRDD()
    for i in hosts:
        #map with host name
        fu = fu.union(uniqueuser(readhost(i)).map(lambda x: (x, i)))
    #repeats will have username, host1+host2+... and thus will not have single host
    print(
        fu.reduceByKey(lambda x, y: x + y).filter(
            lambda x: x[1] in hosts).sortBy(lambda x: x[1]).collect())

#question 9
if int(question) == 9:
    print("Host Anonymization")
    for i in hosts:
        print(" + " + i + ": ")
        print(" . User name mapping: ", end="")
        us = readhost(i)
Example #34
        name, value = mapping[idx]
        # if we already have a value for this feature, skip to
        # the next one
        if name in thisfeats and thisfeats[name] != 'None':
            continue
        if (f == '1'):
            thisfeats[name] = value
        else:
            thisfeats[name] = 'None'
    for ff in featids[1:]:
        vtxfeats.append(thisfeats[ff])
    return vtx(vtxid, *vtxfeats)


# load all of the feature maps, feature files, and self features into an RDD
alledges = sc.emptyRDD()
for personid in peopleids:
        featmap_fname = "/home/zeppelin/facebook/%d.featnames" % personid
        feats_fname = "%d.feat" % personid
        this_feats_fname = "%d.egofeat" % personid

        # load the feature map
        fmap = []
        with open(featmap_fname) as flines:
            for line in flines:
                fmap.append(fn_process(line))

        # load the features for all the edges, and our own
        f_rdd = sc.textFile(feats_fname).map(lambda x: feat_process(x, -1)). \
             union(sc.textFile(this_feats_fname).map(lambda x: feat_process(x, personid)))
        #f_rdd = sc.textFile(feats_fname).map(lambda x: feat_process(x, -1))
Example #35
    hdfs_client = InsecureClient(hdfs_address, user=hdfs_user)

    # opening training and test data files
    if not cluster_execution:
        learning_data_filename_training = 'file://' + learning_data_filename_training
        id_to_dataset_filename_training = 'file://' + id_to_dataset_filename_training
        if learning_data_filename_test:
            learning_data_filename_test = 'file://' + learning_data_filename_test
            id_to_dataset_filename_test = 'file://' + id_to_dataset_filename_test

    learning_data_training = sc.textFile(learning_data_filename_training +
                                         '/*').persist(
                                             StorageLevel.MEMORY_AND_DISK)
    id_to_dataset_training = sc.pickleFile(
        id_to_dataset_filename_training).persist(StorageLevel.MEMORY_AND_DISK)
    learning_data_test = sc.emptyRDD()
    id_to_dataset_test = sc.emptyRDD()
    if learning_data_filename_test:
        learning_data_test = sc.textFile(learning_data_filename_test +
                                         '/*').persist(
                                             StorageLevel.MEMORY_AND_DISK)
        id_to_dataset_test = sc.pickleFile(
            id_to_dataset_filename_test).persist(StorageLevel.MEMORY_AND_DISK)

    # taking first element and checking if information about joined dataset is present
    has_joined_data = False
    first = json.loads(learning_data_training.first())
    if 'joined_dataset' in first:
        has_joined_data = True

    # generating learning instances for training
Example #36
class Reader():
    def __init__(self):
        self.sc = SparkContext('local', 'Stream-SQL')
        self.ssc = StreamingContext(self.sc, batchDuration=3)
        self.spark = SparkSession.builder\
            .getOrCreate()
        self.sc.setLogLevel('ERROR')

    def initStream(self):
        self.readInput()

        self.ssc.start()
        self.ssc.awaitTermination()

    def inputSQLQuery(self, query):
        self.modQuery = ''
        self.dictInnerQuery = {}

        innerFlag = False
        innerCol = ''
        wordList = query.split(' ')
        wordQuery = ''

        for i in range(len(wordList)):
            word = wordList[i]

            # Detect opening '(' of inner query
            if word == '(SELECT':
                innerFlag = True
                innerCol = wordList[i - 2]

            if innerFlag:
                wordQuery += word + ' '
            else:
                self.modQuery += word + ' '

            # Detect closing ')' of table) and not AVG(col)
            if ')' in word and '(' not in word:
                replaceInner = 'Q' + str(len(self.dictInnerQuery))
                self.modQuery += replaceInner + ' '
                key = replaceInner
                value = [wordQuery, innerCol, 0]
                self.dictInnerQuery[key] = value

                innerFlag = False
                wordQuery = ''

    def readInput(self):
        lines = self.ssc.textFileStream('Data/Live')

        self.csvSchema = StructType([
            StructField('col1', IntegerType()),
            StructField('col2', IntegerType()),
            StructField('col3', IntegerType())
        ])

        self.stateDF = self.spark.createDataFrame(self.sc.emptyRDD(),
                                                  self.csvSchema)
        # self.stateDF.show()
        self.globalDF = self.spark.createDataFrame(self.sc.emptyRDD(),
                                                   self.csvSchema)

        self.totalTime = 0.0

        def row(inpStr):
            return Row(int(inpStr[0]), int(inpStr[1]), int(inpStr[2]))

        def iterateRDD(rdd):
            start = time.clock()
            data = rdd.map(lambda line: line.split(' ')).map(row)
            df = data.toDF(self.csvSchema)

            if df.count():
                # print(self.stateDF.count())
                curDF = df.union(self.stateDF)
                self.queryRDD(curDF)

                # Append to global DF for batch outputs
                # self.globalDF = df.union(self.globalDF)

                self.outputQuery(curDF)
                self.totalTime += time.clock() - start
                # print(str(round(self.totalTime, 2)) + 's')

        lines.foreachRDD(iterateRDD)

    def queryRDD(self, df):
        # df.show()
        df.createOrReplaceTempView('table')

        for key, value in self.dictInnerQuery.items():
            innerQuery = value[0]
            sqlDF = self.spark.sql(innerQuery)
            sqlRes = sqlDF.first()[0]
            self.dictInnerQuery[key][2] = sqlRes

        # df.show()
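        # Keep only rows whose monitored column falls within +/- b of the current
        # inner-query result; those rows become the new streaming state below.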
        b = 14
        addToState = [False for i in range(df.count())]
        for key, value in self.dictInnerQuery.items():
            col = value[1]
            val = value[2]
            # print(col, val, b)
            tupleList = [{col: x[col]} for x in df.rdd.collect()]
            for i in range(len(tupleList)):
                row = tupleList[i]
                if row[col] > val - b and row[col] < val + b:
                    addToState[i] = True

        # print(addToState)
        itr = 0
        newRows = []
        newStateDF = self.spark.createDataFrame(self.sc.emptyRDD(),
                                                self.csvSchema)
        for row in df.rdd.collect():
            if addToState[itr]:
                newRows.append(row)
            itr += 1
        # print(newRows)
        newStateDF = self.spark.createDataFrame(newRows, self.csvSchema)
        self.stateDF = newStateDF
        # newStateDF.printSchema()
        approxRows = newStateDF.sort('col1', ascending=False).collect()
        approxDF = self.spark.createDataFrame(approxRows, self.csvSchema)
        # approxDF.show()
        self.stateDF = self.spark.createDataFrame(approxDF.head(60),
                                                  self.csvSchema)
        # self.stateDF.show()

    def outputQuery(self, df):
        curQuery = ' '.join(
            list(
                map((lambda word: str(round(self.dictInnerQuery[word][2], 2))
                     if word in self.dictInnerQuery else word),
                    self.modQuery.split())))
        df.createOrReplaceTempView('table')
        streamOut = self.spark.sql(curQuery).first()[0]
        print(streamOut)
Example #37
            lambda a, b: a + b)  # oldrdd U rdd->sort->(key,amount)
    tmpFile = NamedTemporaryFile(delete=True)
    tmpFile.close()
    newrdd.saveAsPickleFile(tmpFile.name)
    open(filename, "w")  #remove all logs from logfile
    result = newrdd.collect()
    return result


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print('Usage: app.py <logfile>')
        sys.exit(-1)

    KeyspaceName = 'syslog'
    TableName = 'statistics'
    cluster = cascl.Cluster()
    CreateKeySpaceAndTable(cluster, KeyspaceName, TableName)
    conf = SparkConf().setAppName('CountingSyslogsByHours')
    sc = SparkContext(conf=conf)
    tmpFile = NamedTemporaryFile(delete=True)
    tmpFile.close()
    sc.emptyRDD().saveAsPickleFile(tmpFile.name)
    statistics = SparkCalculate(sc, sys.argv[1], tmpFile=tmpFile)
    #writing to Cassandra
    SaveToDB(statistics, cluster)
    #printing from Cassandra
    printFromDb(cluster)
    cluster.shutdown()
    sc.stop()
Example #38
        return (node, 1 / n)


if __name__ == "__main__":
    sc = SparkContext(appName="pagerank")
    lines = sc.textFile(sys.argv[1])
    count = lines.count()
    N = 1 / count
    links = lines.map(lambda x: parse(x)).cache()
    # print(links.collect())
    # fractionals = lines.map(lambda f:parse(f)).map(lambda z:rank_fractions(z)).filter(lambda fg:(fg!=None)).mapValues(lambda pr:pr*N)
    # fractionals = lines.map(lambda f:parse(f)).map(lambda p:initial_rank(p,count))

    #

    ranks = sc.emptyRDD()
    contribs = sc.emptyRDD()
    #
    initial_ranks = links.map(lambda r: initial_rank(r, count))
    frac = initial_ranks.map(lambda p: p).filter(lambda f: (f != None))
    # print(frac.collect())
    # print(initial_ranks.collect())
    for i in range(10):
        fractionals = initial_ranks.map(lambda p: rank_fractions(p)).filter(
            lambda f: (f != None))
        contribs = fractionals.reduceByKey(add)
        ranks = contribs.mapValues(lambda v: .15 + .85 * v)
        print(ranks.collect())
        joined = links.join(frac)

    print(joined.collect())
Example #39
float(15)/2


# In[96]:

start_time = time.time()

variants_case = sqlContext.sql("SELECT patient,chr,pos,reference,alternative,gene_symbol,zygosity FROM parquetFile "+sqlCase)
patientsID_case=sorted(variants_case.map(lambda v:v[0]).distinct().collect())

if sqlControl!="NULL":
    variants_control= sqlContext.sql("SELECT patient,chr,pos,reference,alternative,gene_symbol,zygosity FROM parquetFile "+sqlControl)
#    controlMAF=float(controlMAF)
else:
    variants_control=sc.emptyRDD()
#    controlMAF=0   
patientsID_control=sorted(variants_control.map(lambda v:v[0]).distinct().collect())

patientsID=patientsID_case+patientsID_control
patientsID_dictionnary=dict(zip(patientsID,range(len(patientsID))))

patientsID_split_index_b=sc.broadcast(len(patientsID_case))

patientsID_dictionnary_b = sc.broadcast(patientsID_dictionnary)

variants=variants_control.unionAll(variants_case)

variants_grouped=variants.map(createKey_VariantGene).groupByKey()

controlMAF_b=sc.broadcast(controlMAF)