Example 1
def SPARKreadFile(sc):
    basedir = os.getcwd()
    filename = os.path.join(basedir,'FAK53004_ae6e213fd4e39f25ca87bf1c770b24c891782abc_0.fastq')
    sc.setLogLevel("WARN")
    #file = open(SparkFiles.get(filename))
    file = sc.textFile(filename)
    list = file.take(file.count())
    dict = namedtuple('SEQUENCE', ['NUMBER','ID', 'SEQ', 'OP', 'QUAL'])
    DFs = []
    dict_ID = []
    dict_SEQ = []
    dict_OP = []
    dict_QUAL = []
    counter = 0
    for i, v in enumerate(list):
        if (i%4 == 0):
            dict_ID.append(v)
        if (i%4 == 1):
            dict_SEQ.append(v)
        if (i%4 == 2):
            dict_OP.append(v)
        if (i%4 == 3):
            dict_QUAL.append(v)
            df = dict(NUMBER = counter, ID = dict_ID[counter], SEQ = dict_SEQ[counter], OP = dict_OP[counter], QUAL = dict_QUAL[counter])
            DFs.append(df)
            counter +=1
    rdd = sc.parallelize(DFs)
    seqDF = rdd.map(lambda x: Row(NUMBER = x[0], ID=x[1], SEQ=x[2], OP=x[3], QUAL=x[4]))
    schemaSeqDF = sqlContext.createDataFrame(seqDF)
    #file.close()
    return schemaSeqDF
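SPARKreadFile relies on a module-level sqlContext and on os, namedtuple, and Row being imported. A minimal driver sketch, assuming a local SparkContext/SQLContext setup (the app name is a placeholder):

# Minimal driver sketch (assumption): provides the contexts and imports SPARKreadFile relies on.
import os
from collections import namedtuple

from pyspark import SparkContext
from pyspark.sql import SQLContext, Row

sc = SparkContext("local[*]", "fastq-reader")   # hypothetical app name
sqlContext = SQLContext(sc)                     # module-level name used inside SPARKreadFile

fastq_df = SPARKreadFile(sc)
fastq_df.show(5)                                # one row per FASTQ record: NUMBER, ID, SEQ, OP, QUAL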
Example 2
def HLalignment(a, alignments, tab, Aligner,sc):
    dict = ReadFile.readFile3()
    # counter = 0
    for name, seq, qual in dict.values():
        try:
            hit = next(a.map(seq, MD=True, cs=True))
            # dict = {}
            flag = 0 if hit.strand == 1 else 16
            seq = seq if hit.strand == 1 else seq.translate(tab)[::-1]
            clip = ['' if x == 0 else '{}S'.format(x) for x in (hit.q_st, len(seq) - hit.q_en)]
            if hit.strand == -1:
                clip = clip[::-1]
            cigar = "".join((clip[0], hit.cigar_str, clip[1]))
            alignment = Aligner(contig=hit.ctg, Rname=name, flag=flag, pos=hit.r_st, mapq=hit.mapq, cigar=cigar, seq=seq, is_primary=hit.is_primary, MDtag=hit.MD, cstag=hit.cs, basequal=qual)
            # dict['counter','Qname', 'flag', 'Rname', 'pos', 'mapq', 'cigar', 'seq', 'is_primary'] = name, flag, hit.ctg, hit.r_st, hit.mapq, hit.cigar_str, seq, hit.is_primary
            if hit.mapq >= 10:
                # alignments.append(dict['counter','Qname', 'flag', 'Rname', 'pos', 'mapq', 'cigar','seq', 'is_primary'])
                alignments.append(alignment)
                # counter += 1
        except StopIteration:
            alignment = Aligner(contig='chr0', Rname=name, flag=4, pos=None, mapq=None, cigar=None, seq=seq, is_primary=False, MDtag=None, cstag=None, basequal=qual)
            alignments.append(alignment)
    rdd = sc.parallelize(alignments)
    seqDF = rdd.map(lambda x: Row(contig=x[0], Rname=x[1], flag=x[2], pos=x[3], mapq=x[4], cigar=x[5], seq=x[6], is_primary=x[7], MDtag=x[8], cstag=x[9], basequal=x[10]))
    DF = sqlContext.createDataFrame(seqDF)
    return DF
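HLalignment leaves a, Aligner, and tab to the caller. A plausible setup sketch, assuming a is a mappy (minimap2) aligner, Aligner is a record type with the eleven fields used above, and the reference path is a placeholder:

# Sketch under assumptions: mappy-based aligner plus a namedtuple record type.
from collections import namedtuple
import mappy as mp

a = mp.Aligner("reference.fasta", preset="map-ont")  # hypothetical reference file and preset

Aligner = namedtuple("Aligner", [
    "contig", "Rname", "flag", "pos", "mapq", "cigar",
    "seq", "is_primary", "MDtag", "cstag", "basequal",
])

# Complement table consumed by seq.translate(tab)[::-1] for reverse-strand hits.
tab = str.maketrans("ACGTacgt", "TGCAtgca")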
Example 3
def Sparkseeds(dict, i, k, hashDF, sc):
    word = [(i, HashTable.hash_djb2(dict[i][j:j + k]), j)
            for j in range(0,
                           len(dict[i]) - k)]
    rddW = sc.parallelize(word)
    schemaWordDF = rddW.map(
        lambda x: Row(NUM_SEQ=x[0], ID_SEQ=x[1], POS_SEQ=x[2]))
    df = sqlContext.createDataFrame(schemaWordDF)
    reDF = df.join(hashDF, df.ID_SEQ == hashDF.ID_GEN, how='inner')
    reDF = reDF.orderBy(reDF.POS_SEQ).select(reDF.NUM_SEQ, reDF.ID_SEQ,
                                             reDF.POS_SEQ, reDF.POS_GEN)
    my_window = Window.partitionBy(reDF.NUM_SEQ).orderBy(reDF.POS_SEQ)
    reDF = reDF.withColumn("prev_value", F.lag(reDF.POS_SEQ).over(my_window))
    reDF = reDF.withColumn(
        "dist",
        F.when(F.isnull(reDF.POS_SEQ - reDF.prev_value),
               0).otherwise(reDF.POS_SEQ - reDF.prev_value))
    reDF = reDF.select(reDF.NUM_SEQ, reDF.ID_SEQ, reDF.POS_SEQ, reDF.dist,
                       reDF.POS_GEN)
    reDF = reDF.withColumn("dist0", F.lead(reDF.dist).over(my_window))
    elDF = reDF.filter(((reDF.dist == 0) | (reDF.dist >= 50))
                       & ((reDF.dist0.isNull()) | (reDF.dist0 >= 50)))
    reDF = reDF.subtract(elDF)
    reDF = reDF.orderBy(reDF.POS_SEQ).select(reDF.NUM_SEQ, reDF.ID_SEQ,
                                             reDF.POS_SEQ, reDF.POS_GEN)

    #pos = function(reDF)

    return reDF
Example 4
    def test_read_dataframe_with_path(self):
        paths = ["/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-06-30",
                 "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-07-31"]

        dataframe = self.dataFrame.read_dataframe(paths=paths)
        empty_dataframe = sqlContext.createDataFrame([], StructType([]))

        self.assertNotEqual(dataframe, empty_dataframe)
Example 5
def pandas_to_spark(pandas_df):
    columns = list(pandas_df.columns)
    types = list(pandas_df.dtypes)
    struct_list = []
    for column, typo in zip(columns, types):
        struct_list.append(define_structure(column, typo))
    p_schema = StructType(struct_list)
    return sqlContext.createDataFrame(pandas_df, p_schema)
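pandas_to_spark depends on a define_structure helper that the snippet does not show. A common formulation, assuming it simply maps pandas dtypes to Spark SQL types (the dtype coverage is illustrative, not exhaustive):

# Hypothetical helpers (assumption): the source only shows their call sites.
from pyspark.sql.types import (StructField, StringType, IntegerType, LongType,
                               DoubleType, TimestampType)

def equivalent_type(pandas_type):
    # Map a pandas dtype string to a Spark SQL type; default to string.
    if pandas_type == 'datetime64[ns]':
        return TimestampType()
    if pandas_type == 'int64':
        return LongType()
    if pandas_type == 'int32':
        return IntegerType()
    if pandas_type == 'float64':
        return DoubleType()
    return StringType()

def define_structure(column, pandas_type):
    return StructField(column, equivalent_type(str(pandas_type)), nullable=True)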
Example 6
    def test_read_dataframe_with_path_retrieving_partition_name(self):
        paths = ["/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-06-30",
                 "/data/master/pctk/data/t_pctk_rcc_balance/cutoff_date=2020-07-31"]

        dataframe = self.dataFrame.read_dataframe(paths=paths,
                                                  options={'basePath': self.path})

        empty_dataframe = sqlContext.createDataFrame([], StructType([]))

        self.assertNotEqual(dataframe, empty_dataframe)
        self.assertTrue("cutoff_date" in dataframe.schema.names)
Example 7
    def test_read_dataframes_with_date_range(self):
        dataframe = self.dataFrame.read_dataframes(self.path, process_date=["2020-05-31", "2020-07-31"],
                                                   options={"basePath": self.path})

        empty_dataframe = sqlContext.createDataFrame([], StructType([]))
        dates = dataframe.select("cutoff_date").dropDuplicates().collect()
        expected_dates = [Row(cutoff_date=datetime.date(2020, 7, 31)),
                          Row(cutoff_date=datetime.date(2020, 5, 31)),
                          Row(cutoff_date=datetime.date(2020, 6, 30))]

        self.assertNotEqual(dataframe, empty_dataframe)
        self.assertEqual(dates, expected_dates)
Example 8
 def __init__(self, scoreAndLabels, sc):
     df = sqlContext.createDataFrame(scoreAndLabels,
                                     schema=StructType([
                                         StructField("score",
                                                     DoubleType(),
                                                     nullable=False),
                                         StructField("label",
                                                     DoubleType(),
                                                     nullable=False)
                                     ]))
     java_class = sc._jvm.org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
     java_model = java_class(df._jdf)
     super(BinaryClassificationMetrics, self).__init__(java_model)
Example 9
 def get_spark_df(self, df):
     self.df = df
     meta = self.get_pdf_column_meta(self.df.columns)
     struct_list = []
     for x in meta:
         # tpe = col_attr(meta, str(x))
         tpe = [
             str(meta.get(x).get(self.dtypeHeader)),
             str(meta.get(x).get(self.actualHeader))
         ]
         struct_list.append(self.define_structure(x, tpe[0], tpe[1]))
     p_schema = StructType(struct_list)
     return sqlContext.createDataFrame(self.df, p_schema)
Example 10
def save_data(rdd):
    """
    Parse the JSON value in each RDD,
    create a Spark SQL DataFrame from it,
    and write the DataFrame to HDFS and Oracle DB.
    """
    global flag
    flag = False
    if not rdd.isEmpty():
        rdd = rdd.map(lambda m: parse(m[1]))
        df = sqlContext.createDataFrame(rdd)
        df.createOrReplaceTempView("t")
        result = spark.sql(
            '''select event_id, event_type from (select row_number() over (partition by _1 order by _2) as RN,
			_1 as event_id,_2 as event_type from t)
			   where RN = 1''')

        count = result.count()

        try:
            # Writing to HDFS
            result.write \
                .format("csv") \
                .mode("append") \
                .option("header", "true") \
                .save(HDFS_OUTPUT_PATH)

            # Writing to Oracle DB
            result.write \
                .format("jdbc") \
                .mode("append") \
                .option("driver", DRIVER) \
                .option("url", URL_TARGET_DB) \
                .option("dbtable", TARGET_DB_TABLE_NAME) \
                .option("user", TARGET_DB_USER_NAME) \
                .option("password", TARGET_DB_USER_PASSWORD) \
                .save()

            write_log('INFO', 'Consumer_dim_event_type.py', 'main',
                      '{} rows inserted successfully'.format(count))

        except Exception as e:
            print('--> It seems an Error occurred: {}'.format(e))
            write_log('ERROR', 'Consumer_dim_event_type.py', 'main',
                      str(e)[:1000])
            flag = True
    else:
        ssc.stop()
    return rdd
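save_data references ssc and is meant to run once per micro-batch. A wiring sketch, assuming the pre-structured-streaming Kafka direct-stream API; the broker address, topic name, and batch interval are placeholders:

# Hypothetical wiring (assumption): not part of the original consumer script.
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

ssc = StreamingContext(sc, 10)   # sc: existing SparkContext; 10-second micro-batches
stream = KafkaUtils.createDirectStream(
    ssc, ["events"], {"metadata.broker.list": "localhost:9092"})

stream.foreachRDD(save_data)     # each batch is an RDD of (key, value) pairs

ssc.start()
ssc.awaitTermination()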
Example 11
def best_choice(dict, i, PG, seedArray, genome, sc):
    SC = []
    for z in range(len(PG)):
        for pos_gen in PG[z]:
            seq = (dict[i], genome[pos_gen - seedArray[z]: pos_gen - seedArray[z] + len(dict[i])], seedArray[z], pos_gen)
            SC.append(seq)
    rddSeq = sc.parallelize(SC)
    schemaSeqDF = rddSeq.map(lambda x: Row(SEQ=x[0], GEN=x[1], POS_SEQ=x[2], POS_GEN=x[3]))
    df = sqlContext.createDataFrame(schemaSeqDF)
    df = df.withColumn("dist", F.levenshtein(F.col("SEQ"), F.col("GEN")))
    val = (1 / float(len(dict[i]))) * 100
    df = df.withColumn("percentage", val*F.col( "dist")).drop("dist")
    minDF = df.agg(min(col("percentage")).alias("percentage"))
    min_percentage = [x["percentage"] for x in minDF.rdd.collect()]
    df = df.filter(df.percentage == min_percentage[0])
    return df,min_percentage
Example 12
def write_offset_ranges(rdd):
    """
    Writing value of untilOffset to DB for offsets
    :param untilOffset: Exclusive ending offset.
    """
    if flag != True:
        for o in offsetRanges:
            currentOffset = int(o.untilOffset)
            df_write_offsets = sqlContext.createDataFrame([{"OFFSET": currentOffset}])
            df_write_offsets.write \
                .format("jdbc") \
                .mode("overwrite") \
                .option("driver", DRIVER) \
                .option("url", URL_TARGET_DB) \
                .option("dbtable", OFFSET_TABLE_NAME) \
                .option("user", TARGET_DB_USER_NAME) \
                .option("password", TARGET_DB_USER_PASSWORD) \
                .save()
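write_offset_ranges reads a module-level offsetRanges that this snippet never populates. With a direct Kafka stream it is usually captured in a transform step, along the lines of the official Kafka integration example (assumed here, not shown in the source):

# Sketch (assumption): capture Kafka offset ranges before persisting them.
offsetRanges = []

def store_offset_ranges(rdd):
    global offsetRanges
    offsetRanges = rdd.offsetRanges()   # available on RDDs from a direct Kafka stream
    return rdd

# stream: a direct Kafka DStream such as the one sketched under Example 10.
stream.transform(store_offset_ranges).foreachRDD(write_offset_ranges)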
Example 13
def convert_list_to_df(spark: SparkSession, table: set, schema: StructType,
                       table_name: str) -> (DataFrame, str):
    """Converting tuple of table (list type), schema, table_name to tuple table (DataFrame type), table_name

    :param spark: SparkSession
    :param table: list - Data from the table
    :param schema: StructType - Schema of the table
    :param table_name: str - Table name
    :return: Table in format DataFrame with table name (str type)
    """
    table: list = convert_set_to_list(table)
    print(
        "Converting content of the \"{}\" table from List to DataFrame format..."
        .format(table_name))
    sc: SparkContext = spark.sparkContext
    rdd = sc.parallelize(table)
    df: DataFrame = sqlContext.createDataFrame(rdd, schema)

    return df, table_name
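convert_set_to_list is not shown; given the signature above it can be as small as this (assumed):

# Hypothetical helper (assumption): the source only shows its call site.
def convert_set_to_list(table: set) -> list:
    # Materialise the set of rows into a list so it can be parallelized.
    return list(table)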
Example 14
def save_data(rdd):
    """ Function for saving data in window """
    global NAMES
    if not rdd.isEmpty():
        # parsing data in RDD
        rdd = rdd \
            .map(lambda x: parser.parse(x[1])) \
            .map(lambda data: collect(data)) \
            .reduceByKey(lambda rec1, rec2: max(rec1, rec2, key=last_record))
        NAMES = dict(rdd.collect())
        print(
            "************************************> NAMES <************************************"
        )
        print(NAMES)
        print(
            "************************************> NAMES <************************************"
        )
        rdd = rdd \
            .map(lambda rec: (rec[0], rec[1][0], rec[1][1], rec[1][2]))
        # create DataFrame and View
        df = sqlContext.createDataFrame(rdd)
        df.createOrReplaceTempView("t")
        # query for getting result
        res = spark.sql(
            'select t._1 as NAME, t._2 as COUNT_NAME, t._3 as AVG_TRAFFIC, t._4 as AVG_SUCCESS_SELL from t'
        )
        res.show(40)
        # res.printSchema()
        # res = spark.sql('select count(*) KEY, sum(t._2) VALUE from t')
        res \
            .write \
            .format("jdbc") \
            .mode("overwrite") \
            .option("driver", 'oracle.jdbc.OracleDriver') \
            .option("url", "jdbc:oracle:thin:@{0}:{1}:orcl".format(IP_DB, PORT_DB)) \
            .option("dbtable", "tmp_kafka") \
            .option("user", "kozyar") \
            .option("password", "usertest") \
            .save()
        # spark.catalog.dropTempView("t")
    return rdd
Example 15
 sqlContext.setConf("spark.sql.parquet.binaryAsString", "true")
 df = sqlContext.sql("SELECT * FROM cdrdb.pre_rec_cdr_pqt_vw")
 hotline = sqlContext.read.text('/data/resources/numlist.txt')
 global hotline_list
 hotline_list = hotline.map(lambda x: x.value).collect()
 df1 = df.select('year', 'month').groupBy('year', 'month').count()
 partitionList = df1.select('year', 'month').collect()
 temp = True
 for i, x in enumerate(partitionList):
     print(x)
     if (x.year and x.month):
         temp_df = sqlContext.sql(
             "SELECT * FROM cdrdb.pre_rec_cdr_pqt_vw WHERE year='{}' AND month='{}'"
             .format(str(x.year), str(x.month)))
         df2 = sqlContext.createDataFrame(temp_df.map(parse), [
             'number', 'number2', 'type', 'date', 'week', 'callduration',
             'iscompethot'
         ])
         if (temp):
             temp = False
             print("Overwriting")
             df2.filter(
                 df2.type != 'invalid').write.mode("overwrite").saveAsTable(
                     "cdr_step0",
                     format="parquet",
                     path="/data/intermediate_data/cdr_step0/")
         else:
             print("Appending")
             df2.filter(
                 df2.type != 'invalid').write.mode("append").saveAsTable(
                     "cdr_step0",
                     format="parquet",
# print("Partitions structure: {}".format(rdd_1.glom().collect()), "\n")
#
# # Creation of RDD from another RDD thru transformation map() method (add value 4 to each number)
# rdd_00 = rdd_0.map(lambda x : x+4)
# print(rdd_00.toDebugString())
# print("Number of partitions: {}".format(rdd_00.getNumPartitions()))
# print("Partitions structure: {}".format(rdd_00.glom().collect()), "\n")

# Creation of an RDD from a pandas DF
# Create a pandas DataFrame from a dictionary
data = {"Name":['Roshan', 'Hossam', 'Bala', 'Marcel', 'Deepak'], "Membership Due":[100, 200, 300, 400, 500]}
pandasDf0 = pd.DataFrame(data)
print(" Here is the Pandas DF created from a dictionary \n {}".format(pandasDf0.head()))

# Now convert the pandas DF to a Spark DF with 7 partitions using the sqlContext.createDataFrame(df) method
sparkDf0 = sqlContext.createDataFrame(pandasDf0).repartition(7)
rdd_000 = sparkDf0.rdd.map(list)

print("\n Number of partitions: {}".format(rdd_000.getNumPartitions()))
print("\n Converted RDD rdd_000 (data shown inside partitions): \n {}".format(rdd_000.glom().collect()))
print("\n Converted RDD rdd_000 (data shown as a flat list): \n {}".format(rdd_000.collect()))

# # get the RDD Lineage
# print(rdd_1.toDebugString(), "\n")
#
# # add value 20 each number
# rdd_2 = rdd_1.map(lambda x : x+20)
#
# # RDD Object
# print(rdd_2)
#
Example 17
def save_data(rdd):
    """
    Parse the JSON value in each RDD,
    create a Spark SQL DataFrame from it,
    and write the DataFrame to HDFS and Oracle DB.
    """
    global flag
    flag = False
    if not rdd.isEmpty():
        # Create df for duplicate handling
        write_log('INFO', 'Consumer_fct_prod.py', 'main', 'Executing max_id')
        df_max_id = spark.read \
            .format("jdbc") \
            .option("driver", DRIVER) \
            .option("url", URL_TARGET_DB) \
            .option("dbtable", "(SELECT max(ID) ID from " + TARGET_DB_TABLE_NAME + ")") \
            .option("user", TARGET_DB_USER_NAME) \
            .option("password", TARGET_DB_USER_PASSWORD) \
            .load()

        max_id = df_max_id.agg({'ID': 'max'}).collect()[0][0]
        if max_id is None:
            max_id = 0
        write_log('INFO', 'Consumer_fct_prod.py', 'main',
                  'Max id executed successfully max_id = {}'.format(max_id))

        rdd = rdd.map(lambda m: parse(m[1]))
        df_fct_prod = sqlContext.createDataFrame(rdd)
        df_fct_prod.createOrReplaceTempView("t")
        result = spark.sql(
            '''select id, event_id, event_time, product_id, customer_id
        from (select row_number() over (partition by _1 order by _3) as RN, _1 as id,_2 as event_id,
        to_timestamp(_3) as event_time,_4 as product_id,_5 as customer_id
                    from t where _1 > ''' + str(max_id) + ''')
        where RN = 1''')

        count = result.count()

        try:
            write_log('INFO', 'Consumer_fct_prod.py', 'main',
                      'Consumer is inserting {} rows to DB'.format(count))

            # Writing to HDFS
            result.write \
                .format("csv") \
                .mode("append") \
                .option("header", "true") \
                .save(HDFS_OUTPUT_PATH)

            # Writing to Oracle DB
            result.write \
                .format("jdbc") \
                .mode("append") \
                .option("driver", DRIVER) \
                .option("url", URL_TARGET_DB) \
                .option("dbtable", TARGET_DB_TABLE_NAME) \
                .option("user", TARGET_DB_USER_NAME) \
                .option("password", TARGET_DB_USER_PASSWORD) \
                .save()

            write_log('INFO', 'Consumer_fct_prod.py', 'main',
                      '{} rows inserted to DB successfully'.format(count))

        except Exception as e:
            print('--> It seems an Error occurred: {}'.format(e))
            write_log('ERROR', 'Consumer_fct_prod.py', 'main', str(e)[:1000])
            flag = True
    else:
        ssc.stop()
    return rdd
Example 18
    .options(delimiter=',', header=True, inferSchema=False) \
    .schema(songs2tracks_schema) \
    .load(songs2tracks_file)

metadata_df = sqlContext.read.format('com.databricks.spark.csv') \
    .options(delimiter=',', header=True, inferSchema=False) \
    .schema(metadata_schema) \
    .load(metadata_file)

# change ids from strings to integers
userId_change = plays_df.select('userId').distinct().select(
    'userId',
    F.monotonically_increasing_id().alias('new_userId'))
user_als_id_LUT = sqlContext.createDataFrame(
    userId_change.rdd.map(lambda x: x[0]).zipWithIndex(),
    StructType([
        StructField("userId", StringType(), True),
        StructField("user_als_id", IntegerType(), True)
    ]))

songId_change = plays_df.select('songId').distinct().select(
    'songId',
    F.monotonically_increasing_id().alias('new_songId'))
song_als_id_LUT = sqlContext.createDataFrame(
    songId_change.rdd.map(lambda x: x[0]).zipWithIndex(),
    StructType([
        StructField("songId", StringType(), True),
        StructField("song_als_id", IntegerType(), True)
    ]))

# RUN BELOW TWO LINES TO CHECK IF THE  NEW USER_ID, SONG_ID GENERATED PROPERLY
# user_als_id_LUT.show(5)
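Once the lookup tables exist, the string ids in plays_df are typically swapped for the integer ALS ids with joins. A sketch, assuming plays_df carries userId, songId, and a play-count column named Plays (that column name is a guess):

# Sketch (assumption): map string ids to the integer ids ALS expects.
plays_als_df = plays_df \
    .join(user_als_id_LUT, on='userId', how='inner') \
    .join(song_als_id_LUT, on='songId', how='inner') \
    .select('user_als_id', 'song_als_id', 'Plays')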
Example 19
from pyspark.shell import sqlContext
from pyspark.sql import *
names = ["Alice", "Bob", "Mike"]
items = ["milk", "bread", "butter", "apples", "oranges"]

df = sqlContext.createDataFrame([(names[i % 3], items[i % 5])
                                 for i in range(100)], ["name", "item"])

df.stat.crosstab("name", "item").show()
Example 20
    def test_read_dataframe(self):
        dataframe = self.dataFrame.read_dataframe(self.path)
        empty_dataframe = sqlContext.createDataFrame([], StructType([]))

        self.assertNotEqual(dataframe, empty_dataframe)
Example 21
# binin.close()

# binout = open('hash1.bin','wb' )
# data = pickle.dumps(ht1)
# binout.write(data)
# binout.close()

#300000
binin = open('hash1.bin', 'rb')
ht1 = pickle.load(binin)
binin.close()
#==================================================================

rdd = sc.parallelize(ht1.items())
schemaHashDF = rdd.map(lambda x: Row(ID_GEN=x[0], POS_GEN=x[1]))
hashDF = sqlContext.createDataFrame(schemaHashDF)
#hashDF.show()

print('\033[1m' + 'ALIGNMENT USING SPARK:' + '\033[0m')
startS = datetime.now()
SparkAligner.alignerSpark(dict, genome, hashDF, sc, dict_map)
endS = datetime.now()
print('\033[1m' + 'TIME WITH SPARK: ' + '\033[0m', endS - startS)
print(
    "======================================================================================================================================================"
)

# print('\033[1m' + 'ALIGNMENT WITHOUT SPARK:' + '\033[0m')
# start = datetime.now()
# Aligner.aligner(dict, genome, ht)
# end = datetime.now()
Example 22
# query the website and return the html to the variable ‘page’
page = requests.get(quote_page).text

soup = BeautifulSoup(page, 'html.parser')
# print(soup.prettify())
# Table for 2019-estimate, 2016 land area KM, 2016 population density extracted (rank ascending)
tab = soup.find('div', {'class': 'mw-parser-output'})
tab = tab.find('table', {'class': 'wikitable sortable'})
tab = tab.find('tbody')
tab = tab.findAll('tr')
city_pop_us_tab = []
ans = []
# got completed table needed
for tr in tab:
    city_pop_us_tab = tr.text.strip()
    city_pop_us_tab = city_pop_us_tab.replace(u'\xa0', u' ')
    city_pop_us_tab = city_pop_us_tab.split('\n')
    ans.append(city_pop_us_tab)
    # print(city_pop_us_tab)
    # print('+++++++++++++++++')
ans.remove(ans[0])

rdd = spark.sparkContext.parallelize(ans)
# print(rdd.collect())
dict_dataframe = sqlContext.createDataFrame(rdd,
                                            ['rank', ' ', 'city', '', 'state', '', '2019estimate', '', '2010censor',
                                             '', 'change', '', '2016land1', '', '2016land2', '', '2016pop1', '',
                                             '2016pop2'])

dict_dataframe.show()
Example 23
def ProcessData(pandaData, pipeline):
    sparkData = sqlContext.createDataFrame(pandaData)
    transformedData = pipeline.fit(sparkData).transform(sparkData)
    return transformedData
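ProcessData expects a ready-made pipeline. A minimal sketch of one, assuming the pandas frame has a categorical column named category and a numeric column named amount (both names are placeholders):

# Hypothetical pipeline (assumption): StringIndexer + VectorAssembler stages.
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler

indexer = StringIndexer(inputCol="category", outputCol="category_idx")
assembler = VectorAssembler(inputCols=["category_idx", "amount"], outputCol="features")
pipeline = Pipeline(stages=[indexer, assembler])

# transformedData = ProcessData(pandaData, pipeline)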
Example 24
reduced = rdd.map(lambda row: ((row[2], row[3], row[6], row[7]), [(row[1], row[0])])) \
    .reduceByKey(lambda x, y: x + y) \
    .map(lambda row: (row[0], sorted(row[1], key=lambda text: text[0]))).filter(lambda row: len(row[1]) == 2) \
    .map(lambda row: (row[1][0][1], row[1][1][1],
                      row[0][0], row[0][1], row[0][2], row[0][3]))

schema_red = typ.StructType([
    typ.StructField('Start Date', typ.StringType(), False),
    typ.StructField('End Date', typ.StringType(), False),
    typ.StructField('Private IP', typ.StringType(), False),
    typ.StructField('Private Port', typ.StringType(), False),
    typ.StructField('Destination IP', typ.StringType(), False),
    typ.StructField('Destination Port', typ.StringType(), False)
])

df_red = sqlContext.createDataFrame(reduced, schema_red)
df_red.show()
"""
Output
+-------------------+-------------------+--------------+------------+--------------+----------------+
|         Start Date|           End Date|    Private IP|Private Port|Destination IP|Destination Port|
+-------------------+-------------------+--------------+------------+--------------+----------------+
|22-02-2016 12:40:59|22-02-2016 12:42:04|100.68.154.175|         112| 216.58.197.77|               0|
|22-02-2016 12:41:07|22-02-2016 12:43:16|100.77.186.232|       38867|  100.1.200.99|            8080|
|22-02-2016 12:41:08|22-02-2016 12:43:18|100.68.154.175|       11882| 59.144.144.99|              53|
|22-02-2016 12:41:12|22-02-2016 12:43:21|100.77.186.232|       38875|  100.1.200.99|            8080|
|22-02-2016 12:41:17|22-02-2016 12:43:26|100.76.103.155|       35050| 59.144.144.99|              53|
|22-02-2016 12:41:17|22-02-2016 12:43:26|100.77.186.232|       38880|  100.1.200.99|            8080|
|22-02-2016 12:41:17|22-02-2016 12:43:26|100.77.186.232|       38881|  100.1.200.99|            8080|
|22-02-2016 12:41:18|22-02-2016 12:43:28|100.77.186.232|       38885|  100.1.200.99|            8080|
|22-02-2016 12:41:22|22-02-2016 12:43:31|100.77.186.232|       38889|  100.1.200.99|            8080|
Example 25
    pandas_df = qGET.to_pandas()
    print(pandas_to_spark(pandas_df))

    columns = list(pandas_df.columns)
    print('get columns', columns)

    types = list(pandas_df.dtypes)
    print('get types', types)

    struct_list = []
    for column, typo in zip(columns, types):
        struct_list.append(define_structure(column, typo))
    p_schema = StructType(struct_list)
    print('get p_schema', p_schema)

    spark_DF = sqlContext.createDataFrame(pandas_df, p_schema)
    print('get spark_DF', spark_DF)
    '''
    # Connect to sqlite3 database
    conn = sqlite3.connect("pythonsqlite.db")
    cur = conn.cursor()

    # look dataset in quandl &  transfter to pandas
    df = pandaset().lookpandaset('WIKI/AAPL')

    #  pandas dataFrame store to Sqlite3
    df.to_sql("daily_flights", conn, if_exists="replace")
    pd_daily_flights = pd.read_sql_query("select * from daily_flights limit 10;", conn)
    print('daily_flights', pd_daily_flights)

    getdata = quandl.get('FRED/GDP',  start_date='2010-01-01', end_date='2014-01-01',
Example 26
    # Separating catagorical and numerical columns
    Id_col = ['customerID']
    target_col = ["Churn"]
    cat_cols = pandasData.nunique()[pandasData.nunique() < 6].keys().tolist()
    cat_cols = [x for x in cat_cols if x not in target_col]
    num_cols = [
        x for x in pandasData.columns
        if x not in cat_cols + target_col + Id_col
    ]

    # labels
    lab = pandasData["Churn"].value_counts().keys().tolist()
    # values
    val = pandasData["Churn"].value_counts().values.tolist()
    spark_df = sqlContext.createDataFrame(pandasData)
    spark_df.show()

    def func(pct, allvals):
        absolute = int(pct / 100. * np.sum(allvals))
        return "{:.1f}%".format(pct, absolute)

    def churnPlot():
        fig, ax = plt.subplots(figsize=(6, 3), subplot_kw=dict(aspect="equal"))
        wedges, texts, autotexts = ax.pie(val,
                                          autopct=lambda pct: func(pct, val),
                                          textprops=dict(color="w"))
        ax.legend(wedges,
                  lab,
                  title="Légende",
                  loc="center left",
Example 27
def save_data(rdd):
    """
    Parse the JSON value in each RDD,
    create a Spark SQL DataFrame from it,
    and write the DataFrame to HDFS and Oracle DB.
    """
    global flag
    flag = False
    if not rdd.isEmpty():
        # Create df for duplicate handling
        df_max_id = spark.read \
            .format("jdbc") \
            .option("driver", DRIVER) \
            .option("url", URL_TARGET_DB) \
            .option("dbtable", TARGET_DB_TABLE_NAME) \
            .option("user", TARGET_DB_USER_NAME) \
            .option("password", TARGET_DB_USER_PASSWORD) \
            .load()

        max_id = df_max_id.agg({'product_id': 'max'}).collect()[0][0]
        if max_id is None:
            max_id = 0

        rdd = rdd.map(lambda m: parse(m[1]))
        df = sqlContext.createDataFrame(rdd)
        df.createOrReplaceTempView("t")
        result = spark.sql(
            '''select product_id, category_id, brand, description, name, price, last_update_date
                from (select row_number() over (partition by _1 order by _7) as RN,_1 as product_id,_2 as category_id,
                _3 as brand,_4 as description,_5 as name,_6 as price,to_timestamp(_7) as last_update_date
                 from t where _1 > ''' + str(max_id) + ''')
            where RN = 1''')

        count = result.count()

        try:
            # Writing to HDFS
            result.write \
                .format("csv") \
                .mode("append") \
                .option("header", "true") \
                .save(HDFS_OUTPUT_PATH)

            # Writing to Oracle DB
            result.write \
                .format("jdbc") \
                .mode("append") \
                .option("driver", DRIVER) \
                .option("url", URL_TARGET_DB) \
                .option("dbtable", TARGET_DB_TABLE_NAME) \
                .option("user", TARGET_DB_USER_NAME) \
                .option("password", TARGET_DB_USER_PASSWORD) \
                .save()

            write_log('INFO', 'Consumer_dim_products.py', 'main', '{} rows inserted successfully'.format(count))

        except Exception as e:
            print('--> It seems an Error occurred: {}'.format(e))
            write_log('ERROR', 'Consumer_dim_products.py', 'main', str(e)[:1000])
            flag = True
    else:
        ssc.stop()
    return rdd
Example 28
        'CITY': 'Amsterdam'
    },
    index=[0])
#pd_person = pd.DataFrame({'ADDRESS':'Museumplein','CITY':'Amsterdam','FIRSTNAME':'John','LASTNAME':'Doe','PERSONID':'0'}, index=[0])

#Create PySpark DataFrame Schema
p_schema = StructType([
    StructField('ADDRESS', StringType(), True),
    StructField('CITY', StringType(), True),
    StructField('FIRSTNAME', StringType(), True),
    StructField('LASTNAME', StringType(), True),
    StructField('PERSONID', StringType(), True)
])

#Create Spark DataFrame from Pandas
df_person = sqlContext.createDataFrame(pd_person, p_schema)
#Important to order columns in the same order as the target database
df_persons = df_person.select("PERSONID", "LASTNAME", "FIRSTNAME", "CITY",
                              "ADDRESS")

spark = SparkSession.builder.appName('pandasToSparkDF').getOrCreate()

df_persons.createOrReplaceTempView("DimSalary")
spark.sql("select * from DimSalary").show()
'''
spark = SparkSession.builder.appName('pandasToSparkDF').getOrCreate()
quandl.ApiConfig.api_key = 'P6LZzSkdVN6zTXQDE6Pd'

qGET = quandl.Dataset('NSE/OIL').data()
df = qGET.to_pandas()
df_pd = pd.DataFrame(df)
Example 29
def pretty_print_pagerank(graphframes, google):
    """ Prints a pretty chart with Google, Graphframes, and the Deltas """
    # Divide by total to match Google
    print("+-------+---------------------+")
    print("|Google\t|GraphFrames\t|Delta|")
    print("+-------+---------------------+")
    for key in google:
        goog = google[key]
        g_frames = graphframes[key]
        print("|{}\t|{:.3f}\t\t|{:.3f}|".format(goog, g_frames,
                                                abs(goog - g_frames)))
    print("+-------+---------------------+")


""" ## Create some edges and vertices to match Fig 2.1 in the paper """
vertices = sqlContext.createDataFrame([(1, ), (2, ), (3, ), (4, )], ["id"])

edges = sqlContext.createDataFrame([(1, 2), (1, 3), (1, 4), (2, 3), (2, 4),
                                    (3, 1), (4, 1), (4, 3)], ["src", "dst"])

graph = GraphFrame(vertices, edges)
""" ## Show Vertices """
display_graph(graph.vertices)
doc.show()
""" ## Show Edges """
display_graph(graph.edges)
doc.show()
""" ## Show Degrees (Sum of in and out degrees by node) """
display_graph(graph.degrees)
doc.show()
""" Show all motifs which satisfy a->b->c """
Example 30
    def test_read_dataframes(self):
        dataframe = self.dataFrame.read_dataframes(self.path, partition_number=1)
        empty_dataframe = sqlContext.createDataFrame([], StructType([]))

        self.assertNotEqual(dataframe, empty_dataframe)