Example #1
    def test_slice(self):
        from pyspark.sql.functions import slice, lit

        df = self.spark.createDataFrame([([1, 2, 3], ), ([4, 5], )], ['x'])

        self.assertEqual(
            df.select(slice(df.x, 2, 2).alias("sliced")).collect(),
            df.select(slice(df.x, lit(2), lit(2)).alias("sliced")).collect(),
        )
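Both calls above are equivalent: slice accepts either plain Python integers or literal Columns for the start and length arguments. A minimal standalone sketch of the same call and its result:

from pyspark.sql import SparkSession
from pyspark.sql.functions import slice

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([([1, 2, 3],), ([4, 5],)], ['x'])

# slice uses 1-based positions: start at element 2 and take up to 2 elements.
print(df.select(slice(df.x, 2, 2).alias("sliced")).collect())
# [Row(sliced=[2, 3]), Row(sliced=[5])]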
Example #2
def benchmark_extract_top_keywords(posts, n_keywords=10):
    """Given TF-IDF output (as "features" column) extracts out the vocabulary index of the
    10 keywords with highest TF-IDF (for each post)."""
    def extract_keys_from_vector(vector):
        return vector.indices.tolist()

    def extract_values_from_vector(vector):
        return vector.values.tolist()

    extract_keys_from_vector_udf = udf(
        lambda vector: extract_keys_from_vector(vector),
        ArrayType(IntegerType()))
    extract_values_from_vector_udf = udf(
        lambda vector: extract_values_from_vector(vector),
        ArrayType(DoubleType()))

    posts = posts.withColumn("extracted_keys",
                             extract_keys_from_vector_udf("features"))
    posts = posts.withColumn("extracted_values",
                             extract_values_from_vector_udf("features"))

    posts = posts.withColumn(
        "zipped_truncated",
        slice(
            sort_array(arrays_zip("extracted_values", "extracted_keys"),
                       asc=False), 1, n_keywords))

    take_second = udf(lambda rows: [row[1] for row in rows],
                      ArrayType(IntegerType()))
    posts = posts.withColumn("top_indices", take_second("zipped_truncated"))

    return posts
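A hypothetical call sequence for the function above. The toy posts and the Tokenizer/HashingTF/IDF pipeline are assumptions used only for illustration; the original snippet only requires a SparseVector "features" column and assumes the usual pyspark.sql.functions and types imports are in scope at module level.

from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

spark = SparkSession.builder.getOrCreate()
# Hypothetical toy posts standing in for the real input table.
posts_raw = spark.createDataFrame(
    [(1, "spark makes distributed data processing simple"),
     (2, "keyword extraction with tf idf in spark")],
    ["Id", "body"])

tokenized = Tokenizer(inputCol="body", outputCol="words").transform(posts_raw)
tf = HashingTF(inputCol="words", outputCol="tf").transform(tokenized)
tfidf = IDF(inputCol="tf", outputCol="features").fit(tf).transform(tf)

# "features" is now a SparseVector column, which the UDFs above unpack.
benchmark_extract_top_keywords(tfidf, n_keywords=5) \
    .select("top_indices").show(truncate=False)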
Example #3
    def add_year(self, df):
        df2 = (
            df.withColumn('file_name',
                          slice(split(input_file_name(), '/'), -1, 1)[0])
              .withColumn('flight_year',
                          col('file_name').substr(1, 4).cast(IntegerType()))
        )
        return df2
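In isolation, the path-splitting step above keeps the last '/'-separated segment of the input file name and reads its leading four characters as the year. A small sketch with a made-up path (the real method derives the path from input_file_name()):

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, slice, split
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("hdfs://data/flights/2015-summary.csv",)], ["path"])

df2 = (
    df.withColumn("file_name", slice(split(col("path"), "/"), -1, 1)[0])
      .withColumn("flight_year", col("file_name").substr(1, 4).cast(IntegerType()))
)
df2.show(truncate=False)
# file_name = "2015-summary.csv", flight_year = 2015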
Example #4
    def test_slice(self):
        from pyspark.sql.functions import lit, size, slice

        df = self.spark.createDataFrame([([1, 2, 3], ), ([4, 5], )], ['x'])

        self.assertEqual(
            df.select(slice(df.x, 2, 2).alias("sliced")).collect(),
            df.select(slice(df.x, lit(2), lit(2)).alias("sliced")).collect(),
        )

        self.assertEqual(
            df.select(slice(df.x,
                            size(df.x) - 1, lit(1)).alias("sliced")).collect(),
            [Row(sliced=[2]), Row(sliced=[4])])
        self.assertEqual(
            df.select(slice(df.x, lit(1),
                            size(df.x) - 1).alias("sliced")).collect(),
            [Row(sliced=[1, 2]), Row(sliced=[4])])
Example #5
    def test_slice(self):
        df = self.spark.createDataFrame(
            [
                (
                    [1, 2, 3],
                    2,
                    2,
                ),
                (
                    [4, 5],
                    2,
                    2,
                ),
            ],
            ["x", "index", "len"],
        )

        expected = [Row(sliced=[2, 3]), Row(sliced=[5])]
        self.assertTrue(
            all([
                df.select(slice(df.x, 2,
                                2).alias("sliced")).collect() == expected,
                df.select(slice(df.x, lit(2),
                                lit(2)).alias("sliced")).collect() == expected,
                df.select(slice("x", "index",
                                "len").alias("sliced")).collect() == expected,
            ]))

        self.assertEqual(
            df.select(slice(df.x,
                            size(df.x) - 1, lit(1)).alias("sliced")).collect(),
            [Row(sliced=[2]), Row(sliced=[4])],
        )
        self.assertEqual(
            df.select(slice(df.x, lit(1),
                            size(df.x) - 1).alias("sliced")).collect(),
            [Row(sliced=[1, 2]), Row(sliced=[4])],
        )
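Because "index" and "len" are ordinary columns in the last variant above, the slice bounds can differ from row to row. A quick toy illustration, not part of the original test, assuming the same spark session and imports:

from pyspark.sql.functions import slice

df2 = spark.createDataFrame(
    [([1, 2, 3, 4], 1, 2), ([5, 6, 7], 3, 1)], ["x", "index", "len"])
df2.select(slice("x", "index", "len").alias("sliced")).collect()
# [Row(sliced=[1, 2]), Row(sliced=[7])]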
Example #6
def extract_top_keywords(posts, vocabulary, n_keywords=10):
    """Given word count (Count Vectorizer) output (as "features" column) -
    extracts out the vocabulary index of the 10 keywords with highest TF-IDF (for each post)."""
    def extract_keys_from_vector(vector):
        return vector.indices.tolist()

    def extract_values_from_vector(vector):
        return vector.values.tolist()

    extract_keys_from_vector_udf = udf(
        lambda vector: extract_keys_from_vector(vector),
        ArrayType(IntegerType()))
    extract_values_from_vector_udf = udf(
        lambda vector: extract_values_from_vector(vector),
        ArrayType(DoubleType()))

    idf_udf = array_transform(idf_wiki)
    vocab_dict = {k: v for k, v in enumerate(vocabulary)}

    def ix_to_word(ix):
        return vocab_dict[ix]

    vocab_udf = array_transform(ix_to_word)

    posts = posts.withColumn("word_ix",
                             extract_keys_from_vector_udf("features"))
    posts = posts.withColumn("word_count",
                             extract_values_from_vector_udf("features"))
    posts = posts.withColumn('words', vocab_udf(col('word_ix')))
    posts = posts.withColumn("idf", idf_udf(col("words")))
    posts = posts.withColumn(
        "zipped_truncated",
        slice(sort_array(arrays_zip("idf", "words"), asc=False), 1,
              n_keywords))

    take_second = udf(lambda rows: [row[1] for row in rows],
                      ArrayType(StringType()))
    posts = posts.withColumn("top_keywords", take_second("zipped_truncated"))

    return posts['CreationDate', 'top_keywords', 'Tags', 'ParentId']
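The snippet above depends on an array_transform helper and an idf_wiki mapping defined elsewhere; neither is a PySpark built-in. A plausible sketch of such a helper, assuming it simply wraps an element-wise Python callable as an array-returning UDF (the element type handling is a guess; the original may do this differently):

from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, DoubleType, StringType

def array_transform(fn, element_type=StringType()):
    # Apply fn to every element of an array column, preserving nulls.
    return udf(lambda arr: [fn(x) for x in arr] if arr is not None else None,
               ArrayType(element_type))

# e.g. if idf_wiki maps a word to its IDF weight:
# idf_udf = array_transform(idf_wiki, element_type=DoubleType())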
Example #7
def generateCooccurrenceMatrices(data, df_all):
    # to dataframe and create top 100 column
    df_temp = data.toDF(['Topic', 'Tuples per Topic'])
    df_temp = df_temp.withColumn(
        'Top 100', F.slice('Tuples per Topic', start=1, length=100))
    count_tuples = df_temp.select(
        ['Topic',
         'Top 100']).rdd.map(lambda r: (r[0], [w[0] for w in r[1]])).collect()

    # lists of top100 words per topic
    economy_top100_words = count_tuples[0][1]
    microsoft_top100_words = count_tuples[1][1]
    palestine_top100_words = count_tuples[2][1]
    obama_top100_words = count_tuples[3][1]

    def mapOcc(sentence, top100_words):
        data_combs = []
        for word in top100_words:
            h = {}
            if word in sentence:
                for neigh in neighbors(sentence, top100_words, word):
                    comb = (word, neigh)
                    h[comb] = h.get(comb, 0) + 1
                h[(word, word)] = 0
                data_combs.extend(list(h.items()))
        return data_combs

    def neighbors(sentence, top100_words, word):
        neighs = list(set(sentence) & set(top100_words))
        neighs.remove(word)
        return neighs

    def generateCoocurrenceMatrix(col, topic, top100_words):
        # list of all titles per topic
        col_topic = df_all.select(col + '_sentence').where(
            F.col('Topic').isin({topic})).rdd.flatMap(lambda r: r).collect()
        df_col_topic = spark.createDataFrame(col_topic, "string").toDF(col)
        df_col_topic = df_col_topic.withColumn(col, F.split(F.col(col), ' '))

        # calculate co-occurrence stripes per title
        economy_coocurrence = df_col_topic.rdd. \
            flatMap(lambda r: mapOcc(r[0], top100_words)). \
            reduceByKey(lambda x, y: x + y). \
            map(lambda x: (x[0][0], [(x[0][1], int(x[1]))])). \
            reduceByKey(lambda x, y: x + y).sortByKey().collect()

        # construct matrix from coocurrence stripes
        as_matrix = defaultdict(dict)
        for entry in economy_coocurrence:
            topword = entry[0]
            for cooc in sorted(entry[1]):
                as_matrix[topword][cooc[0]] = cooc[1]

        # use pandas to create a dataframe from matrix
        pd_df_matrix = pd.DataFrame(as_matrix)
        pd_df_matrix.insert(0, 'words', sorted(top100_words), True)
        df_matrix = spark.createDataFrame(pd_df_matrix)
        df_matrix = df_matrix.na.fill(0)

        # write matrix to file
        df_matrix \
            .repartition(1) \
            .write \
            .mode("overwrite") \
            .csv(dir_path + "results/" + "cooc_matrix_" + col + '_' + topic, header=True)

    generateCoocurrenceMatrix('Title', 'economy', economy_top100_words)
    generateCoocurrenceMatrix('Title', 'microsoft', microsoft_top100_words)
    generateCoocurrenceMatrix('Title', 'palestine', palestine_top100_words)
    generateCoocurrenceMatrix('Title', 'obama', obama_top100_words)
    generateCoocurrenceMatrix('Headline', 'economy', economy_top100_words)
    generateCoocurrenceMatrix('Headline', 'microsoft', microsoft_top100_words)
    generateCoocurrenceMatrix('Headline', 'palestine', palestine_top100_words)
    generateCoocurrenceMatrix('Headline', 'obama', obama_top100_words)
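The 'Top 100' step above uses slice to keep only the first 100 (word, count) tuples per topic; when a topic has fewer tuples, slice simply returns the whole array. A toy, self-contained illustration of that step with made-up data:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [("economy", [("gdp", 12), ("inflation", 9), ("rates", 7)])],
    ["Topic", "Tuples per Topic"])
toy.withColumn("Top 100",
               F.slice("Tuples per Topic", start=1, length=100)) \
   .show(truncate=False)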
Example #8
conf = pyspark.SparkConf().set("spark.cores.max", "4")
sc = pyspark.SparkContext(master=SPARK_MASTER, conf=conf)
spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

print("PySpark initiated...")


lines = spark \
        .readStream \
        .format("text") \
        .load(path="streaming_src/")

lines.printSchema()

words = lines \
    .filter(lines['value'].contains('- -')) \
    .withColumn("split", slice(split(lines['value'], " "), -2,1).getItem(0))

wordCounts = words.groupBy('split').count()

# Start running the query that prints the running counts to the console
query = wordCounts \
    .writeStream \
    .outputMode('complete') \
    .format('console') \
    .start()

query.awaitTermination()
query.stop()
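For reference, slice(split(value, " "), -2, 1).getItem(0) in the streaming query above picks the second-to-last whitespace-separated token of each log line. The same expression in batch mode, on a made-up log line and reusing the spark session created above:

from pyspark.sql.functions import slice, split

sample = spark.createDataFrame(
    [("127.0.0.1 - - [01/Jan/2024] GET /index.html 200 1043",)], ["value"])
sample.select(
    slice(split(sample["value"], " "), -2, 1).getItem(0).alias("split")).show()
# -> "200", i.e. the second-to-last token of the line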