# -*- coding: utf-8 -*-
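"""Rank tweets from a generic Twitter stream by their Word2Vec-based
column similarity to a set of target tweets (SOCIALBASEBR_DATA_FILE),
using Spark: load tweets, remove stop words, fit Word2Vec, compute
column (cosine) similarities, and print the filter-data tweets closest
to the target data."""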
import json
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.feature import StopWordsRemover, Word2Vec
from pyspark.mllib.linalg.distributed import (CoordinateMatrix,
                                              IndexedRow, IndexedRowMatrix)

SOCIALBASEBR_DATA_FILE = 'data/socialbasebr_tweets.json'
FILTER_DATA_FILE = 'data/tweets.json'
TARGET_DATA_SIZE = 100
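# Both data files are expected to be line-delimited JSON from the
# Twitter streaming API (hence the 'delete' and 'lang' keys that
# load_data() handles below).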


def unique_id():
    '''Yield 0, 1, 2, ... -- unique row ids for the DataFrame.'''
    i = 0
    while True:
        yield i
        i += 1


id_gen = unique_id()
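
# The generator above is equivalent to itertools.count(); a drop-in
# stdlib alternative, if preferred:
#
#   import itertools
#   id_gen = itertools.count()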


def load_data(filename, size=5, language_filter=None):
    '''Load and return up to `size` tweet texts from a line-delimited
    JSON file, optionally keeping only the given languages.'''
    data = []
    with open(filename) as f:
        for line in f:
            try:
                structure = json.loads(line)
            except ValueError:
                continue
            # Skip deleted tweets, entries without a text payload, and
            # (when a language filter is given) tweets in other languages
            if ('delete' in structure or 'text' not in structure or
                    (language_filter and
                     structure.get('lang') not in language_filter)):
                continue
            data.append(structure['text'])
            if len(data) == size:
                break
    return data


def get_all_data():
    '''Return target tweets followed by the generic "filter" tweets.'''
    target_data = load_data(SOCIALBASEBR_DATA_FILE, TARGET_DATA_SIZE)
    filter_data = load_data(FILTER_DATA_FILE, 200,
                            language_filter=['pt', 'en'])
    return target_data + filter_data
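
# Target tweets occupy positions 0..TARGET_DATA_SIZE-1 of the combined
# list; the similarity filtering in main() relies on this ordering.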


def column_similarities(df):
    '''Compute cosine similarities between the columns of the matrix
    whose rows are the (id, vector) pairs of df.'''
    mat = IndexedRowMatrix(df.select("id", "result").rdd.map(
        lambda row: IndexedRow(*row)))
    # PySpark exposes no columnSimilarities() wrapper for this matrix
    # type, so call the underlying Scala method directly
    java_coordinate_matrix = mat._java_matrix_wrapper.call(
        "columnSimilarities")
    return CoordinateMatrix(java_coordinate_matrix)
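
# Note: this targets the Spark 1.x APIs the script was written against.
# On Spark 2+, pyspark.ml's Word2Vec emits pyspark.ml.linalg vectors,
# which would need converting (e.g. with
# pyspark.mllib.util.MLUtils.convertVectorColumnsFromML) before they
# can feed the pyspark.mllib.linalg.distributed matrices used here.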


def fit_and_transform(sql_context, data):
    '''End-to-end helper without stop-word removal (unused by main()).'''
    documentdf = sql_context.createDataFrame(data, ["id", "text"])
    # Learn a mapping from words to vectors
    word2vec = Word2Vec(vectorSize=len(data), minCount=0,
                        inputCol="text",
                        outputCol="result")
    model = word2vec.fit(documentdf)
    matrix = column_similarities(model.transform(documentdf))
    return matrix


def main(sc):
    sql_context = SQLContext(sc)
    all_data = get_all_data()
    # Input data: each row is a bag of words from a tweet
    training_data = [(next(id_gen), text.split(" ")) for text in all_data]
    documentdf = sql_context.createDataFrame(training_data, ["id", "text"])
    remover = StopWordsRemover(inputCol="text", outputCol="text_filtered")
    cleaned_document = remover.transform(documentdf)
    # Learn a mapping from words to vectors; the vector size equals the
    # number of documents so that the similarity matrix is square and
    # its column indices can be read back as document indices
    word2vec = Word2Vec(vectorSize=len(training_data),
                        inputCol="text_filtered",
                        outputCol="result")
    model = word2vec.fit(cleaned_document)
    matrix = column_similarities(model.transform(cleaned_document))
    # Keep only entries pairing a target column (i < TARGET_DATA_SIZE)
    # with a filter-data column (j >= TARGET_DATA_SIZE), avoiding
    # products of target data with itself
    values = matrix.entries.filter(
        lambda x: x.j >= TARGET_DATA_SIZE and x.i < TARGET_DATA_SIZE).sortBy(
        keyfunc=lambda x: x.value, ascending=False).map(
        lambda x: x.j).distinct().take(100)
    training_data_index = dict(training_data)
    for position, item in enumerate(values):
        line = " ".join(training_data_index[int(item)])
        print('%d -> %s' % (position, line.encode('utf-8')))


if __name__ == '__main__':
    main(SparkContext("local", "Desafio SocialBase"))
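
# Running it (a sketch, assuming Spark 1.x with spark-submit on the PATH
# and both JSON files present under data/):
#
#   spark-submit main.py
#
# Master and app name are set in code ("local", "Desafio SocialBase"),
# so no extra flags are needed for a local run.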