import csv
import shlex
import subprocess
import time

from pyspark.ml.feature import CountVectorizer, StopWordsRemover
from pyspark.mllib.clustering import LDA
from pyspark.sql import SQLContext


def compute(sc, topLeft, bottomRight, step, datasetPath, k, gfs):
    sqlContext = SQLContext(sc)

    # Read the CSV dataset and drop the header row.
    data = sc.textFile(datasetPath)
    data = data.mapPartitions(lambda x: csv.reader(x))
    header = data.first()
    data = data.filter(lambda x: x != header)

    result_to_write = []
    res_computation = []

    step = check_step(topLeft, bottomRight, step)
    squares = get_squares(topLeft, bottomRight, step)

    # start computing elapsed time here
    start_time = time.time()

    # Keep only the records that fall inside the selected area, tagged with
    # the id of the square they belong to.
    data = data.map(lambda x: is_inside(x, topLeft, bottomRight, step, squares)) \
        .filter(lambda x: x is not None)

    # Tokenize the text and group the resulting rows by square id.
    data = data.map(remove_punctuation) \
        .map(split_string_into_array) \
        .filter(remove_empty_array) \
        .map(create_row) \
        .groupByKey() \
        .map(lambda x: (x[0], list(x[1])))

    # Create one DataFrame per square.
    allDf = []
    for df in data.collect():
        if df:
            allDf.append([df[0], sqlContext.createDataFrame(df[1])])

    for docDFs in allDf:
        docDF = docDFs[1]
        squareId = docDFs[0]  # currently unused; only the DataFrame is needed below

        # Remove English, Italian and German stop words in sequence. Note that
        # StopWordsRemover.loadDefaultStopWords() returns the list for the
        # given language rather than mutating global state, so the result must
        # be passed explicitly via the stopWords parameter.
        newDocDF_eng = StopWordsRemover(
            inputCol="words", outputCol="filtered_eng",
            stopWords=StopWordsRemover.loadDefaultStopWords('english')) \
            .transform(docDF)
        newDocDF_eng = newDocDF_eng.drop('words')

        newDocDF_ita = StopWordsRemover(
            inputCol="filtered_eng", outputCol="filtered_ita",
            stopWords=StopWordsRemover.loadDefaultStopWords('italian')) \
            .transform(newDocDF_eng)
        newDocDF_ita = newDocDF_ita.drop('filtered_eng')

        newDocDF_ger = StopWordsRemover(
            inputCol="filtered_ita", outputCol="filtered_ger",
            stopWords=StopWordsRemover.loadDefaultStopWords('german')) \
            .transform(newDocDF_ita)
        newDocDF_ger = newDocDF_ger.drop('filtered_ita')

        # Turn the filtered tokens into term-frequency vectors.
        model = CountVectorizer(inputCol="filtered_ger", outputCol="vectors") \
            .fit(newDocDF_ger)
        result = model.transform(newDocDF_ger)
        corpus = result.select("idd", "vectors").rdd.map(create_corpus).cache()

        # cluster the documents into the k topics using LDA
        ldaModel = LDA.train(corpus, k=k, maxIterations=100, optimizer='online')
        vocabArray = model.vocabulary
        wordNumbers = 10  # number of words per topic
        topicIndices = sc.parallelize(
            ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))
        toBePrinted = min(len(vocabArray), wordNumbers)
        topics_final = topicIndices.map(
            lambda x: topic_render(x, toBePrinted, vocabArray)).collect()

        # Compute the labels: pick, for each topic, the first term that has
        # not already been used as the label of another topic.
        topics_label = []
        for topic in topics_final:
            for topic_term in topic:
                if topic_term not in topics_label:
                    topics_label.append(topic_term)
                    break

        # Format the output line: square corners followed by the topic labels.
        s = "; "
        res = "{}, {}, {}, {}, {}".format(topLeft.x, topLeft.y, bottomRight.x,
                                          bottomRight.y, s.join(topics_label))
        result_to_write.append(res)
        res_computation.append(topics_label)

    end_time = time.time()
    elapsed_time = end_time - start_time
    result_to_write.append(elapsed_time)
    to_write = sc.parallelize(result_to_write)

    # Get the dataset size from the file name (expects '<name>_<size>.<ext>').
    size = datasetPath.split('.')[0].split('_')[1]
    if gfs:
        output_folder = "/tmp/Topic_Zoomer_" + str(
            time.ctime(start_time)).replace(' ', '_').replace(':', '-') + '_' + size
    else:
        output_folder = "Topic_Zoomer_" + str(time.ctime(start_time)).replace(
            ' ', '_').replace(':', '-') + '_' + size
    to_write.saveAsTextFile(output_folder)

    if gfs:
        # Copy the results from HDFS to the local disk and then to the bucket.
        # gfs_output_path_hdfs and recFileFolder are expected to be defined at
        # module level.
        copyHdfsCmd = 'hdfs dfs -copyToLocal {} {}'.format(output_folder,
                                                           output_folder)
        copyBucketCmd = 'gsutil cp -r {} {}'.format(output_folder,
                                                    gfs_output_path_hdfs)
        copyRecBucketCmd = 'gsutil cp -r {} {}'.format(recFileFolder,
                                                       gfs_output_path_hdfs)
        copyHdfsRes = subprocess.call(shlex.split(copyHdfsCmd))
        copyBucketRes = subprocess.call(shlex.split(copyBucketCmd))
        copyRecBucketRes = subprocess.call(shlex.split(copyRecBucketCmd))
        # Exit code checks: subprocess.call() returns non-zero on failure.
        if copyBucketRes or copyHdfsRes or copyRecBucketRes:
            print('hdfsRes: {}'.format(copyHdfsRes))
            print('bucketResComp: {}'.format(copyBucketRes))
            print('bucketResRec: {}'.format(copyRecBucketRes))
            print('Something went wrong while copying results')

    return res_computation
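

# Example driver: a minimal sketch of how compute() might be invoked, not part
# of the original module. The Point shape and the dataset file name below are
# assumptions inferred from how compute() uses its arguments: topLeft and
# bottomRight need .x/.y attributes, and the dataset name must contain
# '_<size>' before the extension. The helper functions compute() relies on
# (check_step, get_squares, is_inside, topic_render, ...) must be defined
# elsewhere in the module.
if __name__ == '__main__':
    from collections import namedtuple
    from pyspark import SparkContext

    Point = namedtuple('Point', ['x', 'y'])

    sc = SparkContext(appName='TopicZoomer')
    topics = compute(sc,
                     topLeft=Point(0.0, 10.0),
                     bottomRight=Point(10.0, 0.0),
                     step=1.0,
                     datasetPath='tweets_10k.csv',  # hypothetical file name
                     k=5,
                     gfs=False)
    # One list of topic labels is returned per square of the grid.
    for square_topics in topics:
        print(square_topics)
    sc.stop()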