import csv
import shlex
import subprocess
import time

from pyspark.ml.feature import CountVectorizer, StopWordsRemover
from pyspark.mllib.clustering import LDA
from pyspark.sql import SQLContext


def compute(sc, topLeft, bottomRight, step, datasetPath, k, gfs):
    sqlContext = SQLContext(sc)

    # Read the CSV dataset and drop the header row.
    data = sc.textFile(datasetPath)
    data = data.mapPartitions(lambda x: csv.reader(x))
    header = data.first()
    data = data.filter(lambda x: x != header)

    result_to_write = []
    res_computation = []

    step = check_step(topLeft, bottomRight, step)
    squares = get_squares(topLeft, bottomRight, step)

    # start computing elapsed time here
    start_time = time.time()

    # Keep only the records that fall inside the selected area, tagged with
    # the id of the square they belong to.
    data = data.map(lambda x: is_inside(x, topLeft, bottomRight, step, squares)) \
        .filter(lambda x: x is not None)

    # Tokenize the text and group the resulting rows by square id.
    data = data.map(remove_punctuation) \
        .map(split_string_into_array) \
        .filter(remove_empty_array) \
        .map(create_row) \
        .groupByKey() \
        .map(lambda x: (x[0], list(x[1])))

    # Create one DataFrame per square.
    allDf = []
    for df in data.collect():
        if df:
            allDf.append([df[0], sqlContext.createDataFrame(df[1])])

    for docDFs in allDf:
        docDF = docDFs[1]
        squareId = docDFs[0]  # currently unused; only the DataFrame is needed below

        # Remove English, Italian and German stop words in sequence. Note that
        # StopWordsRemover.loadDefaultStopWords() returns the list for the
        # given language rather than mutating global state, so the result must
        # be passed explicitly via the stopWords parameter.
        newDocDF_eng = StopWordsRemover(
            inputCol="words", outputCol="filtered_eng",
            stopWords=StopWordsRemover.loadDefaultStopWords('english')) \
            .transform(docDF)
        newDocDF_eng = newDocDF_eng.drop('words')

        newDocDF_ita = StopWordsRemover(
            inputCol="filtered_eng", outputCol="filtered_ita",
            stopWords=StopWordsRemover.loadDefaultStopWords('italian')) \
            .transform(newDocDF_eng)
        newDocDF_ita = newDocDF_ita.drop('filtered_eng')

        newDocDF_ger = StopWordsRemover(
            inputCol="filtered_ita", outputCol="filtered_ger",
            stopWords=StopWordsRemover.loadDefaultStopWords('german')) \
            .transform(newDocDF_ita)
        newDocDF_ger = newDocDF_ger.drop('filtered_ita')

        # Turn the filtered tokens into term-frequency vectors.
        model = CountVectorizer(inputCol="filtered_ger", outputCol="vectors") \
            .fit(newDocDF_ger)
        result = model.transform(newDocDF_ger)
        corpus = result.select("idd", "vectors").rdd.map(create_corpus).cache()

        # cluster the documents into the k topics using LDA
        ldaModel = LDA.train(corpus, k=k, maxIterations=100, optimizer='online')
        vocabArray = model.vocabulary
        wordNumbers = 10  # number of words per topic
        topicIndices = sc.parallelize(
            ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))
        toBePrinted = min(len(vocabArray), wordNumbers)
        topics_final = topicIndices.map(
            lambda x: topic_render(x, toBePrinted, vocabArray)).collect()

        # Compute the labels: pick, for each topic, the first term that has
        # not already been used as the label of another topic.
        topics_label = []
        for topic in topics_final:
            for topic_term in topic:
                if topic_term not in topics_label:
                    topics_label.append(topic_term)
                    break

        # Format the output line: square corners followed by the topic labels.
        s = "; "
        res = "{}, {}, {}, {}, {}".format(topLeft.x, topLeft.y, bottomRight.x,
                                          bottomRight.y, s.join(topics_label))
        result_to_write.append(res)
        res_computation.append(topics_label)

    end_time = time.time()
    elapsed_time = end_time - start_time
    result_to_write.append(elapsed_time)
    to_write = sc.parallelize(result_to_write)

    # Get the dataset size from the file name (expects '<name>_<size>.<ext>').
    size = datasetPath.split('.')[0].split('_')[1]
    if gfs:
        output_folder = "/tmp/Topic_Zoomer_" + str(
            time.ctime(start_time)).replace(' ', '_').replace(':', '-') + '_' + size
    else:
        output_folder = "Topic_Zoomer_" + str(time.ctime(start_time)).replace(
            ' ', '_').replace(':', '-') + '_' + size
    to_write.saveAsTextFile(output_folder)

    if gfs:
        # Copy the results from HDFS to the local disk and then to the bucket.
        # gfs_output_path_hdfs and recFileFolder are expected to be defined at
        # module level.
        copyHdfsCmd = 'hdfs dfs -copyToLocal {} {}'.format(output_folder,
                                                           output_folder)
        copyBucketCmd = 'gsutil cp -r {} {}'.format(output_folder,
                                                    gfs_output_path_hdfs)
        copyRecBucketCmd = 'gsutil cp -r {} {}'.format(recFileFolder,
                                                       gfs_output_path_hdfs)
        copyHdfsRes = subprocess.call(shlex.split(copyHdfsCmd))
        copyBucketRes = subprocess.call(shlex.split(copyBucketCmd))
        copyRecBucketRes = subprocess.call(shlex.split(copyRecBucketCmd))
        # Exit code checks: subprocess.call() returns non-zero on failure.
        if copyBucketRes or copyHdfsRes or copyRecBucketRes:
            print('hdfsRes: {}'.format(copyHdfsRes))
            print('bucketResComp: {}'.format(copyBucketRes))
            print('bucketResRec: {}'.format(copyRecBucketRes))
            print('Something went wrong while copying results')

    return res_computation
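

# Example driver: a minimal sketch of how compute() might be invoked, not part
# of the original module. The Point shape and the dataset file name below are
# assumptions inferred from how compute() uses its arguments: topLeft and
# bottomRight need .x/.y attributes, and the dataset name must contain
# '_<size>' before the extension. The helper functions compute() relies on
# (check_step, get_squares, is_inside, topic_render, ...) must be defined
# elsewhere in the module.
if __name__ == '__main__':
    from collections import namedtuple
    from pyspark import SparkContext

    Point = namedtuple('Point', ['x', 'y'])

    sc = SparkContext(appName='TopicZoomer')
    topics = compute(sc,
                     topLeft=Point(0.0, 10.0),
                     bottomRight=Point(10.0, 0.0),
                     step=1.0,
                     datasetPath='tweets_10k.csv',  # hypothetical file name
                     k=5,
                     gfs=False)
    # One list of topic labels is returned per square of the grid.
    for square_topics in topics:
        print(square_topics)
    sc.stop()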