Code example #1
    def countVectorizer(infoData):
        colName = infoData.get(pc.COLMTOENCODE)
        dataset = infoData.get(pc.DATASET)
        encodedColm = infoData.get(pc.ENCODEDCOLM)
        originalColmName = infoData.get(pc.ORIGINALCOLMNAME)
        oneHotEncoderPathMapping = infoData.get(pc.ONEHOTENCODERPATHMAPPING)
        storageLocation = infoData.get(pc.STORAGELOCATION)
        countVectorizer = CountVectorizer(inputCol=colName,
                                          outputCol=encodedColm).fit(dataset)
        '''oneHotEncoderPath = storageLocation + modelId.upper() + PredictiveConstants.ONEHOTENCODED.upper() + PredictiveConstants.PARQUETEXTENSION
        oneHotEncoder.write().overwrite().save(oneHotEncoderPath)
        oneHotEncoderPathMapping.update({
            PredictiveConstants.ONEHOTENCODED: oneHotEncoderPath
        })'''

        oneHotEncoderPath = storageLocation + pc.ONEHOTENCODED_.upper(
        ) + originalColmName.upper() + pc.PARQUETEXTENSION
        countVectorizer.write().overwrite().save(oneHotEncoderPath)
        oneHotEncoderPathMapping.update({originalColmName: oneHotEncoderPath})

        dataset = countVectorizer.transform(dataset)
        infoData.update({
            pc.ONEHOTENCODERPATHMAPPING: oneHotEncoderPathMapping,
            pc.DATASET: dataset
        })
        return infoData
    def oneHotEncodeData(self, sentimentInfoData):
        colName = sentimentInfoData.get(pc.COLMTOENCODE)
        dataset = sentimentInfoData.get(pc.DATASET)
        vectorizedFeaturescolmName = "features"  # temp fix for testing only
        dataset = dataset.drop(vectorizedFeaturescolmName)  # drop() returns a new DataFrame
        oneHotEncodedColName = pc.ONEHOTENCODED_ + colName
        countVectorizer = CountVectorizer(
            inputCol=pc.DMXSTOPWORDS,
            outputCol=oneHotEncodedColName).fit(dataset)
        '''oneHotEncoderPath = storageLocation + modelId.upper() + PredictiveConstants.ONEHOTENCODED.upper() + PredictiveConstants.PARQUETEXTENSION
        oneHotEncoder.write().overwrite().save(oneHotEncoderPath)
        oneHotEncoderPathMapping.update({
            PredictiveConstants.ONEHOTENCODED: oneHotEncoderPath
        })'''

        dataset = countVectorizer.transform(dataset)
        # need to store the path of count vectorizer to use at the time of performing sentiment analysis.
        '''create feature colm from encoded colm'''
        featureassembler = VectorAssembler(
            inputCols=[oneHotEncodedColName],
            outputCol=vectorizedFeaturescolmName,
            handleInvalid="skip")
        dataset = featureassembler.transform(dataset)
        sentimentInfoData.update({
            pc.FEATURECOLUMN: vectorizedFeaturescolmName,
            pc.DATASET: dataset
        })
        return sentimentInfoData
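The comment in oneHotEncodeData about storing the fitted CountVectorizer points at the usual save/load round trip for Spark ML models. A minimal, self-contained sketch of that round trip (the column name, data, and path below are illustrative, not taken from the project above):

from pyspark.sql import SparkSession
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel

spark = SparkSession.builder.getOrCreate()
dataset = spark.createDataFrame(
    [(["good", "movie"],), (["bad", "movie", "bad"],)], ["tokens"])

# fit and persist the vectorizer so it can be reused at prediction time
cvModel = CountVectorizer(inputCol="tokens", outputCol="encoded").fit(dataset)
cvModel.write().overwrite().save("/tmp/models/count_vectorizer_demo")

# later, e.g. when scoring new text for sentiment, reload the fitted model
reloaded = CountVectorizerModel.load("/tmp/models/count_vectorizer_demo")
reloaded.transform(dataset).show(truncate=False)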
Code example #3
File: Word2VecTrainer.py Project: Mihandr/Komp-Lingv
    def train(self):

        self.__prepare()

        spark = SparkSession\
            .builder\
            .appName("Kursach")\
            .getOrCreate()

        input_file = spark.sparkContext.textFile('./w2v.txt')

        # print(input_file.collect())
        prepared = input_file.map(lambda x: ([x]))
        df = prepared.toDF()
        prepared_df = df.selectExpr('_1 as text')

        tokenizer = Tokenizer(inputCol='text', outputCol='words')
        words = tokenizer.transform(prepared_df)

        stop_words = StopWordsRemover.loadDefaultStopWords('russian')
        remover = StopWordsRemover(inputCol='words',
                                   outputCol='filtered',
                                   stopWords=stop_words)
        filtered = remover.transform(words)

        # print(stop_words)

        # filtered.show()

        # words.select('words').show(truncate=False, vertical=True)

        # filtered.select('filtered').show(truncate=False, vertical=True)

        vectorizer = CountVectorizer(inputCol='filtered',
                                     outputCol='raw_features').fit(filtered)
        featurized_data = vectorizer.transform(filtered)
        featurized_data.cache()
        vocabulary = vectorizer.vocabulary

        # featurized_data.show()

        # featurized_data.select('raw_features').show(truncate=False, vertical=True)

        # print(vocabulary)

        idf = IDF(inputCol='raw_features', outputCol='features')
        idf_model = idf.fit(featurized_data)
        rescaled_data = idf_model.transform(featurized_data)

        self.__word2Vec = Word2Vec(vectorSize=3,
                                   minCount=0,
                                   inputCol='words',
                                   outputCol='result')
        self.__model = self.__word2Vec.fit(filtered)
        w2v_df = self.__model.transform(words)
        w2v_df.show()
        spark.stop()
Code example #4
def LDAThis(sc, RDD, minFreq, numTopics, maxIter, wordsPerTopic):
    '''
    Arguments:
        sc: A SparkContext object
        RDD: An RDD with rows as tokenized sentences
        minFreq: Minimum document frequency for CountVectorizer
        numTopics: Number of topics
        maxIter: Max number of iterations for LDA training
        wordsPerTopic: Number of words to show per topic
    '''
    StopWords = stopwords.words("english")
    sqlContext = SQLContext(sc)
    # Structure Data
    idRDD = RDD.map(
        lambda words: [x for x in words if x.isalpha() and x not in StopWords
                       ]).filter(lambda x: len(x) > 2).zipWithIndex()
    idDF = sqlContext.createDataFrame(idRDD, ["tokens", 'index'])
    # Term Frequency
    CVecModel = CountVectorizer(inputCol="tokens",
                                outputCol="rawFeatures",
                                vocabSize=5000,
                                minDF=minFreq).fit(idDF)
    resultCVec = CVecModel.transform(idDF)
    vocabArray = CVecModel.vocabulary
    #IDF
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(resultCVec)
    resultTFIDF = idfModel.transform(resultCVec)
    # LDA
    resultLDA = LDA.train(resultTFIDF.select(
        'index', 'features').rdd.mapValues(Vectors.fromML).map(list),
                          k=numTopics,
                          maxIterations=maxIter)
    topicIndices = sc.parallelize(
        resultLDA.describeTopics(maxTermsPerTopic=wordsPerTopic))
    topicsFinal = topicIndices.map(lambda topic: render_topics(
        topic, wordsPerTopic, vocabArray)).collect()

    # Show Topics
    for topic in range(len(topicsFinal)):
        print("Topic" + str(topic) + ":")
        for term in topicsFinal[topic]:
            print(term)
        print('\n')
    return resultLDA
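render_topics is not shown in this excerpt; a helper consistent with the way it is called above (hypothetical, for illustration only) could map each topic's term indices back into the CountVectorizer vocabulary:

def render_topics(topic, wordsPerTopic, vocabArray):
    # topic is a (termIndices, termWeights) pair from LDAModel.describeTopics()
    termIndices, termWeights = topic
    return ["{} ({:.4f})".format(vocabArray[termIndices[i]], termWeights[i])
            for i in range(wordsPerTopic)]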
Code example #5
def GetFeatures(data):
    '''
	# TF-IDF Score
	hashingTF = HashingTF(inputCol="Filtered", outputCol="rawFeatures", numFeatures=3000)
	idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=2.0)

	pipeline = Pipeline(stages=[hashingTF, idf])

	dataset = pipeline.fit(data).transform(data)
	dataset.show(5)
	'''
    # Term Frequency
    # minDF: Specifies the minimum number of different documents a term must appear in
    # 		 to be included in the vocabulary
    model = CountVectorizer(inputCol="Filtered",
                            outputCol="features",
                            minDF=0.03).fit(data)
    df = model.transform(data)

    print("========= Finish Getting Features for Training =========")

    return df, model.vocabulary
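As the comment above notes, minDF controls vocabulary pruning: in Spark's CountVectorizer a minDF value in [0, 1) is read as a fraction of the documents, while a value >= 1 is read as an absolute document count. A short illustration (column names reused from the function above for convenience):

from pyspark.ml.feature import CountVectorizer

# keep terms that appear in at least 3% of the documents
cv_fraction = CountVectorizer(inputCol="Filtered", outputCol="features", minDF=0.03)

# keep terms that appear in at least 2 distinct documents
cv_absolute = CountVectorizer(inputCol="Filtered", outputCol="features", minDF=2.0)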
Code example #6
def getCountVector(final_df):
    '''
    This function accepts as input a dataframe with a column named 'data' containing
    each document as a row. The column is converted to a count vector and the output
    column is named 'indexedFeatures'. It returns the dataframe with the additional
    'indexedFeatures' column and the fitted count-vectorizer model.

    Arg1 : dataframe to compute the count vector on
    '''
    #getting the countvector
    print('************* inside the count vector ****************')
    print('************* inside the count vector ****************')
    print('************* inside the count vector ****************')

    cv = CountVectorizer(inputCol="data",
                         outputCol="indexedFeatures").fit(final_df)
    countVector_df = cv.transform(final_df)

    print('************* returning the count vector ****************')
    print('************* returning the count vector ****************')
    print('************* returning the count vector ****************')

    return countVector_df, cv
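A hypothetical call site for the helper above, assuming final_df already has an array-of-strings column named 'data':

countVector_df, cv_model = getCountVector(final_df)
print(cv_model.vocabulary[:10])  # vocabulary is ordered by corpus term frequency, most frequent first
countVector_df.select("indexedFeatures").show(5, truncate=False)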
Code example #7
File: module8_cs2.py Project: voklymchuk/spark
def main():

    set_pandas_options()
    app_name = "Case Study 2: Email Analytics"

    conf = SparkConf().setAppName(app_name)
    conf = (conf.setMaster('local[*]').set(
        "spark.driver.host",
        "localhost").set('spark.executor.memory',
                         '4G').set('spark.driver.memory',
                                   '8G').set('spark.driver.maxResultSize',
                                             '10G'))
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)

    log4jLogger = sc._jvm.org.apache.log4j
    LOGGER = log4jLogger.LogManager.getLogger(__name__)
    LOGGER.info("pyspark script logger initialized")

    # 1 Load data into Spark DataFrame
    LOG = get_hdfs_filepath('*/*/*')

    # read text file
    log_txt_df = sc.wholeTextFiles(LOG).filter(lambda line: line != '').toDF()
    # Convert strings to columns
    udf1 = udf(to_utc_timestamp, TimestampType())
    df = log_txt_df
    df = df.select(df._2.alias('line'))
    temp = df.select(
        regexp_extract(col('line'), r'Message-ID:\s<.*>',
                       0).alias('Message_ID'),
        regexp_extract(
            col('line'),
            r'\d{1,2}\s\w{3}\s\d{4}\s\d{2}:\d{2}:\d{2}\s(\+|\-)\d{4}(.*)',
            0).alias("Date"),
        regexp_extract(col('line'), r'From:\s(.*)', 0).alias("From"),
        regexp_extract(
            col('line'),
            r"To:\s(.+)((?:\n|\r\n?)((?:(?:\n|\r\n?).+)+)){0,}(\S+@\S+)(?:\n|\r\n?)Subject:\s",
            0).alias("To"),
        regexp_extract(
            col('line'),
            r"Subject:\s(.+)((?:\n|\r\n?)((?:(?:\n|\r\n?).+)+)){0,}",
            1).alias("Subject"),
        regexp_extract(
            col('line'),
            r"Cc:\s(.+)((?:\n|\r\n?)((?:(?:\n|\r\n?).+)+)){0,}(?:\n|\r\n?)Mime-Version:\s",
            0).alias("Cc"),
        regexp_extract(col('line'), r'Mime-Version:\s(.+)',
                       1).alias("Mime_Version"),
        regexp_extract(col('line'), r'Content-Type:\s(.*)',
                       1).alias("Content_Type"),
        regexp_extract(col('line'), r"Content-Transfer-Encoding:\s(.+)",
                       1).alias("Content_Transfer_Encoding"),
        regexp_extract(col('line'), r"X-From:\s(.*)(?:\n|\r\n?)X-To:\s",
                       0).alias("X_From"),
        regexp_extract(col('line'), r'X-To:\s(.*)(?:\n|\r\n?)X-cc:\s',
                       0).alias("X_To"),
        regexp_extract(col('line'), r'X-cc:\s(.*)(?:\n|\r\n?)X-bcc:\s',
                       0).alias("X_cc"),
        regexp_extract(col('line'), r'X-bcc:\s(.*)(?:\n|\r\n?)X-Folder:\s',
                       0).alias("X_bcc"),
        regexp_extract(col('line'), r'X-Folder:\s(.*)(?:\n|\r\n?)X-Origin:\s',
                       0).alias("X_Folder"),
        regexp_extract(col('line'),
                       r"X-Origin:\s(.*)(?:\n|\r\n?)X-FileName:\s",
                       0).alias("X_Origin"),
        regexp_extract(col('line'), r"X-FileName:\s(.*)",
                       0).alias("X_FileName"),
        regexp_extract(
            col('line'),
            r"X-FileName:\s(.*)((?:\n|\r\n?){1,}(.*)){1,}((?:(?:\n|\r\n?).+)+)",
            0).alias("FYI"))
    #temp.cache()
    temp1 = temp.select(
        expr("substring(Message_ID, 14, length(Message_ID)-14)").alias(
            "Message_ID"), 'Date',
        udf1('Date').alias('UTC_timestamp'),
        expr("substring(From, 7, length(From)-6)").alias("From"),
        expr("substring(To, 5, length(To)-15)").alias("To"), "Subject",
        expr("substring(Cc, 5, length(Cc)-20)").alias("Cc"), "Mime_Version",
        "Content_Type", 'Content_Transfer_Encoding',
        expr("substring(X_From, 9, length(X_From)-16)").alias("X_From"),
        expr("substring(X_To, 7, length(X_To)-14)").alias("X_To"),
        expr("substring(X_cc, 7, length(X_cc)-15)").alias("X_cc"),
        expr("substring(X_bcc, 8, length(X_bcc)-19)").alias("X_bcc"),
        expr("substring(X_Folder, 11, length(X_Folder)-22)").alias("X_Folder"),
        expr("substring(X_Origin, 11, length(X_Origin)-24)").alias("X_Origin"),
        expr("substring(X_FileName, 13, length(X_FileName)-15)").alias(
            "X_FileName"),
        regexp_replace(
            col('FYI'),
            r"(X-FileName:\s(.*)(?:\n|\r\n?){1,})|(-*Original Message-*(.*)((?:\n|\r\n?){1,}(.*)){0,}((?:(?:\n|\r\n?).+)+))",
            '').alias('FYI'))
    #temp1.cache()
    result = temp1.select(
        "Message_ID", 'Date', 'UTC_timestamp', "From",
        regexp_replace(col('To'), r"\r\n\t", "").alias("To"), "Subject",
        regexp_replace(col('Cc'), r"\r\n\t", "").alias("Cc"), "Mime_Version",
        "Content_Type", 'Content_Transfer_Encoding', "X_From", "X_To", "X_cc",
        "X_bcc", "X_Folder", "X_Origin", "X_FileName",
        regexp_replace(col('FYI'), r"(^\s{1,})|(\n{2,})", '').alias('FYI'))
    zz = result.limit(5).toPandas()
    LOGGER.info(
        "\n\n1.\tLoad data into Spark DataFrame\tDone!\n\n{}\n".format(zz))

    # 2 Display the top 10 high-frequency users based on weekly numbers of emails sent
    df1 = result
    freq = df1.groupBy('From').agg(
        (count('UTC_timestamp') /
         ((max(unix_timestamp(col('UTC_timestamp'))) -
           min(unix_timestamp(col('UTC_timestamp')))) /
          604800)).alias('rate_per_week')).orderBy("rate_per_week",
                                                   ascending=False)
    zz = freq.limit(10).toPandas()
    LOGGER.info(
        "\n\n2.\tDisplay the top 10 high-frequency users based on weekly numbers of emails sent\tDone!\n\n{}\n"
        .format(zz))

    # 3a Extract top 20 keywords from the subject text for the top 10 high-frequency users
    top = freq.limit(10)
    top_subj = df1.join(top, df1["From"] == top["From"],
                        "inner").select(df1['From'], df1['Subject'])
    top_texts = top_subj.groupBy("From").agg(
        concat_ws(" ", collect_list("Subject")).alias("texts"))
    top_texts = top_texts.select('texts').agg(
        concat_ws(" ", collect_list("texts")).alias("subjects"))
    # Extract word
    from pyspark.ml.feature import Tokenizer
    tokenizer = Tokenizer().setInputCol("subjects").setOutputCol("words")
    transformed = tokenizer.transform(top_texts)
    # Remove stopwords, extending the default stop-word dictionary with custom
    # entries such as "-", "re:", and "fw"
    stopwords = StopWordsRemover().getStopWords() + ["-", "re:", "", "fw"]
    remover = StopWordsRemover().setStopWords(stopwords).setInputCol(
        "words").setOutputCol("filtered")
    cleaned = remover.transform(transformed)
    # Extract top 20 keywords by identifying and removing the common stop words
    # Generate features
    from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
    cvmodel = CountVectorizer().setInputCol("filtered").setOutputCol(
        "features").fit(cleaned)
    featured = cvmodel.transform(cleaned)
    counts = featured.select('features').collect()
    a = cvmodel.vocabulary
    b = counts[0]['features'].values
    d = {'words': a, 'counts': b}
    df = pd.DataFrame(d)
    zz = df.head(20)
    LOGGER.info(
        "\n\n3a.\tExtract top 20 keywords from the subject text for the top 10 high-frequency users\tDone!\n\n{}\n"
        .format(zz))
    # 3b Extract top 20 keywords from the subject text for the non-high frequency users
    w = Window().orderBy(lit('A'))
    bottom = freq.orderBy("rate_per_week",
                          ascending=False).withColumn("row_num",
                                                      row_number().over(w))
    bottom = bottom.where(col('row_num') > 10).select('From', 'rate_per_week')
    bottom_subj = df1.join(bottom, df1["From"] == bottom["From"],
                           "inner").select(df1["From"], df1["Subject"])
    bottom_texts = bottom_subj.groupBy("From").agg(
        concat_ws(" ", collect_list("Subject")).alias("texts"))
    bottom_texts = bottom_texts.select('texts').agg(
        concat_ws(" ", collect_list("texts")).alias("subjects"))
    # Extract word
    tokenizer = Tokenizer().setInputCol("subjects").setOutputCol("words")
    transformed = tokenizer.transform(bottom_texts)
    # Remove stopwords
    # custom stopwords
    stopwords = StopWordsRemover().getStopWords() + [
        "-", "re:", "fw:", "", "&"
    ]
    remover = StopWordsRemover().setStopWords(stopwords).setInputCol(
        "words").setOutputCol("filtered")
    cleaned = remover.transform(transformed)
    # Generate features
    cvmodel = CountVectorizer().setInputCol("filtered").setOutputCol(
        "features").fit(cleaned)
    featured = cvmodel.transform(cleaned)
    counts = featured.select('features').collect()
    a = cvmodel.vocabulary
    b = counts[0]['features'].values
    d = {'words': a, 'counts': b}
    df = pd.DataFrame(d)
    zz = df.head(20)
    LOGGER.info(
        "\n\n3b.\tExtract top 20 keywords from the subject text for the non-high frequency users\tDone!\n\n{}\n"
        .format(zz))

    # 6 Introduce a new column label to identify new, replied, and forwarded messages
    df = result

    def to_label(sbj):
        l1 = "RE" if sbj.startswith("RE:") else (
            "FW" if sbj.startswith("FW:") else 'NEW')
        return l1

    udf2 = udf(to_label, StringType())
    df_with_label = df.withColumn('label', udf2("Subject"))
    zz = df_with_label.limit(5).toPandas()
    LOGGER.info(
        "\n\n6.\tIntroduce a new column label to identify new, replied, and forwarded messages\tDone!\n\n{}\n"
        .format(zz))

    # 7 Get the trend of the overall mail activity using the pivot table from spark itself
    pivotDF = df_with_label.groupBy(
        year("UTC_timestamp").alias('year'),
        month("UTC_timestamp").alias('month')).pivot("label").count().orderBy(
            "year", "month")
    zz = pivotDF.na.fill(0).toPandas()
    LOGGER.info(
        "\n\n7.\tGet the trend of the over mail activity using the pivot table from spark itself\tDone!\n\n{}\n"
        .format(zz))

    # 8 Use k-means clustering to create 4 clusters from the extracted keywords
    raw = result.select("Message_ID", "From", "Subject")
    # Extract word
    from pyspark.ml.feature import Tokenizer
    tokenizer = Tokenizer().setInputCol("Subject").setOutputCol("words")
    transformed = tokenizer.transform(raw)
    # Remove stopwords
    # custom stopwords
    stopwords = StopWordsRemover().getStopWords() + [
        "-", "re:", "fw:", "", "&"
    ]
    remover = StopWordsRemover().setStopWords(stopwords).setInputCol(
        "words").setOutputCol("filtered")
    cleaned = remover.transform(transformed)
    cleaned = cleaned.select("Message_ID", "words", "filtered")
    # Generate features
    from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
    cvmodel = CountVectorizer().setInputCol("filtered").setOutputCol(
        "features").fit(cleaned)
    featured = cvmodel.transform(cleaned)
    kmeans = KMeans(k=4, seed=1)  # 4 clusters here
    model = kmeans.fit(featured.select('features'))
    transformed = model.transform(featured)
    zz = transformed.limit(5).toPandas()
    LOGGER.info(
        "\n\n8.\tUse k-means clustering to create 4 clusters from the extracted keywords\tDone!\n\n{}\n"
        .format(zz))

    # 9 Use LDA to generate 4 topics from the extracted keywords
    LOGGER.info(
        "\n\n9.\tUse LDA to generate 4 topics from the extracted keywords\tDone!\n\n{}\n{}\n{}\n{}\n"
        .format(get_topic(0, transformed), get_topic(1, transformed),
                get_topic(2, transformed), get_topic(3, transformed)))
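get_topic is defined elsewhere in this project and is not shown in the excerpt. A hedged sketch of the step it implies, namely fitting pyspark.ml's LDA on the CountVectorizer output and mapping term indices back to the vocabulary (variable names reuse those from the code above):

from pyspark.ml.clustering import LDA

lda = LDA(k=4, seed=1, maxIter=10)
lda_model = lda.fit(featured.select('features'))
for row in lda_model.describeTopics(5).collect():
    terms = [cvmodel.vocabulary[i] for i in row['termIndices']]
    print("Topic {}: {}".format(row['topic'], terms))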
Code example #8
def main():
    conf = SparkConf().setAppName("Program Number 1")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("ERROR")

    # creates Spark Session
    spark = SparkSession.builder.appName("Program Number 1").getOrCreate()

    # tweets folder address on HDFS server -  ignore files with .tmp extensions (Flume active files).
    inputpath = "hdfs://hdfs input path"

    spark.conf.set("spark.sql.shuffle.partitions", 1)

    # get the raw tweets from HDFS
    raw_tweets = spark.read.format("json").option(
        "inferScehma", "true").option("mode", "dropMalformed").load(inputpath)

    # get the tweet text from the raw data: lower-case the text, drop duplicates (re-tweets), and add an index for each tweet
    tweets = raw_tweets.select(
        functions.lower(functions.col("text"))).withColumnRenamed(
            "lower(text)", "text").distinct().withColumn(
                "id", functions.monotonically_increasing_id())

    # Create a tokenizer that filters away tokens with length < 4 and strips symbols like $, #, ...
    tokenizer = RegexTokenizer().setPattern("[\\W_]+").setMinTokenLength(
        4).setInputCol("text").setOutputCol("tokens")

    # Tokenize tweets
    tokenized_tweets = tokenizer.transform(tweets)
    remover = StopWordsRemover().setInputCol("tokens").setOutputCol("cleaned")

    # remove stopwords
    cleaned_tweets = remover.transform(tokenized_tweets)

    # create a vector of words that appeared in at least two different tweets, and set the maximum vocab size to 20000.
    vectorizer = CountVectorizer().setInputCol("cleaned").setOutputCol(
        "features").setVocabSize(20000).setMinDF(2).fit(cleaned_tweets)
    wordVectors = vectorizer.transform(cleaned_tweets).select("id", "features")

    # LDA
    # create Latent Dirichlet Allocation model and run it on our data with 25 iteration and 5 topics
    lda = LDA(k=5, maxIter=25)
    # fit the model on data
    ldaModel = lda.fit(wordVectors)
    # create topics based on LDA
    lda_topics = ldaModel.describeTopics()
    # show LDA topics
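    # illustrative addition (not in the original excerpt): one way to actually
    # display the topics promised by the comment above
    # lda_topics.show(truncate=False)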

    # ______________________________________________________________________________________________________________
    # LSA
    clean_tweets_list = []
    tweet_list = []
    # for creating the document term matrix for the LSIModel as input
    # this is needed as LSI needs tuples of (vocabulary_index, frequency) form
    for tweet_row in wordVectors.select('features').collect():
        tweet_list.clear()
        # reading the SparseVector of 'features' column (hence the 0 index) and zipping them to a list
        # idx = vocabulary_index, val=frequency of that word in that tweet
        for idx, val in zip(tweet_row[0].indices, tweet_row[0].values):
            # converting the frequency from float to integer
            tweet_list.append((idx, int(val)))
        clean_tweets_list.append(tweet_list[:])

    # calling the LSIModel and passing the number of topics as 5
    lsa_model = LsiModel(clean_tweets_list, num_topics=5)
    # show LSA topics

    # ______________________________________________________________________________________________________________
    # #Comparison

    # get the indices and weights of words from the LDA topics as List[list[]]
    lda_wordIndices = [row['termIndices'] for row in lda_topics.collect()]
    lda_wordWeights = [row['termWeights'] for row in lda_topics.collect()]

    # get the word weights of the LSA topics as a numpy array with 5*wordCount shape.
    # each element is the weight of the corresponding word in that specific topic.
    lsa_weightsMatrix = lsa_model.get_topics()

    # function to calculate the similarity between an lsa topic and an lda topic.
    def topic_similarity_calculator(lsa_t, lda_t):
        (lda_index, lda_weight) = lda_t
        sum = 0
        for index, weight in zip(lda_index, lda_weight):
            sum = sum + (np.abs(lsa_t[index] * weight))
        return sum

    # run the similarity function on 25 possibilities (5 LSA * 5 LDA)
    similarity = []
    eachLSA = []
    for i in range(0, 5):
        eachLSA.clear()
        for j in range(0, 5):
            temp = topic_similarity_calculator(
                lsa_weightsMatrix[i], (lda_wordIndices[j], lda_wordWeights[j]))
            eachLSA.append(temp)
        similarity.append(eachLSA[:])

    # Print the similarity table
    # each row is a LDA topic and each column is an LSA topic.
    print(" ")
    print("Similarity table")

    def similarity_print(s):
        i = 1
        print("|--------------------------------------------------------|")
        print("|      |  LSA 1  |  LSA 2  |  LSA 3  |  LSA 4  |  LSA 5  |")
        print("|--------------------------------------------------------|")
        for one, two, three, four, five in zip(*s):
            print(
                '|LDA {} | {:+1.4f} | {:+1.4f} | {:+1.4f} | {:+1.4f} | {:+1.4f} |'
                .format(i, one, two, three, four, five))
            print("|--------------------------------------------------------|")
            i = i + 1
    # print the similarity matrix
    similarity_print(similarity)

    # ______________________________________________________________________________________________________________
    # Final result Table
    # Manually found the following Topics to be similar
    # (LSA1 - LDA1)
    # (LSA5 - LDA2)
    # rest are alone
    lsa_words_idx = []
    for idx, curr_topic in enumerate(lsa_weightsMatrix):
        lsa_words_idx.append(np.abs(curr_topic).argsort()[-10:][::-1])
    lsa_topics_bow = {}
    lda_topics_bow = {}
    lsa_bow_list = []
    lda_bow_list = []
    for curr_idx, (lda_topic,
                   lsa_topic) in enumerate(zip(lda_wordIndices,
                                               lsa_words_idx)):
        lsa_bow_list.clear()
        lda_bow_list.clear()
        for idx in range(10):
            lsa_bow_list.append(vectorizer.vocabulary[lsa_topic[idx]])
            lda_bow_list.append(vectorizer.vocabulary[lda_topic[idx]])
        lsa_topics_bow[curr_idx] = lsa_bow_list[:]
        lda_topics_bow[curr_idx] = lda_bow_list[:]

    results = []
    names = []
    # Creating word dictionary for LDA2 and LSA5
    lda2_lsa5 = lda_topics_bow[1][:]
    for word in (lsa_topics_bow[4]):
        if word not in lda2_lsa5:
            lda2_lsa5.append(word)

    # Creating word dictionary for LDA1 and LSA1
    lda1_lsa1 = lda_topics_bow[0][:]
    for word in (lsa_topics_bow[0]):
        if word not in lda1_lsa1:
            lda1_lsa1.append(word)
    results.append(lda1_lsa1)
    names.append("LDA1 - LSA1 ")
    results.append(lda2_lsa5)
    names.append("LDA2 - LSA5 ")
    results.append(lda_topics_bow[2])
    names.append("LDA3        ")
    results.append(lda_topics_bow[3])
    names.append("LDA4        ")
    results.append(lda_topics_bow[4])
    names.append("LDA5        ")
    results.append(lsa_topics_bow[1])
    names.append("LSA2        ")
    results.append(lsa_topics_bow[2])
    names.append("LSA3        ")
    results.append(lsa_topics_bow[3])
    names.append("LSA4        ")
    #printing the topics and related words
    print(" ")
    print("Topics Table")
    print(
        "|------------------------------------------------------------------------------------------|"
    )
    print(
        "|    Topic     |  Significant Words                                                    |"
    )
    print(
        "|------------------------------------------------------------------------------------------|"
    )
    for name, r in zip(names, results):
        print('| {} |  {} |'.format(name, r))
        print(
            "|------------------------------------------------------------------------------------------|"
        )

    print(" ")
    print(" ")
Code example #9
File: main.py Project: Sapfir0/6semestr
print(input_data.take(4))
prepared_data = input_data.map(lambda x: (get_patent_name(x[1]), get_claims(x[1]))) \
    .map(lambda x: (x[0], remove_punctuation(x[1]))) \
    .map(lambda x: (x[0], remove_linebreaks(x[1])))

prepared_df = prepared_data.toDF().selectExpr('_1 as patent_name',
                                              '_2 as patent_claims')
# Split the claims into tokens
tokenizer = Tokenizer(inputCol="patent_claims", outputCol="words")
words_data = tokenizer.transform(prepared_df)
# Filter the tokens, keeping only words
filtered_words_data = words_data.rdd.map(lambda x:
                                         (x[0], x[1], get_only_words(x[2])))

filtered_df = filtered_words_data.toDF().selectExpr('_1 as patent_name',
                                                    '_2 as patent_claims',
                                                    '_3 as words')
# Remove stop words (conjunctions, prepositions, pronouns, etc.)
remover = StopWordsRemover(inputCol='words', outputCol='filtered')
filtered = remover.transform(filtered_df)
vectorizer = CountVectorizer(inputCol='filtered',
                             outputCol='raw_features').fit(filtered)
featurized_data = vectorizer.transform(filtered)

featurized_data.cache()
idf = IDF(inputCol='raw_features', outputCol='features')
idf_model = idf.fit(featurized_data)
rescaled_data = idf_model.transform(featurized_data)
# Display the rescaled_data table
rescaled_data.show()
spark.stop()
Code example #10
File: analyzer.py Project: GOVnKOD/KL_KP
def makeWord2VecModel():
    cursor = News.find({})
    text = ""
    for news in cursor:
        text += news['text']
    with open(os.path.join(os.getcwd(), 'word2Vec.txt'), 'w',
              encoding='utf-8') as inputFile:
        inputFile.writelines(text)
    spark = SparkSession.builder.appName("SimpleApplication").getOrCreate()

    # Load the file into an RDD line by line
    input_file = spark.sparkContext.textFile('word2Vec.txt')

    print(input_file.collect())
    prepared = input_file.map(lambda x: ([x]))
    df = prepared.toDF()
    prepared_df = df.selectExpr('_1 as text')

    # Split into tokens
    tokenizer = Tokenizer(inputCol='text', outputCol='words')
    words = tokenizer.transform(prepared_df)

    # Remove stop words
    stop_words = StopWordsRemover.loadDefaultStopWords('russian')
    remover = StopWordsRemover(inputCol='words',
                               outputCol='filtered',
                               stopWords=stop_words)
    filtered = remover.transform(words)

    # Print the stop words for Russian
    print(stop_words)

    # Display the filtered table
    filtered.show()

    # Display the 'words' column of the words table (tokens before stop-word removal)
    words.select('words').show(truncate=False, vertical=True)

    # Вывести столбец "filtered" таблицы filtered с токенами после удаления стоп-слов
    filtered.select('filtered').show(truncate=False, vertical=True)

    # Compute the TF values
    vectorizer = CountVectorizer(inputCol='filtered',
                                 outputCol='raw_features').fit(filtered)
    featurized_data = vectorizer.transform(filtered)
    featurized_data.cache()
    vocabulary = vectorizer.vocabulary

    # Display the table with term-frequency values
    featurized_data.show()

    # Вывести столбец "raw_features" таблицы featurized_data
    featurized_data.select('raw_features').show(truncate=False, vertical=True)

    # Print the list of terms in the vocabulary
    print(vocabulary)

    # Compute the IDF values
    idf = IDF(inputCol='raw_features', outputCol='features')
    idf_model = idf.fit(featurized_data)
    rescaled_data = idf_model.transform(featurized_data)

    # Display the rescaled_data table
    rescaled_data.show()

    # Вывести столбец "features" таблицы featurized_data
    rescaled_data.select('features').show(truncate=False, vertical=True)

    # Build the Word2Vec model
    word2Vec = Word2Vec(vectorSize=3,
                        minCount=0,
                        inputCol='words',
                        outputCol='result')
    model = word2Vec.fit(words)
    w2v_df = model.transform(words)
    w2v_df.show()
    persons = []

    cPersons = db.Persones.find({})
    for secName in cPersons:
        persons.append(secName['sName'])

    # findSynonyms returns a DataFrame of (word, similarity) rows
    synonyms = model.findSynonyms('погибла', 2).collect()

    for row in synonyms:
        print(row['word'], row['similarity'])

    spark.stop()
Code example #11
File: tfidf.py Project: han-keong/kindle_project
    df = df.repartition(20)

    # # Use nltk.word_tokenizer to tokenize words
    # @udf(ArrayType(StringType()))
    # def tokenize(string):
    #     return word_tokenize(string)

    # df = df.withColumn("words", tokenize("reviewText"))

    df = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="\\W").transform(df)
    df = df.drop("reviewText")

    cv_model = CountVectorizer(inputCol="words", outputCol="tf").fit(df)
    vocabulary = cv_model.vocabulary

    df = cv_model.transform(df)
    df = df.drop("words")
    df.cache()

    df = IDF(inputCol="tf", outputCol="tfidf").fit(df).transform(df)
    df = df.drop("tf")
    df.unpersist()

    @udf(MapType(StringType(), FloatType()))
    def create_map(vector):
        zipped = zip(vector.indices, vector.values)
        return dict((vocabulary[int(x)], float(y)) for (x, y) in zipped)

    results = df.withColumn("tfidf", create_map("tfidf"))

    results.write.json("hdfs:/output/tfidf", mode="overwrite")
Code example #12
class TFIDF:
    def __init__(self):

        self.sparkSession = SparkSession\
            .builder\
            .appName("DocumentSearchEngine")\
            .getOrCreate()

        self.sc = self.sparkSession.sparkContext
        self.tf = None

        # create vocabulary
        vocab_rdd = self.sparkSession.sparkContext.wholeTextFiles(
            "data/word_list.txt")
        wordlist = self.sparkSession.createDataFrame(vocab_rdd,
                                                     ["text", "data"])
        tokenizer = Tokenizer(inputCol="data", outputCol="words")
        wordsData = tokenizer.transform(wordlist)
        self.vocabModel = CountVectorizer(
            inputCol="words", outputCol="rawFeatures").fit(wordsData)
        self.word_to_id = dict()

        for id, word in enumerate(self.vocabModel.vocabulary):
            self.word_to_id[word] = id

        self.id_to_path = dict()
        # destructor
        atexit.register(self.cleanup)

    def get_vocabulary(self):
        return self.vocabModel.vocabulary

    def get_tf_idf(self, stage_folder):
        """

        :param stage_folder_queue:
        :return: sqlobject of path and features
        """

        print(stage_folder)
        documents_rdd = self.sparkSession.sparkContext.wholeTextFiles(
            stage_folder + "/*")
        #documents_rdd.show(20, False)
        #state.logger.debug("documents : %s",[each for each in documents_rdd.collect()])

        documents = self.sparkSession.createDataFrame(documents_rdd,
                                                      ["path", "text"])
        tokenizer = Tokenizer(inputCol="text", outputCol="words")
        wordsData = tokenizer.transform(documents)
        #wordsData.select("path", "words").show(20,False)

        current_tf = self.vocabModel.transform(wordsData)
        #current_tf.show(20, False)

        if self.tf is not None:
            self.tf = self.tf.union(current_tf)
        else:
            self.tf = current_tf

        #self.tf.show(20, False)

        idf = IDF(inputCol="rawFeatures", outputCol="tfidf")
        idfModel = idf.fit(self.tf)

        self.tfidf = idfModel.transform(self.tf)
        #rescaledData.select("path", "features")

        ans = []
        state.logger.debug(
            "TFIDF : %s",
            [each for each in self.tfidf.select("path", "tfidf").collect()])
        for each in self.tfidf.select("path", "tfidf").collect():
            ans.append(each)

        return ans

    def get_tf_idf_map(self, tfidf):

        # create mapping dict
        tf_idf_map = dict()
        file_to_id_map = dict()
        id_to_file_map = dict()
        index = 0
        for row in tfidf:
            doc = row.path
            if doc not in file_to_id_map:
                id_to_file_map[index] = doc
                file_to_id_map[doc] = index
                index += 1
            for word, score in zip(row.tfidf.indices, row.tfidf.values):
                if score > 0:
                    if word not in tf_idf_map:
                        tf_idf_map[word] = []
                    tf_idf_map[word].append((file_to_id_map[doc], score))

        for key in tf_idf_map:
            tf_idf_map[key].sort(key=lambda x: -x[1])

        return tf_idf_map, file_to_id_map, id_to_file_map, self.vocabModel.vocabulary, self.word_to_id

    def cleanup(self):
        self.sparkSession.stop()
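A hypothetical usage sketch for the class above (the stage folder path is illustrative, and data/word_list.txt must exist as assumed in __init__):

if __name__ == "__main__":
    engine = TFIDF()
    print(len(engine.get_vocabulary()))  # vocabulary built from data/word_list.txt

    tfidf_rows = engine.get_tf_idf("data/stage_1")  # score every document in the folder
    tf_idf_map, file_to_id, id_to_file, vocab, word_to_id = engine.get_tf_idf_map(tfidf_rows)

    # documents ranked by tf-idf score for an example query word
    word = "spark"
    if word in word_to_id:
        for doc_id, score in tf_idf_map.get(word_to_id[word], [])[:5]:
            print(id_to_file[doc_id], score)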
Code example #13
File: analysisLDA.py Project: chopintzc/Enron
sentenceData = spark.createDataFrame(all_data, ["label", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)
print "original words:"
for words_label in wordsData.select("words", "label").take(printLines):
    print(words_label)
'''
calculate term frequency
'''
#hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
#featurizedData = hashingTF.transform(wordsData)
cv = CountVectorizer(inputCol="words",
                     outputCol="rawFeatures",
                     vocabSize=vocabNumber,
                     minDF=minDFValue).fit(wordsData)
featurizedData = cv.transform(wordsData)
print "words after TF:"
print cv.vocabulary
for words_label in featurizedData.select("rawFeatures",
                                         "words").take(printLines):
    print(words_label)
'''
calculate IDF
'''
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
dataset = idfModel.transform(featurizedData)
print "words after IDF:"
for features_label in dataset.select("features", "label").take(printLines):
    print(features_label)
'''
Code example #14
def train_model_sentences_with_person():
    sentences_with_person_collection = get_db_collection_object(
        'SentencesWithPerson')

    with open("sentences_with_person.txt", "w",
              encoding='utf-8') as file_sentences_with_person:
        for sen in sentences_with_person_collection.find():
            file_sentences_with_person.write('{0}\n'.format(sen['sentence']))

    spark = SparkSession \
        .builder \
        .appName("SentenceProcessor") \
        .getOrCreate()

    input_data = spark.sparkContext.textFile('./sentences_with_person.txt')
    prepared_data = input_data.map(lambda x: (x, len(x)))
    prepared_data = prepared_data.filter(lambda x: x[1] > 0)

    prepared_df = prepared_data.toDF().selectExpr('_1 as sentence',
                                                  '_2 as length')
    # prepared_df.show(truncate=False)

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    words_data = tokenizer.transform(prepared_df)
    # words_data.show(truncate=False)

    # Filter the tokens, keeping only words
    filtered_words_data = words_data.rdd.map(
        lambda x: (x[0], x[1], get_only_words(x[2])))
    filtered_df = filtered_words_data.toDF().selectExpr(
        '_1 as sentence', '_2 as length', '_3 as words')
    # filtered_df.show()

    # Remove stop words (conjunctions, prepositions, pronouns, etc.)
    stop_words = stopwords.words('russian')
    remover = StopWordsRemover(inputCol='words',
                               outputCol='filtered',
                               stopWords=stop_words)
    filtered = remover.transform(filtered_df)

    #
    normalize_words_data = filtered.rdd.map(
        lambda x: (x[0], x[1], x[2], normalization_sentence(x[3])))
    normalized_df = normalize_words_data.toDF().selectExpr(
        '_1 as sentence', '_2 as length', '_3 as words',
        '_4 as normalize_words')
    # normalized_df.show()

    #
    vectorizer = CountVectorizer(inputCol='normalize_words',
                                 outputCol='raw_features').fit(normalized_df)
    featurized_data = vectorizer.transform(normalized_df)
    featurized_data.cache()

    #
    idf = IDF(inputCol='raw_features', outputCol='features')
    idf_model = idf.fit(featurized_data)
    rescaled_data = idf_model.transform(featurized_data)

    # Build the Word2Vec model
    word2Vec = Word2Vec(vectorSize=300,
                        minCount=0,
                        inputCol='normalize_words',
                        outputCol='result')
    doc2vec_pipeline = Pipeline(stages=[tokenizer, word2Vec])
    model = word2Vec.fit(rescaled_data)
    w2v_df = model.transform(rescaled_data)
    # w2v_df.show(truncate=False)

    # print(model.findSynonyms('бочаров', 2).show())

    # sc = spark.sparkContext
    path = './models/model_person'
    #
    # print(sc, path)
    model.write().overwrite().save(path)

    #m = Word2Vec.load('./models/model_person/')
    # pickle.dump(model, './models/model_person/mp.model')

    spark.stop()
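The commented-out Word2Vec.load call above hints at reloading the persisted model; note that a fitted model is reloaded through Word2VecModel rather than the Word2Vec estimator. A minimal sketch (it assumes an active SparkSession and reuses the query word from the commented-out findSynonyms call):

from pyspark.ml.feature import Word2VecModel

reloaded_model = Word2VecModel.load('./models/model_person')
reloaded_model.findSynonyms('бочаров', 2).show()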
Code example #15
    testing_set_raw = df_filtered.filter(
        df_filtered.id >= training_end_idx).repartition(partitions_no)

    # 2. Create vocabulary
    log.warn("Building vocabulary")
    cv_model = CountVectorizer(inputCol="filtered",
                               outputCol="vectors",
                               minDF=minDF,
                               vocabSize=vocab_size).fit(training_set_raw)
    V = len(cv_model.vocabulary)
    log.warn("Vocabulary size = {0}".format(V))

    # 3. Transform documents to BOW representation:
    # each doc is represented as SparseVector: (vocabSize, {word_id:count, word_id:count,...})
    log.warn("Transform training dataset to bow representation")
    training_set = cv_model.transform(training_set_raw).select(
        'id', 'vectors').cache()
    log.warn('Training set: {0} documents'.format(training_set.count()))
    training_set_local = training_set.collect()

    # 4. Initialize model:
    # 4.1 each doc represented by (id, z_n array (topic to word assignment) and c_k_m (topics distribution for doc)
    # 4.2 randomly assign topic to each word in document, increment c_k_m accordingly
    z_m_n = training_set.rdd.map(init, preservesPartitioning=True).cache()

    z_m_n_matrix = z_m_n.flatMap(word_topics).reduceByKey(
        lambda a, b: a + b).collect()
    c_k_global = update_c_k(z_m_n_matrix)
    c_k_n_global = update_c_k_n(z_m_n_matrix)

    c_k_m_x = get_c_k_m_x(
        z_m_n.map(lambda x: (x[0], x[2])).sortByKey(ascending=True).collect())
Code example #16
# ### Sample of 2-word nGrams on Maintenance Notes
tk = Tokenizer(inputCol="note", outputCol="words") # Tokenize
maintTokenized = tk.transform(maintenance)
swr = StopWordsRemover(inputCol="words", outputCol="filtered") # Remove stop-words
maintFiltered = swr.transform(maintTokenized)
ngram = NGram(n=2, inputCol="filtered", outputCol="ngrams") # 2-word nGrams
maintNGrams = ngram.transform(maintFiltered)
maintNGrams.select('ngrams').show(5, truncate=False)

# ### Topic Clustering using Latent Dirichlet Allocation (LDA)
# LDA is a form of un-supervised machine learning that identifies clusters, or topics,
# in the data
cv = CountVectorizer(inputCol="ngrams", outputCol="features", vocabSize=50)\
  .fit(maintNGrams) # CountVectorize converts nGram array into a vector of counts
maintVectors = cv.transform(maintNGrams)
vocabArray = cv.vocabulary
lda = LDA(k=3, maxIter=10)
ldaModel = lda.fit(maintVectors)

ldaModel.write().overwrite().save('lda.mdl')

topics = ldaModel.describeTopics(5)
# We see below that each maintenance log can be clustered based on its text into
# 1 of the 3 topics below. The nGrams in each cluster clearly show 3 types of maintenance
# activities:
# 1. Preventive maintenance occurs when we have 'abnormal readings' or a 'component replacement'
# 2. Corrective maintenance occurs when we have an 'asset shutdown' event or 'asset failure'
# 3. The rest of the logs indicate that no downtime is required (i.e. 'maintenance tests passed', 'asset healthy')
for topic in topics.collect():
    print('Topic %d Top 5 Weighted nGrams' % (topic[0]+1))
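    # hypothetical continuation (not shown in the original excerpt): map each
    # topic's termIndices back to the CountVectorizer vocabulary
    for idx, weight in zip(topic['termIndices'], topic['termWeights']):
        print('  %s (%.4f)' % (vocabArray[idx], weight))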
Code example #17
def main(train_x, train_y, test_x, test_y=None, base='gs'):
    # generate joint feature set
    train_features = elizabeth.preprocess.load(train_x,
                                               train_y,
                                               base=base,
                                               kind='joint').drop('url')
    test_features = elizabeth.preprocess.load(test_x,
                                              test_y,
                                              base=base,
                                              kind='joint').drop('url')

    train_features.show()

    token_counter = CountVectorizer(inputCol='features',
                                    outputCol='tokenCounts',
                                    minDF=10).fit(train_features)
    train = token_counter.transform(train_features).drop('features')
    test = token_counter.transform(test_features).drop('features')

    # convert the string labels to numeric indices
    # the handleInvalid param allows the label indexer to deal with labels that weren't seen during fitting
    label_indexer = StringIndexer(inputCol='label',
                                  outputCol='indexedLabel',
                                  handleInvalid="skip")
    label_indexer = label_indexer.fit(train)
    train = label_indexer.transform(train)
    # the test set won't always have labels
    if test_y is not None:
        test = label_indexer.transform(test)

    index_labeller = IndexToString(inputCol='prediction',
                                   outputCol='predictedClass',
                                   labels=label_indexer.labels)

    # create and train a Random Forest classifier
    rf = RandomForestClassifier(labelCol='indexedLabel',
                                featuresCol='tokenCounts',
                                numTrees=20,
                                maxDepth=10,
                                minInfoGain=0.0,
                                seed=12345)
    model = rf.fit(train)
    prediction = model.transform(test)
    prediction = index_labeller.transform(
        prediction)  # DF[id, url, ... prediction, predictedClass]

    # If labels are given for the test set, print a score.
    if test_y:
        evaluator = MulticlassClassificationEvaluator(
            labelCol="indexedLabel",
            predictionCol='prediction',
            metricName='accuracy')
        accuracy = evaluator.evaluate(prediction)
        print("\n\tAccuracy on test set: %0.6f\n" % accuracy)

    # If no labels are given for the test set, print predictions.
    else:
        prediction = prediction.orderBy(prediction.id).select(
            prediction.predictedClass)
        prediction = prediction.rdd.map(
            lambda prediction: int(prediction.predictedClass))
        prediction = prediction.toLocalIterator()
        print(*prediction, sep='\n')
Code example #18
class BM25Model(object):
    """
    Computes BM25 score.
    """
    def __init__(self, k=1.2, b=.75):
        self.k = k
        self.b = b
        self.tok = Tokenizer(inputCol='__input', outputCol='__tokens')
        self.vec = CountVectorizer(inputCol='__tokens', outputCol='__counts')
        self.idf = IDF(inputCol='__counts', outputCol='__idf')
        self.train_col = None
        self.udf = None
        self.is_fit = False

    def fit(self, df, train_col):
        """
        Does fitting on input df.
            df: a pyspark dataframe.
            train_col (string): The name of the column containing training documents.
            
        Returns: self, the fitted BM25Model.
        """
        self.train_col = train_col
        df_ = self.tok.transform(df.withColumnRenamed(train_col, '__input'))
        mean_dl = df_.select(F.mean(F.size(F.col('__tokens')))).collect()[0][0]
        self.vec = self.vec.fit(df_)
        df_ = self.vec.transform(df_)
        self.idf = self.idf.fit(df_)
        #this will reset value of self.udf to be a working udf function.
        exec(udf_template.format(mean_dl, self.k, self.b))
        self.is_fit = True
        return self

    def transform(self,
                  df,
                  score_col,
                  bm25_output_name='bm25',
                  tf_output_name=None,
                  ntf_output_name=None,
                  tfidf_output_name=None):
        """
        Computes the BM25 score,
            along with term frequency (tf), normalized term frequency (ntf), and tfidf.
            These three additional scores come "for free" with bm25
            but are only returned optionally.
        """
        if not self.is_fit:
            raise Exception(
                "You must fit the BM25 model with a call to .fit() first.")
        columns = df.columns
        df_ = self.tok.transform(df.withColumnRenamed(score_col, '__input'))
        df_ = self.vec.transform(df_)
        df_ = self.idf.transform(df_)
        df_ = (df_.withColumnRenamed(
            '__counts', '__query_counts').withColumnRenamed(
                '__input',
                score_col)).select(columns +
                                   [score_col, '__query_counts', '__idf'])
        df_ = self.tok.transform(
            df_.withColumnRenamed(self.train_col, '__input'))
        df_ = self.vec.transform(df_)
        df_ = df_.withColumnRenamed('__counts', '__item_counts')
        df_ = df_.withColumn(
            'bm25',
            self.udf(F.col('__query_counts'), F.col('__item_counts'),
                     F.col('__idf')))
        df_ = df_.withColumnRenamed('__input', self.train_col)
        computed_values = df_.withColumn(
            'more',
            F.explode(F.array(F.col('bm25')))).select(columns + ['bm25.*'])

        #this is logic for naming output column(s)
        final_selection = columns
        if bm25_output_name is not None:
            computed_values = computed_values.withColumnRenamed(
                'bm25', bm25_output_name)
            final_selection.append(bm25_output_name)
        if tf_output_name is not None:
            computed_values = computed_values.withColumnRenamed(
                'tf', tf_output_name)
            final_selection.append(tf_output_name)
        if ntf_output_name is not None:
            computed_values = computed_values.withColumnRenamed(
                'ntf', ntf_output_name)
            final_selection.append(ntf_output_name)
        if tfidf_output_name is not None:
            computed_values = computed_values.withColumnRenamed(
                'tfidf', tfidf_output_name)
            final_selection.append(tfidf_output_name)

        return computed_values.select(final_selection)
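A hypothetical usage sketch showing the intended call pattern of the class above; it assumes udf_template (referenced by the exec() call in fit) is defined elsewhere in the same module, and the data and column names here are illustrative:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
docs = spark.createDataFrame(
    [("the quick brown fox", "quick fox"),
     ("lazy dogs sleep all day", "lazy dog")],
    ["document", "query"])

bm25 = BM25Model(k=1.2, b=0.75).fit(docs, train_col="document")
scored = bm25.transform(docs, score_col="query", bm25_output_name="bm25")
scored.show(truncate=False)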
Code example #19
def compute(sc, topLeft, bottomRight, step, datasetPath, k, gfs):
    sqlContext = SQLContext(sc)
    data = sc.textFile(datasetPath)
    data = data.mapPartitions(lambda x: csv.reader(x))
    header = data.first()
    data = data.filter(lambda x: x != header)
    result_to_write = []
    res_computation = []
    step = check_step(topLeft, bottomRight, step)
    squares = get_squares(topLeft, bottomRight, step)
    # start computing elapsed time here
    start_time = time.time()
    data = data.map(lambda x: is_inside(x, topLeft, bottomRight, step, squares)). \
        filter(lambda x: x is not None)
    data = data.map(remove_punctuation). \
        map(split_string_into_array). \
        filter(remove_empty_array). \
        map(create_row). \
        groupByKey(). \
        map(lambda x : (x[0], list(x[1])))
    # create the dataframes
    allDf = []
    for df in data.collect():
        if df:
            allDf.append([df[0], sqlContext.createDataFrame(df[1])])

    for docDFs in allDf:
        docDF = docDFs[1]
        squareId = docDFs[0]
        StopWordsRemover.loadDefaultStopWords('english')
        newDocDF_eng = StopWordsRemover(inputCol="words", outputCol="filtered_eng"). \
            transform(docDF)
        newDocDF_eng = newDocDF_eng.drop('words')
        StopWordsRemover.loadDefaultStopWords('italian')
        newDocDF_ita = StopWordsRemover(inputCol="filtered_eng", outputCol="filtered_ita"). \
            transform(newDocDF_eng)
        newDocDF_ita = newDocDF_ita.drop('filtered_eng')
        StopWordsRemover.loadDefaultStopWords('german')
        newDocDF_ger = StopWordsRemover(inputCol="filtered_ita", outputCol="filtered_ger"). \
            transform(newDocDF_ita)
        newDocDF_ger = newDocDF_ger.drop('filtered_ita')

        model = CountVectorizer(inputCol="filtered_ger", outputCol="vectors"). \
            fit(newDocDF_ger)
        result = model.transform(newDocDF_ger)
        corpus = result.select("idd", "vectors").rdd.map(create_corpus).cache()
        # cluster the documents into the k topics using LDA
        ldaModel = LDA.train(corpus,
                             k=k,
                             maxIterations=100,
                             optimizer='online')
        vocabArray = model.vocabulary
        wordNumbers = 10  # number of words per topic
        topicIndices = sc.parallelize(
            ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))

        toBePrinted = min(len(vocabArray), wordNumbers)
        topics_final = topicIndices.map(
            lambda x: topic_render(x, toBePrinted, vocabArray)).collect()
        # compute labels
        topics_label = []
        for topic in topics_final:
            for topic_term in topic:
                if topic_term not in topics_label:
                    topics_label.append(topic_term)
                    break
        # print topics
        s = "; "
        res = "{}, {}, {}, {}, {}".format(topLeft.x, topLeft.y, bottomRight.x,
                                          bottomRight.y, s.join(topics_label))
        result_to_write.append(res)
        res_computation.append(topics_label)

    end_time = time.time()
    elapsed_time = end_time - start_time
    result_to_write.append(elapsed_time)
    to_write = sc.parallelize(result_to_write)
    # get dataset size from file name
    size = datasetPath.split('.')[0].split('_')[1]
    if gfs:
        output_folder = "/tmp/Topic_Zoomer_" + str(
            time.ctime(start_time)).replace(' ', '_').replace(':',
                                                              '-') + '_' + size
    else:
        output_folder = "Topic_Zoomer_" + str(time.ctime(start_time)).replace(
            ' ', '_').replace(':', '-') + '_' + size
    to_write.saveAsTextFile(output_folder)

    if gfs:
        copyHdfsCmd = 'hdfs dfs -copyToLocal {} {}'.format(
            output_folder, output_folder)
        copyBucketCmd = 'gsutil cp -r {} {}'.format(output_folder,
                                                    gfs_output_path_hdfs)
        copyRecBucketCmd = 'gsutil cp -r {} {}'.format(recFileFolder,
                                                       gfs_output_path_hdfs)
        copyHdfsRes = subprocess.call(shlex.split(copyHdfsCmd))
        copyBucketRes = subprocess.call(shlex.split(copyBucketCmd))
        copyRecBucketRes = subprocess.call(shlex.split(copyRecBucketCmd))
        # some exit code checks
        if copyBucketRes or copyHdfsRes or copyRecBucketRes:
            print('hdfsRes: {}'.format(copyHdfsRes))
            print('bucketResComp: {}'.format(copyBucketRes))
            print('bucketResRec: {}'.format(copyRecBucketRes))
            print('Something went wrong while copying results')
    return res_computation