def preprocess_comment(comment):
    import preprocessing

    # Comments arrive as cp1252-encoded byte strings (Python 2); decode to unicode first
    comment = comment.decode('cp1252')

    # Additionally apply a stemmer here and strip stopwords
    #comment = preprocessing.preprocess_pipeline(comment, "english", "LancasterStemmer", True, True, False)
    #comment = preprocessing.preprocess_pipeline(comment, "english", "WordNetLemmatizer", True, True, False)
    comment = preprocessing.preprocess_pipeline(comment, "english", "PorterStemmer", True, True, False)
    #comment = preprocessing.preprocess_pipeline(comment, "english", "SnowballStemmer", True, True, False)

    return comment
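
A quick usage sketch, not part of the original listing: it assumes the local preprocessing module used above is importable and that comments are cp1252-encoded byte strings under Python 2; the sample comment is illustrative only.

raw_comment = "This was a really, really helpful comment!"  # byte string under Python 2
print(preprocess_comment(raw_comment))  # stemmed text with stopwords removed
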
Example #2
def preprocess_comment(comment):
    import preprocessing
    comment = comment.decode('cp1252')
    '''
    preprocess_pipeline(comment, language, stemmer_type,
                        do_remove_stopwords, do_clean_html)
    '''
    """
    comment = preprocessing.preprocess_pipeline(comment, "english",
                                                False, True, False, False)
    """                                            
    # Additionally apply a stemmer here and strip stopwords
    comment = preprocessing.preprocess_pipeline(comment, "english", "LancasterStemmer", True, True, False)
    return comment
Example #3
def file_to_words(url):
    # Emit a (word, 1) pair for every token on the page, ready for a
    # word-count style aggregation step.
    return [(word, 1) for word in preprocess_pipeline(
        UrlProcessor.get_parsed_page(url).text_content())]
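
A small aggregation sketch, assuming file_to_words and its UrlProcessor dependency are importable; the URL below is a placeholder.

from collections import Counter

counts = Counter()
for word, one in file_to_words("http://example.com/article.html"):
    counts[word] += one
print(counts.most_common(10))  # ten most frequent tokens on the page
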
Example #4
def stem(s):
    return preprocessing.preprocess_pipeline(
        s, return_as_str=True, do_remove_stopwords=True, do_clean_html=False)
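
A usage sketch for the stem helper, assuming preprocess_pipeline's defaults choose the language and stemmer; the sentence is illustrative only.

print(stem("The cats were chasing the striped mice"))  # e.g. a stemmed string with stopwords removed
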