def preprocess_comment(comment):
    """Decode a raw cp1252-encoded comment and normalise it for analysis.

    Runs the shared preprocessing pipeline with the PorterStemmer,
    removing stopwords; HTML cleaning is disabled.

    :param comment: byte string in cp1252 encoding
                    (NOTE(review): ``str.decode`` here is Python 2 style —
                    under Python 3 this requires ``bytes`` input; confirm.)
    :return: the preprocessed comment as produced by the pipeline
    """
    # Local import keeps the dependency scoped to this function.
    import preprocessing
    comment = comment.decode('cp1252')
    # Pipeline signature (per the sibling definition below):
    # preprocess_pipeline(text, language, stemmer_type,
    #                     do_remove_stopwords, ?, do_clean_html)
    # LancasterStemmer / WordNetLemmatizer / SnowballStemmer were tried
    # as alternatives; PorterStemmer is the variant kept here.
    comment = preprocessing.preprocess_pipeline(
        comment, "english", "PorterStemmer", True, True, False)
    return comment
def preprocess_comment(comment):
    """Decode a raw cp1252-encoded comment and normalise it for analysis.

    Runs the shared preprocessing pipeline with the LancasterStemmer,
    removing stopwords; HTML cleaning is disabled.

    NOTE(review): this redefines the earlier ``preprocess_comment`` in
    this file — at import time only this later definition takes effect.

    :param comment: byte string in cp1252 encoding
                    (NOTE(review): ``str.decode`` here is Python 2 style —
                    under Python 3 this requires ``bytes`` input; confirm.)
    :return: the preprocessed comment as produced by the pipeline
    """
    # Local import keeps the dependency scoped to this function.
    import preprocessing
    # Pipeline signature (from the original in-line comment):
    # preprocess_pipeline(comment, language, stemmer_type,
    #                     do_remove_stopwords, ?, do_clean_html)
    comment = comment.decode('cp1252')
    # Stem with LancasterStemmer and strip stopwords.
    comment = preprocessing.preprocess_pipeline(
        comment, "english", "LancasterStemmer", True, True, False)
    return comment
def file_to_words(url):
    """Fetch the page at *url* and emit ``(word, 1)`` count pairs.

    Downloads and parses the page via ``UrlProcessor``, pushes the
    extracted text through ``preprocess_pipeline``, and pairs each
    resulting token with an initial count of 1 (map-reduce style).
    """
    page_text = UrlProcessor.get_parsed_page(url).text_content()
    tokens = preprocess_pipeline(page_text)
    return [(token, 1) for token in tokens]
def stem(s):
    """Preprocess *s* and return the result as a single string.

    Delegates to ``preprocessing.preprocess_pipeline`` with stopword
    removal enabled and HTML cleaning disabled.

    NOTE(review): relies on a module-level ``preprocessing`` import —
    the sibling functions import it locally; confirm it is imported
    at the top of the file.
    """
    return preprocessing.preprocess_pipeline(
        s,
        return_as_str=True,
        do_remove_stopwords=True,
        do_clean_html=False,
    )