def __init__(self, data=None, language='english', stop_words='english'):
    """Set up the text-processing pipeline for a collection of articles.

    Args:
        data: Object exposing ``values()`` whose items carry a ``title``
            attribute. When omitted (``None``), the articles are loaded
            with ``get_all_offline_articles()`` at call time.
        language: Forwarded to ``Stemmer``.
        stop_words: Forwarded to ``CountVectorizer`` as ``stop_words``.
    """
    # Resolve the default lazily: a call in the signature would run once at
    # definition time (loading articles on import) and the same object
    # would be shared by every instance built without an explicit ``data``.
    if data is None:
        data = get_all_offline_articles()

    super(TextProcessor, self).__init__(data)

    # Work on a private deep copy so the caller's articles are not touched.
    # ``values()`` is materialised with ``list()`` first because on Python 3
    # it is a view object, which ``deepcopy`` cannot copy.
    articles = deepcopy(list(data.values()))
    # Sorting by title
    self.trainset = sorted(articles, key=attrgetter('title'))
    self.data = data

    self.count_vectorizer = CountVectorizer(stop_words=stop_words)
    self.tfidf = TfidfTransformer(norm="l2")
    self.stemmer = Stemmer(language)
def setUp(self):
    """Build a ``TextProcessor`` from the offline articles and keep it on ``self.tp``."""
    self.tp = TextProcessor(get_all_offline_articles())
def createTextProcessor():
    """Return a ``TextProcessor`` built from the offline articles.

    ``prepare()`` is called on the processor before it is returned.
    """
    processor = TextProcessor(get_all_offline_articles())
    processor.prepare()
    return processor