Esempio n. 1
0
    def __init__(self, data=None, language='english', stop_words='english'):
        """Set up the training set, vectorizer, TF-IDF transformer and stemmer.

        Args:
            data: Mapping whose ``values()`` are objects exposing a ``title``
                attribute. When omitted (``None``), the result of
                ``get_all_offline_articles()`` is used.
            language: Language name handed to ``Stemmer``.
            stop_words: Stop-word setting handed to ``CountVectorizer``.
        """
        # Resolve the default at call time: a call in the signature would be
        # evaluated once, when the method is defined, and the same object
        # would then be shared by every instance created without ``data``.
        if data is None:
            data = get_all_offline_articles()

        super(TextProcessor, self).__init__(data)

        # Work on a deep copy so the caller's objects are never mutated.
        # ``list(...)`` materialises the values first because a Python 3
        # dict view cannot be deep-copied; on Python 2 it is a cheap copy.
        articles = deepcopy(list(data.values()))
        # Sorting by title
        self.trainset = sorted(articles, key=attrgetter('title'))

        self.data = data
        self.count_vectorizer = CountVectorizer(stop_words=stop_words)

        self.tfidf = TfidfTransformer(norm="l2")
        self.stemmer = Stemmer(language)
 def setUp(self):
     """Create the TextProcessor fixture from all offline articles."""
     self.tp = TextProcessor(get_all_offline_articles())
Esempio n. 3
0
def createTextProcessor():
    """Build a TextProcessor over all offline articles and prepare it.

    Returns:
        The TextProcessor instance, after its ``prepare()`` method has run.
    """
    processor = TextProcessor(get_all_offline_articles())
    processor.prepare()
    return processor