def process(self, documents):
    [data_tfidf, data_tf, data_ttf, corpus, urls] = getTermStatistics(
        documents, self.rm_stopwords, self.rm_numbers, self.pos_tags,
        self.mapping, self.es_index, self.es_doc_type, self.es)
    self.tfidfArray = data_tfidf
    self.tfArray = data_tf
    self.ttf = data_ttf
    self.corpus = corpus
    self.documents = urls
def process(self, documents):
    [data_tfidf, data_tf, data_ttf, corpus, urls] = getTermStatistics(
        documents, self.rm_stopwords, self.rm_numbers, self.pos_tags,
        self.term_freq, mapping=self.mapping, es_index=self.es_index,
        es_doc_type=self.es_doc_type, es=self.es)
    self.tfidfArray = data_tfidf
    self.tfArray = data_tf
    self.ttf = data_ttf
    self.corpus = corpus
    self.documents = urls
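# A minimal usage sketch of the process() variant above. The wrapper class
# name TermStatsProcessor and its constructor are hypothetical -- only the
# attributes it must carry (rm_stopwords, rm_numbers, pos_tags, term_freq,
# mapping, es_index, es_doc_type, es) come from the snippet itself.
from elasticsearch import Elasticsearch

processor = TermStatsProcessor(rm_stopwords=True, rm_numbers=True,
                               pos_tags='', term_freq=0, mapping=None,
                               es_index='memex', es_doc_type='page',
                               es=Elasticsearch('http://localhost:9200'))
processor.process(['http://example.com/a', 'http://example.com/b'])
# After process() returns, the results live on the instance:
#   processor.tfidfArray  tf-idf weights, one row per document
#   processor.tfArray     raw term frequencies
#   processor.ttf         total term frequencies across the index
#   processor.corpus      vocabulary aligned with the matrix columns
#   processor.documents   URLs that were actually resolved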
def term_tfidf(self):
    urls = list(self.urls_set)
    [data, corpus] = getTermStatistics(urls)
    # Previous implementation, kept for reference:
    # all_docs = get_bag_of_words(list(self.urls_set))
    # return tfidf.tfidf(all_docs).getTfidfArray()
    return [urls, corpus, data.toarray()]
def process(self, documents):
    [data, corpus] = getTermStatistics(documents)
    self.tfidfArray = data
    self.corpus = corpus
def term_tfidf(self, urls):
    # NOTE: es_info is not defined in this snippet; it is assumed to be an
    # index-metadata dict resolved by the caller (see the sketch below).
    [data, data_tf, data_ttf, corpus, urls] = getTermStatistics(
        urls, mapping=es_info['mapping'],
        es_index=es_info['activeCrawlerIndex'],
        es_doc_type=es_info['docType'], es=self._es)
    return [data, data_tf, data_ttf, corpus, urls]
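# The es_info dict read above is not defined anywhere in these snippets; a
# plausible shape, inferred purely from the keys it is accessed with, would
# be (values are placeholders, not confirmed defaults):
es_info = {
    'mapping': None,                # field mapping forwarded to getTermStatistics
    'activeCrawlerIndex': 'memex',  # Elasticsearch index to query
    'docType': 'page',              # Elasticsearch document type
}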
def process(self, documents, es_index='memex', es_doc_type='page', es=None):
    [data_tfidf, data_tf, data_ttf, corpus] = getTermStatistics(
        documents, es_index, es_doc_type, es)
    self.tfidfArray = data_tfidf
    self.tfArray = data_tf
    self.ttf = data_ttf
    self.corpus = corpus
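# All the variants above yield a documents-by-terms matrix aligned with
# corpus. As a hedged illustration of consuming that result -- assuming the
# matrix is dense, or densified via toarray() as term_tfidf does above --
# this hypothetical top_terms() helper pulls the strongest terms per document:
import numpy as np

def top_terms(tfidf_array, corpus, k=10):
    # For each document row, take the indices of the k largest weights and
    # map them back to their corpus terms.
    matrix = np.asarray(tfidf_array)
    return [[corpus[i] for i in np.argsort(row)[::-1][:k]] for row in matrix]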