Ejemplo n.º 1
0
 def process(self, documents):
     """Collect term statistics for *documents* and store them on the instance.

     Calls ``getTermStatistics`` with *documents* followed, positionally, by
     ``self.rm_stopwords``, ``self.rm_numbers``, ``self.pos_tags``,
     ``self.mapping``, ``self.es_index``, ``self.es_doc_type`` and ``self.es``.
     The call must yield exactly five values, which are saved in order as
     ``tfidfArray``, ``tfArray``, ``ttf``, ``corpus`` and ``documents``.

     Returns nothing; the results are only available through the attributes.
     """
     stats = getTermStatistics(
         documents,
         self.rm_stopwords,
         self.rm_numbers,
         self.pos_tags,
         self.mapping,
         self.es_index,
         self.es_doc_type,
         self.es,
     )
     # Unpacking enforces the five-element shape before anything is stored.
     tfidf_values, tf_values, ttf_values, vocabulary, doc_urls = stats

     self.tfidfArray = tfidf_values
     self.tfArray = tf_values
     self.ttf = ttf_values
     self.corpus = vocabulary
     # NOTE(review): the fifth value is named ``urls`` upstream -- presumably
     # the URLs of the processed documents; confirm against getTermStatistics.
     self.documents = doc_urls
Ejemplo n.º 2
0
 def process(self, documents):
     """Collect term statistics for *documents* and store them on the instance.

     ``getTermStatistics`` receives *documents*, ``self.rm_stopwords``,
     ``self.rm_numbers``, ``self.pos_tags`` and ``self.term_freq``
     positionally, plus ``mapping``, ``es_index``, ``es_doc_type`` and ``es``
     as keyword arguments taken from the matching attributes on ``self``.

     The call must yield exactly five values; they are assigned, in order, to
     ``tfidfArray``, ``tfArray``, ``ttf``, ``corpus`` and ``documents``.
     """
     lookup_options = dict(
         mapping=self.mapping,
         es_index=self.es_index,
         es_doc_type=self.es_doc_type,
         es=self.es,
     )
     # The whole result is unpacked before the first attribute is written, so
     # a wrong-length result raises without touching the instance.
     (
         self.tfidfArray,
         self.tfArray,
         self.ttf,
         self.corpus,
         self.documents,
     ) = getTermStatistics(
         documents,
         self.rm_stopwords,
         self.rm_numbers,
         self.pos_tags,
         self.term_freq,
         **lookup_options
     )
Ejemplo n.º 3
0
 def term_tfidf(self):
     """Return ``[urls, corpus, array]`` for the URLs held in ``self.urls_set``.

     ``self.urls_set`` is snapshotted into a list, which is handed to
     ``getTermStatistics``.  That call must yield exactly two values: an
     object exposing ``toarray()`` and a corpus.  The result list contains
     the URL list, the corpus, and the output of ``toarray()``.
     """
     url_list = list(self.urls_set)
     term_matrix, vocabulary = getTermStatistics(url_list)
     # Legacy alternative (disabled upstream):
     #   tfidf.tfidf(get_bag_of_words(list(self.urls_set))).getTfidfArray()
     # NOTE(review): ``toarray()`` suggests a sparse matrix -- confirm.
     dense_matrix = term_matrix.toarray()
     return [url_list, vocabulary, dense_matrix]
Ejemplo n.º 4
0
 def process(self, documents):
     """Run ``getTermStatistics`` on *documents* and keep both results.

     The call must yield exactly two values; the first is stored as
     ``self.tfidfArray`` and the second as ``self.corpus``.
     """
     stats = getTermStatistics(documents)
     # Unpack first so a wrong-length result fails before any attribute is set.
     tfidf_values, vocabulary = stats
     self.tfidfArray = tfidf_values
     self.corpus = vocabulary
 def term_tfidf(self, urls):
   """Return the five term-statistics values for *urls* as a list.

   ``getTermStatistics`` is called with *urls* and with keyword arguments
   read from the ``es_info`` mapping (keys ``'mapping'``,
   ``'activeCrawlerIndex'`` and ``'docType'``) and from ``self._es``.
   ``es_info`` is not defined in this method; it must be resolvable from an
   enclosing scope.

   The call must yield exactly five values, which are returned in the same
   order inside a new list.
   """
   stats = getTermStatistics(
     urls,
     mapping=es_info['mapping'],
     es_index=es_info['activeCrawlerIndex'],
     es_doc_type=es_info['docType'],
     es=self._es,
   )
   # Explicit unpacking keeps the "exactly five values" check of the call.
   tfidf_values, tf_values, ttf_values, vocabulary, result_urls = stats
   return [tfidf_values, tf_values, ttf_values, vocabulary, result_urls]
Ejemplo n.º 6
0
 def process(self, documents, es_index = 'memex', es_doc_type = 'page', es = None):
     """Collect term statistics for *documents* and store them on the instance.

     *documents*, *es_index* (default ``'memex'``), *es_doc_type* (default
     ``'page'``) and *es* (default ``None``) are forwarded positionally, in
     that order, to ``getTermStatistics``.

     The call must yield exactly four values; they are saved, in order, as
     ``tfidfArray``, ``tfArray``, ``ttf`` and ``corpus``.
     """
     call_args = (documents, es_index, es_doc_type, es)
     tfidf_values, tf_values, ttf_values, vocabulary = getTermStatistics(*call_args)

     self.tfidfArray = tfidf_values
     self.tfArray = tf_values
     self.ttf = ttf_values
     self.corpus = vocabulary
Ejemplo n.º 7
0
 def process(self, documents):
     """Store the two results of ``getTermStatistics(documents)`` on ``self``.

     The first returned value becomes ``self.tfidfArray`` and the second
     becomes ``self.corpus``.  Any other number of returned values makes the
     unpacking fail before either attribute is written.
     """
     tfidf_values, vocabulary = getTermStatistics(documents)
     self.tfidfArray = tfidf_values
     self.corpus = vocabulary
Ejemplo n.º 8
0
 def term_tfidf(self):
     """Build ``[urls, corpus, array]`` from the URLs in ``self.urls_set``.

     A list copy of ``self.urls_set`` is passed to ``getTermStatistics``,
     which must yield exactly two values.  ``toarray()`` is called on the
     first, and the result is returned together with the URL list and the
     second value (the corpus).
     """
     url_list = list(self.urls_set)
     stats = getTermStatistics(url_list)
     term_matrix, vocabulary = stats
     # Legacy alternative (disabled upstream):
     #   tfidf.tfidf(get_bag_of_words(list(self.urls_set))).getTfidfArray()
     return [url_list, vocabulary, term_matrix.toarray()]