def __make_dataset_bags(self, in_files):
    result = []
    file_indices = []
    for index, filename in enumerate(in_files):
        sentences = self.sentences_extractor(filename)
        # drop stop-list words from every sentence before building the bag
        sentences_filtered = []
        for raw_sentence in sentences:
            sentences_filtered.append([word for word in raw_sentence if word not in self.stop_list])
        # bag_of_words is a project-local helper module
        bag = bag_of_words.sentences_to_bag_of_words(sentences_filtered)
        # keep only documents with at least 20 distinct terms
        if len(bag) >= 20:
            result.append(bag)
            file_indices.append(index)
    return (result, file_indices)
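
Both snippets rely on bag_of_words.sentences_to_bag_of_words, which is not shown here. Given that len(bag) is compared against a threshold and the bags are later fed to a DictVectorizer, it presumably maps each term to its count within a document; a minimal sketch under that assumption:

import collections

def sentences_to_bag_of_words(sentences):
    # hypothetical sketch: count occurrences of every word across the
    # document's sentences; DictVectorizer accepts such dicts directly
    counts = collections.Counter()
    for sentence in sentences:
        counts.update(sentence)
    return dict(counts)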
Example No. 2
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

def process_data(in_folder, in_sentences_extractor):
    bags = []
    (files, categories) = dataset_loading.get_files_list(in_folder)
    for filename in files:
        sentences = in_sentences_extractor(filename)
        bags.append(bag_of_words.sentences_to_bag_of_words(sentences))
    # map each category name to an integer label
    categories_dict = dataset_loading.get_categories_dict(categories)
    categories_vector = [categories_dict[category] for category in categories]

    vectorizer = DictVectorizer()
    # builds a vocabulary out of all words seen in the bags
    term_document_matrix = vectorizer.fit_transform(bags)
    tfidf_transformer = TfidfTransformer()
    # in the resulting matrix rows are documents, columns are features (terms' tf-idf weights)
    tfidf_matrix = tfidf_transformer.fit_transform(term_document_matrix)
    return (tfidf_matrix, categories_dict, categories_vector)
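
The returned tf-idf matrix and label vector plug straight into a scikit-learn estimator. A hypothetical usage sketch (the folder name and plain_text_sentence_extractor are placeholders, not part of the project):

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# 'data/' and plain_text_sentence_extractor are placeholders for illustration
tfidf_matrix, categories_dict, categories_vector = process_data(
    'data/', plain_text_sentence_extractor)
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_matrix, categories_vector, test_size=0.2, random_state=0)
classifier = MultinomialNB().fit(X_train, y_train)
print(classifier.score(X_test, y_test))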