def process_corpus(self, args):
    """Rebuild the processed-text directory from the corpus.

    Wipes and recreates ``args.processed_dir``, loads the main corpus,
    optionally loads the genre corpus (when ``args.genre_compare`` is the
    string ``"True"``), then runs ``self.filter_tags`` over every text.

    Args:
        args: parsed command-line namespace providing ``processed_dir``,
            ``tag_filter`` and ``genre_compare`` (a string flag —
            presumably from argparse, which passes values as strings).

    Raises:
        ValueError: genre comparison was requested but the
            ``genre_corpus/`` directory does not exist.
    """
    processed_dir = args.processed_dir
    tag_filter = args.tag_filter

    # Always start from a clean output directory.
    if os.path.exists(processed_dir):
        shutil.rmtree(processed_dir)
    os.makedirs(processed_dir)

    corpus = french_main.Corpus()

    # Evaluate the string flag once instead of re-comparing it three times.
    genre_compare = args.genre_compare == "True"
    genre_corpus = None
    if genre_compare:
        if not os.path.exists('genre_corpus'):
            raise ValueError("""Error: specified genre clustering but genre_corpus/ directory does not exist.""")
        genre_corpus = french_main.GenreCorpus()

    corpus.group_articles_by_publication()
    for text in corpus.texts:
        self.filter_tags(processed_dir, tag_filter, text)
    if genre_compare:
        for text in genre_corpus.texts:
            self.filter_tags(processed_dir, tag_filter, text, genre_text=True)
"""Cluster corpus texts with TF-IDF + KMeans and plot a 2-D PCA projection."""
import main
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

corpus = main.Corpus()

# Map filename -> raw text for vectorization.
token_dict = {}
for text in corpus.texts:
    token_dict[text.filename] = text.text

tfidf = TfidfVectorizer()
tfs = tfidf.fit_transform(token_dict.values())

names = [text.filename for text in corpus.texts]

# Fit and then predict will try and slot them into the n clusters.
fitted = KMeans(n_clusters=5).fit(tfs)
classes = fitted.predict(tfs)

# Project to 2-D for plotting (only the first two components are drawn).
pca = PCA(n_components=5)
pca_transf = pca.fit_transform(tfs.toarray())

plt.scatter(pca_transf[:, 0], pca_transf[:, 1], c=classes, s=500)
for i in range(len(classes)):
    plt.text(pca_transf[i, 0], pca_transf[i, 1], s=names[i])

# BUG FIX: was a bare `savefig(...)` (undefined name) called AFTER
# plt.show(), which would both raise NameError and — once qualified —
# save an empty figure. Save before showing, via plt.savefig.
plt.savefig('clustering.png')
plt.show()
import main

if __name__ == '__main__':
    c = main.Corpus()
    # BUG FIX: instances have no __name__ attribute — `c.__name__` raised
    # AttributeError. Print the class name instead (matches task1's idiom).
    print(c.__class__.__name__)
def task1():
    """Build a Corpus and print the name of its class."""
    corpus_obj = main.Corpus()
    print(corpus_obj.__class__.__name__)