def process_corpus(self, args):
        """Recreate the processed-output directory and filter every corpus text.

        Reads ``args.processed_dir``, ``args.tag_filter`` and
        ``args.genre_compare``. The directory named by ``processed_dir`` is
        removed if it exists and created again empty; then every text in
        ``french_main.Corpus().texts`` is handed to ``self.filter_tags``.
        When ``args.genre_compare`` equals the string ``"True"``, the texts of
        ``french_main.GenreCorpus()`` are handed to ``self.filter_tags`` as
        well, with ``genre_text=True``.

        Raises:
            ValueError: if genre comparison was requested but the
                ``genre_corpus`` path does not exist. The check runs before
                anything on disk is modified.
        """
        processed_dir = args.processed_dir
        tag_filter = args.tag_filter
        # The option is compared against the literal string "True", not a
        # bool; evaluate it once so every branch agrees.
        compare_genres = args.genre_compare == "True"

        # Fail fast: validate the request before the destructive rmtree below,
        # so a bad invocation does not wipe previously processed output.
        if compare_genres and not os.path.exists('genre_corpus'):
            raise ValueError("""Error: specified genre clustering but
                 genre_corpus/ directory does not exist.""")

        # Start from an empty output directory on every run.
        if os.path.exists(processed_dir):
            shutil.rmtree(processed_dir)
        os.makedirs(processed_dir)

        corpus = french_main.Corpus()
        # Build the genre corpus at the same point as before (ahead of the
        # grouping call) in case construction order matters to french_main.
        genre_corpus = french_main.GenreCorpus() if compare_genres else None

        corpus.group_articles_by_publication()
        for text in corpus.texts:
            self.filter_tags(processed_dir, tag_filter, text)

        if genre_corpus is not None:
            for text in genre_corpus.texts:
                self.filter_tags(processed_dir,
                                 tag_filter,
                                 text,
                                 genre_text=True)
Esempio n. 2
0
import main
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# from bokeh.plotting import figure, output_file, show

# Cluster the corpus texts with k-means on TF-IDF vectors and plot the
# clusters on the first two PCA components, labelling each point with the
# filename of its text.
corpus = main.Corpus()

# Filenames in corpus order; used below as point labels.
names = [text.filename for text in corpus.texts]

# filename -> raw text lookup.
token_dict = {}
# common_lemmas = [FreqDist(text.stems).most_common(3000)
# for text in corpus.texts]
for text in corpus.texts:
    token_dict[text.filename] = text.text

# Vectorize in corpus order so that row i of the matrix always corresponds
# to names[i]. Relying on dict value order can misalign labels and points
# (unordered dicts on old interpreters, or duplicate filenames).
documents = [text.text for text in corpus.texts]

tfidf = TfidfVectorizer()
tfs = tfidf.fit_transform(documents)

# fit and then predict will try and slot them into the nclusters
fitted = KMeans(n_clusters=5).fit(tfs)
classes = fitted.predict(tfs)

# PCA needs a dense array; only the first two components are plotted.
pca = PCA(n_components=5)
sklearn_transf = pca.fit_transform(tfs.toarray())

plt.scatter(sklearn_transf[:, 0], sklearn_transf[:, 1], c=classes, s=500)
for point, name in zip(sklearn_transf, names):
    plt.text(point[0], point[1], s=name)

# Save before show(): once the interactive window is closed the current
# figure may be empty. The original also called a bare, undefined savefig().
plt.savefig('clustering.png')
plt.show()
Esempio n. 3
0
import main

if __name__ == '__main__':
    # Build the corpus and print a name for it.
    c = main.Corpus()
    # Instances do not normally carry a ``__name__`` attribute (classes do),
    # so fall back to the class name instead of raising AttributeError.
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(getattr(c, '__name__', c.__class__.__name__))
Esempio n. 4
0
def task1():
    """Instantiate ``main.Corpus`` and print the name of its class."""
    c = main.Corpus()
    # Call form of print: valid on Python 3 and, with a single argument,
    # prints the same text on Python 2.
    print(c.__class__.__name__)