def setUp(self) -> None:
    self.corpus = Elsevier_Corpus_Reader.ScopusProcessedCorpusReader(
        "Corpus/Processed_corpus/")
    self.loader = Elsevier_Corpus_Reader.CorpuKfoldLoader(self.corpus,
                                                          n_folds=12,
                                                          shuffle=False)
    self.subset = next(self.loader.fileids(test=True))
    self.model = Pipeline([
        ("norm", Corpus_Vectorizer.TitleNormalizer()),
        ("vect", Corpus_Vectorizer.OneHotVectorizer()),
        ("clusters", Corpus_Cluster.HierarchicalClustering())
    ])
def plot_term_occurance_over_time(corpus, n=30, fileids=None):
    # #############################################
    # Plot mentions of frequent terms over time
    # #############################################
    frequent_terms, _ = most_common_terms(corpus, n=n, fileids=fileids)
    docs = corpus.title_tagged(fileids=fileids)
    normalizer = Corpus_Vectorizer.TextNormalizer()
    normed = normalizer.transform(docs)
    dates = corpus.pub_date(form='year', fileids=fileids)

    x, y = [], []
    for doc, date in zip(normed, dates):
        for i, term in enumerate(frequent_terms):
            if term in doc:
                x.append(date)
                y.append(i)

    # Create the plot
    fig, ax = plt.subplots(figsize=(12, 6))
    plt.plot(x, y, "*")
    plt.yticks(list(range(len(frequent_terms))), frequent_terms, size=8)
    plt.ylim(-1, len(frequent_terms))
    plt.title("Frequent term mentions over time")
    plt.show()
def most_common_terms(corpus, n=50, fileids=None):
    # get the most common words in the corpus titles
    words = corpus.title_tagged(fileids=fileids)
    normalizer = Corpus_Vectorizer.TextNormalizer()
    normed = (word for title in normalizer.transform(words)
              for word in title)
    word_count = Counter(normed)
    return zip(*word_count.most_common(n))
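# Usage sketch (assumes a loaded `corpus` and a `subset` of fileids as in
# setUp above): most_common_terms returns two parallel tuples, the terms
# and their counts, so the zipped result unpacks directly.
terms, counts = most_common_terms(corpus, n=30, fileids=subset)
for term, count in zip(terms, counts):
    print("{}: {}".format(term, count))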
def test_transform(self):
    target = 9
    docs = list(self.corpus.title_tagged(fileids=self.subset))
    labels = [
        self.corpus.categories(fileids=fileid)[0]
        for fileid in self.subset
    ]
    normal = Corpus_Vectorizer.TextNormalizer()
    normal.fit(docs, labels)
    normed = normal.transform(docs)
    vec = Corpus_Vectorizer.CorpusFrequencyVector()
    vector = vec.fit_transform(normed)
    result = list(vector)[0].toarray().sum()
    self.assertEqual(result, target)
def test_transform(self):
    target = ['histologic', 'evaluation', 'implant', 'follow', 'flapless',
              'flap', 'surgery', 'study', 'canine']
    docs = list(self.corpus.title_tagged(fileids=self.subset))
    labels = [
        self.corpus.categories(fileids=fileid)[0]
        for fileid in self.subset
    ]
    normal = Corpus_Vectorizer.TextNormalizer()
    normal.fit(docs, labels)
    result = list(normal.transform(docs))[0]
    self.assertEqual(result, target)
def test_titleNormalizer(self):
    target = 'histologic evaluation implant follow flapless flap surgery ' \
             'study canine'
    docs = list(self.corpus.title_tagged(fileids=self.subset))
    labels = [
        self.corpus.categories(fileids=fileid)[0]
        for fileid in self.subset
    ]
    normal = Corpus_Vectorizer.TitleNormalizer()
    normal.fit(docs, labels)
    result = list(normal.transform(docs))[0]
    self.assertEqual(result, target)
def __init__(self, n_components=50, estimator='LDA'):
    """
    n_components is the desired number of topics.
    To use Latent Semantic Analysis, set estimator to 'LSA';
    to use Non-Negative Matrix Factorization, set estimator to 'NMF';
    otherwise, defaults to Latent Dirichlet Allocation ('LDA').
    """
    self.n_components = n_components
    if estimator == 'LSA':
        self.estimator = TruncatedSVD(n_components=self.n_components)
    elif estimator == 'NMF':
        self.estimator = NMF(n_components=self.n_components)
    else:
        self.estimator = LatentDirichletAllocation(
            n_components=self.n_components)
    self.model = Pipeline([
        ('norm', Corpus_Vectorizer.TextNormalizer()),
        # ('vect', Corpus_Vectorizer.CorpusFrequencyVector()),
        # ('vect', Corpus_Vectorizer.CorpusOneHotVector()),
        ('vect', Corpus_Vectorizer.CorpusTFIDVector()),
        ('model', self.estimator)
    ])
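# Usage sketch: the class name `SklearnTopicModels` is an assumption, since
# the excerpt only shows __init__. Any of the three estimators is selected
# by name; the normalize -> vectorize -> model pipeline is built the same way.
lda = SklearnTopicModels(n_components=20, estimator='LDA')  # hypothetical class name
nmf = SklearnTopicModels(n_components=20, estimator='NMF')
lsa = SklearnTopicModels(n_components=20, estimator='LSA')
docs = corpus.title_tagged(fileids=subset)
lda.model.fit_transform(docs)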
def cooccurrence(corpus, terms, fileids=None):
    # get the possible pairs from the most common words
    possible_pairs = list(itertools.combinations(terms, 2))
    # create an empty dictionary with an entry for each pair of words
    cooccurring = dict.fromkeys(possible_pairs, 0)
    # run through each document title and increment the co-occurrence
    # count for each pair of terms that appears in it
    docs = corpus.title_tagged(fileids=fileids)
    normalizer = Corpus_Vectorizer.TextNormalizer()
    normed = normalizer.transform(docs)
    for doc in normed:
        for pair in possible_pairs:
            if pair[0] in doc and pair[1] in doc:
                cooccurring[pair] += 1
    return cooccurring
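# Usage sketch (assumes `corpus` and `subset` as above): pair the most
# frequent title terms and count how often each pair shares a title, then
# show the ten strongest pairs.
frequent_terms, _ = most_common_terms(corpus, n=30, fileids=subset)
pairs = cooccurrence(corpus, terms=frequent_terms, fileids=subset)
top = sorted(pairs.items(), key=lambda kv: kv[1], reverse=True)[:10]
for pair, count in top:
    print(pair, count)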
def plot_tsne_clusters(corpus, fileids=None, labels=None):
    from yellowbrick.text import TSNEVisualizer
    from sklearn.feature_extraction.text import TfidfVectorizer

    words = corpus.title_tagged(fileids=fileids)
    normalizer = Corpus_Vectorizer.TextNormalizer()
    normed = (word for title in normalizer.transform(words)
              for word in title)
    tfidf = TfidfVectorizer()
    procd = tfidf.fit_transform(normed)

    tsne = TSNEVisualizer()
    if labels is None:
        tsne.fit(procd)
    else:
        tsne.fit(procd, ["c{}".format(c) for c in labels])
    tsne.poof()
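# Usage sketch (assumes `corpus` and `subset` as above): colour the t-SNE
# projection by each document's first category label, mirroring how the
# tests build their label lists.
labels = [corpus.categories(fileids=fileid)[0] for fileid in subset]
plot_tsne_clusters(corpus, fileids=subset, labels=labels)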
def matrix(corpus, terms, fileids=None):
    # run through each document title and count the co-occurrence of
    # every pair of terms
    docs = corpus.title_tagged(fileids=fileids)
    normalizer = Corpus_Vectorizer.TextNormalizer()
    # materialise the generator so it can be iterated once per term pair
    normed = list(normalizer.transform(docs))

    mtx = []
    for first in terms:
        row = []
        for second in terms:
            count = 0
            for doc in normed:
                if first in doc and second in doc:
                    count += 1
            row.append(count)
        mtx.append(row)
    return mtx
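# Usage sketch (assumes `corpus`, `subset`, and matplotlib as above): build
# the co-occurrence matrix for the most frequent terms and render it as a
# heatmap. `imshow` is standard matplotlib, not part of this codebase.
frequent_terms, _ = most_common_terms(corpus, n=30, fileids=subset)
mtx = matrix(corpus, terms=frequent_terms, fileids=subset)
fig, ax = plt.subplots(figsize=(9, 9))
ax.imshow(mtx, cmap='Blues')
ax.set_xticks(range(len(frequent_terms)))
ax.set_xticklabels(frequent_terms, rotation=90, size=6)
ax.set_yticks(range(len(frequent_terms)))
ax.set_yticklabels(frequent_terms, size=6)
plt.show()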
# # Plot hierarchical clustering
# model = Pipeline([
#     ("norm", Corpus_Vectorizer.TitleNormalizer()),
#     ("vect", Corpus_Vectorizer.OneHotVectorizer()),
#     ('clusters', Corpus_Cluster.HierarchicalClustering())
# ])
#
# clusters = model.fit_transform(docs)
# labels = model.named_steps['clusters'].labels
# children = model.named_steps['clusters'].children
#
# plot_dendrogram(children)

# decompose data to 2D
reduce = Pipeline([
    ("norm", Corpus_Vectorizer.TitleNormalizer()),
    ("vect", Corpus_Vectorizer.OneHotVectorizer()),
    ('pca', PCA(n_components=2))
])
X2d = reduce.fit_transform(docs)

# plot K-means clusters
model = Pipeline([
    ("norm", Corpus_Vectorizer.TitleNormalizer()),
    ("vect", Corpus_Vectorizer.OneHotVectorizer()),
    ('clusters', Corpus_Cluster.MiniBatchKMeansClusters(k=3))
])
clusters = model.fit_transform(docs)
def test_tokenize(self):
    target = ['the', 'eleph', 'sneez', 'at', 'the', 'sight', 'of',
              'potato']
    result = list(Corpus_Vectorizer.tokenize(self.corpus[0]))
    self.assertEqual(result, target)
    self.assertEqual(len(result), len(target))