def setUp(self) -> None:
    self.corpus = Elsevier_Corpus_Reader.ScopusProcessedCorpusReader(
        "Corpus/Processed_corpus/")
    self.loader = Elsevier_Corpus_Reader.CorpuKfoldLoader(self.corpus,
                                                          n_folds=12,
                                                          shuffle=False)
    self.subset = next(self.loader.fileids(test=True))
    self.model = Pipeline([("norm", Corpus_Vectorizer.TitleNormalizer()),
                           ("vect", Corpus_Vectorizer.OneHotVectorizer()),
                           ('clusters',
                            Corpus_Cluster.HierarchicalClustering())])
def plot_term_occurance_over_time(corpus, n=30, fileids=None):
    # ##################################################
    # Plot mentions of the most frequent terms over time
    # ##################################################
    frequent_terms, _ = most_common_terms(corpus, n=n, fileids=fileids)
    docs = corpus.title_tagged(fileids=fileids)

    normalizer = Corpus_Vectorizer.TextNormalizer()
    normed = normalizer.transform(docs)

    dates = corpus.pub_date(form='year', fileids=fileids)

    x, y = [], []
    for doc, date in zip(normed, dates):
        for i, term in enumerate(frequent_terms):
            if term in doc:
                x.append(date)
                y.append(i)

    # Create the plot
    fig, ax = plt.subplots(figsize=(12, 6))

    plt.plot(x, y, "*")

    plt.yticks(list(range(len(frequent_terms))), frequent_terms, size=8)
    plt.ylim(-1, len(frequent_terms))
    plt.title("Character Mentions in the Wizard of Oz")
    plt.show()
def most_common_terms(corpus, n=50, fileids=None):
    # get the most common words in the corpus
    words = corpus.title_tagged(fileids=fileids)
    normalizer = Corpus_Vectorizer.TextNormalizer()
    normed = (sent for title in normalizer.transform(words) for sent in title)
    word_count = Counter(normed)

    return zip(*word_count.most_common(n))
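# Usage sketch (assumptions): the corpus path and k-fold loader mirror the
# setUp fragment at the top of this page; adjust the import to the actual
# project layout.
import Elsevier_Corpus_Reader  # assumed top-level module

corpus = Elsevier_Corpus_Reader.ScopusProcessedCorpusReader(
    "Corpus/Processed_corpus/")
loader = Elsevier_Corpus_Reader.CorpuKfoldLoader(corpus, n_folds=12,
                                                 shuffle=False)
subset = next(loader.fileids(test=True))

# plot the 30 most frequent title terms against publication year
plot_term_occurance_over_time(corpus, n=30, fileids=subset)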
Example #4
    def test_transform(self):
        target = 9

        docs = list(self.corpus.title_tagged(fileids=self.subset))
        labels = [
            self.corpus.categories(fileids=fileid)[0]
            for fileid in self.subset
        ]
        normal = Corpus_Vectorizer.TextNormalizer()
        normal.fit(docs, labels)
        normed = normal.transform(docs)

        vec = Corpus_Vectorizer.CorpusFrequencyVector()
        vector = vec.fit_transform(normed)

        result = list(vector)[0].toarray().sum()

        self.assertEqual(result, target)
Example #5
    def test_transform(self):
        target = ['histologic', 'evaluation', 'implant', 'follow', 'flapless',
                  'flap', 'surgery', 'study', 'canine']
        docs = list(self.corpus.title_tagged(fileids=self.subset))
        labels = [
            self.corpus.categories(fileids=fileid)[0]
            for fileid in self.subset
        ]
        normal = Corpus_Vectorizer.TextNormalizer()
        normal.fit(docs, labels)
        result = list(normal.transform(docs))[0]

        self.assertEqual(result, target)
Example #6
    def test_titleNormalizer(self):
        target = 'histologic evaluation implant follow flapless flap surgery ' \
                 'study canine'
        docs = list(self.corpus.title_tagged(fileids=self.subset))
        labels = [
            self.corpus.categories(fileids=fileid)[0]
            for fileid in self.subset
        ]
        normal = Corpus_Vectorizer.TitleNormalizer()
        normal.fit(docs, labels)
        result = list(normal.transform(docs))[0]

        self.assertEqual(result, target)
    def __init__(self, n_components=50, estimator='LDA'):
        """
        n_topics is the desired number of topics
        To use Latent Semantic Analysis, set estimator to 'LSA',
        To use Non-Negative Matrix Factorization, set estimator to 'NMF',
        otherwise, defaults to Latent Dirichlet Allocation ('LDA').
        """
        self.n_components = n_components

        if estimator == 'LSA':
            self.estimator = TruncatedSVD(n_components=self.n_components)
        elif estimator == 'NMF':
            self.estimator = NMF(n_components=self.n_components)
        else:
            self.estimator = LatentDirichletAllocation(
                n_components=self.n_components)

        self.model = Pipeline([
            ('norm', Corpus_Vectorizer.TextNormalizer()),
            # ('vect', Corpus_Vectorizer.CorpusFrequencyVector()),
            # ('vect', Corpus_Vectorizer.CorpusOneHotVector()),
            ('vect', Corpus_Vectorizer.CorpusTFIDVector()),
            ('model', self.estimator)
        ])
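# Usage sketch: the constructor above is shown without its enclosing class, so
# the name `TopicModel` below is a hypothetical stand-in used only to
# illustrate the estimator switch.
lda = TopicModel(n_components=20, estimator='LDA')  # Latent Dirichlet Allocation (default)
lsa = TopicModel(n_components=20, estimator='LSA')  # TruncatedSVD / Latent Semantic Analysis
nmf = TopicModel(n_components=20, estimator='NMF')  # Non-Negative Matrix Factorization

# each instance wraps normalisation, TF-IDF vectorisation and the chosen
# decomposition in self.model, so fitting might look like:
# lda.model.fit(corpus.title_tagged(fileids=subset))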
def cooccurrence(corpus, terms, fileids=None):
    # get the possible pairs from the most common words
    possible_pairs = list(itertools.combinations(terms, 2))

    # create an empty dictionary containing an entry for each pair of words
    cooccurring = dict.fromkeys(possible_pairs, 0)

    # run through each document title and increment the co-occurrence count of each pair
    docs = corpus.title_tagged(fileids=fileids)
    normalizer = Corpus_Vectorizer.TextNormalizer()
    normed = (dd for dd in normalizer.transform(docs))
    for doc in normed:
        for pair in possible_pairs:
            if pair[0] in doc and pair[1] in doc:
                cooccurring[pair] += 1
    return cooccurring
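# Usage sketch: feed the most common title terms into the counter above;
# `corpus` and `subset` are assumed to be set up as in the setUp fragment
# at the top of this page.
frequent_terms, _ = most_common_terms(corpus, n=30, fileids=subset)
pairs = cooccurrence(corpus, terms=frequent_terms, fileids=subset)

# print the ten pairs of terms that co-occur most often
for pair, count in sorted(pairs.items(), key=lambda kv: kv[1], reverse=True)[:10]:
    print(pair, count)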
def plot_tsne_clusters(corpus, fileids=None, labels=None):
    from yellowbrick.text import TSNEVisualizer
    from sklearn.feature_extraction.text import TfidfVectorizer

    words = corpus.title_tagged(fileids=fileids)
    normalizer = Corpus_Vectorizer.TextNormalizer()
    normed = (sent for title in normalizer.transform(words) for sent in title)
    # normed = (dd for dd in normalizer.transform(docs))
    tfidf = TfidfVectorizer()
    procd = tfidf.fit_transform(normed)

    tsne = TSNEVisualizer()
    if labels is None:
        tsne.fit(procd)
    else:
        tsne.fit(procd, ["c{}".format(c) for c in labels])
    tsne.poof()
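# Usage sketch: project the test fold with t-SNE and colour each title by its
# first category; corpus.categories() is assumed to behave as in the test
# fixtures earlier on this page.
labels = [corpus.categories(fileids=fileid)[0] for fileid in subset]
plot_tsne_clusters(corpus, fileids=subset, labels=labels)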
def matrix(corpus, terms, fileids=None):
    # # get the most common words in the corpus
    # frequent_terms, _ = most_common_terms(corpus, n=n, fileids=fileids)
    # for term in frequent_terms: print(term)

    # run through each document title and count co-occurrences of terms
    docs = corpus.title_tagged(fileids=fileids)
    normalizer = Corpus_Vectorizer.TextNormalizer()
    # materialise the normalised titles so they can be iterated once per term pair
    normed = list(normalizer.transform(docs))

    mtx = []
    for first in terms:
        row = []
        for second in terms:
            count = 0
            for doc in normed:
                if first in doc and second in doc:
                    count += 1
            row.append(count)
        mtx.append(row)
    return mtx
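# Usage sketch: render the term-by-term counts as a heatmap; only matplotlib
# is used here, and the terms are assumed to come from most_common_terms()
# as the commented lines above suggest.
import matplotlib.pyplot as plt

frequent_terms, _ = most_common_terms(corpus, n=30, fileids=subset)
mtx = matrix(corpus, terms=frequent_terms, fileids=subset)

fig, ax = plt.subplots(figsize=(9, 8))
im = ax.imshow(mtx, cmap="Blues")
ax.set_xticks(range(len(frequent_terms)))
ax.set_yticks(range(len(frequent_terms)))
ax.set_xticklabels(frequent_terms, rotation=90, size=7)
ax.set_yticklabels(frequent_terms, size=7)
fig.colorbar(im, ax=ax)
plt.show()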
    # # Plot hierarchical clustering
    # model = Pipeline([
    #     ("norm", Corpus_Vectorizer.TitleNormalizer()),
    #     ("vect", Corpus_Vectorizer.OneHotVectorizer()),
    #     ('clusters', Corpus_Cluster.HierarchicalClustering())
    # ])
    #
    # clusters = model.fit_transform(docs)
    # labels = model.named_steps['clusters'].labels
    # children = model.named_steps['clusters'].children
    #
    # plot_dendrogram(children)

    # decompose data to 2D
    reduce = Pipeline([
        ("norm", Corpus_Vectorizer.TitleNormalizer()),
        ("vect", Corpus_Vectorizer.OneHotVectorizer()),
        ('pca', PCA(n_components=2))
    ])

    X2d = reduce.fit_transform(docs)

    # plot Kmeans
    model = Pipeline([
        ("norm", Corpus_Vectorizer.TitleNormalizer()),
        ("vect", Corpus_Vectorizer.OneHotVectorizer()),
        ('clusters', Corpus_Cluster.MiniBatchKMeansClusters(k=3))
    ])

    clusters = model.fit_transform(docs)
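    # Sketch of one way to finish this fragment: scatter the 2D PCA projection
    # and colour each title by its k-means cluster label. Treating the output
    # of MiniBatchKMeansClusters.fit_transform() as one integer label per
    # document is an assumption, since the class is not shown here.
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(X2d[:, 0], X2d[:, 1], c=list(clusters), cmap="tab10", s=10)
    ax.set_title("Mini-batch k-means clusters of titles (PCA projection)")
    plt.show()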
Example #12
    def test_tokenize(self):
        target = ['the', 'eleph', 'sneez', 'at', 'the', 'sight', 'of', 'potato']
        result = [word for word in Corpus_Vectorizer.tokenize(self.corpus[0])]
        self.assertEqual(result, target)
        self.assertEqual(len(result), len(target))