Example #1
def main():
    """
    processing text
    """
    import pickle

    from reader import PickledCorpusReader

    # Classification model
    model_file = 'model-SGDClassifier.pickle'

    # Initialize model from pickle
    with open(model_file, 'rb') as f:
        model = pickle.load(f)

    # Initialize a corpus reader
    corpus = PickledCorpusReader('./pickle_corpus/')

    # Call the model to classify text
    y_pred, x_pred = predict_model(model,
                                   corpus,
                                   categories=['artistic_event'])

    # Print result
    count = nartistics = 0
    for result in y_pred:
        if result != 'artistic_event':
            for doc in x_pred[count]:
                print_event(doc)
            print('======================')
            nartistics += 1
        count += 1
    print('{:d} artistic events found / {:d} events'.format(nartistics, count))
Example #2
def train_model(path, model, saveto=None, cv=12):
    """
    Trains model from corpus at specified path; constructing cross-validation
    scores using the cv parameter, then fitting the model on the full data and
    writing it to disk at the saveto path if specified. Returns the scores.
    """
    # Load the corpus data and labels for classification
    corpus = PickledCorpusReader(path)
    X = documents(corpus)
    y = labels(corpus)

    # Compute cross validation scores
    scores = cross_val_score(model, X, y, cv=cv)

    # Fit the model on entire data set
    model.fit(X, y)

    # Write to disk if specified
    if saveto:
        joblib.dump(model, saveto)

    # Return scores as well as training time via decorator
    return scores
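
# A minimal usage sketch (an assumption, not part of the original snippet): wrap
# an estimator in a pipeline (see the create_pipeline examples further down) and
# train it with the helper above. The corpus path and output file are illustrative.
from sklearn.linear_model import SGDClassifier

model = create_pipeline(SGDClassifier(), False)
scores = train_model('../corpus', model, saveto='model-SGDClassifier.pickle', cv=12)
print(scores)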
Example #3
from reader import PickledCorpusReader

reader = PickledCorpusReader('../corpus')

for category in reader.categories():

    n_docs = len(reader.fileids(categories=[category]))
    n_words = sum(1 for word in reader.words(categories=[category]))

    print("- '{}' contains {:,} docs and {:,} words".format(
        category, n_docs, n_words))
Example #4
            table.append(row)

    table.sort(key=lambda r: r[-1], reverse=True)
    print(tabulate.tabulate(table, headers=fields))


if __name__ == '__main__':
    results_file = "results.json"
    labels = [
        "artistic_event",
        "other_event",
    ]

    # Initializing corpus reader and loader (generates K-Folds)
    reader = PickledCorpusReader('./pickle_corpus')
    loader = CorpusLoader(reader, 5, shuffle=True, categories=labels)

    txt = TextNormalizer()
    txt.lemmatize("qu'", "")

    # Initializing models
    models = []
    for form in (LogisticRegression, SGDClassifier):
        models.append(create_pipeline(form(), True))
        models.append(create_pipeline(form(), False))

    models.append(create_pipeline(MultinomialNB(), False))
    models.append(create_pipeline(GaussianNB(), True))

    # Running all models
Example #5
from reader import PickledCorpusReader

reader = PickledCorpusReader('../corpus')

for category in reader.categories():

    n_docs = len(reader.fileids(categories=[category]))
    n_words = sum(1 for word in reader.words(categories=[category]))

    print("- '{}' contains {:,} docs and {:,} words".format(category, n_docs, n_words))
Example #6
def graph(docs):
    G = nx.Graph()
    for doc in docs:
        for pair in pairs(doc):
            if (pair[0][0], pair[1][0]) in G.edges():
                G.edges[(pair[0][0], pair[1][0])]['weight'] += 1
            else:
                G.add_edge(pair[0][0], pair[1][0], weight=1)
    return G
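

# graph() relies on a pairs() helper that is not defined in this snippet. A
# minimal sketch (an assumption), treating each doc as an iterable of
# (entity, label) tuples:
from itertools import combinations


def pairs(doc):
    # Yield every unordered pair of (entity, label) tuples in the document
    return combinations(doc, 2)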


def nbest_centrality(G, metric, n=10, attr="centrality", **kwargs):
    # Compute the centrality scores for each vertex
    scores = metric(G, **kwargs)

    # Set the score as a property on each node
    nx.set_node_attributes(G, name=attr, values=scores)

    # Find the top n scores and print them along with their index
    topn = heapq.nlargest(n, scores.items(), key=itemgetter(1))
    for idx, item in enumerate(topn):
        print("{}. {}: {:0.4f}".format(idx + 1, *item))

    return G


if __name__ == '__main__':
    corpus = PickledCorpusReader("../corpus")
    docs = corpus.docs()
    G = graph(docs)
Example #7
            self.estimator = lsimodel.LsiTransformer(num_topics=self.n_topics)
        else:
            self.estimator = ldamodel.LdaTransformer(num_topics=self.n_topics)

        self.model = Pipeline([('norm', TextNormalizer()),
                               ('vect', GensimTfidfVectorizer()),
                               ('model', self.estimator)])

    def fit(self, documents):
        self.model.fit(documents)

        return self.model


if __name__ == '__main__':
    corpus = PickledCorpusReader('../corpus')

    # With Sklearn
    skmodel = SklearnTopicModels(estimator='NMF')
    documents = corpus.docs()

    skmodel.fit_transform(documents)
    topics = skmodel.get_topics()
    for topic, terms in topics.items():
        print("Topic #{}:".format(topic + 1))
        print(terms)

    # # With Gensim
    # gmodel = GensimTopicModels(estimator='LSA')
    #
    # docs = [
Example #8
                trees = ne_chunk(sentence)
                for tree in trees:
                    if hasattr(tree, 'label'):
                        if tree.label() in self.labels:
                            entities.append(
                                ' '.join([child[0].lower() for child in tree])
                                )
        return entities

    def fit(self, documents, labels=None):
        return self

    def transform(self, documents):
        for document in documents:
            yield self.get_entities(document)


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('../corpus')
    docs = corpus.docs()

    phrase_extractor = KeyphraseExtractor()
    keyphrases = list(phrase_extractor.fit_transform(docs))
    print(keyphrases[0])

    entity_extractor = EntityExtractor()
    entities = list(entity_extractor.fit_transform(docs))
    print(entities[0])
Example #9
        self.save()
        return self

    def transform(self, documents):
        def generator():
            for document in documents:
                vec = self.tfidf[self.lexicon.doc2bow(document)]
                if self.tofull:
                    yield sparse2full(vec)
                else:
                    yield vec

        return list(generator())


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('../corpus')
    docs = [
        list(corpus.docs(fileids=fileid))[0] for fileid in corpus.fileids()
    ]

    model = Pipeline([('norm', TextNormalizer()),
                      ('vect', GensimTfidfVectorizer()),
                      ('lda', ldamodel.LdaTransformer())])

    model.fit_transform(docs)

    print(model.named_steps['norm'])
Example #10
from sklearn.feature_extraction.text import TfidfVectorizer

from reader import PickledCorpusReader
from normalizer import TextNormalizer


def identity(words):
    return words


corpus = PickledCorpusReader('../../corpora/Pickled_Corpus_Sample')

normalizer = TextNormalizer()
docs = normalizer.fit_transform(corpus.docs())

vectorizer = TfidfVectorizer(tokenizer=identity,
                             preprocessor=None,
                             lowercase=False)

vectors = vectorizer.fit_transform(docs)

print(vectors.shape)
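
# Optional sanity check (assumes scikit-learn >= 1.0 for get_feature_names_out):
# inspect a few of the learned vocabulary terms.
print(vectorizer.get_feature_names_out()[:10])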
Example #11
            self.estimator = ldamodel.LdaTransformer(num_topics=self.n_topics)

        self.model = Pipeline([
            ('norm', TextNormalizer()),
            ('vect', GensimTfidfVectorizer()),
            ('model', self.estimator)
        ])

    def fit(self, documents):
        self.model.fit(documents)

        return self.model


if __name__ == '__main__':
    corpus = PickledCorpusReader('../corpus')

    # With Sklearn
    skmodel = SklearnTopicModels(estimator='NMF')
    documents = corpus.docs()

    skmodel.fit_transform(documents)
    topics = skmodel.get_topics()
    for topic, terms in topics.items():
        print("Topic #{}:".format(topic+1))
        print(terms)

    # # With Gensim
    # gmodel = GensimTopicModels(estimator='LSA')
    #
    # docs = [
Example #12
    def __iter__(self):
        for train_index, test_index in self.folds.split(self.files):
            X_train = self.documents(train_index)
            y_train = self.labels(train_index)

            X_test = self.documents(test_index)
            y_test = self.labels(test_index)

            yield X_train, X_test, y_train, y_test


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('../pickle_corpus')
    loader = CorpusLoader(
        corpus,
        12,
        categories=["expositions", "expositions_galeries", 'marches_salons'])
    fcount = dcount = pcount = scount = tcount = 0
    for x_train, x_test, y_train, y_test in loader:
        for file in x_train:
            fcount += 1
            for doc in file:
                dcount += 1
                for para in doc:
                    #print(para)
                    #key=input('>>')
                    pcount += 1
                    #if pcount >= 1 :
Example #13
    def score(self, word, context):
        """
        Use KneserNeyProbDist from NLTK to get score
        """
        trigram = tuple((context[0], context[1], word))
        return self.model.prob(trigram)

    def samples(self):
        return self.model.samples()

    def prob(self, sample):
        return self.model.prob(sample)


if __name__ == '__main__':
    corpus = PickledCorpusReader('../corpus')
    tokens = [''.join(word) for word in corpus.words()]
    vocab = Counter(tokens)
    sents = list([word[0] for word in sent] for sent in corpus.sents())

    counter = count_ngrams(3, vocab, sents)
    knm = KneserNeyModel(counter)


    def complete(input_text):
        tokenized = nltk.word_tokenize(input_text)
        if len(tokenized) < 2:
            response = "Say more."
        else:
            completions = {}
            for sample in knm.samples():
Example #14
        self.model = KMeansClusterer(self.k,
                                     distance=cosine,
                                     avoid_empty_clusters=True)
        self.model.cluster([
            self.vectorize(corpus.words(fileid))
            for fileid in corpus.fileids(categories=['news'])
        ])

    def classify(self, document):
        """
        Pass through to the internal model classify
        """
        return self.model.classify(self.vectorize(document))


if __name__ == '__main__':
    corpus = PickledCorpusReader('../corpus')

    clusterer = KMeansTopics(corpus, k=7)
    clusterer.cluster(corpus)

    # Classify documents in the new corpus by cluster affinity
    groups = [(clusterer.classify(corpus.words(fileid)), fileid)
              for fileid in corpus.fileids(categories=['news'])]

    # Group documents in corpus by cluster and display them
    groups.sort(key=itemgetter(0))
    for group, items in groupby(groups, key=itemgetter(0)):
        for cluster, fname in items:
            print("Cluster {}: {}".format(cluster + 1, fname))
Example #15
    # Compute the centrality scores for each vertex
    scores = metric(G, **kwargs)

    # Set the score as a property on each node
    nx.set_node_attributes(G, name=attr, values=scores)

    # Find the top n scores and print them along with their index
    topn = heapq.nlargest(n, scores.items(), key=itemgetter(1))
    for idx, item in enumerate(topn):
        print("{}. {}: {:0.4f}".format(idx + 1, *item))

    return G


if __name__ == '__main__':
    corpus = PickledCorpusReader(
        '/Users/dd/PycharmProjects/atap/resources/sample/')
    G = graph(corpus)

    # # Write the graph to disk, if needed
    nx.write_graphml(G, "entities.graphml")

    # # Get summary stats for the full graph
    print(nx.info(G))

    # # find the most central entities in the social network
    print("Degree centrality")
    nbest_centrality(G, nx.degree_centrality)
    print("Betweenness centrality")
    nbest_centrality(G,
                     nx.betweenness_centrality,
                     10,
Example #16
    # Create the TF-IDF Model and compute the scores
    model = gensim.models.TfidfModel(vectors)
    scores = model[vectors]

    for doc in scores:
        yield [
            (lexicon[vec], score) for vec, score in doc
        ]


if __name__ == '__main__':

    import heapq

    from reader import PickledCorpusReader
    from collections import Counter

    corpus = PickledCorpusReader('../corpus')
    scores = scored_document_phrases([
        list(corpus.sents(fileids=fileid)) for fileid in corpus.fileids(categories=["politics", "news"])
    ], True)
    tfidfs = Counter()

    for phrases in scores:
        for phrase, score in phrases:
            tfidfs[phrase] += score

    print(
        tabulate(tfidfs.most_common(50), headers=["keyphrase", "cumulative tfidf"])
    )
Example #17
        indices = train_idx if train else test_idx
        return [
            fileid for doc_idx, fileid in enumerate(self.corpus.fileids())
            if doc_idx in indices
        ]

    def documents(self, fold=None, train=False, test=False):
        for fileid in self.fileids(fold, train, test):
            yield list(self.corpus.docs(fileids=fileid))

    def labels(self, fold=None, train=False, test=False):
        return [
            self.corpus.categories(fileids=fileid)[0]
            for fileid in self.fileids(fold, train, test)
        ]


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('corpus/tagcorpusoracle')
    for para in corpus.fileids(categories='281550031684823'):
        print(para)

    loader = CorpusLoader(corpus, 12)

    for fid in loader.fileids(0, test=True):
        print(fid)


    print(loader.labels(0, test=True))
Example #18
                              preprocessor=None,
                              lowercase=False))]

    if reduction:
        steps.append(('reduction', TruncatedSVD(n_components=1000)))

    # Add the estimator
    steps.append(('classifier', estimator))
    return Pipeline(steps)


labels = [
    "281571400036367", "281585707268948", "281723051068305", "281981394899011",
    "281988872632944", "281723051068312"
]
reader = PickledCorpusReader('corpus/tagcorpusoracle_test')
loader = CorpusLoader(reader, 5, shuffle=True, categories=labels)

models = []
for form in (LogisticRegression, SGDClassifier, DecisionTreeClassifier):
    models.append(create_pipeline(form(), True))
    models.append(create_pipeline(form(), False))

models.append(create_pipeline(MultinomialNB(), False))
models.append(create_pipeline(GaussianNB(), True))

import time
import json

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
Example #19
            yield list(self.reader.docs(fileids=[fileid]))

    def labels(self, idx=None):
        return [
            self.reader.categories(fileids=[fileid])[0]
            for fileid in self.fileids(idx)
        ]

    def __iter__(self):
        for train_index, test_index in self.folds.split(self.files):
            X_train = self.documents(train_index)
            y_train = self.labels(train_index)

            X_test = self.documents(test_index)
            y_test = self.labels(test_index)

            yield X_train, X_test, y_train, y_test


if __name__ == '__main__':
    from reader import PickledCorpusReader

    reader = PickledCorpusReader('../../corpora/Pickled_Corpus_Sample')
    loader = CorpusLoader(reader, 10)

    X_train, X_test, y_train, y_test = next(loader.__iter__())
    
    for i in range(2):
        print('\nDOC {}   LABEL: "{}"'.format(i+1, y_train[i]))
        print(next(X_train))
Example #20
        (
            "vectorize",
            TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False),
        ),
    ]

    if reduction:
        steps.append(("reduction", TruncatedSVD(n_components=10000)))

    # Add the estimator
    steps.append(("classifier", estimator))
    return Pipeline(steps)


labels = ["books", "cinema", "cooking", "gaming", "sports", "tech"]
reader = PickledCorpusReader("./corpus")
loader = CorpusLoader(reader, 5, shuffle=True, categories=labels)

models = []
for form in (LogisticRegression, SGDClassifier):
    models.append(create_pipeline(form(), True))
    models.append(create_pipeline(form(), False))

models.append(create_pipeline(MultinomialNB(), False))
models.append(create_pipeline(GaussianNB(), True))

import json
import time

from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)
Example #21
    # Compute the centrality scores for each vertex
    nbest = {}
    for name, metric in metrics.items():
        scores = metric(G)
        # Set the score as a property on each node
        nx.set_node_attributes(G, name=name, values=scores)
        # Find the top n scores and print them along with their index
        topn = heapq.nlargest(n, scores.items(), key=itemgetter(1))
        nbest[name] = topn
    return nbest


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader(root='../../corpora/politics_pickled')
    docs = corpus.docs()

    entity_extractor = EntityExtractor()
    entities = entity_extractor.fit_transform(docs)

    entity_pairing = EntityPairs()
    pairs = entity_pairing.fit_transform(entities)

    graph = GraphExtractor()
    G = graph.fit_transform(pairs)
    print(nx.info(G))

    centralities = {
        "Degree Centrality": nx.degree_centrality,
        "Betweenness Centrality": nx.betweenness_centrality
Example #22
from sklearn.model_selection import train_test_split as tts
from reader import PickledCorpusReader

reader = PickledCorpusReader('../corpus')

labels = ["books", "cinema", "cooking", "gaming", "sports", "tech"]
docs = reader.fileids(categories=labels)
X = list(reader.docs(fileids=docs))
y = [reader.categories(fileids=[fileid])[0] for fileid in docs]
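
# The snippet imports train_test_split but stops before calling it; a plausible
# continuation (the split ratio is an assumption) would be:
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)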
Example #23
    def score(self, word, context):
        """
        Use KneserNeyProbDist from NLTK to get score
        """
        trigram = tuple((context[0], context[1], word))
        return self.model.prob(trigram)

    def samples(self):
        return self.model.samples()

    def prob(self, sample):
        return self.model.prob(sample)


if __name__ == '__main__':
    corpus = PickledCorpusReader('../corpus')
    tokens = [''.join(word) for word in corpus.words()]
    vocab = Counter(tokens)
    sents = list([word for word in sent] for sent in corpus.sents())

    counter = count_ngrams(3, vocab, sents)
    knm = KneserNeyModel(counter)

    def complete(input_text):
        tokenized = nltk.word_tokenize(input_text)

        if len(tokenized) < 2:
            response = "Say more."
        else:
            completions = {}
            for sample in knm.samples():
Example #24
    def documents(self, idx=None):
        for fileid in self.fileids(idx):
            yield list(self.reader.docs(fileids=[fileid]))

    def labels(self, idx=None):
        return [
            self.reader.categories(fileids=[fileid])[0]
            for fileid in self.fileids(idx)
        ]

    def __iter__(self):
        for train_index, test_index in self.folds.split(self.files):
            X_train = self.documents(train_index)
            y_train = self.labels(train_index)

            X_test = self.documents(test_index)
            y_test = self.labels(test_index)

            yield X_train, X_test, y_train, y_test


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('corpus/tagcorpusoracle')
    loader = CorpusLoader(corpus, 12)

    #for X_train, X_test, y_train, y_test in loader:
        #for f in y_train:
            #print(f)
Example #25
        self.model = KMeansClusterer(
            self.k, distance=cosine, avoid_empty_clusters=True)
        self.model.cluster([
            self.vectorize(
                corpus.words(fileid)
            ) for fileid in corpus.fileids(categories=['news'])
        ])

    def classify(self, document):
        """
        Pass through to the internal model classify
        """
        return self.model.classify(self.vectorize(document))

if __name__ == '__main__':
    corpus = PickledCorpusReader('../corpus')

    clusterer = KMeansTopics(corpus, k=7)
    clusterer.cluster(corpus)

    # Classify documents in the new corpus by cluster affinity
    groups = [
        (clusterer.classify(corpus.words(fileid)), fileid)
        for fileid in corpus.fileids(categories=['news'])
    ]

    # Group documents in corpus by cluster and display them
    groups.sort(key=itemgetter(0))
    for group, items in groupby(groups, key=itemgetter(0)):
        for cluster, fname in items:
            print("Cluster {}: {}".format(cluster+1,fname))
Example #26
        return self

    def transform(self, documents):
        def generator():
            for document in documents:
                vec = self.tfidf[self.lexicon.doc2bow(document)]
                if self.tofull:
                    yield sparse2full(vec)
                else:
                    yield vec
        return list(generator())


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('../corpus')
    docs = [
        list(corpus.docs(fileids=fileid))[0]
        for fileid in corpus.fileids()
    ]

    model = Pipeline([
        ('norm', TextNormalizer()),
        ('vect', GensimTfidfVectorizer()),
        ('lda', ldamodel.LdaTransformer())])

    model.fit_transform(docs)

    print(model.named_steps['norm'])
Example #27
        'id': 1,
        'name': 'Exposition de peinture',
        'description':
        'ceci est une exposition de Claude Monet.\nVernissage le 23 mai 2020',
        'eventcat': [200]
    }
    event2 = {
        'id': 2,
        'name': 'Exposition de sculpture',
        'description': 'Ceci est une exposition de Rodin',
        'eventcat': [200],
    }
    events = [event1, event2]
    fileid = 'artistic_event/0b7944a1a37ff80d982f0e18abeab26d.pickle'
    corpus1 = PickledCorpusReader('./pickle_corpus',
                                  fileids=[fileid])  # categories='artistic_event'
    corpus2 = EventCorpusReader(events, categories=[300])

    for corpus in [corpus1, corpus2]:
        print('------- words --------')
        count = 0
        for word in corpus.words(categories=[200, 300]):
            print(word)
            count += 1
            if count > 10:
                break
        print('------- sents --------')
        count = 0
        for sent in corpus.sents(categories=[200, 300]):
            print(sent)
Example #28
        # Now determine if we're in train or test mode.
        if not (test or train) or (test and train):
            raise ValueError("Please specify either train or test flag")

        # Select only the indices to filter upon.
        indices = train_idx if train else test_idx
        return [
            fileid for doc_idx, fileid in enumerate(self.corpus.fileids())
            if doc_idx in indices
        ]

    def documents(self, fold=None, train=False, test=False):
        for fileid in self.fileids(fold, train, test):
            yield list(self.corpus.docs(fileids=fileid))

    def labels(self, fold=None, train=False, test=False):
        return [
            self.corpus.categories(fileids=fileid)[0]
            for fileid in self.fileids(fold, train, test)
        ]


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('corpus')
    loader = CorpusLoader(corpus, 12)

    for fid in loader.fileids(0, test=True):
        print(fid)
Example #29
    def labels(self, idx=None):
        return [
            self.reader.categories(fileids=[fileid])[0]
            for fileid in self.fileids(idx)
        ]

    def __iter__(self):
        for train_index, test_index in self.folds.split(self.files):
            X_train = self.documents(train_index)
            y_train = self.labels(train_index)

            X_test = self.documents(test_index)
            y_test = self.labels(test_index)

            yield X_train, X_test, y_train, y_test


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('path\\to\\preprocessed')
    loader = CorpusLoader(corpus, 12)

    for X_train, X_test, y_train, y_test in loader:
        inputDocs = []
        for doc in X_train:
            inputDocs.append(doc)
        print(len(inputDocs))
        print(len(y_train))
        break
Example #30
        if idx is None:
            return self.files
        return self.files[idx]

    def documents(self, idx=None):
        for fileid in self.fileids(idx):
            yield list(self.reader.docs(fileids=[fileid]))

    def labels(self, idx=None):
        return [
            self.reader.categories(fileids=[fileid])[0]
            for fileid in self.fileids(idx)
        ]

    def __iter__(self):
        for train_index, test_index in self.folds.split(self.files):
            X_train = self.documents(train_index)
            y_train = self.labels(train_index)

            X_test = self.documents(test_index)
            y_test = self.labels(test_index)

            yield X_train, X_test, y_train, y_test


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('../corpus')
    loader = CorpusLoader(corpus, folds=12)
Example #31
def create_pipeline(estimator, reduction=False):

    steps = [('normalize', TextNormalizer()),
             ('vectorize',
              GensimVectorizer(path="path\\to\\gensimModels\\doc2vec.d2v"))]

    if reduction:
        steps.append(('reduction', TruncatedSVD(n_components=1000)))

    # Add the estimator
    steps.append(('classifier', estimator))
    return Pipeline(steps)


labels = ["label1", "label2", "RRR", "CAP"]
reader = PickledCorpusReader('path\\to\\preprocessed_min')
loader = CorpusLoader(reader, 12, shuffle=True, categories=labels)

models = []

for form in (LogisticRegression, SGDClassifier):
    # models.append(create_pipeline(form(), True))
    models.append(create_pipeline(form(), False))

# models.append(create_pipeline(MultinomialNB(), False))
models.append(create_pipeline(LinearSVC(), False))

# models.append(create_pipeline(PartialSGDEstimator(), False))
import time
import json