Example #1
def main():
    """
    processing text
    """
    from reader import PickledCorpusReader

    # Classification model
    model_file = 'model-SGDClassifier.pickle'

    # Initialize model from pickle
    with open(model_file, 'rb') as f:
        model = pickle.load(f)

    # Initialize a corpus reader
    corpus = PickledCorpusReader('./pickle_corpus/')

    # Call the model to classify text
    y_pred, x_pred = predict_model(model,
                                   corpus,
                                   categories=['artistic_event'])

    # Print result
    count = nartistics = 0
    for result in y_pred:
        if result != 'artistic_event':
            for doc in x_pred[count]:
                print_event(doc)
            print('======================')
            nartistics += 1
        count += 1
    print('{:d} artistic events found / {:d} events'.format(nartistics, count))
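A note on the call above: predict_model and print_event are helpers defined elsewhere in the project and are not part of this snippet. A minimal sketch of what predict_model could look like, assuming it gathers documents from the reader and runs the pickled pipeline over them (the signature and body are assumptions, not the original implementation):

def predict_model(model, corpus, categories=None):
    """Hypothetical helper: classify corpus documents with a fitted pipeline.

    Returns the predicted labels together with the documents themselves so
    callers can inspect individual predictions.
    """
    # One (nested) document per fileid, mirroring the PickledCorpusReader API
    x_pred = [
        list(corpus.docs(fileids=[fileid]))
        for fileid in corpus.fileids(categories=categories)
    ]
    y_pred = model.predict(x_pred)
    return y_pred, x_pred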
Example #2
def train_model(path, model, saveto=None, cv=12):
    """
    Trains model from corpus at specified path; constructing cross-validation
    scores using the cv parameter, then fitting the model on the full data and
    writing it to disk at the saveto path if specified. Returns the scores.
    """
    # Load the corpus data and labels for classification
    corpus = PickledCorpusReader(path)
    X = documents(corpus)
    y = labels(corpus)

    # Compute cross validation scores
    scores = cross_val_score(model, X, y, cv=cv)

    # Fit the model on entire data set
    model.fit(X, y)

    # Write to disk if specified
    if saveto:
        joblib.dump(model, saveto)

    # Return scores as well as training time via decorator
    return scores
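The closing comment mentions that the training time is returned via a decorator, and the function also relies on documents and labels helpers; none of these appear in the snippet. A minimal sketch of what they might look like (the names and bodies are assumptions), with the decorator applied as @timeit on train_model so the call site receives (scores, elapsed_seconds):

import time
from functools import wraps


def timeit(func):
    """Hypothetical decorator: return (result, elapsed seconds) from func."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        return result, time.time() - start
    return wrapper


def documents(corpus):
    """Hypothetical helper: one (nested) document per fileid."""
    return [list(corpus.docs(fileids=[fileid])) for fileid in corpus.fileids()]


def labels(corpus):
    """Hypothetical helper: first category of each fileid, used as its label."""
    return [corpus.categories(fileids=[fileid])[0] for fileid in corpus.fileids()]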
Example #3
        if idx is None:
            return self.files
        return self.files[idx]

    def documents(self, idx=None):
        for fileid in self.fileids(idx):
            yield list(self.reader.docs(fileids=[fileid]))

    def labels(self, idx=None):
        return [
            self.reader.categories(fileids=[fileid])[0]
            for fileid in self.fileids(idx)
        ]

    def __iter__(self):
        for train_index, test_index in self.folds.split(self.files):
            X_train = self.documents(train_index)
            y_train = self.labels(train_index)

            X_test = self.documents(test_index)
            y_test = self.labels(test_index)

            yield X_train, X_test, y_train, y_test


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('../corpus')
    loader = CorpusLoader(corpus, folds=12)
Example #4
        # Now determine if we're in train or test mode.
        if not (test or train) or (test and train):
            raise ValueError("Please specify either train or test flag")

        # Select only the indices to filter upon.
        indices = train_idx if train else test_idx
        return [
            fileid for doc_idx, fileid in enumerate(self.corpus.fileids())
            if doc_idx in indices
        ]

    def documents(self, fold=None, train=False, test=False):
        for fileid in self.fileids(fold, train, test):
            yield list(self.corpus.docs(fileids=fileid))

    def labels(self, fold=None, train=False, test=False):
        return [
            self.corpus.categories(fileids=fileid)[0]
            for fileid in self.fileids(fold, train, test)
        ]


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('corpus')
    loader = CorpusLoader(corpus, 12)

    for fid in loader.fileids(0, test=True):
        print(fid)
Example #5
            table.append(row)

    table.sort(key=lambda r: r[-1], reverse=True)
    print(tabulate.tabulate(table, headers=fields))


if __name__ == '__main__':
    results_file = "results.json"
    labels = [
        "artistic_event",
        "other_event",
    ]

    # Initializing corpus reader and loader (generates K-Folds)
    reader = PickledCorpusReader('./pickle_corpus')
    loader = CorpusLoader(reader, 5, shuffle=True, categories=labels)

    txt = TextNormalizer()
    txt.lemmatize("qu'", "")

    # Initializing models
    models = []
    for form in (LogisticRegression, SGDClassifier):
        models.append(create_pipeline(form(), True))
        models.append(create_pipeline(form(), False))

    models.append(create_pipeline(MultinomialNB(), False))
    models.append(create_pipeline(GaussianNB(), True))

    # Running all models
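The snippet breaks off at the "# Running all models" comment. A hedged sketch of what such an evaluation loop could look like, assuming the loader yields (X_train, X_test, y_train, y_test) splits as in the CorpusLoader variants shown in Examples #3 and #6, and that per-fold scores are written to the results_file defined above (the score dictionary layout is an assumption):

import json
import time

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

results = []
for model in models:
    name = model.named_steps['classifier'].__class__.__name__
    scores = {'name': name, 'accuracy': [], 'precision': [],
              'recall': [], 'f1': [], 'time': []}

    # Score each pipeline on every train/test split produced by the loader
    for X_train, X_test, y_train, y_test in loader:
        start = time.time()
        model.fit(list(X_train), y_train)
        y_pred = model.predict(list(X_test))

        scores['time'].append(time.time() - start)
        scores['accuracy'].append(accuracy_score(y_test, y_pred))
        scores['precision'].append(precision_score(y_test, y_pred, average='weighted'))
        scores['recall'].append(recall_score(y_test, y_pred, average='weighted'))
        scores['f1'].append(f1_score(y_test, y_pred, average='weighted'))

    results.append(scores)

# Persist the per-fold scores for later comparison
with open(results_file, 'w') as f:
    json.dump(results, f, indent=2)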
Example #6
    def documents(self, idx=None):
        for fileid in self.fileids(idx):
            yield list(self.reader.docs(fileids=[fileid]))

    def labels(self, idx=None):
        return [
            self.reader.categories(fileids=[fileid])[0]
            for fileid in self.fileids(idx)
        ]

    def __iter__(self):
        for train_index, test_index in self.folds.split(self.files):
            X_train = self.documents(train_index)
            y_train = self.labels(train_index)

            X_test = self.documents(test_index)
            y_test = self.labels(test_index)

            yield X_train, X_test, y_train, y_test


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('corpus/tagcorpusoracle')
    loader = CorpusLoader(corpus, 12)

    #for X_train, X_test, y_train, y_test in loader:
        #for f in y_train:
            #print(f)
Example #7
def graph(docs):
    G = nx.Graph()
    for doc in docs:
        for pair in pairs(doc):
            if (pair[0][0], pair[1][0]) in G.edges():
                G.edges[(pair[0][0], pair[1][0])]['weight'] += 1
            else:
                G.add_edge(pair[0][0], pair[1][0], weight=1)
    return G


def nbest_centrality(G, metric, n=10, attr="centrality", **kwargs):
    # Compute the centrality scores for each vertex
    scores = metric(G, **kwargs)

    # Set the score as a property on each node
    nx.set_node_attributes(G, name=attr, values=scores)

    # Find the top n scores and print them along with their index
    topn = heapq.nlargest(n, scores.items(), key=itemgetter(1))
    for idx, item in enumerate(topn):
        print("{}. {}: {:0.4f}".format(idx + 1, *item))

    return G


if __name__ == '__main__':
    corpus = PickledCorpusReader("../corpus")
    docs = corpus.docs()
    G = graph(docs)
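graph() depends on a pairs helper that is not included in the snippet. A minimal sketch of one possible implementation, yielding pairs of co-occurring (token, tag) tuples from a document so that pair[0][0] and pair[1][0] resolve to the token text used as vertex names (the tag filter and the nested document layout are assumptions):

from itertools import combinations


def pairs(doc, good_tags=('NNP', 'NNPS')):
    """Hypothetical helper: pairs of co-occurring proper-noun tokens.

    Assumes each pickled document is a list of paragraphs, each paragraph a
    list of sentences, and each sentence a list of (token, tag) tuples.
    """
    candidates = [
        (token, tag)
        for para in doc
        for sent in para
        for token, tag in sent
        if tag in good_tags
    ]
    return combinations(candidates, 2)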
Example #8
        (
            "vectorize",
            TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False),
        ),
    ]

    if reduction:
        steps.append(("reduction", TruncatedSVD(n_components=10000)))

    # Add the estimator
    steps.append(("classifier", estimator))
    return Pipeline(steps)


labels = ["books", "cinema", "cooking", "gaming", "sports", "tech"]
reader = PickledCorpusReader("./corpus")
loader = CorpusLoader(reader, 5, shuffle=True, categories=labels)

models = []
for form in (LogisticRegression, SGDClassifier):
    models.append(create_pipeline(form(), True))
    models.append(create_pipeline(form(), False))

models.append(create_pipeline(MultinomialNB(), False))
models.append(create_pipeline(GaussianNB(), True))

import json
import time

from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)
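TfidfVectorizer is configured with tokenizer=identity and lowercase=False so that documents that were already tokenized and normalized upstream pass through unchanged; the identity function itself is not shown in the snippet. It is presumably just a pass-through, for example:

def identity(words):
    # Return the pre-tokenized document unchanged for TfidfVectorizer
    return words

Defining it as a named function (rather than a lambda) also keeps the fitted pipeline picklable.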
Example #9
                              preprocessor=None,
                              lowercase=False))]

    if reduction:
        steps.append(('reduction', TruncatedSVD(n_components=1000)))

    # Add the estimator
    steps.append(('classifier', estimator))
    return Pipeline(steps)


labels = [
    "281571400036367", "281585707268948", "281723051068305", "281981394899011",
    "281988872632944", "281723051068312"
]
reader = PickledCorpusReader('corpus/tagcorpusoracle_test')
loader = CorpusLoader(reader, 5, shuffle=True, categories=labels)

models = []
for form in (LogisticRegression, SGDClassifier, DecisionTreeClassifier):
    models.append(create_pipeline(form(), True))
    models.append(create_pipeline(form(), False))

models.append(create_pipeline(MultinomialNB(), False))
models.append(create_pipeline(GaussianNB(), True))

import time
import json

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
Example #10
    def labels(self, idx=None):
        return [
            self.reader.categories(fileids=[fileid])[0]
            for fileid in self.fileids(idx)
        ]

    def __iter__(self):
        for train_index, test_index in self.folds.split(self.files):
            X_train = self.documents(train_index)
            y_train = self.labels(train_index)

            X_test = self.documents(test_index)
            y_test = self.labels(test_index)

            yield X_train, X_test, y_train, y_test


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('path\\to\\preprocessed')
    loader = CorpusLoader(corpus, 12)

    for X_train, X_test, y_train, y_test in loader:
        inputDocs = []
        for doc in X_train:
            inputDocs.append(doc)
        print(len(inputDocs))
        print(len(y_train))
        break
Example #11
        'id': 1,
        'name': 'Exposition de peinture',
        'description':
        'ceci est une exposition de Claude Monet.\nVernissage le 23 mai 2020',
        'eventcat': [200]
    }
    event2 = {
        'id': 2,
        'name': 'Exposition de sculpture',
        'description': 'Ceci est une exposition de Rodin',
        'eventcat': [200],
    }
    events = [event1, event2]
    fileid = 'artistic_event/0b7944a1a37ff80d982f0e18abeab26d.pickle'
    corpus1 = PickledCorpusReader('./pickle_corpus',
                                  fileids=[fileid])  # categories='artistic_event'
    corpus2 = EventCorpusReader(events, categories=[300])

    for corpus in [corpus1, corpus2]:
        print('------- words --------')
        count = 0
        for word in corpus.words(categories=[200, 300]):
            print(word)
            count += 1
            if count > 10:
                break
        print('------- sents --------')
        count = 0
        for sent in corpus.sents(categories=[200, 300]):
            print(sent)
Example #12
    # Compute the centrality scores for each vertex
    nbest = {}
    for name, metric in metrics.items():
        scores = metric(G)
        # Set the score as a property on each node
        nx.set_node_attributes(G, name=name, values=scores)
        # Find the top n scores and print them along with their index
        topn = heapq.nlargest(n, scores.items(), key=itemgetter(1))
        nbest[name] = topn
    return nbest


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader(root='../../corpora/politics_pickled')
    docs = corpus.docs()

    entity_extractor = EntityExtractor()
    entities = entity_extractor.fit_transform(docs)

    entity_pairing = EntityPairs()
    pairs = entity_pairing.fit_transform(entities)

    graph = GraphExtractor()
    G = graph.fit_transform(pairs)
    print(nx.info(G))

    centralities = {
        "Degree Centrality": nx.degree_centrality,
        "Betweenness Centrality": nx.betweenness_centrality
Example #13
            yield list(self.reader.docs(fileids=[fileid]))

    def labels(self, idx=None):
        return [
            self.reader.categories(fileids=[fileid])[0]
            for fileid in self.fileids(idx)
        ]

    def __iter__(self):
        for train_index, test_index in self.folds.split(self.files):
            X_train = self.documents(train_index)
            y_train = self.labels(train_index)

            X_test = self.documents(test_index)
            y_test = self.labels(test_index)

            yield X_train, X_test, y_train, y_test


if __name__ == '__main__':
    from reader import PickledCorpusReader

    reader = PickledCorpusReader('../../corpora/Pickled_Corpus_Sample')
    loader = CorpusLoader(reader, 10)

    X_train, X_test, y_train, y_test = next(iter(loader))
    
    for i in range(2):
        print('\nDOC {}   LABEL: "{}"'.format(i+1, y_train[i]))
        print(next(X_train))
Example #14
    # Compute the centrality scores for each vertex
    scores = metric(G, **kwargs)

    # Set the score as a property on each node
    nx.set_node_attributes(G, name=attr, values=scores)

    # Find the top n scores and print them along with their index
    topn = heapq.nlargest(n, scores.items(), key=itemgetter(1))
    for idx, item in enumerate(topn):
        print("{}. {}: {:0.4f}".format(idx + 1, *item))

    return G


if __name__ == '__main__':
    corpus = PickledCorpusReader(
        '/Users/dd/PycharmProjects/atap/resources/sample/')
    G = graph(corpus)

    # Write the graph to disk, if needed
    nx.write_graphml(G, "entities.graphml")

    # Get summary stats for the full graph
    print(nx.info(G))

    # Find the most central entities in the social network
    print("Degree centrality")
    nbest_centrality(G, nx.degree_centrality)
    print("Betweenness centrality")
    nbest_centrality(G,
                     nx.betweenness_centrality,
                     10,
Example #15
    def __iter__(self):
        for train_index, test_index in self.folds.split(self.files):
            X_train = self.documents(train_index)
            y_train = self.labels(train_index)

            X_test = self.documents(test_index)
            y_test = self.labels(test_index)

            yield X_train, X_test, y_train, y_test


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('../pickle_corpus')
    loader = CorpusLoader(
        corpus,
        12,
        categories=["expositions", "expositions_galeries", 'marches_salons'])
    fcount = dcount = pcount = scount = tcount = 0
    for x_train, x_test, y_train, y_test in loader:
        for file in x_train:
            fcount += 1
            for doc in file:
                dcount += 1
                for para in doc:
                    #print(para)
                    #key=input('>>')
                    pcount += 1
                    #if pcount >= 1 :
Example #16
def create_pipeline(estimator, reduction=False):

    steps = [('normalize', TextNormalizer()),
             ('vectorize',
              GensimVectorizer(path="path\\to\\gensimModels\\doc2vec.d2v"))]

    if reduction:
        steps.append(('reduction', TruncatedSVD(n_components=1000)))

    # Add the estimator
    steps.append(('classifier', estimator))
    return Pipeline(steps)


labels = ["label1", "label2", "RRR", "CAP"]
reader = PickledCorpusReader('path\\to\\preprocessed_min')
loader = CorpusLoader(reader, 12, shuffle=True, categories=labels)

models = []

for form in (LogisticRegression, SGDClassifier):
    # models.append(create_pipeline(form(), True))
    models.append(create_pipeline(form(), False))

# models.append(create_pipeline(MultinomialNB(), False))
models.append(create_pipeline(LinearSVC(), False))

# models.append(create_pipeline(PartialSGDEstimator(), False))
import time
import json