import pickle


def main():
    """
    Classify the documents of a pickled corpus and report how many
    artistic events were found.
    """
    from reader import PickledCorpusReader

    # Classification model
    model_file = 'model-SGDClassifier.pickle'

    # Initialize model from pickle
    with open(model_file, 'rb') as f:
        model = pickle.load(f)

    # Initialize a corpus reader
    corpus = PickledCorpusReader('./pickle_corpus/')

    # Call the model to classify text
    y_pred, x_pred = predict_model(model, corpus, categories=['artistic_event'])

    # Print results
    count = nartistics = 0
    for result in y_pred:
        if result != 'artistic_event':
            for doc in x_pred[count]:
                print_event(doc)
            print('======================')
            nartistics += 1
        count += 1

    print('{:d} artistic events found / {:d} events'.format(nartistics, count))
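The main() routine above relies on a predict_model() helper (and a print_event() formatter) that is not shown in these listings. The following is a hypothetical sketch of what predict_model() might do, assuming the fileids()/docs()/categories API of PickledCorpusReader used elsewhere in this section and a saved pipeline that accepts pickled documents directly; it is an illustration, not the original helper.

def predict_model(model, corpus, categories=None):
    # Hypothetical helper, not taken from the original listings: gather the
    # documents belonging to the requested categories, predict a label for
    # each with the fitted pipeline, and return both the predictions and the
    # documents so the caller can inspect or print them.
    fileids = corpus.fileids(categories=categories)
    x_pred = [list(corpus.docs(fileids=[fileid])) for fileid in fileids]
    y_pred = model.predict([docs[0] for docs in x_pred])
    return y_pred, x_pred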
def train_model(path, model, saveto=None, cv=12):
    """
    Trains the model on the corpus at the specified path, computing
    cross-validation scores with the cv parameter, then fitting the model
    on the full dataset and writing it to disk at the saveto path if
    specified. Returns the scores.
    """
    # Load the corpus data and labels for classification
    corpus = PickledCorpusReader(path)
    X = documents(corpus)
    y = labels(corpus)

    # Compute cross-validation scores
    scores = cross_val_score(model, X, y, cv=cv)

    # Fit the model on the entire dataset
    model.fit(X, y)

    # Write to disk if specified
    if saveto:
        joblib.dump(model, saveto)

    # Return scores as well as training time via decorator
    return scores
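A minimal usage sketch for train_model(), assuming the create_pipeline() helper defined later in this section and a local pickled corpus; the paths and the choice of SGDClassifier are placeholders, not taken from the original listing.

# Hedged usage sketch: create_pipeline() and the corpus path are borrowed
# from other listings in this section; adjust them to the local layout.
from sklearn.linear_model import SGDClassifier

model = create_pipeline(SGDClassifier(), reduction=False)
scores = train_model('./pickle_corpus', model, saveto='model-SGDClassifier.pickle')
print("Mean CV score: {:0.3f}".format(sum(scores) / len(scores)))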
from reader import PickledCorpusReader

reader = PickledCorpusReader('../corpus')

for category in reader.categories():
    n_docs = len(reader.fileids(categories=[category]))
    n_words = sum(1 for word in reader.words(categories=[category]))

    print("- '{}' contains {:,} docs and {:,} words".format(
        category, n_docs, n_words))
        table.append(row)

    table.sort(key=lambda r: r[-1], reverse=True)
    print(tabulate.tabulate(table, headers=fields))


if __name__ == '__main__':
    results_file = "results.json"

    labels = [
        "artistic_event",
        "other_event",
    ]

    # Initializing corpus reader and loader (generates K-Folds)
    reader = PickledCorpusReader('./pickle_corpus')
    loader = CorpusLoader(reader, 5, shuffle=True, categories=labels)

    txt = TextNormalizer()
    txt.lemmatize("qu'", "")

    # Initializing models
    models = []
    for form in (LogisticRegression, SGDClassifier):
        models.append(create_pipeline(form(), True))
        models.append(create_pipeline(form(), False))

    models.append(create_pipeline(MultinomialNB(), False))
    models.append(create_pipeline(GaussianNB(), True))

    # Running all models
def graph(docs):
    G = nx.Graph()

    for doc in docs:
        for pair in pairs(doc):
            if (pair[0][0], pair[1][0]) in G.edges():
                G.edges[(pair[0][0], pair[1][0])]['weight'] += 1
            else:
                G.add_edge(pair[0][0], pair[1][0], weight=1)

    return G


def nbest_centrality(G, metric, n=10, attr="centrality", **kwargs):
    # Compute the centrality scores for each vertex
    scores = metric(G, **kwargs)

    # Set the score as a property on each node
    nx.set_node_attributes(G, name=attr, values=scores)

    # Find the top n scores and print them along with their index
    topn = heapq.nlargest(n, scores.items(), key=itemgetter(1))
    for idx, item in enumerate(topn):
        print("{}. {}: {:0.4f}".format(idx + 1, *item))

    return G


if __name__ == '__main__':
    corpus = PickledCorpusReader("../corpus")
    docs = corpus.docs()
    G = graph(docs)
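graph() depends on a pairs() helper that is not defined in these listings. Below is one plausible sketch, assuming pairs of co-occurring (token, tag) tuples within each sentence of a pickled document; in practice the pairs would more likely come from the entity extraction shown later in this section, so treat this purely as an illustration.

from itertools import combinations

def pairs(doc):
    # Hypothetical helper, not taken from the original listings: yield every
    # pair of (token, tag) tuples that co-occur within a sentence, so graph()
    # can weight an edge by how often the two items appear together.
    for para in doc:
        for sent in para:
            for pair in combinations(sent, 2):
                yield pair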
            self.estimator = lsimodel.LsiTransformer(num_topics=self.n_topics)
        else:
            self.estimator = ldamodel.LdaTransformer(num_topics=self.n_topics)

        self.model = Pipeline([
            ('norm', TextNormalizer()),
            ('vect', GensimTfidfVectorizer()),
            ('model', self.estimator)
        ])

    def fit(self, documents):
        self.model.fit(documents)
        return self.model


if __name__ == '__main__':
    corpus = PickledCorpusReader('../corpus')

    # With Sklearn
    skmodel = SklearnTopicModels(estimator='NMF')
    documents = corpus.docs()

    skmodel.fit_transform(documents)
    topics = skmodel.get_topics()
    for topic, terms in topics.items():
        print("Topic #{}:".format(topic + 1))
        print(terms)

    # # With Gensim
    # gmodel = GensimTopicModels(estimator='LSA')
    #
    # docs = [
        trees = ne_chunk(sentence)

        for tree in trees:
            if hasattr(tree, 'label'):
                if tree.label() in self.labels:
                    entities.append(
                        ' '.join([child[0].lower() for child in tree])
                    )
        return entities

    def fit(self, documents, labels=None):
        return self

    def transform(self, documents):
        for document in documents:
            yield self.get_entities(document)


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('../corpus')
    docs = corpus.docs()

    phrase_extractor = KeyphraseExtractor()
    keyphrases = list(phrase_extractor.fit_transform(docs))
    print(keyphrases[0])

    entity_extractor = EntityExtractor()
    entities = list(entity_extractor.fit_transform(docs))
    print(entities[0])
        self.save()
        return self

    def transform(self, documents):
        def generator():
            for document in documents:
                vec = self.tfidf[self.lexicon.doc2bow(document)]
                if self.tofull:
                    # sparse2full requires the target vector length
                    yield sparse2full(vec, len(self.lexicon))
                else:
                    yield vec
        return list(generator())


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('../corpus')
    docs = [
        list(corpus.docs(fileids=fileid))[0]
        for fileid in corpus.fileids()
    ]

    model = Pipeline([
        ('norm', TextNormalizer()),
        ('vect', GensimTfidfVectorizer()),
        ('lda', ldamodel.LdaTransformer())
    ])

    model.fit_transform(docs)
    print(model.named_steps['norm'])
from sklearn.feature_extraction.text import TfidfVectorizer

from reader import PickledCorpusReader
from normalizer import TextNormalizer


def identity(words):
    return words


corpus = PickledCorpusReader('../../corpora/Pickled_Corpus_Sample')

normalizer = TextNormalizer()
docs = normalizer.fit_transform(corpus.docs())

vectorizer = TfidfVectorizer(tokenizer=identity, preprocessor=None,
                             lowercase=False)
vectors = vectorizer.fit_transform(docs)
print(vectors.shape)
            self.estimator = ldamodel.LdaTransformer(num_topics=self.n_topics)

        self.model = Pipeline([
            ('norm', TextNormalizer()),
            ('vect', GensimTfidfVectorizer()),
            ('model', self.estimator)
        ])

    def fit(self, documents):
        self.model.fit(documents)
        return self.model


if __name__ == '__main__':
    corpus = PickledCorpusReader('../corpus')

    # With Sklearn
    skmodel = SklearnTopicModels(estimator='NMF')
    documents = corpus.docs()

    skmodel.fit_transform(documents)
    topics = skmodel.get_topics()
    for topic, terms in topics.items():
        print("Topic #{}:".format(topic + 1))
        print(terms)

    # # With Gensim
    # gmodel = GensimTopicModels(estimator='LSA')
    #
    # docs = [
    def __iter__(self):
        for train_index, test_index in self.folds.split(self.files):
            X_train = self.documents(train_index)
            y_train = self.labels(train_index)

            X_test = self.documents(test_index)
            y_test = self.labels(test_index)

            yield X_train, X_test, y_train, y_test


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('../pickle_corpus')
    loader = CorpusLoader(
        corpus, 12,
        categories=["expositions", "expositions_galeries", 'marches_salons'])

    fcount = dcount = pcount = scount = tcount = 0
    for x_train, x_test, y_train, y_test in loader:
        for file in x_train:
            fcount += 1
            for doc in file:
                dcount += 1
                for para in doc:
                    # print(para)
                    # key = input('>>')
                    pcount += 1
                    # if pcount >= 1:
    def score(self, word, context):
        """
        Use KneserNeyProbDist from NLTK to get score
        """
        trigram = tuple((context[0], context[1], word))
        return self.model.prob(trigram)

    def samples(self):
        return self.model.samples()

    def prob(self, sample):
        return self.model.prob(sample)


if __name__ == '__main__':
    corpus = PickledCorpusReader('../corpus')

    tokens = [''.join(word) for word in corpus.words()]
    vocab = Counter(tokens)
    sents = list([word[0] for word in sent] for sent in corpus.sents())

    counter = count_ngrams(3, vocab, sents)
    knm = KneserNeyModel(counter)

    def complete(input_text):
        tokenized = nltk.word_tokenize(input_text)
        if len(tokenized) < 2:
            response = "Say more."
        else:
            completions = {}
            for sample in knm.samples():
        self.model = KMeansClusterer(self.k, distance=cosine,
                                     avoid_empty_clusters=True)
        self.model.cluster([
            self.vectorize(corpus.words(fileid))
            for fileid in corpus.fileids(categories=['news'])
        ])

    def classify(self, document):
        """
        Pass through to the internal model classify
        """
        return self.model.classify(self.vectorize(document))


if __name__ == '__main__':
    corpus = PickledCorpusReader('../corpus')

    clusterer = KMeansTopics(corpus, k=7)
    clusterer.cluster(corpus)

    # Classify documents in the new corpus by cluster affinity
    groups = [
        (clusterer.classify(corpus.words(fileid)), fileid)
        for fileid in corpus.fileids(categories=['news'])
    ]

    # Group documents in corpus by cluster and display them
    groups.sort(key=itemgetter(0))
    for group, items in groupby(groups, key=itemgetter(0)):
        for cluster, fname in items:
            print("Cluster {}: {}".format(cluster + 1, fname))
    # Compute the centrality scores for each vertex
    scores = metric(G, **kwargs)

    # Set the score as a property on each node
    nx.set_node_attributes(G, name=attr, values=scores)

    # Find the top n scores and print them along with their index
    topn = heapq.nlargest(n, scores.items(), key=itemgetter(1))
    for idx, item in enumerate(topn):
        print("{}. {}: {:0.4f}".format(idx + 1, *item))

    return G


if __name__ == '__main__':
    corpus = PickledCorpusReader(
        '/Users/dd/PycharmProjects/atap/resources/sample/')

    G = graph(corpus)

    # Write the graph to disk, if needed
    nx.write_graphml(G, "entities.graphml")

    # Get summary stats for the full graph
    print(nx.info(G))

    # Find the most central entities in the social network
    print("Degree centrality")
    nbest_centrality(G, nx.degree_centrality)

    print("Betweenness centrality")
    nbest_centrality(G, nx.betweenness_centrality, 10,
    # Create the TF-IDF model and compute the scores
    model = gensim.models.TfidfModel(vectors)
    scores = model[vectors]

    for doc in scores:
        yield [
            (lexicon[vec], score)
            for vec, score in doc
        ]


if __name__ == '__main__':
    import heapq
    from reader import PickledCorpusReader
    from collections import Counter

    corpus = PickledCorpusReader('../corpus')

    scores = scored_document_phrases([
        list(corpus.sents(fileids=fileid))
        for fileid in corpus.fileids(categories=["politics", "news"])
    ], True)

    tfidfs = Counter()
    for phrases in scores:
        for phrase, score in phrases:
            tfidfs[phrase] += score

    print(
        tabulate(tfidfs.most_common(50),
                 headers=["keyphrase", "cumulative tfidf"])
    )
        indices = train_idx if train else test_idx
        return [
            fileid for doc_idx, fileid in enumerate(self.corpus.fileids())
            if doc_idx in indices
        ]

    def documents(self, fold=None, train=False, test=False):
        for fileid in self.fileids(fold, train, test):
            yield list(self.corpus.docs(fileids=fileid))

    def labels(self, fold=None, train=False, test=False):
        return [
            self.corpus.categories(fileids=fileid)[0]
            for fileid in self.fileids(fold, train, test)
        ]


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('corpus/tagcorpusoracle')

    for para in corpus.fileids(categories='281550031684823'):
        print(para)

    loader = CorpusLoader(corpus, 12)

    for fid in loader.fileids(0, test=True):
        print(fid)

    print(loader.labels(0, test=True))
                                     preprocessor=None, lowercase=False))]

    if reduction:
        steps.append(('reduction', TruncatedSVD(n_components=1000)))

    # Add the estimator
    steps.append(('classifier', estimator))
    return Pipeline(steps)


labels = [
    "281571400036367", "281585707268948", "281723051068305",
    "281981394899011", "281988872632944", "281723051068312"
]

reader = PickledCorpusReader('corpus/tagcorpusoracle_test')
loader = CorpusLoader(reader, 5, shuffle=True, categories=labels)

models = []
for form in (LogisticRegression, SGDClassifier, DecisionTreeClassifier):
    models.append(create_pipeline(form(), True))
    models.append(create_pipeline(form(), False))

models.append(create_pipeline(MultinomialNB(), False))
models.append(create_pipeline(GaussianNB(), True))

import time
import json

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
            yield list(self.reader.docs(fileids=[fileid]))

    def labels(self, idx=None):
        return [
            self.reader.categories(fileids=[fileid])[0]
            for fileid in self.fileids(idx)
        ]

    def __iter__(self):
        for train_index, test_index in self.folds.split(self.files):
            X_train = self.documents(train_index)
            y_train = self.labels(train_index)

            X_test = self.documents(test_index)
            y_test = self.labels(test_index)

            yield X_train, X_test, y_train, y_test


if __name__ == '__main__':
    from reader import PickledCorpusReader

    reader = PickledCorpusReader('../../corpora/Pickled_Corpus_Sample')
    loader = CorpusLoader(reader, 10)

    X_train, X_test, y_train, y_test = next(iter(loader))

    for i in range(2):
        print('\nDOC {} LABEL: "{}"'.format(i + 1, y_train[i]))
        print(next(X_train))
( "vectorize", TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False), ), ] if reduction: steps.append(("reduction", TruncatedSVD(n_components=10000))) # Add the estimator steps.append(("classifier", estimator)) return Pipeline(steps) labels = ["books", "cinema", "cooking", "gaming", "sports", "tech"] reader = PickledCorpusReader("./corpus") loader = CorpusLoader(reader, 5, shuffle=True, categories=labels) models = [] for form in (LogisticRegression, SGDClassifier): models.append(create_pipeline(form(), True)) models.append(create_pipeline(form(), False)) models.append(create_pipeline(MultinomialNB(), False)) models.append(create_pipeline(GaussianNB(), True)) import json import time from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score)
    # Compute the centrality scores for each vertex
    nbest = {}
    for name, metric in metrics.items():
        scores = metric(G)

        # Set the score as a property on each node
        nx.set_node_attributes(G, name=name, values=scores)

        # Find the top n scores and keep them by metric name
        topn = heapq.nlargest(n, scores.items(), key=itemgetter(1))
        nbest[name] = topn

    return nbest


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader(root='../../corpora/politics_pickled')
    docs = corpus.docs()

    entity_extractor = EntityExtractor()
    entities = entity_extractor.fit_transform(docs)

    entity_pairing = EntityPairs()
    pairs = entity_pairing.fit_transform(entities)

    graph = GraphExtractor()
    G = graph.fit_transform(pairs)
    print(nx.info(G))

    centralities = {
        "Degree Centrality": nx.degree_centrality,
        "Betweenness Centrality": nx.betweenness_centrality
from sklearn.model_selection import train_test_split as tts

from reader import PickledCorpusReader

reader = PickledCorpusReader('../corpus')

labels = ["books", "cinema", "cooking", "gaming", "sports", "tech"]
docs = reader.fileids(categories=labels)

X = list(reader.docs(fileids=docs))
y = [reader.categories(fileids=[fileid])[0] for fileid in docs]
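The excerpt imports train_test_split as tts but stops before the split itself; a plausible continuation is sketched below (the 80/20 ratio is an assumption for illustration).

# Hedged continuation: the split ratio and shuffling are illustrative
# choices, not taken from the original listing.
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, shuffle=True)
print(len(X_train), len(X_test))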
    def score(self, word, context):
        """
        Use KneserNeyProbDist from NLTK to get score
        """
        trigram = tuple((context[0], context[1], word))
        return self.model.prob(trigram)

    def samples(self):
        return self.model.samples()

    def prob(self, sample):
        return self.model.prob(sample)


if __name__ == '__main__':
    corpus = PickledCorpusReader('../corpus')

    tokens = [''.join(word) for word in corpus.words()]
    vocab = Counter(tokens)
    sents = list([word for word in sent] for sent in corpus.sents())

    counter = count_ngrams(3, vocab, sents)
    knm = KneserNeyModel(counter)

    def complete(input_text):
        tokenized = nltk.word_tokenize(input_text)
        if len(tokenized) < 2:
            response = "Say more."
        else:
            completions = {}
            for sample in knm.samples():
    def documents(self, idx=None):
        for fileid in self.fileids(idx):
            yield list(self.reader.docs(fileids=[fileid]))

    def labels(self, idx=None):
        return [
            self.reader.categories(fileids=[fileid])[0]
            for fileid in self.fileids(idx)
        ]

    def __iter__(self):
        for train_index, test_index in self.folds.split(self.files):
            X_train = self.documents(train_index)
            y_train = self.labels(train_index)

            X_test = self.documents(test_index)
            y_test = self.labels(test_index)

            yield X_train, X_test, y_train, y_test


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('corpus/tagcorpusoracle')
    loader = CorpusLoader(corpus, 12)

    # for X_train, X_test, y_train, y_test in loader:
    #     for f in y_train:
    #         print(f)
        return self

    def transform(self, documents):
        def generator():
            for document in documents:
                vec = self.tfidf[self.lexicon.doc2bow(document)]
                if self.tofull:
                    # sparse2full requires the target vector length
                    yield sparse2full(vec, len(self.lexicon))
                else:
                    yield vec
        return list(generator())


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('../corpus')
    docs = [
        list(corpus.docs(fileids=fileid))[0]
        for fileid in corpus.fileids()
    ]

    model = Pipeline([
        ('norm', TextNormalizer()),
        ('vect', GensimTfidfVectorizer()),
        ('lda', ldamodel.LdaTransformer())
    ])

    model.fit_transform(docs)
    print(model.named_steps['norm'])
    'id': 1,
    'name': 'Exposition de peinture',
    'description': 'ceci est une exposition de Claude Monet.\nVernissage le 23 mai 2020',
    'eventcat': [200]
}

event2 = {
    'id': 2,
    'name': 'Exposition de sculpture',
    'description': 'Ceci est une exposition de Rodin',
    'eventcat': [200],
}

events = [event1, event2]

fileid = 'artistic_event/0b7944a1a37ff80d982f0e18abeab26d.pickle'
corpus1 = PickledCorpusReader('./pickle_corpus', fileids=[fileid])  # categories='artistic_event')
corpus2 = EventCorpusReader(events, categories=[300])

for corpus in [corpus1, corpus2]:
    print('------- words --------')
    count = 0
    for word in corpus.words(categories=[200, 300]):
        print(word)
        count += 1
        if count > 10:
            break

    print('------- sents --------')
    count = 0
    for sent in corpus.sents(categories=[200, 300]):
        print(sent)
        # Now determine if we're in train or test mode.
        if not (test or train) or (test and train):
            raise ValueError("Please specify either train or test flag")

        # Select only the indices to filter upon.
        indices = train_idx if train else test_idx
        return [
            fileid for doc_idx, fileid in enumerate(self.corpus.fileids())
            if doc_idx in indices
        ]

    def documents(self, fold=None, train=False, test=False):
        for fileid in self.fileids(fold, train, test):
            yield list(self.corpus.docs(fileids=fileid))

    def labels(self, fold=None, train=False, test=False):
        return [
            self.corpus.categories(fileids=fileid)[0]
            for fileid in self.fileids(fold, train, test)
        ]


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('corpus')
    loader = CorpusLoader(corpus, 12)

    for fid in loader.fileids(0, test=True):
        print(fid)
    def labels(self, idx=None):
        return [
            self.reader.categories(fileids=[fileid])[0]
            for fileid in self.fileids(idx)
        ]

    def __iter__(self):
        for train_index, test_index in self.folds.split(self.files):
            X_train = self.documents(train_index)
            y_train = self.labels(train_index)

            X_test = self.documents(test_index)
            y_test = self.labels(test_index)

            yield X_train, X_test, y_train, y_test


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('path\\to\\preprocessed')
    loader = CorpusLoader(corpus, 12)

    for X_train, X_test, y_train, y_test in loader:
        inputDocs = []
        for doc in X_train:
            inputDocs.append(doc)

        print(len(inputDocs))
        print(len(y_train))
        break
        if idx is None:
            return self.files
        return self.files[idx]

    def documents(self, idx=None):
        for fileid in self.fileids(idx):
            yield list(self.reader.docs(fileids=[fileid]))

    def labels(self, idx=None):
        return [
            self.reader.categories(fileids=[fileid])[0]
            for fileid in self.fileids(idx)
        ]

    def __iter__(self):
        for train_index, test_index in self.folds.split(self.files):
            X_train = self.documents(train_index)
            y_train = self.labels(train_index)

            X_test = self.documents(test_index)
            y_test = self.labels(test_index)

            yield X_train, X_test, y_train, y_test


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('../corpus')
    loader = CorpusLoader(corpus, folds=12)
def create_pipeline(estimator, reduction=False):
    steps = [
        ('normalize', TextNormalizer()),
        ('vectorize', GensimVectorizer(path="path\\to\\gensimModels\\doc2vec.d2v"))
    ]

    if reduction:
        steps.append(('reduction', TruncatedSVD(n_components=1000)))

    # Add the estimator
    steps.append(('classifier', estimator))
    return Pipeline(steps)


labels = ["label1", "label2", "RRR", "CAP"]
reader = PickledCorpusReader('path\\to\\preprocessed_min')
loader = CorpusLoader(reader, 12, shuffle=True, categories=labels)

models = []
for form in (LogisticRegression, SGDClassifier):
    # models.append(create_pipeline(form(), True))
    models.append(create_pipeline(form(), False))

# models.append(create_pipeline(MultinomialNB(), False))
models.append(create_pipeline(LinearSVC(), False))
# models.append(create_pipeline(PartialSGDEstimator(), False))

import time
import json