def run():
    data_path = '../data/data.csv'
    result_path = '../analysis_result/sensitive_analysis'
    _mkdir_p(result_path)

    X, df, terms = load_data(data_path)
    docs = Documents(df)

    n_components = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
    k = 6
    records = []
    alphas = [0, 0.1, 0.2, 0.5, 0.8, 1]
    l1_ratios = [0, 0.2, 0.5, 0.8, 1]

    for k in n_components:
        # for alpha in alphas:
        #     for l1_ratio in l1_ratios:
        topic_model = topic_modeling(X, k)

        # Evaluate the topics
        top_idxs_topics = get_idx_for_top_words(topic_model, n_top_words)
        print(top_idxs_topics)
        mpj_score = mean_pairwise_Jaccard_Similarity(top_idxs_topics, k)

        top_words_topics = get_top_words(topic_model, terms, n_top_words)
        print(top_words_topics)
        mean_coherence_score = avg_coherence_by_UMass(top_words_topics, docs, k)

        records.append({
            'k': k,
            'mean_pairwise_jaccard': mpj_score,
            'coherence_UMass': mean_coherence_score
        })

    pd.DataFrame(data=records).to_csv(
        path.join(result_path, 'topics_k_scores_2.csv'), index=None)
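
# For reference, a minimal sketch of the topic-overlap metric used above, assuming
# mean_pairwise_Jaccard_Similarity averages the Jaccard similarity over all pairs of
# topics' top-word index sets (lower = more distinct topics). The helper name and
# signature below are illustrative only, not the project's actual implementation.
from itertools import combinations

def mean_pairwise_jaccard_sketch(top_idxs_topics):
    """Average Jaccard similarity over all pairs of topics' top-word index sets."""
    sets = [set(idxs) for idxs in top_idxs_topics]
    pairs = list(combinations(sets, 2))
    if not pairs:
        return 0.0
    return sum(
        len(a & b) / float(len(a | b)) if (a | b) else 0.0
        for a, b in pairs
    ) / len(pairs)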
def docpeek():
    corpus = Documents()
    corpus.readFromFile(root + dataFile, maxline=maxlines)
    print([(i, corpus.y.count(i)) for i in corpus.classes])

    corpus.makeDataFrame()
    x1 = corpus.df[['class', 'count']].groupby(by='class')
    cnt = x1.count()
    a1 = x1.min()
    a2 = x1.max()
    a3 = x1.mean()
    a4 = x1.std()
    cnt.columns = ['count']
    a1.columns = ['min']
    a2.columns = ['max']
    a3.columns = ['mean']
    a4.columns = ['std']

    q = cnt.merge(a1, left_index=True, right_index=True)\
           .merge(a2, left_index=True, right_index=True)\
           .merge(a3, left_index=True, right_index=True)
    q = q.merge(a4, left_index=True, right_index=True)
    return corpus, q
def main():
    # Set up corpus for training
    corpus = Documents()
    corpus.readFromFile(root + dataFile, maxline=maxlines)
    '''
    model1=DocClfComplNB(maxStringLength=MAXSTRINGLENGH, \
                         firstStringLength=FIRSTSTRINGLENGTH)
    '''
    model1 = DocClf2(maxStringLength=MAXSTRINGLENGH,
                     firstStringLength=FIRSTSTRINGLENGTH)
    print()

    # Split into test and training sets
    xtrain, xtest, ytrain, ytest = \
        train_test_split(corpus.words, corpus.y, test_size=testsize,
                         random_state=random_state)

    ytrainpred = model1.fit(xtrain, ytrain)
    ytestpred = model1.predict(xtest)
    print([(i, ytest.count(i)) for i in sorted(set(ytest))])

    trainAccuracy = accuracy_score(ytrain, ytrainpred)
    testAccuracy = accuracy_score(ytest, ytestpred)
    controlAccuracy = accuracy_score(np.random.permutation(ytest), ytestpred)

    global conf_mat
    conf_mat = model1.confidence(ytest, ytestpred)
    print(model1.confidence)
    print()
    print(np.unique(ytestpred, return_counts=True))
    print()

    for key, value in model1.confidence.items():
        print("%-25s" % key + " %5.3f" % value)
    labels = list(model1.confidence.keys())

    for row in range(0, conf_mat.shape[0]):
        print([" %4d" % conf_mat[row, col] for col in range(0, conf_mat.shape[1])])

    rowsum = conf_mat.sum(axis=0)
    colsum = conf_mat.sum(axis=1)
    print("item rowsum colsum")
    for ic in range(0, conf_mat.shape[0]):
        print("%-25s" % labels[ic] + " %5d" % rowsum[ic] + " %5d" % colsum[ic])
    print("")
    print('train=%6.2f test=%6.2f control=%6.2f' %
          (trainAccuracy, testAccuracy, controlAccuracy))

    # Save the fitted model and a few labelled test examples
    pickle.dump(model1, open(root + modelName + ".pckmdl", "wb"))
    print(model1.confidence)
    print(ytestpred[0])
    print(xtest[0][0:20])

    testfile = open(root + modelName + "testdata.txt", "wt")
    testfile.write(ytestpred[0])
    testfile.write("\n")
    testfile.write(xtest[0])
    testfile.write("\n")
    testfile.write(ytestpred[10])
    testfile.write("\n")
    testfile.write(xtest[10])
    testfile.write("\n")
    testfile.close()
    print(model1.message)
import codecs
import logging
import os
import sys
# Project classes Vocabulary, DictionaryPostings, Documents and LexAnalyser are
# expected to be in scope.


class Indexer(object):
    def __init__(self, collection, doStats=False, postingsFile=False):
        self.collection = collection
        self.lexAnalyser = False
        self.calculateStats = doStats
        self.vocabulary = Vocabulary()
        self.postings = DictionaryPostings({})
        self.documents = Documents()
        self.maxFreqInDocs = {}
        #self.positions = DictionaryPostings({})
        if self.calculateStats:
            self.stats = self.getInitStats()

    def index(self, config):
        """Index the given collection."""
        # Configure the lexical analyser
        self.lexAnalyser = LexAnalyser(config)

        #-----------------READ-COLLECTION------------#
        docId = 0
        totalDocs = len(self.collection.allFiles())
        number_of_logs = 1 if totalDocs < 50 else totalDocs // 50
        for filePath in self.collection.allFiles():
            if not filePath.lower().endswith('.txt'):
                logging.warning("following file will not be indexed: " + filePath)
                continue
            # Keep the current file's data
            actualDoc = {
                "name": os.path.basename(os.path.normpath(filePath)),
                "path": filePath
            }
            #----------READ-FILE-----------------------#
            if (docId + 1) % number_of_logs == 0:
                logging.info("Loading %s (%d/%d)" %
                             (actualDoc["name"], docId + 1, totalDocs))
            with codecs.open(filePath, mode='rt', encoding='utf-8') as f:
                # Collect the document's tokens and terms
                tokens = []
                terms = []
                for line in f:
                    # Apply tokenisation, stopword removal, etc. (per config)
                    analysed = self.lexAnalyser.analyse(line)
                    terms.extend(analysed["terms"])
                    tokens.extend(analysed["tokens"])
                    analysed = None

            # Store the current document
            self.documents.addDocument(docId, actualDoc["path"])

            # For each document, its set of (distinct) terms
            #self.documentsTerms[docId] = set()

            # Update the vocabulary
            self.updateIndex(docId, terms)

            # Update the stats
            if self.calculateStats:
                self.updateStats(tokens, terms)
            tokens = None
            terms = None
            docId += 1
            #------END-READ-FILE-----------------------#
        #----------------END-READ-COLLECTION---------#
        if self.calculateStats:
            logging.info("Generating stats")
            self.endStats()

        #logging.info(u"Sorting the vocabulary alphabetically")
        #self.vocabulary.setAlphabeticalOrder()
        #logging.info(u"Generating term ids")
        #self.setTermsId()
        #logging.info(u"Sorting postings by key")
        #self.postings.sortByKey()
        #self.positions.sortByKey()
        #logging.info(u"Computing the maximum frequency of each document")
        #self.loadMaxFreqs()

    def updateIndex(self, docId, terms):
        position = 0
        termToFreq = {}
        for t in terms:
            #self.documentsTerms[docId].add(t)
            # If the term is not in the vocabulary yet, add it and initialise its data
            if not self.vocabulary.isATerm(t):
                termId = self.vocabulary.addTerm(t, 1.0, 1.0)
                #self.postings.addPosting(termId, docId, 1.0)
                #self.positions.addPosting(t, docId, [position])
                termToFreq[termId] = 1
            else:
                self.vocabulary.incrementCF(t, 1.0)
                # Term not seen in this document yet?
                termId = self.vocabulary.getId(t)
                if termId not in termToFreq:
                    termToFreq[termId] = 1
                    self.vocabulary.incrementDF(t, 1.0)
                    #self.postings.addDocToPosting(termId, docId, 1.0)
                    #self.positions.addDocToPosting(t, docId, [position])
                # else the term already occurred in this document:
                else:
                    termToFreq[termId] += 1
                    # Update postings with frequencies
                    #self.postings.addDocToPosting(termId, docId, self.postings.getValue(termId, docId) + 1.0)
                # Update positional postings
                #positionList = self.positions.getValue(t, docId)
                #positionList.append(position)
                #self.positions.addDocToPosting(t, docId, positionList)
            #position += 1
        for tId in termToFreq:
            self.postings.addPosting(tId, docId, termToFreq[tId])
        maxValue = 0
        for t in termToFreq:
            if termToFreq[t] >= maxValue:
                maxValue = termToFreq[t]
        termToFreq = None
        self.maxFreqInDocs[docId] = maxValue

    def getInitStats(self):
        out = {
            "tokens_count": 0.0,
            "terms_count": 0.0,
            "docs_count": 0.0,
            "longestDoc": {
                "tokens_count": -1,
                "terms_count": -1
            },
            "shortestDoc": {
                "tokens_count": sys.maxsize,
                "terms_count": sys.maxsize
            }
        }
        return out

    def updateStats(self, tokens, terms):
        tokensLength = len(tokens)
        termsLength = len(set(terms))
        self.stats["tokens_count"] += tokensLength
        self.stats["docs_count"] += 1.0
        # Is this the longest document so far?
        if tokensLength >= self.stats["longestDoc"]["tokens_count"]:
            self.stats["longestDoc"]["tokens_count"] = tokensLength
            self.stats["longestDoc"]["terms_count"] = termsLength
        # Is this the shortest document so far?
        if tokensLength <= self.stats["shortestDoc"]["tokens_count"]:
            self.stats["shortestDoc"]["tokens_count"] = tokensLength
            self.stats["shortestDoc"]["terms_count"] = termsLength

    def endStats(self):
        numberOfTerms = len(self.vocabulary.content)
        self.stats["terms_count"] = numberOfTerms
        if self.stats["docs_count"] == 0:
            self.stats["avg_tokens_by_doc"] = 0
            self.stats["avg_terms_by_doc"] = 0
        else:
            self.stats["avg_tokens_by_doc"] = \
                self.stats["tokens_count"] / self.stats["docs_count"]
            self.stats["avg_terms_by_doc"] = \
                self.stats["terms_count"] / self.stats["docs_count"]
        self.stats["avg_term_length"] = 0 if numberOfTerms == 0 else sum(
            [len(key) for key in self.vocabulary.content]) / (numberOfTerms + 0.0)
        self.stats["terms_freq_one"] = len([
            key for key in self.vocabulary.content
            if self.vocabulary.getCF(key) == 1
        ])

    def printStatsFile(self, title):
        with open(title, "w") as statsFile:
            s = []
            s.append("-" * 50 + "\n")
            s.append("\tSTATISTICS \tby Juan Cardona\n")
            s.append("-" * 50 + "\n")
            s.append("Number of documents processed: %d\n" % self.stats["docs_count"])
            s.append("Number of tokens extracted: %d\n" % self.stats["tokens_count"])
            s.append("Number of terms extracted: %d\n" % self.stats["terms_count"])
            s.append("Average number of tokens per document: %.2f\n" % self.stats["avg_tokens_by_doc"])
            s.append("Average number of terms per document: %.2f\n" % self.stats["avg_terms_by_doc"])
            s.append("Average term length: %.2f\n" % self.stats["avg_term_length"])
            s.append("Number of tokens in the shortest document: %d\n" % self.stats["shortestDoc"]["tokens_count"])
            s.append("Number of terms in the shortest document: %d\n" % self.stats["shortestDoc"]["terms_count"])
            s.append("Number of tokens in the longest document: %d\n" % self.stats["longestDoc"]["tokens_count"])
            s.append("Number of terms in the longest document: %d\n" % self.stats["longestDoc"]["terms_count"])
            s.append("Number of terms that appear exactly once in the collection: %d\n" % self.stats["terms_freq_one"])
            statsFile.write(''.join(s))
        return title
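
# Hedged usage sketch for the Indexer above. `TextCollection("./corpus")` is a
# hypothetical stand-in for whatever collection object the project builds; the only
# thing the indexer relies on is that it exposes allFiles(). The config dict keys are
# likewise assumptions about what LexAnalyser accepts; only Indexer(collection,
# doStats=...), index(config) and printStatsFile(title) come from the class itself.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    collection = TextCollection("./corpus")           # hypothetical collection object
    config = {"stopwords": True, "stemming": False}   # hypothetical LexAnalyser options
    indexer = Indexer(collection, doStats=True)
    indexer.index(config)
    indexer.printStatsFile("stats.txt")               # write the statistics report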
from Bibliotheque import Bibliotheque
from Documents import Documents
from Volumes import Volumes
from Livres import Livres
from Adherents import Adherents

if __name__ == "__main__":
    biblio1 = Bibliotheque(1)
    # biblio1.creer_bibliotheque(biblio1.getId_bibliotheque())
    # biblio1.liste_bibliotheques()
    # biblio.liste_bibliotheques()

    doc1 = Documents(1, 'Harry Potter à l ecole des sorciers', biblio1.getId_bibliotheque())
    doc2 = Documents(2, 'Harry Potter et la Chambre des secrets', biblio1.getId_bibliotheque())
    doc3 = Documents(3, 'Harry Potter et le Prisonnier d Azkaban', biblio1.getId_bibliotheque())
    doc4 = Documents(4, 'Harry Potter et la Coupe de feu', biblio1.getId_bibliotheque())
    doc5 = Documents(5, 'Harry Potter et l Ordre du Phénix ', biblio1.getId_bibliotheque())
    doc6 = Documents(6, 'Harry Potter et le Prince de sang-mêlé', biblio1.getId_bibliotheque())
    doc7 = Documents(7, 'Harry Potter et les Reliques de la Mort', biblio1.getId_bibliotheque())

    # doc1.creer_document(doc1.getId_document(), doc1.getId_bibliotheque(), doc1.getTitre())
    # doc2.creer_document(doc2.getId_document(), doc2.getId_bibliotheque(), doc2.getTitre())
    # doc3.creer_document(doc3.getId_document(), doc3.getId_bibliotheque(), doc3.getTitre())
    # doc4.creer_document(doc4.getId_document(), doc4.getId_bibliotheque(), doc4.getTitre())
    # doc5.creer_document(doc5.getId_document(), doc5.getId_bibliotheque(), doc5.getTitre())
    # doc6.creer_document(doc6.getId_document(), doc6.getId_bibliotheque(), doc6.getTitre())
    # doc7.creer_document(doc7.getId_document(), doc7.getId_bibliotheque(), doc7.getTitre())
from Documents import Documents
from BayesClassifier import Bayes
from CrossValidation import Validator
from tabulate import tabulate

if __name__ == "__main__":
    documents = Documents.get_all_docs(r'Bayes\pu1', 'spmsg')

    ##
    # For Bayes classification
    # @task - constrain the classifier so that legitimate mail almost never ends up in spam,
    #         even if overall classification quality drops somewhat as a result
    ##
    include_header = True        # True || False
    minimum_occurrence = 3       # None || number => only words that occur more than `minimum_occurrence` times in the training set take part in classification
    discard_deviation = None     # None || number < 0.5 => discard the deviation
    top_n = None                 # None || number => only the `top_n` highest-weighted words of a document take part in classification

    bayes = Bayes(include_header=include_header,
                  minimum_occurrence=minimum_occurrence,
                  discard_deviation=discard_deviation,
                  top_n=top_n)

    ##
    # For cross validation
    ##
    debug_print = False  # True || False
    f_measure, matrix = Validator.validate(bayes, documents, debug_print)

    table = [["T", matrix[0][0], matrix[0][1]],
             ["F", matrix[1][0], matrix[1][1]]]
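
    # Hedged continuation: the script builds `table` but never renders it, and tabulate
    # is imported but unused. The lines below print the table and derive precision,
    # recall and F1 from it, assuming matrix[i][j] counts documents of actual class i
    # predicted as class j (row 0 = "T"/spam, row 1 = "F"/legit). That layout is an
    # assumption about Validator.validate, not a documented contract.
    print(tabulate(table, headers=["actual", "pred spam", "pred legit"], tablefmt="grid"))

    tp, fn = matrix[0][0], matrix[0][1]
    fp, tn = matrix[1][0], matrix[1][1]
    precision = float(tp) / (tp + fp) if (tp + fp) else 0.0
    recall = float(tp) / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    print("precision=%.3f recall=%.3f F1=%.3f (validator F-measure=%.3f)"
          % (precision, recall, f1, f_measure))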
from Documents import Documents
from GA import GA
import pandas as pd
import pickle
from os import path

loadit = False
if loadit:
    docs = pickle.load(open('docs.pkl', 'rb'))
else:
    data_path = '../data/github_issues.csv'
    df = pd.read_csv(data_path)
    docs = Documents()
    # docs.load(list(df['description'])[:300])
    docs.load(list(df['description']))
    docs.vectorise()
    pickle.dump(docs, open('docs.pkl', 'wb+'))

print("No. of documents loaded: {}".format(docs.get_doc_size()))

corpus = docs.get_vectors()
dictionary = docs.get_dictionary()

ga = GA(corpus, dictionary, pop_size=30, fitness_budget=10000, objective='coherence')
ga.initialise_population()
ga.evolve()
def main():
    # Set up corpus for training
    corpus = Documents()
    corpus.readFromFile(root + dataFile, maxline=maxlines)
    '''
    model1=DocClfComplNB(maxStringLength=MAXSTRINGLENGH, \
                         firstStringLength=FIRSTSTRINGLENGTH)
    '''
    model1 = DocClfTLinSVC(maxStringLength=MAXSTRINGLENGH,
                           firstStringLength=FIRSTSTRINGLENGTH,
                           penalty=PENALTY, loss=LOSS, dual=DUAL,
                           maxFeatures=MAXFEATURES)
    print()

    # Split into test and training sets
    xtrain, xtest, ytrain, ytest = \
        train_test_split(corpus.words, corpus.y, test_size=testsize,
                         random_state=random_state)

    ibest = 0
    if len(cvec) > 1:
        scorelist = model1.crossVal(cvec, xtrain, ytrain)
        mincv = 9.e9 * criteriaSign
        # Find the regularisation value that gave the best cross-validation score and use it
        print("case Creg meanCVscore best so far")
        for item in range(0, len(cvec)):
            meancvscore = scorelist[item].mean()
            if meancvscore * criteriaSign < mincv * criteriaSign:
                mincv = meancvscore
                ibest = item
            print("%2d " % item, "%5.2f " % cvec[item],
                  "%8.5f " % meancvscore, "%2d " % ibest)

    # Refit with the selected regularisation value
    model1 = DocClfTLinSVC(maxStringLength=MAXSTRINGLENGH,
                           firstStringLength=FIRSTSTRINGLENGTH,
                           penalty=PENALTY, loss=LOSS, dual=DUAL,
                           maxFeatures=MAXFEATURES, creg=cvec[ibest])

    ytrainpred = model1.fit(xtrain, ytrain)
    ytestpred = model1.predict(xtest)

    trainAccuracy = accuracy_score(ytrain, ytrainpred)
    testAccuracy = accuracy_score(ytest, ytestpred)
    controlAccuracy = accuracy_score(np.random.permutation(ytest), ytestpred)

    global conf_mat
    conf_mat = model1.confidence(ytest, ytestpred)
    print(model1.confidence)
    print()
    print(np.unique(ytestpred, return_counts=True))
    print()

    for key, value in model1.confidence.items():
        print("%-20s" % key + " %5.3f" % value)

    for row in range(0, conf_mat.shape[0]):
        print([" %4d" % conf_mat[row, col] for col in range(0, conf_mat.shape[1])])

    rowsum = conf_mat.sum(axis=0)
    colsum = conf_mat.sum(axis=1)
    labels = list(model1.confidence.keys())
    print("item rowsum colsum")
    for ic in range(0, conf_mat.shape[0]):
        print("%-25s" % labels[ic] + " %5d" % rowsum[ic] + " %5d" % colsum[ic])
    print("")
    print('train=%6.2f test=%6.2f control=%6.2f' %
          (trainAccuracy, testAccuracy, controlAccuracy))

    # Save the fitted model, then write a few labelled test examples to a text file
    pickle.dump(model1, open(outdir + modelName + ".pckmdl", "wb"))
    print(ytestpred[0])
    print(xtest[0][0:20])

    testfile = open(outdir + modelName + "testdata.txt", "wt")
    testfile.write(ytestpred[0])
    testfile.write(",")
    testfile.write(xtest[0])
    testfile.write("\n")
    testfile.write(ytestpred[10])
    testfile.write(",")
    testfile.write(xtest[10])
    testfile.write("\n")
    testfile.write(ytestpred[25])
    testfile.write(",")
    testfile.write(xtest[25])
    testfile.write("\n")
    testfile.close()
    print(model1.message)
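
# Hedged inference sketch, to be read as a separate script: the path below is
# illustrative (main() above writes outdir + modelName + ".pckmdl"), and the
# DocClfTLinSVC class and its dependencies must be importable for unpickling.
# Only pickle.load and model.predict, both used in main(), are relied on here.
import pickle

MODEL_PATH = "model.pckmdl"   # hypothetical path to the pickled model

with open(MODEL_PATH, "rb") as fh:
    model = pickle.load(fh)

sample_docs = ["example document text to classify"]   # hypothetical input strings
print(model.predict(sample_docs))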