def __init__(self, collection, doStats=False, postingsFile=False):
    self.collection = collection
    self.lexAnalyser = False
    self.calculateStats = doStats
    self.vocabulary = Vocabulary()
    self.postings = DictionaryPostings({})
    self.documents = Documents()
    self.maxFreqInDocs = {}
    #self.positions = DictionaryPostings({})
    if self.calculateStats:
        self.stats = self.getInitStats()
Example #2
import pandas as pd
from os import path


def run():
    # load_data, _mkdir_p, topic_modeling, the scoring helpers and n_top_words
    # are assumed to be defined elsewhere in the source module.
    data_path = '../data/data.csv'
    result_path = '../analysis_result/sensitive_analysis'
    _mkdir_p(result_path)

    X, df, terms = load_data(data_path)
    docs = Documents(df)

    n_components = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
    k = 6  # fixed k, used only by the commented-out alpha/l1_ratio sweep below
    records = []
    alphas = [0, 0.1, 0.2, 0.5, 0.8, 1]
    l1_ratios = [0, 0.2, 0.5, 0.8, 1]

    for k in n_components:
    # for alpha in alphas:
    #     for l1_ratio in l1_ratios:

        topic_model = topic_modeling(X, k)
        # evaluate the topics
        top_idxs_topics = get_idx_for_top_words(topic_model, n_top_words)
        print(top_idxs_topics)
        mpj_score = mean_pairwise_Jaccard_Similarity(top_idxs_topics, k)

        top_words_topics = get_top_words(topic_model, terms, n_top_words)
        print(top_words_topics)

        mean_coherence_score = avg_coherence_by_UMass(top_words_topics, docs, k)

        records.append({
            'k': k,
            'mean_pairwise_jaccard': mpj_score,
            'coherence_UMass': mean_coherence_score
        })


    pd.DataFrame(data=records).to_csv(path.join(result_path, 'topics_k_scores_2.csv'), index=False)
Example #3
def docpeek():
    # Load the corpus and report per-class document counts plus per-class
    # word-count statistics (count/min/max/mean/std).
    corpus = Documents()
    corpus.readFromFile(root + dataFile, maxline=maxlines)
    print([(i, corpus.y.count(i)) for i in corpus.classes])
    corpus.makeDataFrame()
    x1 = corpus.df[['class', 'count']].groupby(by='class')
    cnt = x1.count()
    a1 = x1.min()
    a2 = x1.max()
    a3 = x1.mean()
    a4 = x1.std()
    cnt.columns = ['count']
    a1.columns = ['min']
    a2.columns = ['max']
    a3.columns = ['mean']
    a4.columns = ['std']
    q = cnt.merge(a1, left_index=True, right_index=True)\
        .merge(a2, left_index=True, right_index=True)\
        .merge(a3, left_index=True, right_index=True)
    q = q.merge(a4, left_index=True, right_index=True)
    return corpus, q
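
For reference, the same per-class summary can be produced with a single groupby().agg() call; a minimal sketch (the helper name docpeek_agg is hypothetical), assuming corpus.df carries the 'class' and 'count' columns built by makeDataFrame():

def docpeek_agg(corpus):
    # Sketch only: per-class count/min/max/mean/std of the 'count' column.
    return (corpus.df[['class', 'count']]
            .groupby('class')['count']
            .agg(['count', 'min', 'max', 'mean', 'std']))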
Example #4
def main():
    # Set up corpus for training
    corpus = Documents()
    corpus.readFromFile(root + dataFile, maxline=maxlines)
    # Alternative model (kept for reference):
    # model1 = DocClfComplNB(maxStringLength=MAXSTRINGLENGH,
    #                        firstStringLength=FIRSTSTRINGLENGTH)
    model1 = DocClf2(maxStringLength=MAXSTRINGLENGH,
                     firstStringLength=FIRSTSTRINGLENGTH)
    print()
    # split into test and training sets
    xtrain, xtest, ytrain, ytest = \
        train_test_split(corpus.words, corpus.y, test_size=testsize,
                         random_state=random_state)
    ytrainpred = model1.fit(xtrain, ytrain)
    ytestpred = model1.predict(xtest)


    print([(i, ytest.count(i)) for i in sorted(set(ytest))])

    trainAccuracy = accuracy_score(ytrain, ytrainpred)
    testAccuracy = accuracy_score(ytest, ytestpred)
    controlAccuracy = accuracy_score(np.random.permutation(ytest), ytestpred)
    
    
    global conf_mat
    conf_mat = model1.confidence(ytest, ytestpred)
    print(model1.confidence)
    print()
    print(np.unique(ytestpred, return_counts=True))
    print()
    
    for key, value in model1.confidence.items():
        print("%-25s" % key + " %5.3f" % value)

    labels = list(model1.confidence.keys())
    for row in range(0, conf_mat.shape[0]):
        print([" %4d" % conf_mat[row, col] for col in range(0, conf_mat.shape[1])])
    
    rowsum = conf_mat.sum(axis=1)   # totals across each row
    colsum = conf_mat.sum(axis=0)   # totals down each column
    print("item     rowsum      colsum")
    for ic in range(0, conf_mat.shape[0]):
        print("%-25s" % labels[ic] + " %5d" % rowsum[ic] + " %5d" % colsum[ic])
      
    print("")
    print('train=%6.2f  test=%6.2f control=%6.2f' % 
          (trainAccuracy,testAccuracy,controlAccuracy))
 
    with open(root + modelName + ".pckmdl", "wb") as modelfile:
        pickle.dump(model1, modelfile)
    print(model1.confidence)
    print(ytestpred[0])
    print(xtest[0][0:20])
    testfile = open(root + modelName + "testdata.txt", "wt")
    
    testfile.write(ytestpred[0])
    testfile.write("\n")
    testfile.write(xtest[0])
    testfile.write("\n")
    testfile.write(ytestpred[10])
    testfile.write("\n")
    testfile.write(xtest[10])
    testfile.write("\n")
    testfile.close()
    print( model1.message)
Example #5

class Indexer(object):
    def __init__(self, collection, doStats=False, postingsFile=False):
        self.collection = collection
        self.lexAnalyser = False
        self.calculateStats = doStats
        self.vocabulary = Vocabulary()
        self.postings = DictionaryPostings({})
        self.documents = Documents()
        self.maxFreqInDocs = {}
        #self.positions = DictionaryPostings({})
        if self.calculateStats:
            self.stats = self.getInitStats()

    def index(self, config):
        """Indexes the given collection."""

        # Configure the lexical analyser
        self.lexAnalyser = LexAnalyser(config)

        #-----------------READ-COLLECTION--------------#
        docId = 0
        totalDocs = len(self.collection.allFiles())
        number_of_logs = 1 if totalDocs < 50 else totalDocs // 50
        for filePath in self.collection.allFiles():
            if (not filePath.lower().endswith('.txt')):
                logging.warning("following file will not be indexed: " +
                                filePath)
                continue

            # Record the current file's data
            actualDoc = {
                "name": os.path.basename(os.path.normpath(filePath)),
                "path": filePath
            }

            #----------READ-FILE--------------------#
            if (docId + 1) % number_of_logs == 0:
                logging.info("Loading %s (%d/%d)" %
                             (actualDoc["name"], docId + 1, totalDocs))
            with codecs.open(filePath, mode='rt', encoding='utf-8') as f:

                # Collect the document's tokens and terms
                tokens = []
                terms = []

                for line in f:
                    # Apply tokenisation, stopword removal, etc. (per config)
                    analysed = self.lexAnalyser.analyse(line)
                    terms.extend(analysed["terms"])
                    tokens.extend(analysed["tokens"])
                    analysed = None

            # Store the current document
            self.documents.addDocument(docId, actualDoc["path"])

            # The set of distinct terms of each document
            #self.documentsTerms[docId] = set()

            # Update the vocabulary
            self.updateIndex(docId, terms)
            # Update the stats
            if self.calculateStats:
                self.updateStats(tokens, terms)

            tokens = None
            terms = None
            docId += 1
            #------END-READ-FILE--------------------#

        #----------------END-READ-COLLECTION---------#
        if self.calculateStats:
            logging.info("Generating stats")
            self.endStats()

        #logging.info(u"Sorting vocabulary alphabetically")
        #self.vocabulary.setAlphabeticalOrder()
        #logging.info(u"Generating term ids")
        #self.setTermsId()
        #logging.info(u"Sorting postings by key")
        #self.postings.sortByKey()
        #self.positions.sortByKey()
        #logging.info(u"Computing the maximum frequency of each document")
        #self.loadMaxFreqs()

    def updateIndex(self, docId, terms):
        position = 0
        termToFreq = {}
        for t in terms:
            #self.documentsTerms[docId].add(t)
            # If the term is not in the vocabulary, add it and initialise its data
            if not self.vocabulary.isATerm(t):
                termId = self.vocabulary.addTerm(t, 1.0, 1.0)
                #self.postings.addPosting(termId, docId, 1.0)
                #self.positions.addPosting(t, docId, [position])
                termToFreq[termId] = 1
            else:
                self.vocabulary.incrementCF(t, 1.0)
                # First occurrence of this term in the current document?
                termId = self.vocabulary.getId(t)
                if termId not in termToFreq:
                    termToFreq[termId] = 1
                    self.vocabulary.incrementDF(t, 1.0)
                    #self.postings.addDocToPosting(termId, docId, 1.0)
                    #self.positions.addDocToPosting(t, docId, [position])
                # else: the term already appeared in this document
                else:
                    termToFreq[termId] += 1
                    # Update postings with frequencies
                    #self.postings.addDocToPosting(termId, docId, self.postings.getValue(termId, docId) + 1.0)
                    # Update positional postings
                    #positionList = self.positions.getValue(t, docId)
                    #positionList.append(position)
                    #self.positions.addDocToPosting(t, docId, positionList)
            #position += 1
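        # At this point termToFreq maps each termId to its frequency in this
        # document; e.g. for terms == ['a', 'b', 'a'] it holds {id_a: 2, id_b: 1},
        # so one posting per distinct term is written below and
        # maxFreqInDocs[docId] ends up as 2.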
        for tId in termToFreq:
            self.postings.addPosting(tId, docId, termToFreq[tId])
        maxValue = max(termToFreq.values()) if termToFreq else 0
        termToFreq = None
        self.maxFreqInDocs[docId] = maxValue

    def getInitStats(self):
        out = {
            "tokens_count": 0.0,
            "terms_count": 0.0,
            "docs_count": 0.0,
            "longestDoc": {
                "tokens_count": -1,
                "terms_count": -1
            },
            "shortestDoc": {
                "tokens_count": sys.maxsize,
                "terms_count": sys.maxsize
            }
        }
        return out

    def updateStats(self, tokens, terms):
        tokensLength = len(tokens)
        termsLength = len(set(terms))

        self.stats["tokens_count"] += tokensLength
        self.stats["docs_count"] += 1.0

        # Is this the longest document so far?
        if tokensLength >= self.stats["longestDoc"]["tokens_count"]:
            self.stats["longestDoc"]["tokens_count"] = tokensLength
            self.stats["longestDoc"]["terms_count"] = termsLength
        # Is this the shortest document so far?
        if tokensLength <= self.stats["shortestDoc"]["tokens_count"]:
            self.stats["shortestDoc"]["tokens_count"] = tokensLength
            self.stats["shortestDoc"]["terms_count"] = termsLength

    def endStats(self):
        numberOfTerms = len(self.vocabulary.content)
        self.stats["terms_count"] = numberOfTerms

        if self.stats["docs_count"] == 0:
            self.stats["avg_tokens_by_doc"] = 0
            self.stats["avg_terms_by_doc"] = 0
        else:
            self.stats["avg_tokens_by_doc"] = self.stats[
                "tokens_count"] / self.stats["docs_count"]
            self.stats["avg_terms_by_doc"] = self.stats[
                "terms_count"] / self.stats["docs_count"]

        self.stats["avg_term_length"] = 0 if numberOfTerms == 0 else sum(
            [len(key)
             for key in self.vocabulary.content]) / (numberOfTerms + 0.0)
        self.stats["terms_freq_one"] = len([
            key for key in self.vocabulary.content
            if self.vocabulary.getCF(key) == 1
        ])
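        # Worked example: a vocabulary of {'cat' (CF=3), 'dog' (CF=1)} yields
        # terms_count == 2, avg_term_length == 3.0 and terms_freq_one == 1.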

    def printStatsFile(self, title):
        with open(title, "w") as statsFile:
            s = []
            s.append("-" * 50 + "\n")
            s.append("\tSTATISTICS \tby Juan Cardona\n")
            s.append("-" * 50 + "\n")
            s.append("Number of documents processed: %d\n" %
                     self.stats["docs_count"])
            s.append("Number of tokens extracted: %d\n" %
                     self.stats["tokens_count"])
            s.append("Number of terms extracted: %d\n" %
                     self.stats["terms_count"])
            s.append("Average number of tokens per document: %.2f\n" %
                     self.stats["avg_tokens_by_doc"])
            s.append("Average number of terms per document: %.2f\n" %
                     self.stats["avg_terms_by_doc"])
            s.append("Average term length: %.2f\n" %
                     self.stats["avg_term_length"])
            s.append("Number of tokens in the shortest document: %d\n" %
                     self.stats["shortestDoc"]["tokens_count"])
            s.append("Number of terms in the shortest document: %d\n" %
                     self.stats["shortestDoc"]["terms_count"])
            s.append("Number of tokens in the longest document: %d\n" %
                     self.stats["longestDoc"]["tokens_count"])
            s.append("Number of terms in the longest document: %d\n" %
                     self.stats["longestDoc"]["terms_count"])
            s.append(
                "Number of terms that appear exactly once in the collection: %d\n"
                % self.stats["terms_freq_one"])
            statsFile.write(''.join(s))
        return title
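
A minimal usage sketch for the Indexer above; the Collection constructor and the LexAnalyser config values shown here are assumptions, only index() and printStatsFile() come from the class itself:

if __name__ == "__main__":
    # Hypothetical driver: collection must expose allFiles() returning paths
    # to .txt documents, and config is whatever LexAnalyser expects.
    collection = Collection("./corpus")
    config = {"stopwords": True, "stemming": False}
    indexer = Indexer(collection, doStats=True)
    indexer.index(config)
    indexer.printStatsFile("stats.txt")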
Example #6
from Bibliotheque import Bibliotheque
from Documents import Documents
from Volumes import Volumes
from Livres import Livres
from Adherents import Adherents

if __name__ == "__main__":

    biblio1 = Bibliotheque(1)
    # biblio1.creer_bibliotheque(biblio1.getId_bibliotheque())
    # biblio1.liste_bibliotheques()
    
    # biblio.liste_bibliotheques()
    
    doc1 = Documents(1, 'Harry Potter à l ecole des sorciers', biblio1.getId_bibliotheque())

    doc2 = Documents(2, 'Harry Potter et la Chambre des secrets', biblio1.getId_bibliotheque())
    doc3 = Documents(3, 'Harry Potter et le Prisonnier d Azkaban', biblio1.getId_bibliotheque())
    doc4 = Documents(4, 'Harry Potter et la Coupe de feu', biblio1.getId_bibliotheque())
    doc5 = Documents(5, 'Harry Potter et l Ordre du Phénix ', biblio1.getId_bibliotheque())
    doc6 = Documents(6, 'Harry Potter et le Prince de sang-mêlé', biblio1.getId_bibliotheque())
    doc7 = Documents(7, 'Harry Potter et les Reliques de la Mort', biblio1.getId_bibliotheque())
    
    # doc1.creer_document(doc1.getId_document(),doc1.getId_bibliotheque(),doc1.getTitre())
    # doc2.creer_document(doc2.getId_document(),doc2.getId_bibliotheque(),doc2.getTitre())
    # doc3.creer_document(doc3.getId_document(),doc3.getId_bibliotheque(),doc3.getTitre())
    # doc4.creer_document(doc4.getId_document(),doc4.getId_bibliotheque(),doc4.getTitre())
    # doc5.creer_document(doc5.getId_document(),doc5.getId_bibliotheque(),doc5.getTitre())
    # doc6.creer_document(doc6.getId_document(),doc6.getId_bibliotheque(),doc6.getTitre())
    # doc7.creer_document(doc7.getId_document(),doc7.getId_bibliotheque(),doc7.getTitre())
Example #7
from Documents import Documents
from BayesClassifier import Bayes
from CrossValidation import Validator
from tabulate import tabulate

if __name__ == "__main__":
    documents = Documents.get_all_docs(r'Bayes\pu1', 'spmsg')

    ##
    # For Bayes classification
    # @task - constrain the classifier so that legitimate mail almost never ends up in spam, even if overall classification quality drops somewhat
    ##
    include_header = True  # True || False
    minimum_occurrence = 3  # None || Number        => only words that occur more than minimum_occurrence times in the training set take part in classification
    discard_deviation = None  # None || Number < 0.5  => discard the deviation
    top_n = None  # None || Number        => only the `top_n` highest-weighted words in a document take part in classification

    bayes = Bayes(include_header=include_header,
                  minimum_occurrence=minimum_occurrence,
                  discard_deviation=discard_deviation,
                  top_n=top_n)

    ##
    # For Cross Validation
    ##
    debug_print = False  # True || False

    f_measure, matrix = Validator.validate(bayes, documents, debug_print)

    table = [["T", matrix[0][0], matrix[0][1]],
             ["F", matrix[1][0], matrix[1][1]]]
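
The snippet above builds table and computes f_measure but never prints them; a minimal way to render them with the already-imported tabulate (the column labels are an assumption) would be:

    print("F-measure: {}".format(f_measure))
    print(tabulate(table, headers=["", "predicted T", "predicted F"]))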
Example #8
File: main.py  Project: vnck/EvoTopic
from Documents import Documents
from GA import GA
import pandas as pd
import pickle
from os import path

loadit = False

if loadit:
    docs = pickle.load(open('docs.pkl', 'rb'))
else:
    data_path = '../data/github_issues.csv'
    df = pd.read_csv(data_path)
    docs = Documents()
    #   docs.load(list(df['description'])[:300])
    docs.load(list(df['description']))
    docs.vectorise()
    pickle.dump(docs, open('docs.pkl', 'wb+'))

print("No. of documents loaded: {}".format(docs.get_doc_size()))

corpus = docs.get_vectors()
dictionary = docs.get_dictionary()

ga = GA(corpus,
        dictionary,
        pop_size=30,
        fitness_budget=10000,
        objective='coherence')
ga.initialise_population()
ga.evolve()
Example #9
def main():
    # Set up corpus for training
    corpus = Documents()
    corpus.readFromFile(root + dataFile, maxline=maxlines)
    # Alternative model (kept for reference):
    # model1 = DocClfComplNB(maxStringLength=MAXSTRINGLENGH,
    #                        firstStringLength=FIRSTSTRINGLENGTH)
    model1 = DocClfTLinSVC(maxStringLength=MAXSTRINGLENGH,
                           firstStringLength=FIRSTSTRINGLENGTH,
                           penalty=PENALTY, loss=LOSS, dual=DUAL,
                           maxFeatures=MAXFEATURES)
    print()

    # split into test and training sets
    xtrain, xtest, ytrain, ytest = \
        train_test_split(corpus.words, corpus.y, test_size=testsize,
                         random_state=random_state)
    ibest = 0
    if (len(cvec) > 1):
        scorelist = model1.crossVal(cvec, xtrain, ytrain)
        mincv = 9.e9 * criteriaSign

        # find value that gave best cross validation score & use it
        print("case  Creg     meanCVscore  best so far")
        for item in range(0, len(cvec)):
            meancvscore = scorelist[item].mean()
            if meancvscore * criteriaSign < mincv * criteriaSign:
                mincv = meancvscore
                ibest = item
                print("%2d   " % item, "%5.2f   " % cvec[item],
                      "%8.5f   " % meancvscore, "%2d " % ibest)


    # Refit with the best regularisation value found above
    model1 = DocClfTLinSVC(maxStringLength=MAXSTRINGLENGH,
                           firstStringLength=FIRSTSTRINGLENGTH,
                           penalty=PENALTY, loss=LOSS, dual=DUAL,
                           maxFeatures=MAXFEATURES, creg=cvec[ibest])

    ytrainpred = model1.fit(xtrain, ytrain)
    ytestpred = model1.predict(xtest)

    trainAccuracy = accuracy_score(ytrain, ytrainpred)
    testAccuracy = accuracy_score(ytest, ytestpred)
    controlAccuracy = accuracy_score(np.random.permutation(ytest), ytestpred)

    global conf_mat
    conf_mat = model1.confidence(ytest, ytestpred)
    print(model1.confidence)
    print()
    print(np.unique(ytestpred, return_counts=True))
    print()

    for key, value in model1.confidence.items():
        print("%-20s" % key + " %5.3f" % value)
    for row in range(0, conf_mat.shape[0]):
        print([
            " %4d" % conf_mat[row, col] for col in range(0, conf_mat.shape[1])
        ])

    rowsum = conf_mat.sum(axis=1)   # totals across each row
    colsum = conf_mat.sum(axis=0)   # totals down each column
    labels = list(model1.confidence.keys())
    print("item     rowsum      colsum")
    for ic in range(0, conf_mat.shape[0]):
        print("%-25s" % labels[ic] + " %5d" % rowsum[ic] + " %5d" % colsum[ic])

    print("")
    print('train=%6.2f  test=%6.2f control=%6.2f' %
          (trainAccuracy, testAccuracy, controlAccuracy))
    # compute accuracy given predicted value

    with open(outdir + modelName + ".pckmdl", "wb") as modelfile:
        pickle.dump(model1, modelfile)

    print(ytestpred[0])
    print(xtest[0][0:20])
    testfile = open(outdir + modelName + "testdata.txt", "wt")

    testfile.write(ytestpred[0])
    testfile.write(",")
    testfile.write(xtest[0])
    testfile.write("\n")
    testfile.write(ytestpred[10])
    testfile.write(",")
    testfile.write(xtest[10])
    testfile.write("\n")
    testfile.write(ytestpred[25])
    testfile.write(",")
    testfile.write(xtest[25])
    testfile.write("\n")
    testfile.close()
    print(model1.message)