Example #1
def main():
    import vocabulary
    # from sklearn.decomposition import PCA
    import pickle

    corpus = vocabulary.load_file('mood.txt')
    voca = vocabulary.Vocabulary(True)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
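    # hold out everything after the first 450 documents as a test set (assumes mood.txt yields more than 450 documents)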
    doctrain = docs[:450]
    doctest = docs[450:]

    # docs = voca.cut_low_freq(docs, 1)
    # SET parameter
    K = 10  # number of topics
    alpha, beta = 0.5, 0.5
    V = voca.size()

    f = FileOutput("lda_trainning")
    f.out("corpus=%d, words=%d, K=%d, alpha=%f, beta=%f" %
          (len(docs), len(voca.vocas), K, alpha, beta))
    LDA = lda.LDA(K, alpha, beta, docs, V)
    lda_training(f, LDA, voca, iteration=30)  # set number of iterations
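    # per-document topic distributions, split the same way as the corpus: the first 450 rows for training, the rest held out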
    theta = LDA.theta()[:450]
    newtheta = LDA.theta()[450:]
    with open("theta.pk", 'wb') as f:
        pickle.dump(theta, f)
    with open("newtheta.pk", 'wb') as f:
        pickle.dump(newtheta, f)
Example #2
def main():
    import optparse
    import vocabulary
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5)
    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=20)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
    parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False)
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0)
    (options, args) = parser.parse_args()
    if not (options.filename or options.corpus): parser.error("need corpus filename(-f) or corpus range(-c)")

    if options.filename:
        corpus = vocabulary.load_file(options.filename)
    else:
        corpus = vocabulary.load_corpus(options.corpus)
        if not corpus: parser.error("corpus range(-c) forms 'start:end'")
    if options.seed != None:
        numpy.random.seed(options.seed)

    voca = vocabulary.Vocabulary(options.stopwords)
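    # map each tokenized document to a list of integer word ids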
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0: docs = voca.cut_low_freq(docs, options.df)

    lda = LDA(options.K, options.alpha, options.beta, docs, voca.size(), options.smartinit)
    print "corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(corpus), len(voca.vocas), options.K, options.alpha, options.beta)

    #import cProfile
    #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile')
    lda_learning(lda, options.iteration, voca)
def main():
    import optparse
    import vocabulary
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5)
    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=20)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
    parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False)
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0)
    (options, args) = parser.parse_args()
    if not (options.filename or options.corpus): parser.error("need corpus filename(-f) or corpus range(-c)")

    if options.filename:
        corpus = vocabulary.load_file(options.filename)
    else:
        corpus = vocabulary.load_corpus(options.corpus)
        if not corpus: parser.error("corpus range(-c) forms 'start:end'")
    if options.seed != None:
        numpy.random.seed(options.seed)

    voca = vocabulary.Vocabulary(options.stopwords)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0: docs = voca.cut_low_freq(docs, options.df)

    lda = LDA(options.K, options.alpha, options.beta, docs, voca.size(), options.smartinit)

    #import cProfile
    #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile')
    lda_learning(lda, options.iteration, voca)
def getVocabulary():
    import vocabulary
    import random
    
    data_path = "/media/University/UniversityDisc/2-Master/MasterThesis/EjecucionTesis/Desarrollo/PythonProjects/Data/"    
    filename = data_path + "aolGoals.txt"   
    stopwords=True
    df=0
    
    #(options, args) = parser.parse_args()
    if not filename: print "need corpus filename(-l) or corpus range(-l)"

    if filename:
        corpus = vocabulary.load_file(filename)
    else:
        # unused in this snippet: filename is always set above
        corpus = None
        if not corpus: print "corpus range(-l) forms 'start:end'"
    
    cp = list(corpus)
    random.seed(326719) 
    random.shuffle(cp)
    
    p = int(len(cp) * .7)
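    # 70/30 split of the shuffled corpus: the first 70% for training, the rest for testing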
    cp_train = cp[0:p]
    cp_test = cp[p:]


    print "Corpus to Test:", len(cp_test)
    print "Corpus to Train:", len(cp_train)

    voca = vocabulary.Vocabulary(stopwords)
    docs = [voca.doc_to_ids(doc) for doc in cp_train]
    if df > 0: docs = voca.cut_low_freq(docs, df)
    
    return voca, docs, cp_train
Example #5
def main():
    import optparse
    import vocabulary
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5)
    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5)
    parser.add_option("--lamda", dest="lamda", type="float", help="parameter lamda", default=0.5)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=20)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
    parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False)
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0)
    (options, args) = parser.parse_args()
    if not options.filename: parser.error("need corpus filename(-f)")
    if options.filename:
        (pids,tids) = vocabulary.load_file(options.filename)
    if options.seed != None:
        numpy.random.seed(options.seed)
    #voca is the object which stores the data structures needed by LDA    
    voca = vocabulary.Vocabulary(options.stopwords)
    docs = voca.PT_to_idlist(pids, tids)
    #print docs
    size_of_vocab = max(tids) + 1  # vocabulary size inferred from the largest term id (tids are assumed to already be integer ids)
    lda = BLDA(options.K, options.alpha, options.beta, options.lamda, docs, size_of_vocab, options.smartinit)
    #print "corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(corpus), len(voca.vocas), options.K, options.alpha, options.beta)
    blda_learning(lda, options.iteration)
def main():
    import optparse
    import vocabulary
    global out_dir
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("--per",
                      dest="per_weight",
                      type="float",
                      help="person weight",
                      default=0.2)
    parser.add_option("--loc",
                      dest="loc_weight",
                      type="float",
                      help="location weight",
                      default=0.4)
    parser.add_option("--org",
                      dest="org_weight",
                      type="float",
                      help="organisation weight",
                      default=0.1)
    (options, args) = parser.parse_args()
    '''
    parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5)
    parser.add_option("--eta1", dest="eta1", type="float", help="parameter eta for ner word", default=0.4)
    parser.add_option("--eta2", dest="eta2", type="float", help="parameter eta for Non-ner word", default=0.2)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=20)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=10)
    parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False)
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0)
    (options, args) = parser.parse_args()
    pass
    '''
    if options.filename:
        corpus, doc_ids, event_list = vocabulary.load_file(options.filename)
    else:
        options.filename = 'filtered_event_new2.pkl'
        corpus, doc_ids, event_list = vocabulary.load_file(options.filename)

    load_file(options.filename, options.per_weight, options.loc_weight,
              options.org_weight)
def main():
    import vocabulary

    corpus = vocabulary.load_file("small_train.txt")

    voca = vocabulary.Vocabulary()
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    df = 0  # this hard-coded example has no command-line options; raise df to drop low-frequency words
    if df > 0: docs = voca.cut_low_freq(docs, df)

    lda = LDA(20, 0.5, 0.5, docs, voca.size(), False)

    lda_learning(lda, 100, voca)
Example #8
def main():
    import optparse

    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=numpy.random.gamma(1, 1))
    parser.add_option("--gamma", dest="gamma", type="float", help="parameter gamma", default=numpy.random.gamma(1, 1))
    parser.add_option("--base", dest="base", type="float", help="parameter of base measure H", default=0.5)
    parser.add_option("-k", dest="K", type="int", help="initial number of topics", default=1)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=10)
    parser.add_option("-s", dest="stopwords", type="int", help="0=exclude stop words, 1=include stop words", default=1)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0)
    (options, args) = parser.parse_args()
    if not (options.filename or options.corpus):
        parser.error("need corpus filename(-f) or corpus range(-c)")
    if options.seed != None:
        numpy.random.seed(options.seed)

    import vocabulary

    if options.filename:
        corpus = vocabulary.load_file(options.filename)
    else:
        corpus = vocabulary.load_corpus(options.corpus)
        if not corpus:
            parser.error("corpus range(-c) forms 'start:end'")

    voca = vocabulary.Vocabulary(options.stopwords == 0)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0:
        docs = voca.cut_low_freq(docs, options.df)

    hdplda = HDPLDA(options.K, options.alpha, options.gamma, options.base, docs, voca.size())
    print "corpus=%d words=%d alpha=%f gamma=%f base=%f initK=%d stopwords=%d" % (
        len(corpus),
        len(voca.vocas),
        options.alpha,
        options.gamma,
        options.base,
        options.K,
        options.stopwords,
    )
    # hdplda.dump()

    # import cProfile
    # cProfile.runctx('hdplda_learning(hdplda, options.iteration)', globals(), locals(), 'hdplda.profile')
    hdplda_learning(hdplda, options.iteration)

    phi = hdplda.worddist()
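    # print the 20 highest-probability words for each topic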
    for k, phi_k in enumerate(phi):
        print "\n-- topic: %d" % k
        for w in numpy.argsort(-phi_k)[:20]:
            print "%s: %f" % (voca[w], phi_k[w])
Example #9
def main():
    import optparse
    import vocabulary
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="corpus_filename", help="corpus filename")
    parser.add_option("--alpha",
                      dest="alpha",
                      type="float",
                      help="parameter alpha",
                      default=0.5)
    parser.add_option("--beta",
                      dest="beta",
                      type="float",
                      help="parameter beta",
                      default=0.5)
    parser.add_option("-k",
                      dest="K",
                      type="int",
                      help="number of topics",
                      default=20)
    parser.add_option("-i",
                      dest="iteration",
                      type="int",
                      help="iteration count",
                      default=100)
    #parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False)
    parser.add_option("--stopwords",
                      dest="stopwords",
                      help="exclude stop words",
                      action="store_true",
                      default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df",
                      dest="df",
                      type="int",
                      help="threshold of document frequency to cut words",
                      default=0)
    (options, args) = parser.parse_args()
    if options.corpus_filename:
        corpus = vocabulary.load_file(options.corpus_filename)
    else:
        corpus = None
        if not corpus: parser.error("need corpus filename(-f)")
    if options.seed != None:
        numpy.random.seed(options.seed)

    voca = vocabulary.Vocabulary(options.stopwords)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0: docs = voca.cut_low_freq(docs, options.df)

    lda = LDA(options.K, options.alpha, options.beta, docs, voca.size())
    #print "corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(corpus), len(voca.vocas), options.numberOfTopics, options.alpha, options.beta)

    lda_learning(lda, options.iteration, voca)
def main():
    import optparse
    import vocabulary  # vocabulary package

    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5)
    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=20)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
    parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False)
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0)
    (options, args) = parser.parse_args()

    # options = eval(str(options))
    # for k,v in options.items():
    #     print(k,v)
    help(vocabulary)
    if not (options.filename or options.corpus): # both of these default to None
        parser.error("need corpus filename(-f) or corpus range(-c)")


    if options.filename:
        corpus = vocabulary.load_file(options.filename)
    else:
        corpus = vocabulary.load_corpus(options.corpus)

        if not corpus:
            parser.error("corpus range(-c) forms 'start:end'")

    if options.seed != None:
        numpy.random.seed(options.seed)

    voca = vocabulary.Vocabulary(options.stopwords)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0: docs = voca.cut_low_freq(docs, options.df)

    lda = LDA(options.K, options.alpha, options.beta, docs, voca.size(), options.smartinit)
    print("corpus=%d,https://blog.csdn.net/baidu_15113429/article/details/79655247 words=%d, K=%d, a=%f, b=%f" % (
    len(corpus), len(voca.vocas), options.K, options.alpha, options.beta))

    # import cProfile
    #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile')
    lda_learning(lda, options.iteration, voca)
Example #11
def main():
    import optparse
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=numpy.random.gamma(1, 1))
    parser.add_option("--gamma", dest="gamma", type="float", help="parameter gamma", default=numpy.random.gamma(1, 1))
    parser.add_option("--base", dest="base", type="float", help="parameter of base measure H", default=0.5)
    parser.add_option("-k", dest="K", type="int", help="initial number of topics", default=1)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=10)
    parser.add_option("-s", dest="stopwords", type="int", help="0=exclude stop words, 1=include stop words", default=1)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0)
    (options, args) = parser.parse_args()
    if not (options.filename or options.corpus): parser.error("need corpus filename(-f) or corpus range(-c)")
    if options.seed != None:
        numpy.random.seed(options.seed)

    import vocabulary
    if options.filename:
        corpus = vocabulary.load_file(options.filename)
    else:
        corpus = vocabulary.load_corpus(options.corpus)
        if not corpus: parser.error("corpus range(-c) forms 'start:end'")

    voca = vocabulary.Vocabulary(options.stopwords==0)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0: docs = voca.cut_low_freq(docs, options.df)

    hdplda = HDPLDA(options.K, options.alpha, options.gamma, options.base, docs, voca.size())
    print "corpus=%d words=%d alpha=%f gamma=%f base=%f initK=%d stopwords=%d" % (len(corpus), len(voca.vocas), options.alpha, options.gamma, options.base, options.K, options.stopwords)
    #hdplda.dump()

    #import cProfile
    #cProfile.runctx('hdplda_learning(hdplda, options.iteration)', globals(), locals(), 'hdplda.profile')
    hdplda_learning(hdplda, options.iteration)

    phi = hdplda.worddist()
    for k, phi_k in enumerate(phi):
        print "\n-- topic: %d" % k
        for w in numpy.argsort(-phi_k)[:20]:
            print "%s: %f" % (voca[w], phi_k[w])
Example #12
def main():
    import optparse
    import vocabulary
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename", default='complete_document_one_2_one.pk')
    parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.1)
    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.1)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=10)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=500)
    parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=True)
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true",
                      default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0)
    (options, args) = parser.parse_args()
    if not (options.filename or options.corpus): parser.error("need corpus filename(-f) or corpus range(-c)")

    corpus = vocabulary.load_file(options.filename)
    if options.seed != None:
        numpy.random.seed(options.seed)

    lda = LDA(options.K, options.alpha, options.beta, corpus, [300, 1000], options.smartinit)
    lda_learning(lda, options.iteration)
Example #13
def main():
    import optparse
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=numpy.random.gamma(1, 1))
    parser.add_option("--gamma", dest="gamma", type="float", help="parameter gamma", default=numpy.random.gamma(1, 1))
    parser.add_option("--beta", dest="beta", type="float", help="parameter of beta measure H", default=0.5)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=10)
    parser.add_option("-s", dest="stopwords", type="int", help="0=exclude stop words, 1=include stop words", default=1)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0)
    (options, args) = parser.parse_args()
    if not (options.filename or options.corpus): parser.error("need corpus filename(-f) or corpus range(-c)")
    if options.seed != None:
        numpy.random.seed(options.seed)

    import vocabulary
    if options.filename:
        corpus = vocabulary.load_file(options.filename)
    else:
        corpus = vocabulary.load_corpus(options.corpus)
        if not corpus: parser.error("corpus range(-c) forms 'start:end'")

    voca = vocabulary.Vocabulary(options.stopwords==0)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0: docs = voca.cut_low_freq(docs, options.df)

    hdplda = HDPLDA(options.alpha, options.gamma, options.beta, docs, voca.size())
    print "corpus=%d words=%d alpha=%.3f gamma=%.3f beta=%.3f stopwords=%d" % (len(corpus), len(voca.vocas), options.alpha, options.gamma, options.beta, options.stopwords)
    #hdplda.dump()

    #import cProfile
    #cProfile.runctx('hdplda_learning(hdplda, options.iteration)', globals(), locals(), 'hdplda.profile')
    hdplda_learning(hdplda, options.iteration)
    output_summary(hdplda, voca)
Example #14
def main():
    import optparse
    parser = optparse.OptionParser()
    parser.add_option("-f",
                      dest="filename",
                      help="corpus filename",
                      default="1000_p.txt")
    parser.add_option("-c",
                      dest="corpus",
                      help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha",
                      dest="alpha",
                      type="float",
                      help="parameter alpha",
                      default=numpy.random.gamma(1, 1))
    parser.add_option("--gamma",
                      dest="gamma",
                      type="float",
                      help="parameter gamma",
                      default=numpy.random.gamma(1, 1))
    parser.add_option("--beta",
                      dest="beta",
                      type="float",
                      help="parameter of beta measure H",
                      default=0.5)
    parser.add_option("-i",
                      dest="iteration",
                      type="int",
                      help="iteration count",
                      default=10)
    parser.add_option("-s",
                      dest="stopwords",
                      type="int",
                      help="0=exclude stop words, 1=include stop words",
                      default=1)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df",
                      dest="df",
                      type="int",
                      help="threshold of document freaquency to cut words",
                      default=0)
    (options, args) = parser.parse_args()
    if not (options.filename or options.corpus):
        parser.error("need corpus filename(-f) or corpus range(-c)")
    if options.seed != None:
        numpy.random.seed(options.seed)

    import vocabulary
    if options.filename:
        corpus = vocabulary.load_file(options.filename)
    else:
        corpus = vocabulary.load_corpus(options.corpus)
        if not corpus: parser.error("corpus range(-c) forms 'start:end'")

    voca = vocabulary.Vocabulary(options.stopwords == 0)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0: docs = voca.cut_low_freq(docs, options.df)

    hdplda = HDPLDA(options.alpha, options.gamma, options.beta, docs,
                    voca.size())
    print("corpus=%d words=%d alpha=%.3f gamma=%.3f beta=%.3f stopwords=%d" %
          (len(corpus), len(voca.vocas), options.alpha, options.gamma,
           options.beta, options.stopwords))
    # hdplda.dump()

    # import cProfile
    # cProfile.runctx('hdplda_learning(hdplda, options.iteration)', globals(), locals(), 'hdplda.profile')
    hdplda_learning(hdplda, options.iteration)
    output_summary(hdplda, voca)
Example #15
def main():
    import os
    import pickle
    import optparse

    parser = optparse.OptionParser()
    parser.add_option("-m", dest="model", help="model filename")
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("-b",
                      dest="corpus",
                      help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha",
                      dest="alpha",
                      type="float",
                      help="parameter alpha",
                      default=0.1)
    parser.add_option("--beta",
                      dest="beta",
                      type="float",
                      help="parameter beta",
                      default=0.01)
    parser.add_option("--eta",
                      dest="eta",
                      type="float",
                      help="parameter eta",
                      default=100)
    parser.add_option("-k",
                      dest="K",
                      type="int",
                      help="number of topics",
                      default=20)
    parser.add_option("-i",
                      dest="iteration",
                      type="int",
                      help="iteration count",
                      default=10)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df",
                      dest="df",
                      type="int",
                      help="threshold of document freaquency to cut words",
                      default=0)
    parser.add_option(
        "-c",
        dest="constraint",
        help="add constraint (wordlist which should belong to the same topic)")
    parser.add_option("-u",
                      "--unassign",
                      dest="unassign",
                      help="unassign method (all/doc/term/none)",
                      default="none")
    (options, args) = parser.parse_args()

    numpy.random.seed(options.seed)

    if options.model and os.path.exists(options.model):
        with open(options.model, "rb") as f:
            lda, voca = pickle.load(f)
    elif not (options.filename or options.corpus):
        parser.error(
            "need corpus filename(-f) or corpus range(-b) or model(-m)")
    else:
        import vocabulary
        if options.filename:
            corpus = vocabulary.load_file(options.filename)
        else:
            corpus = vocabulary.load_corpus(options.corpus)
            if not corpus: parser.error("corpus range(-b) forms 'start:end'")
        voca = vocabulary.Vocabulary()
        docs = [voca.doc_to_ids(doc) for doc in corpus]
        if options.df > 0: docs = voca.cut_low_freq(docs, options.df)
        lda = ITM(options.K, options.alpha, options.beta, options.eta, docs,
                  voca.size())
    param = (len(lda.docs), len(voca.vocas), options.K, options.alpha,
             options.beta, options.eta)
    print "corpus=%d, words=%d, K=%d, a=%f, b=%f, eta=%f" % param

    if options.constraint:
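        # choose how existing topic assignments are reset ("unassigned") when the constraint word list is added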
        if options.unassign == "all":
            add_constraint = lda.add_constraint_all
        elif options.unassign == "doc":
            add_constraint = lda.add_constraint_doc
        elif options.unassign == "term":
            add_constraint = lda.add_constraint_term
        elif options.unassign == "none":
            add_constraint = lda.add_constraint_none
        else:
            parser.error("unassign method(-u) must be all/doc/term/none")

        wordlist = options.constraint.split(',')
        idlist = [voca.vocas_id[w] for w in wordlist]

        print "\n== add constraint =="
        for w, v in zip(idlist, wordlist):
            print "%s [%s]" % (v, ",".join(str(x) for x in lda.n_k_w[:, w]))

        add_constraint(idlist)

        lda.verify_topic()

    #import cProfile
    #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile')
    lda_learning(lda, options.iteration, voca)

    with open(options.model, "wb") as f:
        pickle.dump((lda, voca), f)
Example #16
        parser.error('need output file(-o)')

    return (options, args)


if __name__ == "__main__":

    (options, args) = args()
    
    import vocabulary

    # Read corpus from file.
    if options.filename.split('.')[-1] == 'json':
        corpus = vocabulary.load_file_json(options.filename)
    else:
        corpus = vocabulary.load_file(options.filename)


    voca = vocabulary.Vocabulary(options.stopwords==0)
    docs = [voca.doc_to_ids(doc) for doc in corpus]


    # Running HDP-LDA.
    hdplda = HDPLDA(options.K, options.alpha, options.gamma, options.base, docs, voca.size())

    print("corpus=%d words=%d alpha=%f gamma=%f base=%f initK=%d stopwords=%d" % (len(corpus), len(voca.vocas), options.alpha, options.gamma, options.base, options.K, options.stopwords))
        
    for i in range(options.iteration):
        hdplda.inference()
        print("K: %d" % len(hdplda.topics))
    
Example #17
def main():
    import optparse
    import vocabulary
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", type="string", help="corpus filename")
    parser.add_option("-t", dest="time_file", help="timestamp of documents")
    parser.add_option("-o", dest="output_dir", type="string", help="output directory")
    parser.add_option("-m", dest="model", help="previously trained model")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.001)
    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.01)
    parser.add_option("-p", dest="cont", type="float", help="parameter contribution proportion", \
        default=0.5)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=50)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=500)
    parser.add_option("-s", dest="smartinit", action="store_false", \
        help="smart initialize of parameters", default=True)
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", \
        action="store_true", default=True)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--wf", dest="wf", type="int", \
        help="threshold of word frequency to cut words", default=1)
    parser.add_option("--num-proc", dest="nproc", type="int", help="number of processors", \
        default=4)
    (options, args) = parser.parse_args()
    if not (options.filename) or \
        not (options.time_file) or not(options.output_dir):
        parser.error("need (corpus filename(-f) and "
            "document timestamp file(-t) and output directory(-o)")

    if options.filename:
        corpus = vocabulary.load_file(options.filename)
    if options.seed != None:
        numpy.random.seed(options.seed)

    if not os.path.exists(options.output_dir):
        os.makedirs(options.output_dir)
    
    voca = vocabulary.Vocabulary(options.stopwords, options.wf)

    if options.model:
        (prev_voca, prev_lda) = pickle.load(open(options.model))
        #drop one day's worth of data to accommodate the new day's data
        prev_voca, prev_lda = drop_one_day_data(prev_voca, prev_lda, options.alpha)
        options.K = prev_lda.K
    else:
        prev_lda = None
        prev_voca = None

    #generate the vocabularies for voca
    voca.gen_vocabs(corpus, prev_voca, prev_lda)

    docs = [voca.doc_to_ids(doc) for doc in corpus]
    #calculate the number of elements for each timestamp group in docs
    docs_nt = []
    docs_times = [ item.strip() for item in open(options.time_file).readlines() ]
    tmp_nt = {}
    for time in set(docs_times):
        tmp_nt[time] = docs_times.count(time)
    for (time, count) in sorted(tmp_nt.items()):
        docs_nt.append(count)
    tmp_nt.clear()

    if options.model:
        #update docs
        tmp_docs = []
        tmp_docs.extend(prev_lda.docs)
        tmp_docs.extend(docs)
        docs = tmp_docs

        #update docs_times
        tmp_docs_times = []
        tmp_docs_times.extend(prev_lda.docs_times)
        tmp_docs_times.extend(docs_times)
        docs_times = tmp_docs_times

        #update docs_nt
        tmp_docs_nt = []
        tmp_docs_nt.extend(prev_lda.docs_nt)
        tmp_docs_nt.extend(docs_nt)
        docs_nt = tmp_docs_nt

    #if options.wf > 0: docs = voca.cut_low_freq(docs, options.wf)

    #initialise lda
    lda = LDA(options.K, options.alpha, options.beta, options.cont, docs, docs_nt, voca.size(), \
        docs_times, options.output_dir, prev_lda, options.nproc, options.smartinit)

    #print word frequency
    freqword = {}
    freqword_file = open(lda.output_dir + "/freqwords.txt", "w")
    for (vocab_id, freq) in enumerate(voca.wordfreq):
        freqword[voca.vocas[vocab_id]] = freq
    for (vocab, freq) in sorted(freqword.items(), key=operator.itemgetter(1), reverse=True):
        freqword_file.write(vocab + " " + str(freq) + "\n")
    freqword_file.flush()

    print "corpus=%d, words=%d, K=%d, a=%f, b=%f, nproc=%d" % (len(corpus), len(voca.vocas), 
options.K, options.alpha, options.beta, options.nproc)

    #import cProfile
    #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile')
    lda_learning(lda, options.iteration, voca)

    #save the model for potential re-use later
    lda.tlock = None
    pickle.dump((voca, lda), open(options.output_dir + "/model.dat", "w"))
Example #18
def main():
    import optparse
    import vocabulary
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", type="string", help="corpus filename")
    parser.add_option("-t", dest="time_file", help="timestamp of documents")
    parser.add_option("-o", dest="output_dir", type="string", help="output directory")
    parser.add_option("-m", dest="model", help="previously trained model")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.001)
    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.01)
    parser.add_option("-p", dest="cont", type="float", help="parameter contribution proportion", \
        default=0.5)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=25)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=300)
    parser.add_option("-s", dest="smartinit", action="store_false", \
        help="smart initialize of parameters", default=True)
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", \
        action="store_true", default=True)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--wf", dest="wf", type="int", \
        help="threshold of word frequency to cut words", default=10)
    parser.add_option("--num-proc", dest="nproc", type="int", help="number of processors", \
        default=12)
    (options, args) = parser.parse_args()


    if not (options.filename) or \
        not (options.time_file) or not(options.output_dir):
        parser.error("need (corpus filename(-f) and "
            "document timestamp file(-t) and output directory(-o)")

    if options.filename:
        print 'options.filename = ', options.filename
        corpus = vocabulary.load_file(options.filename)
        print 'Corpus length = ', len(corpus)
    if options.seed != None:
        numpy.random.seed(options.seed)

    if not os.path.exists(options.output_dir):
        os.makedirs(options.output_dir)
    
    voca = vocabulary.Vocabulary(options.stopwords, options.wf)

    if options.model:
        (prev_voca, prev_lda) = pickle.load(open(options.model))
        #drop one day's worth of data to accommodate the new day's data
        prev_voca, prev_lda = drop_one_day_data(prev_voca, prev_lda, options.alpha)
        options.K = prev_lda.K
    else:
        prev_lda = None
        prev_voca = None

    #generate the vocabularies for voca
    voca.gen_vocabs(corpus, prev_voca, prev_lda)

    docs = [voca.doc_to_ids(doc) for doc in corpus]
    #calculate the number of elements for each timestamp group in docs
    docs_nt = []
    docs_times = [ item.strip() for item in open(options.time_file).readlines() ]
    print 'len docs_time=', str(len(docs_times)), 'len docs=', str(len(docs))
#    for m, doc in enumerate(docs):
#      try:
#	t= docs_times[m]
#      except:
#	  print 'm=',m, 'len docs_time=', str(len(docs_times)), 'len docs=', str(len(docs))
    tmp_nt = {}
    for time in set(docs_times):
        tmp_nt[time] = docs_times.count(time)
    for (time, count) in sorted(tmp_nt.items()):
        docs_nt.append(count)
    tmp_nt.clear()

    if options.model:
        #update docs
        tmp_docs = []
        tmp_docs.extend(prev_lda.docs)
        tmp_docs.extend(docs)
        docs = tmp_docs

        #update docs_times
        tmp_docs_times = []
        tmp_docs_times.extend(prev_lda.docs_times)
        tmp_docs_times.extend(docs_times)
        docs_times = tmp_docs_times

        #update docs_nt
        tmp_docs_nt = []
        tmp_docs_nt.extend(prev_lda.docs_nt)
        tmp_docs_nt.extend(docs_nt)
        docs_nt = tmp_docs_nt

    #if options.wf > 0: docs = voca.cut_low_freq(docs, options.wf)

    #initialise lda
    lda = LDA(options.K, options.alpha, options.beta, options.cont, docs, docs_nt, voca.size(), \
        docs_times, options.output_dir, prev_lda, options.nproc, options.smartinit)

    #print word frequency
    freqword = {}
    freqword_file = open(lda.output_dir + "/freqwords.txt", "w")
    for (vocab_id, freq) in enumerate(voca.wordfreq):
        freqword[voca.vocas[vocab_id]] = freq
    for (vocab, freq) in sorted(freqword.items(), key=operator.itemgetter(1), reverse=True):
        freqword_file.write(vocab + " " + str(freq) + "\n")
    freqword_file.flush()

    print "corpus=%d, words=%d, K=%d, a=%f, b=%f, nproc=%d" % (len(corpus), len(voca.vocas), 
options.K, options.alpha, options.beta, options.nproc)

    #import cProfile
    #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile')
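    # snapshot the topic-word distributions before training so they can be compared (topic-wise JS divergence) with the post-training distributions below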
    phi_old = lda.worddist()
    voca_old = voca.vocas
    lda_learning(lda, options.iteration, voca)
    phi_new = lda.worddist()
    fjs = open(lda.output_dir + '/jsdivergence.out','a')
    oldvoca_file = open(lda.output_dir + "/oldvoca.txt","a")
    newvoca_file = open(lda.output_dir + "/newvoca.txt","a")
    oldphi_prob = open(lda.output_dir + "/oldphi.txt","a")
    newphi_prob = open(lda.output_dir + "/newphi.txt","a")
    fjs.write('topic#   JSdiv')
    if prev_lda != None:
        #print '--PREVIOUS word_dist = ----', prev_lda.phi_prob
        for k in range(lda.K):
            res = JS.jensenShannonDivergence(phi_old[k], phi_new[k])
            #print '\n' + str(k) + ' ' + str(res) + ' PREV_phi = ' + str(len(phi_old[k])) + str(phi_old[k]) + ' NEW_phi= ' + str(len(phi_new[k])) + str(phi_new[k]) + '\n----------\n'
            fjs.write('\n' + str(k) + ' ' + str(res) + ' ' + str(len(phi_old[k])) + ' ' + str(len(phi_new[k])))
            oldvoca_file.write('\nTOPIC=' + str(k) + ' WORDS=')
            for w in numpy.argsort(-phi_old[k])[:1000]:
                #print w, voca_old[w]
                oldvoca_file.write(voca_old[w] + " ")
                oldphi_prob.write(str(phi_old[k][w]) + " ")
            oldphi_prob.write('\n')
            oldvoca_file.write('\n')

            newvoca_file.write('\nTOPIC=' + str(k) + ' WORDS=')
            for w in numpy.argsort(-phi_new[k])[:1000]:
                #print w, voca.vocas[w]
                newvoca_file.write(voca.vocas[w] + " ")
                newphi_prob.write(str(phi_new[k][w]) + " ")
            newphi_prob.write('\n')
            newvoca_file.write('\n')

    #print '-- SELF word dist = ---', lda.phi_prob 
    fjs.write('\n')
         
    #save the model for potential re-use later
    lda.tlock = None
    pickle.dump((voca, lda), open(options.output_dir + "/model.dat", "w"))
Example #19
def main():
    t1 = time.time()
    import optparse
    import vocabulary
    global out_dir
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("-c",
                      dest="corpus",
                      help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha",
                      dest="alpha",
                      type="float",
                      help="parameter alpha",
                      default=0.1)
    parser.add_option("--eta",
                      dest="eta",
                      type="float",
                      help="parameter eta",
                      default=0.2)
    parser.add_option("-k",
                      dest="K",
                      type="int",
                      help="number of topics",
                      default=20)
    parser.add_option("-i",
                      dest="iteration",
                      type="int",
                      help="iteration count",
                      default=100)
    parser.add_option("-s",
                      dest="smartinit",
                      action="store_true",
                      help="smart initialize of parameters",
                      default=False)
    parser.add_option("--stopwords",
                      dest="stopwords",
                      help="exclude stop words",
                      action="store_true",
                      default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df",
                      dest="df",
                      type="int",
                      help="threshold of document freaquency to cut words",
                      default=0)
    #parser.add_option("--setup", dest="setup", help="setup details", default="uniform")
    parser.add_option("--dataset",
                      dest="did",
                      help="setup details : Dataset-1/Dataset-2/Dataset-3",
                      default="Dataset-1")
    (options, args) = parser.parse_args()
    if not (options.filename or options.corpus):
        parser.error("need corpus filename(-f) or corpus range(-c)")

    if options.filename:
        if options.did == 'Dataset-1':
            corpus, doc_ids, event_list, total_no_word = vocabulary.load_file(
                options.filename)
        else:
            corpus, doc_ids, event_list, total_no_word = vocabulary.load_file_reuter(
                options.filename)
    else:
        corpus = vocabulary.load_corpus(options.corpus)
        if not corpus: parser.error("corpus range(-c) forms 'start:end'")
    if options.seed != None:
        numpy.random.seed(options.seed)

    voca = vocabulary.Vocabulary(options.stopwords)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0: docs = voca.cut_low_freq(docs, options.df)

    if event_list is not None: options.K = options.K  #len(event_list)
    suffix = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    #out_dir = '%s/all_words/Topic_%d_alpha_%f_eta_%f_iter_%d/%s' %(out_dir,options.K,options.alpha, options.eta, options.iteration, suffix)
    #out_dir = '%s/Dataset-1/Topic_%d_alpha_%f_eta_%f_iter_%d/%s' %(out_dir,options.K,options.alpha, options.eta, options.iteration, suffix)
    out_dir = '%s/%s/Topic_%d_alpha_%f_eta_%f_iter_%d/%s' % (
        out_dir, options.did, options.K, options.alpha, options.eta,
        options.iteration, suffix)

    #out_dir = '%s/Reuters-21578/R-8-train-train_no-stop/Topic_%d_alpha_%f_eta_%f_iter_%d/%s' %(out_dir,options.K,options.alpha, options.eta, options.iteration, suffix)
    #out_dir = '%s/20-Newsgroup/20-Newsgroup_train-train_all_term/Topic_%d_alpha_%f_eta_%f_iter_%d/%s' %(out_dir,options.K,options.alpha, options.eta, options.iteration, suffix)
    print('out_dir: ', out_dir)
    try:
        os.makedirs(out_dir)
    except Exception as e:
        print(' %s Dir exist ' % (out_dir))
        print('E MSG : ', e)
    lda = LDA(options.K, options.alpha, options.eta, docs, doc_ids,
              voca.size(), options.smartinit)
    t_int = time.time()
    #print 'Intialization time : %f' %(t_int-t1)
    flog = '%s/log_file.txt' % (out_dir)
    f = open(flog, 'w')
    f.write(
        "corpus(# of doc)=%d, no of event = %d , Uniq words=%d, Toal # of word =%d, K=%d, a=%f, b=%f , iteration = %d \n"
        % (len(corpus), len(event_list), len(voca.vocas), total_no_word,
           options.K, options.alpha, options.eta, options.iteration))
    f.close()
    print("corpus=%d, no of event =%d , uniq words=%d, K=%d, a=%f, b=%f" %
          (len(corpus), len(event_list), len(
              voca.vocas), options.K, options.alpha, options.eta)),

    #import cProfile
    #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile')
    lda_learning(lda, options.iteration, voca)
    t2 = time.time()
    print(' Total time taken : %f ' % (t2 - t1))
    flog = '%s/log_file.txt' % (out_dir)
    f = open(flog, 'a')
    f.write(' Total time taken : %f ' % (t2 - t1))
    f.close()
Example #20
        parser.error('need output file(-o)')

    return (options, args)


if __name__ == "__main__":

    (options, args) = args()

    import vocabulary

    # Read corpus from file.
    if options.filename.split('.')[-1] == 'json':
        corpus = vocabulary.load_file_json(options.filename)
    else:
        corpus = vocabulary.load_file(options.filename)

    voca = vocabulary.Vocabulary(options.stopwords == 0)
    docs = [voca.doc_to_ids(doc) for doc in corpus]

    # Running HDP-LDA.
    hdplda = HDPLDA(options.K, options.alpha, options.gamma, options.base,
                    docs, voca.size())

    print(
        "corpus=%d words=%d alpha=%f gamma=%f base=%f initK=%d stopwords=%d" %
        (len(corpus), len(voca.vocas), options.alpha, options.gamma,
         options.base, options.K, options.stopwords))

    for i in range(options.iteration):
        hdplda.inference()
Example #21
    beta = estimateAlphaMap(nkt, nk, beta, abeta, bbeta)


class FileOutput:
    def __init__(self, file):
        import datetime
        self.file = file + datetime.datetime.now().strftime('_%m%d_%H%M%S.txt')

    def out(self, s):
        with open(self.file, 'a') as f:
            print >> f, s


corpus = vocabulary.load_file('mood.txt')
# corpus = vocabulary.load_corpus('1:50')
voca = vocabulary.Vocabulary(True)
w = [voca.doc_to_ids(doc) for doc in corpus][:86]
wq = [voca.doc_to_ids(doc) for doc in corpus][:10]
M = len(w)
Mq = len(wq)
V = voca.size()

init()
run(niter)
initq()
runq(niterq)
ppx()

theta = [[0 for j in range(K)] for i in range(M)]  # double
Example #22
File: itm.py Project: 52nlp/iir
def main():
    import os
    import pickle
    import optparse

    parser = optparse.OptionParser()
    parser.add_option("-m", dest="model", help="model filename")
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("-b", dest="corpus", help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.1)
    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.01)
    parser.add_option("--eta", dest="eta", type="float", help="parameter eta", default=100)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=20)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=10)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0)
    parser.add_option("-c", dest="constraint", help="add constraint (wordlist which should belong to the same topic)")
    parser.add_option("-u", "--unassign", dest="unassign", help="unassign method (all/doc/term/none)", default="none")
    (options, args) = parser.parse_args()

    numpy.random.seed(options.seed)

    if options.model and os.path.exists(options.model):
        with open(options.model, "rb") as f:
            lda, voca = pickle.load(f)
    elif not (options.filename or options.corpus):
        parser.error("need corpus filename(-f) or corpus range(-b) or model(-m)")
    else:
        import vocabulary
        if options.filename:
            corpus = vocabulary.load_file(options.filename)
        else:
            corpus = vocabulary.load_corpus(options.corpus)
            if not corpus: parser.error("corpus range(-b) forms 'start:end'")
        voca = vocabulary.Vocabulary()
        docs = [voca.doc_to_ids(doc) for doc in corpus]
        if options.df > 0: docs = voca.cut_low_freq(docs, options.df)
        lda = ITM(options.K, options.alpha, options.beta, options.eta, docs, voca.size())
    param = (len(lda.docs), len(voca.vocas), options.K, options.alpha, options.beta, options.eta)
    print "corpus=%d, words=%d, K=%d, a=%f, b=%f, eta=%f" % param

    if options.constraint:
        if options.unassign == "all":
            add_constraint = lda.add_constraint_all
        elif options.unassign == "doc":
            add_constraint = lda.add_constraint_doc
        elif options.unassign == "term":
            add_constraint = lda.add_constraint_term
        elif options.unassign == "none":
            add_constraint = lda.add_constraint_none
        else:
            parser.error("unassign method(-u) must be all/doc/term/none")

        wordlist = options.constraint.split(',')
        idlist = [voca.vocas_id[w] for w in wordlist]

        print "\n== add constraint =="
        for w, v in zip(idlist, wordlist):
            print "%s [%s]" % (v, ",".join(str(x) for x in lda.n_k_w[:,w]))

        add_constraint(idlist)

        lda.verify_topic()


    #import cProfile
    #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile')
    lda_learning(lda, options.iteration, voca)

    with open(options.model, "wb") as f:
        pickle.dump((lda, voca), f)
def main():
    t1 = time.time()
    import optparse
    import vocabulary
    global out_dir
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("-c",
                      dest="corpus",
                      help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha",
                      dest="alpha",
                      type="float",
                      help="parameter alpha",
                      default=0.5)
    parser.add_option("--beta",
                      dest="beta",
                      type="float",
                      help="parameter beta",
                      default=0.5)
    parser.add_option("-k",
                      dest="K",
                      type="int",
                      help="number of topics",
                      default=20)
    parser.add_option("-i",
                      dest="iteration",
                      type="int",
                      help="iteration count",
                      default=100)
    parser.add_option("-s",
                      dest="smartinit",
                      action="store_true",
                      help="smart initialize of parameters",
                      default=False)
    parser.add_option("--stopwords",
                      dest="stopwords",
                      help="exclude stop words",
                      action="store_true",
                      default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df",
                      dest="df",
                      type="int",
                      help="threshold of document freaquency to cut words",
                      default=0)
    (options, args) = parser.parse_args()
    if not (options.filename or options.corpus):
        parser.error("need corpus filename(-f) or corpus range(-c)")

    if options.filename:
        corpus, doc_ids, event_list = vocabulary.load_file(options.filename)
    else:
        corpus = vocabulary.load_corpus(options.corpus)
        if not corpus: parser.error("corpus range(-c) forms 'start:end'")
    if options.seed != None:
        numpy.random.seed(options.seed)

    voca = vocabulary.Vocabulary(options.stopwords)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0: docs = voca.cut_low_freq(docs, options.df)

    if event_list is not None: options.K = len(event_list)
    suffix = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    out_dir = '%s/all_words/Topic_%d_alpha_%f_beta_%f_iter_%d/%s' % (
        out_dir, options.K, options.alpha, options.beta, options.iteration,
        suffix)

    try:
        os.makedirs(out_dir)
    except Exception, e:
        print ' %s Dir exist ' % (out_dir)
        print 'E MSG : ', e
Example #24
def main():
	import optparse
	import vocabulary
	parser = optparse.OptionParser()
	parser.add_option("--newsf", dest="newsfile", help="news corpus filename")
	parser.add_option("--tweetsf", dest="tweetsfile", help="tweets corpus filename")
	parser.add_option("-a", dest="authorfile", help="author filename")
	parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)")
	parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5)
	parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5)
	parser.add_option("-k", dest="K", type="int", help="number of topics", default=20)
	parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
	parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False)
	parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False)
	parser.add_option("--seed", dest="seed", type="int", help="random seed")
	parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0)
	(options, args) = parser.parse_args()
	if not (options.newsfile or options.corpus): parser.error("need corpus news file(--newsf) or corpus range(-c)")
	if not options.tweetsfile: parser.error("need corpus tweets file(--tweetsf)")
	if not options.authorfile: parser.error("need author file(-a)")

	if options.newsfile:
		news_corpus = vocabulary.load_file(options.newsfile)
		news_len = len(news_corpus)
		print "Load News data from '" + options.newsfile + "'"
		print "\t", news_len, "News in total"
	else:
		news_corpus = vocabulary.load_corpus(options.corpus)
		if not news_corpus: parser.error("corpus range(-c) forms 'start:end'")
		news_len = len(news_corpus)
	if options.seed != None:
		np.random.seed(options.seed)

	voca = vocabulary.Vocabulary(options.stopwords)
	print "Load Twitters data from '" + options.tweetsfile + "'"
	ori_twitter_corpus = vocabulary.load_file(options.tweetsfile, 'utf-8')

	print "Initialize the authors set"
	num_authors, author_set = vocabulary.load_author(options.authorfile)
	print "\t", num_authors, "authors in total"
	# Remove words that occur fewer than 2 times across the tweets
	twitter_dict = {}
	for line in ori_twitter_corpus:
		for w in line:
			if w in twitter_dict:
				twitter_dict[w] += 1
			else:
				twitter_dict[w] = 1
	# Filter into a new list instead of removing items while iterating (which skips elements)
	twitter_corpus = []
	for line in ori_twitter_corpus:
		twitter_corpus.append([w for w in line if twitter_dict[w] >= 2])

	twitter_corpus = twitter_corpus[:len(author_set)]
	twitter_len = len(ori_twitter_corpus)
	print "\t", twitter_len, "Tweets in total"

	# Whole collection
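	# News documents come first, followed by tweets; news_len presumably lets HTM tell the two collections apart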
	corpus = news_corpus + twitter_corpus
	
	# voca = vocabulary.Vocabulary(options.stopwords)
	docs = [voca.doc_to_ids(doc) for doc in (corpus)]		# docs is the documents list [[1,2,3],[4,2,3...]]
	twitter_words_set = set([w for doc in (twitter_corpus) for w in voca.doc_to_ids(doc)])		# is the Twitter list
	news_words_set = set([w for doc in (news_corpus) for w in voca.doc_to_ids(doc)])			# is the News list
	print "Number for Twitter words:", len(twitter_words_set)
	print "Number of News words:", len(news_words_set)

	if options.df > 0: docs = voca.cut_low_freq(docs, options.df)

	corpus_collection = list(set([w for doc in docs for w in doc]))
	# Initialization
	print "Initialize the heterogenous topic model"
	htm = HTM(options.K, options.alpha, options.beta, docs, news_len, num_authors, author_set, voca, twitter_words_set, news_words_set)

	# Get the results
	news_wt_distribution, tweets_wt_distribution, htm_wt_distribution, tweets_at_distribution, news_dt_distribution = htm.gibbs_sampling(options.iteration)

	print "KL from news to htm"
	KL_divergence(news_wt_distribution, htm_wt_distribution)
	print "KL from tweets to htm"
	KL_divergence(tweets_wt_distribution, htm_wt_distribution)
	print "KL from news to tweets"
	KL_divergence(news_wt_distribution, tweets_wt_distribution)
	print "KL from tweets to news"
	KL_divergence(tweets_wt_distribution, news_wt_distribution)

	htm.print_top_words(20, news_wt_distribution, voca.vocas)

	'''
	Perplexity
	'''
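	# exp of the negative mean log probability over every entry of the topic-word matrix (a quick diagnostic, not a held-out perplexity)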
	perplexity = 0
	N = 0
	for line in htm_wt_distribution:
		for v in line:
			perplexity += np.log(v)
		N += len(line)
	print "Perplexity", np.exp(float(-perplexity)/N)

	htm.print_entropy()
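
	# Append each distribution as space-separated rows to text files under the `model` path prefix (presumably a global defined elsewhere)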

	f = open(model + "news_wt.txt", "a")
	for line in news_wt_distribution:
		for n in line:
			f.write(str(n) + " ")
		f.write("\n")
	f.close()

	f = open(model + "tweets_wt.txt", "a")
	for line in tweets_wt_distribution:
		for n in line:
			f.write(str(n) + " ")
		f.write("\n")
	f.close()

	f = open(model + "htm_wt.txt", "a")
	for line in htm_wt_distribution:
		for n in line:
			f.write(str(n) + " ")
		f.write("\n")
	f.close()

	f = open(model + "tweets_at.txt", "a")
	for line in tweets_at_distribution:
		for n in line:
			f.write(str(n) + " ")
		f.write("\n")
	f.close()

	f = open(model + "news_dt.txt", "a")
	for line in news_dt_distribution:
		for n in line:
			f.write(str(n) + " ")
		f.write("\n")
	f.close()
Exemple #25
0
def main():
    import optparse
    import vocabulary
    parser = optparse.OptionParser()
    parser.add_option("-f",
                      dest="filename",
                      type="string",
                      help="corpus filename")
    parser.add_option("-t", dest="time_file", help="timestamp of documents")
    parser.add_option("-o",
                      dest="output_dir",
                      type="string",
                      help="output directory")
    parser.add_option("-m", dest="model", help="previously trained model")
    parser.add_option("--alpha",
                      dest="alpha",
                      type="float",
                      help="parameter alpha",
                      default=0.001)
    parser.add_option("--beta",
                      dest="beta",
                      type="float",
                      help="parameter beta",
                      default=0.01)
    parser.add_option("-p", dest="cont", type="float", help="parameter contribution proportion", \
        default=0.5)
    parser.add_option("-k",
                      dest="K",
                      type="int",
                      help="number of topics",
                      default=50)
    parser.add_option("-i",
                      dest="iteration",
                      type="int",
                      help="iteration count",
                      default=500)
    parser.add_option("-s", dest="smartinit", action="store_false", \
        help="smart initialize of parameters", default=True)
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", \
        action="store_true", default=True)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--wf", dest="wf", type="int", \
        help="threshold of word frequency to cut words", default=1)
    parser.add_option("--num-proc", dest="nproc", type="int", help="number of processors", \
        default=4)
    (options, args) = parser.parse_args()
    if not (options.filename) or \
        not (options.time_file) or not(options.output_dir):
        parser.error("need (corpus filename(-f) and "
                     "document timestamp file(-t) and output directory(-o)")

    if options.filename:
        corpus = vocabulary.load_file(options.filename)
    if options.seed != None:
        numpy.random.seed(options.seed)

    if not os.path.exists(options.output_dir):
        os.makedirs(options.output_dir)

    voca = vocabulary.Vocabulary(options.stopwords, options.wf)

    if options.model:
        (prev_voca, prev_lda) = pickle.load(open(options.model, 'rb'))
        #drop one day worth's of data to accommodate the new day's data
        prev_voca, prev_lda = drop_one_day_data(prev_voca, prev_lda,
                                                options.alpha)
        options.K = prev_lda.K
    else:
        prev_lda = None
        prev_voca = None

    #generate the vocabularies for voca
    voca.gen_vocabs(corpus, prev_voca, prev_lda)

    docs = [voca.doc_to_ids(doc) for doc in corpus]
    #calculate the number of elements for each timestamp group in docs
    docs_nt = []
    docs_times = [item.strip() for item in open(options.time_file).readlines()]
    tmp_nt = {}
    for time in set(docs_times):
        tmp_nt[time] = docs_times.count(time)
    for (time, count) in sorted(tmp_nt.items()):
        docs_nt.append(count)
    tmp_nt.clear()
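
    # When continuing from a previously trained model, prepend its documents, timestamps and per-timestamp counts so the new data extends the existing streams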

    if options.model:
        #update docs
        tmp_docs = []
        tmp_docs.extend(prev_lda.docs)
        tmp_docs.extend(docs)
        docs = tmp_docs

        #update docs_times
        tmp_docs_times = []
        tmp_docs_times.extend(prev_lda.docs_times)
        tmp_docs_times.extend(docs_times)
        docs_times = tmp_docs_times

        #update docs_nt
        tmp_docs_nt = []
        tmp_docs_nt.extend(prev_lda.docs_nt)
        tmp_docs_nt.extend(docs_nt)
        docs_nt = tmp_docs_nt

    #if options.wf > 0: docs = voca.cut_low_freq(docs, options.wf)

    #initialise lda
    lda = LDA(options.K, options.alpha, options.beta, options.cont, docs, docs_nt, voca.size(), \
        docs_times, options.output_dir, prev_lda, options.nproc, options.smartinit)

    #print word frequency
    freqword = {}
    freqword_file = open(lda.output_dir + "/freqwords.txt", "w")
    for (vocab_id, freq) in enumerate(voca.wordfreq):
        freqword[voca.vocas[vocab_id]] = freq
    for (vocab, freq) in sorted(freqword.items(),
                                key=operator.itemgetter(1),
                                reverse=True):
        freqword_file.write(vocab + " " + str(freq) + "\n")
    freqword_file.flush()

    print "corpus=%d, words=%d, K=%d, a=%f, b=%f, nproc=%d" % (
        len(corpus), len(
            voca.vocas), options.K, options.alpha, options.beta, options.nproc)

    #import cProfile
    #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile')
    lda_learning(lda, options.iteration, voca)

    #save the model for potential re-use later
    lda.tlock = None
    pickle.dump((voca, lda), open(options.output_dir + "/model.dat", "wb"))
Exemple #26
0
def main():
    import optparse
    import vocabulary
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("--alpha",
                      dest="alpha",
                      type="float",
                      help="parameter alpha",
                      default=0.5)
    parser.add_option("--beta",
                      dest="beta",
                      type="float",
                      help="parameter beta",
                      default=0.5)
    parser.add_option("--lamda",
                      dest="lamda",
                      type="float",
                      help="parameter lamda",
                      default=0.5)
    parser.add_option("-k",
                      dest="K",
                      type="int",
                      help="number of topics",
                      default=20)
    parser.add_option("-i",
                      dest="iteration",
                      type="int",
                      help="iteration count",
                      default=100)
    parser.add_option("-s",
                      dest="smartinit",
                      action="store_true",
                      help="smart initialize of parameters",
                      default=False)
    parser.add_option("--stopwords",
                      dest="stopwords",
                      help="exclude stop words",
                      action="store_true",
                      default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df",
                      dest="df",
                      type="int",
                      help="threshold of document freaquency to cut words",
                      default=0)
    (options, args) = parser.parse_args()
    if not (options.filename):
        parser.error("need corpus filename(-f) or corpus range(-c)")
    if options.filename:
        (pids, tids) = vocabulary.load_file(options.filename)
    if options.seed != None:
        numpy.random.seed(options.seed)
    #voca is the object which stores the data structures needed by LDA
    voca = vocabulary.Vocabulary(options.stopwords)
    docs = voca.PT_to_idlist(pids, tids)
    #print docs
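    # tids are assumed to be zero-based term ids, so the vocabulary size is the largest id plus one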
    size_of_vocab = max(tids) + 1
    lda = BLDA(options.K, options.alpha, options.beta, options.lamda, docs,
               size_of_vocab, options.smartinit)
    #print "corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(corpus), len(voca.vocas), options.K, options.alpha, options.beta)
    blda_learning(lda, options.iteration)
Exemple #27
0
def main():
    t1 = time.time()
    import optparse
    import vocabulary
    global out_dir
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("-c",
                      dest="corpus",
                      help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha",
                      dest="alpha",
                      type="float",
                      help="parameter alpha",
                      default=0.1)
    parser.add_option("--eta1",
                      dest="eta1",
                      type="float",
                      help="parameter eta for ner word",
                      default=0.4)
    parser.add_option("--eta2",
                      dest="eta2",
                      type="float",
                      help="parameter eta for Non-ner word",
                      default=0.2)
    parser.add_option("-k",
                      dest="K",
                      type="int",
                      help="number of topics",
                      default=20)
    parser.add_option("-i",
                      dest="iteration",
                      type="int",
                      help="iteration count",
                      default=100)
    parser.add_option("-s",
                      dest="smartinit",
                      action="store_true",
                      help="smart initialize of parameters",
                      default=False)
    parser.add_option("--stopwords",
                      dest="stopwords",
                      help="exclude stop words",
                      action="store_true",
                      default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df",
                      dest="df",
                      type="int",
                      help="threshold of document freaquency to cut words",
                      default=0)
    parser.add_option("--setup",
                      dest="setup",
                      help="setup details : ner_keywords/tf-df-iec/IG",
                      default="ner_keywords")
    parser.add_option("--dataset",
                      dest="did",
                      help="setup details : Dataset-1/Dataset-2/Dataset-3",
                      default="Dataset-1")
    (options, args) = parser.parse_args()
    if not (options.filename or options.corpus):
        parser.error("need corpus filename(-f) or corpus range(-c)")

    if options.filename:
        if options.did == 'Dataset-1':
            corpus, doc_ids, event_list, total_no_word = vocabulary.load_file(
                options.filename)
        else:
            corpus, doc_ids, event_list, total_no_word = vocabulary.load_file_reuter(
                options.filename)
    else:
        corpus = vocabulary.load_corpus(options.corpus)
        # no ids/events/word count available for the Brown corpus path
        doc_ids, event_list, total_no_word = None, None, None
        if not corpus: parser.error("corpus range(-c) forms 'start:end'")
    if options.seed != None:
        np.random.seed(options.seed)

    # fname_sp = options.filename_sp.replace('/', '-')
    # if 'ner_keywords' in options.setup:
    #     out_dir = '%s/%s/%s/%s_Topic-%d_alpha-%0.2f_eta2-%0.2f_eta1-%0.2f_iter_%d/%s' %(out_dir, options.did,
    #                                          options.setup, options.did, options.K, options.alpha, options.eta2, options.eta1, options.iteration, suffix)
    # elif 'tf-df-icf' in options.setup:
    #     out_dir = '%s/%s/%s/%s_Topic-%d_alpha-%0.2f_eta2-%0.2f_eta1-%0.2f_iter_%d/%s' %(out_dir, options.did,
    #                                          options.setup, options.did, options.K, options.alpha, options.eta2, options.eta1, options.iteration, suffix)
    # else:
    #     print('Out Directory is not defined')
    #     return
    # print(' out_dir line 448 : : ' , out_dir)
    # try:
    #     os.makedirs(out_dir)
    # except Exception as e:
    #     print(' %s Dir exist ' %(out_dir))

    file_name_list = [
        options.did, 'Topic-' + str(options.K), 'alpha-' + str(options.alpha),
        'eta1-' + str(options.eta1), 'eta2-' + str(options.eta2),
        'iter_' + str(options.iteration)
    ]
    suffix = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    out_dir = os.path.join(out_dir, options.did, options.setup,
                           '_'.join(file_name_list), suffix)
    try:
        os.makedirs(out_dir)
    except OSError:
        print('%s dir already exists!' % (out_dir))

    voca = vocabulary.Vocabulary(options.stopwords)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0: docs = voca.cut_low_freq(docs, options.df)

    # Document-term count matrix (docs x vocabulary) expected by GuidedLDA
    X = np.zeros((len(docs), len(voca.vocas)), dtype=int)

    for i, doc in enumerate(docs):
        for j, words in enumerate(doc):
            X[i][words] += 1

    for i in range(len(docs)):
        for j in range(len(voca.vocas)):
            if X[i][j] < 0:
                print(' Value less than zero :', i, j, X[i][j], voca.vocas[j])

    # Guided LDA with seed topics.

    # seed_topics = {}
    # for t_id, st in enumerate(seed_topic_list):
    #     for word in st:
    #         seed_topics[voca.vocas_id[word]] = t_id

    seed_topics_dir = os.path.join(SEED_DIR, options.did, options.setup)
    seed_topics_fname = '{}-{}.json'.format(options.did, options.setup)
    seed_topics_fname_total = os.path.join(seed_topics_dir, seed_topics_fname)
    seed_topics = load_seed_word(seed_topics_fname_total, voca.vocas_id,
                                 event_list)

    # saving to call graph

    model = guidedlda.GuidedLDA(n_topics=options.K,
                                n_iter=options.iteration + 1,
                                alpha=options.alpha,
                                eta=options.eta2,
                                random_state=options.K,
                                refresh=20)
    #model = guidedlda.GuidedLDA(n_topics= options.K, n_iter= options.iteration + 1 , alpha = options.alpha, eta = options.eta2, random_state= options.K, refresh=20)
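    # seed_topics maps vocabulary ids to seeded topic ids; GuidedLDA biases those words toward their topics, with seed_confidence (eta1 here) controlling how strongly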
    model.fit(X, seed_topics=seed_topics, seed_confidence=options.eta1)  #
    #model.fit(X)

    # writing to file doc-topic
    doc_topic = model.transform(X)
    fout_doc_topic = '%s/doc_topic_dist.txt' % (out_dir)
    fdoc = open(fout_doc_topic, 'w')
    st_doc_topic = ''
    for i, item in enumerate(docs):
        st_doc_topic += "{} : Topic_{}\n".format(doc_ids[i],
                                                 doc_topic[i].argmax())
    fdoc.write(st_doc_topic)
    fdoc.close()

    # Writing to file doc_topic_dist_score.csv
    topic_list = []
    for i in range(options.K):
        topic_list.append('Topic_%03d' % (i))
    print(doc_topic.shape, len(topic_list), len(doc_ids))
    df = pd.DataFrame(data=doc_topic, columns=topic_list, index=doc_ids)
    #print(df.head)
    fout_doc_topic_score = os.path.join(out_dir, 'doc_topic_dist_score.csv')
    df.to_csv(fout_doc_topic_score)

    # Writing to file topic-word
    n_top_words = 20
    topic_word = model.topic_word_
    fout_topic_word = '%s/topic_word_dist.txt' % (out_dir)
    ftopic = open(fout_topic_word, 'w')
    st_topic_word = ''
    for i, topic_dist in enumerate(topic_word):
        word_list = np.array(
            voca.vocas)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
        score_list = np.argsort(topic_dist)[:-(n_top_words + 1):-1]
        st_topic_word += '\n\n\nTopic : {}\n-------------------\n'.format(i)
        st = ''
        for j, word in enumerate(word_list):
            st += '{}:{}\n'.format(word, topic_dist[score_list[j]])
        st_topic_word += st

    #print(docs)
    ftopic.write(st_topic_word)
    ftopic.close()
Exemple #28
0
def main():
    import optparse
    import vocabulary
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5)
    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=20)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
    parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False)
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0)
    (options, args) = parser.parse_args()
    if not (options.filename or options.corpus): parser.error("need corpus filename(-f) or corpus range(-c)")

    if options.filename:
        corpus = vocabulary.load_file(options.filename)
    else:
        corpus = vocabulary.load_corpus(options.corpus)
        if not corpus: parser.error("corpus range(-c) forms 'start:end'")
    if options.seed != None:
        numpy.random.seed(options.seed)

    voca = vocabulary.Vocabulary(options.stopwords)

    docs = [voca.doc_to_ids(doc) for doc in corpus]
    
    if options.df > 0: docs = voca.cut_low_freq(docs, options.df)

    lda = LDA(options.K, options.alpha, options.beta, docs, voca.size(), options.smartinit)
    print ("corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(corpus), len(voca.vocas), options.K, options.alpha, options.beta))

    #import cProfile
    #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile')
    wt_distribution, dt_distribution = lda_learning(lda, options.iteration, voca)
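
    # The three blocks below compute, per topic, the average negative log word probability over the first 27685 documents, the remaining documents, and the full corpus (27685 appears to be a dataset-specific split point)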
    
    # Entropy
    entropy = []
    num_topics, num_words = wt_distribution.shape
    num_docs = dt_distribution.shape[1]
    for t in range(num_topics):
        probs = 0.0
        for doc in docs[:27685]:
            prob = 0.0
            if len(doc) == 0:
                continue
            for w in doc:
                prob -= math.log(wt_distribution[t, w]*1.0)
            prob = prob/len(doc)
            probs += prob
        entropy.append(probs/len(docs[:27685]))
    print entropy

    entropy = []
    for t in range(num_topics):
        probs = 0.0
        for doc in docs[27685:]:
            prob = 0.0
            if len(doc) == 0:
                continue
            for w in doc:
                prob -= math.log(wt_distribution[t, w]*1.0)
            prob = prob/len(doc)
            probs += prob
        entropy.append(probs/len(docs[27685:]))
    print entropy

    entropy = []
    for t in range(num_topics):
        probs = 0.0
        for doc in docs:
            prob = 0.0
            if len(doc) == 0:
                continue
            for w in doc:
                prob -= math.log(wt_distribution[t, w]*1.0)
            prob = prob/len(doc)
            probs += prob
        entropy.append(probs/len(docs))
    print entropy

    '''
    Perplexity
    '''
    perplexity = 0
    N = 0
    for line in wt_distribution:
        for v in line:
            perplexity += numpy.log(v)
        N += len(line)
    print N
    print "Perplexity", numpy.exp(float(-perplexity)/N)

    model = "./model_tlda/"
    print wt_distribution.shape
    f = open(model + "wt.txt", "a")
    for line in wt_distribution:
        for n in line:
            f.write(str(n) + " ")
        f.write("\n")
    f.close()

    f = open(model + "dt.txt", "a")
    print dt_distribution.shape
    for line in dt_distribution:
        for n in line:
            f.write(str(n) + " ")
        f.write("\n")
    f.close()
Exemple #29
0
def main():
    t1 = time.time()
    import optparse

    global out_dir 
    parser = optparse.OptionParser()
    parser.add_option("--finp", dest="filename_ip", help="input filename")
    parser.add_option("--fsp", dest="filename_sp", help="special words filename")
    parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5)
    parser.add_option("--eta1", dest="eta1", type="float", help="parameter eta for ner word", default=0.4)
    parser.add_option("--eta2", dest="eta2", type="float", help="parameter eta for Non-ner word", default=0.2) # No eta 2 here !!
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=20)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=10)
    parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False)
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0)
    parser.add_option("--dp", dest="dp", help="ditichlet prior sysmetric or asymmetric ?")
    parser.add_option("--setup", dest="setup", help="setup details")
    parser.add_option("--datasets", dest="did", help="setup details",default="dataset_1")
    (options, args) = parser.parse_args()
    #if not (options.filename_ip or options.corpus): parser.error("need corpus filename(-f) or corpus range(-c)")
    
    if options.filename_ip and options.filename_sp:
        special_words = init_special_words(options.filename_sp)
        if options.did == 'Dataset-1':
            corpus, doc_ids, event_list = vocabulary.load_file(options.filename_ip, special_words)
        else:
            corpus, doc_ids, event_list = vocabulary.load_file_reuter(options.filename_ip, special_words)
        # print(' Line 420 ...')
    else:
        options.filename_ip = 'filtered_event_new2.pkl'
        options.filename_sp = ''
        special_words = init_special_words(options.filename_sp)
        corpus, doc_ids, event_list = vocabulary.load_file(options.filename_ip, special_words)
        # corpus = vocabulary.load_corpus(options.corpus)
        # if not corpus: parser.error("corpus range(-c) forms 'start:end'")
    if options.seed != None:
        numpy.random.seed(options.seed)
    # print(' Line 430')
    voca = vocabulary.Vocabulary(options.stopwords,special_words)
    # print(' Line 432')
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    # print(' Line 433')
    
 
    if options.df > 0: docs = voca.cut_low_freq(docs, options.df)
    
    if event_list is not None : options.K  = options.K #len(event_list)
    suffix = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    V1, V2 = voca.size()  # total number of unique NER (V1) and non-NER (V2) words
    # print(' Initialization of eta 1 started ..')
    eta1 = initialize_eta1(V1, voca,options.filename_sp) # Modify intialize_eta1 method
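    # eta1 is presumably a per-word prior built from the special (NER) word file, while eta2 is the scalar prior for the remaining non-NER words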
    # print(' Initialization of et1 done !!! ..')
    
    # out_dir = '%s/all_words/Topic_%d_alpha_%f_eta1_%f_eta2_%f_iter_%d/%s' %(out_dir,options.K,options.alpha, options.eta1,options.eta2, options.iteration, suffix)
    # out_dir = '%s/all_words/Topic_%d_alpha_%f_eta2_%f_%s_iter_%d/%s' %(out_dir,options.K,options.alpha,options.eta2,options.filename_sp, options.iteration, suffix) # Modify out_dir
    fname_sp = options.filename_sp.replace('/', '-')
    if any(s in options.setup for s in ('ner_keywords', 'tf-df-icf', 'IG')):
        out_dir = '%s/%s/%s/%s_Topic-%d_alpha-%0.2f_eta2-%0.2f_eta1-%0.2f_iter_%d/%s' % (
            out_dir, options.did, options.setup, options.did, options.K,
            options.alpha, options.eta2, options.eta1, options.iteration, suffix)
    else:
        print('Out Directory is not defined')
        return
    print(' out_dir line 448 : : ' , out_dir)
    try:
        os.makedirs(out_dir)
    except Exception as e:
        print(' %s dir already exists ' % (out_dir))
        # print('E MSG : ' , e)
    # lda = LDA(options.K, options.alpha, options.eta, docs, doc_ids, voca.size(), options.smartinit)
 
    print('V1 = %d , V2 = %d ' %(V1,V2)) # How to get V1 and V2
    '''
    print(' Docs :: ') 
    
    for i,doc in enumerate(docs):
        print('doc : ', i, doc)
        
    print(' printing Doc Over  \n \n ')
    '''
    

    lda = LDA(options.K, options.alpha, eta1, options.eta2, docs, doc_ids, V1,V2, smartinit=True) # hv to rechechk and modify options.smartint here #Modify here and LDA class 
    flog = '%s/log_file.txt' %(out_dir)
    f=open(flog,'w')
    f.write("corpus=%d, V1_ner = %d , V2_Nner =%d, K=%d, alpha=%0.2f , eta_2_Nner = %0.2f,  iteration = %d \n" % (len(corpus), V1, V2, options.K, options.alpha, options.eta2, options.iteration))  # Modify here !
    f.write('Dataset-%s , input_file = %s, special word file = %s \n'  %(options.did, options.filename_ip, options.filename_sp) )
    f.close()

    print("corpus=%d, V1_ner = %d , V2_Nner =%d, K=%d, alpha=%0.2f, eta_2_Nner = %0.2f,  iteration = %d \n" % (len(corpus), V1, V2,
                                                                    options.K, options.alpha, options.eta2, options.iteration)) # Modify here @

    # import cProfile
    # cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile')
    lda_learning(lda, options.iteration, voca) #check this function
    t2 = time.time()
    print(' Total time taken : %f ' % (t2 - t1))
    flog = '%s/log_file.txt' % (out_dir)
    f = open(flog, 'a')
    f.write(' Total time taken : %f ' % (t2 - t1))
    f.close()