Example no. 1
def extract_features(qfile="question_train.csv",
                     qcatfile="question_category_train.csv",
                     catfile="category.csv",
                     subcats=False,
                     outfile="features.npz"):
    # loading the categories
    cats = categorie_class.categories()
    # initializing the corpus
    corp = corpus.corpus(cats)
    # loading questions into corpus
    corp.load(qfile, qcatfile)
    # running filters on the raw questions
    sentence_filters = [filters.punctuation_filter]
    word_filters = [
        filters.small_word_filter, filters.stopword_filter,
        filters.stemming_filter
    ]
    corp.process(sentence_filters, word_filters)
    # saving corpus into pickle
    # pickle.dump(corp, "corpus.pkl")
    # selecting the term-space
    term_space = ig_based_non_uniform(corp, M=2500, read_from_file=False)
    d = len(term_space)
    # create mappings from term names to new feature ids and the inverse
    term_to_feature = {}
    feature_to_term = {}
    for i, term in enumerate(term_space):
        term_to_feature[term] = i
        feature_to_term[i] = term
    # creating feature and label arrays
    n = len(corp.tr_set)
    features = np.zeros((d, n))
    categoryids = np.zeros(n)

    # we define new ids for the parent categories, consistent with the ones assigned in categoryids
    number_of_cats = len(corp.cats.all_names())
    new_category_ids = {
        c: i
        for i, c in enumerate(corp.cats.all_names())
    }

    for j, q in enumerate(corp.tr_set):
        # set the label once per question, even if no term of the term-space matches
        categoryids[j] = new_category_ids[q["category"]]
        fe = simple_features(term_space, q["words"])
        for term, value in fe.items():
            i = term_to_feature[term]
            features[i, j] = value

    categories = {i: c for c, i in new_category_ids.items()}

    featurenames = [feature_to_term[i] for i in range(d)]

    np.savez(outfile,
             features=features,
             featurenames=featurenames,
             categoryids=categoryids,
             categories=categories)
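For reference, a minimal sketch of how the archive written by np.savez above could be read back. The use of allow_pickle and .item() for the stored categories dict is an assumption about how NumPy round-trips a Python dict, not part of the original code.

import numpy as np

# hedged sketch: load the arrays saved by extract_features()
data = np.load("features.npz", allow_pickle=True)
features = data["features"]             # shape (d, n): one column per question
featurenames = data["featurenames"]     # term behind each feature row
categoryids = data["categoryids"]       # numeric label per question
categories = data["categories"].item()  # dict mapping label id -> category name
print(features.shape, categories[int(categoryids[0])])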
Example no. 2
def main():
    infile = sys.argv[1]
    K = int(sys.argv[2])
    # D is set below
    alpha = float(sys.argv[3])
    eta = float(sys.argv[4])
    kappa = float(sys.argv[5])
    L = int(sys.argv[6])
    S = int(sys.argv[7]) # batchsize

    docs = corpus.corpus()
    docs.read_data(infile)

    vocab = open(sys.argv[8]).readlines()

    if L == 1:
        start = 0
        repeat = 30
        recset = set(map(int, n.logspace(0, 3.35, num=100, base=10.0).tolist()))
        modelset = list()
        for j in range(start, repeat):
            modelset.append(batchLDA(vocab, K, 2e3, alpha, eta, 0.01, kappa, L))

        for i in range(int(2e3)):
            for j in range(start, repeat):
                rand_ind = n.random.randint(int(2e3), size = S)
                wordids = [docs.docs[idx].words for idx in rand_ind]
                wordcts = [docs.docs[idx].counts for idx in rand_ind]
                modelset[j].update_lambda(wordids, wordcts)

            if i in recset:
                tmp = 0
                for j in range(start, repeat):
                    tmp += modelset[j]._grad
                grad = tmp/repeat
                n.savetxt('gradient-%d/gradient-%d-%d' % (L, 31, i), grad.T)  # for full sufficient statistics

    else:
        start = 0
        repeat = 1
        recset = set(map(int, n.logspace(0, 3.35, num=100, base=10.0).tolist()))
        for j in range(start, repeat):
            model = batchLDA(vocab, K, 2e3,
                                alpha, eta, 0.01, kappa, L)
            for i in range(int(2e3)):
                # print i
                # rand_ind = n.random.randint(int(2e3), size = S) # for
                rand_ind = range(int(2e3))  # for full sufficient statistics
                wordids = [docs.docs[idx].words for idx in rand_ind]
                wordcts = [docs.docs[idx].counts for idx in rand_ind]
                # wordids = [d.words for d in docs.docs[(i*S):((i+1)*S)]]
                # wordcts = [d.counts for d in docs.docs[(i*S):((i+1)*S)]]
                model.update_lambda(wordids, wordcts)
                # n.savetxt('lambda-%d/lambda-%d-%d' % (L, L, i) , model._lambda.T)
                if i in recset:
                    grad = model._grad
                    # gradient-1 folder should pre-exist on pwd
                    n.savetxt('gradient-%d/gradient-%d-%d' % (L, 31, i), grad.T) # for full sufficient statistics
Example no. 3
def train_apply_classifier(classifier='NaiveBayes',
                           qfile_train='question_train.csv',
                           qcatfile_train='question_category_train.csv',
                           catfile='category.csv',
                           qfile_test='question_test.csv',
                           subcats=False):
    """This method performs a parameter tuning using cross validation for the specified classfier.
    After the hyper-parameter(s) are selected it returns the predicted labes for the given test-set.
    Following 3 classifiers are known to the method:
        - "NaiveBayes" (default)
        - "LogisticRegression"
        - "RandomForest"
    """
    # initalizing corpus
    corpus = corpus_class.corpus(categories.categories())
    corpus.load(qfile_train, qcatfile_train)
    filts = std_filters()
    corpus.process(corpus_size=-1, **filts)
    corpus.simple_split(0)

    #corpus = corpus_class.load_from_file()
    #corpus.simple_split(0)

    if classifier == 'NaiveBayes':
        clf_par = MultinomialNB_params(corpus)
        clf, feat_params = CV(corpus, *clf_par, n_folds=3)
    elif classifier == 'LogisticRegression':
        clf_par = LogisticRegression_params()
        clf, feat_params = CV(corpus, *clf_par, n_folds=3)
    elif classifier == 'RandomForest':
        clf_par = RandomForest_params(corpus)
        clf, feat_params = CV(corpus,
                              *clf_par,
                              n_folds=3,
                              skipping_rule=RF_skipping_rule)
    else:
        raise ValueError(
            "The given classifier is not known to this method. See the docstring for the supported classifiers."
        )

    # fitting the classifier on the entire training set
    corpus.simple_split(0)
    corpus.make_features(**feat_params)
    clf.fit(corpus.X_tr, corpus.y_tr)

    X_te = corpus.process_example(qfile_test)

    return clf.predict(X_te)
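A hedged usage sketch, assuming the CSV files named in the default arguments are available in the working directory:

# predict test-set labels with a cross-validated logistic regression
predictions = train_apply_classifier(classifier='LogisticRegression',
                                     qfile_train='question_train.csv',
                                     qcatfile_train='question_category_train.csv',
                                     catfile='category.csv',
                                     qfile_test='question_test.csv')
print(predictions[:10])  # first ten predicted category ids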
Example no. 4
def main():
    print("main started")
    with open('../data/movie-dialog-corpus/movie_lines.tsv') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        dialogues = []
        for row in reader:
            dialogues.append(row[-1])

    corp = corpus()
    corp.read_corpus(dialogues)

    lm = languageModel(corp)

    methods = ['greedy', 'sampling', 'beamSearch']
    for method in methods:
        print("Method: ", method)
        conversation = []
        sentence = []
        #no capital letters!
        fourgram = ['how', 'much', 'do', 'we']
        for i in fourgram:
            sentence.append(i)
        print(sentence)
        n = 1
        while n < 6:
            if fourgram[3] == "</s>":
                n = n + 1
                conversation.append(sentence)
                sentence = []
                nextword = lm.endofSentence
                sentence.append(nextword)
                print("New senctence: ", nextword)
                fourgram = [fourgram[1], fourgram[2], fourgram[3], nextword]

            else:
                if method == 'greedy':
                    nextword = lm.greedy(lm.score(fourgram))
                elif method == 'sampling':
                    nextword = lm.sampling(lm.score(fourgram))
                else:
                    nextword = lm.beamSearch(fourgram, lm.score(fourgram))
                sentence.append(nextword)
                print(nextword)
                fourgram = [fourgram[1], fourgram[2], fourgram[3], nextword]
        conversation.append(sentence)
        speaker = ["Speaker 1:", "Speaker 2:"]
        for q, sent in enumerate(conversation):
            speak(speaker[q % 2], sent)
Example no. 5
def main():
    
    c = corpus()
    tfidf = TFIDF()
    tf_type = 'aug_freq'
    idf_type = 'inv_smooth_idf'
    for i, doc in enumerate(c.documents):
        cnt = 0
        print("Top words in document {}".format(i + 1))
        scores = {word: tfidf.tfidf(word, doc, c.documents, tf_type, idf_type)
                  for word in doc.words}
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        for word, score in sorted_words[:10]:
            if score > 0:
                cnt += 1
                print("\tWord {}: {}, TF-IDF: {}".format(cnt, word, round(score, 5)))
Example no. 6
def main():
    infile = sys.argv[1]
    K = int(sys.argv[2])
    alpha = float(sys.argv[3])
    eta = float(sys.argv[4])
    kappa = float(sys.argv[5])
    S = int(sys.argv[6])

    docs = corpus.corpus()
    docs.read_data(infile)

    vocab = open(sys.argv[7]).readlines()
    # note: the alpha, eta and kappa parsed above are not passed to the model; the literals below are used instead
    model = OnlineLDA(vocab, K, 100000, 0.1, 0.01, 1, 0.75)
    for i in range(1000):
        print i
        wordids = [d.words for d in docs.docs[(i * S):((i + 1) * S)]]
        wordcts = [d.counts for d in docs.docs[(i * S):((i + 1) * S)]]
        model.update_lambda(wordids, wordcts)
        n.savetxt('/tmp/lambda%d' % i, model._lambda.T)
Example no. 7
def main():
    infile = sys.argv[1]
    K = int(sys.argv[2])
    alpha = float(sys.argv[3])
    eta = float(sys.argv[4])
    kappa = float(sys.argv[5])
    S = int(sys.argv[6])

    docs = corpus.corpus()
    docs.read_data(infile)

    vocab = open(sys.argv[7]).readlines()
    # note: the alpha, eta and kappa parsed above are not passed to the model; the literals below are used instead
    model = OnlineLDA(vocab, K, 100000,
                      0.1, 0.01, 1, 0.75)
    for i in range(1000):
        print i
        wordids = [d.words for d in docs.docs[(i*S):((i+1)*S)]]
        wordcts = [d.counts for d in docs.docs[(i*S):((i+1)*S)]]
        model.update_lambda(wordids, wordcts)
        n.savetxt('/tmp/lambda%d' % i, model._lambda.T)
Example no. 8
    print("\t-h, --help\t\t\t\t\t\t\tGet usage info")
    #print("\t-o file, --output=file\t\t\t\t\t\tOutput all information into file")
    print(
        "\t-s name1,name2,..nameN, --source=name1,name2,..nameN\t\tSpecifies the corpus source newspaper. Valid sources are RG, Novaya"
    )
    print("\t\t\t\t\t\t\t\t\tWithout key all possible sources are used")
    print("")


if __name__ == "__main__":
    input_period = []
    output = []
    newspaper = []
    output_file = None
    get_options(a_period=input_period, a_output=output, a_source=newspaper)
    corp = corpus.corpus()
    corp.load('dumps/corp_multy-lemm.dump')
    data = corp.get_lemm(
        period=[int(input_period[0][0]),
                int(input_period[0][1])],
        sources=newspaper)
    help()
    while True:
        command = input('--> ')
        if command == "help":
            help()

        elif command.startswith("collocation"):
            try:
                command.split(' ')[1]
            except:
Example no. 9
    print("\tcli.py -c first_year,last_year [options]")
    print("OPTIONS:")
    print("\t-c first_year,last_year, --corpus=first_year,last_year\t\tMandatory parameter. Certain time period shoud be specifyed")
    print("\t-h, --help\t\t\t\t\t\t\tGet usage info")
    #print("\t-o file, --output=file\t\t\t\t\t\tOutput all information into file")
    print("\t-s name1,name2,..nameN, --source=name1,name2,..nameN\t\tSpecifies the corpus source newspaper. Valid sources are RG, Novaya")
    print("\t\t\t\t\t\t\t\t\tWithout key all possible sources are used")
    print("")

if __name__ == "__main__":
    input_period = []
    output = []
    newspaper = []
    output_file = None
    get_options(a_period=input_period,a_output=output,a_source=newspaper)
    corp = corpus.corpus()
    corp.load('dumps/corp_multy-lemm.dump')
    data = corp.get_lemm(period=[int(input_period[0][0]), int(input_period[0][1])], sources=newspaper)
    help()
    while True:
        command = input('--> ')
        if command == "help":
            help()

        elif command.startswith("collocation"):
            try:
                command.split(' ')[1]
            except IndexError:
                print("ngram wasn't found")
                help()
                continue
Example no. 10
def main():
    corp = corpus()

    # rg_98 = text_unit(dates=[1997, 1998, 1999], sources=['RG'])
    # rg_98.read(DATA_DIR, '98', 'rg' + DATA_EXT)
    # rg_98.get_lemm()
    # corp.add(rg_98)

    # rg_01 = text_unit(dates=[2000, 2001, 2002], sources=['RG'])
    # rg_01.read(DATA_DIR, '01', 'rg' + DATA_EXT)
    # rg_01.get_lemm()
    # corp.add(rg_01)

    # rg_04 = text_unit(dates=[2003, 2004, 2005], sources=['RG'])
    # rg_04.read(DATA_DIR, '04', 'rg' + DATA_EXT)
    # rg_04.get_lemm()
    # corp.add(rg_04)

    # rg_16 = text_unit(dates=[2016], sources=['RG'])
    # rg_16.read(DATA_DIR, '16', 'rg' + DATA_EXT)
    # rg_16.get_lemm()
    # corp.add(rg_16)

    # nv_98 = text_unit(dates=[1997, 1998, 1999], sources=['Novaya'])
    # nv_98.read(DATA_DIR, '98', 'nv' + DATA_EXT)
    # nv_98.get_lemm()
    # corp.add(nv_98)

    # nv_01 = text_unit(dates=[2000, 2001, 2002], sources=['Novaya'])
    # nv_01.read(DATA_DIR, '01', 'nv' + DATA_EXT)
    # nv_01.get_lemm()
    # corp.add(nv_01)

    # nv_04 = text_unit(dates=[2003, 2004, 2005], sources=['Novaya'])
    # nv_04.read(DATA_DIR, '04', 'nv' + DATA_EXT)
    # nv_04.get_lemm()
    # corp.add(nv_04)

    # nv_16 = text_unit(dates=[2016], sources=['Novaya'])
    # nv_16.read(DATA_DIR, '16', 'nv' + DATA_EXT)
    # nv_16.get_lemm()
    # corp.add(nv_16)

    # corp.dump(path='dumps/corp.dump')

    # corp.load('dumps/corp.dump')

    # corp.get_info()

    # lemmas = corp.get_lemm(sources='RG')
    # corp.add_stat(name='RG_lemmas', value=lemmas, descr='RG lemmas')

    # lemmas = corp.get_lemm(sources='Novaya')
    # corp.add_stat(name='Novaya_lemmas', value=lemmas, descr='Novaya lemmas')

    # lemmas = corp.get_lemm(period=[1997, 1999])
    # corp.add_stat(
    #     name='98_lemmas', value=lemmas, descr='Lemmas from 1997 till 1999')

    # lemmas = corp.get_lemm(period=[2000, 2002])
    # corp.add_stat(
    #     name='01_lemmas', value=lemmas, descr='Lemmas from 2000 till 2002')

    # lemmas = corp.get_lemm(period=[2003, 2005])
    # corp.add_stat(
    #     name='04_lemmas', value=lemmas, descr='Lemmas from 2003 till 2005')

    # lemmas = corp.get_lemm(period=[2016, 2016])
    # corp.add_stat(
    #     name='16_lemmas', value=lemmas, descr='Lemmas from 2016')

    # corp.get_lemm()

    # corp.dump('dumps/corp_multy-lemm.dump')

    corp.load('dumps/corp_multy-lemm.dump')

    corp.get_info()

    # lemm_rate = corp.get_lemm_freq()
    # for lemm, q in lemm_rate:
    #     print('{}: {} times'.format(lemm, q))

    lemm_rate = corp.get_lemm_freq(5)
    for lemm, q in lemm_rate:
        print('{}: {} times'.format(lemm, q))
Example no. 11
    def __init__(self, _fichier, param):
        self.corpus = corpus.corpus(_fichier).generer()
        self.model = w2v.word2vec(param)
Example no. 12
                inp = Variable(torch.from_numpy(corpus.train_ids[idx:idx + 64])).cuda()
                tag = Variable(torch.from_numpy(corpus.train_tags[idx:idx + 64])).cuda()
            if inp.size(0) != batch_size:
                continue
            pred = model(inp, states)
            _, pred_idx = torch.max(pred, 1)
            loss = criterion(pred, tag)
            # clip parameter gradients
            dev_loss.append(loss.data[0])
            dev_acc.append(sum(pred_idx.cpu().data.numpy() == tag.cpu().data.numpy()) * 1. / tag.size(0))

        print("epoch :{},train mean loss:{},dev mean loss:{}".format(epoch,np.mean(train_loss),np.mean(dev_loss)))
        train_loss_p.append(np.mean(train_loss))
        dev_loss_p.append(np.mean(dev_loss))
        train_acc_p.append(np.mean(train_acc))
        dev_acc_p.append(np.mean(dev_acc))
        step_p.append(epoch)
        viz.line(
            X=np.column_stack((np.array(step_p), np.array(step_p), np.array(step_p), np.array(step_p))),
            Y=np.column_stack((np.array(train_loss_p), np.array(train_acc_p), np.array(dev_loss_p), np.array(dev_acc_p))),
            win=line,
            opts=dict(legend=["Train_mean_loss", "Train_acc", "Eval_mean_loss", "Eval_acc"]))


if __name__ == '__main__':
    # note: this rebinds the name `corpus` from the class to the instance
    corpus = corpus("data/pos.csv", "data/neg.csv", "data/neutral.csv", "data/stop_words.csv")
    model = BILSTM(corpus.num_classes, corpus.vocab_size, hidden_size, num_layers)
    if cuda:
        model = model.cuda()
    train(corpus, model)
Example no. 13
def main():
    corp = corpus()

    # rg_98 = text_unit(dates=[1997, 1998, 1999], sources=['RG'])
    # rg_98.read(DATA_DIR, '98', 'rg' + DATA_EXT)
    # rg_98.get_lemm()
    # corp.add(rg_98)

    # rg_01 = text_unit(dates=[2000, 2001, 2002], sources=['RG'])
    # rg_01.read(DATA_DIR, '01', 'rg' + DATA_EXT)
    # rg_01.get_lemm()
    # corp.add(rg_01)

    # rg_04 = text_unit(dates=[2003, 2004, 2005], sources=['RG'])
    # rg_04.read(DATA_DIR, '04', 'rg' + DATA_EXT)
    # rg_04.get_lemm()
    # corp.add(rg_04)

    # rg_16 = text_unit(dates=[2016], sources=['RG'])
    # rg_16.read(DATA_DIR, '16', 'rg' + DATA_EXT)
    # rg_16.get_lemm()
    # corp.add(rg_16)

    # nv_98 = text_unit(dates=[1997, 1998, 1999], sources=['Novaya'])
    # nv_98.read(DATA_DIR, '98', 'nv' + DATA_EXT)
    # nv_98.get_lemm()
    # corp.add(nv_98)

    # nv_01 = text_unit(dates=[2000, 2001, 2002], sources=['Novaya'])
    # nv_01.read(DATA_DIR, '01', 'nv' + DATA_EXT)
    # nv_01.get_lemm()
    # corp.add(nv_01)

    # nv_04 = text_unit(dates=[2003, 2004, 2005], sources=['Novaya'])
    # nv_04.read(DATA_DIR, '04', 'nv' + DATA_EXT)
    # nv_04.get_lemm()
    # corp.add(nv_04)

    # nv_16 = text_unit(dates=[2016], sources=['Novaya'])
    # nv_16.read(DATA_DIR, '16', 'nv' + DATA_EXT)
    # nv_16.get_lemm()
    # corp.add(nv_16)

    # corp.dump(path='dumps/corp.dump')

    # corp.load('dumps/corp.dump')

    # corp.get_info()

    # lemmas = corp.get_lemm(sources='RG')
    # corp.add_stat(name='RG_lemmas', value=lemmas, descr='RG lemmas')

    # lemmas = corp.get_lemm(sources='Novaya')
    # corp.add_stat(name='Novaya_lemmas', value=lemmas, descr='Novaya lemmas')

    # lemmas = corp.get_lemm(period=[1997, 1999])
    # corp.add_stat(
    #     name='98_lemmas', value=lemmas, descr='Lemmas from 1997 till 1999')

    # lemmas = corp.get_lemm(period=[2000, 2002])
    # corp.add_stat(
    #     name='01_lemmas', value=lemmas, descr='Lemmas from 2000 till 2002')

    # lemmas = corp.get_lemm(period=[2003, 2005])
    # corp.add_stat(
    #     name='04_lemmas', value=lemmas, descr='Lemmas from 2003 till 2005')

    # lemmas = corp.get_lemm(period=[2016, 2016])
    # corp.add_stat(
    #     name='16_lemmas', value=lemmas, descr='Lemmas from 2016')

    # corp.get_lemm()

    # corp.dump('dumps/corp_multy-lemm.dump')

    corp.load('dumps/corp_multy-lemm.dump')

    corp.get_info()

    # lemm_rate = corp.get_lemm_freq()
    # for lemm, q in lemm_rate:
    #     print('{}: {} times'.format(lemm, q))

    lemm_rate = corp.get_lemm_freq(5)
    for lemm, q in lemm_rate:
        print('{}: {} times'.format(lemm, q))
Example no. 14
#         # Initialize the variational distribution q(theta|gamma) for
#         # the mini-batch
#         gamma = 1*n.random.gamma(100., 1./100., (batchD, self._K))
#         Elogtheta = dirichlet_expectation(gamma)
#         expElogtheta = n.exp(Elogtheta)
#         sstats = n.zeros(self._lambda.shape)
#         # Now, for each document d update that document's gamma and phi
#         it = 0
#         meanchange = 0
#         for d in range(0, batchD):
#             # These are mostly just shorthand (but might help cache locality)
#             ids = wordids[d]
#             cts = wordcts[d]
#             gammad = gamma[d, :]
#             Elogthetad = Elogtheta[d, :]
#             expElogthetad = expElogtheta[d, :]
#             expElogbetad = self._expElogbeta[:, ids]
#             # The optimal phi_{dwk} is proportional to
#             # expElogthetad_k * expElogbetad_w. phinorm is the normalizer.
#             phinorm = n.dot(expElogthetad, expElogbetad) + 1e-100
#             # Iterate between gamma and phi until convergence
#             for it in range(0, 100):
#                 lastgamma = gammad
#                 # We represent phi implicitly to save memory and time.
#                 # Substituting the value of the optimal phi back into
#                 # the update for gamma gives this update. Cf. Lee&Seung 2001.
#                 gammad = self._alpha + expElogthetad * \
#                     n.dot(cts / phinorm, expElogbetad.T)
#                 Elogthetad = dirichlet_expectation(gammad)
#                 expElogthetad = n.exp(Elogthetad)
#                 phinorm = n.dot(expElogthetad, expElogbetad) + 1e-100
#                 # If gamma hasn't changed much, we're done.
#                 meanchange = n.mean(abs(gammad - lastgamma))
#                 if (meanchange < meanchangethresh):
#                     break
#             gamma[d, :] = gammad
#             # Contribution of document d to the expected sufficient
#             # statistics for the M step.
#             sstats[:, ids] += n.outer(expElogthetad.T, cts/phinorm)
#         # This step finishes computing the sufficient statistics for the
#         # M step, so that
#         # sstats[k, w] = \sum_d n_{dw} * phi_{dwk}
#         # = \sum_d n_{dw} * exp{Elogtheta_{dk} + Elogbeta_{kw}} / phinorm_{dw}.
#         sstats = sstats * self._expElogbeta
#         return((gamma, sstats))

    def update_lambda_docs(self, docs):
        """
        First does an E step on the mini-batch given in wordids and
        wordcts, then uses the result of that E step to update the
        variational parameter matrix lambda.

        Arguments:
        docs:  List of D documents. Each document must be represented
               as a string. (Word order is unimportant.) Any
               words not in the vocabulary will be ignored.

        Returns gamma, the parameters to the variational distribution
        over the topic weights theta for the documents analyzed in this
        update.

        Also returns an estimate of the variational bound for the
        entire corpus for the OLD setting of lambda based on the
        documents passed in. This can be used as a (possibly very
        noisy) estimate of held-out likelihood.
        """
        # rhot will be between 0 and 1, and says how much to weight
        # the information we got from this mini-batch.
        rhot = pow(self._tau0 + self._updatect, -self._kappa)
        self._rhot = rhot
        # Do an E step to update gamma, phi | lambda for this
        # mini-batch. This also returns the information about phi that
        # we need to update lambda.
        (gamma, sstats) = self.do_e_step_docs(docs)
        # Estimate held-out likelihood for current values of lambda.
        bound = self.approx_bound_docs(docs, gamma)
        # Update lambda based on documents.
        self._lambda = self._lambda * (1-rhot) + \
            rhot * (self._eta + self._D * sstats / len(docs))
        self._Elogbeta = dirichlet_expectation(self._lambda)
        self._expElogbeta = n.exp(self._Elogbeta)
        self._updatect += 1

        return(gamma, bound)

    def update_lambda(self, wordids, wordcts):
        """
        First does an E step on the mini-batch given in wordids and
        wordcts, then uses the result of that E step to update the
        variational parameter matrix lambda.

        Arguments:
        docs:  List of D documents. Each document must be represented
               as a string. (Word order is unimportant.) Any
               words not in the vocabulary will be ignored.

        Returns gamma, the parameters to the variational distribution
        over the topic weights theta for the documents analyzed in this
        update.

        Also returns an estimate of the variational bound for the
        entire corpus for the OLD setting of lambda based on the
        documents passed in. This can be used as a (possibly very
        noisy) estimate of held-out likelihood.
        """
        # rhot will be between 0 and 1, and says how much to weight
        # the information we got from this mini-batch.
        rhot = pow(self._tau0 + self._updatect, -self._kappa)
        self._rhot = rhot
        # Do an E step to update gamma, phi | lambda for this
        # mini-batch. This also returns the information about phi that
        # we need to update lambda.
        (gamma, sstats) = self.do_e_step(wordids, wordcts)
        # Estimate held-out likelihood for current values of lambda.
        bound = self.approx_bound(wordids, wordcts, gamma)
        # Update lambda based on documents.
        self._lambda = self._lambda * (1-rhot) + \
            rhot * (self._eta + self._D * sstats / len(wordids))
        self._Elogbeta = dirichlet_expectation(self._lambda)
        self._expElogbeta = n.exp(self._Elogbeta)
        self._updatect += 1

        return(gamma, bound)

    def approx_bound(self, wordids, wordcts, gamma):
        """
        Estimates the variational bound over *all documents* using only
        the documents passed in as "docs." gamma is the set of parameters
        to the variational distribution q(theta) corresponding to the
        set of documents passed in.

        The output of this function is going to be noisy, but can be
        useful for assessing convergence.
        """
        # This is to handle the case where someone just hands us a single
        # document, not in a list.
        batchD = len(wordids)

        score = 0
        Elogtheta = dirichlet_expectation(gamma)
        expElogtheta = n.exp(Elogtheta)

        # E[log p(docs | theta, beta)]
        for d in range(0, batchD):
            gammad = gamma[d, :]
            ids = wordids[d]
            cts = n.array(wordcts[d])
            phinorm = n.zeros(len(ids))
            for i in range(0, len(ids)):
                temp = Elogtheta[d, :] + self._Elogbeta[:, ids[i]]
                tmax = max(temp)
                phinorm[i] = n.log(sum(n.exp(temp - tmax))) + tmax
            score += n.sum(cts * phinorm)
#             oldphinorm = phinorm
#             phinorm = n.dot(expElogtheta[d, :], self._expElogbeta[:, ids])
#             print oldphinorm
#             print n.log(phinorm)
#             score += n.sum(cts * n.log(phinorm))

        # E[log p(theta | alpha) - log q(theta | gamma)]
        score += n.sum((self._alpha - gamma)*Elogtheta)
        score += n.sum(gammaln(gamma) - gammaln(self._alpha))
        score += sum(gammaln(self._alpha*self._K) - gammaln(n.sum(gamma, 1)))

        # Compensate for the subsampling of the population of documents
        score = score * self._D / len(wordids)

        # E[log p(beta | eta) - log q (beta | lambda)]
        score = score + n.sum((self._eta-self._lambda)*self._Elogbeta)
        score = score + n.sum(gammaln(self._lambda) - gammaln(self._eta))
        score = score + n.sum(gammaln(self._eta*self._W) -
                              gammaln(n.sum(self._lambda, 1)))

        return(score)

    def approx_bound_docs(self, docs, gamma):
        """
        Estimates the variational bound over *all documents* using only
        the documents passed in as "docs." gamma is the set of parameters
        to the variational distribution q(theta) corresponding to the
        set of documents passed in.

        The output of this function is going to be noisy, but can be
        useful for assessing convergence.
        """
        # This is to handle the case where someone just hands us a single
        # document, not in a list.
        if (type(docs).__name__ == 'string'):
            temp = list()
            temp.append(docs)
            docs = temp

        (wordids, wordcts) = parse_doc_list(docs, self._vocab)
        batchD = len(docs)

        score = 0
        Elogtheta = dirichlet_expectation(gamma)
        expElogtheta = n.exp(Elogtheta)

        # E[log p(docs | theta, beta)]
        for d in range(0, batchD):
            gammad = gamma[d, :]
            ids = wordids[d]
            cts = n.array(wordcts[d])
            phinorm = n.zeros(len(ids))
            for i in range(0, len(ids)):
                temp = Elogtheta[d, :] + self._Elogbeta[:, ids[i]]
                tmax = max(temp)
                phinorm[i] = n.log(sum(n.exp(temp - tmax))) + tmax
            score += n.sum(cts * phinorm)
#             oldphinorm = phinorm
#             phinorm = n.dot(expElogtheta[d, :], self._expElogbeta[:, ids])
#             print oldphinorm
#             print n.log(phinorm)
#             score += n.sum(cts * n.log(phinorm))

        # E[log p(theta | alpha) - log q(theta | gamma)]
        score += n.sum((self._alpha - gamma)*Elogtheta)
        score += n.sum(gammaln(gamma) - gammaln(self._alpha))
        score += sum(gammaln(self._alpha*self._K) - gammaln(n.sum(gamma, 1)))

        # Compensate for the subsampling of the population of documents
        score = score * self._D / len(docs)

        # E[log p(beta | eta) - log q (beta | lambda)]
        score = score + n.sum((self._eta-self._lambda)*self._Elogbeta)
        score = score + n.sum(gammaln(self._lambda) - gammaln(self._eta))
        score = score + n.sum(gammaln(self._eta*self._W) -
                              gammaln(n.sum(self._lambda, 1)))

        return(score)


def main():
    infile = sys.argv[1]
    K = int(sys.argv[2])
    alpha = float(sys.argv[3])
    eta = float(sys.argv[4])
    kappa = float(sys.argv[5])
    S = int(sys.argv[6])

    docs = corpus.corpus()
    docs.read_data(infile)

    vocab = open(sys.argv[7]).readlines()
    model = OnlineLDA(vocab, K, 100000, 0.1, 0.01, 1, 0.75)
    for i in range(1000):
        print i
        wordids = [d.words for d in docs.docs[(i*S):((i+1)*S)]]
        wordcts = [d.counts for d in docs.docs[(i*S):((i+1)*S)]]
        model.update_lambda(wordids, wordcts)
        n.savetxt('/tmp/lambda%d' % i, model._lambda.T)
#     infile = open(infile)
#     corpus.read_stream_data(infile, 100000)


if __name__ == '__main__':
    main()
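For reference, the step performed by update_lambda above is the online variational Bayes update for LDA: with step size \rho_t, the topic-word parameters are blended with the estimate computed from the current mini-batch B_t,

    \lambda_{kw}^{(t+1)} = (1 - \rho_t)\,\lambda_{kw}^{(t)} + \rho_t\Big(\eta + \frac{D}{|B_t|} \sum_{d \in B_t} n_{dw}\,\phi_{dwk}\Big), \qquad \rho_t = (\tau_0 + t)^{-\kappa},

where the inner sum is the sufficient-statistics matrix sstats produced by the E step (shown, commented out, at the top of this example).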
Example no. 15
import sys
sys.path.append('./model')
sys.path.append('./process')
sys.path.append('./training')
sys.path.append('./service')
from corpus import corpus
from corpus import csvExtractor
from NLTKpipeline import NLTKpipeline

extractor = csvExtractor(label_col=0,text_col=5, delim_patt='"')
extractor.setLabelDictionary({"0":"neg", "4":"pos"})
#c = corpus(loc="/home/nitrous/data/sentiment140/subset.csv", extractor=extractor)
c = corpus(loc="/home/nitrous/data/sentiment140/rand_subset.csv", extractor=extractor)

pipeline = NLTKpipeline()
pipeline.process(c)

for a in c.docs:
  a.toString()
  print "\n"



Example no. 16
def main():
    infile = sys.argv[1]
    K = int(sys.argv[2])
    # D is set below
    alpha = float(sys.argv[3])
    eta = float(sys.argv[4])
    kappa = float(sys.argv[5])
    L = int(sys.argv[6])
    S = int(sys.argv[7])  # batchsize

    docs = corpus.corpus()
    docs.read_data(infile)

    vocab = open(sys.argv[8]).readlines()

    if L == 1:
        start = 0
        repeat = 30
        recset = set(map(int,
                         n.logspace(0, 3.35, num=100, base=10.0).tolist()))
        modelset = list()
        for j in range(start, repeat):
            modelset.append(batchLDA(vocab, K, 2e3, alpha, eta, 0.01, kappa,
                                     L))

        for i in range(int(2e3)):
            for j in range(start, repeat):
                rand_ind = n.random.randint(int(2e3), size=S)
                wordids = [docs.docs[idx].words for idx in rand_ind]
                wordcts = [docs.docs[idx].counts for idx in rand_ind]
                modelset[j].update_lambda(wordids, wordcts)

            if i in recset:
                tmp = 0
                for j in range(start, repeat):
                    tmp += modelset[j]._grad
                grad = tmp / repeat
                n.savetxt('gradient-%d/gradient-%d-%d' % (L, 31, i),
                          grad.T)  # for full sufficient statistics

    else:
        start = 0
        repeat = 1
        recset = set(map(int,
                         n.logspace(0, 3.35, num=100, base=10.0).tolist()))
        for j in range(start, repeat):
            model = batchLDA(vocab, K, 2e3, alpha, eta, 0.01, kappa, L)
            for i in range(int(2e3)):
                # print i
                # rand_ind = n.random.randint(int(2e3), size = S) # for
                rand_ind = range(int(2e3))  # for full sufficient statistics
                wordids = [docs.docs[idx].words for idx in rand_ind]
                wordcts = [docs.docs[idx].counts for idx in rand_ind]
                # wordids = [d.words for d in docs.docs[(i*S):((i+1)*S)]]
                # wordcts = [d.counts for d in docs.docs[(i*S):((i+1)*S)]]
                model.update_lambda(wordids, wordcts)
                # n.savetxt('lambda-%d/lambda-%d-%d' % (L, L, i) , model._lambda.T)
                if i in recset:
                    grad = model._grad
                    # gradient-1 folder should pre-exist on pwd
                    n.savetxt('gradient-%d/gradient-%d-%d' % (L, 31, i),
                              grad.T)  # for full sufficient statistics
Example no. 17
            pred_ids.append(sent_ids)

    pred_ids = np.array(pred_ids)
    acc = []
    for idx in tqdm(range(0, len(pred_ids), batch_size)):
        if cuda:
            # note: both inp and tag are sliced from pred_ids, and the slice width 64 assumes batch_size == 64
            inp = Variable(torch.from_numpy(pred_ids[idx:idx + 64])).cuda()
            tag = Variable(torch.from_numpy(pred_ids[idx:idx + 64])).cuda()
        pred = cnn(inp)
        _, pred_idx = torch.max(pred, 1)
        acc.append((sum(pred_idx.cpu().data.numpy() == 1) * 1. / tag.size(0)))
    print(np.mean(acc))


if __name__ == '__main__':
    corpus_ = corpus('bank_all_0.txt', 'bank_all_1.txt', 'stop_words.csv')
    print('Max length of sents :{}'.format(np.max(corpus_.lengths)))
    print('The vocab size is {}'.format(len(corpus_.token2idx)))
    print('Train ids shape {}'.format(corpus_.train_ids.shape))
    print('Eval ids shape {}'.format(corpus_.eval_ids.shape))
    num_classes = 2
    vocab_size = len(corpus_.token2idx)
    emb_dim = 128
    criterion = nn.CrossEntropyLoss()

    if not os.path.isfile('cnn.pt'):
        if cuda:
            cnn = CNN(num_classes, vocab_size, emb_dim, filter_sizes,
                      num_filters).cuda()
        else:
            cnn = CNN(num_classes, vocab_size, emb_dim, filter_sizes,