Example #1
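# NB: excerpted from a larger module; assumed module-level imports include:
#   os, re, gc, sys, string, numpy as np, gensim, django, lda
#   from time import time; from functools import partial; from multiprocessing import Pool
#   from scipy.sparse import csr_matrix, lil_matrix, find
#   from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
#   from sklearn.decomposition import NMF, LatentDirichletAllocation as LDA
#   from sklearn.preprocessing import RobustScaler
#   from django: settings, management, timezone, Count
#   plus project models/helpers (RunStats, Doc, Topic, TopicTerm, DocTopic, Term,
#   proc_docs, stoplist, get_sentence, WithinTopicMeasure, ModelSimilarity, db, wpu, ...)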
def do_nmf(run_id, no_processes=16):
    stat = RunStats.objects.get(run_id=run_id)
    qid = stat.query.id
    K = stat.K

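    # Clear any results left over from a previous execution of this run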
    TopicTerm.objects.filter(run_id=run_id).delete()
    DocTopic.objects.filter(run_id=run_id).delete()
    Topic.objects.filter(run_id=run_id).delete()

    stat.term_set.clear()

    alpha = stat.alpha
    n_features = stat.max_features
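    # max_features == 0 is treated as "no limit" on the vocabulary size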
    if n_features == 0:
        n_features = 100000000000
    limit = stat.limit
    ng = stat.ngram

    # if stat.method=="LD" and stat.lda_library!=RunStats.WARP:
    #     if stat.max_iter == 200:
    #         stat.max_iter = 10
    #     if stat.max_iter > 100:
    #         stat.max_iter = 90

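    # NB: despite the name, n_samples holds the iteration cap (stat.max_iter)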
    n_samples = stat.max_iter

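    # Record the worker PID and mark the run as in progress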
    stat.process_id = os.getpid()
    stat.status = 1
    stat.save()

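    # Select the query's documents that contain at least one word of text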
    if stat.fulltext:
        docs = Doc.objects.filter(query=qid, fulltext__iregex=r'\w')
    else:
        docs = Doc.objects.filter(query=qid, content__iregex=r'\w')

    # if we are limiting, probably for testing, then do that
    if limit > 0:
        docs = docs[:limit]

    print('\n###############################\n'
          '## Topic modeling (method: {}, library: {}) on query {} '
          'with {} documents and {} topics (run_id: {})\n'.format(
              stat.method, stat.lda_library, qid, docs.count(), K, run_id))

    # Get the docs into lists
    abstracts, docsizes, ids, citations = proc_docs(docs, stoplist,
                                                    stat.fulltext,
                                                    stat.citations)

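    # Robust-scale citation counts (no centering) and shift by 1; used below to
    # weight each document's feature vector if citations are enabled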
    scaled_citations = 1 + RobustScaler(with_centering=False).fit_transform(
        np.array(citations).reshape(-1, 1))

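    # Train a Word2Vec model on the abstracts; used at the end to score
    # topic coherence over each topic's top terms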
    sentences = [get_sentence(x) for x in abstracts]
    w2v = gensim.models.Word2Vec(sentences)
    validation_measure = WithinTopicMeasure(ModelSimilarity(w2v))

    if stat.fancy_tokenization:
        ######################################
        ## A fancy tokenizer

        from nltk import wordpunct_tokenize
        from nltk import WordNetLemmatizer
        from nltk import sent_tokenize
        from nltk import pos_tag
        from nltk.corpus import stopwords as sw
        punct = set(string.punctuation)
        from nltk.corpus import wordnet as wn
        stopwords = set(sw.words('english'))

        if stat.extra_stopwords:
            stopwords = stopwords | set(stat.extra_stopwords)

        def lemmatize(token, tag):
            tag = {
                'N': wn.NOUN,
                'V': wn.VERB,
                'R': wn.ADV,
                'J': wn.ADJ
            }.get(tag[0], wn.NOUN)
            return WordNetLemmatizer().lemmatize(token, tag)

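        # Author keywords occurring in more than ~0.5% of documents; multi-word
        # keywords will be re-joined into single hyphenated tokens below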
        kws = Doc.objects.filter(
            query=stat.query,
            kw__text__iregex=r'\w+[\-\ ]').values('kw__text').annotate(
                n=Count('pk')).filter(n__gt=len(abstracts) //
                                      200).order_by('-n')

        kw_text = set([x['kw__text'].replace('-', ' ') for x in kws])
        kw_ws = set([x['kw__text'].replace('-', ' ').split()[0]
                     for x in kws]) - stopwords

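        # Tokenizer: re-joins known keyword phrases into hyphenated tokens, then
        # POS-tags and lemmatizes the rest, dropping stopwords, punctuation,
        # very short tokens and pure numbers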
        def fancy_tokenize(X):

            common_words = set([x.lower() for x in X.split()]) & kw_ws
            for w in list(common_words):
                w = w.replace('(', '').replace(')', '')
                wpat = r"({}\W*\w*)".format(w)
                matches = [
                    x.lower().replace('-', ' ')
                    for x in re.findall(wpat, X, re.IGNORECASE)
                ]
                kw_matches = set(matches) & kw_text
                if len(kw_matches) > 0:
                    for m in kw_matches:
                        insensitive_m = re.compile(m, re.IGNORECASE)
                        X = insensitive_m.sub(' ', X)
                        yield m.replace(" ", "-")

            for sent in sent_tokenize(X):
                for token, tag in pos_tag(wordpunct_tokenize(sent)):
                    token = token.lower().strip()
                    if token in stopwords:
                        continue
                    if all(char in punct for char in token):
                        continue
                    if len(token) < 3:
                        continue
                    if all(char in string.digits for char in token):
                        continue
                    lemma = lemmatize(token, tag)
                    yield lemma

        tokenizer = fancy_tokenize
    else:
        tokenizer = snowball_stemmer()

    #######################################

    #############################################
    # Use tf-idf features for NMF.
    print("Extracting features ...")
    tfidf_vectorizer = TfidfVectorizer(max_df=stat.max_df,
                                       min_df=stat.min_freq,
                                       max_features=n_features,
                                       ngram_range=(ng, ng),
                                       tokenizer=tokenizer,
                                       stop_words=stoplist)

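    # Raw term counts are used for the LDA methods; tf-idf is used for NMF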
    count_vectorizer = CountVectorizer(max_df=stat.max_df,
                                       min_df=stat.min_freq,
                                       max_features=n_features,
                                       ngram_range=(ng, ng),
                                       tokenizer=tokenizer,
                                       stop_words=stoplist)

    t0 = time()
    if stat.method == "NM":
        tfidf = tfidf_vectorizer.fit_transform(abstracts)
        vectorizer = tfidf_vectorizer
    else:
        tfidf = count_vectorizer.fit_transform(abstracts)
        vectorizer = count_vectorizer
    print("done in %0.3fs." % (time() - t0))
    stat.tfidf_time = time() - t0
    stat.save()

    if citations is not False:
        tfidf = tfidf.multiply(scaled_citations)

    del abstracts
    gc.collect()

    if stat.db:
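        # NB: get_feature_names() assumes an older scikit-learn; newer releases
        # renamed it to get_feature_names_out()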
        vocab = vectorizer.get_feature_names()
        vocab_ids = []
        pool = Pool(processes=no_processes)
        vocab_ids.append(pool.map(partial(add_features, run_id=run_id), vocab))
        pool.terminate()
        #del vocab
        vocab_ids = vocab_ids[0]

        ## Make some topics
        django.db.connections.close_all()
        topic_ids = db.add_topics(K, run_id)
        gc.collect()

    # Fit the topic model (NMF or LDA)
    print("Fitting the model with n_features=%d and max_iter=%d..."
          % (n_features, n_samples))
    t0 = time()
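    # NB: NMF(alpha=...) follows the older scikit-learn API; recent releases
    # split this parameter into alpha_W and alpha_H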
    if stat.method == "NM":
        model = NMF(n_components=K,
                    random_state=1,
                    alpha=alpha,
                    l1_ratio=.1,
                    verbose=True,
                    init='nndsvd',
                    max_iter=n_samples).fit(tfidf)
        dtm = csr_matrix(model.transform(tfidf))
        components = csr_matrix(model.components_)

    else:
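        # Three LDA back ends: the `lda` package (collapsed Gibbs sampling),
        # the external WarpLDA binary, or scikit-learn's LatentDirichletAllocation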
        if stat.lda_library == RunStats.LDA_LIB:
            model = lda.LDA(
                n_topics=K,
                alpha=stat.alpha,
                eta=stat.alpha,
                n_iter=stat.max_iter * 10,
            ).fit(tfidf)
            dtm = model.doc_topic_
            components = csr_matrix(model.components_)
        elif stat.lda_library == RunStats.WARP:
            # Export warp lda
            try:
                warp_path = settings.WARP_LDA_PATH
                os.chdir(warp_path)
            except Exception:
                print(
                    "warplda is not installed, or its path is not defined in settings, exiting..."
                )
                return
            fname = wpu.export_warp_lda(ids, tfidf, vocab, run_id)
            # preformat
            os.system(f'./format -input {fname} -prefix {run_id} train')
            # Run warp lda
            runcmd = f'./warplda --prefix {run_id} --k {stat.K}'
            if stat.alpha:
                runcmd += f' -alpha {stat.alpha}'
            if stat.beta:
                runcmd += f' -beta {stat.beta}'
            else:
                stat.beta = 0.01  # default beta value
                stat.save()
            if stat.max_iter:
                runcmd += f' --niter {stat.max_iter}'
            runcmd += ' train.model'
            print("Running warplda.")
            os.system(runcmd)
            print("Finished running warplda, importing results.")

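            # Map warplda's vocabulary order back onto the vectorizer's
            # (alphabetically sorted) vocabulary indices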
            warp_vocab = np.loadtxt(f'{run_id}.vocab', dtype=str)
            warp_translate = np.argsort(warp_vocab).argsort()
            # Import warp lda as matrices
            with open(f'{run_id}.model', 'r') as f:
                for i, l in enumerate(f):
                    if i == 0:
                        M = int(l.split()[0])
                        N = int(l.split()[1])
                        components = lil_matrix((N, M))
                    else:
                        largs = l.split('\t')[1].strip().split()
                        for la in largs:
                            wid = warp_translate[i - 1]
                            t, n = la.split(':')
                            components[int(t), wid] = int(n)

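            # Smooth the topic-word counts with beta and normalise each row
            # into a probability distribution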
            components = components.todense()
            for k in range(components.shape[0]):
                components[k, :] = (components[k, :] + stat.beta) / (
                    components[k, :].sum() + stat.K * stat.beta)
            components = csr_matrix(components)

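            # Rebuild the document-topic matrix from the per-token topic
            # assignments in the .z.estimate file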
            dtm = lil_matrix((len(ids), N))
            with open(f'{run_id}.z.estimate', 'r') as f:
                for i, l in enumerate(f):
                    largs = l.split(' ', maxsplit=1)[1].strip().split()
                    for la in largs:
                        w, t = la.split(':')
                        dtm[i, int(t)] += 1

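            # Smooth with alpha and normalise to get the document-topic
            # distribution theta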
            theta = dtm.todense()
            for i in range(dtm.shape[0]):
                theta[i, :] = (theta[i, :] + stat.alpha) / (
                    theta[i, :].sum() + stat.K * stat.alpha)

            dtm = csr_matrix(theta)

        else:
            model = LDA(
                n_components=K,
                doc_topic_prior=stat.alpha,
                topic_word_prior=stat.beta,
                learning_method=stat.get_lda_learning_method_display().lower(),
                max_iter=stat.max_iter,
                n_jobs=2).fit(tfidf)

            dtm = csr_matrix(model.transform(tfidf))
            components = csr_matrix(model.components_)

    print("done in %0.3fs." % (time() - t0))
    stat.nmf_time = time() - t0

    if stat.db:
        ## Add topics terms
        print("Adding topicterms to db")
        t0 = time()
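        # find() gives (topic_idx, term_idx, score) arrays for every non-zero
        # entry of the topic-term matrix; each entry becomes a TopicTerm object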
        ldalambda = find(components)
        topics = range(len(ldalambda[0]))
        tts = []
        pool = Pool(processes=no_processes)

        tts.append(
            pool.map(
                partial(db.f_lambda,
                        m=ldalambda,
                        v_ids=vocab_ids,
                        t_ids=topic_ids,
                        run_id=run_id), topics))
        pool.terminate()
        tts = flatten(tts)
        gc.collect()
        sys.stdout.flush()
        django.db.connections.close_all()
        TopicTerm.objects.bulk_create(tts)
        print("done in %0.3fs." % (time() - t0))
        stat.db_time = stat.db_time + time() - t0

        ## Add topic-docs
        print("Adding DocTopics")
        gamma = find(dtm)
        glength = len(gamma[0])

        chunk_size = 100000

        parallel_add = True

        all_dts = []

        make_t = 0
        add_t = 0

        t0 = time()
        ### Go through in chunks
        for i in range(glength // chunk_size + 1):
            dts = []
            values_list = []
            f = i * chunk_size
            l = (i + 1) * chunk_size
            if l > glength:
                l = glength
            docs = range(f, l)
            doc_batches = []
            for p in range(no_processes):
                doc_batches.append([x for x in docs if x % no_processes == p])
            pool = Pool(processes=no_processes)
            make_t0 = time()
            values_list.append(
                pool.map(
                    partial(db.f_gamma_batch,
                            gamma=gamma,
                            docsizes=docsizes,
                            docUTset=ids,
                            topic_ids=topic_ids,
                            run_id=run_id), doc_batches))
            #dts.append(pool.map(partial(f_gamma, gamma=gamma,
            #                docsizes=docsizes,docUTset=ids,topic_ids=topic_ids),doc_batches))
            pool.terminate()
            make_t += time() - make_t0
            print(make_t)
            django.db.connections.close_all()

            add_t0 = time()
            values_list = [item for sublist in values_list for item in sublist]
            pool = Pool(processes=no_processes)
            pool.map(insert_many, values_list)
            pool.terminate()
            add_t += time() - add_t0
            print(add_t)
            gc.collect()
            sys.stdout.flush()

        stat.db_time = stat.db_time + time() - t0
        print("done in %0.3fs." % (time() - t0))

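    # Count topics to which no document was assigned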
    em = 0
    for i in range(K):
        if dtm[:, i].nnz == 0:
            em += 1

    stat.empty_topics = em
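    # Store the model's fit statistic: Frobenius reconstruction error for NMF,
    # log likelihood for the lda package, perplexity for scikit-learn's LDA
    # (warplda does not report one)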
    if stat.method == "NM":
        stat.error = model.reconstruction_err_
        stat.errortype = "Frobenius"
    elif stat.method == "LD":
        if stat.lda_library == RunStats.LDA_LIB:
            stat.error = model.loglikelihood()
            stat.errortype = "Log likelihood"
            stat.iterations = model.n_iter
        elif stat.lda_library == RunStats.WARP:
            pass
        else:
            stat.error = model.perplexity(tfidf)
            stat.errortype = "Perplexity"
            stat.iterations = model.n_iter_
    stat.last_update = timezone.now()
    stat.status = 3

    stat.save()

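    # If storing to the DB, score topic coherence on each topic's top 50 terms
    # using the Word2Vec-based measure trained above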
    if stat.db:
        term_rankings = []

        topics = Topic.objects.filter(run_id=run_id)

        for topic in topics:
            term_ranking = list(
                Term.objects.filter(topicterm__topic=topic).order_by(
                    '-topicterm__score').values_list('title', flat=True)[:50])
            term_rankings.append(term_ranking)

        stat.coherence = validation_measure.evaluate_rankings(term_rankings)
        stat.save()
        management.call_command('update_run', run_id)