Beispiel #1
0
def generate_topics_for_file(data_dir, fname, lda, topic_idx, dictionary):
    """Create the topics LIF file for a single document.

    The input is loaded from ``<data_dir>/lif/<fname>``. The output is built
    from the input's JSON, gets its text value cleared, its text source set
    to the input path, its metadata emptied and its views replaced by one
    new view. That view holds the markable annotation for the input followed
    by one topic annotation per topic that ``lda`` returns for the document.
    The result is written to ``<data_dir>/top/``.

    If the input file does not exist a warning is printed and the function
    returns without writing the output file.

    Args:
        data_dir: directory under which the 'lif' and 'top' paths are built.
        fname: name of the input file inside the 'lif' subdirectory.
        lda: model object; its get_document_topics() is called with the
            bag-of-words of the document.
        topic_idx: mapping queried with .get() using the first element of
            each topic tuple; the result is passed on to
            get_lemmas_from_topic_name().
        dictionary: object whose doc2bow() turns the prepared text into the
            bag-of-words handed to the model.
    """
    source_path = os.path.join(data_dir, 'lif', fname)
    # NOTE(review): the last five characters of fname are dropped before
    # '.lif' is appended -- presumably a five-character extension; confirm
    # against the callers.
    target_path = os.path.join(data_dir, 'top', fname[:-5] + '.lif')
    ensure_directory(target_path)

    try:
        source_lif = LIF(source_path)
    except FileNotFoundError:
        print("Warning: file '%s' does not exist" % source_path)
        return

    target_lif = LIF(json_object=source_lif.as_json())
    # Save some space in the output: the text and the metadata can always be
    # retrieved from the input file that text.source points at.
    target_lif.text.value = None
    target_lif.text.source = source_path
    target_lif.metadata = {}

    view = _create_view()
    target_lif.views = [view]
    view.annotations.append(markable_annotation(source_lif))

    tokens = prepare_text_for_lda(source_lif.text.value)
    document_topics = lda.get_document_topics(dictionary.doc2bow(tokens))
    # Each topic is a tuple of a topic number and a score; the annotations
    # themselves are numbered consecutively starting at 1.
    for topic_id, topic in enumerate(document_topics, start=1):
        topic_name = topic_idx.get(topic[0])
        lemmas = get_lemmas_from_topic_name(topic_name)
        annotation = topic_annotation(topic, topic_id, lemmas)
        view.annotations.append(annotation)

    target_lif.write(fname=target_path, pretty=True)
def generate_topics_for_file(data_dir, fname, lda, topic_idx, dictionary):
    """Create the topics LIF file for a single document.

    The input is read as the payload of the container stored in the 'lif'
    subdirectory of ``data_dir``. The output is built from the input's JSON,
    gets its metadata emptied and its views replaced by one new view. That
    view holds the markable annotation for the input followed by one topic
    annotation per topic that ``lda`` returns for the document. The result
    is written to the 'top' subdirectory under the same base name.

    Args:
        data_dir: directory under which the 'lif' and 'top' paths are built.
        fname: file name used to derive the input and output names.
        lda: model object; its get_document_topics() is called with the
            bag-of-words of the document.
        topic_idx: mapping queried with .get() using the first element of
            each topic tuple; the result is passed on to
            get_lemmas_from_topic_name().
        dictionary: object whose doc2bow() turns the prepared text into the
            bag-of-words handed to the model.
    """
    # NOTE(review): the last four characters of fname are replaced by '.lif'
    # -- presumably a four-character extension such as '.txt'; confirm
    # against the callers.
    lif_name = fname[:-4] + '.lif'
    path_in = os.path.join(data_dir, 'lif', lif_name)
    path_out = os.path.join(data_dir, 'top', lif_name)
    ensure_directory(path_out)

    payload = Container(path_in).payload
    result = LIF(json_object=payload.as_json())
    # The metadata can be read from the input file, so it is dropped here to
    # save some space.
    result.metadata = {}

    view = _create_view()
    result.views = [view]
    view.annotations.append(markable_annotation(payload))

    bag_of_words = dictionary.doc2bow(prepare_text_for_lda(payload.text.value))
    # Topics come back as tuples of a topic number and a score; annotation
    # identifiers simply count up from 1.
    for counter, topic in enumerate(lda.get_document_topics(bag_of_words), 1):
        lemmas = get_lemmas_from_topic_name(topic_idx.get(topic[0]))
        view.annotations.append(topic_annotation(topic, counter, lemmas))

    result.write(fname=path_out, pretty=True)
Beispiel #3
0
def generate_topics(lif, top):
    """Generate a topics LIF file for every ``.lif`` file in a directory.

    The model and the dictionary are loaded once. For each file in ``lif``
    whose name ends in '.lif', the file name is printed, the payload of the
    container is read, and an output LIF object is built from the payload's
    JSON. The output gets its metadata emptied and its views replaced by one
    new view, which holds the markable annotation for the input followed by
    one topic annotation per topic the model returns for the document. The
    result is written to ``top`` under the same file name.

    The output directory is created when it does not exist yet, so that
    writing the first file does not depend on the caller having prepared it.

    Args:
        lif: directory that is scanned for input files.
        top: directory the output files are written to.
    """
    lda = load_model()
    topic_idx = {
        number: name
        for number, name in lda.print_topics(num_topics=NUM_TOPICS)
    }
    dictionary = load_dictionary()

    # An empty path means the current directory, which os.makedirs() would
    # reject, so only create the output directory when a name was given.
    if top:
        os.makedirs(top, exist_ok=True)

    for fname in os.listdir(lif):
        if not fname.endswith('.lif'):
            continue

        print("{}".format(os.path.basename(fname)))
        fname_in = os.path.join(lif, fname)
        fname_out = os.path.join(top, fname)
        lif_in = Container(fname_in).payload
        lif_out = LIF(json_object=lif_in.as_json())
        # The metadata can be read from the input file, so it is dropped
        # here to save some space.
        lif_out.metadata = {}
        topics_view = _create_view()
        lif_out.views = [topics_view]

        topics_view.annotations.append(markable_annotation(lif_in))
        doc = prepare_text_for_lda(lif_in.text.value)
        bow = dictionary.doc2bow(doc)
        # Topics are tuples of a topic number and a score; the annotation
        # identifiers restart at 1 for every file.
        for topic_id, topic in enumerate(lda.get_document_topics(bow), start=1):
            lemmas = get_lemmas_from_topic_name(topic_idx.get(topic[0]))
            topics_view.annotations.append(
                topic_annotation(topic, topic_id, lemmas))
        lif_out.write(fname=fname_out, pretty=True)