Beispiel #1
0
def generate_topics_for_file(data_dir, fname, lda, topic_idx, dictionary):
    """Write a LIF file holding the LDA topics found for one document.

    The input is read from ``<data_dir>/lif/<fname>`` and the result is
    written to ``<data_dir>/top/``, under ``fname`` with its last five
    characters replaced by '.lif'. The output contains a single view with
    one markable annotation followed by one topic annotation per topic
    reported for the document.

    Args:
        data_dir: directory holding the 'lif' input and 'top' output
            sub directories.
        fname: name of the input file inside the 'lif' sub directory.
        lda: model queried through ``get_document_topics`` (presumably a
            gensim LDA model -- confirm against the caller).
        topic_idx: mapping, read with ``get``, from a topic identifier to
            the topic name that the lemmas are extracted from.
        dictionary: object whose ``doc2bow`` turns the prepared text into
            the bag of words handed to ``lda``.

    Returns:
        None. If the input file does not exist a warning is printed and
        nothing is written.
    """
    source_path = os.path.join(data_dir, 'lif', fname)
    target_path = os.path.join(data_dir, 'top', fname[:-5] + '.lif')
    ensure_directory(target_path)

    try:
        source_lif = LIF(source_path)
    except FileNotFoundError:
        print("Warning: file '%s' does not exist" % source_path)
        return

    target_lif = LIF(json_object=source_lif.as_json())
    # Text and metadata are dropped to save some space, they can be taken
    # from the input file, whose path is recorded as the text source.
    target_lif.text.value = None
    target_lif.text.source = source_path
    target_lif.metadata = {}

    view = _create_view()
    target_lif.views = [view]
    view.annotations.append(markable_annotation(source_lif))

    tokens = prepare_text_for_lda(source_lif.text.value)
    document_topics = lda.get_document_topics(dictionary.doc2bow(tokens))
    # Each topic is a tuple of topic identifier and score; annotations are
    # numbered from 1 in the order the model reports the topics.
    for number, topic in enumerate(document_topics, start=1):
        lemmas = get_lemmas_from_topic_name(topic_idx.get(topic[0]))
        view.annotations.append(topic_annotation(topic, number, lemmas))

    target_lif.write(fname=target_path, pretty=True)
Beispiel #2
0
def generate_sentence_types(ttk, sen, words):
    """Tag the sentences of every LIF file in a directory as crap or normal.

    For each file in ``ttk`` whose name ends in '.lif', the annotations of
    view 'v1' whose type ends in 'Sentence' are run through
    SentenceClassifier and given a 'type' feature of either 'crap' or
    'normal'. A copy of the input whose only view holds those tagged
    sentences is written under the same file name to ``sen``.

    When the module-level DEBUG flag is set, a banner per file plus the
    ratio and text of each sentence are written to the GOOD and BAD
    handles, and the per-file counts are printed.

    Args:
        ttk: directory with the input LIF files.
        sen: directory that receives the output files.
        words: handed unchanged to SentenceClassifier.
    """
    for fname in os.listdir(ttk):
        if not fname.endswith('.lif'):
            continue
        # os.listdir yields bare file names, so no basename call is needed.
        print("{} ... ".format(fname))
        if DEBUG:
            banner = ">>> %s\n>>> %s\n>>> %s\n\n" % ('-' * 100, fname,
                                                     '-' * 100)
            GOOD.write(banner)
            BAD.write(banner)

        lif_in = LIF(os.path.join(ttk, fname))
        lif_out = LIF(json_object=lif_in.as_json())
        sentences_view = _create_view()
        lif_out.views = [sentences_view]

        counts = {'normal': 0, 'crap': 0}
        for anno in lif_in.get_view('v1').annotations:
            if not anno.type.endswith('Sentence'):
                continue
            sc = SentenceClassifier(lif_in, anno, words)
            label = 'crap' if sc.is_crap() else 'normal'
            if DEBUG:
                log = BAD if label == 'crap' else GOOD
                log.write(">>> %f\n%s\n\n" % (sc.ratio, sc.text))
            anno.features['type'] = label
            counts[label] += 1
            sentences_view.annotations.append(anno)

        if DEBUG:
            print(" (good={:d} bad={:d})".format(counts['normal'],
                                                 counts['crap']))
        lif_out.write(fname=os.path.join(sen, fname), pretty=True)

    # This used to be a bare `print`, which in Python 3 merely names the
    # function and emits nothing; call it so the closing newline shows up.
    print()