def generate_topics_for_file(data_dir, fname, lda, topic_idx, dictionary):
    """Write a LIF file holding the LDA topics of a single document.

    The input is read from ``<data_dir>/lif/<fname>`` and the result is
    written to ``<data_dir>/top/<fname without its last five characters>.lif``.
    The output has its text value cleared (only a pointer to the input file
    is kept), empty metadata and exactly one view. That view contains one
    markable annotation followed by one annotation per topic returned by
    ``lda.get_document_topics``.

    When the input file does not exist, a warning is printed and the function
    returns without writing an output file. ``ensure_directory`` has already
    been called for the output path at that point.

    Args:
        data_dir: directory containing the 'lif' and 'top' sub directories.
        fname: name of the input file inside the 'lif' sub directory.
        lda: model object providing ``get_document_topics(bow)``.
        topic_idx: mapping (accessed with ``get``) from topic identifier to
            the topic name that the lemmas are extracted from.
        dictionary: object providing ``doc2bow(tokens)``.

    NOTE(review): the output name drops five characters from ``fname`` while
    the input name uses ``fname`` unchanged -- confirm this matches the
    extension of the files actually passed in.
    NOTE(review): a second definition named ``generate_topics_for_file``
    follows in this file and would shadow this one -- confirm which is meant.
    """
    source_path = os.path.join(data_dir, 'lif', fname)
    target_path = os.path.join(data_dir, 'top', fname[:-5] + '.lif')
    ensure_directory(target_path)

    try:
        lif_in = LIF(source_path)
    except FileNotFoundError:
        print("Warning: file '%s' does not exist" % source_path)
        return

    lif_out = LIF(json_object=lif_in.as_json())
    # Text and metadata are available from the source LIF file, so they are
    # dropped here to keep the output small; only a pointer is retained.
    lif_out.text.value = None
    lif_out.text.source = source_path
    lif_out.metadata = {}

    view = _create_view()
    lif_out.views = [view]
    view.annotations.append(markable_annotation(lif_in))

    tokens = prepare_text_for_lda(lif_in.text.value)
    bag_of_words = dictionary.doc2bow(tokens)

    # Each topic is a tuple of topic identifier and score; annotations are
    # numbered consecutively starting at 1.
    document_topics = lda.get_document_topics(bag_of_words)
    for number, topic in enumerate(document_topics, start=1):
        topic_name = topic_idx.get(topic[0])
        lemmas = get_lemmas_from_topic_name(topic_name)
        annotation = topic_annotation(topic, number, lemmas)
        view.annotations.append(annotation)

    lif_out.write(fname=target_path, pretty=True)
def generate_topics_for_file(data_dir, fname, lda, topic_idx, dictionary):
    """Write a LIF file holding the LDA topics of a single document.

    The last four characters of ``fname`` are replaced by '.lif' to obtain
    the base name; the input is read from ``<data_dir>/lif/<base name>`` and
    the result is written to ``<data_dir>/top/<base name>``. The output has
    empty metadata and exactly one view, which contains one markable
    annotation followed by one annotation per topic returned by
    ``lda.get_document_topics``.

    Args:
        data_dir: directory containing the 'lif' and 'top' sub directories.
        fname: file name whose last four characters are swapped for '.lif'.
        lda: model object providing ``get_document_topics(bow)``.
        topic_idx: mapping (accessed with ``get``) from topic identifier to
            the topic name that the lemmas are extracted from.
        dictionary: object providing ``doc2bow(tokens)``.
    """
    lif_name = fname[:-4] + '.lif'
    source_path = os.path.join(data_dir, 'lif', lif_name)
    target_path = os.path.join(data_dir, 'top', lif_name)
    ensure_directory(target_path)

    lif_in = Container(source_path).payload
    lif_out = LIF(json_object=lif_in.as_json())
    # The metadata can be recovered from the source LIF file, so it is
    # dropped here to save some space.
    lif_out.metadata = {}

    view = _create_view()
    lif_out.views = [view]
    view.annotations.append(markable_annotation(lif_in))

    tokens = prepare_text_for_lda(lif_in.text.value)
    bag_of_words = dictionary.doc2bow(tokens)

    # Each topic is a tuple of topic identifier and score; annotations are
    # numbered consecutively starting at 1.
    document_topics = lda.get_document_topics(bag_of_words)
    for number, topic in enumerate(document_topics, start=1):
        topic_name = topic_idx.get(topic[0])
        lemmas = get_lemmas_from_topic_name(topic_name)
        annotation = topic_annotation(topic, number, lemmas)
        view.annotations.append(annotation)

    lif_out.write(fname=target_path, pretty=True)
def generate_topics(lif, top):
    """Write a topics LIF file for every '.lif' file in a directory.

    Loads the model with ``load_model`` and the dictionary with
    ``load_dictionary``, then processes every entry of directory ``lif``
    whose name ends in '.lif'. For each one the file name is printed and a
    file with the same name is written to directory ``top``. Each output has
    empty metadata and exactly one view, which contains one markable
    annotation followed by one annotation per topic returned by the model
    for that document.

    Args:
        lif: directory that is listed for input '.lif' files.
        top: directory the output files are written to. It is not created
            here -- presumably it has to exist already; confirm with caller.
    """
    lda = load_model()
    # Look-up table from topic identifier to the topic name printed by the model
    topic_idx = dict(lda.print_topics(num_topics=NUM_TOPICS))
    dictionary = load_dictionary()

    lif_names = (name for name in os.listdir(lif) if name.endswith('.lif'))
    for fname in lif_names:
        print("{}".format(os.path.basename(fname)))
        source_path = os.path.join(lif, fname)
        target_path = os.path.join(top, fname)

        lif_in = Container(source_path).payload
        lif_out = LIF(json_object=lif_in.as_json())
        # The metadata can be recovered from the source LIF file, so it is
        # dropped here to save some space.
        lif_out.metadata = {}

        view = _create_view()
        lif_out.views = [view]
        view.annotations.append(markable_annotation(lif_in))

        tokens = prepare_text_for_lda(lif_in.text.value)
        bag_of_words = dictionary.doc2bow(tokens)

        # Each topic is a tuple of topic identifier and score; numbering
        # restarts at 1 for every file.
        document_topics = lda.get_document_topics(bag_of_words)
        for number, topic in enumerate(document_topics, start=1):
            topic_name = topic_idx.get(topic[0])
            lemmas = get_lemmas_from_topic_name(topic_name)
            annotation = topic_annotation(topic, number, lemmas)
            view.annotations.append(annotation)

        lif_out.write(fname=target_path, pretty=True)