def generate_topics_for_file(data_dir, fname, lda, topic_idx, dictionary):
    """Write a topics-only LIF file for a single document.

    The input is read from ``<data_dir>/lif/<fname>``. Its text is prepared
    with ``prepare_text_for_lda``, turned into a bag of words with
    ``dictionary.doc2bow`` and handed to ``lda.get_document_topics``. The
    output LIF holds one view containing a markable annotation followed by
    one topic annotation per document topic, and is written below
    ``<data_dir>/top``.

    If loading the input raises ``FileNotFoundError`` a warning is printed
    and the function returns without writing the output file.
    """
    source_path = os.path.join(data_dir, 'lif', fname)
    # NOTE(review): the last five characters of fname are replaced by '.lif';
    # presumably fname carries a five-character suffix -- confirm with callers.
    target_path = os.path.join(data_dir, 'top', fname[:-5] + '.lif')
    ensure_directory(target_path)

    try:
        lif_in = LIF(source_path)
    except FileNotFoundError:
        print("Warning: file '%s' does not exist" % source_path)
        return

    lif_out = LIF(json_object=lif_in.as_json())
    # Text and metadata are dropped from the output purely to save space; they
    # remain available from the source file recorded in text.source.
    lif_out.text.value = None
    lif_out.text.source = source_path
    lif_out.metadata = {}

    view = _create_view()
    lif_out.views = [view]
    view.annotations.append(markable_annotation(lif_in))

    tokens = prepare_text_for_lda(lif_in.text.value)
    bag_of_words = dictionary.doc2bow(tokens)

    # Each document topic is a tuple of topic id and score; the annotations
    # themselves are numbered consecutively starting at 1.
    document_topics = lda.get_document_topics(bag_of_words)
    for number, doc_topic in enumerate(document_topics, start=1):
        topic_name = topic_idx.get(doc_topic[0])
        lemmas = get_lemmas_from_topic_name(topic_name)
        view.annotations.append(topic_annotation(doc_topic, number, lemmas))

    lif_out.write(fname=target_path, pretty=True)
def generate_sentence_types(ttk, sen, words):
    """Tag the sentences of every LIF file in a directory as crap or normal.

    For each file in ``ttk`` whose name ends with ``.lif``, every annotation
    in view ``v1`` whose type ends with ``Sentence`` is passed to
    ``SentenceClassifier`` together with ``words``. The annotation gets a
    ``type`` feature of ``'crap'`` or ``'normal'`` and is added to a single
    new view, which becomes the only view of the LIF object written to the
    file of the same name in ``sen``.

    When the module-level ``DEBUG`` flag is true, each classified sentence is
    also written (with its ratio) to the ``GOOD`` or ``BAD`` stream and the
    per-file counts are printed.

    Args:
        ttk: directory holding the input LIF files.
        sen: directory the output LIF files are written to.
        words: passed through unchanged to ``SentenceClassifier``.
    """
    for fname in os.listdir(ttk):
        if not fname.endswith('.lif'):
            continue

        print("{} ... ".format(os.path.basename(fname)))
        if DEBUG:
            # Same file header goes to both debug streams.
            header = ">>> %s\n>>> %s\n>>> %s\n\n" % ('-' * 100, fname, '-' * 100)
            GOOD.write(header)
            BAD.write(header)

        lif_in = LIF(os.path.join(ttk, fname))
        lif_out = LIF(json_object=lif_in.as_json())
        sentences_view = _create_view()
        lif_out.views = [sentences_view]

        counts = {'normal': 0, 'crap': 0}
        for anno in lif_in.get_view('v1').annotations:
            if not anno.type.endswith('Sentence'):
                continue
            classifier = SentenceClassifier(lif_in, anno, words)
            is_crap = classifier.is_crap()
            label = 'crap' if is_crap else 'normal'
            if DEBUG:
                # GOOD and BAD are only touched in debug mode.
                stream = BAD if is_crap else GOOD
                stream.write(">>> %f\n%s\n\n" % (classifier.ratio, classifier.text))
            anno.features['type'] = label
            counts[label] += 1
            sentences_view.annotations.append(anno)

        if DEBUG:
            print(" (good={:d} bad={:d})".format(counts['normal'], counts['crap']))
        lif_out.write(fname=os.path.join(sen, fname), pretty=True)