import os

# LIF and Container are assumed to come from the lif package; ensure_directory
# and the _get_* helpers are project-internal functions defined elsewhere in
# the pipeline.
from lif import LIF, Container


def generate_metadata(data_dir, fname):
    subdir = os.path.split(fname)[0]
    lif_file = os.path.join(data_dir, 'lif', subdir, "tesseract-300dpi-20p.lif")
    ner_file = os.path.join(data_dir, 'ner', subdir, "%s.ner.lif" % subdir)
    mta_file = os.path.join(data_dir, 'mta', subdir, "%s.mta.lif" % subdir)
    ensure_directory(mta_file)
    lif = Container(lif_file).payload
    lif_ner = Container(ner_file).payload
    lif_mta = LIF(json_object=lif.as_json())
    # drop the text and the views to save space, but keep a pointer to the
    # source file
    lif_mta.text.value = None
    lif_mta.text.fname = lif_file
    lif_mta.views = []
    page_view = lif.get_view("pages")
    ner_view = lif_ner.get_view('v2')
    window = _get_window(page_view)
    # set the metadata on the copy that is written out, not on the source LIF
    lif_mta.metadata["authors"] = _get_authors(lif, ner_view, window)
    lif_mta.metadata["year"] = _get_year(ner_view, window)
    lif_mta.write(fname=mta_file, pretty=True)
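# A minimal usage sketch for generate_metadata(), not part of the pipeline: it
# assumes data_dir holds parallel lif/, ner/ and mta/ trees in which each
# document has its own subdirectory, as the path logic above implies. The
# helper name and directory layout are hypothetical.
def generate_all_metadata(data_dir):
    lif_dir = os.path.join(data_dir, 'lif')
    for subdir in sorted(os.listdir(lif_dir)):
        generate_metadata(
            data_dir, os.path.join(subdir, 'tesseract-300dpi-20p.lif'))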
def generate_topics_for_file(data_dir, fname, lda, topic_idx, dictionary):
    topic_id = 0
    fname_in = os.path.join(data_dir, 'lif', fname)
    fname_out = os.path.join(data_dir, 'top', fname[:-5] + '.lif')
    ensure_directory(fname_out)
    try:
        lif_in = LIF(fname_in)
    except FileNotFoundError:
        print("Warning: file '%s' does not exist" % fname_in)
        return
    lif_out = LIF(json_object=lif_in.as_json())
    # the following three are just to save some space, we get them from the
    # lif file anyway
    lif_out.text.value = None
    lif_out.text.source = fname_in
    lif_out.metadata = {}
    topics_view = _create_view()
    lif_out.views = [topics_view]
    topics_view.annotations.append(markable_annotation(lif_in))
    doc = prepare_text_for_lda(lif_in.text.value)
    bow = dictionary.doc2bow(doc)
    for topic in lda.get_document_topics(bow):
        # topics are (topic_id, score) tuples
        topic_id += 1
        lemmas = get_lemmas_from_topic_name(topic_idx.get(topic[0]))
        topics_view.annotations.append(
            topic_annotation(topic, topic_id, lemmas))
    lif_out.write(fname=fname_out, pretty=True)
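# A sketch of how the lda, topic_idx and dictionary arguments could be built
# with gensim, mirroring what generate_topics() below does with load_model()
# and load_dictionary(). The helper name and file arguments are hypothetical;
# num_topics must match the value the model was trained with.
from gensim import corpora
from gensim.models import LdaModel

def load_topic_resources(model_file, dictionary_file, num_topics):
    lda = LdaModel.load(model_file)
    dictionary = corpora.Dictionary.load(dictionary_file)
    # print_topics() returns (topic_id, "0.031*\"word\" + ...") pairs, which
    # is the shape that topic_idx.get(topic[0]) relies on above
    topic_idx = {topic_id: topic
                 for topic_id, topic in lda.print_topics(num_topics=num_topics)}
    return lda, topic_idx, dictionary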
def lookup_technologies(data_dir, fname):
    subdir = os.path.split(fname)[0]
    pos_file = os.path.join(data_dir, 'pos', subdir, "%s.pos.lif" % subdir)
    tex_file = os.path.join(data_dir, 'tex', subdir, "%s.lup.lif" % subdir)
    ensure_directory(tex_file)
    lif = Container(pos_file).payload
    lif_tex = LIF(json_object=lif.as_json())
    pos_view = lif.get_view('v2')
    tex_view = create_view('tex', 'Technology', 'dtriac-pipeline:lookup.py')
    lif_tex.views = [tex_view]
    tokens = [a for a in pos_view.annotations if a.type.endswith('Token')]
    _lookup_technologies_in_tokens(lif, tokens, tex_view)
    lif_tex.write(fname=tex_file, pretty=True)
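# A guess at the shape of the private lookup step: match each Token's surface
# string against a list of known technology terms. TECHNOLOGIES and the helper
# name are hypothetical, and the real _lookup_technologies_in_tokens() adds
# Technology annotations to tex_view rather than returning tuples.
TECHNOLOGIES = {'radar', 'lidar', 'neural network'}

def _find_technology_spans(lif, tokens):
    text = lif.text.value
    return [(t.start, t.end, text[t.start:t.end])
            for t in tokens
            if text[t.start:t.end].lower() in TECHNOLOGIES]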
def generate_sentence_types(data_dir, fname):
    subdir = os.path.split(fname)[0]
    lif_file = os.path.join(data_dir, 'lif', subdir, "tesseract-300dpi-20p.lif")
    spl_file = os.path.join(data_dir, 'spl', subdir, "%s.spl.lif" % subdir)
    sen_file = os.path.join(data_dir, 'sen', subdir, "%s.sen.lif" % subdir)
    ensure_directory(sen_file)
    if DEBUG:
        SENTS.write(">>> %s\n>>> %s\n>>> %s\n\n" % ('-' * 100, fname, '-' * 100))
    lif = Container(lif_file).payload
    lif_spl = Container(spl_file).payload
    lif_sen = LIF(json_object=lif.as_json())
    spl_sentences_view = lif_spl.get_view('v2')
    new_sentences_view = _create_view()
    lif_sen.views = [new_sentences_view]
    good_sentences = 0
    bad_sentences = 0
    for anno in spl_sentences_view.annotations:
        if anno.type.endswith('Sentence'):
            sc = SentenceClassifier(lif, anno, WORDS)
            if sc.is_crap():
                if DEBUG:
                    SENTS.write("---- %f\n%s\n\n" % (sc.ratio, repr(sc.text)))
                anno.features['type'] = 'crap'
                bad_sentences += 1
            else:
                if DEBUG:
                    SENTS.write("++++ %f\n%s\n\n" % (sc.ratio, repr(sc.text)))
                anno.features['type'] = 'normal'
                good_sentences += 1
            new_sentences_view.annotations.append(anno)
    if DEBUG:
        SENTS.write("\nTOTAL GOOD = {:d}\nTOTAL BAD = {:d}\n\n\n".format(
            good_sentences, bad_sentences))
    lif_sen.write(fname=sen_file, pretty=True)
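# SentenceClassifier is project code that is not shown here. This sketch only
# illustrates the interface the call sites above rely on (lif, annotation and
# word-list arguments; text, ratio and is_crap()); the known-word heuristic
# and the 0.5 threshold are assumptions, not the pipeline's actual logic.
class SentenceClassifierSketch(object):

    def __init__(self, lif, annotation, words, threshold=0.5):
        self.text = lif.text.value[annotation.start:annotation.end]
        tokens = self.text.split()
        known = [t for t in tokens if t.lower() in words]
        self.ratio = len(known) / len(tokens) if tokens else 0.0
        self.threshold = threshold

    def is_crap(self):
        # sentences dominated by OCR noise contain few dictionary words
        return self.ratio < self.threshold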
def generate_topics_for_file(data_dir, fname, lda, topic_idx, dictionary):
    topic_id = 0
    fname_in = os.path.join(data_dir, 'lif', fname[:-4] + '.lif')
    fname_out = os.path.join(data_dir, 'top', fname[:-4] + '.lif')
    ensure_directory(fname_out)
    lif_in = Container(fname_in).payload
    lif_out = LIF(json_object=lif_in.as_json())
    # just to save some space, we get them from the lif file anyway
    lif_out.metadata = {}
    topics_view = _create_view()
    lif_out.views = [topics_view]
    topics_view.annotations.append(markable_annotation(lif_in))
    doc = prepare_text_for_lda(lif_in.text.value)
    bow = dictionary.doc2bow(doc)
    for topic in lda.get_document_topics(bow):
        # topics are (topic_id, score) tuples
        topic_id += 1
        lemmas = get_lemmas_from_topic_name(topic_idx.get(topic[0]))
        topics_view.annotations.append(
            topic_annotation(topic, topic_id, lemmas))
    lif_out.write(fname=fname_out, pretty=True)
def generate_sentence_types(ttk, sen, words):
    for fname in os.listdir(ttk):
        if not fname.endswith('.lif'):
            continue
        print("{} ... ".format(os.path.basename(fname)))
        if DEBUG:
            GOOD.write(">>> %s\n>>> %s\n>>> %s\n\n" % ('-' * 100, fname, '-' * 100))
            BAD.write(">>> %s\n>>> %s\n>>> %s\n\n" % ('-' * 100, fname, '-' * 100))
        fname_in = os.path.join(ttk, fname)
        fname_out = os.path.join(sen, fname)
        lif_in = LIF(fname_in)
        lif_out = LIF(json_object=lif_in.as_json())
        sentences_view = _create_view()
        lif_out.views = [sentences_view]
        good_sentences = 0
        bad_sentences = 0
        view = lif_in.get_view('v1')
        for anno in view.annotations:
            if anno.type.endswith('Sentence'):
                sc = SentenceClassifier(lif_in, anno, words)
                if sc.is_crap():
                    if DEBUG:
                        BAD.write(">>> %f\n%s\n\n" % (sc.ratio, sc.text))
                    anno.features['type'] = 'crap'
                    bad_sentences += 1
                else:
                    if DEBUG:
                        GOOD.write(">>> %f\n%s\n\n" % (sc.ratio, sc.text))
                    anno.features['type'] = 'normal'
                    good_sentences += 1
                sentences_view.annotations.append(anno)
        if DEBUG:
            print(" (good={:d} bad={:d})".format(good_sentences, bad_sentences))
        lif_out.write(fname=fname_out, pretty=True)
def generate_topics(lif, top):
    lda = load_model()
    topic_idx = {topic_id: topic
                 for topic_id, topic in lda.print_topics(num_topics=NUM_TOPICS)}
    dictionary = load_dictionary()
    for fname in os.listdir(lif):
        if not fname.endswith('.lif'):
            continue
        topic_id = 0
        print("{}".format(os.path.basename(fname)))
        fname_in = os.path.join(lif, fname)
        fname_out = os.path.join(top, fname)
        lif_in = Container(fname_in).payload
        lif_out = LIF(json_object=lif_in.as_json())
        # just to save some space, we get them from the lif file anyway
        lif_out.metadata = {}
        topics_view = _create_view()
        lif_out.views = [topics_view]
        topics_view.annotations.append(markable_annotation(lif_in))
        doc = prepare_text_for_lda(lif_in.text.value)
        bow = dictionary.doc2bow(doc)
        for topic in lda.get_document_topics(bow):
            # topics are (topic_id, score) tuples
            topic_id += 1
            lemmas = get_lemmas_from_topic_name(topic_idx.get(topic[0]))
            topics_view.annotations.append(
                topic_annotation(topic, topic_id, lemmas))
        lif_out.write(fname=fname_out, pretty=True)
def wikify_lif(in_f, wikifier):
    in_lif = Container(in_f).payload
    out_lif = LIF(json_object=in_lif.as_json())
    out_lif.views = []
    out_lif.metadata["wikified_es"] = wikifier.wikify(out_lif.text.value)
    return out_lif
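# Hypothetical batch driver for wikify_lif(); the wikifier instance is assumed
# to be created elsewhere, and the helper name and directory arguments are
# made up.
def wikify_directory(in_dir, out_dir, wikifier):
    for fname in os.listdir(in_dir):
        if not fname.endswith('.lif'):
            continue
        out_lif = wikify_lif(os.path.join(in_dir, fname), wikifier)
        out_lif.write(fname=os.path.join(out_dir, fname), pretty=True)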