Example #1
0
if __name__ == "__main__":
    MALT_PATH = os.path.abspath("./maltparser-1.9.0/")
    MALT_MODEL = os.path.abspath("./maltparser-1.9.0/engmalt.linear-1.7.mco")
    maltparser = malt.MaltParser(MALT_PATH, MALT_MODEL)

    SAVED_DEP_PARSES = 'headlines.conll'

    raw_headlines = load_headlines(sys.argv[1])
    cleaned_headlines = clean_headlines(raw_headlines)
    raw_to_clean = dict(zip(raw_headlines, cleaned_headlines))
    bodies = load_bodies(sys.argv[2])
    bodies_sents = split_sentences(bodies)

    # load saved dep parses if available to save runtime/repeated work
    if os.path.isfile(SAVED_DEP_PARSES):
        depgraphs = DependencyGraph.load(SAVED_DEP_PARSES,
                                         top_relation_label='null')
        depparses = dict(zip(uniq_headlines, depgraphs))
        print("loaded cached headline dep parses")
    else:
        depparses = dep_parse_sents([h.text for h in cleaned_headlines])
        with open(SAVED_DEP_PARSES, 'w') as p_out:
            for headline in depparses:
                p_out.write(depparses[headline].to_conll(10) + '\n')

    # find most relevant sentences in bodies for every headline
    print("calculating most related sentence(s) for each headline...")
    top_sents = dict()  # maps headline => list of top sentences
    to_be_parsed = set()
    for headline in raw_headlines:
        top_sents[headline] = most_sim_sents(headline.text,
                                             bodies_sents[headline.bodyid],