if __name__ == "__main__": MALT_PATH = os.path.abspath("./maltparser-1.9.0/") MALT_MODEL = os.path.abspath("./maltparser-1.9.0/engmalt.linear-1.7.mco") maltparser = malt.MaltParser(MALT_PATH, MALT_MODEL) SAVED_DEP_PARSES = 'headlines.conll' raw_headlines = load_headlines(sys.argv[1]) cleaned_headlines = clean_headlines(raw_headlines) raw_to_clean = dict(zip(raw_headlines, cleaned_headlines)) bodies = load_bodies(sys.argv[2]) bodies_sents = split_sentences(bodies) # load saved dep parses if available to save runtime/repeated work if os.path.isfile(SAVED_DEP_PARSES): depgraphs = DependencyGraph.load(SAVED_DEP_PARSES, top_relation_label='null') depparses = dict(zip(uniq_headlines, depgraphs)) print("loaded cached headline dep parses") else: depparses = dep_parse_sents([h.text for h in cleaned_headlines]) with open(SAVED_DEP_PARSES, 'w') as p_out: for headline in depparses: p_out.write(depparses[headline].to_conll(10) + '\n') # find most relevant sentences in bodies for every headline print("calculating most related sentence(s) for each headline...") top_sents = dict() # maps headline => list of top sentences to_be_parsed = set() for headline in raw_headlines: top_sents[headline] = most_sim_sents(headline.text, bodies_sents[headline.bodyid],