#!/usr/bin/env python
# encoding: utf-8

import sys

import textrank

## Stage 1:
##  * perform statistical parsing/tagging on a document in JSON format
##
## INPUTS: <stage0>
## OUTPUT: JSON format `ParsedGraf(id, sha1, graf)`

if __name__ == "__main__":
    # sole CLI argument: path to the Stage 0 JSON document
    path = sys.argv[1]

    # emit one JSON record per parsed paragraph
    for parsed_graf in textrank.parse_doc(textrank.json_iter(path)):
        print(textrank.pretty_print(parsed_graf._asdict()))
#!/usr/bin/env python
# encoding: utf-8

import sys

import textrank

# NOTE(review): DEBUG is never referenced below; kept for interface stability
DEBUG = False # True

## Stage 2:
##  * collect and normalize the key phrases from a parsed document
##
## INPUTS: <stage1>
## OUTPUT: JSON format `RankedLexeme(text, rank, ids, pos)`

if __name__ == "__main__":
    # sole CLI argument: path to the Stage 1 JSON document
    path = sys.argv[1]

    graph, ranks = textrank.text_rank(path)
    textrank.render_ranks(graph, ranks)

    # output as JSON
    for phrase in textrank.normalize_key_phrases(path, ranks):
        print(textrank.pretty_print(phrase._asdict()))
#!/usr/bin/env python
# encoding: utf-8

import sys

DEBUG = True # False

## "It scrubs its unreadable characters out of its text stream...
## Then it generates a JSON doc again."

# typographic ("smart") Unicode punctuation mapped to plain ASCII equivalents;
# ordered pairs rather than str.maketrans because some targets are multi-char
UNICODE_SUBSTITUTIONS = [
    ('“', '"'),
    ('”', '"'),
    ('…', '...'),
    ('–', '-'),
    ("’", "'"),
    ("`", "'"),
]


def scrub_line (line):
    """
    Strip surrounding whitespace from `line` and replace typographic
    Unicode punctuation with plain ASCII equivalents, so downstream
    parsing stages see a clean text stream. Returns the scrubbed string.
    """
    line = line.strip()

    for uni_char, ascii_char in UNICODE_SUBSTITUTIONS:
        line = line.replace(uni_char, ascii_char)

    return line


if __name__ == "__main__":
    # imported here so `scrub_line` stays importable without the
    # project-local textrank dependency
    import textrank

    # sole CLI argument: path to the raw text file
    path = sys.argv[1]

    # iterate the file lazily instead of materializing `readlines()`
    with open(path, 'r') as f:
        lines = [scrub_line(line) for line in f]

    # re-emit the scrubbed text as a single JSON doc
    # NOTE(review): "id" is hard-coded; presumably a placeholder doc id
    j = {
        "id": "777",
        "text": " ".join(lines),
    }

    print(textrank.pretty_print(j))
#!/usr/bin/env python
# encoding: utf-8

import sys

import textrank

## Stage 1:
##  * perform statistical parsing/tagging on a document in JSON format (Stage 0)
##  * output is in JSON format

if __name__ == "__main__":
    # sole CLI argument: path to the Stage 0 JSON document
    path = sys.argv[1]

    # NOTE: unlike the _asdict() variant of this stage, `graf` is
    # passed to pretty_print as-is
    for parsed in textrank.parse_doc(textrank.json_iter(path)):
        print(textrank.pretty_print(parsed))
#!/usr/bin/env python
# encoding: utf-8

import sys

import textrank

DEBUG = False # True

## Stage 2:
##  * summarize key phrases from a parsed document in JSON format (Stage 1)
##  * output is in JSON format

if __name__ == "__main__":
    # sole CLI argument: path to the Stage 1 JSON document
    path = sys.argv[1]

    graph, ranks, summary = textrank.text_rank(path)

    if DEBUG:
        # diagnostic output only: rendered graph plus tab-separated phrases
        textrank.render_ranks(graph, ranks)

        for p in textrank.normalize_keyphrases(summary):
            print("%0.4f\t%s" % (p.rank, p.text), p.ids)

    # output as JSON
    for norm in textrank.get_kernel(summary):
        print(textrank.pretty_print(norm._asdict()))