def get_FRIENDS_summary(): read_path = "html/FRIENDS/summary/summary.json" save_dir = "data/FRIENDS/summary/" save_name = "summary.json" fh = open(read_path, "r") json_str = fh.read() fh.close() json_obj = json.loads(json_str) html = json_obj["summary"] extractor = DataExtractor("html") extractor.read(html) h3_tags = extractor.findAllTags("h3") h3_contents = extractor.extractTextFromTagList(h3_tags) h3_contents.append("END") dl_tags = extractor.findAllTags("dl") dl_contents = " ".join(extractor.extractTextFromTagList(dl_tags)) dl_contents = dl_contents + " END" res = {} for i in range(len(h3_contents) - 1): begin_txt = h3_contents[i] end_txt = h3_contents[i + 1] begin_idx = dl_contents.index(begin_txt) end_idx = dl_contents.index(end_txt) summary = dl_contents[begin_idx + len(begin_txt) + 1:end_idx] try: strip_idx = summary.index(" ") summary = summary[:strip_idx] except Exception: pass seq_num = begin_txt[:begin_txt.index(" ")] dot_idx = seq_num.index(".") s_num = seq_num[:dot_idx] e_num = seq_num[dot_idx + 1:] seq_num = s_num.zfill(2) + e_num.zfill(2) res[seq_num] = summary print("Serializing...") json_str = json.dumps(res) print("Done.") print("Saving...") extractor.save(json_str, save_dir + save_name) print("Done.")
def get_FRIENDS_transcript(): read_path = "html/FRIENDS/transcript/transcript.json" save_dir = "data/FRIENDS/transcript/" save_name = "transcript.json" fh = open(read_path, "r") json_str = fh.read() fh.close() json_obj = json.loads(json_str) for key in tqdm(list(json_obj.keys())): html = json_obj[key] extractor = DataExtractor("html") extractor.read(html) texts = extractor.extractText() try: begin_idx = texts.index("[Scene") texts = texts[begin_idx:] end_idx = len(texts) - texts[::-1].lower().index("dne") texts = texts[:end_idx] json_obj[key] = texts except Exception as e: print("Ignore transcripts for {}".format(key)) print("Serializing...") json_str = json.dumps(json_obj) print("Done.") print("Saving...") extractor.save(json_str, save_dir + save_name) print("Done.")