def test_add_describing_letters():
    """
    Check that every labelled line of the Shrek script carries a label.

    The script in 'test_files/shrek_script.txt' is run through the three
    label_lines steps and each resulting line must start with one of the
    prefixes 'M|', 'C|', 'D|', 'S|' or 'N|'.
    """
    labels = ('M|', 'C|', 'D|', 'S|', 'N|')

    with open('test_files/shrek_script.txt', 'r') as inp:
        full_text = inp.readlines()

    first_list = label_lines.detect_amount_of_spaces(full_text)
    second_list = label_lines.give_spaces_label(full_text, first_list)
    text = ''.join(label_lines.add_describing_letters(full_text, second_list))

    # Split the joined text back into lines: iterating the string itself
    # yields single characters, so no label prefix could ever be checked.
    for line in text.split('\n'):
        # An empty-string prefix matches every string, so it must not be
        # part of the accepted prefixes or the assertion can never fail.
        # NOTE(review): blank lines are assumed to be the only lines that
        # may stay unlabelled - confirm against add_describing_letters.
        assert not line.strip() or line.startswith(labels), \
            f'unlabelled line: {line!r}'
def main(argv):
    """
    Convert the script file named in argv[1] to JSON.

    The lines of the file are labelled with the label_lines helpers and
    handed to converter(); the result is dumped to 'script.json' (relative
    to the current working directory) with an indent of 4.
    """
    script_path = argv[1]
    with open(script_path, 'r') as script_file:
        lines = script_file.readlines()

    # Label every line based on its amount of leading spaces.
    space_counts = label_lines.detect_amount_of_spaces(lines)
    labels_by_spaces = label_lines.give_spaces_label(lines, space_counts)
    labelled_lines = label_lines.add_describing_letters(
        lines, labels_by_spaces)

    # Convert before opening the output, so a failing conversion never
    # leaves behind an empty or truncated script.json.
    as_dict = converter(labelled_lines)

    with open('script.json', 'w') as json_file:
        json.dump(as_dict, json_file, indent=4)
def main(argv):
    """
    Print the number of scene descriptions in a movie script.

    argv[1] is the file name/-path of the script file. Its lines are
    labelled with the functions of label_lines.py, joined into one string
    and passed to count_scenes(), whose result is printed.
    """
    script_path = argv[1]
    with open(script_path, 'r') as script_file:
        lines = script_file.readlines()

    # Label every line using the functions of program label_lines.py
    space_counts = label_lines.detect_amount_of_spaces(lines)
    labels_by_spaces = label_lines.give_spaces_label(lines, space_counts)
    labelled_lines = label_lines.add_describing_letters(
        lines, labels_by_spaces)

    labelled_text = "".join(labelled_lines)
    print(count_scenes(labelled_text))
def compare_script_to_subtitles(script, subtitles):
    '''
    Compares all the sentences of the subtitles to all the sentences of
    the script dialogue to find the best matches.

    For every subtitle entry the best-matching dialogue sentence is
    tracked. When a new best match has a similarity of at least 70%, the
    character of that script entry is added to the subtitle entry and the
    time of the subtitle entry is added to the best-matching script
    entry. Progress is printed as 'done/total' after each subtitle entry.

    Parameters:
        script(list): A list of the input script lines
        subtitles(str): A string of the subtitles file

    Returns:
        average_ratio(float): The mean best-match similarity over all
            subtitle entries in percentage (0.0 when there are no
            subtitle entries)
        script_dict(dict): The new script, with timestamps
        subtitles_dict(dict): The new subtitles, with characters
    '''
    match_threshold = 0.7

    subtitles_dict = OrderedDict(order_text(subtitles))

    # Remove the <tags> from the text
    for item in subtitles_dict:
        subtitles_dict[item]['text'] = \
            re.sub('<.*?>', '', subtitles_dict[item]['text'])

    # merge subtitles for complete lines
    # NOTE(review): the length is captured once; if process_subtitle
    # changes the number of entries this bound goes stale - verify.
    subtitle_dict_length = len(subtitles_dict)
    i = 1
    while i < subtitle_dict_length:
        subtitles_dict, i = process_subtitle(subtitles_dict, i)

    # process the script
    no_spaces = label_lines.detect_amount_of_spaces(script)
    dict_spaces_label = \
        label_lines.give_spaces_label(script, no_spaces)
    labelled_script = \
        label_lines.add_describing_letters(script, dict_spaces_label)
    script_dict = script_to_json.converter(labelled_script)

    # Tokenise every dialogue once up front. The dialogue text is never
    # modified below, so redoing this for each subtitle sentence would
    # only repeat identical work inside the hottest loop.
    dialogue_sentences = [
        (index, nltk.sent_tokenize(script_dict[index]['dialogue']))
        for index in script_dict
        if 'dialogue' in script_dict[index]
    ]

    # loop to compare the texts
    total_items = len(subtitles_dict)
    ratio_sum = 0
    for done, item in enumerate(subtitles_dict, start=1):
        entry = subtitles_dict[item]
        timestamp = ''
        highest_ratio = 0
        best_index = None
        for sub_sentence in entry['text']:
            character = ''
            for index, sentences in dialogue_sentences:
                for d_sentence in sentences:
                    ratio = SequenceMatcher(
                        None, sub_sentence, d_sentence).ratio()
                    if ratio > highest_ratio:
                        highest_ratio = ratio
                        best_index = index
                        # NOTE(review): the threshold is applied to a new
                        # best match only - confirm this nesting is the
                        # intended one.
                        if ratio >= match_threshold:
                            timestamp = entry['time']
                            character = script_dict[index]['character']
            if character != '':
                entry['character'] = character
        # timestamp is only set together with best_index, so the index
        # is always valid here.
        if timestamp != '':
            script_dict[best_index]['time'] = timestamp

        ratio_sum += highest_ratio
        print(f'{done}/{total_items}')

    for item in subtitles_dict:
        subtitles_dict[item]['text'] = ' '.join(subtitles_dict[item]['text'])

    # Without subtitle entries there is nothing to average; report 0.0
    # instead of dividing by zero.
    if total_items:
        average_ratio = (ratio_sum / total_items) * 100
    else:
        average_ratio = 0.0
    return average_ratio, script_dict, subtitles_dict
def compare_script_to_subtitles(script, subtitles):
    '''
    Compares all the sentences of the subtitles to all the sentences of
    the script dialogue to find the best matches.

    For every subtitle entry the best-matching dialogue sentence is
    tracked. When a new best match has a similarity of at least 70%, the
    character of that script entry is added to the subtitle entry and the
    time of the subtitle entry is added to the best-matching script
    entry. Progress is written to stderr as 'done/total' after each
    subtitle entry.

    Parameters:
        script(list): A list of the input script lines
        subtitles(str): A string of the subtitles file

    Returns:
        average_ratio(float): The mean best-match similarity over all
            subtitle entries in percentage (0.0 when there are no
            subtitle entries)
        script_dict(dict): The new script, with timestamps
        subtitles_dict(dict): The new subtitles, with characters
    '''
    match_threshold = 0.7

    subtitles_dict = OrderedDict(order_text(subtitles))

    # Remove the <tags> from the text
    for item in subtitles_dict:
        subtitles_dict[item]['text'] = \
            re.sub('<.*?>', '', subtitles_dict[item]['text'])

    # merge subtitles for complete lines
    # NOTE(review): the length is captured once; if process_subtitle
    # changes the number of entries this bound goes stale - verify.
    subtitle_dict_length = len(subtitles_dict)
    i = 1
    while i < subtitle_dict_length:
        subtitles_dict, i = process_subtitle(subtitles_dict, i)

    # Label the script lines and convert them to the script dictionary.
    no_spaces = label_lines.detect_amount_of_spaces(script)
    dict_spaces_label = \
        label_lines.give_spaces_label(script, no_spaces)
    labelled_script = \
        label_lines.add_describing_letters(script, dict_spaces_label)
    script_dict = script_to_json.converter(labelled_script)

    # Tokenise every dialogue once up front. The dialogue text is never
    # modified below, so redoing this for each subtitle sentence would
    # only repeat identical work inside the hottest loop.
    dialogue_sentences = [
        (index, nltk.sent_tokenize(script_dict[index]['dialogue']))
        for index in script_dict
        if 'dialogue' in script_dict[index]
    ]

    total_items = len(subtitles_dict)
    ratio_sum = 0
    for done, item in enumerate(subtitles_dict, start=1):
        entry = subtitles_dict[item]
        timestamp = ''
        highest_ratio = 0
        best_index = None
        for sub_sentence in entry['text']:
            character = ''
            for index, sentences in dialogue_sentences:
                for d_sentence in sentences:
                    ratio = SequenceMatcher(
                        None, sub_sentence, d_sentence).ratio()
                    if ratio > highest_ratio:
                        highest_ratio = ratio
                        best_index = index
                        # NOTE(review): the threshold is applied to a new
                        # best match only - confirm this nesting is the
                        # intended one.
                        if ratio >= match_threshold:
                            timestamp = entry['time']
                            character = script_dict[index]['character']
            if character != '':
                entry['character'] = character
        # timestamp is only set together with best_index, so the index
        # is always valid here.
        if timestamp != '':
            # NOTE(review): this writes a blank line to stdout for every
            # timed match while progress goes to stderr; it looks like
            # leftover debug output - confirm before removing.
            print()
            script_dict[best_index]['time'] = timestamp

        ratio_sum += highest_ratio
        print(f'{done}/{total_items}', file=sys.stderr)

    for item in subtitles_dict:
        subtitles_dict[item]['text'] = ' '.join(subtitles_dict[item]['text'])

    # Without subtitle entries there is nothing to average; report 0.0
    # instead of dividing by zero.
    if total_items:
        average_ratio = (ratio_sum / total_items) * 100
    else:
        average_ratio = 0.0
    return average_ratio, script_dict, subtitles_dict