def test_add_describing_letters():
    """
    Checks that every non-blank line of the labelled Shrek test script
    starts with one of the label prefixes M|, C|, D|, S| or N|.
    """

    with open('test_files/shrek_script.txt', 'r') as inp:
        full_text = inp.readlines()

    first_list = label_lines.detect_amount_of_spaces(full_text)
    second_list = label_lines.give_spaces_label(full_text, first_list)
    text = ''.join(label_lines.add_describing_letters(full_text, second_list))

    # Walk over the lines of the labelled text, not over the single
    # characters of the joined string. Blank lines are allowed explicitly:
    # an empty prefix in the tuple would make startswith() succeed for
    # every input and turn the assertion into a no-op.
    for line in text.splitlines():
        assert line.strip() == '' or \
            line.startswith(('M|', 'C|', 'D|', 'S|', 'N|'))
def main(argv):
    """
    Reads the script file named by argv[1], runs it through the
    label_lines functions and converter, and writes the resulting
    object to 'script.json' with an indentation of 4.
    """

    script_path = argv[1]

    with open(script_path, 'r') as script_file:
        script_lines = script_file.readlines()

    # label every line of the script
    space_counts = label_lines.detect_amount_of_spaces(script_lines)
    label_by_spaces = label_lines.give_spaces_label(
        script_lines, space_counts)
    labelled_lines = label_lines.add_describing_letters(
        script_lines, label_by_spaces)

    # convert before the output file is opened, so a failing conversion
    # does not leave a truncated script.json behind
    converted = converter(labelled_lines)

    with open('script.json', 'w') as json_file:
        json.dump(converted, json_file, indent=4)
Example #3
0
def main(argv):
    """
    Reads the script file whose name/path is given as argv[1],
    labels its lines with the functions of label_lines.py, and
    prints what count_scenes returns for the labelled text
    (the number of scene descriptions in the movie).
    """

    script_path = argv[1]

    with open(script_path, 'r') as script_file:
        script_lines = script_file.readlines()

    # run the labelling steps of program label_lines.py
    space_counts = label_lines.detect_amount_of_spaces(script_lines)
    label_by_spaces = label_lines.give_spaces_label(
        script_lines, space_counts)
    labelled_lines = label_lines.add_describing_letters(
        script_lines, label_by_spaces)

    labelled_text = "".join(labelled_lines)
    scene_count = count_scenes(labelled_text)
    print(scene_count)
def compare_script_to_subtitles(script, subtitles):
    '''
    Compares all the sentences of the subtitles to all the sentences
    of the script to find the best matches. Will add the character to
    the subtitles and the time to the script if the match is at least
    70%. Also calculates the total similarity of the dialogue.

    Parameters:
        script(list): A list of the input script lines
        subtitles(str): A string of the subtitles file

    Returns:
        average_ratio(float): The similarity of the dialogue in
            percentage (0.0 when there are no subtitles to compare)
        script_dict(dict): The new script, with timestamps
        subtitles_dict(dict): The new subtitles, with characters
    '''

    subtitles_dict = OrderedDict(order_text(subtitles))

    # Remove the <tags> from the text
    for item in subtitles_dict:
        subtitles_dict[item]['text'] = \
            re.sub('<.*?>', '', subtitles_dict[item]['text'])

    # merge subtitles for complete lines
    subtitle_dict_length = len(subtitles_dict)
    i = 1
    while i < subtitle_dict_length:
        subtitles_dict, i = process_subtitle(subtitles_dict, i)

    # process the script
    no_spaces = label_lines.detect_amount_of_spaces(script)

    dict_spaces_label = \
        label_lines.give_spaces_label(script, no_spaces)

    labelled_script = \
        label_lines.add_describing_letters(script, dict_spaces_label)

    script_dict = script_to_json.converter(labelled_script)

    # Split every dialogue into sentences once. The dialogue text is never
    # modified below, so re-tokenizing it for each subtitle sentence only
    # repeated the same work.
    dialogue_sentences = [
        (index, nltk.sent_tokenize(script_dict[index]['dialogue']))
        for index in script_dict
        if 'dialogue' in script_dict[index]
    ]

    # loop to compare the texts
    total_subtitles = len(subtitles_dict)
    ratio_sum = 0

    for done, item in enumerate(subtitles_dict, start=1):

        # time, best ratio and best script index are tracked per subtitle
        # entry; the character is tracked per subtitle sentence
        time = ''
        highest_ratio = 0
        highest_D_match = None

        # assumes process_subtitle left 'text' as a list of sentences
        # (it is joined with ' ' further down) -- TODO confirm
        for sub_sentence in subtitles_dict[item]['text']:

            character = ''

            for index, dialogue_text in dialogue_sentences:

                for d_sentence in dialogue_text:

                    ratio = SequenceMatcher(None, sub_sentence,
                                            d_sentence).ratio()

                    if ratio > highest_ratio:

                        highest_ratio = ratio
                        highest_D_match = index

                        # only a new best match of at least 70% links
                        # the subtitle and the script line
                        if ratio >= 0.7:
                            time = subtitles_dict[item]['time']
                            character = script_dict[index]['character']

            if character != '':
                subtitles_dict[item]['character'] = character

            # time is only set together with highest_D_match, so the
            # index is valid whenever this branch runs
            if time != '':
                script_dict[highest_D_match]['time'] = time

        ratio_sum += highest_ratio

        print(f'{done}/{total_subtitles}')

    for item in subtitles_dict:
        subtitles_dict[item]['text'] = ' '.join(subtitles_dict[item]['text'])

    # Without any subtitle entries there is nothing to average; return 0.0
    # instead of dividing by zero.
    if total_subtitles:
        average_ratio = (ratio_sum / total_subtitles) * 100
    else:
        average_ratio = 0.0

    return average_ratio, script_dict, subtitles_dict
def compare_script_to_subtitles(script, subtitles):
    '''
    Matches every subtitle sentence against every dialogue sentence of
    the script. When a new best match for a subtitle entry reaches a
    ratio of at least 0.7, the character of the script line is stored
    on the subtitle entry and the subtitle time is stored on the script
    line. Progress is reported on stderr as "done/total".

    Parameters:
        script: The script lines, passed on to the label_lines functions
        subtitles: The subtitles input, passed on to order_text

    Returns:
        average_ratio(float): Mean of the best ratio per subtitle entry,
            in percent (0.0 when there are no subtitle entries)
        script_dict: The converted script, with 'time' added on matches
        subtitles_dict(OrderedDict): The subtitles, with 'character'
            added on matches and 'text' joined into one string
    '''

    subtitles_dict = OrderedDict(order_text(subtitles))

    # Remove the <tags> from the text
    for item in subtitles_dict:
        subtitles_dict[item]['text'] = \
            re.sub('<.*?>', '', subtitles_dict[item]['text'])

    # merge subtitles for complete lines
    subtitle_dict_length = len(subtitles_dict)
    i = 1
    while i < subtitle_dict_length:
        subtitles_dict, i = process_subtitle(subtitles_dict, i)

    # label the script lines and convert them
    no_spaces = label_lines.detect_amount_of_spaces(script)

    dict_spaces_label = \
        label_lines.give_spaces_label(script, no_spaces)

    labelled_script = \
        label_lines.add_describing_letters(script, dict_spaces_label)

    script_dict = script_to_json.converter(labelled_script)

    # Split every dialogue into sentences once. The dialogue text is never
    # modified below, so re-tokenizing it for each subtitle sentence only
    # repeated the same work.
    dialogue_sentences = [
        (index, nltk.sent_tokenize(script_dict[index]['dialogue']))
        for index in script_dict
        if 'dialogue' in script_dict[index]
    ]

    total_subtitles = len(subtitles_dict)
    ratio_sum = 0

    for done, item in enumerate(subtitles_dict, start=1):

        # time, best ratio and best script index are tracked per subtitle
        # entry; the character is tracked per subtitle sentence
        time = ''
        highest_ratio = 0
        highest_D_match = None

        # assumes process_subtitle left 'text' as a list of sentences
        # (it is joined with ' ' further down) -- TODO confirm
        for sub_sentence in subtitles_dict[item]['text']:

            character = ''

            for index, dialogue_text in dialogue_sentences:

                for d_sentence in dialogue_text:

                    ratio = SequenceMatcher(None, sub_sentence,
                                            d_sentence).ratio()

                    if ratio > highest_ratio:

                        highest_ratio = ratio
                        highest_D_match = index

                        # only a new best match of at least 0.7 links
                        # the subtitle and the script line
                        if ratio >= 0.7:
                            time = subtitles_dict[item]['time']
                            character = script_dict[index]['character']

            if character != '':
                subtitles_dict[item]['character'] = character

            # time is only set together with highest_D_match, so the
            # index is valid whenever this branch runs
            if time != '':
                # NOTE(review): this bare print() writes a blank line to
                # stdout for every matched sentence while progress goes to
                # stderr; looks like leftover debugging -- confirm before
                # removing
                print()
                script_dict[highest_D_match]['time'] = time

        ratio_sum += highest_ratio

        print(f'{done}/{total_subtitles}', file=sys.stderr)

    for item in subtitles_dict:
        subtitles_dict[item]['text'] = ' '.join(subtitles_dict[item]['text'])

    # Without any subtitle entries there is nothing to average; return 0.0
    # instead of dividing by zero.
    if total_subtitles:
        average_ratio = (ratio_sum / total_subtitles) * 100
    else:
        average_ratio = 0.0

    return average_ratio, script_dict, subtitles_dict