Example #1
from collections import OrderedDict
import itertools

import utils  # project-local helpers (iterate_after_dropping)


def build_items_to_download_map(subs_list_file, lines_to_skip, index):
    # Map each IMDb id to the fields (selected by `index`) of the '#'-separated lines that follow it.
    movies_map = OrderedDict()
    with open(subs_list_file, 'r') as subs_list_f:
        for line in map(lambda l: l.rstrip('\r\n'), utils.iterate_after_dropping(subs_list_f, lines_to_skip)):
            imdb_id, s_num = line.split(':')
            item_iter = map(lambda _: subs_list_f.readline().split('#')[index], itertools.repeat(None, int(s_num)))
            movies_map[imdb_id] = [item.rstrip('\r\n') for item in item_iter]
    return movies_map
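
A minimal usage sketch, assuming a subs list file named 'subs_list.txt' (hypothetical) whose entries look like "imdb_id:count" followed by that many '#'-separated lines, keeping the first field of each:

movies = build_items_to_download_map('subs_list.txt', lines_to_skip=0, index=0)
for imdb_id, items in movies.items():
    print(imdb_id, items)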
Example #2
import os

import utils  # project-local helpers (iterate_after_dropping)


def extract_imdb_data_from_ids(ids_file, data_folder, lines_to_skip=0):
    # For every IMDb id in ids_file, fetch plots and synopsis via get_movie_info
    # and write them to <data_folder>/<movie_id>.txt.
    os.makedirs(data_folder, exist_ok=True)
    with open(ids_file, "r") as f:
        for movie_id in map(lambda l: l.rstrip("\r\n"), utils.iterate_after_dropping(f, lines_to_skip)):
            lines_to_skip += 1
            print("Processing movie {0} at line {1}...".format(movie_id, lines_to_skip))
            plot_summaries_text, synopsis_text = get_movie_info(movie_id)
            print("Found: {0} plot(s), {1} synopsis/es".format(len(plot_summaries_text), int(bool(synopsis_text))))
            with open(os.path.join(data_folder, movie_id + ".txt"), "w") as meta_f:
                meta_f.write("\n".join(plot_summaries_text + [synopsis_text]))
    print("Plots/synopses extraction completed.")
Example #3
import glob
import itertools
import os
import re

from bs4 import BeautifulSoup

import utils  # project-local helpers (iterate_after_dropping)


def merge_subs(subs_folder, subs_folder_merged):
    # Concatenate the .srt files of each movie folder into one plain-text file,
    # skipping cue numbers/timestamps and stripping HTML-style formatting tags.
    os.makedirs(subs_folder_merged, exist_ok=True)
    for folder in os.listdir(subs_folder):
        print('Processing folder: {0}'.format(folder))
        with open(subs_folder_merged + '/' + folder + '.txt', 'w', encoding='ISO-8859-1') as out:
            for sub in glob.glob(subs_folder + '/' + folder + '/*.srt'):
                with open(sub, 'r', encoding='ISO-8859-1') as in_file:
                    for line in map(lambda l: l.rstrip('\r\n'), utils.iterate_after_dropping(in_file, 3)):
                        if line == '':
                            # A blank line ends a subtitle block: skip the next cue number and timestamp.
                            for _ in itertools.repeat(None, 2):
                                in_file.readline()
                        elif re.search('<(.*?)>', line):
                            # Drop formatting tags (e.g. <i>...</i>) before writing the text.
                            bs = BeautifulSoup(line, 'lxml')
                            out.write(bs.get_text() + ' ')
                        else:
                            out.write(line + ' ')
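
A usage sketch under the assumption that a hypothetical 'subs' folder contains one sub-folder of .srt files per movie:

merge_subs('subs', 'subs_merged')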
Example #4
import utils  # project-local helpers (iterate_after_dropping, ScriBa client)


def find_subs_links_from_ids_list(source_file, subs_links_file, movies_with_subs_list_file, lines_to_skip=0,
                                  ids_per_request=10, lang='eng', os_client=utils.ScriBa.get_client()):
    # Read IMDb ids (first comma-separated field of each line) from source_file and
    # query OpenSubtitles in batches of ids_per_request ids via find_matching_subs,
    # writing download links and matched movie ids to the two output files.
    with open(source_file, 'r') as source_f, open(subs_links_file, 'w') as subs_links_f, \
            open(movies_with_subs_list_file, 'w') as movies_with_subs_list_f:
        pending_ids = []
        movies_found = 0
        lines_iter = enumerate(map(lambda l: l.rstrip('\r\n'), utils.iterate_after_dropping(source_f, lines_to_skip)),
                               start=1)
        for count, imdb_id in map(lambda p: (p[0], p[1].split(',')[0]), lines_iter):
            pending_ids.append(imdb_id)
            if count % ids_per_request == 0:
                movies_found += find_matching_subs(os_client, pending_ids, lang, subs_links_f, movies_with_subs_list_f)
                lines_to_skip += ids_per_request
                print('Processed {0} lines. Found subtitles for {1} movies\n'.format(lines_to_skip, movies_found))
                pending_ids = []

        # Flush the ids left over from the last (partial) batch.
        if pending_ids:
            movies_found += find_matching_subs(os_client, pending_ids, lang, subs_links_f, movies_with_subs_list_f)
            lines_to_skip += len(pending_ids)
            print('Processed {0} lines. Found subtitles for {1} movies\n'.format(lines_to_skip, movies_found))
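
A hedged usage sketch with hypothetical file names, assuming each line of the source file starts with an IMDb id followed by comma-separated metadata and that find_matching_subs is defined in the same module:

find_subs_links_from_ids_list('movie_ids.csv', 'subs_links.txt', 'movies_with_subs.txt')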