def build_items_to_download_map(subs_list_file, lines_to_skip, index):
    """Parse a subtitles list file into an ordered map of IMDb id -> items.

    The file is expected to contain header lines of the form ``imdb_id:count``,
    each followed by ``count`` detail lines whose fields are separated by
    ``#``; field number *index* is collected from every detail line.

    :param subs_list_file: path of the list file to parse.
    :param lines_to_skip: number of leading lines to drop before parsing.
    :param index: position of the ``#``-separated field to extract.
    :return: OrderedDict mapping each imdb_id to its list of extracted fields,
        preserving file order.
    """
    movies_map = OrderedDict()
    with open(subs_list_file, 'r') as subs_list_f:
        for line in map(lambda l: l.rstrip('\r\n'),
                        utils.iterate_after_dropping(subs_list_f, lines_to_skip)):
            imdb_id, s_num = line.split(':')
            # islice consumes exactly s_num detail lines from the same file
            # handle (replacing the readline-via-map(repeat) trick) and stops
            # cleanly at EOF instead of yielding empty reads, which would have
            # produced bogus '' entries (or IndexError for index > 0).
            movies_map[imdb_id] = [
                detail.split('#')[index].rstrip('\r\n')
                for detail in itertools.islice(subs_list_f, int(s_num))
            ]
    return movies_map
def extract_imdb_data_from_ids(ids_file, data_folder, lines_to_skip=0):
    """Fetch plots/synopses for every IMDb id listed in *ids_file*.

    For each id, retrieves plot summaries and synopsis via ``get_movie_info``
    and writes them (newline-separated) to ``<data_folder>/<movie_id>.txt``.

    :param ids_file: path of a file with one IMDb id per line.
    :param data_folder: destination folder, created if missing.
    :param lines_to_skip: leading lines of ids_file to ignore (resume aid);
        also reused as a running line counter for progress messages.
    """
    os.makedirs(data_folder, exist_ok=True)
    with open(ids_file, "r") as f:
        for movie_id in map(lambda l: l.rstrip("\r\n"),
                            utils.iterate_after_dropping(f, lines_to_skip)):
            lines_to_skip += 1
            print("Processing movie {0} at line {1}...".format(movie_id, lines_to_skip))
            plot_summaries_text, synopsis_text = get_movie_info(movie_id)
            print("Found: {0} plot(s), {1} synopsis/es".format(
                len(plot_summaries_text), int(bool(synopsis_text))))
            # os.path.join is correct whether or not data_folder carries a
            # trailing separator; plain concatenation silently produced a
            # broken path ("datafoldertt0123456.txt") without one.
            with open(os.path.join(data_folder, movie_id + ".txt"), "w") as meta_f:
                # NOTE: when no synopsis exists, synopsis_text is falsy and
                # still appended, yielding a trailing empty line — preserved
                # so existing downstream parsing is unaffected.
                meta_f.write("\n".join(plot_summaries_text + [synopsis_text]))
    print("Plots/synopses extraction completed.")
def merge_subs(subs_folder, subs_folder_merged):
    """Merge every .srt file of each movie folder into one plain-text file.

    For each sub-folder of *subs_folder*, concatenates the dialogue text of
    all its ``.srt`` subtitle files — timing/index lines skipped, inline
    markup stripped — into ``<subs_folder_merged>/<folder>.txt`` as one
    space-separated stream.

    :param subs_folder: folder containing one sub-folder of .srt files per movie.
    :param subs_folder_merged: destination folder, created if missing.
    """
    os.makedirs(subs_folder_merged, exist_ok=True)
    for folder in os.listdir(subs_folder):
        print('Processing folder: {0}'.format(folder))
        # NOTE(review): ISO-8859-1 is assumed for both input and output files;
        # confirm the subtitle corpus really uses this encoding.
        with open(subs_folder_merged + '/' + folder + '.txt', 'w', encoding='ISO-8859-1') as out:
            for sub in glob.glob(subs_folder + '/' + folder + '/*.srt'):
                with open(sub, 'r', encoding='ISO-8859-1') as in_file:
                    # The first 3 lines of each file are dropped — presumably
                    # the first SRT block's index and timestamp plus one more
                    # line; verify against utils.iterate_after_dropping.
                    for line in map(lambda l: l.rstrip('\r\n'), utils.iterate_after_dropping(in_file, 3)):
                        if line == '':
                            # A blank line ends a subtitle block: consume the
                            # next block's index and timestamp lines directly
                            # from the file handle, bypassing the outer loop.
                            for _ in itertools.repeat(None, 2):
                                in_file.readline()
                        else:
                            if re.search('<(.*?)>', line):
                                # Line carries inline tags (e.g. <i>...</i>):
                                # keep only its visible text.
                                bs = BeautifulSoup(line, 'lxml')
                                out.write(bs.get_text() + ' ')
                            else:
                                out.write(line + ' ')
def find_subs_links_from_ids_list(source_file, subs_links_file, movies_with_subs_list_file,
                                  lines_to_skip=0, ids_per_request=10, lang='eng',
                                  os_client=None):
    """Look up subtitle links for a list of IMDb ids, querying in batches.

    Reads comma-separated lines from *source_file* (first field = IMDb id),
    queries the subtitles service in batches of *ids_per_request* ids, and
    writes the matches to *subs_links_file* and *movies_with_subs_list_file*.

    :param source_file: input list, one movie per line, imdb id first.
    :param subs_links_file: output file for subtitle download links.
    :param movies_with_subs_list_file: output file listing movies with subs.
    :param lines_to_skip: leading lines of source_file to ignore (resume aid);
        reused as the running "lines processed" counter.
    :param ids_per_request: batch size for each service query.
    :param lang: subtitle language code.
    :param os_client: service client; created lazily when omitted, so merely
        importing this module no longer opens a connection as a side effect
        (the old ``utils.ScriBa.get_client()`` default ran at import time).
    """
    if os_client is None:
        os_client = utils.ScriBa.get_client()
    with open(source_file, 'r') as source_f, open(subs_links_file, 'w') as subs_links_f, \
            open(movies_with_subs_list_file, 'w') as movies_with_subs_list_f:
        pending_ids = []
        movies_found = 0
        for line in map(lambda l: l.rstrip('\r\n'),
                        utils.iterate_after_dropping(source_f, lines_to_skip)):
            pending_ids.append(line.split(',')[0])
            # Flush exactly every ids_per_request ids. The original condition
            # (``count and not count % ids_per_request``) fired one line late:
            # the first batch held ids_per_request + 1 ids while the counter
            # only advanced by ids_per_request, so the progress report drifted
            # by one for the whole run. Testing the batch size is exact.
            if len(pending_ids) == ids_per_request:
                movies_found += find_matching_subs(os_client, pending_ids, lang,
                                                   subs_links_f, movies_with_subs_list_f)
                lines_to_skip += ids_per_request
                print('Processed {0} lines. Found subtitles for {1} movies\n'.format(
                    lines_to_skip, movies_found))
                pending_ids = []
        # Final partial batch (fewer than ids_per_request ids left over).
        if pending_ids:
            movies_found += find_matching_subs(os_client, pending_ids, lang,
                                               subs_links_f, movies_with_subs_list_f)
            lines_to_skip += len(pending_ids)
            print('Processed {0} lines. Found subtitles for {1} movies\n'.format(
                lines_to_skip, movies_found))