def parse(self):
    """Parse nwt videos."""
    self.work_dir = expandpath(self.work_dir)
    self.input = expandpath(self.input)
    self.books = [int(bk) for bk in self.book.split(',')]
    self.chapters = [int(chp) for chp in self.chapter.split(',')]
    self._get_db()
    print('This may take several minutes', flush=True)
    verse_videos = self.get_cutup_verses()
    match_videos = self.get_match_videos()
    self.num_bookname = parse_num_book(
        get_nwt_video_info(match_videos[0], 'lang'))
    add_numeration(self.work_dir, self.num_bookname)
    print(f'Getting chapter marks from {self.input}', end='\t-> ')
    result = []
    for video in match_videos:
        booknum = get_nwt_video_info(video, 'booknum')
        json_markers = probe_markers(video)
        markers = parse_markers_nwt(json_markers, video,
                                    bookname=self.num_bookname[booknum])
        for mark in markers:
            # print(mark['title'], end='\t')
            if self.db.get(woext(video)) == os.stat(video).st_size and \
                    verse_videos.get(mark['title']):
                # verse already exists and is the latest version; do nothing
                pass
                # print('already exists')
            else:
                result.append(mark)
        self.db[woext(video)] = os.stat(video).st_size
    self.write_json(self.db)
    print(f'{len(result)} found\n')
    return result

def raw_parse(self):
    """Parse any video."""
    self._get_db()
    result = []
    match_videos = self.get_match_videos()
    verse_videos = self.get_cutup_verses()
    for video in match_videos:
        json_markers = probe_markers(video)
        markers = parse_markers_raw(json_markers, video)
        for mark in markers:
            # print(mark['title'], end='\t')
            # print(verse_videos.get(mark['title']), mark['title'])
            if self.db.get(woext(video)) == os.stat(video).st_size and \
                    verse_videos.get(mark['title']):
                # verse already exists and is the latest version; do nothing
                pass
            else:
                result.append(mark)
        self.db[woext(video)] = os.stat(video).st_size
    self.write_json(self.db)
    # print('raw', verse_videos)
    return result

def get_cutup_verses(self):
    print(f'Getting verse videos from {self.work_dir}', end='\t-> ',
          flush=True)
    path = pj(self.work_dir, 'db', 'ready.json')
    try:
        with open(path, 'r', encoding='utf-8') as jsonfile:
            self.ready = json.load(jsonfile)
    except (FileNotFoundError, UnsupportedOperation, JSONDecodeError):
        self.ready = {}
    versiculos = {}
    for dirpath, dirnames, filenames in os.walk(self.work_dir):
        # top level and one subdirectory level only
        if dirpath[len(self.work_dir):].count(os.sep) < 2:
            for filename in sorted(filenames):
                if (filename.endswith('.mp4') or filename.endswith('.m4v')
                        ) and not filename.startswith('nwt'):
                    if self.ready.get(woext(filename)) == os.stat(
                            pj(dirpath, filename)).st_size:
                        # size matches the cached entry: accept without probing
                        versiculos.update(
                            {woext(filename): pj(dirpath, filename)})
                        # print(f'...fast...{filename}')
                    elif 'vbastianpc' in ffprobe_signature(
                            pj(dirpath, filename)):
                        # slower path: probe the file for the signature
                        versiculos.update(
                            {woext(filename): pj(dirpath, filename)})
                        self.ready.update({
                            woext(filename):
                            os.stat(pj(dirpath, filename)).st_size
                        })
                        # print(f'...slow...{filename}')
    with open(path, 'w', encoding='utf-8') as jsonfile:
        json.dump(self.ready, jsonfile, ensure_ascii=False, indent=4)
    print(f'{len(versiculos)} found')
    return versiculos

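# A minimal sketch of the path helpers assumed by the methods above. These are
# hypothetical re-implementations for readability only; the real pj() and
# woext() are defined elsewhere in this repo and may differ in detail:
#
#   def pj(*parts):
#       """Shorthand for os.path.join."""
#       return os.path.join(*parts)
#
#   def woext(path):
#       """Return the path without its file extension."""
#       return os.path.splitext(path)[0]
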
def download_media(self, media, directory, check_only=False):
    """Download media file and check it.

    Download the file, check MD5 sum and size, and delete the file if it
    mismatches.

    :param media: a Media instance
    :param directory: dir to save the files to
    :param check_only: bool, True means no downloading
    :return: filename, or None if unsuccessful
    """
    if not os.path.exists(directory) and not self.download:
        return None
    os.makedirs(directory, exist_ok=True)

    base = urllib.parse.urlparse(media.url).path
    if self.title:
        file_extension = os.path.splitext(os.path.basename(base))[-1]
        title = media.name.replace('"', "'").replace(':', '.')
        base = ''.join(c if c.isalnum() or c in ".-_()¡!¿';, " else ''
                       for c in title) + file_extension
    else:
        base = os.path.basename(base)

    # Delete files with the same basename in the main dir
    if self.type == 'video':
        for path, dirnames, filenames in os.walk(directory):
            for filename in filenames:
                if filename == base:
                    pass
                elif woext(filename) == woext(base):
                    os.remove(os.path.join(path, filename))
                    print('deleted:', os.path.join(path, filename))
            break

    file = os.path.join(directory, base)

    # Only try resuming and downloading once
    resumed = False
    downloaded = False
    progressbar = not self.subtitles

    while True:
        if os.path.exists(file):
            # Set timestamp to date of publishing
            # NOTE: Do this before checking _checked_files since
            # this is not done for newly renamed .part files!
            if media.date:
                os.utime(file, (media.date, media.date))

            if os.path.getsize(file) == media.size or not media.size:
                # File size is OK or unknown - Validate checksum
                if self.checksums and media.md5 and _md5(file) != media.md5:
                    # Checksum is bad - Remove
                    if self.quiet < 2:
                        msg('checksum mismatch, deleting: {}'.format(base))
                    os.remove(file)
                else:
                    # Checksum is correct
                    return file
            else:
                # File size is bad - Delete
                msg('size mismatch, deleting: {}'.format(base))
                os.remove(file)

        elif check_only:
            # The rest of this method is only applicable in download mode
            return None

        elif os.path.exists(file + '.part'):
            fsize = os.path.getsize(file + '.part')

            if fsize == media.size or not media.size:
                # File size is OK - Validate checksum
                if self.checksums and media.md5 \
                        and _md5(file + '.part') != media.md5:
                    # Checksum is bad - Remove
                    if self.quiet < 2:
                        msg('checksum mismatch, deleting: {}'.format(
                            base + '.part'))
                    os.remove(file + '.part')
                else:
                    # Checksum is correct or unknown - Move and approve
                    os.rename(file + '.part', file)
                    return file
            elif fsize < media.size and not resumed:
                # File is smaller - Resume download once
                resumed = True
                if self.quiet < 2:
                    msg('resuming: {} ({})'.format(base + '.part', media.name))
                _curl(
                    media.url,
                    file + '.part',
                    resume=True,
                    rate_limit=self.rate_limit,
                    curl_path=self.curl_path,
                    progress=progressbar,
                )
            else:
                # File size is bad - Remove
                msg('size mismatch, deleting: {}'.format(base + '.part'))
                os.remove(file + '.part')

        else:
            # Download whole file once
            if not downloaded:
                msg('downloading: {} ({})'.format(base, media.name))
                _curl(
                    media.url,
                    file + '.part',
                    rate_limit=self.rate_limit,
                    curl_path=self.curl_path,
                    progress=progressbar,
                )
                downloaded = True
            else:
                # If we get here, all tests have failed and both a resumed
                # and a regular download have been tried.
                # There is nothing left to do.
                msg('failed to download: {} ({})'.format(base, media.name))
                return None
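
# A minimal sketch of the checksum helper assumed above. This is a
# hypothetical re-implementation; the real _md5() lives elsewhere in this
# module and may differ (e.g. in block size):
#
#   import hashlib
#
#   def _md5(path, blocksize=65536):
#       """Return the hex MD5 digest of a file, read in chunks."""
#       digest = hashlib.md5()
#       with open(path, 'rb') as f:
#           for chunk in iter(lambda: f.read(blocksize), b''):
#               digest.update(chunk)
#       return digest.hexdigest()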