def get_video(self) -> None:
    """Download video and write output to folder"""
    self.create_path()
    output_folder = self.get_path()['path']
    COLLECTION.update_one({'_id': self.video_id},
                          {'$set': {'path': output_folder}})
    y = YoutubeDl(output_folder=output_folder, mode=self.mode)
    ydl = y.ydl
    # Set the processing flag to True while the video is being downloaded so
    # that other processes don't attempt to download it at the same time;
    # once the download finishes, set processed to True and processing back
    # to False.
    try:
        with ydl:
            self.vid.processing = True
            info_dict = ydl.extract_info(self.video_id)
            ydl.process_info(info_dict)
            self.vid.processing = False
            self.vid.processed = True
            self.vid.file_path = output_folder
            self.vid.save_video()
    except (DownloadError, SameFileError) as e:
        logger.error(e)
        logger.info('failed to download video %s', self.video_id)
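
# A minimal sketch of the YoutubeDl wrapper that get_video() relies on; the
# real class lives elsewhere in the project, so the option set here is an
# assumption. 'outtmpl' and 'quiet' are standard youtube_dl options, and
# youtube_dl.YoutubeDL supports the context-manager protocol used above.
import os

import youtube_dl


class YoutubeDl:
    def __init__(self, output_folder: str, mode: str):
        opts = {
            # write the download into the per-video folder
            'outtmpl': os.path.join(output_folder, '%(title)s.%(ext)s'),
            # suppress console output when running in test mode
            'quiet': mode == 'test',
        }
        self.ydl = youtube_dl.YoutubeDL(opts)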
def write_nfo(self) -> None:
    """Create an nfo file and write it to the video's path"""
    i = COLLECTION.find_one({'_id': self.video_id}, {
        '_id': True,
        'uploader': True,
        'title': True,
        'description': True,
        'upload_date': True
    })
    try:
        with open(os.path.join(APP_PATH, 'template.nfo'), 'r') as fl:
            template = Template(fl.read())
    except FileNotFoundError as e:
        logger.error(e)
        # without the template there is nothing to substitute into, so bail
        # out rather than hit a NameError below
        return
    out_template = template.substitute(unique_id=i['_id'],
                                       studio=i['uploader'],
                                       title=i['title'],
                                       plot=i['description'],
                                       date_prem=i['upload_date'])
    path = os.path.join(self.get_path()['path'], 'tvshow.nfo')
    logger.info('writing nfo to path %s', path)
    with open(path, 'w') as fl:
        fl.write(out_template)
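
# A hedged sketch of what 'template.nfo' could contain, inferred from the
# keyword arguments passed to substitute() above; the placeholders use the
# standard string.Template '$name' syntax, but the real template shipped
# with the project may be laid out differently.
EXAMPLE_NFO_TEMPLATE = """\
<tvshow>
    <uniqueid>$unique_id</uniqueid>
    <studio>$studio</studio>
    <title>$title</title>
    <plot>$plot</plot>
    <premiered>$date_prem</premiered>
</tvshow>
"""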
def save_video(self) -> Union[None, str]:
    """Save the Video instance to the database

    Returns:
        None: success
        str: test mode message
    """
    if self._mode == 'test':
        return 'Data cannot be saved in test mode'
    doc = {}
    # strip the private modifier from each attribute name before saving
    for k, v in self.__dict__.items():
        if k == '_video_id':
            doc['_id'] = v
        elif k == '_processed':
            doc['Processed'] = v
        else:
            doc[k.lstrip('_')] = v
    COLLECTION.replace_one({'_id': self._video_id}, doc)
    logger.info('%s updated', self._video_id)
    return None
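
# Illustrative shape of the document save_video() writes, assuming the
# attributes assigned in __init__ below (the values here are made up):
# {
#     '_id': 'dQw4w9WgXcQ',    # renamed from _video_id
#     'Processed': True,       # renamed from _processed
#     'mode': 'prod',          # leading underscore stripped from _mode
#     'title': 'Example',
#     ...                      # uploader, upload_date, description,
#                              # thumbnail, tags, file_path, processing
# }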
def get_thumbnail(self) -> None:
    """Download thumbnail to path"""
    # TODO fix path to use same as video_downloader
    url = COLLECTION.find_one({'_id': self.video_id},
                              {'thumbnail': True})['thumbnail']
    # .content reads the whole body anyway, so streaming the response buys
    # nothing here; a timeout keeps a dead host from blocking forever
    data = requests.get(url, timeout=30)
    image_data = Image.open(io.BytesIO(data.content))
    path = self.get_path()['path']
    logger.info('writing thumbnail to path %s', path)
    image_data.save(os.path.join(path, 'thumbnail.jpg'), 'jpeg')
def create_path(self) -> None:
    """Create folder for video from tags"""
    check = self.get_path()
    exists: bool = check['exists']
    path: str = check['path']
    if not exists:
        os.makedirs(path)
        logger.info('Creating path %s', path)
    else:
        logger.info('Existing path %s found', path)
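
# A hedged sketch of the get_path() helper used above; only its return shape
# ({'path': str, 'exists': bool}) is visible in this section, so the folder
# layout and the MEDIA_ROOT constant below are assumptions.
import os

MEDIA_ROOT = '/media/videos'  # hypothetical base directory


def get_path(self) -> dict:
    # build the per-video folder from the hypothetical media root, the
    # video's tags, and its id
    path = os.path.join(MEDIA_ROOT, *self._tags, self._video_id)
    return {'path': path, 'exists': os.path.isdir(path)}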
def get_video_info(video_id: str, tags: List = None) -> Union[None, dict]:
    """Get metadata for a video

    Args:
        video_id (str): The id of a youtube video
        tags (list): A list of tags

    Returns:
        None: if exception
        dict: example
            {'_id': 'video_id',
             'Processed': False,
             'title': 'video title',
             'uploader': 'video creator',
             'upload_date': datetime parsed from '%Y%m%d',
             'description': 'video description',
             'thumbnail': 'video thumbnail (url)',
             'tags': [tags]}

    Raises:
        TypeError: invalid video_id type
    """
    if not isinstance(video_id, str):
        raise TypeError(f'{video_id} should be str not {type(video_id)}')
    if tags is None:
        tags = []
    try:
        video_info = INFO_EXTRACTOR.extract_info(video_id)
    except youtube_dl.utils.DownloadError as e:
        logger.error(e)
        logger.info('%s is not a valid id', video_id)
        return None
    info = {
        '_id': video_info['id'],
        'Processed': False,
        'title': video_info['title'],
        'uploader': video_info['uploader'],
        'upload_date': datetime.strptime(video_info['upload_date'], '%Y%m%d'),
        'description': video_info['description'],
        'thumbnail': video_info['thumbnail'],
        'tags': tags
    }
    return info
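
# Usage sketch (performs a network call; the id is illustrative):
# info = get_video_info('dQw4w9WgXcQ', tags=['music'])
# if info is not None:
#     print(info['title'], info['upload_date'].year)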
def create_analysis_object(qo):
    """Take a list of QueryObjects and GET data from the page specified in
    each one's url_query attribute.

    :param qo: A list of QueryObjects
    :type qo: list[QueryObject]
    """
    try:
        for qu in qo:
            logger.info('pid: %d - processing: %s', os.getpid(), qu.title)
            ao = AnalysisObject(title=qu.title, info=get_info(qu.url_query))
            push_object(ao, 'analysis_objects0')
    except AttributeError as e:
        logger.error(e)
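
# Hedged sketches of the two container types used above, inferred from the
# attributes accessed (title, url_query) and the constructor call
# (title=..., info=...); the real classes may carry more fields.
from dataclasses import dataclass


@dataclass
class QueryObject:
    title: str
    url_query: str


@dataclass
class AnalysisObject:
    title: str
    info: str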
def __init__(self, mode: str, video_id: str, processed: bool, title: str,
             uploader: str, upload_date: str, description: str,
             thumbnail: str, tags: List[str], file_path: str):
    logger.info('Video object initialised')
    self._mode = mode
    self._video_id: str = video_id
    self._processed: bool = processed
    self._title: str = title
    self._uploader: str = uploader
    self._upload_date: str = upload_date
    self._description: str = description
    self._thumbnail: str = thumbnail
    self._tags: Optional[List[str]] = tags
    self._file_path: str = file_path
    self._processing: bool = False
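
# Construction sketch, assuming the enclosing class is named Video (the
# class statement itself is not shown in this section; values illustrative):
# vid = Video(mode='test', video_id='dQw4w9WgXcQ', processed=False,
#             title='Example', uploader='Example Channel',
#             upload_date='20200101', description='', thumbnail='',
#             tags=['music'], file_path='')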
def delete_video(self, check: bool = False) -> Union[Tuple, str]:
    """Remove the current object from the database along with any files
    or folders

    Args:
        check (bool): must be explicitly set to True for the video to
            be deleted

    Returns:
        tuple: (0, '') - the document was not found
        tuple: (1, '') - the document was found and deleted but had no
            path key
        tuple: (2, path) - the document and path were found and both
            were deleted
        str: check did not pass, or data cannot be deleted in test mode
    """
    if self._mode == 'test':
        return 'Data cannot be deleted in test mode'
    if not check:
        return ''
    result = COLLECTION.find_one({'_id': self.video_id}, {
        '_id': True,
        'path': True
    })
    if not result:
        res = (0, '')
    elif 'path' not in result:
        COLLECTION.delete_one({'_id': self.video_id})
        res = (1, '')
        logger.info('video_id %s deleted - no folder found', self.video_id)
    else:
        COLLECTION.delete_one({'_id': self.video_id})
        path: str = result['path']
        # ignore_errors=True already swallows a missing folder, so no
        # FileNotFoundError handling is needed
        shutil.rmtree(os.path.split(path)[0], ignore_errors=True)
        res = (2, path)
        logger.info('video_id %s deleted - folder %s deleted',
                    self.video_id, path)
    return res
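
# Usage sketch: deletion is a no-op unless check=True is passed explicitly,
# which guards against accidental calls:
# result = vid.delete_video(check=True)  # e.g. (2, '/media/videos/...')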
def add_queue(video_id: str, tags: list = None) -> bool:
    """If the video_id does not exist, insert an entry into the database
    from the information provided by get_video_info

    Args:
        video_id (str): The id of a youtube video
        tags (list): A list of tags

    Returns:
        True: item was added
        False: the id already exists or the metadata lookup failed

    Raises:
        TypeError: invalid video_id type
    """
    ret = False
    if tags is None:
        tags = ['undefined']
    if not isinstance(video_id, str):
        raise TypeError(f'{video_id} should be str not {type(video_id)}')
    if check_db(video_id):
        logger.info('video_id %s already exists in database', video_id)
    else:
        vid_info = get_video_info(video_id, tags)
        if vid_info is not None:
            # insert the result already fetched rather than calling
            # get_video_info (and the network) a second time
            COLLECTION.insert_one(vid_info)
            logger.info('%s successfully inserted', video_id)
            ret = True
    return ret
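
# A hedged sketch of the check_db() helper called above; only its boolean
# contract is visible here, so the exact query is an assumption (COLLECTION
# is the same pymongo collection used throughout this module).
def check_db(video_id: str) -> bool:
    # count_documents with limit=1 stops scanning after the first match
    return COLLECTION.count_documents({'_id': video_id}, limit=1) > 0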
def get_info(url_query):
    """Perform a GET request on the url passed into the function, parse the
    data and sanitise it of any unwanted data, then return the parsed data
    as a single filtered string

    :param url_query: page to be queried
    :type url_query: str
    :returns: filtered text from the webpage
    :rtype: str
    """
    try:
        req = requests.get(url_query, timeout=30)
    except (requests.exceptions.ConnectionError,
            requests.exceptions.ReadTimeout) as e:
        # requests' ConnectionError is not a subclass of the builtin one,
        # so catch the requests exceptions explicitly
        logger.error(e)
        req = None
    if req is None or req.status_code != 200:
        logger.info('%s returned no information or invalid status',
                    url_query)
        return ''
    remove_tags = ['pre', 'script', 'nav', 'footer', 'form', 'input', 'meta']
    soup = BeautifulSoup(req.text, features='html.parser')
    soup = soup.find('div', attrs={'class': 'mw-parser-output'})
    if soup is None:
        logger.info('%s has no mw-parser-output div', url_query)
        return ''
    for f in remove_tags:
        # remove this stuff from the soup object
        for j in soup.find_all(f):
            j.decompose()
    info = soup.text.replace('\n', '').replace('"', '').replace("'",
                                                                '').lower()
    return info
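
# Usage sketch: 'mw-parser-output' is MediaWiki's article-body wrapper, so
# this function targets wiki pages, e.g.:
# text = get_info('https://en.wikipedia.org/wiki/Natural_language_processing')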
def processed(self, p: bool) -> None:
    if not isinstance(p, bool):
        raise ValueError('processed must be bool')
    self._processed = p
    logger.info('processed attribute set to %s', p)
def file_path(self, path: str) -> None:
    if not isinstance(path, str):
        raise ValueError('Path must be str')
    self._file_path = path
    logger.info('file_path attribute set to %s', path)
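
# Both methods above read as property setters; presumably they are wrapped
# in @processed.setter / @file_path.setter decorators in the full class
# (not shown in this section), giving attribute-style assignment:
# vid.processed = True
# vid.file_path = '/media/videos/dQw4w9WgXcQ'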
# pylint: disable=all
"""Import test data into redis"""
import os
import pickle

from app.project_logging import logger
from app.database import push_object, flush_redis

file = os.path.join(os.path.dirname(__file__), 'analysis_objects0.pickle')
logger.info('Reading from %s', file)
with open(file, 'rb') as fl:
    data_in = pickle.loads(fl.read())


def main():
    start_count = len(data_in)
    end_count = 0
    while data_in:
        data = data_in.pop(0)
        push_object(data, 'analysis_objects0')
        end_count += 1
    logger.info('Imported %d out of %d', end_count, start_count)


if __name__ == '__main__':
    # flush any existing keys, then run the import
    flush_redis()
    main()