def __init__(self, cfg, max_results=50):
    """Remember the configuration and build the API client.

    :arg cfg: project configuration; ``api_service_name``,
        ``api_version`` and ``api_key`` are looked up through
        ``get_from_config`` with ``'youtube'`` as the third argument
        (presumably the section name)
    :arg max_results: stored unchanged as ``self.max_results``
    """
    self.max_results = max_results
    self.cfg = cfg

    # Resolve the three client settings first (same order as the call
    # arguments), then build the service object in one go.
    service_name = get_from_config(cfg, 'api_service_name', 'youtube')
    service_version = get_from_config(cfg, 'api_version', 'youtube')
    developer_key = get_from_config(cfg, 'api_key', 'youtube')
    self.svc = build(service_name, service_version,
                     developerKey=developer_key)
def video_to_dict(self, video):
    """Convert a ``youtube#video`` resource to a plain python dict.

    :arg video: mapping with an ``id`` key and optional ``snippet``,
        ``status`` and ``player`` sub-mappings

    :returns: dict of video metadata; title, description and speakers
        come from the parse helpers, the category from the project
        configuration

    :raises KeyError: if ``video`` has no ``id``
    """
    snippet = video.get('snippet', {})
    status = video.get('status', {})
    player = video.get('player', {})
    thumbnail = snippet.get('thumbnails', {}).get('high', {})
    video_id = video['id']

    # The language is fixed to 'English'.  A disabled block that read it
    # from the configuration used to live here as a bare string
    # statement; it was dead code and has been removed.

    title = parse_title(snippet.get('title', ''))

    # The raw description carries both the speaker list and the
    # description text; the helper splits the two apart.
    parsed = parse_speakers_and_description(snippet.get('description', ''))
    speakers = parse_speakers(parsed['speakers'])

    item = {
        'category': get_from_config(self.cfg, 'category'),
        'title': title,
        'description': parsed['description'],
        'copyright_text': status.get('license', ''),
        # Only the first ten characters of publishedAt are kept.
        'recorded': snippet.get('publishedAt', '')[0:10],
        'thumbnail_url': thumbnail.get('url', ''),
        'embed': player.get('embedHtml', ''),
        'summary': '',
        'language': 'English',
        'state': DRAFT,
        'whiteboard': 'needs editing',
        'quality_notes': '',
        'slug': '',
        'speakers': speakers,
        'source_url': 'https://www.youtube.com/watch?v={}'.format(video_id),
    }
    return item
def pull_cmd(cfg, parser, parsed, args):
    """Pull the configured category's videos from the server into json files.

    Reads ``username``, ``api_url`` and ``category`` from the project
    configuration, fetches every video listed in that category through
    the REST API and hands the ``(filename, data)`` pairs to
    ``save_json_files``.

    :arg cfg: project configuration
    :arg parser: command line parser; only ``print_byline()`` is used
    :arg parsed: parsed options; ``quiet`` and ``apikey`` are read
    :arg args: positional arguments (unused)

    :returns: 0 on success, 1 when a setting is missing or the category
        does not exist on the server
    """
    if not parsed.quiet:
        parser.print_byline()

    username = get_from_config(cfg, 'username')
    api_url = get_from_config(cfg, 'api_url')
    cat_title = get_from_config(cfg, 'category')

    # Command line api_key overrides config-set api_key
    api_key = parsed.apikey
    if not api_key:
        try:
            api_key = cfg.get('project', 'api_key')
        except ConfigParser.NoOptionError:
            pass
    if not api_key:
        err('Specify an api key either in steve.ini, on command line, '
            'or in API_KEY file.')
        return 1

    if not username or not api_url or not cat_title or not api_key:
        return 1

    api = steve.restapi.API(api_url)

    all_categories = steve.restapi.get_content(
        api.category.get(username=username, api_key=api_key, limit=0))
    cat = [cat_item for cat_item in all_categories['objects']
           if cat_item['title'] == cat_title]

    if not cat:
        err('Category "{0}" does not exist.'.format(cat_title))
        return 1

    # Get the category from the list of 1.
    cat = cat[0]
    out('Retrieved category.')

    data = []
    for counter, video_url in enumerate(cat['videos']):
        # The id is the last path segment of the url.  Dropping any
        # trailing slash first makes this work with or without one
        # (the old ``split('/')[-2]`` required the trailing slash).
        video_id = video_url.rstrip('/').split('/')[-1]

        video_data = steve.restapi.get_content(
            api.video(video_id).get(username=username, api_key=api_key))
        out('Working on "{0}"'.format(video_data['slug']))

        # Nix some tastypie bits from the data.
        for bad_key in ('resource_uri',):
            video_data.pop(bad_key, None)

        # Add id.
        video_data['id'] = video_id

        # Zero-pad the counter; '{0:4d}' padded with spaces, which put
        # blanks into the file names.
        fn = 'json/{0:04d}_{1}.json'.format(counter, video_data['slug'])
        data.append((fn, video_data))

    out('Saving files....')
    save_json_files(cfg, data)

    return 0
def push_cmd(cfg, parser, parsed, args):
    """Push the project's json files to the server after a confirmation.

    :arg cfg: project configuration
    :arg parser: command line parser; only ``print_byline()`` is used
    :arg parsed: parsed options; ``quiet``, ``update`` and ``apikey``
        are read
    :arg args: optional file names; when given, only those files are
        pushed

    :returns: 0 once the push loop has run, 1 on a validation problem
        or when the user declines
    """
    if not parsed.quiet:
        parser.print_byline()

    # Connection settings.
    username = get_from_config(cfg, 'username')
    api_url = get_from_config(cfg, 'api_url')
    update = parsed.update

    # A key given on the command line wins over the configured one.
    api_key = parsed.apikey
    if not api_key:
        try:
            api_key = cfg.get('project', 'api_key')
        except ConfigParser.NoOptionError:
            pass
    if not api_key:
        err('Specify an api key either in steve.ini, on command line, '
            'or in API_KEY file.')
        return 1

    if not (username and api_url and api_key):
        return 1

    data = load_json_files(cfg)
    if args:
        data = [(name, payload) for name, payload in data if name in args]

    # Category rules:
    #
    # 1. Category set in the configuration: a json file either has no
    #    category or has that very category.
    #
    # 2. No category in the configuration: every json file must name
    #    a category (they may differ between files).
    api = steve.restapi.API(api_url)

    all_categories = {}
    for server_cat in steve.richardapi.get_all_categories(api_url):
        all_categories[server_cat['title']] = server_cat

    try:
        category = cfg.get('project', 'category').strip()
    except ConfigParser.NoOptionError:
        category = None
    else:
        if category not in all_categories:
            err('Category "{0}" does not exist on server. Build it there '
                'first.'.format(category))
            return 1

    errors = False
    for name, payload in data:
        file_cat = payload.get('category')

        if category is not None:
            # Rule 1: a category in the file has to match the config.
            if file_cat is not None and str(file_cat).strip() != category:
                err('Category set in configuration ({0}), but {1} has '
                    'different category ({2}).'.format(
                        category, name, file_cat))
                errors = True
            continue

        # Rule 2: the file has to carry a clean, known category.
        if not file_cat:
            err('No category set in configuration and {0} has no '
                'category set.'.format(name))
            errors = True
        elif file_cat != file_cat.strip():
            err('Category "{0}" has whitespace at beginning or '
                'end.'.format(file_cat))
            return 1
        elif file_cat not in all_categories:
            err('Category "{0}" does not exist on server. '
                'Build it there first.'.format(file_cat))
            return 1

    if update:
        # Updating needs an id in every file; report the first miss only.
        for name, payload in data:
            if 'id' not in payload:
                err('id not in contents for "{0}".'.format(name))
                errors = True
                break

    if errors:
        err('Aborting.')
        return 1

    # Show what is about to happen and ask before touching the server.
    out('Pushing to: {0}'.format(api_url))
    out('Username: {0}'.format(username))
    out('api_key: {0}'.format(api_key))
    out('update?: {0}'.format(update))
    out('Once you push, you can not undo it. Push for realz? Y/N')
    answer = raw_input().strip().lower()
    if not answer.startswith('y'):
        err('Aborting.')
        return 1

    for name, payload in data:
        payload['category'] = category or payload.get('category')

        if update:
            out('Updating {0} "{1}" ({2})'.format(
                payload['id'], payload['title'], name))
            try:
                response = api.video(payload['id']).put(
                    payload, username=username, api_key=api_key)
                steve.restapi.get_content(response)
            except steve.restapi.RestAPIException as exc:
                err(' Error?: {0}'.format(exc))
                err(' "{0}"'.format(exc.response.content))
        else:
            # A stale id causes problems when creating, so drop it.
            payload.pop('id', None)

            out('Pushing {0}'.format(name))
            try:
                response = api.video.post(
                    payload, username=username, api_key=api_key)
                vid = steve.restapi.get_content(response)
                if 'id' in vid:
                    payload['id'] = vid['id']
                    out(' Now has id {0}'.format(vid['id']))
                else:
                    err(' Errors?: {0}'.format(vid))
            except steve.restapi.RestAPIException as exc:
                err(' Error?: {0}'.format(exc))
                err(' "{0}"'.format(exc.response.content))

        # Written back in both modes, also after a reported error.
        save_json_file(cfg, name, payload)

    return 0
} return item if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("--maxresults", help="Max results", default=50) parser.add_argument("-c", "--channel", action='store_true', help="YouTube channel id") parser.add_argument("-p", "--playlist", action='store_true', help="YouTube playlist id") args = parser.parse_args() cfg = get_project_config() scraper = YouTubeScraper(cfg, max_results=args.maxresults) if args.channel: channel_id = get_from_config(cfg, 'channel_id', 'youtube') print("scraping channel {}".format(channel_id)) data = scraper.scrape_channel(channel_id) save_json_files(cfg, data) elif args.playlist: playlist_id = get_from_config(cfg, 'playlist_id', 'youtube') print("scraping playlist {}".format(playlist_id)) data = scraper.scrape_playlist(playlist_id) save_json_files(cfg, data) else: print("nothing to do. no channel or playlist requested")
def push_cmd(cfg, parser, parsed, args):
    """Push the project's json files to the server after a confirmation.

    Validates the categories of all selected files first, prints a
    summary, asks the user to confirm and then creates (or, with
    ``parsed.update``, updates) one video per file.  Each pushed file is
    written back with ``save_json_file``.

    :arg cfg: project configuration
    :arg parser: command line parser; only ``print_byline()`` is used
    :arg parsed: parsed options; ``quiet``, ``update``, ``apikey`` and
        ``overwrite`` are read
    :arg args: optional file names; when given, only those files are
        pushed

    :returns: 0 once the push loop has run, 1 on a validation problem
        or when the user declines
    """
    if not parsed.quiet:
        parser.print_byline()

    # Get username, api_url and api_key.
    username = get_from_config(cfg, 'username')
    api_url = get_from_config(cfg, 'api_url')
    update = parsed.update

    # Command line api_key overrides config-set api_key
    api_key = parsed.apikey
    if not api_key:
        try:
            api_key = cfg.get('project', 'api_key')
        except ConfigParser.NoOptionError:
            pass
    if not api_key:
        err('Specify an api key either in steve.ini, on command line, '
            'or in API_KEY file.')
        return 1

    if not username or not api_url or not api_key:
        return 1

    data = load_json_files(cfg)
    if args:
        data = [(fn, contents) for fn, contents in data if fn in args]

    # There are two modes:
    #
    # 1. User set category in configuration. Then the json files can
    #    either have no category set or they have to have the same
    #    category set.
    #
    # 2. User has NOT set category in configuration. Then the json
    #    files must all have the category set. The categories can be
    #    different.
    #
    # Go through and make sure there aren't any problems with
    # categories.
    all_categories = {
        cat['title']: cat
        for cat in steve.richardapi.get_all_categories(api_url)
    }

    try:
        category = cfg.get('project', 'category')
        category = category.strip()
        if category not in all_categories:
            err('Category "{0}" does not exist on server. Build it there '
                'first.'.format(category))
            return 1
        else:
            out('Category {0} exists on site.'.format(category))
    except ConfigParser.NoOptionError:
        category = None

    errors = False
    for fn, contents in data:
        if category is None:
            this_cat = contents.get('category')
            if not this_cat:
                err('No category set in configuration and {0} has no '
                    'category set.'.format(fn))
                errors = True
            elif this_cat != this_cat.strip():
                err('Category "{0}" has whitespace at beginning or '
                    'end.'.format(this_cat))
                return 1
            elif this_cat not in all_categories:
                err('Category "{0}" does not exist on server. '
                    'Build it there first.'.format(this_cat))
                return 1
        else:
            this_cat = contents.get('category')
            if this_cat is not None and str(this_cat).strip() != category:
                err('Category set in configuration ({0}), but {1} has '
                    'different category ({2}).'.format(
                        category, fn, this_cat))
                errors = True

    if update:
        # Updating needs an id in every file; stop at the first miss.
        for fn, contents in data:
            if 'id' not in contents:
                err('id not in contents for "{0}".'.format(fn))
                errors = True
                break

    if errors:
        err('Aborting.')
        return 1

    # Everything looks ok. So double-check with the user and push.
    out('Pushing to: {0}'.format(api_url))
    out('Username: {0}'.format(username))
    out('api_key: {0}'.format(api_key))
    out('update?: {0}'.format(update))
    out('# videos: {0}'.format(len(data)))
    out('Once you push, you can not undo it. Push for realz? Y/N')
    if not raw_input().strip().lower().startswith('y'):
        err('Aborting.')
        return 1

    for fn, contents in data:
        contents['category'] = category or contents.get('category')

        if not update:
            # Nix any id field since that causes problems.
            if 'id' in contents:
                if not parsed.overwrite:
                    # Reported through out() like every other message
                    # here; this used to be a lone print statement.
                    out('Skipping... already exists')
                    continue
                del contents['id']

            out('Pushing {0}'.format(fn))
            try:
                vid = steve.richardapi.create_video(
                    api_url, api_key, contents)
                if 'id' in vid:
                    contents['id'] = vid['id']
                    out(' Now has id {0}'.format(vid['id']))
                else:
                    err(' Errors?: {0}'.format(vid))
            except steve.restapi.RestAPIException as exc:
                err(' Error?: {0}'.format(exc))
                err(' "{0}"'.format(exc.response.content))

        else:
            out('Updating {0} "{1}" ({2})'.format(
                contents['id'], contents['title'], fn))
            try:
                # The return value is not needed for an update.
                steve.richardapi.update_video(
                    api_url, api_key, contents['id'], contents)
            except steve.restapi.RestAPIException as exc:
                err(' Error?: {0}'.format(exc))
                err(' "{0}"'.format(exc.response.content))

        # Written back in both modes, also after a reported error.
        save_json_file(cfg, fn, contents)

    return 0
def pull_cmd(cfg, parser, parsed, args):
    """Pull the configured category's videos from the server into json files.

    Reads ``username``, ``api_url`` and ``category`` from the project
    configuration, fetches every video listed in that category through
    the REST API and hands the ``(filename, data)`` pairs to
    ``save_json_files``.

    :arg cfg: project configuration
    :arg parser: command line parser; only ``print_byline()`` is used
    :arg parsed: parsed options; ``quiet`` and ``apikey`` are read
    :arg args: positional arguments (unused)

    :returns: 0 on success, 1 when a setting is missing or the category
        does not exist on the server
    """
    if not parsed.quiet:
        parser.print_byline()

    username = get_from_config(cfg, 'username')
    api_url = get_from_config(cfg, 'api_url')
    cat_title = get_from_config(cfg, 'category')

    # Command line api_key overrides config-set api_key
    api_key = parsed.apikey
    if not api_key:
        try:
            api_key = cfg.get('project', 'api_key')
        except ConfigParser.NoOptionError:
            pass
    if not api_key:
        err('Specify an api key either in steve.ini, on command line, '
            'or in API_KEY file.')
        return 1

    if not username or not api_url or not cat_title or not api_key:
        return 1

    api = steve.restapi.API(api_url)

    all_categories = steve.restapi.get_content(
        api.category.get(username=username, api_key=api_key, limit=0))
    cat = [
        cat_item for cat_item in all_categories['objects']
        if cat_item['title'] == cat_title
    ]

    if not cat:
        err('Category "{0}" does not exist.'.format(cat_title))
        return 1

    # Get the category from the list of 1.
    cat = cat[0]
    out('Retrieved category.')

    data = []
    for counter, video_url in enumerate(cat['videos']):
        # The id is the last path segment of the url.  Dropping any
        # trailing slash first makes this work with or without one
        # (the old ``split('/')[-2]`` required the trailing slash).
        video_id = video_url.rstrip('/').split('/')[-1]

        video_data = steve.restapi.get_content(
            api.video(video_id).get(username=username, api_key=api_key))
        out('Working on "{0}"'.format(video_data['slug']))

        # Nix some tastypie bits from the data.
        for bad_key in ('resource_uri', ):
            video_data.pop(bad_key, None)

        # Add id.
        video_data['id'] = video_id

        # Zero-pad the counter; '{0:4d}' padded with spaces, which put
        # blanks into the file names.
        fn = 'json/{0:04d}_{1}.json'.format(counter, video_data['slug'])
        data.append((fn, video_data))

    out('Saving files....')
    save_json_files(cfg, data)

    return 0
def pull(cfg, ctx, quiet, apikey):
    """Pulls data from a richard instance."""
    # The docstring is left as the original one-liner in case it is
    # surfaced as command help (the decorators are not visible here).
    #
    # cfg:    project configuration (username, api_url, category and,
    #         as a fallback, api_key are read from it)
    # ctx:    unused in this function
    # quiet:  when false, VERSION is echoed first
    # apikey: api key from the command line; overrides the configured one
    #
    # Raises click.ClickException when a setting is missing or the
    # configured category does not exist on the server.
    if not quiet:
        click.echo(VERSION)

    username = get_from_config(cfg, 'username')
    api_url = get_from_config(cfg, 'api_url')
    cat_title = get_from_config(cfg, 'category')

    # Command line api_key overrides config-set api_key
    if not apikey:
        try:
            apikey = cfg.get('project', 'api_key')
        except NoOptionError:
            pass
    if not apikey:
        raise click.ClickException(
            u'Specify an api key either in {0}, on command line, '
            u'or in API_KEY file.'.format(get_project_config_file_name())
        )

    if not username or not api_url or not cat_title or not apikey:
        # The check covers the category too, so the message names it.
        raise click.ClickException(
            u'Missing username, api_url, category or api_key.')

    api = steve.restapi.API(api_url)

    all_categories = steve.restapi.get_content(
        api.category.get(username=username, api_key=apikey, limit=0))
    cat = [cat_item for cat_item in all_categories['objects']
           if cat_item['title'] == cat_title]

    if not cat:
        raise click.ClickException(
            u'Category "{0}" does not exist.'.format(cat_title))

    # Get the category from the list of 1.
    cat = cat[0]
    click.echo('Retrieved category.')

    data = []
    for counter, video_url in enumerate(cat['videos']):
        # The id is the last path segment of the url.  Dropping any
        # trailing slash first makes this work with or without one
        # (the old ``split('/')[-2]`` required the trailing slash).
        video_id = video_url.rstrip('/').split('/')[-1]

        video_data = steve.restapi.get_content(
            api.video(video_id).get(username=username, api_key=apikey))
        click.echo('Working on "{0}"'.format(video_data['slug']))

        # Nix some tastypie bits from the data.
        for bad_key in ('resource_uri',):
            video_data.pop(bad_key, None)

        # Add id.
        video_data['id'] = video_id

        # Zero-pad the counter; '{0:4d}' padded with spaces, which put
        # blanks into the file names.
        fn = 'json/{0:04d}_{1}.json'.format(counter, video_data['slug'])
        data.append((fn, video_data))

    click.echo('Saving files....')
    save_json_files(cfg, data)
def push(cfg, ctx, quiet, apikey, update, overwrite, files):
    """Pushes metadata to a richard instance."""
    # The docstring is left as the original one-liner in case it is
    # surfaced as command help (the decorators are not visible here).
    #
    # cfg:       project configuration
    # ctx:       unused in this function
    # quiet:     when false, VERSION is echoed first
    # apikey:    api key from the command line; overrides the configured one
    # update:    update existing videos (every file needs an 'id')
    #            instead of creating new ones
    # overwrite: when creating, push files that already carry an 'id'
    #            (the id is dropped first) instead of skipping them
    # files:     optional file names; when given, only those are pushed
    #
    # Raises click.ClickException on missing settings or validation
    # problems and click.Abort when the user declines.
    if not quiet:
        click.echo(VERSION)

    # Get username, api_url and api_key.
    username = get_from_config(cfg, 'username')
    api_url = get_from_config(cfg, 'api_url')

    # Command line api_key overrides config-set api_key
    if not apikey:
        try:
            apikey = cfg.get('project', 'api_key')
        except NoOptionError:
            pass
    if not apikey:
        raise click.ClickException(
            u'Specify an api key either in {0}, on command line, '
            u'or in API_KEY file.'.format(get_project_config_file_name())
        )

    if not username or not api_url or not apikey:
        raise click.ClickException(u'Missing username, api_url or api_key.')

    data = load_json_files(cfg)
    if files:
        data = [(fn, contents) for fn, contents in data if fn in files]

    # There are two modes:
    #
    # 1. User set category in configuration. Then the json files can
    #    either have no category set or they have to have the same
    #    category set.
    #
    # 2. User has NOT set category in configuration. Then the json
    #    files must all have the category set. The categories can be
    #    different.
    #
    # Go through and make sure there aren't any problems with
    # categories.
    all_categories = {
        cat['title']: cat
        for cat in steve.richardapi.get_all_categories(api_url)
    }

    try:
        category = cfg.get('project', 'category')
        category = category.strip()
        if category not in all_categories:
            raise click.ClickException(
                u'Category "{0}" does not exist on server. Build it there '
                u'first.'.format(category)
            )
        else:
            click.echo('Category {0} exists on site.'.format(category))
    except NoOptionError:
        category = None

    # Collect every problem so the user sees them all at once.
    errors = []
    for fn, contents in data:
        if category is None:
            this_cat = contents.get('category')
            if not this_cat:
                errors.append(
                    u'No category set in configuration and {0} has no '
                    u'category set.'.format(fn)
                )
            elif this_cat != this_cat.strip():
                errors.append(
                    u'Category "{0}" has whitespace at beginning or '
                    u'end.'.format(this_cat)
                )
            elif this_cat not in all_categories:
                errors.append(
                    u'Category "{0}" does not exist on server. '
                    u'Build it there first.'.format(this_cat)
                )
        else:
            this_cat = contents.get('category')
            if this_cat is not None and str(this_cat).strip() != category:
                errors.append(
                    u'Category set in configuration ({0}), but {1} has '
                    u'different category ({2}).'.format(
                        category, fn, this_cat)
                )

    if update:
        for fn, contents in data:
            if 'id' not in contents:
                errors.append(
                    u'id not in contents for "{0}".'.format(fn)
                )

    if errors:
        raise click.ClickException('\n'.join(errors))

    # Everything looks ok. So double-check with the user and push.
    click.echo('Pushing to: {0}'.format(api_url))
    click.echo('Username: {0}'.format(username))
    click.echo('api_key: {0}'.format(apikey))
    click.echo('update?: {0}'.format(update))
    click.echo('# videos: {0}'.format(len(data)))
    click.echo('Once you push, you can not undo it. Push for realz? Y/N')
    if not raw_input().strip().lower().startswith('y'):
        raise click.Abort()

    for fn, contents in data:
        contents['category'] = category or contents.get('category')

        if not update:
            # Nix any id field since that causes problems.
            if 'id' in contents:
                if not overwrite:
                    click.echo(u'Skipping... already exists.')
                    continue
                del contents['id']

            click.echo('Pushing {0}'.format(fn))
            try:
                vid = steve.richardapi.create_video(api_url, apikey, contents)
                if 'id' in vid:
                    contents['id'] = vid['id']
                    click.echo(' Now has id {0}'.format(vid['id']))
                else:
                    click.echo(' Errors?: {0}'.format(vid), err=True)
            except steve.restapi.RestAPIException as exc:
                click.echo(' Error?: {0}'.format(exc), err=True)
                click.echo(' "{0}"'.format(exc.response.content), err=True)

        else:
            click.echo('Updating {0} "{1}" ({2})'.format(
                contents['id'], contents['title'], fn))
            try:
                # The return value is not needed for an update.
                steve.richardapi.update_video(
                    api_url, apikey, contents['id'], contents)
            except steve.restapi.RestAPIException as exc:
                # click has no ``err`` function; errors go to stderr via
                # ``click.echo(..., err=True)`` as in the create branch.
                click.echo(' Error?: {0}'.format(exc), err=True)
                click.echo(' "{0}"'.format(exc.response.content), err=True)

        # Written back in both modes, also after a reported error.
        save_json_file(cfg, fn, contents)
# In[9]: from bs4 import BeautifulSoup from steve.util import (get_project_config, get_from_config, load_json_files, save_json_files) from steve import richardapi import requests # In[7]: cfg = get_project_config() # In[10]: apiurl = get_from_config(cfg, 'api_url') # In[14]: category = richardapi.get_category(apiurl, 'PyCon US 2009') # In[16]: videos = category['videos'] # In[25]:
def pull(cfg, ctx, quiet, apikey):
    """Pulls data from a richard instance."""
    # The docstring is left as the original one-liner in case it is
    # surfaced as command help (the decorators are not visible here).
    #
    # cfg:    project configuration (username, api_url, category and,
    #         as a fallback, api_key are read from it)
    # ctx:    unused in this function
    # quiet:  when false, VERSION is echoed first
    # apikey: api key from the command line; overrides the configured one
    #
    # Raises click.ClickException when a setting is missing or the
    # configured category does not exist on the server.
    if not quiet:
        click.echo(VERSION)

    username = get_from_config(cfg, 'username')
    api_url = get_from_config(cfg, 'api_url')
    cat_title = get_from_config(cfg, 'category')

    # Command line api_key overrides config-set api_key
    if not apikey:
        try:
            apikey = cfg.get('project', 'api_key')
        except ConfigParser.NoOptionError:
            pass
    if not apikey:
        raise click.ClickException(
            u'Specify an api key either in {0}, on command line, '
            u'or in API_KEY file.'.format(get_project_config_file_name())
        )

    if not username or not api_url or not cat_title or not apikey:
        # The check covers the category too, so the message names it.
        raise click.ClickException(
            u'Missing username, api_url, category or api_key.')

    api = steve.restapi.API(api_url)

    all_categories = steve.restapi.get_content(
        api.category.get(username=username, api_key=apikey, limit=0))
    cat = [cat_item for cat_item in all_categories['objects']
           if cat_item['title'] == cat_title]

    if not cat:
        raise click.ClickException(
            u'Category "{0}" does not exist.'.format(cat_title))

    # Get the category from the list of 1.
    cat = cat[0]
    click.echo('Retrieved category.')

    data = []
    for counter, video_url in enumerate(cat['videos']):
        video_id = get_video_id(video_url)

        video_data = steve.restapi.get_content(
            api.video(video_id).get(username=username, api_key=apikey))
        click.echo('Working on "{0}"'.format(video_data['slug']))

        # Nix some tastypie bits from the data.
        for bad_key in ('resource_uri',):
            video_data.pop(bad_key, None)

        # Add id.
        video_data['id'] = video_id

        # Zero-pad the counter; '{0:4d}' padded with spaces, which put
        # blanks into the file names.
        fn = 'json/{0:04d}_{1}.json'.format(counter, video_data['slug'])
        data.append((fn, video_data))

    click.echo('Saving files....')
    save_json_files(cfg, data)
def push(cfg, ctx, quiet, apikey, update, overwrite, files):
    """Pushes metadata to a richard instance."""
    # The docstring is left as the original one-liner in case it is
    # surfaced as command help (the decorators are not visible here).
    #
    # Flow: resolve settings -> load and filter the json files ->
    # validate categories (and ids when updating) -> confirm with the
    # user -> create or update one video per file and save each file.
    if not quiet:
        click.echo(VERSION)

    # Connection settings.
    username = get_from_config(cfg, 'username')
    api_url = get_from_config(cfg, 'api_url')

    # A key given on the command line wins over the configured one.
    if not apikey:
        try:
            apikey = cfg.get('project', 'api_key')
        except ConfigParser.NoOptionError:
            pass
    if not apikey:
        raise click.ClickException(
            u'Specify an api key either in {0}, on command line, '
            u'or in API_KEY file.'.format(get_project_config_file_name())
        )

    if not (username and api_url and apikey):
        raise click.ClickException(u'Missing username, api_url or api_key.')

    data = load_json_files(cfg)
    if files:
        data = [(name, payload) for name, payload in data if name in files]

    # Category rules:
    #
    # 1. Category set in the configuration: a json file either has no
    #    category or has that very category.
    #
    # 2. No category in the configuration: every json file must name
    #    a category (they may differ between files).
    server_categories = {}
    for entry in steve.richardapi.get_all_categories(api_url):
        server_categories[entry['title']] = entry

    try:
        category = cfg.get('project', 'category').strip()
    except ConfigParser.NoOptionError:
        category = None
    else:
        if category not in server_categories:
            raise click.ClickException(
                u'Category "{0}" does not exist on server. Build it there '
                u'first.'.format(category)
            )
        click.echo('Category {0} exists on site.'.format(category))

    # Gather every problem before bailing out, so all are reported at once.
    problems = []
    for name, payload in data:
        file_cat = payload.get('category')

        if category is not None:
            # Rule 1: a category in the file has to match the config.
            if file_cat is not None and str(file_cat).strip() != category:
                problems.append(
                    u'Category set in configuration ({0}), but {1} has '
                    u'different category ({2}).'.format(
                        category, name, file_cat)
                )
            continue

        # Rule 2: the file has to carry a clean, known category.
        if not file_cat:
            problems.append(
                u'No category set in configuration and {0} has no '
                u'category set.'.format(name)
            )
        elif file_cat != file_cat.strip():
            problems.append(
                u'Category "{0}" has whitespace at beginning or '
                u'end.'.format(file_cat)
            )
        elif file_cat not in server_categories:
            problems.append(
                u'Category "{0}" does not exist on server. '
                u'Build it there first.'.format(file_cat)
            )

    if update:
        # Updating needs an id in every file.
        problems.extend(
            u'id not in contents for "{0}".'.format(name)
            for name, payload in data
            if 'id' not in payload
        )

    if problems:
        raise click.ClickException('\n'.join(problems))

    # Show what is about to happen and ask before touching the server.
    summary = (
        'Pushing to: {0}'.format(api_url),
        'Username: {0}'.format(username),
        'api_key: {0}'.format(apikey),
        'update?: {0}'.format(update),
        '# videos: {0}'.format(len(data)),
        'Once you push, you can not undo it. Push for realz? Y/N',
    )
    for line in summary:
        click.echo(line)

    answer = raw_input().strip().lower()
    if not answer.startswith('y'):
        raise click.Abort()

    for name, payload in data:
        payload['category'] = category or payload.get('category')

        if update:
            click.echo('Updating {0} "{1}" ({2})'.format(
                payload['id'], payload['title'], name))
            try:
                steve.richardapi.update_video(
                    api_url, apikey, payload['id'], payload)
            except steve.restapi.RestAPIException as exc:
                click.echo(' Error?: {0}'.format(exc), err=True)
                click.echo(' "{0}"'.format(exc.response.content), err=True)
        else:
            if 'id' in payload:
                if not overwrite:
                    # Already pushed once: leave the file alone entirely.
                    click.echo(u'Skipping... already exists.')
                    continue
                # A stale id causes problems when creating, so drop it.
                del payload['id']

            click.echo('Pushing {0}'.format(name))
            try:
                created = steve.richardapi.create_video(
                    api_url, apikey, payload)
                if 'id' in created:
                    payload['id'] = created['id']
                    click.echo(' Now has id {0}'.format(created['id']))
                else:
                    click.echo(' Errors?: {0}'.format(created), err=True)
            except steve.restapi.RestAPIException as exc:
                click.echo(' Error?: {0}'.format(exc), err=True)
                click.echo(' "{0}"'.format(exc.response.content), err=True)

        # Written back in both modes, also after a reported error.
        save_json_file(cfg, name, payload)