def debug_print(self, obj):
    """
    Log a one-line completion message for a crawled object.

    :param obj: mapping with an ``'id'`` entry and, optionally, a ``'name'`` entry
    :return: None
    """
    pieces = [u"spider {} complete, id: {}".format(self.__file_type__, obj['id'])]
    # The name is optional, so it is only reported when present.
    if 'name' in obj:
        pieces.append(u", name: {}".format(obj['name']))
    log.print_info(u"".join(pieces))
def spls(self):
    """
    Crawl every playlist whose id appears in the comma-separated ``playlist`` argument.

    Ids for which ``api.get_playlist_detail`` returns nothing are skipped.

    :return: None
    """
    from NXSpider.bin.models import playlist_mo

    if self.param_check(['playlist'], sys._getframe().f_code.co_name) is False:
        return

    download_type = self.parse_download()
    playlist_ids = self.app.pargs.playlist.split(',')  # type: list

    for playlist_id in playlist_ids:
        detail = api.get_playlist_detail(playlist_id)
        if not detail:
            continue

        log.print_info(u"<{}> author:{}".format(
            detail['name'],
            detail['creator']['nickname'],
        ))
        # Config is consulted again for every playlist, as before.
        file_check = Config().get_file_check()
        shortcuts_stack = [] if Config().get_shortcut() else None
        playlist_mo.parse_model(
            detail,
            download_type=download_type,
            file_check=file_check,
            shortcuts_stack=shortcuts_stack)

    log.print_info("spider complete!~")
def sar_top_mp3(self):
    """
    Crawl the hot songs of every artist id in the comma-separated ``artist`` argument.

    The ``hotSongs`` list of each API response is attached to the artist
    record under the ``'mp3'`` key before the record is handed to the model.

    :return: None
    """
    from NXSpider.bin.models import artist_mo

    if self.param_check(['artist'], sys._getframe().f_code.co_name) is False:
        return

    download_type = self.parse_download()
    artist_ids = self.app.pargs.artist.split(',')  # type: list

    for artist_id in artist_ids:
        response = api.get_artists_songs(artist_id)
        if response is not None:
            artist = response['artist']
            artist['mp3'] = response['hotSongs']

            log.print_info(u"<{}>".format(artist['name']))
            file_check = Config().get_file_check()
            shortcuts_stack = [] if Config().get_shortcut() else None
            artist_mo.parse_model(
                artist,
                download_type=download_type,
                file_check=file_check,
                shortcuts_stack=shortcuts_stack)

    log.print_info("spider complete!~")
def scls_pls(self):
    """
    Crawl the top playlists of one playlist category.

    Reads ``cls``, ``offset`` and ``limit`` from the parsed arguments;
    ``offset`` falls back to 0 and ``limit`` to 50 when they are empty.

    :return: None
    """
    from NXSpider.bin.models import playlist_mo

    if self.param_check(['cls'], sys._getframe().f_code.co_name) is False:
        return

    download_type = self.parse_download()
    pargs = self.app.pargs
    category = py2_decoding(pargs.cls)

    # The catch-all category is accepted without consulting api.ALL_CLASSES.
    is_known = category == u"全部" or py2_encoding(category) in api.ALL_CLASSES
    if not is_known:
        log.print_err(
            "class name is wrong, pls check by run : nxspider sw-pl-classes"
        )
        return

    top_playlists = api.get_top_playlists(
        category=category,
        offset=pargs.offset or 0,
        limit=pargs.limit or 50)  # type: list

    for entry in top_playlists:
        detail = api.get_playlist_detail(entry['id'])
        if not detail:
            continue
        log.print_info(u"<{}> author:{}".format(
            detail['name'],
            detail['creator']['nickname'],
        ))
        playlist_mo.parse_model(detail,
                                download_type=download_type,
                                file_check=Config().get_file_check())

    log.print_info("spider complete!~")
def download_file(self, doc):
    """
    Fetch the content for ``doc`` and store it under the configured path.

    :param doc: record passed through to ``download_relative_path``,
        ``request_file``, ``download_file_tag`` and ``download_log``
    :return: True when the file was written, False when the request
        returned nothing or saving raised
    """
    relative_path = self.download_relative_path(doc)
    base_path = Config().get_path()

    payload = self.request_file(doc)
    if payload is None:
        log.print_err(u"file download failed : %s" % relative_path)
        return False

    try:
        target = os.path.join(base_path, relative_path)

        # make sure the parent directory exists
        parent = os.path.dirname(target)
        if not os.path.exists(parent):
            os.makedirs(parent)

        # write the raw bytes
        with open(target, "wb") as out:
            out.write(payload)

        self.download_file_tag(target, doc)
        log.print_info(u"file download complete: %s" % relative_path)
        self.download_log(doc)
    except Exception as e:
        log.print_err("file save failed : %s, err: %s" % (relative_path, e))
        return False
    return True
def sab(self):
    """
    Crawl every album whose id appears in the comma-separated ``album`` argument.

    Ids for which ``api.get_album_detail`` returns None are skipped.

    :return: None
    """
    from NXSpider.bin.models import album_mo

    if self.param_check(['album'], sys._getframe().f_code.co_name) is False:
        return

    download_type = self.parse_download()
    album_ids = self.app.pargs.album.split(',')  # type: list

    for album_id in album_ids:
        album = api.get_album_detail(album_id)
        if album is not None:
            title = "<" + album['name'] + ">"
            log.print_info(u"{} artist:{}".format(title, album['artist']['name']))

            file_check = Config().get_file_check()
            shortcuts_stack = [] if Config().get_shortcut() else None
            album_mo.parse_model(
                album,
                download_type=download_type,
                file_check=file_check,
                shortcuts_stack=shortcuts_stack)

    log.print_info("spider complete!~")
def search(self):
    """
    Search by exactly one of the types listed in ``search_types`` and print the hits.

    The type is taken from whichever search argument is set; ``'mp3'`` is
    the fallback key. ``offset`` falls back to 0 and ``limit`` to 50.

    :return: None
    """
    search_key = 'mp3'
    key_num = 0
    for k in search_types:
        if getattr(self.app.pargs, k, None):
            search_key = k
            key_num += 1

    if key_num > 1:
        log.print_err("it could search by only one type")
        # Stop here: previously the search still ran with whichever key
        # happened to be iterated last.
        return

    # input must be decoded in python2
    search_value = py2_decoding(getattr(self.app.pargs, search_key))

    res = api.search(search_value,
                     stype=search_key,
                     offset=self.app.pargs.offset or 0,
                     limit=self.app.pargs.limit or 50)
    if not res:
        log.print_info("nothing found!")
        return

    if search_key in PRINT_ATTR_FUNC_MAP:
        # Each map entry is (key inside the result, printer function).
        result_key, printer = PRINT_ATTR_FUNC_MAP[search_key][0], PRINT_ATTR_FUNC_MAP[search_key][1]
        printer(res.get(result_key, []))
# login_smv: requires the `lu` argument, takes the password from `lp` or prompts
# for it with getpass, then fetches api.my_mvs(session), resolves each entry with
# api.get_mv_detail and hands every non-empty detail to no_rec_mv_mo.parse_model
# with download_type=['mv'].
# NOTE(review): the span between the getpass prompt and `exit()` is masked in this
# copy (the `******` run), so the login call, the origin of `session` and the
# failure handling are not visible here; restore the original text before editing.
def login_smv(self): from NXSpider.bin.models import no_rec_mv_mo if self.param_check(['lu'], sys._getframe().f_code.co_name) is False: return plaintext_pwd = self.app.pargs.lp or None if plaintext_pwd is None: import getpass plaintext_pwd = getpass.getpass("Please input your password:"******"none"))) exit() mvs = api.my_mvs(session) mvs = [api.get_mv_detail(d['id']) for d in mvs] mvs = [d for d in mvs if d] for mv in mvs: no_rec_mv_mo.parse_model( mv, download_type=['mv'], file_check=Config().get_file_check(), shortcuts_stack=[] if Config().get_shortcut() else None) log.print_info("spider complete!~") pass
def config_check(self):
    """
    Show the current configuration, run ``Config().config_test()`` and report the outcome.

    A falsy result from ``config_test()`` is now reported as a failure;
    previously only an exception produced the failure message.

    :return: None
    """
    self.config_show()
    try:
        passed = Config().config_test()
    except (Exception, SystemExit):
        # NOTE(review): SystemExit is still caught because the original bare
        # ``except`` caught it; the check presumably may exit while probing the
        # database connection - confirm. KeyboardInterrupt now propagates.
        passed = False

    if passed:
        log.print_info('config check complete, all is well done!')
    else:
        log.print_err('config check failed, pls re config')
def config_spider(self):
    """
    Apply the spider-related command-line options to the stored configuration.

    Handles ``path_download``, ``mv_resolution``, ``media_tag`` and
    ``media_tag_163``. The configuration file is saved only when at least
    one option was applied; the resulting configuration is always shown.

    :return: None
    :raises Exception: any error raised while parsing the options is logged
        and re-raised
    """
    config = Config()
    config_dict = config.config  # type: dict
    pargs = self.app.pargs
    is_config = False
    try:
        if pargs.path_download is not None:
            paths = pargs.path_download.split(',')  # type: list
            # Swap the first default-path placeholder for the real default directory.
            if default_path_key in paths:
                paths[paths.index(default_path_key)] = default_download_dir

            final_paths = []
            for p in paths:
                try:
                    # best effort: a path that cannot be created is dropped
                    if os.path.isdir(p) is False:
                        os.mkdir(p)
                    final_paths.append(p)
                except Exception:
                    log.print_warn("path may be wrong and be deleted: {}".format(p))

            if not final_paths:
                final_paths.append(default_download_dir)

            log.print_info('path will be set as: ' + ','.join(final_paths))
            config_dict['download_path'] = final_paths
            is_config = True

        if pargs.mv_resolution is not None:
            r = int(pargs.mv_resolution)
            if r in mv_resolutions:
                config_dict['mv_def_resolution'] = r
                is_config = True
            else:
                # Really skip the value: it used to be stored despite this warning.
                log.print_warn("-mvr resolution config skip, value must be 240,480,720,1080")

        if pargs.media_tag is not None:
            config_dict['media_tag'] = (pargs.media_tag.lower() == 'true'
                                        or pargs.media_tag == '1')
            is_config = True

        if pargs.media_tag_163 is not None:
            config_dict['media_tag_163'] = (pargs.media_tag_163.lower() == 'true'
                                            or pargs.media_tag_163 == '1')
            is_config = True
    except Exception:
        log.print_err("input error, pls check")
        raise

    if is_config:
        config.save_config_file()
        log.print_info("config success")
    self.config_show()
def playlist_by_id(link, download_type=all_download_type,
                   save=True, file_check=True):
    """
    Fetch one playlist and pass it to a fresh ``Playlist`` model.

    :param link: playlist identifier passed to ``get_playlist_detail``
    :param download_type: forwarded to ``Playlist.parse_model``
    :param save: forwarded to ``Playlist.parse_model``
    :param file_check: forwarded to ``Playlist.parse_model``
    :return: None
    """
    detail = get_playlist_detail(link)

    # Logging is best effort: any error raised while building the line is ignored.
    with tools.ignored(Exception):
        title = "<" + detail['name'] + ">"
        log.print_info("%s author:%s" % (title, detail['creator']['nickname']))

    model = Playlist()
    model.parse_model(detail,
                      save=save,
                      download_type=download_type,
                      file_check=file_check)
def stop_mvs(self):
    """
    Crawl the top MVs.

    ``offset`` falls back to 0 and ``limit`` to 50. All details are fetched
    first; entries without a detail are dropped before anything is parsed.

    :return: None
    """
    from NXSpider.bin.models import no_rec_mv_mo

    pargs = self.app.pargs
    listing = api.top_mvs(offset=pargs.offset or 0,
                          limit=pargs.limit or 50)

    details = []
    for entry in listing:
        detail = api.get_mv_detail(entry['id'])
        if detail:
            details.append(detail)

    for detail in details:
        file_check = Config().get_file_check()
        shortcuts_stack = [] if Config().get_shortcut() else None
        no_rec_mv_mo.parse_model(
            detail,
            download_type=['mv'],
            file_check=file_check,
            shortcuts_stack=shortcuts_stack)

    log.print_info("spider complete!~")
def attach_media_idv3_by_db():
    """
    Attach media tags to the files of every record flagged ``downloaded=True``.

    For each ``(suffix, (model, driver))`` entry of ``media_types`` the
    records are loaded from ``model`` and located on disk through
    ``driver.download_check``; records without a file are skipped.

    :return: None
    """
    # media_types is consumed as (suffix, setting) pairs. Iterating a mapping
    # directly would yield bare keys and break the unpacking below, so use
    # .items() when it is available; a plain sequence of pairs still works.
    if hasattr(media_types, 'items'):
        entries = media_types.items()
    else:
        entries = media_types

    for suffix, type_setting in entries:
        model, driver = type_setting  # type: Mp3Model or Mp4Model, Music163Obj
        for obj in model.objects(downloaded=True):
            file_path = driver.download_check(obj, check_file=True)
            if not file_path:
                continue

            # Paths that are not latin-1 go through the shadow helper instead.
            if is_latin1(file_path):
                res = attach_media_tag(obj, file_path)
            else:
                res = attach_shadow(file_path, suffix, obj)

            log.print_info('idv3 attach %s, file: %s' %
                           ('success' if res else 'failed', file_path))
def request_file(doc):
    """
    Download the content behind the download url stored on ``doc``.

    :param doc: object whose ``model_download_url`` attribute holds the url
    :return: the response body, or None when there is no url, the status is
        not 200, or the request raised
    :rtype: bytes
    """
    # (connect, read) timeouts in seconds. Without a timeout requests.get can
    # block forever on a stalled server; the read timeout limits the gap
    # between received chunks, not the total transfer time.
    timeout = (10, 60)
    url = None
    try:
        url = getattr(doc, model_download_url, None)
        if url is None:
            return None
        r = requests.get(url, timeout=timeout)
        if r.status_code != 200:
            return None
        return r.content
    except Exception as e:
        log.print_info(u"url download failed %s , err: %s" % (url, e))
        return None
def smp3s(self):
    """
    Crawl every song whose id appears in the comma-separated ``mp3`` argument.

    All ids are resolved with a single ``api.get_mp3_details`` call.

    :return: None
    """
    from NXSpider.bin.models import dw_mp3_mo

    if self.param_check(['mp3'], sys._getframe().f_code.co_name) is False:
        return

    download_type = self.parse_download()
    song_ids = self.app.pargs.mp3.split(',')  # type: list
    details = api.get_mp3_details(song_ids)

    for _song_id, song in details.items():
        log.print_info(u"<{}>".format(song['name']))
        file_check = Config().get_file_check()
        shortcuts_stack = [] if Config().get_shortcut() else None
        dw_mp3_mo.parse_model(
            song,
            download_type=download_type,
            file_check=file_check,
            shortcuts_stack=shortcuts_stack)

    log.print_info("spider complete!~")
def try_download(self, doc, download_type, file_check):
    """
    Download the file for ``doc`` when this file type was requested and it is missing.

    :param doc: record to download; its download url may be refreshed in place
    :param download_type: collection of requested file types, or None
    :param file_check: forwarded to ``download_check`` as ``check_file``
    :return: True when this file type is not requested (including
        ``download_type`` being None) or when the download through the
        already stored url succeeded; None in every other case
    """
    # Test for None before the membership check: the check used to run first
    # and raised TypeError on None, making the later None test unreachable.
    if download_type is None or self.__file_type__ not in download_type:
        return True

    if not self.download_check(doc, check_file=file_check):
        # need download: try the url which is already set first
        if getattr(doc, model_download_url, None) \
                and self.download_file(doc):
            return True

        # otherwise load a fresh download link and retry once
        doc[model_download_url] = self.url_load(doc)
        self.download_file(doc)
    else:
        name = self.download_relative_path(doc)
        if name:
            log.print_info(
                u"file is exist or is not need to download : %s" % name)
def sur_pls(self):
    """
    Crawl all playlists of the user given by the ``user`` argument.

    A table of the playlists is printed before crawling starts.

    :return: None
    """
    from NXSpider.bin.models import playlist_mo

    if self.param_check(['user'], sys._getframe().f_code.co_name) is False:
        return

    download_type = self.parse_download()
    playlists = api.user_playlist(self.app.pargs.user)

    from terminaltables import AsciiTable
    table = AsciiTable([["ID", "Name", "User", "PlayCount"]])
    rows = []
    for item in playlists:
        rows.append([
            str(item['id']),
            item['name'],
            item['creator']['nickname'],
            str(item['playCount']),
        ])
    table.table_data.extend(rows)

    log.print_info("playlists bellow will be crawled")
    print(table.table)

    for entry in playlists:
        detail = api.get_playlist_detail(entry['id'])
        if not detail:
            continue
        log.print_info(u"<{}> author:{}".format(
            detail['name'],
            detail['creator']['nickname'],
        ))
        playlist_mo.parse_model(detail,
                                download_type=download_type,
                                file_check=Config().get_file_check())

    log.print_info("spider complete!~")
def attach_media_tag_by_path(path):
    """
    Walk ``path`` and attach media tags to every file named ``<artist> - <rest>.<suffix>``.

    Files whose suffix is not in ``media_types`` or whose name lacks the
    ``' - '`` separator are skipped.

    Original author's note: run this under python3 on Windows, because
    os.walk has an encoding bug there.

    :type path: str
    :param path: root directory to scan
    :return: None
    """
    separator = ' - '
    for root, _dirs, names in os.walk(path):
        for name in names:  # type: str
            # text after the last dot (the whole name when there is no dot)
            suffix = name.rsplit('.', 1)[-1]
            if suffix not in media_types:
                continue

            artist, found, _rest = name.partition(separator)
            if not found:
                continue

            obj = load_media_obj(suffix, os.path.join(artist, name))
            file_path = os.path.join(root, name)

            # An earlier is_latin1 / attach_shadow rename workaround was
            # dropped by the original author in favour of mutagen-based tagging.
            res = attach_media_tag(obj, file_path)
            log.print_info('idv3 attach %s, file: %s' %
                           ('success' if res else 'failed', file_path))
def sur_pls(self):
    """
    Crawl the playlists of the user given by the ``user`` argument.

    ``offset`` falls back to 0 and ``limit`` to 50. The playlists are listed
    with ``print_playlist`` before crawling starts.

    :return: None
    """
    from NXSpider.bin.models import playlist_mo

    if self.param_check(['user'], sys._getframe().f_code.co_name) is False:
        return

    download_type = self.parse_download()
    pargs = self.app.pargs
    playlists = api.user_playlist(pargs.user,
                                  offset=pargs.offset or 0,
                                  limit=pargs.limit or 50)

    log.print_info("playlists bellow will be crawled")
    print_playlist(playlists)

    for entry in playlists:
        detail = api.get_playlist_detail(entry['id'])
        if not detail:
            continue
        log.print_info(u"<{}> author:{}".format(
            detail['name'],
            detail['creator']['nickname'],
        ))
        file_check = Config().get_file_check()
        shortcuts_stack = [] if Config().get_shortcut() else None
        playlist_mo.parse_model(
            detail,
            download_type=download_type,
            file_check=file_check,
            shortcuts_stack=shortcuts_stack)

    log.print_info("spider complete!~")
def config_mongo(self):
    """
    Apply the mongo-related command-line options to the stored configuration.

    Handles ``mhost``, ``mport``, ``muser``, ``mpassword``, ``mdbname`` and
    ``nomongo``. Giving ``mhost`` also switches ``no_mongo`` off. The
    configuration file is saved only when at least one option was applied;
    the resulting configuration is always shown.

    :return: None
    """
    config = Config()
    settings = config.config  # type: dict
    mongo_key = 'mongo'
    pargs = self.app.pargs
    changed = False

    # (argument name, key inside the mongo section, converter or None)
    plain_fields = (
        ('mhost', 'host', None),
        ('mport', 'port', int),
        ('muser', 'username', None),
        ('mpassword', 'password', None),
        ('mdbname', 'name', None),
    )

    try:
        for arg_name, field, convert in plain_fields:
            raw = getattr(pargs, arg_name)
            if raw is None:
                continue
            settings[mongo_key][field] = convert(raw) if convert else raw
            if arg_name == 'mhost':
                settings['no_mongo'] = False
            changed = True

        # processed last, so an explicit nomongo flag wins over mhost
        if pargs.nomongo is not None:
            settings['no_mongo'] = (pargs.nomongo.lower() == 'true'
                                    or pargs.nomongo == '1')
            changed = True
    except:
        log.print_err("input error, pls check")
        raise

    if changed:
        config.save_config_file()
        log.print_info("config success")
    self.config_show()
def crawl_playlist_by_page(page, dtype="全部", download_type=None,
                           save=True, file_check=True):
    """
    Crawl one page (35 entries) of the hot-playlist listing and parse every playlist on it.

    :param page: zero-based page number; the request offset is ``page * 35``
    :param dtype: playlist category put into the ``cat`` query parameter
    :param download_type: file types to download; defaults to ``['mp3', 'mv']``
    :param save: forwarded to ``Playlist.parse_model``
    :param file_check: forwarded to ``Playlist.parse_model``
    :return: titles of the playlists found on the page (this list used to be
        returned empty because the titles were never collected)
    :raises Exception: any error while crawling is logged and re-raised
    """
    # A fresh list per call instead of a shared mutable default argument.
    if download_type is None:
        download_type = ['mp3', 'mv']

    play_url = "http://music.163.com/discover/playlist/?order=hot&cat={}&limit=35&offset={}"
    play_url = play_url.format(dtype, page * 35)
    titles = []
    try:
        acmsk = {'class': 'msk'}
        dcu = {'class': 'u-cover u-cover-1'}
        ucm = {'class': 'm-cvrlst f-cb'}

        data = tools.curl(play_url, headers, type=RETURE_HTML)
        lst = data.find('ul', ucm)
        for play in lst.find_all('div', dcu):
            anchor = play.find('a', acmsk)
            titles.append(anchor['title'])
            link = anchor['href'].replace("/playlist?id=", "")

            playlist_detail = get_playlist_detail(link)
            # Logging is best effort: errors while building the line are ignored.
            with tools.ignored(Exception):
                log.print_info("%s author:%s" % (
                    "<" + playlist_detail['name'] + ">",
                    tools.encode(playlist_detail['creator']['nickname']),
                ))

            playlist_mo = Playlist()
            playlist_mo.parse_model(playlist_detail,
                                    save=save,
                                    download_type=download_type,
                                    file_check=file_check)

        return titles
    except Exception as e:
        log.print_err("crawl html error:{} type:{} page:{}".format(
            e, dtype, page))
        raise
# login_spls: requires the `lu` and `lp` arguments, takes the password from `lp`
# or prompts for it with getpass, reads the user id from res['account']['id'],
# lists that user's playlists (offset default 0, limit default 1000) and hands
# every non-empty playlist detail to playlist_mo.parse_model.
# NOTE(review): the span between the getpass prompt and `exit()` is masked in this
# copy (the `******` run), so the login call, the origin of `res` and the failure
# handling are not visible here; restore the original text before editing.
# NOTE(review): `lp` is listed as required in param_check, which presumably makes
# the getpass fallback unreachable - confirm against param_check.
def login_spls(self): if self.param_check(['lu', 'lp'], sys._getframe().f_code.co_name) is False: return from NXSpider.bin.models import playlist_mo plaintext_pwd = self.app.pargs.lp or None if plaintext_pwd is None: import getpass plaintext_pwd = getpass.getpass("Please input your password:"******"none"))) exit() user_id = res['account']['id'] download_type = self.parse_download() playlists = api.user_playlist(user_id, offset=self.app.pargs.offset or 0, limit=self.app.pargs.limit or 1000) log.print_info("playlists bellow will be crawled") print_playlist(playlists) for pl_obj in playlists: playlist_detail = api.get_playlist_detail(pl_obj['id']) if playlist_detail: log.print_info(u"<{}> author:{}".format( playlist_detail['name'], playlist_detail['creator']['nickname'], )) playlist_mo.parse_model( playlist_detail, download_type=download_type, file_check=Config().get_file_check(), shortcuts_stack=[] if Config().get_shortcut() else None) log.print_info("spider complete!~") pass
def sar_albums(self):
    """
    Crawl the albums of every artist id in the comma-separated ``artist`` argument.

    ``offset`` falls back to 0 and ``limit`` to 50. For each artist the
    resolved albums are printed as a table before the artist record is
    parsed; albums without a detail are dropped.

    :return: None
    """
    from NXSpider.bin.models import artist_album_mo

    if self.param_check(['artist'], sys._getframe().f_code.co_name) is False:
        return

    download_type = self.parse_download()
    artist_ids = self.app.pargs.artist.split(',')  # type: list

    for artist_id in artist_ids:
        response = api.get_artist_album(artist_id,
                                        offset=self.app.pargs.offset or 0,
                                        limit=self.app.pargs.limit or 50)
        if response is None:
            continue

        artist = response['artist']
        fetched = [api.get_album_detail(entry['id'])
                   for entry in response['hotAlbums']]
        artist['albums'] = [album for album in fetched if album]

        from terminaltables import AsciiTable
        table = AsciiTable([["ID", "Album", "Artist", "ArtistID"]])
        rows = []
        for album in artist['albums']:
            rows.append([
                str(album['id']),
                album['name'],
                ','.join([ar['name'] for ar in album['artists']]),
                ','.join([str(ar['id']) for ar in album['artists']]),
            ])
        table.table_data.extend(rows)

        log.print_info(u"<{}>".format(artist['name']))
        log.print_info("albums bellow will be crawled")
        print(table.table)

        artist_album_mo.parse_model(artist,
                                    download_type=download_type,
                                    file_check=Config().get_file_check())

    log.print_info("spider complete!~")
def sar_albums(self):
    """
    Crawl the albums of every artist id in the comma-separated ``artist`` argument.

    ``offset`` falls back to 0 and ``limit`` to 50. For each artist the
    resolved albums are listed with ``print_albums`` before the artist
    record is parsed; albums without a detail are dropped.

    :return: None
    """
    from NXSpider.bin.models import artist_album_mo

    if self.param_check(['artist'], sys._getframe().f_code.co_name) is False:
        return

    download_type = self.parse_download()
    artist_ids = self.app.pargs.artist.split(',')  # type: list

    for artist_id in artist_ids:
        response = api.get_artist_album(artist_id,
                                        offset=self.app.pargs.offset or 0,
                                        limit=self.app.pargs.limit or 50)
        if response is None:
            continue

        artist = response['artist']
        fetched = [api.get_album_detail(entry['id'])
                   for entry in response['hotAlbums']]
        artist['albums'] = [album for album in fetched if album]

        log.print_info(u"<{}>".format(artist['name']))
        log.print_info("albums bellow will be crawled")
        print_albums(artist['albums'])

        file_check = Config().get_file_check()
        shortcuts_stack = [] if Config().get_shortcut() else None
        artist_album_mo.parse_model(
            artist,
            download_type=download_type,
            file_check=file_check,
            shortcuts_stack=shortcuts_stack)

    log.print_info("spider complete!~")
def config_clear(self):
    """
    Reset the configuration through ``Config.config_reset`` and show the result.

    :return: None
    """
    config = Config()
    config.config_reset()
    log.print_info("config has been reset, u need re-config from beginning pls")
    self.config_show()
def config_show(self):
    """
    Print the current configuration as indented JSON.

    Non-ASCII characters are written as-is (``ensure_ascii=False``).

    :return: None
    """
    current = Config().config
    log.print_info("config will be show fellow:")
    rendered = json.dumps(current, ensure_ascii=False, indent=1)
    print(rendered)
def search(self):
    """
    Search by exactly one of the types listed in ``search_types`` and print a result table.

    The type is taken from whichever search argument is set; ``'mp3'`` is
    the fallback key. ``offset`` falls back to 0 and ``limit`` to 50.

    :return: None
    """
    search_key = 'mp3'
    key_num = 0
    for k in search_types:
        if getattr(self.app.pargs, k, None):
            search_key = k
            key_num += 1

    if key_num > 1:
        log.print_err("it could search by only one type")
        # Stop here: previously the search still ran with whichever key
        # happened to be iterated last.
        return

    # input must be decoded in python2
    search_value = py2_decoding(getattr(self.app.pargs, search_key))

    res = api.search(search_value,
                     stype=search_key,
                     offset=self.app.pargs.offset or 0,
                     limit=self.app.pargs.limit or 50)
    if not res:
        log.print_info("nothing found!")
        # Return here: the code used to fall through and run the membership
        # tests below against the empty result.
        return

    def artist_names(item):
        return ','.join([ar['name'] for ar in item['artists']])

    def artist_ids(item):
        return ','.join([str(ar['id']) for ar in item['artists']])

    # search type -> (key inside the result, table header, row builder)
    layouts = {
        'mp3': (
            'songs',
            ["ID", "Name", "Album", "AlbumID", "Artist", "ArtistID"],
            lambda item: [
                str(item['id']),
                item['name'],
                item['album']['name'],
                item['album']['id'],
                artist_names(item),
                artist_ids(item),
            ],
        ),
        'playlist': (
            'playlists',
            ["ID", "Name", "User", "PlayCount", "FavoriteCount"],
            lambda item: [
                str(item['id']),
                item['name'],
                item['creator']['nickname'],
                str(item['playCount']),
                str(item['bookCount']),
            ],
        ),
        'user': (
            'userprofiles',
            ["ID", "Name", "Signature"],
            lambda item: [
                str(item['userId']),
                item['nickname'],
                item['signature'],
            ],
        ),
        'artist': (
            'artists',
            ["ID", "Name", "AlbumNum", "MVNum"],
            lambda item: [
                str(item['id']),
                item['name'],
                str(item['albumSize']),
                str(item['mvSize']),
            ],
        ),
        'album': (
            'albums',
            ["ID", "Album", "Artist", "ArtistID"],
            lambda item: [
                str(item['id']),
                item['name'],
                artist_names(item),
                artist_ids(item),
            ],
        ),
        'mv': (
            'mvs',
            ["ID", "Name", "Artist", "ArtistID", "Duration", "PlayCount"],
            lambda item: [
                str(item['id']),
                item['name'],
                item['artistName'],
                item['artistId'],
                # duration is divided by 1000 and shown as mm:ss
                '%02d:%02d' % divmod(int(item['duration'] / 1000), 60),
                item['playCount'],
            ],
        ),
    }

    layout = layouts.get(search_key)
    if layout is None or layout[0] not in res:
        log.print_err('nothing found')
        return

    result_key, header, build_row = layout
    table = AsciiTable([header])
    table.table_data.extend([build_row(item) for item in res[result_key]])
    print(table.table)
def config_test(self):
    """
    Validate the loaded configuration, repairing what can be repaired.

    Checks the mongo section (only when ``no_mongo`` is False), the presence
    of the required keys, the type of each known key, the download paths and
    the MV resolution. Values of the wrong type and an unknown resolution
    are reset to their defaults, and download paths that cannot be created
    are dropped; the file is saved when anything was changed.

    :return: True when every check passed, False when any check failed or an
        exception was raised
    """
    result = True
    try:
        # check mongodb config
        if self.config['no_mongo'] is False:
            log.print_info('check mongodb config')
            mongo = self.config['mongo']
            for k in ['name', 'host', 'port']:
                if k not in mongo:
                    log.print_err(
                        "mongo config error, key mongo.{} is not set yet".
                        format(k))
                    result = False

            # try import model, which will connect to server and exit if server config wrong
            import NXSpider.model.mongo_model

        for k in [
                'download_path', 'mv_def_resolution', 'media_tag',
                'media_tag_163'
        ]:
            if k not in self.config:
                log.print_err(
                    "config error, key {} is not set yet".format(k))
                result = False

        # check type
        type_check = {
            'download_path': list,
            'mv_def_resolution': int,
            'media_tag': bool,
            'media_tag_163': bool,
            'download_file_check': bool,
            'no_mongo': bool,
        }

        need_save = False
        for k, v in type_check.items():
            # NOTE(review): a key missing from self.config raises KeyError
            # here and ends the whole test through the outer handler.
            if not isinstance(self.config[k], v):
                log.print_err("config error, {} is not a require type, "
                              "and is reset to default value: {}".format(
                                  k, self.default_config[k]))
                self.config[k] = self.default_config[k]
                need_save = True
                result = False

        # download path check
        final_paths = []
        for p in self.config['download_path']:
            try:
                # best effort: a path that cannot be created is dropped
                if os.path.isdir(p) is False:
                    os.mkdir(p)
                final_paths.append(p)
            except Exception:
                # Narrowed from a bare except so that Ctrl-C / SystemExit are
                # no longer mistaken for a bad path.
                log.print_warn(
                    "download path may be wrong and be deleted: {}".format(
                        p))
                need_save = True
                result = False

        # mv resolution check
        if self.config['mv_def_resolution'] not in mv_resolutions:
            log.print_warn(
                "mv_def_resolution will be reset to default: {}".format(
                    self.default_config['mv_def_resolution']))
            self.config['mv_def_resolution'] = self.default_config[
                'mv_def_resolution']
            need_save = True
            result = False

        if need_save:
            self.config['download_path'] = final_paths
            self.save_config_file()

        return result
    except Exception as e:
        log.print_err(e)
        return False