def before_finish(uuid, username, address, start_time, ts, *args):
    if BEFORE_DONE:
        func_name = args[0].__class__.__full_name__
        msg = "Before Finish << {} >> username: {} -|- "
        msg += "func: {} -- address: {} -- args: {}"
        msg = msg.format(uuid, username, func_name, address, args[1:])
        toLog(msg, 'request')
def checking(*args, **kwargs):
    """ Exception Handler """
    try:
        return cls(*args, **kwargs)
    except Exception as e:
        toLog(traceback.format_exc(), 'error')
        msg = str(e)
        if 'run()' in msg:
            name = args[0].__full_name__
            msg = msg.replace('run()', "Function {0}()".format(name))
            toLog(msg, 'error')
        try:
            error = ErrorGeneratorFromRaise(
                msg, e.type_error).generateException()
            return error
        except Exception:
            error = GeneralError(msg)
            return error
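# Note: checking() closes over "cls", so it presumably lives inside a class
# decorator. A minimal sketch of that enclosing shape, under that assumption
# (exception_handler is a hypothetical name, not taken from this codebase):
#
#     def exception_handler(cls):
#         def checking(*args, **kwargs):
#             ...  # body as above
#         return checking
#
#     @exception_handler
#     class SomeService(object):
#         pass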
def fetch_sp(self):
    while True:
        try:
            time.sleep(1.5)
            sp = gen_sp()
            return sp
        except Exception:
            toLog(traceback.format_exc(), 'error')
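# gen_sp() is referenced but not defined here; a minimal sketch, assuming the
# spotipy client-credentials flow used elsewhere in this codebase
# (SPOTIFY_CLIENT_ID / SPOTIFY_CLIENT_SECRET are hypothetical setting names):
#
#     import spotipy
#     from spotipy.oauth2 import SpotifyClientCredentials
#
#     def gen_sp():
#         credentials = SpotifyClientCredentials(
#             client_id=SPOTIFY_CLIENT_ID,
#             client_secret=SPOTIFY_CLIENT_SECRET
#         )
#         return spotipy.Spotify(client_credentials_manager=credentials)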
def run(self):
    while True:
        func, args, kargs = self.tasks.get()
        try:
            func(*args, **kargs)
        except Exception as e:
            toLog('Thread Error: %s' % e, 'error')
        self.tasks.task_done()
def soundcloud_update():
    toLog("Start updating soundcloud", 'jobs')
    less_today = datetime.datetime.now().replace(hour=2, minute=30, second=0)
    _criteria = {
        '$or': [
            {'update_track_data': {'$lte': less_today}},
            {'update_track_data': {'$exists': False}}
        ]
    }
    projection = {'id': 1}
    all_tracks = cursor_soundcloud.refined_data.find(
        _criteria, projection, no_cursor_timeout=True
    )
    count = 1
    print all_tracks.count()
    print datetime.datetime.now()
    for track in all_tracks:
        time.sleep(0.15)
        try:
            new_track = track_info(track)
            refine_track = today_yesterday_data(new_track, track)
            if not refine_track:
                toLog('Unsuccessful update: {0}'.format(track['id']), 'jobs')
                continue
            criteria = {'_id': track['_id']}
            _update = {'$set': refine_track}
            update = cursor_soundcloud.refined_data.update_one(
                criteria, _update
            )
            if not update.raw_result.get('updatedExisting', None):
                count += 1
                msg = "The track with this id"
                msg += " '{0}' can't be updated".format(track['id'])
                if (count % 100) == 0:
                    toLog(msg, 'db')
        except Exception as e:
            count += 1
            if (count % 1000) == 0:
                toLog(str(e), 'error')
    toLog("End updating soundcloud", 'jobs')
    sc_most_played()
def registerPlugin():
    """ Register every plugin that calls this method """
    toLog('CORE_SERVICES_API :: add plugin << %s >> to list of '
          'all plugins' % cls.__name__, 'service')
    plugin_functions.append(cls.__full_name__)
    plugin = plugin_handler.initHandler()
    plugin.registerPlugin(cls())
def execute_batch(_date, name, criteria):
    next_page = None
    for i in range(1, (batch_loop + 1)):
        try:
            next_page = executor_crawl(_date, name, criteria, next_page)
        except Exception as e:
            toLog(str(e), 'error')
def add_task(self, func, *args, **kwargs):
    """Add a task to the queue"""
    key = random.randint(1000, 100000)
    msg = "Calling background process with id: {0} -- func: {1}"
    toLog(msg.format(key, func.__name__), 'jobs')
    self.tasks.put((func, args, kwargs))
    toLog("Queued background process with id: {0}".format(key), 'jobs')
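# Usage sketch for the pool above, assuming the surrounding ThreadPool class
# wires Worker threads (run() above) to self.tasks, a Queue.Queue; "pool",
# the constructor argument and fetch_page are hypothetical names:
#
#     pool = ThreadPool(4)
#     pool.add_task(fetch_page, 'https://example.com', timeout=10)
#     pool.tasks.join()  # block until every queued task reports task_done()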
def __getFilesList(self, directory):
    """ Return a list of all files in "directory" """
    # TODO: detect the directory we are running from
    try:
        return os.listdir(directory)
    except OSError as e:
        toLog('PluginLoader.__getFilesList: %s' % e, 'error')
        return []
def start_crawling():
    scheduler.add_job(
        bulk_jobs_from_dates, trigger='cron', hour=hour_crawl,
        minute=minute_crawl, args=[], timezone=local_tz
    )
    msg = "Start crawling."
    toLog(msg, 'jobs')
def __init__(self):
    BaseRPC.__init__(self)
    load_module = __import__("services")
    classobj = getattr(load_module.plugins, name)
    load_flag = classobj.getFlag()
    if not load_flag:
        self.loadPlugins(name)
        self.setPlugins()
    toLog('<%s added to services>' % class_name, 'service')
def update_crawl_data():
    scheduler.add_job(
        start_updating_jobs, trigger='cron', hour=hour_update,
        minute=minute_update, args=[], timezone=local_tz
    )
    # result = send_request('crawler.cycle_update', '')
    msg = "Start new jobs to update crawled data."
    toLog(msg, 'jobs')
def create_capped_collection(self):
    if CAPPED_NAME not in self.coll_names:
        result = cursor.create_collection(
            CAPPED_NAME, capped=True, size=5e+8, max=CAPPED_SIZE
        )
        if result:
            msg = 'Capped collection "middle" has been created: '
            toLog((msg + str(result)), 'db')
    else:
        self.set_zero_capped_collection()
def __callInits(self, modules):
    """ Call the init function of every module in the "modules" dict """
    for obj in modules.itervalues():
        try:
            if hasattr(obj, 'init'):
                obj.init()
        except Exception:
            toLog('PluginLoader.__callInits', 'error')
def create_crawl_job():
    time_list = [2, 2.12, 3, 2.2, 2.75, 2.6, 1.1, 2.31, 2.5]
    msg = "start crawler jobs"
    toLog(msg, 'jobs')
    for i in range(1, max_page_crawl + 1):
        for case in keyword_list:
            crawl_search(case, i)
            time.sleep(random.choice(time_list))
    msg = "end crawler jobs"
    toLog(msg, 'jobs')
def get_settings(settings_type, key):
    settings = cursor_local.settings.find_one({'settings_type': settings_type})
    if settings:
        if key:
            return settings[key]
        else:
            return settings
    else:
        toLog('Get Settings: No settings in DB', 'error')
def track_info(track_doc):
    proxies = {
        'http': 'http://*****:*****@170.130.59.249:3128',
        'https': 'https://*****:*****@170.130.59.249:3128'
    }
    headers = {'User-Agent': 'Maryam&Ali'}
    try:
        url = "https://api-v2.soundcloud.com/tracks/" + str(track_doc['id'])
        url += "?client_id=" + SOUNDCLOUD_ID
        track = requests.get(url, headers=headers, proxies=proxies)
        # track = requests.get(url)
        track = track.json()
        if 'last_modified' in track:
            track['last_modified'] = parser.parse(track['last_modified'])
        if 'created_at' in track:
            track['created_at'] = parser.parse(track['created_at'])
        if 'user' in track:
            if 'username' in track['user']:
                track['username'] = track['user']['username']
            del track['user']
        track['has_yesterday'] = True
        track['update_track_data'] = datetime.datetime.now()
        return track
    except Exception as e:
        data_log = {'track_id': track_doc['id']}
        data_log['type'] = 'update_data'
        data_log['date'] = datetime.datetime.now()
        if 'list index out of range' in str(e):
            msg = "Track Id: {0} can't be fetched".format(track_doc['id'])
            data_log['reason'] = msg
            toLog(msg, 'error')
        else:
            toLog(e, 'error')
            data_log['reason'] = str(e)
        cursor_soundcloud.logs.insert(data_log)
def timeit(result, username, address, ts, *args):
    if DEBUG:
        # End of execution timing
        func_name = args[0].__class__.__full_name__
        te = time.time()
        msg = "RPC Call username: {0} -- time: {1:2.4f} sec -- "
        msg += "func: {2} -- address: {3} -- args: {4}"
        msg = msg.format(username, te - ts, func_name, address, args[1:])
        toLog(msg, 'request')
        # set_activity_log(username, address, func_name, args[1:])
    return result
def select_send_request(host, port, *args):
    output = {'error': {}, 'result': {}}
    rpc_id = random_id()
    args = args + (rpc_id,)
    result = None
    try:
        result = get_server(host, port)._request(*args)
        if isinstance(result, (str, unicode)):
            try:
                output['result'] = loads(result)
            except ValueError:
                if ObjectId.is_valid(str(result)):
                    output['result'] = str(result)
                else:
                    raise
        elif isinstance(result, (dict, list)):
            output['result'] = result
        elif isinstance(result, bool):
            output['result'] = result
        elif not result:
            output['result'] = {}
        else:
            raise TypeError
    except Exception as e:
        message_error = (
            'Func: {}, Error Type: {}, Error: {}, Result: {}, '
            'Message: {}'.format(
                str(args[0]), type(e), handle_errors(e, rpc_id),
                result, str(e)
            )
        )
        output['error'] = (handle_errors(e, rpc_id), message_error)
        toLog(message_error, 'error')
        if hasattr(e, 'strerror') and (e.strerror == "No route to host"):
            msg = "RPC Connection Failed: The Core server is unavailable."
            msg += "\nThe system couldn't connect to core with this address:"
            msg += ' {0}'.format(settings.VIR_SERVER)
            raise ConnectionFailed(msg)
    return output
def select_send_request(server, *args):
    output = {'error': {}, 'result': {}}
    rpc_id = random_id()
    args = args + (rpc_id,)
    result = None
    try:
        result = get_server(server)._request(*args)
        if isinstance(result, (str, unicode)):
            try:
                output['result'] = loads(result)
            except ValueError:
                if ObjectId.is_valid(str(result)):
                    output['result'] = str(result)
                else:
                    raise
        elif isinstance(result, (dict, list)):
            output['result'] = result
        elif isinstance(result, bool):
            output['result'] = result
        elif not result:
            output['result'] = {}
        else:
            raise TypeError
    except Exception as e:
        message_error = (
            'Func: {}, Error Type: {}, Error: {}, Result: {}, '
            'Message: {}'.format(
                str(args[0]), type(e), handle_errors(e, rpc_id),
                result, str(e)
            )
        )
        output['error'] = (handle_errors(e, rpc_id), message_error)
        toLog(message_error, 'error')
        if hasattr(e, 'strerror') and (e.strerror == "No route to host"):
            msg = "RPC Connection Failed: The Core server is unavailable."
            msg += "\nThe system couldn't connect to core with this address:"
            msg += ' {0}'.format(settings.VIR_SERVER)
            raise ConnectionFailed(msg)
    return output
def __getitem__(self, key):
    c = self.cache[key]
    n = datetime.now()
    if n - c['timestamp'] < c['expireTime'] or not self.processExpires:
        return c['data']
    msg = 'Deleted << "{0}" >> object from << "{1}" >> DataCache!'
    toLog(msg.format(self.cache[key], self.__class__.__name__), 'object')
    del self.cache[key]
    if self.dbg:
        toLog('DataCache: Key %s expired' % repr(key), 'object')
    raise KeyExpiredError(key)
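# Usage sketch, assuming each cache entry stores 'data', 'timestamp' and
# 'expireTime' as read above ("sessions_cache" and refresh_session() are
# hypothetical names):
#
#     try:
#         data = sessions_cache['token']
#     except KeyExpiredError:
#         data = refresh_session()  # entry existed but aged out
#     except KeyError:
#         data = None               # never cached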
def set_name_spacing(self):
    """
    Register every installed component here.

    This follows the namespace concept of the API service: register your
    component on the API cursor once the connection to the server is
    established.
    """
    for component in installed_component:
        try:
            klass = generate_class_component(component)
            self.putSubHandler(component, klass())
        except Exception as e:
            toLog("{}".format(e), 'error')
            msg = "Component {} failed to register!".format(component)
            toLog(msg, 'error')
def crawl_search(keyword, page):
    if ' ' in keyword:
        keyword = keyword.replace(' ', '+')
    url = 'https://www.youtube.com/results?search_sort=video_view_count'
    # url += '&filters=today'
    url += '&search_query=' + keyword
    url += '&page={0}'.format(page)
    text = requests.get(url).text
    soup = bs4.BeautifulSoup(text, "html.parser")
    div = []
    for d in soup.find_all('div'):
        if d.has_attr('class') and 'yt-lockup-dismissable' in d['class']:
            div.append(d)
    for d in div:
        doc = {'created_date': datetime.datetime.now()}
        img0 = d.find_all('img')[0]
        if not img0.has_attr('data-tumb'):
            doc['img'] = img0['src']
        else:
            doc['img'] = img0['data-tumb']
        a0 = [x for x in d.find_all('a') if x.has_attr('title')][0]
        doc['title'] = a0['title']
        doc['href'] = 'https://www.youtube.com' + a0['href']
        doc['id'] = get_video_id(doc['href'])
        try:
            result = cursor.refined_data.insert(doc)
        except DuplicateKeyError:
            result = True
            toLog("Duplicate Error: The record already exists", 'error')
        if not result:
            toLog("Crawling Error: The record can't be saved", 'error')
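# get_video_id() is referenced but not shown; a plausible sketch that pulls
# the "v" query parameter out of a /watch?v=... URL (Python 2 urlparse):
#
#     import urlparse
#
#     def get_video_id(href):
#         query = urlparse.urlparse(href).query
#         return urlparse.parse_qs(query).get('v', [''])[0]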
def bulk_jobs_from_dates():
    # reactor.callInThread(soundcloud_runner)
    # tuple_month_list = divide_datetime(period_years)
    now = datetime.datetime.now()
    last_day = now - datetime.timedelta(days=1)
    last_week = now - datetime.timedelta(days=7)
    last_month = now - datetime.timedelta(days=31)
    last_year = now - datetime.timedelta(days=365)
    ten_years = now - datetime.timedelta(days=(365 * 10))
    weekly = (last_week, last_day)
    monthly = (last_month, last_week)
    yearly = (last_year, last_month)
    ten = (ten_years, last_year)
    date_list = [
        ((last_day, "Now"), 'Daily'),
        (weekly, 'Weekly'),
        (monthly, 'Monthly'),
        (yearly, 'Yearly'),
        (ten, 'Ten years')
    ]
    category_list = ['10', '24']
    order_list = ['date', 'rating', 'relevance', 'viewCount']
    for order in order_list:
        for _date, _name in date_list:
            for item in category_list:
                criteria = {
                    'max_results': 50,
                    'q': '',
                    'category_id': item,
                    'order': order
                }
                result = execute_batch(_date, _name, criteria)
                msg = _name + " Crawler Jobs"
                msg += " from: {0} | category: {1}".format(_date, item)
                msg += " | result: {0}".format(str(result))
                toLog(msg, 'jobs')
    delete_video()
def timeit(result, uuid, username, address, start_time, ts, *args):
    if AFTER_DONE:
        # End of execution timing
        func_name = args[0].__class__.__full_name__
        te = time.time()
        msg = "After Done (Finish) == {} ==> start from: {} >|< username: {} "
        msg += "-|- time: {:2.4f} sec -|- func: {} -- address: {} -- args: {}"
        if DEBUG_RESULT:
            msg += " -- result: {}"
            msg = msg.format(uuid, start_time, username, te - ts, func_name,
                             address, args[1:], result)
        else:
            msg = msg.format(uuid, start_time, username, te - ts, func_name,
                             address, args[1:])
        toLog(msg, 'request')
        # set_activity_log(username, address, func_name, args[1:])
    return result
def __registerModuleAttributes(self, module, mapping):
    """
    Find all attribute classes in "module" and register them in the
    attribute factory. Attribute classes inherit from the attribute
    parents available in "mapping".
    """
    toLog('RegisterModuleAttributes: processing module: %s' % module,
          'service')
    for obj_name in dir(module):
        obj = getattr(module, obj_name)
        if self.__isClass(obj):
            for klass in mapping:
                toLog(
                    'RegisterModuleAttributes: '
                    'obj = %s klass: %s subclass = %s'
                    % (obj, klass, issubclass(obj, klass)), 'service')
                if issubclass(obj, klass) and obj != klass:
                    mapping[klass](obj)
def run(self, _continue=False):
    if not _continue:
        counter = CAPPED_SIZE
        self.create_capped_collection()
        self.fill_capped_collection()
    else:
        counter = cursor[CAPPED_NAME].count()
    for i in range(counter):
        sort = [("followers", DESCENDING)]
        doc = cursor[CAPPED_NAME].find_one_and_delete({}, sort=sort)
        if doc:
            try:
                sp = gen_sp()
            except Exception as e:
                toLog('Spotify API: {}'.format(str(e)), 'error')
                # Without a client there is nothing to fetch for this doc
                continue
            response = sp.user_playlist_tracks(
                doc['owner_id'], doc['playlist_id'], None, 100, 0
            )
            tracks = response.get('items', [])
            self.save_tracks(tracks, doc)
            one = 'next' in response and response['next'] is not None
            while one and response.get('next', ''):
                if not self.allow_time():
                    return
                try:
                    sp = gen_sp()
                    response = sp.next(response)
                    self.save_tracks(response.get('items', []), doc)
                except SpotifyException:
                    continue
                except Exception as e:
                    toLog("{}".format(e), 'error')
def handle_errors(error, rpc_id):
    result = {}
    if isinstance(error, ProtocolError):
        response_error = loads(history.response)
        toLog(response_error, 'debug')
        try:
            list_some_formal_error = ['ParamsError', 'GeneralError']
            for _error in list_some_formal_error:
                if _error in error.message:
                    result = {error.message[1]: (
                        error.message[0],
                        response_error['error'].get('data'))}
                else:
                    result = response_error['error']['data']
        except KeyError:
            result = response_error
    elif isinstance(error, KeyError):
        response_error = loads(history.response)
        toLog(response_error, 'debug')
        if (rpc_id == response_error['id'] and
                'error' in response_error and
                'fault' in response_error['error']):
            result['message'] = response_error['error'].pop('fault')
            result['code'] = response_error['error'].pop('faultCode')
            result['data'] = response_error['error'].pop('faultString')
        else:
            result['message'] = 'Client RPC Issue!'
    elif isinstance(error, socket_error):
        result['code'] = error[0]
        result['message'] = error[1]
    return result
def catharsis(tracks):
    counter = 0
    for track in tracks['collection']:
        track = pre_catharsis(track)
        track['created_date'] = datetime.datetime.now()
        if not track.get('isrc', None):
            if track.get('publisher_metadata', None):
                track['isrc'] = track["publisher_metadata"].get("isrc", None)
        if 'user' in track:
            if 'username' in track['user']:
                track['username'] = track['user']['username']
            del track['user']
        if 'last_modified' in track:
            track['last_modified'] = parser.parse(track['last_modified'])
        if 'created_at' in track:
            track['created_at'] = parser.parse(track['created_at'])
        try:
            result = cursor_soundcloud.refined_data.insert(track)
        except DuplicateKeyError:
            counter += 1
            result = True
            if (counter % 25) == 0:
                toLog("Duplicate Error: The record already exists", 'error')
        if not result:
            msg = "Crawling Error: The record can't be saved"
            msg += " {0}".format(track)
            toLog(msg, 'error')
def save_to_db(self, playlists):
    ids = []
    for doc in playlists:
        data = {}
        try:
            if ('id' in doc) and doc['id']:
                data['playlist_id'] = doc['id']
            else:
                toLog("{}".format(doc), 'lost_ids')
                continue
            try:
                data['name'] = doc.get('name', '').strip()
            except AttributeError:
                data['name'] = ''
            data['created_date'] = datetime.datetime.now()
            data['href'] = doc.get('href', None)
            data['external_url'] = doc.get(
                'external_urls', {}).get('spotify', None)
            data['uri'] = doc.get('uri', None)
            data['owner_external_url'] = doc.get('owner', {}).get(
                'external_urls', {}).get('spotify', None)
            data['owner_id'] = doc.get('owner', {}).get('id', None)
            data['owner_href'] = doc.get('owner', {}).get('href', None)
            data['owner_uri'] = doc.get('owner', {}).get('uri', None)
            cursor.playlist.replace_one(
                {'playlist_id': data['playlist_id']}, data, upsert=True)
            ids.append(data)
        except AttributeError:
            pass
    try:
        # Start updating playlist info
        self.update_info(ids)
    except Exception:
        toLog(traceback.format_exc(), 'error')
def save_tracks(self, tracks, pl):
    for per, track in enumerate(tracks, 1):
        doc = {}
        if 'track' in track and track['track']:
            artists = ""
            for artist in track['track'].get('artists', []):
                artists += artist['name'] + ", "
            artists = artists[:-2]
            href = track['track'].get('external_urls', {}).get('spotify', "")
            isrc = track['track'].get('external_ids', {}).get('isrc', '')
            doc['song_name'] = track['track'].get('name', "")
            doc['created_date'] = datetime.datetime.now()
            doc['playlist_name'] = pl['name']
            doc['playlist_followers'] = pl['followers']
            doc['playlist_owner'] = pl['owner_id']
            doc['playlist_href'] = pl['external_url']
            doc['playlist_id'] = pl['playlist_id']
            doc['song_id'] = track['track'].get('id', '')
            doc['playlist_description'] = pl['description']
            doc['artist'] = artists.strip()
            doc['href'] = href
            doc['uri'] = track['track'].get('uri', '')
            doc['song_position'] = per
            doc['popularity'] = track['track'].get('popularity', 0)
            doc['isrc'] = isrc
            # NOTE: 'allbum' (sic) matches the existing collection field name
            doc['allbum'] = track['track'].get('album', {}).get('name', '')
            try:
                cursor.tracks.insert(doc)
                self.check_history(doc)
            except DuplicateKeyError:
                pass
            except Exception:
                toLog(traceback.format_exc(), 'error')
def delete_video():
    # now = datetime.datetime.now()
    # last_six_month = now - datetime.timedelta(days=31 * delete_month)
    _date = datetime.datetime.now().replace(hour=2, minute=30)
    last_date = _date - datetime.timedelta(days=7)
    criteria = {
        "published_at": {"$lt": last_date}
    }
    delete_id = cursor.refined_data.find(criteria, {'_id': 1})
    delete_id = delete_id.sort('all_views', DESCENDING)
    delete_id = list(delete_id.skip(70000))
    count = 0
    for _id in delete_id:
        cursor.refined_data.delete_one({'_id': _id['_id']})
        count += 1
    toLog('Removed "%s" videos from DB' % str(count), 'debug')
def job_logger(event):
    if event.code > 512:
        toLog('Job {}, code {}'.format(
            event.job_id, event_code_translator(event.code)), 'jobs')
    elif event.code > 64:
        toLog('Event {} for job {} happened'.format(
            event_code_translator(event.code), event.job_id
        ), 'jobs')
    else:
        toLog('Event {} happened'.format(
            event_code_translator(event.code)), 'jobs'
        )
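# job_logger() matches APScheduler's listener signature; a sketch of how it
# would be attached to the scheduler used elsewhere in this module (EVENT_ALL
# is APScheduler's catch-all event mask; event_code_translator is this
# codebase's own helper):
#
#     from apscheduler.events import EVENT_ALL
#     scheduler.add_listener(job_logger, EVENT_ALL)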
def start_updating_jobs():
    # reactor.callInThread(soundcloud_update,)
    less_today = datetime.datetime.now().replace(hour=2, minute=30, second=0)
    _criteria = {
        'private': {'$ne': True},
        '$or': [
            {'update_video_data': {'$lte': less_today}},
            {'all_views': {'$exists': False}}
        ]
    }
    _projection = {'id': 1}
    toLog('Start updating jobs criteria: {0}'.format(str(_criteria)), 'jobs')
    all_videos = cursor.refined_data.find(
        _criteria, _projection, no_cursor_timeout=True
    )
    for doc in all_videos:
        try:
            _id = doc['id']
            criteria = {'_id': doc['_id']}
            new_data = today_yesterday_data(_id)
            if not new_data:
                # toLog('Unsuccessful update: {0}'.format(_id), 'jobs')
                continue
            _update = {'$set': new_data}
            update = cursor.refined_data.update_one(criteria, _update)
            if not update.raw_result.get('updatedExisting', None):
                msg = "The video with this id"
                msg += " '{0}' can't be updated".format(_id)
                toLog(msg, 'db')
        except Exception as e:
            toLog(str(e), 'error')
    clean_title()
    yt_most_viewed()
def soundcloud_runner():
    toLog("Start crawling soundcloud", 'jobs')
    # client = soundcloud.Client(client_id=SOUNDCLOUD_ID)
    # now = datetime.datetime.now()
    # last_day = now - datetime.timedelta(days=1)
    # last_week = now - datetime.timedelta(days=7)
    # last_month = now - datetime.timedelta(days=31)
    # last_year = now - datetime.timedelta(days=365)
    # ten_years = now - datetime.timedelta(days=(365 * 10))
    # daily = {
    #     'from': last_day.strftime("%Y-%m-%d %H:%M:%S"),
    #     'to': now.strftime("%Y-%m-%d %H:%M:%S")
    # }
    # weekly = {
    #     'from': last_week.strftime("%Y-%m-%d %H:%M:%S"),
    #     'to': last_day.strftime("%Y-%m-%d %H:%M:%S")
    # }
    # monthly = {
    #     'from': last_month.strftime("%Y-%m-%d %H:%M:%S"),
    #     'to': last_week.strftime("%Y-%m-%d %H:%M:%S")
    # }
    # yearly = {
    #     'from': last_year.strftime("%Y-%m-%d %H:%M:%S"),
    #     'to': last_month.strftime("%Y-%m-%d %H:%M:%S")
    # }
    # ten = {
    #     'from': ten_years.strftime("%Y-%m-%d %H:%M:%S"),
    #     'to': last_year.strftime("%Y-%m-%d %H:%M:%S")
    # }
    # date_list = [
    #     (daily, 'Daily'),
    #     (weekly, 'Weekly'),
    #     (monthly, 'Monthly'),
    #     (yearly, 'Yearly'),
    #     (ten, 'Ten years')
    # ]
    kind_list = [
        'top',       # Top 50
        'trending'   # New & Hot
    ]
    genres_list = [
        "all-music", "all-audio", "alternativerock", "ambient", "classical",
        "country", "danceedm", "dancehall", "deephouse", "disco", "drumbass",
        "dubstep", "electronic", "folksingersongwriter", "hiphoprap",
        "house", "indie", "jazzblues", "latin", "metal", "piano", "pop",
        "rbsoul", "reggae", "reggaeton", "rock", "soundtrack", "techno",
        "trance", "trap", "triphop", "world", "audiobooks", "business",
        "comedy", "entertainment", "learning", "newspolitics",
        "religionspirituality", "science", "sports", "storytelling",
        "technology",
    ]
    proxies = {
        'http': 'http://*****:*****@170.130.59.249:3128',
        'https': 'https://*****:*****@170.130.59.249:3128'
    }
    headers = {'User-Agent': 'Maryam&Ali'}
    for kind in kind_list:
        for genre in genres_list:
            offset = 0
            for i in range(1, num_pages + 1):
                url = "https://api-v2.soundcloud.com"
                url += "/charts?kind={0}".format(kind)
                url += "&genre=soundcloud:genres:{0}&client".format(genre)
                url += "_id={0}&offset={1}&".format(SOUNDCLOUD_ID, offset)
                url += "limit={0}&linked_partitioning=1".format(page_length)
                time.sleep(0.15)
                data = requests.get(url, headers=headers, proxies=proxies)
                # data = requests.get(url)
                try:
                    loads_data = data.json()
                    if loads_data and 'error' not in loads_data:
                        catharsis(loads_data)
                except Exception as e:
                    print str(e)
                offset += page_length
    toLog("End crawling soundcloud", 'jobs')
from config.settings import installed_component
from core import toLog

__all__ = installed_component

try:
    # Because we want to import using a variable, do it this way:
    # create a global object containing our components.
    import_component = __import__('services.plugins', fromlist=["*"])
    globals()['services.plugins'] = import_component
except ImportError as e:
    error = "{}".format(e)
    toLog(error, 'error')
def yt_most_viewed():
    toLog('Start Migration to MySQL', 'db')
    mydb = MySQLdb.connect(
        SQL_HOST, SQL_USER, SQL_PASS, SQL_DB,
        charset='utf8mb4', use_unicode=True
    )
    sql_cursor = mydb.cursor()
    sql_cursor.execute("SET NAMES utf8mb4;")
    sql_cursor.execute("SET CHARACTER SET utf8mb4;")
    sql_cursor.execute("SET character_set_connection=utf8mb4;")
    query = "ALTER DATABASE newdatabase CHARACTER SET = utf8mb4"
    query += " COLLATE = utf8mb4_unicode_ci;"
    sql_cursor.execute(query)
    query = "ALTER TABLE songs_chart CONVERT TO CHARACTER SET"
    query += " utf8mb4 COLLATE utf8mb4_unicode_ci;"
    sql_cursor.execute(query)
    _date = datetime.datetime.now().replace(hour=2, minute=30)
    last_date = _date - datetime.timedelta(days=int(yt_settings('last_date')))
    criteria = {
        "$or": [
            {
                "update_video_data": {"$gt": _date},
                "daily_views_yesterday": {"$gt": 0}
            },
            {
                "published_at": {"$gte": last_date}
            }
        ]
    }
    sql_column = {
        'published_at': 'ReleaseDate',
        'href': 'YTURL',
        'dislikes': 'YTDisLikes',
        'likes': 'YTLikes',
        'id': 'YTVideoID',
        'daily_views_today': 'YTDailyViews',
        'title': 'YTTitle',
        'comment_count': 'YTComments',
        'channel_title': 'YTChannel',
        'description': 'YTDescription',
        'daily_views_yesterday': 'YTDailyViewsYest',
        'channel_id': 'YTChannelID',
        'all_views': 'YTAllTimeViews',
        'category_name': 'YTCategory',
        'song_title': 'Song',
        'artist': 'Artist'
    }
    extra_int_columns = [
        'Met', 'Flag', 'Cover', 'Favorite', 'Listened_To', 'Playlist', 'Omit'
    ]
    extra_str_columns = [
        'Brand_new_artist', 'Total', 'Brand_new_song', 'Tags', 'Album',
        'Chart_name', 'Genre', 'Label', 'Charts_today', 'Charts_today_type',
        'Manager', 'Agent', 'Lawyer', 'Notes', 'Negotiation', 'Price',
        'Chart_name_2'
    ]
    projection = {}
    for i in sql_column.keys():
        projection[i] = 1
    data = cursor.refined_data.find(
        criteria, projection, no_cursor_timeout=True
    )
    new_data = data.sort('daily_views_today', DESCENDING).limit(50000)
    if data:
        count = 1
        for doc in new_data:
            new_doc = {}
            for k, v in doc.items():
                if k != '_id':
                    if k == 'published_at':
                        new_doc[sql_column[k]] = str(v.date())
                    elif isinstance(v, (int, float, long)):
                        new_doc[sql_column[k]] = v
                    elif k == 'description':
                        if isinstance(v, basestring):
                            text = v[:50].encode('utf8') + ' ...'
                        else:
                            text = unicode(v[:50]).encode('utf8') + ' ...'
                        new_doc[sql_column[k]] = text
                    else:
                        if isinstance(v, basestring):
                            new_doc[sql_column[k]] = v.encode('utf8')
                        else:
                            new_doc[sql_column[k]] = unicode(v).encode('utf8')
            for item in extra_str_columns:
                new_doc[item] = ""
            for item in extra_int_columns:
                new_doc[item] = 0
            new_doc['Date'] = datetime.datetime.now().replace(hour=6, minute=0)
            new_doc['Date'] = str(new_doc['Date'].date())
            new_doc['Chart_type'] = 'YouTube'
            new_doc['Rank'] = count
            try:
                table = 'songs_chart'
                sql = insert_from_dict(table, new_doc)
                sql_cursor.execute(sql, new_doc)
                mydb.commit()
            except MySQLdb.IntegrityError as e:
                print str(e)
                # WARNING: no WHERE clause; this updates every row in the table
                qry = 'UPDATE songs_chart SET {}'.format(
                    ', '.join('{}=%s'.format(k) for k in new_doc)
                )
                sql_cursor.execute(qry, new_doc.values())
                mydb.commit()
            except Exception as e:
                print str(e)
            count += 1
    toLog('End Migration to MySQL', 'db')
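# insert_from_dict() is referenced but not defined here; a minimal sketch that
# builds a parameterized INSERT using MySQLdb's pyformat placeholders, so the
# row dict itself can be passed to cursor.execute() as above:
#
#     def insert_from_dict(table, doc):
#         columns = ', '.join(doc.keys())
#         placeholders = ', '.join('%({0})s'.format(k) for k in doc)
#         return 'INSERT INTO {0} ({1}) VALUES ({2})'.format(
#             table, columns, placeholders)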
def sc_most_played():
    toLog('Start Migration to MySQL', 'db')
    mydb = MySQLdb.connect(
        SQL_HOST, SQL_USER, SQL_PASS, SQL_DB,
        charset='utf8mb4', use_unicode=True
    )
    sql_cursor = mydb.cursor()
    sql_cursor.execute("SET NAMES utf8mb4;")
    sql_cursor.execute("SET CHARACTER SET utf8mb4;")
    sql_cursor.execute("SET character_set_connection=utf8mb4;")
    query = "ALTER DATABASE newdatabase CHARACTER SET = utf8mb4"
    query += " COLLATE = utf8mb4_unicode_ci;"
    sql_cursor.execute(query)
    query = "ALTER TABLE songs_chart CONVERT TO CHARACTER SET"
    query += " utf8mb4 COLLATE utf8mb4_unicode_ci;"
    sql_cursor.execute(query)
    _date = datetime.datetime.now().replace(hour=2, minute=30)
    last_date = _date - datetime.timedelta(days=1)
    criteria = {
        "$or": [
            {
                "update_track_data": {"$gt": _date},
                "daily_playback_count_yesterday": {"$gt": 0}
            },
            {
                "created_at": {"$gte": last_date}
            }
        ]
    }
    sql_column = {
        'created_at': 'ReleaseDate',
        'permalink_url': 'SCURL',
        'label_name': 'Label',
        'genre': 'Genre',
        'download_count': 'SCDownloads',
        'id': 'SCSongID',
        'daily_playback_count_today': 'SCDailyStreams',
        'isrc': 'ISRC',
        'title': 'Song',
        'favoritings_count': 'SCFavorites',
        'comment_count': 'SCComments',
        'description': 'SCDescription',
        'daily_playback_count_yesterday': 'SCDailyStreamsYest',
        'artwork_url': 'SCArtWorkURL',
        'playback_count': 'SCAllTimeStreams',
        'user_id': 'SCUserID',
        'score': 'SCScore',
        'reposts_count': 'SCReposts',
        'username': '******',
        'artist': 'Artist'
    }
    extra_int_columns = [
        'Met', 'Flag', 'Cover', 'Favorite', 'Playlist', 'Listened_To', 'Omit'
    ]
    extra_str_columns = [
        'YTTitle', 'Brand_new_artist', 'Total', 'Brand_new_song', 'Tags',
        'Chart_name', 'Charts_today', 'Charts_today_type', 'Manager',
        'Agent', 'Lawyer', 'Notes', 'Negotiation', 'Price', 'Chart_name_2'
    ]
    projection = {'publisher_metadata': 1}
    for i in sql_column.keys():
        projection[i] = 1
    data = cursor_soundcloud.refined_data.find(
        criteria, projection, no_cursor_timeout=True
    )
    new_data = data.sort('daily_playback_count_today', DESCENDING).limit(50000)
    if data:
        count = 1
        for doc in new_data:
            new_doc = {}
            for k, v in doc.items():
                ignore_list = ['publisher_metadata', '_id', 'artist', 'isrc']
                if (k == 'publisher_metadata') and doc[k]:
                    new_doc['Album'] = v.get('album_title', " ")
                    new_doc['ISRC'] = v.get('isrc', " ")
                    artist = v.get('artist', "")
                    if not artist:
                        doc['artist'] = doc['username']
                        new_doc['Artist'] = doc['username']
                    else:
                        doc['artist'] = artist
                        new_doc['Artist'] = artist
                elif (k == 'publisher_metadata') and not doc.get(k, None):
                    artist = doc.get('artist', None)
                    if artist:
                        new_doc['Artist'] = artist
                    else:
                        new_doc['Artist'] = doc['username']
                elif k not in ignore_list:
                    if k == 'created_at':
                        new_doc[sql_column[k]] = str(v.date())
                    elif isinstance(v, (int, float, long)):
                        new_doc[sql_column[k]] = v
                    elif isinstance(v, dict):
                        continue
                    elif k == 'description':
                        if isinstance(v, basestring):
                            new_doc[sql_column[k]] = (
                                v[:50].encode('utf8') + ' ...'
                            )
                        else:
                            try:
                                new_doc[sql_column[k]] = (
                                    unicode(v[:50]).encode('utf8') + ' ...'
                                )
                            except Exception as e:
                                new_doc[sql_column[k]] = ''
                    else:
                        if isinstance(v, basestring):
                            new_doc[sql_column[k]] = v.encode('utf8')
                        else:
                            new_doc[sql_column[k]] = unicode(v).encode('utf8')
                else:
                    if 'Album' not in new_doc:
                        new_doc['Album'] = " "
            for item in extra_str_columns:
                new_doc[item] = " "
            for item in extra_int_columns:
                new_doc[item] = 0
            new_doc['Date'] = datetime.datetime.now().replace(hour=6, minute=0)
            new_doc['Date'] = str(new_doc['Date'].date())
            new_doc['Chart_type'] = 'SoundCloud'
            new_doc['Rank'] = count
            try:
                table = 'songs_chart'
                sql = insert_from_dict(table, new_doc)
                sql_cursor.execute(sql, new_doc)
                mydb.commit()
            except MySQLdb.IntegrityError as e:
                print str(e)
                # WARNING: no WHERE clause; this updates every row in the table
                qry = 'UPDATE songs_chart SET {}'.format(
                    ', '.join('{}=%s'.format(k) for k in new_doc)
                )
                sql_cursor.execute(qry, new_doc.values())
                mydb.commit()
            except Exception as e:
                print str(e)
            count += 1
    toLog('End Migration to MySQL', 'db')
def get_video_info(video_id):
    doc = {}
    try:
        video = open_url_api(video_id)
        if 'items' in video:
            if video['items'] == []:
                doc['private'] = True
                doc['update_video_data'] = datetime.datetime.now()
                return doc
        snippet = video.get('items', [])[0].get('snippet', {})
        statistics = video.get('items', [])[0].get('statistics', {})
        doc['thumbnails'] = snippet.get('thumbnails', '')
        doc['title'] = snippet.get('title', '')
        doc['channel_id'] = snippet.get('channelId', '')
        doc['category_id'] = snippet.get('categoryId', '')
        doc['published_at'] = parser.parse(snippet.get('publishedAt', ''))
        doc['channel_title'] = snippet.get('channelTitle', '')
        doc['description'] = snippet.get('description', '')
        doc['keywords'] = snippet.get('tags', '')
        doc['comment_count'] = int(statistics.get('commentCount', 0))
        doc['dislikes'] = int(statistics.get('dislikeCount', 0))
        doc['favorite_count'] = int(statistics.get('favoriteCount', 0))
        doc['all_views'] = int(statistics.get('viewCount', 0))
        doc['likes'] = int(statistics.get('likeCount', 0))
        if ' - ' in doc.get('title', ''):
            splited = doc['title'].split(' - ')
            doc['artist'] = splited[0].strip()
            doc['song_title'] = ' - '.join(splited[1:])
            doc['song_title'] = doc['song_title'].strip()
        else:
            doc['artist'] = doc.get('title', '')
            doc['song_title'] = doc.get('title', '')
        doc['has_yesterday'] = True
        doc['update_video_data'] = datetime.datetime.now()
        return doc
    except Exception as e:
        data_log = {'video_id': video_id}
        data_log['type'] = 'update_data'
        data_log['date'] = datetime.datetime.now()
        if 'list index out of range' in str(e):
            msg = "Video Id: {0} can't be fetched".format(video_id)
            data_log['reason'] = msg
            toLog(msg, 'error')
        else:
            toLog(e, 'error')
            data_log['reason'] = str(e)
        # cursor.logs.insert(data_log)
        return doc
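# open_url_api() is referenced but not shown; a plausible sketch using the
# YouTube Data API v3 videos().list endpoint (build_youtube_api is the helper
# executor_crawl() uses below):
#
#     def open_url_api(video_id):
#         youtube = build_youtube_api()
#         return youtube.videos().list(
#             part='snippet,statistics',
#             id=video_id
#         ).execute()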
def executor_crawl(_date, name, criteria, next_page_token=None):
    youtube = build_youtube_api()
    # Call the search.list method to retrieve results matching the specified
    # query term. Daily crawls are open-ended; the other ranges are bounded
    # by publishedBefore.
    params = {
        'q': criteria['q'],
        'maxResults': criteria['max_results'],
        'part': "id,snippet",
        'type': 'video',
        'order': criteria['order'],
        'videoCategoryId': criteria['category_id'],  # Music/Entertainment
        'publishedAfter': _date[0].strftime('%Y-%m-%dT%H:%M:%SZ'),
    }
    if next_page_token:
        params['pageToken'] = next_page_token
    if name != 'Daily':
        params['publishedBefore'] = _date[1].strftime('%Y-%m-%dT%H:%M:%SZ')
    search_response = youtube.search().list(**params).execute()
    # Add each matching video to the database.
    duplicate = 0
    result = True
    for search_result in search_response.get("items", []):
        _video = {'created_date': datetime.datetime.now()}
        category_trans = {'24': 'Entertainment', '10': 'Music'}
        _video['category_name'] = category_trans.get(
            criteria['category_id'], 'UnKnown'
        )
        if search_result["kind"] in ("youtube#video", "youtube#searchResult"):
            _publish = parser.parse(search_result['snippet']['publishedAt'])
            _video['href'] = 'https://www.youtube.com/watch?v='
            _video['img'] = search_result['snippet']['thumbnails']['default']
            _video['title'] = search_result['snippet']['title']
            _video['channel_id'] = search_result['snippet']['channelId']
            _video['channel_title'] = search_result['snippet']['channelTitle']
            _video['published_at'] = _publish
            _video['description'] = search_result['snippet']['description']
            if 'id' in search_result['snippet']:
                _video['href'] += search_result['snippet']['id']['videoId']
                _video['id'] = search_result['snippet']['id']['videoId']
            else:
                _video['href'] += search_result['id']['videoId']
                _video['id'] = search_result['id']['videoId']
            try:
                result = cursor.refined_data.insert(_video)
            except DuplicateKeyError:
                result = True
                if (duplicate % 1000) == 0:
                    toLog("Duplicate Error: 1000 records", 'error')
                duplicate += 1
            if not result:
                msg = "Crawling Error: The record can't be saved"
                msg += " {0}".format(search_result)
                toLog(msg, 'error')
        else:
            toLog("UnHandled Crawling: {0}".format(search_result), 'debug')
        if not result:
            toLog("Crawling Error: The record can't be saved", 'error')
    # Create next page token
    next_page_token = search_response.get("nextPageToken")
    return next_page_token
def __init__(self, msg, code=401):
    jsonrpclib.Fault.__init__(self, code, msg)
    msg = "Message: {0}, Code: {1}".format(msg, code)
    toLog(msg, 'error')
def generateException(self):
    release_error = eval('%s' % self.type_error)(self.msg)
    toLog(str(release_error), 'error')
    return release_error
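# eval() on self.type_error executes arbitrary text; a safer sketch resolves
# the class from an explicit whitelist instead (ParamsError and GeneralError
# are the error names this codebase handles in handle_errors()):
#
#     ERROR_CLASSES = {
#         'GeneralError': GeneralError,
#         'ParamsError': ParamsError,
#     }
#
#     def generateException(self):
#         klass = ERROR_CLASSES.get(self.type_error, GeneralError)
#         release_error = klass(self.msg)
#         toLog(str(release_error), 'error')
#         return release_error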