def before_finish(uuid, username, address, start_time, ts, *args):
    if BEFORE_DONE:
        func_name = args[0].__class__.__full_name__
        msg = "Before Finish << {} >> username: {} -|- "
        msg += "func: {} -- address: {} -- args: {}"
        msg = msg.format(uuid, username, func_name, address, args[1:])
        toLog(msg, 'request')
def checking(*args, **kwargs):
    """ Exception Handler """
    try:
        return cls(*args, **kwargs)
    except Exception as e:
        toLog(traceback.format_exc(), 'error')
        msg = str(e)
        if 'run()' in msg:
            name = args[0].__full_name__
            msg = msg.replace('run()', "Function {0}()".format(name))
            toLog(msg, 'error')
        try:
            error = ErrorGeneratorFromRaise(
                msg, e.type_error).generateException()
            return error
        except Exception:
            error = GeneralError(msg)
            return error
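# Note: checking() closes over "cls", so it presumably lives inside a class
# decorator. A minimal sketch of that enclosing shape, under that assumption
# (exception_handler is a hypothetical name, not taken from this codebase):
#
#     def exception_handler(cls):
#         def checking(*args, **kwargs):
#             ...  # body as above
#         return checking
#
#     @exception_handler
#     class SomeService(object):
#         pass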
def fetch_sp(self):
    while True:
        try:
            time.sleep(1.5)
            sp = gen_sp()
            return sp
        except Exception:
            toLog(traceback.format_exc(), 'error')
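# gen_sp() is referenced but not defined here; a minimal sketch, assuming the
# spotipy client-credentials flow used elsewhere in this codebase
# (SPOTIFY_CLIENT_ID / SPOTIFY_CLIENT_SECRET are hypothetical setting names):
#
#     import spotipy
#     from spotipy.oauth2 import SpotifyClientCredentials
#
#     def gen_sp():
#         credentials = SpotifyClientCredentials(
#             client_id=SPOTIFY_CLIENT_ID,
#             client_secret=SPOTIFY_CLIENT_SECRET
#         )
#         return spotipy.Spotify(client_credentials_manager=credentials)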
def run(self):
    while True:
        func, args, kargs = self.tasks.get()
        try:
            func(*args, **kargs)
        except Exception as e:
            toLog('Thread Error: %s' % e, 'error')
        self.tasks.task_done()
def soundcloud_update():
    toLog("Start updating soundcloud", 'jobs')
    less_today = datetime.datetime.now().replace(hour=2, minute=30, second=0)
    _criteria = {
        '$or': [
            {'update_track_data': {'$lte': less_today}},
            {'update_track_data': {'$exists': False}}
        ]
    }
    projection = {'id': 1}
    all_tracks = cursor_soundcloud.refined_data.find(
        _criteria, projection, no_cursor_timeout=True
    )
    count = 1
    print all_tracks.count()
    print datetime.datetime.now()
    for track in all_tracks:
        time.sleep(0.15)
        try:
            new_track = track_info(track)
            refine_track = today_yesterday_data(new_track, track)
            if not refine_track:
                toLog('Unsuccessful update: {0}'.format(track['id']), 'jobs')
                continue
            criteria = {'_id': track['_id']}
            _update = {'$set': refine_track}
            update = cursor_soundcloud.refined_data.update_one(
                criteria, _update
            )
            if not update.raw_result.get('updatedExisting', None):
                count += 1
                msg = "The track with this id"
                msg += " '{0}' can't be updated".format(track['id'])
                if (count % 100) == 0:
                    toLog(msg, 'db')
        except Exception as e:
            count += 1
            if (count % 1000) == 0:
                toLog(str(e), 'error')
    toLog("End updating soundcloud", 'jobs')
    sc_most_played()
def registerPlugin():
    """ Register every plugin that calls this method """
    toLog('CORE_SERVICES_API :: add plugin << %s >> to list of '
          'all plugins' % cls.__name__, 'service')
    plugin_functions.append(cls.__full_name__)
    plugin = plugin_handler.initHandler()
    plugin.registerPlugin(cls())
def execute_batch(_date, name, criteria):
    next_page = None
    for i in range(1, (batch_loop + 1)):
        try:
            next_page = executor_crawl(_date, name, criteria, next_page)
        except Exception as e:
            toLog(str(e), 'error')
def add_task(self, func, *args, **kwargs):
    """Add a task to the queue"""
    key = random.randint(1000, 100000)
    msg = "Calling background process with id: {0} -- func: {1}"
    toLog(msg.format(key, func.__name__), 'jobs')
    self.tasks.put((func, args, kwargs))
    toLog("Queued background process with id: {0}".format(key), 'jobs')
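# Usage sketch for the pool above, assuming the surrounding ThreadPool class
# wires Worker threads (run() above) to self.tasks, a Queue.Queue; "pool",
# the constructor argument and fetch_page are hypothetical names:
#
#     pool = ThreadPool(4)
#     pool.add_task(fetch_page, 'https://example.com', timeout=10)
#     pool.tasks.join()  # block until every queued task reports task_done()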
def __getFilesList(self, directory):
    """ Return a list of all files in "directory" """
    # TODO: detect the directory we are running from
    try:
        return os.listdir(directory)
    except OSError as e:
        toLog('PluginLoader.__getFilesList: %s' % e, 'error')
        return []
def start_crawling():
    scheduler.add_job(
        bulk_jobs_from_dates, trigger='cron', hour=hour_crawl,
        minute=minute_crawl, args=[], timezone=local_tz
    )
    msg = "Start crawling."
    toLog(msg, 'jobs')
def __init__(self):
    BaseRPC.__init__(self)
    load_module = __import__("services")
    classobj = getattr(load_module.plugins, name)
    load_flag = classobj.getFlag()
    if not load_flag:
        self.loadPlugins(name)
        self.setPlugins()
    toLog('<%s added to services>' % class_name, 'service')
def update_crawl_data():
    scheduler.add_job(
        start_updating_jobs, trigger='cron', hour=hour_update,
        minute=minute_update, args=[], timezone=local_tz
    )
    # result = send_request('crawler.cycle_update', '')
    msg = "Start new jobs to update crawled data."
    toLog(msg, 'jobs')
def create_capped_collection(self):
    if CAPPED_NAME not in self.coll_names:
        result = cursor.create_collection(
            CAPPED_NAME, capped=True, size=5e+8, max=CAPPED_SIZE
        )
        if result:
            msg = 'Capped collection "middle" has been created: '
            toLog((msg + str(result)), 'db')
    else:
        self.set_zero_capped_collection()
def __callInits(self, modules):
    """ Call the init function of every module in the "modules" dict """
    for obj in modules.itervalues():
        try:
            if hasattr(obj, 'init'):
                obj.init()
        except Exception:
            toLog('PluginLoader.__callInits', 'error')
def create_crawl_job():
    time_list = [2, 2.12, 3, 2.2, 2.75, 2.6, 1.1, 2.31, 2.5]
    msg = "start crawler jobs"
    toLog(msg, 'jobs')
    for i in range(1, max_page_crawl + 1):
        for case in keyword_list:
            crawl_search(case, i)
            time.sleep(random.choice(time_list))
    msg = "end crawler jobs"
    toLog(msg, 'jobs')
def get_settings(settings_type, key):
    settings = cursor_local.settings.find_one({'settings_type': settings_type})
    if settings:
        if key:
            return settings[key]
        else:
            return settings
    else:
        toLog('Get Settings: No settings in DB', 'error')
def track_info(track_doc):
    proxies = {
        'http': 'http://*****:*****@170.130.59.249:3128',
        'https': 'https://*****:*****@170.130.59.249:3128'
    }
    headers = {'User-Agent': 'Maryam&Ali'}
    try:
        url = "https://api-v2.soundcloud.com/tracks/" + str(track_doc['id'])
        url += "?client_id=" + SOUNDCLOUD_ID
        track = requests.get(url, headers=headers, proxies=proxies)
        # track = requests.get(url)
        track = track.json()
        if 'last_modified' in track:
            track['last_modified'] = parser.parse(track['last_modified'])
        if 'created_at' in track:
            track['created_at'] = parser.parse(track['created_at'])
        if 'user' in track:
            if 'username' in track['user']:
                track['username'] = track['user']['username']
            del track['user']
        track['has_yesterday'] = True
        track['update_track_data'] = datetime.datetime.now()
        return track
    except Exception as e:
        data_log = {'track_id': track_doc['id']}
        data_log['type'] = 'update_data'
        data_log['date'] = datetime.datetime.now()
        if 'list index out of range' in str(e):
            msg = "Track Id: {0} can't be fetched".format(track_doc['id'])
            data_log['reason'] = msg
            toLog(msg, 'error')
        else:
            toLog(e, 'error')
            data_log['reason'] = str(e)
        cursor_soundcloud.logs.insert(data_log)
def timeit(result, username, address, ts, *args):
    if DEBUG:
        # End of execution timing
        func_name = args[0].__class__.__full_name__
        te = time.time()
        msg = "RPC Call username: {0} -- time: {1:2.4f} sec -- "
        msg += "func: {2} -- address: {3} -- args: {4}"
        msg = msg.format(username, te - ts, func_name, address, args[1:])
        toLog(msg, 'request')
        # set_activity_log(username, address, func_name, args[1:])
    return result
def select_send_request(host, port, *args):
    output = {'error': {}, 'result': {}}
    rpc_id = random_id()
    args = args + (rpc_id,)
    result = None
    try:
        result = get_server(host, port)._request(*args)
        if isinstance(result, (str, unicode)):
            try:
                output['result'] = loads(result)
            except ValueError:
                if ObjectId.is_valid(str(result)):
                    output['result'] = str(result)
                else:
                    raise
        elif isinstance(result, (dict, list)):
            output['result'] = result
        elif isinstance(result, bool):
            output['result'] = result
        elif not result:
            output['result'] = {}
        else:
            raise TypeError
    except Exception as e:
        message_error = (
            'Func: {}, Error Type: {}, Error: {}, Result: {}, '
            'Message: {}'.format(
                str(args[0]), type(e), handle_errors(e, rpc_id),
                result, str(e)
            )
        )
        output['error'] = (handle_errors(e, rpc_id), message_error)
        toLog(message_error, 'error')
        if hasattr(e, 'strerror') and (e.strerror == "No route to host"):
            msg = "RPC Connection Failed: The Core server is unavailable."
            msg += "\nThe system couldn't connect to core with this address:"
            msg += ' {0}'.format(settings.VIR_SERVER)
            raise ConnectionFailed(msg)
    return output
def select_send_request(server, *args):
    output = {'error': {}, 'result': {}}
    rpc_id = random_id()
    args = args + (rpc_id,)
    result = None
    try:
        result = get_server(server)._request(*args)
        if isinstance(result, (str, unicode)):
            try:
                output['result'] = loads(result)
            except ValueError:
                if ObjectId.is_valid(str(result)):
                    output['result'] = str(result)
                else:
                    raise
        elif isinstance(result, (dict, list)):
            output['result'] = result
        elif isinstance(result, bool):
            output['result'] = result
        elif not result:
            output['result'] = {}
        else:
            raise TypeError
    except Exception as e:
        message_error = (
            'Func: {}, Error Type: {}, Error: {}, Result: {}, '
            'Message: {}'.format(
                str(args[0]), type(e), handle_errors(e, rpc_id),
                result, str(e)
            )
        )
        output['error'] = (handle_errors(e, rpc_id), message_error)
        toLog(message_error, 'error')
        if hasattr(e, 'strerror') and (e.strerror == "No route to host"):
            msg = "RPC Connection Failed: The Core server is unavailable."
            msg += "\nThe system couldn't connect to core with this address:"
            msg += ' {0}'.format(settings.VIR_SERVER)
            raise ConnectionFailed(msg)
    return output
def __getitem__(self, key):
    c = self.cache[key]
    n = datetime.now()
    if n - c['timestamp'] < c['expireTime'] or not self.processExpires:
        return c['data']
    msg = 'Deleted << "{0}" >> object from << "{1}" >> DataCache!'
    toLog(msg.format(self.cache[key], self.__class__.__name__), 'object')
    del self.cache[key]
    if self.dbg:
        toLog('DataCache: Key %s expired' % repr(key), 'object')
    raise KeyExpiredError(key)
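# Usage sketch, assuming each cache entry stores 'data', 'timestamp' and
# 'expireTime' as read above ("sessions_cache" and refresh_session() are
# hypothetical names):
#
#     try:
#         data = sessions_cache['token']
#     except KeyExpiredError:
#         data = refresh_session()  # entry existed but aged out
#     except KeyError:
#         data = None               # never cached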
def set_name_spacing(self):
    """
    Register every installed component here.

    This follows the namespace concept of the API service: register your
    component on the API cursor once the connection to the server is
    established.
    """
    for component in installed_component:
        try:
            klass = generate_class_component(component)
            self.putSubHandler(component, klass())
        except Exception as e:
            toLog("{}".format(e), 'error')
            msg = "Component {} failed to register!".format(component)
            toLog(msg, 'error')
def crawl_search(keyword, page):
    if ' ' in keyword:
        keyword = keyword.replace(' ', '+')
    url = 'https://www.youtube.com/results?search_sort=video_view_count'
    # url += '&filters=today'
    url += '&search_query=' + keyword
    url += '&page={0}'.format(page)
    text = requests.get(url).text
    soup = bs4.BeautifulSoup(text, "html.parser")
    div = []
    for d in soup.find_all('div'):
        if d.has_attr('class') and 'yt-lockup-dismissable' in d['class']:
            div.append(d)
    for d in div:
        doc = {'created_date': datetime.datetime.now()}
        img0 = d.find_all('img')[0]
        if not img0.has_attr('data-tumb'):
            doc['img'] = img0['src']
        else:
            doc['img'] = img0['data-tumb']
        a0 = [x for x in d.find_all('a') if x.has_attr('title')][0]
        doc['title'] = a0['title']
        doc['href'] = 'https://www.youtube.com' + a0['href']
        doc['id'] = get_video_id(doc['href'])
        try:
            result = cursor.refined_data.insert(doc)
        except DuplicateKeyError:
            result = True
            toLog("Duplicate Error: The record already exists", 'error')
        if not result:
            toLog("Crawling Error: The record can't be saved", 'error')
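# get_video_id() is referenced but not shown; a plausible sketch that pulls
# the "v" query parameter out of a /watch?v=... URL (Python 2 urlparse):
#
#     import urlparse
#
#     def get_video_id(href):
#         query = urlparse.urlparse(href).query
#         return urlparse.parse_qs(query).get('v', [''])[0]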
def bulk_jobs_from_dates():
    # reactor.callInThread(soundcloud_runner)
    # tuple_month_list = divide_datetime(period_years)
    now = datetime.datetime.now()
    last_day = now - datetime.timedelta(days=1)
    last_week = now - datetime.timedelta(days=7)
    last_month = now - datetime.timedelta(days=31)
    last_year = now - datetime.timedelta(days=365)
    ten_years = now - datetime.timedelta(days=(365 * 10))
    weekly = (last_week, last_day)
    monthly = (last_month, last_week)
    yearly = (last_year, last_month)
    ten = (ten_years, last_year)
    date_list = [
        ((last_day, "Now"), 'Daily'),
        (weekly, 'Weekly'),
        (monthly, 'Monthly'),
        (yearly, 'Yearly'),
        (ten, 'Ten years')
    ]
    category_list = ['10', '24']
    order_list = ['date', 'rating', 'relevance', 'viewCount']
    for order in order_list:
        for _date, _name in date_list:
            for item in category_list:
                criteria = {
                    'max_results': 50,
                    'q': '',
                    'category_id': item,
                    'order': order
                }
                result = execute_batch(_date, _name, criteria)
                msg = _name + " Crawler Jobs"
                msg += " from: {0} | category: {1}".format(_date, item)
                msg += " | result: {0}".format(str(result))
                toLog(msg, 'jobs')
    delete_video()
def timeit(result, uuid, username, address, start_time, ts, *args):
    if AFTER_DONE:
        # End of execution timing
        func_name = args[0].__class__.__full_name__
        te = time.time()
        msg = "After Done (Finish) == {} ==> start from: {} >|< username: {} "
        msg += "-|- time: {:2.4f} sec -|- func: {} -- address: {} -- args: {}"
        if DEBUG_RESULT:
            msg += " -- result: {}"
            msg = msg.format(uuid, start_time, username, te - ts, func_name,
                             address, args[1:], result)
        else:
            msg = msg.format(uuid, start_time, username, te - ts, func_name,
                             address, args[1:])
        toLog(msg, 'request')
        # set_activity_log(username, address, func_name, args[1:])
    return result
def __registerModuleAttributes(self, module, mapping):
    """
    Find all attribute classes in "module" and register them in the
    attribute factory. Attribute classes inherit from the attribute
    parents available in "mapping".
    """
    toLog('RegisterModuleAttributes: processing module: %s' % module,
          'service')
    for obj_name in dir(module):
        obj = getattr(module, obj_name)
        if self.__isClass(obj):
            for klass in mapping:
                toLog(
                    'RegisterModuleAttributes: '
                    'obj = %s klass: %s subclass = %s'
                    % (obj, klass, issubclass(obj, klass)), 'service')
                if issubclass(obj, klass) and obj != klass:
                    mapping[klass](obj)
def run(self, _continue=False):
    if not _continue:
        counter = CAPPED_SIZE
        self.create_capped_collection()
        self.fill_capped_collection()
    else:
        counter = cursor[CAPPED_NAME].count()
    for i in range(counter):
        sort = [("followers", DESCENDING)]
        doc = cursor[CAPPED_NAME].find_one_and_delete({}, sort=sort)
        if doc:
            try:
                sp = gen_sp()
            except Exception as e:
                toLog('Spotify API: {}'.format(str(e)), 'error')
                # Without a client there is nothing to fetch for this doc
                continue
            response = sp.user_playlist_tracks(
                doc['owner_id'], doc['playlist_id'], None, 100, 0
            )
            tracks = response.get('items', [])
            self.save_tracks(tracks, doc)
            one = 'next' in response and response['next'] is not None
            while one and response.get('next', ''):
                if not self.allow_time():
                    return
                try:
                    sp = gen_sp()
                    response = sp.next(response)
                    self.save_tracks(response.get('items', []), doc)
                except SpotifyException:
                    continue
                except Exception as e:
                    toLog("{}".format(e), 'error')
def handle_errors(error, rpc_id):
    result = {}
    if isinstance(error, ProtocolError):
        response_error = loads(history.response)
        toLog(response_error, 'debug')
        try:
            list_some_formal_error = ['ParamsError', 'GeneralError']
            for _error in list_some_formal_error:
                if _error in error.message:
                    result = {error.message[1]: (
                        error.message[0],
                        response_error['error'].get('data'))}
                else:
                    result = response_error['error']['data']
        except KeyError:
            result = response_error
    elif isinstance(error, KeyError):
        response_error = loads(history.response)
        toLog(response_error, 'debug')
        if (rpc_id == response_error['id'] and
                'error' in response_error and
                'fault' in response_error['error']):
            result['message'] = response_error['error'].pop('fault')
            result['code'] = response_error['error'].pop('faultCode')
            result['data'] = response_error['error'].pop('faultString')
        else:
            result['message'] = 'Client RPC Issue!'
    elif isinstance(error, socket_error):
        result['code'] = error[0]
        result['message'] = error[1]
    return result
def catharsis(tracks):
    counter = 0
    for track in tracks['collection']:
        track = pre_catharsis(track)
        track['created_date'] = datetime.datetime.now()
        if not track.get('isrc', None):
            if track.get('publisher_metadata', None):
                track['isrc'] = track["publisher_metadata"].get("isrc", None)
        if 'user' in track:
            if 'username' in track['user']:
                track['username'] = track['user']['username']
            del track['user']
        if 'last_modified' in track:
            track['last_modified'] = parser.parse(track['last_modified'])
        if 'created_at' in track:
            track['created_at'] = parser.parse(track['created_at'])
        try:
            result = cursor_soundcloud.refined_data.insert(track)
        except DuplicateKeyError:
            counter += 1
            result = True
            if (counter % 25) == 0:
                toLog("Duplicate Error: The record already exists", 'error')
        if not result:
            msg = "Crawling Error: The record can't be saved"
            msg += " {0}".format(track)
            toLog(msg, 'error')
def save_to_db(self, playlists):
    ids = []
    for doc in playlists:
        data = {}
        try:
            if ('id' in doc) and doc['id']:
                data['playlist_id'] = doc['id']
            else:
                toLog("{}".format(doc), 'lost_ids')
                continue
            try:
                data['name'] = doc.get('name', '').strip()
            except AttributeError:
                data['name'] = ''
            data['created_date'] = datetime.datetime.now()
            data['href'] = doc.get('href', None)
            data['external_url'] = doc.get(
                'external_urls', {}).get('spotify', None)
            data['uri'] = doc.get('uri', None)
            data['owner_external_url'] = doc.get('owner', {}).get(
                'external_urls', {}).get('spotify', None)
            data['owner_id'] = doc.get('owner', {}).get('id', None)
            data['owner_href'] = doc.get('owner', {}).get('href', None)
            data['owner_uri'] = doc.get('owner', {}).get('uri', None)
            cursor.playlist.replace_one(
                {'playlist_id': data['playlist_id']}, data, upsert=True)
            ids.append(data)
        except AttributeError:
            pass
    try:
        # Start updating playlist info
        self.update_info(ids)
    except Exception:
        toLog(traceback.format_exc(), 'error')
def save_tracks(self, tracks, pl):
    for per, track in enumerate(tracks, 1):
        doc = {}
        if 'track' in track and track['track']:
            artists = ""
            for artist in track['track'].get('artists', []):
                artists += artist['name'] + ", "
            artists = artists[:-2]
            href = track['track'].get('external_urls', {}).get('spotify', "")
            isrc = track['track'].get('external_ids', {}).get('isrc', '')
            doc['song_name'] = track['track'].get('name', "")
            doc['created_date'] = datetime.datetime.now()
            doc['playlist_name'] = pl['name']
            doc['playlist_followers'] = pl['followers']
            doc['playlist_owner'] = pl['owner_id']
            doc['playlist_href'] = pl['external_url']
            doc['playlist_id'] = pl['playlist_id']
            doc['song_id'] = track['track'].get('id', '')
            doc['playlist_description'] = pl['description']
            doc['artist'] = artists.strip()
            doc['href'] = href
            doc['uri'] = track['track'].get('uri', '')
            doc['song_position'] = per
            doc['popularity'] = track['track'].get('popularity', 0)
            doc['isrc'] = isrc
            # NOTE: 'allbum' (sic) matches the existing collection field name
            doc['allbum'] = track['track'].get('album', {}).get('name', '')
            try:
                cursor.tracks.insert(doc)
                self.check_history(doc)
            except DuplicateKeyError:
                pass
            except Exception:
                toLog(traceback.format_exc(), 'error')
def delete_video():
    # now = datetime.datetime.now()
    # last_six_month = now - datetime.timedelta(days=31 * delete_month)
    _date = datetime.datetime.now().replace(hour=2, minute=30)
    last_date = _date - datetime.timedelta(days=7)
    criteria = {
        "published_at": {"$lt": last_date}
    }
    delete_id = cursor.refined_data.find(criteria, {'_id': 1})
    delete_id = delete_id.sort('all_views', DESCENDING)
    delete_id = list(delete_id.skip(70000))
    count = 0
    for _id in delete_id:
        cursor.refined_data.delete_one({'_id': _id['_id']})
        count += 1
    toLog('Removed "%s" videos from DB' % str(count), 'debug')
def job_logger(event):
    if event.code > 512:
        toLog('Job {}, code {}'.format(
            event.job_id, event_code_translator(event.code)), 'jobs')
    elif event.code > 64:
        toLog('Event {} for job {} happened'.format(
            event_code_translator(event.code), event.job_id
        ), 'jobs')
    else:
        toLog('Event {} happened'.format(
            event_code_translator(event.code)), 'jobs'
        )
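# job_logger() matches APScheduler's listener signature; a sketch of how it
# would be attached to the scheduler used elsewhere in this module (EVENT_ALL
# is APScheduler's catch-all event mask; event_code_translator is this
# codebase's own helper):
#
#     from apscheduler.events import EVENT_ALL
#     scheduler.add_listener(job_logger, EVENT_ALL)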
def start_updating_jobs():
    # reactor.callInThread(soundcloud_update,)
    less_today = datetime.datetime.now().replace(hour=2, minute=30, second=0)
    _criteria = {
        'private': {'$ne': True},
        '$or': [
            {'update_video_data': {'$lte': less_today}},
            {'all_views': {'$exists': False}}
        ]
    }
    _projection = {'id': 1}
    toLog('Start updating jobs criteria: {0}'.format(str(_criteria)), 'jobs')
    all_videos = cursor.refined_data.find(
        _criteria, _projection, no_cursor_timeout=True
    )
    for doc in all_videos:
        try:
            _id = doc['id']
            criteria = {'_id': doc['_id']}
            new_data = today_yesterday_data(_id)
            if not new_data:
                # toLog('Unsuccessful update: {0}'.format(_id), 'jobs')
                continue
            _update = {'$set': new_data}
            update = cursor.refined_data.update_one(criteria, _update)
            if not update.raw_result.get('updatedExisting', None):
                msg = "The video with this id"
                msg += " '{0}' can't be updated".format(_id)
                toLog(msg, 'db')
        except Exception as e:
            toLog(str(e), 'error')
    clean_title()
    yt_most_viewed()
def soundcloud_runner():
    toLog("Start crawling soundcloud", 'jobs')
    # client = soundcloud.Client(client_id=SOUNDCLOUD_ID)
    # now = datetime.datetime.now()
    # last_day = now - datetime.timedelta(days=1)
    # last_week = now - datetime.timedelta(days=7)
    # last_month = now - datetime.timedelta(days=31)
    # last_year = now - datetime.timedelta(days=365)
    # ten_years = now - datetime.timedelta(days=(365 * 10))
    # daily = {
    #     'from': last_day.strftime("%Y-%m-%d %H:%M:%S"),
    #     'to': now.strftime("%Y-%m-%d %H:%M:%S")
    # }
    # weekly = {
    #     'from': last_week.strftime("%Y-%m-%d %H:%M:%S"),
    #     'to': last_day.strftime("%Y-%m-%d %H:%M:%S")
    # }
    # monthly = {
    #     'from': last_month.strftime("%Y-%m-%d %H:%M:%S"),
    #     'to': last_week.strftime("%Y-%m-%d %H:%M:%S")
    # }
    # yearly = {
    #     'from': last_year.strftime("%Y-%m-%d %H:%M:%S"),
    #     'to': last_month.strftime("%Y-%m-%d %H:%M:%S")
    # }
    # ten = {
    #     'from': ten_years.strftime("%Y-%m-%d %H:%M:%S"),
    #     'to': last_year.strftime("%Y-%m-%d %H:%M:%S")
    # }
    # date_list = [
    #     (daily, 'Daily'),
    #     (weekly, 'Weekly'),
    #     (monthly, 'Monthly'),
    #     (yearly, 'Yearly'),
    #     (ten, 'Ten years')
    # ]
    kind_list = [
        'top',       # Top 50
        'trending'   # New & Hot
    ]
    genres_list = [
        "all-music", "all-audio", "alternativerock", "ambient", "classical",
        "country", "danceedm", "dancehall", "deephouse", "disco", "drumbass",
        "dubstep", "electronic", "folksingersongwriter", "hiphoprap",
        "house", "indie", "jazzblues", "latin", "metal", "piano", "pop",
        "rbsoul", "reggae", "reggaeton", "rock", "soundtrack", "techno",
        "trance", "trap", "triphop", "world", "audiobooks", "business",
        "comedy", "entertainment", "learning", "newspolitics",
        "religionspirituality", "science", "sports", "storytelling",
        "technology",
    ]
    proxies = {
        'http': 'http://*****:*****@170.130.59.249:3128',
        'https': 'https://*****:*****@170.130.59.249:3128'
    }
    headers = {'User-Agent': 'Maryam&Ali'}
    for kind in kind_list:
        for genre in genres_list:
            offset = 0
            for i in range(1, num_pages + 1):
                url = "https://api-v2.soundcloud.com"
                url += "/charts?kind={0}".format(kind)
                url += "&genre=soundcloud:genres:{0}&client".format(genre)
                url += "_id={0}&offset={1}&".format(SOUNDCLOUD_ID, offset)
                url += "limit={0}&linked_partitioning=1".format(page_length)
                time.sleep(0.15)
                data = requests.get(url, headers=headers, proxies=proxies)
                # data = requests.get(url)
                try:
                    loads_data = data.json()
                    if loads_data and 'error' not in loads_data:
                        catharsis(loads_data)
                except Exception as e:
                    print str(e)
                offset += page_length
    toLog("End crawling soundcloud", 'jobs')
from config.settings import installed_component
from core import toLog

__all__ = installed_component

try:
    # Because we want to import using a variable, do it this way:
    # create a global object containing our components.
    import_component = __import__('services.plugins', fromlist=["*"])
    globals()['services.plugins'] = import_component
except ImportError as e:
    error = "{}".format(e)
    toLog(error, 'error')
def yt_most_viewed():
    toLog('Start Migration to MySQL', 'db')
    mydb = MySQLdb.connect(
        SQL_HOST, SQL_USER, SQL_PASS, SQL_DB,
        charset='utf8mb4', use_unicode=True
    )
    sql_cursor = mydb.cursor()
    sql_cursor.execute("SET NAMES utf8mb4;")
    sql_cursor.execute("SET CHARACTER SET utf8mb4;")
    sql_cursor.execute("SET character_set_connection=utf8mb4;")
    query = "ALTER DATABASE newdatabase CHARACTER SET = utf8mb4"
    query += " COLLATE = utf8mb4_unicode_ci;"
    sql_cursor.execute(query)
    query = "ALTER TABLE songs_chart CONVERT TO CHARACTER SET"
    query += " utf8mb4 COLLATE utf8mb4_unicode_ci;"
    sql_cursor.execute(query)
    _date = datetime.datetime.now().replace(hour=2, minute=30)
    last_date = _date - datetime.timedelta(days=int(yt_settings('last_date')))
    criteria = {
        "$or": [
            {
                "update_video_data": {"$gt": _date},
                "daily_views_yesterday": {"$gt": 0}
            },
            {
                "published_at": {"$gte": last_date}
            }
        ]
    }
    sql_column = {
        'published_at': 'ReleaseDate',
        'href': 'YTURL',
        'dislikes': 'YTDisLikes',
        'likes': 'YTLikes',
        'id': 'YTVideoID',
        'daily_views_today': 'YTDailyViews',
        'title': 'YTTitle',
        'comment_count': 'YTComments',
        'channel_title': 'YTChannel',
        'description': 'YTDescription',
        'daily_views_yesterday': 'YTDailyViewsYest',
        'channel_id': 'YTChannelID',
        'all_views': 'YTAllTimeViews',
        'category_name': 'YTCategory',
        'song_title': 'Song',
        'artist': 'Artist'
    }
    extra_int_columns = [
        'Met', 'Flag', 'Cover', 'Favorite', 'Listened_To', 'Playlist', 'Omit'
    ]
    extra_str_columns = [
        'Brand_new_artist', 'Total', 'Brand_new_song', 'Tags', 'Album',
        'Chart_name', 'Genre', 'Label', 'Charts_today', 'Charts_today_type',
        'Manager', 'Agent', 'Lawyer', 'Notes', 'Negotiation', 'Price',
        'Chart_name_2'
    ]
    projection = {}
    for i in sql_column.keys():
        projection[i] = 1
    data = cursor.refined_data.find(
        criteria, projection, no_cursor_timeout=True
    )
    new_data = data.sort('daily_views_today', DESCENDING).limit(50000)
    if data:
        count = 1
        for doc in new_data:
            new_doc = {}
            for k, v in doc.items():
                if k != '_id':
                    if k == 'published_at':
                        new_doc[sql_column[k]] = str(v.date())
                    elif isinstance(v, (int, float, long)):
                        new_doc[sql_column[k]] = v
                    elif k == 'description':
                        if isinstance(v, basestring):
                            text = v[:50].encode('utf8') + ' ...'
                        else:
                            text = unicode(v[:50]).encode('utf8') + ' ...'
                        new_doc[sql_column[k]] = text
                    else:
                        if isinstance(v, basestring):
                            new_doc[sql_column[k]] = v.encode('utf8')
                        else:
                            new_doc[sql_column[k]] = unicode(v).encode('utf8')
            for item in extra_str_columns:
                new_doc[item] = ""
            for item in extra_int_columns:
                new_doc[item] = 0
            new_doc['Date'] = datetime.datetime.now().replace(hour=6, minute=0)
            new_doc['Date'] = str(new_doc['Date'].date())
            new_doc['Chart_type'] = 'YouTube'
            new_doc['Rank'] = count
            try:
                table = 'songs_chart'
                sql = insert_from_dict(table, new_doc)
                sql_cursor.execute(sql, new_doc)
                mydb.commit()
            except MySQLdb.IntegrityError as e:
                print str(e)
                # WARNING: no WHERE clause; this updates every row in the table
                qry = 'UPDATE songs_chart SET {}'.format(
                    ', '.join('{}=%s'.format(k) for k in new_doc)
                )
                sql_cursor.execute(qry, new_doc.values())
                mydb.commit()
            except Exception as e:
                print str(e)
            count += 1
    toLog('End Migration to MySQL', 'db')
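# insert_from_dict() is referenced but not defined here; a minimal sketch that
# builds a parameterized INSERT using MySQLdb's pyformat placeholders, so the
# row dict itself can be passed to cursor.execute() as above:
#
#     def insert_from_dict(table, doc):
#         columns = ', '.join(doc.keys())
#         placeholders = ', '.join('%({0})s'.format(k) for k in doc)
#         return 'INSERT INTO {0} ({1}) VALUES ({2})'.format(
#             table, columns, placeholders)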
def sc_most_played():
    toLog('Start Migration to MySQL', 'db')
    mydb = MySQLdb.connect(
        SQL_HOST, SQL_USER, SQL_PASS, SQL_DB,
        charset='utf8mb4', use_unicode=True
    )
    sql_cursor = mydb.cursor()
    sql_cursor.execute("SET NAMES utf8mb4;")
    sql_cursor.execute("SET CHARACTER SET utf8mb4;")
    sql_cursor.execute("SET character_set_connection=utf8mb4;")
    query = "ALTER DATABASE newdatabase CHARACTER SET = utf8mb4"
    query += " COLLATE = utf8mb4_unicode_ci;"
    sql_cursor.execute(query)
    query = "ALTER TABLE songs_chart CONVERT TO CHARACTER SET"
    query += " utf8mb4 COLLATE utf8mb4_unicode_ci;"
    sql_cursor.execute(query)
    _date = datetime.datetime.now().replace(hour=2, minute=30)
    last_date = _date - datetime.timedelta(days=1)
    criteria = {
        "$or": [
            {
                "update_track_data": {"$gt": _date},
                "daily_playback_count_yesterday": {"$gt": 0}
            },
            {
                "created_at": {"$gte": last_date}
            }
        ]
    }
    sql_column = {
        'created_at': 'ReleaseDate',
        'permalink_url': 'SCURL',
        'label_name': 'Label',
        'genre': 'Genre',
        'download_count': 'SCDownloads',
        'id': 'SCSongID',
        'daily_playback_count_today': 'SCDailyStreams',
        'isrc': 'ISRC',
        'title': 'Song',
        'favoritings_count': 'SCFavorites',
        'comment_count': 'SCComments',
        'description': 'SCDescription',
        'daily_playback_count_yesterday': 'SCDailyStreamsYest',
        'artwork_url': 'SCArtWorkURL',
        'playback_count': 'SCAllTimeStreams',
        'user_id': 'SCUserID',
        'score': 'SCScore',
        'reposts_count': 'SCReposts',
        'username': '******',
        'artist': 'Artist'
    }
    extra_int_columns = [
        'Met', 'Flag', 'Cover', 'Favorite', 'Playlist', 'Listened_To', 'Omit'
    ]
    extra_str_columns = [
        'YTTitle', 'Brand_new_artist', 'Total', 'Brand_new_song', 'Tags',
        'Chart_name', 'Charts_today', 'Charts_today_type', 'Manager',
        'Agent', 'Lawyer', 'Notes', 'Negotiation', 'Price', 'Chart_name_2'
    ]
    projection = {'publisher_metadata': 1}
    for i in sql_column.keys():
        projection[i] = 1
    data = cursor_soundcloud.refined_data.find(
        criteria, projection, no_cursor_timeout=True
    )
    new_data = data.sort('daily_playback_count_today', DESCENDING).limit(50000)
    if data:
        count = 1
        for doc in new_data:
            new_doc = {}
            for k, v in doc.items():
                ignore_list = ['publisher_metadata', '_id', 'artist', 'isrc']
                if (k == 'publisher_metadata') and doc[k]:
                    new_doc['Album'] = v.get('album_title', " ")
                    new_doc['ISRC'] = v.get('isrc', " ")
                    artist = v.get('artist', "")
                    if not artist:
                        doc['artist'] = doc['username']
                        new_doc['Artist'] = doc['username']
                    else:
                        doc['artist'] = artist
                        new_doc['Artist'] = artist
                elif (k == 'publisher_metadata') and not doc.get(k, None):
                    artist = doc.get('artist', None)
                    if artist:
                        new_doc['Artist'] = artist
                    else:
                        new_doc['Artist'] = doc['username']
                elif k not in ignore_list:
                    if k == 'created_at':
                        new_doc[sql_column[k]] = str(v.date())
                    elif isinstance(v, (int, float, long)):
                        new_doc[sql_column[k]] = v
                    elif isinstance(v, dict):
                        continue
                    elif k == 'description':
                        if isinstance(v, basestring):
                            new_doc[sql_column[k]] = (
                                v[:50].encode('utf8') + ' ...'
                            )
                        else:
                            try:
                                new_doc[sql_column[k]] = (
                                    unicode(v[:50]).encode('utf8') + ' ...'
                                )
                            except Exception as e:
                                new_doc[sql_column[k]] = ''
                    else:
                        if isinstance(v, basestring):
                            new_doc[sql_column[k]] = v.encode('utf8')
                        else:
                            new_doc[sql_column[k]] = unicode(v).encode('utf8')
                else:
                    if 'Album' not in new_doc:
                        new_doc['Album'] = " "
            for item in extra_str_columns:
                new_doc[item] = " "
            for item in extra_int_columns:
                new_doc[item] = 0
            new_doc['Date'] = datetime.datetime.now().replace(hour=6, minute=0)
            new_doc['Date'] = str(new_doc['Date'].date())
            new_doc['Chart_type'] = 'SoundCloud'
            new_doc['Rank'] = count
            try:
                table = 'songs_chart'
                sql = insert_from_dict(table, new_doc)
                sql_cursor.execute(sql, new_doc)
                mydb.commit()
            except MySQLdb.IntegrityError as e:
                print str(e)
                # WARNING: no WHERE clause; this updates every row in the table
                qry = 'UPDATE songs_chart SET {}'.format(
                    ', '.join('{}=%s'.format(k) for k in new_doc)
                )
                sql_cursor.execute(qry, new_doc.values())
                mydb.commit()
            except Exception as e:
                print str(e)
            count += 1
    toLog('End Migration to MySQL', 'db')
def get_video_info(video_id):
    doc = {}
    try:
        video = open_url_api(video_id)
        if 'items' in video:
            if video['items'] == []:
                doc['private'] = True
                doc['update_video_data'] = datetime.datetime.now()
                return doc
        snippet = video.get('items', [])[0].get('snippet', {})
        statistics = video.get('items', [])[0].get('statistics', {})
        doc['thumbnails'] = snippet.get('thumbnails', '')
        doc['title'] = snippet.get('title', '')
        doc['channel_id'] = snippet.get('channelId', '')
        doc['category_id'] = snippet.get('categoryId', '')
        doc['published_at'] = parser.parse(snippet.get('publishedAt', ''))
        doc['channel_title'] = snippet.get('channelTitle', '')
        doc['description'] = snippet.get('description', '')
        doc['keywords'] = snippet.get('tags', '')
        doc['comment_count'] = int(statistics.get('commentCount', 0))
        doc['dislikes'] = int(statistics.get('dislikeCount', 0))
        doc['favorite_count'] = int(statistics.get('favoriteCount', 0))
        doc['all_views'] = int(statistics.get('viewCount', 0))
        doc['likes'] = int(statistics.get('likeCount', 0))
        if ' - ' in doc.get('title', ''):
            splited = doc['title'].split(' - ')
            doc['artist'] = splited[0].strip()
            doc['song_title'] = ' - '.join(splited[1:])
            doc['song_title'] = doc['song_title'].strip()
        else:
            doc['artist'] = doc.get('title', '')
            doc['song_title'] = doc.get('title', '')
        doc['has_yesterday'] = True
        doc['update_video_data'] = datetime.datetime.now()
        return doc
    except Exception as e:
        data_log = {'video_id': video_id}
        data_log['type'] = 'update_data'
        data_log['date'] = datetime.datetime.now()
        if 'list index out of range' in str(e):
            msg = "Video Id: {0} can't be fetched".format(video_id)
            data_log['reason'] = msg
            toLog(msg, 'error')
        else:
            toLog(e, 'error')
            data_log['reason'] = str(e)
        # cursor.logs.insert(data_log)
        return doc
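# open_url_api() is referenced but not shown; a plausible sketch using the
# YouTube Data API v3 videos().list endpoint (build_youtube_api is the helper
# executor_crawl() uses below):
#
#     def open_url_api(video_id):
#         youtube = build_youtube_api()
#         return youtube.videos().list(
#             part='snippet,statistics',
#             id=video_id
#         ).execute()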
def executor_crawl(_date, name, criteria, next_page_token=None):
    youtube = build_youtube_api()
    # Call the search.list method to retrieve results matching the specified
    # query term. Daily crawls are open-ended; the other ranges are bounded
    # by publishedBefore.
    params = {
        'q': criteria['q'],
        'maxResults': criteria['max_results'],
        'part': "id,snippet",
        'type': 'video',
        'order': criteria['order'],
        'videoCategoryId': criteria['category_id'],  # Music/Entertainment
        'publishedAfter': _date[0].strftime('%Y-%m-%dT%H:%M:%SZ'),
    }
    if next_page_token:
        params['pageToken'] = next_page_token
    if name != 'Daily':
        params['publishedBefore'] = _date[1].strftime('%Y-%m-%dT%H:%M:%SZ')
    search_response = youtube.search().list(**params).execute()
    # Add each matching video to the database.
    duplicate = 0
    result = True
    for search_result in search_response.get("items", []):
        _video = {'created_date': datetime.datetime.now()}
        category_trans = {'24': 'Entertainment', '10': 'Music'}
        _video['category_name'] = category_trans.get(
            criteria['category_id'], 'UnKnown'
        )
        if search_result["kind"] in ("youtube#video", "youtube#searchResult"):
            _publish = parser.parse(search_result['snippet']['publishedAt'])
            _video['href'] = 'https://www.youtube.com/watch?v='
            _video['img'] = search_result['snippet']['thumbnails']['default']
            _video['title'] = search_result['snippet']['title']
            _video['channel_id'] = search_result['snippet']['channelId']
            _video['channel_title'] = search_result['snippet']['channelTitle']
            _video['published_at'] = _publish
            _video['description'] = search_result['snippet']['description']
            if 'id' in search_result['snippet']:
                _video['href'] += search_result['snippet']['id']['videoId']
                _video['id'] = search_result['snippet']['id']['videoId']
            else:
                _video['href'] += search_result['id']['videoId']
                _video['id'] = search_result['id']['videoId']
            try:
                result = cursor.refined_data.insert(_video)
            except DuplicateKeyError:
                result = True
                if (duplicate % 1000) == 0:
                    toLog("Duplicate Error: 1000 records", 'error')
                duplicate += 1
            if not result:
                msg = "Crawling Error: The record can't be saved"
                msg += " {0}".format(search_result)
                toLog(msg, 'error')
        else:
            toLog("UnHandled Crawling: {0}".format(search_result), 'debug')
        if not result:
            toLog("Crawling Error: The record can't be saved", 'error')
    # Create next page token
    next_page_token = search_response.get("nextPageToken")
    return next_page_token
def __init__(self, msg, code=401):
    jsonrpclib.Fault.__init__(self, code, msg)
    msg = "Message: {0}, Code: {1}".format(msg, code)
    toLog(msg, 'error')
def generateException(self):
    release_error = eval('%s' % self.type_error)(self.msg)
    toLog(str(release_error), 'error')
    return release_error
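# eval() on self.type_error executes arbitrary text; a safer sketch resolves
# the class from an explicit whitelist instead (ParamsError and GeneralError
# are the error names this codebase handles in handle_errors()):
#
#     ERROR_CLASSES = {
#         'GeneralError': GeneralError,
#         'ParamsError': ParamsError,
#     }
#
#     def generateException(self):
#         klass = ERROR_CLASSES.get(self.type_error, GeneralError)
#         release_error = klass(self.msg)
#         toLog(str(release_error), 'error')
#         return release_error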