def push_random_kols_to_queue(self, base_path='..', number=300):
    print("Push %s random kol ids to smcc service to get their posts in the next crawl" % str(number))
    channel = self._connection.channel()
    queue_name = 'facebook_scanning'
    queue_state = channel.queue_declare(queue_name, durable=True)
    with open_utf8_file_to_read(
            get_independent_os_path(
                [base_path, 'backend', 'input', 'kols_list.txt'])) as stream:
        kols_list = [
            x.strip() for x in stream.read().split('\n') if x.strip() != ''
        ]
    chosen = set()
    count = 0
    number_of_kols = len(kols_list)
    # pick unique random kol ids; also bound by number_of_kols so the loop
    # cannot spin forever when number is larger than the kols list
    while count < number and count < number_of_kols:
        index = randint(0, number_of_kols - 1)
        kol_id = kols_list[index]
        if kol_id not in chosen:
            chosen.add(kol_id)
            count += 1
    for kol_id in chosen:
        body = kol_id
        print(kol_id)
        channel.basic_publish(exchange='', routing_key=queue_name, body=body)
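# Hedged usage sketch (not part of the original code): the channel calls above
# match pika's BlockingConnection API, so a consumer on the smcc side could
# read the pushed kol ids back roughly as below. The use of pika and the
# handle_kol_id callback name are assumptions for illustration only.
#
#   import pika
#
#   connection = pika.BlockingConnection(pika.ConnectionParameters('localhost'))
#   channel = connection.channel()
#   channel.queue_declare('facebook_scanning', durable=True)
#
#   def handle_kol_id(ch, method, properties, body):
#       # each message body is one kol id pushed by push_random_kols_to_queue()
#       kol_id = body.decode('utf-8')
#       print('Got kol id to crawl:', kol_id)
#
#   channel.basic_consume(queue='facebook_scanning',
#                         on_message_callback=handle_kol_id,
#                         auto_ack=True)
#   channel.start_consuming()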
def load_data(self, crawl_newspaper=True, crawl_kols=False,
              crawl_kols_by_smcc=False, random_kols=True,
              random_fb_account=True, max_kols=5, base_path=None):
    '''
    input
    -----
    crawl_newspaper: crawl newspaper configs in /backend/input/config.yaml
    crawl_kols: crawl kol posts from the kol id list in /backend/input/kols_list.txt using a facebook bot
    random_kols: choose random (max_kols) kols from the kols list to crawl
    random_fb_account: use a random fb bot (preset in /backend/input/fb_list.txt) to crawl kol posts
    crawl_kols_by_smcc: crawl kol posts using the smcc service (push some kol ids to a queue and get posts back from the queue). Chooses random kol ids (100 in number by default) and creates only one webconfig with crawl_type = "kols smcc"
    base_path: specify path to the resources folder
    '''
    if not base_path:
        base_path = os.environ['DOCBAO_BASE_DIR']
    #print(self._config)
    stream = open_utf8_file_to_read(self._filename)
    self._config = yaml.full_load(stream)
    stream.close()

    newspaper_list = []
    if not crawl_newspaper:
        self.replace_crawl_list([])
    else:
        # crawl newspapers last to init browsers with random profiles first
        newspaper_list = self.get_newspaper_list()
        self.replace_crawl_list([])

    if crawl_kols:
        # get kols_list
        kols_list = []
        with open_utf8_file_to_read(self._kol_filename) as stream:
            kols_list = [
                x for x in stream.read().split('\n') if x.strip() != ''
            ]

        # get fb account list
        fb_list = []
        if random_fb_account:
            with open_utf8_file_to_read(self._fb_account_filename) as stream:
                fb_list = [
                    x for x in stream.read().split('\n') if x.strip() != ''
                ]

        count = 0
        index = 0
        chosen = set()
        while count < max_kols and count < len(kols_list):  # finish when max_kols are chosen
            count += 1
            if random_kols:
                index = random.randint(0, len(kols_list) - 1)
                while index in chosen:  # no repeated value
                    index = random.randint(0, len(kols_list) - 1)
                chosen.add(index)
                print(f"Choose random kols: {kols_list[index]}")  # print chosen kol for debugging
            else:
                index += 1
                if index == len(kols_list):  # end of kols list
                    break

            if ';' not in kols_list[index]:  # this line contains just an id, not name;url
                kol_name = 'unknown_id_' + kols_list[index]
                crawl_url = kols_list[index].strip()  # profile id
            else:
                kol_name = kols_list[index].split(';')[0]
                crawl_url = kols_list[index].split(';')[1]

            webconfig = WebConfig()
            webconfig.load_default_config(
                'facebook user',
                get_independent_os_path(
                    [base_path, 'resources', 'configs', 'newspaper']))
            webconfig.set_webname(kol_name)
            webconfig.set_config('crawl_url', crawl_url)
            webconfig.set_config('remove_me', True)  # tag for deletion when the program finishes

            # set a random fb account to crawl with
            if random_fb_account:
                profile_index = random.randint(0, len(fb_list) - 1)
                profile = fb_list[profile_index]
                webconfig.set_config('browser_profile', profile)

            self.add_newspaper(webconfig)
    # print(self._config)

    # crawl kols by smcc
    if crawl_kols_by_smcc:
        # create a 'crawl_type: kols smcc' WebConfig
        webconfig = WebConfig()
        webconfig.load_default_config(
            'facebook user',
            get_independent_os_path(
                [base_path, 'resources', 'configs', 'newspaper']))
        webconfig.set_config('crawl_type', 'kols smcc')
        webconfig.set_config('remove_me', True)
        webconfig.set_config('timezone', 'UTC')
        webconfig.set_webname('kol posts')
        webconfig.set_config('minimum_duration_between_crawls', -5)
        self.add_newspaper(webconfig)

    # append newspaper list
    if crawl_newspaper:
        for newspaper in newspaper_list:
            self.add_newspaper(newspaper, beginning=True)
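# Hedged usage sketch (illustrative, not from the original code): assuming this
# class is the crawler's config object with the attributes used above
# (self._filename, self._kol_filename, self._fb_account_filename), a run that
# mixes newspapers with a few random kols crawled by facebook bots might be:
#
#   config.load_data(crawl_newspaper=True,
#                    crawl_kols=True,
#                    random_kols=True,
#                    random_fb_account=True,
#                    max_kols=5,
#                    base_path='/opt/docbao')   # hypothetical path
#
# while a run that only adds the single 'kols smcc' webconfig would use:
#
#   config.load_data(crawl_newspaper=False, crawl_kols_by_smcc=True)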
def load_config_from_file(self, filepath):
    with open_utf8_file_to_read(filepath) as stream:
        self._web = yaml.full_load(stream)[0]
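# Hedged note on the expected file format: load_config_from_file() keeps only
# the first element of the loaded YAML document, so `filepath` should contain a
# YAML list whose first item is a single web config mapping. The keys below are
# examples inferred from the set_config()/set_webname() calls above, not a
# confirmed schema:
#
#   - webname: 'kol posts'
#     crawl_type: 'kols smcc'
#     crawl_url: 'https://www.facebook.com/some_profile'
#     remove_me: true
#     timezone: 'UTC'
#     minimum_duration_between_crawls: -5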