def __addPosts(self):
    """ It will add Post for a particular thread

    Walks every post body div ('pBody') in the thread soup, builds a page
    dict per post via __getData, and appends new posts to self.pages.
    Returns False when the review divs cannot be located; otherwise None.
    """
    try:
        # The 'pBody' div is the inner body; its parent div carries the
        # full post markup that __getData expects.
        reviews = [ x.findParent('div') for x in self.soup.findAll('div','pBody')]
    except:
        log.exception(self.log_msg('Reviews are not found'))
        return False
    for i, review in enumerate(reviews):
        post_type = ""
        # First post of a freshly-seen thread is the Question; every other
        # post (and all posts once self.post_type is cleared) is a Suggestion.
        if i==0 and self.post_type:
            post_type = "Question"
            self.post_type = False
        else:
            post_type = "Suggestion"
        page = self.__getData( review , post_type )
        log.info(self.log_msg(page))
        try:
            # Hash the page BEFORE the bookkeeping fields are added below,
            # so the stored version hash reflects content only.
            review_hash = get_hash( page )
            log.info(page)
            # Deduplication key: post data + title.
            unique_key = get_hash( {'data':page['data'],'title':page['title']})
            if checkSessionInfo(self.genre, self.session_info_out, unique_key,
                                self.task.instance_data.get('update'),
                                parent_list=[self.parent_uri]):
                log.info(self.log_msg('session info return True'))
                continue
            result=updateSessionInfo(self.genre, self.session_info_out, unique_key,
                                     review_hash,'Thread',
                                     self.task.instance_data.get('update'),
                                     parent_list=[self.parent_uri])
            if not result['updated']:
                log.info(self.log_msg('result not updated'))
                continue
            #page['first_version_id']=result['first_version_id']
            #page['parent_id']= '-'.join(result['id'].split('-')[:-1])
            #page['id'] = result['id']
            # parent_path excludes the post itself; path includes it.
            parent_list = [self.parent_uri]
            page['parent_path']=copy.copy(parent_list)
            parent_list.append(unique_key)
            page['path']=parent_list
            # Task/provenance bookkeeping fields.
            page['priority']=self.task.priority
            page['level']=self.task.level
            page['pickup_date'] = datetime.strftime(datetime.utcnow()
                                                    ,"%Y-%m-%dT%H:%M:%SZ")
            page['connector_instance_log_id'] = self.task.connector_instance_log_id
            page['connector_instance_id'] = self.task.connector_instance_id
            page['workspace_id'] = self.task.workspace_id
            page['client_id'] = self.task.client_id
            page['client_name'] = self.task.client_name
            page['last_updated_time'] = page['pickup_date']
            page['versioned'] = False
            page['entity'] = 'Review'
            page['category'] = self.task.instance_data.get('category','')
            page['task_log_id']=self.task.id
            page['uri'] = self.currenturi
            page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
            self.pages.append( page )
            #log.info(page)
            log.info(self.log_msg('Review Added'))
        except:
            log.exception(self.log_msg('Error while adding session info'))
def __addPost(self, post,is_question=False):
    '''This will add the post

    Extracts one post from the *post* tag via __getData and appends it to
    self.pages with session info recorded.  Returns True when the caller
    should continue with further posts; False when this post was already
    crawled (session-info hit), which callers may use to stop paging.
    '''
    try:
        page = self.__getData(post,is_question)
        if not page:
            log.info(self.log_msg('No data found in url %s'%self.currenturi))
            return True
        # Deduplication key: post data + title.
        unique_key = get_hash({'data':page['data'], 'title':page['title']})
        if checkSessionInfo(self.__genre, self.session_info_out,
                            unique_key, self.task.instance_data.get('update'),
                            parent_list=[self.task.instance_data['uri']]):
            log.info(self.log_msg('Session info returns True for uri %s'%
                                  self.currenturi))
            return False
        result = updateSessionInfo(self.__genre, self.session_info_out, unique_key,
                                   get_hash( page ),'forum',
                                   self.task.instance_data.get('update'),
                                   parent_list=[self.task.instance_data['uri']])
        if result['updated']:
            # parent_path excludes the post's own key; path includes it.
            page['parent_path'] = [self.task.instance_data['uri']]
            page['path'] = [self.task.instance_data['uri'], unique_key ]
            page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
            # Common task fields (client, workspace, dates, ...) merged in.
            page.update(self.__task_elements_dict)
            self.pages.append(page)
        else:
            log.info(self.log_msg('Update session info returns False for \
                url %s'%self.currenturi))
    except:
        log.exception(self.log_msg('Cannot add the post in url %s'%self.currenturi))
    return True
def __addPost(self, post):
    """ This will take the post tag , and fetch data and meta data
    and add it to self.pages

    Returns True when the caller should continue; False when this post
    was already crawled (session-info hit).
    """
    try:
        page = self.__getData(post)
        if not page:
            log.info(self.log_msg('page contains empty data, getdata \
            returns False for uri %s'%self.currenturi))
            return True
        # Deduplication key: hash of the entire page dict (no parent list
        # is used in this connector's session bookkeeping).
        unique_key = get_hash(page)
        if checkSessionInfo(self.__genre, self.session_info_out, unique_key,
                            self.task.instance_data.get('update')):
            log.info(self.log_msg('Session info returns True for uri %s'%unique_key))
            return False
        result = updateSessionInfo(self.__genre, self.session_info_out, unique_key,
                                   get_hash( page ),'forum',
                                   self.task.instance_data.get('update'))
        if result['updated']:
            # Posts are stored flat: no parent path, only their own key.
            page['parent_path'] = []
            page['path'] = [unique_key]
            page['uri'] = self.currenturi
            page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
            log.info(page)
            # Common task fields merged in.
            page.update(self.__task_elements_dict)
            self.pages.append(page)
        else:
            log.info(self.log_msg('Update session info returns False for \
            url %s'%self.currenturi))
    except:
        log.exception(self.log_msg('Cannot add the post for the uri %s'%self.currenturi))
    return True
def __addPost(self, post):
    '''It will add the post

    Builds a page dict from *post* and appends it to self.pages.
    Returns True to continue crawling, False on a session-info hit or
    on an unexpected exception.
    '''
    try:
        page = self.__getData(post)
        if not page:
            return True
        # Deduplication key: post data only (no title in this connector).
        unique_key = get_hash( {'data' : page['data'] })
        if checkSessionInfo('review', self.session_info_out, unique_key,
                            self.task.instance_data.get('update'),
                            parent_list = [self.currenturi]):
            log.info(self.log_msg('Session info returns True'))
            return False
        result=updateSessionInfo('review', self.session_info_out, unique_key,
                                 get_hash( page ),'Review',
                                 self.task.instance_data.get('update'),
                                 parent_list=[self.currenturi])
        if not result['updated']:
            log.info(self.log_msg('Update session info returns False'))
            return True
        page['path'] = [self.currenturi]
        page['parent_path'] = []
        #page['path'].append(unique_key)
        page['uri'] = self.currenturi
        page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
        page['entity'] = 'post'
        # Common task fields merged in.
        page.update(self.__task_elements_dict)
        self.pages.append(page)
        log.info(page)
        log.info(self.log_msg('Post Added'))
        return True
    except:
        log.exception(self.log_msg('Error while adding session info'))
        return False
def __addPosts(self):
    ''
    # Adds every post of the current thread to self.pages.  Posts are the
    # divs whose id starts with 'edit'.  Returns False when no reviews are
    # found; otherwise None.
    try:
        reviews =self.soup.findAll('div',id=re.compile('^edit.*?'))
        if not reviews:
            log.info(self.log_msg('No reviews found'))
            return False
    except:
        log.exception(self.log_msg('Reviews are not found'))
        return False
    for i, review in enumerate(reviews):
        # NOTE: this initial assignment is immediately overwritten by the
        # if/else below on every iteration.
        post_type = "Question"
        # First post of a freshly-seen thread is the Question; the rest
        # are Suggestions.
        if i==0 and self.post_type:
            post_type = "Question"
            self.post_type = False
        else:
            post_type = "Suggestion"
        page = self.__getData( review , post_type )
        if not page:
            log.info(self.log_msg('no page is sent back'))
            continue
        try:
            review_hash = get_hash( page )  # not changed ,bcoz, we already crawled
            # Deduplication key: post data + title.
            unique_key = get_hash( {'data':page['data'],'title':page['title']})
            if checkSessionInfo(self.genre, self.session_info_out, unique_key,
                                self.task.instance_data.get('update'),
                                parent_list=[self.parent_uri]):
                continue
            result=updateSessionInfo(self.genre, self.session_info_out, unique_key,
                                     review_hash,'Review',
                                     self.task.instance_data.get('update'),
                                     parent_list=[self.parent_uri])
            if not result['updated']:
                continue
            # parent_path excludes the post itself; path includes it.
            parent_list = [ self.parent_uri ]
            page['parent_path'] = copy.copy(parent_list)
            parent_list.append( unique_key )
            page['path']=parent_list
            # Task/provenance bookkeeping fields.
            page['priority']=self.task.priority
            page['level']=self.task.level
            page['pickup_date'] = datetime.strftime(datetime.utcnow()
                                                    ,"%Y-%m-%dT%H:%M:%SZ")
            page['connector_instance_log_id'] = self.task.connector_instance_log_id
            page['connector_instance_id'] = self.task.connector_instance_id
            page['workspace_id'] = self.task.workspace_id
            page['client_id'] = self.task.client_id
            page['client_name'] = self.task.client_name
            page['last_updated_time'] = page['pickup_date']
            page['versioned'] = False
            page['entity'] = 'Review'
            page['category'] = self.task.instance_data.get('category','')
            page['task_log_id']=self.task.id
            #page['uri'] = self.currenturi #Skumar
            # NOTE(review): page['uri'] is expected to be set by __getData
            # here (the override above is commented out) -- confirm.
            page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
            self.pages.append( page )
            #log.info(page)
            log.info(self.log_msg('Review Added'))
        except:
            log.exception(self.log_msg('Error while adding session info'))
def __addPosts(self):
    """ It will add Post for a particular thread

    The thread's message table is split on the '<!-- Start Message head -->'
    marker; each fragment (one per post) is re-parsed with BeautifulSoup.
    Returns False when the table cannot be found; otherwise None.
    """
    try:
        reviews = [ BeautifulSoup(x) for x in self.soup.find('table','Frm_MsgTable').__str__().split('<!-- Start Message head -->')[1:]]
    except:
        log.exception(self.log_msg('Reviews are not found'))
        return False
    post_type = "Question"
    log.info([review.find('a')['name'] for review in reviews])
    for i, review in enumerate(reviews):
        # First post of a freshly-seen thread is the Question; every other
        # post is a Suggestion.
        if i==0 and self.post_type:
            post_type = "Question"
            self.post_type = False
        else:
            post_type = "Suggestion"
        page = self.__getData( review , post_type )
        if not page:
            log.info(self.log_msg('Todays Post , so, continue with other post'))
            continue
        try:
            # Hash the page BEFORE the bookkeeping fields are added below.
            review_hash = get_hash( page )
            #unique_key = review.find('a')['name']
            unique_key = get_hash( {'data':page['data'],'title':page['title']})
            if checkSessionInfo(self.genre, self.session_info_out, unique_key,
                                self.task.instance_data.get('update'),
                                parent_list=[self.parent_uri]):
                continue
            result=updateSessionInfo(self.genre, self.session_info_out, unique_key,
                                     review_hash,'Review',
                                     self.task.instance_data.get('update'),
                                     parent_list=[self.parent_uri])
            if not result['updated']:
                continue
            # parent_path excludes the post itself; path includes it.
            parent_list = [self.parent_uri]
            page['parent_path']=copy.copy(parent_list)
            parent_list.append(unique_key)
            page['path']=parent_list
            # Task/provenance bookkeeping fields.
            page['priority']=self.task.priority
            page['level']=self.task.level
            page['pickup_date'] = datetime.strftime(datetime.utcnow()
                                                    ,"%Y-%m-%dT%H:%M:%SZ")
            page['connector_instance_log_id'] = self.task.connector_instance_log_id
            page['connector_instance_id'] = self.task.connector_instance_id
            page['workspace_id'] = self.task.workspace_id
            page['client_id'] = self.task.client_id
            page['client_name'] = self.task.client_name
            page['last_updated_time'] = page['pickup_date']
            page['versioned'] = False
            page['entity'] = 'Review'
            page['category'] = self.task.instance_data.get('category','')
            page['task_log_id']=self.task.id
            # Keep the uri __getData may have set; fall back to the thread uri.
            page['uri'] = page.get('uri',self.parent_uri)
            page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
            self.pages.append( page )
            #log.info(page)
            log.info(self.log_msg('Review Added'))
        except:
            log.exception(self.log_msg('Error while adding session info'))
def check_for_duplicates(paths, hash=hashlib.sha1):
    """Report duplicate files among *paths* via logging.

    Three-stage filter so that only plausible duplicates are fully hashed:
      1. group files by size (only equal-size files can be identical),
      2. within each size group, group by a hash of the first 1024 bytes,
      3. within each small-hash group, compare full-file hashes;
         collisions are reported as duplicates.

    Files that vanish or become unreadable between stages are skipped
    rather than aborting the whole scan.

    :param paths: sized iterable of file paths to examine.
    :param hash: unused; retained for backward compatibility of the
        signature (hashing is delegated to ``get_hash``).
    :returns: None -- results are emitted through ``logger``.
    """
    from collections import defaultdict

    hashes_by_size = defaultdict(list)  # file size -> [paths]
    hashes_on_1k = defaultdict(list)    # first-1k hash -> [paths]
    hashes_full = {}                    # full hash -> first path seen
    n_tot = len(paths)

    for i, path in enumerate(paths):
        try:
            file_size = os.path.getsize(path)
        except OSError:
            # FIX: the original crashed here if a file vanished while the
            # later stages tolerated it; skip unreadable files consistently.
            continue
        hashes_by_size[file_size].append(path)
        if ((i % 10000) == 0) and i > 0:
            print("Checked size of {}/{} files".format(i, n_tot))

    # Stage 2: for all files with the same size, hash the first 1024 bytes.
    logger.info("Checking for duplication by comparing small hashes ..")
    for files in hashes_by_size.values():
        if len(files) < 2:
            continue
        for filename in files:
            try:
                small_hash = get_hash(filename, first_chunk_only=True)
            except OSError:
                # the file access might've changed till the exec point got here
                continue
            hashes_on_1k[small_hash].append(filename)

    # Stage 3: full-file hash for small-hash collisions; equal full hashes
    # are duplicates.
    n_duplicates = 0
    logger.info("Checking for duplication by comparing full hashes ..")
    for files in hashes_on_1k.values():
        if len(files) < 2:
            # Unique first-1k hash: no need to spend CPU cycles on it.
            continue
        for filename in files:
            try:
                full_hash = get_hash(filename, first_chunk_only=False)
            except OSError:
                continue
            duplicate = hashes_full.get(full_hash)
            if duplicate:
                logger.info("Duplicate found: %s and %s" % (filename, duplicate))
                n_duplicates += 1
            else:
                hashes_full[full_hash] = filename
    logger.info("Found {} duplicates".format(n_duplicates))
def __addPosts(self):
    """ It will add Post for a particular thread

    Posts are the 'table' tags with id 'tblTitle'.  Returns False when
    they cannot be located; otherwise None.
    """
    try:
        reviews = self.soup.findAll('table',id='tblTitle')
    except:
        log.exception(self.log_msg('Reviews are not found'))
        return False
    for i, review in enumerate(reviews):
        # First post of a freshly-seen thread is the Question; the rest
        # are Suggestions.
        if i==0 and self.post_type:
            post_type = "Question"
            self.post_type = False
        else:
            post_type = "Suggestion"
        try:
            page = self.__getData( review, post_type )
            # Deduplication key: post data + title.
            unique_key = get_hash( {'data':page['data'],'title':page['title']})
            #unique_key = stripHtml(review.findNext('a',id=re.compile('PostLink')).renderContents()).split('#')[-1]
            if checkSessionInfo(self.genre, self.session_info_out, unique_key,
                                self.task.instance_data.get('update'),
                                parent_list=[self.parent_uri]):
                log.info(self.log_msg('Session info returns True'))
                continue
        except:
            log.info(self.log_msg('unique key not found'))
            continue
        try:
            result=updateSessionInfo(self.genre, self.session_info_out, unique_key,
                                     get_hash( page ),'Review',
                                     self.task.instance_data.get('update'),
                                     parent_list=[self.parent_uri])
            if not result['updated']:
                continue
            # parent_path excludes the post itself; path includes it.
            parent_list = [ self.parent_uri ]
            page['parent_path'] = copy.copy(parent_list)
            parent_list.append( unique_key )
            page['path']=parent_list
            # Task/provenance bookkeeping fields.
            page['priority']=self.task.priority
            page['level']=self.task.level
            page['pickup_date'] = datetime.strftime(datetime.utcnow()
                                                    ,"%Y-%m-%dT%H:%M:%SZ")
            page['connector_instance_log_id'] = self.task.connector_instance_log_id
            page['connector_instance_id'] = self.task.connector_instance_id
            page['workspace_id'] = self.task.workspace_id
            page['client_id'] = self.task.client_id
            page['client_name'] = self.task.client_name
            page['last_updated_time'] = page['pickup_date']
            page['versioned'] = False
            page['entity'] = 'Review'
            page['category'] = self.task.instance_data.get('category','')
            page['task_log_id']=self.task.id
            page['uri'] = self.currenturi
            page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
            self.pages.append( page )
            log.info(self.log_msg('Review Added'))
        except:
            log.exception(self.log_msg('Error while adding session info'))
def __setParentPage(self):
    """ this will set parent page info

    Builds the thread's parent page (title, data, posted date plus any
    pagedata carried on the task) and appends it to self.pages unless
    session info says it already exists.  Returns False on a session hit
    or when the title cannot be parsed; True otherwise.
    """
    page = {}
    try:
        # Title is the text after the last 'replies' token in the header.
        page['title'] = stripHtml(self.soup.find('div','brdSubHd grey top botOne').renderContents()).split('replies')[-1].strip()
        #log.info(page['title'])
        page['data'] = stripHtml(self.soup.find('div','mbPanel clearPanel').renderContents())
        try:
            # Posted date is the text after 'on', e.g. '21/06/10 at 9:30 AM'.
            date_str = stripHtml(self.soup.find('div','brdSubHd blue').renderContents()).split('on')[-1].strip()
            page['posted_date'] = datetime.strftime(datetime.strptime(date_str,'%d/%m/%y at %I:%M %p'),"%Y-%m-%dT%H:%M:%SZ")
        except:
            # Fall back to crawl time when the date cannot be parsed.
            log.exception(self.log_msg('Posted date not found'))
            page['posted_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
    except:
        log.exception(self.log_msg('main page title not found'))
        return False
    unique_key = get_hash({'title': page['title'],'data' : page['data']})
    if checkSessionInfo(self.genre, self.session_info_out, unique_key,
                        self.task.instance_data.get('update')):
        log.info(self.log_msg('Session info returns True for uri %s'
                              %self.currenturi))
        return False
    # Copy selected task pagedata keys onto the page when present.
    page_data_keys = ['et_first_author_name', 'ei_thread_replies_count',
                      'edate_last_post_date']
    [page.update({each:self.task.pagedata.get(each)}) for each in
     page_data_keys if self.task.pagedata.get(each)]
    try:
        result=updateSessionInfo(self.genre, self.session_info_out, unique_key,
                                 get_hash( page ),'Review',
                                 self.task.instance_data.get('update'))
        if not result['updated']:
            log.exception(self.log_msg('Update session info returns False'))
            return True
        # NOTE(review): 'parent_path' and 'path' share the SAME list object
        # here; harmless while the append below stays commented out.
        page['parent_path'] = page['path'] = [self.task.instance_data['uri']]
        ## page['path'] = [unique_key]
        #page['path'].append(unique_key)
        page['uri'] = self.currenturi
        page['entity'] = 'Review'
        page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
        # Common task fields merged in.
        page.update(self.__task_elements_dict)
        self.pages.append(page)
        #log.info(page)
        log.info(self.log_msg('Post Added'))
        return True
    except:
        log.exception(self.log_msg('Error while adding session info'))
        return False
def __addPost(self, post, is_original_post=False):
    # Adds one post to self.pages.  The unique key is taken from the third
    # 'oneLine' div of the post (presumably a post id token -- TODO confirm
    # against the site markup).  Returns True to continue crawling, False
    # on a session-info hit.
    try:
        unique_key = stripHtml(str(post.findAll('div', 'oneLine')[2])).split()[2]
        page = self.__get_data(post, is_original_post, unique_key)
        if not page:
            log.info(self.log_msg('page is empty, __get_data returns False for uri %s' % self.currenturi))
            return True
        if checkSessionInfo(self.__genre, self.session_info_out, unique_key,
                            self.task.instance_data.get('update'),
                            parent_list=[self.task.instance_data['uri']]):
            log.info(self.log_msg('Session info returns True for uri %s' % self.task.instance_data['uri']))
            return False
        result = updateSessionInfo(self.__genre, self.session_info_out, unique_key,
                                   get_hash(page),'forum',
                                   self.task.instance_data.get('update'),
                                   parent_list=[self.task.instance_data['uri']])
        if result['updated']:
            # parent_path excludes the post's own key; path includes it.
            page['parent_path'] = [self.task.instance_data['uri']]
            page['path'] = [self.task.instance_data['uri'], unique_key]
            page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
            # Common task fields merged in.
            page.update(self.__task_elements_dict)
            self.pages.append(page)
        else:
            log.info(self.log_msg('Update session info returns False for url %s' % self.currenturi))
    except:
        log.exception(self.log_msg('Cannot add the post for the uri %s' % self.currenturi))
    return True
def __addPost(self, post, is_question=False):
    # Adds one post to self.pages.  The unique key is the numeric part of
    # the post's 'post-<n>' div id.  The session check runs BEFORE
    # __getData so already-seen posts are skipped without re-parsing.
    # Returns True to continue crawling, False on a session-info hit.
    try:
        unique_key = re.search(r'(\d+)', post.find('div', id = re.compile(r'^post-\d+$'))['id']).groups()[0]
        if checkSessionInfo(self.__genre, self.session_info_out, unique_key,
                            self.task.instance_data.get('update'),
                            parent_list = [self.task.instance_data['uri']]):
            log.info(self.log_msg('Session info returns True for %s' % unique_key))
            return False
        page = self.__getData(post, is_question)
        if not page:
            log.info(self.log_msg('page contains empty data __getData \
            returns False for uri %s'%self.currenturi) )
            return True
        result = updateSessionInfo(self.__genre, self.session_info_out, unique_key,
                                   get_hash( page ),'forum',
                                   self.task.instance_data.get('update'),
                                   parent_list=[self.task.instance_data['uri']])
        if result['updated']:
            # parent_path excludes the post's own key; path includes it.
            page['parent_path'] = [self.task.instance_data['uri']]
            page['path'] = [ self.task.instance_data['uri'], unique_key]
            # Permalink comes from the preceding 'link_to_post' anchor.
            page['uri'] = post.findPrevious('a', attrs = {'onclick': re.compile('link_to_post')})['href'].__str__()
            page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
            # Common task fields merged in.
            page.update(self.__task_elements_dict)
            self.pages.append(page)
        else:
            log.info(self.log_msg('Update session info returns False for \
            url %s' % self.currenturi))
    except:
        log.exception(self.log_msg('Cannot add the post for the uri %s' % self.currenturi))
    return True
def eliminate_ambigous_size_matches(matches):
    """ Eliminate ambiguous matches (same size) by comparing file hashes

    *matches* maps a reference path to the list of candidate paths that
    matched it by file size alone.  For every reference path with more
    than one candidate, the candidate list is narrowed in place to those
    whose full-file hash equals the reference file's hash.  Entries with
    zero or one candidate, and entries whose reference file cannot be
    read, are left untouched.

    :param matches: dict mapping path -> list of candidate paths;
        modified in place.
    :returns: None.
    """
    for path in matches.keys():
        size_matches = matches[path]
        if len(size_matches) < 2:
            # Zero or one size match: nothing ambiguous to resolve.
            continue
        try:
            hash_to_find = get_hash(path, first_chunk_only=False)
        except OSError:
            # FIX: the original did not guard this call, so an unreadable
            # reference file aborted the entire pass; skip it instead.
            continue
        hash_matches = []
        # 'candidate' avoids shadowing the Python 2 builtin 'file'.
        for candidate in size_matches:
            try:
                hash_to_match = get_hash(candidate, first_chunk_only=False)
            except OSError:
                # the file access might've changed till the exec point got here
                continue
            if hash_to_match == hash_to_find:
                hash_matches.append(candidate)
        matches[path] = hash_matches
def __addPost(self, post, is_question=False):
    """ This will take the post tag , and fetch data and meta data
    and add it to self.pages

    The unique key is the message id taken from the 'msgId<n>' div.  The
    session check runs BEFORE __getData so already-seen posts are skipped
    without re-parsing.  Returns True to continue crawling, False on a
    session-info hit or when the session store was not updated.
    """
    try:
        unique_key = stripHtml(post.find('div', id=re.compile('msgId\d+'))
                               .renderContents())[1:-1].replace('Msg Id: ', '')
        if checkSessionInfo(self.__genre, self.session_info_out, unique_key,
                            self.task.instance_data.get('update'),
                            parent_list = [self.task.instance_data['uri']]):
            log.info(self.log_msg('Session info returns True for uri %s'%unique_key))
            return False
        page = self.__getData(post, is_question)
        if not page:
            log.info(self.log_msg('page contains empty data, getdata \
            returns False for uri %s'%self.currenturi))
            return True
        result = updateSessionInfo(self.__genre, self.session_info_out, unique_key,
                                   get_hash( page ),'forum',
                                   self.task.instance_data.get('update'),
                                   parent_list=[self.task.instance_data['uri']])
        if result['updated']:
            # parent_path excludes the post's own key; path includes it.
            page['parent_path'] = [self.task.instance_data['uri']]
            page['path'] = [self.task.instance_data['uri'], unique_key]
            page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
            # Common task fields merged in.
            page.update(self.__task_elements_dict)
            self.pages.append(page)
            log.info(self.log_msg('Page added'))
        else:
            log.info(self.log_msg('Update session info returns False for \
            url %s'%self.currenturi))
            return False
    except:
        log.exception(self.log_msg('Cannot add the post for the uri %s'%self.currenturi))
    return True
def __addReviews(self):
    '''It will fetch the the reviews and append it to self.pages

    Reviews are the grand-parent divs of every 'ctedit' span; each one is
    deduplicated by its anchor name, parsed with __getData and stored with
    its session info.  Returns None; failures on one review are logged
    and the loop continues.
    '''
    reviews= [x.findParent('div').findParent('div') for x in self.soup.findAll('span' ,'ctedit')]
    log.debug(self.log_msg('# Of Reviews found is %d'%len(reviews)))
    for review in reviews:
        try:
            # The anchor name inside the review is its stable unique id.
            unique_key = review.find('a')['name']
            if checkSessionInfo(self.genre, self.session_info_out, unique_key,
                                self.task.instance_data.get('update'),
                                parent_list=[ self.task.instance_data['uri'] ]):
                log.info(self.log_msg('session info return True in url %s'%self.currenturi))
                continue
            page = self.__getData(review)
            if not page:
                log.info(self.log_msg('No data found in url %s'%self.currenturi))
                continue
            result = updateSessionInfo(self.genre, self.session_info_out, unique_key,
                                       get_hash(page),'comment',
                                       self.task.instance_data.get('update'),
                                       parent_list=[self.task.instance_data['uri']])
            if not result['updated']:
                log.info(self.log_msg('result not updated'))
                continue
            # FIX: the original bound one shared list to both 'path' and
            # 'parent_path' and then appended unique_key, so parent_path
            # wrongly contained the comment's own key.  Build two
            # independent lists: parent_path excludes the leaf, path
            # includes it (matching the other connectors).
            page['parent_path'] = [ self.task.instance_data['uri'] ]
            page['path'] = [ self.task.instance_data['uri'], unique_key ]
            page['entity'] = 'comment'
            page['uri'] = self.task.instance_data['uri']
            page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
            # Common task fields merged in.
            page.update(self.__task_elements_dict)
            self.pages.append(page)
            log.info(self.log_msg('Review Added'))
        except:
            log.exception(self.log_msg('Exception while adding session info in url %s'%self.currenturi))
def __addPost(self, post, is_question = False):
    """ This will take the post tag , and fetch data and meta data
    and add it to self.pages

    The post's permalink (href of the 'postcounter' anchor) doubles as
    both the unique key and the stored uri.  Returns True to continue
    crawling, False on a session-info hit.
    """
    try:
        unique_tag = post.find('a', 'postcounter')
        #is_question = stripHtml(unique_tag.renderContents())== u'#1'
        unique_key = unique_tag['href']
        if checkSessionInfo(self.__genre, self.session_info_out, unique_key,
                            self.task.instance_data.get('update')):
            log.info(self.log_msg('Session info returns True for uri %s'%unique_key))
            return False
        page = self.__getData(post, is_question, unique_key)
        if not page:
            log.info(self.log_msg('page contains empty data, getdata \
            returns False for uri %s'%self.currenturi))
            return True
        result = updateSessionInfo(self.__genre, self.session_info_out, unique_key,
                                   get_hash( page ),'forum',
                                   self.task.instance_data.get('update'))
        if result['updated']:
            # Posts are stored flat in this connector: no parent path.
            page['parent_path'] = []
            page['path'] = [unique_key]
            page['uri'] = unique_key
            page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
            # Common task fields merged in.
            page.update(self.__task_elements_dict)
            self.pages.append(page)
        else:
            log.info(self.log_msg('Update session info returns False for \
            url %s'%self.currenturi))
    except:
        log.exception(self.log_msg('Cannot add the post for the uri %s'%self.currenturi))
    return True
def __addPosts(self):
    """ It will add Post for a particular thread

    Posts are the 'wrapper_comment' divs; each one's unique key is the
    ReplyToPostID query parameter of its Reply link.  Returns False when
    the comment divs cannot be located; otherwise None.
    """
    try:
        """for block_quote in re.findall('<BLOCKQUOTE>.*?</BLOCKQUOTE>',self.rawpage,re.S):
            self.rawpage = self.rawpage.replace(block_quote,'')
        self._setCurrentPage()
        #reviews = self.soup.findAll('div','thread')"""
        reviews = self.soup.findAll('div','wrapper_comment')
    except:
        log.exception(self.log_msg('Reviews are not found'))
        return False
    for i, review in enumerate(reviews):
        # NOTE: this initial assignment is immediately overwritten by the
        # if/else below on every iteration.
        post_type = "Question"
        # First post is the Question; the rest are Suggestions.
        if i==0:
            post_type = "Question"
        else:
            post_type = "Suggestion"
        try:
            # Unique key: the ReplyToPostID query parameter of the post's
            # Reply link.
            unique_key = dict(parse_qsl(review.find('div','commentbox_nav').find('a',text='Reply').parent['href'].split('?')[-1]))['ReplyToPostID']
            if checkSessionInfo(self.genre, self.session_info_out, unique_key,
                                self.task.instance_data.get('update'),
                                parent_list=[self.parent_uri]):
                log.info(self.log_msg('Session info returns True'))
                continue
            page = self.__getData( review, post_type )
            log.info(page)
        except:
            log.info(self.log_msg('unique key not found'))
            continue
        try:
            result=updateSessionInfo(self.genre, self.session_info_out, unique_key,
                                     get_hash( page ),'Review',
                                     self.task.instance_data.get('update'),
                                     parent_list=[self.parent_uri])
            if not result['updated']:
                continue
            # parent_path excludes the post itself; path includes it.
            parent_list = [ self.parent_uri ]
            page['parent_path'] = copy.copy(parent_list)
            parent_list.append( unique_key )
            page['path']=parent_list
            # Task/provenance bookkeeping fields.
            page['priority']=self.task.priority
            page['level']=self.task.level
            page['pickup_date'] = datetime.strftime(datetime.utcnow()
                                                    ,"%Y-%m-%dT%H:%M:%SZ")
            page['connector_instance_log_id'] = self.task.connector_instance_log_id
            page['connector_instance_id'] = self.task.connector_instance_id
            page['workspace_id'] = self.task.workspace_id
            page['client_id'] = self.task.client_id
            page['client_name'] = self.task.client_name
            page['last_updated_time'] = page['pickup_date']
            page['versioned'] = False
            page['entity'] = 'Review'
            page['category'] = self.task.instance_data.get('category','')
            page['task_log_id']=self.task.id
            page['uri'] = self.currenturi
            page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
            self.pages.append( page )
            log.info(self.log_msg('Review Added'))
        except:
            log.exception(self.log_msg('Error while adding session info'))
def __addPost(self, post, is_question=False):
    """ This will take the post tag , and fetch data and meta data
    and add it to self.pages

    The unique key is the named anchor inside the post; its permalink is
    the current uri plus '#<anchor>'.  The session check runs BEFORE
    __getData so already-seen posts skip the parse.  Returns True to
    continue crawling, False on a session-info hit.
    """
    try:
        unique_key = post.find('a', attrs={'name':True})['name']
        permalink = self.currenturi + '#' + unique_key
        if checkSessionInfo(self.__genre, self.session_info_out,
                            unique_key, self.task.instance_data.get('update'),
                            parent_list=[self.task.instance_data['uri']]):
            log.info(self.log_msg('Session info returns True for uri %s'%
                                  permalink))
            return False
        page = self.__getData(post, is_question, unique_key)
        if not page:
            log.info(self.log_msg('page contains empty data, getdata \
            returns False for uri %s'%self.currenturi))
            return True
        result = updateSessionInfo(self.__genre, self.session_info_out, unique_key,
                                   get_hash( page ),'forum',
                                   self.task.instance_data.get('update'),
                                   parent_list=[self.task.instance_data['uri']])
        if result['updated']:
            # parent_path excludes the post's own key; path includes it.
            page['parent_path'] = [self.task.instance_data['uri']]
            page['path'] = [self.task.instance_data['uri'], unique_key ]
            page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
            # Common task fields merged in.
            page.update(self.__task_elements_dict)
            self.pages.append(page)
        else:
            log.info(self.log_msg('Update session info returns False for \
            url %s'%self.currenturi))
    except:
        log.exception(self.log_msg('Cannot add the post for the uri %s'%self.currenturi))
    return True
def __addPosts(self,link):
    '''It will add the post

    Fetches the review page at *link* (which becomes self.currenturi and
    the unique key), parses it and appends the resulting page to
    self.pages.  Returns True to continue, False on a session-info hit
    or on an unexpected exception.
    '''
    try:
        self.currenturi = link
        if checkSessionInfo('review', self.session_info_out, self.currenturi,
                            self.task.instance_data.get('update')):
            log.info(self.log_msg('Session info returns True for uri %s'
                                  %self.currenturi))
            return False
        # Fetch the review page and rebuild the soup for it.
        self.__setSoupForCurrentUri()
        page = self.__getData()
        if not page:
            return True
        result = updateSessionInfo('review', self.session_info_out, self.currenturi,get_hash( page ),'review', self.task.instance_data.get('update'))
        if result['updated']:
            page['path'] = [ self.currenturi]
            page['parent_path'] = []
            page['uri']= self.currenturi
            page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
            page['entity'] = 'review'
            # Common task fields merged in.
            page.update(self.__task_elements_dict)
            self.pages.append(page)
            log.info(self.log_msg('Page added'))
        else:
            log.info(self.log_msg('Update session info returns False for \
            url %s'%self.currenturi))
    except:
        log.exception(self.log_msg('Cannot add the post for the uri %s'
                                   %self.currenturi))
        return False
def __getParentPage(self,comment):
    """This will get the parent info

    Builds the parent (thread-starter) page from the XML-ish *comment*
    tag (totalreplies / name / dateadded / messageid children) and
    appends it to self.pages, keyed in the session store by the task's
    instance uri.  Returns None in all cases; failures are logged.
    """
    page = {}
    try:
        # Reply count is cached on self for later use by the caller.
        self.__total_replies_count = page['ei_data_replies_count'] = int(stripHtml(comment.find('totalreplies').renderContents()))
        page['title'] = page['data'] = stripHtml(comment.find('name').renderContents())
        # Drop the fractional-seconds part of the timestamp.
        page['posted_date'] = stripHtml(comment.find('dateadded').renderContents()).split('.')[0]
        unique_key = stripHtml(comment.find('messageid').renderContents())
        if checkSessionInfo(self.__genre, self.session_info_out, self.task.instance_data['uri'],
                            self.task.instance_data.get('update')):
            log.info(self.log_msg('Session info return True, Already exists'))
            return
        result = updateSessionInfo('review', self.session_info_out,
                                   self.task.instance_data['uri'], get_hash( page ),
                                   'forum', self.task.instance_data.get('update'))
        if result['updated']:
            # The parent page's path is its own message id.
            page['path']=[unique_key]
            page['parent_path']=[]
            page['uri'] = self.currenturi
            page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
            page['entity'] = 'post'
            # Common task fields merged in.
            page.update(self.__task_elements_dict)
            log.info(page['data'])
            self.pages.append(page)
        else:
            # FIX: the original format string had no %s placeholder, which
            # raised TypeError (swallowed by the outer except) -- and its
            # text said 'True' in the not-updated branch.
            log.info(self.log_msg('Result[updated] returned False for \
                uri %s'%self.currenturi))
    except:
        log.exception(self.log_msg('Hierachy/Title not found in url %s'%self.currenturi))
    return
def __addPost(self, post, is_question = False):
    """ This will take the post tag , and fetch data and meta data
    and add it to self.pages

    The unique key is the id of the 'postcount<n>' anchor.  Returns True
    to continue crawling, False on a session-info hit.
    """
    try:
        unique_key_tag = post.find('a', id=re.compile('postcount\d+'))
        #unique_key = self.__removeSessionId('http://htcpedia.com/forum/' + unique_key_tag['href'])
        unique_key = unique_key_tag['id']
        log.info(unique_key)
        if checkSessionInfo(self.__genre, self.session_info_out, unique_key,
                            self.task.instance_data.get('update')):
            log.info(self.log_msg('Session info returns True for uri %s'%unique_key))
            return False
        page = self.__getData(post, is_question)
        if not page:
            log.info(self.log_msg('page contains empty data, getdata \
            returns False for uri %s'%self.currenturi))
            return True
        result = updateSessionInfo(self.__genre, self.session_info_out, unique_key,
                                   get_hash( page ),'forum',
                                   self.task.instance_data.get('update'))
        if result['updated']:
            # NOTE(review): parent_path is empty here although path still
            # lists the instance uri -- inconsistent with the sibling
            # connectors; confirm which convention the consumer expects.
            page['parent_path'] = []
            page['path'] = [self.task.instance_data['uri'], unique_key]
            page['uri'] = self.currenturi + "#" + unique_key
            page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
            log.info(page)
            # Common task fields merged in.
            page.update(self.__task_elements_dict)
            self.pages.append(page)
        else:
            log.info(self.log_msg('Update session info returns False for \
            url %s'%self.currenturi))
    except:
        log.exception(self.log_msg('Cannot add the post for the uri %s'%self.currenturi))
    return True
def __setParentPage(self):
    """This will get the parent info

    Builds the thread's parent page from the breadcrumb trail (the last
    crumb is the title; the hash is taken over title+hierarchy before the
    data field is blanked) and appends it to self.pages keyed by the
    task's instance uri.  Returns None in all cases; failures are logged.
    """
    page = {}
    try:
        # Breadcrumbs minus the site root give the thread hierarchy.
        page['et_thread_hierarchy'] = self.__hierarchy = [x.strip() for x in stripHtml(self.soup.find('div', 'deck breadcrumbs').renderContents()).split('>') if x.strip()][1:]
        page['data'] = page['title'] = page['et_thread_hierarchy'][-1]
    except:
        log.exception(self.log_msg('Thread hierarchy and Title Not found for uri\
            %s'%self.currenturi))
        return
    if checkSessionInfo(self.__genre, self.session_info_out, self.task.instance_data['uri'],
                        self.task.instance_data.get('update')):
        log.info(self.log_msg('Session info return True, Already exists'))
        return
    try:
        result = updateSessionInfo('review', self.session_info_out,
                                   self.task.instance_data['uri'], get_hash( page ),
                                   'forum', self.task.instance_data.get('update'))
        if result['updated']:
            page['path'] = [self.task.instance_data['uri']]
            page['parent_path'] = []
            page['uri'] = self.currenturi
            page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
            # The stored parent page carries no data (hash above already
            # covered the title).
            page['data'] = ''
            page['entity'] = 'thread'
            # Common task fields merged in; pickup_date doubles as the
            # posted date for the thread entity.
            page.update(self.__task_elements_dict)
            page['posted_date'] = page['pickup_date']
            self.pages.append(page)
            log.info(self.log_msg('Parent Page Added'))
        else:
            # FIX: the original format string had no %s placeholder, which
            # raised TypeError (swallowed by the outer except) -- and its
            # text said 'True' in the not-updated branch.
            log.info(self.log_msg('Result[updated] returned False for \
                uri %s'%self.currenturi))
    except:
        log.exception(self.log_msg("parent post couldn't be parsed"))
def __getParentPage(self):
    ''
    # Builds the thread's parent page (hierarchy, pagedata fields, thread
    # id) and appends it to self.pages keyed by self.currenturi.  Returns
    # True when the page was added, False on a session hit, a
    # non-updated session store, or a parse failure.
    if checkSessionInfo(self.genre, self.session_info_out, self.currenturi,
                        self.task.instance_data.get('update')):
        log.info(self.log_msg('Session info return True, Already exists'))
        return False
    page = {}
    try:
        # Breadcrumb table text, one crumb per non-empty line.
        page['et_thread_hierarchy'] = [each.replace('>','').strip() for each in stripHtml(self.soup.find('span','navbar').findParent('table').renderContents()).split('\n') if not each.strip()=='']
        page['title']= page['et_thread_hierarchy'][-1]
    except:
        log.info(self.log_msg('Thread hierarchy is not found'))
        page['title']=''
    # Copy listing-page metadata carried on the task, when present.
    for each in ['title','et_last_post_author_name','ei_thread_replies_count','ei_thread_views_count','edate_last_post_date','ei_thread_votes_count','ef_thread_rating']:
        try:
            page[each] = self.task.pagedata[each]
        except:
            log.info(self.log_msg('page data cannot be extracted'))
    try:
        # Thread id is the value of the last query parameter in the uri.
        page['et_thread_id'] = self.currenturi.split('&')[-1].split('=')[-1]
    except:
        log.info(self.log_msg('Thread id not found'))
    try:
        post_hash = get_hash( page )
        # A fresh session (empty session_info_out) is seeded with the
        # task id.
        id=None
        if self.session_info_out=={}:
            id=self.task.id
        result=updateSessionInfo( self.genre, self.session_info_out,
                                  self.currenturi, post_hash,'Post',
                                  self.task.instance_data.get('update'), Id=id)
        if not result['updated']:
            return False
        page['path']=[self.currenturi]
        page['parent_path']=[]
        page['uri'] = normalize( self.currenturi )
        page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
        # Task/provenance bookkeeping fields.
        page['priority']=self.task.priority
        page['level']=self.task.level
        page['pickup_date'] = datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
        page['posted_date'] = datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
        page['connector_instance_log_id'] = self.task.connector_instance_log_id
        page['connector_instance_id'] = self.task.connector_instance_id
        page['workspace_id'] = self.task.workspace_id
        page['client_id'] = self.task.client_id
        page['client_name'] = self.task.client_name
        page['last_updated_time'] = page['pickup_date']
        page['versioned'] = False
        #page['first_version_id']=result['first_version_id']
        # The stored parent page carries no data (hash above already
        # covered the metadata).
        page['data'] = ''
        #page['id'] = result['id']
        page['task_log_id']=self.task.id
        page['entity'] = 'Post'
        page['category']=self.task.instance_data.get('category','')
        self.pages.append(page)
        log.info(page)
        log.info(self.log_msg('Parent Page added'))
        return True
    except :
        log.exception(self.log_msg("parent post couldn't be parsed"))
        return False
def __addPost(self, post, is_question=False):
    """Parse one post element and append it to self.pages.

    post        -- BeautifulSoup tag for the post
    is_question -- True for the thread-opening post
    Returns False when the post already exists in session info,
    True otherwise (failures are logged, not raised).
    """
    try:
        # The anchor inside the 'name' span carries the unique post id.
        unique_key = post.find('span', attrs={'class': 'name'}).find('a')['name']
        if checkSessionInfo(self.__genre, self.session_info_out, unique_key,
                            self.task.instance_data.get('update'),
                            parent_list=[self.task.instance_data['uri']]):
            log.info(self.log_msg('Session info returns True for %s' % unique_key))
            return False
        page = self.__getData(post, is_question)
        # BUGFIX: originally log.info(self.log_msg('page')) — it logged the
        # literal string 'page' instead of the extracted page dict.
        log.info(self.log_msg(page))
        if not page:
            log.info(self.log_msg('page contains empty data __getData returns False for uri %s' % self.currenturi))
            return True
        result = updateSessionInfo(self.__genre, self.session_info_out,
                                   unique_key, get_hash(page), 'forum',
                                   self.task.instance_data.get('update'),
                                   parent_list=[self.task.instance_data['uri']])
        if result['updated']:
            page['parent_path'] = [self.task.instance_data['uri']]
            page['path'] = [self.task.instance_data['uri'], unique_key]
            page['uri'] = self.currenturi
            page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
            page.update(self.__task_elements_dict)
            self.pages.append(page)
        else:
            log.info(self.log_msg('Update session info returns False for url %s' % self.currenturi))
    except:
        log.exception(self.log_msg('Cannot add the post for the uri %s' % self.currenturi))
    return True
def __addPost(self, post, is_question=False):
    """Extract one post (identified by the digits in its id attribute)
    and append it to self.pages.

    Returns False when the post already exists in session info,
    True in every other case.
    """
    try:
        post_key = re.search(r'(\d+)', post['id']).group(1)
        update_flag = self.task.instance_data.get('update')
        thread_uri = self.task.instance_data['uri']
        if checkSessionInfo(self.__genre, self.session_info_out, post_key,
                            update_flag, parent_list=[thread_uri]):
            log.info(self.log_msg('Session info returns True for %s' % post_key))
            return False
        page = self.__getData(post, is_question)
        if not page:
            log.info(self.log_msg('page contains empty data, getdata returns False for uri %s' % self.currenturi))
            return True
        result = updateSessionInfo(self.__genre, self.session_info_out, post_key,
                                   get_hash(page), 'forum', update_flag,
                                   parent_list=[thread_uri])
        if not result['updated']:
            log.info(self.log_msg('Update session info returns False for url %s' % self.currenturi))
            return True
        page['parent_path'] = [thread_uri]
        page['path'] = [thread_uri, post_key]
        # Each post has its own permalink on the board.
        page['uri'] = self.__baseuri + 'showpost.php?p=' + post_key
        page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
        page.update(self.__task_elements_dict)
        self.pages.append(page)
    except:
        log.exception(self.log_msg('Cannot add the post for the uri %s' % self.currenturi))
    return True
def __getParentPage(self):
    """Build the parent (forum thread) page entry and append it to self.pages.

    Returns True when the entry was added, False when it already exists
    in session info or parsing failed.
    """
    page = {}
    try:
        # Breadcrumb links (minus the site root, [1:]) form the hierarchy.
        self.hierarchy = page['et_thread_hierarchy'] = [stripHtml(x.renderContents()) for x in self.soup.find('div','CommonBreadCrumbArea').findAll('a')][1:]
        page['title']= page['et_thread_hierarchy'][-1]
    except:
        log.info(self.log_msg('Thread hierarchy is not found'))
        page['title']=''
    try:
        # Thread id is the last path segment of the URI without '.aspx'.
        self.thread_id = page['et_thread_id'] = unicode(self.currenturi.split('/')[-1].replace('.aspx',''))
    except:
        log.info(self.log_msg('Thread id not found'))
    if checkSessionInfo(self.genre, self.session_info_out, self.parent_uri,
                        self.task.instance_data.get('update')):
        log.info(self.log_msg('Session info return True, Already exists'))
        return False
    # Copy any metadata pre-collected by the scheduler task.
    for each in ['et_thread_last_post_author','ei_thread_replies_count','edate_last_post_date']:
        try:
            page[each] = self.task.pagedata[each]
        except:
            log.info(self.log_msg('page data cannot be extracted for %s'%each))
    try:
        post_hash = get_hash( page )
        id=None
        if self.session_info_out=={}:
            # First entry for this session: seed session info with the task id.
            id=self.task.id
        result=updateSessionInfo( self.genre, self.session_info_out, self.parent_uri,
                                  post_hash,'Forum',self.task.instance_data.get('update'), Id=id)
        if not result['updated']:
            return False
        page['path']=[self.parent_uri]
        page['parent_path']=[]
        page['uri'] = normalize( self.currenturi )
        page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
        page['priority']=self.task.priority
        page['level']=self.task.level
        page['pickup_date'] = datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
        page['posted_date'] = datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
        page['connector_instance_log_id'] = self.task.connector_instance_log_id
        page['connector_instance_id'] = self.task.connector_instance_id
        page['workspace_id'] = self.task.workspace_id
        page['client_id'] = self.task.client_id
        page['client_name'] = self.task.client_name
        page['last_updated_time'] = page['pickup_date']
        page['versioned'] = False
        page['data'] = ''
        page['task_log_id']=self.task.id
        page['entity'] = 'Post'
        page['category']=self.task.instance_data.get('category','')
        self.pages.append(page)
        log.info(page)
        log.info(self.log_msg('Parent Page added'))
        return True
    except :
        log.exception(self.log_msg("parent post couldn't be parsed"))
        return False
def __addPosts(self, post):
    """Extract one review post and append it to self.pages.

    Returns False when the post is already recorded in session info;
    True in every other case (success, empty data, or logged error).
    """
    try:
        post_key = post['id'].split('_')[-1]
        update_flag = self.task.instance_data.get('update')
        if checkSessionInfo('review', self.session_info_out, post_key, update_flag):
            log.info(self.log_msg('Session info returns True for uri %s' % self.currenturi))
            return False
        page = self.__getData(post)
        if not page:
            return True
        result = updateSessionInfo('review', self.session_info_out, post_key,
                                   get_hash(page), 'review', update_flag)
        if not result['updated']:
            log.info(self.log_msg('Update session info returns False for url %s' % self.currenturi))
            return True
        page['path'] = [self.currenturi, post_key]
        page['parent_path'] = []
        # Synthesize a fragment URI only when __getData supplied none.
        if not page.get('uri'):
            page['uri'] = self.currenturi + '#' + post_key
        page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
        page['entity'] = 'review'
        page.update(self.__task_elements_dict)
        self.pages.append(page)
        log.info(self.log_msg('Page added'))
    except:
        log.exception(self.log_msg('Cannot add the post for the uri %s' % self.currenturi))
    return True
def __addPost(self, post, is_question=False):
    """Extract one forum post (identified by its 'Post…' anchor name)
    and append it to self.pages.

    Returns False when the post already exists in session info,
    True in every other case.
    """
    try:
        post_key = post.find('a')['name'].replace('Post', '')
        log.debug(self.log_msg('POST: ' + str(post_key)))
        thread_uri = self.task.instance_data['uri']
        update_flag = self.task.instance_data.get('update')
        if checkSessionInfo('review', self.session_info_out, post_key,
                            update_flag, parent_list=[thread_uri]):
            log.info(self.log_msg('Session info returns True for uri %s' % post_key))
            return False
        page = self.__getData(post, is_question)
        if not page:
            return True
        result = updateSessionInfo('review', self.session_info_out, post_key,
                                   get_hash(page), 'forum', update_flag,
                                   parent_list=[thread_uri])
        if not result['updated']:
            log.info(self.log_msg('Update session info returns False for url %s' % self.currenturi))
            return True
        page['path'] = [thread_uri, post_key]
        page['parent_path'] = [thread_uri]
        page['uri'] = self.currenturi + '#' + post_key
        page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
        page.update(self.__task_elements_dict)
        self.pages.append(page)
        log.info(self.log_msg('Page added'))
    except:
        log.exception(self.log_msg('Cannot add the post for the uri %s' % self.currenturi))
    return True
def __getParentPage(self):
    """Build the parent (MSN boards thread) page entry and append it
    to self.pages.

    Returns True when added, False when it already exists in session
    info or parsing failed.
    """
    page = {}
    try:
        # Breadcrumb list items form the thread hierarchy.
        self.hierarchy = page['et_thread_hierarchy'] = [stripHtml(x.renderContents()) for x in self.soup.find('div',{'class':'rd Microsoft_Msn_Boards_Read_List Web_Bindings_Base'}).findAll('li')]
    except:
        log.info(self.log_msg('Thread hierarchy is not found'))
    try:
        self.forum_title = page['title'] = stripHtml(self.soup.find('h2').renderContents())
    except:
        log.info(self.log_msg('Title Not Found'))
        page['title'] = ''
    if checkSessionInfo(self.genre, self.session_info_out, self.parent_uri,
                        self.task.instance_data.get('update')):
        log.info(self.log_msg('Session info return True'))
        return False
    # Copy any metadata pre-collected by the scheduler task.
    for each in ['et_author_name','ei_thread_replies_count','ei_thread_view_count','ei_author_count','et_last_post_author','edate_last_post_date','posted_date']:
        try:
            page[each] = self.task.pagedata[each]
        except:
            log.info(self.log_msg('Page data cannot be extracted for %s'%each))
    try:
        # Thread id comes from the 'ThreadId=' query parameter.
        page['ei_thread_id'] = int(urlparse.urlparse(self.currenturi)[4].split('&')[0].split('ThreadId=')[1])
    except:
        log.info(self.log_msg('Thread id not found'))
    try:
        post_hash = get_hash(page)
        id = None
        if self.session_info_out == {}:
            # First entry for this session: seed session info with the task id.
            id = self.task.id
        result = updateSessionInfo(self.genre, self.session_info_out, self.parent_uri,
                                   post_hash, 'Post', self.task.instance_data.get('update'),Id=id)
        if not result['updated']:
            return False
        page['path'] = [self.parent_uri]
        page['parent_path'] = []
        page['uri'] = normalize(self.currenturi)
        page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
        page['priority'] = self.task.priority
        page['level'] = self.task.level
        page['pickup_date'] = datetime.strftime(datetime.utcnow(),'%Y-%m-%dT%H:%M:%SZ')
        page['connector_instance_log_id'] = self.task.connector_instance_log_id
        page['connector_instance_id'] = self.task.connector_instance_id
        page['workspace_id'] = self.task.workspace_id
        page['client_id'] = self.task.client_id
        page['client_name'] = self.task.client_name
        page['last_updated_time'] = page['pickup_date']
        page['versioned'] = False
        page['data'] = ''
        page['task_log_id']=self.task.id
        page['entity'] = 'Post'
        page['category']=self.task.instance_data.get('category','')
        self.pages.append(page)
        log.info(page)
        log.info(self.log_msg('Parent Page added'))
        return True
    except :
        log.exception(self.log_msg("parent post couldn't be parsed"))
        return False
def __addQuestionInfo(self):
    """Capture the question (thread-opening) info and append it to self.pages.

    Returns False when no question markup or no question text is found,
    True otherwise.
    """
    question = self.soup.find('div', 'question')
    if not question:
        log.info(self.log_msg('No Question Info Found'))
        return False
    page = {'uri': self.currenturi}
    try:
        self.__thread_topic = page['data'] = page['title'] = stripHtml(question.find('dd', id='topic_text').renderContents())
    except:
        log.info(self.log_msg('No Question Data Found'))
        return False
    try:
        # Byline format: "Posted <date> in <category> by <author>"
        created_text = stripHtml(question.find('cite', id='created').renderContents())
        match_object = re.search('Posted (?P<date_str>.+?) in (?P<et_thread_category>.+?) by (?P<et_author_name>.+?$)', created_text)
        page.update(match_object.groupdict())
    except:
        log.info(self.log_msg('Not enough information'))
    try:
        date_str = page.pop('date_str')
        # Strip any parenthesised qualifier before parsing the date.
        page['posted_date'] = self.__getDate(re.sub(r'\(.+?\)', '', date_str))
    except:
        log.info(self.log_msg('posted_date not found in url %s' % self.currenturi))
        page['posted_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
    try:
        rating_str = stripHtml(self.soup.find('span', id='overall_rating_score').renderContents())
        # '--' means the thread has not been rated yet.
        if not rating_str == '--':
            page['ef_thread_rating'] = float(rating_str)
    except:
        log.info(self.log_msg('Rating not found'))
    try:
        # Counter is shown as "x of N"; take N, dropping thousands separators.
        page['ei_replies_count'] = int(stripHtml(self.soup.find('span', id='answer_count').renderContents()).split(' of ')[1].replace(',', ''))
    except:
        log.info(self.log_msg('Replies count not found'))
    try:
        if checkSessionInfo(self.__genre, self.session_info_out, self.task.instance_data['uri'],
                            self.task.instance_data.get('update')):
            log.info(self.log_msg('Session info return True, Already exists'))
            return True
        result = updateSessionInfo('review', self.session_info_out,
                                   self.task.instance_data['uri'], get_hash(page),
                                   'forum', self.task.instance_data.get('update'))
        if result['updated']:
            page['path'] = [self.task.instance_data['uri']]
            page['parent_path'] = []
            page['uri'] = self.currenturi
            page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
            page['entity'] = 'question'
            page.update(self.__task_elements_dict)
            self.pages.append(page)
        else:
            # BUGFIX: original used 'literal' % value with no %s placeholder
            # (TypeError at runtime) and said "returned True" on the False branch.
            log.info(self.log_msg('Result[updated] returned False for uri %s' % self.currenturi))
    except:
        log.exception(self.log_msg("parent post couldn't be parsed"))
    return True
def __addPost(self, review_page_link, post_id):
    """Fetch the page at review_page_link, extract the post identified
    by post_id, and append its data/metadata to self.pages.

    Returns False when the post already exists in session info or the
    initial fetch/parse fails; True otherwise.
    """
    try:
        self.currenturi = review_page_link
        self.__setSoupForCurrentUri()
        page = self.__getData(post_id)
        if not page:
            return True
        # Post identity is derived from the post body text alone.
        unique_key = get_hash({'data' : page['data']})
        log.info(unique_key)
        if checkSessionInfo(self.genre, self.session_info_out, unique_key,
                            self.task.instance_data.get('update'),
                            parent_list=[self.task.instance_data['uri']]):
            log.info(self.log_msg('Session info returns True'))
            return False
    except:
        log.exception(self.log_msg('Cannot add the post for the url %s'%
                                   self.currenturi))
        return False
    try:
        page['uri'] = self.currenturi
    except:
        # NOTE(review): both branches assign the same value, so this
        # try/except looks redundant — confirm before simplifying.
        log.info(self.log_msg('Cannot find the uri'))
        page['uri'] = self.currenturi
    try:
        result=updateSessionInfo(self.genre, self.session_info_out, unique_key,
                                 get_hash( page ),'Review', self.task.instance_data.get('update'),
                                 parent_list=[self.task.instance_data['uri']])
        if not result['updated']:
            log.exception(self.log_msg('Update session info returns False'))
            return True
        page['parent_path'] = []
        page['path'] = [self.task.instance_data['uri'], unique_key]
        page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
        page.update(self.__task_elements_dict)
        self.pages.append(page)
        log.info(page)
        log.info(self.log_msg('Post Added'))
        return True
    except:
        log.exception(self.log_msg('Error while adding session info'))
        return True
def __setParentPage(self):
    """Build the parent (thread) page from the breadcrumb table and
    append it to self.pages, unless it already exists in session info.
    """
    page = {}
    try:
        # First non-empty cell holds the '>'-separated hierarchy,
        # second holds the thread title.
        hierarchies = [each for each in [stripHtml(x.renderContents()) for
                       x in self.soup.find('table', 'tborder').
                       table.findAll('td')] if each]
        self.hierarchy = [x.strip() for x in hierarchies[0].split('>')]
        page['data'] = page['title'] = hierarchies[1]
        self.hierarchy.append(page['title'])
        page['et_thread_hierarchy'] = self.hierarchy
    except:
        log.info(self.log_msg('Thread hierarchy is not found'))
        return False
    if checkSessionInfo(self.genre, self.session_info_out, self.task.instance_data['uri'],
                        self.task.instance_data.get('update')):
        log.info(self.log_msg('Session info return True, Already exists'))
        return False
    # Carry over metadata already collected by the crawler task.
    # (Was a list comprehension used for its side effects; plain loop is clearer.)
    page_data_keys = ['et_author_name', 'ei_thread_replies_count',
                      'ei_thread_views_count', 'edate_last_post_date',
                      'et_last_post_author']
    for each in page_data_keys:
        value = self.task.pagedata.get(each)
        if value:
            page[each] = value
    try:
        date_str = stripHtml(self.soup.find('div', id='posts').find('td',
                             'thead').renderContents())
        # Resolve the board's relative day names to concrete dates.
        if date_str.startswith('Today'):
            date_str = date_str.replace('Today', datetime.strftime
                                        (datetime.utcnow(), '%m-%d-%Y'))
        elif date_str.startswith('Yesterday'):
            date_str = date_str.replace('Yesterday', datetime.
                                        strftime((datetime.utcnow() - timedelta
                                        (days=1)), '%m-%d-%Y'))
        page['posted_date'] = datetime.strftime(datetime.strptime(date_str,
                                                '%m-%d-%Y, %I:%M %p'), "%Y-%m-%dT%H:%M:%SZ")
    except:
        log.exception(self.log_msg('Posted date not found'))
        page['posted_date'] = datetime.strftime(datetime.utcnow(),
                                                "%Y-%m-%dT%H:%M:%SZ")
    try:
        result = updateSessionInfo('review', self.session_info_out,
                                   self.task.instance_data['uri'], get_hash(page),
                                   'forum', self.task.instance_data.get('update'))
        if result['updated']:
            page['path'] = [self.task.instance_data['uri']]
            page['parent_path'] = []
            page['uri'] = self.currenturi
            page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
            page['entity'] = 'thread'
            page.update(self.__task_elements_dict)
            self.pages.append(page)
        else:
            # BUGFIX: original used 'literal' % value with no %s placeholder
            # (TypeError at runtime) and said "returned True" on the False branch.
            log.info(self.log_msg('Result[updated] returned False for uri %s' % self.currenturi))
    except :
        log.exception(self.log_msg("parent post couldn't be parsed"))
def __addComments(self,parent_list):
    """Fetch the comments page referenced by parent_list[-1] and append
    each comment found on it to self.pages.
    """
    page={}
    try:
        self.currenturi = self.base_url + parent_list[-1]
        self.rawpage = urlopen(self.currenturi).read()
        self._setCurrentPage()
    except:
        # NOTE(review): on fetch failure we fall through and parse whatever
        # self.soup currently holds — confirm this is intended.
        log.info(self.log_msg('comment not found'))
    comments = self.soup.findAll('div','commentWrapper')
    for comment in comments:
        page={}
        try:
            page['et_author_name'] = stripHtml(comment.find('a',id=re.compile('.*UserProfileLink$')).renderContents())
        except:
            log.info(self.log_msg('author name not found'))
        try:
            comment_panel = comment.find('div',id=re.compile('.*CommentUpdatePanel$'))
            # The page only exposes a relative age ("commented N days ago"),
            # so the posted date is reconstructed from now() minus N days.
            no_of_days = int(re.search('commented (\d+) days ago',stripHtml(comment_panel.renderContents())).group(1))
            page['posted_date'] = datetime.strftime(datetime.now() - timedelta(days=no_of_days),"%Y-%m-%dT%H:%M:%SZ")
        except:
            log.info(self.log_msg('posted date not found'))
            page['posted_date'] = datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
        try:
            # NOTE(review): comment_panel may be unbound here if the previous
            # try failed before assigning it; the except below masks that.
            page['data'] = stripHtml(comment_panel.find('div','bottom').renderContents().replace('/>>', '/>'))
        except:
            log.info(self.log_msg('data not found'))
            page['data']=''
        try:
            # Title is the first 50 characters of the comment body.
            if len(page['data']) > 50:
                page['title'] = page['data'][:50] + '...'
            else:
                page['title'] = page['data']
        except:
            log.info(self.log_msg('Title not found'))
            page['title'] = ''
        try:
            # Comment identity is the hash of its body and title.
            unique_key = get_hash( {'data':page['data'],'title':page['title']})
            if checkSessionInfo(self.genre, self.session_info_out, unique_key,
                                self.task.instance_data.get('update'), parent_list=parent_list):
                log.info(self.log_msg('session info returns true for comemnst'))
                continue
            result=updateSessionInfo(self.genre, self.session_info_out,unique_key ,
                                     get_hash(page), 'Comment', self.task.instance_data.get('update'),
                                     parent_list=parent_list)
            if result['updated']:
                # Copy before appending so parent_list itself is not mutated.
                temp_parent_list = parent_list[:]
                page['parent_path'] = temp_parent_list[:]
                temp_parent_list.append(unique_key)
                page['path'] = temp_parent_list
                page['uri'] = self.currenturi
                page['uri_domain'] = urlparse.urlparse(self.currenturi)[1]
                page['entity'] = 'Comment'
                page.update(self.__task_elements_dict)
                self.pages.append(page)
                log.info(self.log_msg('Comment added'))
        except:
            log.info(self.log_msg('Comment not added'))
def getHash(self):
    """Return a hash for this file.

    Each registered file handler that implements getHash() is given a
    chance to produce it; the first truthy result wins.  Falls back to
    hashing the retrieved file contents.
    """
    for handler in filehandlers:
        if not hasattr(handler, "getHash"):
            continue
        try:
            candidate = handler.getHash(self)
        except:
            logException("file handler getHash() failed")
            continue
        if candidate:
            return candidate
    return get_hash(self.retrieveFile())
def check_password(self, password: str) -> bool:
    """Hash *password* and compare it to the stored hash in constant time."""
    candidate = get_hash(password)
    return safe_str_cmp(self.password, candidate)
def set_password(self, password: str):
    """Store the hash of *password* as this user's password."""
    hashed = get_hash(password)
    self.password = hashed
def hash(self):
    """Return the hash of this object's filesystem path."""
    path_hash = get_hash(self.path)
    return path_hash