def __addPosts(self):
     """Collect every post of the current thread and append unseen ones
     to ``self.pages``.

     The first post (while ``self.post_type`` is still truthy) is tagged
     "Question"; every later post is tagged "Suggestion".  Returns False
     when no post containers can be located, otherwise returns None.
     """
     try:
         # Each 'pBody' div sits inside the post's container div; the
         # parent div is what __getData expects.
         reviews = [ x.findParent('div') for x in self.soup.findAll('div','pBody')]
     except:
         log.exception(self.log_msg('Reviews are not found'))
         return False
     for i, review in enumerate(reviews):
         post_type = ""
         if i==0 and self.post_type:
             post_type = "Question"
             # Only the very first post of the thread is the question.
             self.post_type = False
         else:
             post_type = "Suggestion"
         page = self.__getData( review , post_type )
         log.info(self.log_msg(page))
         try:
             # Hash of the whole page dict is the version hash; the
             # data+title hash is the post's stable identity key.
             review_hash = get_hash( page )
             log.info(page)
             unique_key = get_hash( {'data':page['data'],'title':page['title']})
             # Skip posts already recorded under this parent thread.
             if checkSessionInfo(self.genre, self.session_info_out, unique_key,\
                          self.task.instance_data.get('update'),parent_list\
                                                         =[self.parent_uri]):
                 log.info(self.log_msg('session info return True'))
                 continue
             result=updateSessionInfo(self.genre, self.session_info_out, unique_key, \
                         review_hash,'Thread', self.task.instance_data.get('update'),\
                                                     parent_list=[self.parent_uri])
             if not result['updated']:
                 log.info(self.log_msg('result not updated'))
                 continue
             #page['first_version_id']=result['first_version_id']
             #page['parent_id']= '-'.join(result['id'].split('-')[:-1])
             #page['id'] = result['id']
             # parent_path is copied before the key is appended so the two
             # lists do not alias each other.
             parent_list = [self.parent_uri]
             page['parent_path']=copy.copy(parent_list)
             parent_list.append(unique_key)
             page['path']=parent_list
             # Task bookkeeping copied onto every emitted page.
             page['priority']=self.task.priority
             page['level']=self.task.level
             page['pickup_date'] = datetime.strftime(datetime.utcnow()\
                                                 ,"%Y-%m-%dT%H:%M:%SZ")
             page['connector_instance_log_id'] = self.task.connector_instance_log_id
             page['connector_instance_id'] = self.task.connector_instance_id
             page['workspace_id'] = self.task.workspace_id
             page['client_id'] = self.task.client_id
             page['client_name'] = self.task.client_name
             page['last_updated_time'] = page['pickup_date']
             page['versioned'] = False
             page['entity'] = 'Review'
             page['category'] = self.task.instance_data.get('category','')
             page['task_log_id']=self.task.id
             page['uri'] = self.currenturi
             page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
             self.pages.append( page )
             #log.info(page)
             log.info(self.log_msg('Review Added'))
         except:
             log.exception(self.log_msg('Error while adding session info'))
 def __addPost(self, post,is_question=False):
     '''Build a page dict from one post tag and append it to self.pages.

     Returns True when the caller should continue with the next post and
     False when this post was already captured in an earlier crawl (so
     the caller can stop paging back through older posts).
     '''
     try:
         page = self.__getData(post,is_question)
         if not page:
             log.info(self.log_msg('No data found in url %s'%self.currenturi))
             return True
         # A post's identity is the hash of its data + title.
         unique_key = get_hash({'data':page['data'], 'title':page['title']})
         if checkSessionInfo(self.__genre, self.session_info_out, \
                 unique_key, self.task.instance_data.get('update'),\
                 parent_list=[self.task.instance_data['uri']]):
             log.info(self.log_msg('Session info returns True for uri %s'% \
                                                         self.currenturi))
             return False
         result = updateSessionInfo(self.__genre, self.session_info_out, unique_key, \
             get_hash( page ),'forum', self.task.instance_data.get('update'),\
                             parent_list=[self.task.instance_data['uri']])
         if result['updated']:
             # Path is rooted at the thread uri from the task instance data.
             page['parent_path'] = [self.task.instance_data['uri']]
             page['path'] = [self.task.instance_data['uri'], unique_key ]
             page['uri_domain']  = urlparse.urlparse(page['uri'])[1]
             page.update(self.__task_elements_dict)
             self.pages.append(page)
         else:
             log.info(self.log_msg('Update session info returns False for \
                                                 url %s'%self.currenturi))
     except:
         log.exception(self.log_msg('Cannot add the post in url %s'%self.currenturi))
     return True
 def __addPost(self, post):
     """
     Build a page dict from one post tag and append it to self.pages.

     This variant keys the session info on the hash of the whole page
     dict and uses no parent_list, so pages are stored flat.  Returns
     True to continue with the next post, False when the post was seen
     in a previous crawl.
     """
     try:
         page = self.__getData(post)
         if not page:
             log.info(self.log_msg('page contains empty data, getdata \
                                 returns  False for uri %s'%self.currenturi))
             return True
         unique_key = get_hash(page)
         if checkSessionInfo(self.__genre, self.session_info_out, unique_key,\
                                     self.task.instance_data.get('update')):
             log.info(self.log_msg('Session info returns True for uri %s'%unique_key))
             return False
         result = updateSessionInfo(self.__genre, self.session_info_out, unique_key, \
             get_hash( page ),'forum', self.task.instance_data.get('update'))
         if result['updated']:
             # Flat storage: no parent, the path is just this post's key.
             page['parent_path'] = []
             page['path'] = [unique_key]
             page['uri'] = self.currenturi
             page['uri_domain']  = urlparse.urlparse(page['uri'])[1]
             log.info(page)
             page.update(self.__task_elements_dict)
             self.pages.append(page)
         else:
             log.info(self.log_msg('Update session info returns False for \
                                                 url %s'%self.currenturi))
     except:
         log.exception(self.log_msg('Cannot add the post for the uri %s'%self.currenturi))
     return True
    def __addPost(self, post):
        '''Build a review page from one post tag and append it to
        self.pages.

        Session identity is the hash of the post *data* alone, scoped to
        the current page uri.  Returns True on success (or harmless
        skip), False when the post was already captured or an unexpected
        error occurred.
        '''
        try:
            
            page = self.__getData(post)
            if not page:
                return True
            # Identity is the data hash only; the title is ignored here.
            unique_key  = get_hash( {'data' : page['data'] })
            if checkSessionInfo('review', self.session_info_out, unique_key,\
                         self.task.instance_data.get('update'),parent_list\
                                            = [self.currenturi]):
                log.info(self.log_msg('Session info returns True'))
                return False

            result=updateSessionInfo('review', self.session_info_out, unique_key, \
                get_hash( page ),'Review', self.task.instance_data.get('update'),\
                                parent_list=[self.currenturi])
            if not result['updated']:
                log.info(self.log_msg('Update session info returns False'))
                return True
            # NOTE(review): unique_key is deliberately not appended to
            # 'path' here (see the commented-out line below).
            page['path'] = [self.currenturi] 
            page['parent_path'] = []
            #page['path'].append(unique_key)
            page['uri'] = self.currenturi
            page['uri_domain']  = urlparse.urlparse(page['uri'])[1]
            page['entity'] = 'post'
            page.update(self.__task_elements_dict)
            self.pages.append(page)
            log.info(page)
            log.info(self.log_msg('Post Added'))
            return True
        except:
            log.exception(self.log_msg('Error while adding session info'))
            return False  
 def __addPosts(self):
         '''Collect every review/post on the current page and append
         unseen ones to self.pages.

         Post containers are the divs whose id starts with "edit".  The
         first post (while self.post_type is truthy) is the "Question";
         the rest are "Suggestion" replies.  Returns False when nothing
         is found.
         '''
         try:
             reviews =self.soup.findAll('div',id=re.compile('^edit.*?'))
             if not reviews:
                 log.info(self.log_msg('No reviews found'))
                 return False
         except:
             log.exception(self.log_msg('Reviews are not found'))
             return False
         for i, review in enumerate(reviews):
             post_type = "Question"
             if i==0 and self.post_type:
                 post_type = "Question"
                 self.post_type = False
             else:
                 post_type = "Suggestion"
             page = self.__getData( review , post_type )
             if not page:
                 log.info(self.log_msg('no page is sent back'))
                 continue
             try:
                 review_hash = get_hash( page )
                 # not changed ,bcoz, we already crawled
                 unique_key = get_hash( {'data':page['data'],'title':page['title']})
                 # Skip entries already recorded under this parent thread.
                 if checkSessionInfo(self.genre, self.session_info_out, unique_key,\
                              self.task.instance_data.get('update'),parent_list\
                                                             =[self.parent_uri]):
                     continue
                 result=updateSessionInfo(self.genre, self.session_info_out, unique_key, \
                             review_hash,'Review', self.task.instance_data.get('update'),\
                                                         parent_list=[self.parent_uri])
                 if not result['updated']:
                     continue
                 # parent_path is copied before the key is appended so the
                 # two lists do not alias each other.
                 parent_list = [ self.parent_uri ]
                 page['parent_path'] = copy.copy(parent_list)
                 parent_list.append( unique_key )
                 page['path']=parent_list
                 # Task bookkeeping copied onto every emitted page.
                 page['priority']=self.task.priority
                 page['level']=self.task.level
                 page['pickup_date'] = datetime.strftime(datetime.utcnow()\
                                                     ,"%Y-%m-%dT%H:%M:%SZ")
                 page['connector_instance_log_id'] = self.task.connector_instance_log_id
                 page['connector_instance_id'] = self.task.connector_instance_id
                 page['workspace_id'] = self.task.workspace_id
                 page['client_id'] = self.task.client_id
                 page['client_name'] = self.task.client_name
                 page['last_updated_time'] = page['pickup_date']
                 page['versioned'] = False
                 page['entity'] = 'Review'
                 page['category'] = self.task.instance_data.get('category','')
                 page['task_log_id']=self.task.id
                 #page['uri'] = self.currenturi #Skumar
                 page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
                 self.pages.append( page )
                 #log.info(page)
                 log.info(self.log_msg('Review Added'))
             except:
                 log.exception(self.log_msg('Error while adding session info'))
 def __addPosts(self):
     """Collect every post of the current thread and append unseen ones
     to self.pages.

     The message table is split on the '<!-- Start Message head -->'
     marker comment and each fragment is re-parsed as its own soup.  The
     first post (while self.post_type is truthy) is the "Question"; the
     rest are "Suggestion" replies.  Returns False when the table cannot
     be parsed.
     """
     try:
         # Split the raw table html on the per-message marker comment and
         # re-soup each fragment ([1:] drops the part before the first post).
         reviews = [ BeautifulSoup(x) for x in  self.soup.find('table','Frm_MsgTable').__str__().split('<!-- Start Message head -->')[1:]]
     except:
         log.exception(self.log_msg('Reviews are not found'))
         return False
     post_type = "Question"
     log.info([review.find('a')['name'] for review in reviews])
     for i, review in enumerate(reviews):
         if i==0 and self.post_type:
             post_type = "Question"
             self.post_type = False
         else:
             post_type = "Suggestion"
         page = self.__getData( review , post_type )
         if not page:
             log.info(self.log_msg('Todays Post , so, continue with other post'))
             continue
         try:
             review_hash = get_hash( page )
             #unique_key = review.find('a')['name']
             unique_key = get_hash( {'data':page['data'],'title':page['title']})
             # Skip posts already recorded under this parent thread.
             if checkSessionInfo(self.genre, self.session_info_out, unique_key,\
                          self.task.instance_data.get('update'),parent_list\
                                                         =[self.parent_uri]):
                 continue
             result=updateSessionInfo(self.genre, self.session_info_out, unique_key, \
                         review_hash,'Review', self.task.instance_data.get('update'),\
                                                     parent_list=[self.parent_uri])
             if not result['updated']:
                 continue
             parent_list = [self.parent_uri]
             page['parent_path']=copy.copy(parent_list)
             parent_list.append(unique_key)
             page['path']=parent_list
             # Task bookkeeping copied onto every emitted page.
             page['priority']=self.task.priority
             page['level']=self.task.level
             page['pickup_date'] = datetime.strftime(datetime.utcnow()\
                                                 ,"%Y-%m-%dT%H:%M:%SZ")
             page['connector_instance_log_id'] = self.task.connector_instance_log_id
             page['connector_instance_id'] = self.task.connector_instance_id
             page['workspace_id'] = self.task.workspace_id
             page['client_id'] = self.task.client_id
             page['client_name'] = self.task.client_name
             page['last_updated_time'] = page['pickup_date']
             page['versioned'] = False
             page['entity'] = 'Review'
             page['category'] = self.task.instance_data.get('category','')
             page['task_log_id']=self.task.id
             # __getData may have set a per-post uri; fall back to the thread uri.
             page['uri'] = page.get('uri',self.parent_uri)
             page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
             self.pages.append( page )
             #log.info(page)
             log.info(self.log_msg('Review Added'))
         except:
             log.exception(self.log_msg('Error while adding session info'))
# ---- Example 7 (separator header from the scraped code listing; the
# stray "0" below it was a vote-count artifact of the extraction) ----
def check_for_duplicates(paths, hash=hashlib.sha1):
    """Detect duplicate files among *paths*, logging every duplicate pair.

    Three increasingly expensive passes keep hashing to a minimum:

    1. group paths by file size (only same-size files can be equal);
    2. within each size group, hash only the first 1 KiB of each file;
    3. for files whose small hashes collide, compare full-file hashes —
       a full-hash collision is reported as a duplicate.

    Parameters
    ----------
    paths : sequence of str
        File paths to examine.
    hash : callable
        Unused; kept for backward compatibility.  Hashing is delegated
        to the module-level ``get_hash`` helper.

    Returns
    -------
    int
        Number of duplicate files found (also logged).
    """
    hashes_by_size = {}  # file size -> list of paths with that size
    hashes_on_1k = {}    # hash of first 1 KiB -> list of candidate paths
    hashes_full = {}     # full-file hash -> first path seen with it
    n_tot = len(paths)

    # Pass 1: bucket by size.  setdefault replaces the original
    # get-then-create-then-append dance with a single lookup.
    for i, path in enumerate(paths):
        hashes_by_size.setdefault(os.path.getsize(path), []).append(path)
        if ((i % 10000) == 0) and i > 0:
            print("Checked size of {}/{} files".format(i, n_tot))

    # Pass 2: same-size files are compared on the hash of their 1st 1024 bytes.
    logger.info("Checking for duplication by comparing small hashes ..")
    for files in hashes_by_size.values():
        if len(files) < 2:
            continue
        for filename in files:
            try:
                small_hash = get_hash(filename, first_chunk_only=True)
            except OSError:
                # the file access might've changed till the exec point got here
                continue
            hashes_on_1k.setdefault(small_hash, []).append(filename)

    # Pass 3: small-hash collisions are confirmed with a full-file hash.
    n_duplicates = 0
    logger.info("Checking for duplication by comparing full hashes ..")
    for files in hashes_on_1k.values():
        # A unique first-1k hash cannot be a duplicate; skip it cheaply.
        if len(files) < 2:
            continue
        for filename in files:
            try:
                full_hash = get_hash(filename, first_chunk_only=False)
            except OSError:
                # the file access might've changed till the exec point got here
                continue
            duplicate = hashes_full.get(full_hash)
            if duplicate:
                logger.info("Duplicate found: %s and %s" %
                            (filename, duplicate))
                n_duplicates += 1
            else:
                hashes_full[full_hash] = filename
    logger.info("Found {} duplicates".format(n_duplicates))
    # New, backward-compatible: expose the count to callers (was None).
    return n_duplicates
 def __addPosts(self):
     """Collect every post of the current thread and append unseen ones
     to self.pages.

     Posts live in the tables with id 'tblTitle'.  The first post (while
     self.post_type is truthy) is the "Question"; the rest are
     "Suggestion" replies.  Returns False when the soup cannot be
     searched, otherwise returns None.
     """
     try:
         reviews = self.soup.findAll('table',id='tblTitle')
     except:
         log.exception(self.log_msg('Reviews are not found'))
         return False
     for i, review in enumerate(reviews):
         if i==0 and self.post_type:
             post_type = "Question"
             self.post_type = False
         else:
             post_type = "Suggestion"
         try:
             page = self.__getData( review, post_type )
             # A post's identity is the hash of its data + title.
             unique_key = get_hash( {'data':page['data'],'title':page['title']})
             #unique_key = stripHtml(review.findNext('a',id=re.compile('PostLink')).renderContents()).split('#')[-1]
             if checkSessionInfo(self.genre, self.session_info_out, unique_key,\
                          self.task.instance_data.get('update'),parent_list\
                                                         =[self.parent_uri]):
                 log.info(self.log_msg('Session info returns True'))
                 continue
             
         except:
             log.info(self.log_msg('unique key not found'))
             continue
         try:
             result=updateSessionInfo(self.genre, self.session_info_out, unique_key, \
                         get_hash( page ),'Review', self.task.instance_data.get('update'),\
                                                     parent_list=[self.parent_uri])
             if not result['updated']:
                 continue
             # parent_path is copied before the key is appended so the two
             # lists do not alias each other.
             parent_list = [ self.parent_uri ]
             page['parent_path'] = copy.copy(parent_list)
             parent_list.append( unique_key )
             page['path']=parent_list
             # Task bookkeeping copied onto every emitted page.
             page['priority']=self.task.priority
             page['level']=self.task.level
             page['pickup_date'] = datetime.strftime(datetime.utcnow()\
                                                 ,"%Y-%m-%dT%H:%M:%SZ")
             page['connector_instance_log_id'] = self.task.connector_instance_log_id
             page['connector_instance_id'] = self.task.connector_instance_id
             page['workspace_id'] = self.task.workspace_id
             page['client_id'] = self.task.client_id
             page['client_name'] = self.task.client_name
             page['last_updated_time'] = page['pickup_date']
             page['versioned'] = False
             page['entity'] = 'Review'
             page['category'] = self.task.instance_data.get('category','')
             page['task_log_id']=self.task.id
             page['uri'] = self.currenturi
             page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
             self.pages.append( page )
             log.info(self.log_msg('Review Added'))
         except:
             log.exception(self.log_msg('Error while adding session info'))
    def __setParentPage(self):
        """Extract the thread's opening post (title, body, posted date)
        and append it to self.pages as the parent page.

        Returns False when the main title/data cannot be scraped or the
        page was already captured; True otherwise (including when the
        session-info update reports nothing new).
        """
        
        page = {}
        try: 
            page['title']  = stripHtml(self.soup.find('div','brdSubHd grey top botOne').renderContents()).split('replies')[-1].strip()
            #log.info(page['title'])
            
            page['data'] = stripHtml(self.soup.find('div','mbPanel clearPanel').renderContents())
             
            try:
                # Posted date looks like '<author> on dd/mm/yy at hh:mm AM/PM'.
                date_str = stripHtml(self.soup.find('div','brdSubHd blue').renderContents()).split('on')[-1].strip()
                page['posted_date'] = datetime.strftime(datetime.strptime(date_str,'%d/%m/%y at %I:%M %p'),"%Y-%m-%dT%H:%M:%SZ")             
   
            except:
                # Fall back to crawl time when the date cannot be parsed.
                log.exception(self.log_msg('Posted date not found'))
                page['posted_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
        except:
            log.exception(self.log_msg('main page title  not found'))
            return False  
        unique_key = get_hash({'title': page['title'],'data' : page['data']})
        if checkSessionInfo(self.genre, self.session_info_out, unique_key,\
            self.task.instance_data.get('update')):
                    
            log.info(self.log_msg('Session info returns True for uri %s'\
                                                                           %self.currenturi))
            return False
        # Copy selected task page-data keys onto the page when present.
        page_data_keys = ['et_first_author_name', 'ei_thread_replies_count', \
                            'edate_last_post_date']
        [page.update({each:self.task.pagedata.get(each)}) for each in \
                                page_data_keys if self.task.pagedata.get(each)] 
        try:
            result=updateSessionInfo(self.genre, self.session_info_out, unique_key, \
                    get_hash( page ),'Review', self.task.instance_data.get('update'))
            if not result['updated']:
                log.exception(self.log_msg('Update session info returns False'))
                return True
            # NOTE(review): path and parent_path deliberately share one
            # list here; neither is mutated afterwards, so no aliasing bug.
            page['parent_path'] = page['path'] = [self.task.instance_data['uri']]
##            page['path'] = [unique_key]
            #page['path'].append(unique_key)
            page['uri'] = self.currenturi
            page['entity'] = 'Review'
            page['uri_domain']  = urlparse.urlparse(page['uri'])[1]
            page.update(self.__task_elements_dict)
            self.pages.append(page)
            #log.info(page)
            log.info(self.log_msg('Post Added'))
            return True        
        except:
            log.exception(self.log_msg('Error while adding session info'))
            return False  
    def __addPost(self, post, is_original_post=False):
        """Build a page dict from one post tag and append it to
        self.pages.

        The unique key is scraped from the third 'oneLine' div of the
        post (presumably a post id — TODO confirm against the site
        markup).  Returns True to continue with the next post, False
        when the post was already captured.
        """
        try:
            # Third 'oneLine' div, third whitespace-separated token.
            unique_key = stripHtml(str(post.findAll('div', 'oneLine')[2])).split()[2]

            page = self.__get_data(post, is_original_post, unique_key)
            if not page: 
                log.info(self.log_msg('page is empty, __get_data returns  False for uri %s' % 
                                      self.currenturi))
                return True

            if checkSessionInfo(self.__genre, self.session_info_out, 
                                unique_key, self.task.instance_data.get('update'), 
                                parent_list=[self.task.instance_data['uri']]):
                log.info(self.log_msg('Session info returns True for uri %s' % 
                                      self.task.instance_data['uri']))
                return False

            result = updateSessionInfo(self.__genre, self.session_info_out, unique_key, 
                                       get_hash(page),'forum', self.task.instance_data.get('update'), 
                                       parent_list=[self.task.instance_data['uri']])
            if result['updated']:
                # Path is rooted at the thread uri from the task instance data.
                page['parent_path'] = [self.task.instance_data['uri']]
                page['path'] = [self.task.instance_data['uri'], unique_key]
                page['uri_domain']  = urlparse.urlparse(page['uri'])[1]
                page.update(self.__task_elements_dict)
                self.pages.append(page)
            else:
                log.info(self.log_msg('Update session info returns False for url %s' % self.currenturi))
        except:
            log.exception(self.log_msg('Cannot add the post for the uri %s' % self.currenturi))

        return True
 def __addPost(self, post, is_question=False):
     """Build a page dict from one post tag and append it to self.pages.

     The unique key is the numeric part of the post container's
     'post-<n>' id.  Returns True to continue with the next post, False
     when the post was already captured.
     """
     try:
         # Numeric id out of the div whose id matches 'post-<digits>'.
         unique_key = re.search(r'(\d+)', post.find('div', id = re.compile(r'^post-\d+$'))['id']).groups()[0]
         if checkSessionInfo(self.__genre, self.session_info_out, unique_key, \
                          self.task.instance_data.get('update'),parent_list\
                                         = [self.task.instance_data['uri']]):
             log.info(self.log_msg('Session info returns True for %s' % unique_key))
             return False
         page = self.__getData(post, is_question)
         if not page:
             log.info(self.log_msg('page contains empty data __getData \
                         returns  False for uri %s'%self.currenturi) )
             return True
         result = updateSessionInfo(self.__genre, self.session_info_out, unique_key, \
             get_hash( page ),'forum', self.task.instance_data.get('update'),\
                             parent_list=[self.task.instance_data['uri']])
         if result['updated']:
             page['parent_path'] = [self.task.instance_data['uri']]
             page['path'] = [ self.task.instance_data['uri'], unique_key]
             # Per-post permalink taken from the nearest 'link_to_post' anchor.
             page['uri'] = post.findPrevious('a', attrs = {'onclick': re.compile('link_to_post')})['href'].__str__()
             page['uri_domain']  = urlparse.urlparse(page['uri'])[1]
             page.update(self.__task_elements_dict)
             self.pages.append(page)
         else:
             log.info(self.log_msg('Update session info returns False for \
                                                 url %s' % self.currenturi))
     except:
         log.exception(self.log_msg('Cannot add the post for the uri %s' % self.currenturi))
     return True
def eliminate_ambigous_size_matches(matches):
    """Eliminate ambiguous matches (same size) by comparing file hashes.

    *matches* maps a reference file path to the list of candidate paths
    that matched it by size.  For every entry with more than one
    candidate, the candidate list is replaced in place by the subset
    whose full-file hash equals the reference file's hash.  Entries with
    at most one candidate are left untouched.

    Parameters
    ----------
    matches : dict[str, list[str]]
        Mutated in place; the function returns None.
    """
    for path, size_matches in matches.items():
        if len(size_matches) <= 1:
            continue
        try:
            hash_to_find = get_hash(path, first_chunk_only=False)
        except OSError:
            # Fix: the original let an unreadable *reference* file crash
            # the whole pass while unreadable candidates were skipped.
            # Skip this entry instead, consistent with the loop below.
            continue
        hash_matches = []
        for candidate in size_matches:  # renamed: 'file' shadowed a builtin
            try:
                hash_to_match = get_hash(candidate, first_chunk_only=False)
            except OSError:
                # the file access might've changed till the exec point got here
                continue
            if hash_to_match == hash_to_find:
                hash_matches.append(candidate)
        matches[path] = hash_matches
 def __addPost(self, post, is_question=False):
     """
     Build a page dict from one post tag and append it to self.pages.

     The unique key is the message id scraped from the 'msgId<n>' div
     (the surrounding brackets and the 'Msg Id: ' prefix are stripped).
     Returns True to continue with the next post, False when the post
     was already captured or the session update reports nothing new.
     """
     try:
         # '[Msg Id: 123]' -> '123': drop the brackets, then the prefix.
         unique_key = stripHtml(post.find('div', id=re.compile('msgId\d+'))\
                         .renderContents())[1:-1].replace('Msg Id: ', '')
         if checkSessionInfo(self.__genre, self.session_info_out, unique_key, \
                          self.task.instance_data.get('update'),parent_list\
                                      = [self.task.instance_data['uri']]):
             log.info(self.log_msg('Session info returns True for uri %s'%unique_key))
             return False
         page = self.__getData(post, is_question)
         if not page:
             log.info(self.log_msg('page contains empty data, getdata \
                                 returns  False for uri %s'%self.currenturi))
             return True
         result = updateSessionInfo(self.__genre, self.session_info_out, unique_key, \
             get_hash( page ),'forum', self.task.instance_data.get('update'), \
                             parent_list=[self.task.instance_data['uri']])
         if result['updated']:
             page['parent_path'] = [self.task.instance_data['uri']]
             page['path'] = [self.task.instance_data['uri'], unique_key]
             page['uri_domain']  = urlparse.urlparse(page['uri'])[1]
             page.update(self.__task_elements_dict)
             self.pages.append(page)
             log.info(self.log_msg('Page added'))
         else:
             log.info(self.log_msg('Update session info returns False for \
                                                 url %s'%self.currenturi))
             return False
     except:
         log.exception(self.log_msg('Cannot add the post for the uri %s'%self.currenturi))
     return True
 def __addReviews(self):
     '''Fetch the comment blocks on the current page and append unseen
     ones to self.pages.

     A comment's identity is the ``name`` attribute of its anchor tag.
     Fixes a list-aliasing bug in the original code: ``page['path']``
     and ``page['parent_path']`` shared one list object, so appending
     the unique key to ``path`` also corrupted ``parent_path``.
     '''
     # Each 'ctedit' span sits two container divs below its review block.
     reviews= [x.findParent('div').findParent('div')  for x in self.soup.findAll('span' ,'ctedit')]
     log.debug(self.log_msg('# Of Reviews found is %d'%len(reviews)))
     for review in reviews:
         try:
             unique_key = review.find('a')['name']
             if checkSessionInfo(self.genre, self.session_info_out, unique_key,\
                          self.task.instance_data.get('update'),parent_list\
                                         =[ self.task.instance_data['uri'] ]):
                 log.info(self.log_msg('session info return True in url %s'%self.currenturi))
                 continue
             page = self.__getData(review)
             if not page:
                 log.info(self.log_msg('No data found in url %s'%self.currenturi))
                 continue
             result = updateSessionInfo(self.genre, self.session_info_out, unique_key, \
                 get_hash(page),'comment', self.task.instance_data.get('update'),\
                                 parent_list=[self.task.instance_data['uri']])
             if not result['updated']:
                 log.info(self.log_msg('result not updated'))
                 continue
             # Two distinct lists: appending the key to 'path' must not
             # leak into 'parent_path' (they previously aliased one list,
             # so parent_path wrongly ended up as [uri, unique_key]).
             page['parent_path'] = [ self.task.instance_data['uri'] ]
             page['path'] = [ self.task.instance_data['uri'], unique_key ]
             page['entity'] = 'comment'
             page['uri'] = self.task.instance_data['uri']
             page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
             page.update(self.__task_elements_dict)
             self.pages.append(page)
             log.info(self.log_msg('Review Added'))
         except:
             log.exception(self.log_msg('Exception while adding session info in url %s'%self.currenturi))
 def __addPost(self, post, is_question = False):
     """
     Build a page dict from one post tag and append it to self.pages.

     The unique key is the href of the post's 'postcounter' anchor,
     which doubles as the post's own uri.  Returns True to continue
     with the next post, False when the post was already captured.
     """
     try:  
         unique_tag = post.find('a', 'postcounter')
        #is_question = stripHtml(unique_tag.renderContents())== u'#1'
         # The permalink href identifies the post and serves as its uri.
         unique_key = unique_tag['href']
         if checkSessionInfo(self.__genre, self.session_info_out, unique_key,\
                                     self.task.instance_data.get('update')):
             log.info(self.log_msg('Session info returns True for uri %s'%unique_key))
             return False
         page = self.__getData(post, is_question, unique_key)
         if not page:
             log.info(self.log_msg('page contains empty data, getdata \
                                 returns  False for uri %s'%self.currenturi))
             return True
         result = updateSessionInfo(self.__genre, self.session_info_out, unique_key, \
             get_hash( page ),'forum', self.task.instance_data.get('update'))
         if result['updated']:
             # Flat storage: no parent, path is just the permalink key.
             page['parent_path'] = []
             page['path'] = [unique_key]
             page['uri'] = unique_key
             page['uri_domain']  = urlparse.urlparse(page['uri'])[1]
             page.update(self.__task_elements_dict)
             self.pages.append(page)
         else:
             log.info(self.log_msg('Update session info returns False for \
                                                 url %s'%self.currenturi))
     except:
         log.exception(self.log_msg('Cannot add the post for the uri %s'%self.currenturi))
     return True
 def __addPosts(self):
     """ It will add Post for a particular thread

     Every 'wrapper_comment' div on the page is parsed; the first one is
     tagged as the Question, every later one as a Suggestion.
     """
     try:
         reviews = self.soup.findAll('div', 'wrapper_comment')
     except:
         log.exception(self.log_msg('Reviews are not found'))
         return False
     for index, review in enumerate(reviews):
         # Opening post is the thread question; all replies are suggestions.
         post_type = "Question" if index == 0 else "Suggestion"
         try:
             reply_href = review.find('div', 'commentbox_nav')\
                                .find('a', text='Reply').parent['href']
             unique_key = dict(parse_qsl(reply_href.split('?')[-1]))['ReplyToPostID']
             if checkSessionInfo(self.genre, self.session_info_out,
                         unique_key, self.task.instance_data.get('update'),
                         parent_list=[self.parent_uri]):
                 log.info(self.log_msg('Session info returns True'))
                 continue
             page = self.__getData(review, post_type)
             log.info(page)
         except:
             log.info(self.log_msg('unique key not found'))
             continue
         try:
             result = updateSessionInfo(self.genre, self.session_info_out,
                         unique_key, get_hash(page), 'Review',
                         self.task.instance_data.get('update'),
                         parent_list=[self.parent_uri])
             if not result['updated']:
                 continue
             path = [self.parent_uri]
             page['parent_path'] = copy.copy(path)
             path.append(unique_key)
             page['path'] = path
             page['priority'] = self.task.priority
             page['level'] = self.task.level
             page['pickup_date'] = datetime.strftime(datetime.utcnow(),
                                                     "%Y-%m-%dT%H:%M:%SZ")
             page['connector_instance_log_id'] = self.task.connector_instance_log_id
             page['connector_instance_id'] = self.task.connector_instance_id
             page['workspace_id'] = self.task.workspace_id
             page['client_id'] = self.task.client_id
             page['client_name'] = self.task.client_name
             page['last_updated_time'] = page['pickup_date']
             page['versioned'] = False
             page['entity'] = 'Review'
             page['category'] = self.task.instance_data.get('category', '')
             page['task_log_id'] = self.task.id
             page['uri'] = self.currenturi
             page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
             self.pages.append(page)
             log.info(self.log_msg('Review Added'))
         except:
             log.exception(self.log_msg('Error while adding session info'))
 def __addPost(self, post, is_question=False):
     """
     This will take the post tag , and fetch data and meta data and add it to 
     self.pages
     """
     try:
         unique_key = post.find('a', attrs={'name': True})['name']
         permalink = '#'.join([self.currenturi, unique_key])
         if checkSessionInfo(self.__genre, self.session_info_out,
                 unique_key, self.task.instance_data.get('update'),
                 parent_list=[self.task.instance_data['uri']]):
             log.info(self.log_msg('Session info returns True for uri %s' % permalink))
             return False
         page = self.__getData(post, is_question, unique_key)
         if not page:
             log.info(self.log_msg('page contains empty data, getdata \
                                 returns  False for uri %s'%self.currenturi))
             return True
         result = updateSessionInfo(self.__genre, self.session_info_out,
                 unique_key, get_hash(page), 'forum',
                 self.task.instance_data.get('update'),
                 parent_list=[self.task.instance_data['uri']])
         if not result['updated']:
             log.info(self.log_msg('Update session info returns False for \
                                                 url %s'%self.currenturi))
         else:
             page['parent_path'] = [self.task.instance_data['uri']]
             page['path'] = [self.task.instance_data['uri'], unique_key]
             page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
             page.update(self.__task_elements_dict)
             self.pages.append(page)
     except:
         log.exception(self.log_msg('Cannot add the post for the uri %s' % self.currenturi))
     return True
 def __addPosts(self, link):
     '''It will add the post
     '''
     try:
         self.currenturi = link
         already_seen = checkSessionInfo('review', self.session_info_out,
                 self.currenturi, self.task.instance_data.get('update'))
         if already_seen:
             log.info(self.log_msg('Session info returns True for uri %s'
                                                      % self.currenturi))
             return False
         self.__setSoupForCurrentUri()
         page = self.__getData()
         if not page:
             return True
         result = updateSessionInfo('review', self.session_info_out,
                 self.currenturi, get_hash(page), 'review',
                 self.task.instance_data.get('update'))
         if not result['updated']:
             log.info(self.log_msg('Update session info returns False for \
                                             url %s'%self.currenturi))
         else:
             page['uri'] = self.currenturi
             page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
             page['path'] = [self.currenturi]
             page['parent_path'] = []
             page['entity'] = 'review'
             page.update(self.__task_elements_dict)
             self.pages.append(page)
             log.info(self.log_msg('Page added'))
     except:
         log.exception(self.log_msg('Cannot add the post for the uri %s'
                                                      % self.currenturi))
         return False 
Example #19
0
 def __getParentPage(self,comment):
     """Parse the thread-level (parent) info out of *comment* and append
     it to self.pages when it has not been seen before.

     comment : tag containing <totalreplies>, <name>, <dateadded> and
               <messageid> children (per the find() calls below).
     """
     page = {}
     try:
         self.__total_replies_count = page['ei_data_replies_count'] = int(stripHtml(comment.find('totalreplies').renderContents()))
         page['title'] = page['data'] = stripHtml(comment.find('name').renderContents())
         page['posted_date'] = stripHtml(comment.find('dateadded').renderContents()).split('.')[0]
         unique_key = stripHtml(comment.find('messageid').renderContents())
         if checkSessionInfo(self.__genre, self.session_info_out, self.task.instance_data['uri'],\
                                      self.task.instance_data.get('update')):
             log.info(self.log_msg('Session info return True, Already exists'))
             return
         result = updateSessionInfo('review', self.session_info_out, self.\
             task.instance_data['uri'], get_hash( page ), 'forum', self.task.instance_data.get('update'))
         if result['updated']:
             page['path']=[unique_key] 
             page['parent_path']=[]
             page['uri'] = self.currenturi
             page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
             page['entity'] = 'post'
             page.update(self.__task_elements_dict)
             log.info(page['data'])
             self.pages.append(page)
         else:
             # BUG FIX: the old message applied '%' to a format string with
             # no placeholder, raising TypeError that the bare except below
             # silently swallowed (and it said "True" in the False branch).
             log.info(self.log_msg('Result[updated] returned False for '
                                   'uri %s' % self.currenturi))
     except:
         log.exception(self.log_msg('Hierachy/Title not found in url %s'%self.currenturi))
         return
 def __addPost(self, post, is_question = False):
     """
     This will take the post tag , and fetch data and meta data and add it to 
     self.pages
     """
     try:
         anchor = post.find('a', id=re.compile('postcount\d+'))
         unique_key = anchor['id']
         log.info(unique_key)
         if checkSessionInfo(self.__genre, self.session_info_out, unique_key,
                                     self.task.instance_data.get('update')):
             log.info(self.log_msg('Session info returns True for uri %s'%unique_key))
             return False
         page = self.__getData(post, is_question)
         if not page:
             log.info(self.log_msg('page contains empty data, getdata \
                                 returns  False for uri %s'%self.currenturi))
             return True
         result = updateSessionInfo(self.__genre, self.session_info_out,
                 unique_key, get_hash(page), 'forum',
                 self.task.instance_data.get('update'))
         if not result['updated']:
             log.info(self.log_msg('Update session info returns False for \
                                                 url %s'%self.currenturi))
         else:
             page['parent_path'] = []
             page['path'] = [self.task.instance_data['uri'], unique_key]
             page['uri'] = '#'.join([self.currenturi, unique_key])
             page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
             log.info(page)
             page.update(self.__task_elements_dict)
             self.pages.append(page)
     except:
         log.exception(self.log_msg('Cannot add the post for the uri %s'%self.currenturi))
     return True
 def __setParentPage(self):
     """This will get the parent info

     Builds the thread-level page from the breadcrumb trail and appends
     it to self.pages when the session info reports it as new.
     """
     page = {}
     try:
         page['et_thread_hierarchy'] = self.__hierarchy = [x.strip() for x in stripHtml(self.soup.find('div', 'deck breadcrumbs').renderContents()).split('>') if x.strip()][1:]
         page['data'] = page['title'] = page['et_thread_hierarchy'][-1]
     except:
         log.exception(self.log_msg('Thread hierarchy and Title Not found for uri\
                                                         %s'%self.currenturi))
         return
     if checkSessionInfo(self.__genre, self.session_info_out, self.task.instance_data['uri'], \
                                      self.task.instance_data.get('update')):
         log.info(self.log_msg('Session info return True, Already exists'))
         return
     try:
         result = updateSessionInfo('review', self.session_info_out, self.\
             task.instance_data['uri'], get_hash( page ), 'forum', self.task.instance_data.get('update'))
         if result['updated']:
             page['path'] = [self.task.instance_data['uri']] 
             page['parent_path'] = []
             page['uri'] = self.currenturi
             page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
             # data was only used for hashing above; cleared before emit.
             page['data'] = ''
             page['entity'] = 'thread'
             page.update(self.__task_elements_dict)
             page['posted_date'] = page['pickup_date']
             self.pages.append(page)
             log.info(self.log_msg('Parent Page Added'))
         else:
             # BUG FIX: original format string had no %s placeholder, so
             # the '%' operation raised TypeError inside this try block.
             log.info(self.log_msg('Result[updated] returned False for '
                                   'uri %s' % self.currenturi))
     except:
         log.exception(self.log_msg("parent post couldn't be parsed"))
 def __getParentPage(self):
     """Build the parent (thread) page from self.currenturi plus the
     metadata stored on the task, and append it to self.pages.

     Returns True when added, False when it already exists, could not be
     updated, or parsing failed.
     """
     if checkSessionInfo(self.genre, self.session_info_out, self.currenturi,\
                                      self.task.instance_data.get('update')):
         log.info(self.log_msg('Session info return True, Already exists'))
         return False
     page = {}
     try:
         page['et_thread_hierarchy'] = [each.replace('>','').strip() for each in stripHtml(self.soup.find('span','navbar').findParent('table').renderContents()).split('\n') if not each.strip()=='']
         page['title']= page['et_thread_hierarchy'][-1]
     except:
         log.info(self.log_msg('Thread hierarchy is not found'))
         page['title']=''
     # Copy whatever thread metadata the listing phase stored on the task.
     for each in ['title','et_last_post_author_name','ei_thread_replies_count','ei_thread_views_count','edate_last_post_date','ei_thread_votes_count','ef_thread_rating']:
         try:
             page[each] = self.task.pagedata[each]
         except:
             log.info(self.log_msg('page data cannot be extracted'))
     try:
         page['et_thread_id'] = self.currenturi.split('&')[-1].split('=')[-1]
     except:
         log.info(self.log_msg('Thread id not found'))

     try:
         post_hash = get_hash( page )
         # Seed the session with the task id only when it is brand new.
         # Renamed from `id` to avoid shadowing the builtin.
         session_id = None
         if self.session_info_out=={}:
             session_id = self.task.id
         result=updateSessionInfo( self.genre, self.session_info_out, self.\
                currenturi, post_hash,'Post',self.task.instance_data.get('update'), Id=session_id)
         if not result['updated']:
             return False
         page['path']=[self.currenturi]
         page['parent_path']=[]
         page['uri'] = normalize( self.currenturi )
         page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
         page['priority']=self.task.priority
         page['level']=self.task.level
         # Single clock read so pickup_date and posted_date always agree
         # (the original called utcnow() twice and could straddle a second).
         now = datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
         page['pickup_date'] = now
         page['posted_date'] = now
         page['connector_instance_log_id'] = self.task.connector_instance_log_id
         page['connector_instance_id'] = self.task.connector_instance_id
         page['workspace_id'] = self.task.workspace_id
         page['client_id'] = self.task.client_id
         page['client_name'] = self.task.client_name
         page['last_updated_time'] = page['pickup_date']
         page['versioned'] = False
         page['data'] = ''
         page['task_log_id']=self.task.id
         page['entity'] = 'Post'
         page['category']=self.task.instance_data.get('category','')
         self.pages.append(page)
         log.info(page)
         log.info(self.log_msg('Parent Page added'))
         return True
     except :
         log.exception(self.log_msg("parent post couldn't be parsed"))
         return False
 def __addPost(self, post, is_question=False):
     """Parse one post tag and append the page to self.pages if new."""
     try:
         name_tag = post.find('span', attrs={'class': 'name'})
         unique_key = name_tag.find('a')['name']
         thread_uri = self.task.instance_data['uri']
         if checkSessionInfo(self.__genre, self.session_info_out, unique_key,
                             self.task.instance_data.get('update'),
                             parent_list=[thread_uri]):
             log.info(self.log_msg('Session info returns True for %s' % unique_key))
             return False
         page = self.__getData(post, is_question)
         log.info(self.log_msg('page'))
         if not page:
             log.info(self.log_msg('page contains empty data __getData returns False \
                         for uri %s'%self.currenturi))
             return True
         result = updateSessionInfo(self.__genre, self.session_info_out,
                 unique_key, get_hash(page), 'forum',
                 self.task.instance_data.get('update'),
                 parent_list=[thread_uri])
         if not result['updated']:
             log.info(self.log_msg('Update session info returns False for \
                                                 url %s'%self.currenturi))
         else:
             page['parent_path'] = [thread_uri]
             page['path'] = [thread_uri, unique_key]
             page['uri'] = self.currenturi
             page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
             page.update(self.__task_elements_dict)
             self.pages.append(page)
     except:
         log.exception(self.log_msg('Cannot add the post for the uri %s'
                                                     % self.currenturi))
     return True
 def __addPost(self, post, is_question=False):
     """Extract one post and, when new, append it to self.pages."""
     try:
         unique_key = re.search(r'(\d+)', post['id']).group(1)
         thread_uri = self.task.instance_data['uri']
         if checkSessionInfo(self.__genre, self.session_info_out, unique_key,
                             self.task.instance_data.get('update'),
                             parent_list=[thread_uri]):
             log.info(self.log_msg('Session info returns True for %s'%unique_key))
             return False
         page = self.__getData(post, is_question)
         if not page:
             log.info(self.log_msg('page contains empty data, getdata \
                                 returns  False for uri %s'%self.currenturi))
             return True
         result = updateSessionInfo(self.__genre, self.session_info_out,
                 unique_key, get_hash(page), 'forum',
                 self.task.instance_data.get('update'),
                 parent_list=[thread_uri])
         if not result['updated']:
             log.info(self.log_msg('Update session info returns False for \
                                                 url %s'%self.currenturi))
         else:
             page['parent_path'] = [thread_uri]
             page['path'] = [thread_uri, unique_key]
             page['uri'] = self.__baseuri + 'showpost.php?p=' + unique_key
             page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
             page.update(self.__task_elements_dict)
             self.pages.append(page)
     except:
         log.exception(self.log_msg('Cannot add the post for the uri %s'%self.currenturi))
     return True
    def __getParentPage(self):
        """
        This will get the parent info

        Builds the thread-level page, merges metadata captured on the
        task, and appends it to self.pages. Returns True on success,
        False when it already exists or could not be added.
        """
        page = {}
        try:
            self.hierarchy = page['et_thread_hierarchy'] = [stripHtml(x.renderContents()) for x in self.soup.find('div','CommonBreadCrumbArea').findAll('a')][1:]
            page['title']= page['et_thread_hierarchy'][-1]
        except:
            log.info(self.log_msg('Thread hierarchy is not found'))
            page['title']=''
        try:
            self.thread_id = page['et_thread_id'] = unicode(self.currenturi.split('/')[-1].replace('.aspx',''))
        except:
            log.info(self.log_msg('Thread id not found'))
        if checkSessionInfo(self.genre, self.session_info_out, self.parent_uri,\
                                         self.task.instance_data.get('update')):
            log.info(self.log_msg('Session info return True, Already exists'))
            return False

        # Copy thread metadata recorded by the listing phase, if present.
        for each in ['et_thread_last_post_author','ei_thread_replies_count','edate_last_post_date']:
            try:
                page[each] = self.task.pagedata[each]
            except:
                log.info(self.log_msg('page data cannot be extracted for %s'%each))
        try:
            post_hash = get_hash( page )
            # Only seed the session with the task id when it is new.
            # Renamed from `id` to stop shadowing the builtin.
            session_id = None
            if self.session_info_out=={}:
                session_id = self.task.id
            result=updateSessionInfo( self.genre, self.session_info_out, self.\
                   parent_uri, post_hash,'Forum',self.task.instance_data.get('update'), Id=session_id)
            if not result['updated']:
                return False
            page['path']=[self.parent_uri]
            page['parent_path']=[]
            page['uri'] = normalize( self.currenturi )
            page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
            page['priority']=self.task.priority
            page['level']=self.task.level
            # One clock read keeps pickup_date and posted_date identical
            # (original called utcnow() twice).
            now = datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
            page['pickup_date'] = now
            page['posted_date'] = now
            page['connector_instance_log_id'] = self.task.connector_instance_log_id
            page['connector_instance_id'] = self.task.connector_instance_id
            page['workspace_id'] = self.task.workspace_id
            page['client_id'] = self.task.client_id
            page['client_name'] = self.task.client_name
            page['last_updated_time'] = page['pickup_date']
            page['versioned'] = False
            page['data'] = ''
            page['task_log_id']=self.task.id
            page['entity'] = 'Post'
            page['category']=self.task.instance_data.get('category','')
            self.pages.append(page)
            log.info(page)
            log.info(self.log_msg('Parent Page added'))
            return True
        except :
            log.exception(self.log_msg("parent post couldn't be parsed"))
            return False
Example #26
0
 def __addPosts(self, post):
     '''It will add the post
     '''
     try:
         unique_key = post['id'].split('_')[-1]
         if checkSessionInfo('review', self.session_info_out, unique_key,
                         self.task.instance_data.get('update')):
             log.info(self.log_msg('Session info returns True for uri %s'
                                                      % self.currenturi))
             return False
         page = self.__getData(post)
         if not page:
             return True
         result = updateSessionInfo('review', self.session_info_out,
                 unique_key, get_hash(page), 'review',
                 self.task.instance_data.get('update'))
         if not result['updated']:
             log.info(self.log_msg('Update session info returns False for \
                                             url %s'%self.currenturi))
         else:
             page['path'] = [self.currenturi, unique_key]
             page['parent_path'] = []
             if not page.get('uri'):
                 page['uri'] = self.currenturi + '#' + unique_key
             page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
             page['entity'] = 'review'
             page.update(self.__task_elements_dict)
             self.pages.append(page)
             log.info(self.log_msg('Page added'))
     except:
         log.exception(self.log_msg('Cannot add the post for the uri %s'
                                                      % self.currenturi))
     return True
 def __addPost(self, post, is_question=False):
     """Append one forum post to self.pages if not already captured."""
     try:
         unique_key = post.find('a')['name'].replace('Post','')
         log.debug(self.log_msg('POST: ' + str(unique_key)))
         thread_uri = self.task.instance_data['uri']
         if checkSessionInfo('review', self.session_info_out, unique_key,
                             self.task.instance_data.get('update'),
                             parent_list=[thread_uri]):
             log.info(self.log_msg('Session info returns True for uri %s'
                                                             % unique_key))
             return False
         page = self.__getData(post, is_question)
         if not page:
             return True
         result = updateSessionInfo('review', self.session_info_out,
                 unique_key, get_hash(page), 'forum',
                 self.task.instance_data.get('update'),
                 parent_list=[thread_uri])
         if not result['updated']:
             log.info(self.log_msg('Update session info returns False for \
                                                 url %s'%self.currenturi))
         else:
             page['path'] = [thread_uri, unique_key]
             page['parent_path'] = [thread_uri]
             page['uri'] = self.currenturi + '#' + unique_key
             page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
             page.update(self.__task_elements_dict)
             self.pages.append(page)
             log.info(self.log_msg('Page added'))
     except:
         log.exception(self.log_msg('Cannot add the post for the uri %s'
                                                      % self.currenturi))
     return True
 def __getParentPage(self):
     '''
         This will get the Parent Page info

         Collects the thread hierarchy, title and task metadata for the
         parent page and appends it to self.pages when new. Returns True
         on success, otherwise False.
     '''
     page = {}
     try:
         self.hierarchy = page['et_thread_hierarchy'] = [stripHtml(x.renderContents()) for x in self.soup.find('div',{'class':'rd Microsoft_Msn_Boards_Read_List Web_Bindings_Base'}).findAll('li')]
     except:
         log.info(self.log_msg('Thread hierarchy is not found'))
     try:
         self.forum_title = page['title'] = stripHtml(self.soup.find('h2').renderContents())
     except:
         log.info(self.log_msg('Title Not Found'))
         page['title'] = ''
     if checkSessionInfo(self.genre, self.session_info_out, self.parent_uri, self.task.instance_data.get('update')):
         log.info(self.log_msg('Session info return True'))
         return False
     # Copy thread metadata recorded by the listing phase, if present.
     for each in ['et_author_name','ei_thread_replies_count','ei_thread_view_count','ei_author_count','et_last_post_author','edate_last_post_date','posted_date']:
         try:
             page[each] = self.task.pagedata[each]
         except:
             log.info(self.log_msg('Page data cannot be extracted for %s'%each))
     try:
         page['ei_thread_id'] = int(urlparse.urlparse(self.currenturi)[4].split('&')[0].split('ThreadId=')[1])
     except:
         log.info(self.log_msg('Thread id not found'))
     try:
         post_hash = get_hash(page)
         # Seed with the task id only for a fresh session; renamed from
         # `id` to avoid shadowing the builtin.
         session_id = None
         if self.session_info_out == {}:
             session_id = self.task.id
         result = updateSessionInfo(self.genre, self.session_info_out, self.parent_uri, post_hash, 'Post', self.task.instance_data.get('update'), Id=session_id)
         if not result['updated']:
             return False
         page['path'] = [self.parent_uri]
         page['parent_path'] = []
         page['uri'] = normalize(self.currenturi)
         page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
         page['priority'] = self.task.priority
         page['level'] = self.task.level
         page['pickup_date'] = datetime.strftime(datetime.utcnow(),'%Y-%m-%dT%H:%M:%SZ')
         page['connector_instance_log_id'] = self.task.connector_instance_log_id
         page['connector_instance_id'] = self.task.connector_instance_id
         page['workspace_id'] = self.task.workspace_id
         page['client_id'] = self.task.client_id
         page['client_name'] = self.task.client_name
         page['last_updated_time'] = page['pickup_date']
         page['versioned'] = False
         page['data'] = ''
         page['task_log_id']=self.task.id
         page['entity'] = 'Post'
         page['category']=self.task.instance_data.get('category','')
         self.pages.append(page)
         log.info(page)
         log.info(self.log_msg('Parent Page added'))
         return True
     except :
         log.exception(self.log_msg("parent post couldn't be parsed"))
         return False
 def __addQuestionInfo(self):
     """
     This will get Question Info

     Parses the question block (topic text, posted date, category,
     author, rating, replies count) and appends it to self.pages when
     the session info reports it as new. Returns True unless the
     question block or its topic text is missing.
     """
     question = self.soup.find('div', 'question')
     if not question:
         log.info(self.log_msg('No Question Info Found'))
         return False
     page = {'uri':self.currenturi}
     try:            
         self.__thread_topic = page['data'] = page['title'] = stripHtml(question.find('dd', id='topic_text').renderContents())
     except:
         log.info(self.log_msg('No Question Data Found'))
         return False
     try:
         created_text = stripHtml(question.find('cite', id='created').renderContents())
         match_object = re.search('Posted (?P<date_str>.+?) in (?P<et_thread_category>.+?) by (?P<et_author_name>.+?$)', created_text)
         page.update(match_object.groupdict())
     except:
         log.info(self.log_msg('Not enough information'))
     try:
         date_str = page.pop('date_str')
         # Strip any parenthesised suffix (e.g. a timezone note) first.
         page['posted_date'] = self.__getDate(re.sub('\(.+?\)', '', date_str))
     except:
         log.info(self.log_msg('posted_date not found in url %s'%self.currenturi))
         page['posted_date'] = datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")        
     try:
         rating_str = stripHtml(self.soup.find('span', id='overall_rating_score').renderContents())
         # '--' is the site's placeholder for "not yet rated".
         if not rating_str == '--':
             page['ef_thread_rating'] = float(rating_str)
     except:
         log.info(self.log_msg('Rating not found'))
     try:
         page['ei_replies_count'] = int(stripHtml(self.soup.find('span', id='answer_count').renderContents()).split(' of ')[1].replace(',', ''))
     except:
         log.info(self.log_msg('Replies count not found'))
     try:
         if checkSessionInfo(self.__genre, self.session_info_out, self.task.instance_data['uri'],    \
                                      self.task.instance_data.get('update')):
             log.info(self.log_msg('Session info return True, Already exists'))
             return True
         result = updateSessionInfo('review', self.session_info_out, self.\
             task.instance_data['uri'], get_hash( page ), 'forum', self.task.instance_data.get('update'))
         if result['updated']:
             page['path'] = [ self.task.instance_data['uri'] ] 
             page['parent_path'] = []
             page['uri'] = self.currenturi
             page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
             page['entity'] = 'question'
             page.update(self.__task_elements_dict)
             self.pages.append(page)
         else:
             # BUG FIX: original format string had no %s placeholder, so
             # the '%' operation raised TypeError (swallowed below).
             log.info(self.log_msg('Result[updated] returned False for '
                                   'uri %s' % self.currenturi))
     except:
         log.exception(self.log_msg("parent post couldn't be parsed"))
     return True
 def __addPost(self, review_page_link, post_id): 
     """
     This will take the post tag , and fetch data and meta data and add it to 
     self.pages

     review_page_link : uri of the review page to fetch
     post_id          : identifier of the post to extract on that page
     Returns True when the post was added (or had no data), False when
     the page could not be fetched or was already captured.
     """
     try:
         self.currenturi = review_page_link
         self.__setSoupForCurrentUri()
         page = self.__getData(post_id)
         if not page:
             return True 
         # The post body itself is the stable identity for this source.
         unique_key = get_hash({'data' : page['data']})
         log.info(unique_key)
         if checkSessionInfo(self.genre, self.session_info_out, unique_key,\
                      self.task.instance_data.get('update'),parent_list\
                                         = [self.task.instance_data['uri']]):
             log.info(self.log_msg('Session info returns True'))
             return False
     except:
         log.exception(self.log_msg('Cannot add the post for the url %s'%\
                                                         self.currenturi))
         return False
     # BUG FIX: this assignment was wrapped in a pointless try/except whose
     # handler performed the identical assignment; collapsed to one line.
     page['uri'] = self.currenturi
     try:
         result=updateSessionInfo(self.genre, self.session_info_out, unique_key, \
             get_hash( page ),'Review', self.task.instance_data.get('update'),\
                             parent_list=[self.task.instance_data['uri']])
         if not result['updated']:
             # log.info, not log.exception: there is no active exception here.
             log.info(self.log_msg('Update session info returns False'))
             return True
         page['parent_path'] = []
         page['path'] = [self.task.instance_data['uri'], unique_key]
         page['uri_domain']  = urlparse.urlparse(page['uri'])[1]
         page.update(self.__task_elements_dict)
         self.pages.append(page)
         log.info(page)
         log.info(self.log_msg('Post Added'))
         return True
     except:
         log.exception(self.log_msg('Error while adding session info'))
     return True  
 def __setParentPage(self):
     """Parse the thread header (hierarchy, title, posted date) and add
     the thread-level page to self.pages when new or changed.
     """
     page = {}
     try: 
         hierarchies = [each for each in [stripHtml(x.renderContents()) for\
                                 x in self.soup.find('table','tborder').\
                                             table.findAll('td')] if each]
         self.hierarchy = [x.strip() for x in hierarchies[0].split('>')]
         page['data'] = page['title'] = hierarchies[1]
         self.hierarchy.append(page['title'])
         page['et_thread_hierarchy'] = self.hierarchy
     except:
         log.info(self.log_msg('Thread hierarchy is not found'))
         return False    
     if checkSessionInfo(self.genre, self.session_info_out, self.task.instance_data['uri'],\
                                      self.task.instance_data.get('update')):
         log.info(self.log_msg('Session info return True, Already exists'))
         return False
     # Merge the listing-phase metadata that was stashed on the task.
     page_data_keys = ['et_author_name', 'ei_thread_replies_count', \
                         'ei_thread_views_count','edate_last_post_date',\
                                 'et_last_post_author']
     [page.update({each:self.task.pagedata.get(each)}) for each in \
                             page_data_keys if self.task.pagedata.get(each)] 
     try:
         date_str = stripHtml(self.soup.find('div', id='posts').find('td', \
                                                 'thead').renderContents())
         # Expand relative day names before parsing the timestamp.
         if date_str.startswith('Today'):
             date_str = date_str.replace('Today', datetime.strftime\
                                             (datetime.utcnow(),'%m-%d-%Y'))            
         elif date_str.startswith('Yesterday'):
             date_str = date_str.replace('Yesterday', datetime.\
                             strftime((datetime.utcnow()-timedelta\
                                  (days=1)),'%m-%d-%Y'))
         page['posted_date'] = datetime.strftime(datetime.strptime(date_str,\
                                 '%m-%d-%Y, %I:%M %p'),"%Y-%m-%dT%H:%M:%SZ")   
     except:
         log.exception(self.log_msg('Posted date not found'))
         page['posted_date'] = datetime.strftime(datetime.utcnow(), \
                                                     "%Y-%m-%dT%H:%M:%SZ")                            
     try:
         result = updateSessionInfo('review', self.session_info_out, self.\
             task.instance_data['uri'], get_hash(page), 'forum', \
                                 self.task.instance_data.get('update'))
         if result['updated']:
             page['path'] = [self.task.instance_data['uri']] 
             page['parent_path'] = []
             page['uri'] = self.currenturi
             page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])                
             page['entity'] = 'thread'
             page.update(self.__task_elements_dict)
             self.pages.append(page)
         else:
             # BUG FIX: original format string had no %s placeholder, so
             # the '%' operation raised TypeError (caught below).
             log.info(self.log_msg('Result[updated] returned False for '
                                   'uri %s' % self.currenturi))
     except :
         log.exception(self.log_msg("parent post couldn't be parsed"))
 def __addComments(self,parent_list):
     """Fetch the comments page for the last uri in *parent_list* and
     append each not-yet-seen comment to self.pages.

     parent_list -- list of uris; its last element is joined with
     self.base_url to form the comments page uri, and the list itself is
     passed through to the session-info bookkeeping as the parent chain.
     """
     try:
         self.currenturi = self.base_url + parent_list[-1]
         self.rawpage = urlopen(self.currenturi).read()
         self._setCurrentPage()
     except:
         # Best effort: fall through and parse whatever soup is loaded.
         log.info(self.log_msg('comment not found'))
     comments = self.soup.findAll('div','commentWrapper')
     for comment in comments:
         page = {}
         # Guard: keep the later data-extraction try from hitting a
         # NameError when the panel lookup below fails early.
         comment_panel = None
         try:
             page['et_author_name'] = stripHtml(comment.find('a',id=re.compile(r'.*UserProfileLink$')).renderContents())
         except:
             log.info(self.log_msg('author name not found'))
         try:
             comment_panel = comment.find('div',id=re.compile(r'.*CommentUpdatePanel$'))
             no_of_days = int(re.search(r'commented (\d+) days ago',stripHtml(comment_panel.renderContents())).group(1))
             # utcnow() for consistency with every other timestamp in
             # this module (the original used local-time now() here).
             page['posted_date'] = datetime.strftime(datetime.utcnow() - timedelta(days=no_of_days),"%Y-%m-%dT%H:%M:%SZ")
         except:
             log.info(self.log_msg('posted date not found'))
             page['posted_date'] = datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
         try:
             page['data'] =  stripHtml(comment_panel.find('div','bottom').renderContents().replace('/>>', '/>'))
         except:
             log.info(self.log_msg('data not found'))
             page['data']=''
         # Title is the comment body truncated to 50 characters.
         try:
             if len(page['data']) > 50:
                 page['title'] = page['data'][:50] + '...'
             else:
                 page['title'] = page['data']
         except:
             log.info(self.log_msg('Title not found'))
             page['title'] = ''
         try:
             unique_key = get_hash( {'data':page['data'],'title':page['title']})
             if checkSessionInfo(self.genre, self.session_info_out,
                                         unique_key, self.task.instance_data.get('update'),
                                         parent_list=parent_list):
                 log.info(self.log_msg('session info returns true for comments'))
                 continue
             result=updateSessionInfo(self.genre, self.session_info_out,unique_key , get_hash(page),
                                              'Comment', self.task.instance_data.get('update'),
                                              parent_list=parent_list)
             if result['updated']:
                 temp_parent_list = parent_list[:]
                 page['parent_path'] = temp_parent_list[:]
                 temp_parent_list.append(unique_key)
                 page['path'] = temp_parent_list
                 page['uri'] = self.currenturi
                 page['uri_domain'] = urlparse.urlparse(self.currenturi)[1]
                 page['entity'] = 'Comment'
                 page.update(self.__task_elements_dict)
                 self.pages.append(page)
                 log.info(self.log_msg('Comment added'))
         except:
             log.info(self.log_msg('Comment not added'))
Example #33
0
 def getHash(self):
     """Return a hash for this file.

     Every registered file handler that exposes a getHash() gets a
     chance to supply the hash; the first truthy result wins.  A
     handler that raises is logged and skipped.  When no handler
     produces a hash, fall back to hashing the retrieved file content.
     """
     for handler in filehandlers:
         if not hasattr(handler, "getHash"):
             continue
         try:
             handler_hash = handler.getHash(self)
         except:
             logException("file handler getHash() failed")
             continue
         if handler_hash:
             return handler_hash
     return get_hash(self.retrieveFile())
Example #34
0
 def check_password(self, password: str) -> bool:
     """Hash *password* and compare it in constant time against the
     stored password hash; True when they match."""
     candidate_hash = get_hash(password)
     return safe_str_cmp(self.password, candidate_hash)
Example #35
0
 def set_password(self, password: str):
     """Hash *password* and store the digest as this account's password."""
     hashed = get_hash(password)
     self.password = hashed
Example #36
0
 def hash(self):
     """Return the hash derived from this object's path."""
     path_hash = get_hash(self.path)
     return path_hash