Example #1
import csv


def make_index(index, url, content_soup):
    """Strip <script>/<style> tags, then index the remaining visible words."""
    try:
        # Remove every <script> and <style> element so their contents
        # do not pollute the extracted text.
        for script in content_soup.find_all('script'):
            script.decompose()
        for style in content_soup.find_all('style'):
            style.decompose()
        content = content_soup.body.get_text()
    except AttributeError:
        # The page has no <body> (or could not be parsed); nothing to index.
        return

    words = content.split()

    stopwords = ['']
    unwanted_punctuations = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

    try:
        with open('google.csv', newline='') as sw:  # stopword list from Google
            for stopword in csv.reader(sw):
                stopwords.append(''.join(stopword))
    except OSError:
        pass  # no stopword file available; index every word

    for word in words:
        word = word.strip(unwanted_punctuations).lower()
        if word not in stopwords:
            add_to_search_index(index, word, url)


# make_index({}, 'http://www.google.com', soup(''))
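
The example assumes an `add_to_search_index` helper defined elsewhere in the project. A minimal sketch of a compatible implementation, treating `index` as a plain dict mapping each word to the list of URLs containing it (the signature comes from the call site above; the body is an assumption, not the project's code):

def add_to_search_index(index, word, url):
    # Hypothetical inverted-index helper: the example only shows the
    # call, so this body is an assumption.
    if word not in index:
        index[word] = []
    if url not in index[word]:
        index[word].append(url)

With that in place, a quick run might look like:

from bs4 import BeautifulSoup

index = {}
html = '<html><body><p>Hello search world</p></body></html>'
make_index(index, 'http://example.com', BeautifulSoup(html, 'lxml'))
# index -> {'hello': ['http://example.com'], 'search': [...], 'world': [...]}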
Example #2
import re
import sys
import traceback
from datetime import datetime
from hashlib import sha224

from bs4 import BeautifulSoup as soup


def make_index(url, content_soup):
    '''
    (str, BeautifulSoup) -> None

    Preprocesses page content to obtain only keywords. Each keyword is
    inserted into the index together with its location and frequency in
    the document; those values are later used to compute relevance scores.

    Relies on helpers defined elsewhere: visible, stopword, stem,
    add_to_search_index, and the MongoDB handle `db`.
    '''
    if content_soup == soup('', 'lxml'):
        print('Cannot be indexed: content empty.')
        return

    # Fingerprint the document body so duplicate pages can be detected.
    docdigest = sha224(content_soup.body.encode('utf-8')).hexdigest()

    # Re-parse the extracted text and keep only the text nodes.
    texts = soup(content_soup.get_text(), 'lxml').find_all(string=True)

    # Collect keywords from <meta content="..."> tags.
    meta_info = []
    for tag in content_soup.findAll('meta'):
        try:
            if tag['content']:
                meta_info.extend(tag['content'].split())
        except KeyError:
            pass

    try:
        if content_soup.title is None:
            title = url
        else:
            title = content_soup.title.string

        # `visible` filters out text that is not rendered on the page.
        content = ''.join(visible(elem) for elem in texts)

        if meta_info:
            page_body = ' '.join(meta_info).lower()
        else:
            # Fall back to the first 50 words of the visible text.
            page_body = ' '.join(content.split()[:50]).lower()
    except Exception:
        date = datetime.today()
        db.crawler_error_log.insert({'error_type': str(sys.exc_info()),
                                     'date': date,
                                     'from_module': str(__file__)})
        print('problem with obtaining index from the make_index module .......')
        traceback.print_exc()
        return

    splitter = re.compile(r'\W+')
    words = [s.lower() for s in splitter.split(content) if s != '']

    # `stopword` loads the stopword list (the original also carried a
    # commented-out loader for a 'stopwords.csv' file here).
    stopwords = stopword([])
    unwanted_punctuations = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

    def isStringLike(word):
        '''
        str -> bool

        Duck-typing check: if it walks like a string and quacks like a
        string, it is string-like enough for our purpose.
        '''
        try:
            word + ''
        except TypeError:
            return False
        else:
            return True

    for i in range(len(words)):
        word = words[i]
        if (word not in stopwords and not word.isdigit()
                and word not in unwanted_punctuations and word.isalpha()):
            if isStringLike(word):
                location, count = i, words.count(word)
                word = word.strip(unwanted_punctuations)
                # Every word is stemmed before it is indexed, together
                # with its url, title, location, frequency, and digest.
                add_to_search_index(stem(word), url, title, page_body,
                                    location, count, docdigest)
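
The example above leans on several helpers defined elsewhere in the project: `visible`, `stopword`, `stem`, `add_to_search_index`, and a `db` handle for the crawler's error log. Sketches of what `visible`, `stopword`, and `stem` might look like, inferred from the call sites only (all three bodies are assumptions, not the project's code):

import csv

from bs4 import Comment


def visible(elem):
    # Assumed filter: keep a text node only if it would actually be
    # rendered on the page (not script/style/head content or a comment).
    if elem.parent.name in ('style', 'script', 'head', 'title', 'meta'):
        return ''
    if isinstance(elem, Comment):
        return ''
    return str(elem)


def stopword(stopwords):
    # Assumed loader, mirroring the stopwords.csv file name that appears
    # in the example: extend the given list from a CSV of stopwords.
    try:
        with open('stopwords.csv', newline='') as sw:
            for row in csv.reader(sw):
                stopwords.append(''.join(row))
    except OSError:
        pass
    return stopwords


def stem(word):
    # Assumed stemmer: a trivial suffix stripper as a stand-in. A real
    # crawler would more likely use a proper stemmer such as nltk's
    # PorterStemmer.
    for suffix in ('ing', 'ed', 'es', 's'):
        if word.endswith(suffix) and len(word) > len(suffix) + 2:
            return word[:-len(suffix)]
    return word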