예제 #1
0
    def get_description(self):
        '''
        Fill in the ``description`` column for every row of the ``doc`` table.

        For each ``(id, url)`` row the URL is mapped to a local path with
        ``Corpus().url_to_dir(url)``, the file at that path is parsed as HTML
        and a summary is chosen in this order of preference:

        1. the ``content`` of a ``<meta name="description">`` tag;
        2. the ``content`` of a ``<meta name="keywords">`` tag;
        3. the text of the first ``h1``-``h5``, ``strong``, ``title`` or ``b``
           tag in the document, if that text is shorter than 200 characters;
        4. the empty string.

        Runs of whitespace in the chosen text are collapsed to single spaces.
        Each row is updated and committed individually, and the chosen text
        and the cursor's row count are printed to stdout.

        Raises:
            OSError: if a document's file cannot be opened. Rows committed
                before the failure stay updated.
        '''
        self.mycursor.execute("select id,url from doc")
        rows = self.mycursor.fetchall()
        for doc_id, url in rows:
            path = Corpus().url_to_dir(url)
            # Only the read needs the file handle; release it before the
            # parsing and database work below.
            with open(path, "rb") as file:
                content = file.read()
            soup = BeautifulSoup(content, "lxml")

            # Track the two meta kinds separately so that a keywords tag
            # appearing after the description tag cannot replace it.
            description = ''
            keywords = ''
            for meta in soup.find_all("meta"):
                attrs = meta.attrs
                if 'content' not in attrs or 'name' not in attrs:
                    continue
                # Compare case-insensitively so that e.g. name="Description"
                # is also recognised.
                meta_name = attrs['name'].lower()
                if meta_name == 'description':
                    description = " ".join(attrs['content'].split())
                elif meta_name == 'keywords':
                    keywords = " ".join(attrs['content'].split())
            result = description or keywords

            # No usable meta tag: fall back to the first heading-like element,
            # but only if it is short enough to serve as a summary.
            if not result:
                heading = soup.find(
                    ["h1", "h2", "h3", "h4", "h5", "strong", "title", "b"])
                if heading is not None:
                    text = " ".join(heading.text.split())
                    if len(text) < 200:
                        result = text

            print(result)
            i_sql = "update doc set description =%s where id = %s"
            i_val = (result, doc_id)
            self.mycursor.execute(i_sql, i_val)
            self.mydb.commit()
            print(self.mycursor.rowcount,
                  "was inserted in DOC , DOC ID IS " + str(doc_id))