def get_description(self):
    '''
    Populate the ``description`` column for every row of the ``doc`` table.

    Each ``(id, url)`` row is processed as follows:

    1. ``Corpus().url_to_dir(url)`` yields the path of a file, which is read
       as bytes and parsed with BeautifulSoup using the ``lxml`` parser.
    2. Every ``<meta>`` tag carrying both a ``name`` and a ``content``
       attribute is inspected; when ``name`` is ``description`` or
       ``keywords`` the whitespace-normalised ``content`` becomes the
       candidate text.  Later matches overwrite earlier ones, so the last
       matching tag wins.
    3. If the candidate is still empty, the first tag among ``h1``-``h5``,
       ``strong``, ``title`` and ``b`` is looked up; its whitespace-normalised
       text is used only when it is shorter than 200 characters.
    4. The chosen text (possibly empty) is printed, written to
       ``doc.description`` for that id, and committed immediately.

    Returns nothing; all results go to the database and to stdout.
    '''
    fallback_tags = ["h1", "h2", "h3", "h4", "h5", "strong", "title", "b"]
    update_sql = "update doc set description =%s where id = %s"

    self.mycursor.execute("select id,url from doc")
    rows = self.mycursor.fetchall()

    for doc_id, url in rows:
        # A fresh Corpus is built for every row, exactly as before.
        path = Corpus().url_to_dir(url)
        with open(path, "rb") as handle:
            raw_html = handle.read()
        soup = BeautifulSoup(raw_html, "lxml")

        # Pass 1: meta description / keywords (last match wins).
        description = ''
        for tag in soup.find_all("meta"):
            attrs = tag.attrs
            if 'content' not in attrs or 'name' not in attrs:
                continue
            if attrs['name'] in ('description', 'keywords'):
                description = " ".join(attrs['content'].split())

        # Pass 2: no usable meta text, so fall back to the first
        # heading-like element, but only if its text is reasonably short.
        if not description:
            heading = soup.find(fallback_tags)
            if heading:
                heading_text = " ".join(heading.text.split())
                if len(heading_text) < 200:
                    description = heading_text

        print(description)
        self.mycursor.execute(update_sql, (description, doc_id))
        self.mydb.commit()
        print(self.mycursor.rowcount, "was inserted in DOC , DOC ID IS " + str(doc_id))