def create_json_node(self, page):
    if getattr(page, 'status', 'published') != 'published':
        return

    soup_title = BeautifulSoup(page.title.replace('&nbsp;', ' '), 'html.parser')
    page_title = soup_title.get_text(' ', strip=True).replace('“', '"').replace('”', '"').replace('’', "'").replace('^', '&#94;')

    soup_text = BeautifulSoup(page.content, 'html.parser')
    page_text = soup_text.get_text(' ', strip=True).replace('“', '"').replace('”', '"').replace('’', "'").replace('¶', ' ').replace('^', '&#94;')
    page_text = ' '.join(page_text.split())

    page_category = page.category.name if getattr(page, 'category', 'None') != 'None' else ''

    page_url = '.'
    if page.url:
        page_url = page.url if self.relative_urls else (self.siteurl + '/' + page.url)

    # 'loc' added (changed from 'url') following
    # http://blog.siphos.be/2015/08/updates-on-my-pelican-adventure/ -- an update to
    # Pelican broke search because the theme's static/tipuesearch/tipuesearch.js
    # looks for the 'loc' attribute.
    node = {'title': page_title, 'text': page_text, 'tags': page_category, 'url': page_url, 'loc': page_url}

    self.json_nodes.append(node)

def crawl_web(self, time):  # returns index, graph of inlinks
    print 'Starting crawl'
    t = clock()  # initial time
    while self.tocrawl and clock() - t < time:  # loop while tocrawl is non-empty and elapsed time < time limit
        url = self.tocrawl.pop(0)  # take first page from tocrawl
        if url not in self.crawled:  # check that page has not been crawled yet
            self.current_page = url
            html = self.get_text(url)  # get contents of page
            if html != '':
                try:
                    soup = BeautifulSoup(html, 'lxml')  # parse with lxml (faster html parser)
                except:  # parse with html5lib if lxml fails (more forgiving)
                    soup = BeautifulSoup(html, 'html5lib')
                try:
                    text = str(soup.get_text()).lower()  # convert from unicode
                except:
                    text = soup.get_text().lower()  # keep as unicode
                # try:
                #     title = soup.title.string
                # except:
                #     pass  # do nothing
                outlinks = self.get_all_links(soup)  # get links on page
                self.pages[url] = (tuple(outlinks), text)  # create new page object
                self.add_page_to_index(url)  # add page to index
                self.union(self.tocrawl, outlinks)  # add links on page to tocrawl
            self.crawled.append(url)  # add the url to crawled
    print 'Crawl finished'

def create_a_beautiful_soup_object(html):
    """
    Try to create a BeautifulSoup object whose body is not empty.
    If the body is empty, retry with a different HTML parser.

    Args:
        html (string): HTML string of the email body to be sent.

    Returns:
        soup (BeautifulSoup object or None)
    """
    if not html:
        return None

    soup = BeautifulSoup(html, 'lxml', from_encoding='utf-8')
    if soup.get_text() == '':
        soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
    if soup.get_text() == '':
        soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    if soup.get_text() == '':
        soup = BeautifulSoup(html, 'xml', from_encoding='utf-8')
    if soup.get_text() == '':
        soup = None

    return soup

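# A minimal usage sketch for the helper above -- not part of the original snippet;
# the sample markup and the printed result are illustrative assumptions.
from bs4 import BeautifulSoup

soup = create_a_beautiful_soup_object('<p>Hello, <b>world</b></p>')
if soup is not None:
    print(soup.get_text())  # expected: "Hello, world"
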
def create_json_node(self, page):
    if getattr(page, 'status', 'published') != 'published':
        return

    soup_title = BeautifulSoup(page.title.replace('&nbsp;', ' '), "html.parser")
    page_title = soup_title.get_text(' ', strip=True).replace('“', '"').replace('”', '"').replace('’', "'").replace('^', '&#94;')

    soup_text = BeautifulSoup(page.content, "html.parser")
    page_text = soup_text.get_text(' ', strip=True).replace('“', '"').replace('”', '"').replace('’', "'").replace('¶', ' ').replace('^', '&#94;')
    page_text = ' '.join(page_text.split())

    if not hasattr(page, 'tags'):
        page_tags = ''
    else:
        page_tags = " ".join([tag.name for tag in page.tags])

    page_url = self.siteurl + '/' + page.url

    node = {'title': page_title, 'text': page_text, 'tags': page_tags, 'url': page_url}

    self.json_nodes.append(node)

def get_text(l1, l2):
    soup1 = BeautifulSoup(l1)

    # kill all script and style elements
    for script in soup1(["script", "style"]):
        script.extract()  # rip it out

    # get text
    text1 = soup1.get_text()

    # break into lines and remove leading and trailing space on each
    lines1 = (line.strip() for line in text1.splitlines())
    # break multi-headlines into a line each
    chunks1 = (phrase.strip() for line in lines1 for phrase in line.split("  "))
    # drop blank lines
    text1 = '\n'.join(chunk for chunk in chunks1 if chunk)
    # print(text1.encode('utf-8'))

    soup2 = BeautifulSoup(l2)

    # kill all script and style elements
    for script in soup2(["script", "style"]):
        script.extract()  # rip it out

    # get text
    text2 = soup2.get_text()

    # break into lines and remove leading and trailing space on each
    lines2 = (line.strip() for line in text2.splitlines())
    # break multi-headlines into a line each
    chunks2 = (phrase.strip() for line in lines2 for phrase in line.split("  "))
    # drop blank lines
    text2 = '\n'.join(chunk for chunk in chunks2 if chunk)
    # print(text2.encode('utf-8'))

    return text1 == text2

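# A minimal usage sketch, not part of the original snippet: the two markup strings
# are invented and differ only in their tags, so the text comparison should be True.
from bs4 import BeautifulSoup

print(get_text('<p>same text</p>', '<div>same text</div>'))  # expected: True
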
def dnsquery(dn):
    url = "https://jiexifenxi.51240.com/web_system/51240_com_www/system/file/jiexifenxi/get/?ajaxtimestamp=1526175925753"
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16'}

    params = {'q': '{}'.format(dn), 'type': 'a'}
    reqst = requests.post(url=url, headers=headers, params=params)
    content = reqst.content.decode('utf-8')
    bd = BeautifulSoup(content, 'html.parser')
    print('---[+]A record---')
    print(bd.get_text())

    print('---[+]MX record---')
    params2 = {'q': '{}'.format(dn), 'type': 'mx'}
    rest = requests.post(url=url, headers=headers, params=params2)
    content2 = BeautifulSoup(rest.content.decode('utf-8'), 'html.parser')
    print(content2.get_text())

    print('---[+]CNAME record---')
    params3 = {'q': '{}'.format(dn), 'type': 'cname'}
    rest2 = requests.post(url=url, headers=headers, params=params3)
    content3 = BeautifulSoup(rest2.content.decode('utf-8'), 'html.parser')
    print(content3.get_text())

    print('---[+]NS record---')
    params4 = {'q': '{}'.format(dn), 'type': 'ns'}
    rest3 = requests.post(url=url, headers=headers, params=params4)
    content4 = BeautifulSoup(rest3.content.decode('utf-8'), 'html.parser')
    print(content4.get_text())

    print('---[+]TXT record---')
    params5 = {'q': '{}'.format(dn), 'type': 'txt'}
    rest4 = requests.post(url=url, headers=headers, params=params5)
    content5 = BeautifulSoup(rest4.content.decode('utf-8'), 'html.parser')
    print(content5.get_text())

def get_travel_content(url):
    res = build_request(url)
    res_text = res.text
    try:
        soup = BeautifulSoup(res_text, 'lxml').find('div', {'class': 'vc_article'})
        img_list = []
        content = soup.get_text().replace('\r', '').replace('\n', ' ').replace('\xa0', '')
        items = soup.find_all('div', {'class': 'add_pic _j_anchorcnt _j_seqitem'})
        for item in items:
            try:
                img_url = item.find('img').get('data-src')
            except:
                continue
            img_list.append(img_url)
        result = {
            'content': content,
            'images': img_list
        }
        return result
    except:
        soup = BeautifulSoup(res_text, 'lxml').find('div', {'id': 'pnl_contentinfo'})
        img_list = []
        content = soup.get_text().replace('\r', '').replace('\n', ' ').replace('\xa0', '')
        items = soup.find_all('img')
        for item in items:
            try:
                img_url = item.get('data-src')
            except:
                continue
            img_list.append(img_url)
        result = {
            'content': content,
            'images': img_list
        }
        return result

def get_nodes(self):
    from bs4 import BeautifulSoup
    from artgraph.node import Node, NodeTypes
    from artgraph.relationship import AssociatedActRelationship, MembershipRelationship, ArtistGenreRelationship

    relationships = []
    node = self.get_node()
    wikicode = self.get_wikicode(node.get_dbtitle())

    if wikicode:
        for t in wikicode.filter_templates():
            if t.name.matches('Infobox musical artist'):
                db = self.get_artistgraph_connection()
                cursor = db.cursor()

                # Fill in current node info
                if t.has('birth_name'):
                    name_cleaner = BeautifulSoup(str(t.get('birth_name').value))
                    while name_cleaner.ref:
                        name_cleaner.ref.extract()
                    cursor.execute("UPDATE artist SET name = %s WHERE id = %s", (name_cleaner.get_text(), node.get_id()))

                if t.has('image'):
                    image_cleaner = BeautifulSoup(str(t.get('image').value))
                    image = image_cleaner.get_text()
                    cursor.execute("UPDATE artist SET imageLocation = %s WHERE id = %s", (self.resolve_image(image), node.get_id()))

                db.commit()
                db.close()

                if t.has('associated_acts'):
                    associated_acts = t.get('associated_acts')
                    for w in associated_acts.value.filter_wikilinks():
                        relationships.append(AssociatedActRelationship(node, Node(str(w.title), NodeTypes.ARTIST)))

                if t.has('genre'):
                    genres = t.get('genre')
                    for w in genres.value.filter_wikilinks():
                        relationships.append(ArtistGenreRelationship(node, Node(str(w.title), NodeTypes.GENRE)))

                if t.has('current_members'):
                    current_members = t.get('current_members')
                    for w in current_members.value.filter_wikilinks():
                        relationships.append(MembershipRelationship(node, Node(str(w.title), NodeTypes.ARTIST), True))

                if t.has('past_members'):
                    current_members = t.get('past_members')
                    for w in current_members.value.filter_wikilinks():
                        relationships.append(MembershipRelationship(node, Node(str(w.title), NodeTypes.ARTIST), False))

                break

    return relationships

def parse(self, response):
    hxs = HtmlXPathSelector(response)

    con = lite.connect(DOCSET_PATH + '/Contents/Resources/docSet.dsidx')
    with con:
        cur = con.cursor()
        cur.execute("DROP TABLE IF EXISTS searchIndex")
        cur.execute("CREATE TABLE searchIndex(id INTEGER PRIMARY KEY, name TEXT, type TEXT, path TEXT)")

        items = []
        for a in hxs.select("//ol/li/a[@href[contains(., '.html')]]"):
            soup = BeautifulSoup(a.extract())
            path = soup.a['href']
            path = path[len('../'):]
            if path.endswith('-binding.html'):
                type = "Binding"
                try:
                    title = soup.code.get_text()
                except Exception:
                    title = soup.get_text()
            else:
                type = "Guide"
                title = soup.get_text()
            cur.execute("INSERT INTO searchIndex(name, type, path) VALUES(?, ?, ?)", (title, type, path))
            # item = BindingItem()
            # item['title'] = title
            # item['path'] = path
            # items.append(item)

    return items

def create_json_node(self, page):
    if getattr(page, 'status', 'published') != 'published':
        return

    soup_title = BeautifulSoup(page.title.replace('&nbsp;', ' '))
    page_title = soup_title.get_text(' ', strip=True).replace('“', '"').replace('”', '"').replace('’', "'").replace('^', '&#94;')

    soup_text = BeautifulSoup(page.content)
    page_text = soup_text.get_text(' ', strip=True).replace('“', '"').replace('”', '"').replace('’', "'").replace('¶', ' ').replace('^', '&#94;')
    page_text = ' '.join(page_text.split())

    if getattr(page, 'category', 'None') == 'None':
        page_category = ''
    else:
        page_category = page.category.name

    page_url = self.siteurl + '/' + page.url

    node = {'title': page_title, 'text': page_text, 'tags': page_category, 'loc': page_url}

    self.json_nodes.append(node)

def create_json_node(self, page):
    if getattr(page, 'status', 'published') != 'published':
        return

    soup_title = BeautifulSoup(page.title.replace('&nbsp;', ' '), 'html.parser')
    page_title = soup_title.get_text(' ', strip=True).replace('“', '"').replace('”', '"').replace('’', "'").replace('^', '&#94;')

    soup_text = BeautifulSoup(page.content, 'html.parser')
    page_text = soup_text.get_text(' ', strip=True).replace('“', '"').replace('”', '"').replace('’', "'").replace('¶', ' ').replace('^', '&#94;')
    page_text = ' '.join(page_text.split())

    if getattr(page, 'category', 'None') == 'None':
        page_category = ''
    else:
        page_category = page.category.name

    page_url = self.siteurl + os.path.dirname(page.source_path.split("content")[-1]) + "/" + page.url
    # print ">>>", page_url

    node = {'title': page_title, 'text': page_text, 'tags': page_category, 'url': page_url}

    self.json_nodes.append(node)

def parsePost(self, response):
    logging.info(response)
    sel = Selector(response)
    posts = sel.xpath('//*[@id="posts"]/li')
    items = []
    topic = sel.css('.threadtitle').xpath('./a/text()').extract()[0]
    condition = "Carcinoid Cancer"
    url = response.url
    for post in posts:
        item = PostItemsList()
        item['author'] = post.xpath('.//a[contains(@class, "username")]/strong/text()').extract()[0].strip()
        item['author_link'] = response.urljoin(post.xpath('.//a[contains(@class, "username")]/@href').extract()[0])
        date = post.css('.postdate').extract()[0]
        soup = BeautifulSoup(date, 'html.parser')
        date = re.sub(" +|\n|\r|\t|\0|\x0b|\xa0", ' ', soup.get_text()).strip()
        item['condition'] = condition
        item['create_date'] = date
        post_msg = post.css('.postcontent').extract()[0]
        soup = BeautifulSoup(post_msg, 'html.parser')
        post_msg = re.sub(" +|\n|\r|\t|\0|\x0b|\xa0", ' ', soup.get_text()).strip()
        item['post'] = post_msg
        item['tag'] = ''
        item['topic'] = topic
        item['url'] = url
        logging.info(post_msg)
        items.append(item)
    return items

def fetch_status(self):
    submissionId = self.submissionId
    while True:
        r = urlopen(urls.STATUS_URL, data=urlencode(dict(ids=submissionId)))
        data = json.loads(r.read())
        data = data[0]
        final = data['final']
        if final == '1':
            print '\r\x1b[KResult: %s' % data['status_description'].strip()
            print 'Memory: %s' % data['mem'].strip()
            # Fixed the "no markup specified" warning. If no parser is specified,
            # BeautifulSoup uses the best one available on the system, so behaviour
            # can differ between systems.
            soup = BeautifulSoup(data['time'], "lxml")
            time_taken = soup.get_text()
            print 'Time: %s' % time_taken.strip()
            if "accepted" in data['status_description']:
                prob_db = utils.get_problem_database()
                prob_db[self.problem]['solved'] = True
                utils.set_problem_database(prob_db)
            break
        else:
            soup = BeautifulSoup(data['status_description'], "lxml")
            string = soup.get_text().strip()
            string = string.replace('\t', '')
            string = string.replace('\n', '')
            sys.stdout.write('\r\x1b[KStatus: %s' % string)
            sys.stdout.flush()
            time.sleep(0.5)

def get_lyrics(song, band):
    # Constructing the url to fetch lyrics from
    lyrics_url = "http://www.azlyrics.com/lyrics/" + band.replace(" ", "") + "/" + song.replace(" ", "") + ".html"
    # print lyrics_url + "\n"
    try:
        # Open and read the page
        page = urlopen(lyrics_url)
        html = page.read()

        # Find the starting and ending indices for the lyrics
        startindex = html.find("<!-- start of lyrics -->")
        endindex = html.find("<!-- end of lyrics -->")

        # Slicing to get the lyrics
        lyrics = html[startindex:endindex]

        # Soupifying the page for better display
        soup = BeautifulSoup(lyrics)

        print "\nHere is the lyrics for " + song.upper() + " by " + band.upper() + "\n"
        print soup.get_text()
    except:
        # Printing error message
        print "\nSorry " + song.upper() + " by " + band.upper() + " NOT FOUND\n"

def remove_tags(p, p2):
    # print "remove_tags p++++++++++++++++", repr(p), "remove_tags p2###################", repr(p2)
    # p = str(p)
    soup = BeautifulSoup(p2, 'html.parser')
    print "++++++++++++++++ soup removed tags: ", soup.get_text()
    return soup.get_text()

def getajax(url):
    if not pattern.match(url):
        url = 'http://' + url
    try:
        browser.get(url)
        n = browser.page_source
        soup = BeautifulSoup(n)
        n = soup.get_text()
        try:
            n = n.encode('utf-8')
        except:
            d = chardet.detect(n)
            n = n.decode(d['encoding']).encode('utf-8')
    except TimeoutException:
        n = browser.page_source
        soup = BeautifulSoup(n)
        n = soup.get_text()
        try:
            n = n.encode('utf-8')
        except:
            d = chardet.detect(n)
            n = n.decode(d['encoding']).encode('utf-8')
        n = 'TIMEDOUT' + n
    except WebDriverException as error:
        if 'MALFORMED_URI' in error.msg:
            n = 'MALFORMED_URI'
        else:
            raise error
    except Exception, error:
        raise error

def experience(): # Reading the Data train = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", quoting=3) print("train.shape: ", train.shape) print("train.columns.values: ", train.columns.values) print("print(train['sentiment'][0]): ", train["sentiment"][0]) print("\nprint(train['review'][0]): ", train["review"][0]) # Data Cleaning and Text Preprocessing # Initialize the BeautifulSoup object on a single movie review example1 = BeautifulSoup(train["review"][0], "lxml") print("\nBeautifulSoup(train['review'][0]): ", example1.get_text()) # Use regular expressions to do a find-and-replace letters_only = re.sub("[^a-zA-Z]", # The pattern to search for " ", # The pattern to replace it with example1.get_text() ) # The text to search print("\nletters_only: ", letters_only) lower_case = letters_only.lower() # Convert to lower case words = lower_case.split() # Split into words print("found {0} words".format(len(words))) # removed, because size of nltk data (>3.7GB) # import nltk # # nltk.download() # Download text data sets, including stop words # from nltk.corpus import stopwords # Import the stop word list # print("stopwords.words: ", stopwords.words("english")) stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"] # Remove stop words from "words" words = [w for w in words if not w in stopwords] print("\n all words:\n", words)
def store_feed(e):
    """
    Store a single entry from the feedparser.

    :param e: the entry
    :return: the stored key on success, else None
    """
    query = WebResource.query().filter(WebResource.url == e["link"])
    if query.count() == 0:
        print "STORING: " + e["link"]
        try:
            if 'summary' in e:
                s, t = BeautifulSoup(e['summary'], "lxml"), BeautifulSoup(e['title'], "lxml")
                e['summary'], e['title'] = s.get_text(), t.get_text()
            else:
                t = BeautifulSoup(e['title'], "lxml")
                e['summary'], e['title'] = None, t.get_text()
            k = WebResource.store_feed(e)
            print "STORED: " + str(k)
            return k
        except Exception as e:
            print "Cannot Store: " + str(e)
            return None
    else:
        print "Resource already stored"
        return None

def getTaggedBlog(self, tag):
    '''Return the tagged blog's captions or posts.'''
    tagged_uri = "http://api.tumblr.com/v2/tagged?tag=" + tag + "&api_key=" + \
        self.consumer_key + "&limit=20"
    req = requests.get(tagged_uri)
    jsonlist = json.loads(req.content)
    body = jsonlist['response']
    tagtext = []
    for blog in body:
        # print "####"
        for data in blog:
            # a post
            if data == "body":
                if blog[data]:
                    # print blog[data]
                    soup = BeautifulSoup(blog[data])
                    text = soup.get_text()
                    tagtext.append(text)
            # an image
            if data == "caption":
                if blog[data]:
                    # print blog[data]
                    soup = BeautifulSoup(blog[data])
                    text = soup.get_text()
                    tagtext.append(text)
    return tagtext

class GenericUrl:
    def __init__(self, url):
        self.url = url
        req = requests.get(url)
        print req.status_code
        print req.headers
        print req.encoding
        html_doc = req.text
        self.soup = BeautifulSoup(html_doc, 'html.parser')

    def title(self):
        print '*** Title:', self.soup.title

    def hrefs(self):
        print '*** hrefs:'
        for link in self.soup.find_all('a'):
            print '  ', link.get('href')

    def text(self):
        print '*** Text:'
        print self.soup.get_text()  # print page text

    def kill_scripts(self):
        # remove script elements
        [s.extract() for s in self.soup('script')]

def indexpage_off(url):
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, 'html.parser')
    soup.get_text()
    es = Elasticsearch()
    es.index(index="bc", doc_type='webpage',
             body={"timestamp": datetime.now(), "text": soup.get_text(), "url": url})
    return True

def build_data(self, page):
    if getattr(page, 'status', 'published') != 'published':
        return

    soup_title = BeautifulSoup(page.title.replace('&nbsp;', ' '))
    page_title = soup_title.get_text(' ', strip=True).replace('“', '"').replace('”', '"').replace('’', "'").replace('^', '&#94;')

    soup_text = BeautifulSoup(page.content)
    page_text = soup_text.get_text(' ', strip=True).replace('“', '"').replace('”', '"').replace('’', "'").replace('¶', ' ').replace('^', '&#94;')
    page_text = ' '.join(page_text.split())

    if getattr(page, 'category', 'None') == 'None':
        page_category = ''
    else:
        page_category = page.category.name

    page_url = self.siteurl + '/' + page.url
    page_time = getattr(page, 'date', datetime(1970, 1, 1, 1, 0)).strftime('%s')
    # There may be possible collisions, but it's the best I can think of.
    page_index = abs(zlib.crc32(page_time + page_url))

    return {'title': page_title, 'author': page.author, 'tags': page_category,
            'url': page_url, 'content': page_text, 'slug': page.slug,
            'time': page_time, 'index': page_index, 'summary': page.summary}

def create_json_node(self, page):
    if getattr(page, "status", "published") != "published":
        return

    soup_title = BeautifulSoup(page.title.replace("&nbsp;", " "), "html.parser")
    page_title = (
        soup_title.get_text(" ", strip=True)
        .replace("“", '"')
        .replace("”", '"')
        .replace("’", "'")
        .replace("^", "&#94;")
    )
    soup_text = BeautifulSoup(page.content, "html.parser")
    page_text = (
        soup_text.get_text(" ", strip=True)
        .replace("“", '"')
        .replace("”", '"')
        .replace("’", "'")
        .replace("¶", " ")
        .replace("^", "&#94;")
    )
    page_text = " ".join(page_text.split())

    if getattr(page, "category", "None") == "None":
        page_category = ""
    else:
        page_category = page.category.name

    page_url = self.siteurl + "/" + page.url

    node = {"title": page_title, "text": page_text, "tags": page_category, "url": page_url}

    self.json_nodes.append(node)

def example(request):
    url = "https://es.wikipedia.org/wiki/Parten%C3%B3n"
    req = urllib.request.Request(url)
    print("check ")
    response = urllib.request.urlopen(req)
    print("check ")
    the_page = response.read()
    # webread = urlopen('/acilveti92.pythonanywhere.com/hello/')
    print("problem ")

    soup = BeautifulSoup(the_page)
    texts = soup.get_text()
    text = soup.get_text()
    splittext = texts.split()
    simpletext = set(splittext)  # deletes duplicates; shows the reduction in word count
    print("comparation")
    print(len(splittext))
    print(len(simpletext))

    print("extract now")
    for elem in soup.find_all(['script', 'style']):
        elem.extract()

    print("empieza el texto")
    texts = soup.get_text()
    print("acaba el texto")
    splittext = texts.split()

    session_key = request.session._session_key
    # print("the session key is")
    # print(request.session._session_key)
    session = Session.objects.get(session_key=session_key)
    uid = session.get_decoded().get('_auth_user_id')
    click_user = User.objects.get(pk=uid)
    # print("the user is")
    # print(click_user)

    print(len(splittext))
    print("here comes the boom")
    print(splittext[0])

    # A duplicate-avoiding step should be added here to make the replacement faster;
    # first, filter out non-alphanumeric characters.
    simpletext = set(splittext)  # deletes duplicate items
    print("comparation")
    print(len(splittext))
    print(len(simpletext))
    WordNumber = len(simpletext)

    return HttpResponse(text)

def parsePost(self,response): logging.info(response) sel = Selector(response) posts = sel.xpath('//div[contains(@class, "exchange_thread_reply_rdr")]') items = [] if len(sel.xpath('//div[contains(@class, "first_item_title_fmt")]'))==0: return items topic = sel.xpath('//div[contains(@class, "first_item_title_fmt")]/text()').extract()[0] url = response.url condition="hiv" post = sel.xpath('//div[contains(@class, "firstitem_mid_fmt")]') item = PostItemsList() if len(post.css('.post_hdr_fmt').xpath('./a'))>0: item['author'] = post.css('.post_hdr_fmt').xpath("./a").xpath("text()").extract()[0].strip() item['author_link']=response.urljoin(post.css('.post_hdr_fmt').xpath("./a/@href").extract()[0]) else: item['author'] = "" item['author_link']="" date = post.css('.first_posted_fmt').extract()[0] date = date[date.find('DateDelta')+11:date.rfind("'")] item['condition'] = condition item['create_date'] = date post_msg=post.css('.post_fmt').extract()[0] soup = BeautifulSoup(post_msg, 'html.parser') post_msg = re.sub(" +|\n|\r|\t|\0|\x0b|\xa0",' ',soup.get_text()).strip() item['post']=post_msg item['tag']='' item['topic'] = topic item['url']=url logging.info(post_msg) items.append(item) for post in posts: item = PostItemsList() if len(post.css('.post_hdr_fmt'))==0: continue if len(post.css('.post_hdr_fmt').xpath('./a'))>0: item['author'] = post.css('.post_hdr_fmt').xpath("./a").xpath("text()").extract()[0].strip() item['author_link']=response.urljoin(post.css('.post_hdr_fmt').xpath("./a/@href").extract()[0]) else: item['author'] = "" item['author_link']="" date = post.css('.posted_fmt').extract()[0] date = date[date.find('DateDelta')+11:date.rfind("'")] item['condition'] = condition item['create_date'] = date post_msg=post.css('.post_fmt').extract()[0] soup = BeautifulSoup(post_msg, 'html.parser') post_msg = re.sub(" +|\n|\r|\t|\0|\x0b|\xa0",' ',soup.get_text()).strip() item['post']=post_msg item['tag']='hiv' item['topic'] = topic item['url']=url logging.info(post_msg) items.append(item) return items
def scrape_supost(keywords, days_to_check, logfile_path, previous_logfile_contents):
    """Scrapes supost.com to find all posts which contain the given keywords.

    Args:
        keywords (list of str): keywords to search for
        days_to_check (int): number of days back to search
        logfile_path (str): path of log file
        previous_logfile_contents (str): contents of the old log file

    Returns:
        list of str, int: list of new matches, posts searched
    """
    oldest_date = (datetime.date.today() - datetime.timedelta(days=days_to_check))
    oldest_date_str = oldest_date.strftime("%a, %b %d")
    offset = 0
    new_matches = []
    h = httplib2.Http(".cache")
    link = "http://supost.com/search/index/5"
    is_scraping = True
    while is_scraping:
        response, content = h.request(link)
        link_page = BeautifulSoup(content)
        for link in link_page.find_all("a"):
            if ("post/index" in str(link.get("href"))):
                response, content = h.request("http://supost.com" + str(link.get("href")))
                post_page = BeautifulSoup(content)
                for keyword in keywords:
                    if keyword in str(post_page.get_text()).lower():
                        post_title = post_page.find("h2", {"id": "posttitle"}).text
                        output_string = (post_title + ": supost.com" + link.get("href"))
                        if output_string in previous_logfile_contents:
                            return new_matches, offset
                        else:
                            new_matches.append(output_string)
        # stops scraper when oldest date is found
        if (oldest_date_str in str(link_page.get_text())):
            return new_matches, offset
        # makes sure scraper doesn't go too far
        if (offset + OFFSET_INCREASE > (OFFSET_INCREASE * 2 * days_to_check)):
            return new_matches, offset
        offset = offset + OFFSET_INCREASE
        # updates the link with the new offset
        link = "http://supost.com/search/index/5?offset=" + str(offset)
    return new_matches, offset

def edit_distance(self):
    import Levenshtein

    srcSoup = BeautifulSoup(self.source_text(), 'html5lib')
    src_content_text = srcSoup.get_text().replace('\n', '').replace('\r', '').replace(' ', '').replace(' ', '').replace(u' ', '')

    dstSoup = BeautifulSoup(self.finish, 'html5lib')
    dst_content_text = dstSoup.get_text().replace('\n', '').replace('\r', '').replace(' ', '').replace(' ', '').replace(u' ', '')

    return Levenshtein.distance(src_content_text, dst_content_text)

def edit_distance(src, dst, encoding='utf-8'):
    with codecs.open(src, 'r', encoding=encoding) as srcFile:
        src_content = srcFile.read()
        src_content = '<p>' + src_content.replace('\r\n', '</p>\r\n<p>') + '</p>'
        srcSoup = BeautifulSoup(src_content, 'html5lib')
        src_content_text = srcSoup.get_text().replace('\n', '').replace('\r', '').replace(' ', '').replace(' ', '').replace(u' ', '')

    with codecs.open(dst, 'r', encoding=encoding) as dstFile:
        dst_content = dstFile.read()
        dstSoup = BeautifulSoup(dst_content, 'html5lib')
        dst_content_text = dstSoup.get_text().replace('\n', '').replace('\r', '').replace(' ', '').replace(' ', '').replace(u' ', '')

    return Levenshtein.distance(src_content_text, dst_content_text)

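# A minimal usage sketch, not part of the original snippet. The file names are
# hypothetical; each file is read, parsed as HTML, stripped to text, and the
# Levenshtein distance between the two texts is printed. The imports are the ones
# the function above relies on.
import codecs

import Levenshtein
from bs4 import BeautifulSoup

print(edit_distance('draft_old.txt', 'draft_new.txt'))
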
def printClasses():
    class_dict = getClasses()
    for class_id in class_dict:
        print 'Assignments for %s: \n\n' % class_dict[class_id]
        URL = 'https://canvas.instructure.com/api/v1/courses/%d/assignments' % class_id
        assignments = requests.get(URL, headers={'Authorization': 'Bearer %s' % TOKEN}).json()
        for arr in assignments:
            if (arr['description'] != ""):
                soup = BeautifulSoup(arr['description'], 'html.parser')
                dateReg = '(([0-9]{4})-([0-9]{2})-([0-9]{2}).*)'
                date = re.search(dateReg, arr['due_at'])
                print soup.get_text() + '- Due at: ' + '%s-%s-%s' % (date.group(3), date.group(4), date.group(2)) + '\n\n\n'

def get_transcript_soup(slug):
    """Returns the html elements based on given slug."""
    url = 'http://www.ted.com/talks/' + slug + "/transcript?language=en"
    content = urllib2.urlopen(url)
    soup = BeautifulSoup(content, "html.parser")  # CHECK ME!
    soup.prettify()  # turn a BS parse tree into a nicely formatted Unicode string
    soup.get_text()  # gets only the text within elements, can probably be cancelled
    return soup

def _read_source(imap_host, imap_port, imap_user, imap_pass, imap_folder, email_inreply): source = {'alreadyLoaded': False} try: ## Time to search for the original email try: if "gmail" in imap_host: # gmail server requires an ssl connection print("gmail server") imap = IMAP4_SSL(imap_host, imap_port) else: # tls is preferred imap = IMAP4(imap_host, imap_port) imap.starttls() ## login to server #print(imap_user, imap_pass) imap.login(imap_user, imap_pass) except: print("Failed to login") return False if "gmail" in imap_host: imap.select('"[Gmail]/Sent Mail"') # connect to sent mail. #print("Opening gmail 'Sent'") else: imap.select('Sent') # connect to sent mail. #print("Opening 'Sent'") # Search for the original email ID messages = imap.search(None, 'HEADER', 'MESSAGE-ID', email_inreply) # Process the result to get the message id’s messages = messages[1][0].split() # Use the first id to view the headers for a message result, source_data = imap.fetch(messages[0], '(RFC822)') raw_source = source_data[0][ 1] # here's the body, which is raw headers and html and body of the whole email s = email.message_from_bytes( raw_source) # convert to message object source_subject = s['subject'] source['date'] = s['Date'] source['bcc'] = s['bcc'] #.split(',') source['msg_id'] = s['Message-ID'] #print("BCC from source: ", source_bcc) source_body = s.get_payload() if s.is_multipart(): # search for text in the body for part in s.walk(): ctype = part.get_content_type() cdispo = str(part.get('Content-Disposition')) if ctype == ('text/plain' or 'text/html') and 'attachment' not in cdispo: source_body = part.get_payload() #print(email_body) break #print(frm, " Sent a reply to: ", source_subject) self.get_parent().msgIsReply = True src_sub = BeautifulSoup(source_subject, 'html.parser') try: # extra check for encryption (in case user has encypted email) src_body = BeautifulSoup(source_body, 'html.parser') except: # if email is encrypted it will throw an exception src_body = encription_warning source['subject'] = src_sub.get_text() source['body'] = src_body.get_text() return source except: print("no origin found") return False
def soupify(webtext):
    soup = BeautifulSoup(webtext)
    scraped = soup.get_text()
    return scraped.encode('utf-8')

def _read_mail(imap_host, imap_port, imap_user, imap_pass, imap_folder, eNum): # reads the most recent email and parses the text ### Reading emails from the server. The bulk of the logic is here ### We prosses an email, clean up the text, check if it is a reply ### If the message is a reply, search for the original email in the sent box ### If the original email exists, run a search on the inbox for all emails replying to the original ### And finally, check for and load images global ids_list if eNum == -1: ids_list = [] email_recieved = {'alreadyLoaded': False} try: if "gmail" in imap_host: # gmail server requires an ssl connection print("gmail server") imap = IMAP4_SSL(imap_host, imap_port) else: # tls is preferred imap = IMAP4(imap_host, imap_port) imap.starttls() ## login to server print(imap_user, imap_pass) imap.login(imap_user, imap_pass) except: print("Failed to login") return False #print(imap.list()) # for identifying mailboxes on the server imap.select("Inbox") # connect to all mail. result, data = imap.uid('search', None, "ALL") # search and return uids instead ids = data[0] # data is a list. id_list = ids.split() # ids is a space separated string current_email_uid = data[0].split()[eNum] #print(current_email_uid) result, data = imap.uid( 'fetch', current_email_uid, '(RFC822)' ) # fetch the email headers and body (RFC822) for the given ID raw_email = data[0][ 1] # here's the body, which is raw headers and html and body of the whole email b = email.message_from_bytes(raw_email) email_recieved['msg_id'] = b['Message-ID'] #print("printing id", msg_id, ids_list) for i in ids_list: if i == email_recieved['msg_id']: print("mail already loaded") email_recieved['alreadyLoaded'] = True #self.get_parent()._already_loaded() return ids_list.append(email_recieved['msg_id']) email_from = b['from'] email_subject = b['subject'] email_recieved['date'] = b['Date'] email_recieved['inreply'] = b['in-reply-to'] email_recieved['refs'] = b['references'] email_body = b.get_payload() if b.is_multipart(): # search for text in the body for part in b.walk(): ctype = part.get_content_type() cdispo = str(part.get('Content-Disposition')) if ctype == ('text/plain' or 'text/html') and 'attachment' not in cdispo: email_body = part.get_payload() #print(email_body) break # Use beautifulsoup to get readable text frm = BeautifulSoup(email_from, 'html.parser') sub = BeautifulSoup(email_subject, 'html.parser') try: # Try parsing the body text body = BeautifulSoup(email_body, 'html.parser') except: # if email is encrypted it will throw an exception email_recieved['body'] = encription_warning email_recieved['from'] = frm.get_text() email_recieved['subject'] = sub.get_text() #find just the email address add1 = re.findall("([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)", str(frm)) email_recieved['email'] = add1[0] if body != encription_warning: email_recieved['body'] = body.get_text() return email_recieved '''
def find_replies(imap_host, imap_port, imap_user, imap_pass, imap_folder, email_inreply): try: # On to find more emails that may be replies replies_list = [] try: if "gmail" in imap_host: # gmail server requires an ssl connection print("gmail server") imap = IMAP4_SSL(imap_host, imap_port) else: # tls is preferred imap = IMAP4(imap_host, imap_port) imap.starttls() ## login to server #print(imap_user, imap_pass) imap.login(imap_user, imap_pass) except: print("Failed to login") return False imap.select("Inbox") replies = imap.search(None, 'HEADER', 'IN-REPLY-TO', email_inreply) # BODY.PEEK[HEADER.FIELDS (SUBJECT)] print("searched inbox for ", email_inreply) # Process the result to get the message id’s replies = replies[1][0].split() print("got list of replies") # Use the first id to view the headers for a message replies.reverse() for i in replies: reply = {} print("Checking list of replies") result, reply_data = imap.fetch(i, '(RFC822)') print("loaded a reply") raw_reply = reply_data[0][ 1] # here's the body, which is raw headers and html and body of the whole email #print("raw reply") r = email.message_from_bytes( raw_reply) # convert to message object #reply_to = r['in-reply-to'] reply['refs'] = r['references'] print("references", reply['refs']) reply_from = r['from'] reply_email = re.findall( "([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)", str(reply_from)) reply['date'] = r['Date'] reply['msg_id'] = r['Message-ID'] reply_body = r.get_payload() if r.is_multipart(): # search for text in the body for part in r.walk(): ctype = part.get_content_type() cdispo = str(part.get('Content-Disposition')) if ctype == ('text/plain' or 'text/html' ) and 'attachment' not in cdispo: reply_body = part.get_payload() #print(email_body) break rep_from = BeautifulSoup(reply_from, 'html.parser') reply['email'] = reply_email[0] reply['from'] = rep_from.get_text() try: # extra check for encryption (in case user has encypted email) rep_body = BeautifulSoup(reply_body, 'html.parser') reply['body'] = rep_body.get_text() except: # if email is encrypted it will throw an exception reply['body'] = encription_warning #print("Hello! I am found, ") replies_list.append(reply) return replies_list except: return False print("No more replies found.")
from googlesearch import search
from bs4 import BeautifulSoup
import requests
import urllib.request

for i, j in workdf.iterrows():
    if (len(workdf['Company'][i])) <= 5:
        str_list = list('https://www.ifsccodebank.com/search-by-IFSC-code.aspx?IFSCCode=')
        link2 = workdf['Company'][i]
        print(link2, type(link2))
        str_list.append(link2)
        url = ''.join(str_list)
        print(url)
        response = requests.get(''.join(str_list))
        soup = BeautifulSoup(response.text, 'html.parser')
        t = soup.get_text()
        r = t[t.find(link2) + 7:]
        workdf['Title'][i] = (r[:r.find('-') - 5].strip())
        for j in search(str(workdf['Title'][i]), stop=1):
            workdf['Link'][i] = j
    else:
        workdf['Title'][i] = 'err'

workdf

"""### FOR non-acronym companies --- OBTAINING TITLE from LINK(column - WEBPAGE) TITLE"""

count = 0
for i, j in workdf.iterrows():
    if workdf['Title'][i] == 'err' or workdf['Title'][i].startswith('ifsccode') or workdf['Title'][i] == '0':
        url = workdf['Link'][i]

def scrape(url):
    resp = requests.get(url)
    if resp.status_code == 200:
        soup = BeautifulSoup(resp.text, 'html.parser')
        text = soup.get_text()
        return text

def Tell_Me_Alfred(query="The Himalayas are", answer_type="Description"): global ALL_RESULTS global ALL_ANSWERS_SORTED global ALL_ANSWERS ALL_RESULTS = [] ALL_ANSWERS = dict() for url in search(query, stop=20): try: #print(url) ALL_RESULTS.append(url) except: print("URL Error") #ALL_RESULTS=['http://www.victoriamemorial-cal.org/', 'http://www.tripadvisor.in/Attraction_Review-g304558-d311680-Reviews-Victoria_Memorial_Hall-Kolkata_Calcutta_West_Bengal.html', 'http://kolkata.cityseekr.com/venue/403224-victoria-memorial', 'http://www.thecityguide.in/Kolkata/Art-Entertainment/SGGG/Victoria-Memorial-Elgin', 'http://www.justdial.com/Kolkata/Victoria-Memorial-Hall/033P6853927_BZDET', 'http://en.wikipedia.org/wiki/Victoria_Memorial_(India)', 'http://en.wikipedia.org/wiki/Victoria_Memorial_(India)#History', 'http://en.wikipedia.org/wiki/Victoria_Memorial_(India)#Finance', 'http://en.wikipedia.org/wiki/Victoria_Memorial_(India)#Design', 'http://en.wikipedia.org/wiki/Victoria_Memorial_(India)#Construction', 'http://ww.itimes.com/poll/best-image-of-victoria-memorial-kolkata-54ad5bd294fa2', 'http://www.trekearth.com/gallery/Asia/India/East/West_Bengal/Kolkata_(Calcutta)/photo1412050.htm', 'http://www.culturalindia.net/monuments/victoria-memorial.html', 'http://knowindia.gov.in/knowindia/culture_heritage.php?id=68', 'http://www.youtube.com/watch?v=C_0IvslcRqU', 'http://www.ixigo.com/victoria-memorial-kolkata-india-ne-1019165', 'http://www.lonelyplanet.com/india/kolkata-calcutta/sights/architecture/victoria-memorial', 'http://www.indianholiday.com/tourist-attraction/kolkata/victoria-memorial.html', 'http://www.mapsofindia.com/kolkata/places-of-interest/famous-monuments/victoria-memorial.html', 'https://www.facebook.com/pages/Victoria-Memorial-Hall-Kolkata/113100222172879', 'http://www.iloveindia.com/indian-monuments/victoria-memorial.html', 'http://www.kolkata.org.uk/tourist-attractions/victoria-memorial.html', 'http://www.vmsb.org/contact_us.html', 'http://mocomi.com/victoria-memorial-facts/', 'http://www.journeymart.com/de/india/west-bengal/kolkata/victoria-memorial.aspx', 'http://www.theincredibleindiatravel.com/victoria-memorial-hall-india/victoria-memorial.html', 'http://goindia.about.com/od/cities/ig/Kolkata-Photo-Gallery/Victoria-Memorial.htm', 'http://zeenews.india.com/news/sci-tech/victoria-memorial-museum-blackout-in-kolkata-for-earth-hour_1569445.html'] #ALL_RESULTS=['http://en.wikipedia.org/wiki/Himalayas', 'http://en.wikipedia.org/wiki/Paro_Taktsang', 'http://en.wikipedia.org/wiki/List_of_Himalayan_peaks_and_passes', 'http://en.wikipedia.org/wiki/Indian_Himalayan_Region', 'http://en.wikipedia.org/wiki/Indian_Plate', 'http://simple.wikipedia.org/wiki/Himalayas', 'http://www.thehindu.com/sci-tech/energy-and-environment/emissions-from-biomass-burning-cross-the-himalayas/article7105899.ece', 'http://www.npr.org/blogs/goatsandsoda/2015/04/15/399579066/in-search-of-the-missing-trekkers-in-nepal-s-muddy-morass', 'http://www.nzherald.co.nz/bay-of-plenty-times/news/article.cfm?c_id=1503343&objectid=11434737', 'http://www.youtube.com/watch?v=HuSHOQ6gv5Y', 'http://www.britannica.com/EBchecked/topic/266037/Himalayas', 'http://www.english-online.at/geography/himalayas/himalaya-mountain-range.html', 'http://www.himalayanfootsteps.com/destinations/where-are-the-himalayas/', 'http://www.mountainprofessor.com/the-himalaya.html', 'http://www.himalaya2000.com/himalayan-facts/location-of-himalayas.html', 'http://www.unmuseum.org/yeti.htm', 'http://www.hitt-initiative.org/mla/?page_id=390', 
'http://www.robinsonlibrary.com/geography/physical/mountains/himalaya.htm', 'http://geography.howstuffworks.com/asia/the-himalayas.htm', 'http://www.kidsdiscover.com/spotlight/himalayas-kids/', 'http://pubs.usgs.gov/gip/dynamic/himalaya.html', 'http://www.todayifoundout.com/index.php/2013/12/himalayas-formed/', 'http://www.pbs.org/wgbh/nova/everest/earth/birth.html', 'http://www.pbs.org/wnet/nature/the-himalayas-himalayas-facts/6341/', 'http://www.pbs.org/wnet/nature/the-himalayas-introduction/6338/', 'http://www.oddizzi.com/teachers/explore-the-world/physical-features/mountains/mountain-case-study/himalayas/', 'https://vimeo.com/121045965', 'http://www.worldwildlife.org/places/eastern-himalayas', 'http://www.answers.com/Q/What_are_the_Himalayas'] print('YOUR TOP ANSWERS ARE:') c = 0.0 for res in ALL_RESULTS: Exact_Match_Found_flag = 0 try: timeout = 0 #print 'Checking Source:',res response = urllib.request.urlopen(res) page_data = response.read() page_data = BeautifulSoup(page_data) page_data = page_data.get_text() page_data = page_data.split('.') # Read from Individual Web Pages if answer_type == "Description": Start_T = time.time() for line in page_data: Curr_T = time.time() if (Curr_T - Start_T) > 15.0: break if re.findall(query.lower(), line.lower()) != []: c += 1.0 line_low = line.lower() line = line_low.split(query.lower()) print( '===============================================================================' ) print('Answer ', c, ':') line = query + line[1] + '.' print(line) print('\n\nSource: ', res) print( '===============================================================================' ) Exact_Match_Found_flag = 1 break elif answer_type == "Location": query_parts = query.split(' ') Start_T = time.time() for line in page_data: Curr_T = time.time() if (Curr_T - Start_T) > 30.0: break check_next = 0 for each_qp in query_parts: if re.findall(each_qp.lower(), line.lower()) == []: check_next = 1 break if check_next == 1: continue else: line_parts = line.split(' ') for each_lp in line_parts: if (each_lp in query_parts) or ( each_lp in IGNORE_LIST): #Skip the Query Words continue if check_WordNet( word=each_lp, def_word='city') or check_WordNet( word=each_lp, def_word='country') or check_WordNet( word=each_lp, def_word='continent' ) or check_WordNet(word=each_lp, def_word='state'): c += 1.0 print(each_lp) if each_lp not in ALL_ANSWERS: ALL_ANSWERS[each_lp] = 1 else: ALL_ANSWERS[each_lp] += 1 Exact_Match_Found_flag = 1 break if Exact_Match_Found_flag: break #print 'Finished Checking Source:',res except: print() #Give a Probability for One Word Answers if answer_type == "Location": ALL_ANSWERS_SORTED = [] all_ans = list(ALL_ANSWERS.keys()) for each_ans in all_ans: ALL_ANSWERS_SORTED.append([ALL_ANSWERS[each_ans], each_ans]) ALL_ANSWERS_SORTED.sort() print( '===============================================================================' ) print('SUMMARY:') print( '---------------------------------------------------------------------------' ) for each_sa in range(0, len(ALL_ANSWERS_SORTED)): idx = len(ALL_ANSWERS_SORTED) - 1 - each_sa print(ALL_ANSWERS_SORTED[idx][1]) print('Confidence Measure= ', (ALL_ANSWERS_SORTED[idx][0] / c * 100.0), '%') print( '---------------------------------------------------------------------------' ) print( '===============================================================================' )
def ProcessPage(options, mycursor, languages, mtProc, statusCode, orig_encoding, htmlText, pageURL, crawlDate, languagesClass): print("page", pageURL) if pageURL == "unknown": logging.info("Unknown page url") return if orig_encoding == None: logging.info("Encoding of document " + pageURL + " could not be identified") if len(htmlText) == 0: logging.info("Empty page") return # lang id # printable_str = ''.join(x for x in cleantree if x in string.printable) logging.info(pageURL + ": detecting language") success, lang = guess_lang_from_data2(htmlText) if success: langId = languagesClass.GetOrSaveLang(lang) else: return logging.info(pageURL + ": Getting text with BeautifulSoup") soup = BeautifulSoup(htmlText, features='html5lib') # lxml html.parser for script in soup(["script", "style", "img"]): script.extract() # rip it out plaintext = soup.get_text() if len(plaintext) > 0: # Guessing MIME of the file (checked on original content) logging.info(pageURL + ": Getting mime") mime = magic.from_buffer(htmlText, mime=True) # mimeFile.write(mime.encode() + b"\n") c = hashlib.md5() c.update(htmlText.encode()) hashDoc = c.hexdigest() pageURLId = SaveURL(mycursor, pageURL) docId = SaveDoc(mycursor, crawlDate, statusCode, pageURLId, langId, mime, hashDoc) # print("docId", docId) # links SaveLinks(mycursor, languages, mtProc, soup, pageURL, docId, languagesClass) # write html and text files filePrefix = options.outDir + "/" + str(docId) with lzma.open(filePrefix + ".html.xz", "wt") as htmlFile: htmlFile.write(htmlText) with lzma.open(filePrefix + ".text.xz", "wt") as textFile: textFile.write(plaintext) # print("plaintext", len(plaintext)) splitterCmd = "{bitextorRoot}/preprocess/moses/ems/support/split-sentences.perl -b -l {lang1}".format( bitextorRoot=bitextorRoot, lang1=lang) extractedLines = split_sentences(plaintext, splitterCmd, options.prune_type, options.prune_threshold) if os.path.exists(options.outDir): if not os.path.isdir(options.outDir): sys.stderr.write("Must be a directory: " + options.outDir) else: os.mkdir(options.outDir) # write splitted file extractPath = options.outDir + "/" + str( docId) + "." + lang + ".extracted.xz" with lzma.open(extractPath, 'wt') as extractFile: for extractedLine in extractedLines: extractFile.write(str(docId) + "\t" + extractedLine + "\n") if lang != languages[-1]: # translate transPath = options.outDir + "/" + str(docId) + ".trans.xz" transFile = lzma.open(transPath, 'wt') for inLine in extractedLines: pass # print("inLine", inLine) #inLine += "\n" #mtProc.stdin.write(inLine.encode('utf-8')) #mtProc.stdin.flush() #outLine = mtProc.stdout.readline() #outLine = outLine.decode("utf-8") #transFile.write(str(docId) + "\t" + outLine) transFile.close()
import string

import requests
from bs4 import BeautifulSoup

WIKI_URL = 'https://ia601405.us.archive.org/18/items/alicesadventures19033gut/19033.txt'
req = requests.get(WIKI_URL)
soup = BeautifulSoup(req.text, 'html5lib')
file = soup.get_text()

testo = ''.join([x for x in file if x in string.ascii_letters + ' ' + '-'])

word_counts = {}
parole = testo.strip().split(' ')
for value in parole:
    key = value.translate(str.maketrans('', '', string.punctuation)).lower()
    if key in word_counts.keys():
        word_counts[key] += 1
    else:
        word_counts[key] = 1

print(word_counts)

<li><a href="/scholarships-for-veterans" id="scholarship">
<li><a href="http://www.simplelearn.com/feed/" id="rss">RSS FEED</a></li>
</ul>"""

# <!--Create soup object-->
soup_SL = BeautifulSoup(data_SL, 'html.parser')

# if I do this I get all the info
print(soup_SL)

# parse only part of the document: the text values for tags, using the get_text method
print('_______________________Get only req___________________')
print(soup_SL.get_text())

# import the SoupStrainer class for parsing only the desired part of the web document
from bs4 import SoupStrainer

# create an object to parse only the id (link) with 'lab'
tags_with_LabLink = SoupStrainer(id='lab')

# print the part of the parsed document
print(BeautifulSoup(data_SL, 'html.parser', parse_only=tags_with_LabLink))

print('--------------------------')
print(
    BeautifulSoup(data_SL, 'html.parser',

# Import packages
import requests
from bs4 import BeautifulSoup

# Specify url
url = 'https://www.python.org/~guido/'

# Package the request, send the request and catch the response: r
r = requests.get(url)

# Extract the response as html: html_doc
html_doc = r.text

# Create a BeautifulSoup object from the HTML: soup
soup = BeautifulSoup(html_doc)

# Get the title of Guido's webpage: guido_title
guido_title = soup.title

# Print the title of Guido's webpage to the shell
print(guido_title)

# Get Guido's text: guido_text
guido_text = soup.get_text()

# Print Guido's text to the shell
print(guido_text)

def get_html_node_text(self, html):
    soup = BeautifulSoup(html, "lxml")
    return str(soup.get_text())

def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

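# A minimal usage sketch, not part of the original snippet; the markup and the
# expected output are illustrative.
from bs4 import BeautifulSoup

print(strip_html('<p>Plain <em>text</em>, please.</p>'))  # expected: Plain text, please.
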