def trigger_w(self, msg):
    "Usage: w <search term>. Prints a short description of the corresponding wikipedia article."
    if len(msg.args) == 0:
        self.bot.notice(msg.nick, "Please specify a search term")
        return

    params = {
        'action': 'opensearch',
        'format': 'xml',
        'limit': '2',
        'search': ' '.join(msg.args)
    }
    url = 'http://{:s}.wikipedia.org/w/api.php'.format(self.language)
    response = BeautifulStoneSoup(requests.post(url, data=params).text)

    # BS4 is case sensitive, hence all the regexes.
    if response.find(re.compile('text', re.I)):
        index = 0
        if "may refer to:" in response.find(re.compile('description', re.I)).string:
            index = 1
        info = response.find_all(re.compile('description', re.I))[index].string.strip()
        url = response.find_all(re.compile('url', re.I))[index].string
        short_url = self.shorten(url)
        message = u"\002Wikipedia ::\002 {} \002::\002 {}".format(info, short_url)
        self.bot.privmsg(msg.channel, message)
    else:
        self.bot.privmsg(msg.channel, "{}: no articles were found.".format(' '.join(msg.args)))
def run(self):
    while True:
        try:
            maxid = self.db.news_list()[0]['id']
        except:
            maxid = 1
        print(maxid)
        client = HTTPClient()
        response = client.fetch('http://cs.hust.edu.cn/rss')
        result = response.body.decode("utf-8", errors='ignore')
        soup = BeautifulStoneSoup(result)
        items = soup.find_all('item')
        for item in items:
            title = item.title.text
            link = item.link.text
            desc = item.description.text
            linkid = self.link_id(link)
            if linkid > maxid:
                result = self.db.add_news(linkid, title, desc, link)
                if result:
                    result = self.get_article(link)
            else:
                break
        time.sleep(3600)
def analyze_site_map():
    r = requests.get('{}{}sitemap.xml'.format(app.config['WEB_PROTOCOL'], app.config['DOMAIN']))
    soup = Soup(r.content)
    locs = soup.findAll('loc')
    return [loc.string for loc in locs]
def loadJATSSentence(self, s, newDocument, par_id, section_id):
    """
        Loads a JATS sentence (ready split)

        :param s: the plain text of the sentence (with all tags inside, e.g. <xref>)
        :param newDocument: SciDoc
        :param par_id: id of the paragraph containing this sentence
        :param section_id: id of the section containing the paragraph
    """
    newSent = newDocument.addSentence(par_id, "")
    s_soup = BeautifulStoneSoup(s)

    refs = s_soup.findAll("xref", {"ref-type": "bibr"})
    citations_found = []
    for r in refs:
        citations_found.extend(
            self.loadJATSCitation(r, newSent["id"], newDocument, section=section_id))

    non_refs = s_soup.findAll(
        lambda tag: tag.name.lower() == "xref" and "ref-type" in tag and tag["ref-type"].lower() != "bibr")
    for nr in non_refs:
        nr.name = "inref"

    newSent["citations"] = [acit["id"] for acit in citations_found]
    # TODO replace <xref> tags with <cit> tags
    newSent["text"] = newDocument.extractSentenceTextWithCitationTokens(s_soup, newSent["id"])
    ## print(newSent["text"])

    # deal with many citations within characters of each other: make them know they are a cluster
    # TODO cluster citations? Store them in some other way?
    newDocument.countMultiCitations(newSent)
def _parse_request(self):
    """
    Parses various parameters from _request_xml into _request_params.
    We need to override parse here as Microsoft Azure doesn't send
    AssertionConsumerServiceURL (ACS_URL).
    """
    # Minimal test to verify that it's not binarily encoded still:
    if not self._request_xml.strip().startswith('<'):
        raise Exception('RequestXML is not valid XML; '
                        'it may need to be decoded or decompressed.')
    soup = BeautifulStoneSoup(self._request_xml)
    request = soup.findAll()[0]
    if request.get('AssertionConsumerServiceURL', None):
        raise Exception(
            'Invalid Azure request. AssertionConsumerServiceURL exists!')

    params = {}
    params['ACS_URL'] = AZURE_ACS_URL
    params['REQUEST_ID'] = request.get('id', request.get('ID'))
    params['REQUEST_ISSUER'] = self._get_request_issuer(request)
    params['DESTINATION'] = request.get('Destination', '')
    params['PROVIDER_NAME'] = request.get('ProviderName', '')
    self._request_params = params

    # Set subject format - overrides the value set in _reset()
    self._subject_format = AZURE_SUBJECT_FORMAT
def extract_tags_bs4(self):
    """
    Using: BeautifulSoup's XML parser
    Returns XML data in dict format
    """
    soup = Soup(self.query_xml)           # XML as a string
    self.entries = soup.findAll('entry')  # list of <entry>'s
    find_authors = lambda x: x.find('name').string

    for entry in self.entries:
        # strip down entry ID in url to (say) -> 'abs/math/0507289v1'
        entry_id = urlparse(entry.find('id').string).path.lstrip('/')
        title = entry.find('title').string
        summary = entry.find('summary').string
        # findAll() for multiple entries
        authors = entry.findAll('author')  # returns list of data-type: BeautifulSoup.Tag
        # PYLINT chatters: authors = map(self.find_authors, authors)
        # using list comprehension instead
        authors = [find_authors(i) for i in authors]
        published = entry.find('published').string
        meta = {'title': title, 'summary': summary,
                'authors': authors, 'published': published}
        self.data[entry_id] = meta

    return self.data  # python dict
class FollowThatTag(SoupTest):
    "Tests the various ways of fetching tags from a soup."

    def setUp(self):
        ml = Display.write(self)
        self.soup = BeautifulStoneSoup(ml)

    def testFindAllByName(self):
        matching = self.soup('https://stackoverflow.com/jobs/feed?l=Bridgewater%2c+MA%2c+United+States&u=Miles&d=50')
        self.assertEqual(len(matching), 2)
        self.assertEqual(matching[0].name, 'a')
        self.assertEqual(matching, self.soup.findAll('a'))
        self.assertEqual(matching, self.soup.findAll(SoupStrainer('a')))

    def testFindAllText(self):
        soup = BeautifulSoup("<html>\xbb</html>", "lxml")
        self.assertEqual(soup.findAll(text=re.compile('.*')), [u'\xbb'])

    def testTextNavigation(self):
        soup = BeautifulSoup(
            '<url>http://cdn.sstatic.net/Sites/stackoverflow/img/favicon.ico?v=4f32ecc8f43d</url>'
            '<title>Small funded Boston start-up seeks Senior Python/Django Developer at Circulation (Boston, MA)</title>',
            "lxml")
        baz = soup.find(text='Small funded Boston start-up seeks Senior Python/Django Developer at Circulation (Boston, MA)')
        self.assertEqual(baz.findParent("url")['title'])
def _parse_html(self, url):
    '''
    Loads the page at `url` and processes every link found on it.
    Recursively calls itself for the first link that has not been
    processed yet, so the whole site gets parsed.
    '''
    html = None
    page_content = None
    self._processed.add(url)
    self._recursion_counter += 1

    try:
        with urllib.request.urlopen(url) as response:
            html = response.read()
    except Exception:
        html = None
        print('Unable to load url %s' % url)

    if html:
        try:
            page_content = Soup(html)
        except Exception:
            page_content = None

    if page_content:
        stop_list = ('#', '', '/')
        for a in page_content.find_all('a', href=True):
            if a['href'] not in stop_list:
                href = self._build_link(url=a['href'], location_parts=urlparse(url))
                if href:
                    try:
                        self._url_validator(href)
                    except ValidationError:
                        print('%s is not valid url' % href)
                    else:
                        self._finds.add(href)

    self._add_location(url)
    unprocessed = self._finds - self._processed
    print('Всего страниц: %s. Обработано страниц: %s. Найдено рецептов: %s. Последний URL: %s'
          % (len(self._finds), len(self._processed), len(self._urls), url))

    # Save self._urls on every 20th call of this method
    if self._recursion_counter % 20 == 0:
        self._save()
        self._recursion_counter = 0

    if unprocessed:
        if self.sleep_time > 0:
            time.sleep(self.sleep_time)
        next_url = list(unprocessed)[0]
        self._parse_html(next_url)
def get_all_urls(self):
    """Returns the list of urls"""
    list_of_urls = []
    for url in self.urls:
        request = self.session.get(url, headers=self.headers)
        soap = BeautifulStoneSoup(request.content)
        urls = soap.find_all('loc')
        list_of_urls += [url.next_element for url in urls]
    return list_of_urls
def read(self, xml, identifier):
    """
        Load a JATS/NLM (PubMed) XML into a SciDoc.

        :param xml: full xml string
        :type xml: basestring
        :param identifier: an identifier for this document, e.g. file name
            If an actual full path, the path will be removed from it when stored
        :type identifier: basestring
        :returns: :class:`SciDoc <SciDoc>` object
        :rtype: SciDoc
    """
    # this solves a "bug" in BeautifulStoneSoup with "sec" tags
    BeautifulStoneSoup.NESTABLE_TAGS["sec"] = []
    #xml=fixNumberCitationsXML(xml)
    soup = BeautifulStoneSoup(xml)

    # Create a new SciDoc to store the paper
    newDocument = SciDoc()
    metadata = newDocument["metadata"]
    metadata["filename"] = os.path.basename(identifier)
    metadata["original_citation_style"] = detectCitationStyle(xml)

    body = soup.find("body")
    if not body:
        # TODO: Make the error handling less terrible
        debugAddMessage(newDocument, "error",
                        "NO <BODY> IN THIS PAPER! file: " + identifier)
        newDocument["metadata"]["guid"] = cp.Corpus.generateGUID()
        return newDocument

    # Load metadata, either from corpus or from file
    self.loadJATSMetadataFromPaper(newDocument, soup)
    metadata["guid"] = cp.Corpus.generateGUID(metadata)

    # Load all references from the XML
    back = soup.find("back")
    if back:
        ref_list = back.find("ref-list")
        # other things in <back> like appendices: ignore them for now
        if ref_list:
            for ref in ref_list.findAll("ref"):
                self.loadJATSReference(ref, newDocument)

    newDocument.updateReferences()

    # Load Abstract
    self.loadJATSAbstract(soup, newDocument)

    for sec in body.findChildren("sec", recursive=False):
        self.loadJATSSection(sec, newDocument, "root")

    newDocument.updateAuthorsAffiliations()
    return newDocument
def open(self, book_id=None):
    if book_id:
        self.book_id = book_id
    if not self.book_id:
        raise Exception('Book id not set')
    self.size = os.path.getsize(self._FILE % (self.book_id, self.book_id))
    sz_mult = 1.0 / (1024 ** 2)
    result = u'%.1f' % (self.size * sz_mult)
    self.size = u'<0.1' if result == u'0.0' else result
    self.f = zipfile.ZipFile(self._FILE % (self.book_id, self.book_id), 'r')
    soup = BeautifulStoneSoup(self.f.read('META-INF/container.xml'))
    oebps = soup.findAll('rootfile')[0]['full-path']
    folder = oebps.rfind(os.sep)
    self.oebps_folder = '' if folder == -1 else oebps[:folder + 1]  # name of the OEBPS folder
    oebps_content = self.f.read(oebps)
    self.read_doc_props(oebps_content)
    opf_bs = BeautifulStoneSoup(oebps_content)
    ncx = opf_bs.findAll('item', {'id': 'ncx'})[0]
    ncx = self.oebps_folder + ncx['href']  # full path of the ncx file
    ncx_bs = BeautifulStoneSoup(self.f.read(ncx))
    self.chapters = [
        (nav.navlabel.text, nav.content['src'])
        for nav in ncx_bs.findAll('navmap')[0].findAll('navpoint')
    ]
    self.cover_href = self.chapters[0][1]  # path of the cover page
def extract_info_from_html():
    from bs4 import BeautifulStoneSoup
    import re

    html = open("data/url.html").read()
    soup = BeautifulStoneSoup(html)
    inputTag = soup.findAll("a")
    inputTag = str(inputTag).split(",")
    m = [re.search(" +href=\"(.*?)\"", i) for i in inputTag]
    urls = [i.group(1) for i in m]
    code = [
        i[9:-9].replace("<", "")
        for i in str(soup.findAll('strong')).split(",")
    ]
    city = [
        i.split('<span class="uni-code">')[0].replace("\t", "").replace("</span>", "").replace("\n", "")
        for i in html.split('<i class="fa fa-map-marker" aria-hidden="true"></i>')[1:]
    ]
    abbr = [
        i.split('</div>')[0].replace("\t", "").replace("</span>", "").replace("\n", "")
        for i in html.split('<div class="name-group">')[1::2]
    ]

    # ADD CODE TO UNI_INFO
    map_abbr_code = [{"abbr": m, "code": n} for m, n in zip(abbr, code) if m != ""]

    import json
    uni = json.load(open("data/university.json"))
    len(uni)
    new_uni = []
    abbrs = []
    for i in uni:
        if i["abbr"] in abbrs:
            continue
        else:
            for j in map_abbr_code:
                if j["abbr"] == i["abbr"]:
                    i["code"] = j["code"]
                    break
            new_uni.append(i)
            abbrs.append(i["abbr"])

    with open('data/university_add_code.json', 'w') as outfile:
        json.dump(new_uni, outfile, ensure_ascii=False, indent=4)
def __call__(self, content: str) -> str:
    filenames = []
    soup = BeautifulStoneSoup(content)
    links = soup.table.findAll('a')
    for link in links:
        filenames.append(link.text)
    return '\n'.join(filenames)
def test_beautifulstonesoup_is_xml_parser(self):
    # Make sure that the deprecated BSS class uses an xml builder
    # if one is installed.
    with warnings.catch_warnings(record=True) as w:
        soup = BeautifulStoneSoup("<b />")
    self.assertEqual("<b/>", str(soup.b))
    self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
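The test above exercises the deprecated BeautifulStoneSoup alias. As a minimal sketch (not taken from any snippet here, assuming the lxml package is installed), the non-deprecated route is to ask BeautifulSoup for an XML builder explicitly:

from bs4 import BeautifulSoup

# Parse the same markup through the current entry point by naming an
# XML parser; "xml" selects lxml's XML tree builder.
soup = BeautifulSoup("<b />", "xml")
print(soup.b)  # prints: <b/>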
def fuseReferences(doc, ref):
    """
    """
    prevref = doc["references"][-1]
    doc["metadata"]["ref_replace_list"] = doc["metadata"].get("ref_replace_list", {})

    id = ""
    try:
        id = ref["id"]
        if not id:
            id = prevref["id"]
            if isinstance(id, six.string_types):
                id = "ref" + str(len(doc["references"]) + 1)
            elif isinstance(id, int):
                id = id + 1
    except:
        id = "ref" + str(len(doc["references"]) + 1)

    doc["metadata"]["ref_replace_list"][id] = prevref["id"]
    doc["references"].remove(prevref)

    fullstring = re.sub(r"</reference>", "", prevref["xml"], 0, re.IGNORECASE)
    fullstring += re.sub(r"<reference.+?>", "", ref.__repr__(), 0, re.IGNORECASE)
    ## ref=BeautifulStoneSoup(prevref["xml"]+ref.__repr__())
    ref = BeautifulStoneSoup(fullstring).find("reference")
    processReferenceXML(ref, doc, False)
def HTMLEntitiesToUnicode(text):
    """Converts HTML entities to unicode. For example '&amp;' becomes '&'."""
    text = BeautifulStoneSoup(text, convertEntities=BeautifulStoneSoup.ALL_ENTITIES)
    return text
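For entity decoding alone, the standard library is enough; a small sketch under the assumption of Python 3 (not part of the snippet above):

import html

# html.unescape converts named and numeric HTML entities to their
# unicode characters without needing BeautifulSoup at all.
print(html.unescape("Fish &amp; Chips &#169; caf&eacute;"))  # Fish & Chips © café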
def read(self, xml, filename):
    """
        Load a document from the Athar corpus

        Args:
            xml: full xml string
    """
    ## # this solves a "bug" in BeautifulStoneSoup with "sec" tags
    ## BeautifulStoneSoup.NESTABLE_TAGS["sec"]=[]

    soup = BeautifulStoneSoup(xml)

    paper_data_node = soup.find("div", {"class": "dstPaperData"})
    paper_data = {
        "id": paper_data_node.text,
        "title": "",
        "authors": "",
    }

    title = paper_data_node.find("div", {"class": "dstPaperTitle"})
    if title:
        paper_data["title"] = title.text

    authors = paper_data_node.find("div", {"class": "dstPaperAuthors"})
    if authors:
        author_chunks = authors.text.split(";")
        for author in author_chunks:
            chunks = author.split(",")
            author_dict = {"given": chunks[1], "family": chunks[0]}
            paper_data["authors"] = author_dict

    ## print(paper_data)

    all_contexts = []
    all_docs = []
    document_nodes = soup.findAll("table", {"class": "srcPaper"})

    for index, document_node in enumerate(document_nodes):
        try:
            doc, contexts = self.loadDocumentNode(document_node, paper_data, index)
            all_docs.append(doc)
            all_contexts.extend(contexts)
        except ValueError:
            print("Error:", sys.exc_info()[1])
            break

    return all_docs, all_contexts
def parse(self, content: str) -> str:
    """Parses web content"""
    filenames = []
    soup = BeautifulStoneSoup(content)
    links = soup.table.findAll('a')
    for link in links:
        filenames.append(link['href'])
    return '\n'.join(filenames)
def __init__(self):
    try:
        # Fetch the xml as a string
        with req.urlopen(self.URL) as open_url:
            soup = BeautifulStoneSoup(open_url.read())
            self.FEED = (series.series_from_xml(soup, 'item'))[2:]
    except Exception as e:
        logger.error("Cannot get a XML-file: %s" % e)
def response_soup(self):
    "Returns a BeautifulSoup object of the response."
    if not self._response_soup:
        self._response_soup = BeautifulStoneSoup(
            str(self._response_content, encoding='utf-8'))
    return self._response_soup
def run(self, file_name, user, **kwargs):
    """
    Parse the given xml file using BeautifulSoup.
    Save all Article, Redirect and Page objects.
    """
    f = open(file_name, 'r')
    xml = f.read()
    f.close()

    soup = BeautifulStoneSoup(xml)
    items = soup.find_all('item')

    for item in items:
        post_type = item.find('wp:post_type').string
        post_status = item.find('wp:status').string

        if post_type == 'attachment':
            get_media(item, user)
            # Note! This script assumes all the attachments come before
            # posts and pages in the xml. If this ends up changing,
            # do two loops, one with attachments and the second with posts and pages.
        elif post_type == 'post' and post_status == 'publish':
            get_posts(item, user)
        elif post_type == 'page' and post_status == 'publish':
            get_pages(item, user)

    if user.email:
        context = {
            'SITE_GLOBAL_SITEDISPLAYNAME': get_setting('site', 'global', 'sitedisplayname'),
            'SITE_GLOBAL_SITEURL': get_setting('site', 'global', 'siteurl'),
        }
        subject = ''.join(
            render_to_string(
                template_name=('notification/wp_import/short.txt'),
                context=context).splitlines())
        body = render_to_string(
            template_name=('notification/wp_import/full.html'),
            context=context)
        #send_mail(subject, body, settings.DEFAULT_FROM_EMAIL, [user.email], fail_silently=False)
        email = EmailMessage(subject, body, settings.DEFAULT_FROM_EMAIL, [user.email])
        email.content_subtype = 'html'
        email.send(fail_silently=True)
def parse_data(self, url):
    '''Collects the scraped data into a dict'''
    request = self.session.get(url, headers=self.headers)
    if request.status_code == 200:
        soup = BeautifulStoneSoup(request.content)
        if not (bool(soup.find('div', {"class": 'error404__text'}))
                or bool(soup.find('div', {"class": 'nothing-search'}))
                or bool(soup.find('div', {"id": 'productList'}))):
            try:
                name_of_product = soup.find('h1').next_element
            except Exception:
                raise Format_Exeption('name', url)
            try:
                price_for_all = soup.find(
                    'span',
                    {"class": "item__price item__price--normal-left"}
                ).next_element.replace(" ", "").replace("\n", "")
            except Exception:
                price_for_all = "Нет в наличии"
            try:
                price_for_registered = soup.find(
                    'span',
                    {"class": "item__price item__price--red-bold"}
                ).next_element.replace(" ", "").replace("\n", "")
            except Exception:
                price_for_registered = "Нет в наличии"
            try:
                reference = soup.findAll('div', {"class": "item__card-info-articul"})
                reference = reference[1].next_element
                reference = str(reference).split()[2].replace("-", '')
            except Exception:
                reference = "Нет номера"
            final = {
                "name_of_product": name_of_product,
                "price_for_all": price_for_all,
                "price_for_registered": price_for_registered,
                "reference": reference,
                "url": url
            }
            return final
        else:
            print("Не тот формат, вот ссылка {0}".format(url))
            raise Format_Exeption
    else:
        raise Connection_Exception
def render(self, context):
    fancount = ''
    fb_api_url = 'http://api.facebook.com/restserver.php'
    tw_api_url = 'http://api.twitter.com'
    cache_key = ''
    cache_time = 1800

    if self.service == "facebook":
        query = '%s?method=facebook.fql.query&query=SELECT%%20fan_count%%20FROM%%20page%%20WHERE%%20page_id=%s'
        xml_path = query % (fb_api_url, self.service_id)
        cache_key = md5(xml_path.encode()).hexdigest()
        fancount = cache.get(cache_key)
        if not fancount:
            try:
                xml = urlopen(xml_path)
                content = xml.read()
                soup = BeautifulStoneSoup(content)
                nodes = soup.find_all('page')
                for node in nodes:
                    fancount = node.fan_count.string
                cache.set(cache_key, fancount, cache_time)
            except:
                pass

    if self.service == "twitter":
        query = "%s/1/users/show/%s.xml"
        xml_path = query % (tw_api_url, self.service_id)
        cache_key = md5(xml_path.encode()).hexdigest()
        fancount = cache.get(cache_key)
        if not fancount:
            try:
                xml = urlopen(xml_path)
                content = xml.read()
                soup = BeautifulStoneSoup(content)
                nodes = soup.find_all('user')
                for node in nodes:
                    fancount = node.followers_count.string
                cache.set(cache_key, fancount, cache_time)
            except:
                pass

    return fancount
def get_info(self, account):
    request = urllib.request.Request(self.info_url)
    response = self.opener.open(request)
    content = response.read().decode(self.character).encode("utf-8")
    file = open('new/' + account + '.html', 'wb')
    file.write(content)
    file.close()
    detail_html = BeautifulStoneSoup(content)
    img_url = detail_html.find(id="Student11_Image1")
    link = img_url.get('src')
    link = link[2:]
    pto_url = 'http://szjy.swun.edu.cn/Sys/SystemForm' + link
    # replace the Chinese word for "photo" with its URL-encoded form
    pto_url = pto_url.replace('照片', '%D5%D5%C6%AC')
    urllib.request.install_opener(opener=self.opener)
    img_name = 'photos/' + account + '.jpg'
    urllib.request.urlretrieve(pto_url, img_name)
    self.cookie = self.cookie.clear()
def parseCermineXML(self, xml_string):
    """
        This is meant to load the full output from Cermine, whichever it may be.
        Currently only reads references.
    """
    soup = BeautifulStoneSoup(xml_string,
                              convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    ## print(xml_string)
    references = self.readReferences(soup)
    # TODO implement reading the rest of the Cermine/ParsHed tagging
    return references
def parse(self):
    '''
    Builds a JSON list of recipe urls for the site and saves it to
    MEDIA_ROOT/parser/source.js. Depending on the settings it either
    analyses the sitemap or parses the html pages.
    '''
    # Parsing via the sitemap
    if hasattr(settings, 'PARSER__URL_SOURCE') and settings.PARSER__URL_SOURCE == 'sitemap':
        xml = None
        if not hasattr(settings, 'PARSER__SITEMAP_URL') or not settings.PARSER__SITEMAP_URL:
            print('PARSER__SITEMAP_URL is not defined')
        else:
            try:
                with urllib.request.urlopen(settings.PARSER__SITEMAP_URL) as response:
                    xml = response.read()
            except Exception:
                xml = None
        if xml:
            sitemap = Soup(xml)
            urls = sitemap.findAll('url')
            for u in urls:
                loc = u.find('loc').string
                self._add_location(loc)
    else:
        # Parsing the html pages by tags
        if not hasattr(settings, 'PARSER__CELL_HOMEPAGE') or not settings.PARSER__CELL_HOMEPAGE:
            print('PARSER__CELL_HOMEPAGE is not defined')
            return False
        # Counter of recursive calls to _parse_html
        self._recursion_counter = 0
        self._parse_html(settings.PARSER__CELL_HOMEPAGE)

    self._save()
    return self.json_file_path
def soup_maker(fh):
    """Takes a file handler, returns BeautifulSoup"""
    try:
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(fh, "lxml")
        for tag in soup.find_all():
            tag.name = tag.name.lower()
    except ImportError:
        # fall back to the legacy BeautifulSoup 3 package if bs4 is unavailable
        from BeautifulSoup import BeautifulStoneSoup
        soup = BeautifulStoneSoup(fh)
    return soup
def login(self):
    """ Read greeting """
    greeting = self.read()
    soup = BeautifulStoneSoup(greeting, 'lxml')
    svid = soup.find('svid')
    version = soup.find('version')
    print("Connected to %s (v%s)\n" % (svid.text, version.text))

    """ Login """
    xml = commands.login % self.config
    if not self.cmd(xml, silent=True):
        exit(1)
def getSeriesDetailsByName(self, serieName):
    if serieName in IGNORE_SHOWS:
        return None
    print 'checking: ' + serieName
    if serieName in KNOWN_SHOWS.keys():
        url = GET_SERIES_URL % (urllib.quote(KNOWN_SHOWS[serieName]['TVDBName']))
    else:
        url = GET_SERIES_URL % (urllib.quote(serieName))
    try:
        # Change the User Agent
        USER_AGENT = 'Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10'
        cj = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))

        req = urllib2.Request(url)
        req.add_header('User-Agent', USER_AGENT)
        resp = opener.open(req)
        soup = BeautifulStoneSoup(resp.read())
        resp.close()

        if len(soup.findAll('series')) == 1:
            self.saveSerieDetail(serieName, soup.series)
        else:
            for serie in soup.findAll('series'):
                if serie.seriesname.string == serieName:
                    self.saveSerieDetail(serieName, serie)

        if serieName in KNOWN_SHOWS.keys():
            return KNOWN_SHOWS[serieName]
        return None
    except:
        print 'Error: ' + url
        return None
def get_text(fileDir):
    document = zipfile.ZipFile(fileDir)
    #xml_content = document.read('content.xml')
    #document.close()
    #xml = parse(document.)
    #xml = parse('inputText/content.xml')
    #print(document.filelist)
    #print(document.open('content.xml'))
    xml = parse(document.open('content.xml'))
    textSoup = BeautifulStoneSoup(document.read('content.xml'))
    #print(textSoup.prettify())
    #print(textSoup.get_text())
    document.close()
    """
    officeText = xml.getElementsByTagName('office:text')
    textFromDoc = []
    if len((officeText[0].childNodes)) != 0:
        for officeNode in officeText[0].childNodes:
            if len(officeNode.childNodes) != 0:
                for nextNode1 in officeNode.childNodes:
                    if len(nextNode1.childNodes) == 0:
                        if nextNode1.nodeValue == None:
                            textFromDoc.append(' ')
                        else:
                            textFromDoc.append(nextNode1.nodeValue)
                    else:
                        for nextNode2 in nextNode1.childNodes:
                            if len(nextNode2.childNodes) == 0:
                                textFromDoc.append(nextNode2.nodeValue)
    """
    #for node in text:
    #    textFromDoc.append(getTextFromTag(node))
    #    print(getTextFromTag(node))
    return textSoup.get_text()
def _parse_request(self):
    """
    Parses various parameters from _request_xml into _request_params.
    """
    # Minimal test to verify that it's not binarily encoded still:
    if isinstance(self._request_xml, bytes):
        request_xml = self._request_xml.decode('utf-8')
    else:
        request_xml = self._request_xml
    if not request_xml.strip().startswith('<'):
        raise Exception('RequestXML is not valid XML; '
                        'it may need to be decoded or decompressed.')
    soup = BeautifulStoneSoup(self._request_xml)
    request = soup.findAll()[0]
    params = {}
    params['ACS_URL'] = request.get('AssertionConsumerServiceURL')
    params['REQUEST_ID'] = request.get('id', request.get('ID'))
    params['DESTINATION'] = request.get('Destination', '')
    params['PROVIDER_NAME'] = request.get('ProviderName', '')
    self._request_params = params
def getDetailsForSerieByID(self, serieName, serieID):
    url = SERIE_DETAILS_URL % (urllib.quote(serieID))
    try:
        # Change the User Agent
        USER_AGENT = 'Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10'
        cj = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))

        req = urllib2.Request(url)
        req.add_header('User-Agent', USER_AGENT)
        resp = opener.open(req)
        soup = BeautifulStoneSoup(resp.read())
        resp.close()

        for banner in soup.banners.findAll('banner'):
            if banner.language.string == 'en':
                if not 'Fanart' in KNOWN_SHOWS[serieName].keys() and banner.bannertype.string == 'fanart':
                    KNOWN_SHOWS[serieName]['Fanart'] = str(BANNER_URL % (banner.bannerpath.string))
                    if banner.thumbnailpath:
                        KNOWN_SHOWS[serieName]['FanartThumb'] = str(BANNER_URL % (banner.thumbnailpath.string))
                elif not 'Poster' in KNOWN_SHOWS[serieName].keys() and banner.bannertype.string == 'poster':
                    KNOWN_SHOWS[serieName]['Poster'] = str(BANNER_URL % (banner.bannerpath.string))
                    if banner.thumbnailpath:
                        KNOWN_SHOWS[serieName]['PosterThumb'] = str(BANNER_URL % (banner.thumbnailpath.string))
                elif not 'Season' in KNOWN_SHOWS[serieName].keys() and banner.bannertype.string == 'season':
                    KNOWN_SHOWS[serieName]['Season'] = str(BANNER_URL % (banner.bannerpath.string))
                    if banner.thumbnailpath:
                        KNOWN_SHOWS[serieName]['SeasonThumb'] = str(BANNER_URL % (banner.thumbnailpath.string))
                elif not 'Series' in KNOWN_SHOWS[serieName].keys() and banner.bannertype.string == 'series':
                    KNOWN_SHOWS[serieName]['Series'] = str(BANNER_URL % (banner.bannerpath.string))
                    if banner.thumbnailpath:
                        KNOWN_SHOWS[serieName]['SeriesThumb'] = str(BANNER_URL % (banner.thumbnailpath.string))

        return KNOWN_SHOWS[serieName]
    except:
        print 'Error: ' + url
        return None
def parse_data(self, url):
    '''Collects the scraped data into a dict'''
    request = self.session.get(url, headers=self.headers)
    if request.status_code == 200:
        soap = BeautifulStoneSoup(request.content)
        if not (bool(soap.find('table', {"class": 'map-columns'}))
                or bool(soap.find('div', {"class": 'col-md-12 catalog-items'}))):
            try:
                name_of_product = soap.find('h1', {'class': 'title'}).next_element
            except Exception:
                raise Format_Exeption('name', url)
            try:
                price_for_all = soap.find(
                    'div', {"class": "price"}
                ).next_element.replace(" ", "").replace("\n", "")[:-1]
            except Exception:
                price_for_all = "Нет в наличии"
            try:
                price_for_rozn = soap.find(
                    'div', {"class": "rozn-price"}
                ).next_element.replace(" ", "").replace("\n", "")[:-1]
                price_for_rozn = ''.join(filter(str.isdigit, price_for_rozn))
            except Exception:
                price_for_rozn = "Нет в наличии"
            try:
                reference = soap.find(
                    'div', {'class': 'article'}
                ).next_element.replace("-", '')[9:]
            except Exception:
                reference = "Нет номера"
            final = {
                "name_of_product": name_of_product,
                "price_for_all": price_for_all,
                "price_for_registered": price_for_rozn,
                "reference": reference,
                "url": url
            }
            return final
        else:
            print("Не тот формат, вот ссылка {0}".format(url))
            raise Format_Exeption
    else:
        raise Connection_Exception
def HTMLEntitiesToUnicode(self, text):
    """
    Converts HTML entities to unicode. For example '&amp;' becomes '&'.

    Args:
        text: HTML laden text to convert to unicode
    Returns:
        String converted to unicode
    """
    try:
        text = str(BeautifulStoneSoup(text, convertEntities=BeautifulStoneSoup.ALL_ENTITIES))
        return text
    except Exception as e:
        print("error formatting string: %s ; Errors: %s" % (text, e))
        return None
def parse(word: str, soup: BeautifulStoneSoup) -> dict:
    entries = []
    word = {'word': word, 'entries': entries}
    for entry in soup.find_all(class_='ldoceEntry Entry'):
        entries.append({})
        last_entry = entries[-1]
        with suppress(AttributeError):
            american_pron = entry.find(class_='AMEVARPRON')
            american = f'/{american_pron.text.strip()}/' if american_pron else ''
            last_entry['pron'] = '/{english}/ {american}'.format(
                english=entry.find(class_='PRON').text.strip(),
                american=american,
            ).rstrip()
        try:
            last_entry['pos'] = entry.find(class_='POS').text.strip()
        except AttributeError:
            entries.pop()
            continue
        senses = last_entry['senses'] = []
        for sense in entry.find_all(class_='Sense'):
            senses.append({})
            last_sense = senses[-1]
            try:
                last_sense['definition'] = sense.find(class_='DEF').text.strip()
            except AttributeError:
                try:
                    last_sense['definition'] = sense.find(class_='REFHWD').text.strip()
                except AttributeError:
                    senses.pop()
                    continue
            find_rel = sense.find(class_='RELATEDWD')
            if find_rel:
                last_sense['rel'] = find_rel.text.strip()[2:]
            find_syn = sense.find(class_='SYN')
            if find_syn:
                last_sense['syn'] = find_syn.text.strip()[4:]
            find_opp = sense.find(class_='OPP')
            if find_opp:
                last_sense['opp'] = find_opp.text.strip()[4:]
            last_sense['examples'] = [
                e.text.strip() for e in sense.find_all(class_='EXAMPLE')
            ]
    return word
from bs4 import BeautifulStoneSoup
import json
import os

# Rip tags from dumped evernote file
markup = open('../data/aaronsw.enex').read()
soup = BeautifulStoneSoup(markup)
posts = soup.find_all('note')
tagged_posts = [i for i in posts if len(i.find_all('tag')) > 0]

tagged_posts_dict = {}
for post in tagged_posts:
    post_id = post.find_all('title')[0].text
    tags = [tag.text for tag in post.find_all('tag')]
    print post_id, tags
    tagged_posts_dict[post_id] = tags

# Add tags to blog_posts.json
blog_posts_file = open(os.path.join('..', 'data', 'blog_posts.json'), 'r+')
blog_posts = json.loads(blog_posts_file.read())
for post_title, post in blog_posts.iteritems():
    post_tags = tagged_posts_dict.get(post['postid'], [])
    blog_posts[post_title]['tags'] = post_tags
# Replace the version of the cyREST dependency with the one read from ver_map
def replaceVer(item):
    app_name = 'cyREST'
    item.version.string.replace_with(ver_map[app_name])


# Read versions from output of shell script
VER_FILE = './' + build_dir + '/apps/versions.txt'
ver_map = {}
with open(VER_FILE, 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        ver_map[row[0]] = row[1]
print(ver_map)

XMLFILE = './' + build_dir + '/cytoscape/gui-distribution/assembly/pom.xml'
f = open(XMLFILE, 'r')
soup = BeautifulStoneSoup(f.read())
f.close()

res = soup.build.plugins.find_all('plugin')
print(type(res))
for r in res:
    p = r.artifactId
    if p.text == 'maven-dependency-plugin':
        replaceVer(r)

with open(XMLFILE, "w+b") as file:
    file.write(soup.prettify('utf-8', formatter='xml'))
#-*- coding: utf-8 -*-
import MySQLdb
from bs4 import BeautifulStoneSoup

db = MySQLdb.connect('localhost', 'root', '80671551192', 'test')
cursor = db.cursor()

xml_cinema = open('dumps/cinema.xml')
soup = BeautifulStoneSoup(xml_cinema)

for i in soup.findAll('cinema'):
    id = int(i['id'])
    cinema = i['name'].encode('utf-8')
    city_id = int(i['id'])
    cinema_circuit_id = ''
    street_type_id = ''
    street_name = ''
    number_housing = ''
    number_hous = ''
    letter_housing = ''
    try:
        zip = int(i.zip['value'])
    except ValueError:
        zip = 0
    opening = ''
    note = ''
    code = ''
    coding = "SET NAMES 'utf8'"
    cursor.execute(coding)
    # octopress will not show comment input??
    # ex> open, closed
    wp_comment_status = _(item.find("comment_status"))
    out.write(u'comments: %s\n' % ('true' if wp_comment_status == u'open' else 'false'))

    # end of yaml header
    out.write(u'---\n')

    content = _(item.find("encoded"))
    content = to_markdown(content.strip())
    out.write(content)
    out.close()


if __name__ == '__main__':
    if DEBUG:
        if os.access(LOGFILE, os.F_OK):
            os.remove(LOGFILE)

    # if len(sys.argv) > 1:
    #     XML = sys.argv[1]

    print 'loading...'
    soup = BeautifulStoneSoup(open(XML), features="xml")

    print 'parsing...'
    for item in soup.find_all("item"):
        parse_item(item)

    print 'done'