Example #1
	def trigger_w(self, msg):
		"Usage: w <search term>. Prints a short description of the corresponding wikipedia article."
		if len(msg.args) == 0:
			self.bot.notice(msg.nick, "Please specify a search term")
			return

		params = {
			'action': 'opensearch',
			'format': 'xml',
			'limit': '2',
			'search': ' '.join(msg.args)
		}
		url = 'http://{:s}.wikipedia.org/w/api.php'.format(self.language)

		response = BeautifulStoneSoup(requests.post(url, data=params).text)

		# BS4 is case-sensitive, hence all the regexes.
		if response.find(re.compile('text', re.I)):
			index = 0
			if "may refer to:" in response.find(re.compile('description', re.I)).string:
				index = 1

			info = response.find_all(re.compile('description', re.I))[index].string.strip()
			url = response.find_all(re.compile('url', re.I))[index].string

			short_url = self.shorten(url)

			message = u"\002Wikipedia ::\002 {} \002::\002 {}".format(info, short_url)
			self.bot.privmsg(msg.channel, message)
		else:
			self.bot.privmsg(msg.channel, "{}: no articles were found.".format(' '.join(msg.args)))
Example #2
    def run(self):
        while True:
            try:
                maxid = self.db.news_list()[0]['id']
            except:
                maxid = 1
            print(maxid)
            client = HTTPClient()
            response = client.fetch('http://cs.hust.edu.cn/rss')
            result = response.body.decode("utf-8",errors='ignore')
            soup = BeautifulStoneSoup(result)
            
            items = soup.find_all('item')
            for item in items:
                title = item.title.text
                link = item.link.text
                desc = item.description.text
                linkid = self.link_id(link)
                if linkid > maxid:
                    result = self.db.add_news(linkid,title,desc,link)
                    if result:
                        result = self.get_article(link)
                else:
                    break

            time.sleep(3600)
Example #3
def analyze_site_map():
    r = requests.get('{}{}sitemap.xml'.format(app.config['WEB_PROTOCOL'],
                                              app.config['DOMAIN']))

    soup = Soup(r.content)
    locs = soup.findAll('loc')
    return [loc.string for loc in locs]
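For reference, a minimal standalone sketch of the same sitemap idea without the Flask app.config dependency (the example.com domain is a placeholder, and the explicit "xml" parser, which requires lxml, is an assumption made to avoid bs4's parser-guessing warning):

import requests
from bs4 import BeautifulSoup

def list_sitemap_urls(domain="example.com"):
    # Fetch the sitemap and return every <loc> value as a plain string
    r = requests.get("https://{}/sitemap.xml".format(domain))
    soup = BeautifulSoup(r.content, "xml")
    return [loc.string for loc in soup.find_all("loc")]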
Example #4
    def loadJATSSentence(self, s, newDocument, par_id, section_id):
        """
            Loads a JATS sentence (already split)

            :param s: the plain text of the sentence (with all tags inside, e.g. <xref>)
            :param newDocument: SciDoc
            :param par_id: id of the paragraph containing this sentence
            :param section_id: id of the section containing the paragraph
        """
        newSent = newDocument.addSentence(par_id, "")
        s_soup = BeautifulStoneSoup(s)

        refs = s_soup.findAll("xref", {"ref-type": "bibr"})
        citations_found = []
        for r in refs:
            citations_found.extend(
                self.loadJATSCitation(r,
                                      newSent["id"],
                                      newDocument,
                                      section=section_id))

        non_refs = s_soup.findAll(
            lambda tag: tag.name.lower() == "xref" and "ref-type" in tag.attrs
            and tag["ref-type"].lower() != "bibr")
        for nr in non_refs:
            nr.name = "inref"

        newSent["citations"] = [acit["id"] for acit in citations_found]
        # TODO replace <xref> tags with <cit> tags
        newSent["text"] = newDocument.extractSentenceTextWithCitationTokens(
            s_soup, newSent["id"])
        ##            print(newSent["text"])
        # deal with many citations within characters of each other: make them know they are a cluster
        # TODO cluster citations? Store them in some other way?
        newDocument.countMultiCitations(newSent)
Example #5
    def _parse_request(self):
        """
        Parses various parameters from _request_xml into _request_params.
        We need to override parse here as Microsoft Azure doesn't send
        AssertionConsumerServiceURL (ACS_URL)
        """
        # Minimal check that the request XML is not still encoded or compressed:
        if not self._request_xml.strip().startswith('<'):
            raise Exception('RequestXML is not valid XML; '
                            'it may need to be decoded or decompressed.')

        soup = BeautifulStoneSoup(self._request_xml)
        request = soup.findAll()[0]

        if request.get('AssertionConsumerServiceURL', None):
            raise Exception(
                'Invalid Azure request. AssertionConsumerServiceURL exists!')

        params = {}
        params['ACS_URL'] = AZURE_ACS_URL
        params['REQUEST_ID'] = request.get('id', request.get('ID'))

        params['REQUEST_ISSUER'] = self._get_request_issuer(request)

        params['DESTINATION'] = request.get('Destination', '')
        params['PROVIDER_NAME'] = request.get('ProviderName', '')

        self._request_params = params

        # Set subject format - overrides the value set in _reset()
        self._subject_format = AZURE_SUBJECT_FORMAT
Example #6
    def extract_tags_bs4(self):
        """
        Using: BeautifulSoup's XML parser
        Returns XML data in dict format 
        """
        soup = Soup(self.query_xml) # XML as a string
        self.entries = soup.findAll('entry') # list of <entry>'s
        find_authors = lambda x: x.find('name').string
        for entry in self.entries:
            # strip down entry ID in url to (say) -> 'abs/math/0507289v1'
            entry_id = urlparse(entry.find('id').string).path.lstrip('/') 
            title = entry.find('title').string
            summary = entry.find('summary').string
            # findAll() for multiple entries 
            authors = entry.findAll('author') 
            # returns list of data-type: BeautifulSoup.Tag
            # PYLINT chatters: authors = map(self.find_authors, authors)
            # using list comprehension instead
            authors = [find_authors(i) for i in authors]

            published = entry.find('published').string
            meta = { 'title': title, 'summary': summary, \
                  'authors': authors, 'published': published }
            self.data[entry_id] = meta

        return self.data # python dict
Example #7
class FollowThatTag(SoupTest):
    "Tests the various ways of fetching tags from a soup."

    def setUp(self):
        ml = Display.write(self)
        self.soup = BeautifulStoneSoup(ml)

    def testFindAllByName(self):
        matching = self.soup('a')
        self.assertEqual(len(matching), 2)
        self.assertEqual(matching[0].name, 'a')
        self.assertEqual(matching, self.soup.findAll('a'))
        self.assertEqual(matching, self.soup.findAll(SoupStrainer('a')))



    def testFindAllText(self):
        soup = BeautifulSoup("<html>\xbb</html>", "lxml")
        self.assertEqual(soup.findAll(text=re.compile('.*')),
                         [u'\xbb'])

    def testTextNavigation(self):
        soup = BeautifulSoup('<url>http://cdn.sstatic.net/Sites/stackoverflow/img/favicon.ico?v=4f32ecc8f43d</url><title>Small funded Boston start-up seeks Senior Python/Django Developer at Circulation (Boston, MA)</title>', "lxml")
        baz = soup.find(text='Small funded Boston start-up seeks Senior Python/Django Developer at Circulation (Boston, MA)')
        self.assertEqual(baz.findParent("title").string, baz)
Example #8
    def _parse_html(self, url):
        '''
        Loads the page at url and processes every link found on it.
        Recursively calls itself for the first link that has not been
        processed yet, so the whole site eventually gets parsed.

        '''

        html = None
        page_content = None

        self._processed.add(url)
        self._recursion_counter += 1

        try:
            with urllib.request.urlopen(url) as response:
                html = response.read()
        except Exception:
            html = None
            print('Unable to load url %s' % url)

        if html:
            try:
                page_content = Soup(html)
            except Exception:
                page_content = None

        if page_content:
            stop_list = ('#', '', '/')

            for a in page_content.find_all('a', href=True):
                if a['href'] not in stop_list:
                    href = self._build_link(url=a['href'], location_parts=urlparse(url))
                    if href:
                        try:
                            self._url_validator(href)
                        except ValidationError:
                            print('%s is not valid url' % href)
                        else:
                            self._finds.add(href)

            self._add_location(url)

        unprocessed = self._finds - self._processed

        print('Всего страниц: %s. Обработано страниц: %s. Найдено рецептов: %s. Последний URL: %s' %
            (len(self._finds), len(self._processed), len(self._urls), url))

        # Save self._urls on every 20th call of this method
        if self._recursion_counter % 20 == 0:
            self._save()
            self._recursion_counter = 0

        if unprocessed:
            if self.sleep_time > 0:
                time.sleep(self.sleep_time)

            next_url = list(unprocessed)[0]
            self._parse_html(next_url)
Example #9
 def get_all_urls(self):
     """Возвращает список url"""
     list_of_urls = []
     for url in self.urls:
         request = self.session.get(url, headers=self.headers)
         soap = BeautifulStoneSoup(request.content)
         urls = soap.find_all('loc')
         list_of_urls += [url.next_element for url in urls]
     return list_of_urls
Example #10
    def read(self, xml, identifier):
        """
            Load a JATS/NLM (PubMed) XML into a SciDoc.

            :param xml: full xml string
            :type xml: basestring
            :param identifier: an identifier for this document, e.g. file name
                        If an actual full path, the path will be removed from it
                        when stored
            :type identifier: basestring
            :returns: :class:`SciDoc <SciDoc>` object
            :rtype: SciDoc
        """
        # this solves a "bug" in BeautifulStoneSoup with "sec" tags
        BeautifulStoneSoup.NESTABLE_TAGS["sec"] = []
        #xml=fixNumberCitationsXML(xml)
        soup = BeautifulStoneSoup(xml)

        # Create a new SciDoc to store the paper
        newDocument = SciDoc()
        metadata = newDocument["metadata"]
        metadata["filename"] = os.path.basename(identifier)
        metadata["original_citation_style"] = detectCitationStyle(xml)

        body = soup.find("body")
        if not body:
            # TODO: Make the error handling less terrible
            debugAddMessage(newDocument, "error",
                            "NO <BODY> IN THIS PAPER! file: " + identifier)
            newDocument["metadata"]["guid"] = cp.Corpus.generateGUID()
            return newDocument

        # Load metadata, either from corpus or from file
        self.loadJATSMetadataFromPaper(newDocument, soup)
        metadata["guid"] = cp.Corpus.generateGUID(metadata)

        # Load all references from the XML
        back = soup.find("back")
        if back:
            ref_list = back.find("ref-list")
            # other things in <back> like appendices: ignore them for now
            if ref_list:
                for ref in ref_list.findAll("ref"):
                    self.loadJATSReference(ref, newDocument)

        newDocument.updateReferences()

        # Load Abstract
        self.loadJATSAbstract(soup, newDocument)

        for sec in body.findChildren("sec", recursive=False):
            self.loadJATSSection(sec, newDocument, "root")

        newDocument.updateAuthorsAffiliations()
        return newDocument
Example #11
    def open(self, book_id=None):
        if book_id:
            self.book_id = book_id
        if not self.book_id:
            raise Exception('Book id not set')

        self.size = os.path.getsize(self._FILE % (self.book_id, self.book_id))
        sz_mult = 1.0 / (1024**2)
        result = u'%.1f' % (self.size * sz_mult)
        self.size = u'<0.1' if result == u'0.0' else result

        self.f = zipfile.ZipFile(self._FILE % (self.book_id, self.book_id),
                                 'r')
        soup = BeautifulStoneSoup(self.f.read('META-INF/container.xml'))

        oebps = soup.findAll('rootfile')[0]['full-path']
        folder = oebps.rfind(os.sep)
        self.oebps_folder = '' if folder == -1 else oebps[:folder +
                                                          1]  # name of the oebps folder

        oebps_content = self.f.read(oebps)
        self.read_doc_props(oebps_content)

        opf_bs = BeautifulStoneSoup(oebps_content)
        ncx = opf_bs.findAll('item', {'id': 'ncx'})[0]
        ncx = self.oebps_folder + ncx['href']  # full path to the ncx file

        ncx_bs = BeautifulStoneSoup(self.f.read(ncx))

        self.chapters = [
            (nav.navlabel.text, nav.content['src'])
            for nav in ncx_bs.findAll('navmap')[0].findAll('navpoint')
        ]
        self.cover_href = self.chapters[0][1]  # cover path
Example #12
def extract_info_from_html():
    from bs4 import BeautifulStoneSoup
    import re
    html = open("data/url.html").read()
    soup = BeautifulStoneSoup(html)
    inputTag = soup.findAll("a")
    inputTag = str(inputTag).split(",")
    m = [re.search(" +href=\"(.*?)\"", i) for i in inputTag]
    urls = [i.group(1) for i in m]

    code = [
        i[9:-9].replace("<", "")
        for i in str(soup.findAll('strong')).split(",")
    ]
    city = [
        i.split('<span class="uni-code">')[0].replace("\t", "").replace(
            "</span>", "").replace("\n", "") for i in html.split(
                '<i class="fa fa-map-marker" aria-hidden="true"></i>')[1:]
    ]
    abbr = [
        i.split('</div>')[0].replace("\t", "").replace("</span>",
                                                       "").replace("\n", "")
        for i in html.split('<div class="name-group">')[1::2]
    ]

    # ADD CODE TO UNI_INFO
    map_abbr_code = [{
        "abbr": m,
        "code": n
    } for m, n in zip(abbr, code) if m != ""]
    import json
    uni = json.load(open("data/university.json"))

    len(uni)
    new_uni = []
    abbrs = []
    for i in uni:
        if (i["abbr"] in abbrs):
            continue
        else:
            for j in map_abbr_code:
                if (j["abbr"] == i["abbr"]):
                    i["code"] = j["code"]
                    break
            new_uni.append(i)
            abbrs.append(i["abbr"])

    with open('data/university_add_code.json', 'w') as outfile:
        json.dump(new_uni, outfile, ensure_ascii=False, indent=4)
Example #13
 def __call__(self, content: str) -> str:
     filenames = []
     soup = BeautifulStoneSoup(content)
     links = soup.table.findAll('a')
     for link in links:
         filenames.append(link.text)
     return '\n'.join(filenames)
Example #14
 def test_beautifulstonesoup_is_xml_parser(self):
     # Make sure that the deprecated BSS class uses an xml builder
     # if one is installed.
     with warnings.catch_warnings(record=True) as w:
         soup = BeautifulStoneSoup("<b />")
     self.assertEqual("<b/>", str(soup.b))
     self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
Example #15
    def fuseReferences(doc, ref):
        """
        """
        prevref = doc["references"][-1]
        doc["metadata"]["ref_replace_list"] = doc["metadata"].get(
            "ref_replace_list", {})
        id = ""
        try:
            id = ref["id"]
            if not id:
                id = prevref["id"]
                if isinstance(id, six.string_types):
                    id = "ref" + str(len(doc["references"]) + 1)
                elif isinstance(id, int):
                    id = id + 1
        except:
            id = "ref" + str(len(doc["references"]) + 1)

        doc["metadata"]["ref_replace_list"][id] = prevref["id"]
        doc["references"].remove(prevref)

        fullstring = re.sub(r"</reference>", "", prevref["xml"], 0,
                            re.IGNORECASE)
        fullstring += re.sub(r"<reference.+?>", "", ref.__repr__(), 0,
                             re.IGNORECASE)
        ##                ref=BeautifulStoneSoup(prevref["xml"]+ref.__repr__())
        ref = BeautifulStoneSoup(fullstring).find("reference")
        processReferenceXML(ref, doc, False)
Example #16
def HTMLEntitiesToUnicode(text):
    """
    Converts HTML entities to unicode.  For example '&amp;' becomes '&'.
    """
    text = BeautifulStoneSoup(text,
                              convertEntities=BeautifulStoneSoup.ALL_ENTITIES)
    return text
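Note that the convertEntities argument comes from the old BeautifulSoup 3 API and that the function returns a soup object rather than a plain string. A rough modern bs4 equivalent, offered only as a sketch and not part of the original example, could be:

from bs4 import BeautifulSoup

def html_entities_to_unicode(text):
    # bs4 decodes entities automatically; get_text() gives back a plain str
    return BeautifulSoup(text, "html.parser").get_text()

# e.g. html_entities_to_unicode("Tom &amp; Jerry") == "Tom & Jerry"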
Example #17
    def read(self, xml, filename):
        """
            Load a document from the Athar corpus

            Args:
                xml: full xml string
        """
        ##        # this solves a "bug" in BeautifulStoneSoup with "sec" tags
        ##        BeautifulStoneSoup.NESTABLE_TAGS["sec"]=[]

        soup = BeautifulStoneSoup(xml)

        paper_data_node = soup.find("div", {"class": "dstPaperData"})
        paper_data = {
            "id": paper_data_node.text,
            "title": "",
            "authors": "",
        }
        title = paper_data_node.find("div", {"class": "dstPaperTitle"})
        if title:
            paper_data["title"] = title.text

        authors = paper_data_node.find("div", {"class": "dstPaperAuthors"})
        if authors:
            author_chunks = authors.text.split(";")
            for author in author_chunks:
                chunks = author.split(",")
                author_dict = {"given": chunks[1], "family": chunks[0]}
            paper_data["authors"] = author_dict


##        print(paper_data)

        all_contexts = []
        all_docs = []
        document_nodes = soup.findAll("table", {"class": "srcPaper"})
        for index, document_node in enumerate(document_nodes):
            try:
                doc, contexts = self.loadDocumentNode(document_node,
                                                      paper_data, index)
                all_docs.append(doc)
                all_contexts.extend(contexts)
            except ValueError:
                print("Error:", sys.exc_info()[1])
                break
        return all_docs, all_contexts
Example #18
 def parse(self, content: str) -> str:
     """Parses web content"""
     filenames = []
     soup = BeautifulStoneSoup(content)
     links = soup.table.findAll('a')
     for link in links:
         filenames.append(link['href'])
     return '\n'.join(filenames)
Example #19
 def __init__(self):
     try:
         # Fetch the XML as a string
         with req.urlopen(self.URL) as open_url:
             soup = BeautifulStoneSoup(open_url.read())
             self.FEED = (series.series_from_xml(soup, 'item'))[2:]
     except Exception as e:
         logger.error("Cannot get a XML-file: %s" % e)
Example #20
    def response_soup(self):
        "Returns a BeautifulSoup object of the response."

        if not self._response_soup:
            self._response_soup = BeautifulStoneSoup(
                str(self._response_content, encoding='utf-8'))

        return self._response_soup
Example #21
    def run(self, file_name, user, **kwargs):
        """
        Parse the given xml file using BeautifulSoup. Save all Article, Redirect and Page objects.
        """
        f = open(file_name, 'r')
        xml = f.read()
        f.close()

        soup = BeautifulStoneSoup(xml)
        items = soup.find_all('item')

        for item in items:
            post_type = item.find('wp:post_type').string
            post_status = item.find('wp:status').string

            if post_type == 'attachment':
                get_media(item, user)
                # Note! This script assumes all the attachments come before
                # posts and pages in the xml. If this ends up changing,
                # do two loops, one with attachments and the second with posts and pages.
            elif post_type == 'post' and post_status == 'publish':
                get_posts(item, user)
            elif post_type == 'page' and post_status == 'publish':
                get_pages(item, user)

        if user.email:
            context = {
                'SITE_GLOBAL_SITEDISPLAYNAME':
                get_setting('site', 'global', 'sitedisplayname'),
                'SITE_GLOBAL_SITEURL':
                get_setting('site', 'global', 'siteurl'),
            }
            subject = ''.join(
                render_to_string(
                    template_name=('notification/wp_import/short.txt'),
                    context=context).splitlines())
            body = render_to_string(
                template_name=('notification/wp_import/full.html'),
                context=context)

            #send_mail(subject, body, settings.DEFAULT_FROM_EMAIL, [user.email], fail_silently=False)
            email = EmailMessage(subject, body, settings.DEFAULT_FROM_EMAIL,
                                 [user.email])
            email.content_subtype = 'html'
            email.send(fail_silently=True)
Example #22
    def parse_data(self, url):
        '''Collects the data into a dictionary'''
        request = self.session.get(url, headers=self.headers)
        if request.status_code == 200:
            soup = BeautifulStoneSoup(request.content)
            if not (bool(soup.find('div', {"class": 'error404__text'}))
                    or bool(soup.find('div', {"class": 'nothing-search'}))
                    or bool(soup.find('div', {"id": 'productList'}))):

                try:
                    name_of_product = soup.find('h1').next_element
                except Exception:
                    raise Format_Exeption('name', url)

                try:
                    price_for_all = soup.find(
                        'span', {
                            "class": "item__price item__price--normal-left"
                        }).next_element.replace(" ", "").replace("\n", "")
                except Exception:
                    price_for_all = "Нет в наличии"
                try:
                    price_for_registered = soup.find(
                        'span', {
                            "class": "item__price item__price--red-bold"
                        }).next_element.replace(" ", "").replace("\n", "")
                except Exception:
                    price_for_registered = "Нет в наличии"

                try:
                    reference = soup.findAll(
                        'div', {"class": "item__card-info-articul"})
                    reference = reference[1].next_element
                    reference = str(reference).split()[2].replace("-", '')
                except Exception:
                    reference = "Нет номера"
                final = {
                    "name_of_product": name_of_product,
                    "price_for_all": price_for_all,
                    "price_for_registered": price_for_registered,
                    "reference": reference,
                    "url": url
                }
                return final
            else:
                print("Не тот формат, вот ссылка {0}".format(url))
                raise Format_Exeption
        else:
            raise Connection_Exception
Example #23
    def render(self, context):
        fancount = ''
        fb_api_url = 'http://api.facebook.com/restserver.php'
        tw_api_url = 'http://api.twitter.com'

        cache_key = ''
        cache_time = 1800

        if self.service == "facebook":
            query = '%s?method=facebook.fql.query&query=SELECT%%20fan_count%%20FROM%%20page%%20WHERE%%20page_id=%s'
            xml_path = query % (fb_api_url, self.service_id)
            cache_key = md5(xml_path.encode()).hexdigest()
            fancount = cache.get(cache_key)
            if not fancount:
                try:
                    xml = urlopen(xml_path)
                    content = xml.read()
                    soup = BeautifulStoneSoup(content)
                    nodes = soup.find_all('page')
                    for node in nodes:
                        fancount = node.fan_count.string
                    cache.set(cache_key, fancount, cache_time)
                except:
                    pass

        if self.service == "twitter":
            query = "%s/1/users/show/%s.xml"
            xml_path = query % (tw_api_url, self.service_id)
            cache_key = md5(xml_path.encode()).hexdigest()
            fancount = cache.get(cache_key)
            if not fancount:
                try:
                    xml = urlopen(xml_path)
                    content = xml.read()
                    soup = BeautifulStoneSoup(content)
                    nodes = soup.find_all('user')
                    for node in nodes:
                        fancount = node.followers_count.string
                    cache.set(cache_key, fancount, cache_time)
                except:
                    pass

        return fancount
Example #24
    def get_info(self, account):
        request = urllib.request.Request(self.info_url)
        response = self.opener.open(request)
        content = response.read().decode(self.character).encode("utf-8")

        file = open('new/' + account + '.html', 'wb')
        file.write(content)
        file.close()

        detail_html = BeautifulStoneSoup(content)
        img_url = detail_html.find(id="Student11_Image1")
        link = img_url.get('src')
        link = link[2:]
        pto_url = 'http://szjy.swun.edu.cn/Sys/SystemForm' + link
        pto_url = pto_url.replace('照片', '%D5%D5%C6%AC')
        urllib.request.install_opener(opener=self.opener)
        img_name = 'photos/' + account + '.jpg'
        urllib.request.urlretrieve(pto_url, img_name)
        self.cookie = self.cookie.clear()
Example #25
    def parseCermineXML(self, xml_string):
        """
            This is meant to load the full output from Cermine, whatever that output contains.
            Currently only reads references.
        """
        soup = BeautifulStoneSoup(xml_string, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
##        print(xml_string)
        references=self.readReferences(soup)
        # TODO implement reading the rest of the Cermine/ParsHed tagging
        return references
Example #26
    def parse(self):
        '''
        Builds a JSON list of the site's recipe urls and saves it
        to MEDIA_ROOT/parser/source.js.
        Depending on the settings it either analyses the sitemap
        or parses the html pages.

        '''

        # Parse using the sitemap
        if hasattr(settings, 'PARSER__URL_SOURCE') and settings.PARSER__URL_SOURCE == 'sitemap':

            xml = None

            if not hasattr(settings, 'PARSER__SITEMAP_URL') or not settings.PARSER__SITEMAP_URL:
                print('PARSER__SITEMAP_URL is not defined')
            else:
                try:
                    with urllib.request.urlopen(settings.PARSER__SITEMAP_URL) as response:
                        xml = response.read()
                except Exception:
                    xml = None

            if xml:
                sitemap = Soup(xml)
                urls = sitemap.findAll('url')
                for u in urls:
                    loc = u.find('loc').string
                    self._add_location(loc)
        else:
            # Parse by crawling the html pages
            if not hasattr(settings, 'PARSER__CELL_HOMEPAGE') or not settings.PARSER__CELL_HOMEPAGE:
                print('PARSER__CELL_HOMEPAGE is not defined')
                return False

            # Counter of recursive calls to _parse_html
            self._recursion_counter = 0

            self._parse_html(settings.PARSER__CELL_HOMEPAGE)

        self._save()

        return self.json_file_path
Example #27
def soup_maker(fh):
    """ Takes a file handler returns BeautifulSoup"""
    try:
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(fh, "lxml")
        for tag in soup.find_all():
            tag.name = tag.name.lower()
    except ImportError:
        from bs4 import BeautifulStoneSoup
        soup = BeautifulStoneSoup(fh)
    return soup
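A hypothetical call to soup_maker; the file name here is a placeholder:

with open("filing.xml") as fh:
    soup = soup_maker(fh)
    # on the bs4/lxml branch all tag names were lower-cased by the loop above,
    # so a plain lowercase lookup is safe
    first_item = soup.find("item")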
Example #28
 def login(self):
     """ Read greeting """
     greeting = self.read()
     soup = BeautifulStoneSoup(greeting, 'lxml')
     svid = soup.find('svid')
     version = soup.find('version')
     print("Connected to %s (v%s)\n" % (svid.text, version.text))
     """ Login """
     xml = commands.login % self.config
     if not self.cmd(xml, silent=True):
         exit(1)
Example #29
    def getSeriesDetailsByName(self, serieName):

        if serieName in IGNORE_SHOWS:
            return None

        print 'checking: ' + serieName

        if serieName in KNOWN_SHOWS.keys():
            url = GET_SERIES_URL % (urllib.quote(
                KNOWN_SHOWS[serieName]['TVDBName']))
        else:
            url = GET_SERIES_URL % (urllib.quote(serieName))

        try:
            # Change the User Agent
            USER_AGENT = 'Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10'

            cj = cookielib.CookieJar()
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))

            req = urllib2.Request(url)
            req.add_header('User-Agent', USER_AGENT)

            resp = opener.open(req)

            soup = BeautifulStoneSoup(resp.read())
            resp.close()

            if len(soup.findAll('series')) == 1:
                self.saveSerieDetail(serieName, soup.series)
            else:
                for serie in soup.findAll('series'):
                    if serie.seriesname.string == serieName:
                        self.saveSerieDetail(serieName, serie)

            if serieName in KNOWN_SHOWS.keys():
                return KNOWN_SHOWS[serieName]
            return None
        except:
            print 'Error: ' + url
            return None
Example #30
def get_text(fileDir):
    document = zipfile.ZipFile(fileDir)
    #xml_content = document.read('content.xml')
    #document.close()

    #xml = parse(document.)
    #xml = parse('inputText/content.xml')
    #print(document.filelist)
    #print(document.open('content.xml'))
    xml = parse(document.open('content.xml'))

    textSoup = BeautifulStoneSoup(document.read('content.xml'))
    #print(textSoup.prettify())
    #print(textSoup.get_text())

    document.close()
    """
    officeText = xml.getElementsByTagName('office:text')

    textFromDoc = []

    if len((officeText[0].childNodes)) != 0:
        for officeNode in officeText[0].childNodes:
            if len(officeNode.childNodes) != 0:
                for nextNode1 in officeNode.childNodes:
                    if len(nextNode1.childNodes) == 0:
                        if nextNode1.nodeValue == None:
                            textFromDoc.append(' ')
                        else:
                            textFromDoc.append(nextNode1.nodeValue)
                    else:
                        for nextNode2 in  nextNode1.childNodes:
                            if len(nextNode2.childNodes) == 0:
                                textFromDoc.append(nextNode2.nodeValue)
    """

    #for node in text:
    #textFromDoc.append(getTextFromTag(node))
    #print(getTextFromTag(node))

    return textSoup.get_text()
Example #31
    def _parse_request(self):
        """
        Parses various parameters from _request_xml into _request_params.
        """
        # Minimal check that the request XML is not still encoded or compressed:
        if isinstance(self._request_xml, bytes):
            request_xml = self._request_xml.decode('utf-8')
        else:
            request_xml = self._request_xml
        if not request_xml.strip().startswith('<'):
            raise Exception('RequestXML is not valid XML; '
                            'it may need to be decoded or decompressed.')

        soup = BeautifulStoneSoup(request_xml)
        request = soup.findAll()[0]
        params = {}
        params['ACS_URL'] = request.get('AssertionConsumerServiceURL')
        params['REQUEST_ID'] = request.get('id', request.get('ID'))
        params['DESTINATION'] = request.get('Destination', '')
        params['PROVIDER_NAME'] = request.get('ProviderName', '')
        self._request_params = params
Example #32
    def getDetailsForSerieByID(self, serieName, serieID):
        url = SERIE_DETAILS_URL % (urllib.quote(serieID))

        try:
            # Change the User Agent
            USER_AGENT = 'Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10'

            cj = cookielib.CookieJar()
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))

            req = urllib2.Request(url)
            req.add_header('User-Agent', USER_AGENT)

            resp = opener.open(req)

            soup = BeautifulStoneSoup(resp.read())
            resp.close()

            for banner in soup.banners.findAll('banner'):
                if banner.language.string == 'en':
                    if not 'Fanart' in KNOWN_SHOWS[serieName].keys(
                    ) and banner.bannertype.string == 'fanart':
                        KNOWN_SHOWS[serieName]['Fanart'] = str(
                            BANNER_URL % (banner.bannerpath.string))
                        if banner.thumbnailpath:
                            KNOWN_SHOWS[serieName]['FanartThumb'] = str(
                                BANNER_URL % (banner.thumbnailpath.string))
                    elif not 'Poster' in KNOWN_SHOWS[serieName].keys(
                    ) and banner.bannertype.string == 'poster':
                        KNOWN_SHOWS[serieName]['Poster'] = str(
                            BANNER_URL % (banner.bannerpath.string))
                        if banner.thumbnailpath:
                            KNOWN_SHOWS[serieName]['PosterThumb'] = str(
                                BANNER_URL % (banner.thumbnailpath.string))
                    elif not 'Season' in KNOWN_SHOWS[serieName].keys(
                    ) and banner.bannertype.string == 'season':
                        KNOWN_SHOWS[serieName]['Season'] = str(
                            BANNER_URL % (banner.bannerpath.string))
                        if banner.thumbnailpath:
                            KNOWN_SHOWS[serieName]['SeasonThumb'] = str(
                                BANNER_URL % (banner.thumbnailpath.string))
                    elif not 'Series' in KNOWN_SHOWS[serieName].keys(
                    ) and banner.bannertype.string == 'series':
                        KNOWN_SHOWS[serieName]['Series'] = str(
                            BANNER_URL % (banner.bannerpath.string))
                        if banner.thumbnailpath:
                            KNOWN_SHOWS[serieName]['SeriesThumb'] = str(
                                BANNER_URL % (banner.thumbnailpath.string))

            return KNOWN_SHOWS[serieName]
        except:
            print 'Error: ' + url
            return None
Example #33
    def open(self, book_id=None):
        if book_id:
            self.book_id = book_id
        if not self.book_id:
            raise Exception('Book id not set')

        self.size = os.path.getsize(self._FILE % (self.book_id, self.book_id))
        sz_mult = 1.0/(1024**2)
        result = u'%.1f' % (self.size * sz_mult)
        self.size = u'<0.1' if result == u'0.0' else result

        self.f = zipfile.ZipFile(self._FILE % (self.book_id, self.book_id), 'r')
        soup = BeautifulStoneSoup(self.f.read('META-INF/container.xml'))

        oebps = soup.findAll('rootfile')[0]['full-path']
        folder = oebps.rfind(os.sep)
        self.oebps_folder = '' if folder == -1 else oebps[:folder+1]   # name of the oebps folder

        oebps_content = self.f.read(oebps)
        self.read_doc_props(oebps_content)

        opf_bs = BeautifulStoneSoup(oebps_content)
        ncx = opf_bs.findAll('item', {'id': 'ncx'})[0]
        ncx = self.oebps_folder + ncx['href']     # full path to the ncx file

        ncx_bs = BeautifulStoneSoup(self.f.read(ncx))

        self.chapters = [(nav.navlabel.text, nav.content['src']) for
                         nav in ncx_bs.findAll('navmap')[0].findAll('navpoint')]
        self.cover_href = self.chapters[0][1]    # cover path
Example #34
	def trigger_w(self, msg):
		"Usage: w <search term>. Prints a short description of the corresponding wikipedia article."
		if len(msg.args) == 0:
			self.bot.notice(msg.nick, "Please specify a search term")
			return

		params = {
			'action': 'opensearch',
			'format': 'xml',
			'limit': '2',
			'search': ' '.join(msg.args)
		}
		url = 'http://{:s}.wikipedia.org/w/api.php'.format(self.language)

		response = BeautifulStoneSoup(requests.post(url, data=params).text)

		# BS4 is case-sensitive, hence all the regexes.
		if response.find(re.compile('text', re.I)):
			index = 0
			if "may refer to:" in response.find(re.compile('description', re.I)).string:
				index = 1

			info = response.find_all(re.compile('description', re.I))[index].string.strip()
			url = response.find_all(re.compile('url', re.I))[index].string

			short_url = self.shorten(url)

			message = u"\002Wikipedia ::\002 {} \002::\002 {}".format(info, short_url)
			self.bot.privmsg(msg.channel, message)
		else:
			self.bot.privmsg(msg.channel, "{}: no articles were found.".format(' '.join(msg.args)))
Example #35
    def parse_data(self, url):
        '''Collects the data into a dictionary'''
        request = self.session.get(url, headers=self.headers)
        if request.status_code == 200:
            soap = BeautifulStoneSoup(request.content)
            if not (bool(soap.find('table', {"class": 'map-columns'})) or bool(
                    soap.find('div', {"class": 'col-md-12 catalog-items'}))):
                try:
                    name_of_product = soap.find('h1', {
                        'class': 'title'
                    }).next_element
                except Exception:
                    raise Format_Exeption('name', url)

                try:
                    price_for_all = soap.find('div', {
                        "class": "price"
                    }).next_element.replace(" ", "").replace("\n", "")[:-1]
                except Exception:
                    price_for_all = "Нет в наличии"
                try:
                    price_for_rozn = soap.find('div', {
                        "class": "rozn-price"
                    }).next_element.replace(" ", "").replace("\n", "")[:-1]
                    price_for_rozn = ''.join(
                        filter(str.isdigit, price_for_rozn))
                except Exception:
                    price_for_rozn = "Нет в наличии"
                try:
                    reference = soap.find('div', {
                        'class': 'article'
                    }).next_element.replace("-", '')[9:]
                except Exception:
                    reference = "Нет номера"

                final = {
                    "name_of_product": name_of_product,
                    "price_for_all": price_for_all,
                    "price_for_registered": price_for_rozn,
                    "reference": reference,
                    "url": url
                }
                return final
            else:
                print("Не тот формат, вот ссылка {0}".format(url))
                raise Format_Exeption
        else:
            raise Connection_Exception
Example #36
 def HTMLEntitiesToUnicode(self, text):
     """
     Converts HTML entities to unicode.  For example '&amp;' becomes '&'.
     Args:
         text: HTML laden text to convert to unicode
     Returns:
         String converted to unicode
     """
     try:
         text = str(BeautifulStoneSoup(text, convertEntities=BeautifulStoneSoup.ALL_ENTITIES))
         return text
     except Exception as e:
         print("error formatting string: %s ; Errors:  %s" % text, e)
         return None
Example #37
def parse(word: str, soup: BeautifulStoneSoup) -> dict:
    entries = []
    word = {'word': word, 'entries': entries}
    for entry in soup.find_all(class_='ldoceEntry Entry'):
        entries.append({})
        last_entry = entries[-1]
        with suppress(AttributeError):
            american_pron = entry.find(class_='AMEVARPRON')
            american = f'/{american_pron.text.strip()}/' if american_pron else ''
            last_entry['pron'] = '/{english}/ {american}'.format(
                english=entry.find(class_='PRON').text.strip(),
                american=american,
            ).rstrip()
        try:
            last_entry['pos'] = entry.find(class_='POS').text.strip()
        except AttributeError:
            entries.pop()
            continue

        senses = last_entry['senses'] = []
        for sense in entry.find_all(class_='Sense'):
            senses.append({})
            last_sense = senses[-1]
            try:
                last_sense['definition'] = sense.find(
                    class_='DEF').text.strip()
            except AttributeError:
                try:
                    last_sense['definition'] = sense.find(
                        class_='REFHWD').text.strip()
                except AttributeError:
                    senses.pop()
                    continue

            find_rel = sense.find(class_='RELATEDWD')
            if find_rel:
                last_sense['rel'] = find_rel.text.strip()[2:]

            find_syn = sense.find(class_='SYN')
            if find_syn:
                last_sense['syn'] = find_syn.text.strip()[4:]

            find_opp = sense.find(class_='OPP')
            if find_opp:
                last_sense['opp'] = find_opp.text.strip()[4:]

            last_sense['examples'] = [
                e.text.strip() for e in sense.find_all(class_='EXAMPLE')
            ]
    return word
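A hypothetical driver for the parser above; the Longman dictionary URL and the lookup word are assumptions used purely for illustration:

import requests
from bs4 import BeautifulSoup

html = requests.get("https://www.ldoceonline.com/dictionary/make").text  # assumed source
entry_data = parse("make", BeautifulSoup(html, "html.parser"))
for entry in entry_data["entries"]:
    if entry["senses"]:
        print(entry["pos"], "-", entry["senses"][0]["definition"])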
Example #38
from bs4 import BeautifulStoneSoup

import json
import os

# Rip tags from dumped evernote file
markup = open('../data/aaronsw.enex').read()
soup = BeautifulStoneSoup(markup)

posts = soup.find_all('note')

tagged_posts = [i for i in posts if len(i.find_all('tag')) > 0]
tagged_posts_dict = {}

for post in tagged_posts:
    post_id = post.find_all('title')[0].text

    tags = [tag.text for tag in post.find_all('tag')]

    print post_id, tags
    tagged_posts_dict[post_id] = tags


# Add tags to blog_posts.json
blog_posts_file = open(os.path.join('..','data','blog_posts.json'),'r+')
blog_posts = json.loads(blog_posts_file.read())

for post_title,post in blog_posts.iteritems():
    post_tags = tagged_posts_dict.get(post['postid'],[])
    blog_posts[post_title]['tags'] = post_tags
Example #39
                    app_name = 'cyREST'
                item.version.string.replace_with(ver_map[app_name])

# Read versions from output of shell script
VER_FILE = './' + build_dir + '/apps/versions.txt'
ver_map = {}

with open(VER_FILE, 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        ver_map[row[0]] = row[1]

print(ver_map)

XMLFILE = './' + build_dir + '/cytoscape/gui-distribution/assembly/pom.xml'

f = open(XMLFILE, 'r')
soup = BeautifulStoneSoup(f.read())
f.close()

res = soup.build.plugins.find_all('plugin')
print(type(res))

for r in res:
    p = r.artifactId
    if p.text == 'maven-dependency-plugin':
        replaceVer(r)

with open(XMLFILE, "w+b") as file:
    file.write(soup.prettify('utf-8', formatter='xml'))
Example #40
#-*- coding: utf-8 -*-

import MySQLdb

from bs4 import BeautifulStoneSoup

db = MySQLdb.connect('localhost', 'root', '80671551192', 'test')
cursor = db.cursor()
xml_cinema = open('dumps/cinema.xml')
soup = BeautifulStoneSoup(xml_cinema)

for i in soup.findAll('cinema'):
	id = int(i['id'])
	cinema = i['name'].encode('utf-8')
	city_id = int(i['id'])
	cinema_circuit_id = ''
	street_type_id = ''
	street_name = ''
	number_housing = ''
	number_hous = ''
	letter_housing = ''
	try:
		zip = int(i.zip['value'])
	except ValueError:
		zip = 0
	opening = ''
	note = ''
	code = ''

	coding = "SET NAMES 'utf8'"
	cursor.execute(coding)
Example #41
    # octopress will not show comment input??
    # ex> open, closed
    wp_comment_status = _(item.find("comment_status"))
    out.write(u'comments: %s\n' % ('true' if wp_comment_status == u'open' else 'false'))

    # end of yaml header
    out.write(u'---\n')

    content = _(item.find("encoded"))
    content = to_markdown(content.strip())
    out.write(content)

    out.close()

if __name__ == '__main__':

    if DEBUG:
        if os.access(LOGFILE, os.F_OK):
            os.remove(LOGFILE)

    # if len(sys.argv) > 1:
    #     XML = sys.argv[1]

    print 'loading...'
    soup = BeautifulStoneSoup(open(XML), features="xml")
    print 'parsing...'
    for item in soup.find_all("item"):
        parse_item(item)
    print 'done'