Example #1
    def parse_begin_xxx(self, m, root):
        symbol = m.group(1)
        if symbol in ['html', 'HTML']:
            new_tag = BeautifulSoup(m.group(2), 'html.parser').contents[0]
        elif symbol in ['example', 'EXAMPLE']:
            new_tag = self.soup.new_tag('pre')
            new_tag['class'] = 'example'
            new_tag.string = m.group(2)
        elif symbol in ['quote', 'QUOTE']:
            new_tag = self.soup.new_tag('blockquote')
            # new_tag.string = m.group(2)
            for part in re.split('\n{2,}', m.group(2)):
                new_p_tag = self.soup.new_tag('p')
                new_p_tag.string = part
                new_tag.append(new_p_tag)
        elif symbol in ['verse', 'VERSE']:
            new_tag = self.soup.new_tag('p')
            new_tag['class'] = 'verse'
            new_tag.string = m.group(2)
        elif symbol in ['center', 'CENTER']:
            new_tag = self.soup.new_tag('div')
            new_tag['class'] = 'center'
            new_tag.string = m.group(2)
        else:
            raise RuntimeError('Unsupported begin symbol: %s' % symbol)

        root.append(new_tag)
Example #2
    def generate_rss_item(self):
        item = BeautifulSoup(features="xml").new_tag("item")
        bare_tags = {
            "title": self.name,
            "itunes:duration": self.duration,
            "description": self.description,
            "itunes:subtitle": self.description,
            "itunes:summary": self.description,
        }

        for t, v in bare_tags.items():
            tag = BeautifulSoup(features="xml").new_tag(t)
            tag.string = v if v is not None else ""
            item.append(tag)

        guid = BeautifulSoup(features="xml").new_tag("guid", isPermaLink="false")
        guid.string = self.storage_key
        item.append(guid)

        url = f"{CDN_BASE_URL}/{self.storage_key}"

        item.append(
            BeautifulSoup(features="xml").new_tag(
                "enclosure", url=url, type="audio/mpeg"
            )
        )

        return item
Example #3
    def page_soupify(self, working_dir, html_filename):
        """
        Assumption: working only with normal bootstrap-type file
        organization. That is,
        if the index.html file is located at a directory
        /home/user/examples, then only include javascript and CSS
        from directories of type /home/user/examples/js, that is,
        only one level deep.
        """
        if not working_dir.endswith('/'):
            working_dir += '/'
        file_fullpath = working_dir + html_filename

        with open(file_fullpath) as html_file:
            self.index_soup = BeautifulSoup(html_file, 'html.parser')

        #NOTE(rushiagr): Assumes that all the <link> tags inside <head>
        # are for CSS files which lie locally!
        
        # Create a <style> tag for every <link> tag

        links = self.index_soup.head.find_all('link')
        
        for i in range(len(links)):
            link_media = links[i].get('media')
            style_tag = self.index_soup.new_tag(
                'style', media=link_media, type='text/css'
            )
            with open(working_dir + links[i].get('href')) as css_file:
                style_data = css_file.read()
            style_tag.string = style_data
            self.index_soup.head.append(style_tag)

        for i in range(len(links)):
            self.index_soup.head.link.decompose()
        
        # Create a <script> tag, which contains ALL the javascript embedded
        # in it, for every existing <script> tag. As you can see, the method
        # is going to be slightly different than above.
        
        scripts = self.index_soup.head.find_all('script')
        script_filenames = []
        
        for script in scripts:
            script_filenames.append(script.get('src'))
        for i in range(len(scripts)):
            self.index_soup.head.script.decompose()
        for i in range(len(scripts)):
            script_tag = self.index_soup.new_tag('script')
            with open(working_dir + script_filenames[i]) as js_file:
                script_data = js_file.read()
            script_tag.string = script_data
            self.index_soup.head.append(script_tag)
        
        outfile = open(file_fullpath[:-5]+'_output.html', 'w')
        outfile.write(self.index_soup.prettify())
        outfile.close()
Example #4
def process(filename):
    with open(filename) as f:
        soup = BeautifulSoup(f, "lxml")

    div = soup.find("div", class_="wy-side-nav-search")
    a = BeautifulSoup(
        """<a href="http://www.helsinki.fi" style="margin-bottom: 0px;"><img src="https://uni.materialbank.net/NiboWEB/uni/getPublicFile.do?uuid=146263&amp;inline=false&amp;ticket=8a2a112700dc87abd2813d55e149bc0c&amp;type=original" style="margin-bottom: 0px;max-width: 60%;height: auto;width: auto;"></a>""",
        "html.parser")
    div.insert(0, a)

    divs = soup.find_all("div", class_="admonition")

    for d in divs:
        if len(d.contents) != 1:
            continue
        m = re.match(r"\n*(Exercise \d+ \([\w ]+\))", d.contents[0].string)
        if m:
            exercise = m[1]
            a = soup.new_tag("a", id=exercise.replace(" ", "-"))
            a.string = exercise
            d.string = ""
            d.append(a)
            #d.string = '<a name="%s">%s</a>' % (exercise.replace(" ", "-"), exercise)
            #print("\n", d)

    with open(filename, "w") as f:
        f.write(str(soup))
Example #5
 def _processing_attachment(self, matched):
     file_path = matched.group('post_path')
     download_url = 'http://{tistory_url}/attachment/{pre_path}@{post_path}'.format(
         tistory_url='{user_name}.tistory.com'.format(user_name=self.user_name),
         pre_path=matched.group('pre_path'),
         post_path=file_path)
     dir_path = self.dir_path
     download_path = '{dir_path}/{file_path}'.format(
         dir_path=dir_path,
         file_path=file_path
     )
     self._file_data_to_download.append((download_url, dir_path, file_path))
     if 'image/jpeg' in matched.group('attr'):
         return u'<img src="{site_url_tmpl}{download_path}" {attr}>'.format(
             site_url_tmpl='{{site.url}}/',
             download_path=download_path,
             attr=matched.group('attr'))
     else:
         tag = u'<a href="{site_url_tmpl}{download_path}" {attr}></a>'.format(
             site_url_tmpl='{{site.url}}/',
             download_path=download_path,
             attr=matched.group('attr'))
         soup = BeautifulSoup(tag).a
         soup.string = soup['filename']
         return unicode(soup)
Example #6
def gen_nojs(sibling):
    nojs_link = BeautifulSoup().new_tag('a')
    nojs_link['href'] = '/window?location=' + sibling['href']
    nojs_link['style'] = 'display:block;width:100%;'
    nojs_link.string = 'NoJS Link: ' + nojs_link['href']
    sibling.append(BeautifulSoup('<br><hr><br>', 'html.parser'))
    sibling.append(nojs_link)
Example #7
def rando():
    #clip off the last semicolon, then split up the separate queries
    reqs = request.query_string.decode('utf-8')[:-1].split(";")
    querry = []
    #make a list of queries to quinterest
    for r in reqs:
        querry.append(formatreq(r))
    if (len(querry) > 25):
        querry = querry[:25]
    questions = []
    for q in querry:
        out = get("http://quinterest.org{}".format(q)).text
        out = BeautifulSoup(out,
                            'html.parser').find_all(attrs={"class": "row"})
        out.pop(0)
        for e in out:
            #insert the query and replace button at the end of the question
            querystr = q[23:]
            querystr = sub('amount=[0-9]+', 'amount=1', querystr)
            span = BeautifulSoup(
                '<span class="subjTag" style="display:none"></span>').span
            repbutton = BeautifulSoup(
                '<button class="btn repbutton" onclick="replaceQuestion($(this))">Replace This Question</button>'
            ).button
            span.string = querystr
            e.div.append(span)
            e.div.append(repbutton)
            questions.append(str(e))
    questions = processQuestions(questions)
    return ("<br>".join(questions))
Example #8
    def find_all_p(self, segment):
        def skip_p(p):
            text_is_unicode_space = lambda x: len(x) <= 2 and (chr(194) in x or chr(160) in x)
            no_text = p.text == "" or p.text == "\n" or p.text.replace(" ", "") == "" or text_is_unicode_space(
                p.text.encode('utf-8'))
            return no_text and not p.find("img")

        ps = segment.find_all("p")
        new_ps = []
        temp_p = ""
        for p_n, p in enumerate(ps):
            if skip_p(p):
                continue
            elif len(p.text.split()) == 1 and re.compile(u"^.{1,2}[\).]").match(p.text):  # make sure it's in form 1. or ש.
                temp_p += p.text
            elif p.find("img"):
                img = p.find("img")
                if "pages/images/hard.gif" == img.attrs["src"]:
                    temp_p += "*"
                elif "pages/images/harder.gif" == img.attrs["src"]:
                    temp_p += "**"
            else:
                if temp_p:
                    temp_tag = BeautifulSoup("<p></p>", "lxml")
                    temp_tag = temp_tag.new_tag("p")
                    temp_tag.string = temp_p
                    temp_p = ""
                    p.insert(0, temp_tag)
                new_ps.append(p)

        return new_ps
Example #9
def rando():
    #clip off the last semicolon, then split up the separate queries
    reqs = request.query_string.decode('utf-8')[:-1].split(";")
    querry = []
    #make a list of queries to quinterest
    for r in reqs:
        querry.append(formatreq(r))
    if(len(querry) > 25):
        querry = querry[:25]
    questions = []
    for q in querry:
        out = get("http://quinterest.org{}".format(q)).text
        out = BeautifulSoup(out, 'html.parser').find_all(attrs={"class":"row"})
        out.pop(0)
        for e in out:
            #insert the query and replace button at the end of the question
            querystr = q[23:]
            querystr = sub('amount=[0-9]+','amount=1',querystr)
            span = BeautifulSoup('<span class="subjTag" style="display:none"></span>').span
            repbutton = BeautifulSoup('<button class="btn repbutton" onclick="replaceQuestion($(this))">Replace This Question</button>').button
            span.string = querystr
            e.div.append(span)
            e.div.append(repbutton)
            questions.append(str(e))
    questions = processQuestions(questions)
    return ("<br>".join(questions))
Example #10
def encodeScript(line):
    sc = BeautifulSoup(line, "html.parser").find("script")
    if(sc.get("src")):
        sc["src"] = encodeBase64(sc.get("src"))
    else:
        sc.string = pattern.sub(
            lambda x:  repr(encodeBase64(x.group(2), dirname)), sc.string)
    return sc.prettify()
Example #11
    def generate_rss_channel(self):
        channel = BeautifulSoup(features="xml").new_tag("channel")
        bare_tags = {
            "title": self.name,
            "description": self.description,
            "language": "en-us",
            "docs": "http://www.rssboard.org/rss-specification",
            "generator": "myself",
            "lastBuildDate": datetime.now().ctime(),
        }
        for t, v in bare_tags.items():
            tag = BeautifulSoup(features="xml").new_tag(t)
            tag.string = v
            channel.append(tag)

        # Links
        lt = BeautifulSoup(features="xml").new_tag("link")
        lt.string = self.url
        channel.append(lt)

        lta = BeautifulSoup(features="xml").new_tag(
            "atom:link", href=self.url, rel="self"
        )
        channel.append(lta)

        # iTunes category and friends
        cat = BeautifulSoup(features="xml").new_tag(
            "itunes:category", text="Technology"
        )
        cat.append(
            BeautifulSoup(features="xml").new_tag("itunes:category", text="Podcasting")
        )
        channel.append(cat)

        channel.append(
            BeautifulSoup(features="xml").new_tag(
                "itunes:image",
                href="https://timbrook-podcast.sfo2.digitaloceanspaces.com/podcover.png",
            )
        )
        expl = BeautifulSoup(features="xml").new_tag("itunes:explicit")
        expl.string = "yes"
        channel.append(expl)

        return channel
Example #12
def format(content):
    bs = BeautifulSoup(content, "html.parser")
    if bs.div is None:
        for _img in bs.find_all("img"):
            tex = BeautifulSoup("", "html.parser").new_tag("tex")
            tex.string = "\\" + _img["latex"]
            _img.replace_with(tex)
    strs = str(bs)
    return strs
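
A minimal usage sketch for format() above; the sample <img latex="..."> markup is an assumption made for illustration and is not taken from the original project.

from bs4 import BeautifulSoup

sample = '<p>Euler: <img latex="alpha"/> and <img latex="beta"/></p>'
print(format(sample))
# With no <div> present, each <img latex="..."> becomes a <tex> tag:
# <p>Euler: <tex>\alpha</tex> and <tex>\beta</tex></p>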
Example #13
def get_trans_text():
    url = 'https://translate.google.cn/#view=home&op=translate&sl=zh-CN&tl=en&text=%3Cdiv%20class%3D%22dpl-box-title%22%3E%0A%20%20%20%20%20%20%20%20%20%20%20%20%E8%B4%A7%E5%93%81%E7%B1%BB%E5%9E%8B%0A%20%20%20%20%20%20%20%20%3C%2Fdiv%3E'
    # req = request.urlopen(url)
    wd = webdriver.Chrome(executable_path=os.path.join(
        os.path.dirname(__file__), 'library/chromedriver.exe'))
    wd.get(url)
    time.sleep(10)
    html_text = wd.page_source
    wd.quit()
    print(html_text)
    soup = BeautifulSoup(html_text, features="html.parser")
    print(soup.get_text())
Example #14
def _merge_consecutive_symbols(consecutive_char_sequence: List[BeautifulSoup],
                               consecutive_char_indices: List[int],
                               base_tag: BeautifulSoup) -> NodeSymbol:
    base_tag['s2:start'] = consecutive_char_sequence[0]['s2:start']
    base_tag['s2:end'] = consecutive_char_sequence[-1]['s2:end']
    base_tag['s2:index'] = consecutive_char_sequence[0]['s2:index']
    base_tag.string = ''.join(
        list(map(lambda node: node.string, consecutive_char_sequence)))
    node_clone = _clean_node_of_annotations(base_tag)
    return NodeSymbol(characters=consecutive_char_indices,
                      mathml=str(node_clone),
                      node=base_tag)
Example #15
def get_dc_row(element, qualifier, value):
    """ Parameters:
      element - xml element 
      qualifier - xml qualifier 
      value - value to be written in the xml file for the the specific element and qualifier
      Returns:
      The newly created xml file row with parsed from the supplied information
      <dcvalue element="date" qualifier="issued">2018-04</dcvalue>
  """
    row = BeautifulSoup("<dcvalue></dcvalue>", "xml").dcvalue
    row['element'] = element
    row['qualifier'] = qualifier
    row.string = value
    return row
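
A minimal usage sketch for get_dc_row() above; the surrounding <dublin_core> wrapper and the sample values are assumptions for illustration only.

from bs4 import BeautifulSoup

doc = BeautifulSoup('<dublin_core schema="dc"></dublin_core>', 'xml')
doc.dublin_core.append(get_dc_row('date', 'issued', '2018-04'))
doc.dublin_core.append(get_dc_row('title', 'none', 'Sample record'))
print(doc.prettify())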
Example #16
def append_nojs(result: BeautifulSoup) -> None:
    """Appends a no-Javascript alternative for a search result

    Args:
        result: The search result to append a no-JS link to

    Returns:
        None

    """
    nojs_link = BeautifulSoup(features='html.parser').new_tag('a')
    nojs_link['href'] = f'/{Endpoint.window}?location=' + result['href']
    nojs_link.string = ' NoJS Link'
    result.append(nojs_link)
Example #17
def get_header_to_link(html):
    for title in html.find_all('h3'):
        # Add a link to search on how to do the achievement
        link = BeautifulSoup().new_tag(
            "a",
            href=f'http://www.google.com/search?q=halo+{title.string}+achievement')

        # Open in new tab on click
        link["target"] = "_blank"

        link.string = f'{title.string}'
        title.string.replace_with(link)

    return str(html)
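
A minimal usage sketch for get_header_to_link() above; the sample heading is an assumption for illustration (note that the query string is left un-encoded, exactly as in the function).

from bs4 import BeautifulSoup

page = BeautifulSoup('<h3>Pillar of Autumn</h3>', 'html.parser')
print(get_header_to_link(page))
# The heading text is now wrapped in a Google-search link that opens in a new tab.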
Example #18
def append_nojs(result: BeautifulSoup) -> None:
    """Appends a no-Javascript alternative for a search result

    Args:
        result: The search result to append a no-JS link to

    Returns:
        None

    """
    nojs_link = BeautifulSoup(features='html.parser').new_tag('a')
    nojs_link['href'] = '/window?location=' + result['href']
    nojs_link['style'] = 'display:block;width:100%;'
    nojs_link.string = 'NoJS Link: ' + nojs_link['href']
    result.append(BeautifulSoup('<br><hr><br>', 'html.parser'))
    result.append(nojs_link)
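
A minimal usage sketch for append_nojs() above; the sample search-result markup is an assumption for illustration.

from bs4 import BeautifulSoup

soup = BeautifulSoup('<a href="https://example.com">Example result</a>',
                     'html.parser')
append_nojs(soup.a)
print(soup.a)
# The original link now carries a trailing <br/><hr/><br/> plus a
# "NoJS Link: /window?location=https://example.com" anchor.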
Example #19
def getitem(speech, command, data):
    parse_count = 0
    no_check = 0
    tag = None

    # default tags
    for item in tag_mapper:
        if item in speech:
            parse_count = speech.find(item)
            tag = data.new_tag(tag_mapper[item])
            break
    else:
        # snippets
        items = os.listdir(cache_dir)
        for item in items:
            tmp = item.split(".")[0]
            if tmp in speech:
                parse_count = speech.find(tmp)
                with open(cache_dir + item, "r") as f:
                    tag = BeautifulSoup(f.read(), features="html.parser")
                    no_check = 1
                break

    speech = speech[parse_count:].split()

    try:
        if not no_check:
            check_content = speech.index("content")
            tag.string = speech[check_content + 1]
            check_class = speech.index("class")
            class_ = speech[check_class + 2]
            tag["class"] = class_
    except ValueError:
        pass
    if data and tag:
        data.body.append(tag)
        savefile(data)
        speak("Sucessfully added to the html")
    else:
        print(data, "#############", tag)
        speak("Invalid command")
    return data
Example #20
 def _processing_attachment(self, matched):
     file_path = matched.group('post_path')
     download_url = 'http://{tistory_url}/attachment/{pre_path}@{post_path}'.format(
         tistory_url='{user_name}.tistory.com'.format(
             user_name=self.user_name),
         pre_path=matched.group('pre_path'),
         post_path=file_path)
     dir_path = self.dir_path
     download_path = '{dir_path}/{file_path}'.format(dir_path=dir_path,
                                                     file_path=file_path)
     self._file_data_to_download.append((download_url, dir_path, file_path))
     if 'image/jpeg' in matched.group('attr'):
         return u'<img src="{site_url_tmpl}{download_path}" {attr}>'.format(
             site_url_tmpl='{{site.url}}/',
             download_path=download_path,
             attr=matched.group('attr'))
     else:
         tag = u'<a href="{site_url_tmpl}{download_path}" {attr}></a>'.format(
             site_url_tmpl='{{site.url}}/',
             download_path=download_path,
             attr=matched.group('attr'))
         soup = BeautifulSoup(tag).a
         soup.string = soup['filename']
         return unicode(soup)
Example #21
def get_article(url):
    result = {"url": url}
    if(not validURL(url)):
        result.update({"error": "url is not valid"})
        return result
    html = urlopen(url)
    soup = cleanHTML(BS(html), url)

    # put into <article>
    atl = BS("<article></article>").article
    # get title and append to h1
    head = BS("<h1></h1>").h1
    head.string = soup.title.string
    result.update({"title": soup.title.string})

    atlist = []
    for p in soup.findAll("p"):
        tag = p.parent
        if(tag not in atlist):
            atlist.append(tag)
    #atlist = [p.parent for p in soup.findAll("p")]
    scored = {}
    for tag in atlist:
        scored.update({tag: get_score(tag)})

    # get the highest score
    final_score = 0
    content = ""
    for t, s in scored.items():
        if(s >= final_score):
            final_score = s
            content = t
    if(final_score == 0):
        # all score < 0
        result.update({"error": "nothing valualbe is not found"})
        return result

    ## remove div in content
    #for div in content.findAll("div"):
    #    divs = get_score(div)
    #    if(divs < 20):
    #        div.extract()

    # if article found, return article
    if(content.name == "article"):
        atl = content
    else:
        # if no h1 found, put the title as h1
        if(content.find("h1") is None):
            # if previous_sibling is head, append
            pre1 = content.findPreviousSibling("h1")
            pre2 = content.findPreviousSibling("h2")
            if(pre1):
                atl.append(pre1)
            elif(pre2):
                atl.append(pre2)
            else:
                atl.append(head)
        atl.append(content)
    atl = unicode(atl)

    result.update({"article": atl})
    result.update({"score": final_score})
    return result
Example #22
    def transformMissions(self):
        links1 = self.__getAllMissions(self.__res1)
        links2 = self.__getAllMissions(self.__res2)

        links = links1.copy()
        links.update(links2)
        #print(links)

        count = 0
        transformedMissions = {}
        transformedMissions["timestamp"] = {
            "date": datetime.now().strftime("%d-%m-%y"),
            "time": datetime.now().strftime("%H-%M-%S")
        }

        for name, link in links.items():
            count += 1

            try:
                #print("---------------------------" +link+ "---------------------------")
                time.sleep(0.5)
                mission = requests.get(
                    'https://escapefromtarkov.gamepedia.com' + link)
                mission = BeautifulSoup(mission.text, "html.parser")

            except requests.exceptions.SSLError as e:
                print("Error reading mission: " + str(e))
                #mission = "<h1> Error </h1>"
                continue

            name = mission.select_one("h1").text
            if name == "Quests" or name == "Quests/zh":
                continue

            transformedMissions[name] = ({"favorite": 0})

            infoxbox = []
            for temp in mission.select(".va-infobox-content"):
                if "previous:" not in temp.text and "leads to:" not in temp.text:
                    infoxbox.append(temp.text)

            transformedMissions[name].update({"infobox": infoxbox})

            liste = []
            for headlines in mission.select("h2 span"):

                #print(headlines.next_sibling)
                #print(headlines.attrs)
                if "class" not in headlines.attrs:
                    continue

                if headlines.attrs["class"][0] == "mw-headline":

                    temp = headlines.parent

                    for tag in temp.next_siblings:
                        #print(tag.name)
                        if (tag.name == "h2") or (tag.name == "table"
                                                  and "class" in tag.attrs
                                                  and tag.attrs["class"][-1]
                                                  == "va-navbox-bottom"):
                            transformedMissions[name].update(
                                {headlines.text: liste})
                            liste = []
                            break

                        if hasattr(tag, "text"):
                            liste.append(tag.text.strip())

                        if (tag.name == "table" and "class" in tag.attrs) or tag.name == "p" or tag.name == "li":
                            #print(tag.findAll("img"))
                            for image in tag.findAll("img"):
                                #print(image.attrs["src"])
                                liste.append(image.attrs["src"])

            completeSite = mission.find("div", {"id": "bodyContent"})
            #print(completeSite.findAll("img"))
            for editSpan in completeSite.select(
                    "span[class='mw-editsection']"):
                editSpan.extract()

            for questList in completeSite.select(
                    "table[class='va-navbox-border va-navbox-bottom']"):
                questList.extract()

            for questHeader in completeSite.select("div[class='catlinks']"):
                questHeader.extract()

            for infoxbox in completeSite.select("table[class='va-infobox']"):
                infoxbox.extract()

            for jumper in completeSite.select("div[class='mw-jump']"):
                jumper.extract()

            for hidden in completeSite.select("div[class='noprint']"):
                hidden.extract()

            for image in completeSite.select("a[class='image']"):
                image.attrs["href"] = image.contents[0].attrs["src"]
                #print(image.attrs["href"])

            for aLink in completeSite.select("a"):
                try:
                    if "class" in aLink.attrs.keys():
                        for attr in aLink.attrs["class"]:
                            if attr == "image":
                                #print(aLink.contents[0])
                                pass

                            else:
                                newTag = BeautifulSoup(
                                    features="html.parser").new_tag("b")
                                newTag.string = aLink.text
                                aLink.replace_with(newTag)
                    else:
                        newTag = BeautifulSoup(
                            features="html.parser").new_tag("b")
                        newTag.string = aLink.text
                        aLink.replace_with(newTag)

                except Exception as e:
                    print(e)

            #print(completeSite)
            # print(completeSite)

            transformedMissions[name].update(
                {"completeSite": str(completeSite)})

            if (count % 10) == 0:
                print(count)

            #print(mission)
            #if count == 8:
            #   break
        print(str(count) + " missions loaded")
        return transformedMissions
Example #23
def handle_text(filename, img_keyword, sound_keyword, video_keyword):
    """
    :param filename: file name of a chapter, such as 'Charpter1.txt', without the directory path.
    :result: an html file
    """
    # open file and read paragraphs
    with open(os.path.join('./text/', filename), 'r+') as f:
        paras = [p.strip() for p in f.readlines() if len(p) > 4]
    # read html template
    with open(r'base.txt', 'r+') as f:
        template_text = f.read()
        temp = BeautifulSoup(template_text, "lxml")

    # replace cover img
    # cover = temp.find('img', {'id': 'cover'})
    # cover['src'] = './pics/cover.jpg'

    # handle title
    title = temp.find('h3')
    title.string = paras[0]
    if temp.title:
        temp.title.string = paras[0]

    # handle paras
    text_box = temp.find('div', {'id': 'text'})
    js_box = temp.find('script', {'id': 'main'})
    count = [0,0]
    img_pat = re.compile(r'\((\W+?)\)\['+img_keyword+r'(\S+?)\]')
    sound_pat = re.compile(r'\((\W+?)\)\['+sound_keyword+r'(\S+?)\]')
    video_pat = re.compile(r'\((\W+?)\)\['+video_keyword+r'(\S+?)\]')
    for i in range(1, len(paras)):
        new_p = temp.new_tag('p')
        new_br = temp.new_tag('br')
        # handle img in text
        if img_pat.findall(paras[i]):
            imgs = img_pat.findall(paras[i])# a list of tuple(text, img_id)
            for img in imgs:
                img_result = insert_img(img[1], temp, count)
                new_img_div, count = img_result[0], img_result[1]
                text_box.append(new_img_div)
            new_p.string = re.sub(img_pat, r'\1', paras[i])# delete () and []
            # text_box.append(new_p)
            # text_box.append(new_br)
        if sound_pat.findall(paras[i]):
            sounds = sound_pat.findall(paras[i])
            new_p.string = re.sub(sound_pat, r'\1', paras[i])
            for sound in sounds:
                new_play_logo = insert_sound(sound[0], sound[1], paras[i], temp)
                new_p.append(new_play_logo)
            # text_box.append(new_p)
            # text_box.append(new_br)
        if video_pat.findall(paras[i]):
            videos = video_pat.findall(paras[i])
            for video in videos:
                new_video_link = temp.new_string("<a target='_blank' href='"+insert_video(video[1], paras[i], temp) + ".html'>"+video[0]+"</a>")
                new_p.string = re.sub(video_pat, new_video_link, new_p.string)
                new_p = BeautifulSoup(html_parser.unescape(str(new_p)), 'lxml')
        if not (img_pat.findall(paras[i]) or sound_pat.findall(paras[i]) or video_pat.findall(paras[i])):
            new_p.string = paras[i]
        text_box.append(new_p)
        text_box.append(new_br)

    with open('audio.txt', 'r+') as f:
        text = f.read()
        audio_tag = BeautifulSoup(text, 'lxml').div
        text_box.append(audio_tag)

    # add js about sound to html script
    # with open('static/js/audio.js', 'r+') as f:
    #     audio_js = f.read()
    #     js_box.append(audio_js)     

    with open(filename[:-4] + '.html', 'w+') as f:
        f.write(temp.prettify("utf-8"))
        print '==========finish ' + filename + '==========' 
Example #24
    def collapse_sections(self) -> None:
        """Collapses long result sections ("people also asked", "related
         searches", etc) into "details" elements

        These sections are typically the only sections in the results page that
        have more than ~5 child divs within a primary result div.

        Returns:
            None (The soup object is modified directly)
        """
        minimal_mode = read_config_bool('WHOOGLE_MINIMAL')

        def pull_child_divs(result_div: BeautifulSoup):
            try:
                return result_div.findChildren(
                    'div', recursive=False)[0].findChildren('div',
                                                            recursive=False)
            except IndexError:
                return []

        if not self.main_divs:
            return

        # Loop through results and check for the number of child divs in each
        for result in self.main_divs:
            result_children = pull_child_divs(result)
            if minimal_mode:
                if len(result_children) in (1, 3):
                    continue
            else:
                if len(result_children) < self.RESULT_CHILD_LIMIT:
                    continue

            # Find and decompose the first element with an inner HTML text val.
            # This typically extracts the title of the section (i.e. "Related
            # Searches", "People also ask", etc)
            label = 'Collapsed Results'
            for elem in result_children:
                if elem.text:
                    label = elem.text
                    elem.decompose()
                    break

            # Create the new details element to wrap around the result's
            # first parent
            parent = None
            idx = 0
            while not parent and idx < len(result_children):
                parent = result_children[idx].parent
                idx += 1

            details = BeautifulSoup(features='html.parser').new_tag('details')
            summary = BeautifulSoup(features='html.parser').new_tag('summary')
            summary.string = label
            details.append(summary)

            if parent and not minimal_mode:
                parent.wrap(details)
            elif parent and minimal_mode:
                # Remove parent element from document if "minimal mode" is
                # enabled
                parent.decompose()
Example #25
    def update_link(self, link: Tag) -> None:
        """Update internal link paths with encrypted path, otherwise remove
        unnecessary redirects and/or marketing params from the url

        Args:
            link: A bs4 Tag element to inspect and update

        Returns:
            None (the tag is updated directly)

        """
        # Replace href with only the intended destination (no "utm" type tags)
        href = link['href'].replace('https://www.google.com', '')
        if 'advanced_search' in href or 'tbm=shop' in href:
            # FIXME: The "Shopping" tab requires further filtering (see #136)
            # Temporarily removing all links to that tab for now.
            link.decompose()
            return

        result_link = urlparse.urlparse(href)
        q = extract_q(result_link.query, href)

        if q.startswith('/'):
            # Internal google links (i.e. mail, maps, etc) should still
            # be forwarded to Google
            link['href'] = 'https://google.com' + q
        elif '/search?q=' in href:
            # "li:1" implies the query should be interpreted verbatim,
            # which is accomplished by wrapping the query in double quotes
            if 'li:1' in href:
                q = '"' + q + '"'
            new_search = 'search?q=' + self.encrypt_path(q)

            query_params = parse_qs(urlparse.urlparse(href).query)
            for param in VALID_PARAMS:
                if param not in query_params:
                    continue
                param_val = query_params[param][0]
                new_search += '&' + param + '=' + param_val
            link['href'] = new_search
        elif 'url?q=' in href:
            # Strip unneeded arguments
            link['href'] = filter_link_args(q)

            # Add no-js option
            if self.config.nojs:
                append_nojs(link)

            if self.config.new_tab:
                link['target'] = '_blank'
        else:
            if href.startswith(MAPS_URL):
                # Maps links don't work if a site filter is applied
                link['href'] = MAPS_URL + "?q=" + clean_query(q)
            else:
                link['href'] = href

        # Replace link location if "alts" config is enabled
        if self.config.alts:
            # Search and replace all link descriptions
            # with alternative location
            link['href'] = get_site_alt(link['href'])
            link_desc = link.find_all(
                text=re.compile('|'.join(SITE_ALTS.keys())))
            if len(link_desc) == 0:
                return

            # Replace link description
            link_desc = link_desc[0]
            for site, alt in SITE_ALTS.items():
                if site not in link_desc:
                    continue
                new_desc = BeautifulSoup(features='html.parser').new_tag('div')
                new_desc.string = str(link_desc).replace(site, alt)
                link_desc.replace_with(new_desc)
                break
Example #26
post_files = [f for f in listdir(POST_DIR) if isfile(join(POST_DIR, f))]
posts = []

for post_file in post_files:
    # read the post data
    with open((POST_DIR + post_file), 'r') as f:
        post_file_data = f.read()
    # construct the HTML tree
    post_soup = Soup(post_file_data, features="html.parser")
    # find the <title> tag
    title_soup = post_soup.find("title").extract()
    title = title_soup.string

    post_meta_soup = post_soup.find(id="post-meta")
    title_in_post_soup = Soup(features="html.parser").new_tag("h2")
    title_in_post_soup.string = title
    post_meta_soup.append(title_in_post_soup)
    date_in_post_soup = Soup(features="html.parser").new_tag("p")
    date_in_post_soup["class"] = "small-gray"
    date_in_post_soup.string = "Published on " + date_from(post_file)
    post_meta_soup.append(date_in_post_soup)

    # find all the <latex> tags
    latexes = post_soup.find_all('latex')
    for latex in latexes:
        # convert the latex to html
        latex_html = delatex(latex.string)
        latex.replace_with(Soup(latex_html, features="html.parser"))

    # create the post html and write it to file
    # insert the post soup into the template soup
Example #27
def createElement (element, classID, string):
	new_tag = BeautifulSoup('<'+element+'></'+element+'>', 'lxml')
	new_tag = new_tag.find(element)
	new_tag['class'] = classID
	new_tag.string = string
	return new_tag
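
A minimal usage sketch for createElement() above; the page skeleton, class name, and text are assumptions for illustration.

from bs4 import BeautifulSoup

page = BeautifulSoup('<body></body>', 'lxml')
page.body.append(createElement('p', 'lead', 'Hello, world'))
print(page.body)
# <body><p class="lead">Hello, world</p></body>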
Example #28
# -*- coding: utf-8 -*-
# Strings are usually contained inside tags. Beautiful Soup uses the NavigableString class to wrap the text inside a tag:
from bs4 import BeautifulSoup
tag = BeautifulSoup("<b class='clas1'>李伟</b>")
print tag.string  # 李伟
print type(tag.string)  # <class 'bs4.element.NavigableString'>

# A NavigableString is just like a Python Unicode string; calling unicode() converts a NavigableString object directly into a Unicode string
unicode_string = unicode(tag.string)
print unicode_string  # 李伟
print type(unicode_string)  # <type 'unicode'>

# The string inside a tag cannot be edited in place, but it can be replaced with another string using the replace_with() method:
tag.string = 'lijie'
print tag.string
tag.string.replace_with('lijiebao')
print tag.string
# Note: NavigableString objects support most, but not all, of the attributes defined for navigating and searching the tree.
# In particular, a string cannot contain anything else (a tag can contain a string or another tag), so strings do not support the .contents or .string attributes or the find() method.
# If you want to use a NavigableString object outside of Beautiful Soup, call unicode() on it first to convert it into an
# ordinary Unicode string; otherwise it keeps a reference to the whole parse tree even after you are done with Beautiful Soup, which wastes memory.
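
The snippet above is Python 2; a minimal Python 3 sketch of the same demonstration (an assumption, not part of the original) uses str() where the original uses unicode().

from bs4 import BeautifulSoup

tag = BeautifulSoup("<b class='clas1'>text</b>", 'html.parser')
print(type(tag.b.string))        # <class 'bs4.element.NavigableString'>
plain = str(tag.b.string)        # detach a plain str copy from the parse tree
tag.b.string.replace_with('replacement')
print(tag.b.string)              # replacement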
Example #29
def inspect_file(file_name):

    raw_file = source_path + file_name
    raw_soup = BeautifulSoup(open(raw_file), from_encoding="utf-8")

    all_prons = raw_soup.find_all('span', class_="pron")

    if len(all_prons) > 0:

        for pron in all_prons:

            all_labels = pron.find_all('span', class_='lbl')

            dialects = ""
            register = ""

            if len(all_labels) > 0:

                dialects_tags = pron.find_all('span', class_='geo')
                register_tags = pron.find_all('span', class_='register')

                if dialects_tags is not None and dialects_tags != []:
                    d_array = []
                    for d_tag in dialects_tags:
                        d_array.append(str(d_tag.string))
                        d_tag.decompose()
                    dialects = ", ".join(sorted(d_array))

                if register_tags is not None and register_tags != []:
                    register = str(register_tags[0].string)
                    for r_tag in register_tags:
                        r_tag.decompose()

                mod_tags = pron.find_all('span', class_='mod')
                for m_tag in mod_tags:
                    m_tag.decompose()

            ipa_tag = BeautifulSoup().new_tag("div", **{'class': 'ipa'})
            # Find image tag
            img = pron.find('img')
            if img is not None:

                # Extract MP3 link
                attr_text = img['onclick']
                # Extract link text with RegEx
                m = re.search(r"[^\/]*\.mp3", attr_text)
                link = m.group(0)

                # Extract target word
                attr_word = img['alt']
                target = attr_word.replace('Pronunciation for ', '')

                # Extract IPA of pron, strip brackets etc.

                raw_ipa = "".join(pron.find_all(text=True))
                raw_ipa = re.sub(r'\(|\)|;|\n', '', raw_ipa)
                raw_ipa = raw_ipa.strip()
                raw_ipa = re.sub(r'\,$', ' ', raw_ipa)
                ipa = raw_ipa.strip()

                ipa_tag.string = ipa
                ipa_tag['data-audio'] = link
                if target != "":
                    ipa_tag['data-orth'] = target
                if dialects != "":
                    ipa_tag['data-dialects'] = dialects
                if register != "":
                    ipa_tag['data-register'] = register

                # Replace pron with IPA tag
            else:
                ipa_str = str(pron.string)
                ipa_str = re.sub(r'\(|\)|;', '', ipa_str)
                ipa_tag.string = ipa_str.strip()

            pron.replace_with(ipa_tag)

        with open(raw_file, 'w') as f_output:
            f_output.write(str(raw_soup))
    return
Example #30
def createElement(element, classID, string):
    new_tag = BeautifulSoup('<' + element + '></' + element + '>', 'lxml')
    new_tag = new_tag.find(element)
    new_tag['class'] = classID
    new_tag.string = string
    return new_tag