Example #1
import re
import urllib
from bs4 import BeautifulSoup  # the snippet uses bs4's class_ keyword

def fetchDef(term):
    searched = re.search(r'^[^\[]+', term)
    if searched:
        term = searched.group(0)
    defText = ""
    pageUrl = "http://www.sanseido.biz/User/Dic/Index.aspx?TWords=" + urllib.quote(
        term.encode('utf-8')) + "&st=0&DailyJJ=checkbox"
    response = urllib.urlopen(pageUrl)
    soup = BeautifulSoup(response)
    NetDicBody = soup.find('div', class_="NetDicBody")
    if NetDicBody is not None:
        defFinished = False

        for line in NetDicBody.children:
            if line.name == "b":
                if len(line) != 1:
                    for child in line.children:
                        if child.name == "span":
                            defFinished = True
            if defFinished:
                break

            if line.string is not None and line.string != u"\n":
                defText += line.string

    defText = re.sub(ur"[(?P<no>[2-9]+)]", ur"<br/><br/>[\1]", defText)
    if defText:
        defText = u" ・ <b>" + term + "</b>: " + mecab.reading(defText)

    # likewise for parenthesised sub-sense numbers like (2)..(9)
    return re.sub(ur"\((?P<num>[2-9]+)\)", ur"<br/>(\1)", defText)
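For context, a minimal usage sketch (Python 2, matching the snippet; the search term is only an illustration, and mecab must be available as noted above):

# Hypothetical call: look up a term in Sanseido's daily dictionary and
# print the HTML-formatted definition (empty string if nothing was found).
print fetchDef(u"辞書")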
Example #2

import re
import urllib
from bs4 import BeautifulSoup

def fetchDef(term):
    defText = ""
    pageUrl = "https://krdict.korean.go.kr/dicSearch/search?mainSearchWord=" + urllib.quote(
        term.encode('utf-8'))
    response = urllib.urlopen(pageUrl)
    soup = BeautifulSoup(response)
    NetDicBody = soup.find('span', class_="base_t")

    if NetDicBody is not None:
        defFinished = False

        for line in NetDicBody.children:
            if line.name == "b":
                if len(line) != 1:
                    for child in line.children:
                        if child.name == "span":
                            defFinished = True
            if defFinished:
                break

            if line.string is not None and line.string != u"\n":
                defText += line.string

    defText = re.sub(ur"\[(?P<no>[2-9]+)\]", ur"<br/><br/>[\1]", defText)
    return re.sub(ur"\((?P<num>[2-9]+)\)", ur"<br/>(\1)", defText)
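A quick aside on the URL construction above: urllib.quote percent-encodes the UTF-8 bytes of the search word before it is appended to the query string (Python 2; the word is illustrative):

import urllib
print urllib.quote(u"사랑".encode("utf-8"))  # -> %EC%82%AC%EB%9E%91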
Example #3
import urllib
from bs4 import BeautifulSoup

def fetchDef(term):
    defText = ""
    pageUrl = "http://kjjk.weblio.jp/content/" + urllib.quote(
        term.encode('utf-8'))
    response = urllib.urlopen(pageUrl)
    soup = BeautifulSoup(response)
    NetDicBody = soup.find('div', class_="kiji")

    if NetDicBody is not None:
        test = NetDicBody.find_all('span', {"lang": "ja"})
        counter = 1

        if test:  # find_all returns a (possibly empty) list, never None
            for line in test:
                # skip spans that merely repeat the headword itself
                if unicode(line).find(term) == -1:
                    defText += "(" + str(counter) + ") " + unicode(line) + "<br/>"
                    counter += 1

    if defText != "":
        defText = string.replace(defText, ',', ', ')
    return defText
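As with the other examples, a minimal illustrative call (Python 2; the search word is arbitrary):

# Hypothetical lookup against kjjk.weblio.jp; prints numbered matches such
# as '(1) <span lang="ja">...</span><br/>'.
print fetchDef(u"사랑")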
Example #4
    def get_meta_content_with_tag(soup: bs.BeautifulSoup = None, tag="title"):
        """
        Retrieve the content of a tag as defined by *beautifulsoup*
        :param BeautifulSoup soup: the soup to extract tag from
        :param str tag: the tag to find in the soup
        :return: a string representation of the content of the tag
        """
        if soup is None:
            raise TypeError("get_meta_constent_with_tag missing one required positional 1 argument: soup")
        m_name = JASSS_Scrapper._JASSS_META_NAME
        m_content = JASSS_Scrapper._JASSS_META_CONTENT
        if soup.find_next(JASSS_Scrapper._JASSS_META_TAG, {JASSS_Scrapper._JASSS_META_NAME.upper(): "title"}):
            m_name = JASSS_Scrapper._JASSS_META_NAME.upper()
            m_content = JASSS_Scrapper._JASSS_META_CONTENT.upper()

        if isinstance(JASSS_Scrapper._META[tag], str):
            meta_context = soup.find(JASSS_Scrapper._JASSS_META_TAG, {m_name: JASSS_Scrapper._META[tag]})
        else:
            for tg in JASSS_Scrapper._META[tag]:
                meta_context = soup.find(JASSS_Scrapper._JASSS_META_TAG, {m_name: tg})
                if meta_context is not None:
                    break
        # assumes one of the candidate meta names matched; if none did,
        # this raises on the None lookup
        return meta_context[m_content]
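The JASSS_Scrapper constants (_JASSS_META_TAG and friends) sit outside this excerpt, but the lookup pattern they drive can be sketched standalone. Every name and value below is illustrative, not the class's real constant:

from bs4 import BeautifulSoup

# Hypothetical stand-ins for the class constants used above.
META_TAG, META_NAME, META_CONTENT = "meta", "name", "content"

html = '<meta name="DC.Title" content="An Example Article"/>'
soup = BeautifulSoup(html, "html.parser")
tag = soup.find(META_TAG, {META_NAME: "DC.Title"})
print(tag[META_CONTENT])  # -> An Example Article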
Example #5
    def get_art_content_with_tag(soup: bs.BeautifulSoup = None, tag="title"):
        """
        Retrieve the content of a tag defined in the *art* section of JASSS article pages
        :param BeautifulSoup soup: the soup to extract the tag from
        :param str tag: the 'art' tag to find in the soup
        :return: a string representation of the content of the tag
        """
        if soup is None:
            raise TypeError("get_art_content_with_tag() missing 1 required positional argument: 'soup'")
        balise: str = "p"
        if tag == "doi":
            balise = "span"
        result = soup.find(balise, {'class': JASSS_Scrapper._ART[tag]})
        if result is None:
            if tag == "doi":
                return JASSS_Scrapper.doi(None, soup)
            return super().NA
        if tag == "doi":
            # the DOI text sits in the tag's first child; drop the prefix
            return result.contents[0].replace('DOI:', '').strip()
        # a bs4 Tag has no .strip(); extract its text before stripping
        return result.get_text().strip()
Example #6
    def _get_content(self):
        # Not sure how to handle non-English characters and had no example to
        # follow, so this copies whatever Stack Overflow did to solve it.
        reload(sys)
        sys.setdefaultencoding('utf8')

        # HJDict over HTTPS raises an error; unclear why. Added after googling.
        ssl.match_hostname = lambda cert, hostname: True

        url = 'https://www.hjdict.com/jp/jc/' + urllib2.quote(
            self.word.encode('utf-8'))

        # HJDict needs request headers: if it decides you are on a mobile
        # browser, it forces you to download the app instead.
        request = urllib2.Request(
            url,
            headers={
                "Accept-Language":
                "en-US,en;q=0.5",
                "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0"
            })
        contents = urllib2.urlopen(request).read()
        soup = BeautifulSoup(contents)

        # Variable-name casing is inconsistent: this was adapted from an Anki
        # Weblio scraper plugin (Japanese or Korean, unclear) that capitalized
        # names, and the capitalization was kept.
        NetDicBody = soup.find('div', class_="word-details")
        if NetDicBody is None:
            errorMsg = '查无此词'  # "word not found"
            return self.cache_this({
                'expressions': '',
                'Meaning': '',
                'phonetic': '',
                'mp3': '',
                'sentences': '',
                'sentence_trans': '',
                'status': errorMsg
            })
        if NetDicBody.find('header',
                           class_='word-details-pane-header-multi') is not None:
            errorMsg = '一词多义'  # "one word, multiple senses"
            return self.cache_this({
                'expressions': '',
                'Meaning': '',
                'phonetic': '',
                'mp3': '',
                'sentences': '',
                'sentence_trans': '',
                'status': errorMsg
            })

        Expression = NetDicBody.find('div', 'word-text').h2.string
        Pronounces = NetDicBody.find('div', 'pronounces').span.string[1:-1]
        mp3 = NetDicBody.find('span', 'word-audio').get('data-src')

        Meaning = ''
        Poses = NetDicBody.find('div', 'simple').find_all("span", class_=None)

        MeaningRawRaw = NetDicBody.find_all('span', 'simple-definition')
        # Strips extra whitespace from example sentences. Unsure of the right
        # approach, so several copied cleanup snippets are applied in a row.
        for s in range(len(MeaningRawRaw)):
            MeaningRaw = ' '.join(
                re.split(' +|\n+', MeaningRawRaw[s].get_text())).strip()
            m_temp = ' '.join(MeaningRaw.split())
            if len(Poses) < 1:
                Meaning += m_temp + "\n"
            else:
                Meaning += Poses[s].get_text() + m_temp + "\n"
        Meaning = Meaning.rstrip()
        Meaning = Meaning.replace("; ", "")

        sents_raw = NetDicBody.find_all("p", "def-sentence-from")
        sentstrans_raw = NetDicBody.find_all("p", "def-sentence-to")
        Sents = ''
        sentstrans = ''
        for s in sents_raw:
            Sents += ' '.join(s.get_text().split()) + "<br />"
        for s in sentstrans_raw:
            sentstrans += ' '.join(s.get_text().split()) + "<br />"
        Sents = Sents.rstrip()
        sentstrans = sentstrans.rstrip()

        return self.cache_this({
            'expressions': Expression,
            'Meaning': Meaning,
            'phonetic': Pronounces,
            'mp3': mp3,
            'sentences': Sents,
            'sentence_trans': sentstrans,
            'status': ''
        })
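For orientation, the dict handed to cache_this might be consumed like this (Python 2; `Scraper` and the word are hypothetical stand-ins, while the field names are the ones built above):

# Hypothetical caller; `Scraper` stands in for the plugin class that owns
# _get_content (not shown in this excerpt).
entry = Scraper(u'辞書')._get_content()
if entry['status'] == '':
    print entry['expressions'], entry['phonetic']
    print entry['Meaning']
else:
    print entry['status']  # '查无此词' (not found) or '一词多义' (ambiguous)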