# Python 2: fetch a definition for *term* from the Sanseido online dictionary (DailyJJ).
import re
import urllib

from bs4 import BeautifulSoup


def fetchDef(term):
    # Strip a bracketed reading annotation, e.g. u"term[reading]" -> u"term".
    searched = re.search(r'^[^\[]+', term)
    if searched:
        term = searched.group(0)
    defText = ""
    pageUrl = ("http://www.sanseido.biz/User/Dic/Index.aspx?TWords="
               + urllib.quote(term.encode('utf-8')) + "&st=0&DailyJJ=checkbox")
    response = urllib.urlopen(pageUrl)
    soup = BeautifulSoup(response)
    NetDicBody = soup.find('div', class_="NetDicBody")
    if NetDicBody is not None:
        defFinished = False
        for line in NetDicBody.children:
            # A <b> element that wraps a <span> marks the end of the definition body.
            if line.name == "b":
                if len(line) != 1:
                    for child in line.children:
                        if child.name == "span":
                            defFinished = True
            if defFinished:
                break
            if line.string is not None and line.string != u"\n":
                defText += line.string
    # Put numbered senses on their own lines. The brackets/parentheses must be
    # escaped; left unescaped they form an invalid character class.
    defText = re.sub(ur"\[(?P<no>[2-9]+)\]", ur"<br/><br/>[\1]", defText)
    if defText:
        # mecab.reading() is an external helper that adds kana readings.
        defText = u" ・ <b>" + term + u"</b>: " + mecab.reading(defText)
    return re.sub(ur"\((?P<num>[2-9]+)\)", ur"<br/>(\1)", defText)
# Python 2: fetch a definition for *term* from krdict (the National Institute
# of Korean Language learners' dictionary).
import re
import urllib

from bs4 import BeautifulSoup


def fetchDef(term):
    defText = ""
    pageUrl = ("https://krdict.korean.go.kr/dicSearch/search?mainSearchWord="
               + urllib.quote(term.encode('utf-8')))
    response = urllib.urlopen(pageUrl)
    soup = BeautifulSoup(response)
    NetDicBody = soup.find('span', class_="base_t")
    if NetDicBody is not None:
        defFinished = False
        for line in NetDicBody.children:
            # Same sentinel as the Sanseido scraper above: a <b> wrapping a
            # <span> marks the end of the definition body.
            if line.name == "b":
                if len(line) != 1:
                    for child in line.children:
                        if child.name == "span":
                            defFinished = True
            if defFinished:
                break
            if line.string is not None and line.string != u"\n":
                defText += line.string
    # Put numbered senses on their own lines (patterns escaped, as above).
    defText = re.sub(ur"\[(?P<no>[2-9]+)\]", ur"<br/><br/>[\1]", defText)
    return re.sub(ur"\((?P<num>[2-9]+)\)", ur"<br/>(\1)", defText)
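# A quick, self-contained check of the sense-numbering substitutions shared by
# the two fetchDef variants above. The sample string is made up for
# illustration, not taken from either dictionary.
import re

sample = u"[1] first sense [2] second sense (2) sub-sense"
sample = re.sub(ur"\[(?P<no>[2-9]+)\]", ur"<br/><br/>[\1]", sample)
sample = re.sub(ur"\((?P<num>[2-9]+)\)", ur"<br/>(\1)", sample)
print sample  # [1] first sense <br/><br/>[2] second sense <br/>(2) sub-sense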
# Helper from the JASSS_Scrapper scraper module (assumes `import bs4 as bs`
# and the JASSS_Scrapper class constants referenced below).
def get_meta_content_with_tag(soup: bs.BeautifulSoup = None, tag="title"):
    """
    Retrieve the content of a meta tag as defined by *beautifulsoup*.

    :param BeautifulSoup soup: the soup to extract the tag from
    :param str tag: the tag to find in the soup
    :return: a string representation of the content of the tag
    """
    if soup is None:
        raise TypeError("get_meta_content_with_tag() missing 1 required "
                        "positional argument: 'soup'")
    m_name = JASSS_Scrapper._JASSS_META_NAME
    m_content = JASSS_Scrapper._JASSS_META_CONTENT
    # Some JASSS pages upper-case the meta attribute names; detect and adapt.
    if soup.find_next(JASSS_Scrapper._JASSS_META_TAG,
                      {JASSS_Scrapper._JASSS_META_NAME.upper(): "title"}):
        m_name = JASSS_Scrapper._JASSS_META_NAME.upper()
        m_content = JASSS_Scrapper._JASSS_META_CONTENT.upper()
    if isinstance(JASSS_Scrapper._META[tag], str):
        meta_context = soup.find(JASSS_Scrapper._JASSS_META_TAG,
                                 {m_name: JASSS_Scrapper._META[tag]})
    else:
        # Several candidate meta names can map to one logical tag; take the
        # first one that matches.
        for tg in JASSS_Scrapper._META[tag]:
            meta_context = soup.find(JASSS_Scrapper._JASSS_META_TAG, {m_name: tg})
            if meta_context is not None:
                break
    return meta_context[m_content]
def get_art_content_with_tag(soup: bs.BeautifulSoup = None, tag="title"):
    """
    Retrieve the content of a tag defined in the *art* section of JASSS
    article pages.

    :param BeautifulSoup soup: the soup to extract the tag from
    :param str tag: the 'art' tag to find in the soup
    :return: a string representation of the content of the tag
    """
    if soup is None:
        raise TypeError("get_art_content_with_tag() missing 1 required "
                        "positional argument: 'soup'")
    balise: str = "p"
    if tag == "doi":
        balise = "span"
    result = soup.find(balise, {'class': JASSS_Scrapper._ART[tag]})
    if result is None:
        if tag == "doi":
            # Fall back to reading the DOI from the meta headers.
            return JASSS_Scrapper.doi(None, soup)
        else:
            return super().NA
    elif tag == "doi":
        result = result.contents[0].replace('DOI:', '') if result else super().NA
        return result.strip()
    # For the other tags, return the element's text rather than the Tag
    # object (a Tag has no strip()).
    return result.get_text().strip()
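# A hedged usage sketch for the two helpers above. The HTML fragment and the
# assumption that the helpers are callable at module level are illustrative
# only; the real JASSS_Scrapper constants (_JASSS_META_TAG, _JASSS_META_NAME,
# _JASSS_META_CONTENT, _META, _ART) are defined elsewhere in the scraper.
import bs4 as bs

html = """<html><head>
<meta name="title" content="An Example JASSS Article"/>
</head><body>
<p class="arttitle">An Example JASSS Article</p>
</body></html>"""
soup = bs.BeautifulSoup(html, "html.parser")
# Assuming _JASSS_META_TAG == "meta", _JASSS_META_NAME == "name",
# _JASSS_META_CONTENT == "content", and _META["title"] == "title":
print(get_meta_content_with_tag(soup, tag="title"))  # An Example JASSS Article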
# Python 2: fetch a definition for *term* from the Weblio Japanese-Korean
# dictionary.
import urllib

from bs4 import BeautifulSoup


def fetchDef(term):
    defText = ""
    pageUrl = "http://kjjk.weblio.jp/content/" + urllib.quote(term.encode('utf-8'))
    response = urllib.urlopen(pageUrl)
    soup = BeautifulSoup(response)
    NetDicBody = soup.find('div', class_="kiji")
    if NetDicBody is not None:
        # find_all() always returns a list (possibly empty), so no None
        # check is needed here.
        test = NetDicBody.find_all('span', {"lang": "ja"})
        counter = 1
        for line in test:
            # Skip the span that repeats the headword; number the rest.
            if unicode(line).find(term) == -1:
                defText += "(" + str(counter) + ") " + unicode(line) + "<br/>"
                counter = counter + 1
    if defText != "":
        # str.replace (not the deprecated string.replace()) adds a space
        # after commas.
        defText = defText.replace(',', ', ')
    return defText
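# A minimal illustration of the headword filter used above (the markup is
# made up, not real Weblio output): spans whose markup contains the search
# term are treated as the headword and skipped, everything else is numbered.
from bs4 import BeautifulSoup

frag = BeautifulSoup(u'<span lang="ja">食べる</span><span lang="ja">먹다</span>',
                     "html.parser")
for span in frag.find_all('span', {"lang": "ja"}):
    print unicode(span).find(u"食べる") == -1  # False (headword), then True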
# Method of a dictionary-lookup class (module-level imports assumed:
# re, ssl, sys, urllib2; from bs4 import BeautifulSoup).
def _get_content(self):
    # Not sure how to handle non-English characters and had no example, so
    # this is copied from a Stack Overflow answer.
    reload(sys)
    sys.setdefaultencoding('utf8')
    # Hujiang's HTTPS raises an error; cause unknown, this workaround came
    # from googling.
    ssl.match_hostname = lambda cert, hostname: True
    url = 'https://www.hjdict.com/jp/jc/' + urllib2.quote(self.word.encode('utf-8'))
    # Hujiang needs desktop request headers: if it detects a mobile browser,
    # it forces you to install the app instead.
    request = urllib2.Request(
        url,
        headers={
            "Accept-Language": "en-US,en;q=0.5",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) "
                          "Gecko/20100101 Firefox/40.0"
        })
    contents = urllib2.urlopen(request).read()
    soup = BeautifulSoup(contents)
    # The variable casing follows the Anki weblio scraper plugin this code
    # was adapted from.
    NetDicBody = soup.find('div', class_="word-details")
    if NetDicBody is None:
        errorMsg = 'word not found'
        return self.cache_this({
            'expressions': '',
            'Meaning': '',
            'phonetic': '',
            'mp3': '',
            'sentences': '',
            'sentence_trans': '',
            'status': errorMsg
        })
    if NetDicBody.find('header', class_='word-details-pane-header-multi') is not None:
        errorMsg = 'word has multiple senses'
        return self.cache_this({
            'expressions': '',
            'Meaning': '',
            'phonetic': '',
            'mp3': '',
            'sentences': '',
            'sentence_trans': '',
            'status': errorMsg
        })
    Expression = NetDicBody.find('div', 'word-text').h2.string
    Pronounces = NetDicBody.find('div', 'pronounces').span.string[1:-1]
    mp3 = NetDicBody.find('span', 'word-audio').get('data-src')
    Meaning = ''
    Poses = NetDicBody.find('div', 'simple').find_all("span", class_=None)
    MeaningRawRaw = NetDicBody.find_all('span', 'simple-definition')
    # Strip stray whitespace from the definitions; this was stitched together
    # from copied snippets, so the same normalisation effectively runs twice.
    for s in range(len(MeaningRawRaw)):
        MeaningRaw = ' '.join(
            re.split(' +|\n+', MeaningRawRaw[s].get_text())).strip()
        m_temp = ' '.join(MeaningRaw.split())
        if len(Poses) < 1:
            Meaning += m_temp + "\n"
        else:
            Meaning += Poses[s].get_text() + m_temp + "\n"
    Meaning = Meaning.rstrip()
    Meaning = Meaning.replace("; ", "")
    sents_raw = NetDicBody.find_all("p", "def-sentence-from")
    sentstrans_raw = NetDicBody.find_all("p", "def-sentence-to")
    Sents = ''
    sentstrans = ''
    for s in sents_raw:
        Sents += ' '.join(s.get_text().split()) + "<br />"
    for s in sentstrans_raw:
        sentstrans += ' '.join(s.get_text().split()) + "<br />"
    Sents = Sents.rstrip()
    sentstrans = sentstrans.rstrip()
    return self.cache_this({
        'expressions': Expression,
        'Meaning': Meaning,
        'phonetic': Pronounces,
        'mp3': mp3,
        'sentences': Sents,
        'sentence_trans': sentstrans,
        'status': ''
    })
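# The whitespace clean-up above reduces to one idiom: str.split() with no
# argument splits on any run of whitespace, so ' '.join(text.split())
# collapses newlines, tabs, and repeated spaces in a single pass. A
# standalone example (sample text made up):
raw = "  definition \n\n   with\tmessy   spacing  "
print ' '.join(raw.split())  # definition with messy spacing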