def update_from_wiki(wc): try: p = wikipedia.page(wc) except wikipedia.DisambiguationError: return [], '' l = [] url = '' if HanziConv.same(p.title, wc): s = wikipedia.summary(wc, sentences=2) s = HanziConv.toSimplified(s) py = '/'.join([a[0] for a in pypinyin.pinyin(wc)]) l.append((py, s)) url = p.url return l, url
def wiki_crawler(target, lang="zh-tw", wait_time=3): wikipedia.set_lang(lang) try: page_data = wikipedia.page(target) if match_pattern(target, dic['zh_en_num']): zh_name = target else: zh_name = page_data.title except wikipedia.exceptions.PageError or ValueError: print("{} not found!".format(target)) print() return (target, "", "") except wikipedia.exceptions.DisambiguationError: print("{} ambiguous!".format(target)) print() return (target, "", "") print("loading page...") print(page_data.url) page_url = page_data.url # wiki_wiki = wikipediaapi.Wikipedia(lang) # page_py = wiki_wiki.page(target) # # if page_py.exists(): # page_data = page_py # page_url = page_py.fullurl # if match_pattern(target, dic['zh_en_num']): # zh_name = target # else: # answer = input("save this data? (Y/n)") # to_the_end = (answer == "end") # if to_the_end: # return TO_THE_END # # print("{} not found!".format(target)) # print() # return (target, "", "") # eng_title = soup.find('li', attrs={'class':'interlanguage-link interwiki-en'}).a['title'] target_temp = target.replace("*", "") target_temp = target_temp.replace("+", "") target_reg = re.compile(".{0,30}".join(target_temp.lower().split(" "))) if not HanziConv.same(target, page_data.title) \ and target not in page_data.title \ and page_data.title not in target \ and target not in page_data.summary \ and not len(target_reg.findall(page_data.summary.lower())): print("this target: {}".format(target)) print("wiki target: {}".format(page_data.title)) print(page_data.summary) answer = input("save this data? (Y/n)") # answer = "n" throw_this = (answer == "n") to_the_end = (answer == "end") if throw_this: print("{} unchanged!".format(target)) print() return (target, "", "") if to_the_end: print("to the end!") print() return TO_THE_END else: firefox_profile = webdriver.FirefoxProfile() firefox_profile.set_preference("intl.accept_languages", lang) firefox_profile.update_preferences() driver = webdriver.Firefox(firefox_profile=firefox_profile) driver.get(page_url) html_data = driver.page_source.encode('utf-8') driver.close() soup = BeautifulSoup(html_data, 'html.parser') zh_names = soup.find('h1', attrs={'id': 'firstHeading'}) eng_names = soup.findAll('span', attrs={'class': 'nickname'}) nick_names = soup.find('td', attrs={'class': 'nickname'}) english_link_name = soup.find('a', attrs={ 'lang': 'en', 'hreflang': 'en' }) zh_name = zh_names.text.replace("[編輯]", "") if not match_pattern(zh_name, dic["zh_en_num"]): zh_name = target eng_name = "" nick_name = "" if eng_names: try: eng_name = eng_names[1].text.strip() except: eng_name = eng_names[0].text.strip() if not match_pattern(eng_name, dic["en"]): eng_name = "" if english_link_name: eng_name = english_link_name.get('title') eng_name = eng_name.split("–")[0].strip() if nick_names: print(nick_names) pattern = r'\[\d+\]' nick_name = nick_names.get_text("\n").strip() nick_name = nick_name.replace("、", ",") nick_name = nick_name.replace("\n", ",") nick_name = re.sub(pattern, "", nick_name) nick_name_temp = nick_name.split(",") for i in range(0, len(nick_name_temp)): if nick_name_temp[i] == target: nick_name_temp[i] = "" nick_name_temp[i] = nick_name_temp[i].strip() nick_name_temp[:] = [item for item in nick_name_temp if item != ''] nick_name = ",".join(nick_name_temp) print("printing info:") print(zh_name) print(eng_name) print(nick_name) print() return (zh_name, eng_name, nick_name)
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from hanziconv import HanziConv print(HanziConv.toSimplified('繁簡轉換器')) # 繁简转换器 print(HanziConv.toTraditional('繁简转换器')) # 繁簡轉換器 print(HanziConv.same('繁簡轉換器', '繁简转换器')) # True print(HanziConv.toSimplified("把中檀元帅固定在神轿")) print(HanziConv.toSimplified("把中壇元帥固定在神轎")) # 把中坛元帅固定在神轿 print(HanziConv.toSimplified("洩")) # 泄 print(HanziConv.toSimplified("湿")) # 溼 print(HanziConv.toSimplified("淫慾")) # 淫欲 print(HanziConv.toSimplified("呼吸")) # 唿吸 print(HanziConv.toSimplified("猛烈")) # 勐烈 print(HanziConv.toSimplified("四週"))