def _fetch_baike_description(term):
    """Return ``craw_baike``'s result for the page looked up for *term*.

    Builds ``Page(term)``, reads the ``'url'`` entry of its ``get_info()``
    result and hands that URL to ``craw_baike``.  Nothing is caught here, so
    ``PageError`` (and any other exception) propagates to the caller.
    """
    info = Page(term).get_info()
    return craw_baike(info['url'])


def get_baike_pages(items, delay=5, output_path='major_decription.txt'):
    """Crawl a description for every entry of *items* and save the results.

    Each item is stripped and looked up directly.  If that raises
    ``PageError``, the item is split with ``pullword(item, threshold=0.7)``
    and the first word of the first result is looked up instead.  Items for
    which the fallback also fails (or yields no words) are reported on
    stdout and skipped.

    Args:
        items: iterable of strings to look up.
        delay: seconds to sleep before each lookup (default 5).
        output_path: file that receives one ``item::description`` line per
            successful lookup, written as UTF-8.

    Returns:
        A list of ``[item, description]`` lists, where ``item`` is the
        stripped original item (never the fallback word).
    """
    item_list = []
    for i, item in enumerate(items):
        time.sleep(delay)
        if i % 10 == 0:
            # Progress report every tenth item.
            print("%s %s" % (i, item))
        item = item.strip()
        try:
            description = _fetch_baike_description(item)
        except PageError:
            word_list = pullword(item, threshold=0.7)
            if not word_list:
                # Nothing to fall back to; indexing [0][0] would raise
                # IndexError and abort the crawl before anything is saved.
                print('sorry, I do not understand')
                continue
            try:
                description = _fetch_baike_description(word_list[0][0])
            except PageError:
                print('sorry, I do not understand')
                continue
        item_list.append([item, description])

    # The context manager closes the file even if a write fails unexpectedly.
    with codecs.open(output_path, 'wb', 'utf8') as fp:
        for item, des in item_list:
            try:
                fp.write(item + "::" + des + '\n')
            except UnicodeDecodeError:
                # %r keeps this diagnostic from raising a second encoding
                # error when item and des mix byte and unicode strings.
                print("%r %r" % (item, des))
    return item_list
# coding=utf-8
from pullword import pullword

# Run pullword once on a short sample sentence; the return value is not used.
# The second positional argument (1) is passed through unchanged.
sentence = u"马总真是个好人"
pullword(sentence, 1)
# coding=utf-8
from pullword import pullword

# Feed input.txt to pullword one line at a time and append each result,
# UTF-8 encoded, to output.txt.  The with-block closes both files even when
# pullword or the decode/encode step raises.
with open("input.txt", 'r') as fin, open("output.txt", 'a') as fout:
    for ftext in fin:
        # NOTE(review): .decode() on a line read in text mode only works
        # where lines are byte strings (Python 2) -- confirm target version.
        text = ftext.decode('utf-8')
        # NOTE(review): .encode() below assumes pullword returns a text
        # string here -- TODO confirm against the installed pullword.
        result = pullword(text)
        fout.write(result.encode('utf-8'))
#!/usr/bin/env python
# coding=utf-8
from pullword import pullword

# Print pullword's result for the same phrase with default options, with
# debug=0 and with threshold=1.  print(...) with a single argument behaves
# like the print statement on Python 2 and is also valid on Python 3.
print(pullword(u"华中科技大学"))
print(pullword(u"华中科技大学", debug=0))
print(pullword(u"华中科技大学", threshold=1))
#!/usr/bin/env python
# coding=utf-8
from pullword import pullword

# Run pullword over one long paragraph, print the raw result and dump it to
# a file.
s = pullword(
    u"习近平指出,中柬两国人民传统友谊源远流长。中柬友好是由中国老一辈领导人和西哈努克太皇共同缔造和精心培育的,弥足珍贵。进入新的历史时期,中柬关系又增添新的活力,得到长足发展。去年我同西哈莫尼国王成功互访。当前,两国政治上高度互信,经济上互利合作,在实现国家发展中互帮互助,在国际和地区事务中密切配合。柬埔寨王室长期以来积极致力于中柬友好事业,为两国关系发展作出了重要贡献。我们愿同柬方携手努力,推动中柬全面战略合作不断迈上新台阶,更好造福两国人民。"
)
# Single-argument print(...) matches the print statement on Python 2 and is
# also valid on Python 3.
print(s)
# For every entry, write its first two elements concatenated and followed by
# a space.
# NOTE(review): assumes both elements are strings -- verify against the
# installed pullword's return format.
with open('/Users/cheng/Downloads/xxxx', 'w') as f:
    f.writelines([ss[0] + ss[1] + ' ' for ss in s])
# -*- coding: utf-8 -*-
from pullword import pullword

# Query the same phrase three times -- with default options, with debug=0
# and with threshold=1 -- printing each result in that order.
phrase = u"华中科技大学"
for extra_options in ({}, {'debug': 0}, {'threshold': 1}):
    print(pullword(phrase, **extra_options))
#!/usr/bin/env python
# coding=utf-8
from pullword import pullword
import codecs
import sys

#pullword(u"华中科技大学")
#print pullword(u"华中科技大学", debug=0)
#print pullword(u"华中科技大学", threshold=1)

# Run pullword (threshold=0.8, debug=1) on every line of ill.data.  Each
# result is echoed to stdout and its str() form is written to temp.txt with
# no separator.  The with-block closes both files even if pullword raises.
with open('temp.txt', 'w') as f1, codecs.open('ill.data', 'r', 'utf-8') as f2:
    for line in f2:
        word_list = pullword(line, threshold=0.8, debug=1)
        # Keeps results on one line separated by spaces, as the old
        # trailing-comma print statement did, in a form that runs on both
        # Python 2 and Python 3.
        sys.stdout.write(str(word_list) + ' ')
        f1.write(str(word_list))