def bing_search(query, pages=10, search_type='Web'):
    """Query the Bing Azure Search API and return the result list.

    Results are cached on disk as raw JSON under ``html_path``; a cached
    response is returned without hitting the network.

    :param query: search string (raw, unquoted).
    :param pages: number of result pages to request (10 results per page).
    :param search_type: Bing API vertical, e.g. ``'Web'``.
    :return: the ``data['d']['results']`` list parsed from the JSON response.
    """
    # Build the cache path from the RAW query, before URL-quoting.
    # BUG FIX: the original checked the cache with the raw query but wrote
    # it with the quoted query, so the two filenames disagreed for any query
    # containing quotable characters (spaces etc.) and the cache never hit.
    cache_file = '%ssearch_result_%s.html' % (html_path, query.replace('/', '_'))
    if os.path.exists(cache_file):
        with open(cache_file, 'rb') as f:
            res = f.read()
        data = json.loads(res)
        return data['d']['results']
    query = urllib.quote(query)
    # Azure Datamarket uses HTTP Basic auth with an empty user name and the
    # API key as password; strip the trailing newline base64-encoding adds.
    credentials = (':%s' % bing_api_key).encode('base64')[:-1]
    authorization = 'Basic %s' % credentials  # fixed typo: was "authorrization"
    search_url = 'https://api.datamarket.azure.com/Bing/Search/v1/' \
        '%(search_type)s?Query=%%27%(query)s%%27&$top=%(number)s&$format=json' % \
        {'search_type': search_type, 'query': query, 'number': pages * 10}
    headers = {'Authorization': authorization}
    # `get` / `gunzip` / `write_file` are project helpers defined elsewhere.
    res = gunzip(get(search_url, headers).read())
    write_file(cache_file, res, True)
    data = json.loads(res)
    return data['d']['results']
def get_all_corpus(website): print '%s%s.txt' % (corpus_path, website[0]) if os.path.exists(u'%s%s.txt' % (corpus_path, website[0])): return open(u'%s%s.txt' % (corpus_path, website[0]), 'rb').read() content = '%s\n%s' % (get_corpus_by_baidu(website), get_corpus_by_bing(website)) split_words = split_word_only(content) write_file(u'%s%s.txt' % (corpus_path, website[0]), split_words, debug_flag) return split_words
def expand(word): file_path = u'%s%s' % (expanded_words_path, word) if os.path.exists(file_path): return open(file_path, 'rb').read() if debug_flag: print 'wiki expanding ... ' p = search(word) content = get_content(p) write_file(file_path, content, True) return content
def baidu_search(query, pages=10, search_type='Web'):
    """Run a paged Baidu search and return the accumulated result list.

    The combined results are cached on disk as JSON; a cached file is
    parsed and returned directly on subsequent calls.
    """
    cache_file = u'%ssearch_result_%s.html' % (html_path, query.replace('/', '_'))
    if os.path.exists(cache_file):
        cached = open(cache_file, 'rb').read()
        return json.loads(cached)
    # Cache miss: fetch each page and concatenate the per-page result lists.
    results = []
    for page_no in range(pages):
        results += baidu_search_single_search(query, page_no, search_type)
    write_file(cache_file, json.dumps(results), True)
    return results
def split_word(sentence, filename='', cut_all=True, show_nominal=False): split_words = '' if not show_nominal: word_list = jieba.cut(sentence, cut_all) else: word_list = pseg.cut(sentence, cut_all) for word in word_list: if debug_flag: print word split_words += '%s ' % word if not filename == '': write_file(filename, split_words) return split_words