from urllib.request import urlopen

from common.chinese import write


def save_html(url, file_name):
    # Download the page and save a copy under ./html/ for later parsing.
    # urlopen expects a str URL, and the response bytes are decoded as UTF-8.
    html = urlopen(url).read()
    write('./html/' + file_name, html.decode('utf-8'))
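# A minimal usage sketch (not part of the original source): the URL and file
# name below are hypothetical, and save_html assumes the ./html/ directory
# exists, so it is created here first.
if __name__ == '__main__':
    import os
    os.makedirs('./html', exist_ok=True)  # create the output directory if needed
    save_html('https://example.com/page', 'page.html')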
# coding=utf-8
import glob
import os

import jieba

from common.chinese import read_lines, write
from common.persistence import from_pickle

stopwords = set(from_pickle('stopwords.pkl'))
print(len(stopwords))

for fname in glob.glob('*.txt'):
    print(fname + ' started')
    name_without_ext = os.path.splitext(fname)[0]
    segmented = []
    for line in read_lines(fname):
        parts = line.strip().split('\t')
        if len(parts) < 3:
            # Skip malformed lines that lack the expected three tab-separated fields.
            continue
        seg_list = jieba.cut(parts[2], cut_all=False)  # accurate (non-full) mode
        seg_list = [seg for seg in seg_list if seg not in stopwords]
        segmented.append(' '.join(seg_list))
    write(name_without_ext + '.seg', '\n'.join(segmented))
    print(fname + ' done')
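# For reference, jieba.cut returns a generator of tokens; in accurate mode
# (cut_all=False) the sentence is split into non-overlapping words, which is
# why joining with spaces yields the segmented line written to the .seg file.
# Example taken from jieba's own documentation:
#
# >>> '/ '.join(jieba.cut('我来到北京清华大学', cut_all=False))
# '我/ 来到/ 北京/ 清华大学'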
import os
import time
from urllib.error import URLError
from urllib.request import urlopen

from bs4 import BeautifulSoup

from common.chinese import write


def make_soup(html):
    return BeautifulSoup(html, "lxml")


def save_jobs_html(url, skill, pn=1):
    try:
        html = urlopen(url).read()
    except URLError as e:
        if hasattr(e, 'code'):
            print('http error occurred for skill: {0}, code: {1}'.format(skill, e.code))
        elif hasattr(e, 'reason'):
            print('server not reachable: {0}'.format(e.reason))
        return ''
    else:
        # Save the raw page so repeated runs can skip already-fetched results.
        write('./html/{0}_{1}.html'.format(skill, pn), html.decode('utf-8'))
        return html


def download_search_results(skill_url_fmt, kd, city, page):
    html_file = './html/{0}_{1}.html'.format(kd, page)
    if os.path.exists(html_file):
        print(html_file + ' already exists')
        return
    url = make_search_url(skill_url_fmt, kd, city, page)
    html = save_jobs_html(url, kd, page)
    time.sleep(2)  # throttle requests between pages
    if not html:
        return  # download failed; give up on this page
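# make_search_url is called above but not defined in this snippet. A minimal
# sketch, assuming skill_url_fmt is a format string with {kd}, {city}, and {pn}
# placeholders (the placeholder names and URL shape are assumptions, not the
# original implementation); quote() percent-encodes non-ASCII keywords such as
# Chinese skill names so the resulting URL is safe to pass to urlopen:
from urllib.parse import quote


def make_search_url(skill_url_fmt, kd, city, page):
    # e.g. skill_url_fmt = 'https://example.com/jobs?kd={kd}&city={city}&pn={pn}'
    return skill_url_fmt.format(kd=quote(kd), city=quote(city), pn=page)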