Beispiel #1
0
def save_html(url, file_name):
    """Fetch *url* and store the decoded page under ./html/<file_name>.

    The URL is encoded to UTF-8 bytes before the request (Python 2
    urllib accepts byte-string URLs); the response body is decoded
    from UTF-8 before being written.

    Args:
        url: page URL as a unicode string.
        file_name: bare file name to create inside ./html/.
    """
    # NOTE(review): assumes the response body is UTF-8 — confirm the
    # target site's charset before relying on this.
    html = urlopen(url.encode('utf-8')).read()
    write('./html/' + file_name, html.decode('utf-8'))
Beispiel #2
0
def save_html(url, file_name):
    """Download *url*, echoing progress to stdout, and save the page.

    Prints the URL, target file name, and raw response body, then
    writes the UTF-8 decoded HTML to ./html/<file_name>.
    """
    print(url)
    print(file_name)
    response = urlopen(url)
    page = response.read()
    print(page)
    destination = './html/' + file_name
    write(destination, page.decode('utf-8'))
Beispiel #3
0
# coding=utf-8
import glob, os, path
import jieba
from common.chinese import read_lines, write
from common.persistence import from_pickle

# Load the stopword list once, up front; report its size.
stopwords = set(from_pickle('stopwords.pkl'))
print(len(stopwords))

# Segment every .txt file in the working directory with jieba,
# dropping stopwords, and emit one <name>.seg file per input.
for fname in glob.glob('*.txt'):
    print(fname + ' started')

    base_name = os.path.splitext(fname)[0]

    segmented = []
    for line in read_lines(fname):
        fields = line.strip().split('\t')

        # Rows with fewer than three tab-separated fields carry no
        # text payload — skip them.
        if len(fields) < 3:
            continue

        tokens = [tok for tok in jieba.cut(fields[2], cut_all=False)
                  if tok not in stopwords]
        segmented.append(' '.join(tokens))

    write(base_name + '.seg', '\n'.join(segmented))

    print(fname + ' done')
Beispiel #4
0
def make_soup(html):
    """Parse *html* with the lxml backend and return the soup object."""
    soup = BeautifulSoup(html, "lxml")
    return soup


def save_jobs_html(url, skill, pn=1):
    """Download one page of job-search results and cache it under ./html/.

    Args:
        url: search-results URL (unicode; encoded to UTF-8 for urlopen).
        skill: keyword used in the cached file's name and log messages.
        pn: page number, used in the cached file's name (default 1).

    Returns:
        The raw byte-string HTML on success, or '' when the request fails.
    """
    try:
        html = urlopen(url.encode('utf-8')).read()
    except URLError as e:  # 'as' form is valid on Python 2.6+ and 3.x
        if hasattr(e, 'code'):
            # HTTPError carries an HTTP status code.
            print(u'http error occured for skill: {0}, code: {1}'.format(skill, e.code))
        elif hasattr(e, 'reason'):
            # BUG FIX: e.reason is often a socket.error object rather than
            # a str, so the original "+ e.reason" concatenation could raise
            # TypeError. Format it instead.
            print('server not reachable: {0}'.format(e.reason))

        return ''
    else:
        write(u'./html/{0}_{1}.html'.format(skill, pn), html.decode('utf-8'))
        return html


def download_search_results(skill_url_fmt, kd, city, page):

    html_file = u'./html/{0}_{1}.html'.format(kd, page)
    if os.path.exists(html_file):
        print(html_file + u' already exists')
        return

    url = make_search_url(skill_url_fmt, kd, city, page)
    html = save_jobs_html(url, kd, page)
    time.sleep(2)

    if not html:
Beispiel #5
0
def save_html(url, file_name):
    """Retrieve *url* and write the UTF-8 decoded body to ./html/<file_name>."""
    raw_url = url.encode('utf-8')
    page = urlopen(raw_url).read()
    target = './html/' + file_name
    write(target, page.decode('utf-8'))