Example #1
def test_pickle_simple_object():
    i = 100
    fpath = './tests/int.pkl'
    to_pickle(i, fpath)

    from_file = from_pickle(fpath)
    assert i == from_file
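
Every example on this page relies on to_pickle and from_pickle from common.persistence (see the imports in Example #16), whose definitions are not shown. A minimal sketch, assuming they are thin wrappers around the standard pickle module:

import pickle

def to_pickle(obj, fpath):
    # serialize obj to fpath (assumed wrapper; not the actual source)
    with open(fpath, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def from_pickle(fpath):
    # load and return the object pickled at fpath
    with open(fpath, 'rb') as f:
        return pickle.load(f)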
Example #3
def build_unigram_dict(data_dir):

    cwd = os.getcwdu()
    print('cwd: {0}'.format(cwd))
    print('change wd to: {0}'.format(data_dir))
    os.chdir(data_dir)

    en_dict = defaultdict(int)
    line_counter = 0
    word_counter = 0

    # scan every 1-gram data file (the 20090715 release) in data_dir
    for data_file in glob.glob('*1gram-20090715*.txt'):
        print('extracting data from: ' + data_file)

        with open(data_file) as f:
            for raw_line in f:
                line_counter += 1
                word, freq = read_word_freq(raw_line)
                if word is not None:
                    word_counter += 1
                    en_dict[word] += freq

    to_pickle(en_dict, './en_dict.pkl')
    print('total lines: {}'.format(line_counter))
    print('total word lines: {}'.format(word_counter))
    print('total unique words: {}'.format(len(en_dict)))

    print('change wd back to: {0}'.format(cwd))
    os.chdir(cwd)
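
read_word_freq is not shown here. A hypothetical sketch, assuming each line of the 1-gram files is tab-separated with the word in the first column and its count in the third (the layout of the 2009-07-15 Google Books 1-gram release), returning (None, 0) for lines that do not parse:

def read_word_freq(raw_line):
    # hypothetical parser: word <tab> year <tab> match_count <tab> ...
    parts = raw_line.rstrip('\n').split('\t')
    if len(parts) < 3 or not parts[2].isdigit():
        return None, 0
    word = parts[0].lower()
    if not word.isalpha():
        # skip tokens containing punctuation or digits (an assumption)
        return None, 0
    return word, int(parts[2])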
Example #4
def clear_non_sentiments():
    all_reviews = from_pickle('./all_reviews.pkl')
    print(len(all_reviews))

    has_sentiment = [(score, review) for (score, review) in all_reviews if score > 0 and len(review) > 0]
    to_pickle(has_sentiment, './sent_reviews.pkl')
    print(len(has_sentiment))

    to_pickle(has_sentiment[:10000], './train0.pkl')
Example #6
def test_pickle_complex_object():
    d = {'u': u'中文', 's': 'hello', 'i': 100, 'l': [1, [2, 3]]}

    fpath = './tests/complex.pkl'
    to_pickle(d, fpath)

    from_file = from_pickle(fpath)
    assert isinstance(from_file['u'], unicode)
    assert isinstance(from_file['s'], str)
    assert isinstance(from_file['i'], int)
    assert isinstance(from_file['l'], list)

    assert from_file == d
Example #8
def save_keywords(html_file):
    html = read_all(html_file)
    soup = make_soup(html)

    main_navs = soup.find('div', 'mainNavs')
    cat_tags = main_navs.find_all('div', 'menu_box')
    cats = {}
    for i, cat in enumerate(cat_tags):
        cats[i] = category_contents(cat)

    for i in cats:
        cat = cats[i]
        print(u'** {0} **'.format(cat['name']))
        for sub in cat['sub_cats']:
            print(sub[0])
            for skill, href in sub[1]:
                print('\t-' + skill + ' - ' + href)

    to_pickle(cats, 'cats.pkl')
Example #9
def save_degree_dict():
    dict_file = './data/degree_zh.txt'
    lines = chinese.read_lines(dict_file)

    degree = 0
    degree_dict = defaultdict(int)
    for l in lines:
        l = l.strip()

        if l and (not is_comment(l)):
            if l[0].isdigit():
                # header line such as '1-6': the number after the dash
                # becomes the degree assigned to the words that follow
                parts = l.split('-')
                assert len(parts) == 2
                degree = int(parts[1])
            else:
                degree_dict[l] = degree

    for k in degree_dict:
        print(k, degree_dict[k])

    to_pickle(degree_dict, './data/degree_zh.pkl')
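
For reference, a hypothetical excerpt of degree_zh.txt consistent with the parser above (not taken from the source): a numeric header such as 1-6 sets the current degree to 6, and every word line that follows is assigned that degree until the next header.

# degree_zh.txt (hypothetical excerpt)
# 1-6
# 极其
# 最为
# 2-5
# 非常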
Example #10
def load_job_data():

    start = datetime.datetime.now()

    cats = from_pickle('cats.pkl')

    os.chdir('./html/detail')
    print(os.getcwd())

    detail_html_paths = []
    for i in cats:
        cat = cats[i]
        print(u'** cat {0}: {1} **'.format(i, cat['name']))
        for sub in cat['sub_cats']:
            for skill, href in sub[1]:
                print('loading... ' + skill)
                if '#' in skill or '/' in skill:
                    # the html files were saved with unsafe characters quoted
                    skill = quoted(skill)

                for f in glob.glob(skill + u'_*.html'):
                    hp = os.path.join(os.getcwdu(), f)
                    # print(hp)
                    detail_html_paths.append((skill, hp))

    os.chdir('../..')
    print(os.getcwd())

    all_jobs = []
    all_comps = {}
    all_jc = {}
    counter = 0
    for skill, detail_file in detail_html_paths:
        job = read_job_from_html(skill, detail_file)
        if job:
            j, c = job

            all_jobs.append(j)
            if c.comp_id not in all_comps:
                all_comps[c.comp_id] = c

            counter += 1
            if counter % 50 == 0:
                print(counter)
            uni = unified(j, c)
            all_jc[j.job_id] = uni

    print(counter)
    print(len(all_jobs))
    print(len(all_comps))
    print(len(all_jc))

    to_pickle(all_jobs, 'jobs.pkl')
    to_pickle(all_comps, 'comps.pkl')
    to_pickle(all_jc, 'unified.pkl')

    print(start)
    print(datetime.datetime.now())
Example #11
def save_pos_sentiment_dict():
    dict_file = './data/pos_sent_zh.txt'
    lines = chinese.read_lines(dict_file)
    dict_data = [w.strip() for w in lines if w.strip()]
    to_pickle(dict_data, './data/pos_sent_zh.pkl')
Example #12
    #             "or name like '%C#%' or name like '%搜索算法%' or name like '%Hadoop%' "
    #             "or name like '%交互设计师%' or name like '%数据分析师%' or name like '%Java%'")
rows = cur.fetchall()

print(len(rows))

n = 1000000
cnt_word_doc = Counter()
cnt_words = Counter()
for pos in rows[:n]:
    desc = strip_tags(pos['desc'])
    tokens = jieba.tokenize(desc, mode="search")
    words = [t[0] for t in tokens]
    per_doc = Counter(words)  # term counts within this one description
    for k in per_doc:
        cnt_word_doc[k] += 1  # document frequency: +1 per description containing k
        cnt_words[k] += per_doc[k]  # corpus-wide term frequency

to_pickle(cnt_words, 'cnt_words.pkl')
to_pickle(cnt_word_doc, 'cnt_word_doc.pkl')
# rows = from_pickle('pos_list.pkl')

for word, count in cnt_words.most_common(200):
    print u'{0}: {1}'.format(word, count)

print
print
print '++in docs++'
for word, count in cnt_word_doc.most_common(200):
    print u'{0}: {1}'.format(word, count)
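
cnt_words holds corpus-wide term frequencies and cnt_word_doc holds document frequencies (how many descriptions contain each word). A hypothetical follow-up that scores words by a simple tf-idf-style weight, assuming n_docs is the number of descriptions processed above:

import math

def tfidf_score(word, n_docs):
    # high for words that are frequent overall but rare across descriptions
    return cnt_words[word] * math.log(float(n_docs) / (1 + cnt_word_doc[word]))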
Example #13
def save_chinese_stopwords():
    dict_file = './data/stopwords_zh_hit.txt'
    lines = chinese.read_lines(dict_file)
    dict_data = [w.strip() for w in lines if w.strip()]
    to_pickle(dict_data, './data/stopwords_zh.pkl')
Example #14
def save_inverse_dict():
    dict_file = './data/inverse_zh.txt'
    lines = chinese.read_lines(dict_file)

    inv_dict = [w.strip() for w in lines if w.strip() and (not is_comment(w))]
    to_pickle(inv_dict, './data/inverse_zh.pkl')
Example #15
def save_neg_sentiment_dict():
    dict_file = './data/neg_sent_zh.txt'
    lines = chinese.read_lines(dict_file)
    dict_data = [w.strip() for w in lines if w.strip()]
    to_pickle(dict_data, '../douban/movies/dicts/neg_sent_zh.pkl')
Example #16
import sqlite3 as sqlite

from common.html_util import strip_tags
from common.persistence import to_pickle, from_pickle

con = sqlite.connect('lagou.db')

with con:

    con.row_factory = sqlite.Row
    cur = con.cursor()
    cur.execute("select * from position where subcategory in ('后端开发', '前端开发', '用户研究')")
    rows = cur.fetchall()

    positions = []
    for row in rows:
        if row['desc'] != 'n/a':
            positions.append((row['pos_id'], row['name'], row['industry'], row['desc']))

    print(len(positions))
    # sanity check: print each field of the first position tuple
    for p in positions[0]:
        print p

    to_pickle(positions, 'positions.pkl')

#####
raw_positions = positions
pos_dict = {}
for rp in raw_positions:
    pos_dict[int(rp[0])] = (rp[1], rp[2], strip_tags(rp[3]))

to_pickle(pos_dict, 'pos_norm.pkl')
Example #17
def save(reviews, pickle_file):
    to_pickle(reviews, pickle_file)
Example #19
# coding=utf-8
from common.chinese import read_lines
from common.persistence import to_pickle


def chinese():
    hit = set([line.strip() for line in read_lines('stopwords_zh.txt')])
    lagou = set([line.strip() for line in read_lines('stopwords_lagou.txt')])
    en = set([line.strip() for line in read_lines('stopwords_en.txt')])
    ws = {u'', u' ', u'\t', u'\n'}

    return en | hit | lagou | ws


if __name__ == '__main__':
    stopwords = chinese()
    to_pickle(stopwords, 'stopwords.pkl')
    print 'he' in stopwords
    print u'he' in stopwords

    print u'他' in stopwords
    print u'职位' in stopwords
    print u'负责' in stopwords