def test_pickle_simple_object():
    """Round-trip a plain int through to_pickle/from_pickle."""
    value = 100
    target = './tests/int.pkl'
    to_pickle(value, target)
    restored = from_pickle(target)
    assert restored == value
def build_unigram_dict(data_dir):
    """Aggregate Google 1-gram frequency files under *data_dir* into a
    word -> total-frequency dict and pickle it as en_dict.pkl inside
    *data_dir*.

    Prints progress counters as it goes.

    Fix: the original chdir'd into *data_dir* and only chdir'd back at the
    end, so any exception while reading left the process in the wrong
    working directory.  The restore now runs in a ``finally`` block.
    """
    cwd = os.getcwdu()
    print('cwd: {0}'.format(cwd))
    print('change wd to: {0}'.format(data_dir))
    os.chdir(data_dir)
    try:
        en_dict = defaultdict(int)
        line_counter = 0
        word_counter = 0
        for data_file in glob.glob('*1gram-20090715*.txt'):
            print('extracting data from: ' + data_file)
            with open(data_file) as f:
                for raw_line in f:
                    line_counter += 1
                    # read_word_freq yields (None, ...) for non-word lines,
                    # which are counted in line_counter but otherwise skipped
                    word, freq = read_word_freq(raw_line)
                    if word is not None:
                        word_counter += 1
                        en_dict[word] += freq
        to_pickle(en_dict, './en_dict.pkl')
        print('total lines: {}'.format(line_counter))
        print('total word lines: {}'.format(word_counter))
        print('total unique words: {}'.format(len(en_dict)))
    finally:
        # always restore the caller's working directory
        print('change wd back to: {0}'.format(cwd))
        os.chdir(cwd)
def clear_non_sentiments():
    """Keep only reviews with a positive score and non-empty text;
    pickle the filtered list and a 10k-item training slice."""
    all_reviews = from_pickle('./all_reviews.pkl')
    print(len(all_reviews))
    has_sentiment = []
    for score, review in all_reviews:
        if score > 0 and len(review) > 0:
            has_sentiment.append((score, review))
    to_pickle(has_sentiment, './sent_reviews.pkl')
    print(len(has_sentiment))
    to_pickle(has_sentiment[:10000], './train0.pkl')
def test_pickle_complex_object():
    """Round-trip a nested dict and check each value's type survives
    pickling, then check full equality."""
    original = {'u': u'中文', 's': 'hello', 'i': 100, 'l': [1, [2, 3]]}
    fpath = './tests/complex.pkl'
    to_pickle(original, fpath)
    loaded = from_pickle(fpath)
    for key, expected_type in (('u', unicode), ('s', str),
                               ('i', int), ('l', list)):
        assert isinstance(loaded[key], expected_type)
    assert loaded == original
def save_keywords(html_file):
    """Parse the category navigation out of *html_file*, print every
    category / sub-category / skill link, and pickle the result as
    cats.pkl (dict keyed by category index)."""
    soup = make_soup(read_all(html_file))
    main_navs = soup.find('div', 'mainNavs')
    cats = {}
    for idx, box in enumerate(main_navs.find_all('div', 'menu_box')):
        cats[idx] = category_contents(box)
    for idx in cats:
        category = cats[idx]
        print(u'** {0} **'.format(category['name']))
        for sub in category['sub_cats']:
            print(sub[0])
            for skill, href in sub[1]:
                print('\t-' + skill + ' - ' + href)
    to_pickle(cats, 'cats.pkl')
def save_degree_dict():
    """Build a word -> degree-strength dict from data/degree_zh.txt and
    pickle it to data/degree_zh.pkl.

    The source file interleaves section headers of the form "<n>-<degree>"
    (a line starting with a digit) with word lines; each word takes the
    degree of the most recent header.  Blank lines and comment lines
    (per ``is_comment``) are skipped.

    Fix: the original validated the header format with a bare ``assert``,
    which is stripped under ``python -O``; a malformed header now raises
    ValueError explicitly.
    """
    dict_file = './data/degree_zh.txt'
    lines = chinese.read_lines(dict_file)
    degree = 0
    degree_dict = defaultdict(int)
    for raw in lines:
        line = raw.strip()
        if not line or is_comment(line):
            continue
        if line[0].isdigit():
            # Section header, e.g. "1-6": text after '-' is the degree value.
            parts = line.split('-')
            if len(parts) != 2:
                raise ValueError(
                    'malformed degree header: {0!r}'.format(line))
            degree = int(parts[1])
        else:
            degree_dict[line] = degree
    for word in degree_dict:
        print(word, degree_dict[word])
    to_pickle(degree_dict, './data/degree_zh.pkl')
def load_job_data():
    # Walk the category tree pickled by save_keywords, collect every
    # downloaded job-detail HTML file per skill, parse each into job and
    # company records, and pickle three views: jobs.pkl, comps.pkl, and
    # unified.pkl (job_id -> unified(job, company)).
    # NOTE(review): relies on os.chdir into ./html/detail and back ('../..');
    # an exception mid-scan would leave the process in the wrong directory.
    start = datetime.datetime.now()
    cats = from_pickle('cats.pkl')
    os.chdir('./html/detail')
    print(os.getcwd())
    detail_html_paths = []  # list of (skill, absolute html path)
    for i in cats:
        cat = cats[i]
        print(u'** cat {0}: {1} **'.format(i, cat['name']))
        for sub in cat['sub_cats']:
            for skill, href in sub[1]:
                print('loading... ' + skill)
                # skills containing '#' or '/' were saved under a quoted
                # filename (presumably URL-escaped -- see quoted())
                if '#' in skill or '/' in skill:
                    skill = quoted(skill)
                for f in glob.glob(skill + u'_*.html'):
                    hp = os.path.join(os.getcwdu(), f)
                    # print(hp)
                    detail_html_paths.append((skill, hp))
    os.chdir('../..')
    print(os.getcwd())
    all_jobs = []
    all_comps = {}  # comp_id -> company record (first occurrence wins)
    all_jc = {}     # job_id -> unified(job, company)
    counter = 0
    for skill, detail_file in detail_html_paths:
        # read_job_from_html returns a (job, company) pair, or a falsy
        # value when the page could not be parsed
        job = read_job_from_html(skill, detail_file)
        if job:
            j, c = job
            all_jobs.append(j)
            if c.comp_id not in all_comps:
                all_comps[c.comp_id] = c
            counter += 1
            if counter % 50 == 0:
                # progress indicator every 50 parsed pages
                print(counter)
            uni = unified(j, c)
            all_jc[j.job_id] = uni
    print(counter)
    print(len(all_jobs))
    print(len(all_comps))
    print(len(all_jc))
    to_pickle(all_jobs, 'jobs.pkl')
    to_pickle(all_comps, 'comps.pkl')
    to_pickle(all_jc, 'unified.pkl')
    print(start)
    print(datetime.datetime.now())
def save_pos_sentiment_dict():
    """Load the positive-sentiment word list, drop blank lines, and
    pickle it to data/pos_sent_zh.pkl."""
    source = './data/pos_sent_zh.txt'
    words = []
    for raw in chinese.read_lines(source):
        word = raw.strip()
        if word:
            words.append(word)
    to_pickle(words, './data/pos_sent_zh.pkl')
# "or name like '%C#%' or name like '%搜索算法%' or name like '%Hadoop%' "
# "or name like '%交互设计师%' or name like '%数据分析师%' or name like '%Java%'")
# Count word frequencies over job descriptions: cnt_words holds total
# occurrences per token, cnt_word_doc holds document frequency (number of
# descriptions containing the token).  Both Counters are pickled, then the
# top 200 of each are printed.
rows = cur.fetchall()
print(len(rows))
n = 1000000  # cap on rows processed
cnt_word_doc = Counter()  # token -> number of docs containing it
cnt_words = Counter()     # token -> total occurrences
for pos in rows[:n]:
    desc = strip_tags(pos['desc'])
    tokens = jieba.tokenize(desc, mode="search")
    words = [t[0] for t in tokens]
    # Fix: this per-document Counter was named `cur`, shadowing the DB
    # cursor bound above -- renamed to avoid the hazard.
    per_doc = Counter(words)
    for k in per_doc:
        cnt_word_doc[k] += 1
        cnt_words[k] += per_doc[k]
to_pickle(cnt_words, 'cnt_words.pkl')
to_pickle(cnt_word_doc, 'cnt_word_doc.pkl')
# rows = from_pickle('pos_list.pkl')
for k in cnt_words.most_common(200):
    print(u'{0}: {1}'.format(k[0], k[1]))
print('')
print('')
print('++in docs++')
for k in cnt_word_doc.most_common(200):
    print(u'{0}: {1}'.format(k[0], k[1]))
def save_chinese_stopwords():
    """Load the HIT Chinese stopword list, drop blank lines, and pickle
    it to data/stopwords_zh.pkl."""
    source = './data/stopwords_zh_hit.txt'
    stripped = (w.strip() for w in chinese.read_lines(source))
    stopwords = [w for w in stripped if w]
    to_pickle(stopwords, './data/stopwords_zh.pkl')
def save_inverse_dict():
    """Load the inversion (negation) word list, skipping blank and
    comment lines, and pickle it to data/inverse_zh.pkl."""
    source = './data/inverse_zh.txt'
    inv_dict = []
    for raw in chinese.read_lines(source):
        word = raw.strip()
        # is_comment is applied to the raw line, matching the original
        if word and not is_comment(raw):
            inv_dict.append(word)
    to_pickle(inv_dict, './data/inverse_zh.pkl')
def save_neg_sentiment_dict():
    """Load the negative-sentiment word list, drop blank lines, and
    pickle it.

    NOTE(review): the output lands under ../douban/movies/dicts/, unlike
    save_pos_sentiment_dict which writes to ./data/ -- confirm this path
    split is intentional.
    """
    source = './data/neg_sent_zh.txt'
    words = []
    for raw in chinese.read_lines(source):
        word = raw.strip()
        if word:
            words.append(word)
    to_pickle(words, '../douban/movies/dicts/neg_sent_zh.pkl')
from common.html_util import strip_tags
from common.persistence import to_pickle, from_pickle

# Extract positions from the lagou.db sqlite dump and pickle them in two
# shapes: a raw tuple list (positions.pkl) and a normalized
# pos_id -> (name, industry, plain-text desc) dict (pos_norm.pkl).
# NOTE(review): `sqlite` is not imported in this chunk -- presumably bound
# (e.g. `import sqlite3 as sqlite`) elsewhere in the file; verify.
con = sqlite.connect('lagou.db')
with con:
    con.row_factory = sqlite.Row
    cur = con.cursor()
    cur.execute("select * from position where subcategory in ('后端开发', '前端开发', '用户研究')")
    rows = cur.fetchall()
    positions = []
    for row in rows:
        # skip rows whose description is the literal placeholder 'n/a'
        if row['desc'] != 'n/a':
            positions.append((row['pos_id'], row['name'], row['industry'], row['desc']))
    print(len(positions))
    # spot-check: print the fields of the first position tuple
    for p in positions[0]:
        print p
    to_pickle(positions, 'positions.pkl')
    #####
    # second pass: normalize into a dict keyed by int pos_id, with the
    # HTML description flattened to plain text
    raw_positions = positions
    pos_dict = {}
    for rp in raw_positions:
        pos_dict[int(rp[0])] = (rp[1], rp[2], strip_tags(rp[3]))
    to_pickle(pos_dict, 'pos_norm.pkl')
def save(reviews, pickle_file):
    """Persist *reviews* to *pickle_file* via to_pickle (thin wrapper)."""
    to_pickle(reviews, pickle_file)
# Extract positions from the lagou.db sqlite dump and pickle them in two
# shapes: a raw tuple list (positions.pkl) and a normalized
# pos_id -> (name, industry, plain-text desc) dict (pos_norm.pkl).
# NOTE(review): near-duplicate of the same script elsewhere in this
# codebase; `sqlite` is presumably bound above this chunk -- verify.
con = sqlite.connect('lagou.db')
with con:
    con.row_factory = sqlite.Row
    cur = con.cursor()
    cur.execute(
        "select * from position where subcategory in ('后端开发', '前端开发', '用户研究')")
    rows = cur.fetchall()
    positions = []
    for row in rows:
        # skip rows whose description is the literal placeholder 'n/a'
        if row['desc'] != 'n/a':
            positions.append(
                (row['pos_id'], row['name'], row['industry'], row['desc']))
    print(len(positions))
    # spot-check: print the fields of the first position tuple
    for p in positions[0]:
        print p
    to_pickle(positions, 'positions.pkl')
    #####
    # second pass: normalize into a dict keyed by int pos_id, with the
    # HTML description flattened to plain text
    raw_positions = positions
    pos_dict = {}
    for rp in raw_positions:
        pos_dict[int(rp[0])] = (rp[1], rp[2], strip_tags(rp[3]))
    to_pickle(pos_dict, 'pos_norm.pkl')
# coding=utf-8
from common.chinese import read_lines
from common.persistence import to_pickle


def chinese():
    """Return the combined stopword set: English + HIT Chinese + lagou
    lists, plus bare whitespace tokens.

    Idiom fix: set comprehensions replace the original
    ``set([...])``-around-a-list-comprehension pattern.
    """
    hit = {line.strip() for line in read_lines('stopwords_zh.txt')}
    lagou = {line.strip() for line in read_lines('stopwords_lagou.txt')}
    en = {line.strip() for line in read_lines('stopwords_en.txt')}
    ws = {u'', u' ', u'\t', u'\n'}
    return en | hit | lagou | ws


if __name__ == '__main__':
    stopwords = chinese()
    to_pickle(stopwords, 'stopwords.pkl')
    # spot checks; single-argument print() prints the same text under
    # Python 2's print statement, so output is unchanged
    print('he' in stopwords)
    print(u'he' in stopwords)
    print(u'他' in stopwords)
    print(u'职位' in stopwords)
    print(u'负责' in stopwords)