def dump_test_make(pre_file='SMN/data/smn_test.pkl',
                   result_file='SMN/data/result_test.txt',
                   max_word_per_utterence=50,
                   output_file='SMN/data/datasets_test.pkl'):
    """ dump the prepared test datasets to file """
    version = begin_time()
    pre = pickle.load(open(pre_file, "rb"))
    revs, wordvecs, max_l2 = pre[0], pre[1], pre[2]
    datasets = make_data(revs, wordvecs.word_idx_map,
                         max_l=max_word_per_utterence)
    dump_bigger(datasets, output_file)
    end_time(version)

def origin_test_master(self, input_file, output_file, block_size=100000, test_size=2000):
    """ master of multi-threading for building the origin test sample """
    version = begin_time()
    with codecs.open(input_file, 'r', 'utf-8') as f:
        self.origin_sample = f.readlines()
    threadings = []
    num = len(self.origin_sample)
    start = 0
    end = min(block_size, num - 1)
    for block in range(int(num / block_size) + 1):
        while self.origin_sample[end] != '\r\n' and end < num - 1:
            end += 1
        work = threading.Thread(target=self.origin_sample_agent,
                                args=(start, end, block,))
        threadings.append(work)
        start = end + 1
        # next block nominally ends at (block + 2) * block_size
        end = min(num - 1, block_size * (block + 2))
    for work in threadings:
        work.start()
    for work in threadings:
        work.join()
    content = [self.content[k] for k in sorted(self.content.keys())]
    self.content = sum(content, [])
    response = [self.response[k] for k in sorted(self.response.keys())]
    self.response = sum(response, [])
    totalnum = len(self.content)
    randomlists = np.random.randint(0, totalnum, test_size)
    for index in randomlists:
        temp_context = self.content[index]
        self.test.append("1#" + temp_context + self.response[index])
        otherindexs = np.random.randint(0, totalnum, 9)
        for otherindex in otherindexs:
            while otherindex == index:
                otherindex = np.random.randint(0, totalnum, 1)[0]
            self.test.append("0#" + temp_context + self.response[otherindex])
    pickle.dump(self.test, open(output_file, 'wb'))
    end_time(version)

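# Sketch (not part of the original repo) of the test-line format built above:
# each context yields one "1#..." positive line followed by nine "0#..." lines
# whose responses are drawn at random from the other contexts.
def build_test_block(context, responses, true_idx, n_neg=9):
    """ one positive and n_neg negative lines for a single context """
    lines = ['1#' + context + responses[true_idx]]
    for _ in range(n_neg):
        neg = true_idx
        while neg == true_idx:
            neg = np.random.randint(0, len(responses))
        lines.append('0#' + context + responses[neg])
    return lines
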
def kuaidaili(self, page):
    """ kuaidaili https://www.kuaidaili.com/free/ """
    version = begin_time()
    threadings = []
    for index in range(1, page + 1):
        work = threading.Thread(target=self.kuaidailithread, args=(index,))
        threadings.append(work)
    for work in threadings:
        work.start()
    for work in threadings:
        work.join()
    self.threadjude()
    end_time(version)

def make_data_train(revs, word_idx_map, max_l=50, validation_num=50000, block_size=200000):
    """ Transforms sentences into a 2-d matrix. """
    version = begin_time()
    test = []
    threadings = queue.Queue()
    waitthreadings = queue.Queue()
    num = len(revs)
    start = 0
    end = min(block_size, num - 1)
    for block in range(int(num / block_size) + 1):
        work = threading.Thread(
            target=make_data_theading,
            args=(revs, word_idx_map, max_l, validation_num, start, end,))
        threadings.put(work)
        start = end + 1
        end = min(num - 1, block_size * (block + 2))
    while not threadings.empty():
        tempwork = threadings.get()
        tempwork.start()
        waitthreadings.put(tempwork)
    while not waitthreadings.empty():
        waitthreadings.get().join()
    global trains, vals
    train = sum(trains, [])
    val = sum(vals, [])
    train = np.array(train, dtype="int")
    val = np.array(val, dtype="int")
    test = np.array(test, dtype="int")
    print('training data', len(train), 'val data', len(val),
          'spend time:', spend_time(version))
    return [train, val, test]

def origin_result_direct(self, input_file1, input_file2, output_file):
    """ build origin result directly, no threading """
    version = begin_time()
    pre = []
    dataset = []
    with codecs.open(input_file1, 'r', 'utf-8') as f:
        temp_context = ''
        last_index = ''
        for tempword in f:
            if tempword == '\r\n':
                pre.append("1#" + temp_context + last_index)
                temp_context = ''
                last_index = ''
            else:
                if len(last_index):
                    temp_context += (last_index + '#')
                last_index = tempword[:-1].strip()
    with codecs.open(input_file2, 'r', 'utf-8') as f:
        temp_context = []
        index = 0
        totalnum = len(pre)
        for tempword in f:
            if tempword == '\r\n':
                if len(temp_context) < 9:
                    continue
                elif len(temp_context) == 9:
                    if index < totalnum:
                        dataset.append(pre[index] + '#' + temp_context[0])
                    index += 1
                    temp_context = []
                else:
                    index += 1
                    temp_context = []
            else:
                temp_context.append(tempword[:-1].strip())
        # flush the final record, which may not be terminated by '\r\n'
        if index < totalnum:
            dataset.append(pre[index] + '#' +
                           tempword[:-1].replace(u'\ufeff', '').strip())
    pickle.dump([pre, dataset], open(output_file, "wb"))
    end_time(version)

def sixsixip(self, area, page):
    """ 66ip proxy http://www.66ip.cn/areaindex_{area}/{page}.html """
    version = begin_time()
    threadings = []
    for index in range(1, area + 1):
        for pageindex in range(1, page + 1):
            print(str(index) + ' ' + str(pageindex))
            work = threading.Thread(target=self.sixsixthread,
                                    args=(index, pageindex))
            threadings.append(work)
    for work in threadings:
        work.start()
    for work in threadings:
        work.join()
    self.threadjude()
    end_time(version)

def make_data_theading(revs, word_idx_map, max_l, validation_num, start, end):
    """ worker thread for make_data: build one block of train/val rows """
    version = begin_time()
    temptrain, tempval, temptest = [], [], []
    for index in range(start, end):
        rev = revs[index]
        sent = get_idx_from_sent_msg(rev["m"], word_idx_map, max_l, True)
        sent += get_idx_from_sent(rev["r"], word_idx_map, max_l, True)
        sent += get_session_mask(rev["m"])
        sent.append(int(rev["y"]))
        if index >= validation_num:
            temptrain.append(sent)
        else:
            tempval.append(sent)
    global trains, vals
    trains.append(temptrain)
    vals.append(tempval)

def get_href(self):
    """
    get summarization from
    http://news.baidu.com/ns?word=%E6%AF%92%E7%8B%97%E8%82%89&tn=news&from=news&cl=2&rn=20&ct=1
    """
    version = begin_time()
    threadings = []
    for index in range(71):
        work = threading.Thread(target=self.href_once, args=(index,))
        threadings.append(work)
    for work in threadings:
        # time.sleep(.5)
        work.start()
    for work in threadings:
        work.join()
    href_map = [self.href_map[k] for k in sorted(self.href_map.keys())]
    self.href_map = sum(href_map, [])
    with codecs.open('bjh_href_poison.txt', 'w', encoding='utf-8') as f:
        f.write("\n".join(self.href_map))
    end_time(version)

def data5u(self):
    """ data5u proxy http://www.data5u.com/ (most of its proxies are unusable now) """
    version = begin_time()
    url_list = ['', 'free/gngn/index.shtml', 'free/gwgn/index.shtml']
    host = 'http://www.data5u.com/'
    for uri in url_list:
        html = self.get_request_proxy(host + uri, 0)
        if not html:
            continue
        table = html.find_all('ul', class_='l2')
        for index in table:
            tds = index.find_all('li')
            ip = tds[3].text  # protocol column: 'http' or 'https'
            self.waitjudge.append(ip + '://' + tds[0].text + ':' + tds[1].text)
    self.threadjude()
    end_time(version)

def test_model(dataset_file='SMN/data/datasets_test11.pkl',
               pre_file='SMN/data/smn_test11.pkl',
               model_name='SMN/data/model.bin',
               result_file='SMN/data/result_test11.txt'):
    """ test model, return accuracy """
    version = begin_time()
    datasets = load_bigger(dataset_file)
    pre = pickle.load(open(pre_file, "rb"))
    wordvecs = pre[1]
    predict(datasets, wordvecs.W, batch_size=200, max_l=50,
            hidden_size=200, word_embedding_size=200,
            model_name=model_name, result_file=result_file)
    sampleConduct = SampleConduct()
    end_time(version)
    return sampleConduct.calculate_test(result_file)

def testdb(self, types):
    ''' test whether the proxies stored in db are usable '''
    version = begin_time()
    typestr = ''
    if types == 2:
        typestr = '(0,1,2,3)'
    elif types == 1:
        typestr = '(1,3)'
    else:
        typestr = '(0,2)'
    results = self.Db.select_db(self.select_all % typestr)
    if results != 0:
        for index in results:
            self.waitjudge.append(index[0])
        self.threadjude()
    self.initproxy()
    end_time(version)

def xiciproxy(self, page):
    """
    xici proxy http://www.xicidaili.com/nn/{page}
    The first proxy site I used, but most of its proxies are unusable now.
    """
    if not str(page).isdigit():
        print("Please input num!")
        return []
    version = begin_time()
    url = 'http://www.xicidaili.com/nn/%d'
    for index in range(1, page + 1):
        html = basic_req(url % index, 0)
        tem = html.find_all('tr')
        for tr_index in range(1, len(tem)):
            tds = tem[tr_index].find_all('td')
            ip = tds[5].text.lower()
            self.waitjudge.append(ip + '://' + tds[1].text + ':' + tds[2].text)
    self.threadjude()
    end_time(version)

def origin_t_direct(self, input_file='SMN/data/test_SMN.pkl',
                    output_file='SMN/data/weibo/val_Dataset.pkl',
                    small_size=10000,
                    word2id_file='SMN/data/weibo/word2id.pkl'):
    """ build the val dataset directly, no threading """
    version = begin_time()
    test = pickle.load(open(input_file, 'rb'))
    self.word2id = pickle.load(open(word2id_file, 'rb'))
    c = []
    r = []
    for tempword in test:
        words = tempword[2:].split('#')
        contexts = words[:-1]
        replys = words[-1]
        context = []
        reply = []
        # build the context from the context utterances only (not the reply)
        for idx, index in enumerate(contexts):
            if idx:
                context.append(1)  # utterance separator (_EOS_)
            temp_context = index.split()
            for temp in temp_context:
                context.append(self.word2id[LCS(temp)]
                               if LCS(temp) in self.word2id else 0)
        # the reply is a single utterance string; split it into words
        for temp in replys.split():
            reply.append(self.word2id[LCS(temp)]
                         if LCS(temp) in self.word2id else 0)
        r.append(reply)
        c.append(context)
    y = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0] * (len(test) // 10)
    val_data = {'y': y, 'r': r, 'c': c}
    pickle.dump(val_data, open(output_file, "wb"))
    end_time(version)

def origin_sample_direct(self, input1_file, input2_file, output_file, small_size=2000):
    """ build origin sample directly, no threading """
    version = begin_time()
    with codecs.open(input1_file, 'r', 'utf-8') as f:
        sample1 = f.readlines()
    with codecs.open(input2_file, 'r', 'utf-8') as f:
        sample2 = f.readlines()
    temp_context = ''
    last_index = ''
    content = []
    r = []
    for tempword in sample1:
        if tempword == '\n':
            content.append(temp_context + last_index[:-5])
            temp_context = ''
            last_index = ''
        else:
            if len(last_index):
                temp_context += last_index
            last_index = tempword[:-1].strip() + '[SEP]'
    num = 0
    print(len(sample2))
    for index, tempword in enumerate(sample2):
        if tempword != '\n':
            last_index = tempword[:-1].replace('\"', '').replace('\\', '')
            r.append('0#' + content[num] + '#' + last_index)
        else:
            num += 1
    pickle.dump(r, open(output_file, "wb"))
    end_time(version)

def run_model(pre_file, types, model_name='SMN/data/model_little0.pkl',
              max_word_per_utterence=50, validation_num=500000,
              result_file='SMN/data/20result1.txt', exicted_model=None):
    """
    run model for train or predict
    @params: types 0-train, 1-predict
    """
    version = begin_time()
    pre = pickle.load(open(pre_file, "rb"))
    revs, wordvecs, max_l2 = pre[0], pre[1], pre[2]
    datasets = make_data(revs, wordvecs.word_idx_map,
                         max_l=max_word_per_utterence,
                         validation_num=validation_num)
    if not types:
        train(datasets, wordvecs.W, batch_size=200,
              max_l=max_word_per_utterence, hidden_size=200,
              word_embedding_size=200, exicted_model=exicted_model)
    else:
        predict(datasets, wordvecs.W, batch_size=200,
                max_l=max_word_per_utterence, hidden_size=200,
                word_embedding_size=200, model_name=model_name,
                result_file=result_file)
    end_time(version)

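# Usage sketch for run_model (not from the original repo; the pickle paths are
# placeholders -- substitute the files produced by your own preprocessing step):
#
#   run_model('SMN/data/smn_train.pkl', types=0)                  # train
#   run_model('SMN/data/smn_test.pkl', types=1,                   # predict
#             model_name='SMN/data/model_little0.pkl',
#             result_file='SMN/data/result_test.txt')
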
def build_md(self, load_img=False):
    """ build md """
    version = begin_time()
    threadings = []
    for index, tid in enumerate(self.request_list):
        work = threading.Thread(target=self.build_md_once, args=(index, tid,))
        threadings.append(work)
    for work in threadings:
        work.start()
    for work in threadings:
        work.join()
    if not load_img:
        return
    img_map = {k: self.img_map[k] for k in sorted(self.img_map.keys())}
    img_threadings = []
    for index in img_map.keys():
        for img_id, img_url in enumerate(img_map[index]):
            work = threading.Thread(target=self.load_img,
                                    args=(index, img_id, img_url,))
            img_threadings.append(work)
    for work in img_threadings:
        work.start()
    for work in img_threadings:
        work.join()
    end_time(version)

def preData(self):
    """ data prepare """
    version = begin_time()
    with open('vsm/test3', 'r') as file_d:
        articles = file_d.readlines()
    threadings = []
    self.articleNum = len(articles)
    self.articleMaps = [None for i in range(self.articleNum)]
    self.resultArray = [None for i in range(self.articleNum)]
    for index in range(self.articleNum):
        work = threading.Thread(
            target=self.preDataBasic,
            args=(articles[index].strip('\n').rstrip(), index,))
        threadings.append(work)
    for work in threadings:
        work.start()
    for work in threadings:
        work.join()
    end_time(version)

def calculate_test(self, input_file, block_size=10):
    """ calculate top-1 / top-3 accuracy of a result file """
    version = begin_time()
    with codecs.open(input_file, 'r', 'utf-8') as f:
        results = f.readlines()
    totalnum = int(len(results))
    correctnum = 0
    top3num = 0
    for index in range(int(totalnum / block_size)):
        # each block holds the scores of one positive (index 0) and
        # block_size - 1 negative candidates; compare as floats, not strings
        pre = np.array(results[index * block_size:(index + 1) * block_size],
                       dtype=float)
        temp_index = pre.argmax()
        top3 = pre.argsort()[-3:][::-1]
        if not temp_index:
            correctnum += 1
        if 0 in top3:
            top3num += 1
    print(correctnum, top3num, int(totalnum / block_size),
          spend_time(version),
          str(correctnum / int(totalnum / block_size))[:5],
          str(top3num / int(totalnum / block_size))[:5])
    return str(correctnum / int(totalnum / block_size))[:5]

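# Sketch (not part of the original repo): a vectorised equivalent of the metric
# calculate_test reports. Candidate 0 in every block of `block_size` scores is
# assumed to be the ground-truth response, so k=1 gives R10@1 and k=3 gives R10@3.
def recall_at_k(scores, block_size=10, k=1):
    """ fraction of blocks whose first candidate ranks within the top k

    >>> recall_at_k([0.9, 0.1, 0.3, 0.2, 0.05, 0.4, 0.1, 0.2, 0.3, 0.1,
    ...              0.2, 0.8, 0.1, 0.3, 0.10, 0.1, 0.2, 0.1, 0.1, 0.1], k=1)
    0.5
    """
    scores = np.asarray(scores, dtype=float).reshape(-1, block_size)
    ranks = (scores > scores[:, :1]).sum(axis=1)  # 0 == best score in block
    return float((ranks < k).mean())
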
def origin_sample_direct(self, input_file, output_file):
    """ build origin sample directly, no threading """
    version = begin_time()
    with codecs.open(input_file, 'r', 'utf-8') as f:
        temp_context = ''
        last_index = ''
        content = []
        response = []
        pre = []
        for tempword in f:
            if tempword == '\r\n':
                content.append(temp_context)
                response.append(last_index)
                pre.append("1#" + temp_context + last_index)
                temp_context = ''
            else:
                if len(last_index):
                    temp_context += (last_index + '#')
                last_index = tempword[:-1].strip()
    pickle.dump(pre, open(output_file, "wb"))
    end_time(version)

def get_classify(self):
    """ get classify from /discover/playlist """
    version = begin_time()
    self.classifylist = {}
    host = 'https://music.163.com/discover/playlist'
    html = get_request_proxy(host, 0)
    if not html:
        print('Empty')
        if can_retry(host):
            self.get_classify()
        return []
    alist = html.find_all('a', class_='s-fc1')
    if not len(alist):
        if can_retry(host):
            self.get_classify()
        print(html)
    for index in alist:
        self.classifylist[index.text] = index['href']
    end_time(version)

def load_spot(self, batch_size=50):
    ''' load spot '''
    version = begin_time()
    self.load_city_list()
    # self.city_list = [10186]
    city_threading = [
        threading.Thread(target=self.load_spot_once, args=(1, ii,))
        for ii in self.city_list
    ]
    shuffle_batch_run_thread(city_threading, 150)

    spot_continue = []
    for ii, jj in self.spot_pn.items():
        spot_continue += [
            threading.Thread(target=self.load_spot_once, args=(pn, ii,))
            for pn in range(2, jj + 1)
        ]
    shuffle_batch_run_thread(spot_continue, 150)

    output = [
        '{},{}'.format(self.id2map[ii], ','.join(jj))
        for ii, jj in self.spot_result.items()
    ]
    output_path = '{}spot.txt'.format(data_dir)
    with open(output_path, 'w') as f:
        f.write('\n'.join(output))
    city_num = len(self.city_list)
    spot_num = sum([len(ii) for ii in self.spot_result.values()])
    echo(1, 'City num: {}\nSpot num: {}\nOutput path: {}\nSpend time: {:.2f}s\n'.format(
        city_num, spot_num, output_path, end_time(version, 0)))

def gatherproxy(self, types):
    """
    :100: very nice website
    first of all, you should download the proxy ip txt from:
    http://www.gatherproxy.com/zh/proxylist/country/?c=China
    """
    version = begin_time()
    if not os.path.exists('%sgatherproxy' % data_path):
        print('Gather file not exist!!!')
        return
    with codecs.open('%sgatherproxy' % data_path, 'r', encoding='utf-8') as f:
        file_d = [ii.strip() for ii in f.readlines()]
    # the lines are already stripped above, so no trailing character is removed
    if not types:
        waitjudge = ['http://' + ii for ii in file_d]
    elif types == 1:
        waitjudge = ['https://' + ii for ii in file_d]
    else:
        waitjudge1 = ['http://' + ii for ii in file_d]
        waitjudge2 = ['https://' + ii for ii in file_d]
        waitjudge = [*waitjudge1, *waitjudge2]
    self.waitjudge = waitjudge
    print('load gather over!')
    end_time(version)

def match_goods(self):
    """ resolve goods ids for the urls in the goods file and rewrite it """
    self.headers = {
        'pragma': 'no-cache',
        'X-Requested-With': 'XMLHttpRequest',
        'cache-control': 'no-cache',
        'Cookie': '',
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        "Accept-Encoding": "",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36",
    }
    version = begin_time()
    changeHtmlTimeout(30)
    block_size = 10
    if not os.path.exists('%sgoods' % data_dir):
        print('goods file not exist!!!')
        return
    with codecs.open('%sgoods' % data_dir, 'r', encoding='utf-8') as f:
        wait_goods = f.readlines()
    goods_url = [
        re.findall('http.* ', index)[0].strip().replace('https', 'http')
        if 'http' in index and not '【' in index else False
        for index in wait_goods
    ]
    if not os.path.exists('%scollect_wyy' % data_dir):
        print('collect file not exist!!!')
        return
    with codecs.open('%scollect_wyy' % data_dir, 'r', encoding='utf-8') as f:
        collect = f.readlines()
    self.title2map = {
        index.split("||")[1]: index.split("||")[0]
        for index in collect
    }
    threadings = []
    for index, url in enumerate(goods_url):
        if url is False:
            continue
        work = threading.Thread(target=self.get_goods_id_first,
                                args=(url, index,))
        threadings.append(work)
    url_len = len(threadings)
    # run the threads in blocks of `block_size` to avoid hammering the site
    for index in range((url_len - 1) // block_size + 1):
        begin_id = index * block_size
        end_id = min(url_len, (index + 1) * block_size)
        threadings_block = threadings[begin_id:end_id]
        for work in threadings_block:
            work.start()
        for work in threadings_block:
            work.join()
        time.sleep(random.randint(0, 9))
    write_body = [
        ' '.join([self.goods_map[index], body]) if index in self.goods_map
        else (' '.join([self.url2goods[goods_url[index]], body])
              if goods_url[index] in self.url2goods else body)
        for index, body in enumerate(wait_goods)
    ]
    with codecs.open('%sgoods_one' % data_dir, 'w', encoding='utf-8') as f:
        f.write(''.join(write_body))
    end_time(version)

def onetime_master(self, input_file, output_file, block_size=900000, test_size=2000):
    """ build the whole dataset in one pass, by numpy """
    version = begin_time()
    with codecs.open(input_file, 'r', 'utf-8') as f:
        self.origin_sample = f.readlines()
    threadings = []
    num = len(self.origin_sample)
    start = 0
    end = min(block_size, num - 1)
    block_num = int(num / block_size) + 1
    print('Thread Begin. ', num)
    for block in range(block_num):
        while self.origin_sample[end] != '\r\n' and end < num - 1:
            end += 1
        work = threading.Thread(target=self.origin_sample_agent,
                                args=(start, end, block,))
        threadings.append(work)
        start = end + 1
        # next block nominally ends at (block + 2) * block_size
        end = min(num - 1, block_size * (block + 2))
    print('point 1')
    for work in threadings:
        work.start()
    for work in threadings:
        work.join()
    print('Thread Over.')
    # return self.content, self.response  # early debug exit, keeps the numpy step below reachable

    content = np.hstack(np.array(list(self.content.values())))
    totalnum = len(content)
    print(totalnum)
    randomIndexs = unique_randomint(0, totalnum, test_size)
    otherIndexs = np.setdiff1d(np.arange(totalnum), randomIndexs)
    pre_content = content[otherIndexs]
    test_content = content[randomIndexs]
    del content
    gc.collect()
    response = np.hstack(np.array(list(self.response.values())))
    test_response = [
        response[index] + '\n' +
        list2str(response[unique_randomint(0, totalnum, 9, [index])]) + '\n'
        for index in randomIndexs
    ]
    otherIndexs = np.setdiff1d(np.arange(totalnum), randomIndexs)
    pre_response = response[otherIndexs]
    max_dtype = max(pre_content.dtype, pre_response.dtype)
    pre_next = pre_content.astype(max_dtype) + pre_response.astype(max_dtype)
    with open(output_file + 'seq_replies.txt', 'wb') as f:
        f.write(list2str(test_response))
    with open(output_file + 'seq_context.txt', 'wb') as f:
        f.write(list2str(test_content))
    with open(output_file + 'train.txt', 'wb') as f:
        f.write(list2str(pre_next))
    end_time(version)

def origin_sample_master(self, input_file,
                         output_file='SMN/data/weibo/train_data_small.pkl',
                         word2id_file='SMN/data/weibo/word2id.pkl',
                         embedding_file='SMN/data/weibo/word_embedding.pkl',
                         block_size=900000, small_size=100000):
    """ master of multi-threading for building the origin train sample """
    version = begin_time()
    with codecs.open(input_file, 'r', 'utf-8') as f:
        self.origin_sample = f.readlines()
    self.word2id = pickle.load(open(word2id_file, 'rb'))
    # self.embedding = pickle.load(open(embedding_file, 'rb'))
    threadings = []
    num = len(self.origin_sample)
    start = 0
    end = min(block_size, num - 1)
    for block in range(int(num / block_size) + 1):
        while self.origin_sample[end] != '\r\n' and end < num - 1:
            end += 1
        work = threading.Thread(target=self.origin_sample_agent,
                                args=(start, end, block,))
        threadings.append(work)
        start = end + 1
        # next block nominally ends at (block + 2) * block_size
        end = min(num - 1, block_size * (block + 2))
    for work in threadings:
        work.start()
    for work in threadings:
        work.join()
    # concatenate the per-block results in block order
    content = sum([self.content[k] for k in sorted(self.content.keys())], [])
    response = sum([self.response[k] for k in sorted(self.response.keys())], [])
    totalnum = len(content)
    print(totalnum)
    # return totalnum
    randomIndexs = unique_randomint(0, totalnum, small_size)
    y = [1, 0, 0] * small_size
    c = []
    r = []
    for index in randomIndexs:
        # one positive and two random negative responses per context
        c.append(content[index])
        c.append(content[index])
        c.append(content[index])
        r.append(response[index])
        r.append(response[unique_randomint(0, totalnum, 1, [index])[0]])
        r.append(response[unique_randomint(0, totalnum, 1, [index])[0]])
    train_data = {'y': y, 'r': r, 'c': c}
    pickle.dump(train_data, open(output_file, "wb"))
    end_time(version)

def origin_sample_direct(self, input1_file, input2_file, output_file,
                         small_size=10000,
                         word2id_file='SMN/data/weibo/word2id.pkl'):
    """ build origin sample directly, no threading """
    version = begin_time()
    with codecs.open(input1_file, 'r', 'utf-8') as f:
        sample1 = f.readlines()
    with codecs.open(input2_file, 'r', 'utf-8') as f:
        sample2 = f.readlines()
    self.word2id = pickle.load(open(word2id_file, 'rb'))
    temp_context = []
    last_index = []
    c = []
    r = []
    num = 0
    for tempword in sample1:
        if tempword == '\r\n':
            num += 1
            for idx in range(10):
                c.append(temp_context + last_index[:-1])
            temp_context = []
            last_index = []
        else:
            if len(last_index):
                temp_context += last_index
            last_index = tempword[:-1].replace('\"', '').replace('\\', '').strip().split()
            if '\"' in last_index:
                print(last_index)
            last_index = [(self.word2id[LCS(word)] if LCS(word) in self.word2id else 0)
                          for word in last_index]
            last_index.append(1)
    # for idx in range(10):
    #     c.append(temp_context + last_index[:-1])
    num = 0
    print(len(sample2))
    for index, tempword in enumerate(sample2):
        if tempword != '\r\n':
            num += 1
            last_index = tempword[:-1].replace('\"', '').replace('\\', '').strip().split()
            last_index = [(self.word2id[LCS(word)] if LCS(word) in self.word2id else 0)
                          for word in last_index]
            r.append(last_index)
        else:
            if num != 10:
                r.append(last_index)
                print(num, index)
            num = 0
    y = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0] * small_size
    val_data = {'y': y, 'r': r, 'c': c}
    pickle.dump(val_data, open(output_file, "wb"))
    end_time(version)

def word2ids(self, input_file, embedding_file,
             output1_file='SMN/data/weibo/word2id.pkl',
             output2_file='SMN/data/weibo/word_embedding.pkl',
             output3_file='SMN/data/weibo/word2id',
             min_n=1, max_n=3):
    """ build word2id map and embedding matrix, with char n-gram fallback for OOV words """
    version = begin_time()
    with codecs.open(input_file, 'r', 'utf-8') as f:
        origin_sample = f.readlines()
    word_embedding = load_bigger(embedding_file)
    words = []
    word_map = {}
    embedding_lists = []
    word_map['_OOV_'] = 0
    word_map['_EOS_'] = 1
    embedding_lists.append([0] * 200)
    embedding_lists.append([0] * 200)
    for index in origin_sample:
        if index == '\r\n':
            continue
        words += [LCS(idx) for idx in index.replace('\r\n', '').split()]
        # words.update(set(index.replace('\r\n', '').split()))
    words = Counter(words)
    words = [index for index in words]
    word2id = ['_OOV_ 0', '_EOS_ 1']
    word_size = word_embedding.wv.syn0[0].shape[0]
    print('Step 2: Begin')
    index_num = 2
    for idx, index in enumerate(words):
        if index in word_map:
            continue
        if index in word_embedding.wv.vocab.keys():
            word_map[index] = index_num
            index_num += 1
            word2id.append(index + ' ' + str(word_map[index]))
            embedding_lists.append(word_embedding[index].astype('float32'))
        else:
            # OOV word: average the embeddings of its character n-grams
            ngrams = compute_ngrams(index, min_n=min_n, max_n=max_n)
            word_vec = np.zeros(word_size, dtype=np.float32)
            ngrams_found = 0
            ngrams_single = [ng for ng in ngrams if len(ng) == 1]
            ngrams_more = [ng for ng in ngrams if len(ng) > 1]
            for ngram in ngrams_more:
                if ngram in word_embedding.wv.vocab.keys():
                    word_vec += word_embedding[ngram]
                    ngrams_found += 1
            if ngrams_found == 0:
                # fall back to single characters only if no longer n-gram matched
                for ngram in ngrams_single:
                    if ngram in word_embedding.wv.vocab.keys():
                        word_vec += word_embedding[ngram]
                        ngrams_found += 1
            if word_vec.any():
                word_vec /= max(1, ngrams_found)
                word_map[index] = index_num
                index_num += 1
                word2id.append(index + ' ' + str(word_map[index]))
                embedding_lists.append(word_vec)
    print(index_num)
    with open(output3_file, 'w') as f:
        f.write(list2str(word2id))
    print('Step 2: Over')
    # return embedding_lists, word_map
    pickle.dump(embedding_lists, open(output2_file, "wb"))
    pickle.dump(word_map, open(output1_file, "wb"))
    end_time(version)

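# Sketch (not part of the original repo) of the OOV fallback used above, with a
# plain dict of vectors standing in for the gensim model. _char_ngrams is a
# hypothetical helper that mirrors what compute_ngrams is assumed to return:
# all character n-grams of length min_n..max_n.
def _char_ngrams(word, min_n=1, max_n=3):
    """ all character n-grams of word with length in [min_n, max_n] """
    return [word[i:i + n]
            for n in range(min_n, max_n + 1)
            for i in range(len(word) - n + 1)]


def _oov_vector(word, vectors, dim=200, min_n=1, max_n=3):
    """ average the vectors of the word's known n-grams; zeros if none match """
    vec = np.zeros(dim, dtype=np.float32)
    found = 0
    for ng in _char_ngrams(word, min_n, max_n):
        if len(ng) > 1 and ng in vectors:
            vec += vectors[ng]
            found += 1
    if not found:  # fall back to single characters
        for ch in word:
            if ch in vectors:
                vec += vectors[ch]
                found += 1
    return vec / found if found else vec
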
def word2ids(self, input_file, embedding_file,
             output1_file='SMN/data/weibo/word2id.pkl',
             output2_file='SMN/data/weibo/word_embedding.pkl',
             output3_file='SMN/data/weibo/word2id'):
    """ build word2id map and embedding matrix (words with frequency > 2 only) """
    version = begin_time()
    with codecs.open(input_file, 'r', 'utf-8') as f:
        origin_sample = f.readlines()
    word_embedding = load_bigger(embedding_file)
    words = []
    word_map = {}
    embedding_lists = []
    word_map['_OOV_'] = 0
    word_map['_EOS_'] = 1
    embedding_lists.append([0] * 200)
    embedding_lists.append([0] * 200)
    for index in origin_sample:
        if index == '\r\n':
            continue
        words += [LCS(idx) for idx in index.replace('\r\n', '').split()]
        # words.update(set(index.replace('\r\n', '').split()))
    words = Counter(words)
    words = [index for index in words if words[index] > 2]
    word2id = ['_OOV_ 0', '_EOS_ 1']
    print('Step 2: Begin')
    index_num = 2
    for idx, index in enumerate(words):
        if index in word_embedding:
            if index not in word_map:
                word_map[index] = index_num
                index_num += 1
                word2id.append(index + ' ' + str(word_map[index]))
                embedding_lists.append(
                    list(word_embedding[index].astype('float16')))
        # prefix-fallback variants (3/2/1-char prefixes) kept for reference:
        # elif index[:3] in word_embedding:
        #     if index[:3] not in word_map:
        #         word_map[index[:3]] = index_num
        #         word_map[index] = index_num
        #         index_num += 1
        #         word2id.append(index[:3] + ' ' + str(word_map[index[:3]]))
        #         word2id.append(index + ' ' + str(word_map[index]))
        #         embedding_lists.append(list(word_embedding[index[:3]].astype('float16')))
        #     else:
        #         word_map[index] = word_map[index[:3]]
        #         word2id.append(index + ' ' + str(word_map[index]))
        # elif index[:2] in word_embedding:
        #     if index[:2] not in word_map:
        #         word_map[index[:2]] = index_num
        #         word_map[index] = index_num
        #         index_num += 1
        #         word2id.append(index[:2] + ' ' + str(word_map[index[:2]]))
        #         word2id.append(index + ' ' + str(word_map[index]))
        #         embedding_lists.append(list(word_embedding[index[:2]].astype('float16')))
        #     else:
        #         word_map[index] = word_map[index[:2]]
        #         word2id.append(index + ' ' + str(word_map[index]))
        # elif index[:1] in word_embedding:
        #     if index[:1] not in word_map:
        #         word_map[index[:1]] = index_num
        #         word_map[index] = index_num
        #         index_num += 1
        #         word2id.append(index[:1] + ' ' + str(word_map[index[:1]]))
        #         word2id.append(index + ' ' + str(word_map[index]))
        #         embedding_lists.append(list(word_embedding[index[:1]].astype('float16')))
        #     else:
        #         word_map[index] = word_map[index[:1]]
        #         word2id.append(index + ' ' + str(word_map[index]))
    print(index_num)
    with open(output3_file, 'w') as f:
        f.write(list2str(word2id))
    print('Step 2: Over')
    # return embedding_lists, word_map
    pickle.dump(embedding_lists, open(output2_file, "wb"))
    pickle.dump(word_map, open(output1_file, "wb"))
    end_time(version)

        result.append('best cv score:' + str(eval_hist['auc-mean'][-1]) + '\n')
        with open(model_path + 'result', 'a') as f:
            f.write('\n'.join([str(index) for index in result]))
        print('best n_estimators:', len(eval_hist['auc-mean']))
        print('best cv score:', eval_hist['auc-mean'][-1])
        self.OPT_ROUNDS = len(eval_hist['auc-mean'])
        if eval_hist['auc-mean'][-1] > self.basic_auc:
            self.basic_auc = eval_hist['auc-mean'][-1]
            if index is not None and index != -1:
                self.good_columns.append(self.wait_columns[index])
        with open(model_path + 'columns.csv', 'w') as f:
            f.write(','.join([str(index) for index in self.good_columns]))


if __name__ == '__main__':
    version = begin_time()
    model = False
    single = True
    im = SA()
    # im.pre_data_v1(1)
    # im.pre_data_v1(0)
    # single = True
    if single:
        im.load_data(model)
        im.optimize_model(model)
        im.train_model()
        im.evaulate_model(model)
    else:
        for index in range(-1, len(im.wait_columns)):
            # filter good feature

def bulk_import_alimama(self):
    """ bulk import alimama """
    version = begin_time()
    if not os.path.exists('%scollect_wyy' % data_dir):
        print('Collect File not exist!!!')
        return
    with codecs.open('%scollect_wyy' % data_dir, 'r', encoding='utf-8') as f:
        goods = f.readlines()
    self.goods_candidate = [index.split('||')[0] for index in goods]
    goods_len = len(self.goods_candidate)
    self.headers = {
        'pragma': 'no-cache',
        'X-Requested-With': 'XMLHttpRequest',
        'cache-control': 'no-cache',
        'Cookie': '',
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        "Accept-Encoding": "",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36",
        'Origin': 'http://pub.alimama.com',
        'Referer': 'http://pub.alimama.com/promo/search/index.htm?q=%E7%AC%AC%E5%9B%9B%E5%8D%81%E4%B9%9D%E5%A4%A9%2019%E6%98%A5%E5%AD%A3&_t=1550891362391'
    }
    if not os.path.exists('%scookie_alimama' % data_dir):
        print('alimama cookie not exist!!!')
        return
    with codecs.open('%scookie_alimama' % data_dir, 'r', encoding='utf-8') as f:
        cookie = f.readlines()
    url_list = [
        'https://pub.alimama.com/favorites/group/newList.json?toPage=1&perPageSize=40&keyword=&t=',
        str(int(round(time.time() * 1000))),
        '&_tb_token_=', cookie[1][:-1],
        '&pvid=', cookie[2][:-1]
    ]
    url = ''.join(url_list)
    self.headers['Cookie'] = cookie[0][:-1]
    self.headers['Host'] = url.split('/')[2]
    group_list = basic_req(url, 2, header=self.headers)
    if group_list.status_code != 200 or group_list.json()['info']['message'] == 'nologin':
        print('group_list error')
        return
    group_list = group_list.json()['data']['result']
    group_list = [index['id'] for index in group_list]
    print(group_list)
    assert len(group_list) > (goods_len - 1) // 200
    threadings = []
    for index in range((goods_len - 1) // 200 + 1):
        work = threading.Thread(target=self.bulk_import_alimama_once,
                                args=(index, group_list[index],))
        threadings.append(work)
    for work in threadings:
        work.start()
    for work in threadings:
        work.join()
    end_time(version)