class MySpider(object):
    """Crawler driver for a job-listing site.

    Builds industry-category URLs, expands them into paginated listing
    URLs, collects per-company detail URLs, and scrapes company info.
    Progress is checkpointed in local JSON files so a crawl can resume
    after interruption (see ``grab_increment``).
    """

    def __init__(self, root_url):
        self.parser = HtmlParser()    # HTML fetching/parsing helper (project-local)
        self.storage = DataStore()    # JSON load/store helper (project-local)
        self._get_root_urls(root_url)

    def _get_root_urls(self, root_url):
        """Fetch the industry-category URLs once and cache them on disk."""
        # Skip the network fetch when a cached copy already exists.
        if not os.path.exists('job_class.json'):
            new_urls = self.parser.get_url(root_url)
            # Persist the industry-category URLs to crawl.
            self.storage.local_store(new_urls, 'job_class.json')

    def joburl_init(self, pagenum, path='job_class.json'):
        """Build paginated listing URLs for every category.

        ``pagenum`` is an iterable of page numbers; each number is
        appended to every category name and URL loaded from ``path``.
        """
        root_urls = self.storage.load_data(path)
        jobs_dict = {}
        for page in pagenum:
            # Renamed loop var: the original shadowed the builtin `list`.
            for category in root_urls:
                jobs_dict[category + str(page)] = root_urls[category] + str(page)
        # Persist the constructed page URLs.
        self.storage.local_store(jobs_dict, 'job_page_url.json')

    def company_url(self, path='job_page_url.json'):
        """Visit every listing page and collect company-detail URLs."""
        company_urls = self.storage.load_data(path)
        company_dicts = {}
        url_get = 0  # count of listing URLs already processed
        for company_info_url in company_urls:
            print("待爬取的行业网址总数:", len(company_urls) - url_get)
            url_get += 1
            url = company_urls[company_info_url]
            company_dicts.update(self.parser.getcompany_url(url))
            # Checkpoint: record each listing URL as soon as it is crawled.
            self.storage.local_store(url, 'job_page_url_old.json')
        # Persist the collected company-info URLs.
        self.storage.local_store(company_dicts, 'company_info_url_new.json')

    def company_info(self, path='company_info_url_new.json'):
        """Scrape the detail page for every collected company URL."""
        company_info_urls = self.storage.load_data(path)
        url_get = 0  # count of company pages already processed
        for company_name in company_info_urls:
            print("待爬取的公司信息网址总数:", len(company_info_urls) - url_get)
            url_get += 1
            url = company_info_urls[company_name]
            self.parser.getcompany_info(company_name, url)
            # NOTE: 'compang' is a long-standing misspelling kept for
            # compatibility with existing checkpoint files on disk.
            self.storage.local_store(url, 'compang_info_url_old.json')

    def grab_increment(self):
        """Resume company-info scraping from the last checkpoint.

        Re-crawls only URLs present in the "new" file but absent from
        the "old" (already-crawled) checkpoint file.
        """
        new_urls = self.storage.load_data('company_info_url_new.json')
        old_urls = self.storage.load_data('compang_info_url_old.json')
        for company_name in new_urls:
            new_url = new_urls[company_name]
            # NOTE(review): membership test is against old_urls' keys —
            # presumably those keys are URLs; confirm against DataStore.
            if new_url not in old_urls:
                # BUG FIX: the original referenced the undefined name
                # `url` here, raising NameError on the first new entry.
                self.parser.getcompany_info(company_name, new_url)
                # Checkpoint each URL as soon as it is crawled.
                self.storage.local_store(new_url, 'compang_info_url_old.json')
import pickle
import tensorflow as tf
import numpy as np
from datastore import DataStore
from model_def import CombinedModel
from tensorflow.keras.optimizers import Adam

# Set MODE to "OLD" to read the pickled DataStore object from disk
# instead of rebuilding it from the raw interaction log.
MODE = "OLD"

if MODE == "NEW":
    # Build the dataset from the raw interactions file and cache it.
    datastore = DataStore()
    # interactions file
    datastore.load_data("agent_interactions.jsl")
    datastore.build()
    with open('dataset_pick.pkl', 'wb') as o:
        pickle.dump(datastore, o, pickle.HIGHEST_PROTOCOL)
else:
    # NOTE(review): unpickling executes arbitrary code — only load
    # 'dataset_pick.pkl' files produced by a trusted run of this script.
    with open("dataset_pick.pkl", "rb") as p:
        datastore = pickle.load(p)

# Sanity check that the dataset loaded (assumes DataStore exposes an
# `actions` array — presumably numpy; confirm against datastore module).
print(datastore.actions.shape)

combined_model = CombinedModel(16, 8, 4)
optimizer = Adam(learning_rate=0.00001)


# Compiled training step; body continues beyond this chunk of the file.
@tf.function
def train_step(train_states, prev_states, current_states, actions):
    with tf.GradientTape() as tape: