def build_from_txt(self, file_path: str, type_name: str = None):
    check_file(file_path, 'txt')
    file_name = get_file_name(file_path)
    if not type_name:
        type_name = file_name
    # Use a context manager so the file handle is closed after reading.
    with open(file_path, encoding='utf8') as file:
        file_data = file.read().split('\n')
    logger.info("Importing word list from {}, {} entries in total".format(file_path, len(file_data)))
    self._kp.add_keywords_from_list(file_data)
    for word in tqdm(file_data):
        self._type_dict[word] = type_name
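A minimal usage sketch, assuming an already-constructed instance named `ner` (the instance name and the word-list files are hypothetical; the txt file holds one keyword per line):

ner.build_from_txt('fruits.txt')                    # type defaults to the file name, 'fruits'
ner.build_from_txt('cities.txt', type_name='city')  # or pass an explicit type name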
def from_json(self, json_path, clear=False):
    if clear:
        self.clear()
    for line in tqdm(read_json_line(json_path)):
        self.db.insert({
            'raw_data': line,
            'labeled_status': False,
            'updated_time': None,
            'labeled_user': None,
            'labeled_data': None,
            'check_status': False
        })
    logger.info("successfully loaded from {}".format(json_path))
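A hedged sketch of the expected input and call, assuming a `loader` instance and a JSON-lines file `data.json` (both names are hypothetical):

# data.json holds one JSON object per line, e.g.:
#   {"text": "first sample"}
#   {"text": "second sample"}
loader.from_json('data.json', clear=True)  # clear=True empties the collection first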
def json_writer(result_q, task_nums, save_path):
    logger.info('Writing results to: {}, temporary file: {}'.format(
        save_path + os.sep + 'result.json', save_path + os.sep + 'handled.txt'))
    bar = tqdm(total=task_nums)
    while True:
        result, task = result_q.get(True)
        if result == '-end-':
            break
        with open(save_path + os.sep + 'result.json', 'a+', encoding='utf-8') as f:
            line = json.dumps(result, ensure_ascii=False)
            f.write(line + '\n')
        with open(save_path + os.sep + 'handled.txt', 'a+', encoding='utf-8') as f:
            f.write(task + '\n')
        bar.update()
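A minimal sketch of wiring `json_writer` to a result queue in a dedicated process (the payloads and the `./output` directory are made up for illustration; the directory must already exist):

from multiprocessing import Process, Queue

result_q = Queue()
writer = Process(target=json_writer, args=(result_q, 2, './output'))
writer.start()
result_q.put(({'title': 'page one'}, 'task-1'))
result_q.put(({'title': 'page two'}, 'task-2'))
result_q.put(('-end-', None))  # sentinel: stops the writer loop
writer.join()

`csv_writer` below is driven the same way. Reopening the output files in append mode for every record costs some throughput, but it means results already written survive if the process dies mid-run.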
def csv_writer(result_q, task_nums, save_path):
    logger.info('Writing results to: {}, temporary file: {}'.format(
        save_path + os.sep + 'result.csv', save_path + os.sep + 'handled.txt'))
    bar = tqdm(total=task_nums)
    while True:
        result, task = result_q.get(True)
        if result == '-end-':
            break
        with open(save_path + os.sep + 'result.csv', 'a+', encoding='utf-8', newline='') as f:
            f_csv = csv.writer(f)
            f_csv.writerow(result)
        # Record the finished task under the same file name that `run` checks
        # when resuming ('handled.txt'; the original wrote 'handle.txt', which
        # the resume logic would never find).
        with open(save_path + os.sep + 'handled.txt', 'a+', encoding='utf-8') as f:
            f.write(task + '\n')
        bar.update()
def from_csv(self, csv_path, headers=None, clear=False):
    if clear:
        self.clear()
    with open(csv_path, encoding='utf-8') as f:
        csv_reader = csv.reader(f)
        if not headers:
            headers = next(csv_reader)
        for line in tqdm(csv_reader):
            assert len(headers) == len(line)
            self.db.insert({
                'raw_data': {key: value for key, value in zip(headers, line)},
                'labeled_status': False,
                'updated_time': None,
                'labeled_user': None,
                'labeled_data': None,
                'check_status': False
            })
    logger.info("successfully loaded from {}".format(csv_path))
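If the CSV file has no header row, the column names can be supplied explicitly; a small sketch (the file names and headers are hypothetical):

# rows.csv has no header row, so name the columns by hand
loader.from_csv('rows.csv', headers=['id', 'text'])
# with headers omitted, the first row of the file is used instead
loader.from_csv('with_header.csv')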
def get_response(url, user_proxy=None):
    if user_proxy:
        if user_proxy == DEFAULT_PROXY:
            # Ask the proxy pool for an address; if the pool is empty,
            # proxy stays falsy and the request goes out directly.
            proxy = get_proxy()
            if proxy:
                proxy = {'http': proxy}
        else:
            proxy = user_proxy
    else:
        proxy = None
    try:
        r = requests.get(url, headers=HEADERS, proxies=proxy, timeout=3)
        if r.status_code == requests.codes.ok:
            if r.encoding != 'utf-8':
                r.encoding = 'utf-8'
            return r
    except Exception as e:
        logger.info(str(e))
        return None
    return None
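A minimal call sketch: `get_response` returns the `requests.Response` on a 200 status and `None` on any failure, so callers should check for `None` (the URL is a placeholder):

r = get_response('https://example.com/page/1', user_proxy=DEFAULT_PROXY)
if r is not None:
    print(r.text[:200])  # response body, decoded as utf-8
else:
    print('request failed or was rejected')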
def build_from_csv(self, file_path: str, column: int, type_name: str = None):
    check_file(file_path, 'csv')
    file_name = get_file_name(file_path)
    if not type_name:
        type_name = file_name
    file_data = []
    with open(file_path, encoding='utf8') as file:
        csv_reader = csv.reader(file)
        headers = next(csv_reader)
        assert column < len(headers)
        logger.info("headers:{}".format(','.join(headers)))
        for line in csv_reader:
            file_data.append(line[column])
    logger.info("Importing word list from {}, {} entries in total".format(file_path, len(file_data)))
    self._kp.add_keywords_from_list(file_data)
    for word in tqdm(file_data):
        self._type_dict[word] = type_name
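A sketch of picking one column out of a CSV word list (the file name, its layout, and the `ner` instance are hypothetical):

# cities.csv columns: id,name,province -> take the 'name' column (index 1)
ner.build_from_csv('cities.csv', column=1, type_name='city')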
def build_from_dir(self, file_dir, type_name: str = None):
    for file_path in os.listdir(file_dir):
        file_full_path = os.path.join(file_dir, file_path)
        file_name = get_file_name(file_full_path)
        # Resolve the default per file, so the first file's name does not
        # stick as the type for every file that follows.
        current_type = type_name if type_name else file_name
        if file_path.endswith('csv'):
            file_data = []
            with open(file_full_path, encoding='utf8') as file:
                csv_reader = csv.reader(file)
                headers = next(csv_reader)
                logger.info("headers:{}".format(','.join(headers)))
                for line in csv_reader:
                    file_data.append(line[1])  # csv files are read from the second column
        else:  # default txt format
            with open(file_full_path, encoding='utf8') as file:
                file_data = file.read().split('\n')
        logger.info("Importing word list from {}, {} entries in total".format(file_path, len(file_data)))
        self._kp.add_keywords_from_list(file_data)
        for word in tqdm(file_data):
            self._type_dict[word] = current_type
def load(self, save_path: str = 'ner.pt'):
    logger.info("Loading model from {}".format(save_path))
    with open(save_path, 'rb') as file:
        # Objects are unpickled in the same order they were dumped in `save`.
        self._kp = pickle.load(file)
        self._type_dict = pickle.load(file)
    logger.info("Successfully loaded model from {}".format(save_path))
def save(self, save_path: str = 'ner.pt'):
    logger.info("Saving model to {}".format(save_path))
    with open(save_path, 'wb') as file:
        pickle.dump(self._kp, file)
        pickle.dump(self._type_dict, file)
    logger.info("Successfully saved model to {}".format(save_path))
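`save` and `load` are symmetric pickle dumps, so a built word list can be persisted once and reused; a round-trip sketch (the `ner` instance and input file are hypothetical):

ner.build_from_txt('cities.txt', type_name='city')
ner.save('ner.pt')       # dumps the keyword processor, then the type dict

restored = type(ner)()   # a fresh instance of the same class
restored.load('ner.pt')  # reads both objects back in the same order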
def clear(self):
    result = self.cls.delete_many({})
    logger.info("successfully cleared the {} collection of the {} database".format(self.cls_name, self.database_name))
    return result
from lightutils import logger

logger.info("info")
logger.warning('warning')
logger.debug('debug')
logger.error('error')
def run(self, tasks, parser, notification=None):
    if isinstance(tasks[0], int):
        tasks = [str(task) for task in tasks]
    if not os.path.isfile(self.save_path + os.sep + 'task.txt'):
        with open(self.save_path + os.sep + 'task.txt', 'w', encoding='utf-8') as f:
            for task in tasks:
                f.write(task + '\n')
    # Resume support: skip tasks already recorded by the writer process.
    if os.path.isfile(self.save_path + os.sep + 'handled.txt'):
        with open(self.save_path + os.sep + 'handled.txt', encoding='utf-8') as f:
            handled_tasks = [word.strip() for word in f]
    else:
        handled_tasks = list()
    unhandled_tasks = set(tasks) - set(handled_tasks)
    logger.info('{} tasks already handled, {} tasks remaining'.format(len(handled_tasks), len(unhandled_tasks)))
    task_q = Queue()
    for task in unhandled_tasks:
        task_q.put(task)
    task_q.put('-end-')  # sentinel marking the tail of the task queue
    handled_tasks_list = Manager().list()
    result_q = Queue()
    result_process = Process(target=self.writer, args=(result_q, len(unhandled_tasks), self.save_path))
    result_process.start()
    begin_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    logger.info('Crawling started at: {}'.format(begin_time))
    a = time.time()
    process_lst = []
    for i in range(multiprocessing.cpu_count()):
        p = Process(target=parser, args=(self.base_url, task_q, result_q, handled_tasks_list, self.proxy, self.interval))
        process_lst.append(p)
    for p in process_lst:
        p.start()
    for p in process_lst:
        p.join()
    result_q.put(('-end-', None))  # all workers finished: stop the writer
    result_process.join()
    end_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    logger.info('Crawling finished at: {}'.format(end_time))
    b = time.time()
    logger.info('Total time: {}, {} tasks crawled, average speed: {} (task/s)'.format(
        time_convert(round(b - a, 2)), len(unhandled_tasks), round(len(unhandled_tasks) / (b - a), 2)))
    if notification:
        if "to" not in notification or "task_name" not in notification:
            logger.info("the notification must have 'to' and 'task_name' attributes")
        else:
            result = send_email_notification(
                to=notification["to"],
                subject="the spider job {} completed".format(notification["task_name"]),
                contents=[
                    "Task start time: {}".format(begin_time),
                    "Task end time: {}".format(end_time),
                    "Output directory: {}".format(self.save_path),
                    "Saved file format: {}".format(self.save_format),
                    'Total time: {}, {} tasks crawled, average speed: {} (task/s)'.format(
                        time_convert(round(b - a, 2)), len(unhandled_tasks),
                        round(len(unhandled_tasks) / (b - a), 2))
                ])
            if result:
                logger.info("Notification email sent successfully!")
            else:
                logger.info("Failed to send the notification email; please check that the recipient information is correct.")
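A hedged end-to-end sketch: `run` hands each worker `(base_url, task_q, result_q, handled_tasks, proxy, interval)`, so a parser must consume tasks until it sees the '-end-' sentinel and re-queue it so sibling workers also stop. The `Spider` class name and its constructor arguments below are assumptions inferred from the attributes `run` uses (`save_path`, `writer`, `base_url`, `proxy`, `interval`, `save_format`):

import time

def my_parser(base_url, task_q, result_q, handled_tasks, proxy, interval):
    while True:
        task = task_q.get(True)
        if task == '-end-':
            task_q.put('-end-')  # put the sentinel back for the other workers
            break
        r = get_response(base_url + task, proxy)
        if r is not None:
            # (result, task): the writer saves `result` and logs `task` as handled
            result_q.put(({'task': task, 'html': r.text}, task))
        time.sleep(interval)

spider = Spider(base_url='https://example.com/item/', save_path='./output',
                save_format='json', proxy=None, interval=1)
spider.run(tasks=list(range(100)), parser=my_parser,
           notification={'to': 'me@example.com', 'task_name': 'demo'})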