Example #1
0
 def build_from_txt(self, file_path: str, type_name: str = None):
     """Load a keyword list from a plain-text file (one word per line).

     Args:
         file_path: path to a UTF-8 encoded ``.txt`` file.
         type_name: entity type to tag every word with; defaults to the
             file's base name when omitted.
     """
     check_file(file_path, 'txt')
     file_name = get_file_name(file_path)
     if not type_name:
         type_name = file_name
     # Fix: the original ``open(...).read()`` never closed the handle;
     # use a context manager so the file is released deterministically.
     with open(file_path, encoding='utf8') as file:
         file_data = file.read().split('\n')
     logger.info("正在从{}中导入词表,共计{}条数据".format(file_path, len(file_data)))
     self._kp.add_keywords_from_list(file_data)
     for word in tqdm(file_data):
         self._type_dict[word] = type_name
Example #2
0
 def from_json(self, json_path, clear=False):
     """Import records from a JSON-lines file into the collection.

     Every line becomes one unlabeled, unchecked record; with
     ``clear=True`` the collection is emptied before loading.
     """
     if clear:
         self.clear()
     for raw in tqdm(read_json_line(json_path)):
         record = {
             'raw_data': raw,
             'labeled_status': False,
             'updated_time': None,
             'labeled_user': None,
             'labeled_data': None,
             'check_status': False,
         }
         self.db.insert(record)
     logger.info("successfully loaded from {}".format(json_path))
Example #3
0
def json_writer(result_q, task_nums, save_path):
    """Drain ``result_q`` and append results to disk until the sentinel.

    Each ``(result, task)`` pair appends one JSON line to ``result.json``
    and the task id to ``handled.txt`` (used for resuming interrupted
    runs).  Stops when a pair whose result is ``'-end-'`` arrives.

    Args:
        result_q: queue of ``(result, task)`` tuples produced by workers.
        task_nums: expected number of results, used to size the progress bar.
        save_path: directory that receives ``result.json`` / ``handled.txt``.
    """
    result_path = save_path + os.sep + 'result.json'
    handled_path = save_path + os.sep + 'handled.txt'
    logger.info('写入地址为:{}, 临时文件地址为:{}'.format(result_path, handled_path))
    bar = tqdm(total=task_nums)
    # Fix: open both files once instead of re-opening them for every single
    # record; flush per record to keep the same crash-resume durability the
    # per-record open/close gave.
    with open(result_path, 'a+', encoding='utf-8') as result_file, \
            open(handled_path, 'a+', encoding='utf-8') as handled_file:
        while True:
            result, task = result_q.get(True)
            if result == '-end-':
                break
            result_file.write(json.dumps(result, ensure_ascii=False) + '\n')
            result_file.flush()
            handled_file.write(task + '\n')
            handled_file.flush()
            bar.update()
    bar.close()  # fix: the original never closed the progress bar
Example #4
0
def csv_writer(result_q, task_nums, save_path):
    """Drain ``result_q`` and append results as CSV rows until the sentinel.

    Each ``(result, task)`` pair appends one row to ``result.csv`` and the
    task id to ``handle.txt`` (used for resuming interrupted runs).  Stops
    when a pair whose result is ``'-end-'`` arrives.

    Args:
        result_q: queue of ``(result, task)`` tuples produced by workers.
        task_nums: expected number of results, used to size the progress bar.
        save_path: directory that receives ``result.csv`` / ``handle.txt``.
    """
    result_path = save_path + os.sep + 'result.csv'
    handled_path = save_path + os.sep + 'handle.txt'
    logger.info('写入地址为:{}, 临时文件地址为:{}'.format(result_path, handled_path))
    bar = tqdm(total=task_nums)
    # Fix: open both files once instead of re-opening them for every single
    # record; flush per record to keep the same crash-resume durability the
    # per-record open/close gave.
    with open(result_path, 'a+', encoding='utf-8', newline='') as result_file, \
            open(handled_path, 'a+', encoding='utf-8') as handled_file:
        writer = csv.writer(result_file)
        while True:
            result, task = result_q.get(True)
            if result == '-end-':
                break
            writer.writerow(result)
            result_file.flush()
            handled_file.write(task + '\n')
            handled_file.flush()
            bar.update()
    bar.close()  # fix: the original never closed the progress bar
Example #5
0
 def from_csv(self, csv_path, headers=None, clear=False):
     """Import rows of a CSV file into the collection as unlabeled records.

     When ``headers`` is omitted, the first row of the file is used as the
     column names; with ``clear=True`` the collection is emptied first.
     Asserts that every row has exactly as many fields as there are headers.
     """
     if clear:
         self.clear()
     with open(csv_path, encoding='utf-8') as f:
         reader = csv.reader(f)
         if not headers:
             headers = next(reader)
         for row in tqdm(reader):
             assert len(headers) == len(row)
             raw = dict(zip(headers, row))
             self.db.insert({
                 'raw_data': raw,
                 'labeled_status': False,
                 'updated_time': None,
                 'labeled_user': None,
                 'labeled_data': None,
                 'check_status': False,
             })
     logger.info("successfully loaded from {}".format(csv_path))
Example #6
0
def get_response(url, user_proxy=None):
    """GET ``url`` and return the Response on HTTP 200, else ``None``.

    ``user_proxy`` may be ``None`` (direct connection), the sentinel
    ``DEFAULT_PROXY`` (fetch a proxy from the pool via ``get_proxy``), or an
    explicit requests-style proxy mapping.  Network errors are logged and
    swallowed; the response encoding is forced to utf-8.
    """
    if not user_proxy:
        proxy = None
    elif user_proxy == DEFAULT_PROXY:
        proxy = get_proxy()
        if proxy:
            proxy = {'http': proxy}
    else:
        proxy = user_proxy
    try:
        response = requests.get(url, headers=HEADERS, proxies=proxy, timeout=3)
    except Exception as err:
        logger.info(str(err))
        return None
    if response.status_code == requests.codes.ok:
        response.encoding = 'utf-8'
        return response
    return None
Example #7
0
 def build_from_csv(self,
                    file_path: str,
                    column: int,
                    type_name: str = None):
     """Load keywords from one column of a CSV file.

     Args:
         file_path: path to a UTF-8 encoded ``.csv`` file with a header row.
         column: zero-based index of the column that holds the keywords;
             asserted to be within the header range.
         type_name: entity type to tag every word with; defaults to the
             file's base name when omitted.
     """
     check_file(file_path, 'csv')
     file_name = get_file_name(file_path)
     if not type_name:
         type_name = file_name
     with open(file_path, encoding='utf8') as csv_file:
         rows = csv.reader(csv_file)
         headers = next(rows)
         assert column < len(headers)
         logger.info("headers:{}".format(','.join(headers)))
         words = [row[column] for row in rows]
     logger.info("正在从{}中导入词表,共计{}条数据".format(file_path, len(words)))
     self._kp.add_keywords_from_list(words)
     for word in tqdm(words):
         self._type_dict[word] = type_name
Example #8
0
    def build_from_dir(self, file_dir, type_name: str = None):
        """Load keyword files (csv or txt) from every file in a directory.

        CSV files contribute their second column (column index 1, the
        historical convention of this loader); anything else is treated as
        a txt word list, one word per line.

        Args:
            file_dir: directory whose files are scanned (non-recursive).
            type_name: entity type for every word; when omitted, each file
                is tagged with its own base name.
        """
        for file_path in os.listdir(file_dir):
            file_full_path = os.path.join(file_dir, file_path)
            # Bug fix: compute the per-file type in a local variable.  The
            # original assigned the shared ``type_name`` parameter on the
            # first iteration, so every subsequent file inherited the FIRST
            # file's name instead of its own.
            current_type = type_name if type_name else get_file_name(file_full_path)
            if file_path.endswith('csv'):
                file_data = []
                with open(file_full_path, encoding='utf8') as file:
                    csv_reader = csv.reader(file)
                    headers = next(csv_reader)
                    logger.info("headers:{}".format(','.join(headers)))
                    for line in csv_reader:
                        file_data.append(line[1])
            else:  # default txt format
                # Bug fix: close the handle (``open().read()`` leaked it).
                with open(file_full_path, encoding='utf8') as file:
                    file_data = file.read().split('\n')

            logger.info("正在从{}中导入词表,共计{}条数据".format(file_path, len(file_data)))
            self._kp.add_keywords_from_list(file_data)
            for word in tqdm(file_data):
                self._type_dict[word] = current_type
Example #9
0
 def load(self, save_path: str = 'ner.pt'):
     """Restore the keyword processor and type mapping from ``save_path``.

     Expects the two-object pickle layout written by ``save``.
     NOTE(review): ``pickle.load`` executes arbitrary code from the file —
     only load model files from trusted sources.
     """
     logger.info("从{}中加载模型中".format(save_path))
     with open(save_path, 'rb') as model_file:
         self._kp = pickle.load(model_file)
         self._type_dict = pickle.load(model_file)
     logger.info("成功从{}中加载模型".format(save_path))
Example #10
0
 def save(self, save_path: str = 'ner.pt'):
     """Persist the keyword processor and type mapping to ``save_path``.

     Writes two pickled objects back-to-back, in the order ``load`` reads
     them.
     """
     logger.info("将模型保存至{}中".format(save_path))
     with open(save_path, 'wb') as model_file:
         pickle.dump(self._kp, model_file)
         pickle.dump(self._type_dict, model_file)
     logger.info("成功将模型保存至{}中".format(save_path))
Example #11
0
 def clear(self):
     """Delete every document in the bound collection.

     Returns:
         The driver's delete result object from ``delete_many``.
     """
     outcome = self.cls.delete_many({})
     logger.info("has successfully clear the {} collection of {} database".format(self.cls_name, self.database_name))
     return outcome
Example #12
0
from lightutils import logger

# Demo script: exercise the four log levels exposed by lightutils' logger.
# Whether each line is actually shown depends on the logger's configured level.
logger.info("info")
logger.warning('warning')
logger.debug('debug')
logger.error('error')
Example #13
0
    def run(self, tasks, parser, notification=None):
        """Crawl ``tasks`` with multiprocess workers and persist the results.

        Pipeline: snapshot the task list to ``task.txt``; subtract tasks
        already listed in ``handled.txt`` (resume support); fan the
        remaining tasks out to one ``parser`` process per CPU core; funnel
        results through ``result_q`` into a single writer process
        (``self.writer``); finally log throughput and optionally email a
        summary.

        Args:
            tasks: iterable of task ids (ints are coerced to str).
            parser: worker callable run in each child process; called with
                (base_url, task_q, result_q, handled_tasks_list, proxy,
                interval).
            notification: optional dict with 'to' and 'task_name' keys;
                when present, an email summary is sent on completion.
        """
        # Task ids are compared against strings read back from disk below,
        # so normalize int ids to str up front.
        if type(tasks[0]) == int:
            tasks = [str(task) for task in tasks]
        # Snapshot the full task list once; later (resumed) runs keep the
        # existing file untouched.
        if not os.path.isfile(self.save_path + os.sep + 'task.txt'):
            with open(self.save_path + os.sep + 'task.txt',
                      'w',
                      encoding='utf-8') as f:
                for task in tasks:
                    f.write(task + '\n')

        # Resume support: tasks recorded in handled.txt were completed by a
        # previous run and are skipped.
        if os.path.isfile(self.save_path + os.sep + 'handled.txt'):
            with open(self.save_path + os.sep + 'handled.txt',
                      encoding='utf-8') as f:
                handled_tasks = [word.strip() for word in f]
        else:
            handled_tasks = list()
        unhandled_tasks = set(tasks) - set(handled_tasks)
        logger.info('已处理{}条数据,还需处理{}条数据'.format(len(handled_tasks),
                                                len(unhandled_tasks)))

        task_q = Queue()
        for task in unhandled_tasks:
            task_q.put(task)
        # NOTE(review): a single '-end-' sentinel is shared by N worker
        # processes; this only terminates all workers if each parser
        # re-queues the sentinel before exiting — confirm against the
        # parser implementations.
        task_q.put('-end-')

        # Shared (Manager-backed) list handed to every worker; how it is
        # used is up to ``parser``.
        handled_tasks_list = Manager().list()

        # A single writer process drains result_q so file writes never race.
        result_q = Queue()
        result_process = Process(target=self.writer,
                                 args=(result_q, len(unhandled_tasks),
                                       self.save_path))
        result_process.start()

        begin_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        logger.info('开始爬取,当前时间为:{}'.format(begin_time))
        a = time.time()  # wall-clock start, for the throughput report below

        # One crawler process per CPU core.
        process_lst = []
        for i in range(multiprocessing.cpu_count()):
            p = Process(target=parser,
                        args=(self.base_url, task_q, result_q,
                              handled_tasks_list, self.proxy, self.interval))
            process_lst.append(p)

        for p in process_lst:
            p.start()

        for p in process_lst:
            p.join()

        # All workers have exited: signal the writer to stop, then wait for
        # it to flush and terminate.
        result_q.put(('-end-', None))
        result_process.join()

        end_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        logger.info('执行结束, 当前时间为:{}'.format(end_time))
        b = time.time()
        logger.info('本次共计耗时{},共爬取{}条数据,平均速度为:{}(task/s)'.format(
            time_convert(round(b - a, 2)), len(unhandled_tasks),
            round(len(unhandled_tasks) / (b - a), 2)))
        # Optional email summary once the crawl completes.
        if notification:
            if "to" not in notification or "task_name" not in notification:
                logger.info(
                    "the notification must have 'to' and 'task_name' attribute"
                )
            else:
                result = send_email_notification(
                    to=notification["to"],
                    subject="the spider job {} completed".format(
                        notification["task_name"]),
                    contents=[
                        "本次任务开始时间:{}".format(begin_time),
                        "本次任务结束时间:{}".format(end_time),
                        "写入文件夹:{}".format(self.save_path),
                        "保存文件格式:{}".format(self.save_format),
                        '本次共计耗时{},共爬取{}条数据,平均速度为:{}(task/s)'.format(
                            time_convert(round(b - a, 2)),
                            len(unhandled_tasks),
                            round(len(unhandled_tasks) / (b - a), 2))
                    ])
                if result:
                    logger.info("邮件发送成功!")
                else:
                    logger.info("邮件发送失败,看来是哪里出了差错,是否信息填写正确呢?")