Beispiel #1
0
def crawler_multiprocessing(worker_id, task_queue_crawler, path_dir,
                            task_queue_record_article, headers, each_region,
                            each_target):
    """Worker loop: take article tasks from a queue and crawl them one by one.

    Every task is indexable as ``[position, total, shortcode]``. For each task
    the Instagram post URL is built from the shortcode, fetched with
    ``get_html`` and passed to ``get_data``. When ``get_data`` returns a post
    time, the shortcode is put on ``task_queue_record_article``, the worker
    sleeps 1-3 seconds and the task is acknowledged with ``task_done()``.
    When ``get_data`` returns ``None``, a notification is sent through
    ``line_notify`` and the loop ends.

    Args:
        worker_id: Identifier shown in progress and notification messages.
        task_queue_crawler: Queue of pending tasks; ``get``, ``qsize`` and
            ``task_done`` are called on it.
        path_dir: Forwarded unchanged to ``get_data``.
        task_queue_record_article: Queue that receives the shortcode of every
            successfully processed article.
        headers: Forwarded unchanged to ``get_html``.
        each_region: Forwarded unchanged to ``get_data``.
        each_target: Forwarded unchanged to ``get_data``.

    Returns:
        None. The function only returns after ``get_data`` yielded ``None``.
    """
    # The current URL is published as a module-level global.
    # NOTE(review): presumably other code in this file reads it (e.g. for
    # error reporting) - confirm before removing.
    global each_article_url

    while True:
        task_list_get = task_queue_crawler.get()
        print('id =', worker_id, ',stard_task-----', task_list_get[0], '/',
              task_list_get[1], '/', task_queue_crawler.qsize())
        shortcode = task_list_get[2]

        each_article_url = 'https://www.instagram.com/p/' + shortcode + '/'

        html_article = get_html(each_article_url, headers)
        post_time = get_data(html_article, path_dir, each_region, each_target)

        # Identity test: None is a singleton, so `is` is the correct check.
        if post_time is None:
            line_notify.lineNotifyMessage(msg='id =' + str(worker_id) + '\n' +
                                          str(each_article_url) +
                                          '\npost_time = ' + str(post_time))
            # NOTE(review): the worker stops here without calling task_done()
            # for this task; if the producer waits on
            # task_queue_crawler.join(), that wait would never complete.
            # Confirm this is intended.
            break

        # Record the finished article on the bookkeeping queue.
        task_queue_record_article.put(shortcode)

        now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        # Random pause between requests.
        time_sleep = random.randint(1, 3)
        print('id =', worker_id, ',done_task------',
              task_list_get[0], '/', task_list_get[1], '/',
              task_queue_crawler.qsize(), ',休息', time_sleep, '秒', now_time,
              each_article_url, '發文時間:', post_time)
        time.sleep(time_sleep)
        task_queue_crawler.task_done()
Beispiel #2
0
def put_task_queue_crawler(task_queue_crawler, record_list,
                           record_article_set):
    """Enqueue every record that has not been seen yet as a crawl task.

    Each new record is put on ``task_queue_crawler`` as the list
    ``[position, total, record]``, where ``position`` is the record's 1-based
    position in ``record_list`` and ``total`` is ``len(record_list)``. The
    record is then added to ``record_article_set`` so it is never enqueued
    twice. Whenever the set size reaches a multiple of 20000, a progress
    message containing the module-level ``search_tag`` is sent through
    ``line_notify``.

    Args:
        task_queue_crawler: Queue receiving the tasks (only ``put`` is used).
        record_list: Sized iterable of hashable records.
        record_article_set: Set of records already enqueued; mutated in place.

    Returns:
        The same ``record_article_set`` object, updated with the new records.
    """
    global search_tag  # module-level value, only read for the progress text

    total = len(record_list)
    # enumerate() yields the position directly instead of an O(n)
    # record_list.index() lookup per record. The result is the same: a record
    # that passes the membership test below has no earlier occurrence in the
    # list, otherwise that occurrence would already have added it to the set.
    for position, each_record in enumerate(record_list, start=1):
        if each_record in record_article_set:
            continue

        task_queue_crawler.put([position, total, each_record])
        record_article_set.add(each_record)

        # Progress notification every 20000 collected records.
        data_count = len(record_article_set)
        if data_count % 20000 == 0:
            line_notify.lineNotifyMessage(msg=str(search_tag) + ' 已收集: ' +
                                          str(data_count))

    return record_article_set
Beispiel #3
0
                #     request.urlretrieve(img_url,path_dir_each + '/' + title + '.jpg')

        return posting_time
    except Exception as e:
        print(e)


# def collect_Exception(e,url=''):
#     path = './{0}_Exception.txt'.format(os.path.basename(__file__).replace('.py', ''))
#     time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
#     save_Exception_dict = {"time":time,"Exception":str(e),"url":url}
#     js_data_Exception = json.dumps(save_Exception_dict)
#     # 存文字檔
#     with open(path, 'a', encoding='utf8') as f:
#         f.write( js_data_Exception + '\n')

if __name__ == '__main__':

    # Tag components; edit both lists by hand to choose what gets crawled.
    tag_list_target = ['景點', '美食']
    tag_list_region = ['台北', '新北', '基隆', '桃園', '新竹', '宜蘭']
    # Shorter alternative region list: ['桃園', '新竹','宜蘭']

    # Target-major order: every region is crawled for the first target before
    # moving on to the next target.
    for each_target, each_region in ((target, region)
                                     for target in tag_list_target
                                     for region in tag_list_region):
        print('開始爬:', each_region + each_target)
        line_notify.lineNotifyMessage(
            msg='開始爬:{0}{1}'.format(each_region, each_target))
        main(each_region, each_target)

    print('Complete!!!!!!!!!!')
Beispiel #4
0
                            str_tmp += each_word + " "

            except Exception as e:
                print(e)
                continue

            # 斷詞結果存檔
            segSaveFile = save_path
            with open(segSaveFile, 'ab') as saveFile:
                saveFile.write(str_tmp.encode('utf-8') + '\n'.encode('utf-8'))

    cost_time = time.time() - time_start
    print('ckiptagger 花了', cost_time / 3600, '小時')
    print('save_path=', save_path)


def main():
    """Set up INFO-level logging, then run ``ckiptagger`` on both text files.

    The place file is processed first, the food file second; each input is
    written to its own output path.
    """
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s : %(levelname)s : %(message)s')

    # (input path, output path) pairs, handled in this order.
    jobs = (
        ('../combin/raw_place.txt', './segDone_place.txt'),
        ('../combin/raw_food.txt', './segDone_food.txt'),
    )
    for source_path, target_path in jobs:
        ckiptagger(load_txt_path=source_path, save_path=target_path)


if __name__ == '__main__':
    # Script entry point: run main(), then report completion on stdout and
    # through line_notify.
    main()
    print('Complete!!!!!!!!!!')
    line_notify.lineNotifyMessage(msg='combin_ig_txt Complete!!!!!!!!!!')