Example #1
import json
import os

import numpy as np


def itdonga_crawler(file_path):
    # Crawl itdonga articles and store them as <corp>.json under file_path.
    # corp, crawler, start_from_dump and temp_dump are module-level names
    # defined elsewhere in the source project.
    pages = np.array([])
    page_num = 1
    last_page = False

    file_name = '{}.json'.format(corp)
    file = os.path.join(file_path, file_name)

    try:
        # Existing output file: load it so the crawl can stop at known data.
        with open(file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        update = True

    except FileNotFoundError:
        # First run: check for a temporary dump to resume an interrupted crawl.
        data = None
        update = False
        dump, page_num = start_from_dump(corp)

        if dump:
            pages = np.append(dump, pages)

    while not last_page:
        # Crawl one listing page at a time until the crawler reports the last page.
        one_page, last_page = crawler(page_num, whole_data=data)
        if one_page:
            pages = np.append(pages, one_page)
            page_num += 1
            temp_dump(pages, page_num, corp, update)

    if data:
        # Append the previously saved articles after the newly crawled ones.
        pages = np.append(pages, data)

    # Persist the combined result back to the JSON file.
    pages = pages.tolist()
    with open(file, 'w', encoding='utf-8') as f:
        json.dump(pages, f, indent='\t', ensure_ascii=False)
    print(corp, ' Done')
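This example (and the two that follow) uses the same load-or-create pattern: try to read the existing JSON file, and fall back to a fresh crawl when it does not exist. A minimal, self-contained sketch of that pattern; the load_or_create helper, the sample_corp.json file name and the record contents are hypothetical, not part of the original project:

import json


def load_or_create(path):
    # Return previously saved records, or None when the file does not exist yet.
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        return None


records = load_or_create('sample_corp.json')  # hypothetical file name
if records is None:
    records = [{'url': 'https://example.com/1', 'title': 'first article'}]
with open('sample_corp.json', 'w', encoding='utf-8') as f:
    json.dump(records, f, indent='\t', ensure_ascii=False)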
Example #2
import json
import os

import numpy as np


def platum_crawler(integrated_file_path, individual_file_path):
    # Crawl every Platum category into its own JSON file, then merge them.
    # CORP, CATEGORIES, crawler, start_from_dump, temp_dump and
    # integrate_files are module-level names defined elsewhere in the project.
    ended_categories = []

    for category in CATEGORIES:

        pages = np.array([])
        page_num = 1
        last_page = False

        category_file_name = '{0}_{1}'.format(CORP, category)
        category_file_name_ = '{0}_{1}.json'.format(CORP, category)
        category_file_path = os.path.join(individual_file_path,
                                          category_file_name_)

        try:
            # Existing category file: load it so the crawl can stop at known data.
            with open(category_file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            update = True

        except FileNotFoundError:
            # First run for this category: resume from a temporary dump if one exists.
            data = None
            update = False
            dump, page_num = start_from_dump(category_file_name)

            if dump:
                pages = np.append(dump, pages)

        while not last_page:
            one_page, last_page = crawler(page_num,
                                          category,
                                          ended_categories,
                                          whole_data=data)
            if one_page:
                pages = np.append(pages, one_page)
                temp_dump(pages, page_num, category_file_name, update)
            page_num += 1

        if data:
            # Append the previously saved articles after the newly crawled ones.
            pages = np.append(pages, data)

        # Record this category as finished; 'startup-3' is tracked as 'main'.
        if category == 'startup-3':
            category = 'main'

        ended_categories.append(category)

        pages = pages.tolist()
        with open(category_file_path, 'w', encoding='utf-8') as f:
            json.dump(pages, f, indent='\t', ensure_ascii=False)

    # Merge the per-category files into a single integrated file.
    integrate_files(individual_file_path, integrated_file_path, CATEGORIES,
                    CORP)
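The integrate_files helper is not shown in these examples. A plausible minimal sketch of the merge step it performs, assuming each per-category file holds a flat JSON list; the function name and parameter names below are hypothetical and the real implementation may differ:

import json
import os


def integrate_files_sketch(individual_dir, integrated_dir, categories, corp):
    # Concatenate every per-category JSON list into one integrated file.
    merged = []
    for category in categories:
        path = os.path.join(individual_dir, '{0}_{1}.json'.format(corp, category))
        with open(path, 'r', encoding='utf-8') as f:
            merged.extend(json.load(f))
    out_path = os.path.join(integrated_dir, '{}.json'.format(corp))
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(merged, f, indent='\t', ensure_ascii=False)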
Example #3
import json
import os

import numpy as np


def ainews_crawler(individual_file_path, integrated_file_path):
    # Crawl every AI news category into its own JSON file, then merge them.
    # corp, categories, crawler, start_from_dump, temp_dump and
    # integrate_files are module-level names defined elsewhere in the project.
    for category_name, category_num in categories.items():

        pages = np.array([])
        page_num = 1
        last_page = False
        individual_file_name = '{0}_{1}'.format(corp, category_name)
        individual_file = os.path.join(individual_file_path,
                                       '{}.json'.format(individual_file_name))

        try:
            # Existing category file: remember the newest saved URL so the
            # crawler can stop as soon as it reaches already-known articles.
            with open(individual_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
                last_url = data[0]['url']
            update = True

        except FileNotFoundError:
            # First run for this category: resume from a temporary dump if one exists.
            data = None
            update = False
            last_url = None
            dump, page_num = start_from_dump(individual_file_name)

            if dump:
                pages = np.append(dump, pages)

        while not last_page:
            one_page, last_page = crawler(page_num, category_num, category_name,
                                          update, last_url, whole_data=data)
            if one_page:
                pages = np.append(pages, one_page)
                page_num += 1
                temp_dump(pages, page_num, individual_file_name, update)

        if data:
            # Append the previously saved articles after the newly crawled ones.
            pages = np.append(pages, data)

        pages = pages.tolist()
        with open(individual_file, 'w', encoding='utf-8') as f:
            json.dump(pages, f, indent='\t', ensure_ascii=False)

    # Merge the per-category files into a single integrated file.
    integrate_files(individual_file_path, integrated_file_path)
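Example #3 passes last_url into crawler so the crawl can stop once it reaches an article already saved on a previous run (it assumes data[0]['url'] is the newest saved entry). The crawler itself is not shown; a minimal, self-contained sketch of that stop condition, with a hypothetical crawl_page_sketch helper and made-up page data:

def crawl_page_sketch(page, last_url):
    # Keep only articles newer than the previously saved one and report
    # whether the known article was reached (i.e. crawling can stop here).
    new_articles = []
    for article in page:
        if article['url'] == last_url:
            return new_articles, True
        new_articles.append(article)
    return new_articles, False


page = [{'url': 'https://example.com/3'},
        {'url': 'https://example.com/2'},
        {'url': 'https://example.com/1'}]
fresh, reached_known = crawl_page_sketch(page, 'https://example.com/2')
print(fresh, reached_known)  # [{'url': 'https://example.com/3'}] True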