Example #1
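# Needs the standard-library json module and types.SimpleNamespace, plus
# module-level names defined elsewhere in the project: sessions, webhook,
# json_settings, link_check, scrape_choice, format_options, prepare_download
# and prepare_scraper.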
def start_datascraper(api, identifier, site_name, choice_type=None):
    print("Scrape Processing")
    info = link_check(sessions[0], identifier)
    user = info["user"]
    user = json.loads(json.dumps(
        user), object_hook=lambda d: SimpleNamespace(**d))
    if not info["exists"]:
        info["user"] = user
        return [False, info]
    is_me = user.is_me
    post_counts = info["count"]
    post_count = post_counts[0]
    user_id = str(user.id)
    avatar = user.avatar
    username = user.username
    link = user.link
    info["download"] = prepare_download.start(
        username=username, link=link, image_url=avatar, post_count=post_count, webhook=webhook)
    if not info["subbed"]:
        print(f"You are not subbed to {user.username}")
        return [False, info]

    print("Name: "+username)
    api_array = scrape_choice(user_id, post_counts, is_me)
    api_array = format_options(api_array, "apis")
    apis = api_array[0]
    api_string = api_array[1]
    if not json_settings["auto_scrape_apis"]:
        print("Apis: "+api_string)
        value = int(input().strip())
    else:
        value = 0
    if value:
        apis = [apis[value]]
    else:
        apis.pop(0)
    info["download"] = prepare_download.start(
        username=username, link=link, image_url=avatar, post_count=post_count, webhook=webhook)
    for item in apis:
        print("Type: "+item["api_type"])
        only_links = item["api_array"]["only_links"]
        post_count = str(item["api_array"]["post_count"])
        item["api_array"]["username"] = username
        api_type = item["api_type"]
        results = prepare_scraper(
            sessions, site_name, item)
        if results:
            for result in results[0]:
                if not only_links:
                    media_set = result
                    if not media_set["valid"]:
                        continue
                    directory = results[1]
                    location = result["type"]
                    info["download"].others.append(
                        [media_set["valid"], sessions, directory, username, post_count, location, api_type])
    # When profile is done scraping, this function will return True
    print("Scrape Completed"+"\n")
    return [True, info]
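A minimal usage sketch for this variant, assuming an authenticated api object and the module-level globals noted above; the identifier and site name are placeholder values:

# Hypothetical call; "example_user" and "onlyfans" are placeholder values.
scraped, info = start_datascraper(api, "example_user", "onlyfans")
if scraped:
    # Each queued entry describes one media batch:
    # [valid_media, sessions, directory, username, post_count, location, api_type]
    for job in info["download"].others:
        print(job)
else:
    print("Profile missing or not subscribed")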
Example #2
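# Needs random and itertools.product from the standard library, plus
# module-level names defined elsewhere in the project: j_directory, webhook,
# main_helper, multiprocessing, board_scraper, thread_scraper, link_check,
# scrape_choice and prepare_download.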
def start_datascraper(session, board_name, site_name, link_type, choice_type=None):
    print("Scrape Processing")
    info = link_check(session, board_name)
    if not info["exists"]:
        return [False, info]
    print("Board: " + board_name)
    array = scrape_choice(board_name)
    # `multiprocessing` here is a project helper expected to return a pool-like
    # object; the standard-library module of the same name is not callable.
    pool = multiprocessing()
    threads = board_scraper(session, array[0], "")
    archive_threads = board_scraper(session, array[1], "archive")
    threads = threads + archive_threads
    print("Original Count: "+str(len(threads)))
    formatted_directories = main_helper.format_directories(
        j_directory, site_name, board_name)
    model_directory = formatted_directories["model_directory"]
    metadata_directory = formatted_directories["metadata_directory"]
    api_directory = formatted_directories["api_directory"]
    directory = model_directory
    print("Scraping Threads")
    threads = pool.starmap(thread_scraper,
                           product(threads, [board_name], [session], [directory]))
    # Count failed (None) results before filtering them out; counting afterwards
    # would always report zero invalid threads.
    count_results = str(len([x for x in threads if x is None]))
    threads = [x for x in threads if x is not None]
    post_count = len(threads)
    print("Valid Count: "+str(post_count))
    print("Downloading Media")
    print("Invalid Count: "+count_results)
    num = random.randrange(0, 200)
    avatar = f"https://s.4cdn.org/image/title/{num}.png"
    link = info["link"]
    info["download"] = prepare_download.start(
        username=board_name, link=link, image_url=avatar, post_count=post_count, webhook=webhook)
    info["download"].others.append([threads, session, directory, board_name])
    # When profile is done scraping, this function will return True
    return [True, info]
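A minimal sketch of driving this 4chan-style variant, assuming a requests-compatible session and placeholder values for the board name and link type:

import requests

session = requests.Session()  # placeholder; the project may supply its own session
scraped, info = start_datascraper(session, "g", "4chan", "boards")
if scraped:
    # The single queued job holds the scraped threads and their target directory.
    threads, sess, directory, board = info["download"].others[0]
    print(f"{len(threads)} valid threads queued for {directory}")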
Example #3
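# Needs the standard-library os module, plus module-level names defined
# elsewhere in the project: j_directory, webhook, json_settings, main_helper,
# scrape_choice, format_options, prepare_download, prepare_scraper,
# import_archive and export_archive.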
def start_datascraper(api, identifier, site_name, choice_type=None):
    print("Scrape Processing")
    subscription = api.get_subscription(identifier)
    if not subscription:
        return [False, subscription]
    post_count = subscription.postsCount
    user_id = str(subscription.id)
    avatar = subscription.avatar
    username = subscription.username
    link = subscription.link
    formatted_directories = main_helper.format_directories(
        j_directory, site_name, username)
    metadata_directory = formatted_directories["metadata_directory"]
    archive_path = os.path.join(metadata_directory, "Mass Messages.json")
    if subscription.is_me:
        imported = import_archive(archive_path)
        mass_messages = api.get_mass_messages(resume=imported)
        export_archive(mass_messages, archive_path,
                       json_settings, rename=False)
    info = {}
    info["download"] = prepare_download.start(
        username=username, link=link, image_url=avatar, post_count=post_count, webhook=webhook)
    print("Name: "+username)
    api_array = scrape_choice(api, subscription)
    api_array = format_options(api_array, "apis")
    apis = api_array[0]
    api_string = api_array[1]
    if not json_settings["auto_scrape_apis"]:
        print("Apis: "+api_string)
        value = int(input().strip())
    else:
        value = 0
    if value:
        apis = [apis[value]]
    else:
        apis.pop(0)
    for item in apis:
        print("Type: "+item["api_type"])
        only_links = item["api_array"]["only_links"]
        post_count = str(item["api_array"]["post_count"])
        item["api_array"]["username"] = username
        item["api_array"]["subscription"] = subscription
        api_type = item["api_type"]
        results = prepare_scraper(
            api, site_name, item)
    if any(x for x in subscription.scraped):
        subscription.download_info["directory"] = j_directory
        subscription.download_info["model_directory"] = os.path.join(
            j_directory, username)
        subscription.download_info["webhook"] = webhook
    print("Scrape Completed"+"\n")
    return [True, info]
Example #4
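# Same flow as Example #3 without the os / archive helpers; webhook,
# json_settings, scrape_choice, format_options, prepare_download and
# prepare_scraper are assumed to be defined at module level.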
def start_datascraper(api, identifier, site_name, choice_type=None):
    print("Scrape Processing")
    subscription = api.get_subscription(identifier)
    if not subscription:
        return [False, subscription]
    post_count = subscription.postsCount
    user_id = str(subscription.id)
    avatar = subscription.avatar
    username = subscription.username
    link = subscription.link
    info = {}
    info["download"] = prepare_download.start(username=username,
                                              link=link,
                                              image_url=avatar,
                                              post_count=post_count,
                                              webhook=webhook)
    print("Name: " + username)
    api_array = scrape_choice(api, subscription)
    api_array = format_options(api_array, "apis")
    apis = api_array[0]
    api_string = api_array[1]
    if not json_settings["auto_scrape_apis"]:
        print("Apis: " + api_string)
        value = int(input().strip())
    else:
        value = 0
    if value:
        apis = [apis[value]]
    else:
        apis.pop(0)
    metadata_locations = {}
    for item in apis:
        print("Type: " + item["api_type"])
        only_links = item["api_array"]["only_links"]
        post_count = str(item["api_array"]["post_count"])
        item["api_array"]["username"] = username
        item["api_array"]["subscription"] = subscription
        api_type = item["api_type"]
        results = prepare_scraper(api, site_name, item)
    print("Scrape Completed" + "\n")
    return [True, info]
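Examples #3 and #4 share the same subscription-driven flow; a minimal sketch of calling either one, assuming an api object that exposes get_subscription and placeholder identifier and site-name values:

# Hypothetical call; "example_model" and "onlyfans" are placeholder values.
scraped, info = start_datascraper(api, "example_model", "onlyfans")
if scraped:
    print("Queued download:", info["download"])
else:
    print("No active subscription found for that identifier")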