def run():
    # Extract environmental variables
    s3_path_in = os.environ['BATCHPAR_s3_path_in']
    first_index = int(os.environ['BATCHPAR_first_index'])
    last_index = int(os.environ['BATCHPAR_last_index'])

    # Load the chunk
    s3 = boto3.resource('s3')
    s3_obj_in = s3.Object(*parse_s3_path(s3_path_in))
    data = json.load(s3_obj_in.get()['Body'])

    # Extract ngrams
    ngrammer = Ngrammer(config_filepath="mysqldb.config",
                        database="production")
    processed = []
    for i, row in enumerate(data[first_index:last_index]):
        new_row = {k: ngrammer.process_document(v)
                   if type(v) is str and len(v) > 50 else v
                   for k, v in row.items()}
        processed.append(new_row)

    # Mark the task as done and save the data
    if "BATCHPAR_outinfo" in os.environ:
        s3_path_out = os.environ["BATCHPAR_outinfo"]
        s3 = boto3.resource('s3')
        s3_obj = s3.Object(*parse_s3_path(s3_path_out))
        s3_obj.put(Body=json.dumps(processed))

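# parse_s3_path is used throughout these jobs but is not defined in these
# snippets. A minimal sketch of what it plausibly does, assuming S3 URIs of the
# form "s3://<bucket>/<key>" (boto3's s3.Object expects a (bucket, key) pair);
# the real helper may differ:
def parse_s3_path(s3_path):
    """Split an "s3://bucket/key" URI into the (bucket, key) pair."""
    path = s3_path.replace("s3://", "")
    bucket, _, key = path.partition("/")
    return bucket, key

# e.g. parse_s3_path("s3://my-bucket/some/prefix/chunk-0.json")
#      -> ("my-bucket", "some/prefix/chunk-0.json")
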
def run():
    '''Make a single request for country's worldbank data'''
    # Remove the 2 default parameters for the batch job
    outpath = os.environ.pop("BATCHPAR_outinfo")
    _ = os.environ.pop("BATCHPAR_done")

    kwargs = {}
    for k, v in os.environ.items():
        if not k.startswith("BATCHPAR_"):
            continue
        if k.isupper():
            continue
        new_key = k.replace("BATCHPAR_", "")
        if v.isdigit():
            v = int(v)
        kwargs[new_key] = v
    kwargs['data_key_path'] = ast.literal_eval(kwargs['data_key_path'])
    print("===>", kwargs)
    country_data = country_data_single_request(**kwargs)

    # Generate the output json
    data = json.dumps(country_data).encode('utf8')

    # Upload the data to S3
    s3 = boto3.resource('s3')
    s3_obj = s3.Object(*parse_s3_path(outpath))
    s3_obj.put(Body=data)

def run():
    logging.getLogger().setLevel(logging.INFO)

    # Fetch the input parameters
    member_ids = literal_eval(os.environ["BATCHPAR_member_ids"])
    s3_path = os.environ["BATCHPAR_outinfo"]
    db = os.environ["BATCHPAR_db"]

    # Generate the groups for these members
    output = []
    for member_id in member_ids:
        response = get_member_details(member_id, max_results=200)
        output += get_member_groups(response)
    logging.info("Got %s groups", len(output))

    # Load connection to the db, and create the tables
    objs = insert_data("BATCHPAR_config", "mysqldb", db,
                       Base, GroupMember, output)
    logging.info("Inserted %s groups", len(objs))

    # Mark the task as done
    s3 = boto3.resource('s3')
    s3_obj = s3.Object(*parse_s3_path(s3_path))
    s3_obj.put(Body="")
    return len(objs)

def run():
    table_name = os.environ["BATCHPAR_table_name"]
    url = os.environ["BATCHPAR_url"]
    db_name = os.environ["BATCHPAR_db_name"]
    s3_path = os.environ["BATCHPAR_outinfo"]

    # Setup the database connectors
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)
    try_until_allowed(Base.metadata.create_all, engine)
    _class = get_class_by_tablename(Base, table_name)
    Session = try_until_allowed(sessionmaker, engine)
    session = try_until_allowed(Session)

    # Commit the data
    all_pks = set()
    objs = []
    pkey_cols = _class.__table__.primary_key.columns
    for row in iterrows(url):
        if len(row) == 0:
            continue
        if session.query(exists(_class, **row)).scalar():
            continue
        pk = tuple([row[pkey.name] for pkey in pkey_cols])
        if pk in all_pks:
            continue
        all_pks.add(pk)
        objs.append(_class(**row))
    session.bulk_save_objects(objs)
    session.commit()
    session.close()

    # Mark the task as done
    s3 = boto3.resource('s3')
    s3_obj = s3.Object(*parse_s3_path(s3_path))
    s3_obj.put(Body="")

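# get_class_by_tablename is assumed above but not shown here. A sketch of one
# common way to resolve an ORM class from its table name via the declarative
# Base registry (SQLAlchemy 1.4+); the real helper may differ:
def get_class_by_tablename(Base, table_name):
    """Return the mapped class whose __tablename__ matches table_name, else None."""
    for mapper in Base.registry.mappers:
        cls = mapper.class_
        if getattr(cls, "__tablename__", None) == table_name:
            return cls
    return None
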
def deep_split(s3_path):
    """Split an S3 path of the form <s3:pathto/subbucket_name>/keys.

    Args:
        s3_path (str): S3 path string.
    Returns:
        (s3_bucket, subbucket, s3_key) (tuple of str): The bucket, the subbucket
        (the key with its final component removed) and the full key.
    """
    s3_bucket, s3_key = s3.parse_s3_path(s3_path)
    subbucket, _ = os.path.split(s3_key)
    return s3_bucket, subbucket, s3_key

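# A worked example of deep_split, assuming s3.parse_s3_path splits the URI
# into (bucket, key) as in the sketch earlier (the path here is hypothetical):
#   deep_split("s3://my-bucket/intermediate/run-1/chunk-0.json")
#   -> ("my-bucket", "intermediate/run-1", "intermediate/run-1/chunk-0.json")
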
def run(): # Get variables out s3_path_in = os.environ['BATCHPAR_s3_path_in'] s3_path_out = os.environ["BATCHPAR_outinfo"] first_index = int(os.environ['BATCHPAR_first_index']) last_index = int(os.environ['BATCHPAR_last_index']) # Load the data s3 = boto3.resource('s3') s3_obj_in = s3.Object(*parse_s3_path(s3_path_in)) data = json.load(s3_obj_in.get()['Body']) # Create a "corpus" by joining together text fields # which have been analysed by the ngrammer already n_topics = 10 n_topics_per_doc = 3 # Assign fake topics topic_loop = cycle(range(0, n_topics)) topic_nums = list(range(0, n_topics)) all_topics = [] for _ in range(0, len(data)): idx = np.random.choice(topic_nums) topics = [] counter = 0 for i, jdx in enumerate(topic_loop): if idx == jdx: counter += 1 if counter > 0: topics.append(f"FAKE_TOPIC_{i}") counter += 1 if counter > n_topics_per_doc: break all_topics.append(topics) # Mark the task as done if s3_path_out != "": s3 = boto3.resource('s3') s3_obj = s3.Object(*parse_s3_path(s3_path_out)) s3_obj.put(Body=json.dumps(all_topics))
def run():
    s3_path_in = os.environ['BATCHPAR_s3_path_in']
    text_field = optional('text_field', 'body')
    id_field = optional('id_field', 'id')
    binary = optional('binary', False)
    min_df = optional('min_df', 1)
    max_df = optional('max_df', 1.0)

    # Load the chunk
    s3 = boto3.resource('s3')
    s3_obj_in = s3.Object(*parse_s3_path(s3_path_in))
    data = json.load(s3_obj_in.get()['Body'])

    # Extract text and indexes from the data, then delete the dead weight
    _data = [merge_lists(row[text_field]) for row in data]
    index = [row[id_field] for row in data]
    assert len(_data) == len(data)
    del data

    # Build the corpus
    dct = Dictionary(_data)
    dct.filter_extremes(no_below=np.ceil(min_df * len(_data)),
                        no_above=max_df)

    # Write the data as JSON
    body = json.dumps([dict(id=idx, **term_counts(dct, row, binary))
                       for idx, row in zip(index, _data)])
    del _data
    del index
    del dct

    # Mark the task as done and save the data
    if "BATCHPAR_outinfo" in os.environ:
        s3_path_out = os.environ["BATCHPAR_outinfo"]
        s3 = boto3.resource('s3')
        s3_obj = s3.Object(*parse_s3_path(s3_path_out))
        s3_obj.put(Body=body)

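# optional(...) is assumed above; a minimal sketch of an env-var getter with a
# default, consistent with how it is called. The real helper presumably also
# coerces numeric/boolean values from strings; this sketch only covers the
# lookup-with-default part:
def optional(key, default):
    """Return BATCHPAR_<key> from the environment if set, otherwise the default."""
    return os.environ.get(f"BATCHPAR_{key}", default)
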
def run():
    '''Gets the name and age of the muppet, and increments the age.
    The result is transferred to S3.'''
    # Get parameters for the batch job
    outpath = os.environ["BATCHPAR_outinfo"]
    age = int(os.environ["BATCHPAR_age"])
    name = os.environ["BATCHPAR_name"]

    # Generate the output json
    data = json.dumps({"name": name, "age": age + 1}).encode('utf8')

    # Upload the data to S3
    s3 = boto3.resource('s3')
    s3_obj = s3.Object(*parse_s3_path(outpath))
    s3_obj.put(Body=data)

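# A sketch of how this example job might be exercised locally, by faking the
# batch parameters that the orchestrator would normally inject (the bucket and
# key here are hypothetical):
if __name__ == "__main__":
    os.environ["BATCHPAR_outinfo"] = "s3://my-test-bucket/muppets/kermit.json"
    os.environ["BATCHPAR_name"] = "Kermit"
    os.environ["BATCHPAR_age"] = "55"
    run()
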
def run(): test = literal_eval(os.environ["BATCHPAR_test"]) db_name = os.environ["BATCHPAR_db_name"] batch_size = int(os.environ["BATCHPAR_batch_size"]) # example parameter s3_path = os.environ["BATCHPAR_outinfo"] start_string = os.environ["BATCHPAR_start_string"], # example parameter offset = int(os.environ["BATCHPAR_offset"]) # reduce records in test mode if test: limit = 50 logging.info(f"Limiting to {limit} rows in test mode") else: limit = batch_size logging.info(f"Processing {offset} - {offset + limit}") # database setup logging.info(f"Using {db_name} database") engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name) try_until_allowed(Base.metadata.create_all, engine) with db_session(engine) as session: # consider moving this query and the one from the prepare step into a package batch_records = (session.query(MyTable.id, MyTable.name).filter( MyTable.founded_on > '2007-01-01').offset(offset).limit(limit)) # process and insert data processed_batch = [] for row in batch_records: processed_row = some_func(start_string=start_string, row=row) processed_batch.append(processed_row) logging.info(f"Inserting {len(processed_batch)} rows") insert_data("BATCHPAR_config", 'mysqldb', db_name, Base, MyOtherTable, processed_batch, low_memory=True) logging.info(f"Marking task as done to {s3_path}") s3 = boto3.resource('s3') s3_obj = s3.Object(*parse_s3_path(s3_path)) s3_obj.put(Body="") logging.info("Batch job complete.")
def run():
    logging.getLogger().setLevel(logging.INFO)

    # Fetch the input parameters
    group_urlnames = literal_eval(os.environ["BATCHPAR_group_urlnames"])
    group_urlnames = [x.decode("utf8") for x in group_urlnames]
    s3_path = os.environ["BATCHPAR_outinfo"]
    db = os.environ["BATCHPAR_db"]

    # Generate the groups for these members
    _output = []
    for urlname in group_urlnames:
        _info = get_group_details(urlname, max_results=200)
        if len(_info) == 0:
            continue
        _output.append(_info)
    logging.info("Processed %s groups", len(_output))

    # Flatten the output
    output = flatten_data(_output,
                          keys=[('category', 'name'),
                                ('category', 'shortname'),
                                ('category', 'id'),
                                'created',
                                'country',
                                'city',
                                'description',
                                'id',
                                'lat',
                                'lon',
                                'members',
                                'name',
                                'topics',
                                'urlname'])

    objs = insert_data("BATCHPAR_config", "mysqldb", db,
                       Base, Group, output)

    # Mark the task as done
    s3 = boto3.resource('s3')
    s3_obj = s3.Object(*parse_s3_path(s3_path))
    s3_obj.put(Body="")
    return len(objs)

def run(): # Get variables out s3_path_in = os.environ['BATCHPAR_s3_path_in'] s3_path_out = os.environ["BATCHPAR_outinfo"] first_index = int(os.environ['BATCHPAR_first_index']) last_index = int(os.environ['BATCHPAR_last_index']) # Load the data s3 = boto3.resource('s3') #s3_obj_in = s3.Object(*parse_s3_path(s3_path_in)) #data = json.load(s3_obj_in.get()['Body']) # Curate the output output = {'loss': 100, 'data': {'rows': ["DUMMY", "JSON"]}} # Mark the task as done if s3_path_out != "": s3_obj = s3.Object(*parse_s3_path(s3_path_out)) s3_obj.put(Body=json.dumps(output))
def run(): test = literal_eval(os.environ["BATCHPAR_test"]) db_name = os.environ["BATCHPAR_db_name"] table = os.environ["BATCHPAR_table"] batch_size = int(os.environ["BATCHPAR_batch_size"]) s3_path = os.environ["BATCHPAR_outinfo"] logging.warning(f"Processing {table} file") # database setup engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name) try_until_allowed(Base.metadata.create_all, engine) table_name = f"crunchbase_{table}" table_class = get_class_by_tablename(Base, table_name) # collect file nrows = 1000 if test else None df = get_files_from_tar([table], nrows=nrows)[0] logging.warning(f"{len(df)} rows in file") # get primary key fields and set of all those already existing in the db pk_cols = list(table_class.__table__.primary_key.columns) pk_names = [pk.name for pk in pk_cols] with db_session(engine) as session: existing_rows = set(session.query(*pk_cols).all()) # process and insert data processed_rows = process_non_orgs(df, existing_rows, pk_names) for batch in split_batches(processed_rows, batch_size): insert_data("BATCHPAR_config", 'mysqldb', db_name, Base, table_class, processed_rows, low_memory=True) logging.warning(f"Marking task as done to {s3_path}") s3 = boto3.resource('s3') s3_obj = s3.Object(*parse_s3_path(s3_path)) s3_obj.put(Body="") logging.warning("Batch job complete.")
def run():
    PAGE_SIZE = int(os.environ['BATCHPAR_PAGESIZE'])
    page = int(os.environ['BATCHPAR_page'])
    db = os.environ["BATCHPAR_db"]
    s3_path = os.environ["BATCHPAR_outinfo"]

    data = defaultdict(list)
    # Get all projects on this page
    projects = read_xml_from_url(TOP_URL, p=page, s=PAGE_SIZE)
    for project in projects.getchildren():
        # Extract the data for the project into 'row'
        # Then recursively extract data from nested rows into the parent 'row'
        _, row = extract_data(project)
        extract_data_recursive(project, row)
        # Flatten out any list data directly into 'data'
        unpack_list_data(row, data)
        # Append the row
        data[row.pop('entity')].append(row)

    # Much of the participant data is repeated so remove overlaps
    if 'participant' in data:
        deduplicate_participants(data)
    # Finally, extract links between entities and the core projects
    extract_link_table(data)

    objs = []
    for table_name, rows in data.items():
        _class = get_class_by_tablename(Base, f"gtr_{table_name}")
        # Remove any fields that aren't in the ORM
        cleaned_rows = [{k: v for k, v in row.items()
                         if k in _class.__dict__}
                        for row in rows]
        objs += insert_data("BATCHPAR_config", "mysqldb", db,
                            Base, _class, cleaned_rows)

    # Mark the task as done
    if s3_path != "":
        s3 = boto3.resource('s3')
        s3_obj = s3.Object(*parse_s3_path(s3_path))
        s3_obj.put(Body="")
    return len(objs)

def run(): # Get variables out s3_path_in = os.environ['BATCHPAR_s3_path_in'] s3_path_out = os.environ["BATCHPAR_outinfo"] first_index = int(os.environ['BATCHPAR_first_index']) last_index = int(os.environ['BATCHPAR_last_index']) logging.info(f"Using pars {s3_path_out} {parse_s3_path(s3_path_out)}") # Load the data s3 = boto3.resource('s3') #s3_obj_in = s3.Object(*parse_s3_path(s3_path_in)) #data = json.load(s3_obj_in.get()['Body']) # Mark the task as done if s3_path_out != "": s3_obj = s3.Object(*parse_s3_path(s3_path_out)) logging.info(f"Putting an object in {s3_path_out}, " f"{parse_s3_path(s3_path_out)}, {s3_obj}") s3_obj.put(Body=json.dumps(["DUMMY", "JSON"]))
def run():
    logging.getLogger().setLevel(logging.INFO)

    # Fetch the input parameters
    iso2 = os.environ["BATCHPAR_iso2"]
    name = os.environ["BATCHPAR_name"]
    category = os.environ["BATCHPAR_cat"]
    coords = literal_eval(os.environ["BATCHPAR_coords"])
    radius = float(os.environ["BATCHPAR_radius"])
    s3_path = os.environ["BATCHPAR_outinfo"]
    db = os.environ["BATCHPAR_db"]

    # Get the data
    mcg = MeetupCountryGroups(country_code=iso2, category=category,
                              coords=coords, radius=radius)
    mcg.get_groups_recursive()
    output = flatten_data(mcg.groups,
                          country_name=name,
                          country=iso2,
                          timestamp=func.utc_timestamp(),
                          keys=[('category', 'name'),
                                ('category', 'shortname'),
                                ('category', 'id'),
                                'description',
                                'created',
                                'country',
                                'city',
                                'id',
                                'lat',
                                'lon',
                                'members',
                                'name',
                                'topics',
                                'urlname'])

    # Add the data
    objs = insert_data("BATCHPAR_config", "mysqldb", db,
                       Base, Group, output)

    # Mark the task as done
    s3 = boto3.resource('s3')
    s3_obj = s3.Object(*parse_s3_path(s3_path))
    s3_obj.put(Body="")

    # Mainly for testing
    return len(objs)

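# flatten_data is assumed here and in the group-details job above; a sketch of
# the flattening implied by its arguments (tuple keys address nested fields and
# become underscore-joined columns, extra kwargs are stamped onto every row),
# not necessarily the real implementation:
def flatten_data(rows, keys, **extra):
    """Flatten nested group dicts into flat rows limited to the given keys."""
    output = []
    for row in rows:
        flat = dict(**extra)
        for key in keys:
            if isinstance(key, tuple):
                value = row
                for part in key:
                    value = value.get(part) if isinstance(value, dict) else None
                flat["_".join(key)] = value
            else:
                flat[key] = row.get(key)
        output.append(flat)
    return output
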
def run():
    s3_path_in = os.environ['BATCHPAR_s3_path_in']
    n_hidden = int(literal_eval(os.environ['BATCHPAR_n_hidden']))

    # Load and shape the data
    s3 = boto3.resource('s3')
    s3_obj_in = s3.Object(*parse_s3_path(s3_path_in))
    data = json.load(s3_obj_in.get()['Body'])

    # Pack the data into a sparse matrix
    ids = []      # Index of each row
    indptr = [0]  # Number of non-null entries per row
    indices = []  # Positions of non-null entries per row
    counts = []   # Term counts/weights per position
    vocab = {}    # {Term: position} lookup
    for row in data:
        ids.append(row.pop('id'))
        for term, count in row.items():
            idx = vocab.setdefault(term, len(vocab))
            indices.append(idx)
            counts.append(count)
        indptr.append(len(indices))
    X = csr_matrix((counts, indices, indptr), dtype=int)
    # {Position: term} lookup
    _vocab = {v: k for k, v in vocab.items()}

    # Fit the model
    topic_model = ct.Corex(n_hidden=n_hidden)
    topic_model.fit(X)
    topics = topic_model.get_topics()

    # Generate topic names
    topic_names = {f'topic_{itop}': [_vocab[idx] for idx, weight in topic]
                   for itop, topic in enumerate(topics)}

    # Calculate topic weights as sum(bool(term in doc)*{term_weight})
    rows = [{f'topic_{itop}': sum(row.getcol(idx).toarray()[0][0] * weight
                                  for idx, weight in topic)
             for itop, topic in enumerate(topics)}
            for row in X]

    # Zip the row indexes back in, and ignore small weights
    rows = [dict(id=id, **{k: v for k, v in row.items()
                           if v > WEIGHT_THRESHOLD})
            for id, row in zip(ids, rows)]

    # Curate the output
    output = {'loss': topic_model.tc,
              'data': {'topic_names': topic_names,
                       'rows': rows}}

    # Mark the task as done and save the data
    if "BATCHPAR_outinfo" in os.environ:
        s3_path_out = os.environ["BATCHPAR_outinfo"]
        s3 = boto3.resource('s3')
        s3_obj = s3.Object(*parse_s3_path(s3_path_out))
        s3_obj.put(Body=json.dumps(output))

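# A tiny worked example of the CSR packing used in the Corex job above, with
# two hypothetical term-count documents (the 'id' field is popped off, exactly
# as in run()):
from scipy.sparse import csr_matrix

example = [{'id': 'doc1', 'cat': 2, 'dog': 1},
           {'id': 'doc2', 'dog': 3}]
ids, indptr, indices, counts, vocab = [], [0], [], [], {}
for row in example:
    ids.append(row.pop('id'))
    for term, count in row.items():
        indices.append(vocab.setdefault(term, len(vocab)))
        counts.append(count)
    indptr.append(len(indices))
X = csr_matrix((counts, indices, indptr), dtype=int)
# vocab == {'cat': 0, 'dog': 1}, and X.toarray() ==
#   [[2, 1],
#    [0, 3]]
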
def run(): # Get variables out s3_path_in = os.environ['BATCHPAR_s3_path_in'] s3_path_out = os.environ["BATCHPAR_outinfo"] first_index = int(os.environ['BATCHPAR_first_index']) last_index = int(os.environ['BATCHPAR_last_index']) lower_tfidf_percentile = int(os.environ['BATCHPAR_lower_tfidf_percentile']) upper_tfidf_percentile = int(os.environ['BATCHPAR_upper_tfidf_percentile']) # Load the data s3 = boto3.resource('s3') s3_obj_in = s3.Object(*parse_s3_path(s3_path_in)) data = json.load(s3_obj_in.get()['Body']) # Create a "corpus" by joining together text fields # which have been analysed by the ngrammer already corpus = [] for row in data[first_index:last_index]: doc = [] for k, v in row.items(): if not (type(v) is list): continue doc += [" ".join(item) for item in v] corpus.append(" ".join(doc)) # Calculate tfidf values for all terms tvec = TfidfVectorizer() _transformed = tvec.fit_transform(corpus) # Extract a reverse lookup for indexes to terms lookup = {idx: term for term, idx in tvec.vocabulary_.items()} # Calculate the lower and upper bounds from the percentiles tfidf_values = _transformed[_transformed > 0] lower_cut = np.percentile(tfidf_values, lower_tfidf_percentile) upper_cut = np.percentile(tfidf_values, upper_tfidf_percentile) del tfidf_values # Generate the list of allowed terms for each document good_words_corpus = [] for row in chunker(_transformed, 100): good_words_doc = set(lookup[idx] for idx, value in enumerate(row) if (value > lower_cut) and (value < upper_cut)) good_words_corpus.append(good_words_doc) # Finally, filter the input data outdata = [] for row, good_words in zip(data[first_index:last_index], good_words_corpus): new_row = dict(**row) for k, v in row.items(): if not (type(v) is list): continue new_row[k] = [ " ".join(term for term in sentence if term in good_words) for sentence in v ] outdata.append(new_row) # Mark the task as done if s3_path_out != "": s3 = boto3.resource('s3') s3_obj = s3.Object(*parse_s3_path(s3_path_out)) s3_obj.put(Body=json.dumps(outdata)) else: return outdata
def run(): db_name = os.environ["BATCHPAR_db_name"] s3_path = os.environ["BATCHPAR_outinfo"] start_cursor = int(os.environ["BATCHPAR_start_cursor"]) end_cursor = int(os.environ["BATCHPAR_end_cursor"]) batch_size = end_cursor - start_cursor logging.warning(f"Retrieving {batch_size} articles between {start_cursor - 1}:{end_cursor - 1}") # Setup the database connectors engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name) try_until_allowed(Base.metadata.create_all, engine) # load arxiv subject categories to database bucket = 'innovation-mapping-general' cat_file = 'arxiv_classification/arxiv_subject_classifications.csv' load_arxiv_categories("BATCHPAR_config", db_name, bucket, cat_file) # process data articles = [] article_cats = [] resumption_token = request_token() for row in retrieve_arxiv_batch_rows(start_cursor, end_cursor, resumption_token): with db_session(engine) as session: categories = row.pop('categories', []) articles.append(row) for cat in categories: # TODO:this is inefficient and should be queried once to a set. see # iterative proceess. try: session.query(Category).filter(Category.id == cat).one() except NoResultFound: logging.warning(f"missing category: '{cat}' for article {row['id']}. Adding to Category table") session.add(Category(id=cat)) article_cats.append(dict(article_id=row['id'], category_id=cat)) inserted_articles, existing_articles, failed_articles = insert_data( "BATCHPAR_config", "mysqldb", db_name, Base, Article, articles, return_non_inserted=True) logging.warning(f"total article categories: {len(article_cats)}") inserted_article_cats, existing_article_cats, failed_article_cats = insert_data( "BATCHPAR_config", "mysqldb", db_name, Base, ArticleCategory, article_cats, return_non_inserted=True) # sanity checks before the batch is marked as done logging.warning((f'inserted articles: {len(inserted_articles)} ', f'existing articles: {len(existing_articles)} ', f'failed articles: {len(failed_articles)}')) logging.warning((f'inserted article categories: {len(inserted_article_cats)} ', f'existing article categories: {len(existing_article_cats)} ', f'failed article categories: {len(failed_article_cats)}')) if len(inserted_articles) + len(existing_articles) + len(failed_articles) != batch_size: raise ValueError(f'Inserted articles do not match original data.') if len(inserted_article_cats) + len(existing_article_cats) + len(failed_article_cats) != len(article_cats): raise ValueError(f'Inserted article categories do not match original data.') # Mark the task as done s3 = boto3.resource('s3') s3_obj = s3.Object(*parse_s3_path(s3_path)) s3_obj.put(Body="")