def run():
    # Extract environmental variables
    s3_path_in = os.environ['BATCHPAR_s3_path_in']
    first_index = int(os.environ['BATCHPAR_first_index'])
    last_index = int(os.environ['BATCHPAR_last_index'])

    # Load the chunk
    s3 = boto3.resource('s3')
    s3_obj_in = s3.Object(*parse_s3_path(s3_path_in))
    data = json.load(s3_obj_in.get()['Body'])

    # Extract ngrams
    ngrammer = Ngrammer(config_filepath="mysqldb.config",
                        database="production")
    processed = []
    for i, row in enumerate(data[first_index:last_index]):
        new_row = {k: ngrammer.process_document(v)
                   if type(v) is str and len(v) > 50 else v
                   for k, v in row.items()}
        processed.append(new_row)

    # Mark the task as done and save the data
    if "BATCHPAR_outinfo" in os.environ:
        s3_path_out = os.environ["BATCHPAR_outinfo"]
        s3 = boto3.resource('s3')
        s3_obj = s3.Object(*parse_s3_path(s3_path_out))
        s3_obj.put(Body=json.dumps(processed))

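# parse_s3_path is used throughout these jobs but is not defined in these
# snippets. A minimal sketch of what it plausibly does, assuming S3 URIs of the
# form "s3://<bucket>/<key>" (boto3's s3.Object expects a (bucket, key) pair);
# the real helper may differ:
def parse_s3_path(s3_path):
    """Split an "s3://bucket/key" URI into the (bucket, key) pair."""
    path = s3_path.replace("s3://", "")
    bucket, _, key = path.partition("/")
    return bucket, key

# e.g. parse_s3_path("s3://my-bucket/some/prefix/chunk-0.json")
#      -> ("my-bucket", "some/prefix/chunk-0.json")
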
def run():
    '''Make a single request for country's worldbank data'''
    # Remove the 2 default parameters for the batch job
    outpath = os.environ.pop("BATCHPAR_outinfo")
    _ = os.environ.pop("BATCHPAR_done")

    kwargs = {}
    for k, v in os.environ.items():
        if not k.startswith("BATCHPAR_"):
            continue
        if k.isupper():
            continue
        new_key = k.replace("BATCHPAR_", "")
        if v.isdigit():
            v = int(v)
        kwargs[new_key] = v
    kwargs['data_key_path'] = ast.literal_eval(kwargs['data_key_path'])
    print("===>", kwargs)
    country_data = country_data_single_request(**kwargs)

    # Generate the output json
    data = json.dumps(country_data).encode('utf8')

    # Upload the data to S3
    s3 = boto3.resource('s3')
    s3_obj = s3.Object(*parse_s3_path(outpath))
    s3_obj.put(Body=data)

def run():
    logging.getLogger().setLevel(logging.INFO)

    # Fetch the input parameters
    member_ids = literal_eval(os.environ["BATCHPAR_member_ids"])
    s3_path = os.environ["BATCHPAR_outinfo"]
    db = os.environ["BATCHPAR_db"]

    # Generate the groups for these members
    output = []
    for member_id in member_ids:
        response = get_member_details(member_id, max_results=200)
        output += get_member_groups(response)
    logging.info("Got %s groups", len(output))

    # Load connection to the db, and create the tables
    objs = insert_data("BATCHPAR_config", "mysqldb", db,
                       Base, GroupMember, output)
    logging.info("Inserted %s groups", len(objs))

    # Mark the task as done
    s3 = boto3.resource('s3')
    s3_obj = s3.Object(*parse_s3_path(s3_path))
    s3_obj.put(Body="")
    return len(objs)

def run():
    table_name = os.environ["BATCHPAR_table_name"]
    url = os.environ["BATCHPAR_url"]
    db_name = os.environ["BATCHPAR_db_name"]
    s3_path = os.environ["BATCHPAR_outinfo"]

    # Setup the database connectors
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)
    try_until_allowed(Base.metadata.create_all, engine)
    _class = get_class_by_tablename(Base, table_name)
    Session = try_until_allowed(sessionmaker, engine)
    session = try_until_allowed(Session)

    # Commit the data
    all_pks = set()
    objs = []
    pkey_cols = _class.__table__.primary_key.columns
    for row in iterrows(url):
        if len(row) == 0:
            continue
        if session.query(exists(_class, **row)).scalar():
            continue
        pk = tuple([row[pkey.name] for pkey in pkey_cols])
        if pk in all_pks:
            continue
        all_pks.add(pk)
        objs.append(_class(**row))
    session.bulk_save_objects(objs)
    session.commit()
    session.close()

    # Mark the task as done
    s3 = boto3.resource('s3')
    s3_obj = s3.Object(*parse_s3_path(s3_path))
    s3_obj.put(Body="")

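# get_class_by_tablename is assumed above but not shown here. A sketch of one
# common way to resolve an ORM class from its table name via the declarative
# Base registry (SQLAlchemy 1.4+); the real helper may differ:
def get_class_by_tablename(Base, table_name):
    """Return the mapped class whose __tablename__ matches table_name, else None."""
    for mapper in Base.registry.mappers:
        cls = mapper.class_
        if getattr(cls, "__tablename__", None) == table_name:
            return cls
    return None
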
def deep_split(s3_path):
    """Split an S3 path of the form <s3:pathto/subbucket_name>/keys.

    Args:
        s3_path (str): S3 path string.
    Returns:
        (s3_bucket, subbucket, s3_key) (tuple of str): The bucket, the subbucket
        (the key with its final component removed) and the full key.
    """
    s3_bucket, s3_key = s3.parse_s3_path(s3_path)
    subbucket, _ = os.path.split(s3_key)
    return s3_bucket, subbucket, s3_key

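# A worked example of deep_split, assuming s3.parse_s3_path splits the URI
# into (bucket, key) as in the sketch earlier (the path here is hypothetical):
#   deep_split("s3://my-bucket/intermediate/run-1/chunk-0.json")
#   -> ("my-bucket", "intermediate/run-1", "intermediate/run-1/chunk-0.json")
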
def run(): # Get variables out s3_path_in = os.environ['BATCHPAR_s3_path_in'] s3_path_out = os.environ["BATCHPAR_outinfo"] first_index = int(os.environ['BATCHPAR_first_index']) last_index = int(os.environ['BATCHPAR_last_index']) # Load the data s3 = boto3.resource('s3') s3_obj_in = s3.Object(*parse_s3_path(s3_path_in)) data = json.load(s3_obj_in.get()['Body']) # Create a "corpus" by joining together text fields # which have been analysed by the ngrammer already n_topics = 10 n_topics_per_doc = 3 # Assign fake topics topic_loop = cycle(range(0, n_topics)) topic_nums = list(range(0, n_topics)) all_topics = [] for _ in range(0, len(data)): idx = np.random.choice(topic_nums) topics = [] counter = 0 for i, jdx in enumerate(topic_loop): if idx == jdx: counter += 1 if counter > 0: topics.append(f"FAKE_TOPIC_{i}") counter += 1 if counter > n_topics_per_doc: break all_topics.append(topics) # Mark the task as done if s3_path_out != "": s3 = boto3.resource('s3') s3_obj = s3.Object(*parse_s3_path(s3_path_out)) s3_obj.put(Body=json.dumps(all_topics))
def run():
    s3_path_in = os.environ['BATCHPAR_s3_path_in']
    text_field = optional('text_field', 'body')
    id_field = optional('id_field', 'id')
    binary = optional('binary', False)
    min_df = optional('min_df', 1)
    max_df = optional('max_df', 1.0)

    # Load the chunk
    s3 = boto3.resource('s3')
    s3_obj_in = s3.Object(*parse_s3_path(s3_path_in))
    data = json.load(s3_obj_in.get()['Body'])

    # Extract text and indexes from the data, then delete the dead weight
    _data = [merge_lists(row[text_field]) for row in data]
    index = [row[id_field] for row in data]
    assert len(_data) == len(data)
    del data

    # Build the corpus
    dct = Dictionary(_data)
    dct.filter_extremes(no_below=np.ceil(min_df * len(_data)),
                        no_above=max_df)

    # Write the data as JSON
    body = json.dumps([dict(id=idx, **term_counts(dct, row, binary))
                       for idx, row in zip(index, _data)])
    del _data
    del index
    del dct

    # Mark the task as done and save the data
    if "BATCHPAR_outinfo" in os.environ:
        s3_path_out = os.environ["BATCHPAR_outinfo"]
        s3 = boto3.resource('s3')
        s3_obj = s3.Object(*parse_s3_path(s3_path_out))
        s3_obj.put(Body=body)

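# optional(...) is assumed above; a minimal sketch of an env-var getter with a
# default, consistent with how it is called. The real helper presumably also
# coerces numeric/boolean values from strings; this sketch only covers the
# lookup-with-default part:
def optional(key, default):
    """Return BATCHPAR_<key> from the environment if set, otherwise the default."""
    return os.environ.get(f"BATCHPAR_{key}", default)
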
def run():
    '''Gets the name and age of the muppet, and increments the age.
    The result is transferred to S3.'''
    # Get parameters for the batch job
    outpath = os.environ["BATCHPAR_outinfo"]
    age = int(os.environ["BATCHPAR_age"])
    name = os.environ["BATCHPAR_name"]

    # Generate the output json
    data = json.dumps({"name": name, "age": age + 1}).encode('utf8')

    # Upload the data to S3
    s3 = boto3.resource('s3')
    s3_obj = s3.Object(*parse_s3_path(outpath))
    s3_obj.put(Body=data)

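# A sketch of how this example job might be exercised locally, by faking the
# batch parameters that the orchestrator would normally inject (the bucket and
# key here are hypothetical):
if __name__ == "__main__":
    os.environ["BATCHPAR_outinfo"] = "s3://my-test-bucket/muppets/kermit.json"
    os.environ["BATCHPAR_name"] = "Kermit"
    os.environ["BATCHPAR_age"] = "55"
    run()
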
def run(): test = literal_eval(os.environ["BATCHPAR_test"]) db_name = os.environ["BATCHPAR_db_name"] batch_size = int(os.environ["BATCHPAR_batch_size"]) # example parameter s3_path = os.environ["BATCHPAR_outinfo"] start_string = os.environ["BATCHPAR_start_string"], # example parameter offset = int(os.environ["BATCHPAR_offset"]) # reduce records in test mode if test: limit = 50 logging.info(f"Limiting to {limit} rows in test mode") else: limit = batch_size logging.info(f"Processing {offset} - {offset + limit}") # database setup logging.info(f"Using {db_name} database") engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name) try_until_allowed(Base.metadata.create_all, engine) with db_session(engine) as session: # consider moving this query and the one from the prepare step into a package batch_records = (session.query(MyTable.id, MyTable.name).filter( MyTable.founded_on > '2007-01-01').offset(offset).limit(limit)) # process and insert data processed_batch = [] for row in batch_records: processed_row = some_func(start_string=start_string, row=row) processed_batch.append(processed_row) logging.info(f"Inserting {len(processed_batch)} rows") insert_data("BATCHPAR_config", 'mysqldb', db_name, Base, MyOtherTable, processed_batch, low_memory=True) logging.info(f"Marking task as done to {s3_path}") s3 = boto3.resource('s3') s3_obj = s3.Object(*parse_s3_path(s3_path)) s3_obj.put(Body="") logging.info("Batch job complete.")
def run():
    logging.getLogger().setLevel(logging.INFO)

    # Fetch the input parameters
    group_urlnames = literal_eval(os.environ["BATCHPAR_group_urlnames"])
    group_urlnames = [x.decode("utf8") for x in group_urlnames]
    s3_path = os.environ["BATCHPAR_outinfo"]
    db = os.environ["BATCHPAR_db"]

    # Generate the groups for these members
    _output = []
    for urlname in group_urlnames:
        _info = get_group_details(urlname, max_results=200)
        if len(_info) == 0:
            continue
        _output.append(_info)
    logging.info("Processed %s groups", len(_output))

    # Flatten the output
    output = flatten_data(_output,
                          keys=[('category', 'name'),
                                ('category', 'shortname'),
                                ('category', 'id'),
                                'created',
                                'country',
                                'city',
                                'description',
                                'id',
                                'lat',
                                'lon',
                                'members',
                                'name',
                                'topics',
                                'urlname'])

    objs = insert_data("BATCHPAR_config", "mysqldb", db,
                       Base, Group, output)

    # Mark the task as done
    s3 = boto3.resource('s3')
    s3_obj = s3.Object(*parse_s3_path(s3_path))
    s3_obj.put(Body="")
    return len(objs)

def run(): # Get variables out s3_path_in = os.environ['BATCHPAR_s3_path_in'] s3_path_out = os.environ["BATCHPAR_outinfo"] first_index = int(os.environ['BATCHPAR_first_index']) last_index = int(os.environ['BATCHPAR_last_index']) # Load the data s3 = boto3.resource('s3') #s3_obj_in = s3.Object(*parse_s3_path(s3_path_in)) #data = json.load(s3_obj_in.get()['Body']) # Curate the output output = {'loss': 100, 'data': {'rows': ["DUMMY", "JSON"]}} # Mark the task as done if s3_path_out != "": s3_obj = s3.Object(*parse_s3_path(s3_path_out)) s3_obj.put(Body=json.dumps(output))
def run(): test = literal_eval(os.environ["BATCHPAR_test"]) db_name = os.environ["BATCHPAR_db_name"] table = os.environ["BATCHPAR_table"] batch_size = int(os.environ["BATCHPAR_batch_size"]) s3_path = os.environ["BATCHPAR_outinfo"] logging.warning(f"Processing {table} file") # database setup engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name) try_until_allowed(Base.metadata.create_all, engine) table_name = f"crunchbase_{table}" table_class = get_class_by_tablename(Base, table_name) # collect file nrows = 1000 if test else None df = get_files_from_tar([table], nrows=nrows)[0] logging.warning(f"{len(df)} rows in file") # get primary key fields and set of all those already existing in the db pk_cols = list(table_class.__table__.primary_key.columns) pk_names = [pk.name for pk in pk_cols] with db_session(engine) as session: existing_rows = set(session.query(*pk_cols).all()) # process and insert data processed_rows = process_non_orgs(df, existing_rows, pk_names) for batch in split_batches(processed_rows, batch_size): insert_data("BATCHPAR_config", 'mysqldb', db_name, Base, table_class, processed_rows, low_memory=True) logging.warning(f"Marking task as done to {s3_path}") s3 = boto3.resource('s3') s3_obj = s3.Object(*parse_s3_path(s3_path)) s3_obj.put(Body="") logging.warning("Batch job complete.")
def run():
    PAGE_SIZE = int(os.environ['BATCHPAR_PAGESIZE'])
    page = int(os.environ['BATCHPAR_page'])
    db = os.environ["BATCHPAR_db"]
    s3_path = os.environ["BATCHPAR_outinfo"]

    data = defaultdict(list)
    # Get all projects on this page
    projects = read_xml_from_url(TOP_URL, p=page, s=PAGE_SIZE)
    for project in projects.getchildren():
        # Extract the data for the project into 'row'
        # Then recursively extract data from nested rows into the parent 'row'
        _, row = extract_data(project)
        extract_data_recursive(project, row)
        # Flatten out any list data directly into 'data'
        unpack_list_data(row, data)
        # Append the row
        data[row.pop('entity')].append(row)

    # Much of the participant data is repeated so remove overlaps
    if 'participant' in data:
        deduplicate_participants(data)
    # Finally, extract links between entities and the core projects
    extract_link_table(data)

    objs = []
    for table_name, rows in data.items():
        _class = get_class_by_tablename(Base, f"gtr_{table_name}")
        # Remove any fields that aren't in the ORM
        cleaned_rows = [{k: v for k, v in row.items()
                         if k in _class.__dict__}
                        for row in rows]
        objs += insert_data("BATCHPAR_config", "mysqldb", db,
                            Base, _class, cleaned_rows)

    # Mark the task as done
    if s3_path != "":
        s3 = boto3.resource('s3')
        s3_obj = s3.Object(*parse_s3_path(s3_path))
        s3_obj.put(Body="")
    return len(objs)

def run(): # Get variables out s3_path_in = os.environ['BATCHPAR_s3_path_in'] s3_path_out = os.environ["BATCHPAR_outinfo"] first_index = int(os.environ['BATCHPAR_first_index']) last_index = int(os.environ['BATCHPAR_last_index']) logging.info(f"Using pars {s3_path_out} {parse_s3_path(s3_path_out)}") # Load the data s3 = boto3.resource('s3') #s3_obj_in = s3.Object(*parse_s3_path(s3_path_in)) #data = json.load(s3_obj_in.get()['Body']) # Mark the task as done if s3_path_out != "": s3_obj = s3.Object(*parse_s3_path(s3_path_out)) logging.info(f"Putting an object in {s3_path_out}, " f"{parse_s3_path(s3_path_out)}, {s3_obj}") s3_obj.put(Body=json.dumps(["DUMMY", "JSON"]))
def run():
    logging.getLogger().setLevel(logging.INFO)

    # Fetch the input parameters
    iso2 = os.environ["BATCHPAR_iso2"]
    name = os.environ["BATCHPAR_name"]
    category = os.environ["BATCHPAR_cat"]
    coords = literal_eval(os.environ["BATCHPAR_coords"])
    radius = float(os.environ["BATCHPAR_radius"])
    s3_path = os.environ["BATCHPAR_outinfo"]
    db = os.environ["BATCHPAR_db"]

    # Get the data
    mcg = MeetupCountryGroups(country_code=iso2, category=category,
                              coords=coords, radius=radius)
    mcg.get_groups_recursive()
    output = flatten_data(mcg.groups,
                          country_name=name,
                          country=iso2,
                          timestamp=func.utc_timestamp(),
                          keys=[('category', 'name'),
                                ('category', 'shortname'),
                                ('category', 'id'),
                                'description',
                                'created',
                                'country',
                                'city',
                                'id',
                                'lat',
                                'lon',
                                'members',
                                'name',
                                'topics',
                                'urlname'])

    # Add the data
    objs = insert_data("BATCHPAR_config", "mysqldb", db,
                       Base, Group, output)

    # Mark the task as done
    s3 = boto3.resource('s3')
    s3_obj = s3.Object(*parse_s3_path(s3_path))
    s3_obj.put(Body="")

    # Mainly for testing
    return len(objs)

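# flatten_data is assumed here and in the group-details job above; a sketch of
# the flattening implied by its arguments (tuple keys address nested fields and
# become underscore-joined columns, extra kwargs are stamped onto every row),
# not necessarily the real implementation:
def flatten_data(rows, keys, **extra):
    """Flatten nested group dicts into flat rows limited to the given keys."""
    output = []
    for row in rows:
        flat = dict(**extra)
        for key in keys:
            if isinstance(key, tuple):
                value = row
                for part in key:
                    value = value.get(part) if isinstance(value, dict) else None
                flat["_".join(key)] = value
            else:
                flat[key] = row.get(key)
        output.append(flat)
    return output
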
def run():
    s3_path_in = os.environ['BATCHPAR_s3_path_in']
    n_hidden = int(literal_eval(os.environ['BATCHPAR_n_hidden']))

    # Load and shape the data
    s3 = boto3.resource('s3')
    s3_obj_in = s3.Object(*parse_s3_path(s3_path_in))
    data = json.load(s3_obj_in.get()['Body'])

    # Pack the data into a sparse matrix
    ids = []      # Index of each row
    indptr = [0]  # Number of non-null entries per row
    indices = []  # Positions of non-null entries per row
    counts = []   # Term counts/weights per position
    vocab = {}    # {Term: position} lookup
    for row in data:
        ids.append(row.pop('id'))
        for term, count in row.items():
            idx = vocab.setdefault(term, len(vocab))
            indices.append(idx)
            counts.append(count)
        indptr.append(len(indices))
    X = csr_matrix((counts, indices, indptr), dtype=int)
    # {Position: term} lookup
    _vocab = {v: k for k, v in vocab.items()}

    # Fit the model
    topic_model = ct.Corex(n_hidden=n_hidden)
    topic_model.fit(X)
    topics = topic_model.get_topics()

    # Generate topic names
    topic_names = {f'topic_{itop}': [_vocab[idx] for idx, weight in topic]
                   for itop, topic in enumerate(topics)}

    # Calculate topic weights as sum(bool(term in doc)*{term_weight})
    rows = [{f'topic_{itop}': sum(row.getcol(idx).toarray()[0][0] * weight
                                  for idx, weight in topic)
             for itop, topic in enumerate(topics)}
            for row in X]

    # Zip the row indexes back in, and ignore small weights
    rows = [dict(id=id, **{k: v for k, v in row.items()
                           if v > WEIGHT_THRESHOLD})
            for id, row in zip(ids, rows)]

    # Curate the output
    output = {'loss': topic_model.tc,
              'data': {'topic_names': topic_names,
                       'rows': rows}}

    # Mark the task as done and save the data
    if "BATCHPAR_outinfo" in os.environ:
        s3_path_out = os.environ["BATCHPAR_outinfo"]
        s3 = boto3.resource('s3')
        s3_obj = s3.Object(*parse_s3_path(s3_path_out))
        s3_obj.put(Body=json.dumps(output))

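# A tiny worked example of the CSR packing used in the Corex job above, with
# two hypothetical term-count documents (the 'id' field is popped off, exactly
# as in run()):
from scipy.sparse import csr_matrix

example = [{'id': 'doc1', 'cat': 2, 'dog': 1},
           {'id': 'doc2', 'dog': 3}]
ids, indptr, indices, counts, vocab = [], [0], [], [], {}
for row in example:
    ids.append(row.pop('id'))
    for term, count in row.items():
        indices.append(vocab.setdefault(term, len(vocab)))
        counts.append(count)
    indptr.append(len(indices))
X = csr_matrix((counts, indices, indptr), dtype=int)
# vocab == {'cat': 0, 'dog': 1}, and X.toarray() ==
#   [[2, 1],
#    [0, 3]]
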
def run(): # Get variables out s3_path_in = os.environ['BATCHPAR_s3_path_in'] s3_path_out = os.environ["BATCHPAR_outinfo"] first_index = int(os.environ['BATCHPAR_first_index']) last_index = int(os.environ['BATCHPAR_last_index']) lower_tfidf_percentile = int(os.environ['BATCHPAR_lower_tfidf_percentile']) upper_tfidf_percentile = int(os.environ['BATCHPAR_upper_tfidf_percentile']) # Load the data s3 = boto3.resource('s3') s3_obj_in = s3.Object(*parse_s3_path(s3_path_in)) data = json.load(s3_obj_in.get()['Body']) # Create a "corpus" by joining together text fields # which have been analysed by the ngrammer already corpus = [] for row in data[first_index:last_index]: doc = [] for k, v in row.items(): if not (type(v) is list): continue doc += [" ".join(item) for item in v] corpus.append(" ".join(doc)) # Calculate tfidf values for all terms tvec = TfidfVectorizer() _transformed = tvec.fit_transform(corpus) # Extract a reverse lookup for indexes to terms lookup = {idx: term for term, idx in tvec.vocabulary_.items()} # Calculate the lower and upper bounds from the percentiles tfidf_values = _transformed[_transformed > 0] lower_cut = np.percentile(tfidf_values, lower_tfidf_percentile) upper_cut = np.percentile(tfidf_values, upper_tfidf_percentile) del tfidf_values # Generate the list of allowed terms for each document good_words_corpus = [] for row in chunker(_transformed, 100): good_words_doc = set(lookup[idx] for idx, value in enumerate(row) if (value > lower_cut) and (value < upper_cut)) good_words_corpus.append(good_words_doc) # Finally, filter the input data outdata = [] for row, good_words in zip(data[first_index:last_index], good_words_corpus): new_row = dict(**row) for k, v in row.items(): if not (type(v) is list): continue new_row[k] = [ " ".join(term for term in sentence if term in good_words) for sentence in v ] outdata.append(new_row) # Mark the task as done if s3_path_out != "": s3 = boto3.resource('s3') s3_obj = s3.Object(*parse_s3_path(s3_path_out)) s3_obj.put(Body=json.dumps(outdata)) else: return outdata
def run(): db_name = os.environ["BATCHPAR_db_name"] s3_path = os.environ["BATCHPAR_outinfo"] start_cursor = int(os.environ["BATCHPAR_start_cursor"]) end_cursor = int(os.environ["BATCHPAR_end_cursor"]) batch_size = end_cursor - start_cursor logging.warning(f"Retrieving {batch_size} articles between {start_cursor - 1}:{end_cursor - 1}") # Setup the database connectors engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name) try_until_allowed(Base.metadata.create_all, engine) # load arxiv subject categories to database bucket = 'innovation-mapping-general' cat_file = 'arxiv_classification/arxiv_subject_classifications.csv' load_arxiv_categories("BATCHPAR_config", db_name, bucket, cat_file) # process data articles = [] article_cats = [] resumption_token = request_token() for row in retrieve_arxiv_batch_rows(start_cursor, end_cursor, resumption_token): with db_session(engine) as session: categories = row.pop('categories', []) articles.append(row) for cat in categories: # TODO:this is inefficient and should be queried once to a set. see # iterative proceess. try: session.query(Category).filter(Category.id == cat).one() except NoResultFound: logging.warning(f"missing category: '{cat}' for article {row['id']}. Adding to Category table") session.add(Category(id=cat)) article_cats.append(dict(article_id=row['id'], category_id=cat)) inserted_articles, existing_articles, failed_articles = insert_data( "BATCHPAR_config", "mysqldb", db_name, Base, Article, articles, return_non_inserted=True) logging.warning(f"total article categories: {len(article_cats)}") inserted_article_cats, existing_article_cats, failed_article_cats = insert_data( "BATCHPAR_config", "mysqldb", db_name, Base, ArticleCategory, article_cats, return_non_inserted=True) # sanity checks before the batch is marked as done logging.warning((f'inserted articles: {len(inserted_articles)} ', f'existing articles: {len(existing_articles)} ', f'failed articles: {len(failed_articles)}')) logging.warning((f'inserted article categories: {len(inserted_article_cats)} ', f'existing article categories: {len(existing_article_cats)} ', f'failed article categories: {len(failed_article_cats)}')) if len(inserted_articles) + len(existing_articles) + len(failed_articles) != batch_size: raise ValueError(f'Inserted articles do not match original data.') if len(inserted_article_cats) + len(existing_article_cats) + len(failed_article_cats) != len(article_cats): raise ValueError(f'Inserted article categories do not match original data.') # Mark the task as done s3 = boto3.resource('s3') s3_obj = s3.Object(*parse_s3_path(s3_path)) s3_obj.put(Body="")