def combine(self, job_params):
    '''Combine the outputs from the batch jobs'''
    # Retrieve the batched data
    country_data = defaultdict(dict)
    n_rows = 0
    for i, params in enumerate(job_params):
        print(i, " of ", len(job_params))
        _body = s3.S3Target(params["outinfo"]).open("rb")
        _country_data = json.loads(_body.read().decode('utf-8'))
        for country, data in _country_data.items():
            for var_name, data_row in data.items():
                n_rows += 1
                country_data[country][var_name] = data_row
    print(f"Got {n_rows} rows of data")

    # Merge with metadata, then flatten and clean
    country_metadata = get_worldbank_resource("countries")
    flat_country_data = flatten_country_data(country_data, country_metadata)
    cleaned_data = clean_variable_names(flat_country_data)

    # Commit the data
    engine = get_mysql_engine("MYSQLDB", "mysqldb",
                              self.db_config['database'])
    Base.metadata.create_all(engine)
    Session = sessionmaker(engine)
    session = Session()
    for row in cleaned_data:
        country = WorldbankCountry(**row)
        session.add(country)
    session.commit()
    session.close()
    self.output().touch()
def combine(self, job_params):
    '''Combine the outputs from the batch jobs'''
    outdata = []
    for params in job_params:
        _body = s3.S3Target(params["outinfo"]).open("rb")
        _data = _body.read().decode('utf-8')
        outdata.append(json.loads(_data))
    with self.output().open("wb") as f:
        f.write(json.dumps(outdata).encode('utf-8'))
def get_input_length(self):
    """Retrieve the length of the input, which is stored as the value
    of the output.length file."""
    fname = self.derive_file_length_path()
    f = s3.S3Target(fname).open('rb')
    total = json.load(f)
    f.close()  # Close before validating, so the handle isn't leaked on error
    if type(total) is not int:
        raise TypeError('Expected to find integer count in '
                        f'{fname}. Instead found {type(total)}')
    return total
def combine(self, job_params): """Combine output by concatenating results.""" # Download and join logging.debug(f"{self.job_name}: Combining " f"{len(job_params)}...") size, outdata = self.combine_all_outputs(job_params) # Write the output logging.debug(f"{self.job_name}: Writing the output " f"(length {len(outdata)})...") if self.combine_outputs: f = self.output().open("wb") f.write(json.dumps(outdata).encode('utf-8')) f.close() # Write the output length as well, for book-keeping f = s3.S3Target(f"{self.s3_path_out}.length").open("wb") f.write(str(size).encode("utf-8")) f.close()
def combine_all_outputs(self, job_params):
    """Concatenate the batch outputs from S3, returning the total row
    count and (if requested) the combined data."""
    size = 0
    outdata = []
    for params in job_params:
        _body = s3.S3Target(params["outinfo"]).open("rb")
        _data = _body.read().decode('utf-8')
        _outdata = json.loads(_data)
        # Combine if required
        if len(job_params) == 1:
            outdata = _outdata
        elif self.combine_outputs:
            outdata += _outdata
        # Get the length of the data
        if type(_outdata) is not list:
            _outdata = _outdata['data']['rows']
        size += len(_outdata)
    return size, outdata
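# Usage sketch (illustrative only, not from the original tasks): the combine()
# methods above all expect "job_params" to be a list of dicts whose "outinfo"
# values are the S3 paths written by each batch job. A hypothetical driver
# might call them like this (bucket and key names are made up):
#
#     job_params = [{"outinfo": f"s3://example-bucket/run/output.{i}.json"}
#                   for i in range(n_batches)]
#     size, outdata = task.combine_all_outputs(job_params)
#     # size    -> total number of rows across all batch outputs
#     # outdata -> the concatenated rows, if task.combine_outputs is True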
def write_to_s3(self, data, ichunk):
    """Write one chunk of data to S3, returning an empty chunk buffer
    and the incremented chunk counter."""
    f = s3.S3Target(f"{self.s3_path_out}/data."
                    f"{ichunk}-{self.test}.json").open("wb")
    f.write(json.dumps(data).encode('utf-8'))
    f.close()
    return [], ichunk + 1
def output(self): return s3.S3Target(f"{self.s3_path_out}/" f"data.{self.test}.length")
def run(self):
    # Load the input data (note the input contains the path
    # to the output)
    _filename = self.cherry_picked
    if _filename is None:
        _body = self.input().open("rb")
        _filename = _body.read().decode('utf-8')
    obj = s3.S3Target(f"{self.raw_s3_path_prefix}/"
                      f"{_filename}").open('rb')
    data = json.load(obj)

    # Get DB connections and settings
    database = 'dev' if self.test else 'production'
    engine = get_mysql_engine(self.db_conf_env, 'mysqldb', database)
    ArticleTopic.__table__.drop(engine)
    CorExTopic.__table__.drop(engine)

    # Insert the topic names data
    topics = [{'id': int(topic_name.split('_')[-1]) + 1,
               'terms': terms}
              for topic_name, terms in data['data']['topic_names'].items()]
    insert_data(self.db_conf_env, 'mysqldb', database,
                Base, CorExTopic, topics, low_memory=True)
    logging.info(f'Inserted {len(topics)} topics')

    # Insert article topic weight data
    topic_articles = []
    done_ids = set()
    for row in data['data']['rows']:
        article_id = row.pop('id')
        if article_id in done_ids:
            continue
        done_ids.add(article_id)
        topic_articles += [{'topic_id': int(topic_name.split('_')[-1]) + 1,
                            'topic_weight': weight,
                            'article_id': article_id}
                           for topic_name, weight in row.items()]
        # Flush
        if len(topic_articles) > self.insert_batch_size:
            insert_data(self.db_conf_env, 'mysqldb', database, Base,
                        ArticleTopic, topic_articles, low_memory=True)
            topic_articles = []
    # Final flush
    if len(topic_articles) > 0:
        insert_data(self.db_conf_env, 'mysqldb', database, Base,
                    ArticleTopic, topic_articles, low_memory=True)
    # Touch the output
    self.output().touch()
def output(self): return s3.S3Target(f"{self.s3_path_prefix}/" f"test-{self.test}.best")
def output(self): """Points to the output""" if self.combine_outputs: return s3.S3Target(f"{self.s3_path_out}.json") return s3.S3Target(f"{self.s3_path_out}.length")
def output(self):
    '''Points to the output S3 target'''
    outname = (f"{S3PREFIX}/{self.db_table}/{self.db_database}/"
               f"{self.db_text_field}/{self.date}.json.zip")
    return s3.S3Target(outname)
def run(self): """Write data to ElasticSearch if required""" if not self.write_es: return self.cherry_picked = (f'gtr/{self.date}/'.encode('utf-8') + b'COREX_TOPIC_MODEL.n_hidden_140-0.' b'VECTORIZER.binary_True.' b'min_df_0-001.' b'text_field_abstractText' b'.NGRAM.TEST_False.json') if self.cherry_picked is None: # Read the topics data file_ptr = self.input().open("rb") path = file_ptr.read() file_ptr.close() else: path = self.cherry_picked file_io_topics = s3.S3Target( f's3://clio-data/{path.decode("utf-8")}').open("rb") topic_json = json.load(file_io_topics) file_io_topics.close() topic_lookup = topic_json['data']['topic_names'] topic_json = {row['id']: row for row in topic_json['data']['rows']} # Read the raw data file_io_input = s3.S3Target(self.s3_path_in).open("rb") dirty_json = json.load(file_io_input) file_io_input.close() uid, cleaned_json, fields = clean(dirty_json, self.dataset) # Assign topics n_topics, n_found = 0, 0 for row in cleaned_json: id_ = row[f'id_of_{self.dataset}'] if id_ not in topic_json: continue topics = [ k for k, v in topic_json[id_].items() if k != 'id' and v >= 0.2 ] n_found += 1 if len(topics) > 0: n_topics += 1 row[f"terms_topics_{self.dataset}"] = topics logging.info(f'{n_found} documents processed from a possible ' f'{len(cleaned_json)}, of which ' f'{n_topics} have been assigned topics.') fields.add(f"terms_topics_{self.dataset}") fields.add("terms_of_countryTags") fields.add("type_of_entity") # Prepare connection to ES prod_label = '' if self.production else '_dev' es_config = get_config('elasticsearch.config', 'clio') es_config['index'] = f"clio_{self.dataset}{prod_label}" aws_auth_region = es_config.pop('region') es = ElasticsearchPlus(hosts=es_config['host'], port=int(es_config['port']), use_ssl=True, entity_type=self.dataset, aws_auth_region=aws_auth_region, country_detection=True, caps_to_camel_case=True) # Dynamically generate the mapping based on a template with open("clio_mapping.json") as f: mapping = json.load(f) for f in fields: kwargs = {} _type = "text" if f.startswith("terms"): kwargs = { "fields": { "keyword": { "type": "keyword" } }, "analyzer": "terms_analyzer" } elif not f.startswith("textBody"): _type = "keyword" mapping["mappings"]["_doc"]["properties"][f] = dict(type=_type, **kwargs) # Drop, create and send data if es.indices.exists(index=es_config['index']): es.indices.delete(index=es_config['index']) es.indices.create(index=es_config['index'], body=mapping) for id_, row in zip(uid, cleaned_json): es.index(index=es_config['index'], doc_type=es_config['type'], id=id_, body=row) # Drop, create and send data es = ElasticsearchPlus(hosts=es_config['host'], port=int(es_config['port']), use_ssl=True, entity_type='topics', aws_auth_region=aws_auth_region, country_detection=False, caps_to_camel_case=False) topic_idx = f"{es_config['index']}_topics" if es.indices.exists(index=topic_idx): es.indices.delete(index=topic_idx) es.indices.create(index=topic_idx) es.index(index=topic_idx, doc_type=es_config['type'], id='topics', body=topic_lookup) # Touch the checkpoint self.output().touch()
def output(self): '''Points to the S3 Target''' return s3.S3Target(S3PREFIX + "final_output_%s.json" % self.date)
def output(self):
    '''Points to the S3 Target'''
    return s3.S3Target(S3PREFIX + 'input.json')
def output(self): '''Points to the S3 Target''' return s3.S3Target(f"{S3PREFIX}/meetup-topics-{self.routine_id}.json")
def output(self):
    return s3.S3Target(self.s3_path)