Example #1
    def combine(self, job_params):
        '''Combine the outputs from the batch jobs'''

        # Retrieve the batched data
        country_data = defaultdict(dict)
        n_rows = 0
        for i, params in enumerate(job_params):
            print(i, " of ", len(job_params))
            _body = s3.S3Target(params["outinfo"]).open("rb")
            _country_data = json.loads(_body.read().decode('utf-8'))
            for country, data in _country_data.items():
                for var_name, data_row in data.items():
                    n_rows += 1
                    country_data[country][var_name] = data_row
        print(f"Got {n_rows} rows of data")

        # Merge with metadata, then flatten and clean
        country_metadata = get_worldbank_resource("countries")
        flat_country_data = flatten_country_data(country_data,
                                                 country_metadata)
        cleaned_data = clean_variable_names(flat_country_data)

        # Commit the data
        engine = get_mysql_engine("MYSQLDB", "mysqldb",
                                  self.db_config['database'])
        Base.metadata.create_all(engine)
        Session = sessionmaker(engine)
        session = Session()
        for row in cleaned_data:
            country = WorldbankCountry(**row)
            session.add(country)
        session.commit()
        session.close()
        self.output().touch()
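For orientation, here is a minimal hedged sketch of the data shapes this `combine` implies: each entry of `job_params` is assumed to carry an "outinfo" S3 key, and each batched object is assumed to decode to `{country: {variable: row}}` JSON (the bucket name and values below are made up):

# Hypothetical inputs, illustrating the structures the loop above iterates over.
job_params = [
    {"outinfo": "s3://example-bucket/worldbank/batch-0.json"},  # assumed key layout
    {"outinfo": "s3://example-bucket/worldbank/batch-1.json"},
]
# Each S3 object is assumed to decode to nested JSON like this:
_country_data = {
    "GBR": {"SP.POP.TOTL": {"year": 2020, "value": 67081000}},  # made-up row
}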
Example #2
    def combine(self, job_params):
        '''Combine the outputs from the batch jobs'''
        outdata = []
        for params in job_params:
            _body = s3.S3Target(params["outinfo"]).open("rb")
            _data = _body.read().decode('utf-8')
            outdata.append(json.loads(_data))

        with self.output().open("wb") as f:
            f.write(json.dumps(outdata).encode('utf-8'))
Example #3
 def get_input_length(self):
     """Retrieve the length of the input, which is stored as the value
     of the output.length file."""
     fname = self.derive_file_length_path()
     f = s3.S3Target(fname).open('rb')
     total = json.load(f)
     if type(total) is not int:
         raise TypeError('Expected to find integer count in '
                         f'{fname}. Instead found {type(total)}')
     f.close()
     return total
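A hedged sketch of what `derive_file_length_path` might return, assuming the convention visible in Example #4 where the row count is stored next to the main output under a `.length` suffix (the `s3_path_out` attribute is an assumption here, not taken from this snippet):

def derive_file_length_path(self):
    # Assumed convention (cf. Example #4): the count lives in a sibling
    # S3 object named after the main output with a ".length" suffix.
    return f"{self.s3_path_out}.length"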
Example #4
    def combine(self, job_params):
        """Combine output by concatenating results."""
        # Download and join
        logging.debug(f"{self.job_name}: Combining " f"{len(job_params)}...")
        size, outdata = self.combine_all_outputs(job_params)
        # Write the output
        logging.debug(f"{self.job_name}: Writing the output "
                      f"(length {len(outdata)})...")
        if self.combine_outputs:
            f = self.output().open("wb")
            f.write(json.dumps(outdata).encode('utf-8'))
            f.close()

        # Write the output length as well, for book-keeping
        f = s3.S3Target(f"{self.s3_path_out}.length").open("wb")
        f.write(str(size).encode("utf-8"))
        f.close()
Example #5
 def combine_all_outputs(self, job_params):
     size = 0
     outdata = []
     for params in job_params:
         _body = s3.S3Target(params["outinfo"]).open("rb")
         _data = _body.read().decode('utf-8')
         _outdata = json.loads(_data)
         # Combine if required
         if len(job_params) == 1:
             outdata = _outdata
         elif self.combine_outputs:
             outdata += _outdata
         # Get the length of the data
         if type(_outdata) is not list:
             _outdata = _outdata['data']['rows']
         size += len(_outdata)
     return size, outdata
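As a hedged illustration (the payloads are invented), the loop above tolerates two batch-output shapes: a plain JSON list, whose length is counted directly, and a dict whose rows sit under `['data']['rows']`:

# Hypothetical batch outputs, for illustration only.
plain_output = [{"id": 1}, {"id": 2}]             # counted directly with len()
nested_output = {"data": {"rows": [{"id": 3}]}}   # counted via ['data']['rows']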
Example #6
 def write_to_s3(self, data, ichunk):
     f = s3.S3Target(f"{self.s3_path_out}/data."
                     f"{ichunk}-{self.test}.json").open("wb")
     f.write(json.dumps(data).encode('utf-8'))
     f.close()
     return [], ichunk + 1
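Returning `[], ichunk + 1` suggests a flush-and-reset pattern; a minimal hedged sketch of a caller follows (the chunk size and the `rows` iterable are assumptions, not from the source):

# Hypothetical caller: buffer rows, flush each full chunk to S3, then reset.
data, ichunk = [], 0
for row in rows:                    # 'rows' is an assumed iterable of records
    data.append(row)
    if len(data) >= 10000:          # assumed chunk size
        data, ichunk = self.write_to_s3(data, ichunk)
if data:                            # flush any final partial chunk
    data, ichunk = self.write_to_s3(data, ichunk)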
Example #7
 def output(self):
     return s3.S3Target(f"{self.s3_path_out}/" f"data.{self.test}.length")
Example #8
    def run(self):
        # Load the input data (note the input contains the path
        # to the output)
        _filename = self.cherry_picked
        if _filename is None:
            _body = self.input().open("rb")
            _filename = _body.read().decode('utf-8')
        obj = s3.S3Target(f"{self.raw_s3_path_prefix}/"
                          f"{_filename}").open('rb')
        data = json.load(obj)

        # Get DB connections and settings
        database = 'dev' if self.test else 'production'
        engine = get_mysql_engine(self.db_conf_env, 'mysqldb', database)
        ArticleTopic.__table__.drop(engine)
        CorExTopic.__table__.drop(engine)

        # Insert the topic names data
        topics = [{
            'id': int(topic_name.split('_')[-1]) + 1,
            'terms': terms
        } for topic_name, terms in data['data']['topic_names'].items()]
        insert_data(self.db_conf_env,
                    'mysqldb',
                    database,
                    Base,
                    CorExTopic,
                    topics,
                    low_memory=True)
        logging.info(f'Inserted {len(topics)} topics')

        # Insert article topic weight data
        topic_articles = []
        done_ids = set()
        for row in data['data']['rows']:
            article_id = row.pop('id')
            if article_id in done_ids:
                continue
            done_ids.add(article_id)
            topic_articles += [{
                'topic_id': int(topic_name.split('_')[-1]) + 1,
                'topic_weight': weight,
                'article_id': article_id
            } for topic_name, weight in row.items()]
            # Flush
            if len(topic_articles) > self.insert_batch_size:
                insert_data(self.db_conf_env,
                            'mysqldb',
                            database,
                            Base,
                            ArticleTopic,
                            topic_articles,
                            low_memory=True)
                topic_articles = []

        # Final flush
        if len(topic_articles) > 0:
            insert_data(self.db_conf_env,
                        'mysqldb',
                        database,
                        Base,
                        ArticleTopic,
                        topic_articles,
                        low_memory=True)

        # Touch the output
        self.output().touch()
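The id arithmetic above implies topic names of the form `<prefix>_<index>` (for example `topic_0`), shifted to 1-based ids; a brief hedged illustration with made-up values:

# Hypothetical topic entry, illustrating the id convention used above.
topic_name, terms = "topic_0", ["health", "care", "nhs"]   # assumed values
row = {"id": int(topic_name.split("_")[-1]) + 1,           # -> 1 (1-based id)
       "terms": terms}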
Example #9
 def output(self):
     return s3.S3Target(f"{self.s3_path_prefix}/" f"test-{self.test}.best")
Example #10
 def output(self):
     """Points to the output"""
     if self.combine_outputs:
         return s3.S3Target(f"{self.s3_path_out}.json")
     return s3.S3Target(f"{self.s3_path_out}.length")
Example #11
 def output(self):
     '''Points to the output S3 target'''
     outname = (f"{S3PREFIX}/{self.db_table}/{self.db_database}/"
                f"{self.db_text_field}/{self.date}.json.zip")
     return s3.S3Target(outname)
Example #12
    def run(self):
        """Write data to ElasticSearch if required"""
        if not self.write_es:
            return

        # NB: cherry_picked is hard-coded here, so the `is None` branch below
        # never runs and the path read from self.input() is ignored.
        self.cherry_picked = (f'gtr/{self.date}/'.encode('utf-8') +
                              b'COREX_TOPIC_MODEL.n_hidden_140-0.'
                              b'VECTORIZER.binary_True.'
                              b'min_df_0-001.'
                              b'text_field_abstractText'
                              b'.NGRAM.TEST_False.json')
        if self.cherry_picked is None:
            # Read the topics data
            file_ptr = self.input().open("rb")
            path = file_ptr.read()
            file_ptr.close()
        else:
            path = self.cherry_picked

        file_io_topics = s3.S3Target(
            f's3://clio-data/{path.decode("utf-8")}').open("rb")

        topic_json = json.load(file_io_topics)
        file_io_topics.close()
        topic_lookup = topic_json['data']['topic_names']
        topic_json = {row['id']: row for row in topic_json['data']['rows']}

        # Read the raw data
        file_io_input = s3.S3Target(self.s3_path_in).open("rb")
        dirty_json = json.load(file_io_input)
        file_io_input.close()
        uid, cleaned_json, fields = clean(dirty_json, self.dataset)

        # Assign topics
        n_topics, n_found = 0, 0
        for row in cleaned_json:
            id_ = row[f'id_of_{self.dataset}']
            if id_ not in topic_json:
                continue
            topics = [
                k for k, v in topic_json[id_].items() if k != 'id' and v >= 0.2
            ]
            n_found += 1
            if len(topics) > 0:
                n_topics += 1
            row[f"terms_topics_{self.dataset}"] = topics
        logging.info(f'{n_found} documents processed from a possible '
                     f'{len(cleaned_json)}, of which '
                     f'{n_topics} have been assigned topics.')
        fields.add(f"terms_topics_{self.dataset}")
        fields.add("terms_of_countryTags")
        fields.add("type_of_entity")

        # Prepare connection to ES
        prod_label = '' if self.production else '_dev'
        es_config = get_config('elasticsearch.config', 'clio')
        es_config['index'] = f"clio_{self.dataset}{prod_label}"
        aws_auth_region = es_config.pop('region')
        es = ElasticsearchPlus(hosts=es_config['host'],
                               port=int(es_config['port']),
                               use_ssl=True,
                               entity_type=self.dataset,
                               aws_auth_region=aws_auth_region,
                               country_detection=True,
                               caps_to_camel_case=True)

        # Dynamically generate the mapping based on a template
        with open("clio_mapping.json") as f:
            mapping = json.load(f)
        for f in fields:
            kwargs = {}
            _type = "text"
            if f.startswith("terms"):
                kwargs = {
                    "fields": {
                        "keyword": {
                            "type": "keyword"
                        }
                    },
                    "analyzer": "terms_analyzer"
                }
            elif not f.startswith("textBody"):
                _type = "keyword"
            mapping["mappings"]["_doc"]["properties"][f] = dict(type=_type,
                                                                **kwargs)

        # Drop, create and send data
        if es.indices.exists(index=es_config['index']):
            es.indices.delete(index=es_config['index'])
        es.indices.create(index=es_config['index'], body=mapping)
        for id_, row in zip(uid, cleaned_json):
            es.index(index=es_config['index'],
                     doc_type=es_config['type'],
                     id=id_,
                     body=row)

        # Drop, create and send data
        es = ElasticsearchPlus(hosts=es_config['host'],
                               port=int(es_config['port']),
                               use_ssl=True,
                               entity_type='topics',
                               aws_auth_region=aws_auth_region,
                               country_detection=False,
                               caps_to_camel_case=False)
        topic_idx = f"{es_config['index']}_topics"
        if es.indices.exists(index=topic_idx):
            es.indices.delete(index=topic_idx)
        es.indices.create(index=topic_idx)
        es.index(index=topic_idx,
                 doc_type=es_config['type'],
                 id='topics',
                 body=topic_lookup)

        # Touch the checkpoint
        self.output().touch()
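A hedged sketch of the minimal shape `clio_mapping.json` would need for the field loop above to work; only the `mappings._doc.properties` path is implied by the code, and the analyzer settings shown are assumptions:

# Hypothetical minimal template for clio_mapping.json, expressed as a dict.
mapping_template = {
    "settings": {
        "analysis": {"analyzer": {"terms_analyzer": {"type": "standard"}}}  # assumed
    },
    "mappings": {"_doc": {"properties": {}}},  # the loop fills this in per field
}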
Example #13
 def output(self):
     '''Points to the S3 Target'''
     return s3.S3Target(S3PREFIX + "final_output_%s.json" % self.date)
Example #14
 def output(self):
     '''Points to the S3 Target'''
     return s3.S3Target(S3PREFIX + 'input.json')
Example #15
 def output(self):
     '''Points to the S3 Target'''
     return s3.S3Target(f"{S3PREFIX}/meetup-topics-{self.routine_id}.json")
Example #16
 def output(self):
     return s3.S3Target(self.s3_path)
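For context, a hedged sketch of how `output()` and `run()` typically pair up in a luigi-style task like the ones excerpted above; the class name and parameter are assumptions, and older luigi versions exposed `luigi.s3` rather than `luigi.contrib.s3`:

import json
import luigi
from luigi.contrib import s3   # assumed import path

class ExampleTask(luigi.Task):                  # hypothetical task, not from the repo
    s3_path_out = luigi.Parameter()

    def output(self):
        # Same pattern as Examples #10-#16: a single S3 target marks completion.
        return s3.S3Target(f"{self.s3_path_out}.json")

    def run(self):
        # Writing the target is what tells the luigi scheduler the task is done.
        with self.output().open("w") as f:
            f.write(json.dumps({"status": "done"}))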