def test_embedding_trainer_doc2vec_s3(self):
        client = boto3.client('s3')
        client.create_bucket(Bucket='fake-open-skills',
                             ACL='public-read-write')
        s3_path = f"s3://fake-open-skills/model_cache/embedding"
        s3_storage = S3Store(path=s3_path)

        document_schema_fields = [
            'description', 'experienceRequirements', 'qualifications', 'skills'
        ]
        job_postings_generator = JobPostingCollectionSample(num_records=30)
        corpus_generator = Doc2VecGensimCorpusCreator(
            job_postings_generator,
            document_schema_fields=document_schema_fields)
        d2v = Doc2VecModel(storage=s3_storage,
                           size=10,
                           min_count=3,
                           iter=4,
                           window=6,
                           workers=3)

        trainer = EmbeddingTrainer(corpus_generator, d2v)
        trainer.train(lookup=True)
        trainer.save_model()

        vocab_size = len(d2v.wv.vocab.keys())
        s3 = s3fs.S3FileSystem()
        files = [f.split('/')[-1] for f in s3.ls(s3_path)]
        assert d2v.model_name == trainer.model_name
        assert set(files) == set([trainer.model_name])
        self.assertDictEqual(trainer.lookup_dict, d2v.lookup_dict)

        # Save as different name
        d2v.save('other_name.model')

        s3 = s3fs.S3FileSystem()
        files = [f.split('/')[-1] for f in s3.ls(s3_path)]
        assert set(files) == set([trainer.model_name, 'other_name.model'])

        # Load
        d2v_loaded = Doc2VecModel.load(s3_storage, trainer.model_name)
        assert d2v_loaded.metadata['embedding_model']['hyperparameters'][
            'vector_size'] == trainer.metadata['embedding_model'][
                'hyperparameters']['vector_size']
        # Change the store directory
        new_s3_path = "s3://fake-open-skills/model_cache/embedding/other_directory"
        trainer.save_model(S3Store(new_s3_path))
        s3 = s3fs.S3FileSystem()
        files = [f.split('/')[-1] for f in s3.ls(new_s3_path)]
        assert set(files) == set([trainer.model_name])
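All of the S3 interaction in these tests assumes the boto3 calls are intercepted by a mock rather than real AWS; below is a minimal sketch of the kind of fixture they presumably run under, using moto (the mock_s3 decorator shown is the moto 4.x name; moto 5 replaced it with mock_aws). The class and bucket names are illustrative assumptions, not part of the test suite.

# Illustrative only: FakeS3TestCase is a hypothetical fixture, not from the library.
import unittest

import boto3
from moto import mock_s3  # moto >= 5 exposes mock_aws instead


@mock_s3
class FakeS3TestCase(unittest.TestCase):
    def setUp(self):
        # Inside the mock, create_bucket never touches real AWS.
        self.client = boto3.client('s3')
        self.client.create_bucket(Bucket='fake-open-skills')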
Example #2
def test_geocode_cacher():
    with patch('time.sleep') as time_mock:
        with open('tests/sample_geocode_result.json') as f:
            sample_geocode_result = json.load(f)
        client = boto3.resource('s3')
        client.create_bucket(Bucket='geobucket')
        cache_storage = S3Store('geobucket')
        cache_fname = 'cbsas.json'
        geocode_result = namedtuple('GeocodeResult', ['json'])
        geocode_func = MagicMock(return_value=geocode_result(
            json=sample_geocode_result))
        geocoder = CachedGeocoder(cache_storage=cache_storage,
                                  cache_fname=cache_fname,
                                  geocode_func=geocode_func,
                                  sleep_time=1)
        geocoder.geocode('Canarsie, NY')
        geocoder.geocode('Poughkeepsie, NY')
        geocoder.geocode('Canarsie, NY')
        geocoder.save()
        assert geocode_func.call_count == 2
        assert geocode_func.call_args_list == [
            call('Canarsie, NY'),
            call('Poughkeepsie, NY')
        ]
        assert time_mock.call_count == 2

        new_geocoder = CachedGeocoder(cache_storage=cache_storage,
                                      cache_fname=cache_fname,
                                      geocode_func=geocode_func,
                                      sleep_time=1)
        assert new_geocoder.all_cached_geocodes == {
            'Canarsie, NY': sample_geocode_result,
            'Poughkeepsie, NY': sample_geocode_result,
        }
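The behaviour being pinned down here is call-once-per-key caching backed by a persisted cache; a stdlib-only sketch of that pattern follows (an illustrative stand-in, not the library's CachedGeocoder, and the local file storage is an assumption). A second instance built from the same cache file starts with the previously saved results, which is what the assertion on all_cached_geocodes checks.

import json


class TinyCachedLookup:
    """Illustrative memoizing wrapper: call `func` once per key, keep results in a dict."""

    def __init__(self, func, cache_path='cache.json'):
        self.func = func
        self.cache_path = cache_path
        try:
            with open(cache_path) as f:
                self.cache = json.load(f)
        except FileNotFoundError:
            self.cache = {}

    def lookup(self, key):
        # Only call the underlying function for keys we have not seen before.
        if key not in self.cache:
            self.cache[key] = self.func(key)
        return self.cache[key]

    def save(self):
        with open(self.cache_path, 'w') as f:
            json.dump(self.cache, f)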
Example #3
    def test_knn_doc2vec_cls_s3(self):
        client = boto3.client('s3')
        client.create_bucket(Bucket='fake-open-skills',
                             ACL='public-read-write')
        s3_path = f"s3://fake-open-skills/model_cache/soc_classifiers"
        s3_storage = S3Store(path=s3_path)
        model_storage = ModelStorage(s3_storage)
        corpus_generator = FakeCorpusGenerator()

        # Embedding has no lookup_dict
        d2v = Doc2VecModel(size=10,
                           min_count=1,
                           dm=0,
                           alpha=0.025,
                           min_alpha=0.025)
        trainer = EmbeddingTrainer(d2v, model_storage=model_storage)
        trainer.train(corpus_generator, lookup=False)

        self.assertRaises(ValueError,
                          lambda: KNNDoc2VecClassifier(embedding_model=d2v))

        d2v = Doc2VecModel(size=10,
                           min_count=1,
                           dm=0,
                           alpha=0.025,
                           min_alpha=0.025)
        trainer = EmbeddingTrainer(d2v, model_storage=model_storage)
        trainer.train(corpus_generator, lookup=True)

        # KNNDoc2VecClassifier only supports doc2vec for now
        self.assertRaises(NotImplementedError,
                          lambda: KNNDoc2VecClassifier(Word2VecModel()))

        # `docs` is a comma-separated string of sample documents defined at module level in the original test file
        doc = docs.split(',')[0].split()

        knn = KNNDoc2VecClassifier(embedding_model=d2v, k=0)
        self.assertRaises(ValueError, lambda: knn.predict_soc([doc]))

        knn = KNNDoc2VecClassifier(embedding_model=d2v, k=10)
        soc_cls = SocClassifier(knn)

        assert knn.predict_soc([doc])[0][0] == soc_cls.predict_soc([doc])[0][0]

        # Build Annoy index
        knn.build_ann_indexer(num_trees=5)
        assert isinstance(knn.indexer, AnnoyIndexer)

        # Save
        s3 = s3fs.S3FileSystem()
        model_storage.save_model(knn, knn.model_name)
        files = [f.split('/')[-1] for f in s3.ls(s3_path)]
        assert set(files) == set([knn.model_name])

        # Load
        new_knn = model_storage.load_model(knn.model_name)
        assert new_knn.model_name == knn.model_name
        assert new_knn.predict_soc([doc])[0][0] == '29-2061.00'

        # The Annoy index is not serialized, so it has to be rebuilt whenever the KNN model is loaded into memory
        assert new_knn.indexer is None
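Conceptually, the classifier infers an embedding for the query document and votes among the k nearest labelled training vectors; here is a small numpy sketch of that idea (not the library's KNNDoc2VecClassifier, and without the optional Annoy index).

from collections import Counter

import numpy as np


def knn_predict(query_vec, train_vecs, train_labels, k=10):
    """Cosine-similarity k-NN vote over precomputed document vectors (illustrative)."""
    train_vecs = np.asarray(train_vecs, dtype=float)
    query_vec = np.asarray(query_vec, dtype=float)
    # Cosine similarity of the query against every training vector.
    sims = train_vecs @ query_vec / (
        np.linalg.norm(train_vecs, axis=1) * np.linalg.norm(query_vec) + 1e-12)
    top_k = np.argsort(-sims)[:k]
    # Majority vote among the k most similar labelled documents.
    votes = Counter(train_labels[i] for i in top_k)
    return votes.most_common(1)[0][0]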
Example #4
    def test_with_iterable_pipeline(self):
        import boto3
        client = boto3.client('s3')
        client.create_bucket(Bucket='fake-open-skills',
                             ACL='public-read-write')
        s3 = S3Store('fake-open-skills/models')
        model_storage = ModelStorage(storage=s3)
        fake = FakeModel('fake')

        model_storage.save_model(fake, fake.model_name)
        vectorize_for_pipeline = partial(nlp.vectorize,
                                         embedding_model=SerializedByStorage(
                                             storage=s3,
                                             model_name=fake.model_name,
                                             model=fake))
        pipe = IterablePipeline(vectorize_for_pipeline)

        pipe_unpickled = pickle.loads(pickle.dumps(pipe))
        # make sure the fake model itself wasn't pickled, only the storage reference
        assert pipe_unpickled.functions[-1].keywords[
            'embedding_model']._model is None
        assert pipe_unpickled.functions[-1].keywords[
            'embedding_model'].storage.path == s3.path
        # The model will be loaded when it's needed
        assert list(pipe_unpickled([1])) == [[1, 2, 3, 4]]
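What this test relies on is that pickling keeps only a reference (storage plus model name) and the heavy model is re-loaded lazily on first use; below is a stdlib sketch of that pattern under the assumption of a local file path instead of S3 (illustrative, not the library's SerializedByStorage).

import pickle


class LazyByPath:
    """Pickles a path instead of the wrapped object; reloads it on first delegated access."""

    def __init__(self, obj, path):
        self._obj = obj
        self._path = path

    def __getstate__(self):
        # Drop the heavy object; keep only the reference to where it lives.
        return {'_path': self._path, '_obj': None}

    def __getattr__(self, name):
        # Reached only when normal lookup fails; refuse dunder/underscore probes
        # (e.g. pickle's __setstate__ check) instead of trying to load the model.
        if name.startswith('_') or '_path' not in self.__dict__:
            raise AttributeError(name)
        if self.__dict__.get('_obj') is None:
            with open(self.__dict__['_path'], 'rb') as f:
                self.__dict__['_obj'] = pickle.load(f)
        return getattr(self.__dict__['_obj'], name)


# Usage sketch (assumes model.pkl was written earlier with pickle.dump):
#   wrapper = LazyByPath(trained_model, 'model.pkl')
#   restored = pickle.loads(pickle.dumps(wrapper))   # tiny payload, no model inside
#   restored.predict(...)                            # first use triggers the reload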
Example #5
    def test_s3store(self):
        import boto3
        client = boto3.client('s3')
        client.create_bucket(Bucket='fake-open-skills',
                             ACL='public-read-write')
        s3 = s3fs.S3FileSystem()

        storage = S3Store(path=f"s3://fake-open-skills/model_cache")
        assert not s3.exists(storage.path) == True

        model = FakeModel('val')
        model_pickled = pickle.dumps(model)
        storage.write(model_pickled, 'for_testing.model')

        assert storage.exists("for_testing.model")

        model_loaded = storage.load('for_testing.model')
        model_loaded = pickle.loads(model_loaded)
        assert model_loaded.val == 'val'

        fake_lookup = {'1': 1, '2': 2, '3': 3}
        fake_lookup_bytes = json.dumps(fake_lookup).encode()
        storage.write(fake_lookup_bytes, 'for_testing.json')
        assert storage.exists("for_testing.json")

        fake_lookup_loaded = json.loads(
            storage.load('for_testing.json').decode())
        assert fake_lookup == fake_lookup_loaded

        storage.delete('for_testing.model')
        assert not storage.exists("for_testing.model")
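For comparison, the write/load/exists/delete surface exercised above can be mirrored by a tiny local-filesystem store; a hedged stdlib sketch follows (not the library's own filesystem backend, just the same four byte-oriented operations).

import os


class LocalStore:
    """Minimal byte-oriented store with the same four operations as the S3-backed one."""

    def __init__(self, path):
        self.path = path
        os.makedirs(path, exist_ok=True)

    def write(self, data: bytes, fname: str):
        with open(os.path.join(self.path, fname), 'wb') as f:
            f.write(data)

    def load(self, fname: str) -> bytes:
        with open(os.path.join(self.path, fname), 'rb') as f:
            return f.read()

    def exists(self, fname: str) -> bool:
        return os.path.exists(os.path.join(self.path, fname))

    def delete(self, fname: str):
        os.remove(os.path.join(self.path, fname))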
Example #6
    def test_with_grid_search(self):
        import boto3
        client = boto3.client('s3')
        client.create_bucket(Bucket='fake-open-skills',
                             ACL='public-read-write')
        s3 = S3Store('fake-open-skills')
        model_storage = ModelStorage(s3)

        from sklearn.ensemble import RandomForestClassifier
        from sklearn.model_selection import GridSearchCV

        gs = GridSearchCV(RandomForestClassifier(), {})
        proxy_gs = ProxyObjectWithStorage(model_obj=gs,
                                          storage=s3,
                                          model_name='rf.grid')

        X = np.random.rand(20, 2)
        y = np.random.randint(2, size=20)

        proxy_gs.fit(X, y)
        model_storage.save_model(proxy_gs, 'rf.grid')

        loaded_proxy_gs = model_storage.load_model('rf.grid')

        assert loaded_proxy_gs.storage.path == s3.path
        assert proxy_gs.predict([[5, 6]]) == gs.predict([[5, 6]])
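The proxy used above behaves like the wrapped estimator (fit, predict, and so on) while also carrying its storage location; here is a minimal illustration of that delegation pattern via __getattr__ (the class name is illustrative, not the library's ProxyObjectWithStorage).

class StorageAwareProxy:
    """Delegates unknown attributes (fit, predict, ...) to the wrapped object."""

    def __init__(self, model_obj, storage, model_name):
        self.model_obj = model_obj
        self.storage = storage
        self.model_name = model_name

    def __getattr__(self, name):
        # Reached only when the attribute is not found on the proxy itself;
        # dunder probes are refused so introspection does not loop into the wrapped object.
        if name.startswith('__'):
            raise AttributeError(name)
        return getattr(self.__dict__['model_obj'], name)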
def test_aggregate_properties():
    client = boto3.resource('s3')
    client.create_bucket(Bucket='test-bucket')
    s3_storage = S3Store('s3://test-bucket/aggregations')
    aggregate_properties(out_filename='2015',
                         grouping_properties=[
                             FakeGroupingPropertyOne(),
                             FakeGroupingPropertyTwo()
                         ],
                         aggregate_properties=[
                             FakeAggregationPropertyOne(),
                             FakeAggregationPropertyTwo()
                         ],
                         aggregate_functions={
                             'aggregation_property_two': [numpy.sum],
                             'aggregation_property_one':
                             [partial(listy_n_most_common, 2)]
                         },
                         storage=s3_storage,
                         aggregation_name='fake_agg')
    s3 = s3fs.S3FileSystem()
    with s3.open('s3://test-bucket/aggregations/fake_agg/2015.csv', 'r') as f:
        reader = csv.reader(f)
        num_rows = len([row for row in reader])
        assert num_rows == 5
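The aggregation groups postings by the grouping properties and applies one or more functions per aggregate column; a pandas-free sketch of that group-and-aggregate shape follows (purely illustrative, not the library's aggregate_properties).

from collections import defaultdict


def aggregate(rows, group_keys, agg_funcs):
    """rows: list of dicts; group_keys: columns to group by;
    agg_funcs: {column: function applied to that column's grouped values}."""
    groups = defaultdict(list)
    for row in rows:
        groups[tuple(row[k] for k in group_keys)].append(row)
    out = []
    for key, members in groups.items():
        record = dict(zip(group_keys, key))
        for col, func in agg_funcs.items():
            record[col] = func([m[col] for m in members])
        out.append(record)
    return out


# e.g. aggregate(rows, ['grouping_property_one'], {'aggregation_property_two': sum})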
Example #8
    def setUp(self):
        client = boto3.resource('s3')
        bucket = client.create_bucket(Bucket='test-bucket')
        storage = S3Store('s3://test-bucket/computed_properties')
        description = 'This is my description'

        class MockClassifier(object):
            def predict_soc(self, document):
                assert document.strip() == description.lower()
                return '11-1234.00'

            @property
            def name(self):
                return "MockClassifier"

            @property
            def description(self):
                return "fake algorithm"

        self.computed_property = SOCClassifyProperty(
            storage=storage,
            classifier_obj=MockClassifier(),
        )
        self.job_postings = [
            utils.job_posting_factory(datePosted=self.datestring,
                                      description=description,
                                      skills='',
                                      qualifications='',
                                      experienceRequirements='')
        ]
        self.computed_property.compute_on_collection(self.job_postings)
Example #9
 def setUp(self):
     self.client = boto3.resource('s3')
     self.client.create_bucket(Bucket='test-bucket')
     self.storage = S3Store('s3://test-bucket/computed_properties')
     self.computed_property = PostingIdPresent(self.storage)
     self.job_postings = [
         utils.job_posting_factory(datePosted=self.datestring)
     ]
     self.computed_property.compute_on_collection(self.job_postings)
Example #10
 def setUp(self):
     self.client = boto3.resource('s3')
     self.client.create_bucket(Bucket='test-bucket')
     self.storage = S3Store('s3://test-bucket/computed_properties')
     self.computed_property = TitleCleanPhaseOne(self.storage)
     self.job_postings = [
         utils.job_posting_factory(datePosted=self.datestring,
                                   title='Software Engineer - Tulsa')
     ]
     self.computed_property.compute_on_collection(self.job_postings)
Example #11
    def setUp(self):
        client = boto3.resource('s3')
        bucket = client.create_bucket(Bucket='test-bucket')
        storage = S3Store('s3://test-bucket/computed_properties')
        cache_storage = S3Store('s3://test-bucket')

        class SampleJobGeoQuerier(JobGeographyQuerier):
            name = 'blah'
            output_columns = (('city', 'the city'), )

            def _query(self, job_posting):
                return ['Fargo']

        self.computed_property = Geography(
            geo_querier=SampleJobGeoQuerier(),
            storage=storage,
        )
        self.job_postings = [
            utils.job_posting_factory(datePosted=self.datestring)
        ]
        self.computed_property.compute_on_collection(self.job_postings)
Example #12
    def test_s3store(self):
        import boto3
        client = boto3.client('s3')
        client.create_bucket(Bucket='fake-open-skills', ACL='public-read-write')
        storage = S3Store(path=f"s3://fake-open-skills/apath")

        # 1. Ensure that a new file is correctly created and saved to storage
        storage_one = PersistedJSONDict(storage, 'test.json')
        storage_one['key1'] = 'value1'
        storage_one['key2'] = {'nestedkey2': 'value2'}
        storage_one.save()
        loaded = json.loads(storage.load('test.json').decode())
        assert loaded == {'key1': 'value1', 'key2': {'nestedkey2': 'value2'}}

        # 2. Ensure that an existing file is correctly read, updated, and saved back
        storage_two = PersistedJSONDict(storage, 'test.json')
        assert 'key1' in storage_two
        assert storage_two['key1'] == 'value1'
        storage_two['key3'] = 'value3'
        storage_two.save()
        loaded = json.loads(storage.load('test.json').decode())
        assert loaded == {'key1': 'value1', 'key2': {'nestedkey2': 'value2'}, 'key3': 'value3'}

        # 3. Ensure that, in the same thread, updating and saving an older handle picks up the new changes too
        storage_one['key4'] = 'value4'
        storage_one.save()
        loaded = json.loads(storage.load('test.json').decode())
        assert loaded == {'key1': 'value1', 'key2': {'nestedkey2': 'value2'}, 'key3': 'value3', 'key4': 'value4'}

        # 4. test autosave - this will be the fourth update of this object
        storage_one.SAVE_EVERY_N_UPDATES = 4
        storage_one['key5'] = 'value5'
        loaded = json.loads(storage.load('test.json').decode())
        assert loaded == {'key1': 'value1', 'key2': {'nestedkey2': 'value2'}, 'key3': 'value3', 'key4': 'value4', 'key5': 'value5'}

        # 5. test length checking
        assert len(storage_one) == 5

        # 6.  test iteration
        assert sorted(
            [(key, value) for key, value in storage_one.items()],
            key=lambda x: x[0]
        ) == [
            ('key1', 'value1'),
            ('key2', {'nestedkey2': 'value2'}),
            ('key3', 'value3'),
            ('key4', 'value4'),
            ('key5', 'value5')
        ]
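A compact sketch of the behaviour this test pins down: a dict that merges with whatever is already stored when saved, and flushes itself automatically every SAVE_EVERY_N_UPDATES mutations (local-file version, illustrative only, not the library's PersistedJSONDict).

import json
import os


class TinyPersistedDict(dict):
    SAVE_EVERY_N_UPDATES = 1000

    def __init__(self, path):
        self.path = path
        self._updates = 0
        if os.path.exists(path):
            with open(path) as f:
                super().__init__(json.load(f))

    def __setitem__(self, key, value):
        super().__setitem__(key, value)
        self._updates += 1
        # Autosave after every N updates, matching the behaviour exercised in step 4 above.
        if self._updates % self.SAVE_EVERY_N_UPDATES == 0:
            self.save()

    def save(self):
        # Merge with anything another handle saved in the meantime, then write back.
        on_disk = {}
        if os.path.exists(self.path):
            with open(self.path) as f:
                on_disk = json.load(f)
        on_disk.update(self)
        with open(self.path, 'w') as f:
            json.dump(on_disk, f)
        super().update(on_disk)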
Example #13
 def setUp(self):
     client = boto3.resource('s3')
     bucket = client.create_bucket(Bucket='test-bucket')
     storage = S3Store('s3://test-bucket/computed_properties')
     skill_extractor = ExactMatchSkillExtractor(utils.sample_framework())
     self.computed_property = SkillCounts(
         skill_extractor=skill_extractor,
         storage=storage,
     )
     self.job_postings = [
         utils.job_posting_factory(datePosted=self.datestring,
                                   description='reading comprehension')
     ]
     self.computed_property.compute_on_collection(self.job_postings)
Example #14
def test_cbsa_finder_nohits():
    client = boto3.resource('s3')
    client.create_bucket(Bucket='geobucket')
    shapefile_name = 'tests/sample_cbsa_shapefile.shp'
    cache_storage = S3Store('geobucket')
    cache_fname = 'cbsas.json'
    finder = CachedCBSAFinder(cache_storage=cache_storage,
                              cache_fname=cache_fname,
                              shapefile_name=shapefile_name)
    sample_input = {
        "bbox": {
            "northeast": [65.2, 65.8],
            "southwest": [65.2, 65.8]
        },
    }
    assert finder.query(sample_input) is None
Example #15
 def setUp(self):
     self.client = boto3.resource('s3')
     self.client.create_bucket(Bucket='test-bucket')
     self.storage = S3Store('s3://test-bucket/computed_properties')
     self.computed_property = TitleCleanPhaseTwo(self.storage)
     self.job_postings = [
         utils.job_posting_factory(datePosted=self.datestring,
                                   title='Software Engineer Tulsa')
     ]
     with patch(
             'skills_ml.algorithms.jobtitle_cleaner.clean.negative_positive_dict',
             return_value={
                 'places': ['tulsa'],
                 'states': [],
                 'onetjobs': ['software engineer']
             }):
         self.computed_property.compute_on_collection(self.job_postings)
Example #16
    def test_with_iterable_pipeline(self):
        import boto3
        client = boto3.client('s3')
        client.create_bucket(Bucket='fake-open-skills', ACL='public-read-write')
        s3 = S3Store('fake-open-skills')
        model_storage = ModelStorage(s3)

        proxy_fake = ProxyObjectWithStorage(model_obj=FakeModel('fake'), storage=s3, model_name='fake')
        model_storage.save_model(proxy_fake, proxy_fake.model_name)

        vectorize_for_pipeline = partial(nlp.vectorize, embedding_model=SerializedByStorage(model=proxy_fake, model_name=proxy_fake.model_name))
        pipe = IterablePipeline(vectorize_for_pipeline)

        s3.write(pickle.dumps(pipe), 'fake.pipe')
        pipe_unpickled = pickle.loads(s3.load('fake.pipe'))

        assert list(pipe_unpickled([1])) == [[1, 2, 3, 4]]
Example #17
def test_cbsa_finder_twohits():
    client = boto3.resource('s3')
    client.create_bucket(Bucket='geobucket')
    shapefile_name = 'tests/sample_cbsa_shapefile.shp'
    cache_storage = S3Store('geobucket')
    cache_fname = 'cbsas.json'
    finder = CachedCBSAFinder(cache_storage=cache_storage,
                              cache_fname=cache_fname,
                              shapefile_name=shapefile_name)
    sample_input = {
        "bbox": {
            "northeast": [38.00, -81.05],
            "southwest": [35.13, -88.18]
        },
    }
    assert finder.query(sample_input) == (
        '40080',
        'Richmond-Berea, KY Micro Area',
    )
Example #18
    def test_save_load(self):
        import boto3
        client = boto3.client('s3')
        client.create_bucket(Bucket='fake-open-skills', ACL='public-read-write')
        s3 = S3Store('fake-open-skills')
        model_storage = ModelStorage(storage=s3)
        fake = FakeModel('fake')

        model_storage.save_model(fake, fake.model_name)
        proxy_fake = ProxyObjectWithStorage(model_obj=fake, storage=s3, model_name=fake.model_name)

        assert proxy_fake.storage == s3

        proxy_fake_unpickled = pickle.loads(pickle.dumps(proxy_fake))
        assert proxy_fake_unpickled.val == proxy_fake.val

        model_storage.save_model(proxy_fake, 'proxy_' + proxy_fake.model_name)
        proxy_fake_loaded = model_storage.load_model('proxy_' + proxy_fake.model_name)

        assert proxy_fake_loaded.val == proxy_fake.val == fake.val
Example #19
def test_cbsa_finder_empty_cache():
    client = boto3.resource('s3')
    geobucket = client.create_bucket(Bucket='geobucket')
    cache_storage = S3Store('geobucket')
    cache_fname = 'cbsas.json'
    cbsa_finder = CachedCBSAFinder(
        cache_storage=cache_storage,
        cache_fname=cache_fname,
        shapefile_name='tests/sample_cbsa_shapefile.shp')
    # set the cache to something that JSON loads as None, not empty dict
    geobucket.put_object(Body='', Key='cbsas.json')
    geocode_results = {
        'East of Charlotte, NC': {
            "bbox": {
                "northeast": [35.2268961, -80.8461711],
                "southwest": [35.2267961, -80.8462711]
            },
        },
        'Flushing, NY': {
            "bbox": {
                "northeast": [40.7654801, -73.8173791],
                "southwest": [40.7653801, -73.8174791]
            },
        }
    }

    cbsa_finder.find_all_cbsas_and_save(geocode_results)

    new_finder = CachedCBSAFinder(
        cache_storage=cache_storage,
        cache_fname=cache_fname,
        shapefile_name='tests/sample_cbsa_shapefile.shp')
    assert new_finder.all_cached_cbsa_results == {
        'East of Charlotte, NC': [
            '16740',
            'Charlotte-Concord-Gastonia, NC-SC Metro Area',
        ],
        'Flushing, NY':
        None
    }
Example #20
def test_geocode_search_strings():
    with open('tests/sample_geocode_result.json') as f:
        sample_geocode_result = json.load(f)
    client = boto3.resource('s3')
    client.create_bucket(Bucket='geobucket')
    cache_storage = S3Store('geobucket')
    cache_fname = 'cbsas.json'
    geocode_result = namedtuple('GeocodeResult', ['json'])
    geocode_func = MagicMock(return_value=geocode_result(
        json=sample_geocode_result))
    geocoder = CachedGeocoder(cache_storage=cache_storage,
                              cache_fname=cache_fname,
                              geocode_func=geocode_func,
                              sleep_time=0)
    geocoder.geocode_search_strings_and_save(['string1', 'string2'])

    new_geocoder = CachedGeocoder(
        cache_storage=cache_storage,
        cache_fname=cache_fname,
    )
    assert next(iter(new_geocoder.all_cached_geocodes.values()))\
        == sample_geocode_result
Example #21
 def setUp(self):
     self.client = boto3.resource('s3')
     self.client.create_bucket(Bucket='test-bucket')
     self.storage = S3Store('s3://test-bucket/computed_properties')
     self.computed_property = YearlyPay(self.storage)
     self.job_postings = [
         utils.job_posting_factory(id=5,
                                   datePosted=self.datestring,
                                   baseSalary={
                                       'salaryFrequency': 'yearly',
                                       'minValue': 5,
                                       'maxValue': ''
                                   }),
         utils.job_posting_factory(id=6,
                                   datePosted=self.datestring,
                                   baseSalary={
                                       'salaryFrequency': 'yearly',
                                       'minValue': '6.25',
                                       'maxValue': '9.25'
                                   })
     ]
     self.computed_property.compute_on_collection(self.job_postings)
    def test_embedding_trainer_multicore_s3(self):
        client = boto3.client('s3')
        client.create_bucket(Bucket='fake-open-skills',
                             ACL='public-read-write')
        s3_path = f"s3://fake-open-skills/model_cache/embedding"
        s3_storage = S3Store(path=s3_path)
        model_storage = ModelStorage(s3_storage)

        document_schema_fields = [
            'description', 'experienceRequirements', 'qualifications', 'skills'
        ]
        job_postings_generator = JobPostingCollectionSample()
        corpus_generator = Word2VecGensimCorpusCreator(
            job_postings_generator,
            document_schema_fields=document_schema_fields)
        trainer = EmbeddingTrainer(FastTextModel(size=10,
                                                 min_count=3,
                                                 iter=4,
                                                 window=6,
                                                 workers=3),
                                   FastTextModel(size=10,
                                                 min_count=3,
                                                 iter=4,
                                                 window=10,
                                                 workers=3),
                                   Word2VecModel(size=10, workers=3, window=6),
                                   Word2VecModel(size=10,
                                                 min_count=10,
                                                 window=10,
                                                 workers=3),
                                   model_storage=model_storage)
        trainer.train(corpus_generator)
        trainer.save_model()

        s3 = s3fs.S3FileSystem()
        files = [f.split('/')[-1] for f in s3.ls(s3_path)]
        assert set(files) == set(
            [model.model_name for model in trainer._models])
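Training several gensim models from one corpus_generator only works if the corpus can be iterated more than once; a common way to get that is a small class whose __iter__ rebuilds the stream each time, sketched here (the factory callable and the names in the usage comment are assumptions, not library code).

class ReiterableCorpus:
    """Wraps a zero-argument factory so each model (or each training pass) gets a fresh iterator."""

    def __init__(self, make_documents):
        self.make_documents = make_documents  # callable returning an iterable of token lists

    def __iter__(self):
        return iter(self.make_documents())


# e.g. corpus = ReiterableCorpus(lambda: (posting['description'].split()
#                                         for posting in load_postings()))  # load_postings is hypothetical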
Example #23
    def test_pickle_s3(self):
        import boto3
        client = boto3.client('s3')
        client.create_bucket(Bucket='fake-open-skills', ACL='public-read-write')
        s3 = S3Store('fake-open-skills/models')
        model_storage = ModelStorage(storage=s3)
        fake = FakeModel('fake')
        model_storage.save_model(fake, fake.model_name)

        s_fake = SerializedByStorage(fake, s3, fake.model_name)
        s3.write(pickle.dumps(s_fake), 'fake.pickle')
        fake_unpickled = pickle.loads(s3.load('fake.pickle'))
        # make sure the fake model itself wasn't pickled, only the storage reference
        assert fake_unpickled._model is None
        assert fake_unpickled.storage.path == s3.path
        assert fake_unpickled.val == fake.val

        # if the object being pickled has no storage attribute and no storage was provided
        # to SerializedByStorage, it is serialized normally
        s_fake = SerializedByStorage(model=fake, model_name=fake.model_name)
        s3.write(pickle.dumps(s_fake), 'fake.pickle')
        fake_unpickled = pickle.loads(s3.load('fake.pickle'))
        assert fake_unpickled._model is not None
Example #24
def test_cbsa_finder_onehit():
    client = boto3.resource('s3')
    client.create_bucket(Bucket='geobucket')
    shapefile_name = 'tests/sample_cbsa_shapefile.shp'
    cache_storage = S3Store('geobucket')
    cache_fname = 'cbsas.json'
    finder = CachedCBSAFinder(cache_storage=cache_storage,
                              cache_fname=cache_fname,
                              shapefile_name=shapefile_name)
    sample_input = {
        "lng": -80.8462211,
        "ok": True,
        "location": "East of Charlotte, NC",
        "provider": "osm",
        "country": "United States of America",
        "bbox": {
            "northeast": [35.2268961, -80.8461711],
            "southwest": [35.2267961, -80.8462711]
        },
        "importance": 0.325,
        "quality": "postcode",
        "accuracy": 0.325,
        "address": "NC 28202, United States of America",
        "confidence": 10,
        "lat": 35.2268461,
        "type": "postcode",
        "place_rank": "25",
        "status_code": 200,
        "status": "OK",
        "place_id": "210190423",
        "encoding": "utf-8",
        "postal": "NC 28202"
    }
    assert finder.query(sample_input) == (
        '16740',
        'Charlotte-Concord-Gastonia, NC-SC Metro Area',
    )
def test_BratExperiment_add_allocation():
    # given a user name,
    # find the next allocation that the user has not yet annotated,
    # create a directory with the user's name,
    # and record in the experiment metadata that this allocation has been assigned to the user

    # setup: create a bucket for the brat config
    s3 = boto3.resource('s3')
    storage = S3Store('s3://test-bucket/samples')
    s3.create_bucket(Bucket='test-bucket')
    job_postings = [job_posting_factory(id=i, description=str(i)) for i in range(100, 200)]
    sample = sample_factory(job_postings, name='300_weighted', storage=storage)

    experiment = BratExperiment(
        experiment_name='initial_skills_tag',
        brat_s3_path='test-bucket/brat'
    )
    experiment.start(
        sample=sample,
        minimum_annotations_per_posting=2,
        max_postings_per_allocation=20,
        entities_with_shortcuts=(
            ('c', 'Competency'),
        )
    )
    # initialize the experiment in this bucket
    experiment = BratExperiment(
        experiment_name='initial_skills_tag',
        brat_s3_path='test-bucket/brat'
    )

    username = '******'
    # should not be able to allocate without creating a user
    with pytest.raises(ValueError):
        experiment.add_allocation(username)

    # set up a user to allocate to
    experiment.user_pw_store[username] = 'password'
    experiment.user_pw_store.save()
    allocated_directory = experiment.add_allocation(username)

    allocations = experiment.metadata['allocations'][username]
    assert len(allocations) == 1

    s3 = s3fs.S3FileSystem()
    filenames = s3.ls(allocated_directory)
    # there should be two files for each job posting: the .txt and the .ann
    assert len(filenames) == len(experiment.metadata['units'][allocations[0]]) * 2

    # simulate continued allocation with more users
    user_two = 'user_two'
    user_three = 'user_three'
    experiment.add_user(user_two, 'pass')
    experiment.add_user(user_three, 'pass')
    for i in range(0, 4):
        experiment.add_allocation(user_two)
        experiment.add_allocation(user_three)
    # at this point, trying to re-allocate to either user two or three
    # should fail as they have now tagged everything
    with pytest.raises(ValueError):
        experiment.add_allocation(user_two)

    # user one should still work for now
    for i in range(0, 4):
        new_directory = experiment.add_allocation(username)
        assert new_directory != allocated_directory

    # once they have seen the whole thing, no more!
    with pytest.raises(ValueError):
        experiment.add_allocation(username)
    def test_embedding_trainer_word2vec_s3(self):
        client = boto3.client('s3')
        client.create_bucket(Bucket='fake-open-skills',
                             ACL='public-read-write')
        s3_path = f"s3://fake-open-skills/model_cache/embedding"
        s3_storage = S3Store(path=s3_path)

        document_schema_fields = [
            'description', 'experienceRequirements', 'qualifications', 'skills'
        ]
        job_postings_generator = JobPostingCollectionSample(num_records=30)
        corpus_generator = Word2VecGensimCorpusCreator(
            job_postings_generator,
            document_schema_fields=document_schema_fields)
        w2v = Word2VecModel(storage=s3_storage,
                            size=10,
                            min_count=3,
                            iter=4,
                            window=6,
                            workers=3)

        trainer = EmbeddingTrainer(corpus_generator, w2v)
        trainer.train()
        trainer.save_model()

        vocab_size = len(w2v.wv.vocab.keys())

        s3 = s3fs.S3FileSystem()
        files = [f.split('/')[-1] for f in s3.ls(s3_path)]
        assert w2v.model_name == trainer.model_name
        assert set(files) == set([trainer.model_name])

        # Test online training
        job_postings_generator = JobPostingCollectionSample(num_records=50)
        corpus_generator = Word2VecGensimCorpusCreator(
            job_postings_generator,
            document_schema_fields=document_schema_fields)

        w2v_loaded = Word2VecModel.load(s3_storage, w2v.model_name)

        new_trainer = EmbeddingTrainer(corpus_generator, w2v_loaded)
        new_trainer.train()
        new_trainer.save_model()

        new_vocab_size = len(w2v_loaded.wv.vocab.keys())

        s3 = s3fs.S3FileSystem()
        files = [f.split('/')[-1] for f in s3.ls(s3_path)]
        assert set(files) == set([new_trainer.model_name, trainer.model_name])
        assert new_trainer.metadata['embedding_trainer'][
            'model_name'] != trainer.metadata['embedding_trainer']['model_name']
        assert vocab_size <= new_vocab_size

        # Save as different name
        w2v.save('other_name.model')

        s3 = s3fs.S3FileSystem()
        files = [f.split('/')[-1] for f in s3.ls(s3_path)]
        assert set(files) == set(
            [trainer.model_name, new_trainer.model_name, 'other_name.model'])

        # Change the store directory
        new_s3_path = "s3://fake-open-skills/model_cache/embedding/other_directory"
        new_trainer.save_model(S3Store(new_s3_path))
        s3 = s3fs.S3FileSystem()
        files = [f.split('/')[-1] for f in s3.ls(new_s3_path)]
        assert set(files) == set([new_trainer.model_name])
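The online-training half of this test depends on gensim's ability to extend a saved model's vocabulary and continue training; here is a hedged sketch of that sequence with plain gensim calls (keyword names follow gensim 4; older releases spell them size/iter instead of vector_size/epochs, as the test code above does).

from gensim.models import Word2Vec

# Assumes `old_sentences` and `new_sentences` are lists of token lists.
old_sentences = [["python", "developer"], ["data", "engineer"]]
new_sentences = [["machine", "learning", "engineer"]]

model = Word2Vec(sentences=old_sentences, vector_size=10, min_count=1, epochs=4)
model.save("w2v.model")

loaded = Word2Vec.load("w2v.model")
loaded.build_vocab(new_sentences, update=True)           # extend the existing vocabulary
loaded.train(new_sentences,
             total_examples=loaded.corpus_count,
             epochs=loaded.epochs)                        # continue training in place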
def test_BratExperiment_start():
    # create a bucket that will contain both the source samples and BRAT config
    s3 = boto3.resource('s3')
    bucket = s3.create_bucket(Bucket='test-bucket')
    storage = S3Store('s3://test-bucket/samples')

    # create a sample.
    # sample format is one file, one job posting per line, in common schema JSON format
    job_postings = [job_posting_factory(
        id=i,
        description=str(i),
        experienceRequirements='',
        qualifications='',
        skills=''
    ) for i in range(100, 200)]
    sample = sample_factory(job_postings, name='300_weighted', storage=storage)

    experiment = BratExperiment(
        experiment_name='initial_skills_tag',
        brat_s3_path='test-bucket/brat'
    )
    experiment.start(
        sample=sample,
        minimum_annotations_per_posting=2,
        max_postings_per_allocation=20,
        entities_with_shortcuts=(
            ('c', 'Competency'),
        )
    )

    # find metadata about what it created
    s3 = s3fs.S3FileSystem()

    # first assert that some shallow metadata was passed through
    assert experiment.metadata['sample_base_path'] == 's3://test-bucket/samples'
    assert experiment.metadata['sample_name'] == '300_weighted'
    assert experiment.metadata['entities_with_shortcuts'] == (('c', 'Competency'),)
    assert experiment.metadata['minimum_annotations_per_posting'] == 2
    assert experiment.metadata['max_postings_per_allocation'] == 20

    # next, look at the posting texts themselves.
    # we expect all of them to be present, split across a number of units
    units = experiment.metadata['units']
    assert len(units) == 5  # 100/20
    retrieved_descriptions = []
    for unit_name, documents in units.items():
        for posting_key, original_job_id in documents:
            # the original posting ids should not be exposed;
            # beyond that we don't care what the keys are, only that they exist where we expect them to
            assert posting_key != original_job_id
            with s3.open('{data_path}/.{unit_name}/{posting_key}.txt'.format(
                    data_path=experiment.data_path,
                    unit_name=unit_name,
                    posting_key=posting_key
            ), mode='rb') as f:
                posting = f.read().decode('utf-8')
                retrieved_descriptions.append(posting.strip())
            # make sure that the blank annotation file is there too
            with s3.open('{data_path}/.{unit_name}/{posting_key}.ann'.format(
                    data_path=experiment.data_path,
                    unit_name=unit_name,
                    posting_key=posting_key
            ), mode='rb') as f:
                assert len(f.read().decode('utf-8')) == 0
    # our fake descriptions were just the string values for the range numbers
    # so that's what should get written
    assert sorted(retrieved_descriptions) == sorted([str(i) for i in range(100, 200)])

    def assert_conf_contains(conf_name, expected):
        with s3.open('{path}/{conf_name}'.format(
                path=experiment.brat_config_path,
                conf_name=conf_name
        ), 'rb') as f:
            assert expected in f.read().decode('utf-8')

    assert_conf_contains('visual.conf', '[labels]\nCompetency\n')
    assert_conf_contains('annotation.conf', '[entities]\nCompetency\n')
    assert_conf_contains('kb_shortcuts.conf', 'c Competency\n')