def test_employment_type(self):
    """Check the mean per-value count when sampling keyed on ``employmentType``.

    Draws ``self.sample_size`` items ``self.num_loops`` times, tallies the
    second element of every sampled item, and asserts that the mean tally
    equals ``num_loops * sample_size / len(self.employment_type)``.
    """
    sampler = JobSampler(
        job_posting_generator=self.fake_corpus_train,
        keys='employmentType',
    )

    # Element 1 of each sampled item is the value being tallied.
    drawn = [
        item[1]
        for _ in range(self.num_loops)
        for item in sampler.sample(self.sample_size)
    ]
    tally = Counter(drawn)

    # NOTE(review): the mean is (total items) / (distinct values seen), so --
    # assuming every sample() call yields sample_size items -- this passes
    # exactly when all len(self.employment_type) values show up; confirm.
    observed_mean = np.mean(np.array(list(tally.values())))
    expected_mean = (
        self.num_loops * self.sample_size / len(self.employment_type)
    )
    assert observed_mean == expected_mean
def test_soc(self):
    """Check the mean per-code count when sampling with the default key.

    Draws ``self.sample_size`` items ``self.num_loops`` times, tallies the
    ``'onet_soc_code'`` entry of the first element of every sampled item,
    and asserts that the mean tally equals
    ``num_loops * sample_size / self.occ_num``.
    """
    sampler = JobSampler(job_posting_generator=self.fake_corpus_train)

    # Element 0 of each sampled item is indexed by 'onet_soc_code' here.
    drawn_codes = [
        item[0]['onet_soc_code']
        for _ in range(self.num_loops)
        for item in sampler.sample(self.sample_size)
    ]
    tally = Counter(drawn_codes)

    # NOTE(review): the mean is (total items) / (distinct codes seen), so --
    # assuming every sample() call yields sample_size items -- this passes
    # exactly when self.occ_num distinct codes show up; confirm.
    observed_mean = np.mean(np.array(list(tally.values())))
    expected_mean = self.num_loops * self.sample_size / self.occ_num
    assert observed_mean == expected_mean
def test_state(self):
    """Check the mean per-value count when sampling on a nested key path.

    The sampler is built with ``keys=['jobLocation', 'address',
    'addressRegion']``. The test draws ``self.sample_size`` items
    ``self.num_loops`` times, tallies the second element of every sampled
    item, and asserts that the mean tally equals
    ``num_loops * sample_size / len(self.states)``.
    """
    key_path = ['jobLocation', 'address', 'addressRegion']
    sampler = JobSampler(
        job_posting_generator=self.fake_corpus_train,
        keys=key_path,
    )

    # Element 1 of each sampled item is the value being tallied.
    drawn = [
        item[1]
        for _ in range(self.num_loops)
        for item in sampler.sample(self.sample_size)
    ]
    tally = Counter(drawn)

    # NOTE(review): the mean is (total items) / (distinct values seen), so --
    # assuming every sample() call yields sample_size items -- this passes
    # exactly when all len(self.states) values show up; confirm.
    observed_mean = np.mean(np.array(list(tally.values())))
    expected_mean = self.num_loops * self.sample_size / len(self.states)
    assert observed_mean == expected_mean
def test_major_group(self):
    """Check that weighted sampling reproduces the '13'/'11' weight ratio.

    For each of ``self.num_loops`` draws, the first two characters of the
    second element of every sampled item are tallied and the ratio of the
    ``'13'`` count to the ``'11'`` count is recorded. The recorded ratios
    are binned into unit-width bins over ``[0, 5]``, and the test asserts
    that ``self.weights['13'] / self.weights['11']`` lies within the edges
    of the most populated bin.
    """
    expected_ratio = self.weights['13'] / self.weights['11']
    sampler = JobSampler(
        job_posting_generator=self.fake_corpus_train,
        weights=self.weights,
        major_group=True,
    )

    observed_ratios = []
    for _ in range(self.num_loops):
        prefixes = [item[1][:2] for item in sampler.sample(self.sample_size)]
        # Plain dict on purpose: a group missing from a draw raises KeyError
        # rather than silently counting as zero.
        group_counts = dict(Counter(prefixes))
        observed_ratios.append(group_counts['13'] / group_counts['11'])

    bin_counts, bin_edges = np.histogram(
        observed_ratios, bins=[0, 1, 2, 3, 4, 5]
    )

    # The expected weight ratio should fall inside the bin that collected
    # the most observed ratios across all draws.
    fullest = np.argmax(bin_counts)
    assert bin_edges[fullest] <= expected_ratio <= bin_edges[fullest + 1]