def test_employment_type(self):
    """Compare the mean per-value tally with the expected per-value share.

    Builds a ``JobSampler`` over ``self.fake_corpus_train`` with
    ``keys='employmentType'``, draws ``self.sample_size`` items
    ``self.num_loops`` times, and tallies the second element of every
    sampled item (presumably the key value -- confirm against
    ``JobSampler.sample``). The mean tally must equal the total number of
    draws divided by ``len(self.employment_type)``.
    """
    sampler = JobSampler(
        job_posting_generator=self.fake_corpus_train,
        keys='employmentType',
    )

    tally = Counter()
    for _ in range(self.num_loops):
        tally.update(item[1] for item in sampler.sample(self.sample_size))

    observed_mean = np.mean(np.array(list(tally.values())))
    expected_mean = (
        self.num_loops * self.sample_size / len(self.employment_type)
    )
    assert observed_mean == expected_mean
def test_state(self):
    """Compare the mean per-value tally with the expected per-state share.

    Builds a ``JobSampler`` over ``self.fake_corpus_train`` keyed on the
    path ``['jobLocation', 'address', 'addressRegion']`` and draws
    ``self.sample_size`` items ``self.num_loops`` times. The second element
    of each sampled item is tallied, and the mean tally must equal the
    total number of draws divided by ``len(self.states)``.
    """
    sampler = JobSampler(
        job_posting_generator=self.fake_corpus_train,
        keys=['jobLocation', 'address', 'addressRegion'],
    )

    sampled_values = [
        item[1]
        for _ in range(self.num_loops)
        for item in sampler.sample(self.sample_size)
    ]

    tally = Counter(sampled_values)
    observed_mean = np.mean(np.array(list(tally.values())))
    expected_mean = self.num_loops * self.sample_size / len(self.states)
    assert observed_mean == expected_mean
def test_soc(self):
    """Compare the mean per-code tally with the expected per-occupation share.

    Builds a ``JobSampler`` over ``self.fake_corpus_train`` with no extra
    arguments and draws ``self.sample_size`` items ``self.num_loops`` times.
    The ``'onet_soc_code'`` of the first element of each sampled item is
    tallied, and the mean tally must equal the total number of draws
    divided by ``self.occ_num``.
    """
    sampler = JobSampler(job_posting_generator=self.fake_corpus_train)

    soc_codes = [
        item[0]['onet_soc_code']
        for _ in range(self.num_loops)
        for item in sampler.sample(self.sample_size)
    ]

    tally = Counter(soc_codes)
    observed_mean = np.mean(np.array(list(tally.values())))
    expected_mean = self.num_loops * self.sample_size / self.occ_num
    assert observed_mean == expected_mean
def test_major_group(self):
    """Check the sampled '13'/'11' ratio against the ratio of the weights.

    The corpus is first filtered down to postings whose two-character
    ``'onet_soc_code'`` prefix is ``'11'`` or ``'13'``. A ``JobSampler``
    configured with ``self.weights`` and keyed on that prefix is then
    iterated ``self.num_loops`` times, recording the count ratio of
    ``'13'`` to ``'11'`` for each pass.
    """
    expected_ratio = self.weights['13'] / self.weights['11']

    def major_group(job):
        # First two characters of the SOC code.
        return job['onet_soc_code'][:2]

    def in_target_groups(job):
        return major_group(job) in ['11', '13']

    filtered_postings = JobPostingFilterer(
        self.fake_corpus_train,
        [in_target_groups],
    )
    sampler = JobSampler(
        job_posting_generator=filtered_postings,
        k=self.sample_size,
        weights=self.weights,
        key=major_group,
    )

    ratios = []
    for _ in range(self.num_loops):
        # A plain dict is used so that a missing group raises KeyError.
        group_counts = dict(Counter(major_group(job) for job in sampler))
        ratios.append(group_counts['13'] / group_counts['11'])

    frequencies, edges = np.histogram(ratios, bins=[0, 1, 2, 3, 4, 5])

    # The weight ratio (2.0 in this case, per the original note) is expected
    # to fall inside the histogram bin that collected the most per-pass
    # ratios after all the sampling passes.
    modal_bin = np.argmax(frequencies)
    assert edges[modal_bin] <= expected_ratio <= edges[modal_bin + 1]
def test_major_group(self):
    """Check the sampled '13'/'11' ratio against the ratio of the weights.

    Builds a ``JobSampler`` over ``self.fake_corpus_train`` with
    ``weights=self.weights`` and ``major_group=True``, then draws
    ``self.sample_size`` items ``self.num_loops`` times. For each draw the
    first two characters of every item's second element are tallied and
    the count ratio of ``'13'`` to ``'11'`` is recorded.
    """
    expected_ratio = self.weights['13'] / self.weights['11']

    sampler = JobSampler(
        job_posting_generator=self.fake_corpus_train,
        weights=self.weights,
        major_group=True,
    )

    ratios = []
    for _ in range(self.num_loops):
        prefixes = [item[1][:2] for item in sampler.sample(self.sample_size)]
        # A plain dict is used so that a missing group raises KeyError.
        group_counts = dict(Counter(prefixes))
        ratios.append(group_counts['13'] / group_counts['11'])

    frequencies, edges = np.histogram(ratios, bins=[0, 1, 2, 3, 4, 5])

    # The weight ratio (2.0 in this case, per the original note) is expected
    # to fall inside the histogram bin that collected the most per-draw
    # ratios after all the sampling loops.
    modal_bin = np.argmax(frequencies)
    assert edges[modal_bin] <= expected_ratio <= edges[modal_bin + 1]
def test_state(self):
    """Compare the mean per-region tally with the expected per-state share.

    Builds a ``JobSampler`` over ``self.fake_corpus_train`` with
    ``k=self.sample_size`` and iterates it ``self.num_loops`` times. Each
    yielded posting is reduced to the value returned by
    ``safe_get(job, 'jobLocation', 'address', 'addressRegion')``, and the
    mean tally of those values must equal ``self.num_loops *
    self.sample_size / len(self.states)``.
    """
    sampler = JobSampler(
        job_posting_generator=self.fake_corpus_train,
        k=self.sample_size,
    )

    regions = [
        safe_get(job, 'jobLocation', 'address', 'addressRegion')
        for _ in range(self.num_loops)
        for job in sampler
    ]

    tally = Counter(regions)
    observed_mean = np.mean(np.array(list(tally.values())))
    expected_mean = self.num_loops * self.sample_size / len(self.states)
    assert observed_mean == expected_mean