def test_onet_skill_extractor_knowledge():
    with tempfile.NamedTemporaryFile(mode='w+') as outputfile:
        extractor = OnetSkillListProcessor(
            output_filename=outputfile.name,
            onet_source=MockOnetSkillCache(),
            hash_function=md5,
            ksa_types=['knowledge']
        )
        extractor.run()
        outputfile.seek(0)
        output = pd.read_csv(outputfile, sep='\t').T.to_dict().values()
        # +2 base rows in input of K file
        assert len(output) == 2
        assert len([row for row in output if row['ksa_type'] == 'knowledge']) == 2

        # make sure uuid is hashed version of the KSA
        for row in output:
            assert row['skill_uuid'] == md5(row['ONET KSA'])
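# The tests in this module call md5(...) on plain strings, so the imported
# md5 helper is presumably a thin wrapper around hashlib.md5 (which only
# accepts bytes). A minimal sketch of that assumption, named _md5_sketch
# here so it does not shadow the real helper:
import hashlib


def _md5_sketch(value):
    # Hypothetical stand-in for the project's md5 helper: encode the string
    # as UTF-8 and return the hex digest.
    return hashlib.md5(value.encode('utf-8')).hexdigest()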
def sequence_tagged_annotations(self):
    """Fetch sequence tagged annotations

    Expects these annotations to be produced by BRAT in CoNLL format.

    Returns: (dict), keys are tuples of (job posting id, tagger_id)
        and values are lists of (entity, token) tuples
    """
    annotations_by_posting_and_user = {}
    for user_name, unit_names in self.metadata['allocations'].items():
        for unit_name in unit_names:
            posting_id_lookup = dict(self.metadata['units'][unit_name])
            allocation_path = self.allocation_path(user_name, unit_name)
            for key in self.s3.ls(allocation_path + '/'):
                # this will iterate through posting text (.txt), annotation (.ann),
                # and CoNLL (.conll) files. In this case we only care about conll
                if key.endswith('.conll'):
                    posting_key = key.split('/')[-1].replace('.conll', '')
                    with self.s3.open(key) as f:
                        logging.info('Reading conll file at %s', key)
                        job_posting_id = posting_id_lookup[int(posting_key)]
                        raw_tokens = csv.reader(f, delimiter='\t')
                        tokens = []
                        for token_line in raw_tokens:
                            logging.info('Found token line %s', token_line)
                            if len(token_line) == 0:
                                tokens.append((None, None))
                            else:
                                tag, _, _, token = token_line
                                tokens.append((tag, token))
                        key = (job_posting_id, md5(user_name))
                        if any(token for token in tokens if token[0] not in {'O', None}):
                            annotations_by_posting_and_user[key] = tokens
                        else:
                            logging.warning(
                                'No annotations found in file. Skipping')
    return annotations_by_posting_and_user
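# Illustrative only (not part of the source): given the dict returned by
# sequence_tagged_annotations, a caller could tally how many skill-tagged
# (non-'O') tokens each (job posting id, tagger_id) pair produced.
from collections import Counter


def skill_token_counts(annotations_by_posting_and_user):
    # Keys are (job posting id, tagger_id) tuples; values are lists of
    # (entity, token) tuples, per the docstring above.
    counts = Counter()
    for key, tokens in annotations_by_posting_and_user.items():
        counts[key] = sum(
            1 for entity, token in tokens if entity not in {'O', None})
    return counts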
def test_onet_skill_extractor_all():
    with tempfile.NamedTemporaryFile(mode='w+') as outputfile:
        extractor = OnetSkillListProcessor(
            output_filename=outputfile.name,
            onet_source=MockOnetSkillCache(),
            hash_function=md5
        )
        extractor.run()
        outputfile.seek(0)
        output = pd.read_csv(outputfile, sep='\t').T.to_dict().values()
        # +17 base rows in input across the K,S,A,T files
        # -7 rows that don't have scale LV
        # -1 row that has 'Not Relevant' marked Y
        # -1 row that has 'Data Value' below 0
        # -1 row that is a dupe
        assert len(output) == 7
        assert len([row for row in output if row['ksa_type'] == 'knowledge']) == 2
        assert len([row for row in output if row['ksa_type'] == 'skill']) == 1
        assert len([row for row in output if row['ksa_type'] == 'ability']) == 1
        assert len([row for row in output if row['ksa_type'] == 'tool']) == 3

        # make sure uuid is hashed version of the KSA
        for row in output:
            assert row['skill_uuid'] == md5(row['ONET KSA'])

        # make sure nlp_a is cleaned version of skill
        assert next(
            row['nlp_a'] for row in output
            if row['ONET KSA'] == '10-key calculators'
        ) == '10key calculators'

        # make sure duplicate entries pick first SOC Code
        assert next(
            row['O*NET-SOC Code'] for row in output
            if row['ONET KSA'] == 'written comprehension'
        ) == '11-1011.00'

        # make sure duplicate entries pick first element id
        assert next(
            row['Element ID'] for row in output
            if row['ONET KSA'] == 'written comprehension'
        ) == '1.a.1.a.2'
def test_sequence_tagged_annotations(self):
    s3 = boto3.resource('s3')
    s3.create_bucket(Bucket='test-bucket')
    experiment = BratExperiment(
        experiment_name='initial_skills_tag',
        brat_s3_path='test-bucket/brat'
    )
    tags = {
        'user_1': {
            'unit_1/0': [
                'O\t0\t4\tthis',
                'B-SKILL\t5\t7\tis',
                'O\t8\t14\tpython',
            ],
            'unit_1/1': [
                'O\t0\t4\tthis',
                'O\t5\t7\tis',
                'B-SKILL\t8\t14\tpython',
            ]
        },
        'user_2': {
            'unit_1/0': [
                'O\t0\t4\tthis',
                'O\t5\t7\tis',
                'B-SKILL\t8\t14\tpython',
            ],
            'unit_1/1': [
                'O\t0\t4\tthis',
                'O\t5\t7\tis',
                'B-SKILL\t8\t14\tpython',
            ]
        },
    }
    experiment.metadata['units'] = {
        'unit_1': [
            (0, 'ABC_91238'),
            (1, 'ABC_4823943'),
        ]
    }
    experiment.metadata['allocations'] = {}
    for user_name, annotations in tags.items():
        experiment.metadata['allocations'][user_name] = []
        for key, token_lines in annotations.items():
            unit_name, num = key.split('/')
            if unit_name not in experiment.metadata['allocations'][user_name]:
                experiment.metadata['allocations'][user_name].append(unit_name)
            base_path = '{}/{}'.format(
                experiment.user_allocations_path(user_name), key)
            with experiment.s3.open('{}.txt'.format(base_path), 'wb') as f:
                f.write('does not matter we are not reading'.encode('utf-8'))
            with experiment.s3.open('{}.ann'.format(base_path), 'wb') as f:
                f.write('does not matter we are not reading'.encode('utf-8'))
            with experiment.s3.open('{}.conll'.format(base_path), 'wb') as f:
                f.write('\n'.join(token_lines).encode('utf-8'))
    experiment.metadata.save()

    self.maxDiff = None
    expected_tokens = {
        ('ABC_91238', md5('user_1')):
            [('O', 'this'), ('B-SKILL', 'is'), ('O', 'python')],
        ('ABC_91238', md5('user_2')):
            [('O', 'this'), ('O', 'is'), ('B-SKILL', 'python')],
        ('ABC_4823943', md5('user_1')):
            [('O', 'this'), ('O', 'is'), ('B-SKILL', 'python')],
        ('ABC_4823943', md5('user_2')):
            [('O', 'this'), ('O', 'is'), ('B-SKILL', 'python')],
    }
    self.assertDictEqual(experiment.sequence_tagged_annotations, expected_tokens)
def test_onet_skill_extractor():
    skills_content = [
        [
            'O*NET-SOC Code', 'Element ID', 'Element Name', 'Scale ID',
            'Data Value', 'N', 'Standard Error', 'Lower CI Bound',
            'Upper CI Bound', 'Recommend Suppress', 'Not Relevant', 'Date',
            'Domain Source'
        ],
        [
            '11-1011.00', '2.A.1.a', 'Reading Comprehension', 'IM', '4.12',
            '8', '0.13', '3.88', '4.37', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '2.A.1.a', 'Reading Comprehension', 'LV', '4.75',
            '8', '0.16', '4.43', '5.07', 'N', 'N', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '2.A.1.b', 'Active Listening', 'IM', '4.12', '8',
            '0.13', '3.88', '4.37', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '2.A.1.b', 'Active Listening', 'LV', '-4.88', '8',
            '0.23', '4.43', '5.32', 'N', 'N', '07/2014', 'Analyst'
        ],
    ]
    abilities_content = [
        [
            'O*NET-SOC Code', 'Element ID', 'Element Name', 'Scale ID',
            'Data Value', 'N', 'Standard Error', 'Lower CI Bound',
            'Upper CI Bound', 'Recommend Suppress', 'Not Relevant', 'Date',
            'Domain Source'
        ],
        [
            '11-1011.00', '1.A.1.a.1', 'Oral Comprehension', 'IM', '4.50',
            '8', '0.19', '4.13', '4.87', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '1.A.1.a.1', 'Oral Comprehension', 'LV', '4.88',
            '8', '0.13', '4.63', '5.12', 'N', 'Y', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '1.A.1.a.2', 'Written Comprehension', 'IM', '4.25',
            '8', '0.16', '3.93', '4.57', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '1.A.1.a.2', 'Written Comprehension', 'LV', '4.62',
            '8', '0.18', '4.27', '4.98', 'N', 'N', '07/2014', 'Analyst'
        ],
        [
            '11-2031.00', '1.A.1.a.3', 'Written Comprehension', 'IM', '4.25',
            '8', '0.16', '3.93', '4.57', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-2031.00', '1.A.1.a.3', 'Written Comprehension', 'LV', '4.62',
            '8', '0.18', '4.27', '4.98', 'N', 'N', '07/2014', 'Analyst'
        ],
    ]
    knowledge_content = [
        [
            'O*NET-SOC Code', 'Element ID', 'Element Name', 'Scale ID',
            'Data Value', 'N', 'Standard Error', 'Lower CI Bound',
            'Upper CI Bound', 'Recommend Suppress', 'Not Relevant', 'Date',
            'Domain Source'
        ],
        [
            '11-1011.00', '2.C.1.a', 'Administration and Management', 'IM',
            '4.75', '27', '0.09', '4.56', '4.94', 'N', 'n/a', '07/2014',
            'Incumbent'
        ],
        [
            '11-1011.00', '2.C.1.a', 'Administration and Management', 'LV',
            '6.23', '27', '0.17', '5.88', '6.57', 'N', 'N', '07/2014',
            'Incumbent'
        ],
        [
            '11-1011.00', '2.C.1.b', 'Clerical', 'IM', '2.66', '27', '0.22',
            '2.21', '3.11', 'N', 'n/a', '07/2014', 'Incumbent'
        ],
        [
            '11-1011.00', '2.C.1.b', 'Clerical', 'LV', '3.50', '27', '0.41',
            '2.66', '4.34', 'N', 'N', '07/2014', 'Incumbent'
        ],
    ]

    class MockOnetSkillCache(object):
        @contextlib.contextmanager
        def ensure_file(self, dataset):
            fake_data_lookup = {
                'Skills.txt': skills_content,
                'Abilities.txt': abilities_content,
                'Knowledge.txt': knowledge_content
            }
            with utils.makeNamedTemporaryCSV(fake_data_lookup[dataset], '\t') as temp:
                yield temp

    with tempfile.NamedTemporaryFile(mode='w+') as outputfile:
        extractor = OnetSkillImportanceExtractor(
            output_filename=outputfile.name,
            onet_source=MockOnetSkillCache(),
            hash_function=md5
        )
        extractor.run()
        outputfile.seek(0)
        output = pd.read_csv(outputfile, sep='\t').T.to_dict().values()
        # +14 base rows in input across the K,S,A files
        assert len(output) == 14
        # make sure uuid is hashed version of the KSA
        for row in output:
            assert row['skill_uuid'] == md5(row['ONET KSA'])
            # otherwise, this is a simple concat so not much to assert
            # we do use these rows though so make sure they're there
            assert 'Data Value' in row
            assert 'O*NET-SOC Code' in row
            assert 'ONET KSA' in row
            assert row['Scale ID'] in ['im', 'lv']
def test_onet_skill_extractor():
    skills_content = [
        [
            'O*NET-SOC Code', 'Element ID', 'Element Name', 'Scale ID',
            'Data Value', 'N', 'Standard Error', 'Lower CI Bound',
            'Upper CI Bound', 'Recommend Suppress', 'Not Relevant', 'Date',
            'Domain Source'
        ],
        [
            '11-1011.00', '2.A.1.a', 'Reading Comprehension', 'IM', '4.12',
            '8', '0.13', '3.88', '4.37', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '2.A.1.a', 'Reading Comprehension', 'LV', '4.75',
            '8', '0.16', '4.43', '5.07', 'N', 'N', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '2.A.1.b', 'Active Listening', 'IM', '4.12', '8',
            '0.13', '3.88', '4.37', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '2.A.1.b', 'Active Listening', 'LV', '-4.88', '8',
            '0.23', '4.43', '5.32', 'N', 'N', '07/2014', 'Analyst'
        ],
    ]
    abilities_content = [
        [
            'O*NET-SOC Code', 'Element ID', 'Element Name', 'Scale ID',
            'Data Value', 'N', 'Standard Error', 'Lower CI Bound',
            'Upper CI Bound', 'Recommend Suppress', 'Not Relevant', 'Date',
            'Domain Source'
        ],
        [
            '11-1011.00', '1.A.1.a.1', 'Oral Comprehension', 'IM', '4.50',
            '8', '0.19', '4.13', '4.87', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '1.A.1.a.1', 'Oral Comprehension', 'LV', '4.88',
            '8', '0.13', '4.63', '5.12', 'N', 'Y', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '1.A.1.a.2', 'Written Comprehension', 'IM', '4.25',
            '8', '0.16', '3.93', '4.57', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '1.A.1.a.2', 'Written Comprehension', 'LV', '4.62',
            '8', '0.18', '4.27', '4.98', 'N', 'N', '07/2014', 'Analyst'
        ],
        [
            '11-2031.00', '1.A.1.a.3', 'Written Comprehension', 'IM', '4.25',
            '8', '0.16', '3.93', '4.57', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-2031.00', '1.A.1.a.3', 'Written Comprehension', 'LV', '4.62',
            '8', '0.18', '4.27', '4.98', 'N', 'N', '07/2014', 'Analyst'
        ],
    ]
    knowledge_content = [
        [
            'O*NET-SOC Code', 'Element ID', 'Element Name', 'Scale ID',
            'Data Value', 'N', 'Standard Error', 'Lower CI Bound',
            'Upper CI Bound', 'Recommend Suppress', 'Not Relevant', 'Date',
            'Domain Source'
        ],
        [
            '11-1011.00', '2.C.1.a', 'Administration and Management', 'IM',
            '4.75', '27', '0.09', '4.56', '4.94', 'N', 'n/a', '07/2014',
            'Incumbent'
        ],
        [
            '11-1011.00', '2.C.1.a', 'Administration and Management', 'LV',
            '6.23', '27', '0.17', '5.88', '6.57', 'N', 'N', '07/2014',
            'Incumbent'
        ],
        [
            '11-1011.00', '2.C.1.b', 'Clerical', 'IM', '2.66', '27', '0.22',
            '2.21', '3.11', 'N', 'n/a', '07/2014', 'Incumbent'
        ],
        [
            '11-1011.00', '2.C.1.b', 'Clerical', 'LV', '3.50', '27', '0.41',
            '2.66', '4.34', 'N', 'N', '07/2014', 'Incumbent'
        ],
    ]
    tools_content = [
        [
            'O*NET-SOC Code', 'T2 Type', 'T2 Example', 'Commodity Code',
            'Commodity Title'
        ],
        [
            '11-1011.00', 'Tools', '10-key calculators', '44101809',
            'Desktop calculator'
        ],
        [
            '11-1011.00', 'Tools', 'Desktop computers', '43211507',
            'Desktop computers'
        ],
        [
            '11-1011.00', 'Tools', 'Laptop computers', '43211503',
            'Notebook computers'
        ],
        [
            '11-1011.00', 'Tools', 'Personal computers', '43211508',
            'Personal computers'
        ],
        [
            '11-1011.00', 'Tools', 'Personal digital assistants PDA',
            '43211504', 'Personal digital assistant PDAs or organizers'
        ],
        ['11-1011.00', 'Tools', 'Smartphones', '43191501', 'Mobile phones'],
        [
            '11-1011.00', 'Tools', 'Universal serial bus USB flash drives',
            '43201813', 'High capacity removable media drives'
        ],
        [
            '11-1011.00', 'Technology', 'Adobe Systems Adobe Acrobat software',
            '43232202', 'Document management software'
        ],
        [
            '11-1011.00', 'Technology', 'AdSense Tracker', '43232306',
            'Data base user interface and query software'
        ],
        [
            '11-1011.00', 'Technology', 'Blackbaud The Raiser\'s Edge',
            '43232303', 'Customer relationship management CRM software'
        ],
    ]

    class MockOnetDownloader(object):
        def download(self, source_file):
            fake_data_lookup = {
                'Skills': skills_content,
                'Abilities': abilities_content,
                'Knowledge': knowledge_content,
                'Tools and Technology': tools_content,
            }
            with utils.makeNamedTemporaryCSV(fake_data_lookup[source_file], '\t') as tempname:
                with open(tempname) as fh:
                    return fh.read()

    with patch(
        'skills_ml.datasets.skill_importances.onet.OnetToMemoryDownloader',
        MockOnetDownloader
    ):
        with tempfile.TemporaryDirectory() as output_dir:
            storage = FSStore(output_dir)
            extractor = OnetSkillImportanceExtractor(
                output_dataset_name='skills',
                storage=storage,
                hash_function=md5
            )
            extractor.run()
            pdin = io.StringIO(storage.load('skills.tsv').decode('utf-8'))
            output = pd.read_csv(pdin, sep='\t').T.to_dict().values()
            # +24 base rows in input across the K,S,A,T files
            assert len(output) == 24
            # make sure uuid is hashed version of the KSA
            for row in output:
                assert row['nlp_a'] == md5(row['ONET KSA'])
                # otherwise, this is a simple concat so not much to assert
                # we do use these rows though so make sure they're there
                assert 'O*NET-SOC Code' in row
                assert 'ONET KSA' in row
def test_onet_title_extractor():
    occupation_content = [
        ['O*NET-SOC Code', 'Title', 'Description'],
        ['11-1011.00', 'Chief Executives', 'Not important'],
        ['11-1011.03', 'Chief Sustainability Officers', 'Not important'],
        ['11-1021.00', 'General and Operations Managers', 'Not important'],
        ['11-1031.00', 'Legislators', 'Not important'],
    ]
    alternate_title_content = [
        ['O*NET-SOC Code', 'Alternate Title', 'Short Title', 'Source(s)'],
        ['11-1011.00', 'Aeronautics Commission Director', 'n/a', '08'],
        ['11-1011.00', 'Agricultural Services Director', 'n/a', '08'],
        ['11-1011.00', 'Alcohol and Drug Abuse Assistance Admin', 'n/a', '08'],
    ]
    sample_content = [
        ['O*NET-SOC Code', 'Reported Job Title', 'Shown in My Next Move'],
        ['11-1011.00', 'Chief Diversity Officer (CDO)', 'N'],
        ['11-1011.00', 'Chief Executive Officer (CEO)', 'Y'],
        ['11-1011.00', 'Chief Financial Officer (CFO)', 'Y'],
    ]

    class MockOnetTitleCache(object):
        @contextlib.contextmanager
        def ensure_file(self, dataset):
            fake_data_lookup = {
                'Sample of Reported Titles.txt': sample_content,
                'Occupation Data.txt': occupation_content,
                'Alternate Titles.txt': alternate_title_content,
            }
            with utils.makeNamedTemporaryCSV(fake_data_lookup[dataset], '\t') as temp:
                yield temp

    with tempfile.NamedTemporaryFile(mode='w+') as outputfile:
        extractor = OnetTitleExtractor(
            output_filename=outputfile.name,
            onet_source=MockOnetTitleCache(),
            hash_function=md5
        )
        extractor.run()
        outputfile.seek(0)
        output = pd.read_csv(outputfile, sep='\t').T.to_dict().values()

        # the new file should be the three files concatenated
        assert len(output) == 10

        # for non-occupations, original title should be occupation
        assert next(
            row['Original Title'] for row in output
            if row['Title'] == 'Aeronautics Commission Director'
        ) == 'Chief Executives'

        # for occupations, the original titles should also be occupation
        assert next(
            row['Original Title'] for row in output
            if row['Title'] == 'Chief Executives'
        ) == 'Chief Executives'

        # make sure uuid is hashed version of the title
        for row in output:
            assert row['job_uuid'] == md5(row['Original Title'])

        # make sure nlp_a is cleaned version of title
        assert next(
            row['nlp_a'] for row in output
            if row['Title'] == 'Chief Diversity Officer (CDO)'
        ) == 'chief diversity officer cdo'