def parse_line(self, line):
    # Columns are tab-separated: id first, entity key second, definition last.
    parts = line.strip().split('\t')
    output = Bunch(self.parse_entity(parts[1]),
                   wn_id=parts[0],
                   wn_key=parts[1],
                   definition=parts[-1])
    return output
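# Hedged sketch of the Bunch type assumed throughout this file: a dict whose
# items double as attributes (NeoBunch and sklearn.utils.Bunch behave this
# way). This stand-in is illustrative, not the project's actual class:
#
#   class Bunch(dict):
#       def __init__(self, *args, **kwargs):
#           super().__init__(*args, **kwargs)
#           self.__dict__ = self
#
# With that, parse_line on a tab-separated line (the column layout below is
# assumed from the indices in the code, not confirmed by the source) behaves
# like:
#
#   parse_line('n02084071\t__dog_NN_1\ta domesticated carnivore')
#   -> Bunch(word='dog', pos='NN', sense_id='1', wn_id='n02084071',
#            wn_key='__dog_NN_1', definition='a domesticated carnivore')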
@st.composite
def draw_items_from_pool(draw, which, min_size, max_size,
                         get_id_column_name=lambda x: x + '_id'):
    """
    Draw a list of NeoBunch containing example items from one of the pools
    of objects. NeoBunch is a subclass of dict whose items can be accessed
    like attributes, so it can be used both like an object and like a dict.

    :param draw: Callable for drawing examples from strategies.
    :param which: The name of the object pool to sample from.
    :param min_size: The minimum number of items to draw.
    :param max_size: The maximum number of items to draw.

    :return: List of dicts containing data for the requested example items.
    """
    pool = fake_object_pools[which]
    strategy = st.sampled_from(pool)
    items = draw(
        non_empty_lists(strategy, min_size=min_size, max_size=max_size))
    return [
        Bunch(
            # Generate the id column - autoincrement starts from 1.
            {get_id_column_name(which): i + 1},
            **x,
        )
        for i, x in enumerate(items)
    ]
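# Hedged usage sketch: wrapped in @st.composite, hypothesis supplies ``draw``
# and the call below yields a strategy. The 'company' pool name and the size
# bounds are illustrative assumptions, not taken from the source.
#
#   @given(companies=draw_items_from_pool('company', min_size=1, max_size=5))
#   def test_company_ids_autoincrement(companies):
#       assert [c.company_id for c in companies] == \
#           list(range(1, len(companies) + 1))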
def _read_data_files(wndb_path, lexnames, pos_files=['noun', 'verb', 'adj', 'adv']):
    data = {}
    for pos_file in pos_files:
        data_path = os.path.join(wndb_path, 'data.' + pos_file)
        with open(data_path) as ifp:
            for line in ifp:
                # Skip the license header; its lines start with spaces.
                if line.startswith(' '):
                    continue
                part = line.strip().split()
                _num_words = int(part[3], 16)        # w_cnt is hexadecimal
                _num_pointers = int(part[4 + _num_words * 2])
                _pp = 4 + _num_words * 2 + 1         # first pointer field
                _gloss_p = part.index('|') + 1       # gloss follows '|'
                data_entry = Bunch(
                    offset=part[0],
                    lexname=lexnames[part[1]],
                    synset_type=part[2],
                    words=DBWordNetParser._parse_words(
                        DBWordNetParser._chunklist(part[4:_pp - 1])),
                    pointers=DBWordNetParser._parse_pointers(
                        DBWordNetParser._chunklist(
                            part[_pp:_pp + _num_pointers * 4], 4)),
                    gloss=' '.join(part[_gloss_p:]))
                data['{}-{}'.format(pos_file, part[0])] = data_entry
    return data
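# For reference, each data.<pos> line the parser above consumes has the
# layout documented in wndb(5WN):
#
#   synset_offset lex_filenum ss_type w_cnt word lex_id [word lex_id...]
#   p_cnt [ptr_symbol synset_offset pos source/target...] | gloss
#
# w_cnt is a two-digit hexadecimal count, which is why part[3] is parsed with
# base 16, and source/target is four hexadecimal digits (two for each side).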
@st.composite
def lists_of_employees(draw, locations, job_titles, companies, min_size, max_size):
    """
    Returns a strategy which generates a list of NeoBunch containing data
    describing employee instances. NeoBunch is a subclass of dict whose
    items can be accessed like attributes, so it can be used both like an
    object and like a dict.

    :param draw: Callable for drawing examples from strategies.
    :param locations: The available locations drawn by hypothesis.
    :param job_titles: The available job titles drawn by hypothesis.
    :param companies: The available companies drawn by hypothesis.
    :param min_size: The minimum number of employee instances to generate.
    :param max_size: The maximum number of employee instances to generate.

    :return: Strategy for generating a list of ``dict`` with employee data.
    """
    strategy = employees(locations, job_titles, companies)
    items = draw(
        non_empty_lists(strategy, min_size=min_size, max_size=max_size))
    return [
        # Generate the id - autoincrement starts from 1.
        Bunch(employee_id=i + 1, **x)
        for i, x in enumerate(items)
    ]
@st.composite
def employees(draw, locations, job_titles, companies):
    """
    Returns a strategy which generates a NeoBunch containing data describing
    an employee instance. NeoBunch is a subclass of dict whose items can be
    accessed like attributes, so it can be used both like an object and like
    a dict.

    :param draw: Callable for drawing examples from strategies.
    :param locations: The available locations drawn by hypothesis.
    :param job_titles: The available job titles drawn by hypothesis.
    :param companies: The available companies drawn by hypothesis.

    :return: Strategy for generating a ``dict`` with employee data.
    """
    def draw_name(which):
        pool = fake_object_pools[which]
        strategy = st.sampled_from(pool)
        return draw(strategy)[which]

    def draw_id(which):
        # Foreign keys are 1-based indices into the referenced table.
        return draw(st.integers(min_value=1, max_value=len(which)))

    return Bunch(
        first_name=draw_name('first_name'),
        last_name=draw_name('last_name'),
        date_of_birth=draw(dates_of_birth()),
        location_id=draw_id(locations),
        job_title_id=draw_id(job_titles),
        company_id=draw_id(companies),
    )
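# Hedged usage sketch: ``lists_of_employees`` composes with the pool
# strategies inside a test. The pool names ('location', 'job_title',
# 'company') and the size bounds are illustrative assumptions.
#
#   @given(data=st.data())
#   def test_employees_reference_valid_rows(data):
#       locations = data.draw(draw_items_from_pool('location', 1, 5))
#       job_titles = data.draw(draw_items_from_pool('job_title', 1, 5))
#       companies = data.draw(draw_items_from_pool('company', 1, 5))
#       staff = data.draw(
#           lists_of_employees(locations, job_titles, companies, 1, 10))
#       assert all(1 <= e.location_id <= len(locations) for e in staff)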
def _parse_pointers(tups):
    pointers = []
    for tup in tups:
        pointers.append(
            Bunch(pointer=tup[0],
                  offset='{}-{}'.format(
                      DBWordNetParser._POS_FILE_MAP_[tup[2]], tup[1]),
                  # source/target is four hex digits: two per side.
                  source=tup[3][:2],
                  target=tup[3][2:]))
    return pointers
def data_info(self, data):
    # data is a 4-D batch: (num examples, channels, width, height).
    n, c, w, h = data.shape
    center = w // 2
    l, r = center - self.remove_size, center + self.remove_size
    return Bunch(n=n, c=c, w=w, h=h, l=l, r=r,
                 center=center, rs=self.remove_size)
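# Worked example (hypothetical shapes): for a batch of shape (32, 3, 64, 64)
# and remove_size=8, data_info returns center=32, l=24 and r=40, i.e. the
# column window [l, r) around the center. Interpreting this window as the
# region a masking step removes is an assumption based on the field names.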
def perform_integration_test(data, page, page_name, get_expected,
                             compare_functions, **parameters):
    """
    Perform an integration test to verify that a query works end to end
    (load page -> set params -> show results).

    :param data: Employee test data.
    :param page: The capybara page object.
    :param page_name: The name of the page to load.
    :param get_expected: Callable for retrieving the expected result.
    :param compare_functions: Iterable of functions used to validate the
        results by comparing the actual with the expected.
    :param parameters: Keyword parameters to pass to get_expected. Also
        defines the forms which will be filled with generated data.
    """
    def get_url():
        return 'sqlite:///:memory:'

    def data_importer(session):
        return import_data(data, session)

    with patch('employee_insights.database.get_url', get_url), \
            patch('employee_insights.database.import_data', data_importer):

        if page.current_path != '/' + page_name:
            page.visit(page_name)

        for name, value in parameters.items():
            page.fill_in(name, value=value)

        if parameters:
            page.click_button("Go")
            time.sleep(0.1)

        actual_result = page.html
        actual_result = (Bunch(x) for x in table2dicts(actual_result))
        expected_result = list(get_expected(data, **parameters))

        if expected_result:
            for actual, expected in zip_longest(actual_result, expected_result):
                for compare_function in compare_functions:
                    if not compare_function(actual, expected):
                        # Fall back to a plain equality assert so the test
                        # runner prints a readable diff of mismatching rows.
                        assert actual == expected
        else:
            expected = 'No rows'
            actual = page.find('#results').text
            assert expected in actual
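# Hedged usage sketch: a page-level test delegates to the helper above. The
# page fixture, page name, form field names, and compare_percentage helper
# are illustrative assumptions, not taken from the source.
#
#   def test_employees_percentage_by_location(data, page):
#       perform_integration_test(
#           data, page, 'employees_percentage_by_location',
#           get_expected, [compare_percentage],
#           location='Europe/Germany', min_percentage=50)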
def get_entries(self, idx_entries):
    out_entries = []
    for idx_entry in idx_entries:
        for synset_offset in idx_entry.synset_offsets:
            data_entry = self.data[synset_offset]
            sense_key = '{}-{}-{}'.format(idx_entry.lemma,
                                          data_entry.synset_type,
                                          data_entry.offset)
            entry = Bunch(lemma=idx_entry.lemma,
                          pos=idx_entry.pos,
                          synset_type=data_entry.synset_type,
                          sense_number=self.sense_numbers[sense_key],
                          gloss=data_entry.gloss,
                          words=[e.word for e in data_entry.words])
            out_entries.append(entry)
    return out_entries
def read_index_files(wndb_path, pos_files=['noun', 'verb', 'adj', 'adv']):
    entries = {}
    for pos_file in pos_files:
        idx_path = os.path.join(wndb_path, 'index.' + pos_file)
        with open(idx_path) as ifp:
            for line in ifp:
                # Skip the license header; its lines start with spaces.
                if line.startswith(' '):
                    continue
                part = line.strip().split()
                _ap = 4 + int(part[3])  # index just past the pointer symbols
                lemma = part[0]
                idx_entry = Bunch(
                    lemma=lemma,
                    pos=part[1],
                    pointers=part[4:_ap],
                    num_tagsenes=int(part[_ap + 1]),
                    synset_offsets=[
                        '{}-{}'.format(pos_file, _o)
                        for _o in part[_ap + 2:]
                    ])
                entries.setdefault(lemma, []).append(idx_entry)
    return entries
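# For reference, each index.<pos> line has the layout documented in wndb(5WN):
#
#   lemma pos synset_cnt p_cnt [ptr_symbol...] sense_cnt tagsense_cnt
#   synset_offset [synset_offset...]
#
# so part[4:_ap] are the p_cnt pointer symbols, part[_ap] is sense_cnt,
# part[_ap + 1] is tagsense_cnt, and the remaining fields are synset offsets.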
def parse_entity(self, e):
    # Strip the two-character prefix, then split on '_': the last two fields
    # are the POS tag and the sense id; everything before them is the word.
    parts = e[2:].split('_')
    return Bunch(word=' '.join(parts[0:-2]),
                 pos=parts[-2],
                 sense_id=parts[-1])
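# Hedged example: for a key such as '__hot_dog_NN_1' (the '__' prefix is an
# assumption consistent with e[2:] above), parse_entity yields
# Bunch(word='hot dog', pos='NN', sense_id='1'). Multi-word lemmas work
# because only the last two '_'-separated fields are treated as pos/sense.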
def _parse_words(tups):
    words = []
    for tup in tups:
        words.append(Bunch(word=tup[0], lex_id=tup[1]))
    return words
def parse_synset_name(self, name):
    parts = name.split('.')
    return Bunch(pos=parts[-2],
                 sense_id=int(parts[-1]),
                 wn_id=name)
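# Hedged example: for an NLTK-style synset name such as 'dog.n.01',
# parse_synset_name returns Bunch(pos='n', sense_id=1, wn_id='dog.n.01').
# The 'lemma.pos.sense' naming convention is an assumption based on the
# indices used above, not confirmed by the source.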
if __name__ == '__main__':
    opt = Bunch(wndb_dir='data/wndb')
    parser = DBWordNetParser(opt)
    idx_entries, inflected = parser.get_idx_entries('tests')
    for e in parser.get_entries(idx_entries):
        print(e)
def get_expected(employee_data, location, min_percentage):
    """
    Calculate the percentage of employees of a company at a particular
    location, filtered by a minimum percentage of employees, using python.

    :param employee_data: Hypothesis generated employee database source data.
    :param location: The location to get the employee percentage for. This is
        a '/'-separated list in one of the following combinations:

        - continent/country/state/city
        - continent/country/state
        - continent/country
        - continent

    :param min_percentage: The minimum percentage of employees a location
        must have to be included in the output.

    :return: Generator which when iterated yields ``Expected`` instances
        representing the expected result of
        get_employees_percentage_by_location.
    """
    location_fields = ['city', 'state', 'country', 'continent']
    # zip_longest pads the split location with None up to four fields.
    continent, country, state, city = (x for x, y in itertools.zip_longest(
        location.split('/'), location_fields))
    params = Bunch(continent=continent, country=country,
                   state=state, city=city)

    def get_location(employee):
        return employee_data.locations[employee.location_id - 1]

    def are_all_attrs_equal(lhs, rhs, attrs):
        return all(getattr(lhs, attr) == getattr(rhs, attr) for attr in attrs)

    def get_location_and_where(n):
        """
        Get the location to include in the result and a where function to
        filter the company employees for inclusion when calculating the
        percentage at a location.
        """
        fields = location_fields[n:]
        filter_func = partial(are_all_attrs_equal, params, attrs=fields)
        employee_location = next(filter(filter_func, employee_data.locations))
        location = '/'.join(
            getattr(employee_location, x) for x in reversed(fields))

        def where(employee):
            return are_all_attrs_equal(get_location(employee), params, fields)

        return location, where

    for company, company_employees in get_company_employees(employee_data):

        if city:
            location, where = get_location_and_where(0)
        elif state:
            location, where = get_location_and_where(1)
        elif country:
            location, where = get_location_and_where(2)
        else:
            location, where = get_location_and_where(3)

        n_employees_in_location = count(filter(where, company_employees))
        n_employees = float(len(company_employees))
        percentage = n_employees_in_location / n_employees * 100

        # Chained comparison: the percentage must be positive and exceed
        # the requested minimum.
        if 0 < percentage > min_percentage:
            yield Expected(company.company_id, company.company_name,
                           location, percentage)
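# Hedged worked example: with location='Europe/Germany' only 'country' and
# 'continent' are set, so get_location_and_where(2) builds a filter matching
# employees whose location agrees with params on those two fields. For a
# company with 4 employees, 3 of them in Germany, the yielded row would carry
# location='Europe/Germany' and percentage=75.0 (values illustrative only).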