def setUp(self):
    # Build fixture paths and URLs, launch `run_forever` in a Process, and
    # load the home document with its auth link pointed at the local server.
    here = os.path.abspath(os.path.dirname(__file__))
    self.fixture_dir = os.path.join(here, 'fixtures')
    self.test_url = 'http://127.0.0.1:8080/'
    json_endpoint = 'http://127.0.0.1:8080/?json_response={}'

    # Link values consulted by the navigation tests.
    self.nav_values = dict(
        next="http://127.0.0.1:8080/docs?tag=npr_api&profile=someGUIDvalue&offset=10",
        prev="http://127.0.0.1:8080/docs?",
        first="http://127.0.0.1:8080/docs?tag=npr_api&profile=someGUIDvalue",
        last="http://127.0.0.1:8080/docs?tag=npr_api&profile=someGUIDvalue&offset=13130",
        current="http://127.0.0.1:8080/docs?tag=npr_api&profile=someGUIDvalue",
    )

    # Where the JSON fixtures live on disk.
    fixture = self.fixture_dir
    self.home_doc = os.path.join(fixture, 'homedoc.json')
    self.auth_doc = os.path.join(fixture, 'authdetails.json')
    self.data_doc = os.path.join(fixture, 'datadoc.json')

    # The same fixtures can be requested through the test server, which
    # is started here, before the URLs and the home document are prepared.
    self.server_process = Process(target=run_forever)
    self.server_process.start()

    self.test_entry_point = json_endpoint.format(self.home_doc)
    self.auth_url = json_endpoint.format(self.auth_doc)
    self.data_url = json_endpoint.format(self.data_doc)

    with open(self.home_doc, 'r') as fixture_file:
        home_pelican = PelicanJson(json.load(fixture_file))
    # Redirect the fourth auth link to the locally served auth document.
    home_pelican.set_nested_value(['links', 'auth', 3, 'href'], self.auth_url)
    self.home_values = home_pelican.convert()
def test_set_all_paths(self):
    # Overwrite the value at every path of the item fixture via the
    # module-level setter, then read each path back with the getter.
    replacement = "NEWVALUE"
    pelican = PelicanJson(self.item)
    for keys in pelican.paths():
        set_nested_value(self.item, keys, replacement)
    for keys in pelican.paths():
        self.assertEqual(get_nested_value(self.item, keys), replacement)
def validate(data, profile_type="story"):
    """Check ``data`` against the stored profile for ``profile_type``.

    The profile JSON is read from the ``stored_profiles`` directory next to
    this module. Three outcomes are possible:

    * ``False`` (bare) when ``data`` contains the profile's ``REQUIRED``
      marker value anywhere, i.e. a required field was left unfilled.
    * ``(False, path)`` when the value at ``path`` has a different type from
      the profile's value at the same path, or the profile has no such path.
    * ``(True, None)`` when every value passes the type check.

    NOTE(review): the first outcome is a bare bool while the others are
    tuples, and a non-empty tuple is always truthy. Callers must not rely on
    plain truthiness of the result -- confirm which shape callers expect.
    """
    _name, profile_file = profiles[profile_type]
    stored_dir = os.path.join(
        os.path.abspath(os.path.dirname(__file__)), 'stored_profiles')
    with open(os.path.join(stored_dir, profile_file), 'r') as handle:
        profile = json.load(handle)

    candidate = PelicanJson(data)
    # A single hit for the REQUIRED marker is enough to reject the data.
    for _unfilled in candidate.search_value(profile['REQUIRED']):
        return False

    # Every value must have the same type as its counterpart in the profile.
    reference = PelicanJson(profile)
    for path, value in candidate.enumerate():
        try:
            expected = reference.get_nested_value(path)
        except (TypeError, IndexError, KeyError):
            return False, path
        if type(value) != type(expected):
            return False, path
    return True, None
def test_find_and_replace(self):
    # After replacing the species name, every path holding the new value
    # must be one of the two known locations.
    expected_locations = [['query', 'normalized', 0, 'to'],
                          ['query', 'pages', '1266004', 'title']]
    pelican = PelicanJson(self.pelecanus_occidentalis)
    pelican.find_and_replace('Pelecanus occidentalis', 'Brown Pelican')
    for found in pelican.search_value('Brown Pelican'):
        self.assertIn(found, expected_locations)
def test_get_item(self):
    # `get` returns the nested section equal to a PelicanJson built from the
    # same raw data, and None for a missing key.
    continue_section = self.ricketts['query-continue']
    whole = PelicanJson(self.ricketts)
    expected = PelicanJson(continue_section)
    fetched = whole.get('query-continue')
    self.assertEqual(fetched, expected)
    missing = whole.get('NO-KEY')
    self.assertEqual(missing, None)
def test_delete_item(self):
    # Assign a key, delete it, and confirm deletion; deleting an unknown key
    # must raise KeyError.
    test_pelican = PelicanJson(self.pelecanus_occidentalis)
    # The original literal was ['some', 'new' 'vals']: the missing comma made
    # Python concatenate the adjacent strings into 'newvals'.
    replacement = ['some', 'new', 'vals']
    # Assign a copy so the assertion compares against an independent list.
    test_pelican['query'] = list(replacement)
    self.assertEqual(test_pelican['query'], replacement)

    del test_pelican['query']
    self.assertIsNone(test_pelican.get('query'))

    with self.assertRaises(KeyError):
        del test_pelican['none']
def test_create_path_raise_badpath(self):
    # Each malformed path (leading int, tuple key, dict key) must be
    # rejected with BadPath.
    test_rickettsi = PelicanJson(self.ricketts)
    bad_paths = [[4, 'query', 'normalized', 0, 'from'],
                 [('some', 'tuple'), 'query'],
                 [{'some': 'dict'}, 'query']]
    # One assertRaises block per path: a `with` body stops at the first
    # exception, so sharing a single block would only ever test the first
    # path and silently skip the others.
    for bad_path in bad_paths:
        with self.subTest(path=bad_path):
            with self.assertRaises(BadPath):
                test_rickettsi.create_path(bad_path, "VALUE")
def test_iter(self):
    # keys() must yield exactly the expected key set for each fixture.
    expectations = (
        (self.pelecanus_occidentalis,
         {'query', 'normalized', 'to', 'from', 'pages', '1266004', 'ns',
          'title', 'pageid'}),
        (self.book,
         {'thumbnail_url', 'preview_url', 'bib_key', 'info_url',
          'ISBN:9780804720687', 'preview'}),
    )
    for raw, expected_keys in expectations:
        wrapped = PelicanJson(raw)
        self.assertEqual(set(wrapped.keys()), expected_keys)
def test_contains(self):
    # Membership holds for known keys, for every key reported by keys(),
    # and fails for an unknown key.
    book = PelicanJson(self.book)
    for known_key in ('ISBN:9780804720687', 'preview', 'bib_key'):
        self.assertIn(known_key, book)

    monterrey = PelicanJson(self.monterrey)
    for reported_key in monterrey.keys():
        self.assertIn(reported_key, monterrey)
    self.assertFalse('NADA' in monterrey)
def test_generate_paths_inside_list(self):
    # Every path from generate_paths whose value is the empty string must
    # also be reported by PelicanJson.search_value('').
    pelican = PelicanJson(self.item)
    blank_paths = list(pelican.search_value(''))
    for candidate in generate_paths(self.item):
        if get_nested_value(self.item, candidate) == '':
            self.assertIn(candidate, blank_paths)
def test_paths(self):
    # Each path produced for the book fixture must be one of the five known
    # [ISBN key, leaf] pairs.
    isbn = 'ISBN:9780804720687'
    leaves = ('thumbnail_url', 'bib_key', 'preview_url', 'info_url', 'preview')
    known_paths = [[isbn, leaf] for leaf in leaves]
    book = PelicanJson(self.book)
    for produced in book.paths():
        self.assertIn(produced, known_paths)
def test_set_nested_value_force_add_to_list(self):
    # Force-setting tag index 4 must store the value there, and index 3 is
    # expected to read back as None.
    pelican = PelicanJson(self.item)
    target = ['attributes', 'tags', 4]
    pelican.set_nested_value(target, 'New Tag', force=True)
    self.assertEqual(pelican.get_nested_value(target), 'New Tag')

    placeholder_value = pelican.get_nested_value(['attributes', 'tags', 3])
    self.assertEqual(placeholder_value, None)
def wiki_summary_by_name(self):
    """Query the English Wikipedia API for the page titled ``str(self)``.

    The request asks for the plain-text intro extract and follows redirects.
    The JSON response is wrapped in ``PelicanJson`` and the value at the last
    path yielded by ``enumerate()`` is returned.

    Returns:
        The value at the last enumerated path of the response (presumably the
        page extract -- confirm against the API's field ordering), or
        ``None`` when the response has no values to enumerate.
    """
    api_url = 'https://en.wikipedia.org/w/api.php'
    # Let requests build the query string so titles containing characters
    # such as '&', '#' or '+' are percent-encoded, not spliced in raw.
    query = {
        'format': 'json',
        'action': 'query',
        'prop': 'extracts',
        'redirects': '1',
        'exintro': '',
        'explaintext': '',
        'titles': str(self),
    }
    wiki_response = PelicanJson(requests.get(api_url, params=query).json())

    # Walk the whole tree, remembering only the final (path, value) item.
    last_item = None
    for item in wiki_response.enumerate():
        last_item = item
    if last_item is None:
        # Nothing was enumerated; previously this raised UnboundLocalError.
        return None
    return wiki_response.get_nested_value(last_item[0])
def strip_guids(json_result):
    """Return ``json_result`` with GUID-shaped substrings replaced.

    Every string value containing an 8-4-4-4-12 run of word characters has
    each such run replaced by ``'someGUIDvalue'``. The result is whatever
    ``PelicanJson.convert()`` produces for the modified tree.
    """
    guid_re = re.compile(r'\w{8}-\w{4}-\w{4}-\w{4}-\w{12}')
    tree = PelicanJson(json_result)
    for location, leaf in tree.enumerate():
        # Only exact `str` values are candidates for substitution.
        if type(leaf) != str:
            continue
        if guid_re.search(leaf):
            tree.set_nested_value(location, guid_re.sub('someGUIDvalue', leaf))
    return tree.convert()
def clean_urls(json_result):
    """Return ``json_result`` with PMP sandbox hosts pointed at localhost.

    Every string value containing ``https://api-sandbox.pmp.io`` or
    ``https://publish-sandbox.pmp.io`` has that prefix replaced by
    ``http://127.0.0.1:8080``. The result is whatever
    ``PelicanJson.convert()`` produces for the modified tree.
    """
    # A non-capturing group gives real alternation. The previous pattern used
    # `[api|publish]`, a character class matching ONE character, so it never
    # matched the sandbox hosts. Dots are escaped to match literally.
    urlpat = re.compile(r'https://(?:api|publish)-sandbox\.pmp\.io')
    pelican = PelicanJson(json_result)
    for path, value in pelican.enumerate():
        if isinstance(value, str) and urlpat.search(value):
            pelican.set_nested_value(
                path, urlpat.sub('http://127.0.0.1:8080', value))
    return pelican.convert()
def test_set_nested_value_force_previous_index_error(self):
    # With force=True, paths through list index 1 of 'normalized' are
    # created and then read back successfully.
    pelican = PelicanJson(self.ricketts)
    expected = "Should now work"
    for leaf in ('from', 'to'):
        target = ['query', 'normalized', 1, leaf]
        pelican.set_nested_value(target, expected, force=True)
        self.assertEqual(pelican.get_nested_value(target), expected)
def test_update_from_list(self):
    # The first, second-to-last and last items returned by _update_from_list
    # must equal PelicanJson wrappers of the known external links.
    pelican = PelicanJson(self.ricketts)
    expected_links = [
        {'*': '//www.worldcat.org/identities/lccn-n79-055298'},
        {'*': 'http://edricketts.stanford.edu/'},
        {'*': 'http://holisticbiology.stanford.edu/philosophy.html'},
    ]
    raw_links = self.ricketts['query']['pages']['1422396']['extlinks']
    first, *_, second_last, last = pelican._update_from_list(raw_links)
    for actual, raw in zip((first, second_last, last), expected_links):
        self.assertEqual(actual, PelicanJson(raw))
def test_len(self):
    # len() must agree with the number of keys, and grow by one when a new
    # top-level key is added.
    keys = {'query', 'normalized', 'to', 'from', 'pages', '1266004', 'ns',
            'title', 'pageid'}
    length = len(keys)
    test_pelican = PelicanJson(self.pelecanus_occidentalis)
    self.assertEqual(len(test_pelican), length)
    self.assertEqual(set(test_pelican.keys()), keys)

    # assertEqual(first, second, msg): a third positional argument is the
    # failure message, not a third operand, so the three-way comparison is
    # written as two pairwise checks.
    key_count = len(list(test_pelican.keys()))
    self.assertEqual(len(test_pelican), key_count)
    self.assertEqual(key_count, length)

    test_pelican['new'] = 'VALUE'
    self.assertEqual(len(test_pelican), length + 1)
def test_set_nested_value_force_type_error(self):
    # With force=True, a path that continues past 'eloffset' is created and
    # the value can be read back.
    expected = "Should now work"
    target = ['query-continue', 'extlinks', 'eloffset', 'newdict-with-key']
    pelican = PelicanJson(self.ricketts)
    pelican.set_nested_value(target, expected, force=True)
    stored = pelican.get_nested_value(target)
    self.assertEqual(stored, expected)
def test_set_nested_value_force_key_error(self):
    # With force=True, paths through unknown keys are created, not
    # rejected with KeyError.
    pelican = PelicanJson(self.ricketts)
    expected = "Should now work"
    unknown_key_paths = (
        ['unknownKey', 'unknownKey2'],
        ['query-continue', 'unknownKey'],
    )
    for target in unknown_key_paths:
        pelican.set_nested_value(target, expected, force=True)
        stored = pelican.get_nested_value(target)
        self.assertEqual(stored, expected)
def empty_values(data, profile_type="story"):
    """List the paths in ``data`` whose value is the profile's OPTIONAL marker.

    The profile JSON for ``profile_type`` is read from the ``stored_profiles``
    directory next to this module; its ``OPTIONAL`` entry is the value
    searched for in ``data``.
    """
    _name, profile_file = profiles[profile_type]
    package_dir = os.path.abspath(os.path.dirname(__file__))
    stored = os.path.join(package_dir, 'stored_profiles', profile_file)
    with open(stored, 'r') as handle:
        profile = json.load(handle)
    optional_marker = profile['OPTIONAL']
    matches = PelicanJson(data).search_value(optional_marker)
    return list(matches)
def test_pluck(self):
    # pluck(key, value) yields the object that directly holds that pair.
    expected = PelicanJson({'to': 'Pelecanus occidentalis',
                            'from': 'Pelecanus_occidentalis'})
    pelican = PelicanJson(self.pelecanus_occidentalis)
    pairs = (('to', 'Pelecanus occidentalis'),
             ('from', 'Pelecanus_occidentalis'))
    for key, value in pairs:
        self.assertEqual(expected, next(pelican.pluck(key, value)))

    item = PelicanJson(self.item)
    attributes = item['attributes']
    self.assertEqual(attributes, next(item.pluck('byline', 'Emily Reddy')))
def run_on_index(self, docs: List[dict], doc_paths: List[str], ratio, algorithm: List[str]):
    """Summarize the text found at each field path of each document.

    Parameters:
        docs (list): documents to summarize
        doc_paths (list): dot-separated field paths to read text from
        ratio (float): ratio handed to the tokenizer's ``sentences_ratio``
        algorithm: parsed with ``ast.literal_eval`` before use, so presumably
            the string form of a list of algorithm names -- confirm

    Returns:
        list: one dict per (document, field path) pair, keyed by
        ``"<doc_path>_<summarizer name>"`` with the newline-joined summary
        sentences as value. Summarizers that raise are logged and skipped.
    """
    parsed_algorithms = ast.literal_eval(algorithm)
    summarizers = self.get_summarizers(parsed_algorithms)

    results = []
    for document in docs:
        wrapper = PelicanJson(document)
        for doc_path in doc_paths:
            content = wrapper.safe_get_nested_value(doc_path.split("."), default=[])
            # Use the nested lookup when it produced non-empty text; otherwise
            # fall back to a flat lookup under the full dotted key.
            if content and isinstance(content, str):
                text = content
            else:
                text = document[doc_path]
            ratio_count = SumyTokenizer().sentences_ratio(text, float(ratio))
            parser = PlaintextParser.from_string(text, SumyTokenizer())

            summaries = {}
            for name, summarizer in summarizers.items():
                try:
                    summarization = summarizer(parser.document, float(ratio_count))
                except Exception as e:
                    # One failing summarizer must not abort the others.
                    logging.getLogger(ERROR_LOGGER).exception(e)
                    continue
                sentences = [sent._text for sent in summarization]
                summaries[doc_path + "_" + name] = "\n".join(sentences)
            results.append(summaries)
    return results
def test_enumerate(self):
    # Every (path, value) pair produced for the book fixture must be one of
    # the five known pairs.
    isbn = 'ISBN:9780804720687'
    book_url = ('https://openlibrary.org/books/OL7928788M/'
                'Between_Pacific_Tides')
    known_pairs = [
        ([isbn, 'thumbnail_url'],
         'https://covers.openlibrary.org/b/id/577352-S.jpg'),
        ([isbn, 'bib_key'], 'ISBN:9780804720687'),
        ([isbn, 'preview_url'], book_url),
        ([isbn, 'info_url'], book_url),
        ([isbn, 'preview'], 'noview'),
    ]
    book = PelicanJson(self.book)
    for path, value in book.enumerate():
        self.assertIn((path, value), known_pairs)
def test_get_nested_value(self):
    # Lookups by list path return the expected leaves; a tuple path is
    # accepted as well.
    item = PelicanJson(self.item)
    first_tag = item.get_nested_value(['attributes', 'tags', 0])
    self.assertEqual(first_tag, 'npr_api')

    monty = PelicanJson(self.monterrey)
    uid_by_index = ((7, 'gov.noaa.ncdc:C00822'), (0, 'gov.noaa.ncdc:C00040'))
    for index, uid in uid_by_index:
        self.assertEqual(monty.get_nested_value(['results', index, 'uid']), uid)

    # Tuples are valid paths too.
    tuple_path = ('results', 9, 'uid')
    self.assertEqual(monty.get_nested_value(tuple_path),
                     'gov.noaa.ncdc:C00313')
def test_get_nested_value_raises_bad_path(self):
    # A string, a dict, an integer and a set are all rejected as paths.
    pelican = PelicanJson(self.monterrey)
    invalid_paths = (
        "STRING",
        {'results': 'value', '9': 'value2'},
        8,
        {'results', 8},
    )
    for invalid in invalid_paths:
        with self.assertRaises(BadPath):
            pelican.get_nested_value(invalid)
def test_create_path_new_object_inside_list(self):
    # Creating paths through list indexes stores the values; 'normalized'
    # is expected to end up 11 long with None at index 5.
    pelican = PelicanJson(self.ricketts)
    appended = (['query', 'pages', '1422396', 'images', 7, 'title'],
                "VALUE APPENDED TO LIST")
    backfilled = (['query', 'normalized', 10, 'NEW'],
                  "VALUE in LIST with BACKFILL")

    for path, value in (appended, backfilled):
        pelican.create_path(path, value)
    for path, value in (appended, backfilled):
        self.assertEqual(pelican.get_nested_value(path), value)

    filler = pelican.get_nested_value(['query', 'normalized', 5])
    self.assertEqual(filler, None)
    normalized = pelican.get_nested_value(['query', 'normalized'])
    self.assertEqual(len(normalized), 11)
def test_create_path_in_dict(self):
    # create_path replaces the value at 'eloffset' with a PelicanJson that
    # holds the new key, and builds brand-new key chains where none existed.
    pelican = PelicanJson(self.ricketts)
    message = "Previous value overwritten with dictionary"
    replaced = ['query-continue', 'extlinks', 'eloffset', 'newdict-with-key']
    pelican.create_path(replaced, message)
    self.assertEqual(pelican.get_nested_value(replaced), message)
    parent = pelican.get_nested_value(replaced[:-1])
    self.assertTrue(isinstance(parent, PelicanJson))

    fresh_paths = (
        ['query-continue', 'extlinks', 'newkey1'],
        ['query-continue', 'newkey1', 'newkey2'],
        ['newkey1', 'newkey2', 'newkey3'],
    )
    for fresh in fresh_paths:
        # The path must not exist before it is created.
        with self.assertRaises(KeyError):
            pelican.get_nested_value(fresh)
        pelican.create_path(fresh, "NEWVALUE")
        self.assertEqual(pelican.get_nested_value(fresh), "NEWVALUE")
def test_values(self):
    # values() yields exactly the known leaves of the pelican fixture and
    # includes selected leaves of the other fixtures.
    pelican_values = {'Pelecanus occidentalis', 'Pelecanus_occidentalis',
                      0, 1266004}
    book_url = ('https://openlibrary.org/books/OL7928788M/'
                'Between_Pacific_Tides')

    pelican = PelicanJson(self.pelecanus_occidentalis)
    book = PelicanJson(self.book)
    monty = PelicanJson(self.monterrey)
    rickettsi = PelicanJson(self.ricketts)

    self.assertEqual(pelican_values, set(pelican.values()))
    membership = (
        (book_url, book),
        ('gov.noaa.ncdc:C00345', monty),
        ('Ed_Ricketts', rickettsi),
        ('File:Pacific Biological Laboratories.JPG', rickettsi),
    )
    for needle, source in membership:
        self.assertIn(needle, set(source.values()))
def parse_doc_texts(doc_path: str, document: dict) -> list:
    """
    Collect the text stored under a dotted field path of a nested document.

    :param doc_path: Dot separated path of fields to the value we wish to parse.
    :param document: Document to be worked on.
    :return: ``[value]`` for a non-empty string, the list itself for a
        non-empty list containing only strings, and ``[]`` for anything else
        (missing path, empty value, dict, mixed list, other types).
    """
    field_keys = doc_path.split(".")
    found = PelicanJson(document).safe_get_nested_value(field_keys, default=[])

    # Empty or missing values carry no text.
    if not found:
        return []
    if isinstance(found, str):
        return [found]
    # Lists are accepted only when every element is a string.
    if isinstance(found, list) and all(isinstance(entry, str) for entry in found):
        return found
    # Dicts (e.g. from a path that stops too early) and other types are ignored.
    return []
def test_create_path_totally_new_path(self):
    # A path with no existing prefix raises KeyError on lookup, and resolves
    # to the stored value once created.
    pelican = PelicanJson(self.ricketts)
    brand_new = ['new', 'path', 'in', 1, 'object']
    with self.assertRaises(KeyError):
        pelican.get_nested_value(brand_new)

    pelican.create_path(brand_new, "TEST VALUE")
    stored = pelican.get_nested_value(brand_new)
    self.assertEqual(stored, "TEST VALUE")
def test_create_path_add_item_to_list(self):
    # Creating tag indexes 2 and 5 stores both values; index 4 is expected
    # to read back as None.
    item = PelicanJson(self.item)
    tags = ['attributes', 'tags']
    cases = (
        (tags + [2], "New value inside list"),
        (tags + [5], "New value inside list with None"),
    )
    for path, value in cases:
        item.create_path(path, value)
        self.assertEqual(item.get_nested_value(path), value)

    self.assertEqual(item.get_nested_value(tags + [4]), None)
def test_searchkey(self):
    # search_key('*') only yields paths under 'extlinks', and every key from
    # keys() yields at least one truthy path.
    rickettsi = PelicanJson(self.ricketts)
    extlinks = ['query', 'pages', '1422396', 'extlinks']
    star_paths = [extlinks + [n, '*'] for n in range(10)]
    for found in rickettsi.search_key('*'):
        self.assertIn(found, star_paths)

    for key in rickettsi.keys():
        self.assertTrue(next(rickettsi.search_key(key)))

    item = PelicanJson(self.item)
    for key in item.keys():
        self.assertTrue(next(item.search_key(key)))