Beispiel #1
0
    def setUp(self):
        """Prepare fixture paths, test URLs and the local test server.

        Starts a ``Process`` running ``run_forever`` and stores it on
        ``self.server_process``.  Also loads the home document fixture,
        points its ``links/auth/3/href`` entry at ``self.auth_url`` and keeps
        the converted result in ``self.home_values``.
        """
        here = os.path.abspath(os.path.dirname(__file__))
        self.fixture_dir = os.path.join(here, 'fixtures')
        self.test_url = 'http://127.0.0.1:8080/'
        url_template = 'http://127.0.0.1:8080/?json_response={}'

        # Navigation links used by the various navigation tests.
        self.nav_values = {
            'next': "http://127.0.0.1:8080/docs?tag=npr_api&profile=someGUIDvalue&offset=10",
            'prev': "http://127.0.0.1:8080/docs?",
            'first': "http://127.0.0.1:8080/docs?tag=npr_api&profile=someGUIDvalue",
            'last': "http://127.0.0.1:8080/docs?tag=npr_api&profile=someGUIDvalue&offset=13130",
            'current': "http://127.0.0.1:8080/docs?tag=npr_api&profile=someGUIDvalue",
        }

        # Where the JSON fixtures live on disk.
        fixtures = self.fixture_dir
        self.home_doc = os.path.join(fixtures, 'homedoc.json')
        self.auth_doc = os.path.join(fixtures, 'authdetails.json')
        self.data_doc = os.path.join(fixtures, 'datadoc.json')

        # The testing server can hand the same fixtures back through URLs.
        self.server_process = Process(target=run_forever)
        self.server_process.start()
        self.test_entry_point, self.auth_url, self.data_url = (
            url_template.format(doc)
            for doc in (self.home_doc, self.auth_doc, self.data_doc))

        with open(self.home_doc, 'r') as home_file:
            home = PelicanJson(json.load(home_file))
        home.set_nested_value(['links', 'auth', 3, 'href'], self.auth_url)
        self.home_values = home.convert()
 def test_set_all_paths(self):
     """Overwrite every path of ``self.item`` and read each value back."""
     pelican = PelicanJson(self.item)
     replacement = "NEWVALUE"
     for item_path in pelican.paths():
         set_nested_value(self.item, item_path, replacement)
     for item_path in pelican.paths():
         self.assertEqual(get_nested_value(self.item, item_path),
                          replacement)
Beispiel #3
0
def validate(data, profile_type="story"):
    """Check ``data`` against the stored profile for ``profile_type``.

    The profile JSON is read from the ``stored_profiles`` directory next to
    this module.  Two checks run in order:

    1. If ``search_value`` yields anything in ``data`` for the profile's
       ``REQUIRED`` entry, the bare boolean ``False`` is returned.
    2. Otherwise each ``(path, value)`` enumerated from ``data`` is compared
       by exact type with the profile's value at the same path.  The first
       mismatch, or a path the profile cannot resolve, returns
       ``(False, path)``.

    Returns ``(True, None)`` when both checks pass.

    NOTE(review): the return shape is mixed (``False`` vs. 2-tuples) and a
    ``(False, path)`` tuple is truthy -- callers must not rely on plain
    truthiness of the result.
    """
    _, profile_file = profiles[profile_type]
    module_dir = os.path.abspath(os.path.dirname(__file__))
    profile_location = os.path.join(module_dir,
                                    'stored_profiles',
                                    profile_file)
    with open(profile_location, 'r') as handle:
        profile = json.load(handle)
    required_marker = profile['REQUIRED']
    candidate = PelicanJson(data)
    # One hit is enough to fail the first check.
    for _hit in candidate.search_value(required_marker):
        return False

    # Second check: every value must have the same type as in the profile.
    reference = PelicanJson(profile)
    for path, value in candidate.enumerate():
        try:
            expected = reference.get_nested_value(path)
        except (TypeError, IndexError, KeyError):
            return False, path
        if type(value) != type(expected):
            return False, path
    return True, None
 def test_find_and_replace(self):
     """Replaced values are only found at the expected paths."""
     pelican = PelicanJson(self.pelecanus_occidentalis)
     pelican.find_and_replace('Pelecanus occidentalis', 'Brown Pelican')
     expected_paths = [
         ['query', 'normalized', 0, 'to'],
         ['query', 'pages', '1266004', 'title'],
     ]
     for found in pelican.search_value('Brown Pelican'):
         self.assertIn(found, expected_paths)
 def test_get_item(self):
     """``get`` returns the wrapped sub-object, or None for unknown keys."""
     raw_continue = self.ricketts['query-continue']
     pelican = PelicanJson(self.ricketts)
     expected = PelicanJson(raw_continue)
     self.assertEqual(pelican.get('query-continue'), expected)
     self.assertEqual(pelican.get('NO-KEY'), None)
 def test_delete_item(self):
     """Items can be assigned and deleted; deleting a missing key raises."""
     test_pelican = PelicanJson(self.pelecanus_occidentalis)
     # The original literals lacked a comma ('new' 'vals'), which Python
     # silently concatenates to 'newvals'; use three distinct items.
     new_values = ['some', 'new', 'vals']
     # Assign a separate copy so the comparison is not against the very
     # same list object.
     test_pelican['query'] = list(new_values)
     self.assertEqual(test_pelican['query'], new_values)
     del test_pelican['query']
     self.assertEqual(test_pelican.get('query'),
                      None)
     with self.assertRaises(KeyError):
         del test_pelican['none']
 def test_create_path_raise_badpath(self):
     """``create_path`` raises BadPath for each path with a bad first key."""
     test_rickettsi = PelicanJson(self.ricketts)
     bad_paths = [[4, 'query', 'normalized', 0, 'from'],
                  [('some', 'tuple'), 'query'],
                  [{'some': 'dict'}, 'query']]
     # One assertRaises per path: inside a single context manager the
     # first raise ends the block, so later calls would never execute.
     for bad_path in bad_paths:
         with self.assertRaises(BadPath):
             test_rickettsi.create_path(bad_path, "VALUE")
 def test_iter(self):
     """``keys()`` yields exactly the expected key names for two fixtures."""
     pelican = PelicanJson(self.pelecanus_occidentalis)
     book = PelicanJson(self.book)
     expected_pelican_keys = {
         'query', 'normalized', 'to', 'from', 'pages',
         '1266004', 'ns', 'title', 'pageid',
     }
     expected_book_keys = {
         'thumbnail_url', 'preview_url', 'bib_key',
         'info_url', 'ISBN:9780804720687', 'preview',
     }
     self.assertEqual(set(pelican.keys()), expected_pelican_keys)
     self.assertEqual(set(book.keys()), expected_book_keys)
 def test_contains(self):
     """Membership tests succeed for known keys and fail for unknown ones."""
     book = PelicanJson(self.book)
     for known_key in ('ISBN:9780804720687', 'preview', 'bib_key'):
         self.assertIn(known_key, book)
     monty = PelicanJson(self.monterrey)
     for key in monty.keys():
         self.assertIn(key, monty)
     self.assertNotIn('NADA', monty)
 def test_generate_paths_inside_list(self):
     """Paths whose value is '' agree between the two path generators."""
     pelican = PelicanJson(self.item)
     expected_paths = list(pelican.search_value(''))
     for candidate in generate_paths(self.item):
         if get_nested_value(self.item, candidate) == '':
             self.assertIn(candidate, expected_paths)
 def test_paths(self):
     """Each path produced for the book fixture is one of the known paths."""
     isbn = 'ISBN:9780804720687'
     leaf_names = ('thumbnail_url', 'bib_key', 'preview_url',
                   'info_url', 'preview')
     known_paths = [[isbn, leaf] for leaf in leaf_names]
     book = PelicanJson(self.book)
     for produced in book.paths():
         self.assertIn(produced, known_paths)
 def test_set_nested_value_force_add_to_list(self):
     """A forced set at list index 4 stores the value; index 3 is None."""
     pelican = PelicanJson(self.item)
     tag_path = ['attributes', 'tags', 4]
     pelican.set_nested_value(tag_path, 'New Tag', force=True)
     self.assertEqual(pelican.get_nested_value(tag_path), 'New Tag')
     # The slot just before the new tag is expected to hold None.
     placeholder_path = ['attributes', 'tags', 3]
     self.assertEqual(pelican.get_nested_value(placeholder_path), None)
Beispiel #13
0
 def wiki_summary_by_name(self):
     """Fetch a Wikipedia extract for the page titled ``str(self)``.

     Sends a ``prop=extracts`` query to the English Wikipedia API and
     returns the value stored at the last path enumerated from the JSON
     response.
     NOTE(review): this assumes the extract text is the last leaf of the
     response -- confirm against the API's response layout.

     Returns:
         The value at the last enumerated path, or None when the response
         enumerates no values at all.
     """
     api_url = ('https://en.wikipedia.org/w/api.php?format=json'
                '&action=query&prop=extracts&redirects=1'
                '&exintro=&explaintext=')
     # Let requests encode the title: concatenating it raw lets characters
     # such as '&' or '#' corrupt the query string.
     response = requests.get(api_url, params={'titles': str(self)})
     wiki_response = PelicanJson(response.json())
     # Walk the whole tree, remembering only the final path.
     last_path = None
     for path, _value in wiki_response.enumerate():
         last_path = path
     if last_path is None:
         # Nothing enumerated: avoid referencing an unbound variable.
         return None
     return wiki_response.get_nested_value(last_path)
Beispiel #14
0
def strip_guids(json_result):
    """Replace GUID-shaped substrings in every string value.

    Any substring matching the 8-4-4-4-12 word-character pattern is
    replaced with the literal ``'someGUIDvalue'``.

    :param json_result: JSON-like structure accepted by ``PelicanJson``.
    :return: the result of ``PelicanJson.convert()`` after the replacements.
    """
    guidpat = re.compile(r'\w{8}-\w{4}-\w{4}-\w{4}-\w{12}')
    pelican = PelicanJson(json_result)
    for path, value in pelican.enumerate():
        # isinstance is the idiomatic string check (also covers subclasses).
        if isinstance(value, str) and guidpat.search(value):
            pelican.set_nested_value(path,
                                     guidpat.sub('someGUIDvalue', value))

    return pelican.convert()
Beispiel #15
0
def clean_urls(json_result):
    """Point PMP sandbox URLs in every string value at the local server.

    ``https://api-sandbox.pmp.io`` and ``https://publish-sandbox.pmp.io``
    are replaced with ``http://127.0.0.1:8080``.

    :param json_result: JSON-like structure accepted by ``PelicanJson``.
    :return: the result of ``PelicanJson.convert()`` after the replacements.
    """
    # The previous pattern used ``[api|publish]``, a character class that
    # matches a single character, so neither host ever matched.  Use a
    # real alternation and escape the literal dots.
    urlpat = re.compile(r'https://(?:api|publish)-sandbox\.pmp\.io')
    pelican = PelicanJson(json_result)
    for path, value in pelican.enumerate():
        if isinstance(value, str) and urlpat.search(value):
            pelican.set_nested_value(path,
                                     urlpat.sub('http://127.0.0.1:8080',
                                                value))

    return pelican.convert()
 def test_set_nested_value_force_previous_index_error(self):
     """With ``force=True`` values can be set at list index 1 and read back."""
     rickettsi = PelicanJson(self.ricketts)
     expected = "Should now work"
     # Paths that would otherwise hit an IndexError get created instead.
     for leaf in ('from', 'to'):
         forced_path = ['query', 'normalized', 1, leaf]
         rickettsi.set_nested_value(forced_path, expected, force=True)
         self.assertEqual(rickettsi.get_nested_value(forced_path), expected)
 def test_update_from_list(self):
     """``_update_from_list`` wraps the external-link dicts as PelicanJson."""
     pelican = PelicanJson(self.ricketts)
     expected_first = {'*': '//www.worldcat.org/identities/lccn-n79-055298'}
     expected_second_last = {'*': 'http://edricketts.stanford.edu/'}
     expected_last = {
         '*': 'http://holisticbiology.stanford.edu/philosophy.html'}
     extlinks = self.ricketts['query']['pages']['1422396']['extlinks']
     # Only the first and the final two converted entries are inspected.
     first, *_, second_last, last = pelican._update_from_list(extlinks)
     self.assertEqual(first, PelicanJson(expected_first))
     self.assertEqual(second_last, PelicanJson(expected_second_last))
     self.assertEqual(last, PelicanJson(expected_last))
 def test_len(self):
     """``len()`` matches the number of keys and grows after an addition."""
     keys = {'query', 'normalized', 'to', 'from', 'pages',
             '1266004', 'ns', 'title', 'pageid'}
     length = len(keys)
     test_pelican = PelicanJson(self.pelecanus_occidentalis)
     self.assertEqual(len(test_pelican), length)
     self.assertEqual(set(test_pelican.keys()), keys)
     # assertEqual's third positional argument is the failure message, not
     # a third value to compare, so the key count gets its own assertion.
     key_count = len(list(test_pelican.keys()))
     self.assertEqual(len(test_pelican), key_count)
     self.assertEqual(key_count, length)
     test_pelican['new'] = 'VALUE'
     self.assertEqual(len(test_pelican), length + 1)
 def test_set_nested_value_force_type_error(self):
     """With ``force=True`` a new key can be set below ``eloffset``."""
     rickettsi = PelicanJson(self.ricketts)
     expected = "Should now work"
     # A path that would otherwise hit a TypeError gets created instead.
     forced_path = ['query-continue', 'extlinks',
                    'eloffset', 'newdict-with-key']
     rickettsi.set_nested_value(forced_path, expected, force=True)
     self.assertEqual(rickettsi.get_nested_value(forced_path), expected)
 def test_set_nested_value_force_key_error(self):
     """With ``force=True`` values can be set under unknown keys."""
     rickettsi = PelicanJson(self.ricketts)
     expected = "Should now work"
     # Paths that would otherwise hit a KeyError get created instead.
     forced_paths = (
         ['unknownKey', 'unknownKey2'],
         ['query-continue', 'unknownKey'],
     )
     for forced_path in forced_paths:
         rickettsi.set_nested_value(forced_path, expected, force=True)
         self.assertEqual(rickettsi.get_nested_value(forced_path), expected)
Beispiel #21
0
def empty_values(data, profile_type="story"):
    """List what ``search_value`` finds in ``data`` for the ``OPTIONAL`` entry.

    The profile for ``profile_type`` is read from the ``stored_profiles``
    directory next to this module.  Its ``OPTIONAL`` entry is searched for
    in ``data`` and every result is collected into a list.
    """
    _, profile_file = profiles[profile_type]
    module_dir = os.path.abspath(os.path.dirname(__file__))
    profile_location = os.path.join(module_dir,
                                    'stored_profiles',
                                    profile_file)
    with open(profile_location, 'r') as handle:
        optional_marker = json.load(handle)['OPTIONAL']
    return list(PelicanJson(data).search_value(optional_marker))
 def test_pluck(self):
     """``pluck`` yields the object that holds the given key/value pair."""
     expected = PelicanJson({'to': 'Pelecanus occidentalis',
                             'from': 'Pelecanus_occidentalis'})
     pelican = PelicanJson(self.pelecanus_occidentalis)
     plucked_by_to = next(pelican.pluck('to', 'Pelecanus occidentalis'))
     self.assertEqual(expected, plucked_by_to)
     plucked_by_from = next(pelican.pluck('from', 'Pelecanus_occidentalis'))
     self.assertEqual(expected, plucked_by_from)
     item = PelicanJson(self.item)
     self.assertEqual(item['attributes'],
                      next(item.pluck('byline', 'Emily Reddy')))
Beispiel #23
0
    def run_on_index(self, docs: List[dict], doc_paths: List[str], ratio,
                     algorithm: List[str]):
        """Summarize the text found at each field path of every document.

            Parameters:
            docs (list): list of documents (dicts)
            doc_paths (list): list of dot-separated field paths
            ratio (float): converted with ``float`` and passed to
                ``SumyTokenizer.sentences_ratio``; that result is what
                each summarizer receives
            algorithm (list or str): algorithm names for
                ``get_summarizers``, either a list or the string form of
                a Python list literal

            Returns:
            list: one dict per (document, field path) pair, mapping
                ``"<doc_path>_<summarizer name>"`` to the summary
                sentences joined by newlines. Summarizers that raise are
                logged and left out.

        """
        stack = []
        # literal_eval only accepts a string; the annotation promises a
        # list, so accept an already-parsed list instead of raising on it.
        if isinstance(algorithm, str):
            algorithm = ast.literal_eval(algorithm)
        summarizers = self.get_summarizers(algorithm)
        for document in docs:
            wrapper = PelicanJson(document)
            for doc_path in doc_paths:
                doc_path_as_list = doc_path.split(".")
                content = wrapper.safe_get_nested_value(doc_path_as_list,
                                                        default=[])
                # Use the nested value when it is a non-empty string;
                # otherwise look the dotted path up as a literal key.
                if content and isinstance(content, str):
                    text = content
                else:
                    text = document[doc_path]
                ratio_count = SumyTokenizer().sentences_ratio(
                    text, float(ratio))
                parser = PlaintextParser.from_string(text, SumyTokenizer())

                summaries = {}
                for name, summarizer in summarizers.items():
                    try:
                        summarization = summarizer(parser.document,
                                                   float(ratio_count))
                    except Exception as e:
                        # Best effort: log and continue with the next one.
                        logging.getLogger(ERROR_LOGGER).exception(e)
                        continue

                    summary = "\n".join(sent._text for sent in summarization)
                    summaries[doc_path + "_" + name] = summary

                stack.append(summaries)

        return stack
 def test_enumerate(self):
     """Every (path, value) pair from ``enumerate`` is an expected pair."""
     isbn = 'ISBN:9780804720687'
     book_url = ('https://openlibrary.org/books/OL7928788M/'
                 'Between_Pacific_Tides')
     expected_pairs = [
         ([isbn, 'thumbnail_url'],
          'https://covers.openlibrary.org/b/id/577352-S.jpg'),
         ([isbn, 'bib_key'], 'ISBN:9780804720687'),
         ([isbn, 'preview_url'], book_url),
         ([isbn, 'info_url'], book_url),
         ([isbn, 'preview'], 'noview'),
     ]
     book = PelicanJson(self.book)
     for pair in book.enumerate():
         path, value = pair
         self.assertIn((path, value), expected_pairs)
 def test_get_nested_value(self):
     """``get_nested_value`` resolves list paths and tuple paths alike."""
     item = PelicanJson(self.item)
     first_tag = item.get_nested_value(['attributes', 'tags', 0])
     self.assertEqual(first_tag, 'npr_api')
     monty = PelicanJson(self.monterrey)
     expected_by_path = [
         (['results', 7, 'uid'], 'gov.noaa.ncdc:C00822'),
         (['results', 0, 'uid'], 'gov.noaa.ncdc:C00040'),
     ]
     for uid_path, expected_uid in expected_by_path:
         self.assertEqual(monty.get_nested_value(uid_path), expected_uid)
     # A tuple is accepted as a path as well.
     tuple_path = ('results', 9, 'uid')
     self.assertEqual(monty.get_nested_value(tuple_path),
                      'gov.noaa.ncdc:C00313')
 def test_get_nested_value_raises_bad_path(self):
     """A string, dict, int or set used as a path raises BadPath."""
     pelican = PelicanJson(self.monterrey)
     invalid_paths = (
         "STRING",
         {'results': 'value', '9': 'value2'},
         8,
         {'results', 8},
     )
     for invalid in invalid_paths:
         with self.assertRaises(BadPath):
             pelican.get_nested_value(invalid)
 def test_create_path_new_object_inside_list(self):
     """``create_path`` can add objects at new list indexes."""
     rickettsi = PelicanJson(self.ricketts)
     appended_path = ['query', 'pages', '1422396', 'images', 7, 'title']
     backfilled_path = ['query', 'normalized', 10, 'NEW']
     rickettsi.create_path(appended_path, "VALUE APPENDED TO LIST")
     rickettsi.create_path(backfilled_path, "VALUE in LIST with BACKFILL")
     self.assertEqual(rickettsi.get_nested_value(appended_path),
                      "VALUE APPENDED TO LIST")
     self.assertEqual(rickettsi.get_nested_value(backfilled_path),
                      "VALUE in LIST with BACKFILL")
     # An index between the old end and the new entry should hold None.
     self.assertEqual(rickettsi.get_nested_value(['query', 'normalized', 5]),
                      None)
     normalized = rickettsi.get_nested_value(['query', 'normalized'])
     self.assertEqual(len(normalized), 11)
    def test_create_path_in_dict(self):
        """``create_path`` adds new keys and nests below an existing value."""
        rickettsi = PelicanJson(self.ricketts)
        target = ['query-continue', 'extlinks',
                  'eloffset', 'newdict-with-key']
        expected_msg = "Previous value overwritten with dictionary"
        rickettsi.create_path(target, expected_msg)
        self.assertEqual(rickettsi.get_nested_value(target), expected_msg)
        # The parent of the new leaf must now be a PelicanJson object.
        parent = rickettsi.get_nested_value(target[:-1])
        self.assertTrue(isinstance(parent, PelicanJson))

        fresh_paths = (
            ['query-continue', 'extlinks', 'newkey1'],
            ['query-continue', 'newkey1', 'newkey2'],
            ['newkey1', 'newkey2', 'newkey3'],
        )
        for fresh in fresh_paths:
            # The path must be absent before it gets created.
            with self.assertRaises(KeyError):
                rickettsi.get_nested_value(fresh)
            rickettsi.create_path(fresh, "NEWVALUE")
            self.assertEqual(rickettsi.get_nested_value(fresh), "NEWVALUE")
 def test_values(self):
     """``values()`` yields the expected values for each fixture."""
     expected_pelican_values = {'Pelecanus occidentalis',
                                'Pelecanus_occidentalis',
                                0, 1266004}
     book_url = ('https://openlibrary.org/books/OL7928788M/'
                 'Between_Pacific_Tides')
     monty_uid = 'gov.noaa.ncdc:C00345'
     ricketts_expected = ('Ed_Ricketts',
                          'File:Pacific Biological Laboratories.JPG')
     pelican = PelicanJson(self.pelecanus_occidentalis)
     book = PelicanJson(self.book)
     monty = PelicanJson(self.monterrey)
     rickettsi = PelicanJson(self.ricketts)
     self.assertEqual(expected_pelican_values, set(pelican.values()))
     self.assertIn(book_url, set(book.values()))
     self.assertIn(monty_uid, set(monty.values()))
     for expected_value in ricketts_expected:
         self.assertIn(expected_value, set(rickettsi.values()))
Beispiel #30
0
def parse_doc_texts(doc_path: str, document: dict) -> list:
    """
    Collect the text value(s) stored at a dot-separated field path.
    :param doc_path: Dot separated path of fields to the value we wish to parse.
    :param document: Document to be worked on.
    :return: ``[content]`` for a non-empty string, the list itself for a
        non-empty list holding only strings, otherwise an empty list.
    """
    wrapper = PelicanJson(document)
    content = wrapper.safe_get_nested_value(doc_path.split("."), default=[])
    # Nothing usable at the path (missing, empty string, empty list, ...).
    if not content:
        return []
    if isinstance(content, str):
        return [content]
    # Only accept lists made up entirely of strings.
    if isinstance(content, list) and all(
            isinstance(entry, str) for entry in content):
        return content
    # Anything else -- e.g. a dictionary from a faulty field path.
    return []
 def test_create_path_totally_new_path(self):
     """A path that raises KeyError becomes readable after ``create_path``."""
     rickettsi = PelicanJson(self.ricketts)
     new_path = ['new', 'path', 'in', 1, 'object']
     expected = "TEST VALUE"
     with self.assertRaises(KeyError):
         rickettsi.get_nested_value(new_path)
     rickettsi.create_path(new_path, expected)
     self.assertEqual(rickettsi.get_nested_value(new_path), expected)
 def test_create_path_add_item_to_list(self):
     """``create_path`` stores values at list indexes 2 and 5 of the tags."""
     item = PelicanJson(self.item)
     tags = ['attributes', 'tags']
     inside_path = tags + [2]
     beyond_path = tags + [5]
     # The index just before the far entry is expected to hold None.
     none_path = tags + [4]
     item.create_path(inside_path, "New value inside list")
     self.assertEqual(item.get_nested_value(inside_path),
                      "New value inside list")
     item.create_path(beyond_path, "New value inside list with None")
     self.assertEqual(item.get_nested_value(beyond_path),
                      "New value inside list with None")
     self.assertEqual(item.get_nested_value(none_path), None)
 def test_searchkey(self):
     """``search_key`` yields expected paths and finds every known key."""
     rickettsi = PelicanJson(self.ricketts)
     star_paths = [['query', 'pages', '1422396', 'extlinks', index, '*']
                   for index in range(10)]
     for found in rickettsi.search_key('*'):
         self.assertIn(found, star_paths)
     # Every key reported by keys() must be locatable by search_key().
     for key in rickettsi.keys():
         first_hit = next(rickettsi.search_key(key))
         self.assertTrue(first_hit)
     item = PelicanJson(self.item)
     for key in item.keys():
         first_hit = next(item.search_key(key))
         self.assertTrue(first_hit)