def test_converts_with_multiple_definitions(self):
    # A JSON object with two members (one string value, one integer value)
    # is expected to come back from parse() as a dict holding both pairs.
    document = """ { "a": "b", "c": 1 } """
    parsed = parse(document)
    self.assertEqual(parsed, {'a': 'b', 'c': 1})
def test_twice_nested_object(self):
    # Two levels of nesting: an object whose member is an array of objects,
    # one of which itself contains an (empty) array.
    document = """ { "a": [ {"b" : 1}, {"c" : []} ] } """
    parsed = parse(document)
    self.assertEqual(parsed, {"a": [{"b": 1}, {"c": []}]})
def test_supports_recursive_structures(self):
    # An object nested directly inside another object must be parsed into
    # a dict nested inside a dict.
    document = """ { "a": { "b" : 1 } } """
    parsed = parse(document)
    self.assertEqual(parsed, {'a': {'b': 1}})
def test_converts_json_with_one_definition(self):
    # Single-member object; the input has leading whitespace and no
    # trailing whitespace before the closing triple quote.
    document = """ { \"a\": \"bcd\" }"""
    parsed = parse(document)
    self.assertEqual(parsed, {'a': 'bcd'})
def test_it_returns_none_when_not_correct(self):
    # The closing brace does not match the opening bracket, so the input
    # is malformed; parse() is expected to signal that by returning None.
    malformed = '''[[]}'''
    outcome = parse(malformed)
    self.assertIsNone(outcome)
def test_converts_empty_json_to_empty_dict(self):
    # The smallest possible object literal maps to an empty dict.
    parsed = parse("{}")
    self.assertEqual(parsed, {})
def test_parses_arrays(self):
    # A top-level array of strings is expected to come back as a list with
    # the elements in their original order.
    document = """ ["foo", "bar"] """
    parsed = parse(document)
    self.assertEqual(parsed, ['foo', 'bar'])
frequency = defaultdict(int) unique_tokens = set() total_unique_tokens = 0 filename = None current_doc_id = 1 try: start = time.time() for path, folder, filenames in os.walk(os.getcwd() + CORPUS_PATH): for filename in filenames: rem_filename = filename if filename.endswith('.gitignore'): continue # Parse JSON here url, content, encoding = jsonparse.parse(path + '\\' + filename) doc_ids[current_doc_id] = url if FRAGMENT.search(url) is not None: continue if not is_valid(url): print('[SKIPPING] URL is not valid: {}\n'.format(url)) # Tokenize Content frequency = tokenizer.tokenize_index(content, encoding) if len(frequency) == 0: continue # print(frequency, '\n') # -------------- Filtering done here