Esempio n. 1
0
    def test_converts_with_multiple_definitions(self):
        """Parse a flat object holding a string member and an int member."""
        document = """
            {
              "a": "b",
              "c": 1
            }
        """

        parsed = parse(document)
        expected = {'a': 'b', 'c': 1}

        self.assertEqual(parsed, expected)
Esempio n. 2
0
    def test_twice_nested_object(self):
        """Parse an object whose array member itself contains objects."""
        document = """
            {
                "a": [
                    {"b" : 1},
                    {"c" : []}
                ]
            }
        """

        parsed = parse(document)
        # Two levels of nesting: object -> array -> objects.
        expected = {
            "a": [
                {"b": 1},
                {"c": []},
            ],
        }

        self.assertEqual(parsed, expected)
Esempio n. 3
0
    def test_supports_recursive_structures(self):
        """Parse an object that directly contains another object."""
        document = """
            {
                "a": {
                    "b" : 1
                }
            }
        """

        parsed = parse(document)
        inner = {'b': 1}

        self.assertEqual(parsed, {'a': inner})
Esempio n. 4
0
 def test_converts_json_with_one_definition(self):
     """Parse an object that holds exactly one string member."""
     # Double quotes need no escaping inside a triple-quoted literal; the
     # text handed to parse() is unchanged.
     document = """
         {
             "a": "bcd"
         }"""
     parsed = parse(document)
     self.assertEqual(parsed, {'a': 'bcd'})
Esempio n. 5
0
 def test_it_returns_none_when_not_correct(self):
     """Expect None when the brackets in the input do not match up."""
     malformed = '''[[]}'''
     outcome = parse(malformed)
     self.assertIsNone(outcome)
Esempio n. 6
0
 def test_converts_empty_json_to_empty_dict(self):
     """Parse the empty object literal into an empty dict."""
     parsed = parse("{}")
     self.assertEqual(parsed, {})
Esempio n. 7
0
    def test_parses_arrays(self):
        """Parse a top-level array of strings into a list."""
        document = """
            ["foo", "bar"]
        """

        parsed = parse(document)
        expected = ['foo', 'bar']

        self.assertEqual(parsed, expected)
    frequency = defaultdict(int)
    unique_tokens = set()
    total_unique_tokens = 0
    filename = None

    current_doc_id = 1
    try:
        start = time.time()
        for path, folder, filenames in os.walk(os.getcwd() + CORPUS_PATH):
            for filename in filenames:
                rem_filename = filename
                if filename.endswith('.gitignore'):
                    continue

                # Parse JSON here
                url, content, encoding = jsonparse.parse(path + '\\' +
                                                         filename)
                doc_ids[current_doc_id] = url

                if FRAGMENT.search(url) is not None:
                    continue

                if not is_valid(url):
                    print('[SKIPPING] URL is not valid: {}\n'.format(url))

                # Tokenize Content
                frequency = tokenizer.tokenize_index(content, encoding)
                if len(frequency) == 0:
                    continue

                # print(frequency, '\n')
                # -------------- Filtering done here