Beispiel #1
0
 def process_item(self, good_item, spider):
     """Store the E-additives parsed from the item's ingredients text.

     An item without an 'ingredients' key is handed back untouched.
     Otherwise the value returned by ``string_processor.parse_e_additives``
     for that text is written under the 'e_additives' key.

     ``spider`` is accepted but not used here.  The same item object that
     was passed in is returned.
     """
     if 'ingredients' not in good_item:
         return good_item
     good_item['e_additives'] = string_processor.parse_e_additives(
         good_item['ingredients'])
     return good_item
Beispiel #2
0
 def process_item(self, good_item, spider):
     """Annotate the item with E-additives found in its ingredients.

     When 'ingredients' is present, its value is passed to
     ``string_processor.parse_e_additives`` and the result is saved as
     'e_additives'.  Items lacking 'ingredients' are not modified.

     ``spider`` is not read by this method.  Returns the item it was given.
     """
     has_ingredients = 'ingredients' in good_item
     if not has_ingredients:
         return good_item
     additives = string_processor.parse_e_additives(good_item['ingredients'])
     good_item['e_additives'] = additives
     return good_item
 def test_additive_tailing_russian_A(self):
     # A full ingredients sentence: every code is expected back in order of
     # appearance, and the code written with a trailing "А" is expected as
     # 'E160a'.
     ingredients_text = (
         u"сахар, сироп глюкозы, вода, желатин, ароматические "
         u"вещества, кислота(лимонная), порошок лакриц, "
         u"красители (Е100,Е120,Е133,Е153, Е160А), вещества "
         u"наносимые на поверхность(растительные масла, Е903). "
         u"Возможно незначительное содержание лесного ореха.")
     expected = [u'E100', u'E120', u'E133', u'E153', u'E160a', u'E903']
     found = parse_e_additives(ingredients_text)
     self.assertEqual(found, expected)
 def test_additive_tailing_russian_A(self):
     # Codes embedded in a long ingredients sentence must all be returned,
     # in order; the one followed by "А" is expected as 'E160a'.
     sentence = (
         u"сахар, сироп глюкозы, вода, желатин, ароматические "
         u"вещества, кислота(лимонная), порошок лакриц, "
         u"красители (Е100,Е120,Е133,Е153, Е160А), вещества "
         u"наносимые на поверхность(растительные масла, Е903). "
         u"Возможно незначительное содержание лесного ореха.")
     parsed = parse_e_additives(sentence)
     wanted = [u'E100', u'E120', u'E133', u'E153', u'E160a', u'E903']
     self.assertEqual(parsed, wanted)
Beispiel #5
0
 def process_item(self, good_item, spider):
     """Match the item's ingredient fragments via ``self.agrovoc_graph``.

     Items without an 'ingredients' key are returned unchanged.  Otherwise
     the ingredients text is passed through
     ``remove_substring_in_paranthesis`` and ``split_ingredients``.  For
     each resulting fragment:

     * if ``parse_e_additives`` returns something truthy, the fragment is
       skipped;
     * otherwise ``remove_weight`` and ``remove_percents`` are applied and
       the stripped result is looked up with ``find_ingredient_by_name``.
       A truthy match is appended to the item's 'agrovoc_ingredients'
       list; a miss increments the fragment's counter in
       ``self.not_parsed_fragments``.

     ``spider`` is not used.  Returns the item that was passed in.
     """
     if 'ingredients' not in good_item:
         return good_item

     without_parens = string_processor.remove_substring_in_paranthesis(
         good_item['ingredients'])
     for piece in string_processor.split_ingredients(without_parens):
         if string_processor.parse_e_additives(piece):
             continue
         piece = string_processor.remove_percents(
             string_processor.remove_weight(piece))
         match = self.agrovoc_graph.find_ingredient_by_name(piece.strip())
         if not match:
             # The counter is keyed by the cleaned but unstripped fragment.
             misses_so_far = self.not_parsed_fragments.get(piece, 0)
             self.not_parsed_fragments[piece] = misses_so_far + 1
             continue
         already_matched = good_item.get('agrovoc_ingredients', [])
         good_item['agrovoc_ingredients'] = already_matched + [match]
     return good_item
Beispiel #6
0
 def process_item(self, good_item, spider):
     """Resolve ingredient fragments through ``self.agrovoc_graph``.

     Nothing happens unless the item has an 'ingredients' key.  When it
     does, the text is cleaned with ``remove_substring_in_paranthesis``,
     split with ``split_ingredients``, and every fragment for which
     ``parse_e_additives`` returns a falsy value is processed further:
     ``remove_weight`` and ``remove_percents`` are applied, then the
     stripped fragment is looked up with ``find_ingredient_by_name``.

     Truthy matches are collected under the item's 'agrovoc_ingredients'
     key; fragments without a match are tallied in
     ``self.not_parsed_fragments``.

     ``spider`` is not used.  Returns the item that was passed in.
     """
     if 'ingredients' not in good_item:
         return good_item

     raw_text = good_item['ingredients']
     cleaned_text = string_processor.remove_substring_in_paranthesis(raw_text)
     for part in string_processor.split_ingredients(cleaned_text):
         is_additive = string_processor.parse_e_additives(part)
         if is_additive:
             continue
         part = string_processor.remove_weight(part)
         part = string_processor.remove_percents(part)
         found = self.agrovoc_graph.find_ingredient_by_name(part.strip())
         if found:
             # Build a new list rather than appending in place.
             previous = good_item.get('agrovoc_ingredients', [])
             good_item['agrovoc_ingredients'] = previous + [found]
         else:
             # Tally keyed by the cleaned, unstripped fragment text.
             tally = self.not_parsed_fragments
             tally[part] = tally.get(part, 0) + 1
     return good_item
 def test_no_additives(self):
     # "a303" must produce an empty result.
     found = parse_e_additives(u"a303")
     self.assertEqual(found, [])
 def test_no_additives_if_additive_is_not_separate_word(self):
     # "e304" glued onto preceding letters must not be reported.
     found = parse_e_additives(u"abce304")
     self.assertEqual(found, [])
 def test_one_additive_in_lowercase(self):
     # A code written with a lowercase leading letter is expected as 'E100'.
     expected = [u'E100']
     self.assertEqual(parse_e_additives(u"е100"), expected)
 def test_additive_tailing_russian_non_latin_character_is_ignored(self):
     # The trailing "б" must be dropped, leaving only 'E160'.
     found = parse_e_additives(u"E160б")
     self.assertEqual(found, [u'E160'])
 def test_additive_russian_tailing_russian_c_in_lowercase(self):
     # The trailing lowercase letter (a Russian "с" per the test name) is
     # expected back as the suffix in 'E160c'.
     found = parse_e_additives(u"E160с")
     self.assertEqual(found, [u'E160c'])
 def test_one_additive_in_uppercase(self):
     # A single code with an uppercase leading letter is expected as 'E200'.
     expected = [u'E200']
     self.assertEqual(parse_e_additives(u"Е200"), expected)
 def test_additive_with_extra_digit(self):
     # A four-digit code must be kept whole.
     found = parse_e_additives(u"Е1525")
     self.assertEqual(found, [u'E1525'])
 def test_additive_russian_tailing_russian_c_in_lowercase(self):
     # Input ending in a lowercase "с" (Russian, per the test name) should
     # come back as 'E160c'.
     expected = [u'E160c']
     self.assertEqual(parse_e_additives(u"E160с"), expected)
 def test_additive_russian_tailing_russian_C_in_uppercase(self):
     # An uppercase trailing "С" is expected as a lowercase 'c' suffix.
     found = parse_e_additives(u"Е160С")
     self.assertEqual(found, [u'E160c'])
 def test_two_additives_in_lower_and_upper_cases(self):
     # Two comma-separated codes with differently cased leading letters
     # must both be returned, in input order.
     expected = [u'E201', u'E202']
     found = parse_e_additives(u"е201, Е202")
     self.assertEqual(found, expected)
 def test_additive_with_hyphen(self):
     # A hyphen between the letter and the digits must not appear in the
     # result.
     found = parse_e_additives(u"е-100")
     self.assertEqual(found, [u'E100'])
 def test_additive_with_extra_digit(self):
     # All four digits of the code are expected in the result.
     expected = [u'E1525']
     self.assertEqual(parse_e_additives(u"Е1525"), expected)
 def test_additive_with_extra_letter(self):
     # An uppercase "B" suffix is expected back lowercased, as 'E201b'.
     found = parse_e_additives(u"е201B")
     self.assertEqual(found, [u'E201b'])
 def test_no_additives_if_additive_is_not_separate_word(self):
     # A code that is only the tail of a longer word yields nothing.
     nothing = []
     self.assertEqual(parse_e_additives(u"abce304"), nothing)
 def test_additive_with_extra_letter(self):
     # The letter after the digits is kept, lowercased, in the result.
     expected = [u'E201b']
     self.assertEqual(parse_e_additives(u"е201B"), expected)
 def test_additive_russian_leading_russian_E_in_lowercase(self):
     # A lowercase leading letter (Russian "е" per the test name) is
     # expected back as 'E160'.
     found = parse_e_additives(u"е160")
     self.assertEqual(found, [u'E160'])
 def test_additive_with_hyphen(self):
     # The hyphenated spelling is expected to come back as plain 'E100'.
     expected = [u'E100']
     self.assertEqual(parse_e_additives(u"е-100"), expected)
 def test_additive_tailing_russian_non_latin_character_is_ignored(self):
     # A trailing "б" is not carried into the result.
     expected = [u'E160']
     self.assertEqual(parse_e_additives(u"E160б"), expected)
 def test_additive_russian_tailing_russian_C_in_uppercase(self):
     # The uppercase trailing "С" should be reported as the 'c' in 'E160c'.
     expected = [u'E160c']
     self.assertEqual(parse_e_additives(u"Е160С"), expected)
 def test_additive_with_space(self):
     # Codes written with a space between "E" and the digits must be found
     # and returned without the space, in order of appearance.
     ingredients_text = (
         u"влагоудерживающий агент E 452, регулятор кислотности E 451, "
         u"специи, декстроза, загустители E 407, E 412; "
     )
     expected = [u'E452', u'E451', u'E407', u'E412']
     found = parse_e_additives(ingredients_text)
     self.assertEqual(found, expected)
 def test_additive_russian_leading_russian_E_in_lowercase(self):
     # Input with a lowercase leading "е" should be reported as 'E160'.
     expected = [u'E160']
     self.assertEqual(parse_e_additives(u"е160"), expected)
 def test_one_additive_in_uppercase(self):
     # One uppercase-prefixed code in, one 'E200' out.
     found = parse_e_additives(u"Е200")
     self.assertEqual(found, [u'E200'])
 def test_additive_with_space(self):
     # A space between "E" and the digits must not stop the code from
     # being found; results are expected without the space.
     sentence = (
         u"влагоудерживающий агент E 452, регулятор кислотности E 451, "
         u"специи, декстроза, загустители E 407, E 412; ")
     parsed = parse_e_additives(sentence)
     wanted = [u'E452', u'E451', u'E407', u'E412']
     self.assertEqual(parsed, wanted)
 def test_two_additives_in_lower_and_upper_cases(self):
     # Both codes are expected back, in input order, whatever the case of
     # their leading letter.
     parsed = parse_e_additives(u"е201, Е202")
     self.assertEqual(parsed, [u'E201', u'E202'])
 def test_one_additive_in_lowercase(self):
     # One lowercase-prefixed code in, one 'E100' out.
     found = parse_e_additives(u"е100")
     self.assertEqual(found, [u'E100'])
 def test_no_additives(self):
     # No additive code is expected to be found in "a303".
     nothing = []
     self.assertEqual(parse_e_additives(u"a303"), nothing)