def test_contains_extractor_with_list():
    """Check contains_extractor on a document given as a list of tokens.

    Tokens present in the list must be flagged under ``contains(<token>)``;
    a differently-cased token and an unrelated token must be missing or falsy.
    """
    tokens = ["Simple", "is", "better", "than", "complex"]
    extracted = contains_extractor(tokens)

    # (feature key, whether the feature is expected to be set), checked in order.
    expectations = (
        ("contains(Simple)", True),
        ("contains(simple)", False),
        ("contains(complex)", True),
        ("contains(derp)", False),
    )
    for key, expected_present in expectations:
        if expected_present:
            # Direct indexing: a missing key should fail the test loudly.
            assert_true(extracted[key])
        else:
            # Absent keys are tolerated and treated as False.
            assert_false(extracted.get(key, False))
def test_contains_extractor_with_string():
    """Check contains_extractor on a document given as a plain string.

    Words occurring in the sentence must be flagged under
    ``contains(<word>)``; a differently-cased word and an unrelated word
    must be missing or falsy.
    """
    sentence = "Simple is better than complex"
    extracted = contains_extractor(sentence)

    # Word present with its original capitalisation.
    capitalised_hit = extracted["contains(Simple)"]
    assert_true(capitalised_hit)

    # Same word in lower case is not expected to be reported.
    lowercase_hit = extracted.get("contains(simple)", False)
    assert_false(lowercase_hit)

    # Last word of the sentence.
    trailing_hit = extracted["contains(complex)"]
    assert_true(trailing_hit)

    # Word that does not occur in the sentence at all.
    unrelated_hit = extracted.get("contains(derp)", False)
    assert_false(unrelated_hit)
def extractor_base(document, train_set=None):
    """Return properties (features) of the text.

    Starts from the features produced by ``contains_extractor(document,
    train_set)`` and adds one boolean entry per module-level regular
    expression, set to whether that pattern is found anywhere in
    ``document``.

    Args:
        document: Text to analyse; it is passed directly to ``re.search``.
        train_set: Forwarded unchanged to ``contains_extractor``.

    Returns:
        The feature mapping returned by ``contains_extractor``, extended with
        the regex-based boolean flags.
    """
    result = contains_extractor(document, train_set)

    # Feature key -> pattern, in the order the flags are added.
    # NOTE(review): 'has_curency()' looks like a misspelling of "currency";
    # it is kept verbatim because it is a runtime key — confirm with
    # consumers before renaming.
    flag_patterns = (
        ('has_digit_sequence(7)', REGEX_DIGIT_SEQUENCE_7),
        ('has_digit_sequence(4)', REGEX_DIGIT_SEQUENCE_4),
        ('has_digit_sequence(2)', REGEX_DIGIT_SEQUENCE_2),
        ('has_digits(7)', REGEX_DIGIT_PRESENCE_7),
        ('has_curency()', REGEX_CURRENCY_PRESENCE),
        ('has_email_markers()', REGEX_EMAIL_MARKERS),
        ('has_hyperlink()', REGEX_HYPERLINK_MARKER),
    )
    for feature_key, pattern in flag_patterns:
        found = re.search(pattern, document)
        # re.search yields a match object or None; store a plain bool.
        result[feature_key] = bool(found)

    return result