Beispiel #1
0
 def test_state_extractor(self):
     # Runs a dictionary extractor backed by the states trie over a tokenized
     # page and checks that the first value stored under 'state' is 'florida'.
     # Note: the extractor is built with get_city_dictionary_extractor, which
     # is simply handed the states trie instead of a cities trie.
     tokens = ["", "ID#182730", "-", "florida", "LegendaryDave", "-", "Salt", "Lake", "-", "Gay", "Escorts", "&", "Gay", "Massage", "", "LegendaryDave", "2120.5", "Miles", "Away", "THE", "LEGENDARY", "DAVE-", "1ST", "TIME", "IN", "SALT", "LAKE", "CITY", "Over", "200", "Reviews!", "***Voted", "one", "of", "the", "Top", "50", "Escorts", "in", "the", "USA,", "for", "the", "last", "5", "years", "in", "a", "row***", "Secure,", "Masculine,", "Adventurous.", "Top/Vers", "Stud.", "Teacher", "to", "the", "New/Curious.", "Good-looking,", "Bi,", "Sense", "of", "Humor,", "Fun.", "Great", "Body.", "Educated,", "Honest,", "Sane.", "Healthy/DDF/Neg.", "AM,PM,", "24/7,", "Overnights,", "Multi-Days.", "Very", "Skilled:", "Massage", "or", "Vanilla", "or", "Kink", "or", "Wild.", "Discreet", "&", "Professional.", "Incalls", "&", "Outcalls:", "USA", "&", "Abroad.", "VISIT", "MY", "WEBSITE", "FOR", "RATES,", "MORE", "HOT", "PICS,", "ALL", "MY", "REVIEWS,", "HIRING", "TIPS", "&", "MORE.", "his", "stats", "AGE:", "45", "Role:", "Versatile", "HEIGHT:", "5'11\"", "(180cm)", "WEIGHT:", "150", "-", "170", "lbs", "(68", "-", "77", "kg)", "Piercings:", "Not", "Specified", "RACE:", "White", "HAIR", "COLOR:", "Dark", "Brown", "EYE", "COLOR:", "Green", "Open", "To", "LTR", ":", "Yes", "Languages:", "English,", "Other", "BODY", "TYPE:", "Muscular/Buff", "BODY", "HAIR:", "Moderately", "hairy", "Tattoos:", "Not", "Specified", "Smoker:", "Not", "Specified", "reviews", "\r", "M4RN", "Reviews:", "8", "Most", "Recent", "M4RN", "Star", "Review:", "04/30/2016", "", "services", "provided", "Escort", "yes", "HOT", "Massage", "yes", "Massage", "yes", "Registered", "Therapist", "yes", "In", "Calls", "yes", "Out", "Calls", "yes", "US", "Travel", "yes", "Int'l", "Travel", "yes", "Advertiser", "Since", "Dec", "'07", "contact", "info", "Phone:", "PREFERS", "PHONE", "CONTACT", "location", "2120.5", "Miles", "area:", "Salt", "Lake", "City", "/", "Ogden", "Local", "City:", "Salt", "Lake", "Postal", "Code:", "84101", "availability", "", "", "s", "m", "t", "w", "t", "f", "s", "7am-11am", "", "11am-3pm", "", "3pm-7pm", "", "7pm-11pm", "", "11pm-3am", "", "3am-7am", "", "", "Elite", "&", "Platinum", "Advertisers", "ELITE", "Toronto", "Barcelona", "San", "Francisco", "/", "Oakland", "Cleveland", "/", "Lorain", "/", "Elyria", "Los", "Angeles", "/", "West", "Hollywood", "Find", "LegendaryDave,", "Rent", "Men", "and", "Male", "Massage", "in", "Salt", "Lake", "", ""]
     doc = {'content': tokens}

     # Both dictionaries are lower-cased before being loaded into tries.
     state_trie = populate_trie(
         map(lambda entry: entry.lower(), self.load_file("states.json")))
     stop_word_trie = populate_trie(
         map(lambda entry: entry.lower(), self.load_file("stop_words.json")))

     state_extractor = get_city_dictionary_extractor(state_trie, stop_word_trie)
     processor = ExtractorProcessor()
     processor = processor.set_input_fields(['content'])
     processor = processor.set_output_field('state')
     processor = processor.set_extractor(state_extractor)

     result_doc = processor.extract(doc)
     first_value = result_doc['state'][0]['result'][0]['value']
     self.assertEqual(first_value, u'florida')
Beispiel #2
0
    def test_city_extractor(self):
        # Runs the city dictionary extractor over a tokenized page whose first
        # token is "orlando" and checks that the first value stored under
        # 'cities' is 'orlando'.
        tokens = ["orlando", "Teacher", "to", "the", "New/Curious.", "Good-looking,", "Bi,", "Sense", "of", "Humor,", "Fun.", "Great", "Body.", "Educated,", "Honest,", "Sane.", "Healthy/DDF/Neg.", "AM,PM,", "24/7,", "Overnights,", "Multi-Days.", "Very", "Skilled:", "Massage", "or", "Vanilla", "or", "Kink", "or", "Wild.", "Discreet", "&", "Professional.", "Incalls", "&", "Outcalls:Abroad.", "VISIT", "MY", "WEBSITE", "FOR", "RATES,", "MORE", "HOT", "PICS,", "ALL", "MY", "REVIEWS,", "HIRING", "TIPS", "&", "MORE.", "", "", "", "", "", "his", "stats", "", "", "", "", "AGE:", "", "45", "", "Role:", "", "", "Versatile", "", "HEIGHT:", "", "5'11\"", "(180cm)", "", "WEIGHT:", "", "150", "-", "170", "lbs", "(68", "-", "77", "kg)", "", "Piercings:", "", "Not", "Specified", "", "", "", "RACE:", "", "White", "", "HAIR", "COLOR:", "", "Dark", "Brown", "", "EYE", "COLOR:", "", "Green", "", "Open", "To", "", "LTR", ":", "", "Yes", "", "Languages:", "", "English,", "Other", "", "", "", "BODY", "TYPE:", "", "Muscular/Buff", "", "BODY", "HAIR:", "", "Moderately", "hairy", "", "Tattoos:", "", "Not", "Specified", "", "Smoker:", "", "Not", "Specified", "", "", "", "", "", "", "", "reviews", "", "", "\r", "", "M4RN", "Reviews:", "", "8", "", "Most", "Recent", "M4RN", "Star", "Review:", "", "04/30/2016", "", "", "", "", "", "", "", "", "", "", "services", "provided", "", "", "", "Escort", "", "yes", "", "HOT", "Massage", "", "yes", "", "Massage", "", "yes", "", "Registered", "Therapi"]
        doc = {'content': tokens}

        # Both dictionaries are lower-cased before being loaded into tries.
        city_trie = populate_trie(
            map(lambda entry: entry.lower(), self.load_file("cities.json")))
        stop_word_trie = populate_trie(
            map(lambda entry: entry.lower(), self.load_file("stop_words.json")))

        city_extractor = get_city_dictionary_extractor(city_trie, stop_word_trie)
        processor = ExtractorProcessor()
        processor = processor.set_input_fields(['content'])
        processor = processor.set_output_field('cities')
        processor = processor.set_extractor(city_extractor)

        result_doc = processor.extract(doc)
        first_value = result_doc['cities'][0]['result'][0]['value']
        self.assertEqual(first_value, u'orlando')
    def test_ngrams_words_name_extractor(self):
        # Verifies that the name extractor, configured with 2-grams joined by a
        # single space, reports the single-token names first ('jean', 'marie')
        # followed by the joined two-token name ('jean marie').
        # NOTE(review): load_file() is called without arguments here; presumably
        # it falls back to a default names file - confirm against the fixture.
        names = self.load_file()
        t = populate_trie(map(lambda x: x.lower(), names))
        # Sanity-check the trie: a full entry resolves to a string value, a
        # mere prefix of an entry does not.
        self.assertTrue(isinstance(t.get('barbara'), basestring))
        self.assertFalse(isinstance(t.get('bar'), basestring))

        doc = {
            "foo": [
                "at", "the", "market", "jean", "marie", "bought", "a", "loaf",
                "of", "bread"
            ]
        }
        e = get_name_dictionary_extractor(t)
        e.set_ngrams(2)
        e.set_joiner(' ')
        ep = ExtractorProcessor().set_input_fields('foo').set_output_field(
            'names').set_extractor(e)

        updated_doc = ep.extract(doc)
        # assertEqual replaces the deprecated assertEquals alias (removed in
        # Python 3.12) and matches the other tests in this suite.
        self.assertEqual(updated_doc['names'][0]['result'][0]['value'],
                         'jean')
        self.assertEqual(updated_doc['names'][0]['result'][1]['value'],
                         'marie')
        self.assertEqual(updated_doc['names'][0]['result'][2]['value'],
                         'jean marie')
    def test_name_extractor(self):
        # Verifies that the name extractor finds 'Barbara' in the input tokens
        # and reports it as the lower-cased 'barbara'.
        # NOTE(review): load_file() is called without arguments here; presumably
        # it falls back to a default names file - confirm against the fixture.
        names = self.load_file()
        t = populate_trie(map(lambda x: x.lower(), names))
        # Sanity-check the trie: a full entry resolves to a string value, a
        # mere prefix of an entry does not.
        self.assertTrue(isinstance(t.get('barbara'), basestring))
        self.assertFalse(isinstance(t.get('bar'), basestring))

        doc = {"foo": ['bar', 'Barbara']}
        e = get_name_dictionary_extractor(t)
        ep = ExtractorProcessor().set_input_fields('foo').set_output_field(
            'names').set_extractor(e)

        updated_doc = ep.extract(doc)
        # assertEqual replaces the deprecated assertEquals alias (removed in
        # Python 3.12) and matches the other tests in this suite.
        self.assertEqual(updated_doc['names'][0]['result'][0]['value'],
                         'barbara')
Beispiel #5
0
# -*- coding: utf-8 -*-
# @Author: ZwEin
# @Date:   2016-10-05 16:01:52
# @Last Modified by:   ZwEin
# @Last Modified time: 2016-10-07 14:00:17

from digDictionaryExtractor.populate_trie import populate_trie
from digDictionaryExtractor.dictionary_extractor import DictionaryExtractor

from names import names
# Trie built once at import time from the imported `names` iterable; it is the
# default argument of get_nationality_extractor below.
default_names_trie = populate_trie(iter(names))


def get_nationality_extractor(names_trie=default_names_trie):
    """Create a DictionaryExtractor configured for nationality lookup.

    The extractor looks tokens up in `names_trie` (the module-level default
    trie when omitted), accepts every token in the pre-filter, lower-cases
    tokens in the pre-process step, reads its input under the renamed field
    'tokens', and is configured with an n-gram setting of 3.
    """
    metadata = {'extractor': 'dig_nationality_dictionary_extractor'}
    return (
        DictionaryExtractor()
        .set_trie(names_trie)
        .set_pre_filter(lambda x: True)
        .set_pre_process(lambda x: x.lower())
        .set_metadata(metadata)
        .set_renamed_input_fields('tokens')
        .set_ngrams(3)
    )
# -*- coding: utf-8 -*-
# @Author: ZwEin
# @Date:   2016-10-05 16:01:52
# @Last Modified by:   ZwEin
# @Last Modified time: 2016-10-10 15:43:22

from digDictionaryExtractor.populate_trie import populate_trie
from digDictionaryExtractor.dictionary_extractor import DictionaryExtractor


def generate_age_dictionary(start=18, end=50):
    """Return the decimal strings of every integer from start to end, inclusive."""
    return list(map(str, range(start, end + 1)))


# Default age vocabulary: the strings '18' through '50', built at import time.
default_ages = generate_age_dictionary()

# Trie over the default ages; default argument of get_age_dictionary_extractor.
default_ages_trie = populate_trie(iter(default_ages))


def get_age_dictionary_extractor(ages_trie=default_ages_trie):
    """Create a DictionaryExtractor that looks tokens up in an ages trie.

    Uses the module-level default ages trie when `ages_trie` is omitted, and
    tags the extractor's metadata with 'dig_age_dictionary_extractor'.
    """
    metadata = {'extractor': 'dig_age_dictionary_extractor'}
    return (
        DictionaryExtractor()
        .set_trie(ages_trie)
        .set_metadata(metadata)
    )
 def test_populate_trie(self):
     # A word that was inserted resolves to a string value; a word that was
     # never inserted does not.
     words = ['abacus', 'abate', 'adder', 'brave']
     trie = populate_trie(iter(words))

     present = trie.get('abate')
     absent = trie.get('debate')
     self.assertTrue(isinstance(present, basestring))
     self.assertFalse(isinstance(absent, basestring))