Example 1
def test_chou_rouge():
    e = ["choux rouge au vinaigre", "vinaigre balsamique"]
    entities = Entities(e)
    match = entities.match("choux rouge au vinaigre balsamique")
    assert len(match) == 2
    assert match[0].text == e[0]
    assert match[1].text == e[1]
Example 2
def test_chou_rave():
    e = ["choux rave", "rave party"]
    entities = Entities(e)
    match = entities.match("choux rave party")
    assert len(match) == 2
    assert match[0].text == e[0]
    assert match[1].text == e[1]
Example 3
def test_transform():
    entities = Entities(["Data-Publica"], transform=lambda t: t.lower())
    match = entities.match("data-publica")
    assert len(match) == 1
    assert match[0].text == "data-publica"

    entities = Entities(["data-publica"], transform=lambda t: t.lower())
    match = entities.match("Data-Publica")
    assert len(match) == 1
    assert match[0].text == "Data-Publica"
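
The transform above is applied both to the entity labels and to the text being matched, which is why each lowercase/mixed-case pair matches. As a further illustration, here is a minimal sketch of a transform that also folds accents using the standard unicodedata module; the fold_accents helper and the labels are illustrative assumptions, not part of the library, and the sketch relies only on the Entities API shown in these examples.

import unicodedata

def fold_accents(text):
    # Hypothetical helper: lowercase and strip diacritics (e.g. "Élysées" -> "elysees")
    text = unicodedata.normalize("NFKD", text.lower())
    return "".join(c for c in text if not unicodedata.combining(c))

def test_transform_accent_folding():
    # Same Entities API as above; the transform is applied to both the
    # entity labels and the input text before matching
    entities = Entities(["Champs-Élysées"], transform=fold_accents)
    match = entities.match("champs-elysees")
    assert len(match) == 1
    assert match[0].text == "champs-elysees"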
Example 4
    def __init__(self, cache_results=False, check_supported=True, **kwargs):
        """
        :param cache_results: (boolean) if True, the result caching mechanism is enabled
        """

        self.logger = logging.getLogger("textmining:address_detecter")
        self.logger.setLevel(logging.INFO)
        if "fvoies" in kwargs:
            raise DeprecationWarning(
                "fvoies is deprecated.\n"
                "Please use detect_address(..., fvoies=X)")

        resources_dir = os.path.join(LIB_PATH, "resources/localization")
        self.check_supported = check_supported

        if not os.path.exists(resources_dir):
            raise NotImplementedError("No resources directory found at %s" % resources_dir)

        # Cache of country-specific resources
        self.localization_cache = {}

        # Iterating over country-specific resources
        for path in os.listdir(resources_dir):
            # Every directory in resources_dir is assumed to represent a country
            if os.path.isdir(os.path.join(resources_dir, path)):
                country_name = path
                country_path = os.path.join(resources_dir, path)
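                # Note: the namedtuple class itself is used here as a mutable,
                # per-country attribute container; a fresh class is created on
                # each iteration and the attributes below are set directly on it.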
                country = namedtuple(
                    "Country",
                    [
                        "cities",  # Set of entities,
                        "zipcodes",
                        "voies",
                        "main_regex",
                        "secondary_regex",
                        "streets_matcher"
                    ])
                # For some countries (like France), there is no need for a hardcoded street list
                country.streets = None
                street_path = os.path.join(country_path, "streets.txt")
                # If streets.txt exists, a hardcoded street list is needed
                if os.path.exists(street_path):
                    with open(street_path, "r") as f:
                        country.streets = set(f.read().splitlines())
                        country.streets_matcher = Matcher()
                        country.streets_matcher.set_words(country.streets)

                with open(os.path.join(country_path, "main.regex"), "r") as f:
                    regex = f.read().strip()
                    country.main_regex = re.compile(regex, re.S | re.U | re.I)

                try:
                    with open(os.path.join(country_path, "secondary.regex"),
                              "r") as f:
                        regex = f.read().strip()
                        country.secondary_regex = re.compile(
                            regex, re.S | re.U | re.I)
                except IOError:
                    country.secondary_regex = None
                    # File does not exist or is not readable
                    self.logger.warning("Unable to open file secondary.regex")

                # Use separate sets; chaining the assignment would make voies,
                # cities and zipcodes share a single set
                country.voies = set()
                country.zipcodes = set()
                cities = set()
                try:
                    with open(os.path.join(country_path, "cities.csv"),
                              "r") as f:
                        reader = csv.reader(f, delimiter=",")
                        for zipcode, city in reader:
                            # Normalize via int() so zipcodes beginning with 0 are stored consistently
                            zipcode = str(int(zipcode))
                            country.zipcodes.add(zipcode)
                            city = normalize_text(city)
                            cities.add(city)
                        country.cities = Entities(cities)
                except IOError:
                    country.cities = None
                    # File does not exist or is not readable
                    self.logger.warning("Unable to open file cities.csv")

                try:
                    # Populating voies set with resource file
                    with open(os.path.join(country_path, "voies.csv"),
                              "r") as f:
                        for row in f.readlines():
                            row = row.strip().lower()
                            row = row.split(",")
                            voies = map(normalize_text, row)
                            country.voies.update(voies)
                except IOError:
                    country.voies = None
                    # File does not exist or is not readable
                    self.logger.warning("Unable to open file voies.csv")

                self.localization_cache[country_name] = country
        self.results_cache = None
        if cache_results:
            # Caches matched strings: a set for inputs that gave no result, a dict for those that yielded a good result
            self.empty_cache()
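
The empty_cache method called above is not shown in this excerpt. Based on the comment about the results cache, a minimal sketch could look like the following; the "misses"/"hits" structure is an assumption for illustration, and the real implementation may differ.

    def empty_cache(self):
        # Sketch (assumption): reset the result cache described above, i.e.
        # a set for matched strings that yielded no result and a dict for
        # those that yielded a good result
        self.results_cache = {
            "misses": set(),  # matched strings with no usable result
            "hits": {},       # matched string -> detected result
        }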
Example 5
    def _fetch(self, params="api/entities"):
        """
        Fetches the entities recorded by the API

        :param params: path of the API endpoint to fetch
        :return: The corresponding entities, grouped by matching method
        """
        self.logger.info("Fetching url [%s] " % self.api.base_url + "/" +
                         params)

        # Fetching the entities from the API
        response = self.api.fetch(params)
        if response.status_code != requests.codes.ok:
            self.logger.warning(
                "The API did not return a 200 HTTP response")
            raise CannotFetchUrlException()

        results = response.json()
        if len(results) == 0:
            raise CannotFetchUrlException()

        entities_list = {}
        # Preparing entities based on matching method type
        entities_list[MatchMethod.EXACT] = Entities([],
                                                    tokenizer=self.tokenizer,
                                                    transform=identity)
        entities_list[MatchMethod.SHORT_LABEL] = Entities(
            [], tokenizer=self.tokenizer, transform=normalization)
        # Preparing expression matching using the Aho-Corasick data structure
        entities_list[MatchMethod.LONG_LABEL] = aho.Trie()

        count = 0
        blacklisted = list()
        for result in results:
            id_ = result["id"]
            type_ = result["type"]
            labels = result["labels"]
            for entity in labels:
                if entity["method"] in ["SHORT_LABEL", "EXACT"]:
                    if admit(entity["label"], self.bl):
                        entities_list[MatchMethod(
                            entity["method"])].add_entity(
                                Entity(entity["label"], (id_, type_)))
                        count += 1
                    else:
                        blacklisted.append(entity["label"])
                        self.logger.warning("blacklisting short-label %s" %
                                            entity["label"])

                elif entity["method"] == "LONG_LABEL":
                    if admit_label(entity["label"], self.bl):
                        entities_list[MatchMethod(entity["method"])].add_word(
                            ascii_normalization(entity["label"]), (id_, type_))
                        count += 1
                    else:
                        blacklisted.append(entity["label"])
                        self.logger.warning("blacklisting long-label %s" %
                                            entity["label"])

                else:
                    raise UnkownMatchingMethod(
                        "method [%s] is not implemented." % entity["method"])

        # Computing the automaton to prepare Aho-Corasick matching on long-label entities
        entities_list[MatchMethod("LONG_LABEL")].make_automaton()

        self.logger.info("total items retrieved : %d" % count)
        self.logger.warning("Has blacklisted %d items." % len(blacklisted))
        return entities_list
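
The long-label entities above are matched with an Aho-Corasick automaton (aho.Trie appears to be a project-specific wrapper). For reference, here is a minimal standalone sketch of the same add_word / make_automaton / iter pattern using the pyahocorasick package; the labels and payloads are made up for illustration and this is not the project's own wrapper.

import ahocorasick

trie = ahocorasick.Automaton()
# Store (label, payload) so match boundaries can be recovered from the end index
for label, payload in [("data publica", ("42", "COMPANY")),
                       ("open data", ("7", "TOPIC"))]:
    trie.add_word(label, (label, payload))
trie.make_automaton()

text = "data publica publishes open data reports"
for end_index, (label, payload) in trie.iter(text):
    start_index = end_index - len(label) + 1
    print(text[start_index:end_index + 1], payload)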
Example 6
def test_accents_in_entities():
    entities = Entities(["abcéefg"])
    match = entities.match("abc abcéefg abc")
    assert len(match) == 1
Example 7
def test_choux_de_bruxelles():
    e = ["Choux de Bruxelles", "Choux", "Choux Fleur"]
    entities = Entities(e)
    match = entities.match(e[0])
    assert len(match) == 1
    assert match[0].text == e[0]
Example 8
def test_deleted_spaces():
    entities = Entities(["data - publica"])
    match = entities.match("data-publica")
    assert len(match) == 1
Example 9
def test_oud():
    entities = Entities(["oud"])
    match = entities.match("ou d")
    assert len(match) == 0
    match = entities.match("oud")
    assert len(match) == 1