コード例 #1
0
def demo2():
    """Run the fingerprint-clustering demo on the global ``testStrings``.

    Prints, in order: the original strings, the clusters and the counts
    returned by ``group_fingerprint_strings``, the replacement mapping
    returned by ``get_best_replacements``, and every item produced by
    ``replace_by_key``.

    Every ``print`` call receives exactly one argument, so the output is the
    same on Python 2 (where the parentheses are plain grouping) and on
    Python 3 (where ``print`` is a function).
    """
    rule = "=" * 80
    print(rule)
    print("Demo 2")
    print(rule)

    print("Strings originales")
    for s in testStrings:
        print(s)
    # print("") rather than print(): the latter would output "()" on Python 2.
    print("")

    # Alternative input (disabled): read the strings from a CSV column.
    # NOTE: assigning testStrings inside this function would make the name
    # local and break the loop above; load it at module level instead.
    # df = pd.read_csv("audiencias-raw.csv")
    # testStrings = df.lugar

    clusters, counts = group_fingerprint_strings(testStrings)
    print("Clusters encontrados")
    pprint(clusters)
    print("")

    print("Conteos de strings")
    pprint(counts)
    print("")

    # For each cluster, pick the raw string that is the best fit.
    d = get_best_replacements(clusters, counts)
    print("Strings de replazo para cada key")
    pprint(d)
    print("")

    # Replace the strings using the replacement chosen for their key.
    print("Output:")
    for s in replace_by_key(d, testStrings):
        print(s)
    print("")
コード例 #2
0
ファイル: data_cleaner.py プロジェクト: ganipa93/data-cleaner
    def string(self, field, sufix=None, sort_tokens=False,
               remove_duplicates=False, keep_original=False, inplace=False):
        """Rule for all strings.

        Applies a clustering algorithm to normalize strings that are too
        similar, without loss of information. The values of the field are
        grouped with ``group_fingerprint_strings``, a replacement is chosen
        for each cluster with ``get_best_replacements``, the values are
        rewritten with ``replace_by_key`` and the result is stripped of
        surrounding whitespace.

        Args:
            field (str): Field to clean.
            sufix (str): Suffix passed to ``_update_series`` when ``inplace``
                is true; a falsy value falls back to ``self.DEFAULT_SUFIX``.
            sort_tokens (bool): Forwarded to ``group_fingerprint_strings``.
            remove_duplicates (bool): Forwarded to
                ``group_fingerprint_strings``.
            keep_original (bool): Forwarded to ``_update_series``.
            inplace (bool): When true, the cleaned series is also handed to
                ``self._update_series``.

        Returns:
            pandas.Series: Series of cleaned strings.
        """
        if not sufix:
            sufix = self.DEFAULT_SUFIX
        field = self._normalize_field(field)
        raw_values = self.df[field]

        # Cluster the raw values and choose one replacement per cluster.
        clusters, counts = group_fingerprint_strings(
            raw_values,
            sort_tokens=sort_tokens,
            remove_duplicates=remove_duplicates)
        best_by_key = get_best_replacements(clusters, counts)

        # Rewrite the values, then trim surrounding whitespace.
        replaced_values = replace_by_key(best_by_key, raw_values)
        cleaned = pd.Series(replaced_values).str.strip()

        if not inplace:
            return cleaned

        self._update_series(
            field=field,
            sufix=sufix,
            keep_original=keep_original,
            new_series=cleaned)
        return cleaned
コード例 #3
0
def demo2():
    """Run the fingerprint-clustering demo on the global ``testStrings``.

    Prints, in order: the original strings, the clusters and the counts
    returned by ``group_fingerprint_strings``, the replacement mapping
    returned by ``get_best_replacements``, and every item produced by
    ``replace_by_key``.

    Every ``print`` call receives exactly one argument, so the output is the
    same on Python 2 (where the parentheses are plain grouping) and on
    Python 3 (where ``print`` is a function).
    """
    rule = "=" * 80
    print(rule)
    print("Demo 2")
    print(rule)

    print("Strings originales")
    for s in testStrings:
        print(s)
    # print("") rather than print(): the latter would output "()" on Python 2.
    print("")

    # Alternative input (disabled): read the strings from a CSV column.
    # NOTE: assigning testStrings inside this function would make the name
    # local and break the loop above; load it at module level instead.
    # df = pd.read_csv("audiencias-raw.csv")
    # testStrings = df.lugar

    clusters, counts = group_fingerprint_strings(testStrings)
    print("Clusters encontrados")
    pprint(clusters)
    print("")

    print("Conteos de strings")
    pprint(counts)
    print("")

    # For each cluster, pick the raw string that is the best fit.
    d = get_best_replacements(clusters, counts)
    print("Strings de replazo para cada key")
    pprint(d)
    print("")

    # Replace the strings using the replacement chosen for their key.
    print("Output:")
    for s in replace_by_key(d, testStrings):
        print(s)
    print("")