def demo2():
    """Demo: cluster raw strings by fingerprint and normalize them.

    Reads the module-level ``testStrings`` list, prints the original
    strings, the fingerprint clusters and their counts, the chosen
    replacement for each cluster key, and finally the normalized output.

    Note: the original used Python 2 ``print`` statements; the
    single-argument parenthesized form below behaves identically on
    Python 2 and is valid Python 3.
    """
    banner = "=" * 80  # hoisted: same separator used twice
    print(banner)
    print("Demo 2")
    print(banner)

    print("Strings originales")
    for s in testStrings:
        print(s)
    print("")

    # NOTE: to run against real data instead of the fixture list:
    #   df = pd.read_csv("audiencias-raw.csv"); testStrings = df.lugar
    clusters, counts = group_fingerprint_strings(testStrings)
    print("Clusters encontrados")
    pprint(clusters)
    print("")

    print("Conteos de strings")
    pprint(counts)
    print("")

    # For each cluster, pick the raw string best suited as the canonical form.
    d = get_best_replacements(clusters, counts)
    print("Strings de replazo para cada key")
    pprint(d)
    print("")

    # Replace every string whose fingerprint matches a key with its chosen version.
    print("Output:")
    for s in replace_by_key(d, testStrings):
        print(s)
    print("")
def string(self, field, sufix=None, sort_tokens=False,
           remove_duplicates=False, keep_original=False, inplace=False):
    """Normalize a string column by clustering near-duplicate values.

    Groups the column's values by fingerprint, picks the best raw
    representative for each cluster, and replaces every member with it,
    so strings that are too similar collapse into one form without
    losing information.

    Args:
        field (str): Column to clean.
        sufix (str): Suffix for the cleaned column name; falls back to
            ``self.DEFAULT_SUFIX`` when falsy.
        sort_tokens (bool): Sort tokens while fingerprinting.
        remove_duplicates (bool): Drop duplicated tokens while
            fingerprinting.
        keep_original (bool): Keep the original column when writing back.
        inplace (bool): Write the cleaned series back onto the dataframe.

    Returns:
        pandas.Series: Series of cleaned strings.
    """
    # Falsy-coalesce (not just None) to preserve original semantics.
    sufix = sufix or self.DEFAULT_SUFIX
    field = self._normalize_field(field)
    raw_values = self.df[field]

    clusters, counts = group_fingerprint_strings(
        raw_values,
        sort_tokens=sort_tokens,
        remove_duplicates=remove_duplicates)
    replacements = get_best_replacements(clusters, counts)

    # Apply the replacement map, then trim surrounding whitespace.
    cleaned = pd.Series(replace_by_key(replacements, raw_values))
    cleaned = cleaned.str.strip()

    if inplace:
        self._update_series(field=field, sufix=sufix,
                            keep_original=keep_original,
                            new_series=cleaned)
    return cleaned