def index_collection(collection, sync=False):
    """Index a collection."""
    if collection.deleted_at is not None:
        return delete_collection(collection.id)
    data = get_collection(collection.id)
    if data is None:
        return
    log.info(
        "[%s] Index: %s (%s things)...",
        collection,
        data.get("label"),
        data.get("count"),
    )
    text = [data.get("label")]
    text.append(normalize(data.get("label")))
    text.append(normalize(data.get("foreign_id")))
    text.append(normalize(data.get("summary")))
    data["text"] = text
    data.pop("id", None)
    return index_safe(collections_index(), collection.id, data,
                      refresh=refresh_sync(sync))
def test_empty(self):
    self.assertEqual(None, slugify(None))
    self.assertEqual(None, ascii_text(None))
    self.assertEqual(None, latinize_text(None))
    self.assertEqual(None, normalize(None))
    self.assertEqual(None, normalize(''))
    self.assertEqual(None, normalize(' '))
def __init__(self, corpus_path):
    self.automaton = ahocorasick.Automaton()
    log.info("Building country automaton...")
    names_count = 0
    with io.open(corpus_path, 'r', encoding='utf-8') as fh:
        for row in csv.reader(fh, delimiter='\t'):
            if row[7] in self.FEATURES:
                continue
            country = normalize(row[8])
            if country is None:
                continue
            names = set(row[3].split(','))
            names.add(row[1])
            names.add(row[2])
            for name in names:
                name = normalize(name)
                if name is None or len(name) < 4:
                    continue
                names_count += 1
                self.automaton.add_word(name, country)
    self.automaton.make_automaton()
    log.info("...done: %s names", names_count)
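# A minimal, self-contained sketch of the automaton lookup built above,
# assuming the pyahocorasick package; the place names and country codes
# below are illustrative, not taken from the GeoNames corpus.
import ahocorasick
from normality import normalize

automaton = ahocorasick.Automaton()
for name, country in (("Berlin", "de"), ("Deutschland", "de"), ("Paris", "fr")):
    automaton.add_word(normalize(name), normalize(country))
automaton.make_automaton()

text = normalize("A meeting was held in Berlin and Paris last week.")
# Automaton.iter() yields (end_index, value) pairs for every match.
print([country for _, country in automaton.iter(text)])  # ['de', 'fr']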
def make_csv_file_name(meta, table, out_folder):
    bank_name = normalize(meta['BankName'], lowercase=False)
    if bank_name is None:
        bank_name = 'Untitled Database'
    table_abbr = normalize(table['abbr'], lowercase=False)
    table_name = normalize(table['name'], lowercase=False)
    file_name = '%s - %s - %s.csv' % (bank_name, table_abbr, table_name)
    return os.path.join(out_folder, file_name)
def match_prefix(self, prefix):
    prefix = normalize(prefix)
    if not self.abstract:
        if normalize(self.name).startswith(prefix):
            return True
        elif normalize(self.label).startswith(prefix):
            return True
    return False
def compare_names(left, right):
    left_list = [normalize(n, latinize=True) for n in left.names]
    right_list = [normalize(n, latinize=True) for n in right.names]
    try:
        return max(
            Levenshtein.ratio(left, right)
            for left, right in IT.product(left_list, right_list))
    except ValueError:
        return 0
def compare_names(left, right):
    result = 0
    left_list = [normalize(n, latinize=True) for n in left.names]
    right_list = [normalize(n, latinize=True) for n in right.names]
    for (left, right) in itertools.product(left_list, right_list):
        similarity = jaro(left, right)
        score = similarity * dampen(2, 20, shortest(left, right))
        result = max(result, score)
    return result
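# A small standard-library stand-in for the pairwise comparison above:
# difflib.SequenceMatcher replaces jaro()/Levenshtein.ratio(), so the
# absolute scores differ, but the normalize-then-take-the-best-pair idea
# is the same. The sample names are illustrative.
import itertools
from difflib import SequenceMatcher
from normality import normalize

def best_pair_similarity(left_names, right_names):
    left_list = [normalize(n, latinize=True) for n in left_names]
    right_list = [normalize(n, latinize=True) for n in right_names]
    return max(
        SequenceMatcher(None, l, r).ratio()
        for l, r in itertools.product(left_list, right_list)
    )

print(best_pair_similarity(["Jürgen Müller"], ["Juergen Mueller", "J. Müller"]))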
def text_score(match, candidates):
    if isinstance(candidates, basestring):
        candidates = [candidates]
    match_n = normalize(match)
    best_score = 0
    for candidate in candidates:
        cand_n = normalize(candidate)
        score = jaro_winkler(match_n, cand_n, 0.02) * 100
        best_score = max(int(score), best_score)
    return best_score
def ExtractCountries(self, request_iterator, context):
    country_tags = []
    word_count = 0
    for text_obj in request_iterator:
        text = normalize(text_obj.text)
        if text is None:
            continue
        word_count += len(text.split())
        for index, country in self.automaton.iter(text):
            # log.debug("Matched: %s -> %s", name, country)
            country_tags.append(country)
    doc_tags = []
    co_counts = Counter(country_tags)
    top_n = co_counts.most_common(self.MAX_TAGS)
    for tag_num in range(1, self.MAX_TAGS + 1):
        if len(top_n) >= tag_num:
            freq = top_n[tag_num - 1][1] / max(1, word_count)
            log.info('tag is %s, freq = %.2f', top_n[tag_num - 1], freq)
            if freq >= self.TAG_FREQUENCY_CUT:
                doc_tags.append(top_n[tag_num - 1][0])
    return CountryTags(countries=doc_tags)
def transform(self, source, target):
    text = source.data()
    text = normalize(text,
                     lowercase=self.config.get('lowercase', True),
                     transliterate=self.config.get('transliterate', False),
                     collapse=self.config.get('collapse', True))
    target.save_data(text.encode('utf-8'))
def find_matches(dataset, text, filter=None, exclude=None):
    entities = Entity.__table__
    match_text = normalize(text, dataset)[:254]

    # select text column and apply necessary transformations
    text_field = entities.c.name
    if dataset.normalize_text:
        text_field = entities.c.normalized
    if dataset.ignore_case:
        text_field = func.lower(text_field)
    text_field = func.left(text_field, 254)

    # calculate the difference percentage
    l = func.greatest(1.0,
                      func.least(len(match_text), func.length(text_field)))
    score = func.greatest(0.0,
                          ((l - func.levenshtein(text_field, match_text)) / l) * 100.0)
    score = func.max(score).label("score")

    # coalesce the canonical identifier
    id_ = func.coalesce(entities.c.canonical_id, entities.c.id).label("id")

    # apply filters
    filters = [entities.c.dataset_id == dataset.id,
               entities.c.invalid == False]  # noqa
    if not dataset.match_aliases:
        filters.append(entities.c.canonical_id == None)  # noqa
    if exclude is not None:
        filters.append(entities.c.id != exclude)
    if filter is not None:
        filters.append(text_field.ilike("%%%s%%" % filter))

    q = select([id_, score], and_(*filters), [entities],
               group_by=[id_], order_by=[score.desc()])
    return Matches(q)
def fingerprint(name):
    name = name.lower()
    for p, r in REPLS.items():
        # print p, r, name
        name = p.sub(r, name)
    name = normalize(name)
    tokens = set([n for n in name.split(" ") if len(n)])
    return unicode(" ".join(sorted(tokens)))
def to_shortname(name):
    # Remove Hector-style extensions.
    name = name.replace("_emissions", "").replace("_concentrations", "")
    normalized_name = normalize(name).replace(" ", "")
    try:
        return mappings[normalized_name]
    except KeyError:
        return name
def normalize_strong(text):
    """Perform heavy normalisation of a given text.

    The goal of this function is not to retain a readable version of the
    given string, but rather to yield a normalised version suitable for
    comparisons and machine analysis.
    """
    return normalize(text, lowercase=True, ascii=True)
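# Usage sketch for the heavy-normalisation helper above, assuming the
# normality package; the example strings are illustrative only.
from normality import normalize

# Accents are stripped, case is folded, and punctuation plus extra
# whitespace collapse to single spaces, so differently formatted names
# end up directly comparable:
print(normalize("Müller,  Jürgen ", lowercase=True, ascii=True))  # expected: 'muller jurgen'
print(normalize("   ", lowercase=True, ascii=True))               # whitespace-only input -> None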
def index_collection(collection, sync=False):
    """Index a collection."""
    if collection.deleted_at is not None:
        return delete_collection(collection.id)
    log.info("Index [%s]: %s", collection.id, collection.label)
    data = get_collection(collection.id)
    text = [data.get('label')]
    text.append(normalize(data.get('label')))
    text.append(normalize(data.get('foreign_id')))
    text.append(normalize(data.get('summary')))
    data['text'] = text
    data.pop('id', None)
    return index_safe(collections_index(), collection.id, data,
                      refresh=refresh_sync(sync))
def match_form(text):
    """Turn a string into a form appropriate for name matching.

    The goal of this function is not to retain a readable version of the
    given string, but rather to yield a normalised version suitable for
    comparisons and machine analysis.
    """
    return normalize(text, lowercase=True, ascii=True)
def clean(mapping, bind, values):
    """Perform several types of string cleaning for titles etc."""
    categories = {'C': ' '}
    for value in values:
        if isinstance(value, six.string_types):
            value = normality.normalize(value, lowercase=False,
                                        collapse=True, decompose=False,
                                        replace_categories=categories)
        yield value
def parse_row(phrases, data):
    lang = data['lang']
    if lang != 'en':
        return
    desc = data['description']
    tokens = normality.normalize(desc).split(' ')
    for i in [2, 3, 4, 5, 6, 7]:
        for ngram in ngrams(tokens, i):
            ngram = ' '.join(ngram)
            phrases[(ngram, i)].add(str(data['case_id']))
def by_name(cls, dataset, name):
    q = cls.query.filter_by(dataset=dataset)
    attr = Entity.name
    if dataset.normalize_text:
        attr = Entity.normalized
        name = normalize(name)
    if dataset.ignore_case:
        attr = func.lower(attr)
        if isinstance(name, basestring):
            name = name.lower()
    q = q.filter(attr == name)
    return q.first()
def jsonify(csvlist, preheadings=None, heading_line=0, data_start_line=1):
    """
    Convert CSV into JSON records using either `preheadings` or
    csvlist[heading_line] as keys. Yields dicts.
    """
    if not preheadings:
        preheadings = csvlist[0]
    headings = [normalize(x).replace(' ', '_') for x in preheadings]
    csvlist_data_only = csvlist[data_start_line:]
    for line in csvlist_data_only:
        yield dict(zip(headings, line))
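# A tiny, self-contained run of the heading normalisation that jsonify()
# above relies on; the sample rows are made up.
from normality import normalize

rows = [["First Name", "Country of Birth"], ["Ada", "UK"], ["Grace", "USA"]]
headings = [normalize(x).replace(' ', '_') for x in rows[0]]
print(headings)  # ['first_name', 'country_of_birth']
print([dict(zip(headings, line)) for line in rows[1:]])
# [{'first_name': 'Ada', 'country_of_birth': 'UK'},
#  {'first_name': 'Grace', 'country_of_birth': 'USA'}]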
def match_form(self, text):
    """Turn a string into a form appropriate for name matching."""
    # The goal of this function is not to retain a readable version of the
    # string, but rather to yield a normalised version suitable for
    # comparisons and machine analysis.
    text = normalize(text, lowercase=True, ascii=True)
    if text is None:
        return
    # TODO: this is a weird heuristic, but to avoid overly aggressive
    # matching it may make sense:
    if ' ' not in text:
        return
    return text.encode('utf-8')
def jsonify(csvlist, preheadings=None, heading_line=0, data_start_line=1):
    """
    Convert CSV into JSON records using either `preheadings` or
    csvlist[heading_line] as keys. Yields dicts.
    """
    if not preheadings:
        preheadings = csvlist[0]
    headings = [normalize(x).replace(" ", "_") for x in preheadings]
    csvlist_data_only = csvlist[data_start_line:]
    for line in csvlist_data_only:
        yield dict(zip(headings, line))
def search_party_names(text):
    if text is None:
        return
    text = PARTIES_SPLIT.split(text)
    text = normalize(text[0])
    parties = set()
    for party, rex in PARTIES_REGEX.items():
        if rex.findall(text):
            parties.add(party)
    if not len(parties):
        return
    parties = ':'.join(sorted(parties))
    return parties
def makeBigrams(word, **scoreOptions):
    '''Normalize set of bigrams into an ordered string to aid processing'''
    # Should probably allow stop words
    # Should probably strip of spaces(?) and punctuation
    process = normalize(word)
    stopwords = scoreOptions.get('stopwords', None)
    if stopwords:
        process = ' '.join(w for w in process.split() if w not in stopwords)
    return ''.join(
        sorted(set(process[i:i + 2] for i in range(len(process) - 1))))
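# Worked example of the sorted-bigram fingerprint built above (assuming
# the normality package); the input strings are illustrative. Both
# spellings normalise to 'acme corp', whose character bigrams sort into
# the same string, so the fingerprints match.
from normality import normalize

def bigram_fingerprint(word):
    text = normalize(word)
    return ''.join(sorted(set(text[i:i + 2] for i in range(len(text) - 1))))

print(bigram_fingerprint("ACME Corp."))
print(bigram_fingerprint("acme   corp"))  # same output as the line above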
def prepare_geonames():
    with io.open(GEONAMES_RAW_PATH, 'r', encoding='utf-8') as fh:
        with shelve.open(GEONAMES_DB_PATH) as db:
            for row in csv.reader(fh, delimiter='\t'):
                country = normalize(row[8])
                if country is None:
                    continue
                names = set(row[3].split(','))
                names.add(row[1])
                names.add(row[2])
                for name in names:
                    name = normalize(name)
                    if name is None:
                        continue
                    countries = db.get(name)
                    if countries:
                        countries.append(country)
                        db[name] = countries
                    else:
                        db[name] = [country]
            for name in db:
                countries = db[name]
                db[name] = max(set(countries), key=countries.count)
def _normalize_names(names):
    """Generate a sequence of comparable names for an entity.

    This also generates a `fingerprint`, i.e. a version of the name where
    all tokens are sorted alphabetically, and some parts, such as company
    suffixes, have been removed."""
    seen = set()
    for name in names:
        plain = normalize(name, ascii=True)
        if plain is not None and plain not in seen:
            seen.add(plain)
            yield plain
        fp = fingerprints.generate(name)
        if fp is not None and len(fp) > 6 and fp not in seen:
            seen.add(fp)
            yield fp
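# Usage sketch for the name expansion above, assuming the normality and
# fingerprints packages; the company name is illustrative only.
import fingerprints
from normality import normalize

name = "Siemens Aktiengesellschaft"
print(normalize(name, ascii=True))  # plain comparable form of the name
print(fingerprints.generate(name))  # token-sorted form with the legal suffix abbreviated, e.g. 'ag siemens'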
def create(cls, dataset, data, account):
    state = EntityState(dataset, None)
    data = EntitySchema().to_python(data, state)
    entity = cls()
    entity.dataset = dataset
    entity.creator = account
    entity.name = data['name']
    entity.normalized = normalize(entity.name)
    entity.attributes = data.get('attributes', {})
    entity.reviewed = data['reviewed']
    entity.invalid = data['invalid']
    entity.canonical = data['canonical']
    db.session.add(entity)
    db.session.flush()
    return entity
def create_filename(row):
    file_type = row["FileType"]
    if file_type == "Translation":
        file_type = "NDC_Translation"
    elif file_type == "Addendum":
        file_type = "NDC_Addendum"
    name = "{}_{}".format(row["Number"], file_type)
    if "revised" in row["Title"].lower():
        name += "_Revised"
    elif "archived" in row["Title"].lower():
        name = "{}_NDC_Archived".format(row["Number"])
    # Special case PSE with multiple Addendums
    if row["Party"] == "State of Palestine":
        if "SPM" in row["OriginalFilename"]:
            name += "_Summary_Policy_Makers"
        elif "Cobenefits" in row["Title"]:
            name += "_Cobenefits"
        elif "Implementation" in row["Title"]:
            name += "_Implementation_Road_Map"
        elif "Approval" in row["Title"]:
            name += "_Approval"
    # Special case PNG with multiple Addendums
    if row["Party"] == "Papua New Guinea":
        if "letter" in row["Title"]:
            name += "_Satisfactory_Letter"
        elif "Explanatory Note" in row["Title"]:
            name += "_Explanatory_Note"
        elif "PA Implementation Act" in row["Title"]:
            name += "_PA_Implementation_Act"
        elif "Management Act" in row["Title"]:
            name += "_Climate_Change_Management_Act"
    code = row["Code"]
    party = normalize(row["Party"], lowercase=False).replace(" ", "-")
    if row["OriginalFilename"].startswith("LV-03-06-EU") and code in eu28:
        code = "EUU"
        party = "European-Union"
    name = "{}_{}_{}_{}.pdf".format(code, party, name, row["Language"])
    return name
def find_matches(dataset, text, filter=None, exclude=None):
    entities = Entity.__table__
    match_text = (normalize(text) or '')[:254]

    # select text column and apply necessary transformations
    text_field = entities.c.name
    if dataset.normalize_text:
        text_field = entities.c.normalized
    if dataset.ignore_case:
        text_field = func.lower(text_field)
    text_field = func.left(text_field, 254)

    # calculate the difference percentage
    min_l = func.greatest(1.0,
                          func.least(len(match_text), func.length(text_field)))
    score = func.greatest(
        0.0,
        ((min_l - func.levenshtein(text_field, match_text)) / min_l) * 100.0)
    score = func.max(score).label('score')

    # coalesce the canonical identifier
    id_ = func.coalesce(entities.c.canonical_id, entities.c.id).label('id')

    # apply filters
    filters = [
        entities.c.dataset_id == dataset.id,
        entities.c.invalid == False
    ]  # noqa
    if not dataset.match_aliases:
        filters.append(entities.c.canonical_id == None)  # noqa
    if exclude is not None:
        filters.append(entities.c.id != exclude)
    if filter is not None:
        filters.append(text_field.ilike('%%%s%%' % filter))

    q = select([id_, score], and_(*filters), [entities],
               group_by=[id_], order_by=[score.desc()])
    return Matches(q)
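# Pure-Python illustration of the scoring formula used in the SQL above:
# score = max(0, ((l - levenshtein(a, b)) / l) * 100), where l is the
# shorter of the two lengths (but at least 1). The example strings are
# made up; the helper functions below are for illustration only.
def levenshtein(a, b):
    # classic dynamic-programming edit distance
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (ca != cb)))
        prev = cur
    return prev[-1]

def match_score(match_text, candidate):
    min_l = max(1.0, min(len(match_text), len(candidate)))
    return max(0.0, ((min_l - levenshtein(match_text, candidate)) / min_l) * 100.0)

print(match_score("acme corp", "acme corp"))   # identical -> 100.0
print(match_score("acme corp", "acme corp."))  # one edit away -> ~88.9
print(match_score("acme corp", "zenith ltd"))  # unrelated -> low score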
def update(self, data, account):
    state = EntityState(self.dataset, self)
    data = EntitySchema().to_python(data, state)
    self.creator = account
    self.name = data['name']
    self.normalized = normalize(self.name)
    self.attributes = data['attributes']
    self.reviewed = data['reviewed']
    self.invalid = data['invalid']
    self.canonical = data['canonical']

    # redirect all aliases of this entity
    if self.canonical:
        if self.canonical.canonical_id:
            if self.canonical.canonical_id == self.id:
                self.canonical.canonical = None
            else:
                self.canonical = self.canonical.canonical
        for alias in self.aliases:
            alias.canonical = self.canonical

    db.session.add(self)
def normalized(self):
    return normalize(self.query)
def value(self, value):
    attr = qualified[self.attribute]
    conv = attr.converter(attr)
    self._value = conv.serialize_safe(value)
    self.normalized = normalize(self._value)
def normalize_label(label):
    return normalize(label, lowercase=True, ascii=True)
# coding: utf-8
from normality import normalize
import csv
import normality

from grano.logic import Loader

rows = ["id", "name", "sort_name", "email", "twitter", "facebook", "group",
        "group_id", "area_id", "area", "chamber", "term", "start_date",
        "end_date", "image", "gender"]

if __name__ == "__main__":
    loader = Loader('senegal', project_label='Senegal')
    person = loader.make_entity("politician")
    with open('senegal/senegal-politicians.csv', 'r') as csvfile:
        records_reader = csv.reader(csvfile, delimiter=",")
        for row in records_reader:
            for x in range(0, len(rows)):
                # print "%s :: %s" % (rows[x], row[x])
                if row[x]:
                    val = normalize(str(row[x]))
                    person.set(rows[x], val)
            person.save()
    loader.persist()
file_type = "Addendum" elif any(sub in filename.lower() for sub in translations): file_type = "Translation" elif filename == "Belarus.pdf": file_type = "Addendum" elif filename == "Liberia_INDC Submission.002.pdf": file_type = "Addendum" elif filename.startswith("Sierra Leone INDC Submission to UNFCCC"): file_type = "Addendum" else: file_type = "INDC" print("{} : {} ({}) : {}".format(name, file_type, language, filename)) clean_filename = "{}_{}_{}_{}.pdf".format( code, normalize(name, lowercase=False).replace(" ", "-"), file_type, language ) if file_type == "Translation": clean_filename = clean_filename.replace( "Translation", "INDC_Translation") elif file_type == "Addendum": clean_filename = clean_filename.replace( "Addendum", "INDC_Addendum") clean_filepath = pdfs_dir / clean_filename indcs.append(OrderedDict([ ("Code", code), ("Party", name),
def fingerprint(name):
    if name is None:
        return
    name = FP_REMOVE.sub(' ', name.strip())
    return normalize(name).replace(' ', '-')
def normalize(cls, text):
    return normalize(text)
    u'respondent_appointee': 'person',
    'claimants_counsel': 'person',
    'respondents_counsel': 'person',
    u'annulment_committee_members': 'person'
}

G = nx.Graph()
nodes = {}
cases = {}
# types = set()
for entity in entities:
    label = entity.get('entity')
    if label in SKIP:
        continue
    key = normalize(label)
    # types.add(entity.get('type'))
    if entity.get('case_url') not in cases:
        cases[entity.get('case_url')] = set()
    cases[entity.get('case_url')].add(key)
    if not G.has_node(key):
        G.add_node(key, label=label, type=entity.get('type'),
                   group=GROUPS[entity.get('type')])
        nodes[key] = label

for case_url, involved in cases.items():
    # print case_url, len(involved)
    for (s, t) in combinations(involved, 2):
def normalize(self, text, **kwargs):
    """Normalize for comparison."""
    ids = super(IdentifierType, self).normalize(text, **kwargs)
    return [normalize(i) for i in ids]
def index_collection(collection, sync=False):
    """Index a collection."""
    if collection.deleted_at is not None:
        return delete_collection(collection.id)

    data = {
        'foreign_id': collection.foreign_id,
        'created_at': collection.created_at,
        'updated_at': collection.updated_at,
        'label': collection.label,
        'kind': collection.kind,
        'summary': collection.summary,
        'category': Collection.DEFAULT,
        'publisher': collection.publisher,
        'publisher_url': collection.publisher_url,
        'info_url': collection.info_url,
        'data_url': collection.data_url,
        'casefile': collection.casefile,
        'secret': collection.secret,
        'collection_id': collection.id,
        'schemata': {},
        'team': []
    }
    texts = [v for v in data.values() if isinstance(v, str)]

    if collection.category in Collection.CATEGORIES:
        data['category'] = collection.category

    if collection.creator is not None:
        data['creator'] = {
            'id': collection.creator.id,
            'type': collection.creator.type,
            'name': collection.creator.name
        }
        texts.append(collection.creator.name)

    for role in collection.team:
        data['team'].append({
            'id': role.id,
            'type': role.type,
            'name': role.name
        })
        texts.append(role.name)

    stats = get_collection_stats(collection.id)
    data['count'] = stats['count']

    # expose entities by schema count.
    thing = model.get(Entity.THING)
    for schema, count in stats['schemata'].items():
        schema = model.get(schema)
        if schema is not None and schema.is_a(thing):
            data['schemata'][schema.name] = count

    # if no countries or langs are given, take the most common from the data.
    countries = ensure_list(collection.countries)
    countries = countries or stats['countries'].keys()
    data['countries'] = registry.country.normalize_set(countries)
    languages = ensure_list(collection.languages)
    languages = languages or stats['languages'].keys()
    data['languages'] = registry.language.normalize_set(languages)

    texts.extend([normalize(t, ascii=True) for t in texts])
    data['text'] = index_form(texts)
    return index_safe(collections_index(), collection.id, data,
                      refresh=sync)
def name_tokens(name):
    name = normality.normalize(name, latinize=True)
    # if len(name) > 2:
    #     return [name]
    # return []
    return [n for n in name.split(' ') if len(n)]
def text_norm(text):
    return normalize(text, ascii=True)