Example #1
def index_collection(collection, sync=False):
    """Index a collection."""
    if collection.deleted_at is not None:
        return delete_collection(collection.id)

    data = get_collection(collection.id)
    if data is None:
        return

    log.info(
        "[%s] Index: %s (%s things)...",
        collection,
        data.get("label"),
        data.get("count"),
    )
    text = [data.get("label")]
    text.append(normalize(data.get("label")))
    text.append(normalize(data.get("foreign_id")))
    text.append(normalize(data.get("summary")))
    data["text"] = text
    data.pop("id", None)
    return index_safe(collections_index(),
                      collection.id,
                      data,
                      refresh=refresh_sync(sync))
Example #2
 def test_empty(self):
     self.assertEqual(None, slugify(None))
     self.assertEqual(None, ascii_text(None))
     self.assertEqual(None, latinize_text(None))
     self.assertEqual(None, normalize(None))
     self.assertEqual(None, normalize(''))
     self.assertEqual(None, normalize(' '))
Example #3
    def __init__(self, corpus_path):
        self.automaton = ahocorasick.Automaton()

        log.info("Building country automaton...")
        names_count = 0
        with io.open(corpus_path, 'r', encoding='utf-8') as fh:
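            # Assumption: the corpus is a GeoNames-style TSV where column 1 is
            # the place name, 2 the ASCII name, 3 alternate names, 7 the
            # feature code and 8 the country code.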
            for row in csv.reader(fh, delimiter='\t'):
                if row[7] in self.FEATURES:
                    continue

                country = normalize(row[8])
                if country is None:
                    continue

                names = set(row[3].split(','))
                names.add(row[1])
                names.add(row[2])

                for name in names:
                    name = normalize(name)
                    if name is None or len(name) < 4:
                        continue
                    names_count += 1
                    self.automaton.add_word(name, country)

        self.automaton.make_automaton()
        log.info("...done: %s names", names_count)
Example #4
def make_csv_file_name(meta, table, out_folder):
    bank_name = normalize(meta['BankName'], lowercase=False)
    if bank_name is None:
        bank_name = 'Untitled Database'
    table_abbr = normalize(table['abbr'], lowercase=False)
    table_name = normalize(table['name'], lowercase=False)
    file_name = '%s - %s - %s.csv' % (bank_name, table_abbr, table_name)
    return os.path.join(out_folder, file_name)
Example #5
def make_csv_file_name(meta, table, out_folder):
    bank_name = normalize(meta['BankName'], lowercase=False)
    if bank_name is None:
        bank_name = 'Untitled Database'
    table_abbr = normalize(table['abbr'], lowercase=False)
    table_name = normalize(table['name'], lowercase=False)
    file_name = '%s - %s - %s.csv' % (bank_name, table_abbr, table_name)
    return os.path.join(out_folder, file_name)
Example #6
 def match_prefix(self, prefix):
     prefix = normalize(prefix)
     if not self.abstract:
         if normalize(self.name).startswith(prefix):
             return True
         elif normalize(self.label).startswith(prefix):
             return True
     return False
Example #7
def compare_names(left, right):
    left_list = [normalize(n, latinize=True) for n in left.names]
    right_list = [normalize(n, latinize=True) for n in right.names]
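    # Best Levenshtein ratio over every pair of normalized names; max() raises
    # ValueError when either side has no names, hence the fallback to 0.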
    try:
        return max(
            Levenshtein.ratio(left, right)
            for left, right in IT.product(left_list, right_list))
    except ValueError:
        return 0
Example #8
def compare_names(left, right):
    result = 0
    left_list = [normalize(n, latinize=True) for n in left.names]
    right_list = [normalize(n, latinize=True) for n in right.names]
    for (left, right) in itertools.product(left_list, right_list):
        similarity = jaro(left, right)
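        # Assumption about the helpers: shortest() returns the shorter of the
        # two strings, and dampen(2, 20, ...) scales the score down for very
        # short names.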
        score = similarity * dampen(2, 20, shortest(left, right))
        result = max(result, score)
    return result
Example #9
def text_score(match, candidates):
    if isinstance(candidates, basestring):
        candidates = [candidates]
    match_n = normalize(match)
    best_score = 0
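    # Keep the best Jaro-Winkler similarity across all candidates, scaled to 0-100.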
    for candidate in candidates:
        cand_n = normalize(candidate)
        score = jaro_winkler(match_n, cand_n, 0.02) * 100
        best_score = max(int(score), best_score)
    return best_score
Example #10
    def ExtractCountries(self, request_iterator, context):
        country_tags = []
        word_count = 0

        for text_obj in request_iterator:
            text = normalize(text_obj.text)
            if text is None:
                continue

            word_count += len(text.split())
            for index, country in self.automaton.iter(text):
                # log.debug("Matched: %s -> %s", name, country)
                country_tags.append(country)

        doc_tags = []
        co_counts = Counter(country_tags)
        top_n = co_counts.most_common(self.MAX_TAGS)

        for tag_num in range(1, self.MAX_TAGS + 1):
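            # Keep a country only if its mention frequency relative to the
            # total word count clears TAG_FREQUENCY_CUT.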
            if len(top_n) >= tag_num:
                freq = top_n[tag_num - 1][1] / max(1, word_count)
                log.info('tag is %s, freq = %.2f', top_n[tag_num - 1], freq)
                if freq >= self.TAG_FREQUENCY_CUT:
                    doc_tags.append(top_n[tag_num - 1][0])

        return CountryTags(countries=doc_tags)
Example #11
 def transform(self, source, target):
     text = source.data()
     text = normalize(text,
                      lowercase=self.config.get('lowercase', True),
                      transliterate=self.config.get('transliterate', False),
                      collapse=self.config.get('collapse', True))
     target.save_data(text.encode('utf-8'))
Example #12
def find_matches(dataset, text, filter=None, exclude=None):
    entities = Entity.__table__
    match_text = normalize(text, dataset)[:254]

    # select text column and apply necessary transformations
    text_field = entities.c.name
    if dataset.normalize_text:
        text_field = entities.c.normalized
    if dataset.ignore_case:
        text_field = func.lower(text_field)
    text_field = func.left(text_field, 254)

    # calculate the difference percentage
    l = func.greatest(1.0, func.least(len(match_text), func.length(text_field)))
    score = func.greatest(0.0, ((l - func.levenshtein(text_field, match_text)) / l) * 100.0)
    score = func.max(score).label("score")

    # coalesce the canonical identifier
    id_ = func.coalesce(entities.c.canonical_id, entities.c.id).label("id")

    # apply filters
    filters = [entities.c.dataset_id == dataset.id, entities.c.invalid == False]  # noqa
    if not dataset.match_aliases:
        filters.append(entities.c.canonical_id == None)  # noqa
    if exclude is not None:
        filters.append(entities.c.id != exclude)
    if filter is not None:
        filters.append(text_field.ilike("%%%s%%" % filter))

    q = select([id_, score], and_(*filters), [entities], group_by=[id_], order_by=[score.desc()])
    return Matches(q)
Example #13
def fingerprint(name):
    name = name.lower()
    for p, r in REPLS.items():
        # print p, r, name
        name = p.sub(r, name)
    name = normalize(name)
    tokens = set([n for n in name.split(" ") if len(n)])
    return unicode(" ".join(sorted(tokens)))
Example #14
def to_shortname(name):
    # Remove Hector-style extensions.
    name = name.replace("_emissions", "").replace("_concentrations", "")
    normalized_name = normalize(name).replace(" ", "")
    try:
        return mappings[normalized_name]
    except KeyError:
        return name
Example #15
def normalize_strong(text):
    """Perform heavy normalisation of a given text.

    The goal of this function is not to retain a readable version of the given
    string, but rather to yield a normalised version suitable for comparisons
    and machine analysis.
    """
    return normalize(text, lowercase=True, ascii=True)
Example #16
def index_collection(collection, sync=False):
    """Index a collection."""
    if collection.deleted_at is not None:
        return delete_collection(collection.id)

    log.info("Index [%s]: %s", collection.id, collection.label)
    data = get_collection(collection.id)
    text = [data.get('label')]
    text.append(normalize(data.get('label')))
    text.append(normalize(data.get('foreign_id')))
    text.append(normalize(data.get('summary')))
    data['text'] = text
    data.pop('id', None)
    return index_safe(collections_index(),
                      collection.id,
                      data,
                      refresh=refresh_sync(sync))
Example #17
def match_form(text):
    """Turn a string into a form appropriate for name matching.

    The goal of this function is not to retain a readable version of the given
    string, but rather to yield a normalised version suitable for comparisons
    and machine analysis.
    """
    return normalize(text, lowercase=True, ascii=True)
Example #18
def clean(mapping, bind, values):
    """ Perform several types of string cleaning for titles etc.. """
    categories = {'C': ' '}
    for value in values:
        if isinstance(value, six.string_types):
            value = normality.normalize(value, lowercase=False, collapse=True,
                                        decompose=False,
                                        replace_categories=categories)
        yield value
Example #19
def parse_row(phrases, data):
    lang = data['lang']
    if lang != 'en':
        return

    desc = data['description']
    tokens = normality.normalize(desc).split(' ')
    for i in [2, 3, 4, 5, 6, 7]:
        for ngram in ngrams(tokens, i):
            ngram = ' '.join(ngram)
            phrases[(ngram, i)].add(str(data['case_id']))
Example #20
def clean(mapping, bind, values):
    """ Perform several types of string cleaning for titles etc.. """
    categories = {'C': ' '}
    for value in values:
        if isinstance(value, six.string_types):
            value = normality.normalize(value,
                                        lowercase=False,
                                        collapse=True,
                                        decompose=False,
                                        replace_categories=categories)
        yield value
Example #21
 def by_name(cls, dataset, name):
     q = cls.query.filter_by(dataset=dataset)
     attr = Entity.name
     if dataset.normalize_text:
         attr = Entity.normalized
         name = normalize(name)
     if dataset.ignore_case:
         attr = func.lower(attr)
         if isinstance(name, basestring):
             name = name.lower()
     q = q.filter(attr == name)
     return q.first()
Example #22
 def by_name(cls, dataset, name):
     q = cls.query.filter_by(dataset=dataset)
     attr = Entity.name
     if dataset.normalize_text:
         attr = Entity.normalized
         name = normalize(name)
     if dataset.ignore_case:
         attr = func.lower(attr)
         if isinstance(name, basestring):
             name = name.lower()
     q = q.filter(attr == name)
     return q.first()
Example #23
def jsonify(csvlist, preheadings=None, heading_line=0, data_start_line=1):
    """
    Convert CSV into JSON,
    using either `preheadings` or csvlist[heading_line]
    as keys. Yields a generator of dicts.
    """
    if not preheadings:
        preheadings = csvlist[0]
        headings = [normalize(x).replace(' ', '_') for x in preheadings]

    csvlist_data_only = csvlist[data_start_line:]
    for line in csvlist_data_only:
        yield dict(zip(headings, line))
Example #24
 def match_form(self, text):
     """Turn a string into a form appropriate for name matching."""
     # The goal of this function is not to retain a readable version of the
     # string, but rather to yield a normalised version suitable for
     # comparisons and machine analysis.
     text = normalize(text, lowercase=True, ascii=True)
     if text is None:
         return
     # TODO: this is a weird heuristic, but to avoid overly aggressive
     # matching it may make sense:
     if ' ' not in text:
         return
     return text.encode('utf-8')
Example #25
def jsonify(csvlist, preheadings=None, heading_line=0, data_start_line=1):
    """
    Convert CSV into JSON,
    using either `preheadings` or csvlist[heading_line]
    as keys. Yields a generator of dicts.
    """
    if not preheadings:
        preheadings = csvlist[0]
        headings = [normalize(x).replace(" ", "_") for x in preheadings]

    csvlist_data_only = csvlist[data_start_line:]
    for line in csvlist_data_only:
        yield dict(zip(headings, line))
Example #26
def search_party_names(text):
    if text is None:
        return
    text = PARTIES_SPLIT.split(text)
    text = normalize(text[0])
    parties = set()
    for party, rex in PARTIES_REGEX.items():
        if rex.findall(text):
            parties.add(party)
    if not len(parties):
        return
    parties = ':'.join(sorted(parties))
    return parties
Example #27
def makeBigrams(word, **scoreOptions):
    '''
    Normalize set of bigrams into an ordered string to aid processing
    '''
    # Should probably allow stop words
    # Should probably strip off spaces(?) and punctuation
    process = normalize(word)
    stopwords = scoreOptions.get('stopwords', None)
    if stopwords:
        process = ' '.join(w for w in process.split() if w not in stopwords)

    return ''.join(
        sorted(set(process[i:i + 2] for i in range(len(process) - 1))))
Example #28
def search_party_names(text):
    if text is None:
        return
    text = PARTIES_SPLIT.split(text)
    text = normalize(text[0])
    parties = set()
    for party, rex in PARTIES_REGEX.items():
        if rex.findall(text):
            parties.add(party)
    if not len(parties):
        return
    parties = ':'.join(sorted(parties))
    return parties
Example #29
def prepare_geonames():
    with io.open(GEONAMES_RAW_PATH, 'r', encoding='utf-8') as fh:
        with shelve.open(GEONAMES_DB_PATH) as db:
            for row in csv.reader(fh, delimiter='\t'):
                country = normalize(row[8])
                if country is None:
                    continue
                names = set(row[3].split(','))
                names.add(row[1])
                names.add(row[2])
                for name in names:
                    name = normalize(name)
                    if name is None:
                        continue
                    countries = db.get(name)
                    if countries:
                        countries.append(country)
                        db[name] = countries
                    else:
                        db[name] = [country]
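            # Collapse each name to the country it was most frequently seen with.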
            for name in db:
                countries = db[name]
                db[name] = max(set(countries), key=countries.count)
Example #30
def _normalize_names(names):
    """Generate a sequence of comparable names for an entity. This also
    generates a `fingerprint`, i.e. a version of the name where all tokens
    are sorted alphabetically, and some parts, such as company suffixes,
    have been removed."""
    seen = set()
    for name in names:
        plain = normalize(name, ascii=True)
        if plain is not None and plain not in seen:
            seen.add(plain)
            yield plain
        fp = fingerprints.generate(name)
        if fp is not None and len(fp) > 6 and fp not in seen:
            seen.add(fp)
            yield fp
Example #31
 def create(cls, dataset, data, account):
     state = EntityState(dataset, None)
     data = EntitySchema().to_python(data, state)
     entity = cls()
     entity.dataset = dataset
     entity.creator = account
     entity.name = data['name']
     entity.normalized = normalize(entity.name)
     entity.attributes = data.get('attributes', {})
     entity.reviewed = data['reviewed']
     entity.invalid = data['invalid']
     entity.canonical = data['canonical']
     db.session.add(entity)
     db.session.flush()
     return entity
Example #32
 def create(cls, dataset, data, account):
     state = EntityState(dataset, None)
     data = EntitySchema().to_python(data, state)
     entity = cls()
     entity.dataset = dataset
     entity.creator = account
     entity.name = data['name']
     entity.normalized = normalize(entity.name)
     entity.attributes = data.get('attributes', {})
     entity.reviewed = data['reviewed']
     entity.invalid = data['invalid']
     entity.canonical = data['canonical']
     db.session.add(entity)
     db.session.flush()
     return entity
Example #33
def create_filename(row):
    file_type = row["FileType"]
    if file_type == "Translation":
        file_type = "NDC_Translation"
    elif file_type == "Addendum":
        file_type = "NDC_Addendum"
    name = "{}_{}".format(row["Number"], file_type)

    if "revised" in row["Title"].lower():
        name += "_Revised"
    elif "archived" in row["Title"].lower():
        name = "{}_NDC_Archived".format(row["Number"])

    # Special case PSE with multiple Addendums
    if row["Party"] == "State of Palestine":
        if "SPM" in row["OriginalFilename"]:
            name += "_Summary_Policy_Makers"
        elif "Cobenefits" in row["Title"]:
            name += "_Cobenefits"
        elif "Implementation" in row["Title"]:
            name += "_Implementation_Road_Map"
        elif "Approval" in row["Title"]:
            name += "_Approval"

    # Special case PNG with multiple Addendums
    if row["Party"] == "Papua New Guinea":
        if "letter" in row["Title"]:
            name += "_Satisfactory_Letter"
        elif "Explanatory Note" in row["Title"]:
            name += "_Explanatory_Note"
        elif "PA Implementation Act" in row["Title"]:
            name += "_PA_Implementation_Act"
        elif "Management Act" in row["Title"]:
            name += "_Climate_Change_Management_Act"

    code = row["Code"]
    party = normalize(row["Party"], lowercase=False).replace(" ", "-")
    if row["OriginalFilename"].startswith("LV-03-06-EU") and code in eu28:
        code = "EUU"
        party = "European-Union"
    name = "{}_{}_{}_{}.pdf".format(code, party, name, row["Language"])
    return name
Example #34
def find_matches(dataset, text, filter=None, exclude=None):
    entities = Entity.__table__
    match_text = (normalize(text) or '')[:254]

    # select text column and apply necessary transformations
    text_field = entities.c.name
    if dataset.normalize_text:
        text_field = entities.c.normalized
    if dataset.ignore_case:
        text_field = func.lower(text_field)
    text_field = func.left(text_field, 254)

    # calculate the difference percentage
    min_l = func.greatest(1.0,
                          func.least(len(match_text), func.length(text_field)))
    score = func.greatest(
        0.0,
        ((min_l - func.levenshtein(text_field, match_text)) / min_l) * 100.0)
    score = func.max(score).label('score')

    # coalesce the canonical identifier
    id_ = func.coalesce(entities.c.canonical_id, entities.c.id).label('id')

    # apply filters
    filters = [
        entities.c.dataset_id == dataset.id, entities.c.invalid == False
    ]  # noqa
    if not dataset.match_aliases:
        filters.append(entities.c.canonical_id == None)  # noqa
    if exclude is not None:
        filters.append(entities.c.id != exclude)
    if filter is not None:
        filters.append(text_field.ilike('%%%s%%' % filter))

    q = select([id_, score],
               and_(*filters), [entities],
               group_by=[id_],
               order_by=[score.desc()])
    return Matches(q)
Example #35
    def update(self, data, account):
        state = EntityState(self.dataset, self)
        data = EntitySchema().to_python(data, state)
        self.creator = account
        self.name = data['name']
        self.normalized = normalize(self.name)
        self.attributes = data['attributes']
        self.reviewed = data['reviewed']
        self.invalid = data['invalid']
        self.canonical = data['canonical']

        # redirect all aliases of this entity
        if self.canonical:
            if self.canonical.canonical_id:
                if self.canonical.canonical_id == self.id:
                    self.canonical.canonical = None
                else:
                    self.canonical = self.canonical.canonical

            for alias in self.aliases:
                alias.canonical = self.canonical

        db.session.add(self)
Example #36
    def update(self, data, account):
        state = EntityState(self.dataset, self)
        data = EntitySchema().to_python(data, state)
        self.creator = account
        self.name = data['name']
        self.normalized = normalize(self.name)
        self.attributes = data['attributes']
        self.reviewed = data['reviewed']
        self.invalid = data['invalid']
        self.canonical = data['canonical']

        # redirect all aliases of this entity
        if self.canonical:
            if self.canonical.canonical_id:
                if self.canonical.canonical_id == self.id:
                    self.canonical.canonical = None
                else:
                    self.canonical = self.canonical.canonical

            for alias in self.aliases:
                alias.canonical = self.canonical

        db.session.add(self)
Example #37
 def normalized(self):
     return normalize(self.query)
Example #38
 def value(self, value):
     attr = qualified[self.attribute]
     conv = attr.converter(attr)
     self._value = conv.serialize_safe(value)
     self.normalized = normalize(self._value)
Example #39
File: util.py Project: pudo/aleph
def normalize_label(label):
    return normalize(label, lowercase=True, ascii=True)
Example #40
# coding: utf-8
from normality import normalize
import csv
import normality
from grano.logic import Loader
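# Column order of the source CSV; each value below is normalized and copied
# onto the entity under the matching field name.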
rows = ["id", "name", "sort_name", "email", "twitter", "facebook", "group", "group_id", "area_id", "area", "chamber", "term", "start_date", "end_date", "image", "gender"]


if __name__ == "__main__":

    loader = Loader('senegal', project_label='Senegal')
    person = loader.make_entity("politician")


    with open('senegal/senegal-politicians.csv', 'r') as csvfile:
        records_reader = csv.reader(csvfile, delimiter=",")
        for row in records_reader:
            for x in range(0, len(rows)):
                #print "%s :: %s" % (rows[x], row[x])
                if row[x]:
                    val = normalize(str(row[x]))
                    person.set(rows[x], val)
            person.save()
        loader.persist()
Example #41
            file_type = "Addendum"
        elif any(sub in filename.lower() for sub in translations):
            file_type = "Translation"
        elif filename == "Belarus.pdf":
            file_type = "Addendum"
        elif filename == "Liberia_INDC Submission.002.pdf":
            file_type = "Addendum"
        elif filename.startswith("Sierra Leone INDC Submission to UNFCCC"):
            file_type = "Addendum"
        else:
            file_type = "INDC"
        print("{} : {} ({}) : {}".format(name, file_type, language, filename))

        clean_filename = "{}_{}_{}_{}.pdf".format(
            code,
            normalize(name, lowercase=False).replace(" ", "-"),
            file_type,
            language
        )
        if file_type == "Translation":
            clean_filename = clean_filename.replace(
                "Translation", "INDC_Translation")
        elif file_type == "Addendum":
            clean_filename = clean_filename.replace(
                "Addendum", "INDC_Addendum")

        clean_filepath = pdfs_dir / clean_filename

        indcs.append(OrderedDict([
            ("Code", code),
            ("Party", name),
Example #42
def fingerprint(name):
    if name is None:
        return
    name = FP_REMOVE.sub(' ', name.strip())
    return normalize(name).replace(' ', '-')
Example #43
 def normalize(cls, text):
     return normalize(text)
Example #44
    u'respondent_appointee': 'person',
    'claimants_counsel': 'person',
    'respondents_counsel': 'person',
    u'annulment_committee_members': 'person'
}

G = nx.Graph()

nodes = {}
cases = {}
# types = set()
for entity in entities:
    label = entity.get('entity')
    if label in SKIP:
        continue
    key = normalize(label)
    # types.add(entity.get('type'))
    if entity.get('case_url') not in cases:
        cases[entity.get('case_url')] = set()
    cases[entity.get('case_url')].add(key)

    if not G.has_node(key):
        G.add_node(key,
                   label=label,
                   type=entity.get('type'),
                   group=GROUPS[entity.get('type')])
    nodes[key] = label

for case_url, involved in cases.items():
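    # Connect every pair of entities that co-occur in the same case.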
    # print case_url, len(involved)
    for (s, t) in combinations(involved, 2):
Example #45
 def normalize(self, text, **kwargs):
     """Normalize for comparison."""
     ids = super(IdentifierType, self).normalize(text, **kwargs)
     return [normalize(i) for i in ids]
Example #46
 def transform(self, source, target):
     text = source.data()
     text = normalize(text, lowercase=self.config.get('lowercase', True),
                      transliterate=self.config.get('transliterate', False),
                      collapse=self.config.get('collapse', True))
     target.save_data(text.encode('utf-8'))
Example #47
def index_collection(collection, sync=False):
    """Index a collection."""
    if collection.deleted_at is not None:
        return delete_collection(collection.id)

    data = {
        'foreign_id': collection.foreign_id,
        'created_at': collection.created_at,
        'updated_at': collection.updated_at,
        'label': collection.label,
        'kind': collection.kind,
        'summary': collection.summary,
        'category': Collection.DEFAULT,
        'publisher': collection.publisher,
        'publisher_url': collection.publisher_url,
        'info_url': collection.info_url,
        'data_url': collection.data_url,
        'casefile': collection.casefile,
        'secret': collection.secret,
        'collection_id': collection.id,
        'schemata': {},
        'team': []
    }
    texts = [v for v in data.values() if isinstance(v, str)]

    if collection.category in Collection.CATEGORIES:
        data['category'] = collection.category

    if collection.creator is not None:
        data['creator'] = {
            'id': collection.creator.id,
            'type': collection.creator.type,
            'name': collection.creator.name
        }
        texts.append(collection.creator.name)

    for role in collection.team:
        data['team'].append({
            'id': role.id,
            'type': role.type,
            'name': role.name
        })
        texts.append(role.name)

    stats = get_collection_stats(collection.id)
    data['count'] = stats['count']

    # expose entities by schema count.
    thing = model.get(Entity.THING)
    for schema, count in stats['schemata'].items():
        schema = model.get(schema)
        if schema is not None and schema.is_a(thing):
            data['schemata'][schema.name] = count

    # if no countries or langs are given, take the most common from the data.
    countries = ensure_list(collection.countries)
    countries = countries or stats['countries'].keys()
    data['countries'] = registry.country.normalize_set(countries)

    languages = ensure_list(collection.languages)
    languages = languages or stats['languages'].keys()
    data['languages'] = registry.language.normalize_set(languages)

    texts.extend([normalize(t, ascii=True) for t in texts])
    data['text'] = index_form(texts)
    return index_safe(collections_index(), collection.id, data, refresh=sync)
Example #48
def name_tokens(name):
    name = normality.normalize(name, latinize=True)
    # if len(name) > 2:
    #     return [name]
    # return []
    return [n for n in name.split(' ') if len(n)]
Example #49
def text_norm(text):
    return normalize(text, ascii=True)
Example #50
File: alert.py Project: pudo/aleph
 def normalized(self):
     return normalize(self.query)