Example #1
def index_collection(collection, sync=False):
    """Index a collection."""
    if collection.deleted_at is not None:
        return delete_collection(collection.id)

    data = get_collection(collection.id)
    if data is None:
        return

    log.info(
        "[%s] Index: %s (%s things)...",
        collection,
        data.get("label"),
        data.get("count"),
    )
    text = [data.get("label")]
    text.append(normalize(data.get("label")))
    text.append(normalize(data.get("foreign_id")))
    text.append(normalize(data.get("summary")))
    data["text"] = text
    data.pop("id", None)
    return index_safe(collections_index(),
                      collection.id,
                      data,
                      refresh=refresh_sync(sync))
Example #2
 def test_empty(self):
     self.assertEqual(None, slugify(None))
     self.assertEqual(None, ascii_text(None))
     self.assertEqual(None, latinize_text(None))
     self.assertEqual(None, normalize(None))
     self.assertEqual(None, normalize(''))
     self.assertEqual(None, normalize(' '))
Example #3
    def __init__(self, corpus_path):
        self.automaton = ahocorasick.Automaton()

        log.info("Building country automaton...")
        names_count = 0
        with io.open(corpus_path, 'r', encoding='utf-8') as fh:
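            # Assumption: the corpus is a GeoNames-style TSV where column 1 is
            # the place name, 2 the ASCII name, 3 alternate names, 7 the
            # feature code and 8 the country code.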
            for row in csv.reader(fh, delimiter='\t'):
                if row[7] in self.FEATURES:
                    continue

                country = normalize(row[8])
                if country is None:
                    continue

                names = set(row[3].split(','))
                names.add(row[1])
                names.add(row[2])

                for name in names:
                    name = normalize(name)
                    if name is None or len(name) < 4:
                        continue
                    names_count += 1
                    self.automaton.add_word(name, country)

        self.automaton.make_automaton()
        log.info("...done: %s names", names_count)
Example #4
def make_csv_file_name(meta, table, out_folder):
    bank_name = normalize(meta['BankName'], lowercase=False)
    if bank_name is None:
        bank_name = 'Untitled Database'
    table_abbr = normalize(table['abbr'], lowercase=False)
    table_name = normalize(table['name'], lowercase=False)
    file_name = '%s - %s - %s.csv' % (bank_name, table_abbr, table_name)
    return os.path.join(out_folder, file_name)
Example #5
def make_csv_file_name(meta, table, out_folder):
    bank_name = normalize(meta['BankName'], lowercase=False)
    if bank_name is None:
        bank_name = 'Untitled Database'
    table_abbr = normalize(table['abbr'], lowercase=False)
    table_name = normalize(table['name'], lowercase=False)
    file_name = '%s - %s - %s.csv' % (bank_name, table_abbr, table_name)
    return os.path.join(out_folder, file_name)
Example #6
 def match_prefix(self, prefix):
     prefix = normalize(prefix)
     if not self.abstract:
         if normalize(self.name).startswith(prefix):
             return True
         elif normalize(self.label).startswith(prefix):
             return True
     return False
Example #7
def compare_names(left, right):
    left_list = [normalize(n, latinize=True) for n in left.names]
    right_list = [normalize(n, latinize=True) for n in right.names]
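    # Best Levenshtein ratio over every pair of normalized names; max() raises
    # ValueError when either side has no names, hence the fallback to 0.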
    try:
        return max(
            Levenshtein.ratio(left, right)
            for left, right in IT.product(left_list, right_list))
    except ValueError:
        return 0
Example #8
def compare_names(left, right):
    result = 0
    left_list = [normalize(n, latinize=True) for n in left.names]
    right_list = [normalize(n, latinize=True) for n in right.names]
    for (left, right) in itertools.product(left_list, right_list):
        similarity = jaro(left, right)
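        # Assumption about the helpers: shortest() returns the shorter of the
        # two strings, and dampen(2, 20, ...) scales the score down for very
        # short names.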
        score = similarity * dampen(2, 20, shortest(left, right))
        result = max(result, score)
    return result
Example #9
def text_score(match, candidates):
    if isinstance(candidates, basestring):
        candidates = [candidates]
    match_n = normalize(match)
    best_score = 0
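    # Keep the best Jaro-Winkler similarity across all candidates, scaled to 0-100.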
    for candidate in candidates:
        cand_n = normalize(candidate)
        score = jaro_winkler(match_n, cand_n, 0.02) * 100
        best_score = max(int(score), best_score)
    return best_score
Example #10
    def ExtractCountries(self, request_iterator, context):
        country_tags = []
        word_count = 0

        for text_obj in request_iterator:
            text = normalize(text_obj.text)
            if text is None:
                continue

            word_count += len(text.split())
            for index, country in self.automaton.iter(text):
                # log.debug("Matched: %s -> %s", name, country)
                country_tags.append(country)

        doc_tags = []
        co_counts = Counter(country_tags)
        top_n = co_counts.most_common(self.MAX_TAGS)

        for tag_num in range(1, self.MAX_TAGS + 1):
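            # Keep a country only if its mention frequency relative to the
            # total word count clears TAG_FREQUENCY_CUT.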
            if len(top_n) >= tag_num:
                freq = top_n[tag_num - 1][1] / max(1, word_count)
                log.info('tag is %s, freq = %.2f', top_n[tag_num - 1], freq)
                if freq >= self.TAG_FREQUENCY_CUT:
                    doc_tags.append(top_n[tag_num - 1][0])

        return CountryTags(countries=doc_tags)
Example #11
 def transform(self, source, target):
     text = source.data()
     text = normalize(text,
                      lowercase=self.config.get('lowercase', True),
                      transliterate=self.config.get('transliterate', False),
                      collapse=self.config.get('collapse', True))
     target.save_data(text.encode('utf-8'))
Example #12
def find_matches(dataset, text, filter=None, exclude=None):
    entities = Entity.__table__
    match_text = normalize(text, dataset)[:254]

    # select text column and apply necessary transformations
    text_field = entities.c.name
    if dataset.normalize_text:
        text_field = entities.c.normalized
    if dataset.ignore_case:
        text_field = func.lower(text_field)
    text_field = func.left(text_field, 254)

    # calculate the difference percentage
    l = func.greatest(1.0, func.least(len(match_text), func.length(text_field)))
    score = func.greatest(0.0, ((l - func.levenshtein(text_field, match_text)) / l) * 100.0)
    score = func.max(score).label("score")

    # coalesce the canonical identifier
    id_ = func.coalesce(entities.c.canonical_id, entities.c.id).label("id")

    # apply filters
    filters = [entities.c.dataset_id == dataset.id, entities.c.invalid == False]  # noqa
    if not dataset.match_aliases:
        filters.append(entities.c.canonical_id == None)  # noqa
    if exclude is not None:
        filters.append(entities.c.id != exclude)
    if filter is not None:
        filters.append(text_field.ilike("%%%s%%" % filter))

    q = select([id_, score], and_(*filters), [entities], group_by=[id_], order_by=[score.desc()])
    return Matches(q)
Example #13
def fingerprint(name):
    name = name.lower()
    for p, r in REPLS.items():
        # print p, r, name
        name = p.sub(r, name)
    name = normalize(name)
    tokens = set([n for n in name.split(" ") if len(n)])
    return unicode(" ".join(sorted(tokens)))
Example #14
def to_shortname(name):
    # Remove Hector-style extensions.
    name = name.replace("_emissions", "").replace("_concentrations", "")
    normalized_name = normalize(name).replace(" ", "")
    try:
        return mappings[normalized_name]
    except KeyError:
        return name
Example #15
def normalize_strong(text):
    """Perform heavy normalisation of a given text.

    The goal of this function is not to retain a readable version of the given
    string, but rather to yield a normalised version suitable for comparisons
    and machine analysis.
    """
    return normalize(text, lowercase=True, ascii=True)
Example #16
def index_collection(collection, sync=False):
    """Index a collection."""
    if collection.deleted_at is not None:
        return delete_collection(collection.id)

    log.info("Index [%s]: %s", collection.id, collection.label)
    data = get_collection(collection.id)
    text = [data.get('label')]
    text.append(normalize(data.get('label')))
    text.append(normalize(data.get('foreign_id')))
    text.append(normalize(data.get('summary')))
    data['text'] = text
    data.pop('id', None)
    return index_safe(collections_index(),
                      collection.id,
                      data,
                      refresh=refresh_sync(sync))
Example #17
def match_form(text):
    """Turn a string into a form appropriate for name matching.

    The goal of this function is not to retain a readable version of the given
    string, but rather to yield a normalised version suitable for comparisons
    and machine analysis.
    """
    return normalize(text, lowercase=True, ascii=True)
Example #18
def clean(mapping, bind, values):
    """ Perform several types of string cleaning for titles etc.. """
    categories = {'C': ' '}
    for value in values:
        if isinstance(value, six.string_types):
            value = normality.normalize(value, lowercase=False, collapse=True,
                                        decompose=False,
                                        replace_categories=categories)
        yield value
Example #19
def parse_row(phrases, data):
    lang = data['lang']
    if lang != 'en':
        return

    desc = data['description']
    tokens = normality.normalize(desc).split(' ')
    for i in [2, 3, 4, 5, 6, 7]:
        for ngram in ngrams(tokens, i):
            ngram = ' '.join(ngram)
            phrases[(ngram, i)].add(str(data['case_id']))
Example #20
def clean(mapping, bind, values):
    """ Perform several types of string cleaning for titles etc.. """
    categories = {'C': ' '}
    for value in values:
        if isinstance(value, six.string_types):
            value = normality.normalize(value,
                                        lowercase=False,
                                        collapse=True,
                                        decompose=False,
                                        replace_categories=categories)
        yield value
Example #21
 def by_name(cls, dataset, name):
     q = cls.query.filter_by(dataset=dataset)
     attr = Entity.name
     if dataset.normalize_text:
         attr = Entity.normalized
         name = normalize(name)
     if dataset.ignore_case:
         attr = func.lower(attr)
         if isinstance(name, basestring):
             name = name.lower()
     q = q.filter(attr == name)
     return q.first()
Example #22
 def by_name(cls, dataset, name):
     q = cls.query.filter_by(dataset=dataset)
     attr = Entity.name
     if dataset.normalize_text:
         attr = Entity.normalized
         name = normalize(name)
     if dataset.ignore_case:
         attr = func.lower(attr)
         if isinstance(name, basestring):
             name = name.lower()
     q = q.filter(attr == name)
     return q.first()
Example #23
def jsonify(csvlist, preheadings=None, heading_line=0, data_start_line=1):
    """
    Convert CSV into JSON,
    using either `preheadings` or csvlist[heading_line]
    as keys. Yields a generator of dicts.
    """
    if not preheadings:
        preheadings = csvlist[0]
        headings = [normalize(x).replace(' ', '_') for x in preheadings]

    csvlist_data_only = csvlist[data_start_line:]
    for line in csvlist_data_only:
        yield dict(zip(headings, line))
Example #24
 def match_form(self, text):
     """Turn a string into a form appropriate for name matching."""
     # The goal of this function is not to retain a readable version of the
     # string, but rather to yield a normalised version suitable for
     # comparisons and machine analysis.
     text = normalize(text, lowercase=True, ascii=True)
     if text is None:
         return
     # TODO: this is a weird heuristic, but to avoid overly aggressive
     # matching it may make sense:
     if ' ' not in text:
         return
     return text.encode('utf-8')
Example #25
def jsonify(csvlist, preheadings=None, heading_line=0, data_start_line=1):
    """
    Convert CSV into JSON,
    using either `preheadings` or csvlist[heading_line]
    as keys. Yields a generator of dicts.
    """
    if not preheadings:
        preheadings = csvlist[0]
        headings = [normalize(x).replace(" ", "_") for x in preheadings]

    csvlist_data_only = csvlist[data_start_line:]
    for line in csvlist_data_only:
        yield dict(zip(headings, line))
Example #26
def search_party_names(text):
    if text is None:
        return
    text = PARTIES_SPLIT.split(text)
    text = normalize(text[0])
    parties = set()
    for party, rex in PARTIES_REGEX.items():
        if rex.findall(text):
            parties.add(party)
    if not len(parties):
        return
    parties = ':'.join(sorted(parties))
    return parties
Example #27
def makeBigrams(word, **scoreOptions):
    '''
    Normalize set of bigrams into an ordered string to aid processing
    '''
    # Should probably allow stop words
    # Should probably strip off spaces(?) and punctuation
    process = normalize(word)
    stopwords = scoreOptions.get('stopwords', None)
    if stopwords:
        process = ' '.join(w for w in process.split() if w not in stopwords)

    return ''.join(
        sorted(set(process[i:i + 2] for i in range(len(process) - 1))))
Example #28
def search_party_names(text):
    if text is None:
        return
    text = PARTIES_SPLIT.split(text)
    text = normalize(text[0])
    parties = set()
    for party, rex in PARTIES_REGEX.items():
        if rex.findall(text):
            parties.add(party)
    if not len(parties):
        return
    parties = ':'.join(sorted(parties))
    return parties
Example #29
def prepare_geonames():
    with io.open(GEONAMES_RAW_PATH, 'r', encoding='utf-8') as fh:
        with shelve.open(GEONAMES_DB_PATH) as db:
            for row in csv.reader(fh, delimiter='\t'):
                country = normalize(row[8])
                if country is None:
                    continue
                names = set(row[3].split(','))
                names.add(row[1])
                names.add(row[2])
                for name in names:
                    name = normalize(name)
                    if name is None:
                        continue
                    countries = db.get(name)
                    if countries:
                        countries.append(country)
                        db[name] = countries
                    else:
                        db[name] = [country]
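            # Collapse each name to the country it was most frequently seen with.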
            for name in db:
                countries = db[name]
                db[name] = max(set(countries), key=countries.count)
Example #30
def _normalize_names(names):
    """Generate a sequence of comparable names for an entity. This also
    generates a `fingerprint`, i.e. a version of the name where all tokens
    are sorted alphabetically, and some parts, such as company suffixes,
    have been removed."""
    seen = set()
    for name in names:
        plain = normalize(name, ascii=True)
        if plain is not None and plain not in seen:
            seen.add(plain)
            yield plain
        fp = fingerprints.generate(name)
        if fp is not None and len(fp) > 6 and fp not in seen:
            seen.add(fp)
            yield fp
Example #31
 def create(cls, dataset, data, account):
     state = EntityState(dataset, None)
     data = EntitySchema().to_python(data, state)
     entity = cls()
     entity.dataset = dataset
     entity.creator = account
     entity.name = data['name']
     entity.normalized = normalize(entity.name)
     entity.attributes = data.get('attributes', {})
     entity.reviewed = data['reviewed']
     entity.invalid = data['invalid']
     entity.canonical = data['canonical']
     db.session.add(entity)
     db.session.flush()
     return entity
Example #32
 def create(cls, dataset, data, account):
     state = EntityState(dataset, None)
     data = EntitySchema().to_python(data, state)
     entity = cls()
     entity.dataset = dataset
     entity.creator = account
     entity.name = data['name']
     entity.normalized = normalize(entity.name)
     entity.attributes = data.get('attributes', {})
     entity.reviewed = data['reviewed']
     entity.invalid = data['invalid']
     entity.canonical = data['canonical']
     db.session.add(entity)
     db.session.flush()
     return entity
Example #33
def create_filename(row):
    file_type = row["FileType"]
    if file_type == "Translation":
        file_type = "NDC_Translation"
    elif file_type == "Addendum":
        file_type = "NDC_Addendum"
    name = "{}_{}".format(row["Number"], file_type)

    if "revised" in row["Title"].lower():
        name += "_Revised"
    elif "archived" in row["Title"].lower():
        name = "{}_NDC_Archived".format(row["Number"])

    # Special case PSE with multiple Addendums
    if row["Party"] == "State of Palestine":
        if "SPM" in row["OriginalFilename"]:
            name += "_Summary_Policy_Makers"
        elif "Cobenefits" in row["Title"]:
            name += "_Cobenefits"
        elif "Implementation" in row["Title"]:
            name += "_Implementation_Road_Map"
        elif "Approval" in row["Title"]:
            name += "_Approval"

    # Special case PNG with multiple Addendums
    if row["Party"] == "Papua New Guinea":
        if "letter" in row["Title"]:
            name += "_Satisfactory_Letter"
        elif "Explanatory Note" in row["Title"]:
            name += "_Explanatory_Note"
        elif "PA Implementation Act" in row["Title"]:
            name += "_PA_Implementation_Act"
        elif "Management Act" in row["Title"]:
            name += "_Climate_Change_Management_Act"

    code = row["Code"]
    party = normalize(row["Party"], lowercase=False).replace(" ", "-")
    if row["OriginalFilename"].startswith("LV-03-06-EU") and code in eu28:
        code = "EUU"
        party = "European-Union"
    name = "{}_{}_{}_{}.pdf".format(code, party, name, row["Language"])
    return name
Example #34
def find_matches(dataset, text, filter=None, exclude=None):
    entities = Entity.__table__
    match_text = (normalize(text) or '')[:254]

    # select text column and apply necessary transformations
    text_field = entities.c.name
    if dataset.normalize_text:
        text_field = entities.c.normalized
    if dataset.ignore_case:
        text_field = func.lower(text_field)
    text_field = func.left(text_field, 254)

    # calculate the difference percentage
    min_l = func.greatest(1.0,
                          func.least(len(match_text), func.length(text_field)))
    score = func.greatest(
        0.0,
        ((min_l - func.levenshtein(text_field, match_text)) / min_l) * 100.0)
    score = func.max(score).label('score')

    # coalesce the canonical identifier
    id_ = func.coalesce(entities.c.canonical_id, entities.c.id).label('id')

    # apply filters
    filters = [
        entities.c.dataset_id == dataset.id, entities.c.invalid == False
    ]  # noqa
    if not dataset.match_aliases:
        filters.append(entities.c.canonical_id == None)  # noqa
    if exclude is not None:
        filters.append(entities.c.id != exclude)
    if filter is not None:
        filters.append(text_field.ilike('%%%s%%' % filter))

    q = select([id_, score],
               and_(*filters), [entities],
               group_by=[id_],
               order_by=[score.desc()])
    return Matches(q)
Example #35
    def update(self, data, account):
        state = EntityState(self.dataset, self)
        data = EntitySchema().to_python(data, state)
        self.creator = account
        self.name = data['name']
        self.normalized = normalize(self.name)
        self.attributes = data['attributes']
        self.reviewed = data['reviewed']
        self.invalid = data['invalid']
        self.canonical = data['canonical']

        # redirect all aliases of this entity
        if self.canonical:
            if self.canonical.canonical_id:
                if self.canonical.canonical_id == self.id:
                    self.canonical.canonical = None
                else:
                    self.canonical = self.canonical.canonical

            for alias in self.aliases:
                alias.canonical = self.canonical

        db.session.add(self)
Example #36
    def update(self, data, account):
        state = EntityState(self.dataset, self)
        data = EntitySchema().to_python(data, state)
        self.creator = account
        self.name = data['name']
        self.normalized = normalize(self.name)
        self.attributes = data['attributes']
        self.reviewed = data['reviewed']
        self.invalid = data['invalid']
        self.canonical = data['canonical']

        # redirect all aliases of this entity
        if self.canonical:
            if self.canonical.canonical_id:
                if self.canonical.canonical_id == self.id:
                    self.canonical.canonical = None
                else:
                    self.canonical = self.canonical.canonical

            for alias in self.aliases:
                alias.canonical = self.canonical

        db.session.add(self)
Example #37
 def normalized(self):
     return normalize(self.query)
Example #38
 def value(self, value):
     attr = qualified[self.attribute]
     conv = attr.converter(attr)
     self._value = conv.serialize_safe(value)
     self.normalized = normalize(self._value)
Example #39
File: util.py Project: pudo/aleph
def normalize_label(label):
    return normalize(label, lowercase=True, ascii=True)
Example #40
# coding: utf-8
from normality import normalize
import csv
import normality
from grano.logic import Loader
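# Column order of the source CSV; each value below is normalized and copied
# onto the entity under the matching field name.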
rows = ["id", "name", "sort_name", "email", "twitter", "facebook", "group", "group_id", "area_id", "area", "chamber", "term", "start_date", "end_date", "image", "gender"]


if __name__ == "__main__":

    loader = Loader('senegal', project_label='Senegal')
    person = loader.make_entity("politician")


    with open('senegal/senegal-politicians.csv', 'r') as csvfile:
        records_reader = csv.reader(csvfile, delimiter=",")
        for row in records_reader:
            for x in range(0, len(rows)):
                #print "%s :: %s" % (rows[x], row[x])
                if row[x]:
                    val = normalize(str(row[x]))
                    person.set(rows[x], val)
            person.save()
        loader.persist()
Example #41
            file_type = "Addendum"
        elif any(sub in filename.lower() for sub in translations):
            file_type = "Translation"
        elif filename == "Belarus.pdf":
            file_type = "Addendum"
        elif filename == "Liberia_INDC Submission.002.pdf":
            file_type = "Addendum"
        elif filename.startswith("Sierra Leone INDC Submission to UNFCCC"):
            file_type = "Addendum"
        else:
            file_type = "INDC"
        print("{} : {} ({}) : {}".format(name, file_type, language, filename))

        clean_filename = "{}_{}_{}_{}.pdf".format(
            code,
            normalize(name, lowercase=False).replace(" ", "-"),
            file_type,
            language
        )
        if file_type == "Translation":
            clean_filename = clean_filename.replace(
                "Translation", "INDC_Translation")
        elif file_type == "Addendum":
            clean_filename = clean_filename.replace(
                "Addendum", "INDC_Addendum")

        clean_filepath = pdfs_dir / clean_filename

        indcs.append(OrderedDict([
            ("Code", code),
            ("Party", name),
Example #42
def fingerprint(name):
    if name is None:
        return
    name = FP_REMOVE.sub(' ', name.strip())
    return normalize(name).replace(' ', '-')
Example #43
 def normalize(cls, text):
     return normalize(text)
Example #44
    u'respondent_appointee': 'person',
    'claimants_counsel': 'person',
    'respondents_counsel': 'person',
    u'annulment_committee_members': 'person'
}

G = nx.Graph()

nodes = {}
cases = {}
# types = set()
for entity in entities:
    label = entity.get('entity')
    if label in SKIP:
        continue
    key = normalize(label)
    # types.add(entity.get('type'))
    if entity.get('case_url') not in cases:
        cases[entity.get('case_url')] = set()
    cases[entity.get('case_url')].add(key)

    if not G.has_node(key):
        G.add_node(key,
                   label=label,
                   type=entity.get('type'),
                   group=GROUPS[entity.get('type')])
    nodes[key] = label

for case_url, involved in cases.items():
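    # Connect every pair of entities that co-occur in the same case.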
    # print case_url, len(involved)
    for (s, t) in combinations(involved, 2):
Example #45
 def normalize(self, text, **kwargs):
     """Normalize for comparison."""
     ids = super(IdentifierType, self).normalize(text, **kwargs)
     return [normalize(i) for i in ids]
Example #46
 def transform(self, source, target):
     text = source.data()
     text = normalize(text, lowercase=self.config.get('lowercase', True),
                      transliterate=self.config.get('transliterate', False),
                      collapse=self.config.get('collapse', True))
     target.save_data(text.encode('utf-8'))
Example #47
def index_collection(collection, sync=False):
    """Index a collection."""
    if collection.deleted_at is not None:
        return delete_collection(collection.id)

    data = {
        'foreign_id': collection.foreign_id,
        'created_at': collection.created_at,
        'updated_at': collection.updated_at,
        'label': collection.label,
        'kind': collection.kind,
        'summary': collection.summary,
        'category': Collection.DEFAULT,
        'publisher': collection.publisher,
        'publisher_url': collection.publisher_url,
        'info_url': collection.info_url,
        'data_url': collection.data_url,
        'casefile': collection.casefile,
        'secret': collection.secret,
        'collection_id': collection.id,
        'schemata': {},
        'team': []
    }
    texts = [v for v in data.values() if isinstance(v, str)]

    if collection.category in Collection.CATEGORIES:
        data['category'] = collection.category

    if collection.creator is not None:
        data['creator'] = {
            'id': collection.creator.id,
            'type': collection.creator.type,
            'name': collection.creator.name
        }
        texts.append(collection.creator.name)

    for role in collection.team:
        data['team'].append({
            'id': role.id,
            'type': role.type,
            'name': role.name
        })
        texts.append(role.name)

    stats = get_collection_stats(collection.id)
    data['count'] = stats['count']

    # expose entities by schema count.
    thing = model.get(Entity.THING)
    for schema, count in stats['schemata'].items():
        schema = model.get(schema)
        if schema is not None and schema.is_a(thing):
            data['schemata'][schema.name] = count

    # if no countries or langs are given, take the most common from the data.
    countries = ensure_list(collection.countries)
    countries = countries or stats['countries'].keys()
    data['countries'] = registry.country.normalize_set(countries)

    languages = ensure_list(collection.languages)
    languages = languages or stats['languages'].keys()
    data['languages'] = registry.language.normalize_set(languages)

    texts.extend([normalize(t, ascii=True) for t in texts])
    data['text'] = index_form(texts)
    return index_safe(collections_index(), collection.id, data, refresh=sync)
Example #48
def name_tokens(name):
    name = normality.normalize(name, latinize=True)
    # if len(name) > 2:
    #     return [name]
    # return []
    return [n for n in name.split(' ') if len(n)]
Example #49
def text_norm(text):
    return normalize(text, ascii=True)
Example #50
File: alert.py Project: pudo/aleph
 def normalized(self):
     return normalize(self.query)