Ejemplo n.º 1
0
    def __init__(self, manager, name, email):
        """Normalise a name/email pair and emit a ``Person`` entity for it.

        The email is transliterated with ``ascii_text`` and discarded when
        ``registry.email.validate`` rejects it. A name that itself validates
        as an email address is moved into the email slot (unless an email is
        already set) and the name is cleared. ``self.label`` is a display
        string built from whichever of the two parts remain.

        An entity is only created and emitted when a key can be derived
        from the email, or from a name longer than ten characters.
        """
        address = ascii_text(stringify(email))
        person = stringify(name)
        if not registry.email.validate(address):
            address = None
        if registry.email.validate(person):
            # The supplied "name" is really an email address.
            address = address or ascii_text(person)
            person = None
        self.email = address
        self.name = person

        # This should be using formataddr, but I cannot figure out how
        # to use that without encoding the name.
        if address is None:
            # Covers both "name only" and "nothing at all" (label is None).
            self.label = person
        elif person is None:
            self.label = address
        else:
            self.label = "%s <%s>" % (person, address)

        self.entity = None
        key = registry.email.node_id_safe(address)
        if person is not None and len(person) > 10:
            # Fall back to the name as key material if the email gave none.
            key = key or registry.name.node_id_safe(person)
        if key is None:
            return

        fragment = safe_fragment(self.label)
        entity = manager.make_entity("Person")
        entity.context = {"mutable": False}
        entity.make_id(key)
        entity.add("name", person)
        entity.add("email", address)
        self.entity = entity
        manager.emit_entity(entity, fragment=fragment)
Ejemplo n.º 2
0
    def __init__(self, manager, name, email):
        """Normalise a name/email pair and emit a ``Person`` entity for it.

        The email is transliterated with ``ascii_text`` and discarded when
        ``registry.email.validate`` rejects it. A name that itself validates
        as an email address is moved into the email slot (unless an email is
        already set) and the name is cleared. ``self.label`` is a display
        string built from whichever of the two parts remain.

        An entity is only created and emitted when an email is available;
        its ID is derived from the lower-cased, stripped address.
        """
        address = ascii_text(stringify(email))
        person = stringify(name)
        if not registry.email.validate(address):
            address = None
        if registry.email.validate(person):
            # The supplied "name" is really an email address.
            address = address or ascii_text(person)
            person = None
        self.email = address
        self.name = person

        # This should be using formataddr, but I cannot figure out how
        # to use that without encoding the name.
        if address is None:
            # Covers both "name only" and "nothing at all" (label is None).
            self.label = person
        elif person is None:
            self.label = address
        else:
            self.label = '%s <%s>' % (person, address)

        self.entity = None
        if address is None:
            return

        key = address.lower().strip()
        fragment = safe_fragment(self.label)
        entity = manager.make_entity('Person')
        entity.make_id(key)
        entity.add('name', person)
        entity.add('email', address)
        self.entity = entity
        manager.emit_entity(entity, fragment=fragment)
Ejemplo n.º 3
0
def suggest_entities(prefix, authz, min_count=0, schemas=None, size=5):
    """Auto-complete API: suggest entities whose name starts with a prefix.

    Returns a dict with the original ``prefix`` and a ``results`` list. A
    ``None`` or blank prefix yields an empty result list without querying
    the index. Each result is the hit's ``_source`` with ``match`` (whether
    the transliterated prefix equals one of its fingerprints), ``score``
    and ``id`` added, and ``fingerprints`` removed.
    """
    response = {'prefix': prefix, 'results': []}
    if prefix is None or not len(prefix.strip()):
        return response

    query = {'match_phrase_prefix': {'name': prefix.strip()}}
    if min_count > 0:
        query = add_filter(query, {'range': {'doc_count': {'gte': min_count}}})
    if schemas is not None and len(schemas):
        query = add_filter(query, {'terms': {'$schema': schemas}})

    # TODO: is this correct? should we allow filter by dataset entities?
    query = add_filter(
        query, {'terms': {'collection_id': authz.collections_read}}
    )

    body = {
        'size': size,
        'sort': [{'doc_count': 'desc'}, '_score'],
        'query': query,
        '_source': ['name', 'schema', 'fingerprints', 'doc_count']
    }
    ref = ascii_text(prefix)
    found = es.search(index=es_index, doc_type=TYPE_ENTITY, body=body)
    for hit in found.get('hits', {}).get('hits', []):
        ent = hit.get('_source')
        terms = [ascii_text(t) for t in ent.pop('fingerprints', [])]
        ent['match'] = ref in terms
        ent['score'] = hit.get('_score')
        ent['id'] = hit.get('_id')
        response['results'].append(ent)
    return response
Ejemplo n.º 4
0
 def to_index_dict(self):
     """Build the search-index representation of this object.

     Starts from the metadata's index dict, adds the indexed text and the
     schema markers, then adds ``ascii_text`` variants of the title and
     summary before handing the result to ``_add_to_dict``.
     """
     doc = self.meta.to_index_dict()
     doc['text'] = index_form(self.text_parts())
     doc['schema'] = self.SCHEMA
     doc['schemata'] = [self.SCHEMA]
     # (target field, source field) pairs that get a transliterated copy.
     latin_fields = (
         ('name_sort', 'title'),
         ('title_latin', 'title'),
         ('summary_latin', 'summary'),
     )
     for target, source in latin_fields:
         doc[target] = ascii_text(doc.get(source))
     return self._add_to_dict(doc)
Ejemplo n.º 5
0
def index_document(document, index_records=True):
    """Write a document, and optionally its records, to the search index.

    The document body is its index dict plus the full text, generated
    entities and ``ascii_text`` variants of title and summary. When
    ``index_records`` is true, the document's existing records are cleared
    and regenerated in bulk.
    """
    log.info("Index document: %r", document)
    body = document.to_index_dict()
    body['text'] = get_text(document)
    body['entities'] = generate_entities(document)
    for target, source in (('title_latin', 'title'),
                           ('summary_latin', 'summary')):
        body[target] = ascii_text(body.get(source))
    es.index(
        index=es_index,
        doc_type=TYPE_DOCUMENT,
        body=body,
        id=document.id,
    )

    if not index_records:
        return
    clear_records(document.id)
    bulk_op(generate_records(document))
Ejemplo n.º 6
0
def generate_records(document):
    """Generate index records, based on document rows or pages.

    Yields one bulk action dict per page for text documents and one per
    record for tabular documents. Documents of any other type yield
    nothing.
    """
    if document.type == Document.TYPE_TEXT:
        # hashlib rejects text strings on Python 3, so hash explicit bytes.
        doc_key = str(document.id).encode('utf-8')
        for page in document.pages:
            digest = sha1(doc_key)
            digest.update(str(page.id).encode('utf-8'))
            tid = digest.hexdigest()

            text = stringify(page.text)
            latin = ascii_text(text)

            yield {
                '_id': tid,
                '_type': TYPE_RECORD,
                '_index': six.text_type(es_index),
                '_parent': document.id,
                '_source': {
                    'type': 'page',
                    'content_hash': document.content_hash,
                    'document_id': document.id,
                    'collection_id': document.collection_id,
                    'page': page.number,
                    'text': text,
                    'text_latin': latin
                }
            }
    elif document.type == Document.TYPE_TABULAR:
        for record in document.records:
            data = {k: stringify(v) for (k, v) in record.data.items()}

            text = list(data.values())
            # Keep only transliterations that differ from every original
            # cell value; a set makes the membership test cheap.
            known = set(text)
            latin = [ascii_text(t) for t in text]
            latin = [t for t in latin if t is not None and t not in known]

            yield {
                '_id': record.tid,
                '_type': TYPE_RECORD,
                '_index': six.text_type(es_index),
                '_parent': document.id,
                '_source': {
                    'type': 'row',
                    'content_hash': document.content_hash,
                    'document_id': document.id,
                    'collection_id': document.collection_id,
                    'row_id': record.row_id,
                    'sheet': record.sheet,
                    'text': text,
                    'text_latin': latin,
                    'raw': data
                }
            }
Ejemplo n.º 7
0
 def test_empty(self):
     """``None`` maps to ``None`` for each helper; blanks do for normalize."""
     for helper in (slugify, ascii_text, latinize_text, normalize):
         self.assertEqual(None, helper(None))
     for blank in ('', ' '):
         self.assertEqual(None, normalize(blank))
Ejemplo n.º 8
0
def get_text(document):
    """Generate an array with the full text of the given document.

    Each text part is added as-is and, when it differs, in its
    ``ascii_text`` form as well. Parts that normalise to ``None`` are
    skipped. Collection stops once the combined length reaches
    TEXT_MAX_LEN (after one attempt at removing duplicates), in order to
    avoid uploading extremely long documents.
    """
    texts = []
    for text in document.text_parts():
        text = stringify(text)
        if text is None:
            # Nothing usable in this part; a None entry would also break
            # the length accounting below.
            continue
        texts.append(text)
        latin = ascii_text(text)
        if latin is not None and latin != text:
            texts.append(latin)

        text_len = sum(len(t) for t in texts)
        # First, try getting rid of duplicate entries, which are more likely in
        # tabular documents. If that does not help, partial text will be
        # returned.
        if text_len >= TEXT_MAX_LEN:
            texts = list(set(texts))

            text_len = sum(len(t) for t in texts)
            if text_len >= TEXT_MAX_LEN:
                return texts

    return texts
Ejemplo n.º 9
0
 def normalize(self, name):
     """Turn a name into a lower-case, underscore-separated identifier.

     The name is transliterated with ``ascii_text`` and passed through
     ``category_replace``. Names with no lower-case characters have their
     whitespace replaced by underscores and are lower-cased; all others go
     through ``stringcase.snakecase``. Runs of underscores are collapsed.
     """
     cleaned = category_replace(ascii_text(name), UNICODE_CATEGORIES)
     if cleaned != cleaned.upper():
         snake = stringcase.snakecase(cleaned)
     else:
         snake = cleaned.replace(WS, '_').lower()
     return re.sub('_+', '_', snake)
Ejemplo n.º 10
0
def clean_strict(text, boundary=WS):
    """Super-hardcore string scrubbing.

    Transliterates ``text`` to ASCII, removes unwanted characters, applies
    ``category_replace`` and collapses whitespace. The result is padded
    with ``boundary`` on both sides. Returns ``None`` when transliteration
    or whitespace collapsing yields ``None``, instead of raising.
    """
    # transliterate to ascii
    text = ascii_text(text)
    if text is None:
        return None
    # replace punctuation and symbols
    text = CHARACTERS_REMOVE_RE.sub('', text)
    text = category_replace(text)
    text = collapse_spaces(text)
    if text is None:
        return None
    # pad out for company type replacements
    return ''.join((boundary, text, boundary))
Ejemplo n.º 11
0
def clean_strict(text, boundary=WS):
    """Super-hardcore string scrubbing.

    Transliterates ``text`` to ASCII, removes unwanted characters, applies
    ``category_replace`` and collapses whitespace. The result is padded
    with ``boundary`` on both sides. Returns ``None`` when transliteration
    or whitespace collapsing yields ``None``, instead of raising.
    """
    # transliterate to ascii
    latin = ascii_text(text)
    if latin is None:
        return None
    # replace punctuation and symbols
    scrubbed = category_replace(CHARACTERS_REMOVE_RE.sub('', latin))
    collapsed = collapse_spaces(scrubbed)
    if collapsed is None:
        return None
    # pad out for company type replacements
    return ''.join((boundary, collapsed, boundary))
Ejemplo n.º 12
0
def finalize_index(data, schema):
    """Apply final denormalisations to the index.

    Mutates and returns ``data``: fills the full-text field from all
    property values, collects fingerprints and inverted (per-type) value
    lists for each schema property, adds transliterated names, and records
    the schema name, its non-hidden schemata and a sortable name field.
    """
    properties = data.get('properties', {})

    # Flatten every property value into the full-text field.
    flat_values = [v for vs in properties.values() for v in ensure_list(vs)]
    data['text'] = index_form(flat_values)
    data['fingerprints'] = data.get('fingerprints', [])

    # Generate inverted representations of the data stored in properties.
    for prop in schema.properties:
        values = properties.get(prop.name, [])
        if len(values) == 0:
            continue

        # Find and set the name property
        if prop.is_label:
            data['name'] = values[0]

        # Generate key material
        # TODO: this should probably be record-based.
        data['fingerprints'].extend(prop.type.fingerprint(values))

        # Add inverted properties. This takes all the properties
        # of a specific type (names, dates, emails etc.)
        invert = prop.type.index_invert
        if not invert:
            continue
        bucket = data.setdefault(invert, [])
        for norm in prop.type.normalize(values):
            if norm not in bucket:
                bucket.append(norm)

    data['fingerprints'] = list(set(data['fingerprints']))

    # Add latinised names
    names = data.get('names', [])
    names.extend([ascii_text(name) for name in names])
    data['names'] = list(set(names))

    # Get implied schemata (i.e. parents of the actual schema)
    data['schema'] = schema.name
    data['schemata'] = [p.name for p in schema.schemata if not p.hidden]

    # Second name field for non-tokenised sorting.
    if 'name' in data:
        data['name_sort'] = data['name']
    return data
Ejemplo n.º 13
0
def index_names(data):
    """Handle entity names on documents and entities.

    Updates ``data`` in place: ``fingerprints`` becomes the de-duplicated
    list of non-``None`` fingerprints generated from ``names``, and
    ``names`` is extended with the ``ascii_text`` form of each name and
    then de-duplicated.
    """
    names = data.get('names', [])
    generated = (fingerprints.generate(name) for name in names)
    data['fingerprints'] = list({fp for fp in generated if fp is not None})

    # Add latinised names
    names.extend([ascii_text(name) for name in names])
    data['names'] = list(set(names))
Ejemplo n.º 14
0
def pending(id):
    """Return up to 30 pending entities of a collection as JSON.

    Responds with 404 handling via ``obj_or_404`` when the collection is
    missing and requires read access on it. Each entity dict carries an
    extra ``name_latin`` field with the transliterated name.
    """
    collection = obj_or_404(Collection.by_id(id))
    request.authz.require(request.authz.collection_read(collection))
    query = collection.pending_entities().limit(30)
    results = []
    for entity in query.all():
        item = entity.to_dict()
        item['name_latin'] = ascii_text(entity.name)
        results.append(item)
    return jsonify({'results': results, 'total': len(results)})
Ejemplo n.º 15
0
def text_query_string(text, literal=False):
    """Build a ``query_string`` clause over the ``text`` field.

    A ``None`` or blank ``text`` produces ``match_all()``. With
    ``literal`` set, the text is transliterated and wrapped in double
    quotes; otherwise it is passed through unchanged.
    """
    if text is None or len(text.strip()) == 0:
        return match_all()
    query = ('"%s"' % ascii_text(text)) if literal else text
    clause = {
        'query': query,
        'fields': ['text'],
        'default_operator': 'AND',
        'use_dis_max': True
    }
    return {'query_string': clause}
Ejemplo n.º 16
0
def clean_strict(text: Optional[str], boundary: str = WS) -> Optional[str]:
    """Super-hardcore string scrubbing.

    Transliterates to ASCII, strips unwanted characters, applies
    ``category_replace`` and collapses whitespace, then pads the result
    with ``boundary`` on both sides. Returns ``None`` if transliteration
    does not yield a string or nothing is left after collapsing.
    """
    # transliterate to ascii
    latin = ascii_text(text)
    if not isinstance(latin, str):
        return None
    # replace punctuation and symbols
    scrubbed = category_replace(CHARACTERS_REMOVE_RE.sub("", latin))
    collapsed = collapse_spaces(scrubbed)
    if collapsed is None:
        return None
    # pad out for company type replacements
    return boundary + collapsed + boundary
Ejemplo n.º 17
0
 def test_georgian(self):
     """Georgian script is transliterated to ASCII by ``ascii_text``."""
     georgian = u'ავლაბრის ფონდი'
     transliterated = ascii_text(georgian)
     self.assertEqual('avlabris pondi', transliterated)
Ejemplo n.º 18
0
 def test_ahmad(self):
     """Upper-case Azeri letters are transliterated to ASCII."""
     azeri_name = u'FUAD ALIYEV ƏHMƏD OĞLU'
     transliterated = ascii_text(azeri_name)
     self.assertEqual('FUAD ALIYEV AHMAD OGLU', transliterated)
Ejemplo n.º 19
0
 def test_petro(self):
     """A Cyrillic name through each of the four text helpers."""
     name = u'Порошенко Петро Олексійович'
     expectations = (
         ('porosenko-petro-oleksijovic', slugify),
         ('Porosenko Petro Oleksijovic', ascii_text),
         (u'Porošenko Petro Oleksíjovič', latinize_text),
         (u'порошенко петро олексіиович', normalize),
     )
     for expected, transform in expectations:
         self.assertEqual(expected, transform(name))
Ejemplo n.º 20
0
def normalize(text):
    """Transliterate ``text`` to ASCII and strip apostrophes.

    Returns ``None`` when there is nothing to normalise, i.e. for ``None``
    input or when ``ascii_text`` yields ``None``, rather than raising
    ``AttributeError``.
    """
    if text is None:
        return None
    text = ascii_text(text)
    if text is None:
        return None
    return text.replace("'", '')
Ejemplo n.º 21
0
 def test_georgian(self):
     """Georgian script is transliterated to ASCII by ``ascii_text``."""
     self.assertEqual("avlabris pondi", ascii_text(u"ავლაბრის ფონდი"))
Ejemplo n.º 22
0
 def test_azeri(self):
     """Upper-case Azeri letters are transliterated to ASCII."""
     azeri_name = u"FUAD ALIYEV ƏHMƏD OĞLU"
     transliterated = ascii_text(azeri_name)
     self.assertEqual("FUAD ALIYEV AHMAD OGLU", transliterated)
Ejemplo n.º 23
0
 def test_ahmad(self):
     """Lower-case schwa is transliterated to ``a``."""
     self.assertEqual("ahmad", ascii_text(u"əhməd"))
Ejemplo n.º 24
0
 def test_petro(self):
     """A Cyrillic name through each of the four text helpers."""
     name = u"Порошенко Петро Олексійович"
     expectations = (
         ("porosenko-petro-oleksijovic", slugify),
         ("Porosenko Petro Oleksijovic", ascii_text),
         (u"Porošenko Petro Oleksíjovič", latinize_text),
         (u"порошенко петро олексіиович", normalize),
     )
     for expected, transform in expectations:
         self.assertEqual(expected, transform(name))
Ejemplo n.º 25
0
 def test_ahmad(self):
     """Lower-case schwa is transliterated to ``a``."""
     schwa_name = u'əhməd'
     transliterated = ascii_text(schwa_name)
     self.assertEqual('ahmad', transliterated)
Ejemplo n.º 26
0
def normalize(text):
    """Return a lower-cased ASCII form of ``text``, or ``None``.

    The text goes through ``category_replace`` with ``UNICODE_CATEGORIES``
    and then ``ascii_text``; ``None`` is returned when the transliteration
    yields ``None``.
    """
    replaced = category_replace(text, replacements=UNICODE_CATEGORIES)
    latin = ascii_text(replaced)
    return None if latin is None else latin.lower()
Ejemplo n.º 27
0
 def test_german(self):
     """Umlauts and sharp s are transliterated to ASCII."""
     self.assertEqual('Haschen Spass', ascii_text(u'Häschen Spaß'))
Ejemplo n.º 28
0
def strconv(text):
    """Return the ``ascii_text`` form of ``text``, or ``None`` if blank.

    ``None`` and strings that are empty after stripping yield ``None``.
    """
    has_content = text is not None and len(text.strip()) > 0
    return ascii_text(text) if has_content else None
Ejemplo n.º 29
0
 def test_german(self):
     """Umlauts and sharp s are transliterated by both helpers."""
     german = u"Häschen Spaß"
     transliterated = ascii_text(german)
     self.assertEqual("Haschen Spass", transliterated)
     slug = slugify(german, sep="-")
     self.assertEqual("haschen-spass", slug)
Ejemplo n.º 30
0
def latin_alt(value):
    """Make a latin version of a string and return it if it differs
    from the input.

    The comparison ignores case. Returns ``None`` when the transliteration
    matches the input, and also when ``value`` is ``None`` or cannot be
    transliterated, instead of raising ``AttributeError``.
    """
    if value is None:
        return None
    trans_value = ascii_text(value)
    if trans_value is None:
        return None
    if trans_value.lower() != value.lower():
        return trans_value
    return None
Ejemplo n.º 31
0
 def normalize_value(self, value):
     """Return the space-collapsed value and its ``ascii_text`` form."""
     collapsed = collapse_spaces(value)
     latin = ascii_text(collapsed)
     return collapsed, latin