Example #1
0
def ingest_directory(collection_id, meta, local_path, base_path=None,
                     move=False):
    """Recursively ingest ``local_path`` and everything below it.

    A path that is not a directory is handed to ``ingest_file`` using a
    child of ``meta`` whose ``source_path`` is ``base_path`` (which falls
    back to ``local_path``). For a directory, every ingestor exposing a
    ``bundle`` attribute may claim entries first; claimed entries and names
    in ``SKIP_ENTRIES`` are not descended into.
    """
    # This is somewhat hacky, see issue #55 for the rationale.
    if not os.path.exists(local_path):
        log.error("Invalid path: %r", local_path)
        return

    if not base_path:
        base_path = local_path

    if not os.path.isdir(local_path):
        file_meta = meta.make_child()
        file_meta.source_path = base_path
        return ingest_file(collection_id, file_meta, local_path, move=move)

    # Let bundle-aware ingestors claim directory entries up front.
    claimed = []
    for ingestor_cls in get_ingestors():
        if hasattr(ingestor_cls, 'bundle'):
            claimed += ingestor_cls(collection_id).bundle(meta, local_path)

    # Everything left over is visited recursively; plain files are dealt
    # with by the non-directory branch at the top of this function.
    for name in os.listdir(local_path):
        child_path = os.path.join(local_path, string_value(name))
        child_base = os.path.join(base_path, string_value(name))
        if name not in SKIP_ENTRIES and name not in claimed:
            log.info("Handle [%s]: %s", meta.crawler_run, child_base)
            ingest_directory(collection_id, meta, child_path,
                             base_path=child_base, move=move)
        else:
            log.debug("Ignore: %r", child_base)
Example #2
0
    def crawl(self, directory=None, collection=None, meta=None):
        """Crawl a single file or a directory tree into a collection.

        :param directory: path to crawl; defaults to the current working
            directory.
        :param collection: label of the collection (its slug is used in the
            foreign ID); defaults to ``directory``.
        :param meta: base metadata passed on to ``crawl_file``; defaults to
            an empty dict.

        Directories whose path contains a component listed in
        ``SKIP_DIRECTORIES`` and files named in ``SKIP_FILES`` are ignored.
        """
        # Avoid a shared mutable default argument.
        meta = {} if meta is None else meta
        # Resolve the default before any path check: ``os.path.isfile``
        # cannot be called with None.
        directory = directory or os.getcwd()
        collection = collection or directory
        collection = Collection.create({
            'foreign_id': 'directory:%s' % slugify(collection),
            'label': collection
        })
        db.session.commit()
        collection_id = collection.id

        if os.path.isfile(directory):
            # A plain file has no tree to walk.
            self.crawl_file(collection_id, directory, meta)
            return

        directory = directory.encode('utf-8')
        for (dirname, dirs, files) in os.walk(directory):
            parts = dirname.split(os.path.sep)
            if any(part in SKIP_DIRECTORIES for part in parts):
                continue
            log.info("Descending: %r", dirname)
            # Convert once per directory rather than once per file.
            dirname = string_value(dirname)
            for file_name in files:
                file_name = string_value(file_name)
                if file_name in SKIP_FILES:
                    continue
                file_path = os.path.join(dirname, file_name)
                self.crawl_file(collection_id, file_path, meta)
Example #3
0
def ingest_directory(collection_id, meta, local_path, base_path=None,
                     move=False):
    """Recursively ingest ``local_path`` and everything below it.

    Non-directories go straight to ``ingest_file`` with a child of ``meta``
    whose ``source_path`` is ``base_path`` (``local_path`` when unset).
    Directory entries returned by an ingestor's ``bundle`` call, or listed
    in ``SKIP_ENTRIES``, are left alone; the rest are visited recursively.
    """
    # This is somewhat hacky, see issue #55 for the rationale.
    if not os.path.exists(local_path):
        log.error("Invalid path: %r", local_path)
        return

    if not base_path:
        base_path = local_path

    if not os.path.isdir(local_path):
        leaf_meta = meta.make_child()
        leaf_meta.source_path = base_path
        return ingest_file(collection_id, leaf_meta, local_path, move=move)

    # handle bundles: ingestors with a ``bundle`` attribute pick first.
    claimed = []
    for candidate in get_ingestors():
        if hasattr(candidate, 'bundle'):
            claimed += candidate(collection_id).bundle(meta, local_path)

    # recurse downward into the directory; whether an entry is a file or a
    # directory is sorted out by the recursive call itself.
    for item in os.listdir(local_path):
        item_path = os.path.join(local_path, string_value(item))
        item_base = os.path.join(base_path, string_value(item))
        if item not in SKIP_ENTRIES and item not in claimed:
            log.info("Handle [%s]: %s", meta.crawler_run, item_base)
            ingest_directory(collection_id, meta, item_path,
                             base_path=item_base, move=move)
        else:
            log.debug("Ignore: %r", item_base)
Example #4
0
    def parse_headers(self, header, meta):
        """Copy information from message headers onto ``meta``.

        Sets the title from ``Subject``, the foreign ID from ``Message-Id``,
        the author from ``From``, registers every address found in ``From``,
        ``To``, ``CC`` and ``BCC``, adds the parsed ``Date`` and stores all
        headers as text. Returns ``meta``.
        """
        meta.title = header.get('Subject')

        if header.get('Message-Id'):
            meta.foreign_id = string_value(header.get('Message-Id'))

        if header.get('From'):
            sender = address.parse(header.get('From'))
            if sender is not None:
                meta.author = sender.to_unicode()
                meta.add_email(sender.address)

        for field in ['To', 'CC', 'BCC']:
            if not header.get(field):
                continue
            for recipient in address.parse_list(header.get(field)):
                meta.add_email(recipient.address)

        parsed = rfc822.parsedate(header.get('Date'))
        if parsed is not None:
            meta.add_date(datetime.fromtimestamp(mktime(parsed)))

        meta.headers = {k: string_value(v) for k, v in header.items()}
        return meta
Example #5
0
def extract_pdf(path, languages=None):
    """
    Extract content from a PDF file.

    The entries of the last document info dictionary are copied into the
    result under lower-cased keys (except ``pages`` and unresolved object
    references), and every page is converted via ``_convert_page``. If
    pdfminer reports an unexpected end of file, whatever was collected up to
    that point is returned.

    :param path: file system path of the PDF.
    :param languages: passed through to ``_convert_page``.
    :return: dict with a ``pages`` list plus the document info entries.
    """
    result = {"pages": []}
    with open(path, "rb") as fh:
        device = None
        try:
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            parser = PDFParser(fh)
            doc = PDFDocument(parser, "")

            if len(doc.info):
                for k, v in doc.info[-1].items():
                    k = k.lower().strip()
                    v = string_value(v)
                    if k == "pages" or v is None or "<PDFObjRef:" in v:
                        continue
                    # ``v`` is already converted; no second conversion.
                    result[k] = v

            for i, page in enumerate(PDFPage.create_pages(doc)):
                converted = _convert_page(interpreter, page, device, i + 1,
                                          path, languages)
                result["pages"].append(converted)
        except PSEOF as eof:
            log.info("Unexpected EOF: %r", eof)
        finally:
            # Release the device on the error paths as well.
            if device is not None:
                device.close()
    return result
Example #6
0
    def crawl(self, directory=None, source=None, meta=None):
        """Crawl a single file or a directory tree into a source.

        :param directory: path to crawl; defaults to the current working
            directory.
        :param source: label of the source (its slug is used in the foreign
            ID); defaults to ``directory``.
        :param meta: base metadata passed on to ``crawl_file``; defaults to
            an empty dict.

        Directories whose path contains a component listed in
        ``SKIP_DIRECTORIES`` and files named in ``SKIP_FILES`` are ignored.
        """
        # Avoid a shared mutable default argument.
        meta = {} if meta is None else meta
        # Resolve the default before any path check: ``os.path.isfile``
        # cannot be called with None.
        directory = directory or os.getcwd()
        source = source or directory
        source = Source.create({
            'foreign_id': 'directory:%s' % slugify(source),
            'label': source
        })
        db.session.commit()
        source_id = source.id

        if os.path.isfile(directory):
            # A plain file has no tree to walk.
            self.crawl_file(source_id, directory, meta)
            return

        directory = directory.encode('utf-8')
        for (dirname, dirs, files) in os.walk(directory):
            parts = dirname.split(os.path.sep)
            if any(part in SKIP_DIRECTORIES for part in parts):
                continue
            log.info("Descending: %r", dirname)
            # Convert once per directory rather than once per file.
            dirname = string_value(dirname)
            for file_name in files:
                file_name = string_value(file_name)
                if file_name in SKIP_FILES:
                    continue
                file_path = os.path.join(dirname, file_name)
                self.crawl_file(source_id, file_path, meta)
Example #7
0
 def check_filters(self, data):
     """Tell whether ``data`` passes the configured filters.

     Every ``filters`` entry must equal the matching value in ``data`` and
     no ``filters_not`` entry may equal it; configured values are compared
     in their ``string_value`` form.
     """
     required = self.data.get('filters', {})
     if any(string_value(want) != data.get(field)
            for field, want in required.items()):
         return False
     rejected = self.data.get('filters_not', {})
     return not any(string_value(banned) == data.get(field)
                    for field, banned in rejected.items())
Example #8
0
 def text_parts(self):
     """Yield each text snippet of the record: data values, then the text.

     Values for which ``string_value`` gives None are left out.
     """
     if self.data is not None:
         for raw in self.data.values():
             snippet = string_value(raw)
             if snippet is None:
                 continue
             yield snippet
     body = string_value(self.text)
     if body is not None:
         yield body
Example #9
0
def generate_records(document):
    """Yield index actions for a document, one per page or per row.

    Text documents produce a ``page`` record for each page, keyed by a SHA1
    over the document and page IDs. Tabular documents produce a ``row``
    record for each record, carrying the cell texts, their latinized forms
    (where not already among the texts) and the raw cell mapping.
    """
    if document.type == Document.TYPE_TEXT:
        for page in document.pages:
            digest = sha1(str(document.id))
            digest.update(str(page.id))
            record_id = digest.hexdigest()

            text = string_value(page.text)
            source = {
                'type': 'page',
                'content_hash': document.content_hash,
                'document_id': document.id,
                'collection_id': document.collection_id,
                'page': page.number,
                'text': text,
                'text_latin': latinize_text(text)
            }
            yield {
                '_id': record_id,
                '_type': TYPE_RECORD,
                '_index': six.text_type(es_index),
                '_parent': document.id,
                '_source': source
            }
    elif document.type == Document.TYPE_TABULAR:
        for record in document.records:
            raw = {}
            for key, value in record.data.items():
                raw[key] = string_value(value)

            text = [cell for cell in raw.values() if cell is not None]
            # Only keep latinized forms that add something new.
            latin = []
            for cell in text:
                converted = latinize_text(cell)
                latin.append(converted)
            latin = [form for form in latin if form not in text]

            source = {
                'type': 'row',
                'content_hash': document.content_hash,
                'document_id': document.id,
                'collection_id': document.collection_id,
                'row_id': record.row_id,
                'sheet': record.sheet,
                'text': text,
                'text_latin': latin,
                'raw': raw
            }
            yield {
                '_id': record.tid,
                '_type': TYPE_RECORD,
                '_index': six.text_type(es_index),
                '_parent': document.id,
                '_source': source
            }
Example #10
0
 def iterrows(self):
     """Yield filtered rows from every CSV, keyed by active reference.

     Column names are matched against the last dotted segment of each
     active reference; columns without a match are dropped.
     """
     mapping = dict((ref.split('.')[-1], ref) for ref in self.active_refs)
     for csv_url in self.csv_urls:
         log.info("Import [%s]: %s", self.dataset.name, csv_url)
         for row in self.read_csv(csv_url):
             data = {}
             for column, value in row.items():
                 ref = mapping.get(string_value(column))
                 if ref is not None:
                     data[ref] = string_value(value)
             if not self.check_filters(data):
                 continue
             yield data
Example #11
0
def get_text(document):
    """Generate an array with the full text of the given document.

    Each text part is followed by its latinized form when that differs.
    Collection stops once the accumulated length reaches TEXT_MAX_LEN, in
    order to avoid uploading extremely long documents; the returned list
    can still exceed the limit by the parts added last. When de-duplication
    kicks in, the original order of the parts is not preserved.
    """
    texts = []
    # Running total, so the lengths are not re-summed for every part.
    text_len = 0
    for text in document.text_parts():
        text = string_value(text)
        if text is None:
            # Nothing to add, and ``len(None)`` would raise.
            continue
        texts.append(text)
        text_len += len(text)
        latin = latinize_text(text)
        if latin is not None and latin != text:
            texts.append(latin)
            text_len += len(latin)

        # First, try getting rid of duplicate entries, which are more likely
        # in tabular documents. If that does not help, partial text will be
        # returned.
        if text_len >= TEXT_MAX_LEN:
            texts = list(set(texts))
            text_len = sum(len(t) for t in texts)
            if text_len >= TEXT_MAX_LEN:
                return texts

    return texts
Example #12
0
 def add_language(self, language):
     """Add a lower-cased language code unless invalid or already known."""
     code = string_value(language)
     if code is not None:
         code = code.lower()
         if is_language_code(code) and code not in self._languages:
             self._languages.append(code)
Example #13
0
 def headers(self, headers):
     """Replace the stored headers with a normalised copy of ``headers``.

     Anything that is not a mapping resets the stored headers to empty.
     Keys are slugified with ``_`` as separator, values converted to text.
     """
     self._headers = {}
     if isinstance(headers, Mapping):
         for name, raw in headers.items():
             slug = slugify(name, sep='_')
             self._headers[slug] = string_value(raw)
Example #14
0
def parse_date(text, guess=True, date_format=None):
    """The classic: date parsing, every which way.

    :param text: a ``date``/``datetime`` or anything ``string_value`` can
        turn into text.
    :param guess: when true and ``text`` is not already a partial date, try
        ``fuzzy_date_parser`` on it.
    :param date_format: explicit ``strptime`` format; takes precedence over
        guessing.
    :return: an ISO date string, a partial date string, or None.
    """
    # handle date/datetime before converting to text.
    if isinstance(text, datetime):
        text = text.date()
    if isinstance(text, date):
        return text.isoformat()

    text = string_value(text)
    if text is None:
        return None

    if date_format is not None:
        # parse with a specified format
        try:
            obj = datetime.strptime(text, date_format)
            return obj.date().isoformat()
        except Exception:
            # Best effort: fall through to the partial-date check. A bare
            # ``except`` here would also swallow KeyboardInterrupt and
            # SystemExit.
            pass
    elif guess and not is_partial_date(text):
        # use dateparser to guess the format
        try:
            obj = fuzzy_date_parser(text)
            return obj.date().isoformat()
        except Exception:
            pass
    else:
        # limit to the date part of a presumed date string
        text = text[:10]

    # strip -00-00 from dates because it makes ES barf.
    text = CUT_ZEROES.sub('', text)

    if is_partial_date(text):
        return text
    return None
Example #15
0
def extract_pdf(path, languages=None):
    """
    Extract content from a PDF file.

    The file is converted to XML by the configured ``pdftohtml`` binary
    inside a temporary directory; each ``page`` element of that XML is then
    passed to ``extract_page``. The temporary directory is always removed.
    Raises ``IngestorException`` if no XML file was produced.
    """
    temp_dir = make_tempdir()
    try:
        out_file = os.path.join(temp_dir, 'pdf.xml')
        log.info("Converting PDF to XML: %r...", path)
        command = [get_config('PDFTOHTML_BIN'), '-xml', '-hidden', '-q',
                   '-nodrm', path, out_file]
        subprocess.call(command)

        # The exit status is not inspected; the output file is the signal.
        if not os.path.exists(out_file):
            raise IngestorException("Could not convert PDF to XML: %s" % path)

        with open(out_file, 'r') as fh:
            xml = string_value(fh.read()).replace('encoding="UTF-8"', '')
            parser = etree.XMLParser(recover=True, remove_comments=True)
            doc = etree.fromstring(xml, parser=parser)
            log.debug("Parsed XML: %r", path)

        return {
            'pages': [extract_page(path, temp_dir, page, languages)
                      for page in doc.findall('./page')]
        }
    finally:
        remove_tempdir(temp_dir)
Example #16
0
def chomp(text, lower=False):
    """Return stripped text (optionally lower-cased), or None if blank."""
    text = string_value(text)
    if text is None:
        return None
    text = text.strip()
    if len(text) == 0:
        return None
    if lower:
        return text.lower()
    return text
Example #17
0
 def content_hash(self):
     """Return the stored content hash, else a SHA1 of the foreign ID.

     Gives None when neither a content hash nor a usable foreign ID exists.
     """
     if self._content_hash is not None:
         return self._content_hash
     if self._foreign_id is None:
         return None
     foreign_id = string_value(self.foreign_id)
     if foreign_id is None:
         return None
     return sha1(foreign_id.encode('utf-8')).hexdigest()
Example #18
0
 def ingest(self, meta, local_path):
     """Unpack an Outlook PST file with ``readpst`` and ingest the output.

     Every extracted file gets a child of ``meta`` with the relative
     directory components as keywords and a foreign ID built from the
     parent's foreign ID, the relative directory and the file name. The
     working directory is removed afterwards in all cases.
     """
     work_dir = make_tempdir()
     try:
         readpst = os.environ.get('READPST_BIN', 'readpst')
         command = [readpst, '-D', '-e', '-8', '-b', '-o', work_dir,
                    local_path]
         log.debug('Converting Outlook PST file: %r', ' '.join(command))
         subprocess.call(command)
         for (dirpath, dirnames, filenames) in os.walk(work_dir):
             reldir = os.path.relpath(string_value(dirpath),
                                      string_value(work_dir))
             for entry in filenames:
                 filename = string_value(entry)
                 child = meta.make_child()
                 for keyword in reldir.split(os.path.sep):
                     child.add_keyword(keyword)
                 child.foreign_id = string_value(
                     os.path.join(string_value(meta.foreign_id),
                                  string_value(reldir), filename))
                 file_path = os.path.join(string_value(dirpath), filename)
                 ingest_file(self.collection_id, child, file_path, move=True)
     finally:
         remove_tempdir(work_dir)
Example #19
0
 def add_language(self, language):
     """Append a valid, not yet listed language code to ``meta``.

     The ``languages`` list is created if missing; ``update_meta`` is called
     only when a code was actually added.
     """
     self.meta.setdefault('languages', [])
     code = string_value(language)
     if code is not None:
         code = code.lower()
         if is_language_code(code) and code not in self.meta['languages']:
             self.meta['languages'].append(code)
             self.update_meta()
Example #20
0
def phone(value, prop=None, **kwargs):
    """Format ``value`` as an international phone number, if possible.

    :param value: raw phone number, converted with ``string_value``.
    :param prop: optional object whose ``country`` attribute is used as the
        default region for parsing.
    :return: the number in international format, or None when the value is
        empty, cannot be parsed or is not a possible number.
    """
    try:
        value = string_value(value)
        if value is None:
            return None
        # ``prop`` is optional: dereferencing ``prop.country`` directly made
        # every call without a property return None.
        region = getattr(prop, 'country', None)
        num = phonenumbers.parse(value, region)
        if phonenumbers.is_possible_number(num):
            return phonenumbers.format_number(num, phonenumbers.PhoneNumberFormat.INTERNATIONAL)  # noqa
    except Exception:
        # Deliberately best-effort: unparseable input yields None.
        return None
    return None
Example #21
0
    def file_title(self):
        """Return a human-readable title derived from the file name.

        Lookup order: the ``file_name`` meta entry, the ``filename``
        attribute of the content disposition header, then the last path
        segment of the source URL. Meant for labelling or as a backup
        title, not for building a file system path.
        """
        title = self.meta.get('file_name')

        # derive file name from headers
        disposition = self.headers.get('content_disposition')
        if title is None and disposition is not None:
            attrs = cgi.parse_header(disposition)[1]
            title = string_value(unquote(attrs.get('filename') or ''))

        if title is None and self.source_url:
            url_path = urlparse(self.source_url).path
            base_name = os.path.basename(url_path) or ''
            title = string_value(unquote(base_name))

        return title
Example #22
0
 def by_foreign_id(cls, foreign_id, collection_id, deleted=False):
     """Return the first entity in a collection carrying ``foreign_id``.

     Results are ordered by ``deleted_at`` descending with NULLs first.
     Returns None when the foreign ID has no text form.
     """
     foreign_id = string_value(foreign_id)
     if foreign_id is None:
         return None
     query = cls.all(deleted=deleted).filter(
         Entity.collection_id == collection_id)
     needle = func.cast([foreign_id], ARRAY(db.Unicode()))
     query = query.filter(cls.foreign_ids.contains(needle))
     return query.order_by(Entity.deleted_at.desc().nullsfirst()).first()
Example #23
0
 def crawl_file(self, source, file_path, base_meta):
     """Ingest one file into ``source``; errors are logged, not raised.

     The text form of the path serves as foreign ID and source path, and
     its base name as file name.
     """
     try:
         meta = self.make_meta(base_meta)
         path_text = string_value(file_path)
         meta.foreign_id = meta.source_path = path_text
         meta.file_name = os.path.basename(path_text)
         ingest_file(source.id, meta, path_text, move=False)
     except Exception as ex:
         log.exception(ex)
Example #24
0
def phone(value, prop=None, **kwargs):
    """Format ``value`` as an international phone number, if possible.

    :param value: raw phone number, converted with ``string_value``.
    :param prop: optional object whose ``country`` attribute is used as the
        default region for parsing.
    :return: the number in international format, or None when the value is
        empty, cannot be parsed or is not a possible number.
    """
    try:
        value = string_value(value)
        if value is None:
            return None
        # ``prop`` is optional: dereferencing ``prop.country`` directly made
        # every call without a property return None.
        region = getattr(prop, 'country', None)
        num = phonenumbers.parse(value, region)
        if phonenumbers.is_possible_number(num):
            return phonenumbers.format_number(num, phonenumbers.PhoneNumberFormat.INTERNATIONAL)  # noqa
    except Exception:
        # Deliberately best-effort: unparseable input yields None.
        return None
    return None
Example #25
0
 def update(self, entity):
     """Apply the submitted ``entity`` dict to this object.

     The name is validated together with the data, foreign IDs are
     de-duplicated, and ``state`` is popped from ``entity`` (defaulting to
     active). The object is then added to the database session.
     """
     data = entity.get('data') or {}
     data['name'] = entity.get('name')
     self.data = self.schema.validate(data)
     self.name = self.data.pop('name')
     candidates = [string_value(raw)
                   for raw in entity.get('foreign_ids') or []]
     self.foreign_ids = list({fid for fid in candidates if fid is not None})
     self.state = entity.pop('state', self.STATE_ACTIVE)
     self.updated_at = datetime.utcnow()
     db.session.add(self)
Example #26
0
    def file_title(self):
        """Return a display title for the file.

        Lookup order: the explicit file name, the ``filename`` attribute of
        the content disposition header, the base name of the source path,
        then the last path segment of the source URL.
        """
        title = self._file_name

        # derive file name from headers
        disposition = self.headers.get('content_disposition')
        if title is None and disposition is not None:
            attrs = cgi.parse_header(disposition)[1]
            title = string_value(unquote(attrs.get('filename') or ''))

        if title is None and self.source_path:
            title = string_value(os.path.basename(self.source_path) or '')

        if title is None and self.source_url:
            url_path = urlparse(self.source_url).path
            base_name = os.path.basename(url_path) or ''
            title = string_value(unquote(base_name))

        return title
Example #27
0
def report(collection_id):
    """Send the cross-referencing Excel report of a collection.

    Responds with 404 for an unknown collection and requires read access.
    The ``links`` and ``merge`` request arguments are passed to the Excel
    generator as ``links`` and ``one_sheet``.
    """
    collection = obj_or_404(Collection.by_id(collection_id))
    require(request.authz.can_read(collection.id))
    workbook = generate_excel(collection, request.authz,
                              links=arg_bool('links'),
                              one_sheet=arg_bool('merge'))
    file_name = "%s Cross-referenced.xlsx" % string_value(collection.label)
    return send_file(workbook, as_attachment=True,
                     attachment_filename=file_name)
Example #28
0
 def add_column(self, label):
     """Register a new column for ``label`` and return it.

     The column name is the slugified label (``column`` when that is
     empty), cut to 55 characters and made unique among existing columns
     by appending ``_2``, ``_3``, ...

     :param label: human-readable column label.
     :return: a ``TabularColumn`` wrapping the new schema entry.
     """
     label = string_value(label)
     base = slugify(label or '', sep='_') or 'column'
     base = base[:55]
     taken = set(c.name for c in self.columns)
     name, i = base, 2
     # de-dupe: column, column_2, column_3, ...
     # Always suffix the base name; suffixing the previous candidate gave
     # names such as ``column_2_3``.
     while name in taken:
         name = '%s_%s' % (base, i)
         i += 1
     # Store the de-duplicated name; it was computed but then discarded.
     column = {'label': label, 'name': name}
     self.schema['columns'].append(column)
     return TabularColumn(self, column)
Example #29
0
 def generate_rows():
     """Yield one dict per row, mapping column names to cell text.

     Columns without a cell in the row are filled with None; rows that
     produce no values are skipped, and rows that fail to decode are
     logged and skipped.
     """
     for index, row in enumerate(row_set):
         record = {}
         try:
             for cell, column in zip(row, columns):
                 record[column.name] = string_value(cell.value)
             if record:
                 for column in columns:
                     record.setdefault(column.name, None)
                 yield record
         except Exception as exception:
             log.warning("Could not decode row %s in %s: %s",
                         index, meta, exception)
Example #30
0
 def crawl_file(self, collection_id, file_path, base_meta):
     """Ingest one file into a collection; errors are logged, not raised.

     Paths that are not regular files are reported and skipped. The text
     form of the path serves as foreign ID and source path, and its base
     name as file name.
     """
     try:
         if os.path.isfile(file_path):
             meta = self.make_meta(base_meta)
             path_text = string_value(file_path)
             meta.foreign_id = meta.source_path = path_text
             meta.file_name = os.path.basename(path_text)
             ingest_file(collection_id, meta, path_text, move=False)
         else:
             log.info('Invalid file path: %r', file_path)
     except Exception as ex:
         log.exception(ex)
Example #31
0
 def crawl_file(self, source_id, file_path, base_meta):
     """Ingest one file into a source; errors are logged, not raised.

     Paths that are not regular files are reported and skipped. The text
     form of the path serves as foreign ID and source path, and its base
     name as file name.
     """
     try:
         if os.path.isfile(file_path):
             meta = self.make_meta(base_meta)
             path_text = string_value(file_path)
             meta.foreign_id = meta.source_path = path_text
             meta.file_name = os.path.basename(path_text)
             ingest_file(source_id, meta, path_text, move=False)
         else:
             log.info('Invalid file path: %r', file_path)
     except Exception as ex:
         log.exception(ex)
Example #32
0
 def add_column(self, label):
     """Register a new column for ``label`` and return it.

     The column name is the slugified label (``column`` when that is
     empty), cut to 55 characters and made unique among existing columns
     by appending ``_2``, ``_3``, ...

     :param label: human-readable column label.
     :return: a ``TabularColumn`` wrapping the new schema entry.
     """
     label = string_value(label)
     base = slugify(label or '', sep='_') or 'column'
     base = base[:55]
     taken = set(c.name for c in self.columns)
     name, i = base, 2
     # de-dupe: column, column_2, column_3, ...
     # Always suffix the base name; suffixing the previous candidate gave
     # names such as ``column_2_3``.
     while name in taken:
         name = '%s_%s' % (base, i)
         i += 1
     # Store the de-duplicated name; it was computed but then discarded.
     column = {'label': label, 'name': name}
     self.schema['columns'].append(column)
     return TabularColumn(self, column)
Example #33
0
def extract_pdf(path, languages=None):
    """
    Extract content from a PDF file.

    This will attempt to use pdfminer to extract textual content from
    each page. If a page cannot be parsed or yields fewer than three
    characters, it is sent through OCR instead.

    :param path: file system path of the PDF.
    :param languages: passed to the OCR step.
    :return: dict with a ``pages`` list of page texts plus the entries of
        the last document info dictionary under lower-cased keys.
    """
    with open(path, 'rb') as fh:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            parser = PDFParser(fh)
            doc = PDFDocument(parser, '')

            result = {'pages': []}
            if len(doc.info):
                for k, v in doc.info[-1].items():
                    k = k.lower().strip()
                    v = string_value(v)
                    if k == 'pages' or v is None or '<PDFObjRef:' in v:
                        continue
                    # ``v`` is already converted; no second conversion.
                    result[k] = v

            for i, page in enumerate(PDFPage.create_pages(doc)):
                text = None
                try:
                    interpreter.process_page(page)
                    layout = device.get_result()
                    text = _convert_page(layout, path)
                except Exception as ex:
                    log.warning("Failed to parse PDF page: %r", ex)

                if text is None or len(text) < 3:
                    log.info("OCR: %r, pg. %s", path, i + 1)
                    text = _extract_image_page(path, i + 1, languages)
                result['pages'].append(text)
            return result
        finally:
            # Close the device even when parsing or OCR raises.
            device.close()
Example #34
0
def extract_pdf(path, languages=None):
    """
    Extract content from a PDF file.

    This will attempt to use pdfminer to extract textual content from
    each page. If a page cannot be parsed or yields fewer than three
    characters, it is sent through OCR instead.

    :param path: file system path of the PDF.
    :param languages: passed to the OCR step.
    :return: dict with a ``pages`` list of page texts plus the entries of
        the last document info dictionary under lower-cased keys.
    """
    with open(path, 'rb') as fh:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            parser = PDFParser(fh)
            doc = PDFDocument(parser, '')

            result = {'pages': []}
            if len(doc.info):
                for k, v in doc.info[-1].items():
                    k = k.lower().strip()
                    v = string_value(v)
                    if k == 'pages' or v is None or '<PDFObjRef:' in v:
                        continue
                    # ``v`` is already converted; no second conversion.
                    result[k] = v

            for i, page in enumerate(PDFPage.create_pages(doc)):
                text = None
                try:
                    interpreter.process_page(page)
                    layout = device.get_result()
                    text = _convert_page(layout, path)
                except Exception as ex:
                    log.warning("Failed to parse PDF page: %r", ex)

                if text is None or len(text) < 3:
                    log.info("OCR: %r, pg. %s", path, i + 1)
                    text = _extract_image_page(path, i + 1, languages)
                result['pages'].append(text)
            return result
        finally:
            # Close the device even when parsing or OCR raises.
            device.close()
Example #35
0
    def ingest_attachment(self, part, meta):
        """Write a message part to a temporary file and ingest it.

        Parts without a body are logged and skipped. The child metadata
        takes its MIME type and file name from what was detected on the
        part; the temporary file is removed after ingestion in all cases.
        """
        if part.body is None:
            log.warning("Empty attachment [%r]: %s", meta, part)
            return

        attachment_meta = meta.make_child()
        attachment_meta.mime_type = six.text_type(part.detected_content_type)
        attachment_meta.file_name = string_value(part.detected_file_name)
        temp_path = self.write_temp(part.body, attachment_meta.extension)

        try:
            ingest_file(self.collection_id, attachment_meta, temp_path,
                        move=True)
        finally:
            remove_tempfile(temp_path)
Example #36
0
 def crawl(self, directory=None, collection=None, meta=None):
     """Ingest a directory (or single path) into a collection.

     :param directory: path to ingest; a missing or non-existent path is
         logged as an error and nothing is done.
     :param collection: label of the collection (its slug is used in the
         foreign ID); defaults to the normalised absolute path.
     :param meta: base metadata handed to ``make_meta``; defaults to an
         empty dict.
     """
     directory = string_value(directory)
     if directory is None or not os.path.exists(directory):
         log.error("Invalid directory: %r", directory)
         return
     directory = os.path.abspath(os.path.normpath(directory))
     collection = collection or directory
     collection = Collection.create({
         'foreign_id': 'directory:%s' % slugify(collection),
         'label': collection
     })
     db.session.commit()
     # A fresh dict per call instead of a shared mutable default argument.
     meta = self.make_meta({} if meta is None else meta)
     meta.source_path = directory
     ingest_directory(collection.id, meta, directory)
Example #37
0
 def ingest_attachment(self, attachment, meta):
     """Write an attachment to a temporary file and ingest it.

     Attachments without data are logged and skipped. Any error is logged
     rather than raised, and the temporary file is removed whether or not
     ingestion succeeds.
     """
     try:
         if attachment.data is None:
             log.warning("Attachment is empty [%r]: %s", meta,
                         attachment.longFilename)
             return
         out_path = make_tempfile()
         try:
             # Binary mode: text mode can alter the payload on write.
             # NOTE(review): assumes ``attachment.data`` is a byte string.
             with open(out_path, 'wb') as fh:
                 fh.write(attachment.data)
             child = meta.make_child()
             child.file_name = string_value(attachment.longFilename)
             ingest_file(self.collection_id, child, out_path, move=True)
         finally:
             # Previously skipped when writing or ingesting raised.
             remove_tempfile(out_path)
     except Exception as ex:
         log.exception(ex)
Example #38
0
    def generate_rows(self, db, columns):
        """Yield one dict per database record, keyed by column name.

        :param db: object exposing ``numrec`` and ``select(index)``, where
            ``select`` returns a mapping of field to value.
        :param columns: mapping from field to output column name.

        Values are converted with ``string_value``; columns missing from a
        record are filled with None, and empty records are skipped.
        """
        if db.numrec == 0:
            return
        # The former first pass over all records only filled a list that
        # was never read, so it has been dropped.
        for i in xrange(0, db.numrec):
            row = db.select(i)
            record = {}
            for k, value in row.items():
                name = columns.get(k)
                record[name] = string_value(value)
            if len(record):
                for name in columns.values():
                    record[name] = record.get(name, None)
                yield record
Example #39
0
def fingerprint(value, **kwargs):
    """Return the fingerprint generated from the text form of ``value``."""
    text = string_value(value)
    return fingerprints.generate(text)
Example #40
0
def make_tempdir(name=None):
    """Create a fresh temporary directory and return the path inside it.

    The returned directory is named ``name`` (``data`` when not given) and
    lives inside a new ``mkdtemp`` directory prefixed with ``TMP_PREFIX``.
    """
    leaf = string_value(name) or 'data'
    parent = mkdtemp(prefix=TMP_PREFIX)
    dirpath = path.join(parent, leaf)
    os.makedirs(dirpath)
    return dirpath
Example #41
0
def make_tempfile(name=None, suffix=None):
    """Return a file path inside a new temporary directory.

    The file is called ``name`` (``data`` when not given), with ``suffix``
    appended after a single dot if one is supplied. Only the directory is
    created here, not the file itself.
    """
    file_name = string_value(name) or 'data'
    extension = string_value(suffix)
    if extension is not None:
        file_name = '%s.%s' % (file_name, extension.strip('.'))
    return os.path.join(make_tempdir(), file_name)
Example #42
0
def trim(value, **kwargs):
    """Return the text form of ``value`` without surrounding whitespace.

    Returns None when ``string_value`` yields None, instead of failing with
    an AttributeError on ``None.strip()``.
    """
    value = string_value(value)
    if value is None:
        return None
    return value.strip()
Example #43
0
def lowercase(value, **kwargs):
    """Return the text form of ``value`` in lower case.

    Returns None when ``string_value`` yields None, instead of failing with
    an AttributeError on ``None.lower()``.
    """
    value = string_value(value)
    if value is None:
        return None
    return value.lower()
Example #44
0
def addressfp(value, **kwargs):
    """Return an order-preserving fingerprint of an address.

    ``<br/>`` markers are replaced by spaces first; a value without a text
    form gives None.
    """
    text = string_value(value)
    if text is not None:
        flattened = text.replace("<br/>", " ")
        return fingerprints.generate(flattened, keep_order=True)
    return None
Example #45
0
def make_fingerprint(text, **kwargs):
    """Return the normalised entity name for ``text``, used for the graph."""
    normalised = string_value(text)
    return fingerprints.generate(normalised)