def ingest_directory(collection_id, meta, local_path, base_path=None, move=False): """Ingest all the files in a directory.""" # This is somewhat hacky, see issue #55 for the rationale. if not os.path.exists(local_path): log.error("Invalid path: %r", local_path) return base_path = base_path or local_path if not os.path.isdir(local_path): child = meta.make_child() child.source_path = base_path return ingest_file(collection_id, child, local_path, move=move) # handle bundles claimed = [] for cls in get_ingestors(): if not hasattr(cls, 'bundle'): continue bundler = cls(collection_id) claimed.extend(bundler.bundle(meta, local_path)) # recurse downward into the directory: for entry in os.listdir(local_path): entry_path = os.path.join(local_path, string_value(entry)) entry_base = os.path.join(base_path, string_value(entry)) if entry in SKIP_ENTRIES or entry in claimed: log.debug("Ignore: %r", entry_base) continue log.info("Handle [%s]: %s", meta.crawler_run, entry_base) # We don't care if it is a file, this is handled at # the beginning anyway. ingest_directory(collection_id, meta, entry_path, base_path=entry_base, move=move)
def crawl(self, directory=None, collection=None, meta=None):
    """Crawl a directory tree and ingest every file into a collection.

    :param directory: root directory (or a single file) to crawl.
    :param collection: optional collection label; defaults to the path.
    :param meta: optional base metadata dict applied to each file.
    """
    # Fix: ``meta={}`` was a shared mutable default argument.
    meta = meta or {}
    collection = collection or directory
    collection = Collection.create({
        'foreign_id': 'directory:%s' % slugify(collection),
        'label': collection
    })
    db.session.commit()
    collection_id = collection.id
    if os.path.isfile(directory):
        # A single file was passed: ingest it and stop instead of
        # falling through to the (pointless) directory walk.
        return self.crawl_file(collection_id, directory, meta)
    directory = directory or os.getcwd()
    directory = directory.encode('utf-8')
    for (dirname, dirs, files) in os.walk(directory):
        # Skip any directory that has a blacklisted path component.
        dirparts = [d for d in dirname.split(os.path.sep)
                    if d in SKIP_DIRECTORIES]
        if len(dirparts):
            continue
        log.info("Descending: %r", dirname)
        for file_name in files:
            dirname = string_value(dirname)
            file_name = string_value(file_name)
            if file_name in SKIP_FILES:
                continue
            file_path = os.path.join(dirname, file_name)
            self.crawl_file(collection_id, file_path, meta)
def parse_headers(self, header, meta):
    """Copy salient e-mail headers (subject, message ID, sender,
    recipients, date) onto the metadata object and return it."""
    meta.title = header.get('Subject')
    message_id = header.get('Message-Id')
    if message_id:
        meta.foreign_id = string_value(message_id)
    sender = header.get('From')
    if sender:
        parsed = address.parse(sender)
        if parsed is not None:
            meta.author = parsed.to_unicode()
            meta.add_email(parsed.address)
    for field in ('To', 'CC', 'BCC'):
        value = header.get(field)
        if not value:
            continue
        for recipient in address.parse_list(value):
            meta.add_email(recipient.address)
    date = rfc822.parsedate(header.get('Date'))
    if date is not None:
        meta.add_date(datetime.fromtimestamp(mktime(date)))
    meta.headers = {k: string_value(v) for k, v in header.items()}
    return meta
def extract_pdf(path, languages=None):
    """
    Extract content from a PDF file.

    This will attempt to use pdfminer to extract textual content from
    each page. If none is found, it'll send the images through OCR.

    :param path: PDF file on disk.
    :param languages: optional OCR language hints.
    :returns: dict with a ``pages`` list plus any document metadata.
    """
    result = {"pages": []}
    # ``with`` replaces the manual open/try/finally/close dance and
    # guarantees the handle is released on every exit path.
    with open(path, "rb") as fh:
        try:
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            parser = PDFParser(fh)
            doc = PDFDocument(parser, "")
            if len(doc.info):
                # Use the last info dict (most recent metadata update).
                for k, v in doc.info[-1].items():
                    k = k.lower().strip()
                    v = string_value(v)
                    # Skip the page count and unresolved object refs.
                    if k != "pages" and v is not None and "<PDFObjRef:" not in v:
                        result[k] = string_value(v)
            for i, page in enumerate(PDFPage.create_pages(doc)):
                result["pages"].append(_convert_page(interpreter, page, device, i + 1, path, languages))
            device.close()
        except PSEOF as eof:
            # Truncated PDFs are common; return what was parsed so far.
            log.info("Unexpected EOF: %r", eof)
    return result
def crawl(self, directory=None, source=None, meta=None):
    """Crawl a directory tree and ingest every file into a source.

    :param directory: root directory (or a single file) to crawl.
    :param source: optional source label; defaults to the path.
    :param meta: optional base metadata dict applied to each file.
    """
    # Fix: ``meta={}`` was a shared mutable default argument.
    meta = meta or {}
    source = source or directory
    source = Source.create({
        'foreign_id': 'directory:%s' % slugify(source),
        'label': source
    })
    db.session.commit()
    source_id = source.id
    if os.path.isfile(directory):
        # A single file was passed: ingest it and stop instead of
        # falling through to the (pointless) directory walk.
        return self.crawl_file(source_id, directory, meta)
    directory = directory or os.getcwd()
    directory = directory.encode('utf-8')
    for (dirname, dirs, files) in os.walk(directory):
        # Skip any directory that has a blacklisted path component.
        dirparts = [d for d in dirname.split(os.path.sep)
                    if d in SKIP_DIRECTORIES]
        if len(dirparts):
            continue
        log.info("Descending: %r", dirname)
        for file_name in files:
            dirname = string_value(dirname)
            file_name = string_value(file_name)
            if file_name in SKIP_FILES:
                continue
            file_path = os.path.join(dirname, file_name)
            self.crawl_file(source_id, file_path, meta)
def check_filters(self, data):
    """Return True when ``data`` matches every configured ``filters``
    entry and none of the ``filters_not`` entries."""
    required = self.data.get('filters', {})
    if any(string_value(v) != data.get(k) for k, v in required.items()):
        return False
    excluded = self.data.get('filters_not', {})
    return all(string_value(v) != data.get(k) for k, v in excluded.items())
def text_parts(self):
    """Utility method to get all text snippets in a record: every data
    value first, then the record's own text field."""
    candidates = list(self.data.values()) if self.data is not None else []
    candidates.append(self.text)
    for candidate in candidates:
        snippet = string_value(candidate)
        if snippet is not None:
            yield snippet
def generate_records(document):
    """Generate index records, based on document rows or pages.

    Yields elasticsearch bulk-action dicts: one per page for text
    documents, one per row for tabular documents, each indexed as a
    child (``_parent``) of the document.
    """
    if document.type == Document.TYPE_TEXT:
        for page in document.pages:
            # Record ID: sha1 over the document ID and the page ID.
            tid = sha1(str(document.id))
            tid.update(str(page.id))
            tid = tid.hexdigest()
            text = string_value(page.text)
            # Transliterated copy keeps non-latin scripts searchable.
            latin = latinize_text(text)
            yield {
                '_id': tid,
                '_type': TYPE_RECORD,
                '_index': six.text_type(es_index),
                '_parent': document.id,
                '_source': {
                    'type': 'page',
                    'content_hash': document.content_hash,
                    'document_id': document.id,
                    'collection_id': document.collection_id,
                    'page': page.number,
                    'text': text,
                    'text_latin': latin
                }
            }
    elif document.type == Document.TYPE_TABULAR:
        for record in document.records:
            data = {k: string_value(v) for (k, v) in record.data.items()}
            text = [v for v in data.values() if v is not None]
            # Only index transliterations that differ from the source.
            latin = [latinize_text(t) for t in text]
            latin = [t for t in latin if t not in text]
            yield {
                '_id': record.tid,
                '_type': TYPE_RECORD,
                '_index': six.text_type(es_index),
                '_parent': document.id,
                '_source': {
                    'type': 'row',
                    'content_hash': document.content_hash,
                    'document_id': document.id,
                    'collection_id': document.collection_id,
                    'row_id': record.row_id,
                    'sheet': record.sheet,
                    'text': text,
                    'text_latin': latin,
                    'raw': data
                }
            }
def iterrows(self):
    """Iterate through the table applying filters on-the-go."""
    # Map bare column names to their fully-qualified active refs.
    mapping = {ref.split('.')[-1]: ref for ref in self.active_refs}
    for csv_url in self.csv_urls:
        log.info("Import [%s]: %s", self.dataset.name, csv_url)
        for row in self.read_csv(csv_url):
            data = {}
            for raw_key, raw_value in row.items():
                ref = mapping.get(string_value(raw_key))
                if ref is None:
                    # Column not part of the active mapping; drop it.
                    continue
                data[ref] = string_value(raw_value)
            if self.check_filters(data):
                yield data
def get_text(document):
    """Generate an array with the full text of the given document.

    This will limit document length to TEXT_MAX_LEN in order to avoid
    uploading extremely long documents.
    """
    texts = []
    for text in document.text_parts():
        text = string_value(text)
        if text is None:
            # string_value() yields None for empty/undecodable input;
            # appending None would crash len() in the sum below.
            continue
        texts.append(text)
        latin = latinize_text(text)
        if latin != text:
            texts.append(latin)
    text_len = sum(len(t) for t in texts)
    # Try getting rid of duplicate entries, which are more likely in
    # tabular documents. If that does not help, partial text will be
    # returned. (The original re-checked the length afterwards but
    # returned ``texts`` on both branches, so one check suffices.)
    if text_len >= TEXT_MAX_LEN:
        texts = list(set(texts))
    return texts
def add_language(self, language):
    """Record a language code, ignoring values that are empty, not a
    known language code, or already present."""
    code = string_value(language)
    if code is None:
        return
    code = code.lower()
    if not is_language_code(code):
        return
    if code not in self._languages:
        self._languages.append(code)
def headers(self, headers):
    """Store normalised headers: keys slugified with underscores,
    values coerced to text. Non-mapping input resets to empty."""
    if isinstance(headers, Mapping):
        self._headers = {
            slugify(key, sep='_'): string_value(value)
            for key, value in headers.items()
        }
    else:
        self._headers = {}
def parse_date(text, guess=True, date_format=None):
    """The classic: date parsing, every which way.

    :returns: an ISO 8601 date string, a partial date string, or None.
    """
    # handle date/datetime before converting to text.
    if isinstance(text, datetime):
        text = text.date()
    if isinstance(text, date):
        return text.isoformat()
    text = string_value(text)
    if text is None:
        return
    elif date_format is not None:
        # parse with a specified format
        try:
            obj = datetime.strptime(text, date_format)
            return obj.date().isoformat()
        except (ValueError, TypeError):
            # Fix: a bare ``except:`` also swallowed SystemExit and
            # KeyboardInterrupt; only parsing failures belong here.
            pass
    elif guess and not is_partial_date(text):
        # use dateparser to guess the format
        try:
            obj = fuzzy_date_parser(text)
            return obj.date().isoformat()
        except Exception:
            pass
    else:
        # limit to the date part of a presumed date string
        text = text[:10]
    # strip -00-00 from dates because it makes ES barf.
    text = CUT_ZEROES.sub('', text)
    if is_partial_date(text):
        return text
def extract_pdf(path, languages=None):
    """
    Extract content from a PDF file.

    This will convert the whole file to XML using `pdftohtml`, then run
    OCR on individual images within the file.

    :param path: PDF file on disk.
    :param languages: optional OCR language hints, forwarded per page.
    :returns: dict with a ``pages`` list of extracted page content.
    """
    temp_dir = make_tempdir()
    try:
        out_file = os.path.join(temp_dir, 'pdf.xml')
        log.info("Converting PDF to XML: %r...", path)
        pdftohtml = get_config('PDFTOHTML_BIN')
        args = [pdftohtml, '-xml', '-hidden', '-q', '-nodrm', path, out_file]
        subprocess.call(args)
        if not os.path.exists(out_file):
            raise IngestorException("Could not convert PDF to XML: %s" % path)
        with open(out_file, 'r') as fh:
            xml = string_value(fh.read())
        # Drop the encoding declaration: lxml refuses to parse unicode
        # strings that still carry one.
        xml = xml.replace('encoding="UTF-8"', '')
        # recover=True tolerates the malformed XML pdftohtml can emit.
        parser = etree.XMLParser(recover=True, remove_comments=True)
        doc = etree.fromstring(xml, parser=parser)
        log.debug("Parsed XML: %r", path)
        pages = []
        for page in doc.findall('./page'):
            pages.append(extract_page(path, temp_dir, page, languages))
        return {'pages': pages}
    finally:
        # Always clean up the scratch directory, even on failure.
        remove_tempdir(temp_dir)
def chomp(text, lower=False):
    """Coerce to text and strip whitespace; returns None when nothing
    remains. Optionally lower-cases the result."""
    value = string_value(text)
    if value is None:
        return None
    value = value.strip()
    if len(value) == 0:
        return None
    if lower:
        return value.lower()
    return value
def content_hash(self):
    """Return the stored content hash, or fall back to a SHA1 digest
    of the foreign ID when no content hash has been set."""
    if self._content_hash is not None:
        return self._content_hash
    if self._foreign_id is None:
        return None
    foreign_id = string_value(self.foreign_id)
    if foreign_id is None:
        return None
    return sha1(foreign_id.encode('utf-8')).hexdigest()
def ingest(self, meta, local_path):
    """Unpack an Outlook PST file with ``readpst`` and ingest every
    extracted item as a child document of ``meta``."""
    work_dir = make_tempdir()
    try:
        bin_path = os.environ.get('READPST_BIN', 'readpst')
        # NOTE(review): flag meanings assumed from readpst usage
        # (-e per-message files, -8 UTF-8, -o output dir) — confirm
        # against the readpst man page.
        args = [bin_path, '-D', '-e', '-8', '-b', '-o', work_dir, local_path]
        log.debug('Converting Outlook PST file: %r', ' '.join(args))
        subprocess.call(args)
        for (dirpath, dirnames, filenames) in os.walk(work_dir):
            # PST folder path relative to the extraction root; used
            # both for keywords and for building foreign IDs.
            reldir = os.path.relpath(string_value(dirpath), string_value(work_dir))
            for filename in filenames:
                filename = string_value(filename)
                child = meta.make_child()
                # Tag the child with each PST folder name.
                for kw in reldir.split(os.path.sep):
                    child.add_keyword(kw)
                # Foreign ID: parent ID + relative folder + file name.
                fid = os.path.join(string_value(meta.foreign_id), string_value(reldir), filename)
                child.foreign_id = string_value(fid)
                file_path = os.path.join(string_value(dirpath), filename)
                # move=True: the extracted copy is consumed by ingestion.
                ingest_file(self.collection_id, child, file_path, move=True)
    finally:
        # Always remove the scratch directory, even on failure.
        remove_tempdir(work_dir)
def add_language(self, language):
    """Add a language code to the metadata dict, skipping invalid
    codes and duplicates, and persist when a code was added."""
    languages = self.meta.setdefault('languages', [])
    code = string_value(language)
    if code is None:
        return
    code = code.lower()
    if is_language_code(code) and code not in languages:
        languages.append(code)
        self.update_meta()
def phone(value, prop=None, **kwargs):
    """Normalise a phone number into international format; returns
    None for missing, unparseable or impossible numbers."""
    try:
        text = string_value(value)
        if text is None:
            return
        parsed = phonenumbers.parse(text, prop.country)
        if not phonenumbers.is_possible_number(parsed):
            return
        return phonenumbers.format_number(parsed, phonenumbers.PhoneNumberFormat.INTERNATIONAL)  # noqa
    except Exception:
        # Best-effort: any parsing failure yields None.
        return
def file_title(self):
    """The file title is a human-readable interpretation of the file name.
    It is used for labelling or as a backup title. It should not be used
    to generate an actual file system path."""
    title = self.meta.get('file_name')
    if title is None:
        # derive file name from headers
        disposition = self.headers.get('content_disposition')
        if disposition is not None:
            _, attrs = cgi.parse_header(disposition)
            title = string_value(unquote(attrs.get('filename') or ''))
    if title is None and self.source_url:
        parsed = urlparse(self.source_url)
        title = string_value(unquote(os.path.basename(parsed.path) or ''))
    return title
def by_foreign_id(cls, foreign_id, collection_id, deleted=False):
    """Find the entity in a collection carrying the given foreign ID
    (entities store an array of foreign IDs), preferring live rows."""
    foreign_id = string_value(foreign_id)
    if foreign_id is None:
        return None
    q = cls.all(deleted=deleted)
    q = q.filter(Entity.collection_id == collection_id)
    # Cast to a one-element Postgres text array so the array
    # containment operator can match against foreign_ids.
    foreign_id = func.cast([foreign_id], ARRAY(db.Unicode()))
    q = q.filter(cls.foreign_ids.contains(foreign_id))
    # NULL deleted_at (live entities) sorts first.
    q = q.order_by(Entity.deleted_at.desc().nullsfirst())
    return q.first()
def crawl_file(self, source, file_path, base_meta):
    """Ingest a single file into the given source; errors are logged
    and suppressed so a crawl can continue past bad files."""
    try:
        # Guard against vanished files / dangling symlinks, matching
        # the sibling crawl_file() implementations.
        if not os.path.isfile(file_path):
            log.info('Invalid file path: %r', file_path)
            return
        meta = self.make_meta(base_meta)
        file_path = string_value(file_path)
        meta.foreign_id = file_path
        meta.source_path = file_path
        meta.file_name = os.path.basename(file_path)
        ingest_file(source.id, meta, file_path, move=False)
    except Exception as ex:
        log.exception(ex)
def update(self, entity):
    """Update this entity from a bundle dict: validate the data against
    the schema, normalise foreign IDs, and stage the row for commit."""
    # Fix: copy before injecting 'name', so the caller's nested
    # 'data' dict is not mutated as a side effect.
    data = dict(entity.get('data') or {})
    data['name'] = entity.get('name')
    self.data = self.schema.validate(data)
    self.name = self.data.pop('name')
    # Drop unconvertible (None) foreign IDs and de-duplicate.
    fid = [string_value(f) for f in entity.get('foreign_ids') or []]
    self.foreign_ids = list(set([f for f in fid if f is not None]))
    self.state = entity.pop('state', self.STATE_ACTIVE)
    self.updated_at = datetime.utcnow()
    db.session.add(self)
def file_title(self):
    """Best-effort human-readable file name: the explicit name first,
    then the Content-Disposition header, the source path, and finally
    the source URL."""
    title = self._file_name
    if title is None:
        # derive file name from headers
        disposition = self.headers.get('content_disposition')
        if disposition is not None:
            _, attrs = cgi.parse_header(disposition)
            title = string_value(unquote(attrs.get('filename') or ''))
    if title is None and self.source_path:
        title = string_value(os.path.basename(self.source_path) or '')
    if title is None and self.source_url:
        parsed = urlparse(self.source_url)
        title = string_value(unquote(os.path.basename(parsed.path) or ''))
    return title
def report(collection_id):
    """Download an Excel cross-reference report for a collection the
    requesting user is allowed to read."""
    collection = obj_or_404(Collection.by_id(collection_id))
    require(request.authz.can_read(collection.id))
    output = generate_excel(collection, request.authz,
                            links=arg_bool('links'),
                            one_sheet=arg_bool('merge'))
    file_name = "%s Cross-referenced.xlsx" % string_value(collection.label)
    return send_file(output, as_attachment=True,
                     attachment_filename=file_name)
def add_column(self, label):
    """Append a column with the given label, deriving a unique slug
    name (column, column_2, column_3, ...) and return its wrapper."""
    label = string_value(label)
    column = slugify(label or '', sep='_')
    column = column or 'column'
    column = column[:55]
    existing = [c.name for c in self.columns]
    # de-dupe: column, column_2, column_3, ...
    name, i = column, 2
    while name in existing:
        # Fix: derive each candidate from the base slug; the previous
        # code appended to the prior candidate (column_2_3, ...).
        name = '%s_%s' % (column, i)
        i += 1
    # Fix: store the de-duplicated name; the original stored the base
    # slug, discarding the result of the loop above.
    column = {'label': label, 'name': name}
    self.schema['columns'].append(column)
    return TabularColumn(self, column)
def generate_rows():
    """Yield one dict per sheet row, keyed by column name.

    Closure: relies on ``row_set``, ``columns`` and ``meta`` from the
    enclosing scope.
    """
    for i, row in enumerate(row_set):
        record = {}
        try:
            # Cells and column definitions are aligned positionally.
            for cell, column in zip(row, columns):
                record[column.name] = string_value(cell.value)
            if len(record):
                # Backfill missing columns with None so each record
                # carries the full column set.
                for column in columns:
                    record[column.name] = record.get(column.name, None)
                yield record
        except Exception as exception:
            # Skip undecodable rows but keep processing the sheet.
            log.warning("Could not decode row %s in %s: %s", i, meta, exception)
def crawl_file(self, collection_id, file_path, base_meta):
    """Ingest one file into the collection; failures are logged and
    suppressed so directory crawls keep going."""
    try:
        if not os.path.isfile(file_path):
            log.info('Invalid file path: %r', file_path)
            return
        path_text = string_value(file_path)
        meta = self.make_meta(base_meta)
        meta.foreign_id = path_text
        meta.source_path = path_text
        meta.file_name = os.path.basename(path_text)
        ingest_file(collection_id, meta, path_text, move=False)
    except Exception as ex:
        log.exception(ex)
def crawl_file(self, source_id, file_path, base_meta):
    """Ingest one file into the source; failures are logged and
    suppressed so directory crawls keep going."""
    try:
        if not os.path.isfile(file_path):
            log.info('Invalid file path: %r', file_path)
            return
        path_text = string_value(file_path)
        meta = self.make_meta(base_meta)
        meta.foreign_id = path_text
        meta.source_path = path_text
        meta.file_name = os.path.basename(path_text)
        ingest_file(source_id, meta, path_text, move=False)
    except Exception as ex:
        log.exception(ex)
def extract_pdf(path, languages=None):
    """
    Extract content from a PDF file.

    This will attempt to use pdfminer to extract textual content from
    each page. If none is found, it'll send the images through OCR.
    """
    with open(path, 'rb') as fh:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        parser = PDFParser(fh)
        doc = PDFDocument(parser, '')
        result = {'pages': []}
        if len(doc.info):
            # Use the last info dict (most recent metadata update).
            for k, v in doc.info[-1].items():
                k = k.lower().strip()
                v = string_value(v)
                # Skip the page count and unresolved object references.
                if k != 'pages' and v is not None and '<PDFObjRef:' not in v:
                    result[k] = string_value(v)
        for i, page in enumerate(PDFPage.create_pages(doc)):
            text = None
            try:
                interpreter.process_page(page)
                layout = device.get_result()
                text = _convert_page(layout, path)
            except Exception as ex:
                # Keep going: a single broken page should not abort
                # extraction of the whole document.
                log.warning("Failed to parse PDF page: %r", ex)
            # Fewer than 3 characters of text: assume a scanned page
            # and fall back to OCR of the rendered page image.
            if text is None or len(text) < 3:
                log.info("OCR: %r, pg. %s", path, i + 1)
                text = _extract_image_page(path, i + 1, languages)
            result['pages'].append(text)
        device.close()
        return result
def ingest_attachment(self, part, meta):
    """Persist a MIME attachment part to a temporary file and ingest
    it as a child document; parts without a body are skipped."""
    if part.body is None:
        log.warning("Empty attachment [%r]: %s", meta, part)
        return
    child = meta.make_child()
    child.mime_type = six.text_type(part.detected_content_type)
    child.file_name = string_value(part.detected_file_name)
    temp_path = self.write_temp(part.body, child.extension)
    try:
        ingest_file(self.collection_id, child, temp_path, move=True)
    finally:
        # Clean up the temp copy even if ingestion fails.
        remove_tempfile(temp_path)
def crawl(self, directory=None, collection=None, meta={}):
    """Ingest a whole directory tree into a collection, created from
    the directory path unless an explicit label is given."""
    directory = string_value(directory)
    if directory is None or not os.path.exists(directory):
        log.error("Invalid directory: %r", directory)
        return
    directory = os.path.abspath(os.path.normpath(directory))
    label = collection or directory
    record = Collection.create({
        'foreign_id': 'directory:%s' % slugify(label),
        'label': label
    })
    db.session.commit()
    crawl_meta = self.make_meta(meta)
    crawl_meta.source_path = directory
    ingest_directory(record.id, crawl_meta, directory)
def ingest_attachment(self, attachment, meta):
    """Write an Outlook .msg attachment to a temp file and ingest it
    as a child document. Errors are logged, never raised."""
    try:
        if attachment.data is None:
            log.warning("Attachment is empty [%r]: %s", meta, attachment.longFilename)
            return
        out_path = make_tempfile()
        try:
            # Fix: open in binary mode — attachment payloads are raw
            # bytes; text mode corrupts them (and fails on Python 3).
            with open(out_path, 'wb') as fh:
                fh.write(attachment.data)
            child = meta.make_child()
            child.file_name = string_value(attachment.longFilename)
            ingest_file(self.collection_id, child, out_path, move=True)
        finally:
            # Fix: previously only removed on success, leaking temp
            # files whenever ingestion raised.
            remove_tempfile(out_path)
    except Exception as ex:
        log.exception(ex)
def generate_rows(self, db, columns):
    """Yield one record dict per row of a DBF table, mapping raw field
    keys through ``columns`` and coercing values to text."""
    if db.numrec == 0:
        return
    # Fix: removed a dead pre-pass that collected string values into a
    # ``text`` list which was never read — it only doubled the number
    # of db.select() calls.
    for i in xrange(0, db.numrec):
        row = db.select(i)
        record = {}
        for key, value in row.items():
            record[columns.get(key)] = string_value(value)
        if len(record):
            # Backfill missing columns with None so every record
            # carries the full column set.
            for name in columns.values():
                record[name] = record.get(name, None)
            yield record
def fingerprint(value, **kwargs):
    """Generate a normalised fingerprint for the given value."""
    text = string_value(value)
    return fingerprints.generate(text)
def make_tempdir(name=None):
    """Create a fresh temporary directory and return its path; the
    leaf directory is called ``name`` (default: 'data')."""
    leaf = string_value(name) or 'data'
    dirpath = path.join(mkdtemp(prefix=TMP_PREFIX), leaf)
    os.makedirs(dirpath)
    return dirpath
def make_tempfile(name=None, suffix=None):
    """Return a path for a new temporary file inside a fresh temp
    directory, optionally with the given base name and suffix."""
    base = string_value(name) or 'data'
    ext = string_value(suffix)
    if ext is None:
        file_name = base
    else:
        file_name = '%s.%s' % (base, ext.strip('.'))
    return os.path.join(make_tempdir(), file_name)
def trim(value, **kwargs):
    """Strip surrounding whitespace from a value coerced to text.

    Fix: ``string_value`` can return None (empty/undecodable input),
    which previously crashed on ``.strip()``; now returns None instead
    (matching the guard used by the sibling ``addressfp`` transform).
    """
    value = string_value(value)
    if value is None:
        return None
    return value.strip()
def lowercase(value, **kwargs):
    """Lower-case a value coerced to text.

    Fix: ``string_value`` can return None (empty/undecodable input),
    which previously crashed on ``.lower()``; now returns None instead
    (matching the guard used by the sibling ``addressfp`` transform).
    """
    value = string_value(value)
    if value is None:
        return None
    return value.lower()
def addressfp(value, **kwargs):
    """Fingerprint a postal address, preserving token order; HTML
    line breaks are treated as spaces."""
    text = string_value(value)
    if text is None:
        return
    text = text.replace("<br/>", " ")
    return fingerprints.generate(text, keep_order=True)
def make_fingerprint(text, **kwargs):
    """Generate a normalised entity name, used for the graph."""
    value = string_value(text)
    return fingerprints.generate(value)