Esempio n. 1
0
 def headers(self, headers):
     """Replace the stored headers with a normalised copy of ``headers``.

     The stored mapping is always reset to an empty dict first. If
     ``headers`` is not a ``Mapping`` nothing else happens. Otherwise each
     key is passed through ``slugify`` (with ``_`` as separator) and each
     value through ``string_value`` before being stored.
     """
     normalised = {}
     self._headers = normalised
     if isinstance(headers, Mapping):
         for raw_key, raw_value in headers.items():
             slug = slugify(raw_key, sep='_')
             # Filled in place so the attribute reflects progress so far.
             normalised[slug] = string_value(raw_value)
Esempio n. 2
0
 def on_text(self, text):
     """Extract named entities from ``text`` and record them on ``self``.

     Texts that are ``None`` or at most 100 characters long are ignored.
     When ``self.meta.languages`` holds exactly one entry it is passed to
     ``Text`` as the language hint. For every entity found:

     * entities tagged ``I-LOC`` are skipped;
     * tokens whose lower- and upper-case forms are equal are dropped;
     * fewer than two remaining tokens, or a joined name shorter than 5 or
       longer than 150 characters, causes the entity to be skipped.

     Surviving names are stored in ``self.entity_names`` and their schema is
     appended to ``self.entity_schemata``, both keyed by
     ``"<origin>:<slugified name>"``. Errors are logged, never raised.
     """
     if text is None:
         return
     if len(text) <= 100:
         return
     try:
         hint_language_code = (
             self.meta.languages[0]
             if len(self.meta.languages) == 1
             else None
         )
         parsed = Text(text, hint_language_code=hint_language_code)
         for entity in parsed.entities:
             if entity.tag == 'I-LOC':
                 continue
             # Keep only tokens that contain at least one cased character.
             tokens = [tok for tok in entity if tok.lower() != tok.upper()]
             if len(tokens) < 2:
                 continue
             name = ' '.join(tokens)
             if not 5 <= len(name) <= 150:
                 continue
             schema = SCHEMAS.get(entity.tag, DEFAULT_SCHEMA)
             fk = '%s:%s' % (self.origin, slugify(name))
             self.entity_schemata[fk].append(schema)
             self.entity_names[fk] = name
     except ValueError as ve:
         log.info('NER value error: %r', ve)
     except Exception as ex:
         log.warning('NER failed: %r', ex)
Esempio n. 3
0
def make_filename(file_name, sep='-'):
    """Turn ``file_name`` into a slugified base name, or ``None``.

    Only the final path component is used. It is split on its last dot into
    at most two pieces; each piece is passed through ``slugify`` with ``sep``
    and truncated to 200 characters, and pieces for which ``slugify`` returns
    ``None`` are dropped. The pieces are re-joined with a dot, and leading or
    trailing dots and ``sep`` characters are stripped.

    Returns ``None`` when ``file_name`` is ``None`` or nothing but whitespace
    remains after cleaning.
    """
    if file_name is None:
        return None
    base_name = os.path.basename(six.text_type(file_name))
    pieces = []
    for piece in base_name.rsplit('.', 1):
        slug = slugify(piece, sep=sep)
        if slug is not None:
            pieces.append(slug[:200])
    cleaned = '.'.join(pieces).strip('.').strip(sep)
    cleaned = six.text_type(cleaned)
    if len(cleaned.strip()) == 0:
        return None
    return cleaned
Esempio n. 4
0
 def add_column(self, label):
     """Add a column for ``label`` to the schema and return its wrapper.

     The column name is derived from the label via ``slugify`` (``_`` as
     separator), falls back to ``'column'`` when the slug is empty, and is
     cut to 55 characters. If that name is already used by one of
     ``self.columns`` a numeric suffix is appended to the base name:
     ``column``, ``column_2``, ``column_3``, ...

     Returns a ``TabularColumn`` built from the new schema entry.
     """
     label = string_value(label)
     base = slugify(label or '', sep='_') or 'column'
     base = base[:55]
     # Collect existing names once instead of rebuilding the list per probe.
     taken = {c.name for c in self.columns}
     name, i = base, 2
     # de-dupe: column, column_2, column_3, ...
     while name in taken:
         # Always suffix the base name so suffixes do not pile up
         # (column_2_3) as they did when the previous candidate was reused.
         name = '%s_%s' % (base, i)
         i += 1
     # Store the de-duplicated name; storing the base name would register
     # duplicate column names.
     column = {'label': label, 'name': name}
     self.schema['columns'].append(column)
     return TabularColumn(self, column)
Esempio n. 5
0
 def crawl(self, directory=None, foreign_id=None, meta=None):
     """Load a collection for ``directory`` and ingest the directory into it.

     Args:
         directory: Path of the directory to crawl. If it is ``None`` or
             does not exist, an error is logged and nothing else happens.
         foreign_id: Foreign ID of the collection. Defaults to
             ``'directory:<slug of the absolute path>'``.
         meta: Metadata passed to ``self.make_meta``. Defaults to a fresh
             empty dict for every call.

     Returns:
         None.
     """
     directory = string_value(directory)
     if directory is None or not os.path.exists(directory):
         log.error("Invalid directory: %r", directory)
         return
     directory = os.path.abspath(os.path.normpath(directory))
     if foreign_id is None:
         foreign_id = 'directory:%s' % slugify(directory)
     collection = self.load_collection({
         'foreign_id': foreign_id,
         'label': directory,
         'managed': True
     })
     db.session.commit()
     # A mutable default ({}) would be shared between calls; build a new
     # dict whenever the caller did not supply metadata.
     meta = self.make_meta({} if meta is None else meta)
     meta.source_path = directory
     ingest_directory(collection.id, meta, directory)
0
 def headers(self):
     """Return the stored headers with each key passed through ``slugify``.

     A falsy ``self._headers`` is treated as an empty mapping. Values are
     returned unchanged; keys are slugified with ``_`` as separator.
     """
     stored = self._headers
     if not stored:
         stored = {}
     normalised = {}
     for name, value in stored.items():
         normalised[slugify(name, sep='_')] = value
     return normalised
Esempio n. 7
0
 def headers(self):
     """Return the ``'headers'`` entry of ``self.meta`` with slugified keys.

     A missing entry is treated as an empty mapping. Values are returned
     unchanged; keys are passed through ``slugify`` with ``_`` as separator.
     """
     normalised = {}
     for name, value in self.meta.get('headers', {}).items():
         normalised[slugify(name, sep='_')] = value
     return normalised