def ingest_directory(collection_id, meta, local_path, base_path=None, move=False): """Ingest all the files in a directory.""" # This is somewhat hacky, see issue #55 for the rationale. if not os.path.exists(local_path): log.error("Invalid path: %r", local_path) return base_path = base_path or local_path if not os.path.isdir(local_path): child = meta.make_child() child.source_path = base_path return ingest_file(collection_id, child, local_path, move=move) # handle bundles claimed = [] for cls in get_ingestors(): if not hasattr(cls, 'bundle'): continue bundler = cls(collection_id) claimed.extend(bundler.bundle(meta, local_path)) # recurse downward into the directory: for entry in os.listdir(local_path): entry_path = os.path.join(local_path, string_value(entry)) entry_base = os.path.join(base_path, string_value(entry)) if entry in SKIP_ENTRIES or entry in claimed: log.debug("Ignore: %r", entry_base) continue log.info("Handle [%s]: %s", meta.crawler_run, entry_base) # We don't care if it is a file, this is handled at # the beginning anyway. ingest_directory(collection_id, meta, entry_path, base_path=entry_base, move=move)
def crawl(self, directory=None, collection=None, meta=None):
    """Crawl a directory tree and ingest every file into a collection.

    :param directory: root directory (or a single file) to crawl.
    :param collection: optional collection label; defaults to the path.
    :param meta: optional base metadata dict applied to each file.
    """
    # Fix: ``meta={}`` was a shared mutable default argument.
    meta = meta or {}
    collection = collection or directory
    collection = Collection.create({
        'foreign_id': 'directory:%s' % slugify(collection),
        'label': collection
    })
    db.session.commit()
    collection_id = collection.id
    if os.path.isfile(directory):
        # A single file was passed: ingest it and stop instead of
        # falling through to the (pointless) directory walk.
        return self.crawl_file(collection_id, directory, meta)
    directory = directory or os.getcwd()
    directory = directory.encode('utf-8')
    for (dirname, dirs, files) in os.walk(directory):
        # Skip any directory that has a blacklisted path component.
        dirparts = [d for d in dirname.split(os.path.sep)
                    if d in SKIP_DIRECTORIES]
        if len(dirparts):
            continue
        log.info("Descending: %r", dirname)
        for file_name in files:
            dirname = string_value(dirname)
            file_name = string_value(file_name)
            if file_name in SKIP_FILES:
                continue
            file_path = os.path.join(dirname, file_name)
            self.crawl_file(collection_id, file_path, meta)
def parse_headers(self, header, meta):
    """Copy salient e-mail headers (subject, message ID, sender,
    recipients, date) onto the metadata object and return it."""
    meta.title = header.get('Subject')
    message_id = header.get('Message-Id')
    if message_id:
        meta.foreign_id = string_value(message_id)
    sender = header.get('From')
    if sender:
        parsed = address.parse(sender)
        if parsed is not None:
            meta.author = parsed.to_unicode()
            meta.add_email(parsed.address)
    for field in ('To', 'CC', 'BCC'):
        value = header.get(field)
        if not value:
            continue
        for recipient in address.parse_list(value):
            meta.add_email(recipient.address)
    date = rfc822.parsedate(header.get('Date'))
    if date is not None:
        meta.add_date(datetime.fromtimestamp(mktime(date)))
    meta.headers = {k: string_value(v) for k, v in header.items()}
    return meta
def extract_pdf(path, languages=None):
    """
    Extract content from a PDF file.

    This will attempt to use pdfminer to extract textual content from
    each page. If none is found, it'll send the images through OCR.

    :param path: PDF file on disk.
    :param languages: optional OCR language hints.
    :returns: dict with a ``pages`` list plus any document metadata.
    """
    result = {"pages": []}
    # ``with`` replaces the manual open/try/finally/close dance and
    # guarantees the handle is released on every exit path.
    with open(path, "rb") as fh:
        try:
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            parser = PDFParser(fh)
            doc = PDFDocument(parser, "")
            if len(doc.info):
                # Use the last info dict (most recent metadata update).
                for k, v in doc.info[-1].items():
                    k = k.lower().strip()
                    v = string_value(v)
                    # Skip the page count and unresolved object refs.
                    if k != "pages" and v is not None and "<PDFObjRef:" not in v:
                        result[k] = string_value(v)
            for i, page in enumerate(PDFPage.create_pages(doc)):
                result["pages"].append(_convert_page(interpreter, page, device, i + 1, path, languages))
            device.close()
        except PSEOF as eof:
            # Truncated PDFs are common; return what was parsed so far.
            log.info("Unexpected EOF: %r", eof)
    return result
def crawl(self, directory=None, source=None, meta=None):
    """Crawl a directory tree and ingest every file into a source.

    :param directory: root directory (or a single file) to crawl.
    :param source: optional source label; defaults to the path.
    :param meta: optional base metadata dict applied to each file.
    """
    # Fix: ``meta={}`` was a shared mutable default argument.
    meta = meta or {}
    source = source or directory
    source = Source.create({
        'foreign_id': 'directory:%s' % slugify(source),
        'label': source
    })
    db.session.commit()
    source_id = source.id
    if os.path.isfile(directory):
        # A single file was passed: ingest it and stop instead of
        # falling through to the (pointless) directory walk.
        return self.crawl_file(source_id, directory, meta)
    directory = directory or os.getcwd()
    directory = directory.encode('utf-8')
    for (dirname, dirs, files) in os.walk(directory):
        # Skip any directory that has a blacklisted path component.
        dirparts = [d for d in dirname.split(os.path.sep)
                    if d in SKIP_DIRECTORIES]
        if len(dirparts):
            continue
        log.info("Descending: %r", dirname)
        for file_name in files:
            dirname = string_value(dirname)
            file_name = string_value(file_name)
            if file_name in SKIP_FILES:
                continue
            file_path = os.path.join(dirname, file_name)
            self.crawl_file(source_id, file_path, meta)
def check_filters(self, data):
    """Return True when ``data`` matches every configured ``filters``
    entry and none of the ``filters_not`` entries."""
    required = self.data.get('filters', {})
    if any(string_value(v) != data.get(k) for k, v in required.items()):
        return False
    excluded = self.data.get('filters_not', {})
    return all(string_value(v) != data.get(k) for k, v in excluded.items())
def text_parts(self):
    """Utility method to get all text snippets in a record: every data
    value first, then the record's own text field."""
    candidates = list(self.data.values()) if self.data is not None else []
    candidates.append(self.text)
    for candidate in candidates:
        snippet = string_value(candidate)
        if snippet is not None:
            yield snippet
def generate_records(document):
    """Generate index records, based on document rows or pages.

    Yields elasticsearch bulk-action dicts: one per page for text
    documents, one per row for tabular documents, each indexed as a
    child (``_parent``) of the document.
    """
    if document.type == Document.TYPE_TEXT:
        for page in document.pages:
            # Record ID: sha1 over the document ID and the page ID.
            tid = sha1(str(document.id))
            tid.update(str(page.id))
            tid = tid.hexdigest()
            text = string_value(page.text)
            # Transliterated copy keeps non-latin scripts searchable.
            latin = latinize_text(text)
            yield {
                '_id': tid,
                '_type': TYPE_RECORD,
                '_index': six.text_type(es_index),
                '_parent': document.id,
                '_source': {
                    'type': 'page',
                    'content_hash': document.content_hash,
                    'document_id': document.id,
                    'collection_id': document.collection_id,
                    'page': page.number,
                    'text': text,
                    'text_latin': latin
                }
            }
    elif document.type == Document.TYPE_TABULAR:
        for record in document.records:
            data = {k: string_value(v) for (k, v) in record.data.items()}
            text = [v for v in data.values() if v is not None]
            # Only index transliterations that differ from the source.
            latin = [latinize_text(t) for t in text]
            latin = [t for t in latin if t not in text]
            yield {
                '_id': record.tid,
                '_type': TYPE_RECORD,
                '_index': six.text_type(es_index),
                '_parent': document.id,
                '_source': {
                    'type': 'row',
                    'content_hash': document.content_hash,
                    'document_id': document.id,
                    'collection_id': document.collection_id,
                    'row_id': record.row_id,
                    'sheet': record.sheet,
                    'text': text,
                    'text_latin': latin,
                    'raw': data
                }
            }
def iterrows(self):
    """Iterate through the table applying filters on-the-go."""
    # Map bare column names to their fully-qualified active refs.
    mapping = {ref.split('.')[-1]: ref for ref in self.active_refs}
    for csv_url in self.csv_urls:
        log.info("Import [%s]: %s", self.dataset.name, csv_url)
        for row in self.read_csv(csv_url):
            data = {}
            for raw_key, raw_value in row.items():
                ref = mapping.get(string_value(raw_key))
                if ref is None:
                    # Column not part of the active mapping; drop it.
                    continue
                data[ref] = string_value(raw_value)
            if self.check_filters(data):
                yield data
def get_text(document):
    """Generate an array with the full text of the given document.

    This will limit document length to TEXT_MAX_LEN in order to avoid
    uploading extremely long documents.
    """
    texts = []
    for text in document.text_parts():
        text = string_value(text)
        if text is None:
            # string_value() yields None for empty/undecodable input;
            # appending None would crash len() in the sum below.
            continue
        texts.append(text)
        latin = latinize_text(text)
        if latin != text:
            texts.append(latin)
    text_len = sum(len(t) for t in texts)
    # Try getting rid of duplicate entries, which are more likely in
    # tabular documents. If that does not help, partial text will be
    # returned. (The original re-checked the length afterwards but
    # returned ``texts`` on both branches, so one check suffices.)
    if text_len >= TEXT_MAX_LEN:
        texts = list(set(texts))
    return texts
def add_language(self, language):
    """Record a language code, ignoring values that are empty, not a
    known language code, or already present."""
    code = string_value(language)
    if code is None:
        return
    code = code.lower()
    if not is_language_code(code):
        return
    if code not in self._languages:
        self._languages.append(code)
def headers(self, headers):
    """Store normalised headers: keys slugified with underscores,
    values coerced to text. Non-mapping input resets to empty."""
    if isinstance(headers, Mapping):
        self._headers = {
            slugify(key, sep='_'): string_value(value)
            for key, value in headers.items()
        }
    else:
        self._headers = {}
def parse_date(text, guess=True, date_format=None):
    """The classic: date parsing, every which way.

    :returns: an ISO 8601 date string, a partial date string, or None.
    """
    # handle date/datetime before converting to text.
    if isinstance(text, datetime):
        text = text.date()
    if isinstance(text, date):
        return text.isoformat()
    text = string_value(text)
    if text is None:
        return
    elif date_format is not None:
        # parse with a specified format
        try:
            obj = datetime.strptime(text, date_format)
            return obj.date().isoformat()
        except (ValueError, TypeError):
            # Fix: a bare ``except:`` also swallowed SystemExit and
            # KeyboardInterrupt; only parsing failures belong here.
            pass
    elif guess and not is_partial_date(text):
        # use dateparser to guess the format
        try:
            obj = fuzzy_date_parser(text)
            return obj.date().isoformat()
        except Exception:
            pass
    else:
        # limit to the date part of a presumed date string
        text = text[:10]
    # strip -00-00 from dates because it makes ES barf.
    text = CUT_ZEROES.sub('', text)
    if is_partial_date(text):
        return text
def extract_pdf(path, languages=None):
    """
    Extract content from a PDF file.

    This will convert the whole file to XML using `pdftohtml`, then run
    OCR on individual images within the file.

    :param path: PDF file on disk.
    :param languages: optional OCR language hints, forwarded per page.
    :returns: dict with a ``pages`` list of extracted page content.
    """
    temp_dir = make_tempdir()
    try:
        out_file = os.path.join(temp_dir, 'pdf.xml')
        log.info("Converting PDF to XML: %r...", path)
        pdftohtml = get_config('PDFTOHTML_BIN')
        args = [pdftohtml, '-xml', '-hidden', '-q', '-nodrm', path, out_file]
        subprocess.call(args)
        if not os.path.exists(out_file):
            raise IngestorException("Could not convert PDF to XML: %s" % path)
        with open(out_file, 'r') as fh:
            xml = string_value(fh.read())
        # Drop the encoding declaration: lxml refuses to parse unicode
        # strings that still carry one.
        xml = xml.replace('encoding="UTF-8"', '')
        # recover=True tolerates the malformed XML pdftohtml can emit.
        parser = etree.XMLParser(recover=True, remove_comments=True)
        doc = etree.fromstring(xml, parser=parser)
        log.debug("Parsed XML: %r", path)
        pages = []
        for page in doc.findall('./page'):
            pages.append(extract_page(path, temp_dir, page, languages))
        return {'pages': pages}
    finally:
        # Always clean up the scratch directory, even on failure.
        remove_tempdir(temp_dir)
def chomp(text, lower=False):
    """Coerce to text and strip whitespace; returns None when nothing
    remains. Optionally lower-cases the result."""
    value = string_value(text)
    if value is None:
        return None
    value = value.strip()
    if len(value) == 0:
        return None
    if lower:
        return value.lower()
    return value
def content_hash(self):
    """Return the stored content hash, or fall back to a SHA1 digest
    of the foreign ID when no content hash has been set."""
    if self._content_hash is not None:
        return self._content_hash
    if self._foreign_id is None:
        return None
    foreign_id = string_value(self.foreign_id)
    if foreign_id is None:
        return None
    return sha1(foreign_id.encode('utf-8')).hexdigest()
def ingest(self, meta, local_path):
    """Unpack an Outlook PST file with ``readpst`` and ingest every
    extracted item as a child document of ``meta``."""
    work_dir = make_tempdir()
    try:
        bin_path = os.environ.get('READPST_BIN', 'readpst')
        # NOTE(review): flag meanings assumed from readpst usage
        # (-e per-message files, -8 UTF-8, -o output dir) — confirm
        # against the readpst man page.
        args = [bin_path, '-D', '-e', '-8', '-b', '-o', work_dir, local_path]
        log.debug('Converting Outlook PST file: %r', ' '.join(args))
        subprocess.call(args)
        for (dirpath, dirnames, filenames) in os.walk(work_dir):
            # PST folder path relative to the extraction root; used
            # both for keywords and for building foreign IDs.
            reldir = os.path.relpath(string_value(dirpath), string_value(work_dir))
            for filename in filenames:
                filename = string_value(filename)
                child = meta.make_child()
                # Tag the child with each PST folder name.
                for kw in reldir.split(os.path.sep):
                    child.add_keyword(kw)
                # Foreign ID: parent ID + relative folder + file name.
                fid = os.path.join(string_value(meta.foreign_id), string_value(reldir), filename)
                child.foreign_id = string_value(fid)
                file_path = os.path.join(string_value(dirpath), filename)
                # move=True: the extracted copy is consumed by ingestion.
                ingest_file(self.collection_id, child, file_path, move=True)
    finally:
        # Always remove the scratch directory, even on failure.
        remove_tempdir(work_dir)
def add_language(self, language):
    """Add a language code to the metadata dict, skipping invalid
    codes and duplicates, and persist when a code was added."""
    languages = self.meta.setdefault('languages', [])
    code = string_value(language)
    if code is None:
        return
    code = code.lower()
    if is_language_code(code) and code not in languages:
        languages.append(code)
        self.update_meta()
def phone(value, prop=None, **kwargs):
    """Normalise a phone number into international format; returns
    None for missing, unparseable or impossible numbers."""
    try:
        text = string_value(value)
        if text is None:
            return
        parsed = phonenumbers.parse(text, prop.country)
        if not phonenumbers.is_possible_number(parsed):
            return
        return phonenumbers.format_number(parsed, phonenumbers.PhoneNumberFormat.INTERNATIONAL)  # noqa
    except Exception:
        # Best-effort: any parsing failure yields None.
        return
def file_title(self):
    """The file title is a human-readable interpretation of the file name.
    It is used for labelling or as a backup title. It should not be used
    to generate an actual file system path."""
    title = self.meta.get('file_name')
    if title is None:
        # derive file name from headers
        disposition = self.headers.get('content_disposition')
        if disposition is not None:
            _, attrs = cgi.parse_header(disposition)
            title = string_value(unquote(attrs.get('filename') or ''))
    if title is None and self.source_url:
        parsed = urlparse(self.source_url)
        title = string_value(unquote(os.path.basename(parsed.path) or ''))
    return title
def by_foreign_id(cls, foreign_id, collection_id, deleted=False):
    """Find the entity in a collection carrying the given foreign ID
    (entities store an array of foreign IDs), preferring live rows."""
    foreign_id = string_value(foreign_id)
    if foreign_id is None:
        return None
    q = cls.all(deleted=deleted)
    q = q.filter(Entity.collection_id == collection_id)
    # Cast to a one-element Postgres text array so the array
    # containment operator can match against foreign_ids.
    foreign_id = func.cast([foreign_id], ARRAY(db.Unicode()))
    q = q.filter(cls.foreign_ids.contains(foreign_id))
    # NULL deleted_at (live entities) sorts first.
    q = q.order_by(Entity.deleted_at.desc().nullsfirst())
    return q.first()
def crawl_file(self, source, file_path, base_meta):
    """Ingest a single file into the given source; errors are logged
    and suppressed so a crawl can continue past bad files."""
    try:
        # Guard against vanished files / dangling symlinks, matching
        # the sibling crawl_file() implementations.
        if not os.path.isfile(file_path):
            log.info('Invalid file path: %r', file_path)
            return
        meta = self.make_meta(base_meta)
        file_path = string_value(file_path)
        meta.foreign_id = file_path
        meta.source_path = file_path
        meta.file_name = os.path.basename(file_path)
        ingest_file(source.id, meta, file_path, move=False)
    except Exception as ex:
        log.exception(ex)
def update(self, entity):
    """Update this entity from a bundle dict: validate the data against
    the schema, normalise foreign IDs, and stage the row for commit."""
    # Fix: copy before injecting 'name', so the caller's nested
    # 'data' dict is not mutated as a side effect.
    data = dict(entity.get('data') or {})
    data['name'] = entity.get('name')
    self.data = self.schema.validate(data)
    self.name = self.data.pop('name')
    # Drop unconvertible (None) foreign IDs and de-duplicate.
    fid = [string_value(f) for f in entity.get('foreign_ids') or []]
    self.foreign_ids = list(set([f for f in fid if f is not None]))
    self.state = entity.pop('state', self.STATE_ACTIVE)
    self.updated_at = datetime.utcnow()
    db.session.add(self)
def file_title(self):
    """Best-effort human-readable file name: the explicit name first,
    then the Content-Disposition header, the source path, and finally
    the source URL."""
    title = self._file_name
    if title is None:
        # derive file name from headers
        disposition = self.headers.get('content_disposition')
        if disposition is not None:
            _, attrs = cgi.parse_header(disposition)
            title = string_value(unquote(attrs.get('filename') or ''))
    if title is None and self.source_path:
        title = string_value(os.path.basename(self.source_path) or '')
    if title is None and self.source_url:
        parsed = urlparse(self.source_url)
        title = string_value(unquote(os.path.basename(parsed.path) or ''))
    return title
def report(collection_id):
    """Download an Excel cross-reference report for a collection the
    requesting user is allowed to read."""
    collection = obj_or_404(Collection.by_id(collection_id))
    require(request.authz.can_read(collection.id))
    output = generate_excel(collection, request.authz,
                            links=arg_bool('links'),
                            one_sheet=arg_bool('merge'))
    file_name = "%s Cross-referenced.xlsx" % string_value(collection.label)
    return send_file(output, as_attachment=True,
                     attachment_filename=file_name)
def add_column(self, label):
    """Append a column with the given label, deriving a unique slug
    name (column, column_2, column_3, ...) and return its wrapper."""
    label = string_value(label)
    column = slugify(label or '', sep='_')
    column = column or 'column'
    column = column[:55]
    existing = [c.name for c in self.columns]
    # de-dupe: column, column_2, column_3, ...
    name, i = column, 2
    while name in existing:
        # Fix: derive each candidate from the base slug; the previous
        # code appended to the prior candidate (column_2_3, ...).
        name = '%s_%s' % (column, i)
        i += 1
    # Fix: store the de-duplicated name; the original stored the base
    # slug, discarding the result of the loop above.
    column = {'label': label, 'name': name}
    self.schema['columns'].append(column)
    return TabularColumn(self, column)
def generate_rows():
    """Yield one dict per sheet row, keyed by column name.

    Closure: relies on ``row_set``, ``columns`` and ``meta`` from the
    enclosing scope.
    """
    for i, row in enumerate(row_set):
        record = {}
        try:
            # Cells and column definitions are aligned positionally.
            for cell, column in zip(row, columns):
                record[column.name] = string_value(cell.value)
            if len(record):
                # Backfill missing columns with None so each record
                # carries the full column set.
                for column in columns:
                    record[column.name] = record.get(column.name, None)
                yield record
        except Exception as exception:
            # Skip undecodable rows but keep processing the sheet.
            log.warning("Could not decode row %s in %s: %s", i, meta, exception)
def crawl_file(self, collection_id, file_path, base_meta):
    """Ingest one file into the collection; failures are logged and
    suppressed so directory crawls keep going."""
    try:
        if not os.path.isfile(file_path):
            log.info('Invalid file path: %r', file_path)
            return
        path_text = string_value(file_path)
        meta = self.make_meta(base_meta)
        meta.foreign_id = path_text
        meta.source_path = path_text
        meta.file_name = os.path.basename(path_text)
        ingest_file(collection_id, meta, path_text, move=False)
    except Exception as ex:
        log.exception(ex)
def crawl_file(self, source_id, file_path, base_meta):
    """Ingest one file into the source; failures are logged and
    suppressed so directory crawls keep going."""
    try:
        if not os.path.isfile(file_path):
            log.info('Invalid file path: %r', file_path)
            return
        path_text = string_value(file_path)
        meta = self.make_meta(base_meta)
        meta.foreign_id = path_text
        meta.source_path = path_text
        meta.file_name = os.path.basename(path_text)
        ingest_file(source_id, meta, path_text, move=False)
    except Exception as ex:
        log.exception(ex)
def extract_pdf(path, languages=None):
    """
    Extract content from a PDF file.

    This will attempt to use pdfminer to extract textual content from
    each page. If none is found, it'll send the images through OCR.
    """
    with open(path, 'rb') as fh:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        parser = PDFParser(fh)
        doc = PDFDocument(parser, '')
        result = {'pages': []}
        if len(doc.info):
            # Use the last info dict (most recent metadata update).
            for k, v in doc.info[-1].items():
                k = k.lower().strip()
                v = string_value(v)
                # Skip the page count and unresolved object references.
                if k != 'pages' and v is not None and '<PDFObjRef:' not in v:
                    result[k] = string_value(v)
        for i, page in enumerate(PDFPage.create_pages(doc)):
            text = None
            try:
                interpreter.process_page(page)
                layout = device.get_result()
                text = _convert_page(layout, path)
            except Exception as ex:
                # Keep going: a single broken page should not abort
                # extraction of the whole document.
                log.warning("Failed to parse PDF page: %r", ex)
            # Fewer than 3 characters of text: assume a scanned page
            # and fall back to OCR of the rendered page image.
            if text is None or len(text) < 3:
                log.info("OCR: %r, pg. %s", path, i + 1)
                text = _extract_image_page(path, i + 1, languages)
            result['pages'].append(text)
        device.close()
        return result
def ingest_attachment(self, part, meta):
    """Persist a MIME attachment part to a temporary file and ingest
    it as a child document; parts without a body are skipped."""
    if part.body is None:
        log.warning("Empty attachment [%r]: %s", meta, part)
        return
    child = meta.make_child()
    child.mime_type = six.text_type(part.detected_content_type)
    child.file_name = string_value(part.detected_file_name)
    temp_path = self.write_temp(part.body, child.extension)
    try:
        ingest_file(self.collection_id, child, temp_path, move=True)
    finally:
        # Clean up the temp copy even if ingestion fails.
        remove_tempfile(temp_path)
def crawl(self, directory=None, collection=None, meta={}):
    """Ingest a whole directory tree into a collection, created from
    the directory path unless an explicit label is given."""
    directory = string_value(directory)
    if directory is None or not os.path.exists(directory):
        log.error("Invalid directory: %r", directory)
        return
    directory = os.path.abspath(os.path.normpath(directory))
    label = collection or directory
    record = Collection.create({
        'foreign_id': 'directory:%s' % slugify(label),
        'label': label
    })
    db.session.commit()
    crawl_meta = self.make_meta(meta)
    crawl_meta.source_path = directory
    ingest_directory(record.id, crawl_meta, directory)
def ingest_attachment(self, attachment, meta):
    """Write an Outlook .msg attachment to a temp file and ingest it
    as a child document. Errors are logged, never raised."""
    try:
        if attachment.data is None:
            log.warning("Attachment is empty [%r]: %s", meta, attachment.longFilename)
            return
        out_path = make_tempfile()
        try:
            # Fix: open in binary mode — attachment payloads are raw
            # bytes; text mode corrupts them (and fails on Python 3).
            with open(out_path, 'wb') as fh:
                fh.write(attachment.data)
            child = meta.make_child()
            child.file_name = string_value(attachment.longFilename)
            ingest_file(self.collection_id, child, out_path, move=True)
        finally:
            # Fix: previously only removed on success, leaking temp
            # files whenever ingestion raised.
            remove_tempfile(out_path)
    except Exception as ex:
        log.exception(ex)
def generate_rows(self, db, columns):
    """Yield one record dict per row of a DBF table, mapping raw field
    keys through ``columns`` and coercing values to text."""
    if db.numrec == 0:
        return
    # Fix: removed a dead pre-pass that collected string values into a
    # ``text`` list which was never read — it only doubled the number
    # of db.select() calls.
    for i in xrange(0, db.numrec):
        row = db.select(i)
        record = {}
        for key, value in row.items():
            record[columns.get(key)] = string_value(value)
        if len(record):
            # Backfill missing columns with None so every record
            # carries the full column set.
            for name in columns.values():
                record[name] = record.get(name, None)
            yield record
def fingerprint(value, **kwargs):
    """Generate a normalised fingerprint for the given value."""
    text = string_value(value)
    return fingerprints.generate(text)
def make_tempdir(name=None):
    """Create a fresh temporary directory and return its path; the
    leaf directory is called ``name`` (default: 'data')."""
    leaf = string_value(name) or 'data'
    dirpath = path.join(mkdtemp(prefix=TMP_PREFIX), leaf)
    os.makedirs(dirpath)
    return dirpath
def make_tempfile(name=None, suffix=None):
    """Return a path for a new temporary file inside a fresh temp
    directory, optionally with the given base name and suffix."""
    base = string_value(name) or 'data'
    ext = string_value(suffix)
    if ext is None:
        file_name = base
    else:
        file_name = '%s.%s' % (base, ext.strip('.'))
    return os.path.join(make_tempdir(), file_name)
def trim(value, **kwargs):
    """Strip surrounding whitespace from a value coerced to text.

    Fix: ``string_value`` can return None (empty/undecodable input),
    which previously crashed on ``.strip()``; now returns None instead
    (matching the guard used by the sibling ``addressfp`` transform).
    """
    value = string_value(value)
    if value is None:
        return None
    return value.strip()
def lowercase(value, **kwargs):
    """Lower-case a value coerced to text.

    Fix: ``string_value`` can return None (empty/undecodable input),
    which previously crashed on ``.lower()``; now returns None instead
    (matching the guard used by the sibling ``addressfp`` transform).
    """
    value = string_value(value)
    if value is None:
        return None
    return value.lower()
def addressfp(value, **kwargs):
    """Fingerprint a postal address, preserving token order; HTML
    line breaks are treated as spaces."""
    text = string_value(value)
    if text is None:
        return
    text = text.replace("<br/>", " ")
    return fingerprints.generate(text, keep_order=True)
def make_fingerprint(text, **kwargs):
    """Generate a normalised entity name, used for the graph."""
    value = string_value(text)
    return fingerprints.generate(value)