def test_emails(self):
    """Check that add_email keeps a parseable address and drops junk."""
    # NOTE(review): the address literal appears masked in this copy of the
    # file; the domain assertion below expects 'pudo.org'.
    accepted = Metadata()
    accepted.add_email('*****@*****.**')
    assert len(accepted.emails) == 1, accepted.emails
    assert len(accepted.domains) == 1, (accepted.emails, accepted.domains)
    assert accepted.domains[0] == 'pudo.org', accepted.domains

    # A string that is not an e-mail address must not be recorded.
    rejected = Metadata()
    rejected.add_email('not-an-email')
    assert len(rejected.emails) == 0, rejected.emails
def test_dates(self):
    """Check how the date attributes feed the ``dates`` collection."""
    # An unparseable value assigned to ``date`` contributes nothing.
    garbage = Metadata()
    garbage.date = 'yada yada'
    assert len(garbage.dates) == 0, garbage.dates

    dated = Metadata()
    dated.date = '2001-01-20'
    assert len(dated.dates) == 1, dated.dates

    # The same day set through another attribute is not counted twice.
    dated.authored_at = '2001-01-20'
    assert len(dated.dates) == 1, dated.dates

    # A different day adds a second entry.
    dated.published_at = '2002-01-20'
    assert len(dated.dates) == 2, dated.dates
def test_dates(self):
    """Check that add_date ignores junk and de-duplicates equal dates."""
    garbage = Metadata()
    garbage.add_date('yada yada')
    assert len(garbage.dates) == 0, garbage.dates
    # Disabled check carried over unchanged from the original test:
    # meta.add_date('today')
    # assert len(meta.dates) == 1, meta.dates

    dated = Metadata()
    dated.add_date('2001-01-20')
    assert len(dated.dates) == 1, dated.dates

    # Adding the same day again must not grow the collection.
    dated.add_date('2001-01-20')
    assert len(dated.dates) == 1, dated.dates

    # A distinct day is stored as a second entry.
    dated.add_date('2002-01-20')
    assert len(dated.dates) == 2, dated.dates
def test_urls(self):
    """Check URL storage and the domains derived from stored URLs."""
    # Assigning the whole list records the URL and its domain.
    assigned = Metadata()
    assigned.urls = ['http://google.com']
    assert len(assigned.urls) == 1, assigned.urls
    assert len(assigned.domains) == 1, assigned.domains
    assert assigned.domains[0] == 'google.com', assigned.domains

    # A scheme without a host is not kept.
    hostless = Metadata()
    hostless.add_url('http://')
    assert len(hostless.urls) == 0, hostless.urls

    # A URL with a path is kept and yields exactly one domain.
    with_path = Metadata()
    with_path.add_url('http://www.google.com/xxx')
    assert len(with_path.urls) == 1, with_path.urls
    assert len(with_path.domains) == 1, with_path.domains
def ingest_url(source_id, metadata, url):
    """Download ``url`` into a temporary file, archive it and queue ingestion.

    ``clear_session()`` is called first, then ``metadata`` is wrapped in a
    ``Metadata`` object. The response body is streamed to a temporary file
    in 1024-byte chunks. If the metadata has no ``source_url`` yet, the
    response's final URL is stored there; the response headers are stored as
    ``headers``. The file is then handed to the archive and
    ``ingest.delay`` is called with the resulting metadata.

    Args:
        source_id: Passed through to ``process.exception`` and
            ``ingest.delay``.
        metadata: Raw metadata passed as ``data=`` to ``Metadata``.
        url: Address fetched with ``requests.get``.

    Returns:
        None. If any step raises, the exception is logged, reported through
        ``process.exception`` and no ingest task is queued.
    """
    clear_session()
    meta = Metadata(data=metadata)
    try:
        with NamedTemporaryFile() as fh:
            log.info("Ingesting URL: %r", url)
            res = requests.get(url, stream=True)
            try:
                # An HTTP error status is only logged; the body is still
                # downloaded and archived, as before.
                if res.status_code >= 400:
                    log.error("Error ingesting %r: %r", url, res.status_code)
                for chunk in res.iter_content(chunk_size=1024):
                    if chunk:
                        fh.write(chunk)
            finally:
                # With stream=True the connection stays checked out until the
                # body is fully consumed or the response is closed; close it
                # explicitly so an aborted download does not leak it.
                res.close()
            # Make sure everything is on disk before the archive reads it.
            fh.flush()
            if not meta.has('source_url'):
                meta.source_url = res.url
            meta.headers = res.headers
            meta = get_archive().archive_file(fh.name, meta, move=True)
    except Exception as ex:
        log.exception(ex)
        process.exception(process.INGEST, component='ingest_url',
                          source_id=source_id, meta=meta, exception=ex)
        return
    ingest.delay(source_id, meta.data)
def test_basic_functions(self):
    """Check basic attribute round-trips on a fresh Metadata object."""
    record = Metadata()
    record.file_name = 'foo.doc'
    record.title = ' '
    record.languages = ['en', 'xx']

    assert record.file_name == 'foo.doc', record.file_name
    # A whitespace-only title is expected to read back as the file name.
    assert record.title == 'foo.doc', record.title
    # No countries were set, so the collection must be empty.
    assert len(record.countries) == 0, record.countries
def ingest(source_id, metadata):
    """Record the ingest in the process log, then dispatch the document.

    ``metadata`` is wrapped in a ``Metadata`` object. A failure while writing
    the process-log entry is logged and otherwise ignored, so dispatching
    through ``Ingestor.dispatch`` happens either way.
    """
    meta = Metadata(data=metadata)

    try:
        process.log(
            process.INGEST,
            component='ingest',
            meta=meta,
            source_id=source_id,
        )
    except Exception as ex:
        # Best effort: process logging must not prevent the ingest itself.
        log.exception(ex)

    Ingestor.dispatch(source_id, meta)
def test_basic_functions(self):
    """Check basic attribute round-trips, including extension and author."""
    record = Metadata()
    record.file_name = 'foo.doc'
    record.title = ' '
    record.languages = ['en', 'xx']
    record.author = 'The Man'

    assert record.file_name == 'foo.doc', record.file_name
    # A whitespace-only title is expected to read back as the file name.
    assert record.title == 'foo.doc', record.title
    assert record.extension == 'doc', record.extension
    # No countries were set, so the collection must be empty.
    assert len(record.countries) == 0, record.countries
    assert record.author == 'The Man', record.author
def test_languages(self):
    """Check that only one of the two assigned country codes is kept.

    Despite its name, this test exercises the ``countries`` attribute.
    """
    record = Metadata()
    record.countries = ['xx', 'de']

    # Of the two codes assigned, only 'de' is expected to survive.
    assert len(record.countries) == 1, record.countries
    assert record.countries[0] == 'de', record.countries
def test_keywords(self):
    """Check that an assigned keyword list reads back unchanged."""
    record = Metadata()
    record.keywords = ['test']

    assert len(record.keywords) == 1, record.keywords
    assert record.keywords[0] == 'test', record.keywords
def test_file_names(self):
    """Check the normalised ``file_name`` and the untouched ``file_title``."""
    record = Metadata()
    record.file_name = 'Foo Schnasel.doc'

    # ``file_name`` reads back lower-cased with the space as an underscore.
    assert record.file_name == 'foo_schnasel.doc', record.file_name
    # ``file_title`` keeps the name exactly as it was assigned.
    assert record.file_title == 'Foo Schnasel.doc', record.file_title
def test_file_names(self):
    """Check ``safe_file_name`` against the unmodified ``file_name``."""
    record = Metadata()
    record.file_name = 'Foo Schnasel.doc'

    # The safe variant swaps the space for an underscore and keeps the case.
    assert record.safe_file_name == 'Foo_Schnasel.doc', record.safe_file_name
    # The plain name reads back exactly as it was assigned.
    assert record.file_name == 'Foo Schnasel.doc', record.file_name
def meta(self):
    """Return a ``Metadata`` object built from the stored ``_meta`` dict.

    Before wrapping, ``_meta`` is replaced by an empty dict if it is falsy,
    and its ``content_hash`` and ``foreign_id`` entries are overwritten with
    this object's current attribute values.
    """
    stored = self._meta
    self._meta = stored if stored else {}

    # Refresh the identifying fields from this object, in a fixed order.
    for field in ('content_hash', 'foreign_id'):
        self._meta[field] = getattr(self, field)

    return Metadata(data=self._meta or {})