Exemple #1
0
def ingest_url(source_id, metadata, url):
    """Download ``url``, archive the payload and queue it for ingest.

    The response body is streamed into a temporary file. The metadata is
    completed with the response URL (unless a ``source_url`` is already
    set) and the response headers, then the file is handed to the archive
    and the ``ingest`` task is queued with the resulting metadata.

    A response with an HTTP status of 400 or above is logged and aborts
    the ingest: its body is an error page rather than the requested
    document, so it must not be archived. Any exception raised while
    downloading or archiving is logged, reported via
    ``process.exception`` and swallowed.
    """
    clear_session()
    meta = Metadata(data=metadata)
    try:
        with NamedTemporaryFile() as fh:
            log.info("Ingesting URL: %r", url)
            res = requests.get(url, stream=True)
            if res.status_code >= 400:
                log.error("Error ingesting %r: %r", url, res.status_code)
                # The streamed body is never read on this path, so the
                # connection has to be released explicitly.
                res.close()
                return
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:  # skip empty chunks
                    fh.write(chunk)
            # The archive is given the file by name, so buffered bytes
            # must be written out first.
            fh.flush()
            if not meta.has('source_url'):
                meta.source_url = res.url
            meta.headers = res.headers
            meta = get_archive().archive_file(fh.name, meta, move=True)
    except Exception as ex:
        log.exception(ex)
        process.exception(process.INGEST,
                          component='ingest_url',
                          source_id=source_id,
                          meta=meta,
                          exception=ex)
        return
    ingest.delay(source_id, meta.data)
Exemple #2
0
 def test_basic_functions(self):
     """Expect a blank title to read back as the file name.

     Also expects ``countries`` to stay empty when only languages are set.
     """
     md = Metadata()
     md.file_name = 'foo.doc'
     md.title = '  '
     md.languages = ['en', 'xx']
     assert md.file_name == 'foo.doc', md.file_name
     # The whitespace-only title is expected to be replaced by the name.
     assert md.title == 'foo.doc', md.title
     assert len(md.countries) == 0, md.countries
Exemple #3
0
 def test_basic_functions(self):
     """Expect basic fields to round-trip and derived fields to follow.

     A blank title should read back as the file name, the extension should
     be taken from the file name, and ``countries`` should stay empty.
     """
     md = Metadata()
     md.file_name = 'foo.doc'
     md.title = '  '
     md.languages = ['en', 'xx']
     md.author = 'The Man'
     assert md.file_name == 'foo.doc', md.file_name
     # The whitespace-only title is expected to be replaced by the name.
     assert md.title == 'foo.doc', md.title
     assert md.extension == 'doc', md.extension
     assert len(md.countries) == 0, md.countries
     assert md.author == 'The Man', md.author
Exemple #4
0
def ingest_url(source_id, metadata, url):
    """Download ``url``, archive the payload and queue it for ingest.

    The response body is streamed into a temporary file. The metadata is
    completed with the response URL (unless a ``source_url`` is already
    set) and the response headers, then the file is handed to the archive
    and the ``ingest`` task is queued with the resulting metadata.

    A response with an HTTP status of 400 or above is logged and aborts
    the ingest: its body is an error page rather than the requested
    document, so it must not be archived. Exceptions are not handled here
    and propagate to the caller.
    """
    clear_session()
    meta = Metadata(data=metadata)
    with NamedTemporaryFile() as fh:
        log.info("Ingesting URL: %r", url)
        res = requests.get(url, stream=True)
        if res.status_code >= 400:
            log.error("Error ingesting %r: %r", url, res.status_code)
            # The streamed body is never read on this path, so the
            # connection has to be released explicitly.
            res.close()
            return
        for chunk in res.iter_content(chunk_size=1024):
            if chunk:  # skip empty chunks
                fh.write(chunk)
        # The archive is given the file by name, so buffered bytes must
        # be written out first.
        fh.flush()
        if not meta.has("source_url"):
            meta.source_url = res.url
        meta.headers = res.headers
        meta = archive.archive_file(fh.name, meta, move=True)
        ingest.delay(source_id, meta.data)
Exemple #5
0
    def test_emails(self):
        """Expect valid e-mail addresses to be kept and invalid ones dropped.

        A valid address should also contribute its domain to ``domains``.
        """
        meta = Metadata()
        # The address must live at pudo.org for the domain assertion below
        # to be satisfiable; a masked placeholder address cannot pass it.
        meta.add_email('test@pudo.org')
        assert len(meta.emails) == 1, meta.emails
        assert len(meta.domains) == 1, (meta.emails, meta.domains)
        assert meta.domains[0] == 'pudo.org', meta.domains

        # A string without an address shape is expected to be rejected.
        meta = Metadata()
        meta.add_email('not-an-email')
        assert len(meta.emails) == 0, meta.emails
Exemple #6
0
    def test_dates(self):
        """Expect ``dates`` to hold only parseable, distinct dates."""
        unparseable = Metadata()
        unparseable.date = 'yada yada'
        assert not len(unparseable.dates), unparseable.dates

        dated = Metadata()
        dated.date = '2001-01-20'
        assert len(dated.dates) == 1, dated.dates
        # Same day through another field: the count is expected to stay.
        dated.authored_at = '2001-01-20'
        assert len(dated.dates) == 1, dated.dates
        # A different day is expected to add a second entry.
        dated.published_at = '2002-01-20'
        assert len(dated.dates) == 2, dated.dates
Exemple #7
0
def ingest(source_id, metadata):
    """Write an INGEST process-log entry, then dispatch the document.

    The log entry is best effort: if writing it raises, the exception is
    logged and the dispatch to ``Ingestor`` still takes place.
    """
    document_meta = Metadata(data=metadata)
    try:
        process.log(process.INGEST, component='ingest',
                    meta=document_meta, source_id=source_id)
    except Exception as log_failure:
        log.exception(log_failure)
    Ingestor.dispatch(source_id, document_meta)
Exemple #8
0
def ingest_url(source_id, metadata, url):
    """Download ``url``, archive the payload and queue it for ingest.

    The response body is streamed into a temporary file. The metadata is
    completed with the response URL (unless a ``source_url`` is already
    set) and the response headers, then the file is handed to the archive
    and the ``ingest`` task is queued with the resulting metadata.

    A response with an HTTP status of 400 or above is logged and aborts
    the ingest: its body is an error page rather than the requested
    document, so it must not be archived. Any exception raised while
    downloading or archiving is logged, reported via
    ``process.exception`` and swallowed.
    """
    clear_session()
    meta = Metadata(data=metadata)
    try:
        with NamedTemporaryFile() as fh:
            log.info("Ingesting URL: %r", url)
            res = requests.get(url, stream=True)
            if res.status_code >= 400:
                log.error("Error ingesting %r: %r", url, res.status_code)
                # The streamed body is never read on this path, so the
                # connection has to be released explicitly.
                res.close()
                return
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:  # skip empty chunks
                    fh.write(chunk)
            # The archive is given the file by name, so buffered bytes
            # must be written out first.
            fh.flush()
            if not meta.has('source_url'):
                meta.source_url = res.url
            meta.headers = res.headers
            meta = get_archive().archive_file(fh.name, meta, move=True)
    except Exception as ex:
        log.exception(ex)
        process.exception(process.INGEST, component='ingest_url',
                          source_id=source_id, meta=meta, exception=ex)
        return
    ingest.delay(source_id, meta.data)
Exemple #9
0
 def test_basic_functions(self):
     """Expect basic fields to round-trip and derived fields to follow.

     A blank title should read back as the file name, the extension should
     be taken from the file name, and ``countries`` should stay empty.
     """
     md = Metadata()
     md.file_name = 'foo.doc'
     md.title = '  '
     md.languages = ['en', 'xx']
     md.author = 'The Man'
     assert md.file_name == 'foo.doc', md.file_name
     # The whitespace-only title is expected to be replaced by the name.
     assert md.title == 'foo.doc', md.title
     assert md.extension == 'doc', md.extension
     assert len(md.countries) == 0, md.countries
     assert md.author == 'The Man', md.author
Exemple #10
0
    def test_dates(self):
        """Expect ``dates`` to hold only parseable, distinct dates."""
        unparseable = Metadata()
        unparseable.date = 'yada yada'
        assert not len(unparseable.dates), unparseable.dates

        dated = Metadata()
        dated.date = '2001-01-20'
        assert len(dated.dates) == 1, dated.dates
        # Same day through another field: the count is expected to stay.
        dated.authored_at = '2001-01-20'
        assert len(dated.dates) == 1, dated.dates
        # A different day is expected to add a second entry.
        dated.published_at = '2002-01-20'
        assert len(dated.dates) == 2, dated.dates
Exemple #11
0
 def meta(self):
     """Return a ``Metadata`` built from the stored metadata dict.

     Before wrapping, the stored dict is created if missing and its
     ``content_hash`` and ``foreign_id`` entries are overwritten with the
     values of the attributes of the same name on this object.
     """
     self._meta = self._meta or {}
     # Copy the attributes in a fixed order: content hash, then foreign id.
     for key in ('content_hash', 'foreign_id'):
         self._meta[key] = getattr(self, key)
     stored = self._meta or {}
     return Metadata(data=stored)
Exemple #12
0
    def test_dates(self):
        """Expect ``add_date`` to ignore junk and de-duplicate dates."""
        unparseable = Metadata()
        unparseable.add_date('yada yada')
        assert not len(unparseable.dates), unparseable.dates

        # NOTE: a check that add_date('today') yields one date was left
        # disabled here:
        # unparseable.add_date('today')
        # assert len(unparseable.dates) == 1, unparseable.dates

        dated = Metadata()
        dated.add_date('2001-01-20')
        assert len(dated.dates) == 1, dated.dates
        # Adding the same day again is expected to change nothing.
        dated.add_date('2001-01-20')
        assert len(dated.dates) == 1, dated.dates
        dated.add_date('2002-01-20')
        assert len(dated.dates) == 2, dated.dates
Exemple #13
0
    def test_urls(self):
        """Expect valid URLs to be kept, with their domain recorded."""
        from_list = Metadata()
        from_list.urls = ['http://google.com']
        assert len(from_list.urls) == 1, from_list.urls
        assert len(from_list.domains) == 1, from_list.domains
        assert from_list.domains[0] == 'google.com', from_list.domains

        # A scheme with no host is expected to be rejected.
        no_host = Metadata()
        no_host.add_url('http://')
        assert not len(no_host.urls), no_host.urls

        with_path = Metadata()
        with_path.add_url('http://www.google.com/xxx')
        assert len(with_path.urls) == 1, with_path.urls
        assert len(with_path.domains) == 1, with_path.domains
Exemple #14
0
 def test_languages(self):
     """Expect unknown country codes to be dropped on assignment.

     Despite its name, this test exercises the ``countries`` field.
     """
     md = Metadata()
     md.countries = ['xx', 'de']
     # Only 'de' is expected to survive; 'xx' should be filtered out.
     assert len(md.countries) == 1, md.countries
     assert md.countries[0] == 'de', md.countries
Exemple #15
0
 def test_keywords(self):
     """Expect an assigned keyword list to read back unchanged."""
     md = Metadata()
     md.keywords = ['test']
     assert len(md.keywords) == 1, md.keywords
     assert md.keywords[0] == 'test', md.keywords
Exemple #16
0
 def test_file_names(self):
     """Expect ``file_name`` to be normalised and ``file_title`` kept raw."""
     md = Metadata()
     md.file_name = 'Foo Schnasel.doc'
     # Reading back is expected to give a lower-case, underscored name.
     assert md.file_name == 'foo_schnasel.doc', md.file_name
     assert md.file_title == 'Foo Schnasel.doc', md.file_title
Exemple #17
0
 def test_file_names(self):
     """Expect ``safe_file_name`` to be sanitised and ``file_name`` kept."""
     md = Metadata()
     md.file_name = 'Foo Schnasel.doc'
     # The safe variant is expected to replace the space, keeping case.
     assert md.safe_file_name == 'Foo_Schnasel.doc', md.safe_file_name
     assert md.file_name == 'Foo Schnasel.doc', md.file_name
Exemple #18
0
 def test_file_names(self):
     """Expect ``safe_file_name`` to be sanitised and ``file_name`` kept."""
     md = Metadata()
     md.file_name = 'Foo Schnasel.doc'
     # The safe variant is expected to replace the space, keeping case.
     assert md.safe_file_name == 'Foo_Schnasel.doc', md.safe_file_name
     assert md.file_name == 'Foo Schnasel.doc', md.file_name
Exemple #19
0
 def test_languages(self):
     """Expect unknown country codes to be dropped on assignment.

     Despite its name, this test exercises the ``countries`` field.
     """
     md = Metadata()
     md.countries = ['xx', 'de']
     # Only 'de' is expected to survive; 'xx' should be filtered out.
     assert len(md.countries) == 1, md.countries
     assert md.countries[0] == 'de', md.countries
Exemple #20
0
 def test_keywords(self):
     """Expect an assigned keyword list to read back unchanged."""
     md = Metadata()
     md.keywords = ['test']
     assert len(md.keywords) == 1, md.keywords
     assert md.keywords[0] == 'test', md.keywords