    def process(self, item):
        '''Generate a random dataset from a fake identifier'''
        # Get or create a harvested dataset with this identifier.
        # Harvest metadata is already filled on creation.
        dataset = self.get_dataset(item.remote_id)

        # Here comes your implementation. You should:
        # - fetch the remote dataset (if necessary)
        # - validate the fetched payload
        # - map its content to the dataset fields
        # - store extra significant data in the `extra` attribute
        # - map resources data

        dataset.title = faker.sentence()
        dataset.description = faker.text()
        dataset.tags = list(set(faker.words(nb=faker.pyint())))

        # Resources
        for _ in range(faker.pyint()):
            dataset.resources.append(
                Resource(title=faker.sentence(),
                         description=faker.text(),
                         url=faker.url(),
                         filetype='remote',
                         mime=faker.mime_type(category='text'),
                         format=faker.file_extension(category='text'),
                         filesize=faker.pyint()))

        return dataset
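
The commented steps translate directly once a real remote API is involved. A minimal sketch, assuming a hypothetical JSON endpoint fetched with `requests` (the URL and payload fields are illustrative):

    import requests

    def process(self, item):
        '''Harvest a single remote dataset (illustrative sketch)'''
        dataset = self.get_dataset(item.remote_id)
        # Fetch the remote dataset (hypothetical endpoint)
        response = requests.get('https://example.org/api/datasets/{0}'.format(item.remote_id))
        response.raise_for_status()
        payload = response.json()
        # Validate the fetched payload before mapping it
        if not payload.get('title'):
            raise ValueError('Remote dataset has no title')
        # Map its content to the dataset fields
        dataset.title = payload['title']
        dataset.description = payload.get('description', '')
        dataset.tags = payload.get('tags', [])
        return dataset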
Example #2
    def feed(self, feed_title, title, content, url, published=None, summary=None,
             enclosure=None, media_thumbnail=None):
        feed = FeedGenerator()
        feed.title(feed_title)
        feed.description(faker.sentence())
        feed.link({'href': WP_FEED_URL})

        entry = feed.add_entry()
        entry.title(title)
        entry.link({'href': url})
        entry.author(name=faker.name())
        entry.content(content, type="cdata")
        if summary:
            entry.description(summary)
        if enclosure:
            entry.enclosure(url=enclosure['url'],
                            type=enclosure['type'],
                            length=str(faker.pyint()))
        if media_thumbnail:
            feed.load_extension('media')
            entry.media.thumbnail({'url': media_thumbnail})
        tz = pytz.timezone(faker.timezone())
        published = published or faker.date_time(tzinfo=tz)
        entry.published(published)
        entry.updated(faker.date_time_between(start_date=published, tzinfo=tz))

        return feed.rss_str().decode('utf8')
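
The return value is a complete RSS document serialized as a string, so a test can hand it straight to a parser. A usage sketch, assuming the `feedparser` package and a call from within the same test class:

    import feedparser

    xml = self.feed('My feed', 'Post title', '<p>Body</p>', faker.uri())
    parsed = feedparser.parse(xml)
    assert parsed.feed.title == 'My feed'
    assert parsed.entries[0].title == 'Post title'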
Example #3
    def initialize(self):
        '''Generate a list of fake identifiers to harvest'''
        # Here comes your implementation.
        # You should iterate over a remote endpoint to list identifiers
        # to harvest and optionally store extra data.
        for _ in range(faker.pyint()):
            self.add_item(faker.uuid4())  # Accepts kwargs to store data
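
In a real backend, `initialize` would walk a remote listing rather than fabricate identifiers. A minimal sketch, assuming a hypothetical paginated JSON endpoint (URL and field names are illustrative):

    import requests

    def initialize(self):
        '''List remote identifiers to harvest'''
        url = 'https://example.org/api/datasets'
        while url:
            response = requests.get(url)
            response.raise_for_status()
            data = response.json()
            for row in data['results']:
                # Extra kwargs are stored with the item for `process`
                self.add_item(row['id'], title=row.get('title'))
            url = data.get('next')  # None on the last page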
Example #4
    def feed(self, feed_title, title, content, url, published=None, summary=None,
             enclosure=None, media_thumbnail=None):
        feed = AtomFeed(feed_title, feed_url=WP_FEED_URL)
        tz = pytz.timezone(faker.timezone())
        published = published or faker.date_time(tzinfo=tz)
        kwargs = {
            'content_type': 'html',
            'author': faker.name(),
            'url': url,
            'updated': faker.date_time_between(start_date=published, tzinfo=tz),
            'published': published
        }
        if summary:
            kwargs['summary'] = summary
        if enclosure:
            kwargs['links'] = [{
                'type': enclosure['type'],
                'href': enclosure['url'],
                'rel': 'enclosure',
                'length': faker.pyint(),
            }]
        feed.add(title, content, **kwargs)
        out = feed.to_string()
        if media_thumbnail:
            el = '<media:thumbnail url="{0}" />'.format(media_thumbnail)
            out = out.replace('<feed', '<feed xmlns:media="http://search.yahoo.com/mrss/"')
            out = out.replace('</entry>', '{0}</entry>'.format(el))
        return out
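
Note the string surgery at the end: AtomFeed has no Media RSS support, so the `media:thumbnail` element is spliced into the serialized XML by hand, after the `media` namespace is declared on the root `<feed>` element.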
Example #5
def metadata_factory(url, data=None):
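    """Base for a mocked HTTP metadata response"""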
    response = {
        'etag': '',
        'url': url,
        'content-length': faker.pyint(),
        'content-disposition': '',
        'content-md5': faker.md5(),
        'content-location': '',
        'expires': faker.iso8601(),
        'status': 200,
        'updated': faker.iso8601(),
        'last-modified': faker.iso8601(),
        'content-encoding': 'gzip',
        'content-type': faker.mime_type()
    }
    if data:
        response.update(data)
    return json.dumps(response)
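
The optional `data` mapping overrides any of the defaults, which makes it easy to simulate edge cases in tests:

    # Default payload with a 200 status
    body = metadata_factory(faker.uri())
    # Same shape, but simulating a broken link
    body = metadata_factory(faker.uri(), {'status': 404, 'content-length': 0})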
Example #6
    def test_all_resource_fields(self):
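        '''A DCAT distribution with all resource fields should map to a Resource'''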
        node = BNode()
        g = Graph()

        title = faker.sentence()
        url = faker.uri()
        description = faker.paragraph()
        filesize = faker.pyint()
        issued = faker.date_time_between(start_date='-60d', end_date='-30d')
        modified = faker.past_datetime(start_date='-30d')
        mime = faker.mime_type()
        sha1 = faker.sha1()

        g.add((node, RDF.type, DCAT.Distribution))
        g.add((node, DCT.title, Literal(title)))
        g.add((node, DCT.description, Literal(description)))
        g.add((node, DCAT.downloadURL, Literal(url)))
        g.add((node, DCT.issued, Literal(issued)))
        g.add((node, DCT.modified, Literal(modified)))
        g.add((node, DCAT.bytesSize, Literal(filesize)))
        g.add((node, DCAT.mediaType, Literal(mime)))
        g.add((node, DCT.term('format'), Literal('CSV')))

        checksum = BNode()
        g.add((node, SPDX.checksum, checksum))
        g.add((checksum, RDF.type, SPDX.Checksum))
        g.add((checksum, SPDX.algorithm, SPDX.checksumAlgorithm_sha1))
        g.add((checksum, SPDX.checksumValue, Literal(sha1)))

        resource = resource_from_rdf(g)
        resource.validate()

        assert isinstance(resource, Resource)
        assert resource.title == title
        assert resource.url == url
        assert resource.description == description
        assert resource.filesize == filesize
        assert resource.mime == mime
        assert isinstance(resource.checksum, Checksum)
        assert resource.checksum.type == 'sha1'
        assert resource.checksum.value == sha1
        assert resource.published == issued
        assert resource.modified == modified
        assert resource.format == 'csv'
Example #7
def metadata_factory(url, data=None):
    """Base for a mocked Croquemort HTTP response"""
    response = {
        'etag': '',
        'checked-url': url,
        'content-length': faker.pyint(),
        'content-disposition': '',
        'content-md5': faker.md5(),
        'content-location': '',
        'expires': faker.iso8601(),
        'final-status-code': 200,
        'updated': faker.iso8601(),
        'last-modified': faker.iso8601(),
        'content-encoding': 'gzip',
        'content-type': faker.mime_type()
    }
    if data:
        response.update(data)
    return json.dumps(response)
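
Unlike the generic variant in Example #5, Croquemort reports the probed address under `checked-url` and the final HTTP status under `final-status-code`; the rest of the payload keeps the same shape.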
Example #8
class NestedFactory(MongoEngineFactory):
    class Meta:
        model = NestedFake

    key = factory.LazyAttribute(lambda o: faker.word())
    value = factory.LazyAttribute(lambda o: faker.pyint())
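
Each instantiation re-evaluates the lazy attributes, so every object gets fresh fake values. A minimal usage sketch, assuming the standard factory_boy API:

    nested = NestedFactory()         # persisted through MongoEngine
    assert isinstance(nested.value, int)

    unsaved = NestedFactory.build()  # instantiated without saving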