def process(self, item):
    '''Generate a random dataset from a fake identifier'''
    # Get or create a harvested dataset with this identifier.
    # Harvest metadata are already filled on creation.
    dataset = self.get_dataset(item.remote_id)
    # Here comes your implementation. You should:
    # - fetch the remote dataset (if necessary)
    # - validate the fetched payload
    # - map its content to the dataset fields
    # - store extra significant data in the `extras` attribute
    # - map resources data
    dataset.title = faker.sentence()
    dataset.description = faker.text()
    dataset.tags = list(set(faker.words(nb=faker.pyint())))

    # Resources
    for i in range(faker.pyint()):
        dataset.resources.append(Resource(
            title=faker.sentence(),
            description=faker.text(),
            url=faker.url(),
            filetype='remote',
            mime=faker.mime_type(category='text'),
            format=faker.file_extension(category='text'),
            filesize=faker.pyint()))
    return dataset
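# A hedged sketch of what a concrete `process` could look like in a real
# backend, following the steps listed in the comments above. The endpoint
# URL, the payload layout and the use of `requests` are illustrative
# assumptions, not part of the original code.
import requests

def process(self, item):
    '''Sketch: harvest one dataset from a hypothetical JSON API'''
    dataset = self.get_dataset(item.remote_id)
    # Fetch the remote dataset (hypothetical endpoint)
    url = 'https://example.com/api/datasets/{0}'.format(item.remote_id)
    response = requests.get(url)
    # Validate the fetched payload
    response.raise_for_status()
    payload = response.json()
    # Map its content to the dataset fields
    dataset.title = payload['title']
    dataset.description = payload.get('description', '')
    # Store extra significant data in the `extras` attribute
    dataset.extras['remote_url'] = payload.get('url')
    # Map resources data
    for res in payload.get('resources', []):
        dataset.resources.append(Resource(
            title=res['title'],
            url=res['url'],
            filetype='remote'))
    return dataset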
def feed(self, feed_title, title, content, url, published=None, summary=None,
         enclosure=None, media_thumbnail=None):
    feed = FeedGenerator()
    feed.title(feed_title)
    feed.description(faker.sentence())
    feed.link({'href': WP_FEED_URL})
    entry = feed.add_entry()
    entry.title(title)
    entry.link({'href': url})
    entry.author(name=faker.name())
    entry.content(content, type="cdata")
    if summary:
        entry.description(summary)
    if enclosure:
        entry.enclosure(url=enclosure['url'],
                        type=enclosure['type'],
                        length=str(faker.pyint()))
    if media_thumbnail:
        feed.load_extension('media')
        entry.media.thumbnail({'url': media_thumbnail})
    tz = pytz.timezone(faker.timezone())
    published = published or faker.date_time(tzinfo=tz)
    entry.published(published)
    entry.updated(faker.date_time_between(start_date=published, tzinfo=tz))
    return feed.rss_str().decode('utf8')
def initialize(self):
    '''Generate a list of fake identifiers to harvest'''
    # Here comes your implementation.
    # You should iterate over a remote endpoint to list identifiers
    # to harvest and optionally store extra data
    for _ in range(faker.pyint()):
        self.add_item(faker.uuid4())  # Accepts kwargs to store data
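# A hedged sketch of a real-world `initialize`: paging through a remote
# listing endpoint and storing extra data alongside each item. The URL,
# the paging scheme and the payload keys are illustrative assumptions.
import requests

def initialize(self):
    '''Sketch: list remote identifiers to harvest from a hypothetical API'''
    page = 1
    while True:
        response = requests.get('https://example.com/api/datasets',
                                params={'page': page})
        response.raise_for_status()
        payload = response.json()
        for item in payload['results']:
            # Extra kwargs are stored with the item and available in `process`
            self.add_item(item['id'], title=item.get('title'))
        if not payload.get('next_page'):
            break
        page += 1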
def feed(self, feed_title, title, content, url, published=None, summary=None,
         enclosure=None, media_thumbnail=None):
    feed = AtomFeed(feed_title, feed_url=WP_FEED_URL)
    tz = pytz.timezone(faker.timezone())
    published = published or faker.date_time(tzinfo=tz)
    kwargs = {
        'content_type': 'html',
        'author': faker.name(),
        'url': url,
        'updated': faker.date_time_between(start_date=published, tzinfo=tz),
        'published': published
    }
    if summary:
        kwargs['summary'] = summary
    if enclosure:
        kwargs['links'] = [{
            'type': enclosure['type'],
            'href': enclosure['url'],
            'rel': 'enclosure',
            'length': faker.pyint(),
        }]
    feed.add(title, content, **kwargs)
    out = feed.to_string()
    if media_thumbnail:
        el = '<media:thumbnail url="{0}" />'.format(media_thumbnail)
        out = out.replace(
            '<feed',
            '<feed xmlns:media="http://search.yahoo.com/mrss/"')
        out = out.replace('</entry>', '{0}</entry>'.format(el))
    return out
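# Usage sketch for the helper above: parse the generated XML back to check
# its structure. Using `feedparser` here is an assumption — any feed parser
# would do — and the titles and URL are made up for the example.
import feedparser

def test_feed_roundtrip(self):
    xml = self.feed('My blog', 'Post title', '<p>Body</p>',
                    'https://example.com/post')
    parsed = feedparser.parse(xml)
    assert parsed.feed.title == 'My blog'
    assert parsed.entries[0].title == 'Post title'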
def metadata_factory(url, data=None):
    response = {
        'etag': '',
        'url': url,
        'content-length': faker.pyint(),
        'content-disposition': '',
        'content-md5': faker.md5(),
        'content-location': '',
        'expires': faker.iso8601(),
        'status': 200,
        'updated': faker.iso8601(),
        'last-modified': faker.iso8601(),
        'content-encoding': 'gzip',
        'content-type': faker.mime_type()
    }
    if data:
        response.update(data)
    return json.dumps(response)
def test_all_resource_fields(self):
    node = BNode()
    g = Graph()

    title = faker.sentence()
    url = faker.uri()
    description = faker.paragraph()
    filesize = faker.pyint()
    issued = faker.date_time_between(start_date='-60d', end_date='-30d')
    modified = faker.past_datetime(start_date='-30d')
    mime = faker.mime_type()
    sha1 = faker.sha1()

    g.add((node, RDF.type, DCAT.Distribution))
    g.add((node, DCT.title, Literal(title)))
    g.add((node, DCT.description, Literal(description)))
    g.add((node, DCAT.downloadURL, Literal(url)))
    g.add((node, DCT.issued, Literal(issued)))
    g.add((node, DCT.modified, Literal(modified)))
    g.add((node, DCAT.bytesSize, Literal(filesize)))
    g.add((node, DCAT.mediaType, Literal(mime)))
    g.add((node, DCT.term('format'), Literal('CSV')))

    checksum = BNode()
    g.add((node, SPDX.checksum, checksum))
    g.add((checksum, RDF.type, SPDX.Checksum))
    g.add((checksum, SPDX.algorithm, SPDX.checksumAlgorithm_sha1))
    g.add((checksum, SPDX.checksumValue, Literal(sha1)))

    resource = resource_from_rdf(g)
    resource.validate()

    assert isinstance(resource, Resource)
    assert resource.title == title
    assert resource.url == url
    assert resource.description == description
    assert resource.filesize == filesize
    assert resource.mime == mime
    assert isinstance(resource.checksum, Checksum)
    assert resource.checksum.type == 'sha1'
    assert resource.checksum.value == sha1
    assert resource.published == issued
    assert resource.modified == modified
    assert resource.format == 'csv'
def metadata_factory(url, data=None): """Base for a mocked Croquemort HTTP response""" response = { 'etag': '', 'checked-url': url, 'content-length': faker.pyint(), 'content-disposition': '', 'content-md5': faker.md5(), 'content-location': '', 'expires': faker.iso8601(), 'final-status-code': 200, 'updated': faker.iso8601(), 'last-modified': faker.iso8601(), 'content-encoding': 'gzip', 'content-type': faker.mime_type() } if data: response.update(data) return json.dumps(response)
class NestedFactory(MongoEngineFactory):
    class Meta:
        model = NestedFake

    key = factory.LazyAttribute(lambda o: faker.word())
    value = factory.LazyAttribute(lambda o: faker.pyint())
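# Usage sketch: factory_boy factories are called directly to create and save
# a document, or via `build()` for an unsaved instance; keyword arguments
# override the lazy attributes. This assumes a standard factory_boy +
# mongoengine setup, with a database available for the saving variants.
nested = NestedFactory()                 # create and save a NestedFake
unsaved = NestedFactory.build()          # build without touching the database
custom = NestedFactory(key='fixed-key')  # override the generated `key`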