def detect_by_url(self, row):
    """Match *row* against known objects by the hash of its URL.

    Every dataset whose embedded resource carries the hash gets an entry
    appended to ``self.resources``; every matching community resource gets
    an entry in ``self.community_resources``. When nothing matches, the
    miss is logged as an error (the row is otherwise dropped).
    """
    url = row['url']
    hashed_url = hash_url(url)
    matched = False
    for dataset in Dataset.objects.filter(resources__urlhash=hashed_url):
        self.resources.append({
            'dataset': dataset,
            # Pick the embedded resource that actually carries this hash.
            'resource': get_by(dataset.resources, 'urlhash', hashed_url),
            'data': row,
        })
        matched = True
    for community_resource in CommunityResource.objects.filter(urlhash=hashed_url):
        self.community_resources.append({
            'resource': community_resource,
            'data': row,
        })
        matched = True
    if not matched:
        log.error('No resource found by url',
                  extra={'hashed_url': hashed_url, 'url': url})
def detect_download_objects(self):
    """Route every row with a URL to the proper detection strategy.

    URLs matching ``LATEST_URL_REGEX`` embed a resource id (group 1) and
    are resolved through ``detect_by_resource_id``; anything else is
    resolved by its URL hash. Rows lacking a ``'url'`` key are skipped.
    """
    for row in self.rows:
        if 'url' not in row:
            continue
        match = re.match(LATEST_URL_REGEX, row['url'])
        if match and match.group(1):
            self.detect_by_resource_id(match.group(1), row)
        else:
            self.detect_by_hashed_url(hash_url(row['url']), row)
def handle_downloads(self, row, day):
    """Count a download row, then recurse into its 'subtable' sub-rows.

    The row URL is hashed and resolved to either a dataset resource or a
    community resource; the matching object's download count for *day* is
    updated and its view metric recomputed. Any failure while counting is
    logged and swallowed so one bad row never aborts the whole run.
    """
    if 'url' in row:
        try:
            hashed_url = hash_url(row['url'])
            data = (
                Dataset.objects(resources__urlhash=hashed_url).first() or
                CommunityResource.objects(urlhash=hashed_url).first()
            )
            if isinstance(data, Dataset):
                dataset = data
                resource = get_by(dataset.resources, 'urlhash', hashed_url)
                log.debug('Found resource download: %s', resource.url)
                self.count(resource, day, row)
                metric = ResourceViews(resource)
                metric.compute()
                # Use the MongoDB positional operator ($) so only the
                # matched embedded resource's metric is updated in place.
                cmd = 'set__resources__S__metrics__{0}'.format(metric.name)
                qs = Dataset.objects(id=dataset.id, resources__id=resource.id)
                qs.update(**{cmd: metric.value})
                if dataset.organization:
                    OrgResourcesDownloads(dataset.organization).compute()
            elif isinstance(data, CommunityResource):
                resource = data
                log.debug('Found community resource download: %s',
                          resource.url)
                self.count(resource, day, row)
                metric = CommunityResourceViews(resource)
                metric.compute()
                resource.metrics[metric.name] = metric.value
                resource.save()
        # Was a bare ``except:``, which also swallowed SystemExit and
        # KeyboardInterrupt; ``Exception`` keeps the best-effort intent
        # without hiding interpreter-level signals.
        except Exception:
            log.exception('Unable to count download for %s', row['url'])
    if 'subtable' in row:
        for subrow in row['subtable']:
            self.handle_downloads(subrow, day)
def url_exists(cls, url):
    """Return True when at least one document carries the hash of *url*."""
    return bool(cls.objects(urlhash=hash_url(url)).count())
def clean(self):
    '''Auto populate urlhash from url'''
    # NOTE: the string above was previously placed AFTER the super() call,
    # where it was a dead no-op expression statement instead of the
    # method's docstring. Moved first so it documents the method (and
    # matches the sibling clean() implementations in this codebase).
    super(Reuse, self).clean()
    # Recompute the hash when it is missing or the url field changed.
    if not self.urlhash or 'url' in self._get_changed_fields():
        self.urlhash = hash_url(self.url)
def clean(self):
    """Keep ``urlhash`` in sync with ``url`` during validation."""
    super(ResourceMixin, self).clean()
    # Rehash when the hash is missing, or the url field was modified
    # (short-circuit keeps the changed-fields lookup lazy).
    needs_rehash = not self.urlhash or 'url' in self._get_changed_fields()
    if needs_rehash:
        self.urlhash = hash_url(self.url)
def clean(self):
    """Auto populate urlhash from url before delegating validation."""
    # Rehash when the hash is missing, or the url field was modified
    # (short-circuit keeps the changed-fields lookup lazy).
    needs_rehash = not self.urlhash or 'url' in self._get_changed_fields()
    if needs_rehash:
        self.urlhash = hash_url(self.url)
    super(Reuse, self).clean()