def test(): global Configurator Configurator = fake_config ws = DAMWorkspace.objects.get(pk=1) user = User.objects.get(username='******') t = TriggerEvent.objects.get_or_create(name="test")[0] print 't.pk=', t.pk try: print 'attempting to reuse pipeline' pipeline = Pipeline.objects.get(name='test4', workspace=ws) except: print 'creating new pipeline' pipeline = Pipeline.objects.create(name="test4", description='', params=dumps(test_pipe2), workspace=ws) print 'ok' pipeline.triggers.add(t) print 'ok2' pipeline.save() print 'done' process = Process.objects.create(pipeline=pipeline, workspace=ws, launched_by=user) for n in xrange(15): print 'adding target %d' % n process.add_params(item='item%d' % n) try: batch = Batch(process) d = batch.run() d.addBoth(end_test, process) except Exception, e: log.error("Fatal initialization error: %s" % str(e)) reactor.stop()
def _iterate(self): """ Run the actions listed in schedule on the items returned by _new_batch """ #log.debug('_iterate: oustanding=%s' % self.outstanding) #d if self.gameover: log.debug('_iterate: gameover') return action, task = self._get_action() if action: item, schedule = task['item'], task['schedule'] method, params = self.scripts[action] try: item_params = loads(item.params) # tmp bug fixing starts here for k in params.keys(): if params[k] == '' and (k in item_params[action]): params[k] = item_params[action][k] # tmp bug fixing ends here params.update(item_params.get('*', {})) x = re.compile('^[a-z_]+') # cut out digits from action name params.update(item_params.get(x.match(action).group(), {})) self.outstanding += 1 #params = {u'source_variant_name': u'original'} d = method(self.process.workspace, item.target_id, **params) except Exception, e: log.error('ERROR in %s: %s %s' % (str(method), type(e), str(e))) self._handle_err(str(e), item, schedule, action, params) else: d.addCallbacks(self._handle_ok, self._handle_err, callbackArgs=[item, schedule, action, params], errbackArgs=[item, schedule, action, params])
def _iterate(self): """ Run the actions listed in schedule on the items returned by _new_batch """ #log.debug('_iterate: oustanding=%s' % self.outstanding) #d if self.gameover: log.debug('_iterate: gameover') return action, task = self._get_action() if action: item, schedule = task['item'], task['schedule'] method, params = self.scripts[action] try: item_params = loads(item.params) # tmp bug fixing starts here for k in params.keys(): if params[k] == '' and (k in item_params[action]): params[k] = item_params[action][k] # tmp bug fixing ends here params.update(item_params.get('*', {})) x = re.compile('^[a-z_]+' ) # cut out digits from action name params.update(item_params.get(x.match(action).group(), {})) self.outstanding += 1 #params = {u'source_variant_name': u'original'} d = method(self.process.workspace, item.target_id, **params) except Exception, e: log.error('ERROR in %s: %s %s' % (str(method), type(e), str(e))) self._handle_err(str(e), item, schedule, action, params) else: d.addCallbacks(self._handle_ok, self._handle_err, callbackArgs=[item, schedule, action, params], errbackArgs=[item, schedule, action, params])
def test(): global Configurator Configurator = fake_config ws = DAMWorkspace.objects.get(pk=1) user = User.objects.get(username='******') t = TriggerEvent.objects.get_or_create(name="test")[0] print 't.pk=', t.pk try: print 'attempting to reuse pipeline' pipeline = Pipeline.objects.get(name='test4', workspace=ws) except: print 'creating new pipeline' pipeline = Pipeline.objects.create(name="test4", description='', params=dumps(test_pipe2), workspace=ws) print 'ok' pipeline.triggers.add(t) print 'ok2' pipeline.save() print 'done' process = Process.objects.create(pipeline=pipeline, workspace=ws, launched_by=user) for n in xrange(15): print 'adding target %d' % n process.add_params(item = 'item%d' % n) try: batch = Batch(process) d = batch.run() d.addBoth(end_test, process) except Exception, e: log.error("Fatal initialization error: %s" % str(e)) reactor.stop()
def handle_result(self, result, *args):
    """Parse the finished command's output and fire ``self.deferred``.

    ``result['data']`` and any extra positional ``args`` are handed to
    ``self.parse_stdout``; the parsed value is delivered through
    ``self.deferred.callback``.  Any exception raised on the way is logged
    and delivered through ``self.deferred.errback`` instead.
    """
    #log.debug('= handle_result %s' % str(result)[:128])
    try:
        parsed = self.parse_stdout(result['data'], *args)
        self.deferred.callback(parsed)
    except Exception as err:
        details = (self.__class__.__name__, type(err), str(err))
        log.error('Error in %s: %s %s' % details)
        self.deferred.errback(err)
def execute(self, **params):
    """Prepare the command line of this action from the given parameters.

    Calls ``self.get_cmdline(**params)`` and splits the resulting
    ``self.cmdline`` into an argument list.  If either step raises, the error
    is logged and passed to ``self.deferred.errback``.
    """
    # get basic data (avoid creating stuff in DB)
    try:
        self.get_cmdline(**params)
        args = splitstring(self.cmdline)
    except Exception, e:
        log.error('Error in %s: %s %s' % (self.__class__.__name__, type(e), str(e)))
        self.deferred.errback(e)
        # NOTE(review): `args` is not used and nothing follows the errback in
        # the visible code -- presumably the rest of the method lies outside
        # this excerpt; confirm before relying on this block alone.
def _save_features(self, features, extractor_type):
    """Save results of basic extractions.

    The visible part records the format of ``self.source`` through
    ``save_type``; a failure there is logged and otherwise ignored.
    """
    # NOTE(review): metadata_list / delete_list are not used in the visible code
    metadata_list, delete_list = [], []
    #log.debug('ExtractBasicFeatures._save_features: %s' % features)
    ctype = ContentType.objects.get_for_model(self.source)
    try:
        save_type(ctype, self.source)
    except Exception, e:
        # best effort: a failure here is only logged
        log.error("Failed to save component format as DC:Format: %s" % (str(e)))
def _handle_err(self, result, item, schedule, action, params):
    """Record the failure of ``action`` on ``item`` and keep the batch moving.

    Logs the error, decrements ``self.outstanding``, marks the action as
    failed in ``schedule`` and updates the item statistics for the failed
    action and for every action that ``schedule.fail`` reports as cancelled.
    When fewer than ``self.max_outstanding`` actions are outstanding, another
    ``_iterate`` call is scheduled on the reactor.  ``params`` is accepted
    but not used here.
    """
    log.error('_handle_err action %s on target_id=%s: %s' % (action, item.target_id, str(result)))
    self.outstanding -= 1
    cancelled = schedule.fail(action)
    # the trailing 0, 1, 0 / 0, 0, 1 are presumably ok/failed/cancelled
    # counters -- TODO confirm against _update_item_stats
    self._update_item_stats(item, action, str(result), 0, 1, 0)
    for a in cancelled:
        self._update_item_stats(item, a, "cancelled on failed %s" % action, 0, 0, 1)
    if self.outstanding < self.max_outstanding:
        #log.debug('_handle_err: rescheduling') #d
        reactor.callLater(0, self._iterate)
    # NOTE(review): assumed to run unconditionally (so the updated stats are
    # always persisted); verify the nesting against the original layout.
    item.save()
def _cb_xmp_ok(self, features):
    """Callback receiving the extracted XMP ``features``.

    The visible part only gathers the context needed for saving metadata:
    the content types of ``self.component`` and ``self.item``, the regex
    that splits XMP property paths, and the default metadata language of
    the user who uploaded the item.  Any failure is logged and reported
    through ``self.deferred.errback``.
    """
    try:
        ctype_component = ContentType.objects.get_for_model(self.component)
        ctype = ContentType.objects.get_for_model(self.item)
        # matches "prefix:property" with an optional "[n]" array index
        xpath = re.compile(r"(?P<prefix>\w+):(?P<property>\w+)(?P<array_index>\[\d+\]){,1}")
        user = self.item.uploaded_by()
        metadata_default_language = get_metadata_default_language(user)
    except Exception, e:
        log.error("Error in %s: %s %s" % (self.__class__.__name__, type(e), str(e)))
        self.deferred.errback(e)
        return
    # NOTE(review): `features` and the locals above are not used in the
    # visible code -- presumably consumed further down, outside this excerpt.
def handle_result(self, result, component):
    """Store the produced file name on ``component`` and fire the deferred.

    ``result`` is the path returned by the remote command.  When it is
    non-empty, its final path element becomes ``component.uri`` and the
    component is saved; otherwise an error is logged.  In both cases
    ``self.deferred`` is called back with ``result`` unchanged.
    """
    log.debug('handle_result %s' % str(result))
    log.debug("[save_component] component %s" % component.pk)
    if not result:
        log.error('Empty result passed to save_and_extract_features')
    else:
        # keep only the file name, dropping the directory part
        component.uri = os.path.split(result)[1]
        component.save()
    self.deferred.callback(result)
def handle_result(self, result, component):
    """Store the produced file name on ``component`` and fire the deferred.

    ``result`` is the path returned by the remote command.  When it is
    non-empty, its final path element becomes ``component.uri`` and the
    component is saved; otherwise an error is logged.  In both cases
    ``self.deferred`` is called back with ``result`` unchanged.
    """
    log.debug("handle_result %s" % str(result))
    log.debug("[save_component] component %s" % component.pk)
    if not result:
        log.error("Empty result passed to save_and_extract_features")
    else:
        # keep only the file name, dropping the directory part
        component.uri = os.path.split(result)[1]
        component.save()
    self.deferred.callback(result)
def execute(self, output_variant_name, output_type, **params):
    """Prepare the output component and the command line of this action.

    Builds the command line through ``self.get_cmdline``, looks up the
    Variant named ``output_variant_name``, creates the matching variant of
    ``self.item`` in ``self.workspace`` (kept as ``self.out_comp``, with
    ``self.source`` as its source) and splits ``self.cmdline`` into an
    argument list.  Any exception is logged and passed to
    ``self.deferred.errback``.
    """
    # get basic data (avoid creating stuff in DB)
    try:
        self.get_cmdline(output_variant_name, output_type, **params)
        output_variant = Variant.objects.get(name = output_variant_name)
        # NOTE(review): uses self.out_type, not the output_type argument --
        # presumably set by get_cmdline; confirm
        self.out_comp = self.item.create_variant(output_variant, self.workspace, self.out_type)
        self.out_comp.source = self.source
        args = splitstring(self.cmdline)
    except Exception, e:
        log.error('Error in %s: %s %s' % (self.__class__.__name__, type(e), str(e)))
        self.deferred.errback(e)
        # NOTE(review): `args` is not used and nothing follows the errback in
        # the visible code -- presumably the rest of the method lies outside
        # this excerpt; confirm before relying on this block alone.
def execute(self, output_variant_name, output_type, **params):
    """Prepare the output component and the command line of this action.

    Builds the command line through ``self.get_cmdline``, looks up the
    Variant named ``output_variant_name``, creates the matching variant of
    ``self.item`` in ``self.workspace`` (kept as ``self.out_comp``, with
    ``self.source`` as its source) and splits ``self.cmdline`` into an
    argument list.  Any exception is logged and passed to
    ``self.deferred.errback``.
    """
    # get basic data (avoid creating stuff in DB)
    try:
        self.get_cmdline(output_variant_name, output_type, **params)
        output_variant = Variant.objects.get(name=output_variant_name)
        # NOTE(review): uses self.out_type, not the output_type argument --
        # presumably set by get_cmdline; confirm
        self.out_comp = self.item.create_variant(output_variant, self.workspace, self.out_type)
        self.out_comp.source = self.source
        args = splitstring(self.cmdline)
    except Exception, e:
        log.error('Error in %s: %s %s' % (self.__class__.__name__, type(e), str(e)))
        self.deferred.errback(e)
        # NOTE(review): `args` is not used and nothing follows the errback in
        # the visible code -- presumably the rest of the method lies outside
        # this excerpt; confirm before relying on this block alone.
def _cb_xmp_ok(self, features):
    """Callback receiving the extracted XMP ``features``.

    The visible part only gathers the context needed for saving metadata:
    the content types of ``self.component`` and ``self.item``, the regex
    that splits XMP property paths, and the default metadata language of
    the user who uploaded the item.  Any failure is logged and reported
    through ``self.deferred.errback``.
    """
    try:
        ctype_component = ContentType.objects.get_for_model(self.component)
        ctype = ContentType.objects.get_for_model(self.item)
        # matches "prefix:property" with an optional "[n]" array index
        xpath = re.compile(
            r'(?P<prefix>\w+):(?P<property>\w+)(?P<array_index>\[\d+\]){,1}'
        )
        user = self.item.uploaded_by()
        metadata_default_language = get_metadata_default_language(user)
    except Exception, e:
        log.error('Error in %s: %s %s' % (self.__class__.__name__, type(e), str(e)))
        self.deferred.errback(e)
        return
    # NOTE(review): `features` and the locals above are not used in the
    # visible code -- presumably consumed further down, outside this excerpt.
def _read_xmp_features(self, features): xpath = re.compile(r"(?P<prefix>\w+):(?P<property>\w+)(?P<array_index>\[\d+\]){,1}") ctype = ContentType.objects.get_for_model(self.item) ctype_component = ContentType.objects.get_for_model(self.component) user = self.item.uploaded_by() metadata_default_language = get_metadata_default_language(user) metadata_dict = {} metadata_list = [] delete_list = [] log.debug("READ XMP FEATURES") if not isinstance(features, dict): item.state = 1 item.save() return [], [] for feature in features.keys(): try: namespace_obj = XMPNamespace.objects.get(uri=feature) except Exception, e: log.error("####### Error: unknown namespace %s: %s" % (feature, str(e))) continue metadata_dict[namespace_obj] = {} namespace_properties = MetadataProperty.objects.filter(namespace=namespace_obj) for property_values in features[feature]: property_xpath = property_values[0] property_value = property_values[1] property_options = property_values[2] xpath_splitted = xpath.findall(property_xpath) metadata_property = xpath_splitted[0][1].strip() metadata_index = xpath_splitted[0][2].strip() found_property = namespace_properties.filter(field_name__iexact=metadata_property) if found_property.count() > 0 and len(property_value.strip()) > 0: if found_property[0].is_array == "not_array": delete_list.append(found_property[0]) if property_options["IS_QUALIFIER"] and xpath_splitted[-1][1] == "lang": # log.debug('############# setting throw away IS_QUALIFIER option') find_xpath = property_xpath.replace("/?xml:lang", "") if metadata_dict[namespace_obj].has_key(find_xpath): if property_value == "x-default": property_value = metadata_default_language metadata_dict[namespace_obj][find_xpath].language = property_value else: log.debug("metadata property not found: " + find_xpath) pass # log.debug('###@@@@ %s: (%s)' % (find_xpath, property_value)) else: if found_property[0].is_variant: x = MetadataValue( schema=found_property[0], object_id=self.component.pk, 
content_type=ctype_component, value=property_value, xpath=property_xpath, ) else: x = MetadataValue( schema=found_property[0], object_id=self.item.pk, content_type=ctype, value=property_value, xpath=property_xpath, ) metadata_dict[namespace_obj][property_xpath] = x metadata_list.append(x)
def _cb_xmp_ok(self, features): try: ctype_component = ContentType.objects.get_for_model(self.component) ctype = ContentType.objects.get_for_model(self.item) xpath = re.compile(r"(?P<prefix>\w+):(?P<property>\w+)(?P<array_index>\[\d+\]){,1}") user = self.item.uploaded_by() metadata_default_language = get_metadata_default_language(user) except Exception, e: log.error("Error in %s: %s %s" % (self.__class__.__name__, type(e), str(e))) self.deferred.errback(e) return try: save_type(ctype, self.component) except Exception, e: log.error("Failed to save component format as DC:Format: %s" % (str(e))) self.deferred.errback(e) return try: xmp_metadata_list, xmp_delete_list = self._read_xmp_features(features) MetadataValue.objects.filter( schema__in=xmp_delete_list, object_id=self.component.pk, content_type=ctype_component ).delete() latitude = None longitude = None for x in xmp_metadata_list: if x.xpath == "exif:GPSLatitude": latitude = x.value elif x.xpath == "exif:GPSLongitude":
def _read_xmp_features(self, features): xpath = re.compile( r'(?P<prefix>\w+):(?P<property>\w+)(?P<array_index>\[\d+\]){,1}') ctype = ContentType.objects.get_for_model(self.item) ctype_component = ContentType.objects.get_for_model(self.component) user = self.item.uploaded_by() metadata_default_language = get_metadata_default_language(user) metadata_dict = {} metadata_list = [] delete_list = [] log.debug('READ XMP FEATURES') if not isinstance(features, dict): item.state = 1 item.save() return [], [] for feature in features.keys(): try: namespace_obj = XMPNamespace.objects.get(uri=feature) except Exception, e: log.error('####### Error: unknown namespace %s: %s' % (feature, str(e))) continue metadata_dict[namespace_obj] = {} namespace_properties = MetadataProperty.objects.filter( namespace=namespace_obj) for property_values in features[feature]: property_xpath = property_values[0] property_value = property_values[1] property_options = property_values[2] xpath_splitted = xpath.findall(property_xpath) metadata_property = xpath_splitted[0][1].strip() metadata_index = xpath_splitted[0][2].strip() found_property = namespace_properties.filter( field_name__iexact=metadata_property) if found_property.count() > 0 and len( property_value.strip()) > 0: if found_property[0].is_array == 'not_array': delete_list.append(found_property[0]) if property_options['IS_QUALIFIER'] and xpath_splitted[-1][ 1] == 'lang': #log.debug('############# setting throw away IS_QUALIFIER option') find_xpath = property_xpath.replace('/?xml:lang', '') if metadata_dict[namespace_obj].has_key(find_xpath): if property_value == 'x-default': property_value = metadata_default_language metadata_dict[namespace_obj][ find_xpath].language = property_value else: log.debug('metadata property not found: ' + find_xpath) pass #log.debug('###@@@@ %s: (%s)' % (find_xpath, property_value)) else: if found_property[0].is_variant: x = MetadataValue(schema=found_property[0], object_id=self.component.pk, 
content_type=ctype_component, value=property_value, xpath=property_xpath) else: x = MetadataValue(schema=found_property[0], object_id=self.item.pk, content_type=ctype, value=property_value, xpath=property_xpath) metadata_dict[namespace_obj][property_xpath] = x metadata_list.append(x)
ctype = ContentType.objects.get_for_model(self.item) xpath = re.compile( r'(?P<prefix>\w+):(?P<property>\w+)(?P<array_index>\[\d+\]){,1}' ) user = self.item.uploaded_by() metadata_default_language = get_metadata_default_language(user) except Exception, e: log.error('Error in %s: %s %s' % (self.__class__.__name__, type(e), str(e))) self.deferred.errback(e) return try: save_type(ctype, self.component) except Exception, e: log.error("Failed to save component format as DC:Format: %s" % (str(e))) self.deferred.errback(e) return try: xmp_metadata_list, xmp_delete_list = self._read_xmp_features( features) MetadataValue.objects.filter( schema__in=xmp_delete_list, object_id=self.component.pk, content_type=ctype_component).delete() latitude = None longitude = None for x in xmp_metadata_list: if x.xpath == 'exif:GPSLatitude':
class ExtractBasic(Analyzer): md_server = "LowLoad" exe_list = { 'image_basic': 'identify', 'media_basic': 'mediainfo', 'doc_basic': 'pdfinfo' } cmd_image_basic = '"file://%(infile)s"' cmd_media_basic = '-f "--Output=XML" "file://%(infile)s"' cmd_doc_basic = '"file://%(infile)s"' regex = r'(?P<filename>[^ \[]*)(?P<frame>\[\d+\]){0,1}\s(?P<type>[\w._-]+)\s' \ r'(?P<width>\d+)x(?P<height>\d+)\s' \ r'(?P<wcrop>\d+)x(?P<hcrop>\d+)[\+-](?P<wcropoff>\d+)[\+-](?P<hcropoff>\d+)\s' \ r'(?P<depth>\d+)-(?P<depth_unit>\w+)\s' RE = None def get_cmdline(self): extractor_type = self.source.get_extractor() log.debug('######333\n########## ExtractBasic: using %s' % extractor_type) self.remote_exe = self.exe_list[extractor_type] self.cmdline = getattr(self, 'cmd_%s' % extractor_type) self.parser = getattr(self, 'parse_%s' % extractor_type) self.cmdline = self.cmdline % {'infile': self.source.uri} def parse_stdout(self, result): return self.parser(result, self.source.uri) def parse_image_basic(self, result, filename): features = {} if not self.RE: self.RE = re.compile(self.regex) fullpath = str(self._fc.abspath(filename)) #log.debug('image_basic_extractor: entering') m = self.RE.match(result) if not m: raise Exception('Unable to parse script output') d = m.groupdict() size = os.stat(fullpath).st_size features['size'] = size features['width'] = d['width'] features['height'] = d['height'] features['codec'] = d['type'].lower() features['has_frame'] = d['frame'] is not None features['has_sound'] = False features['depth'] = d['depth'] # depth in bits features['depth_unit'] = d['depth_unit'] # depth in bits self._save_features(features, 'image_basic') return 'ok' def parse_media_basic(self, result, filename): log.debug('parse_media_basic: entering "%s"' % type(result)) fullpath = str(self._fc.abspath(filename)) parser = Parser() parseString(result.encode('utf-8'), parser) features = parser.parsed self._save_features(features, 'media_basic') return 'ok' def parse_doc_basic(self, result, 
filename): log.debug('parse_doc_basic: entering') features = {} lines = result.split('\n') for line in lines: sep = line.find(':') if sep < 0: continue key = line[:sep].strip() value = line[sep + 1:].strip() features[key] = value features['size'] = long(features.get('File size', '-1').split()[0]) features['pages'] = features['Pages'] self._save_features(features, 'doc_basic') return 'ok' def _save_features(self, features, extractor_type): "save results of basic extractions" metadata_list, delete_list = [], [] #log.debug('ExtractBasicFeatures._save_features: %s' % features) ctype = ContentType.objects.get_for_model(self.source) try: save_type(ctype, self.source) except Exception, e: log.error("Failed to save component format as DC:Format: %s" % (str(e))) try: self.source._features = dumps(features) self.source.save() print 'saved' except Exception, e: log.error("Failed to save features in component object: %s" % str(e))