def get_names(self):
    """
    List the publisher names of every configuration supported by the GSM.

    The publisher configurations are requested from Publisher.all() sorted
    ascending on the "publisher_name.exact" field, and the names are
    returned in the order the configurations come back.
    """
    name_ascending = [{"publisher_name.exact" : {"order" : "asc"}}]
    return [publisher['publisher_name'] for publisher in Publisher.all(sort=name_ascending)]
def license_detect(self, record):
    """
    Try to detect a license for the record by matching license statements
    against the content of the record's provider URL-s.

    Publisher configurations whose URL is a prefix of one of the record's
    (cleaned) provider URL-s are tried first, most specific (longest)
    configuration URL first. Only if no configuration matches any provider
    URL at all is the flat list of all known license statements tried.

    Success is detected by len(record.license) growing after a call to
    self.simple_extract(), i.e. simple_extract is relied upon to add the
    license to the record.

    :param record: the record being processed; record.provider_urls and
        record.license are read here
    :return: a (handler name, handler version) tuple:
        - (publisher name of the successful config, self.__version__)
        - (self._short_name, self.__version__) if the flat list succeeded
        - ('oag', self.__version__) if nothing was found
    """
    # always get the configs in the same order relative to each other
    all_configs = Publisher.all(sort=[{'publisher_name': 'asc'}])

    url_index = self._generate_publisher_config_index_by_url(all_configs)
    # longest url-s first, so the most specific publisher URL wins
    url_index = OrderedDict(sorted(url_index.items(), key=lambda x: len(x[0]), reverse=True))
    id_index = self._generate_publisher_config_index_by_id(all_configs)

    # get all the configs that match. Each config is recorded only once,
    # even if several of its URL-s (or several incoming URL-s) match it -
    # retrying the same config against the same content cannot change
    # the outcome.
    matching_configs = []
    seen_config_ids = set()
    work_on = self.clean_urls(record.provider_urls, strip_leading_www=True)
    for config_url, config_id in url_index.items():
        if config_id in seen_config_ids:
            continue
        if any(incoming_url.startswith(config_url) for incoming_url in work_on):
            seen_config_ids.add(config_id)
            matching_configs.append(id_index[config_id])

    # future:
    # use tries to prefix match them to the incoming URL
    # if the results of this could be ordered by URL length that
    # would be great, or stop at first match option

    # prefetch the content, we'll be reusing it a lot
    urls_contents = {}
    for incoming_url in record.provider_urls:
        unused_response, content, unused_content_length = util.http_stream_get(incoming_url)
        urls_contents[incoming_url] = content

    current_licenses_count = len(record.license)

    def found_new_license():
        # a license counts as found once the record holds more licenses
        # than it did before we started
        return len(record.license) > current_licenses_count

    successful_config = None
    for config in matching_configs:
        # order the license statements by version (reverse alphabetical,
        # blank or missing versions last), then longest statement first.
        # A missing version is treated as '' so the key never mixes None
        # with strings.
        config_licenses = sorted(
            config['licenses'],
            key=lambda lic: (lic.get('version') or '', len(lic['license_statement'])),
            reverse=True
        )

        # .get() rather than ['version']: the sort above already allows
        # for statements without a version, so building the statement
        # must not fail on them either
        lic_statements = [
            {lic['license_statement']: {'type': lic['license_type'], 'version': lic.get('version', '')}}
            for lic in config_licenses
        ]

        for incoming_url, content in urls_contents.items():
            self.simple_extract(lic_statements, record, incoming_url, first_match=True, content=content, handler=config.publisher_name)
            # if we find a license, stop trying the different URL-s
            if found_new_license():
                break

        # if we find a license, stop trying the configs and record which config found it
        if found_new_license():
            successful_config = config
            break

    # if no config exists which can match the license, then try the flat list
    # do not try the flat list of statements if a matching config has been found
    # this keeps these "virtual" plugins, i.e. the configs, consistent with how
    # the rest of the system operates
    flat_license_list_success = False
    if not matching_configs:
        # statements with a version first, versionless ones last
        # see http://stackoverflow.com/questions/9386501/sorting-in-python-and-empty-strings
        # NOTE(review): within each group this sorts SHORTEST statement
        # first, whereas the publisher configs above use longest first -
        # confirm whether that difference is intended.
        all_statements = sorted(
            LicenseStatement.all(),
            key=lambda lic: (lic.get('version', '') == '', len(lic['license_statement']))
        )

        lic_statements = [
            {lic['license_statement']: {'type': lic['license_type'], 'version': lic.get('version', '')}}
            for lic in all_statements
        ]

        for incoming_url, content in urls_contents.items():
            # no handler passed: simple_extract's default handler is used
            self.simple_extract(lic_statements, record, incoming_url, first_match=True, content=content)
            # if we find a license, stop trying the different URL-s
            if found_new_license():
                break

        # one of the flat license index statements did it
        flat_license_list_success = found_new_license()

    if successful_config:
        return successful_config.publisher_name, self.__version__
    elif flat_license_list_success:
        return self._short_name, self.__version__

    # in case everything fails, return 'oag' as the handler to
    # be consistent with the failure handler in the workflow module
    # so that way, all "completely failed" licenses will have 'oag'
    # on them, except that the GSM ones will have the GSM's current
    # version
    return 'oag', self.__version__