def test_strip_url():
    """Test utils.strip_url function.

    Procedure:
        - Create input and expected output for the following cases.
            * Normal URL.
            * Non URL.
            * URL without the protocol prefix.
            * URL that points to a subroute.
        - Call utils.strip_url for all the inputs.

    Verification:
        - Check that every output generated by the function is equal to the
          corresponding expected output.
    """
    ip1 = 'http://abc.com/entity'
    op1 = 'entity'

    ip2 = 'entity'
    op2 = 'entity'

    ip3 = 'abc.com/entity'
    op3 = 'entity'

    ip4 = 'http://abc.com/sub/entity'
    op4 = 'entity'

    assert utils.strip_url(ip1) == op1, 'Test a normal URL.'
    assert utils.strip_url(ip2) == op2, 'Test a non URL.'
    assert utils.strip_url(ip3) == op3, 'Test a URL without protocol.'
    assert utils.strip_url(ip4) == op4, 'Test a nested URL.'
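
# For orientation only: a minimal sketch of a strip_url that would satisfy the
# four cases above, assuming the function keeps the last path segment. This is
# a hypothetical reconstruction, not necessarily the real utils.strip_url.
def strip_url_sketch(url: str) -> str:
    # Keep whatever follows the final '/', or the whole string if there is none.
    return url.rstrip('/').rsplit('/', 1)[-1]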
def clean_backlog(self):
    """Remove URLs that are already covered by the entries or by the rejected
    list from the backlog file.
    """
    if not self.entries:
        print('entries not yet loaded')
        return

    # get urls from entries
    included_urls = osg.all_urls(self.entries)
    included_urls = list(included_urls.keys())  # only need the URLs here

    # get urls from rejected file
    text = utils.read_text(c.rejected_file)
    regex = re.compile(r"\((http.*?)\)", re.MULTILINE)
    matches = regex.findall(text)
    rejected_urls = []
    for match in matches:
        urls = match.split(',')
        urls = [x.strip() for x in urls]
        rejected_urls.extend(urls)
    included_urls.extend(rejected_urls)

    # for entries that only have a web archive version, also derive the original URL
    more_urls = []
    for url in included_urls:
        if url.startswith('https://web.archive.org/web'):
            # sometimes the http is missing in archive links (would need proper parsing)
            url = url[url.index('http', 5):]
            more_urls.append(url)
    included_urls.extend(more_urls)

    # now we strip the urls
    stripped_urls = [utils.strip_url(x) for x in included_urls]
    stripped_urls = set(stripped_urls)  # fast membership tests, also removes duplicates

    # read backlog and get urls from there
    text = utils.read_text(c.backlog_file)
    text = text.split('\n')

    # remove those that are already included or rejected
    text = [x for x in text if utils.strip_url(x) not in stripped_urls]

    # remove duplicates and sort
    text = sorted(list(set(text)), key=str.casefold)
    print('backlog contains {} items'.format(len(text)))

    # join and save again
    text = '\n'.join(text)
    utils.write_text(c.backlog_file, text)
    print('backlog cleaned')
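
# The rejected-file parsing above assumes lines such as
# 'Name (http://a.com, http://b.com)'. A quick illustration with a
# hypothetical sample line:
sample = 'Some Game (http://example.com/game, http://mirror.example.org/game)'
print(re.compile(r"\((http.*?)\)", re.MULTILINE).findall(sample))
# -> ['http://example.com/game, http://mirror.example.org/game']
# Each capture is then split on ',' and stripped into individual URLs.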
def clean_backlog(stripped_game_urls):
    # read backlog and split
    file = os.path.join(c.root_path, 'tools', 'backlog.txt')
    text = utils.read_text(file)
    text = text.split('\n')

    # remove those that are in stripped_game_urls
    text = [x for x in text if utils.strip_url(x) not in stripped_game_urls]

    # remove duplicates and sort
    text = sorted(list(set(text)), key=str.casefold)
    print('backlog contains {} items'.format(len(text)))

    # join and save again
    text = '\n'.join(text)
    utils.write_text(file, text)
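
# Usage sketch with a hypothetical URL list; passing a set (as the class-based
# variant above builds with set(...)) keeps each membership test O(1):
game_urls = ['http://example.com/game']  # hypothetical input
stripped_game_urls = {utils.strip_url(u) for u in game_urls}
clean_backlog(stripped_game_urls)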
def __enum_to_proto(self, class_to_prop: Dict[str, Set[PropertyToParent]],
                    enumerations: Set[str]) -> str:
    """Call EnumDescriptor.to_proto() and get proto code for every schema enumeration.

    Args:
        class_to_prop (dict[str, set[PropertyToParent]]): Dictionary containing
            the set of properties for every class.
        enumerations (set[str]): Set containing the enumerations in the schema.

    Returns:
        str: The proto code for all the schema enumerations, as a string.
    """
    proto_enum = '// Definition of enumerations begins here.\n\n'

    for x in sorted(enumerations):
        enum_values = set()
        for ev, _, _ in self.graph.triples(
                (None, constants.schema_constants['Type'], utils.add_url(x))):
            enum_values.add(utils.strip_url(ev))

        comment = ''
        for _, _, c in self.graph.triples(
                (utils.add_url(x), constants.schema_constants['Comment'], None)):
            comment += c
        soup = BeautifulSoup(comment, 'html.parser')
        comment = soup.get_text()

        proto_enum += enum_descriptor.EnumDescriptor(
            x, list(class_to_prop[x]), list(enum_values)).to_proto(comment)
        proto_enum += '\n'

    return proto_enum
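
# utils.add_url is not shown in this excerpt. Given that the graph is queried
# with rdflib-style triples, it presumably maps a bare term name back to a full
# schema.org IRI; a sketch of that assumption:
import rdflib

SCHEMA_PREFIX = 'http://schema.org/'  # assumed vocabulary prefix

def add_url_sketch(name: str) -> rdflib.URIRef:
    # Hypothetical inverse of strip_url for schema.org terms.
    return rdflib.URIRef(SCHEMA_PREFIX + name)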
def __get_json_descriptor(self, class_to_prop: Dict[str, Set[PropertyToParent]],
                          prop_to_class: Dict[str, Set[str]],
                          enumerations: Set[str]) -> Dict:
    """Return a json descriptor for the given schema.

    Args:
        class_to_prop (dict[str, set[PropertyToParent]]): Dictionary containing
            the set of properties for every class.
        prop_to_class (dict[str, set[str]]): Dictionary containing the range of
            classes/datatypes for every property.
        enumerations (set[str]): Set containing the enumerations in the schema.

    Returns:
        dict: The json descriptor for the schema.
    """
    defined_classes = set(class_to_prop.keys())
    total_classes = set()

    for _, _, range_class in self.graph.triples(
            (None, utils.constants.schema_constants['rangeIncludes'], None)):
        total_classes.add(utils.strip_url(range_class))

    undefined_classes = total_classes.difference(defined_classes)
    undefined_classes = undefined_classes | set(
        utils.constants.schema_primitives.keys())

    message_descriptor = {}

    for x in sorted(class_to_prop.keys()):
        if ((x not in enumerations) and (x not in constants.schema_datatypes)
                and (x not in constants.schema_primitives)):
            o = {}
            o['@type'] = utils.strip_url(x)
            prop_from_self = list()
            prop_inherited = dict()
            o['fields'] = list()
            o['fields'].append('@id')

            for p in class_to_prop[x]:
                if p.parent == x:
                    prop_from_self.append(p.name)
                else:
                    if p.parent not in prop_inherited:
                        prop_inherited[p.parent] = list()
                    prop_inherited[p.parent].append(p.name)

            prop_from_self = sorted(prop_from_self)
            prop_inherited = collections.OrderedDict(
                sorted(prop_inherited.items()))

            for p in prop_from_self:
                o['fields'].append(p)

            for ky in prop_inherited:
                props = sorted(prop_inherited[ky])
                o['fields'].extend(props)

            message_descriptor[x] = o

    for x in sorted(prop_to_class.keys()):
        if len(prop_to_class[x]) > 0:
            o = {}
            o['@type'] = 'Property'
            o['fields'] = sorted(list(prop_to_class[x]))
            message_descriptor[x] = o

    for x in sorted(enumerations):
        enum_values = set()
        for ev, _, _ in self.graph.triples(
                (None, constants.schema_constants['Type'], utils.add_url(x))):
            enum_values.add(ev)

        o = {}
        o['@type'] = 'EnumWrapper'
        o['values'] = sorted(list(enum_values))
        o['values'].insert(0, 'Unknown')
        o['fields'] = ['id', x + 'Class']

        o2 = {}
        o2['@type'] = x
        prop_from_self = list()
        prop_inherited = dict()
        o2['fields'] = list()
        o2['fields'].append('@id')

        for p in class_to_prop[x]:
            if p.parent == x:
                prop_from_self.append(p.name)
            else:
                if p.parent not in prop_inherited:
                    prop_inherited[p.parent] = list()
                prop_inherited[p.parent].append(p.name)

        prop_from_self = sorted(prop_from_self)
        prop_inherited = collections.OrderedDict(
            sorted(prop_inherited.items()))

        for p in prop_from_self:
            o2['fields'].append(p)

        for ky in prop_inherited:
            props = sorted(prop_inherited[ky])
            o2['fields'].extend(props)

        message_descriptor[x] = o
        message_descriptor[x + 'Class'] = o2

    message_descriptor['Date'] = {'@type': 'DatatypeDate'}
    message_descriptor['DateTime'] = {'@type': 'DatatypeDateTime'}
    message_descriptor['Time'] = {'@type': 'DatatypeTime'}
    message_descriptor['Duration'] = {'@type': 'DatatypeDuration'}
    message_descriptor['Distance'] = {'@type': 'DatatypeQuantitative'}
    message_descriptor['Energy'] = {'@type': 'DatatypeQuantitative'}
    message_descriptor['Mass'] = {'@type': 'DatatypeQuantitative'}

    json_descriptor = {}
    json_descriptor['messages'] = message_descriptor
    json_descriptor['primitives'] = sorted(undefined_classes)

    return json_descriptor
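
# For orientation, the returned descriptor has roughly this shape. All names
# and values below are illustrative, not actual generator output:
example_descriptor = {
    'messages': {
        'Thing': {'@type': 'Thing',
                  'fields': ['@id', 'description', 'name']},
        'name': {'@type': 'Property', 'fields': ['Text', 'URL']},
        'BookFormatType': {
            '@type': 'EnumWrapper',
            'values': ['Unknown', 'http://schema.org/Hardcover'],
            'fields': ['id', 'BookFormatTypeClass'],
        },
    },
    'primitives': ['Number', 'Text'],
}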
def __get_values(
    self
) -> Tuple[Dict[str, Set[PropertyToParent]], Dict[str, Set[str]], Set[str]]:
    """Call utils.topological_sort(), compress the inheritance hierarchy and
    return mappings between schema classes, schema properties and schema
    enumerations.

    Returns:
        dict[str, set[PropertyToParent]]: Dictionary containing the set of
            properties for every class.
        dict[str, set[str]]: Dictionary containing the range of
            classes/datatypes for every property.
        set[str]: Set containing the enumerations in the schema.
    """
    class_to_prop = dict()
    inheritance_graph = dict()

    # Collect the declared properties of every class.
    for class_name, _, _ in self.graph.triples(
            (None, constants.schema_constants['Type'],
             constants.schema_constants['Class'])):
        class_to_prop[utils.strip_url(class_name)] = set()
        for property_name, _, _ in self.graph.triples(
                (None, constants.schema_constants['domainIncludes'], class_name)):
            prop = utils.PropertyToParent(utils.strip_url(property_name),
                                          utils.strip_url(class_name))
            class_to_prop[utils.strip_url(class_name)].add(prop)

    # Build the inheritance graph (parent -> children).
    for class_name, _, _ in self.graph.triples(
            (None, constants.schema_constants['Type'],
             constants.schema_constants['Class'])):
        if class_name not in inheritance_graph:
            inheritance_graph[class_name] = set()
        for _, _, parent_class in self.graph.triples(
                (class_name, constants.schema_constants['subClassOf'], None)):
            if parent_class not in inheritance_graph:
                inheritance_graph[parent_class] = set()
            inheritance_graph[parent_class].add(class_name)

    # Propagate inherited properties in topological order, so every parent is
    # fully populated before its children are processed.
    topsort_order = utils.topological_sort(inheritance_graph)
    for class_name in topsort_order:
        for _, _, parent_class in self.graph.triples(
                (class_name, constants.schema_constants['subClassOf'], None)):
            if utils.strip_url(parent_class) in class_to_prop:
                class_to_prop[utils.strip_url(class_name)] = class_to_prop[
                    utils.strip_url(class_name)] | class_to_prop[
                        utils.strip_url(parent_class)]

    enumerations = set()
    for enum, _, _ in self.graph.triples(
            (None, constants.schema_constants['subClassOf'],
             constants.schema_constants['Enumeration'])):
        enumerations.add(utils.strip_url(enum))

    class_to_children = utils.get_children(inheritance_graph)

    # Temporary Code
    # class_to_children[rdflib.URIRef('http://schema.org/Audience')].add(rdflib.URIRef("http://schema.org/Researcher"))
    # class_to_prop["SteeringPositionValue"] = class_to_prop["Enumeration"]
    # class_to_prop["DriveWheelConfigurationValue"] = class_to_prop["Enumeration"]
    # enumerations.add("SteeringPositionValue")
    # enumerations.add("DriveWheelConfigurationValue")
    # End of temporary code

    # Map every property to its range, including subclasses of the range.
    prop_to_class = dict()
    for property_name, _, _ in self.graph.triples(
            (None, constants.schema_constants['Type'],
             constants.schema_constants['Property'])):
        prop_to_class[utils.strip_url(property_name)] = set()
        for _, _, class_name in self.graph.triples(
                (property_name, constants.schema_constants['rangeIncludes'], None)):
            prop_to_class[utils.strip_url(property_name)].add(
                utils.strip_url(class_name))

            if class_name in class_to_children:
                prop_to_class[utils.strip_url(property_name)] = prop_to_class[
                    utils.strip_url(property_name)] | set(
                        map(utils.strip_url, class_to_children[class_name]))

            if class_name == constants.schema_constants['Number']:
                prop_to_class[utils.strip_url(property_name)].add(
                    utils.strip_url(constants.schema_constants['Integer']))
                prop_to_class[utils.strip_url(property_name)].add(
                    utils.strip_url(constants.schema_constants['Float']))

            if class_name == constants.schema_constants['Text']:
                prop_to_class[utils.strip_url(property_name)].add(
                    utils.strip_url(constants.schema_constants['URL']))

    return class_to_prop, prop_to_class, enumerations
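
# utils.topological_sort is not shown here. The propagation loop above needs
# every parent ordered before its children; a minimal Kahn's-algorithm sketch
# under that assumption (not the project's actual implementation):
from collections import deque
from typing import Dict, List, Set, TypeVar

T = TypeVar('T')

def topological_sort_sketch(graph: Dict[T, Set[T]]) -> List[T]:
    """Order nodes of a parent -> children map so parents come first.

    Assumes the graph is acyclic, as an inheritance hierarchy should be.
    """
    indegree = {node: 0 for node in graph}
    for children in graph.values():
        for child in children:
            indegree[child] = indegree.get(child, 0) + 1
    queue = deque(node for node, degree in indegree.items() if degree == 0)
    order = []
    while queue:
        node = queue.popleft()
        order.append(node)
        for child in graph.get(node, set()):
            indegree[child] -= 1
            if indegree[child] == 0:
                queue.append(child)
    return order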
osgc_frameworks = osgc_entry['framework']
if type(osgc_frameworks) == str:
    osgc_frameworks = [osgc_frameworks]

our_frameworks = our_entry.get('Code dependency', [])
our_frameworks = [x.casefold() for x in our_frameworks]
our_frameworks = [x if x not in our_framework_replacements else our_framework_replacements[x] for x in our_frameworks]
osgc_frameworks = [x.casefold() for x in osgc_frameworks]

p += compare_sets(osgc_frameworks, our_frameworks, 'framework/dependencies')

# compare their repos with our code repositories and downloads
if 'repo' in osgc_entry:
    osgc_repos = osgc_entry['repo']
    if type(osgc_repos) == str:
        osgc_repos = [osgc_repos]
    osgc_repos = [u.strip_url(url) for url in osgc_repos]
    # we don't need the general sites there
    osgc_repos = [x for x in osgc_repos if not x.startswith('sourceforge.net/projects/')]

    our_repos = our_entry.get('Code repository', [])
    our_repos = [u.strip_url(url) for url in our_repos]
    # we do not yet spread our own deeds (but we will some day)
    our_repos = [x for x in our_repos if not x.startswith('gitlab.com/osgames/')]
    # no cvs or svn anymore
    our_repos = [x for x in our_repos if 'cvs.sourceforge.net' not in x and 'svn.code.sf.net/p/' not in x]
    our_downloads = our_entry.get('Download', [])
    our_downloads = [u.strip_url(url) for url in our_downloads]

    # if their repos are not in our downloads or repos
    p += compare_sets(osgc_repos, our_repos + our_downloads, 'repo', 'notthem')
    # if our main repo is not in their repos
    p += compare_sets(osgc_repos, our_repos[:1], 'repo', 'notus')
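
# compare_sets is used but not defined in this excerpt. A plausible sketch of
# its contract, inferred purely from the call sites above; the mode names and
# message format are assumptions:
def compare_sets_sketch(theirs, ours, name, mode='both'):
    theirs, ours = set(theirs), set(ours)
    p = ''
    if mode in ('both', 'notthem'):  # their items missing from ours
        for item in sorted(theirs - ours):
            p += '  {} {} missing in ours\n'.format(name, item)
    if mode in ('both', 'notus'):  # our items missing from theirs
        for item in sorted(ours - theirs):
            p += '  {} {} missing in theirs\n'.format(name, item)
    return p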
our_repos = our_entry.get('code repository', [])
for repo in repos:
    if repo.startswith('https://sourceforge.net/projects/'):
        continue
    # add .git automatically and try it too
    if (repo not in our_repos) and (repo + '.git' not in our_repos):
        p += ' code repository {} missing\n'.format(repo)

# url (ignore http/https)
if 'url' in osgc_entry:
    urls = osgc_entry['url']
    if type(urls) == str:
        urls = [urls]
    urls = [utils.strip_url(url) for url in urls]
    our_urls = our_entry['home']
    our_urls = [utils.strip_url(url) for url in our_urls]
    for url in urls:
        if url not in our_urls:
            p += ' home url {} missing\n'.format(url)

# status
if 'status' in osgc_entry:
    status = osgc_entry['status']
    our_status = our_entry['state']  # essential field
    if status == 'playable' and 'mature' not in our_status:
        p += ' status playable, not mature with us\n'
    if status != 'playable' and 'mature' in our_status:
        p += ' status {}, mature with us\n'.format(status)
    if status == 'unplayable':
text = utils.read_text(os.path.join(c.root_path, 'tools', 'rejected.txt'))
regex = re.compile(r"\((http.*?)\)", re.MULTILINE)
matches = regex.findall(text)
rejected_urls = []
for match in matches:
    urls = match.split(',')
    urls = [x.strip() for x in urls]
    rejected_urls.extend(urls)
game_urls.extend(rejected_urls)

more_urls = []
for url in game_urls:
    if url.startswith('https://web.archive.org/web'):
        url = url[url.index('http', 5):]
        more_urls.append(url)
game_urls.extend(more_urls)

stripped_game_urls = [utils.strip_url(x) for x in game_urls]

clean_backlog(stripped_game_urls)

# check for unfilled template lines
check_template_leftovers()

# fix entries
fix_entries()

# assemble info
infos = osg.assemble_infos()

# recount and write to readme and to tocs
update_readme_and_tocs(infos)

# generate report
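
# Illustration of the web-archive recovery above; the sample URL is made up:
url = 'https://web.archive.org/web/20190101000000/http://example.com/game'
# index('http', 5) skips the leading 'https' and finds the embedded original.
print(url[url.index('http', 5):])  # -> 'http://example.com/game'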