def test_strip_url():
    """Test utils.strip_url function.

    Procedure:
        - Create input and expected output for the following cases.
            * Normal URL.
            * Non URL.
            * URL without the protocol prefix.
            * URL that points to a subroute.
        - Call utils.strip_url for all the inputs.

    Verification:
        - Check that every output generated by the function is equal to the
          corresponding expected output.
    """
    ip1 = 'http://abc.com/entity'
    op1 = 'entity'

    ip2 = 'entity'
    op2 = 'entity'

    ip3 = 'abc.com/entity'
    op3 = 'entity'

    ip4 = 'http://abc.com/sub/entity'
    op4 = 'entity'

    assert utils.strip_url(ip1) == op1, 'Test a normal URL.'
    assert utils.strip_url(ip2) == op2, 'Test a non URL.'
    assert utils.strip_url(ip3) == op3, 'Test a URL without protocol.'
    assert utils.strip_url(ip4) == op4, 'Test a nested URL.'
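
# For orientation only: a minimal sketch of a strip_url that would satisfy the
# four cases above, assuming the function keeps the last path segment. This is
# a hypothetical reconstruction, not necessarily the real utils.strip_url.
def strip_url_sketch(url: str) -> str:
    # Keep whatever follows the final '/', or the whole string if there is none.
    return url.rstrip('/').rsplit('/', 1)[-1]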
def clean_backlog(self):
    """Remove URLs that are already covered by the entries or by the rejected
    list from the backlog file.
    """
    if not self.entries:
        print('entries not yet loaded')
        return

    # get urls from entries
    included_urls = osg.all_urls(self.entries)
    included_urls = list(included_urls.keys())  # only need the URLs here

    # get urls from rejected file
    text = utils.read_text(c.rejected_file)
    regex = re.compile(r"\((http.*?)\)", re.MULTILINE)
    matches = regex.findall(text)
    rejected_urls = []
    for match in matches:
        urls = match.split(',')
        urls = [x.strip() for x in urls]
        rejected_urls.extend(urls)
    included_urls.extend(rejected_urls)

    # for entries that only have a web archive version, also derive the original URL
    more_urls = []
    for url in included_urls:
        if url.startswith('https://web.archive.org/web'):
            # sometimes the http is missing in archive links (would need proper parsing)
            url = url[url.index('http', 5):]
            more_urls.append(url)
    included_urls.extend(more_urls)

    # now we strip the urls
    stripped_urls = [utils.strip_url(x) for x in included_urls]
    stripped_urls = set(stripped_urls)  # fast membership tests, also removes duplicates

    # read backlog and get urls from there
    text = utils.read_text(c.backlog_file)
    text = text.split('\n')

    # remove those that are already included or rejected
    text = [x for x in text if utils.strip_url(x) not in stripped_urls]

    # remove duplicates and sort
    text = sorted(list(set(text)), key=str.casefold)
    print('backlog contains {} items'.format(len(text)))

    # join and save again
    text = '\n'.join(text)
    utils.write_text(c.backlog_file, text)
    print('backlog cleaned')
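
# The rejected-file parsing above assumes lines such as
# 'Name (http://a.com, http://b.com)'. A quick illustration with a
# hypothetical sample line:
sample = 'Some Game (http://example.com/game, http://mirror.example.org/game)'
print(re.compile(r"\((http.*?)\)", re.MULTILINE).findall(sample))
# -> ['http://example.com/game, http://mirror.example.org/game']
# Each capture is then split on ',' and stripped into individual URLs.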
def clean_backlog(stripped_game_urls):
    # read backlog and split
    file = os.path.join(c.root_path, 'tools', 'backlog.txt')
    text = utils.read_text(file)
    text = text.split('\n')

    # remove those that are in stripped_game_urls
    text = [x for x in text if utils.strip_url(x) not in stripped_game_urls]

    # remove duplicates and sort
    text = sorted(list(set(text)), key=str.casefold)
    print('backlog contains {} items'.format(len(text)))

    # join and save again
    text = '\n'.join(text)
    utils.write_text(file, text)
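
# Usage sketch with a hypothetical URL list; passing a set (as the class-based
# variant above builds with set(...)) keeps each membership test O(1):
game_urls = ['http://example.com/game']  # hypothetical input
stripped_game_urls = {utils.strip_url(u) for u in game_urls}
clean_backlog(stripped_game_urls)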
def __enum_to_proto(self, class_to_prop: Dict[str, Set[PropertyToParent]],
                    enumerations: Set[str]) -> str:
    """Call EnumDescriptor.to_proto() and get proto code for every schema enumeration.

    Args:
        class_to_prop (dict[str, set[PropertyToParent]]): Dictionary containing
            the set of properties for every class.
        enumerations (set[str]): Set containing the enumerations in the schema.

    Returns:
        str: The proto code for all the schema enumerations, as a string.
    """
    proto_enum = '// Definition of enumerations begins here.\n\n'

    for x in sorted(enumerations):
        enum_values = set()
        for ev, _, _ in self.graph.triples(
                (None, constants.schema_constants['Type'], utils.add_url(x))):
            enum_values.add(utils.strip_url(ev))

        comment = ''
        for _, _, c in self.graph.triples(
                (utils.add_url(x), constants.schema_constants['Comment'], None)):
            comment += c
        soup = BeautifulSoup(comment, 'html.parser')
        comment = soup.get_text()

        proto_enum += enum_descriptor.EnumDescriptor(
            x, list(class_to_prop[x]), list(enum_values)).to_proto(comment)
        proto_enum += '\n'

    return proto_enum
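
# utils.add_url is not shown in this excerpt. Given that the graph is queried
# with rdflib-style triples, it presumably maps a bare term name back to a full
# schema.org IRI; a sketch of that assumption:
import rdflib

SCHEMA_PREFIX = 'http://schema.org/'  # assumed vocabulary prefix

def add_url_sketch(name: str) -> rdflib.URIRef:
    # Hypothetical inverse of strip_url for schema.org terms.
    return rdflib.URIRef(SCHEMA_PREFIX + name)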
def __get_json_descriptor(self, class_to_prop: Dict[str, Set[PropertyToParent]],
                          prop_to_class: Dict[str, Set[str]],
                          enumerations: Set[str]) -> Dict:
    """Return a json descriptor for the given schema.

    Args:
        class_to_prop (dict[str, set[PropertyToParent]]): Dictionary containing
            the set of properties for every class.
        prop_to_class (dict[str, set[str]]): Dictionary containing the range of
            classes/datatypes for every property.
        enumerations (set[str]): Set containing the enumerations in the schema.

    Returns:
        dict: The json descriptor for the schema.
    """
    defined_classes = set(class_to_prop.keys())
    total_classes = set()

    for _, _, range_class in self.graph.triples(
            (None, utils.constants.schema_constants['rangeIncludes'], None)):
        total_classes.add(utils.strip_url(range_class))

    undefined_classes = total_classes.difference(defined_classes)
    undefined_classes = undefined_classes | set(
        utils.constants.schema_primitives.keys())

    message_descriptor = {}

    for x in sorted(class_to_prop.keys()):
        if ((x not in enumerations) and (x not in constants.schema_datatypes)
                and (x not in constants.schema_primitives)):
            o = {}
            o['@type'] = utils.strip_url(x)
            prop_from_self = list()
            prop_inherited = dict()
            o['fields'] = list()
            o['fields'].append('@id')

            for p in class_to_prop[x]:
                if p.parent == x:
                    prop_from_self.append(p.name)
                else:
                    if p.parent not in prop_inherited:
                        prop_inherited[p.parent] = list()
                    prop_inherited[p.parent].append(p.name)

            prop_from_self = sorted(prop_from_self)
            prop_inherited = collections.OrderedDict(
                sorted(prop_inherited.items()))

            for p in prop_from_self:
                o['fields'].append(p)

            for ky in prop_inherited:
                props = sorted(prop_inherited[ky])
                o['fields'].extend(props)

            message_descriptor[x] = o

    for x in sorted(prop_to_class.keys()):
        if len(prop_to_class[x]) > 0:
            o = {}
            o['@type'] = 'Property'
            o['fields'] = sorted(list(prop_to_class[x]))
            message_descriptor[x] = o

    for x in sorted(enumerations):
        enum_values = set()
        for ev, _, _ in self.graph.triples(
                (None, constants.schema_constants['Type'], utils.add_url(x))):
            enum_values.add(ev)

        o = {}
        o['@type'] = 'EnumWrapper'
        o['values'] = sorted(list(enum_values))
        o['values'].insert(0, 'Unknown')
        o['fields'] = ['id', x + 'Class']

        o2 = {}
        o2['@type'] = x
        prop_from_self = list()
        prop_inherited = dict()
        o2['fields'] = list()
        o2['fields'].append('@id')

        for p in class_to_prop[x]:
            if p.parent == x:
                prop_from_self.append(p.name)
            else:
                if p.parent not in prop_inherited:
                    prop_inherited[p.parent] = list()
                prop_inherited[p.parent].append(p.name)

        prop_from_self = sorted(prop_from_self)
        prop_inherited = collections.OrderedDict(
            sorted(prop_inherited.items()))

        for p in prop_from_self:
            o2['fields'].append(p)

        for ky in prop_inherited:
            props = sorted(prop_inherited[ky])
            o2['fields'].extend(props)

        message_descriptor[x] = o
        message_descriptor[x + 'Class'] = o2

    message_descriptor['Date'] = {'@type': 'DatatypeDate'}
    message_descriptor['DateTime'] = {'@type': 'DatatypeDateTime'}
    message_descriptor['Time'] = {'@type': 'DatatypeTime'}
    message_descriptor['Duration'] = {'@type': 'DatatypeDuration'}
    message_descriptor['Distance'] = {'@type': 'DatatypeQuantitative'}
    message_descriptor['Energy'] = {'@type': 'DatatypeQuantitative'}
    message_descriptor['Mass'] = {'@type': 'DatatypeQuantitative'}

    json_descriptor = {}
    json_descriptor['messages'] = message_descriptor
    json_descriptor['primitives'] = sorted(undefined_classes)

    return json_descriptor
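
# For orientation, the returned descriptor has roughly this shape. All names
# and values below are illustrative, not actual generator output:
example_descriptor = {
    'messages': {
        'Thing': {'@type': 'Thing',
                  'fields': ['@id', 'description', 'name']},
        'name': {'@type': 'Property', 'fields': ['Text', 'URL']},
        'BookFormatType': {
            '@type': 'EnumWrapper',
            'values': ['Unknown', 'http://schema.org/Hardcover'],
            'fields': ['id', 'BookFormatTypeClass'],
        },
    },
    'primitives': ['Number', 'Text'],
}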
def __get_values(
    self
) -> Tuple[Dict[str, Set[PropertyToParent]], Dict[str, Set[str]], Set[str]]:
    """Call utils.topological_sort(), compress the inheritance hierarchy and
    return mappings between schema classes, schema properties and schema
    enumerations.

    Returns:
        dict[str, set[PropertyToParent]]: Dictionary containing the set of
            properties for every class.
        dict[str, set[str]]: Dictionary containing the range of
            classes/datatypes for every property.
        set[str]: Set containing the enumerations in the schema.
    """
    class_to_prop = dict()
    inheritance_graph = dict()

    # Collect the declared properties of every class.
    for class_name, _, _ in self.graph.triples(
            (None, constants.schema_constants['Type'],
             constants.schema_constants['Class'])):
        class_to_prop[utils.strip_url(class_name)] = set()
        for property_name, _, _ in self.graph.triples(
                (None, constants.schema_constants['domainIncludes'], class_name)):
            prop = utils.PropertyToParent(utils.strip_url(property_name),
                                          utils.strip_url(class_name))
            class_to_prop[utils.strip_url(class_name)].add(prop)

    # Build the inheritance graph (parent -> children).
    for class_name, _, _ in self.graph.triples(
            (None, constants.schema_constants['Type'],
             constants.schema_constants['Class'])):
        if class_name not in inheritance_graph:
            inheritance_graph[class_name] = set()
        for _, _, parent_class in self.graph.triples(
                (class_name, constants.schema_constants['subClassOf'], None)):
            if parent_class not in inheritance_graph:
                inheritance_graph[parent_class] = set()
            inheritance_graph[parent_class].add(class_name)

    # Propagate inherited properties in topological order, so every parent is
    # fully populated before its children are processed.
    topsort_order = utils.topological_sort(inheritance_graph)
    for class_name in topsort_order:
        for _, _, parent_class in self.graph.triples(
                (class_name, constants.schema_constants['subClassOf'], None)):
            if utils.strip_url(parent_class) in class_to_prop:
                class_to_prop[utils.strip_url(class_name)] = class_to_prop[
                    utils.strip_url(class_name)] | class_to_prop[
                        utils.strip_url(parent_class)]

    enumerations = set()
    for enum, _, _ in self.graph.triples(
            (None, constants.schema_constants['subClassOf'],
             constants.schema_constants['Enumeration'])):
        enumerations.add(utils.strip_url(enum))

    class_to_children = utils.get_children(inheritance_graph)

    # Temporary Code
    # class_to_children[rdflib.URIRef('http://schema.org/Audience')].add(rdflib.URIRef("http://schema.org/Researcher"))
    # class_to_prop["SteeringPositionValue"] = class_to_prop["Enumeration"]
    # class_to_prop["DriveWheelConfigurationValue"] = class_to_prop["Enumeration"]
    # enumerations.add("SteeringPositionValue")
    # enumerations.add("DriveWheelConfigurationValue")
    # End of temporary code

    # Map every property to its range, including subclasses of the range.
    prop_to_class = dict()
    for property_name, _, _ in self.graph.triples(
            (None, constants.schema_constants['Type'],
             constants.schema_constants['Property'])):
        prop_to_class[utils.strip_url(property_name)] = set()
        for _, _, class_name in self.graph.triples(
                (property_name, constants.schema_constants['rangeIncludes'], None)):
            prop_to_class[utils.strip_url(property_name)].add(
                utils.strip_url(class_name))

            if class_name in class_to_children:
                prop_to_class[utils.strip_url(property_name)] = prop_to_class[
                    utils.strip_url(property_name)] | set(
                        map(utils.strip_url, class_to_children[class_name]))

            if class_name == constants.schema_constants['Number']:
                prop_to_class[utils.strip_url(property_name)].add(
                    utils.strip_url(constants.schema_constants['Integer']))
                prop_to_class[utils.strip_url(property_name)].add(
                    utils.strip_url(constants.schema_constants['Float']))

            if class_name == constants.schema_constants['Text']:
                prop_to_class[utils.strip_url(property_name)].add(
                    utils.strip_url(constants.schema_constants['URL']))

    return class_to_prop, prop_to_class, enumerations
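
# utils.topological_sort is not shown here. The propagation loop above needs
# every parent ordered before its children; a minimal Kahn's-algorithm sketch
# under that assumption (not the project's actual implementation):
from collections import deque
from typing import Dict, List, Set, TypeVar

T = TypeVar('T')

def topological_sort_sketch(graph: Dict[T, Set[T]]) -> List[T]:
    """Order nodes of a parent -> children map so parents come first.

    Assumes the graph is acyclic, as an inheritance hierarchy should be.
    """
    indegree = {node: 0 for node in graph}
    for children in graph.values():
        for child in children:
            indegree[child] = indegree.get(child, 0) + 1
    queue = deque(node for node, degree in indegree.items() if degree == 0)
    order = []
    while queue:
        node = queue.popleft()
        order.append(node)
        for child in graph.get(node, set()):
            indegree[child] -= 1
            if indegree[child] == 0:
                queue.append(child)
    return order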
osgc_frameworks = osgc_entry['framework']
if type(osgc_frameworks) == str:
    osgc_frameworks = [osgc_frameworks]

our_frameworks = our_entry.get('Code dependency', [])
our_frameworks = [x.casefold() for x in our_frameworks]
our_frameworks = [x if x not in our_framework_replacements else our_framework_replacements[x] for x in our_frameworks]
osgc_frameworks = [x.casefold() for x in osgc_frameworks]

p += compare_sets(osgc_frameworks, our_frameworks, 'framework/dependencies')

# compare their repos with our code repositories and downloads
if 'repo' in osgc_entry:
    osgc_repos = osgc_entry['repo']
    if type(osgc_repos) == str:
        osgc_repos = [osgc_repos]
    osgc_repos = [u.strip_url(url) for url in osgc_repos]
    # we don't need the general sites there
    osgc_repos = [x for x in osgc_repos if not x.startswith('sourceforge.net/projects/')]

    our_repos = our_entry.get('Code repository', [])
    our_repos = [u.strip_url(url) for url in our_repos]
    # we do not yet spread our own deeds (but we will some day)
    our_repos = [x for x in our_repos if not x.startswith('gitlab.com/osgames/')]
    # no cvs or svn anymore
    our_repos = [x for x in our_repos if 'cvs.sourceforge.net' not in x and 'svn.code.sf.net/p/' not in x]
    our_downloads = our_entry.get('Download', [])
    our_downloads = [u.strip_url(url) for url in our_downloads]

    # if their repos are not in our downloads or repos
    p += compare_sets(osgc_repos, our_repos + our_downloads, 'repo', 'notthem')
    # if our main repo is not in their repos
    p += compare_sets(osgc_repos, our_repos[:1], 'repo', 'notus')
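
# compare_sets is used but not defined in this excerpt. A plausible sketch of
# its contract, inferred purely from the call sites above; the mode names and
# message format are assumptions:
def compare_sets_sketch(theirs, ours, name, mode='both'):
    theirs, ours = set(theirs), set(ours)
    p = ''
    if mode in ('both', 'notthem'):  # their items missing from ours
        for item in sorted(theirs - ours):
            p += '  {} {} missing in ours\n'.format(name, item)
    if mode in ('both', 'notus'):  # our items missing from theirs
        for item in sorted(ours - theirs):
            p += '  {} {} missing in theirs\n'.format(name, item)
    return p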
our_repos = our_entry.get('code repository', [])
for repo in repos:
    if repo.startswith('https://sourceforge.net/projects/'):
        continue
    # add .git automatically and try it too
    if (repo not in our_repos) and (repo + '.git' not in our_repos):
        p += ' code repository {} missing\n'.format(repo)

# url (ignore http/https)
if 'url' in osgc_entry:
    urls = osgc_entry['url']
    if type(urls) == str:
        urls = [urls]
    urls = [utils.strip_url(url) for url in urls]
    our_urls = our_entry['home']
    our_urls = [utils.strip_url(url) for url in our_urls]
    for url in urls:
        if url not in our_urls:
            p += ' home url {} missing\n'.format(url)

# status
if 'status' in osgc_entry:
    status = osgc_entry['status']
    our_status = our_entry['state']  # essential field
    if status == 'playable' and 'mature' not in our_status:
        p += ' status playable, not mature with us\n'
    if status != 'playable' and 'mature' in our_status:
        p += ' status {}, mature with us\n'.format(status)
    if status == 'unplayable':
text = utils.read_text(os.path.join(c.root_path, 'tools', 'rejected.txt'))
regex = re.compile(r"\((http.*?)\)", re.MULTILINE)
matches = regex.findall(text)
rejected_urls = []
for match in matches:
    urls = match.split(',')
    urls = [x.strip() for x in urls]
    rejected_urls.extend(urls)
game_urls.extend(rejected_urls)

more_urls = []
for url in game_urls:
    if url.startswith('https://web.archive.org/web'):
        url = url[url.index('http', 5):]
        more_urls.append(url)
game_urls.extend(more_urls)

stripped_game_urls = [utils.strip_url(x) for x in game_urls]

clean_backlog(stripped_game_urls)

# check for unfilled template lines
check_template_leftovers()

# fix entries
fix_entries()

# assemble info
infos = osg.assemble_infos()

# recount and write to readme and to tocs
update_readme_and_tocs(infos)

# generate report
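
# Illustration of the web-archive recovery above; the sample URL is made up:
url = 'https://web.archive.org/web/20190101000000/http://example.com/game'
# index('http', 5) skips the leading 'https' and finds the embedded original.
print(url[url.index('http', 5):])  # -> 'http://example.com/game'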