Esempio n. 1
0
def parse(location, handler):
    """
    Parse the XML file at `location` and return the result of calling
    `handler` on the parsed etree document.

    Two approaches are tried in order:

    1. lxml reads and parses the file at `location` directly;
    2. if anything fails in step 1 (either while parsing or while calling
       `handler`), the file is first decoded to text with
       `analysis.unicode_text`, that text is parsed from a string and
       `handler` is called again on the result.

    This is a workaround some lxml bug/weirdness wrt unicode in the 2.3
    version in use, and a way to deal with weird encodings.

    The `handler` function must have no side effects and can be called
    again on failures without risk. Note that `handler` receives the
    object returned by `etree.parse` on the first attempt and the object
    returned by `etree.fromstring` on the fallback attempt (an ElementTree
    and a root Element respectively in lxml), so it must work with both.

    Any exception raised by the fallback attempt propagates to the caller.
    """

    def new_parser():
        # A fresh parser is created for each attempt so that the retry
        # does not share a parser with the attempt that just failed.
        return etree.XMLParser(
            recover=True, remove_blank_text=True, resolve_entities=False)

    try:
        xdoc = etree.parse(location, new_parser())
        return handler(xdoc)
    except Exception:
        # Only regular errors trigger the fallback. A bare `except:` would
        # also catch KeyboardInterrupt and SystemExit and then run a second
        # full parse instead of letting the interruption through.
        text = analysis.unicode_text(location)
        xdoc = etree.fromstring(_as_unicode_bytes(text), new_parser())
        return handler(xdoc)
Esempio n. 2
0
    def __init__(self, location):
        """
        Build a POM object from the POM XML file at `location`.

        The file is read as UTF-8 text, falling back to
        `analysis.unicode_text` when it cannot be decoded as UTF-8. The
        text starting at the first '<project' is then parsed with
        POM_PARSER and the POM fields are collected as attributes.
        """
        # NOTE: most of this is copied over from Pom.__init__
        try:
            with codecs.open(location, 'rb', encoding='UTF-8') as pom_file:
                content = pom_file.read()
        except UnicodeDecodeError:
            content = analysis.unicode_text(location)

        # Keep the text from the first '<project' onwards, then replace the
        # first STRIP_NAMESPACE_RE match with a plain <project> tag
        # (presumably the opening tag with its namespace declarations).
        project_start = content.find('<project')
        content = STRIP_NAMESPACE_RE.sub(
            '<project>', content[project_start:], 1)

        self._xml = etree.fromstring(content, parser=POM_PARSER)

        # FIXME: we do not use a client for now. there are pending issues at pymaven to address this
        self._client = None

        # short local alias for the many single-value lookups below
        get_attr = self._get_attribute

        # project coordinates and descriptive fields
        self.model_version = get_attr('modelVersion')
        self.group_id = get_attr('groupId')
        self.artifact_id = get_attr('artifactId')
        self.version = get_attr('version')
        self.classifier = get_attr('classifier')
        # 'jar' is used when there is no packaging value
        self.packaging = get_attr('packaging') or 'jar'
        self.name = get_attr('name')
        self.description = get_attr('description')
        self.inception_year = get_attr('inceptionYear')
        self.url = get_attr('url')
        self.organization_name = get_attr('organization/name')
        self.organization_url = get_attr('organization/url')

        # multi-valued and structured sections
        self.licenses = list(self._find_licenses())
        self.developers = list(self._find_parties('developers/developer'))
        self.contributors = list(self._find_parties('contributors/contributor'))
        self.mailing_lists = list(self._find_mailing_lists())
        self.scm = self._find_scm()
        self.issue_management = self._find_issue_management()
        self.ci_management = self._find_ci_management()
        self.distribution_management = self._find_distribution_management()
        self.repositories = list(self._find_repositories('repositories/repository'))
        self.plugin_repositories = list(self._find_repositories('pluginRepositories/pluginRepository'))
        self.modules = self._get_attributes_list('modules/module')

        # FIXME: this attribute should be collected with the parent but
        # is not retrieved yet by pymaven it points to the relative path
        # where to find the full parent POM
        self.parent_relative_path = get_attr('relativePath')  # or '../pom.xml'

        # FIXME: Other types that are not collected for now (or
        # indirectly through dependencies management) include: build,
        # reporting, profiles, etc

        # dynamic attributes: all start as None here
        self._parent = None
        self._dep_mgmt = None
        self._dependencies = None
        self._properties = None
Esempio n. 3
0
def _parse_as_string(location):
    """
    Return an etree doc for the XML file at `location`.

    The file content is first converted to text with
    `analysis.unicode_text` (trying hard to get unicode) and is then
    parsed from a string rather than from the file itself.
    """
    parser_options = dict(
        recover=True,
        remove_blank_text=True,
        resolve_entities=False,
    )
    xml_parser = etree.XMLParser(**parser_options)
    decoded = analysis.unicode_text(location)
    return etree.fromstring(_as_unicode_bytes(decoded), xml_parser)
Esempio n. 4
0
    def __init__(self, location=None, text=None):
        """
        Build a POM from either a file `location` or a unicode `text`.

        Exactly one of `location` or `text` must be provided. A file is
        read as UTF-8 text, falling back to `analysis.unicode_text` when it
        cannot be decoded as UTF-8.
        """
        # exactly one of the two arguments must be truthy
        assert bool(location) != bool(text)

        if not location:
            xml_text = text
        else:
            try:
                with io.open(location, encoding='utf-8') as pom_file:
                    xml_text = pom_file.read()
            except UnicodeDecodeError:
                xml_text = analysis.unicode_text(location)

        # the parser is fed UTF-8 encoded bytes
        xml_text = strip_namespace(xml_text).encode('utf-8')
        if TRACE:
            logger.debug('MavenPom.__init__: xml_text: {}'.format(xml_text))

        self._pom_data = etree.fromstring(xml_text, parser=pom.POM_PARSER)

        # collect and then remove XML comments from the XML elements tree
        self.comments = self._get_comments()
        etree.strip_tags(self._pom_data, etree.Comment)

        # FIXME: we do not use a client for now.
        # There are pending issues at pymaven to address this
        self._client = None

        # short local alias for the many single-value lookups below
        get_attr = self._get_attribute

        self.model_version = get_attr('modelVersion')
        if not self.model_version:
            # for older POM version 3
            self.model_version = get_attr('pomVersion')
        self.group_id = get_attr('groupId')
        self.artifact_id = get_attr('artifactId')
        if TRACE:
            logger.debug('MavenPom.__init__: self.artifact_id: {}'.format(self.artifact_id))

        self.version = get_attr('version')
        self.classifier = get_attr('classifier')
        # 'jar' is used when there is no packaging value
        self.packaging = get_attr('packaging') or 'jar'
        self.name = get_attr('name')
        self.description = get_attr('description')
        self.inception_year = get_attr('inceptionYear')
        self.url = get_attr('url')
        self.organization_name = get_attr('organization/name')
        self.organization_url = get_attr('organization/url')

        # multi-valued and structured sections
        self.licenses = list(self._find_licenses())
        self.developers = list(
            self._find_parties('developers/developer'))
        self.contributors = list(
            self._find_parties('contributors/contributor'))
        self.mailing_lists = list(self._find_mailing_lists())
        self.scm = self._find_scm()
        self.issue_management = self._find_issue_management()
        self.ci_management = self._find_ci_management()
        self.distribution_management = self._find_distribution_management()
        self.repositories = list(
            self._find_repositories('repositories/repository'))
        self.plugin_repositories = list(
            self._find_repositories('pluginRepositories/pluginRepository'))
        self.modules = self._get_attributes_list('modules/module')

        # FIXME: this attribute should be collected with the parent but
        # is not retrieved yet by pymaven it points to the relative path
        # where to find the full parent POM
        self.parent_relative_path = get_attr('relativePath')  # or '../pom.xml'

        # FIXME: Other types that are not collected for now (or
        # indirectly through dependencies management) include: build,
        # reporting, profiles, etc

        # dynamic attributes: all start as None here
        self._parent = None
        self._dep_mgmt = None
        self._dependencies = None
        self._properties = None
Esempio n. 5
0
    def __init__(self, location=None, text=None):
        """
        Build a POM from either a file `location` or a unicode `text`.

        Exactly one of `location` or `text` must be provided. A file is
        read as UTF-8 text, falling back to `analysis.unicode_text` when it
        cannot be decoded as UTF-8. XML comments are kept by the parser,
        collected in `self.comments` and then stripped from the tree.
        """
        # exactly one of the two arguments must be truthy
        assert bool(location) != bool(text)
        # NOTE: most of this is derived from pymaven.Pom.__init__
        if not location:
            content = text
        else:
            try:
                with io.open(location, encoding='utf-8') as pom_file:
                    content = pom_file.read()
            except UnicodeDecodeError:
                content = analysis.unicode_text(location)

        # Keep the text from the first '<project' onwards, then replace the
        # first STRIP_NAMESPACE_RE match with a plain <project> tag
        # (presumably the opening tag with its namespace declarations).
        project_start = content.find('<project')
        content = STRIP_NAMESPACE_RE.sub(
            '<project>', content[project_start:], 1)

        # we keep comments (remove_comments=False) in case there is a
        # license in the comments
        pom_parser = etree.XMLParser(
            recover=True, remove_comments=False, remove_pis=True,
            remove_blank_text=True, resolve_entities=False)

        self._xml = etree.fromstring(content, parser=pom_parser)

        # collect and then remove XML comments from the XML elements tree
        self.comments = self._get_comments()
        etree.strip_tags(self._xml, etree.Comment)

        # FIXME: we do not use a client for now. There are pending issues at pymaven to address this
        self._client = None

        # short local alias for the many single-value lookups below
        get_attr = self._get_attribute

        self.model_version = get_attr('modelVersion')
        if not self.model_version:
            # for version 3
            self.model_version = get_attr('pomVersion')
        self.group_id = get_attr('groupId')
        self.artifact_id = get_attr('artifactId')
        if TRACE:
            logger.debug('MavenPom.__init__: self.artifact_id: {}'.format(self.artifact_id))
        self.version = get_attr('version')
        self.classifier = get_attr('classifier')
        # 'jar' is used when there is no packaging value
        self.packaging = get_attr('packaging') or 'jar'
        self.name = get_attr('name')
        self.description = get_attr('description')
        self.inception_year = get_attr('inceptionYear')
        self.url = get_attr('url')
        self.organization_name = get_attr('organization/name')
        self.organization_url = get_attr('organization/url')

        # multi-valued and structured sections
        self.licenses = list(self._find_licenses())
        self.developers = list(
            self._find_parties('developers/developer'))
        self.contributors = list(
            self._find_parties('contributors/contributor'))
        self.mailing_lists = list(self._find_mailing_lists())
        self.scm = self._find_scm()
        self.issue_management = self._find_issue_management()
        self.ci_management = self._find_ci_management()
        self.distribution_management = self._find_distribution_management()
        self.repositories = list(
            self._find_repositories('repositories/repository'))
        self.plugin_repositories = list(
            self._find_repositories('pluginRepositories/pluginRepository'))
        self.modules = self._get_attributes_list('modules/module')

        # FIXME: this attribute should be collected with the parent but
        # is not retrieved yet by pymaven it points to the relative path
        # where to find the full parent POM
        self.parent_relative_path = get_attr('relativePath')  # or '../pom.xml'

        # FIXME: Other types that are not collected for now (or
        # indirectly through dependencies management) include: build,
        # reporting, profiles, etc

        # dynamic attributes: all start as None here
        self._parent = None
        self._dep_mgmt = None
        self._dependencies = None
        self._properties = None
Esempio n. 6
0
    def __init__(self, location=None, text=None):
        """
        Build a POM from either a file `location` or a unicode `text`.

        Exactly one of `location` or `text` must be provided. A file is
        read as UTF-8 text, falling back to `analysis.unicode_text` when it
        cannot be decoded as UTF-8. XML comments and processing
        instructions are removed by the parser.
        """
        # exactly one of the two arguments must be truthy
        assert bool(location) != bool(text)
        # NOTE: most of this is copied over from Pom.__init__
        if not location:
            content = text
        else:
            try:
                with codecs.open(location, 'rb', encoding='UTF-8') as pom_file:
                    content = pom_file.read()
            except UnicodeDecodeError:
                content = analysis.unicode_text(location)

        # Keep the text from the first '<project' onwards, then replace the
        # first STRIP_NAMESPACE_RE match with a plain <project> tag
        # (presumably the opening tag with its namespace declarations).
        project_start = content.find('<project')
        content = STRIP_NAMESPACE_RE.sub(
            '<project>', content[project_start:], 1)

        pom_parser = etree.XMLParser(
            recover=True, remove_comments=True, remove_pis=True,
            remove_blank_text=True, resolve_entities=False)

        self._xml = etree.fromstring(content, parser=pom_parser)

        # FIXME: we do not use a client for now. there are pending issues at pymaven to address this
        self._client = None

        # short local alias for the many single-value lookups below
        get_attr = self._get_attribute

        self.model_version = get_attr('modelVersion')
        if not self.model_version:
            # for version 3
            self.model_version = get_attr('pomVersion')
        self.group_id = get_attr('groupId')
        self.artifact_id = get_attr('artifactId')
        self.version = get_attr('version')
        self.classifier = get_attr('classifier')
        # 'jar' is used when there is no packaging value
        self.packaging = get_attr('packaging') or 'jar'
        self.name = get_attr('name')
        self.description = get_attr('description')
        self.inception_year = get_attr('inceptionYear')
        self.url = get_attr('url')
        self.organization_name = get_attr('organization/name')
        self.organization_url = get_attr('organization/url')

        # multi-valued and structured sections
        self.licenses = list(self._find_licenses())
        self.developers = list(
            self._find_parties('developers/developer'))
        self.contributors = list(
            self._find_parties('contributors/contributor'))
        self.mailing_lists = list(self._find_mailing_lists())
        self.scm = self._find_scm()
        self.issue_management = self._find_issue_management()
        self.ci_management = self._find_ci_management()
        self.distribution_management = self._find_distribution_management()
        self.repositories = list(
            self._find_repositories('repositories/repository'))
        self.plugin_repositories = list(
            self._find_repositories('pluginRepositories/pluginRepository'))
        self.modules = self._get_attributes_list('modules/module')

        # FIXME: this attribute should be collected with the parent but
        # is not retrieved yet by pymaven it points to the relative path
        # where to find the full parent POM
        self.parent_relative_path = get_attr('relativePath')  # or '../pom.xml'

        # FIXME: Other types that are not collected for now (or
        # indirectly through dependencies management) include: build,
        # reporting, profiles, etc

        # dynamic attributes: all start as None here
        self._parent = None
        self._dep_mgmt = None
        self._dependencies = None
        self._properties = None