Beispiel #1
0
	def extract_rows(self, *args, **kwargs):
		"""
		Row data extraction for extract_tabular.

		Elements selected by the CSS expression ``selector`` are paired, in
		order, with ``table_headers`` and the extracted values are stored in
		``result`` under the matching header. Elements beyond the number of
		headers are ignored; headers without an element receive ``default``.

		Keyword arguments read (fallback used when absent in parentheses):

		:param result: Dict the row is written into (a new dict)
		:param selector: The CSS selector expression ('')
		:param table_headers: Keys for the selected elements, in order ([])
		:param attr: "text" to join the element's text, otherwise the name of the attribute to read ("text")
		:param connector: String used to join the text fragments of an element ('')
		:param default: Value used when text extraction fails or a header has no element ('')
		:param verbosity: Progress is printed when greater than 1 (0)
		:return: A list holding the single populated row dict
		:raises Exception: If a TypeError occurs while selecting or extracting
		"""
		# Read every option once instead of repeating the kwargs lookups.
		selector = kwargs.get('selector', '')
		table_headers = kwargs.get('table_headers', [])
		attr = kwargs.get('attr', 'text')
		connector = kwargs.get('connector', '')
		default = kwargs.get('default', '')
		verbosity = kwargs.get('verbosity', 0)
		result = kwargs.get('result', {})
		result_list = []
		# Stand-in for the element of a header that has no match. Padding with
		# ``default`` itself would make attribute extraction call .get() on it.
		missing = object()

		try:
			sel = cssselect.CSSSelector(selector)
			# One entry per header: surplus elements are dropped, gaps are padded.
			values = list(sel(self.tree))[:len(table_headers)]
			values.extend([missing] * (len(table_headers) - len(values)))
			for head, val in zip(table_headers, values):
				if verbosity > 1:
					print("\nExtracting", head, "attribute", sep=' ', end='')
				if attr == "text":
					try:
						content = connector.join([make_ascii(x).strip() for x in val.itertext()])
					except Exception:
						# Best effort: also covers the padding sentinel, which has no itertext().
						content = default
					content = content.replace("\n", " ").strip()
				elif val is missing:
					content = default
				else:
					content = val.get(attr)
					if attr in ["href", "src"]:
						content = urljoin(self.url, content)
				result[head] = content
			result_list.append(result)
		except TypeError:
			# %-formatting keeps the message buildable when the selector is not a string.
			raise Exception("Selector expression string to be provided. Got %s" % (selector,))

		return result_list
Beispiel #2
0
	def extract_rows(self, result=None, selector='', table_headers=(), attr='', connector='', default='', verbosity=0, *args, **kwargs):
		"""
		Row data extraction for extract_tabular.

		The elements returned by ``self.get_tree_tag(selector)`` are paired, in
		order, with ``table_headers`` and the extracted values are stored in
		``result`` under the matching header. Elements beyond the number of
		headers are ignored; headers without an element receive ``default``.

		:param result: Dict the row is written into. A passed dict is updated in place; when omitted a new dict is used for each call
		:param selector: The selector expression handed to ``get_tree_tag``
		:param table_headers: Keys for the selected elements, in order
		:param attr: "text" (case-insensitive) to join the element's text, otherwise the name of the attribute to read
		:param connector: String used to join the text fragments of an element
		:param default: Value used when text extraction fails or a header has no element
		:param verbosity: Progress is printed when greater than 1
		:return: A list holding the single populated row dict
		:raises Exception: If the selector is invalid (XPathError) or a TypeError occurs during extraction
		"""
		# A shared ``{}`` default would carry values over from one call to the next.
		if result is None:
			result = {}
		result_list = []
		# Stand-in for the element of a header that has no match. Padding with
		# ``default`` itself would make attribute extraction call .get() on it.
		missing = object()
		wants_text = attr.lower() == "text"

		try:
			# One entry per header: surplus elements are dropped, gaps are padded.
			values = list(self.get_tree_tag(selector))[:len(table_headers)]
			values.extend([missing] * (len(table_headers) - len(values)))
			for head, val in zip(table_headers, values):
				if verbosity > 1:
					print("\nExtracting", head, "attribute", sep=' ', end='')
				if wants_text:
					try:
						content = connector.join([make_ascii(x).strip() for x in val.itertext()])
					except Exception:
						# Best effort: also covers the padding sentinel, which has no itertext().
						content = default
					content = content.replace("\n", " ").strip()
				elif val is missing:
					content = default
				else:
					content = val.get(attr)
					if attr in ["href", "src"]:
						content = urljoin(self.url, content)
				result[head] = content
			result_list.append(result)
		except XPathError:
			raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector))
		except TypeError:
			# %-formatting keeps the message buildable when the selector is not a string.
			raise Exception("Selector expression string to be provided. Got %s" % (selector,))

		return result_list
Beispiel #3
0
    def extract_columns(self, *args, **kwargs):
        """
        Column data extraction for extract_tabular.

        Each XPath expression in ``selector`` is paired, in order, with a
        header from ``table_headers``; the elements it returns form that
        header's column. One row dict (a copy of ``result`` plus one value
        per header) is produced per index, up to the length of the shortest
        column. No rows are produced when there are no columns.

        Keyword arguments read (fallback used when absent in parentheses):

        :param result: Dict copied as the base of every row (a new dict)
        :param selector: One XPath expression or a list of them ('')
        :param table_headers: Keys for the columns, in selector order ([])
        :param attr: "text" to join the element's text, otherwise the name of the attribute to read ("text")
        :param connector: String used to join the text fragments of an element ('')
        :param default: Value used when text extraction fails ('')
        :param verbosity: Progress is printed when greater than 1 (0)
        :return: A list of row dicts
        :raises Exception: If ``selector`` is neither a string nor a list, on XPathError, or when a TypeError occurs during extraction
        """
        # Read every option once instead of repeating the kwargs lookups.
        selector = kwargs.get('selector', '')
        table_headers = kwargs.get('table_headers', [])
        attr = kwargs.get('attr', 'text')
        connector = kwargs.get('connector', '')
        default = kwargs.get('default', '')
        verbosity = kwargs.get('verbosity', 0)
        result = kwargs.get('result', {})
        result_list = []
        try:
            string_types = (str, unicode)  # Python 2
        except NameError:
            string_types = (str,)  # Python 3 has no separate unicode type

        # Expression named in error messages; bound up front so the handlers
        # below can always refer to it, even if the failure precedes the loop.
        current = selector
        try:
            if isinstance(selector, string_types):
                selectors = [selector]
            elif isinstance(selector, list):
                selectors = selector
            else:
                raise Exception(
                    "Use a list of selector expressions for the various columns"
                )
            columns = {}
            for head, current in zip(table_headers, selectors):
                columns[head] = self.tree.xpath(current)
            heads = list(columns)
            # zip() stops at the shortest column and yields nothing when there
            # are no columns, so the loop always terminates.
            for cells in zip(*[columns[head] for head in heads]):
                row = result.copy()
                for head, col in zip(heads, cells):
                    if verbosity > 1:
                        print("\nExtracting", head, "attribute", sep=' ', end='')
                    if attr == "text":
                        try:
                            content = connector.join(
                                [make_ascii(x).strip() for x in col.itertext()])
                        except Exception:
                            # Best effort: fall back to the default value.
                            content = default
                        content = content.replace("\n", " ").strip()
                    else:
                        content = col.get(attr)
                        if attr in ["href", "src"]:
                            content = urljoin(self.url, content)
                    row[head] = content
                result_list.append(row)
        except XPathError:
            raise Exception("Invalid XPath selector %s" % (current,))
        except TypeError:
            # %-formatting keeps the message buildable for non-string selectors.
            raise Exception(
                "Selector expression string to be provided. Got %s" % (current,))

        return result_list
Beispiel #4
0
    def extract_content(self,
                        selector='',
                        attr='',
                        default='',
                        connector='',
                        *args,
                        **kwargs):
        """
        Method for performing the content extraction for the particular selector type.

        If the selector is "url" (case-insensitive), the URL of the current
        web page is returned. Otherwise the tag is fetched with
        ``self.get_tree_tag(selector=selector, get_one=True)``. For an attr
        of "text" (case-insensitive) the tag's text fragments are stripped,
        joined with ``connector`` and newlines are replaced by spaces. For any
        other attr the attribute is read from the tag, and "href" / "src"
        values are joined with the page URL to form an absolute path.

        If an IndexError is raised during extraction (presumably by
        ``get_tree_tag`` when nothing matches - confirm), the default value is
        returned. If the default is the empty string, an exception is raised.

        :param selector: The selector expression
        :param attr: The attribute to be extracted from the selected tag
        :param default: The default value to be used if the selector does not return any data
        :param connector: String connector for list of data returned for a particular selector
        :return: The extracted content
        :raises Exception: If there is no content and no default, or the selector is invalid (XPathError)
        """
        try:
            if selector.lower() == "url":
                return self.url
            wants_text = attr.lower() == "text"
            tag = self.get_tree_tag(selector=selector, get_one=True)
            if wants_text:
                content = connector.join(
                    [make_ascii(x).strip() for x in tag.itertext()])
                content = content.replace("\n", " ").strip()
            else:
                content = tag.get(attr)
                if attr in ["href", "src"]:
                    content = urljoin(self.url, content)
            return content
        except IndexError:
            # Compare by value: identity against a string literal depends on
            # interning and is not a reliable "was a default given" test.
            if default != "":
                return default
            raise Exception("There is no content for the %s selector - %s" %
                            (self.__selector_type__, selector))
        except XPathError:
            raise Exception("Invalid %s selector - %s" %
                            (self.__selector_type__, selector))
Beispiel #5
0
    def extract_content(self, *args, **kwargs):
        """
        Method for performing the content extraction for the given CSS selector.

        The selector expression is compiled with ``cssselect.CSSSelector`` and
        applied to ``self.tree``; the first matching tag is used.

        If the selector is "url", the URL of the current web page is returned.
        For an attr of "text" the tag's text fragments are stripped, joined
        with ``connector`` and newlines are replaced by spaces. For any other
        attr the attribute is read from the tag, and "href" / "src" values are
        joined with the page URL to form an absolute path.

        If the selector does not fetch any content, the default value is
        returned. If the default is the empty string, an exception is raised.

        All values are read from keyword arguments and fall back to '':

        :param selector: The CSS selector expression
        :param attr: The attribute to be extracted from the selected tag
        :param default: The default value to be used if the selector does not return any data
        :param connector: String connector for the text fragments of the tag
        :return: The extracted content
        :raises Exception: If there is no content and no default
        """
        selector, attr, default, connector = [
            kwargs.get(x, '')
            for x in ['selector', 'attr', 'default', 'connector']
        ]
        if selector == "url":
            return self.url
        try:
            sel = cssselect.CSSSelector(selector)
            # Indexing the empty match list is what raises IndexError below.
            tag = sel(self.tree)[0]
            if attr == "text":
                content = connector.join(
                    [make_ascii(x).strip() for x in tag.itertext()])
                content = content.replace("\n", " ").strip()
            else:
                content = tag.get(attr)
                if attr in ["href", "src"]:
                    content = urljoin(self.url, content)
            return content
        except IndexError:
            # Compare by value: identity against a string literal depends on
            # interning and is not a reliable "was a default given" test.
            if default != "":
                return default
            raise Exception("There is no content for the selector " + selector)
Beispiel #6
0
    def extract_rows(self,
                     result=None,
                     selector='',
                     table_headers=(),
                     attr='',
                     connector='',
                     default='',
                     verbosity=0,
                     *args,
                     **kwargs):
        """
        Row data extraction for extract_tabular.

        The elements returned by ``self.get_tree_tag(selector)`` are paired,
        in order, with ``table_headers`` and the extracted values are stored
        in ``result`` under the matching header. Elements beyond the number of
        headers are ignored; headers without an element receive ``default``.

        :param result: Dict the row is written into. A passed dict is updated in place; when omitted a new dict is used for each call
        :param selector: The selector expression handed to ``get_tree_tag``
        :param table_headers: Keys for the selected elements, in order
        :param attr: "text" (case-insensitive) to join the element's text, otherwise the name of the attribute to read
        :param connector: String used to join the text fragments of an element
        :param default: Value used when text extraction fails or a header has no element
        :param verbosity: Progress is printed when greater than 1
        :return: A list holding the single populated row dict
        :raises Exception: If the selector is invalid (XPathError) or a TypeError occurs during extraction
        """
        # A shared ``{}`` default would carry values over between calls.
        if result is None:
            result = {}
        result_list = []
        # Stand-in for the element of a header that has no match. Padding with
        # ``default`` itself would make attribute extraction call .get() on it.
        missing = object()
        wants_text = attr.lower() == "text"

        try:
            # One entry per header: surplus elements dropped, gaps padded.
            header_count = len(table_headers)
            values = list(self.get_tree_tag(selector))[:header_count]
            values.extend([missing] * (header_count - len(values)))
            for head, val in zip(table_headers, values):
                if verbosity > 1:
                    print("\nExtracting", head, "attribute", sep=' ', end='')
                if wants_text:
                    try:
                        content = connector.join(
                            [make_ascii(x).strip() for x in val.itertext()])
                    except Exception:
                        # Best effort: also covers the padding sentinel, which
                        # has no itertext().
                        content = default
                    content = content.replace("\n", " ").strip()
                elif val is missing:
                    content = default
                else:
                    content = val.get(attr)
                    if attr in ["href", "src"]:
                        content = urljoin(self.url, content)
                result[head] = content
            result_list.append(result)
        except XPathError:
            raise Exception("Invalid %s selector - %s" %
                            (self.__selector_type__, selector))
        except TypeError:
            # %-formatting keeps the message buildable for non-string selectors.
            raise Exception(
                "Selector expression string to be provided. Got %s" % (selector,))

        return result_list
Beispiel #7
0
	def extract_columns(self, *args, **kwargs):
		"""
		Column data extraction for extract_tabular.

		Each CSS expression in ``selector`` is paired, in order, with a header
		from ``table_headers``; the elements it selects from ``self.tree`` form
		that header's column. One row dict (a copy of ``result`` plus one value
		per header) is produced per index, up to the length of the shortest
		column. No rows are produced when there are no columns.

		Keyword arguments read (fallback used when absent in parentheses):

		:param result: Dict copied as the base of every row (a new dict)
		:param selector: One CSS selector expression or a list of them ('')
		:param table_headers: Keys for the columns, in selector order ([])
		:param attr: "text" to join the element's text, otherwise the name of the attribute to read ("text")
		:param connector: String used to join the text fragments of an element ('')
		:param default: Value used when text extraction fails ('')
		:param verbosity: Progress is printed when greater than 1 (0)
		:return: A list of row dicts
		:raises Exception: If ``selector`` is neither a string nor a list, or when a TypeError occurs during extraction
		"""
		# Read every option once instead of repeating the kwargs lookups.
		selector = kwargs.get('selector', '')
		table_headers = kwargs.get('table_headers', [])
		attr = kwargs.get('attr', 'text')
		connector = kwargs.get('connector', '')
		default = kwargs.get('default', '')
		verbosity = kwargs.get('verbosity', 0)
		result = kwargs.get('result', {})
		result_list = []
		try:
			string_types = (str, unicode)  # Python 2
		except NameError:
			string_types = (str,)  # Python 3 has no separate unicode type

		# Expression named in the error message; bound up front so the handler
		# below can always refer to it, even if the failure precedes the loop.
		current = selector
		try:
			if isinstance(selector, string_types):
				selectors = [selector]
			elif isinstance(selector, list):
				selectors = selector
			else:
				raise Exception("Use a list of selector expressions for the various columns")
			columns = {}
			for head, current in zip(table_headers, selectors):
				sel = cssselect.CSSSelector(current)
				columns[head] = sel(self.tree)
			heads = list(columns)
			# zip() stops at the shortest column and yields nothing when there are
			# no columns, so the loop always terminates.
			for cells in zip(*[columns[head] for head in heads]):
				row = result.copy()
				for head, col in zip(heads, cells):
					if verbosity > 1:
						print("\nExtracting", head, "attribute", sep=' ', end='')
					if attr == "text":
						try:
							content = connector.join([make_ascii(x).strip() for x in col.itertext()])
						except Exception:
							# Best effort: fall back to the default value.
							content = default
						content = content.replace("\n", " ").strip()
					else:
						content = col.get(attr)
						if attr in ["href", "src"]:
							content = urljoin(self.url, content)
					row[head] = content
				result_list.append(row)
		except TypeError:
			# %-formatting keeps the message buildable for non-string selectors.
			raise Exception("Selector expression string to be provided. Got %s" % (current,))

		return result_list
Beispiel #8
0
	def extract_columns(self, result=None, selector='', table_headers=(), attr='', connector='', default='', verbosity=0, *args, **kwargs):
		"""
		Column data extraction for extract_tabular.

		Each expression in ``selector`` is paired, in order, with a header from
		``table_headers``; the elements returned by ``self.get_tree_tag`` for it
		form that header's column. One row dict (a copy of ``result`` plus one
		value per header) is produced per index, up to the length of the
		shortest column. No rows are produced when there are no columns.

		:param result: Dict copied as the base of every row; a new dict is used when omitted
		:param selector: One selector expression or a list of them
		:param table_headers: Keys for the columns, in selector order
		:param attr: "text" to join the element's text, otherwise the name of the attribute to read
		:param connector: String used to join the text fragments of an element
		:param default: Value used when text extraction fails
		:param verbosity: Progress is printed when greater than 1
		:return: A list of row dicts
		:raises Exception: If ``selector`` is neither a string nor a list, on XPathError, or when a TypeError occurs during extraction
		"""
		result_list = []
		# Avoid a mutable default argument; the base row is only ever copied.
		if result is None:
			result = {}
		try:
			string_types = (str, unicode)  # Python 2
		except NameError:
			string_types = (str,)  # Python 3 has no separate unicode type

		try:
			if isinstance(selector, string_types):
				selectors = [selector]
			elif isinstance(selector, list):
				selectors = selector[:]
			else:
				raise Exception("Use a list of selector expressions for the various columns")
			columns = {}
			# ``selector`` is rebound to the expression being evaluated so that the
			# error messages below name the one that failed.
			for head, selector in zip(table_headers, selectors):
				columns[head] = self.get_tree_tag(selector)
			heads = list(columns)
			# zip() stops at the shortest column and yields nothing when there are
			# no columns, so the loop always terminates.
			for cells in zip(*[columns[head] for head in heads]):
				row = result.copy()
				for head, col in zip(heads, cells):
					if verbosity > 1:
						print("\nExtracting", head, "attribute", sep=' ', end='')
					if attr == "text":
						try:
							content = connector.join([make_ascii(x).strip() for x in col.itertext()])
						except Exception:
							# Best effort: fall back to the default value.
							content = default
						content = content.replace("\n", " ").strip()
					else:
						content = col.get(attr)
						if attr in ["href", "src"]:
							content = urljoin(self.url, content)
					row[head] = content
				result_list.append(row)
		except XPathError:
			raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector))
		except TypeError:
			# %-formatting keeps the message buildable for non-string selectors.
			raise Exception("Selector expression string to be provided. Got %s" % (selector,))

		return result_list
Beispiel #9
0
	def extract_content(self, *args, **kwargs):
		"""
		Method for performing the content extraction for the given CSS selector.

		The selector expression is compiled with ``cssselect.CSSSelector`` and
		applied to ``self.tree``; the first matching tag is used.

		If the selector is "url", the URL of the current web page is returned.
		For an attr of "text" the tag's text fragments are stripped, joined with
		``connector`` and newlines are replaced by spaces. For any other attr
		the attribute is read from the tag, and "href" / "src" values are joined
		with the page URL to form an absolute path.

		If the selector does not fetch any content, the default value is
		returned. If the default is the empty string, an exception is raised.

		All values are read from keyword arguments and fall back to '':

		:param selector: The CSS selector expression
		:param attr: The attribute to be extracted from the selected tag
		:param default: The default value to be used if the selector does not return any data
		:param connector: String connector for the text fragments of the tag
		:return: The extracted content
		:raises Exception: If there is no content and no default
		"""
		selector, attr, default, connector = [kwargs.get(x, '') for x in ['selector', 'attr', 'default', 'connector']]
		if selector == "url":
			return self.url
		try:
			sel = cssselect.CSSSelector(selector)
			# Indexing the empty match list is what raises IndexError below.
			tag = sel(self.tree)[0]
			if attr == "text":
				content = connector.join([make_ascii(x).strip() for x in tag.itertext()])
				content = content.replace("\n", " ").strip()
			else:
				content = tag.get(attr)
				if attr in ["href", "src"]:
					content = urljoin(self.url, content)
			return content
		except IndexError:
			# Compare by value: identity against a string literal depends on
			# interning and is not a reliable "was a default given" test.
			if default != "":
				return default
			raise Exception("There is no content for the selector " + selector)
Beispiel #10
0
    def extract_rows(self, *args, **kwargs):
        """
        Row data extraction for extract_tabular.

        Elements returned by ``self.tree.xpath(selector)`` are paired, in
        order, with ``table_headers`` and the extracted values are stored in
        ``result`` under the matching header. Elements beyond the number of
        headers are ignored; headers without an element receive ``default``.

        Keyword arguments read (fallback used when absent in parentheses):

        :param result: Dict the row is written into (a new dict)
        :param selector: The XPath expression ('')
        :param table_headers: Keys for the selected elements, in order ([])
        :param attr: "text" to join the element's text, otherwise the name of the attribute to read ("text")
        :param connector: String used to join the text fragments of an element ('')
        :param default: Value used when text extraction fails or a header has no element ('')
        :param verbosity: Progress is printed when greater than 1 (0)
        :return: A list holding the single populated row dict
        :raises Exception: On XPathError or when a TypeError occurs during extraction
        """
        # Read every option once instead of repeating the kwargs lookups.
        selector = kwargs.get('selector', '')
        table_headers = kwargs.get('table_headers', [])
        attr = kwargs.get('attr', 'text')
        connector = kwargs.get('connector', '')
        default = kwargs.get('default', '')
        verbosity = kwargs.get('verbosity', 0)
        result = kwargs.get('result', {})
        result_list = []
        # Stand-in for the element of a header that has no match. Padding with
        # ``default`` itself would make attribute extraction call .get() on it.
        missing = object()

        try:
            # One entry per header: surplus elements dropped, gaps padded.
            values = list(self.tree.xpath(selector))[:len(table_headers)]
            values.extend([missing] * (len(table_headers) - len(values)))
            for head, val in zip(table_headers, values):
                if verbosity > 1:
                    print("\nExtracting", head, "attribute", sep=' ', end='')
                if attr == "text":
                    try:
                        content = connector.join(
                            [make_ascii(x).strip() for x in val.itertext()])
                    except Exception:
                        # Best effort: also covers the padding sentinel, which
                        # has no itertext().
                        content = default
                    content = content.replace("\n", " ").strip()
                elif val is missing:
                    content = default
                else:
                    content = val.get(attr)
                    if attr in ["href", "src"]:
                        content = urljoin(self.url, content)
                result[head] = content
            result_list.append(result)
        except XPathError:
            raise Exception("Invalid XPath selector %s" % (selector,))
        except TypeError:
            # %-formatting keeps the message buildable for non-string selectors.
            raise Exception(
                "Selector expression string to be provided. Got %s" % (selector,))

        return result_list
Beispiel #11
0
	def extract_content(self, selector='', attr='', default='', connector='', *args, **kwargs):
		"""
		Method for performing the content extraction for the particular selector type.

		If the selector is "url" (case-insensitive), the URL of the current web
		page is returned. Otherwise the tag is fetched with
		``self.get_tree_tag(selector=selector, get_one=True)``. For an attr of
		"text" (case-insensitive) the tag's text fragments are stripped, joined
		with ``connector`` and newlines are replaced by spaces. For any other
		attr the attribute is read from the tag, and "href" / "src" values are
		joined with the page URL to form an absolute path.

		If an IndexError is raised during extraction (presumably by
		``get_tree_tag`` when nothing matches - confirm), the default value is
		returned. If the default is the empty string, an exception is raised.

		:param selector: The selector expression
		:param attr: The attribute to be extracted from the selected tag
		:param default: The default value to be used if the selector does not return any data
		:param connector: String connector for list of data returned for a particular selector
		:return: The extracted content
		:raises Exception: If there is no content and no default, or the selector is invalid (XPathError)
		"""
		try:
			if selector.lower() == "url":
				return self.url
			wants_text = attr.lower() == "text"
			tag = self.get_tree_tag(selector=selector, get_one=True)
			if wants_text:
				content = connector.join([make_ascii(x).strip() for x in tag.itertext()])
				content = content.replace("\n", " ").strip()
			else:
				content = tag.get(attr)
				if attr in ["href", "src"]:
					content = urljoin(self.url, content)
			return content
		except IndexError:
			# Compare by value: identity against a string literal depends on
			# interning and is not a reliable "was a default given" test.
			if default != "":
				return default
			raise Exception("There is no content for the %s selector - %s" % (self.__selector_type__, selector))
		except XPathError:
			raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector))
Beispiel #12
0
    def extract_columns(self,
                        result=None,
                        selector='',
                        table_headers=(),
                        attr='',
                        connector='',
                        default='',
                        verbosity=0,
                        *args,
                        **kwargs):
        """
        Column data extraction for extract_tabular.

        Each expression in ``selector`` is paired, in order, with a header
        from ``table_headers``; the elements returned by ``self.get_tree_tag``
        for it form that header's column. One row dict (a copy of ``result``
        plus one value per header) is produced per index, up to the length of
        the shortest column. No rows are produced when there are no columns.

        :param result: Dict copied as the base of every row; a new dict is used when omitted
        :param selector: One selector expression or a list of them
        :param table_headers: Keys for the columns, in selector order
        :param attr: "text" to join the element's text, otherwise the name of the attribute to read
        :param connector: String used to join the text fragments of an element
        :param default: Value used when text extraction fails
        :param verbosity: Progress is printed when greater than 1
        :return: A list of row dicts
        :raises Exception: If ``selector`` is neither a string nor a list, on XPathError, or when a TypeError occurs during extraction
        """
        result_list = []
        # Avoid a mutable default argument; the base row is only ever copied.
        if result is None:
            result = {}
        try:
            string_types = (str, unicode)  # Python 2
        except NameError:
            string_types = (str,)  # Python 3 has no separate unicode type

        try:
            if isinstance(selector, string_types):
                selectors = [selector]
            elif isinstance(selector, list):
                selectors = selector[:]
            else:
                raise Exception(
                    "Use a list of selector expressions for the various columns"
                )
            columns = {}
            # ``selector`` is rebound to the expression being evaluated so
            # that the error messages below name the one that failed.
            for head, selector in zip(table_headers, selectors):
                columns[head] = self.get_tree_tag(selector)
            heads = list(columns)
            # zip() stops at the shortest column and yields nothing when there
            # are no columns, so the loop always terminates.
            for cells in zip(*[columns[head] for head in heads]):
                row = result.copy()
                for head, col in zip(heads, cells):
                    if verbosity > 1:
                        print("\nExtracting", head, "attribute", sep=' ', end='')
                    if attr == "text":
                        try:
                            content = connector.join(
                                [make_ascii(x).strip() for x in col.itertext()])
                        except Exception:
                            # Best effort: fall back to the default value.
                            content = default
                        content = content.replace("\n", " ").strip()
                    else:
                        content = col.get(attr)
                        if attr in ["href", "src"]:
                            content = urljoin(self.url, content)
                    row[head] = content
                result_list.append(row)
        except XPathError:
            raise Exception("Invalid %s selector - %s" %
                            (self.__selector_type__, selector))
        except TypeError:
            # %-formatting keeps the message buildable for non-string selectors.
            raise Exception(
                "Selector expression string to be provided. Got %s" % (selector,))

        return result_list