コード例 #1
0
 def __init__(self, web_request, web_object_factory, data_handler):
     """Store the collaborators and start with empty result buffers.

     Args:
         web_request: Stored on ``self.web_request``.
         web_object_factory: Stored on ``self.web_object_factory``.
         data_handler: Stored on ``self.data_handler``.
     """
     # ``super(OptionFilter).__init__()`` only re-initialised a throwaway
     # unbound ``super`` object and never ran the base-class initialiser;
     # the zero-argument form dispatches to the real parent ``__init__``.
     super().__init__()
     self.web_request = web_request
     self.web_object_factory = web_object_factory
     self.data_handler = data_handler
     # Every buffer starts out empty.
     self.filtered_data = []
     self.filtered_data_keywords = []
     self.filtered_recursive_data = []
     self.filtered_recursive_data_keywords = []
     self.web_data_objects = []
     self.view = ConsoleView()
コード例 #2
0
 def __init__(self):
     """Start with no target URL and no fetched request data."""
     # ``super(OptionFilter).__init__()`` only re-initialised a throwaway
     # unbound ``super`` object and never ran the base-class initialiser;
     # the zero-argument form dispatches to the real parent ``__init__``.
     super().__init__()
     self.url = ''
     self.url_padding = ''
     self.recursive_urls = []
     # Keep a reference to the module-level ``requests`` name on the instance.
     self.requests = requests
     self.request_data = None
     self.requests_status_code = None
     self.recursive_request_data = []
     self.recursive_request_data_count = 0
     self.view = ConsoleView()
コード例 #3
0
ファイル: webdata.py プロジェクト: jaredsethnz/web_scraper
 def __init__(self, web_request, web_object_factory, data_handler):
     """Store the collaborators and start with empty result buffers.

     Args:
         web_request: Stored on ``self.web_request``.
         web_object_factory: Stored on ``self.web_object_factory``.
         data_handler: Stored on ``self.data_handler``.
     """
     # ``super(OptionFilter).__init__()`` only re-initialised a throwaway
     # unbound ``super`` object and never ran the base-class initialiser;
     # the zero-argument form dispatches to the real parent ``__init__``.
     super().__init__()
     self.web_request = web_request
     self.web_object_factory = web_object_factory
     self.data_handler = data_handler
     # Every buffer starts out empty.
     self.filtered_data = []
     self.filtered_data_keywords = []
     self.filtered_recursive_data = []
     self.filtered_recursive_data_keywords = []
     self.web_data_objects = []
     self.view = ConsoleView()
コード例 #4
0
ファイル: webrequest.py プロジェクト: jaredsethnz/web_scraper
 def __init__(self):
     """Start with no target URL and no fetched request data."""
     # ``super(OptionFilter).__init__()`` only re-initialised a throwaway
     # unbound ``super`` object and never ran the base-class initialiser;
     # the zero-argument form dispatches to the real parent ``__init__``.
     super().__init__()
     self.url = ''
     self.url_padding = ''
     self.recursive_urls = []
     # Keep a reference to the module-level ``requests`` name on the instance.
     self.requests = requests
     self.request_data = None
     self.requests_status_code = None
     self.recursive_request_data = []
     self.recursive_request_data_count = 0
     self.view = ConsoleView()
コード例 #5
0
ファイル: webobject.py プロジェクト: jaredsethnz/web_scraper
 def __init__(self):
     """Create the helper objects and populate ``self.save_list``."""
     self.web_object_factory = WebObjectFactory()
     self.view = ConsoleView()
     # Assigned last: the value comes from this object's own init_save_list().
     self.save_list = self.init_save_list()
コード例 #6
0
ファイル: webobject.py プロジェクト: jaredsethnz/web_scraper
class DataHandler(object):
    """Pickle web objects to disk and keep track of where they were saved.

    The list of save locations is itself pickled to ``savelist.pickle``,
    opened relative to the current working directory.
    """

    def __init__(self):
        """Create the helper objects and load the persisted save list."""
        self.web_object_factory = WebObjectFactory()
        # The view must exist first: init_save_list() reports through it.
        self.view = ConsoleView()
        self.save_list = self.init_save_list()

    def init_save_list(self):
        """Return the stored save list, creating an empty one if needed.

        Returns:
            The unpickled content of ``savelist.pickle``, or an empty list
            when the file is missing or empty (a fresh file holding ``[]``
            is written in that case).
        """
        s_list = []
        try:
            # NOTE(review): unpickling runs code embedded in the file; this
            # is only safe while the file is produced by this program.
            with open('savelist.pickle', 'rb') as input_file:
                s_list = pickle.load(input_file)
        except (EOFError, FileNotFoundError):
            self.view.display_item('No save list found, creating new save list.....')
            with open('savelist.pickle', 'wb') as output_file:
                pickle.dump(s_list, output_file)
        return s_list

    def save_save_list(self):
        """Write the in-memory save list to ``savelist.pickle``."""
        try:
            with open('savelist.pickle', 'wb') as output_file:
                pickle.dump(self.save_list, output_file)
        except (EOFError, FileNotFoundError):
            self.view.display_item('Error saving save list.....')

    def display_save_list(self):
        """Show every recorded save location through the view."""
        self.view.display_item('displaying file save locations.....')
        self.view.display_items(self.save_list)

    def add_save_list(self, save_location):
        """Record ``save_location`` (made absolute) unless already present."""
        save_location = abspath(save_location)
        if save_location not in self.save_list:
            self.save_list.append(save_location)
            self.save_save_list()

    def remove_save_list(self, save_location):
        """Forget ``save_location`` (made absolute) if it is recorded.

        A location that was never recorded is ignored instead of raising
        ``ValueError``, so removing an untracked file no longer crashes.
        """
        save_location = abspath(save_location)
        if save_location in self.save_list:
            self.save_list.remove(save_location)
            self.save_save_list()

    def save_objects(self, web_objs, path):
        """Pickle the attribute dicts of ``web_objs`` to ``path``.

        Attributes whose names both start and end with a double underscore
        are left out. On success the path is added to the save list.

        Args:
            web_objs: Iterable of objects exposing ``__dict__``.
            path: Destination file path.
        """
        try:
            # The limit is raised before pickling and stays raised afterwards.
            sys.setrecursionlimit(1000000)
            objs = [
                {key: value for key, value in vars(obj).items()
                 if not (key.startswith('__') and key.endswith('__'))}
                for obj in web_objs
            ]
            # The context manager closes (and flushes) the file; the handle
            # used to be left open.
            with open(path, 'wb') as output_file:
                pickle.dump(objs, output_file)
            self.add_save_list(path)
        except FileNotFoundError:
            self.view.display_item('Error saving file.....')

    def remove_objects(self, path):
        """Delete the file at ``path`` and drop it from the save list."""
        try:
            self.view.display_item('removing file ' + path + '.....')
            remove(path)
            self.remove_save_list(path)
        except FileNotFoundError:
            self.view.display_item('File ' + path + ' not found.....')

    def load_objects(self, path):
        """Rebuild objects from the attribute dicts pickled at ``path``.

        Returns:
            list: One ``'product'`` object per stored dict, built by the
            web object factory; empty when the file does not exist.
        """
        loaded_objs = []
        try:
            # NOTE(review): unpickling runs code embedded in the file; only
            # load files that were written by save_objects().
            with open(path, 'rb') as input_file:
                web_objs = pickle.load(input_file)
            for obj in web_objs:
                loaded_objs.append(self.web_object_factory.build_object('product', obj))
        except FileNotFoundError:
            self.view.display_item('File ' + path + ' not found.....')
        return loaded_objs
コード例 #7
0
 def __init__(self, web_data):
     """Remember the data source and start with an empty tally.

     Args:
         web_data: Stored on ``self.web_data``.
     """
     # ``super(OptionFilter).__init__()`` only re-initialised a throwaway
     # unbound ``super`` object and never ran the base-class initialiser;
     # the zero-argument form dispatches to the real parent ``__init__``.
     super().__init__()
     self.web_data = web_data
     self.data = {}
     self.graph_type = None
     self.view = ConsoleView()
コード例 #8
0
class GraphCreator(OptionFilter):
    """Tally attribute values of web data objects and plot them as a pie chart.

    ``command`` and ``PARAMETER_ONE`` are not defined here and are expected
    to come from ``OptionFilter``; ``graph_creator_options``,
    ``graph_colors`` and ``plt`` are names defined outside this class.
    """

    def __init__(self, web_data):
        """Remember the data source and start with an empty tally.

        Args:
            web_data: Object whose ``get_data()`` yields the objects to
                analyse.
        """
        # ``super(OptionFilter).__init__()`` only re-initialised a throwaway
        # unbound ``super`` object and never ran the base-class initialiser;
        # the zero-argument form dispatches to the real parent ``__init__``.
        super().__init__()
        self.web_data = web_data
        self.data = {}
        self.graph_type = None
        self.view = ConsoleView()

    def handle_command(self, args):
        """Dispatch ``args`` through ``command`` with the graph options."""
        return self.command(args, graph_creator_options)

    def display_graph(self, *args):
        """Show the current tally as a pie chart; do nothing when it is empty.

        Args:
            *args: ``args[self.PARAMETER_ONE]`` is used as the chart title.
        """
        if not self.data:
            return
        labels = list(self.data)
        sizes = list(self.data.values())
        # Start again at the first colour once the palette is exhausted.
        colors = [graph_colors[index % len(graph_colors)]
                  for index in range(len(sizes))]
        # Pull out the first slice that holds the largest value.
        explode = [0.0] * len(sizes)
        explode[sizes.index(max(sizes))] = 0.1

        plt.title(args[self.PARAMETER_ONE])
        plt.pie(sizes,
                explode=explode,
                labels=labels,
                colors=colors,
                autopct='%1.1f%%',
                shadow=True,
                startangle=90)
        plt.axis('equal')
        self.view.display_item('displaying graph.....')
        plt.show()

    def graph_data(self, *args):
        """Rebuild the tally from one attribute of every web data object.

        Args:
            *args: ``args[self.PARAMETER_ONE]`` names the attribute to read.
        """
        self.data.clear()
        web_data = self.web_data.get_data()
        attr_name = args[self.PARAMETER_ONE]
        self.view.display_item('gathering data.....')
        for wd in web_data:
            try:
                wd_attr = getattr(wd, attr_name)
                # Exact type checks are kept: isinstance() would also send
                # subclasses (e.g. datetime, a subclass of date) to
                # date_data() instead of tallying them.
                if type(wd_attr) is decimal.Decimal:
                    self.currency_data(wd_attr)
                elif type(wd_attr) is date:
                    self.date_data(wd_attr)
                else:
                    self.str_data(wd_attr)
            except (AttributeError, UnboundLocalError):
                self.view.display_item(
                    'Error, WebObject contains no attribute ' +
                    attr_name + '.....')

    def currency_data(self, value):
        """Handle a ``Decimal`` value; currently only reports 'currency'."""
        self.view.display_item('currency')

    def date_data(self, value):
        """Handle a ``date`` value; currently a no-op."""
        pass

    def str_data(self, value):
        """Increase the occurrence count of ``value`` by one."""
        self.data[value] = self.data.get(value, 0) + 1

    def display_graph_data(self, *args):
        """Print every tallied value with its count, one per line."""
        for key, value in self.data.items():
            # Keys may be non-string values (anything routed to str_data),
            # so convert before concatenating.
            print(str(key) + ': ' + str(value))
コード例 #9
0
class GraphCreator(OptionFilter):
    """Tally attribute values of web data objects and plot them as a pie chart.

    ``command`` and ``PARAMETER_ONE`` are not defined here and are expected
    to come from ``OptionFilter``; ``graph_creator_options``,
    ``graph_colors`` and ``plt`` are names defined outside this class.
    """

    def __init__(self, web_data):
        """Remember the data source and start with an empty tally.

        Args:
            web_data: Object whose ``get_data()`` yields the objects to
                analyse.
        """
        # ``super(OptionFilter).__init__()`` only re-initialised a throwaway
        # unbound ``super`` object and never ran the base-class initialiser;
        # the zero-argument form dispatches to the real parent ``__init__``.
        super().__init__()
        self.web_data = web_data
        self.data = {}
        self.graph_type = None
        self.view = ConsoleView()

    def handle_command(self, args):
        """Dispatch ``args`` through ``command`` with the graph options."""
        return self.command(args, graph_creator_options)

    def display_graph(self, *args):
        """Show the current tally as a pie chart; do nothing when it is empty.

        Args:
            *args: ``args[self.PARAMETER_ONE]`` is used as the chart title.
        """
        if not self.data:
            return
        labels = list(self.data)
        sizes = list(self.data.values())
        # Start again at the first colour once the palette is exhausted.
        colors = [graph_colors[index % len(graph_colors)]
                  for index in range(len(sizes))]
        # Pull out the first slice that holds the largest value.
        explode = [0.0] * len(sizes)
        explode[sizes.index(max(sizes))] = 0.1

        plt.title(args[self.PARAMETER_ONE])
        plt.pie(sizes, explode=explode, labels=labels, colors=colors,
                autopct='%1.1f%%', shadow=True, startangle=90)
        plt.axis('equal')
        self.view.display_item('displaying graph.....')
        plt.show()

    def graph_data(self, *args):
        """Rebuild the tally from one attribute of every web data object.

        Args:
            *args: ``args[self.PARAMETER_ONE]`` names the attribute to read.
        """
        self.data.clear()
        web_data = self.web_data.get_data()
        attr_name = args[self.PARAMETER_ONE]
        self.view.display_item('gathering data.....')
        for wd in web_data:
            try:
                wd_attr = getattr(wd, attr_name)
                # Exact type checks are kept: isinstance() would also send
                # subclasses (e.g. datetime, a subclass of date) to
                # date_data() instead of tallying them.
                if type(wd_attr) is decimal.Decimal:
                    self.currency_data(wd_attr)
                elif type(wd_attr) is date:
                    self.date_data(wd_attr)
                else:
                    self.str_data(wd_attr)
            except (AttributeError, UnboundLocalError):
                self.view.display_item('Error, WebObject contains no attribute ' + attr_name + '.....')

    def currency_data(self, value):
        """Handle a ``Decimal`` value; currently only reports 'currency'."""
        self.view.display_item('currency')

    def date_data(self, value):
        """Handle a ``date`` value; currently a no-op."""
        pass

    def str_data(self, value):
        """Increase the occurrence count of ``value`` by one."""
        self.data[value] = self.data.get(value, 0) + 1

    def display_graph_data(self, *args):
        """Print every tallied value with its count, one per line."""
        for key, value in self.data.items():
            # Keys may be non-string values (anything routed to str_data),
            # so convert before concatenating.
            print(str(key) + ': ' + str(value))
コード例 #10
0
 def __init__(self, web_data):
     """Remember the data source and start with an empty tally.

     Args:
         web_data: Stored on ``self.web_data``.
     """
     # ``super(OptionFilter).__init__()`` only re-initialised a throwaway
     # unbound ``super`` object and never ran the base-class initialiser;
     # the zero-argument form dispatches to the real parent ``__init__``.
     super().__init__()
     self.web_data = web_data
     self.data = {}
     self.graph_type = None
     self.view = ConsoleView()
コード例 #11
0
class WebData(OptionFilter):
    """Filter fetched HTML into attribute dicts and build web data objects.

    ``command``, ``method_options``, the ``check_*`` helpers and the
    ``COMMAND_*`` / ``PARAMETER_*`` constants are not defined here and are
    expected to come from ``OptionFilter``; NOTE(review): their contracts
    are not visible, so they are used exactly as before.
    """

    # Positions inside one option triple: tag name, attribute name,
    # attribute value.
    TAG_TYPE = 0
    CLASS_ID = 1
    CLASS_ID_NAME = 2
    CONSOLIDATE_DATA_PARAM_COUNT = 2
    CONSOLIDATE_ERROR_MSG = 'Error consolidating data, please try again...'

    def __init__(self, web_request, web_object_factory, data_handler):
        """Store the collaborators and start with empty result buffers.

        Args:
            web_request: Source of the fetched request data and receiver of
                recursive URLs.
            web_object_factory: Provides ``build_object(name, attrs)``.
            data_handler: Saves, loads and removes persisted objects.
        """
        # ``super(OptionFilter).__init__()`` only re-initialised a throwaway
        # unbound ``super`` object and never ran the base-class initialiser;
        # the zero-argument form dispatches to the real parent ``__init__``.
        super().__init__()
        self.web_request = web_request
        self.web_object_factory = web_object_factory
        self.data_handler = data_handler
        self.filtered_data = []
        self.filtered_data_keywords = []
        self.filtered_recursive_data = []
        self.filtered_recursive_data_keywords = []
        self.web_data_objects = []
        self.view = ConsoleView()

    def handle_command(self, args):
        """Dispatch ``args`` through ``command`` with the web data options."""
        return self.command(args, web_data_options)

    def get_data(self):
        """Return the list of web data objects built or loaded so far."""
        return self.web_data_objects

    def clear_filtered_data(self, *args):
        """Empty every buffer in place, including the built objects."""
        self.view.display_item('clearing filtered data.....')
        del self.filtered_data[:]
        del self.filtered_data_keywords[:]
        del self.filtered_recursive_data[:]
        del self.filtered_recursive_data_keywords[:]
        del self.web_data_objects[:]

    def print_data(self, *args):
        """Display the value selected by ``args[self.COMMAND_OPTION]``.

        Lists are shown item by item; any other non-None value is shown as
        ``<option>: <value>``. Nothing is shown for ``None``.
        """
        attr = self.method_options(args[self.COMMAND_OPTION],
                                   web_data_print_options)
        if attr is None:
            return
        if isinstance(attr, list):
            self.view.display_items(attr)
        else:
            self.view.display_item(args[self.COMMAND_OPTION] + ': ' +
                                   str(attr))

    def print_web_data_object(self, *args):
        """Display every web data object, separated by dashed rules."""
        self.view.display_item('displaying web data objects.....')
        self.view.display_item('---------------------------------'
                               '-------------------------------')
        for wo in self.web_data_objects:
            wo.func_display_data(wo, self.view)
            self.view.display_item('-----------------------------'
                                   '-----------------------------------')

    def display_saves(self, *args):
        """Show the recorded save locations via the data handler."""
        self.data_handler.display_save_list()

    def load_saved_data(self, *args):
        """Replace the current objects with those loaded from a saved file.

        Args:
            *args: ``args[self.PARAMETER_ONE]`` is the file path.
        """
        self.view.display_item('loading saved data.....')
        self.web_data_objects = self.data_handler.load_objects(
            args[self.PARAMETER_ONE])

    def save_data(self, *args):
        """Persist the current objects to ``args[self.PARAMETER_ONE]``."""
        self.view.display_item('saving data to disk.....')
        self.data_handler.save_objects(self.web_data_objects,
                                       args[self.PARAMETER_ONE])

    def remove_data(self, *args):
        """Remove the saved file at ``args[self.PARAMETER_ONE]``."""
        self.data_handler.remove_objects(args[self.PARAMETER_ONE])

    def get_request_data(self, *args):
        """Append every tag of the fetched page matching the option triple.

        A ``TypeError`` while resolving or applying the options is reported
        with ``COMMAND_ERROR_MSG``.
        """
        try:
            data_options = self.check_second_level_args(args)[
                self.COMMAND_OPTION]
            markup = self.web_request.get_request_data()
            matches = BeautifulSoup(markup, 'html.parser').findAll(
                data_options[self.TAG_TYPE],
                attrs={data_options[self.CLASS_ID]:
                       data_options[self.CLASS_ID_NAME]})
            for match in matches:
                self.filtered_data.append(match)
                self.view.display_item('filtering data.....')
        except TypeError:
            self.view.display_item(self.COMMAND_ERROR_MSG)

    def get_recursive_request_data(self, *args):
        """Append the first matching tag of every recursively fetched page.

        The result of ``find`` is appended as-is, so a page without a match
        contributes ``None`` and positions stay aligned with the pages.
        """
        try:
            data_options = self.check_second_level_args(args)[
                self.COMMAND_OPTION]
            for markup in self.web_request.get_recursive_request_data():
                self.view.display_item('filtering recursive data.....')
                rec_data = BeautifulSoup(markup, 'html.parser').find(
                    data_options[self.TAG_TYPE],
                    attrs={data_options[self.CLASS_ID]:
                           data_options[self.CLASS_ID_NAME]})
                self.filtered_recursive_data.append(rec_data)
        except TypeError:
            self.view.display_item(self.COMMAND_ERROR_MSG)

    def filter_urls(self, *args):
        """Register the ``href`` of one link per filtered element.

        When the second option converts to an integer it is used as an index
        into all tags of the given type; otherwise the option triple selects
        the link by attribute.
        """
        try:
            data_options = self.check_second_level_args(args)[
                self.COMMAND_OPTION]
            self.view.display_item('filtering urls.....')
            for element in self.filtered_data:
                tag_depth = self.check_data_int(data_options[self.CLASS_ID])
                if tag_depth is not None:
                    links = element.find_all(data_options[self.TAG_TYPE])
                    self.web_request.add_recursive_url(
                        links[tag_depth]['href'])
                else:
                    link = element.find(
                        data_options[self.TAG_TYPE],
                        attrs={data_options[self.CLASS_ID]:
                               data_options[self.CLASS_ID_NAME]})
                    self.web_request.add_recursive_url(link['href'])
        except (TypeError, KeyError, IndexError):
            self.view.display_item(self.COMMAND_ERROR_MSG)

    def set_data_keywords(self, *args):
        """Add the given keyword triples for the primary data."""
        kw_pairs = self.check_second_level_args(args)
        if kw_pairs is not None:
            for kw_pair in kw_pairs:
                keywords = [kw_pair[0], kw_pair[1], kw_pair[2]]
                self.view.display_item('adding tag, class pair: ' +
                                       str(keywords))
                self.filtered_data_keywords.append(keywords)

    def set_recursive_data_keywords(self, *args):
        """Add the given keyword triples for the recursive data."""
        kw_pairs = self.check_second_level_args(args)
        if kw_pairs is not None:
            for kw_pair in kw_pairs:
                r_keywords = [kw_pair[0], kw_pair[1], kw_pair[2]]
                self.view.display_item('adding tag, class pair: ' +
                                       str(r_keywords))
                self.filtered_recursive_data_keywords.append(r_keywords)

    def consolidate_data(self, *args):
        """Run the two selected filters and build objects from the results.

        The first parameter group selects the filter for the primary data,
        the second the filter for the recursive data. A ``TypeError`` raised
        along the way is reported with ``CONSOLIDATE_ERROR_MSG``.
        """
        params = self.check_second_level_args(args)
        if params is not None and self.check_second_level_param_count(
                params, self.CONSOLIDATE_DATA_PARAM_COUNT):
            func_one = self.method_options(
                params[self.PARAMETER_ONE][self.PARAMETER_ONE],
                web_data_consolidate_options)
            func_two = self.method_options(
                params[self.PARAMETER_TWO][self.PARAMETER_ONE],
                web_data_consolidate_options)
            try:
                attr_one = func_one(
                    self.filtered_data, self.filtered_data_keywords,
                    params[self.PARAMETER_ONE][self.PARAMETER_TWO],
                    params[self.PARAMETER_ONE][self.PARAMETER_THREE])
                attr_two = func_two(
                    self.filtered_recursive_data,
                    self.filtered_recursive_data_keywords,
                    params[self.PARAMETER_TWO][self.PARAMETER_TWO],
                    params[self.PARAMETER_TWO][self.PARAMETER_THREE])
                self.create_web_data_object(attr_one, attr_two)
            except TypeError:
                self.view.display_item(self.CONSOLIDATE_ERROR_MSG)

    def filter_by_children(self, *args):
        """Build attribute dicts from the ``div`` children of each element.

        For every ``div`` with both a ``span`` and a nested ``div``, the span
        text becomes the name and the nested div text the value; the first
        occurrence of a name wins.

        Args:
            *args: ``args[0]`` is the iterable of parsed elements; further
                arguments are ignored.

        Returns:
            list: The collected dicts after ``sanitise_attributes``.
        """
        obj_attrs = []
        for d in args[0]:
            attrs = {}
            try:
                for dc in d.find_all('div'):
                    name = dc.find('span')
                    value = dc.find('div')
                    if value and name is not None:
                        # The dict itself tracks names already seen.
                        if name.text not in attrs:
                            attrs[name.text] = value.text
                obj_attrs.append(attrs)
            except AttributeError:
                self.view.display_item('Error filtering data '
                                       'from children.....')
        return self.sanitise_attributes(obj_attrs)

    def filter_by_keywords(self, *args):
        """Extract one attribute dict per element using keyword triples.

        Args:
            *args: ``args[self.PARAMETER_ONE]`` is the iterable of parsed
                elements and ``args[self.PARAMETER_TWO]`` the keyword
                triples; further arguments are ignored.

        Returns:
            list: One dict per element that could be processed. Elements
            that fail are reported through the view and skipped.
        """
        data = args[self.PARAMETER_ONE]
        data_kw = args[self.PARAMETER_TWO]
        obj_attr = []
        for d in data:
            try:
                attrs = {}
                for kw_pair in data_kw:
                    tag_depth = self.check_data_int(
                        kw_pair[self.PARAMETER_TWO])
                    if tag_depth is not None:
                        value = d.find_all(
                            kw_pair[self.PARAMETER_ONE])[tag_depth].string
                    else:
                        value = d.find(
                            kw_pair[self.PARAMETER_ONE],
                            {kw_pair[self.PARAMETER_TWO]:
                             kw_pair[self.PARAMETER_THREE]}).string
                    attrs[kw_pair[self.PARAMETER_THREE]] = (
                        self.check_data_type(value) if value else 'unknown')
                obj_attr.append(attrs)
            # AttributeError covers an element that is None or a find() that
            # matched nothing; it used to escape and abort the whole command.
            except (AttributeError, TypeError, KeyError, IndexError):
                self.view.display_item(self.CONSOLIDATE_ERROR_MSG)
        return obj_attr

    def sanitise_attributes(self, obj_attrs):
        """Normalise the keys and values of every attribute dict.

        Keys lose newlines and spaces and are lower-cased; values lose
        newlines, have runs of spaces collapsed, are stripped and then passed
        through ``check_data_type``.

        Args:
            obj_attrs: Iterable of dicts mapping text to text.

        Returns:
            list: One normalised dict per input dict.
        """
        sanitised_obj_attrs = []
        for raw_attrs in obj_attrs:
            attrs = {}
            for key, value in raw_attrs.items():
                # Strip each text out of the other: nested tags make the
                # name text and the value text contain one another.
                value = value.replace(key, '')
                key = key.replace(value, '')
                sanitized_key = key.replace('\n', '').replace(' ', '').lower()
                sanitized_value = re.sub('[ ]+', ' ',
                                         value.replace('\n', '')).strip()
                attrs[sanitized_key] = self.check_data_type(sanitized_value)
            sanitised_obj_attrs.append(attrs)
        return sanitised_obj_attrs

    def create_web_data_object(self, attr_one, attr_two, obj_name='product'):
        """Build objects from attribute dicts and append them to the results.

        Args:
            attr_one: Sequence of attribute dicts for the primary data.
            attr_two: Sequence of attribute dicts for the recursive data;
                when non-empty each dict is merged into its positional
                partner from ``attr_one`` (which is updated in place).
            obj_name: Object kind handed to the factory.
        """
        if len(attr_two) > 0:
            for primary, secondary in zip(attr_one, attr_two):
                self.view.display_item('creating object.....')
                primary.update(secondary)
                self.web_data_objects.append(
                    self.web_object_factory.build_object(obj_name, primary))
        else:
            for primary in attr_one:
                self.view.display_item('creating object.....')
                self.web_data_objects.append(
                    self.web_object_factory.build_object(obj_name, primary))
コード例 #12
0
ファイル: webdata.py プロジェクト: jaredsethnz/web_scraper
class WebData(OptionFilter):
    """Filter fetched HTML into attribute dicts and build web data objects.

    ``command``, ``method_options``, the ``check_*`` helpers and the
    ``COMMAND_*`` / ``PARAMETER_*`` constants are not defined here and are
    expected to come from ``OptionFilter``; NOTE(review): their contracts
    are not visible, so they are used exactly as before.
    """

    # Positions inside one option triple: tag name, attribute name,
    # attribute value.
    TAG_TYPE = 0
    CLASS_ID = 1
    CLASS_ID_NAME = 2
    CONSOLIDATE_DATA_PARAM_COUNT = 2
    CONSOLIDATE_ERROR_MSG = 'Error consolidating data, please try again...'

    def __init__(self, web_request, web_object_factory, data_handler):
        """Store the collaborators and start with empty result buffers.

        Args:
            web_request: Source of the fetched request data and receiver of
                recursive URLs.
            web_object_factory: Provides ``build_object(name, attrs)``.
            data_handler: Saves, loads and removes persisted objects.
        """
        # ``super(OptionFilter).__init__()`` only re-initialised a throwaway
        # unbound ``super`` object and never ran the base-class initialiser;
        # the zero-argument form dispatches to the real parent ``__init__``.
        super().__init__()
        self.web_request = web_request
        self.web_object_factory = web_object_factory
        self.data_handler = data_handler
        self.filtered_data = []
        self.filtered_data_keywords = []
        self.filtered_recursive_data = []
        self.filtered_recursive_data_keywords = []
        self.web_data_objects = []
        self.view = ConsoleView()

    def handle_command(self, args):
        """Dispatch ``args`` through ``command`` with the web data options."""
        return self.command(args, web_data_options)

    def get_data(self):
        """Return the list of web data objects built or loaded so far."""
        return self.web_data_objects

    def clear_filtered_data(self, *args):
        """Empty every buffer in place, including the built objects."""
        self.view.display_item('clearing filtered data.....')
        del self.filtered_data[:]
        del self.filtered_data_keywords[:]
        del self.filtered_recursive_data[:]
        del self.filtered_recursive_data_keywords[:]
        del self.web_data_objects[:]

    def print_data(self, *args):
        """Display the value selected by ``args[self.COMMAND_OPTION]``.

        Lists are shown item by item; any other non-None value is shown as
        ``<option>: <value>``. Nothing is shown for ``None``.
        """
        attr = self.method_options(args[self.COMMAND_OPTION], web_data_print_options)
        if attr is None:
            return
        if isinstance(attr, list):
            self.view.display_items(attr)
        else:
            self.view.display_item(args[self.COMMAND_OPTION] + ': ' + str(attr))

    def print_web_data_object(self, *args):
        """Display every web data object, separated by dashed rules."""
        self.view.display_item('displaying web data objects.....')
        self.view.display_item('----------------------------------------------------------------')
        for wo in self.web_data_objects:
            wo.func_display_data(wo, self.view)
            self.view.display_item('----------------------------------------------------------------')

    def display_saves(self, *args):
        """Show the recorded save locations via the data handler."""
        self.data_handler.display_save_list()

    def load_saved_data(self, *args):
        """Replace the current objects with those loaded from a saved file.

        Args:
            *args: ``args[self.PARAMETER_ONE]`` is the file path.
        """
        self.view.display_item('loading saved data.....')
        self.web_data_objects = self.data_handler.load_objects(args[self.PARAMETER_ONE])

    def save_data(self, *args):
        """Persist the current objects to ``args[self.PARAMETER_ONE]``."""
        self.view.display_item('saving data to disk.....')
        self.data_handler.save_objects(self.web_data_objects, args[self.PARAMETER_ONE])

    def remove_data(self, *args):
        """Remove the saved file at ``args[self.PARAMETER_ONE]``."""
        self.data_handler.remove_objects(args[self.PARAMETER_ONE])

    def get_request_data(self, *args):
        """Append every tag of the fetched page matching the option triple.

        A ``TypeError`` while resolving or applying the options is reported
        with ``COMMAND_ERROR_MSG``.
        """
        try:
            data_options = self.check_second_level_args(args)[self.COMMAND_OPTION]
            markup = self.web_request.get_request_data()
            matches = BeautifulSoup(markup, 'html.parser').findAll(
                data_options[self.TAG_TYPE],
                attrs={data_options[self.CLASS_ID]: data_options[self.CLASS_ID_NAME]})
            for match in matches:
                self.filtered_data.append(match)
                self.view.display_item('filtering data.....')
        except TypeError:
            self.view.display_item(self.COMMAND_ERROR_MSG)

    def get_recursive_request_data(self, *args):
        """Append the first matching tag of every recursively fetched page.

        The result of ``find`` is appended as-is, so a page without a match
        contributes ``None`` and positions stay aligned with the pages.
        """
        try:
            data_options = self.check_second_level_args(args)[self.COMMAND_OPTION]
            for markup in self.web_request.get_recursive_request_data():
                self.view.display_item('filtering recursive data.....')
                rec_data = BeautifulSoup(markup, 'html.parser').find(
                    data_options[self.TAG_TYPE],
                    attrs={data_options[self.CLASS_ID]: data_options[self.CLASS_ID_NAME]})
                self.filtered_recursive_data.append(rec_data)
        except TypeError:
            self.view.display_item(self.COMMAND_ERROR_MSG)

    def filter_urls(self, *args):
        """Register the ``href`` of one link per filtered element.

        When the second option converts to an integer it is used as an index
        into all tags of the given type; otherwise the option triple selects
        the link by attribute.
        """
        try:
            data_options = self.check_second_level_args(args)[self.COMMAND_OPTION]
            self.view.display_item('filtering urls.....')
            for element in self.filtered_data:
                tag_depth = self.check_data_int(data_options[self.CLASS_ID])
                if tag_depth is not None:
                    links = element.find_all(data_options[self.TAG_TYPE])
                    self.web_request.add_recursive_url(links[tag_depth]['href'])
                else:
                    link = element.find(
                        data_options[self.TAG_TYPE],
                        attrs={data_options[self.CLASS_ID]: data_options[self.CLASS_ID_NAME]})
                    self.web_request.add_recursive_url(link['href'])
        except (TypeError, KeyError, IndexError):
            self.view.display_item(self.COMMAND_ERROR_MSG)

    def set_data_keywords(self, *args):
        """Add the given keyword triples for the primary data."""
        kw_pairs = self.check_second_level_args(args)
        if kw_pairs is not None:
            for kw_pair in kw_pairs:
                keywords = [kw_pair[0], kw_pair[1], kw_pair[2]]
                self.view.display_item('adding tag, class pair: ' + str(keywords))
                self.filtered_data_keywords.append(keywords)

    def set_recursive_data_keywords(self, *args):
        """Add the given keyword triples for the recursive data."""
        kw_pairs = self.check_second_level_args(args)
        if kw_pairs is not None:
            for kw_pair in kw_pairs:
                r_keywords = [kw_pair[0], kw_pair[1], kw_pair[2]]
                self.view.display_item('adding tag, class pair: ' + str(r_keywords))
                self.filtered_recursive_data_keywords.append(r_keywords)

    def consolidate_data(self, *args):
        """Run the two selected filters and build objects from the results.

        The first parameter group selects the filter for the primary data,
        the second the filter for the recursive data. A ``TypeError`` raised
        along the way is reported with ``CONSOLIDATE_ERROR_MSG``.
        """
        params = self.check_second_level_args(args)
        if params is not None and self.check_second_level_param_count(params, self.CONSOLIDATE_DATA_PARAM_COUNT):
            func_one = self.method_options(params[self.PARAMETER_ONE][self.PARAMETER_ONE], web_data_consolidate_options)
            func_two = self.method_options(params[self.PARAMETER_TWO][self.PARAMETER_ONE], web_data_consolidate_options)
            try:
                attr_one = func_one(self.filtered_data, self.filtered_data_keywords,
                                    params[self.PARAMETER_ONE][self.PARAMETER_TWO],
                                    params[self.PARAMETER_ONE][self.PARAMETER_THREE])
                attr_two = func_two(self.filtered_recursive_data, self.filtered_recursive_data_keywords,
                                    params[self.PARAMETER_TWO][self.PARAMETER_TWO],
                                    params[self.PARAMETER_TWO][self.PARAMETER_THREE])
                self.create_web_data_object(attr_one, attr_two)
            except TypeError:
                self.view.display_item(self.CONSOLIDATE_ERROR_MSG)

    def filter_by_children(self, *args):
        """Build attribute dicts from the ``div`` children of each element.

        For every ``div`` with both a ``span`` and a nested ``div``, the span
        text becomes the name and the nested div text the value; the first
        occurrence of a name wins.

        Args:
            *args: ``args[0]`` is the iterable of parsed elements; further
                arguments are ignored.

        Returns:
            list: The collected dicts after ``sanitise_attributes``.
        """
        obj_attrs = []
        for d in args[0]:
            attrs = {}
            try:
                for dc in d.find_all('div'):
                    name = dc.find('span')
                    value = dc.find('div')
                    if value and name is not None:
                        # The dict itself tracks names already seen.
                        if name.text not in attrs:
                            attrs[name.text] = value.text
                obj_attrs.append(attrs)
            except AttributeError:
                self.view.display_item('Error filtering data from children.....')
        return self.sanitise_attributes(obj_attrs)

    def filter_by_keywords(self, *args):
        """Extract one attribute dict per element using keyword triples.

        Args:
            *args: ``args[self.PARAMETER_ONE]`` is the iterable of parsed
                elements and ``args[self.PARAMETER_TWO]`` the keyword
                triples; further arguments are ignored.

        Returns:
            list: One dict per element that could be processed. Elements
            that fail are reported through the view and skipped.
        """
        data = args[self.PARAMETER_ONE]
        data_kw = args[self.PARAMETER_TWO]
        obj_attr = []
        for d in data:
            try:
                attrs = {}
                for kw_pair in data_kw:
                    tag_depth = self.check_data_int(kw_pair[self.PARAMETER_TWO])
                    if tag_depth is not None:
                        value = d.find_all(kw_pair[self.PARAMETER_ONE])[tag_depth].string
                    else:
                        value = d.find(kw_pair[self.PARAMETER_ONE],
                                       {kw_pair[self.PARAMETER_TWO]: kw_pair[self.PARAMETER_THREE]}).string
                    attrs[kw_pair[self.PARAMETER_THREE]] = self.check_data_type(value) if value else 'unknown'
                obj_attr.append(attrs)
            # AttributeError covers an element that is None or a find() that
            # matched nothing; it used to escape and abort the whole command.
            except (AttributeError, TypeError, KeyError, IndexError):
                self.view.display_item(self.CONSOLIDATE_ERROR_MSG)
        return obj_attr

    def sanitise_attributes(self, obj_attrs):
        """Normalise the keys and values of every attribute dict.

        Keys lose newlines and spaces and are lower-cased; values lose
        newlines, have runs of spaces collapsed, are stripped and then passed
        through ``check_data_type``.

        Args:
            obj_attrs: Iterable of dicts mapping text to text.

        Returns:
            list: One normalised dict per input dict.
        """
        sanitised_obj_attrs = []
        for raw_attrs in obj_attrs:
            attrs = {}
            for key, value in raw_attrs.items():
                # Strip each text out of the other: nested tags make the
                # name text and the value text contain one another.
                value = value.replace(key, '')
                key = key.replace(value, '')
                sanitized_key = key.replace('\n', '').replace(' ', '').lower()
                sanitized_value = re.sub('[ ]+', ' ', value.replace('\n', '')).strip()
                attrs[sanitized_key] = self.check_data_type(sanitized_value)
            sanitised_obj_attrs.append(attrs)
        return sanitised_obj_attrs

    def create_web_data_object(self, attr_one, attr_two, obj_name='product'):
        """Build objects from attribute dicts and append them to the results.

        Args:
            attr_one: Sequence of attribute dicts for the primary data.
            attr_two: Sequence of attribute dicts for the recursive data;
                when non-empty each dict is merged into its positional
                partner from ``attr_one`` (which is updated in place).
            obj_name: Object kind handed to the factory.
        """
        if len(attr_two) > 0:
            for primary, secondary in zip(attr_one, attr_two):
                self.view.display_item('creating object.....')
                primary.update(secondary)
                self.web_data_objects.append(self.web_object_factory.build_object(obj_name, primary))
        else:
            for primary in attr_one:
                self.view.display_item('creating object.....')
                self.web_data_objects.append(self.web_object_factory.build_object(obj_name, primary))
コード例 #13
0
 def __init__(self):
     """Create the helper objects and populate ``self.save_list``."""
     self.web_object_factory = WebObjectFactory()
     self.view = ConsoleView()
     # Assigned last: the value comes from this object's own init_save_list().
     self.save_list = self.init_save_list()
コード例 #14
0
class DataHandler(object):
    """Pickle web objects to disk and keep a list of where they were saved.

    The list of save locations (absolute paths) is itself pickled to
    ``SAVE_LIST_FILE`` in the current working directory.
    """

    # File holding the pickled list of save locations.
    SAVE_LIST_FILE = 'savelist.pickle'

    def __init__(self):
        """Create the object factory and console view, then load the save list."""
        self.web_object_factory = WebObjectFactory()
        self.view = ConsoleView()
        # init_save_list() reports through self.view, so the view must
        # already exist when it is called.
        self.save_list = self.init_save_list()

    def init_save_list(self):
        """Return the save list stored in ``SAVE_LIST_FILE``.

        When the file is missing or empty, a message is displayed, an empty
        list is written to a fresh file and ``[]`` is returned.
        """
        s_list = []
        try:
            with open(self.SAVE_LIST_FILE, 'rb') as input_file:
                # NOTE(security): pickle.load can execute arbitrary code;
                # only safe while this file is written by this program.
                s_list = pickle.load(input_file)
        except (EOFError, FileNotFoundError):
            self.view.display_item('No save list found, '
                                   'creating new save list.....')
            with open(self.SAVE_LIST_FILE, 'wb') as output_file:
                pickle.dump(s_list, output_file)
        return s_list

    def save_save_list(self):
        """Write ``self.save_list`` to ``SAVE_LIST_FILE``.

        EOFError and FileNotFoundError are reported through the view rather
        than raised.
        """
        try:
            with open(self.SAVE_LIST_FILE, 'wb') as output_file:
                pickle.dump(self.save_list, output_file)
        except (EOFError, FileNotFoundError):
            self.view.display_item('Error saving save list.....')

    def display_save_list(self):
        """Show every recorded save location through the view."""
        self.view.display_item('displaying file save locations.....')
        self.view.display_items(self.save_list)

    def add_save_list(self, save_location):
        """Record ``save_location`` (made absolute) unless already recorded."""
        save_location = abspath(save_location)
        if save_location not in self.save_list:
            self.save_list.append(save_location)
            self.save_save_list()

    def remove_save_list(self, save_location):
        """Forget ``save_location`` (made absolute) if it is recorded.

        An unrecorded location is ignored, so deleting a file that was never
        added to the list no longer ends in a ValueError.
        """
        save_location = abspath(save_location)
        if save_location in self.save_list:
            self.save_list.remove(save_location)
            self.save_save_list()

    @staticmethod
    def _public_state(obj):
        """Return a copy of ``obj.__dict__`` without ``__dunder__`` keys."""
        return {key: value for key, value in obj.__dict__.items()
                if not (key.startswith('__') and key.endswith('__'))}

    def save_objects(self, web_objs, path):
        """Pickle the attribute dicts of ``web_objs`` to ``path``.

        Each object is reduced to a copy of its ``__dict__`` with
        ``__dunder__`` keys dropped. On success ``path`` is added to the
        save list. A FileNotFoundError (for example a missing directory)
        is reported through the view rather than raised.
        """
        try:
            # Raises the interpreter-wide recursion limit before pickling;
            # presumably needed for deeply nested attribute values
            # (TODO confirm which values require it).
            sys.setrecursionlimit(50000)
            objs = [self._public_state(obj) for obj in web_objs]
            # The context manager closes (and flushes) the file even when
            # pickling fails; it used to be left open.
            with open(path, 'wb') as output_file:
                pickle.dump(objs, output_file)
            self.add_save_list(path)
        except FileNotFoundError:
            self.view.display_item('Error saving file.....')

    def remove_objects(self, path):
        """Delete the file at ``path`` and drop it from the save list.

        A missing file is reported through the view rather than raised.
        """
        try:
            self.view.display_item('removing file ' + path + '.....')
            remove(path)
            self.remove_save_list(path)
        except FileNotFoundError:
            self.view.display_item('File ' + path + ' not found.....')

    def load_objects(self, path):
        """Rebuild and return the objects pickled at ``path``.

        Every stored attribute dict is passed to
        ``self.web_object_factory.build_object('product', ...)``. A missing
        file is reported through the view and an empty list is returned.
        """
        loaded_objs = []
        try:
            with open(path, 'rb') as input_file:
                # NOTE(security): pickle.load can execute arbitrary code;
                # do not point this at files from untrusted sources.
                web_objs = pickle.load(input_file)
            for obj in web_objs:
                loaded_objs.append(
                    self.web_object_factory.build_object('product', obj))
        except FileNotFoundError:
            self.view.display_item('File ' + path + ' not found.....')
        return loaded_objs
コード例 #15
0
class WebRequest(OptionFilter, MessageHandler):
    """Hold a target URL plus extra URLs and download their HTML.

    Downloads go through ``self.requests`` (the ``requests`` module by
    default) and progress or errors are reported through a ``ConsoleView``.
    """

    PRINT_DATA_MSG = 'No data to display.....'
    URL_NOT_VALID_MSG = 'please enter a valid url.....'
    CONNECTION_ERROR_MSG = 'data fetch error.....'

    def __init__(self):
        """Start with no URLs set and nothing downloaded."""
        # NOTE(review): one-argument super() yields an unbound super object,
        # so this call does not appear to run any base-class initialiser.
        # Confirm what OptionFilter/MessageHandler expect before switching
        # to ``super().__init__()``.
        super(OptionFilter).__init__()
        # URL configuration.
        self.url = ''
        self.url_padding = ''
        self.recursive_urls = []
        # HTTP client; kept as an attribute so it can be swapped out.
        self.requests = requests
        # Results of the most recent fetches.
        self.request_data = None
        self.requests_status_code = None
        self.recursive_request_data = []
        self.recursive_request_data_count = 0
        self.view = ConsoleView()

    def handle_command(self, args):
        """Dispatch ``args`` against ``web_request_options``."""
        return self.command(args, web_request_options)

    def print_data(self, *args):
        """Display the value selected by the command option, if any.

        String values are shown as ``"<option>: <value>"``; any other
        non-None value is handed to ``display_items``.
        """
        option = args[self.COMMAND_OPTION]
        attr = self.method_options(option, web_request_print_options)
        if attr is None:
            return
        if not isinstance(attr, str):
            self.view.display_items(attr)
            return
        self.view.display_item(option + ': ' + str(attr))

    def set_url(self, *args):
        """Store the given URL when its scheme is http or https."""
        candidate = args[self.COMMAND_OPTION]
        scheme = urlparse(candidate)[self.URL_SCHEME]
        if scheme != self.URL_SCHEME_HTTP and \
           scheme != self.URL_SCHEME_HTTPS:
            self.view.display_item(self.URL_NOT_VALID_MSG)
            return
        self.url = candidate
        self.view.display_item('setting url.....')

    def set_url_padding(self, *args):
        """Store the URL prefix when its scheme is http or https."""
        candidate = args[self.COMMAND_OPTION]
        scheme = urlparse(candidate)[self.URL_SCHEME]
        if scheme != self.URL_SCHEME_HTTP and \
           scheme != self.URL_SCHEME_HTTPS:
            self.view.display_item(self.URL_NOT_VALID_MSG)
            return
        self.url_padding = candidate
        self.view.display_item('setting url padding.....')

    def add_recursive_url(self, *args):
        """Queue ``url_padding + option`` when ``check_url`` accepts it."""
        full_url = self.url_padding + args[self.COMMAND_OPTION]
        if not self.check_url(full_url):
            self.view.display_item(self.URL_NOT_VALID_MSG)
            return
        self.recursive_urls.append(full_url)
        self.view.display_item('adding url.....')

    def fetch_html(self, *args):
        """Download ``self.url`` and keep its status code and body text.

        Nothing is fetched when ``check_none_condition`` rejects the URL;
        request failures are reported through the view.
        """
        url_ok = MessageHandler.check_none_condition(self, self.url,
                                                     'url not set.....')
        if not url_ok:
            return
        self.view.display_item('fetching html from ' + self.url + '.....')
        try:
            response = self.requests.get(self.url)
            self.requests_status_code = response.status_code
            self.request_data = response.text
        except requests.RequestException:
            self.view.display_item(self.CONNECTION_ERROR_MSG)

    def recursive_fetch(self, *args):
        """Download every queued URL, collecting each body text.

        The first request failure is reported and stops the loop; bodies
        collected before it are kept.
        """
        if not self.recursive_urls:
            self.view.display_item('no recursive urls set.....')
            return
        try:
            self.view.display_item('fetching recursive html.....')
            for target in self.recursive_urls:
                self.view.display_item('fetching html from ' + target +
                                       '.....')
                response = self.requests.get(target)
                self.requests_status_code = response.status_code
                self.recursive_request_data.append(response.text)
                self.recursive_request_data_count += 1
        except requests.RequestException:
            self.view.display_item(self.CONNECTION_ERROR_MSG)

    def get_request_data(self):
        """Return the body text stored by ``fetch_html`` (None before any fetch)."""
        return self.request_data

    def get_recursive_request_data(self):
        """Return the list of body texts stored by ``recursive_fetch``."""
        return self.recursive_request_data
コード例 #16
0
ファイル: webrequest.py プロジェクト: jaredsethnz/web_scraper
class WebRequest(OptionFilter, MessageHandler):
    """Keep track of URLs to scrape and fetch their HTML on demand.

    HTTP calls are made through ``self.requests``; status and error
    messages go to ``self.view`` (a ``ConsoleView``).
    """

    PRINT_DATA_MSG = 'No data to display.....'
    URL_NOT_VALID_MSG = 'please enter a valid url.....'
    CONNECTION_ERROR_MSG = 'data fetch error.....'

    def __init__(self):
        """Initialise empty URL settings and empty fetch results."""
        # NOTE(review): super() with a single argument is unbound, so this
        # line does not seem to invoke a parent __init__ - verify the base
        # classes before replacing it with ``super().__init__()``.
        super(OptionFilter).__init__()
        self.url = ''
        self.url_padding = ''
        self.recursive_urls = []
        self.requests = requests
        self.request_data = None
        self.requests_status_code = None
        self.recursive_request_data = []
        self.recursive_request_data_count = 0
        self.view = ConsoleView()

    def handle_command(self, args):
        """Run ``args`` through ``self.command`` with ``web_request_options``."""
        return self.command(args, web_request_options)

    def print_data(self, *args):
        """Show the attribute chosen by the command option.

        A string is printed as ``"<option>: <value>"``, other non-None
        values are passed to ``display_items``, and None shows nothing.
        """
        selected = args[self.COMMAND_OPTION]
        attr = self.method_options(selected, web_request_print_options)
        if attr is None:
            return
        if isinstance(attr, str):
            line = selected + ': ' + str(attr)
            self.view.display_item(line)
        else:
            self.view.display_items(attr)

    def set_url(self, *args):
        """Accept the option as the target URL if it is http(s)."""
        new_url = args[self.COMMAND_OPTION]
        parsed = urlparse(new_url)
        scheme = parsed[self.URL_SCHEME]
        is_web_url = scheme == self.URL_SCHEME_HTTP or scheme == self.URL_SCHEME_HTTPS
        if not is_web_url:
            self.view.display_item(self.URL_NOT_VALID_MSG)
            return
        self.url = new_url
        self.view.display_item('setting url.....')

    def set_url_padding(self, *args):
        """Accept the option as the URL prefix if it is http(s)."""
        new_padding = args[self.COMMAND_OPTION]
        parsed = urlparse(new_padding)
        scheme = parsed[self.URL_SCHEME]
        is_web_url = scheme == self.URL_SCHEME_HTTP or scheme == self.URL_SCHEME_HTTPS
        if not is_web_url:
            self.view.display_item(self.URL_NOT_VALID_MSG)
            return
        self.url_padding = new_padding
        self.view.display_item('setting url padding.....')

    def add_recursive_url(self, *args):
        """Append ``url_padding + option`` to the queue if ``check_url`` passes."""
        padded_url = self.url_padding + args[self.COMMAND_OPTION]
        if self.check_url(padded_url):
            self.recursive_urls.append(padded_url)
            message = 'adding url.....'
        else:
            message = self.URL_NOT_VALID_MSG
        self.view.display_item(message)

    def fetch_html(self, *args):
        """Fetch ``self.url`` and store the response status and text.

        Skipped when ``check_none_condition`` rejects the URL. A
        ``requests.RequestException`` is reported, not raised.
        """
        if not MessageHandler.check_none_condition(self, self.url, 'url not set.....'):
            return
        self.view.display_item('fetching html from ' + self.url + '.....')
        try:
            reply = self.requests.get(self.url)
            self.requests_status_code = reply.status_code
            self.request_data = reply.text
        except requests.RequestException:
            self.view.display_item(self.CONNECTION_ERROR_MSG)

    def recursive_fetch(self, *args):
        """Fetch each queued URL in order and collect the response texts.

        A ``requests.RequestException`` is reported and ends the loop early;
        texts gathered so far remain stored.
        """
        if not self.recursive_urls:
            self.view.display_item('no recursive urls set.....')
            return
        try:
            self.view.display_item('fetching recursive html.....')
            for queued_url in self.recursive_urls:
                self.view.display_item('fetching html from ' + queued_url + '.....')
                reply = self.requests.get(queued_url)
                self.requests_status_code = reply.status_code
                self.recursive_request_data.append(reply.text)
                self.recursive_request_data_count += 1
        except requests.RequestException:
            self.view.display_item(self.CONNECTION_ERROR_MSG)

    def get_request_data(self):
        """Return the text saved by the last ``fetch_html`` (None if none)."""
        return self.request_data

    def get_recursive_request_data(self):
        """Return the texts accumulated by ``recursive_fetch``."""
        return self.recursive_request_data