def get_idxurl(self, gsoup, rurl, urle):
    """
    Get the list of the elements inside the gallery index based on the
    root domain URL and the HTML <div> tags.

    Args:
        gsoup (bs-obj): BeautifulSoup object containing the gallery's
            element list
        rurl (str): root URL of the domain to complete the element URL
        urle (str): HTML <div> keyword to process the page's scraped
            gallery URLs

    Raises:
        exp: raise a generic exception if something goes wrong

    Returns:
        ans (list): list with each of the gallery's unique URLs
    """
    try:
        gm = self.gallery
        ans = gm.get_idxurl(gsoup, rurl, urle)
        # returning answer
        return ans

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Controller: get_idxurl")
def get_idxtitle(self, gsoup, etitle):
    """
    Get the element titles from the gallery main page.

    Args:
        gsoup (bs-obj): BeautifulSoup object containing the gallery's
            element list
        etitle (str): HTML <div> keyword to process the scraped data from
            the gallery's soup to get the element titles

    Raises:
        exp: raise a generic exception if something goes wrong

    Returns:
        ans (list): gallery element (paint) titles as strings
    """
    try:
        gm = self.gallery
        ans = gm.get_idxtitle(gsoup, etitle)
        # returning answer
        return ans

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Controller: get_idxtitle")
def scrapidx(self, gurl, stime, div, attrs):
    """
    Scrape the gallery, create a new index and recover all elements in it.

    Args:
        gurl (str): URL of the gallery to scrape data from
        stime (float): waiting time between requests
        div (str): HTML <div> keyword to search and scrape
        attrs (dict): decorative attributes in the <div> keyword to refine
            the search and scrape

    Raises:
        exp: raise a generic exception if something goes wrong

    Returns:
        ans (bs-obj): div- and attrs-filtered BeautifulSoup object
    """
    try:
        gm = self.gallery
        ans = gm.scrapidx(gurl, stime, div, attrs)
        return ans

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Controller: scrapidx")
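
# Usage sketch for the Controller index workflow above (hedged: `ctrl`, the
# URL and the <div>/attribute keywords are illustrative assumptions, not
# values from this project):
#
#     soup = ctrl.scrapidx("https://gallery.example.org/collection", 3.0,
#                          "li", {"class": "collection-item"})
#     urls = ctrl.get_idxurl(soup, "https://gallery.example.org", "href")
#     titles = ctrl.get_idxtitle(soup, "title")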
def findin(self, division, attributes=None, multiple=True):
    """
    Find HTML tags inside a BeautifulSoup class attribute.

    Args:
        division (str): HTML tag to find in the soup, i.e.: "div" or "li"
        attributes (dict, optional): decorators to highlight the div
            options. Defaults to None.
        multiple (bool, optional): True to find multiple tag occurrences
            in the HTML, False if not. Defaults to True.

    Raises:
        exp: raise a generic exception if something goes wrong

    Returns:
        ans (bs-obj): filtered BeautifulSoup object
    """
    try:
        ans = None
        if multiple:
            ans = self.sbody.findAll(division, attrs=attributes)
        else:
            ans = self.sbody.find(division, attrs=attributes)
        return ans

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Page: findin")
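
# Minimal sketch of findin() (assumption: `page` is a Page whose sbody was
# already populated by a previous request; tag and attribute names are
# illustrative):
#
#     items = page.findin("a", attributes={"class": "thumb"})   # list of tags
#     first = page.findin("a", attributes={"class": "thumb"},
#                         multiple=False)                       # single tag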
def setup_local(self, *args):
    """
    Set up the local gallery filepath according to the root gallery
    folder and other subfolders.

    Args:
        rootf (str): name of the main gallery local folder
        subfolders (list, optional): the subfolder names conforming the
            absolute dirpath to the gallery

    Raises:
        exp: raise a generic exception if something goes wrong

    Returns:
        wpath (str): the local filepath to the gallery
    """
    try:
        # answer with the realpath of the local subfolders
        wpath = os.path.join(*args)
        # if the path doesn't exist, create it
        if not os.path.exists(wpath):
            os.makedirs(wpath)
        return wpath

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Controller: setup_local")
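
# Usage sketch (folder names are hypothetical):
#
#     gallery_path = ctrl.setup_local("Data", "Gallery", "VanGogh")
#     # -> "Data/Gallery/VanGogh", created on disk if it did not exist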
def get_idxurl(self, gsoup, rurl, urle):
    # TODO: remove after implementing the Topic() class
    """
    Get the list of the elements inside the gallery index based on the
    root domain URL and the HTML <div> tags.

    Args:
        gsoup (bs-obj): BeautifulSoup object containing the gallery's
            element list
        rurl (str): root URL of the domain to complete the element URL
        urle (str): HTML <div> keyword to process the page's scraped
            gallery URLs

    Raises:
        exp: raise a generic exception if something goes wrong

    Returns:
        ans (list): list with each of the gallery's unique URLs
    """
    try:
        ans = list()
        for title in gsoup:
            # joining the root URL with the relative element URL
            turl = urllib.parse.urljoin(rurl, title.get(urle))
            ans.append(turl)
        # returning answer
        return ans

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Gallery: get_idxurl")
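
# Worked example of the urljoin() composition used above (values are
# illustrative):
#
#     urllib.parse.urljoin("https://gallery.example.org",
#                          "/en/collection/s0004V1962r")
#     # -> "https://gallery.example.org/en/collection/s0004V1962r"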
def newidx(self, cols, data):
    """
    Create a new dataframe in the model based on the column names and
    new data.

    Args:
        cols (list): list of column names to create the new dataframe
        data (list of lists, pandas/numpy matrix): data for the columns
            of the new dataframe

    Raises:
        exp: raise a generic exception if something goes wrong

    Returns:
        ans (bool): True if the function created a new dataframe,
            False otherwise
    """
    try:
        ans = False
        self.data_frame = pd.DataFrame(columns=self.schema)
        # filling each schema column with its corresponding data
        for col, td in zip(cols, data):
            self.data_frame[col] = td
        ans = True
        return ans

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Gallery: newidx")
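
# Usage sketch (assumes the schema contains at least "ID" and "TITLE";
# `ids` and `titles` are lists of equal length recovered from the index):
#
#     gallery.newidx(["ID", "TITLE"], [ids, titles])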
def get_idxid(self, gsoup, ide, clean):
    # TODO: remove after implementing the Topic() class
    """
    Get the unique identifier (ID) of the gallery elements (paints) and
    list them to introduce them into the dataframe.

    Args:
        gsoup (bs-obj): list with gallery elements in BeautifulSoup format
        ide (str): HTML <div> keyword to extract the element (paint) ID
        clean (str): keyword to remove from the extracted ID string

    Raises:
        exp: raise a generic exception if something goes wrong

    Returns:
        ans (list): list with the elements' (paints') IDs
    """
    try:
        ans = list()
        for element in gsoup:
            # removing the cleaning keyword from the raw ID
            tid = element.get(ide).replace(clean, "")
            ans.append(tid)
        # returning answer
        return ans

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Gallery: get_idxid")
def clean_dlurl(self, gsoup, rurl, urle):
    # TODO: remove after implementing the Topic() class
    """
    Recover the download URL for a gallery element.

    Args:
        gsoup (bs-obj): BeautifulSoup object with the gallery element list
        rurl (str): domain root URL to complete the gallery index
        urle (str): HTML <div> keyword to scrape the gallery index URLs
            to download files

    Raises:
        exp: raise a generic exception if something goes wrong

    Returns:
        ans (str): unique URL with the downloadable element's file
    """
    try:
        ans = None
        if gsoup is not None:
            url = gsoup.get(urle)
            ans = urllib.parse.urljoin(rurl, url)
        # returning answer
        return ans

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Gallery: clean_dlurl")
def updata(self, column, data):
    """
    Update a single column with new data; the size of the data needs to
    match the number of existing records.

    Args:
        column (str): name of the column in the dataframe to update
        data (list/np.array): new data for the column

    Raises:
        exp: raise a generic exception if something goes wrong

    Returns:
        ans (bool): True if the column was updated, False otherwise
    """
    try:
        ans = False
        self.data_frame[column] = data
        # the assignment either succeeds or raises, so reaching this
        # point means the column was updated
        ans = True
        return ans

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Gallery: updata")
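
# Usage sketch (column name and data are illustrative; `urls` must have as
# many entries as the dataframe has rows):
#
#     gallery.updata("COLLECTION_URL", urls)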
def save_gallery(self, fn, dfolder):
    """
    Save the in-memory dataframe into a CSV file with UTF-8 encoding.

    Args:
        fn (str): file name with .csv extension
        dfolder (str): valid dirpath, or array of valid folder names

    Raises:
        exp: raise a generic exception if something goes wrong

    Returns:
        ans (bool): True if the dataframe was saved, False otherwise
    """
    try:
        ans = False
        gfp = os.path.join(os.getcwd(), dfolder, fn)
        # pandas to_csv() returns None when writing to a filepath
        tdata = self.data_frame.to_csv(gfp,
                                       sep=",",
                                       index=False,
                                       encoding="utf-8",
                                       mode="w",
                                       quoting=csv.QUOTE_ALL)
        if tdata is None:
            ans = True
        return ans

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Gallery: save_gallery")
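
# Round-trip persistence sketch (file and folder names are hypothetical;
# load_gallery() is the companion method defined below):
#
#     gallery.save_gallery("gallery-index.csv", "Data")
#     gallery.load_gallery("gallery-index.csv", "Data")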
def create_localfolders(self, *args):
    """
    Create local subfolders with the gallery folder as their root.

    Args:
        gfolder (str): name of the main gallery folder
        coln (str): name of the ID column used to name the folders

    Raises:
        exp: raise a generic exception if something goes wrong
    """
    try:
        gfolder = args[0]
        coln = args[1]
        # looping through the ID list as folder names for the local gallery
        for folder in self.getdata(coln):
            # local folder path to create if necessary
            tfp = os.path.join(gfolder, folder)
            # if the local folder doesn't exist, create it
            if not os.path.exists(tfp):
                os.makedirs(tfp)

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Controller: create_localfolders")
def export_json(self, gfolder, incol, expcol, fname):
    """
    Export the data from one column of the model's dataframe into JSON
    files in a specific local gallery folder.

    Args:
        gfolder (str): name of the main gallery folder
        incol (str): name of the dataframe column holding the gallery
            index with unique IDs for each element (same as the local
            folder names)
        expcol (str): name of the column with the data to export to JSON
        fname (str): name of the file to save

    Raises:
        exp: raise a generic exception if something goes wrong
    """
    try:
        # working variables
        idd = self.getdata(incol)
        expd = self.getdata(expcol)
        for tindex, tdata in zip(idd, expd):
            tfile = fname + ".json"
            self.write_json(tdata, tfile, gfolder, tindex)
            time.sleep(DEFAULT_SHORT_SLEEP_TIME)

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Controller: export_json")
def write_json(self, data, filename, *args):
    """
    Save JSON data into a local file according to the gallery folder and
    subfolders.

    Args:
        data (JSON): JSON data to save in the file
        filename (str): JSON file name
        gfolder (str): name of the main gallery folder
        subfolders (str): subfolder names under the main gallery folder,
            as many as needed

    Raises:
        exp: raise a generic exception if something goes wrong
    """
    try:
        # configuring the local filepath
        lfp = os.path.join(*args, filename)
        # saving the data with UTF-8 encoding; the with-statement closes
        # the file automatically
        with open(lfp, "w", encoding="utf-8") as file:
            file.write(data)

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Controller: write_json")
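
# Usage sketch (folder and file names are illustrative; `tags` is assumed to
# be a JSON-serialized string, since write_json() writes text verbatim):
#
#     ctrl.write_json(tags, "searchtags.json", "Data", "s0004V1962r")
#     # -> writes Data/s0004V1962r/searchtags.json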
def getdata(self, coln, *args, **kwargs):
    """
    Get the data based on the column name of the model's dataframe.

    Args:
        coln (str): column name of the gallery dataframe to get

    Raises:
        exp: raise a generic exception if something goes wrong

    Returns:
        ans (list): data from the named column
    """
    try:
        # getting the column data from the gallery
        gm = self.gallery
        ans = gm.getdata(coln, *args, **kwargs)
        # returning answer
        return ans

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Controller: getdata")
def __init__(self, *args, **kwargs):
    """
    Controller() class creator.

    Args:
        webg_path (str): URL of the gallery to scrape data from
        localg_path (str): local dirpath for the gallery data
        imgd_path (str): local dirpath for the gallery images
        schema (list): array with the column names for the model
        gallery (Gallery): object with the gallery dataframe model
        wpage (Page): the current webpage the controller is scraping

    Raises:
        exp: raise a generic exception if something goes wrong

    Returns:
        Controller (Controller): return a new Controller() object
    """
    try:
        # Controller default values
        self.webg_path = str()
        self.localg_path = str()
        self.imgd_path = str()
        self.schema = copy.deepcopy(DEFAULT_FRAME_SCHEMA)
        self.gallery = Gallery()
        self.wpage = Page()

        # when arguments are passed as parameters
        for i in range(len(args)):
            # URL of the remote gallery to scrape
            if i == 0:
                self.webg_path = args[i]
            # local dirpath to save the gallery CSV
            if i == 1:
                self.localg_path = args[i]
            # local dirpath to save the gallery images
            if i == 2:
                self.imgd_path = args[i]

        # if there are dict decorators in the creator
        for key in kwargs:
            # updating the schema in the controller
            if key == "schema":
                self.schema = copy.deepcopy(kwargs[key])
            # setting the gallery model in the controller
            if key == "model":
                self.gallery = kwargs[key]

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Controller: __init__")
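
# Construction sketch (URL, paths and schema are illustrative assumptions):
#
#     ctrl = Controller("https://gallery.example.org/collection",
#                       "Data/Gallery",
#                       "Data/Img",
#                       schema=["ID", "TITLE", "COLLECTION_URL"])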
def load_gallery(self, fn, dfolder):
    """
    Load the gallery from a CSV file in UTF-8 encoding.

    Args:
        fn (str): file name with .csv extension
        dfolder (str): valid dirpath, or array of valid folder names

    Raises:
        exp: raise a generic exception if something goes wrong

    Returns:
        ans (bool): True if the dataframe was loaded, False otherwise
    """
    try:
        # read an existing CSV file to update the dataframe
        ans = False
        gfp = os.path.join(os.getcwd(), dfolder, fn)
        self.data_frame = pd.read_csv(gfp,
                                      sep=",",
                                      encoding="utf-8",
                                      engine="python",
                                      quoting=csv.QUOTE_ALL)
        if self.data_frame is not None:
            ans = True
        return ans

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Gallery: load_gallery")
def get_srcimgs(self, sfp, sfext):
    """
    Recover the images inside the local path using the file extension.

    Args:
        sfp (str): local folderpath of the source images to scan
        sfext (str): source image file extension, i.e.: "jpg"

    Raises:
        exp: raise a generic exception if something goes wrong

    Returns:
        ans (list): list of the source images' local filepaths
    """
    try:
        # default answer
        ans = list()
        files = os.listdir(sfp)
        # checking if there are files in the folder
        if len(files) > 0:
            # keeping only files with the proper image extension
            for f in files:
                if f.endswith(sfext):
                    fn = os.path.join(sfp, f)
                    ans.append(fn)
        # returning answer
        return ans

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Gallery: get_srcimgs")
def __init__(self, *args, **kwargs):
    """
    Gallery() class creator.

    Args:
        webg_path (str): URL of the gallery to scrape data from
        localg_path (str): local dirpath for the gallery data
        imgd_path (str): local dirpath for the gallery images
        schema (list): array with the column names for the model
        data_frame (dataframe, optional): pandas dataframe with the
            gallery data (i.e.: paints); an existing dataframe can be
            passed in. Defaults to empty.
        wpage (Page): the current webpage the controller is scraping

    Raises:
        exp: raise a generic exception if something goes wrong

    Returns:
        Gallery (Gallery): return a new Gallery() object
    """
    try:
        # default creator attributes
        self.webg_path = str()
        self.localg_path = str()
        self.imgd_path = str()
        self.schema = copy.deepcopy(DEFAULT_FRAME_SCHEMA)
        self.data_frame = pd.DataFrame(columns=DEFAULT_FRAME_SCHEMA)
        self.wpage = Page()

        # when arguments are passed as parameters; indexing by position
        # instead of args.index(arg), which breaks on duplicate values
        for i in range(len(args)):
            # URL of the remote gallery to scrape
            if i == 0:
                self.webg_path = args[i]
            # local dirpath to save the gallery CSV
            if i == 1:
                self.localg_path = args[i]
            # local dirpath to save the images
            if i == 2:
                self.imgd_path = args[i]
            # dataframe containing the data of the gallery
            if i == 3:
                self.data_frame = args[i]

        # if there are dict decorators in the creator
        for key in kwargs:
            # updating the schema in the model
            if key == "schema":
                self.schema = copy.deepcopy(kwargs[key])
                self.data_frame = pd.DataFrame(columns=self.schema)

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Gallery: __init__")
def clean_imgfn(self, text, elem, clean):
    """
    Clean the image file name scraped from a gallery element.

    Args:
        text (str): text to be cleaned
        elem (str): keyword to split the string on before processing
        clean (str): keyword to strip from the text

    Raises:
        exp: raise a generic exception if something goes wrong

    Returns:
        ans (str): clean file name with extension
    """
    try:
        # keeping the part after the split keyword, then stripping
        # whitespace and the cleaning keyword
        ans = text.split(elem)[1].strip().strip(clean)
        # returning answer
        return ans

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Gallery: clean_imgfn")
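
# Worked example of the split/strip chain above (input string and keywords
# are illustrative):
#
#     clean_imgfn("Download: s0004V1962r.jpg;", "Download:", ";")
#     # split("Download:")[1] -> " s0004V1962r.jpg;"
#     # .strip()              -> "s0004V1962r.jpg;"
#     # .strip(";")           -> "s0004V1962r.jpg"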
def scrapidx(self, gurl, stime, div, attrs):
    """
    Scrape the gallery index and recover all the elements in it.

    Args:
        gurl (str): gallery URL to scrape data from
        stime (float): waiting time between requests
        div (str): HTML <div> keyword to search and scrape
        attrs (dict): decorative attributes in the <div> keyword to refine
            the search and scrape

    Raises:
        exp: raise a generic exception if something goes wrong

    Returns:
        ans (bs-obj): div- and attrs-filtered BeautifulSoup object
    """
    try:
        # reset working web page
        self.wpage = Page()
        ans = None
        # getting the basic element list from the gallery online index
        self.wpage.get_collection(gurl, stime)
        ans = self.wpage.findin(div, attributes=attrs)
        # returning answer
        return ans

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Gallery: scrapidx")
def dlpaints(self, *args):
    """
    Download the paint files from the list of available asset URLs in
    the gallery.

    Args:
        dlurl_coln (str): column name of the known download URLs
        gfolder (str): name of the main gallery folder
        div (str): HTML <div> search and scrape keyword
        attrs (dict): decorative <div> keywords to refine the scrape
        elem (str): secondary <div> keyword to refine the search and
            scrape process
        clean (str): keyword to clean the scraped image file name

    Raises:
        exp: raise a generic exception if something goes wrong

    Returns:
        ans (list): list of booleans marking whether each picture file
            could be downloaded or not
    """
    try:
        # getting the element URLs in the gallery
        ans = list()
        gm = self.gallery
        dlurl_coln = args[0]
        gf = args[1]
        div = args[2]
        attrs = args[3]
        elem = args[4]
        clean = args[5]

        for url in self.getdata(dlurl_coln):
            # the URL must be valid; it can't be null, NA or None
            if validators.url(str(url)) is True:
                # recovering the image file name
                tsoup = gm.get_imgfn(url, div, attrs)
                # cleaning the name to save
                timgf = gm.clean_imgfn(tsoup, elem, clean)
                # downloading and saving the image in the local folder
                tans = gm.get_imgf(gf, url, timgf)
                ans.append(tans)
            # invalid URL
            else:
                tans = False
                ans.append(tans)
            time.sleep(DEFAULT_SLEEP_TIME)
        # returning answer
        return ans

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Controller: dlpaints")
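
# Usage sketch (positional arguments, matching the unpacking above; every
# keyword value is an illustrative assumption, not a value from this project):
#
#     ok = ctrl.dlpaints("DOWNLOAD_URL",          # dlurl_coln
#                        "Data/Gallery",          # gfolder
#                        "a",                     # div
#                        {"class": "download"},   # attrs
#                        "filename=",             # elem
#                        "\"")                    # clean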
def clean_relwork(self, rurl, soup, elem, clean):
    # TODO: remove after implementing the Topic() class
    """
    Process the scraped data from the BeautifulSoup object and save the
    related-work information into JSON files.

    Args:
        rurl (str): domain root URL to complete the related-work link
        soup (bs-obj): BeautifulSoup object with the related-work data
        elem (str): HTML <div> keyword to scrape the related-work data
        clean (list): secondary <div> keywords to clean the related-work
            data

    Raises:
        exp: raise a generic exception if something goes wrong

    Returns:
        ans (dict): element (paint) clean related-work
    """
    try:
        # default answer
        ans = dict()
        # checking if the related work exists
        if soup is not None:
            # finding the related-work <article> tags in the soup
            relworks = soup[0].findAll(elem)
            # processing related work
            i = 1
            for rw in relworks:
                # cleaning data and getting all keys and values
                key = str(rw.find(clean[0]).string)
                key = self.clrtext(key)
                url = rw.find(clean[1])
                url = url.get(clean[2])
                value = str(urllib.parse.urljoin(rurl, url))
                # many names are similar in the related work
                if key in ans:
                    # creating an alternate key for the dict
                    key = key + " " + str(i)
                    i += 1
                # updating answer dict
                td = {key: value}
                ans.update(copy.deepcopy(td))
        # returning answer
        return ans

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Gallery: clean_relwork")
def scrap_relwork(self, *args, **kwargs):
    """
    Scrape the related-work data from the webpage using the dataframe's
    column name, the HTML <div>s and other decorators in the URL.

    Args:
        coln (str): ID column name of the gallery dataframe
        rurl (str): root URL of the domain to complete the related work
        div (str): HTML <div> keyword to search and scrape
        attrs (dict): decorative attributes in the <div> keyword to refine
        elem (str): secondary <div> keyword to refine the search and
            scrape process
        clean (list): secondary <div> keywords to clean the related-work
            data

    Raises:
        exp: raise a generic exception if something goes wrong

    Returns:
        ans (list): the list of the related work recovered from the
            gallery elements
    """
    try:
        # get the URL list from the dataframe in the model
        ans = list()
        gm = self.gallery
        coln = args[0]
        rurl = args[1]
        div = args[2]
        attrs = args[3]
        elem = args[4]
        clean = args[5]

        for url in self.getdata(coln):
            # scraping the elements of each gallery page
            tsoup = gm.scrape(url, div, attrs, **kwargs)
            # default empty dict to return
            tans = dict()
            # checking if there is any related work to process
            if len(tsoup) > 0:
                # extracting the related work from the soup
                tans = gm.clean_relwork(rurl, tsoup, elem, clean)
            # composing answer
            tans = self.to_json(tans)
            ans.append(tans)
            time.sleep(DEFAULT_SLEEP_TIME)
        # returning answer
        return ans

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Controller: scrap_relwork")
def export_paints(self, *args):
    """
    Export the images from a source folder into a target folder; the
    target images are saved in color and in grayscale.

    Args:
        coln (str): ID column name of the gallery dataframe
        sfext (str): source image file extension, i.e.: "jpg"
        tfext (dict): target image file extensions, i.e.: "jpg"
        tsufix (dict): target image file suffixes, i.e.: "-rgb"

    Raises:
        exp: raise a generic exception if something goes wrong

    Returns:
        ans (list): list of dicts with the relative local filepaths for
            each gallery element (e.g.:
            {"rgb": "/Data/Img/s0004V1962r-rgb.jpg",
             "bw": "/Data/Img/s0004V1962r-b&w.jpg"})
    """
    try:
        # default answer
        ans = list()
        # working variables
        coln = args[0]
        sfext = args[1]
        tfext = args[2]
        tsufix = args[3]
        gm = self.gallery

        # iterating over the index data
        for tid in self.getdata(coln):
            # config source and target folders
            srcf = os.path.join(self.localg_path, tid)
            tgtf = os.path.join(self.imgd_path, tid)
            # recovering source images
            srcfn = gm.get_srcimgs(srcf, sfext)
            # setting target images
            tgtfn = gm.set_tgtimgs(srcfn, tgtf, tfext, tsufix)
            # exporting images
            tans = gm.export_imgs(srcfn, tgtfn, tsufix)
            # composing answer
            tans = self.to_json(tans)
            ans.append(tans)
            time.sleep(DEFAULT_SHORT_SLEEP_TIME)
        # returning answer list
        return ans

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Controller: export_paints")
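
# Usage sketch (the extension and suffix dicts mirror the docstring example;
# the exact keys depend on the project configuration and are assumed here):
#
#     paths = ctrl.export_paints("ID",
#                                "jpg",
#                                {"rgb": "jpg", "bw": "jpg"},
#                                {"rgb": "-rgb", "bw": "-b&w"})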
def clean_searchtags(self, rurl, soup, elem, clean):
    # TODO: remove after implementing the Topic() class
    """
    Clean the page's search tags from the BeautifulSoup object.

    Args:
        rurl (str): root URL of the domain to complete the search tags
        soup (bs-obj): BeautifulSoup object with the search-tags data
        elem (str): HTML <div> keyword to scrape the search-tags data
        clean (str): secondary <div> keyword to clean the data from the
            scrape

    Raises:
        exp: raise a generic exception if something goes wrong

    Returns:
        ans (dict): element (paint) clean search tags
    """
    try:
        # default answer
        ans = dict()
        # checking if search tags exist
        if soup is not None:
            # checking it is the correct collection of search tags
            if len(soup) > 0:
                # finding the search-tag <a> elements in the soup
                tags = soup[0].findAll(elem)
                # processing the search tags
                if len(tags) > 0 and isinstance(tags, list):
                    for tag in tags:
                        # cleaning data
                        key = str(tag.string)
                        key = self.clrtext(key)
                        url = tag.get(clean)
                        # reconstructing the full URL from the page
                        value = str(urllib.parse.urljoin(rurl, url))
                        td = {key: value}
                        # updating answer dict
                        ans.update(copy.deepcopy(td))
        # returning answer
        return ans

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Gallery: clean_searchtags")
def export_shapes(self, *args):
    """
    Export the image shapes from the exported images in the target
    folder.

    Args:
        coln (str): ID column name of the gallery dataframe
        tfext (dict): target image file extensions, i.e.: "jpg"
        tsufix (dict): target image file suffixes, i.e.: "-rgb"

    Raises:
        exp: raise a generic exception if something goes wrong

    Returns:
        ans (list): list of dicts with the shape of each gallery element
            (e.g.: {"rgb": (450, 280, 3), "bw": (450, 280)})
    """
    try:
        # default answer
        ans = list()
        # working variables
        coln = args[0]
        tfext = args[1]
        tsufix = args[2]
        gm = self.gallery
        ip = self.imgd_path

        # iterating over the index data
        for tid in self.getdata(coln):
            # config target folder
            tgtf = os.path.join(ip, tid)
            # recovering the exported images
            tgtfn = gm.get_srcimgs(tgtf, tfext)
            # exporting shapes
            tans = gm.export_shapes(tgtfn, tsufix)
            # composing answer
            tans = self.to_json(tans)
            ans.append(tans)
            time.sleep(DEFAULT_SHORT_SLEEP_TIME)
        # returning answer list
        return ans

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Controller: export_shapes")
def get_imgf(self, gfolder, dlurl, pfn):
    # TODO: remove after implementing the Topic() class
    """
    Save the paint file from the asset URL in the local folder path.

    Args:
        gfolder (str): root local dirpath where the file is going to be
            saved
        dlurl (str): URL address with the downloadable image file
        pfn (str): filename to save the image with

    Raises:
        exp: raise a generic exception if something goes wrong

    Returns:
        ans (bool): True if the file exists in the local dirpath after
            the call, False if not
    """
    try:
        # default answer
        ans = False
        # parsing the URL to choose the local folder to save the file in
        imgf = urllib.parse.urlparse(dlurl)
        imgf = imgf.path.split("/")[-1]
        fp = os.path.join(gfolder, imgf, pfn)
        # if the file doesn't exist, save the request content in binary
        # form; if it already exists, there is nothing to download
        if not os.path.exists(fp):
            data = self.wpage.content
            with open(fp, "wb") as file:
                file.write(data)
        ans = True
        # returning answer
        return ans

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Gallery: get_imgf")
def clean_objdata(self, soup, elem):
    # TODO: remove after implementing the Topic() class
    """
    Clean the page's object data from the BeautifulSoup object.

    Args:
        soup (bs-obj): BeautifulSoup object with the object data
        elem (list): HTML keywords, i.e.: ("dt", "dd"), to scrape the
            object data

    Raises:
        exp: raise a generic exception if something goes wrong

    Returns:
        ans (dict): element (paint) clean object data
    """
    try:
        # default answer
        ans = dict()
        # checking if object data exists
        if soup is not None:
            # finding the <dt> and <dd> tags in the soup
            keys = soup.findAll(elem[0])
            values = soup.findAll(elem[1])
            # soup keys and values must have data
            if len(keys) > 0 and len(values) > 0:
                # looping over the <dt> and <dd> data
                for key, value in zip(keys, values):
                    # cleaning data for the dictionary
                    key = str(key.string)
                    key = self.clrtext(key)
                    value = str(value.string)
                    value = self.clrtext(value)
                    # temp dict to complete the answer
                    td = {key: value}
                    # updating answer dict
                    ans.update(copy.deepcopy(td))
        # returning answer
        return ans

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Gallery: clean_objdata")
def scrap_searchtags(self, *args, **kwargs):
    """
    Scrape the elements' (paints') search tags using the ID column name
    in the index, the domain URL, the HTML divisions <div>, decorative
    attributes, secondary HTML elements and cleaning HTML divisions.

    Args:
        coln (str): ID column name of the gallery dataframe
        rurl (str): root URL of the domain to complete the search tags
        div (str): HTML <div> keyword to scrape the search tags
        attrs (dict): decorative attributes in the <div> keyword to refine
        elem (str): secondary <div> keyword to refine the search and
            scrape process
        clean (str): secondary <div> keyword to clean the data from the
            scrape

    Raises:
        exp: raise a generic exception if something goes wrong

    Returns:
        ans (list): list of element search tags in JSON format
    """
    try:
        # get the URL list from the dataframe in the model
        ans = list()
        gm = self.gallery
        coln = args[0]
        rurl = args[1]
        div = args[2]
        attrs = args[3]
        elem = args[4]
        clean = args[5]

        for url in self.getdata(coln):
            # scraping the elements of each gallery page
            tsoup = gm.scrape(url, div, attrs, **kwargs)
            # extracting the search tags from the soup
            tans = gm.clean_searchtags(rurl, tsoup, elem, clean)
            # composing answer
            tans = self.to_json(tans)
            ans.append(tans)
            time.sleep(DEFAULT_SLEEP_TIME)
        # returning answer
        return ans

    # exception handling
    except Exception as exp:
        Err.reraise(exp, "Controller: scrap_searchtags")