def update_image_storage():
    """Rebuild the Image and Caption tables from the remote dataset page.

    This operation clears the DB first, so running it multiple times on the
    same DB does not accumulate duplicate entries.

    Exercise rubric (8 points): 3 for xpath (1 each), 1 for correct resource
    GET, 2 for correct DB entry handling, 2 for correct regex, 2 for
    Transaction explanation; -0.5 for small mistakes (return value...).

    :return: tuple of a JSON string (``{"status": "finished"}``) and the HTTP
        status code 200
    :raises requests.exceptions.RequestException: if the download fails or the
        server answers with an error status code
    """
    # Imported locally so the function does not depend on module-level imports
    # that may be missing from the file; both packages are already installed.
    import re
    from lxml import html

    answer = requests.get(BASE_URL_DATASET)
    # Error handling: turn 4xx/5xx answers into an exception *before* the
    # database is touched, so a failed download never wipes existing data.
    answer.raise_for_status()

    # This line opens a database transaction for the duration of the
    # with-block. If the block finishes normally the transaction is committed;
    # if any statement inside raises (parsing error, failed insert, ...) the
    # transaction is rolled back. It is needed because the tables are emptied
    # before being refilled: without it, a failure halfway through would leave
    # the database empty or only partially populated.
    with database_holder.database.transaction():
        # Empty databases
        Image.delete().execute()  # pylint: disable=no-value-for-parameter
        Caption.delete().execute()  # pylint: disable=no-value-for-parameter

        tree = html.fromstring(answer.text)

        # NOTE(review): assumes one <tr> per picture directly below
        # /html/body/table - confirm against the dataset markup.
        for picture_tree in tree.xpath('/html/body/table/tr'):
            # A row without an <img> raises IndexError here, which aborts and
            # rolls back the whole update.
            src = picture_tree.xpath('td/img/@src')[0]

            # The category is taken to be the leading word of the src path, up
            # to the first slash - TODO confirm the src format.
            category = re.match(r'(\w+)/', src).group(1)

            # save Image in DB, nothing magical here
            image_db = Image(src=src, category=category)
            image_db.save()

            # Captions are assumed to live in the table cells nested inside
            # the row's own cells - TODO confirm.
            for caption_tree in picture_tree.xpath('td//td'):
                # Drops the first character of the cell text (presumably a
                # leading separator/whitespace character - TODO confirm).
                caption_text = caption_tree.text[1:]
                Caption(text=caption_text, image=image_db).save()

    return json.dumps({'status': 'finished'}), 200
def update_image_storage():
    """Refresh the local image database from the remote dataset page.

    Downloads ``BASE_URL_DATASET``, empties the ``Image`` and ``Caption``
    tables and then stores one ``Image`` per table row of the page together
    with its captions. Because the tables are emptied first, running it
    several times on the same DB does not pile up duplicates.

    Grading notes (8 points): 3 for xpath (1 each), 1 for correct resource
    GET, 2 for correct DB handling, 1 for correct regex, 1 for Transaction
    explanation; -0.5 for small mistakes (return value...).

    :return: tuple ``(json_string, 200)``; the JSON is ``{"status": "finished"}``
    """
    response = requests.get(BASE_URL_DATASET)
    # Error handling: an HTTP error status becomes an exception right here.
    response.raise_for_status()

    # Everything inside the with-block runs in one transaction that is
    # committed automatically when the block ends. Database operations can
    # fail; in that case the transaction has to be aborted, and the with-block
    # takes care of that too by issuing a rollback.
    with database_holder.database.transaction():
        # Start from empty tables.
        Image.delete().execute()  # pylint: disable=no-value-for-parameter
        Caption.delete().execute()  # pylint: disable=no-value-for-parameter

        document = html.fromstring(response.text)
        picture_rows = document.xpath('/html/body/table/tr')

        # Every picture corresponds to one <tr>.
        for row in picture_rows:
            # Source attribute of the row's image and the category in front
            # of the slash.
            image_src = row.xpath('td/img/@src')[0]
            category_match = re.match(r'(\w+)\/', image_src)

            stored_image = Image(src=image_src,
                                 category=category_match.group(1))
            stored_image.save()

            # All captions of this picture, linked to the stored image.
            for cell in row.xpath('td//td'):
                Caption(text=cell.text[1:], image=stored_image).save()

    payload = json.dumps({'status': 'finished'})
    return payload, 200
def update_image_storage():
    """Download the dataset page and rebuild the Image and Caption tables.

    This operation clears the DB first, so running it multiple times on the
    same DB does not accumulate duplicate entries. The download is checked
    before the tables are touched, so a failed request leaves the stored data
    unchanged.

    Exercise rubric (8 points): 3 for xpath (1 each), 1 for correct resource
    GET, 1 for correct DB cleaning, 1 for correct DB saving, 2 for correct
    regex, 2 for Transaction explanation; -0.5 for small mistakes (return
    value...).

    :return: tuple of a JSON string and an HTTP status code:
        ``{"status": "finished"}`` with 200 on success,
        ``{"status": "failed"}`` with 504 (timeout) or 502 (any other
        request error) when the dataset could not be downloaded
    """
    # Every failure path returns an error response. Merely printing would
    # leave `page` unbound (NameError further down), and sys.exit() would
    # raise SystemExit inside the request handler instead of answering.
    try:
        page = requests.get(BASE_URL_DATASET)
        # Treat 4xx/5xx answers as failures too, so an error page is never
        # parsed as if it were the dataset.
        page.raise_for_status()
    except requests.exceptions.Timeout:
        print('A timeout occured.')
        return json.dumps({'status': 'failed'}), 504
    except requests.exceptions.TooManyRedirects:
        print('Too many redirects were made.')
        return json.dumps({'status': 'failed'}), 502
    except requests.exceptions.RequestException as e:
        print('An error occured', e)
        return json.dumps({'status': 'failed'}), 502

    # The with-block runs all enclosed statements inside one database
    # transaction: it is committed when the block finishes normally and rolled
    # back when an exception escapes the block. This matters because the
    # tables are emptied before they are refilled; a failure halfway through
    # must not leave the database empty or partially populated.
    with database_holder.database.transaction():
        # Empty databases
        Image.delete().execute()  # pylint: disable=no-value-for-parameter
        Caption.delete().execute()  # pylint: disable=no-value-for-parameter

        tree = html.fromstring(page.text)

        # One <tr> per picture; as an example, a caption cell is located at
        # /html/body/table/tr[1000]/td[2]/table/tr[5]/td.
        for picture_tree in tree.xpath('/html/body/table/tr'):
            # Extract the source attribute of the image in the first cell.
            src = next(iter(picture_tree.xpath('td[1]/img/@src')), None)
            if src is None:
                continue  # skip entry if no image is in row

            # Take only the substring with the category descriptor, i.e.
            # everything in front of the last slash (the `.*` is greedy).
            category_match = re.match(r'^(\w.*)\/', src)
            if category_match is None:
                continue  # skip entry if category can't be extracted
            category = category_match.group(1)

            # Save Image in DB, nothing magical here
            image_db = Image(src=src, category=category)
            image_db.save()

            # Save the captions additionally
            for caption_node in picture_tree.xpath('td[2]/table/*/td/text()'):
                # Remove whitespace on the edges
                caption_text = caption_node.strip()
                Caption(text=caption_text, image=image_db).save()

    return json.dumps({'status': 'finished'}), 200