def update_image_storage():
    """Rebuild the image and caption tables from the remote dataset page.

    Downloads ``BASE_URL_DATASET``, parses it as HTML and stores one ``Image``
    row per picture row plus one ``Caption`` row per caption cell. Existing
    rows are removed first, so running this repeatedly on the same database
    does not accumulate duplicates.

    Grading notes kept from the exercise template:
    8 Points: 3 for xpath (1 each), 1 for correct resource GET, 2 for correct
    DB entry handling, 2 for correct regex, 2 for Transaction explanation,
    -0.5 for small mistakes (return value...).

    :return: tuple of (JSON string ``{"status": "finished"}``, 200)
    :raises requests.exceptions.RequestException: if the download fails or
        the server answers with an error status.
    """
    # Function-scope imports keep this block self-contained; the template
    # states that lxml is already installed.
    import re
    from lxml import html

    answer = requests.get(BASE_URL_DATASET)
    # Fail loudly on 4xx/5xx instead of parsing an error page as data.
    answer.raise_for_status()

    # Parse before touching the database so that a malformed document never
    # opens a transaction at all.
    tree = html.fromstring(answer.text)

    # The transaction context manager groups every statement below into one
    # unit: it commits when the block exits normally and rolls back if an
    # exception escapes (peewee-style semantics; confirm against
    # database_holder). Without it, a failure halfway through would leave
    # the tables emptied or only partly refilled.
    with database_holder.database.transaction():
        # Captions reference images (Caption(image=...)), so delete the
        # dependent rows first.
        Caption.delete().execute()  # pylint: disable=no-value-for-parameter
        Image.delete().execute()  # pylint: disable=no-value-for-parameter

        # NOTE(review): the xpath expressions assume one picture per
        # top-level table row with captions in a nested table; verify
        # against the dataset markup.
        for picture_tree in tree.xpath('/html/body/table/tr'):
            sources = picture_tree.xpath('td/img/@src')
            if not sources:
                # Row without an image (e.g. a header row): nothing to store.
                continue
            src = sources[0]

            # The category is the leading path segment of the image source.
            category_match = re.match(r'(\w+)/', src)
            if category_match is None:
                continue

            # save Image in DB, nothing magical here
            image_db = Image(src=src, category=category_match.group(1))
            image_db.save()

            for caption_tree in picture_tree.xpath('td//td'):
                caption_text = (caption_tree.text or '').strip()
                if not caption_text:
                    continue
                Caption(text=caption_text, image=image_db).save()

    return json.dumps({'status': 'finished'}), 200
def get_closed_captions(video_id):
    """Fetch the captions of one video and persist them as a Caption row.

    :param video_id: identifier passed to ``get_video_captions`` and stored
        on the new ``Caption``.
    :return: the result of ``jsonify`` for ``{"status": "success"}``.
    """
    caption_body = get_video_captions(video_id)

    # Stage the new row and commit it immediately.
    db.session.add(Caption(body=caption_body, video_id=video_id))
    db.session.commit()

    return jsonify({"status": 'success'})
def update_image_storage():
    """Rebuild the image and caption tables from the remote dataset page.

    This operation clears the DB first, so it can be run multiple times on
    the same DB without accumulating duplicates.

    Grading notes:
    8 Points: 3 for xpath (1 each), 1 for correct resource GET, 2 for correct
    DB handling, 1 for correct regex, 1 for Transaction explanation,
    -0.5 for small mistakes (return value...).

    :return: tuple of (JSON string ``{"status": "finished"}``, 200)
    :raises requests.exceptions.HTTPError: if the server answers with an
        error status (via ``raise_for_status``).
    """
    answer = requests.get(BASE_URL_DATASET)
    # Error handling: turn 4xx/5xx answers into an exception.
    answer.raise_for_status()

    # Parse up front; a document that cannot be parsed should not open a
    # transaction in the first place.
    tree = html.fromstring(answer.text)

    # This line starts a new transaction and automatically commits it at the
    # end of the with-clause. It is needed because database operations can
    # fail; the transaction then has to be aborted, and the with-clause takes
    # care of that by issuing a rollback.
    with database_holder.database.transaction():
        # Empty databases. Captions reference images (Caption(image=...)),
        # so the dependent rows go first.
        Caption.delete().execute()  # pylint: disable=no-value-for-parameter
        Image.delete().execute()  # pylint: disable=no-value-for-parameter

        # for every picture (corresponds to tr)
        for picture_tree in tree.xpath('/html/body/table/tr'):
            sources = picture_tree.xpath('td/img/@src')
            if not sources:
                # Row without an image: skip instead of raising IndexError.
                continue
            src = sources[0]

            # The category is the leading word-character segment before the
            # first slash of the source path.
            category_match = re.match(r'(\w+)/', src)
            if category_match is None:
                # Source without a category prefix: skip instead of raising
                # AttributeError on None.
                continue

            # save Image in DB, nothing magical here
            image_db = Image(src=src, category=category_match.group(1))
            image_db.save()

            # get all captions and save them
            for caption_tree in picture_tree.xpath('td//td'):
                raw_text = caption_tree.text
                if raw_text is None:
                    # Cell without direct text content.
                    continue
                # Drops the first character -- presumably a leading
                # separator in the dataset markup; TODO confirm.
                Caption(text=raw_text[1:], image=image_db).save()

    return json.dumps({'status': 'finished'}), 200
def write_caption(token):
    """Store the captions of one video sent as the JSON request body.

    The body is read as an object with a ``video_id`` and a ``caption`` list
    whose entries carry ``text``, ``start`` and ``duration``. If captions for
    that ``video_id`` already exist, nothing is written.

    :param token: not used inside this function.
    :return: dict with a ``message`` key.

    Aborts with 403 when the body is missing or empty (original behavior;
    NOTE(review): 400 may be the more fitting status) and with 400 when the
    body does not have the structure described above. ``abort`` is presumably
    Flask's and raises -- confirm.
    """
    caption_json = request.get_json()
    if not caption_json:
        abort(403)
    if not isinstance(caption_json, dict) or 'video_id' not in caption_json:
        abort(400)
    video_id = caption_json['video_id']

    already_stored = Caption.query.filter_by(video_id=video_id).first()
    if already_stored:
        # format() instead of "+" so a non-string video_id cannot raise.
        return {'message': '{} already collected'.format(video_id)}

    entries = caption_json.get('caption')
    if not isinstance(entries, list):
        abort(400)

    # Validate every entry before touching the session, so a malformed entry
    # cannot leave a partially stored video behind.
    required_keys = ('text', 'start', 'duration')
    for entry in entries:
        if not isinstance(entry, dict):
            abort(400)
        if any(key not in entry for key in required_keys):
            abort(400)

    try:
        for entry in entries:
            db.session.add(Caption(text=entry['text'],
                                   start=entry['start'],
                                   duration=entry['duration'],
                                   video_id=video_id))
        # One commit for the whole video: a partial write would otherwise be
        # reported as "already collected" on the next request.
        db.session.commit()
    except Exception:
        # Discard the staged rows, then let the error propagate unchanged.
        db.session.rollback()
        raise

    return {'message':'הכל בסדר'}
def _int_query_arg(args, name, default):
    """Return ``args.get(name)`` converted to int, or ``default`` if absent or not numeric."""
    try:
        return int(args.get(name))
    except (TypeError, ValueError):
        # TypeError: parameter missing (None); ValueError: not an integer.
        return default


def search_for_caption():
    """Search captions using the ``q``, ``page`` and ``per_page`` query parameters.

    ``page`` defaults to 1 and ``per_page`` to 10 when missing or not an
    integer.

    :return: dict with a ``captions`` list holding the ``repr`` of every hit,
        or a dict with a ``message`` key when ``Caption.search`` raises
        ``AttributeError`` or ``TypeError``.
    """
    q = request.args.get('q')
    page = _int_query_arg(request.args, 'page', 1)
    per_page = _int_query_arg(request.args, 'per_page', 10)

    try:
        # The total hit count is returned as well but not used here.
        captions, _total = Caption.search(q, page=page, per_page=per_page)
    except (AttributeError, TypeError):
        return {'message':'looks like there is not a page'}

    return {'captions': [repr(caption) for caption in captions.all()]}
def caption_data(video_ids):
    """Fetch transcripts for several videos and stage one Caption row per video.

    The text of every transcript segment is concatenated, each followed by a
    single space, into the caption body. The text of the first two segments
    of every video is also printed.

    :param video_ids: forwarded to ``YouTubeTranscriptApi.get_transcripts``.
    :return: ``{"status": "success"}``
    """
    # Only the first element of the result is used; with
    # continue_after_error=True the videos that failed are presumably
    # reported in the second element -- verify against the library.
    fetched = YouTubeTranscriptApi.get_transcripts(
        video_ids=video_ids, continue_after_error=True)
    transcripts_by_video = fetched[0]

    for vid, segments in transcripts_by_video.items():
        pieces = []
        for position, segment in enumerate(segments):
            # Original author's note: this is where the texts are believed
            # to get concatenated; the first two are printed for inspection.
            if position < 2:
                print(segment['text'])
            pieces.extend((segment['text'], ' '))

        db.session.add(Caption(body="".join(pieces), video_id=vid))

    db.session.commit()
    return {"status": 'success'}
def update_image_storage():
    """Rebuild the image and caption tables from the remote dataset page.

    This operation clears the DB first, so it can be run multiple times on
    the same DB without accumulating duplicates.

    Grading notes:
    8 Points: 3 for xpath (1 each), 1 for correct resource GET, 1 for correct
    DB cleaning, 1 for correct DB saving, 2 for correct regex, 2 for
    Transaction explanation, -0.5 for small mistakes (return value...).

    :return: tuple of (JSON string, status code): ``{"status": "finished"}``
        with 200 on success, or ``{"status": "failed", "error": ...}`` with
        502 when the dataset could not be downloaded.
    """
    try:
        page = requests.get(BASE_URL_DATASET)
        page.raise_for_status()
    except requests.exceptions.RequestException as error:
        # Timeout, TooManyRedirects and HTTPError all derive from
        # RequestException. Report the failure to the caller: continuing
        # would hit an unbound ``page``, and exiting the process would take
        # the whole application down.
        return json.dumps({'status': 'failed', 'error': str(error)}), 502

    tree = html.fromstring(page.text)

    # The transaction context manager groups every statement below into one
    # unit: it commits when the block exits normally and rolls back if an
    # exception escapes (peewee-style semantics; confirm against
    # database_holder). It is needed because the tables are emptied before
    # they are refilled -- without it, a failure halfway through would leave
    # the database empty or only partly filled.
    with database_holder.database.transaction():
        # Empty databases. Captions reference images (Caption(image=...)),
        # so the dependent rows go first.
        Caption.delete().execute()  # pylint: disable=no-value-for-parameter
        Image.delete().execute()  # pylint: disable=no-value-for-parameter

        # One picture per top-level table row. The original note records
        # /html/body/table/tr[1000]/td[2]/table/tr[5]/td as an example path
        # of a caption cell.
        for picture_tree in tree.xpath('/html/body/table/tr'):
            # Extract the source attribute
            src = next(iter(picture_tree.xpath('td[1]/img/@src')), None)
            if src is None:
                continue  # skip entry if no image is in row

            # Take only the substring with the category descriptor. The
            # pattern is greedy: it captures everything up to the last slash.
            category_match = re.match(r'^(\w.*)/', src)
            if category_match is None:
                continue  # skip entry if category can't be extracted

            # Save Image in DB, nothing magical here
            image_db = Image(src=src, category=category_match.group(1))
            image_db.save()

            # Save the captions additionally
            for caption_node in picture_tree.xpath('td[2]/table/*/td/text()'):
                # Remove whitespace on the edges
                Caption(text=caption_node.strip(), image=image_db).save()

    return json.dumps({'status': 'finished'}), 200