コード例 #1
0
def update_image_storage():
    """Rebuild the image and caption tables from the dataset page.

    Downloads ``BASE_URL_DATASET``, parses it, and then -- inside one database
    transaction -- empties both tables and stores one ``Image`` per table row
    together with its ``Caption`` entries. Because the tables are emptied
    first, running the operation multiple times on the same DB does not
    accumulate duplicates.

    Rows without an image, or whose ``src`` has no leading ``<category>/``
    segment, are skipped.

    Grading rubric kept from the exercise text:
    8 Points: 3 for xpath (1 each), 1 for correct resource GET, 2 for correct DB entry handling,
              2 for correct regex, 2 for Transaction explanation
    -0.5 for small mistakes (return value...)

    :return: tuple of (JSON status string, HTTP status code) on success
    :raises requests.exceptions.RequestException: if the dataset page cannot
        be fetched or the server answers with an HTTP error status
    """

    # A timeout keeps a stalled server from blocking this call forever.
    answer = requests.get(BASE_URL_DATASET, timeout=30)
    # Error handling: turn 4xx/5xx answers into an exception before any
    # existing data is touched.
    answer.raise_for_status()

    # NOTE(review): the XPath expressions and the regex below assume a page of
    # the form /html/body/table/tr, each row holding an <img> in a cell and the
    # captions in nested cells -- confirm against the actual dataset page.
    tree = html.fromstring(answer.text)

    # transaction() opens a database transaction: it is committed when the
    # with-block ends normally and rolled back when an exception escapes it.
    # This matters here because the import starts by deleting everything; if
    # parsing or saving fails half-way, the rollback restores the previous
    # content instead of leaving an empty or partially filled database.
    with database_holder.database.transaction():
        # Empty databases; captions go first because they reference images.
        Caption.delete().execute()  # pylint: disable=no-value-for-parameter
        Image.delete().execute()  # pylint: disable=no-value-for-parameter

        # One table row per picture.
        for pictureTree in tree.xpath('/html/body/table/tr'):
            # Rows without an image (e.g. a header row) have nothing to import.
            sources = pictureTree.xpath('td/img/@src')
            if not sources:
                continue
            src = sources[0]

            # The category is the leading path segment of src ("<category>/...").
            match = re.match(r'(\w+)/', src)
            if match is None:
                continue
            category = match.group(1)

            # save Image in DB, nothing magical here
            imageDb = Image(src=src, category=category)
            imageDb.save()

            # Every nested cell of the row holds one caption.
            for captionTree in pictureTree.xpath('td//td'):
                # .text is None for an empty cell.
                caption_text = (captionTree.text or '').strip()
                if not caption_text:
                    continue
                Caption(text=caption_text, image=imageDb).save()

    return json.dumps({'status': 'finished'}), 200
コード例 #2
0
def get_closed_captions(video_id):
    """Fetch the captions of one video and persist them.

    The text returned by ``get_video_captions(video_id)`` is stored as the
    ``body`` of a new ``Caption`` row for that video, and the session is
    committed.

    :param video_id: identifier passed to ``get_video_captions`` and stored
        on the ``Caption`` row
    :return: the ``jsonify`` response for ``{"status": "success"}``
    """
    caption_body = get_video_captions(video_id)
    db.session.add(Caption(body=caption_body, video_id=video_id))
    db.session.commit()

    return jsonify({"status": 'success'})
コード例 #3
0
def update_image_storage():
    """Rebuild the image and caption tables from the dataset page.

    This operation clears the DB first, so running it multiple times on the
    same DB does not accumulate duplicates. Rows that carry no image, or whose
    ``src`` does not start with a ``<category>/`` segment, are skipped instead
    of aborting the whole import.

    Grading rubric kept from the exercise text:
    8 Points: 3 for xpath (1 each), 1 for correct resource GET, 2 for correct DB handling,
              1 for correct regex, 1 for Transaction explanation
    -0.5 for small mistakes (return value...)

    :return: tuple of (JSON status string, HTTP status code) on success
    :raises requests.exceptions.RequestException: if the dataset page cannot
        be fetched or the server answers with an HTTP error status
    """

    # A timeout keeps a stalled server from blocking this call forever.
    answer = requests.get(BASE_URL_DATASET, timeout=30)
    # Error handling: 4xx/5xx answers become an exception before the existing
    # data is touched.
    answer.raise_for_status()

    # Parse before opening the transaction so it stays as short as possible.
    tree = html.fromstring(answer.text)

    # This line starts a new transaction and automatically commits it at the end of the with-clause.
    # It is needed because database operations can fail. Then, the transaction would have to be aborted.
    # The with-clause also takes care of this and issues a rollback, so a failed
    # import leaves the previous content in place instead of an emptied database.
    with database_holder.database.transaction():
        # Empty databases; captions go first because they reference images.
        Caption.delete().execute()  # pylint: disable=no-value-for-parameter
        Image.delete().execute()  # pylint: disable=no-value-for-parameter

        # for every picture (corresponds to tr)
        for pictureTree in tree.xpath('/html/body/table/tr'):
            # get source; a row without an image has nothing to import
            sources = pictureTree.xpath('td/img/@src')
            if not sources:
                continue
            src = sources[0]

            # get category: the word characters before the first '/' of src
            match = re.match(r'(\w+)\/', src)
            if match is None:
                continue
            category = match.group(1)

            # save Image in DB, nothing magical here
            imageDb = Image(src=src, category=category)
            imageDb.save()

            # get all captions and save them
            for captionTree in pictureTree.xpath('td//td'):
                # .text is None for an empty cell; slicing it would raise.
                if captionTree.text is None:
                    continue
                # [1:] drops the first character -- presumably a leading
                # space in the markup; confirm against the dataset page.
                caption_text = captionTree.text[1:]
                Caption(text=caption_text, image=imageDb).save()

    return json.dumps({'status': 'finished'}), 200
コード例 #4
0
def write_caption(token):
    """Store the caption entries of one video sent as a JSON request body.

    The body is read through the keys used below: ``video_id`` and
    ``caption``, the latter a sequence of items with ``text``, ``start`` and
    ``duration``. If a ``Caption`` row already exists for the video, nothing
    is written.

    :param token: not used inside this function -- presumably supplied by an
        authentication decorator; confirm against the route definition
    :return: dict with a single ``message`` key
    """
    caption_json = request.get_json()
    if not caption_json:
        # NOTE(review): 403 is kept so existing clients see no change; 400
        # would be the conventional status for a missing/empty body.
        abort(403)

    video_id = caption_json['video_id']

    # Any existing row means this video was collected before.
    if Caption.query.filter_by(video_id=video_id).first():
        # format() instead of '+': a non-string id (e.g. a JSON number) would
        # make string concatenation raise TypeError.
        return {'message': '{} already collected'.format(video_id)}

    for caption_request in caption_json['caption']:
        caption = Caption(
            text=caption_request['text'],
            start=caption_request['start'],
            duration=caption_request['duration'],
            video_id=video_id,
        )
        db.session.add(caption)
    # Single commit: all entries of the video are written together.
    db.session.commit()
    return {'message':'הכל בסדר'}
コード例 #5
0
def _int_query_arg(name, default):
    """Return query parameter *name* as an int, or *default* if absent or invalid."""
    try:
        return int(request.args.get(name))
    except (TypeError, ValueError):
        # TypeError: parameter missing (None); ValueError: not an integer.
        return default


def search_for_caption():
    """Search captions for the ``q`` query parameter, one page at a time.

    Query parameters read from the request:
      * ``q``        -- search term passed to ``Caption.search``
      * ``page``     -- page number, falls back to 1 when missing or invalid
      * ``per_page`` -- page size, falls back to 10 when missing or invalid

    :return: ``{'captions': [...]}`` holding the ``repr`` of every match, or
        ``{'message': ...}`` when ``Caption.search`` raises AttributeError or
        TypeError
    """
    q = request.args.get('q')
    page = _int_query_arg('page', 1)
    per_page = _int_query_arg('per_page', 10)

    try:
        # The second element of the result is not needed here.
        captions, _total = Caption.search(q, page=page, per_page=per_page)
    except (AttributeError, TypeError):
        return {'message':'looks like there is not a page'}

    return {'captions': [repr(caption) for caption in captions.all()]}
コード例 #6
0
def caption_data(video_ids):
    """Fetch the transcripts of several videos and store one Caption per video.

    For every video in the first element of the ``get_transcripts`` result,
    the ``text`` of all transcript entries is concatenated (each entry
    followed by one space) and saved as the ``body`` of a new ``Caption`` row.
    Each video is committed on its own.

    :param video_ids: passed through as ``video_ids`` to
        ``YouTubeTranscriptApi.get_transcripts``
    :return: ``{"status": "success"}``
    """
    transcript_data = YouTubeTranscriptApi.get_transcripts(
        video_ids=video_ids, continue_after_error=True)

    # Element 0 is used as a mapping: video id -> list of entries with 'text'.
    for vid, entries in transcript_data[0].items():
        # A space after every entry keeps words of adjacent entries apart.
        # NOTE(review): this leaves a trailing space in the stored body; it is
        # kept so newly stored rows match the format of existing ones.
        caption_text = "".join(entry['text'] + ' ' for entry in entries)

        db.session.add(Caption(body=caption_text, video_id=vid))
        # Per-video commit: videos stored so far survive a later failure.
        db.session.commit()

    return {"status": 'success'}
コード例 #7
0
def update_image_storage():
    """Rebuild the image and caption tables from the dataset page.

    This operation clears the DB first, so running it multiple times on the
    same DB does not accumulate duplicates. Rows without an image, or whose
    ``src`` yields no category, are skipped.

    Grading rubric kept from the exercise text:
    8 Points: 3 for xpath (1 each), 1 for correct resource GET, 1 for correct DB cleaning, 1 for correct DB saving,
              2 for correct regex, 2 for Transaction explanation
    -0.5 for small mistakes (return value...)

    :return: tuple of (JSON status string, HTTP status code):
        ``{'status': 'finished'}`` with 200 on success, ``{'status': 'failed'}``
        with 502 when the dataset page could not be fetched
    """

    # Fetch the dataset. Every failure path returns right away: continuing
    # without a response would only crash later on the unbound `page`.
    try:
        # A timeout keeps a stalled server from blocking this call forever.
        page = requests.get(BASE_URL_DATASET, timeout=30)
        # 4xx/5xx answers are not raised by get() itself.
        page.raise_for_status()
    except requests.exceptions.Timeout:
        print('A timeout occured.')
        return json.dumps({'status': 'failed'}), 502
    except requests.exceptions.TooManyRedirects:
        print('Too many redirects were made.')
        return json.dumps({'status': 'failed'}), 502
    except requests.exceptions.RequestException as e:
        # Report the failure to the caller instead of exiting the process.
        print('An error occured', e)
        return json.dumps({'status': 'failed'}), 502

    # Parse before opening the transaction so it stays as short as possible.
    tree = html.fromstring(page.text)

    # transaction() opens a database transaction: it is committed when the
    # with-block ends normally and rolled back when an exception escapes it.
    # It is needed because the import starts by deleting everything; if
    # saving fails half-way, the rollback restores the previous content
    # instead of leaving an empty or partially filled database.
    with database_holder.database.transaction():
        # Empty databases; captions go first because they reference images.
        Caption.delete().execute()  # pylint: disable=no-value-for-parameter
        Image.delete().execute()  # pylint: disable=no-value-for-parameter

        # Example of a full path to one caption cell, as noted by the author:
        # /html/body/table/tr[1000]/td[2]/table/tr[5]/td
        for pictureTree in tree.xpath('/html/body/table/tr'):
            # Extract the source attribute
            src = next(iter(pictureTree.xpath('td[1]/img/@src')), None)
            if src is None:
                continue  # skip entry if no image is in row

            # Take only substring with category descriptor. The match object
            # must be checked before .group(): re.match returns None when the
            # pattern does not apply.
            match = re.match(r'^(\w.*)\/', src)
            if match is None:
                continue  # skip entry if category can't be extracted
            category = match.group(1)

            # Save Image in DB, nothing magical here
            imageDb = Image(src=src, category=category)
            imageDb.save()

            # Save the captions additionally
            for captionTree in pictureTree.xpath('td[2]/table/*/td/text()'):
                # Remove whitespaces on edges
                caption_text = captionTree.strip()
                Caption(text=caption_text, image=imageDb).save()

    return json.dumps({'status': 'finished'}), 200