def validate_result(self, result):
    """Validate the ``result`` object of a statement.

    The checks run in a fixed order: ``result`` must be a dict containing
    only allowed fields, and then every optional field that is present is
    checked individually. Each failure is reported through
    ``self.return_error`` (presumably raising; confirm against its
    definition).

    :param result: The candidate result object to validate.
    """
    # Ensure incoming result is dict and check allowed fields
    self.check_if_dict(result, "Result")
    self.check_allowed_fields(result_allowed_fields, result, "Result")

    # If duration included, ensure valid duration can be parsed from it
    if "duration" in result:
        try:
            parse_duration(result["duration"])
        except (ISO8601Error, TypeError) as e:
            # isodate's parse_duration raises TypeError (not ISO8601Error)
            # when handed a non-string value; report that as a validation
            # error as well instead of letting it escape unhandled.
            self.return_error("Error with result duration - %s" % e.message)

    # If success or completion included, ensure they are boolean
    if "success" in result:
        if not isinstance(result["success"], bool):
            self.return_error("Result success must be a boolean value")
    if "completion" in result:
        if not isinstance(result["completion"], bool):
            self.return_error("Result completion must be a boolean value")

    # If response in result, ensure it is a string
    if "response" in result:
        if not isinstance(result["response"], basestring):
            self.return_error("Result response must be a string")

    # If extensions, validate
    if "extensions" in result:
        self.validate_extensions(result["extensions"], "result")

    # If score included, validate it
    if "score" in result:
        self.validate_score(result["score"])
def validate_result(self, result):
    """Validate the ``result`` object of a statement.

    ``result`` must be a dict restricted to the allowed fields; after that
    each optional field that is present (duration, success, completion,
    response, extensions, score) is checked in turn. Failures are reported
    through ``self.return_error`` (presumably raising; confirm against its
    definition).

    :param result: The candidate result object to validate.
    """
    # Ensure incoming result is dict and check allowed fields
    self.check_if_dict(result, "Result")
    self.check_allowed_fields(result_allowed_fields, result, "Result")

    # If duration included, ensure valid duration can be parsed from it
    if 'duration' in result:
        try:
            parse_duration(result['duration'])
        except (ISO8601Error, TypeError) as e:
            # A non-string duration makes isodate's parse_duration raise
            # TypeError rather than ISO8601Error; treat it as invalid input
            # too so it does not surface as an unhandled exception.
            self.return_error("Error with result duration - %s" % e.message)

    # If success or completion included, ensure they are boolean
    if 'success' in result:
        if not isinstance(result['success'], bool):
            self.return_error("Result success must be a boolean value")
    if 'completion' in result:
        if not isinstance(result['completion'], bool):
            self.return_error("Result completion must be a boolean value")

    # If response in result, ensure it is a string
    if 'response' in result:
        if not isinstance(result['response'], basestring):
            self.return_error("Result response must be a string")

    # If extensions, validate
    if 'extensions' in result:
        self.validate_extensions(result['extensions'], 'result')

    # If score included, validate it
    if 'score' in result:
        self.validate_score(result['score'])
def validate_result(self, result):
    """Run every validation rule that applies to a statement's ``result``.

    Structural checks come first, then each optional field is inspected in
    the order duration, success, completion, response, extensions, score.
    Problems are handed to ``self.return_error``.

    :param result: The candidate result object to validate.
    """
    # Structural checks: must be a dict limited to the allowed fields
    self.check_if_dict(result, "Result")
    self.check_allowed_fields(result_allowed_fields, result, "Result")

    # A supplied duration has to be parseable
    if 'duration' in result:
        try:
            parse_duration(result['duration'])
        except Exception as e:
            self.return_error("Error with result duration - %s" % e.message)

    # success and completion, when supplied, must be genuine booleans
    boolean_rules = (
        ('success', "Result success must be a boolean value"),
        ('completion', "Result completion must be a boolean value"),
    )
    for field, complaint in boolean_rules:
        if field in result and not isinstance(result[field], bool):
            self.return_error(complaint)

    # response, when supplied, must be a string
    if 'response' in result and not isinstance(result['response'], basestring):
        self.return_error("Result response must be a string")

    # Delegate the nested structures to their own validators
    if 'extensions' in result:
        self.validate_extensions(result['extensions'], 'result')
    if 'score' in result:
        self.validate_score(result['score'])
def populateResult(self, stmt_data, verb):
    """Validate and persist the ``result`` section of a statement.

    Extensions are separated from the remaining result fields, the result is
    checked against the verb and the statement object, the duration (if any)
    is checked for parseability, the score (if any) is validated and saved,
    and finally the result itself is saved.

    :param stmt_data: Statement data; reads its ``'result'`` and ``'object'`` keys.
    :param verb: The statement verb, forwarded to ``validateVerbResult``.
    :return: Whatever ``saveResultToDB`` returns.
    :raises exceptions.ParamError: if the duration cannot be parsed.
    """
    log_message(self.log_dict, "Populating result", __name__, self.populateResult.__name__)

    # Split the extensions away from the plain result fields
    raw_result = stmt_data['result']
    if 'extensions' in raw_result:
        result = {name: val for name, val in raw_result.items() if name != 'extensions'}
        resultExts = raw_result['extensions']
    else:
        # No extensions: work directly on the statement's own result dict
        result = raw_result
        resultExts = {}

    # Catch contradictory results
    self.validateVerbResult(result, verb, stmt_data['object'])

    # A badly formatted duration is logged and rejected as a parameter error
    if 'duration' in result:
        try:
            parse_duration(result['duration'])
        except ISO8601Error as e:
            log_message(self.log_dict, e.message, __name__, self.populateResult.__name__, True)
            update_parent_log_status(self.log_dict, 400)
            raise exceptions.ParamError(e.message)

    # The result is valid against the verb: validate the score, then store it
    if 'score' in result:
        result['score'] = self.validateScoreResult(result['score'])
        result['score'] = self.saveScoreToDB(result['score'])

    # Save result
    return self.saveResultToDB(result, resultExts)
def populateResult(self, stmt_data):
    """Validate and persist the ``result`` section of a statement.

    Extensions are separated from the remaining result fields, the duration
    (if any) is checked for parseability, the score (if any) is validated
    and saved, and finally the result itself is saved.

    :param stmt_data: Statement data; reads its ``'result'`` key.
    :return: Whatever ``saveResultToDB`` returns.
    :raises exceptions.ParamError: if the duration cannot be parsed.
    """
    log_message(self.log_dict, "Populating result", __name__, self.populateResult.__name__)

    # Split the extensions away from the plain result fields
    raw_result = stmt_data['result']
    if 'extensions' in raw_result:
        result = dict(pair for pair in raw_result.items() if pair[0] != 'extensions')
        resultExts = raw_result['extensions']
    else:
        # No extensions: work directly on the statement's own result dict
        result = raw_result
        resultExts = {}

    # A badly formatted duration is logged and rejected as a parameter error
    if 'duration' in result:
        try:
            parse_duration(result['duration'])
        except ISO8601Error as e:
            log_message(self.log_dict, e.message, __name__, self.populateResult.__name__, True)
            update_parent_log_status(self.log_dict, 400)
            raise exceptions.ParamError(e.message)

    # Validate the score first, then replace it with what the save step returns
    if 'score' in result:
        result['score'] = self.validateScoreResult(result['score'])
        result['score'] = self.saveScoreToDB(result['score'])

    # Save result
    return self.saveResultToDB(result, resultExts)
def isoduration_to_seconds(d):
#=============================
    """
    Convert an ISO duration to a number of seconds.

    This is a best-effort conversion: if ``d`` cannot be parsed as an
    ISO 8601 duration it is interpreted as a plain number of seconds, and
    if that fails too the result is ``0``.

    :param d: A string representing a duration, formatted as ISO 8601.
    :return: The number of seconds, or 0 when ``d`` cannot be interpreted.
    :rtype: float
    """
    try:
        td = isoduration.parse_duration(d)
        return td.days * 86400 + td.seconds + td.microseconds / 1000000.0
    except Exception:
        # Deliberately broad (any parse failure falls through), but no longer
        # a bare ``except`` so KeyboardInterrupt/SystemExit still propagate.
        try:
            return float(d)  ## Virtuoso strips "PT" etc on import...
        except (TypeError, ValueError, OverflowError):
            return 0
def isoduration_to_seconds(d):
    # =============================
    """
    Convert an ISO duration to a number of seconds.

    The conversion is best-effort: a value that is not a parseable
    ISO 8601 duration is treated as a plain number of seconds, and
    anything that still cannot be interpreted yields ``0``.

    :param d: A string representing a duration, formatted as ISO 8601.
    :return: The number of seconds, or 0 when ``d`` cannot be interpreted.
    :rtype: float
    """
    try:
        td = isoduration.parse_duration(d)
        return td.days * 86400 + td.seconds + td.microseconds / 1000000.0
    except Exception:
        # Kept broad on purpose so every parse failure reaches the fallback;
        # unlike a bare ``except`` it lets KeyboardInterrupt/SystemExit through.
        try:
            return float(d)  ## Virtuoso strips "PT" etc on import...
        except (TypeError, ValueError, OverflowError):
            return 0
def scrape(self, video_id, fileName):
    """Store and index the Japanese captions of a YouTube video.

    Video metadata is fetched through ``self.youtube``. The captions come
    either from the YouTube captions API or from a local transcript file.
    They are parsed with ``CaptionParser``, the video record is inserted
    into the ``nekotube`` Mongo database, and every parsed caption line is
    indexed in Elasticsearch.

    :param video_id: YouTube video id to scrape.
    :param fileName: Path to a UTF-8 transcript file. When empty (or
        ``None``) the captions are downloaded from YouTube instead.
    :raises ValueError: if the video is not found, its duration cannot be
        parsed, or no (Japanese) captions are available.
    """
    mongo_client = MongoClient()
    db = mongo_client['nekotube']
    es = Elasticsearch()

    # Loads metadata
    metadata = self.youtube.videos().list(part='snippet,contentDetails',
                                          id=video_id).execute()
    try:
        snippet = metadata['items'][0]['snippet']
    except (KeyError, IndexError, TypeError) as exc:
        raise ValueError('Video not found.') from exc

    try:
        duration = parse_duration(
            metadata['items'][0]['contentDetails']['duration']).total_seconds()
    except Exception as exc:
        # Any lookup or parse failure is reported uniformly; the original
        # cause stays attached for debugging.
        raise ValueError('Could not parse duration') from exc

    if not fileName:
        # Loads captions for specified YouTube video id
        results = self.youtube.captions().list(part='snippet',
                                               videoId=video_id).execute()
        if 'items' not in results:
            raise ValueError('no captions in video ' + video_id)

        # Locate captions resource (no early exit: the last 'ja' track wins)
        captions_resource = None
        for item in results['items']:
            if item['snippet']['language'] == 'ja':
                captions_resource = item
        if captions_resource is None:
            raise ValueError('no Japanese captions in video ' + video_id)

        # Download raw captions file from youtube
        captions = self.youtube.captions().download(
            id=captions_resource['id']).execute().decode('utf-8')
    else:
        # Transcript file was provided so skip the YouTube captions download.
        # The context manager closes the file even if reading fails.
        with open(fileName, encoding='utf-8') as f:
            captions = f.read()

    # Tokenizes the Japanese captions and translates Kanji into Hiragana
    parser = CaptionParser(captions)
    parsed_captions = parser.parse()

    # Create record in Mongo
    result = db.videos.insert_one({
        'youtubeVideoId': video_id,
        'title': snippet['title'],
        'thumbnails': snippet['thumbnails'],
        'captionData': parsed_captions,
        'duration': duration
    })
    print('Video inserted. id: {}'.format(result.inserted_id))

    # Index each caption line
    for line_index, line in enumerate(parsed_captions):
        doc = {
            'youtubeVideoId': video_id,
            'refId': str(result.inserted_id),
            'chunkIndex': line_index,
            'original': line['original'],
            'inverted': line['inverted']
        }
        res = es.index(index='nekotube', doc_type='caption_line', body=doc)
        print(res)

    print(json.dumps(parsed_captions))
def parse_interval(interval_string):
    """Parse an ISO 8601 interval string into an ``Interval``.

    The string is split on ``SEGMENT_DELIM`` and must contain two segments,
    optionally preceded by a repeat segment::

        start/end    start/duration    duration/end
        Rn/start/end Rn/start/duration Rn/duration/end

    :param interval_string: The interval text to parse.
    :return: The parsed ``Interval`` (also printed, as before).
    :raises TypeError: if ``interval_string`` is not a string.
    :raises ISO8601Error: if the number of segments is wrong, a segment is
        empty, the repeat notation is invalid, or a term cannot be parsed.
    """
    if not isinstance(interval_string, string_types):
        raise TypeError("Expecting a string")

    segment_count = interval_string.count(SEGMENT_DELIM)
    if segment_count < 1 or segment_count > 2:
        raise ISO8601Error(
            "Improper number of interval string segments. Must have 1 or 2")

    segments = interval_string.split(SEGMENT_DELIM)
    for idx, seg in enumerate(segments):
        if len(seg) == 0:
            # Was ``return``: the error object was handed back as a result.
            raise ISO8601Error("Interval segment index %s was empty" % idx)

    count = None
    if len(segments) == 3:
        # Rn/start/end
        # Rn/start/duration
        # Rn/duration/end
        match = ISO8601_REPEAT_REGEX.match(segments[0])
        if not match:
            raise ISO8601Error("Repeat notation did not match expected")
        count = match.groupdict().get("count", None)
        # An unmatched group is None; only convert an actual digit string.
        if count:
            count = int(count)
        segments = segments[1:]

    s0 = segments[0]
    s1 = segments[1]

    # remaining segments are either
    # 1) start/end.
    #    start must be a fully specified datetime format
    #    end can either be a time, date, or datetime
    # 2) start/duration
    #    start must be a fully specified datetime format
    #    duration must be a valid duration format
    # 3) duration/end
    #    duration must be a valid duration format
    #    end must be a fully specified datetime format
    start = None
    end = None
    duration = None

    try:
        start = parse_datetime(s0)
        print("second to last term is a datetime")
    except Exception:
        try:
            duration = parse_duration(s0)
            print("second to last term is a duration")
        except Exception:
            raise ISO8601Error(
                "First term after repeat must be either "
                "a fully specified datetime or a valid duration")

    # look at last term
    # delegating to the other parsers avoids duplicating their regexes
    if start is not None:
        # last term must be a duration, date, time or datetime
        end_parsers = (
            (parse_datetime, "last term is a datetime"),
            (parse_date, "last term is a date"),
            (parse_time, "last term is a time"),
        )
        for parse_end, note in end_parsers:
            try:
                end = parse_end(s1)
            except Exception:
                continue
            print(note)
            break
        else:
            try:
                duration = parse_duration(s1)
                print("last term is a duration")
            except Exception:
                raise ISO8601Error(
                    "When first term after repeat is a datetime, "
                    "last term must be either a duration, datetime, date, or time")
    else:
        # First term was a duration (tested via ``start is None`` because a
        # zero-length duration is falsy); last term must be the end datetime.
        try:
            end = parse_datetime(s1)
        except Exception:
            raise ISO8601Error("If first term after repeat is a duration, "
                               "last term must be a datetime")

    interval = Interval(start=start, end=end, duration=duration, repeat=count)
    print(interval)
    return interval
def duration_from_iso(iso_duration: str) -> Duration:
    """
    Build a :class:`pendulum.Duration` from an ISO-8601 duration string.

    Raises:

    - :exc:`isodate.isoerror.ISO8601Error` for bad input
    - :exc:`ValueError` if the input had non-integer year or month values

    Notes:

    - ISO-8601 durations look like ``P[n]Y[n]M[n]DT[n]H[n]M[n]S``; see
      https://en.wikipedia.org/wiki/ISO_8601#Durations.
    - The extremes ``pendulum.Duration.min`` and ``pendulum.Duration.max``
      are ``Duration(weeks=-142857142, days=-5)`` and
      ``Duration(weeks=142857142, days=6)`` respectively.
    - ``isodate`` accepts negative durations written as ``-P<something>``
      (e.g. ``-PT5S``, "minus 5 seconds") but not forms such as ``PT-5S``.
    - Whether ISO-8601 itself allows negative durations is unclear;
      https://github.com/moment/moment/issues/2408 suggests not, although
      many implementations (``isodate`` included, to a limited extent)
      support the idea.

    .. code-block:: python

        from pendulum import DateTime
        from cardinal_pythonlib.datetimefunc import duration_from_iso
        from cardinal_pythonlib.logs import main_only_quicksetup_rootlogger
        main_only_quicksetup_rootlogger()

        d1 = duration_from_iso("P5W")
        d2 = duration_from_iso("P3Y1DT3H1M2S")
        d3 = duration_from_iso("P7000D")
        d4 = duration_from_iso("P1Y7000D")
        d5 = duration_from_iso("PT10053.22S")
        d6 = duration_from_iso("PT-10053.22S")  # raises ISO8601 error
        d7 = duration_from_iso("-PT5S")
        d8 = duration_from_iso("PT-5S")  # raises ISO8601 error
        now = DateTime.now()
        print(now)
        print(now + d1)
        print(now + d2)
        print(now + d3)
        print(now + d4)

    """
    duration = parse_duration(
        iso_duration
    )  # type: Union[datetime.timedelta, IsodateDuration]  # noqa

    # Dispatch on the concrete type the parser handed back.
    if isinstance(duration, datetime.timedelta):
        return pendulum_duration_from_timedelta(duration)
    if isinstance(duration, IsodateDuration):
        return pendulum_duration_from_isodate_duration(duration)

    raise AssertionError(
        f"Bug in isodate.parse_duration, which returned unknown duration "
        f"type: {duration!r}")