def guess_date(string, node=None, options=None):
    """Look for a date in ``string`` and report it as a guess.

    ``options``, when given and non-empty, supplies the ``date_year_first``
    and ``date_day_first`` values forwarded to ``search_date``; otherwise
    both are passed as ``False``. ``node`` is accepted but not used here.

    Returns ``({'date': date}, span)`` when ``search_date`` reports a date,
    and ``(None, None)`` otherwise.
    """
    year_first = options.get('date_year_first') if options else False
    day_first = options.get('date_day_first') if options else False

    found, where = search_date(string, year_first, day_first)
    if not found:
        return None, None
    return {'date': found}, where
def guess_date(string, node=None, options=None):
    """Look for a well-delimited date in ``string``.

    The ``date_year_first`` / ``date_day_first`` entries of ``options`` are
    forwarded to ``search_date`` (``False`` for both when ``options`` is
    missing or empty). ``node`` is accepted but not used here.

    Returns ``({'date': date}, span)`` only when a date and a span were
    found and ``DefaultValidator.validate_string`` accepts that span;
    returns ``(None, None)`` in every other case.
    """
    year_first = options.get('date_year_first') if options else False
    day_first = options.get('date_day_first') if options else False

    found, where = search_date(string, year_first, day_first)
    if not found or not where:
        return None, None

    # ensure we have a separator before and after date
    if not DefaultValidator.validate_string(string, where):
        return None, None

    return {'date': found}, where
def guess_date(self, string, node=None, options=None):
    """Search ``string`` for a date and wrap the hit as a guess.

    When ``options`` is given and non-empty, its ``date_year_first`` and
    ``date_day_first`` entries are handed to ``search_date``; otherwise both
    are ``False``. Neither ``self`` nor ``node`` is read here.

    Returns ``({"date": date}, span)`` on a hit, ``(None, None)`` otherwise.
    """
    if options:
        year_first = options.get("date_year_first")
        day_first = options.get("date_day_first")
    else:
        year_first = day_first = False

    hit, extent = search_date(string, year_first, day_first)
    return ({"date": hit}, extent) if hit else (None, None)
def guess_date(self, string, node=None, options=None):
    """Search ``string`` for a date using ``search_date``'s defaults.

    ``self``, ``node`` and ``options`` are accepted but not consulted here.

    Returns ``({'date': date}, span)`` on a hit, ``(None, None)`` otherwise.
    """
    hit, extent = search_date(string)
    if not hit:
        return None, None
    return {'date': hit}, extent
def guess_date(string):
    """Return ``({'date': date}, span)`` if ``search_date`` finds a date in
    ``string``, else ``(None, None)``.
    """
    found, where = search_date(string)
    return ({'date': found}, where) if found else (None, None)
def test_date(self):
    """Check ``search_year`` and ``search_date`` against sample sentences.

    Each case pairs an input (plus optional keyword arguments) with the
    expected ``(value, span)`` result; cases run in the listed order.
    """

    def two_digit_year(day):
        # last two digits of the year as an int (e.g. 2014 -> 14)
        return int(str(day.year)[2:])

    year_cases = [
        (' in the year 2000... ', (2000, (13, 17))),
        (' they arrived in 1492. ', (None, None)),
    ]
    for text, expected in year_cases:
        self.assertEqual(search_year(text), expected)

    # reference days relative to the day the test runs
    today = date.today()
    future = today + timedelta(days=1000)
    past = today - timedelta(days=10000)
    today_year_2 = two_digit_year(today)
    future_year_2 = two_digit_year(future)
    past_year_2 = two_digit_year(past)

    nothing = (None, None)
    short_span = (18, 26)
    long_span = (18, 28)
    april_22_2002 = date(2002, 4, 22)

    date_cases = [
        # four-digit years
        (' Something before 2002-04-22 ', {}, (april_22_2002, long_span)),
        (' 2002-04-22 Something after ', {}, (april_22_2002, (1, 11))),
        (' This happened on 2002-04-22. ', {}, (april_22_2002, long_span)),
        (' This happened on 22-04-2002. ', {}, (april_22_2002, long_span)),

        # day-month-year with a two-digit year
        (' This happened on 13-04-%s. ' % (today_year_2,), {},
         (date(today.year, 4, 13), short_span)),
        (' This happened on 22-04-%s. ' % (future_year_2,), {},
         (date(future.year, 4, 22), short_span)),
        (' This happened on 20-04-%s. ' % (past_year_2), {},
         (date(past.year, 4, 20), short_span)),

        # explicit year_first switch
        (' This happened on 13-06-14. ', {'year_first': True},
         (date(2013, 6, 14), short_span)),
        (' This happened on 13-05-14. ', {'year_first': False},
         (date(2014, 5, 13), short_span)),

        # month-day-year with a two-digit year
        (' This happened on 04-13-%s. ' % (today_year_2,), {},
         (date(today.year, 4, 13), short_span)),
        (' This happened on 04-22-%s. ' % (future_year_2,), {},
         (date(future.year, 4, 22), short_span)),
        (' This happened on 04-20-%s. ' % (past_year_2), {},
         (date(past.year, 4, 20), short_span)),

        # out-of-range day/month values
        (' This happened on 35-12-%s. ' % (today_year_2,), {}, nothing),
        (' This happened on 37-18-%s. ' % (future_year_2,), {}, nothing),
        (' This happened on 44-42-%s. ' % (past_year_2), {}, nothing),

        # str() form of the reference days
        (' This happened on %s. ' % (today, ), {}, (today, long_span)),
        (' This happened on %s. ' % (future, ), {}, (future, long_span)),
        (' This happened on %s. ' % (past, ), {}, (past, long_span)),

        # inputs expected to yield no date
        (' released date: 04-03-1901? ', {}, nothing),
        (' There\'s no date in here. ', {}, nothing),

        # ambiguous 01-02-03 under the different ordering switches
        (' Something 01-02-03 ', {}, (date(2003, 2, 1), (11, 19))),
        (' Something 01-02-03 ', {'year_first': False, 'day_first': True},
         (date(2003, 2, 1), (11, 19))),
        (' Something 01-02-03 ', {'year_first': True},
         (date(2001, 2, 3), (11, 19))),
        (' Something 01-02-03 ', {'day_first': False},
         (date(2003, 1, 2), (11, 19))),
    ]
    for text, switches, expected in date_cases:
        self.assertEqual(search_date(text, **switches), expected)
def guess_date(string):
    """Find a date in ``string`` with ``search_date``.

    Returns ``({'date': date}, span)`` on a hit, ``(None, None)`` otherwise.
    """
    match, extent = search_date(string)
    if not match:
        return None, None
    return {'date': match}, extent
def guess_date(string, node=None, options=None):
    """Find a date in ``string`` that passes the default string validator.

    ``options`` (when given and non-empty) provides ``date_year_first`` and
    ``date_day_first`` for ``search_date``; both default to ``False``.
    ``node`` is accepted but not used here.

    Returns ``({'date': date}, span)`` when a date and span were found and
    ``DefaultValidator.validate_string(string, span)`` is truthy, otherwise
    ``(None, None)``.
    """
    if options:
        year_first = options.get('date_year_first')
        day_first = options.get('date_day_first')
    else:
        year_first = day_first = False

    match, extent = search_date(string, year_first, day_first)

    # ensure we have a separator before and after date
    accepted = bool(match and extent
                    and DefaultValidator.validate_string(string, extent))
    return ({'date': match}, extent) if accepted else (None, None)
def guess_groups(string, result, filetype):
    """Detect known metadata in ``string`` and split it into groups.

    Detectors run in a fixed order: dates, years (non-episode filetypes
    only), video regexps, episode regexps (episode filetypes only), websites,
    release groups, well-defined properties, weak episode numbers (episode
    filetypes only, and only when no episode number was found yet) and
    finally languages. Every hit is turned into a formatted ``Guess``,
    appended to ``result``, and its span is passed to ``blank_region`` so the
    working copy no longer exposes it to later detectors.

    Args:
        string: the text to analyse.
        result: list that receives every guess made (mutated in place).
        filetype: one of movie, moviesubtitle, episode, episodesubtitle.

    Returns:
        ``zip(string_groups, remaining_groups, guesses)``: the groups cut
        from the original string, the same groups cut from the blanked
        working copy, and for each group the guess whose region starts at
        that group's offset (``None`` for leftover groups).
    """
    # add sentinels so we can match a separator char at either end of
    # our groups, even when they are at the beginning or end of the string
    # we will adjust the span accordingly later
    current = " " + string + " "

    regions = []  # list of ((start, end), guess) for every matched region

    def guessed(match_dict, confidence):
        # wrap the raw match in a Guess, record it and log it
        guess = format_guess(Guess(match_dict, confidence=confidence))
        result.append(guess)
        log.debug("Found with confidence %.2f: %s" % (confidence, guess))
        return guess

    def update_found(string, guess, span, span_adjust=(0, 0)):
        # remember the (adjusted) region and blank it in the working copy
        span = (span[0] + span_adjust[0], span[1] + span_adjust[1])
        regions.append((span, guess))
        return blank_region(string, span)

    # try to find dates first, as they are very specific
    date, span = search_date(current)
    if date:
        guess = guessed({"date": date}, confidence=1.0)
        current = update_found(current, guess, span)

    # for non episodes only, look for year information
    if filetype not in ("episode", "episodesubtitle"):
        year, span = search_year(current)
        if year:
            guess = guessed({"year": year}, confidence=1.0)
            current = update_found(current, guess, span)

    # specific regexps (ie: cd number, season X episode, ...)
    for rexp, confidence, span_adjust in video_rexps:
        match = re.search(rexp, current, re.IGNORECASE)
        if match:
            metadata = match.groupdict()
            # is this the better place to put it? (maybe, as it is at least
            # the soonest that we can catch it)
            if "cdNumberTotal" in metadata and metadata["cdNumberTotal"] is None:
                del metadata["cdNumberTotal"]

            guess = guessed(metadata, confidence=confidence)
            current = update_found(current, guess, match.span(), span_adjust)

    if filetype in ("episode", "episodesubtitle"):
        for rexp, confidence, span_adjust in episode_rexps:
            match = re.search(rexp, current, re.IGNORECASE)
            if match:
                metadata = match.groupdict()
                guess = guessed(metadata, confidence=confidence)
                current = update_found(current, guess, match.span(), span_adjust)

    # Now websites, but as exact string instead of regexps.
    # Use an explicit confidence: the previous code reused the `confidence`
    # loop variable left over from the regexp loops above, so the value was
    # arbitrary (and undefined if those tables were empty). An exact string
    # match is treated like the well-defined properties below.
    website_confidence = 1.0
    clow = current.lower()
    for site in websites:
        pos = clow.find(site.lower())
        if pos != -1:
            guess = guessed({"website": site}, confidence=website_confidence)
            current = update_found(current, guess, (pos, pos + len(site)))
            clow = current.lower()

    # release groups have certain constraints, cannot be included in the
    # previous general regexps
    group_names = [
        r"\.(Xvid)-(?P<releaseGroup>.*?)[ \.]",
        r"\.(DivX)-(?P<releaseGroup>.*?)[\. ]",
        r"\.(DVDivX)-(?P<releaseGroup>.*?)[\. ]",
    ]
    for rexp in group_names:
        match = re.search(rexp, current, re.IGNORECASE)
        if match:
            metadata = match.groupdict()
            metadata.update({"videoCodec": match.group(1)})
            guess = guessed(metadata, confidence=0.8)
            # (1, -1) keeps the delimiters matched at both ends out of the span
            current = update_found(current, guess, match.span(), span_adjust=(1, -1))

    # common well-defined words and regexps
    confidence = 1.0  # for all of them
    for prop, value, pos, end in find_properties(current):
        guess = guessed({prop: value}, confidence=confidence)
        current = update_found(current, guess, (pos, end))

    # weak guesses for episode number, only run it if we don't have an
    # estimate already
    if filetype in ("episode", "episodesubtitle"):
        if not any("episodeNumber" in match for match in result):
            for rexp, _, span_adjust in weak_episode_rexps:
                match = re.search(rexp, current, re.IGNORECASE)
                if match:
                    metadata = match.groupdict()
                    epnum = int(metadata["episodeNumber"])
                    if epnum > 100:
                        # numbers above 100 are read as <season><episode>
                        guess = guessed({"season": epnum // 100,
                                         "episodeNumber": epnum % 100},
                                        confidence=0.6)
                    else:
                        guess = guessed(metadata, confidence=0.3)
                    current = update_found(current, guess, match.span(), span_adjust)

    # try to find languages now
    language, span, confidence = search_language(current)
    while language:
        # is it a subtitle language?
        if "sub" in clean_string(current[: span[0]]).lower().split(" "):
            guess = guessed({"subtitleLanguage": language}, confidence=confidence)
        else:
            guess = guessed({"language": language}, confidence=confidence)
        current = update_found(current, guess, span)

        language, span, confidence = search_language(current)

    # remove our sentinels now and adjust spans accordingly
    assert current[0] == " " and current[-1] == " "
    current = current[1:-1]
    regions = [((start - 1, end - 1), guess) for (start, end), guess in regions]

    # split into '-' separated subgroups (with required separator chars
    # around the dash); a dash at index 0 is deliberately not a split point
    didx = current.find("-")
    while didx > 0:
        regions.append(((didx, didx), None))
        didx = current.find("-", didx + 1)

    # cut our final groups, and rematch the guesses to the group that created
    # it, None if it is a leftover group
    region_spans = [span for span, guess in regions]
    string_groups = split_on_groups(string, region_spans)
    remaining_groups = split_on_groups(current, region_spans)
    guesses = []

    pos = 0
    for group in string_groups:
        found = False
        for span, guess in regions:
            if span[0] == pos:
                guesses.append(guess)
                found = True
        if not found:
            guesses.append(None)

        pos += len(group)

    return zip(string_groups, remaining_groups, guesses)
def guess_date(string, node=None, options=None):
    """Search ``string`` for a date, honouring the date-ordering options.

    ``options`` may carry ``date_year_first`` and ``date_day_first``; when
    ``options`` is missing or empty both are passed to ``search_date`` as
    ``False``. ``node`` is accepted but not used here.

    Returns ``({'date': date}, span)`` on a hit, ``(None, None)`` otherwise.
    """
    prefer_year_first = options.get('date_year_first') if options else False
    prefer_day_first = options.get('date_day_first') if options else False

    match, extent = search_date(string, prefer_year_first, prefer_day_first)
    return ({'date': match}, extent) if match else (None, None)