def parse_single_date(value):
    """
    Parse one free-form date value into a ``(date, month, year)`` triple.

    Integers are converted to their string form before parsing. The precision
    reported by the parser decides how much of the triple is filled in:

    * year precision:  ``(None, None, year)``
    * month precision: ``(None, month, year)``
    * anything else:   ``(datetime.date, month, year)``

    Falsy input, input the parser cannot interpret, and a ``TypeError`` raised
    while parsing (reported on stderr) all yield ``(None, None, None)``.
    """
    nothing = (None, None, None)
    if not value:
        return nothing

    text = str(value) if isinstance(value, int) else value
    engine = dateparser.DateDataParser()
    try:
        # The parse result exposes the keys date_obj, period and locale.
        outcome = engine.get_date_data(text)
        # date_obj is a full datetime; only its date part is handed back.
        moment = outcome['date_obj']
        if moment is None:
            return nothing
        precision = outcome['period']
        if precision == 'year':
            return None, None, moment.year
        if precision == 'month':
            return None, moment.month, moment.year
        return moment.date(), moment.month, moment.year
    except TypeError as err:
        print("{} date parsing failed with: {}".format(text, err), file=sys.stderr)
    return nothing
def parse(raw_date):
    """
    Normalise a raw date string into ``[new_date, error, flag, ambiguous]``.

    * ``new_date``  - the parsed date formatted as ``%Y-%b-%d``. For month or
      year precision the date is moved to the middle of that month or year.
    * ``error``     - half-width of the uncertainty in days: 0 for day
      precision, half the month length or half the year length otherwise.
    * ``flag``      - the untouched ``raw_date``.
    * ``ambiguous`` - ``True`` when ``ambiguous_dates`` returns exactly
      ``True``, otherwise the empty string.

    When the generic parser finds no date, ``date_exceptions(raw_date)``
    supplies ``error`` and ``new_date`` instead.
    """
    parsed = dateparser.DateDataParser(languages=["en"]).get_date_data(raw_date)
    moment = parsed["date_obj"]
    ambiguous = ambiguous_dates(parsed)

    if moment is None:
        # The generic parser gave up: defer to the custom fallback parsers.
        error, new_date = date_exceptions(raw_date)
    else:
        new_date, error = "", ""
        precision = parsed["period"]
        if precision == "day":
            # Day precision needs no adjustment.
            error = 0
            new_date = moment
        elif precision == "month":
            # Centre the date inside its month.
            error = calendar.monthrange(moment.year, moment.month)[1] // 2
            new_date = moment.replace(day=error + 1)
        elif precision == "year":
            # Centre the date inside its year (leap years have 366 days).
            error = (366 if calendar.isleap(moment.year) else 365) // 2
            new_date = moment.replace(day=1, month=1) + datetime.timedelta(days=error)
        # NOTE(review): any other period leaves new_date as "" here, so the
        # strftime call below raises AttributeError - confirm whether the
        # parser can report such periods for the inputs this sees.
        new_date = new_date.strftime("%Y-%b-%d")

    return [new_date, error, raw_date, ambiguous if ambiguous is True else ""]
def block_range(self, start, end=None) -> BlockRange:
    """Return the range of blocks mined between the given dates.

    Args:
        start: Beginning of the interval, converted with ``pd.to_datetime``.
            When ``end`` is omitted it is also handed to ``dateparser``,
            whose reported precision decides the interval length.
        end: End of the interval, converted with ``pd.to_datetime``. When
            omitted, the interval is one day, one month or one year long,
            matching the precision with which ``start`` was written.

    Returns:
        ``self[oldest:newest]``, where ``oldest`` is the height of the first
        block timestamped at or after ``start`` and ``newest`` is one past
        the height of the last block timestamped at or before ``end``.

    Raises:
        ValueError: ``end`` was omitted and ``start`` has a precision other
            than day, month or year, so no interval length can be derived.
        IndexError: no block lies at/after ``start`` or at/before ``end``.
    """
    if self.block_times is None:
        # Build the timestamp -> height table once and cache it on self.
        times = pd.DataFrame([block.time for block in self], columns=["date"])
        times["height"] = times.index
        times.index = times["date"]
        del times["date"]
        self.block_times = times
    block_times = self.block_times

    start_date = pd.to_datetime(start)
    if end is None:
        period = dateparser.DateDataParser().get_date_data(start)['period']
        steps = {
            'day': relativedelta(days=1),
            'month': relativedelta(months=1),
            'year': relativedelta(years=1),
        }
        if period not in steps:
            # Previously `end` silently stayed None and the comparison below
            # failed with an unrelated-looking error.
            raise ValueError(
                "cannot infer an end date from {!r}: unsupported precision {!r}".format(
                    start, period))
        end = start_date + steps[period]
    else:
        end = pd.to_datetime(end)

    # Purely positional access: `.iloc[i][0]` depended on the deprecated
    # positional fallback of Series.__getitem__.
    oldest = block_times[block_times.index >= start_date].iloc[0, 0]
    newest = block_times[block_times.index <= end].iloc[-1, 0] + 1
    return self[oldest:newest]
async def convert_time(self, ctx, dtstring=None, to=None, from_=None):
    """
    Convert time from one timezone to another.

    ``dtstring`` is parsed in the ``from_`` timezone (or, when ``from_`` is
    omitted, the timezone stored for the invoking user) and shown in both the
    source and the ``to`` timezone inside an embed. Every outcome, including
    the usage hint and the error messages, is delivered through ``ctx.send``.
    """
    if dtstring is None or to is None:
        usage = f"Usage: `{ctx.prefix}{ctx.command} <datetime string> <to timezone> [optional: from timezone]`"
        return await ctx.send(usage)

    user_tzname = database.get_tz(ctx.author.id)
    if from_ is None:
        if user_tzname is None:
            hint = f"You do not have default timezone set! Use `{ctx.prefix}selftimezone`"
            return await ctx.send(hint)
        # No explicit source timezone: use the caller's stored one.
        from_ = user_tzname

    from_ = self.get_tzname(from_)
    to = self.get_tzname(to)
    if not (from_ in pytz.all_timezones and to in pytz.all_timezones):
        return await ctx.send("Timezone doesn't exist!")

    # Relative expressions are resolved against "now" in the source timezone.
    parser_settings = {
        "TIMEZONE": from_,
        "RETURN_AS_TIMEZONE_AWARE": True,
        "RELATIVE_BASE": datetime.now(pytz.timezone(from_)),
    }
    parser = dateparser.DateDataParser(languages=["en"], settings=parser_settings)
    dateobj = parser.get_date_tuple(dtstring).date_obj
    if dateobj is None:
        return await ctx.send("Cannot parse date/time!")

    fmt = "%Y-%m-%d %H:%M:%S"
    source_text = dateobj.strftime(fmt)
    target_text = dateobj.astimezone(pytz.timezone(to)).strftime(fmt)

    embed = discord.Embed(title="Time Conversion", colour=0x2859B8)
    embed.add_field(name=f"From ({from_})", value=source_text)
    embed.add_field(name=f"To ({to})", value=target_text)

    if user_tzname is not None:
        # The footer shows the caller's own current local time.
        curr_time = datetime.now(pytz.timezone(user_tzname)).strftime(fmt)
        embed.set_footer(
            icon_url=ctx.author.avatar_url_as(size=64),
            text=f"Current time for {ctx.author.display_name}: {curr_time}",
        )

    await ctx.send(embed=embed)
def _get_dateparser(language_pool: Tuple[str, ...], settings: Hashable = None) -> dateparser.DateDataParser:
    """Build a ``DateDataParser``, optionally restricted to a language pool.

    Args:
        language_pool: Language names to restrict detection to. Names are
            matched case-insensitively against ``_language_aliases``;
            unknown names are ignored. ``None``, an empty pool, or a pool
            with no known name leaves the parser on its default languages.
        settings: Parser settings in any form ``dict(...)`` accepts;
            ``None`` means no custom settings. (Presumably a hashable
            sequence of key/value pairs so calls can be cached - confirm
            against the caller.)

    Returns:
        A parser created with ``allow_redetect_language=True``. When at least
        one known language was requested, its ``language_detector`` is
        replaced by an ``AutoDetectLanguage`` limited to those languages.
    """
    parser = dateparser.DateDataParser(allow_redetect_language=True,
                                       settings=dict(settings or ()))
    if language_pool is None:
        return parser

    # Test and look up the SAME lower-cased key. The membership test used to
    # run on the raw name, so "EN" was dropped (or raised KeyError when only
    # the cased key existed).
    language_codes = {
        _language_aliases[key]
        for key in (lang.lower() for lang in language_pool)
        if key in _language_aliases
    }
    if not language_codes:
        # Nothing usable requested: fall back to the default language pool.
        return parser

    parser.language_detector = AutoDetectLanguage(list(language_codes),
                                                  allow_redetection=True)
    return parser
## under GNU GPL v3 license

# standard
import datetime
import logging
import re

from functools import lru_cache

# conditional imports with fallbacks for compatibility
# coverage for date parsing
#
# Exactly one backend is active after this block:
#   dateparser available -> EXTERNAL_PARSER / EXTERNAL_PARSER_CONFIG are set,
#                           full_parse and DEFAULT_PARSER_PARAMS are None
#   dateparser missing   -> full_parse / DEFAULT_PARSER_PARAMS are set,
#                           EXTERNAL_PARSER and EXTERNAL_PARSER_CONFIG are None
try:
    import dateparser  # third-party, slow
    # Single source of truth for the dateparser settings.
    # allow_redetect_language=False, languages=['de', 'en'],
    EXTERNAL_PARSER_CONFIG = {
        'PREFER_DAY_OF_MONTH': 'first',
        'PREFER_DATES_FROM': 'past',
        'DATE_ORDER': 'DMY',
    }
    # The parser gets its own copy so the constant stays independent of it.
    EXTERNAL_PARSER = dateparser.DateDataParser(settings=dict(EXTERNAL_PARSER_CONFIG))
except ImportError:
    # try dateutil parser
    from dateutil.parser import parse as full_parse
    EXTERNAL_PARSER = None
    # Defined on this path too, so the name always exists at module level.
    EXTERNAL_PARSER_CONFIG = None
    DEFAULT_PARSER_PARAMS = {'dayfirst': True, 'fuzzy': False}
else:
    full_parse = DEFAULT_PARSER_PARAMS = None
## This file is available from https://github.com/adbar/htmldate
## under GNU GPL v3 license

import datetime

import dateparser  # third-party, slow

# Download
MAX_FILE_SIZE = 20000000
MIN_FILE_SIZE = 10

## Plausible dates
# earliest possible year to take into account (inclusive)
MIN_YEAR = 1995
# latest possible date (evaluated once, at import time)
LATEST_POSSIBLE = datetime.date.today()
# latest possible year, derived from the same "today" so the two never differ
MAX_YEAR = LATEST_POSSIBLE.year

# dateparser module
# Single source of truth for the parser settings.
PARSERCONFIG = {
    'PREFER_DAY_OF_MONTH': 'first',
    'PREFER_DATES_FROM': 'past',
    'DATE_ORDER': 'DMY',
}
# The shared parser gets its own copy of the settings.
PARSER = dateparser.DateDataParser(settings=dict(PARSERCONFIG))
# allow_redetect_language=False,
# languages=['de', 'en'],
def find_date(htmlobject, extensive_search=True, outputformat='%Y-%m-%d',
              dparser=dateparser.DateDataParser(settings=PARSERCONFIG), url=None):
    """Main function: apply a series of techniques to date the document,
    from safe to adventurous.

    The strategies are tried in this order; the first accepted hit is returned:

    1. a date in the URL (the given ``url`` or the canonical link)
    2. the document header (``examine_header``)
    3. ``<abbr>`` elements (``data-utime`` attribute, "published" classes)
    4. every XPath expression in ``DATE_EXPRESSIONS``
    5. ``<time>`` elements (``datetime`` attribute or element text)
    6. a partial date in the URL
    7. timestamp patterns in the cleaned HTML source
    8. ``search_page`` on the cleaned HTML, only if ``extensive_search``

    Args:
        htmlobject: The document, in any form ``load_html`` accepts.
        extensive_search: Whether to run the last-resort ``search_page`` step.
        outputformat: ``strftime`` format of the returned date; checked with
            ``output_format_validator`` first.
        dparser: Date parser handed to the helper functions. The default
            instance is built once, when the function is defined, and is
            shared by all calls.
        url: URL of the document. When omitted, the last
            ``<link rel="canonical">`` carrying an ``href`` is used.

    Returns:
        The date found, or ``None`` when the document cannot be loaded, the
        output format is rejected, or no strategy succeeds. Helper results
        are returned as-is (presumably strings in ``outputformat``).
    """
    # init
    tree, htmlstring = load_html(htmlobject)
    logger.debug('starting')

    # safety
    if tree is None:
        return None
    if output_format_validator(outputformat) is False:
        return None

    # URL
    if url is None:
        # link canonical (the last one with an href wins)
        for elem in tree.xpath('//link[@rel="canonical"]'):
            if 'href' in elem.attrib:
                url = elem.get('href')
    if url is not None:
        dateresult = extract_url_date(url, outputformat)
        if dateresult is not None:
            return dateresult

    # first, try header
    pagedate = examine_header(tree, outputformat, dparser)
    if pagedate is not None and date_validator(pagedate, outputformat) is True:
        return pagedate

    # <abbr>
    elements = tree.xpath('//abbr')
    if elements is not None and len(elements) > 0:
        reference = 0
        for elem in elements:
            # data-utime (mostly Facebook)
            if 'data-utime' in elem.attrib:
                try:
                    candidate = int(elem.get('data-utime'))
                except ValueError:
                    continue
                logger.debug('data-utime found: %s', candidate)
                # look for newest (i.e. largest time delta)
                if candidate > reference:
                    reference = candidate
            # class
            if 'class' in elem.attrib:
                if elem.get('class') in ('published', 'date-published'):
                    # other attributes
                    if 'title' in elem.attrib:
                        trytext = elem.get('title')
                        logger.debug('abbr published-title found: %s', trytext)
                        reference = compare_reference(reference, trytext,
                                                      outputformat, dparser)
                    # dates, not times of the day
                    if elem.text and len(elem.text) > 10:
                        trytext = re.sub(r'^am ', '', elem.text)
                        logger.debug('abbr published found: %s', trytext)
                        reference = compare_reference(reference, trytext,
                                                      outputformat, dparser)
        # convert and return
        if reference > 0:
            dateobject = datetime.datetime.fromtimestamp(reference)
            converted = dateobject.strftime(outputformat)
            # quality control
            if date_validator(converted, outputformat) is True:
                return converted
        # try rescue in abbr content
        else:
            dateresult = examine_date_elements(tree, '//abbr', outputformat, dparser)
            if dateresult is not None and date_validator(dateresult, outputformat) is True:
                return dateresult

    # expressions + text_content
    for expr in DATE_EXPRESSIONS:
        dateresult = examine_date_elements(tree, expr, outputformat, dparser)
        if dateresult is not None and date_validator(dateresult, outputformat) is True:
            return dateresult

    # <time>
    elements = tree.xpath('//time')
    if elements is not None and len(elements) > 0:
        # scan all the tags and look for the newest one
        reference = 0
        for elem in elements:
            # go for datetime
            if 'datetime' in elem.attrib and len(elem.get('datetime')) > 6:
                # first choice: entry-date + datetime attribute
                if 'class' in elem.attrib:
                    if (elem.get('class').startswith(('entry-date', 'entry-time'))
                            or elem.get('class') == 'updated'):
                        logger.debug('time/datetime found: %s', elem.get('datetime'))
                        reference = compare_reference(reference, elem.get('datetime'),
                                                      outputformat, dparser)
                        # a hit on a dedicated class ends the scan
                        if reference > 0:
                            break
                # datetime attribute
                else:
                    logger.debug('time/datetime found: %s', elem.get('datetime'))
                    reference = compare_reference(reference, elem.get('datetime'),
                                                  outputformat, dparser)
            # bare text in element
            elif elem.text is not None and len(elem.text) > 6:
                logger.debug('time/datetime found: %s', elem.text)
                reference = compare_reference(reference, elem.text,
                                              outputformat, dparser)
        # return
        if reference > 0:
            # convert and return
            dateobject = datetime.datetime.fromtimestamp(reference)
            converted = dateobject.strftime(outputformat)
            # quality control
            if date_validator(converted, outputformat) is True:
                return converted

    # URL 2
    if url is not None:
        dateresult = extract_partial_url_date(url, outputformat)
        if dateresult is not None:
            return dateresult

    # clean before string search
    cleaned_html = cleaner.clean_html(tree)
    htmlstring = html.tostring(cleaned_html, encoding='unicode')
    # remove comments by hand as faulty in lxml
    htmlstring = re.sub(r'<!--.+?-->', '', htmlstring, flags=re.DOTALL)
    logger.debug('html cleaned')

    # date regex timestamp rescue
    match = re.search(r'"datePublished":"([0-9]{4}-[0-9]{2}-[0-9]{2})', htmlstring)
    if match and date_validator(match.group(1), '%Y-%m-%d') is True:
        logger.debug('JSON time found: %s', match.group(0))
        return convert_date(match.group(1), '%Y-%m-%d', outputformat)
    match = re.search(
        r'([0-9]{4}-[0-9]{2}-[0-9]{2}).[0-9]{2}:[0-9]{2}:[0-9]{2}', htmlstring)
    if match and date_validator(match.group(1), '%Y-%m-%d') is True:
        logger.debug('time regex found: %s', match.group(0))
        return convert_date(match.group(1), '%Y-%m-%d', outputformat)
    match = re.search(
        r'([0-9]{2}\.[0-9]{2}\.[0-9]{4}).[0-9]{2}:[0-9]{2}:[0-9]{2}', htmlstring)
    # The captured text is dot-separated (dd.mm.yyyy), so the format must use
    # dots as well; the previous '%d-%m-%Y' did not describe the capture.
    # NOTE(review): assumes date_validator/convert_date parse the string with
    # the given format - confirm against their definitions.
    if match and date_validator(match.group(1), '%d.%m.%Y') is True:
        logger.debug('time regex found: %s', match.group(0))
        return convert_date(match.group(1), '%d.%m.%Y', outputformat)

    # last resort
    if extensive_search is True:
        logger.debug('extensive search started')
        pagedate = search_page(htmlstring, outputformat)
        return pagedate
m = m.replace(u'十', '1') if d.find(u'十') != -1: d = unicode(d) if len(d) == 3: d = d.replace(u'十', '') elif len(d) == 1: d = '10' elif len(d) == 2: if d[0] == '十': d = d.replace(u'十', '1') else: d = d.replace(u'十', '0') return str(datetime(*map(int, (y, m, d)))) date_parser = dateparser.DateDataParser(languages=['en', 'zh']) def parser_chs_date(string): rets = date_parser.get_date_data(string)['date_obj'] return rets def string2timestamp(string): return datetime2timestamp(dateparser.parse(string, languages=['en', 'zh'])) def timestamp2normal_date(ts): #秒级时间 return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(ts))