Exemple #1
0
def parse_single_date(value):
    """
    Parse a single free-form date value.

    Returns a 3-tuple ``(date, month, year)`` whose precision follows the
    period reported by dateparser:

    * period ``'year'``  -> ``(None, None, year)``
    * period ``'month'`` -> ``(None, month, year)``
    * anything else      -> ``(datetime.date, month, year)``

    Falsy input, an unparseable value, or a ``TypeError`` raised while
    parsing all yield ``(None, None, None)``; the ``TypeError`` case is also
    reported on stderr.
    """
    nothing = (None, None, None)
    if not value:
        return nothing

    # Integers (e.g. a bare year) are parsed through their string form.
    text = str(value) if isinstance(value, int) else value

    parser = dateparser.DateDataParser()
    try:
        # Mapping-like result with keys: date_obj, period, locale.
        data = parser.get_date_data(text)
        # A datetime object; only its date part is handed back.
        moment = data['date_obj']
        if moment is None:
            return nothing

        period = data['period']
        if period == 'year':
            return None, None, moment.year
        if period == 'month':
            return None, moment.month, moment.year
        return moment.date(), moment.month, moment.year
    except TypeError as err:
        print("{} date parsing failed with: {}".format(text, err),
              file=sys.stderr)

    return nothing
Exemple #2
0
def parse(raw_date):
    """
    Normalise a raw date string to the midpoint of its stated precision.

    The string is parsed with an English-only dateparser. Depending on the
    period dateparser reports, the result is adjusted as follows:

    * ``"month"`` -- the date is moved to the middle of the month and the
      error is half the month length in days (rounded down).
    * ``"year"``  -- the date is moved to the middle of the year and the
      error is half the year length in days (rounded down).
    * any other period (``"day"`` and anything finer or unexpected) -- the
      parsed date is used unchanged with an error of 0.

    When dateparser cannot parse the string, ``date_exceptions(raw_date)``
    supplies ``(error, new_date)`` instead.

    Returns a list ``[new_date, error, flag, ambiguous]`` where ``new_date``
    is formatted as ``"%Y-%b-%d"`` on the dateparser path, ``flag`` is the
    untouched input string and ``ambiguous`` is ``True`` or ``""``.
    """
    date_parser = dateparser.DateDataParser(languages=["en"])
    date_obj = date_parser.get_date_data(raw_date)
    parsed = date_obj["date_obj"]
    ambiguous = ambiguous_dates(date_obj)

    if parsed is not None:
        period = date_obj["period"]
        if period == "month":
            # Half the month length; day = error + 1 lands on the midpoint.
            error = calendar.monthrange(parsed.year, parsed.month)[1] // 2
            new_date = parsed.replace(day=error + 1)
        elif period == "year":
            # Half the year length, counted from January 1st.
            error = (366 if calendar.isleap(parsed.year) else 365) // 2
            new_date = parsed.replace(day=1, month=1)
            new_date += datetime.timedelta(days=error)
        else:
            # "day" precision needs no adjustment. Other periods reported by
            # dateparser (e.g. "week") previously left new_date as "" and
            # crashed on .strftime(); treat them as exact dates instead.
            error = 0
            new_date = parsed
        new_date = new_date.strftime("%Y-%b-%d")
    else:
        # If the date has failed to parse so far, run custom parsers on it
        error, new_date = date_exceptions(raw_date)

    return [new_date, error, raw_date, ambiguous if ambiguous is True else ""]
Exemple #3
0
def block_range(self, start, end=None) -> BlockRange:
    """
    Return the range of blocks mined between the given dates.

    On first use this builds and caches ``self.block_times``: a DataFrame
    indexed by block time with a single ``height`` column holding each
    block's position in the iteration order of ``self``.

    :param start: start of the range; anything ``pd.to_datetime`` accepts.
    :param end: end of the range (compared with ``<=``). When omitted it is
        derived from the precision dateparser reports for ``start``: one
        day, one month or one year after ``start``.
    :return: ``self[oldest:newest]``, where ``oldest`` is the height of the
        first block at or after ``start`` and ``newest`` is one past the
        height of the last block at or before ``end``.
    :raises ValueError: if ``end`` is omitted and the precision of ``start``
        is not day, month or year.
    :raises IndexError: if no block falls on the requested side of a bound
        (raised by the positional ``iloc`` lookup).
    """
    if self.block_times is None:
        self.block_times = pd.DataFrame([block.time for block in self], columns=["date"])
        self.block_times["height"] = self.block_times.index
        self.block_times.index = self.block_times["date"]
        del self.block_times["date"]

    start_date = pd.to_datetime(start)
    if end is None:
        period = dateparser.DateDataParser().get_date_data(start)['period']
        if period == 'month':
            end = start_date + relativedelta(months=1)
        elif period == 'day':
            end = start_date + relativedelta(days=1)
        elif period == 'year':
            end = start_date + relativedelta(years=1)
        else:
            # Previously `end` silently stayed None and the comparison below
            # failed with an unrelated-looking error.
            raise ValueError(
                "cannot infer an end date for {!r}: unsupported precision {!r}; "
                "pass `end` explicitly".format(start, period))
    else:
        end = pd.to_datetime(end)

    times = self.block_times
    # Select the column by label: integer keys on a label-indexed row rely
    # on a positional fallback that pandas has deprecated.
    oldest = times[times.index >= start_date].iloc[0]["height"]
    newest = times[times.index <= end].iloc[-1]["height"] + 1

    return self[oldest:newest]
Exemple #4
0
    async def convert_time(self, ctx, dtstring=None, to=None, from_=None):
        """
        Convert a date/time string from one timezone to another and reply
        with an embed showing both representations.

        ``dtstring`` and ``to`` are required; without them a usage hint is
        sent. ``from_`` falls back to the timezone stored for the invoking
        user (``database.get_tz``); if neither is available the user is told
        to set a default. Both timezone arguments are resolved through
        ``self.get_tzname`` and must be members of ``pytz.all_timezones``.
        When the user has a stored timezone, the embed footer also shows
        their current local time.
        """
        if dtstring is None or to is None:
            usage = f"Usage: `{ctx.prefix}{ctx.command} <datetime string> <to timezone> [optional: from timezone]`"
            return await ctx.send(usage)

        user_tzname = database.get_tz(ctx.author.id)

        if from_ is None:
            if user_tzname is None:
                return await ctx.send(
                    f"You do not have default timezone set! Use `{ctx.prefix}selftimezone`"
                )
            from_ = user_tzname

        from_ = self.get_tzname(from_)
        to = self.get_tzname(to)

        if not (from_ in pytz.all_timezones and to in pytz.all_timezones):
            return await ctx.send("Timezone doesn't exist!")

        # Parse relative expressions against "now" in the source timezone
        # and get a timezone-aware result back.
        parser_settings = {
            "TIMEZONE": from_,
            "RETURN_AS_TIMEZONE_AWARE": True,
            "RELATIVE_BASE": datetime.now(pytz.timezone(from_)),
        }
        dp = dateparser.DateDataParser(languages=["en"],
                                       settings=parser_settings)

        dateobj = dp.get_date_tuple(dtstring).date_obj
        if dateobj is None:
            return await ctx.send("Cannot parse date/time!")

        fmt = "%Y-%m-%d %H:%M:%S"

        embed = discord.Embed(title="Time Conversion", colour=0x2859B8)
        embed.add_field(name=f"From ({from_})", value=dateobj.strftime(fmt))
        converted = dateobj.astimezone(pytz.timezone(to))
        embed.add_field(name=f"To ({to})", value=converted.strftime(fmt))

        if user_tzname is not None:
            curr_time = datetime.now(pytz.timezone(user_tzname)).strftime(fmt)
            footer_text = f"Current time for {ctx.author.display_name}: {curr_time}"
            embed.set_footer(
                icon_url=ctx.author.avatar_url_as(size=64),
                text=footer_text,
            )

        await ctx.send(embed=embed)
Exemple #5
0
def _get_dateparser(language_pool: Tuple[str, ...],
                    settings: Hashable = None) -> dateparser.DateDataParser:
    """
    Build a ``DateDataParser`` restricted to the languages in *language_pool*.

    :param language_pool: language names to allow. Each name is lower-cased
        and translated through ``_language_aliases``; names without an alias
        are ignored. ``None``, an empty pool, or a pool with no known names
        leaves dateparser's default language detection in place.
    :param settings: dateparser settings in any form ``dict()`` accepts
        (e.g. a tuple of key/value pairs), or ``None`` for no overrides.
    :return: a parser created with ``allow_redetect_language=True``; when at
        least one language was recognised, its ``language_detector`` is
        replaced by an ``AutoDetectLanguage`` limited to those codes.
    """
    settings = dict(settings or ())
    parser = dateparser.DateDataParser(allow_redetect_language=True,
                                       settings=settings)

    if language_pool is None:
        return parser

    # Test membership with the same lower-cased key that is used for the
    # lookup. The original filtered on the raw name but indexed with the
    # lower-cased one, so mixed-case names were either silently dropped or
    # raised KeyError.
    language_codes = {
        _language_aliases[key]
        for key in (lang.lower() for lang in language_pool)
        if key in _language_aliases
    }

    if not language_codes:
        # Empty pool or no recognised names: fall back to the default
        # language pool.
        return parser

    parser.language_detector = AutoDetectLanguage(list(language_codes),
                                                  allow_redetection=True)

    return parser
Exemple #6
0
## under GNU GPL v3 license

# standard
import datetime
import logging
import re

from functools import lru_cache

# conditional imports with fallbacks for compatibility
# coverage for date parsing
# Prefer dateparser when it is installed; otherwise fall back to dateutil.
# Every name below is bound on both paths (to None when it does not apply),
# so importing modules can test for None instead of risking a NameError.
try:
    import dateparser  # third-party, slow
    # Single definition of the dateparser settings, also used to build the
    # shared parser below.
    EXTERNAL_PARSER_CONFIG = {
        'PREFER_DAY_OF_MONTH': 'first',
        'PREFER_DATES_FROM': 'past',
        'DATE_ORDER': 'DMY',
    }
    # The parser receives a copy so the config dict and the parser stay
    # independent objects, as they were with two separate literals.
    EXTERNAL_PARSER = dateparser.DateDataParser(
        settings=dict(EXTERNAL_PARSER_CONFIG))
    # allow_redetect_language=False, languages=['de', 'en'],
except ImportError:
    # try dateutil parser
    from dateutil.parser import parse as full_parse
    EXTERNAL_PARSER = None
    # Previously left unbound on this path.
    EXTERNAL_PARSER_CONFIG = None
    DEFAULT_PARSER_PARAMS = {'dayfirst': True, 'fuzzy': False}
else:
    full_parse = DEFAULT_PARSER_PARAMS = None
Exemple #7
0
## This file is available from https://github.com/adbar/htmldate
## under GNU GPL v3 license

import datetime
import dateparser  # third-party, slow

# Download
# Upper and lower size limits for downloaded documents.
# NOTE(review): unit is presumably bytes -- confirm against the download code.
MAX_FILE_SIZE = 20000000
MIN_FILE_SIZE = 10

## Plausible dates
# earliest possible year to take into account (inclusive)
MIN_YEAR = 1995
# latest possible date (evaluated once, when this module is imported)
LATEST_POSSIBLE = datetime.date.today()
# latest possible year, derived from the same "today" so the two bounds can
# never disagree across a midnight/new-year boundary
MAX_YEAR = LATEST_POSSIBLE.year

# dateparser module
# Single definition of the dateparser settings; the shared parser below is
# built from it instead of repeating the literal.
PARSERCONFIG = {
    'PREFER_DAY_OF_MONTH': 'first',
    'PREFER_DATES_FROM': 'past',
    'DATE_ORDER': 'DMY'
}
# Built from a copy so PARSERCONFIG and the parser remain independent objects.
PARSER = dateparser.DateDataParser(
    settings=dict(PARSERCONFIG))  # allow_redetect_language=False, # languages=['de', 'en'],
def find_date(htmlobject,
              extensive_search=True,
              outputformat='%Y-%m-%d',
              dparser=dateparser.DateDataParser(settings=PARSERCONFIG),
              url=None):
    """Main function: apply a series of techniques to date the document, from safe to adventurous.

    The steps are tried in order and the first one producing a validated
    date wins:

    1. a date embedded in the URL (the given ``url`` or, failing that, the
       page's canonical link);
    2. the document header;
    3. ``<abbr>`` elements (``data-utime`` attribute, ``published`` classes);
    4. the expressions listed in ``DATE_EXPRESSIONS``;
    5. ``<time>`` elements;
    6. a partial date in the URL;
    7. timestamp patterns in the cleaned HTML source;
    8. a free search over the page, only if ``extensive_search`` is True.

    :param htmlobject: document to analyse, in any form ``load_html`` accepts.
    :param extensive_search: whether to run the last-resort page search.
    :param outputformat: ``strftime`` format of the returned date; checked
        with ``output_format_validator``.
    :param dparser: dateparser instance handed to the extraction helpers.
        The default is created once, when the function is defined, and
        shared across calls.
    :param url: URL of the document, if known.
    :return: the date formatted with ``outputformat``, or ``None`` if the
        document could not be loaded, the format is invalid, or nothing
        was found.
    """
    # init
    tree, htmlstring = load_html(htmlobject)
    logger.debug('starting')

    # safety
    if tree is None:
        return None
    if output_format_validator(outputformat) is False:
        return None

    # URL
    if url is None:
        # link canonical (the last matching element wins)
        for elem in tree.xpath('//link[@rel="canonical"]'):
            if 'href' in elem.attrib:
                url = elem.get('href')
    if url is not None:
        dateresult = extract_url_date(url, outputformat)
        if dateresult is not None:
            return dateresult

    # first, try header
    pagedate = examine_header(tree, outputformat, dparser)
    if pagedate is not None and date_validator(pagedate, outputformat) is True:
        return pagedate

    # <abbr>
    elements = tree.xpath('//abbr')
    if elements is not None and len(elements) > 0:
        # `reference` tracks the best candidate so far as a numeric
        # timestamp; 0 means "nothing found yet"
        reference = 0
        for elem in elements:
            # data-utime (mostly Facebook)
            if 'data-utime' in elem.attrib:
                try:
                    candidate = int(elem.get('data-utime'))
                except ValueError:
                    continue
                logger.debug('data-utime found: %s', candidate)
                # look for newest (i.e. largest time delta)
                if candidate > reference:
                    reference = candidate
            # class
            if 'class' in elem.attrib:
                if elem.get('class') == 'published' or elem.get(
                        'class') == 'date-published':
                    # other attributes
                    if 'title' in elem.attrib:
                        trytext = elem.get('title')
                        logger.debug('abbr published-title found: %s', trytext)
                        reference = compare_reference(reference, trytext,
                                                      outputformat, dparser)
                    # dates, not times of the day
                    if elem.text and len(elem.text) > 10:
                        # strip a leading German "am " ("on ") before parsing
                        trytext = re.sub(r'^am ', '', elem.text)
                        logger.debug('abbr published found: %s', trytext)
                        reference = compare_reference(reference, trytext,
                                                      outputformat, dparser)
        # convert and return
        if reference > 0:
            dateobject = datetime.datetime.fromtimestamp(reference)
            converted = dateobject.strftime(outputformat)
            # quality control
            if date_validator(converted, outputformat) is True:
                return converted
        # try rescue in abbr content
        else:
            dateresult = examine_date_elements(tree, '//abbr', outputformat,
                                               dparser)
            if dateresult is not None and date_validator(
                    dateresult, outputformat) is True:
                return dateresult  # break

    # expressions + text_content
    for expr in DATE_EXPRESSIONS:
        dateresult = examine_date_elements(tree, expr, outputformat, dparser)
        if dateresult is not None and date_validator(dateresult,
                                                     outputformat) is True:
            return dateresult  # break

    # <time>
    elements = tree.xpath('//time')
    if elements is not None and len(elements) > 0:
        # scan all the tags and look for the newest one
        reference = 0
        for elem in elements:
            # go for datetime
            if 'datetime' in elem.attrib and len(elem.get('datetime')) > 6:
                # first choice: entry-date + datetime attribute
                if 'class' in elem.attrib:
                    if elem.get('class').startswith('entry-date') or elem.get(
                            'class').startswith('entry-time') or elem.get(
                                'class') == 'updated':
                        logger.debug('time/datetime found: %s',
                                     elem.get('datetime'))
                        reference = compare_reference(reference,
                                                      elem.get('datetime'),
                                                      outputformat, dparser)
                        # a hit on a first-choice element ends the scan
                        if reference > 0:
                            break
                # datetime attribute
                else:
                    logger.debug('time/datetime found: %s',
                                 elem.get('datetime'))
                    reference = compare_reference(reference,
                                                  elem.get('datetime'),
                                                  outputformat, dparser)
            # bare text in element
            elif elem.text is not None and len(elem.text) > 6:
                logger.debug('time/datetime found: %s', elem.text)
                reference = compare_reference(reference, elem.text,
                                              outputformat, dparser)
            # else...
            # ...
        # return
        if reference > 0:
            # convert and return
            dateobject = datetime.datetime.fromtimestamp(reference)
            converted = dateobject.strftime(outputformat)
            # quality control
            if date_validator(converted, outputformat) is True:
                return converted

    # URL 2
    if url is not None:
        dateresult = extract_partial_url_date(url, outputformat)
        if dateresult is not None:
            return dateresult

    # clean before string search
    cleaned_html = cleaner.clean_html(tree)
    htmlstring = html.tostring(cleaned_html, encoding='unicode')
    # remove comments by hand as faulty in lxml
    htmlstring = re.sub(r'<!--.+?-->', '', htmlstring, flags=re.DOTALL)
    logger.debug('html cleaned')

    # date regex timestamp rescue
    match = re.search(r'"datePublished":"([0-9]{4}-[0-9]{2}-[0-9]{2})',
                      htmlstring)
    if match and date_validator(match.group(1), '%Y-%m-%d') is True:
        logger.debug('JSON time found: %s', match.group(0))
        return convert_date(match.group(1), '%Y-%m-%d', outputformat)
    match = re.search(
        r'([0-9]{4}-[0-9]{2}-[0-9]{2}).[0-9]{2}:[0-9]{2}:[0-9]{2}', htmlstring)
    if match and date_validator(match.group(1), '%Y-%m-%d') is True:
        logger.debug('time regex found: %s', match.group(0))
        return convert_date(match.group(1), '%Y-%m-%d', outputformat)
    match = re.search(
        r'([0-9]{2}\.[0-9]{2}\.[0-9]{4}).[0-9]{2}:[0-9]{2}:[0-9]{2}',
        htmlstring)
    # The captured text is dot-separated (DD.MM.YYYY), so the input format
    # must use dots as well; with '%d-%m-%Y' the format never matched the
    # text this regex captures.
    if match and date_validator(match.group(1), '%d.%m.%Y') is True:
        logger.debug('time regex found: %s', match.group(0))
        return convert_date(match.group(1), '%d.%m.%Y', outputformat)

    # last resort
    if extensive_search is True:
        logger.debug('extensive search started')
        pagedate = search_page(htmlstring, outputformat)
        return pagedate

    # nothing found
    return None
Exemple #9
0
            m = m.replace(u'十', '1')
    if d.find(u'十') != -1:
        d = unicode(d)
        if len(d) == 3:
            d = d.replace(u'十', '')
        elif len(d) == 1:
            d = '10'
        elif len(d) == 2:
            if d[0] == '十':
                d = d.replace(u'十', '1')
            else:
                d = d.replace(u'十', '0')
    return str(datetime(*map(int, (y, m, d))))


# Module-level parser limited to English and Chinese; created once at import
# time and reused by parser_chs_date() below.
date_parser = dateparser.DateDataParser(languages=['en', 'zh'])


def parser_chs_date(string):
    """Parse *string* with the module-level ``date_parser`` and return the
    ``'date_obj'`` entry of the result."""
    date_data = date_parser.get_date_data(string)
    return date_data['date_obj']


def string2timestamp(string):
    """Parse *string* as an English or Chinese date with ``dateparser.parse``
    and pass the result to ``datetime2timestamp``."""
    parsed = dateparser.parse(string, languages=['en', 'zh'])
    return datetime2timestamp(parsed)


def timestamp2normal_date(ts):
    """Format *ts*, a timestamp in seconds, as a local-time string of the
    form ``YYYY-MM-DD HH:MM:SS``."""
    # second-resolution timestamp -> local broken-down time
    local_struct = time.localtime(ts)
    return time.strftime("%Y-%m-%d %H:%M:%S", local_struct)