def test_parser_private_warns():
    """Private helpers imported from ``dateutil.parser`` must warn when called.

    ``_tzparser``, ``_timelex`` and ``_parsetz`` are each invoked inside their
    own ``pytest.warns(DeprecationWarning)`` block, so the test fails if any
    one of them does not emit the warning.
    """
    from dateutil.parser import _timelex, _tzparser
    from dateutil.parser import _parsetz

    # (callable, positional arguments) for every deprecated entry point.
    deprecated_calls = (
        (_tzparser, ()),
        (_timelex, ('2014-03-03',)),
        (_parsetz, ('+05:00',)),
    )

    for helper, args in deprecated_calls:
        with pytest.warns(DeprecationWarning):
            helper(*args)
def test_parser_parser_private_not_warns():
    """Helpers imported from ``dateutil.parser._parser`` must not warn.

    Every call runs under its own ``warnings.catch_warnings()`` context with
    the "error" filter installed, so any warning raised by the call surfaces
    as an exception and fails the test.
    """
    from dateutil.parser._parser import _timelex, _tzparser
    from dateutil.parser._parser import _parsetz

    # (callable, positional arguments) for every helper under test.
    silent_calls = (
        (_tzparser, ()),
        (_timelex, ('2014-03-03',)),
        (_parsetz, ('+05:00',)),
    )

    for helper, args in silent_calls:
        with warnings.catch_warnings():
            warnings.simplefilter("error")
            helper(*args)
def test_parser_parser_private_not_warns():
    """Helpers imported from ``dateutil.parser._parser`` must not warn.

    Each helper is called while warnings are being recorded, and the test
    asserts that nothing was recorded.

    ``pytest.warns(None)`` is deliberately not used: it was deprecated in
    pytest 7 and is rejected by pytest 8. ``warnings.catch_warnings`` with
    ``record=True`` provides the same "collect everything" recorder.
    """
    # Local import keeps this test self-contained.
    import warnings

    from dateutil.parser._parser import _timelex, _tzparser
    from dateutil.parser._parser import _parsetz

    with warnings.catch_warnings(record=True) as recorder:
        # "always" makes sure no warning is hidden by an earlier filter.
        warnings.simplefilter("always")
        _tzparser()
    assert len(recorder) == 0

    with warnings.catch_warnings(record=True) as recorder:
        warnings.simplefilter("always")
        _timelex('2014-03-03')
    assert len(recorder) == 0

    with warnings.catch_warnings(record=True) as recorder:
        warnings.simplefilter("always")
        _parsetz('+05:00')
    assert len(recorder) == 0
def weekday_reader(self, query):
    """
    This module converts weekdays to their respective dates.

    Every weekday name (full or three-letter form, any case) found in the
    query is replaced, one match at a time, by a date formatted as
    "%d.%B.%Y" and computed relative to today:
        * a weekday later this week (or today) maps to that day of the
          current week, unless the token just before it is 'next', in which
          case a week is added (the existing cap for Sunday is kept);
        * a weekday earlier than today maps to the same day of next week.
    Params:
        Input:
            query - str
        Output:
            query - str
    """
    wkday = r'\b(monday|mon|tuesday|tue|wednesday|wed|thursday|thu|friday|fri|saturday|sat|sunday|sun)\b'
    check_day = re.search(wkday, query, re.I)
    while check_day:
        start = datetime.datetime.today()
        this_day = start.weekday()
        day_word = check_day.group(0)
        split_q = [x for x in _timelex(query.lower()) if x != ' ']
        that_day = self.info.weekday(day_word)
        if that_day >= this_day:
            # The tokens are lower-cased, so the matched word has to be
            # lower-cased too; otherwise "Monday" is never found in them.
            try:
                pos = split_q.index(day_word.lower())
            except ValueError:
                # The lexer did not emit the weekday as a token of its own.
                pos = -1
            # pos > 0 also stops index -1 from wrapping to the last token.
            if pos > 0 and split_q[pos - 1] == 'next':
                diff = that_day - this_day + 7
                diff = diff if diff < 7 + (6 - this_day) else diff - 7
            else:
                diff = that_day - this_day
        else:
            diff = (6 - this_day) + (that_day + 1)
        repl = (start + relativedelta(days=diff))
        # Replace exactly the span the regex matched. str.replace() would
        # rewrite the first *substring* occurrence, which may sit inside
        # another word (e.g. the "sat" in "saturn").
        query = (query[:check_day.start()]
                 + repl.strftime("%d.%B.%Y")
                 + query[check_day.end():])
        check_day = re.search(wkday, query, re.I)
    return query
def word2nummain(self, query):
    """
    Entry point of the word-to-number conversion.

    The query is passed through self.preprocess, cut into pieces at month
    names, connector words and '.'/':' characters, handed to
    self.convertword2num, and finally re-joined and re-tokenised so that
    tokens are separated by single spaces.

    Any exception is caught and printed with its traceback; in that case
    the function falls through and returns None.
    Params:
        Input:
            query - str
        Output:
            output_query - str
    """
    try:
        cleaned = self.preprocess(query)

        # The pattern is written with layout whitespace, which is stripped
        # before the pattern is used.
        splitter = regex.compile(
            r"""\b(month|year|january|jan|february|feb|march|mar|april|apr|may| june|jun|july|jul|august|aug|september|sept|sep|october|oct|november| nov|december|dec|for|to|th|nd|st|rd|of| (?<!hundred\s|thousand\s)and)\b|(\.|:)""").pattern
        splitter = splitter.replace('\n', '').replace(' ', '')

        # regex.split also returns the captured groups; unmatched groups
        # come back as None and are dropped here.
        pieces = []
        for piece in regex.split(splitter, cleaned, flags=re.I):
            if isinstance(piece, str):
                pieces.append(piece)

        converted = self.convertword2num(pieces)
        joined = " ".join(converted)
        tokens = [tok for tok in _timelex(joined) if tok != ' ']
        return ' '.join(tokens)
    except Exception as exc:
        print("the error in main is>>>", traceback.format_exc(), exc)
def timesplit(input_string):
    """Yield space-joined runs of consecutive time tokens from *input_string*.

    Tokens come from ``_timelex``. A token accepted by ``timetoken`` is added
    to the current run unless ``info.jump`` reports it as skippable; any
    other token closes the current run, which is yielded if it is non-empty.
    """
    pending = []
    for tok in _timelex(input_string):
        if not timetoken(tok):
            # A non-time token ends the run collected so far.
            if pending:
                yield " ".join(pending)
                pending = []
            continue
        if not info.jump(tok):
            pending.append(tok)
    # Flush whatever is left once the input is exhausted.
    if pending:
        yield " ".join(pending)
def timesplit(input_string):
    """Helper method used by __extract_dates.

    Yields space-joined runs of consecutive time tokens. The words 'to' and
    'and' always end the current run and yield it, even when it is empty
    (an empty string is yielded in that case). Any other non-time token ends
    the run only if something has been collected. Time tokens that the
    parser's ``info.jump`` flags are skipped.
    """
    pending = []
    for tok in _timelex(input_string):
        if tok in ('to', 'and'):
            # Range separators split unconditionally.
            yield " ".join(pending)
            pending = []
        elif not timetoken(tok):
            if pending:
                yield " ".join(pending)
                pending = []
        elif not dparser.parser().info.jump(tok):
            pending.append(tok)
    # Flush the trailing run, if any.
    if pending:
        yield " ".join(pending)
def preprocess(self, query, delta, dayfirst=True, monthfirst=False, yearfirst=False):
    """
    This module performs basic preprocessing on the input query.

    Steps, in order:
        1. split an ordinal suffix glued to a month name
           ("5thjanuary" -> "5th january");
        2. rewrite "a day(s)"/"a night(s)", "a week(s)", "a month(s)" and
           "a year(s)" as "1 day", "1 week", "1 month" and "1 year";
        3. re-tokenise with _timelex, join the tokens with single spaces
           and pad the result with a leading and trailing space;
        4. call self.fixed_delta_search;
        5. normalise "<day> [to|and|-|&] <day> <month>" style phrases so
           that day numbers carry a "th" marker (reg1 - reg4);
        6. call self.date_format_reader, apply reg5, then
           self.weekday_reader;
        7. replace 'this/next/last year' with 'year <YYYY>' and
           'this/next/last month' with the month name;
        8. replace now/today/tonight, tomorrow, yesterday and
           'day after (tomorrow)' with a "%d.%B.%Y" date.
    Params:
        Input:
            query - str
            delta - str (returned unchanged by this function)
            dayfirst - bool(true by default)[optional]
            monthfirst - bool(false by default)[optional]
            yearfirst - bool(false by default)[optional]
        Output:
            query - str
            delta - str
    """
    ptn = re.compile("(th|rd|st|nd)"
                     "(january|jan|february|feb|march|mar|april|apr|may|june|jun|july|jul|"
                     "august|aug|september|sept|sep|october|oct|november|nov|december|dec)")
    ptn = ptn.pattern.replace('\n', '').replace(' ', '')
    srch = re.search(ptn, query.lower())
    while srch:
        query = re.sub(srch.group(0), srch.group(1)+" "+srch.group(2), query, flags=re.I)
        # Search the lower-cased text again: the pattern is lower-case only,
        # so a plain search would miss later matches such as "1stMarch".
        srch = re.search(ptn, query.lower())

    query = re.sub(r'\b(a day(s)?|a night(s)?)\b', '1 day', query, flags=re.I)
    # The plural "s" is optional here, as for day/month/year; with a
    # mandatory "s" the phrase "a week" was never rewritten.
    query = re.sub(r'\b(a week(s)?)\b', '1 week', query, flags=re.I)
    query = re.sub(r'\b(a month(s)?)\b', '1 month', query, flags=re.I)
    query = re.sub(r'\b(a year(s)?)\b', '1 year', query, flags=re.I)

    # Re-tokenise so that every token is separated by exactly one space;
    # the regexes below rely on that spacing.
    query = list(_timelex(query))
    query = [each for each in query if each != ' ']
    query = ' '.join(query)
    query = ' '+query+' '
    query = self.fixed_delta_search(query)

    # The patterns below are written with layout whitespace that is removed
    # (newlines and spaces) before the pattern is used.

    # "<d> [th] to|and|-|& <d> [th] [of] <month>"  ->  " d th - d th month "
    reg1 = regex.compile(r"""(?<!(january|jan|february|feb|march|mar|april|apr|may| june|jun|july|jul|august|aug|september|sept|sep|october| oct|november|nov|december|dec))(\s) (\d+)(\s) (\b(th|st|rd|nd)\b(\s))? (-|\bto\b|\band\b|&)(\s) (\d+)(\s) (\b(th|st|rd|nd)\b(\s))? (\bof\s\b)? 
\b(january|jan|february|feb|march|mar|april|apr|may|june| jun|july|jul|august|aug|september|sept|sep|october|oct| november|nov|december|dec)\b""")
    reg1 = reg1.pattern.replace('\n', '').replace(' ', '')
    query = regex.sub(reg1, r" \3 th - \10 th \16 ", query, flags=re.I)

    # "<month> <d> [th] to|and|-|& <d> [th]"  ->  " d th - d th month "
    reg2 = re.compile(r"""(?<!\d)(\s) \b(january|jan|february|feb|march|mar|april|apr|may|june|jun|july|jul| august|aug|september|sept|sep|october|oct|november|nov|december|dec)\b (\s)(\d+)(\s) (\b(th|st|rd|nd)\b(\s))? (\bto\b|\band\b|-|&)(\s) (\d+)(\s) (\b(th|st|rd|nd)\b(\s))? (?!(\.|:|\\|\/|-))""")
    reg2 = reg2.pattern.replace('\n', '').replace(' ', '')
    query = re.sub(reg2, r" \4 th - \11 th \2 ", query, flags=re.I)

    # "<d> [of] <month>"  ->  " d th month "
    reg3 = re.compile(r"""(?<!(\d\s\.|\d\s:|\d\s\\|\d\s\/|\d\s-))(\s) (\d+)(\s)(\bof\s\b)?\b(january|jan|february|feb| march|mar|april|apr|may|june|jun|july|jul|august|aug|september| sept|sep|october|oct|november|nov|december|dec)\b""")
    reg3 = reg3.pattern.replace('\n', '').replace(' ', '')
    query = re.sub(reg3, r" \3 th \6 ", query, flags=re.I)

    # "<month> <d>"  ->  " month d th "
    reg4 = regex.compile(r"""(?<!(\d\s|\bst\b\s|\bth\b\s|\brd\b\s|\bnd\b\s|\bof\b\s)) \b(january|jan|february|feb|march|mar|april|apr|may|june|jun| july|jul|august|aug|september|sept|sep|october|oct|november|nov| december|dec)\b(\s)(\d+)(\s)""")
    reg4 = reg4.pattern.replace('\n', '').replace(' ', '')
    query = regex.sub(reg4, r" \2 \4 th ", query, flags=re.I)

    query = self.date_format_reader(query, dayfirst, monthfirst, yearfirst)

    # "<d> th <month> to <d> th <month> <year>": the trailing number
    # (group 15) is copied behind the first date as well.
    # NOTE(review): stripping spaces from the pattern also turns the
    # alternatives "of next month" / "this month" etc. into
    # "ofnextmonth" / "thismonth" - confirm whether that is intended.
    reg5 = re.compile(r"""(\d+)(\s+)?\b(nd|st|rd|th)\b(\s+)? \b(january|jan|february|feb|march|mar|april|apr|may|june|jun|july|jul| august|aug|september|sept|sep|october|oct|november|nov|december|dec| of next month|next month|of this month|this month)\b (\s+)?(\bto\b|\band\b|-)(\s+)? (\d+)(\s+)?\b(nd|st|rd|th)\b(\s+)? 
\b(january|jan|february|feb|march|mar|april|apr|may|june|jun|july|jul| august|aug|september|sept|sep|october|oct|november|nov|december|dec| of next month|next month|of this month|this month)\b(\s+)?(\d+)""")
    reg5 = reg5.pattern.replace('\n', '').replace(' ', '')
    query = re.sub(reg5, r"\1 \3 \5 \15 \7 \9 \11 \13 \15", query, flags=re.I)

    query = self.weekday_reader(query)

    # Relative years -> "year <YYYY>".
    this_year = datetime.datetime.now().year
    query = re.sub(r'(of this year|this year)', 'year ' + str(this_year), query, flags=re.I)
    next_year = datetime.datetime.now().year + 1
    query = re.sub(r'(of next year|next year)', 'year ' + str(next_year), query, flags=re.I)
    last_year = datetime.datetime.now().year - 1
    query = re.sub(r'(of (l|p)ast year|(l|p)ast year|of prev year|of previous year|prev year|previous year)',
                   'year ' + str(last_year), query, flags=re.I)
    print(query)

    # Relative months -> full month name.
    this_month = datetime.datetime.now().strftime("%B")
    query = re.sub(r'(of this month|this month)', this_month, query, flags=re.I)
    next_month = (datetime.date.today() + relativedelta(months=1)).strftime("%B")
    query = re.sub(r'(of next month|next month)', next_month, query, flags=re.I)
    last_month = (datetime.date.today() - relativedelta(months=1)).strftime("%B")
    query = re.sub(r'(of last month|last month|of previous month|of prev month|previous month|prev month)',
                   last_month, query, flags=re.I)

    # Relative days. "tomorrow" right after "after " is left alone here so
    # that "day after tomorrow" can be handled as a whole further down.
    temporal = [r'now', r'today', r'tonight', r'(?<!after\s)tomorrow', r'yesterday']
    days = re.findall(r'\b' + r'\b|\b'.join(temporal) + r'\b', query)
    if days:
        for day in days:
            if day == 'today' or day == 'now' or day == 'tonight':
                query = query.replace(day, datetime.datetime.now().date().strftime("%d.%B.%Y"))
            if day == 'tomorrow':
                query = re.sub(r'(?<!after\s)tomorrow',
                               (datetime.date.today() + relativedelta(days=1)).strftime("%d.%B.%Y"),
                               query, flags=re.I)
            if day == 'yesterday':
                query = query.replace(day, (datetime.date.today() - relativedelta(days=1)).strftime("%d.%B.%Y"))

    # Only the first "day after (tomorrow)" is replaced (today + 2 days).
    next_day = re.search(r'day after tomorrow|day after', query, re.I)
    if next_day:
        query = query[:next_day.span()[0]] + \
            ((datetime.datetime.now() + relativedelta(days=2)).date()).strftime("%d.%B.%Y") + \
            query[next_day.span()[1]:]
    return (query, delta)
def timesplit(self, query):
    """
    This module will split query into tokens and filter out irrelevant
    words. It will only keep month names, and numbers that signify a date
    or a year.

    Kept tokens are collected in order; a '_' marker is inserted where a
    non-date token follows a date-like token, and the collected list is
    then cut at those markers into one sub-list per date expression.

    Two-digit numbers that are taken as years are expanded to four digits,
    using whichever of the current or the previous century is closer to
    the current year. The expansion is also applied to the returned query
    text (for occurrences surrounded by single spaces).
    Params:
        Input:
            query - str
        Output:
            split_query - list of lists
            query - str
    """
    split_query = list(_timelex(query))
    split_query = [x for x in split_query if x != ' ']
    year_jump = ['of', 'in', 'year', 'years']
    date_jump = ['to', 'of', '-']
    date_check = ['th', 'rd', 'st', 'nd', '.']
    llist = []
    print(split_query)
    now = datetime.datetime.now().year
    if len(split_query) > 1:
        for ind, each in enumerate(split_query):
            # ---- first token -------------------------------------------
            if ind == 0:
                if (split_query[ind].isdigit() and \
                        split_query[ind+1] in date_check) or \
                        self.info.month(split_query[ind]):
                    llist.append(split_query[ind])
            # ---- last token --------------------------------------------
            if ind == len(split_query) - 1:
                if self.info.month(split_query[ind]):
                    llist.append(split_query[ind])
                if split_query[ind].isdigit():
                    # Nothing follows the last token, so the "not followed
                    # by month(s)" test used for middle tokens is trivially
                    # true here. Indexing ind + 1 at this point raised
                    # IndexError for queries ending in e.g. "in 2019".
                    if split_query[ind - 1] in date_check + date_jump or \
                            self.info.month(split_query[ind - 1]) or \
                            split_query[ind - 1] in year_jump or \
                            int(split_query[ind]) > 1000:
                        if int(split_query[ind]) > 1000:
                            llist.append(split_query[ind])
                        elif int(split_query[ind]) < 100:
                            # Two-digit year: choose the closer century.
                            future = int(split_query[ind]) + (datetime.datetime.now().year//100*100)
                            past = int(split_query[ind]) + (datetime.datetime.now().year//100*100 - 100)
                            rep = split_query[ind]
                            if (now - past) < (future - now):
                                century = datetime.datetime.now().year//100*100 - 100
                            else:
                                century = datetime.datetime.now().year//100*100
                            split_query[ind] = str(int(split_query[ind])+century)
                            query = query.replace(' '+rep+' ', ' '+split_query[ind]+' ')
                            split_query = [str(int(x)+century) if x == rep else x for x in split_query]
                            llist.append(split_query[ind])
            # ---- tokens in the middle ----------------------------------
            if ind > 0 and ind < len(split_query) - 1:
                if self.info.month(split_query[ind]):
                    llist.append(split_query[ind])
                elif split_query[ind].isdigit():
                    if split_query[ind + 1] in date_check and int(split_query[ind]) < 32:
                        # Day number followed by an ordinal suffix or '.'.
                        llist.append(split_query[ind])
                    elif split_query[ind - 1] in date_check + date_jump or \
                            self.info.month(split_query[ind - 1]) or \
                            (split_query[ind - 1] in year_jump and \
                            split_query[ind + 1] not in ["month", "months"]) or \
                            int(split_query[ind]) > 1000:
                        if int(split_query[ind]) > 1000:
                            llist.append(split_query[ind])
                        elif int(split_query[ind]) < 100:
                            # Two-digit year: choose the closer century.
                            future = int(split_query[ind]) + (datetime.datetime.now().year//100*100)
                            past = int(split_query[ind]) + (datetime.datetime.now().year//100*100 - 100)
                            rep = split_query[ind]
                            if (now - past) < (future - now):
                                century = datetime.datetime.now().year//100*100 - 100
                            else:
                                century = datetime.datetime.now().year//100*100
                            split_query[ind] = str(int(split_query[ind])+century)
                            query = query.replace(' '+rep+' ', ' '+split_query[ind]+' ')
                            split_query = [str(int(x)+century) if x == rep else x for x in split_query]
                            llist.append(split_query[ind])
                elif each not in year_jump + list(set(date_check)-set('.')) and not self.info.month(each) and \
                        (split_query[ind - 1].isdigit() or self.info.month(split_query[ind - 1]) or \
                        split_query[ind - 1] in list(set(date_check)-set('.')) + date_jump):
                    # A non-date token right after a date-like token: decide
                    # whether it separates two date expressions.
                    if each in ['and', 'to', '-']:
                        if split_query[ind - 1] not in list(set(date_check)-set('.')) and \
                                not split_query[ind + 1].isdigit():
                            llist.append('_')
                    elif each == '.':
                        if not (split_query[ind - 1].isdigit() and \
                                self.info.month(split_query[ind + 1])) and \
                                not (self.info.month(split_query[ind - 1]) and \
                                split_query[ind + 1].isdigit()):
                            llist.append('_')
                    else:
                        llist.append('_')
    elif split_query and self.info.month(split_query[0]):
        # Single-token query; the emptiness check avoids an IndexError when
        # the lexer produced no tokens at all.
        llist.append(split_query[0])
    print(llist)

    # Cut the flat list at every '_' marker into one list per expression.
    split_query = []
    small = []
    for ind, each in enumerate(llist):
        if each == '_' or ind == len(llist)-1:
            if each != '_':
                small.append(each)
            split_query.append(small)
            small = []
        else:
            small.append(each)
    split_query = [each for each in split_query if each != []]
    print(">>>>>>>",split_query)
    return split_query, query
def convertword2num(self, sent_list):
    """
    This is the main function where numbers are extracted from their
    alphabetic equivalents.

    For every sentence the number words (members of self.num_list or
    self.sim_list) are collected, then joined and passed to number(); on
    success the collected phrase is replaced in the sentence by the
    resulting value. While collecting:
        * a word from self.sim_list that starts a number phrase gets
          'one ' put in front of it when it is the first token or when
          number() of the previous token is 0;
        * a word whose value is 1-20 and that is followed by a
          self.num_list word with a value above 9 gets ' hundred' appended;
        * 'and' is kept only inside a number phrase, after a
          self.sim_list word and before a self.num_list word.
    Params:
        Input:
            sent_list - list of broken sentences
        Output:
            new_list - same list of sentences, but with numbers
    """
    new_list = []
    print(sent_list)
    for sent in sent_list:
        is_a_part = False   # True while inside a run of number words
        words = []          # number words collected for this sentence
        temp_sent = [x for x in list(_timelex(sent)) if x != ' ']
        for ind, wrd in enumerate(temp_sent):
            if wrd in self.num_list + self.sim_list:
                if not is_a_part:
                    is_a_part = True
                    if wrd in self.sim_list:
                        if ind == 0:
                            sent = sent.replace(wrd, u'one ' + wrd)
                            wrd = u'one ' + wrd
                            words.append(wrd)
                        elif number(temp_sent[ind - 1]) == 0:
                            sent = sent.replace(wrd, u'one ' + wrd)
                            wrd = u'one ' + wrd
                            words.append(wrd)
                    if ind != len(temp_sent) - 1 and \
                            number(temp_sent[ind]) in range(1, 21) and \
                            temp_sent[ind+1] in self.num_list and \
                            number(temp_sent[ind+1]) > 9:
                        sent = sent.replace(wrd, wrd + u' hundred')
                        wrd = wrd + u' hundred'
                        words.append(wrd)
                    elif wrd not in words:
                        # Not already added by the sim_list branch above.
                        words.append(wrd)
                else:
                    if ind != len(temp_sent)-1 and \
                            number(temp_sent[ind]) in range(1, 21) and \
                            temp_sent[ind+1] in self.num_list and \
                            number(temp_sent[ind+1]) > 9:
                        sent = sent.replace(wrd, wrd + u' hundred')
                        wrd = wrd + u' hundred'
                        words.append(wrd)
                    else:
                        words.append(wrd)
            elif wrd == 'and' and temp_sent[ind - 1] in self.sim_list:
                # Explicit bounds check instead of a bare "except: pass"
                # around the look-ahead, so unrelated errors are not hidden.
                if is_a_part and ind + 1 < len(temp_sent) and \
                        temp_sent[ind + 1] in self.num_list:
                    words.append(wrd)
            else:
                is_a_part = False

        # Normalise spacing of the collected phrase before converting it.
        word = " ".join(words)
        word = list(_timelex(word))
        word = ' '.join([x for x in word if x != ' '])
        try:
            num = number(word)
            print(num)
        except Exception:
            # Not convertible: leave the sentence untouched.
            num = word
        else:
            if word:
                sent = sent.replace(word, str(num))
        new_list.append(sent)
    return new_list
def build_test(i, test_string): python_tokens = list(_timelex(test_string)) formatted_tokens = 'vec!["' + '", "'.join(python_tokens) + '"]' return f'''