def run_search_dates_function_invalid_languages(self, text, languages, error_type):
    """Call ``search_dates`` with invalid ``languages`` and check the error.

    The exception raised by ``search_dates`` is stored on ``self.error`` and
    must be an instance of ``error_type``.

    :param text: text handed to ``search_dates``.
    :param languages: the (invalid) ``languages`` argument under test.
    :param error_type: exception class (or tuple of classes) expected.
    """
    try:
        search_dates(text=text, languages=languages)
    except Exception as error:
        self.error = error
        self.assertIsInstance(self.error, error_type)
    else:
        # Without this branch the helper silently passed when nothing was
        # raised, so a regression that accepts invalid languages went unseen.
        self.fail(
            "search_dates did not raise; expected {!r}".format(error_type)
        )
def sep(text):
    """Return the first date found among date-looking tokens as 'YYYY/MM/DD'.

    Tokens of ``text`` are kept when they contain two or three '/' or '-'
    characters, or a straight/curly apostrophe. The kept tokens are joined
    with spaces and searched for dates; the text of the first match is then
    searched again on its own and the first resulting date is formatted.

    :param text: iterable of string tokens.
    :return: date string in ``%Y/%m/%d`` form, or ``None`` when no date is found.
    """
    apostrophes = ("'", "’")
    kept = [
        token
        for token in text
        if token.count('/') in (2, 3)
        or token.count('-') in (2, 3)
        or any(mark in token for mark in apostrophes)
    ]

    found = search_dates(' '.join(kept))
    if found is None:
        return None

    # Only the matched substring of the first hit is re-examined.
    rescanned = search_dates(found[0][0])
    if rescanned is None:
        return None

    # RETURN DATE IN YYYY/MM/DD
    return rescanned[0][1].strftime('%Y/%m/%d')
def run(self, request, response):
    """Locate dates in ``request['text']`` and report their character spans.

    :param request: mapping with ``'lang'``, ``'text'`` and optional ``'debug'``.
    :param response: unused here; kept for the caller's interface.
    :return: list of dicts with ``start``, ``end`` and ``date`` (formatted
        with the module-level ``_format``); when ``debug`` is truthy each
        dict also carries the matched ``text``.
    :raises MissingLanguage: if ``request['lang']`` is not in ``self.langs``.
    :raises ValueError: from ``str.index`` if a matched chunk cannot be
        located in the text after the previous match.
    """
    lang = request['lang']
    if lang not in self.langs:
        raise MissingLanguage(lang)

    text = request['text']
    debug = request.get('debug', False)

    # The search used to run twice (once for the None check, once for the
    # loop); run it once and treat "nothing found" as an empty sequence.
    matches = search_dates(text, languages=self.langs) or []

    result = []
    end = 0
    for chunk, date in matches:
        # Search from the end of the previous match so repeated chunks map
        # to successive positions.
        start = text.index(chunk, end)
        end = start + len(chunk)
        entry = {'text': chunk} if debug else {}
        entry.update({
            'start': start,
            'end': end,
            'date': date.strftime(_format)
        })
        result.append(entry)
    return result
def _str_to_date(std):
    """Translate Russian terms in ``std`` and extract exactly two dates.

    Every key of ``rus_to_eng_dic`` found in ``std`` is replaced by its value
    before searching.

    :param std: input string.
    :return: the ``search_dates`` result when it holds exactly two matches;
        otherwise two ``('Now', <current datetime>)`` placeholder tuples.
    """
    for russian, english in rus_to_eng_dic.items():
        std = std.replace(russian, english)

    # Search once; the result may be None when nothing is found, which
    # previously crashed on len(None).
    found = search_dates(std)
    if found is not None and len(found) == 2:
        return found
    return [('Now', datetime.datetime.today()),
            ('Now', datetime.datetime.today())]
def parse(self):
    """
    Parse the table on the right hand side or a wiki page that holds the
    details of a person.

    Fills ``self.birthday``, ``self.died`` and ``self.monarchs`` from the
    rows of ``self.info`` when they are not already set.

    :return: None
    """
    logger.info("parseing the PM page for %s", self.path)

    def first_date(cells):
        """Return the date of the first cell that contains one, else None."""
        for cell in cells:
            # Non-ASCII characters are dropped first: there are edge cases
            # where unicode characters sit next to the date, e.g.
            # https://en.wikipedia.org/wiki/Spencer_Compton,_1st_Earl_of_Wilmington
            # has only a year with a thin-space next to it.
            found = search_dates(cell.encode("ascii", "ignore").decode("utf-8"))
            if found is not None:
                # search_dates gives a list of (text, datetime) tuples; take
                # the datetime of the last tuple (one is expected anyway).
                return found[-1][-1]
        return None

    invalid_items = ("\n", " ", "\t")
    for tr in self.info.findAll("tr"):
        txt = tr.getText(separator="|")
        cells = [x for x in txt.split("|") if x not in invalid_items]

        # TODO: still slow, and this will find dates for the likes of
        # 'now', 'today' etc. Scanning now stops at the first cell that
        # yields a date instead of parsing every cell in the row.
        if "Born" in cells and self.birthday is None:
            born = first_date(cells)
            if born is not None:
                self.birthday = born
        elif "Died" in cells and self.died is None:
            died = first_date(cells)
            if died is not None:
                self.died = died
        elif "Monarch" in cells and not self.monarchs:
            self.monarchs = cells[1:]
def date_func(name):
    """Build a '"datetime": "..."' string from dates detected in ``name``.

    The result concatenates three sections, separated by the literal markers
    used below: the ``search_dates`` tuples (joined by ``*xxx*`` and closed
    with ``++``), the DATE entities reported by ``nlp`` (each followed by
    ``*xx*`` and closed with ``XXXXX``), and a sorted list of ``dd-mm-YYYY``
    strings from ``datefinder`` plus resolved today/yesterday/tomorrow tokens.

    Returns "" when ``search_dates`` finds nothing.
    """
    dates = ""
    # date_format / date_stopwords are helpers defined elsewhere; presumably
    # they normalise the text and strip stop words — verify against them.
    string = date_format(name)
    stop_word = date_stopwords(name)
    date = search_dates(stop_word)
    print('Date time')
    print(date)
    if (date == None):
        date = ""
        return date
    # Section 1: string form of every search_dates result, '*xxx*' between
    # entries (no separator after the last one).
    for match in range(len(date)):
        if ((match + 1) == len(date)):
            dates = dates + str(date[match])
        else:
            dates = dates + str(date[match]) + "*xxx*"
    dates = dates + "++"
    # Section 2: entities labelled DATE, taken sentence by sentence.
    text = nlp(string)
    for num, sen in enumerate(text.sents):
        for ent in sen.ents:
            is_present = False
            is_date = search_dates(ent.text)
            if ent.label_ == 'DATE':
                dates = dates + (str(ent.text)) + "*xx*"
                st = ent.text
                # The search runs on the whole entity text each time, so the
                # flag does not depend on the individual character `tok`.
                for tok in st:
                    if (re.search('or|and|&', st)):
                        is_present = True
                if (is_present == True):
                    dates = dates + "*uxm*"
                # NOTE(review): is_date may be None when search_dates finds
                # nothing in the entity text; len(None) would raise here.
                elif (len(is_date) > 1):
                    dates = dates + "*uxr*"
    dates = dates + "XXXXX"
    # Section 3: `date` is reused as a list of 'dd-mm-YYYY' strings.
    date = []
    matches = (datefinder.find_dates(string))
    for match in matches:
        date.append(match.strftime('%d-%m-%Y'))
    string = nlp(string)
    sentence = [token.text for token in string]
    for token in sentence:
        if (re.search('today|tomorrow|yesterday', token)):
            if (token == 'today'):
                token = datetime.today().strftime('%d-%m-%Y')
            elif (token == 'yesterday'):
                token = (datetime.now() - timedelta(days=1)).strftime('%d-%m-%Y')
            elif (token == 'tomorrow'):
                token = (datetime.now() + timedelta(days=1)).strftime('%d-%m-%Y')
            # NOTE(review): a token that merely contains one of the words
            # (not an exact match) is appended unchanged and would make the
            # strptime-based sort below raise ValueError — confirm intent.
            date.append(token)
    date.sort(key=lambda date: datetime.strptime(date, '%d-%m-%Y'))
    for d in date:
        dates = dates + d + "*xx*"
    if (dates != ""):
        return ('"datetime": "' + dates + '"')
    else:
        return ""
async def on_competitive_feed_post(message: discord, bot: commands.Bot): comp_feed_info = bot.get_channel(ids.COMPETITIVE_FEED_INFO) parts = message.clean_content.strip().split("\n") if len(parts) < 2: await message.delete() await comp_feed_info.send( f" {message.author.mention} your message was deleted because it doesn't follow the format. Please see the pins for an example." ) return await comp_feed_info.send( f"```{message.clean_content[:1990]}```") tournament_name = (parts[0].replace("*", "").replace("> ", "").replace( "_", "").replace("`", "")) description = "\n".join(parts[1:]).strip().replace("> ", "") discord_invite_url = None for word in description.split(): if "https://discord.gg/" in word or "https://discord.com/" in word: discord_invite_url = word break if discord_invite_url is None: await message.delete() await comp_feed_info.send( f" {message.author.mention} your message was deleted because it didn't contain a valid Discord server link with the format `https://discord.gg/asdasd`. Please see the pins for an example." ) return await comp_feed_info.send( f"```{message.clean_content[:1990]}```") try: if re.sub(r"T|\W", r"", parts[1]).isdigit(): # Using ISO string date = datetime.datetime.fromisoformat(parts[1] + "+00:00") description = "\n".join(parts[2:]).strip().replace( "> ", "") # Update description to remove ISO string else: # Get all dates all_dates = [] for line in parts[1:]: if dates := search_dates(line): all_dates += dates # Remove dates that have already passed possible_dates = [ d for d in all_dates if d[1] > datetime.datetime.now(d[1].tzinfo) ] # Check if any date has specified a timezone, if so, strip dates without a timezone if utc_dates := [ d for d in possible_dates if d[1].tzname() == "UTC" ]: # Prioritize specifying UTC possible_dates = utc_dates elif tz_dates := [d for d in possible_dates if d[1].tzname()]: possible_dates = tz_dates
def parse(self, response):
    """Yield one concert dict per ``div.txtNew`` that has non-empty notes.

    :param response: scrapy-style response exposing ``css()`` and
        ``request.url``.
    :return: generator of dicts with ``website``, ``notes`` (list of
        stripped, non-empty strings) and fixed venue details.
    """
    for div in response.css('div.txtNew'):
        # Strip whitespace and drop empty strings. A real list is built
        # (rather than filter()) because on Python 3 a filter object never
        # equals [] and has no len().
        notes = [n.strip() for n in div.css('h1::text').extract()]
        notes = [n for n in notes if n]

        # Skip the concert if we're just facing an empty list
        if not notes:
            continue

        concert = {
            'website': response.request.url,
            'notes': notes,
            'venue': 'B Sharps Jazz Cafe',
            'venue_address': '648 W Brevard St',
            'venue_website': 'https://www.b-sharps.com/'
        }

        # Look for date and time with dateparser
        for note in notes:
            found = search_dates(note)
            if found:
                # TODO: nothing is done with the detected dates yet.
                pass

        yield concert
def get_links_and_amendments(base_url, elements):
    """Get link data from a set of BS elements (columns in a row).

    :param base_url: URL against which each link's ``href`` is resolved.
    :param elements: iterable of elements exposing ``find_all('a')``.
    :return: ``(link_data, amend_data)`` where ``link_data`` is a list of
        dicts with ``title``, ``href`` and, when the title contains a known
        keyword, ``type``; ``amend_data`` lists the first date found in each
        amendment title.
    """
    # Checked in order; the first keyword contained in the title wins.
    type_by_keyword = (
        ('Letter', 'letter of authorization'),
        ('Providers', 'fact sheet for healthcare providers'),
        ('Patients', 'fact sheet for patients'),
        ('Summary', 'eua summary'),
        ('Instructions', 'instructions for use'),
        ('Amendment', 'amendment'),
    )

    link_data = []
    amend_data = []
    for element in elements:
        for link in element.find_all('a'):
            title = link.get('title')
            data = {
                'title': title,
                'href': urljoin(base_url, link.get('href')),
            }
            # Anchors without a title attribute used to raise TypeError on
            # the membership test; treat them as having no keyword.
            searchable = title or ''
            link_type = next(
                (kind for keyword, kind in type_by_keyword if keyword in searchable),
                None,
            )
            if link_type is not None:
                data['type'] = link_type
            if link_type == 'amendment':
                amend_date = search_dates(searchable)
                if amend_date:
                    amend_data.append(amend_date[0][1].date())
            link_data.append(data)
    return link_data, amend_data
def date_parser(text):
    """Return whatever ``search_dates`` finds in ``text``.

    :param text: text to scan.
    :return: the ``search_dates`` result unchanged, or ``[]`` if the search
        raised an error (date not present or not extractable).
    """
    try:
        return search_dates(text)
    except Exception:
        # Narrowed from a bare ``except`` so KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        return []
def get_entities_col(column):
    """Return the most frequent entity label across the values of a column.

    For each value the most common entity label reported by ``nlp.process``
    is recorded; values that are not parseable as floats but contain a date
    additionally record "DATE". The mode of the recorded labels is returned.

    :param column: iterable of values (numbers, sentences, ...).
    :return: the modal label, or ``None`` when no label was recorded.
    """
    col_ents = []
    for value in column:
        valstr = str(value)

        # NOTE: a "value" can be anything (number, sentence, etc.)
        labels = [ent.label_ for ent in nlp.process(valstr).ents]
        if labels:
            # Most common entity type for this value.
            col_ents.append(max(set(labels), key=labels.count))

        # TODO: Override Cardinal pretty often...
        try:
            float(valstr)
        except ValueError:
            if search_dates(valstr) is not None:
                col_ents.append("DATE")

    frame = pd.DataFrame(col_ents)

    # Hacky fix for cases when there isn't an entity associated with a header
    # We can do something more elegant here...
    try:
        return frame.mode()[0][0]
    except KeyError:
        return None
def parseMsg(txt, tries=0):
    """Split a reminder message into its text and the time it refers to.

    :param txt: raw message.
    :param tries: retry counter; while the detected time lies in the past
        the message is adjusted and parsed again, for up to three retries.
    :return: ``(text, when, timestr)`` — the message with the time phrase
        removed, the POSIX timestamp of the detected date (``None`` if no
        date was found) and the matched time phrase ("" if none).
    """
    print("============================")
    print(txt)
    formatted = Manager.formatKnown(txt)
    res = search_dates(formatted, add_detected_language=True)
    if res is None:
        return txt, None, ""

    print("######################", formatted)
    print(res)
    timestr = res[0][0]
    date = res[0][1].timestamp()

    diff = time.time() - date
    print("DIFF", diff, time.ctime(date))
    if diff > 0 and tries < 3:
        # Detected time is in the past: adjust the message and try again.
        print("RETRY ")
        if tries == 0:
            new = Manager.changeDay(formatted)
        else:
            new = formatted.replace(timestr, "in " + timestr)
        return Manager.parseMsg(new, tries + 1)

    # remove timestr from reminder; compare by value, not identity
    # (``is not ""`` depends on string interning).
    if timestr != "":
        formatted = txt.replace(timestr, "")
    return formatted, date, timestr
def processDateExtraction(self, ext_text):
    """Extract the first date from ``ext_text`` as ``{'date': 'YYYY-MM-DD'}``.

    The first alphabetic run in the text, when it is not itself a known
    month name, is replaced by the first month name that contains it before
    the text is searched for dates.

    :param ext_text: text expected to hold a date.
    :return: ``{'date': <formatted date>}``, or ``{'date': 'null'}`` when no
        date is found or any error occurs.
    """
    month_names = [
        'jan', 'january', 'feb', 'february', 'mar', 'march', 'apr',
        'april', 'may', 'june', 'jun', 'july', 'jul', 'aug', 'august',
        'sept', 'september', 'oct', 'october', 'nov', 'november', 'dec',
        'december'
    ]
    try:
        word = re.search(r'[a-z]+', ext_text, re.IGNORECASE)
        if word is not None and word.group() not in month_names:
            # First month name containing the matched fragment, if any.
            expanded = next(
                (month for month in month_names if word.group() in month),
                None,
            )
            if expanded is not None:
                ext_text = ext_text.replace(word.group(), expanded)

        found = search_dates(ext_text)
        if found is None:
            return {'date': 'null'}
        return {'date': found[0][1].strftime("%Y-%m-%d")}
    except Exception:
        return {'date': 'null'}
def _infiltrate(self, text):
    """
    Analyse an utterance and try to extract a single datetime from it.

    Exactly one detected datetime counts as success. When several
    candidates are detected they are printed for investigation and the
    call reports failure.

    :param text: str
    :return: tuple: (was something extracted: bool, retrieved data: any)
        1. True, {'raw_subtext': "tomorrow at 3 o'clock", 'value': datetime_obj}
        2. False, None
    """
    candidates = search_dates(text)
    if not candidates:
        return False, None

    if len(candidates) != 1:
        # investigate
        print("We have many datetime slot candidates in message we need to resolve this issue!")
        print(candidates)
        return False, None

    raw_subtext, datetime_obj = candidates[0]
    return True, {'raw_subtext': raw_subtext, 'value': datetime_obj}
async def parse_datetime(self, arg):
    """Parse the first date in ``arg``, preferring future dates.

    Dots in ``arg`` are replaced by dashes before searching. If the input
    says "next <weekday>" and that weekday is part of the matched text, a
    week is added. A result still in the past is moved to today's day of
    month and, if that is still in the past, one day further.

    :param arg: user supplied text.
    :return: ``(datetime, matched_text)`` or ``(None, "")`` if nothing is found.
    """
    dates = search_dates(
        arg.replace(".", "-"),
        languages=["en"],
        settings={
            "PREFER_DATES_FROM": "future",
            "PREFER_DAY_OF_MONTH": "first",
            "DATE_ORDER": "DMY"
        },
    )
    if dates is None:
        return None, ""

    date_str = dates[0][0]
    date = dates[0][1]

    lowered_arg = arg.lower()
    matched_text = date_str.lower()
    weekdays = (
        "monday", "tuesday", "wednesday", "thursday", "friday", "saturday",
        "sunday",
    )
    if any("next " + day in lowered_arg and day in matched_text
           for day in weekdays):
        date = date + timedelta(days=7)

    if date < datetime.now():
        date = date.replace(day=datetime.now().day)
    if date < datetime.now():
        date = date + timedelta(days=1)

    return date, date_str
def search_years(query):
    """Return the sorted years referred to by ``query``.

    With zero date matches this defers to ``get_years``; with one match the
    year of that match is returned. With more matches, a match whose text
    is exactly a four digit year is the "definite" date and a match with no
    four digit year is "relative"; the relative date is shifted by the
    distance between now and the definite date.

    :param query: free text.
    :return: sorted list of years, or the ``get_years(query)`` result on
        any failure.
    """
    year_pattern = r'[0-9]{4}'
    parsed_query = search_dates(query)
    if parsed_query is None:
        return get_years(query)
    if len(parsed_query) < 2:
        return sorted(when.year for _, when in parsed_query)

    definite = 0
    relative = 0
    try:
        for text, date in parsed_query:
            print(text, date)
            years = re.findall(year_pattern, text)
            if not years:
                # No four digit year in the phrase.
                relative = date
            elif years[0] == text:
                definite = date
        now = datetime.datetime.now()
        relative = relative - (now - definite)
        return sorted([relative.year, definite.year])
    except Exception as e:
        print(e)
        return get_years(query)
def ex_date_search(key, cnt: Text, comp, ctx: cla_meta_intf):
    """Record the dates found in ``cnt`` on ``ctx``.

    :param key: key passed through to ``ctx.add_result``.
    :param cnt: text to search, in the language ``ctx.lang``.
    :param comp: component passed through to ``ctx.add_result``.
    :param ctx: context providing ``lang`` and ``add_result``.
    :return: ``True`` if any date was found and recorded, else ``False``.
    """
    from dateparser.search import search_dates

    hits = search_dates(cnt, languages=[ctx.lang])
    if hits is None:
        return False
    ctx.add_result(extractor, comp, key, list(map(str, hits)))
    return True
def date_finder(text):
    """Print the results of several date-extraction approaches side by side.

    Each approach runs in its own try/except so a failure in one is printed
    and does not stop the others. Nothing is returned.

    NOTE(review): the first approaches read a name ``s`` that is not a
    parameter or local of this function — presumably a module-level string;
    confirm whether ``text`` was intended.
    NOTE(review): uses Python 2 print statements.
    """
    date =""
    date_pattern = '%{YEAR:year}-%{MONTHNUM:month}-%{MONTHDAY:day}'
    matches = list(datefinder.find_dates(s))
    match_date = re.search('\d{4}-\d{2}-\d{2}', s)
    # 1. dateutil: parse every line of `s` on its own.
    try:
        print "====using dateutil"
        for i in s.splitlines():
            d = parser.parse(i)
            print(d.strftime("%Y-%m-%d"))
    except Exception as e:
        print e
    # 2. pygrok: match `s` against the grok date pattern above.
    try:
        print "====pygrok==="
        grok = Grok(date_pattern)
        print(grok.match(s))
    except Exception as e:
        print e
    # 3. datefinder: first of the matches collected above.
    try:
        print "====using date==="
        if len(matches) > 0:
            date = matches[0]
            print date
        else:
            print 'No dates found'
    except Exception as e:
        print e
    # 4. regex: YYYY-MM-DD found by re.search above; match_date is None when
    # nothing matched, in which case .group() raises and the error is printed.
    try:
        print "====using date==="
        date = datetime.datetime.strptime(match_date.group(), '%Y-%m-%d').date()
        print date
    except Exception as e:
        print e
    # 5. NLTK chunking: proper noun(s) followed by a cardinal number.
    try:
        print "====using Chunkgrams==="
        chunkGram = r"""NE:{<NNP>+<CD>}"""
        chunkParser = nltk.RegexpParser(chunkGram)
        sentences = nltk.sent_tokenize(text)
        tokenized_sentences = [nltk.word_tokenize(sentence.strip()) for sentence in sentences]
        tagged_sentences = [nltk.pos_tag(i) for i in tokenized_sentences]
        chunked_sentences = [chunkParser.parse(i) for i in tagged_sentences]
        entity_names = []
        for tree in chunked_sentences:
            entity_names.extend(extract_entity_names(tree))
        print entity_names
    except Exception as e:
        print e
    # 6. pydatum: treat the whole text as an ISO date string.
    try:
        print "===using pydatum=="
        datum = Datum()
        print (datum.from_iso_date_string(text))
    except Exception as e:
        print e
    # 7. dateparser: search the text after dropping non-ASCII characters.
    try:
        print "===using dateparser=="
        date = search_dates(text.decode('ascii','ignore'))
        print date
    except Exception as e:
        print e
def datetime(self, text, lang='en'):
    """
    Print what dateparser's ``search_dates`` and ``parse`` make of ``text``.

    $ python -m sagas.nlu.extractor_cli datetime 'tomorrow at eight' en
    $ python -m sagas.nlu.extractor_cli datetime 'two weeks ago' en
        .. search: [('two weeks ago', datetime.datetime(2019, 11, 29, 1, 57, 25, 466421))]
        .. parse: 2019-11-29 01:57:25.468518
    $ python -m sagas.nlu.extractor_cli datetime 'Jumat lalu' id
    $ python -m sagas.nlu.extractor_cli datetime '12 Mei 2008' id
        .. search: [('12 Mei 2008', datetime.datetime(2008, 5, 12, 0, 0))]
        .. parse: 2008-05-12 00:00:00
    $ python -m sagas.nlu.extractor_cli datetime 'Besok malam jam 8' id
        .. search: [('Besok', datetime.datetime(2019, 12, 1, 23, 22, 16, 689529)), ('jam 8', datetime.datetime(2019, 8, 30, 0, 0))]
        .. parse: None
    $ python -m sagas.nlu.extractor_cli datetime 'Minggu depan' id
        .. search: None
        .. parse: 2020-02-19 16:47:48.548957
    $ python -m sagas.nlu.extractor_cli datetime '三月开始去上学' zh
    $ python -m sagas.nlu.extractor_cli datetime '2008年12月に上海に行きたいです。' ja

    :param text: text to analyse
    :param lang: language code handed to dateparser
    :return: None; both results are printed
    """
    from dateparser.search import search_dates
    from dateparser import parse

    languages = [lang]

    # e.g. search_dates('Jumat lalu', languages=['id'])
    found = search_dates(text, languages=languages)
    print(f".. search: {found}")

    # e.g. parse('12 Mei 2008', languages=['id'])
    parsed = parse(text, languages=[lang])
    print(f".. parse: {parsed}")
async def toEpochTime(self, ctx, *, timeStr: str):
    """Converts a date to a timestamp, and shows that time in your own local time"""
    # The first timezone-aware match in the upper-cased input is used.
    matches = search_dates(timeStr.upper(),
                           settings={'RETURN_AS_TIMEZONE_AWARE': True})
    if not matches:
        # Previously the missing result was subscripted directly, which
        # raised TypeError and left the user without a reply.
        await ctx.send("Sorry, I couldn't find a date or time in that message.")
        return

    _, moment = matches[0]
    epoch = int(moment.timestamp())
    await ctx.send(
        f"`{epoch}` is the timestamp for `{moment.strftime('%c in timezone %Z')}`\nThe basic timestamp would look like this: <t:{epoch}:F>"
    )
def get_dateparser_dates(self, text=None):
    """
    Extract possible dates with dateparser.

    :param text: text to search; falls back to ``self.TEXT`` when falsy.
    :return: list of matches from ``search_dates`` (empty when none).
    """
    if not text:
        text = self.TEXT

    # INFO: 'DATE_ORDER': 'DMY' prevents parsing date like 2004-12-13T00:00:00Z,
    # use SKIP_TOKENS setting if needed along with DATE_ORDER
    found = search_dates(
        text,
        languages=[self.LANGUAGE],
        settings=self.DATEPARSER_SETTINGS,
    )
    return found if found else []
def parse_dates_possibilities(text):
    """Search the expanded, upper-cased ``text`` for English dates.

    Matches are interpreted in US/Eastern and converted to UTC through the
    dateparser settings.

    :param text: raw event text.
    :return: the ``search_dates`` result, or ``None`` if the search raised.
    """
    expanded = expand_event_time(text.upper())
    tz_settings = {'TIMEZONE': 'US/Eastern', 'TO_TIMEZONE': 'UTC'}
    try:
        return search_dates(expanded, languages=['en'], settings=tz_settings)
    except Exception as e:
        print("search dates errored out ===>")
        print(e)
        return None
def find_dates(tweets):
    """Search text for dates and return the first one found.

    Hashtags and bracketed/parenthesised spans are removed before searching.

    :param tweets: tweet text.
    :return: the first detected datetime, or ``None`` if there is none.
    """
    # Raw strings: the same patterns as before, without the invalid escape
    # sequences ("\#", "\w", "\(" ...) that non-raw literals warn about.
    without_hashtags = re.sub(r"\#[\w\_]+", "", tweets)
    cleaned = re.sub(r"[\(\[].*?[\)\]]", "", without_hashtags)

    found = search_dates(cleaned)
    if found:
        # Each match is a (text, datetime) tuple.
        return found[0][1]
    return None
def fetchBussinessEvents():
    """Scrape London business events from Eventbrite into a DataFrame.

    Walks every result page of the fixed May 2018 search using the
    module-level ``driver`` and collects one row per listed event.

    :return: ``pandas.DataFrame`` with columns ``Event``, ``Venue``,
        ``Date`` (``YYYY-MM-DD``, or ``None`` when no date could be
        determined) and ``Category``.
    """
    url = 'https://www.eventbrite.com/d/united-kingdom--london/business--events/?crt=regular&end_date=05/31/2018&sort=best&start_date=05/01/2018&subcat=1007'
    driver.get(url)
    sleep(5)
    parsers = html.fromstring(driver.page_source, driver.current_url)

    # The second to last pagination item holds the total page count.
    pages = parsers.xpath(
        "/html/body/div[4]/section[2]/div[7]/nav/div/div/ul/li")
    nth = len(pages) - 1
    totalPages = parsers.xpath(
        "/html/body/div[4]/section[2]/div[7]/nav/div/div/ul/li[%s]/a/text()"
        % nth)
    pageNums = int(totalPages[0])

    # Rows are gathered in a list and turned into a frame once at the end;
    # DataFrame.append was removed in pandas 2.0.
    rows = []
    prevDate = None
    for i in range(1, pageNums + 1):
        driver.get('{}&page={}'.format(url, i))
        sleep(8)
        parsers = html.fromstring(driver.page_source, driver.current_url)
        eventsContainer = parsers.xpath(
            ".//*[@data-automation='event-list-container']/div")
        for events in eventsContainer:
            venuePlace = events.xpath(
                "normalize-space(a/div[2]/div[2]/text())")
            dateText = events.xpath("normalize-space(a/div[2]/time/text())")
            eventName = events.xpath("normalize-space(a/div[2]/div[1]/text())")

            eventDate = search_dates(dateText)
            # Several matches are ambiguous and reuse the previous event's
            # date. A missing match (None) does the same instead of
            # crashing on len(None).
            if not eventDate or len(eventDate) > 1:
                eventDate = prevDate
            prevDate = eventDate

            if eventDate:
                eventOn = eventDate[0][1].date().strftime('%Y-%m-%d')
            else:
                # No usable date yet (first event had none).
                eventOn = None

            rows.append({
                'Event': eventName,
                'Venue': venuePlace,
                'Date': eventOn,
                'Category': 'Bussiness'
            })
    return pd.DataFrame(rows)
def check_for_dates(sent: str) -> Triple:
    """Build a 'Study date' triple when ``sent`` contains exactly two dates.

    :param sent: sentence to search.
    :return: a ``Triple`` whose object is ``"<first date> - <second date>"``,
        or ``None`` when the sentence does not hold exactly two dates.
    """
    dates = search_dates(sent)
    if dates is None or len(dates) != 2:
        return None

    triple = Triple()
    triple.add_subject('Contribution 1', -1, -1, sent)
    triple.add_predicate('Study date', -1, -1, sent)
    first_day = dates[0][1].date()
    second_day = dates[1][1].date()
    triple.add_object(f'{first_day} - {second_day}', -1, -1, sent)
    return triple
def stage3(self, sentence):
    """Return the first date found in ``sentence`` as a one-element list.

    The date is also printed.

    :param sentence: text to search.
    :return: ``[str(date)]`` for the first match, or ``[]`` when none is found.
    """
    found = search_dates(sentence)
    if found is None:
        return []

    first_day = found[0][1].date()
    print(first_day)
    return [str(first_day)]
def parse_time(time_string, base_time, timezone_string):
    """Parse ``time_string`` relative to ``base_time`` and return it in UTC.

    Three strategies are tried in order until one produces a value:
    ``dateparser.parse``, ``search_dates`` (first match, accepted only if it
    is later than ``base_time``) and ``cal.parseDT``. Errors in a strategy
    count as "no result".

    :param time_string: text describing a time.
    :param base_time: reference time for relative expressions.
    :param timezone_string: timezone name used for ``base_time`` and for
        localising a naive result; ``None`` means UTC for the latter.
    :return: the parsed time passed through ``datetime_as_utc``, or ``None``.
    """
    base_time = datetime_as_timezone(base_time, timezone_string)

    def relative_settings():
        # Built lazily so it is evaluated inside each strategy's try block.
        return {
            "PREFER_DATES_FROM": 'future',
            "RELATIVE_BASE": base_time.replace(tzinfo=None)
        }

    # Strategy 1: the whole string as one date expression.
    try:
        date_time = dateparser.parse(
            time_string, languages=['en'], settings=relative_settings())
    except Exception:
        date_time = None

    # Strategy 2: first date found anywhere in the string, if in the future.
    if date_time is None:
        try:
            results = search_dates(
                time_string, languages=['en'], settings=relative_settings())
            if results is not None:
                candidate = results[0][1]
                comparable = candidate
                if comparable.tzinfo is None:
                    comparable = datetime_force_utc(comparable)
                date_time = candidate if comparable > base_time else None
        except Exception:
            date_time = None

    # Strategy 3: the calendar parser; a result code of 0 is discarded.
    if date_time is None:
        try:
            date_time, result_code = cal.parseDT(time_string, base_time)
            if result_code == 0:
                date_time = None
        except Exception:
            date_time = None

    if date_time is None:
        return None

    if date_time.tzinfo is None:
        if timezone_string is None:
            date_time = datetime_force_utc(date_time)
        else:
            date_time = pytz.timezone(timezone_string).localize(date_time)

    return datetime_as_utc(date_time)
def dateFromText(text, isprojecttask):
    """Return the first date found in ``text``.

    :param text: text to search.
    :param isprojecttask: when truthy a missing date yields ``None``;
        otherwise it yields the current time.
    :return: the first detected datetime, or the fallback described above.
    """
    dates = search_dates(text)
    if dates is not None:
        return dates[0][1]
    return None if isprojecttask else datetime.datetime.now()
def _search_first_date(string):
    """Find the first strictly parsed English date in ``string``.

    :param string: text to search.
    :return: ``(date, remaining)`` where ``remaining`` is the text after the
        first occurrence of the matched term. ``date`` is ``None`` when the
        term is not a complete date according to ``_is_complete_date``.
        ``(None, '')`` is returned when nothing is found.
    """
    candidates = search_dates(string, languages=['en'],
                              settings={'STRICT_PARSING': True})
    if not candidates:
        return None, ''

    first_term, first_date = candidates[0]
    # Keep everything after the first occurrence. Splitting on every
    # occurrence and re-joining dropped later repeats of the same term
    # from the remainder.
    remaining = string.partition(first_term)[2]
    if _is_complete_date(first_term):
        return first_date, remaining
    return None, remaining
def parse(self):
    """Collect feedback entries from the feedback table, page by page.

    Each entry is a dict with ``rating`` (int), ``text`` (inner HTML),
    ``deleted`` (0 or 1) and ``date`` (``str(date)``, or "" when no date
    was found or it lies in the future).

    Returns the list of entries, or ``[]`` as soon as reading a page raises
    a ``WebDriverException``.
    """
    next_page = True
    feedback_list = []
    while next_page:
        try:
            try:
                table_id = self.driver.find_element_by_xpath(
                    "//*[@id='feedback-table']")
                # get all of the rows in the table
                rows = table_id.find_elements_by_xpath(
                    ".//tr[@class='feedback-row']")
                for row in rows:
                    feedback = {}
                    rating_string = row.find_element_by_xpath(
                        ".//th/div/i/span").get_attribute("innerHTML")
                    # The rating is the leading integer of the span's text.
                    feedback['rating'] = int(rating_string.split(' ')[0])
                    col_2 = row.find_element(By.TAG_NAME, "td")
                    feedback['text'] = col_2.find_element_by_xpath(
                        './/*[@id="-text" or @id="-expanded"]'
                    ).get_attribute("innerHTML")
                    # An entry counts as deleted when the "feedback-suppressed"
                    # section exists and is displayed.
                    try:
                        div = col_2.find_element_by_xpath(
                            ".//*[@class='a-section a-spacing-top-small feedback-suppressed']"
                        )
                        value = div.value_of_css_property("display")
                        if value == 'none':
                            feedback['deleted'] = 0
                        else:
                            feedback['deleted'] = 1
                    except selenium.common.exceptions.NoSuchElementException:
                        feedback['deleted'] = 0
                    s = col_2.find_element_by_xpath(
                        './/div/div[2]/span').text
                    # Subscripting a None search result raises TypeError,
                    # which is how "no date found" is detected here.
                    try:
                        date = search_dates(s, settings={'TIMEZONE':
                                                         'UTC'})[0][1].date()
                    except TypeError:
                        date = None
                    if date is None or date > datetime.today().date():
                        feedback['date'] = ""
                    else:
                        feedback['date'] = str(date)
                    feedback_list.append(feedback)
            except selenium.common.exceptions.WebDriverException:
                # Everything collected so far is discarded on this path.
                current_app.logger.error(
                    str(datetime.now()) + "Selenium did not find Element(s)")
                return []
            self.driver.find_element_by_xpath(
                "//*[@id='feedback-next-link']").click()
            current_app.logger.info(
                str(datetime.now()) + " Clicking NEXT PAGE")
            # Fixed pause before reading the next page.
            time.sleep(10)
        # NOTE(review): presumably raised by the click above once the
        # "next" link is hidden on the last page — confirm against the
        # selenium exception hierarchy, since the inner handler catches
        # WebDriverException first for everything inside the inner try.
        except selenium.common.exceptions.ElementNotVisibleException:
            next_page = False
    return feedback_list
def test_search_dates_returning_detected_languages_if_requested(
    self, text, add_detected_language, expected
):
    """``search_dates`` output must equal ``expected`` for the given flag."""
    self.assertEqual(
        search_dates(text, add_detected_language=add_detected_language),
        expected,
    )
def run_search_dates_function_invalid_languages(self, text, languages, error_type):
    """Call ``search_dates`` with invalid ``languages`` and check the error.

    The exception raised by ``search_dates`` is stored on ``self.error`` and
    must be an instance of ``error_type``.

    :param text: text handed to ``search_dates``.
    :param languages: the (invalid) ``languages`` argument under test.
    :param error_type: exception class (or tuple of classes) expected.
    """
    try:
        search_dates(text=text, languages=languages)
    except Exception as error:
        self.error = error
        self.assertIsInstance(self.error, error_type)
    else:
        # Without this branch the helper silently passed when nothing was
        # raised, so a regression that accepts invalid languages went unseen.
        self.fail(
            "search_dates did not raise; expected {!r}".format(error_type)
        )
def test_date_search_function(self, text, languages, settings, expected):
    """``search_dates`` with the given languages/settings must equal ``expected``."""
    self.assertEqual(
        search_dates(text, languages=languages, settings=settings),
        expected,
    )