def normalized_entity(normalized):
    money_ = re.findall(_money, normalized)
    money_ = [(s, money(s)[1]) for s in money_]
    dates_ = re.findall(_date, normalized)
    past_date_string_ = re.findall(_past_date_string, normalized)
    now_date_string_ = re.findall(_now_date_string, normalized)
    future_date_string_ = re.findall(_future_date_string, normalized)
    yesterday_date_string_ = re.findall(_yesterday_tomorrow_date_string, normalized)
    depan_date_string_ = re.findall(_depan_date_string, normalized)
    today_time_ = re.findall(_today_time, normalized)
    time_ = re.findall(_expressions['time'], normalized)
    left_datetime_ = [
        f'{i[0]} {i[1]}' for i in re.findall(_left_datetime, normalized)
    ]
    right_datetime_ = [
        f'{i[0]} {i[1]}' for i in re.findall(_right_datetime, normalized)
    ]
    today_left_datetime_ = [
        f'{i[0]} {i[1]}' for i in re.findall(_left_datetodaytime, normalized)
    ]
    today_right_datetime_ = [
        f'{i[0]} {i[1]}' for i in re.findall(_right_datetodaytime, normalized)
    ]
    left_yesterdaydatetime_ = [
        f'{i[0]} {i[1]}' for i in re.findall(_left_yesterdaydatetime, normalized)
    ]
    right_yesterdaydatetime_ = [
        f'{i[0]} {i[1]}' for i in re.findall(_right_yesterdaydatetime, normalized)
    ]
    left_yesterdaydatetodaytime_ = [
        f'{i[0]} {i[1]}'
        for i in re.findall(_left_yesterdaydatetodaytime, normalized)
    ]
    right_yesterdaydatetodaytime_ = [
        f'{i[0]} {i[1]}'
        for i in re.findall(_right_yesterdaydatetodaytime, normalized)
    ]
    dates_ = (
        dates_
        + past_date_string_
        + now_date_string_
        + future_date_string_
        + yesterday_date_string_
        + depan_date_string_
        + time_
        + today_time_
        + left_datetime_
        + right_datetime_
        + today_left_datetime_
        + today_right_datetime_
        + left_yesterdaydatetime_
        + right_yesterdaydatetime_
        + left_yesterdaydatetodaytime_
        + right_yesterdaydatetodaytime_
    )
    dates_ = [multireplace(s, date_replace) for s in dates_]
    dates_ = [re.sub(r'[ ]+', ' ', s).strip() for s in dates_]
    dates_ = cluster_words(dates_)
    dates_ = {s: dateparser.parse(s) for s in dates_}
    money_ = {s[0]: s[1] for s in money_}
    return dates_, money_
def _vectorize_sentence(
    self,
    corpus,
    isi_penting,
    important_words=10,
    batch_size=10,
    retry=5,
    **kwargs,
):
    corpus = corpus_checker(corpus)
    splitted_fullstop = [summary_textcleaning(i) for i in corpus]
    original_strings = [i[0] for i in splitted_fullstop]
    cleaned_strings = [i[1] for i in splitted_fullstop]
    vectors = self._batching(cleaned_strings, batch_size=batch_size)
    if isi_penting:
        vectors_isi_penting = self._batching([isi_penting], batch_size=batch_size)
    if 'DeepSkipThought' in str(self.vectorizer):
        top_words = []
    else:
        if hasattr(self.vectorizer, 'attention'):
            attentions = self.vectorizer.attention(corpus, **kwargs)
            flatten = list(itertools.chain(*attentions))
            r = {}
            for f in flatten:
                c = simple_textcleaning(f[0])
                if c in STOPWORDS:
                    continue
                if c not in r:
                    r[c] = f[1]
                else:
                    r[c] += f[1]
            top_words = sorted(r, key=r.get, reverse=True)[:important_words]
        else:
            top_words = []
    similar = cosine_similarity(vectors, vectors)
    if isi_penting:
        similar_isi_penting = cosine_similarity(vectors, vectors_isi_penting)
        similar = similar * similar_isi_penting
    else:
        similar[similar >= 0.99] = 0
    scores = pagerank(similar + 1e-6, retry)
    ranked_sentences = sorted(
        ((scores[i], s, i) for i, s in enumerate(original_strings)),
        reverse=True,
    )
    return (
        original_strings,
        ranked_sentences,
        top_words,
        cluster_words(top_words),
    )
def _vectorize_sentence(
    self, corpus, isi_penting, important_words=10, retry=5, **kwargs
):
    corpus = corpus_checker(corpus)
    splitted_fullstop = [summary_textcleaning(i) for i in corpus]
    original_strings = [i[0] for i in splitted_fullstop]
    cleaned_strings = [i[1] for i in splitted_fullstop]
    if isi_penting:
        isi_penting = [summary_textcleaning(isi_penting)[1]]
        t = cleaned_strings + isi_penting
    else:
        t = cleaned_strings
    self.vectorizer.fit(t)
    freq = self.vectorizer.transform(cleaned_strings)
    if isi_penting:
        freq_isi_penting = self.vectorizer.transform(isi_penting)
    if important_words > 0:
        if hasattr(self.vectorizer, 'idf_'):
            indices = np.argsort(self.vectorizer.idf_)[::-1]
        else:
            indices = np.argsort(np.asarray(freq.sum(axis=0))[0])[::-1]
        features = self.vectorizer.get_feature_names()
        top_words = [features[i] for i in indices[:important_words]]
    else:
        top_words = []
    if isi_penting:
        t = vstack([freq, freq_isi_penting])
    else:
        t = freq
    self.model.fit(t)
    vectors = self.model.transform(freq)
    if isi_penting:
        vectors_isi_penting = self.model.transform(freq_isi_penting)
    similar = cosine_similarity(vectors, vectors)
    if isi_penting:
        similar_isi_penting = cosine_similarity(vectors, vectors_isi_penting)
        similar = similar * similar_isi_penting
    else:
        similar[similar >= 0.99] = 0
    scores = pagerank(similar + 1e-6, retry)
    ranked_sentences = sorted(
        ((scores[i], s, i) for i, s in enumerate(original_strings)),
        reverse=True,
    )
    return (
        original_strings,
        ranked_sentences,
        top_words,
        cluster_words(top_words),
    )
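# A minimal, self-contained sketch of the ranking core shared by both
# _vectorize_sentence variants above: embed sentences, build a cosine
# similarity graph, then run PageRank and sort. toy_pagerank and the random
# vectors below are illustrative stand-ins for the library's pagerank helper
# and its vectorizers, not the real implementations.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def toy_pagerank(similarity, damping=0.85, iterations=100):
    # normalize rows so the similarity matrix behaves like a transition matrix
    norm = similarity.sum(axis=1, keepdims=True)
    transition = similarity / np.where(norm == 0, 1, norm)
    n = len(similarity)
    scores = np.full(n, 1 / n)
    for _ in range(iterations):
        scores = (1 - damping) / n + damping * transition.T @ scores
    return scores


sentence_vectors = np.random.rand(4, 16)  # stand-in for self.model.transform(freq)
similar = cosine_similarity(sentence_vectors, sentence_vectors)
similar[similar >= 0.99] = 0  # drop self-similarity, as in the code above
print(np.argsort(toy_pagerank(similar + 1e-6))[::-1])  # best sentence indices first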
def _vectorize_word(
    self,
    corpus,
    isi_penting,
    window_size=10,
    important_words=10,
    batch_size=10,
    **kwargs,
):
    corpus = corpus_checker(corpus)
    splitted_fullstop = [summary_textcleaning(i) for i in corpus]
    original_strings = [i[0] for i in splitted_fullstop]
    cleaned_strings = [i[1] for i in splitted_fullstop]
    ngram_list, splitted = create_ngram(' '.join(cleaned_strings), ngram=window_size)
    splitted = ' '.join(original_strings).split()
    if isi_penting:
        isi_penting = [isi_penting]
    else:
        isi_penting = original_strings
    vectors = self._batching(ngram_list, batch_size=batch_size)
    vectors_isi_penting = self._batching(isi_penting, batch_size=batch_size)
    if 'DeepSkipThought' in str(self.vectorizer):
        top_words = []
    else:
        if hasattr(self.vectorizer, 'attention') and important_words > 0:
            attentions = self.vectorizer.attention(corpus, **kwargs)
            flatten = list(itertools.chain(*attentions))
            r = {}
            for f in flatten:
                c = simple_textcleaning(f[0])
                if c in STOPWORDS:
                    continue
                if c not in r:
                    r[c] = f[1]
                else:
                    r[c] += f[1]
            top_words = sorted(r, key=r.get, reverse=True)[:important_words]
        else:
            top_words = []
    vectors_isi_penting = np.mean(vectors_isi_penting, axis=0)
    vectors_isi_penting = np.expand_dims(vectors_isi_penting, axis=0)
    similar_isi_penting = cosine_similarity(vectors, vectors_isi_penting)
    scores = similar_isi_penting[:, 0]
    ranked_sentences = sorted(
        ((scores[i], s, i) for i, s in enumerate(splitted)), reverse=True
    )
    return (splitted, ranked_sentences, top_words, cluster_words(top_words))
def _vectorize_word(
    self, corpus, isi_penting, window_size, important_words=10, **kwargs
):
    corpus = corpus_checker(corpus)
    splitted_fullstop = [summary_textcleaning(i) for i in corpus]
    original_strings = [i[0] for i in splitted_fullstop]
    cleaned_strings = [i[1] for i in splitted_fullstop]
    ngram_list, splitted = create_ngram(' '.join(cleaned_strings), ngram=window_size)
    splitted = ' '.join(original_strings).split()
    if isi_penting:
        isi_penting = [summary_textcleaning(isi_penting)[1]]
    else:
        isi_penting = [' '.join(cleaned_strings)]
    t = ngram_list + isi_penting
    self.vectorizer.fit(t)
    freq = self.vectorizer.transform(ngram_list)
    freq_isi_penting = self.vectorizer.transform(isi_penting)
    if important_words > 0:
        if hasattr(self.vectorizer, 'idf_'):
            indices = np.argsort(self.vectorizer.idf_)[::-1]
        else:
            indices = np.argsort(np.asarray(freq.sum(axis=0))[0])[::-1]
        features = self.vectorizer.get_feature_names()
        top_words = [features[i] for i in indices[:important_words]]
    else:
        top_words = []
    t = vstack([freq, freq_isi_penting])
    self.model.fit(t)
    vectors = self.model.transform(freq)
    vectors_isi_penting = self.model.transform(freq_isi_penting)
    similar_isi_penting = cosine_similarity(vectors, vectors_isi_penting)
    scores = similar_isi_penting[:, 0]
    ranked_sentences = sorted(
        ((scores[i], s, i) for i, s in enumerate(splitted)), reverse=True
    )
    return (splitted, ranked_sentences, top_words, cluster_words(top_words))
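# Hedged sketch of the word-ranking idea behind both _vectorize_word variants:
# every sliding window of words is embedded, and each position is scored by
# its window's cosine similarity against the isi_penting ("important content")
# vector. The manual windowing below is an illustrative stand-in for
# create_ngram; the corpus and theme are made up.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

words = 'husein suka makan ayam dan daging'.split()
windows = [' '.join(words[i:i + 3]) for i in range(len(words) - 2)]
vectorizer = TfidfVectorizer().fit(windows + ['makan daging'])
freq = vectorizer.transform(windows)
freq_isi_penting = vectorizer.transform(['makan daging'])
scores = cosine_similarity(freq, freq_isi_penting)[:, 0]
print(sorted(zip(scores, windows), reverse=True))  # windows closest to the theme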
def parse_from_dependency(
    models,
    string: str,
    references: List[str] = [
        'dia', 'itu', 'ini', 'saya', 'awak', 'kamu', 'kita', 'kami', 'mereka',
    ],
    rejected_references: List[str] = [
        'saya', 'awak', 'kamu', 'kita', 'kami', 'mereka', 'nya',
    ],
    acceptable_subjects: List[str] = ['flat', 'subj', 'nsubj', 'csubj', 'obj'],
    acceptable_nested_subjects: List[str] = ['compound', 'flat'],
    split_nya: bool = True,
    aggregate: Callable = np.mean,
    top_k: int = 20,
):
    """
    Apply Coreference Resolution using stacks of dependency models.

    Parameters
    ----------
    models: list
        list of dependency models, each must have a `vectorize` method.
    string: str
    references: List[str], optional (default=['dia', 'itu', 'ini', 'saya', 'awak', 'kamu', 'kita', 'kami', 'mereka'])
        list of references.
    rejected_references: List[str], optional (default=['saya', 'awak', 'kamu', 'kita', 'kami', 'mereka', 'nya'])
        list of rejected references during populating subjects.
    acceptable_subjects: List[str], optional
        list of dependency labels for subjects.
    acceptable_nested_subjects: List[str], optional
        list of dependency labels for nested subjects, eg, syarikat (obl) facebook (compound).
    split_nya: bool, optional (default=True)
        split `nya`, eg, `disifatkannya` -> `disifatkan`, `nya`.
    aggregate: Callable, optional (default=numpy.mean)
        aggregate function to aggregate list of vectors from `model.vectorize`.
    top_k: int, optional (default=20)
        only accept near top_k to assume a coherence.

    Returns
    -------
    result: Dict[text, coref]
        {'text': ['Husein', 'Zolkepli', 'suka', 'makan', 'ayam', '.', 'Dia', 'pun', 'suka', 'makan', 'daging', '.'],
         'coref': {6: {'index': [0, 1], 'text': ['Husein', 'Zolkepli']}}}
    """
    if not isinstance(models, list):
        raise ValueError('models must be a list')
    for m in range(len(models)):
        if type(models[m]) not in [DependencyBERT, DependencyXLNET]:
            raise ValueError(
                'model must be one of [malaya.model.bert.DependencyBERT, malaya.model.xlnet.DependencyXLNET]'
            )
    if split_nya:
        string = _split_nya(string)
        references = references + ['nya']
    tagging, indexing = voting_stack(models, string)
    result = []
    for i in range(len(tagging)):
        result.append(
            '%d\t%s\t_\t_\t_\t_\t%d\t%s\t_\t_'
            % (i + 1, tagging[i][0], int(indexing[i][1]), tagging[i][1])
        )
    d_object = DependencyGraph('\n'.join(result), top_relation_label='root')
    rs = []
    for i in range(len(indexing)):
        for s in acceptable_subjects:
            if d_object.nodes[i]['rel'] == s:
                r = []
                for n_s in acceptable_nested_subjects:
                    s_ = d_object.traverse_children(i, [n_s], initial_label=[s])
                    s_ = _combined(s_)
                    r.extend(s_)
                r = [
                    i for i in r
                    if i.lower() not in references
                    and i.lower() not in rejected_references
                ]
                rs.extend(r)
    rs = cluster_words(rs, lowercase=True)
    vs, X = [], None
    for m in range(len(models)):
        v = models[m].vectorize(string)
        X = [i[0] for i in v]
        y = [i[1] for i in v]
        vs.append(y)
    V = aggregate(vs, axis=0)
    indices, word_indices = {}, []
    for no, row in enumerate(rs):
        ind = []
        for word in row.split():
            indices[word] = indices.get(word, no)
            ind.append(X.index(word))
        word_indices.append(ind)
    index_word = []
    for key in indices:
        index_word.append(X.index(key))
    index_references = []
    for i in range(len(X)):
        if X[i].lower() in references:
            index_references.append(i)
    similarities = cosine_similarity(V)
    results = {}
    for r in index_references:
        r_ = [r, r - 1]
        i_ = -1
        # subject verb object . subject, we want to reject words before punct
        while X[r + i_] in PUNCTUATION:
            i_ -= 1
            r_.append(r + i_)
        index_word_ = [i for i in index_word if i < r]
        sorted_indices = similarities[r].argsort()[-top_k:][::-1]
        sorted_indices = sorted_indices[
            np.isin(sorted_indices, index_word_) & ~np.isin(sorted_indices, r_)
        ]
        if len(sorted_indices):
            s = rs[indices[X[sorted_indices[0]]]]
            index = word_indices[indices[X[sorted_indices[0]]]]
            results[r] = {'index': index, 'text': s.split()}
    return {'text': X, 'coref': results}
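# Hypothetical usage, assuming the Malaya loaders for dependency parsing;
# model names and loader signatures may differ between releases, so this
# stays commented:
# import malaya
# model = malaya.dependency.transformer(model='bert')
# model_xlnet = malaya.dependency.transformer(model='xlnet')
# parse_from_dependency(
#     [model, model_xlnet],
#     'Husein Zolkepli suka makan ayam. Dia pun suka makan daging.',
# )
# expected result shape, per the docstring:
# {'text': [...], 'coref': {6: {'index': [0, 1], 'text': ['Husein', 'Zolkepli']}}}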
def summarize(self, corpus, top_k: int = 3, important_words: int = 3, **kwargs):
    """
    Summarize list of strings / corpus.

    Parameters
    ----------
    corpus: str, list
    top_k: int, (default=3)
        number of summarized strings.
    important_words: int, (default=3)
        number of important words.

    Returns
    -------
    result: dict
        {'summary', 'top-words', 'cluster-top-words'}
    """
    if not isinstance(corpus, list) and not isinstance(corpus, str):
        raise ValueError('corpus must be a string or a list of strings')
    if isinstance(corpus, list):
        if not isinstance(corpus[0], str):
            raise ValueError('corpus must be list of strings')
    if isinstance(corpus, str):
        corpus = split_into_sentences(corpus)
    else:
        corpus = '. '.join(corpus)
        corpus = split_into_sentences(corpus)
    splitted_fullstop = [summary_textcleaning(i) for i in corpus]
    original_strings = [i[0] for i in splitted_fullstop]
    cleaned_strings = [i[1] for i in splitted_fullstop]
    if 'DEEP_SKIPTHOUGHT' in str(self._vectorizer):
        sequences = skip_thought.batch_sequence(
            cleaned_strings,
            self._vectorizer.dictionary,
            maxlen=self._vectorizer._maxlen,
        )
        vectors, attention = self._vectorizer._sess.run(
            [self._vectorizer._logits, self._vectorizer._attention],
            feed_dict={self._vectorizer._X: np.array(sequences)},
        )
        attention = attention.sum(axis=0)
        indices = np.argsort(attention)[::-1]
        top_words = [
            self._vectorizer._rev_dictionary[i]
            for i in indices
            if self._vectorizer._rev_dictionary[i] not in STOPWORDS
        ][:important_words]
    else:
        vectors = self._vectorizer.vectorize(corpus)
        attentions = self._vectorizer.attention(corpus, **kwargs)
        flatten = list(itertools.chain(*attentions))
        r = {}
        for f in flatten:
            c = simple_textcleaning(f[0])
            if c in STOPWORDS:
                continue
            if c not in r:
                r[c] = f[1]
            else:
                r[c] += f[1]
        top_words = sorted(r, key=r.get, reverse=True)[:important_words]
    similar = cosine_similarity(vectors, vectors)
    similar[similar >= 0.99999] = 0
    scores = pagerank(similar)
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(original_strings)),
        reverse=True,
    )
    summary = [r[1] for r in ranked_sentences[:top_k]]
    return {
        'summary': ' '.join(summary),
        'top-words': top_words,
        'cluster-top-words': cluster_words(top_words),
    }
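# Toy illustration of the attention aggregation used in summarize to pick
# `top_words`: token-level attention weights are summed per cleaned token,
# stopwords are dropped, and the highest totals win. The attentions, the
# stopword set and the `.lower()` cleaning below are made-up stand-ins for
# the model output, STOPWORDS and simple_textcleaning.
import itertools

attentions = [
    [('Husein', 0.4), ('suka', 0.1), ('makan', 0.3), ('ayam', 0.2)],
    [('Dia', 0.2), ('suka', 0.2), ('makan', 0.4), ('daging', 0.2)],
]
stopwords = {'dia'}
r = {}
for token, weight in itertools.chain(*attentions):
    c = token.lower()
    if c in stopwords:
        continue
    r[c] = r.get(c, 0) + weight
print(sorted(r, key=r.get, reverse=True)[:3])  # ['makan', 'husein', 'suka']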
def _base_summarizer(
    corpus,
    decomposition,
    top_k: int = 3,
    max_df: float = 0.95,
    min_df: int = 2,
    ngram: Tuple[int, int] = (1, 3),
    vectorizer: str = 'bow',
    important_words: int = 10,
    retry: int = 5,
    **kwargs,
):
    vectorizer = vectorizer.lower()
    if vectorizer not in ['tfidf', 'bow', 'skip-gram']:
        raise ValueError("vectorizer must be in ['tfidf', 'bow', 'skip-gram']")
    if min_df < 1:
        raise ValueError('min_df must be bigger than 0')
    if not (0 < max_df <= 1):
        raise ValueError('max_df must be bigger than 0, less than or equal to 1')
    if not isinstance(corpus, list) and not isinstance(corpus, str):
        raise ValueError('corpus must be a string or a list of strings')
    if isinstance(corpus, list):
        if not isinstance(corpus[0], str):
            raise ValueError('corpus must be list of strings')
    if isinstance(corpus, str):
        corpus = split_into_sentences(corpus)
    else:
        corpus = '. '.join(corpus)
        corpus = split_into_sentences(corpus)
    splitted_fullstop = [summary_textcleaning(i) for i in corpus]
    original_strings = [i[0] for i in splitted_fullstop]
    cleaned_strings = [i[1] for i in splitted_fullstop]
    stemmed = [sastrawi(i) for i in cleaned_strings]
    if vectorizer == 'tfidf':
        Vectorizer = TfidfVectorizer
    elif vectorizer == 'bow':
        Vectorizer = CountVectorizer
    elif vectorizer == 'skip-gram':
        Vectorizer = SkipGramVectorizer
    tf_vectorizer = Vectorizer(
        max_df=max_df,
        min_df=min_df,
        ngram_range=ngram,
        stop_words=STOPWORDS,
        **kwargs,
    )
    tf = tf_vectorizer.fit_transform(stemmed)
    if hasattr(tf_vectorizer, 'idf_'):
        indices = np.argsort(tf_vectorizer.idf_)[::-1]
    else:
        indices = np.argsort(np.asarray(tf.sum(axis=0))[0])[::-1]
    features = tf_vectorizer.get_feature_names()
    top_words = [features[i] for i in indices[:important_words]]
    vectors = decomposition(tf.shape[1] // 2).fit_transform(tf)
    similar = cosine_similarity(vectors, vectors)
    similar[similar >= 0.999] = 0
    scores = pagerank(similar, retry)
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(original_strings)), reverse=True
    )
    summary = [r[1] for r in ranked_sentences[:top_k]]
    return {
        'summary': ' '.join(summary),
        'top-words': top_words,
        'cluster-top-words': cluster_words(top_words),
    }
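# Hedged usage sketch for _base_summarizer: `decomposition` is invoked above
# as decomposition(tf.shape[1] // 2), so any sklearn decomposer class that
# takes n_components as its first positional argument fits, e.g. TruncatedSVD
# or NMF. A runnable miniature of that projection step (corpus is made up):
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

docs = [
    'saya suka makan ayam',
    'dia suka makan daging',
    'kami makan nasi dan ayam goreng',
]
tf = TfidfVectorizer().fit_transform(docs)
vectors = TruncatedSVD(n_components=2).fit_transform(tf)  # real code: tf.shape[1] // 2
print(vectors.shape)  # (3, 2); sentences now live in a dense semantic space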
def normalize(self, string: str, check_english: bool = True):
    """
    Normalize a string.

    Parameters
    ----------
    string : str
    check_english: bool, (default=True)
        check a word in english dictionary.

    Returns
    -------
    result: dict
        {'normalize': normalized string, 'date': dates found, 'money': money found}
    """
    result, normalized = [], []
    tokenized = _tokenizer(string)
    index = 0
    while index < len(tokenized):
        word = tokenized[index]
        if word in '~@#$%^&*()_+{}|[:"\'];<>,.?/-':
            result.append(word)
            index += 1
            continue
        normalized.append(rules_normalizer.get(word.lower(), word.lower()))
        if word.lower() in ignore_words:
            result.append(word)
            index += 1
            continue
        if word[0].isupper():
            if word.upper() not in ['KE', 'PADA', 'RM', 'SEN', 'HINGGA']:
                result.append(_normalize_title(word))
                index += 1
                continue
        if check_english:
            if word.lower() in ENGLISH_WORDS:
                result.append(word)
                index += 1
                continue
        if word.lower() in MALAY_WORDS and word.lower() not in ['pada', 'ke']:
            result.append(word)
            index += 1
            continue
        if len(word) > 2:
            if word[-2] in consonants and word[-1] == 'e':
                word = word[:-1] + 'a'
        if word[0] == 'x' and len(word) > 1:
            result_string = 'tak '
            word = word[1:]
        else:
            result_string = ''
        if word.lower() == 'ke' and index < (len(tokenized) - 2):
            if tokenized[index + 1] == '-' and _is_number_regex(tokenized[index + 2]):
                result.append(
                    ordinal(word + tokenized[index + 1] + tokenized[index + 2])
                )
                index += 3
                continue
            elif tokenized[index + 1] == '-' and re.match(
                '.*(V|X|I|L|D)', tokenized[index + 2]
            ):
                result.append(
                    ordinal(
                        word
                        + tokenized[index + 1]
                        + str(rom_to_int(tokenized[index + 2]))
                    )
                )
                index += 3
                continue
            else:
                result.append('ke')
                index += 1
                continue
        if _is_number_regex(word) and index < (len(tokenized) - 2):
            if tokenized[index + 1] == '-' and _is_number_regex(tokenized[index + 2]):
                result.append(
                    to_cardinal(_string_to_num(word))
                    + ' hingga '
                    + to_cardinal(_string_to_num(tokenized[index + 2]))
                )
                index += 3
                continue
        if word.lower() == 'pada' and index < (len(tokenized) - 3):
            if (
                _is_number_regex(tokenized[index + 1])
                and tokenized[index + 2] in '/-'
                and _is_number_regex(tokenized[index + 3])
            ):
                result.append(
                    'pada %s hari bulan %s'
                    % (
                        to_cardinal(_string_to_num(tokenized[index + 1])),
                        to_cardinal(_string_to_num(tokenized[index + 3])),
                    )
                )
                index += 4
                continue
            else:
                result.append('pada')
                index += 1
                continue
        if _is_number_regex(word) and index < (len(tokenized) - 2):
            if tokenized[index + 1] == '/' and _is_number_regex(tokenized[index + 2]):
                result.append(
                    fraction(word + tokenized[index + 1] + tokenized[index + 2])
                )
                index += 3
                continue
        if re.findall(_money, word.lower()):
            money_, _ = money(word)
            result.append(money_)
            index += 1
            continue
        if re.findall(_date, word.lower()):
            word = word.lower()
            word = multireplace(word, date_replace)
            word = re.sub(r'[ ]+', ' ', word).strip()
            parsed = dateparser.parse(word)
            if parsed:
                result.append(parsed.strftime('%d/%m/%Y'))
            else:
                result.append(word)
            index += 1
            continue
        if re.findall(_expressions['time'], word.lower()):
            word = word.lower()
            word = multireplace(word, date_replace)
            word = re.sub(r'[ ]+', ' ', word).strip()
            parsed = dateparser.parse(word)
            if parsed:
                result.append(parsed.strftime('%H:%M:%S'))
            else:
                result.append(word)
            index += 1
            continue
        cardinal_ = cardinal(word)
        if cardinal_ != word:
            result.append(cardinal_)
            index += 1
            continue
        normalized_ke = ordinal(word)
        if normalized_ke != word:
            result.append(normalized_ke)
            index += 1
            continue
        word, end_result_string = _remove_postfix(word)
        if word in sounds:
            result.append(result_string + sounds[word] + end_result_string)
            index += 1
            continue
        if word in rules_normalizer:
            result.append(result_string + rules_normalizer[word] + end_result_string)
            index += 1
            continue
        selected = self._speller.correct(
            word, string=' '.join(tokenized), index=index
        )
        result.append(result_string + selected + end_result_string)
        index += 1
    result = ' '.join(result)
    normalized = ' '.join(normalized)
    dates_, money_ = normalized_entity(normalized)
    return {'normalize': result, 'date': dates_, 'money': money_}
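# Hypothetical usage of normalize, assuming the surrounding Malaya API; the
# speller constructor and exact output values may differ between releases, so
# this is kept as a commented sketch:
# import malaya
# corrector = malaya.spell.probability()
# normalizer = malaya.normalize.normalizer(corrector)
# normalizer.normalize('tarikh akhir pada 22/2/2020, harga dia RM 10k')
# -> {'normalize': '...', 'date': {'22/2/2020': datetime(...)}, 'money': {...}}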
def normalized_entity(normalized):
    money_ = re.findall(_expressions['money'], normalized)
    money_ = [(s, money(s)[1]) for s in money_]
    dates_ = re.findall(_expressions['date'], normalized)
    past_date_string_ = re.findall(_past_date_string, normalized)
    logger.debug(f'past_date_string_: {past_date_string_}')
    now_date_string_ = re.findall(_now_date_string, normalized)
    logger.debug(f'now_date_string_: {now_date_string_}')
    future_date_string_ = re.findall(_future_date_string, normalized)
    logger.debug(f'future_date_string_: {future_date_string_}')
    yesterday_date_string_ = re.findall(
        _yesterday_tomorrow_date_string, normalized
    )
    logger.debug(f'yesterday_date_string_: {yesterday_date_string_}')
    depan_date_string_ = re.findall(_depan_date_string, normalized)
    logger.debug(f'depan_date_string_: {depan_date_string_}')
    today_time_ = re.findall(_today_time, normalized)
    logger.debug(f'today_time_: {today_time_}')
    time_ = re.findall(_expressions['time'], normalized)
    logger.debug(f'time_: {time_}')
    left_datetime_ = [
        f'{i[0]} {i[1]}' for i in re.findall(_left_datetime, normalized)
    ]
    logger.debug(f'left_datetime_: {left_datetime_}')
    right_datetime_ = [
        f'{i[0]} {i[1]}' for i in re.findall(_right_datetime, normalized)
    ]
    logger.debug(f'right_datetime_: {right_datetime_}')
    today_left_datetime_ = [
        f'{i[0]} {i[1]}' for i in re.findall(_left_datetodaytime, normalized)
    ]
    logger.debug(f'today_left_datetime_: {today_left_datetime_}')
    today_right_datetime_ = [
        f'{i[0]} {i[1]}' for i in re.findall(_right_datetodaytime, normalized)
    ]
    logger.debug(f'today_right_datetime_: {today_right_datetime_}')
    left_yesterdaydatetime_ = [
        f'{i[0]} {i[1]}'
        for i in re.findall(_left_yesterdaydatetime, normalized)
    ]
    logger.debug(f'left_yesterdaydatetime_: {left_yesterdaydatetime_}')
    right_yesterdaydatetime_ = [
        f'{i[0]} {i[1]}'
        for i in re.findall(_right_yesterdaydatetime, normalized)
    ]
    logger.debug(f'right_yesterdaydatetime_: {right_yesterdaydatetime_}')
    left_yesterdaydatetodaytime_ = [
        f'{i[0]} {i[1]}'
        for i in re.findall(_left_yesterdaydatetodaytime, normalized)
    ]
    logger.debug(f'left_yesterdaydatetodaytime_: {left_yesterdaydatetodaytime_}')
    right_yesterdaydatetodaytime_ = [
        f'{i[0]} {i[1]}'
        for i in re.findall(_right_yesterdaydatetodaytime, normalized)
    ]
    logger.debug(f'right_yesterdaydatetodaytime_: {right_yesterdaydatetodaytime_}')
    dates_ = (
        dates_
        + past_date_string_
        + now_date_string_
        + future_date_string_
        + yesterday_date_string_
        + depan_date_string_
        + time_
        + today_time_
        + left_datetime_
        + right_datetime_
        + today_left_datetime_
        + today_right_datetime_
        + left_yesterdaydatetime_
        + right_yesterdaydatetime_
        + left_yesterdaydatetodaytime_
        + right_yesterdaydatetodaytime_
    )
    dates_ = [d.replace('.', ':') for d in dates_ if not isinstance(d, tuple)]
    dates_ = [multireplace(s, date_replace) for s in dates_]
    dates_ = [re.sub(r'[ ]+', ' ', s).strip() for s in dates_]
    dates_ = cluster_words(dates_)
    dates_ = {s: dateparser.parse(s) for s in dates_}
    money_ = {s[0]: s[1] for s in money_}
    return dates_, money_
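# Toy illustration of the tail of normalized_entity: candidate date strings
# are cleaned, deduplicated (via cluster_words in the real code), then
# resolved with dateparser; unparseable strings simply map to None. The
# inputs below are made up.
import re
import dateparser

dates_ = ['22/02/2020', '22/02/2020  10 pm', 'not a date']
dates_ = [re.sub(r'[ ]+', ' ', s).strip() for s in dates_]
dates_ = {s: dateparser.parse(s) for s in dates_}
print(dates_)  # {'22/02/2020': datetime(2020, 2, 22, 0, 0), ..., 'not a date': None}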