def __alter_repeat_for_dont_think_SV(fixed_df):
    """Paraphrase a "don't think S V ..." sentence with flipped polarity.

    Locates the clause after "think" and rebuilds it:
    a negative clause (contains "not"/"never") is restated with " would "
    (double negative cancels out), a positive clause with " wouldnt ".
    A random hedge ("so ", "probably ", ...) is prepended.

    :param fixed_df: token DataFrame with at least "base_form", "word" columns
    :return: one-element list with the paraphrased sentence, or [] on failure
    """
    try:
        # TODO see if its neccesary to care about should and cant
        idx_of_think = Nlp_util.get_idx_list_of_word("think", fixed_df["base_form"])[0]
        df_after_think = fixed_df.loc[idx_of_think + 1:, :].reset_index(drop=True)
        verb_list = Nlp_util.make_verb_list(df_after_think, type="normal")
        noun_list = Nlp_util.make_noun_list(df_after_think)
        # possibly bug happen here since amount of verbs are different in cant do/dont do
        is_negative_form = Df_util.anything_isin(["not", "never"], df_after_think.loc[:, "base_form"])
        # can add possibly or likely(when its negative)
        head_words = ["so ", "so probably ", "probably ", "so maybe ", "maybe "]
        random_idx_for_heads_words = randint(0, len(head_words) - 1)
        if is_negative_form:
            # Take the subject, insert the auxiliary, then the base form of the
            # first verb after the negation word, then keep the tail as-is.
            head_word = head_words[random_idx_for_heads_words]
            subj = noun_list["word"].iloc[0]
            auxiliary_verb = " would "
            idx_of_not = Nlp_util.get_idx_list_of_word_list(["not", "never"], df_after_think.loc[:, "base_form"])[0]
            verb_row = verb_list.loc[idx_of_not:, :].iloc[0]
            verb = verb_row.base_form + " "
            after_verb = WordFormatter.Series2Str(df_after_think.loc[verb_row.name + 1:, "word"])
            return [head_word + subj + auxiliary_verb + verb + after_verb]
        else:
            head_word = head_words[random_idx_for_heads_words]
            subj = noun_list["word"].iloc[0]
            auxiliary_verb = " wouldnt "
            verb = verb_list["base_form"].iloc[0] + " "
            after_verb = WordFormatter.Series2Str(df_after_think.loc[verb_list.index[0] + 1:, "word"])
            return [head_word + subj + auxiliary_verb + verb + after_verb]
    except Exception:
        # Was a bare `except:` — narrowed so SystemExit/KeyboardInterrupt are
        # not swallowed. Paraphrasing is best-effort: log and fall back to [].
        logging.exception('')
        return []
def __alter_repeat_for_need_sent(fixed_df):
    """Build a two-line empathetic reply about the thing the user needs.

    Picks the first noun/pronoun after "need" (keeping a preceding
    adjective, possessive or determiner attached) and slots it into one of
    three randomly chosen reply templates.
    """
    idx_of_need = Nlp_util.get_idx_list_of_word("need", fixed_df["base_form"])[0]
    first_noun_row = Nlp_util.get_wordsDF_of_wordlist_after_idx(
        fixed_df, Nlp_util.pos_NOUNs + Nlp_util.pos_PRPs, idx_of_need,
        column_name="pos").iloc[0]
    pos_before_noun = fixed_df.loc[first_noun_row.name - 1, "pos"]
    if pos_before_noun in Nlp_util.pos_ADJECTIVEs + ["PRP$", "DT"]:
        # Keep the modifier together with the noun ("my job", "a break", ...).
        noun = WordFormatter.Series2Str(
            fixed_df.loc[first_noun_row.name - 1:first_noun_row.name, "word"])
    else:
        noun = fixed_df.loc[first_noun_row.name, "word"]
    noun_nominative = Nlp_util.convert_objective_noun_to_nominative(noun)
    options = [
        [
            "so " + noun_nominative + " is very important thing for you..",
            "and sounds its kinda hard to get it now right😢"
        ],
        [
            "so its like its not easy to get " + noun + " now but you really want..",
            "and it can frustrate you😞"
        ],
        [
            "sounds you really want " + noun + "..",
            "might be tough time for you to seek for it now😓"
        ],
    ]
    return options[randint(0, len(options) - 1)]
def __alter_repeat_for_wish(fixed_df):
    """Paraphrase a "wish" sentence as desire-restatement plus doubt (two lines).

    Finds the subject and verb after "wish", restates the desire with a
    template that depends on whether the subject is "you", then appends a
    randomly chosen sympathetic follow-up.
    """
    wish_idx = Nlp_util.get_idx_list_of_word("wish", fixed_df["base_form"])[0]
    subj_row = Nlp_util.get_wordsDF_of_wordlist_after_idx(
        fixed_df, Nlp_util.pos_NOUNs + Nlp_util.pos_PRPs, wish_idx,
        column_name="pos").iloc[0]
    verb_row = Nlp_util.get_wordsDF_of_wordlist_after_idx(
        fixed_df, Nlp_util.pos_VERBs, subj_row.name, column_name="pos").iloc[0]
    subj = subj_row.word
    verb = verb_row.word
    after_verb = WordFormatter.Series2Str(fixed_df.loc[verb_row.name + 1:, "word"])
    objective_subj = Nlp_util.convert_nominative_noun_to_objective(subj)
    if subj == "you":
        repeat_list = [
            ["you really want to " + verb + " " + after_verb],
            ["so you seriously hope to " + verb + " " + after_verb],
            ["so you are dying to " + verb + " " + after_verb],
        ]
    else:
        repeat_list = [
            ["you really want " + objective_subj + " to " + verb + " " + after_verb],
            ["you really wanna have " + objective_subj + " " + verb + " " + after_verb],
            ["you really wanna make " + objective_subj + " " + verb + " " + after_verb],
        ]
    cmp_list = [
        ["but sounds you feel bit too much to expect that now..?"],
        ["and sounds you feel like its impossible..?"],
        ["and seems like you dont think it never happen😓"],
    ]
    # Same draw order as before: repeat template first, then follow-up.
    chosen_repeat = repeat_list[randint(0, len(repeat_list) - 1)]
    chosen_cmp = cmp_list[randint(0, len(cmp_list) - 1)]
    return chosen_repeat + chosen_cmp
def get_idx_list_of_idiom(idiom, series):
    """Return every start index at which *idiom*'s tokens occur in *series*.

    The series is joined to a string, split into n-grams of the idiom's
    token length, and each n-gram is compared against the tokenized idiom.
    """
    tokens = nltk.word_tokenize(idiom)
    ngrams = Nlp_util.create_ngrams(WordFormatter.Series2Str(series), len(tokens))
    matches = []
    for start, ngram in enumerate(ngrams):
        if ngram == tokens:
            matches.append(start)
    return matches
def __get_sidx_of_normal_and_too_long_sent(cls, df):
    """Return sentence ids (sidx) of non-special sentences longer than 75 chars.

    Iterates every distinct sentence id in *df*; sentences the class marks
    as a special type are kept regardless of length, all others are flagged
    for deletion when their joined text exceeds 75 characters.

    :param df: token DataFrame with "sidx" and "word" columns
    :return: list of sidx values the caller should delete
    """
    delete_sidx_list = []
    for sidx in set(df.sidx.values):
        target_df = df[df.sidx == sidx].copy().reset_index(drop=True)
        # Flattened the original if/pass/else pyramid into guard clauses.
        if cls.__is_special_type(target_df):
            continue
        if len(WordFormatter.Series2Str(target_df.word)) > 75:
            delete_sidx_list.append(sidx)
    return delete_sidx_list
def anything_isin(word_list, series):
    """Return True if any word or phrase in *word_list* occurs in *series*.

    Single-token words are matched directly against the series values;
    multi-token phrases are matched against n-grams of the joined text.

    :param word_list: words/idioms to look for
    :param series: pandas Series of tokens
    """
    for word in word_list:
        tokenized_word = nltk.word_tokenize(word)
        if len(tokenized_word) == 1:
            # Bug fix: the original tested series.isin(word_list) — the whole
            # list, not the word being checked — so a hit on a *different*
            # entry (or a multi-token phrase stored as one series value) was
            # reported from this branch. Match only the current token.
            if any(series.isin(tokenized_word)):
                return True
        else:
            ngram_list_of_the_series = Nlp_util.create_ngrams(
                WordFormatter.Series2Str(series), len(tokenized_word))
            if tokenized_word in ngram_list_of_the_series:
                return True
    return False