def cut_sent_by_interjection(cls, df):
    """Split sentences at separator words by incrementing ``sidx``.

    Scans ``df`` for rows whose ``word`` is one of the separator tokens
    (a comma or one of a fixed list of connective words) and whose ``widx``
    is not 0. For the first such row accepted by
    ``cls.__exists_SV_around_cc``, every row at or after it
    (``index >= idx``) with ``sidx >= separator.sidx`` gets its ``sidx``
    incremented by one, ``Nlp_util.reset_widx`` is applied, and the scan
    restarts on the updated frame. The loop ends as soon as a full scan
    produces no split.

    Args:
        cls: Object that provides ``__exists_SV_around_cc``.
        df: Frame with ``word``, ``pos``, ``widx`` and ``sidx`` columns.

    Returns:
        The frame after all splits. If an ``Exception`` is raised it is
        logged and the frame as it stood at that moment is returned.

    Note:
        The ``sidx`` increment is written through ``.loc`` on the frame
        that was passed in, so the caller's object can be modified in
        place before ``Nlp_util.reset_widx`` returns its result.
    """
    separators = [",", "and", "but", "or", "then", "so", "plus", "cause", "because"]
    try:
        while True:
            # Re-evaluated on every pass because df changes after each split.
            candidates = df[(df.word.isin(separators)) & (df.widx != 0)]
            split_done = False
            for idx, separator in candidates.iterrows():
                # 'so' tagged RB is never used as a cut point
                # (presumably the adverbial use -- TODO confirm tag set).
                if separator.word == 'so' and separator.pos == 'RB':
                    continue
                # 'cause' is only considered when tagged VB.
                if separator.word == 'cause' and separator.pos != 'VB':
                    continue
                if cls.__exists_SV_around_cc(df, candidates, separator):
                    # Push the separator row and everything after it into
                    # the following sentence ids.
                    rows_after = (df.sidx >= separator.sidx) & (df.index >= idx)
                    df.loc[rows_after, "sidx"] += 1
                    df = Nlp_util.reset_widx(df)
                    split_done = True
                    break
            if not split_done:
                return df
    except Exception:
        # Best-effort: log and hand back whatever we have. BaseException
        # subclasses such as KeyboardInterrupt are deliberately not caught.
        logging.exception('')
        return df
def cut_sent_by_unimportant_words_at_head(cls, df):
    """Detach or drop unwanted leading tokens of each multi-word sentence.

    Repeatedly scans the sentences (groups of rows sharing ``sidx``) that
    contain more than one row and looks at the first row of each:

    * If its ``word`` appears in ``UNIMPORTANT_WORDS_FOR_REPEAT.word``,
      every row whose index is greater than that row's index gets
      ``sidx + 1`` (so the head word is left alone in its sentence id) and
      ``Nlp_util.reset_widx`` is applied.
    * Otherwise, if the word is neither purely alphabetic nor purely
      digits, ``cls.__remove_nums_n_symbols`` is called for that sentence.

    After any modification the scan restarts; it stops when a full pass
    changes nothing or the frame becomes empty.

    Args:
        cls: Object that provides ``__remove_nums_n_symbols``.
        df: Frame with ``word`` and ``sidx`` columns.

    Returns:
        The processed frame. If an ``Exception`` is raised it is logged
        and the ``df`` argument is returned.

    Note:
        The ``sidx`` increment is written through ``.loc`` on the frame
        currently held, which on the first modification is the caller's
        object.
    """
    try:
        fixed_df = df
        while len(fixed_df) != 0:
            modified = False
            for sidx in set(fixed_df.sidx):
                sentence = fixed_df[fixed_df.sidx == sidx]
                # Single-word sentences are never touched.
                if len(sentence) == 1:
                    continue

                head_row = sentence.iloc[0]
                head_word = head_row.word

                if head_word in UNIMPORTANT_WORDS_FOR_REPEAT.word.values:
                    # Shift everything after the head row to later ids.
                    fixed_df.loc[fixed_df.index > head_row.name, "sidx"] += 1
                    fixed_df = Nlp_util.reset_widx(fixed_df)
                    modified = True
                    break

                # Digit-only heads are kept; other non-alphabetic heads go
                # through the removal helper.
                if not head_word.isalpha() and not head_word.isdigit():
                    fixed_df = cls.__remove_nums_n_symbols(fixed_df, sidx)
                    modified = True
                    break

            # The frame changed under the iteration, so rescan from scratch;
            # a clean pass means we are done.
            if not modified:
                break
        return fixed_df
    except Exception:
        # Best-effort: log and return the argument. BaseException
        # subclasses such as KeyboardInterrupt are deliberately not caught.
        logging.exception('')
        return df
def __remove_nums_n_symbols(cls, fixed_df, sidx):
    """Drop the first row of sentence ``sidx`` and renumber that sentence.

    The first row whose ``sidx`` equals ``sidx`` is dropped, the frame
    index is rebuilt with ``reset_index(drop=True)``, and the remaining
    rows of that sentence are replaced by the result of
    ``Nlp_util.reset_widx`` applied to them.

    Args:
        cls: Unused here.
        fixed_df: Frame with a ``sidx`` column.
        sidx: Sentence id whose first row is removed.

    Returns:
        The updated frame. If an ``Exception`` is raised it is logged and
        ``fixed_df`` is returned in whatever state it had reached (the
        original argument if the drop itself failed).
    """
    try:
        first_row_label = fixed_df[fixed_df.sidx == sidx].iloc[0].name
        fixed_df = fixed_df.drop(first_row_label)
        fixed_df = fixed_df.reset_index(drop=True)

        # Build the mask after the index reset so it aligns with the new index.
        in_sentence = fixed_df.sidx == sidx
        fixed_df.loc[in_sentence] = Nlp_util.reset_widx(fixed_df.loc[in_sentence])
        return fixed_df
    except Exception:
        # Best-effort: log and return what we have. BaseException
        # subclasses such as KeyboardInterrupt are deliberately not caught.
        logging.exception('')
        return fixed_df
def remove_unimportant_words_at_tail(df):
    """Strip unimportant trailing words from each sentence.

    For every sentence id in ``df.sidx``, the row whose ``widx`` equals
    the sentence length minus one is inspected. While the sentence has
    more than one row and that row's ``word`` is a comma or appears in
    ``UNIMPORTANT_WORDS_FOR_REPEAT.word``, the row is dropped and
    ``Nlp_util.reset_widx`` is applied. The word ``'well'`` is an
    exception: it is kept (and stripping stops for that sentence) when its
    ``widx`` is greater than half the sentence length.

    Args:
        df: Frame with ``word``, ``widx`` and ``sidx`` columns.

    Returns:
        The stripped frame with its index rebuilt via
        ``reset_index(drop=True)``. If an ``Exception`` is raised it is
        logged and the frame as it stood at that moment is returned.
    """
    try:
        sentence_ids = set(df.sidx.values)
        # Loop-invariant, so built once rather than on every tail check.
        removable = list(UNIMPORTANT_WORDS_FOR_REPEAT.word.values) + [","]

        for sidx in sentence_ids:
            while True:
                sentence_len = len(df.loc[df.sidx == sidx])
                # Never strip a sentence down to nothing.
                if sentence_len == 1:
                    break

                tail_row = df.loc[(df.sidx == sidx) & (df.widx == sentence_len - 1)]
                tail_word = tail_row.word.values[0]
                if tail_word not in removable:
                    break
                # A trailing 'well' past the sentence midpoint is kept.
                if tail_word == 'well' and tail_row.widx.values[0] > sentence_len / 2:
                    break

                df = df.drop(tail_row.index[0])
                df = Nlp_util.reset_widx(df)

        df = df.reset_index(drop=True)
        return df
    except Exception:
        # Best-effort: log and return what we have. BaseException
        # subclasses such as KeyboardInterrupt are deliberately not caught.
        logging.exception('')
        return df