Ejemplo n.º 1
0
    def cut_sent_by_interjection(cls, df):
        """Split sentences at separator words by bumping ``sidx`` from the separator on.

        A row is a candidate separator when its ``word`` is one of the listed
        conjunction-like tokens and its ``widx`` is not 0.  Candidates are
        visited in row order; the first one accepted by
        ``cls.__exists_SV_around_cc`` starts a new sentence: every row at or
        after it (by index) whose ``sidx`` is at least the separator's gets
        ``sidx + 1``.  The scan then restarts on the updated frame and stops
        once a full pass accepts no candidate.

        Args:
            df: DataFrame with at least ``word``, ``widx`` and ``sidx``
                columns; ``pos`` is read for the words 'so' and 'cause'.
                The ``sidx`` column is updated in place on the passed frame.

        Returns:
            The processed DataFrame.  If an exception occurs it is logged and
            the frame in its current (possibly partially processed) state is
            returned.
        """
        try:
            separators = [",", "and", "but", "or", "then", "so", "plus", "cause", "because"]

            while True:
                separators_in_message = df[(df.word.isin(separators)) & (df.widx != 0)]

                for idx, separator in separators_in_message.iterrows():
                    # 'so' tagged RB and 'cause' tagged anything but VB are
                    # never used as split points.
                    if separator.word == 'so' and separator.pos == 'RB':
                        continue
                    if separator.word == 'cause' and separator.pos != 'VB':
                        continue

                    if cls.__exists_SV_around_cc(df, separators_in_message, separator):
                        df.loc[df[(df.sidx >= separator.sidx) & (df.index >= idx)].index, "sidx"] += 1
                        # NOTE(review): presumably reset_widx renumbers widx per
                        # sentence, so this separator becomes widx 0 and is not
                        # picked up again on the next pass -- confirm.
                        df = Nlp_util.reset_widx(df)
                        break
                else:
                    # No candidate (or none accepted) in a full pass: done.
                    return df
        except Exception:
            # Best effort: log and hand back whatever state df is in.
            # SystemExit / KeyboardInterrupt are deliberately not swallowed.
            logging.exception('')
            return df
Ejemplo n.º 2
0
    def cut_sent_by_unimportant_words_at_head(cls, df):
        """Detach or remove unwanted leading words of each multi-row sentence.

        For every ``sidx`` group with more than one row, the first row's word
        is inspected:

        * if it appears in ``UNIMPORTANT_WORDS_FOR_REPEAT.word``, the ``sidx``
          of every row with a larger index is incremented, which leaves the
          head word alone under its original ``sidx``;
        * otherwise, if it is neither alphabetic nor all digits, the row is
          removed through ``cls.__remove_nums_n_symbols``.

        After any change the scan restarts on the updated frame; it ends when
        a full pass changes nothing or the frame is empty.

        Args:
            df: DataFrame with ``word`` and ``sidx`` columns.  The first
                ``sidx`` bump is applied in place on the passed frame.

        Returns:
            The processed DataFrame.  If an exception occurs it is logged and
            the ``df`` argument is returned instead of the intermediate
            result.
        """
        try:
            fixed_df = df
            while len(fixed_df) != 0:
                for sidx in set(fixed_df.sidx):
                    sentence = fixed_df[fixed_df.sidx == sidx]
                    if len(sentence) == 1:
                        continue

                    head_row = sentence.iloc[0]
                    head_word = head_row.word

                    if head_word in UNIMPORTANT_WORDS_FOR_REPEAT.word.values:
                        # Push everything after the head word into later
                        # sentences so the head word stands alone.
                        fixed_df.loc[fixed_df.index > head_row.name, "sidx"] += 1
                        fixed_df = Nlp_util.reset_widx(fixed_df)
                        break

                    if not head_word.isalpha() and not head_word.isdigit():
                        fixed_df = cls.__remove_nums_n_symbols(fixed_df, sidx)
                        break
                else:
                    # A complete pass without modification: nothing left to do.
                    break

            return fixed_df
        except Exception:
            # Best effort: log and fall back to the caller's frame.
            # SystemExit / KeyboardInterrupt are deliberately not swallowed.
            logging.exception('')
            return df
Ejemplo n.º 3
0
    def __remove_nums_n_symbols(cls, fixed_df, sidx):
        """Drop the first row of sentence ``sidx`` and renumber that sentence.

        The first row whose ``sidx`` matches is dropped, the frame's index is
        rebuilt as 0..n-1, and the rows of that sentence are replaced by the
        result of ``Nlp_util.reset_widx`` applied to them.

        Args:
            fixed_df: DataFrame with a ``sidx`` column.
            sidx: Sentence id whose leading row is removed.

        Returns:
            The updated DataFrame.  If an exception occurs it is logged and
            the frame is returned in whatever state it had reached.
        """
        try:
            head_label = fixed_df[fixed_df.sidx == sidx].iloc[0].name
            fixed_df = fixed_df.drop(head_label)
            fixed_df = fixed_df.reset_index(drop=True)

            # Mask is computed after the re-index so it lines up with the new labels.
            in_sentence = fixed_df.sidx == sidx
            fixed_df.loc[in_sentence] = Nlp_util.reset_widx(fixed_df.loc[in_sentence])

            return fixed_df
        except Exception:
            # Best effort: log and return the current state.
            # SystemExit / KeyboardInterrupt are deliberately not swallowed.
            logging.exception('')
            return fixed_df
Ejemplo n.º 4
0
    def remove_unimportant_words_at_tail(df):
        """Strip trailing unimportant words (and commas) from each sentence.

        For every ``sidx`` group, the row whose ``widx`` equals the group size
        minus one is treated as the tail.  While the group has more than one
        row and the tail word is "," or listed in
        ``UNIMPORTANT_WORDS_FOR_REPEAT.word``, the tail row is dropped and
        ``Nlp_util.reset_widx`` is applied.  A tail 'well' is kept when its
        ``widx`` is greater than half the sentence length.

        Args:
            df: DataFrame with ``word``, ``sidx`` and ``widx`` columns.

        Returns:
            The trimmed DataFrame with its index rebuilt as 0..n-1.  If an
            exception occurs it is logged and the frame is returned in its
            current state, without the final re-index.
        """
        try:
            for sidx in set(df.sidx.values):
                while True:
                    sentence = df[df.sidx == sidx]
                    if len(sentence) == 1:
                        # Never remove the last remaining word of a sentence.
                        break

                    tail_row = sentence[sentence.widx == len(sentence) - 1]
                    tail_word = tail_row.word.values[0]

                    droppable = list(UNIMPORTANT_WORDS_FOR_REPEAT.word.values) + [","]
                    if tail_word not in droppable:
                        break

                    # A trailing 'well' past the sentence midpoint is kept.
                    if tail_word == 'well' and tail_row.widx.values[0] > len(sentence) / 2:
                        break

                    df = df.drop(tail_row.index[0])
                    df = Nlp_util.reset_widx(df)

            df = df.reset_index(drop=True)
            return df
        except Exception:
            # Best effort: log and return the current state.
            # SystemExit / KeyboardInterrupt are deliberately not swallowed.
            logging.exception('')
            return df