def load_data_context(data_path='data/train.txt', is_train=True):
    """Load conversation triples (and emotion targets when training) from a TSV file.

    Each data line is tab-separated: an id column, then CONV_PAD_LEN
    conversation-turn columns, then — in training data — an emotion label.
    The first line of the file is a header and is skipped.

    Args:
        data_path: path to the tab-separated data file.
        is_train: when True, also parse the emotion-label column and
            return targets mapped through EMOS_DIC.

    Returns:
        (data_list, target_list) when is_train, else data_list only.
        Each data_list entry is (a, b, c, raw_a, raw_b, raw_c): the three
        processed turns followed by their unprocessed originals.
    """
    data_list = []
    target_list = []
    # Context manager guarantees the handle is closed even if parsing below
    # raises; explicit encoding avoids locale-dependent decoding.
    with open(data_path, 'r', encoding='utf-8') as f_data:
        data_lines = f_data.readlines()
    # data_lines[0] is the header row — skip it by slicing instead of
    # testing the index on every iteration.
    for text in data_lines[1:]:
        tokens = text.split('\t')
        convers = tokens[1:CONV_PAD_LEN + 1]
        # Keep the raw turns alongside the processed ones.
        raw_a, raw_b, raw_c = convers[0], convers[1], convers[2]
        a = processing_pipeline(raw_a)
        b = processing_pipeline(raw_b)
        c = processing_pipeline(raw_c)
        data_list.append((a, b, c, raw_a, raw_b, raw_c))
        if is_train:
            # The label sits immediately after the conversation columns.
            emo = tokens[CONV_PAD_LEN + 1].strip()
            target_list.append(EMOS_DIC[emo])
    if is_train:
        return data_list, target_list
    return data_list
def clean_sentences(sent_text):
    """Decide whether a line should be kept and collapse repeated runs.

    A line is kept only when it contains at least one ASCII letter. The
    cleaned text is the processed sentence with consecutive repeats of a
    substring collapsed to a single occurrence.

    Returns:
        (to_keep, cleaned): a bool flag and the de-duplicated sentence.
    """
    # Flag lines that contain at least one letter anywhere on the line.
    has_letters = re.match(".*[a-zA-Z]+.*", sent_text) is not None
    # Collapse any immediately-repeated substring down to one copy.
    deduped = re.sub(r'(.+?)\1+', r'\1', processing_pipeline(sent_text))
    return has_letters, deduped
def load_data_context(data_path='data/train.txt', is_train=True):
    """Load conversation triples with per-turn token counts from a TSV file.

    NOTE(review): this redefines ``load_data_context`` declared earlier in
    the file (the last definition wins at import time) — consider giving
    each variant a distinct name.

    Each data line is tab-separated: an id column, then CONV_PAD_LEN
    conversation-turn columns, then — in training data — an emotion label.
    The first line of the file is a header and is skipped.

    Args:
        data_path: path to the tab-separated data file.
        is_train: when True, also parse the emotion-label column and
            return targets mapped through EMOS_DIC.

    Returns:
        (data_list, target_list) when is_train, else data_list only.
        Each data_list entry is (a, a_len, b, b_len, c, c_len): the three
        processed turns, each followed by its whitespace-token count.
    """
    data_list = []
    target_list = []
    # Context manager guarantees the handle is closed even if parsing below
    # raises; explicit encoding avoids locale-dependent decoding.
    with open(data_path, 'r', encoding='utf-8') as f_data:
        data_lines = f_data.readlines()
    # data_lines[0] is the header row — skip it via slicing.
    for text in data_lines[1:]:
        tokens = text.split('\t')
        convers = tokens[1:CONV_PAD_LEN + 1]
        a = processing_pipeline(convers[0])
        b = processing_pipeline(convers[1])
        c = processing_pipeline(convers[2])
        # Lengths are counted on the processed text, after the pipeline.
        data_list.append((a, len(a.split()), b, len(b.split()), c, len(c.split())))
        if is_train:
            # The label sits immediately after the conversation columns.
            emo = tokens[CONV_PAD_LEN + 1].strip()
            target_list.append(EMOS_DIC[emo])
    if is_train:
        return data_list, target_list
    return data_list
def load_data_context(data_path='/data/SuperMod/test_data.txt', is_train=True):
    """Load comments from a CSV file and split each into cleaned sentences.

    The label column depends on the file layout: files with more than four
    columns carry it as ``toxic``, narrower files as ``toxicity``.

    Args:
        data_path: path to the CSV file with a ``comment_text`` column.
        is_train: when True, also return the label column as targets.

    Returns:
        (clean_sent_list, target_list) when is_train, else clean_sent_list.
        Each clean_sent_list entry is the list of sentences of one comment
        after the processing pipeline.
    """
    df = pd.read_csv(data_path, encoding="utf8")
    data_list = df.comment_text.tolist()
    # Wider files name the label column differently; only the chosen
    # branch of the conditional touches its column, as in the original.
    label_col = df.toxic if len(df.columns) > 4 else df.toxicity
    target_list = label_col.tolist()
    clean_sent_list = [
        sent_tokenize(processing_pipeline(comment)) for comment in data_list
    ]
    if is_train:
        return clean_sent_list, target_list
    return clean_sent_list