# Punctuation characters that may appear repeated ("okay...sure") and must be
# collapsed to a single space-padded instance ("okay . sure").
_REPEATED_PUNCTUATION = ['.', '?', '!', ',']

# Regex passes that collapse "stretched" words (e.g. "thaaanks" -> "thanks").
# Compiled once at import time instead of on every input line.
# NOTE: order matters and is preserved from the original pipeline.
_WORD_SQUASH_PATTERNS = [
    (re.compile(pattern), repl) for pattern, repl in [
        ("tha+nks ", ' thanks '),
        ("Tha+nks ", ' Thanks '),
        ("yes+ ", ' yes '),
        ("Yes+ ", ' Yes '),
        ("very+ ", ' very '),
        ("go+d ", ' good '),
        ("Very+ ", ' Very '),
        ("why+ ", ' why '),
        ("wha+t ", ' what '),
        ("sil+y ", ' silly '),
        ("hm+ ", ' hmm '),
        ("no+ ", ' no '),
        ("sor+y ", ' sorry '),
        ("so+ ", ' so '),
        ("lie+ ", ' lie '),
        ("okay+ ", ' okay '),
        # FIX: replacement was 'laugh out loud' without surrounding spaces,
        # which fused the phrase into the neighbouring words.
        (' lol[a-z]+ ', ' laugh out loud '),
        (' wow+ ', ' wow '),
        ('wha+ ', ' what '),
        (' ok[a-z]+ ', ' ok '),
        (' u+ ', ' you '),
        (' wellso+n ', ' well soon '),
        (' byy+ ', ' bye '),
    ]
]

# Literal phrase substitutions (contractions, slang, typos), applied in order.
_PHRASE_REPLACEMENTS = [
    ('whats ', 'what is '),
    ("what's ", 'what is '),
    ("i'm ", 'i am '),
    ("it's ", 'it is '),
    ('Iam ', 'I am '),
    (' iam ', ' i am '),
    (' dnt ', ' do not '),
    ('I ve ', 'I have '),
    # FIX: replacements were " I'am " (non-word) and "i'm " (re-introducing a
    # contraction after the "i'm " -> "i am " pass already ran).
    ('I m ', ' I am '),
    ('i m ', 'i am '),
    # (duplicate "Iam " replacement removed — it was a no-op after the one above)
    ('iam ', 'i am '),
    ('dont ', 'do not '),
    ('google.co.in ', ' google '),
    (' hve ', ' have '),
    (' F ', ' F**k '),
    ("Ain't ", ' are not '),
    (' lv ', ' love '),
    (' ok~~ay~~ ', ' okay '),
    # FIX: replacement was ' It is' without the trailing space, fusing it
    # with the following word.
    (' Its ', ' It is '),
    (' its ', ' it is '),
    (' Nd ', ' and '),
    (' nd ', ' and '),
    ('i ll ', 'i will '),
]

# Collapse runs of spaces into one; compiled once instead of per line.
_DUPLICATE_SPACE_PATTERN = re.compile(r' +')


def _squash_repeated_punctuation(line):
    """Collapse runs of . ? ! , to a single instance padded with spaces.

    e.g. "okay...sure" -> "okay . sure", "okay!sure" -> "okay ! sure"
    """
    for c in _REPEATED_PUNCTUATION:
        # Dropping empty split parts is equivalent to the original
        # remove-until-ValueError loop, without the bare except.
        parts = [p for p in line.split(c) if p != '']
        line = (' ' + c + ' ').join(parts)
    return line


def _replace_emojis(line):
    """Replace every known emoji with its space-padded textual meaning."""
    for emoji_meta in TWEMOJI_LIST:
        # Drop empty / whitespace-only fragments produced by the split so
        # consecutive emojis don't leave stray separators behind.
        parts = [p for p in line.split(emoji_meta) if p not in ('', ' ', '  ')]
        line = (' ' + TWEMOJI[emoji_meta][0] + ' ').join(parts)
    return line


def _normalize_text(conv):
    """Expand slang and contractions, collapse stretched words, and map
    emoticon / logogram tokens to their textual meanings.

    Returns the normalized, lowercased, space-joined token string.
    """
    string = conv
    for pattern, repl in _WORD_SQUASH_PATTERNS:
        string = pattern.sub(repl, string)
    string = string.replace('’', "'").replace('"', ' ').replace("`", "'")
    for old, new in _PHRASE_REPLACEMENTS:
        string = string.replace(old, new)
    string = ' ' + string.lower()
    # Whole-word logogram expansion (keys are matched with space padding).
    for item in LOGOGRAM:
        string = string.replace(' ' + item + ' ',
                                ' ' + LOGOGRAM[item].lower() + ' ')
    list_str = ekphrasis_config(string)
    for index, token in enumerate(list_str):
        if token in EMOTICONS_TOKEN:
            # EMOTICONS_TOKEN values are wrapped in delimiter characters;
            # strip the first and last before lowercasing.
            list_str[index] = EMOTICONS_TOKEN[token][1:-1].lower()
    # Token-level logogram expansion for tokens ekphrasis split out.
    # (The original ran this identical loop twice — copy-paste duplicate.)
    for index, token in enumerate(list_str):
        if token in LOGOGRAM:
            list_str[index] = LOGOGRAM[token].lower()
    return ' '.join(list_str)


def preprocessData(dataFilePath, mode):
    """Load data from a file, process and return indices, conversations and
    labels in separate lists.

    Input:
        dataFilePath : Path to train/test file to be processed
        mode : "train" mode returns labels. "test" mode doesn't return labels.
    Output:
        indices : Unique conversation ID list
        conversations : List of 3 turn conversations, processed and each turn
                        separated by the <eos> tag
        labels : [Only available in "train" mode] List of labels
    """
    indices = []
    conversations = []
    labels = []
    with io.open(dataFilePath, encoding="utf8") as finput:
        finput.readline()  # skip the header row
        for line in finput:
            line = _squash_repeated_punctuation(line)
            line = _replace_emojis(line)
            line = line.strip().split('\t')
            if mode == "train":
                # Train data contains id, 3 turns and label.
                labels.append(emotion2label[line[4]])
            conv = ' <eos> '.join(line[1:4]) + ' '
            conv = _DUPLICATE_SPACE_PATTERN.sub(' ', conv)
            string = _normalize_text(conv)
            indices.append(int(line[0]))
            conversations.append(string.lower())
    if mode == "train":
        return indices, conversations, labels
    return indices, conversations
string = string.replace('dont ', 'do not ').replace( 'google.co.in ', ' google ').replace(' hve ', ' have ') string = string.replace('Ain\'t ', ' are not ').replace(' lv ', ' love ') string = string.replace(' ok~~ay~~ ', ' okay ').replace( ' Its ', ' It is').replace(' its ', ' it is ') string = string.replace(' Nd ', ' and ').replace(' nd ', ' and ').replace( 'i ll ', 'i will ') string = string.replace(" I'd ", ' i would ').replace(''', "'") string = ' ' + string.lower() for item in LOGOGRAM.keys(): string = string.replace(' ' + item + ' ', ' ' + LOGOGRAM[item].lower() + ' ') list_str = ekphrasis_config(string) for index in range(len(list_str)): if list_str[index] in slang_map.keys(): list_str[index] = slang_map[list_str[index]] string = ' '.join(list_str) list_str = string.split() for index in range(len(list_str)): if list_str[index] in EMOTICONS_TOKEN.keys(): # print('kkkkkkkkk') # print(EMOTICONS_TOKEN[list_str[index]][1:len(EMOTICONS_TOKEN[list_str[index]]) - 1].lower()) list_str[index] = EMOTICONS_TOKEN[ list_str[index]][1:len(EMOTICONS_TOKEN[list_str[index]]) - 1].lower() for index in range(len(list_str)):