def processemoji(text):
    """Normalise a piece of text and strip the emoji codes left in it.

    Steps, in order:
      1. Put spaces around every occurrence of the substrings 'user' and
         'hashtag'.
      2. Replace every emoji listed in TWEMOJI_LIST by the first element of
         its TWEMOJI entry, surrounded by spaces.
      3. Expand space-delimited LOGOGRAM keys to their lower-cased values.
      4. Tokenise with ekphrasis_config, then expand tokens found in
         EMOTICONS_TOKEN and LOGOGRAM.
      5. Lower-case, run emoji.demojize and delete every ':...:' code.

    Input:
        text : String to normalise
    Output:
        The normalised, lower-cased string
    """
    # Splitting and re-joining isolates each marker with spaces. Empty
    # fragments (marker at the start/end of the text, or repeated back to
    # back) are dropped, so those occurrences are collapsed or removed.
    for marker in ('user', 'hashtag'):
        fragments = [part for part in text.split(marker) if part != '']
        text = (' ' + marker + ' ').join(fragments)

    for emoji_meta in TWEMOJI_LIST:
        # When the emoji is absent, split/join would give the text back
        # unchanged, so skip the work.
        if emoji_meta not in text:
            continue
        fragments = text.split(emoji_meta)
        # Each round removes one '', one ' ' and one '  ' and stops at the
        # first value that is missing, so it does not drop every blank
        # fragment. This existing behaviour is kept as is.
        while True:
            try:
                fragments.remove('')
                fragments.remove(' ')
                fragments.remove('  ')
            except ValueError:
                break
            fragments = [part for part in fragments if part != '']
        text = (' ' + TWEMOJI[emoji_meta][0] + ' ').join(fragments)

    # Only keys that appear as a space-delimited word are expanded here.
    for item, expansion in LOGOGRAM.items():
        text = text.replace(' ' + item + ' ', ' ' + expansion.lower() + ' ')

    tokens = ekphrasis_config(text)
    for index, token in enumerate(tokens):
        if token in EMOTICONS_TOKEN:
            # Drop the first and last character of the stored value
            # (presumably a wrapping delimiter pair; verify against the
            # EMOTICONS_TOKEN definition).
            token = EMOTICONS_TOKEN[token][1:-1].lower()
        # Runs after the emoticon lookup so an emoticon expansion that is
        # itself a LOGOGRAM key is expanded as well.
        if token in LOGOGRAM:
            token = LOGOGRAM[token].lower()
        tokens[index] = token

    string = ' '.join(tokens)
    string = emoji.demojize(string.lower())
    # Delete every ':...:' span, i.e. the codes demojize leaves behind.
    return re.sub(r':\S+?:', '', string)
def preprocessData(dataFilePath, mode):
    """Load data from a file, process and return indices, conversations and labels in separate lists

    The first line of the file is skipped. Every following line is split on
    tabs: field 0 is the conversation id, fields 1-3 are the turns and, in
    "train" mode, field 4 is the label name looked up in emotion2label.

    Input:
        dataFilePath : Path to train/test file to be processed
        mode : "train" mode returns labels. Any other mode doesn't return labels.
    Output:
        indices : Conversation ID list (converted with int())
        conversations : List of 3 turn conversations, processed, lower-cased and each turn
                        separated by the <eos> tag
        labels : [Only available in "train" mode] List of labels
    """
    # Regex rewrites for elongated / misspelt words, applied in order.
    # NOTE(review): most patterns are not anchored at a word start, so they
    # also fire inside longer words (e.g. "so+ " matches the end of "also ").
    elongation_rules = [
        ("tha+nks ", ' thanks '),
        ("Tha+nks ", ' Thanks '),
        ("yes+ ", ' yes '),
        ("Yes+ ", ' Yes '),
        ("very+ ", ' very '),
        ("go+d ", ' good '),
        ("Very+ ", ' Very '),
        ("why+ ", ' why '),
        ("wha+t ", ' what '),
        ("sil+y ", ' silly '),
        ("hm+ ", ' hmm '),
        ("no+ ", ' no '),
        ("sor+y ", ' sorry '),
        ("so+ ", ' so '),
        ("lie+ ", ' lie '),
        ("okay+ ", ' okay '),
        # The pattern consumes the spaces on both sides, so the replacement
        # must put them back or the neighbouring words get glued together.
        (' lol[a-z]+ ', ' laugh out loud '),
        (' wow+ ', ' wow '),
        ('wha+ ', ' what '),
        (' ok[a-z]+ ', ' ok '),
        (' u+ ', ' you '),
        (' wellso+n ', ' well soon '),
        (' byy+ ', ' bye '),
    ]
    # Plain substring rewrites, applied in order after the regex rules.
    # The order matters: later rules see the output of earlier ones.
    literal_rules = [
        ('’', "'"),
        ('"', ' '),
        ("`", "'"),
        ('whats ', 'what is '),
        ("what's ", 'what is '),
        ("i'm ", 'i am '),
        ("it's ", 'it is '),
        ('Iam ', 'I am '),
        (' iam ', ' i am '),
        (' dnt ', ' do not '),
        ('I ve ', 'I have '),
        ('I m ', " I'am "),
        ('i m ', "i'm "),
        ('Iam ', 'I am '),
        ('iam ', 'i am '),
        ('dont ', 'do not '),
        ('google.co.in ', ' google '),
        (' hve ', ' have '),
        (' F ', ' F**k '),
        ("Ain't ", ' are not '),
        (' lv ', ' love '),
        (' ok~~ay~~ ', ' okay '),
        # Trailing space restored so the following word is not glued on.
        (' Its ', ' It is '),
        (' its ', ' it is '),
        ('  Nd  ', ' and '),
        (' nd ', ' and '),
        ('i ll ', 'i will '),
    ]
    duplicate_space = re.compile(r'\ +')

    indices = []
    conversations = []
    labels = []
    with io.open(dataFilePath, encoding="utf8") as finput:
        finput.readline()  # skip the first line of the file
        for line in finput:
            # Convert multiple instances of . ? ! , to single instance
            # okay...sure -> okay . sure
            # okay???sure -> okay ? sure
            # Add whitespace around such punctuation
            # okay!sure -> okay ! sure
            for c in ('.', '?', '!', ','):
                pieces = [part for part in line.split(c) if part != '']
                line = (' ' + c + ' ').join(pieces)

            for emoji_meta in TWEMOJI_LIST:
                # When the emoji is absent, split/join would give the line
                # back unchanged, so skip the work.
                if emoji_meta not in line:
                    continue
                fragments = line.split(emoji_meta)
                # Each round removes one '', one ' ' and one '  ' and stops
                # at the first value that is missing, so it does not drop
                # every blank fragment. This existing behaviour is kept.
                while True:
                    try:
                        fragments.remove('')
                        fragments.remove(' ')
                        fragments.remove('  ')
                    except ValueError:
                        break
                    fragments = [part for part in fragments if part != '']
                line = (' ' + TWEMOJI[emoji_meta][0] + ' ').join(fragments)

            fields = line.strip().split('\t')
            if mode == "train":
                # Train data contains id, 3 turns and label
                labels.append(emotion2label[fields[4]])

            conv = ' <eos> '.join(fields[1:4]) + ' '

            # Remove any duplicate spaces
            string = duplicate_space.sub(' ', conv)

            for pattern, replacement in elongation_rules:
                string = re.sub(pattern, replacement, string)
            for old, new in literal_rules:
                string = string.replace(old, new)

            # Leading space lets a LOGOGRAM key at the very start match the
            # space-delimited replacement below.
            string = ' ' + string.lower()
            for item, expansion in LOGOGRAM.items():
                string = string.replace(' ' + item + ' ',
                                        ' ' + expansion.lower() + ' ')

            tokens = ekphrasis_config(string)
            for index, token in enumerate(tokens):
                if token in EMOTICONS_TOKEN:
                    # Drop the first and last character of the stored value
                    # (presumably a wrapping delimiter pair; verify against
                    # the EMOTICONS_TOKEN definition).
                    token = EMOTICONS_TOKEN[token][1:-1].lower()
                # Two lookups, so an expansion that is itself a LOGOGRAM key
                # is expanded once more.
                for _ in range(2):
                    if token in LOGOGRAM:
                        token = LOGOGRAM[token].lower()
                tokens[index] = token

            indices.append(int(fields[0]))
            conversations.append(' '.join(tokens).lower())

    if mode == "train":
        return indices, conversations, labels
    return indices, conversations
def review_to_wordlist(review_text):
    """Normalise a piece of text and tokenise it with stanford_tokenizer.

    Steps, in order:
      1. Collapse runs of . ? ! , " to one instance surrounded by spaces.
      2. Replace every emoji listed in TWEMOJI_LIST by the first element of
         its TWEMOJI entry, then run emoji_to_text.
      3. Replace runs of '@name ' mentions by ' @USER ' and squeeze spaces.
      4. Apply the regex and literal spelling rewrites, with
         abbreviation_to_text in between.
      5. Expand LOGOGRAM / EMOTICONS_TOKEN entries around ekphrasis_config
         tokenisation.
      6. Replace every character outside [a-zA-Z0-9@&:] by a space.

    Input:
        review_text : String to normalise
    Output:
        The value returned by stanford_tokenizer for the normalised text
        (presumably a list of words; verify against stanford_tokenizer)
    """
    # Each round must work on the output of the previous one, otherwise only
    # the last character / emoji would take effect.
    for c in ('.', '?', '!', ',', '"'):
        pieces = [part for part in review_text.split(c) if part != '']
        review_text = (' ' + c + ' ').join(pieces)

    for emoji_meta in TWEMOJI_LIST:
        # When the emoji is absent, split/join would give the text back
        # unchanged, so skip the work.
        if emoji_meta not in review_text:
            continue
        fragments = review_text.split(emoji_meta)
        # Each round removes one '', one ' ' and one '  ' and stops at the
        # first value that is missing, so it does not drop every blank
        # fragment. This existing behaviour is kept as is.
        while True:
            try:
                fragments.remove('')
                fragments.remove(' ')
                fragments.remove('  ')
            except ValueError:
                break
            fragments = [part for part in fragments if part != '']
        review_text = (' ' + TWEMOJI[emoji_meta][0] + ' ').join(fragments)

    review_text = emoji_to_text(review_text)

    review_text = re.sub(r"(@[\w]*\ )+", " @USER ", review_text)
    review_text = re.sub(r'\ +', ' ', review_text).strip()

    # Regex rewrites for elongated / misspelt words, applied in order.
    # NOTE(review): most patterns are not anchored at a word start, so they
    # also fire inside longer words (e.g. "so+ " matches the end of "also ").
    elongation_rules = [
        ("tha+nks ", ' thanks '),
        ("Tha+nks ", ' Thanks '),
        ("yes+ ", ' yes '),
        ("Yes+ ", ' Yes '),
        ("very+ ", ' very '),
        ("go+d ", ' good '),
        ("Very+ ", ' Very '),
        ("why+ ", ' why '),
        ("wha+t ", ' what '),
        ("sil+y ", ' silly '),
        ("hm+ ", ' hmm '),
        ("no+ ", ' no '),
        ("sor+y ", ' sorry '),
        ("so+ ", ' so '),
        ("lie+ ", ' lie '),
        ("okay+ ", ' okay '),
        # The pattern consumes the spaces on both sides, so the replacement
        # must put them back or the neighbouring words get glued together.
        (' lol[a-z]+ ', ' laugh out loud '),
        (' wow+ ', ' wow '),
        ('wha+ ', ' what '),
        (' ok[a-z]+ ', ' ok '),
        (' u+ ', ' you '),
        (' wellso+n ', ' well soon '),
        (' byy+ ', ' bye '),
        # Re-joins a word split as '<x>l s<y>' into ' also '.
        (r"(\wl\ss\w)+", ' also '),
        (r"(\sbro$)+", " brother "),
        (r"\stv", " Television "),
    ]
    for pattern, replacement in elongation_rules:
        review_text = re.sub(pattern, replacement, review_text)

    review_text = abbreviation_to_text(review_text)

    # Plain substring rewrites, applied in order; later rules see the output
    # of earlier ones. Where a rule consumes a leading or trailing space the
    # replacement restores it so neighbouring words stay separate.
    literal_rules = [
        ('whats ', 'what is '),
        (" i'm ", ' i am '),
        ("it's ", 'it is '),
        ('Iam ', 'I am '),
        (' iam ', ' i am '),
        (' dnt ', ' do not '),
        ('I ve ', 'I have '),
        (' I m ', " I'am "),
        (' i m ', " i'm "),
        (' Iam ', ' I am '),
        (' iam ', ' i am '),
        ('dont ', 'do not '),
        ('google.co.in ', ' google '),
        (' hve ', ' have '),
        (' F ', ' F**k '),
        ("Ain't ", ' are not '),
        (' lv ', ' love '),
        (' ok~~ay~~ ', ' okay '),
        (' Its ', ' It is '),
        (' its ', ' it is '),
        ('  Nd  ', ' and '),
        (' nd ', ' and '),
        ('i ll ', 'i will '),
    ]
    string = review_text
    for old, new in literal_rules:
        string = string.replace(old, new)

    # Leading space lets a LOGOGRAM key at the very start match the
    # space-delimited replacement below.
    string = ' ' + string
    for item, expansion in LOGOGRAM.items():
        string = string.replace(' ' + item + ' ', ' ' + expansion + ' ')

    tokens = ekphrasis_config(string)
    for index, token in enumerate(tokens):
        if token in EMOTICONS_TOKEN:
            # Drop the first and last character of the stored value
            # (presumably a wrapping delimiter pair; verify against the
            # EMOTICONS_TOKEN definition).
            token = EMOTICONS_TOKEN[token][1:-1]
        # Two lookups, so an expansion that is itself a LOGOGRAM key is
        # expanded once more.
        for _ in range(2):
            if token in LOGOGRAM:
                token = LOGOGRAM[token]
        tokens[index] = token

    string = ' '.join(tokens)
    # Keep only letters, digits, '@', '&' and ':'; everything else becomes
    # a space.
    review_text = re.sub(r"[^a-zA-Z0-9\@\&\:]", " ", string)

    return stanford_tokenizer(review_text)
                                    'i ll ', 'i will ')
        string = string.replace(" I'd ", ' i would ').replace('&apos;', "'")
        string = ' ' + string.lower()
        for item in LOGOGRAM.keys():
            string = string.replace(' ' + item + ' ',
                                    ' ' + LOGOGRAM[item].lower() + ' ')

        list_str = ekphrasis_config(string)
        for index in range(len(list_str)):
            if list_str[index] in slang_map.keys():
                list_str[index] = slang_map[list_str[index]]
        string = ' '.join(list_str)

        list_str = string.split()
        for index in range(len(list_str)):
            if list_str[index] in EMOTICONS_TOKEN.keys():
                # print('kkkkkkkkk')
                # print(EMOTICONS_TOKEN[list_str[index]][1:len(EMOTICONS_TOKEN[list_str[index]]) - 1].lower())
                list_str[index] = EMOTICONS_TOKEN[
                    list_str[index]][1:len(EMOTICONS_TOKEN[list_str[index]]) -
                                     1].lower()

        for index in range(len(list_str)):
            if list_str[index] in LOGOGRAM.keys():
                list_str[index] = LOGOGRAM[list_str[index]].lower()

        for index in range(len(list_str)):
            if list_str[index] in LOGOGRAM.keys():
                list_str[index] = LOGOGRAM[list_str[index]].lower()

        string = ' '.join(list_str)