Example #1
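The snippets below come from one preprocessing module; a hedged guess at the imports they share is sketched here (the project-specific lookup tables and helpers such as LOGOGRAM, TWEMOJI, TWEMOJI_LIST, EMOTICONS_TOKEN, emotion2label, slang_map, ekphrasis_config, stanford_tokenizer, emoji_to_text and abbreviation_to_text are defined elsewhere in that module and are not reproduced on this page):

# Assumed shared imports for the excerpts below (a sketch, not part of the source listing).
# Project-level tables and helpers (LOGOGRAM, TWEMOJI, TWEMOJI_LIST, EMOTICONS_TOKEN,
# emotion2label, slang_map, ekphrasis_config, stanford_tokenizer, emoji_to_text,
# abbreviation_to_text) are assumed to be importable from the surrounding project.
import io
import re

import emoji
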
def clean_str(string):
    """
        Tokenization/string cleaning for dataset
        Every dataset is lower cased except
        """
    string = string.replace(':)', ' smile ').replace(':-)', ' smile ') \
        .replace(':D', ' smile ').replace('=)', ' smile ').replace('😄', ' smile ').replace('☺', ' smile ')
    string = string.replace('❤', ' like ').replace('<3', ' like ').replace(
        '💕', ' like ').replace('😍', ' like ')
    string = string.replace('🤗', ' happy ').replace(':-)', ' happy ')
    string = string.replace(':(', ' unhappy ').replace(':-(', ' unhappy ').replace('💔', ' unhappy ') \
        .replace('😕', ' unhappy ').replace('😤', ' unhappy ')
    string = string.replace('😡', ' anger ').replace('🙃', ' anger ')
    string = string.replace('😞', ' sadness ').replace('😓',
                                                      ' sadness ').replace(
                                                          '😔', ' sadness ')
    string = string.replace(';-;', ' unhappy ')

    string = string.replace('’', '\'').replace('"', ' ')
    string = string.replace('whats ', 'what is ')
    string = string.replace('Iam ', 'I am ').replace(' iam ', ' i am ').replace(
        ' dnt ', ' do not ')
    string = string.replace('I ve ', 'I have ').replace('I m ', 'I am ').replace(
        'i m ', 'i\'m ')
    string = string.replace('Iam ', 'I am ').replace('iam ', 'i am ')
    string = string.replace('dont ', 'do not ').replace(
        'google.co.in ', 'google ').replace('hve ', 'have ')
    string = string.replace(' F ', ' F**k ').replace('Ain\'t ',
                                                     ' are not ').replace(
                                                         ' lv ', ' love ')
    string = string.replace(' ok~~ay~~ ', ' okay ').replace(
        ' Its ', ' It is ').replace(' its ', ' it is ')
    string = string.replace('  Nd  ', ' and ').replace(' nd ', ' and ')
    string = string.replace('Thnx ', ' Thanks ').replace('[#TRIGGERWORD#]', '')

    # drop non-ASCII characters
    string = re.sub(r'[^\x00-\x7f]', ' ', string)
    word1_list = string.split()
    for index in range(len(word1_list)):
        if word1_list[index] in LOGOGRAM.keys():
            word1_list[index] = LOGOGRAM[word1_list[index]]
    string = ' '.join(word1_list)
    # letters only
    # string = re.sub("[^a-zA-Z\'.!?]", " ", string)
    string = string.lower()
    word_list = string.split()
    for index in range(len(word_list)):
        if word_list[index] in LOGOGRAM.keys():
            word_list[index] = LOGOGRAM[word_list[index]]

    string = " ".join(word_list)

    # words = stanford_tokenizer(string)

    # stops = set(stopwords.words("english"))
    # meaningful_words = [w for w in words if not w in stops]
    # string = " ".join(words)
    return string
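A minimal usage sketch for clean_str, with a tiny stand-in LOGOGRAM table (the project supplies its own, much larger dictionary):

# Illustrative call only; this LOGOGRAM is a hypothetical stand-in.
LOGOGRAM = {'u': 'you', 'r': 'are', 'gr8': 'great'}

sample = "whats up 😄 u r gr8 :)"
print(clean_str(sample))  # roughly: 'what is up smile you are great smile'
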
def processemoji(text):
    repeatedChars = ['user', 'hashtag']
    for c in repeatedChars:
        lineSplit = text.split(c)
        while True:
            try:
                lineSplit.remove('')
            except ValueError:
                break
        cSpace = ' ' + c + ' '
        text = cSpace.join(lineSplit)

    emoji_repeatedChars = TWEMOJI_LIST
    for emoji_meta in emoji_repeatedChars:
        emoji_lineSplit = text.split(emoji_meta)
        while True:
            try:
                emoji_lineSplit.remove('')
                emoji_lineSplit.remove(' ')
                emoji_lineSplit.remove('  ')
                emoji_lineSplit = [x for x in emoji_lineSplit if x != '']
            except ValueError:
                break
        emoji_cSpace = ' ' + TWEMOJI[emoji_meta][0] + ' '
        text = emoji_cSpace.join(emoji_lineSplit)

    for item in LOGOGRAM.keys():
        text = text.replace(' ' + item + ' ',
                            ' ' + LOGOGRAM[item].lower() + ' ')
        # print(item)

    list_str = ekphrasis_config(text)
    for index in range(len(list_str)):

        if list_str[index] in EMOTICONS_TOKEN.keys():
            # drop the leading and trailing character of the mapped emoticon token
            list_str[index] = EMOTICONS_TOKEN[list_str[index]][1:-1].lower()
    for index in range(len(list_str)):
        if list_str[index] in LOGOGRAM.keys():
            # print("kkk",list_str[index])
            list_str[index] = LOGOGRAM[list_str[index]].lower()

    string = ' '.join(list_str)
    string = emoji.demojize(string.lower())
    string = re.sub(r':\S+?:', '', string)

    return string
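The split-and-rejoin idiom used in processemoji (split the text on an emoji, drop the empty pieces, and rejoin with the emoji's word padded by spaces) can be seen in isolation in the simplified sketch below; TWEMOJI here is a hypothetical two-entry stand-in for the project's table.

# Simplified sketch of the split-and-rejoin substitution above (not the project's exact code).
TWEMOJI = {'😂': ['joy'], '🙂': ['smile']}  # hypothetical stand-in table
TWEMOJI_LIST = list(TWEMOJI.keys())

def pad_emoji(text):
    for emo in TWEMOJI_LIST:
        parts = [p for p in text.split(emo) if p.strip() != '']
        text = (' ' + TWEMOJI[emo][0] + ' ').join(parts)
    return text

print(pad_emoji('great😂😂really 🙂 ok'))  # -> 'great joy really  smile  ok'
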
Example #3
def logogram_processing(review):
    string = review.replace('’', '\'')
    string = string.replace('Iam ', ' I am ').replace(' iam ', ' i am ').replace(
        ' dnt ', ' do not ')
    string = string.replace('I ve ', ' I have ').replace(
        'I m ', ' I am ').replace('i m ', ' i\'m ')
    string = string.replace('Iam ', ' I am ').replace('iam ', ' i am ')
    string = string.replace('dont ', ' do not ').replace(
        'google.co.in ', 'google ').replace('hve ', ' have ')
    string = string.replace(' F ', ' F**k ').replace(
        'Ain\'t ', ' are not ').replace(' lv ', ' love ')
    string = string.replace(' ok~~ay~~ ', ' okay ').replace(
        ' Its ', ' It is ').replace(' its ', ' it is ')
    string = string.replace('  Nd  ', ' and ').replace(' nd ', ' and ')
    string = string.replace('Thnx ', ' Thanks ')
    review_list = string.split(' ')
    for index in range(len(review_list)):
        if review_list[index] in LOGOGRAM.keys():
            review_list[index] = LOGOGRAM[review_list[index]]
    cleaned = ' '.join(review_list)
    print(cleaned)

    return cleaned
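The core of logogram_processing is a per-token dictionary lookup; the sketch below shows the same idea with a hypothetical two-entry table.

# Hypothetical stand-in table; the project defines its own LOGOGRAM dictionary.
LOGOGRAM = {'gr8': 'great', 'b4': 'before'}

tokens = 'see you b4 the gr8 show'.split(' ')
tokens = [LOGOGRAM.get(tok, tok) for tok in tokens]
print(' '.join(tokens))  # -> 'see you before the great show'
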
def preprocessData(dataFilePath, mode):
    """Load data from a file, process and return indices, conversations and labels in separate lists
    Input:
        dataFilePath : Path to train/test file to be processed
        mode : "train" mode returns labels. "test" mode doesn't return labels.
    Output:
        indices : Unique conversation ID list
        conversations : List of 3-turn conversations, processed, with each turn separated by the <eos> tag
        labels : [Only available in "train" mode] List of labels
    """
    indices = []
    conversations = []
    labels = []
    with io.open(dataFilePath, encoding="utf8") as finput:
        finput.readline()
        for line in finput:
            # Convert multiple instances of . ? ! , to single instance
            # okay...sure -> okay . sure
            # okay???sure -> okay ? sure
            # Add whitespace around such punctuation
            # okay!sure -> okay ! sure
            repeatedChars = ['.', '?', '!', ',']
            for c in repeatedChars:
                lineSplit = line.split(c)
                while True:
                    try:
                        lineSplit.remove('')
                    except ValueError:
                        break
                cSpace = ' ' + c + ' '
                line = cSpace.join(lineSplit)

            emoji_repeatedChars = TWEMOJI_LIST
            for emoji_meta in emoji_repeatedChars:
                emoji_lineSplit = line.split(emoji_meta)
                while True:
                    try:
                        emoji_lineSplit.remove('')
                        emoji_lineSplit.remove(' ')
                        emoji_lineSplit.remove('  ')
                        emoji_lineSplit = [
                            x for x in emoji_lineSplit if x != ''
                        ]
                    except ValueError:
                        break
                emoji_cSpace = ' ' + TWEMOJI[emoji_meta][0] + ' '
                line = emoji_cSpace.join(emoji_lineSplit)

            line = line.strip().split('\t')
            if mode == "train":
                # Train data contains id, 3 turns and label
                label = emotion2label[line[4]]
                labels.append(label)

            conv = ' <eos> '.join(line[1:4]) + ' '

            # Remove any duplicate spaces
            duplicateSpacePattern = re.compile(r'\ +')
            conv = re.sub(duplicateSpacePattern, ' ', conv)

            string = re.sub("tha+nks ", ' thanks ', conv)
            string = re.sub("Tha+nks ", ' Thanks ', string)
            string = re.sub("yes+ ", ' yes ', string)
            string = re.sub("Yes+ ", ' Yes ', string)
            string = re.sub("very+ ", ' very ', string)
            string = re.sub("go+d ", ' good ', string)
            string = re.sub("Very+ ", ' Very ', string)
            string = re.sub("why+ ", ' why ', string)
            string = re.sub("wha+t ", ' what ', string)
            string = re.sub("sil+y ", ' silly ', string)
            string = re.sub("hm+ ", ' hmm ', string)
            string = re.sub("no+ ", ' no ', string)
            string = re.sub("sor+y ", ' sorry ', string)
            string = re.sub("so+ ", ' so ', string)
            string = re.sub("lie+ ", ' lie ', string)
            string = re.sub("okay+ ", ' okay ', string)
            string = re.sub(' lol[a-z]+ ', ' laugh out loud ', string)
            string = re.sub(' wow+ ', ' wow ', string)
            string = re.sub('wha+ ', ' what ', string)
            string = re.sub(' ok[a-z]+ ', ' ok ', string)
            string = re.sub(' u+ ', ' you ', string)
            string = re.sub(' wellso+n ', ' well soon ', string)
            string = re.sub(' byy+ ', ' bye ', string)
            string = string.replace('’', '\'').replace('"',
                                                       ' ').replace("`", "'")
            string = string.replace('whats ', 'what is ').replace(
                "what's ", 'what is ').replace("i'm ", 'i am ')
            string = string.replace("it's ", 'it is ')
            string = string.replace('Iam ', 'I am ').replace(
                ' iam ', ' i am ').replace(' dnt ', ' do not ')
            string = string.replace('I ve ', 'I have ').replace(
                'I m ', ' I am ').replace('i m ', 'i\'m ')
            string = string.replace('Iam ', 'I am ').replace('iam ', 'i am ')
            string = string.replace('dont ', 'do not ').replace(
                'google.co.in ', ' google ').replace(' hve ', ' have ')
            string = string.replace(' F ', ' F**k ').replace(
                'Ain\'t ', ' are not ').replace(' lv ', ' love ')
            string = string.replace(' ok~~ay~~ ', ' okay ').replace(
                ' Its ', ' It is ').replace(' its ', ' it is ')
            string = string.replace('  Nd  ', ' and ').replace(
                ' nd ', ' and ').replace('i ll ', 'i will ')
            string = ' ' + string.lower()
            for item in LOGOGRAM.keys():
                string = string.replace(' ' + item + ' ',
                                        ' ' + LOGOGRAM[item].lower() + ' ')

            list_str = ekphrasis_config(string)
            for index in range(len(list_str)):
                if list_str[index] in EMOTICONS_TOKEN.keys():
                    # drop the leading and trailing character of the mapped emoticon token
                    list_str[index] = EMOTICONS_TOKEN[list_str[index]][1:-1].lower()

            for index in range(len(list_str)):
                if list_str[index] in LOGOGRAM.keys():
                    list_str[index] = LOGOGRAM[list_str[index]].lower()

            string = ' '.join(list_str)
            indices.append(int(line[0]))
            conversations.append(string.lower())
    if mode == "train":
        return indices, conversations, labels
    else:
        return indices, conversations
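A hedged usage sketch for preprocessData: the file name and the emotion2label map below are illustrative assumptions, and the input is expected to be a tab-separated file with a header row and the columns id, turn1, turn2, turn3 and (in train mode) label, as the docstring describes.

# Illustrative only: 'train.txt' and this emotion2label mapping are assumptions.
# Expected line layout (tab-separated, after a header row):
#   id <TAB> turn1 <TAB> turn2 <TAB> turn3 <TAB> label
emotion2label = {'others': 0, 'happy': 1, 'sad': 2, 'angry': 3}

indices, conversations, labels = preprocessData('train.txt', mode='train')
print(indices[0], labels[0])
print(conversations[0])  # three turns joined by ' <eos> ', lower-cased and normalized
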
def review_to_wordlist(review_text):
    repeatedChars = ['.', '?', '!', ',', '"']
    for c in repeatedChars:
        lineSplit = review_text.split(c)
        # print(lineSplit)
        while True:
            try:
                lineSplit.remove('')
            except:
                break
        cSpace = ' ' + c + ' '
        line = cSpace.join(lineSplit)

    emoji_repeatedChars = TWEMOJI_LIST
    for emoji_meta in emoji_repeatedChars:
        emoji_lineSplit = line.split(emoji_meta)
        while True:
            try:
                emoji_lineSplit.remove('')
                emoji_lineSplit.remove(' ')
                emoji_lineSplit.remove('  ')
                emoji_lineSplit = [x for x in emoji_lineSplit if x != '']
            except ValueError:
                break
        emoji_cSpace = ' ' + TWEMOJI[emoji_meta][0] + ' '
        review_text = emoji_cSpace.join(emoji_lineSplit)

    review_text = emoji_to_text(review_text)

    review_text = re.sub("(@[\w]*\ )+", " @USER ", review_text)

    duplicateSpacePattern = re.compile(r'\ +')
    review_text = re.sub(duplicateSpacePattern, ' ', review_text).strip()
    # print(review_text)

    string = re.sub("tha+nks ", ' thanks ', review_text)
    string = re.sub("Tha+nks ", ' Thanks ', string)
    string = re.sub("yes+ ", ' yes ', string)
    string = re.sub("Yes+ ", ' Yes ', string)
    string = re.sub("very+ ", ' very ', string)
    string = re.sub("go+d ", ' good ', string)
    string = re.sub("Very+ ", ' Very ', string)
    string = re.sub("why+ ", ' why ', string)
    string = re.sub("wha+t ", ' what ', string)
    string = re.sub("sil+y ", ' silly ', string)
    string = re.sub("hm+ ", ' hmm ', string)
    string = re.sub("no+ ", ' no ', string)
    string = re.sub("sor+y ", ' sorry ', string)
    string = re.sub("so+ ", ' so ', string)
    string = re.sub("lie+ ", ' lie ', string)
    string = re.sub("okay+ ", ' okay ', string)
    string = re.sub(' lol[a-z]+ ', ' laugh out loud ', string)
    string = re.sub(' wow+ ', ' wow ', string)
    string = re.sub('wha+ ', ' what ', string)
    string = re.sub(' ok[a-z]+ ', ' ok ', string)
    string = re.sub(' u+ ', ' you ', string)
    string = re.sub(' wellso+n ', ' well soon ', string)
    review_text = re.sub(' byy+ ', ' bye ', string)
    # review_text = re.sub("(im\s)+", " i am ", review_text)
    review_text = re.sub("(\wl\ss\w)+", ' also ', review_text)
    # review_text = re.sub("(IM\s)+", " i am ", review_text)
    review_text = re.sub("(\sbro$)+", " brother ", review_text)
    review_text = re.sub("\stv", " Television ", review_text)
    # review_text = review_text.replace('’', '\'').replace('"', ' ').replace("`", "'")

    review_text = abbreviation_to_text(review_text)

    string = review_text.replace('whats ', 'what is ').replace(
        " i'm ", ' i am ')
    string = string.replace("it's ", 'it is ')
    string = string.replace('Iam ', 'I am ').replace(' iam ',
                                                     ' i am ').replace(
                                                         ' dnt ', ' do not ')
    string = string.replace('I ve ', 'I have ').replace(
        ' I m ', ' I am ').replace(' i m ', ' i am ')
    string = string.replace(' Iam ', ' I am ').replace(' iam ', ' i am ')
    string = string.replace('dont ', 'do not ').replace(
        'google.co.in ', ' google ').replace(' hve ', ' have ')
    string = string.replace(' F ', ' F**k ').replace('Ain\'t ',
                                                     ' are not ').replace(
                                                         ' lv ', ' love ')
    string = string.replace(' ok~~ay~~ ', ' okay ').replace(
        ' Its ', ' It is ').replace(' its ', ' it is ')
    string = string.replace('  Nd  ', ' and ').replace(' nd ',
                                                       ' and ').replace(
                                                           'i ll ', 'i will ')

    # string = ' ' + string
    # string = abbreviation_to_text(string)
    string = ' ' + string
    for item in LOGOGRAM.keys():
        string = string.replace(' ' + item + ' ', ' ' + LOGOGRAM[item] + ' ')

    list_str = ekphrasis_config(string)
    for index in range(len(list_str)):
        if list_str[index] in EMOTICONS_TOKEN.keys():
            # drop the leading and trailing character of the mapped emoticon token
            list_str[index] = EMOTICONS_TOKEN[list_str[index]][1:-1]

    for index in range(len(list_str)):
        if list_str[index] in LOGOGRAM.keys():
            list_str[index] = LOGOGRAM[list_str[index]]

    string = ' '.join(list_str)
    # review_text = re.sub("(@[\w]*\ )+", " @USER ", string)

    # duplicateSpacePattern = re.compile(r'\ +')
    # review_text = re.sub(duplicateSpacePattern, ' ', review_text).strip()
    # print(review_text)

    # review_text = ekphrasis_config(review_text)
    # print(review_text)
    review_text = re.sub("[^a-zA-Z0-9\@\&\:]", " ", string)

    # review_text = review_text.lower()

    words = stanford_tokenizer(review_text)

    return words
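The long run of re.sub calls that collapse elongated words ('thaaanks', 'goood', and so on) can also be expressed table-driven; the sketch below is an equivalent reformulation of a few of those rules, not code from the source.

# Sketch of the elongation-normalization rules used above, expressed as a table.
import re

ELONGATION_FIXES = [
    (r"tha+nks ", " thanks "),
    (r"go+d ", " good "),
    (r"sor+y ", " sorry "),
    (r" u+ ", " you "),
]

def normalize_elongations(text):
    for pattern, repl in ELONGATION_FIXES:
        text = re.sub(pattern, repl, text)
    return text

print(normalize_elongations("thaaanks goood job u "))  # roughly: ' thanks  good job you '
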
        string = string.replace('I ve ',
                                'I have ').replace('I m ', ' I am ').replace(
                                    'i m ', 'i am ')
        string = string.replace('Iam ', 'I am ').replace('iam ', 'i am ')
        string = string.replace('dont ', 'do not ').replace(
            'google.co.in ', ' google ').replace(' hve ', ' have ')
        string = string.replace('Ain\'t ',
                                ' are not ').replace(' lv ', ' love ')
        string = string.replace(' ok~~ay~~ ', ' okay ').replace(
            ' Its ', ' It is ').replace(' its ', ' it is ')
        string = string.replace('  Nd  ',
                                ' and ').replace(' nd ', ' and ').replace(
                                    'i ll ', 'i will ')
        string = string.replace(" I'd ", ' i would ').replace('&apos;', "'")
        string = ' ' + string.lower()
        for item in LOGOGRAM.keys():
            string = string.replace(' ' + item + ' ',
                                    ' ' + LOGOGRAM[item].lower() + ' ')

        list_str = ekphrasis_config(string)
        for index in range(len(list_str)):
            if list_str[index] in slang_map.keys():
                list_str[index] = slang_map[list_str[index]]
        string = ' '.join(list_str)

        list_str = string.split()
        for index in range(len(list_str)):
            if list_str[index] in EMOTICONS_TOKEN.keys():
                # print('kkkkkkkkk')
                # print(EMOTICONS_TOKEN[list_str[index]][1:len(EMOTICONS_TOKEN[list_str[index]]) - 1].lower())
                list_str[index] = EMOTICONS_TOKEN[