Example 1
def process_data():
    vocab_to_code, code_to_vocab = build_vocabulary()
    max_vocab_size = len(vocab_to_code)
    print('Final Vocab Size : ' + str(max_vocab_size))
    try:
        tokenized_dataset = []
        all_sentences = []
        for i, review in enumerate(read_binary(FORMATTED_FILE_NAME)):
            tokenized_aspect = []
            tokenized_sentences = []

            if i == 0:
                print(review)

            sentences = review[1]
            aspect_words = review[0]
            polarities = review[2]

            for aspect_word in aspect_words:
                tokenized_aspect.append(aspect_word)
                all_sentences.append([aspect_word])

            for sent in sentences:
                tokenized_sentence = []

                # Collapse repeated spaces in the sentence; they cause problems for ELMo.
                s = re.sub(' +', ' ', sent[0])

                tokens = NLP.tokenizer(s)
                for token in tokens:
                    tokenized_sentence.append(token.orth_)
                tokenized_sentences.append(tokenized_sentence)

                # all these sentences will be written to a separate txt file at the end of the process.
                all_sentences.append(tokenized_sentence)

            tokenized_review = [
                tokenized_aspect, tokenized_sentences, polarities
            ]

            # accumulate the tokenized review and checkpoint the whole dataset so far
            tokenized_dataset.append(tokenized_review)
            write_binary(tokenized_dataset, PROCESSED_FILE_NAME)
            print('dump at {}'.format(i))

        all_sentences = space_separated_token_string(all_sentences)
        save_sentences_to_text(all_sentences)
        # hack for elmo
        remove_duplicate_sentences()
    except KeyboardInterrupt:
        pass
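Note: read_binary and write_binary are used throughout these examples but are never shown. A minimal sketch, assuming they are thin pickle wrappers (the exact behaviour is an assumption), could look like this:

import pickle

def read_binary(filename):
    # Load a previously pickled object; a missing file raises IOError/FileNotFoundError,
    # which build_vocabulary() relies on to decide whether to rebuild its caches.
    with open(filename, 'rb') as f:
        return pickle.load(f)

def write_binary(data, filename):
    # Re-pickle the whole object on every call, overwriting the previous file.
    with open(filename, 'wb') as f:
        pickle.dump(data, f)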
Example 2
def build_vocabulary(lower=1, n=MAX_VOCAB_SIZE):
    """
    1. Get the word frequency distribution
    2. Sort words by frequency
    3. Build the vocabulary from the most frequent words
    4. Store the vocabulary in a file as <word, identifier> pairs

    :param lower: Identifiers below this value are reserved
    :param n: Maximum number of unique words to keep
    :return: Two dicts, mapping vocabulary words to identifiers and identifiers back to words
    """

    try:
        vocab_to_code = read_binary(VOCAB_TO_CODE_FILE)
        code_to_vocab = read_binary(CODE_TO_VOCAB_FILE)
        print('vocabulary loaded')
        return vocab_to_code, code_to_vocab
    except IOError:
        print('building vocabulary')
    freq = build_word_frequency_distribution()

    # sort words by frequency in descending order and keep the top n
    top_words = list(sorted(freq.items(), key=lambda x: -x[1]))[:n - lower + 1]
    # create optimum vocab size
    print('Vocab count : ' + str(len(top_words)))
    # global MAX_VOCAB_SIZE
    # global UNKNOWN
    max_vocab_size = len(top_words) + 2
    unknown = max_vocab_size - 1
    vocab_to_code = {}
    code_to_vocab = {}

    vocab_to_code['<UNK>'] = unknown
    code_to_vocab[unknown] = '<UNK>'
    vocab_to_code['<PAD>'] = PAD
    code_to_vocab[PAD] = '<PAD>'

    # indexes below `lower` are reserved for padding; the unknown token takes the highest index
    i = lower
    for w, freq in top_words:
        vocab_to_code[w] = i
        code_to_vocab[i] = w
        i += 1
    write_binary(vocab_to_code, VOCAB_TO_CODE_FILE)
    write_binary(code_to_vocab, CODE_TO_VOCAB_FILE)
    return vocab_to_code, code_to_vocab
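A possible way to use the two maps returned by build_vocabulary(), encoding an arbitrary token and decoding it back (the word 'pizza' is only an illustrative input):

vocab_to_code, code_to_vocab = build_vocabulary()
unknown = vocab_to_code['<UNK>']

code = vocab_to_code.get('pizza', unknown)  # encode, falling back to <UNK> for out-of-vocabulary words
word = code_to_vocab[code]                  # decode back to the surface form (or '<UNK>')
print(code, word)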
Example 3
def combine_processed_data():
    combined_dataset = []

    restaurant = read_binary(filename=PROCESSED_RESTAURANT_FILE_NAME)
    print('Restaurant-' + str(len(restaurant)))
    combined_dataset.extend(restaurant)
    print(len(combined_dataset))

    laptops = read_binary(filename=PROCESSED_LAPTOPS_FILE_NAME)
    print('Laptops-' + str(len(laptops)))
    combined_dataset.extend(laptops)
    print(len(combined_dataset))

    # organic = read_binary(filename = PROCESSED_ORGANIC_FILE_NAME)
    # print('Organic-' + str(len(organic)))
    # combined_dataset.extend(organic)
    # print(len(combined_dataset))

    write_binary(combined_dataset, OUTPUT_FILE_NAME)
Example 4
def build_word_frequency_distribution():
    """
    1. Extract tokens from the review text
    2. Calculate frequency of each token
    3. Create a freq dict and store it in a file

    :return: A dict of <token, freq>
    """
    try:
        freq_dist_f = read_binary(WORD_FREQ_FILE)
        print('frequency distribution loaded')
        return freq_dist_f
    except IOError:
        pass

    print('building frequency distribution')
    freq = defaultdict(int)
    if FILE_NAME == 'restaurant':
        for aspect_word in RESTAURANT_ASPECT_WORDS:
            freq[aspect_word] += 1
    elif FILE_NAME == 'laptops':
        for aspect_word in LAPTOPS_ASPECT_WORDS:
            freq[aspect_word] += 1

    files = [FORMATTED_FILE_NAME]
    if EMBEDDING_TYPE == 'fasttext':
        files.append(FORMATTED_FILE_NAME.replace('train', 'test'))
        files.append(FORMATTED_FILE_NAME.replace('train', 'val'))

    for file_path in files:
        print('building vocab from file - ' + file_path)
        for i, review in enumerate(read_binary(file_path)):
            sentences = review[1]

            for sent in sentences:
                tokens = NLP.tokenizer(sent[0])
                for token in tokens:
                    freq[token.orth_] += 1
                if i % 100 == 0:
                    write_binary(freq, WORD_FREQ_FILE)
                    print('dump at {}'.format(i))
            write_binary(freq, WORD_FREQ_FILE)
    return freq
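NLP is a module-level tokenizer that the examples never define. A plausible setup, assuming it is a spaCy pipeline of which only the tokenizer is used, might be:

import spacy

# Assumption: only NLP.tokenizer is needed, so a blank pipeline is enough;
# a pretrained model such as 'en_core_web_sm' would work as well.
NLP = spacy.blank('en')

tokens = NLP.tokenizer('The food was lousy - too sweet or too salty.')
print([token.orth_ for token in tokens])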
Example 5
def process_data():
    vocab_to_code, code_to_vocab = build_vocabulary()
    max_vocab_size = len(vocab_to_code)
    unknown = max_vocab_size - 1
    print('Final Vocab Size : ' + str(max_vocab_size))
    try:
        coded_dataset = []
        for i, review in enumerate(read_binary(FORMATTED_FILE_NAME)):
            coded_aspect = []
            coded_sentences = []

            if i == 0:
                print(review)

            sentences = review[1]
            aspect_words = review[0]
            polarities = review[2]

            for aspect_word in aspect_words:
                coded_aspect.append(vocab_to_code.get(aspect_word, unknown))

            for sent in sentences:
                coded_sentence = []
                tokens = NLP.tokenizer(sent[0])
                for token in tokens:
                    coded_sentence.append(
                        vocab_to_code.get(token.orth_, unknown))
                coded_sentences.append(coded_sentence)

            coded_review = [coded_aspect, coded_sentences, polarities]

            # accumulate the coded review and checkpoint the whole dataset so far
            coded_dataset.append(coded_review)
            write_binary(coded_dataset, PROCESSED_FILE_NAME)
            print('dump at {}'.format(i))

        datapoint = coded_dataset[0]
        print(datapoint)
        print(get_uncoded_data(code_to_vocab, datapoint))
    except KeyboardInterrupt:
        pass
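get_uncoded_data() is used above only as a sanity check and is not shown; a minimal sketch, assuming it simply inverts the coding of a single datapoint, could be:

def get_uncoded_data(code_to_vocab, datapoint):
    # Hypothetical inverse of the coding step: map codes back to words so a coded
    # datapoint can be compared with the original review by eye.
    coded_aspect, coded_sentences, polarities = datapoint
    aspect_words = [code_to_vocab[c] for c in coded_aspect]
    sentences = [[code_to_vocab[c] for c in sentence] for sentence in coded_sentences]
    return [aspect_words, sentences, polarities]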
Example 6
def process_data():
    vocab_to_code, code_to_vocab = build_vocabulary()
    vocab_size = len(vocab_to_code)
    unknown = vocab_size - 1
    print('Final Vocab Size : ' + str(vocab_size))
    coded_dataset = []
    for i, review in enumerate(read_binary(FORMATTED_FILE_NAME)):
        coded_aspect = []
        coded_text = []

        if i == 0:
            print(review)

        text = review[1]
        aspect_words = review[0]
        polarity = review[2]

        for aspect_word in aspect_words:
            a = vocab_to_code.get(aspect_word, unknown)
            if a == unknown:
                print('STOP')
                print(aspect_word)
            coded_aspect.append(a)

        for word in text:
            word_code = vocab_to_code.get(word, unknown)
            coded_text.append(word_code)

        coded_review = [coded_aspect, [coded_text], [polarity]]
        coded_dataset.append(coded_review)
        write_binary(coded_dataset, PROCESSED_FILE_NAME)
        print('dump at {}'.format(i))

    datapoint = coded_dataset[0]
    print(datapoint)
    print(get_uncoded_data(code_to_vocab, datapoint))
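The <PAD> code reserved in build_vocabulary() is presumably used when batching the coded sentences; a sketch of right-padding to a fixed length, assuming the PAD constant is 0 as suggested by the reserved index above, might look like:

def pad_coded_sentence(coded_sentence, max_len, pad_code=0):
    # Assumption: PAD == 0, matching the index reserved in build_vocabulary().
    # Truncate sentences longer than max_len and right-pad shorter ones with the PAD code.
    return (coded_sentence + [pad_code] * max_len)[:max_len]

print(pad_coded_sentence([12, 7, 45], max_len=6))  # -> [12, 7, 45, 0, 0, 0]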
Example 7
def make_flat_data():
    """
    [
    [[aspect1], [review1], [polarity]],
    [[aspect2], [review1], [polarity]]
    ]

    [['food', 'quality'], [[['Judging from previous posts this used to be a good place, but not any longer.'], [0, 0, 0, 1]],
                      [['We, there were four of us, arrived at noon - the place was empty - and the staff acted like we
                        were imposing on them and they were very rude.'], [0, 0, 0, 1]],
                      [['They never brought us complimentary noodles, ignored repeated requests for sugar,
                        and threw our dishes on the table.'], [0, 0, 0, 1]],
                      [['The food was lousy - too sweet or too salty and the portions tiny.'], [0, 1, 0, 0]],
                      [['After all that, they complained to me about the small tip.'], [0, 0, 0, 1]],
                      [['Avoid this place!'], [0, 0, 0, 1]]
                      ]
    ]

    This method reads data from the original XML file and formats it as shown above. If N is the number of
    possible aspects in this dataset, each review is repeated (augmented) N times, once per aspect. A review can
    consist of any number of sentences, and each sentence in a review gets a label. Labels represent the sentiment
    polarity of a sentence with respect to an aspect, or its non-applicability. For instance, in the example above
    the labels are generated for the aspect food#quality. Sentences that do not talk about this particular aspect,
    or about any of the possible aspects, are labeled as N/A in this datapoint. For instance, the last sentence
    "Avoid this place!" is marked as N/A here, although the same sentence is labelled as NEGATIVE in another
    datapoint of the same review, for the aspect restaurant#general.
    :return:
    """

    possible_categories = [
        'allgemein', 'atmosphäre', 'connectivity', 'design',
        'gastronomisches_angebot', 'informationen', 'db_app_und_website',
        'service_und_kundenbetreuung', 'komfort_und_ausstattung', 'gepäck',
        'auslastung_und_platzangebot', 'ticketkauf', 'toiletten', 'zugfahrt',
        'reisen_mit_kindern', 'image', 'qr-code', 'barrierefreiheit',
        'sicherheit', 'sonstige_unregelmässigkeiten'
    ]

    global TOTAL_REVIEW_COUNT
    global TOTAL_AUGMENTED_REVIEW_COUNT
    global TOTAL_POSITIVE_LABEL_COUNT
    global TOTAL_NEGATIVE_LABEL_COUNT
    global TOTAL_NEUTRAL_LABEL_COUNT
    global TOTAL_NOT_APPLICABLE_LABEL_COUNT
    global INCLUDE_NOT_APPLICABLE
    global INCLUDE_PERCENTAGE

    doc = read_xml(INPUT_FILE_PATH)
    dataset = []
    for i, review in enumerate(doc['Documents']['Document']):
        TOTAL_REVIEW_COUNT += 1
        print('document-' + str(i))

        tokenized_review_text = []
        category_polarity_map = {}
        text = review['text']
        tokens = NLP(text)

        if 'Opinions' in review.keys():
            opinions = review['Opinions']['Opinion']
            if isinstance(opinions, dict):
                opinions = [opinions]
            for opinion in opinions:
                category = opinion['@category'].lower().split('#')[0]
                update_aspect_to_text_frequency(category)
                polarity = get_categorical_sentiment(opinion['@polarity'])
                category_polarity_map[category] = polarity

        for token in tokens:
            tokenized_review_text.append(token.text)

        if INCLUDE_NOT_APPLICABLE:
            for possible_category in possible_categories:
                sentiment = category_polarity_map.get(possible_category, None)
                if sentiment is None:
                    ran = random.random()
                    if ran <= INCLUDE_PERCENTAGE:
                        sentiment = 3
                        TOTAL_NOT_APPLICABLE_LABEL_COUNT += 1
                    else:
                        continue
                category_tokens = possible_category.split('_')
                if 'und' in category_tokens:
                    category_tokens.remove('und')
                datapoint = [category_tokens, tokenized_review_text, sentiment]
                # print(datapoint)
                dataset.append(datapoint)
                TOTAL_AUGMENTED_REVIEW_COUNT += 1
        else:
            for category, polarity in category_polarity_map.items():
                category_tokens = category.split('_')
                if 'und' in category_tokens:
                    category_tokens.remove('und')
                datapoint = [category_tokens, tokenized_review_text, polarity]
                print(datapoint)
                dataset.append(datapoint)
        print('---------')
    print(dataset[0])
    print(len(dataset))
    write_binary(dataset, filename=OUTPUT_FILE_NAME)
    print('TOTAL_REVIEW_COUNT: ', TOTAL_REVIEW_COUNT)
    print('TOTAL_AUGMENTED_REVIEW_COUNT: ', TOTAL_AUGMENTED_REVIEW_COUNT)
    print('TOTAL_POSITIVE_LABEL_COUNT: ', TOTAL_POSITIVE_LABEL_COUNT)
    print('TOTAL_NEGATIVE_LABEL_COUNT: ', TOTAL_NEGATIVE_LABEL_COUNT)
    print('TOTAL_NEUTRAL_LABEL_COUNT: ', TOTAL_NEUTRAL_LABEL_COUNT)
    print('TOTAL_NOT_APPLICABLE_LABEL_COUNT: ',
          TOTAL_NOT_APPLICABLE_LABEL_COUNT)
    total_label_count = TOTAL_POSITIVE_LABEL_COUNT + TOTAL_NEGATIVE_LABEL_COUNT + TOTAL_NEUTRAL_LABEL_COUNT + TOTAL_NOT_APPLICABLE_LABEL_COUNT
    print('TOTAL_LABELS: ', total_label_count)
    print('CLASS 0: ', (TOTAL_POSITIVE_LABEL_COUNT / total_label_count) * 100)
    print('CLASS 1: ', (TOTAL_NEGATIVE_LABEL_COUNT / total_label_count) * 100)
    print('CLASS 2: ', (TOTAL_NEUTRAL_LABEL_COUNT / total_label_count) * 100)
    print('CLASS 3: ',
          (TOTAL_NOT_APPLICABLE_LABEL_COUNT / total_label_count) * 100)
    print('ASPECT_TO_TEXT_FREQUENCY:')
    for k, v in ASPECT_TO_TEXT_FREQUENCY.items():
        print(k + ": " + str(v))
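get_categorical_sentiment() is called in every formatting function but is not shown. Judging from the CLASS 0..3 statistics printed above, a plausible sketch is the mapping below (the exact mapping is an assumption; the original presumably also increments the TOTAL_*_LABEL_COUNT globals here, since only the N/A counter is updated in the code shown):

def get_categorical_sentiment(polarity):
    # Assumed mapping, inferred from the class statistics printed above:
    # 0 = positive, 1 = negative, 2 = neutral, 3 = not applicable.
    mapping = {'positive': 0, 'negative': 1, 'neutral': 2}
    return mapping.get(polarity.lower(), 3)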
Example 8
def build_vocabulary(lower=1, n=MAX_VOCAB_SIZE):
    """
    1. Get the word frequency distribution
    2. Sort words by frequency
    3. Build the vocabulary from the most frequent words
    4. Store the vocabulary in a file as <word, identifier> pairs

    :param lower: Identifiers below this value are reserved
    :param n: Maximum number of unique words to keep
    :return: Two dicts, mapping vocabulary words to identifiers and identifiers back to words
    """

    try:
        vocab_to_code = read_binary(VOCAB_TO_CODE_FILE)
        code_to_vocab = read_binary(CODE_TO_VOCAB_FILE)
        print('vocabulary loaded')
        return vocab_to_code, code_to_vocab
    except IOError:
        print('building vocabulary')
    freq = build_word_frequency_distribution()

    # get glove embeddings
    print('loading embeddings')
    if EMBEDDING_TYPE == 'glove':
        word_to_embeddings = load_glove_embeddings()
    elif EMBEDDING_TYPE == 'fasttext':
        word_to_embeddings = load_oov_fastText_embeddings()
    else:
        word_to_embeddings = {}

    # sort words by frequency in descending order and keep the top n
    top_words = list(sorted(freq.items(), key=lambda x: -x[1]))[:n - lower + 1]
    # create optimum vocab size
    print('Vocab count : ' + str(len(top_words)))
    # global MAX_VOCAB_SIZE
    # global UNKNOWN
    max_vocab_size = len(top_words) + 2
    unknown = max_vocab_size - 1
    vocab_to_code = {}
    code_to_vocab = {}

    # An array of embeddings indexed by vocab code. The first and last indexes are
    # reserved for padding and unknown words, respectively.
    code_to_embed = np.zeros(shape=(max_vocab_size, EMBEDDING_DIMENSION),
                             dtype=np.float32)
    code_to_embed[PAD] = PAD_EMBEDDING
    code_to_embed[unknown] = UNKNOWN_EMBEDDING
    vocab_to_code['<UNK>'] = unknown
    code_to_vocab[unknown] = '<UNK>'
    vocab_to_code['<PAD>'] = PAD
    code_to_vocab[PAD] = '<PAD>'

    # indexes below `lower` are reserved for padding; the unknown token takes the highest index
    i = lower
    for w, freq in top_words:
        vocab_to_code[w] = i
        code_to_vocab[i] = w
        try:
            if EMBEDDING_TYPE == 'glove':
                embedding = word_to_embeddings.word_vec(w)
            elif EMBEDDING_TYPE == 'fasttext':
                embedding = word_to_embeddings.get_word_vector(w)
        except KeyError:
            embedding = UNKNOWN_EMBEDDING
        code_to_embed[i] = embedding
        i += 1
    write_binary(vocab_to_code, VOCAB_TO_CODE_FILE)
    write_binary(code_to_vocab, CODE_TO_VOCAB_FILE)
    write_binary(code_to_embed, CODE_TO_EMBED_FILE)
    return vocab_to_code, code_to_vocab
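One plausible downstream use of the stored embedding matrix: since code_to_embed is indexed by vocab code, a coded sentence can be turned into its embeddings by plain row indexing, or the matrix can initialize a trainable embedding layer. A small sketch, reusing the assumed read_binary helper and illustrative codes:

import numpy as np

code_to_embed = read_binary(CODE_TO_EMBED_FILE)
coded_sentence = [12, 7, 45]                        # illustrative vocab codes
embedded = code_to_embed[np.array(coded_sentence)]  # shape: (3, EMBEDDING_DIMENSION)
print(embedded.shape)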
Example 9
def make_flatten_restaurant_data_sentence_level(reviews, mode='train'):
    """
    [
    [[aspect1], [review1], [polarity]],
    [[aspect2], [review1], [polarity]]
    ]

    [['food', 'quality'], [[['Judging from previous posts this used to be a good place, but not any longer.'], [0, 0, 0, 1]],
                      [['We, there were four of us, arrived at noon - the place was empty - and the staff acted like we
                        were imposing on them and they were very rude.'], [0, 0, 0, 1]],
                      [['They never brought us complimentary noodles, ignored repeated requests for sugar,
                        and threw our dishes on the table.'], [0, 0, 0, 1]],
                      [['The food was lousy - too sweet or too salty and the portions tiny.'], [0, 1, 0, 0]],
                      [['After all that, they complained to me about the small tip.'], [0, 0, 0, 1]],
                      [['Avoid this place!'], [0, 0, 0, 1]]
                      ]
    ]

    This method reads data from the original XML file and formats it as shown above. If N is the number of
    possible aspects in this dataset, each review is repeated (augmented) N times, once per aspect. A review can
    consist of any number of sentences, and each sentence in a review gets a label. Labels represent the sentiment
    polarity of a sentence with respect to an aspect, or its non-applicability. For instance, in the example above
    the labels are generated for the aspect food#quality. Sentences that do not talk about this particular aspect,
    or about any of the possible aspects, are labeled as N/A in this datapoint. For instance, the last sentence
    "Avoid this place!" is marked as N/A here, although the same sentence is labelled as NEGATIVE in another
    datapoint of the same review, for the aspect restaurant#general.
    :return:
    """

    restaurant_possible_aspects = [
        'restaurant#general', 'restaurant#prices', 'restaurant#miscellaneous',
        'food#prices', 'food#quality', 'food#style_options', 'drinks#prices',
        'drinks#quality', 'drinks#style_options', 'ambience#general',
        'service#general', 'location#general'
    ]

    # There are 22 entities and 9 attributes, i.e. 198 possible aspects in total, but only
    # 81 of them appear in the training data. We selected 116 aspects based on our
    # understanding of which entity-attribute pairs make sense.
    laptops_possible_aspects = [
        'laptop#general', 'laptop#price', 'laptop#quality',
        'laptop#operation_performance', 'laptop#usability',
        'laptop#design_features', 'laptop#portability', 'laptop#connectivity',
        'laptop#miscellaneous', 'display#general', 'display#quality',
        'display#operation_performance', 'display#usability',
        'display#design_features', 'display#portability',
        'display#miscellaneous', 'cpu#general', 'cpu#price', 'cpu#quality',
        'cpu#operation_performance', 'cpu#design_features',
        'cpu#miscellaneous', 'motherboard#general', 'motherboard#price',
        'motherboard#quality', 'motherboard#design_features',
        'motherboard#miscellaneous', 'hard_disc#general', 'hard_disc#price',
        'hard_disc#quality', 'hard_disc#operation_performance',
        'hard_disc#design_features', 'hard_disc#miscellaneous',
        'memory#general', 'memory#price', 'memory#design_features',
        'memory#miscellaneous', 'battery#general', 'battery#quality',
        'battery#operation_performance', 'battery#design_features',
        'battery#miscellaneous', 'power_supply#general', 'power_supply#price',
        'power_supply#quality', 'power_supply#operation_performance',
        'power_supply#design_features', 'power_supply#miscellaneous',
        'keyboard#general', 'keyboard#quality',
        'keyboard#operation_performance', 'keyboard#usability',
        'keyboard#design_features', 'keyboard#miscellaneous', 'mouse#general',
        'mouse#quality', 'mouse#operation_performance', 'mouse#usability',
        'mouse#design_features', 'mouse#miscellaneous', 'fans_cooling#general',
        'fans_cooling#quality', 'fans_cooling#operation_performance',
        'fans_cooling#design_features', 'fans_cooling#miscellaneous',
        'optical_drives#general', 'optical_drives#quality',
        'optical_drives#operation_performance',
        'optical_drives#design_features', 'optical_drives#miscellaneous',
        'ports#general', 'ports#quality', 'ports#operation_performance',
        'ports#design_features', 'ports#miscellaneous', 'graphics#general',
        'graphics#quality', 'graphics#design_features',
        'graphics#miscellaneous', 'multimedia_devices#general',
        'multimedia_devices#quality',
        'multimedia_devices#operation_performance',
        'multimedia_devices#usability', 'multimedia_devices#design_features',
        'multimedia_devices#miscellaneous', 'hardware#general',
        'hardware#quality', 'hardware#operation_performance',
        'hardware#usability', 'hardware#design_features',
        'hardware#miscellaneous', 'os#general', 'os#quality',
        'os#operation_performance', 'os#usability', 'os#design_features',
        'os#miscellaneous', 'software#general', 'software#price',
        'software#quality', 'software#operation_performance',
        'software#usability', 'software#design_features',
        'software#miscellaneous', 'warranty#general', 'warranty#price',
        'warranty#miscellaneous', 'shipping#general', 'shipping#price',
        'shipping#quality', 'shipping#miscellaneous', 'support#general',
        'support#price', 'support#quality', 'support#miscellaneous',
        'company#general'
    ]

    global TOTAL_SENTENCE_COUNT
    global TOTAL_REVIEW_COUNT
    global TOTAL_AUGMENTED_REVIEW_COUNT
    global TOTAL_POSITIVE_LABEL_COUNT
    global TOTAL_NEGATIVE_LABEL_COUNT
    global TOTAL_NEUTRAL_LABEL_COUNT
    global TOTAL_NOT_APPLICABLE_LABEL_COUNT
    global ASPECT_TO_SENTENCE_FREQUENCY
    global DATA_TYPE

    if DATA_TYPE == 'restaurant':
        possible_aspects = restaurant_possible_aspects
    elif DATA_TYPE == 'laptops':
        possible_aspects = laptops_possible_aspects

    TOTAL_SENTENCE_COUNT = 0
    TOTAL_REVIEW_COUNT = 0
    TOTAL_AUGMENTED_REVIEW_COUNT = 0
    TOTAL_POSITIVE_LABEL_COUNT = 0
    TOTAL_NEGATIVE_LABEL_COUNT = 0
    TOTAL_NEUTRAL_LABEL_COUNT = 0
    TOTAL_NOT_APPLICABLE_LABEL_COUNT = 0
    ASPECT_TO_SENTENCE_FREQUENCY = {}

    dataset = []
    for i, review in enumerate(reviews):
        TOTAL_REVIEW_COUNT += 1
        print('review-' + str(i))

        review_text = []
        aspect_sentence_polarity_map = {}
        sentences = review['sentences']['sentence']
        if isinstance(sentences, dict):
            sentences = [sentences]
        for j, sentence in enumerate(sentences):
            TOTAL_SENTENCE_COUNT += 1
            sentence_text = []
            sentence_text.append(sentence['text'])
            if 'Opinions' in sentence.keys():
                opinions = sentence['Opinions']['Opinion']
                if isinstance(opinions, dict):
                    opinions = [opinions]

                for opinion in opinions:
                    aspect_category = opinion['@category'].lower()
                    update_aspect_to_sentence_frequency(aspect_category)
                    polarity = get_categorical_sentiment(opinion['@polarity'])

                    # Build a map from aspect category to the sentences of the current review
                    # that mention it, together with each sentence's polarity.
                    sentence_polarity = aspect_sentence_polarity_map.get(
                        aspect_category, [])
                    sentence_polarity.append([j, polarity])
                    aspect_sentence_polarity_map[
                        aspect_category] = sentence_polarity
            # else:
            #     # no aspect, contains no sentiment, either out of domain or just some fact
            #     sentence_polarity = aspect_sentence_polarity_map.get('relevance', [])
            #     sentence_polarity.append([j, 3])
            #     aspect_sentence_polarity_map['relevance'] = sentence_polarity

            review_text.append(sentence_text)

        # It could be that a particular review has no sentence for some aspects. Here we are just adding an empty
        # sentence list for such aspects.
        if not REDUCED:
            for aspect in possible_aspects:
                if aspect not in aspect_sentence_polarity_map.keys():
                    aspect_sentence_polarity_map[aspect] = []

        # Now for every possible aspect we will create a datapoint using this particular review.
        for a, sent_polarities in aspect_sentence_polarity_map.items():
            TOTAL_AUGMENTED_REVIEW_COUNT += 1
            aspect_words = []
            aspects = a.split('#')
            aspect_words.extend(aspects[0].split('_'))
            if len(aspects) > 1:
                aspect_words.extend(aspects[1].split('_'))
            augmented_review = []
            augmented_polarity = []
            # Check which sentences of the current review relate to aspect 'a' and carry a polarity.
            # Iterate over every sentence and look it up in the aspect's map: if present, keep its
            # sentiment polarity; otherwise mark it as N/A (3).
            for j, s in enumerate(review_text):
                updated_polarity = 3
                for sent_polarity in sent_polarities:
                    if j == sent_polarity[0]:
                        # sentence j contains current aspect
                        updated_polarity = sent_polarity[1]
                        break
                if updated_polarity == 3:
                    TOTAL_NOT_APPLICABLE_LABEL_COUNT += 1
                augmented_polarity.append(updated_polarity)
                augmented_review.append(s)
            augmented_datapoint = [
                aspect_words, augmented_review, augmented_polarity
            ]
            dataset.append(augmented_datapoint)

            if OVERSAMPLING:
                oversampled_datapoints = oversampling(augmented_datapoint)
                if oversampled_datapoints is not None:
                    for oversampled_datapoint in oversampled_datapoints:
                        TOTAL_NEUTRAL_LABEL_COUNT += 1
                        TOTAL_AUGMENTED_REVIEW_COUNT += 1
                        dataset.append(oversampled_datapoint)

        print('---------')
    for idx in range(13):
        print(dataset[idx])
    print(len(dataset))
    output_file_name = 'formatted_' + DATA_TYPE + '_' + mode + '.pickle'
    write_binary(dataset, filename=output_file_name)
    print('---', mode, '---')
    print('TOTAL_REVIEW_COUNT: ', TOTAL_REVIEW_COUNT)
    print('TOTAL_SENTENCE_COUNT: ', TOTAL_SENTENCE_COUNT)
    print('TOTAL_AUGMENTED_REVIEW_COUNT: ', TOTAL_AUGMENTED_REVIEW_COUNT)
    print('TOTAL_POSITIVE_LABEL_COUNT: ', TOTAL_POSITIVE_LABEL_COUNT)
    print('TOTAL_NEGATIVE_LABEL_COUNT: ', TOTAL_NEGATIVE_LABEL_COUNT)
    print('TOTAL_NEUTRAL_LABEL_COUNT: ', TOTAL_NEUTRAL_LABEL_COUNT)
    print('TOTAL_NOT_APPLICABLE_LABEL_COUNT: ',
          TOTAL_NOT_APPLICABLE_LABEL_COUNT)
    total_label_count = TOTAL_POSITIVE_LABEL_COUNT + TOTAL_NEGATIVE_LABEL_COUNT + TOTAL_NEUTRAL_LABEL_COUNT + TOTAL_NOT_APPLICABLE_LABEL_COUNT
    print('TOTAL_LABELS: ', total_label_count)
    print('CLASS 0: ', (TOTAL_POSITIVE_LABEL_COUNT / total_label_count) * 100)
    print('CLASS 1: ', (TOTAL_NEGATIVE_LABEL_COUNT / total_label_count) * 100)
    print('CLASS 2: ', (TOTAL_NEUTRAL_LABEL_COUNT / total_label_count) * 100)
    print('CLASS 3: ',
          (TOTAL_NOT_APPLICABLE_LABEL_COUNT / total_label_count) * 100)
    print('ASPECT_TO_SENTENCE_FREQUENCY:')
    for k, v in ASPECT_TO_SENTENCE_FREQUENCY.items():
        print(k + ": " + str(v))
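update_aspect_to_sentence_frequency() keeps the per-aspect statistics printed at the end of the run; a minimal sketch of what it likely does (a plain counter over the global dict) is:

def update_aspect_to_sentence_frequency(aspect_category):
    # Count how many sentences mention each aspect category; the result is
    # printed as ASPECT_TO_SENTENCE_FREQUENCY at the end of the run.
    global ASPECT_TO_SENTENCE_FREQUENCY
    ASPECT_TO_SENTENCE_FREQUENCY[aspect_category] = \
        ASPECT_TO_SENTENCE_FREQUENCY.get(aspect_category, 0) + 1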
Example 10
def make_flatten_restaurant_data_sentence_level(reviews, mode='train'):
    # There are 9 entities and 14 attributes, i.e. 126 possible aspects in total, but only
    # 111 of them appear in the training data. We selected 111 aspects based on our
    # understanding of which entity-attribute pairs make sense.
    organic_possible_aspects = [
        'organic_general#general', 'organic_general#price',
        'organic_general#taste',
        'organic_general#nutritional_quality_freshness_appearance',
        'organic_general#safety', 'organic_general#healthiness',
        'organic_general#chemicals_pesticides', 'organic_general#label',
        'organic_general#origin_source', 'organic_general#local',
        'organic_general#availability', 'organic_general#environment',
        'organic_general#animal_welfare', 'organic_general#productivity',
        'organic_products#general', 'organic_products#price',
        'organic_products#taste',
        'organic_products#nutritional_quality_freshness_appearance',
        'organic_products#safety', 'organic_products#healthiness',
        'organic_products#chemicals_pesticides', 'organic_products#label',
        'organic_products#origin_source', 'organic_products#local',
        'organic_products#availability', 'organic_products#environment',
        'organic_products#animal_welfare', 'organic_products#productivity',
        'organic_farming#general', 'organic_farming#price',
        'organic_farming#taste',
        'organic_farming#nutritional_quality_freshness_appearance',
        'organic_farming#safety', 'organic_farming#healthiness',
        'organic_farming#chemicals_pesticides', 'organic_farming#label',
        'organic_farming#origin_source', 'organic_farming#local',
        'organic_farming#availability', 'organic_farming#environment',
        'organic_farming#animal_welfare', 'organic_farming#productivity',
        'organic_companies#general', 'organic_companies#price',
        'organic_companies#taste',
        'organic_companies#nutritional_quality_freshness_appearance',
        'organic_companies#safety', 'organic_companies#healthiness',
        'organic_companies#chemicals_pesticides', 'organic_companies#label',
        'organic_companies#origin_source', 'organic_companies#local',
        'organic_companies#availability', 'organic_companies#environment',
        'organic_companies#animal_welfare', 'organic_companies#productivity',
        'conventional_general#general', 'conventional_general#price',
        'conventional_general#nutritional_quality_freshness_appearance',
        'conventional_general#safety', 'conventional_general#healthiness',
        'conventional_general#chemicals_pesticides',
        'conventional_general#label', 'conventional_general#origin_source',
        'conventional_general#productivity', 'conventional_products#general',
        'conventional_products#price', 'conventional_products#taste',
        'conventional_products#nutritional_quality_freshness_appearance',
        'conventional_products#safety', 'conventional_products#healthiness',
        'conventional_products#chemicals_pesticides',
        'conventional_products#label', 'conventional_products#origin_source',
        'conventional_products#local', 'conventional_products#availability',
        'conventional_products#environment',
        'conventional_products#animal_welfare',
        'conventional_products#productivity', 'conventional_farming#general',
        'conventional_farming#price', 'conventional_farming#taste',
        'conventional_farming#nutritional_quality_freshness_appearance',
        'conventional_farming#safety', 'conventional_farming#healthiness',
        'conventional_farming#chemicals_pesticides',
        'conventional_farming#label', 'conventional_farming#origin_source',
        'conventional_farming#environment',
        'conventional_farming#animal_welfare',
        'conventional_farming#productivity', 'conventional_companies#general',
        'conventional_companies#taste', 'conventional_companies#safety',
        'conventional_companies#chemicals_pesticides',
        'conventional_companies#label', 'conventional_companies#availability',
        'conventional_companies#environment',
        'conventional_companies#animal_welfare',
        'conventional_companies#productivity',
        'gmo_genetic_engineering#general', 'gmo_genetic_engineering#price',
        'gmo_genetic_engineering#taste',
        'gmo_genetic_engineering#nutritional_quality_freshness_appearance',
        'gmo_genetic_engineering#safety',
        'gmo_genetic_engineering#healthiness',
        'gmo_genetic_engineering#chemicals_pesticides',
        'gmo_genetic_engineering#label',
        'gmo_genetic_engineering#origin_source',
        'gmo_genetic_engineering#environment',
        'gmo_genetic_engineering#productivity'
    ]

    reduced_organic_possible_aspects = [
        'organic#general', 'organic#price', 'organic#quality',
        'organic#safety_healthiness', 'organic#trustworthy_sources',
        'organic#environment', 'conventional#general', 'conventional#price',
        'conventional#quality', 'conventional#safety_healthiness',
        'conventional#trustworthy_sources', 'conventional#environment',
        'gmo_genetic_engineering#general', 'gmo_genetic_engineering#price',
        'gmo_genetic_engineering#quality',
        'gmo_genetic_engineering#safety_healthiness',
        'gmo_genetic_engineering#trustworthy_sources',
        'gmo_genetic_engineering#environment'
    ]

    global TOTAL_SENTENCE_COUNT
    global TOTAL_REVIEW_COUNT
    global TOTAL_AUGMENTED_REVIEW_COUNT
    global TOTAL_POSITIVE_LABEL_COUNT
    global TOTAL_NEGATIVE_LABEL_COUNT
    global TOTAL_NEUTRAL_LABEL_COUNT
    global TOTAL_NOT_APPLICABLE_LABEL_COUNT
    global ASPECT_TO_SENTENCE_FREQUENCY
    global DATA_TYPE

    if DATA_TYPE == 'organic':
        possible_aspects = organic_possible_aspects
    elif DATA_TYPE == 'organic_reduced':
        possible_aspects = reduced_organic_possible_aspects

    TOTAL_SENTENCE_COUNT = 0
    TOTAL_REVIEW_COUNT = 0
    TOTAL_AUGMENTED_REVIEW_COUNT = 0
    TOTAL_POSITIVE_LABEL_COUNT = 0
    TOTAL_NEGATIVE_LABEL_COUNT = 0
    TOTAL_NEUTRAL_LABEL_COUNT = 0
    TOTAL_NOT_APPLICABLE_LABEL_COUNT = 0
    ASPECT_TO_SENTENCE_FREQUENCY = {}

    dataset = []
    for i, review in enumerate(reviews):
        TOTAL_REVIEW_COUNT += 1
        print('review-' + str(i))

        review_text = []
        aspect_sentence_polarity_map = {}
        sentences = review['sentences']['sentence']
        if isinstance(sentences, dict):
            sentences = [sentences]
        for j, sentence in enumerate(sentences):
            TOTAL_SENTENCE_COUNT += 1
            sentence_text = []
            sentence_text.append(sentence['text'])
            if 'Opinions' in sentence.keys():
                opinions = sentence['Opinions']['Opinion']
                if isinstance(opinions, dict):
                    opinions = [opinions]

                for opinion in opinions:
                    aspect_category = opinion['@category'].lower()
                    update_aspect_to_sentence_frequency(aspect_category)
                    polarity = get_categorical_sentiment(opinion['@polarity'])

                    # Build a map from aspect category to the sentences of the current review
                    # that mention it, together with each sentence's polarity.
                    sentence_polarity = aspect_sentence_polarity_map.get(
                        aspect_category, [])
                    sentence_polarity.append([j, polarity])
                    aspect_sentence_polarity_map[
                        aspect_category] = sentence_polarity
            # else:
            #     # no aspect, contains no sentiment, either out of domain or just some fact
            #     sentence_polarity = aspect_sentence_polarity_map.get('relevance', [])
            #     sentence_polarity.append([j, 3])
            #     aspect_sentence_polarity_map['relevance'] = sentence_polarity

            review_text.append(sentence_text)

        # It could be that a particular review has no sentence for some aspects. Here we are just adding an empty
        # sentence list for such aspects.
        if not REDUCED:
            for aspect in possible_aspects:
                if aspect not in aspect_sentence_polarity_map.keys():
                    aspect_sentence_polarity_map[aspect] = []

        # Now for every possible aspect we will create a datapoint using this particular review.
        for a, sent_polarities in aspect_sentence_polarity_map.items():
            TOTAL_AUGMENTED_REVIEW_COUNT += 1
            aspect_words = []
            aspects = a.split('#')
            aspect_words.extend(aspects[0].split('_'))
            if len(aspects) > 1:
                aspect_words.extend(aspects[1].split('_'))
            augmented_review = []
            augmented_polarity = []
            # Check which sentences of the current review relate to aspect 'a' and carry a polarity.
            # Iterate over every sentence and look it up in the aspect's map: if present, keep its
            # sentiment polarity; otherwise mark it as N/A (3).
            for j, s in enumerate(review_text):
                updated_polarity = 3
                for sent_polarity in sent_polarities:
                    if j == sent_polarity[0]:
                        # sentence j contains current aspect
                        updated_polarity = sent_polarity[1]
                        break
                if updated_polarity == 3:
                    TOTAL_NOT_APPLICABLE_LABEL_COUNT += 1
                augmented_polarity.append(updated_polarity)
                augmented_review.append(s)
            augmented_datapoint = [
                aspect_words, augmented_review, augmented_polarity
            ]
            dataset.append(augmented_datapoint)

            if OVERSAMPLING:
                oversampled_datapoints = oversampling(augmented_datapoint)
                if oversampled_datapoints is not None:
                    for oversampled_datapoint in oversampled_datapoints:
                        TOTAL_NEUTRAL_LABEL_COUNT += 1
                        TOTAL_AUGMENTED_REVIEW_COUNT += 1
                        dataset.append(oversampled_datapoint)

        print('---------')
    for idx in range(13):
        print(dataset[idx])
    print(len(dataset))
    output_file_name = 'formatted_' + DATA_TYPE + '_' + mode + '.pickle'
    write_binary(dataset, filename=output_file_name)
    print('---', mode, '---')
    print('TOTAL_REVIEW_COUNT: ', TOTAL_REVIEW_COUNT)
    print('TOTAL_SENTENCE_COUNT: ', TOTAL_SENTENCE_COUNT)
    print('TOTAL_AUGMENTED_REVIEW_COUNT: ', TOTAL_AUGMENTED_REVIEW_COUNT)
    print('TOTAL_POSITIVE_LABEL_COUNT: ', TOTAL_POSITIVE_LABEL_COUNT)
    print('TOTAL_NEGATIVE_LABEL_COUNT: ', TOTAL_NEGATIVE_LABEL_COUNT)
    print('TOTAL_NEUTRAL_LABEL_COUNT: ', TOTAL_NEUTRAL_LABEL_COUNT)
    print('TOTAL_NOT_APPLICABLE_LABEL_COUNT: ',
          TOTAL_NOT_APPLICABLE_LABEL_COUNT)
    total_label_count = TOTAL_POSITIVE_LABEL_COUNT + TOTAL_NEGATIVE_LABEL_COUNT + TOTAL_NEUTRAL_LABEL_COUNT + TOTAL_NOT_APPLICABLE_LABEL_COUNT
    print('TOTAL_LABELS: ', total_label_count)
    print('CLASS 0: ', (TOTAL_POSITIVE_LABEL_COUNT / total_label_count) * 100)
    print('CLASS 1: ', (TOTAL_NEGATIVE_LABEL_COUNT / total_label_count) * 100)
    print('CLASS 2: ', (TOTAL_NEUTRAL_LABEL_COUNT / total_label_count) * 100)
    print('CLASS 3: ',
          (TOTAL_NOT_APPLICABLE_LABEL_COUNT / total_label_count) * 100)
    print('ASPECT_TO_SENTENCE_FREQUENCY:')
    for k, v in ASPECT_TO_SENTENCE_FREQUENCY.items():
        print(k + ": " + str(v))
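oversampling() is referenced in both sentence-level functions but never shown. Given that every datapoint it returns increments TOTAL_NEUTRAL_LABEL_COUNT, a rough guess (purely an assumption about the original logic) is that it duplicates datapoints containing the under-represented neutral class:

def oversampling(datapoint, neutral_label=2, copies=1):
    # Rough guess at the helper: duplicate a datapoint if any of its sentences carries
    # a neutral label, to counter class imbalance; return None otherwise.
    polarities = datapoint[2]
    if neutral_label in polarities:
        return [datapoint] * copies
    return None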