Example 1
import os

import pandas as pd

import helpers


def load_most_common_word_ap_list(file, ap_list):
    """Load one most-common-word vocabulary per aspect category."""
    temp = {}
    for ap in ap_list:
        # each aspect category keeps its own word list in <file>/<ap>.txt
        path = file + '/' + ap + '.txt'
        vocab = set(helpers.load_doc(path).split())
        temp[ap] = sorted(vocab)
    return temp

embedding_file = '../data/glove.6B.100d.txt'
res_embedding_file = '../data/restaurant_emb.vec'
negative_words = '../data/negative-words.txt'
positive_words = '../data/positive-words.txt'
model_file_name = 'model_invidual_sentiment_ap_classifier'
model_folder = '../data/model'
# create the model output folder (and its parent) if it does not exist yet
os.makedirs(model_folder + '/' + model_file_name, exist_ok=True)

# train_csv, test_csv and sample_csv are tab-separated files defined earlier
# in the source script (Example 3 shows the train/test paths)
data_train = pd.read_csv(train_csv, sep='\t')
data_test = pd.read_csv(test_csv, sep='\t')
data_sample = pd.read_csv(sample_csv, sep='\t')

vocab = helpers.load_doc(vocab_file)
vocab = set(vocab.split())

vocab_positive = helpers.load_doc(positive_words)
vocab_positive = set(vocab_positive.split())

vocab_negative = helpers.load_doc(negative_words)
vocab_negative = set(vocab_negative.split())
# default list of aspect categories (entity#attribute pairs) for the restaurant domain
ap_list = [
    'FOOD#QUALITY', 'FOOD#PRICES', 'FOOD#STYLE_OPTIONS', 'RESTAURANT#GENERAL',
    'RESTAURANT#PRICES', 'RESTAURANT#MISCELLANEOUS', 'DRINKS#PRICES',
    'DRINKS#QUALITY', 'DRINKS#STYLE_OPTIONS', 'AMBIENCE#GENERAL',
    'SERVICE#GENERAL', 'LOCATION#GENERAL'
]
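
A minimal usage sketch of load_most_common_word_ap_list; the folder below is a hypothetical location for the per-aspect word-list files, not a path taken from the source:

most_common_dir = '../data/most_common_words'  # hypothetical folder of <ap>.txt files
ap_vocab = load_most_common_word_ap_list(most_common_dir, ap_list)
print('%d words for FOOD#QUALITY' % len(ap_vocab['FOOD#QUALITY']))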
Example 3
import numpy as np
import pandas as pd

import helpers


# The original snippet begins mid-function; the header and loop below are an
# assumed reconstruction of the standard pre-trained weight-matrix builder
# that the surviving last two lines imply.
def get_weight_matrix(raw_embedding, word_index, vocab_size, dim=100):
    embedding_matrix = np.zeros((vocab_size, dim))
    for word, i in word_index.items():
        embedding_vector = raw_embedding.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix
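
The raw_embedding dict consumed above is typically parsed from the GloVe text file; a minimal sketch, assuming the usual one-vector-per-line format (load_embedding is a name introduced here, not taken from the source):

def load_embedding(filename):
    # parse 'word v1 v2 ...' lines into a {word: vector} dict
    embedding = {}
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            embedding[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return embedding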


# define data file paths
train_file = '../data/official_data/ABSA16_Restaurants_Train_SB1_v2.xml'
train_csv = '../data/official_data/data_train.csv'
test_file = '../data/official_data/EN_REST_SB1_TEST_gold.xml'
test_csv = '../data/official_data/data_test.csv'
vocab_file = '../data/vocab.txt'
embedding_file = '../data/glove.6B.100d.txt'

data_train = pd.read_csv(train_csv, sep='\t')
data_test = pd.read_csv(test_csv, sep='\t')

vocab = helpers.load_doc(vocab_file)
vocab = set(vocab.split())

# init
train_texts = process_texts(data_train.text, vocab)
test_texts = process_texts(data_test.text, vocab)

tokenizer = create_tokenizer(train_texts)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size: %d' % vocab_size)
max_length = max(len(s.split()) for s in train_texts)
print('Maximum length: %d' % max_length)

aspect_category_list = data_train.aspect_category.unique()
# X1: review texts encoded as padded word-index sequences (embedding-layer input)
X1_train = encode_X1(train_texts, tokenizer, max_length)
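
process_texts, create_tokenizer and encode_X1 are not shown in this snippet. A plausible sketch of the last two, assuming the Keras text utilities (the bodies are assumptions; only the names come from the source):

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer


def create_tokenizer(texts):
    # fit a word-index tokenizer on the training texts
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    return tokenizer


def encode_X1(texts, tokenizer, max_length):
    # map each text to word indices and pad to a fixed length
    encoded = tokenizer.texts_to_sequences(texts)
    return pad_sequences(encoded, maxlen=max_length)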
Example 4
import helpers


def load_most_common_word(file, ap_list):
    """Merge the most-common-word lists of all aspect categories into one set."""
    vocab = set()
    for ap in ap_list:
        # accumulate every aspect category's word list into a single vocabulary
        path = file + '/' + ap + '.txt'
        vocab.update(helpers.load_doc(path).split())
    return vocab
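
Unlike load_most_common_word_ap_list in Example 1, this variant flattens all per-aspect word lists into a single vocabulary. A usage sketch with the same hypothetical folder:

merged_vocab = load_most_common_word('../data/most_common_words', ap_list)
print('merged vocabulary: %d words' % len(merged_vocab))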