# imports assumed by this snippet (helpers such as load_word_vectors,
# createCategories2, read_xml2_train3 and vectorize come from the same repo
# and are not shown here)
import os
import numpy
from nltk.tokenize import casual_tokenize
from sklearn.preprocessing import LabelBinarizer


def data():
    # get current directory
    path = os.getcwd()
    # get one directory up
    path = os.path.dirname(path)

    WORD_VECTORS = "../embeddings/word2vec.txt"
    WORD_VECTORS_DIMS = 300

    TRAIN_DATA = path + "/datasets/ABSA16_Restaurants_Train_SB1_v2.xml"
    VAL_DATA = path + "/datasets/EN_REST_SB1_TEST.xml"
    max_length = 80
    # load word embeddings
    print("loading word embeddings...")
    word2idx, idx2word, embeddings = load_word_vectors(WORD_VECTORS,
                                                       WORD_VECTORS_DIMS)
    print("loading categories")
    entity_attribute_pairs = createCategories2()
    # load raw data
    print("loading datasets...")
    train_review, train_ent_attrib = \
        read_xml2_train3(entity_attribute_pairs, TRAIN_DATA)

    gold_review, gold_ent_attrib = \
        read_xml2_train3(entity_attribute_pairs, VAL_DATA)

    y_train = train_ent_attrib
    y_test = gold_ent_attrib
    print("Tokenizing...")
    # nltk tokenizer
    X_train = [
        casual_tokenize(x,
                        preserve_case=False,
                        reduce_len=True,
                        strip_handles=False) for x in train_review
    ]
    X_test = [
        casual_tokenize(x,
                        preserve_case=False,
                        reduce_len=True,
                        strip_handles=False) for x in gold_review
    ]
    print("Vectorizing...")
    X_train = numpy.array(
        [vectorize(x, word2idx, max_length) for x in X_train])
    X_test = numpy.array([vectorize(x, word2idx, max_length) for x in X_test])
    print("Turning test and train data to numpy arrays")
    X_train = numpy.array(X_train)
    y_train = numpy.array(y_train)
    X_test = numpy.array(X_test)
    y_test = numpy.array(y_test)
    label_encoder = LabelBinarizer()
    y_train_res = label_encoder.fit_transform(y_train)
    y_test = label_encoder.fit_transform(y_test)
    # Everything to numpy
    X_train = numpy.array(X_train)
    y_train = numpy.array(y_train_res)
    y_test = numpy.array(y_test)
    return embeddings, X_train, X_test, y_train, y_test, max_length
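A minimal usage sketch (an assumption, not part of the original example) showing how the returned tuple can be unpacked:

embeddings, X_train, X_test, y_train, y_test, max_length = data()
print("train:", X_train.shape, y_train.shape)
print("test:", X_test.shape, y_test.shape)
print("embedding matrix:", embeddings.shape)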


def create_aspect_emb():
    '''Create a vector used to initialize the aspect embeddings.
    Assumes numpy as np and the repo helpers load_word_vectors,
    createCategories2, get_words_for_each_category and tokenize.'''
    # get current directory
    path = os.getcwd()
    # get one directory up
    path = os.path.dirname(path)
    WORD_VECTORS = "../embeddings/glove.6B.300d.txt"
    WORD_VECTORS_DIMS = 300
    print("loading word embeddings...")
    word2idx, idx2word, embeddings = load_word_vectors(WORD_VECTORS,
                                                       WORD_VECTORS_DIMS)

    entity_attribute_pairs = createCategories2()
    TRAIN_DATA = path + "/datasets/ABSA16_Restaurants_Train_SB1_v2.xml"

    d = get_words_for_each_category(entity_attribute_pairs, TRAIN_DATA)

    # average the embeddings of the unique words collected for each of the
    # 12 entity#attribute categories
    for i in range(0, 12):
        cat = set(tokenize(d[i]))
        sum_emb = np.zeros(shape=(1, 300))
        word_count = 0
        for word in cat:
            if word in word2idx:
                sum_emb = sum_emb + embeddings[word2idx[word]]
                word_count += 1
        # guard against categories with no in-vocabulary words
        if word_count > 0:
            sum_emb = sum_emb / word_count
        # append this category's centroid as one line of space-separated floats
        with open(path + "/embeddings/aspect_embeddings.txt", "a") as the_file:
            for e in sum_emb:
                for num in e:
                    the_file.write(str(num) + ' ')
                the_file.write('\n')
    # note: only the last category's centroid is returned
    return sum_emb
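A minimal sketch (assumed, not from the original source; the helper name is hypothetical) of reading the aspect embeddings written above back into a numpy array:

import numpy as np

def load_aspect_embeddings(filename):
    # each non-empty line holds one 300-dim centroid as space-separated floats
    with open(filename) as f:
        rows = [[float(tok) for tok in line.split()] for line in f if line.strip()]
    return np.array(rows)

# aspect_emb = load_aspect_embeddings(path + "/embeddings/aspect_embeddings.txt")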
Example #3

# imports assumed by this snippet; the module paths for SentimentDataset and
# load_word_vectors follow Example #5 below, split_train_set is a project
# helper whose module is not shown, and seed, WORD_VECTORS, WORD_VECTORS_DIMS,
# TRAIN_DATA and BATCH_SIZE are defined earlier in the original file
import numpy as np
import torch
from torch.utils.data import DataLoader
from preprocess import SentimentDataset
from load_embeddings import load_word_vectors

# keep the same seed for random variables
np.random.seed(seed)
torch.manual_seed(seed)
# torch.backends.cudnn.enabled = False
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)

########################################################
# PREPARE FOR DATA
########################################################

# load word embeddings
print("loading word embeddings...")
word2idx, idx2word, embeddings = load_word_vectors(WORD_VECTORS,
                                                   WORD_VECTORS_DIMS)

tword2idx, idx2tword, topic_embeddings = load_word_vectors(WORD_VECTORS,
                                                           WORD_VECTORS_DIMS)

train_set = SentimentDataset(file=TRAIN_DATA, word2idx=word2idx, tword2idx=tword2idx,
                             max_length=0, max_topic_length=0, topic_bs=True)

print("Batching...")

train_sampler, validation_sampler = split_train_set(train_set, contiguous=True, split_rate=0.1)

loader_train = DataLoader(train_set, batch_size=BATCH_SIZE, sampler=train_sampler,
                          shuffle=False, num_workers=4)

# the original snippet is cut off mid-call; completed here to mirror loader_train
loader_val = DataLoader(train_set, batch_size=BATCH_SIZE, sampler=validation_sampler,
                        shuffle=False, num_workers=4)
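A quick sanity-check sketch (assumed; the exact batch layout depends on SentimentDataset) for iterating the resulting loaders:

for batch in loader_train:
    # the tuple layout is defined by SentimentDataset.__getitem__ in the original repo
    print(type(batch), len(batch))
    break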
Example #4

# assumes the same imports and repo helpers as the snippets above (os, numpy,
# load_word_vectors, createCategories2, tokenize, vectorize) plus
# create_polarities, read_xml_polarities and extract_setiment_scores
def data():
    # get current directory
    path = os.getcwd()
    # get one directory up
    path = os.path.dirname(path)

    WORD_VECTORS = "../embeddings/glove.6B.300d.txt"
    TRAIN_DATA = path + "/datasets/ABSA16_Restaurants_Train_SB1_v2.xml"
    VAL_DATA = path + "/ABSA/datasets/EN_REST_SB1_TEST.xml"
    BATCH_SIZE = 128
    EPOCHS = 50
    WORD_VECTORS_DIMS = 300
    MAX_LENGTH = 80
    max_length = 80
    _hparams = {
        "rnn_size": 100,
        "bidirectional": True,
        "noise": 0.2,
        "dropout_words": 0.2,
        "dropout_rnn": 0.5,
    }
    # load word embeddings
    print("loading word embeddings...")
    word2idx, idx2word, embeddings = load_word_vectors(WORD_VECTORS,
                                                       WORD_VECTORS_DIMS)
    print("loading categories")
    entity_attribute_pairs = createCategories2()
    polarity_labels = create_polarities()
    # load raw data
    print("loading datasets...")
    train_review, train_ent_attrib, train_polarity, train_aux = \
        read_xml_polarities(entity_attribute_pairs, polarity_labels, TRAIN_DATA)
    gold_review, gold_ent_attrib, gold_polarity, gold_aux = \
        read_xml_polarities(entity_attribute_pairs, polarity_labels, VAL_DATA)
    print("extracting sentiment from texts")
    sentiment_intensity_train = extract_setiment_scores(train_review)
    sentiment_intensity_test = extract_setiment_scores(gold_review)
    y_train = train_polarity
    y_test = gold_polarity
    print("Tokenizing...")
    X_train = [tokenize(x) for x in train_review]
    X_test = [tokenize(x) for x in gold_review]
    print("Vectorizing...")
    X_train = numpy.array(
        [vectorize(x, word2idx, MAX_LENGTH) for x in X_train])
    X_test = numpy.array([vectorize(x, word2idx, MAX_LENGTH) for x in X_test])
    print("Turning test and train data to numpy arrays")
    # train and test sentence
    X_train = numpy.array(X_train)
    X_test = numpy.array(X_test)
    # train and test E#A labels
    y_train = numpy.array(y_train)
    y_test = numpy.array(y_test)
    # auxiliary input - the aspect that is present in the sentence
    train_aux = numpy.array(train_aux)
    gold_aux = numpy.array(gold_aux)
    # handcrafted feature - the sentiment intensity of each sentence on a 3-way scale
    sentiment_intensity_train = numpy.array(sentiment_intensity_train)
    sentiment_intensity_test = numpy.array(sentiment_intensity_test)
    # entity and attribute pairs are given in this task and we can use them as extra features in each sentence
    train_ent_attrib = numpy.array(train_ent_attrib)
    gold_ent_attrib = numpy.array(gold_ent_attrib)
    classes = len(polarity_labels)
    return embeddings, classes, max_length, X_train, train_aux, sentiment_intensity_train, y_train, X_test, gold_aux, sentiment_intensity_test, y_test
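A minimal usage sketch (an assumption, not part of the original example) unpacking the eleven returned values:

(embeddings, classes, max_length,
 X_train, train_aux, sentiment_intensity_train, y_train,
 X_test, gold_aux, sentiment_intensity_test, y_test) = data()
print("polarity classes:", classes, "| max sentence length:", max_length)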
Example #5
from preprocess import SentimentDataset
from tools import set_logger, train_validation_split
from load_embeddings import load_word_vectors
from my_neural import CNNClassifier
import matplotlib.pyplot as plt
import copy
import pickle
import torch

torch.manual_seed(1)

BATCH_SIZE = 15
EPOCHS = 20
vec_size = 300
datasets = "laptop"
default_embed_path = "word_embeds/amazon%s.txt" % vec_size
default_train_path = "train_data/reviews_Electronics_5.json.gz"

word2idx, idx2word, embeddings = load_word_vectors(default_embed_path,
                                                   vec_size)
logging = set_logger("slot3.csv")
logging.debug(
    "Epoch,Train accuracy,train f1,train loss,test accuracy,test f1,test loss"
)
###############################################################################################

train_sentences, emotion_for_sentence, all_categories_train = [], [], []
quota = [0.4, 0.2, 0.4]
max_sentences = 50000
# MySentences is a project helper defined elsewhere in the original repo
sentences = MySentences(default_train_path, quota, max_sentences)
train_sentences, emotion_for_sentence, all_categories_train = \
    sentences.get_sentiment()
# load a previously serialized dictionary from disk
with open('dict.pickle', 'rb') as handle:
    unserialized_data = pickle.load(handle)
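For reference, a counterpart sketch (assumed; not in the original snippet) of how such a dictionary is serialized in the first place:

with open('dict.pickle', 'wb') as handle:
    pickle.dump(unserialized_data, handle, protocol=pickle.HIGHEST_PROTOCOL)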