Example #2

if __name__ == "__main__":
    if len(sys.argv) != 3:
        sys.stderr.write("Usage: %s TRAIN_SET DEV_SET\n" % sys.argv[0])
        sys.exit(1)

    config = DefaultConfig()

    # Read training and validation data.
    train_lexicon = read_lexicon(sys.argv[1])
    validation_lexicon = read_lexicon(sys.argv[2])

    # Convert word characters and part-of-speech labels to a numerical
    # representation.
    chars = Numberer()
    labels = Numberer()
    train_lexicon = recode_lexicon(train_lexicon, chars, labels, train=True)
    validation_lexicon = recode_lexicon(validation_lexicon, chars, labels)

    # Generate batches
    train_batches = generate_instances(train_lexicon,
                                       labels.max_number(),
                                       config.max_timesteps,
                                       batch_size=config.batch_size)
    validation_batches = generate_instances(validation_lexicon,
                                            labels.max_number(),
                                            config.max_timesteps,
                                            batch_size=config.batch_size)

    # Train the model
Example #3
        args = sys.argv[1:]
    else:
        option = sys.argv[1]
        w = option == "-write"
        r = option == "-read"
        args = sys.argv[2:]

    # Load embeddings
    with open(args[2], "rb") as embed_file:
        word_ids, embeddings = pickle.load(embed_file)

    # Load or pre-process data
    if r:
        with open(args[0], "rb") as train_file:
            train_data = pickle.load(train_file)
        with open(args[1], "rb") as valid_file:
            valid_data = pickle.load(valid_file)
    else:
        char_map = Numberer()
        label_map = Numberer()
        emolex = read_emolex(args[3])
        train_data = prepare_data(args[0], word_ids, emolex, char_map,
                                  label_map)
        valid_data = prepare_data(args[1], word_ids, emolex, char_map,
                                  label_map)
        if w:
            with open("traindata", "wb") as train_file:
                pickle.dump(train_data, train_file)
            with open("testdata", "wb") as test_file:
                pickle.dump(valid_data, test_file)

    # Get batches
    config = DefaultConfig()
    train_batches = generate_batches(*train_data, batch_size=config.batch_size)
Example #4
                         sys.argv[0])
        sys.exit(1)

    config = DefaultConfig()

    # Load the word embeddings and get the embedding matrix
    embeds = gensim.models.Word2Vec.load(sys.argv[1])
    embedding_matrix = embeds.wv.syn0

    # Read training and validation data.
    train_lexicon = read_lexicon(sys.argv[2])
    validation_lexicon = read_lexicon(sys.argv[3])
    nrc_lexicon = read_nrc_lexicon("NRC-Hashtag-Emotion-Lexicon.txt")

    # Convert tweets and emotion labels to numerical representations
    words = Numberer()
    emotions = Numberer()
    train_lexicon = recode_lexicon(train_lexicon, words, emotions)
    validation_lexicon = recode_lexicon(validation_lexicon, words, emotions)
    nrc_lexicon = recode_nrc_lexicon(nrc_lexicon, words, emotions)

    # Generate batches
    train_batches = generate_instances(train_lexicon,
                                       nrc_lexicon,
                                       config.max_emotions,
                                       config.max_timesteps,
                                       batch_size=config.batch_size)
    validation_batches = generate_instances(validation_lexicon,
                                            nrc_lexicon,
                                            config.max_emotions,
                                            config.max_timesteps,
                                            batch_size=config.batch_size)
Example #5
class Preprocessor:
    def __init__(self, task_type, config):
        self.task_type = task_type
        self.config = config
        self.embeddings = PretrainedEmbeddings()
        self.numberer_word = Numberer()
        self.numberer_char = Numberer()
        self.numberer_label = Numberer()
        self.numberer_doc = Numberer()
        self.tfidf = TfIdfVectorizer(config.doc_vector_size)
        self.sentiment = VaderSentimentVectorizer(config.doc_vector_size)
        self.vocab = set()  # word ids seen across the entire corpus
        self.docs = dict()  # doc_id -> list(token_ids)
        self.train_set = None
        self.val_set = None
        self.vocab_sizes = None  # tuple(total vocab, embeddings only, training only, validation only)

    def preprocess_tweet(self, text):
        text = text.lower()
        # URL placeholders are always removed; hashtags and mentions only when configured
        stripped = re.sub(r'\burl\b', '', text)
        if self.config.remove_hash_tags_and_mentions:
            stripped = re.sub(r'(\b|\s)([@#][\w_-]+)', '', stripped)

        tokens = [
            item[0] for item in textparser.word_tokenize(
                stripped,
                stopwords.words('english')
                if self.config.remove_stopwords else [])
        ]

        return tokens

    def get_offence_class(self, dataset_file, text, label_subtask_a,
                          label_subtask_b, label_subtask_c):
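        # Collapse the three OLID-style subtask annotations into a single class for
        # the configured task. For example, for TaskType.Subtask_B the labels
        # ("OFF", "UNT") map to OffenceClasses.Untargeted, while ("NOT", "") maps to
        # Inoffensive (or to Untargeted when collapse_negative_classes is set).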
        final_label = None

        if self.task_type == TaskType.Subtask_A:
            if label_subtask_a == "NOT":
                final_label = OffenceClasses.Inoffensive
            elif label_subtask_a == "OFF":
                final_label = OffenceClasses.Offensive
        elif self.task_type == TaskType.Subtask_B:
            if label_subtask_b == "TIN":
                final_label = OffenceClasses.Targeted
            elif not self.config.collapse_negative_classes:
                if label_subtask_a == "NOT":
                    final_label = OffenceClasses.Inoffensive
                else:
                    final_label = OffenceClasses.Untargeted
            else:
                final_label = OffenceClasses.Untargeted
        else:
            if label_subtask_c == "IND":
                final_label = OffenceClasses.TargetIndividual
            elif label_subtask_c == "GRP":
                final_label = OffenceClasses.TargetGroup
            elif label_subtask_c == "OTH" or label_subtask_c == "ORG":
                final_label = OffenceClasses.TargetOther
            elif not self.config.collapse_negative_classes:
                if label_subtask_a == "NOT":
                    final_label = OffenceClasses.Inoffensive
                else:
                    final_label = OffenceClasses.Untargeted
            else:
                final_label = OffenceClasses.Untargeted

        #if final_label == None:
        #	print("unknown label for dataset instance '" + text + "' in file " + dataset_file.path() + "\t" + "labels: " + label_subtask_a + ", " + label_subtask_b + ", " + label_subtask_c)

        return final_label

    def generate_char_ids(self, tokens, numberer_char):
        char_ids = list()
        for word in tokens:
            word_chars = list()
            for char in word:
                word_chars.append(numberer_char.number(char))
            char_ids.append(word_chars)

        return char_ids

    def generate_char_ngram_ids(self, tokens, numberer_char):
        if self.config.char_ngram_size < 1:
            raise AssertionError(
                "character n-gram size must be greater than zero")

        n = self.config.char_ngram_size
        # both @ and # should be stripped from the token input in the preprocessing step
        pad_begin = ("@", ) * (n - 1)
        pad_end = "#"

        sent_char_ngram_ids = []
        for token in tokens:
            padded_seq = list(pad_begin)
            padded_seq.extend([char for char in token])
            padded_seq.append(pad_end)
            char_ngrams = zip(
                *[padded_seq[i:] for i in range(1 + len(pad_begin))])
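            # e.g. with char_ngram_size == 3, the token "cat" is padded to
            # ['@', '@', 'c', 'a', 't', '#'] and the zip over the shifted copies
            # yields the n-grams ('@','@','c'), ('@','c','a'), ('c','a','t'), ('a','t','#')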

            char_ngram_ids = [
                numberer_char.number(char_ngram) for char_ngram in char_ngrams
            ]
            sent_char_ngram_ids.append(char_ngram_ids)

        return sent_char_ngram_ids

    def generate_dataset(self,
                         dataset_file,
                         numberer_word,
                         numberer_char,
                         numberer_label,
                         maxsize=-1):
        dataset = Dataset()
        counter = 0

        for entry in dataset_file.lines():
            if maxsize > 0 and counter > maxsize:
                break

            if len(entry) == 5:
                # OffensEval/HatEval Training Dataset - <id> <tweet> <labels>...
                text = entry[1]
                label_a = entry[2]
                label_b = entry[3]
                label_c = entry[4]
            elif len(entry) == 4:
                # OffensEval Trial Dataset - <tweet> <labels>...
                text = entry[0]
                label_a = entry[1]
                label_b = entry[2]
                label_c = entry[3]
            elif len(entry) == 3:
                # TRAC Training Dataset - <id> <tweet> <label>
                # only for binary classification, must be preprocessed to use the OffensEval tags
                # multi-line entries are skipped
                text = entry[1]
                label_a = entry[2]
                label_b = ""
                label_c = ""
            else:
                #		print("invalid dataset instance " + str(counter + 1) + " in file " + dataset_file.path() \
                #			+ "\tentry: " + str(entry))
                continue

            label = self.get_offence_class(dataset_file, text, label_a,
                                           label_b, label_c)
            if label is None:
                # skip instances whose labels do not map onto the current task
                continue

            collapsed_label_id = numberer_label.number(label)

            document_id = self.numberer_doc.number(text)

            # assign a unique id to each word and character
            tokens = self.preprocess_tweet(text)
            word_ids = [numberer_word.number(word) for word in tokens]

            if self.config.use_char_ngrams:
                char_ids = self.generate_char_ngram_ids(tokens, numberer_char)
            else:
                char_ids = self.generate_char_ids(tokens, numberer_char)

            self.vocab.update(word_ids)

            dataset.put(document_id, word_ids, char_ids, collapsed_label_id)
            self.tfidf.add_document(document_id, word_ids)
            self.docs[document_id] = word_ids

            counter += 1

        return dataset

    def load(self,
             path_embed,
             path_sent_lexicon,
             dataset_file_train,
             dataset_file_val,
             maxsize_train=-1,
             maxsize_val=-1):
        # we load the pretrained embeddings first to initialize our basic word vocabulary
        # this way, the ids assigned to the words implicitly act as indices into the (pretrained) embedding matrix
        # words that don't have a pretrained embedding collect at the end of the embedding matrix
        # this lets us use specific initializers for unknown words in the model
        vocab_embeddings = 0
        vocab_train_only = 0  # words in the training set that don't have an embedding
        vocab_val_only = 0  # same as above but for the validation set
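        # Illustrative layout (hypothetical sizes): with 100,000 embedding words,
        # 500 training-only words and 120 validation-only words, the first 100,000
        # word ids index rows of the pretrained matrix, the next 500 belong to
        # training-only words and the last 120 to validation-only words, so the
        # model can use a separate initializer for rows without pretrained vectors.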

        print("Loading embeddings...")
        self.embeddings.load(path_embed, self.numberer_word)
        print("\tParsed " + str(self.embeddings.get_size()) +
              " words of dimension " + str(self.embeddings.get_dim()))
        vocab_embeddings = self.numberer_word.max_number()

        print("Loading sentiment lexicon...")
        self.sentiment.load_lexicon(DatasetFile(path_sent_lexicon))

        print("Loading training data...")
        self.train_set = self.generate_dataset(dataset_file_train,
                                               self.numberer_word,
                                               self.numberer_char,
                                               self.numberer_label,
                                               maxsize_train)
        vocab_after_train = self.numberer_word.max_number()
        vocab_train_only = vocab_after_train - vocab_embeddings
        print("\tParsed " + str(self.train_set.get_size()) + " instances")
        self.train_set.print_distribution(self.numberer_label)

        print("Loading validation data...")
        self.val_set = self.generate_dataset(dataset_file_val,
                                             self.numberer_word,
                                             self.numberer_char,
                                             self.numberer_label, maxsize_val)
        vocab_after_val = self.numberer_word.max_number()
        vocab_val_only = vocab_after_val - vocab_after_train
        print("\tParsed " + str(self.val_set.get_size()) + " instances")
        self.val_set.print_distribution(self.numberer_label)

        self.vocab_sizes = (len(self.vocab), vocab_embeddings,
                            vocab_train_only, vocab_val_only)

        print("Calculating TF-IDF matrix...")
        self.tfidf.vectorize(self.numberer_doc, self.docs)

        print("Calculating sentiment matrix...")
        self.sentiment.vectorize(self.numberer_doc, self.numberer_word,
                                 self.docs)

    def get_tfidf(self):
        return self.tfidf

    def get_sentiment(self):
        return self.sentiment

    def get_embeddings(self):
        return self.embeddings

    def get_training_set(self):
        return self.train_set

    def get_validation_set(self):
        return self.val_set

    def get_vocab_size_trainonly(self):
        return self.vocab_sizes[2]

    def get_vocab_size_valonly(self):
        return self.vocab_sizes[3]

    def get_charvocab_size(self):
        return self.numberer_char.max_number()

    def get_max_labels(self):
        return self.numberer_label.max_number()

    def get_max_docs(self):
        return self.numberer_doc.max_number()

    def get_task_type(self):
        return self.task_type
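
# A minimal driver sketch, assuming a config object that exposes the fields used
# above (doc_vector_size, remove_stopwords, use_char_ngrams, ...) plus the TaskType
# enum and the DatasetFile class referenced by the Preprocessor; all file names
# below are placeholders.
def run_preprocessing(config):
    preproc = Preprocessor(TaskType.Subtask_A, config)
    preproc.load("pretrained_embeddings.vec",        # hypothetical embedding file
                 "sentiment_lexicon.tsv",            # hypothetical sentiment lexicon
                 DatasetFile("offenseval_train.tsv"),
                 DatasetFile("offenseval_dev.tsv"))
    print("character vocabulary size:", preproc.get_charvocab_size())
    print("number of labels:", preproc.get_max_labels())
    print("number of documents:", preproc.get_max_docs())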
Example #6
def prepare_data(data_file,
                 id_to_word_mapping,
                 emolex,
                 char_map=Numberer(),
                 label_map=Numberer()):
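    # Note: the default Numberer arguments are created once when the function is
    # defined, so calls that rely on the defaults share the same mappings.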
    # Feature vector for words not in EmoLex
    emolex_default = [0.0] * 10
    # Count unknown tokens for statistics
    overall_tokens = 0
    unknown_tokens = 0

    # Create word-to-id mapping from word list
    word_map = {}
    for (i, word) in enumerate(id_to_word_mapping):
        word_map[word] = i
    unknown_id = len(word_map)

    tokenizer = TweetTokenizer()
    # (labels, word ids, char ids, feature vectors, tweet lengths, word lengths)
    data = ([], [], [], [], [], [])
    sent_len = 0  # stores maximal tweet length
    word_len = 0  # stores maximal word length
    feat_len = 11  # 1 capitalisation flag + 10 EmoLex scores

    # Process data tweet by tweet, token by token
    with open(data_file, "r", encoding="utf8") as file:
        for line in file:
            split1 = line.find("\t")
            split2 = line.rfind("\t")
            label = label_map.number(line[split2:].strip())
            sentence = line[split1:split2].strip()
            tokenized = tokenizer.tokenize(sentence)

            word_ids = []
            char_vecs = []
            feature_vecs = []
            word_lens = []

            for token in tokenized:
                char_vec = [char_map.number(c) for c in token]
                word_len = max(word_len, len(char_vec))
                word_lens.append(len(char_vec))
                char_vecs.append(char_vec)

                feature_vec = [float(token.isupper())]

                if token[0] == "#":
                    token = "<hashtag>"
                elif token[0] == "@":
                    token = "<user>"
                elif token.isdigit():
                    token = "<number>"
                elif token.startswith("http://") or token.startswith(
                        "https://") or token.startswith("www."):
                    token = "<url>"
                else:
                    token = token.lower()

                feature_vec += emolex.get(token, emolex_default)
                feature_vecs.append(feature_vec)

                token_id = word_map.get(token, unknown_id)
                overall_tokens += 1
                if token_id == unknown_id:
                    unknown_tokens += 1

                word_ids.append(token_id)

            data[0].append(label)
            data[1].append(word_ids)
            data[2].append(char_vecs)
            data[3].append(feature_vecs)
            data[4].append(len(word_ids))
            data[5].append(word_lens)
            sent_len = max(sent_len, len(word_ids))

    # Print statistics
    print("%.2f percent of tokens not in dictionary" %
          (unknown_tokens / overall_tokens * 100))

    # Convert collected data to numpy matrices
    data_len = len(data[0])
    n_of_labels = label_map.max_number()
    data_tensors = (np.zeros([data_len, n_of_labels]),
                    np.zeros([data_len, sent_len]),
                    np.zeros([data_len, sent_len, word_len]),
                    np.zeros([data_len, sent_len, feat_len]),
                    np.array(data[4]),
                    np.zeros([data_len, sent_len]))
    for i in range(data_len):
        data_tensors[0][i, data[0][i]] = 1
        data_tensors[1][i, 0:len(data[1][i])] = data[1][i]
        data_tensors[5][i, 0:len(data[5][i])] = data[5][i]
        for j in range(len(data[1][i])):
            data_tensors[2][i, j, 0:len(data[2][i][j])] = data[2][i][j]
            data_tensors[3][i, j, 0:len(data[3][i][j])] = data[3][i][j]

    return data_tensors
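
# A minimal usage sketch for prepare_data; "train.tsv" is a placeholder for a
# tab-separated file whose first field is an id, whose last field is the label and
# whose middle field is the tweet, and the toy word list and empty EmoLex lookup
# stand in for the pickled embedding vocabulary and NRC lexicon loaded above.
if __name__ == "__main__":
    toy_vocab = ["<hashtag>", "<user>", "<url>", "<number>", "happy", "sad"]
    labels, words, chars, feats, lengths, word_lens = prepare_data(
        "train.tsv", toy_vocab, {}, Numberer(), Numberer())
    # labels:    [n_tweets, n_labels]                one-hot gold labels
    # words:     [n_tweets, max_len]                 word ids, zero-padded
    # chars:     [n_tweets, max_len, max_word_len]   character ids
    # feats:     [n_tweets, max_len, 11]             capitalisation flag + EmoLex scores
    # lengths:   [n_tweets]                          tweet lengths in tokens
    # word_lens: [n_tweets, max_len]                 token lengths in characters
    print(labels.shape, words.shape, chars.shape, feats.shape)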