if __name__ == "__main__": if len(sys.argv) != 3: sys.stderr.write("Usage: %s TRAIN_SET DEV_SET\n" % sys.argv[0]) sys.exit(1) config = DefaultConfig() # Read training and validation data. train_lexicon = read_lexicon(sys.argv[1]) validation_lexicon = read_lexicon(sys.argv[2]) # Convert word characters and part-of-speech labels to numeral # representation. chars = Numberer() labels = Numberer() train_lexicon = recode_lexicon(train_lexicon, chars, labels, train=True) validation_lexicon = recode_lexicon(validation_lexicon, chars, labels) # Generate batches train_batches = generate_instances(train_lexicon, labels.max_number(), config.max_timesteps, batch_size=config.batch_size) validation_batches = generate_instances(validation_lexicon, labels.max_number(), config.max_timesteps, batch_size=config.batch_size) # Train the model
        args = sys.argv[1:]
    else:
        option = sys.argv[1]
        w = option == "-write"
        r = option == "-read"
        args = sys.argv[2:]

    # Load embeddings
    (word_ids, embeddings) = pickle.load(open(args[2], "rb"))

    # Load or pre-process data
    if r:
        train_data = pickle.load(open(args[0], "rb"))
        valid_data = pickle.load(open(args[1], "rb"))
    else:
        char_map = Numberer()
        label_map = Numberer()
        emolex = read_emolex(args[3])
        train_data = prepare_data(args[0], word_ids, emolex, char_map, label_map)
        valid_data = prepare_data(args[1], word_ids, emolex, char_map, label_map)

    if w:
        with open("traindata", "wb") as train_file:
            pickle.dump(train_data, train_file)
        with open("testdata", "wb") as test_file:
            pickle.dump(valid_data, test_file)

    # Get batches
    config = DefaultConfig()
    train_batches = generate_batches(*train_data, batch_size=config.batch_size)
                         sys.argv[0])
        sys.exit(1)

    config = DefaultConfig()

    # Load the word embeddings and get the embedding matrix.
    embeds = gensim.models.Word2Vec.load(sys.argv[1])
    embedding_matrix = embeds.wv.syn0

    # Read training and validation data.
    train_lexicon = read_lexicon(sys.argv[2])
    validation_lexicon = read_lexicon(sys.argv[3])
    nrc_lexicon = read_nrc_lexicon("NRC-Hashtag-Emotion-Lexicon.txt")

    # Convert tweets and emotion labels to numeral representations.
    words = Numberer()
    emotions = Numberer()
    train_lexicon = recode_lexicon(train_lexicon, words, emotions)
    validation_lexicon = recode_lexicon(validation_lexicon, words, emotions)
    nrc_lexicon = recode_nrc_lexicon(nrc_lexicon, words, emotions)

    # Generate batches.
    train_batches = generate_instances(
        train_lexicon,
        nrc_lexicon,
        config.max_emotions,
        config.max_timesteps,
        batch_size=config.batch_size)
    validation_batches = generate_instances(
        validation_lexicon,
        nrc_lexicon,
        config.max_emotions,
        config.max_timesteps,
        batch_size=config.batch_size)
class Preprocessor:
    def __init__(self, task_type, config):
        self.task_type = task_type
        self.config = config
        self.embeddings = PretrainedEmbeddings()
        self.numberer_word = Numberer()
        self.numberer_char = Numberer()
        self.numberer_label = Numberer()
        self.numberer_doc = Numberer()
        self.tfidf = TfIdfVectorizer(config.doc_vector_size)
        self.sentiment = VaderSentimentVectorizer(config.doc_vector_size)
        self.vocab = set()    # corresponding to the entire corpus
        self.docs = dict()    # doc_id -> list(token_ids)
        self.train_set = None
        self.val_set = None
        self.vocab_sizes = None  # tuple(total vocab, embeddings only, training only, validation only)

    def preprocess_tweet(self, text):
        text = text.lower()
        # URL placeholders are always removed; hash tags and mentions only if configured.
        stripped = re.sub(r'\burl\b', '', text)
        if self.config.remove_hash_tags_and_mentions:
            stripped = re.sub(r'(\b|\s)([@#][\w_-]+)', '', stripped)
        tokens = list(
            map(lambda x: x[0],
                textparser.word_tokenize(
                    stripped,
                    stopwords.words('english')
                    if self.config.remove_stopwords else [])))
        return tokens

    def get_offence_class(self, dataset_file, text, label_subtask_a,
                          label_subtask_b, label_subtask_c):
        final_label = None
        if self.task_type == TaskType.Subtask_A:
            if label_subtask_a == "NOT":
                final_label = OffenceClasses.Inoffensive
            elif label_subtask_a == "OFF":
                final_label = OffenceClasses.Offensive
        elif self.task_type == TaskType.Subtask_B:
            if label_subtask_b == "TIN":
                final_label = OffenceClasses.Targeted
            elif not self.config.collapse_negative_classes:
                if label_subtask_a == "NOT":
                    final_label = OffenceClasses.Inoffensive
                else:
                    final_label = OffenceClasses.Untargeted
            else:
                final_label = OffenceClasses.Untargeted
        else:
            if label_subtask_c == "IND":
                final_label = OffenceClasses.TargetIndividual
            elif label_subtask_c == "GRP":
                final_label = OffenceClasses.TargetGroup
            elif label_subtask_c == "OTH" or label_subtask_c == "ORG":
                final_label = OffenceClasses.TargetOther
            elif not self.config.collapse_negative_classes:
                if label_subtask_a == "NOT":
                    final_label = OffenceClasses.Inoffensive
                else:
                    final_label = OffenceClasses.Untargeted
            else:
                final_label = OffenceClasses.Untargeted
        # if final_label is None:
        #     print("unknown label for dataset instance '" + text + "' in file "
        #           + dataset_file.path() + "\tlabels: " + label_subtask_a + ", "
        #           + label_subtask_b + ", " + label_subtask_c)
        return final_label

    def generate_char_ids(self, tokens, numberer_char):
        char_ids = list()
        for word in tokens:
            word_chars = list()
            for char in word:
                word_chars.append(numberer_char.number(char))
            char_ids.append(word_chars)
        return char_ids

    def generate_char_ngram_ids(self, tokens, numberer_char):
        if self.config.char_ngram_size < 1:
            raise AssertionError(
                "character n-gram size must be greater than zero")
        n = self.config.char_ngram_size
        # both @ and # should be stripped from the token input in the preprocessing step
        pad_begin = ("@",) * (n - 1)
        pad_end = "#"
        sent_char_ngram_ids = []
        for token in tokens:
            padded_seq = list(pad_begin)
            padded_seq.extend(token)
            padded_seq.append(pad_end)
            char_ngrams = zip(
                *[padded_seq[i:] for i in range(1 + len(pad_begin))])
            char_ngram_ids = [
                numberer_char.number(char_ngram) for char_ngram in char_ngrams
            ]
            sent_char_ngram_ids.append(char_ngram_ids)
        return sent_char_ngram_ids

    def generate_dataset(self, dataset_file, numberer_word, numberer_char,
                         numberer_label, maxsize=-1):
        dataset = Dataset()
        counter = 0
        for entry in dataset_file.lines():
            if maxsize > 0 and counter > maxsize:
                break

            if len(entry) == 5:
                # OffensEval/HatEval Training Dataset - <id> <tweet> <labels>...
                text = entry[1]
                label_a = entry[2]
                label_b = entry[3]
                label_c = entry[4]
            elif len(entry) == 4:
                # OffensEval Trial Dataset - <tweet> <labels>...
                text = entry[0]
                label_a = entry[1]
                label_b = entry[2]
                label_c = entry[3]
            elif len(entry) == 3:
                # TRAC Training Dataset - <id> <tweet> <label>
                # only for binary classification, must be preprocessed to use the OffensEval tags
                # multi-line entries are skipped
                text = entry[1]
                label_a = entry[2]
                label_b = ""
                label_c = ""
            else:
                # print("invalid dataset instance " + str(counter + 1) + " in file "
                #       + dataset_file.path() + "\tentry: " + str(entry))
                continue

            # skip instances whose labels don't map onto a class for this task
            final_label = self.get_offence_class(dataset_file, text, label_a,
                                                 label_b, label_c)
            if final_label is None:
                continue
            collapsed_label_id = numberer_label.number(final_label)

            document_id = self.numberer_doc.number(text)

            # assign a unique id to all words and characters
            tokens = self.preprocess_tweet(text)
            word_ids = [numberer_word.number(word) for word in tokens]
            if self.config.use_char_ngrams:
                char_ids = self.generate_char_ngram_ids(tokens, numberer_char)
            else:
                char_ids = self.generate_char_ids(tokens, numberer_char)

            self.vocab.update(word_ids)

            dataset.put(document_id, word_ids, char_ids, collapsed_label_id)
            self.tfidf.add_document(document_id, word_ids)
            self.docs[document_id] = word_ids

            counter += 1

        return dataset

    def load(self, path_embed, path_sent_lexicon, dataset_file_train,
             dataset_file_val, maxsize_train=-1, maxsize_val=-1):
        # we load the pretrained embeddings first to initialize our basic word vocabulary
        # this way, the ids assigned to the words implicitly act as indices into the (pretrained) embedding matrix
        # words that don't have a pretrained embedding collect at the end of the embedding matrix
        # this lets us use specific initializers for unknown words in the model
        vocab_embeddings = 0
        vocab_train_only = 0  # words in the training set that don't have an embedding
        vocab_val_only = 0    # same as above but for the validation set

        print("Loading embeddings...")
        self.embeddings.load(path_embed, self.numberer_word)
        print("\tParsed " + str(self.embeddings.get_size()) +
              " words of dimension " + str(self.embeddings.get_dim()))
        vocab_embeddings = self.numberer_word.max_number()

        print("Loading sentiment lexicon...")
        self.sentiment.load_lexicon(DatasetFile(path_sent_lexicon))

        print("Loading training data...")
        self.train_set = self.generate_dataset(dataset_file_train,
                                               self.numberer_word,
                                               self.numberer_char,
                                               self.numberer_label,
                                               maxsize_train)
        vocab_after_train = self.numberer_word.max_number()
        vocab_train_only = vocab_after_train - vocab_embeddings
        print("\tParsed " + str(self.train_set.get_size()) + " instances")
        self.train_set.print_distribution(self.numberer_label)

        print("Loading validation data...")
        self.val_set = self.generate_dataset(dataset_file_val,
                                             self.numberer_word,
                                             self.numberer_char,
                                             self.numberer_label,
                                             maxsize_val)
        vocab_after_val = self.numberer_word.max_number()
        vocab_val_only = vocab_after_val - vocab_after_train
        print("\tParsed " + str(self.val_set.get_size()) + " instances")
        self.val_set.print_distribution(self.numberer_label)

        self.vocab_sizes = (len(self.vocab), vocab_embeddings,
                            vocab_train_only, vocab_val_only)

        print("Calculating TF-IDF matrix...")
        self.tfidf.vectorize(self.numberer_doc, self.docs)

        print("Calculating sentiment matrix...")
        self.sentiment.vectorize(self.numberer_doc, self.numberer_word,
                                 self.docs)

    def get_tfidf(self):
        return self.tfidf

    def get_sentiment(self):
        return self.sentiment

    def get_embeddings(self):
        return self.embeddings

    def get_training_set(self):
        return self.train_set

    def get_validation_set(self):
        return self.val_set

    def get_vocab_size_trainonly(self):
        return self.vocab_sizes[2]

    def get_vocab_size_valonly(self):
        return self.vocab_sizes[3]

    def get_charvocab_size(self):
        return self.numberer_char.max_number()

    def get_max_labels(self):
        return self.numberer_label.max_number()

    def get_max_docs(self):
        return self.numberer_doc.max_number()

    def get_task_type(self):
        return self.task_type
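# A minimal usage sketch of the Preprocessor above, assuming a config object
# that provides the fields referenced in the class (doc_vector_size,
# remove_hash_tags_and_mentions, remove_stopwords, collapse_negative_classes,
# use_char_ngrams, char_ngram_size). The file names are placeholders, and
# get_vectors() on PretrainedEmbeddings is a hypothetical accessor for the
# pretrained vectors; the sketch only illustrates how the vocabulary counts
# could be used to build an embedding matrix with pretrained rows first and
# randomly initialized rows for words seen only in the training/validation data.
import numpy as np

config = DefaultConfig()   # any object exposing the config fields listed above
preproc = Preprocessor(TaskType.Subtask_A, config)
preproc.load("embeddings.vec",            # pretrained word embeddings (placeholder path)
             "sentiment_lexicon.tsv",     # sentiment lexicon (placeholder path)
             DatasetFile("offenseval-training.tsv"),
             DatasetFile("offenseval-trial.tsv"))

dim = preproc.get_embeddings().get_dim()
num_extra = preproc.get_vocab_size_trainonly() + preproc.get_vocab_size_valonly()

# Word ids were assigned while loading the embeddings, so they index directly
# into the pretrained matrix; words without a pretrained vector collect at the
# end and get their own (here: uniform random) initializer.
pretrained_rows = preproc.get_embeddings().get_vectors()   # hypothetical accessor
extra_rows = np.random.uniform(-0.05, 0.05, size=(num_extra, dim))
embedding_matrix = np.vstack([pretrained_rows, extra_rows])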
def prepare_data(data_file, id_to_word_mapping, emolex,
                 char_map=Numberer(), label_map=Numberer()):
    """Read a tab-separated <id>, <tweet>, <label> file and return padded numpy
    tensors: (one-hot labels, word ids, char ids, per-token features,
    tweet lengths, word lengths)."""
    # Feature vector for words not in EmoLex
    emolex_default = [0.0] * 10

    # Count unknown tokens for statistics
    overall_tokens = 0
    unknown_tokens = 0

    # Create word-to-id mapping from word list
    word_map = {}
    for (i, word) in enumerate(id_to_word_mapping):
        word_map[word] = i
    unknown_id = len(word_map)

    tokenizer = TweetTokenizer()
    data = ([], [], [], [], [], [])
    sent_len = 0  # stores maximal tweet length
    word_len = 0  # stores maximal word length
    feat_len = 11

    # Process data tweet by tweet, token by token
    with open(data_file, "r", encoding="utf8") as file:
        for line in file:
            split1 = line.find("\t")
            split2 = line.rfind("\t")
            label = label_map.number(line[split2:].strip())
            sentence = line[split1:split2].strip()
            tokenized = tokenizer.tokenize(sentence)

            word_ids = []
            char_vecs = []
            feature_vecs = []
            word_lens = []
            for token in tokenized:
                char_vec = [char_map.number(c) for c in token]
                word_len = max(word_len, len(char_vec))
                word_lens.append(len(char_vec))
                char_vecs.append(char_vec)

                feature_vec = [float(token.isupper())]
                if token[0] == "#":
                    token = "<hashtag>"
                elif token[0] == "@":
                    token = "<user>"
                elif token.isdigit():
                    token = "<number>"
                elif token.startswith("http://") or token.startswith(
                        "https://") or token.startswith("www."):
                    token = "<url>"
                else:
                    token = token.lower()
                feature_vec += emolex.get(token, emolex_default)
                feature_vecs.append(feature_vec)

                token = word_map.get(token, unknown_id)
                overall_tokens += 1
                if token == unknown_id:
                    unknown_tokens += 1
                word_ids.append(token)

            data[0].append(label)
            data[1].append(word_ids)
            data[2].append(char_vecs)
            data[3].append(feature_vecs)
            data[4].append(len(word_ids))
            data[5].append(word_lens)
            sent_len = max(sent_len, len(word_ids))

    # Print statistics
    print("%.2f percent of tokens not in dictionary" %
          (unknown_tokens / overall_tokens * 100))

    # Convert collected data to numpy matrices
    data_len = len(data[0])
    n_of_labels = label_map.max_number()
    data_tensors = (np.zeros([data_len, n_of_labels]),
                    np.zeros([data_len, sent_len]),
                    np.zeros([data_len, sent_len, word_len]),
                    np.zeros([data_len, sent_len, feat_len]),
                    np.array(data[4]),
                    np.zeros([data_len, sent_len]))
    for i in range(data_len):
        data_tensors[0][i, data[0][i]] = 1
        data_tensors[1][i, 0:len(data[1][i])] = data[1][i]
        data_tensors[5][i, 0:len(data[5][i])] = data[5][i]
        for j in range(len(data[1][i])):
            data_tensors[2][i, j, 0:len(data[2][i][j])] = data[2][i][j]
            data_tensors[3][i, j, 0:len(data[3][i][j])] = data[3][i][j]

    return data_tensors
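# A short, self-contained sketch of how prepare_data might be called. The file
# layout assumed from the parsing above is <id>\t<tweet>\t<label> per line; the
# word list and the single EmoLex entry are toy stand-ins, not real resources.
import os
import tempfile

word_list = ["i", "love", "this", "<hashtag>", "<user>"]
toy_emolex = {"love": [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0]}

with tempfile.NamedTemporaryFile("w", suffix=".tsv", delete=False,
                                 encoding="utf8") as tmp:
    tmp.write("1\tI LOVE this #demo @you\tjoy\n")
    tmp.write("2\tI love this\tjoy\n")
    path = tmp.name

labels, words, chars, feats, lengths, word_lengths = prepare_data(
    path, word_list, toy_emolex)
print(labels.shape)   # (num tweets, num distinct labels) - one-hot labels
print(words.shape)    # (num tweets, longest tweet) - padded word ids
print(chars.shape)    # (num tweets, longest tweet, longest word) - padded char ids
print(feats.shape)    # (num tweets, longest tweet, 11) - casing + EmoLex features
os.remove(path)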