def tokenizeAndBuildSets(data, data_train, data_test):
    """
    Tokenize Code as Bag of Words or TF-IDF

    Parameters
    ----------
    data: All Tests
    data_train: Tests in Train set
    data_test: Tests in Test set

    Returns
    -------
    X_train, X_test: Vector for Train and Test set
    y_train, y_test: Info about the class Flaky (1) or Non Flaky (0)
    tokenizer: Tokenizer object to use for Feature Understanding
    """
    print("\n[STEP] Tokenize and Build Sets")

    # Get Bodies
    allBody = getFeatures(data)
    trainBody = getFeatures(data_train)
    testBody = getFeatures(data_test)

    if vectorType == "BagOfWords":
        # Building Tokenizer, fit on whole data (Train + Test)
        tokenizer = Tokenizer(lower=True)
        tokenizer.fit_on_texts(allBody)

        X_train = tokenizer.texts_to_matrix(trainBody, mode='count')
        X_test = tokenizer.texts_to_matrix(testBody, mode='count')
        y_train = data_train['Label'].values
        y_test = data_test['Label'].values

        # Info
        print("Vocabulary size:", len(tokenizer.word_index))
        print("X_train size:", len(X_train))
        print("X_test size:", len(X_test))
        return X_train, X_test, y_train, y_test, tokenizer

    elif vectorType == "TF-IDF":
        # Building Tokenizer, fit on whole data (Train + Test)
        tokenizer = TfidfVectorizer()
        tokenizer.fit(allBody)

        X_train = tokenizer.transform(trainBody)
        X_test = tokenizer.transform(testBody)
        y_train = data_train['Label'].values
        y_test = data_test['Label'].values

        # Info
        print("Vocabulary size:", len(tokenizer.get_feature_names()))
        return X_train, X_test, y_train, y_test, tokenizer

    else:
        sys.exit(0)
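
# Hypothetical usage sketch (not from the original source): assumes the
# module-level `vectorType` switch, a `getFeatures` helper that returns the
# body text of each test, and that Tokenizer / TfidfVectorizer are imported
# at module level as the function above expects.
import pandas as pd

vectorType = "BagOfWords"  # or "TF-IDF"

def getFeatures(frame):
    # assumed helper: raw body text of every test case
    return frame['Body'].astype(str).tolist()

data = pd.DataFrame({
    'Body': ['assertTrue(x > 0)', 'assertEquals(a, b)', 'sleep(1000); assertNotNull(r)'],
    'Label': [0, 0, 1],
})
data_train, data_test = data.iloc[:2], data.iloc[2:]
X_train, X_test, y_train, y_test, tok = tokenizeAndBuildSets(data, data_train, data_test)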

class DocToken:

    def __init__(self, algo, maxWords, ngram_range=None):
        self.algo = algo
        self.maxWords = maxWords
        self.ngram_range = ngram_range
        self.numWords = 0
        self.embeddingMatrix = None
        if self.algo == 'bow':
            # ngram_range gives the upper n-gram bound; fall back to unigrams
            self.tok = TfidfVectorizer(
                max_features=maxWords,
                ngram_range=(1, ngram_range) if ngram_range else (1, 1))
        else:
            self.tok = Tokenizer(num_words=maxWords)

    def fit(self, textX):
        if self.algo == 'bow':
            self.tok.fit(textX)
            self.numWords = min(self.maxWords, len(self.tok.vocabulary_))
        else:
            self.tok.fit_on_texts(textX)
            self.numWords = min(self.maxWords, len(self.tok.word_index) + 1)

    def transform(self, textX):
        if self.algo == 'bow':
            return self.tok.transform(textX).toarray()
        else:
            return self.tok.texts_to_sequences(textX)

    def pretrain_word_embeded(self, pretrain_wordvec_path):
        if self.algo == 'pretrain_word_embeded':
            self.embeddingMatrix = load_pretrained_wordvec(
                path=pretrain_wordvec_path,
                wordNum=self.numWords,
                wordIdxDict=self.tok.word_index,
                embeddingDim=100)
        else:
            raise ValueError(
                "DocToken algorithm must be 'pretrain_word_embeded'")
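
# Minimal usage sketch for DocToken (illustrative, not part of the original):
# with algo='bow' it returns a dense TF-IDF matrix, with any other algo value
# it returns Keras integer sequences. Assumes TfidfVectorizer and the Keras
# Tokenizer are imported as the class above expects.
docs = ["the quick brown fox", "the lazy dog sleeps"]

tfidf_tok = DocToken(algo='bow', maxWords=1000, ngram_range=2)
tfidf_tok.fit(docs)
X = tfidf_tok.transform(docs)      # dense (2, numWords) TF-IDF matrix

seq_tok = DocToken(algo='word_embeded', maxWords=1000)
seq_tok.fit(docs)
seqs = seq_tok.transform(docs)     # lists of integer word ids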

class Text(Dataset):

    def __init__(self, file=None, df=None, feature_col='Text', label_col=''):
        super().__init__(file, df, feature_col, label_col)
        self.text = self.X
        self.weights = None
        self.split_text()  # Split into text train and test

    def split_text(self):
        self.refresh()
        self.sentence_train, self.sentence_test, self.y_train, self.y_test = train_test_split(
            self.text, self.y)

    def bag_of_words(self, **kwargs):
        """Transform text corpus into bag of words
        i.e ['Hi you, how are you', 'I am doing well, thank you!'] ->
        [[1, 1, 1, 2, 0, 0, 0, 0, 0],
         [0, 0, 0, 1, 1, 1, 1, 1, 1]]
        """
        self.vectorizer = CountVectorizer(**kwargs)
        self.vectorizer.fit(self.sentence_train)
        self.BoW_train = self.vectorizer.transform(self.sentence_train).toarray()
        self.BoW_test = self.vectorizer.transform(self.sentence_test).toarray()
        self.X_train = self.BoW_train
        self.X_test = self.BoW_test
        self.feature_names = self.vectorizer.get_feature_names()

    def vectorize(self, num_words=10000):
        """Transform text corpus to integers in a tokenizer
        i.e. ["Hi how are you?", "I'm well, how about you"] becomes
        [[10, 3, 4, 7, 0], [5, 12, 3, 15, 7]]
        """
        self.vectorizer = Tokenizer(num_words=num_words)
        self.vectorizer.fit_on_texts(self.sentence_train)
        self.tokenized_train = self.vectorizer.texts_to_sequences(self.sentence_train)
        self.tokenized_test = self.vectorizer.texts_to_sequences(self.sentence_test)
        self.wtoi = self.vectorizer.word_index
        self.itow = self.vectorizer.index_word
        self.pad_and_refresh()

    def pad_and_refresh(self, max_len=None):
        if max_len is None:
            self.tokenized_train = pad_sequences(self.tokenized_train, padding='post')
            self.tokenized_test = pad_sequences(self.tokenized_test, padding='post')
        else:
            self.tokenized_train = pad_sequences(self.tokenized_train,
                                                 padding='post', maxlen=max_len)
            self.tokenized_test = pad_sequences(self.tokenized_test,
                                                padding='post', maxlen=max_len)
        self.X_train = self.tokenized_train
        self.X_test = self.tokenized_test
        self.vocab_size = len(self.wtoi) + 1

    def create_pretrained_embedding_matrix(self, path, embedding_dim=300):
        # works after vectorize
        self.weights = np.zeros((self.vocab_size, embedding_dim))
        with open(path) as f:
            for line in f:
                word, *vector = line.split()
                if word in self.vectorizer.word_index:
                    idx = self.wtoi[word]
                    self.weights[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]

    def word_to_index(self, word):
        # word to index
        return self.wtoi[word]

    def index_to_word(self, idx):
        # index to word
        return self.itow[idx]

    def train_fasttext(self, path, sg=1, embedding_dim=300, min_count=2,
                       max_vocab_size=30000, seed=42, epochs=10, workers=4,
                       lowercase=False, full=False):
        sentences = self.sentence_train.values
        # gensim 3.x parameter names; gensim 4+ renamed `size` to `vector_size`
        self.fasttext_model = FastText(sg=sg,
                                       size=embedding_dim,
                                       min_count=min_count,
                                       max_vocab_size=max_vocab_size,
                                       seed=seed,
                                       workers=workers)
        tokenized = list(self._gen_sentences(sentences, lowercase=lowercase))
        print('Building vocabulary for fasttext model...')
        self.fasttext_model.build_vocab(sentences=tokenized)
        print('Training fasttext model...')
        self.fasttext_model.train(sentences=tokenized,
                                  total_examples=len(tokenized),
                                  epochs=epochs)
        self.word_vectors = self.fasttext_model.wv
        counts = Counter({
            word: vocab.count
            for (word, vocab) in self.word_vectors.vocab.items()
        })
        self.wtoi = {
            t[0]: i + 1
            for i, t in enumerate(counts.most_common(max_vocab_size))
        }
        self.itow = {v: k for k, v in self.wtoi.items()}
        self.tokenized_train = [[self.wtoi.get(word, 0) for word in sentence]
                                for sentence in tokenized]
        tok_test = list(self._gen_sentences(self.sentence_test.values,
                                            lowercase=lowercase))
        self.tokenized_test = [[self.wtoi.get(word, 0) for word in sentence]
                               for sentence in tok_test]
        self.pad_and_refresh()
        self.save_fasttext(path)
        self.create_embedding_matrix(embedding_dim)

    def create_embedding_matrix(self, embedding_dim):
        self.weights = np.zeros((self.vocab_size, embedding_dim))
        for word, i in self.wtoi.items():
            if i >= self.vocab_size:
                continue
            try:
                embedding_vector = self.word_vectors[word]
                self.weights[i] = embedding_vector
            except KeyError:
                # words not found in the embedding index stay all-zeros
                pass

    def save_fasttext(self, path):
        model_path = os.path.join(path, 'fasttext.model')
        self.fasttext_model.save(model_path)

    def _gen_sentences(self, sentences, lowercase=False):
        for s in sentences:
            yield list(tokenize(s, lowercase=lowercase))
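
# Illustrative driver for the Text dataset (hypothetical file and column
# names; the Dataset base class is assumed to load the CSV and expose X / y).
# Shows both featurization paths: bag-of-words and tokenize-then-pad.
ds = Text(file='reviews.csv', feature_col='Text', label_col='Sentiment')

ds.bag_of_words(max_features=5000, stop_words='english')
print(ds.X_train.shape, len(ds.feature_names))

ds.vectorize(num_words=10000)      # integer-encode and pad sentences
print(ds.X_train.shape, ds.vocab_size)
print(ds.word_to_index('good'), ds.index_to_word(1))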

def get_data(feature_selection):
    df = pd.read_excel('trainingObamaRomneytweets.xlsx', sheet_name=0)
    # df = pd.read_excel('trainingObamaRomneytweets.xlsx', sheet_name=1)
    pd.options.display.max_colwidth = 200
    # df.set_index("id", drop=True, append=False, inplace=False, verify_integrity=False)

    #### read the data
    # print(df['Anootated tweet'])
    # print('--- Print the Basic Info of the data ----')
    # print(df.info())
    # print(df.shape)
    # print('--- Print the Head/Tail Info of the data ----')
    # print(df.head())
    # print(df.tail())
    # df['rate'].plot(kind='hist')
    # plt.show()

    # Drop rows whose class label is unusable
    df = df[df['Class'] != '!!!!']
    df = df[df['Class'] != 'IR']
    df = df[df['Class'] != 'irrelevant']
    df = df[df['Class'] != 'irrevelant']
    df = df[df['Class'] != '']
    df = df[df['Class'].notnull()]
    df = df[df['Class'] != 2]
    df = df[df['Class'] != '2']

    short_data = df.copy()  # copy to avoid SettingWithCopyWarning on the slices below
    # short_data = df.head(20)
    # print(short_data['Anootated tweet'].to_string(index=False))

    #### strip HTML markup and URLs
    short_data['Anootated tweet'] = short_data['Anootated tweet'].apply(
        lambda x: BeautifulSoup(str(x), 'lxml').get_text())
    short_data['Anootated tweet'] = short_data['Anootated tweet'].apply(
        lambda x: re.sub(r'https?://[A-Za-z0-9./]+', '', x))
    short_data['Anootated tweet'] = short_data['Anootated tweet'].apply(
        lambda x: re.sub(r'http?://[A-Za-z0-9./]+', '', x))
    # print(short_data["Anootated tweet"][0])

    print(short_data.dtypes)
    short_data['Class'] = short_data['Class'].astype(int)
    # short_data['Class'] = short_data['Class'].replace(-1, 3)
    short_data['Class'] = short_data['Class'].replace(-1, 2)

    from collections import Counter
    c = Counter(short_data['Class'].values)
    print(c)

    #### remove stop words
    from nltk.corpus import stopwords
    stop = stopwords.words("english")
    print("----------- Remove Stop Word -------------")
    short_data['Anootated tweet'] = short_data['Anootated tweet'].apply(
        lambda x: ' '.join(word for word in x.split() if word not in stop))
    # print(short_data['Anootated tweet'].values.tolist())

    #### stemming
    # from nltk.stem import PorterStemmer
    # ps = PorterStemmer()
    # print('---------- Stemming ---------')
    # short_data['Anootated tweet'] = short_data['Anootated tweet'].apply(
    #     lambda x: ' '.join(ps.stem(word) for word in x.split()))

    #### Lemmatization
    # from nltk.stem.wordnet import WordNetLemmatizer
    # lmtzr = WordNetLemmatizer()
    # print("---------- Lemmazation ----------")
    # short_data['Anootated tweet'] = short_data['Anootated tweet'].apply(
    #     lambda x: ' '.join(lmtzr.lemmatize(word, 'v') for word in x.split()))

    #### lower case
    print("------ Lower Case -------")
    short_data['Anootated tweet'] = short_data['Anootated tweet'].apply(
        lambda x: ' '.join(word.lower() for word in x.split()))

    #### Clean Twitter
    print("------ remove punctuation ------")
    # short_data['Anootated tweet'] = short_data['Anootated tweet'].apply(
    #     lambda x: re.sub(r"[^\w\s{P}@;)]+", "", x))
    short_data['Anootated tweet'] = short_data['Anootated tweet'].apply(
        lambda x: re.sub(r"[^\w\s{P}]+", "", x))

    raw_data = short_data['Anootated tweet'].values.tolist()

    from sklearn.model_selection import KFold
    X = short_data['Anootated tweet'].values
    y = short_data['Class'].values
    kf = KFold(n_splits=10)

    tr_vec = []
    te_vec = []
    y_train = []
    y_test = []
    word_index = {}
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        ans_train, ans_test = y[train_index], y[test_index]
        y_train.append(ans_train)
        y_test.append(ans_test)
        # c = Counter(ans_test)
        # print(c)
        # alternative single split instead of K-fold:
        # X_train, X_test, y_train, y_test = train_test_split(
        #     short_data['Anootated tweet'].values, short_data['Class'].values,
        #     test_size=0.2, random_state=0)

        print("----------- calculate tokenize ---------")
        if feature_selection == "tokenize":
            tokenizer = Tokenizer()
            tokenizer.fit_on_texts(raw_data)
            tr = tokenizer.texts_to_sequences(X_train)
            te = tokenizer.texts_to_sequences(X_test)
            tr_vec.append(pad_sequences(tr, maxlen=sentence_len))
            te_vec.append(pad_sequences(te, maxlen=sentence_len))
            vocab_size = len(tokenizer.word_index) + 1
            word_index = tokenizer.word_index
            # print(tr_vec)

        if feature_selection == "tfidf_tokenize":
            ngram_range = (1, 1)
            tokenizer = TfidfVectorizer(use_idf=True, ngram_range=ngram_range)
            tokenizer.fit(raw_data)
            tr_vec.append(tokenizer.transform(X_train))
            te_vec.append(tokenizer.transform(X_test))
            vocab_size = len(tokenizer.get_feature_names()) + 1

    return tr_vec, te_vec, y_train, y_test, vocab_size, word_index
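
# Sketch of how get_data is typically consumed (illustrative): ten folds of
# vectorized tweets plus vocabulary information. The module-level
# `sentence_len` used as the padding length is assumed to be set beforehand.
sentence_len = 50  # assumed padding length for tokenized tweets

tr_vec, te_vec, y_train, y_test, vocab_size, word_index = get_data("tokenize")
print("folds:", len(tr_vec), "vocab size:", vocab_size)
print("fold 0 train matrix shape:", tr_vec[0].shape)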

class StanceDataPipeline(object):

    def __init__(self, embedding_type='tf_idf'):
        '''
        df_train or df_test [0] == 'stances'
        df_train or df_test [1] == 'bodies'
        '''
        self.df_train = [None, None]
        self.df_test = [None, None]
        self.word_vectorizer = None
        self.embedding_type = embedding_type
        for index, (train_file, test_file) in enumerate(
                zip(dr.TRAIN_DATASET_FILES.values(),
                    dr.TEST_DATASET_FILES.values())):
            self.df_train[index] = pd.read_csv(train_file)
            self.df_test[index] = pd.read_csv(test_file)

    def get_unique_training_words(self):
        return len(self.df_train[1][col_ref['headline']].unique().tolist()) + \
            len(self.df_train[0][col_ref['body']].unique().tolist())

    def __preprocess(self):
        preprocess_count = 0
        df = [self.df_train, self.df_test]
        for preprocess_data in df:
            for func in preprocess_funcs:
                preprocess_data[1][col_ref['headline']] = \
                    preprocess_data[1][col_ref['headline']].apply(func)
                preprocess_data[0][col_ref['body']] = \
                    preprocess_data[0][col_ref['body']].apply(func)
            preprocess_count = preprocess_count + 1
            print("[+] pre-processing for %d/2 done !" % (preprocess_count))

    def __embbed_dataset(self, filename):
        body = self.df_train[0][col_ref['body']].values
        headline = self.df_train[1][col_ref['headline']].values
        concated_corpus = np.concatenate([body, headline])
        if self.embedding_type == 'tf_idf':
            self.word_vectorizer = TfidfVectorizer(stop_words=STOP_WORDS_SET)
            self.word_vectorizer.fit(concated_corpus)
        elif self.embedding_type == 'tokenizer':
            self.word_vectorizer = Tokenizer(num_words=20000)
            self.word_vectorizer.fit_on_texts(concated_corpus)
        else:
            pass
        print("[+] Transformation of word2vec done with %s" % (self.embedding_type))
        pk.dump(self.word_vectorizer, open(filename, "wb"))
        print("[+] Fitted Corpus %s committed" % (filename))
        return self.word_vectorizer

    def __merge(self):
        DATA_ATTR = [self.df_train, self.df_test]
        SAVE_TO_ATTR = [dr.MERGE_TRAIN_FILE, dr.MERGE_TEST_FILE]
        ITER_ATTR = zip(DATA_ATTR, SAVE_TO_ATTR)
        for dataset_file, save_to in ITER_ATTR:
            temp_df = pd.merge(dataset_file[1], dataset_file[0], on=col_ref['id'])
            temp_df.to_csv(save_to)
            print("[+] %s data Commited Successfully..." % (save_to))

    def startPipeline(self, pickled_filename):
        '''Pass the name of the pickled word2vec file'''
        self.__preprocess()
        self.__embbed_dataset(
            ut.getFilePath(['pickled', 'word2vec'], pickled_filename, create=True))
        self.__merge()
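
# Hypothetical pipeline invocation (not from the original source): assumes the
# dr module points at valid stance/body CSV files, ut.getFilePath resolves a
# writable location for the pickled vectorizer, and col_ref / preprocess_funcs
# are configured at module level.
pipeline = StanceDataPipeline(embedding_type='tf_idf')
print("unique training texts:", pipeline.get_unique_training_words())
pipeline.startPipeline('tfidf_vectorizer.pkl')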

class Vectorized():

    def __init__(self, data_dto):
        self.data_dto = data_dto

    def initialize_with_count_vectorizer(self,
                                         count_vectorizer_dto=CountVectorizerDTO()):
        self.vectorizer = CountVectorizer(
            strip_accents=count_vectorizer_dto.strip_accents,
            stop_words=count_vectorizer_dto.stop_words,
            lowercase=count_vectorizer_dto.lowercase,
            max_df=count_vectorizer_dto.max_df,
            min_df=count_vectorizer_dto.min_df,
            binary=count_vectorizer_dto.binary,
            ngram_range=count_vectorizer_dto.ngram_range)
        self.vectorizer.fit(self.data_dto.data_train)
        self.X_train = self.vectorizer.transform(self.data_dto.data_train)
        self.X_test = self.vectorizer.transform(self.data_dto.data_test)

        # Need to transform the labels into numbers to be able to use Keras;
        # fit the encoder on the training labels only and reuse it for the test labels
        labelencoder_y_1 = LabelEncoder()
        self.y_train = to_categorical(
            labelencoder_y_1.fit_transform(self.data_dto.target_train))
        self.y_test = to_categorical(
            labelencoder_y_1.transform(self.data_dto.target_test))

        self.input_dim = self.X_train.shape[1]  # Number of features

    def initialize_with_keras_tokenizer(self,
                                        keras_tokenizer_dto=KerasTokenizerDTO()):
        # define Tokenizer with Vocab Size
        self.vectorizer = Tokenizer(num_words=self.data_dto.vocab_size)
        self.vectorizer.fit_on_texts(self.data_dto.data_train)
        self.X_train = self.vectorizer.texts_to_matrix(
            self.data_dto.data_train, mode=keras_tokenizer_dto.mode)
        self.X_test = self.vectorizer.texts_to_matrix(
            self.data_dto.data_test, mode=keras_tokenizer_dto.mode)

        encoder = LabelBinarizer()
        encoder.fit(self.data_dto.target_train)
        self.y_train = encoder.transform(self.data_dto.target_train)
        self.y_test = encoder.transform(self.data_dto.target_test)
        self.vectorizer.mode = keras_tokenizer_dto.mode

    def initialize_with_word2vec(self):
        self.vectorizer = CustomVectorizerForWord2Vec(self.data_dto)
        x_train = self.vectorizer.labelizeTweets(self.data_dto.data_train, 'TRAIN')
        x_test = self.vectorizer.labelizeTweets(self.data_dto.data_test, 'TEST')
        self.vectorizer.create_tokenizer(x_train)
        self.X_train = self.vectorizer.tabeled_tokens_to_matrix(x_train)
        self.X_test = self.vectorizer.tabeled_tokens_to_matrix(x_test)

        encoder = LabelBinarizer()
        encoder.fit(self.data_dto.target_train)
        self.y_train = encoder.transform(self.data_dto.target_train)
        self.y_test = encoder.transform(self.data_dto.target_test)
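
# Illustrative wiring of Vectorized (assumed names: a DataDTO carrying the
# train/test splits and vocab_size, plus the KerasTokenizerDTO defaults used
# above; none of these objects are defined in this snippet).
data_dto = DataDTO(data_train=train_texts, data_test=test_texts,
                   target_train=train_labels, target_test=test_labels,
                   vocab_size=20000)

vec = Vectorized(data_dto)
vec.initialize_with_keras_tokenizer(KerasTokenizerDTO(mode='tfidf'))
print("input dimension for the model:", vec.X_train.shape[1])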