def tokenizeAndBuildSets(data, data_train, data_test):
    """Tokenize test bodies as Bag-of-Words or TF-IDF feature vectors.

    Parameters
    ----------
    data : All tests (train + test); used only to fit the vocabulary.
    data_train : Tests in the train set.
    data_test : Tests in the test set.

    Returns
    -------
    X_train, X_test : Feature matrices for the train and test sets.
    y_train, y_test : Labels — Flaky (1) or Non-Flaky (0).
    tokenizer : Fitted tokenizer/vectorizer, reusable for feature understanding.
    """
    print("\n[STEP] Tokenize and Build Sets")

    # Extract the textual bodies to vectorize.
    allBody = getFeatures(data)
    trainBody = getFeatures(data_train)
    testBody = getFeatures(data_test)

    # NOTE(review): `vectorType` is a module-level setting — confirm it is
    # defined before this function runs.
    if vectorType == "BagOfWords":
        # Fit the vocabulary on the whole corpus (train + test).
        tokenizer = Tokenizer(lower=True)
        tokenizer.fit_on_texts(allBody)
        X_train = tokenizer.texts_to_matrix(trainBody, mode='count')
        X_test = tokenizer.texts_to_matrix(testBody, mode='count')
        y_train = data_train['Label'].values
        y_test = data_test['Label'].values
        # Info
        print("Vocabulary size:", len(tokenizer.word_index))
        print("X_train size:", len(X_train))
        print("X_test size:", len(X_test))
        return X_train, X_test, y_train, y_test, tokenizer
    elif vectorType == "TF-IDF":
        # Fit the vocabulary on the whole corpus (train + test).
        tokenizer = TfidfVectorizer()
        tokenizer.fit(allBody)
        X_train = tokenizer.transform(trainBody)
        X_test = tokenizer.transform(testBody)
        y_train = data_train['Label'].values
        y_test = data_test['Label'].values
        # Info — get_feature_names() was removed in scikit-learn 1.2;
        # prefer the new accessor and fall back for older versions.
        if hasattr(tokenizer, 'get_feature_names_out'):
            feature_names = tokenizer.get_feature_names_out()
        else:
            feature_names = tokenizer.get_feature_names()
        print("Vocabulary size:", len(feature_names))
        return X_train, X_test, y_train, y_test, tokenizer
    else:
        # Unknown vector type is a configuration error: exit with a
        # message and a non-zero status (the original exited 0, which
        # signals success to the calling shell).
        sys.exit("Unknown vectorType: %r" % (vectorType,))
class Text(Dataset):
    """Dataset specialisation for text corpora.

    Holds a raw-text column, splits it into train/test sentences, and
    offers several vectorisation strategies: bag of words, padded integer
    sequences, and fastText embeddings (pretrained or trained here).
    """

    def __init__(self, file=None, df=None, feature_col='Text', label_col=''):
        super().__init__(file, df, feature_col, label_col)
        self.text = self.X       # raw sentences (alias of the feature column)
        self.weights = None      # optional (vocab_size, dim) embedding matrix
        self.split_text()

    # Split into text train and test
    def split_text(self):
        """Refresh the underlying data and split sentences/labels."""
        self.refresh()
        self.sentence_train, self.sentence_test, self.y_train, self.y_test = \
            train_test_split(self.text, self.y)

    def bag_of_words(self, **kwargs):
        """Transform text corpus into bag of words

        i.e ['Hi you, how are you', 'I am doing well, thank you!'] ->
        [[1, 1, 1, 2, 0, 0, 0, 0, 0], [0, 0, 0, 1, 1, 1, 1, 1, 1]]
        """
        self.vectorizer = CountVectorizer(**kwargs)
        # Fit on train only — test sentences are transformed with the
        # train vocabulary (no test-set leakage).
        self.vectorizer.fit(self.sentence_train)
        self.BoW_train = self.vectorizer.transform(self.sentence_train).toarray()
        self.BoW_test = self.vectorizer.transform(self.sentence_test).toarray()
        self.X_train = self.BoW_train
        self.X_test = self.BoW_test
        # get_feature_names() was removed in scikit-learn 1.2; use the
        # new accessor when available.
        if hasattr(self.vectorizer, 'get_feature_names_out'):
            self.feature_names = self.vectorizer.get_feature_names_out()
        else:
            self.feature_names = self.vectorizer.get_feature_names()

    def vectorize(self, num_words=10000):
        """Transform text corpus to integers in a tokenizer

        i.e. ["Hi how are you?", "I'm well, how about you"] becomes
        [[10, 3, 4, 7, 0], [5, 12, 3, 15, 7]]
        """
        self.vectorizer = Tokenizer(num_words)
        self.vectorizer.fit_on_texts(self.sentence_train)
        self.tokenized_train = self.vectorizer.texts_to_sequences(self.sentence_train)
        self.tokenized_test = self.vectorizer.texts_to_sequences(self.sentence_test)
        self.wtoi = self.vectorizer.word_index   # word -> int
        self.itow = self.vectorizer.index_word   # int -> word
        self.pad_and_refresh()

    def pad_and_refresh(self, max_len=None):
        """Zero-pad the token sequences and refresh X_train / X_test.

        Parameters
        ----------
        max_len : optional fixed sequence length; when None, pad each set
            to its own longest sequence.
        """
        # BUG FIX: the Keras keyword is `maxlen`, not `max_len` — the
        # original raised TypeError whenever max_len was supplied.
        if max_len is None:
            self.tokenized_train = pad_sequences(self.tokenized_train,
                                                 padding='post')
            self.tokenized_test = pad_sequences(self.tokenized_test,
                                                padding='post')
        else:
            self.tokenized_train = pad_sequences(self.tokenized_train,
                                                 padding='post',
                                                 maxlen=max_len)
            self.tokenized_test = pad_sequences(self.tokenized_test,
                                                padding='post',
                                                maxlen=max_len)
        self.X_train = self.tokenized_train
        self.X_test = self.tokenized_test
        self.vocab_size = len(self.wtoi) + 1  # +1 for the padding index 0

    def create_pretrained_embedding_matrix(self, path, embedding_dim=300):
        """Load pretrained word vectors (GloVe/word2vec text format) into
        self.weights, aligned with the tokenizer's word index.

        Works after vectorize().
        """
        self.weights = np.zeros((self.vocab_size, embedding_dim))
        with open(path) as f:
            for line in f:
                # BUG FIX: an embedding line is "<word> <v1> ... <vN>";
                # the original `word, vector = line.split()` raised
                # ValueError on every real line.
                parts = line.rstrip().split()
                word, vector = parts[0], parts[1:]
                if word in self.vectorizer.word_index:
                    # BUG FIX: wtoi is a dict — index it, don't call it.
                    idx = self.wtoi[word]
                    self.weights[idx] = np.array(
                        vector, dtype=np.float32)[:embedding_dim]

    def word_to_index(self, word):
        """Map a word to its integer index."""
        return self.wtoi[word]

    def index_to_word(self, idx):
        """Map an integer index back to its word."""
        return self.itow[idx]

    def train_fasttext(self, path, sg=1, embedding_dim=300, min_count=2,
                       max_vocab_size=30000, seed=42, epochs=10, workers=4,
                       lowercase=False, full=False):
        """Train a fastText model on the training sentences, rebuild the
        word index from its vocabulary, re-tokenize both splits, save the
        model under `path` and build the embedding matrix.

        NOTE(review): `full` is accepted but unused — confirm intent.
        NOTE(review): uses the gensim 3.x API (`size=`, `wv.vocab`);
        gensim 4+ renamed these — confirm the pinned gensim version.
        """
        sentences = self.sentence_train.values
        self.fasttext_model = FastText(sg=sg, size=embedding_dim,
                                       min_count=min_count,
                                       max_vocab_size=max_vocab_size,
                                       seed=seed, workers=workers)
        # BUG FIX: honour the `lowercase` flag (previously ignored).
        tokenized = list(self._gen_sentences(sentences, lowercase=lowercase))
        print('Building vocabulary for fasttext model...')
        self.fasttext_model.build_vocab(sentences=tokenized)
        print('Training fasttext model...')
        self.fasttext_model.train(sentences=tokenized,
                                  total_examples=len(tokenized),
                                  epochs=epochs)
        self.word_vectors = self.fasttext_model.wv
        # Rank words by corpus frequency; index 0 is reserved for OOV/pad.
        counts = Counter({
            word: vocab.count
            for (word, vocab) in self.word_vectors.vocab.items()
        })
        self.wtoi = {
            t[0]: i + 1
            for i, t in enumerate(counts.most_common(max_vocab_size))
        }
        self.itow = {v: k for k, v in self.wtoi.items()}
        self.tokenized_train = [[self.wtoi.get(word, 0) for word in sentence]
                                for sentence in tokenized]
        tok_test = list(self._gen_sentences(self.sentence_test.values,
                                            lowercase=lowercase))
        self.tokenized_test = [[self.wtoi.get(word, 0) for word in sentence]
                               for sentence in tok_test]
        self.pad_and_refresh()
        self.save_fasttext(path)
        self.create_embedding_matrix(embedding_dim)

    def create_embedding_matrix(self, embedding_dim):
        """Build self.weights from the trained fastText word vectors."""
        self.weights = np.zeros((self.vocab_size, embedding_dim))
        for word, i in self.wtoi.items():
            # BUG FIX: bound by the actual vocabulary size instead of a
            # hard-coded 10000 (max_vocab_size defaults to 30000, so rows
            # 10000..vocab_size-1 silently stayed zero).
            if i >= self.vocab_size:
                continue
            try:
                self.weights[i] = self.word_vectors[word]
            except KeyError:
                # words not found in the embedding index stay all-zeros;
                # catch only the lookup failure, not every exception.
                pass

    def save_fasttext(self, path):
        """Persist the fastText model as `<path>/fasttext.model`."""
        model_path = os.path.join(path, 'fasttext.model')
        self.fasttext_model.save(model_path)

    def _gen_sentences(self, sentences, lowercase=False):
        """Yield one gensim-tokenized word list per sentence."""
        for s in sentences:
            yield list(tokenize(s, lowercase=lowercase))
def get_data(feature_selection):
    """Load and clean the Obama/Romney tweet sheet, then build 10-fold
    train/test splits vectorized as integer sequences or TF-IDF.

    Parameters
    ----------
    feature_selection : "tokenize" for padded integer sequences, or
        "tfidf_tokenize" for TF-IDF sparse matrices.

    Returns
    -------
    tr_vec, te_vec : per-fold train/test feature matrices (10 folds)
    y_train, y_test : per-fold label arrays
    vocab_size : vocabulary size (+1 for padding/OOV)
    word_index : word -> index mapping ({} on the TF-IDF path)

    Raises
    ------
    ValueError : unknown `feature_selection` (the original died with a
        NameError on `vocab_size` at the return instead).
    """
    df = pd.read_excel('trainingObamaRomneytweets.xlsx', sheet_name=0)
    pd.options.display.max_colwidth = 200

    # Drop junk / irrelevant / empty classes and the "mixed" class 2
    # (both int and string spellings appear in the sheet).
    for bad in ('!!!!', 'IR', 'irrelevant', 'irrevelant', '', 2, '2'):
        df = df[df['Class'] != bad]
    df = df[df['Class'].notnull()]

    short_data = df

    # Strip HTML markup from the tweet text.
    short_data['Anootated tweet'] = short_data['Anootated tweet'].apply(
        lambda x: BeautifulSoup(str(x), 'lxml').get_text())
    # Strip URLs (https first, then the looser http pattern).
    short_data['Anootated tweet'] = short_data['Anootated tweet'].apply(
        lambda x: re.sub('https?://[A-Za-z0-9./]+', '', x))
    short_data['Anootated tweet'] = short_data['Anootated tweet'].apply(
        lambda x: re.sub('http?://[A-Za-z0-9./]+', '', x))

    print(short_data.dtypes)
    short_data['Class'] = short_data['Class'].astype(int)
    # Remap the negative class -1 to label 2 so labels are {0, 1, 2}.
    short_data['Class'] = short_data['Class'].replace(-1, 2)

    from collections import Counter
    print(Counter(short_data['Class'].values))

    #### remove stop words
    from nltk.corpus import stopwords
    stop = stopwords.words("english")
    print("----------- Remove Stop Word -------------")
    short_data["Anootated tweet"] = short_data["Anootated tweet"].apply(
        lambda x: ' '.join(word for word in x.split() if word not in stop))

    #### lower case
    print("------ Lower Case -------")
    short_data['Anootated tweet'] = short_data['Anootated tweet'].apply(
        lambda x: ' '.join(w.lower() for w in x.split()))

    #### remove punctuation (keep word chars and whitespace)
    print("------ remove punctuation ------")
    short_data['Anootated tweet'] = short_data['Anootated tweet'].apply(
        lambda x: re.sub("[^\w\s{P}]+", "", x))

    raw_data = short_data['Anootated tweet'].values.tolist()

    from sklearn.model_selection import KFold
    X = short_data['Anootated tweet'].values
    y = short_data['Class'].values
    kf = KFold(n_splits=10)

    tr_vec, te_vec = [], []
    y_train, y_test = [], []
    word_index = {}

    # Fit the tokenizer/vectorizer ONCE on the full corpus — it is
    # fold-independent, so refitting inside the 10-fold loop (as the
    # original did) repeated identical work 10 times.
    # NOTE(review): fitting on the whole corpus leaks test-fold
    # vocabulary statistics into training; kept to match the original.
    print("----------- calculate tokenize ---------")
    if feature_selection == "tokenize":
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(raw_data)
        vocab_size = len(tokenizer.word_index) + 1
        word_index = tokenizer.word_index
    elif feature_selection == "tfidf_tokenize":
        tokenizer = TfidfVectorizer(use_idf=True, ngram_range=(1, 1))
        tokenizer.fit(raw_data)
        # get_feature_names() was removed in scikit-learn 1.2.
        if hasattr(tokenizer, 'get_feature_names_out'):
            vocab_size = len(tokenizer.get_feature_names_out()) + 1
        else:
            vocab_size = len(tokenizer.get_feature_names()) + 1
    else:
        raise ValueError("Unknown feature_selection: %r" % (feature_selection,))

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train.append(y[train_index])
        y_test.append(y[test_index])
        if feature_selection == "tokenize":
            # `sentence_len` is a module-level setting.
            tr = tokenizer.texts_to_sequences(X_train)
            te = tokenizer.texts_to_sequences(X_test)
            tr_vec.append(pad_sequences(tr, maxlen=sentence_len))
            te_vec.append(pad_sequences(te, maxlen=sentence_len))
        else:  # "tfidf_tokenize"
            tr_vec.append(tokenizer.transform(X_train))
            te_vec.append(tokenizer.transform(X_test))

    return tr_vec, te_vec, y_train, y_test, vocab_size, word_index