def tokenizeAndBuildSets(data, data_train, data_test):
    """
    Tokenize Code as Bag of Words or TF-IDF

    Parameters
    ----------
    data: All Tests
    data_train: Tests in Train set
    data_test: Tests in Test set

    Returns
    -------
    X_train, X_test: Vector for Train and Test set
    y_train, y_test: Info about the class Flaky (1) or Non Flaky (0)
    tokenizer: Tokenizer object to use for Feature Understanding
    """
    print("\n[STEP] Tokenize and Build Sets")
    # Get Bodies
    allBody = getFeatures(data)
    trainBody = getFeatures(data_train)
    testBody = getFeatures(data_test)

    if vectorType == "BagOfWords":
        # Building Tokenizer, fit on whole data (Train + Test)
        tokenizer = Tokenizer(lower=True)
        tokenizer.fit_on_texts(allBody)
        X_train = tokenizer.texts_to_matrix(trainBody, mode='count')
        X_test = tokenizer.texts_to_matrix(testBody, mode='count')
        y_train = data_train['Label'].values
        y_test = data_test['Label'].values
        # Info
        print("Vocabulary size:", len(tokenizer.word_index))
        print("X_train size:", len(X_train))
        print("X_test size:", len(X_test))
        return X_train, X_test, y_train, y_test, tokenizer
    elif vectorType == "TF-IDF":
        # Building Tokenizer, fit on whole data (Train + Test)
        tokenizer = TfidfVectorizer()
        tokenizer.fit(allBody)
        X_train = tokenizer.transform(trainBody)
        X_test = tokenizer.transform(testBody)
        y_train = data_train['Label'].values
        y_test = data_test['Label'].values
        # Info
        print("Vocabulary size:", len(tokenizer.get_feature_names()))
        return X_train, X_test, y_train, y_test, tokenizer
    else:
        sys.exit("Unknown vectorType: %s (expected 'BagOfWords' or 'TF-IDF')" % vectorType)
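
# Illustrative sketch (not part of the original module): the two vectorization
# modes used above, applied to a toy corpus. The corpus here is made up; only
# keras and scikit-learn APIs are assumed, aliased to avoid clobbering the
# module's own Tokenizer / TfidfVectorizer names.
from keras.preprocessing.text import Tokenizer as _KerasTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer as _SkTfidf

_corpus_train = ["assert sleep retry network", "assert equals value"]
_corpus_test = ["sleep network timeout"]

# Bag of words via Keras: each row is a term-count vector over the fitted vocabulary.
_tok = _KerasTokenizer(lower=True)
_tok.fit_on_texts(_corpus_train + _corpus_test)
_bow_train = _tok.texts_to_matrix(_corpus_train, mode='count')

# TF-IDF via scikit-learn: fit once on all bodies, then transform train and test consistently.
_tfidf = _SkTfidf()
_tfidf.fit(_corpus_train + _corpus_test)
_tfidf_train = _tfidf.transform(_corpus_train)
_tfidf_test = _tfidf.transform(_corpus_test)

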
class DocToken:
    def __init__(self, algo, maxWords, ngram_range=None):
        self.algo = algo
        self.maxWords = maxWords
        self.ngram_range = ngram_range
        self.numWords = 0
        self.embeddingMatrix = None

        if self.algo == 'bow':
            # ngram_range gives the upper n-gram bound; fall back to unigrams when not given
            upper = ngram_range if ngram_range else 1
            self.tok = TfidfVectorizer(max_features=maxWords,
                                       ngram_range=(1, upper))
        else:
            self.tok = Tokenizer(num_words=maxWords)

    def fit(self, textX):
        if self.algo == 'bow':
            self.tok.fit(textX)
            self.numWords = min(self.maxWords, len(self.tok.vocabulary_))
        else:
            self.tok.fit_on_texts(textX)
            self.numWords = min(self.maxWords, len(self.tok.word_index) + 1)

    def transform(self, textX):
        if self.algo == 'bow':
            # TfidfVectorizer.transform takes only the documents
            return self.tok.transform(textX).toarray()
        else:
            return self.tok.texts_to_sequences(textX)

    def pretrain_word_embeded(self, pretrain_wordvec_path):
        if self.algo == 'pretrain_word_embeded':
            self.embeddingMatrix = load_pretrained_wordvec(
                path=pretrain_wordvec_path,
                wordNum=self.numWords,
                wordIdxDict=self.tok.word_index,
                embeddingDim=100)
        else:
            raise ValueError(
                "DocToken algorithm must be 'pretrain_word_embeded'")
print('reading the test data...')

df_test = pd.read_csv('../input/test.tsv', sep='\t')
testid = df_test.test_id

df_test.name.fillna('unkname', inplace=True)
df_test.category_name.fillna('unk_cat', inplace=True)
df_test.brand_name.fillna('unk_brand', inplace=True)
df_test.item_description.fillna('nodesc', inplace=True)

df_test.category_name = df_test.category_name.apply(cat_process)
df_test.brand_name = df_test.brand_name.str.lower()
df_test.brand_name = df_test.brand_name.str.replace(' ', '_')

X_cat_test = cat_tok.transform(df_test.category_name)
X_name_test = name_tok.transform(df_test.name)

X_desc_test = desc_tok.transform(df_test.item_description)
X_desc_test = X_desc_test[:, :desc_num_col]

X_item_cond_test = (df_test.item_condition_id - 1).astype('uint8').values.reshape(-1, 1)
X_shipping_test = df_test.shipping.astype('float32').values.reshape(-1, 1)

X_brand_test = df_test.brand_name.apply(lambda b: brands_idx.get(b, 0))
X_brand_test = X_brand_test.values.reshape(-1, 1)

# Predict on the test set
print('applying the model to test...')

n_test = len(df_test)
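
# Hypothetical continuation (not in the original excerpt): feeding the prepared
# test blocks to a trained Keras model in batches. The `model` object and the
# exact input ordering it expects are assumptions here; the function is only a
# sketch and is not called.
import numpy as np

def predict_in_batches(model, batch_size=10000):
    preds = np.zeros(n_test)
    for lo in range(0, n_test, batch_size):
        hi = min(lo + batch_size, n_test)
        X_batch = [X_name_test[lo:hi], X_desc_test[lo:hi], X_brand_test[lo:hi],
                   X_cat_test[lo:hi], X_item_cond_test[lo:hi], X_shipping_test[lo:hi]]
        preds[lo:hi] = model.predict(X_batch, batch_size=1024).ravel()
    return preds
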
class Text(Dataset):
    def __init__(self, file=None, df=None, feature_col='Text', label_col=''):
        super().__init__(file, df, feature_col, label_col)
        self.text = self.X
        self.weights = None
        # split into sentence_train / sentence_test right away
        self.split_text()

    def split_text(self):
        self.refresh()
        self.sentence_train, self.sentence_test, self.y_train, self.y_test = train_test_split(
            self.text, self.y)

    def bag_of_words(self, **kwargs):
        """Transform text corpus into bag of words
        i.e ['Hi you, how are you', 'I am doing well, thank you!'] -> [[1, 1, 1, 2, 0, 0, 0, 0, 0],  [0, 0, 0, 1, 1, 1, 1, 1, 1]]
        """
        self.vectorizer = CountVectorizer(**kwargs)
        self.vectorizer.fit(self.sentence_train)

        self.BoW_train = self.vectorizer.transform(
            self.sentence_train).toarray()
        self.BoW_test = self.vectorizer.transform(self.sentence_test).toarray()
        self.X_train = self.BoW_train
        self.X_test = self.BoW_test

        self.feature_names = self.vectorizer.get_feature_names()

    def vectorize(self, num_words=10000):
        """Transform text corpus to integers in a tokenizer
        i.e. ["Hi how are you?", "I'm well, how about you"] becomes [[10, 3, 4, 7, 0], [5, 12, 3, 15, 7]]
        """
        self.vectorizer = Tokenizer(num_words)
        self.vectorizer.fit_on_texts(self.sentence_train)

        self.tokenized_train = self.vectorizer.texts_to_sequences(
            self.sentence_train)
        self.tokenized_test = self.vectorizer.texts_to_sequences(
            self.sentence_test)

        self.wtoi = self.vectorizer.word_index
        self.itow = self.vectorizer.index_word
        self.pad_and_refresh()

    def pad_and_refresh(self, max_len=None):
        # Keras pad_sequences takes `maxlen` (not `max_len`); None pads each
        # split to its own longest sequence, matching the original behaviour
        self.tokenized_train = pad_sequences(self.tokenized_train,
                                             padding='post',
                                             maxlen=max_len)
        self.tokenized_test = pad_sequences(self.tokenized_test,
                                            padding='post',
                                            maxlen=max_len)

        self.X_train = self.tokenized_train
        self.X_test = self.tokenized_test

        self.vocab_size = len(self.wtoi) + 1

    def create_pretrained_embedding_matrix(self, path, embedding_dim=300):
        # works after vectorize(); words missing from the file keep all-zero rows
        self.weights = np.zeros((self.vocab_size, embedding_dim))

        with open(path) as f:
            for line in f:
                values = line.split()
                word, vector = values[0], values[1:]
                if word in self.wtoi:
                    idx = self.wtoi[word]
                    self.weights[idx] = np.array(
                        vector[:embedding_dim], dtype=np.float32)

    def word_to_index(self, word):
        #word to index
        return self.wtoi[word]

    def index_to_word(self, idx):
        #index to word
        return self.itow[idx]

    def train_fasttext(self,
                       path,
                       sg=1,
                       embedding_dim=300,
                       min_count=2,
                       max_vocab_size=30000,
                       seed=42,
                       epochs=10,
                       workers=4,
                       lowercase=False,
                       full=False):

        sentences = self.sentence_train.values

        # gensim 3.x API: `size` and `wv.vocab` below were renamed in gensim 4
        self.fasttext_model = FastText(sg=sg,
                                       size=embedding_dim,
                                       min_count=min_count,
                                       max_vocab_size=max_vocab_size,
                                       seed=seed,
                                       workers=workers)

        tokenized = list(self._gen_sentences(sentences, lowercase=lowercase))

        print('Building vocabulary for fasttext model...')
        self.fasttext_model.build_vocab(sentences=tokenized)

        print('Training fasttext model...')
        self.fasttext_model.train(sentences=tokenized,
                                  total_examples=len(tokenized),
                                  epochs=epochs)
        self.word_vectors = self.fasttext_model.wv

        counts = Counter({
            word: vocab.count
            for (word, vocab) in self.word_vectors.vocab.items()
        })

        self.wtoi = {
            t[0]: i + 1
            for i, t in enumerate(counts.most_common(max_vocab_size))
        }
        self.itow = {v: k for k, v in self.wtoi.items()}

        self.tokenized_train = [[self.wtoi.get(word, 0) for word in sentence]
                                for sentence in tokenized]

        tok_test = list(
            self._gen_sentences(self.sentence_test.values, lowercase=lowercase))
        self.tokenized_test = [[self.wtoi.get(word, 0) for word in sentence]
                               for sentence in tok_test]

        self.pad_and_refresh()

        self.save_fasttext(path)
        self.create_embedding_matrix(embedding_dim)

    def create_embedding_matrix(self, embedding_dim):
        self.weights = np.zeros((self.vocab_size, embedding_dim))

        for word, i in self.wtoi.items():
            if i >= self.vocab_size:
                continue
            try:
                # words not found in the fastText vocabulary stay all-zeros
                self.weights[i] = self.word_vectors[word]
            except KeyError:
                continue

    def save_fasttext(self, path):
        model_path = os.path.join(path, 'fasttext.model')
        self.fasttext_model.save(model_path)

    def _gen_sentences(self, sentences, lowercase=False):
        for s in sentences:
            yield (list(tokenize(s, lowercase=lowercase)))
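
# Illustrative driver for the Text dataset above (the Dataset base class is not
# shown in this excerpt, so the constructor arguments and DataFrame columns are
# assumptions). Not called anywhere; it only sketches the intended flow.
def _demo_text_pipeline(df, embeddings_path=None):
    ds = Text(df=df, feature_col='Text', label_col='Label')
    ds.bag_of_words(max_features=5000)   # dense count vectors in ds.X_train / ds.X_test
    ds.vectorize(num_words=10000)        # padded integer sequences, overwrites ds.X_train / ds.X_test
    if embeddings_path is not None:
        # e.g. a GloVe-style text file; rows of ds.weights align with ds.wtoi indices
        ds.create_pretrained_embedding_matrix(embeddings_path, embedding_dim=300)
    return ds
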
def get_data(feature_selection):

    df = pd.read_excel('trainingObamaRomneytweets.xlsx', sheet_name=0)
    # df = pd.read_excel('trainingObamaRomneytweets.xlsx',sheet_name=1)
    pd.options.display.max_colwidth = 200
    # df.set_index("id", drop=True, append=False, inplace=False, verify_integrity=False)

    #### read the data
    # print (df['Anootated tweet'])
    #
    # print ('--- Print the Basic Info of the data ----')
    # print (df.info())
    # print (df.shape)
    #
    #
    # print ('--- Print the Head/Tail Info of the data ----')
    # print (df.head())
    # print ('--------------------------------------')
    # print (df.tail())

    # df['rate'].plot(kind='hist')
    # plt.show()

    df = df[df['Class'] != '!!!!']
    df = df[df['Class'] != 'IR']
    df = df[df['Class'] != 'irrelevant']
    df = df[df['Class'] != 'irrevelant']

    df = df[df['Class'] != '']
    df = df[df['Class'].notnull()]
    df = df[df['Class'] != 2]
    df = df[df['Class'] != '2']

    short_data = df.copy()  # copy so the column rewrites below don't trigger SettingWithCopyWarning

    # short_data = df.head(20)
    # print(short_data['Anootated tweet'].to_string(index=False))
    # print(short_data["Anootated tweet"])
    # print(aa)

    short_data['Anootated tweet'] = short_data['Anootated tweet'].apply(
        lambda x: BeautifulSoup(str(x), 'lxml').get_text())
    # print(short_data['Anootated tweet'].values)
    # print(aa)

    # strip URLs (one pattern is enough; the original applied two near-identical ones)
    short_data['Anootated tweet'] = short_data['Anootated tweet'].apply(
        lambda x: re.sub(r'https?://[A-Za-z0-9./]+', '', x))

    # print(short_data["Anootated tweet"][0])

    print(short_data.dtypes)
    short_data['Class'] = short_data['Class'].astype(int)
    # short_data['Class'] = short_data['Class'].replace(-1, 3)
    short_data['Class'] = short_data['Class'].replace(-1, 2)

    from collections import Counter
    c = Counter(short_data['Class'].values)
    print(c)

    #### remove stop words
    from nltk.corpus import stopwords
    stop = stopwords.words("english")
    # print(short_data['Anootated tweet'].values.tolist())

    print("----------- Remove Stop Word -------------")
    short_data["Anootated tweet"] = short_data["Anootated tweet"].apply(
        lambda x: ' '.join(word for word in x.split() if word not in stop))
    # print(short_data['Anootated tweet'].values.tolist())

    #### stemming
    # from nltk.stem import PorterStemmer
    # ps = PorterStemmer()
    # print(short_data['Anootated tweet'].values)

    # print('---------- Stemming ---------')
    # short_data['Anootated tweet'] = short_data['Anootated tweet'].apply(lambda x: ' '.join( [ ps.stem(word) for word in x.split() ]))
    # print(short_data['Anootated tweet'].values)

    # #### Lemmatization
    # from nltk.stem.wordnet import WordNetLemmatizer
    # lmtzr = WordNetLemmatizer()

    # print("---------- Lemmazation ----------")
    # short_data['Anootated tweet'] = short_data['Anootated tweet'].apply(lambda x: ' '.join([lmtzr.lemmatize(word, 'v') for word in x.split() ]))

    #### lower case
    print("------ Lower Case -------")
    short_data['Anootated tweet'] = short_data['Anootated tweet'].apply(
        lambda x: ' '.join(word.lower() for word in x.split()))

    # print(short_data['Anootated tweet'].values)

    #### Clean Twitter
    print("------ remove punctuation ------")
    # short_data['Anootated tweet'] = short_data['Anootated tweet'].apply(lambda x: re.sub("[^\w\s{P}@;)]+", "", x))
    # note: Python's re has no \p{P}, so keep only word characters and whitespace
    short_data['Anootated tweet'] = short_data['Anootated tweet'].apply(
        lambda x: re.sub(r"[^\w\s]+", "", x))

    # print(short_data['Anootated tweet'].values)

    raw_data = short_data['Anootated tweet'].values.tolist()

    from sklearn.model_selection import KFold

    X = short_data['Anootated tweet'].values
    y = short_data['Class'].values

    kf = KFold(n_splits=10)

    tr_vec = []
    te_vec = []
    y_train = []
    y_test = []

    for train_index, test_index in kf.split(X):

        X_train, X_test = X[train_index], X[test_index]
        ans_train, ans_test = y[train_index], y[test_index]

        y_train.append(ans_train)
        y_test.append(ans_test)

        # c = Counter(ans_test)
        # print(c)
        # X_train, X_test, y_train, y_test = train_test_split(short_data['Anootated tweet'].values, short_data['Class'].values, test_size=0.2, random_state=0)

        # from collections import Counter
        # c = Counter(short_data['Class'].values)
        # b = Counter(ans_test)
        # print(b)
        # print(c)
        # print(aaa)

        word_index = {}
        print("----------- calculate tokenize ---------")

        if feature_selection == "tokenize":
            tokenizer = Tokenizer()
            tokenizer.fit_on_texts(raw_data)
            tr = tokenizer.texts_to_sequences(X_train)
            te = tokenizer.texts_to_sequences(X_test)

            tr_vec.append(pad_sequences(tr, maxlen=sentence_len))
            te_vec.append(pad_sequences(te, maxlen=sentence_len))

            vocab_size = len(tokenizer.word_index) + 1
            word_index = tokenizer.word_index
            # print(tr_vec)

        if feature_selection == "tfidf_tokenize":
            ngram_range = (1, 1)
            tokenizer = TfidfVectorizer(use_idf=True, ngram_range=ngram_range)
            tokenizer.fit(raw_data)

            tr_vec.append(tokenizer.transform(X_train))
            te_vec.append(tokenizer.transform(X_test))
            vocab_size = len(tokenizer.get_feature_names()) + 1

    return tr_vec, te_vec, y_train, y_test, vocab_size, word_index
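
# Illustrative driver (hypothetical): iterate over the 10 folds returned above.
# Assumes the spreadsheet and the module-level `sentence_len` used by get_data
# are available; the function is a sketch and is not called anywhere.
def _demo_get_data(feature_selection="tokenize"):
    tr_vec, te_vec, y_train, y_test, vocab_size, word_index = get_data(feature_selection)
    for fold, (X_tr, X_te) in enumerate(zip(tr_vec, te_vec)):
        n_tr = X_tr.shape[0] if hasattr(X_tr, 'shape') else len(X_tr)
        n_te = X_te.shape[0] if hasattr(X_te, 'shape') else len(X_te)
        print("fold %d: %d train / %d test tweets, vocab size %d"
              % (fold, n_tr, n_te, vocab_size))
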
class Vectorized():
    def __init__(self, data_dto):
        self.data_dto = data_dto

    def initialize_with_count_vectorizer(
        self, count_vectorizer_dto=CountVectorizerDTO()):
        self.vectorizer = CountVectorizer(
            strip_accents=count_vectorizer_dto.strip_accents,
            stop_words=count_vectorizer_dto.stop_words,
            lowercase=count_vectorizer_dto.lowercase,
            max_df=count_vectorizer_dto.max_df,
            min_df=count_vectorizer_dto.min_df,
            binary=count_vectorizer_dto.binary,
            ngram_range=count_vectorizer_dto.ngram_range)
        self.vectorizer.fit(self.data_dto.data_train)

        self.X_train = self.vectorizer.transform(self.data_dto.data_train)
        self.X_test = self.vectorizer.transform(self.data_dto.data_test)

        # Encode the labels as numbers so they can be used with Keras;
        # fit on the training labels only, then reuse the same mapping for test
        labelencoder_y_1 = LabelEncoder()
        labelencoder_y_1.fit(self.data_dto.target_train)
        self.y_train = to_categorical(
            labelencoder_y_1.transform(self.data_dto.target_train))
        self.y_test = to_categorical(
            labelencoder_y_1.transform(self.data_dto.target_test))

        self.input_dim = self.X_train.shape[1]  # Number of features

    def initialize_with_keras_tokenizer(
        self, keras_tokenizer_dto=KerasTokenizerDTO()):
        # define Tokenizer with Vocab Size
        self.vectorizer = Tokenizer(num_words=self.data_dto.vocab_size)
        self.vectorizer.fit_on_texts(self.data_dto.data_train)

        self.X_train = self.vectorizer.texts_to_matrix(
            self.data_dto.data_train, mode=keras_tokenizer_dto.mode)
        self.X_test = self.vectorizer.texts_to_matrix(
            self.data_dto.data_test, mode=keras_tokenizer_dto.mode)

        encoder = LabelBinarizer()
        encoder.fit(self.data_dto.target_train)
        self.y_train = encoder.transform(self.data_dto.target_train)
        self.y_test = encoder.transform(self.data_dto.target_test)

        self.vectorizer.mode = keras_tokenizer_dto.mode

    def initialize_with_word2vec(self):
        self.vectorizer = CustomVectorizerForWord2Vec(self.data_dto)

        x_train = self.vectorizer.labelizeTweets(self.data_dto.data_train,
                                                 'TRAIN')
        x_test = self.vectorizer.labelizeTweets(self.data_dto.data_test,
                                                'TEST')

        self.vectorizer.create_tokenizer(x_train)

        self.X_train = self.vectorizer.tabeled_tokens_to_matrix(x_train)
        self.X_test = self.vectorizer.tabeled_tokens_to_matrix(x_test)

        encoder = LabelBinarizer()
        encoder.fit(self.data_dto.target_train)
        self.y_train = encoder.transform(self.data_dto.target_train)
        self.y_test = encoder.transform(self.data_dto.target_test)
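
# Usage sketch (hypothetical): the DTO classes and the fields of `data_dto`
# come from elsewhere in the project and are assumptions here; not called.
def _demo_vectorized(data_dto):
    vec = Vectorized(data_dto)
    vec.initialize_with_count_vectorizer()  # sparse n-gram counts + one-hot labels
    # alternatives: vec.initialize_with_keras_tokenizer() or vec.initialize_with_word2vec()
    return vec.X_train, vec.y_train, vec.X_test, vec.y_test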