import pandas as pd
from gensim import models
from sklearn.feature_extraction.text import (CountVectorizer, TfidfTransformer,
                                              TfidfVectorizer)
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import train_test_split

def tf_idf_vect_feature_vector_tokens():
    # Token-array variant; a DataFrame-based tf_idf_vect_feature_vector is defined below.
    token_array = text_processed()
    training_token_array, test_token_array = split_string_2_data_array(
        token_array, 0.8)
    # print("token: ", token_array)
    vectorizer = TfidfVectorizer(stop_words='english', analyzer="word")
    # print(vectorizer)
    vec = vectorizer.fit(training_token_array)
    vec_matrix = vec.transform(training_token_array)
    # data_frame = pd.DataFrame(vec_matrix.toarray(), columns=vectorizer.get_feature_names_out())
    # print(data_frame)
    return (test_token_array, vec, vec_matrix)
def compute_dissimilarity_matrix():
    token_array = text_processed()
    vectorizer = TfidfVectorizer(stop_words='english', analyzer="word")

    tf_idf = vectorizer.fit_transform(token_array)
    print(tf_idf.toarray())
    print(vectorizer.get_feature_names_out())
    # pairwise Euclidean distances between the TF-IDF document vectors
    matrix = euclidean_distances(tf_idf)

    # print(matrix)
    return matrix
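# A minimal sketch of inspecting the dissimilarity matrix as a labelled table,
# using pandas as pd (as in the commented-out DataFrame code above); the helper
# name and the doc_<i> labels are hypothetical, not part of the original module.
def print_dissimilarity_table():
    matrix = compute_dissimilarity_matrix()
    labels = ["doc_{}".format(i) for i in range(matrix.shape[0])]
    # rows and columns are indexed by document, values are Euclidean distances
    frame = pd.DataFrame(matrix, index=labels, columns=labels)
    print(frame)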
def tf_idf_vect_feature_vector():
    df = text_processed()

    vectorizer = TfidfVectorizer()

    # TF-IDF encode the tweet text and keep the dense vectors alongside each row
    vec_train = vectorizer.fit_transform(df.Tweets)
    df['tweets_vec'] = list(vec_train.toarray())
    # df.to_csv('tfidf_vector.csv')
    train, test = train_test_split(df, test_size=0.2)

    print(vectorizer.get_feature_names_out())

    print(df)
    return test, train, df
def tf_idf_trans_feature_vector():
    token_array = text_processed()
    training_token_array, test_token_array = split_string_2_data_array(
        token_array, 0.8)
    print(token_array)
    # TfidfTransformer has no tokenizer of its own: build term counts first,
    # then re-weight them with TF-IDF
    count_vectorizer = CountVectorizer(stop_words='english', analyzer='word')
    counts = count_vectorizer.fit_transform(token_array)
    transformer = TfidfTransformer()
    X = transformer.fit_transform(counts)
    analyze = count_vectorizer.build_analyzer()
    print(analyze("subject is not the case"))
    # summarize the learned vocabulary
    print(count_vectorizer.get_feature_names_out())
    # summarize the encoded vectors
    print(X.toarray())
    return X
def count_vectorizer_feature_vector():
    token_array = text_processed()
    training_token_array, test_token_array = split_string_2_data_array(
        token_array, 0.8)

    vectorizer = CountVectorizer(encoding='utf-8',
                                 analyzer='word',
                                 stop_words='english',
                                 binary=False,
                                 min_df=0.01)
    # tokenize and build the vocabulary on the training split only
    vec = vectorizer.fit(training_token_array)
    vec_matrix = vec.transform(training_token_array)
    # print(vectorizer.get_feature_names_out())
    # print(vec_matrix.shape)
    # print(vec_matrix.toarray())
    return (test_token_array, vec, vec_matrix)
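# A minimal sketch of vectorising the held-out split with the already-fitted
# CountVectorizer returned above; the helper name is hypothetical.
def count_vectorizer_test_features():
    test_token_array, vec, _ = count_vectorizer_feature_vector()
    # transform (never re-fit) so the test split reuses the training vocabulary
    test_matrix = vec.transform(test_token_array)
    # print(test_matrix.shape)
    return test_matrix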
def word2vec_feature_vector():
    token_array = text_processed()
    print(token_array)
    # gensim Word2Vec expects an iterable of tokenised sentences (lists of tokens)
    model = models.Word2Vec(token_array, min_count=1)
    print(model)
    return model
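# A minimal sketch of a driver for the feature builders above; it assumes
# text_processed() and split_string_2_data_array() are defined elsewhere in
# this module and return the expected tweet data.
if __name__ == "__main__":
    test_df, train_df, full_df = tf_idf_vect_feature_vector()
    distance_matrix = compute_dissimilarity_matrix()
    w2v_model = word2vec_feature_vector()
    # size of the vocabulary learned by Word2Vec (gensim 4.x API)
    print(len(w2v_model.wv.index_to_key))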