Example no. 1
def train_doc2vec():
    # def isEnglish(s):
    #     try:
    #         s.encode('ascii')
    #     except UnicodeEncodeError:
    #         return False
    #     else:
    #         return True

    labeled_data, _ = load_vader('./resource/tweets.txt')
    # for i,d in enumerate(labeled_data):
    #     print(i)
    #     if not isEnglish(d):
    #         print('*'*111)
    #         print(i,d)
    # exit()
    unlabeled_data, _ = load_sentiment140('/home/hs/Data/Corpus/training.csv')
    labeled_data = preprocess(labeled_data, replace=True)
    dump_picle(labeled_data, './data/acc/labeled_data.p')
    unlabeled_data = preprocess(unlabeled_data, replace=True)
    dump_picle(unlabeled_data, './data/acc/unlabeled_data.p')
    # labeled_data = load_pickle('./data/acc/labeled_data.p')
    # unlabeled_data = load_pickle('./data/acc/unlabeled_data.p')
    sentence = TaggedLineSentence(labeled_data, unlabeled_data)
    train_docvecs(sentence)
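TaggedLineSentence and train_docvecs are project helpers that are not shown here. The sketch below is only an assumption of what they might do, expressed with gensim's Doc2Vec API; the function names, tag scheme, hyperparameters and output path are hypothetical.

# --- Hypothetical sketch, not the project's actual TaggedLineSentence / train_docvecs ---
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

def tagged_sentences_sketch(labeled_data, unlabeled_data):
    # Combine labeled and unlabeled tweets into TaggedDocument objects.
    docs = []
    for i, text in enumerate(labeled_data):
        docs.append(TaggedDocument(words=text.split(), tags=['LABELED_%d' % i]))
    for i, text in enumerate(unlabeled_data):
        docs.append(TaggedDocument(words=text.split(), tags=['UNLABELED_%d' % i]))
    return docs

def train_docvecs_sketch(docs, vec_dim=300, epochs=10):
    # Train a Doc2Vec model on the tagged documents and save it to disk.
    model = Doc2Vec(vector_size=vec_dim, window=10, min_count=1, workers=4)
    model.build_vocab(docs)
    model.train(docs, total_examples=model.corpus_count, epochs=epochs)
    model.save('./data/doc2vec.model')  # assumed output location
    return model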
def run_build_docvecs():
    model = load_embeddings('twitter')
    simple_evaluate(model)
    _, ratings = load_vader('./resource/tweets.txt')

    # Do not include the 1240th and 3516th items
    # r = ratings[:1240] + ratings[1241:3516] + ratings[3517:]

    build_docvecs(model, ratings)
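build_docvecs is defined elsewhere in the project. One common way to derive document vectors from pre-trained word embeddings, shown here only as a hedged sketch (the function name and dimensionality are assumptions), is to average the word vectors of each text:

import numpy as np

def average_doc_vector_sketch(model, text, dim=300):
    # Average the embedding vectors of the in-vocabulary words of one text;
    # fall back to a zero vector when no word is covered by the model.
    vectors = [model[word] for word in text.split() if word in model]
    if not vectors:
        return np.zeros(dim)
    return np.mean(vectors, axis=0)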

def cv(data, target, multivariant=False):
    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(data, target, test_size=0.2, random_state=0)
    if multivariant is False:
        linear_regression(X_train, X_test, Y_train, Y_test, plot=False)
    else:
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='Ridge_Regression')
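Note that cross_validation is scikit-learn's pre-0.18 module name; newer releases expose train_test_split from sklearn.model_selection. linear_regression_multivariant is a project helper not shown here; a minimal sketch of what its Ridge_Regression branch could look like, with assumed parameter names, is:

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

def ridge_regression_sketch(X_train, X_test, Y_train, Y_test, alpha=1.0):
    # Fit a ridge regressor on the training split and report test-set MSE.
    model = Ridge(alpha=alpha)
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)
    print('MSE: %f' % mean_squared_error(Y_test, predictions))
    return predictions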


if __name__ == '__main__':

    from load_data import load_vader

    normalize = True
    corpus, ratings = load_vader(['movie_reviews'])
    corpus = process(corpus)
    # lexicon = load_lexicon(get_file_path('lexicon'))
    from load_data import load_anew
    from file_name import get_file_path
    import numpy as np

    words, valences, _ = load_anew(get_file_path('anew'))
    mark = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
    lexicon = dict()
    for i, word in enumerate(words):
        lexicon[word] = valences[i]

    # # The following can be used to check which words the corpus and the lexicon have in common
    # from visualization import show_common_term
    # show_common_term(corpus, lexicon)
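    show_common_term is the project's own visualization helper; a plain, hedged sketch of the same check (which lexicon entries actually occur in the corpus) could be:

    def common_terms_sketch(corpus, lexicon):
        # Collect every token of the cleaned corpus and intersect it with the lexicon keys.
        corpus_vocab = set(word for sent in corpus for word in sent.split())
        return corpus_vocab & set(lexicon.keys())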
Example no. 7
def get_vocab(corpus):
    vocab = defaultdict(float)
    for sent in corpus:
        for word in clean_str(sent).split():
            vocab[word] += 1
    print(len(vocab))
    return vocab


def process(corpus):
    return [clean_str(sent) for sent in corpus]
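clean_str comes from the project's preprocessing module and is not shown here; a hedged sketch in the spirit of the usual tokenisation-style cleaner (the exact regexes and behaviour are assumptions) is:

import re

def clean_str_sketch(string):
    # Keep only basic characters, pad punctuation with spaces and lower-case,
    # so that a later str.split() yields clean tokens.
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"([(),!?\'\`])", r" \1 ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()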


vec_dim = 300
############################################## all ###############################################
corpus, ratings = load_vader(
    ['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
# corpus, ratings = load_vader(['movie_reviews'])
# # name: tweets, movie_reviews, product_reviews, news_articles
lexicon_name = get_file_path('anew')
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
print(len(corpus), len(ratings))

vocab = get_vocab(corpus)
# dump_picle(vocab, './data/corpus/vader/vocab_moview_reviews.p')
# vocab = load_pickle('./data/corpus/vader/vocab_moview_reviews.p')
print(len(vocab))
W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'),
                                         vocab,
                                         k=300)
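build_embedding_matrix and screen_data are project helpers defined elsewhere. As a hedged sketch of the embedding-matrix step only (row layout, OOV handling and names are assumptions): every vocabulary word gets one row of a (len(vocab) + 1) x k matrix, filled from the pre-trained vectors where available and randomly otherwise.

import numpy as np

def build_embedding_matrix_sketch(embeddings, vocab, k=300):
    word_idx_map = dict()
    W = np.zeros((len(vocab) + 1, k), dtype='float32')  # row 0 reserved for padding
    for i, word in enumerate(vocab, start=1):
        word_idx_map[word] = i
        if word in embeddings:
            W[i] = embeddings[word]
        else:
            W[i] = np.random.uniform(-0.25, 0.25, k)
    return W, word_idx_map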
def process(corpus):
    return [clean_str(sent) for sent in corpus]


if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info(r"running %s" % ''.join(sys.argv))
    from load_data import load_vader

    # corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
    corpus, ratings = load_vader(['news_articles'])
    lexicon_name = get_file_path('anew')
    logger.info(r"loading lexicon form : " + lexicon_name)
    words, valences, _ = load_anew(lexicon_name)
    corpus, ratings = screen_data(corpus, ratings, words)
    ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
    print(len(corpus))
    # for i in corpus[:100]:
    #     print(i)

    lexicon = dict()
    for i, word in enumerate(words):
        lexicon[word] = valences[i]
    mean_ratings, tf_means, tfidf_means, geos, tf_geos, tfidf_geos = calculate_ratings(
        corpus, ratings, lexicon)
    dump_picle([
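calculate_ratings is defined elsewhere in the project; as a hedged sketch, its unweighted part presumably scores a sentence by the arithmetic and geometric means of the valences of its lexicon words (the tf and tf-idf weighted variants are omitted here, and all names are assumptions):

import numpy as np

def mean_and_geo_rating_sketch(sentence, lexicon):
    # Collect the ANEW valences of the lexicon words in the sentence and
    # return their arithmetic and geometric means (None if no word is covered).
    valences = [lexicon[word] for word in sentence.split() if word in lexicon]
    if not valences:
        return None, None
    arithmetic = float(np.mean(valences))
    geometric = float(np.exp(np.mean(np.log(valences))))  # valences are positive on the ANEW scale
    return arithmetic, geometric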
import numpy as np


def statistic(texts):
    """Return the average token count per text and the vocabulary size."""
    vocab = set()
    length_list = []
    for text in texts:
        if not isinstance(text, list):
            text = text.split()
        length_list.append(len(text))
        vocab = vocab.union(set(text))
        # if len(text)>200:
        #     print(text)
    avg_length = np.average(np.array(length_list))
    return avg_length, len(vocab)


if __name__ == '__main__':
    # (['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
    tweets, _ = load_vader(['tweets'])
    movie, _ = load_vader(['movie_reviews'])
    amazon, _ = load_vader(['product_reviews'])
    NYT, _ = load_vader(['news_articles'])
    cvat = load_corpus(get_file_path('cn_corpus'))

    print(statistic(tweets))
    print(statistic(movie))
    print(statistic(amazon))
    print(statistic(NYT))
    print(statistic(cvat))
def get_vocab(corpus):
    vocab = defaultdict(float)
    for sent in corpus:
        for word in clean_str(sent).split():
            vocab[word] += 1
    print(len(vocab))
    return vocab


def process(corpus):
    return [clean_str(sent) for sent in corpus]


vec_dim = 300

############################################## tweets ###############################################
# corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
corpus, ratings = load_vader(['tweets'])
# # name: tweets, movie_reviews, product_reviews, news_articles
lexicon_name = get_file_path('anew')
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
print(len(corpus), len(ratings))

vocab = get_vocab(corpus)
dump_picle(vocab, './data/corpus/vader/vocab_moview_tweets.p')
# vocab = load_pickle('./data/corpus/vader/vocab_moview_reviews.p')
print('Vocabulary size: %s' % str(len(vocab)))
W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'), vocab, k=300)
dump_picle(word_idx_map, './data/corpus/vader/word_idx_map_tweets.p')
print('word_idx_map dumped successfully')
dump_picle(W, './data/corpus/vader/embedding_matrix_tweets.p')
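The dumped W and word_idx_map are presumably consumed later by a neural model; a hedged usage sketch (treating index 0 as the padding/unknown row is an assumption) for turning a cleaned sentence into embedding-matrix row indices:

def sent_to_indices_sketch(sent, word_idx_map):
    # Map every token to its row in W; unknown words fall back to index 0.
    return [word_idx_map.get(word, 0) for word in clean_str(sent).split()]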