Example #1
def load_word_embedding(data_name='google_news', data_type='bin'):
    logging.info('Start loading word2vec word embedding')
    os_name = get_os_name()
    if os_name == "windows":
        file1 = 'D:/Word_Embeddings/GoogleNews-vectors-negative300.bin.gz'
        file2 = 'D:/Word_Embeddings/freebase-vectors-skipgram1000.bin.gz'
        file3 = 'D:/Word_Embeddings/GoogleNews-vectors-negative300.bin'
        file4 = 'D:/Word_Embeddings/freebase-vectors-skipgram1000.bin'
    elif os_name == 'ubuntu':
        file1 = '/home/hs/Data/Word_Embeddings/GoogleNews-vectors-negative300.bin.gz'
        file2 = '/home/hs/Data/Word_Embeddings/freebase-vectors-skipgram1000.bin.gz'
        file3 = '/home/hs/Data/Word_Embeddings/google_news.bin'
        file4 = '/home/hs/Data/Word_Embeddings/freebase.bin'
    else:
        # any other OS would leave the paths undefined, so fail early
        raise RuntimeError('Unsupported OS: ' + os_name)
    if data_name == 'google_news':
        if data_type == 'bin':
            model = Word2Vec.load_word2vec_format(file3, binary=True)
        else:  # load .bin.gz data
            model = Word2Vec.load_word2vec_format(file1, binary=True)
    else:  # load freebase
        if data_type == 'bin':
            model = Word2Vec.load_word2vec_format(file4, binary=True)
        else:
            model = Word2Vec.load_word2vec_format(file2, binary=True)

    # gzipped/bz2 input works too; gensim reads it without unzipping
    logging.info('Loading word embedding complete')
    return model
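
Both load paths above depend on a get_os_name() helper that is not part of the listing. A minimal sketch of what it might look like, built on Python's platform module, is below; the 'ubuntu' label simply mirrors the string checks above and is an assumption. Newer gensim releases expose the same loader as KeyedVectors.load_word2vec_format, noted in the trailing comment.

import platform


def get_os_name():
    # Hypothetical helper: return a coarse OS label matching the checks above.
    system = platform.system().lower()
    if system == 'windows':
        return 'windows'
    if system == 'linux':
        return 'ubuntu'  # the examples only distinguish Windows and Ubuntu
    return system


# In newer gensim releases the word2vec loader lives on KeyedVectors, and
# gzipped input is read directly, so the .bin.gz files need no unpacking:
# from gensim.models import KeyedVectors
# model = KeyedVectors.load_word2vec_format(
#     'GoogleNews-vectors-negative300.bin.gz', binary=True)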
Example #2
def load_train_data(data_type='Sentiment140'):
    logging.info('Start loading data')
    texts = []
    labels = []
    if data_type == 'Sentiment140':
        if parameters['test_data_size'] == 160000:
            os_name = get_os_name()
            if os_name == "windows":
                file_name = 'C:/Corpus/training.csv'
            elif os_name == 'ubuntu':
                file_name = '/home/hs/Data/Corpus/training.csv'
        else:
            file_name = './data/traindata/Sentiment140/' + str(
                parameters['test_data_size']) + '.csv'
        # Sentiment140 ships as ISO-8859-1, so the encoding matters here;
        # the with-block also makes sure the file handle is closed
        with open(file_name, 'rt', encoding='ISO-8859-1') as csvfile:
            inpTweets = csv.reader(csvfile, delimiter=',')
            for row in inpTweets:
                sentiment = 1 if row[0] == '4' else 0  # raw label '4' means positive
                tweet = row[5]  # column 5 holds the tweet text
                labels.append(sentiment)
                texts.append(tweet)
    logging.info('Load data finished')
    return texts, labels
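
The raw Sentiment140 CSV stores the polarity label in column 0 ('0' negative, '4' positive) and the tweet text in column 5, which is why the loop reads row[0] and row[5]. A hedged usage sketch follows; the size value is illustrative, and parameters stands in for the module-level dict the function actually reads.

# Illustrative only: parameters and the CSV path must match the local setup.
parameters = {'test_data_size': 10000}

texts, labels = load_train_data('Sentiment140')
print(len(texts), len(labels))  # the two lists stay aligned
print(labels[:5])               # 1 = positive (raw label '4'), 0 = negative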
Example #4
def load_selected_data(data_type='train', stem=False):
    logging.info('Start Loading Data')
    if not stem:
        if data_type == 'train':
            if parameters['test_data_size'] == 160000:
                os_name = get_os_name()
                if os_name == "windows":
                    file_name = 'C:/Corpus/anew_part_of_nostem_160000.csv'
                elif os_name == 'ubuntu':
                    file_name = '/home/hs/Data/Corpus/anew_part_of_nostem_160000.csv'
            else:
                file_name = './data/traindata/Sentiment140/pre-processed/anew_part_of_nostem_' + str(
                    parameters['test_data_size']) + '.csv'
        elif data_type == 'test':
            file_name = './data/testdata/anew_part_of_nostem_test.csv'
    else:
        if data_type == 'train':
            if parameters['test_data_size'] == 160000:
                os_name = get_os_name()
                if os_name == "windows":
                    file_name = 'C:/Corpus/preprocessed_training_data_nostem_160000.csv'
                elif os_name == 'ubuntu':
                    file_name = '/home/hs/Data/Corpus/preprocessed_training_data_nostem_160000.csv'
            else:
                file_name = './data/traindata/Sentiment140/pre-processed/preprocessed_training_data_nostem_' + str(
                    parameters['test_data_size']) + '.csv'
        elif data_type == 'test':
            file_name = './data/testdata/preprocessed_test_data_nostem_359.csv'

    with open(file_name, 'r', encoding='ISO-8859-1') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        text, label = [], []
        for line in reader:
            text.append(line[1])        # column 1: pre-processed tweet text
            label.append(int(line[0]))  # column 0: sentiment label
    logging.info('Load Data Completed')
    return text, label
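
The pre-processed CSVs read here are assumed to hold one (label, text) pair per row, label in column 0 and text in column 1. A hedged usage sketch, with illustrative values for the module-level parameters dict:

# Illustrative only: the size, stem flag, and file locations are assumptions.
parameters = {'test_data_size': 1600}

train_text, train_label = load_selected_data(data_type='train', stem=False)
test_text, test_label = load_selected_data(data_type='test', stem=False)
print(len(train_text), len(test_text))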
def preprocess_tweeets(tweets_list, tweets_labels, filename):
    def isEnglish(s):
        # crude language filter: keep only tweets that encode cleanly to ASCII
        try:
            s.encode('ascii')
        except UnicodeEncodeError:
            return False
        else:
            return True

    processed_texts = []
    for line, l in zip(tweets_list, tweets_labels):
        if isEnglish(line):
            processed_texts.append((l, preprocessor(line)))
        # else:  # non-English tweets are dropped silently; uncomment to inspect them
        #     print(line)

    os_name = get_os_name()
    if os_name == 'windows':
        file_dir = 'C:/Corpus/'
    elif os_name == 'ubuntu':
        file_dir = '/home/hs/Data/'
    else:
        return
    csv_save(processed_texts, file_dir + filename)
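
The function leans on two helpers that are not shown: preprocessor(), which normalises a single tweet, and csv_save(), which writes the (label, text) pairs to disk. A minimal csv_save sketch under that assumption:

import csv


def csv_save(rows, file_path):
    # Hypothetical helper: write an iterable of (label, text) tuples as CSV rows.
    with open(file_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerows(rows)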