def load_word_embedding(data_name='google_news', data_type='bin'):
    """Load a pre-trained word2vec embedding model from a hard-coded path.

    Args:
        data_name: 'google_news' loads the GoogleNews-negative300 vectors;
            any other value loads the freebase skip-gram vectors.
        data_type: 'bin' loads the unpacked .bin file; any other value loads
            the gzipped .bin.gz file (gensim reads gzipped input directly,
            no need to unzip).

    Returns:
        The loaded word2vec model.

    Raises:
        ValueError: if the current OS is neither windows nor ubuntu, since
            no embedding paths are known for it (the original code fell
            through to a NameError on undefined file variables).
    """
    # Use logging.info consistently; the original mixed logger.info here
    # with logging.info everywhere else in the file.
    logging.info('Start load word2vec word embedding')
    os_name = get_os_name()
    if os_name == "windows":
        file1 = 'D:/Word_Embeddings/GoogleNews-vectors-negative300.bin.gz'
        file2 = 'D:/Word_Embeddings/freebase-vectors-skipgram1000.bin.gz'
        file3 = 'D:/Word_Embeddings/GoogleNews-vectors-negative300.bin'
        file4 = 'D:/Word_Embeddings/freebase-vectors-skipgram1000.bin'
    elif os_name == 'ubuntu':
        file1 = '/home/hs/Data/Word_Embeddings/GoogleNews-vectors-negative300.bin.gz'
        file2 = '/home/hs/Data/Word_Embeddings/freebase-vectors-skipgram1000.bin.gz'
        file3 = '/home/hs/Data/Word_Embeddings/google_news.bin'
        file4 = '/home/hs/Data/Word_Embeddings/freebase.bin'
    else:
        raise ValueError('No embedding paths configured for OS: %r' % os_name)
    if data_name == 'google_news':
        if data_type == 'bin':
            model = Word2Vec.load_word2vec_format(file3, binary=True)
        else:
            # load .bin.gz data
            model = Word2Vec.load_word2vec_format(file1, binary=True)
    else:
        # load freebase
        if data_type == 'bin':
            model = Word2Vec.load_word2vec_format(file4, binary=True)
        else:
            model = Word2Vec.load_word2vec_format(file2, binary=True)
    logging.info('Loading word embedding complete')
    return model
def load_train_data(data_type='Sentiment140'):
    """Load raw Sentiment140 tweets and binary sentiment labels from CSV.

    Args:
        data_type: dataset selector; only 'Sentiment140' is handled. Any
            other value returns two empty lists.

    Returns:
        (texts, labels): parallel lists, where labels are 1 for a raw
        sentiment value of '4' (positive) and 0 otherwise.
    """
    logging.info('Start loading data')
    texts = []
    labels = []
    if data_type == 'Sentiment140':
        if parameters['test_data_size'] == 160000:
            # NOTE(review): on an OS other than windows/ubuntu file_name is
            # left undefined here and open() below raises NameError — confirm
            # whether other platforms need a path.
            os_name = get_os_name()
            if os_name == "windows":
                file_name = 'C:/Corpus/training.csv'
            elif os_name == 'ubuntu':
                file_name = '/home/hs/Data/Corpus/training.csv'
        else:
            file_name = ('./data/traindata/Sentiment140/' +
                         str(parameters['test_data_size']) + '.csv')
        # 'with' guarantees the handle is closed; the original passed a bare
        # open() to csv.reader and leaked the file descriptor.
        # Please watch out the encoding format
        with open(file_name, 'rt', encoding='ISO-8859-1') as csvfile:
            for row in csv.reader(csvfile, delimiter=','):
                sentiment = (1 if row[0] == '4' else 0)
                tweet = row[5]
                labels.append(sentiment)
                texts.append(tweet)
    logging.info('Load data finished')
    return texts, labels
def load_selected_data(data_type='train', stem=False):
    """Load pre-processed (label, text) rows from a Sentiment140 CSV.

    Args:
        data_type: 'train' selects the training CSV (path depends on
            parameters['test_data_size'] and the OS); 'test' selects the
            fixed test CSV.
        stem: False loads the ANEW-selected files ('anew_part_of_nostem_*');
            True loads the fully pre-processed files
            ('preprocessed_training_data_nostem_*').

    Returns:
        (text, label): parallel lists; label entries are ints from column 0,
        text entries are strings from column 1.

    Raises:
        ValueError: for an unknown data_type or unsupported OS (the original
            fell through to a NameError on an undefined file_name).
    """
    logging.info('Start Loading Data')
    # Only the file-name stem differs between the two dataset variants, so
    # pick it once instead of duplicating the whole path-selection tree.
    if stem:
        train_prefix = 'preprocessed_training_data_nostem_'
        test_file = './data/testdata/preprocessed_test_data_nostem_359.csv'
    else:
        train_prefix = 'anew_part_of_nostem_'
        test_file = './data/testdata/anew_part_of_nostem_test.csv'
    if data_type == 'train':
        if parameters['test_data_size'] == 160000:
            os_name = get_os_name()
            if os_name == "windows":
                file_name = 'C:/Corpus/' + train_prefix + '160000.csv'
            elif os_name == 'ubuntu':
                file_name = '/home/hs/Data/Corpus/' + train_prefix + '160000.csv'
            else:
                raise ValueError('No data path configured for OS: %r' % os_name)
        else:
            file_name = ('./data/traindata/Sentiment140/pre-processed/' +
                         train_prefix + str(parameters['test_data_size']) + '.csv')
    elif data_type == 'test':
        file_name = test_file
    else:
        raise ValueError("data_type must be 'train' or 'test', got %r" % data_type)
    text, label = [], []
    with open(file_name, 'r', encoding='ISO-8859-1') as csvfile:
        for line in csv.reader(csvfile, delimiter=','):
            text.append(line[1])
            label.append(int(line[0]))
    logging.info('Load Data Completed')
    return text, label
# NOTE(review): this is a duplicate definition of load_selected_data — an
# identical copy also appears earlier in the file, and this later definition
# shadows it. Consider deleting one of the two.
def load_selected_data(data_type='train', stem=False):
    """Load pre-processed (label, text) rows from a Sentiment140 CSV.

    Args:
        data_type: 'train' selects the training CSV (path depends on
            parameters['test_data_size'] and the OS); 'test' selects the
            fixed test CSV.
        stem: False loads the ANEW-selected files ('anew_part_of_nostem_*');
            True loads the fully pre-processed files
            ('preprocessed_training_data_nostem_*').

    Returns:
        (text, label): parallel lists; label entries are ints from column 0,
        text entries are strings from column 1.

    Raises:
        ValueError: for an unknown data_type or unsupported OS (the original
            fell through to a NameError on an undefined file_name).
    """
    logging.info('Start Loading Data')
    # Only the file-name stem differs between the two dataset variants, so
    # pick it once instead of duplicating the whole path-selection tree.
    if stem:
        train_prefix = 'preprocessed_training_data_nostem_'
        test_file = './data/testdata/preprocessed_test_data_nostem_359.csv'
    else:
        train_prefix = 'anew_part_of_nostem_'
        test_file = './data/testdata/anew_part_of_nostem_test.csv'
    if data_type == 'train':
        if parameters['test_data_size'] == 160000:
            os_name = get_os_name()
            if os_name == "windows":
                file_name = 'C:/Corpus/' + train_prefix + '160000.csv'
            elif os_name == 'ubuntu':
                file_name = '/home/hs/Data/Corpus/' + train_prefix + '160000.csv'
            else:
                raise ValueError('No data path configured for OS: %r' % os_name)
        else:
            file_name = ('./data/traindata/Sentiment140/pre-processed/' +
                         train_prefix + str(parameters['test_data_size']) + '.csv')
    elif data_type == 'test':
        file_name = test_file
    else:
        raise ValueError("data_type must be 'train' or 'test', got %r" % data_type)
    text, label = [], []
    with open(file_name, 'r', encoding='ISO-8859-1') as csvfile:
        for line in csv.reader(csvfile, delimiter=','):
            text.append(line[1])
            label.append(int(line[0]))
    logging.info('Load Data Completed')
    return text, label
# NOTE(review): this is a duplicate definition of load_train_data — a copy
# also appears earlier in the file, and this later definition shadows it.
# Consider deleting one of the two.
def load_train_data(data_type='Sentiment140'):
    """Load raw Sentiment140 tweets and binary sentiment labels from CSV.

    Args:
        data_type: dataset selector; only 'Sentiment140' is handled. Any
            other value returns two empty lists.

    Returns:
        (texts, labels): parallel lists, where labels are 1 for a raw
        sentiment value of '4' (positive) and 0 otherwise.
    """
    logging.info('Start loading data')
    texts = []
    labels = []
    if data_type == 'Sentiment140':
        if parameters['test_data_size'] == 160000:
            # NOTE(review): on an OS other than windows/ubuntu file_name is
            # left undefined here and open() below raises NameError — confirm
            # whether other platforms need a path.
            os_name = get_os_name()
            if os_name == "windows":
                file_name = 'C:/Corpus/training.csv'
            elif os_name == 'ubuntu':
                file_name = '/home/hs/Data/Corpus/training.csv'
        else:
            file_name = ('./data/traindata/Sentiment140/' +
                         str(parameters['test_data_size']) + '.csv')
        # 'with' guarantees the handle is closed; the original passed a bare
        # open() to csv.reader and leaked the file descriptor.
        # Please watch out the encoding format
        with open(file_name, 'rt', encoding='ISO-8859-1') as csvfile:
            for row in csv.reader(csvfile, delimiter=','):
                sentiment = (1 if row[0] == '4' else 0)
                tweet = row[5]
                labels.append(sentiment)
                texts.append(tweet)
    logging.info('Load data finished')
    return texts, labels
def preprocess_tweeets(tweets_list, tweets_labels, filename):
    """Pre-process ASCII-only (assumed English) tweets and save them to CSV.

    Args:
        tweets_list: iterable of raw tweet strings.
        tweets_labels: iterable of labels, parallel to tweets_list.
        filename: output CSV file name, appended to the OS-specific corpus
            directory.

    Returns:
        None. Writes (label, preprocessed_text) pairs via csv_save; tweets
        containing non-ASCII characters are dropped. On an unsupported OS
        nothing is processed or written.
    """
    # Resolve the output directory first: the original pre-processed every
    # tweet and only then silently returned on an unknown OS, wasting all
    # that work.
    os_name = get_os_name()
    if os_name == 'windows':
        file_dir = 'C:/Corpus/'
    elif os_name == 'ubuntu':
        file_dir = '/home/hs/Data/'
    else:
        return

    def is_english(s):
        # Treat ASCII-encodable text as English; anything else is skipped.
        try:
            s.encode('ascii')
        except UnicodeEncodeError:
            return False
        return True

    processed_texts = [(label, preprocessor(line))
                       for line, label in zip(tweets_list, tweets_labels)
                       if is_english(line)]
    csv_save(processed_texts, file_dir + filename)