def normalize_data(self):
    """Load the raw tweet TSV, map polarity labels onto [-1, 1], tokenize
    the text and drop rows whose text could not be fetched from source.

    Returns:
        pd.DataFrame with columns ['polarity', 'text'].
    """
    raw_datapath = os.path.join(self.data_path,
                                self.info['properties']['data_file'])
    data = pd.read_csv(raw_datapath, header=None, encoding='utf-8',
                       sep='\t', index_col=False,
                       names=['tweet_id', 'user_id', 'polarity', 'text'])

    # Convert the raw polarity values to a [-1,1] range.
    # Assign the result instead of `inplace=True` on a selected column:
    # chained in-place replace is deprecated and does not work reliably
    # under pandas copy-on-write semantics.
    pol_conv = {"negative": -1, "neutral": 0, "positive": 1}
    data['polarity'] = data['polarity'].replace(pol_conv)

    # Tokenize and clean the text
    text_data = normalize.normalize_text(data)
    data = pd.concat([data['polarity'], text_data], axis=1)
    data.columns = ['polarity', 'text']

    # Remove text that was not fetched from source
    remove = lambda l: l != ['not', 'available']
    data = data.loc[data['text'].apply(remove)].reset_index(drop=True)
    return data
def normalize_data(self):
    """Build the train and test folds via populate_data, join them into a
    single dataset and normalize its text column.

    Returns:
        pd.DataFrame with columns ['id', 'fold', 'text', 'polarity', 'rating'].
    """
    columns = ['id', 'fold', 'text', 'polarity', 'rating']
    dataset_train = pd.DataFrame(columns=columns)
    dataset_test = pd.DataFrame(columns=columns)
    raw_datapath = os.path.join(self.data_path,
                                self.info['properties']['data_file'])
    # populate_data fills each frame in place from the raw file
    self.populate_data(raw_datapath, dataset_train, 'train')
    self.populate_data(raw_datapath, dataset_test, 'test')
    # DataFrame.append was removed in pandas 2.0; pd.concat with
    # ignore_index=True is the drop-in equivalent.
    dataset = pd.concat([dataset_train, dataset_test], ignore_index=True)
    normalized_text = normalize.normalize_text(dataset)
    dataset['text'] = normalized_text
    return dataset
def normalize_data(self):
    """Load the semicolon-separated raw file, map the 0/4 polarity coding
    onto [-1, 1] and tokenize the tweet text.

    Returns:
        pd.DataFrame with columns ['polarity', 'text'].
    """
    raw_datapath = os.path.join(self.data_path,
                                self.info['properties']['data_file'])
    data = pd.read_csv(raw_datapath, sep=';')
    # Select and rename in one step. The original wrote data['text'] onto
    # a column-sliced frame, which triggers SettingWithCopyWarning and
    # left a redundant duplicate 'tweet' column; rename avoids both.
    data = data[['polarity', 'tweet']].rename(columns={'tweet': 'text'})
    # Source encodes positive as 4, everything else maps to negative
    data['polarity'] = data['polarity'].apply(lambda p: 1 if p == 4 else -1)
    # Tokenize and clean the text
    text_data = normalize.normalize_text(data)
    data = pd.concat([data['polarity'], text_data], axis=1)
    data.columns = ['polarity', 'text']
    return data
def normalize_data(self):
    """Normalize the PL04 review corpus: read every pos/neg review file,
    assign its fold and polarity, and tokenize the text.

    Returns:
        pd.DataFrame with columns ['id', 'fold', 'text', 'polarity'].
    """
    raw_datapath = os.path.join(self.data_path,
                                self.info['properties']['data_file'])
    logger.debug('Normalizing PL04')
    get_pol = lambda p: 1 if p == 'pos' else -1
    rows = []
    for pol in ('pos', 'neg'):
        # 'path' instead of 'file' — avoid shadowing the builtin
        for path in glob(os.path.join(raw_datapath, '{}/*'.format(pol))):
            text = self._read_file(path)
            cv, id_ = self._get_file_cv_id(path)
            # NOTE(review): 'folds' is not defined in this method —
            # presumably a module-level constant; confirm.
            fold = self._choose_fold(cv, folds)
            rows.append([id_, fold, text, get_pol(pol)])
    # Build the frame in one shot: row-wise .loc writes on a growing
    # DataFrame are far slower and were the only reason for the counter.
    dataset = pd.DataFrame(rows, columns=['id', 'fold', 'text', 'polarity'])
    normalized_text = normalize.normalize_text(dataset)
    dataset['text'] = normalized_text
    return dataset
def normalize_data(self):
    """Read the raw TSV file, discretize the polarity score through
    self._labelize and tokenize the text.

    Returns:
        pd.DataFrame with columns ['polarity', 'text'].
    """
    source = os.path.join(self.data_path,
                          self.info['properties']['data_file'])
    frame = pd.read_csv(source, header=None, index_col=False, sep='\t',
                        names=['id', 'polarity', 'text'])
    # Round polarity value
    frame['polarity'] = frame['polarity'].apply(self._labelize)
    # Tokenize and clean the text
    tokens = normalize.normalize_text(frame)
    result = pd.concat([frame['polarity'], tokens], axis=1)
    result.columns = ['polarity', 'text']
    return result
def normalize_data(self):
    """Assemble the dev and test folds of the corpus, joining each fold's
    headlines with their valence and emotion annotations, then tokenize
    the text (the raw headline is preserved in 'original_text').

    Returns:
        pd.DataFrame with text, valence/emotion annotations, 'fold' and
        'original_text' columns.
    """
    fold_specs = (
        ('dev',
         'trial/affectivetext_trial.xml',
         'trial/affectivetext_trial.valence.gold',
         'trial/affectivetext_trial.emotions.gold'),
        ('test',
         'test/affectivetext_test.xml',
         'key/affectivetext_test.valence.gold',
         'key/affectivetext_test.emotions.gold'),
    )
    parts = []
    for fold, xml_rel, valence_rel, emotion_rel in fold_specs:
        text = self._read_xml_file(os.path.join(self.data_path, xml_rel))
        valence = self._read_valence_annotation(
            os.path.join(self.data_path, valence_rel))
        emotion = self._read_emo_annotation(
            os.path.join(self.data_path, emotion_rel))
        part = pd.concat([text, valence, emotion], axis=1)
        part['fold'] = fold
        parts.append(part)
    data = pd.concat(parts, axis=0)
    # Keep the raw text before tokenizing and cleaning it
    data['original_text'] = data['text'].copy()
    data['text'] = normalize.normalize_text(data)
    return data
def normalize_data(self):
    """Load the raw tweet TSV, map polarity labels onto [-1, 1] and
    tokenize the text.

    Returns:
        pd.DataFrame with columns ['polarity', 'text'].
    """
    raw_datapath = os.path.join(self.data_path,
                                self.info['properties']['data_file'])
    data = pd.read_csv(raw_datapath, header=None, encoding='utf-8',
                       sep='\t', index_col=False,
                       names=['tweet_id', 'user_id', 'polarity', 'text'])
    # Convert the raw polarity values to a [-1,1] range. Assign the
    # result rather than using `inplace=True` on a selected column:
    # chained in-place replace is deprecated and unreliable under
    # pandas copy-on-write.
    pol_conv = {"negative": -1, "neutral": 0, "positive": 1}
    data['polarity'] = data['polarity'].replace(pol_conv)
    # Tokenize and clean the text
    text_data = normalize.normalize_text(data)
    data = pd.concat([data['polarity'], text_data], axis=1)
    data.columns = ['polarity', 'text']
    return data
def normalize_data(self):
    """Place the SST PTB trees where pytreebank expects them, load the
    train/dev/test splits and flatten them into a single DataFrame with
    tokenized text.

    Returns:
        pd.DataFrame with train/dev/test rows concatenated.
    """
    raw_datapath = os.path.join(self.data_path,
                                self.info['properties']['data_file'])
    # pytreebank.load_sst looks for a 'trainDevTestTrees_PTB' directory
    # inside data_path
    trees_path = os.path.join(self.data_path, 'trainDevTestTrees_PTB')
    if not os.path.isdir(trees_path):
        os.mkdir(trees_path)
    # Guard the move so a re-run (raw file already relocated on a
    # previous call) does not raise instead of loading the trees.
    if os.path.exists(raw_datapath):
        shutil.move(raw_datapath, trees_path)
    stanford_treebank = pytreebank.load_sst(self.data_path)
    train = self.convert_treebank(stanford_treebank['train'], 'train')
    dev = self.convert_treebank(stanford_treebank['dev'], 'dev')
    test = self.convert_treebank(stanford_treebank['test'], 'test')
    data = pd.concat([train, dev, test], ignore_index=True)
    # Remove directory to avoid pytreebank library error
    #shutil.rmtree(raw_datapath)
    # Tokenize and clean the text
    text_data = normalize.normalize_text(data)
    logger.info(data)
    data['text'] = text_data
    return data
def normalize_data(self):
    """Load a pre-fetched TSV dataset, tokenize its text and drop rows
    whose text could not be retrieved from the source.

    Returns:
        pd.DataFrame with columns ['polarity', 'text'] (or the raw,
        possibly empty frame when nothing was downloaded).
    """
    source = os.path.join(self.data_path,
                          self.info['properties']['data_file'])
    frame = pd.read_csv(
        source,
        encoding='utf-8',
        sep='\t',
        index_col=False,
    )
    # Nothing to normalize in an empty download
    if len(frame) < 1:
        return frame
    tokens = normalize.normalize_text(frame)
    frame = pd.concat([frame['polarity'], tokens], axis=1)
    frame.columns = ['polarity', 'text']
    # Remove text that was not fetched from source
    keep = frame['text'].apply(lambda toks: toks != ['not', 'available'])
    return frame.loc[keep].reset_index(drop=True)
def normalize_data(self):
    """Load the raw latin-1 CSV (Sentiment140-style layout), map the
    0/2/4 polarity coding onto [-1, 1] and tokenize the text.

    Returns:
        pd.DataFrame with columns ['polarity', 'text'].
    """
    raw_datapath = os.path.join(self.data_path,
                                self.info['properties']['data_file'])
    data = pd.read_csv(
        raw_datapath,
        header=None,
        encoding='latin-1',
        index_col=False,
        names=['polarity', 'id', 'date', 'query', 'user', 'text'])
    # Convert the raw polarity values to a [-1,1] range. Assign the
    # result rather than using `inplace=True` on a selected column:
    # chained in-place replace is deprecated and unreliable under
    # pandas copy-on-write.
    pol_conv = {0: -1, 2: 0, 4: 1}
    data['polarity'] = data['polarity'].replace(pol_conv)
    # Tokenize and clean the text
    text_data = normalize.normalize_text(data)
    data = pd.concat([data['polarity'], text_data], axis=1)
    data.columns = ['polarity', 'text']
    return data
def test_normalize_text(text_df):
    """Check tokenization and cleaning output for two sample sentences."""
    result = normalize.normalize_text(text_df)
    expected_first = ['the', 'cat', 'is', 'on', 'the', 'mat', '.']
    expected_second = ['my', 'dog', 'is', 'running', 'through', 'the',
                       'garden', ',', 'he', 'is', 'so', 'happy', '!',
                       'smile']
    assert result[0] == expected_first
    assert result[1] == expected_second
def norm_text(text_df):
    """Thin convenience wrapper around normalize.normalize_text."""
    normalized = normalize.normalize_text(text_df)
    return normalized