コード例 #1
0
ファイル: semeval14.py プロジェクト: ahmedyes2000/gsitk
    def normalize_data(self):
        """Load the raw SemEval-2014 TSV file and return a normalized frame.

        Returns a DataFrame with columns ``['polarity', 'text']`` where
        polarity is mapped to {-1, 0, 1} and the text has been tokenized
        and cleaned by ``normalize.normalize_text``.
        """
        raw_datapath = os.path.join(self.data_path,
                                    self.info['properties']['data_file'])

        data = pd.read_csv(raw_datapath,
                           header=None,
                           encoding='utf-8',
                           sep='\t',
                           index_col=False,
                           names=[
                               'tweet_id',
                               'user_id',
                               'polarity',
                               'text',
                           ])

        # Convert the raw polarity values to a [-1,1] range
        pol_conv = {"negative": -1, "neutral": 0, "positive": 1}

        # Assign the result instead of replace(..., inplace=True) on a
        # column selection: chained-assignment inplace is deprecated in
        # pandas and stops modifying the frame under copy-on-write.
        data['polarity'] = data['polarity'].replace(pol_conv)

        # Tokenize and clean the text
        text_data = normalize.normalize_text(data)
        data = pd.concat([data['polarity'], text_data], axis=1)
        data.columns = ['polarity', 'text']

        # Remove rows whose text was not fetched from the source
        not_available = lambda tokens: tokens != ['not', 'available']
        data = data.loc[data['text'].apply(not_available)].reset_index(drop=True)

        return data
コード例 #2
0
 def normalize_data(self):
     """Build the train+test dataset and normalize its text column.

     Returns a DataFrame with columns
     ``['id', 'fold', 'text', 'polarity', 'rating']`` containing both
     folds, with ``text`` tokenized by ``normalize.normalize_text``.
     """
     columns = ['id', 'fold', 'text', 'polarity', 'rating']
     dataset_train = pd.DataFrame(columns=columns)
     dataset_test = pd.DataFrame(columns=columns)
     raw_datapath = os.path.join(self.data_path, self.info['properties']['data_file'])
     self.populate_data(raw_datapath, dataset_train, 'train')
     self.populate_data(raw_datapath, dataset_test, 'test')
     # DataFrame.append was removed in pandas 2.0; pd.concat with
     # ignore_index=True is the supported equivalent.
     dataset = pd.concat([dataset_train, dataset_test], ignore_index=True)
     normalized_text = normalize.normalize_text(dataset)
     dataset['text'] = normalized_text

     return dataset
コード例 #3
0
ファイル: sts.py プロジェクト: ahmedyes2000/gsitk
    def normalize_data(self):
        """Load the raw STS CSV and return a ``['polarity', 'text']`` frame.

        Polarity is mapped so that a raw value of 4 becomes 1 and any
        other value becomes -1; text is tokenized and cleaned.
        """
        raw_datapath = os.path.join(self.data_path,
                                    self.info['properties']['data_file'])
        data = pd.read_csv(raw_datapath, sep=';')

        # Take an explicit copy: writing into a column-selection view
        # raises SettingWithCopyWarning and the assignment may silently
        # not stick under pandas copy-on-write.
        data = data[['polarity', 'tweet']].copy()
        data['polarity'] = data['polarity'].apply(lambda p: 1
                                                  if p == 4 else -1)
        data['text'] = data['tweet']

        # Tokenize and clean the text
        text_data = normalize.normalize_text(data)
        data = pd.concat([data['polarity'], text_data], axis=1)

        data.columns = ['polarity', 'text']

        return data
コード例 #4
0
 def normalize_data(self):
     """Read the PL04 pos/neg review files into a normalized DataFrame.

     Returns a DataFrame with columns ``['id', 'fold', 'text', 'polarity']``
     where polarity is 1 for 'pos' files and -1 for 'neg' files, and the
     text column is tokenized by ``normalize.normalize_text``.

     NOTE(review): relies on a free name ``folds`` — presumably a
     module-level constant where this method lives; confirm.
     """
     raw_datapath = os.path.join(self.data_path,
                                 self.info['properties']['data_file'])
     logger.debug('Normalizing PL04')
     get_pol = lambda p: 1 if p == 'pos' else -1
     # Accumulate rows in a plain list and build the frame once:
     # growing a DataFrame with .loc[count] re-allocates on every
     # iteration and is accidentally quadratic.
     rows = []
     for pol in ('pos', 'neg'):
         for filepath in glob(os.path.join(raw_datapath, '{}/*'.format(pol))):
             text = self._read_file(filepath)
             cv, id_ = self._get_file_cv_id(filepath)
             fold = self._choose_fold(cv, folds)
             rows.append([id_, fold, text, get_pol(pol)])
     dataset = pd.DataFrame(rows, columns=['id', 'fold', 'text', 'polarity'])
     normalized_text = normalize.normalize_text(dataset)
     dataset['text'] = normalized_text
     return dataset
コード例 #5
0
    def normalize_data(self):
        """Read the raw TSV file and return a ``['polarity', 'text']`` frame.

        The raw polarity value is discretized by ``self._labelize`` and
        the text is tokenized by ``normalize.normalize_text``.
        """
        source_path = os.path.join(self.data_path,
                                   self.info['properties']['data_file'])

        frame = pd.read_csv(source_path,
                            sep='\t',
                            header=None,
                            index_col=False,
                            names=['id', 'polarity', 'text'])

        # Round the polarity value into a discrete label
        frame['polarity'] = frame['polarity'].apply(self._labelize)

        # Tokenize and clean the text
        cleaned_text = normalize.normalize_text(frame)
        frame = pd.concat([frame['polarity'], cleaned_text], axis=1)
        frame.columns = ['polarity', 'text']

        return frame
コード例 #6
0
ファイル: semeval07.py プロジェクト: ahmedyes2000/gsitk
    def normalize_data(self):
        """Assemble the SemEval-2007 dev and test folds into one frame.

        Each fold is the column-wise concatenation of the XML text, the
        valence gold annotations and the emotion gold annotations; the
        original text is preserved in ``original_text`` before cleaning.
        """
        join = os.path.join

        dev = pd.concat(
            [
                self._read_xml_file(join(self.data_path, 'trial/affectivetext_trial.xml')),
                self._read_valence_annotation(join(self.data_path, 'trial/affectivetext_trial.valence.gold')),
                self._read_emo_annotation(join(self.data_path, 'trial/affectivetext_trial.emotions.gold')),
            ],
            axis=1,
        )
        dev['fold'] = 'dev'

        test = pd.concat(
            [
                self._read_xml_file(join(self.data_path, 'test/affectivetext_test.xml')),
                self._read_valence_annotation(join(self.data_path, 'key/affectivetext_test.valence.gold')),
                self._read_emo_annotation(join(self.data_path, 'key/affectivetext_test.emotions.gold')),
            ],
            axis=1,
        )
        test['fold'] = 'test'

        data = pd.concat([dev, test], axis=0)

        # Keep the raw text around, then tokenize and clean it
        data['original_text'] = data['text'].copy()
        data['text'] = normalize.normalize_text(data)

        return data
コード例 #7
0
    def normalize_data(self):
        """Load the raw TSV file and return a ``['polarity', 'text']`` frame.

        Polarity labels are mapped to {-1, 0, 1} and the text is
        tokenized and cleaned by ``normalize.normalize_text``.
        """
        raw_datapath = os.path.join(self.data_path,
                                    self.info['properties']['data_file'])

        data = pd.read_csv(raw_datapath,
                           header=None,
                           encoding='utf-8',
                           sep='\t',
                           index_col=False,
                           names=['tweet_id', 'user_id', 'polarity', 'text'])

        # Convert the raw polarity values to a [-1,1] range
        pol_conv = {"negative": -1, "neutral": 0, "positive": 1}

        # Assign the result instead of replace(..., inplace=True) on a
        # column selection: chained-assignment inplace is deprecated in
        # pandas and stops modifying the frame under copy-on-write.
        data['polarity'] = data['polarity'].replace(pol_conv)

        # Tokenize and clean the text
        text_data = normalize.normalize_text(data)
        data = pd.concat([data['polarity'], text_data], axis=1)

        data.columns = ['polarity', 'text']

        return data
コード例 #8
0
ファイル: sst.py プロジェクト: ahmedyes2000/gsitk
    def normalize_data(self):
        """Unpack the SST trees, convert every fold and return one frame.

        Moves the raw tree file into the directory layout expected by
        ``pytreebank.load_sst``, converts the train/dev/test folds and
        concatenates them, then tokenizes the text column.
        """
        raw_datapath = os.path.join(self.data_path,
                                    self.info['properties']['data_file'])
        trees_path = os.path.join(self.data_path, 'trainDevTestTrees_PTB')
        if not os.path.isdir(trees_path):
            os.mkdir(trees_path)
        shutil.move(raw_datapath, trees_path)

        stanford_treebank = pytreebank.load_sst(self.data_path)
        converted = [
            self.convert_treebank(stanford_treebank[fold_name], fold_name)
            for fold_name in ('train', 'dev', 'test')
        ]
        data = pd.concat(converted, ignore_index=True)

        # Remove directory to avoid pytreebank library error
        #shutil.rmtree(raw_datapath)

        # Tokenize and clean the text
        tokenized = normalize.normalize_text(data)
        logger.info(data)
        data['text'] = tokenized

        return data
コード例 #9
0
ファイル: fake1.py プロジェクト: ahmedyes2000/gsitk
    def normalize_data(self):
        """Load the raw TSV, clean the text and drop unavailable entries.

        Returns a ``['polarity', 'text']`` DataFrame; rows whose text
        tokenized to ``['not', 'available']`` are removed. An empty raw
        file is returned as-is.
        """
        raw_data_path = os.path.join(self.data_path,
                                     self.info['properties']['data_file'])

        data = pd.read_csv(
            raw_data_path,
            encoding='utf-8',
            sep='\t',
            index_col=False,
        )

        # Nothing to normalize on an empty frame
        if len(data) < 1:
            return data

        cleaned_text = normalize.normalize_text(data)
        data = pd.concat([data['polarity'], cleaned_text], axis=1)
        data.columns = ['polarity', 'text']

        # Remove text that was not fetched from source
        def keep(tokens):
            return tokens != ['not', 'available']

        data = data.loc[data['text'].apply(keep)].reset_index(drop=True)

        return data
コード例 #10
0
    def normalize_data(self):
        """Load the raw latin-1 CSV and return a ``['polarity', 'text']`` frame.

        Raw polarity codes {0, 2, 4} are mapped to {-1, 0, 1} and the
        text is tokenized and cleaned by ``normalize.normalize_text``.
        """
        raw_datapath = os.path.join(self.data_path,
                                    self.info['properties']['data_file'])

        data = pd.read_csv(
            raw_datapath,
            header=None,
            encoding='latin-1',
            index_col=False,
            names=['polarity', 'id', 'date', 'query', 'user', 'text'])

        # Convert the raw polarity values to a [-1,1] range
        pol_conv = {0: -1, 2: 0, 4: 1}

        # Assign the result instead of replace(..., inplace=True) on a
        # column selection: chained-assignment inplace is deprecated in
        # pandas and stops modifying the frame under copy-on-write.
        data['polarity'] = data['polarity'].replace(pol_conv)

        # Tokenize and clean the text
        text_data = normalize.normalize_text(data)
        data = pd.concat([data['polarity'], text_data], axis=1)

        data.columns = ['polarity', 'text']

        return data
コード例 #11
0
def test_normalize_text(text_df):
    """Check that the fixture sentences tokenize to the expected lists."""
    tokens = normalize.normalize_text(text_df)
    expected_first = ['the', 'cat', 'is', 'on', 'the', 'mat', '.']
    expected_second = ['my', 'dog', 'is', 'running', 'through', 'the',
                       'garden', ',', 'he', 'is', 'so', 'happy', '!', 'smile']
    assert tokens[0] == expected_first
    assert tokens[1] == expected_second
コード例 #12
0
def norm_text(text_df):
    """Tokenize and clean *text_df* via the ``normalize`` helper module."""
    normalized = normalize.normalize_text(text_df)
    return normalized