Example #1
# Assumed imports: Vocabulary comes from fastNLP; get_joke_data is a
# project-local loader that returns a fastNLP DataSet.
from fastNLP import Vocabulary


class JokeData(object):
    data_set = None
    train_data = None
    test_data = None
    vocab = None
    data_num = 0
    vocab_size = 0
    max_seq_len = 0

    def __init__(self, conf):
        print(conf.data_path)
        self.data_set = get_joke_data(conf.data_path)
        self.data_num = len(self.data_set)
        self.data_set.apply(self.split_sent,new_field_name='words')
        self.max_seq_len = min(self.max_seq_len,conf.max_seq_len)
        self.data_set.apply(lambda x : len(x['words']),new_field_name='seq_len')
        self.train_data,self.test_data = self.data_set.split(0.2)

    def split_chinese_sent(self,ins,remove_punc=False):
        line = ins['raw_joke'].strip()
        words = ['<START>']
        for c in line:
            # Optionally drop sentence punctuation; otherwise keep every character.
            if remove_punc and c in [',','。','?','!']:
                continue
            words.append(c)
        words.append('<EOS>')
        self.max_seq_len = max(self.max_seq_len,len(words))
        return words
    
    def split_sent(self,ins,remove_punc=False):
        words = ['<START>'] + ins['raw_joke'].split() + ['<EOS>']
        self.max_seq_len = max(self.max_seq_len,len(words))
        return words

    def pad_seq(self,ins):
        # Left-pad with 0 up to max_seq_len, or truncate longer sequences.
        words = ins['words']
        if len(words) < self.max_seq_len:
            words = [0]*(self.max_seq_len-len(words)) + words
        else:
            words = words[:self.max_seq_len]
        return words
        
    def get_vocab(self):
        self.vocab = Vocabulary(min_freq=10)
        self.train_data.apply(lambda x : [self.vocab.add(word) for word in x['words']])
        self.vocab.build_vocab()
        self.vocab.build_reverse_vocab()
        self.vocab_size = len(self.vocab)

        self.train_data.apply(lambda x : [self.vocab.to_index(word) for word in x['words']],new_field_name='words')
        self.train_data.apply(self.pad_seq,new_field_name='pad_words')
        
        self.test_data.apply(lambda x : [self.vocab.to_index(word) for word in x['words']],new_field_name='words')
        self.test_data.apply(self.pad_seq,new_field_name='pad_words')
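
A minimal usage sketch for JokeData, assuming a conf object that exposes data_path and max_seq_len and the project-local get_joke_data loader; the names below are illustrative only.

# Hypothetical driver for JokeData (illustrative names only).
class Conf:
    data_path = 'data/jokes.txt'   # placeholder path
    max_seq_len = 200

joke_data = JokeData(Conf())
joke_data.get_vocab()   # build the vocabulary, then index and pad both splits
print(joke_data.vocab_size, joke_data.max_seq_len)
print(joke_data.train_data[0]['pad_words'])
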
Example #2
# Assumed imports: DataSet and Vocabulary come from fastNLP; get_all_20news,
# get_text_classification_datasets, tokenize and text2multi_hot are
# project-local helpers.
from fastNLP import DataSet, Vocabulary


class TextData():
    vocab_size = 0
    dataset_size = 0
    train_size = 0
    test_size = 0
    class_num = 4
    min_count = 10
    max_seq_len = 500
    seq_limit = 2000
    data_src = "20news"

    data_set = DataSet()
    train_set = DataSet()
    test_set = DataSet()
    dev_set = DataSet()
    vocab = None


    def __init__(self,data_src="20news",min_count=10,seq_limit=None):
        self.data_src = data_src
        self.min_count = min_count
        if seq_limit is not None:
            self.seq_limit = seq_limit

    def find_max_len(self,words):
        self.max_seq_len = max(len(words),self.max_seq_len)

    def seq_regularize(self,words):
        wlen = len(words)
        if wlen<self.max_seq_len:
            return [0]*(self.max_seq_len-wlen) + words
        else:
            return words[:self.max_seq_len]

    def fetch_20news(self,size=4):
        print("Loading 20newsgroups data and tokenize.")
        if size==20:
            train,test = get_all_20news()
        else:
            train,test = get_text_classification_datasets()
        train_input,test_input = tokenize(train.data,test.data)
        train_target = train.target
        test_target = test.target
        self.class_num = len(train.target_names)
        assert (self.class_num == len(test.target_names))

        # Building Fastnlp dataset.
        print("Building Fastnlp dataset.")
        self.train_set = DataSet({"text":train_input,"class":train_target})
        self.test_set = DataSet({"text":test_input,"class":test_target})
        
        # Building Fastnlp vocabulary...
        print("Building Fastnlp vocabulary.")
        self.vocab = Vocabulary(min_freq=self.min_count)
        self.train_set.apply(lambda x : [self.vocab.add_word(word) for word in x['text']])
        self.vocab.build_vocab()
        self.vocab.build_reverse_vocab()
        self.vocab_size = len(self.vocab)
        # Building word-index representation for train_set and test_set.
        print("Building id representation for train_set and test_set.")
        self.vocab.index_dataset(self.train_set,self.test_set,field_name='text',new_field_name='words')
        
        self.train_set.apply_field(lambda x : len(x),field_name='words',new_field_name='seq_len')
        self.test_set.apply_field(lambda x : len(x),field_name='words',new_field_name='seq_len')
        self.train_set.apply_field(self.find_max_len,field_name='words')

        print(self.max_seq_len)
        self.max_seq_len = min(self.max_seq_len,self.seq_limit)

        self.train_set.apply_field(self.seq_regularize,field_name='words',new_field_name='words')
        self.test_set.apply_field(self.seq_regularize,field_name='words',new_field_name='words')
        # self.train_set.apply(lambda x : text2multi_hot(x['words'],self.vocab_size),new_field_name="input")
        # self.test_set.apply(lambda x : text2multi_hot(x['words'],self.vocab_size),new_field_name='input')
        
        # Building target-vector for train_set and test_set.
        print("Building target-vector for train_set and test_set.")
        self.train_set.apply(lambda x : int(x['class']),new_field_name="target",is_target=True)
        self.test_set.apply(lambda x : int(x['class']),new_field_name="target",is_target=True)
        # self.train_set.apply(lambda x : class2target(x['class'],self.class_num),new_field_name="target")
        # self.test_set.apply(lambda x : class2target(x['class'],self.class_num),new_field_name="target")

    def fetch_csv(self,path=None):
        print("Not implemented yet...")

    def fetch_data(self,path=None):
        if self.data_src == "20news":
            # Load the 20newsgroups data and tokenize it.
            self.fetch_20news()
        elif self.data_src == "20news_all":
            self.fetch_20news(size=20)
        else:
            print("No data src...")
        
        self.train_size = self.train_set.get_length()
        self.test_size = self.test_set.get_length()
        return self.train_size,self.test_size
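
A small usage sketch for this 20news TextData class; it assumes the project-local loaders (get_text_classification_datasets, get_all_20news) and tokenize are importable, so the call below is illustrative rather than definitive.

# Hypothetical driver for the 20news TextData class above.
text_data = TextData(data_src="20news", min_count=10, seq_limit=500)
train_size, test_size = text_data.fetch_data()
print("vocab: {}, train: {}, test: {}".format(
    text_data.vocab_size, train_size, test_size))
print(text_data.train_set[0]['words'][:20], text_data.train_set[0]['target'])
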
Example #3
# Assumed imports: pandas, numpy, imblearn's RandomOverSampler and fastNLP's
# DataSet/Vocabulary; tokenize is a project-local helper.
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from fastNLP import DataSet, Vocabulary


class TextData():
    data_src = "all_data"
    class_num = 2
    min_count = 10
    max_seq_len = 500
    seq_limit = 2000

    train_set = DataSet()
    val_set = DataSet()
    test_set = DataSet()
    train_size = 0
    val_size = 0
    test_size = 0

    test_projectid = None

    vocab = None
    vocab_size = 0

    def __init__(self, data_src="all_data", min_count=10, seq_limit=None):
        self.data_src = data_src
        self.min_count = min_count
        if seq_limit is not None:
            self.seq_limit = seq_limit

    def find_max_len(self, words):
        self.max_seq_len = max(len(words), self.max_seq_len)

    def seq_regularize(self, words):
        wlen = len(words)
        if wlen < self.max_seq_len:
            return [0] * (self.max_seq_len - wlen) + words
        else:
            return words[:self.max_seq_len]

    def fetch_csv(self,
                  path,
                  text_var="essay",
                  target="is_exciting",
                  subset_num=None,
                  us_rate=None,
                  os_rate=None):
        """ 
        us_rate: under sampling rate
        os_rate: over sampling rate
         """
        print("Loading data from {} ...".format(path))
        df = pd.read_csv(path)
        # text_vars=["title", "short_description", "need_statement", "essay"]
        text_vars = text_var  # only select the essay column
        target_var = "y"
        df[target_var] = 0.0
        df[target_var][df[target] == "t"] = 1.0
        df[target_var][df[target] != "t"] = 0.0
        train_df = df[df['split'] == 'train']
        val_df = df[df['split'] == 'val']
        test_df = df[df['split'] == 'test']
        train_num = len(train_df)
        val_num = len(val_df)
        test_num = len(test_df)
        print("nums:({},{},{})".format(train_num, val_num, test_num))
        if os_rate is not None:
            print("Over Sample mode")
            ros = RandomOverSampler(random_state=0)
        elif us_rate is not None:
            print("Under Sample mode")
            train_df_t = train_df[df[target] == "t"]
            train_df_f = train_df[df[target] == "f"]
            t_num = len(train_df_t)
            f_num = len(train_df_f)
            print("Raw train t:f = {}:{}".format(t_num, f_num))
            nf_num = int(t_num / us_rate)
            f_num = min(nf_num, f_num)
            balanced_train_t = train_df_t.sample(n=t_num)
            balanced_train_f = train_df_f.sample(n=f_num)
            train_df = pd.concat([balanced_train_t,
                                  balanced_train_f]).sample(frac=1)
            print("Balanced train: t:f = {}:{}".format(len(balanced_train_t),
                                                       len(balanced_train_f)))
            # print("Train 1.0:",len(train_df[train_df[target_var] == 1.0]))

            val_df_t = val_df[df[target] == "t"]
            val_df_f = val_df[df[target] == "f"]
            t_num = len(val_df_t)
            f_num = len(val_df_f)
            print("Raw val t:f = {}:{}".format(t_num, f_num))
            nf_num = int(t_num / us_rate)
            f_num = min(nf_num, f_num)
            balanced_val_t = val_df_t.sample(n=t_num)
            balanced_val_f = val_df_f.sample(n=f_num)
            val_df = pd.concat([balanced_val_t, balanced_val_f]).sample(frac=1)
            print("Balanced val: t:f = {}:{}".format(len(balanced_val_t),
                                                     len(balanced_val_f)))
        else:
            print("No sample mode")
        if subset_num is not None and subset_num > 0:
            print("Get sub set of size {}.".format(subset_num))
            train_df = train_df.sample(n=subset_num)
            val_df = val_df.sample(n=subset_num)

        train_num = len(train_df)
        val_num = len(val_df)
        test_num = len(test_df)
        print("subset nums:({},{},{})".format(train_num, val_num, test_num))

        train_target = train_df[target_var].values
        val_target = val_df[target_var].values
        test_target = test_df[target_var].values

        print("tokenize train set")
        train_input = tokenize(train_df[text_vars].values)
        print("tokenize val set")
        val_input = tokenize(val_df[text_vars].values)
        print("tokenize test set")
        test_input = tokenize(test_df[text_vars].values)

        assert (self.class_num == 2)
        self.test_projectid = test_df['projectid']
        # Building Fastnlp dataset.
        print("Building Fastnlp dataset.")
        if os_rate is not None:
            print("Over Sampling...")
            # Note: newer imblearn releases renamed fit_sample to fit_resample.
            train_input, train_target = ros.fit_sample(
                np.array(train_input)[:, np.newaxis],
                np.array(train_target)[:, np.newaxis])
            train_input = train_input.squeeze().tolist()
            train_target = train_target.tolist()
            val_input, val_target = ros.fit_sample(
                np.array(val_input)[:, np.newaxis],
                np.array(val_target)[:, np.newaxis])
            val_input = val_input.squeeze().tolist()
            val_target = val_target.tolist()
        self.train_set = DataSet({"text": train_input, "class": train_target})
        self.val_set = DataSet({"text": val_input, "class": val_target})
        self.test_set = DataSet({"text": test_input, "class": test_target})

        # Building Fastnlp vocabulary...
        print("Building Fastnlp vocabulary.")
        self.vocab = Vocabulary(min_freq=self.min_count)
        self.train_set.apply(
            lambda x: [self.vocab.add_word(word) for word in x['text']])
        self.vocab.build_vocab()
        self.vocab.build_reverse_vocab()
        self.vocab_size = len(self.vocab)
        # Building word-index representation for train_set, val_set and test_set.
        print("Building id representation for train/val/test sets.")
        self.vocab.index_dataset(self.train_set,
                                 self.val_set,
                                 self.test_set,
                                 field_name='text',
                                 new_field_name='words')

        self.train_set.apply_field(lambda x: len(x),
                                   field_name='words',
                                   new_field_name='seq_len')
        self.val_set.apply_field(lambda x: len(x),
                                 field_name='words',
                                 new_field_name='seq_len')
        self.test_set.apply_field(lambda x: len(x),
                                  field_name='words',
                                  new_field_name='seq_len')
        self.train_set.apply_field(self.find_max_len, field_name='words')

        print(self.max_seq_len)
        self.max_seq_len = min(self.max_seq_len, self.seq_limit)

        self.train_set.apply_field(self.seq_regularize,
                                   field_name='words',
                                   new_field_name='words')
        self.val_set.apply_field(self.seq_regularize,
                                 field_name='words',
                                 new_field_name='words')
        self.test_set.apply_field(self.seq_regularize,
                                  field_name='words',
                                  new_field_name='words')
        # self.train_set.apply(lambda x : text2multi_hot(x['words'],self.vocab_size),new_field_name="input")
        # self.val_set.apply(lambda x : text2multi_hot(x['words'],self.vocab_size),new_field_name='input')
        # self.test_set.apply(lambda x : text2multi_hot(x['words'],self.vocab_size),new_field_name='input')

        # Building target-vector for train_set and test_set.
        print("Building target-vector for train_set and test_set.")
        self.train_set.apply(lambda x: int(x['class']),
                             new_field_name="target",
                             is_target=True)
        self.val_set.apply(lambda x: int(x['class']),
                           new_field_name="target",
                           is_target=True)
        self.test_set.apply(lambda x: int(x['class']),
                            new_field_name="target",
                            is_target=True)
        # self.train_set.apply(lambda x : class2target(x['class'],self.class_num),new_field_name="target")
        # self.test_set.apply(lambda x : class2target(x['class'],self.class_num),new_field_name="target")

    def fetch_data(self,
                   path,
                   text_var="essay",
                   target_var="is_exciting",
                   subset_num=None,
                   us_rate=None,
                   os_rate=None):
        if self.data_src == "all_data":
            # Load the CSV data and tokenize it.
            self.fetch_csv(path, text_var, target_var, subset_num, us_rate,
                           os_rate)
        else:
            raise ValueError(
                "No legal data src type: {} ...".format(self.data_src))

        self.train_size = self.train_set.get_length()
        self.val_size = self.val_set.get_length()
        self.test_size = self.test_set.get_length()
        return self.train_size, self.val_size, self.test_size
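
A usage sketch for the CSV-backed variant; the file path and sampling rate are placeholders, and tokenize is again assumed to be a project-local helper.

# Hypothetical driver for the CSV-backed TextData class above.
# 'data/projects.csv' and the sampling rate are placeholders.
text_data = TextData(data_src="all_data", min_count=10, seq_limit=1000)
train_size, val_size, test_size = text_data.fetch_data(
    "data/projects.csv",
    text_var="essay",
    target_var="is_exciting",
    us_rate=1.0)   # keep roughly one "f" row per "t" row in train/val
print(text_data.vocab_size, train_size, val_size, test_size)
print(text_data.test_projectid.head())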