Example #1
import numpy

# print_dataset_statistics, labels_to_categories and categories_to_onehot are
# helper functions defined elsewhere in this project.
def prepare_dataset(X, y, pipeline, y_one_hot=True, y_as_is=False):
    try:
        print_dataset_statistics(y)
    except Exception:  # the stats printout is informational; never abort on it
        pass

    X = pipeline.fit_transform(X)

    if y_as_is:
        try:
            return X, numpy.asarray(y, dtype=float)
        except (TypeError, ValueError):  # labels that cannot be cast to float
            return X, y

    # 1 - Labels to categories
    y_cat = labels_to_categories(y)

    if y_one_hot:
        # 2 - Labels to one-hot vectors
        return X, categories_to_onehot(y_cat)

    return X, y_cat
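
A minimal usage sketch, not part of the original source: it assumes a scikit-learn-style pipeline object exposing fit_transform (such as the pipelines built in the loaders below) and plain string labels; the variable names are illustrative only.

texts = ["some preprocessed text", "another text", "a third one"]
labels = ["positive", "negative", "neutral"]
X, y = prepare_dataset(texts, labels, pipeline, y_one_hot=True)
# X: features produced by pipeline.fit_transform(texts)
# y: one-hot label matrix, one row per observation
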
Example #2

    # __init__ of a data-loader class (the enclosing class statement is not
    # shown in this snippet); it relies on pickle, a scikit-learn Pipeline and
    # the project's EmbeddingsExtractor.
    def __init__(self,
                 word_indices,
                 text_lengths,
                 loading_data=True,
                 datafolder="",
                 preprocess_typ="ekphrasis",
                 **kwargs):
        self.word_indices = word_indices
        self.y_one_hot = kwargs.get("y_one_hot", True)

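        # Maps each (already preprocessed) text to a fixed-length sequence of
        # word indices taken from word_indices; out-of-vocabulary words are
        # assigned random indices (unk_policy="random").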
        self.pipeline = Pipeline([
            ('ext',
             EmbeddingsExtractor(word_indices=word_indices,
                                 max_lengths=text_lengths,
                                 add_tokens=True,
                                 unk_policy="random"))
        ])
        if loading_data:
            print("Loading data...")
            # Load the preprocessed train/test splits; "with" ensures the
            # pickle files are closed after reading.
            with open("{}X_train_{}.pickle".format(datafolder, preprocess_typ),
                      "rb") as f:
                self.X_train = pickle.load(f)
            with open("{}X_test_{}.pickle".format(datafolder, preprocess_typ),
                      "rb") as f:
                self.X_test = pickle.load(f)
            with open("{}y_train_{}.pickle".format(datafolder, preprocess_typ),
                      "rb") as f:
                self.y_train = pickle.load(f)
            with open("{}y_test_{}.pickle".format(datafolder, preprocess_typ),
                      "rb") as f:
                self.y_test = pickle.load(f)

            print(
                "-------------------\ntraining set stats\n-------------------")
            print_dataset_statistics(self.y_train)
            print("-------------------")
Example #3
    def __init__(self,
                 word_indices,
                 text_lengths,
                 subtask="A",
                 silver=False,
                 **kwargs):

        self.word_indices = word_indices

        filter_classes = kwargs.get("filter_classes", None)
        self.y_one_hot = kwargs.get("y_one_hot", True)

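        # Two-stage pipeline: (1) ekphrasis-based normalization and
        # tokenization of raw tweets, (2) conversion of the resulting tokens
        # into fixed-length sequences of word indices.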
        self.pipeline = Pipeline([
            ('preprocess',
             CustomPreProcessor(
                 TextPreProcessor(
                     backoff=[
                         'url', 'email', 'percent', 'money', 'phone', 'user',
                         'time', 'date', 'number'
                     ],
                     include_tags={
                         "hashtag", "allcaps", "elongated", "repeated",
                         'emphasis', 'censored'
                     },
                     fix_html=True,
                     segmenter="twitter",
                     corrector="twitter",
                     unpack_hashtags=True,
                     unpack_contractions=True,
                     spell_correct_elong=False,
                     tokenizer=SocialTokenizer(lowercase=True).tokenize,
                     dicts=[emoticons]))),
            ('ext',
             EmbeddingsExtractor(word_indices=word_indices,
                                 max_lengths=text_lengths,
                                 add_tokens=(False,
                                             True) if subtask != "A" else True,
                                 unk_policy="random"))
        ])

        # loading data
        print("Loading data...")
        dataset = SemEvalDataLoader(verbose=False).get_data(task=subtask,
                                                            years=None,
                                                            datasets=None,
                                                            only_semeval=True)
        random.Random(42).shuffle(dataset)  # fixed seed for reproducible order

        if filter_classes:
            dataset = [d for d in dataset if d[0] in filter_classes]

        self.X = [obs[1] for obs in dataset]
        self.y = [obs[0] for obs in dataset]
        print("total observations:", len(self.y))

        print("-------------------\ntraining set stats\n-------------------")
        print_dataset_statistics(self.y)
        print("-------------------")

        if silver:
            print("Loading silver data...")
            dataset = SemEvalDataLoader().get_silver()
            self.silver_X = [obs[1] for obs in dataset]
            self.silver_y = [obs[0] for obs in dataset]
            print("total observations:", len(self.silver_y))