Example #1
def load_instances(config, instances):
    for instance_config in config["REST_instances"]:
        instance = Instance(instance_config["name"],
                            instance_config["language"],
                            instance_config["embeddings_path"],
                            instance_config["preprocessing_style"],
                            instance_config["model_path"],
                            instance_config["labels"])

        instance.text_processor = TextPreProcessor(
            # terms that will be normalized
            normalize=[
                'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
                'url', 'date', 'number'
            ],
            # terms that will be annotated
            annotate={
                "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
                'censored'
            },
            fix_html=True,  # fix HTML tokens

            # corpus from which the word statistics are going to be used
            # for word segmentation
            segmenter=instance_config["preprocessing_style"],

            # corpus from which the word statistics are going to be used
            # for spell correction
            corrector=instance_config["preprocessing_style"],
            unpack_hashtags=True,  # perform word segmentation on hashtags
            unpack_contractions=True,  # Unpack contractions (can't -> can not)
            spell_correct_elong=False,  # spell correction for elongated words

            # select a tokenizer. You can use SocialTokenizer, or pass your own
            # the tokenizer, should take as input a string and return a list of tokens
            tokenizer=SocialTokenizer(lowercase=True).tokenize,

            # list of dictionaries, for replacing tokens extracted from the text,
            # with other expressions. You can pass more than one dictionaries.
            dicts=[emoticons])

        instance.itos, instance.stoi, instance.vectors, instance.embeddings_size = \
            load_embeddings(instance.embeddings_path)

        instance.text = data.Field()
        instance.text.build_vocab([instance.itos])
        instance.text.vocab.set_vectors(instance.stoi, instance.vectors,
                                        instance.embeddings_size)

        instance.model = torch.load(
            instance.model_path,
            map_location='cpu' if not cuda_available else None)
        instance.model = instance.model.eval()
        instances[instance_config["name"]] = instance
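A hedged sketch of the configuration shape this loader appears to expect, inferred from the key lookups above; the names, paths and labels are placeholders, not values from the original project.

example_config = {
    "REST_instances": [
        {
            "name": "english_sentiment",          # placeholder values
            "language": "en",
            "embeddings_path": "embeddings/word2vec.txt",
            "preprocessing_style": "twitter",
            "model_path": "models/sentiment.pt",
            "labels": ["negative", "neutral", "positive"],
        }
    ]
}

instances = {}
# load_instances(example_config, instances)  # would fill `instances` keyed by name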
Example #2
    def __init__(
            self,
            liwc_path: str = '',
            emolex_path: str = 'english_emolex.csv',
            estimator_path: str = 'english_twitter_politeness_estimator.joblib',
            feature_defn_path: str = 'english_twitter_additional_features.pickle',
            countVectorizer_path: str = '') -> None:
        # Preload LIWC dictionary:
        if liwc_path:
            liwc_df = pd.read_csv(liwc_path)
            liwc_df['*'] = liwc_df['term'].str.endswith('*')
            liwc_df['t'] = liwc_df['term'].str.rstrip('*')
            self.liwc_prefx = liwc_df[liwc_df['*']].groupby(
                'category')['t'].apply(set)
            self.liwc_whole = liwc_df[~liwc_df['*']].groupby(
                'category')['t'].apply(set)
            self.use_liwc = True

        # Preload EmoLex dictionary:
        emolex_df = pd.read_csv(emolex_path, index_col=0)
        self.emolex = emolex_df.apply(lambda s: set(s[s == 1].index))

        # Preload additional feature rules:
        pltlex = pd.read_pickle(feature_defn_path)
        types = pltlex.apply(type)
        self.pltlex_ptn = pltlex[types == re.Pattern].to_dict()
        self.pltlex_set = pltlex[types == set].to_dict()

        # Initialize Tokenizer:
        self.text_processor = TextPreProcessor(
            # terms that will be normalized:
            normalize=[
                'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
                'url', 'date', 'number'
            ],
            # terms that will be annotated:
            annotate={
                "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
                'censored'
            },
            # perform word segmentation on hashtags:
            unpack_hashtags=False,
            # Unpack contractions (can't -> can not):
            unpack_contractions=True,
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
        )
        # preload classifier:
        self.clf = joblib.load(estimator_path)

        if countVectorizer_path:
            self.counter = joblib.load(countVectorizer_path)
            self.use_cntVec = True
Example #3
    def __init__(self):
        self.label2emotion = {0: "others", 1: "happy", 2: "sad", 3: "angry"}
        self.emotion2label = {"others": 0, "happy": 1, "sad": 2, "angry": 3}

        self.emoticons_additional = {
            '(^・^)': '<happy>',
            ':‑c': '<sad>',
            '=‑d': '<happy>',
            ":'‑)": '<happy>',
            ':‑d': '<laugh>',
            ':‑(': '<sad>',
            ';‑)': '<happy>',
            ':‑)': '<happy>',
            ':\\/': '<sad>',
            'd=<': '<annoyed>',
            ':‑/': '<annoyed>',
            ';‑]': '<happy>',
            '(^�^)': '<happy>',
            'angru': 'angry',
            "d‑':": '<annoyed>',
            ":'‑(": '<sad>',
            ":‑[": '<annoyed>',
            '(�?�)': '<happy>',
            'x‑d': '<laugh>',
        }

        self.text_processor = TextPreProcessor(
            # terms that will be normalized
            normalize=[
                'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
                'url', 'date', 'number'
            ],
            # terms that will be annotated
            annotate={
                "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
                'censored'
            },
            fix_html=True,  # fix HTML tokens
            # corpus from which the word statistics are going to be used
            # for word segmentation
            segmenter="twitter",
            # corpus from which the word statistics are going to be used
            # for spell correction
            corrector="twitter",
            unpack_hashtags=True,  # perform word segmentation on hashtags
            unpack_contractions=True,  # Unpack contractions (can't -> can not)
            spell_correct_elong=True,  # spell correction for elongated words
            # select a tokenizer. You can use SocialTokenizer, or pass your own
            # the tokenizer, should take as input a string and return a list of tokens
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            # list of dictionaries, for replacing tokens extracted from the text,
            # with other expressions. You can pass more than one dictionaries.
            dicts=[emoticons, self.emoticons_additional])
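A minimal, hedged sketch of how an extra emoticon dictionary is merged with ekphrasis' built-in one; the sample text and the expected tokens are illustrative only.

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

extra = {'angru': 'angry'}  # tiny stand-in for the emoticons_additional mapping above
proc = TextPreProcessor(
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[emoticons, extra])

print(proc.pre_process_doc("i am so angru :-)"))
# tokens found in either dictionary are replaced by their mapping,
# roughly ['i', 'am', 'so', 'angry', '<happy>']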
Example #4
    def __init__(self, args):
        if args.datastories:
            tokenizer = SocialTokenizer(lowercase=True)
        else:
            tokenizer = TweetTokenizer()
        self.RAW = data.RawField()
        self.TEXT = data.Field(batch_first=True,
                               include_lengths=True,
                               lower=True,
                               tokenize=tokenizer.tokenize)
        self.LABEL = data.Field(sequential=False, unk_token=None)

        self.train, self.dev, self.test = datasets.EMO.splits(
            args, self.RAW, self.TEXT, self.LABEL, args.train_data_path,
            args.valid_data_path, args.test_data_path)

        self.TEXT.build_vocab(self.train,
                              self.dev,
                              self.test,
                              vectors=GloVe(name='840B', dim=300))

        if args.fasttext:
            self.FASTTEXT = data.Field(batch_first=True,
                                       include_lengths=True,
                                       lower=True,
                                       tokenize=tokenizer.tokenize)
            self.FASTTEXT.vocab = copy.deepcopy(self.TEXT.vocab)
            self.FASTTEXT.vocab.set_vectors(self.FASTTEXT.vocab.stoi,
                                            vectors=FastText(language='en'),
                                            dim=300)
        self.LABEL.build_vocab(self.train)

        self.train_iter, self.dev_iter, self.test_iter = \
            data.BucketIterator.splits((self.train, self.dev, self.test),
                                       batch_size=args.batch_size,
                                       device=args.device,
                                       repeat=False)

        self.max_word_len = max([len(w) for w in self.TEXT.vocab.itos])
        # for <pad>
        self.char_vocab = {'': 0}
        # for <unk> and <pad>
        self.characterized_words = [[0] * self.max_word_len,
                                    [0] * self.max_word_len]

        if args.char_emb:
            self.build_char_vocab()

        with open('./data/vocab.obj', 'wb') as filehandler:
            pickle.dump(self.TEXT.vocab, filehandler)
        with open('./data/label.obj', 'wb') as filehandler:
            pickle.dump(self.LABEL.vocab, filehandler)
Example #5
    def __init__(self, word_indices, text_lengths, **kwargs):

        self.word_indices = word_indices

        filter_classes = kwargs.get("filter_classes", None)
        self.y_one_hot = kwargs.get("y_one_hot", True)

        self.pipeline = Pipeline([
            ('preprocess',
             CustomPreProcessor(
                 TextPreProcessor(
                     backoff=[
                         'url', 'email', 'percent', 'money', 'phone', 'user',
                         'time', 'url', 'date', 'number'
                     ],
                     include_tags={
                         "hashtag", "allcaps", "elongated", "repeated",
                         'emphasis', 'censored'
                     },
                     fix_html=True,
                     segmenter="twitter",
                     corrector="twitter",
                     unpack_hashtags=True,
                     unpack_contractions=True,
                     spell_correct_elong=False,
                     tokenizer=SocialTokenizer(lowercase=True).tokenize,
                     dicts=[emoticons]))),
            ('ext',
             EmbeddingsExtractor(word_indices=word_indices,
                                 max_lengths=text_lengths,
                                 add_tokens=True,
                                 unk_policy="random"))
        ])

        # loading data
        print("Loading data...")
        dataset = DataLoader(verbose=False).get_data(years=None, datasets=None)
        random.Random(42).shuffle(dataset)

        if filter_classes:
            dataset = [d for d in dataset if d[0] in filter_classes]

        self.X = [obs[1] for obs in dataset]
        self.y = [obs[0] for obs in dataset]
        print("total observations:", len(self.y))

        print("-------------------\ntraining set stats\n-------------------")
        print_dataset_statistics(self.y)
        print("-------------------")
Example #6
def tokenizer(tweet):
    """Returns the tokenized sentence using a tokenizer specially
    designed for social network content, that can handle complex
    emoticons, emojis and other unstructured expressions like dates,
    times and more.

    Args:
        tweet (str) : the original tweet.

    Returns:
        tokenized_tweet (str) : the tokenized tweet.

    """
    social_tokenizer = SocialTokenizer(lowercase=False).tokenize
    return " ".join(s for s in social_tokenizer(tweet))
Example #7
def twitter_preprocessor():
    preprocessor = TextPreProcessor(
        normalize=['url', 'email', 'phone', 'user'],
        annotate={
            "hashtag", "elongated", "allcaps", "repeated", 'emphasis',
            'censored'
        },
        all_caps_tag="wrap",
        fix_text=False,
        segmenter="twitter_2018",
        corrector="twitter_2018",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize).pre_process_doc
    return preprocessor
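A hedged usage sketch of the preprocessor returned above; the tweet is illustrative, and the first call may download the twitter_2018 word statistics used by the segmenter and corrector.

preprocess = twitter_preprocessor()
tokens = preprocess("@user I looooove this!!! https://t.co/xyz")
# a list of tokens in which mentions, URLs and elongations are replaced or
# annotated with tags such as <user>, <url> and <elongated>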
Example #8
def emotion_and_split():

    text_process = TextPreProcessor(
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons]
    )

    return text_process
Example #9
    def __init__(self):
        self.root_dir = "CrisisLexT26/"
        self.count = 0
        self.natural_disasters = []
        self.non_natural_disasters = []

        self.prep_natural_disasters = []
        self.prep_non_natural_disasters = []

        self.nat_labels = []
        self.non_natural_labels = []

        self.en_prep_nat_tweets = []
        self.en_prep_non_nat_tweets = []

        self.text_processor = TextPreProcessor(
            # terms that will be normalized
            normalize=[
                'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
                'url', 'date', 'number'
            ],
            # terms that will be annotated
            annotate={
                "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
                'censored'
            },
            fix_html=True,  # fix HTML tokens

            # corpus from which the word statistics are going to be used
            # for word segmentation
            segmenter="twitter",

            # corpus from which the word statistics are going to be used
            # for spell correction
            corrector="twitter",
            unpack_hashtags=True,  # perform word segmentation on hashtags
            unpack_contractions=True,  # Unpack contractions (can't -> can not)
            spell_correct_elong=True,  # spell correction for elongated words

            # select a tokenizer. You can use SocialTokenizer, or pass your own
            # the tokenizer, should take as input a string and return a list of tokens
            tokenizer=SocialTokenizer(lowercase=True).tokenize,

            # list of dictionaries, for replacing tokens extracted from the text,
            # with other expressions. You can pass more than one dictionaries.
            dicts=[emoticons])
Example #10
    def __init__(self, **kwargs):
        self.text_processor = TextPreProcessor(
            omit=kwargs.get('normalize', []),
            normalize=kwargs.get(
                'normalize',
                ['url', 'email', 'phone', 'user', 'time', 'url', 'date']),
            annotate=kwargs.get('annotate', {}),
            fix_html=kwargs.get('fix_html', True),
            segmenter=kwargs.get('segmenter', "twitter"),
            corrector=kwargs.get('corrector', "twitter"),
            unpack_hashtags=kwargs.get('unpack_hashtags', True),
            unpack_contractions=kwargs.get('unpack_contractions', True),
            spell_correct_elong=kwargs.get('fix_elongation', True),
            spell_correction=kwargs.get('spell_correction', True),
            fix_bad_unicode=kwargs.get('fix_bad_unicode', True),
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            dicts=[emoticons])
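A hedged usage sketch; `Preprocessor` is a placeholder for the class this __init__ belongs to (its name is not shown in the snippet), and the call illustrates how keyword arguments override the defaults above.

p = Preprocessor(annotate={"hashtag", "elongated"},   # hypothetical class name
                 unpack_hashtags=False,
                 fix_elongation=False)
# any option not passed falls back to the default given to kwargs.get above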
Example #11
def datastories_processor(x):
    from ekphrasis.dicts.emoticons import emoticons
    from ekphrasis.classes.tokenizer import SocialTokenizer
    from ekphrasis.classes.preprocessor import TextPreProcessor

    text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=[
            'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'url',
            'date', 'number'
        ],
        # terms that will be annotated
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
            'censored'
        },
        fix_html=True,  # fix HTML tokens

        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",

        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # Unpack contractions (can't -> can not)
        spell_correct_elong=False,  # spell correction for elongated words

        # select a tokenizer. You can use SocialTokenizer, or pass your own
        # the tokenizer, should take as input a string and return a list of tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,

        # list of dictionaries, for replacing tokens extracted from the text,
        # with other expressions. You can pass more than one dictionaries.
        dicts=[emoticons])

    x = [text_processor.pre_process_doc(sent) for sent in x]
    # join each token list back into a whitespace-separated string
    temp = [" ".join(sent) for sent in x]

    return temp
Example #12
def bow_model(task, max_features=10000):
    if task == "clf":
        algo = LogisticRegression(C=0.6,
                                  random_state=0,
                                  class_weight='balanced')
    elif task == "reg":
        algo = SVR(kernel='linear', C=0.6)
    else:
        raise ValueError("invalid task!")

    word_features = TfidfVectorizer(
        ngram_range=(1, 1),
        tokenizer=lambda x: x,
        analyzer='word',
        min_df=5,
        # max_df=0.9,
        lowercase=False,
        use_idf=True,
        smooth_idf=True,
        max_features=max_features,
        sublinear_tf=True)
    preprocessor = TextPreProcessor(
        backoff=[
            'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'url',
            'date', 'number'
        ],
        include_tags={
            "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
            'censored'
        },
        fix_html=True,
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons])

    model = Pipeline([('preprocess',
                       CustomPreProcessor(preprocessor, to_list=True)),
                      ('bow-feats', word_features),
                      ('normalizer', Normalizer(norm='l2')), ('clf', algo)])

    return model
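A hedged usage sketch of the pipeline factory above; load_texts and load_labels are hypothetical helpers standing in for whatever data loading the project uses, and CustomPreProcessor is assumed to accept raw strings.

train_texts = load_texts("train.tsv")      # hypothetical loaders
train_labels = load_labels("train.tsv")

model = bow_model("clf", max_features=5000)
model.fit(train_texts, train_labels)
predictions = model.predict(load_texts("test.tsv"))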
Example #13
class Const:
    MODEL_NAME = "cardiffnlp/twitter-roberta-base"
    MAX_TOKEN_LEN = 128
    SPECIAL_TOKENS = [
        "<head>",
        "</head>",
        "<tail>",
        "</tail>",
        "<url>",
        "<user>",
        "<date>",
        "<number>",
        "<money>",
        "<email>",
        "<percent>",
        "<phone>",
        "<time>",
        "<hashtag>",
        "</hashtag>",
    ]
    NORMALIZE = [
        "url",
        "email",
        "percent",
        "money",
        "phone",
        "user",
        "time",
        "url",
        "date",
        "number",
    ]

    TEXT_PROCESSOR_ARGS = dict(
        normalize=NORMALIZE,
        annotate={"hashtag"},
        fix_html=True,
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons],
    )
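A hedged sketch of how these constants might be wired together: build the ekphrasis processor from TEXT_PROCESSOR_ARGS and register SPECIAL_TOKENS with a Hugging Face tokenizer. This is an assumption about intent, not code from the original file.

from transformers import AutoTokenizer
from ekphrasis.classes.preprocessor import TextPreProcessor

text_processor = TextPreProcessor(**Const.TEXT_PROCESSOR_ARGS)

tokenizer = AutoTokenizer.from_pretrained(Const.MODEL_NAME)
tokenizer.add_special_tokens(
    {"additional_special_tokens": Const.SPECIAL_TOKENS})
# a model loaded later would also need:
# model.resize_token_embeddings(len(tokenizer))

text = " ".join(text_processor.pre_process_doc("Check https://t.co/x #nlp"))
encoded = tokenizer(text, truncation=True, max_length=Const.MAX_TOKEN_LEN)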
Example #14
def preprocess_(dataset):
    preprocessor = TextPreProcessor(
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time',
                   'date', 'number'],
        annotate={"hashtag", "elongated", "allcaps", "repeated", 'emphasis',
                  'censored'},
        all_caps_tag="wrap",
        fix_text=True,
        segmenter="twitter_2018",
        corrector="twitter_2018",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons]
    ).pre_process_doc
    return [preprocessor(x) for x in dataset]
Example #15
def twitter_preprocess():
    preprocessor = TextPreProcessor(
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time',
                   'date', 'number'],
        annotate={"hashtag", "elongated", "allcaps", "repeated", 'emphasis',
                  'censored'},
        all_caps_tag="wrap",
        fix_text=True,
        segmenter="twitter_2018",
        corrector="twitter_2018",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons]
    ).pre_process_doc

    def preprocess(name, dataset):
        desc = "PreProcessing dataset {}...".format(name)

        data = [preprocessor(x)
                for x in tqdm(dataset, desc=desc, total=len(dataset))]

        return data

    def parallel_preprocess(name, dataset):
        N = len(dataset)
        batchsize = 1000
        n_splits = N // batchsize + (1 if N % batchsize > 0 else 0)
        batches = (dataset[i*batchsize:(i+1)*batchsize] for i in range(n_splits))
        data = []
        with Pool(processes=6) as p:
            for result in tqdm(p.imap(preprocess_, batches), total=n_splits):
                data += result
        return data

    # return preprocess
    return parallel_preprocess
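A hedged usage sketch of the closure returned above; the tweets are placeholders, and the __main__ guard is included because the parallel variant spawns worker processes.

if __name__ == "__main__":
    preprocess = twitter_preprocess()
    tweets = ["so haaappy today :)", "@user check this out!!"]  # placeholders
    tokenized = preprocess("demo", tweets)
    # each element is the token list from pre_process_doc; inputs are handled
    # in batches of 1000 across 6 worker processes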
Example #16
    def create_preprocessor(self):
        preprocessor = TextPreProcessor(
            normalize=[
                'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
                'url', 'date', 'number'
            ],
            annotate={
                "hashtag", "allcaps", "elongated", 'emphasis', 'censored'
            },
            fix_html=True,
            segmenter='twitter',
            corrector='twitter',
            unpack_hashtags=True,
            unpack_contractions=True,
            spell_correct_elong=True,
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            dicts=[emoticons])

        return preprocessor
Example #17
def ekphrasis_config(text):

    social_tokenizer = SocialTokenizer(lowercase=True).tokenize
    str_list = social_tokenizer(text)
    # print(str_list)
    # for index in range(len(str_list)):
    #     str_list[index] = sp.correct(str_list[index])

    # for index in range(len(str_list)):
    #     if str_list[index] in EMOTICONS_TOKEN.keys():
    #         str_list[index] = EMOTICONS_TOKEN[str_list[index]]
    # for index in range(len(str_list)):
    #     if str_list[index] in EMOTICONS_TOKEN.keys():
    #         str_list[index] = EMOTICONS_TOKEN[str_list[index]][1:len(EMOTICONS_TOKEN[str_list[index]]) - 1]
    #
    # for index in range(len(str_list)):
    #     if str_list[index] in LOGOGRAM.keys():
    #         str_list[index] = LOGOGRAM[str_list[index]]

    return str_list
Example #18
    def __init__(self):

        self.text_processor_options = TextPreProcessor(
            normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                       'time', 'url', 'date', 'number'],
            unpack_contractions=False,
            annotate={"allcaps", "elongated", "repeated",
                      'emphasis', 'censored'},
            fix_html=True,  # fix HTML tokens
            # corpus from which the word statistics are going to be used
            # for word segmentation and correction
            segmenter="english",
            corrector="english",
            unpack_hashtags=False,  # perform word segmentation on hashtags
            spell_correct_elong=False,  # spell correction for elongated words
            # the tokenizer, should take as input a string and return a list of tokens
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            # list of dictionaries, for replacing tokens extracted from the text,
            dicts=[emoticons]
        )
Example #19
    def __new__(cls, with_vinai=False):
        if cls.__singleton is None:
            cls.__singleton = super(Tokenizer, cls).__new__(cls)
            if with_vinai:
                cls.__tokenizer = normalizeTweet
            else:
                cls.__tokenizer = TextPreProcessor(
                    # terms that will be normalized
                    normalize=[
                        'url', 'email', 'percent', 'money', 'phone', 'user',
                        'time', 'date', 'number'
                    ],
                    # terms that will be annotated
                    annotate={
                        "hashtag", "allcaps", "elongated", "repeated",
                        'emphasis', 'censored'
                    },
                    fix_html=True,  # fix HTML tokens

                    # corpus from which the word statistics are going to be used
                    # for word segmentation
                    segmenter="twitter",

                    # corpus from which the word statistics are going to be used
                    # for spell correction
                    corrector="twitter",
                    unpack_hashtags=True,  # perform word segmentation on hashtags
                    unpack_contractions=True,  # Unpack contractions (can't -> can not)
                    spell_correct_elong=False,  # spell correction for elongated words

                    # select a tokenizer. You can use SocialTokenizer, or pass your own
                    # the tokenizer, should take as input a string and return a list of tokens
                    tokenizer=SocialTokenizer(lowercase=True).tokenize,

                    # list of dictionaries, for replacing tokens extracted from the text,
                    # with other expressions. You can pass more than one dictionaries.
                    dicts=[emoticons]).pre_process_doc
        return cls.__singleton
Example #20
def preprocess_through_ekphrasis(train_file_path, test_file_path,
                                 trial_file_path):
    text_processor = TextPreProcessor(
        normalize=[
            'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'url',
            'date', 'number'
        ],
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
            'censored'
        },
        fix_html=True,
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=True,
        spell_correction=True,
        all_caps_tag="wrap",
        fix_bad_unicode=True,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons])

    for file_path in [train_file_path, test_file_path, trial_file_path]:
        with open(file_path, 'r', newline='') as file:
            new_sentences = list()
            labels = list()
            for line in file:
                labels.append(line.split('\t')[0])
                new_sentences.append(" ".join(
                    text_processor.pre_process_doc(line.split('\t')[1])))
        with open(file_path[:-4] + "_ekphrasis.csv", 'w',
                  newline='') as new_file:
            for label, sentence in zip(labels, new_sentences):
                new_file.write("{}\t{}\n".format(
                    label,
                    sentence.replace("[ <hashtag> triggerword </hashtag> #]",
                                     "[#TRIGGERWORD#]").replace(
                                         "[ <allcaps> newline </allcaps> ]",
                                         "[NEWLINE]")))
Example #21
def nbow_model(task, embeddings, word2idx):
    if task == "clf":
        algo = LogisticRegression(C=0.6,
                                  random_state=0,
                                  class_weight='balanced')
    elif task == "reg":
        algo = SVR(kernel='linear', C=0.6)
    else:
        raise ValueError("invalid task!")

    embeddings_features = NBOWVectorizer(aggregation=["mean"],
                                         embeddings=embeddings,
                                         word2idx=word2idx,
                                         stopwords=False)

    preprocessor = TextPreProcessor(
        backoff=[
            'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'url',
            'date', 'number'
        ],
        include_tags={
            "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
            'censored'
        },
        fix_html=True,
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons])

    model = Pipeline([('preprocess',
                       CustomPreProcessor(preprocessor, to_list=True)),
                      ('embeddings-feats', embeddings_features),
                      ('normalizer', Normalizer(norm='l2')), ('clf', algo)])

    return model
Example #22
    def __init__(self, text, **kwargs):
        self.text = text
        self.text_processor = TextPreProcessor(
            # terms that will be normalized, e.g. an email address to <email>
            normalize=[
                'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
                'date', 'number'
            ],

            # terms that will be annotated e.g. <hashtag>#test</hashtag>
            annotate={
                "hashtag", "allcaps", "elongated", "repeated", 'emphasis'
            },
            fix_html=True,  # fix HTML tokens
            unpack_hashtags=True,  # perform word segmentation on hashtags

            # select a tokenizer. You can use SocialTokenizer or pass your own;
            # it should take a string as input and return a list of tokens.
            # If none is given, the text is simply split on whitespace.
            tokenizer=SocialTokenizer(lowercase=True).tokenize,

            # list of dictionaries, for replacing tokens extracted from the text,
            # with other expressions. You can pass more than one dictionaries.
            dicts=[emoticons])
Example #23
def get_preprocessed_data(raw_data, sentence_segmentation=False, pre_lang_check=True):
    text_processor = TextPreProcessor(
        omit=['url', 'email', 'user'],
        normalize=['url', 'email', 'user'],
        annotate={"elongated", "repeated", 'emphasis', 'censored'},
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        spell_correction=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize)
    processed_data = []
    for data_type in raw_data:
        if len(raw_data[data_type]) >= MAX_SEQ:
            chunks = int(len(raw_data[data_type]) / MAX_SEQ)
            for j in range(chunks):
                processed_data += TextPreprocessing(
                    raw_data[data_type][MAX_SEQ * j:MAX_SEQ * (j + 1)],
                    text_processor, sentence_segmentation, pre_lang_check,
                    get_mode(data_type))
            if MAX_SEQ * chunks != len(raw_data[data_type]):
                processed_data += TextPreprocessing(
                    raw_data[data_type][MAX_SEQ * chunks:], text_processor,
                    sentence_segmentation, pre_lang_check, get_mode(data_type))
        else:
            processed_data += TextPreprocessing(raw_data[data_type],
                                                text_processor,
                                                sentence_segmentation,
                                                pre_lang_check,
                                                get_mode(data_type))
    return processed_data
Example #24
    def __init__(self, args):
        if args.datastories:
            tokenizer = SocialTokenizer(lowercase=True)
        else:
            tokenizer = TweetTokenizer()
        self.RAW = data.RawField()
        self.TEXT = data.Field(batch_first=True,
                               include_lengths=True,
                               lower=True,
                               tokenize=tokenizer.tokenize)
        self.LABEL = data.Field(sequential=False, unk_token=None)

        with open('./data/vocab.obj', 'rb') as filehandler:
            self.TEXT.vocab = pickle.load(filehandler)
        with open('./data/label.obj', 'rb') as filehandler:
            self.LABEL.vocab = pickle.load(filehandler)

        self.test = datasets.EMO.getTestData(args, self.RAW, self.TEXT,
                                             args.test_data_path)

        self.test_iter = \
            data.Iterator(self.test,
                          batch_size=args.batch_size,
                          device=args.device,
                          shuffle=False,
                          sort=False,
                          repeat=False)

        self.max_word_len = max([len(w) for w in self.TEXT.vocab.itos])
        # for <pad>
        self.char_vocab = {'': 0}
        # for <unk> and <pad>
        self.characterized_words = [[0] * self.max_word_len,
                                    [0] * self.max_word_len]

        if args.char_emb:
            self.build_char_vocab()
Example #25
def build_vocab(dataset):
    # use text processing tool to do word normalization, annotation, segmentation, tokenization, and spell correction
    # return a vocabulary set
    vocabulary_set = set()
    text_processor = TextPreProcessor(
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'url', 'date', 'number'],
        annotate={"hashtag", "allcaps", "elongated", "repeated",
                  'emphasis', 'censored'},
        fix_html=True,
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons]
    )
    for text_tensor, _ in dataset:
        text = str(text_tensor.numpy()[0], 'utf-8')
        some_tokens = text_processor.pre_process_doc(text)
        vocabulary_set.update(some_tokens)

    return vocabulary_set
Example #26
    def twitter_preprocess(self):
        preprocessor = TextPreProcessor(
            normalize=[
                'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
                'date', 'number'
            ],
            annotate={
                "hashtag", "elongated", "allcaps", "repeated", 'emphasis',
                'censored'
            },
            all_caps_tag="wrap",
            fix_text=True,
            segmenter="twitter_2018",
            corrector="twitter_2018",
            unpack_hashtags=True,
            unpack_contractions=True,
            spell_correct_elong=False,
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            dicts=[emoticons])

        text = self.data
        cache_file = os.path.join('./', "cached",
                                  "preprocessed_" + self.name + ".pkl")
        preprocessed = None
        if os.path.isfile(cache_file):
            with open(cache_file, 'rb') as f:
                preprocessed = pickle.load(f)
        else:
            preprocessed = [
                preprocessor.pre_process_doc(x)
                for x in tqdm(text, desc="Preprocessing dataset...")
            ]
            os.makedirs(os.path.dirname(cache_file), exist_ok=True)
            with open(cache_file, 'wb') as f:
                pickle.dump(preprocessed, f)

        return preprocessed
Example #27
text_processor = TextPreProcessor(
    # terms that will be annotated
    annotate={
        "hashtag", "allcaps", "elongated", "repeated", 'emphasis', 'censored'
    },
    fix_html=True,  # fix HTML tokens
    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="twitter",
    # corpus from which the word statistics are going to be used
    # for spell correction
    corrector="twitter",
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons, emoticons_additional])


def tokenize(text):
    print("in tokenize")
    text = " ".join(text_processor.pre_process_doc(text))
    return text


def preprocessData(dataFilePath, mode):
    print("in preprocess data")
    conversations = []
    labels = []
Example #28
    def yelpInstanceConstructFromTrain(self, paramFpathInTrainTxt,
                                       paramFpathOutToken2IndexDict,
                                       paramFpathOutIndex2TokenDict,
                                       paramFpathOutTrainParams,
                                       paramFpathOutTrainInstance):
        '''
        combine reviews with stars, reshuffle reviews, and split into two sets
        ===================================================
        parameters:
        -----------
        paramFpathInTrainTxt: path to the raw training review text
        paramFpathOutToken2IndexDict: output path for the token-to-index map
        paramFpathOutIndex2TokenDict: output path for the index-to-token map
        paramFpathOutTrainParams: output path for the parameters needed for training
        paramFpathOutTrainInstance: output path for the constructed training instances

        return:
        -----------
        None
        '''

        # read in the train.txt
        fpointerInTrainTxt = open(paramFpathInTrainTxt, 'rt', encoding='utf8')

        def __function4map(elem4map):
            '''
            strip whitespace from elem4map
            ===================================================
            parameters:
            -----------
            elem4map

            return:
            -----------
            mapped elem
            '''
            elemstriped = elem4map.strip()
            return elemstriped

        listTrainTxt = list(map(__function4map,
                                fpointerInTrainTxt.readlines()))
        fpointerInTrainTxt.close()

        # ----------initialize TextPreProcessor
        text_processor = TextPreProcessor(
            normalize=[
                'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
                'date', 'number'
            ],
            annotate={
                "hashtag", "allcaps", "elongated", "repeated", "emphasis",
                "censored"
            },
            fix_html=True,
            segmenter="english",
            corrector="english",
            unpack_hashtags=True,
            unpack_contractions=True,
            spell_correct_elong=False,
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            dicts=[emoticons])
        # ----------Initialize TextPreProcessor

        listTrainTxtTokenized = \
            list(text_processor.pre_process_docs(listTrainTxt))
        listTrainTxt = None
        # ----------save the vocabulary table,
        #           calculate and save the parameters
        # filter top 20,000 tokens
        dictVocabulary2Freq = dict()
        for listTokens in listTrainTxtTokenized:
            for aToken in listTokens:
                if aToken in dictVocabulary2Freq:
                    dictVocabulary2Freq[aToken] += 1
                else:
                    dictVocabulary2Freq[aToken] = 1
        itemgetter1 = operator.itemgetter(1)
        list_k_v_top_20000 = sorted(dictVocabulary2Freq.items(),
                                    key=itemgetter1,
                                    reverse=True)[0:20000]
        dict_k_v_top_20000 = {k: v for k, v in list_k_v_top_20000}
        dictVocabulary2Freq = None
        list_k_v_top_20000 = None

        # calculate maxDocumentSize and vocabularySize
        maxDocumentSize = 0
        vocabularySize = 0

        dictVocabulary2Index = dict()
        dictIndex2Vocabulary = dict()
        tokenCurrentIndex = 0
        for listTokens in listTrainTxtTokenized:
            if maxDocumentSize < len(listTokens):
                maxDocumentSize = len(listTokens)
            for aToken in listTokens:
                # filter rare words, reduce vocabulary size
                if aToken not in dict_k_v_top_20000:
                    continue
                if aToken in dictVocabulary2Index:
                    pass
                else:
                    dictVocabulary2Index[aToken] = tokenCurrentIndex
                    dictIndex2Vocabulary[tokenCurrentIndex] = aToken
                    tokenCurrentIndex += 1
        vocabularySize = tokenCurrentIndex
        assert vocabularySize == len(dictVocabulary2Index)

        # trim doc_size to 0.5 maxDocSize
        # trimmed_doc_size = maxDocumentSize * 0.5

        # json write using the fp4jsonoutput = open(,'wt', encoding='utf8')
        fp4jsonoutput = open(paramFpathOutToken2IndexDict,
                             'wt',
                             encoding='utf8')
        json.dump(dictVocabulary2Index, fp4jsonoutput, ensure_ascii=False)
        fp4jsonoutput.close()

        fp4jsonoutput = open(paramFpathOutIndex2TokenDict,
                             'wt',
                             encoding='utf8')
        json.dump(dictIndex2Vocabulary, fp4jsonoutput, ensure_ascii=False)
        fp4jsonoutput.close()

        # dictVocabulary2Index = None
        dictIndex2Vocabulary = None

        fpointerOutParams = open(paramFpathOutTrainParams,
                                 'wt',
                                 encoding='utf8')

        str4write = 'TrainingInstances: %d\n' % len(listTrainTxtTokenized)\
            + 'DocumentSeqLen: %d\n' % maxDocumentSize\
            + 'VocabularySize: %d\n' % vocabularySize

        fpointerOutParams.write(str4write)

        fpointerOutParams.close()
        # ----------calculate and save the parameters

        # ----------construct training instances and perform padding
        print('Hello1')

        def __function_tokenlist_to_traininstance(tokenlist):
            '''
            from tokenlist to padded instance list
            adding subsampling
            '''
            tokenlist_size = len(tokenlist)
            traininginstance = list()
            for n in range(tokenlist_size):
                # ----------split tokenlist section
                tokenlist_section = None
                if n - HALF_WINDOW_SIZE < 0:
                    if n + HALF_WINDOW_SIZE >= tokenlist_size:
                        tokenlist_section = tokenlist
                    else:
                        tokenlist_section = tokenlist[:n + HALF_WINDOW_SIZE]
                else:
                    if n + HALF_WINDOW_SIZE >= tokenlist_size:
                        tokenlist_section = tokenlist[n - HALF_WINDOW_SIZE:]
                    else:
                        tokenlist_section = tokenlist[n - HALF_WINDOW_SIZE:n +
                                                      HALF_WINDOW_SIZE]
                # ----------calculate tokenlist multiterm
                countlist_vocab = [0 for i in range(vocabularySize)]
                countlist_vocab[dictVocabulary2Index[tokenlist[n]]] += 1
                traininginstance.append(countlist_vocab)
                countlist_vocab = [0 for i in range(vocabularySize)]
                for atoken in tokenlist_section:
                    countlist_vocab[dictVocabulary2Index[atoken]] += 1
                traininginstance.append(countlist_vocab)

            # ----------padding
            for n in range(tokenlist_size, maxDocumentSize):
                fullzero_vocab = [0 for i in range(vocabularySize)]
                traininginstance.append(fullzero_vocab)
                fullzero_vocab = [0 for i in range(vocabularySize)]
                traininginstance.append(fullzero_vocab)

            return traininginstance

        def __function_traininstance_to_string(traininstance):
            '''
            from traininstance to a string
            '''
            str_training_instance = ''
            for acountlist_vocab in traininstance:
                acountlist_vocab = list(map(str, acountlist_vocab))
                str_acountlist_vocab = ' '.join(acountlist_vocab)
                str_training_instance += ' ' + str_acountlist_vocab

            str_training_instance += '\n'
            return str_training_instance

        fpointerOutTrainInstance = open(paramFpathOutTrainInstance,
                                        'wt',
                                        encoding='utf8')
        for aTrainTxtTokenized in listTrainTxtTokenized:
            aTrainInstance = __function_tokenlist_to_traininstance(
                aTrainTxtTokenized)
            aStrTrainInstance = __function_traininstance_to_string(
                aTrainInstance)
            fpointerOutTrainInstance.write(aStrTrainInstance)
        fpointerOutTrainInstance.close()

        return None
Example #29
    DATA_PATH_ITA = args.trainSet
    DATA_PATH_TEST_ITA = args.testSet
    OUTPUT_DIR = args.odir
    preproc = args.preproc
    data_train = pd.read_csv(DATA_PATH_ITA,sep=';',encoding='utf-8',engine='c')
    data_test = pd.read_csv(DATA_PATH_TEST_ITA,sep=";",encoding='utf_8')

    if args.doShuffle:
        data_train = data_train.reindex(np.random.permutation(data_train.index))
    if preproc == 'mirko':
        text_processor = TextPreProcessor(
            remove=['email', 'percent', 'money', 'phone', 'time', 'date', 'number'],
            annotate={},
            fix_html=True,
            unpack_hashtags=False,
            tokenizer=SocialTokenizer(lowercase=args.doLower).tokenize,
            dicts=[emoticons])
        data_train['text'] = data_train['text'].astype(str)
        data_test['text'] = data_test['text'].astype(str)
        data_train['text_preprocessed'] = data_train.apply(
            lambda row: mirkoPreprocessing(row, args, text_processor), axis=1)
        data_test['text_preprocessed'] = data_test.apply(
            lambda row: mirkoPreprocessing(row, args, text_processor), axis=1)
    if preproc == 'raw':
        data_train['text_preprocessed'] = data_train['text']
        data_test['text_preprocessed'] = data_test['text']

    pd.set_option('display.max_colwidth', 800)
    print('***** TRAIN HEAD ***')
    print(data_train.head())    
    print('***** TEST HEAD ***')
    print(data_test.head())
    
Example #30
text_processor = TextPreProcessor(
    normalize=[
        'url', 'email', 'percent', 'money', 'phone', 'time', 'url', 'date',
        'number'
    ],
    annotate={
        "hashtag", "allcaps", "elongated", "repeated", 'emphasis', 'censored'
    },
    fix_html=True,
    segmenter="twitter",
    corrector="twitter",
    unpack_hashtags=True,
    unpack_contractions=True,
    spell_correct_elong=True,
    tokenizer=SocialTokenizer(lowercase=False).tokenize,
    dicts=[emoticons])

REMOVE_TAGS = [
    "<emphasis>", "<kiss>", "<repeated>", "<laugh>", "<allcaps>", "</allcaps>",
    "<angel>", "<elongated>", "<tong>", "<annoyed>", "<censored>", "<happy>",
    "<percent>", "<wink>", "<headdesk>", "<surprise>", "<date>", "<time>",
    "<url>", "<sad>", "<email>", "<phone>", "<hashtag>", "</hashtag>"
]

ADD_TO_GLOVE = ["<number>", "<money>"]
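A hedged sketch of one way the two lists above might be used downstream: tags the embedding vocabulary has no vectors for are stripped from the processed tokens, while <number> and <money> are kept because they are added to GloVe. This is an assumption about intent, not code from the original file.

def strip_unwanted_tags(tweet):
    # illustrative helper built on the text_processor defined above
    tokens = text_processor.pre_process_doc(tweet)
    return [tok for tok in tokens if tok not in REMOVE_TAGS]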

# Try removing punctuation as well


def pre_process_single(tweet, t_id):