Example #1
def preprocess_data(genuine_filepath, bot_filepath):
    """
    Preprocess data and normalize tweets.
    """
    # Open csv file and get the tweet part of the csv.
    # Strip out newlines and quotes around text.
    with codecs.open(bot_filepath, 'r', encoding='utf-8',
                     errors='ignore') as bots_file:
        bot_sentences = [
            x.split(',')[1].strip('\n').strip('"').lower()
            if len(x.split(',')) > 1 else '' for x in bots_file.readlines()
        ]
    bot_sentences = bot_sentences[1:]

    with codecs.open(genuine_filepath, 'r', encoding='utf-8',
                     errors='ignore') as genuine_file:
        genuine_sentences = [
            x.split(',')[1].strip('\n').strip('"').lower()
            if len(x.split(',')) > 1 else '' for x in genuine_file.readlines()
        ]
    genuine_sentences = genuine_sentences[1:]

    text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=[
            'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'url',
            'date', 'number'
        ],
        # terms that will be annotated
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
            'censored'
        },
        fix_html=True,  # fix HTML tokens

        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",

        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # Unpack contractions (can't -> can not)
        spell_correct_elong=False,  # spell correction for elongated words

        # select a tokenizer. You can use SocialTokenizer, or pass your own
        # the tokenizer, should take as input a string and return a list of tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,

        # list of dictionaries, for replacing tokens extracted from the text,
        # with other expressions. You can pass more than one dictionaries.
        dicts=[emoticons])
    bot_sentences = [text_processor.pre_process_doc(s) for s in bot_sentences]
    genuine_sentences = [
        text_processor.pre_process_doc(s) for s in genuine_sentences
    ]

    return genuine_sentences, bot_sentences
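
# Usage sketch (not part of the original example; the file names are
# hypothetical): both CSVs are expected to contain a header row and the tweet
# text in the second comma-separated field.
genuine_sentences, bot_sentences = preprocess_data("genuine_tweets.csv",
                                                   "bot_tweets.csv")
print(len(genuine_sentences), len(bot_sentences))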
Example #2
class TextPreprocessor():

    def __init__(self):

        self.text_processor_options = TextPreProcessor(
            normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                       'time', 'url', 'date', 'number'],
            unpack_contractions=False,
            annotate={"allcaps", "elongated", "repeated",
                      'emphasis', 'censored'},
            fix_html=True,  # fix HTML tokens
            # corpus from which the word statistics are going to be used
            # for word segmentation and correction
            segmenter="english",
            corrector="english",
            unpack_hashtags=False,  # perform word segmentation on hashtags
            spell_correct_elong=False,  # spell correction for elongated words
            # the tokenizer, should take as input a string and return a list of tokens
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            # list of dictionaries, for replacing tokens extracted from the text,
            dicts=[emoticons]
        )

    def do_ekphrasis_preprocessing(self, sentences):
        if isinstance(sentences, str):
            return self.text_processor_options.pre_process_doc(sentences)

        assert (type(sentences).__module__ == np.__name__)
        preprocessed = [self.text_processor_options.pre_process_doc(s) for s in sentences]
        return np.array(preprocessed)

    def do_decontraction(self, sentences):
        if isinstance(sentences, str):
            sentences = np.array([sentences])
        assert(type(sentences).__module__ == np.__name__)
        preprocessed = []
        for s in sentences:
            ''' Does not deal with 'd as it is ambiguous'''
            s = re.sub(r"[W, w]on\'t", "will not", s)
            s = re.sub(r"[C, c]an\'t", "can not", s)
            s = re.sub(r"[C, c]annot", "can not", s)
            s = re.sub(r"n\'t", " not", s)
            s = re.sub(r"\'re", " are", s)
            s = re.sub(r"[H, h]e\'s", "he is", s)
            s = re.sub(r"[S, s]he\'s", "she is", s)
            s = re.sub(r"[I, i]t\'s", "it is", s)
            s = re.sub(r"\'ll", " will", s)
            s = re.sub(r"\'ve", " have", s)
            s = re.sub(r"\'m", " am", s)
            s = re.sub(r"[D, d]idn\'t", "did not", s)
            preprocessed.append(s)
        return np.array(preprocessed)
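
# Usage sketch (not part of the original example): both helpers accept either
# a single string or a numpy array of strings, as enforced by the asserts above.
import numpy as np

tp = TextPreprocessor()
tweets = np.array(["I can't wait for the game!!!", "He's running sooo late"])
print(tp.do_decontraction(tweets))          # e.g. "I can not wait ..."
print(tp.do_ekphrasis_preprocessing(tweets[0]))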
def build_vocab(dataset):
    vocabulary_set = set()
    text_processor = TextPreProcessor(
        normalize=[
            'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'url',
            'date', 'number'
        ],
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
            'censored'
        },
        fix_html=True,
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons])
    for text_tensor, _ in dataset:
        text = str(text_tensor.numpy()[1], 'utf-8')
        some_tokens = text_processor.pre_process_doc(text)
        vocabulary_set.update(some_tokens)

    return vocabulary_set
def build_vocab_list(dataframe):
    vocab_set = set()
    sentences = []

    text_processor = TextPreProcessor(
        normalize=[
            'url', 'email', 'percent', 'money', 'phone', 'user',
            'time', 'date', 'number'
        ],
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
            'censored'
        },
        fix_html=True,
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons])

    for index in range(dataframe.shape[0]):
        tweet = dataframe["tweet"][index]
        tok = text_processor.pre_process_doc(tweet)
        sentences.append(" ".join(tok))
        vocab_set.update(tok)

    df_sentences = pd.DataFrame(sentences, columns=['content'])
    return vocab_set, df_sentences
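
# Usage sketch (not part of the original example; the dataframe is made up):
# build_vocab_list expects a pandas DataFrame with a "tweet" column and returns
# the vocabulary set together with a one-column DataFrame of the preprocessed
# sentences.
import pandas as pd

sample_df = pd.DataFrame({"tweet": ["@user loving the #sunshine !!!",
                                    "this is sooo coooool"]})
vocab, sentences_df = build_vocab_list(sample_df)
print(sorted(vocab))
print(sentences_df.head())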
def preprocess_dataset(tweets, y):
    """uses ekphrasis API to preprocess the tweets"""

    text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=[
            'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'url',
            'date', 'number'
        ],
        fix_html=True,  # fix HTML tokens
        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",
        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # Unpack contractions (can't -> can not)
        spell_correct_elong=False,  # spell correction for elongated words
        spell_correction=False,
        # select a tokenizer. You can use SocialTokenizer, or pass your own
        # the tokenizer, should take as input a string and return a list of tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        # list of dictionaries, for replacing tokens extracted from the text,
        # with other expressions. You can pass more than one dictionaries.
        dicts=[emoticons])
    ynew = []
    filter_tweets = []
    for t in range(0, len(tweets)):
        tokens = text_processor.pre_process_doc(tweets[t])
        newtokens = []
        i = 0
        while (i < len(tokens)):
            try:
                if (tokens[i] == "pic" and tokens[i + 1] == "."
                        and tokens[i + 2] == "twitter"):
                    break
                elif (tokens[i] in [
                        "<url>", "<email>", "<user>", "<money>", "<percent>",
                        "<phone>", "<time>", "<date>", "<number>"
                ]):
                    i += 1
                    continue
                elif (tokens[i] == "<" and tokens[i + 1] == "emoji"):
                    while (tokens[i] != ">"):
                        i += 1
                    i += 1
                else:
                    newtokens.append(tokens[i])
                    i += 1
            except:
                break
        if (len(newtokens) != 0):
            filter_tweets.append(" ".join(newtokens))
            ynew.append(y[t])
    return filter_tweets, ynew  # tokenized, preprocessed tweets with emojis and placeholder tags removed
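
# Usage sketch (not part of the original example; the tweets and labels are
# made up): placeholder tags such as <user> and <url> are dropped, tweets that
# reduce to nothing are discarded, and the labels stay aligned with the
# surviving tweets.
tweets = ["@someone check this out https://t.co/abc123", "feeling sooo good #happy"]
y = [0, 1]
filter_tweets, ynew = preprocess_dataset(tweets, y)
print(filter_tweets, ynew)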
Example #6
def clean_then_tokenize_text(data):
    text_all = []
    text_processor = TextPreProcessor(
        normalize=['user', 'url'],
    )
    for key in data:
        text = data[key]
        a = []
        for line in text:
            line = text_processor.pre_process_doc(line)
            temp = " ".join(text_to_word_sequence(line))
            a.append(temp)
        data[key]['cln_text'] = a
        text_all += a
    return text_all
Example #7
class EkhprasisPreprocessor(Preprocessor):

    def __init__(self, verbose: int=0, omit=None,
                 normalize=None, annotate={"hashtag", "allcaps", "elongated", "repeated", 'emphasis'},
                 segmenter="twitter", corrector="twitter", unpack_hashtags=False, unpack_contractions=True,
                 spell_correct_elong=True, spell_correction=True, tokenizer=Tokenizer(lowercase=True),
                 dicts=None):
        super().__init__(name="EkhprasisPreprocessor", verbose=verbose)
        if dicts is None:
            dicts = [others, emoticons_original]
        if normalize is None:
            normalize = ['number']
        if omit is None:
            omit = ['email', 'percent', 'money', 'phone', 'user', 'time', 'url', 'date']
        logging.info("{} loading...".format(self._name))
        self.tweet_processor = TextPreProcessor(
            # omit terms
            omit=omit,
            # terms that will be normalized
            normalize=normalize,
            # terms that will be annotated
            annotate=annotate,

            # corpus from which the word statistics are going to be used
            # for word segmentation
            segmenter=segmenter,

            # corpus from which the word statistics are going to be used
            # for spell correction
            corrector=corrector,

            unpack_hashtags=unpack_hashtags,  # perform word segmentation on hashtags
            unpack_contractions=unpack_contractions,  # Unpack contractions (can't -> can not)
            spell_correct_elong=spell_correct_elong,  # spell correction for elongated words
            spell_correction=spell_correction,  # spell correction

            # select a tokenizer. You can use SocialTokenizer, or pass your own
            # the tokenizer, should take as input a string and return a list of tokens
            tokenizer=tokenizer.tokenize,

            # list of dictionaries, for replacing tokens extracted from the text,
            # with other expressions. You can pass more than one dictionaries.
            dicts=dicts
        )

    def _preprocess(self, sentence) -> str:
        return ' '.join(' '.join(self.tweet_processor.pre_process_doc(sentence)).split())
Example #8
def datastories_processor(x):
    from ekphrasis.dicts.emoticons import emoticons
    from ekphrasis.classes.tokenizer import SocialTokenizer
    from ekphrasis.classes.preprocessor import TextPreProcessor

    text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=[
            'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'url',
            'date', 'number'
        ],
        # terms that will be annotated
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
            'censored'
        },
        fix_html=True,  # fix HTML tokens

        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",

        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # Unpack contractions (can't -> can not)
        spell_correct_elong=False,  # spell correction for elongated words

        # select a tokenizer. You can use SocialTokenizer, or pass your own
        # the tokenizer, should take as input a string and return a list of tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,

        # list of dictionaries, for replacing tokens extracted from the text,
        # with other expressions. You can pass more than one dictionaries.
        dicts=[emoticons])

    x = [text_processor.pre_process_doc(sent) for sent in x]
    temp = []
    for sent in x:
        context = ''
        for word in sent:
            context = context + ' ' + word
        temp.append(context)

    return temp
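
# Usage sketch (not part of the original example): the function takes a list of
# raw tweets and returns one preprocessed, space-joined string per tweet.
processed = datastories_processor(["CANT WAIT for the new #TwinPeaks season :)"])
print(processed[0])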
Example #9
class EkphrasisProxy():
    def __init__(self, **kwargs):
        self.text_processor = TextPreProcessor(
            omit=kwargs.get('omit', []),
            normalize=kwargs.get(
                'normalize',
                ['url', 'email', 'phone', 'user', 'time', 'url', 'date']),
            annotate=kwargs.get('annotate', {}),
            fix_html=kwargs.get('fix_html', True),
            segmenter=kwargs.get('segmenter', "twitter"),
            corrector=kwargs.get('corrector', "twitter"),
            unpack_hashtags=kwargs.get('unpack_hashtags', True),
            unpack_contractions=kwargs.get('unpack_contractions', True),
            spell_correct_elong=kwargs.get('fix_elongation', True),
            spell_correction=kwargs.get('spell_correction', True),
            fix_bad_unicode=kwargs.get('fix_bad_unicode', True),
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            dicts=[emoticons])

    def preprocess_text(self, text):
        return self.text_processor.pre_process_doc(text)
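
# Usage sketch (not part of the original example): every constructor option
# falls back to the defaults shown above, so the proxy can be used without
# arguments; preprocess_text returns a list of tokens.
proxy = EkphrasisProxy()
print(proxy.preprocess_text("@user can't wait for the #party !!!"))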
Example #10
def preprocess_through_ekphrasis(train_file_path, test_file_path,
                                 trial_file_path):
    text_processor = TextPreProcessor(
        normalize=[
            'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'url',
            'date', 'number'
        ],
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
            'censored'
        },
        fix_html=True,
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=True,
        spell_correction=True,
        all_caps_tag="wrap",
        fix_bad_unicode=True,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons])

    for file_path in [train_file_path, test_file_path, trial_file_path]:
        with open(file_path, 'r', newline='') as file:
            new_sentences = list()
            labels = list()
            for line in file:
                labels.append(line.split('\t')[0])
                new_sentences.append(" ".join(
                    text_processor.pre_process_doc(line.split('\t')[1])))
        with open(file_path[:-4] + "_ekphrasis.csv", 'w',
                  newline='') as new_file:
            for label, sentence in zip(labels, new_sentences):
                new_file.write("{}\t{}\n".format(
                    label,
                    sentence.replace("[ <hashtag> triggerword </hashtag> #]",
                                     "[#TRIGGERWORD#]").replace(
                                         "[ <allcaps> newline </allcaps> ]",
                                         "[NEWLINE]")))
class SentencePreprocessor:
    
    def __init__(self):

        # Define a Text Pre-Processing pipeline
        # You can easily define a preprocessing pipeline, by using the TextPreProcessor.
        self.text_processor = TextPreProcessor(
            # terms that will be normalized
            normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                'time', 'url', 'date', 'number'],
            # terms that will be annotated
            annotate={"hashtag", "allcaps", "elongated", "repeated",
                'emphasis', 'censored'},
            fix_html=True,  # fix HTML tokens
            
            # corpus from which the word statistics are going to be used 
            # for word segmentation 
            segmenter="english", 
            
            # corpus from which the word statistics are going to be used 
            # for spell correction
            corrector="english", 
            
            unpack_hashtags=True,  # perform word segmentation on hashtags
            unpack_contractions=True,  # Unpack contractions (can't -> can not)
            spell_correct_elong=False,  # spell correction for elongated words
            
            # select a tokenizer. You can use SocialTokenizer, or pass your own
            # the tokenizer, should take as input a string and return a list of tokens
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            
            # list of dictionaries, for replacing tokens extracted from the text,
            # with other expressions. You can pass more than one dictionaries.
            dicts=[emoticons])
    
    def prepro_sent(self, sent):
        # Smart quotes are not handled by ekphrasis, so normalize them first
        sent = sent.replace('‘', '\'').replace('’', '\'').replace('“', '"').replace('”', '"')
        return ' '.join(self.text_processor.pre_process_doc(sent))
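
# Usage sketch (not part of the original example): prepro_sent first maps smart
# quotes to plain ASCII quotes and then returns the space-joined ekphrasis tokens.
sp = SentencePreprocessor()
print(sp.prepro_sent("“Can’t wait” for the #NewYear !!!"))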
Example #12
    def twitter_preprocess(self):
        preprocessor = TextPreProcessor(
            normalize=[
                'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
                'date', 'number'
            ],
            annotate={
                "hashtag", "elongated", "allcaps", "repeated", 'emphasis',
                'censored'
            },
            all_caps_tag="wrap",
            fix_text=True,
            segmenter="twitter_2018",
            corrector="twitter_2018",
            unpack_hashtags=True,
            unpack_contractions=True,
            spell_correct_elong=False,
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            dicts=[emoticons])

        text = self.data
        cache_file = os.path.join('./', "cached",
                                  "preprocessed_" + self.name + ".pkl")
        preprocessed = None
        if os.path.isfile(cache_file):
            with open(cache_file, 'rb') as f:
                preprocessed = pickle.load(f)
        else:
            preprocessed = [
                preprocessor.pre_process_doc(x)
                for x in tqdm(text, desc="Preprocessing dataset...")
            ]
            with open(cache_file, 'wb') as f:
                pickle.dump(preprocessed, f)

        return preprocessed
tok = list()

tk = TweetTokenizer()
p = Preprocess()
text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=[
        'email', 'percent', 'money', 'phone', 'time', 'url', 'date', 'number'
    ],
    fix_html=True,  # fix HTML tokens
    segmenter="twitter",
    corrector="twitter",
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words
    dicts=[emoticons])

for i in tw_:
    #print ( i )
    line = p.preprocess_mentions(i, repl='<mention>')
    line = p.preprocess_reserved_words(line, repl='<reserved>')
    line = text_processor.pre_process_doc(line)
    tok.append(tk.tokenize(line))

for k in tok[1:100]:
    print(k)

model = Word2Vec(tok, min_count=5, size=300, window=5, sg=1)
model.train(tok, total_examples=len(tok), epochs=100)

model.wv.save_word2vec_format('w2v_positive_300.bin', binary=True)
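
# Usage sketch (not part of the original example): with the gensim version
# implied by the `size=` argument above (gensim < 4), the saved vectors can be
# reloaded later with KeyedVectors instead of retraining the model.
from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format('w2v_positive_300.bin', binary=True)
print(wv.vector_size)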
Example #14
input_data_file = "/home/zz/Work/chase/data/ml/ml/rm/labeled_data_all_corrected.csv"
#input_data_file="/home/zz/Work/chase/data/ml/ml/dt/labeled_data_all_2.csv"
#input_data_file="/home/zz/Work/chase/data/ml/ml/w/labeled_data_all.csv"
#input_data_file="/home/zz/Work/chase/data/ml/ml/w+ws/labeled_data_all.csv"
#input_data_file="/home/zz/Work/chase/data/ml/ml/ws-exp/labeled_data_all.csv"
#input_data_file="/home/zz/Work/chase/data/ml/ml/ws-amt/labeled_data_all.csv"
#input_data_file="/home/zz/Work/chase/data/ml/ml/ws-gb/labeled_data_all.csv"

raw_data = pd.read_csv(input_data_file, sep=',', encoding="utf-8")
header_row = list(raw_data.columns.values)
with open(input_data_file + "c.csv", 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile,
                           delimiter=',',
                           quotechar='"',
                           quoting=csv.QUOTE_MINIMAL)
    csvwriter.writerow(header_row)

    for row in raw_data.iterrows():
        tweet = list(row[1])
        tweet_text = text_processor.pre_process_doc(tweet[col_text])
        tweet_text = list(filter(lambda a: a != '<elongated>', tweet_text))
        tweet_text = list(filter(lambda a: a != '<emphasis>', tweet_text))
        tweet_text = list(filter(lambda a: a != 'RT', tweet_text))
        tweet_text = list(filter(lambda a: a != '"', tweet_text))
        tweet_text = " ".join(tweet_text)

        #reset content
        tweet[col_text] = tweet_text

        csvwriter.writerow(tweet)
Example #15
def preprocess(df):
  df['ProcessedText'] = None
  df['ProcessedText_length'] = 0
  df['ProcessedText_BERT'] = None
  df['ProcessedText_BERTbase_length'] = 0
  print(df.columns)

  text_processor = TextPreProcessor(  # terms that will be normalized
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'url', 'date', 'number'],
        # terms that will be annotated
        annotate={"hashtag", "allcaps", "elongated", "repeated", 'emphasis', 'censored'}, fix_html=True,  # fix HTML tokens

        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",

        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",

        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # Unpack contractions (can't -> can not)
        spell_correct_elong=True,  # spell correction for elongated words

        # select a tokenizer. You can use SocialTokenizer, or pass your own
        # the tokenizer, should take as input a string and return a list of tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,

        # list of dictionaries, for replacing tokens extracted from the text,
        # with other expressions. You can pass more than one dictionaries.
        dicts=[emoticons])
  
  # Tweets pre-processing
  for index, row in df.iterrows():
        s = df.loc[index, 'text']
        # remove RT and USER
        s = "".join(re.sub('RT @[\w_]+: ', ' ', s))
        # df.loc[index, 'text'] = "".join(re.sub(r'&#  ;', ' ', df.loc[index, 'text']))
        # df.loc[index, 'text'] = "".join(re.sub(r' &# ;', ' ', df.loc[index, 'text']))
        # remove special characters
        s = "".join(re.sub(r'&#\d+;', ' ', s))
        # pre-processing
        s = " ".join(text_processor.pre_process_doc((s)))
        s = "".join(re.sub(r'\<[^>]*\>', ' ', s))
        # Remove non-ascii words or characters
        s = "".join([i if ord(i) < 128 else '' for i in s])
        s = re.sub(r'_[\S]?', r'', s)
        s = re.sub(r'[ ]{2,}', r' ', s)
        # Replace HTML entities for &, < and >
        s = re.sub(r'&amp;?', r'and', s)
        s = re.sub(r'&lt;', r'<', s)
        s = re.sub(r'&gt;', r'>', s)
        # Insert space between words and punctuation marks
        s = re.sub(r'([\w\d]+)([^\w\d ]+)', r'\1 \2', s)
        s = re.sub(r'([^\w\d ]+)([\w\d]+)', r'\1 \2', s)
        # Calculate text length for later use in LSTM
        s_length = len(s.split())
        # save ProcessedText and ProcessedText_length in final df
        df.loc[index, 'ProcessedText'] = s.strip()
        df.loc[index, 'ProcessedText_length'] = s_length

  # Drop texts with length <= 2 and drop duplicates
  df = df[df['ProcessedText_length'] > 2]
  df = df.drop_duplicates(subset=['ProcessedText'])

  # BERT preprocess
  df['ProcessedText_BERT'] = '[CLS] ' + df.ProcessedText
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
  df['ProcessedText_BERTbase_length'] = [len(tokenizer.tokenize(sent)) for sent in df.ProcessedText_BERT]
    # tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
    # df['ProcessedText_BERTlarge_length'] = [len(tokenizer.tokenize(sent)) for sent in df.ProcessedText_BERT]

  label_dict = dict()
  for i, l in enumerate(list(df.labels.value_counts().keys())):
     label_dict.update({l: i})

  df['Mapped_label'] = [label_dict[label] for label in df.labels]
  return df
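
# Usage sketch (not part of the original example; the dataframe is made up):
# preprocess() expects 'text' and 'labels' columns and needs the transformers
# package available for BertTokenizer.
import pandas as pd

sample = pd.DataFrame({
    "text": ["RT @user: this is a loooong example tweet about #python &amp; more !!!",
             "another plain example tweet that is easily long enough to keep"],
    "labels": ["offensive", "neutral"],
})
result = preprocess(sample)
print(result[["ProcessedText", "ProcessedText_length", "Mapped_label"]])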
Example #16
class SentimentDataset(Dataset):
    def __init__(self, file, max_length, max_topic_length, word2idx, tword2idx,
                 topic_bs):
        """
        A PyTorch Dataset
        What we have to do is to implement the 2 abstract methods:

            - __len__(self): in order to let the DataLoader know the size
                of our dataset and to perform batching, shuffling and so on...
            - __getitem__(self, index): we have to return the properly
                processed data-item from our dataset with a given index

        Args:
            file (str): path to the data file
            max_length (int): the max length for each sentence.
                if 0 then use the maximum length in the dataset
            word2idx (dict): a dictionary which maps words to indexes
        """

        self.text_processor = TextPreProcessor(
            # terms that will be normalized
            normalize=[
                'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
                'url', 'date', 'number'
            ],
            # terms that will be annotated
            annotate={
                "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
                'censored'
            },
            fix_html=True,  # fix HTML tokens

            # corpus from which the word statistics are going to be used
            # for word segmentation
            segmenter="twitter",

            # corpus from which the word statistics are going to be used
            # for spell correction
            corrector="twitter",
            unpack_hashtags=True,  # perform word segmentation on hashtags
            unpack_contractions=True,  # Unpack contractions (can't -> can not)
            spell_correct_elong=False,  # spell correction for elongated words

            # select a tokenizer. You can use SocialTokenizer, or pass your own
            # the tokenizer, should take as input a string and return a list of tokens
            tokenizer=SocialTokenizer(lowercase=True).tokenize,

            # list of dictionaries, for replacing tokens extracted from the text,
            # with other expressions. You can pass more than one dictionaries.
            dicts=[emoticons])

        self.word2idx = word2idx
        self.tword2idx = tword2idx

        print("loading dataset from {}...".format(file))
        _data = load_data_from_dir(file)
        if topic_bs:
            self.data = [x[2] for x in _data]
            self.labels = [x[1] for x in _data]
            self.topics = [x[0] for x in _data]
        else:
            self.data = [x[1] for x in _data]
            self.labels = [x[0] for x in _data]

        print("Tokenizing...")
        # self.data = [tokenize(x) for x in self.data]
        self.data = [self.text_processor.pre_process_doc(x) for x in self.data]
        self.topics = [
            self.text_processor.pre_process_doc(x) for x in self.topics
        ]

        # if max_length == 0, then set max_length
        # to the maximum sentence length in the dataset
        if max_length == 0:
            self.max_length = max([len(x) for x in self.data])
        else:
            self.max_length = max_length

        if max_topic_length == 0:
            self.max_topic_length = max([len(x) for x in self.topics])
        else:
            self.max_topic_length = max_topic_length

        # define a mapping for the labels,
        # for transforming the string labels to numbers
        self.label_encoder = preprocessing.LabelEncoder()
        self.label_encoder = self.label_encoder.fit(self.labels)

        self.label_count = Counter(self.labels)
        self.weights = [
            self.label_count['-1'], self.label_count['2'],
            self.label_count['0'], self.label_count['1'], self.label_count['2']
        ]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """
        Returns the _transformed_ item from the dataset

        Args:
            index (int):

        Returns:
            (tuple):
                * example (ndarray): vector representation of a training example
                * label (string): the class label
                * length (int): the length (tokens) of the sentence
                * index (int): the index of the returned dataitem in the dataset.
                  It is useful for getting the raw input for visualizations.

        Examples:
            For an `index` where:
            ::
                self.data[index] = ['super', 'eagles', 'coach', 'sunday', 'oliseh',
                                    'meets', 'with', 'chelsea', "'", 's', 'victor',
                                    'moses', 'in', 'london', '<url>']
                self.target[index] = "neutral"

            the function will return:
            ::
                example = [  533  3908  1387   649 38127  4118    40  1876    63   106  7959 11520
                            22   888     7     0     0     0     0     0     0     0     0     0
                             0     0     0     0     0     0     0     0     0     0     0     0
                             0     0     0     0     0     0     0     0     0     0     0     0
                             0     0]
                label = 1
        """

        sample, label, topic = self.data[index], self.labels[
            index], self.topics[index]

        # transform the sample and the label,
        # in order to feed them to the model
        message = vectorize(sample, self.word2idx, self.max_length)
        topic = vectorize(topic, self.tword2idx, self.max_topic_length)
        label = self.label_encoder.transform([label])[0]

        return message, topic, label, len(self.data[index]), len(
            self.topics[index]), self.weights, index
Example #17
import numpy as np
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import random as rn
stop_words = set(stopwords.words('spanish'))
i = 0
matrixTweetsEmb = np.zeros((len(matrixTweets), 100, 50, 300))
for tweetsUser in matrixTweets:
    embTweetsUser = []
    if (i % 100) == 0:
        print(i)
    for tweet in tweetsUser:
        embTweetUser = np.zeros([50, 300])
        # Preprocess the tweet
        tokList = text_processor.pre_process_doc(tweet)
        # Remove stopwords
        tokList = [w for w in tokList if w not in stop_words]
        # Look up the embedding
        numTok = 0
        for token in tokList[0:50]:
            g_vec = []
            is_in_model = False
            if token in google_300.vocab.keys():
                is_in_model = True
                g_vec = google_300.word_vec(token)
            elif token == "<number>":
                is_in_model = True
                g_vec = google_300.word_vec("número")
            elif token == "<percent>":
                is_in_model = True
Example #18
    },
    fix_html=True,  # fix HTML tokens

    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="twitter",

    # corpus from which the word statistics are going to be used
    # for spell correction
    corrector="twitter",
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words

    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,

    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons])

sentences = [
    "CANT WAIT for the new season of #TwinPeaks \(^o^)/!!! #davidlynch #tvseries :)))",
    "I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies :/",
    "@SentimentSymp:  can't wait for the Nov 9 #Sentiment talks!  YAAAAAAY !!!  😈"
]

for s in sentences:
    print(" ".join(text_processor.pre_process_doc(s)))
Example #19
)


for j in range(2,3):
  dt = pd.read_csv('id_and_sentences_'+str(float(j))+'.tsv', sep='\t', encoding='utf8', header=None, names=["id", "message"],
                  error_bad_lines=False)
  id = dt.iloc[:, 0]
  sentences = dt.iloc[:, 1]

  examples = []
  import re

  i = 0
  for s in sentences:
      s = s.lower()
      s = str(" ".join(text_processor.pre_process_doc(s)))
      s = re.sub(r"[^a-zA-ZÀ-ú</>!?♥♡\s\U00010000-\U0010ffff]", ' ', s)
      s = re.sub(r"\s+", ' ', s)
      s = re.sub(r'(\w)\1{2,}', r'\1\1', s)
      s = re.sub(r'^\s', '', s)
      s = re.sub(r'\s$', '', s)
      s = emoji.demojize(s).replace(":", "").replace("_", " ").replace("<", " ").replace(">", " ").replace("/", "")
      s = re.sub(r"\s+", ' ', s).lstrip()
      # print(s)
      examples.append(s)
      i = i + 1

  sentence_embeddings = model.encode(examples)
  print(sentence_embeddings[0])
  print(sentence_embeddings[0].shape)
Example #20
        ans1 = ans1 + " ".join(l1[1:])
        ans = ans1[:-4] + " " + " ".join(l2) + " " + ans1[-4:] + '\n'
        ans = list(ans)
        ans[-3] = '\t'
        ans[-5] = '\t'
        # print(ans)
        ans = "".join(ans)
        # print(ans)

        ##Modified by Keshav
        print(ans)
        ans = ans.replace('@ ',
                          '@').replace('# ', '#').replace('<', '').replace(
                              '>', '').replace("_", ' ').replace('  ', ' ')
        string = ans.split('\t')[1]
        processed = text_processor.pre_process_doc(string)
        print(processed)
        new_ans = []
        for token in processed:
            if token.startswith(("<hash", "<number", "<")) or "user" in token:
                continue
            new_ans.append(token)

        arr = ans.split('\t')
        arr[1] = re.sub(r"http.*|…", "", " ".join(new_ans))
        ans = "\t".join(arr)

        print(arr[2])
        print(ans)
        # break
        newfile.write(ans)
Example #21
        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",

        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",
        spell_correction=True,
        spell_correct_elong=False,  # spell correction for elongated words

        # list of dictionaries, for replacing tokens extracted from the text,
        # with other expressions. You can pass more than one dictionaries.
        dicts=[emoticons])

    with open("data/" + FILE_NAME, 'r', newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        with open("data/" + FILE_NAME.split(".")[0] + "_normalized.csv",
                  'w',
                  newline='',
                  encoding='utf-8') as w:
            writer = csv.writer(w)
            header = next(reader)
            header.insert(CONTENT_COLUMN + 1, "Normalized")
            writer.writerow(header)
            print("Processing tweets...")
            for row in reader:
                content = row[CONTENT_COLUMN]
                row.insert(CONTENT_COLUMN + 1,
                           " ".join(text_processor.pre_process_doc(content)))
                writer.writerow(row)
class TexProcessor(object):
    def __init__(self, args, lang):
        self.others = Strategy(args.others)  # valid values 0,1,2
        self.emoji = Strategy(args.emoji)  # 0,1 emoji, 2 (emoji), 3, 4, 5 (translation)
        self.emoticon = Strategy(args.emoticon)  # 0,1 emoticon, 2 (emoticon), 3, 4, 5 (translation)
        self.url = Strategy(args.url)  # 0,1,2,3
        self.hashtag = Strategy(args.hashtag)  # 0,1 = #hashtag, 2, 3 (#hashtag), 4, 5
        self.punctuation = Strategy(args.punctuation)  # valid values 0,3
        self.mention = Strategy(args.mention)  # 0,1,2,3
        self.lower = args.lower  # True or False
        self.lang = lang  # EN or IT
        self.ita_moji = pd.read_csv('./data/italianMoji.csv', sep=';')
        if self.lang == 'IT':
            self.lm = wordninja.LanguageModel('./data/words.last_all.txt.gz')
        else:
            self.lm = None
        self.text_processor = TextPreProcessor(
            remove=[
                'email',  # raw or normalize.
                'percent',  # raw or normalize: EN: percentage, IT: percentuale.
                'money',  # raw or normalize: EN: money, IT: soldi. Check whether currencies are caught.
                'phone',  # raw or normalize: EN: phone, IT: telefono
                'time',  # raw or normalize: EN: time, IT: ore
                'date',  # raw or normalize: EN: date, IT: data
                'number'  # raw or normalize: EN: number, IT: numero.
            ],
            annotate={},
            fix_html=True,
            unpack_hashtags=False,
            tokenizer=SocialTokenizer(lowercase=self.lower).tokenize,
            dicts=[emoticons])

    def load_dict_emoticon(self):
        if self.lang == 'EN':
            return {
                ":‑)": "happy",
                ": ‑)": "happy",
                ": ‑ )": "happy",
                ":-]": "happy",
                ": - ]": "happy",
                ": -]": "happy",
                ":-3": "happy",
                ": - 3": "happy",
                ": -3": "happy",
                ":->": "happy",
                ": - >": "happy",
                ": ->": "happy",
                "8-)": "happy",
                "8 -)": "happy",
                "8 - )": "happy",
                ":-}": "happy",
                ": - }": "happy",
                ": -}": "happy",
                ":)": "happy",
                ": )": "happy",
                ":]": "happy",
                ": ]": "happy",
                ":3": "happy",
                ": 3": "happy",
                ":>": "happy",
                ": >": "happy",
                "8)": "happy",
                "8 )": "happy",
                ":}": "happy",
                ": }": "happy",
                ":o)": "happy",
                ":o )": "happy",
                ": o )": "happy",
                ":c)": "happy",
                ": c )": "happy",
                ":c )": "happy",
                ":^)": "happy",
                ": ^ )": "happy",
                ": ^)": "happy",
                "=]": "happy",
                "= ]": "happy",
                "=)": "happy",
                "= )": "happy",
                ":-))": "happy",
                ": - ) )": "happy",
                ":- ) )": "happy",
                ":- ))": "happy",
                ": -))": "happy",
                ":‑D": "happy",
                ": ‑ D": "happy",
                ": ‑D": "happy",
                "8‑D": "happy",
                "8 ‑D": "happy",
                "8 ‑ D": "happy",
                "x‑D": "happy",
                "x ‑ D": "happy",
                "x ‑D": "happy",
                "X‑D": "happy",
                "X ‑ D": "happy",
                "X ‑D": "happy",
                ":D": "happy",
                ": D": "happy",
                "8D": "happy",
                "8 D": "happy",
                "xD": "happy",
                "x D": "happy",
                "XD": "happy",
                "X D": "happy",
                ":‑(": "sad",
                ": ‑(": "sad",
                ": ‑ (": "sad",
                ":‑c": "sad",
                ": ‑c": "sad",
                ":‑<": "sad",
                ": ‑ <": "sad",
                ":‑[": "sad",
                ": ‑ [": "sad",
                ":(": "sad",
                ": (": "sad",
                ":c": "sad",
                ": c": "sad",
                ":<": "sad",
                ": <": "sad",
                ":[": "sad",
                ": [": "sad",
                ":-||": "sad",
                ": - | |": "sad",
                ": - ||": "sad",
                ": -||": "sad",
                ": -| |": "sad",
                ">:[": "sad",
                ">: [": "sad",
                "> : [": "sad",
                ":{": "sad",
                ": {": "sad",
                ":@": "sad",
                ": @": "sad",
                ">:(": "sad",
                "> : (": "sad",
                ":'‑(": "sad",
                ": '‑(": "sad",
                ": ' ‑(": "sad",
                ": ' ‑ (": "sad",
                ":'(": "sad",
                ": ' (": "sad",
                ": '(": "sad",
                ":‑P": "playful",
                ": ‑P": "playful",
                ": ‑ P": "playful",
                "X‑P": "playful",
                "X ‑ P": "playful",
                "X ‑P": "playful",
                "x‑p": "playful",
                "x ‑p": "playful",
                ":‑p": "playful",
                ": ‑p": "playful",
                ": ‑ p": "playful",
                ":‑Þ": "playful",
                ": ‑ Þ": "playful",
                ":‑þ": "playful",
                ": ‑þ": "playful",
                ":‑b": "playful",
                ": ‑ b": "playful",
                ": ‑b": "playful",
                ":P": "playful",
                ": P": "playful",
                "XP": "playful",
                "X P": "playful",
                "xp": "playful",
                "x p": "playful",
                ":p": "playful",
                ": p": "playful",
                ":Þ": "playful",
                ": Þ": "playful",
                ":þ": "playful",
                ": þ": "playful",
                ":b": "playful",
                ": b": "playful",
                "<3": "love",
                "< 3": "love",
                ":*": "love",
                ": *": "love"
            }
        else:
            return {
                ":‑)": "felice",
                ": ‑)": "felice",
                ": ‑ )": "felice",
                ":-]": "felice",
                ": - ]": "felice",
                ": -]": "felice",
                ":-3": "felice",
                ": - 3": "felice",
                ": -3": "felice",
                ":->": "felice",
                ": - >": "felice",
                ": ->": "felice",
                "8-)": "felice",
                "8 -)": "felice",
                "8 - )": "felice",
                ":-}": "felice",
                ": - }": "felice",
                ": -}": "felice",
                ":)": "felice",
                ": )": "felice",
                ":]": "felice",
                ": ]": "felice",
                ":3": "felice",
                ": 3": "felice",
                ":>": "felice",
                ": >": "felice",
                "8)": "felice",
                "8 )": "felice",
                ":}": "felice",
                ": }": "felice",
                ":o)": "felice",
                ":o )": "felice",
                ": o )": "felice",
                ":c)": "felice",
                ": c )": "felice",
                ":c )": "felice",
                ":^)": "felice",
                ": ^ )": "felice",
                ": ^)": "felice",
                "=]": "felice",
                "= ]": "felice",
                "=)": "felice",
                "= )": "felice",
                ":-))": "felice",
                ": - ) )": "felice",
                ":- ) )": "felice",
                ":- ))": "felice",
                ": -))": "felice",
                ":‑D": "felice",
                ": ‑ D": "felice",
                ": ‑D": "felice",
                "8‑D": "felice",
                "8 ‑D": "felice",
                "8 ‑ D": "felice",
                "x‑D": "felice",
                "x ‑ D": "felice",
                "x ‑D": "felice",
                "X‑D": "felice",
                "X ‑ D": "felice",
                "X ‑D": "felice",
                ":D": "felice",
                ": D": "felice",
                "8D": "felice",
                "8 D": "felice",
                "xD": "felice",
                "x D": "felice",
                "XD": "felice",
                "X D": "felice",
                ":‑(": "triste",
                ": ‑(": "triste",
                ": ‑ (": "triste",
                ":‑c": "triste",
                ": ‑c": "triste",
                ":‑<": "triste",
                ": ‑ <": "triste",
                ":‑[": "triste",
                ": ‑ [": "triste",
                ":(": "triste",
                ": (": "triste",
                ":c": "triste",
                ": c": "triste",
                ":<": "triste",
                ": <": "triste",
                ":[": "triste",
                ": [": "triste",
                ":-||": "triste",
                ": - | |": "triste",
                ": - ||": "triste",
                ": -||": "triste",
                ": -| |": "triste",
                ">:[": "triste",
                ">: [": "triste",
                "> : [": "triste",
                ":{": "triste",
                ": {": "triste",
                ":@": "triste",
                ": @": "triste",
                ">:(": "triste",
                "> : (": "triste",
                ":'‑(": "triste",
                ": '‑(": "triste",
                ": ' ‑(": "triste",
                ": ' ‑ (": "triste",
                ":'(": "triste",
                ": ' (": "triste",
                ": '(": "triste",
                ":‑P": "scherzoso",
                ": ‑P": "scherzoso",
                ": ‑ P": "scherzoso",
                "X‑P": "scherzoso",
                "X ‑ P": "scherzoso",
                "X ‑P": "scherzoso",
                "x‑p": "scherzoso",
                "x ‑p": "scherzoso",
                ":‑p": "scherzoso",
                ": ‑p": "scherzoso",
                ": ‑ p": "scherzoso",
                ":‑Þ": "scherzoso",
                ": ‑ Þ": "scherzoso",
                ":‑þ": "scherzoso",
                ": ‑þ": "scherzoso",
                ":‑b": "scherzoso",
                ": ‑ b": "scherzoso",
                ": ‑b": "scherzoso",
                ":P": "scherzoso",
                ": P": "scherzoso",
                "XP": "scherzoso",
                "X P": "scherzoso",
                "xp": "scherzoso",
                "x p": "scherzoso",
                ":p": "scherzoso",
                ": p": "scherzoso",
                ":Þ": "scherzoso",
                ": Þ": "scherzoso",
                ":þ": "scherzoso",
                ": þ": "scherzoso",
                ":b": "scherzoso",
                ": b": "scherzoso",
                "<3": "amore",
                "< 3": "amore",
                ":*": "amore",
                ": *": "amore"
            }

    def do_preprocess(self, tweet):
        # Emoticon handling.
        SMILEY = self.load_dict_emoticon()
        if self.emoticon == Strategy.REMOVE:
            words = tweet.split()
            reformed = [" " if word in SMILEY else word for word in words]
            tweet = " ".join(reformed)
        if self.emoticon == Strategy.NORMALIZE:
            words = tweet.split()
            reformed = [
                "emoticon" if word in SMILEY else word for word in words
            ]
            tweet = " ".join(reformed)
        if self.emoticon == Strategy.PACK:
            words = tweet.split()
            reformed = [
                "(emoticon)" if word in SMILEY else word for word in words
            ]
            tweet = " ".join(reformed)
        if self.emoticon == Strategy.TRASLATE:
            words = tweet.split()
            reformed = [
                SMILEY[word] if word in SMILEY else word for word in words
            ]
            tweet = " ".join(reformed)

        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  #emoticons 
            u"\U0001F300-\U0001F5FF"  #symbols & pictographs
            u"\U0001F680-\U0001F6FF"  #transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  #flags (iOS)
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            "]+",
            flags=re.UNICODE)

        # Emoji handling
        if self.emoji == Strategy.REMOVE:
            tweet = emoji_pattern.sub(r'', tweet)
        if self.emoji == Strategy.NORMALIZE:
            tweet = emoji_pattern.sub(r' emoji ', tweet)
        if self.emoji == Strategy.PACK:
            tweet = emoji_pattern.sub(r' (emoji) ', tweet)
        if self.emoji == Strategy.TRASLATE:
            number = emoji.emoji_count(tweet)
            if number != 0:
                if self.lang == 'EN':
                    tweet = emoji.demojize(tweet, delimiters=("", ""))
                    tweet = tweet.replace("_", " ")
                elif self.lang == 'IT':
                    emos = emoji.emoji_lis(tweet)
                    for emo in emos:
                        singleMoji = str(emo['emoji'])
                        ita_M = self.ita_moji[self.ita_moji['emoji'] ==
                                              singleMoji]
                        if (len(ita_M['text_ita'].values) != 0):
                            significato = ita_M['text_ita'].values[0]
                            tweet = tweet.replace(singleMoji, significato)
                        else:
                            tweet = tweet.replace(singleMoji, '')
        if self.emoji == Strategy.PACK_TRASLATE:
            number = emoji.emoji_count(tweet)
            if number != 0:
                if self.lang == 'EN':
                    tweet = emoji.demojize(tweet, delimiters=("(", ")"))
                    tweet = tweet.replace("_", " ")
                elif self.lang == 'IT':
                    emos = emoji.emoji_lis(tweet)
                    for emo in emos:
                        singleMoji = str(emo['emoji'])
                        ita_M = self.ita_moji[self.ita_moji['emoji'] ==
                                              singleMoji]
                        if (len(ita_M['text_ita'].values) != 0):
                            significato = ita_M['text_ita'].values[0]
                            tweet = tweet.replace(singleMoji,
                                                  '(' + significato + ')')
                        else:
                            tweet = tweet.replace(singleMoji, '')
        # Handling of other normalized tokens.
        if self.others == Strategy.NORMALIZE:
            tweet = str(" ".join(self.text_processor.pre_process_doc(tweet)))
            if self.lang == 'EN':
                tweet = tweet.replace('<percent>', '<percentage>')
            if self.lang == 'IT':
                tweet = tweet.replace('<percent>', '<percentuale>')
                tweet = tweet.replace('<money>', '<soldi>')
                tweet = tweet.replace('<time>', '<tempo>')
                tweet = tweet.replace('<date>', '<data>')
                tweet = tweet.replace('<number>', '<numero>')
            tweet = tweet.replace("<", " ")
            tweet = tweet.replace(">", " ")
        if self.others == Strategy.PACK:
            tweet = str(" ".join(self.text_processor.pre_process_doc(tweet)))
            if self.lang == 'EN':
                tweet = tweet.replace('<percent>', '<percentage>')
            if self.lang == 'IT':
                tweet = tweet.replace('<percent>', '<percentuale>')
                tweet = tweet.replace('<money>', '<soldi>')
                tweet = tweet.replace('<time>', '<tempo>')
                tweet = tweet.replace('<date>', '<data>')
                tweet = tweet.replace('<number>', '<numero>')
            tweet = tweet.replace("<", "(")
            tweet = tweet.replace(">", "(")
        elems = [
            tag.strip("#") for tag in tweet.split() if tag.startswith("#")
        ]

        #Hashtag
        if self.hashtag == Strategy.REMOVE:
            for elem in elems:
                tweet = tweet.replace("#" + elem, " ")
        if self.hashtag == Strategy.TRASLATE:
            for elem in elems:
                if self.lang == 'IT':
                    traslate = ' '.join(self.lm.split(elem))
                    tweet = tweet.replace("#" + elem, traslate)
                if self.lang == 'EN':
                    traslate = ' '.join(wordninja.split(elem))
                    tweet = tweet.replace("#" + elem, traslate)
        if self.hashtag == Strategy.PACK_TRASLATE:
            for elem in elems:
                if self.lang == 'IT':
                    traslate = ' '.join(self.lm.split(elem))
                    tweet = tweet.replace("#" + elem, '< ' + traslate + ' >')
                if self.lang == 'EN':
                    traslate = ' '.join(wordninja.split(elem))
                    tweet = tweet.replace("#" + elem, '< ' + traslate + ' >')
        if self.hashtag == Strategy.NORMALIZE:
            for elem in elems:
                tweet = tweet.replace("#" + elem, "#hashtag")
        if self.hashtag == Strategy.PACK:
            for elem in elems:
                tweet = tweet.replace("#" + elem, "(#hashtag)")
        #URLs
        if self.url == Strategy.REMOVE:
            tweet = ' '.join(re.sub("(\w+:\/\/\S+)", " ", tweet).split())
        if self.url == Strategy.NORMALIZE:
            tweet = ' '.join(re.sub("(\w+:\/\/\S+)", " url ", tweet).split())
        if self.url == Strategy.PACK:
            tweet = ' '.join(re.sub("(\w+:\/\/\S+)", " (url) ", tweet).split())
        #Mentions
        if self.mention == Strategy.NORMALIZE:
            tweet = ' '.join(
                re.sub("(@[A-Za-z0-9]+)", " @user ", tweet).split())
        if self.mention == Strategy.PACK:
            tweet = ' '.join(
                re.sub("(@[A-Za-z0-9]+)", " (@user) ", tweet).split())
        if self.mention == Strategy.REMOVE:
            tweet = ' '.join(re.sub("(@[A-Za-z0-9]+)", " ", tweet).split())
        if self.punctuation == Strategy.REMOVE:
            tweet = ' '.join(re.sub("[\.\,\!\?\:\;\-\=]", " ", tweet).split())
        if self.lower == True:
            return tweet.lower()
        else:
            return tweet
# -*- coding: utf-8 -*-
import sys
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

text_processor = TextPreProcessor(
    normalize=[
        'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'url',
        'date', 'number'
    ],
    annotate={
        'hashtag', 'allcaps', 'elongated', 'repeated', 'emphasis', 'censored'
    },
    fix_html=True,
    segmenter='twitter',
    corrector='twitter',
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[emoticons])

filename_input = sys.argv[1]
filename_output = filename_input + '.out'

with open(filename_input, 'r') as fin, open(filename_output, 'w') as fout:
    for line in fin:
        fout.write(' '.join(text_processor.pre_process_doc(line)) + '\n')
Example #24
class Pem:
    '''
    Politeness Estimator for Microblogs.
    Type annotations were generated via:

    ```shell
    monkeytype run __init__.py
    monkeytype apply pem
    ```
    '''

    threshold = 0.5
    use_liwc = False
    use_cntVec = False

    def __init__(
            self,
            liwc_path: str = '',
            emolex_path: str = 'english_emolex.csv',
            estimator_path: str = 'english_twitter_politeness_estimator.joblib',
            feature_defn_path: str = 'english_twitter_additional_features.pickle',
            countVectorizer_path: str = '') -> None:
        # Preload LIWC dictionary:
        if liwc_path:
            liwc_df = pd.read_csv(liwc_path)
            liwc_df['*'] = liwc_df['term'].str.endswith('*')
            liwc_df['t'] = liwc_df['term'].str.rstrip('*')
            self.liwc_prefx = liwc_df[liwc_df['*']].groupby(
                'category')['t'].apply(set)
            self.liwc_whole = liwc_df[~liwc_df['*']].groupby(
                'category')['t'].apply(set)
            self.use_liwc = True

        # Preload EmoLex dictionary:
        emolex_df = pd.read_csv(emolex_path, index_col=0)
        self.emolex = emolex_df.apply(lambda s: set(s[s == 1].index))

        # Preload additional feature rules:
        pltlex = pd.read_pickle(feature_defn_path)
        types = pltlex.apply(type)
        self.pltlex_ptn = pltlex[types == re.Pattern].to_dict()
        self.pltlex_set = pltlex[types == set].to_dict()

        # Initialize Tokenizer:
        self.text_processor = TextPreProcessor(
            # terms that will be normalized:
            normalize=[
                'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
                'url', 'date', 'number'
            ],
            # terms that will be annotated:
            annotate={
                "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
                'censored'
            },
            # perform word segmentation on hashtags:
            unpack_hashtags=False,
            # Unpack contractions (can't -> can not):
            unpack_contractions=True,
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
        )
        # preload classifier:
        self.clf = joblib.load(estimator_path)

        if countVectorizer_path:
            self.counter = joblib.load(countVectorizer_path)
            self.use_cntVec = True

    def load(self, filepath: str = 'tweets.csv'):
        self.df = pd.read_csv(filepath)
        return self

    def _tokenizeString(self, s: str) -> List[str]:
        '''
        _tokenizeString tokenizes a string.
        Interestingly, it is faster to put this call into a separate method like this.
        '''
        return self.text_processor.pre_process_doc(s)

    def tokenize(self):
        self.df['token'] = self.df['text'].apply(self._tokenizeString)
        self.df['token_cnts'] = self.df['token'].apply(Counter)
        return self

    def vectorizeByLiwc(self, cnts: dict, liwc_whole: dict,
                        liwc_prefx: dict) -> Series:
        '''Vectorize by LIWC'''
        result = self.countAcrossDicts(cnts, liwc_whole)

        for category, tokens in liwc_prefx.items():
            for j, n_appearance in cnts.items():
                n_prefixes = sum(map(j.startswith, tokens))
                result[category] += n_appearance * n_prefixes

        return pd.Series(result)

    def vectorizeByEmolex(self, cnts: dict, lex: dict) -> Series:
        '''Vectorize by EmoLex'''
        result = self.countAcrossDicts(cnts, lex)
        return pd.Series(result)

    def vectorizeByPoliteLex(self, r: Series, patterns: dict,
                             sets: dict) -> Series:
        '''Vectorize by PoliteLex'''
        result = self.countAcrossDicts(r['token_cnts'], sets)

        text = r['text']
        for feature_name, pattern in patterns.items():
            # Slightly faster than `sum(1 for m in pattern.finditer(text))`.
            result[feature_name] = len(pattern.findall(text))

        return pd.Series(result)

    @staticmethod
    def countAcrossDicts(cnts: dict, sets: dict) -> dict:
        result = {}
        # This native-Python implementation is faster than DataFrame multiplication.
        for feature_name, tokens in sets.items():
            tokens_seen = tokens.intersection(cnts)
            result[feature_name] = sum(cnts[token] for token in tokens_seen)
        return result
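
    # A quick worked example of countAcrossDicts with made-up token counts and
    # lexicon sets (illustrative only):
    #   cnts = {'please': 2, 'thanks': 1, 'now': 3}
    #   sets = {'gratitude': {'thanks', 'thank'}, 'urgency': {'now'}}
    #   countAcrossDicts(cnts, sets) -> {'gratitude': 1, 'urgency': 3}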

    def vectorize(self, debug=True):
        '''
        This function extracts features from the provided texts.
        It requires that `self.df` is already prepared.
        It writes the prepared features to `self.X`.
        '''
        if self.use_liwc:
            liwc_cnts_df = self.df['token_cnts'].apply(
                self.vectorizeByLiwc,
                liwc_whole=self.liwc_whole,
                liwc_prefx=self.liwc_prefx)
        emolex_cnts_df = self.df['token_cnts'].apply(self.vectorizeByEmolex,
                                                     lex=self.emolex)
        politelex_cnts_df = self.df.apply(self.vectorizeByPoliteLex,
                                          patterns=self.pltlex_ptn,
                                          sets=self.pltlex_set,
                                          axis=1)

        if self.use_cntVec:
            # Unigrams: count-vectorize the space-joined tokens and wrap the
            # result in a DataFrame so it can be concatenated with the other features.
            space_separated_texts = self.df['token'].apply(' '.join)
            unigram_matrix = self.counter.transform(space_separated_texts).todense()
            unigram_df = pd.DataFrame(unigram_matrix, index=self.df.index)

        if debug:
            if self.use_liwc: self.liwc_cnts_df = liwc_cnts_df
            self.emolex_cnts_df = emolex_cnts_df.astype(int)
            self.politelex_cnts_df = politelex_cnts_df
            if self.use_cntVec:
                self.space_separated_texts = space_separated_texts
                self.unigram_df = unigram_df

        # Combine all feature sets into one table:
        all_feats = [
            emolex_cnts_df,
            politelex_cnts_df,
        ]
        if self.use_liwc:
            all_feats.insert(0, liwc_cnts_df)
        if self.use_cntVec:
            all_feats.append(unigram_df)
        self.X = concat(all_feats, axis=1)
        return self

    def predict(self) -> Series:
        def scoreToLabel(score):
            if score < -self.threshold:
                return 'Rude'
            if score > self.threshold:
                return 'Polite'
            return 'Neutral'

        scores = self.predict_proba()
        labels = scores.apply(scoreToLabel).rename('label')
        return labels

    def predict_proba(self) -> Series:
        probs = self.clf.predict_proba(self.X)
        probs_df = pd.DataFrame(probs)
        scores = probs_df.loc[:, 1] - probs_df.loc[:, 0]

        # Zero out scores whose magnitude is below the decision threshold:
        scores = scores.apply(lambda x: 0
                              if -self.threshold < x < self.threshold else x)

        return scores.rename('score')
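
A minimal usage sketch for the class above, assuming the default artifact files named in __init__ ('english_emolex.csv', 'english_twitter_politeness_estimator.joblib', 'english_twitter_additional_features.pickle') are present and that the loaded CSV has a 'text' column:

pem = Pem()                        # optionally pass liwc_path / countVectorizer_path
labels = (pem.load('tweets.csv')   # read the CSV into pem.df
             .tokenize()           # ekphrasis tokenization + per-tweet token counts
             .vectorize()          # build the feature table pem.X
             .predict())           # map scores to 'Rude' / 'Neutral' / 'Polite'
print(labels.value_counts())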
Beispiel #25
0
import re

import spacy
from spacy.lang.en import English
# The third-party imports below are inferred from how the names are used in this
# class (langdetect's detect, clean-text's clean, tweet-preprocessor imported as p);
# spell_corrector, social_tokenizer and squeezeWhitespace are module-level helpers
# assumed to be defined elsewhere in the original project.
from langdetect import detect
from cleantext import clean
import preprocessor as p
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons


class SentClean:
    prep_default = {'spell': False,
                    'remove_sequences': False,
                    'lowercase': False,
                    'punctuations': [],
                    'excluding_criteria': ['copyright', 'medrxiv', 'appendix'],
                    'starting_keywords_to_remove': [
                        'method', 'results', 'result', 'conclusion', 'conclusions', 'evaluation', 'evaluations',
                        'objectives', 'objective', 'cc - by international license', 'doi']
                    }
    def __init__(self,
                 prep=prep_default
                 ):
        """
        Constructor of the cleaning functions applied to extracted texts/tweets
        :param prep: parameter settings of the text preprocessor (see prep_default)
        """

        # Fill in any missing keys with the class-level defaults; copy first so
        # neither the caller's dict nor prep_default itself gets mutated.
        prep = dict(prep)
        for k in self.prep_default.keys():
            if k not in prep:
                prep[k] = self.prep_default[k]
        self.prep = prep
        self.omit = list(emoticons.keys()) + list(emoticons.values())
        self.text_processor = TextPreProcessor(
            fix_html=True,
            normalize=[],
            segmenter='twitter',
            corrector='twitter',
            fix_text=True,
            unpack_hashtags=True,  # perform word segmentation on hashtags
            unpack_contractions=prep['spell'],  # Unpack contractions (can't -> can not)
            spell_correction=prep['spell'],
            spell_correct_elong=prep['spell'],
            tokenizer=SocialTokenizer(lowercase=prep['lowercase']).tokenize,
            dicts=[{}],
            omit=list(emoticons.keys()) + list(emoticons.values()),
        )
        self.nlp = spacy.load("en_core_web_sm")
        self.nlp_sent = English()  # just the language with no model
        sentencizer = self.nlp_sent.create_pipe("sentencizer")
        self.nlp_sent.add_pipe(sentencizer)



    def clean_tweet(self, text):
        text_list = self.clean_sentences([text])
        if not text_list:
            return None

        return text_list


    def pattern_repl(self, matchobj):
        """
        Return a blank string with the same length as the matched span.
        """
        return ' '.rjust(len(matchobj.group(0)))


    def clean_sentences(self, sentences, min_len_sent=5, max_num_punctuations=10):
        """
        Clean a list of sentences.
        :param sentences: list of sentences (str)
        :param min_len_sent: minimum number of tokens a sentence must have to be kept
        :param max_num_punctuations: maximum number of configured punctuation marks allowed per sentence
        :return: cleaned sentences
        """

        # remove non-English sentences (langdetect may raise on undetectable input)
        en_sentences = []
        for s in sentences:
            try:
                if detect(s) == 'en':
                    en_sentences += [s]
            except Exception:
                continue
        sentences = en_sentences

        # replace non-ASCII characters (e.g. Chinese) with spaces
        sentences = [re.sub(r"([^\x00-\x7F])+", " ", text) for text in sentences]

        # restrict to ascii characters
        sentences = [s.encode('ascii', errors='ignore').decode() for s in sentences]
        # print(f'input sentence is {sentences}.')
        # trim length
        trim_sentences = [s for s in sentences if len(s.split()) > min_len_sent]
        new_trim_sentences = []
        for s in trim_sentences:
            p_count = 0
            for p_ in self.prep['punctuations']:
                p_count += s.count(p_)
            if p_count < max_num_punctuations:
                new_trim_sentences += [s]

        # remove redundant
        trim_sentences = list(set(new_trim_sentences))
        # print(f"trim sentence is: {trim_sentences}.")
        # extra sentence wise pre processing steps
        new_text_list = []
        for sent_ in trim_sentences:
            # space correction on urls
            text = sent_.replace('http: /', 'https:/')
            text = text.replace('https: /', 'https:/')
            text = p.clean(text)
            text = clean(text,
                         fix_unicode=True,  # fix various unicode errors
                         to_ascii=True,  # transliterate to closest ASCII representation
                         lower=True,  # lowercase text
                         no_line_breaks=True,  # fully strip line breaks as opposed to only normalizing them
                         no_urls=True,  # replace all URLs with a special token
                         no_emails=True,  # replace all email addresses with a special token
                         no_phone_numbers=True,  # replace all phone numbers with a special token
                         no_numbers=False,  # replace all numbers with a special token
                         no_digits=False,  # replace all digits with a special token
                         no_currency_symbols=False,  # replace all currency symbols with a special token
                         no_punct=False,  # fully remove punctuation
                         replace_with_url=" ",
                         replace_with_email=" ",
                         replace_with_phone_number=" ",
                         replace_with_number=" ",
                         replace_with_digit=" ",
                         replace_with_currency_symbol=" ",
                         lang="en"  # set to 'de' for German special handling
                         )
            # remove citations
            text = re.sub(r'\[(\s*\d*,*\s*)+\]', '', text)
            text = re.sub(r'\[(\s*\d*-*\s*)+\]', '', text)
            text = re.sub(r'\((\s*\d*,*\s*)+\)', '', text)
            text = re.sub(r'\((\s*\d*-*\s*)+\)', '', text)

            # replace [**Patterns**] with spaces.
            text = re.sub(r'\[\*\*.*?\*\*\]', self.pattern_repl, text)
            # remove hashtag symbol and unpack it
            text = " ".join(self.text_processor.pre_process_doc(text))
            # remove emoticons
            for item in self.omit:
                text = text.replace(item, ' ')
            # remove non-word character-repetitions
            text = re.sub(r'(\W)\1+', r'\1', text)
            if self.prep['remove_sequences']:
                # remove sequences like 'A p p e n d i x'
                text = re.sub(r'(\S\s){3,}', '', text)
            for p_ in self.prep['punctuations']:
                # pad the configured punctuation marks with spaces
                text = text.replace(p_, ' ' + p_ + ' ')
            if self.prep['spell']:
                # spell correction
                text = " ".join(spell_corrector.correct(w) for w in social_tokenizer(text))
            # remove duplicated whitespace
            text = squeezeWhitespace(text)
            
            # drop a leading section keyword such as "results :" from the sentence
            words = text.split(' ')
            if len(words) > 1 and words[0] in self.prep['starting_keywords_to_remove'] and words[1] == ':':
                text = ' '.join(words[2:])
            # blank out sentences that contain any excluded keyword
            for ex_key in self.prep['excluding_criteria']:
                if ex_key in text:
                    text = ''

            # keep the sentence only if it is long enough and contains at least one verb
            doc = self.nlp(text)
            number_of_verbs = len([token.lemma_ for token in doc if token.pos_ == "VERB"])
            # print(f"the text is: {text}.")
            if len(text.split(' ')) > min_len_sent and number_of_verbs > 0:
                new_text_list.append(text)

        return new_text_list
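
A minimal usage sketch for SentClean, assuming the spaCy model en_core_web_sm is installed; the input sentence is made up:

cleaner = SentClean(prep={'lowercase': True, 'punctuations': [',', ';']})
cleaned = cleaner.clean_sentences([
    "The results [1, 2] show that masks reduce transmission, see https://example.org for details."
])
print(cleaned)  # cleaned, English-only sentences that still contain a verb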
Beispiel #26
0
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

text_processor = TextPreProcessor(
    normalize=[
        'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'url',
        'date', 'number'
    ],
    annotate={
        'hashtag', 'allcaps', 'elongated', 'repeated', 'emphasis', 'censored'
    },
    fix_html=True,
    segmenter='twitter',
    corrector='twitter',
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[emoticons])

for filename_input in [
        '1/train.text', '1/test.text', '2/train.text', '2/test.text',
        '3/train.text', '3/test.text'
]:
    filename_output = filename_input + '.out'

    with open(filename_input, 'r') as fin, open(filename_output, 'w') as fout:
        for line in fin:
            fout.write(' '.join(text_processor.pre_process_doc(line.strip())) +
                       '\n')
Beispiel #27
0
from nltk.corpus import stopwords
from tqdm import tqdm
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
# PatternParserLemmatizer is assumed to come from the textblob-de package, which
# matches the German lemmatization performed below.
from textblob_de.lemmatizers import PatternParserLemmatizer


class PreprocessingText:
    def __init__(self, text, **kwargs):
        self.text = text
        self.text_processor = TextPreProcessor(
            # terms that will be normalized, e.g. an email address becomes <email>
            normalize=[
                'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
                'date', 'number'
            ],

            # terms that will be annotated e.g. <hashtag>#test</hashtag>
            annotate={
                "hashtag", "allcaps", "elongated", "repeated", 'emphasis'
            },
            fix_html=True,  # fix HTML tokens
            unpack_hashtags=True,  # perform word segmentation on hashtags

            # select a tokenizer: SocialTokenizer works well when the text is not
            # already whitespace-tokenized; any callable that takes a string and
            # returns a list of tokens can be used instead
            tokenizer=SocialTokenizer(lowercase=True).tokenize,

            # list of dictionaries, for replacing tokens extracted from the text,
            # with other expressions. You can pass more than one dictionaries.
            dicts=[emoticons])

    def remove_stopwords(self, data):
        stop_ger = stopwords.words('german')

        allowed_stopwords = [
            'kein', 'keine', 'keinem', 'keinen', 'keiner', 'keines', 'nicht',
            'nichts'
        ]
        for a in allowed_stopwords:
            stop_ger.remove(a)

        customstopwords = [
            'rt', 'mal', 'heute', 'gerade', 'erst', 'macht', 'eigentlich',
            'warum', 'gibt', 'gar', 'immer', 'schon', 'beim', 'ganz', 'dass',
            'wer', 'mehr', 'gleich', 'wohl'
        ]
        normalizedwords = [
            '<url>', '<email>', '<percent>', '<money>', '<phone>', '<user>',
            '<time>', '<url>', '<date>', '<number>'
        ]
        stop_ger = stop_ger + customstopwords + normalizedwords
        clean_data = []
        if (type(data) == list):
            for d in data:
                data_stop_words = []
                for word in d:
                    if word not in stop_ger:
                        data_stop_words.append(word)
                clean_data.append(data_stop_words)
        if (type(data) == str):
            words = data.split()
            for word in words:
                if word not in stop_ger:
                    clean_data.append(word)
        return clean_data

    def lemmatize_words(self, data):
        _lemmatizer = PatternParserLemmatizer()
        lemmatized_data = []
        if (type(data) == list):
            for d in data:
                text = ""
                for word in d:
                    text = text + " " + word
                l = _lemmatizer.lemmatize(text)
                lemmatized_data.append([i[0] for i in l])
        if (type(data) == str):
            l = _lemmatizer.lemmatize(data)
            lemmatized_data.append([i[0] for i in l])
        return lemmatized_data

    def ekphrasis_preprocessing(self):
        X_clean = []
        if (type(self.text) == str):
            X_clean.append(self.text_processor.pre_process_doc(self.text))
        if (type(self.text) == list):
            for row in tqdm(self.text):
                X_clean.append(self.text_processor.pre_process_doc(row))
        return X_clean
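
A minimal usage sketch for the German pipeline above; the sample tweet is made up, and the NLTK German stopword list and textblob-de must already be installed:

prep = PreprocessingText("RT @user Das ist heute wirklich kein guter Tag! #Montag")
tokens = prep.ekphrasis_preprocessing()   # list containing one token list
tokens = prep.remove_stopwords(tokens)    # drop German and custom stopwords
lemmas = prep.lemmatize_words(tokens)     # list containing one list of lemmas
print(lemmas)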
Beispiel #28
0
    except Exception:
        pass

    # deal with contractions that the tool misses
    tweet = re.sub(
        r"(\b)([Ww]hat|[Ii]t|[Hh]e|[Ss]he|[Tt]hat|[Tt]here|[Hh]ow|[Ww]ho|[Hh]ere|[Ww]here|[Ww]hen)'s",
        r"\1\2 is", tweet)
    tweet = re.sub(r"(\b)([Aa]in)'t", r"is not", tweet)
    tweet = re.sub(r"(\b)([Ww]asn)'t", r"was not", tweet)
    tweet = re.sub(r"(\b)([Hh]e|[Ss]he|[Ii]|[Yy]ou|[Tt]hey|[Ww]e)'d",
                   r"\1\2 would", tweet)
    tweet = re.sub(r"(\b)([Ii]t|[Tt]hat|[Tt]his)'ll", r"\1\2 will", tweet)
    tweet = re.sub(r"(\b)([Cc])'mon", r"come on", tweet)

    # process the rest of the tweet with the nltk tweet tokenizer
    tweet = " ".join(text_processor.pre_process_doc(tweet)).lower()

    clean_tweets.append(tweet)

# below is the code that writes the cleaned tweets to a tsv file
with open('task1_training_cleaned.tsv', mode='w') as tsvfile:
    tsvwriter = csv.writer(tsvfile, delimiter='\t')

    for index, tweet in enumerate(clean_tweets):
        tsvwriter.writerow([target[index], tweet])
# the with-block closes the file, so no explicit close() call is needed
Beispiel #29
0
import io

import numpy as np
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons


class Preprocess:
    def __init__(self):
        self.label2emotion = {0: "others", 1: "happy", 2: "sad", 3: "angry"}
        self.emotion2label = {"others": 0, "happy": 1, "sad": 2, "angry": 3}

        self.emoticons_additional = {
            '(^・^)': '<happy>',
            ':‑c': '<sad>',
            '=‑d': '<happy>',
            ":'‑)": '<happy>',
            ':‑d': '<laugh>',
            ':‑(': '<sad>',
            ';‑)': '<happy>',
            ':‑)': '<happy>',
            ':\\/': '<sad>',
            'd=<': '<annoyed>',
            ':‑/': '<annoyed>',
            ';‑]': '<happy>',
            '(^�^)': '<happy>',
            'angru': 'angry',
            "d‑':": '<annoyed>',
            ":'‑(": '<sad>',
            ":‑[": '<annoyed>',
            '(�?�)': '<happy>',
            'x‑d': '<laugh>',
        }

        self.text_processor = TextPreProcessor(
            # terms that will be normalized
            normalize=[
                'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
                'url', 'date', 'number'
            ],
            # terms that will be annotated
            annotate={
                "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
                'censored'
            },
            fix_html=True,  # fix HTML tokens
            # corpus from which the word statistics are going to be used
            # for word segmentation
            segmenter="twitter",
            # corpus from which the word statistics are going to be used
            # for spell correction
            corrector="twitter",
            unpack_hashtags=True,  # perform word segmentation on hashtags
            unpack_contractions=True,  # Unpack contractions (can't -> can not)
            spell_correct_elong=True,  # spell correction for elongated words
            # select a tokenizer. You can use SocialTokenizer, or pass your own
            # the tokenizer, should take as input a string and return a list of tokens
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            # list of dictionaries, for replacing tokens extracted from the text,
            # with other expressions. You can pass more than one dictionaries.
            dicts=[emoticons, self.emoticons_additional])

    def tokenize(self, text):
        text = " ".join(self.text_processor.pre_process_doc(text))
        return text

    def preprocessData(self, dataFilePath, mode):
        conversations = []
        labels = []
        with io.open(dataFilePath, encoding="utf8") as finput:
            finput.readline()
            for line in finput:
                line = line.strip().split('\t')
                for i in range(1, 4):
                    line[i] = self.tokenize(line[i])
                if mode == "train":
                    labels.append(self.emotion2label[line[4]])
                conv = line[1:4]
                conversations.append(conv)
        if mode == "train":
            return np.array(conversations), np.array(labels)
        else:
            return np.array(conversations)
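
A minimal usage sketch; the file name is illustrative, and the expected input is the tab-separated format implied by preprocessData (a header line, then an id, three conversation turns and, in train mode, a label in the fifth column):

preprocess = Preprocess()
print(preprocess.tokenize("I'm sooo happyyy :D"))
train_conversations, train_labels = preprocess.preprocessData("train.txt", mode="train")
print(train_conversations.shape, train_labels.shape)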
Beispiel #30
0
import re

from nltk.tokenize import TweetTokenizer
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.segmenter import Segmenter
from ekphrasis.dicts.emoticons import emoticons


def clean_tweets(df):
    # define the text preprocessor
    text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=['url', 'email', 'money', 'phone', 'time', 'date'],
        # terms that will be annotated
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
            'censored'
        },
        fix_html=True,  # fix HTML tokens

        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",

        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # Unpack contractions (can't -> can not)
        spell_correct_elong=False,  # spell correction for elongated words

        # select a tokenizer. You can use SocialTokenizer, or pass your own
        # the tokenizer, should take as input a string and return a list of tokens
        #tokenizer=SocialTokenizer(lowercase=True).tokenize,
        tokenizer=TweetTokenizer().tokenize,

        # list of dictionaries, for replacing tokens extracted from the text,
        # with other expressions. You can pass more than one dictionaries.
        dicts=[emoticons])
    seg = Segmenter(corpus="twitter")

    tweet_text = df.tweet_text.to_list()

    clean_tweets = []
    for tweet in tweet_text:

        # manually tag usernames
        # ex: @DoctorChristian -> <user> doctor christian </user>
        match = re.findall(r'@\w+', tweet)

        try:
            for at in match:
                user_seg = seg.segment(at[1:])
                tweet = tweet.replace(at, '<user> ' + user_seg + ' </user>')
        except Exception:
            pass

        # manually tag all caps so that the unpack_contractions functions works
        match = re.findall(r"(?<![#@$])\b([A-Z][A-Z ,.']*[A-Z])\b", tweet)

        try:
            for all_caps in match:
                tweet = tweet.replace(
                    all_caps, '<allcaps> ' + all_caps.lower() + ' </allcaps>')
        except Exception:
            pass

        # manually tag percentages
        # ex: 12.5% -> <percent> 12.5 </percent>
        match = re.findall(r"(\d+\.?\d?%)", tweet)

        try:
            for percent in match:
                tweet = tweet.replace(
                    percent,
                    '<percent> ' + percent[0:len(percent) - 1] + ' </percent>')
        except Exception:
            pass

        # deal with contractions that the tool misses
        tweet = re.sub(
            r"(\b)([Ww]hat|[Ii]t|[Hh]e|[Ss]he|[Tt]hat|[Tt]here|[Hh]ow|[Ww]ho|[Hh]ere|[Ww]here|[Ww]hen)'s",
            r"\1\2 is", tweet)
        tweet = re.sub(r"(\b)([Aa]in)'t", r"is not", tweet)
        tweet = re.sub(r"(\b)([Ww]asn)'t", r"was not", tweet)
        tweet = re.sub(r"(\b)([Hh]e|[Ss]he|[Ii]|[Yy]ou|[Tt]hey|[Ww]e)'d",
                       r"\1\2 would", tweet)
        tweet = re.sub(r"(\b)([Ii]t|[Tt]hat|[Tt]his)'ll", r"\1\2 will", tweet)
        tweet = re.sub(r"(\b)([Cc])'mon", r"come on", tweet)

        # process the rest of the tweet with the ekphrasis preprocessor (which uses the nltk tweet tokenizer)
        tweet = " ".join(text_processor.pre_process_doc(tweet)).lower()

        clean_tweets.append(tweet)

    # below is code to create the tsv file of cleaned tweets
    df['tweet_text'] = clean_tweets

    return df
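
A minimal usage sketch; the DataFrame is made up and only needs the tweet_text column that clean_tweets reads:

import pandas as pd

sample_df = pd.DataFrame({'tweet_text': [
    "@DoctorChristian I LOVE this, it's 100% true!!! http://example.com"
]})
cleaned_df = clean_tweets(sample_df)
print(cleaned_df['tweet_text'].iloc[0])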