def _read_tweet_column(filepath):
    """Return the lower-cased second CSV column of every data row in *filepath*.

    The first row is treated as a header and skipped. Rows with fewer than
    two columns yield an empty string so that the output keeps one entry per
    row.
    """
    # Local import: the module-level import block is not visible from here.
    import csv

    # newline='' lets the csv module handle quoted fields that span lines.
    with open(filepath, 'r', encoding='utf-8', errors='ignore',
              newline='') as handle:
        rows = csv.reader(handle)
        next(rows, None)  # skip the header row (no-op on an empty file)
        return [row[1].lower() if len(row) > 1 else '' for row in rows]


def preprocess_data(genuine_filepath, bot_filepath):
    """Preprocess data and normalize tweets.

    The tweet text is read from the second column of each CSV file with the
    ``csv`` module, so commas and quotes inside a quoted tweet no longer
    truncate it (the previous ``line.split(',')`` parsing cut such tweets at
    the first comma).

    Args:
        genuine_filepath: Path to the CSV file with tweets of genuine accounts.
        bot_filepath: Path to the CSV file with tweets of bot accounts.

    Returns:
        A ``(genuine_sentences, bot_sentences)`` tuple. Each element is a list
        holding the ``pre_process_doc`` result for every data row of the
        corresponding file.
    """
    bot_sentences = _read_tweet_column(bot_filepath)
    genuine_sentences = _read_tweet_column(genuine_filepath)

    text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=[
            'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'url',
            'date', 'number'
        ],
        # terms that will be annotated
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
            'censored'
        },
        fix_html=True,  # fix HTML tokens
        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",
        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # Unpack contractions (can't -> can not)
        spell_correct_elong=False,  # spell correction for elongated words
        # select a tokenizer. You can use SocialTokenizer, or pass your own
        # the tokenizer, should take as input a string and return a list of tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        # list of dictionaries, for replacing tokens extracted from the text,
        # with other expressions. You can pass more than one dictionaries.
        dicts=[emoticons])

    bot_sentences = [text_processor.pre_process_doc(s) for s in bot_sentences]
    genuine_sentences = [
        text_processor.pre_process_doc(s) for s in genuine_sentences
    ]
    return genuine_sentences, bot_sentences
class TextPreprocessor():
    """Text clean-up helpers: an ekphrasis pipeline plus regex decontraction."""

    # Ordered (regex, replacement) rules used by ``do_decontraction``.
    # The specific rules must run before the generic "n't" rule.
    # The character classes are written "[Ww]" on purpose: the former
    # "[W, w]" spelling also matched a comma or a space, so e.g. " t's" was
    # rewritten to "it is".
    # The former "she's" and "didn't" rules are omitted because they could
    # never match: "he's" and "n't" have already been rewritten by the time
    # they would run.
    # 'd is not handled because it is ambiguous (had / would).
    _DECONTRACTION_RULES = (
        (r"[Ww]on\'t", "will not"),
        (r"[Cc]an\'t", "can not"),
        (r"[Cc]annot", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"[Hh]e\'s", "he is"),
        (r"[Ii]t\'s", "it is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )

    def __init__(self):
        """Build the ekphrasis ``TextPreProcessor`` used by this instance."""
        self.text_processor_options = TextPreProcessor(
            normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                       'time', 'url', 'date', 'number'],
            unpack_contractions=False,
            annotate={"allcaps", "elongated", "repeated", 'emphasis',
                      'censored'},
            fix_html=True,  # fix HTML tokens
            # corpus from which the word statistics are going to be used
            # for word segmentation and correction
            segmenter="english",
            corrector="english",
            unpack_hashtags=False,  # perform word segmentation on hashtags
            spell_correct_elong=False,  # spell correction for elongated words
            # the tokenizer, should take as input a string and return a list of tokens
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            # list of dictionaries, for replacing tokens extracted from the text,
            dicts=[emoticons]
        )

    def do_ekphrasis_preprocessing(self, sentences):
        """Run the ekphrasis pipeline on one string or on a numpy array.

        Args:
            sentences: A single ``str`` or a numpy array of strings.

        Returns:
            For a ``str``: the direct ``pre_process_doc`` result.
            For an array: a numpy array with one result per input element.
        """
        if isinstance(sentences, str):
            return self.text_processor_options.pre_process_doc(sentences)
        assert (type(sentences).__module__ == np.__name__)
        preprocessed = [self.text_processor_options.pre_process_doc(s)
                        for s in sentences]
        try:
            return np.array(preprocessed)
        except ValueError:
            # Token lists of different lengths are ragged; recent NumPy
            # refuses to build an array from them unless dtype=object is
            # requested explicitly.
            return np.array(preprocessed, dtype=object)

    def do_decontraction(self, sentences):
        """Expand common English contractions.

        Args:
            sentences: A single ``str`` or a numpy array of strings.

        Returns:
            A numpy array of the rewritten strings (one element for a
            ``str`` input).
        """
        if isinstance(sentences, str):
            sentences = np.array([sentences])
        assert (type(sentences).__module__ == np.__name__)
        preprocessed = []
        for s in sentences:
            for pattern, replacement in self._DECONTRACTION_RULES:
                s = re.sub(pattern, replacement, s)
            preprocessed.append(s)
        return np.array(preprocessed)
def build_vocab(dataset):
    """Collect every distinct token produced for the texts in *dataset*.

    Args:
        dataset: Iterable of ``(text_tensor, _)`` pairs. ``text_tensor`` must
            expose ``.numpy()``; element 1 of that result is decoded as
            UTF-8 text.

    Returns:
        The set of all tokens returned by ``pre_process_doc``.
    """
    processor = TextPreProcessor(
        normalize=[
            'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'url',
            'date', 'number'
        ],
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
            'censored'
        },
        fix_html=True,
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons])

    vocabulary = set()
    for tensor, _unused in dataset:
        raw = tensor.numpy()[1]
        decoded = str(raw, 'utf-8')
        vocabulary.update(processor.pre_process_doc(decoded))
    return vocabulary
def build_vocab_list(dataframe):
    """Tokenize the ``tweet`` column and return its vocabulary and texts.

    Args:
        dataframe: DataFrame with a ``tweet`` column. Rows are looked up as
            ``dataframe["tweet"][i]`` for ``i`` in ``range(len(rows))``.

    Returns:
        ``(vocab_set, df_sentenses)``: the set of all tokens, and a DataFrame
        with a single ``content`` column holding the space-joined tokens of
        each tweet.
    """
    processor = TextPreProcessor(
        normalize=[
            'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'url',
            'date', 'number'
        ],
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
            'censored'
        },
        fix_html=True,
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons])

    vocabulary = set()
    joined_tweets = []
    row_count = dataframe.shape[0]
    for position in range(row_count):
        tokens = processor.pre_process_doc(dataframe["tweet"][position])
        joined_tweets.append(" ".join(tokens))
        vocabulary.update(tokens)

    return vocabulary, pd.DataFrame(joined_tweets, columns=['content'])
def _drop_placeholder_tokens(tokens):
    """Return *tokens* without placeholders, emoji tags and pic.twitter tails.

    Processing stops at the first ``pic . twitter`` token triple, and an
    emoji tag that is never closed by ``>`` discards the rest of the tweet.
    """
    placeholders = {
        "<url>", "<email>", "<user>", "<money>", "<percent>", "<phone>",
        "<time>", "<date>", "<number>"
    }
    kept = []
    count = len(tokens)
    i = 0
    while i < count:
        token = tokens[i]
        # Bounds-checked look-ahead: a trailing "pic" or "<" is ordinary text
        # and must be kept instead of ending the scan through an IndexError.
        if (token == "pic" and i + 2 < count and tokens[i + 1] == "."
                and tokens[i + 2] == "twitter"):
            break
        if token in placeholders:
            i += 1
        elif token == "<" and i + 1 < count and tokens[i + 1] == "emoji":
            # Skip everything up to and including the closing ">".
            while i < count and tokens[i] != ">":
                i += 1
            i += 1
        else:
            kept.append(token)
            i += 1
    return kept


def preprocess_dataset(tweets, y):
    """uses ekphrasis API to preprocess the tweets

    Each tweet is tokenized, then normalization placeholders, emoji tags and
    anything from a ``pic . twitter`` token triple onwards are removed.
    Tweets with no token left are dropped together with their label.

    Args:
        tweets: Indexable collection of raw tweet strings.
        y: Labels, indexable in parallel with ``tweets``.

    Returns:
        ``(filter_tweets, ynew)``: the space-joined remaining tokens of every
        kept tweet, and the labels of those tweets.
    """
    text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=[
            'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'url',
            'date', 'number'
        ],
        # terms that will be annotated
        fix_html=True,  # fix HTML tokens
        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",
        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # Unpack contractions (can't -> can not)
        spell_correct_elong=False,  # spell correction for elongated words
        spell_correction=False,
        # select a tokenizer. You can use SocialTokenizer, or pass your own
        # the tokenizer, should take as input a string and return a list of tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        # list of dictionaries, for replacing tokens extracted from the text,
        # with other expressions. You can pass more than one dictionaries.
        dicts=[emoticons])

    ynew = []
    filter_tweets = []
    for t in range(len(tweets)):
        tokens = text_processor.pre_process_doc(tweets[t])
        newtokens = _drop_placeholder_tokens(tokens)
        if newtokens:
            filter_tweets.append(" ".join(newtokens))
            ynew.append(y[t])
    return filter_tweets, ynew


# tokenizing and other preprocessing
# removes emojis
def clean_then_tokenize_text(data):
    """Normalize and re-join every line of every entry in *data*.

    For each key, every item obtained by iterating ``data[key]`` is passed
    through ekphrasis (user and url normalization) and then through
    ``text_to_word_sequence``; the resulting words are joined with single
    spaces. The cleaned list is stored under ``data[key]['cln_text']``.

    Args:
        data: Mapping whose values are iterable and support item assignment.

    Returns:
        A flat list with the cleaned lines of all keys, in iteration order.
    """
    processor = TextPreProcessor(
        normalize=['user','url'],)
    collected = []
    for key in data:
        cleaned = [
            " ".join(text_to_word_sequence(processor.pre_process_doc(line)))
            for line in data[key]
        ]
        data[key]['cln_text'] = cleaned
        collected.extend(cleaned)
    return collected
class EkhprasisPreprocessor(Preprocessor):
    """Preprocessor that delegates to an ekphrasis ``TextPreProcessor``."""

    def __init__(self, verbose: int=0, omit=None, normalize=None,
                 annotate={"hashtag", "allcaps", "elongated", "repeated", 'emphasis'},
                 segmenter="twitter", corrector="twitter", unpack_hashtags=False,
                 unpack_contractions=True, spell_correct_elong=True,
                 spell_correction=True, tokenizer=Tokenizer(lowercase=True),
                 dicts=None):
        """Configure the underlying ekphrasis pipeline.

        ``omit``, ``normalize`` and ``dicts`` fall back to the defaults below
        when left as ``None``. Every other argument is forwarded unchanged to
        ``TextPreProcessor``; ``tokenizer`` must expose a ``tokenize`` method.
        """
        super().__init__(name="EkhprasisPreprocessor", verbose=verbose)

        dicts = [others, emoticons_original] if dicts is None else dicts
        normalize = ['number'] if normalize is None else normalize
        if omit is None:
            omit = ['email', 'percent', 'money', 'phone', 'user',
                    'time', 'url', 'date']

        logging.info("{} loading...".format(self._name))

        pipeline_options = dict(
            omit=omit,                                # terms removed entirely
            normalize=normalize,                      # terms that will be normalized
            annotate=annotate,                        # terms that will be annotated
            segmenter=segmenter,                      # corpus for word segmentation
            corrector=corrector,                      # corpus for spell correction
            unpack_hashtags=unpack_hashtags,          # segment hashtags into words
            unpack_contractions=unpack_contractions,  # can't -> can not
            spell_correct_elong=spell_correct_elong,  # fix elongated words
            spell_correction=spell_correction,        # general spell correction
            tokenizer=tokenizer.tokenize,             # str -> list of tokens
            dicts=dicts,                              # token replacement tables
        )
        self.tweet_processor = TextPreProcessor(**pipeline_options)

    def _preprocess(self, sentence) -> str:
        """Return the processed tokens joined by single spaces."""
        tokens = self.tweet_processor.pre_process_doc(sentence)
        joined = ' '.join(tokens)
        # split()/join collapses any whitespace runs left inside tokens.
        return ' '.join(joined.split())
def datastories_processor(x):
    """Tokenize each sentence in *x* and re-join its tokens.

    Args:
        x: Iterable of raw sentences.

    Returns:
        A list with one string per sentence, in which every token is
        preceded by a single space (so non-empty results start with a space).
    """
    from ekphrasis.classes.preprocessor import TextPreProcessor
    from ekphrasis.classes.tokenizer import SocialTokenizer
    from ekphrasis.dicts.emoticons import emoticons

    processor = TextPreProcessor(
        # terms that will be normalized
        normalize=[
            'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'url',
            'date', 'number'
        ],
        # terms that will be annotated
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
            'censored'
        },
        fix_html=True,  # fix HTML tokens
        segmenter="twitter",  # word statistics used for segmentation
        corrector="twitter",  # word statistics used for spell correction
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # Unpack contractions (can't -> can not)
        spell_correct_elong=False,  # spell correction for elongated words
        # the tokenizer takes a string and returns a list of tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        # dictionaries for replacing tokens with other expressions
        dicts=[emoticons])

    tokenized = [processor.pre_process_doc(sentence) for sentence in x]
    return [''.join(' ' + word for word in tokens) for tokens in tokenized]
class EkphrasisProxy():
    """Thin keyword-configurable wrapper around ekphrasis ``TextPreProcessor``."""

    def __init__(self, **kwargs):
        """Build the pipeline from optional keyword overrides.

        Recognised keys: ``omit``, ``normalize``, ``annotate``, ``fix_html``,
        ``segmenter``, ``corrector``, ``unpack_hashtags``,
        ``unpack_contractions``, ``fix_elongation``, ``spell_correction``
        and ``fix_bad_unicode``. Missing keys use the defaults shown below.
        """
        self.text_processor = TextPreProcessor(
            # Read the omit list from its own key. It used to be read from
            # 'normalize', so a caller-supplied normalize list was also
            # passed as the omit list.
            omit=kwargs.get('omit', []),
            normalize=kwargs.get(
                'normalize',
                ['url', 'email', 'phone', 'user', 'time', 'url', 'date']),
            annotate=kwargs.get('annotate', {}),
            fix_html=kwargs.get('fix_html', True),
            segmenter=kwargs.get('segmenter', "twitter"),
            corrector=kwargs.get('corrector', "twitter"),
            unpack_hashtags=kwargs.get('unpack_hashtags', True),
            unpack_contractions=kwargs.get('unpack_contractions', True),
            # The public key is 'fix_elongation'; ekphrasis calls the option
            # spell_correct_elong.
            spell_correct_elong=kwargs.get('fix_elongation', True),
            spell_correction=kwargs.get('spell_correction', True),
            fix_bad_unicode=kwargs.get('fix_bad_unicode', True),
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            dicts=[emoticons])

    def preprocess_text(self, text):
        """Return the ``pre_process_doc`` result for *text*."""
        return self.text_processor.pre_process_doc(text)
def preprocess_through_ekphrasis(train_file_path, test_file_path,
                                 trial_file_path):
    """Write an ekphrasis-processed copy of each tab-separated input file.

    Every input line is split on tabs: column 0 is the label and column 1
    the text. The text is processed and its tokens joined with spaces. The
    result is written as ``label<TAB>text`` lines to a file named after the
    input with its last four characters replaced by ``_ekphrasis.csv``.
    The ``[#TRIGGERWORD#]`` and ``[NEWLINE]`` markers are restored after
    processing.

    Args:
        train_file_path: Path of the training file.
        test_file_path: Path of the test file.
        trial_file_path: Path of the trial file.
    """
    processor = TextPreProcessor(
        normalize=[
            'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'url',
            'date', 'number'
        ],
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
            'censored'
        },
        fix_html=True,
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=True,
        spell_correction=True,
        all_caps_tag="wrap",
        fix_bad_unicode=True,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons])

    for file_path in (train_file_path, test_file_path, trial_file_path):
        records = []
        with open(file_path, 'r', newline='') as source:
            for line in source:
                columns = line.split('\t')
                label = columns[0]
                tokens = processor.pre_process_doc(columns[1])
                records.append((label, " ".join(tokens)))

        target_path = file_path[:-4] + "_ekphrasis.csv"
        with open(target_path, 'w', newline='') as target:
            for label, sentence in records:
                # Undo what the pipeline did to the two special markers.
                restored = sentence.replace(
                    "[ <hashtag> triggerword </hashtag> #]", "[#TRIGGERWORD#]")
                restored = restored.replace(
                    "[ <allcaps> newline </allcaps> ]", "[NEWLINE]")
                target.write("{}\t{}\n".format(label, restored))
class SentencePreprocessor:
    """Sentence-level wrapper around an ekphrasis text pipeline."""

    # Typographic quotes mapped to their ASCII counterparts; they are
    # replaced before the text reaches ekphrasis.
    _QUOTE_TABLE = str.maketrans({'‘': '\'', '’': '\'', '“': '"', '”': '"'})

    def __init__(self):
        """Create the ekphrasis ``TextPreProcessor`` used by ``prepro_sent``."""
        pipeline_options = dict(
            # terms that will be normalized
            normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                       'time', 'url', 'date', 'number'],
            # terms that will be annotated
            annotate={"hashtag", "allcaps", "elongated", "repeated",
                      'emphasis', 'censored'},
            fix_html=True,  # fix HTML tokens
            segmenter="english",  # word statistics used for segmentation
            corrector="english",  # word statistics used for spell correction
            unpack_hashtags=True,  # perform word segmentation on hashtags
            unpack_contractions=True,  # Unpack contractions (can't -> can not)
            spell_correct_elong=False,  # spell correction for elongated words
            # the tokenizer takes a string and returns a list of tokens
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            # dictionaries for replacing tokens with other expressions
            dicts=[emoticons],
        )
        self.text_processor = TextPreProcessor(**pipeline_options)

    def prepro_sent(self, sent):
        """Return *sent* processed and re-joined with single spaces."""
        # As smart quote is not handled in ekphrasis
        ascii_quoted = sent.translate(self._QUOTE_TABLE)
        tokens = self.text_processor.pre_process_doc(ascii_quoted)
        return ' '.join(tokens)
def twitter_preprocess(self):
    """Return the ekphrasis-processed form of ``self.data``, using a cache.

    The result is cached as a pickle at
    ``./cached/preprocessed_<self.name>.pkl``. On a cache hit the file is
    loaded and returned without building the ekphrasis pipeline. On a miss
    the data is processed, the cache directory is created if needed, and
    the result is stored.

    Returns:
        A list with the ``pre_process_doc`` result of every item in
        ``self.data`` (or whatever object the cache file contains).
    """
    cache_dir = os.path.join('./', "cached")
    cache_file = os.path.join(cache_dir, "preprocessed_" + self.name + ".pkl")

    if os.path.isfile(cache_file):
        # SECURITY NOTE: pickle.load can execute arbitrary code. The cache
        # directory must only contain files written by this function.
        with open(cache_file, 'rb') as f:
            return pickle.load(f)

    # Built only on a cache miss.
    preprocessor = TextPreProcessor(
        normalize=[
            'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
            'date', 'number'
        ],
        annotate={
            "hashtag", "elongated", "allcaps", "repeated", 'emphasis',
            'censored'
        },
        all_caps_tag="wrap",
        fix_text=True,
        segmenter="twitter_2018",
        corrector="twitter_2018",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons])

    preprocessed = [
        preprocessor.pre_process_doc(x)
        for x in tqdm(self.data, desc="Preprocessing dataset...")
    ]

    # The directory may not exist on a first run; without this the write
    # below fails after all the processing work has been done.
    os.makedirs(cache_dir, exist_ok=True)
    with open(cache_file, 'wb') as f:
        pickle.dump(preprocessed, f)
    return preprocessed
# Tokenize every tweet in ``tw_`` and train a skip-gram Word2Vec model on it.
tok = []
tk = TweetTokenizer()
p = Preprocess()
text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=[
        'email', 'percent', 'money', 'phone', 'time', 'url', 'date', 'number'
    ],
    fix_html=True,  # fix HTML tokens
    segmenter="twitter",
    corrector="twitter",
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words
    dicts=[emoticons])

for i in tw_:
    # Start from the current tweet. The loop used to feed ``line`` back into
    # itself, so the tweets in ``tw_`` were never actually read.
    line = p.preprocess_mentions(i, repl='<mention>')
    line = p.preprocess_reserved_words(line, repl='<reserved>')
    # No tokenizer is configured above; the result is handed to
    # TweetTokenizer for the actual tokenization.
    line = text_processor.pre_process_doc(line)
    tok.append(tk.tokenize(line))

# Show a sample of the tokenized tweets.
for k in tok[1:100]:
    print(k)

model = Word2Vec(tok, min_count=5, size=300, window=5, sg=1)
model.train(tok, total_examples=len(tok), epochs=100)
model.wv.save_word2vec_format('w2v_positive_300.bin', binary=True)
# Rewrite the labelled-tweet CSV with its text column replaced by the
# ekphrasis-processed text. ``text_processor`` and ``col_text`` are expected
# to be defined earlier in the script.
input_data_file = "/home/zz/Work/chase/data/ml/ml/rm/labeled_data_all_corrected.csv"
#input_data_file="/home/zz/Work/chase/data/ml/ml/dt/labeled_data_all_2.csv"
#input_data_file="/home/zz/Work/chase/data/ml/ml/w/labeled_data_all.csv"
#input_data_file="/home/zz/Work/chase/data/ml/ml/w+ws/labeled_data_all.csv"
#input_data_file="/home/zz/Work/chase/data/ml/ml/ws-exp/labeled_data_all.csv"
#input_data_file="/home/zz/Work/chase/data/ml/ml/ws-amt/labeled_data_all.csv"
#input_data_file="/home/zz/Work/chase/data/ml/ml/ws-gb/labeled_data_all.csv"

raw_data = pd.read_csv(input_data_file, sep=',', encoding="utf-8")
header_row = list(raw_data.columns.values)

# Tokens removed from every processed tweet.
_DROPPED_TOKENS = {'<elongated>', '<emphasis>', 'RT', '"'}

# The input is read as UTF-8, so the output is written as UTF-8 too instead
# of relying on the platform default encoding.
with open(input_data_file + "c.csv", 'w', newline='',
          encoding="utf-8") as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',',
                           quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csvwriter.writerow(header_row)

    for row in raw_data.iterrows():
        tweet = list(row[1])
        tokens = text_processor.pre_process_doc(tweet[col_text])
        # reset content
        tweet[col_text] = " ".join(
            token for token in tokens if token not in _DROPPED_TOKENS)
        csvwriter.writerow(tweet)
def _clean_tweet(text, text_processor):
    """Return the cleaned, ASCII-only form of one raw tweet (not stripped)."""
    # remove RT and USER
    s = re.sub(r'RT @[\w_]+: ', ' ', text)
    # remove special characters (numeric HTML character references)
    s = re.sub(r'&#\d+;', ' ', s)
    # pre-processing
    s = " ".join(text_processor.pre_process_doc(s))
    # drop the <...> tags
    s = re.sub(r'\<[^>]*\>', ' ', s)
    # Remove non-ascii words or characters
    s = "".join(ch for ch in s if ord(ch) < 128)
    # The patterns below are regular expressions. They used to be passed to
    # str.replace, which treats them as literal text, so they never matched.
    s = re.sub(r'_[\S]?', '', s)
    # "{2, }" (with a space) is not a valid quantifier; "{2,}" is.
    s = re.sub(r'[ ]{2,}', ' ', s)
    # Replace '&' by 'and'.
    # NOTE(review): the original pattern was the literal text '&?'. Taken as
    # a regex it would match the empty string everywhere, so a plain
    # ampersand replacement is used. Confirm this is the intent.
    # The former replacements of '<' and '>' by themselves were no-ops and
    # are omitted.
    s = s.replace('&', 'and')
    # Insert space between words and punctuation marks
    s = re.sub(r'([\w\d]+)([^\w\d ]+)', r'\1 \2', s)
    s = re.sub(r'([^\w\d ]+)([\w\d]+)', r'\1 \2', s)
    return s


def preprocess(df):
    """Clean the ``text`` column of *df* and derive BERT-ready columns.

    The passed frame gains the columns ``ProcessedText``,
    ``ProcessedText_length``, ``ProcessedText_BERT`` and
    ``ProcessedText_BERTbase_length``; the first two are filled in place.

    Args:
        df: DataFrame with a ``text`` column and a ``labels`` column.

    Returns:
        A new DataFrame restricted to rows whose cleaned text has more than
        two whitespace-separated tokens, de-duplicated on ``ProcessedText``.
        It has ``ProcessedText_BERT`` ('[CLS] ' + cleaned text),
        ``ProcessedText_BERTbase_length`` (number of BERT word pieces) and
        ``Mapped_label`` (integer id per label, most frequent label first).
    """
    df['ProcessedText'] = None
    df['ProcessedText_length'] = 0
    df['ProcessedText_BERT'] = None
    df['ProcessedText_BERTbase_length'] = 0

    print(df.columns)

    text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'url', 'date', 'number'],
        # terms that will be annotated
        annotate={"hashtag", "allcaps", "elongated", "repeated",
                  'emphasis', 'censored'},
        fix_html=True,  # fix HTML tokens

        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",

        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",

        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # Unpack contractions (can't -> can not)
        spell_correct_elong=True,  # spell correction for elongated words

        # select a tokenizer. You can use SocialTokenizer, or pass your own
        # the tokenizer, should take as input a string and return a list of tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,

        # list of dictionaries, for replacing tokens extracted from the text,
        # with other expressions. You can pass more than one dictionaries.
        dicts=[emoticons])

    # Tweets pre-processing. Columns are assigned positionally, so this also
    # works when the frame's index contains duplicates.
    cleaned = [_clean_tweet(text, text_processor) for text in df['text']]
    df['ProcessedText'] = [s.strip() for s in cleaned]
    # Text length (in tokens) for later use in LSTM
    df['ProcessedText_length'] = [len(s.split()) for s in cleaned]

    # Drop texts with length <=2 and drop duplicates. The explicit copy
    # keeps the column assignments below from targeting a slice of the
    # original frame.
    df = df[df['ProcessedText_length'] > 2]
    df = df.drop_duplicates(subset=['ProcessedText']).copy()

    # BERT preprocess
    df['ProcessedText_BERT'] = '[CLS] ' + df.ProcessedText
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    df['ProcessedText_BERTbase_length'] = [
        len(tokenizer.tokenize(sent)) for sent in df.ProcessedText_BERT
    ]
    # tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
    # df['ProcessedText_BERTlarge_length'] = [len(tokenizer.tokenize(sent)) for sent in df.ProcessedText_BERT]

    label_dict = {
        label: i for i, label in enumerate(df.labels.value_counts().keys())
    }
    df['Mapped_label'] = [label_dict[label] for label in df.labels]

    return df
class SentimentDataset(Dataset):
    """Dataset of tokenized messages with their topics and string labels."""

    def __init__(self, file, max_length, max_topic_length, word2idx, tword2idx,
                 topic_bs):
        """
        A PyTorch Dataset
        What we have to do is to implement the 2 abstract methods:

            - __len__(self): in order to let the DataLoader know the size
                of our dataset and to perform batching, shuffling and so on...
            - __getitem__(self, index): we have to return the properly
                processed data-item from our dataset with a given index

        Args:
            file (str): path to the data file
            max_length (int): the max length for each sentence.
                if 0 then use the maximum length in the dataset
            max_topic_length (int): the max length for each topic.
                if 0 then use the maximum topic length in the dataset
            word2idx (dict): a dictionary which maps words to indexes
            tword2idx (dict): mapping used to vectorize the topic tokens
            topic_bs: when truthy, every loaded record is read as
                (topic, label, text); otherwise as (label, text)
        """
        self.text_processor = TextPreProcessor(
            # terms that will be normalized
            normalize=[
                'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
                'url', 'date', 'number'
            ],
            # terms that will be annotated
            annotate={
                "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
                'censored'
            },
            fix_html=True,  # fix HTML tokens

            # corpus from which the word statistics are going to be used
            # for word segmentation
            segmenter="twitter",

            # corpus from which the word statistics are going to be used
            # for spell correction
            corrector="twitter",
            unpack_hashtags=True,  # perform word segmentation on hashtags
            unpack_contractions=True,  # Unpack contractions (can't -> can not)
            spell_correct_elong=False,  # spell correction for elongated words

            # select a tokenizer. You can use SocialTokenizer, or pass your own
            # the tokenizer, should take as input a string and return a list of tokens
            tokenizer=SocialTokenizer(lowercase=True).tokenize,

            # list of dictionaries, for replacing tokens extracted from the text,
            # with other expressions. You can pass more than one dictionaries.
            dicts=[emoticons])

        self.word2idx = word2idx
        self.tword2idx = tword2idx

        print("loading dataset from {}...".format(file))
        _data = load_data_from_dir(file)
        if topic_bs:
            self.data = [x[2] for x in _data]
            self.labels = [x[1] for x in _data]
            self.topics = [x[0] for x in _data]
        else:
            self.data = [x[1] for x in _data]
            self.labels = [x[0] for x in _data]
            # Records carry no topic here. ``self.topics`` used to be left
            # unset, which made the tokenization below raise AttributeError.
            # NOTE(review): an empty topic per sample is assumed to be the
            # right stand-in; confirm against the consumers of the topic
            # vector.
            self.topics = ["" for _ in _data]

        print("Tokenizing...")
        # self.data = [tokenize(x) for x in self.data]
        self.data = [self.text_processor.pre_process_doc(x) for x in self.data]
        self.topics = [
            self.text_processor.pre_process_doc(x) for x in self.topics
        ]

        # if max_length == 0, then set max_length
        # to the maximum sentence length in the dataset
        # (default=0 keeps an empty dataset from raising in max())
        if max_length == 0:
            self.max_length = max((len(x) for x in self.data), default=0)
        else:
            self.max_length = max_length

        if max_topic_length == 0:
            self.max_topic_length = max((len(x) for x in self.topics),
                                        default=0)
        else:
            self.max_topic_length = max_topic_length

        # define a mapping for the labels,
        # for transforming the string labels to numbers
        self.label_encoder = preprocessing.LabelEncoder()
        self.label_encoder = self.label_encoder.fit(self.labels)

        self.label_count = Counter(self.labels)

        # Per-class sample counts. LabelEncoder sorts the string labels, which
        # puts '-2' right after '-1'. The second entry used to repeat the
        # count of '2', so class '-2' received the wrong weight.
        self.weights = [
            self.label_count['-1'], self.label_count['-2'],
            self.label_count['0'], self.label_count['1'], self.label_count['2']
        ]

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, index):
        """
        Returns the _transformed_ item from the dataset

        Args:
            index (int):

        Returns:
            (tuple):
                * message: result of ``vectorize`` for the sample tokens
                * topic: result of ``vectorize`` for the topic tokens
                * label: the class label as encoded by the label encoder
                * length (int): the length (tokens) of the sentence
                * topic length (int): the length (tokens) of the topic
                * weights (list): the per-class sample counts
                * index (int): the index of the returned dataitem in the dataset.
                  It is useful for getting the raw input for visualizations.

        Examples:
            For an `index` where:
            ::
                self.data[index] = ['super', 'eagles', 'coach', 'sunday', 'oliseh',
                                    'meets', 'with', 'chelsea', "'", 's', 'victor',
                                    'moses', 'in', 'london', '<url>']
                self.labels[index] = "neutral"

            the function will return:
            ::
                example = [  533  3908  1387   649 38127  4118    40  1876    63   106  7959 11520
                            22   888     7     0     0     0     0     0     0     0     0     0
                             0     0     0     0     0     0     0     0     0     0     0     0
                             0     0     0     0     0     0     0     0     0     0     0     0
                             0     0]
                label = 1
        """
        sample, label, topic = self.data[index], self.labels[
            index], self.topics[index]

        # transform the sample and the label,
        # in order to feed them to the model
        message = vectorize(sample, self.word2idx, self.max_length)
        topic = vectorize(topic, self.tword2idx, self.max_topic_length)
        label = self.label_encoder.transform([label])[0]

        return message, topic, label, len(self.data[index]), len(
            self.topics[index]), self.weights, index
import numpy as np from nltk.tokenize import TweetTokenizer as TweetTokenizer from nltk.corpus import stopwords import random as rn stop_words = set(stopwords.words('spanish')) i = 0 matrixTweetsEmb = np.zeros((len(matrixTweets), 100, 50, 300)) for tweetsUser in matrixTweets: embTweetsUser = [] if (i % 100) == 0: print(i) for tweet in tweetsUser: embTweetUser = np.zeros([50, 300]) #Preprocesso tokList = text_processor.pre_process_doc(tweet) #Rimuovo le stopwords tokList = [w for w in tokList if not w in stop_words] #trovo l'embedding numTok = 0 for token in tokList[0:50]: g_vec = [] is_in_model = False if token in google_300.vocab.keys(): is_in_model = True g_vec = google_300.word_vec(token) elif token == "<number>": is_in_model = True g_vec = google_300.word_vec("número") elif token == "<percent>": is_in_model = True
}, fix_html=True, # fix HTML tokens # corpus from which the word statistics are going to be used # for word segmentation segmenter="twitter", # corpus from which the word statistics are going to be used # for spell correction corrector="twitter", unpack_hashtags=True, # perform word segmentation on hashtags unpack_contractions=True, # Unpack contractions (can't -> can not) spell_correct_elong=False, # spell correction for elongated words # select a tokenizer. You can use SocialTokenizer, or pass your own # the tokenizer, should take as input a string and return a list of tokens tokenizer=SocialTokenizer(lowercase=True).tokenize, # list of dictionaries, for replacing tokens extracted from the text, # with other expressions. You can pass more than one dictionaries. dicts=[emoticons]) sentences = [ "CANT WAIT for the new season of #TwinPeaks \(^o^)/!!! #davidlynch #tvseries :)))", "I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies :/", "@SentimentSymp: can't wait for the Nov 9 #Sentiment talks! YAAAAAAY !!! 😈" ] for s in sentences: print(" ".join(text_processor.pre_process_doc(s)))
) for j in range(2,3): dt = pd.read_csv('id_and_sentences_'+str(float(j))+'.tsv', sep='\t', encoding='utf8', header=None, names=["id", "message"], error_bad_lines=False) id = dt.iloc[:, 0] sentences = dt.iloc[:, 1] examples = [] import re i = 0 for s in sentences: s = s.lower() s = str(" ".join(text_processor.pre_process_doc(s))) s = re.sub(r"[^a-zA-ZÀ-ú</>!?♥♡\s\U00010000-\U0010ffff]", ' ', s) s = re.sub(r"\s+", ' ', s) s = re.sub(r'(\w)\1{2,}', r'\1\1', s) s = re.sub(r'^\s', '', s) s = re.sub(r'\s$', '', s) s = emoji.demojize(s).replace(":", "").replace("_", " ").replace("<", " ").replace(">", " ").replace("/", "") s = re.sub(r"\s+", ' ', s).lstrip() # print(s) examples.append(s) i = i + 1 sentence_embeddings = model.encode(examples) print(sentence_embeddings[0]) print(sentence_embeddings[0].shape)
ans1 = ans1 + " ".join(l1[1:]) ans = ans1[:-4] + " " + " ".join(l2) + " " + ans1[-4:] + '\n' ans = list(ans) ans[-3] = '\t' ans[-5] = '\t' # print(ans) ans = "".join(ans) # print(ans) ##Modified by Keshav print(ans) ans = ans.replace('@ ', '@').replace('# ', '#').replace('<', '').replace( '>', '').replace("_", ' ').replace(' ', ' ') string = ans.split('\t')[1] processed = text_processor.pre_process_doc(string) print(processed) new_ans = [] for token in processed: if token.startswith(("<hash", "<number", "<")) or "user" in token: continue new_ans.append(token) arr = ans.split('\t') arr[1] = re.sub(r"http.*|…", "", " ".join(new_ans)) ans = "\t".join(arr) print(arr[2]) print(ans) # break newfile.write(ans)
# corpus from which the word statistics are going to be used # for word segmentation segmenter="twitter", # corpus from which the word statistics are going to be used # for spell correction corrector="twitter", spell_correction=True, spell_correct_elong=False, # spell correction for elongated words # list of dictionaries, for replacing tokens extracted from the text, # with other expressions. You can pass more than one dictionaries. dicts=[emoticons]) with open("data/" + FILE_NAME, 'r', newline='', encoding='utf-8') as f: reader = csv.reader(f) with open("data/" + FILE_NAME.split(".")[0] + "_normalized.csv", 'w', newline='', encoding='utf-8') as w: writer = csv.writer(w) header = next(reader) header.insert(CONTENT_COLUMN + 1, "Normalized") writer.writerow(header) print("Processing tweets...") for row in reader: content = row[CONTENT_COLUMN] row.insert(CONTENT_COLUMN + 1, " ".join(text_processor.pre_process_doc(content))) writer.writerow(row)
class TexProcessor(object):
    """Configurable tweet normaliser for English ('EN') and Italian ('IT').

    Every family of tokens (emoticons, emoji, hashtags, URLs, mentions,
    punctuation and the "other" entities handled by ekphrasis) has its own
    ``Strategy`` read from ``args``; ``do_preprocess`` applies them in a fixed
    order to a single tweet.
    """

    # Emoticons grouped by the English word they are translated to.  The
    # Italian dictionary uses exactly the same keys with translated values.
    _EMOTICON_GROUPS = (
        ("happy", (
            ":‑)", ": ‑)", ": ‑ )", ":-]", ": - ]", ": -]",
            ":-3", ": - 3", ": -3", ":->", ": - >", ": ->",
            "8-)", "8 -)", "8 - )", ":-}", ": - }", ": -}",
            ":)", ": )", ":]", ": ]", ":3", ": 3", ":>", ": >",
            "8)", "8 )", ":}", ": }",
            ":o)", ":o )", ": o )", ":c)", ": c )", ":c )",
            ":^)", ": ^ )", ": ^)", "=]", "= ]", "=)", "= )",
            ":-))", ": - ) )", ":- ) )", ":- ))", ": -))",
            ":‑D", ": ‑ D", ": ‑D", "8‑D", "8 ‑D", "8 ‑ D",
            "x‑D", "x ‑ D", "x ‑D", "X‑D", "X ‑ D", "X ‑D",
            ":D", ": D", "8D", "8 D", "xD", "x D", "XD", "X D",
        )),
        ("sad", (
            ":‑(", ": ‑(", ": ‑ (", ":‑c", ": ‑c", ":‑<", ": ‑ <",
            ":‑[", ": ‑ [", ":(", ": (", ":c", ": c", ":<", ": <",
            ":[", ": [",
            ":-||", ": - | |", ": - ||", ": -||", ": -| |",
            ">:[", ">: [", "> : [", ":{", ": {", ":@", ": @",
            ">:(", "> : (",
            ":'‑(", ": '‑(", ": ' ‑(", ": ' ‑ (",
            ":'(", ": ' (", ": '(",
        )),
        ("playful", (
            ":‑P", ": ‑P", ": ‑ P", "X‑P", "X ‑ P", "X ‑P",
            "x‑p", "x ‑p", ":‑p", ": ‑p", ": ‑ p",
            ":‑Þ", ": ‑ Þ", ":‑þ", ": ‑þ", ":‑b", ": ‑ b", ": ‑b",
            ":P", ": P", "XP", "X P", "xp", "x p", ":p", ": p",
            ":Þ", ": Þ", ":þ", ": þ", ":b", ": b",
        )),
        ("love", (
            "<3", "< 3", ":*", ": *",
        )),
    )

    # English emoticon meaning -> Italian equivalent.
    _ITALIAN_WORDS = {
        "happy": "felice",
        "sad": "triste",
        "playful": "scherzoso",
        "love": "amore",
    }

    # Compiled once instead of on every tweet.
    _EMOJI_PATTERN = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE)
    _URL_PATTERN = re.compile(r"(\w+:\/\/\S+)")
    _MENTION_PATTERN = re.compile(r"(@[A-Za-z0-9]+)")
    _PUNCTUATION_PATTERN = re.compile(r"[\.\,\!\?\:\;\-\=]")

    def __init__(self, args, lang):
        """Read one strategy per token family from ``args`` and load resources.

        :param args: object exposing ``others``, ``emoji``, ``emoticon``,
            ``url``, ``hashtag``, ``punctuation``, ``mention`` (values
            accepted by ``Strategy``) and ``lower``
        :param lang: ``'EN'`` or ``'IT'``
        """
        self.others = Strategy(args.others)  # valid values: 0, 1, 2
        # 0, 1 -> emoji, 2 -> (emoji), 3, 4, 5 -> (translation)
        self.emoji = Strategy(args.emoji)
        # 0, 1 -> emoticon, 2 -> (emoticon), 3, 4, 5 -> (translation)
        self.emoticon = Strategy(args.emoticon)
        self.url = Strategy(args.url)  # 0, 1, 2, 3
        # 0, 1 -> #hashtag, 2, 3 -> (#hashtag), 4, 5
        self.hashtag = Strategy(args.hashtag)
        self.punctuation = Strategy(args.punctuation)  # valid values: 0, 3
        self.mention = Strategy(args.mention)  # 0, 1, 2, 3
        self.lower = args.lower  # True or False
        self.lang = lang  # 'EN' or 'IT'
        # Emoji -> Italian description table (columns 'emoji', 'text_ita').
        self.ita_moji = pd.read_csv('./data/italianMoji.csv', sep=';')
        if self.lang == 'IT':
            # Italian word-frequency model used to split hashtags.
            self.lm = wordninja.LanguageModel('./data/words.last_all.txt.gz')
        else:
            self.lm = None
        # NOTE(review): the entity list is passed as `remove=`, yet
        # do_preprocess later rewrites '<percent>', '<money>', ... tags, which
        # presumably only appear when the list is passed as `normalize=`.
        # Confirm against the ekphrasis API before changing.
        self.text_processor = TextPreProcessor(
            remove=[
                'email',  # raw or normalize
                'percent',  # raw or normalize: EN percentage, IT percentuale
                'money',  # raw or normalize: EN money, IT soldi (TODO: check currencies are caught)
                'phone',  # raw or normalize: EN phone, IT telefono
                'time',  # raw or normalize: EN time, IT ore
                'date',  # raw or normalize: EN date, IT data
                'number'  # raw or normalize: EN number, IT numero
            ],
            annotate={},
            fix_html=True,
            unpack_hashtags=False,
            tokenizer=SocialTokenizer(lowercase=self.lower).tokenize,
            dicts=[emoticons])

    def load_dict_emoticon(self):
        """Return a fresh emoticon -> meaning dict for the configured language.

        Meanings are English words when ``self.lang == 'EN'`` and their
        Italian translations for any other language value.
        """
        english = self.lang == 'EN'
        return {
            emoticon: (word if english else self._ITALIAN_WORDS[word])
            for word, group in self._EMOTICON_GROUPS
            for emoticon in group
        }

    def do_preprocess(self, tweet):
        """Apply every configured strategy to ``tweet`` and return the result.

        Order: emoticons, emoji, other entities, hashtags, URLs, mentions,
        punctuation, optional lower-casing.
        """
        tweet = self._apply_emoticon_strategy(tweet)
        tweet = self._apply_emoji_strategy(tweet)
        tweet = self._apply_others_strategy(tweet)
        tweet = self._apply_hashtag_strategy(tweet)
        tweet = self._apply_url_strategy(tweet)
        tweet = self._apply_mention_strategy(tweet)
        if self.punctuation == Strategy.REMOVE:
            tweet = ' '.join(self._PUNCTUATION_PATTERN.sub(" ", tweet).split())
        if self.lower == True:  # kept as an equality test on purpose
            return tweet.lower()
        return tweet

    def _apply_emoticon_strategy(self, tweet):
        """Remove, normalise, pack or translate whitespace-separated emoticons."""
        strategy = self.emoticon
        if strategy == Strategy.REMOVE:
            def substitute(word, meanings):
                return " "
        elif strategy == Strategy.NORMALIZE:
            def substitute(word, meanings):
                return "emoticon"
        elif strategy == Strategy.PACK:
            def substitute(word, meanings):
                return "(emoticon)"
        elif strategy == Strategy.TRASLATE:
            def substitute(word, meanings):
                return meanings[word]
        else:
            # Nothing to do: leave the tweet (and its whitespace) untouched.
            return tweet
        meanings = self.load_dict_emoticon()
        return " ".join(
            substitute(word, meanings) if word in meanings else word
            for word in tweet.split())

    def _apply_emoji_strategy(self, tweet):
        """Remove, normalise, pack or translate Unicode emoji."""
        if self.emoji == Strategy.REMOVE:
            return self._EMOJI_PATTERN.sub(r'', tweet)
        if self.emoji == Strategy.NORMALIZE:
            return self._EMOJI_PATTERN.sub(r' emoji ', tweet)
        if self.emoji == Strategy.PACK:
            return self._EMOJI_PATTERN.sub(r' (emoji) ', tweet)
        if self.emoji == Strategy.TRASLATE:
            return self._translate_emoji(tweet, "", "")
        if self.emoji == Strategy.PACK_TRASLATE:
            return self._translate_emoji(tweet, "(", ")")
        return tweet

    def _translate_emoji(self, tweet, opening, closing):
        """Replace each emoji by its description wrapped in opening/closing."""
        if emoji.emoji_count(tweet) == 0:
            return tweet
        if self.lang == 'EN':
            tweet = emoji.demojize(tweet, delimiters=(opening, closing))
            return tweet.replace("_", " ")
        if self.lang == 'IT':
            for found in emoji.emoji_lis(tweet):
                symbol = str(found['emoji'])
                rows = self.ita_moji[self.ita_moji['emoji'] == symbol]
                meanings = rows['text_ita'].values
                if len(meanings) != 0:
                    tweet = tweet.replace(symbol,
                                          opening + meanings[0] + closing)
                else:
                    # No Italian description available: drop the emoji.
                    tweet = tweet.replace(symbol, '')
        return tweet

    def _apply_others_strategy(self, tweet):
        """Run ekphrasis and localise / re-bracket the tags it produces."""
        if self.others == Strategy.NORMALIZE:
            opening, closing = " ", " "
        elif self.others == Strategy.PACK:
            # Fixed: the closing bracket used to be replaced by "(" as well.
            opening, closing = "(", ")"
        else:
            return tweet
        tweet = str(" ".join(self.text_processor.pre_process_doc(tweet)))
        if self.lang == 'EN':
            tweet = tweet.replace('<percent>', '<percentage>')
        if self.lang == 'IT':
            tweet = tweet.replace('<percent>', '<percentuale>')
            tweet = tweet.replace('<money>', '<soldi>')
            tweet = tweet.replace('<time>', '<tempo>')
            tweet = tweet.replace('<date>', '<data>')
            tweet = tweet.replace('<number>', '<numero>')
        return tweet.replace("<", opening).replace(">", closing)

    def _split_hashtag(self, tag):
        """Split a hashtag body into words, or return None for other languages."""
        if self.lang == 'IT':
            return ' '.join(self.lm.split(tag))
        if self.lang == 'EN':
            return ' '.join(wordninja.split(tag))
        return None

    def _apply_hashtag_strategy(self, tweet):
        """Remove, split, normalise or pack the hashtags found in ``tweet``."""
        tags = [
            token.strip("#") for token in tweet.split()
            if token.startswith("#")
        ]
        strategy = self.hashtag
        if strategy == Strategy.REMOVE:
            for tag in tags:
                tweet = tweet.replace("#" + tag, " ")
        elif strategy == Strategy.TRASLATE or strategy == Strategy.PACK_TRASLATE:
            packed = strategy == Strategy.PACK_TRASLATE
            for tag in tags:
                words = self._split_hashtag(tag)
                if words is None:
                    continue
                replacement = '< ' + words + ' >' if packed else words
                tweet = tweet.replace("#" + tag, replacement)
        elif strategy == Strategy.NORMALIZE:
            for tag in tags:
                tweet = tweet.replace("#" + tag, "#hashtag")
        elif strategy == Strategy.PACK:
            for tag in tags:
                tweet = tweet.replace("#" + tag, "(#hashtag)")
        return tweet

    def _apply_url_strategy(self, tweet):
        """Remove, normalise or pack anything of the form scheme://..."""
        if self.url == Strategy.REMOVE:
            replacement = " "
        elif self.url == Strategy.NORMALIZE:
            replacement = " url "
        elif self.url == Strategy.PACK:
            replacement = " (url) "
        else:
            return tweet
        return ' '.join(self._URL_PATTERN.sub(replacement, tweet).split())

    def _apply_mention_strategy(self, tweet):
        """Remove, normalise or pack @mentions."""
        if self.mention == Strategy.NORMALIZE:
            replacement = " @user "
        elif self.mention == Strategy.PACK:
            replacement = " (@user) "
        elif self.mention == Strategy.REMOVE:
            replacement = " "
        else:
            return tweet
        return ' '.join(self._MENTION_PATTERN.sub(replacement, tweet).split())
# -*- coding: utf-8 -*-
"""Tokenise and normalise a text file with ekphrasis, one document per line.

Usage: ``python <script> INPUT`` -- writes the result to ``INPUT.out``.
"""
import sys

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=[
        'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'url',
        'date', 'number'
    ],
    # terms that will be annotated
    annotate={
        'hashtag', 'allcaps', 'elongated', 'repeated', 'emphasis', 'censored'
    },
    fix_html=True,
    segmenter='twitter',
    corrector='twitter',
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[emoticons])

filename_input = sys.argv[1]
filename_output = filename_input + '.out'

# Explicit UTF-8 so the result does not depend on the platform locale.
with open(filename_input, 'r', encoding='utf-8') as fin, \
        open(filename_output, 'w', encoding='utf-8') as fout:
    for line in fin:
        # Fixed: this used to pass the undefined name `s` (NameError).
        tokens = text_processor.pre_process_doc(line.strip())
        fout.write(' '.join(tokens) + '\n')
class Pem:
    '''
    Politeness Estimator for Microblogs.

    Typical use: ``Pem().load(path).tokenize().vectorize().predict()``.
    ``load``, ``tokenize`` and ``vectorize`` return ``self`` so they chain.

    Typing information was done via:

    ```shell
    monkeytype run __init__.py
    monkeytype apply pem
    ```
    '''
    # Scores whose absolute value is below this are treated as neutral.
    threshold = 0.5
    # Switched on per instance by __init__ when the matching path is given.
    use_liwc = False
    use_cntVec = False

    def __init__(
            self,
            liwc_path: str = '',
            emolex_path: str = 'english_emolex.csv',
            estimator_path: str = 'english_twitter_politeness_estimator.joblib',
            feature_defn_path: str = 'english_twitter_additional_features.pickle',
            countVectorizer_path: str = '') -> None:
        '''
        Load the lexicons, the tokenizer and the trained classifier.

        :param liwc_path: optional CSV with ``term`` and ``category`` columns;
            terms ending in ``*`` are treated as prefixes
        :param emolex_path: CSV of the EmoLex dictionary (terms as index)
        :param estimator_path: joblib file holding the classifier
        :param feature_defn_path: pickle holding a Series of compiled
            patterns and token sets
        :param countVectorizer_path: optional joblib file holding a fitted
            count vectorizer
        '''
        # Preload LIWC dictionary:
        if liwc_path:
            liwc_df = pd.read_csv(liwc_path)
            liwc_df['*'] = liwc_df['term'].str.endswith('*')
            liwc_df['t'] = liwc_df['term'].str.rstrip('*')
            self.liwc_prefx = liwc_df[liwc_df['*']].groupby(
                'category')['t'].apply(set)
            self.liwc_whole = liwc_df[~liwc_df['*']].groupby(
                'category')['t'].apply(set)
            self.use_liwc = True

        # Preload EmoLex dictionary:
        emolex_df = pd.read_csv(emolex_path, index_col=0)
        self.emolex = emolex_df.apply(lambda s: set(s[s == 1].index))

        # Preload additional feature rules:
        # NOTE(review): read_pickle executes pickled code -- only load
        # feature files from a trusted source.
        pltlex = pd.read_pickle(feature_defn_path)
        types = pltlex.apply(type)
        self.pltlex_ptn = pltlex[types == re.Pattern].to_dict()
        self.pltlex_set = pltlex[types == set].to_dict()

        # Initialize Tokenizer:
        self.text_processor = TextPreProcessor(
            # terms that will be normalized:
            normalize=[
                'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
                'url', 'date', 'number'
            ],
            # terms that will be annotated:
            annotate={
                "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
                'censored'
            },
            # perform word segmentation on hashtags:
            unpack_hashtags=False,
            # Unpack contractions (can't -> can not):
            unpack_contractions=True,
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
        )

        # preload classifier:
        self.clf = joblib.load(estimator_path)
        if countVectorizer_path:
            self.counter = joblib.load(countVectorizer_path)
            self.use_cntVec = True

    def load(self, filepath: str = 'tweets.csv'):
        '''Read the tweets CSV (needs a ``text`` column) into ``self.df``.'''
        self.df = pd.read_csv(filepath)
        return self

    def _tokenizeString(self, s: str) -> List[str]:
        '''
        _tokenizeString tokenizes a string.
        Interestingly, it is faster to put this call into a separate method like this.
        '''
        return self.text_processor.pre_process_doc(s)

    def tokenize(self):
        '''Add ``token`` (token list) and ``token_cnts`` (Counter) columns.'''
        self.df['token'] = self.df['text'].apply(self._tokenizeString)
        self.df['token_cnts'] = self.df['token'].apply(Counter)
        return self

    def vectorizeByLiwc(self, cnts: dict, liwc_whole: dict,
                        liwc_prefx: dict) -> Series:
        '''Vectorize by LIWC: whole-word hits plus prefix hits per category.'''
        result = self.countAcrossDicts(cnts, liwc_whole)
        for category, tokens in liwc_prefx.items():
            prefix_hits = 0
            for j, n_appearance in cnts.items():
                n_prefixes = sum(map(j.startswith, tokens))
                prefix_hits += n_appearance * n_prefixes
            # A category may have prefix terms only, so it is not guaranteed
            # to be present in the whole-word result (used to be a KeyError).
            result[category] = result.get(category, 0) + prefix_hits
        return pd.Series(result)

    def vectorizeByEmolex(self, cnts: dict, lex: dict) -> Series:
        '''Vectorize by EmoLex'''
        result = self.countAcrossDicts(cnts, lex)
        return pd.Series(result)

    def vectorizeByPoliteLex(self, r: Series, patterns: dict,
                             sets: dict) -> Series:
        '''Vectorize by PoliteLex: token-set counts plus regex match counts.'''
        result = self.countAcrossDicts(r['token_cnts'], sets)
        text = r['text']
        for feature_name, pattern in patterns.items():
            # Slightly faster than `sum(1 for m in pattern.finditer(text))`.
            result[feature_name] = len(pattern.findall(text))
        return pd.Series(result)

    @staticmethod
    def countAcrossDicts(cnts: dict, sets: dict) -> dict:
        '''For each feature, sum the counts of its tokens that occur in cnts.'''
        result = {}
        # This native-Python implementation is faster than DataFrame multiplication.
        for feature_name, tokens in sets.items():
            tokens_seen = tokens.intersection(cnts)
            result[feature_name] = sum(cnts[token] for token in tokens_seen)
        return result

    def vectorize(self, debug=True):
        '''
        This function extracts features from the provided texts.
        It requires that `self.df` is already prepared.
        It writes the prepared features to `self.X`.

        With ``debug`` set, the intermediate feature tables are also kept as
        attributes on the instance.
        '''
        if self.use_liwc:
            liwc_cnts_df = self.df['token_cnts'].apply(
                self.vectorizeByLiwc,
                liwc_whole=self.liwc_whole,
                liwc_prefx=self.liwc_prefx)
        emolex_cnts_df = self.df['token_cnts'].apply(self.vectorizeByEmolex,
                                                     lex=self.emolex)
        politelex_cnts_df = self.df.apply(self.vectorizeByPoliteLex,
                                          patterns=self.pltlex_ptn,
                                          sets=self.pltlex_set,
                                          axis=1)
        if self.use_cntVec:
            # Unigrams:
            space_separated_texts = self.df['token'].apply(' '.join)
            unigram_matrix = self.counter.transform(space_separated_texts)
            # pandas.concat only accepts Series/DataFrame objects, so the
            # dense matrix is wrapped and aligned on the tweets' index.
            unigram_df = pd.DataFrame(unigram_matrix.todense(),
                                      index=self.df.index)

        if debug:
            if self.use_liwc:
                self.liwc_cnts_df = liwc_cnts_df
            self.emolex_cnts_df = emolex_cnts_df.astype(int)
            self.politelex_cnts_df = politelex_cnts_df
            if self.use_cntVec:
                self.space_separated_texts = space_separated_texts
                self.unigram_df = unigram_df

        # Combine all feature sets into one table:
        all_feats = [
            emolex_cnts_df,
            politelex_cnts_df,
        ]
        if self.use_liwc:
            all_feats.insert(0, liwc_cnts_df)
        if self.use_cntVec:
            all_feats.append(unigram_df)
        self.X = concat(all_feats, axis=1)
        return self

    def predict(self) -> Series:
        '''Return a 'Rude' / 'Neutral' / 'Polite' label for every row of X.'''
        def scoreToLabel(score):
            if score < -self.threshold:
                return 'Rude'
            if score > self.threshold:
                return 'Polite'
            return 'Neutral'

        scores = self.predict_proba()
        labels = scores.apply(scoreToLabel).rename('label')
        return labels

    def predict_proba(self) -> Series:
        '''Return P(class 1) - P(class 0) per row, zeroed inside the threshold.'''
        probs = self.clf.predict_proba(self.X)
        probs_df = pd.DataFrame(probs)
        scores = probs_df.loc[:, 1] - probs_df.loc[:, 0]
        # Zero out scores that is too insignificant:
        scores = scores.apply(lambda x: 0
                              if -self.threshold < x < self.threshold else x)
        return scores.rename('score')
class SentClean:
    """Clean sentences / tweets extracted from papers and social media.

    Cleaning is driven by a ``prep`` settings dict; ``prep_default`` lists
    every supported key together with its default value.
    """

    prep_default = {'spell': False,
                    'remove_sequences': False,
                    'lowercase': False,
                    'punctuations': [],
                    'excluding_criteria': ['copyright', 'copyright', 'medrxiv', 'appendix'],
                    'starting_keywords_to_remove': [
                        'method', 'results', 'result', 'conclusion', 'conclusions',
                        'evaluation', 'evaluations', 'objectives', 'objective',
                        'cc - by international license', 'doi']
                    }

    def __init__(self, prep=None):
        """
        Constructor of clean functions over extracted texts/tweets
        :param prep: parameter settings of the text-preprocessor; keys that
            are missing fall back to ``prep_default``.  The dict passed in is
            copied, never modified.
        """
        # Merge into a new dict so neither the caller's dict nor the shared
        # class-level defaults are mutated.
        self.prep = {**self.prep_default, **(prep or {})}
        prep = self.prep

        # Emoticons and their textual replacements, stripped from the output.
        self.omit = list(emoticons.keys()) + list(emoticons.values())
        self.text_processor = TextPreProcessor(
            fix_html=True,
            normalize=[],
            segmenter='twitter',
            corrector='twitter',
            fix_text=True,
            unpack_hashtags=True,  # perform word segmentation on hashtags
            unpack_contractions=prep['spell'],  # Unpack contractions (can't -> can not)
            spell_correction=prep['spell'],
            spell_correct_elong=prep['spell'],
            tokenizer=SocialTokenizer(lowercase=prep['lowercase']).tokenize,
            dicts=[{}],
            omit=list(self.omit),
        )
        self.nlp = spacy.load("en_core_web_sm")
        self.nlp_sent = English()  # just the language with no model
        sentencizer = self.nlp_sent.create_pipe("sentencizer")
        self.nlp_sent.add_pipe(sentencizer)

    def clean_tweet(self, text):
        """Clean a single text; return the list of kept texts or None if none."""
        text_list = self.clean_sentences([text])
        if not text_list:
            return None
        return text_list

    def pattern_repl(self, matchobj):
        """
        Return a replacement string to be used for match object:
        as many spaces as the matched text is long.
        """
        return ' '.rjust(len(matchobj.group(0)))

    def _strip_leading_keyword(self, text):
        """Drop a leading '<keyword> :' section label such as 'results :'."""
        tokens = text.split(' ')
        # Guard the length first: a one-token text has no tokens[1].
        if (len(tokens) > 1
                and tokens[0] in self.prep['starting_keywords_to_remove']
                and tokens[1] == ':'):
            return ' '.join(tokens[2:])
        return text

    def clean_sentences(self, sentences, min_len_sent=5, max_num_punctuations=10):
        """
        function to clean list of sentences
        :param sentences: list (str) of the sentences
        :param min_len_sent: int, a sentence is kept only if it has more than
            this many whitespace-separated tokens
        :param max_num_punctuations: int, a sentence is dropped when it
            contains this many or more of the configured punctuation marks
        :return: cleaned sentences
        """
        # remove non english sentences
        en_sentences = []
        for s in sentences:
            try:
                if detect(s) == 'en':
                    en_sentences.append(s)
            except Exception:
                # Language detection fails on empty / symbol-only text;
                # such sentences are skipped on purpose.
                continue

        # remove every non-ASCII character (e.g. chinese characters)
        sentences = [re.sub(r"([^\x00-\x7F])+", " ", text) for text in en_sentences]
        # restrict to ascii characters
        sentences = [s.encode('ascii', errors='ignore').decode() for s in sentences]

        # trim by length and by number of punctuation marks
        punctuations = self.prep['punctuations']
        kept = [
            s for s in sentences
            if len(s.split()) > min_len_sent
            and sum(s.count(p_) for p_ in punctuations) < max_num_punctuations
        ]
        # remove duplicates while keeping first-seen order (deterministic)
        trim_sentences = list(dict.fromkeys(kept))

        # extra sentence wise pre processing steps
        new_text_list = []
        for sent_ in trim_sentences:
            # space correction on urls
            text = sent_.replace('http: /', 'https:/')
            text = text.replace('https: /', 'https:/')

            text = p.clean(text)
            text = clean(text,
                         fix_unicode=True,  # fix various unicode errors
                         to_ascii=True,  # transliterate to closest ASCII representation
                         lower=True,  # lowercase text
                         no_line_breaks=True,  # fully strip line breaks as opposed to only normalizing them
                         no_urls=True,  # replace all URLs with a special token
                         no_emails=True,  # replace all email addresses with a special token
                         no_phone_numbers=True,  # replace all phone numbers with a special token
                         no_numbers=False,  # replace all numbers with a special token
                         no_digits=False,  # replace all digits with a special token
                         no_currency_symbols=False,  # replace all currency symbols with a special token
                         no_punct=False,  # fully remove punctuation
                         replace_with_url=" ",
                         replace_with_email=" ",
                         replace_with_phone_number=" ",
                         replace_with_number=" ",
                         replace_with_digit=" ",
                         replace_with_currency_symbol=" ",
                         lang="en"  # set to 'de' for German special handling
                         )

            # remove citations such as [1, 2], [3-5], (1, 2), (3-5)
            text = re.sub(r'\[(\s*\d*,*\s*)+\]', '', text)
            text = re.sub(r'\[(\s*\d*-*\s*)+\]', '', text)
            text = re.sub(r'\((\s*\d*,*\s*)+\)', '', text)
            text = re.sub(r'\((\s*\d*-*\s*)+\)', '', text)

            # replace [**Patterns**] with spaces.
            text = re.sub(r'\[\*\*.*?\*\*\]', self.pattern_repl, text)

            # remove hashtag symbol and unpack it
            text = " ".join(self.text_processor.pre_process_doc(text))

            # remove emoticons
            for item in self.omit:
                text = text.replace(item, ' ')

            # remove non-word character-repetitions
            text = re.sub(r'(\W)\1+', r'\1', text)

            if self.prep['remove_sequences']:
                # remove sequences like 'A p p e n d i x'
                text = re.sub(r'(\S\s){3,}', '', text)

            for p_ in punctuations:
                # pad every configured punctuation mark with spaces
                text = text.replace(p_, ' ' + p_ + ' ')

            if self.prep['spell']:
                # spell correction
                text = " ".join(spell_corrector.correct(w) for w in social_tokenizer(text))

            # remove duplicated whitespaces
            text = squeezeWhitespace(text)

            text = self._strip_leading_keyword(text)

            # exclude sentences including keywords like 'copyright'
            if any(ex_key in text for ex_key in self.prep['excluding_criteria']):
                continue

            # keep the text only if it is long enough and contains a verb
            if len(text.split(' ')) > min_len_sent:
                doc = self.nlp(text)
                if any(token.pos_ == "VERB" for token in doc):
                    new_text_list.append(text)

        return new_text_list
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=[
        'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'date',
        'number'
    ],
    # terms that will be annotated
    annotate={
        'hashtag', 'allcaps', 'elongated', 'repeated', 'emphasis', 'censored'
    },
    fix_html=True,
    segmenter='twitter',
    corrector='twitter',
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[emoticons])

# Each corpus file is tokenised line by line into a sibling "<name>.out" file.
for filename_input in [
        '1/train.text', '1/test.text', '2/train.text', '2/test.text',
        '3/train.text', '3/test.text'
]:
    filename_output = filename_input + '.out'
    # Explicit UTF-8 so the result does not depend on the platform locale.
    with open(filename_input, 'r', encoding='utf-8') as fin, \
            open(filename_output, 'w', encoding='utf-8') as fout:
        for line in fin:
            tokens = text_processor.pre_process_doc(line.strip())
            fout.write(' '.join(tokens) + '\n')
class PreprocessingText:
    """German tweet preprocessing: ekphrasis tokenising, stop words, lemmas."""

    # Negation words that are in the NLTK German stop list but must be kept.
    ALLOWED_STOPWORDS = (
        'kein', 'keine', 'keinem', 'keinen', 'keiner', 'keines', 'nicht',
        'nichts'
    )
    # Extra filler words removed on top of the NLTK list.
    CUSTOM_STOPWORDS = (
        'rt', 'mal', 'heute', 'gerade', 'erst', 'macht', 'eigentlich',
        'warum', 'gibt', 'gar', 'immer', 'schon', 'beim', 'ganz', 'dass',
        'wer', 'mehr', 'gleich', 'wohl'
    )
    # Placeholder tokens produced by the ekphrasis `normalize` option.
    # Fixed: '<money>' used to be misspelt 'money>' and was never removed.
    NORMALIZED_TOKENS = (
        '<url>', '<email>', '<percent>', '<money>', '<phone>', '<user>',
        '<time>', '<date>', '<number>'
    )

    def __init__(self, text, **kwargs):
        """
        :param text: a single string or a list of strings to preprocess
        :param kwargs: accepted for compatibility; currently unused
        """
        self.text = text
        self.text_processor = TextPreProcessor(
            # terms that will be normalized, e.g. user@example.com to <email>
            normalize=[
                'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
                'date', 'number'
            ],
            # terms that will be annotated e.g. <hashtag>#test</hashtag>
            annotate={
                "hashtag", "allcaps", "elongated", "repeated", 'emphasis'
            },
            fix_html=True,  # fix HTML tokens
            unpack_hashtags=True,  # perform word segmentation on hashtags
            # select a tokenizer. You can use SocialTokenizer, or pass your own if not text tokenized on whitespace
            # the tokenizer, should take as input a string and return a list of tokens
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            # list of dictionaries, for replacing tokens extracted from the text,
            # with other expressions. You can pass more than one dictionaries.
            dicts=[emoticons])

    def remove_stopwords(self, data):
        """Remove German stop words, filler words and placeholder tokens.

        :param data: list of token lists, or one whitespace-separated string
        :return: list of filtered token lists (list input), list of kept
            words (string input), or an empty list for any other type
        """
        negations = set(self.ALLOWED_STOPWORDS)
        # A set gives O(1) membership tests; filtering (rather than
        # list.remove) also tolerates a negation missing from the NLTK list.
        stop_words = {
            word for word in stopwords.words('german')
            if word not in negations
        }
        stop_words.update(self.CUSTOM_STOPWORDS, self.NORMALIZED_TOKENS)

        if isinstance(data, list):
            return [[word for word in doc if word not in stop_words]
                    for doc in data]
        if isinstance(data, str):
            return [word for word in data.split() if word not in stop_words]
        return []

    def lemmatize_words(self, data):
        """Lemmatize token lists (list input) or one string.

        :return: list with one list of lemmas per input document
        """
        _lemmatizer = PatternParserLemmatizer()
        lemmatized_data = []
        if isinstance(data, list):
            for doc in data:
                # Every word is prefixed with a space, as before.
                text = "".join(" " + word for word in doc)
                lemmas = _lemmatizer.lemmatize(text)
                lemmatized_data.append([pair[0] for pair in lemmas])
        if isinstance(data, str):
            lemmas = _lemmatizer.lemmatize(data)
            lemmatized_data.append([pair[0] for pair in lemmas])
        return lemmatized_data

    def ekphrasis_preprocessing(self):
        """Tokenise ``self.text`` with ekphrasis.

        :return: list with one token list per document
        """
        X_clean = []
        if isinstance(self.text, str):
            X_clean.append(self.text_processor.pre_process_doc(self.text))
        if isinstance(self.text, list):
            for row in tqdm(self.text):
                X_clean.append(self.text_processor.pre_process_doc(row))
        return X_clean
except: None # deal with contractions that the tool misses tweet = re.sub( r"(\b)([Ww]hat|[Ii]t|[Hh]e|[Ss]he|[Tt]hat|[Tt]here|[Hh]ow|[Ww]ho|[Hh]ere|[Ww]here|[Ww]hen)'s", r"\1\2 is", tweet) tweet = re.sub(r"(\b)([Aa]in)'t", r"is not", tweet) tweet = re.sub(r"(\b)([Ww]asn)'t", r"was not", tweet) tweet = re.sub(r"(\b)([Hh]e|[Ss]he|[Ii]|[Yy]ou|[Tt]hey|[Ww]e)'d", r"\1\2 would", tweet) tweet = re.sub(r"(\b)([Ii]t|[Tt]hat|[Tt]his)'ll", r"\1\2 will", tweet) tweet = re.sub(r"(\b)([Cc])'mon", r"come on", tweet) # process the rest of the tweet with the nltk tweet tokenizer tweet = " ".join(text_processor.pre_process_doc(tweet)).lower() clean_tweets.append(tweet) # below is code to create the tsv file of cleaned tweets index = 0 with open('task1_training_cleaned.tsv', mode='w') as tsvfile: tsvwriter = csv.writer(tsvfile, delimiter='\t') index = 0 for tweet in clean_tweets: tsvwriter.writerow([target[index], tweet]) index += 1 tsvfile.close()
class Preprocess:
    """Tokenise three-turn conversations and map emotion labels to ids."""

    def __init__(self):
        """Set up the label maps, extra emoticon rules and the tokenizer."""
        self.label2emotion = {0: "others", 1: "happy", 2: "sad", 3: "angry"}
        # Reverse lookup, derived from the map above.
        self.emotion2label = {
            emotion: label for label, emotion in self.label2emotion.items()
        }
        # Emoticons (and one misspelling) not covered by the ekphrasis dict.
        extra_rules = {}
        extra_rules['(^・^)'] = '<happy>'
        extra_rules[':‑c'] = '<sad>'
        extra_rules['=‑d'] = '<happy>'
        extra_rules[":'‑)"] = '<happy>'
        extra_rules[':‑d'] = '<laugh>'
        extra_rules[':‑('] = '<sad>'
        extra_rules[';‑)'] = '<happy>'
        extra_rules[':‑)'] = '<happy>'
        extra_rules[':\\/'] = '<sad>'
        extra_rules['d=<'] = '<annoyed>'
        extra_rules[':‑/'] = '<annoyed>'
        extra_rules[';‑]'] = '<happy>'
        extra_rules['(^�^)'] = '<happy>'
        extra_rules['angru'] = 'angry'
        extra_rules["d‑':"] = '<annoyed>'
        extra_rules[":'‑("] = '<sad>'
        extra_rules[":‑["] = '<annoyed>'
        extra_rules['(�?�)'] = '<happy>'
        extra_rules['x‑d'] = '<laugh>'
        self.emoticons_additional = extra_rules

        # terms that will be normalized
        normalized_terms = [
            'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
            'url', 'date', 'number'
        ]
        # terms that will be annotated
        annotated_terms = {
            "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
            'censored'
        }
        self.text_processor = TextPreProcessor(
            normalize=normalized_terms,
            annotate=annotated_terms,
            fix_html=True,  # fix HTML tokens
            # corpora whose word statistics drive segmentation / correction
            segmenter="twitter",
            corrector="twitter",
            unpack_hashtags=True,  # perform word segmentation on hashtags
            unpack_contractions=True,  # Unpack contractions (can't -> can not)
            spell_correct_elong=True,  # spell correction for elongated words
            # takes a string and returns a list of tokens
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            # dictionaries used to replace tokens with other expressions
            dicts=[emoticons, self.emoticons_additional])

    def tokenize(self, text):
        """Return ``text`` preprocessed and re-joined with single spaces."""
        return " ".join(self.text_processor.pre_process_doc(text))

    def preprocessData(self, dataFilePath, mode):
        """Read a tab-separated conversation file.

        The first line is a header.  In every other line, columns 1-3 hold
        the three turns and, in "train" mode, column 4 holds the emotion.

        :return: ``(conversations, labels)`` arrays in "train" mode, only
            the conversations array otherwise
        """
        training = mode == "train"
        conversations, labels = [], []
        with io.open(dataFilePath, encoding="utf8") as finput:
            finput.readline()  # skip the header
            for raw_line in finput:
                fields = raw_line.strip().split('\t')
                turns = [self.tokenize(turn) for turn in fields[1:4]]
                if len(turns) < 3:
                    # Same failure as indexing a missing column directly.
                    raise IndexError('list index out of range')
                if training:
                    labels.append(self.emotion2label[fields[4]])
                conversations.append(turns)
        if training:
            return np.array(conversations), np.array(labels)
        return np.array(conversations)
def clean_tweets(df):
    """Normalise the ``tweet_text`` column of ``df`` in place and return it.

    Per tweet: usernames, all-caps runs and percentages are tagged by hand,
    contractions that ekphrasis misses are expanded, then the tweet is
    tokenised with ekphrasis, joined with spaces and lower-cased.

    :param df: DataFrame with a ``tweet_text`` column of strings
    :return: the same DataFrame with ``tweet_text`` replaced
    """
    # define the text preprocessor
    text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=['url', 'email', 'money', 'phone', 'time', 'date'],
        # terms that will be annotated
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
            'censored'
        },
        fix_html=True,  # fix HTML tokens
        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",
        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # Unpack contractions (can't -> can not)
        spell_correct_elong=False,  # spell correction for elongated words
        # the tokenizer should take a string and return a list of tokens
        tokenizer=TweetTokenizer().tokenize,
        # list of dictionaries, for replacing tokens extracted from the text,
        # with other expressions. You can pass more than one dictionaries.
        dicts=[emoticons])

    seg = Segmenter(corpus="twitter")

    cleaned = []
    for tweet in df.tweet_text.to_list():
        tweet = _tag_usernames(tweet, seg)
        tweet = _tag_allcaps(tweet)
        tweet = _tag_percentages(tweet)
        tweet = _expand_missed_contractions(tweet)
        # process the rest of the tweet with the nltk tweet tokenizer
        tweet = " ".join(text_processor.pre_process_doc(tweet)).lower()
        cleaned.append(tweet)

    df['tweet_text'] = cleaned
    return df


_MENTION_RE = re.compile(r'@\w+')
_ALLCAPS_RE = re.compile(r"(?<![#@$])\b([A-Z][A-Z ,.']*[A-Z])\b")
# The decimal point is escaped; an unescaped "." matched any character.
_PERCENT_RE = re.compile(r"(\d+(?:\.\d+)?)%")
# Contractions the ekphrasis unpacking step misses: (pattern, replacement).
_CONTRACTION_RULES = tuple(
    (re.compile(pattern), replacement) for pattern, replacement in (
        (r"(\b)([Ww]hat|[Ii]t|[Hh]e|[Ss]he|[Tt]hat|[Tt]here|[Hh]ow|[Ww]ho|[Hh]ere|[Ww]here|[Ww]hen)'s",
         r"\1\2 is"),
        (r"(\b)([Aa]in)'t", r"is not"),
        (r"(\b)([Ww]asn)'t", r"was not"),
        (r"(\b)([Hh]e|[Ss]he|[Ii]|[Yy]ou|[Tt]hey|[Ww]e)'d", r"\1\2 would"),
        (r"(\b)([Ii]t|[Tt]hat|[Tt]his)'ll", r"\1\2 will"),
        (r"(\b)([Cc])'mon", r"come on"),
    ))


def _tag_usernames(tweet, seg):
    """Rewrite @DoctorChristian as '<user> doctor christian </user>'."""
    def _tag(match):
        mention = match.group(0)
        try:
            return '<user> ' + seg.segment(mention[1:]) + ' </user>'
        except Exception:
            # Best effort: a handle that cannot be segmented stays as it is.
            return mention

    # Substituting at the match position keeps '@bob' from also rewriting
    # the start of '@bobby', which plain str.replace did.
    return _MENTION_RE.sub(_tag, tweet)


def _tag_allcaps(tweet):
    """Wrap all-caps runs in <allcaps> tags so contraction unpacking works."""
    return _ALLCAPS_RE.sub(
        lambda match: '<allcaps> ' + match.group(1).lower() + ' </allcaps>',
        tweet)


def _tag_percentages(tweet):
    """Rewrite '12.5%' as '<percent> 12.5 </percent>'."""
    return _PERCENT_RE.sub(
        lambda match: '<percent> ' + match.group(1) + ' </percent>', tweet)


def _expand_missed_contractions(tweet):
    """Expand contractions such as "it's", "ain't", "we'd" and "c'mon"."""
    for pattern, replacement in _CONTRACTION_RULES:
        tweet = pattern.sub(replacement, tweet)
    return tweet