def build_vocab(dataset):
    vocabulary_set = set()
    text_processor = TextPreProcessor(
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'date', 'number'],
        annotate={"hashtag", "allcaps", "elongated", "repeated",
                  'emphasis', 'censored'},
        fix_html=True,
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons])

    for text_tensor, _ in dataset:
        text = str(text_tensor.numpy()[1], 'utf-8')
        some_tokens = text_processor.pre_process_doc(text)
        vocabulary_set.update(some_tokens)
    return vocabulary_set

def __init__(self):
    # Define a text pre-processing pipeline.
    # You can easily define a preprocessing pipeline by using the TextPreProcessor.
    self.text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'date', 'number'],
        # terms that will be annotated
        annotate={"hashtag", "allcaps", "elongated", "repeated",
                  'emphasis', 'censored'},
        fix_html=True,  # fix HTML tokens

        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="english",
        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="english",

        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # unpack contractions (can't -> can not)
        spell_correct_elong=False,  # spell correction for elongated words

        # select a tokenizer. You can use SocialTokenizer, or pass your own;
        # the tokenizer should take a string as input and return a list of tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,

        # list of dictionaries for replacing tokens extracted from the text
        # with other expressions. You can pass more than one dictionary.
        dicts=[emoticons])

def __init__(self, args, lang):
    self.others = Strategy(args.others)  # valid values 0,1,2 // done
    self.emoji = Strategy(args.emoji)  # 0,1 emoji, 2 (emoji), 3, 4, 5 (translation)
    self.emoticon = Strategy(args.emoticon)  # 0,1 emoticon, 2 (emoticon), 3, 4, 5 (translation)
    self.url = Strategy(args.url)  # 0,1,2,3
    self.hashtag = Strategy(args.hashtag)  # 0,1 = #hashtag, 2, 3 (#hashtag), 4, 5
    self.punctuation = Strategy(args.punctuation)  # valid values 0,3
    self.mention = Strategy(args.mention)  # 0,1,2,3
    self.lower = args.lower  # True or False
    self.lang = lang  # 'EN' or 'IT'
    self.ita_moji = pd.read_csv('./data/italianMoji.csv', sep=';')
    if self.lang == 'IT':
        self.lm = wordninja.LanguageModel('./data/words.last_all.txt.gz')
    else:
        self.lm = None

    self.text_processor = TextPreProcessor(
        remove=[
            'email',    # raw or normalize
            'percent',  # raw or normalize: EN: percentage, IT: percentuale
            'money',    # raw or normalize: EN: money, IT: soldi. Check whether it catches currencies
            'phone',    # raw or normalize: EN: phone, IT: telefono
            'time',     # raw or normalize: EN: time, IT: ore
            'date',     # raw or normalize: EN: date, IT: data
            'number'    # raw or normalize: EN: number, IT: numero
        ],
        annotate={},
        fix_html=True,
        unpack_hashtags=False,
        tokenizer=SocialTokenizer(lowercase=self.lower).tokenize,
        dicts=[emoticons])

def __init__(self, prep=prep_default):
    """
    Constructor of clean functions over extracted texts/tweets

    :param prep: parameter settings of the text pre-processor
    """
    # check existence of the keys within the prep dict, which needs to be a list
    for k in self.prep_default.keys():
        if k not in prep.keys():
            prep[k] = self.prep_default[k]
    self.prep = prep
    self.omit = list(emoticons.keys()) + list(emoticons.values())
    self.text_processor = TextPreProcessor(
        fix_html=True,
        normalize=[],
        segmenter='twitter',
        corrector='twitter',
        fix_text=True,
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=prep['spell'],  # unpack contractions (can't -> can not)
        spell_correction=prep['spell'],
        spell_correct_elong=prep['spell'],
        tokenizer=SocialTokenizer(lowercase=prep['lowercase']).tokenize,
        dicts=[{}],
        omit=list(emoticons.keys()) + list(emoticons.values()),
    )
    self.nlp = spacy.load("en_core_web_sm")
    self.nlp_sent = English()  # just the language with no model
    sentencizer = self.nlp_sent.create_pipe("sentencizer")
    self.nlp_sent.add_pipe(sentencizer)

def build_vocab_list(dataframe):
    vocab_set = set()
    sentences = []
    text_processor = TextPreProcessor(
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'date', 'number'],
        annotate={"hashtag", "allcaps", "elongated", "repeated",
                  'emphasis', 'censored'},
        fix_html=True,
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons])

    for index in range(dataframe.shape[0]):
        tweet = dataframe["tweet"][index]
        tok = text_processor.pre_process_doc(tweet)
        sentences.append(" ".join(tok))
        vocab_set.update(tok)

    df_sentences = pd.DataFrame(sentences, columns=['content'])
    return vocab_set, df_sentences

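# Hedged usage sketch (not part of the original source): assumes pandas is imported
# as pd and that the frame has a "tweet" column, as build_vocab_list above expects.
# The sample tweets below are made up purely for illustration.
sample_df = pd.DataFrame({"tweet": ["Check this out https://example.com #SoCool",
                                    "I am sooooo haaaappy today!!! :)"]})
vocab, clean_df = build_vocab_list(sample_df)
print(len(vocab))
print(clean_df.head())
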
def preprocess_data(genuine_filepath, bot_filepath):
    """ Preprocess data and normalize tweets. """
    # Open the csv file and get the tweet part of the csv.
    # Strip out newlines and quotes around the text.
    with codecs.open(bot_filepath, 'r', encoding='utf-8',
                     errors='ignore') as bots_file:
        bot_sentences = [
            x.split(',')[1].strip('\n').strip('"').lower()
            if len(x.split(',')) > 1 else ''
            for x in bots_file.readlines()
        ]
        bot_sentences = bot_sentences[1:]

    with codecs.open(genuine_filepath, 'r', encoding='utf-8',
                     errors='ignore') as genuine_file:
        genuine_sentences = [
            x.split(',')[1].strip('\n').strip('"').lower()
            if len(x.split(',')) > 1 else ''
            for x in genuine_file.readlines()
        ]
        genuine_sentences = genuine_sentences[1:]

    text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'date', 'number'],
        # terms that will be annotated
        annotate={"hashtag", "allcaps", "elongated", "repeated",
                  'emphasis', 'censored'},
        fix_html=True,  # fix HTML tokens
        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",
        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # unpack contractions (can't -> can not)
        spell_correct_elong=False,  # spell correction for elongated words
        # select a tokenizer. You can use SocialTokenizer, or pass your own;
        # the tokenizer should take a string as input and return a list of tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        # list of dictionaries for replacing tokens extracted from the text
        # with other expressions. You can pass more than one dictionary.
        dicts=[emoticons])

    bot_sentences = [text_processor.pre_process_doc(s) for s in bot_sentences]
    genuine_sentences = [
        text_processor.pre_process_doc(s) for s in genuine_sentences
    ]
    return genuine_sentences, bot_sentences

def preprocess_dataset(tweets, y):
    """Uses the ekphrasis API to preprocess the tweets
    (tokenizing and other preprocessing; removes emojis)."""
    text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'date', 'number'],
        fix_html=True,  # fix HTML tokens
        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",
        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # unpack contractions (can't -> can not)
        spell_correct_elong=False,  # spell correction for elongated words
        spell_correction=False,
        # select a tokenizer. You can use SocialTokenizer, or pass your own;
        # the tokenizer should take a string as input and return a list of tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        # list of dictionaries for replacing tokens extracted from the text
        # with other expressions. You can pass more than one dictionary.
        dicts=[emoticons])

    ynew = []
    filter_tweets = []
    for t in range(0, len(tweets)):
        tokens = text_processor.pre_process_doc(tweets[t])
        newtokens = []
        i = 0
        while i < len(tokens):
            try:
                if tokens[i] == "pic" and tokens[i + 1] == "." and tokens[i + 2] == "twitter":
                    # drop everything from a pic.twitter.com link onwards
                    break
                elif tokens[i] in ["<url>", "<email>", "<user>", "<money>",
                                   "<percent>", "<phone>", "<time>",
                                   "<date>", "<number>"]:
                    i += 1
                    continue
                elif tokens[i] == "<" and tokens[i + 1] == "emoji":
                    # skip over an <emoji ...> annotation
                    while tokens[i] != ">":
                        i += 1
                    i += 1
                else:
                    newtokens.append(tokens[i])
                    i += 1
            except IndexError:  # ran past the end of the token list
                break
        if len(newtokens) != 0:
            filter_tweets.append(" ".join(newtokens))
            ynew.append(y[t])
    return filter_tweets, ynew

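# Hedged usage sketch (not part of the original source): the toy tweets and labels
# below are made up; assumes ekphrasis is installed and its "twitter" statistics
# are available, as preprocess_dataset above requires.
toy_tweets = ["Loved the show :) https://t.co/xyz", "@user CANT WAIT for tonight!!!"]
toy_labels = [1, 0]
filtered, y_filtered = preprocess_dataset(toy_tweets, toy_labels)
print(filtered)
print(y_filtered)
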
def __init__(self):
    self.label2emotion = {0: "others", 1: "happy", 2: "sad", 3: "angry"}
    self.emotion2label = {"others": 0, "happy": 1, "sad": 2, "angry": 3}
    self.emoticons_additional = {
        '(^・^)': '<happy>', ':‑c': '<sad>', '=‑d': '<happy>',
        ":'‑)": '<happy>', ':‑d': '<laugh>', ':‑(': '<sad>',
        ';‑)': '<happy>', ':‑)': '<happy>', ':\\/': '<sad>',
        'd=<': '<annoyed>', ':‑/': '<annoyed>', ';‑]': '<happy>',
        '(^�^)': '<happy>', 'angru': 'angry', "d‑':": '<annoyed>',
        ":'‑(": '<sad>', ":‑[": '<annoyed>', '(�?�)': '<happy>',
        'x‑d': '<laugh>',
    }
    self.text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'date', 'number'],
        # terms that will be annotated
        annotate={"hashtag", "allcaps", "elongated", "repeated",
                  'emphasis', 'censored'},
        fix_html=True,  # fix HTML tokens
        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",
        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # Unpack contractions (can't -> can not)
        spell_correct_elong=True,  # spell correction for elongated words
        # select a tokenizer. You can use SocialTokenizer, or pass your own
        # the tokenizer, should take as input a string and return a list of tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        # list of dictionaries, for replacing tokens extracted from the text,
        # with other expressions. You can pass more than one dictionary.
        dicts=[emoticons, self.emoticons_additional])

def __init__(
        self,
        liwc_path: str = '',
        emolex_path: str = 'english_emolex.csv',
        estimator_path: str = 'english_twitter_politeness_estimator.joblib',
        feature_defn_path: str = 'english_twitter_additional_features.pickle',
        countVectorizer_path: str = '') -> None:
    # Preload LIWC dictionary:
    if liwc_path:
        liwc_df = pd.read_csv(liwc_path)
        liwc_df['*'] = liwc_df['term'].str.endswith('*')
        liwc_df['t'] = liwc_df['term'].str.rstrip('*')
        self.liwc_prefx = liwc_df[liwc_df['*']].groupby('category')['t'].apply(set)
        self.liwc_whole = liwc_df[~liwc_df['*']].groupby('category')['t'].apply(set)
        self.use_liwc = True
    # Preload EmoLex dictionary:
    emolex_df = pd.read_csv(emolex_path, index_col=0)
    self.emolex = emolex_df.apply(lambda s: set(s[s == 1].index))
    # Preload additional feature rules:
    pltlex = pd.read_pickle(feature_defn_path)
    types = pltlex.apply(type)
    self.pltlex_ptn = pltlex[types == re.Pattern].to_dict()
    self.pltlex_set = pltlex[types == set].to_dict()
    # Initialize tokenizer:
    self.text_processor = TextPreProcessor(
        # terms that will be normalized:
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'date', 'number'],
        # terms that will be annotated:
        annotate={"hashtag", "allcaps", "elongated", "repeated",
                  'emphasis', 'censored'},
        # perform word segmentation on hashtags:
        unpack_hashtags=False,
        # Unpack contractions (can't -> can not):
        unpack_contractions=True,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
    )
    # Preload classifier:
    self.clf = joblib.load(estimator_path)
    if countVectorizer_path:
        self.counter = joblib.load(countVectorizer_path)
        self.use_cntVec = True

class TextPreprocessor():
    def __init__(self):
        self.text_processor_options = TextPreProcessor(
            normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                       'time', 'date', 'number'],
            unpack_contractions=False,
            annotate={"allcaps", "elongated", "repeated", 'emphasis', 'censored'},
            fix_html=True,  # fix HTML tokens
            # corpus from which the word statistics are going to be used
            # for word segmentation and correction
            segmenter="english",
            corrector="english",
            unpack_hashtags=False,  # perform word segmentation on hashtags
            spell_correct_elong=False,  # spell correction for elongated words
            # the tokenizer should take a string as input and return a list of tokens
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            # list of dictionaries, for replacing tokens extracted from the text
            dicts=[emoticons]
        )

    def do_ekphrasis_preprocessing(self, sentences):
        if isinstance(sentences, str):
            return self.text_processor_options.pre_process_doc(sentences)
        assert type(sentences).__module__ == np.__name__
        preprocessed = [self.text_processor_options.pre_process_doc(s)
                        for s in sentences]
        return np.array(preprocessed)

    def do_decontraction(self, sentences):
        if isinstance(sentences, str):
            sentences = np.array([sentences])
        assert type(sentences).__module__ == np.__name__
        preprocessed = []
        for s in sentences:
            # Does not deal with 'd as it is ambiguous
            s = re.sub(r"[Ww]on\'t", "will not", s)
            s = re.sub(r"[Cc]an\'t", "can not", s)
            s = re.sub(r"[Cc]annot", "can not", s)
            s = re.sub(r"n\'t", " not", s)
            s = re.sub(r"\'re", " are", s)
            s = re.sub(r"[Hh]e\'s", "he is", s)
            s = re.sub(r"[Ss]he\'s", "she is", s)
            s = re.sub(r"[Ii]t\'s", "it is", s)
            s = re.sub(r"\'ll", " will", s)
            s = re.sub(r"\'ve", " have", s)
            s = re.sub(r"\'m", " am", s)
            s = re.sub(r"[Dd]idn\'t", "did not", s)
            preprocessed.append(s)
        return np.array(preprocessed)

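# Hedged usage sketch (not part of the original source): assumes numpy (np), re and
# the ekphrasis imports used by the class above are available at module level.
tp = TextPreprocessor()
print(tp.do_ekphrasis_preprocessing("I saw the new #johndoe movie and it suuuuucks!!!"))
print(tp.do_decontraction("I can't believe it won't work"))
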
def twitter_preprocess():
    preprocessor = TextPreProcessor(
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'date', 'number'],
        annotate={"hashtag", "elongated", "allcaps", "repeated",
                  'emphasis', 'censored'},
        all_caps_tag="wrap",
        fix_text=True,
        segmenter="twitter_2018",
        corrector="twitter_2018",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons]).pre_process_doc

    def preprocess(name, dataset):
        desc = "PreProcessing dataset {}...".format(name)
        # data = []
        # with multiprocessing.Pool(processes=4) as pool:
        #     iterator = pool.imap_unordered(preprocessor, X, 1000)
        #     for i, result in enumerate(tqdm(iterator, total=len(X))):
        #         pass
        data = [preprocessor(x) for x in tqdm(dataset, desc=desc)]
        return data

    return preprocess

def get_text_processor(word_stats='twitter'):
    return TextPreProcessor(
        # terms that will be normalized
        normalize=['url', 'email', 'phone', 'user'],
        # terms that will be annotated
        annotate={"hashtag", "allcaps", "elongated", "repeated",
                  'emphasis', 'censored'},
        fix_html=True,  # fix HTML tokens
        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter=word_stats,
        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector=word_stats,
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # unpack contractions (can't -> can not)
        spell_correct_elong=False,  # spell correction for elongated words
        # select a tokenizer. You can use SocialTokenizer, or pass your own;
        # the tokenizer should take a string as input and return a list of tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        # list of dictionaries for replacing tokens extracted from the text
        # with other expressions. You can pass more than one dictionary.
        dicts=[emoticons])

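# Hedged usage sketch (not part of the original source): assumes ekphrasis is
# installed, its "twitter" word statistics can be downloaded on first use, and the
# usual imports (TextPreProcessor, SocialTokenizer, emoticons) are in scope.
processor = get_text_processor(word_stats='twitter')
sample = "@SentimentSymp: can't wait for the Nov 9 #Sentiment talks! YAAAAAAY!!! :-D"
print(" ".join(processor.pre_process_doc(sample)))
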
def clean_then_tokenize_text(data):
    text_all = []
    text_processor = TextPreProcessor(normalize=['user', 'url'])
    for key in data:
        text = data[key]
        a = []
        for line in text:
            # no tokenizer is configured, so pre_process_doc returns a string
            line = text_processor.pre_process_doc(line)
            temp = " ".join(text_to_word_sequence(line))
            a.append(temp)
        data[key]['cln_text'] = a
        text_all += a
    return text_all

def twitter_preprocess():
    """
    ekphrasis-social tokenizer sentence preprocessor.
    Substitutes a series of terms by special tokens when called
    over an iterable (dataset).
    """
    norm = ['url', 'email', 'percent', 'money', 'phone', 'user',
            'time', 'date', 'number']
    ann = {"hashtag", "elongated", "allcaps", "repeated",
           "emphasis", "censored"}
    preprocessor = TextPreProcessor(
        normalize=norm,
        annotate=ann,
        all_caps_tag="wrap",
        fix_text=True,
        segmenter="twitter_2018",
        corrector="twitter_2018",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons]).pre_process_doc

    def preprocess(name, dataset):
        description = " Ekphrasis-based preprocessing dataset "
        description += "{}...".format(name)
        data = [preprocessor(x) for x in tqdm(dataset, desc=description)]
        return data

    return preprocess

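# Hedged usage sketch (not part of the original source): the factory above returns a
# preprocess(name, dataset) callable. Assumes tqdm and the ekphrasis imports are
# available and that the "twitter_2018" word statistics used above exist locally.
preprocess = twitter_preprocess()
tokens = preprocess("demo", ["he's aaaaaaaaand rt CANT WAIT for #TwinPeaks :)))"])
print(tokens[0])
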
def twitter_preprocess():
    preprocessor = TextPreProcessor(
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'date', 'number'],
        annotate={"hashtag", "elongated", "allcaps", "repeated",
                  'emphasis', 'censored'},
        all_caps_tag="wrap",
        fix_text=True,
        segmenter="twitter_2018",
        corrector="twitter_2018",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons]).pre_process_doc

    def preprocess(name, dataset):
        desc = "PreProcessing dataset from nlp.py:33 {}...".format(name)
        data = [preprocessor(x) for x in tqdm(dataset, desc=desc)]
        return data

    return preprocess

def datastories_processor(x):
    from ekphrasis.dicts.emoticons import emoticons
    from ekphrasis.classes.tokenizer import SocialTokenizer
    from ekphrasis.classes.preprocessor import TextPreProcessor

    text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'date', 'number'],
        # terms that will be annotated
        annotate={"hashtag", "allcaps", "elongated", "repeated",
                  'emphasis', 'censored'},
        fix_html=True,  # fix HTML tokens
        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",
        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # Unpack contractions (can't -> can not)
        spell_correct_elong=False,  # spell correction for elongated words
        # select a tokenizer. You can use SocialTokenizer, or pass your own
        # the tokenizer, should take as input a string and return a list of tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        # list of dictionaries, for replacing tokens extracted from the text,
        # with other expressions. You can pass more than one dictionary.
        dicts=[emoticons])

    x = [text_processor.pre_process_doc(sent) for sent in x]
    temp = []
    for sent in x:
        context = ''
        for word in sent:
            context = context + ' ' + word
        temp.append(context)
    return temp

def __init__(self, **kwargs):
    self.text_processor = TextPreProcessor(
        omit=kwargs.get('omit', []),
        normalize=kwargs.get(
            'normalize',
            ['url', 'email', 'phone', 'user', 'time', 'date']),
        annotate=kwargs.get('annotate', {}),
        fix_html=kwargs.get('fix_html', True),
        segmenter=kwargs.get('segmenter', "twitter"),
        corrector=kwargs.get('corrector', "twitter"),
        unpack_hashtags=kwargs.get('unpack_hashtags', True),
        unpack_contractions=kwargs.get('unpack_contractions', True),
        spell_correct_elong=kwargs.get('fix_elongation', True),
        spell_correction=kwargs.get('spell_correction', True),
        fix_bad_unicode=kwargs.get('fix_bad_unicode', True),
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons])

def get_tweet_processor(additional_dictionary_list=None):
    dicts = [emoticons]
    # print(dicts)
    print(len(dicts))
    if additional_dictionary_list:
        dicts.extend(additional_dictionary_list)
        print(len(dicts))
    '''
    Test with this code block:
    sentences = [
        "he's aaaaaaaaand rt CANT WAIT for the ijwts new season of #TwinPeaks \(^o^)/!!! #davidlynch #tvseries :)))",
        "I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies :/",
        "@SentimentSymp: can't wait for the Nov 9 #Sentiment talks! YAAAAAAY !!! :-D http://sentimentsymposium.com/."
    ]
    for s in sentences:
        print(" ".join(text_processor.pre_process_doc(s)))
    '''
    text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=['email', 'percent', 'money', 'phone', 'user',
                   'time', 'url', 'date', 'number'],
        # terms that will be annotated
        annotate={"hashtag", "allcaps", "elongated", "repeated",
                  'emphasis', 'censored'},
        fix_html=True,  # fix HTML tokens
        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",
        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # Unpack contractions (can't -> can not)
        spell_correct_elong=True,  # spell correction for elongated words
        # select a tokenizer. You can use SocialTokenizer, or pass your own
        # the tokenizer, should take as input a string and return a list of tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        # list of dictionaries, for replacing tokens extracted from the text,
        # with other expressions. You can pass more than one dictionary.
        dicts=dicts)
    return text_processor

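# Hedged usage sketch (not part of the original source): the extra dictionary below
# is a made-up slang expansion, only to illustrate additional_dictionary_list.
extra = {"ijwts": "i just want to say"}
processor = get_tweet_processor(additional_dictionary_list=[extra])
print(" ".join(processor.pre_process_doc("ijwts #TwinPeaks is back :)")))
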
def __init__(self, verbose: int=0,
             omit=None,
             normalize=None,
             annotate={"hashtag", "allcaps", "elongated", "repeated", 'emphasis'},
             segmenter="twitter",
             corrector="twitter",
             unpack_hashtags=False,
             unpack_contractions=True,
             spell_correct_elong=True,
             spell_correction=True,
             tokenizer=Tokenizer(lowercase=True),
             dicts=None):
    super().__init__(name="EkhprasisPreprocessor", verbose=verbose)
    if dicts is None:
        dicts = [others, emoticons_original]
    if normalize is None:
        normalize = ['number']
    if omit is None:
        omit = ['email', 'percent', 'money', 'phone', 'user', 'time', 'url', 'date']
    logging.info("{} loading...".format(self._name))
    self.tweet_processor = TextPreProcessor(
        # omit terms
        omit=omit,
        # terms that will be normalized
        normalize=normalize,
        # terms that will be annotated
        annotate=annotate,
        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter=segmenter,
        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector=corrector,
        unpack_hashtags=unpack_hashtags,  # perform word segmentation on hashtags
        unpack_contractions=unpack_contractions,  # Unpack contractions (can't -> can not)
        spell_correct_elong=spell_correct_elong,  # spell correction for elongated words
        spell_correction=spell_correction,  # spell correction
        # select a tokenizer. You can use SocialTokenizer, or pass your own
        # the tokenizer, should take as input a string and return a list of tokens
        tokenizer=tokenizer.tokenize,
        # list of dictionaries, for replacing tokens extracted from the text,
        # with other expressions. You can pass more than one dictionary.
        dicts=dicts
    )

def load_instances(config, instances):
    for instance_config in config["REST_instances"]:
        instance = Instance(instance_config["name"],
                            instance_config["language"],
                            instance_config["embeddings_path"],
                            instance_config["preprocessing_style"],
                            instance_config["model_path"],
                            instance_config["labels"])
        instance.text_processor = TextPreProcessor(
            # terms that will be normalized
            normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                       'time', 'date', 'number'],
            # terms that will be annotated
            annotate={"hashtag", "allcaps", "elongated", "repeated",
                      'emphasis', 'censored'},
            fix_html=True,  # fix HTML tokens
            # corpus from which the word statistics are going to be used
            # for word segmentation
            segmenter=instance_config["preprocessing_style"],
            # corpus from which the word statistics are going to be used
            # for spell correction
            corrector=instance_config["preprocessing_style"],
            unpack_hashtags=True,  # perform word segmentation on hashtags
            unpack_contractions=True,  # Unpack contractions (can't -> can not)
            spell_correct_elong=False,  # spell correction for elongated words
            # select a tokenizer. You can use SocialTokenizer, or pass your own
            # the tokenizer, should take as input a string and return a list of tokens
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            # list of dictionaries, for replacing tokens extracted from the text,
            # with other expressions. You can pass more than one dictionary.
            dicts=[emoticons])
        instance.itos, instance.stoi, instance.vectors, instance.embeddings_size = \
            load_embeddings(instance.embeddings_path)
        instance.text = data.Field()
        instance.text.build_vocab([instance.itos])
        instance.text.vocab.set_vectors(instance.stoi, instance.vectors,
                                        instance.embeddings_size)
        instance.model = torch.load(
            instance.model_path,
            map_location='cpu' if not cuda_available else None)
        instance.model = instance.model.eval()
        instances[instance_config["name"]] = instance

def __init__(self):
    self.text_processor_options = TextPreProcessor(
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'date', 'number'],
        unpack_contractions=False,
        annotate={"allcaps", "elongated", "repeated", 'emphasis', 'censored'},
        fix_html=True,  # fix HTML tokens
        # corpus from which the word statistics are going to be used
        # for word segmentation and correction
        segmenter="english",
        corrector="english",
        unpack_hashtags=False,  # perform word segmentation on hashtags
        spell_correct_elong=False,  # spell correction for elongated words
        # the tokenizer, should take as input a string and return a list of tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        # list of dictionaries, for replacing tokens extracted from the text,
        dicts=[emoticons]
    )

def tokenize(data,
             is_lower=True,
             remove_stopwords=True,
             remove_puncts=True,
             remove_num=True,
             remove_currency=True):
    text_processor = TextPreProcessor(
        annotate=['hashtag'],
        fix_html=True,  # fix HTML tokens
        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="english",
        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="english",
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # Unpack contractions (can't -> can not)
        spell_correct=True,
    )

    tokenized_corpus = []
    for sentence in data:
        tokenized_sentence = []
        # processed_sentence = text_processor.pre_process_doc(sentence)
        # clean_sentence = clean(processed_sentence, **clean_text_param)
        spacy_doc = nlp(sentence)  # assumes a module-level spaCy model, e.g. nlp = spacy.load(...)
        for token in spacy_doc:
            processed_token = token
            if remove_stopwords and processed_token.is_stop:
                continue
            elif remove_puncts and processed_token.is_punct:
                continue
            elif remove_num and processed_token.is_digit:
                continue
            elif remove_currency and processed_token.is_currency:
                continue
            elif is_lower:
                tokenized_sentence.append(token.lower_)
            else:
                tokenized_sentence.append(token.text)
        tokenized_corpus.append(tokenized_sentence)
    return tokenized_corpus

def preprocess_through_ekphrasis(train_file_path, test_file_path, trial_file_path):
    text_processor = TextPreProcessor(
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'date', 'number'],
        annotate={"hashtag", "allcaps", "elongated", "repeated",
                  'emphasis', 'censored'},
        fix_html=True,
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=True,
        spell_correction=True,
        all_caps_tag="wrap",
        fix_bad_unicode=True,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons])

    for file_path in [train_file_path, test_file_path, trial_file_path]:
        with open(file_path, 'r', newline='') as file:
            new_sentences = list()
            labels = list()
            for line in file:
                labels.append(line.split('\t')[0])
                new_sentences.append(" ".join(
                    text_processor.pre_process_doc(line.split('\t')[1])))
        with open(file_path[:-4] + "_ekphrasis.csv", 'w', newline='') as new_file:
            for label, sentence in zip(labels, new_sentences):
                new_file.write("{}\t{}\n".format(
                    label,
                    sentence.replace("[ <hashtag> triggerword </hashtag> #]",
                                     "[#TRIGGERWORD#]")
                            .replace("[ <allcaps> newline </allcaps> ]",
                                     "[NEWLINE]")))

def __init__(self):
    self.transformations = []
    self.text_processor = TextPreProcessor(
        fix_html=True,  # fix HTML tokens
        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="english",
        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="english",
        unpack_hashtags=False,  # perform word segmentation on hashtags
        unpack_contractions=False,  # Unpack contractions (can't -> can not)
        spell_correct=True,  # spell correction for elongated words
    )
    self.punct = r"[\.,:;\(\)\[\]@\-\$£]"  # raw string so the escapes stay literal
    nltk.download('stopwords')
    self.stops = stopwords.words('english')
    self.nlp = spacy.load('en_core_web_lg')

def __init__(self, text, **kwargs):
    self.text = text
    self.text_processor = TextPreProcessor(
        # terms that will be normalized, e.g. [email protected] to <email>
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'date', 'number'],
        # terms that will be annotated, e.g. <hashtag>#test</hashtag>
        annotate={"hashtag", "allcaps", "elongated", "repeated", 'emphasis'},
        fix_html=True,  # fix HTML tokens
        unpack_hashtags=True,  # perform word segmentation on hashtags
        # select a tokenizer. You can use SocialTokenizer, or pass your own
        # if the text is not tokenized on whitespace;
        # the tokenizer should take a string as input and return a list of tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        # list of dictionaries for replacing tokens extracted from the text
        # with other expressions. You can pass more than one dictionary.
        dicts=[emoticons])

def __init__(self, word_indices, text_lengths, **kwargs):
    self.word_indices = word_indices
    filter_classes = kwargs.get("filter_classes", None)
    self.y_one_hot = kwargs.get("y_one_hot", True)

    self.pipeline = Pipeline([
        ('preprocess', CustomPreProcessor(
            TextPreProcessor(
                backoff=['url', 'email', 'percent', 'money', 'phone', 'user',
                         'time', 'date', 'number'],
                include_tags={"hashtag", "allcaps", "elongated", "repeated",
                              'emphasis', 'censored'},
                fix_html=True,
                segmenter="twitter",
                corrector="twitter",
                unpack_hashtags=True,
                unpack_contractions=True,
                spell_correct_elong=False,
                tokenizer=SocialTokenizer(lowercase=True).tokenize,
                dicts=[emoticons]))),
        ('ext', EmbeddingsExtractor(word_indices=word_indices,
                                    max_lengths=text_lengths,
                                    add_tokens=True,
                                    unk_policy="random"))
    ])

    # loading data
    print("Loading data...")
    dataset = DataLoader(verbose=False).get_data(years=None, datasets=None)
    random.Random(42).shuffle(dataset)

    if filter_classes:
        dataset = [d for d in dataset if d[0] in filter_classes]

    self.X = [obs[1] for obs in dataset]
    self.y = [obs[0] for obs in dataset]
    print("total observations:", len(self.y))

    print("-------------------\ntraining set stats\n-------------------")
    print_dataset_statistics(self.y)
    print("-------------------")

def twitter_preprocess(self):
    preprocessor = TextPreProcessor(
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'date', 'number'],
        annotate={"hashtag", "elongated", "allcaps", "repeated",
                  'emphasis', 'censored'},
        all_caps_tag="wrap",
        fix_text=True,
        segmenter="twitter_2018",
        corrector="twitter_2018",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons])

    text = self.data
    cache_file = os.path.join('./', "cached",
                              "preprocessed_" + self.name + ".pkl")
    preprocessed = None
    if os.path.isfile(cache_file):
        with open(cache_file, 'rb') as f:
            preprocessed = pickle.load(f)
    else:
        preprocessed = [
            preprocessor.pre_process_doc(x)
            for x in tqdm(text, desc="Preprocessing dataset...")
        ]
        with open(cache_file, 'wb') as f:
            pickle.dump(preprocessed, f)
    return preprocessed

def twitter_preprocessor():
    preprocessor = TextPreProcessor(
        normalize=['url', 'email', 'phone', 'user'],
        annotate={"hashtag", "elongated", "allcaps", "repeated",
                  'emphasis', 'censored'},
        all_caps_tag="wrap",
        fix_text=False,
        segmenter="twitter_2018",
        corrector="twitter_2018",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize).pre_process_doc
    return preprocessor

def emotion_and_split():
    text_process = TextPreProcessor(
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons]
    )
    return text_process

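# Hedged usage sketch (not part of the original source): the minimal processor above
# only segments hashtags, unpacks contractions and maps emoticons via the emoticons dict.
tp = emotion_and_split()
print(tp.pre_process_doc("can't stop smiling :) #GoodVibes"))
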
def __init__(self):
    self.root_dir = "CrisisLexT26/"
    self.count = 0
    self.natural_disasters = []
    self.non_natural_disasters = []
    self.prep_natural_disasters = []
    self.prep_non_natural_disasters = []
    self.nat_labels = []
    self.non_natural_labels = []
    self.en_prep_nat_tweets = []
    self.en_prep_non_nat_tweets = []
    self.text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time', 'date', 'number'],
        # terms that will be annotated
        annotate={"hashtag", "allcaps", "elongated", "repeated",
                  'emphasis', 'censored'},
        fix_html=True,  # fix HTML tokens
        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",
        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # Unpack contractions (can't -> can not)
        spell_correct_elong=True,  # spell correction for elongated words
        # select a tokenizer. You can use SocialTokenizer, or pass your own
        # the tokenizer, should take as input a string and return a list of tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        # list of dictionaries, for replacing tokens extracted from the text,
        # with other expressions. You can pass more than one dictionary.
        dicts=[emoticons])