def __init__(self, pipeline=None, lowercase=False, verbose=False, debug=False):
    """
    Args:
        pipeline (list): list of terms to use for tokenization.
            Each term is a key from the dict of regexes `expressions.txt`.
            Order matters!
        lowercase (bool): set to True in order to lowercase the text
        verbose (bool): set to True to print each text after tokenization.
            Useful for debugging purposes.
        debug (bool): set to True in order to pause after tokenizing each text
            (wait for pressing any key). Useful for debugging purposes, if you
            want to inspect each text as it is processed.
    """
    self.lowercase = lowercase
    self.debug = debug
    self.verbose = verbose
    colorama.init(autoreset=False, convert=False, strip=False, wrap=True)

    self.pipeline = []
    self.regexes = ExManager().expressions

    if pipeline is None:
        pipeline = self.default_pipeline

    self.build(pipeline)

    self.pipeline.append(r"(?:\S)")  # CATCH ALL remaining terms

    self.tok = re.compile(r"({})".format("|".join(self.pipeline)))
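
# Usage sketch (assumptions: this __init__ belongs to ekphrasis' generic
# Tokenizer class, pipeline entries are keys of the `expressions.txt` regex
# dict such as "URL" or "HASHTAG", and tokenization is exposed via a
# `tokenize(text)` method as the verbose/debug options imply). The key names
# below are borrowed from the SocialTokenizer snippet further down.
from ekphrasis.classes.tokenizer import Tokenizer

custom_tokenizer = Tokenizer(pipeline=["URL", "EMAIL", "HASHTAG"],
                             lowercase=True)
print(custom_tokenizer.tokenize("Check https://example.com #nlp"))
# expected (roughly): ['check', 'https://example.com', '#nlp']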
def __init__(self, corpus="english", max_split_length=20): """ Args: corpus (str): the statistics from which corpus to use for the spell correction. max_split_length (int): the maximum length of that a word can have for looking for splits """ # self.unigrams = Counter(read_stats(corpus, 1)) # self.bigrams = Counter(read_stats(corpus, 2)) self.unigrams = read_stats(corpus, 1) self.bigrams = read_stats(corpus, 2) self.N = sum(self.unigrams.values()) self.L = max_split_length self.Pw = Pdist(self.unigrams, self.N, self.unk_probability) self.P2w = Pdist(self.bigrams, self.N) self.case_split = ExManager().get_compiled()["camel_split"]
class Segmenter:

    def __init__(self, corpus="english", max_split_length=20):
        """
        Args:
            corpus (str): the corpus from which to load the word statistics
                used for segmentation.
            max_split_length (int): the maximum length that a word can have
                when looking for splits.
        """
        # self.unigrams = Counter(read_stats(corpus, 1))
        # self.bigrams = Counter(read_stats(corpus, 2))
        self.unigrams = read_stats(corpus, 1)
        self.bigrams = read_stats(corpus, 2)
        self.N = sum(self.unigrams.values())
        self.L = max_split_length

        self.Pw = Pdist(self.unigrams, self.N, self.unk_probability)
        self.P2w = Pdist(self.bigrams, self.N)

        self.case_split = ExManager().get_compiled()["camel_split"]

    def condProbWord(self, word, prev):
        """
        Conditional probability of a word, given the previous word.
        If the bigram is not in our list, then fall back to unigrams.

        Args:
            word (str): candidate word
            prev (str): previous observed word

        Returns:
            float: the conditional probability P(word | prev)
        """
        try:
            return self.P2w[prev + NGRAM_SEP + word] / float(self.Pw[prev])
        except KeyError:
            return self.Pw(word)

    @staticmethod
    def unk_probability(key, total):
        """
        Estimate the probability of an unknown word, penalizing its length.
        :param key: the word
        :param total: the count of all tokens
        :return: the smoothed probability estimate
        """
        return 10. / (total * 10 ** len(key))

    @staticmethod
    def combine(first, rem):
        """
        Combine first and rem results into one (probability, words) pair.
        :param first: a tuple in the form: (probability, word)
        :param rem: a tuple in the form: (probability, list_of_words)
        :return: the combined (probability, list_of_words) pair
        """
        (first_prob, first_word) = first
        (rem_prob, rem_words) = rem
        return first_prob + rem_prob, [first_word] + rem_words

    def splits(self, text):
        """
        Return a list of all possible (first, rem) pairs,
        with the length of first <= L.
        :param text: the text to be split
        :return: list of (first, rem) pairs
        """
        return [(text[:i + 1], text[i + 1:])
                for i in range(min(len(text), self.L))]

    # if you don't have enough RAM lower the maxsize
    @lru_cache(maxsize=20000)
    def find_segment(self, text, prev='<S>'):
        """
        Return (log P(words), words), where words is the best estimated
        segmentation.
        :param text: the text to be segmented
        :param prev: the previous word (used by the bigram model)
        :return: (log probability, list of words)
        """
        if not text:
            return 0.0, []
        candidates = [
            self.combine((log10(self.condProbWord(first, prev)), first),
                         self.find_segment(rem, first))
            for first, rem in self.splits(text)
        ]
        return max(candidates)

    # if you don't have enough RAM lower the maxsize
    @lru_cache(maxsize=20000)
    def segment(self, word):
        if word.islower():
            return " ".join(self.find_segment(word)[1])
        else:
            return self.case_split.sub(r' \1', word).lower()

    def demo(self):
        print("BBCtest: ", self.segment('BbcTest'))
        print("choosespain: ", self.segment('choosespain'))
        print("speedofart: ", self.segment('speedofart'))
        print("smallandinsignificant: ",
              self.segment('smallandinsignificant'))
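
# Usage sketch: a quick check that mirrors demo() above. It assumes the
# "english" unigram/bigram statistics shipped with ekphrasis are available
# (read_stats loads them); the exact outputs depend on those counts.
seg = Segmenter(corpus="english", max_split_length=20)
print(seg.segment("smallandinsignificant"))  # expected: "small and insignificant"
print(seg.segment("BbcTest"))  # non-lowercase input goes through the camel-case split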
def __init__(self, **kwargs):
    """
    Kwargs:
        omit (list): choose which tokens you want to omit from the text.
            possible values: ['email', 'percent', 'money', 'phone', 'user',
                'time', 'url', 'date', 'hashtag']
            Important Notes:
                1 - put url at front, if you plan to use it.
                    Messes with the regexes!
                2 - if you use hashtag then unpack_hashtags will
                    automatically be set to False

        normalize (list): choose which tokens you want to normalize
            from the text.
            possible values: ['email', 'percent', 'money', 'phone', 'user',
                'time', 'url', 'date', 'hashtag']
            for example: [email protected] will be transformed to <email>
            Important Notes:
                1 - put url at front, if you plan to use it.
                    Messes with the regexes!
                2 - if you use hashtag then unpack_hashtags will
                    automatically be set to False

        unpack_contractions (bool): Replace *English* contractions in
            ``text`` str with their unshortened forms
            for example: can't -> can not, wouldn't -> would not, and so on...

        unpack_hashtags (bool): split a hashtag to its constituent words.
            for example: #ilikedogs -> i like dogs

        annotate (list): add special tags to special tokens.
            possible values: ['hashtag', 'allcaps', 'elongated', 'repeated']
            for example: [email protected] -> [email protected] <email>

        tokenizer (callable): callable function that accepts a string and
            returns a list of strings. If no tokenizer is provided then
            the text will be tokenized on whitespace.

        segmenter (str): define the statistics of what corpus you would
            like to use [english, twitter]

        corrector (str): define the statistics of what corpus you would
            like to use [english, twitter]

        all_caps_tag (str): how to wrap the capitalized words
            values [single, wrap, every]
            Note: applicable only when `allcaps` is included in annotate[]
                - single: add a tag after the last capitalized word
                - wrap: wrap all words with opening and closing tags
                - every: add a tag after each word

        spell_correct_elong (bool): choose if you want to perform spell
            correction after the normalization of elongated words.
            * significantly affects performance (speed)

        spell_correction (bool): choose if you want to perform spell
            correction to the text
            * significantly affects performance (speed)

        fix_text (bool): choose if you want to fix bad unicode terms and
            html entities.
    """
    self.omit = kwargs.get("omit", {})
    self.backoff = kwargs.get("normalize", {})
    self.include_tags = kwargs.get("annotate", {})
    self.unpack_contractions = kwargs.get("unpack_contractions", False)
    self.tokenizer = kwargs.get("tokenizer", None)
    self.dicts = kwargs.get("dicts", None)
    self.spell_correction = kwargs.get("spell_correction", False)
    self.spell_correct_elong = kwargs.get("spell_correct_elong", False)
    self.fix_text = kwargs.get("fix_bad_unicode", False)  # note: kwarg name is "fix_bad_unicode"
    self.unpack_hashtags = kwargs.get("unpack_hashtags", False)

    self.segmenter_corpus = kwargs.get("segmenter", "english")
    self.corrector_corpus = kwargs.get("corrector", "english")
    self.all_caps_tag = kwargs.get("all_caps_tag", "wrap")
    self.mode = kwargs.get("mode", "normal")

    if self.unpack_hashtags:
        self.segmenter = Segmenter(corpus=self.segmenter_corpus)
    if self.mode != "fast":
        self.spell_corrector = SpellCorrector(corpus=self.corrector_corpus)

    self.regexes = ExManager().get_compiled()

    if 'hashtag' in self.omit or 'hashtag' in self.backoff:
        print("You can't omit/backoff and unpack hashtags!\n"
              "unpack_hashtags will be set to False")
        self.unpack_hashtags = False
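
# Usage sketch (assumptions: the __init__ above belongs to TextPreProcessor,
# processing is exposed through a `pre_process_doc(text)` method as in upstream
# ekphrasis, and SocialTokenizer is importable as in the commented-out script
# further down). The kwargs follow the docstring above.
text_processor = TextPreProcessor(
    normalize=['url', 'email', 'user', 'percent', 'money',
               'phone', 'time', 'date'],
    annotate={'hashtag', 'allcaps', 'elongated', 'repeated'},
    unpack_hashtags=True,       # #ilikedogs -> i like dogs
    unpack_contractions=True,   # can't -> can not
    segmenter="twitter",
    corrector="twitter",
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
)
print(text_processor.pre_process_doc("CANT WAIT for #thenewseason :)"))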
def __init__(self, lowercase=False, verbose=False, debug=False, **kwargs):
    """
    Args:
        lowercase (bool): set to True in order to lowercase the text
        verbose (bool): set to True to print each text after tokenization.
            Useful for debugging purposes.
        debug (bool): set to True in order to pause after tokenizing each
            text (wait for pressing any key). Useful for debugging purposes,
            if you want to inspect each text as it is processed.

    Kwargs:
        emojis (bool): True to keep emojis
        urls (bool): True to keep urls
        tags (bool): True to keep tags: <tag>
        emails (bool): True to keep emails
        users (bool): True to keep users handles: @cbaziotis
        hashtags (bool): True to keep hashtags
        cashtags (bool): True to keep cashtags
        phones (bool): True to keep phones
        percents (bool): True to keep percents
        money (bool): True to keep money expressions
        date (bool): True to keep date expressions
        time (bool): True to keep time expressions
        acronyms (bool): True to keep acronyms
        emoticons (bool): True to keep emoticons
        censored (bool): True to keep censored words: f**k
        emphasis (bool): True to keep words with emphasis: *very* good
        numbers (bool): True to keep numbers
    """
    self.lowercase = lowercase
    self.debug = debug
    self.verbose = verbose
    colorama.init(autoreset=False, convert=False, strip=False, wrap=True)

    pipeline = []
    self.regexes = ExManager().expressions

    triggerword = kwargs.get("triggerword", True)
    emojis = kwargs.get("emojis", True)
    urls = kwargs.get("urls", True)
    tags = kwargs.get("tags", True)
    emails = kwargs.get("emails", True)
    users = kwargs.get("users", True)
    hashtags = kwargs.get("hashtags", True)
    cashtags = kwargs.get("cashtags", True)
    phones = kwargs.get("phones", True)
    percents = kwargs.get("percents", True)
    money = kwargs.get("money", True)
    date = kwargs.get("date", True)
    time = kwargs.get("time", True)
    acronyms = kwargs.get("acronyms", True)
    emoticons = kwargs.get("emoticons", True)
    censored = kwargs.get("censored", True)
    emphasis = kwargs.get("emphasis", True)
    numbers = kwargs.get("numbers", True)

    if triggerword:
        pipeline.append(self.regexes["TRIGGERWORD"])
    if urls:
        pipeline.append(self.regexes["URL"])
    if tags:
        pipeline.append(self.regexes["TAG"])
    if emails:
        pipeline.append(self.wrap_non_matching(self.regexes["EMAIL"]))
    if users:
        pipeline.append(self.wrap_non_matching(self.regexes["USER"]))
    if hashtags:
        pipeline.append(self.wrap_non_matching(self.regexes["HASHTAG"]))
    if cashtags:
        pipeline.append(self.wrap_non_matching(self.regexes["CASHTAG"]))
    if phones:
        pipeline.append(self.wrap_non_matching(self.regexes["PHONE"]))
    if percents:
        pipeline.append(self.wrap_non_matching(self.regexes["PERCENT"]))
    if money:
        pipeline.append(self.wrap_non_matching(self.regexes["MONEY"]))
    if date:
        pipeline.append(self.wrap_non_matching(self.regexes["DATE"]))
    if time:
        pipeline.append(self.wrap_non_matching(self.regexes["TIME"]))
    if acronyms:
        pipeline.append(self.wrap_non_matching(self.regexes["ACRONYM"]))
    if emoticons:
        pipeline.append(self.regexes["LTR_FACE"])
        pipeline.append(self.regexes["RTL_FACE"])
    if censored:
        pipeline.append(self.wrap_non_matching(self.regexes["CENSORED"]))
    if emphasis:
        pipeline.append(self.wrap_non_matching(self.regexes["EMPHASIS"]))

    # terms like 'eco-friendly', 'go_to', 'john's'
    # - maybe remove the ' or add a parameter for it
    # pipeline.append(r"(?:\b[a-zA-Z]+[a-zA-Z'\-_]+[a-zA-Z]+\b)")

    # <3 ^5
    if emoticons:
        pipeline.append(
            self.wrap_non_matching(self.regexes["REST_EMOTICONS"]))

    if numbers:
        pipeline.append(self.regexes["NUMBER"])

    if emojis:
        pipeline.append(self.regexes["EMOJI"])

    # any other word
pipeline.append(self.regexes["WORD"]) # EASTERN EMOTICONS - (^_^;) (>_<)> \(^o^)/ if emoticons: pipeline.append( self.wrap_non_matching(self.regexes["EASTERN_EMOTICONS"])) # keep repeated puncts as one term # pipeline.append(r"") pipeline.append("(?:\S)") # CATCH ALL remaining terms self.tok = re.compile(r"({})".format("|".join(pipeline)))
# from ekphrasis.classes.preprocessor import TextPreProcessor
# from ekphrasis.classes.tokenizer import SocialTokenizer
from src.data.preprocessing.dicts.emoticons import emoticons
from src.data.preprocessing.dicts.wrong_word import wrong_word
from src.data.preprocessing.dicts.abbreviations import abbr_word, acronyms

# import spacy_udpipe
# spacy_udpipe.download("it-postwita")
# nlp = spacy_udpipe.load("it-postwita")
# social_tokenizer = lambda text: [token.text for token in nlp(text)]

from ekphrasis.classes.exmanager import ExManager

regexes = ExManager().get_compiled()

backoff = ['url', 'email', 'user', 'percent', 'money', 'phone', 'time', 'date']

# text_processor = TextPreProcessor(
#     normalize=['url', 'email', 'user', 'percent', 'money', 'phone', 'time', 'date'],
#     fix_html=True,  # fix HTML tokens
#
#     # select a tokenizer. You can use SocialTokenizer, or pass your own:
#     # the tokenizer should take a string as input and return a list of tokens
#     tokenizer=social_tokenizer,  # SocialTokenizer(lowercase=False).tokenize,
#
#     # list of dictionaries, for replacing tokens extracted from the text
#     # with other expressions. You can pass more than one dictionary.
#     dicts=[emoticons]
# )
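
# Sketch of how `regexes` and `backoff` could be combined to normalize matched
# spans into <tag> placeholders, in the spirit of the commented-out
# TextPreProcessor config above. This is an assumption-heavy illustration: the
# helper name `normalize_backoff` is hypothetical, and it presumes that
# get_compiled() exposes uppercase keys matching the names in `backoff`
# (e.g. regexes["URL"]); adjust the lookup if the key casing differs.
def normalize_backoff(text):
    for term in backoff:
        pattern = regexes.get(term.upper())
        if pattern is not None:
            # replace each match with its placeholder, e.g. "<url>"
            text = pattern.sub(" <{}> ".format(term), text)
    return " ".join(text.split())  # squeeze the extra whitespace

# print(normalize_backoff("mail me at someone@example.com or visit http://example.com"))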