Example #1
    def __init__(self,
                 pipeline=None,
                 lowercase=False,
                 verbose=False,
                 debug=False):
        """
        Args:
            pipeline (list): list of terms to use for tokenization.
                Each term is a key from the dict of regexes in `expressions.txt`.
                Order matters!
            lowercase (bool): set to True in order to lowercase the text
            verbose (bool): set to True to print each text after tokenization.
                Useful for debugging purposes.
            debug (bool): set to True in order to pause after tokenizing
                each text (waits for a key press).
                Useful for debugging purposes, if you want to inspect each text
                as it is processed.
        """
        self.lowercase = lowercase
        self.debug = debug
        self.verbose = verbose
        colorama.init(autoreset=False, convert=False, strip=False, wrap=True)

        self.pipeline = []

        self.regexes = ExManager().expressions

        if pipeline is None:
            pipeline = self.default_pipeline

        self.build(pipeline)

        self.pipeline.append(r"(?:\S)")  # CATCH ALL remaining terms
        self.tok = re.compile(r"({})".format("|".join(self.pipeline)))
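
A minimal usage sketch for the constructor above, assuming it belongs to ekphrasis' `Tokenizer` class (it takes the same pipeline/lowercase/verbose/debug arguments) and that the class exposes a `tokenize(text)` method that applies the compiled `self.tok` regex:

from ekphrasis.classes.tokenizer import Tokenizer

# default pipeline, lowercased output; verbose=True would print each tokenized text
tokenizer = Tokenizer(lowercase=True)
print(tokenizer.tokenize("Check https://example.com and mail test@example.com :)"))
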
Example #2
    def __init__(self, corpus="english", max_split_length=20):
        """
        Args:
            corpus (str): which corpus's statistics to use for
                the spell correction.
            max_split_length (int): the maximum length that a word can have
                when looking for splits
        """

        # self.unigrams = Counter(read_stats(corpus, 1))
        # self.bigrams = Counter(read_stats(corpus, 2))
        self.unigrams = read_stats(corpus, 1)
        self.bigrams = read_stats(corpus, 2)
        self.N = sum(self.unigrams.values())
        self.L = max_split_length

        self.Pw = Pdist(self.unigrams, self.N, self.unk_probability)
        self.P2w = Pdist(self.bigrams, self.N)

        self.case_split = ExManager().get_compiled()["camel_split"]
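
A minimal usage sketch, assuming this is the `__init__` of ekphrasis' `Segmenter`, which exposes the `segment(word)` method shown in the next example:

from ekphrasis.classes.segmenter import Segmenter

seg = Segmenter(corpus="english")  # loads the unigram/bigram statistics once
print(seg.segment("smallandinsignificant"))  # splits into the most probable word sequence
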
Example #3
class Segmenter:
    def __init__(self, corpus="english", max_split_length=20):
        """
        Args:
            corpus (str): which corpus's statistics to use for
                the spell correction.
            max_split_length (int): the maximum length that a word can have
                when looking for splits
        """

        # self.unigrams = Counter(read_stats(corpus, 1))
        # self.bigrams = Counter(read_stats(corpus, 2))
        self.unigrams = read_stats(corpus, 1)
        self.bigrams = read_stats(corpus, 2)
        self.N = sum(self.unigrams.values())
        self.L = max_split_length

        self.Pw = Pdist(self.unigrams, self.N, self.unk_probability)
        self.P2w = Pdist(self.bigrams, self.N)

        self.case_split = ExManager().get_compiled()["camel_split"]

    def condProbWord(self, word, prev):
        """
        Conditional probability of word, given previous word
        if bigram is not in our list, then fall back to unigrams
        Args:
            word (): candidate word
            prev (): previous observed word

        Returns:

        """
        try:
            return self.P2w[prev + NGRAM_SEP + word] / float(self.Pw[prev])
        except KeyError:
            return self.Pw(word)

    @staticmethod
    def unk_probability(key, total):
        """
        Estimate the probability of an unknown word, penalizing its length
        :param key: the word
        :param total: the count of all tokens
        :return:
        """
        return 10. / (total * 10**len(key))

    @staticmethod
    def combine(first, rem):
        """
        Combine first and rem results into one (probability, words) pair
        :param first: a tuple in the form: probability, word
        :param rem: a tuple in the form: probability, list_of_words
        :return:
        """
        (first_prob, first_word) = first
        (rem_prob, rem_words) = rem
        return first_prob + rem_prob, [first_word] + rem_words

    def splits(self, text):
        """
        Return a list of all possible (first, rem) pairs with max length of first <=L
        :param text:
        :return:
        """
        return [(text[:i + 1], text[i + 1:])
                for i in range(min(len(text), self.L))]

    # if you don't have enough RAM lower the maxsize
    @lru_cache(maxsize=20000)
    def find_segment(self, text, prev='<S>'):
        """
        Return (log P(words), words), where words is the best estimated segmentation
        :param text: the text to be segmented
        :param prev:
        :return:
        """
        if not text:
            return 0.0, []
        candidates = [
            self.combine((log10(self.condProbWord(first, prev)), first),
                         self.find_segment(rem, first))
            for first, rem in self.splits(text)
        ]
        return max(candidates)

    # if you don't have enough RAM lower the maxsize
    @lru_cache(maxsize=20000)
    def segment(self, word):
        if word.islower():
            return " ".join(self.find_segment(word)[1])
        else:
            return self.case_split.sub(r' \1', word).lower()

    def demo(self):
        print("BBCtest: ", self.segment('BbcTest'))
        print("choosespain: ", self.segment('choosespain'))
        print("speedofart: ", self.segment('speedofart'))
        print("smallandinsignificant: ", self.segment('smallandinsignificant'))
Example #4
    def __init__(self, **kwargs):
        """
        Kwargs:
            omit (list): choose which tokens you want to omit from the text.
                possible values: ['email', 'percent', 'money', 'phone', 'user',
                    'time', 'url', 'date', 'hashtag']
                Important Notes:
                            1 - put url first, if you plan to use it;
                                otherwise it messes with the regexes!
                            2 - if you use hashtag, then unpack_hashtags will
                                automatically be set to False

            normalize (list): choose which tokens you want to normalize
                in the text.
                possible values: ['email', 'percent', 'money', 'phone', 'user',
                    'time', 'url', 'date', 'hashtag']
                for example: an email address will be transformed to <email>
                Important Notes:
                            1 - put url first, if you plan to use it;
                                otherwise it messes with the regexes!
                            2 - if you use hashtag, then unpack_hashtags will
                                automatically be set to False

            unpack_contractions (bool): replace *English* contractions in
                ``text`` with their unshortened forms,
                for example: can't -> can not, wouldn't -> would not, and so on.

            unpack_hashtags (bool): split a hashtag into its constituent words.
                for example: #ilikedogs -> i like dogs

            annotate (list): add special tags to special tokens.
                possible values: ['hashtag', 'allcaps', 'elongated', 'repeated']
                for example: an email address will be annotated with a trailing <email> tag

            tokenizer (callable): callable function that accepts a string and
                returns a list of strings. If no tokenizer is provided, then
                the text will be tokenized on whitespace.

            segmenter (str): define which corpus's statistics you would
                like to use for word segmentation [english, twitter]

            corrector (str): define which corpus's statistics you would
                like to use for spell correction [english, twitter]

            all_caps_tag (str): how to wrap the capitalized words
                values [single, wrap, every]
                Note: applicable only when `allcaps` is included in annotate[]
                    - single: add a tag after the last capitalized word
                    - wrap: wrap all words with opening and closing tags
                    - every: add a tag after each word

            spell_correct_elong (bool): choose if you want to perform
                spell correction after the normalization of elongated words.
                * significantly affects performance (speed)

            spell_correction (bool): choose if you want to perform
                spell correction to the text
                * significantly affects performance (speed)

            fix_text (bool): choose if you want to fix bad Unicode terms and
                HTML entities.
        """
        self.omit = kwargs.get("omit", {})
        self.backoff = kwargs.get("normalize", {})
        self.include_tags = kwargs.get("annotate", {})
        self.unpack_contractions = kwargs.get("unpack_contractions", False)
        self.tokenizer = kwargs.get("tokenizer", None)
        self.dicts = kwargs.get("dicts", None)
        self.spell_correction = kwargs.get("spell_correction", False)
        self.spell_correct_elong = kwargs.get("spell_correct_elong", False)
        self.fix_text = kwargs.get("fix_bad_unicode", False)
        self.unpack_hashtags = kwargs.get("unpack_hashtags", False)
        self.segmenter_corpus = kwargs.get("segmenter", "english")
        self.corrector_corpus = kwargs.get("corrector", "english")
        self.all_caps_tag = kwargs.get("all_caps_tag", "wrap")
        self.mode = kwargs.get("mode", "normal")

        if self.unpack_hashtags:
            self.segmenter = Segmenter(corpus=self.segmenter_corpus)
        if self.mode != "fast":
            self.spell_corrector = SpellCorrector(corpus=self.corrector_corpus)

        self.regexes = ExManager().get_compiled()
        if 'hashtag' in self.omit or 'hashtag' in self.backoff:
            print("You can't omit/backoff and unpack hashtags!\n "
                  "unpack_hashtags will be set to False")
            self.unpack_hashtags = False
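
A minimal usage sketch for the pre-processor above, using only kwargs documented in its docstring (assuming this is ekphrasis' `TextPreProcessor`, which exposes a `pre_process_doc(text)` method):

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer

text_processor = TextPreProcessor(
    normalize=['url', 'email', 'user', 'percent', 'money', 'phone', 'time', 'date'],
    annotate=['hashtag', 'allcaps', 'elongated', 'repeated'],
    unpack_hashtags=True,          # #ilikedogs -> i like dogs
    unpack_contractions=True,      # can't -> can not
    segmenter="english",
    corrector="english",
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
)
print(text_processor.pre_process_doc("I LOVE #dogsofinstagram!!! can't wait :)"))
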
Example #5
    def __init__(self, lowercase=False, verbose=False, debug=False, **kwargs):
        """

        Args:
            lowercase (bool): set to True in order to lowercase the text
            verbose (bool): set to True to print each text after tokenization.
                Useful for debugging purposes.
            debug (bool): set to True in order to pause after tokenizing
                each text (wait for pressing any key).
                Useful for debugging purposes, if you want to inspect each text
                as it is processed.

        Kwargs:
            emojis (bool): True to keep emojis
            urls (bool): True to keep urls
            tags (bool): True to keep tags: <tag>
            emails (bool): True to keep emails
            users (bool): True to keep users handles: @cbaziotis
            hashtags (bool): True to keep hashtags
            cashtags (bool): True to keep cashtags
            phones (bool): True to keep phones
            percents (bool): True to keep percents
            money (bool): True to keep money expressions
            date (bool): True to keep date expressions
            time (bool): True to keep time expressions
            acronyms (bool): True to keep acronyms
            emoticons (bool): True to keep emoticons
            censored (bool): True to keep censored words: f**k
            emphasis (bool): True to keep words with emphasis: *very* good
            numbers (bool): True to keep numbers
        """

        self.lowercase = lowercase
        self.debug = debug
        self.verbose = verbose
        colorama.init(autoreset=False, convert=False, strip=False, wrap=True)
        pipeline = []
        self.regexes = ExManager().expressions

        triggerword = kwargs.get("triggerword", True)
        emojis = kwargs.get("emojis", True)
        urls = kwargs.get("urls", True)
        tags = kwargs.get("tags", True)
        emails = kwargs.get("emails", True)
        users = kwargs.get("users", True)
        hashtags = kwargs.get("hashtags", True)
        cashtags = kwargs.get("cashtags", True)
        phones = kwargs.get("phones", True)
        percents = kwargs.get("percents", True)
        money = kwargs.get("money", True)
        date = kwargs.get("date", True)
        time = kwargs.get("time", True)
        acronyms = kwargs.get("acronyms", True)
        emoticons = kwargs.get("emoticons", True)
        censored = kwargs.get("censored", True)
        emphasis = kwargs.get("emphasis", True)
        numbers = kwargs.get("numbers", True)

        if triggerword:
            pipeline.append(self.regexes["TRIGGERWORD"])

        if urls:
            pipeline.append(self.regexes["URL"])

        if tags:
            pipeline.append(self.regexes["TAG"])

        if emails:
            pipeline.append(self.wrap_non_matching(self.regexes["EMAIL"]))

        if users:
            pipeline.append(self.wrap_non_matching(self.regexes["USER"]))

        if hashtags:
            pipeline.append(self.wrap_non_matching(self.regexes["HASHTAG"]))

        if cashtags:
            pipeline.append(self.wrap_non_matching(self.regexes["CASHTAG"]))

        if phones:
            pipeline.append(self.wrap_non_matching(self.regexes["PHONE"]))

        if percents:
            pipeline.append(self.wrap_non_matching(self.regexes["PERCENT"]))

        if money:
            pipeline.append(self.wrap_non_matching(self.regexes["MONEY"]))

        if date:
            pipeline.append(self.wrap_non_matching(self.regexes["DATE"]))

        if time:
            pipeline.append(self.wrap_non_matching(self.regexes["TIME"]))

        if acronyms:
            pipeline.append(self.wrap_non_matching(self.regexes["ACRONYM"]))

        if emoticons:
            pipeline.append(self.regexes["LTR_FACE"])
            pipeline.append(self.regexes["RTL_FACE"])

        if censored:
            pipeline.append(self.wrap_non_matching(self.regexes["CENSORED"]))

        if emphasis:
            pipeline.append(self.wrap_non_matching(self.regexes["EMPHASIS"]))

        # terms like 'eco-friendly', 'go_to', 'john's' - maybe remove the ' or add a parameter for it
        # pipeline.append(r"(?:\b[a-zA-Z]+[a-zA-Z'\-_]+[a-zA-Z]+\b)")

        # <3 ^5
        if emoticons:
            pipeline.append(
                self.wrap_non_matching(self.regexes["REST_EMOTICONS"]))

        if numbers:
            pipeline.append(self.regexes["NUMBER"])

        if emojis:
            pipeline.append(self.regexes["EMOJI"])

        # any other word
        pipeline.append(self.regexes["WORD"])

        # EASTERN EMOTICONS - (^_^;)   (>_<)>  \(^o^)/
        if emoticons:
            pipeline.append(
                self.wrap_non_matching(self.regexes["EASTERN_EMOTICONS"]))

        # keep repeated puncts as one term
        # pipeline.append(r"")

        pipeline.append("(?:\S)")  # CATCH ALL remaining terms

        self.tok = re.compile(r"({})".format("|".join(pipeline)))
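
A short usage sketch, assuming the constructor above is ekphrasis' `SocialTokenizer`; its `tokenize` method applies the compiled `self.tok` regex and returns the list of matched tokens:

from ekphrasis.classes.tokenizer import SocialTokenizer

social_tokenizer = SocialTokenizer(lowercase=True)
print(social_tokenizer.tokenize("@user check https://example.com #nlp :-) $AAPL 5pm"))
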
Example #6
# from ekphrasis.classes.preprocessor import TextPreProcessor
# from ekphrasis.classes.tokenizer import SocialTokenizer

from src.data.preprocessing.dicts.emoticons import emoticons
from src.data.preprocessing.dicts.wrong_word import wrong_word
from src.data.preprocessing.dicts.abbreviations import abbr_word, acronyms
# import spacy_udpipe
# spacy_udpipe.download("it-postwita")
# nlp = spacy_udpipe.load("it-postwita")

# social_tokenizer = lambda text : [  token.text for token in nlp(text)]

from ekphrasis.classes.exmanager import ExManager

regexes = ExManager().get_compiled()
backoff = ['url', 'email', 'user', 'percent', 'money', 'phone', 'time', 'date']


# text_processor = TextPreProcessor(
#     normalize=['url', 'email', 'user', 'percent', 'money', 'phone', 'time', 'date'],
#     fix_html=True,  # fix HTML tokens
    
#     # select a tokenizer. You can use SocialTokenizer, or pass your own
#     # the tokenizer, should take as input a string and return a list of tokens
#     tokenizer= social_tokenizer, #SocialTokenizer(lowercase=False).tokenize,
    
#     # list of dictionaries, for replacing tokens extracted from the text,
#     # with other expressions. You can pass more than one dictionaries.
#     dicts=[emoticons]
# )
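
One way to revive the commented-out configuration above with the pieces that are actually imported in this file. This is a sketch, not the original author's final setup: `SocialTokenizer` stands in for the commented-out spacy_udpipe tokenizer, only `emoticons` is passed to `dicts` as in the original comment, and the other imported dictionaries (`wrong_word`, `abbr_word`, `acronyms`) are assumed to be token-replacement dicts that could be appended to that list:

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer

text_processor = TextPreProcessor(
    # normalize the same token types listed in `backoff` above
    normalize=backoff,
    tokenizer=SocialTokenizer(lowercase=False).tokenize,
    # list of dictionaries for replacing tokens extracted from the text;
    # wrong_word, abbr_word, acronyms could be appended here if they follow the same format
    dicts=[emoticons],
)
print(text_processor.pre_process_doc("some text with an :) emoticon"))
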