def validate_spelling(tree, filename, options):
    """
    Checks spelling of text within tags.

    The text of every node reachable from tree.getroot() is checked, except
    for nodes whose tag is not a string and for the tags 'a', 'code',
    'monospace' and 'pre'.
    If options['learn'], then unknown words will be added to the personal
    dictionary (VOCABULARY) instead of being reported.
    If options['debug'], the aspell configuration keys are printed first.

    Returns True when no unknown word was reported, False otherwise. When
    aspell raises AspellSpellerError the check is abandoned with a message
    and the result gathered so far is returned.
    """
    result = True
    try:
        speller = aspell.Speller(('lang', 'en'), ('personal-dir', '.'),
                                 ('personal', VOCABULARY))
    except Exception:  # some versions of aspell use a different path
        # 'except Exception' rather than a bare 'except' so that
        # KeyboardInterrupt / SystemExit are not swallowed by the fallback.
        speller = aspell.Speller(('lang', 'en'),
                                 ('personal-path', './' + VOCABULARY))
    if options['debug']:
        for key in speller.ConfigKeys():
            print(key[0] + ' ' + str(key[2]) + '\n')
    try:
        root = tree.getroot()
        for section in root.iter():
            if not section.text or not isinstance(section.tag, basestring):
                continue
            if section.tag in ('a', 'code', 'monospace', 'pre'):
                continue
            # Words are runs of letters with at most one inner apostrophe.
            for word in re.findall('([a-zA-Z]+\'?[a-zA-Z]+)', section.text):
                if speller.check(word):
                    continue
                if options['learn']:
                    speller.addtoPersonal(word)
                else:
                    result = False
                    print('[-] Misspelled (unknown) word {0} in {1}'.
                          format(word.encode('utf-8'), filename))
        if options['learn']:
            speller.saveAllwords()
    except aspell.AspellSpellerError as exception:
        print('[-] Spelling disabled ({0})'.format(exception))
    return result
def evaluate_aspell_builtin(input, lang_code):
    """
    Spell-check every token with aspell and serialise the predictions.

    ``input`` is handed to build_article_information(); each resulting
    article exposes ``sentences``, and each sentence is split by
    call_regex() into tokens and their spacing information. A token that
    aspell rejects is replaced by aspell's first suggestion; when that
    suggestion contains spaces, one entry is emitted per part.

    :param input: raw article data accepted by build_article_information
    :param lang_code: language code; only the part before the first "_" is
        passed to aspell as its language
    :return: a string starting with '{ "predictions": [' that holds the
        concatenated output of generate_token_information
    """
    import aspell

    input = build_article_information(input)

    result_content = "{ \"predictions\": [\n"
    # One speller serves every sentence. It is created on first use so that
    # an input without sentences never touches aspell.
    chkr = None

    for aidx, article in enumerate(input):
        for sidx, sentence in enumerate(article.sentences):
            if chkr is None:
                chkr = aspell.Speller('lang', lang_code.split("_")[0])
            tokens, spaces = call_regex(sentence)

            # Number of extra entries emitted so far because a token was
            # replaced by a multi-word suggestion.
            shift = 0

            for tidx, t in enumerate(tokens):
                # Escape the two tokens that would break the hand-built output.
                if t == "\"":
                    t = "\\\""
                elif t == "\\":
                    t = "\\\\"
                token = t
                suggestions = []
                multi_tokens = []
                try:
                    if not chkr.check(t):
                        sugg = chkr.suggest(t)
                        if sugg:
                            best = sugg[0].strip()
                            if " " in best:
                                multi_tokens = best.split(" ")
                                token = None
                            else:
                                token = best
                                suggestions = sugg[1:]
                except Exception:
                    # Best effort: keep the original token if aspell fails.
                    token = t

                has_next = tidx < (len(tokens) + shift - 1)
                if token is None:
                    # The word was split: emit one entry per part. The index
                    # includes ``shift`` so that it does not collide with
                    # entries produced by an earlier split in this sentence.
                    for idx, tt in enumerate(multi_tokens):
                        result_content += generate_token_information(
                            aidx, sidx, tidx + shift + idx, tt, suggestions,
                            spaces[tidx], has_next)
                    shift += len(multi_tokens) - 1
                else:
                    result_content += generate_token_information(
                        aidx, sidx, tidx + shift, token, suggestions,
                        spaces[tidx], has_next)
            if ((aidx < (len(input) - 1))
                    or (sidx < len(article.sentences) - 1)):
                # Separate sentences unless a comma is already in place.
                if result_content[-1] != "," and result_content[-2] != ",":
                    result_content += ",\n"

    result_content += "  ]\n}"

    return result_content
Exemple #3
0
    def set_line_manager(self, line_manager_):
        """Attach a line manager and build the action buttons.

        Nothing happens when ``line_manager_`` is falsy. Otherwise it is
        stored in ``self.lm``; when CAN_ASPELL is set, its spell checker is
        stored too and an aspell speller is created for the checker's
        language. A 'Next Error' button is always added, and a '+ to Dict'
        button is added only when a speller is available.

        :param line_manager_: object exposing ``spell_checker.lang``
        """
        if not line_manager_:
            return
        self.lm = line_manager_
        if CAN_ASPELL:
            self.spell_checker = line_manager_.spell_checker
            # Parenthesised so the line is valid on Python 2 and Python 3.
            print(line_manager_.spell_checker.lang)
            self.speller = aspell.Speller('lang',
                                          line_manager_.spell_checker.lang)

        next_error_button = wx.Button(self.panel,
                                      wx.ID_ANY,
                                      label='Next Error',
                                      size=(90, 30))
        self.Bind(wx.EVT_BUTTON, self.OnNextBadLine, next_error_button)
        next_error_button.SetDefault()
        next_error_button.SetSize(next_error_button.GetBestSize())
        self.current_text.Add(next_error_button, row=2, col=1)

        # ``speller`` is only assigned above when CAN_ASPELL is set, so read
        # it defensively instead of assuming the attribute exists.
        if getattr(self, 'speller', None):
            add_to_dictionary_button = wx.Button(self.panel,
                                                 wx.ID_ANY,
                                                 label='+ to Dict',
                                                 size=(90, 30))
            self.Bind(wx.EVT_BUTTON, self.OnAddToDict,
                      add_to_dictionary_button)
            add_to_dictionary_button.SetDefault()
            add_to_dictionary_button.SetSize(
                add_to_dictionary_button.GetBestSize())
            self.current_text.Add(add_to_dictionary_button, row=5, col=1)
        # Sizers for layout
        self.panel.SetSizerAndFit(self.current_text)
Exemple #4
0
    def spell_checker(self, url, words=None):
        """Spell checker.

        Opens ``url``, strips everything that looks like a tag from the page
        source, tokenizes the remaining text with nltk and checks the tokens
        against an English aspell speller.

        :param url: webpage url
        :param words: expected word list; these words are added to the
            speller session before checking. ``None`` or an empty list
            adds nothing.
        :return: list of misspelled words (duplicates removed, each encoded
            with ``encode('ascii', 'ignore')``)
        """
        self.open(url)
        page_content = re.sub('<.*?>', '', self.get_page_source())
        speller_obj = aspell.Speller("lang", "en")
        # ``None`` replaces the former mutable ``[]`` default.
        for word in words or ():
            speller_obj.addtoSession(word)

        # Tokens containing punctuation other than "_" are dropped, as are
        # tokens shorter than two characters.
        invalidchars = set(string.punctuation.replace("_", ""))
        cleantext = [
            word for word in nltk.word_tokenize(page_content)
            if len(word) >= 2 and not invalidchars.intersection(word)
        ]

        # The cheap regex test runs first so that only purely alphabetic
        # tokens are ever handed to aspell.
        alphabetic = re.compile('^[a-zA-Z ]*$')
        misspelled = list(
            set([
                word.encode('ascii', 'ignore') for word in cleantext
                if alphabetic.match(word) and not speller_obj.check(word)
            ]))

        return misspelled
def query_handler(query):
    """Return the lower-cased first aspell suggestion for each query term.

    ``query`` is split on whitespace. Terms for which aspell offers no
    suggestion are dropped from the result.

    :param query: query string
    :return: list of suggested terms, in query order
    """
    correcter = aspell.Speller('lang', 'en')
    correct_query = []
    for term in query.split():
        # Ask aspell once per term; the original asked twice.
        suggestions = correcter.suggest(term)
        if suggestions:
            correct_query.append(suggestions[0].lower())
    return correct_query
Exemple #6
0
def construct_globals():
    """Create the module-level regex source, lowercase alphabet, Romanian
    aspell speller, empty word set and detokenizer."""
    global MATCH_ALPHA_WORD, LOWER, speller, word_set, detokenizer
    MATCH_ALPHA_WORD = "[A-Za-zĂÂÎȘȚăâîșț]+"
    # 'a'..'z' followed by the Romanian lowercase diacritics.
    ascii_lower = list(map(chr, range(ord('a'), ord('z') + 1)))
    LOWER = ascii_lower + list("ăâșîț")
    speller = aspell.Speller('lang', 'ro')
    word_set = set()
    detokenizer = Detok()
Exemple #7
0
    def __init__(self, wordlist):
        """Initialise the base checker, compile ``wordlist`` and open an
        aspell speller whose master dictionary is ``self.dictfile``."""
        SpChecker.__init__(self, wordlist)
        self.compile_dict(wordlist)

        # Both keys receive the same path, relative to the working directory.
        master = ('master', './' + self.dictfile)
        master_path = ('master-path', './' + self.dictfile)
        self.sp = aspell.Speller(master, master_path)
Exemple #8
0
    def __init__(self, lang):
        """Create an aspell speller for ``lang`` that reads its data and
        dictionaries from a directory under the user's home."""
        # The Portuguese codes use a different directory than the rest.
        if lang == 'pt' or lang == 'pt_BR':
            location = '~/root/tmp/usr/lib64/aspell'
        else:
            location = '~/root/usr/lib/aspell'
        data_dir = os.path.expanduser(location)

        settings = (
            ('data-dir', data_dir),
            ('dict-dir', data_dir),
            ('size', '80'),
            ('sug-mode', 'fast'),
            ('encoding', 'utf-8'),
            ('lang', lang),
        )
        self.speller = aspell.Speller(*settings)
Exemple #9
0
def cleanUnigramCountFile(inputfile, outputfile, n, language,
                          filterByDictionary):
    '''Filter the unigram count file, and reduce the number of items in it.

    Reads a two-column table (word, count) from ``inputfile``, cleans the
    words, optionally keeps only words that aspell accepts, and writes the
    result to ``outputfile`` as a tab-separated file without header or index.
    Every word is written in lower case.

    inputfile -- path of the table to read (UTF-8)
    outputfile -- path of the table to write (UTF-8)
    n -- not used anywhere in this function body
    language -- aspell language code; 'pt' is looked up as 'pt-BR'
    filterByDictionary -- when truthy, rows failing the aspell check are
        dropped (for 'de' a row is kept if either check passes)
    '''

    df = pandas.read_table(inputfile, encoding='utf-8')
    df.columns = ['word', 'count']
    # NOTE(review): an earlier version presumably subsampled with ``n`` here;
    # no such step exists in the code below.

    # keep only rows whose word is a ``unicode`` instance (drops values that
    # pandas parsed as numbers)
    df_nonnumeric = df[[type(x) is unicode for x in df['word']]]

    # discard the '</s>' marker
    df_clean = df_nonnumeric[[x != u'</s>' for x in df_nonnumeric['word']]]

    # delete apostrophes (curly and straight) and digits
    df_clean['word'] = [re.sub(u"’|'|\d", '', x) for x in df_clean['word']]

    # drop rows whose word is now empty
    df_clean = df_clean[[x != '' and x is not None for x in df_clean['word']]]

    # cleanString is defined elsewhere in the project
    df_clean['word'] = [cleanString(x) for x in df_clean['word']]

    # check whether the lower-cased and the title-cased form are in the
    # dictionary
    aspellLang = language
    if aspellLang == 'pt':
        aspellLang = 'pt-BR'
    speller = aspell.Speller(('lang', aspellLang), ('encoding', 'utf-8'))
    # NOTE(review): the column names look swapped: 'aspell_upper' stores the
    # result for x.lower() and 'aspell_lower' the result for x.title().
    df_clean['aspell_upper'] = [
        speller.check(x.lower().encode('utf-8')) == 1 for x in df_clean['word']
    ]
    df_clean['aspell_lower'] = [
        speller.check(x.title().encode('utf-8')) == 1 for x in df_clean['word']
    ]

    # Lower-case every word whose 'aspell_lower' flag is set
    df_clean['word'][df_clean['aspell_lower']] = [
        x.lower() for x in df_clean['word'][df_clean['aspell_lower']]
    ]

    if filterByDictionary:
        # NOTE(review): DataFrame.ix was removed in pandas 1.0; this branch
        # needs .loc on current pandas.
        if language == 'de':
            # German nouns are capitalized, so either flag is accepted
            df_clean = df_clean.ix[np.logical_or(df_clean['aspell_lower'],
                                                 df_clean['aspell_upper'])]
        else:
            df_clean = df_clean.ix[df_clean['aspell_lower']]

    # The helper columns are not written out
    to_write = df_clean.drop(['aspell_lower', 'aspell_upper'], axis=1)
    to_write['word'] = [x.lower() for x in to_write['word']]
    to_write.to_csv(outputfile,
                    sep='\t',
                    index=False,
                    header=False,
                    encoding='utf-8')
    print('Wrote to file: ' + outputfile)
Exemple #10
0
def main():
    """Read standard input line by line, spell-correct each line and print
    the corrected text."""
    nlp = English()
    speller = aspell.Speller('lang', 'en')

    for line in sys.stdin:
        # line[:-1] drops the last character (normally the newline).
        corrected = correct(tokenize(line[:-1], nlp=nlp), speller=speller)
        print(untokenize(corrected))
Exemple #11
0
 def __init__(self, bot: UtilsBot):
     """Store the bot, create an English speller, take the API user
     collection from the bot's Mongo client and schedule the web site."""
     self.bot = bot
     self.speller = aspell.Speller('lang', 'en')
     self.api_db = self.bot.mongo.client.api.users
     app = web.Application()
     routes = [
         web.post('/speak', self.handle_speak_message),
         web.post('/disconnect', self.handle_disconnect),
         web.get('/check_access', self.check_access),
         web.get('/avatar_urls', self.avatar_urls),
         web.get('/regen_img/{data}', self.regen_image),
     ]
     app.add_routes(routes)
     # noinspection PyProtectedMember
     self.bot.loop.create_task(self.start_site(app))
Exemple #12
0
    def set_line_manager(self, line_manager_, strict):
        """Remember ``strict`` and, for a truthy line manager, build the
        column of action buttons.

        Buttons are stacked from row 2 downwards in column 1: 'Next Line',
        'Prev Line', then 'Join Lines' when ``strict`` is set and
        '+ to Dict' when ``self.speller`` is truthy.
        """
        self.strict = strict
        if not line_manager_:
            return

        self.lm = line_manager_
        if CAN_ASPELL:
            self.spell_checker = line_manager_.spell_checker
            self.speller = aspell.Speller('lang',
                                          line_manager_.spell_checker.lang)

        def place_button(caption, handler, row):
            # Create the button, bind it, make it the default and resize it
            # to its best size before placing it in column 1.
            button = wx.Button(self.panel,
                               wx.ID_ANY,
                               label=caption,
                               size=(90, 30))
            self.Bind(wx.EVT_BUTTON, handler, button)
            button.SetDefault()
            button.SetSize(button.GetBestSize())
            self.current_text.Add(button, row=row, col=1)

        button_row = 2
        place_button('Next Line', self.OnNextLine, button_row)

        button_row += 1
        place_button('Prev Line', self.OnPreviousLine, button_row)

        if self.strict:
            button_row += 1
            place_button('Join Lines', self.OnJoinLines, button_row)

        if self.speller:
            button_row += 1
            place_button('+ to Dict', self.OnAddToDict, button_row)

        # Sizers for layout
        self.panel.SetSizerAndFit(self.current_text)
Exemple #13
0
def initialize_speller():
    """
    Initialize and return speller module.

    The personal word list VOCABULARY is configured through the
    'personal-dir' / 'personal' keys first. If aspell rejects that
    configuration, the single 'personal-path' key is tried instead.

    Returns the aspell.Speller, or None when the fallback fails as well
    (the failure is logged as an error).
    """
    speller = None
    try:
        speller = aspell.Speller(('lang', 'en'), ('personal-dir', '.'),
                                 ('personal', VOCABULARY))
    except aspell.AspellConfigError as exception:  # some versions of aspell use a different path
        logging.debug(
            'Encountered exception when trying to intialize spelling: %s',
            exception)
        try:
            speller = aspell.Speller(('lang', 'en'),
                                     ('personal-path', './' + VOCABULARY))
        except (aspell.AspellConfigError,
                aspell.AspellSpellerError) as exception:
            # The fallback can be rejected as a configuration too; report
            # both kinds of failure instead of letting one escape.
            logging.error('Could not initialize speller: %s', exception)
    if speller:
        for key in speller.ConfigKeys():
            logging.debug('%s %s', key[0], key[2])
    return speller
    def __init__(self, tokenize=True, pretrained=False, device="cpu"):
        """Set up the checker with a default aspell speller as its model.

        Only ``tokenize`` is stored. The ``pretrained`` and ``device``
        arguments are accepted, but the attributes of the same name are set
        to None.
        """
        self.tokenize = tokenize
        self.pretrained = None
        self.device = None

        self.ckpt_path = None
        self.vocab_path = ""
        self.weights_path = ""
        self.model = None
        self.vocab = None

        speller = aspell.Speller()
        self.model = speller
        # Possible modes: ultra, fast, normal, slow, or bad-spellers
        speller.setConfigKey('sug-mode', "normal")
Exemple #15
0
def main(filename):
    """Print every stripped line of ``filename`` that aspell accepts."""
    speller = aspell.Speller('lang', LANG)
    buffersize = 2**16
    with open(filename) as f:
        # Read the file in batches of roughly ``buffersize`` bytes; an empty
        # batch marks the end of the file.
        for lines_buffer in iter(lambda: f.readlines(buffersize), []):
            for line in lines_buffer:
                word = line.strip()
                if speller.check(word):
                    print(word)
Exemple #16
0
def missing_targets_aspell(target_words):
    """Write each distinct target word that aspell rejects to
    '../spelling_mistakes/aspell_missing_targets.txt', one per line."""
    s = ap.Speller('lang', 'en')
    missing_words = list({word for word in target_words if not s.check(word)})

    with open('../spelling_mistakes/aspell_missing_targets.txt',
              'w') as aspell_misspelling:
        aspell_misspelling.writelines(word + '\n' for word in missing_words)
Exemple #17
0
def check_one(bot, word):
    """Tell the bot whether ``word`` is spelled correctly and, if it is not,
    offer at most five suggestions."""
    c = aspell.Speller('lang', 'en')
    if c.check(word):
        bot.say("I don't see any problems with that word.")
        return

    suggestions = c.suggest(word)[:5]
    if not suggestions:
        bot.say("That doesn't seem to be correct.")
        return

    quoted = ', '.join('"{0}"'.format(s) for s in suggestions)
    bot.say("That doesn't seem to be correct. Try {0}.".format(quoted))
Exemple #18
0
def check_multiple(bot, words):
    """Report through the bot which of ``words`` aspell rejects."""
    c = aspell.Speller('lang', 'en')
    mistakes = [word for word in words if not c.check(word)]

    if mistakes:
        quoted = ', '.join('"{0}"'.format(w) for w in mistakes)
        bot.say('The following word(s) seem to be misspelled: {0}'.format(
            quoted))
    else:
        bot.say("Nothing seems to be misspelled.")
def separate_waw(text):
    """Split a leading 'و' off words that aspell does not know.

    Every whitespace-separated word of ``text`` that starts with 'و' and is
    not in the Arabic aspell dictionary is rewritten as 'و ' followed by the
    rest of the word, and the change is printed. All other words are kept.

    :param text: text to process
    :return: the processed words, each followed by a single space
    """
    ar_spell = aspell.Speller('lang', 'ar')
    pieces = []
    # The original read an undefined name ``line`` here instead of ``text``.
    for word in text.split():
        if word.startswith('و') and word not in ar_spell:
            separated = 'و ' + word[1:]
            print('{} changed to {}'.format(word, separated))
            pieces.append(separated)
        else:
            pieces.append(word)
    # Each word keeps its trailing space, as before.
    return ''.join(piece + ' ' for piece in pieces)
Exemple #20
0
def bag_of_words_features(document, word_features, spell):
    """Build 'contains(word)' features for a tokenised document.

    When ``spell`` is truthy, every token that aspell rejects is replaced in
    place by aspell's first suggestion, provided aspell returned between 1
    and 14 suggestions and that first suggestion is in ``word_features``.

    :param document: list of tokens; modified in place when ``spell`` is set
    :param word_features: iterable of feature words
    :param spell: whether to spell-correct the document first
    :return: dict mapping 'contains(<word>)' to True/False for every word in
        ``word_features``
    """
    if spell:
        # The speller is only needed, and only created, for this branch.
        s = aspell.Speller('lang', 'en')
        for i, token in enumerate(document):
            if s.check(token):
                continue
            # Ask aspell once per token instead of up to four times.
            suggestions = s.suggest(token)
            if 0 < len(suggestions) < 15 and suggestions[0] in word_features:
                document[i] = suggestions[0]
    # Set membership keeps the feature loop linear in the document size.
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features
Exemple #21
0
def make_light_lexicon(infile, outfile):
    """Write the light stems of a lexicon that are new, known Arabic words.

    Every whitespace-separated word of ``infile`` is passed through
    light_stem_word(). A stem is kept when it differs from the word, is not
    itself in the lexicon and is found in the aspell dictionary. The kept
    stems are written to ``outfile`` sorted, one per line.

    :param infile: path of the UTF-8 lexicon to read
    :param outfile: path of the UTF-8 file to write
    """
    ar_spell = aspell.Speller(('dict-dir', './ar_dict/'), ('lang', 'ar'),
                              ('encoding', 'utf-8'))
    # Close the input file deterministically instead of leaking the handle.
    with open(infile, encoding='utf-8') as file_reader:
        lexicon = file_reader.read().split()
    print(infile, 'size', len(lexicon))
    # A set makes the "already in the lexicon" test constant time.
    known_words = set(lexicon)
    light_lexicon = set()
    for word in lexicon:
        light_word = light_stem_word(word)
        if light_word != word and light_word not in known_words \
                and light_word in ar_spell:
            light_lexicon.add(light_word)
    light_lexicon = sorted(light_lexicon)
    print('light size', len(light_lexicon))
    with open(outfile, mode='w', encoding='utf-8') as file_writer:
        file_writer.write('\n'.join(light_lexicon))
Exemple #22
0
 def __init__(self, opts, *args, **qdict):
     """Start the HTTP server and load the sentence tokenizer, speller,
     SENNA wrapper and classifier model used by its handlers.

     ``opts.model_dir`` names the directory the model and feature map are
     loaded from; the remaining arguments go to HTTPServer.
     """
     BaseHTTPServer.HTTPServer.__init__(self, *args, **qdict)
     import nltk
     self.tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
     self.speller = aspell.Speller('lang', 'en')
     # SENNAPATH overrides the default SENNA install location.
     if 'SENNAPATH' in os.environ:
         sennabin = unicode(os.environ['SENNAPATH'])
     else:
         sennabin = u"/data/tool/senna/"
     self.senna = src.tools.senna.SennaWrap(sennabin)
     self.funcs = {
         'split': self.split,
         'spell': self.spell,
         'pas': self.pas,
         "score": self.score,
     }
     model_dir = opts.model_dir
     self.model = SklearnClassifier()
     self.model.load_model(model_dir)
     self.model.load_fmap(model_dir)
Exemple #23
0
def save_command(bot, trigger):
    """Commit pending changes to the bot's personal dictionary.

    This action cannot be undone, except by manually editing the aspell
    dictionary file.

    If any pending word has leading or trailing whitespace and the command
    argument (``trigger.group(2)``) is not 'force', nothing is saved and the
    bot explains how to amend the list. Otherwise all pending words are
    added to the personal aspell dictionary, saved, and the pending list is
    emptied.
    """
    pending = bot.memory['spellcheck_pending_adds']
    for word in pending:
        if word != word.strip() and trigger.group(2) != 'force':
            # The closing quote after {0} was missing in the message.
            bot.say('"{0}" contains extra whitespace. Amend the pending list with '
                    '{1}scdel/{1}scadd, or force saving anyway with {1}scsave force.'
                    .format(word, bot.config.core.help_prefix))
            return
    c = aspell.Speller('lang', 'en')
    for word in pending:
        c.addtoPersonal(word)
    c.saveAllwords()
    bot.say('Saved {0} pending words to my word list.'
            .format(len(pending)))
    del pending[:]  # list.clear() is py3.3+ only :(
Exemple #24
0
def go():
    """Auto-correct the fixed test page word by word with the Greek speller
    and write the result next to it with an '-auto-corrected' suffix."""
    fn = "test-data/cyprob-page-000.txt"
    nm, ext = os.path.splitext(fn)
    new_fn = "{nm}-auto-corrected{ext}".format(nm=nm, ext=ext)

    greek_speller = aspell.Speller('lang', 'el')

    def corrected(word):
        return get_first_suggestion(word=word, speller=greek_speller)

    new_lines = []
    with open(fn, "r") as fin:
        for ln in fin:
            # Words are re-joined with single spaces; the line break is lost
            # in split() and restored by the final join below.
            new_lines.append(" ".join(corrected(word) for word in ln.split()))

    with open(new_fn, "w") as fout:
        fout.write("\n".join(new_lines))
Exemple #25
0
def filterByDictionary(merged, dictionary_filter, language):
    """Restrict ``merged`` to rows whose word passes an aspell check.

    ``dictionary_filter`` selects the rule:
    None returns ``merged`` unchanged;
    'lowerInDictionary' keeps rows whose lower-cased word is accepted
    (rejected for German with a ValueError);
    'inDictionary' keeps rows whose lower-cased or title-cased word is
    accepted.
    Any other value raises ValueError. The language 'pt' is looked up in
    aspell as 'pt-BR'. The helper columns 'aspell_lower' / 'aspell_upper'
    are added to ``merged`` along the way.
    """
    if dictionary_filter is None:
        print('Not limiting words to a spelling dictionary')
        return (merged)

    if dictionary_filter not in ('lowerInDictionary', 'inDictionary'):
        raise ValueError(
            'Dictionary specification not recognized. Choose None, "lowerInDictioanry" or "inDictionary"'
        )

    aspellLang = 'pt-BR' if language == 'pt' else language
    speller = aspell.Speller(('lang', aspellLang), ('encoding', 'utf-8'))

    def accepted(text):
        return speller.check(text.encode('utf-8')) == 1

    merged['aspell_lower'] = [accepted(x.lower()) for x in merged['word']]

    if dictionary_filter == 'lowerInDictionary':
        # German nouns are capitalized, so a lower-case-only check would
        # reject them.
        if aspellLang == 'de':
            raise ValueError(
                'German must use inDictionary filter setting because all nouns are capitalized'
            )
        print('Limiting to words with lower-case in spelling dictionary')
        return (merged[merged['aspell_lower']])

    print(
        'Limiting to words with lower-case or upper-case in spelling dictionary'
    )
    # The "upper-case" check uses the title-cased word.
    merged['aspell_upper'] = [accepted(x.title()) for x in merged['word']]
    return (merged[merged['aspell_upper'] | merged['aspell_lower']])
Exemple #26
0
def aspell_dict(input_dict, name):
    """
    Fills a template dictionary with the corrections suggested by Aspell.
    :param input_dict: input template dictionary, keyed by misspelling; it is
        copied shallowly, so the per-word entries are shared with the input
    :param name: name of dataset, used to build the returned file name
    :return: Updated dictionary with 'suggested' (first suggestion) and
        'candidates' (first ten suggestions), and the file name
        ``name + "_aspell_dict.txt"`` (nothing is written here)
    """
    file_name = name + "_aspell_dict.txt"
    spell = aspell.Speller('lang', 'en')
    working_dict = dict(input_dict)

    for counter, misspelling in enumerate(working_dict):
        # Progress report every 100 entries.
        if counter % 100 == 0:
            print("Aspell iteration:", counter)
        suggestions = spell.suggest(misspelling)
        if suggestions:
            entry = working_dict[misspelling]
            entry['suggested'] = suggestions[0]
            entry['candidates'] = suggestions[:10]

    return working_dict, file_name
Exemple #27
0
    def spellcheck(self, spellchecker, num, exclude_list=None):
        """Normalise ``self.tokens`` in place.

        Tokens in which more than half of the characters are digits are
        replaced by '_num' when ``num`` is True, unless they are listed in
        ``exclude_list``. With ``spellchecker == 'aspell'`` every other
        token that aspell rejects is replaced: first by a suggestion that
        differs only in case, otherwise by the suggestion that is more
        frequent in ``self.tokens_dist`` than the token itself. Replacements
        are recorded in ``self.replaced``. Empty tokens are left untouched.

        :param spellchecker: 'aspell' to enable spelling correction
        :param num: True to replace number-like tokens by '_num'
        :param exclude_list: optional comma-separated tokens that must never
            be replaced by '_num'
        """
        if exclude_list is not None:
            exclude_list = exclude_list.split(',')

        # Kept as an equality test so that only True (or 1) enables masking,
        # exactly as before.
        mask_numbers = (num == True)  # noqa: E712

        def mostly_digits(text):
            # An empty token has no digits; the guard avoids dividing by zero.
            if not text:
                return False
            return sum(char.isdigit() for char in text) / len(text) > 0.5

        def mask_number(review, i):
            if mask_numbers and (exclude_list is None
                                 or review[i] not in exclude_list):
                review[i] = '_num'  # encode to same token indicating a number is present

        if spellchecker == 'aspell':
            s = aspell.Speller('lang', 'en')
            self.replaced = {}
            for review in self.tokens:
                for i in range(len(review)):
                    if mostly_digits(review[i]):
                        # don't spell-check number-like tokens
                        mask_number(review, i)
                        continue
                    if not review[i]:
                        continue  # nothing to check in an empty token
                    token = review[i].encode('utf8')
                    if s.check(token) != False:  # noqa: E712
                        continue
                    suggestions = s.suggest(token)
                    lowered = token.decode('utf8').lower()
                    # Prefer a suggestion that only differs in case.
                    replace = next(
                        (word for word in suggestions
                         if word.lower() == lowered), None)
                    if replace is None:
                        total = self.tokens_dist.N()
                        # Slot 0 is the token itself, minus this instance.
                        frequencies = [self.tokens_dist.freq(lowered_token)
                                       * total - 1
                                       for lowered_token in
                                       (token.decode('utf8'),)]
                        frequencies.extend(
                            self.tokens_dist.freq(suggestion)
                            * self.tokens_dist.N()
                            for suggestion in suggestions)
                        most_frequent_index = np.argmax(frequencies)
                        if most_frequent_index == 0:
                            continue  # the token beats every suggestion
                        replace = suggestions[most_frequent_index - 1]
                    self.replaced[review[i]] = replace  # cache results
                    review[i] = replace
        elif mask_numbers:
            for review in self.tokens:
                for i in range(len(review)):
                    if mostly_digits(review[i]):
                        mask_number(review, i)
Exemple #28
0
class pluginClass(plugin):
    """Command plugin that spell-checks the message text with aspell."""

    # One English speller shared by all instances; created when the class
    # body is executed.
    s = aspell.Speller('lang', 'en')

    def gettype(self):
        """Return the plugin type, "command"."""
        return "command"

    def action(self, complete):
        """Answer with a confirmation, the suggestions, or a note that
        there are none, as a one-element list of PRIVMSG lines."""
        msg = complete.message().decode('utf-8')
        if self.s.check(msg):
            return ["PRIVMSG $C$ :" + ('"' + msg + '" is spelled correctly')]

        suggs = self.s.suggest(msg)
        if len(suggs) > 0:
            return ["PRIVMSG $C$ :" + ', '.join(suggs)]
        return ["PRIVMSG $C$ :" + 'No spelling suggestions.']

    def describe(self, complete):
        """Return the help text as PRIVMSG lines."""
        help_lines = []
        help_lines.append("PRIVMSG $C$ :I am the !spell module")
        help_lines.append("PRIVMSG $C$ :Usage:")
        help_lines.append("PRIVMSG $C$ :!spell [word]")
        return help_lines
Exemple #29
0
    'Arabic spell checker based on aspell. The input is a file and the output is errors with frequencies.'
)  # type: ArgumentParser

# Required input file; argparse opens it for reading as UTF-8.
parser.add_argument('-i',
                    '--infile',
                    type=argparse.FileType(mode='r', encoding='utf-8'),
                    help='input file.',
                    required=True)
# Required output file; argparse opens it for writing as UTF-8.
parser.add_argument('-o',
                    '--outfile',
                    type=argparse.FileType(mode='w', encoding='utf-8'),
                    help='output file.',
                    required=True)

if __name__ == '__main__':
    ar_spell = aspell.Speller('lang', 'ar')
    args = parser.parse_args()
    # The whole input is read at once and split on whitespace.
    words = args.infile.read().split()
    outfile = args.outfile
    # Maps every word rejected by aspell to its number of occurrences.
    errors_count = dict()
    for word in words:
        if not ar_spell.check(word):
            if word not in errors_count:
                errors_count[word] = 1
            else:
                errors_count[word] += 1
    # Most frequent errors first.
    sorted_freq = sorted(errors_count.items(),
                         key=operator.itemgetter(1),
                         reverse=True)
    # Header line of the tab-separated report.
    outfile.write('# word\tfreq\tsuggestion\n')
    # NOTE(review): the body of this loop is not part of this excerpt;
    # presumably it writes one report line per word - confirm against the
    # full file.
    for word, freq in sorted_freq:
import spacy
import re
import emoji
from nltk.tokenize import TweetTokenizer
from nltk import ngrams
import aspell

# Module-level Spanish resources, created once when the module is imported.
s = aspell.Speller('lang', 'es')  # aspell speller for Spanish
nlp = spacy.load('es_core_news_md')  # spaCy pipeline 'es_core_news_md'
tokenize = nlp.tokenizer  # tokenizer of the loaded pipeline


class Tweet:
    def __init__(self, text):
        self.raw_text = text
        self.clean_text = None
        self.doc = None


    def filter(self, *args):
        """ Apply optional filters 'retweets', 'emoticons', 'handles', 'urls', 'hashtags' and '*' """
        exps = []

        #this ones can be improved. some are not getting extracted
        if "retweets" in args:
            exps.append(re.compile("^RT ?(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9-_]+):"))
        if "emoticons" in args:
            exps.append("emoticons")
        if "flags" in args:
            exps.append(re.compile(u"[\U0001F1E6-\U0001F1FF]"))
        if "handles" in args: