def show_web_paragraphs():
    splitter = MergeParagraphs(400)
    stop = NltkPlusStopWords(True)
    ranker = TopTfIdf(stop, 6)
    stop_words = stop.words

    corpus = TriviaQaWebDataset()
    train = corpus.get_train()
    points = flatten_iterable([(q, d) for d in q.all_docs] for q in train)
    np.random.shuffle(points)

    for q, d in points:
        q_words = {strip_accents_unicode(w.lower()) for w in q.question}
        q_words = {x for x in q_words if x not in stop_words}

        doc = corpus.evidence.get_document(d.doc_id)
        doc = splitter.split_annotated(doc, d.answer_spans)
        ranked = ranker.dists(q.question, doc)
        if len(ranked) < 2 or len(ranked[1][0].answer_spans) == 0:
            continue
        print(" ".join(q.question))
        print(q.answer.all_answers)
        for i, (para, dist) in enumerate(ranked[0:2]):
            text = flatten_iterable(para.text)
            print("Start=%d, Rank=%d, Dist=%.4f" % (para.start, i, dist))
            if len(para.answer_spans) == 0:
                continue
            # Mark answer spans in cyan and highlight non-stopword question words.
            for s, e in para.answer_spans:
                text[s] = bcolors.CYAN + text[s]
                text[e] = text[e] + bcolors.ENDC
            for wi, w in enumerate(text):
                if strip_accents_unicode(w.lower()) in q_words:
                    text[wi] = bcolors.ERROR + text[wi] + bcolors.ENDC
            print(" ".join(text))
        input()
Example #2
    def _preprocess_word(
        self,
        word: str,
        preprocessor_args: PreprocessorArgs = {
            'strip_accents': False,
            'lowercase': False,
            'preprocessor': None,
        }
    ) -> str:
        """pre-processes a word before it is searched in the model's vocabulary.

        Parameters
        ----------
        word : str
            Word to be preprocessed.
        preprocessor_args : PreprocessorArgs, optional
            Dictionary of arguments that specify how the word will be preprocessed,
            by default {'strip_accents': False, 'lowercase': False,
            'preprocessor': None}.

        Returns
        -------
        str
            The pre-processed word according to the given parameters.
        """

        preprocessor = preprocessor_args.get('preprocessor', None)
        if preprocessor and callable(preprocessor):
            word = preprocessor(word)

        else:
            if preprocessor_args.get('lowercase', False):
                word = word.lower()

            strip_accents = preprocessor_args.get('strip_accents', False)
            if strip_accents == True:
                word = strip_accents_unicode(word)
            elif strip_accents == 'ascii':
                word = strip_accents_ascii(word)
            elif strip_accents == 'unicode':
                word = strip_accents_unicode(word)

        if self.vocab_prefix is not None:
            word = self.vocab_prefix + word

        return word
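
A minimal standalone sketch of how the preprocessing options above combine; the function name and sample arguments here are illustrative, not part of the original class:

from sklearn.feature_extraction.text import strip_accents_ascii, strip_accents_unicode

def preprocess_word_sketch(word, preprocessor_args=None, vocab_prefix=None):
    # Re-implements the branch logic of _preprocess_word for illustration.
    args = preprocessor_args or {}
    custom = args.get('preprocessor', None)
    if callable(custom):
        word = custom(word)
    else:
        if args.get('lowercase', False):
            word = word.lower()
        strip_accents = args.get('strip_accents', False)
        if strip_accents is True or strip_accents == 'unicode':
            word = strip_accents_unicode(word)
        elif strip_accents == 'ascii':
            word = strip_accents_ascii(word)
    if vocab_prefix is not None:
        word = vocab_prefix + word
    return word

print(preprocess_word_sketch('Canción', {'lowercase': True, 'strip_accents': True}))  # cancion
print(preprocess_word_sketch('Canción', {'preprocessor': str.upper}))                 # CANCIÓN
print(preprocess_word_sketch('canción', vocab_prefix='/en/'))                         # /en/canción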
def preprocessor_tweet(s):

    tweet_p.set_options(tweet_p.OPT.EMOJI,
                        tweet_p.OPT.URL,
                        tweet_p.OPT.RESERVED,
                        tweet_p.OPT.SMILEY,
                        tweet_p.OPT.MENTION)
    # Keep these candidate handles as plain tokens instead of letting
    # tweet_p.clean() strip them as mentions.
    for handle in ('petrogustavo', 'sergio_fajardo', 'IvanDuque', 'AlvaroUribeVel',
                   'JuanManSantos', 'German_Vargas', 'ClaudiaLopez', 'DeLaCalleHum'):
        s = s.replace('@' + handle, handle)
    s = tweet_p.clean(s)
    # Drop laughter tokens ("jaja", "jajaja", "lol", "looool", ...).
    s = re.sub(r'\b(?:a*(?:ja)+h?|(?:l+o+)+l+)\b', ' ', s)
    # Replace any remaining non-word character with a space.
    s = re.sub(r'[^\w]', ' ', s)
    # s = re.sub(r'^https?:\/\/.*[\r\n]*', '', s)
    # s = re.sub(r'#', '', s)
    # s = re.sub(r'¡+', '', s)
    # s = re.sub(r':', '', s)
    # s = re.sub(r'!+', '', s)
    # s = re.sub(r'"', '', s)


    # s = re.sub(r'/[-?]/', '', s)
    # s = re.sub(r'¿+', '', s)
    # s = re.sub(r'@\w+', '', s)
    s = strip_accents_unicode(s.lower())
    s = tweet_p.clean(s)

    return s
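
A hedged usage sketch for the function above, assuming `tweet_p` is the `tweet-preprocessor` package imported under that alias and that `re` and `strip_accents_unicode` are imported as usual (none of these imports appear in the original snippet):

import re
import preprocessor as tweet_p  # assumption: the `tweet-preprocessor` package
from sklearn.feature_extraction.text import strip_accents_unicode

raw = "jajaja buenísimo lo de @petrogustavo 😂 https://t.co/xyz"
print(preprocessor_tweet(raw))
# The emoji and URL are removed by tweet_p.clean, the mention survives as the
# plain token "petrogustavo", the laughter token is dropped, and accents are
# stripped, leaving roughly: "buenisimo lo de petrogustavo"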
Example #4
def freq_weights(words, corpus='grozea', corpus_stats=None, strip=False):
    if strip:
        from sklearn.feature_extraction.text import strip_accents_unicode
        # strip_accents_unicode expects a string, so strip each word individually
        words = [strip_accents_unicode(w) for w in words]
    if corpus == 'grozea':
        corpus_stats = pd.read_csv(GROZEA,
                                   sep='[  ]',
                                   names=['freq', 'word'],
                                   index_col=1)
    elif corpus == 'ngrams':
        import bz2
        corpus_stats = pd.read_csv(bz2.BZ2File(NGRAMS),
                                   sep='[\t]',
                                   names=['word', 'freq'])
    w = np.ones(len(words))
    for k, word in enumerate(words):
        try:
            if corpus == 'ngrams':
                w[k] += corpus_stats['freq'][np.where(
                    corpus_stats['word'] == word)[0]]
            else:
                w[k] += corpus_stats.lookup([word], ['freq'])[0]
        except Exception:
            # word not found in the corpus stats; keep the default weight of 1
            pass
    return w / w.sum()
Example #5
    def _preprocess(self, doc):
        if self.input == "content":
            pass
        elif self.input == "filename":
            with open(doc,
                      "r",
                      encoding=self.encoding,
                      errors=self.decode_error) as fh:
                doc = fh.read()
        elif self.input == "file":
            doc = doc.read()

        if isinstance(doc, bytes):
            doc = doc.decode(self.encoding, self.decode_error)

        if self.strip_accents is not None:
            if self.strip_accents == "unicode":
                doc = strip_accents_unicode(doc)
            elif self.strip_accents == "ascii":
                doc = strip_accents_ascii(doc)
            else:
                raise ValueError('Invalid value for "strip_accents": %s' %
                                 self.strip_accents)

        if self.analyzer == "char" and self._compat_mode():
            doc = self._white_spaces.sub(" ", doc)

        return doc
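
For reference, a small sketch contrasting the two `strip_accents` modes this method dispatches on (sample values are illustrative):

from sklearn.feature_extraction.text import strip_accents_ascii, strip_accents_unicode

# "unicode" decomposes characters and drops combining marks, so letters from
# non-Latin scripts are normalized but kept; "ascii" simply discards anything
# that has no ASCII equivalent after decomposition.
print(strip_accents_unicode("Málaga"))   # Malaga
print(strip_accents_ascii("Málaga"))     # Malaga
print(strip_accents_unicode("\u0625"))   # "\u0627" (plain alef is kept)
print(strip_accents_ascii("\u0625"))     # "" (dropped entirely)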
Example #6
def test_strip_accents():
    # check some classical latin accentuated symbols
    a = 'àáâãäåçèéêë'
    expected = 'aaaaaaceeee'
    assert strip_accents_unicode(a) == expected

    a = 'ìíîïñòóôõöùúûüý'
    expected = 'iiiinooooouuuuy'
    assert strip_accents_unicode(a) == expected

    # check some arabic
    a = '\u0625'  # alef with a hamza below: إ
    expected = '\u0627'  # simple alef: ا
    assert strip_accents_unicode(a) == expected

    # mix letters accentuated and not
    a = "this is à test"
    expected = 'this is a test'
    assert strip_accents_unicode(a) == expected

    # strings that are already decomposed
    a = "o\u0308"  # o with diaresis
    expected = "o"
    assert strip_accents_unicode(a) == expected

    # combining marks by themselves
    a = "\u0300\u0301\u0302\u0303"
    expected = ""
    assert strip_accents_unicode(a) == expected

    # Multiple combining marks on one character
    a = "o\u0308\u0304"
    expected = "o"
    assert strip_accents_unicode(a) == expected
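
The behavior these tests pin down (already-decomposed strings, bare combining marks, stacked marks) matches a plain NFKD-decompose-then-drop-marks approach; a minimal sketch of an equivalent implementation (not sklearn's actual source):

import unicodedata

def strip_accents_unicode_sketch(s: str) -> str:
    # NFKD-decompose, then drop every combining mark. There is deliberately no
    # "already decomposed" shortcut: "o\u0308" must still lose its diaeresis.
    decomposed = unicodedata.normalize("NFKD", s)
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))

assert strip_accents_unicode_sketch("àáâãäåçèéêë") == "aaaaaaceeee"
assert strip_accents_unicode_sketch("o\u0308\u0304") == "o"
assert strip_accents_unicode_sketch("\u0300\u0301\u0302\u0303") == ""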
Example #7
def preprocessor(s):
    s = clean_html(s)
    s = clean_twitter(s)
    s = format_numbers(s)
    s = split_numbers(s)
    s = tokenize_numbers(s)
    s = strip_accents_unicode(s.lower())
    s = tokenize_short(s)
    return s
Example #8
def test_strip_accents():
    # check some classical latin accentuated symbols
    a = '\xe0\xe1\xe2\xe3\xe4\xe5\xe7\xe8\xe9\xea\xeb'
    expected = 'aaaaaaceeee'
    assert_equal(strip_accents_unicode(a), expected)

    a = '\xec\xed\xee\xef\xf1\xf2\xf3\xf4\xf5\xf6\xf9\xfa\xfb\xfc\xfd'
    expected = 'iiiinooooouuuuy'
    assert_equal(strip_accents_unicode(a), expected)

    # check some arabic
    a = '\u0625'  # alef with a hamza below
    expected = '\u0627'  # simple alef
    assert_equal(strip_accents_unicode(a), expected)

    # mix letters accentuated and not
    a = "this is \xe0 test"
    expected = 'this is a test'
    assert_equal(strip_accents_unicode(a), expected)
def test_strip_accents():
    # check some classical latin accentuated symbols
    a = u'\xe0\xe1\xe2\xe3\xe4\xe5\xe7\xe8\xe9\xea\xeb'
    expected = u'aaaaaaceeee'
    assert_equal(strip_accents_unicode(a), expected)

    a = u'\xec\xed\xee\xef\xf1\xf2\xf3\xf4\xf5\xf6\xf9\xfa\xfb\xfc\xfd'
    expected = u'iiiinooooouuuuy'
    assert_equal(strip_accents_unicode(a), expected)

    # check some arabic
    a = u'\u0625'  # alef with a hamza below
    expected = u'\u0627'  # simple alef
    assert_equal(strip_accents_unicode(a), expected)

    # mix letters accentuated and not
    a = u"this is \xe0 test"
    expected = u'this is a test'
    assert_equal(strip_accents_unicode(a), expected)
Example #10
def test_strip_accents():
    # check some classical latin accentuated symbols
    a = 'àáâãäåçèéêë'
    expected = 'aaaaaaceeee'
    assert strip_accents_unicode(a) == expected

    a = 'ìíîïñòóôõöùúûüý'
    expected = 'iiiinooooouuuuy'
    assert strip_accents_unicode(a) == expected

    # check some arabic
    a = '\u0625'  # alef with a hamza below: إ
    expected = '\u0627'  # simple alef: ا
    assert strip_accents_unicode(a) == expected

    # mix letters accentuated and not
    a = "this is à test"
    expected = 'this is a test'
    assert strip_accents_unicode(a) == expected
def test_strip_accents():
    # check some classical latin accentuated symbols
    a = 'àáâãäåçèéêë'
    expected = 'aaaaaaceeee'
    assert_equal(strip_accents_unicode(a), expected)

    a = 'ìíîïñòóôõöùúûüý'
    expected = 'iiiinooooouuuuy'
    assert_equal(strip_accents_unicode(a), expected)

    # check some arabic
    a = '\u0625'  # alef with a hamza below: إ
    expected = '\u0627'  # simple alef: ا
    assert_equal(strip_accents_unicode(a), expected)

    # mix letters accentuated and not
    a = "this is à test"
    expected = 'this is a test'
    assert_equal(strip_accents_unicode(a), expected)
def show_open_paragraphs(start: int, end: int):
    splitter = MergeParagraphs(400)
    stop = NltkPlusStopWords(True)
    ranker = ShallowOpenWebRanker(6)
    stop_words = stop.words

    print("Loading train")
    corpus = TriviaQaOpenDataset()
    train = corpus.get_dev()
    np.random.shuffle(train)

    for q in train:
        q_words = {strip_accents_unicode(w.lower()) for w in q.question}
        q_words = {x for x in q_words if x not in stop_words}

        para = []
        for d in q.all_docs:
            doc = corpus.evidence.get_document(d.doc_id)
            para += splitter.split_annotated(doc, d.answer_spans)

        ranked = ranker.prune(q.question, para)
        if len(ranked) < start:
            continue
        ranked = ranked[start:end]

        print(" ".join(q.question))
        print(q.answer.all_answers)
        # Iterate over the pruned slice; report the original rank (start + i).
        for i, para in enumerate(ranked):
            text = flatten_iterable(para.text)
            print("Start=%d, Rank=%d" % (para.start, start + i))
            if len(para.answer_spans) == 0:
                # print("No Answer!")
                continue
            # Mark answer spans in cyan and highlight non-stopword question words.
            for s, e in para.answer_spans:
                text[s] = bcolors.CYAN + text[s]
                text[e] = text[e] + bcolors.ENDC
            for wi, w in enumerate(text):
                if strip_accents_unicode(w.lower()) in q_words:
                    text[wi] = bcolors.ERROR + text[wi] + bcolors.ENDC
            print(" ".join(text))
        input()
Example #13
    def transform(self, X_df):

        X = np.array([
            ' '.join(clean_str(text.strip_accents_unicode(dd)))
            for dd in X_df.statement
        ])

        check_is_fitted(self, '_feat', 'The tfidf vector is not fitted')

        X = super(FeatureExtractor, self).transform(X)

        return X
Example #14
def rule_preprocessing(corpus,
                       stop_words=nltk.corpus.stopwords.words('portuguese'),
                       join_tokens=False,
                       reduce_inflection='stemming'):
    '''Rule-based preprocessing: pass a pd.Series as input and the predefined
    rules will clean up the text.'''

    # Strip leading/trailing whitespace and collapse repeated spaces
    corpus = [phrases.strip() for phrases in corpus]
    corpus = [re.sub(' +', ' ', phrases) for phrases in corpus]

    # Lowercase every word
    corpus = [phrases.lower() for phrases in corpus]

    # Remove punctuation
    corpus = [
        phrases.translate(str.maketrans('', '', punctuation))
        for phrases in corpus
    ]

    # Substitute entities
    corpus = entities_subs(corpus)

    # Correct misspelled words
    corpus = text_correction(corpus)

    # Strip accents
    corpus = [strip_accents_unicode(phrases) for phrases in corpus]

    # Tokenize and remove stop words
    corpus = tokenization(corpus, stop_words=stop_words)

    ###################### THIS PART NEEDS IMPROVEMENT #########################

    #     # Reduce word inflections
    #     if(reduce_inflection == 'stemming'):
    #         corpus =  [stemming(phrases) for phrases in corpus]

    #     elif(reduce_inflection == 'lemmatization'):
    #         corpus = lemmatization(corpus)

    #     else:
    #         pass

    ###################### THIS PART NEEDS IMPROVEMENT #########################

    # Join the tokenized words back into strings
    if join_tokens:
        corpus = [' '.join(phrases) for phrases in corpus]

    return corpus
def get_stopwords(language,
                  include_desc15_stopwords=True,
                  include_custom=True,
                  include_withoutdiacritics=True):
    if language in NLTK_LAN_TRANSLATOR:
        language = NLTK_LAN_TRANSLATOR[language]
    assert language in NLTK_LAN_TRANSLATOR.values(), \
        f"Cannot deal with language {language}"
    stopwords = set(nlstopwords.words(language))
    if include_desc15_stopwords and language == "english":
        stopwords |= load_desc15_stopwords()
    if include_custom and language == "english":
        stopwords |= set(get_setting("CUSTOM_STOPWORDS"))
    if include_withoutdiacritics:
        stopwords |= set(strip_accents_unicode(i) for i in stopwords)
    return tuple(stopwords)
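
The `include_withoutdiacritics` branch above augments the stopword set with accent-stripped variants; a self-contained sketch of the same idea (assumes the NLTK stopwords corpus is available locally):

from nltk.corpus import stopwords as nlstopwords
from sklearn.feature_extraction.text import strip_accents_unicode

# Keep both the accented and the accent-stripped form of every stopword, so
# filtering still works on text that was itself accent-stripped earlier.
pt = set(nlstopwords.words("portuguese"))
pt |= {strip_accents_unicode(w) for w in pt}
print("você" in pt, "voce" in pt)  # True True (assuming "você" is in the NLTK list)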
Example #16
    def fit(self, X_df, y=None):
        """Learn a vocabulary dictionary of all tokens in the raw documents.
        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields either str, unicode or file objects.
        Returns
        -------
        self
        """
        self._feat = np.array([
            ' '.join(clean_str(text.strip_accents_unicode(dd)))
            for dd in X_df.statement
        ])

        super(FeatureExtractor, self).fit(self._feat)

        return self
Example #17
    def clean_document(self, doc):
        # Remove HTML.
        cleaned_doc = re.sub('<[^<]+?>', ' ', doc)

        # Remove Unicode chars.
        cleaned_doc = cleaned_doc.encode('ascii', 'ignore')
        cleaned_doc = cleaned_doc.decode()

        # Remove digits and punctuation.
        cleaned_doc = strip_accents_unicode(cleaned_doc) \
                  .translate(str.maketrans(' ', ' ', string.digits)) \
                  .translate(str.maketrans(' ', ' ', string.punctuation))

        # Remove additional unwanted chars.
        for unwanted_char in self._additional_unwanted_chars:
            if unwanted_char in cleaned_doc:
                cleaned_doc = cleaned_doc.replace(unwanted_char, ' ')

        return cleaned_doc
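
A note on the `maketrans` calls above: mapping ' ' to ' ' is a no-op, so each table effectively just deletes the characters in its third argument. A shorter equivalent sketch that combines both deletions into one table:

import string

table = str.maketrans('', '', string.digits + string.punctuation)
print("Héllo, wörld 42!".translate(table))  # 'Héllo wörld ' (digits and punctuation deleted)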
Example #18
def freq_weights(words, corpus='grozea', corpus_stats=None, strip=False):
    if strip:
        from sklearn.feature_extraction.text import strip_accents_unicode
        # strip_accents_unicode expects a string, so strip each word individually
        words = [strip_accents_unicode(w) for w in words]
    if corpus == 'grozea':
        corpus_stats = pd.read_csv(GROZEA, sep='[  ]',
                                   names=['freq', 'word'], index_col=1)
    elif corpus == 'ngrams':
        import bz2
        corpus_stats = pd.read_csv(bz2.BZ2File(NGRAMS), sep='[\t]',
                                   names=['word', 'freq'])
    w = np.ones(len(words))
    for k, word in enumerate(words):
        try:
            if corpus == 'ngrams':
                w[k] += corpus_stats['freq'][np.where(corpus_stats['word'] == word)[0]]
            else:
                w[k] += corpus_stats.lookup([word], ['freq'])[0]
        except Exception:
            # word not found in the corpus stats; keep the default weight of 1
            pass
    return w / w.sum()
Example #19
    def _preprocess(self, string: str) -> str:
        return strip_accents_unicode(string)
Example #20
def uppercase(s):
    return strip_accents_unicode(s).upper()
def set_items_as_tokens_preprocessor(value: Union[str, Set, List]):
    return [strip_accents_unicode(str(i).lower()) for i in value] \
        if isinstance(value, set) or isinstance(value, list) \
        else [strip_accents_unicode(str(value).lower())]
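
A quick usage sketch for the preprocessor above (sample values are illustrative):

print(set_items_as_tokens_preprocessor({"São Paulo", "Köln"}))
# ['sao paulo', 'koln'] (order depends on set iteration)
print(set_items_as_tokens_preprocessor("Málaga"))
# ['malaga'] -- scalar values are wrapped in a single-item list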
Example #23
    def per_word_prepro(word):
        return strip_accents_unicode(word.lower())
Example #24
    def preprocess(path):
        text = Classifier.file_to_text(path)
        text = text.lower()
        text = strip_accents_unicode(text)
        return text
Example #25
def clustering_preprocessor(s):
    s = clean_html(s)
    s = clean_twitter(s)
    s = strip_accents_unicode(s.lower())
    s = s.strip()
    return s
Example #26
    def transform(cls, string):
        return strip_accents_unicode(string)