Example #1
 def clean_string(cls, s):
     try:
         assert (isinstance(s, str) and len(s) > 0), 'The string is invalid'
         cucco = Cucco()
         normalizations = [
             'remove_extra_white_spaces',
             'remove_accent_marks',
             ('replace_symbols', {
                 'replacement': '_'
             }),
             ('replace_emojis', {
                 'replacement': ''
             }),
             ('replace_urls', {
                 'replacement': ''
             }),
         ]
         new_s = cucco.normalize(s, normalizations).lower().rstrip('-_')
         return True, new_s
     except AssertionError as e:
         logger.exception(str(e))
         return False, None
     except Exception as e:
         logger.exception('Error while cleaning string {}'.format(str(e)))
         return False, None
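For reference, a minimal standalone sketch of the same pipeline as clean_string above (assuming cucco is installed; the sample string is made up, and the normalization names follow the snippet, whose exact spelling can vary between cucco versions):

from cucco import Cucco

cucco = Cucco()
normalizations = [
    'remove_extra_white_spaces',
    'remove_accent_marks',
    ('replace_symbols', {'replacement': '_'}),
    ('replace_emojis', {'replacement': ''}),
    ('replace_urls', {'replacement': ''}),
]
# Normalize, lower-case and strip trailing '-' / '_' exactly as clean_string does
print(cucco.normalize('  Héllo Wörld! 😀 https://example.com ', normalizations).lower().rstrip('-_'))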
Example #2
def get_page_sentences(url):
    paragraphs_normalized = []
    normEsp = Cucco()
    norms = ['replace_punctuation', 'remove_extra_whitespaces']
    soup = BeautifulSoup(requests.get(url).text, "lxml")
    paragraphs = soup.find_all('p')
    stripped_paragraph = [tag.get_text().strip() for tag in paragraphs]
    for sentence in stripped_paragraph:
        paragraphs_normalized.append(normEsp.normalize(sentence, norms))
    return paragraphs_normalized
def normalize2(text):
    from cucco import Cucco
    from cucco.config import Config

    import re

    text = text.lower()
    cucco_config = Config()
    cucco_config.language = detect_language(text)

    if (cucco_config.language in ('es', 'en', 'fr')):
        cucco = Cucco(config=cucco_config)
        normalizations = [
            'remove_stop_words',
            # 'remove_accent_marks', # french accents
            ('replace_hyphens', {
                'replacement': ' '
            }),
            ('replace_symbols', {
                'replacement': ' '
            }),
            ('replace_punctuation', {
                'replacement': ' '
            }),
            'remove_extra_white_spaces',
        ]
    else:
        cucco = Cucco()
        normalizations = [
            # 'remove_stop_words', -- not an identified language
            # 'remove_accent_marks', # french accents
            ('replace_hyphens', {
                'replacement': ' '
            }),
            ('replace_symbols', {
                'replacement': ' '
            }),
            ('replace_punctuation', {
                'replacement': ' '
            }),
            'remove_extra_white_spaces',
        ]

    text = cucco.normalize(text, normalizations)

    text = re.sub('(\d+)%', '%', text)  # convert numbers percent to %
    text = re.sub('(\d+)', '#', text)  # convert numbers to #
    # text = re.sub('#(?P<word>([a-zA-Z])+)', '\g<word>', text) # remove numbers before and after strings'
    # text = re.sub('(?P<word>([a-zA-Z])+)#', '\g<word>', text) # remove numbers before and after strings'
    text = text.split()
    # text = [w for w in text if ( len(w) > 2 and len(w) < 20 ) ] # remove short and very long words
    text = ' '.join(text)

    return text
def get_page_sentences(url):
    stripped_sentences, final_sentences = ([] for i in range(2))
    soup = BeautifulSoup(requests.get(url).text, "lxml")
    list_paragraphs = soup.find_all('p')
    stripped_sentences = [tag.get_text().strip() for tag in list_paragraphs]
    norm_esp = Cucco()
    norms = ['replace_punctuation', 'remove_extra_whitespaces']
    for sentence in stripped_sentences:
        if len(sentence) > 0:
            final_sentences.append(norm_esp.normalize(sentence, norms))
    return final_sentences
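A minimal usage sketch for the function above (assuming requests, BeautifulSoup and Cucco are imported as in the snippet; the URL is only a placeholder):

sentences = get_page_sentences('https://example.com/article')
for sentence in sentences[:5]:
    print(sentence)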
Example #5
def normalize_df_headers(df):
    norm_esp = Cucco()

    headers = df.columns
    normalized_headers = []
    for header in headers:
        normalized_header = norm_esp.normalize(str.lower(header).replace(' ', '')).replace('–', '_')
        normalized_headers.append(
            normalized_header if len(normalized_header) != 0 else str.lower(header))

    df.columns = normalized_headers
    return df
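A minimal usage sketch (assuming pandas and cucco are installed and normalize_df_headers is defined as above; the column names are made up for illustration):

import pandas as pd

df = pd.DataFrame({'Código Postal': [28001, 41001], 'Nombre Completo': ['Ana', 'Luis']})
df = normalize_df_headers(df)
print(list(df.columns))  # column names lower-cased, spaces stripped and '–' replaced with '_'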
Example #6
def preprocessing(doc_set):
    print("Iniciando preprocesamiento...")
    tokenizer = RegexpTokenizer(r'\w+')

    es_stop = get_stop_words('es')
    es_stop.append(u'rt')
    es_stop.append(u'RT')
    es_stop.append(u'Rt')

    normEsp = Cucco(language='es')
    norms = [
        'remove_stop_words', 'replace_punctuation', 'remove_extra_whitespaces',
        'remove_accent_marks'
    ]

    stemmer = SnowballStemmer('spanish')
    #stemmer = Stemmer.Stemmer('spanish')

    out_set = []

    for doc in doc_set:
        doc = normEsp.normalize(doc, norms)
        raw = doc.lower()
        tokens = tokenizer.tokenize(raw)

        stooped_tokens = [i for i in tokens if not i in es_stop]

        #stemmer_words = stemmer.stemWords(stooped_tokens)

        stemmer_words = [parse(s, lemmata=True) for s in stooped_tokens]

        stemmer_words = [a[4] for a in [b.split("/") for b in stemmer_words]]

        #stemmer_words = []
        #for word in stooped_tokens:
        #	#stemmer_words.append(stemmer.stem(word))
        #	stemmer_words.append(word)

        out_set.append(stemmer_words)

    dictionary = corpora.Dictionary(
        out_set)  # dictionary mapping each word to an id
    corpus = [dictionary.doc2bow(doc) for doc in out_set]
    #print(corpus[0]) # prints the bag of words: tuples of the form (termID, termFrequency) for document 0
    #print(corpus[1])

    print("Done")

    return dictionary, corpus, out_set
def load_sentences(list_urls):
    paragraphs_normalized = []
    token_paragraphs = []
    normEsp = Cucco()
    norms = ['replace_punctuation', 'remove_extra_whitespaces']
    for i in range(len(list_urls)):
        url = list_urls[i]
        soup = BeautifulSoup(requests.get(url).text, "lxml")
        # headline = soup.find('h1').get_text()
        paragraphs = soup.find_all('p')
        stripped_paragraph = [tag.get_text().strip() for tag in paragraphs]
        for sentence in stripped_paragraph:
            normalized = normEsp.normalize(sentence, norms)
            paragraphs_normalized.append(normalized)
            token_paragraphs.append(word_tokenize(normalized))
    return token_paragraphs
Example #8
def Wordcloud(listaTexto, nombreArchivo, termino):
    # Normalize the text
    cucco = Cucco()
    text_tweets = ''
    for x in listaTexto:
        text_tweets += cucco.normalize(str(x)) + ' '

    stopwords_spa = stopwords.words('spanish')
        
    # Tokenize the tweets
    tokenized_words_tweets = word_tokenize(text_tweets)


    words_tweets = [word.lower() for word in tokenized_words_tweets if (len(word)>3 and word.lower() != termino.lower() and word.lower() not in termino.lower())]
    texto_words_tweets = [word for word in words_tweets if word not in stopwords_spa]
    
    
    '''#NER
        
    java_path = JavaPath()
    os.environ['JAVAHOME'] = java_path

    _model_filename = ModelPath()
    _path_to_jar = JarPath()
    st = StanfordNERTagger(model_filename=_model_filename, path_to_jar=_path_to_jar)

    classified_text_tweets= st.tag(texto_words_tweets)

    dict_tweets = dict()
    for element in classified_text_tweets:
        if(element[1]!='O'):
            if(element[0] in dict_tweets):
                dict_tweets[element[0]]+=1
            else:
                dict_tweets[element[0]]=1
    sorted(dict_tweets.items(),key=operator.itemgetter(1),reverse=True)[0:10]'''
    
  
    wordcloud = WordCloud(max_font_size=50, max_words=200, background_color="white").generate(str(texto_words_tweets).replace("'",""))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.savefig('media/'+nombreArchivo+'.jpg')
    plt.close()
Example #9
    def test_remove_stop_words(self, request):
        for after, before, _, kwargs, message in self._tests_generator(request.node.name):
            assert self._cucco.remove_stop_words(before, **kwargs) == after, message

        # Force language
        self._cucco = Cucco()
        for after, before, _, kwargs, message in self._tests_generator(request.node.name):
            kwargs['language'] = 'en'
            assert self._cucco.remove_stop_words(before, **kwargs) == after, message

        # Force invalid language
        self._cucco = Cucco()
        for after, before, _, kwargs, message in self._tests_generator(request.node.name):
            kwargs['language'] = 'invalid'
            assert self._cucco.remove_stop_words(before, **kwargs) == before, message

        # Test lazy load
        self._cucco = Cucco(lazy_load=True)
        for after, before, _, kwargs, message in self._tests_generator(request.node.name):
            kwargs['language'] = 'en'
            assert self._cucco.remove_stop_words(before, **kwargs) == after, message
Example #10
    def test_remove_stop_words(self, request):
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            assert self._cucco.remove_stop_words(before,
                                                 **kwargs) == after, message

        # Force language
        self._cucco = Cucco()
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            kwargs['language'] = 'en'
            assert self._cucco.remove_stop_words(before,
                                                 **kwargs) == after, message

        # Force invalid language
        self._cucco = Cucco()
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            kwargs['language'] = 'invalid'
            assert self._cucco.remove_stop_words(before,
                                                 **kwargs) == before, message

        # Test lazy load
        self._cucco = Cucco(lazy_load=True)
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            kwargs['language'] = 'en'
            assert self._cucco.remove_stop_words(before,
                                                 **kwargs) == after, message
Example #11
def load_data(filepath):
    captions = []
    tags = []
    zipped = ()

    cucco = Cucco()

    with open(filepath, 'r+') as file:
        doc = file.read()
    doc = json.loads(doc)
    for obj in doc:
        for post in doc[obj]:
            hashtags = doc[obj][post]['tags']
            if len(hashtags) > 0:
                capt = [
                    cucco.replace_emojis(
                        str(doc[obj][post]['caption']).lower(), '')
                ]
                tags += hashtags
                cap = capt * len(hashtags)
                captions += cap
    return captions, tags
def remove_stop_words(sentence):
    normEng = Cucco(language='en')
    normEsp = Cucco(language='es')
    norms = [
        'remove_stop_words', 'replace_punctuation', 'remove_extra_whitespaces'
    ]
    sent = normEng.normalize(sentence, norms)
    return normEsp.normalize(sent, norms)
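A standalone sketch of the same bilingual stop-word removal (assuming cucco is installed; the sample sentence mixes Spanish and English, and the normalization names mirror the snippet above and may differ slightly between cucco versions):

from cucco import Cucco

norm_eng = Cucco(language='en')
norm_esp = Cucco(language='es')
norms = ['remove_stop_words', 'replace_punctuation', 'remove_extra_whitespaces']

sample = 'Esta es una frase, and this is a sentence.'
# English pass first, then Spanish, as in remove_stop_words above
print(norm_esp.normalize(norm_eng.normalize(sample, norms), norms))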
Example #13
class EngPreprocessing:
    def __init__(self):
        self.norm_eng = Cucco(language='en')
        self.norm_ops = ['replace_punctuation', 'remove_extra_whitespaces']

    def process(self, sentences):
        result = []

        for sentence in sentences:
            print('preprocessing sentence: ', sentence)

            expand_contraction = self.__expand_contraction(sentence.lower())
            steamming = self.__steaming(expand_contraction)
            remove_number = self.__remove_number(steamming)
            normalising = self.__normalise(remove_number)

            result.append(normalising)

        return result

    def __expand_contraction(self, sentence):
        def replace(match):
            return eng_cList[match.group(0)]

        return eng_c_re.sub(replace, sentence)

    def __steaming(self, sentence):
        return ' '.join(
            lemEng.Sentence(lemEng.parse(sentence, lemmata=True)).lemmata)

    def __remove_number(self, sentence):
        """
        Removes all numbers from strings, both alphabetic (in English) and numeric. Intended to be
        part of a text normalisation process. If the number contains 'and' or commas, these are
        left behind on the assumption the text will be cleaned further to remove punctuation
        and stop-words.
        """

        query = sentence.replace('-', ' ').lower().split(' ')
        resultwords = [word.strip() for word in query if word not in eng_nums]
        noText = ' '.join(resultwords)

        noNums = re.sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", r" ",
                        noText)  # remove numeric number
        noNums = re.sub(r"\s\s+", " ", noNums)  # collapse repeated whitespace

        return noNums

    def __normalise(self, sentence):
        return self.norm_eng.normalize(text=sentence,
                                       normalizations=self.norm_ops)
Example #14
def text_processor(
    language='en',
    num=False,
    lower=False,
    level='token',
    normalize=True,
    max_len=None,
    min_len=0,
):
    normalizations = [('replace_emails', {
        'replacement': '<email>'
    }), ('replace_emojis', {
        'replacement': '<emoji>'
    }), ('replace_urls', {
        'replacement': '<url>'
    })]

    normalizer = None
    try:
        from normalizr import Normalizr
        normalizer = Normalizr().normalize
    except ImportError:
        try:
            from cucco import Cucco
            normalizer = Cucco().normalize
        except ImportError:
            warnings.warn(
                "Try installing normalizr or cucco for better normalization")

    NUM = re.compile('[0-9]+')

    def processor(sent):
        if normalize and normalizer is not None:
            sent = normalizer(sent, normalizations)
        if num:
            sent = NUM.sub('<num>', sent)  # number substitution
        if lower:
            sent = sent.lower()  # downcase

        sent = segmenter(sent, level=level)

        if len(sent) <= min_len:
            return None

        if max_len is not None and len(sent) > max_len:
            return sent[:max_len]
        else:
            return sent

    return processor
def clean_str_for_update(s):
    '''
    Convert all characters to lower case.
    Replace all spaces and special characters with '_'.
    Drop trailing '-' and '_' characters.
    Prepend an 'x' if the string starts with digits.
    '''
    try:
        assert (isinstance(s, str)
                and len(s) > 0), 'The column name is invalid'
        cucco = Cucco()
        normalizations = [
            'remove_extra_white_spaces', 'remove_accent_marks',
            ('replace_characters', {
                'characters': ['-', '*'],
                'replacement': ''
            })
        ]
        new_s = cucco.normalize(s, normalizations).strip('-_\n ')
        return new_s
    except Exception as e:
        logging.exception('Error while cleaning column name')
        raise
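A minimal usage sketch (assuming Cucco and logging are imported at module level as the function expects; the sample column name is made up):

import logging
logging.basicConfig(level=logging.INFO)

print(clean_str_for_update('  Número de *Teléfono*  '))  # accent marks removed, surrounding whitespace and '-'/'_' stripped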
def normalize(text):
    """
    normalizing the input -- it is supposed to remove stopwords (if not, nltk.corpus.stopwords.words()-- list of stopwords ) /
    markup cleaning / new lines / punctuation and " (string.punctuation() ) / 's / to_lowercase / / names? / numbers - change to . or other char (#) / steaming (normalizing) - nltk.stem.porter.PorterStemmer()
    remove if length <= 3 or >= 20 or contains http / roman numbers / bullet point (.) / text + number o al reves > text / ' / - /
    for normalizing only necessary stop words (overrepresented), low caps, numbers by # / punctuation
    REMOVE PREPOSITIONS N ALL LANGUAGES
    :param text:
    :return:
    """

    from cucco import Cucco
    import re

    cucco = Cucco()
    text = text.lower()
    text = cucco.normalize(text)

    text = re.sub('(\d+)%', '%', text)  # convert numbers percent to %
    text = re.sub('(\d+)', '#', text)  # convert numbers to #
    text = re.sub('(•|“|‘|”|’s|(.|)’)', "",
                  text)  # remove dot point for lists and “‘”
    # remove english possessive 'sop’s' and its
    # remove french l’ and d’ or ‘key
    #   Mr.Ging > mrging 'genderbasedviolence'  ascertain iraniansupported fuelefficient logisticsrelated
    # 19 471  no in 780 996 00 10pm a as 425 abovementioned avenirstrongp  genderrelated
    # in word_counts there are numbers and short words
    text = re.sub('#(?P<word>([a-zA-Z])+)', '\g<word>',
                  text)  # remove numbers before and after strings'
    text = re.sub('(?P<word>([a-zA-Z])+)#', '\g<word>',
                  text)  # remove numbers before and after strings'
    text = text.split()
    text = [w for w in text if (len(w) > 2 and len(w) < 20)
            ]  # remove short and very long words
    text = ' '.join(text)

    return text
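A short usage sketch (assuming the normalize function above is in scope and cucco is installed; the sample sentence is illustrative only):

sample = 'In 2019 revenue grew 15% • see the report for details.'
print(normalize(sample))  # lower-cases, normalizes, maps numbers to '#'/'%' and drops very short or very long tokens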
Example #17
#from Skylar.models import Flow
#from Skylar.utils.utils import format_message
from ai.model.keras_similarr import keras_similar
from ai.model.utils.feature_extractor import extract_features
from ai.model.utils.nltk_util import mark_negation
from ai.model.utils.qclassifier import Qclassifier
from ai.model.utils.spelling.spelling import Spelling
from ai.skysentiment import get_sentiment_values_2 as get_sentiment_values

from sematch.semantic.similarity import WordNetSimilarity
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.chunk import tree2conlltags
from cucco import Cucco

normalizr = Cucco()
normalizations = ['remove_extra_white_spaces', 'replace_punctuation',
                  'replace_symbols', 'remove_accent_marks']

class fmodel(object):
    def __init__(self):
        self.out = {}
        self.keras = keras_similar()
        self.classifier = Qclassifier()
        self.spell=Spelling()
        self.wn = WordNetSimilarity()
        self.en_nlp = spacy.load("en_core_web_md")
        self.stopwords_en=[]
        with open(os.path.join(os.path.dirname(os.path.realpath(__file__)),
            'utils', 'stopwords_en.txt')) as f:
Example #18
from cucco import Cucco

cucco = Cucco()
print(cucco.normalize('Who let the cucco out?'))
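The same call with an explicit list of normalizations (a sketch; the normalization names are those used throughout the examples above and their exact spelling can differ between cucco versions):

from cucco import Cucco

cucco = Cucco()
normalizations = [
    'remove_extra_white_spaces',
    ('replace_punctuation', {'replacement': ''}),
]
print(cucco.normalize('Who  let the cucco out?', normalizations))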
Example #19
number_rows = len(urls_list)
number_cols = 3
array_data = np.empty(shape=(number_rows, number_cols), dtype='object')

for i in range(number_rows):
    url_second = urls_list[i]
    array_data[i][0] = url_second
    soup = BeautifulSoup(requests.get(url_second).text, "lxml")
    p_tags = soup.find_all('p')
    array_data[i][1] = soup.find_all('h1')
    array_data[i][2] = [tag.get_text().strip() for tag in p_tags]
#save to csv if wanted
#pd.DataFrame(array_data).to_csv("HypothesisA.csv")

#Cleaning the text
normEsp = Cucco()
norms = ['replace_punctuation', 'remove_extra_whitespaces']
new_stopwords = set(
    stopwords.words('spanish')) - {'ella', 'ellas', 'una', 'unas', 'él'}
for i in range(number_rows):
    p_tags_text = [
        normEsp.normalize(sentence, norms) for sentence in array_data[i][2]
    ]
    espTokens = [word_tokenize(text) for text in p_tags_text]
    flatList = [word for sentList in espTokens for word in sentList]
    filtered = [word for word in flatList if word not in new_stopwords]
    array_data[i][2] = filtered

espFreq = FreqDist(word for word in array_data[0][2])
for word, frequency in espFreq.most_common(20):
    print(u'{}: {}'.format(word, frequency))
Example #20
import re
from cucco import Cucco

_CUCCO = Cucco()

NORMALIZATIONS = ['remove_extra_white_spaces']


def normalize(text: str) -> str:
    """
    Text normalization.

    >>> normalize("ООО  'ВЫМПЕЛКОМ' ")
    "ООО 'ВЫМПЕЛКОМ'"
    >>> normalize('ЗАО "ЮВЕЛИРНЫЙ завод')
    'ЗАО "ЮВЕЛИРНЫЙ завод'
    >>> normalize("ОАО 'ЁЛКИ и ПАЛКИ' ")
    "ОАО 'ЁЛКИ и ПАЛКИ'"
    >>> normalize('Столовая №1')
    'Столовая №1'

    :param text: some hand typed text
    :return: normalized text
    """
    return _CUCCO.normalize(text, NORMALIZATIONS)


def company_name_normalization(name: str) -> str:
    """
    Company name normalization
Example #21
 def setup_method(self):
     self._cucco = Cucco()
Example #22
class TestCucco(object):

    _cucco = None

    @staticmethod
    def _tests_generator(test):
        for test in TESTS_DATA['tests'][test[5:]]:
            yield (test['after'], test['before'],
                   test['characters'] if 'characters' in test else '',
                   test['kwargs'] if 'kwargs' in test else dict(),
                   test['message'])

    def setup_method(self):
        self._cucco = Cucco()

    def test_normalize(self, request):
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            assert self._cucco.normalize(before, **kwargs) == after, message

    def test_remove_accent_marks(self, request):
        for after, before, _, _, message in self._tests_generator(
                request.node.name):
            assert self._cucco.remove_accent_marks(before) == after, message

    def test_remove_stop_words(self, request):
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            assert self._cucco.remove_stop_words(before,
                                                 **kwargs) == after, message

        # Force language
        self._cucco = Cucco()
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            kwargs['language'] = 'en'
            assert self._cucco.remove_stop_words(before,
                                                 **kwargs) == after, message

        # Force invalid language
        self._cucco = Cucco()
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            kwargs['language'] = 'invalid'
            assert self._cucco.remove_stop_words(before,
                                                 **kwargs) == before, message

        # Test lazy load
        self._cucco = Cucco(lazy_load=True)
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            kwargs['language'] = 'en'
            assert self._cucco.remove_stop_words(before,
                                                 **kwargs) == after, message

    def test_replace_characters(self, request):
        for after, before, characters, kwargs, message in self._tests_generator(
                request.node.name):
            assert self._cucco.replace_characters(text=before,
                                                  characters=characters,
                                                  **kwargs) == after, message

    def test_replace_emails(self, request):
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            assert self._cucco.replace_emails(text=before,
                                              **kwargs) == after, message

    def test_replace_emojis(self, request):
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            assert self._cucco.replace_emojis(text=before,
                                              **kwargs) == after, message

    def test_remove_extra_whitespaces(self, request):
        for after, before, _, _, message in self._tests_generator(
                request.node.name):
            assert self._cucco.remove_extra_whitespaces(
                before) == after, message

    def test_replace_hyphens(self, request):
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            assert self._cucco.replace_hyphens(text=before,
                                               **kwargs) == after, message

    def test_replace_punctuation(self, request):
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            assert self._cucco.replace_punctuation(text=before,
                                                   **kwargs) == after, message

    def test_replace_symbols(self, request):
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            assert self._cucco.replace_symbols(text=before,
                                               **kwargs) == after, message

    def test_replace_urls(self, request):
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            assert self._cucco.replace_urls(text=before,
                                            **kwargs) == after, message
Example #23
def searchTweets(query):
    db = firestore.client()
    maxCount = 100
    max_id = -1
    count = 0

    obj = {
    query : {
        "regioes": {
        "Norte": {
            "tristeza": 0,
            "alegria": 0,
            "amor": 0,
            "raiva": 0
        },
        "Nordeste": {
            "tristeza": 0,
            "alegria": 0,
            "amor": 0,
            "raiva": 0
        },
        "Centro-Oeste": {
            "tristeza": 0,
            "alegria": 0,
            "amor": 0,
            "raiva": 0
        },
        "Sul": {
            "tristeza": 0,
            "alegria": 0,
            "amor": 0,
            "raiva": 0
        },
        "Sudeste": {
            "tristeza": 0,
            "alegria": 0,
            "amor": 0,
            "raiva": 0
        }
        }
    }
    }

    other_obj = {
    "regioes": {
        "Norte": {
        "tristeza": 0,
        "alegria": 0,
        "amor": 0,
        "raiva": 0,
        "count": 0
        },
        "Nordeste": {
        "tristeza": 0,
        "alegria": 0,
        "amor": 0,
        "raiva": 0,
        "count": 0
        },
        "Centro-Oeste": {
        "tristeza": 0,
        "alegria": 0,
        "amor": 0,
        "raiva": 0,
        "count": 0
        },
        "Sul": {
        "tristeza": 0,
        "alegria": 0,
        "amor": 0,
        "raiva": 0,
        "count": 0
        },
        "Sudeste": {
        "tristeza": 0,
        "alegria": 0,
        "amor": 0,
        "raiva": 0,
        "count": 0
        }
    }
    }

    users_ref = db.collection(query)
    docs = users_ref.stream()

    jsonT = ""
    for doc in docs:
        jsonT = doc.to_dict()["porcentagem"]
    if jsonT == "":
        while count < maxCount:
            if max_id <= 0:
                searched_tweets = api.search(q=query+" -filter:retweets", lang="pt-br", tweet_mode='extended', count=maxCount*5)
            else:
                searched_tweets = api.search(q=query+" -filter:retweets", lang="pt-br", tweet_mode='extended', count=maxCount*5, max_id=str(max_id - 1))
            if not searched_tweets:
                print("tem nada aq mona") 
                break
            else:
                for tweet in searched_tweets:
                    if (tweet.place is not None) and (count < maxCount):
                        text = json.dumps(tweet._json['full_text'], sort_keys=True, indent=4, ensure_ascii=False).encode('utf8').decode()
                        finalText = text.split(" ")
                        text = ""
                        for aux in finalText:
                            if not '@' in aux and not 'https://' in aux:
                                text += aux + " "

                        count += 1
                        text = Cucco().replace_emojis(text)
                        text = text.replace('"', '')
                        municipio = (json.dumps(tweet._json['place']['full_name'], sort_keys=True, indent=4, ensure_ascii=False).encode('utf8')).decode().split(",")[0].replace('"',"")
                        
                        try:
                            if municipio == 'Sao Paulo':
                                municipio = 'São Paulo'
                            regiao = regioes.getRegion(ufbr.get_cidade(municipio).codigo)
                            em = classify(text)
                            other_obj["regioes"][regiao][em] +=1
                            other_obj["regioes"][regiao]["count"] +=1
                            pass
                        except Exception as identifier:
                            count -= 1
                            pass

            max_id = searched_tweets[-1].id

        arr_reg = ["Norte", "Nordeste", "Centro-Oeste", "Sul", "Sudeste"]
        arr_emo = ["tristeza", "alegria", "amor", "raiva"]
        for i in arr_reg:
            for j in arr_emo:
                total = other_obj["regioes"][i]["count"]
                if total == 0:
                    obj[query]["regioes"][i][j] = 0
                else :
                    obj[query]["regioes"][i][j] = round((other_obj["regioes"][i][j] / total) * 100, 2)

        db.collection(query).add({ "tweets_classificados": json.dumps(other_obj), "porcentagem" : json.dumps(obj) })
        objs = [obj, other_obj]
        return objs

    else:
        users_ref = db.collection(query)
        docs = users_ref.stream()

        jsonP = ""
        for doc in docs:
            jsonP = doc.to_dict()["porcentagem"]
            jsonT = doc.to_dict()["tweets_classificados"]

            arr = [json.loads(jsonP), json.loads(jsonT)]

        return arr
Example #24
import glob
import errno
from cucco import Cucco
cucco = Cucco()

positive_files = './pos/*.txt'
negative_files = './neg/*.txt'
normalizations = [
    'remove_accent_marks', 'remove_extra_whitespaces', 'remove_stop_words',
    'replace_characters', 'replace_emails', 'replace_emojis',
    'replace_hyphens', 'replace_punctuation', 'replace_symbols', 'replace_urls'
]
iterations = 0
files = glob.glob(negative_files)
for name in files:
    with open(name, "r", encoding="utf-8") as f:
        text = f.read()
        words = text.split(" ")

        a = [word for word in words if '@' not in word]

        for i, word in enumerate(a):
            a[i] = a[i].replace("&amp;", "&")
            a[i] = a[i].replace("&lt;", "<")
            a[i] = a[i].replace("&gt;", ">")
            a[i] = a[i].replace("&quot;", '"')

        output = ' '.join(a)

        normalized_out = cucco.normalize(output, normalizations)
        print(normalized_out)
Example #25
def get_tasks(task_id):
    abc = []

    graph = facebook.GraphAPI(access_token=token, version=3.1)
    node = "/%s" % task_id

    video = graph.request(
        node + "/comments?fields=id,message,comment_count,"
        "reactions.type(LIKE).limit(0).summary(total_count).as(like),"
        "reactions.type(LOVE).limit(0).summary(total_count).as(love),"
        "reactions.type(WOW).limit(0).summary(total_count).as(wow),"
        "reactions.type(HAHA).limit(0).summary(total_count).as(haha),"
        "reactions.type(SAD).limit(0).summary(total_count).as(sad),"
        "reactions.type(ANGRY).limit(0).summary(total_count).as(angry)")
    # video = graph.request(node + '?fields='
    #                            'reactions.type(LIKE).limit(0).summary(total_count).as(like),'
    #                           'reactions.type(LOVE).limit(0).summary(total_count).as(love),'
    #                             'reactions.type(WOW).limit(0).summary(total_count).as(wow),'
    #                             'reactions.type(HAHA).limit(0).summary(total_count).as(haha),'
    #                             'reactions.type(SAD).limit(0).summary(total_count).as(sad),'
    #                             'reactions.type(ANGRY).limit(0).summary(total_count).as(angry)')

    # Wrap this block in a while loop so we can keep paginating requests until
    # finished.

    # Read the datasets
    joy_feel = read_dataset(get_full_path("dataset/cf/pp/filter/joy.txt"),
                            "joy")
    disgust_feel = read_dataset(
        get_full_path("dataset/cf/pp/filter/disgust.txt"), "disgust")
    sadness_feel = read_dataset(
        get_full_path("dataset/cf/pp/filter/sadness.txt"), "sadness")
    anger_feel = read_dataset(get_full_path("dataset/cf/pp/filter/anger.txt"),
                              "anger")
    fear_feel = read_dataset(get_full_path("dataset/cf/pp/filter/fear.txt"),
                             "fear")
    surprise_feel = read_dataset(
        get_full_path("dataset/cf/pp/filter/surpriseExtra.txt"), "surprise")

    # filter away words that are less than 3 letters to form the training data
    dataku = []
    for (words, sentiment) in (joy_feel + disgust_feel + sadness_feel +
                               anger_feel + fear_feel + surprise_feel):
        dataku.append((words.rstrip(), sentiment))

    lines = []
    labels = []
    for words, sentiment in dataku:
        html_parser = HTMLParser()

        lines.append(html_parser.unescape(words))
        labels.append(sentiment)

    headlines, labels = lines, labels

    pipeline = Pipeline([
        (
            "count_vectorizer",
            CountVectorizer(
                ngram_range=(2, 3),
                min_df=1,
                max_df=0.8,
                stop_words=frozenset([
                    "saya",
                    "sedang",
                    "lagi",
                    "adalah",
                    "di",
                    "dari",
                    "karena",
                    "dan",
                    "dengan",
                    "ke",
                    "yang",
                    "untuk",
                    "itu",
                    "orang",
                ]),
            ),
        ),
        ("tfidf_transformer", TfidfTransformer()),
        ("classifier", MultinomialNB()),
    ])
    pipeline.fit(headlines, labels)
    angerx = 0
    joyx = 0
    surprisex = 0
    sadnessx = 0
    fearx = 0
    disgustx = 0
    while True:
        try:
            # print("Get post comments data :")
            for each_video in video["data"]:
                if each_video["message"] != "":
                    # connect to database
                    init_tag()
                    html_parser = HTMLParser()
                    spell_check = jalanSpellCheck()
                    koreksi_slang = slangWordCorrect()
                    cucco = Cucco()

                    kata = cucco.replace_emojis(each_video["message"])

                    # Unescape HTML entities
                    kata = html_parser.unescape(each_video["message"])
                    kata = " ".join(kata.split())

                    # Remove emojis
                    kata = cucco.replace_emojis(kata)

                    normalizations = ["remove_extra_white_spaces"]

                    # Remove extra whitespace
                    kata = cucco.normalize(kata, normalizations)

                    kata = kata.replace("/", " ")

                    # Convert to lowercase
                    kata = kata.lower()

                    # Collapse characters repeated more than twice
                    kata = re.sub(r"(.)\1+", r"\1\1", kata)

                    # Collapse leftover doubled punctuation
                    kata = kata.replace("..", ".")
                    kata = kata.replace(",,", ",")
                    kata = kata.replace("!!", "!")
                    kata = kata.replace("??", "?")

                    # Add a space after each period
                    rx = r"\.(?=\S)"
                    kata = re.sub(rx, ". ", kata)

                    # Slang correction
                    kata = koreksi_slang.jalan(kata)

                    # Spellcheck error
                    # tampung_kata_1 = []
                    # tampung_1 = kata.split()
                    # for word in tampung_1:
                    #    tampung_kata_1.append(spell_check.correctSpelling(word))
                    # kata = " ".join(tampung_kata_1)
                    asdqwe = kata

                    # Check whether the comment ends with punctuation
                    if re.match(r".*[^.?!]$", kata) is not None:
                        kata = kata + " ."

                    resultx = do_tag(kata)
                    kata = " ".join(resultx)

                    # print(words)
                    # xxx = "".join([" " + i for i in words]).strip()

                    # kata = xxx

                    if kata != "":
                        linesz = []
                        linesz.append(kata)
                        words = []
                        for y in linesz:
                            lines = y.split()
                            for x in lines:
                                word = x.split("/")
                                chars_to_remove = set((
                                    ",",
                                    "IN",
                                    "CC",
                                    "SC",
                                    "CDO",
                                    "CDC",
                                    "CDP",
                                    "CDI",
                                    "DT",
                                    "MD",
                                    "OP",
                                    "CP",
                                    "SYM",
                                    ".",
                                ))
                                if word[1] not in chars_to_remove:
                                    words.append(word[0] + "_" + word[1])
                            resultx = "".join([" " + i for i in words]).strip()
                            # print(resultx)

                        cobaa = []
                        cobaa.append(resultx)
                        for x in pipeline.predict(cobaa):
                            hasilx = x
                        if hasilx == "anger":
                            angerx = angerx + 1
                        elif hasilx == "joy":
                            joyx = joyx + 1
                        elif hasilx == "sadness":
                            sadnessx = sadnessx + 1
                        elif hasilx == "fear":
                            fearx = fearx + 1
                        elif hasilx == "disgust":
                            disgustx = disgustx + 1
                        elif hasilx == "surprise":
                            surprisex = surprisex + 1

                        comments_data = {
                            "id":
                            each_video["id"],
                            "komen":
                            each_video["message"],
                            "asdqwe":
                            asdqwe,
                            "komen_edit":
                            resultx,
                            "prediksi":
                            hasilx,
                            "like_count":
                            each_video["like"]["summary"]["total_count"],
                            "love_count":
                            each_video["love"]["summary"]["total_count"],
                            "wow_count":
                            each_video["wow"]["summary"]["total_count"],
                            "haha_count":
                            each_video["haha"]["summary"]["total_count"],
                            "sad_count":
                            each_video["sad"]["summary"]["total_count"],
                            "angry_count":
                            each_video["angry"]["summary"]["total_count"],
                        }

                    abc.append(comments_data)
            # Attempt to make a request to the next page of data, if it exists.
            video = requests.get(video["paging"]["next"]).json()
        except KeyError:
            # When there are no more pages (['paging']['next']), break from the
            # loop and end the script.
            break

    ctrku = {
        "anger": angerx,
        "joy": joyx,
        "sadness": sadnessx,
        "fear": fearx,
        "surprise": surprisex,
        "disgust": disgustx,
    }

    # comments_data = {
    #    'id' : video['comment_count'],
    #    'video_like' : video['like']['summary']['total_count'],
    #    'video_love': video['love']['summary']['total_count'],
    #    'video_wow': video['wow']['summary']['total_count'],
    #    'video_haha': video['haha']['summary']['total_count'],
    #    'video_sad': video['sad']['summary']['total_count'],
    #    'video_angry': video['angry']['summary']['total_count']
    #    }
    # abc.append(comments_data)

    return jsonify({"tasks": abc}, {"ASD": ctrku})
Example #26
 def __init__(self):
     self.norm_eng = Cucco(language='en')
     self.norm_ops = ['replace_punctuation', 'remove_extra_whitespaces']
def remove_emoji(text):
    cucco = Cucco()
    return cucco.replace_emojis(text)
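A one-line usage sketch (assuming remove_emoji is defined as above and cucco is installed):

print(remove_emoji('Nice shot! 😀🔥'))  # emojis handled via cucco's replace_emojis default replacement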
Example #28
Write a Python program for searching and replacing a pattern.

sentence = "This is a phone number 672-123-456-9910"
pattern = r".*(phone).*?([\d-]+)"
match = re.match(pattern, sentence)
match.groups()
match.group()
match.group(0)
match.group(1)
match.group(2)
match.group(1,2)

Write a Python program for searching for and replacing emojis.
## This library checks for emojis and replaces them using regular expressions
from cucco import Cucco
cucco = Cucco()
a=cucco.replace_emojis(':) :)) :( FSDFSDDFSDfv')
print(a)
    
Write the syntax and a simple program for a regular expression pattern in Python.

import re
text = 'You can try to find an ant in this string'
pattern = 'an?\w'
for match in re.finditer(pattern, text):
    sStart = match.start()
    sEnd = match.end()
    sGroup = match.group()
    print('Match "{}" found at: [{},{}]'.format(sGroup, sStart,sEnd))

Example #29
class TestCucco(object):

    _cucco = None

    @staticmethod
    def _tests_generator(test):
        for test in TESTS_DATA['tests'][test[5:]]:
            yield (test['after'],
                   test['before'],
                   test['characters'] if 'characters' in test else '',
                   test['kwargs'] if 'kwargs' in test else dict(),
                   test['message'])

    def setup_method(self):
        self._cucco = Cucco()

    def test_normalize(self, request):
        for after, before, _, kwargs, message in self._tests_generator(request.node.name):
            assert self._cucco.normalize(before, **kwargs) == after, message

    def test_remove_accent_marks(self, request):
        for after, before, _, _, message in self._tests_generator(request.node.name):
            assert self._cucco.remove_accent_marks(before) == after, message

    def test_remove_stop_words(self, request):
        for after, before, _, kwargs, message in self._tests_generator(request.node.name):
            assert self._cucco.remove_stop_words(before, **kwargs) == after, message

        # Force language
        self._cucco = Cucco()
        for after, before, _, kwargs, message in self._tests_generator(request.node.name):
            kwargs['language'] = 'en'
            assert self._cucco.remove_stop_words(before, **kwargs) == after, message

        # Force invalid language
        self._cucco = Cucco()
        for after, before, _, kwargs, message in self._tests_generator(request.node.name):
            kwargs['language'] = 'invalid'
            assert self._cucco.remove_stop_words(before, **kwargs) == before, message

        # Test lazy load
        self._cucco = Cucco(lazy_load=True)
        for after, before, _, kwargs, message in self._tests_generator(request.node.name):
            kwargs['language'] = 'en'
            assert self._cucco.remove_stop_words(before, **kwargs) == after, message

    def test_replace_characters(self, request):
        for after, before, characters, kwargs, message in self._tests_generator(request.node.name):
            assert self._cucco.replace_characters(text=before, characters=characters, **kwargs) == after, message

    def test_replace_emails(self, request):
        for after, before, _, kwargs, message in self._tests_generator(request.node.name):
            assert self._cucco.replace_emails(text=before, **kwargs) == after, message

    def test_replace_emojis(self, request):
        for after, before, _, kwargs, message in self._tests_generator(request.node.name):
            assert self._cucco.replace_emojis(text=before, **kwargs) == after, message

    def test_remove_extra_white_spaces(self, request):
        for after, before, _, _, message in self._tests_generator(request.node.name):
            assert self._cucco.remove_extra_white_spaces(before) == after, message

    def test_replace_hyphens(self, request):
        for after, before, _,  kwargs, message in self._tests_generator(request.node.name):
            assert self._cucco.replace_hyphens(text=before, **kwargs) == after, message

    def test_replace_punctuation(self, request):
        for after, before, _, kwargs, message in self._tests_generator(request.node.name):
            assert self._cucco.replace_punctuation(text=before, **kwargs) == after, message

    def test_replace_symbols(self, request):
        for after, before, _, kwargs, message in self._tests_generator(request.node.name):
            assert self._cucco.replace_symbols(text=before, **kwargs) == after, message

    def test_replace_urls(self, request):
        for after, before, _, kwargs, message in self._tests_generator(request.node.name):
            assert self._cucco.replace_urls(text=before, **kwargs) == after, message
Example #30
def test(task_id):
    video = []
    conn = create_connection("datafacebook/Kompas/" + str(task_id) + ".db")
    cursor = conn.execute(
        "SELECT comment_id, comment_content, like_count, love_count, wow_count, haha_count, sad_count, angry_count from Comments"
    )
    for row in cursor:
        video.append({
            "id": row[0],
            "message": row[1],
            "like": row[2],
            "love": row[3],
            "wow": row[4],
            "haha": row[5],
            "sad": row[6],
            "angry": row[7],
        })
    conn.close()

    abc = []

    joy_feel = read_dataset(get_full_path("dataset/cf/pp/filter/joy.txt"),
                            "joy")
    disgust_feel = read_dataset(
        get_full_path("dataset/cf/pp/filter/disgust.txt"), "disgust")
    sadness_feel = read_dataset(
        get_full_path("dataset/cf/pp/filter/sadness.txt"), "sadness")
    anger_feel = read_dataset(get_full_path("dataset/cf/pp/filter/anger.txt"),
                              "anger")
    fear_feel = read_dataset(get_full_path("dataset/cf/pp/filter/fear.txt"),
                             "fear")
    surprise_feel = read_dataset(
        get_full_path("dataset/cf/pp/filter/surpriseExtra.txt"), "surprise")

    dataku = []
    for (words, sentiment) in (joy_feel + disgust_feel + sadness_feel +
                               anger_feel + fear_feel + surprise_feel):
        dataku.append((words.rstrip(), sentiment))

    lines = []
    labels = []
    for words, sentiment in dataku:
        html_parser = HTMLParser()

        lines.append(html_parser.unescape(words))
        labels.append(sentiment)

    headlines, labels = lines, labels

    pipeline = Pipeline([
        (
            "count_vectorizer",
            CountVectorizer(
                ngram_range=(2, 3),
                min_df=1,
                max_df=0.8,
                stop_words=frozenset([
                    "saya",
                    "sedang",
                    "lagi",
                    "adalah",
                    "di",
                    "dari",
                    "karena",
                    "dan",
                    "dengan",
                    "ke",
                    "yang",
                    "untuk",
                    "itu",
                    "orang",
                ]),
            ),
        ),
        ("tfidf_transformer", TfidfTransformer()),
        ("classifier", MultinomialNB()),
    ])
    pipeline.fit(headlines, labels)
    angerx = 0
    joyx = 0
    surprisex = 0
    sadnessx = 0
    fearx = 0
    disgustx = 0

    for each_video in video:
        if each_video["message"] != "":
            # connect to database
            init_tag()
            html_parser = HTMLParser()
            spell_check = jalanSpellCheck()
            koreksi_slang = slangWordCorrect()
            cucco = Cucco()

            kata = cucco.replace_emojis(each_video["message"])

            # Unescape HTML entities
            kata = html_parser.unescape(each_video["message"])
            kata = " ".join(kata.split())

            # Remove emojis
            kata = cucco.replace_emojis(kata)

            normalizations = ["remove_extra_white_spaces"]

            # Remove extra whitespace
            kata = cucco.normalize(kata, normalizations)

            kata = kata.replace("/", " ")

            # Convert to lowercase
            kata = kata.lower()

            # Collapse characters repeated more than twice
            kata = re.sub(r"(.)\1+", r"\1\1", kata)

            # Collapse leftover doubled punctuation
            kata = kata.replace("..", ".")
            kata = kata.replace(",,", ",")
            kata = kata.replace("!!", "!")
            kata = kata.replace("??", "?")

            # Add a space after each period
            rx = r"\.(?=\S)"
            kata = re.sub(rx, ". ", kata)

            # Slang correction
            kata = koreksi_slang.jalan(kata)

            # Spellcheck error
            # tampung_kata_1 = []
            # tampung_1 = kata.split()
            # for word in tampung_1:
            #    tampung_kata_1.append(spell_check.correctSpelling(word))
            # kata = " ".join(tampung_kata_1)
            asdqwe = kata

            # Check whether the comment ends with punctuation
            if re.match(r".*[^.?!]$", kata) is not None:
                kata = kata + " ."

            resultx = do_tag(kata)
            kata = " ".join(resultx)

            if kata != "":
                linesz = []
                linesz.append(kata)
                words = []

                for y in linesz:
                    lines = y.split()
                    for x in lines:
                        word = x.split("/")
                        chars_to_remove = set((
                            ",",
                            "IN",
                            "CC",
                            "SC",
                            "CDO",
                            "CDC",
                            "CDP",
                            "CDI",
                            "DT",
                            "MD",
                            "OP",
                            "CP",
                            "SYM",
                            ".",
                        ))
                        if word[1] not in chars_to_remove:
                            words.append(word[0] + "_" + word[1])
                    resultx = "".join([" " + i for i in words]).strip()

                cobaa = []
                cobaa.append(resultx)

                for x in pipeline.predict(cobaa):
                    hasilx = x
                if hasilx == "anger":
                    angerx = angerx + 1
                elif hasilx == "joy":
                    joyx = joyx + 1
                elif hasilx == "sadness":
                    sadnessx = sadnessx + 1
                elif hasilx == "fear":
                    fearx = fearx + 1
                elif hasilx == "disgust":
                    disgustx = disgustx + 1
                elif hasilx == "surprise":
                    surprisex = surprisex + 1

                comments_data = {
                    "id": each_video["id"],
                    "komen": each_video["message"],
                    "asdqwe": asdqwe,
                    "komen_edit": resultx,
                    "prediksi": hasilx,
                    "like_count": each_video["like"],
                    "love_count": each_video["love"],
                    "wow_count": each_video["wow"],
                    "haha_count": each_video["haha"],
                    "sad_count": each_video["sad"],
                    "angry_count": each_video["angry"],
                }

            abc.append(comments_data)

    ctrku = {
        "anger": angerx,
        "joy": joyx,
        "sadness": sadnessx,
        "fear": fearx,
        "surprise": surprisex,
        "disgust": disgustx,
    }

    return jsonify({"tasks": abc}, {"ASD": ctrku})
Example #31
# -*- coding: utf-8 -*-

import sys
import enchant
import os
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
#from pattern.en import parse
from cucco import Cucco

# The first line must contain the name of the file or page

#for line in sys.stdin:

tokenizer = RegexpTokenizer(r'\w+')
cuccoEn = Cucco()
norms = [
    'remove_stop_words', 'replace_punctuation', 'remove_extra_whitespaces',
    'remove_accent_marks'
]
dic = enchant.Dict("en_US")
en_stop = get_stop_words('en')

fileName = os.environ["map_input_file"]
#fileName = "Test.cpp";

raw = None
tokens = []
final_tokens = []
parse_tokens = []
Example #32
def dialogue_iterator(filename, test=False, raw=False):
    """
    Iterate dialogues in the specified file.

    One may specify whether to read a test dataset (without evaluation scores
    and user types) and to return raw dialogue phrases (without
    postprocessing).
    """

    cu = Cucco()
    normalizations = [
        'remove_accent_marks', ('replace_emojis', {
            'replacement': ' '
        }), ('replace_hyphens', {
            'replacement': ''
        }), ('replace_punctuation', {
            'replacement': ''
        }), ('replace_urls', {
            'replacement': ' '
        }), 'remove_extra_whitespaces'
    ]

    with open(filename) as input_file:
        for r in json.load(input_file):
            if not raw:
                r['context'] = cu.normalize(r['context'])
            # form the thread list
            th_list = []
            for i in r['thread']:
                if not raw:
                    i['text'] = i['text'].rstrip()
                    if not i['text']:
                        continue
                    i['text'] = cu.normalize(i['text'], normalizations)
                    i['text'] = i['text'].lower()
                th_list.append(Thread(i['text'], i['userId'], i.get('time')))

            # if we're dealing with the test dataset, do not return user types
            # and evaluation scores
            if test:
                d = Dialogue(r['context'], r['dialogId'], None, th_list, None)
            else:
                # form the evaluation dictionary
                ev_dict = {}
                for i in r['evaluation']:
                    if i['userId'] == 'Alice':
                        ev_dict['Alice'] = i['quality']
                    elif i['userId'] == 'Bob':
                        ev_dict['Bob'] = i['quality']
                    else:
                        raise ValueError('incorrect user ID')
                # form the user list
                us_dict = {}
                for i in r['users']:
                    if i['id'] == 'Alice':
                        us_dict['Alice'] = i['userType']
                    elif i['id'] == 'Bob':
                        us_dict['Bob'] = i['userType']
                    else:
                        raise ValueError('incorrect user ID')
                d = Dialogue(r['context'], r['dialogId'],
                             Evaluation(ev_dict['Alice'], ev_dict['Bob']),
                             th_list, User(us_dict['Alice'], us_dict['Bob']))
            yield concat_phrases(d)
Example #33
 def setup_method(self):
     self._cucco = Cucco()