Beispiel #1
0
def preprocess(tokenized):
    """Turn a token sequence into a normalized, lowercase string.

    Steps, in order:
    - detokenize ``tokenized`` with ``TreebankWordDetokenizer``
    - surround each of the characters ``. , ! ? ( )`` with one space on
      both sides, so punctuation is separated from neighboring words
    - collapse every run of two or more whitespace characters into a
      single space
    - lowercase everything

    A punctuation mark at the very start or end of the text leaves a
    single leading/trailing space in the result; it is not stripped.

    Args:
        tokenized: Tokens to join; passed unchanged to ``detokenize``
            (presumably a list of strings).

    Returns:
        The normalized string.
    """
    text = TreebankWordDetokenizer().detokenize(tokenized)
    # Raw strings keep the regex escapes (\1, \s) away from Python's own
    # string-escape processing; a plain '\s' is an invalid escape sequence.
    text = re.sub(r'([.,!?()])', r' \1 ', text)
    text = re.sub(r'\s{2,}', ' ', text)
    return text.lower()
Beispiel #2
0
# Group the message texts by sender. ``messages``, ``liste_message`` and
# ``liste_sender`` are defined outside this section.
for all_messages in messages:
    sender = all_messages.get('sender_name')
    message = all_messages.get('content')
    if message is None:
        # NOTE(review): entries without 'content' are stored as the literal
        # '1', which appears to be counted like any other word further
        # down -- confirm this placeholder is intended.
        message = '1'
    liste_sender.append(sender)
    liste_message.setdefault(sender, []).append(message)

# my group conversation was in french, hence the fr_core_news
nlp = spacy.load('fr_core_news_md')

# In French, not all the apostrophe forms were tokenized correctly, so some
# extra stop words are added. The list does not depend on the sender, so it
# is registered on the vocabulary once, before the per-sender loop.
customize_stop_words = [
    "d’", "qu’", "d'", "qu'", "j'", "j’", "c'", "c’", "l'", "l’", "t'",
    "t’", "-ce", "n'", "n’", "lol", "pis", 'pi', 'ca'
]
for w in customize_stop_words:
    nlp.vocab[w].is_stop = True

# For each sender, count the words left after dropping punctuation and
# stop words, and keep the ten most frequent ones.
for x in liste_message:
    texte_tokenise = liste_message[x]
    # The detokenizer is given the sender's list of messages, so it yields
    # one text for that sender, which is lowercased before parsing.
    texte_detok = TreebankWordDetokenizer().detokenize(texte_tokenise)
    doc = nlp(texte_detok.lower())

    liste_tokens = [
        token.text for token in doc
        if not (token.is_punct or token.is_stop)
    ]
    word_freq = Counter(liste_tokens)
    common_words = word_freq.most_common(10)