Example 1
def preprocess_siamese(doc):

    # pre-process data
    doc = tp.normalize.normalize_unicode(doc)
    doc = tp.normalize_whitespace(doc)
    doc = tp.normalize_quotation_marks(doc)
    doc = tp.replace_emails(doc, replace_with="<EMAIL>")
    doc = tp.replace_urls(doc, replace_with="<URL>")
    doc = tp.replace_hashtags(doc, replace_with="<HASHTAG>")
    doc = tp.replace_emojis(doc, replace_with="<EMOJI>")
    doc = tp.replace_phone_numbers(doc, replace_with="<PHONE>")

    # apply spaCy to tokenize doc
    doc = nlp_token(doc)

    # build new sentences for pre-processed doc
    doc_new = []
    for sent in doc.sents:
        sent_new = ""
        for token in sent:
            token = token.text
            token = token.replace("\n", "")
            token = token.replace("\t", "")
            token = token.strip()
            sent_new += token + " "

        doc_new.append(sent_new[:-1])

    return doc_new
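In this snippet, `tp` and `nlp_token` come from elsewhere in the project; below is a minimal sketch of the assumed context (textacy ~0.10, where `preprocessing` exposes flat functions alongside the `normalize` submodule, and a spaCy pipeline used only for sentence splitting), plus a sample call:

# Sketch of the assumed context for preprocess_siamese (not part of the original project):
# textacy ~0.10 flat preprocessing API and a small English spaCy pipeline.
import spacy
import textacy.preprocessing as tp

nlp_token = spacy.load("en_core_web_sm")

raw = "Email me at jane@example.com! See https://example.org #textacy"
for sentence in preprocess_siamese(raw):
    print(sentence)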
Example 2
def clean(text: str) -> str:
    txt = text.strip()
    txt = preprocessing.normalize_unicode(txt, form="NFKC")
    # Collapse whitespaces
    txt = preprocessing.normalize_whitespace(txt)
    # Collapse repeated newlines
    txt = preprocessing.normalize_repeating_chars(txt, chars="\n", maxn=1)
    # fix hyphen-ated words
    txt = preprocessing.normalize_hyphenated_words(txt)
    txt = preprocessing.normalize_quotation_marks(txt)
    txt = preprocessing.replace_urls(txt, replace_with="")
    txt = preprocessing.replace_phone_numbers(txt, replace_with="")
    txt = preprocessing.replace_emails(txt, replace_with="")
    txt = preprocessing.replace_user_handles(txt, replace_with="")
    txt = preprocessing.normalize_repeating_chars(txt, chars=".,;:-_ ", maxn=1)
    txt = re.sub("\n ", " ", txt)
    txt = re.sub(" \n", " ", txt)
    txt = re.sub("\n", " ", txt)
    txt = re.sub(" . ", " ", txt)
    txt = re.sub(r"\.([A-Z])", r". \1", txt)
    txt = re.sub(r"\. ([A-Z])", r".\n\1", txt)
    # fix for some common abbreviations
    for abv in ['Dr', 'St', 'Mr', 'Ms', 'mt', 'Inst', 'inc', 'est']:
        txt = re.sub(abv + "\.\n", abv + ". ", txt)
    return txt
Example 3
def html2text(html):
    """HTML to text converter

    Args:
        html (str): html

    Returns:
        str: html page content in plaintext
    """
    if not html:
        return ''

    # remove code snippets
    html = re.sub(r'<pre>.*?</pre>',
                  ' ',
                  html,
                  flags=re.DOTALL | re.MULTILINE | re.IGNORECASE)
    html = re.sub(r'<code>.*?</code>',
                  ' ',
                  html,
                  flags=re.DOTALL | re.MULTILINE | re.IGNORECASE)

    # strip the rest
    s = MLStripper()
    s.feed(html)
    text = s.get_data()
    text = preprocessing.normalize_whitespace(text)
    return text
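`MLStripper` is not defined in the snippet; a minimal sketch of such a tag stripper, assuming the common `html.parser.HTMLParser` pattern implied by the `feed`/`get_data` calls above:

from html.parser import HTMLParser

class MLStripper(HTMLParser):
    """Collect the text content of an HTML document, dropping all tags."""

    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.parts = []

    def handle_data(self, data):
        # called for each run of text between tags
        self.parts.append(data)

    def get_data(self):
        return "".join(self.parts)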
Example 4
def clean_textacy(textfile):
    with open(textfile) as f:
        text = f.read()
    text = preprocessing.normalize_whitespace(text)
    text = preprocessing.normalize.normalize_hyphenated_words(text)
    text = preprocessing.normalize.normalize_quotation_marks(text)
    text = preprocessing.normalize.normalize_unicode(text)
    text = preprocessing.remove.remove_accents(text)
    # text=preprocessing.remove.remove_punctuation(text)
    text = preprocessing.replace.replace_currency_symbols(text)
    text = preprocessing.replace.replace_emails(text)
    text = preprocessing.replace.replace_hashtags(text)
    # text=preprocessing.replace.replace_numbers(text)
    text = preprocessing.replace.replace_phone_numbers(text)
    text = preprocessing.replace.replace_urls(text)
    text = preprocessing.replace.replace_user_handles(text)

    print(text)
    # now replace the original doc with cleaned version
    newfile = 'cleaned_' + textfile
    with open(newfile, 'w') as textfile2:
        textfile2.write(text)
    os.remove(textfile)

    return [newfile]
Example 5
def processText(text):

    preprocessedText = preprocessing.normalize.normalize_unicode(text)
    preprocessedText = preprocessing.normalize.normalize_quotation_marks(
        preprocessedText)

    preprocessedText = preprocessing.remove.remove_accents(preprocessedText)
    preprocessedText = preprocessing.remove.remove_punctuation(
        preprocessedText)

    preprocessedText = preprocessing.replace.replace_emails(
        preprocessedText, "")
    preprocessedText = preprocessing.replace.replace_phone_numbers(
        preprocessedText, "")
    #preprocessedText = preprocessing.replace.replace_contractions(preprocessedText)

    # lemmatize the entire text
    # first, split the text to a list of words
    words = TextBlob(preprocessedText).words
    # then, lemmatize each word
    lemmatizedText = ""
    for w in words:
        lemmatizedText += "{} ".format(w.lemmatize())

    # normalize whitespace for texts like 'Title    And I am ...'
    return preprocessing.normalize_whitespace(lemmatizedText)
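`TextBlob(...).words` and `Word.lemmatize()` come from the TextBlob package, whose lemmatizer treats each word as a noun by default and needs the NLTK WordNet corpus; a minimal sketch of the assumed setup and a sample call:

# Sketch of the assumed setup (not part of the original project):
# textacy ~0.10 plus TextBlob, which needs the NLTK WordNet data once.
from textblob import TextBlob
from textacy import preprocessing
# import nltk; nltk.download("wordnet")  # one-time download for lemmatization

print(processText("The  cats were  running towards the  cars."))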
Example 6
def preprocess_sentence(sent, lower=True):
    """Pre-process a sentence ( via ``textacy.preprocess` module ).

    Args:
        sent (str): text.
        lower (bool): whether to return a lowercase string.

    Returns:
        str
    """
    # normalize unicode
    sent = preprocessing.normalize_unicode(sent)

    # deaccent
    sent = preprocessing.remove_accents(sent)

    # replace newline chars
    sent = re.sub("\n|\r", " ", sent)

    # unpack contractions
    sent = contractions.fix(sent)

    # replace emoji symbols
    sent = preprocessing.replace_emojis(sent)

    # replace hashtags
    sent = preprocessing.replace_hashtags(sent)

    # replace user handles
    sent = preprocessing.replace_user_handles(sent)

    # replace currency symbols
    sent = preprocessing.replace_currency_symbols(sent)

    # replace emails
    sent = preprocessing.replace_emails(sent)

    # replace URLs
    sent = preprocessing.replace_urls(sent)

    # remove punctuation
    sent = preprocessing.remove_punctuation(sent)

    # normalize whitespace
    sent = preprocessing.normalize_whitespace(sent)

    if lower:
        sent = sent.lower()
    return sent
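Besides textacy's 0.10-style flat `preprocessing` functions, this snippet relies on `re` and the third-party `contractions` package; a minimal sketch of the assumed imports and a sample call:

# Assumed imports for preprocess_sentence (textacy ~0.10 flat API,
# plus the third-party `contractions` package).
import re
import contractions
from textacy import preprocessing

print(preprocess_sentence("I can't wait - email me at jane@example.com!"))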
Example 7
def test_plaintext_functionality(text):
    preprocessed_text = preprocessing.normalize_whitespace(text)
    preprocessed_text = preprocessing.remove_punctuation(preprocessed_text)
    preprocessed_text = preprocessed_text.lower()
    assert all(char.islower() for char in preprocessed_text if char.isalpha())
    assert all(char.isalnum() or char.isspace() for char in preprocessed_text)
    keyword = "America"
    kwics = text_utils.keyword_in_context(text,
                                          keyword,
                                          window_width=35,
                                          print_only=False)
    for pre, kw, post in kwics:
        assert kw == keyword
        assert isinstance(pre, compat.unicode_)
        assert isinstance(post, compat.unicode_)
Example 8
def text_cleanup(text):
    "cleanup our text"

    text = preprocessing.replace_emails(text, replace_with='')
    text = preprocessing.replace_urls(text, replace_with='')
    text = preprocessing.replace_hashtags(text, replace_with='')
    text = preprocessing.replace_phone_numbers(text, replace_with='')
    text = preprocessing.replace_numbers(text, replace_with='')

    # normalize quotes and hyphenated words before stripping punctuation,
    # otherwise the two normalize_* calls would have nothing left to act on
    text = preprocessing.normalize_quotation_marks(text)
    text = preprocessing.normalize_hyphenated_words(text)

    text = preprocessing.remove_accents(text)
    text = preprocessing.remove_punctuation(text)
    text = text.replace('\n', ' ').replace('\t', ' ')
    text = text.lower()

    text = preprocessing.normalize_whitespace(text)
    return text
Example 9
def textacy_preprocess(sentence):
    """Preprocess text."""
    sentence = preprocessing.normalize_hyphenated_words(sentence)
    sentence = preprocessing.normalize_quotation_marks(sentence)
    #sentence = preprocessing.normalize_repeating_chars(sentence)
    sentence = preprocessing.normalize_unicode(sentence)
    sentence = preprocessing.normalize_whitespace(sentence)
    sentence = preprocessing.remove_accents(sentence)
    sentence = preprocessing.remove_punctuation(sentence)
    sentence = preprocessing.replace_currency_symbols(sentence)
    sentence = preprocessing.replace_emails(sentence)
    sentence = preprocessing.replace_emojis(sentence)
    sentence = preprocessing.replace_hashtags(sentence)
    sentence = preprocessing.replace_numbers(sentence)
    sentence = preprocessing.replace_phone_numbers(sentence)
    sentence = preprocessing.replace_urls(sentence)
    sentence = preprocessing.replace_user_handles(sentence)

    return sentence
Example 10
def load(path):
    email_text = extract_email_text(path)
    if not email_text:
        return []

    # use textacy to normalize whitespace and remove punctuation
    email_text = preprocessing.normalize_whitespace(
        preprocessing.remove_punctuation(email_text))
    # remove accents and normalize unicode
    email_text = preprocessing.normalize_unicode(
        preprocessing.remove_accents(email_text))

    # Tokenize the message
    tokens = to_tokenized_text(email_text)

    # Remove stopwords and lemmatize tokens
    if len(tokens) > 2:
        # keep the lemma of each non-stopword token
        return [w.lemma_ for w in tokens if w not in nlp.Defaults.stop_words]
    return []
Example 11
    def preprocess_doc(self, doc):

        # pre-process data
        doc = tp.normalize.normalize_unicode(doc)
        doc = tp.normalize_whitespace(doc)
        doc = tp.normalize_quotation_marks(doc)

        # apply spaCy to tokenize doc
        doc = self.tokenizer(doc)

        # build new sentences for pre-processed doc
        doc_new = []
        for sent in doc.sents:
            sent_new = ''
            for token in sent:
                token = token.text
                token = token.replace('\n', '')
                token = token.replace('\t', '')
                token = token.strip()
                sent_new += token + ' '
            doc_new.append(sent_new[:-1])
        return doc_new
Example 12
def preprocess(text):
    return preprocessing.normalize_whitespace(preprocessing.remove_punctuation(text))
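A minimal usage sketch, assuming textacy ~0.10 where `remove_punctuation` and `normalize_whitespace` are exposed directly on `preprocessing`:

from textacy import preprocessing

print(preprocess("Hello,   world!!  How's it   going?"))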
Example 13
    def _clean(self, text: str):
        txt = text.strip()

        #
        txt = preprocessing.normalize_unicode(txt, form="NFKC")

        # txt = preprocessing.remove_punctuation(txt)

        # Collapse whitespaces
        txt = preprocessing.normalize_whitespace(txt)
        # Collapse repeated newlines
        txt = preprocessing.normalize_repeating_chars(txt, chars="\n", maxn=1)
        # fix hyphen-ated words
        txt = preprocessing.normalize_hyphenated_words(txt)
        txt = preprocessing.normalize_quotation_marks(txt)
        txt = preprocessing.replace_urls(txt, replace_with="")
        txt = preprocessing.replace_phone_numbers(txt, replace_with="")
        txt = preprocessing.replace_emails(txt, replace_with="")
        txt = preprocessing.replace_user_handles(txt, replace_with="")
        txt = preprocessing.normalize_repeating_chars(txt,
                                                      chars=".,;:-_ ",
                                                      maxn=1)
        txt = re.sub("\n ", " ", txt)
        txt = re.sub(" \n", " ", txt)
        txt = re.sub("\n", " ", txt)
        txt = re.sub(" . ", " ", txt)

        # txt = text.encode().decode("unicode-escape")
        # Used ftfy for "fixing" broken text, e.g. Unicode
        # txt = fix_text(txt.strip(), normalization="NFKC")

        # re- minissence => reminissence
        # txt = re.sub(r"([a-z])\-\s{,2}([a-z])", r"\1\2", txt)

        # collapse two+ newlines into single whitespace
        # txt = re.sub(r"\s+\n{1,}\s*(\w)", r" \1", txt)

        # collapse two+ newlines into single whitespace
        # txt = re.sub("\n+", " ", txt)
        """
        # collapse two+ newlines into single whitespace
        txt = re.sub(r"\s+\n{2,}\s*(\w)", r" \1", txt)

        # double-newlines to dots
        txt = re.sub(r"\n\n", ". ", txt)

        # collapse whitespace
        txt = re.sub(r"(\s){2,}", r"\1", txt)
        # collapse dots
        txt = re.sub(r"\.{2,}", ".", txt)
        # newline to whitespace between word characters
        txt = re.sub(r"(\w)\n(\w)", r"\1 \2", txt)
        # newline + open brace to whitespace
        txt = re.sub(r"(\w)\n(\()", r"\1 \2", txt)
        # comma + newline  to whitespace
        txt = re.sub(r"(\w)\,\n(\w)", r"\1 \2", txt)

        # Number end of sentence, followed by sentence that starts with number + dot
        txt = re.sub(r"(\d+)\.(\d\.\s+)", r"\1. ", txt)
        # remove decimals + dot after whitespace followed by whitespace
        txt = re.sub(r"(\.\s*)\d+\.\s+", r"\1", txt)

        # collapse backslashes
        txt = re.sub(r"\\{2,}", r"\\", txt)
        # remove 'escaped backslash' artefacts
        txt = re.sub(r"\\\\", "", txt)
        # remove lowdash artifacts ("lines")
        txt = re.sub(r"_{2,}", r"", txt)

        # normalize newline
        txt = re.sub(r"\r\n", r"\n", txt)
        # Linebreaks starting with numbers \n77\n
        txt = re.sub(r"\n\d+\n", r"\n", txt)

        # remove quotes + decimals on beginning of sentences
        txt = re.sub(r"\.([\"']?)\d+\s+", r".\1", txt)
        # remove quotes + decimals on beginning of sentences
        txt = re.sub(r"\.([\"']?)\d+\s+", r".\1", txt)

        # collapse dots
        txt = re.sub(r"\.\s+\.", ". ", txt)
        # collapse whitespace
        txt = re.sub(r"(\w+)\s{2,}(\w+)", r"\1 \2", txt)

        # Add space+ dot with double quotes
        txt = re.sub(r"\.\"(\w+)", r'.". \1', txt)

        # Add space+ between two sentences
        txt = re.sub(r"([a-z])\.([A-Z])", r"\1. \2", txt)
        """

        return txt
Example 14
def preprocess_text(text,
                    char_count_filter=True,
                    stopwords=None,
                    min_len=2,
                    max_len=15):
    """
    Pre-processing steps prior to spaCy nlp pipeline. Optional filtering of
    tokens based on character length.

    Parameters
    ----------
    text : str
    char_count_filter : bool
    stopwords : iterable, None
    min_len : int
    max_len : int

    Returns
    -------
    text : str
        pre-processed text
    """
    # 1) convert to lower case for robust stop-word recognition
    text = text.lower()

    # 2) normalise
    text = preprocessing.normalize_quotation_marks(text)
    # text = preprocessing.normalize_repeating_chars(text)
    text = preprocessing.normalize_hyphenated_words(text)
    text = preprocessing.normalize_whitespace(text)

    # 3) replace
    text = preprocessing.replace_currency_symbols(text)
    text = preprocessing.replace_emails(text)
    text = preprocessing.replace_emojis(text)
    text = preprocessing.replace_hashtags(text)
    text = preprocessing.replace_numbers(text)
    text = preprocessing.replace_phone_numbers(text)
    text = preprocessing.replace_urls(text)
    text = preprocessing.replace_user_handles(text)

    # 4) remove
    text = preprocessing.remove_accents(text)
    text = preprocessing.remove_punctuation(text)
    text = re.sub("[^A-Za-z0-9]+", " ", text)  # keep text and numbers

    # 5) optionally remove tokens based on length
    if char_count_filter & (stopwords is not None):
        # filter based on token length
        tokens = gensim.utils.simple_preprocess(doc=text,
                                                min_len=min_len,
                                                max_len=max_len)
        # filter case-specific words
        tokens = [token for token in tokens if token not in stopwords]

        # convert processed list of tokens back to one string
        text = " ".join(tokens)
    else:
        raise NotImplementedError(
            "preprocess_text currently requires char_count_filter=True and a stopwords iterable")

    return text
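A sketch of how `preprocess_text` might be called, assuming textacy ~0.10, gensim for `simple_preprocess`, and spaCy's English stop-word list passed in as the `stopwords` iterable:

# Sketch of the assumed setup (not part of the original project).
import re
import gensim
from textacy import preprocessing
from spacy.lang.en.stop_words import STOP_WORDS

cleaned = preprocess_text("Call +1 555 0100, tickets cost $9.99 each!",
                          stopwords=STOP_WORDS)
print(cleaned)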
Example 15
def test_normalize_whitespace(test_input, expected_result):
    assert preprocessing.normalize_whitespace(test_input) == expected_result
Example 16
    def clean_tweet(self, text):
        # FIXED UNICODE
        # text = preprocess.fix_bad_unicode(text)
        text = ftfy.fix_text(text)

        # GET TEXT ONLY FROM HTML
        text = BeautifulSoup(text, features='lxml').getText()

        # UN-PACK CONTRACTIONS
        text = preprocess.unpack_contractions(text)

        # REMOVE URL
        # text = preprocess.replace_urls(text)
        text = preprocessing.replace_urls(text)

        # REMOVE EMAILS
        # text = preprocess.replace_emails(text)
        text = preprocessing.replace_emails(text)

        # REMOVE PHONE NUMBERS
        # text = preprocess.replace_phone_numbers(text)
        text = preprocessing.replace_phone_numbers(text)

        # REMOVE NUMBERS
        # text = preprocess.replace_numbers(text)
        text = preprocessing.replace_numbers(text)

        # REMOVE CURRENCY
        # text = preprocess.replace_currency_symbols(text)
        text = preprocessing.replace_currency_symbols(text)

        # REMOVE ACCENTS
        # text = preprocess.remove_accents(text)
        text = preprocessing.remove_accents(text)

        # CONVERT EMOJIS TO TEXT
        words = text.split()
        reformed = [
            self.SMILEY[word] if word in self.SMILEY else word
            for word in words
        ]
        text = " ".join(reformed)
        text = emoji.demojize(text)
        text = text.replace(":", " ")
        text = ' '.join(text.split())

        # SPLIT ATTACHED WORDS
        text = ' '.join(re.findall('[A-Z][^A-Z]*', text))

        # SPLIT UNDERSCORE WORDS
        text = text.replace('_', ' ')

        # REMOVE PUNCTUATION
        # text = preprocess.remove_punct(text)
        text = preprocessing.remove_punctuation(text)

        # Remove numbers
        text = re.sub(r'\d', '', text)

        # REMOVE WORDS LESS THAN 3 CHARACTERS
        text = re.sub(r'\b\w{1,2}\b', '', text)

        # NORMALIZE WHITESPACE
        # text = preprocess.normalize_whitespace(text)
        text = preprocessing.normalize_whitespace(text)

        return text
Example 17
def test_normalize_whitespace():
    text = "Hello, world!  Hello...\t \tworld?\n\nHello:\r\n\n\nWorld. "
    proc_text = "Hello, world! Hello... world?\nHello:\nWorld."
    assert preprocessing.normalize_whitespace(text) == proc_text
Example 18

binance_words = nlp(binance_string)._.combo_basic.sort_values(ascending=False).head(1000)

binance3000.text.to_csv('binance3000_texts.csv')

# %%
from textacy import preprocessing
df3 = preprocessing.normalize_whitespace(preprocessing.remove_punctuation(df3.text))

# %%
import textacy
textacy.text_utils.KWIC(strings, "language", window_width=35)
Example 19
    def process_line(self, line: str) -> Optional[str]:
        return normalize_whitespace(line)
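The method above is an excerpt from a class; a minimal sketch of the context it assumes (the class name here is hypothetical), with the imports needed for `Optional` and textacy ~0.10's `normalize_whitespace`:

# Minimal sketch of the assumed context; LineCleaner is a hypothetical name.
from typing import Optional

from textacy.preprocessing import normalize_whitespace


class LineCleaner:
    def process_line(self, line: str) -> Optional[str]:
        return normalize_whitespace(line)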