Example #1
    def get_content(self, url=None):
        data = []
        response = requests.get(url).text
        soup = BeautifulSoup(response, "html5lib")

        img = soup.select_one(
            '.read-page--photo-gallery--item__picture > img')['data-src']
        contents = soup.select('.article-content-body__item-content > p')

        for i in range(len(contents)):
            if contents[i].text.strip() != '' and contents[i].text.strip()[:1] != '*' \
                    and contents[i].text.strip()[:8] != 'Reporter' and contents[i].text.strip()[:14] != 'Saksikan video'\
                    and contents[i].text.strip()[:1] != '(' and contents[i].text.strip()[:14] != 'Saksikan Video' \
                    and contents[i].text.strip()[:2] != ' (' and contents[i].text.strip()[:7] != 'Sumber:':
                data.append(contents[i].text.strip() + '\n\n')

        con = ''.join(data)
        con = preprocess_text(con, fix_unicode=True)
        con = self.ner_text(con)
        con2 = ''.join(data)
        con2 = self.ner_text(con2)
        con2 = con2.split('\n\n')

        data_json = {"img": img, "content": con, "content_html": con2}
        return data_json
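The paragraph filter above chains many slice comparisons; the following is a minimal, self-contained sketch of the same idea using str.startswith with a prefix tuple. The names SKIP_PREFIXES and keep_paragraph and the sample paragraphs are invented for illustration, not taken from the scraped site.

# Hedged sketch: same boilerplate-paragraph filtering as the example above,
# expressed with str.startswith; the prefix tuple mirrors the chained conditions.
SKIP_PREFIXES = ('*', '(', ' (', 'Reporter', 'Saksikan video',
                 'Saksikan Video', 'Sumber:')

def keep_paragraph(text):
    text = text.strip()
    return bool(text) and not text.startswith(SKIP_PREFIXES)

# Invented sample paragraphs for illustration only:
paragraphs = ['Isi berita pertama.', 'Sumber: Antara', '* Promo']
data = [p.strip() + '\n\n' for p in paragraphs if keep_paragraph(p)]
# data keeps only 'Isi berita pertama.\n\n'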
Example #2
    def get_content(self, url=None):
        data = []
        response = requests.get(url).text
        soup = BeautifulSoup(response, "html5lib")

        sub_category = soup.select('.breadcrumbs > li')[2].text
        img = soup.select_one('figure > a')['href']

        contents = soup.select('#isi > p')

        for content in contents:
            if content.text.strip(
            )[:10] != 'Baca juga:' and content.text.strip()[:5] != 'Baca:':
                data.append(content.text.strip() + '\n\n')

        con = ''.join(data)
        con = preprocess_text(con, fix_unicode=True)
        con = self.ner_text(con)
        con2 = ''.join(data)
        con2 = self.ner_text(con2)
        con2 = con2.split('\n\n')

        data_json = {
            "sub_category": sub_category,
            "img": img,
            "content": con,
            "content_html": con2
        }

        return data_json
Example #3
    def getContent(self, url=None):
        iData = []
        iResponse = requests.get(url).text
        iSoup = BeautifulSoup(iResponse, "html5lib")

        subCategory = iSoup.select('.breadcrumbs > li')[2].text
        img = iSoup.select_one('figure > a')['href']

        iContents = iSoup.select('#isi > p')

        for content in iContents:
            if content.text.strip(
            )[:10] != 'Baca juga:' and content.text.strip()[:5] != 'Baca:':
                iData.append(content.text.strip() + '\n\n')

        ordinaryContent = ''.join(iData)
        ordinaryContent = preprocess_text(ordinaryContent, fix_unicode=True)
        ordinaryContent = self.nerText(ordinaryContent)
        htmlContent = ''.join(iData)
        htmlContent = self.nerText(htmlContent)
        htmlContent = htmlContent.split('\n\n')

        iJson = {
            "subCategory": subCategory,
            "img": img,
            "content": ordinaryContent,
            "contentHTML": htmlContent
        }

        return iJson
Example #4
    def getContent(self, url=None):
        iData = []
        iResponse = requests.get(url).text
        iSoup = BeautifulSoup(iResponse, "html5lib")

        img = iSoup.select_one(
            '.read-page--photo-gallery--item__picture > img')['data-src']
        contents = iSoup.select('.article-content-body__item-content > p')

        for i in range(len(contents)):
            if contents[i].text.strip() != '' and contents[i].text.strip()[:1] != '*' \
                    and contents[i].text.strip()[:8] != 'Reporter' and contents[i].text.strip()[:14] != 'Saksikan video'\
                    and contents[i].text.strip()[:1] != '(' and contents[i].text.strip()[:14] != 'Saksikan Video' \
                    and contents[i].text.strip()[:2] != ' (' and contents[i].text.strip()[:7] != 'Sumber:':
                iData.append(contents[i].text.strip() + '\n\n')

        ordinaryContent = ''.join(iData)
        ordinaryContent = preprocess_text(ordinaryContent, fix_unicode=True)
        ordinaryContent = self.nerText(ordinaryContent)
        htmlContent = ''.join(iData)
        htmlContent = self.nerText(htmlContent)
        htmlContent = htmlContent.split('\n\n')

        iJson = {
            "img": img,
            "content": ordinaryContent,
            "contentHTML": htmlContent
        }
        return iJson
Example #5
    def getContent(self, url=None):
        iData = []
        iResponse = requests.get(url).text
        iSoup = BeautifulSoup(iResponse, "html5lib")
        contents = iSoup.select_one('.photo > img')
        contents2 = iSoup.select('.read__content > p')
        img = contents['data-src']

        for i in range(len(contents2)):
            if contents2[i].text != '':
                if (contents2[i].text[:9] != 'Baca juga' and contents2[i].text[:5] != 'Baca:') \
                        and (contents2[i].text[:15] != 'We are thrilled') and (contents2[i].text[:6] != 'Flinke') \
                        and (contents2[i].text[:18] != 'Baca selengkapnya:') and (contents2[i].text[:25]) != 'Baca berita selengkapnya:' \
                        and (contents2[i].text[:7]) != 'Sumber:':
                    iData.append(contents2[i].text + '\n\n')

        ordinaryContent = ''.join(iData)
        ordinaryContent = preprocess_text(ordinaryContent, fix_unicode=True)
        ordinaryContent = self.nerText(ordinaryContent)
        htmlContent = ''.join(iData)
        htmlContent = self.nerText(htmlContent)
        htmlContent = htmlContent.split('\n\n')

        iJson = {
            "img": img,
            "content": ordinaryContent,
            "contentHTML": htmlContent
        }

        return iJson
Example #6
    def get_content(self, url=None):
        response = requests.get(url).text
        soup = BeautifulSoup(response, "html5lib")

        contents = soup.select_one('.photo > img')
        contents2 = soup.select('.read__content > p')

        temp_img = contents['src']

        data = []
        for i in range(len(contents2)):
            if contents2[i].text != '':
                if (contents2[i].text[:9] != 'Baca juga' and contents2[i].text[:5] != 'Baca:') \
                        and (contents2[i].text[:15] != 'We are thrilled') and (contents2[i].text[:6] != 'Flinke') \
                        and (contents2[i].text[:18] != 'Baca selengkapnya:') and (contents2[i].text[:25]) != 'Baca berita selengkapnya:' \
                        and (contents2[i].text[:7]) != 'Sumber:':
                    data.append(contents2[i].text + '\n\n')

        con = ''.join(data)
        con = preprocess_text(con, fix_unicode=True)
        con = self.ner_text(con)
        con2 = ''.join(data)
        con2 = self.ner_text(con2)
        con2 = con2.split('\n\n')

        data_json = {"img": temp_img, "content": con, "content_html": con2}

        return data_json
Example #7
 def sym(text:str) -> str:
     """generalize symbols such as urls, emails, phone numbers and filepaths to generic tokens."""
     text = preprocess_text(text, 
                            no_emails=True, 
                            no_phone_numbers=True,
                            no_accents=True)
     
     # generalize file paths
     file_path_regex = r'C:(\\\\\S+){2,}|(/\S+){2,}|[Cc]:\\\w+(\\[0-9a-zA-Z_\-]+)+'
     text = re.sub(file_path_regex, ' xxxfilepath ', text)
     
     # generalize @ mentions
     at_mention_regex = r'\W@\w+'
     text = re.sub(at_mention_regex, ' xxxatmention ', text)
     
     # get date/time
     text = re.sub(r'\d+[-/]\d+[-/]\d+(.{0,2})?(\d+:\d+:\d+)', ' xxxdatetm ', text)
     
     # strings that have >=4 dots w/o any whitespace in between
     text = re.sub(r'(\S+\.\S+){4,}', 'xxunk', text)
     
     # things that look like IP addresses
     text = re.sub(r'\d+\.\d+\.\d+\.\d+', 'xxunk', text)
     
     # long strings or numbers
     text = re.sub(r'\S{30,}|\d{6,}', 'xxunk', text)
     
     # generalize json (the recursive (?R) group needs the third-party regex module, not re)
     json_regex = r'\{(?:[^{}]|(?R))*\}'
     text = regex.sub(json_regex, ' xxxjson ', text)
     
     return text
Example #8
def test_preprocess_text():
    text = (
        "Well… That's a long story. "
        "Hello, world!  Hello...\t \tworld?\n\nHello:\r\n\n\nWorld. "
        "Y'all can't believe you're not who they've said I'll become, but shouldn't. "
        "I learned everything I know from www.stackoverflow.com and http://wikipedia.org/ and Mom. "
        "I can be reached at [email protected] through next Friday. "
        "I can be reached at 555-123-4567 through next Friday. "
        "I owe $1,000.99 to 123 people for 2 +1 reasons. "
        "El niño se asustó -- qué miedo!")
    proc_text = (
        "Well... That's a long story. "
        "Hello, world! Hello... world?\nHello:\nWorld. "
        "You all can not believe you are not who they have said I will become, but should not. "
        "I learned everything I know from *URL* and *URL* and Mom. "
        "I can be reached at *EMAIL* through next Friday. "
        "I can be reached at *PHONE* through next Friday. "
        "I owe USD*NUMBER* to *NUMBER* people for *NUMBER* *NUMBER* reasons. "
        "El nino se asusto -- que miedo!")
    assert preprocess.preprocess_text(text,
                                      fix_unicode=True,
                                      no_urls=True,
                                      no_emails=True,
                                      no_phone_numbers=True,
                                      no_numbers=True,
                                      no_currency_symbols=True,
                                      no_contractions=True,
                                      no_accents=True) == proc_text
Example #9
    def cleanContent(self, iData=None):
        for i in tqdm(range(len(iData)), desc='Clean Content'):
            text_stopword = []
            iData[i]['cleanContent'] = preprocess_text(
                iData[i]['content'],
                lowercase=True,
                fix_unicode=True,
                no_punct=True,
                no_numbers=True,
                no_urls=True,
                no_currency_symbols=True,
                no_phone_numbers=True,
                no_emails=True)
            clean_content = iData[i]['cleanContent'].split()

            [
                text_stopword.append(cc) for cc in clean_content
                if cc not in stopwords
            ]
            case_folding = ' '.join(text_stopword)
            # stemming = stemmer.stem(case_folding)

            iData[i]['cleanContent'] = case_folding

        return iData
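The stop-word pass above uses a list comprehension purely for its side effect on text_stopword; below is a small self-contained sketch of the same step. The stop-word set and sample sentence are made up for illustration (in the example, stopwords is defined elsewhere in the module).

# Hypothetical stop-word set, for illustration only.
stopwords = {'yang', 'dan', 'di'}

clean_content = 'berita yang dimuat di media dan situs'.split()
text_stopword = [cc for cc in clean_content if cc not in stopwords]
case_folding = ' '.join(text_stopword)
# case_folding == 'berita dimuat media situs'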
Example #10
def process_questions(t):
    text = tap.preprocess_text(t, fix_unicode=True, lowercase=True,
                               transliterate=True, no_urls=True,
                               no_emails=True, no_phone_numbers=True,
                               no_numbers=True,
                               no_currency_symbols=True, no_punct=True,
                               no_contractions=True, no_accents=True)
    return set(word_tokenize(text))
Example #11
def cleaning_text(text):
    text = preprocess_text(text,
                           no_numbers=True,
                           fix_unicode=True,
                           lowercase=True,
                           no_punct=True)
    text = " ".join(text.replace("number", "").split())
    return text
Example #12
def clean_text(text):
    text = text.replace('/n', ' ').replace('.com', ' ').replace('.org', ' ').replace('.net', ' ')
    text = strip_html(text)
    # Remove contractions, if any:
    text = preprocess_text(text, fix_unicode=True, no_accents=True, no_contractions=True, lowercase=True, no_punct=True, no_currency_symbols=True)
    text = replace_urls(text, replace_with='')
    text = replace_numbers(text, replace_with='')
    return text
Example #13
def clean_sentence(sentences):
    c = sentences.replace('-', ' ')  # people use to concatinate words
    c = normalize_whitespace(c)
    c = preprocess_text(c,
                        lowercase=True,
                        no_numbers=True,
                        no_punct=True,
                        no_contractions=True)
    return c
Example #14
    def cleanContent(self, iData=None):
        for i in tqdm(range(len(iData)), desc='Clean Content'):
            text_stopword = []
            iData[i]['cleanContent'] = preprocess_text(iData[i]['content'], lowercase=True, fix_unicode=True,no_punct=True,no_numbers=True)
            clean_content = iData[i]['cleanContent'].split()

            [text_stopword.append(cc) for cc in clean_content if cc not in stopwords]

            iData[i]['cleanContent'] = ' '.join(text_stopword)

        return iData
Example #15
def clean(text, lower=True, **kwargs):
    text = clean_quotes(text)
    text = preprocess_text(text,
                           fix_unicode=True,
                           lowercase=lower,
                           no_urls=True,
                           no_emails=True,
                           transliterate=True,
                           no_numbers=True,
                           no_phone_numbers=True)
    return text
Example #16
def preprocess(text, fix_unicode=True, normalize_white_space = False, lowercase=False, transliterate=False,
                    no_urls=False, no_emails=False, no_phone_numbers=False,
                    no_numbers=False, no_currency_symbols=False, no_punct=False,
                    no_contractions=False, no_accents=False):
    if normalize_white_space:
        text = pp.normalize_whitespace(text)
    text = pp.preprocess_text(text, fix_unicode, lowercase, transliterate,
                    no_urls, no_emails, no_phone_numbers,
                    no_numbers, no_currency_symbols, no_punct,
                    no_contractions, no_accents)
    return text
Example #17
def preprocess_sentence(sent):
    # TODO check language?
    s = preprocess.normalize_whitespace(sent)
    return preprocess.preprocess_text(s,
                                      lowercase=True,
                                      transliterate=True,
                                      no_urls=True,
                                      no_phone_numbers=True,
                                      no_numbers=True,
                                      no_currency_symbols=True,
                                      no_contractions=True,
                                      no_accents=True)
Example #18
    def stepOne(self, content=None):
        result = preprocess_text(content,
                                 fix_unicode=True,
                                 lowercase=True,
                                 no_urls=True,
                                 no_emails=True,
                                 no_phone_numbers=True,
                                 no_numbers=True,
                                 no_currency_symbols=True,
                                 no_punct=True)

        return result
Example #19
    def getCategory(self, iData=None):

        model = joblib.load('modelMNB')

        for data in iData:
            clean = preprocess_text(data['cleanContent'], lowercase=True)
            result = model.predict([clean])
            result = result[0]

            data['category'] = result

        return iData
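Because model.predict([clean]) is called on raw strings, 'modelMNB' is presumably a scikit-learn pipeline that bundles a vectorizer with the Naive Bayes classifier. Below is a hedged sketch of how such a file could be produced; the training texts, labels, and pipeline choices are illustrative assumptions, not taken from the original project.

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import joblib

# Illustrative toy corpus; the real project would train on its own articles.
texts = ['contoh berita olahraga', 'contoh berita politik']
labels = ['sport', 'politik']

pipeline = Pipeline([('tfidf', TfidfVectorizer()),
                     ('mnb', MultinomialNB())])
pipeline.fit(texts, labels)
joblib.dump(pipeline, 'modelMNB')   # later loaded via joblib.load('modelMNB')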
Example #20
 def custom_preprocess(self, text):
     text = self.replace_bank_names(text)
     text = preprocess_text(text,
                            fix_unicode=True,
                            lowercase=False,
                            no_urls=True,
                            no_emails=True,
                            no_phone_numbers=True,
                            no_punct=False,
                            no_numbers=False)
     text = self.replace_characters_to_space(text)
     return text
Example #21
def process(content, env, **settings):
    for doc in content:
        try:
            text = doc['text']
            text = preprocess_text(text, **settings)
        except Exception:
            logger.exception(
                "Textacy Processor: got an error in extracting content: %r",
                doc)

            continue

        yield set_text(doc, text)
Example #22
def preprocess_text_by_config(text, textacy_defs):
    return preprocess_text(
        text,
        fix_unicode=textacy_defs['fix_unicode'],
        lowercase=textacy_defs['lowercase'],
        transliterate=textacy_defs['transliterate'],
        no_urls=textacy_defs['no_urls'],
        no_emails=textacy_defs['no_emails'],
        no_phone_numbers=textacy_defs['no_phone_numbers'],
        no_numbers=textacy_defs['no_numbers'],
        no_currency_symbols=textacy_defs['no_currency_symbols'],
        no_punct=textacy_defs['no_punct'],
        no_contractions=textacy_defs['no_contractions'],
        no_accents=textacy_defs['no_accents'])
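A minimal sketch of the kind of textacy_defs mapping the helper above expects; the keys mirror the lookups in the function, and the values shown are illustrative, not from the original configuration.

# Illustrative configuration; every key read by preprocess_text_by_config is present.
textacy_defs = {
    'fix_unicode': True,
    'lowercase': True,
    'transliterate': False,
    'no_urls': True,
    'no_emails': True,
    'no_phone_numbers': True,
    'no_numbers': False,
    'no_currency_symbols': False,
    'no_punct': True,
    'no_contractions': True,
    'no_accents': True,
}
cleaned = preprocess_text_by_config('Visit https://example.com today!', textacy_defs)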
Example #23
    def from_feed(self, url):
        fdict = fp.parse(url)

        for entry in fdict.entries:
            # Each entry may have multiple pieces of content. Here they're just concatenated.
            body = ""
            for c in entry.content:
                body += " " + c.value

            # Preprocessing
            body = pre.preprocess_text(body, no_urls=True, no_emails=True, no_phone_numbers=True)

            metadata = {'title': entry.title,
                    'author': entry.author,
                    'date_updated': entry.updated,
                    'publication_title': fdict.feed.title}
            self.add_text(body, metadata = metadata)
Example #24
def textacy_cleaner(text: str) -> str:
    if isinstance(text, numbers.Number) and numpy.isnan(text):
        logging.warning("Received nan instead of str")
        return "nan"

    return preprocess_text(text,
                           fix_unicode=False,
                           lowercase=True,
                           transliterate=True,
                           no_urls=True,
                           no_emails=True,
                           no_phone_numbers=True,
                           no_numbers=True,
                           no_currency_symbols=True,
                           no_punct=True,
                           no_contractions=False,
                           no_accents=True)
Example #25
def textacy_cleaner(text: str) -> str:
    """
    Defines the default function for cleaning text.

    This function operates over a list.
    """
    return preprocess_text(text,
                           fix_unicode=True,
                           lowercase=True,
                           transliterate=True,
                           no_urls=True,
                           no_emails=True,
                           no_phone_numbers=True,
                           no_numbers=True,
                           no_currency_symbols=True,
                           no_punct=True,
                           no_contractions=False,
                           no_accents=True)
Example #26
def str_clean_up(x, nlp):

    # replace the "..." by " "
    # raw example: "to...find...this...purpose...\nof...a 'voice' hearer...is...to...go.."
    out = x.replace("...", " ")

    out = out.replace("\n", " ")

    # replace all punctuations
    # out = out.replace('.', '')
    # out = out.replace(',', '')
    # out = out.replace('"', '')
    # out = out.replace('?', '')
    # out = out.replace('!', '')

    # replace all slashes
    out = out.replace('\\', '')
    out = out.replace('/', '')

    # fix unicode, currency, contraction, accents
    out = preprocess_text(out,
                          fix_unicode=True,
                          transliterate=True,
                          no_currency_symbols=True,
                          no_contractions=True,
                          no_accents=True,
                          no_urls=True,
                          no_emails=True)

    # replace http and emails
    out_doc = nlp(out)
    text = []
    for token in out_doc:
        if token.like_url or token.like_email:
            pass
        else:
            text.append(token.text)
    out = " ".join(text)

    # replace other characters
    import re
    out = re.sub("[^A-Za-z0-9 ?'.:;!]+", "", out)

    return out
Example #27
 def preprocessText(self, strtxt, lang='en', ner=False):
     self.utilclass = UtilityClass()
     posttxt = str(strtxt)
     '''
     if ner: 
         posttxt = self.processNER(posttxt, lang=lang)
     '''
     posttxt = preprocess_text(posttxt,
                               fix_unicode=True,
                               lowercase=False,
                               transliterate=False,
                               no_urls=True,
                               no_emails=True,
                               no_phone_numbers=True,
                               no_numbers=True,
                               no_currency_symbols=True,
                               no_punct=False,
                               no_contractions=False,
                               no_accents=False)
     return posttxt
Example #28
def textacy_cleaner(text: str) -> str:
    if isinstance(text, (int, float, complex)):
        # workaround module not found error if inside model
        import numpy, logging
        if numpy.isnan(text):
            logging.warning("Received nan instead of str")
            return "nan"

    from textacy.preprocess import preprocess_text
    return preprocess_text(text,
                           fix_unicode=False,
                           lowercase=True,
                           transliterate=True,
                           no_urls=True,
                           no_emails=True,
                           no_phone_numbers=True,
                           no_numbers=True,
                           no_currency_symbols=True,
                           no_punct=True,
                           no_contractions=False,
                           no_accents=True)
Example #29
def preprocess_f(text,
                 fix_unicode=True,
                 lowercase=True,
                 no_urls=True,
                 no_emails=True,
                 no_phone_numbers=True,
                 no_numbers=True,
                 no_currency_symbols=True,
                 no_punct=True,
                 no_accents=True):
    """Preprocess text."""
    clean_text = preprocess_text(text,
                                 fix_unicode=fix_unicode,
                                 lowercase=lowercase,
                                 no_urls=no_urls,
                                 no_emails=no_emails,
                                 no_phone_numbers=no_phone_numbers,
                                 no_numbers=no_numbers,
                                 no_currency_symbols=no_currency_symbols,
                                 no_punct=no_punct,
                                 no_accents=no_accents)
    return clean_text
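A hedged usage sketch of the wrapper above; the sample string is invented, and the exact placeholder tokens (e.g. *URL*, *NUMBER*, as seen in the test in Example #8) depend on the installed textacy version.

sample = 'Email me at someone@example.org or call 555-0100 about the $20 fee.'
print(preprocess_f(sample))
# With these defaults, preprocess_text lowercases the text, strips punctuation
# and accents, and substitutes generic tokens for URLs, emails, phone numbers
# and numbers; currency symbols are normalized as well.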
Example #30
def preprocess_text_string(text):
    """Preprocesses text for feature extraction.

    Preprocessing tasks are as follows:
        - whitespace normalization
        - fixing broken unicode via ftfy
        - converting text to lowercase
        - replacing url strings with 'url'
        - replacing phone number strings with 'phone'
        - replacing currency symbols with their standard 3-letter abbreviations
        - stripping punctuation
        - replacing contractions with their unshortened forms
        - lemmatizing words

    Parameters
    ----------
    text : str
        The input text to be preprocessed.

    Returns
    -------
    preprocessed : str
        The preprocessed output text.
    """
    text = preprocess_text(text,
                           fix_unicode=True,
                           lowercase=True,
                           no_urls=True,
                           no_phone_numbers=True,
                           no_currency_symbols=True,
                           no_punct=True,
                           no_contractions=True)
    doc = Doc(text, lang='en')
    lemmatized_tokens = doc.to_terms_list(ngrams=1,
                                          named_entities=False,
                                          as_strings=True,
                                          normalize='lemma')
    return ' '.join(lemmatized_tokens)
Example #31
def tokenizer(sentences):
    y = []
    if type(sentences) == str:
        sentences = [sentences]
    for comment in sentences:
        comment = my_preprocess(comment)
        txt = preprocess.normalize_whitespace(comment)

        txt = preprocess.preprocess_text(txt,
                                         fix_unicode=True,
                                         lowercase=True,
                                         transliterate=True,
                                         no_urls=True,
                                         no_emails=True,
                                         no_phone_numbers=True,
                                         no_numbers=True,
                                         no_currency_symbols=True,
                                         no_punct=True,
                                         no_contractions=True,
                                         no_accents=True)

        y.append(u''.join(txt))
    return y