Example #1
0
    def __init__(self,
                 clean_html=True,
                 remove_new_lines=True,
                 hash_seed=None,
                 remove_non_english=True,
                 remove_stop_words=True,
                 remove_punct=True,
                 remove_non_alpha=True,
                 replace_emails=True,
                 replace_numbers=True,
                 lemma=True,
                 replace_urls=True,
                 language='English',
                 tokenization_method='byWords'):
        """Configure the text-cleaning pipeline.

        Every keyword argument is stored verbatim on the instance; the
        boolean flags each toggle one cleaning step applied elsewhere.
        """
        # Placeholder tokens substituted in for numbers / URLs / e-mails.
        self.number_pattern = "NUMBER_PATTERN"
        self.url_pattern = "URL_PATTERN"
        self.email_pattern = "EMAIL_PATTERN"
        self.reserved_tokens = {
            self.number_pattern, self.url_pattern, self.email_pattern}
        # Copy each configuration argument onto the instance unchanged.
        for attr_name, attr_value in (
                ('clean_html', clean_html),
                ('remove_new_lines', remove_new_lines),
                ('hash_seed', hash_seed),
                ('remove_non_english', remove_non_english),
                ('remove_stop_words', remove_stop_words),
                ('remove_punct', remove_punct),
                ('remove_non_alpha', remove_non_alpha),
                ('replace_emails', replace_emails),
                ('replace_urls', replace_urls),
                ('replace_numbers', replace_numbers),
                ('lemma', lemma),
                ('language', language),
                ('tokenization_method', tokenization_method)):
            setattr(self, attr_name, attr_value)
        self.max_text_length = 10 ** 5  # hard cap on processed text size

        self.nlp = None  # spaCy pipeline; populated lazily elsewhere
        self.html_parser = HTMLParser()
        # Matches either a whole UTF-16 surrogate pair or any single
        # character ((?s) lets '.' also match newlines).
        self._unicode_chr_splitter = _Re(
            '(?s)((?:[\ud800-\udbff][\udc00-\udfff])|.)').split
        # Language name -> spaCy model package name.
        self.languages_to_model_names = {
            'English': 'en_core_web_sm',
            'German': 'de_core_news_sm',
            'French': 'fr_core_news_sm',
            'Spanish': 'es_core_news_sm',
            'Portuguese': 'pt_core_news_sm',
            'Italian': 'it_core_news_sm',
            'Dutch': 'nl_core_news_sm'
        }
        self.spacy_count = 0
        self.spacy_reset_count = 500
Example #2
0
    def __init__(self,
                 clean_html=True,
                 remove_new_lines=True,
                 hash_seed=None,
                 remove_non_english=True,
                 remove_stop_words=True,
                 remove_punct=True,
                 remove_non_alpha=True,
                 replace_emails=True,
                 replace_numbers=True,
                 lemma=True,
                 replace_urls=True,
                 language=ANY_LANGUAGE,
                 tokenization_method='tokenizer'):
        """Configure the text-cleaning pipeline.

        Each boolean flag toggles one cleaning step applied elsewhere;
        all keyword arguments are stored verbatim on the instance.
        `language` defaults to the module-level ``ANY_LANGUAGE`` constant
        (defined outside this snippet — presumably "no specific language";
        confirm against its definition).
        """
        # Placeholder tokens substituted in for numbers / URLs / e-mails.
        self.number_pattern = "NUMBER_PATTERN"
        self.url_pattern = "URL_PATTERN"
        self.email_pattern = "EMAIL_PATTERN"
        # Tokens that must never be altered by later cleaning steps.
        self.reserved_tokens = set(
            [self.number_pattern, self.url_pattern, self.email_pattern])
        self.clean_html = clean_html
        self.remove_new_lines = remove_new_lines
        self.hash_seed = hash_seed
        self.remove_non_english = remove_non_english
        self.remove_stop_words = remove_stop_words
        self.remove_punct = remove_punct
        self.remove_non_alpha = remove_non_alpha
        self.replace_emails = replace_emails
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.lemma = lemma
        self.language = language
        self.tokenization_method = tokenization_method
        # Hard cap on processed text size.
        self.max_text_length = 10**5

        # spaCy pipeline; populated lazily elsewhere.
        self.nlp = None
        self.html_parser = HTMLParser()
        # Matches either a whole UTF-16 surrogate pair or any single
        # character ((?s) lets '.' also match newlines).
        self._unicode_chr_splitter = _Re(
            '(?s)((?:[\ud800-\udbff][\udc00-\udfff])|.)').split
        # Counter used to periodically reset spaCy (every spacy_reset_count
        # documents), presumably to bound memory use — confirm with caller.
        self.spacy_count = 0
        self.spacy_reset_count = 500
Example #3
0
http://alt.qcri.org/semeval2016/task5/
https://github.com/magizbox/underthesea/wiki/SemEval-2016-Task-5
'''

from __future__ import absolute_import, division, unicode_literals

import os
import io
import logging
import numpy as np
from re import compile as _Re

from senteval.tools.validation import KFoldClassifier

_unicode_chr_splitter = _Re('(?s)((?:[\ud800-\udbff][\udc00-\udfff])|.)').split


def split_unicode_chrs(text):
    """Split *text* into single characters via the module-level splitter.

    Drops the empty strings that ``re.split`` inserts between adjacent
    matches.  Renamed the loop variable from ``chr`` to ``ch`` so the
    builtin ``chr`` is no longer shadowed.
    """
    return [ch for ch in _unicode_chr_splitter(text) if ch]


def merge_two_dicts(x, y):
    """Return a new dict with x's entries, overridden by y's.

    Neither argument is modified; on key collisions the value from *y* wins.
    """
    merged = x.copy()
    merged.update(y)
    return merged


class ABSA_CHEval(object):
    def __init__(self, task_path, seed=1111):
        logging.info('***** Transfer task : ABSA_CH *****\n\n')
Example #4
0
def split_unicode_chrs(text):
    """Split *text* into single characters, keeping UTF-16 surrogate pairs
    together, and drop the empty strings ``re.split`` produces.

    The per-call compile is cheap because ``re`` caches compiled patterns;
    the loop variable was renamed from ``chr`` so the builtin is not shadowed.
    """
    splitter = _Re(
        '(?s)((?:[\ud800-\udbff][\udc00-\udfff])|.)').split
    return [ch for ch in splitter(text) if ch]
Example #5
0
import urllib.request
from urllib.parse import quote
from bs4 import BeautifulSoup
import subprocess
import platform
import datetime
import json
import wget
import re
from re import compile as _Re

# Splits a string on either one CJK-range character (U+2E80–U+9FFF) or any
# single character; (?s) lets '.' also match newlines.
_unicode_chr_splitter = _Re('(?s)((?:[\u2e80-\u9fff])|.)').split

# Path (relative to this script) of the helper that adds cards to Anki.
Anki = "../../addToAnkiJapanese.py"


def look_up_from_yahoo(word, Collection, Deck, Download_dir):
    # Eliminate the end of line delimiter
    word = word.splitlines()[0]
    wordUrl = urllib.parse.quote(word, safe='')
    url = "http://jisho.org/search/{}".format(wordUrl)
    content = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(content, 'lxml')
    front_word = ""
    back_word = ""
    furi = ""
    furiChild = []
    furiList = []
    text = ""
    textChild = []
    textList = []
Example #6
0
    if request.is_ajax():
        html = render_to_string(template, context_dict or {}, 
            context_instance=RequestContext(request), **kwargs)
        return HttpResponse(html)
        
    else:
        context_dict['snippet'] = template
        template = 'generic_parent.html'
    
    return render_to_response(
        template, context_dict or {}, context_instance=RequestContext(request),
            **kwargs)  


from re import compile as _Re

# Matches either a whole UTF-16 surrogate pair or any single character
# ((?s) lets '.' also match newlines); compiled once at import time.
_unicode_chr_splitter = _Re('(?s)((?:[\ud800-\udbff][\udc00-\udfff])|.)').split


def _split_unicode_chrs(text):
    """Split *text* into single characters, dropping the empty strings
    that ``re.split`` inserts between adjacent matches.

    Fixes: loop variable renamed from ``chr`` (shadowed the builtin) and
    spacing normalized to PEP 8.
    """
    return [ch for ch in _unicode_chr_splitter(text) if ch]


def _is_punctuation(x):
    
    try:
        if x in string.whitespace:
            return True
            
        if x in string.punctuation:
            return True
                
        if unicodedata.category(x).startswith(('P', 'Z', 'S')):
            return True
Example #7
0
def split_unicode_chrs(text):
    """Split *text* into single characters, keeping UTF-16 surrogate pairs
    together, and drop the empty strings ``re.split`` produces.

    Fixes: tab indentation replaced with 4 spaces (the surrounding file uses
    spaces) and the loop variable renamed from ``chr``, which shadowed the
    builtin.
    """
    splitter = _Re('(?s)((?:[\ud800-\udbff][\udc00-\udfff])|.)').split
    return [ch for ch in splitter(text) if ch]
Example #8
0
import pinyin.cedict
import re
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from re import compile as _Re
import matplotlib.pyplot as plt

# In[ ]:

# Load the small English spaCy pipeline once at import time.
# NOTE(review): `spacy` itself is not imported in the visible snippet —
# confirm the import exists earlier in the file.
nlp = spacy.load('en_core_web_sm')

# In[2]:

# Matches either a whole UTF-16 surrogate pair or any single character
# ((?s) lets '.' also match newlines); compiled once at import time.
character_splitter = _Re('(?s)((?:[\ud800-\udbff][\udc00-\udfff])|.)').split


def split_characters(text):
    """Split *text* into single characters, dropping the empty strings
    that ``re.split`` inserts between adjacent matches.

    Fix: loop variable renamed from ``chr``, which shadowed the builtin.
    """
    return [ch for ch in character_splitter(text) if ch]


# In[3]:


def remove_chinese_symbol(text):
    """Return '' when *text* contains a CJK character, else *text* unchanged.

    NOTE: despite the name, this does not strip individual characters — one
    qualifying character blanks the whole string.  The range tests are
    strict, so U+4E00 and U+9FFF themselves do NOT trigger the blanking
    (behavior preserved from the original implementation).
    """
    if any(u'\u4e00' < ch < u'\u9fff' for ch in split_characters(text)):
        return ''
    return text
 def __init__(self):
     # Compile the splitter once per instance: matches either a whole UTF-16
     # surrogate pair or any single character ((?s) lets '.' match newlines).
     self._splitter = _Re( '(?s)((?:[\ud800-\udbff][\udc00-\udfff])|.)' ).split