def __init__(self, clean_html=True, remove_new_lines=True, hash_seed=None, remove_non_english=True,
             remove_stop_words=True, remove_punct=True, remove_non_alpha=True, replace_emails=True,
             replace_numbers=True, lemma=True, replace_urls=True, language='English',
             tokenization_method='byWords'):
    self.number_pattern = "NUMBER_PATTERN"
    self.url_pattern = "URL_PATTERN"
    self.email_pattern = "EMAIL_PATTERN"
    self.reserved_tokens = set([self.number_pattern, self.url_pattern, self.email_pattern])
    self.clean_html = clean_html
    self.remove_new_lines = remove_new_lines
    self.hash_seed = hash_seed
    self.remove_non_english = remove_non_english
    self.remove_stop_words = remove_stop_words
    self.remove_punct = remove_punct
    self.remove_non_alpha = remove_non_alpha
    self.replace_emails = replace_emails
    self.replace_urls = replace_urls
    self.replace_numbers = replace_numbers
    self.lemma = lemma
    self.language = language
    self.tokenization_method = tokenization_method
    self.max_text_length = 10 ** 5
    self.nlp = None
    self.html_parser = HTMLParser()
    # Splits text into user-perceived characters, keeping UTF-16 surrogate
    # pairs together as a single element.
    self._unicode_chr_splitter = _Re('(?s)((?:[\ud800-\udbff][\udc00-\udfff])|.)').split
    self.languages_to_model_names = {
        'English': 'en_core_web_sm',
        'German': 'de_core_news_sm',
        'French': 'fr_core_news_sm',
        'Spanish': 'es_core_news_sm',
        'Portuguese': 'pt_core_news_sm',
        'Italian': 'it_core_news_sm',
        'Dutch': 'nl_core_news_sm',
    }
    # Counters used to periodically reset the spaCy pipeline.
    self.spacy_count = 0
    self.spacy_reset_count = 500
def __init__(self, clean_html=True, remove_new_lines=True, hash_seed=None, remove_non_english=True,
             remove_stop_words=True, remove_punct=True, remove_non_alpha=True, replace_emails=True,
             replace_numbers=True, lemma=True, replace_urls=True, language=ANY_LANGUAGE,
             tokenization_method='tokenizer'):
    # Variant of the constructor above: it defaults to an ANY_LANGUAGE
    # constant (presumably defined at module level) and a 'tokenizer' method,
    # and drops the per-language spaCy model table.
    self.number_pattern = "NUMBER_PATTERN"
    self.url_pattern = "URL_PATTERN"
    self.email_pattern = "EMAIL_PATTERN"
    self.reserved_tokens = set([self.number_pattern, self.url_pattern, self.email_pattern])
    self.clean_html = clean_html
    self.remove_new_lines = remove_new_lines
    self.hash_seed = hash_seed
    self.remove_non_english = remove_non_english
    self.remove_stop_words = remove_stop_words
    self.remove_punct = remove_punct
    self.remove_non_alpha = remove_non_alpha
    self.replace_emails = replace_emails
    self.replace_urls = replace_urls
    self.replace_numbers = replace_numbers
    self.lemma = lemma
    self.language = language
    self.tokenization_method = tokenization_method
    self.max_text_length = 10 ** 5
    self.nlp = None
    self.html_parser = HTMLParser()
    self._unicode_chr_splitter = _Re('(?s)((?:[\ud800-\udbff][\udc00-\udfff])|.)').split
    self.spacy_count = 0
    self.spacy_reset_count = 500
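# A minimal, self-contained sketch (sample strings assumed) of what the
# surrogate-pair splitter compiled in both constructor variants above returns:
# re.split with a capturing group yields the captured characters interleaved
# with empty strings, so filtering out falsy entries gives one element per
# user-perceived character.
from re import compile as _Re

_demo_splitter = _Re('(?s)((?:[\ud800-\udbff][\udc00-\udfff])|.)').split

print([c for c in _demo_splitter('a\nb') if c])  # ['a', '\n', 'b'] -- (?s) lets '.' match newlines
# On Python 3, non-BMP characters are single code points, so the explicit
# surrogate-pair branch mainly matters for legacy narrow builds or text
# decoded with 'surrogatepass'.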
http://alt.qcri.org/semeval2016/task5/
https://github.com/magizbox/underthesea/wiki/SemEval-2016-Task-5
'''
from __future__ import absolute_import, division, unicode_literals

import os
import io
import logging
from re import compile as _Re

import numpy as np

from senteval.tools.validation import KFoldClassifier

_unicode_chr_splitter = _Re('(?s)((?:[\ud800-\udbff][\udc00-\udfff])|.)').split


def split_unicode_chrs(text):
    return [chr for chr in _unicode_chr_splitter(text) if chr]


def merge_two_dicts(x, y):
    z = x.copy()   # start with x's keys and values
    z.update(y)    # modifies z with y's keys and values & returns None
    return z


class ABSA_CHEval(object):
    def __init__(self, task_path, seed=1111):
        logging.info('***** Transfer task : ABSA_CH *****\n\n')
def split_unicode_chrs(text):
    _unicode_chr_splitter = _Re('(?s)((?:[\ud800-\udbff][\udc00-\udfff])|.)').split
    return [chr for chr in _unicode_chr_splitter(text) if chr]
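# Hedged usage sketch for split_unicode_chrs above (the inputs are made up).
# Note that this variant recompiles the pattern on every call; hoisting the
# compiled splitter to module level, as other snippets here do, avoids that
# repeated cost.
print(split_unicode_chrs('abc'))      # ['a', 'b', 'c']
print(split_unicode_chrs('a b\nc'))   # ['a', ' ', 'b', '\n', 'c']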
import urllib.request
from urllib.parse import quote
from bs4 import BeautifulSoup
import subprocess
import platform
import datetime
import json
import wget
import re
from re import compile as _Re

# This splitter targets the CJK block (U+2E80-U+9FFF) rather than surrogate pairs.
_unicode_chr_splitter = _Re('(?s)((?:[\u2e80-\u9fff])|.)').split

Anki = "../../addToAnkiJapanese.py"


def look_up_from_yahoo(word, Collection, Deck, Download_dir):
    # Despite the name, this looks the word up on jisho.org.
    # Eliminate the end-of-line delimiter.
    word = word.splitlines()[0]
    wordUrl = quote(word, safe='')
    url = "http://jisho.org/search/{}".format(wordUrl)
    content = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(content, 'lxml')
    front_word = ""
    back_word = ""
    furi = ""
    furiChild = []
    furiList = []
    text = ""
    textChild = []
    textList = []
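# Illustrative check (sample text assumed): the explicit U+2E80-U+9FFF branch
# above captures CJK code points, but since '.' under (?s) also matches any
# single code point, the result is one list element per character either way.
print([c for c in _unicode_chr_splitter(u'日本語abc') if c])
# ['日', '本', '語', 'a', 'b', 'c']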
# Tail of a Django render helper (the enclosing function signature is not part
# of the source snippet):
if request.is_ajax():
    html = render_to_string(template, context_dict or {},
                            context_instance=RequestContext(request), **kwargs)
    return HttpResponse(html)
else:
    context_dict['snippet'] = template
    template = 'generic_parent.html'
    return render_to_response(template, context_dict or {},
                              context_instance=RequestContext(request), **kwargs)


import string
import unicodedata
from re import compile as _Re

_unicode_chr_splitter = _Re('(?s)((?:[\ud800-\udbff][\udc00-\udfff])|.)').split


def _split_unicode_chrs(text):
    return [chr for chr in _unicode_chr_splitter(text) if chr]


def _is_punctuation(x):
    try:
        if x in string.whitespace:
            return True
        if x in string.punctuation:
            return True
        if unicodedata.category(x).startswith(('P', 'Z', 'S')):
            return True
    # The source is truncated here; an assumed completion follows.
    except TypeError:
        return False
    return False
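# Sanity checks for _is_punctuation as completed above (the except/fallthrough
# tail is an assumed reconstruction of the truncated source):
print(_is_punctuation(','))   # True  -- in string.punctuation
print(_is_punctuation(' '))   # True  -- in string.whitespace
print(_is_punctuation('£'))   # True  -- category 'Sc' starts with 'S'
print(_is_punctuation('a'))   # False -- falls through every check
print(_is_punctuation(3))     # False -- non-string input raises TypeError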
import pinyin.cedict
import re
import spacy  # required by spacy.load below
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from re import compile as _Re
import matplotlib.pyplot as plt

nlp = spacy.load('en_core_web_sm')

character_splitter = _Re('(?s)((?:[\ud800-\udbff][\udc00-\udfff])|.)').split


def split_characters(text):
    return [chr for chr in character_splitter(text) if chr]


def remove_chinese_symbol(text):
    # Blanks the whole string as soon as any CJK Unified Ideograph is found;
    # it does not strip individual characters.
    text_split = split_characters(text)
    for txt in text_split:
        if u'\u4e00' < txt < u'\u9fff':
            text = ''
    return text
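# Behavior check (sample strings assumed): remove_chinese_symbol drops the
# entire text once any CJK ideograph is present, which is worth knowing
# before reusing it as a generic character filter.
print(remove_chinese_symbol('hello'))     # 'hello'
print(remove_chinese_symbol('hello 中文'))  # ''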
def __init__(self):
    self._splitter = _Re('(?s)((?:[\ud800-\udbff][\udc00-\udfff])|.)').split