def separar_silabas(palavra, separador):
    """
    Splits the given word into syllables.

    The caller can also choose which separator to use, so the result is
    easier to fit into their own code.
    """
    # TODO: implement syllable processing natively instead of relying on pyphen
    from pyphen import Pyphen

    _palavra_sep = palavra.lower()
    dic = Pyphen(lang="pt_BR")
    _palavra_sep = dic.inserted(_palavra_sep)
    if separador == "-":
        return _palavra_sep
    _palavra_sep = str(_palavra_sep).replace("-", separador)
    return _palavra_sep
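# Usage sketch for separar_silabas above -- a minimal example, assuming pyphen and its
# pt_BR dictionary are installed; the sample word is illustrative only.
print(separar_silabas("computador", "-"))   # hyphen-separated syllables
print(separar_silabas("computador", "."))   # same split, custom separator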
def syllable_count(self, text, lang='en_US'):
    """
    Function to calculate syllable words in a text.
    I/P - a text
    O/P - number of syllable words
    """
    if text is None or len(text) == 0:
        return 0
    text = text.lower()
    # `exclude` is a module-level set of punctuation characters to strip
    text = "".join(x for x in text if x not in exclude)
    if len(text) == 0:
        return 0
    dic = Pyphen(lang=lang)
    count = 0
    for word in text.split(' '):
        word_hyphenated = dic.inserted(word)
        count += max(1, word_hyphenated.count("-") + 1)
    return count
def syllable_count(self, text, lang='en_US'):
    """
    Function to calculate syllable words in a text.
    I/P - a text
    O/P - number of syllable words
    """
    if isinstance(text, bytes):
        text = text.decode(self.text_encoding)
    text = text.lower()
    text = self.remove_punctuation(text)
    if not text:
        return 0
    dic = Pyphen(lang=lang)
    count = 0
    for word in text.split(' '):
        word_hyphenated = dic.inserted(word)
        count += max(1, word_hyphenated.count("-") + 1)
    return count
def n_syllables(doc: Doc):
    """Return the number of syllables per token."""
    dic = Pyphen(lang=doc.lang_)

    def count_syl(token: Token):
        word_hyphenated = dic.inserted(token.lower_)
        return max(1, word_hyphenated.count("-") + 1)

    return [count_syl(token) for token in doc._._filtered_tokens]
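# The n_syllables helper above reads a custom "_filtered_tokens" extension from the Doc.
# A hypothetical registration (the punctuation/space filter is an assumption, not taken
# from the original source):
from spacy.tokens import Doc

if not Doc.has_extension("_filtered_tokens"):
    Doc.set_extension(
        "_filtered_tokens",
        getter=lambda doc: [t for t in doc if not (t.is_punct or t.is_space)],
    )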
def syllable_counts(self, text):
    """
    Calculates the number of syllables per token.
    Punctuation is removed before tokenization.
    """
    text = Text.to_text(text)
    if not text.text:
        return 0
    dic = Pyphen(lang=self.__lang)

    def count_syl(token):
        word_hyphenated = dic.inserted(token.lower())
        return max(1, word_hyphenated.count("-") + 1)

    return [count_syl(token) for token in text.tokens_without_punctuation]
class textstatistics:
    __lang = "en_US"
    text_encoding = "utf-8"
    __easy_word_sets = {}
    __punctuation_regex = re.compile(f'[{re.escape(string.punctuation)}]')

    def __init__(self):
        self.set_lang(self.__lang)

    def _cache_clear(self):
        caching_methods = [
            method for method in dir(self)
            if callable(getattr(self, method))
            and hasattr(getattr(self, method), "cache_info")
        ]
        for method in caching_methods:
            getattr(self, method).cache_clear()

    def set_lang(self, lang):
        self.__lang = lang
        self.pyphen = Pyphen(lang=self.__lang)
        self._cache_clear()

    @lru_cache(maxsize=128)
    def char_count(self, text, ignore_spaces=True):
        """
        Function to return the total character count in a text;
        pass `ignore_spaces=False` to include whitespace in the count.
        """
        if ignore_spaces:
            text = text.replace(" ", "")
        return len(text)

    @lru_cache(maxsize=128)
    def letter_count(self, text, ignore_spaces=True):
        """
        Function to return the total letter count in a text;
        pass `ignore_spaces=False` to include whitespace in the count.
        """
        if ignore_spaces:
            text = text.replace(" ", "")
        return len(self.remove_punctuation(text))

    @classmethod
    def remove_punctuation(cls, text):
        return cls.__punctuation_regex.sub('', text)

    @lru_cache(maxsize=128)
    def lexicon_count(self, text, removepunct=True):
        """
        Function to return total lexicon (words in lay terms) counts in a text
        """
        if removepunct:
            text = self.remove_punctuation(text)
        count = len(text.split())
        return count

    @lru_cache(maxsize=128)
    def syllable_count(self, text, lang=None):
        """
        Function to calculate syllable words in a text.
        I/P - a text
        O/P - number of syllable words
        """
        if lang:
            warnings.warn(
                "The 'lang' argument has been moved to "
                "'textstat.set_lang(<lang>)'. This argument will be removed "
                "in the future.",
                DeprecationWarning)
        if isinstance(text, bytes):
            text = text.decode(self.text_encoding)
        text = text.lower()
        text = self.remove_punctuation(text)
        if not text:
            return 0

        count = 0
        for word in text.split(' '):
            count += len(self.pyphen.positions(word)) + 1
        return count

    @lru_cache(maxsize=128)
    def sentence_count(self, text):
        """
        Sentence count of a text
        """
        ignore_count = 0
        sentences = re.split(r' *[\.\?!][\'"\)\]]*[ |\n](?=[A-Z])', text)
        for sentence in sentences:
            if self.lexicon_count(sentence) <= 2:
                ignore_count += 1
        return max(1, len(sentences) - ignore_count)

    @lru_cache(maxsize=128)
    def avg_sentence_length(self, text):
        try:
            asl = float(self.lexicon_count(text) / self.sentence_count(text))
            return legacy_round(asl, 1)
        except ZeroDivisionError:
            return 0.0

    @lru_cache(maxsize=128)
    def avg_syllables_per_word(self, text, interval=None):
        syllable = self.syllable_count(text)
        words = self.lexicon_count(text)
        try:
            if interval:
                syllables_per_word = float(syllable) * interval / float(words)
            else:
                syllables_per_word = float(syllable) / float(words)
            return legacy_round(syllables_per_word, 1)
        except ZeroDivisionError:
            return 0.0

    @lru_cache(maxsize=128)
    def avg_character_per_word(self, text):
        try:
            letters_per_word = float(
                self.char_count(text) / self.lexicon_count(text))
            return legacy_round(letters_per_word, 2)
        except ZeroDivisionError:
            return 0.0

    @lru_cache(maxsize=128)
    def avg_letter_per_word(self, text):
        try:
            letters_per_word = float(
                self.letter_count(text) / self.lexicon_count(text))
            return legacy_round(letters_per_word, 2)
        except ZeroDivisionError:
            return 0.0

    @lru_cache(maxsize=128)
    def avg_sentence_per_word(self, text):
        try:
            sentence_per_word = float(
                self.sentence_count(text) / self.lexicon_count(text))
            return legacy_round(sentence_per_word, 2)
        except ZeroDivisionError:
            return 0.0
    @lru_cache(maxsize=128)
    def flesch_reading_ease(self, text):
        sentence_length = self.avg_sentence_length(text)
        s_interval = 100 if self.__get_lang_root() in ['es', 'it'] else None
        syllables_per_word = self.avg_syllables_per_word(text, s_interval)
        flesch = (
            self.__get_lang_cfg("fre_base")
            - float(self.__get_lang_cfg("fre_sentence_length") * sentence_length)
            - float(self.__get_lang_cfg("fre_syll_per_word") * syllables_per_word))
        return legacy_round(flesch, 2)

    @lru_cache(maxsize=128)
    def flesch_kincaid_grade(self, text):
        sentence_length = self.avg_sentence_length(text)
        syllables_per_word = self.avg_syllables_per_word(text)
        flesch = (float(0.39 * sentence_length)
                  + float(11.8 * syllables_per_word)
                  - 15.59)
        return legacy_round(flesch, 1)

    @lru_cache(maxsize=128)
    def polysyllabcount(self, text):
        count = 0
        for word in text.split():
            wrds = self.syllable_count(word)
            if wrds >= 3:
                count += 1
        return count

    @lru_cache(maxsize=128)
    def smog_index(self, text):
        sentences = self.sentence_count(text)
        if sentences >= 3:
            try:
                poly_syllab = self.polysyllabcount(text)
                smog = (
                    (1.043 * (30 * (poly_syllab / sentences)) ** .5)
                    + 3.1291)
                return legacy_round(smog, 1)
            except ZeroDivisionError:
                return 0.0
        else:
            return 0.0

    @lru_cache(maxsize=128)
    def coleman_liau_index(self, text):
        letters = legacy_round(self.avg_letter_per_word(text) * 100, 2)
        sentences = legacy_round(self.avg_sentence_per_word(text) * 100, 2)
        coleman = float((0.058 * letters) - (0.296 * sentences) - 15.8)
        return legacy_round(coleman, 2)

    @lru_cache(maxsize=128)
    def automated_readability_index(self, text):
        chrs = self.char_count(text)
        words = self.lexicon_count(text)
        sentences = self.sentence_count(text)
        try:
            a = float(chrs) / float(words)
            b = float(words) / float(sentences)
            readability = (
                (4.71 * legacy_round(a, 2))
                + (0.5 * legacy_round(b, 2))
                - 21.43)
            return legacy_round(readability, 1)
        except ZeroDivisionError:
            return 0.0

    @lru_cache(maxsize=128)
    def linsear_write_formula(self, text):
        easy_word = 0
        difficult_word = 0
        text_list = text.split()[:100]

        for word in text_list:
            if self.syllable_count(word) < 3:
                easy_word += 1
            else:
                difficult_word += 1

        text = ' '.join(text_list)
        number = float(
            (easy_word * 1 + difficult_word * 3) / self.sentence_count(text))
        if number <= 20:
            number -= 2
        return number / 2

    @lru_cache(maxsize=128)
    def difficult_words(self, text, syllable_threshold=2):
        return len(self.difficult_words_list(text, syllable_threshold))

    @lru_cache(maxsize=128)
    def difficult_words_list(self, text, syllable_threshold=2):
        words = set(re.findall(r"[\w\='‘’]+", text.lower()))
        diff_words = [
            word for word in words
            if self.is_difficult_word(word, syllable_threshold)
        ]
        return diff_words

    @lru_cache(maxsize=128)
    def is_difficult_word(self, word, syllable_threshold=2):
        easy_word_set = self.__get_lang_easy_words()
        if word in easy_word_set:
            return False
        if self.syllable_count(word) < syllable_threshold:
            return False
        return True

    @lru_cache(maxsize=128)
    def is_easy_word(self, word, syllable_threshold=2):
        return not self.is_difficult_word(word, syllable_threshold)

    @lru_cache(maxsize=128)
    def dale_chall_readability_score(self, text):
        word_count = self.lexicon_count(text)
        count = word_count - self.difficult_words(text, syllable_threshold=0)
        try:
            per = float(count) / float(word_count) * 100
        except ZeroDivisionError:
            return 0.0
        difficult_words = 100 - per
        score = (
            (0.1579 * difficult_words)
            + (0.0496 * self.avg_sentence_length(text)))
        if difficult_words > 5:
            score += 3.6365
        return legacy_round(score, 2)

    @lru_cache(maxsize=128)
    def gunning_fog(self, text):
        try:
            syllable_threshold = self.__get_lang_cfg("syllable_threshold")
            per_diff_words = (
                self.difficult_words(
                    text, syllable_threshold=syllable_threshold)
                / self.lexicon_count(text) * 100)
            grade = 0.4 * (self.avg_sentence_length(text) + per_diff_words)
            return legacy_round(grade, 2)
        except ZeroDivisionError:
            return 0.0

    @lru_cache(maxsize=128)
    def lix(self, text):
        words = text.split()
        words_len = len(words)
        long_words = len([wrd for wrd in words if len(wrd) > 6])
        per_long_words = (float(long_words) * 100) / words_len
        asl = self.avg_sentence_length(text)
        lix = asl + per_long_words
        return legacy_round(lix, 2)

    @lru_cache(maxsize=128)
    def rix(self, text):
        """
        A Rix ratio is simply the number of long words divided by
        the number of assessed sentences.
        rix = LW/S
        """
        words = text.split()
        long_words_count = len([wrd for wrd in words if len(wrd) > 6])
        sentences_count = self.sentence_count(text)
        try:
            rix = long_words_count / sentences_count
        except ZeroDivisionError:
            rix = 0.00
        return legacy_round(rix, 2)

    @lru_cache(maxsize=128)
    def spache_readability(self, text, float_output=True):
        """
        Function to calculate SPACHE readability formula for young readers.
        I/P - a text
        O/P - an int Spache Readability Index/Grade Level
        """
        total_no_of_words = self.lexicon_count(text)
        count_of_sentences = self.sentence_count(text)
        asl = total_no_of_words / count_of_sentences
        pdw = (self.difficult_words(text) / total_no_of_words) * 100
        spache = (0.141 * asl) + (0.086 * pdw) + 0.839
        if not float_output:
            return int(spache)
        else:
            return spache

    @lru_cache(maxsize=128)
    def dale_chall_readability_score_v2(self, text):
        """
        Function to calculate New Dale Chall Readability formula.
        I/P - a text
        O/P - an int Dale Chall Readability Index/Grade Level
        """
        total_no_of_words = self.lexicon_count(text)
        count_of_sentences = self.sentence_count(text)
        asl = total_no_of_words / count_of_sentences
        pdw = (self.difficult_words(text) / total_no_of_words) * 100
        raw_score = 0.1579 * (pdw) + 0.0496 * asl
        adjusted_score = raw_score
        if raw_score > 0.05:
            adjusted_score = raw_score + 3.6365
        return legacy_round(adjusted_score, 2)

    @lru_cache(maxsize=128)
    def text_standard(self, text, float_output=None):
        grade = []

        # Appending Flesch Kincaid Grade
        lower = legacy_round(self.flesch_kincaid_grade(text))
        upper = math.ceil(self.flesch_kincaid_grade(text))
        grade.append(int(lower))
        grade.append(int(upper))

        # Appending Flesch Reading Ease
        score = self.flesch_reading_ease(text)
        if score < 100 and score >= 90:
            grade.append(5)
        elif score < 90 and score >= 80:
            grade.append(6)
        elif score < 80 and score >= 70:
            grade.append(7)
        elif score < 70 and score >= 60:
            grade.append(8)
            grade.append(9)
        elif score < 60 and score >= 50:
            grade.append(10)
        elif score < 50 and score >= 40:
            grade.append(11)
        elif score < 40 and score >= 30:
            grade.append(12)
        else:
            grade.append(13)

        # Appending SMOG Index
        lower = legacy_round(self.smog_index(text))
        upper = math.ceil(self.smog_index(text))
        grade.append(int(lower))
        grade.append(int(upper))

        # Appending Coleman_Liau_Index
        lower = legacy_round(self.coleman_liau_index(text))
        upper = math.ceil(self.coleman_liau_index(text))
        grade.append(int(lower))
        grade.append(int(upper))

        # Appending Automated_Readability_Index
        lower = legacy_round(self.automated_readability_index(text))
        upper = math.ceil(self.automated_readability_index(text))
        grade.append(int(lower))
        grade.append(int(upper))

        # Appending Dale_Chall_Readability_Score
        lower = legacy_round(self.dale_chall_readability_score(text))
        upper = math.ceil(self.dale_chall_readability_score(text))
        grade.append(int(lower))
        grade.append(int(upper))

        # Appending Linsear_Write_Formula
        lower = legacy_round(self.linsear_write_formula(text))
        upper = math.ceil(self.linsear_write_formula(text))
        grade.append(int(lower))
        grade.append(int(upper))

        # Appending Gunning Fog Index
        lower = legacy_round(self.gunning_fog(text))
        upper = math.ceil(self.gunning_fog(text))
        grade.append(int(lower))
        grade.append(int(upper))

        # Finding the Readability Consensus based upon all the above tests
        d = Counter(grade)
        final_grade = d.most_common(1)
        score = final_grade[0][0]

        if float_output:
            return float(score)
        else:
            lower_score = int(score) - 1
            upper_score = lower_score + 1
            return "{}{} and {}{} grade".format(
                lower_score, get_grade_suffix(lower_score),
                upper_score, get_grade_suffix(upper_score))

    @lru_cache(maxsize=128)
    def reading_time(self, text, ms_per_char=14.69):
        """
        Function to calculate reading time (Demberg & Keller, 2008)
        I/P - a text
        O/P - reading time in seconds
        """
        words = text.split()
        nchars = map(len, words)
        rt_per_word = map(lambda nchar: nchar * ms_per_char, nchars)
        reading_time = sum(list(rt_per_word))
        return legacy_round(reading_time / 1000, 2)

    # Spanish readability tests

    @lru_cache(maxsize=128)
    def fernandez_huerta(self, text):
        '''
        Fernandez Huerta readability score
        https://legible.es/blog/lecturabilidad-fernandez-huerta/
        '''
        sentence_length = self.avg_sentence_length(text)
        syllables_per_word = self.avg_syllables_per_word(text)
        f_huerta = (
            206.85
            - float(60 * syllables_per_word)
            - float(1.02 * sentence_length))
        return legacy_round(f_huerta, 1)

    @lru_cache(maxsize=128)
    def szigriszt_pazos(self, text):
        '''
        Szigriszt Pazos readability score (1992)
        https://legible.es/blog/perspicuidad-szigriszt-pazos/
        '''
        syllables = self.syllable_count(text)
        total_words = self.lexicon_count(text)
        total_sentences = self.sentence_count(text)
        s_p = (
            self.__get_lang_cfg("fre_base")
            - 62.3 * (syllables / total_words)
            - (total_words / total_sentences))
        return legacy_round(s_p, 2)

    @lru_cache(maxsize=128)
    def gutierrez_polini(self, text):
        '''
        Gutierrez de Polini index
        https://legible.es/blog/comprensibilidad-gutierrez-de-polini/
        '''
        total_words = self.lexicon_count(text)
        total_letters = self.letter_count(text)
        total_sentences = self.sentence_count(text)
        gut_pol = (
            95.2
            - 9.7 * (total_letters / total_words)
            - 0.35 * (total_words / total_sentences))
        return legacy_round(gut_pol, 2)

    @lru_cache(maxsize=128)
    def crawford(self, text):
        '''
        Crawford index
        https://legible.es/blog/formula-de-crawford/
        '''
        total_sentences = self.sentence_count(text)
        total_words = self.lexicon_count(text)
        total_syllables = self.syllable_count(text)

        # Calculating __ per 100 words
        sentences_per_words = 100 * (total_sentences / total_words)
        syllables_per_words = 100 * (total_syllables / total_words)

        craw_years = (
            -0.205 * sentences_per_words
            + 0.049 * syllables_per_words
            - 3.407)
        return legacy_round(craw_years, 1)

    def __get_lang_cfg(self, key):
        """ Read as get lang config """
        default = langs.get("en")
        config = langs.get(self.__get_lang_root(), default)
        return config.get(key, default.get(key))

    def __get_lang_root(self):
        return self.__lang.split("_")[0]

    def __get_lang_easy_words(self):
        lang = self.__get_lang_root()
        if lang not in self.__easy_word_sets:
            try:
                easy_word_set = {
                    ln.decode("utf-8").strip()
                    for ln in pkg_resources.resource_stream(
                        "textstat",
                        f"resources/{lang}/easy_words.txt",
                    )
                }
            except FileNotFoundError:
                warnings.warn(
                    "There is no easy words vocabulary for "
                    f"{self.__lang}, using english.",
                    Warning,
                )
                easy_word_set = {
                    ln.decode("utf-8").strip()
                    for ln in pkg_resources.resource_stream(
                        "textstat", "resources/en/easy_words.txt")
                }
            self.__easy_word_sets[lang] = easy_word_set
        return self.__easy_word_sets[lang]
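# Illustrative use of the textstatistics class above, assuming the module-level `langs`
# configuration and the `legacy_round`/`get_grade_suffix` helpers from the original
# library are present; exact scores depend on the installed Pyphen dictionaries.
ts = textstatistics()
sample = "The quick brown fox jumps over the lazy dog. It was not amused at all."
print(ts.syllable_count(sample))
print(ts.flesch_reading_ease(sample))
ts.set_lang("de_DE")   # swaps the hyphenation dictionary and clears all lru_cache caches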
def _shyphenate_text(dic: pyphen.Pyphen, text: str) -> str:
    if len(text) < 5:
        return text
    else:
        return " ".join(
            dic.inserted(word, hyphen=SOFT_HYPHEN) for word in text.split(" "))
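# SOFT_HYPHEN is defined elsewhere in the original module; a self-contained sketch,
# assuming the constant is the soft-hyphen character U+00AD:
import pyphen

SOFT_HYPHEN = "\u00ad"
dic = pyphen.Pyphen(lang="en_US")
print(_shyphenate_text(dic, "hyphenation improves narrow column layout"))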
def read_data(args, config):
    '''read data sets, construct all needed structures and update the config'''
    if args.ssm == '1':
        config.ssm = 1

    hyphenator = Pyphen(lang=args.lang)

    def my_characters(word):
        return ['⎡'] + list(word) + ['⎦']

    if args.is_train == '1':
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        with open(os.path.join(
                args.save_dir, args.prefix + '-data.pkl'), 'wb') as data_file:
            word_data = open(os.path.join(args.data_dir, 'train.txt'), 'r').read() \
                .replace('\n', args.eos).split()
            words = list(set(word_data))

            characters = set()
            word_lens_in_char = []
            for word in words:
                chars = my_characters(word)
                word_lens_in_char.append(len(chars))
                for char in chars:
                    characters.add(char)
            chars_list = list(characters)

            pickle.dump(
                (word_data, words, word_lens_in_char, chars_list), data_file)
    else:
        with open(os.path.join(
                args.save_dir, args.prefix + '-data.pkl'), 'rb') as data_file:
            word_data, words, word_lens_in_char, chars_list = \
                pickle.load(data_file)

    word_data_size, word_vocab_size = len(word_data), len(words)
    print('data has %d words, %d unique' % (word_data_size, word_vocab_size))
    config.word_vocab_size = word_vocab_size
    config.num_sampled = int(word_vocab_size * 0.2)

    word_to_ix = {word: i for i, word in enumerate(words)}
    ix_to_word = {i: word for i, word in enumerate(words)}

    def get_word_raw_data(input_file):
        data = open(input_file, 'r').read().replace('\n', args.eos).split()
        return [word_to_ix[w] for w in data]

    train_raw_data = get_word_raw_data(os.path.join(args.data_dir, 'train.txt'))
    valid_raw_data = get_word_raw_data(os.path.join(args.data_dir, 'valid.txt'))
    test_raw_data = get_word_raw_data(os.path.join(args.data_dir, 'test.txt'))

    char_vocab_size = len(chars_list)
    max_word_len = int(np.percentile(word_lens_in_char, 100))
    config.max_word_len = max_word_len
    print('data has %d unique chars' % char_vocab_size)
    print('max word length in chars is set to', max_word_len)

    # a fake character for zero-padding
    zero_pad_char = ' '
    chars_list.insert(0, zero_pad_char)
    char_vocab_size += 1
    config.char_vocab_size = char_vocab_size

    char_to_ix = {char: i for i, char in enumerate(chars_list)}
    ix_to_char = {i: char for i, char in enumerate(chars_list)}

    word_ix_to_char_ixs = {}
    for word in words:
        word_ix = word_to_ix[word]
        word_in_chars = my_characters(word)
        word_in_chars += [zero_pad_char] * (max_word_len - len(word_in_chars))
        word_ix_to_char_ixs[word_ix] = \
            [char_to_ix[char] for char in word_in_chars]

    return train_raw_data, valid_raw_data, test_raw_data, word_ix_to_char_ixs
from utils.web.slack_api import parse_config
from utils.web.slack_api.big_emoji import resize_image, resize_gif
from utils.web.slack_api.text_to_emoji import text_to_emoji
from utils.web.servers.core import register_cmd, SlackInfo, slack_api, app, init_slack_api, gen_help_str, \
    send_to_channel, request_in_loop, no_dm, run_in_executor

__author__ = 'acushner'

from utils.web.servers.incident import IncidentInfo, init_incident_store

_admins = parse_config().admin_id_name_map

DEFAULT_SIZE_MULT = 6.
MAX_SIZE_MULT = 15.

_pyphen = Pyphen(lang='en')


@register_cmd
@no_dm
async def embiggen(si: SlackInfo):
    """emoji [size_multiple] [_size_multiple_]: multiple to scale up/down emoji size by

    only works on custom emoji due to download issues from slack"""
    emoji, *rest = si.argstr.split()
    mult = min(MAX_SIZE_MULT, float(first(rest, DEFAULT_SIZE_MULT)))
    if mult <= 0:
        return text(f'invalid mult: {mult}')

    all_emoji = await slack_api.get_emoji()
    try:
def hyphenate(text, hyphen='\u00ad'):   # default separator is a soft hyphen (U+00AD)
    py = Pyphen(lang='de_de')
    words = text.split(' ')
    return ' '.join([py.inserted(word, hyphen=hyphen) for word in words])
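# Example call for hyphenate above, assuming the de_de Pyphen dictionary is available;
# a visible separator is passed here because the default soft hyphen does not print.
print(hyphenate("Silbentrennung verbessert den Blocksatz", hyphen="|"))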
class Syllable:
    def __init__(self, syl):
        self.id = ID.next()
        self.text = syl
        self.phonemes = []
        self.stressed = False

    def __str__(self):
        arr = []
        for ph in self.phonemes:
            arr.append(ph.text)
        return u'{} ({})'.format(self.text, arr)


hyp = Pyphen(lang='pl_PL')

ph_map = {
    'I': 'y',
    'en': u'ę',
    'on': u'ą',
    'v': 'w',
    'S': 'sz',
    'Z': u'ż',
    'si': u'ś',
    'zi': u'ź',
    'x': 'h',
    'ts': 'c',
    'tS': 'cz',
    'dZ': u'dż',
    'ni': u'ń',
class GermaLemma(object):
    """
    Lemmatizer for German language text main class.
    """
    pyphen_dic = Pyphen(lang='de')

    def __init__(self, **kwargs):
        """
        Initialize the GermaLemma lemmatizer.
        By default, it will load the lemmatizer data from 'data/lemmata.pickle'.
        You can also pass a manual lemmata dictionary via `lemmata`, load a corpus
        in CONLL09 format via `tiger_corpus`, or load pickled lemmatizer data
        from `pickle`.
        Force usage of the pattern.de module by setting `use_pattern_module` to
        True (or False for not using it). By default, it will try to use
        pattern.de if it is installed.
        """
        if 'lemmata' in kwargs:
            self.lemmata = kwargs['lemmata']
            if 'lemmata_lower' in kwargs:
                self.lemmata_lower = kwargs['lemmata_lower']
            else:
                self.lemmata_lower = {
                    pos: {token.lower(): lemma for token, lemma in pos_lemmata}
                    for pos, pos_lemmata in self.lemmata.items()
                }
        elif 'tiger_corpus' in kwargs:
            self.lemmata, self.lemmata_lower = self.load_corpus_lemmata(
                kwargs['tiger_corpus'])
        elif 'pickle' in kwargs:
            self.load_from_pickle(kwargs['pickle'])
        else:
            try:
                self.load_from_pickle(DEFAULT_LEMMATA_PICKLE)
            except FileNotFoundError:
                self.load_from_pickle(
                    os.path.join(sys.prefix, DEFAULT_LEMMATA_PICKLE))

        self.pattern_module = None
        use_pattern_module = kwargs.get('use_pattern_module', None)
        if use_pattern_module in (True, None):
            try:
                self.pattern_module = import_module('pattern.de')
            except ImportError:
                if use_pattern_module is True:
                    raise ImportError('pattern.de module could not be loaded')

    def find_lemma(self, w, pos_tag):
        """
        Find a lemma for word `w` that has a Part-of-Speech tag `pos_tag`.
        `pos_tag` should be a valid STTS tagset tag
        (see http://www.ims.uni-stuttgart.de/forschung/ressourcen/lexika/TagSets/stts-table.html)
        or a simplified form with:
        - 'N' for nouns
        - 'V' for verbs
        - 'ADJ' for adjectives
        - 'ADV' for adverbs
        All other tags will raise a ValueError("Unsupported POS tag")!
        Return the lemma or, if no lemma was found, return `w`.
        """
        if not w:   # do not process empty strings
            return w

        if pos_tag == 'NE':   # if the word is a name, it already is the lemma
            return w

        if pos_tag.startswith('N') or pos_tag.startswith('V'):
            pos = pos_tag[0]
        elif pos_tag.startswith('ADJ') or pos_tag.startswith('ADV'):
            pos = pos_tag[:3]
        else:
            raise ValueError("Unsupported POS tag")

        # check if we can directly find `w` in the lemmata dictionary
        res = self.dict_search(w, pos)

        if not res and self.pattern_module:   # try to use the pattern.de module
            res_pattern = self._lemma_via_patternlib(w, pos)
            if res_pattern != w:
                res = res_pattern

        if not res:
            # try to split nouns that are made of composita
            if pos == 'N':
                res = self._composita_lemma(w) or w
            else:
                res = w

            # try to lemmatize adjectives using prevalent German adjective suffixes
            if pos == 'ADJ':
                res = self._adj_lemma(res)

        # nouns always start with a capital letter
        if pos == 'N':
            if len(res) > 1 and res[0].islower():
                res = res[0].upper() + res[1:]
        else:   # all other forms are lower-case
            res = res.lower()

        return res

    def dict_search(self, w, pos, use_lower=False):
        """
        Lemmata dictionary lookup for word `w` with POS tag `pos`.
        Return lemma if found, else None.
        """
        pos_lemmata = self.lemmata_lower[pos] if use_lower else self.lemmata[pos]

        return pos_lemmata.get(w, None)

    def _adj_lemma(self, w):
        """
        Try to lemmatize adjectives using prevalent German adjective suffixes.
        Return the possibly lemmatized adjective.
        """
        for full, reduced in ADJ_SUFFIXES_DICT.items():
            if w.endswith(full):
                return w[:-len(full)] + reduced

        return w

    def _composita_lemma(self, w):
        """
        Try to split a word `w` that is possibly made of composita.
        Return the lemma if found, else return None.
        """
        # find the most important split position first when a hyphen is used in the word
        try:
            split_positions = [w.rfind('-') + 1]
        except ValueError:
            split_positions = []

        # add possible split positions by using Pyphen's hyphenation positions
        split_positions.extend([
            p for p in self.pyphen_dic.positions(w)
            if p not in split_positions
        ])

        # now split `w` by the hyphenation positions step by step
        for hy_pos in split_positions:
            # split into left and right parts (start and end of the string)
            left, right = w[:hy_pos], w[hy_pos:]

            # check if the right part can be found in the lemmata dictionary;
            # if we have a noun, a lower-case match will also be accepted
            if left and right and not right.endswith('innen'):
                res = self.dict_search(right, 'N',
                                       use_lower=right[0].islower())
                if res:
                    # concatenate the left side with the found partial lemma
                    if left[-1] == '-':
                        res = left + res.capitalize()
                    else:
                        res = left + res.lower()

                    if w.isupper():
                        return res.upper()
                    else:
                        return res

        return None

    def _lemma_via_patternlib(self, w, pos):
        """
        Try to find a lemma for word `w` with Part-of-Speech tag `pos`
        by using the pattern.de module's functions.
        Return the lemma or `w` if lemmatization was not possible with pattern.de.
        """
        if not self.pattern_module:
            raise RuntimeError('pattern.de module not loaded')

        if pos == 'NP':   # singularize noun
            return self.pattern_module.singularize(w)
        elif pos.startswith('V'):   # get infinitive of verb
            return self.pattern_module.conjugate(w)
        elif pos.startswith('ADJ') or pos.startswith('ADV'):
            # get base form of adjective or adverb
            return self.pattern_module.predicative(w)

        return w

    @classmethod
    def load_corpus_lemmata(cls, corpus_file):
        lemmata = defaultdict(dict)
        lemmata_lower = defaultdict(dict)

        with codecs.open(corpus_file, encoding="utf-8") as f:
            for line in f:
                parts = line.split()
                if len(parts) == 15:
                    token, lemma = parts[1:3]
                    pos = parts[4]
                    cls.add_to_lemmata_dicts(lemmata, lemmata_lower,
                                             token, lemma, pos)

        return lemmata, lemmata_lower

    @staticmethod
    def add_to_lemmata_dicts(lemmata, lemmata_lower, token, lemma, pos):
        for pos_prefix in VALID_POS_PREFIXES:
            if pos.startswith(pos_prefix):
                if token not in lemmata[pos_prefix]:
                    lemmata[pos_prefix][token] = lemma
                if lemma not in lemmata[pos_prefix]:   # for quicker lookup
                    lemmata[pos_prefix][lemma] = lemma

                if pos_prefix == 'N':
                    token_lower = token.lower()
                    if token_lower not in lemmata_lower[pos_prefix]:
                        lemmata_lower[pos_prefix][token_lower] = lemma
                    lemma_lower = lemma.lower()
                    if lemma_lower not in lemmata_lower[pos_prefix]:
                        lemmata_lower[pos_prefix][lemma_lower] = lemma

                return

    def save_to_pickle(self, pickle_file):
        with open(pickle_file, 'wb') as f:
            pickle.dump((self.lemmata, self.lemmata_lower), f, protocol=2)

    def load_from_pickle(self, pickle_file):
        with open(pickle_file, 'rb') as f:
            self.lemmata, self.lemmata_lower = pickle.load(f)
def read_data(args, config):
    '''read data sets, construct all needed structures and update the config'''
    if args.ssm == '1':
        config.ssm = 1

    hyphenator = Pyphen(lang=args.dict)

    def my_syllables(word):
        return hyphenator.inserted(word).split('-')

    if args.is_train == '1':
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        with open(os.path.join(args.save_dir, args.prefix + '-data.pkl'),
                  'wb') as data_file:
            word_data = open(
                os.path.join(args.data_dir, 'train.txt'), 'r').read() \
                .replace('\n', args.eos).split()
            words = list(set(word_data))

            syllables = set()
            word_lens_in_syl = []
            for word in words:
                syls = my_syllables(word)
                word_lens_in_syl.append(len(syls))
                for syl in syls:
                    syllables.add(syl)
            syls_list = list(syllables)

            pickle.dump((word_data, words, word_lens_in_syl, syls_list),
                        data_file)
    else:
        with open(os.path.join(args.save_dir, args.prefix + '-data.pkl'),
                  'rb') as data_file:
            word_data, words, word_lens_in_syl, syls_list = pickle.load(
                data_file)

    word_data_size, word_vocab_size = len(word_data), len(words)
    print('data has %d words, %d unique' % (word_data_size, word_vocab_size))
    config.word_vocab_size = word_vocab_size
    config.num_sampled = int(word_vocab_size * 0.2)

    word_to_ix = {word: i for i, word in enumerate(words)}
    ix_to_word = {i: word for i, word in enumerate(words)}

    def get_word_raw_data(input_file):
        data = open(input_file, 'r').read().replace('\n', args.eos).split()
        return [word_to_ix[w] for w in data]

    train_raw_data = get_word_raw_data(os.path.join(args.data_dir, 'train.txt'))
    valid_raw_data = get_word_raw_data(os.path.join(args.data_dir, 'valid.txt'))
    test_raw_data = get_word_raw_data(os.path.join(args.data_dir, 'test.txt'))

    syl_vocab_size = len(syls_list)
    max_word_len = int(np.percentile(word_lens_in_syl, 100))
    config.max_word_len = max_word_len
    print('data has %d unique syllables' % syl_vocab_size)
    print('max word length in syllables is set to', max_word_len)

    # a fake syllable for zero-padding
    zero_pad_syl = ' '
    syls_list.insert(0, zero_pad_syl)
    syl_vocab_size += 1
    config.syl_vocab_size = syl_vocab_size

    syl_to_ix = {syl: i for i, syl in enumerate(syls_list)}
    ix_to_syl = {i: syl for i, syl in enumerate(syls_list)}

    word_ix_to_syl_ixs = {}
    for word in words:
        word_ix = word_to_ix[word]
        word_in_syls = my_syllables(word)
        word_in_syls += [zero_pad_syl] * (max_word_len - len(word_in_syls))
        word_ix_to_syl_ixs[word_ix] = [syl_to_ix[syl] for syl in word_in_syls]

    return train_raw_data, valid_raw_data, test_raw_data, word_ix_to_syl_ixs
def set_lang(self, lang):
    self.__lang = lang
    self.pyphen = Pyphen(lang=self.__lang)
    self._cache_clear()
class CustomGermaLemma(object):
    """
    Lemmatizer for German language text main class.
    """
    pyphen_dic = Pyphen(lang='de')

    def __init__(self, **kwargs):
        if 'tiger_corpus' in kwargs:
            self.lemmata, self.lemmata_lower = self.load_corpus_lemmata(
                kwargs['tiger_corpus'])
        elif 'pickle' in kwargs:
            self.load_from_pickle(kwargs['pickle'])

        self.pattern_module = import_module('pattern.de')
        self.iwnlpLemmatizer = CustomIWNLPLemmatizer(
            join(FILE_PATH, "lib", "IWNLP.Lemmatizer_20170501.json"))

    def find_lemma(self, w, pos, props=None):
        # do not process empty strings
        if not w:
            raise ValueError("Empty String!")
        # valid pos = N, V, ADJ, ADV
        elif pos not in ["NOUN", "VERB", "ADJ", "ADV", "AUX"]:
            return w

        iwnlpLemmas = self.iwnlpLemmatizer.lemmatize(w, pos)
        if iwnlpLemmas:
            return iwnlpLemmas, None

        if pos.startswith('N') or pos.startswith('V'):
            pos = pos[0]
        elif pos.startswith('ADJ') or pos.startswith('ADV'):
            pos = pos[:3]
        elif pos == "AUX":
            pos = "V"

        # check if we can directly find `w` in the lemmata dictionary
        res = self.dict_search(w, pos)
        composita = None
        if not res:
            # try to split nouns that are made of composita
            if pos == 'N':
                compositaRes = self._composita_lemma(w)
                res = compositaRes[0]
                if len(compositaRes) > 1:
                    composita = compositaRes[1:]
            # try to lemmatize adjectives using prevalent German adjective suffixes
            elif pos == 'ADJ':
                res = self._adj_lemma(w)

        # try to use the pattern.de module
        if not res and props and self.pattern_module:
            res_pattern = self._lemma_via_patternlib(w, pos, props)
            if res_pattern != w:
                res = res_pattern

        if res:
            # nouns always start with a capital letter
            if pos == 'N':
                if len(res) > 1 and res[0].islower():
                    res = res[0].upper() + res[1:]
            else:
                res = res.lower()
            return [res], composita

        return res, composita

    def dict_search(self, w, pos, use_lower=False):
        """
        Lemmata dictionary lookup for word `w` with POS tag `pos`.
        Return lemma if found, else None.
        """
        pos_lemmata = self.lemmata_lower[pos] if use_lower else self.lemmata[pos]

        return pos_lemmata.get(w, None)

    def _adj_lemma(self, w):
        """
        Try to lemmatize adjectives using prevalent German adjective suffixes.
        Return the possibly lemmatized adjective.
        """
        for full, reduced in ADJ_SUFFIXES_DICT.items():
            if w.endswith(full):
                return w[:-len(full)] + reduced

        return None

    def _composita_lemma(self, w):
        """
        Try to split a word `w` that is possibly made of composita.
        Return the lemma if found, else return None.
        """
        # find the most important split position first; only the right part needs to exist
        try:
            split_positions = [w.rfind('-') + 1]
        except ValueError:
            split_positions = []

        split_positions.extend([
            p for p in self.pyphen_dic.positions(w)
            if p not in split_positions
        ])

        for hy_pos in split_positions:
            left, right = w[:hy_pos], w[hy_pos:]
            if left and right and not right.endswith('innen'):
                resRight = self.dict_search(right, 'N',
                                            use_lower=right[0].islower())
                if not resRight:
                    resRight = self.iwnlpLemmatizer.lemmatize(right, "NOUN")
                    if resRight:
                        resRight = resRight[0]
                if resRight:
                    resLeft = self.dict_search(left, 'N',
                                               use_lower=left[0].islower())
                    if not resLeft:
                        resLeft = self.iwnlpLemmatizer.lemmatize(left, "NOUN")
                        if resLeft:
                            resLeft = resLeft[0]
                    if not resLeft:
                        resLeft = self.dict_search(left[:-1], 'N',
                                                   use_lower=left[0].islower())
                    if not resLeft:
                        resLeft = self.iwnlpLemmatizer.lemmatize(left[:-1],
                                                                 "NOUN")
                        if resLeft:
                            resLeft = resLeft[0]

                    # concatenate the left side with the found partial lemma
                    if left[-1] == '-':
                        res = left + resRight.capitalize()
                    else:
                        res = left + resRight.lower()

                    resList = []
                    if w.isupper():
                        resList.append(res.upper())
                    else:
                        resList.append(res.capitalize())
                    resList.append(resRight.capitalize())
                    if resLeft:
                        resList.append(resLeft.capitalize())
                    return resList

        # try other split positions; both parts need to exist
        split_positions = [
            i for i in range(3, len(w) - 2) if i not in split_positions
        ]
        for hy_pos in split_positions:
            left, right = w[:hy_pos], w[hy_pos:]
            if left and right and not right.endswith('innen'):
                resRight = self.dict_search(right, 'N',
                                            use_lower=right[0].islower())
                if not resRight:
                    resRight = self.iwnlpLemmatizer.lemmatize(right, "NOUN")
                    if resRight:
                        resRight = resRight[0]
                resLeft = self.dict_search(left, 'N',
                                           use_lower=left[0].islower())
                if not resLeft:
                    resLeft = self.iwnlpLemmatizer.lemmatize(left, "NOUN")
                    if resLeft:
                        resLeft = resLeft[0]
                if not resLeft:
                    resLeft = self.dict_search(left[:-1], 'N',
                                               use_lower=left[0].islower())
                    if not resLeft:
                        resLeft = self.iwnlpLemmatizer.lemmatize(left[:-1],
                                                                 "NOUN")
                        if resLeft:
                            resLeft = resLeft[0]
                if resRight and resLeft:
                    res = left + resRight.lower()
                    resList = []
                    if w.isupper():
                        resList.append(res.upper())
                    else:
                        resList.append(res.capitalize())
                    resList.append(resRight.capitalize())
                    resList.append(resLeft.capitalize())
                    return resList

        return [None]

    def _lemma_via_patternlib(self, w, pos, props={}):
        """
        Try to find a lemma for word `w` with Part-of-Speech tag `pos`
        by using the pattern.de module's functions.
        Return the lemma or `w` if lemmatization was not possible with pattern.de.
        """
        if not self.pattern_module:
            raise RuntimeError('pattern.de module not loaded')

        if pos.startswith('N') and "number" in props and props["number"] != "Sg":
            # pos == 'NP': singularize noun
            return self.pattern_module.singularize(w)
        elif pos.startswith('V') and "form" in props and props["form"] != "INF":
            # get infinitive of verb
            return self.pattern_module.conjugate(w)
        elif pos.startswith('ADJ') or pos.startswith('ADV'):
            # get base form of adjective or adverb
            return self.pattern_module.predicative(w)

        return w

    @staticmethod
    def add_to_lemmata_dicts(lemmata, lemmata_lower, token, lemma, pos):
        for pos_prefix in VALID_POS_PREFIXES:
            if pos.startswith(pos_prefix):
                if token not in lemmata[pos_prefix]:
                    lemmata[pos_prefix][token] = lemma
                if lemma not in lemmata[pos_prefix]:   # for quicker lookup
                    lemmata[pos_prefix][lemma] = lemma

                if pos_prefix == 'N':
                    token_lower = token.lower()
                    if token_lower not in lemmata_lower[pos_prefix]:
                        lemmata_lower[pos_prefix][token_lower] = lemma
                    lemma_lower = lemma.lower()
                    if lemma_lower not in lemmata_lower[pos_prefix]:
                        lemmata_lower[pos_prefix][lemma_lower] = lemma

                return

    @classmethod
    def load_corpus_lemmata(cls, corpus_file):
        lemmata = defaultdict(dict)
        lemmata_lower = defaultdict(dict)

        with codecs.open(corpus_file, encoding="utf-8") as f:
            for line in f:
                parts = line.split()
                if len(parts) == 15:
                    token, lemma = parts[1:3]
                    pos = parts[4]
                    cls.add_to_lemmata_dicts(lemmata, lemmata_lower,
                                             token, lemma, pos)

        return lemmata, lemmata_lower

    def save_to_pickle(self, pickle_file):
        with open(pickle_file, 'wb') as f:
            pickle.dump((self.lemmata, self.lemmata_lower), f, protocol=2)

    def load_from_pickle(self, pickle_file):
        with open(pickle_file, 'rb') as f:
            self.lemmata, self.lemmata_lower = pickle.load(f)
class ContentCleaner:
    def __init__(self, dataset, content_column):
        self.dataset = dataset.reset_index()
        self.content_column = content_column
        self.dic = Pyphen(lang='en_US')
        self.process_data()

    def __str__(self):
        return """
        This class takes a raw dataset and builds a clean NLP dataset
        with features out of it
        """

    def lower_case(self):
        self.dataset[self.content_column] = self.dataset[
            self.content_column].str.lower()

    def remove_html_tags(self):
        cleanr = re.compile('<.*?>.*<.*>')
        self.dataset[self.content_column] = [
            re.sub(cleanr, '', r) for r in self.dataset[self.content_column]
        ]

    def stem_words(self):
        """
        https://stackoverflow.com/questions/38763007/how-to-use-spacy-lemmatizer-to-get-a-word-into-basic-form
        """
        print("Stemming Words")
        for i, row in tqdm(self.dataset.iterrows()):
            stemmed_string = ""
            content_row = nlp(row["content"])
            for word in content_row:
                stemmed_string += " " + word.lemma_
            self.dataset.loc[i, "content"] = stemmed_string

    def remove_stop_words(self):
        print("Removing Stop Words")
        for i, row in tqdm(self.dataset.iterrows()):
            sentence_sans_stop_words = ""
            content_row = nlp(row["content"])
            for word in content_row:
                if word.is_stop is False:
                    sentence_sans_stop_words += " " + word.text
            self.dataset.loc[i, "content"] = sentence_sans_stop_words
            self.dataset.loc[i, "num_words"] = len(content_row)

    def count_adjectives(self):
        """
        see:
        https://spacy.io/api/annotation
        https://spacy.io/usage/linguistic-features
        """
        print("Counting Adjectives")
        for i, row in tqdm(self.dataset.iterrows()):
            adjective_count = 0
            content_row = nlp(row["content"])
            for word in content_row:
                if word.pos_ == "ADJ":
                    adjective_count += 1
            self.dataset.loc[i, "adjectives"] = adjective_count

    def biggest_word(self):
        """
        Taken from https://github.com/shivam5992/textstat
        """
        self.dic = Pyphen(lang='en_US')
        print("Finding Biggest Words")
        for i, row in tqdm(self.dataset.iterrows()):
            biggest_word = 0
            content_row = nlp(row["content"])
            for word in content_row:
                word_hyphenated = self.dic.inserted(word.text)
                word_size = max(1, word_hyphenated.count("-") + 1)
                if word_size > biggest_word:
                    biggest_word = word_size
            self.dataset.loc[i, "biggest_word_syllables"] = biggest_word

    def readability_score(self):
        """
        Taken from https://github.com/shivam5992/textstat
        Based on the Flesch Reading Ease formula
        """
        def avg_sentence_length(text):
            sentences = re.split(r' *[\.\?!][\'"\)\]]*[ |\n](?=[A-Z])', text)
            ignore_count = 0
            sentence_lengths = []
            for sentence in sentences:
                if len(sentence.split(" ")) <= 2:
                    ignore_count += 1
                else:
                    sentence_lengths.append(len(sentence.split(" ")))
            sentence_count = max(1, len(sentences) - ignore_count)
            sentence_length_mean = sum(sentence_lengths)
            return sentence_length_mean / sentence_count

        def avg_syllables_per_word(text):
            words = nlp(row["content"])
            syllables = []
            self.dic = Pyphen(lang='en_US')
            for word in words:
                word_hyphenated = self.dic.inserted(word.text)
                syllables.append(max(1, word_hyphenated.count("-") + 1))
            return sum(syllables) / len(words)

        def legacy_round(number, points=0):
            p = 10 ** points
            return float(
                math.floor((number * p) + math.copysign(0.5, number))) / p

        # code from https://github.com/shivam5992/textstat
        print("Assessing Readability Score")
        for i, row in tqdm(self.dataset.iterrows()):
            sentence_length = avg_sentence_length(row["content"])
            syllables_per_word = avg_syllables_per_word(row["content"])
            flesch = (206.835
                      - float(1.015 * sentence_length)
                      - float(84.6 * syllables_per_word))
            Flesch_reading_score = legacy_round(flesch, 2)
            self.dataset.loc[i, "flesch_reading_score"] = Flesch_reading_score

    def count_alliteration(self):
        print("Counting Alliteration")
        for i, row in tqdm(self.dataset.iterrows()):
            repeat_letter = None
            consecutive = False
            alliteration_count = 0
            if len(row["content"]) > 0:
                words = row["content"].split(" ")
                for word in words:
                    if len(word) > 0:
                        # Start of a new alliteration
                        if str(word)[0] == repeat_letter and consecutive is False:
                            alliteration_count += 1
                            repeat_letter = str(word)[0]
                            consecutive = True
                        # In the middle of a consecutive streak of alliteration
                        elif str(word)[0] == repeat_letter and consecutive:
                            repeat_letter = str(word)[0]
                        # End of an alliteration
                        elif str(word)[0] != repeat_letter:
                            repeat_letter = str(word)[0]
                            consecutive = False
                self.dataset.loc[i, "alliteration"] = alliteration_count
            else:
                self.dataset.loc[i, "alliteration"] = 0

    def process_data(self):
        self.count_alliteration()
        self.count_adjectives()
        self.biggest_word()
        self.readability_score()
        self.remove_html_tags()
        self.lower_case()
        self.remove_stop_words()
        self.stem_words()
import logging

logging.basicConfig(filename="log.txt",
                    level=logging.INFO,
                    format="%(asctime)s %(message)s")

db, db_c = db_init()

app = Flask(__name__)
app.config["JSONIFY_PRETTYPRINT_REGULAR"] = True
app.secret_key = load_config("secret_key")
login_manager = LoginManager(app)

game_nwords = int(load_config("nwords"))
wordlist_file = "res/wordlists/de.txt"
words = [l.split() for l in open(wordlist_file).readlines()]
article_choices = ["der", "die", "das"]
hyphen_dic = Pyphen(lang="de_DE")

story_users = load_story_users()
story_filenames = find_story_filenames()


@login_manager.user_loader
def load_user(username):
    db_c.execute("SELECT username,displayname,hash from users WHERE username=?",
                 [username])
    data = db_c.fetchone()
    if data is None:
        return None
    return User(data[0], data[1], data[2])


@login_manager.unauthorized_handler
def unauthorized_callback():
import nltk
from nltk.corpus import words
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import cmudict
# from spellchecker import SpellChecker
# nltk.download('words')
# nltk.download('cmudict')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
import math
import re
import string
import syllables
from pyphen import Pyphen

d = cmudict.dict()
dic = Pyphen(lang="en")
# tool = language_tool_python.LanguageTool('en-US')


def removePunctuation(text):
    result = ""
    for char in text:
        if char in (".", ",", "!", "?", "؟", "،", "\\", "/", "\"", "#", "$",
                    "%", "&", "'", "(", ")", "*", "+", ":", ";", "<", ">",
                    "=", "[", "]", "^", "_", "`", "{", "}", "|", "~"):
            continue
        if char in ("-", "\n", "\r", "\t"):
            char = " "
def __init__(self, dataset, content_column):
    self.dataset = dataset.reset_index()
    self.content_column = content_column
    self.dic = Pyphen(lang='en_US')
    self.process_data()
import textstat
from sklearn.preprocessing import label_binarize
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import pkg_resources
import ast
import spacy
#from collections import Counter
from pyphen import Pyphen
import pickle
#import xgboost

# load the language model from spacy. this must be downloaded
nlp = spacy.load('en_core_web_md')
pyphen_dic = Pyphen(lang='en')

# set word lists to be used
## This corpus comes from the Cambridge English Corpus of spoken English and includes
## all the NGSL and SUP words needed to get 90% coverage.
NGSL_wordlist = set([
    ln.decode('utf-8').strip() for ln in pkg_resources.resource_stream(
        'financial_readability', 'word_lists/NGSL_wordlist.txt')
])

## The Business Service List 1.0, also known as the BSL (Browne, C. & Culligan, B., 2016),
## is a list of approximately 1700 words that occur with very high frequency within the
## domain of general business English. Based on a 64.5 million word corpus of business
## texts, newspapers, journals and websites, the BSL 1.0 version gives approximately 97%
## coverage of general business English materials when learned in combination with the
## 2800 words of core general English in the New General Service List or NGSL
## (Browne, C., Culligan, B., and Phillips, J. 2013).
BSL_wordlist = set([
def __init__(self, language):
    self.pyphen = Pyphen(lang=language)
def count_syllables(word):
    pyphen_dic = Pyphen(lang='en')
    syllabled_word = pyphen_dic.inserted(word)
    return syllabled_word.count('-') + 1
                    type=str,
                    help='E-Mail subject related to survey mails')
parser.add_argument('-notxt', action='store_true',
                    help='Disable saving of results to txt file')
parser.add_argument('-nobar', action='store_true',
                    help='Disable plotting of bar plots')
parser.add_argument('-nopie', action='store_true',
                    help='Disable plotting of pie plots')
args = parser.parse_args()

### CONFIG - SET VARIABLES AND DEFAULTS HERE ###

# pyphen dictionary
german_dict = Pyphen(lang='de_DE')

# e-mail information
login = args.login
password = args.password
pop_server = (args.pop_server if args.pop_server else 'pop3.web.de')
filter_subject = (args.subject if args.subject else 'Evaluation')

# file information
write_txt = not args.notxt
write_bars = not args.nobar
write_pies = not args.nopie
txt_file_name = 'results.txt'
bar_file_name = 'result_bars.pdf'
pie_file_name = 'result_pies.pdf'

# allowed text lengths until new line for plot labels
pie_wrap_len = 19
bar_wrap_len = 10
def DataPreprocessing(data, train=1):
    global docCount

    # EXTRACTING DENSE FEATURES
    sentiment = np.array([])
    word_count = np.array([])
    char_count = np.array([])
    sent_count = np.array([])
    syl_count = np.array([])
    mention_count = np.array([])
    url_count = np.array([])
    special_count = np.array([])
    cat_count = np.array([])
    dic = Pyphen(lang='en')

    for text in data["tweet"]:
        blob = TextBlob(text)
        # OPTIONAL SPELLING CORRECTION
        # data.loc[docCount, "tweet"] = str(blob.correct())
        # print(data.loc[docCount, "tweet"], type(data.loc[docCount, "tweet"]))
        url_count = np.append(url_count, blob.words.count("URL"))
        mention_count = np.append(mention_count, blob.words.count("USER"))
        cat_count = np.append(cat_count, sum(c == '#' for c in text))
        special_count = np.append(
            special_count,
            sum(not c.isalnum() and c != ' ' and c != '@' and c != '#'
                for c in text))
        syl_count = np.append(
            syl_count,
            len(TextBlob(dic.inserted(text).replace('-', ' ')).words))
        char_count = np.append(char_count, len(text))
        word_count = np.append(word_count, len(blob.words))
        sent_count = np.append(sent_count, len(blob.sentences))
        sentiment = np.append(sentiment, blob.sentiment.polarity)
        docCount += 1

    # INITIALIZING STEMMER AND STOP WORD CORPUS
    stop_words = set(stopwords.words('english'))
    porter_stemmer = PorterStemmer()

    # POS TAGGING
    POS = CMUTweetTagger.runtagger_parse(data["tweet"])
    POSDictionary = {
        "N": "nn", "O": "pro", "S": "np", "^": "nnps", "Z": "nnpz",
        "L": "vl", "M": "nv", "V": "md", "A": "adj", "R": "adv",
        "!": "int", "D": "det", "P": "ppt", "&": "cc", "T": "rp",
        "X": "ex", "Y": "exv", "#": "cat", "@": "tar", "~": "dsc",
        ",": "punc", "$": "num", "U": "url", "E": "emo", "G": "abr"
    }

    # PREPROCESSING (REMOVE STOP WORDS AND STEMMING)
    docCount = 0
    for doc in POS:
        filtered_sentence = []
        for word in doc:
            if word[0] not in stop_words:
                filtered_sentence.append(porter_stemmer.stem(word[0]))
                # + '_' + POSDictionary[word[1]]
        data.loc[docCount, "tweet"] = filtered_sentence
        data.loc[docCount, "tweet"] = " ".join(data.loc[docCount, "tweet"])
        docCount += 1

    # REPLACING LABEL (subtask) WITH INTEGER
    if train == 1:
        data['label'] = data['subtask'].factorize()[0]

    data['sentiment'] = sentiment + 1
    data['sent_count'] = sent_count
    data['word_count'] = word_count
    data['syl_count'] = syl_count
    data['url_count'] = url_count
    data['mention_count'] = mention_count
    data['cat_count'] = cat_count
    data['special_count'] = special_count

    # SEPARATING FEATURES AND LABELS
    X = data[[
        'tweet', 'sentiment', 'sent_count', 'word_count', 'syl_count',
        'url_count', 'mention_count', 'special_count', 'cat_count'
    ]]
    if train == 1:
        y = data['label']
    else:
        y = None
    return X, y
def __init__(self, lang='en_US'):
    self.dic = Pyphen(lang=lang)
import re

from pandocfilters import Para, Str, toJSONFilter, walk
from pyphen import Pyphen

dic = Pyphen(lang='en_US', left=3, right=3)
word_detection_pattern = re.compile(r'\w{7,}', re.UNICODE)


def inpara(key, value, format, meta):
    if key == 'Para':
        return Para(walk(value, hyphenate, format, meta))


def hyphenate(key, value, format, meta):
    if key == 'Str':
        return Str(word_detection_pattern.sub(
            lambda match: dic.inserted(match.group(0), hyphen='\u00ad'),
            value))


if __name__ == "__main__":
    toJSONFilter(inpara)
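# The filter above is meant to be run by pandoc over JSON on stdin/stdout, e.g. (assuming
# the script is saved as hyphenate_filter.py and marked executable):
#   pandoc input.md --filter ./hyphenate_filter.py -o output.html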
def count_syllables(word):
    return max(1, len(Pyphen(lang='en_US').hd.positions(word)) + 1)
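# Hedged alternative to count_syllables above (not from the original source): reuse a
# module-level dictionary and call the public positions() wrapper instead of reaching
# into the internal hd attribute.
from pyphen import Pyphen

_DIC = Pyphen(lang='en_US')

def count_syllables_public(word):
    return max(1, len(_DIC.positions(word)) + 1)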