def syllable_count(self, text, lang=None):
    """
    Function to calculate syllable words in a text.
    I/P - a text
    O/P - number of syllable words
    """
    if lang:
        warnings.warn(
            "The 'lang' argument has been moved to "
            "'textstat.set_lang(<lang>)'. This argument will be removed "
            "in the future.",
            DeprecationWarning
        )
    if isinstance(text, bytes):
        text = text.decode(self.text_encoding)

    text = text.lower()
    text = self.remove_punctuation(text)
    if not text:
        return 0

    dic = Pyphen(lang=self.__lang)
    count = 0
    for word in text.split(' '):
        word_hyphenated = dic.inserted(word)
        count += max(1, word_hyphenated.count("-") + 1)
    return count
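# A minimal, self-contained sketch (not part of the snippet above) of the counting
# idea that most of these functions share: Pyphen.inserted() returns the word with a
# hyphen at each hyphenation point, so the syllable estimate is the number of hyphens
# plus one. The word and language below are arbitrary examples.
from pyphen import Pyphen

dic = Pyphen(lang='en_US')
word_hyphenated = dic.inserted("hyphenation")   # e.g. "hy-phen-ation" (dictionary dependent)
estimated_syllables = max(1, word_hyphenated.count("-") + 1)
print(word_hyphenated, estimated_syllables)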
def _count_syllables(self, word):
    """Counting the syllables in a word."""
    dic = Pyphen(lang=self.pyphen_language)
    word = dic.inserted(word)
    s_count = word.count("-") + 1
    return s_count
def count_syllables(word):
    # necessary for the syllable count
    dic = Pyphen(lang='en_EN')
    word_hyphenated = dic.inserted(word)
    # triple hyphens resulting from hyphens inside the normal word
    # need to be reduced to single hyphens
    word_hyphenated = word_hyphenated.replace("---", "-")
    syllables = max(1, word_hyphenated.count("-") + 1)
    return syllables
def syllable_count(text):
    text = text.lower()
    text = "".join(x for x in text if x not in exclude)
    dic = Pyphen(lang='ru_RU')
    count = 0
    for word in text.split(' '):
        word_hyphenated = dic.inserted(word)
        count += max(1, word_hyphenated.count("-") + 1)
    return count
def avg_syllables_per_word(text):
    # tokenize the input text itself (the original body referenced an unrelated
    # dataframe row and an undefined `self`)
    words = nlp(text)
    syllables = []
    dic = Pyphen(lang='en_US')
    for word in words:
        word_hyphenated = dic.inserted(word.text)
        syllables.append(max(1, word_hyphenated.count("-") + 1))
    return sum(syllables) / len(words)
def count_syllables(self):
    """
    Count the syllables of a text, using the Pyphen dictionary to split the
    words into syllables and then counting the syllables with the regular
    expression shown below.
    :return: number of syllables found
    """
    dic = Pyphen(lang='en')
    text = ' '.join(dic.inserted(w, ' ') for w in self.text.split())
    return len(re.findall(r'(\w+|\,|\;|\b\.|\:|\?)', text))
def split_syllables(self, lang='fr'):
    """
    Split the text into syllables. If the language of the text is not French,
    it must be specified and it must be available in the pyphen module
    (pyphen.LANGUAGES.keys()).
    :param lang: language of the text
    :return: returns `self` so that operations can be chained
    """
    dic = Pyphen(lang=lang)
    self.text = ' '.join(dic.inserted(w, '/') for w in self.text.split()).replace('-/', '-')
    return self
def n_syllables(doc: Doc):
    """Return number of syllables per token"""
    dic = Pyphen(lang=doc.lang_)

    def count_syl(token: Token):
        word_hyphenated = dic.inserted(token.lower_)
        return max(1, word_hyphenated.count("-") + 1)

    return [count_syl(token) for token in doc._._filtered_tokens]
def Morphemes(s_word, lang, measures):
    di = Pyphen(lang=lang)
    morphemes = []
    # Pyphen.iterate() yields one (head, tail) pair per possible hyphenation point
    for pair in di.iterate(s_word):
        morphemes.append(pair)
    if len(morphemes) != 0:
        measures["morphemes"] = morphemes
        measures["morphemes_count"] = len(morphemes[0])
    else:
        measures["morphemes"] = "Not Found"
        measures["morphemes_count"] = 0
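# A hedged illustration (separate from the function above) of what Pyphen.iterate()
# produces and what Morphemes() collects into measures["morphemes"]. The word and
# language are arbitrary examples; the exact splits depend on the installed dictionary.
from pyphen import Pyphen

di = Pyphen(lang='en_US')
for head, tail in di.iterate("information"):
    # each pair is the word split at one valid hyphenation position,
    # e.g. ('informa', 'tion'), ('infor', 'mation'), ... depending on the dictionary
    print(head, tail)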
def split_syllables(self, lang='en'):
    """
    Split the text into syllables. If the language of the text is not English,
    it must be specified and it must be available in the pyphen module
    (pyphen.LANGUAGES.keys()).
    :param lang: language of the text
    :return: returns `text` in the requested form
    """
    dic = Pyphen(lang=lang)
    text = ' '.join(dic.inserted(w, ' ') for w in self.text.split())
    text = re.sub(r'(\w+|\,|\;|\:|\.\W+\b)', r'\1-', text)
    text = re.sub(r'(\s)', r'', text)
    return text
def separar_silabas(palavra, separador):
    """
    Function that splits the given word into syllables. The user can also
    choose which separator to use, so the result fits their own code better.
    """
    # TODO: implement syllable processing natively
    from pyphen import Pyphen
    _palavra_sep = palavra.lower()
    dic = Pyphen(lang="pt_BR")
    _palavra_sep = dic.inserted(_palavra_sep)
    if separador == "-":
        return _palavra_sep
    _palavra_sep = str(_palavra_sep).replace("-", separador)
    return _palavra_sep
def syllable_counts(self, text):
    """
    Calculates number of syllables per token
    Punctuation is removed before tokenization
    """
    text = Text.to_text(text)
    if not text.text:
        return 0
    dic = Pyphen(lang=self.__lang)

    def count_syl(token):
        word_hyphenated = dic.inserted(token.lower())
        return max(1, word_hyphenated.count("-") + 1)

    return [count_syl(token) for token in text.tokens_without_punctuation]
def Syllable_Count(s_word):
    exclude = list(string.punctuation)
    s_word = s_word.lower()
    s_word = "".join(x for x in s_word if x not in exclude)
    if s_word is None:
        return 0
    elif len(s_word) == 0:
        return 0
    else:
        dic = Pyphen(lang='en_US')
        count = 0
        for word in s_word.split(' '):
            word_hyphenated = dic.inserted(word)
            count += max(1, word_hyphenated.count("-") + 1)
        return count
def count_syl_line(line):
    """Creates a dictionary that maps each word in a line to its syllable
    count, and returns the total number of syllables in the line"""
    from pyphen import Pyphen
    from re import split
    dic = Pyphen(lang='en-US')
    words = split(" ", line)
    syllables = {}
    j = 0
    if line == '':
        return -1
    else:
        for i in words:
            syllables[i] = len(split("-", dic.inserted(words[j])))
            j += 1
        line_syl_count = int(sum(syllables.values()))
        return line_syl_count
def Syllable_Count(s_word, lang, measures):
    exclude = list(string.punctuation)
    s_word = s_word.lower()
    s_word = "".join(x for x in s_word if x not in exclude)
    if s_word is None:
        measures["no_of_syllables"] = 0
    elif len(s_word) == 0:
        measures["no_of_syllables"] = 0
    else:
        dic = Pyphen(lang=lang)
        count = 0
        for word in s_word.split(' '):
            word_hyphenated = dic.inserted(word)
            count += max(1, word_hyphenated.count("-") + 1)
        measures["no_of_syllables"] = count
def biggest_word(self):
    """
    Taken from https://github.com/shivam5992/textstat
    """
    self.dic = Pyphen(lang='en_US')
    print("Finding Biggest Words")
    for i, row in tqdm(self.dataset.iterrows()):
        biggest_word = 0
        content_row = nlp(row["content"])
        for word in content_row:
            word_hyphenated = self.dic.inserted(word.text)
            word_size = max(1, word_hyphenated.count("-") + 1)
            if word_size > biggest_word:
                biggest_word = word_size
        self.dataset.loc[i, "biggest_word_syllables"] = biggest_word
def syllable_count(text, lang='en_US'):
    """
    Function to calculate syllable words in a text.
    I/P - a text
    O/P - number of syllable words
    """
    text = text.lower()
    text = delete_mask_return_sen(text)
    text = remove_punctuation(text).strip()
    if not text:
        return 0
    dic = Pyphen(lang=lang)
    count = 0
    for word in text.split(' '):
        if word:
            word_hyphenated = dic.inserted(word)
            count += max(1, word_hyphenated.count("-") + 1)
    return count
def syllable_count(self, text, lang='en_US'):
    """
    Function to calculate syllable words in a text.
    I/P - a text
    O/P - number of syllable words
    """
    text = text.lower()
    text = "".join(x for x in text if x not in exclude)
    if not text:
        return 0
    dic = Pyphen(lang=lang)
    count = 0
    for word in text.split(' '):
        word_hyphenated = dic.inserted(word)
        count += max(1, word_hyphenated.count("-") + 1)
    return count
def syllable_count(self, text, lang='en_US'):
    """
    Function to calculate syllable words in a text.
    I/P - a text
    O/P - number of syllable words
    """
    if isinstance(text, bytes):
        text = text.decode(self.text_encoding)
    text = text.lower()
    text = self.remove_punctuation(text)
    if not text:
        return 0
    dic = Pyphen(lang=lang)
    count = 0
    for word in text.split(' '):
        word_hyphenated = dic.inserted(word)
        count += max(1, word_hyphenated.count("-") + 1)
    return count
import textstat
from sklearn.preprocessing import label_binarize
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import pkg_resources
import ast
import spacy
#from collections import Counter
from pyphen import Pyphen
import pickle
#import xgboost

# load the language model from spaCy; this must be downloaded
nlp = spacy.load('en_core_web_md')
pyphen_dic = Pyphen(lang='en')

# set word lists to be used
## This corpus comes from the Cambridge English Corpus of spoken English and includes
## all the NGSL and SUP words needed to get 90% coverage.
NGSL_wordlist = set([
    ln.decode('utf-8').strip() for ln in pkg_resources.resource_stream(
        'financial_readability', 'word_lists/NGSL_wordlist.txt')
])

## The Business Service List 1.0, also known as the BSL (Browne, C. & Culligan, B., 2016), is a list
## of approximately 1700 words that occur with very high frequency within the domain of general
## business English. Based on a 64.5 million word corpus of business texts, newspapers, journals and
## websites, the BSL 1.0 version gives approximately 97% coverage of general business English
## materials when learned in combination with the 2800 words of core general English in the
## New General Service List or NGSL (Browne, C., Culligan, B., and Phillips, J. 2013)
BSL_wordlist = set([
def __init__(self, language):
    self.pyphen = Pyphen(lang=language)
                    type=str,
                    help='E-Mail subject related to survey mails')
parser.add_argument('-notxt', action='store_true',
                    help='Disable saving of results to txt file')
parser.add_argument('-nobar', action='store_true',
                    help='Disable plotting of bar plots')
parser.add_argument('-nopie', action='store_true',
                    help='Disable plotting of pie plots')
args = parser.parse_args()

### CONFIG - SET VARIABLES AND DEFAULTS HERE ###

#pyphen dictionary
german_dict = Pyphen(lang='de_DE')

#e-mail information
login = args.login
password = args.password
pop_server = (args.pop_server if args.pop_server else 'pop3.web.de')
filter_subject = (args.subject if args.subject else 'Evaluation')

#file information
write_txt = not args.notxt
write_bars = not args.nobar
write_pies = not args.nopie
txt_file_name = 'results.txt'
bar_file_name = 'result_bars.pdf'
pie_file_name = 'result_pies.pdf'

#allowed text lengths until new line for plot labels
pie_wrap_len = 19
bar_wrap_len = 10
class GermaLemma(object):
    """
    Lemmatizer for German language text main class.
    """
    pyphen_dic = Pyphen(lang='de')

    def __init__(self, **kwargs):
        """
        Initialize GermaLemma lemmatizer. By default, it will load the lemmatizer data from
        'data/lemmata.pickle'. You can also pass a manual lemmata dictionary via `lemmata` or load
        a corpus in CONLL09 format via `tiger_corpus` or load pickled lemmatizer data from `pickle`.
        Force usage of pattern.de module by setting `use_pattern_module` to True (or False for not
        using). By default, it will try to use pattern.de if it is installed.
        """
        if 'lemmata' in kwargs:
            self.lemmata = kwargs['lemmata']
            if 'lemmata_lower' in kwargs:
                self.lemmata_lower = kwargs['lemmata_lower']
            else:
                self.lemmata_lower = {
                    pos: {token.lower(): lemma for token, lemma in pos_lemmata}
                    for pos, pos_lemmata in self.lemmata.items()
                }
        elif 'tiger_corpus' in kwargs:
            self.lemmata, self.lemmata_lower = self.load_corpus_lemmata(
                kwargs['tiger_corpus'])
        elif 'pickle' in kwargs:
            self.load_from_pickle(kwargs['pickle'])
        else:
            try:
                self.load_from_pickle(DEFAULT_LEMMATA_PICKLE)
            except FileNotFoundError:
                self.load_from_pickle(
                    os.path.join(sys.prefix, DEFAULT_LEMMATA_PICKLE))

        self.pattern_module = None
        use_pattern_module = kwargs.get('use_pattern_module', None)
        if use_pattern_module in (True, None):
            try:
                self.pattern_module = import_module('pattern.de')
            except ImportError:
                if use_pattern_module is True:
                    raise ImportError('pattern.de module could not be loaded')

    def find_lemma(self, w, pos_tag):
        """
        Find a lemma for word `w` that has a Part-of-Speech tag `pos_tag`. `pos_tag` should be a
        valid STTS tagset tag
        (see http://www.ims.uni-stuttgart.de/forschung/ressourcen/lexika/TagSets/stts-table.html)
        or a simplified form with:
        - 'N' for nouns
        - 'V' for verbs
        - 'ADJ' for adjectives
        - 'ADV' for adverbs
        All other tags will raise a ValueError("Unsupported POS tag")!
        Return the lemma or, if no lemma was found, return `w`.
        """
        if not w:   # do not process empty strings
            return w

        if pos_tag == 'NE':   # if word is a name, it already is the lemma
            return w

        if pos_tag.startswith('N') or pos_tag.startswith('V'):
            pos = pos_tag[0]
        elif pos_tag.startswith('ADJ') or pos_tag.startswith('ADV'):
            pos = pos_tag[:3]
        else:
            raise ValueError("Unsupported POS tag")

        # look if we can directly find `w` in the lemmata dictionary
        res = self.dict_search(w, pos)

        if not res and self.pattern_module:   # try to use pattern.de module
            res_pattern = self._lemma_via_patternlib(w, pos)
            if res_pattern != w:
                res = res_pattern

        if not res:
            # try to split nouns that are made of composita
            if pos == 'N':
                res = self._composita_lemma(w) or w
            else:
                res = w

            # try to lemmatize adjectives using prevalent German language adjective suffixes
            if pos == 'ADJ':
                res = self._adj_lemma(res)

        # nouns always start with a capital letter
        if pos == 'N':
            if len(res) > 1 and res[0].islower():
                res = res[0].upper() + res[1:]
        else:   # all other forms are lower-case
            res = res.lower()

        return res

    def dict_search(self, w, pos, use_lower=False):
        """
        Lemmata dictionary lookup for word `w` with POS tag `pos`.
        Return lemma if found, else None.
        """
        pos_lemmata = self.lemmata_lower[pos] if use_lower else self.lemmata[pos]

        return pos_lemmata.get(w, None)

    def _adj_lemma(self, w):
        """
        Try to lemmatize adjectives using prevalent German language adjective suffixes.
        Return possibly lemmatized adjective.
        """
        for full, reduced in ADJ_SUFFIXES_DICT.items():
            if w.endswith(full):
                return w[:-len(full)] + reduced

        return w

    def _composita_lemma(self, w):
        """
        Try to split a word `w` that is possibly made of composita.
        Return the lemma if found, else return None.
        """
        # find most important split position first when a hyphen is used in the word
        try:
            split_positions = [w.rfind('-') + 1]
        except ValueError:
            split_positions = []

        # add possible split positions by using Pyphen's hyphenation positions
        split_positions.extend([
            p for p in self.pyphen_dic.positions(w)
            if p not in split_positions
        ])

        # now split `w` by hyphenation step by step
        for hy_pos in split_positions:
            # split in left and right parts (start and end of the strings)
            left, right = w[:hy_pos], w[hy_pos:]

            # look if the right part can be found in the lemmata dictionary
            # if we have a noun, a lower case match will also be accepted
            if left and right and not right.endswith('innen'):
                res = self.dict_search(right, 'N', use_lower=right[0].islower())
                if res:
                    # concatenate the left side with the found partial lemma
                    if left[-1] == '-':
                        res = left + res.capitalize()
                    else:
                        res = left + res.lower()

                    if w.isupper():
                        return res.upper()
                    else:
                        return res

        return None

    def _lemma_via_patternlib(self, w, pos):
        """
        Try to find a lemma for word `w` that has a Part-of-Speech tag `pos_tag` by using
        pattern.de module's functions. Return the lemma or `w` if lemmatization was not possible
        with pattern.de
        """
        if not self.pattern_module:
            raise RuntimeError('pattern.de module not loaded')

        if pos == 'NP':   # singularize noun
            return self.pattern_module.singularize(w)
        elif pos.startswith('V'):   # get infinitive of verb
            return self.pattern_module.conjugate(w)
        elif pos.startswith('ADJ') or pos.startswith('ADV'):   # get baseform of adjective or adverb
            return self.pattern_module.predicative(w)

        return w

    @classmethod
    def load_corpus_lemmata(cls, corpus_file):
        lemmata = defaultdict(dict)
        lemmata_lower = defaultdict(dict)

        with codecs.open(corpus_file, encoding="utf-8") as f:
            for line in f:
                parts = line.split()
                if len(parts) == 15:
                    token, lemma = parts[1:3]
                    pos = parts[4]
                    cls.add_to_lemmata_dicts(lemmata, lemmata_lower, token, lemma, pos)

        return lemmata, lemmata_lower

    @staticmethod
    def add_to_lemmata_dicts(lemmata, lemmata_lower, token, lemma, pos):
        for pos_prefix in VALID_POS_PREFIXES:
            if pos.startswith(pos_prefix):
                if token not in lemmata[pos_prefix]:
                    lemmata[pos_prefix][token] = lemma
                if lemma not in lemmata[pos_prefix]:   # for quicker lookup
                    lemmata[pos_prefix][lemma] = lemma

                if pos_prefix == 'N':
                    token_lower = token.lower()
                    if token_lower not in lemmata_lower[pos_prefix]:
                        lemmata_lower[pos_prefix][token_lower] = lemma
                    lemma_lower = lemma.lower()
                    if lemma_lower not in lemmata_lower[pos_prefix]:
                        lemmata_lower[pos_prefix][lemma_lower] = lemma

                return

    def save_to_pickle(self, pickle_file):
        with open(pickle_file, 'wb') as f:
            pickle.dump((self.lemmata, self.lemmata_lower), f, protocol=2)

    def load_from_pickle(self, pickle_file):
        with open(pickle_file, 'rb') as f:
            self.lemmata, self.lemmata_lower = pickle.load(f)
def DataPreprocessing(data, train=1):
    global docCount

    # EXTRACTING DENSE FEATURES
    sentiment = np.array([])
    word_count = np.array([])
    char_count = np.array([])
    sent_count = np.array([])
    syl_count = np.array([])
    mention_count = np.array([])
    url_count = np.array([])
    special_count = np.array([])
    cat_count = np.array([])
    dic = Pyphen(lang='en')

    for text in data["tweet"]:
        blob = TextBlob(text)
        # OPTIONAL SPELLING CORRECTION
        #data.loc[docCount,"tweet"]=str(blob.correct())
        #print(data.loc[docCount,"tweet"],type(data.loc[docCount,"tweet"]))
        url_count = np.append(url_count, blob.words.count("URL"))
        mention_count = np.append(mention_count, blob.words.count("USER"))
        cat_count = np.append(cat_count, sum(c == '#' for c in text))
        special_count = np.append(
            special_count,
            sum(not c.isalnum() and c != ' ' and c != '@' and c != '#'
                for c in text))
        syl_count = np.append(
            syl_count,
            len(TextBlob(dic.inserted(text).replace('-', ' ')).words))
        char_count = np.append(char_count, len(text))
        word_count = np.append(word_count, len(blob.words))
        sent_count = np.append(sent_count, len(blob.sentences))
        sentiment = np.append(sentiment, blob.sentiment.polarity)
        docCount += 1

    # INITIALIZING STEMMER AND STOP WORD CORPUS
    stop_words = set(stopwords.words('english'))
    porter_stemmer = PorterStemmer()

    # POS TAGGING
    POS = CMUTweetTagger.runtagger_parse(data["tweet"])
    POSDictionary = {
        "N": "nn", "O": "pro", "S": "np", "^": "nnps", "Z": "nnpz",
        "L": "vl", "M": "nv", "V": "md", "A": "adj", "R": "adv",
        "!": "int", "D": "det", "P": "ppt", "&": "cc", "T": "rp",
        "X": "ex", "Y": "exv", "#": "cat", "@": "tar", "~": "dsc",
        ",": "punc", "$": "num", "U": "url", "E": "emo", "G": "abr"
    }

    # PREPROCESSING (REMOVE STOP WORDS AND STEMMING)
    docCount = 0
    for doc in POS:
        filtered_sentence = []
        for word in doc:
            if word[0] not in stop_words:
                filtered_sentence.append(porter_stemmer.stem(word[0]))  # +'_'+POSDictionary[word[1]]
        data.loc[docCount, "tweet"] = filtered_sentence
        data.loc[docCount, "tweet"] = " ".join(data.loc[docCount, "tweet"])
        docCount += 1

    # REPLACING LABEL (subtask) WITH INTEGER
    if (train == 1):
        data['label'] = data['subtask'].factorize()[0]
    data['sentiment'] = sentiment + 1
    data['sent_count'] = sent_count
    data['word_count'] = word_count
    data['syl_count'] = syl_count
    data['url_count'] = url_count
    data['mention_count'] = mention_count
    data['cat_count'] = cat_count
    data['special_count'] = special_count

    # SEPARATING FEATURES AND LABELS
    X = data[[
        'tweet', 'sentiment', 'sent_count', 'word_count', 'syl_count',
        'url_count', 'mention_count', 'special_count', 'cat_count'
    ]]
    if train == 1:
        y = data['label']
    else:
        y = None
    return X, y
def read_data(args, config):
    '''read data sets, construct all needed structures and update the config'''
    if args.ssm == '1':
        config.ssm = 1

    hyphenator = Pyphen(lang=args.dict)

    def my_syllables(word):
        return hyphenator.inserted(word).split('-')

    if args.is_train == '1':
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        with open(os.path.join(args.save_dir, args.prefix + '-data.pkl'),
                  'wb') as data_file:
            word_data = open(
                os.path.join(args.data_dir, 'train.txt'), 'r').read() \
                .replace('\n', args.eos).split()
            words = list(set(word_data))

            syllables = set()
            word_lens_in_syl = []
            for word in words:
                syls = my_syllables(word)
                word_lens_in_syl.append(len(syls))
                for syl in syls:
                    syllables.add(syl)
            syls_list = list(syllables)
            pickle.dump((word_data, words, word_lens_in_syl, syls_list), data_file)
    else:
        with open(os.path.join(args.save_dir, args.prefix + '-data.pkl'),
                  'rb') as data_file:
            word_data, words, word_lens_in_syl, syls_list = pickle.load(data_file)

    word_data_size, word_vocab_size = len(word_data), len(words)
    print('data has %d words, %d unique' % (word_data_size, word_vocab_size))
    config.word_vocab_size = word_vocab_size
    config.num_sampled = int(word_vocab_size * 0.2)

    word_to_ix = {word: i for i, word in enumerate(words)}
    ix_to_word = {i: word for i, word in enumerate(words)}

    def get_word_raw_data(input_file):
        data = open(input_file, 'r').read().replace('\n', args.eos).split()
        return [word_to_ix[w] for w in data]

    train_raw_data = get_word_raw_data(os.path.join(args.data_dir, 'train.txt'))
    valid_raw_data = get_word_raw_data(os.path.join(args.data_dir, 'valid.txt'))
    test_raw_data = get_word_raw_data(os.path.join(args.data_dir, 'test.txt'))

    syl_vocab_size = len(syls_list)
    max_word_len = int(np.percentile(word_lens_in_syl, 100))
    config.max_word_len = max_word_len
    print('data has %d unique syllables' % syl_vocab_size)
    print('max word length in syllables is set to', max_word_len)

    # a fake syllable for zero-padding
    zero_pad_syl = ' '
    syls_list.insert(0, zero_pad_syl)
    syl_vocab_size += 1
    config.syl_vocab_size = syl_vocab_size

    syl_to_ix = {syl: i for i, syl in enumerate(syls_list)}
    ix_to_syl = {i: syl for i, syl in enumerate(syls_list)}

    word_ix_to_syl_ixs = {}
    for word in words:
        word_ix = word_to_ix[word]
        word_in_syls = my_syllables(word)
        word_in_syls += [zero_pad_syl] * (max_word_len - len(word_in_syls))
        word_ix_to_syl_ixs[word_ix] = [syl_to_ix[syl] for syl in word_in_syls]

    return train_raw_data, valid_raw_data, test_raw_data, word_ix_to_syl_ixs
def count_syllables(word):
    pyphen_dic = Pyphen(lang='en')
    syllabled_word = pyphen_dic.inserted(word)
    return syllabled_word.count('-') + 1
def __init__(self, lang='en_US'):
    self.dic = dic = Pyphen(lang=lang)
def set_lang(self, lang):
    self.__lang = lang
    self.pyphen = Pyphen(lang=self.__lang)
    self._cache_clear()
import re

from pandocfilters import Para, Str, toJSONFilter, walk
from pyphen import Pyphen

dic = Pyphen(lang='en_US', left=3, right=3)
word_detection_pattern = re.compile(r'\w{7,}', re.UNICODE)


def inpara(key, value, format, meta):
    if key == 'Para':
        return Para(walk(value, hyphenate, format, meta))


def hyphenate(key, value, format, meta):
    if key == 'Str':
        return Str(word_detection_pattern.sub(
            lambda match: dic.inserted(match.group(0), hyphen='\u00ad'),  # soft hyphen
            value))


if __name__ == "__main__":
    toJSONFilter(inpara)
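# A small sketch (separate from the filter above; the word is an arbitrary example) of
# why that filter passes left=3 and right=3: these Pyphen() arguments set the minimum
# number of characters that must remain before the first and after the last hyphenation
# point, so words are not broken right at their edges.
from pyphen import Pyphen

default_dic = Pyphen(lang='en_US')                       # pyphen's default left/right minimums
conservative_dic = Pyphen(lang='en_US', left=3, right=3)
print(default_dic.inserted("typography"))
print(conservative_dic.inserted("typography"))           # may show fewer break points near the edges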
from utils.web.slack_api import parse_config
from utils.web.slack_api.big_emoji import resize_image, resize_gif
from utils.web.slack_api.text_to_emoji import text_to_emoji
from utils.web.servers.core import register_cmd, SlackInfo, slack_api, app, init_slack_api, gen_help_str, \
    send_to_channel, request_in_loop, no_dm, run_in_executor

__author__ = 'acushner'

from utils.web.servers.incident import IncidentInfo, init_incident_store

_admins = parse_config().admin_id_name_map
DEFAULT_SIZE_MULT = 6.
MAX_SIZE_MULT = 15.

_pyphen = Pyphen(lang='en')


@register_cmd
@no_dm
async def embiggen(si: SlackInfo):
    """emoji [size_multiple] [_size_multiple_]: multiple to scale up/down emoji size by
    only works on custom emoji due to download issues from slack"""
    emoji, *rest = si.argstr.split()
    mult = min(MAX_SIZE_MULT, float(first(rest, DEFAULT_SIZE_MULT)))
    if mult <= 0:
        return text(f'invalid mult: {mult}')

    all_emoji = await slack_api.get_emoji()
    try: