Code example #1
File: textstat.py Project: jnelson16/textstat
    def syllable_count(self, text, lang=None):
        """
        Count the syllables in a text.
        I/P - a text
        O/P - number of syllables
        """
        if lang:
            warnings.warn(
                "The 'lang' argument has been moved to "
                "'textstat.set_lang(<lang>)'. This argument will be removed "
                "in the future.",
                DeprecationWarning
            )
        if isinstance(text, bytes):
            text = text.decode(self.text_encoding)

        text = text.lower()
        text = self.remove_punctuation(text)

        if not text:
            return 0

        dic = Pyphen(lang=self.__lang)
        count = 0
        for word in text.split(' '):
            word_hyphenated = dic.inserted(word)
            count += max(1, word_hyphenated.count("-") + 1)
        return count
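Most of the snippets on this page rely on the same heuristic: every hyphenation point that Pyphen inserts is treated as a syllable boundary, so the syllable estimate is the hyphen count plus one. A minimal standalone sketch of that idea (assuming the pyphen package with its bundled en_US dictionary is installed; the word is arbitrary):

from pyphen import Pyphen

dic = Pyphen(lang='en_US')
hyphenated = dic.inserted("hyphenation")          # e.g. "hy-phen-ation"
print(hyphenated, max(1, hyphenated.count("-") + 1))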
Code example #2
    def _count_syllables(self, word):
        """Counting the syllables in a word."""
        dic = Pyphen(lang=self.pyphen_language)
        word = dic.inserted(word)
        s_count = word.count("-") + 1

        return s_count
Code example #3
File: fogGermanAssignments.py Project: MajaToebs/BA
def count_syllables(word):
    # necessary for the syllable count
    dic = Pyphen(lang='en_EN')
    word_hyphenated = dic.inserted(word)
    # triple hyphens resulting from hyphens inside the normal word need to be reduced to single hyphens
    word_hyphenated = word_hyphenated.replace("---", "-")
    syllables = max(1, word_hyphenated.count("-") + 1)
    return syllables
Code example #4
def syllable_count(text):
    text = text.lower()
    text = "".join(x for x in text if x not in exclude)
    dic = Pyphen(lang='ru_RU')
    count = 0
    for word in text.split(' '):
        word_hyphenated = dic.inserted(word)
        count += max(1, word_hyphenated.count("-") + 1)
    return count
Code example #5
        def avg_syllables_per_word(text):
            # NOTE: the `text` parameter is unused; the enclosing loop's row["content"] is processed instead
            words = nlp(row["content"])
            syllables = []
            self.dic = Pyphen(lang='en_US')

            for word in words:
                word_hyphenated = self.dic.inserted(word.text)
                syllables.append(max(1, word_hyphenated.count("-") + 1))
            return sum(syllables) / len(words)
Code example #6
 def count_syllables(self):
     """
     Compte les syllabes d’un texte, en utilisant le dictionnaire Pyphen
     pour composer les mots en syllabes sensuite faire le comptage des syllabes selon
     l'expression reguliere indiques dessous
     :return: nombre syllabes obtenu
     """
     dic = Pyphen(lang='en')
     text = ' '.join(dic.inserted(w, ' ') for w in self.text.split())
     return len(re.findall(r'(\w+|\,|\;|\b\.|\:|\?)', text))
Code example #7
 def split_syllables(self, lang='fr'):
     """
     Sépare les syllabes du texte, si la langue du texte n’est pas le français il faut la
     specifier, et elle doit être disponible dans le module pyphen (pyphen.LANGUAGES.keys())
     :param lang: langue du texte
     :return: renvoi `self` afin de pouvoir chainer les opérations
     """
     dic = Pyphen(lang=lang)
     self.text = ' '.join(dic.inserted(w, '/') for w in self.text.split()).replace('-/', '-')
     return self
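The second argument to inserted() is the separator string, which is how this snippet marks syllable boundaries with '/' while the trailing .replace('-/', '-') keeps hyphens that were already in the word. A small hedged illustration of the separator argument (assuming the 'fr' dictionary that normally ships with pyphen):

from pyphen import Pyphen

dic = Pyphen(lang='fr')
print(dic.inserted('ordinateur', '/'))            # e.g. "or/di/na/teur"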
Code example #8
def n_syllables(doc: Doc):
    """
    Return number of syllables per token
    """

    dic = Pyphen(lang=doc.lang_)

    def count_syl(token: Token):
        word_hyphenated = dic.inserted(token.lower_)
        return max(1, word_hyphenated.count("-") + 1)

    return [count_syl(token) for token in doc._._filtered_tokens]
Code example #9
def Morphemes(s_word, lang, measures):
    di = Pyphen(lang=lang)
    morphemes = []
    for pair in di.iterate(s_word):
        morphemes.append(pair)

    if len(morphemes) != 0:
        measures["morphemes"] = morphemes
        # NOTE: each item yielded by di.iterate() is a (head, tail) pair, so this is always 2
        measures["morphemes_count"] = len(morphemes[0])
    else:
        measures["morphemes"] = "Not Found"
        measures["morphemes_count"] = 0
Code example #10
 def split_syllables(self, lang='en'):
     """
     Sépare les syllabes du texte, si la langue du texte n’est pas le englais il faut la
     specifier, et elle doit être disponible dans le module pyphen (pyphen.LANGUAGES.keys())
     :param lang: langue du texte
     :return: renvoi `text` sous la forme demandee
     """
     dic = Pyphen(lang=lang)
     text = ' '.join(dic.inserted(w, ' ') for w in self.text.split())
     text = re.sub(r'(\w+|\,|\;|\:|\.\W+\b)', r'\1-' , text)
     text = re.sub(r'(\s)', r'' , text)
     return text
Code example #11
 def separar_silabas(palavra, separador):
     """
     Função que separa silabas da palavra indicada na chamada da função. O
     usuário ainda pode escolher que tipo de separador ele deseja para poder
     ficar mais amigável ao seu código.
     """
     #TODO: Implementar função nativamente para processamento de sílabas
     from pyphen import Pyphen
     _palavra_sep = palavra.lower()
     dic = Pyphen(lang="pt_BR")
     _palavra_sep = dic.inserted(_palavra_sep)
     if separador == "-":
         return _palavra_sep
     _palavra_sep = str(_palavra_sep).replace("-", separador)
     return _palavra_sep
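Hypothetical usage of separar_silabas, assuming the function is importable and the pt_BR dictionary is available; the exact split depends on the dictionary version:

print(separar_silabas("computador", "-"))         # e.g. "com-pu-ta-dor"
print(separar_silabas("computador", "|"))         # e.g. "com|pu|ta|dor"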
Code example #12
    def syllable_counts(self, text):
        """
        Calculates number of syllables per token
        Punctuation is removed before tokenization
        """
        text = Text.to_text(text)
        if not text.text:
            return 0
        dic = Pyphen(lang=self.__lang)

        def count_syl(token):
            word_hyphenated = dic.inserted(token.lower())
            return max(1, word_hyphenated.count("-") + 1)

        return [count_syl(token) for token in text.tokens_without_punctuation]
Code example #13
def Syllable_Count(s_word):
	exclude = list(string.punctuation)
	s_word = s_word.lower()
	s_word = "".join(x for x in s_word if x not in exclude)

	if s_word is None:
		return 0
	elif len(s_word) == 0:
		return 0
	else:
		dic = Pyphen(lang='en_US')
		count = 0
		for word in s_word.split(' '):
			word_hyphenated = dic.inserted(word)
			count += max(1, word_hyphenated.count("-") + 1)
		return count
Code example #14
def count_syl_line(line):
    """Creates a dictionary that relates each word in a line to the number of syllables in the line"""
    from pyphen import Pyphen
    from re import split
    dic = Pyphen(lang='en-US')
    words = split(" ", line)
    syllables = {}
    j = 0
    if line == '':
        return -1
    else:
        for i in words:
            # NOTE: the dict is keyed by word, so repeated words in a line are only counted once
            syllables[i] = len(split("-", dic.inserted(words[j])))
            j += 1
        line_syl_count = int(sum(syllables.values()))
        return line_syl_count
Code example #15
def Syllable_Count(s_word, lang, measures):
    exclude = list(string.punctuation)
    s_word = s_word.lower()
    s_word = "".join(x for x in s_word if x not in exclude)

    if s_word is None:
        measures["no_of_syllables"] = 0
    elif len(s_word) == 0:
        measures["no_of_syllables"] = 0
    else:
        dic = Pyphen(lang=lang)
        count = 0
        for word in s_word.split(' '):
            word_hyphenated = dic.inserted(word)
            count += max(1, word_hyphenated.count("-") + 1)
        measures["no_of_syllables"] = count
Code example #16
    def biggest_word(self):
        """
        Taken from https://github.com/shivam5992/textstat
        """
        self.dic = Pyphen(lang='en_US')
        print("Finding Biggest Words")
        for i, row in tqdm(self.dataset.iterrows()):
            biggest_word = 0
            content_row = nlp(row["content"])

            for word in content_row:
                word_hyphenated = self.dic.inserted(word.text)
                word_size = max(1, word_hyphenated.count("-") + 1)
                if word_size > biggest_word:
                    biggest_word = word_size

            self.dataset.loc[i, "biggest_word_syllables"] = biggest_word
Code example #17
File: nlp_feature.py Project: NLPting/paper
def syllable_count(text, lang='en_US'):
    """
    Count the syllables in a text.
    I/P - a text
    O/P - number of syllables
    """
    text = text.lower()
    text = delete_mask_return_sen(text)
    text = remove_punctuation(text).strip()
    if not text:
        return 0
    dic = Pyphen(lang=lang)
    count = 0
    for word in text.split(' '):
        if word:
            word_hyphenated = dic.inserted(word)
            count += max(1, word_hyphenated.count("-") + 1)
    return count
Code example #18
File: textstat.py Project: rushdishams/textstat
    def syllable_count(self, text, lang='en_US'):
        """
        Count the syllables in a text.
        I/P - a text
        O/P - number of syllables
        """
        text = text.lower()
        text = "".join(x for x in text if x not in exclude)

        if not text:
            return 0

        dic = Pyphen(lang=lang)
        count = 0
        for word in text.split(' '):
            word_hyphenated = dic.inserted(word)
            count += max(1, word_hyphenated.count("-") + 1)
        return count
Code example #19
File: textstat.py Project: matifq/textstat
    def syllable_count(self, text, lang='en_US'):
        """
        Count the syllables in a text.
        I/P - a text
        O/P - number of syllables
        """
        if isinstance(text, bytes):
            text = text.decode(self.text_encoding)

        text = text.lower()
        text = self.remove_punctuation(text)

        if not text:
            return 0

        dic = Pyphen(lang=lang)
        count = 0
        for word in text.split(' '):
            word_hyphenated = dic.inserted(word)
            count += max(1, word_hyphenated.count("-") + 1)
        return count
Code example #20
import textstat
from sklearn.preprocessing import label_binarize
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import pkg_resources
import ast
import spacy
#from collections import Counter
from pyphen import Pyphen
import pickle
#import xgboost

# load the spaCy language model; the model must be downloaded beforehand
nlp = spacy.load('en_core_web_md')
pyphen_dic = Pyphen(lang='en')

# set word lists to be used

## This corpus comes from the Cambridge English Corpus of spoken English and includes
## all the NGSL and SUP words needed to get 90% coverage.
NGSL_wordlist = set([
    ln.decode('utf-8').strip() for ln in pkg_resources.resource_stream(
        'financial_readability', 'word_lists/NGSL_wordlist.txt')
])

## The Business Service List 1.0, also known as the BSL (Browne, C. & Culligan, B., 2016) is a list of approximately 1700 words
## that occur with very high frequency within the domain of general business English. Based on a 64.5 million word corpus of business
## texts, newspapers, journals and websites, the BSL 1.0 version gives approximately 97% coverage of general business English materials
## when learned in combination with the 2800 words of core general English in the New General Service List or NGSL (Browne, C., Culligan, B., and Phillips, J. 2013)
BSL_wordlist = set([
Code example #21
File: textatistic.py Project: sujoykroy/Textatistic
 def __init__(self, language):
     self.pyphen = Pyphen(lang=language)
Code example #22
                    type=str,
                    help='E-Mail subject related to survey mails')
parser.add_argument('-notxt',
                    action='store_true',
                    help='Disable saving of results to txt file')
parser.add_argument('-nobar',
                    action='store_true',
                    help='Disable plotting of bar plots')
parser.add_argument('-nopie',
                    action='store_true',
                    help='Disable plotting of pie plots')
args = parser.parse_args()

### CONFIG - SET VARIABLES AND DEFAULTS HERE ###
#pyphen dictionary
german_dict = Pyphen(lang='de_DE')
#e-mail information
login = args.login
password = args.password
pop_server = (args.pop_server if args.pop_server else 'pop3.web.de')
filter_subject = (args.subject if args.subject else 'Evaluation')
#file information
write_txt = not args.notxt
write_bars = not args.nobar
write_pies = not args.nopie
txt_file_name = 'results.txt'
bar_file_name = 'result_bars.pdf'
pie_file_name = 'result_pies.pdf'
#allowed text lengths until new line for plot labels
pie_wrap_len = 19
bar_wrap_len = 10
Code example #23
class GermaLemma(object):
    """
    Lemmatizer for German language text main class.
    """
    pyphen_dic = Pyphen(lang='de')

    def __init__(self, **kwargs):
        """
        Initialize GermaLemma lemmatizer. By default, it will load the lemmatizer data from 'data/lemmata.pickle'. You
        can also pass a manual lemmata dictionary via `lemmata` or load a corpus in CONLL09 format via `tiger_corpus`
        or load pickled lemmatizer data from `pickle`.
        Force usage of pattern.de module by setting `use_pattern_module` to True (or False for not using). By default,
        it will try to use pattern.de if it is installed.
        """
        if 'lemmata' in kwargs:
            self.lemmata = kwargs['lemmata']
            if 'lemmata_lower' in kwargs:
                self.lemmata_lower = kwargs['lemmata_lower']
            else:
                self.lemmata_lower = {
                    pos:
                    {token.lower(): lemma
                     for token, lemma in pos_lemmata}
                    for pos, pos_lemmata in self.lemmata.items()
                }
        elif 'tiger_corpus' in kwargs:
            self.lemmata, self.lemmata_lower = self.load_corpus_lemmata(
                kwargs['tiger_corpus'])
        elif 'pickle' in kwargs:
            self.load_from_pickle(kwargs['pickle'])
        else:
            try:
                self.load_from_pickle(DEFAULT_LEMMATA_PICKLE)
            except FileNotFoundError:
                self.load_from_pickle(
                    os.path.join(sys.prefix, DEFAULT_LEMMATA_PICKLE))

        self.pattern_module = None
        use_pattern_module = kwargs.get('use_pattern_module', None)
        if use_pattern_module in (True, None):
            try:
                self.pattern_module = import_module('pattern.de')
            except ImportError:
                if use_pattern_module is True:
                    raise ImportError('pattern.de module could not be loaded')

    def find_lemma(self, w, pos_tag):
        """
        Find a lemma for word `w` that has a Part-of-Speech tag `pos_tag`. `pos_tag` should be a valid STTS tagset tag
        (see http://www.ims.uni-stuttgart.de/forschung/ressourcen/lexika/TagSets/stts-table.html) or a simplified form
        with:
        - 'N' for nouns
        - 'V' for verbs
        - 'ADJ' for adjectives
        - 'ADV' for adverbs
        All other tags will raise a ValueError("Unsupported POS tag")!
        Return the lemma or, if no lemma was found, return `w`.
        """
        if not w:  # do not process empty strings
            return w

        if pos_tag == 'NE':  # if word is a name, it already is the lemma
            return w

        if pos_tag.startswith('N') or pos_tag.startswith('V'):
            pos = pos_tag[0]
        elif pos_tag.startswith('ADJ') or pos_tag.startswith('ADV'):
            pos = pos_tag[:3]
        else:
            raise ValueError("Unsupported POS tag")

        # look if we can directly find `w` in the lemmata dictionary
        res = self.dict_search(w, pos)

        if not res and self.pattern_module:  # try to use pattern.de module
            res_pattern = self._lemma_via_patternlib(w, pos)
            if res_pattern != w:
                res = res_pattern

        if not res:
            # try to split nouns that are made of composita
            if pos == 'N':
                res = self._composita_lemma(w) or w
            else:
                res = w

            # try to lemmatize adjectives using prevalent German language adjective suffixes
            if pos == 'ADJ':
                res = self._adj_lemma(res)

        # nouns always start with a capital letter
        if pos == 'N':
            if len(res) > 1 and res[0].islower():
                res = res[0].upper() + res[1:]
        else:  # all other forms are lower-case
            res = res.lower()

        return res

    def dict_search(self, w, pos, use_lower=False):
        """
        Lemmata dictionary lookup for word `w` with POS tag `pos`.
        Return lemma if found, else None.
        """
        pos_lemmata = self.lemmata_lower[pos] if use_lower else self.lemmata[
            pos]

        return pos_lemmata.get(w, None)

    def _adj_lemma(self, w):
        """
        Try to lemmatize adjectives using prevalent German language adjective suffixes. Return possibly lemmatized
        adjective.
        """
        for full, reduced in ADJ_SUFFIXES_DICT.items():
            if w.endswith(full):
                return w[:-len(full)] + reduced

        return w

    def _composita_lemma(self, w):
        """
        Try to split a word `w` that is possibly made of composita.
        Return the lemma if found, else return None.
        """

        # find the most important split position first, right after a hyphen inside the word
        try:
            split_positions = [w.rindex('-') + 1]
        except ValueError:
            split_positions = []

        # add possible split positions by using Pyphen's hyphenation positions
        split_positions.extend([
            p for p in self.pyphen_dic.positions(w) if p not in split_positions
        ])

        # now split `w` by hyphenation step by step
        for hy_pos in split_positions:
            # split in left and right parts (start and end of the strings)
            left, right = w[:hy_pos], w[hy_pos:]

            # look if the right part can be found in the lemmata dictionary
            # if we have a noun, a lower case match will also be accepted
            if left and right and not right.endswith('innen'):
                res = self.dict_search(right,
                                       'N',
                                       use_lower=right[0].islower())
                if res:
                    # concatenate the left side with the found partial lemma
                    if left[-1] == '-':
                        res = left + res.capitalize()
                    else:
                        res = left + res.lower()

                    if w.isupper():
                        return res.upper()
                    else:
                        return res

        return None

    def _lemma_via_patternlib(self, w, pos):
        """
        Try to find a lemma for word `w` that has a Part-of-Speech tag `pos_tag` by using pattern.de module's functions.
        Return the lemma or `w` if lemmatization was not possible with pattern.de
        """
        if not self.pattern_module:
            raise RuntimeError('pattern.de module not loaded')

        if pos == 'NP':  # singularize noun
            return self.pattern_module.singularize(w)
        elif pos.startswith('V'):  # get infinitive of verb
            return self.pattern_module.conjugate(w)
        elif pos.startswith('ADJ') or pos.startswith(
                'ADV'):  # get baseform of adjective or adverb
            return self.pattern_module.predicative(w)

        return w

    @classmethod
    def load_corpus_lemmata(cls, corpus_file):
        lemmata = defaultdict(dict)
        lemmata_lower = defaultdict(dict)

        with codecs.open(corpus_file, encoding="utf-8") as f:
            for line in f:
                parts = line.split()
                if len(parts) == 15:
                    token, lemma = parts[1:3]
                    pos = parts[4]
                    cls.add_to_lemmata_dicts(lemmata, lemmata_lower, token,
                                             lemma, pos)

        return lemmata, lemmata_lower

    @staticmethod
    def add_to_lemmata_dicts(lemmata, lemmata_lower, token, lemma, pos):
        for pos_prefix in VALID_POS_PREFIXES:
            if pos.startswith(pos_prefix):
                if token not in lemmata[pos_prefix]:
                    lemmata[pos_prefix][token] = lemma
                if lemma not in lemmata[pos_prefix]:  # for quicker lookup
                    lemmata[pos_prefix][lemma] = lemma

                if pos_prefix == 'N':
                    token_lower = token.lower()
                    if token_lower not in lemmata_lower[pos_prefix]:
                        lemmata_lower[pos_prefix][token_lower] = lemma
                    lemma_lower = lemma.lower()
                    if lemma_lower not in lemmata_lower[pos_prefix]:
                        lemmata_lower[pos_prefix][lemma_lower] = lemma

                return

    def save_to_pickle(self, pickle_file):
        with open(pickle_file, 'wb') as f:
            pickle.dump((self.lemmata, self.lemmata_lower), f, protocol=2)

    def load_from_pickle(self, pickle_file):
        with open(pickle_file, 'rb') as f:
            self.lemmata, self.lemmata_lower = pickle.load(f)
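A hypothetical usage sketch for the lemmatizer above, assuming the packaged lemmata pickle can be loaded; the example word and its lemma are illustrative and not taken from the snippet:

lemmatizer = GermaLemma()
# the POS tag must start with 'N', 'V', 'ADJ' or 'ADV', otherwise find_lemma raises ValueError
print(lemmatizer.find_lemma('Feinstaubbelastungen', 'N'))   # e.g. 'Feinstaubbelastung'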
Code example #24
File: proto.py Project: steve050798/OffensEval
def DataPreprocessing(data, train=1):

    global docCount

    #EXTRACTING DENSE FEATURES
    sentiment = np.array([])
    word_count = np.array([])
    char_count = np.array([])
    sent_count = np.array([])
    syl_count = np.array([])
    mention_count = np.array([])
    url_count = np.array([])
    special_count = np.array([])
    cat_count = np.array([])
    dic = Pyphen(lang='en')
    for text in data["tweet"]:
        blob = TextBlob(text)

        #OPTIONAL SPELLING CORRECTION
        #data.loc[docCount,"tweet"]=str(blob.correct())
        #print(data.loc[docCount,"tweet"],type(data.loc[docCount,"tweet"]))

        url_count = np.append(url_count, blob.words.count("URL"))
        mention_count = np.append(mention_count, blob.words.count("USER"))
        cat_count = np.append(cat_count, sum(c == '#' for c in text))
        special_count = np.append(
            special_count,
            sum(not c.isalnum() and c != ' ' and c != '@' and c != '#'
                for c in text))
        syl_count = np.append(
            syl_count,
            len(TextBlob(dic.inserted(text).replace('-', ' ')).words))
        char_count = np.append(char_count, len(text))
        word_count = np.append(word_count, len(blob.words))
        sent_count = np.append(sent_count, len(blob.sentences))
        sentiment = np.append(sentiment, blob.sentiment.polarity)
        docCount += 1

    #INITIALIZING STEMMER AND STOP WORD CORPUS
    stop_words = set(stopwords.words('english'))
    porter_stemmer = PorterStemmer()

    #POS TAGGING
    POS = CMUTweetTagger.runtagger_parse(data["tweet"])
    POSDictionary = {
        "N": "nn",
        "O": "pro",
        "S": "np",
        "^": "nnps",
        "Z": "nnpz",
        "L": "vl",
        "M": "nv",
        "V": "md",
        "A": "adj",
        "R": "adv",
        "!": "int",
        "D": "det",
        "P": "ppt",
        "&": "cc",
        "T": "rp",
        "X": "ex",
        "Y": "exv",
        "#": "cat",
        "@": "tar",
        "~": "dsc",
        ",": "punc",
        "$": "num",
        "U": "url",
        "E": "emo",
        "G": "abr"
    }

    #PREPROCESSING (REMOVE STOP WORDS AND STEMMING)
    docCount = 0
    for doc in POS:
        filtered_sentence = []
        for word in doc:
            if word[0] not in stop_words:
                filtered_sentence.append(porter_stemmer.stem(
                    word[0]))  #+'_'+POSDictionary[word[1]])
        data.loc[docCount, "tweet"] = filtered_sentence
        data.loc[docCount, "tweet"] = " ".join(data.loc[docCount, "tweet"])
        docCount += 1

    #REPLACING LABEL (subtask) WITH INTEGER
    if (train == 1):
        data['label'] = data['subtask'].factorize()[0]
    data['sentiment'] = sentiment + 1
    data['sent_count'] = sent_count
    data['word_count'] = word_count
    data['syl_count'] = syl_count
    data['url_count'] = url_count
    data['mention_count'] = mention_count
    data['cat_count'] = cat_count
    data['special_count'] = special_count

    #SEPARATING FEATURES AND LABELS
    X = data[[
        'tweet', 'sentiment', 'sent_count', 'word_count', 'syl_count',
        'url_count', 'mention_count', 'special_count', 'cat_count'
    ]]
    if train == 1:
        y = data['label']
    else:
        y = None
    return X, y
Code example #25
def read_data(args, config):
    '''read data sets, construct all needed structures and update the config'''
    if args.ssm == '1': config.ssm = 1

    hyphenator = Pyphen(lang=args.dict)

    def my_syllables(word):
        return hyphenator.inserted(word).split('-')

    if args.is_train == '1':
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        with open(os.path.join(args.save_dir, args.prefix + '-data.pkl'),
                  'wb') as data_file:
            word_data = open(
                os.path.join(args.data_dir, 'train.txt'), 'r').read() \
                .replace('\n', args.eos).split()
            words = list(set(word_data))

            syllables = set()
            word_lens_in_syl = []

            for word in words:
                syls = my_syllables(word)
                word_lens_in_syl.append(len(syls))
                for syl in syls:
                    syllables.add(syl)

            syls_list = list(syllables)
            pickle.dump((word_data, words, word_lens_in_syl, syls_list),
                        data_file)
    else:
        with open(os.path.join(args.save_dir, args.prefix + '-data.pkl'),
                  'rb') as data_file:
            word_data, words, word_lens_in_syl, syls_list = pickle.load(
                data_file)

    word_data_size, word_vocab_size = len(word_data), len(words)
    print('data has %d words, %d unique' % (word_data_size, word_vocab_size))
    config.word_vocab_size = word_vocab_size
    config.num_sampled = int(word_vocab_size * 0.2)

    word_to_ix = {word: i for i, word in enumerate(words)}
    ix_to_word = {i: word for i, word in enumerate(words)}

    def get_word_raw_data(input_file):
        data = open(input_file, 'r').read().replace('\n', args.eos).split()
        return [word_to_ix[w] for w in data]

    train_raw_data = get_word_raw_data(os.path.join(args.data_dir,
                                                    'train.txt'))
    valid_raw_data = get_word_raw_data(os.path.join(args.data_dir,
                                                    'valid.txt'))
    test_raw_data = get_word_raw_data(os.path.join(args.data_dir, 'test.txt'))

    syl_vocab_size = len(syls_list)
    max_word_len = int(np.percentile(word_lens_in_syl, 100))
    config.max_word_len = max_word_len
    print('data has %d unique syllables' % syl_vocab_size)
    print('max word length in syllables is set to', max_word_len)

    # a fake syllable for zero-padding
    zero_pad_syl = ' '
    syls_list.insert(0, zero_pad_syl)
    syl_vocab_size += 1
    config.syl_vocab_size = syl_vocab_size

    syl_to_ix = {syl: i for i, syl in enumerate(syls_list)}
    ix_to_syl = {i: syl for i, syl in enumerate(syls_list)}

    word_ix_to_syl_ixs = {}
    for word in words:
        word_ix = word_to_ix[word]
        word_in_syls = my_syllables(word)
        word_in_syls += [zero_pad_syl] * (max_word_len - len(word_in_syls))
        word_ix_to_syl_ixs[word_ix] = [syl_to_ix[syl] for syl in word_in_syls]

    return train_raw_data, valid_raw_data, test_raw_data, word_ix_to_syl_ixs
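The model above represents every word as the syllable sequence returned by hyphenator.inserted(word).split('-'). A quick hedged illustration of that helper in isolation (the en_US dictionary is an assumption; in the snippet the dictionary name comes from args.dict at runtime):

from pyphen import Pyphen

hyphenator = Pyphen(lang='en_US')

def my_syllables(word):
    return hyphenator.inserted(word).split('-')

print(my_syllables('dictionary'))                 # e.g. ['dic', 'tio', 'nary']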
Code example #26
def count_syllables(word):
    pyphen_dic = Pyphen(lang='en')
    syllabled_word = pyphen_dic.inserted(word)
    return syllabled_word.count('-') + 1
Code example #27
 def __init__(self, lang='en_US'):
     self.dic = Pyphen(lang=lang)
Code example #28
File: textstat.py Project: GuillemGSubies/textstat
 def set_lang(self, lang):
     self.__lang = lang
     self.pyphen = Pyphen(lang=self.__lang)
     self._cache_clear()
Code example #29
import re
from pandocfilters import Para, Str, toJSONFilter, walk
from pyphen import Pyphen

dic = Pyphen(lang='en_US', left=3, right=3)

word_detection_pattern = re.compile(r'\w{7,}', re.UNICODE)

def inpara(key, value, format, meta):
    if key == 'Para':
        return Para(walk(value, hyphenate, format, meta))

def hyphenate(key, value, format, meta):
    if key == 'Str':
        return Str(word_detection_pattern.sub(
            # insert a soft hyphen (U+00AD) at each hyphenation point so long words can be broken across lines
            lambda match: dic.inserted(match.group(0), hyphen='\u00ad'),
            value))

if __name__ == "__main__":
    toJSONFilter(inpara)
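This filter inserts soft hyphens (U+00AD) into words of seven or more characters so that downstream renderers can break long words cleanly. A pandocfilters script like this is normally hooked into a conversion with pandoc's --filter option, for example pandoc input.md --filter ./hyphenate.py -o output.html (the filename hyphenate.py is only an assumed name for the script above).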
Code example #30
File: tusebot.py Project: sweettuse/utils
from utils.web.slack_api import parse_config
from utils.web.slack_api.big_emoji import resize_image, resize_gif
from utils.web.slack_api.text_to_emoji import text_to_emoji
from utils.web.servers.core import register_cmd, SlackInfo, slack_api, app, init_slack_api, gen_help_str, \
    send_to_channel, request_in_loop, no_dm, run_in_executor

__author__ = 'acushner'

from utils.web.servers.incident import IncidentInfo, init_incident_store

_admins = parse_config().admin_id_name_map
DEFAULT_SIZE_MULT = 6.
MAX_SIZE_MULT = 15.

_pyphen = Pyphen(lang='en')


@register_cmd
@no_dm
async def embiggen(si: SlackInfo):
    """emoji [size_multiple]
    [_size_multiple_]: multiple to scale up/down emoji size by
    only works on custom emoji due to download issues from slack"""
    emoji, *rest = si.argstr.split()
    mult = min(MAX_SIZE_MULT, float(first(rest, DEFAULT_SIZE_MULT)))
    if mult <= 0:
        return text(f'invalid mult: {mult}')

    all_emoji = await slack_api.get_emoji()
    try: