# for max_len heurisitc import curses from curses.ascii import isdigit import nltk from nltk.corpus import cmudict import pyphen d = cmudict.dict() def nsyl(word): return [len(list(y for y in x if isdigit(y[-1]))) for x in d[word.lower()]] dic = pyphen.Pyphen(lang='nl_NL') ok_clusters = {'ch', 'sh', 'th'} def max_pinyin_length(name): """ heuristic for finding the maximum number of pinyin from an english name """ num_syllables = 0 try: num_syllables = nsyl(name)[0] except: pass hyphenated = dic.inserted(name).split('-') hyph_count = len(hyphenated)
def split_first_line(text, style, hinting, max_width, line_width):
    """Fit as much as possible in the available width for one line of text.

    Return ``(layout, length, resume_at, width, height, baseline)``.

    ``layout``: a pango Layout with the first line
    ``length``: length in UTF-8 bytes of the first line
    ``resume_at``: The number of UTF-8 bytes to skip for the next line.
                   May be ``None`` if the whole text fits in one line.
                   This may be greater than ``length`` in case of preserved
                   newline characters.
    ``width``: width in pixels of the first line
    ``height``: height in pixels of the first line
    ``baseline``: baseline in pixels of the first line

    Parameters:

    ``text``: the text to lay out.
    ``style``: style object; this function reads ``font_size``, ``hyphens``,
               ``lang``, ``hyphenate_limit_chars``, ``hyphenate_limit_zone``,
               ``hyphenate_character`` and ``overflow_wrap`` from it.
    ``hinting``: forwarded unchanged to ``create_layout``.
    ``max_width``: available width, or ``None`` for "no limit".
    ``line_width``: not used by this function; kept for the callers.

    The work is done in five steps: draft layout, early exit when no
    hyphenation is needed, retry with the next word appended without its
    trailing space, dictionary hyphenation, and finally breaking inside the
    word when ``overflow-wrap: break-word`` applies.

    """
    # In some cases (shrink-to-fit result being the preferred width)
    # this value is coming from Pango itself,
    # but floating point errors have accumulated:
    #   width2 = (width + X) - X   # in some cases, width2 < width
    # Increase the value a bit to compensate and not introduce
    # an unexpected line break.
    if max_width is not None:
        max_width *= 1.0001

    # Step #1: Get a draft layout with the first line
    layout = None
    if max_width:
        expected_length = int(max_width / style.font_size * 2.5)
        if expected_length < len(text):
            # Try to use a small amount of text instead of the whole text
            layout = create_layout(
                text[:expected_length], style, hinting, max_width)
            lines = layout.iter_lines()
            first_line = next(lines, None)
            second_line = next(lines, None)
            if second_line is None:
                # The small amount of text fits in one line, give up and use
                # the whole text
                layout = None
    if layout is None:
        layout = create_layout(text, style, hinting, max_width)
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
    resume_at = None if second_line is None else second_line.start_index

    # Step #2: Don't hyphenize when it's not needed
    if max_width is None:
        # The first line can take all the place needed
        return first_line_metrics(first_line, text, layout, resume_at)
    first_line_width, _height = get_size(first_line)
    if second_line is None and first_line_width <= max_width:
        # The first line fits in the available width
        return first_line_metrics(first_line, text, layout, resume_at)

    # Step #3: Try to put the first word of the second line on the first line
    if first_line_width <= max_width:
        # The first line may have been cut too early by Pango
        second_line_index = second_line.start_index
        first_part = utf8_slice(text, slice(second_line_index))
        second_part = utf8_slice(text, slice(second_line_index, None))
    else:
        # The first word is longer than the line, try to hyphenize it
        first_part = ''
        second_part = text
    next_word = second_part.split(' ', 1)[0]

    if not next_word:
        # We did not find a word on the next line
        return first_line_metrics(first_line, text, layout, resume_at)

    # next_word might fit without a space afterwards.
    # Pango previously counted that space's advance width.
    new_first_line = first_part + next_word
    layout.set_text(new_first_line)
    lines = layout.iter_lines()
    first_line = next(lines, None)
    second_line = next(lines, None)
    first_line_width, _height = get_size(first_line)
    if second_line is None and first_line_width <= max_width:
        # The next word fits in the first line, keep the layout.
        # "+ 1" skips the space that followed the word.
        resume_at = len(new_first_line.encode('utf-8')) + 1
        return first_line_metrics(first_line, text, layout, resume_at)

    # Step #4: Try to hyphenize
    hyphens = style.hyphens
    lang = style.lang and pyphen.language_fallback(style.lang)
    total, left, right = style.hyphenate_limit_chars
    hyphenated = False

    # Automatic hyphenation possible and next word is long enough
    if hyphens not in ('none', 'manual') and lang and len(next_word) >= total:
        first_line_width, _height = get_size(first_line)
        space = max_width - first_line_width
        if style.hyphenate_limit_zone.unit == '%':
            limit_zone = max_width * style.hyphenate_limit_zone.value / 100.
        else:
            limit_zone = style.hyphenate_limit_zone.value

        if space > limit_zone or space < 0:
            # The next word does not fit, try hyphenation
            dictionary_key = (lang, left, right, total)
            dictionary = PYPHEN_DICTIONARY_CACHE.get(dictionary_key)
            if dictionary is None:
                dictionary = pyphen.Pyphen(lang=lang, left=left, right=right)
                PYPHEN_DICTIONARY_CACHE[dictionary_key] = dictionary
            for first_word_part, _ in dictionary.iterate(next_word):
                new_first_line = (
                    first_part + first_word_part + style.hyphenate_character)
                temp_layout = create_layout(
                    new_first_line, style, hinting, max_width)
                temp_lines = temp_layout.iter_lines()
                temp_first_line = next(temp_lines, None)
                temp_second_line = next(temp_lines, None)

                if (temp_second_line is None and space >= 0) or space < 0:
                    hyphenated = True
                    # TODO: find why there's no need to .encode
                    resume_at = len(first_part + first_word_part)
                    layout = temp_layout
                    first_line = temp_first_line
                    second_line = temp_second_line
                    temp_first_line_width, _height = get_size(temp_first_line)
                    if temp_first_line_width <= max_width:
                        break

    # Step 5: Try to break word if it's too long for the line
    overflow_wrap = style.overflow_wrap
    first_line_width, _height = get_size(first_line)
    space = max_width - first_line_width
    # If we can break words and the first line is too long
    if overflow_wrap == 'break-word' and space < 0:
        if hyphenated:
            # Is it really OK to remove hyphenation for word-break ?
            # Remove exactly one trailing hyphenate-character string.
            # ``str.rstrip(chars)`` must not be used here: it strips a
            # *set* of characters (so it can eat part of the word) and
            # empties the whole string when the hyphenate character is ''.
            hyphenate_character = style.hyphenate_character
            if hyphenate_character and new_first_line.endswith(
                    hyphenate_character):
                new_first_line = new_first_line[:-len(hyphenate_character)]
            if second_line is not None:
                second_line_index = second_line.start_index
                second_part = utf8_slice(text, slice(second_line_index, None))
                new_first_line += second_part
            hyphenated = False

        # TODO: Modify code to preserve W3C condition:
        # "Shaping characters are still shaped as if the word were not broken"
        # The way new lines are processed in this function (one by one with no
        # memory of the last) prevents shaping characters (arabic, for
        # instance) from keeping their shape when wrapped on the next line with
        # pango layout. Maybe insert Unicode shaping characters in text ?
        temp_layout = create_layout(new_first_line, style, hinting, max_width)
        temp_layout.set_wrap(PANGO_WRAP_MODE['WRAP_WORD_CHAR'])
        temp_lines = temp_layout.iter_lines()
        temp_first_line = next(temp_lines, None)
        temp_second_line = next(temp_lines, None)
        # This index is used as a UTF-8 byte offset (``utf8_slice`` and
        # ``resume_at``), so the fallback must be a byte length as well,
        # not a number of code points.
        temp_second_line_index = (
            len(new_first_line.encode('utf-8')) if temp_second_line is None
            else temp_second_line.start_index)
        resume_at = temp_second_line_index
        first_part = utf8_slice(text, slice(temp_second_line_index))
        layout = create_layout(first_part, style, hinting, max_width)
        lines = layout.iter_lines()
        first_line = next(lines, None)

    return first_line_metrics(first_line, text, layout, resume_at, hyphenated)
gore = ['blood','flesh','bloody','bloodstained','mangled','liver','heart','brain','splatter','splattering','splattered', 'carnage','slash','slashed','slashing','organ','slaughter','slaughtered','slaughtering'] #TODO good list #cohesives = [] pronouns = ['I','my','me','mine','myself','you','your','yours','yourself','he','him','his','himself','she','her','hers','herself','it', 'its','itself','they','them','theirs','yourselves','themselves'] causalverbs = ['make','made','cause','caused','allow','allowed','help','helped','have','had','enable','enabled','keep','kept', 'hold','held','let','force','forced','require','required','making','causing','allowing','helping','having', 'enabling','keeping','holding','letting','forcing','requiring'] causalparts = ['because','despite','resulting','thus','consequently','so','as','since'] hedgesndt = ['almost','maybe','somewhat','likely','barely','mildly','little','pretty','fairly'] amplifiers = ['completely','extremely','incredibly','quite','very','mostly','amazingly','really','definitely','exactly', 'awfully'] negations = ['not','neither','nor','none','t','\'t','never','nobody','nowhere','no'] semper = ['seem','appear','seemed','appeared','seeming','appearing'] bow_list = [] #Total word list for bag of words features dic = pyphen.Pyphen(lang='en') print('Word lists created') brownwords = FreqDist() for sentence in brown.sents(): for word in sentence: brownwords[word] += 1 print('Brown corpus loaded') """ FUNCTIONS """ def main(): object = storydata(True) train_data = object.getTrain() test_data = object.getTest()
def test_alternative():
    """Check the alternative parser on a Hungarian word."""
    word = 'kulissza'
    hungarian = pyphen.Pyphen(lang='hu', left=1, right=1)

    # ``iterate`` yields the possible splits, last break point first.
    expected_splits = (('kulisz', 'sza'), ('ku', 'lissza'))
    assert tuple(hungarian.iterate(word)) == expected_splits

    assert hungarian.inserted(word) == 'ku-lisz-sza'
def get_number_syllables(self):
    """Return the summed syllable estimate of ``self.words``.

    Each word is hyphenated with the English pyphen dictionary; a word
    counts as one syllable plus one per ``-`` found in the hyphenated form.
    """
    hyphenator = pyphen.Pyphen(lang='en')
    total = 0
    for word in self.words:
        # Same value as len(hyphenated.split("-")): separators + 1.
        total += hyphenator.inserted(word).count("-") + 1
    return total
def test_inserted():
    """Check that ``inserted`` marks every hyphenation point of a word."""
    dutch = pyphen.Pyphen(lang='nl_NL')
    hyphenated = dutch.inserted('lettergrepen')
    assert hyphenated == 'let-ter-gre-pen'
def test_iterate():
    """Check the splits produced by ``iterate`` for a Dutch word."""
    dutch = pyphen.Pyphen(lang='nl_NL')
    splits = tuple(dutch.iterate('Amsterdam'))
    expected_splits = (('Amster', 'dam'), ('Am', 'sterdam'))
    assert splits == expected_splits
import re from functools import partial from subprocess import PIPE, Popen from urllib.parse import urlparse import bleach import commonmark as cm import html5lib import pyphen import smartypants from django.apps import apps from django.conf import settings from django.core.exceptions import ImproperlyConfigured hyphen_dict = pyphen.Pyphen(lang="en_US") def insert_node_to_ast(tag, block, matchedobj): """Insert trivial block inside of given block in ast node.""" target_eq = matchedobj.groups()[1] block.t = 'html_inline' return '<{tag}>{val}</{tag}>'.format(tag=tag, val=target_eq) re_inlines_to_replace = ( (re.compile(r'(~([^ ~]*)~)'), partial(insert_node_to_ast, 'sub')), (re.compile(r'(\^([^ \^]*)\^)'), partial(insert_node_to_ast, 'sup')), ) def inject_subsup_tags(ast):
# coding=utf-8 import pyphen import re hyp = pyphen.Pyphen(lang='hu_HU') def hyphenated(text): res = [] for w in text.decode('utf-8').split(u' '): h = hyp.inserted(w, '­') h = h.replace('cs­cs', 'ccs') h = h.replace('dz­dz', 'ddz') h = h.replace('dzs­dzs', 'ddzs') h = h.replace('gy­gy', 'ggy') h = h.replace('ly­ly', 'lly') h = h.replace('ny­ny', 'nny') h = h.replace('sz­sz', 'ssz') h = h.replace('ty­ty', 'tty') h = h.replace('zs­zs', 'zzs') res.append(h) return ' '.join(res).encode('utf-8') lines = [] prev = None separator = False heading = False body = False with file('a-tavoli-fa.html') as f: for l in f:
test = pd.DataFrame( np.hstack( (vectorizer.transform(X_test).toarray(), np.array(y_test)[:, None]))) train.to_csv("train.csv", header=False, index=False) test.to_csv("test.csv", header=False, index=False) spam_list = [] with open("spam_word_list.txt", "r") as f: spam_list = [ word.strip().lower() for word in f.readlines() if word != "\n" ] d = enchant.Dict("en_US") pyphen.language_fallback('nl_NL_variant1') dic = pyphen.Pyphen(lang='en_GB') def extract_features(doc): doc = doc.lower() res = [] tokens = word_tokenize(doc) sents = sent_tokenize(doc) # Number of sentences res.append(len(sents)) # Number of verbs tags = pos_tag(tokens) counts = Counter(token[1] for token in tags) res.append(counts["VB"])
#-*- coding: utf-8 -*- # Program to split words into syllables and calculate comprehensiveness' coefficient # Declare titles: titles = ['NCzas', 'Newsweek', 'Onet', 'Polityka', 'WPolityce'] import pyphen, csv, re, math # Declare handler to consume words and split into syllables: dic = pyphen.Pyphen(lang='pl_PL') for title in titles: path = "C:\\Users\\Ilona\\PycharmProjects\\TestDataGenerator\\ArticleText\\" + title + "Text.txt" file = open(path) reader = csv.reader(file) # Declare pattern to remove all non-alphanumeric characters: pattern = re.compile('[\W_]+') sentLen = [] articleText = [] # Split into syllables: for line in reader: for item in line: sentences = item.split('.') for sentence in sentences: words = sentence.split(' ') words = list(filter(len, words)) # Counter for counting words longer than 3 syllables: longerThan3 = 0
def __init__(self, language=None, ablate=None, features_to_use=None):
    """Select the features to compute and load the resources they need.

    Args:
        language(str): language of input data. When it is one of
            'english', 'spanish', 'german' or 'french', only the resources
            of that language are loaded; any other value (including
            ``None``) loads the resources of all four languages.
        ablate: stored unchanged on ``self.ablate``.
        features_to_use: a list of string named features to use. ``None``
            or the string ``'all'`` selects every known feature. Unknown
            names are reported with ``print`` and ignored.
    """
    # This dict contains all available features, along with a list of their
    # high-computing-power requirements e.g. ['spacy']
    feature_requirements = {
        # TODO: Actually fill in these requirements. At the moment, I'm
        # just putting everything as all requirements.
        'is_nounphrase': ['spacy'],
        'len_tokens_norm': ['spacy'],
        'hypernym_count': None,
        'len_chars_norm': None,
        'len_tokens': None,
        'len_syllables': ['hyph'],
        'consonant_freq': None,
        'gr_or_lat': ['affix'],
        'is_capitalised': None,
        'num_complex_punct': None,
        'avg_chars_p_word': None,
        'sent_length': None,
        'unigram_prob': ['unigram_probs'],
        'char_n_gram_feats': None,
        'sent_n_gram_feats': None,
        'iob_tags': ['spacy'],
        'lemma_feats': ['spacy'],
        'bag_of_shapes': ['spacy'],
        'pos_tag_counts': ['spacy'],
        'NER_tag_counts': ['spacy'],
    }

    if features_to_use is None or features_to_use == 'all':
        features_to_use = list(feature_requirements)

    # Total requirements is a unique set of all the requirements.
    self.total_requirements = set()
    final_features = []
    for feature in features_to_use:
        # Making sure that we know about the feature
        if feature in feature_requirements:
            requirements = feature_requirements[feature]
            if requirements is not None:
                self.total_requirements.update(requirements)
            final_features.append(feature)
        else:
            print(
                "{} did not match any of the features in feature_requirements, so was not used."
                .format(feature))
    self.features_to_use = final_features

    # Per-language resource identifiers; key order is the loading order.
    spacy_model_names = {
        'english': 'en_core_web_lg',
        'spanish': 'es_core_news_md',
        'german': 'de_core_news_sm',
        'french': 'fr_core_news_md',
    }
    pyphen_codes = {
        'english': 'en',
        'spanish': 'es',
        'german': 'de',
        'french': 'fr',
    }
    supported_languages = tuple(spacy_model_names)

    # Defaults: every supported language present, nothing loaded.
    self.affixes = {}
    self.spacy_models = dict.fromkeys(supported_languages)
    self.hyph_dictionaries = dict.fromkeys(supported_languages)
    self.unigram_prob_dict = dict.fromkeys(supported_languages)

    # So that we're only opening this file once.
    if 'affix' in self.total_requirements:
        self.affixes = affix_features.get_affixes()

    # A known language restricts loading to that language; anything else
    # means "load everything".
    if language in supported_languages:
        selected_languages = (language,)
    else:
        selected_languages = supported_languages

    # A loaded resource dict replaces the default one, so it only holds the
    # selected languages (no ``None`` placeholders for the others).
    if 'spacy' in self.total_requirements:
        self.spacy_models = {
            name: spacy.load(spacy_model_names[name])
            for name in selected_languages
        }
    if 'hyph' in self.total_requirements:
        self.hyph_dictionaries = {
            name: pyphen.Pyphen(lang=pyphen_codes[name])
            for name in selected_languages
        }
    if 'unigram_probs' in self.total_requirements:
        self.unigram_prob_dict = {
            name: file_io.read_file(
                'data/external/{}_u_prob.csv'.format(name))
            for name in selected_languages
        }

    self.ablate = ablate
def split_first_line(text, style, hinting, max_width, line_width):
    """Fit as much as possible in the available width for one line of text.

    Return ``(layout, length, resume_at, width, height, baseline)``.

    ``layout``: a pango Layout with the first line
    ``length``: length in UTF-8 bytes of the first line
    ``resume_at``: The number of UTF-8 bytes to skip for the next line.
                   May be ``None`` if the whole text fits in one line.
                   This may be greater than ``length`` in case of preserved
                   newline characters.
    ``width``: width in pixels of the first line
    ``height``: height in pixels of the first line
    ``baseline``: baseline in pixels of the first line

    Parameters:

    ``text``: the text to lay out.
    ``style``: style object; this function reads ``font_size``, ``hyphens``,
               ``lang``, ``hyphenate_limit_chars``, ``hyphenate_limit_zone``
               and ``hyphenate_character`` from it.
    ``hinting``: forwarded unchanged to ``create_layout``.
    ``max_width``: available width, or ``None`` for "no limit".
    ``line_width``: not used by this function.

    The result is whatever ``first_line_metrics`` returns; it is built at
    every exit point of this function.

    """
    # In some cases (shrink-to-fit result being the preferred width)
    # this value is coming from Pango itself,
    # but floating point errors have accumulated:
    #   width2 = (width + X) - X   # in some cases, width2 < width
    # Increase the value a bit to compensate and not introduce
    # an unexpected line break.
    if max_width is not None:
        max_width *= 1.0001

    # Step #1: Get a draft layout with the first line
    layout = None
    if max_width:
        # Rough guess of how many characters can fit in max_width, derived
        # from the font size.
        expected_length = int(max_width / style.font_size * 2.5)
        if expected_length < len(text):
            # Try to use a small amount of text instead of the whole text
            layout = create_layout(
                text[:expected_length], style, hinting, max_width)
            lines = layout.iter_lines()
            first_line = next(lines, None)
            second_line = next(lines, None)
            if second_line is None:
                # The small amount of text fits in one line, give up and use
                # the whole text
                layout = None
    if layout is None:
        layout = create_layout(text, style, hinting, max_width)
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
    # No second line means the whole text was consumed by the first one.
    resume_at = None if second_line is None else second_line.start_index

    # Step #2: Don't hyphenize when it's not needed
    if max_width is None:
        # The first line can take all the place needed
        return first_line_metrics(first_line, text, layout, resume_at)
    first_line_width, _height = get_size(first_line)
    if second_line is None and first_line_width <= max_width:
        # The first line fits in the available width
        return first_line_metrics(first_line, text, layout, resume_at)

    # Step #3: Try to put the first word of the second line on the first line
    if first_line_width <= max_width:
        # The first line may have been cut too early by Pango
        second_line_index = second_line.start_index
        first_part = utf8_slice(text, slice(second_line_index))
        second_part = utf8_slice(text, slice(second_line_index, None))
    else:
        # The first word is longer than the line, try to hyphenize it
        first_part = ''
        second_part = text
    # Everything up to the first space of the remaining text.
    next_word = second_part.split(' ', 1)[0]

    if not next_word:
        # We did not find a word on the next line
        return first_line_metrics(first_line, text, layout, resume_at)

    # next_word might fit without a space afterwards.
    # Pango previously counted that space's advance width.
    new_first_line = first_part + next_word
    layout.set_text(new_first_line)
    lines = layout.iter_lines()
    first_line = next(lines, None)
    second_line = next(lines, None)
    first_line_width, _height = get_size(first_line)
    if second_line is None and first_line_width <= max_width:
        # The next word fits in the first line, keep the layout.
        # The "+ 1" accounts for the space left out of new_first_line.
        resume_at = len(new_first_line.encode('utf-8')) + 1
        return first_line_metrics(first_line, text, layout, resume_at)

    # Step #4: Try to hyphenize
    hyphens = style.hyphens
    lang = style.lang
    total, left, right = style.hyphenate_limit_chars

    if hyphens in ('none', 'manual') or lang not in pyphen.LANGUAGES:
        # No automatic hyphenation
        return first_line_metrics(first_line, text, layout, resume_at)
    elif len(next_word) < total:
        # Next word is too small
        return first_line_metrics(first_line, text, layout, resume_at)

    first_line_width, _height = get_size(first_line)
    # Remaining room on the line; negative when the line overflows.
    space = max_width - first_line_width
    if style.hyphenate_limit_zone.unit == '%':
        # Percentages are relative to the (slightly enlarged) max_width.
        limit_zone = max_width * style.hyphenate_limit_zone.value / 100.
    else:
        limit_zone = style.hyphenate_limit_zone.value

    hyphenated = False
    if space > limit_zone or space < 0:
        # The next word does not fit, try hyphenation
        # Pyphen dictionaries are cached per (lang, left, right, total).
        dictionary_key = (lang, left, right, total)
        dictionary = PYPHEN_DICTIONARY_CACHE.get(dictionary_key)
        if dictionary is None:
            dictionary = pyphen.Pyphen(lang=lang, left=left, right=right)
            PYPHEN_DICTIONARY_CACHE[dictionary_key] = dictionary
        for first_word_part, _ in dictionary.iterate(next_word):
            # Candidate first line: text so far + word start + hyphen.
            new_first_line = (first_part + first_word_part +
                              style.hyphenate_character)
            temp_layout = create_layout(
                new_first_line, style, hinting, max_width)
            temp_lines = temp_layout.iter_lines()
            temp_first_line = next(temp_lines, None)
            temp_second_line = next(temp_lines, None)

            # Accept the candidate when it stays on one line, or
            # unconditionally when the line was already overflowing.
            if (temp_second_line is None and space >= 0) or space < 0:
                hyphenated = True
                # TODO: find why there's no need to .encode
                # NOTE(review): this is a length in characters while the
                # docstring describes resume_at in UTF-8 bytes -- confirm.
                resume_at = len(first_part + first_word_part)
                layout = temp_layout
                first_line = temp_first_line
                second_line = temp_second_line
                temp_first_line_width, _height = get_size(temp_first_line)
                # Stop at the first accepted candidate that really fits.
                if temp_first_line_width <= max_width:
                    break

    return first_line_metrics(first_line, text, layout, resume_at, hyphenated)
def split_first_line(text, style, context, max_width, line_width):
    """Fit as much as possible in the available width for one line of text.

    Return ``(layout, length, resume_at, width, height, baseline)``.

    ``layout``: a pango Layout with the first line
    ``length``: length in UTF-8 bytes of the first line
    ``resume_at``: The number of UTF-8 bytes to skip for the next line.
                   May be ``None`` if the whole text fits in one line.
                   This may be greater than ``length`` in case of preserved
                   newline characters.
    ``width``: width in pixels of the first line
    ``height``: height in pixels of the first line
    ``baseline``: baseline in pixels of the first line

    Parameters:

    ``text``: the text to lay out.
    ``style``: style object; this function reads ``white_space``,
               ``hyphens``, ``lang``, ``hyphenate_limit_chars``,
               ``hyphenate_limit_zone``, ``hyphenate_character`` and
               ``overflow_wrap`` from it.
    ``context``: forwarded unchanged to ``create_layout``.
    ``max_width``: available width, or ``None`` for "no limit".
    ``line_width``: not used by this function.

    The result is whatever ``first_line_metrics`` returns; it is built at
    every exit point of this function.

    """
    # NOTE(review): despite its name, ``text_wrap`` is true for the
    # white-space values 'pre' and 'nowrap', for which the width limit is
    # dropped just below.
    text_wrap = style.white_space in ('pre', 'nowrap')
    space_collapse = style.white_space in ('normal', 'nowrap', 'pre-line')

    if text_wrap:
        max_width = None
    elif max_width is not None:
        # In some cases (shrink-to-fit result being the preferred width)
        # this value is coming from Pango itself,
        # but floating point errors have accumulated:
        #   width2 = (width + X) - X   # in some cases, width2 < width
        # Increase the value a bit to compensate and not introduce
        # an unexpected line break. The 1e-9 value comes from PEP 485.
        max_width *= 1 + 1e-9

    # Step #1: Get a draft layout with the first line
    layout = create_layout(text, style, context, max_width)
    lines = layout.iter_lines()
    first_line = next(lines, None)
    second_line = next(lines, None)
    # No second line means the whole text was consumed by the first one.
    resume_at = None if second_line is None else second_line.start_index

    # Step #2: Don't hyphenize when it's not needed
    if max_width is None:
        # The first line can take all the place needed
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)
    first_line_width, _ = get_size(first_line, style)
    if second_line is None and first_line_width <= max_width:
        # The first line fits in the available width
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)

    # Step #3: Try to put the first word of the second line on the first line
    if first_line_width <= max_width:
        # The first line may have been cut too early by Pango
        second_line_index = second_line.start_index
        first_line_text = utf8_slice(text, slice(second_line_index))
        second_line_text = utf8_slice(text, slice(second_line_index, None))
    else:
        # The first word is longer than the line, try to hyphenize it
        first_line_text = ''
        second_line_text = text

    # Everything up to the first space of the remaining text.
    next_word = second_line_text.split(' ', 1)[0]
    if next_word:
        if space_collapse:
            # next_word might fit without a space afterwards
            # only try when space collapsing is allowed
            new_first_line_text = first_line_text + next_word
            layout.set_text(new_first_line_text)
            lines = layout.iter_lines()
            first_line = next(lines, None)
            second_line = next(lines, None)
            first_line_width, _ = get_size(first_line, style)
            if second_line is None and first_line_text:
                # The next word fits in the first line, keep the layout
                # The "+ 1" accounts for the space left out of the layout.
                resume_at = len(new_first_line_text.encode('utf-8')) + 1
                if resume_at == len(text.encode('utf-8')):
                    # Nothing is left after the skipped space.
                    resume_at = None
                return first_line_metrics(
                    first_line, text, layout, resume_at, space_collapse,
                    style)
            elif second_line:
                # Text may have been split elsewhere by Pango earlier
                resume_at = second_line.start_index
            else:
                resume_at = first_line.length + 1
    elif first_line_text:
        # We found something on the first line but we did not find a word on
        # the next line, no need to hyphenate, we can keep the current layout
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)

    # Step #4: Try to hyphenize
    hyphens = style.hyphens
    lang = style.lang and pyphen.language_fallback(style.lang)
    total, left, right = style.hyphenate_limit_chars
    hyphenated = False
    soft_hyphen = u'\u00ad'

    # Automatic hyphenation possible and next word is long enough
    if hyphens != 'none' and len(next_word) >= total:
        first_line_width, _ = get_size(first_line, style)
        # Remaining room on the line; negative when the line overflows.
        space = max_width - first_line_width
        if style.hyphenate_limit_zone.unit == '%':
            # Percentages are relative to the (slightly enlarged) max_width.
            limit_zone = max_width * style.hyphenate_limit_zone.value / 100.
        else:
            limit_zone = style.hyphenate_limit_zone.value

        if space > limit_zone or space < 0:
            # Manual hyphenation: check that the line ends with a soft hyphen
            # and add the missing hyphen
            if hyphens == 'manual':
                if first_line_text.endswith(soft_hyphen):
                    # The first line has been split on a soft hyphen
                    if u' ' in first_line_text:
                        # Move the last (partial) word of the first line
                        # back into next_word and lay out what is left.
                        first_line_text, next_word = (
                            first_line_text.rsplit(u' ', 1))
                        next_word = u' ' + next_word
                        layout.set_text(first_line_text)
                        lines = layout.iter_lines()
                        first_line = next(lines, None)
                        second_line = next(lines, None)
                        resume_at = len(
                            (first_line_text + u' ').encode('utf8'))
                    else:
                        first_line_text, next_word = u'', first_line_text
                # Candidate break points: every soft hyphen of the word,
                # the rightmost one first.
                soft_hyphen_indexes = [
                    match.start() for match in
                    re.finditer(soft_hyphen, next_word)]
                soft_hyphen_indexes.reverse()
                # Each candidate keeps its soft hyphen (hence "i + 1").
                dictionary_iterations = [
                    next_word[:i + 1] for i in soft_hyphen_indexes]
            elif hyphens == 'auto' and lang:
                # The next word does not fit, try hyphenation
                # Pyphen dictionaries are cached per
                # (lang, left, right, total).
                dictionary_key = (lang, left, right, total)
                dictionary = PYPHEN_DICTIONARY_CACHE.get(dictionary_key)
                if dictionary is None:
                    dictionary = pyphen.Pyphen(
                        lang=lang, left=left, right=right)
                    PYPHEN_DICTIONARY_CACHE[dictionary_key] = dictionary
                dictionary_iterations = [
                    start for start, end in dictionary.iterate(next_word)]
            else:
                dictionary_iterations = []

            if dictionary_iterations:
                for first_word_part in dictionary_iterations:
                    new_first_line_text = first_line_text + first_word_part
                    hyphenated_first_line_text = (
                        new_first_line_text + style.hyphenate_character)
                    new_layout = create_layout(
                        hyphenated_first_line_text, style, context, max_width)
                    new_lines = new_layout.iter_lines()
                    new_first_line = next(new_lines, None)
                    new_second_line = next(new_lines, None)
                    new_first_line_width, _ = get_size(new_first_line, style)
                    new_space = max_width - new_first_line_width
                    # Accept a single-line candidate that fits, or the last
                    # candidate of the list even when it overflows.
                    if new_second_line is None and (
                            new_space >= 0 or
                            first_word_part == dictionary_iterations[-1]):
                        hyphenated = True
                        layout = new_layout
                        first_line = new_first_line
                        second_line = new_second_line
                        resume_at = len(new_first_line_text.encode('utf8'))
                        # NOTE(review): ``text`` is indexed by a character
                        # count here; this presumably raises IndexError when
                        # the candidate reaches the end of text -- confirm.
                        if text[len(new_first_line_text)] == soft_hyphen:
                            # Also skip the soft hyphen that follows.
                            resume_at += len(soft_hyphen.encode('utf8'))
                        break

                if not hyphenated and not first_line_text:
                    # Recreate the layout with no max_width to be sure that
                    # we don't break inside the hyphenate-character string
                    # At this point the variables hold the last candidate
                    # tried by the loop above.
                    hyphenated = True
                    layout.set_text(hyphenated_first_line_text)
                    pango.pango_layout_set_width(
                        layout.layout, units_from_double(-1))
                    lines = layout.iter_lines()
                    first_line = next(lines, None)
                    second_line = next(lines, None)
                    resume_at = len(new_first_line_text.encode('utf8'))
                    # NOTE(review): this tests text[len(first_line_text)],
                    # i.e. text[0] in this branch, whereas the loop above
                    # tests text[len(new_first_line_text)] -- confirm which
                    # one is intended.
                    if text[len(first_line_text)] == soft_hyphen:
                        resume_at += len(soft_hyphen.encode('utf8'))

    if not hyphenated and first_line_text.endswith(soft_hyphen):
        # Recreate the layout with no max_width to be sure that
        # we don't break inside the hyphenate-character string
        hyphenated = True
        hyphenated_first_line_text = (
            first_line_text + style.hyphenate_character)
        layout.set_text(hyphenated_first_line_text)
        pango.pango_layout_set_width(
            layout.layout, units_from_double(-1))
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
        resume_at = len(first_line_text.encode('utf8'))

    # Step 5: Try to break word if it's too long for the line
    overflow_wrap = style.overflow_wrap
    first_line_width, _ = get_size(first_line, style)
    space = max_width - first_line_width
    # If we can break words and the first line is too long
    if overflow_wrap == 'break-word' and space < 0:
        # Is it really OK to remove hyphenation for word-break ?
        hyphenated = False
        # TODO: Modify code to preserve W3C condition:
        # "Shaping characters are still shaped as if the word were not broken"
        # The way new lines are processed in this function (one by one with no
        # memory of the last) prevents shaping characters (arabic, for
        # instance) from keeping their shape when wrapped on the next line with
        # pango layout. Maybe insert Unicode shaping characters in text ?
        # Lay out the full text again with the width limit restored and
        # the WRAP_WORD_CHAR wrap mode to find where the first line ends.
        layout.set_text(text)
        pango.pango_layout_set_width(
            layout.layout, units_from_double(max_width))
        layout.set_wrap(PANGO_WRAP_MODE['WRAP_WORD_CHAR'])
        temp_lines = layout.iter_lines()
        # The first line itself is not needed, only where the second starts.
        next(temp_lines, None)
        temp_second_line = next(temp_lines, None)
        temp_second_line_index = (
            len(text.encode('utf-8')) if temp_second_line is None
            else temp_second_line.start_index)
        resume_at = temp_second_line_index
        first_line_text = utf8_slice(text, slice(temp_second_line_index))
        layout.set_text(first_line_text)
        lines = layout.iter_lines()
        first_line = next(lines, None)

    return first_line_metrics(
        first_line, text, layout, resume_at, space_collapse, style,
        hyphenated, style.hyphenate_character)
def test_upper_alternative():
    """Check the alternative parser on an uppercase Hungarian word."""
    word = 'KULISSZA'
    hungarian = pyphen.Pyphen(lang='hu', left=1, right=1)

    # The case of the input is expected to be kept in every split.
    expected_splits = (('KULISZ', 'SZA'), ('KU', 'LISSZA'))
    assert tuple(hungarian.iterate(word)) == expected_splits

    assert hungarian.inserted(word) == 'KU-LISZ-SZA'
import pyphen

dic = pyphen.Pyphen(lang='en_US')


def insert_soft_hyphens(text, hyphen='\xad'):
    """Insert ``hyphen`` at the break points found by the en_US dictionary.

    The text is processed line by line and word by word: the words of a
    line are split on whitespace, hyphenated, and joined back with single
    spaces; the lines are joined back with a newline character.

    The default ``hyphen`` is the soft hyphen (SHY) character, U+00AD.
    """
    def hyphenate_line(line):
        # Hyphenate each whitespace-separated word of one line.
        return ' '.join(dic.inserted(word, hyphen) for word in line.split())

    return '\n'.join(hyphenate_line(line) for line in text.splitlines())
def test_all_dictionaries():
    """Check that a dictionary can be built for every bundled language."""
    for language_code in pyphen.LANGUAGES:
        # Building the object is the test: it must not raise.
        pyphen.Pyphen(lang=language_code)
import os, re, numpy as np, pandas as pd, json
import pyphen
from collections import Counter
from collections import defaultdict

# German hyphenation dictionary (not used in the statements visible here).
dic = pyphen.Pyphen(lang='de_DE')

# Load the noun table, keeping only the lemma, gender and suffix columns.
df = pd.read_csv('german_viz/data/nouns.csv', encoding='utf-8-sig')[['lemma','genus','suffix']]
print(df)

# Column values as flat sequences.
nouns = df['lemma'].values
genders =df['genus'].values
# NOTE(review): only the first 34 rows are kept for the "suffix" views
# below; the meaning of 34 is not visible here -- confirm.
suffixgenders = genders[:34]
all_suffixs = df['suffix'].values.tolist()
suffixs = df['suffix'].values.tolist()[:34]

# Number of occurrences of every suffix value over the whole table.
suffix_dict_freq = defaultdict(int)
for i in range(len(all_suffixs)):
    suffix_dict_freq[all_suffixs[i]] += 1
# Bare expression with no effect in a script (presumably notebook output).
suffix_dict_freq

links, nodes = [], []

# Number of occurrences of every gender value over the whole table.
gender_freq = defaultdict(int)
for i in range(len(genders)):
    gender_freq[genders[i]] += 1
# Bare expression with no effect in a script (presumably notebook output).
gender_freq

# Record for the masculine gender; 'm' is the key used for it in ``genus``.
mas = {'name': 'masculine', 'freq': gender_freq['m'], 'i': -1 }
def test_wrap():
    """Check that ``wrap`` splits a long Dutch word for a given width."""
    dutch = pyphen.Pyphen(lang='nl_NL')
    wrapped = dutch.wrap('autobandventieldopje', 11)
    assert wrapped == ('autoband-', 'ventieldopje')
def test_fallback_dict():
    """Check ``iterate`` when the language resolves to a fallback dict."""
    # 'nl_NL-variant' is expected to give the same splits as 'nl_NL'.
    dutch_variant = pyphen.Pyphen(lang='nl_NL-variant')
    splits = tuple(dutch_variant.iterate('Amsterdam'))
    expected_splits = (('Amster', 'dam'), ('Am', 'sterdam'))
    assert splits == expected_splits
def test_filename():
    """Check that a dictionary can be selected with ``filename``."""
    dictionary_path = pyphen.LANGUAGES['nl_NL']
    hyphenator = pyphen.Pyphen(filename=dictionary_path)
    assert hyphenator.inserted('lettergrepen') == 'let-ter-gre-pen'
def count_syllables(phrase):
    """Return the number of hyphenation chunks found in ``phrase``.

    The phrase is hyphenated with the 'en' dictionary; every ``-`` is then
    turned into a space and the whitespace-separated pieces are counted.
    """
    hyphenator = pyphen.Pyphen(lang='en')
    marked = hyphenator.inserted(phrase)
    chunks = marked.replace('-', ' ').split()
    return len(chunks)
def test_upper():
    """Check that uppercase words keep their case when hyphenated."""
    hyphenator = pyphen.Pyphen(lang='nl_NL')
    result = hyphenator.inserted('LETTERGREPEN')
    assert result == 'LET-TER-GRE-PEN'
def split_first_line(text, style, context, max_width, line_width,
                     justification_spacing):
    """Fit as much as possible in the available width for one line of text.

    Parameters, as used by this function:

    ``text``: the string to lay out.
    ``style``: style object; the attributes read here are ``white_space``,
               ``font_size``, ``hyphens``, ``lang``,
               ``hyphenate_limit_chars``, ``hyphenate_limit_zone``,
               ``hyphenate_character`` and ``overflow_wrap``.
    ``context``: forwarded unchanged to ``create_layout``.
    ``max_width``: available width, or ``None``. It is forced to ``None``
                   when the ``white_space`` value does not allow wrapping.
    ``line_width``: not referenced in this function body.
    ``justification_spacing``: forwarded unchanged to ``create_layout``.

    Return ``(layout, length, resume_at, width, height, baseline)``.
    The tuple is built by ``first_line_metrics``, which is defined
    elsewhere.

    ``layout``: a pango Layout with the first line
    ``length``: length in UTF-8 bytes of the first line
    ``resume_at``: The number of UTF-8 bytes to skip for the next line.
                   May be ``None`` if the whole text fits in one line.
                   This may be greater than ``length`` in case of preserved
                   newline characters.
    ``width``: width in pixels of the first line
    ``height``: height in pixels of the first line
    ``baseline``: baseline in pixels of the first line

    """
    # See https://www.w3.org/TR/css-text-3/#white-space-property
    text_wrap = style.white_space in ('normal', 'pre-wrap', 'pre-line')
    space_collapse = style.white_space in ('normal', 'nowrap', 'pre-line')

    if not text_wrap:
        # No wrapping allowed: behave as if the width were unlimited
        max_width = None

    # Step #1: Get a draft layout with the first line
    layout = None
    if max_width is not None and max_width != float('inf'):
        # Rough guess of how many characters are worth laying out, derived
        # from the width / font size ratio
        expected_length = int(max_width / style.font_size * 2.5)
        if expected_length < len(text):
            # Try to use a small amount of text instead of the whole text
            layout = create_layout(
                text[:expected_length], style, context, max_width,
                justification_spacing)
            lines = layout.iter_lines()
            first_line = next(lines, None)
            second_line = next(lines, None)
            if second_line is None:
                # The small amount of text fits in one line, give up and use
                # the whole text
                layout = None
    if layout is None:
        layout = create_layout(
            text, style, context, max_width, justification_spacing)
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
    resume_at = None if second_line is None else second_line.start_index

    # Step #2: Don't hyphenize when it's not needed
    if max_width is None:
        # The first line can take all the place needed
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)
    first_line_width, _ = get_size(first_line, style)
    if second_line is None and first_line_width <= max_width:
        # The first line fits in the available width
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)

    # Step #3: Try to put the first word of the second line on the first line
    # https://mail.gnome.org/archives/gtk-i18n-list/2013-September/msg00006
    # is a good thread related to this problem.
    if first_line_width <= max_width:
        # The first line may have been cut too early by Pango
        second_line_index = second_line.start_index
        first_line_text = utf8_slice(text, slice(second_line_index))
        second_line_text = utf8_slice(text, slice(second_line_index, None))
    else:
        # The first word is longer than the line, try to hyphenize it
        first_line_text = ''
        second_line_text = text

    # Everything up to the first space of the remaining text
    next_word = second_line_text.split(' ', 1)[0]
    if next_word:
        if space_collapse:
            # next_word might fit without a space afterwards
            # only try when space collapsing is allowed
            new_first_line_text = first_line_text + next_word
            layout.set_text(new_first_line_text)
            lines = layout.iter_lines()
            first_line = next(lines, None)
            second_line = next(lines, None)
            first_line_width, _ = get_size(first_line, style)
            if second_line is None and first_line_text:
                # The next word fits in the first line, keep the layout
                # (+ 1 skips the byte following the word)
                resume_at = len(new_first_line_text.encode('utf-8')) + 1
                if resume_at == len(text.encode('utf-8')):
                    # Nothing is left after the skipped byte
                    resume_at = None
                return first_line_metrics(
                    first_line, text, layout, resume_at, space_collapse,
                    style)
            elif second_line:
                # Text may have been split elsewhere by Pango earlier
                resume_at = second_line.start_index
            else:
                resume_at = first_line.length + 1
    elif first_line_text:
        # We found something on the first line but we did not find a word on
        # the next line, no need to hyphenate, we can keep the current layout
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)

    # Step #4: Try to hyphenize
    hyphens = style.hyphens
    lang = style.lang and pyphen.language_fallback(style.lang)
    total, left, right = style.hyphenate_limit_chars
    hyphenated = False
    soft_hyphen = u'\u00ad'

    # Automatic hyphenation possible and next word is long enough
    if hyphens != 'none' and len(next_word) >= total:
        first_line_width, _ = get_size(first_line, style)
        space = max_width - first_line_width
        if style.hyphenate_limit_zone.unit == '%':
            # Percentages are relative to the available width
            limit_zone = max_width * style.hyphenate_limit_zone.value / 100.
        else:
            limit_zone = style.hyphenate_limit_zone.value

        # Hyphenate when the unused space exceeds the limit zone, or when
        # the line already overflows
        if space > limit_zone or space < 0:
            # Manual hyphenation: check that the line ends with a soft hyphen
            # and add the missing hyphen
            if hyphens == 'manual':
                if first_line_text.endswith(soft_hyphen):
                    # The first line has been split on a soft hyphen
                    if u' ' in first_line_text:
                        first_line_text, next_word = (
                            first_line_text.rsplit(u' ', 1))
                        next_word = u' ' + next_word
                        layout.set_text(first_line_text)
                        lines = layout.iter_lines()
                        first_line = next(lines, None)
                        second_line = next(lines, None)
                        resume_at = len(
                            (first_line_text + u' ').encode('utf8'))
                    else:
                        first_line_text, next_word = u'', first_line_text
                soft_hyphen_indexes = [
                    match.start() for match in
                    re.finditer(soft_hyphen, next_word)
                ]
                # Reversed so that the longest prefix is tried first
                soft_hyphen_indexes.reverse()
                dictionary_iterations = [
                    next_word[:i + 1] for i in soft_hyphen_indexes
                ]
            elif hyphens == 'auto' and lang:
                # The next word does not fit, try hyphenation
                dictionary_key = (lang, left, right, total)
                dictionary = PYPHEN_DICTIONARY_CACHE.get(dictionary_key)
                if dictionary is None:
                    dictionary = pyphen.Pyphen(
                        lang=lang, left=left, right=right)
                    PYPHEN_DICTIONARY_CACHE[dictionary_key] = dictionary
                # Keep only the first part of each proposed split
                dictionary_iterations = [
                    start for start, end in dictionary.iterate(next_word)
                ]
            else:
                dictionary_iterations = []

            if dictionary_iterations:
                for first_word_part in dictionary_iterations:
                    new_first_line_text = first_line_text + first_word_part
                    hyphenated_first_line_text = (
                        new_first_line_text + style.hyphenate_character)
                    new_layout = create_layout(
                        hyphenated_first_line_text, style, context,
                        max_width, justification_spacing)
                    new_lines = new_layout.iter_lines()
                    new_first_line = next(new_lines, None)
                    new_second_line = next(new_lines, None)
                    new_first_line_width, _ = get_size(new_first_line, style)
                    new_space = max_width - new_first_line_width
                    # Accept a candidate laid out on a single line when it
                    # fits, or when it is the last candidate available
                    if new_second_line is None and (
                            new_space >= 0 or
                            first_word_part == dictionary_iterations[-1]):
                        hyphenated = True
                        layout = new_layout
                        first_line = new_first_line
                        second_line = new_second_line
                        resume_at = len(new_first_line_text.encode('utf8'))
                        if text[len(new_first_line_text)] == soft_hyphen:
                            # Also skip the soft hyphen found at the break
                            resume_at += len(soft_hyphen.encode('utf8'))
                        break

                if not hyphenated and not first_line_text:
                    # Recreate the layout with no max_width to be sure that
                    # we don't break inside the hyphenate-character string
                    hyphenated = True
                    layout.set_text(hyphenated_first_line_text)
                    pango.pango_layout_set_width(
                        layout.layout, units_from_double(-1))
                    lines = layout.iter_lines()
                    first_line = next(lines, None)
                    second_line = next(lines, None)
                    resume_at = len(new_first_line_text.encode('utf8'))
                    if text[len(first_line_text)] == soft_hyphen:
                        resume_at += len(soft_hyphen.encode('utf8'))

    if not hyphenated and first_line_text.endswith(soft_hyphen):
        # Recreate the layout with no max_width to be sure that
        # we don't break inside the hyphenate-character string
        hyphenated = True
        hyphenated_first_line_text = (
            first_line_text + style.hyphenate_character)
        layout.set_text(hyphenated_first_line_text)
        pango.pango_layout_set_width(
            layout.layout, units_from_double(-1))
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
        resume_at = len(first_line_text.encode('utf8'))

    # Step 5: Try to break word if it's too long for the line
    overflow_wrap = style.overflow_wrap
    first_line_width, _ = get_size(first_line, style)
    space = max_width - first_line_width
    # If we can break words and the first line is too long
    if overflow_wrap == 'break-word' and space < 0:
        # Is it really OK to remove hyphenation for word-break ?
        hyphenated = False
        # TODO: Modify code to preserve W3C condition:
        # "Shaping characters are still shaped as if the word were not broken"
        # The way new lines are processed in this function (one by one with no
        # memory of the last) prevents shaping characters (arabic, for
        # instance) from keeping their shape when wrapped on the next line with
        # pango layout. Maybe insert Unicode shaping characters in text ?
        layout.set_text(text)
        pango.pango_layout_set_width(
            layout.layout, units_from_double(max_width))
        layout.set_wrap(PANGO_WRAP_MODE['WRAP_CHAR'])
        temp_lines = layout.iter_lines()
        next(temp_lines, None)
        temp_second_line = next(temp_lines, None)
        temp_second_line_index = (
            len(text.encode('utf-8')) if temp_second_line is None
            else temp_second_line.start_index)
        # TODO: WRAP_CHAR is said to "wrap lines at character boundaries", but
        # it doesn't. Looks like it tries to split at word boundaries and then
        # at character boundaries if there's no enough space for a full word,
        # just as WRAP_WORD_CHAR does. That's why we have to split this text
        # twice. Find why. It may be related to the problem described in the
        # link given in step #3.
        first_line_text = utf8_slice(text, slice(temp_second_line_index))
        layout.set_text(first_line_text)
        lines = layout.iter_lines()
        first_line = next(lines, None)
        second_line = next(lines, None)
        resume_at = (
            first_line.length if second_line is None
            else second_line.start_index)

    return first_line_metrics(
        first_line, text, layout, resume_at, space_collapse, style,
        hyphenated, style.hyphenate_character)
def get_number_pollisyllable_words(self):
    """Count the words of ``self.words`` hyphenated into 3 parts or more.

    Each word is hyphenated with the 'en' dictionary; a word is counted
    when the result contains at least two ``-`` separators.
    """
    hyphenator = pyphen.Pyphen(lang='en')
    return sum(
        1
        for word in self.words
        if hyphenator.inserted(word).count("-") >= 2
    )
import re from emoji import UNICODE_EMOJI from ast import literal_eval import spacy nlp = spacy.load('en_core_web_lg') spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS import nltk from nltk.tokenize import sent_tokenize, TweetTokenizer, casual_tokenize from nltk.sentiment.vader import SentimentIntensityAnalyzer nltk.download('punkt') nltk.download('vader_lexicon') import pyphen PYPHEN_DIC = pyphen.Pyphen(lang='en') from collections import Counter, OrderedDict, defaultdict from sklearn.metrics.pairwise import cosine_similarity from sklearn.feature_extraction.text import TfidfVectorizer tfidf_vectorizer = TfidfVectorizer() def cosine_similarity_sklearn(documents): X_train_counts = tfidf_vectorizer.fit_transform(documents) similarities = cosine_similarity(X_train_counts) return similarities.mean() all_pos_tags = [
import sys import pyphen import epitran import nltk import joblib from nltk.tokenize import RegexpTokenizer from inout.dta.corpus import Corpus from inout.dta.poem import Poem # sys.argv[1] Corpus path : ../../resources/Reim_Korpora/A_E_Parviol_Korpus/A_Parviol_Korpus # sys.argv[2] meter model path : ./meter/meter.model.joblib pyp = pyphen.Pyphen(lang='de') epi = epitran.Epitran('deu-Latn') #tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+') c = Corpus(sys.argv[1]) poems = c.get_poems() ###für das meter model### meter_model = sys.argv[2] clf = joblib.load(meter_model) tokenizer = RegexpTokenizer(r'\w+') def word2features(sentence, index): word = sentence[index] #print(word, len(word), " ", index) #postag = sentence[index][1]
def __init__(self, lexicon):
    """Load the lexicon file and create the fallback hyphenator.

    ``lexicon``: path of a JSON file; its parsed content is stored in
    ``self.lexicon``. ``self.fallbackDict`` is an 'en_US' pyphen
    dictionary.
    """
    with open(lexicon, 'r') as lexicon_file:
        raw_lexicon = lexicon_file.read()
    self.lexicon = json.loads(raw_lexicon)
    self.fallbackDict = pyphen.Pyphen(lang='en_US')
def split_first_line(text, style, context, max_width, justification_spacing,
                     minimum=False):
    """Fit as much as possible in the available width for one line of text.

    Parameters, as used by this function:

    ``text``: the string to lay out.
    ``style``: mapping of style values; the keys read here are
               ``white_space``, ``font_size``, ``lang``, ``hyphens``,
               ``hyphenate_limit_chars``, ``hyphenate_limit_zone``,
               ``hyphenate_character`` and ``overflow_wrap``.
    ``context``: forwarded to ``create_layout``; its ``dictionaries``
                 mapping is used as a cache of pyphen dictionaries.
    ``max_width``: available width, or ``None``. It is forced to ``None``
                   when the ``white_space`` value does not allow wrapping.
                   ``0`` is handled as a request for the minimum size.
    ``justification_spacing``: forwarded unchanged to ``create_layout``.
    ``minimum``: when true, the forced word break of step 5 is skipped.

    Return ``(layout, length, resume_at, width, height, baseline)``.
    The tuple is built by ``first_line_metrics``, which is defined
    elsewhere.

    ``layout``: a pango Layout with the first line
    ``length``: length in UTF-8 bytes of the first line
    ``resume_at``: The number of UTF-8 bytes to skip for the next line.
                   May be ``None`` if the whole text fits in one line.
                   This may be greater than ``length`` in case of preserved
                   newline characters.
    ``width``: width in pixels of the first line
    ``height``: height in pixels of the first line
    ``baseline``: baseline in pixels of the first line

    """
    # See https://www.w3.org/TR/css-text-3/#white-space-property
    text_wrap = style['white_space'] in ('normal', 'pre-wrap', 'pre-line')
    space_collapse = style['white_space'] in ('normal', 'nowrap', 'pre-line')

    if not text_wrap:
        # No wrapping allowed: behave as if the width were unlimited
        max_width = None

    # Step #1: Get a draft layout with the first line
    layout = None
    if (max_width is not None and max_width != float('inf') and
            style['font_size']):
        if max_width == 0:
            # Trying to find minimum size, let's naively split on spaces and
            # keep one word + one letter
            space_index = text.find(' ')
            if space_index == -1:
                expected_length = len(text)
            else:
                expected_length = space_index + 2  # index + space + one letter
        else:
            # Rough guess of how many characters are worth laying out,
            # derived from the width / font size ratio
            expected_length = int(max_width / style['font_size'] * 2.5)
        if expected_length < len(text):
            # Try to use a small amount of text instead of the whole text
            layout = create_layout(
                text[:expected_length], style, context, max_width,
                justification_spacing)
            first_line, index = layout.get_first_line()
            if index is None:
                # The small amount of text fits in one line, give up and use
                # the whole text
                layout = None
    if layout is None:
        layout = create_layout(
            text, style, context, max_width, justification_spacing)
        first_line, index = layout.get_first_line()
    resume_at = index

    # Step #2: Don't split lines when it's not needed
    if max_width is None:
        # The first line can take all the place needed
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)
    first_line_width, _ = get_size(first_line, style)
    if index is None and first_line_width <= max_width:
        # The first line fits in the available width
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)

    # Step #3: Try to put the first word of the second line on the first line
    # https://mail.gnome.org/archives/gtk-i18n-list/2013-September/msg00006
    # is a good thread related to this problem.
    first_line_text = utf8_slice(text, slice(index))
    # We can’t rely on first_line_width, see
    # https://github.com/Kozea/WeasyPrint/issues/1051
    first_line_fits = (
        first_line_width <= max_width or
        ' ' in first_line_text.strip() or
        can_break_text(
            first_line_text.strip(), style['lang']))
    if first_line_fits:
        # The first line fits but may have been cut too early by Pango
        second_line_text = utf8_slice(text, slice(index, None))
    else:
        # The line can't be split earlier, try to hyphenate the first word.
        first_line_text = ''
        second_line_text = text

    # Everything up to the first space of the remaining text
    next_word = second_line_text.split(' ', 1)[0]
    if next_word:
        if space_collapse:
            # next_word might fit without a space afterwards
            # only try when space collapsing is allowed
            new_first_line_text = first_line_text + next_word
            layout.set_text(new_first_line_text)
            first_line, index = layout.get_first_line()
            first_line_width, _ = get_size(first_line, style)
            if index is None and first_line_text:
                # The next word fits in the first line, keep the layout
                # (+ 1 skips the byte following the word)
                resume_at = len(new_first_line_text.encode('utf-8')) + 1
                return first_line_metrics(
                    first_line, text, layout, resume_at, space_collapse,
                    style)
            elif index:
                # Text may have been split elsewhere by Pango earlier
                resume_at = index
            else:
                # Second line is none
                resume_at = first_line.length + 1
                if resume_at >= len(text.encode('utf-8')):
                    # Nothing is left to resume from
                    resume_at = None
    elif first_line_text:
        # We found something on the first line but we did not find a word on
        # the next line, no need to hyphenate, we can keep the current layout
        return first_line_metrics(
            first_line, text, layout, resume_at, space_collapse, style)

    # Step #4: Try to hyphenate
    hyphens = style['hyphens']
    lang = style['lang'] and pyphen.language_fallback(style['lang'])
    total, left, right = style['hyphenate_limit_chars']
    hyphenated = False
    soft_hyphen = '\u00ad'

    try_hyphenate = False
    if hyphens != 'none':
        next_word_boundaries = get_next_word_boundaries(second_line_text, lang)
        if next_word_boundaries:
            # We have a word to hyphenate
            start_word, stop_word = next_word_boundaries
            next_word = second_line_text[start_word:stop_word]
            if stop_word - start_word >= total:
                # This word is long enough
                first_line_width, _ = get_size(first_line, style)
                space = max_width - first_line_width
                if style['hyphenate_limit_zone'].unit == '%':
                    # Percentages are relative to the available width
                    limit_zone = (
                        max_width * style['hyphenate_limit_zone'].value / 100.)
                else:
                    limit_zone = style['hyphenate_limit_zone'].value
                if space > limit_zone or space < 0:
                    # Available space is worth the try, or the line is even too
                    # long to fit: try to hyphenate
                    try_hyphenate = True

    if try_hyphenate:
        # Automatic hyphenation possible and next word is long enough
        auto_hyphenation = hyphens == 'auto' and lang
        manual_hyphenation = False
        if auto_hyphenation:
            if soft_hyphen in first_line_text or soft_hyphen in next_word:
                # Automatic hyphenation opportunities within a word must be
                # ignored if the word contains a conditional hyphen, in favor
                # of the conditional hyphen(s).
                # See https://drafts.csswg.org/css-text-3/#valdef-hyphens-auto
                manual_hyphenation = True
        else:
            manual_hyphenation = hyphens == 'manual'

        if manual_hyphenation:
            # Manual hyphenation: check that the line ends with a soft
            # hyphen and add the missing hyphen
            if first_line_text.endswith(soft_hyphen):
                # The first line has been split on a soft hyphen
                if ' ' in first_line_text:
                    first_line_text, next_word = (
                        first_line_text.rsplit(' ', 1))
                    next_word = ' ' + next_word
                    layout.set_text(first_line_text)
                    first_line, index = layout.get_first_line()
                    resume_at = len((first_line_text + ' ').encode('utf8'))
                else:
                    first_line_text, next_word = '', first_line_text
            soft_hyphen_indexes = [
                match.start() for match in
                re.finditer(soft_hyphen, next_word)
            ]
            # Reversed so that the longest prefix is tried first
            soft_hyphen_indexes.reverse()
            dictionary_iterations = [
                next_word[:i + 1] for i in soft_hyphen_indexes
            ]
        elif auto_hyphenation:
            dictionary_key = (lang, left, right, total)
            dictionary = context.dictionaries.get(dictionary_key)
            if dictionary is None:
                dictionary = pyphen.Pyphen(lang=lang, left=left, right=right)
                context.dictionaries[dictionary_key] = dictionary
            # Keep only the first part of each proposed split
            dictionary_iterations = [
                start for start, end in dictionary.iterate(next_word)
            ]
        else:
            dictionary_iterations = []

        if dictionary_iterations:
            for first_word_part in dictionary_iterations:
                new_first_line_text = (
                    first_line_text +
                    second_line_text[:start_word] +
                    first_word_part)
                hyphenated_first_line_text = (
                    new_first_line_text + style['hyphenate_character'])
                new_layout = create_layout(
                    hyphenated_first_line_text, style, context, max_width,
                    justification_spacing)
                new_first_line, new_index = new_layout.get_first_line()
                new_first_line_width, _ = get_size(new_first_line, style)
                new_space = max_width - new_first_line_width
                # Accept a candidate laid out on a single line when it fits,
                # or when it is the last candidate available
                if new_index is None and (
                        new_space >= 0 or
                        first_word_part == dictionary_iterations[-1]):
                    hyphenated = True
                    layout = new_layout
                    first_line = new_first_line
                    index = new_index
                    resume_at = len(new_first_line_text.encode('utf8'))
                    if text[len(new_first_line_text)] == soft_hyphen:
                        # Recreate the layout with no max_width to be sure that
                        # we don't break before the soft hyphen
                        pango.pango_layout_set_width(
                            layout.layout, units_from_double(-1))
                        resume_at += len(soft_hyphen.encode('utf8'))
                    break

            if not hyphenated and not first_line_text:
                # Recreate the layout with no max_width to be sure that
                # we don't break before or inside the hyphenate character
                hyphenated = True
                layout.set_text(hyphenated_first_line_text)
                pango.pango_layout_set_width(
                    layout.layout, units_from_double(-1))
                first_line, index = layout.get_first_line()
                resume_at = len(new_first_line_text.encode('utf8'))
                if text[len(first_line_text)] == soft_hyphen:
                    resume_at += len(soft_hyphen.encode('utf8'))

    if not hyphenated and first_line_text.endswith(soft_hyphen):
        # Recreate the layout with no max_width to be sure that
        # we don't break inside the hyphenate-character string
        hyphenated = True
        hyphenated_first_line_text = (
            first_line_text + style['hyphenate_character'])
        layout.set_text(hyphenated_first_line_text)
        pango.pango_layout_set_width(
            layout.layout, units_from_double(-1))
        first_line, index = layout.get_first_line()
        resume_at = len(first_line_text.encode('utf8'))

    # Step 5: Try to break word if it's too long for the line
    overflow_wrap = style['overflow_wrap']
    first_line_width, _ = get_size(first_line, style)
    space = max_width - first_line_width
    # If we can break words and the first line is too long
    if not minimum and overflow_wrap == 'break-word' and space < 0:
        # Is it really OK to remove hyphenation for word-break ?
        hyphenated = False
        # TODO: Modify code to preserve W3C condition:
        # "Shaping characters are still shaped as if the word were not broken"
        # The way new lines are processed in this function (one by one with no
        # memory of the last) prevents shaping characters (arabic, for
        # instance) from keeping their shape when wrapped on the next line with
        # pango layout. Maybe insert Unicode shaping characters in text?
        layout.set_text(text)
        pango.pango_layout_set_width(
            layout.layout, units_from_double(max_width))
        layout.set_wrap(PANGO_WRAP_MODE['WRAP_CHAR'])
        first_line, index = layout.get_first_line()
        # Fall back to the first line's length when there is no second line
        resume_at = index or first_line.length
        if resume_at >= len(text.encode('utf-8')):
            resume_at = None

    return first_line_metrics(
        first_line, text, layout, resume_at, space_collapse, style, hyphenated,
        style['hyphenate_character'])
'comment': py_comment, 'capture': py_capture_stdout, }, 'js': { 'comment': js_comment, 'capture': (lambda text, capture: text), } } # TODO (mbarkhau 2016-08-21): Warn about line length in # code blocks, because they cause horizontal # scrolling. # TODO (mbarkhau 2016-08-21): Parse lang from file # level metadata HYPHEN_DICT = pyphen.Pyphen(lang='en_US') def open(filepath, mode='r', encoding='utf-8'): return io.open(filepath, mode=mode, encoding=encoding) META_PARAM_RE = re.compile( r""" (?P<key>[\w\-\.]+) \: (?P<val>[^\}\,]+) (?:\}|\,) """, re.VERBOSE | re.MULTILINE)