def syllables_for_term(self, term):
    """Return the syllable count for a single term.

    Handles several special shapes before falling back to syllapy:
    a trailing '+' (Disney+, Apple+), possessives, 4-digit years,
    comma-grouped integers, hyphenated number ranges, and
    hyphen/slash compounds.

    Raises:
        SyllableCountError: if syllapy fails with a RuntimeError
            (original error chained as __cause__).
    """
    if is_special_punctuation(term):
        return 0

    # Some things to do before stripping the term
    # Disney+, Apple+, etc. -- the '+' is pronounced "plus" (one syllable)
    r = re.match(r'(.+)\+$', term)
    if r:
        return self.syllables_for_term(r.group(1)) + 1

    stripped_term = clean_term(term)
    try:
        if has_syllable_exception(stripped_term):
            return syllapy.count(stripped_term)

        r = re.match(r"(.+)'s$", stripped_term)
        if r:
            # Most possessive 's don't add syllables
            return syllapy.count(r.group(1))

        # 4-digit years, optionally pluralized ("1990s")
        r = re.match(r'([0-9]{4})s?$', stripped_term)
        if r:
            words = num2words(r.group(1), to='year').split()
            return sum(self.syllables_for_term(w) for w in words)

        # Plain (possibly comma-grouped) integers
        if re.match(r'[0-9,]+$', stripped_term):
            words = num2words(int(stripped_term.replace(',', ''))).split()
            return sum(self.syllables_for_term(w) for w in words)

        # Number ranges like "3-5": the dash reads as "to" (one syllable)
        r = re.match(r'([0-9]+)-([0-9]+)$', stripped_term)
        if r:
            s1 = self.syllables_for_term(r.group(1))
            s2 = self.syllables_for_term(r.group(2))
            return s1 + s2 + 1 if s1 and s2 else 0

        # Hyphen/slash compounds: count both halves; 0 if either half fails
        r = re.match(r'([^-]+)[-/](.+)$', stripped_term)
        if r:
            s1 = self.syllables_for_term(r.group(1))
            s2 = self.syllables_for_term(r.group(2))
            return s1 + s2 if s1 and s2 else 0

        return syllapy.count(stripped_term)
    except RuntimeError as err:
        # Chain the original error so the root cause isn't lost
        raise SyllableCountError("Unable to count syllables for term") from err
def _get_num_syllables(doc: Doc, min_syllables: int = 1):
    """Sum the syllable counts of the document's words.

    Punctuation tokens and tokens containing an apostrophe
    (i.e. contractions) are skipped, and only words counting at least
    `min_syllables` syllables contribute to the total.
    """
    total = 0
    for token in doc:
        if token.is_punct or "'" in token.text:
            continue
        n = syllapy.count(token.text)
        if n >= min_syllables:
            total += n
    return total
def validate_haiku(form, haiku):
    """WTForms-style validator: the field must total exactly 17 syllables."""
    total = sum(syllapy.count(w.lower()) for w in haiku.data.split())
    if total != 17:
        raise ValidationError("Check the number of syllables")
def syllable_count(word):
    """Count syllables via cmudict, falling back to syllapy for unknown words."""
    try:
        # First pronunciation in cmudict: a phone ending in a digit marks
        # a stressed vowel, i.e. one syllable
        first_pronunciation = cmu_d[word.lower()][0]
        return len([p for p in first_pronunciation if p[-1].isdigit()])
    except KeyError:
        # Word not in cmudict -- use syllapy's estimate instead
        return syllapy.count(word)
def forcast(self, doc):
    """Forcast readability score; 0 if the document is under 150 words."""
    if self.get_num_words(doc) < 150:
        return 0
    # Count one-syllable words among the first 150 tokens
    mono = sum(1 for i in range(150) if syllapy.count(doc[i].text) == 1)
    return 20 - (mono / 10)
def countDifficult(lst):
    """Return how many distinct words in `lst` have more than two syllables."""
    difficult = {word for word in lst if syllapy.count(word) > 2}
    return len(difficult)
def forcast(self, doc):
    """Returns the Forcast readability score for the document.

    Documents shorter than 150 words score 0.
    """
    num_words = _get_num_words(doc)
    if num_words < 150:
        return 0
    # One-syllable words among the first 150 tokens drive the score
    single = len([i for i in range(150) if syllapy.count(doc[i].text) == 1])
    return 20 - (single / 10)
def haikuFormatter(haiku):
    """Lay a string out as a 5-7-5 haiku.

    A line break is appended to the word at which the running syllable
    total reaches exactly 5 and exactly 12; a final line break closes
    the whole string.
    """
    parts = []
    running = 0
    for token in haiku.split():
        running += syllapy.count(token)
        # Break the line exactly when the running total hits 5 or 12
        if running in (5, 12):
            token = token + os.linesep
        parts.append(' ' + token)
    return ''.join(parts) + os.linesep
def syllableCount(self, article):
    """Total syllables across all words in `article`."""
    return sum(syllapy.count(word) for word in article)
def main():
    """Score one transcription essay and append the metrics to a CSV.

    NOTE(review): despite looping over IDlist, the body reads a single
    hard-coded file (AAAVLP13819000473707.txt) -- the per-id open is
    commented out, so every iteration processes the same essay; confirm
    this is intentional debug behavior.
    Relies on module-level helpers/globals (IDlist, prom6, spell, wrong,
    ConTract, compo, cleanPromt, pulloid, noft_text, getUniqWords,
    getWordlist, getVocabNum, getVocab, wdsAppinProm, countDifficult,
    textstat, re, syllapy).
    """
    #f=open('0418TestOnlineG6.csv', 'w')
    for e in IDlist:
        dataresult = {}
        '''
        with open('Z:/Special Projects/ISASP/2019/DIF/DIF EL/Transcription/Yi_AI_6WR/'+e, encoding="ISO-8859-1" ) as file:
            essay= file.read()
        '''
        with open(
                'Z:/Special Projects/ISASP/2019/DIF/DIF EL/Transcription/Yi_AI_6WR/AAAVLP13819000473707.txt',
                encoding="ISO-8859-1") as file:
            essay = file.read()
        # remove the footer in txt
        id_num = pulloid(essay)
        print(id_num)
        essay = noft_text(essay)
        # drop the fixed-width header portion of the transcription
        essay = essay[30:]
        dataresult['id_num'] = id_num
        #process Prompt,remove stopwords and number and punctuations
        Promwdstxt = cleanPromt(prom6)
        validwdsinProm = getUniqWords(
            Promwdstxt
        )  #Here the list only contains unique words in prompt except stopwords
        cleaness_text = cleanPromt(essay)
        cleaness_text = noft_text(cleaness_text)
        cleanlst_essay = getWordlist(cleaness_text)
        #Vocab in the essay
        #essay_Uniq=getVocabNum(cleaness_text)
        Vocabnum = getVocabNum(cleaness_text)
        #l=getVocab(cleaness_text)
        print(getVocab(cleaness_text))
        # count how many words in the essay are from the list
        Num_wdsFromProm = wdsAppinProm(
            validwdsinProm, cleanlst_essay
        )  #number of words in the essay that are appeared in prompt
        #es_wdsinpm=wdsAppinProm(cleanedess)
        #count how many grammar errors
        #No_gram=countGram(gramerror(essay))
        misspelled = {}
        # NOTE(review): the dict above is discarded immediately;
        # spell.unknown() presumably returns a set (``.add`` is used below)
        misspelled = spell.unknown(cleanlst_essay)
        # NOTE(review): this loop variable shadows the outer ``for e in IDlist``
        for e in cleanlst_essay:
            if e in wrong.keys():
                misspelled.add(e)
        c = list()
        # collect contractions/compounds wrongly flagged as misspellings
        for e in misspelled:
            if e in ConTract.keys():
                c.append(e)
            elif e in compo.keys():
                c.append(e)
        for e in c:
            misspelled.remove(e)  ############## removed contractions as misspelled
        print(misspelled)
        Num_Mispell = len(misspelled)
        # tokenize on any run of characters that is not a word char or hyphen
        w = re.split("[^-\w]+", cleaness_text)
        w = [string for string in w if string != ""]
        print(w)
        wordcount = len(w)
        dataresult['wordcount'] = wordcount
        from nltk.tokenize import sent_tokenize, word_tokenize
        sentcount = len(sent_tokenize(essay))
        # average sentence length (words per sentence)
        ASL = (wordcount / sentcount)
        dataresult['sentcount'] = sentcount
        dataresult['Avsenlg'] = ASL
        # NOTE(review): syllapy.count is designed for single words; calling it
        # on the whole cleaned text may not give a true syllable total -- verify
        ASW = (syllapy.count(cleaness_text)) / wordcount
        ASL = (wordcount / sentcount)
        # Flesch Reading Ease formula
        Fre = 206.835 - (1.015 * ASL) - (84.6 * ASW)
        dataresult['Times using words from prompt'] = Num_wdsFromProm
        dataresult['Vocab in essay'] = Vocabnum
        dataresult[
            'Essay percentage using words from Prompt'] = "{:.2f}".format(
                (Num_wdsFromProm / wordcount) * 100)
        dataresult['No of mispelled words'] = Num_Mispell
        dataresult['Percentage of mispelled words'] = "{:.2f}".format(
            (Num_Mispell / wordcount) * 100)
        #dataresult['No of grammar errors']=countGram(gramerror(essay))
        dw = countDifficult(cleanlst_essay)
        dataresult['No of Difficult words'] = dw
        Estimatedlevel = textstat.text_standard(essay)
        dataresult['Estimatedlevel'] = Estimatedlevel
        ease = Fre
        dataresult['reading_ease'] = ease
        '''
        sim=Similarity(cleaness_text,Promwdstxt)
        dataresult['Cosine Similarity with Prompt']="{:.2f}".format(sim[0][1])
        posiness=Posscore(cleanlst_essay)
        dataresult['No of Positives in essay']=posiness
        PerPos="{:.2f}".format((posiness/wordcount)*100)
        dataresult['percentage of Positives in essay']=PerPos
        negaess=Negscore(cleanlst_essay)
        dataresult['No of Negatives in essay']=negaess
        PerNeg="{:.2f}".format((negaess/wordcount)*100)
        dataresult['percentage of Negatives in essay']=PerNeg
        L=wordDict.LLink()
        lk=0
        for word in cleanlst_essay:
            if word in L:
                lk+=1
        dataresult['No of Linking words in essay']=lk
        PerLink="{:.2f}".format((lk/wordcount)*100)
        dataresult['percentage of Linking Words in essay']=PerLink
        '''
        import csv
        # stringify all values so csv.writer emits them uniformly
        for v in dataresult:
            dataresult[v] = str(dataresult[v])
        with open('0418TestOnlineG6.csv', 'a+', newline='') as f:
            writer = csv.writer(f, quoting=csv.QUOTE_ALL)
            # NOTE(review): the header row is re-written before every data row,
            # so the CSV alternates header/value lines -- confirm intended
            writer.writerow(list(dataresult.keys()))
            writer.writerow(list(dataresult.values()))
def test_none():
    """`None` input should count as zero syllables."""
    result = syllapy.count(None)
    assert result == 0
def test_bool():
    """A boolean input should yield a zero syllable count."""
    assert 0 == syllapy.count(True)
def get_num_syllables(self, doc, min_syllables=1):
    """Total syllables of the document's words.

    Punctuation tokens and tokens containing an apostrophe (contractions)
    are excluded; only counts of at least `min_syllables` are summed.
    """
    counts = [
        syllapy.count(word.text)
        for word in doc
        if not word.is_punct and "'" not in word.text
    ]
    return sum(count for count in counts if count >= min_syllables)
def test_space():
    """A lone space should have no syllables."""
    count = syllapy.count(" ")
    assert count == 0
import os
import csv

import syllapy

# Path to the reference syllable counts shipped with the package
syllable_file_path = os.path.join(
    os.path.dirname(__file__), 'nyt_haiku', 'data', 'syllable_counts.csv')

# Print every word whose syllapy count disagrees with the reference CSV
with open(syllable_file_path, newline='') as file:
    for row in csv.reader(file):
        if len(row) != 2:
            continue  # skip malformed rows
        word = row[0].lower()
        count = int(row[1])
        if syllapy.count(word) != count:
            print(f"{word},{count}")
def test_int():
    """Testing passing `int` type."""
    assert syllapy.count(2) == 0
def test_number_end_word():
    """A word with trailing digits counts as zero."""
    result = syllapy.count("dog123")
    assert result == 0
# Following demo: https://medium.com/better-programming/nlp-with-python-build-a-haiku-machine-in-50-lines-of-code-6c7b6de959e3 import spacy import string from spacy.matcher import Matcher import syllapy count = syllapy.count('additional') import random import re import dominate from dominate.tags import * import pdfkit from fpdf import FPDF import os title = 'Almost A Haiku - NaNoGenMo 2020' nlp = spacy.load("en_core_web_sm") #loading a language model matcher2 = Matcher(nlp.vocab) #https://spacy.io/api/matcher matcher3 = Matcher(nlp.vocab) matcher4 = Matcher(nlp.vocab) matcher5 = Matcher(nlp.vocab) # POS = Part of Speech pattern = [{ 'POS': { "IN": ["NOUN", "ADP", "ADJ", "ADV"] } }, { 'POS': {
def make_verse(incipit, syllables_length, should_rhyme_with=False):
    """Generate one verse of `syllables_length` syllables continuing `incipit`.

    Adds GPT-2-generated words one at a time until the syllable target is
    hit.  If `should_rhyme_with` is given, tries to end the verse on a
    rhyming word, repeatedly shortening and mask-filling the line until it
    fits the syllable budget.  Returns the verse text, or False when no
    acceptable rhyming line can be built.
    Relies on module-level globals: keras_gpt_2, text_model, bpe,
    config_top_k, config_forbidden, rhymer, nlp, random, syllapy.
    """
    # keep the prompt bounded so generation cost stays predictable
    incipit = incipit[:1000]
    incipit_length = len(incipit)
    top_k = config_top_k
    errors = 0
    added_words = 0
    # We add one word at time until we reach the minimum/maximum length
    for i in range(651):
        full_output = keras_gpt_2.generate(
            text_model, bpe, [incipit], length=1, top_k=top_k)
        full_output = full_output[0]
        print('output', full_output)
        newOutput = full_output[len(incipit):]
        print('NEW output', newOutput)
        # accept the new token only if it is purely alphabetic/whitespace
        # and contains none of the forbidden substrings
        if (all(x.isalpha() or x.isspace() for x in newOutput)
                and all(x not in newOutput for x in config_forbidden)):
            incipit = full_output
            added_words += 1
            errors = 0
        else:
            errors += 1
        # after 10 consecutive rejections, nudge generation with 'and '
        if added_words == 0 and errors > 10:
            incipit = incipit + 'and '
        if errors > 10:
            incipit = incipit + 'and '
        current_length = len(incipit) - incipit_length
        print('length', current_length)
        syllables_count = syllapy.count(full_output[incipit_length:])
        print('syllables', syllables_count)
        print('>>>>>>>>>>>>>>>>>>>>>>>>>>', syllables_count,
              ' in : ' + full_output[incipit_length:])
        # If we find a line break and the length is greater than the minimum
        # we stop the text generation
        if syllables_count == syllables_length:
            print('Syllables length reached')
            break
        # If the string is greater than the allowed maximum, we stop the generation
        if syllables_count > syllables_length:
            print('TOO MANY SYLLABLES')
            spaces = [
                pos for pos, char in enumerate(full_output) if char == ' '
            ]
            # removes 2 last words
            incipit = full_output[:spaces[-2]]
    result = full_output[incipit_length:]
    # we clean double spaces in the result
    for i in range(3):
        result = result.replace('  ', ' ')
    result = result.strip()
    if should_rhyme_with:
        rhymes = rhymer.get_perfect_rhymes(should_rhyme_with)
        rhyme = should_rhyme_with
        print('all rhymes ', rhymes)
        all_rhymes = []
        # NOTE(review): membership tests the *string* key '2' but indexes
        # with the *int* 2 -- if the dict keys are strings this raises
        # KeyError; verify against rhymer's return type
        if '2' in rhymes and rhymes[2]:
            all_rhymes = rhymes[2]
        else:
            # fall back to the first non-empty rhyme bucket
            for r in rhymes:
                if rhymes[r]:
                    all_rhymes = rhymes[r]
                    break
        print('rhymes ', all_rhymes)
        random.shuffle(all_rhymes)
        # pick the first clean candidate that isn't the seed word itself
        for word in all_rhymes:
            print('>>> ', word)
            # NOTE(review): `is not` compares identity, not equality --
            # equal-but-distinct strings pass this check; likely meant `!=`
            if (word is not should_rhyme_with and len(word) > 2
                    and all(x.isalpha() or x.isspace() for x in word)):
                rhyme = word
                break
        print('choosen ', rhyme)
        # shorten input to right number of syllables
        while True:
            toTest = result + ' ' + rhyme
            syllables_count = syllapy.count(toTest)
            print('checking ', toTest)
            print('syllables ', syllables_count)
            if (syllables_count <= syllables_length):
                break
            else:
                spaces = [
                    pos for pos, char in enumerate(result) if char == ' '
                ]
                # removes 2 last words
                result = result[:spaces[-1]]
        # drop trailing words one at a time, mask-filling the gap, until a
        # candidate of exactly the right syllable count appears
        while True:
            spaces = [pos for pos, char in enumerate(result) if char == ' ']
            if len(spaces) > 2:
                result = result[:spaces[-1]]
            else:
                # too few words left to keep shrinking -- give up
                return False
            # NOTE(review): `nlp` is used here as a fill-mask pipeline
            # (mask_token, dict results with 'sequence') -- confirm which
            # model object is bound to this name at module level
            solutions = nlp(result + ' ' + nlp.tokenizer.mask_token + ' ' +
                            rhyme)
            print('solution', solutions)
            acceptable_solution = False
            for solution in solutions:
                solution = solution['sequence']
                solution = solution.replace('[CLS]', '')
                solution = solution.replace('[SEP]', '')
                solution = solution.strip()
                syllables_count = syllapy.count(solution)
                print(solution, syllables_count)
                if (syllables_count == syllables_length):
                    acceptable_solution = solution
                    break
            if acceptable_solution:
                result = acceptable_solution
                break
    # strip any characters that can't round-trip through UTF-8
    result = result.encode('utf-8', errors='ignore').decode('utf-8')
    return result
def test_in_dict():
    """Words present in the bundled dataset return their known counts."""
    expected = {"because": 2, "woman": 2, "international": 5}
    for word, count in expected.items():
        assert syllapy.count(word) == count
def test_not_in_dict():
    """A word absent from the dataset falls back to the heuristic count."""
    assert 4 == syllapy.count("ostentatious")
def test_punctuation_only():
    """Pure punctuation strings always count zero."""
    assert all(syllapy.count(punct) == 0 for punct in punctuation)
def main(): f = open('0418Test3G6.csv', 'w') for e in namelist: dataresult = {} with open( 'Z:/Special Projects/ISASP/2019/DIF/DIF EL/Transcription/6WR/' + e, encoding="ISO-8859-1") as file: essay = file.read() # remove the footer in txt id_num = pullid(essay) #pull (id_num) dataresult['id_num'] = id_num print(id_num) essay = essay[22:] #process Prompt,remove stopwords and number and punctuations Promwdstxt = cleanPromt(prom6) validwdsinProm = getUniqWords( Promwdstxt ) #Here the list only contains unique words in prompt except stopwords cleaness_text = cleanPromt(essay) cleanlst_essay = getWordlist(cleaness_text) #Vocab in the essay Vocabnum = getVocabNum(cleaness_text) allwords = getVocab(cleaness_text) #print(allwords) # count how many words in the essay are from the list Num_wdsFromProm = wdsAppinProm(validwdsinProm, cleanlst_essay) misspelled = {} misspelled = spell.unknown(cleanlst_essay) for e in cleanlst_essay: if e in wrong.keys(): misspelled.add(e) c = list() for e in misspelled: if e in ConTract.keys(): c.append(e) elif e in compo.keys(): c.append(e) for e in c: misspelled.remove(e) ############## removed contractions as misspelled #print(misspelled) Num_Mispell = len(misspelled) w = re.split("[^-\w]+", cleaness_text) w = [string for string in w if string != ""] #print(w) wordcount = len(w) dataresult['wordcount'] = wordcount #es_wdsinpm=wdsAppinProm(cleanedess) PrtWdsFrProm = (Num_wdsFromProm / wordcount) * 100 sentcount = len(sent_tokenize(essay)) ASL = "{:.2f}".format(wordcount / sentcount) dataresult['sentcount'] = sentcount dataresult['Avsenlg'] = ASL ASW = (syllapy.count(cleaness_text)) / wordcount print(ASW) ASL = float(ASL) Fre = 206.835 - (1.015 * ASL) - (84.6 * ASW) dataresult['Times using words from prompt'] = Num_wdsFromProm dataresult['Vocab in essay'] = Vocabnum dataresult[ 'Essay percentage using words from Prompt'] = "{:.2f}".format( (Num_wdsFromProm / wordcount) * 100) dataresult['No of mispelled words'] = Num_Mispell 
dataresult['Percentage of mispelled words'] = "{:.2f}".format( (Num_Mispell / wordcount) * 100) #dataresult['No of grammar errors']=countGram(gramerror(essay)) dw = countDifficult(cleanlst_essay) dataresult['No of Difficult words'] = dw Estimatedlevel = textstat.text_standard(essay) dataresult['Estimatedlevel'] = Estimatedlevel ease = 206.835 - (1.015 * ASL) - (84.6 * ASW) dataresult['reading_ease'] = ease '''
def test_number_in_word():
    """A digit embedded in a word forces a zero count."""
    count = syllapy.count("d0g")
    assert count == 0
def test_case_insensitive():
    """Capitalization must not affect the syllable count."""
    cases = (("Norway", 2), ("norway", 2), ("Ohio", 3), ("ohio", 3))
    for word, expected in cases:
        assert syllapy.count(word) == expected
def test_empty():
    """An empty string has zero syllables."""
    result = syllapy.count("")
    assert result == 0
def test_simple():
    """Trailing punctuation is ignored; 'dog' is one syllable."""
    assert 1 == syllapy.count("dog!!!!!")
def test_number_start_word():
    """A word with a leading digit counts as zero."""
    value = syllapy.count("4dog")
    assert value == 0
def syllable_counter(tokenized_list):
    """Total syllables across every token in the list."""
    total = 0
    for token in tokenized_list:
        total += syllapy.count(token)
    return total
def verse_gen(verse_input, syllable_length):
    """Scan `verse_input` for a phrase of exactly `syllable_length` syllables.

    Grows a window of whitespace-separated words from the input, counting
    syllables with syllapy; on overshoot the window restarts one word
    further in.  Returns the matching phrase when found.

    NOTE(review): if no window ever totals exactly `syllable_length`, the
    slice eventually becomes empty (''.join -> 0 syllables) and the while
    loop never terminates -- confirm callers guarantee a hit or add a bound.
    NOTE(review): syllapy.count is called on a multi-word string; verify it
    handles embedded spaces the way this code assumes.
    """
    # window state is kept in module-level globals for use by the caller
    global verse_words
    global verse_string
    global verse_count
    global verse_syllable_count
    global verse_one_string
    #Go to first whitespace, count syllables. Continue until "syllable_length" syllables. If over required amount syllables try with new input.
    #initialize counter
    y=0
    x=1
    verse_syllable_count=0
    #Split to remove whitespace
    verse_words=verse_input.split(' ')
    while verse_syllable_count < syllable_length:
        print("Adding next word to the string")
        #Put the first word in a string
        verse_string=' '.join(verse_words[y:x])
        #Count the syllables
        verse_syllable_count = syllapy.count(verse_string)
        #increment x
        x=x+1
        #Get new input if the words don't make 5 syllables
#        if verse_syllable_count > syllable_length:
#            print("Need new input")
#            text_generator(state_dict)
#            verse_input = GPT2_output
#            verse_gen(verse_input, syllable_length)
        #If the words make 5 syllables, check for period or comma at the end of it. Use if so, get new input if not
#        if verse_syllable_count == syllable_length:
#            if verse_string[-1] == "." or verse_string[-1] == ",":
#                print(verse_string)
#            else:
#                print("Need input ending with punctuation")
#                verse_gen(verse_input, syllable_length)
        ## New way: go down the input to look for haiku-able phrases. If not, get new input
        if verse_syllable_count == syllable_length:
            print(verse_string)
            return verse_string
        if verse_syllable_count > syllable_length:
            #reinitialize the string and keep going
            print("Moving up in string")
            print(verse_string)
            #reinitialize verse_string
            verse_string=""
            verse_syllable_count=0
            # restart the window one word past the previous start
            y=x-1