Example #1
0
def to_haiku(h):
  """Greedily pack words into 5/7/5-syllable lines separated by '/'.

  Leftover words are appended after '||'; a final line that falls short
  of 5 syllables is flagged with a trailing '|'.
  """
  words = h.split()
  idx = 0
  haiku = []
  for line_no, target in enumerate((5, 7, 5)):
    if line_no:
      haiku.append('/')
    count = 0
    while count < target and idx < len(words):
      count += syllables.count(words[idx])
      haiku.append(words[idx])
      idx += 1
  if idx < len(words):
    haiku.append('||')
    haiku.extend(words[idx:])
  elif count < 5:
    haiku.append('|')
  return ' '.join(haiku)
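A quick way to try to_haiku without the project's syllables module is to substitute a stand-in counter. The vowel-group heuristic below is an assumption for demonstration only, not the module's real algorithm:

import re
from types import SimpleNamespace

# Hypothetical stand-in for the project's syllables module: a crude
# vowel-group heuristic (an assumption, not the real counting algorithm).
syllables = SimpleNamespace(
    count=lambda w: max(1, len(re.findall(r"[aeiouy]+", w.lower()))))

text = "an old silent pond a frog jumps into the pond splash silence again"
print(to_haiku(text))  # words grouped into 5/7/5 segments under the stub counter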
Example #2
0
    def test_count(self):
        test_string = "reflect respect recline reduce obsessively demonstrate baseball cloud brother cobblestone " + \
                      "complete conspire conflict estuary"  # Syllable counts: 2 2 2 2 4 3 2 1 2 3 2 2 2 4
        raw_cmu = conversion.get_cmu(test_string.split(" "))
        expected = [2, 2, 2, 2, 4, 3, 2, 1, 2, 3, 2, 2, 2, 4]
        for i, word in enumerate(raw_cmu):
            self.assertEqual(syllables.count(word[0]), expected[i])

        # test some examples with hiatus
        test_hiatus = "duo rio maria created misery harry"  # syllable counts: 2 2 3 3 3 2
        hiatus_counts = [2, 2, 3, 3, 3, 2]

        raw_cmu_hiatus = conversion.get_cmu(test_hiatus.split(" "))
        for j, word in enumerate(raw_cmu_hiatus):
            self.assertEqual(syllables.count(word[0]), hiatus_counts[j])
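For reference, counting syllables straight from a CMU pronunciation reduces to counting vowel phones, which carry a trailing stress digit. A minimal sketch, assuming the space-separated phone format suggested by the test above:

def cmu_syllable_count(cmu_pron):
    # vowel phones in CMU notation end with a stress digit (0/1/2)
    return sum(1 for phone in cmu_pron.split() if phone[-1].isdigit())

assert cmu_syllable_count("D UW1 OW0") == 2     # "duo": hiatus, two vowel phones
assert cmu_syllable_count("HH AE1 R IY0") == 2  # "harry"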
Example #3
0
    def get_complex_word_count(self):
        # "complex" words (3+ syllables) feed standard readability formulas
        if self.complex_word_count is None:
            self.complex_word_count = 0
            for word in self.get_words():
                if syllables.count(word) >= 3:
                    self.complex_word_count += 1
        return self.complex_word_count
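Words of three or more syllables are the "complex words" of classic readability formulas such as the Gunning fog index. A back-of-envelope version, using a hypothetical naive_count estimator standing in for syllables.count:

import re

def naive_count(word):
    # assumption: crude vowel-group heuristic instead of syllables.count()
    return max(1, len(re.findall(r"[aeiouy]+", word.lower())))

def fog_index(text, num_sentences):
    words = re.findall(r"[A-Za-z']+", text)
    complex_words = sum(1 for w in words if naive_count(w) >= 3)
    # Gunning fog: 0.4 * (avg sentence length + percent complex words)
    return 0.4 * (len(words) / num_sentences + 100.0 * complex_words / len(words))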
Example #4
0
def count_syllables(word):
    # An earlier cmudict-based version looked the word up in the CMU
    # dictionary, counted the vowel phones (those ending in a stress
    # digit), and fell back to the heuristic below on a KeyError.
    # Only the fallback survives:
    return syllables.count(word)
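The commented-out path can be revived with NLTK's copy of CMUdict; a sketch, assuming the corpus is installed (nltk.download('cmudict')):

from nltk.corpus import cmudict

_d = cmudict.dict()

def count_syllables_cmu(word):
    try:
        # count vowel phones (they end in a stress digit) in the first pronunciation
        return len([p for p in _d[word.lower()][0] if p[-1].isdigit()])
    except KeyError:
        return syllables.count(word)  # heuristic fallback for OOV words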
Example #5
0
def haikuness(words):
    """Return how far a text is from a 5-7-5 haiku (0 = perfect)."""
    words = words.split()
    idx = 0
    err = 0
    # accumulate the absolute syllable error for each target line
    for target in (5, 7, 5):
        count = 0
        while count < target and idx < len(words):
            count += syllables.count(words[idx])
            idx += 1
        err += abs(target - count)
    # every leftover syllable also counts against the score
    while idx < len(words):
        err += syllables.count(words[idx])
        idx += 1
    return err
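Since lower scores mean closer to 5-7-5, haikuness works directly as a ranking key. With the same stand-in counter as above (an assumption), picking the most haiku-like candidate looks like:

candidates = [
    "an old silent pond a frog jumps into the pond splash silence again",
    "this is definitely not a haiku at all",
]
best = min(candidates, key=haikuness)  # 0 would be a perfect 5-7-5 split
print(best)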
Example #6
0
def count_syllables(sentence, debug=False):
    # first strip out punctuation and emotive marks
    count = 0
    if debug:
        print('received sentence: %s' % sentence)

    sentence = format_input(sentence)

    if debug:
        print('formatted sentence: %s' % sentence)

    words = [w for w in sentence.split() if w.isalpha()]

    if debug:
        print('extracted words: %s' % repr(words))
        nonwords = [w for w in sentence.split() if not w.isalpha()]
        if nonwords:
            print('found nonwords: %s' % repr(nonwords))

    for w in words:
        # camelCase tokens are split apart and counted recursively
        if is_camel(w):
            sylls = count_syllables(de_camel(w))
        else:
            sylls = syllables.count(w)

        count += sylls

        if debug:
            print('%s\t\t\t%d' % (w, sylls))

    if debug:
        print('total\t\t\t%d' % count)
    return count
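format_input, is_camel, and de_camel are project helpers not shown here; plausible stand-ins (pure assumptions about their behavior) would be:

import re

def format_input(s):
    # drop punctuation and emotive marks, keeping word characters and spaces
    return re.sub(r"[^\w\s]", " ", s)

def is_camel(w):
    return bool(re.search(r"[a-z][A-Z]", w))

def de_camel(w):
    # "camelCase" -> "camel Case"
    return re.sub(r"([a-z])([A-Z])", r"\1 \2", w)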
Example #7
0
def find_stress(word, type="all"):
    """Convert stress-marking numbers from CMU into actual stress markings.

    :param word: the CMU word string to be evaluated for stress markings
    :param type: type of stress to be evaluated (primary, secondary, or both)
    """
    syll_count = syllables.count(word)

    if (not word.startswith("__IGNORE__")) and syll_count > 1:
        symbols = word.split(' ')
        stress_map = stress_type(type)
        new_word = []
        clusters = ["sp", "st", "sk", "fr", "fl"]
        # stop searching for where stress starts if these are encountered
        stop_set = ["nasal", "fricative", "vowel"]
        for c in symbols:
            # a trailing 1 or 2 means the symbol carries stress we want to evaluate
            if c[-1] in stress_map.keys():
                if not new_word:
                    # first symbol: append it with the number replaced by a stress mark
                    new_word.append(
                        re.sub(r"\d", "",
                               stress_map[re.findall(r"\d", c)[0]] + c))
                else:
                    stress_mark = stress_map[c[-1]]
                    placed = False
                    hiatus = False
                    # flip the word and backtrack through the symbols
                    new_word = new_word[::-1]
                    for i, sym in enumerate(new_word):
                        sym = re.sub("[0-9ˈˌ]", "", sym)
                        prev_sym = re.sub("[0-9ˈˌ]", "", new_word[i - 1])
                        prev_phone = phones[prev_sym]
                        if (phones[sym] in stop_set
                                or (i > 0 and prev_phone == "stop")
                                or sym in ["er", "w", "j"]):
                            if sym + prev_sym in clusters:
                                new_word[i] = stress_mark + new_word[i]
                            elif not prev_phone == "vowel" and i > 0:
                                new_word[i - 1] = stress_mark + new_word[i - 1]
                            elif phones[sym] == "vowel":
                                # two adjacent vowels (hiatus): the stressed
                                # syllable starts a fresh symbol
                                hiatus = True
                                new_word = [stress_mark +
                                            re.sub("[0-9ˈˌ]", "", c)] + new_word
                            else:
                                new_word[i] = stress_mark + new_word[i]
                            placed = True
                            break
                    if not placed and new_word:
                        new_word[-1] = stress_mark + new_word[-1]
                    new_word = new_word[::-1]
                    if not hiatus:
                        new_word.append(re.sub(r"\d", "", c))
            elif c.startswith("__IGNORE__"):
                new_word.append(c)
            else:
                new_word.append(re.sub(r"\d", "", c))

        return ' '.join(new_word)
    if word.startswith("__IGNORE__"):
        return word
    return re.sub("[0-9]", "", word)
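stress_type and phones come from elsewhere in the project. As a rough guide, stress_type presumably maps CMU stress digits to IPA marks; a hypothetical version, only for reading the code above:

def stress_type(type="all"):
    # assumption: 1 = primary ("ˈ"), 2 = secondary ("ˌ")
    marks = {"1": "ˈ", "2": "ˌ"}
    if type == "primary":
        return {"1": "ˈ"}
    if type == "secondary":
        return {"2": "ˌ"}
    return marks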
Example #8
0
def analyze_text(text):
    paragraph_regex = re.compile("\\n\\s*\\n")
    all_paragraphs = [p for p in re.split(paragraph_regex, text) if len(p) > 0]

    def slice(n, words):
        # join the tail of the 4-word window starting at position n
        n_gram = list(islice(words, n, 4))
        return " ".join(n_gram)

    def nsyl(word):
        # cmudict-style syllable counts; d is an external CMU dictionary
        return [
            len(list(y for y in x if y[-1].isdigit())) for x in d[word.lower()]
        ]

    last_four_words = deque([])
    paragraph_lengths = []
    sentence_lengths = []
    avg_sentence_length_per_para = []
    num_sentences_per_para = []
    entropies = []
    syllables_counter = defaultdict(int)
    parts_of_speech_counter = defaultdict(int)
    word_counter = defaultdict(int)
    two_gram_counter = defaultdict(int)
    three_gram_counter = defaultdict(int)
    four_gram_counter = defaultdict(int)
    for paragraph in all_paragraphs:
        # extract_paragraph_topics() Google News and Wikipedia
        # distance from previous paragraph
        # distance from previous N paragraphs
        paragraph_lengths.append(len(paragraph))
        entropies.append(
            entropy.shannon_entropy(paragraph.encode('utf-8').strip()))
        num_sentences = 0
        paragraph_sent_lengths = []
        for sentence in sent_tokenize(paragraph):
            num_sentences += 1
            paragraph_sent_lengths.append(len(sentence))
            sentence_lengths.append(len(sentence))
            # tokenize the sentence (not the whole paragraph) so each word
            # is counted exactly once
            words = word_tokenize(sentence)
            pos = pos_tag(words)
            for word, p in pos:
                syllables_counter[str(syllables.count(word))] += 1
                word_counter[word] += 1
                if word is not None:
                    last_four_words.append(word)
                parts_of_speech_counter[p] += 1
                if len(last_four_words) > 4:
                    last_four_words.popleft()
                    four_gram_counter[slice(0, last_four_words)] += 1
                if len(last_four_words) >= 2:
                    two_gram_counter[slice(2, last_four_words)] += 1
                if len(last_four_words) >= 3:
                    three_gram_counter[slice(1, last_four_words)] += 1
        num_sentences_per_para.append(num_sentences)
        if paragraph_sent_lengths:
            avg_sentence_length_per_para.append(
                sum(paragraph_sent_lengths) / float(len(paragraph_sent_lengths)))
    all_syllables = []
    for key in syllables_counter.keys():
        all_syllables.extend([float(key)] * syllables_counter[key])

    total_parts_of_speech = 0
    for key in parts_of_speech_counter:
        total_parts_of_speech += parts_of_speech_counter[key]

    df1 = pd.DataFrame({
        'average_paragraph_length': [np.mean(paragraph_lengths)],
        'average_sentence_lengths': [np.mean(sentence_lengths)],
        'avg_entropies': [np.mean(entropies)],
        'avg_syllables': [np.mean(all_syllables)]
    })

    for part_of_speech in parts_of_speech_counter.keys():
        df1[part_of_speech +
            "_prop"] = parts_of_speech_counter[part_of_speech] / float(
                total_parts_of_speech)
    total_words = 0
    for key in word_counter:
        total_words += word_counter[key]
    for stopword in STOPWORDS:
        df1[stopword + "_prop"] = word_counter[stopword] / float(total_words)
    return df1
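The rolling-window n-gram trick is easier to see in isolation: a deque capped at four words, with islice taking the most recent suffix. A sketch with local names, not the project's code:

from collections import deque
from itertools import islice

window = deque(maxlen=4)
for w in "the quick brown fox jumps".split():
    window.append(w)
    if len(window) == 4:
        print("2-gram:", " ".join(islice(window, 2, 4)))  # last two words
        print("3-gram:", " ".join(islice(window, 1, 4)))  # last three words
        print("4-gram:", " ".join(window))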
Example #10
0
    def change(self, i=None):
        # draw a new entry from the stored dictionary if none is given
        if i is None:
            i = self.d.draw()
        self.value = i
        self.str = self.d[i]
        self.syllables = syllables.count(self.d[self.value])
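The snippet assumes a dictionary-like object with a draw() method; a minimal stand-in for experimenting (an assumption about its interface):

import random

class DrawableDict(dict):
    # hypothetical: a mapping whose draw() returns a random key
    def draw(self):
        return random.choice(list(self.keys()))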
Example #11
0
    def __init__(self, mydict):
        self.value = mydict.draw()
        self.d = mydict
        self.syllables = syllables.count(mydict[self.value])
        self.str = mydict[self.value]
Example #12
0
    def get_total_syllable_count(self):
        if self.total_syllable_count is None:
            self.total_syllable_count = 0
            for word in self.get_words():
                self.total_syllable_count += syllables.count(word)
        return self.total_syllable_count
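Together with word and sentence counts, the total syllable count is what standard readability scores consume; for example, Flesch reading ease (the standard formula, not part of the class above):

def flesch_reading_ease(total_words, total_sentences, total_syllables):
    return (206.835
            - 1.015 * (total_words / total_sentences)
            - 84.6 * (total_syllables / total_words))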