Example #1
0
def run_syllabify(args):
    """Syllabify each line of ``args.infile`` and write it to ``args.outfile``.

    Every input line is stripped and split on single space characters; each
    resulting word is handed to ``syllabifier.orthographic_syllabify`` together
    with ``args.lang``.  The syllables of a word are joined with spaces, the
    words are joined with spaces again, and the line is written out followed
    by a newline.
    """
    for raw_line in args.infile:
        syllabified_words = []
        # Split on a literal ' ' (not arbitrary whitespace), so consecutive
        # spaces produce empty words that are passed to the syllabifier too.
        for word in raw_line.strip().split(' '):
            syllables = syllabifier.orthographic_syllabify(word, args.lang)
            syllabified_words.append(' '.join(syllables))
        args.outfile.write(' '.join(syllabified_words) + '\n')
    def other_features(self, tweet):
        """
        Build the hand-crafted feature list for a single tweet.

        Supports ``self.lang == 'en'`` and ``self.lang == 'hi'``; for any other
        language nothing is computed and ``None`` is returned.

        English sentiment comes from ``self.sentiment_analyzer.polarity_scores``
        and the syllable count from ``textstat.syllable_count``.  Hindi
        sentiment comes from ``self.sentiment_analyzer.predict`` and the
        syllable count is the total number of syllables that
        ``syllabifier.orthographic_syllabify`` yields over all tokens produced
        by ``hi_tokenizer``.

        Returns a list of 17 values, in this order: FKRA, FRE, syllables,
        avg_syl, num_chars, num_chars_total, num_terms, num_words,
        num_unique_terms, negative / positive / neutral / compound sentiment,
        entries 2, 1 and 0 of ``self.count_twitter_objs(tweet)``, and the
        retweet flag.
        """
        lang = self.lang

        # --- language-specific part: sentiment scores and syllable count ---
        if lang == 'en':
            scores = self.sentiment_analyzer.polarity_scores(tweet)
            words = self.preprocess(tweet)  # Get text only
            syllables = textstat.syllable_count(words)
            sentiment_feats = [scores['neg'], scores['pos'], scores['neu'],
                               scores['compound']]
        elif lang == 'hi':
            # Element 2 of the prediction holds the per-class scores.
            # Presumably the class order is (neg, neu, pos), inferred from the
            # feature slots they fill below -- TODO confirm against the model.
            class_scores = self.sentiment_analyzer.predict(tweet)[2]
            words = self.preprocess(tweet)

            # Add up the syllables of every token.  Taking len() of the list
            # of per-token results would only count the tokens, not syllables.
            syllables = sum(
                len(syllabifier.orthographic_syllabify(token, lang))
                for token in hi_tokenizer(input=words, language_code=lang)
            )

            neg = class_scores[0].tolist()
            neu = class_scores[1].tolist()
            pos = class_scores[2].tolist()
            # NOTE(review): the compound slot is index1 - index0 + index1
            # (index 1 is used twice, index 2 not at all).  Kept exactly as it
            # was; confirm whether index 2 was intended in this formula.
            sentiment_feats = [neg, pos, neu, neu - neg + neu]
        else:
            # Unsupported language: no feature vector is produced.
            return None

        # --- language-independent part ---
        word_list = words.split()
        # `words` is a string, so this counts every character of the
        # preprocessed text (spaces included).
        num_chars = len(words)
        num_chars_total = len(tweet)
        num_terms = len(tweet.split())
        num_words = len(word_list)
        # The 0.001 offsets avoid a division by zero for empty text.
        avg_syl = round(float(syllables + 0.001) / float(num_words + 0.001), 4)
        num_unique_terms = len(set(word_list))

        # Modified FK grade, where avg words per sentence is just num words/1
        FKRA = round(float(0.39 * float(num_words) / 1.0) + float(11.8 * avg_syl) - 15.59, 1)
        # Modified FRE score, where sentence fixed to 1
        FRE = round(206.835 - 1.015 * (float(num_words) / 1.0) - (84.6 * float(avg_syl)), 2)

        twitter_objs = self.count_twitter_objs(tweet)

        # NOTE(review): this is a substring test on the preprocessed string, so
        # any text containing "rt" (e.g. inside a longer word) sets the flag.
        # Left unchanged because the intended matching rule is not visible here.
        retweet = 1 if "rt" in words else 0

        return [FKRA, FRE, syllables, avg_syl, num_chars, num_chars_total,
                num_terms, num_words, num_unique_terms,
                sentiment_feats[0], sentiment_feats[1], sentiment_feats[2],
                sentiment_feats[3],
                twitter_objs[2], twitter_objs[1], twitter_objs[0], retweet]
def getSyllables(word, lang):
    """Return what ``syllabifier.orthographic_syllabify`` produces for *word* in *lang*."""
    syllables = syllabifier.orthographic_syllabify(word, lang)
    return syllables