Esempio n. 1
0
class Calculator:
    """Compute standard readability indices for one text file.

    All statistics (counts and averages) come from the ``Parser`` instance
    built in the constructor; this class only combines them into formulas.
    None of the methods guards against zero counts: the divisions are
    performed exactly as written.
    """

    # Constructor for the Calculator class
    def __init__(self, filename):
        """Build the ``Parser`` for *filename* that every index reads from."""
        self.parser = Parser(filename)

    # Calculate Automatic Readability Index
    def automated_readability_index(self):
        """Return 4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43."""
        characters = float(self.parser.number_of_characters())
        words = float(self.parser.number_of_words())
        sentences = float(self.parser.number_of_sentences())
        return 4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43

    # Calculate Gunning fog index
    def gunning_fog_index(self):
        """Return 0.4 * ((words / sentences) + 100 * (polysyllables / words))."""
        words = float(self.parser.number_of_words())
        sentences = float(self.parser.number_of_sentences())
        polysyllables = float(self.parser.number_of_polysyllables())
        return 0.4 * ((words / sentences) + 100 * (polysyllables / words))

    # Calculate smog index
    def smog_index(self):
        """Return 1.0430 * sqrt(polysyllables * 30 / sentences) + 3.1291."""
        polysyllables = float(self.parser.number_of_polysyllables())
        sentences = float(self.parser.number_of_sentences())
        return 1.0430 * sqrt(polysyllables * 30.0 / sentences) + 3.1291

    # Calculate flesch reading ease
    def flesch_reading_ease(self):
        """Return 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words).

        The sentence-length weight is the published 1.015; the earlier value
        of 1.105 was a transposed-digit typo.
        """
        words = float(self.parser.number_of_words())
        sentences = float(self.parser.number_of_sentences())
        syllables = float(self.parser.number_of_syllables())
        return 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)

    # Calculate Flesch Kincaid grade level
    def flesch_kincaid_grade_level(self):
        """Return 0.39 * average sentence length + 11.8 * average syllables per word - 15.59."""
        sentence_length = float(self.parser.average_sentence_length())
        syllables_per_word = self.parser.average_syllable_per_word()
        return (0.39 * sentence_length) + (11.8 * syllables_per_word) - 15.59

    # Calculate Coleman Liau index
    def coleman_liau_index(self):
        """Return 0.0588 * letters per 100 words - 0.296 * sentences per 100 words - 15.8."""
        letters = float(self.parser.average_letter_per_100_words())
        sentences = self.parser.average_sentences_per_100_words()
        return 0.0588 * letters - 0.296 * sentences - 15.8
Esempio n. 2
0
class Generator:
    """Derive a linear difficulty formula from per-document text features.

    ``generate`` selects the features whose Pearson correlation with the
    manual difficulty ratings is strong enough, fits a least-squares linear
    model on them and stores the result in ``self.features`` and
    ``self.coeff``.  ``custom_index`` then applies that model to a new file.
    """

    # Feature names, in report column order, paired with the name of the
    # Parser method that custom_index calls to measure that feature.
    _FEATURE_METHODS = (
        ('asl', 'average_sentence_length'),
        ('awl', 'average_word_length'),
        ('asw', 'average_syllable_per_word'),
        ('psw30', 'number_of_polysyllables_per_30_words'),
        ('juk30', 'number_of_jukthakshar_per_30_words'),
    )

    # A feature is kept only when |r| against difficulty exceeds this value.
    _CORRELATION_THRESHOLD = 0.4

    # Constructor for generator class
    def __init__(self, asl, awl, asw, psw30, juk30, difficulty, output_train_directory):
        """Store the training data.

        asl, awl, asw, psw30, juk30 -- one sequence per feature; they are
            passed to ``stats.pearsonr`` together with ``difficulty``, so
            they are expected to be equally long numeric sequences.
        difficulty -- the manual difficulty value of each training document.
        output_train_directory -- directory that receives stats_training.csv.
        """
        self.asl = asl
        self.awl = awl
        self.asw = asw
        self.psw30 = psw30
        self.juk30 = juk30
        self.difficulty = difficulty
        self.output_train_directory = output_train_directory

    @staticmethod
    def _csv_row(cells):
        """Format *cells* as one line of double-quoted, ';'-separated values."""
        return ";".join('"' + cell + '"' for cell in cells) + "\n"

    # Generate the equation using linear regression
    def generate(self):
        """Select features, fit the linear model and print the formula.

        Side effects:
          * appends a correlation report to ``stats_training.csv`` inside
            ``output_train_directory`` (the file is opened in append mode);
          * sets ``self.features`` to the selected feature names;
          * sets ``self.coeff`` to the fitted coefficients, laid out as one
            slope per entry of ``self.features`` (same order) followed by
            the intercept as the last element.
        """
        names = [name for name, _ in self._FEATURE_METHODS]
        series = dict((name, getattr(self, name)) for name in names)

        print("Calculating correlations...")

        # Pearson r of each feature against the manual difficulty.
        target_corr = dict(
            (name, stats.pearsonr(self.difficulty, series[name])[0])
            for name in names)

        # Pearson r of each unordered feature pair, computed once and
        # mirrored so the report matrix can be filled row by row.
        pair_corr = {}
        for position, first in enumerate(names):
            for second in names[position + 1:]:
                r = stats.pearsonr(series[first], series[second])[0]
                pair_corr[first, second] = r
                pair_corr[second, first] = r

        blank = [""] * 7
        header = [""] + [name.upper() for name in names]
        rows = [
            blank,
            header,
            ["Correlation"] + [str(target_corr[name]) for name in names],
            blank,
            header,
        ]
        for row_name in names:
            # The diagonal (a feature against itself) is left empty.
            rows.append([row_name.upper()] + [
                "" if col_name == row_name else str(pair_corr[row_name, col_name])
                for col_name in names])

        report_path = path.join(self.output_train_directory, 'stats_training.csv')
        # 'with' guarantees the file is closed even if a write fails.
        with open(report_path, 'a') as output_file:
            output_file.writelines(self._csv_row(row) for row in rows)

        self.features = [
            name for name in names
            if abs(target_corr[name]) > self._CORRELATION_THRESHOLD]

        # Single-argument print produces the same text on Python 2 and 3.
        print("Features selected " + str(self.features))

        no_features = len(self.features)
        sample_count = len(self.difficulty)

        print("Performing linear regression using manual difficulty and selected features...")

        # One row per selected feature.  Values stay floating point: casting
        # to int32 would truncate fractional averages before fitting.  The
        # reshape keeps a well-formed (0, samples) array when nothing was
        # selected, which yields an intercept-only model.
        x = np.array([series[name] for name in self.features],
                     dtype=np.float64).reshape(no_features, sample_count)
        y = np.array(self.difficulty)

        # The column of ones goes LAST so that coeff[i] is the slope of
        # features[i] and coeff[no_features] is the intercept -- the layout
        # that both the printed formula and custom_index rely on.
        X = np.vstack([x, np.ones(sample_count)]).T

        self.coeff = np.linalg.lstsq(X, y)[0]

        terms = ["(" + str(self.coeff[i]) + ") * (" + str(name) + ")"
                 for i, name in enumerate(self.features)]
        terms.append("(" + str(self.coeff[no_features]) + ")")
        formula = " + ".join(terms)

        print("Generated the following formula: ")
        print("---------------------------------------------------------------------------------")
        print(formula)
        print("---------------------------------------------------------------------------------")

    # Generate the custom index
    def custom_index(self, filename):
        """Apply the fitted formula to the text in *filename*.

        ``generate`` must have been called first, because this method reads
        ``self.features`` and ``self.coeff``.  A new ``Parser`` for
        *filename* is stored in ``self.parser``.  Returns the sum of
        slope * measured feature value over the selected features, plus the
        intercept (the last coefficient).
        """
        self.parser = Parser(filename)

        method_names = dict(self._FEATURE_METHODS)

        index = 0.0
        for i, name in enumerate(self.features):
            method_name = method_names.get(name)
            if method_name is None:
                # Unrecognised feature names contribute nothing.
                continue
            measured = getattr(self.parser, method_name)()
            index += float(measured) * float(self.coeff[i])

        index += float(self.coeff[len(self.features)])

        return index