Ejemplo n.º 1
0
    def run(self):
        """Run the training phase followed by the testing phase.

        Training: reads the manual difficulty levels from the file
        ``manual_difficulty_scores`` in the current working directory (one
        ``<file path>:<integer level>`` entry per line; reading stops at the
        first empty line).  Every file below ``self.input_train_directory`` is
        passed to ``Calculator``; its text statistics go to
        ``stats_training.csv`` and its English readability indices go to
        ``english_index_training.csv``.  Statistics of files whose path (exactly
        as produced by ``path.join(dir_path, filename)``) has a manual level
        are collected and handed to ``Generator`` together with those levels.

        Testing: every file below ``self.input_test_directory`` is processed
        the same way; statistics go to ``stats_testing.csv`` and the value
        returned by ``Generator.custom_index`` plus the English indices go to
        ``index_testing.csv``.

        All CSV files are written into ``self.output_directory`` (created if
        it does not exist), use ``;`` as separator and double-quote each field.

        Raises:
            IOError / OSError: if the difficulty file cannot be read or an
                output file or directory cannot be created.
            ValueError, IndexError: if a line of the difficulty file is not
                of the form ``<key>:<integer>``.
        """

        def csv_row(values):
            # Each field is str()-converted and wrapped in double quotes;
            # embedded quotes are not escaped.
            return ";".join('"' + str(value) + '"' for value in values) + "\n"

        def parser_stats(parser):
            # Text statistics, in the column order of ``stats_header``.
            return [
                parser.average_sentence_length(),
                parser.average_word_length(),
                parser.average_syllable_per_word(),
                parser.number_of_polysyllables_per_30_words(),
                parser.number_of_jukthakshar_per_30_words(),
            ]

        def english_indices(calculator):
            # Readability indices, in the column order of ``english_header``.
            return [
                calculator.automated_readability_index(),
                calculator.gunning_fog_index(),
                calculator.smog_index(),
                calculator.flesch_reading_ease(),
                calculator.flesch_kincaid_grade_level(),
                calculator.coleman_liau_index(),
            ]

        def input_files(directory):
            # Lazily yield the path of every file found below ``directory``.
            for dir_path, _, file_names in walk(directory):
                for filename in file_names:
                    yield path.join(dir_path, filename)

        stats_header = ("Filename", "ASL", "AWL", "ASW", "PSW30", "JUK30")
        english_header = (
            "Automatic Readability Index",
            "Gunning Fog Index",
            "Smog Index",
            "Flesch Reading Ease",
            "Flesch Kincaid Grade Level",
            "Coleman Liau Index",
        )

        print("Started the program...")

        # Start the training
        print("Started training from the training data...")

        asl = []
        awl = []
        asw = []
        psw30 = []
        juk30 = []

        difficulty = {}
        difficulty_scores = []

        # ``with`` guarantees the handle is released even if reading fails.
        with open('manual_difficulty_scores') as scores_file:
            lines = scores_file.readlines()

        print("Reading the manual difficulty levels given to training data...")

        for line in lines:
            # An empty line terminates the list of entries.
            if line == '\n':
                break
            fields = line.split(':')
            difficulty[fields[0]] = int(fields[1])

        # Create output directory
        try:
            stat(self.output_directory)
        except OSError:
            # stat() failed, so the directory is assumed to be missing.
            mkdir(self.output_directory)

        # Build each path once so the progress messages always name the file
        # that is actually opened, whatever the platform's path separator is.
        stats_path = path.join(self.output_directory, 'stats_training.csv')
        english_index_path = path.join(self.output_directory,
                                       'english_index_training.csv')

        print("Writing stats of training data to " + stats_path + "...")

        # Write to output file
        with open(stats_path, 'w+') as output_file:
            output_file.write(csv_row(stats_header))

            print("Calculating English Readability indices for training data...")
            print("Added them to " + english_index_path + "...")

            with open(english_index_path, 'w+') as english_index_file:
                english_index_file.write(
                    csv_row(("Filename",) + english_header))

                # Walk through all files in tree
                for test_file in input_files(self.input_train_directory):
                    # The index calculator
                    c = Calculator(test_file)

                    # Computed once and reused for both the training lists
                    # and the CSV row.
                    # NOTE(review): assumes the parser methods return the
                    # same value on repeated calls - confirm.
                    stats = parser_stats(c.parser)

                    # Only files with a manual level feed the generator.
                    if test_file in difficulty:
                        difficulty_scores.append(difficulty[test_file])
                        columns = (asl, awl, asw, psw30, juk30)
                        for column, value in zip(columns, stats):
                            column.append(value)

                    output_file.write(csv_row([test_file] + stats))
                    english_index_file.write(
                        csv_row([test_file] + english_indices(c)))

        # Generate the index
        print("Generating the custom index using Correlation and Linear Regression...")

        g = Generator(asl, awl, asw, psw30, juk30, difficulty_scores,
                      self.output_directory)
        g.generate()

        # Stop the training

        # Start testing
        print("Start testing on testing data...")

        stats_path = path.join(self.output_directory, 'stats_testing.csv')
        index_path = path.join(self.output_directory, 'index_testing.csv')

        print("Calculating stats for testing data...")
        print("Writing stats to " + stats_path + "...")

        # Write to output file
        with open(stats_path, 'w+') as output_file:
            output_file.write(csv_row(stats_header))

            print("Calculating English Indices on testing data...")
            print("Applying our custom formula to testing data...")
            print("Output written to " + index_path + "...")

            with open(index_path, 'w+') as index_file:
                index_file.write(
                    csv_row(("Filename", "Generated Index Value")
                            + english_header))

                # Walk through all files in tree
                for test_file in input_files(self.input_test_directory):
                    # The index calculator
                    c = Calculator(test_file)

                    index = g.custom_index(test_file)

                    output_file.write(
                        csv_row([test_file] + parser_stats(c.parser)))
                    index_file.write(
                        csv_row([test_file, index] + english_indices(c)))

        print("End testing...")
        # Stop testing