def build_alt_words_table(n):
    Logger.log_message('Building alternate words table based on ' + str(n) + '-grams')
    alt_words = {}
    in_file = path.join('out', str(n) + '-gram-regexp.csv')
    Logger.log_message('Reading ' + in_file)
    input_file = open(in_file)
    for line in input_file.readlines():
        # Every pair of words that share the same n-gram context pattern
        # are recorded as alternates of each other
        words = line.split(';')[1].split()
        for word in words:
            for alt_word in words:
                if word in alt_words:
                    if alt_word not in alt_words[word]:
                        alt_words[word].append(alt_word)
                else:
                    alt_words[word] = [alt_word]
    input_file.close()
    Logger.log_success('Finished reading ' + in_file)
    out_file = path.join('out', 'lwlm-alt-words-' + str(n) + '-grams.csv')
    Logger.log_message('Writing alternate words table to ' + out_file)
    output_file = open(out_file, 'w+')
    for word in alt_words:
        words = set(alt_words[word])
        col = ' '.join(words)
        output_file.write(word + ';' + col + '\n')
    output_file.close()
    Logger.log_success('Alternate words table has been written to ' + out_file)
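
# A minimal, self-contained sketch of the grouping step above (the sample
# rows are illustrative, not taken from the real corpus): every pair of
# words that appears in the same context pattern become alternates.
def group_alternates(rows):
    alt_words = {}
    for row in rows:
        words = row.split(';')[1].split()
        for word in words:
            alt_words.setdefault(word, set()).update(words)
    return alt_words

# Both rows share a pattern slot, so 'big' and 'large' become alternates
sample = ['the * dog;big large', 'a * house;big old']
assert 'large' in group_alternates(sample)['big']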
def run(self):
    Logger.log_message("Running ITFIDF")
    # Build the term frequency dictionary
    Logger.log_message("Reading " + self.tf_dict)
    tf_dict_file = open(self.tf_dict)
    for line in tf_dict_file.readlines():
        cols = line.split(";")
        self.tf_dictionary[cols[0]] = int(cols[1])
    tf_dict_file.close()
    # Build the document frequency dictionary
    Logger.log_message("Reading " + self.df_dict)
    df_dict_file = open(self.df_dict)
    for line in df_dict_file.readlines():
        cols = line.split(";")
        self.df_dictionary[cols[0]] = int(cols[1])
    df_dict_file.close()
    max_tf = max(self.tf_dictionary.values())
    max_df = max(self.df_dictionary.values())
    # Score every word that appears in both dictionaries
    for word in self.df_dictionary:
        if word in self.tf_dictionary:
            self.itfidf[word] = (max_tf * max_df) / \
                (self.tf_dictionary[word] * self.df_dictionary[word])
    Logger.log_message("Writing results to " + self.out_file)
    self.dump_results()
    Logger.log_success("Finished writing results to " + self.out_file)
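
# A standalone sketch of the score computed in run() above: words that are
# frequent in the corpus and spread across many documents score low, rare
# words score high. The numbers below are illustrative.
def itfidf_score(tf, df, max_tf, max_df):
    return (max_tf * max_df) / (tf * df)

assert itfidf_score(tf=50, df=10, max_tf=1000, max_df=100) == 200.0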
def run(self):
    # Build up the syllable count dictionary
    dictionary = open(self.dict_file, 'r')
    Logger.log_message("Reading " + self.dict_file)
    for line in dictionary.readlines():
        cols = line.split(';')
        self.syllable_val[cols[0]] = int(cols[2])
    dictionary.close()
    # Check for the input directory
    try:
        stat(self.in_dir)
    except OSError:
        Logger.log_error('Input text not found')
        return
    Logger.log_message('Running syllable counter')
    for (dir_path, _, file_names) in walk(self.in_dir):
        for file_name in file_names:
            in_file = path.join(dir_path, file_name)
            self.count_syllables(in_file)
    Logger.log_success('Finished syllable counting')
    Logger.log_message('Writing results to ' + self.out_file)
    self.dump_results()
    Logger.log_success('Finished writing results to ' + self.out_file)
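
# Assumed layout of a dictionary row, based on the columns read above
# (only columns 0 and 2 are used; the middle column is ignored here):
#   word;<other data>;syllable_count
# e.g. a row like "simplify;<other data>;3" maps 'simplify' to 3 syllables.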
def run(self):
    # Check for the input directory
    try:
        stat(self.in_dir)
    except OSError:
        Logger.log_error('Input text not found')
        return
    # Create the output directory if it doesn't exist
    try:
        stat(self.out_dir)
    except OSError:
        makedirs(self.out_dir)
    Logger.log_message('Started stemming')
    # Walk through the input directory and stem every file
    for (dir_path, _, file_names) in walk(self.in_dir):
        for file_name in file_names:
            in_file = path.join(dir_path, file_name)
            out_file = path.join(self.out_dir,
                                 file_name + '_' + dir_path.replace('/', '_') + '.txt')
            Stemmer.stem_file(in_file, out_file)
    Logger.log_success('Finished stemming')
def run(self):
    dictionary = open(self.dict_file)
    Logger.log_message('Reading ' + self.dict_file)
    # Construct the dictionary: every term starts with a zero count
    for line in dictionary.readlines():
        items = line.split(";")
        self.document_frequencies[items[0]] = 0
        self.document_words[items[0]] = False
    dictionary.close()
    # Check for the input directory
    try:
        stat(self.in_dir)
    except OSError:
        Logger.log_error('Input text not found')
        return
    Logger.log_message('Running document frequency counter')
    for (dir_path, _, file_names) in walk(self.in_dir):
        for file_name in file_names:
            in_file = path.join(dir_path, file_name)
            self.terms_in_document(in_file)
    Logger.log_message('Finished document frequency counting')
    Logger.log_message('Writing results to ' + self.out_file)
    self.dump_results()
    Logger.log_success('Finished writing results to ' + self.out_file)
def run(self):
    Logger.log_message("Running merger on " + self.in_file)
    input_file = open(self.in_file)
    for line in input_file.readlines():
        # Skip comment lines
        if line[0] == "#":
            continue
        cols = line.split(";")
        file_name = cols[0].split("/")[-1]
        # The grade level is derived from the first character of the file name
        grade = self.file_names.index(file_name[0]) + 1
        if grade not in self.number_of_words:
            self.number_of_chars[grade] = 0
            self.number_of_words[grade] = 0
            self.number_of_syllables[grade] = 0
            self.number_of_sentences[grade] = 0
        self.number_of_chars[grade] += int(cols[1])
        self.number_of_words[grade] += int(cols[2])
        self.number_of_syllables[grade] += int(cols[3])
        self.number_of_sentences[grade] += int(cols[4])
    input_file.close()
    Logger.log_message("Writing merged results to " + self.out_file)
    self.dump_results()
    Logger.log_success("Results written to " + self.out_file)
def run(self):
    Logger.log_message("Running Readability Calculator on " + self.stats_file)
    input_file = open(self.stats_file)
    for line in input_file.readlines():
        # Skip comment lines
        if line[0] == "#":
            continue
        cols = line.split(";")
        grade = int(cols[0])
        words = float(cols[2])
        syllables = float(cols[3])
        sentences = float(cols[4])
        flesch_kincaid_grade = 0.39 * (words / sentences) + \
            11.8 * (syllables / words) - 15.59
        self.readability_grade[grade] = flesch_kincaid_grade
    input_file.close()
    Logger.log_message("Writing results to " + self.out_file)
    self.dump_results()
    Logger.log_success("Results written to " + self.out_file)
    # Correlate the actual grade levels with the computed FK grades
    grades = list(self.readability_grade.keys())
    fk_grades = list(self.readability_grade.values())
    correlation = pearsonr(grades, fk_grades)
    Logger.log_result("Correlation between grade level and Flesch Kincaid grade: "
                      + str(correlation))
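
# The formula used in run() above is the standard Flesch-Kincaid grade
# level. A self-contained version with an illustrative check:
def flesch_kincaid_grade(words, sentences, syllables):
    return 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59

# 100 words over 5 sentences with 140 syllables reads at roughly grade 8.7
assert abs(flesch_kincaid_grade(100.0, 5.0, 140.0) - 8.73) < 0.01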
def main():
    if len(sys.argv) > 1:
        if sys.argv[1] == 'server':
            run_server()
            return
    Logger.log_message("Running application Simplify")
    cleanup()
    LWLM.build_tables('corpus')
    Logger.log_success("Application exited successfully")
def build_tables(in_dir):
    Logger.log_message('Building 3-Gram LWLM tables')
    ng = NGram(3, in_dir)
    ng.run()
    Logger.log_success('Finished building 3-Gram LWLM tables')

    Logger.log_message('Building 5-Gram LWLM tables')
    ng = NGram(5, in_dir)
    ng.run()
    Logger.log_success('Finished building 5-Gram LWLM tables')

    LWLM.build_alt_words_table(3)
    LWLM.build_alt_words_table(5)
def run(self):
    Logger.log_message('Running ' + str(self.n) + '-Gram Frequency counter')
    # Check for the input directory
    try:
        stat(self.in_dir)
    except OSError:
        Logger.log_error('Input text not found')
        return
    for (dir_path, _, file_names) in walk(self.in_dir):
        for file_name in file_names:
            in_file = path.join(dir_path, file_name)
            self.parse_file(in_file)
    self.dump_results()
    Logger.log_success(str(self.n) + '-Gram Frequency counter exited successfully')
def main():
    Logger.log_message("Starting NCERT Readability application")
    # Run the parser
    parser = Parser(CORPUS_DIR, OUTPUT_DIR, STATS_FILE)
    parser.run()
    # Merge the stats
    merger = Merger(path.join(OUTPUT_DIR, STATS_FILE),
                    path.join(OUTPUT_DIR, MERGED_STATS_FILE))
    merger.run()
    # Compute readability grades from the merged stats
    readability_calc = ReadabilityCalculator(
        path.join(OUTPUT_DIR, MERGED_STATS_FILE),
        path.join(OUTPUT_DIR, RESULTS_FILE))
    readability_calc.run()
    Logger.log_success("Application exited successfully")
def run(self):
    # Check for the input directory
    try:
        stat(self.in_dir)
    except OSError:
        Logger.log_error('Input text not found')
        return
    Logger.log_message('Running term frequency counter')
    for (dir_path, _, file_names) in walk(self.in_dir):
        for file_name in file_names:
            in_file = path.join(dir_path, file_name)
            self.count_term_frequency(in_file)
    Logger.log_success('Finished term frequency counting')
    Logger.log_message('Writing results to ' + self.out_file)
    self.dump_results()
    Logger.log_success('Finished writing results to ' + self.out_file)
def dump_results(self):
    # Write the raw n-gram counts
    out_file = path.join('out', str(self.n) + '-gram.csv')
    Logger.log_message('Writing ' + str(self.n) + '-Gram table to ' + out_file)
    output_file = open(out_file, 'w+')
    for s in self.table:
        output_file.write(s + ';' + str(self.table[s]) + '\n')
    output_file.close()
    Logger.log_success('Finished writing ' + str(self.n) + '-Gram table to ' + out_file)
    # Write the n-gram patterns and the words seen in each pattern's slot
    out_file = path.join('out', str(self.n) + '-gram-regexp.csv')
    Logger.log_message('Writing ' + str(self.n) + '-Gram Regular Expressions to ' + out_file)
    output_file = open(out_file, 'w+')
    for nb in self.neighbors:
        words = set(self.neighbors[nb])
        col = ' '.join(words)
        output_file.write(nb + ';' + col + '\n')
    output_file.close()
    Logger.log_success(str(self.n) + '-Gram Regular Expressions have been written to ' + out_file)
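
# Illustrative shape of the two files written above (contents are made up):
#   out/3-gram.csv        -> "<ngram>;<count>",         e.g. "the big dog;17"
#   out/3-gram-regexp.csv -> "<pattern>;<word> <word>", e.g. "the * dog;big lazy"
# build_alt_words_table() consumes the second file to pair up alternate words.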
def run(self):
    Logger.log_message("Parser started running")
    # Check if the input directory exists
    try:
        stat(self.in_dir)
    except OSError:
        Logger.log_error("Input text not found")
        return
    # Create output directory if it doesn't exist
    try:
        stat(self.out_dir)
    except OSError:
        mkdir(self.out_dir)
    for (dir_path, _, file_names) in walk(self.in_dir):
        for file_name in file_names:
            in_file = path.join(dir_path, file_name)
            self.parse(in_file)
    Logger.log_message("Writing results to " + self.out_file)
    self.dump_results()
    Logger.log_success("Results have been written to " + self.out_file)
def __init__(self, host, port, debug):
    self.host = host
    self.port = port
    self.debug = debug
    self.app = Flask(__name__)

    # Index route
    @self.app.route('/')
    def index():
        return render_template('index.html')

    # Tagging API: tags the given text with the requested tagger type
    @self.app.route('/api/tag')
    def tag_api():
        text = request.args["text"]
        _type = request.args["type"]
        tagger = Tagger(_type)
        result = tagger.tag(text)
        return jsonify(success=True, result=result)

    Logger.log_success('Server started successfully')
def cleanup():
    Logger.log_message('Cleaning up')
    call(['rm', '-rf', 'out'])
    call(['mkdir', 'out'])
    Logger.log_success('Finished cleaning up')
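
# A portable alternative to the shell calls above (a sketch, not wired in;
# it avoids depending on the 'rm' and 'mkdir' binaries being on PATH):
import shutil
from os import makedirs

def cleanup_portable():
    shutil.rmtree('out', ignore_errors=True)
    makedirs('out')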
def __init__(self, host, port, debug):
    self.host = host
    self.port = port
    self.debug = debug
    self.app = Flask(__name__)
    self.syntactic_simplifier = SyntacticSimplifier()
    self.enricher = Enricher()

    # Page routes
    @self.app.route('/')
    def index():
        return render_template('index.html')

    @self.app.route('/enrich')
    def enrich():
        return render_template('enrich.html')

    @self.app.route('/lexus')
    def lexus():
        return render_template('lexus.html')

    @self.app.route('/syntax')
    def syntax():
        return render_template('syntax.html')

    @self.app.route('/readability')
    def readability():
        return render_template('readability.html')

    # Combined lexical and syntactic simplification
    @self.app.route('/api/simplify')
    def simplify_api():
        text = request.args['text']
        n = request.args['n']
        lex_result = LexicalSimplifier.simplify(text, n)
        syn_result = self.syntactic_simplifier.simplify(text)
        result = {
            "lexical": lex_result,
            "syntactic": syn_result
        }
        return jsonify(success=True, result=result)

    # Lexical simplification only
    @self.app.route('/api/lexus/simplify')
    def lexus_simplify_api():
        text = request.args['text']
        n = request.args['n']
        result = LexicalSimplifier.simplify(text, n)
        return jsonify(success=True, result=result)

    # Syntactic simplification only
    @self.app.route('/api/syntax/simplify')
    def syntax_simplify_api():
        text = request.args['text']
        result = self.syntactic_simplifier.simplify(text, False, True)
        return jsonify(success=True, result=result)

    # Text enrichment
    @self.app.route('/api/enrich')
    def enrich_api():
        text = request.args['text']
        result = self.enricher.enrich(text)
        return jsonify(success=True, result=result)

    # Readability metrics
    @self.app.route('/api/readability')
    def readability_api():
        text = request.args['text']
        result = {
            "flesch_kincaid_grade_level": FleschKincaid.calculate_grade_level(text)
        }
        return jsonify(success=True, result=result)

    Logger.log_success("Started application server successfully")
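
# How a server like this might be started; the 'run' method below is an
# assumption, not shown in this section. It uses Flask's built-in
# development server with the host/port/debug fields stored in __init__.
def run(self):
    self.app.run(host=self.host, port=self.port, debug=self.debug)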