def build_alt_words_table(n):
    Logger.log_message('Building alternate words table based on ' + str(n) + '-grams')
    alt_words = {}
    in_file = path.join('out', str(n) + '-gram-regexp.csv')
    Logger.log_message('Reading ' + in_file)
    input_file = open(in_file)
    for line in input_file.readlines():
        words = line.split(';')[1].split()
        for word in words:
            for alt_word in words:
                if word in alt_words:
                    if alt_word not in alt_words[word]:
                        alt_words[word].append(alt_word)
                else:
                    alt_words[word] = [alt_word]
    input_file.close()
    Logger.log_success('Finished reading ' + in_file)
    out_file = path.join('out', 'lwlm-alt-words-' + str(n) + '-grams.csv')
    Logger.log_message('Writing alternate words table to ' + out_file)
    output_file = open(out_file, 'w+')
    for word in alt_words:
        words = set(alt_words[word])
        col = ' '.join(words)
        output_file.write(word + ';' + col + '\n')
    output_file.close()
    Logger.log_success('Alternate words table has been written to ' + out_file)
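# A minimal sketch of the grouping logic above, run on one hypothetical row of
# '3-gram-regexp.csv' (format: '<pattern>;<words seen in the wildcard slot>').
# Every word after the ';' becomes an alternate of every other word in the
# same list, including itself; duplicates are removed on write via set().
def _alt_words_example():
    sample_line = 'the|*|cat;big small\n'   # hypothetical regexp CSV row
    words = sample_line.split(';')[1].split()
    alt_words = {}
    for word in words:
        for alt_word in words:
            alt_words.setdefault(word, set()).add(alt_word)
    return alt_words   # {'big': {'big', 'small'}, 'small': {'big', 'small'}}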
def run(self):
    Logger.log_message("Running ITFIDF")
    Logger.log_message("Reading " + self.tf_dict)
    tf_dict_file = open(self.tf_dict)
    for line in tf_dict_file.readlines():
        cols = line.split(";")
        self.tf_dictionary[cols[0]] = int(cols[1])
    tf_dict_file.close()
    Logger.log_message("Reading " + self.df_dict)
    df_dict_file = open(self.df_dict)
    for line in df_dict_file.readlines():
        cols = line.split(";")
        self.df_dictionary[cols[0]] = int(cols[1])
    df_dict_file.close()
    max_tf = max(self.tf_dictionary.values())
    max_df = max(self.df_dictionary.values())
    # Score every word that appears in both dictionaries
    for word in self.df_dictionary:
        if word in self.tf_dictionary:
            self.itfidf[word] = (max_tf * max_df) / \
                (self.tf_dictionary[word] * self.df_dictionary[word])
    Logger.log_message("Writing results to " + self.out_file)
    self.dump_results()
    Logger.log_success("Finished writing results to " + self.out_file)
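# A rough worked example of the ITF-IDF score computed above, with assumed
# numbers: max_tf and max_df are the corpus-wide maxima, tf and df are one
# word's counts. Rarer words get higher scores.
def _itfidf_example():
    max_tf, max_df = 1000, 50   # assumed corpus-wide maxima
    tf, df = 10, 5              # assumed counts for a single word
    return (max_tf * max_df) / (tf * df)   # (1000 * 50) / (10 * 5) = 1000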
def __init__(self, stats_file, out_file):
    Logger.log_message("Initializing Readability Calculator")
    self.stats_file = stats_file
    self.out_file = out_file
    self.readability_grade = {}
def parse(self, in_file):
    if not in_file.endswith(".txt"):
        return
    Logger.log_message("Parsing file " + in_file)
    input_file = open(in_file)
    content = input_file.read()
    words = content.split()
    self.number_of_words[in_file] = 0
    self.number_of_sentences[in_file] = 0
    self.number_of_syllables[in_file] = 0
    self.number_of_chars[in_file] = 0
    for word in words:
        # Check if there are any separators
        for separator in self.separators:
            if separator in word:
                self.number_of_sentences[in_file] += 1
        sanitized_word = Parser.sanitize_word(word)
        if sanitized_word == "":
            continue
        self.number_of_words[in_file] += 1
        self.number_of_chars[in_file] += len(sanitized_word)
        self.number_of_syllables[in_file] += \
            SyllableCounter.count_syllables(sanitized_word)
    input_file.close()
def main():
    if len(sys.argv) > 1:
        if sys.argv[1] == 'server':
            run_server()
            return
    Logger.log_message("Running application Simplify")
    cleanup()
    LWLM.build_tables('corpus')
    Logger.log_success("Application exited successfully")
def __init__(self, in_file, out_file):
    Logger.log_message("Initializing merger")
    self.in_file = in_file
    self.out_file = out_file
    self.file_names = "abcdefghijklmnopqrstuvwxyz"
    self.number_of_chars = {}
    self.number_of_words = {}
    self.number_of_sentences = {}
    self.number_of_syllables = {}
def sanitize(in_file, out_file):
    Logger.log_message('Sanitizing ' + in_file)
    input_file = open(in_file, 'r')
    output_file = open(out_file, 'w')
    for input_line in input_file.readlines():
        output_line = ' '.join([word.lower()
                                for word in split(r'\W', input_line) if word])
        output_file.write(output_line + '\n')
    input_file.close()
    output_file.close()
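# A quick illustration of the sanitizing split above: re.split on '\W'
# treats every non-word character as a separator, and the 'if word' filter
# drops the empty fragments it produces.
def _sanitize_example():
    from re import split
    return ' '.join(w.lower() for w in split(r'\W', 'Hello, World!') if w)
    # -> 'hello world'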
def stem_file(in_file, out_file):
    Logger.log_message('Stemming ' + in_file)
    input_file = open(in_file, 'r')
    output_file = open(out_file, 'w')
    stemmer = PorterStemmer()   # reuse one stemmer instead of creating one per word
    for line in input_file.readlines():
        output_line = ' '.join([stemmer.stem_word(word)
                                for word in line.split()])
        output_file.write(output_line + '\n')
    input_file.close()
    output_file.close()
def __init__(self, in_dir, out_dir, out_file):
    Logger.log_message("Initializing parser")
    self.in_dir = in_dir
    self.out_file = path.join(out_dir, out_file)
    self.out_dir = out_dir
    self.separators = [".", "!", "?"]
    self.number_of_words = {}
    self.number_of_sentences = {}
    self.number_of_syllables = {}
    self.number_of_chars = {}
def run(self):
    Logger.log_message("Running Readability Calculator on " + self.stats_file)
    input_file = open(self.stats_file)
    for line in input_file.readlines():
        # Skip comment lines
        if line[0] == "#":
            continue
        cols = line.split(";")
        grade = int(cols[0])
        words = float(cols[2])
        syllables = float(cols[3])
        sentences = float(cols[4])
        # Flesch-Kincaid grade level formula
        flesch_kincaid_grade = 0.39 * (words / sentences) + \
            11.8 * (syllables / words) - 15.59
        self.readability_grade[grade] = flesch_kincaid_grade
    input_file.close()
    Logger.log_message("Writing results to " + self.out_file)
    self.dump_results()
    Logger.log_success("Results written to " + self.out_file)
    grades = list(self.readability_grade.keys())
    fk_grades = list(self.readability_grade.values())
    correlation = pearsonr(grades, fk_grades)
    Logger.log_result("Correlation between grade level and Flesch Kincaid grade: "
                      + str(correlation))
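# A quick sanity check of the Flesch-Kincaid grade formula used above, on
# made-up counts: 100 words, 120 syllables, and 10 sentences give
# 0.39 * 10 + 11.8 * 1.2 - 15.59 = 2.47, i.e. roughly a US grade 2-3 text.
def _flesch_kincaid_example():
    words, syllables, sentences = 100.0, 120.0, 10.0   # hypothetical counts
    return 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59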
def parse_file(self, in_file):
    Logger.log_message('Running ' + str(self.n) + '-Gram Frequency counter on ' + in_file)
    input_file = open(in_file)
    # Build a list of sanitized words
    content = []
    for line in input_file.readlines():
        for word in line.split():
            sanitized = Sanitizer.sanitize_word(word)
            if sanitized:
                content.append(sanitized)
    length = len(content)
    # Slide a window of n words over the content
    for i in range(length - self.n + 1):
        s = ""
        neighbor_str = ""
        word = ""
        for j in range(self.n):
            s += content[i + j]
            if j == self.n // 2:
                # The middle word becomes a wildcard in the neighbor pattern
                neighbor_str += '*'
                word = content[i + j]
            else:
                neighbor_str += content[i + j]
            if j != self.n - 1:
                s += '|'
                neighbor_str += '|'
        # Count the n-gram and record the middle word under its neighbor pattern
        if s in self.table:
            self.table[s] += 1
        else:
            self.table[s] = 1
        if neighbor_str in self.neighbors:
            self.neighbors[neighbor_str].append(word)
        else:
            self.neighbors[neighbor_str] = [word]
    input_file.close()
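# To illustrate the two keys built per window above (hypothetical tokens,
# not corpus data): for n = 3 and ['the', 'big', 'cat'], the count key is
# 'the|big|cat' and the neighbor pattern is 'the|*|cat', with 'big'
# recorded as the word seen in the wildcard slot.
def _ngram_key_example():
    tokens, n = ['the', 'big', 'cat'], 3
    count_key = '|'.join(tokens)
    middle = n // 2
    pattern = '|'.join('*' if j == middle else tokens[j] for j in range(n))
    return count_key, pattern, tokens[middle]   # ('the|big|cat', 'the|*|cat', 'big')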
def count_kf_frequency(self, in_file):
    Logger.log_message('Counting Kucera Francis frequency for ' + in_file)
    input_file = open(in_file, 'r')
    for line in input_file.readlines():
        for word in line.split():
            if word.isdigit():
                continue
            if word in self.kf_val:
                # If word is present in the psycholinguistic dictionary
                self.kf_res[word] = self.kf_val[word]
            else:
                self.kf_res[word] = 0
    input_file.close()
def count_syllables(self, in_file):
    Logger.log_message('Counting number of syllables for ' + in_file)
    input_file = open(in_file, 'r')
    for line in input_file.readlines():
        for word in line.split():
            if word.isdigit():
                continue
            if word in self.syllable_val:
                # If word is present in psycholinguistic dictionary
                self.syllable_res[word] = self.syllable_val[word]
            else:
                self.syllable_res[word] = 0
    input_file.close()
def count_term_frequency(self, in_file):
    Logger.log_message('Counting term frequency for ' + in_file)
    input_file = open(in_file, 'r')
    for line in input_file.readlines():
        for word in line.split():
            if word.isdigit():
                continue
            if word in self.frequencies:
                self.frequencies[word] += 1
            else:
                self.frequencies[word] = 1
    input_file.close()
def build_tables(in_dir):
    Logger.log_message('Building 3-Gram LWLM tables')
    ng = NGram(3, in_dir)
    ng.run()
    Logger.log_success('Finished building 3-Gram LWLM tables')
    Logger.log_message('Building 5-Gram LWLM tables')
    ng = NGram(5, in_dir)
    ng.run()
    Logger.log_success('Finished building 5-Gram LWLM tables')
    LWLM.build_alt_words_table(3)
    LWLM.build_alt_words_table(5)
def main():
    Logger.log_message("Starting NCERT Readability application")
    # Run the parser
    parser = Parser(CORPUS_DIR, OUTPUT_DIR, STATS_FILE)
    parser.run()
    # Merge the stats
    merger = Merger(path.join(OUTPUT_DIR, STATS_FILE),
                    path.join(OUTPUT_DIR, MERGED_STATS_FILE))
    merger.run()
    # Compute readability grades from the merged stats
    readability_calc = ReadabilityCalculator(
        path.join(OUTPUT_DIR, MERGED_STATS_FILE),
        path.join(OUTPUT_DIR, RESULTS_FILE))
    readability_calc.run()
    Logger.log_success("Application exited successfully")
def run(self):
    # Check for the input directory
    try:
        stat(self.in_dir)
    except OSError:
        Logger.log_error('Input text not found')
        return
    # Create the output directory if it doesn't exist
    try:
        stat(self.out_dir)
    except OSError:
        makedirs(self.out_dir)
    Logger.log_message('Started stemming')
    # Walk through the input directory
    for (dir_path, _, file_names) in walk(self.in_dir):
        for file_name in file_names:
            in_file = path.join(dir_path, file_name)
            out_file = path.join(self.out_dir,
                                 file_name + '_' + dir_path.replace('/', '_') + '.txt')
            Stemmer.stem_file(in_file, out_file)
    Logger.log_success('Finished stemming')
def run(self):
    Logger.log_message("Running merger on " + self.in_file)
    input_file = open(self.in_file)
    for line in input_file.readlines():
        # Skip comment lines
        if line[0] == "#":
            continue
        cols = line.split(";")
        file_name = str(cols[0].split("/")[-1])
        # The first letter of the file name encodes the grade: 'a' -> 1, 'b' -> 2, ...
        grade = self.file_names.index(file_name[0]) + 1
        if grade not in self.number_of_words:
            self.number_of_chars[grade] = 0
            self.number_of_words[grade] = 0
            self.number_of_syllables[grade] = 0
            self.number_of_sentences[grade] = 0
        self.number_of_chars[grade] += int(cols[1])
        self.number_of_words[grade] += int(cols[2])
        self.number_of_syllables[grade] += int(cols[3])
        self.number_of_sentences[grade] += int(cols[4])
    input_file.close()
    Logger.log_message("Writing merged results to " + self.out_file)
    self.dump_results()
    Logger.log_success("Results written to " + self.out_file)
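# The grade number above is recovered from the first letter of the stats
# file name: 'a...' maps to grade 1, 'b...' to grade 2, and so on. For a
# hypothetical row whose file column ends in 'cmath.txt', the merger would
# accumulate its counts under grade 3.
def _grade_from_filename_example():
    file_names = 'abcdefghijklmnopqrstuvwxyz'
    return file_names.index('cmath.txt'[0]) + 1   # 'c' -> 3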
def run(self):
    # Check for the input directory
    try:
        stat(self.in_dir)
    except OSError:
        Logger.log_error('Input text not found')
        return
    Logger.log_message('Running term frequency counter')
    for (dir_path, _, file_names) in walk(self.in_dir):
        for file_name in file_names:
            in_file = path.join(dir_path, file_name)
            self.count_term_frequency(in_file)
    Logger.log_success('Finished term frequency counting')
    Logger.log_message('Writing results to ' + self.out_file)
    self.dump_results()
    Logger.log_success('Finished writing results to ' + self.out_file)
def terms_in_document(self, in_file):
    Logger.log_message('Running document frequency counter for ' + in_file)
    input_file = open(in_file)
    # Reset words in document every time
    for word in self.document_words:
        self.document_words[word] = False
    # Set the words present in document to True
    for line in input_file.readlines():
        for word in line.split():
            self.document_words[word] = True
    input_file.close()
    # Count each word that appeared at least once in this document
    for word in self.document_words:
        if self.document_words[word]:
            if word in self.document_frequencies:
                self.document_frequencies[word] += 1
            else:
                self.document_frequencies[word] = 1
def __init__(self, host, port, debug):
    self.host = host
    self.port = port
    self.debug = debug
    self.app = Flask(__name__)

    # Index route
    @self.app.route('/')
    def index():
        return render_template('index.html')

    @self.app.route('/api/tag')
    def tag_api():
        text = request.args["text"]
        _type = request.args["type"]
        tagger = Tagger(_type)
        result = tagger.tag(text)
        return jsonify(success=True, result=result)

    Logger.log_success('Server started successfully')
def run_server():
    try:
        stat('out')
    except OSError:
        Logger.log_error('Data tables not built yet')
        Logger.log_message('Please run ./run first')
        return
    Logger.log_message('Running application server')
    web_app = WebApp('localhost', 8000, True)
    web_app.run()
def dump_results(self):
    out_file = path.join('out', str(self.n) + '-gram.csv')
    Logger.log_message('Writing ' + str(self.n) + '-Gram table to ' + out_file)
    output_file = open(out_file, 'w+')
    for s in self.table:
        output_file.write(s + ';' + str(self.table[s]) + '\n')
    output_file.close()
    Logger.log_success('Finished writing ' + str(self.n) + '-Gram table to ' + out_file)
    out_file = path.join('out', str(self.n) + '-gram-regexp.csv')
    Logger.log_message('Writing ' + str(self.n) + '-Gram Regular Expressions to ' + out_file)
    output_file = open(out_file, 'w+')
    for nb in self.neighbors:
        words = set(self.neighbors[nb])
        col = ' '.join(words)
        output_file.write(nb + ';' + col + '\n')
    output_file.close()
    Logger.log_success(str(self.n) + '-Gram Regular Expressions have been written to ' + out_file)
def run(self):
    Logger.log_message('Running ' + str(self.n) + '-Gram Frequency counter')
    try:
        stat(self.in_dir)
    except OSError:
        Logger.log_error('Input text not found')
        return
    for (dir_path, _, file_names) in walk(self.in_dir):
        for file_name in file_names:
            in_file = path.join(dir_path, file_name)
            self.parse_file(in_file)
    self.dump_results()
    Logger.log_success(str(self.n) + '-Gram Frequency counter exited successfully')
def run(self):
    Logger.log_message("Parser started running")
    # Check if the input directory exists
    try:
        stat(self.in_dir)
    except OSError:
        Logger.log_error("Input text not found")
        return
    # Create output directory if it doesn't exist
    try:
        stat(self.out_dir)
    except OSError:
        mkdir(self.out_dir)
    for (dir_path, _, file_names) in walk(self.in_dir):
        for file_name in file_names:
            in_file = path.join(dir_path, file_name)
            self.parse(in_file)
    Logger.log_message("Writing results to " + self.out_file)
    self.dump_results()
    Logger.log_success("Results have been written to " + self.out_file)
def run(self):
    # Build up the syllable count dictionary
    dictionary = open(self.dict_file, 'r')
    Logger.log_message("Reading " + self.dict_file)
    for line in dictionary.readlines():
        cols = line.split(';')
        self.syllable_val[cols[0]] = int(cols[2])
    dictionary.close()
    # Check for the input directory
    try:
        stat(self.in_dir)
    except OSError:
        Logger.log_error('Input text not found')
        return
    Logger.log_message('Running syllable counter')
    for (dir_path, _, file_names) in walk(self.in_dir):
        for file_name in file_names:
            in_file = path.join(dir_path, file_name)
            self.count_syllables(in_file)
    Logger.log_success('Finished syllable counting')
    Logger.log_message('Writing results to ' + self.out_file)
    self.dump_results()
    Logger.log_success('Finished writing results to ' + self.out_file)
def main():
    args = sys.argv
    usage = '''
    ./run txtdump\t<Gives the text dump of corpus>
    ./run sanitize\t<Sanitize the text dump to remove white spaces, etc.>
    ./run stem\t\t<Stem the sanitized text>
    ./run tf\t\t<Calculate the raw term frequency>
    ./run df\t\t<Calculate the document frequency>
    ./run itfidf\t<Calculate the inverse term frequency - inverse document frequency>
    ./run dict\t\t<Create the psycholinguistic dictionary>
    ./run kff\t\t<Calculate the Kucera Francis frequency>
    ./run syl\t\t<Calculate the number of syllables>
    ./run server\t<Run the application server>
    '''
    if len(args) < 2:
        Logger.log_usage(usage)
        return
    if args[1] == 'server':
        web_app = WebApp('127.0.0.1', 5000, DEBUG)
        web_app.run()
    elif args[1] == 'txtdump':
        txt_dump = TxtDump('corpus', path.join('tmp', 'txtdump'))
        txt_dump.run()
    elif args[1] == 'sanitize':
        sanitizer = Sanitizer(path.join('tmp', 'txtdump'),
                              path.join('tmp', 'sanitized'))
        sanitizer.run()
    elif args[1] == 'stem':
        stemmer = Stemmer(path.join('tmp', 'sanitized'),
                          path.join('tmp', 'stemmed'))
        stemmer.run()
    elif args[1] == 'tf':
        tf = TermFrequency(path.join('tmp', 'stemmed'),
                           path.join('data', 'tf_stemmed.csv'))
        tf.run()
        tf = TermFrequency(path.join('tmp', 'sanitized'),
                           path.join('data', 'terms_list.csv'))
        tf.run()
    elif args[1] == 'df':
        df = DocumentFrequency(path.join('tmp', 'stemmed'),
                               path.join('data', 'df_stemmed.csv'),
                               path.join('data', 'tf_stemmed.csv'))
        df.run()
    elif args[1] == 'itfidf':
        itfidf = ITFIDF(path.join('data', 'itfidf_stemmed.csv'),
                        path.join('data', 'tf_stemmed.csv'),
                        path.join('data', 'df_stemmed.csv'))
        itfidf.run()
    elif args[1] == 'dict':
        dict_creator = PsycholinguisticDbCreator(
            path.join('data', 'psycholinguistic_db'),
            path.join('data', 'psycholinguistic_db.csv'))
        dict_creator.create()
    elif args[1] == 'kff':
        kf_freq_counter = KFFrequency(path.join('tmp', 'stemmed'),
                                      path.join('data', 'kff_stemmed.csv'),
                                      path.join('data', 'psycholinguistic_db.csv'))
        kf_freq_counter.run()
    elif args[1] == 'syl':
        syllable_counter = SyllableCounter(path.join('tmp', 'stemmed'),
                                           path.join('data', 'syllables_stemmed.csv'),
                                           path.join('data', 'psycholinguistic_db.csv'))
        syllable_counter.run()
    else:
        Logger.log_usage(usage)
def cleanup():
    Logger.log_message('Cleaning up')
    call(['rm', '-rf', 'out'])
    call(['mkdir', 'out'])
    Logger.log_success('Finished cleaning up')
def run(self):
    dictionary = open(self.dict_file)
    Logger.log_message('Reading ' + self.dict_file)
    # Construct the dictionary
    for line in dictionary.readlines():
        items = line.split(";")
        self.document_frequencies[items[0]] = 0
        self.document_words[items[0]] = False
    dictionary.close()
    # Check for the input directory
    try:
        stat(self.in_dir)
    except OSError:
        Logger.log_error('Input text not found')
        return
    Logger.log_message('Running document frequency counter')
    for (dir_path, _, file_names) in walk(self.in_dir):
        for file_name in file_names:
            in_file = path.join(dir_path, file_name)
            self.terms_in_document(in_file)
    Logger.log_message('Finished document frequency counting')
    Logger.log_message('Writing results to ' + self.out_file)
    self.dump_results()
    Logger.log_success('Finished writing results to ' + self.out_file)
def __init__(self, host, port, debug):
    self.host = host
    self.port = port
    self.debug = debug
    self.app = Flask(__name__)
    self.syntactic_simplifier = SyntacticSimplifier()
    self.enricher = Enricher()

    # Page routes
    @self.app.route('/')
    def index():
        return render_template('index.html')

    @self.app.route('/enrich')
    def enrich():
        return render_template('enrich.html')

    @self.app.route('/lexus')
    def lexus():
        return render_template('lexus.html')

    @self.app.route('/syntax')
    def syntax():
        return render_template('syntax.html')

    @self.app.route('/readability')
    def readability():
        return render_template('readability.html')

    # API routes
    @self.app.route('/api/simplify')
    def simplify_api():
        text = request.args['text']
        n = request.args['n']
        lex_result = LexicalSimplifier.simplify(text, n)
        syn_result = self.syntactic_simplifier.simplify(text)
        result = {
            "lexical": lex_result,
            "syntactic": syn_result
        }
        return jsonify(success=True, result=result)

    @self.app.route('/api/lexus/simplify')
    def lexus_simplify_api():
        text = request.args['text']
        n = request.args['n']
        result = LexicalSimplifier.simplify(text, n)
        return jsonify(success=True, result=result)

    @self.app.route('/api/syntax/simplify')
    def syntax_simplify_api():
        text = request.args['text']
        result = self.syntactic_simplifier.simplify(text, False, True)
        return jsonify(success=True, result=result)

    @self.app.route('/api/enrich')
    def enrich_api():
        text = request.args['text']
        result = self.enricher.enrich(text)
        return jsonify(success=True, result=result)

    @self.app.route('/api/readability')
    def readability_api():
        text = request.args['text']
        result = {
            "flesch_kincaid_grade_level": FleschKincaid.calculate_grade_level(text)
        }
        return jsonify(success=True, result=result)

    Logger.log_success("Started application server successfully")
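# A hedged sketch of exercising the simplify endpoint above once the server
# is up (assumes the localhost:8000 address that run_server passes to WebApp;
# the sample text and n value are made up).
def _simplify_api_example():
    import requests
    resp = requests.get('http://localhost:8000/api/simplify',
                        params={'text': 'The cat sat on the mat.', 'n': 3})
    return resp.json()   # {'success': True, 'result': {'lexical': ..., 'syntactic': ...}}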