def clean_corpus(inputfile):
    """Clean a raw text file line by line and write the results into OUT_DIR.

    Each input line is passed through ``remove_numbering`` and then
    ``remove_special_chars``. The cleaned lines are written to two files
    whose names are built from ``FileTool.getfilename(inputfile)``:

    * ``<name>.cleaned.txt`` -- one cleaned line per input line.
    * ``<name>.num.txt``     -- ``<1-based line number>\\t<cleaned line>``.

    Progress information, the counter summary and the sorted set of all
    characters seen in the cleaned text are printed to standard output.

    Args:
        inputfile: Path of the UTF-8 encoded text file to clean.

    Returns:
        None.
    """
    print("Script for cleaning raw text input")

    # NOTE(review): Counter must provide count()/summarise(), so this is a
    # project-level counter rather than collections.Counter -- confirm import.
    c = Counter()
    all_chars = set()

    basename = FileTool.getfilename(inputfile)
    output_file = os.path.join(OUT_DIR, basename + '.cleaned.txt')
    output_numfile = os.path.join(OUT_DIR, basename + '.num.txt')

    print("Input file            : %s" % (inputfile))
    print("Output file           : %s" % (output_file))
    print("Output (numbered) file: %s" % (output_numfile))

    with open(inputfile, 'r', encoding='utf8') as infile, \
            open(output_file, 'w', encoding='utf8') as outfile, \
            open(output_numfile, 'w', encoding='utf8') as outnumfile:
        for linenum, line in enumerate(infile):
            c.count("Line")
            cleaned_line = remove_numbering(line)
            cleaned_line = remove_special_chars(cleaned_line)
            # Collect every distinct character that survives cleaning.
            all_chars.update(cleaned_line)
            outfile.write("%s\n" % cleaned_line)
            # enumerate() is 0-based; the numbered file uses 1-based lines.
            outnumfile.write("%s\t%s\n" % (linenum + 1, cleaned_line))

    c.summarise()
    print("-" * 80)

    sorted_chars = sorted(all_chars)
    try:
        print("All characters: %s" % sorted_chars)
    except UnicodeEncodeError:
        # The console encoding cannot represent some characters. Keep the
        # report best-effort by falling back to an ASCII-escaped listing
        # instead of silently dropping it (and without masking unrelated
        # errors such as KeyboardInterrupt, as a bare ``except`` would).
        print("All characters: %s" % ascii(sorted_chars))
    print("Done!")