Example #1
0
def clean_corpus(inputfile):
	"""Clean a raw text file line by line and write two output files.

	Every line of ``inputfile`` is passed through ``remove_numbering`` and
	then ``remove_special_chars``. The result is written to:

	* ``<name>.cleaned.txt`` -- one cleaned line per input line;
	* ``<name>.num.txt``     -- the same line prefixed with its 1-based
	  line number and a tab.

	Both files are created under ``OUT_DIR``; ``<name>`` is whatever
	``FileTool.getfilename(inputfile)`` returns. Progress information, a
	line-count summary and the sorted set of characters seen in the cleaned
	text are printed to stdout.

	Args:
		inputfile: Path of the UTF-8 encoded text file to clean.
	"""
	print("Script for cleaning raw text input")
	c = Counter()
	all_chars = set()

	# Resolve the base name once; both output paths are derived from it.
	basename = FileTool.getfilename(inputfile)
	output_file    = os.path.join(OUT_DIR, basename + '.cleaned.txt')
	output_numfile = os.path.join(OUT_DIR, basename + '.num.txt')
	print("Input file            : %s" % (inputfile))
	print("Output file           : %s" % (output_file))
	print("Output (numbered) file: %s" % (output_numfile))

	with open(inputfile, 'r', encoding='utf8') as infile, open(output_file, 'w', encoding='utf8') as outfile, open(output_numfile, 'w', encoding='utf8') as outnumfile:
		for linenum, line in enumerate(infile):
			c.count("Line")
			cleaned_line = remove_special_chars(remove_numbering(line))
			# Record every distinct character that survives cleaning.
			all_chars.update(cleaned_line)
			outfile.write("%s\n" % cleaned_line)
			outnumfile.write("%s\t%s\n" % (linenum+1, cleaned_line))
		c.summarise()
	print("-" * 80)
	char_list = sorted(all_chars)
	try:
		print("All characters: %s" % str(char_list))
	except UnicodeEncodeError:
		# The console encoding cannot represent some characters; fall back
		# to an ASCII-escaped listing instead of silently dropping the report.
		print("All characters: %s" % ascii(char_list))
	print("Done!")
Example #2
0
def clean_corpus(inputfile):
    """Clean a raw text file line by line and write two output files.

    Every line of ``inputfile`` is passed through ``remove_numbering`` and
    then ``remove_special_chars``. The result is written to:

    * ``<name>.cleaned.txt`` -- one cleaned line per input line;
    * ``<name>.num.txt``     -- the same line prefixed with its 1-based
      line number and a tab.

    Both files are created under ``OUT_DIR``; ``<name>`` is whatever
    ``FileTool.getfilename(inputfile)`` returns. Progress information, a
    line-count summary and the sorted set of characters seen in the cleaned
    text are printed to stdout.

    Args:
        inputfile: Path of the UTF-8 encoded text file to clean.
    """
    print("Script for cleaning raw text input")
    c = Counter()
    all_chars = set()

    # Resolve the base name once; both output paths are derived from it.
    basename = FileTool.getfilename(inputfile)
    output_file    = os.path.join(OUT_DIR, basename + '.cleaned.txt')
    output_numfile = os.path.join(OUT_DIR, basename + '.num.txt')
    print("Input file            : %s" % (inputfile))
    print("Output file           : %s" % (output_file))
    print("Output (numbered) file: %s" % (output_numfile))

    with open(inputfile, 'r', encoding='utf8') as infile, open(output_file, 'w', encoding='utf8') as outfile, open(output_numfile, 'w', encoding='utf8') as outnumfile:
        for linenum, line in enumerate(infile):
            c.count("Line")
            cleaned_line = remove_special_chars(remove_numbering(line))
            # Record every distinct character that survives cleaning.
            all_chars.update(cleaned_line)
            outfile.write("%s\n" % cleaned_line)
            outnumfile.write("%s\t%s\n" % (linenum+1, cleaned_line))
        c.summarise()
    print("-" * 80)
    char_list = sorted(all_chars)
    try:
        print("All characters: %s" % str(char_list))
    except UnicodeEncodeError:
        # The console encoding cannot represent some characters; fall back
        # to an ASCII-escaped listing instead of silently dropping the report.
        print("All characters: %s" % ascii(char_list))
    print("Done!")