def main():
    """Parse the file named in ``sys.argv[1]`` and print the requested reports.

    Usage: ``parse.py <fileName> [-stats] [-lw] [-lwf] [-mfw] [-find=<word>]``

    A file whose name ends in ``txt`` is parsed into a ``Document`` and any
    of the optional reports are printed; a file whose name ends in ``csv``
    is parsed into a ``Custom_CSV`` with no further output.

    Exits the process with status 22 when the file argument is missing,
    does not name an existing file, or has an unsupported extension.

    NOTE(review): a later ``def main()`` in this file rebinds the name, so
    this version is only reachable if that later definition is removed.
    """
    # Initial input checking
    if len(sys.argv) < 2:
        print("Insufficient arguments. Usage: parse.py <fileName>")
        sys.exit(22)

    path = sys.argv[1]
    if not os.path.isfile(path):
        print("File does not exist.")
        sys.exit(22)

    # Only arguments after the file name are option flags; scanning the
    # program name or the file name for flags would give false matches.
    options = sys.argv[2:]

    if path.endswith("txt"):
        doc = Document()
        with open(path) as file:
            parser = document.Parser(file, doc)
            parser.parseDocument()

        if "-stats" in options:
            print("Document Statistics:")
            print("Total Words : %d" % (doc.getNumTotalWords()))
            print("Different Words : %d" % (doc.getNumDifferentWords()))
            print("Number of Sentences : %d" % (doc.getNumSentences()))
            print("Number of Paragraphs: %d" % (doc.getNumParagraphs()))
            print("------------")

        if "-lw" in options:
            print("Words in Document:")
            for w in doc:
                print("%s" % (w))
            print("------------")

        if "-lwf" in options:
            print("Word Frequency in Document:")
            for w in doc:
                print("%s: %s" % (w, doc.getWordCount(w)))
            print("------------")

        if "-mfw" in options:
            print("Most Frequent Word(s) in Document:")
            maxFreq = 0
            mostFreqWords = []
            for w in doc:
                count = doc.getWordCount(w)  # look the count up once per word
                if count > maxFreq:
                    # New maximum: discard the words collected so far.
                    maxFreq = count
                    mostFreqWords = [w]
                elif count == maxFreq:
                    mostFreqWords.append(w)
            for w in mostFreqWords:
                print("%s: %s" % (w, maxFreq))
            print("------------")

        prefix = "-find="
        for option in options:
            # Match the prefix at the start only, so the slice below is
            # guaranteed to remove exactly the "-find=" text.
            if option.startswith(prefix):
                searchWord = option[len(prefix):]
                if searchWord:
                    print("Searching for Word \"%s\" in Document:" % searchWord)
                    print("%s: %s" % (searchWord, doc.getWordCount(searchWord)))
                    print("------------")
    elif path.endswith("csv"):
        table = Custom_CSV()
        with open(path) as file:
            parser = custom_csv.CSV_Parser(file, table)
            parser.parseCSV()
    else:
        print("Bad file extenstion.")
        sys.exit(22)
def _print_vector(label, vector):
    """Print ``label`` then every component of ``vector`` on one line."""
    print(label, end="")
    for component in vector:
        print("%f\t" % component, end="")
    print()


def _row_index_ok(row, num_vectors):
    """Report and reject a row index outside ``0 <= row < num_vectors``."""
    if row < 0:
        print('Please do not give the program negative numbers')
        return False
    # Rows are zero-based, so ``row == num_vectors`` is already out of range.
    if row >= num_vectors:
        print('Row %d is out of range' % row)
        return False
    return True


def _run_pair_metric(table, raw_rows, quiet, description, result_label, metric):
    """Apply a two-vector ``metric`` to the rows named in ``raw_rows``.

    Returns 0 on success (including a reported length mismatch) and 22
    when either row index is invalid.
    """
    row1 = int(raw_rows[0])
    row2 = int(raw_rows[1])
    if row1 < 0 or row2 < 0:
        print('Please do not give the program negative numbers')
        return 22
    num_vectors = table.getNumVectors()
    if row1 >= num_vectors or row2 >= num_vectors:
        # Report the arguments of the option actually being processed.
        print('Row %s is out of range' % (raw_rows,))
        return 22
    x = table.getVector(row1)
    y = table.getVector(row2)
    print('Computing %s of row %d and %d...' % (description, row1, row2))
    if len(x) == len(y):
        if not quiet:
            _print_vector("row1 %2d: " % row1, x)
            _print_vector("row2 %2d: " % row2, y)
        print("%s: %f" % (result_label, metric(x, y)))
    else:
        print('Row %d does not have the same length as %d' % (row1, row2))
    print("------------")
    return 0


def _run_row_statistic(table, raw_row, quiet, name, statistic, announce=True):
    """Apply a single-vector ``statistic`` to one row and print the result.

    ``announce`` controls the extra "Computing <name> for row" line that the
    min/max/median/mean options print before validation. Returns 0 on
    success and 22 when the row index is invalid.
    """
    row = int(raw_row)
    if announce:
        print('Computing %s for row %d' % (name, row))
    if not _row_index_ok(row, table.getNumVectors()):
        return 22
    print('Computing %s of row %d...' % (name, row))
    vector = table.getVector(row)
    if not quiet:
        _print_vector("row %2d: " % row, vector)
    print("%s: %f" % (name, statistic(vector)))
    print("------------")
    return 0


def _run_column_statistic(table, raw_col, quiet, name, statistic):
    """Apply ``statistic`` to one column and print the result.

    Returns 0 on success (including an empty column) and 22 when the
    column index is negative.
    """
    col = int(raw_col)
    print('Computing %s for col %d' % (name, col))
    if col < 0:
        print('Please do not give the program negative numbers')
        return 22
    print('Computing %s of column %d...' % (name, col))
    column = table.getColumn(col)
    if len(column) > 0:
        if not quiet:
            print("%d" % col)
            print("---")
            for value in column:
                print("%f" % value)
        print("%s: %f" % (name, statistic(column)))
    else:
        print("Column %d had no entries" % col)
    print("------------")
    return 0


def _print_word_counts(doc, words):
    """Print each word in ``words`` next to its occurrence count in ``doc``."""
    for w in words:
        print('%-16s %-13d' % (w, doc.getWordCount(w)))


def _process_csv(args):
    """Parse ``args.file`` as CSV and run every requested vector operation.

    Returns 0 on success, 22 on an invalid option value, or the ``errno``
    of the ``FileNotFoundError`` when the file cannot be opened.
    """
    print('okay csv')
    table = Custom_CSV()
    print('Opening csv...')
    try:
        with open(args.file) as file:
            print('Parsing csv...')
            parser = custom_csv.CSV_Parser(file, table)
            parser.parseCSV()
            print('Done!')
            print("------------")
    except FileNotFoundError as e:
        print('Could not find file %s' % (args.file))
        return e.errno

    if args.display_all:
        print('Displaying all vectors and their rows...')
        print('%-16s %-13s' % ('Row', 'Vector'))
        for i in range(table.getNumVectors()):
            _print_vector("%d: " % i, table.getVector(i))
        print("------------")

    if args.top:
        top = int(args.top)
        if top < 0:
            print('Please do not give the program negative numbers')
            return 22
        # ``top`` is a count, not an index, so ``top == getNumVectors()`` is valid.
        if top > table.getNumVectors():
            print('Row %d is out of range' % top)
            return 22
        print('Displaying top %d rows...' % top)
        print('%-16s %-13s' % ('Row', 'Vector'))
        for i in range(top):
            _print_vector("%d: " % i, table.getVector(i))
        print("------------")

    if args.length:
        status = _run_row_statistic(table, args.length, args.quiet,
                                    'length', vector_math.length,
                                    announce=False)
        if status:
            return status

    # (option attribute, text for the "Computing ..." line, result label,
    #  vector_math function name). Functions are looked up lazily so that an
    # option that was not requested never touches vector_math.
    pair_metrics = (
        ('dot', 'dot product', 'Dot Product', 'dot'),
        ('euclidean', 'Euclidean Distance', 'Euclidean Distance',
         'euclideanDistance'),
        ('manhattan', 'Manhattan Distance', 'Manhattan Distance ',
         'manhattanDistance'),
        ('pearson', 'Pearson Correlation', 'Pearson Correlation',
         'pearsonCorrelation'),
    )
    for option, description, result_label, function_name in pair_metrics:
        raw_rows = getattr(args, option)
        if raw_rows:
            status = _run_pair_metric(table, raw_rows, args.quiet,
                                      description, result_label,
                                      getattr(vector_math, function_name))
            if status:
                return status

    # The column options reuse the row statistic functions, as before.
    statistics = (
        ('min', 'minRow'),
        ('max', 'maxRow'),
        ('median', 'medianRow'),
        ('mean', 'meanRow'),
    )
    for name, function_name in statistics:
        raw_row = getattr(args, name + '_row')
        if raw_row:
            status = _run_row_statistic(table, raw_row, args.quiet, name,
                                        getattr(vector_math, function_name))
            if status:
                return status
    for name, function_name in statistics:
        raw_col = getattr(args, name + '_col')
        if raw_col:
            status = _run_column_statistic(table, raw_col, args.quiet, name,
                                           getattr(vector_math, function_name))
            if status:
                return status
    return 0


def _process_document(args):
    """Parse ``args.file`` as text and print every requested word report.

    Returns 0 on success, 22 on an invalid option value, or the ``errno``
    of the ``FileNotFoundError`` when the file cannot be opened.
    """
    doc = Document()
    # Read Document
    print('Opening document...')
    try:
        with open(args.file) as file:
            print('Parsing document...')
            parser = document.Parser(file, doc)
            parser.parseDocument()
            print('Done!')
            print("------------")
    except FileNotFoundError as e:
        print('Could not find file %s' % (args.file))
        return e.errno

    # Display Document
    if args.display_all:
        print('Displaying all words and their occurrences...')
        print('%-16s %-13s' % ('Word', 'Occurrences'))
        _print_word_counts(doc, doc)
        print("------------")

    if args.most_frequent:
        print('Finding most frequent...')
        mostFreq = doc.getMostFrequentWord()
        print('%-16s %-13s' % ('Most Frequent', 'Occurrences'))
        print('%-16s %-13d' % (mostFreq, doc.getWordCount(mostFreq)))
        print("------------")

    if args.most_frequent_percent:
        percent = int(args.most_frequent_percent)
        if percent > 100 or percent < 0:
            print('%d is an invalid percentage -- must be between 0 and 100'
                  % percent)
            return 22
        print('Finding words within %d percent of the most frequent...'
              % percent)
        print_words = doc.getMostFrequentWords(percent)
        print('%-16s %-13s' % ('Most Frequent', 'Occurrences'))
        _print_word_counts(doc, print_words)
        print("------------")

    if args.top:
        topNum = int(args.top)
        if topNum < 0:
            print('Please do not give the program negative numbers')
            return 22
        print('Finding top %d words...' % topNum)
        topWords = doc.getTopWords(topNum)
        print('%-16s %-13s' % ('Top %d Frequent' % topNum, 'Occurrences'))
        _print_word_counts(doc, topWords)
        print("------------")

    if args.equal_to:
        freq = int(args.equal_to)
        print('Finding words with %d ocurrences...' % freq)
        wordsEqual = doc.getWordsEqualToFrequency(freq)
        print('%-16s %-13s' % ('Word', 'Occurrences'))
        _print_word_counts(doc, sorted(wordsEqual))
        print("------------")

    if args.above:
        freq = int(args.above)
        print('Finding words above %d occurrences...' % freq)
        wordsAbove = doc.getWordsAboveFrequency(freq)
        print('%-16s %-13s' % ('Word', 'Occurrences'))
        _print_word_counts(doc, sorted(wordsAbove))
        print("------------")

    if args.word:
        print('Finding occurrences for %s...' % args.word)
        print('%-16s %-13s' % ('Word', 'Occurrences'))
        # ``args.word`` may hold several comma-separated words.
        _print_word_counts(doc, args.word.split(','))
        print("------------")

    if args.is_in:
        print('Checking if %s is in the document...' % args.is_in)
        if doc.getWordCount(args.is_in) > 0:
            print('"%s" IS in the document!' % args.is_in)
        else:
            print('"%s" IS NOT in the document!' % args.is_in)
        print("------------")

    if args.stats:
        print("Document Statistics:")
        print("Number Total Words : %d" % (doc.getNumTotalWords()))
        print("Number Different Words: %d" % (doc.getNumDifferentWords()))
        print("Number of Sentences : %d" % (doc.getNumSentences()))
        print("Number of Paragraphs : %d" % (doc.getNumParagraphs()))
        print("------------")
    return 0


def main():
    """Parse the command line and process the given ``.csv`` or ``.txt`` file.

    The parser comes from ``buildArguments()``. The name in ``args.file``
    selects the mode: a name ending in ``csv`` runs the vector operations,
    a name ending in ``txt`` runs the word reports.

    Returns:
        0 on success, 22 when an option value or the file type is invalid,
        or the ``errno`` of the ``FileNotFoundError`` when the file cannot
        be opened.
    """
    argParser = buildArguments()
    args = argParser.parse_args()
    filename = args.file
    print("------------")
    if filename.endswith('csv'):
        return _process_csv(args)
    if filename.endswith('txt'):
        return _process_document(args)
    print('Please provide either a .csv or a .txt file to be parsed')
    return 22