Esempio n. 1
0
# Exit status returned whenever a command-line value is rejected.
_INVALID_ARGUMENT = 22
# Horizontal rule printed between report sections.
_RULE = "------------"
_NEGATIVE_MESSAGE = 'Please do not give the program negative numbers'


def _print_vector(label, vector):
    """Print ``label`` followed by the tab-separated elements of ``vector`` on one line."""
    print(label, end="")
    for element in vector:
        print("%f\t" % element, end="")
    print()


def _print_rows(table, count):
    """Print the first ``count`` vectors of ``table`` under a Row/Vector header."""
    print('%-16s %-13s' % ('Row', 'Vector'))
    for index in range(count):
        _print_vector("%d: " % index, table.getVector(index))
    print(_RULE)


def _row_is_valid(row, num_vectors):
    """Return True if ``row`` is a usable index; otherwise print why not and return False."""
    if row < 0:
        print(_NEGATIVE_MESSAGE)
        return False
    # Rows are used as 0-based indices (see _print_rows), so num_vectors itself
    # is already past the end.
    if row >= num_vectors:
        print('Row %d is out of range' % row)
        return False
    return True


def _report_row_stat(table, row, name, func, quiet):
    """Print ``func`` applied to one row of ``table``.

    ``name`` is the label used in the progress and result lines.  The row
    itself is echoed unless ``quiet`` is truthy.  Returns 0 on success or
    ``_INVALID_ARGUMENT`` when ``row`` is negative or out of range.
    """
    if not _row_is_valid(row, table.getNumVectors()):
        return _INVALID_ARGUMENT
    print('Computing %s of row %d...' % (name, row))
    vector = table.getVector(row)
    if not quiet:
        _print_vector("row %2d: " % row, vector)
    print("%s: %f" % (name, func(vector)))
    print(_RULE)
    return 0


def _report_pair_metric(table, rows, description, result_label, func, quiet):
    """Print ``func`` applied to two rows of ``table``.

    ``rows`` holds the two row indices as given on the command line.
    ``description`` is used in the progress line and ``result_label`` in the
    result line.  Rows of different lengths are reported but are not treated
    as an error.  Returns 0, or ``_INVALID_ARGUMENT`` for a bad row index.
    """
    row1 = int(rows[0])
    row2 = int(rows[1])
    if row1 < 0 or row2 < 0:
        print(_NEGATIVE_MESSAGE)
        return _INVALID_ARGUMENT
    num_vectors = table.getNumVectors()
    for row in (row1, row2):
        if row >= num_vectors:
            # Name the offending row of *this* option in the message.
            print('Row %d is out of range' % row)
            return _INVALID_ARGUMENT
    x = table.getVector(row1)
    y = table.getVector(row2)
    print('Computing %s of row %d and %d...' % (description, row1, row2))
    if len(x) == len(y):
        if not quiet:
            _print_vector("row1 %2d: " % row1, x)
            _print_vector("row2 %2d: " % row2, y)
        print("%s: %f" % (result_label, func(x, y)))
    else:
        print('Row %d does not have the same length as %d' % (row1, row2))
    print(_RULE)
    return 0


def _report_column_stat(table, col, name, func, quiet):
    """Print ``func`` applied to one column of ``table``.

    The column is echoed one value per line unless ``quiet`` is truthy.  An
    empty column is reported and is not an error.  Returns 0, or
    ``_INVALID_ARGUMENT`` when ``col`` is negative.
    """
    print('Computing %s for col %d' % (name, col))
    if col < 0:
        print(_NEGATIVE_MESSAGE)
        return _INVALID_ARGUMENT
    print('Computing %s of column %d...' % (name, col))
    column = table.getColumn(col)
    # Explicit length test: the concrete type returned by getColumn is not
    # visible here, so plain truthiness is not relied upon.
    if len(column) > 0:
        if not quiet:
            print("%d" % col)
            print("---")
            for element in column:
                print("%f" % element)
        print("%s: %f" % (name, func(column)))
    else:
        print("Column %d had no entries" % col)
    print(_RULE)
    return 0


def _print_word_counts(doc, words, heading='Word'):
    """Print each word in ``words`` with its count in ``doc`` under a two-column header."""
    print('%-16s %-13s' % (heading, 'Occurrences'))
    for word in words:
        print('%-16s %-13d' % (word, doc.getWordCount(word)))
    print(_RULE)


def _run_csv(args):
    """Parse ``args.file`` as CSV and print every report requested in ``args``.

    Returns 0 on success, the ``errno`` of the ``FileNotFoundError`` when the
    file is missing, or ``_INVALID_ARGUMENT`` as soon as an option value is
    rejected (later options are then not processed).
    """
    print('okay csv')
    table = Custom_CSV()
    print('Opening csv...')
    try:
        with open(args.file) as file:
            print('Parsing csv...')
            custom_csv.CSV_Parser(file, table).parseCSV()
            print('Done!')
            print(_RULE)
    except FileNotFoundError as e:
        print('Could not find file %s' % (args.file))
        return e.errno

    # NOTE(review): options are tested by truthiness, as before.  If the
    # argument parser yields ints rather than strings, an index of 0 would be
    # skipped -- confirm against buildArguments().
    if args.display_all:
        print('Displaying all vectors and their rows...')
        _print_rows(table, table.getNumVectors())

    if args.top:
        top = int(args.top)
        if top < 0:
            print(_NEGATIVE_MESSAGE)
            return _INVALID_ARGUMENT
        # ``top`` is a count, not an index, so equality with the number of
        # vectors is valid here.
        if top > table.getNumVectors():
            print('Row %d is out of range' % top)
            return _INVALID_ARGUMENT
        print('Displaying top %d rows...' % top)
        _print_rows(table, top)

    if args.length:
        status = _report_row_stat(table, int(args.length), 'length',
                                  vector_math.length, args.quiet)
        if status:
            return status

    pair_metrics = (
        (args.dot, 'dot product', 'Dot Product', vector_math.dot),
        (args.euclidean, 'Euclidean Distance', 'Euclidean Distance',
         vector_math.euclideanDistance),
        (args.manhattan, 'Manhattan Distance', 'Manhattan Distance  ',
         vector_math.manhattanDistance),
        (args.pearson, 'Pearson Correlation', 'Pearson Correlation',
         vector_math.pearsonCorrelation),
    )
    for rows, description, result_label, func in pair_metrics:
        if rows:
            status = _report_pair_metric(table, rows, description,
                                         result_label, func, args.quiet)
            if status:
                return status

    row_stats = (
        (args.min_row, 'min', vector_math.minRow),
        (args.max_row, 'max', vector_math.maxRow),
        (args.median_row, 'median', vector_math.medianRow),
        (args.mean_row, 'mean', vector_math.meanRow),
    )
    for raw_row, name, func in row_stats:
        if raw_row:
            row = int(raw_row)
            print('Computing %s for row %d' % (name, row))
            status = _report_row_stat(table, row, name, func, args.quiet)
            if status:
                return status

    # Column statistics reuse the row functions on the extracted column.
    column_stats = (
        (args.min_col, 'min', vector_math.minRow),
        (args.max_col, 'max', vector_math.maxRow),
        (args.median_col, 'median', vector_math.medianRow),
        (args.mean_col, 'mean', vector_math.meanRow),
    )
    for raw_col, name, func in column_stats:
        if raw_col:
            status = _report_column_stat(table, int(raw_col), name, func,
                                         args.quiet)
            if status:
                return status

    return 0


def _run_document(args):
    """Parse ``args.file`` as a text document and print every report requested in ``args``.

    Returns 0 on success, the ``errno`` of the ``FileNotFoundError`` when the
    file is missing, or ``_INVALID_ARGUMENT`` when an option value is rejected.
    """
    doc = Document()
    print('Opening document...')
    try:
        with open(args.file) as file:
            print('Parsing document...')
            document.Parser(file, doc).parseDocument()
            print('Done!')
            print(_RULE)
    except FileNotFoundError as e:
        print('Could not find file %s' % (args.file))
        return e.errno

    if args.display_all:
        print('Displaying all words and their occurrences...')
        # Iterating the document yields its words.
        _print_word_counts(doc, doc)

    if args.most_frequent:
        print('Finding most frequent...')
        _print_word_counts(doc, [doc.getMostFrequentWord()], 'Most Frequent')

    if args.most_frequent_percent:
        percent = int(args.most_frequent_percent)
        if percent > 100 or percent < 0:
            print('%d is an invalid percentage -- must be between 0 and 100' % percent)
            return _INVALID_ARGUMENT
        print('Finding words within %d percent of the most frequent...' % percent)
        _print_word_counts(doc, doc.getMostFrequentWords(percent), 'Most Frequent')

    if args.top:
        top_num = int(args.top)
        if top_num < 0:
            print(_NEGATIVE_MESSAGE)
            return _INVALID_ARGUMENT
        print('Finding top %d words...' % top_num)
        _print_word_counts(doc, doc.getTopWords(top_num),
                           'Top %d Frequent' % top_num)

    if args.equal_to:
        freq = int(args.equal_to)
        print('Finding words with %d occurrences...' % freq)
        _print_word_counts(doc, sorted(doc.getWordsEqualToFrequency(freq)))

    if args.above:
        freq = int(args.above)
        print('Finding words above %d occurrences...' % freq)
        _print_word_counts(doc, sorted(doc.getWordsAboveFrequency(freq)))

    if args.word:
        print('Finding occurrences for %s...' % args.word)
        # Several words may be given as one comma-separated value.
        _print_word_counts(doc, args.word.split(','))

    if args.is_in:
        print('Checking if %s is in the document...' % args.is_in)
        if doc.getWordCount(args.is_in) > 0:
            print('"%s" IS in the document!' % args.is_in)
        else:
            print('"%s" IS NOT in the document!' % args.is_in)
        print(_RULE)

    if args.stats:
        print("Document Statistics:")
        print("Number Total Words    : %d" % (doc.getNumTotalWords()))
        print("Number Different Words: %d" % (doc.getNumDifferentWords()))
        print("Number of Sentences   : %d" % (doc.getNumSentences()))
        print("Number of Paragraphs  : %d" % (doc.getNumParagraphs()))
        print(_RULE)

    return 0


def main():
    """Parse the command line and run the requested reports on the input file.

    The file is handled as CSV when its name ends in ``csv`` and as a text
    document when it ends in ``txt``; anything else is rejected.

    Returns:
        0 on success, 22 for an unsupported file type or a rejected option
        value, or the ``errno`` of the ``FileNotFoundError`` when the file
        cannot be found.
    """
    args = buildArguments().parse_args()
    filename = args.file
    print(_RULE)
    if filename.endswith('csv'):
        return _run_csv(args)
    if filename.endswith('txt'):
        return _run_document(args)
    print('Please provide either a .csv or a .txt file to be parsed')
    return _INVALID_ARGUMENT