# Command-line tail of the URL report script: registers the --comparator
# option, loads the URLs listed in the input file and prints, for each one,
# its validity, its canonical form and whether the source/canonical form is
# unique under the selected comparator.
parser.add_argument(
    '-c', '--comparator',
    choices=comparators.keys(),
    help='comparison function to define URL uniqueness (DEFAULT=alpha)')
args = parser.parse_args()

# Read URLs from file, one per line, dropping surrounding whitespace.
# The `with` block guarantees the handle is closed even if reading fails.
urls = None
try:
    with open(args.input) as infile:
        urls = [line.strip() for line in infile]
except IOError:
    print("Error: File \"%s\" not found." % args.input)
    # Exit with a non-zero status so callers/shell scripts can detect failure.
    sys.exit(1)

# Default comparator is alpha unless one was chosen on the command line.
# NOTE: the name `cmp` shadows the Python 2 builtin; it is kept because it is
# a module-level name that other parts of the script may rely on.
cmp = comparators['alpha' if args.comparator is None else args.comparator]

# Normalized counterpart of `urls`.
# NOTE(review): assumed to hold one entry per input URL, in the same order,
# because it is indexed in lockstep with `urls` below -- confirm against
# normalizer.normalize_list.
normUrls = normalizer.normalize_list(urls)

# Print results. Each line is pre-formatted into a single string so that the
# output is identical whether `print` is the statement or the function.
for i, url in enumerate(urls):
    normUrl = normUrls[i]
    print('%s %s' % ('Source: ', url))
    print('%s %s' % ('Valid: ', validator.is_valid(url)))
    print('%s %s' % ('Canonicalized: ', normUrl))
    print('%s %s' % ('Source unique: ', is_unique(url, urls, cmp)))
    print('%s %s' % ('Canonicalized unique: ',
                     is_unique(normUrl, normUrls, cmp)))
    print('')
# Main flow of the URL sorting script: read URLs from args.input, optionally
# keep only the valid (normalized) or only the invalid ones, sort them with
# the selected algorithm and write the result to args.output, one per line.

# Read the input BEFORE touching the output file: opening the output with 'w'
# truncates it, so doing that first would destroy existing data when the
# input is missing (or when input and output are the same path).
try:
    with open(args.input) as infile:
        # remove leading/trailing whitespace from urls
        urls = [line.strip() for line in infile]
except IOError:
    print("Error: File \"%s\" not found." % args.input)
    # Non-zero status so callers can detect the failure.
    sys.exit(1)

sel = 3  # default selection is quicksort
if args.sort is not None:
    # sort selection supplied on the command line
    sel = args.sort

# normalize and validate urls, if specified
if args.filter is not None:
    validUrls = validator.valid_list(urls)
    if args.filter == 'valid':
        urls = normalizer.normalize_list(validUrls)
    elif args.filter == 'invalid':
        # Set membership avoids rescanning the valid list for every URL.
        valid_set = set(validUrls)
        urls = [url for url in urls if url not in valid_set]

sorter = algos[sel](urls)
sortedList = sorter.sort()

try:
    outfile = open(args.output, 'w')
except IOError:
    print("Error: Unable to open output file \"%s\"." % args.output)
    sys.exit(1)

# The context manager flushes and closes the output file deterministically.
with outfile:
    outfile.write("\n".join(sortedList))