Example #1
0
def main():
    """Report validity, canonical form and uniqueness for each URL in a file.

    The input file name is taken from the ``filename`` option returned by
    ``controller()``. For every URL read from that file the function prints
    the source URL, whether it is valid, its canonicalized form, and whether
    the source URL and the canonicalized URL have been seen earlier in the
    same run.

    When no file name is given, the parser help is printed and the process
    exits with status 1. An ``IOError`` while opening or reading the file is
    passed to ``handle_io_exception`` and no URLs are processed.
    """
    (parser, opts, args) = controller()
    if not opts.filename:
        parser.print_help()
        sys.exit(1)

    filename = opts.filename

    try:
        # The context manager closes the file even when reading fails.
        with open(filename, 'r') as f:
            # Materialize while the file is still open, in case read_file
            # hands back a lazy iterator over the file object.
            raw_url_list = list(reader.read_file(f))
    except IOError as e:
        handle_io_exception(filename, e)
        # Nothing was read on this path, so there is nothing to report.
        return

    unique_raw_urls = set()
    unique_canonicalized_urls = set()

    for raw_url in raw_url_list:
        print("Source: " + raw_url)
        is_raw_valid = url_validator.is_valid(raw_url)
        print("Valid: " + str(is_raw_valid))
        canonicalized_url = url_normalize.url_normalize(raw_url)
        print("Canonical: " + canonicalized_url)

        # A URL is "unique" the first time it is seen in this run.
        is_raw_unique = raw_url not in unique_raw_urls
        if is_raw_unique:
            unique_raw_urls.add(raw_url)
        print("Source unique: " + str(is_raw_unique))

        is_canonical_unique = canonicalized_url not in unique_canonicalized_urls
        if is_canonical_unique:
            unique_canonicalized_urls.add(canonicalized_url)
        print("Canonicalized URL unique: " + str(is_canonical_unique))
Example #2
0
def main():
  """Report validity, canonical form and uniqueness for each URL in a file.

  The input file name is taken from the ``filename`` option returned by
  ``controller()``. For every URL read from that file the function prints
  the source URL, whether it is valid, its canonicalized form, and whether
  the source URL and the canonicalized URL have been seen earlier in the
  same run.

  When no file name is given, the parser help is printed and the process
  exits with status 1. An ``IOError`` while opening or reading the file is
  passed to ``handle_io_exception`` and no URLs are processed.
  """
  (parser, opts, args) = controller()
  if not opts.filename:
    parser.print_help()
    sys.exit(1)

  filename = opts.filename

  try:
    # The context manager closes the file even when reading fails.
    with open(filename, 'r') as f:
      # Materialize while the file is still open, in case read_file
      # hands back a lazy iterator over the file object.
      raw_url_list = list(reader.read_file(f))
  except IOError as e:
    handle_io_exception(filename, e)
    # Nothing was read on this path, so there is nothing to report.
    return

  unique_raw_urls = set()
  unique_canonicalized_urls = set()

  for raw_url in raw_url_list:
    print("Source: " + raw_url)
    is_raw_valid = url_validator.is_valid(raw_url)
    print("Valid: " + str(is_raw_valid))
    canonicalized_url = url_normalize.url_normalize(raw_url)
    print("Canonical: " + canonicalized_url)

    # A URL is "unique" the first time it is seen in this run.
    is_raw_unique = raw_url not in unique_raw_urls
    if is_raw_unique:
      unique_raw_urls.add(raw_url)
    print("Source unique: " + str(is_raw_unique))

    is_canonical_unique = canonicalized_url not in unique_canonicalized_urls
    if is_canonical_unique:
      unique_canonicalized_urls.add(canonicalized_url)
    print("Canonicalized URL unique: " + str(is_canonical_unique))
Example #3
0
 def runTest(self):
     """Assert that ``url_validator.is_valid(value)`` equals ``expected``.

     ``value`` and ``expected`` are not defined in this method; they are
     looked up in an enclosing scope.
     """
     outcome = url_validator.is_valid(value)
     assert outcome == expected
 def runTest(self):
     """Check the validator's verdict for ``value`` against ``expected``.

     Both ``value`` and ``expected`` are free names resolved outside this
     method.
     """
     verdict = url_validator.is_valid(value)
     assert verdict == expected