def main():
  """Drive the script: read the domain list and hand the pieces to write_pig.

  The command-line arguments (everything after the program name) are parsed
  by parseArgs into the domain file name, the days, the query type names,
  the query type codes and the output path. Each non-blank line of the
  domain file is lower-cased, stripped of surrounding whitespace and of one
  trailing dot, and kept only when regdom reports a registered domain for it.
  """
  domainFileName, days, queryTypeNames, queryTypeCodes, outputPath = parseArgs(sys.argv[1:])

  domainList = []
  regDomainList = []

  # Read domains from the input file; the with-block closes it even on error.
  with open(domainFileName, "r") as domainFileDesc:
    for domain in domainFileDesc:
      domain = domain.strip().lower()
      if not domain:
        # Blank line: indexing domain[-1] below would raise IndexError.
        continue
      if domain[-1] == '.':
        domain = domain[:-1]
      regdomain = regdom.get_registered_domain(domain)
      if regdomain is not None:
        domainList.append(domain)
        regDomainList.append(regdomain)
      else:
        # Single parenthesized argument prints identically whether print is
        # a statement or a function.
        print('Domain %s does not have valid registered domain, skipping.' % domain)

  filesToSearch = findFiles(domainList, queryTypeNames, days)
  inputPath = makeInputString(filesToSearch, days)
  print(inputPath)
  regexString = getRegex(regDomainList)
  queryString = makeQueryString(queryTypeCodes)
  write_pig(inputPath, outputPath, queryString, regexString)
def main():
    """Drive the script: read the domain list and hand the pieces to write_pig.

    The command-line arguments (everything after the program name) are parsed
    by parseArgs into the domain file name, the date range, the query types,
    the query type codes and the output path. Each non-blank line of the
    domain file is lower-cased, stripped of surrounding whitespace and of one
    trailing dot, and kept only when regdom reports a registered domain for it.
    """
    domainFileName, dateRange, queryTypes, queryTypeCodes, outputpath = parseArgs(sys.argv[1:])

    domainList = []
    regDomainList = []

    # Read domains from the input file; the with-block closes it even on error.
    with open(domainFileName, "r") as domainFileDesc:
        for domain in domainFileDesc:
            domain = domain.strip().lower()
            if not domain:
                # Blank line: indexing domain[-1] below would raise IndexError.
                continue
            if domain[-1] == ".":
                domain = domain[:-1]
            regdomain = regdom.get_registered_domain(domain)
            if regdomain is not None:
                domainList.append(domain)
                regDomainList.append(regdomain)
            else:
                # Single parenthesized argument prints identically whether
                # print is a statement or a function.
                print("Domain %s does not have valid registered domain, skipping." % domain)

    filesToSearch = findFiles(domainList, queryTypes, dateRange)
    fileinputstring = makeInputString(filesToSearch)
    regexstring = get_regex(regDomainList)
    querystring = makequerystring(queryTypeCodes)
    write_pig(fileinputstring, outputpath, querystring, regexstring)
def findFiles(domainList, queryTypes, days):
  """Return the distinct "<qtype>_<TLD group>_<bucket number>" names for the domains.

  For every domain the registered domain is looked up through regdom, its
  labels are reversed, and the first reversed label (upper-cased) selects the
  TLD group: COM, NET, ORG and ARPA keep their own group, anything else is
  grouped as OTHR. The reversed registered domain is hashed with getJavahash
  and, for each query type, getBucketNumber picks a bucket from
  bucketDistribution["<qtype>_<TLD group>"].

  Names are returned in first-seen order without duplicates. The days
  argument is accepted but not used here.
  """
  ownGroupTlds = ("COM", "NET", "ORG", "ARPA")
  collected = []

  for name in domainList:
    registered = regdom.get_registered_domain(name)
    reversedLabels = registered.split(".")[::-1]
    reversedName = ".".join(reversedLabels)

    # The first label of the reversed name is the top-level domain.
    group = reversedLabels[0].upper()
    if group not in ownGroupTlds:
      group = "OTHR"

    hashCode = getJavahash(reversedName)

    for queryType in queryTypes:
      groupKey = queryType + "_" + group
      distribution = bucketDistribution[groupKey]
      # The bucket number is concatenated directly, so it must be a string.
      candidate = groupKey + "_" + getBucketNumber(distribution, hashCode)
      # Several domains can land in the same file; list each file only once.
      if candidate not in collected:
        collected.append(candidate)

  return collected
def findFiles(domainList, queryTypes, dateRange):
    """Return the distinct "<day>_<qtype>_<TLD group>_<bucket number>" names.

    For every domain the registered domain is looked up through regdom, one
    trailing dot is dropped, the labels are reversed, and the first reversed
    label (upper-cased) selects the TLD group: COM, NET and ARPA keep their
    own group, anything else is grouped as OTHR. The reversed registered
    domain is hashed with getJavahash and, for each query type,
    getBucketNumber picks a bucket from
    bucket_distribution["<qtype>_<TLD group>"]. Each resulting name is then
    prefixed with every entry of dateRange.

    Names are returned in first-seen order without duplicates.
    """
    own_group_tlds = ("COM", "NET", "ARPA")
    collected = []

    for name in domainList:
        registered = regdom.get_registered_domain(name)
        if registered[-1] == ".":
            registered = registered[:-1]

        reversed_labels = registered.split(".")[::-1]
        reversed_name = ".".join(reversed_labels)

        # The first label of the reversed name is the top-level domain.
        group = reversed_labels[0].upper()
        if group not in own_group_tlds:
            group = "OTHR"

        hash_code = getJavahash(reversed_name)

        per_type_names = []
        for query_type in queryTypes:
            group_key = query_type + "_" + group
            distribution = bucket_distribution[group_key]
            # The bucket number is concatenated directly, so it must be a string.
            bucket = getBucketNumber(distribution, hash_code)
            per_type_names.append(group_key + "_" + bucket)

        # Day-major order: all query types for one day before the next day.
        for day in dateRange:
            for per_type_name in per_type_names:
                candidate = day + "_" + per_type_name
                if candidate not in collected:
                    collected.append(candidate)

    return collected
def main():
  """Drive the script: read the domain list and hand the pieces to write_pig.

  The command-line arguments (everything after the program name) are parsed
  by parseArgs into the domain file name, the year-month to day map, the
  query types and the output path. Each non-blank line of the domain file is
  lower-cased, stripped of surrounding whitespace and of one trailing dot;
  the registered domain reported by regdom is collected when there is one.
  """
  domainFileName, yearmonthToDayMap, queryTypes, outputpath = parseArgs(sys.argv[1:])

  regDomainList = []

  # Read domains from the input file; the with-block closes it even on error.
  with open(domainFileName, "r") as domainFileDesc:
    for domain in domainFileDesc:
      domain = domain.strip().lower()
      if not domain:
        # Blank line: indexing domain[-1] below would raise IndexError.
        continue
      if domain[-1] == '.':
        domain = domain[:-1]
      regdomain = regdom.get_registered_domain(domain)
      if regdomain is not None:
        regDomainList.append(regdomain)
      else:
        # Single parenthesized argument prints identically whether print is
        # a statement or a function.
        print('Domain %s does not have valid registered domain, skipping.' % domain)

  inputstring = makeInputString(yearmonthToDayMap)
  querystring = makequerystring(queryTypes)
  regexstring = get_regex(regDomainList)
  write_pig(inputstring, outputpath, querystring, regexstring)