def preprocess( result ):
    words = removePunct(result.title)
    words += " "
    words += removePunct(result.snippet)
    result.tokens = nltk.word_tokenize(words)
    for tok in result.tokens:
        if tok not in STOPS:
            tok = PorterStemmer().stem(tok.decode('utf-8'))
            tok = tok.lower().encode('utf-8')
    return result
def searchRank( query ):
    resList = []    # list of search result objects
    relList = []    # list of "indexes" of relevant results

    googleSearch(query, resList, 1)
    googleSearch(query, resList, 11)

    for r in resList:
        r = preprocess(r)       # initialize tokens attribute with pre-processed words
        r.vector = Counter(r.tokens)
        print r.rank
        print r.title
        print r.url
        print r.snippet
        print

    # ask user which results are relevant
    print "Choose up to 5 results that were relevant to your search."
    print "Enter a negative number to quit."
    relNum = int(input("Enter a result number: "))
    i = 0
    while relNum >= 0 and i < 5:
        if relNum not in relList:
            relList.append(relNum)
        else:
            print "Error: You already entered that result"
        i += 1
        relNum = int(input("Enter a result number ( negative to quit ): "))

    # write relevant data to file
    infile = open(query+'.txt', 'wb')
    for i in relList:
        for r in resList:
            if i == r.rank:
                infile.write(r.title + ' ')
                infile.write(r.snippet + ' ')
    infile.close()

    '''--------------------pre-process our relevance test set-------------------------'''
    readfile = open(query+'.txt', 'rb')
    relWords = readfile.read()
    relWords = removePunct(relWords)
    relTokens = nltk.word_tokenize(relWords)

    infile = open(query+'-clean.txt', 'w')

    for tok in relTokens:
        if tok not in STOPS:
            tok = PorterStemmer().stem(tok.decode('utf-8'))
            tok = tok.lower().encode('utf-8')
            infile.write(tok + ' ')

    infile.close()

    '''--------------------calculate, sort, and display----------------------------------'''
    relevanceVector = Counter(relTokens)    # get vector for relevance data to calc similarity

    print "Calculating relevancy of your search results......"
    # calculate similarity
    for r in resList:
        r.cosine = calc_cos(r.vector, relevanceVector)
        r.jaccard = jaccard(set(r.tokens), set(relTokens))
        # print "cosine:", r.cosine
        # print "jaccard:", r.jaccard

    print "Select sorting preference:"
    print "[1] Jaccard Coefficient"
    print "[2] Cosine Similarity"
    print
    sortChoice = raw_input("Enter choice here: ")

    if sortChoice.lower() in ['1', 'j', 'jaccard', 'jaccard coefficient']:
        resList.sort(key = lambda x: x.jaccard, reverse=True)
        print "Showing results based on jaccard coeffecient: "
    elif sortChoice.lower() in ['2', 'c', 'cosine','cosine similarity']:
        resList.sort(key = lambda x: x.cosine, reverse=True)
        print "Showing results based on cosine similarity: "

    for r in resList:
        print
        print r.rank
        print r.title
        print r.url
        print r.snippet
        print