Example #1
0
File: main.py Project: setr/cs429
def search(query, scorer, index):
    """
    Retrieve documents matching a query using the specified scorer.

    1) Tokenize the query.
    2) Convert the query tokens to a vector, using Index.query_to_vector.
    3) Call the scorer's score function.
    4) Return the list of document ids in descending order of relevance.

    NB: Due to the inconsistency of floating point arithmetic, when sorting,
    round the scores to 6 decimal places (e.g., round(x, 6)). This will ensure
    replicable results.

    Params:
      query....A string representing a search query.
      scorer...A ScoringFunction to retrieve documents.
      index....A Index storing postings lists.
    Returns:
      A list of document ids in descending order of relevance to the query.
      Each id returned is the scorer's id minus one (see the note below).
    """
    query_tokens = index.tokenize(query)
    query_vector = index.query_to_vector(query_tokens)
    scores = scorer.score(query_vector, index)
    # Sort on the score rounded to 6 places, as the contract above requires;
    # sorted() is stable, so documents whose rounded scores tie keep the order
    # in which the scorer produced them.
    ranked = sorted(scores.items(),
                    key=lambda item: round(item[1], 6),
                    reverse=True)
    # NOTE(review): every id is shifted down by one before being returned --
    # presumably the scorer reports 1-based ids while callers expect 0-based
    # ones; confirm against the callers before changing this.
    return [doc_id - 1 for doc_id, _ in ranked]
Example #2
0
def search(query, scorer, index):
    """
    Rank documents against a query with the supplied scorer.

    The query is tokenized by the index, turned into a query vector with
    Index.query_to_vector, and handed to the scorer. The resulting scores are
    rounded to 6 decimal places for sorting so that floating point noise does
    not change the ordering between runs.

    Params:
      query....A string representing a search query.
      scorer...A ScoringFunction to retrieve documents.
      index....A Index storing postings lists.
    Returns:
      A list of document ids in descending order of relevance to the query.
    """
    query_vector = index.query_to_vector(index.tokenize(query))
    scores = scorer.score(query_vector, index)
    # Stable sort: documents whose rounded scores tie keep the scorer's order.
    ranked = sorted(scores.items(),
                    key=lambda pair: round(pair[1], 6),
                    reverse=True)
    return [doc_id for doc_id, _ in ranked]
Example #3
0
def run():
    """
    Index every crawled page that is not already in the index.

    Each file matched by CRAWL_GLOB is read as gzip-compressed text whose
    first line is the page URL and whose remainder is the page content. Pages
    the indexer already knows are skipped. For the others the content is
    cleaned and tokenized, a title is extracted, and the page is handed to
    the indexer. Progress is printed to stdout along the way.
    """
    page_indexer = TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE)
    page_indexer.create_if_not_exists()
    nlp = English()

    for crawl_path in glob(CRAWL_GLOB):
        print("Path", crawl_path)
        with gzip.open(crawl_path, 'rt') as crawl_file:
            # The first line holds the URL; everything after it is the page.
            page_url = crawl_file.readline().strip()
            raw_html = crawl_file.read()

        if page_indexer.document_indexed(page_url):
            print("Page exists, skipping", page_url)
            continue

        page_text = clean(raw_html)
        try:
            soup = bs4.BeautifulSoup(raw_html, features="lxml")
            page_title = soup.find('title').string
        except AttributeError:
            # No <title> element was found: fall back to the first 80
            # characters of the cleaned text.
            page_title = page_text[:80]

        page_tokens = tokenize(nlp, page_text)
        print("URL", page_url)
        print("Tokens", page_tokens)
        print("Title", page_title)
        page_indexer.index(page_tokens, page_url, page_title)
Example #4
0
def search(query, scorer, index):
    """
    Retrieve documents matching a query using the specified scorer.

    1) Tokenize the query.
    2) Convert the query tokens to a vector, using Index.query_to_vector.
    3) Call the scorer's score function.
    4) Return the list of document ids in descending order of relevance.

    NB: Due to the inconsistency of floating point arithmetic, when sorting,
    round the scores to 6 decimal places (e.g., round(x, 6)). This will ensure
    replicable results.

    Params:
      query....A string representing a search query.
      scorer...A ScoringFunction to retrieve documents.
      index....A Index storing postings lists.
    Returns:
      A list of document ids in descending order of relevance to the query.
    """
    tokenized_query = index.tokenize(query)
    q_vector = index.query_to_vector(tokenized_query)
    doc_score = scorer.score(q_vector, index)

    # Compare scores rounded to 6 places, as required above, so that floating
    # point noise cannot reorder documents. sorted() is stable, so documents
    # whose rounded scores tie keep the order in which the scorer produced
    # them.
    return sorted(doc_score,
                  key=lambda doc_id: round(doc_score[doc_id], 6),
                  reverse=True)
def search(query, scorer, index):
    """
    Rank documents against a query with the supplied scorer.

    The index tokenizes the query and converts the tokens to a query vector
    (Index.query_to_vector); the scorer then assigns each matching document a
    score. Scores are rounded to 6 decimal places for sorting so that floating
    point noise does not change the ordering between runs.

    Params:
      query....A string representing a search query.
      scorer...A ScoringFunction to retrieve documents.
      index....A Index storing postings lists.
    Returns:
      A list of document ids in descending order of relevance to the query.
    """
    query_vector = index.query_to_vector(index.tokenize(query))
    scores = scorer.score(query_vector, index)

    ranking = list(scores.items())
    # In-place stable sort on the rounded score, highest first; documents with
    # equal rounded scores keep the scorer's order.
    ranking.sort(key=lambda entry: round(entry[1], 6), reverse=True)
    return [doc_id for doc_id, _ in ranking]
Example #6
0
def search(query, scorer, index):
    """
    Retrieve documents matching a query using the specified scorer.

    1) Tokenize the query.
    2) Convert the query tokens to a vector, using Index.query_to_vector.
    3) Call the scorer's score function.
    4) Return the list of document ids in descending order of relevance.

    NB: Due to the inconsistency of floating point arithmetic, when sorting,
    round the scores to 6 decimal places (e.g., round(x, 6)). This will ensure
    replicable results.

    Params:
      query....A string representing a search query.
      scorer...A ScoringFunction to retrieve documents.
      index....A Index storing postings lists.
    Returns:
      A list of document ids in descending order of relevance to the query.
    """
    tokens = index.tokenize(query)
    res_vector = index.query_to_vector(tokens)
    scores = scorer.score(res_vector, index)

    def rounded_score(doc_id):
        # Rounding to 6 places is what makes the ordering replicable across
        # runs despite floating point noise.
        return round(scores[doc_id], 6)

    # sorted() is stable, so documents whose rounded scores tie keep the order
    # in which the scorer produced them.
    return sorted(scores, key=rounded_score, reverse=True)
Example #7
0
 def runPNAClassifier(self, outerLimit, innerLimit, query):
     """
     Return the tags whose centroids lie closest to a query.

     The query is tokenized, converted to a keyword dictionary by
     self.tfidf, and compared against every centroid in self.centroids with
     calculateDistance. The outerLimit tags with the smallest distance are
     printed and returned.

     Params:
       outerLimit...Maximum number of tags to return.
       innerLimit...Currently unused. It was the cut-off of a second,
                    now-disabled stage that re-ranked the top tags by the
                    average distance between the query and the corpus
                    questions carrying each tag.
       query........A string holding the text to classify.
     Returns:
       A dict with a single key 'tags' mapping to the list of selected tag
       names (as str), nearest centroid first.
     """
     # Keyword dictionary for the query (per the original note: a tf-idf
     # vector based on the idf vector in the database).
     queryDict = {'tokens': self.tfidf.get_doc_keywords_dict(index.tokenize(query))}

     # Distance from the query to each tag's centroid, keyed by tag; if a tag
     # appears on several centroids the last one wins.
     tagDistances = {}
     for centroid in self.centroids:
         tagDistances[centroid['tag']] = calculateDistance(centroid, queryDict)

     # Smallest distance first, truncated to outerLimit entries.
     topTags = sorted(tagDistances, key=tagDistances.get)[:outerLimit]
     # Single parenthesised argument: prints identically under Python 2 and 3.
     print("initial:" + str(topTags))

     return {'tags': [str(tag) for tag in topTags]}
Example #8
0
def search(query, scorer, index):
    """
    Rank documents against a query with the supplied scorer.

    The index tokenizes the query and converts the tokens to a query vector
    (Index.query_to_vector); the scorer then assigns each matching document a
    score. Scores are rounded to 6 decimal places before sorting so that
    floating point noise does not change the ordering between runs.

    Params:
      query....A string representing a search query.
      scorer...A ScoringFunction to retrieve documents.
      index....A Index storing postings lists.
    Returns:
      A list of document ids, each converted with int(), in descending order
      of relevance to the query.
    """
    query_vector = index.query_to_vector(index.tokenize(query))
    raw_scores = scorer.score(query_vector, index)

    # Round every score first, then order the ids by their rounded score.
    rounded = {doc_id: round(value, 6) for doc_id, value in raw_scores.items()}
    # Stable sort: ids with equal rounded scores keep the scorer's order.
    ordered_ids = sorted(rounded, key=rounded.get, reverse=True)

    return [int(doc_id) for doc_id in ordered_ids]