Example #1
import time
from math import ceil

from django.shortcuts import render

# checkSpelling, decompose, returnSynonyms, applySearchHistory,
# retrieveFromIndex, createSuggestions and eventbook_settings are
# project-internal and assumed to be imported elsewhere.


def search(request):
    start = time.time()
    query = request.GET.get('q', '')
    original = query
    page = int(request.GET.get('p', '') or 1)

    # Query-expansion pipeline: spell correction, compound splitting,
    # synonym expansion, and personalization from the search history.
    query = checkSpelling(query)
    query = decompose(query, True)
    query = returnSynonyms(query)
    query = decompose(query, True)
    query = applySearchHistory(query)

    documents, total = retrieveFromIndex(query, page)

    suggestions = createSuggestions(original, documents)

    pages = ceil(total / eventbook_settings.PAGE_SIZE)
    processtime = time.time() - start

    # Pagination window: up to five pages on either side of the current one.
    pagestart = max(1, page - 5)
    pageend = min(pages, page + 5)

    context = {
        'documents': documents,
        'query': original,
        'extendedquery': query,
        'results': total,
        'page': page,
        'totalpages': pages,
        'pages': range(pagestart, pageend + 1),
        'prev': max(1, page - 1),
        'next': min(pages, page + 1),
        'processtime': round(processtime, 4),
        'suggestions': suggestions,
    }

    return render(request, 'querying/search.html', context)
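One way to exercise the view outside a browser is Django's RequestFactory; this smoke test is a sketch, and the URL path and parameter values are invented:

from django.test import RequestFactory

factory = RequestFactory()
request = factory.get('/search/', {'q': 'jazz konzert', 'p': '2'})
response = search(request)  # requires configured Django settings and templates
print(response.status_code)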
Example #2
def processAndSaveDoc(document):
    if document:
        # Normalize the description, then enrich the document with
        # classification labels, a cluster assignment, and duplicate links.
        document.description = decompose(document.description, False)
        document = multiLabelClassification(document)
        document = clusterDocument(document)
        document = findDuplicate(document)

        document.save()
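A hypothetical call for context; Document is assumed to be the project's model for a crawled event, based on the fields the function touches:

# Hypothetical caller; Document and its description field are assumptions.
doc = Document(description="Jazzkonzert im Stadtpark am Samstagabend")
processAndSaveDoc(doc)  # normalizes, classifies, clusters, dedupes, saves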
Example #3
from math import sqrt

# decompose and getTokensFromText are project-internal helpers and
# assumed to be imported elsewhere.


def vcspace(docTexts):  # docTexts: the original texts, a set/list
    # Build one token list per document and the shared vocabulary.
    doclists = []
    tokenlist = []
    for text in docTexts:
        text = decompose(text, False)  # every text becomes a normalized string
        tokens = getTokensFromText(text)
        doclists.append(tokens)
        for token in tokens:
            if token not in tokenlist:
                tokenlist.append(token)

    # Term-frequency vector for every document over the shared vocabulary:
    # one row per document, one column per token.
    rowlists = [[0] * len(tokenlist) for _ in doclists]
    for n, tokens in enumerate(doclists):
        for token in tokens:
            rowlists[n][tokenlist.index(token)] += 1

    # Pairwise cosine similarity: the dot product divided by the product
    # of the Euclidean norms (square root of the summed squared counts).
    Similarity = []
    for list1 in rowlists:
        normText1 = sqrt(sum(x * x for x in list1))
        for list2 in rowlists:
            normText2 = sqrt(sum(x * x for x in list2))
            dot = sum(a * b for a, b in zip(list1, list2))
            Similarity.append(dot / normText1 / normText2)

    return Similarity
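To make the vector-space computation concrete, here is a self-contained toy run of the same cosine-similarity math; the token lists are invented, and the project-internal decompose/getTokensFromText steps are skipped:

from math import sqrt

# Invented toy documents, already tokenized (stand-ins for the output of
# decompose() followed by getTokensFromText()).
docs = [["jazz", "concert", "berlin"],
        ["jazz", "festival"],
        ["berlin", "concert", "concert"]]

vocab = sorted({token for doc in docs for token in doc})
vectors = [[doc.count(token) for token in vocab] for doc in docs]

def cosine(u, v):
    dot = sum(a * b for a, b in zip(u, v))
    return dot / (sqrt(sum(a * a for a in u)) * sqrt(sum(b * b for b in v)))

# Same flat n*n layout that vcspace returns.
similarity = [cosine(u, v) for u in vectors for v in vectors]
print([round(s, 3) for s in similarity])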
Example #4
def createSuggestions(query, documents):
    # Input: the original query and the relevant documents
    # (the top 5 documents retrieved, in our case).
    doclists = []
    tokenlist = []

    for document in documents:
        tokens = document[0].getAllTokensAsText()
        doclists.append(tokens)
        for token in tokens:
            if token not in tokenlist:
                tokenlist.append(token)

    # The query itself becomes row 0 of the term-frequency matrix; query
    # tokens that occur in no document are skipped below.
    queryTokens = getTokensFromText(query)
    doclists.insert(0, queryTokens)

    rowlists = [[0] * len(tokenlist) for _ in doclists]
    for n, tokens in enumerate(doclists):
        for token in tokens:
            if token in tokenlist:
                rowlists[n][tokenlist.index(token)] += 1

    # Score each vocabulary term: its frequency in the query plus 0.75
    # times its frequency in each retrieved document.
    score = {}
    for i, token in enumerate(tokenlist):
        score[token] = rowlists[0][i]
        for row in rowlists[1:]:
            score[token] += 0.75 * row[i]
    rankwords = sorted(score.items(), key=lambda item: item[1], reverse=True)

    # Load stopwords, skipping comment lines.
    stop_words = []
    with open(eventbook_settings.PROJECT_ROOT + "common/SmartStoplist.txt") as stopfile:
        for line in stopfile:
            if not line.strip().startswith("#"):
                stop_words.extend(line.split())  # in case there is more than one per line

    # Generate the new query: keep words with a score of at least 1.5 that
    # appear neither in the original query nor in the stopword list.
    newrank = [word for word, wordscore in rankwords
               if wordscore >= 1.5 and word not in query and word not in stop_words]

    # If there are no words in newrank, we have no expanded query.
    if not newrank:
        return None

    suggestions = []
    queryTokens = getTokensFromText(query)

    # We only generate at most 5 new suggestions.
    for word in newrank[:5]:
        suggestion = decompose(word, False)
        # Only add the suggested word if it is not in the query yet.
        if suggestion and not any(suggestion in s for s in queryTokens):
            suggestions.append(suggestion)

    return suggestions
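For intuition, a self-contained toy run of the same expansion scoring; the 0.75 document weight and 1.5 threshold mirror the function above, while the tokens are invented and stopword filtering is left out:

# Invented query and document tokens for illustration.
query_tokens = ["jazz", "berlin"]
doc_token_lists = [["jazz", "concert", "berlin"],
                   ["concert", "festival", "jazz"]]

vocab = sorted(set(query_tokens) | {t for d in doc_token_lists for t in d})
score = {t: query_tokens.count(t) + sum(0.75 * d.count(t) for d in doc_token_lists)
         for t in vocab}

candidates = [t for t, s in sorted(score.items(), key=lambda kv: kv[1], reverse=True)
              if s >= 1.5 and t not in query_tokens]
print(candidates[:5])  # at most 5 suggestions -> ['concert']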