Example #1
def parse_tokens(tokens):
    """
    Implementation of the shunting-yard algorithm, with
    modifications to handle unary right-associative
    operators and quoted phrases.
    """
    output = []
    op_stack = []
    phrasal = []
    for tok in tokens:
        if tok == '"':
            if op_stack and op_stack[-1] == '"':
                # Closing quote: emit the collected phrase as one operand.
                output.append(PhrasePostings(phrasal))
                phrasal = []
                op_stack.pop()
            else:
                op_stack.append(tok)
        elif tok == ')':
            while op_stack[-1] != '(':
                apply_op(op_stack, output)
            op_stack.pop()
        elif tok in prec:
            if tok in right:
                # Right-associative: pop only strictly higher precedence.
                while op_stack and op_stack[-1] != '(' and prec[op_stack[-1]] > prec[tok]:
                    apply_op(op_stack, output)
            else:
                # Left-associative: pop equal or higher precedence.
                while op_stack and op_stack[-1] != '(' and prec[op_stack[-1]] >= prec[tok]:
                    apply_op(op_stack, output)
            op_stack.append(tok)
        else:
            if op_stack and op_stack[-1] == '"':
                # Inside a phrase: buffer the term.
                phrasal.append(preprocess(tok))
            else:
                output.append(process_token(tok))
    while op_stack:
        apply_op(op_stack, output)
    return output[0]
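parse_tokens relies on module-level helpers the snippet does not show. A minimal sketch of plausible definitions, assuming NOT is the unary right-associative operator and that '(' carries the highest precedence so the "tok in prec" branch pushes it without popping anything; all names, values, and postings methods below are assumptions inferred from the call sites:

# Assumed precedence table; '(' is highest so it never pops operators.
prec = {'(': 3, 'NOT': 2, 'AND': 1, 'OR': 0}
# Assumed set of right-associative (here: unary) operators.
right = {'NOT'}

def apply_op(op_stack, output):
    # Pop one operator and apply it to operands on the output stack.
    # negate/merge_and/merge_or are hypothetical postings-list methods.
    op = op_stack.pop()
    if op == 'NOT':
        output.append(output.pop().negate())
    else:
        b, a = output.pop(), output.pop()
        output.append(a.merge_and(b) if op == 'AND' else a.merge_or(b))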
Example #2
def split_query(query):
    """Count how often each preprocessed term occurs in the query."""
    result = {}
    tokens = query.split()
    for tok in tokens:
        term = preprocess(tok)
        result[term] = result.get(term, 0) + 1
    return result
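A quick illustration with a stand-in preprocess (the real one presumably stems and normalizes; this stub only lower-cases):

def preprocess(tok):
    # Stand-in for the snippet's real normalizer/stemmer.
    return tok.lower()

print(split_query("Cats cats dog"))   # {'cats': 2, 'dog': 1}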
Example #3
def process_token(tok):
    # Stopwords match every document, so return the full postings list.
    if tok.lower() in stop_words:
        return all_postings
    tok = preprocess(tok)
    try:
        return Postings(tok)
    except KeyError:
        # Term not in the dictionary: return an empty postings list.
        return EmpPostings()
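Postings and EmpPostings are defined elsewhere; a plausible minimal shape, assuming Postings raises KeyError for terms absent from the dictionary (term_dict below is a hypothetical stand-in):

term_dict = {}  # hypothetical stand-in for the loaded dictionary

class Postings:
    def __init__(self, term):
        # Raises KeyError when the term is absent, which process_token catches.
        self.entries = term_dict[term]

class EmpPostings(Postings):
    """An empty postings list for terms absent from the dictionary."""
    def __init__(self):
        self.entries = []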
Example #4
def queryToScore(query):
    postingHandler = PostingHandler(dictionary_file, postings_file)
    uniDict = makeUniGrams(preprocess(query))
    q_len = tf_idf.getLtcLen(postingHandler, uniDict)
    N = postingHandler.getNumDoc()
    for word in uniDict:
        df = postingHandler.getDocFreq(word)
        uniDict[word] = tf_idf.get_ltc(uniDict[word], N, df, q_len)
    return uniDict
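tf_idf.get_ltc and tf_idf.getLtcLen are not shown; under the standard SMART ltc scheme (logarithmic tf, idf, cosine normalization) they would look roughly like this. The formulas and signatures are assumptions, not the module's actual code:

from math import log10, sqrt

def get_ltc(tf, N, df, q_len):
    # ltc weight: logarithmic tf times idf, cosine-normalized by the
    # query vector length q_len. Assumes tf >= 1 and df >= 1.
    return (1 + log10(tf)) * log10(N / df) / q_len

def getLtcLen(N, tf_by_term, df_by_term):
    # Euclidean norm of the query's un-normalized ltc weights.
    # (Assumed signature; the real helper reads df via a PostingHandler.)
    return sqrt(sum(((1 + log10(tf)) * log10(N / df_by_term[t])) ** 2
                    for t, tf in tf_by_term.items()))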
Example #5
def query_tf(query):
    '''Filter the given query for stopwords and stem it, then return the term frequencies.'''

    d_qr = {}
    Q = preprocess(query).split()

    # Frequencies are normalized by the length of the raw (unpreprocessed) query.
    length = len(query.split())
    for key in Q:
        d_qr[key] = d_qr.get(key, 0) + 1 / length

    return d_qr
Example #6
    def free_text_query(self, query):
        """
        Retrieve documents matching a free-text query.
        :param query: The query string.
        :return: The union of the postings lists of all query terms.
        """
        words = word_tokenize(query)
        words = preprocess(words)
        if len(words) == 0:
            return []
        res = self.postings.get_postings_list(words.pop())
        while len(words) > 0:
            posting = self.postings.get_postings_list(words.pop())
            res = union(res, posting)
        return res
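union is defined elsewhere; for doc-id-sorted postings lists, the standard linear merge looks like this (a sketch under that sorted-list assumption):

def union(p1, p2):
    # Linear merge of two doc-id-sorted postings lists, without duplicates.
    res, i, j = [], 0, 0
    while i < len(p1) and j < len(p2):
        if p1[i] == p2[j]:
            res.append(p1[i])
            i += 1
            j += 1
        elif p1[i] < p2[j]:
            res.append(p1[i])
            i += 1
        else:
            res.append(p2[j])
            j += 1
    res.extend(p1[i:])
    res.extend(p2[j:])
    return res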
Example #7
    def __identify_query(self, query_string):
        """
        Identify the query type and process the queries.
        """
        if "AND" in query_string:
            # Treat as a list of queries; pre-process if need be.
            out = []
            split_word = query_string.split(' ')

            i = 0
            while i < len(split_word):
                if split_word[i][0] == '"':
                    # Collect tokens until the one carrying the closing quote.
                    combined = []
                    d = i
                    while d < len(split_word):
                        combined.append(split_word[d].replace('"', ''))
                        # The opening token alone only closes the phrase when
                        # it carries both quotes (e.g. '"foo"').
                        if split_word[d].endswith('"') and (d > i or len(split_word[d]) > 1):
                            break
                        d += 1
                    # Resume after the phrase; this also terminates cleanly
                    # when the closing quote is missing.
                    i = d + 1
                    out.append(preprocess(combined))
                else:
                    if split_word[i] != "AND":
                        out.append(preprocess([split_word[i]]))
                    i += 1

            # Flatten the out list so we can get a tf_q.
            flat_term_list = [item for sublist in out for item in sublist]
            self.__get_tf(flat_term_list)
            return True, out
        else:
            # Pre-process as per normal.
            self.__get_tf(self.__get_term_list(query_string))
            return False, query_string
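For example, with the loop above, a conjunctive phrase query splits into one group per operand (illustrative only; the exact lists depend on preprocess):

# __identify_query('"machine learning" AND data') would yield roughly:
#   (True, [preprocess(['machine', 'learning']), preprocess(['data'])])
# while a query without AND falls through to the free-text branch:
#   (False, 'machine learning data')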
Example #8
def processBoolQuery(query):
    # Since the only boolean operator is AND, every element in this list will be AND-merged.
    query = query.split(" AND ")
    query2 = []
    for i in range(len(query)):
        query2.extend(preprocess(query[i]))
    query = query2
    convertToScores(query)

    # Since skip lists aren't implemented, there is no need to prioritise the shortest list.
    while len(query) > 1:
        query[0] = doAnd(query[0], query[1])
        del query[1]

    query = query[0]
    if len(query) == 0:
        return ""
    else:
        # Rank (doc_id, score) pairs by descending score.
        query.sort(key=lambda x: x[1], reverse=True)
        result = [str(doc_id) for doc_id, score in query]
        return ' '.join(result)
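doAnd and convertToScores are not shown; assuming convertToScores replaces each term with a doc-id-sorted list of (doc_id, score) pairs, the AND merge could be the classic linear intersection (a sketch, not the repository's code):

def doAnd(p1, p2):
    # Linear-time intersection of two postings lists sorted by doc_id;
    # entries are assumed to be (doc_id, score) pairs, scores summed.
    res, i, j = [], 0, 0
    while i < len(p1) and j < len(p2):
        if p1[i][0] == p2[j][0]:
            res.append((p1[i][0], p1[i][1] + p2[j][1]))
            i += 1
            j += 1
        elif p1[i][0] < p2[j][0]:
            i += 1
        else:
            j += 1
    return res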
Example #9
def query_type_two(q):
    # "Clean" the query and compute the term frequencies.

    query = voc_lyrics(preprocess(q))
    vocabulary = db.INVERTED.find({"vocabulary": {
        "$exists": True
    }})[1]['vocabulary']

    tot_num_doc = db.ADMHMW3.count()
    tot_doc = []

    # Fetch the inverted index for each term of the query.
    for q_term in query:
        term = q_term[0]

        try:
            # Look up the term's id in the vocabulary.
            term_id = list(vocabulary.keys())[list(
                vocabulary.values()).index(term)]
            posting = db.INVERTED.find({term_id: {
                "$exists": True
            }})[1][term_id]

        except (ValueError, IndexError, KeyError):
            print(
                "The query cannot be answered: one or more terms are not in the db")
            return

        res = list(map(tuple, posting))
        docs = list(zip(*res))[0]
        tot_doc.append(set(docs))

    # Intersect the posting lists.
    common_doc = list(set.intersection(*tot_doc))

    # Report the number of results and ask the user for k.
    n = len(common_doc)

    print("The number of results is", n, "- insert the value of k")
    k = int(input())

    # Cluster only if the number of matching documents is larger than k.
    if k < n:

        X = np.zeros((n, n))

        updated_common_docs = {}
        dict_docs = {}

        # For each document in the intersection, fetch its index entry,
        # which has the form doc_id -> [(term_id, tf)], and replace each
        # tf with tf*idf. Precomputing these weights for every song in a
        # separate collection would have avoided this double loop and
        # made this phase faster.
        for idx, doc in enumerate(common_doc):
            dict_docs[idx] = doc
            doc_new = []
            list_term = db.INDEX.find({doc: {"$exists": True}})[0][doc]
            for term_tf in list_term:
                term = term_tf[0]
                tf = term_tf[1]

                term_id = list(vocabulary.keys())[list(
                    vocabulary.values()).index(term)]
                posting_length = len(
                    db.INVERTED.find({term_id: {
                        "$exists": True
                    }})[0][term_id])
                idf = 1 + log(tot_num_doc / posting_length)
                value = tf * idf
                doc_new.append((term, value))

            updated_common_docs[doc] = doc_new

        # Compute the pairwise distance matrix.
        max_dis = 0
        for i in range(n - 1):
            doc_i = updated_common_docs[common_doc[i]]
            for j in range(i + 1, n):
                doc_j = updated_common_docs[common_doc[j]]
                d = distance(doc_i, doc_j)
                max_dis = max(d, max_dis)
                X[i][j] = d
                X[j][i] = d

        # Cluster the songs: k-means on a PCA embedding of the distance matrix.

        clusters = KMeans(n_clusters=k).fit(
            PCA(n_components=n).fit_transform(X)).labels_

        clust_dict = {}

        # Group document ids by cluster label.
        for idx, x in enumerate(clusters):
            clust_dict.setdefault(x, []).append(dict_docs[idx])

        res = {}
        text = ""

        # Collect (artist, title) pairs per cluster; the loop variable is
        # named label so it does not shadow the user-supplied k.
        for label, v in clust_dict.items():
            for doc_id in v:
                d = db.ADMHMW3.find({"_id": bson.ObjectId(doc_id)})[0]
                res.setdefault(label, []).append((d['Artist'], d['Title']))

                text += d["Lyrics"] + " "

        print(res)

        generate_word_cloud(text)

    else:
        print("k is too big, bye!")
        return
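distance is not defined in the snippet; given the (term, tf*idf) vectors built above, a cosine distance would fit (a sketch; the original may use a different metric):

from math import sqrt

def distance(doc_i, doc_j):
    # Cosine distance between two sparse (term, weight) vectors.
    a, b = dict(doc_i), dict(doc_j)
    dot = sum(w * b.get(t, 0.0) for t, w in a.items())
    na = sqrt(sum(w * w for w in a.values()))
    nb = sqrt(sum(w * w for w in b.values()))
    return 1.0 - dot / (na * nb) if na and nb else 1.0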
Example #10
    def __get_term_list(self, query_string):
        term_list = preprocess(word_tokenize(query_string))
        return term_list
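word_tokenize here is presumably NLTK's tokenizer; if so, it behaves like this (requires the punkt resource):

from nltk.tokenize import word_tokenize

word_tokenize("the quick brown fox")   # ['the', 'quick', 'brown', 'fox']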