def it_should_search_test(self):
    vector_space = VectorSpace(self.documents)
    eq_(vector_space.search(["cat"]),
        [0.14487566959813258, 0.1223402602604157,
         0.07795622058966725, 0.05586504042763477])
import operator


def run(data, queries, max_response=10):
    all_documents = []
    for entry in data:
        all_documents.append(entry["raw_data"])

    vector_space = VectorSpace(all_documents)

    # Search for the query terms and index each score by document position.
    indexed_result = {}
    result = vector_space.search([queries])
    index = 0
    for entry in result:
        indexed_result[index] = entry
        index += 1

    # Sort by descending similarity and keep the top max_response documents.
    sorted_resp = sorted(indexed_result.items(), key=operator.itemgetter(1), reverse=True)
    sorted_resp = sorted_resp[:int(max_response)]

    # Map the ranked indices back onto the original data entries.
    response = {}
    rank = 1
    for entry in sorted_resp:
        data_index = entry[0]
        response[rank] = data[data_index]
        rank += 1
    return response
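# A minimal usage sketch for run() above. The data shape (a list of dicts with
# a "raw_data" field) is inferred from the loop inside run(); the sample
# documents are the library's stock example sentences, and this __main__ guard
# is illustrative only.
if __name__ == "__main__":
    sample_data = [
        {"raw_data": "The cat in the hat disabled"},
        {"raw_data": "A cat is a fine pet ponies."},
        {"raw_data": "Dogs and cats make good pets."},
        {"raw_data": "I haven't got a hat."},
    ]
    top_hits = run(sample_data, "cat", max_response=2)
    for rank, entry in top_hits.items():
        print(rank, entry["raw_data"])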
def it_should_find_related_test(self):
    vector_space = VectorSpace(self.documents)
    eq_(vector_space.related(0), [1.0000000000000002, 0.9999999999999998, 0.0])
def it_should_search_test(self):
    vector_space = VectorSpace(self.documents, transforms=[])
    eq_(vector_space.search(["cat"]), [1.0, 0.7071067811865475, 0.0])
def it_should_find_return_similarity_rating_test(self):
    vector_space = VectorSpace(self.documents)
    eq_(vector_space.related(0),
        [1.0, 0.9922455760198575, 0.08122814162371816, 0.0762173599906487])
# Create the corpus
file_content_all = []
corpus = 'AspectJ'
creator = sourceCorpusCreator()
sourcepath = "E:\\PhD\\LSI\\Repo\\" + corpus + "\\SourceAndBugData244\\"
keywordsfilepath = 'E:\\PhD\\LSI\\Repo\\' + corpus + '\\data\\keyword-documents.txt'

source_content_all = creator.CorpusCreatorDict(sourcepath, '.java')
print('Total files in corpus')
print(len(source_content_all))
print(source_content_all)

vector_space = VectorSpace(source_content_all)
file_path_all = vector_space.get_file_path_all()
print(file_path_all)
document_ID_file_info_mapping = vector_space.get_document_ID_file_info_mapping()
print(document_ID_file_info_mapping)

# Persist the keyword-to-index mapping alongside the corpus.
keywords_docs_string = str(vector_space.vector_index_to_keyword_mapping)
file_read_write = FileReadWrite(sourcepath)
file_read_write.writeFiles(keywordsfilepath, keywords_docs_string)
print(len(vector_space.vector_index_to_keyword_mapping))

print("Keywords-document vector/matrix")
print('length of vector_space.collection_of_document_term_vectors')
print(len(vector_space.collection_of_document_term_vectors))
document_term_matrix = vector_space.collection_of_document_term_vectors
import pickle

queries = pickle.load(open("QueryStrings.p", "rb"))
print("total queries")
print(len(queries))
print("loaded queries")

documents = pickle.load(open("documentContentList2.p", "rb"))
print("loaded documents")

docIds = pickle.load(open("docIdList.p", "rb"))
print(len(docIds))
print("loaded doc ids")

documents = list(documents)
print("loaded documents")
print(len(documents))

# Uncomment to run on a small subset:
# documents = documents[:100]
# docIds = docIds[:100]
# queries = queries[:10]

vector_space = VectorSpace(documents)
print("finished conversion")

print("load user click file")
userQueriesAndClicks = pickle.load(open("user_specific_positive_negative_examples_dic_test", "rb"))
print("finished loading user click file")
# queryResults = dict([(x[0], (x[1], x[2])) for x in userQueriesAndClicks_strict[userID]])


# Given a query and a ranking, this function provides the relevanceJudgements
# list as required by averagePrecision.
def turnIntoBinaryRelevanceThing(query, ranking, relevantDocuments):
    binarized = []
    for doc in ranking:
        if doc in relevantDocuments:
            binarized.append(1)
        else:
            binarized.append(0)
    return binarized
from collections import defaultdict


def anotate(inpt, skipsize):
    k = 2
    queueSize = skipsize * 2 + 1
    queueMid = skipsize  # centre of the 0-indexed sliding window

    def queueIsReady(queue):
        return len(queue) == queueSize

    def push(element, queue):
        queue.append(element)
        if len(queue) > queueSize:
            queue.pop(0)

    vocabulary = get_document_vocabulary(inpt)
    vocSize = len(vocabulary) + 1

    print("Starting on determining word co-occurrences.")
    cocs = defaultdict(list)
    queue = []
    for word in inpt:
        push(word, queue)
        if queueIsReady(queue):
            mid = queue[queueMid]
            if mid in vocabulary:
                # Collect the context words on both sides of the mid word,
                # mapping out-of-vocabulary words to a sentinel token.
                coc = []
                for i in range(skipsize):
                    word1 = queue[i] if queue[i] in vocabulary else "_UNKNOWN_"
                    word2 = queue[i + 1 + skipsize] if queue[i + 1 + skipsize] in vocabulary else "_UNKNOWN_"
                    coc.append(word1)
                    coc.append(word2)
                cocs[mid].append(coc)

    print("Determining LSA relatedness scores between documents...")
    clustered = dict()
    for key in cocs.keys():
        # Treat each co-occurrence context as a small document and score its
        # relatedness against the first context.
        a = [" ".join(cocs[key][i]) for i in range(len(cocs[key]))]
        vector_space = VectorSpace(a)
        scores = vector_space.related(0)

        LSIscores = [{"docText": a[i], "similarity": scores[i]} for i in range(len(a))]
        LSIscores = sorted(LSIscores, key=lambda s: s['similarity'], reverse=True)
        # Keep only the upper half of the contexts by similarity.
        LSIscores = LSIscores[:len(LSIscores) // 2]

        # Turn each surviving context into a normalized bag-of-words vector.
        itemsToCluster = []
        for item in LSIscores:
            d = defaultdict(int)
            for word in item["docText"].split():
                d[word] += 1
            itemsToCluster.append(normalize_coc(d))

        clustered[key] = kmeans_process(itemsToCluster)

    print("Starting anotating corpus.")
    anotated = []
    queue = []
    for word in inpt:
        push(word, queue)
        if queueIsReady(queue):
            word = queue[queueMid]
            if word in clustered and len(clustered[word]) > 1:
                coc = defaultdict(int)
                for i in range(skipsize):
                    word1 = queue[i] if queue[i] in vocabulary else "_UNKNOWN_"
                    word2 = queue[i + 1 + skipsize] if queue[i + 1 + skipsize] in vocabulary else "_UNKNOWN_"
                    coc[word1] += 1
                    coc[word2] += 1
                coc = normalize_coc(coc)

                # Tag the word with the index of its closest cluster.
                bestValue = 1
                bestIndex = -1
                for i in range(k):
                    distance = clustered[word][i].distance(coc)
                    if distance < bestValue:
                        bestValue = distance
                        bestIndex = i
                word = word + "_" + str(bestIndex) + " "
            anotated.append(word)
    return (clustered, anotated)
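# A hedged usage sketch for anotate() above: it assumes `inpt` is a flat list
# of tokens and that get_document_vocabulary, normalize_coc and kmeans_process
# are defined elsewhere in this module. The file name "corpus.txt" is
# hypothetical, so this is illustrative only.
tokens = open("corpus.txt").read().split()
clusters, annotated_tokens = anotate(tokens, skipsize=2)
print(" ".join(annotated_tokens[:50]))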
import pandas as pd
import pickle
from semanticpy.vector_space import VectorSpace

data = pd.read_json('cc_jokes_valid.json')
df = pd.DataFrame(data)
dfList = df['content'].tolist()

# Build the vector space model and save it to pickle (takes a long time).
# Pickle files must be opened in binary mode.
vector_space = VectorSpace(dfList)
filehandler = open('vsm.obj', 'wb')
pickle.dump(vector_space, filehandler)
filehandler.close()
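# To reuse the model later, it can be loaded back from the pickle written
# above; the file name matches the dump call, reading must also be in binary
# mode, and the query string "joke" is just a hypothetical example.
with open('vsm.obj', 'rb') as filehandler:
    loaded_space = pickle.load(filehandler)
print(loaded_space.search(["joke"]))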
import sqlite3

# conn = sqlite3.connect('../db/development.sqlite3.bak')
conn = sqlite3.connect('../db/development.sqlite3')
c = conn.cursor()

limit = 500
offset = 0

# Concatenate the title and body columns of each article into one document.
texts = []
ids = []
rows = c.execute('select * from articles limit {0} offset {1}'.format(limit, offset))
for row in rows:
    texts.append(row[1] + " " + row[11])
    ids.append(row[0])

vector_space = VectorSpace(texts)

# Group articles whose pairwise relatedness score falls between 0.2 and 0.9,
# keyed by 1-based article id (offset into the full table).
group = dict()
for id in range(len(ids)):
    prob = vector_space.related(id)
    for i in range(len(ids)):
        if 0.2 < prob[i] < 0.9 and id != i:
            if (offset + id + 1) not in group:
                group[offset + id + 1] = []
            group[offset + id + 1].append(offset + i + 1)
            print(offset + id + 1, offset + i + 1, prob[i])
print(group)
def vector_space_mapping(self):
    v = VectorSpace(self.documents)
    matrix = v.collection_of_document_term_vectors
    return matrix
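# A small sketch of inspecting the matrix returned by vector_space_mapping()
# above. It assumes collection_of_document_term_vectors is a list of
# equal-length term vectors, one per document; `obj` is a hypothetical
# instance exposing a `documents` attribute.
matrix = obj.vector_space_mapping()
print(len(matrix), "documents,", len(matrix[0]), "terms")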