    def it_should_search_test(self):
        vectorSpace = VectorSpace(self.documents)

        eq_(vectorSpace.search(["cat"]), [
            0.14487566959813258, 0.1223402602604157, 0.07795622058966725,
            0.05586504042763477
        ])

    def it_should_search_test(self):
        vectorSpace = VectorSpace(self.documents)

        eq_(
            vectorSpace.search(["cat"]),
            [0.14487566959813258, 0.1223402602604157, 0.07795622058966725, 0.05586504042763477],
        )
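These test excerpts depend on a self.documents fixture that is not shown. A minimal assumed setup, reusing the sample documents that appear in a later example on this page (the class name and the nose-style eq_ import are guesses, not the projects' actual fixtures):

from nose.tools import eq_
from semanticpy.vector_space import VectorSpace


class VectorSpaceSearchTest(object):
    def setup(self):
        # Hypothetical fixture; the exact scores asserted above depend on the real documents
        self.documents = ["The cat in the hat disabled",
                          "A cat is a fine pet ponies.",
                          "Dogs and cats make good pets.",
                          "I haven't got a hat."]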
Example #3
import operator


def run(data, queries, max_response=10):
    all_documents = []
    for entry in data:
        all_documents.append(entry["raw_data"])

    vector_space = VectorSpace(all_documents)

    # Search the vector space for the query and index each score by document position
    indexed_result = {}
    result = vector_space.search([queries])

    for index, entry in enumerate(result):
        indexed_result[index] = entry

    sorted_resp = sorted(indexed_result.items(),
                         key=operator.itemgetter(1),
                         reverse=True)

    sorted_resp = sorted_resp[:int(max_response) + 1]

    response = {}
    rank = 1
    for entry in sorted_resp:
        data_index = entry[0]

        response[rank] = data[data_index]
        rank += 1

    return response
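A minimal usage sketch for run(), assuming each data entry is a dict with a "raw_data" field as the loop above expects (the sample entries and the query are made up for illustration):

sample_data = [
    {"raw_data": "The cat in the hat disabled"},
    {"raw_data": "A cat is a fine pet ponies."},
    {"raw_data": "I haven't got a hat."},
]

# Returns a dict keyed by rank (1 = best match for the query)
top_matches = run(sample_data, "cat", max_response=2)
for rank in sorted(top_matches):
    print(rank, top_matches[rank]["raw_data"])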
Example #4
    def it_should_find_related_test(self):
        vector_space = VectorSpace(self.documents)

        eq_(vector_space.related(0), [1.0000000000000002, 0.9999999999999998, 0.0])
Example #5
    def it_should_search_test(self):
        vector_space = VectorSpace(self.documents, transforms = [])

        eq_(vector_space.search(["cat"]), [1.0, 0.7071067811865475, 0.0])
Example #6
    def it_should_find_return_similarity_rating_test(self):
        vectorSpace = VectorSpace(self.documents)

        eq_(vectorSpace.related(0), [1.0, 0.9922455760198575, 0.08122814162371816, 0.0762173599906487])
Example #7
import pickle

queries = pickle.load(open("QueryStrings.p", "rb"))
print "total queries"
print len(queries)
print "loaded queries"
documents = pickle.load(open("documentContentList2.p", "rb"))
print "loaded documents"
docIds = pickle.load(open("docIdList.p", "rb"))
print len(docIds)
print "loaded doc ids"
documents = list(documents)
print "loaded documents"
print len(documents)
#documents = documents[:100]
#docIds = docIds[:100]
#queries = queries[:10]
vector_space = VectorSpace(documents)
print "finished conversion"

print "load user click file"
userQueriesAndClicks = pickle.load(
    open("user_specific_positive_negative_examples_dic_test", "rb"))
print "finished loading user click file"

#queryResults = dict( [ (x[0], (x[1], x[2])) for x in userQueriesAndClicks_strict[userID] ])


# given a query and a ranking, this function provides the relevanceJudgements list as
# required by averagePrecision
def turnIntoBinaryRelevanceThing(query, ranking, relevantDocuments):
    #rel = self.relevantDocuments[query]
    binarized = []
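The averagePrecision function referenced in the comment above is not part of this excerpt. For context, a standard average-precision computation over such a binarized relevance list might look like the following sketch (the name and exact definition are assumptions, not this project's actual helper):

def average_precision(binarized):
    # binarized[i] is 1 if the document at rank i+1 is relevant, else 0
    hits = 0
    precision_sum = 0.0
    for rank, is_relevant in enumerate(binarized, 1):
        if is_relevant:
            hits += 1
            precision_sum += float(hits) / rank
    # Averaged over the relevant documents that actually appear in the ranking
    return precision_sum / hits if hits else 0.0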
Example #8
# Create the corpus
file_content_all=[]
corpus='AspectJ'
creator = sourceCorpusCreator()
sourcepath = "E:\PhD\LSI\Repo\\"+corpus+"\SourceAndBugData244\\"
keywordsfilepath='E:\PhD\LSI\Repo\\'+corpus+'\data\keyword-documents.txt'
#querypath="E:\PhD\LSI\Repo\\"+corpus+"\BugData\\"
source_content_all={}
source_content_all=creator.CorpusCreatorDict(sourcepath, '.java')


print ('Total files in corpus ')
print(len(source_content_all))
print (source_content_all)

vector_space = VectorSpace(source_content_all)
file_path_all=vector_space.get_file_path_all()
print (file_path_all)
document_ID_file_info_mapping=vector_space.get_document_ID_file_info_mapping()
print (document_ID_file_info_mapping)
keywords_docs_string=str(vector_space.vector_index_to_keyword_mapping)
file_read_write=FileReadWrite(sourcepath)
file_read_write.writeFiles(keywordsfilepath, keywords_docs_string)
print (len(vector_space.vector_index_to_keyword_mapping))
#import pdb
#pdb.set_trace()
print ("Keyword-document vector/matrix")
print ('length of vector_space.collection_of_document_term_vectors')
print (len(vector_space.collection_of_document_term_vectors))

document_term_matrix=vector_space.collection_of_document_term_vectors
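The document-term matrix collected above is the usual input to LSI. That next step is not part of this excerpt; a minimal sketch with NumPy's SVD, assuming the term vectors form a rectangular matrix (the truncation rank k is an arbitrary assumption):

import numpy as np

A = np.array(document_term_matrix, dtype=float)    # one row per document, one column per keyword
U, s, Vt = np.linalg.svd(A, full_matrices=False)

k = 50                                              # assumed number of latent concepts (k <= min(A.shape))
lsi_document_vectors = U[:, :k] * s[:k]             # documents projected into the latent concept space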
Example #9
import pickle

queries = pickle.load(open("QueryStrings.p", "rb"))
print "total queries"
print len(queries)
print "loaded queries"
documents = pickle.load(open("documentContentList2.p", "rb"))
print "loaded documents"
docIds = pickle.load(open("docIdList.p", "rb"))
print len(docIds)
print "loaded doc ids"
documents = list(documents)
print "loaded documents"
print len(documents)
#documents = documents[:100]
#docIds = docIds[:100]
#queries = queries[:10]
vector_space = VectorSpace(documents)
print "finished conversion"

print "load user click file"
userQueriesAndClicks = pickle.load(open("user_specific_positive_negative_examples_dic_test", "rb"))
print "finished loading user click file"
#queryResults = dict( [ (x[0], (x[1], x[2])) for x in userQueriesAndClicks_strict[userID] ])

# given a query and a ranking, this function provides the relevanceJudgements list as 
# required by averagePrecision
def turnIntoBinaryRelevanceThing(query, ranking, relevantDocuments):
        #rel = self.relevantDocuments[query]
        binarized = []
        for doc in ranking:
                if doc in relevantDocuments:
                        binarized.append(1)
                else:
                        binarized.append(0)
        return binarized
def anotate(inpt, skipsize):
	k = 2
	queueSize = skipsize * 2 + 1
	queueMid = skipsize + 1

	queueIsReady = lambda x : len(x) == queueSize
	def push(element, queue):
		queue.append(element)
		if len(queue) > queueSize:
			queue.pop(0)
	
	vocabulary = get_document_vocabulary(inpt)
	vocSize = len(vocabulary) + 1

	print "Starting on determining word co-occurrences."

	cocs = defaultdict(list)
	queue = []
	for word in inpt:
		push(word, queue)
		if queueIsReady(queue):
			mid = queue[queueMid]
			if mid in vocabulary:
				coc = []
				for i in xrange(skipsize):
					if queue[i] in vocabulary:
						word1 = queue[i]
					else:
						word1 = "_UNKNOWN_"
					if queue[i+1+skipsize] in vocabulary:
						word2 = queue[i+1+skipsize]
					else:
						word2 = "_UNKNOWN_"

					coc.append(word1)
					coc.append(word2)
				#print "final co-occurences"
				#print coc
				cocs[mid].append(coc)
				#print "Coc[mid]"
				#print cocs[mid]

	print "Determining LSA relatedness scores between documents..."
        
	clustered = dict()
	for key in cocs.keys():
                
            #print "KEY:" + key
            #print "\ncocs[" + key + "]:"
            #print "len cocs key"
            #print len(cocs[key])
            #print cocs[key][0]
            #print cocs[key][1]
            #print " ".join(cocs[key][0])
            a = [" ".join(cocs[key][i]) for i in range(len(cocs[key]))]
            #a = a[:6]
            #print "len a"
            #print len(a)
            """
            print a[0]
            print a[1]
            print a[2]
            print a[3]
            print a[4]
            print a[5]
            """
            vector_space = VectorSpace(a)
            scores = vector_space.related(0)
            LSIscores = []
            for i in range(len(a)):
                    ss = {"docText" : a[i], "similarity" : scores[i]}
                    LSIscores.append(ss)
            LSIscores = sorted(LSIscores, key=lambda k: k['similarity'], reverse=True)
            """print "scores"
            print LSIscores"""
            LSIscores = LSIscores[:len(LSIscores)/2]

            """
            text = ""
            for item in LSIscores:
                    text += item["docText"] + " "
            print "text is: " +  text

            d = defaultdict(int)
            for word in text.split():
                    d[word] += 1
            """
            itemsToCluster = []
            for item in LSIscores:
                    text = item["docText"]
                    d = defaultdict(int)
                    for word in text.split():
                            d[word] += 1
                    d = normalize_coc(d)
                    itemsToCluster.append(d)
            """
            print "printing d follows"
            print d
            #normalize
            d = normalize_coc(d)
            print "after normalization"
            print d
            d = list(d)
            print d
            """
            
            clustered[key] = kmeans_process(itemsToCluster)
            #print "half scores"
            #print LSIscores
            
            #print cocs[key]
	    #clustered[key] = kmeans_process(LSIScores)
	
        
	print "Starting to annotate corpus."
	anotated = []
	queue = []
	for word in inpt:
		push(word, queue)
		if queueIsReady(queue):
			word = queue[queueMid]
			if word in clustered and len(clustered[word]) > 1:
				coc = defaultdict(int)
				for i in xrange(skipsize):
					if queue[i] in vocabulary:
						word1 = queue[i]
					else:
						word1 = "_UNKNOWN_"
					if queue[i+1+skipsize] in vocabulary:
						word2 = queue[i+1+skipsize]
					else:
						word2 = "_UNKNOWN_"

					coc[word1] += 1
					coc[word2] += 1

				coc = normalize_coc(coc)

				# Now get the best cluster
				bestValue = 1
				bestIndex = -1
				for i in xrange(k):
					distance = clustered[word][i].distance(coc)
					if distance < bestValue:
						bestValue = distance
						bestIndex = i
				word = word + "_" + str(bestIndex) + " "
			anotated.append(word)

	return (clustered, anotated)
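A hedged usage sketch for the function above. It assumes a tokenized corpus and that the helpers referenced inside anotate but not shown in this excerpt (get_document_vocabulary, normalize_coc, kmeans_process, and the defaultdict import) are available from the surrounding module:

tokens = "the cat sat on the mat while the dog chased the cat".split()

clusters, annotated_tokens = anotate(tokens, skipsize=2)
print(" ".join(annotated_tokens))   # ambiguous words come back suffixed with a cluster id, e.g. "cat_0"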
Example #11
import pandas as pd
import pickle
from semanticpy.vector_space import VectorSpace

data = pd.read_json('cc_jokes_valid.json')
df = pd.DataFrame(data)
dfList = df['content'].tolist()

# Builds the vector space model and saves it to pickle (takes a long time)
vector_space = VectorSpace(dfList)
with open('vsm.obj', 'wb') as filehandler:
    pickle.dump(vector_space, filehandler)
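A companion sketch for loading the saved model back and querying it (the file name matches the script above; the query term is only an example):

import pickle

with open('vsm.obj', 'rb') as handle:
    vector_space = pickle.load(handle)

scores = vector_space.search(["funny"])   # one cosine score per joke, in the original dfList order
best_index = max(range(len(scores)), key=lambda i: scores[i])
print(best_index, scores[best_index])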
Example #12
import sqlite3

from semanticpy.vector_space import VectorSpace

texts = []
ids = []

#conn = sqlite3.connect('../db/development.sqlite3.bak')
conn = sqlite3.connect('../db/development.sqlite3')
c = conn.cursor()
limit = 1000
offset = 5000
limit = 500
offset = 0
rows = c.execute('select * from articles limit {0} offset {1}'.format(limit, offset))

for row in rows:
  #print row
  texts.append(row[1] + " " + row[11])
  ids.append(row[0])

#vector_space = VectorSpace(["The cat in the hat disabled", "A cat is a fine pet ponies.", "Dogs and cats make good pets.","I haven't got a hat."])
vector_space = VectorSpace(texts)

#Search for cat
#print vector_space.search(["cat"])
#Show score for relatedness against document 0

group = dict()
for id in range(len(ids)):
  prob = vector_space.related(id)
  for i in range(len(ids)):
    if prob[i] > 0.2 and prob[i] < 0.9 and id != i:
      if (offset + id + 1) not in group:
        group[offset + id + 1] = []
      group[offset + id + 1].append(offset + i + 1)
      print(offset + id + 1, offset + i + 1, prob[i])
print(group)
Example #13
    def vector_space_mapping(self):
        v = VectorSpace(self.documents)
        matrix = v.collection_of_document_term_vectors
        return matrix