def run(self):
    """Find properties linking ``self.label1`` to ``self.label2`` and record their domains.

    Each result row of ``sparqlQuerypy.findProperty2`` carries a property under
    ``r['p']['value']`` and, optionally, an explicit domain under
    ``r['d']['value']``.

    * With an explicit domain, the property is registered through
      ``self.addProperty`` and a ``domain`` relationship from the property to
      that domain is created and pushed.
    * Without one, the classes using the property are fetched with
      ``sparqlQuerypy.findPropertyClassesSecond``, restricted to the cc classes
      that ``Neo4jDrive.findCCNodes`` returns for the column at
      ``self.perm_column``. Every class present in both sets gets a ``domain``
      relationship, and its node is flagged with ``hyp='yes'`` and
      ``type='cc'``.

    Usage lists are cached per property, so a repeated property is never
    evaluated against the result fetched for a different one.
    """
    rlist = sparqlQuerypy.findProperty2(self.label1, self.label2)
    print('------------------')
    log.write('----------------\n')
    log.write(str(datetime.datetime.now()) + '\n')
    log.write(self.label1 + self.label2)
    print('%s %s' % (self.label1, self.label2))  # ,rlist

    # property URI -> classes returned by findPropertyClassesSecond for it.
    usageByProperty = {}

    for r in rlist:
        prop = r['p']['value']

        if u'd' in r.keys():
            # The endpoint already states the domain: store it directly.
            self.addProperty(prop)
            rel_data = Neo4jDrive.insertNodeAndRelationship(prop, "domain", r['d']['value'])[0]
            rel_data['name'] = 'domain'
            rel_data.push()
            continue

        # Re-read on every row: this loop itself tags nodes with type='cc',
        # so the lookup result is not assumed to be stable.
        ccClasses = Neo4jDrive.findCCNodes(self.columnNames[self.perm_column])
        if not ccClasses:
            # No candidate classes means the intersection below is empty, and
            # the class filter would be malformed; skip the remote query.
            continue
        buildString = "(" + ",".join('<' + ccClass + '>' for ccClass in ccClasses) + ")"

        if prop not in usageByProperty:
            usageByProperty[prop] = sparqlQuerypy.findPropertyClassesSecond(prop, buildString)
        propertyUsage = usageByProperty[prop]
        print('%s %s' % (len(propertyUsage), prop))

        # Very large usage lists are ignored.
        if len(propertyUsage) >= 15000:
            continue

        usedClasses = set(k['r']['value'] for k in propertyUsage)
        for item in usedClasses & set(ccClasses):
            self.addProperty(prop)
            rel_data = Neo4jDrive.insertNodeAndRelationship(prop, "domain", item)[0]
            rel_data['name'] = "domain"
            rel_data.push()

            node = Neo4jDrive.findNodeByName(item)
            node.properties['hyp'] = 'yes'
            node.properties['type'] = 'cc'
            node.push()

            # Open design question: for each table, which link should carry
            # the score -- the one between the property and its domain? And
            # how is the score calculated: the number of matching columns
            # divided by the total in that table, or is it completely unique?
            # NOTE(review): the original nesting level of this call was lost
            # with the file's indentation; it is kept next to the relationship
            # it scores -- confirm against the upstream source.
            self.incrementDms(rel_data)
def dmsScore(self):
    """Score candidate properties between every ordered column pair and upload them.

    Phase 1 walks every ordered pair of distinct columns and every row of
    ``self.data``. The two cell values are passed to
    ``sparqlQuerypy.findProperty2``, and each returned property is recorded in
    ``self.relationships``:

    * ``relationships[column1][column2][prop]`` holds ``name='property'``,
      a ``count`` and ``dms = count / self.totalSize``;
    * ``relationships[column2][prop]`` is marked ``name='cp'``;
    * ``relationships[prop][domain]`` is ``{'name': 'domain'}``.

    A property is supported either by an explicit domain in the result row
    (``r['d']``) or by a class that both uses the property
    (``sparqlQuerypy.findPropertyClassesSecond``) and is a cc class of
    ``column2`` (``Neo4jDrive.findCCNodes``). Each (cell pair, property)
    combination raises ``count`` at most once per column pair.

    Phase 2 pushes the collected property, ``cp`` and ``domain`` relationships
    to Neo4j through ``Neo4jDrive.insertNodeAndRelationship``.
    """
    data = self.data
    columnNames = self.columnNames
    totalSize = self.totalSize
    relationships = self.relationships

    def recordSupport(column1, column2, prop, item, domain, counted):
        """Register ``prop`` as a supported link column1 -> column2 with ``domain``."""
        entry = relationships[column1][column2][prop]
        entry['name'] = 'property'
        if 'count' not in entry:
            # NOTE(review): the counter starts at 1.0 and is incremented right
            # below, so the first supporting pair yields 2.0 -- kept as in the
            # original; confirm whether this offset is intended.
            entry['count'] = 1.0
        if (item, prop) not in counted:
            entry['count'] += 1
            counted.add((item, prop))
        print(entry['count'])
        entry['dms'] = entry['count'] / totalSize

        relationships[column2].setdefault(prop, {})['name'] = 'cp'
        relationships.setdefault(prop, {}).setdefault(domain, {'name': 'domain'})

    # ------------- Phase 1: collect properties and scores -------------#
    for i, column1 in enumerate(columnNames):
        relationships.setdefault(column1, {})
        for j, column2 in enumerate(columnNames):
            relationships.setdefault(column2, {})
            if i == j:
                continue

            # (cell pair, property) combinations already counted for this
            # column pair; dropped when the pair is finished.
            counted = set()

            for element in data:
                print('--------------------')
                print('%s --> %s' % (element[i], element[j]))
                item = (element[i], element[j])
                rlist = sparqlQuerypy.findProperty2(element[i].strip(), element[j].strip())

                # property URI -> usage list, scoped to this row like the
                # original cache, but keyed so results never get mixed up.
                usageByProperty = {}

                for r in rlist:
                    prop = r['p']['value']
                    relationships[column1].setdefault(column2, {}).setdefault(prop, {})

                    if u'd' in r.keys():
                        print('u d is in r.keys()')
                        recordSupport(column1, column2, prop, item, r['d']['value'], counted)
                        # -----------------TODO: add to hypothesis-------------#
                        continue

                    ccClasses = Neo4jDrive.findCCNodes(column2)
                    if not ccClasses:
                        # Nothing can intersect with an empty class list, and
                        # the class filter would be malformed; skip the query.
                        continue
                    buildString = "(" + ",".join('<' + ccClass + '>' for ccClass in ccClasses) + ")"

                    if prop not in usageByProperty:
                        usageByProperty[prop] = sparqlQuerypy.findPropertyClassesSecond(prop, buildString)
                    propertyUsage = usageByProperty[prop]

                    usedClasses = set(k['r']['value'] for k in propertyUsage)
                    for domain in usedClasses & set(ccClasses):
                        print("item and r['p']['value'], is %s %s" % (item, prop))
                        recordSupport(column1, column2, prop, item, domain, counted)
    # -------------------------add to Hypothesis----------------------#

    # -----------------Uploading to Neo4j----------------------------#
    for column1 in columnNames:
        for column2 in columnNames:
            if column1 == column2:
                continue
            if column2 not in relationships[column1]:
                continue
            for rel, info in relationships[column1][column2].items():
                rel_data = Neo4jDrive.insertNodeAndRelationship(column1, rel, column2)[0]
                rel_data.properties['type'] = 'property'
                rel_data.properties['name'] = rel
                # Properties that never found a supporting domain have no score.
                rel_data.properties['dms'] = info.get('dms', 0)
                rel_data.push()

                rel_data = Neo4jDrive.insertNodeAndRelationship(column2, 'cp', rel)[0]
                rel_data.properties['type'] = 'cp'
                rel_data.push()

                # A property without any recorded domain has no top-level
                # entry; treat that as "no domains" rather than failing.
                for domain in relationships.get(rel, {}):
                    rel_data = Neo4jDrive.insertNodeAndRelationship(rel, 'domain', domain)[0]
                    rel_data.properties['type'] = 'domain'
                    rel_data.push()