def run(self):
     rlist=sparqlQuerypy.findProperty2(self.label1,self.label2)
     print '------------------'
     log.write('----------------\n')
     log.write(str(datetime.datetime.now())+'\n')
     log.write(self.label1+self.label2)
     print self.label1,self.label2#,rlist
     
     cache=[]
     propertyUsage=[1]
     for r in rlist:
         if u'd' in r.keys():
             self.addProperty(r['p']['value'])
             rel_data=Neo4jDrive.insertNodeAndRelationship(r['p']['value'],"domain",r['d']['value'])[0]
             rel_data['name']='domain'
             rel_data.push()
         else:
             ccClasses=Neo4jDrive.findCCNodes(self.columnNames[self.perm_column])
             buildString="("
             for i in ccClasses:
                 buildString+='<'+i+'>,'
             buildString=buildString[:-1]
             buildString+=")"
             if r['p']['value'] not in cache:
                 propertyUsage=sparqlQuerypy.findPropertyClassesSecond(r['p']['value'],buildString)
                 cache+=[r['p']['value']]
             
                 print len(propertyUsage),r['p']['value']
                 if len(propertyUsage)<15000:
                     for item in (set([k['r']['value'] for k in propertyUsage]) & set(ccClasses)):
                          self.addProperty(r['p']['value'])
                          rel_data=Neo4jDrive.insertNodeAndRelationship(r['p']['value'],"domain",item)[0]
                          rel_data['name']="domain"
                          rel_data.push()
                          node=Neo4jDrive.findNodeByName(item)
                          node.properties['hyp']='yes'
                          node.properties['type']='cc'
                          node.push()
                          self.incrementDms(rel_data) #for each table we have to put a score on the link between the what and what? The property and its domain? But then how is the score calculated? Is it number of columns in the table by total in that table or is it completely unique?
Esempio n. 2
0
    def dmsScore(self):
        data=self.data
        columnNames=self.columnNames
        totalSize=self.totalSize
        relationships=self.relationships
        size=len(data)
        cache=[]
        bitmap={}
        
        for i,column1 in enumerate(columnNames):
            if column1 not in relationships.keys():
                relationships[column1]={}
            if column1 not in bitmap.keys():
                bitmap[column1]={}
            for j,column2 in enumerate(columnNames):
                if column2 not in relationships.keys():
                    relationships[column2]={}
                if i==j: continue
                for element in data:
                    print '--------------------'
                    print element[i],'-->',element[j]
                    item=(element[i],element[j])
                    rlist=sparqlQuerypy.findProperty2(element[i].strip(),element[j].strip())
                    cache=[]
                    for r in rlist:
                        
                        if column2 not in relationships[column1].keys():
                            relationships[column1][column2]={}
                        if column2 not in bitmap[column1].keys():
                            bitmap[column1][column2]={}
                        if item not in bitmap[column1][column2]:
                            bitmap[column1][column2][item]={}
                        bitmap[column1][column2][item][r['p']['value']]=0
                        if r['p']['value'] not in relationships[column1][column2].keys():
                            relationships[column1][column2][r['p']['value']]={}
                        if u'd' in r.keys():
                            print 'u d is in r.keys()'
                            relationships[column1][column2][r['p']['value']]['name']='property'
                            if 'count' not in relationships[column1][column2][r['p']['value']].keys():
                                relationships[column1][column2][r['p']['value']]['count']=1.0
                            if bitmap[column1][column2][item][r['p']['value']]==0:
                                relationships[column1][column2][r['p']['value']]['count']+=1
                                bitmap[column1][column2][item][r['p']['value']]=1
                            print relationships[column1][column2][r['p']['value']]['count']
                            relationships[column1][column2][r['p']['value']]['dms']=relationships[column1][column2][r['p']['value']]['count']/totalSize
                            if r['p']['value'] not in relationships[column2].keys():
                                relationships[column2][r['p']['value']]={}
                            relationships[column2][r['p']['value']]['name']='cp'
                            if r['p']['value'] not in relationships.keys():
                                relationships[r['p']['value']]={}
                            if r['d']['value'] not in relationships[r['p']['value']].keys():
                                relationships[r['p']['value']][r['d']['value']]={'name':'domain'}   
                            #-----------------TODO: add to hypothesis-------------#      

                        else:
                            ccClasses=Neo4jDrive.findCCNodes(column2)
                            
                            buildString="("
                            for ii in ccClasses:
                                buildString+='<'+ii+'>,'
                            buildString=buildString[:-1]
                            buildString+=")"

                            if r['p']['value'] not in cache:
                                propertyUsage=sparqlQuerypy.findPropertyClassesSecond(r['p']['value'],buildString)
                                cache+=[r['p']['value']]
                                #bitmap[column1][column2][item][r['p']['value']]=0
                                for domain in (set([k['r']['value'] for k in propertyUsage]) & set(ccClasses)):

                                   relationships[column1][column2][r['p']['value']]['name']='property'
                                   if 'count' not in relationships[column1][column2][r['p']['value']].keys():
                                       relationships[column1][column2][r['p']['value']]['count']=1.0
                                   print "item and r['p']['value'], is", item,r['p']['value']
                                   if bitmap[column1][column2][item][r['p']['value']]==0:
                                       relationships[column1][column2][r['p']['value']]['count']+=1
                                       bitmap[column1][column2][item][r['p']['value']]=1
                                   print relationships[column1][column2][r['p']['value']]['count']
                                   relationships[column1][column2][r['p']['value']]['dms']=relationships[column1][column2][r['p']['value']]['count']/totalSize*1.0
                                   if r['p']['value'] not in relationships[column2].keys():
                                       relationships[column2][r['p']['value']]={}
                                   relationships[column2][r['p']['value']]['name']='cp'
                                   if r['p']['value'] not in relationships.keys():
                                       relationships[r['p']['value']]={}
                                   if item not in relationships[r['p']['value']].keys():
                                       relationships[r['p']['value']][domain]={'name':'domain'}
                bitmap[column1][column2]=None
                         #-------------------------add to Hypothesis----------------------#

                     #-----------------Uploading to Neo4j----------------------------#
        for i,column1 in enumerate(columnNames):
            for j,column2 in enumerate(columnNames):
                if column1==column2: continue
                if column2 not in relationships[column1].keys(): continue 
                for rel in relationships[column1][column2].keys():
                    rel_data=Neo4jDrive.insertNodeAndRelationship(column1,rel,column2)[0]
                    
                    rel_data.properties['type']='property'
                    rel_data.properties['name']=rel
                    if 'dms' in relationships[column1][column2][rel].keys():
                        rel_data.properties['dms']=relationships[column1][column2][rel]['dms']
                    else:
                        rel_data.properties['dms']=0
                    rel_data.push()
                    rel_data=Neo4jDrive.insertNodeAndRelationship(column2,'cp',rel)[0]
                    rel_data.properties['type']='cp'
                    rel_data.push()

                    for domain in relationships[rel].keys():
                       rel_data=Neo4jDrive.insertNodeAndRelationship(rel,'domain',domain)[0]
                       rel_data.properties['type']='domain'
                       rel_data.push()