def run(self): support=self.support totalNumberOfValues=self.totalNumberOfValues column=self.column columnNames=self.columnNames item=self.item node=Neo4jDrive.findNodeByName(item) if node== None: Neo4jDrive.insertNodeAndRelationship(columnNames[column],'dataItems',item) node=Neo4jDrive.findNodeByName(item) node.properties['fvalue']=support[item] node.push() rlist=sparqlQuerypy.findBottomUp(item) for r in rlist: try: rel_data=Neo4jDrive.insertNodeAndRelationship(item,"cc",r[0]) rel_data1=Neo4jDrive.insertNodeAndRelationship(r[0],"dd",r[2]) node=node=Neo4jDrive.findNodeByName(r[2]) if node.properties['incoming']==None: node.properties['incoming']=1 else: node.properties['incoming']+=1 node.properties['type']='type' node.push() except : print columnNames[column],'cc',r[0] rel_data=rel_data[0] rel_data.properties['rel_class'] = 'cc' rel_data.properties['support']=support[item]/(totalNumberOfValues*1.0) rel_data.push()
def run(self):
    """Link this thread's column node to every class reachable bottom-up
    from the item and score each 'cc' relationship with the item's
    support fraction."""
    support = self.support
    totalNumberOfValues = self.totalNumberOfValues
    column = self.column
    columnNames = self.columnNames
    item = self.item
    rlist = sparqlQuerypy.findBottomUp(item)
    for r in rlist:
        rel_data = Neo4jDrive.insertNodeAndRelationship(
            columnNames[column], "cc", r[2])
        node = Neo4jDrive.findNodeByName(r[2])
        # identity test for the missing-property sentinel (was `== None`)
        if node.properties['incoming'] is None:
            node.properties['incoming'] = 1
        else:
            node.properties['incoming'] += 1
        node.properties['type'] = 'type'
        node.push()
        rel_data = rel_data[0]
        rel_data.properties['rel_class'] = 'cc'
        # support fraction of this item within the column (float division)
        rel_data.properties['support'] = support[item] / (
            totalNumberOfValues * 1.0)
        rel_data.push()
def main():
    """Load each CSV table into a Neo4j 'table' node, then build `data`,
    an interleaved list of [row, table_index] samples, and hand it to run().

    Fixes a bug in the sampling loop: the original removed exhausted tables
    from `csvitems` while enumerating it, which both skipped the following
    table in that pass and shifted the enumerate() index, so later rows were
    tagged with the wrong table index.
    """
    csvitems = []
    data = []
    tables = ["StatesandCapitals.csv", "RiversandSourceState.csv"]
    size = []
    for nameOfFile in tables:
        # one graph node per table
        Neo4jDrive.insertNode(nameOfFile)
        node = Neo4jDrive.findNodeByName(nameOfFile)
        node.properties['type'] = 'table'
        node.push()
        csvitems += [CSVRead.readCSV(nameOfFile, firstRow=False, choice=[0, 1])[1:]]
        size += [len(csvitems[-1])]
        random.shuffle(csvitems[-1])  # randomise sampling order
    # Interleave chunks of `sample` rows from each table, tagging every row
    # with its ORIGINAL table index. `pending` keeps (index, rows) pairs so
    # exhausted tables can be dropped without disturbing the indices.
    pending = list(enumerate(csvitems))
    i = 0
    k = 0
    while pending:
        for l, item in list(pending):  # iterate a copy; `pending` shrinks below
            end = min(k + sample, len(item))
            chunk = [[it, l] for it in item[k:end]]
            data[i:i + len(chunk)] = chunk
            i += len(chunk)
            if end == len(item):
                pending.remove((l, item))
        k += sample
    run(data, tables, size)
def main():
    # NOTE(review): `nameOfFile` is never defined in this function, so this
    # version raises NameError on its first statement -- an enclosing loop
    # over table names was presumably removed. Confirm before reviving.
    Neo4jDrive.insertNode(nameOfFile)
    columnNames=CSVRead.readCSV(nameOfFile,firstRow=True, choice=[0,1,2,3,4])
    for name in columnNames:
        Neo4jDrive.insertNodeAndRelationship(nameOfFile,"Column",name)
    #support=CSVRead.getSupport(nameOfFile,0)
    #totalNumberOfValues=CSVRead.numberOfItems(support)
    # one pass per column relationship attached to the table node
    for column in range(sum([1 for _ in Neo4jDrive.findRelationshipsOfNode(nameOfFile,"Column")])):
        support=CSVRead.getSupport(nameOfFile,column)
        totalNumberOfValues=CSVRead.numberOfItems(support)
        #print i.end_node
        #cNode=Neo4jDrive.findNodeByName(columnNames[column])
        for item in support.keys():
            node=Neo4jDrive.findNodeByName(item)
            if node== None:
                # item not yet in the graph: create it under its column
                Neo4jDrive.insertNodeAndRelationship(columnNames[column],'dataItems',item)
                node=Neo4jDrive.findNodeByName(item)
            node.properties['fvalue']=support[item]
            node.push()
            rlist=sparqlQuerypy.findBottomUp(item)
            for r in rlist:
                try:
                    rel_data=Neo4jDrive.insertNodeAndRelationship(item,"cc",r[0])
                    rel_data1=Neo4jDrive.insertNodeAndRelationship(r[0],"dd",r[2])
                    # NOTE(review): duplicated `node=node=` assignment below
                    node=node=Neo4jDrive.findNodeByName(r[2])
                    if node.properties['incoming']==None:
                        node.properties['incoming']=1
                    else:
                        node.properties['incoming']+=1
                    node.properties['type']='type'
                    node.push()
                except :
                    # NOTE(review): bare except swallows all errors; rel_data may
                    # be unbound below when the first insert raised.
                    print columnNames[column],'cc',r[0]
                rel_data=rel_data[0]
                rel_data.properties['rel_class'] = 'cc'
                rel_data.properties['support']=support[item]/(totalNumberOfValues*1.0)
                rel_data.push()
def main():
    # NOTE(review): `nameOfFile` is never defined in this function, so this
    # (auto-formatted) version raises NameError on its first statement -- an
    # enclosing loop over table names was presumably removed. Confirm before use.
    Neo4jDrive.insertNode(nameOfFile)
    columnNames = CSVRead.readCSV(nameOfFile, firstRow=True, choice=[0, 1, 2, 3, 4])
    for name in columnNames:
        Neo4jDrive.insertNodeAndRelationship(nameOfFile, "Column", name)
    # support=CSVRead.getSupport(nameOfFile,0)
    # totalNumberOfValues=CSVRead.numberOfItems(support)
    # one pass per column relationship attached to the table node
    for column in range(sum([1 for _ in Neo4jDrive.findRelationshipsOfNode(nameOfFile, "Column")])):
        support = CSVRead.getSupport(nameOfFile, column)
        totalNumberOfValues = CSVRead.numberOfItems(support)
        # print i.end_node
        # cNode=Neo4jDrive.findNodeByName(columnNames[column])
        for item in support.keys():
            node = Neo4jDrive.findNodeByName(item)
            if node == None:
                # item not yet in the graph: create it under its column
                Neo4jDrive.insertNodeAndRelationship(columnNames[column], "dataItems", item)
                node = Neo4jDrive.findNodeByName(item)
            node.properties["fvalue"] = support[item]
            node.push()
            rlist = sparqlQuerypy.findBottomUp(item)
            for r in rlist:
                try:
                    rel_data = Neo4jDrive.insertNodeAndRelationship(item, "cc", r[0])
                    rel_data1 = Neo4jDrive.insertNodeAndRelationship(r[0], "dd", r[2])
                    # NOTE(review): duplicated `node = node =` assignment below
                    node = node = Neo4jDrive.findNodeByName(r[2])
                    if node.properties["incoming"] == None:
                        node.properties["incoming"] = 1
                    else:
                        node.properties["incoming"] += 1
                    node.properties["type"] = "type"
                    node.push()
                except:
                    # NOTE(review): bare except swallows all errors; rel_data may
                    # be unbound below when the first insert raised.
                    print columnNames[column], "cc", r[0]
                rel_data = rel_data[0]
                rel_data.properties["rel_class"] = "cc"
                rel_data.properties["support"] = support[item] / (totalNumberOfValues * 1.0)
                rel_data.push()
def run(data, tables, size):
    """Create column nodes for every table, then launch worker threads that
    score cc, dms and top-down relationships in Neo4j.

    data    -- list of [row, table_index] samples (see main()).
    tables  -- CSV file names; one graph node per table already exists.
    size    -- per-table row counts, passed through to the worker threads.

    Note: each thread is join()ed immediately after start(), so the workers
    actually run serially; preserved as-is to avoid changing concurrency.
    """
    support = []
    columnNames = []
    for i, nameOfFile in enumerate(tables):
        columnNames += [CSVRead.readCSV(nameOfFile, firstRow=True, choice=[0, 1])]
        columnNames[i] = [c.strip() for c in columnNames[i]]
        support += [[]]  # per-table list of per-column support dicts
        for j, name in enumerate(columnNames[i]):
            z = Neo4jDrive.insertNodeAndRelationship(nameOfFile, "Column", name)[0]
            node = Neo4jDrive.findNodeByName(name)
            node.properties['type'] = 'Column'
            node.push()
            z.properties['type'] = "Column"
            z.push()
            support[i] += [CSVRead.getSupport(nameOfFile, j)]
    # (removed an unused `totalNumberOfValues = CSVRead.getSize(...)` local)
    hyplock = Lock()
    stypelock = Lock()
    # pass 1: cc scoring, one worker per (item, column)
    for itemPiece in data:
        indexOfFile = itemPiece[1]
        item = itemPiece[0]
        for column in range(len(columnNames[indexOfFile])):
            k = ccThread(item[column], columnNames[indexOfFile], column,
                         support[indexOfFile], size[indexOfFile])
            k.start()
            k.join()
    # pass 2: dms scoring over every ordered pair of distinct columns
    for itemPiece in data:
        indexOfFile = itemPiece[1]
        item = itemPiece[0]
        for column in range(len(columnNames[indexOfFile])):
            for perm_column in range(len(columnNames[indexOfFile])):
                if perm_column != column:
                    k = dmsThread(item[column], item[perm_column], size[indexOfFile],
                                  columnNames[indexOfFile], column, perm_column)
                    k.start()
                    k.join()
    # pass 3: top-down refinement over all cc class nodes
    allCC = set(Neo4jDrive.findAllCCNodes())
    for s, c in enumerate(columnNames):
        for column in c:
            k = topDownThread(column, hyplock, stypelock, allCC, size[s])
            k.start()
            k.join()
def main():
    """Set up table and column nodes in Neo4j, load and shuffle the CSV
    data, then repeatedly feed fixed-size batches to runThread until the
    convergence flag is raised."""
    columnNames = []
    colNam = {}
    csvitems = {}
    size = {}
    tables = ["StatesandCapitals.csv", "RiversandSourceState.csv"]
    for i, nameOfFile in enumerate(tables):
        # push the table itself as a node into the graph
        Neo4jDrive.insertNode(nameOfFile)
        tableNode = Neo4jDrive.findNodeByName(nameOfFile)
        tableNode.properties['type'] = 'table'
        tableNode.push()
        # push each column, linked to its table
        stripped = [c.strip() for c in CSVRead.readCSV(nameOfFile, firstRow=True, choice=[0, 1])]
        columnNames.append(stripped)
        colNam[nameOfFile] = list(stripped)
        for name in stripped:
            link = Neo4jDrive.insertNodeAndRelationship(nameOfFile, "Column", name)[0]
            colNode = Neo4jDrive.findNodeByName(name)
            colNode.properties['type'] = 'Column'
            colNode.push()
            link.properties['type'] = "Column"
            link.push()
        # cache each data set, its size, and shuffle in place for randomness
        rows = CSVRead.readCSV(nameOfFile, firstRow=False, choice=[0, 1])[1:]
        csvitems[nameOfFile] = rows
        size[nameOfFile] = [len(rows)]
        random.shuffle(rows)
    relationships = {}
    iterations = 1
    convergence = False  # flips once enough batches have been consumed
    while not convergence:
        for table in tables:
            start = sample * (iterations - 1)
            end = sample * iterations
            worker = runThread(table, csvitems[table][start:end], colNam[table], end, relationships)
            worker.start()
            worker.join()  # joined immediately, so batches run serially
        iterations += 1
        if end > 5:
            convergence = True
def ccScores(self): data=self.data columnNames=self.columnNames totalSize=self.totalSize relationships=self.relationships size=len(data) bitmap={} for i,column in enumerate(columnNames): relationships[column]={} bitmap[column]={} #this is a dictionary which is a set of flags per data value remembering if the increment already happened. for element in data: item=element[i] rlist=sparqlQuerypy.findBottomUp(item.strip()) print 'number of nodes for', item.strip(), " is ", len(rlist) bitmap[column][item]={} for r in rlist: if r[0] not in bitmap[column][item].keys(): bitmap[column][item][r[0]]=0 if r[0] not in relationships[column].keys(): relationships[column][r[0]]={} relationships[column][r[0]]['name']='cc' if 'incoming' not in relationships[column][r[0]].keys(): relationships[column][r[0]]['incoming']=1 relationships[column][r[0]]['cc']=1.0/totalSize else: relationships[column][r[0]]['incoming']+=1 relationships[column][r[0]]['cc']=relationships[column][r[0]]['incoming']*1.0/totalSize bitmap[column][item][r[0]]=1 classSet=set() # A set to save all the possible cc classes for ease of retrieval later and to streamline it. for column in columnNames: #Loop to push the relations and nodes to Neo4j for classes in relationships[column].keys(): classSet.add(classes) rel_data=Neo4jDrive.insertNodeAndRelationship(column,'cc',classes)[0] rel_data.properties['rel_class']='cc' rel_data.properties['fk']=relationships[column][classes]['cc'] rel_data.push() for classes in classSet: #Loop to update the CCS score for each class after the previous loop is over. CCS=sum(fk)/no(fk) for the node. print classes cummulative=0 # The accumulator linkNumbers=0 # The denominator for link in Neo4jDrive.findIncomingCCLinks(classes): #loop to find incoming cc edges. cummulative+=link[0].properties['fk'] linkNumbers+=1 node=Neo4jDrive.findNodeByName(classes) node.properties['ccs']=cummulative*1.0/linkNumbers node.properties['type']='cc' node.push()
def run(self):
    # Scores a 'cc' relationship between this thread's column and every class
    # found bottom-up for the item, then refreshes the class node's averaged
    # ccs score from all of its incoming cc links.
    support=self.support  # NOTE(review): read but never used in this method
    totalNumberOfValues=self.totalNumberOfValues*1.0  # float for true division under Python 2
    column=self.column
    columnNames=self.columnNames
    item=self.item
    rlist=sparqlQuerypy.findBottomUp(item.strip())
    print 'number of nodes for', item.strip(), " is ", len(rlist)
    log.write('number of nodes for'+str( item.strip())+ " is "+ str(len(rlist))+'\n')
    flag=0  # limits the 'incoming' increment to at most once per run
    for r in rlist:
        rel_data=Neo4jDrive.insertNodeAndRelationship(columnNames[column],"cc",r[2])
        rel_data=rel_data[0]
        node=Neo4jDrive.findNodeByName(r[2])
        if r[2]=='http://dbpedia.org/ontology/PopulatedPlace':
            # debug trace for one specific DBpedia class
            print columnNames[column], 'Happening'
            print 'potato',rel_data
        if rel_data.properties['incoming']==None: #find out why this is not happenings
            rel_data.properties['incoming']=1
            rel_data.properties['ccs']=1/totalNumberOfValues
            rel_data.push()
            #print 'tomato',rel_data
        else:
            if flag==0:
                rel_data.properties['incoming']+=1
                rel_data.push()
                # NOTE(review): this reads node.properties['incoming'], not the
                # rel_data counter incremented just above -- likely a bug; if the
                # node has no 'incoming' the None/float division raises TypeError.
                rel_data.properties['ccs']=node.properties['incoming']/totalNumberOfValues
                flag=1
        node.properties['type']='cc'
        # recompute the node's ccs as the mean over its incoming cc links
        node.properties['ccs']=0
        numberOfLinks=0
        for link in Neo4jDrive.findIncomingCCLinks(r[2]):
            node.properties['ccs']+=link[0].properties['ccs']
            numberOfLinks+=1
        if numberOfLinks>0:
            node.properties['ccs']/=numberOfLinks
        node.push()
        rel_data.properties['rel_class'] = 'cc'
        #rel_data.properties['ccs']=node.proper/(totalNumberOfValues*1.0)
        rel_data.push()
def addProperty(self, p):
    """Register property p for this column pair: maintain the property
    node's dcs (domain coverage) score and a counted, dms-scored
    relationship between the two columns.

    Fixes a bug: the relationship branch tested rel.properties['propCount'],
    a key that is never written anywhere, so the condition was always true
    and 'count' was reset to 1 on every call instead of accumulating.
    """
    rel_data = Neo4jDrive.insertNodeAndRelationship(self.columnNames[self.column], "property", p)
    hypothesisSet.add(p)
    node = Neo4jDrive.findNodeByName(p)
    if node.properties['dcsincoming'] is None:
        node.properties['dcsincoming'] = 1
    else:
        node.properties['dcsincoming'] += 1
    # dcs = incoming count normalised by the sample size (float division)
    node.properties['dcs'] = node.properties['dcsincoming'] / (self.size * 1.0)
    node.properties['type'] = 'property'
    node.push()
    rel = Neo4jDrive.insertRelationship(self.columnNames[self.column], p,
                                        self.columnNames[self.perm_column])[0]
    if rel.properties['count'] is None:  # was 'propCount' -- see docstring
        rel.properties['type'] = 'property_rel'
        rel.properties['name'] = p
        rel.properties['count'] = 1
    else:
        rel.properties['count'] += 1
    rel.properties['dms'] = rel.properties['count'] / (self.size * 1.0)
    rel.push()
def run(self):
    # Queries the endpoint for properties relating label1 and label2.
    # Results carrying an explicit domain binding (key u'd') are linked
    # directly; otherwise the property's usage classes are intersected with
    # the perm_column's cc classes to hypothesise a domain.
    rlist=sparqlQuerypy.findProperty2(self.label1,self.label2)
    print '------------------'
    log.write('----------------\n')
    log.write(str(datetime.datetime.now())+'\n')
    log.write(self.label1+self.label2)
    print self.label1,self.label2#,rlist
    cache=[]  # property URIs whose usage classes were already fetched
    propertyUsage=[1]  # NOTE(review): stale value is reused on a cache hit
    for r in rlist:
        if u'd' in r.keys():
            # explicit domain returned by the endpoint
            self.addProperty(r['p']['value'])
            rel_data=Neo4jDrive.insertNodeAndRelationship(r['p']['value'],"domain",r['d']['value'])[0]
            rel_data['name']='domain'
            rel_data.push()
        else:
            # build a SPARQL value list "(<c1>,<c2>,...)" from candidate cc classes
            ccClasses=Neo4jDrive.findCCNodes(self.columnNames[self.perm_column])
            buildString="("
            for i in ccClasses:
                buildString+='<'+i+'>,'
            buildString=buildString[:-1]
            buildString+=")"
            if r['p']['value'] not in cache:
                propertyUsage=sparqlQuerypy.findPropertyClassesSecond(r['p']['value'],buildString)
                cache+=[r['p']['value']]
            print len(propertyUsage),r['p']['value']
            if len(propertyUsage)<15000:  # skip overly generic properties
                # intersect the property's usage classes with the cc candidates
                for item in (set([k['r']['value'] for k in propertyUsage]) & set(ccClasses)):
                    self.addProperty(r['p']['value'])
                    rel_data=Neo4jDrive.insertNodeAndRelationship(r['p']['value'],"domain",item)[0]
                    rel_data['name']="domain"
                    rel_data.push()
                    node=Neo4jDrive.findNodeByName(item)
                    node.properties['hyp']='yes'
                    node.properties['type']='cc'
                    node.push()
                    self.incrementDms(rel_data)
    #for each table we have to put a score on the link between the what and what? The property and its domain? But then how is the score calculated? Is it number of columns in the table by total in that table or is it completely unique?
def run(self):
    """Link this thread's column node to every class reachable bottom-up
    from the item and score each 'cc' relationship with the item's
    support fraction."""
    support = self.support
    totalNumberOfValues = self.totalNumberOfValues
    column = self.column
    columnNames = self.columnNames
    item = self.item
    rlist = sparqlQuerypy.findBottomUp(item)
    for r in rlist:
        rel_data = Neo4jDrive.insertNodeAndRelationship(columnNames[column], "cc", r[2])
        node = Neo4jDrive.findNodeByName(r[2])
        # identity test for the missing-property sentinel (was `== None`)
        if node.properties["incoming"] is None:
            node.properties["incoming"] = 1
        else:
            node.properties["incoming"] += 1
        node.properties["type"] = "type"
        node.push()
        rel_data = rel_data[0]
        rel_data.properties["rel_class"] = "cc"
        # support fraction of this item within the column (float division)
        rel_data.properties["support"] = support[item] / (totalNumberOfValues * 1.0)
        rel_data.push()
def addProperty(self,p): print self.a, p rel_data=Neo4jDrive.insertNodeAndRelationship(self.a,"cp",p)[0] rel_data.properties['type']='cp' self.hyplock.acquire() hypothesisSet.add(p) self.hyplock.release() node=Neo4jDrive.findNodeByName(p) if rel_data.properties['incoming']==None: rel_data.properties['incoming']=1 rel_data.properties['dms']=1/(self.size*1.0) pr=p for j in range(len(pr)-1,0,-1): if pr[j]=='/': pr=pr[j+1:] break rel_data.properties['lms']=self.levenshtein(self.a,pr) else: rel_data.properties['incoming']+=1 rel_data.properties['dms']=node.properties['incoming']/(self.size*1.0) rel_data.push() node.properties['type']='property' node.properties['hyp']='yes' node.push()