def main():
    """Load each CSV table into Neo4j, interleave shuffled rows from all
    tables into a single work list, and hand it to run().

    Rows are drawn round-robin in chunks of `sample` (module-level) from
    each table so processing alternates between tables.  Each entry of
    `data` is [row, table_index] where table_index indexes `tables`.
    """
    data = []
    tables = ["StatesandCapitals.csv", "RiversandSourceState.csv"]
    size = []
    # (table_index, rows) pairs.  Keeping the original index next to the
    # rows fixes two defects in the old code: it removed entries from the
    # list it was iterating, and it used enumerate()'s position as the
    # table index, which drifted after a removal so remaining rows were
    # tagged with the wrong table.
    pending = []
    for l, nameOfFile in enumerate(tables):
        # Push the table itself as a node tagged 'table'.
        Neo4jDrive.insertNode(nameOfFile)
        node = Neo4jDrive.findNodeByName(nameOfFile)
        node.properties['type'] = 'table'
        node.push()
        rows = CSVRead.readCSV(nameOfFile, firstRow=False, choice=[0, 1])[1:]
        size.append(len(rows))
        random.shuffle(rows)  # randomise sampling order
        pending.append((l, rows))
    i = k = 0
    while pending:
        # Iterate a snapshot so removing exhausted tables is safe.
        for entry in list(pending):
            l, rows = entry
            end = k + sample
            s = sample
            if k + sample > len(rows):
                # Last (short) chunk of this table.
                s = sample - (end - len(rows))
                end = len(rows)
            data[i:i + s] = [[row, l] for row in rows[k:end]]
            i += s
            if k + sample > len(rows):
                pending.remove(entry)
        k += sample
    run(data, tables, size)
def run(self):
    """Link this worker's column to every class reached bottom-up from
    its data item, tag each class node, and score the edge by the item's
    support fraction."""
    item = self.item
    col_name = self.columnNames[self.column]
    for triple in sparqlQuerypy.findBottomUp(item):
        cls = triple[2]
        edge = Neo4jDrive.insertNodeAndRelationship(col_name, "cc", cls)
        cls_node = Neo4jDrive.findNodeByName(cls)
        # Maintain the class node's incoming-link counter.
        incoming = cls_node.properties['incoming']
        cls_node.properties['incoming'] = 1 if incoming is None else incoming + 1
        cls_node.properties['type'] = 'type'
        cls_node.push()
        edge = edge[0]
        edge.properties['rel_class'] = 'cc'
        # Support fraction of this item within the column.
        edge.properties['support'] = self.support[item] / (self.totalNumberOfValues * 1.0)
        edge.push()
def main():
    """Register the CSV file and its columns in Neo4j, then score every
    distinct data item of every column with an itemThread.

    NOTE(review): `nameOfFile`, `CSVRead`, `Neo4jDrive` and `itemThread`
    resolve at module level -- confirm `nameOfFile` is defined before
    this runs.
    """
    # Table node plus one "Column" relationship per header cell.
    Neo4jDrive.insertNode(nameOfFile)
    columnNames = CSVRead.readCSV(nameOfFile,
                                  firstRow=True,
                                  choice=[0, 1, 2, 3, 4])
    for name in columnNames:
        Neo4jDrive.insertNodeAndRelationship(nameOfFile, "Column", name)
    #support=CSVRead.getSupport(nameOfFile,0)
    #totalNumberOfValues=CSVRead.numberOfItems(support)
    # The column count is recovered from the graph rather than the header.
    for column in range(
            sum([
                1 for _ in Neo4jDrive.findRelationshipsOfNode(
                    nameOfFile, "Column")
            ])):
        support = CSVRead.getSupport(nameOfFile, column)
        totalNumberOfValues = CSVRead.numberOfItems(support)
        #print i.end_node
        #cNode=Neo4jDrive.findNodeByName(columnNames[column])
        for item in support.keys():
            # start()/join() back to back: threaded API, serial execution.
            k = itemThread(item, columnNames, column, support,
                           totalNumberOfValues)
            k.start()
            k.join()
def lmsScore(self):
    """Collect LMS candidate domains for every column and grow the shared
    hypothesis set.

    For each column, a property is kept when its range -- explicit
    rdfs:range, or (when absent) the declared types of its objects --
    overlaps the column's cc classes.  Its domains -- explicit
    rdfs:domain, or the subject's types when absent -- are recorded under
    relationships[column]['lms'][property]['d'] and added to the
    hypothesis when they are known cc classes.

    Fixes over the original: balanced the two broken bracket groups,
    corrected the `ccClassesofColumn` case mismatch, iterated
    self.columnNames instead of enumerate() over an undefined global, and
    replaced the always-False `set & set == 0` test with a length check
    on the explicit-range path only.
    """
    relationships = self.relationships
    ccClasses = set(Neo4jDrive.findAllCCNodes())
    hypothesis = self.hypothesis
    for column in self.columnNames:
        rlist = sparqlQuerypy.findPropertyClassesThird(column)
        relationships[column]['lms'] = {}
        ccClassesOfColumn = set(Neo4jDrive.findCCNodes(column))
        for r in rlist:
            rangeList = sparqlQuerypy.findRange(r['s']['value'])
            if len(rangeList) == 0:
                # No rdfs:range: fall back to the declared types of the
                # property's objects.  TODO confirm the binding variable
                # ('t') returned by findTypeOfObject.
                objTypeList = set([k['t']['value'] for k in
                                   sparqlQuerypy.findTypeOfObject(r['t']['value'])])
                if len(objTypeList & ccClassesOfColumn) == 0:
                    continue  # object types not among the column's cc classes
            elif len(set(rangeList) & ccClassesOfColumn) == 0:
                continue  # explicit range not among the column's cc classes
            domainList = sparqlQuerypy.findDomain(r['t']['value'])
            if len(domainList) == 0:
                # No rdfs:domain: fall back to the subject's types.
                domainList = [k['t']['value'] for k in
                              sparqlQuerypy.findTypeOfSubject(r['s']['value'])]
            for domain in domainList:
                prop = r['s']['value']
                if prop not in relationships[column]['lms'].keys():
                    relationships[column]['lms'][prop] = {}
                if domain in hypothesis:
                    # Keep the first recorded domain for this property.
                    if 'd' not in relationships[column]['lms'][prop].keys():
                        relationships[column]['lms'][prop]['d'] = {'name': domain}
                else:
                    if domain in ccClasses:
                        hypothesis.add(domain)
                        relationships[column]['lms'][prop]['d'] = {'name': domain}
def run(self):
    """Top-down pass for column self.a: for every property binding that
    lacks a range ('r'), query which of the column's cc classes the
    property is actually used with, then record each matching domain --
    first against the global hypothesis set, then against this thread's
    allCC snapshot."""
    for binding in sparqlQuerypy.findPropertyClassesFirst(self.a):
        if u'r' in binding.keys():
            continue  # property already carries a range binding
        prop = binding['p']['value']
        ccClasses = Neo4jDrive.findCCNodes(self.a)
        # SPARQL values list of the form "(<uri>,<uri>)".
        parts = ['<' + c + '>' for c in ccClasses]
        buildString = ("(" + ",".join(parts) + ")") if parts else ")"
        propertyUsage = sparqlQuerypy.findPropertyClassesSecond(prop, buildString)
        domains = set([k['d']['value'] for k in propertyUsage])
        for candidateSet in (hypothesisSet, set(self.allCC)):
            for item in (domains & candidateSet):
                self.addProperty(prop)
                Neo4jDrive.insertNodeAndRelationship(prop, 'd', item)
def run(data, tables, size):
    """Main scoring pipeline over the interleaved `data` work list.

    data   -- list of [row, table_index] pairs produced by main()
    tables -- CSV file names; table_index indexes into this list
    size   -- per-table row counts

    Phase 1 pushes column nodes and runs a ccThread per cell, phase 2 a
    dmsThread per ordered column pair per row, phase 3 a topDownThread
    per column.  Every thread is join()ed immediately, so the whole run
    is effectively serial.
    """
    support=[[]]
    columnNames=[]
    for i,nameOfFile in enumerate(tables):
        columnNames+=[CSVRead.readCSV(nameOfFile,firstRow=True, choice=[0,1])]
        columnNames[i]=[c.strip() for c in columnNames[i]]
        for j,name in enumerate(columnNames[i]):
            # Column node plus table->column edge, both tagged 'Column'.
            z=Neo4jDrive.insertNodeAndRelationship(nameOfFile,"Column",name)[0]
            node=Neo4jDrive.findNodeByName(name)
            node.properties['type']='Column'
            node.push()
            z.properties['type']="Column"
            z.push()
            support[i]+=[CSVRead.getSupport(nameOfFile,j)]
        support+=[[]]
    support=support[:-1]  # drop the trailing empty bucket added by the loop
    # NOTE(review): `nameOfFile` here is whatever the loop left behind
    # (the last table) -- confirm this is intended.
    totalNumberOfValues=CSVRead.getSize(nameOfFile,0)
    hyplock=Lock()
    stypelock=Lock()
    # Phase 1: candidate-class scoring per cell.
    for itemPiece in data:
        indexOfFile=itemPiece[1]
        item=itemPiece[0]
        for column in range(len(columnNames[indexOfFile])):
            #support=CSVRead.getSupport(nameOfFile,column)
            #totalNumberOfValues=CSVRead.numberOfItems(support)
            k=ccThread(item[column],columnNames[indexOfFile],column,support[indexOfFile],size[indexOfFile])
            k.start()
            k.join()
    # Phase 2: domain/property scoring per ordered column pair.
    for itemPiece in data:
        indexOfFile=itemPiece[1]
        item=itemPiece[0]
        for column in range(len(columnNames[indexOfFile])):
            #support=CSVRead.getSupport(nameOfFile,column)
            #totalNumberOfValues=CSVRead.numberOfItems(support)
            for perm_column in range(len(columnNames[indexOfFile])):
                if perm_column!=column:
                    k=dmsThread(item[column],item[perm_column],size[indexOfFile],columnNames[indexOfFile],column,perm_column)
                    k.start()
                    k.join()
    # Phase 3: top-down refinement per column, sharing the two locks.
    allCC=set(Neo4jDrive.findAllCCNodes())
    for s,c in enumerate(columnNames):
        for column in c:
            k=topDownThread(column,hyplock,stypelock,allCC,size[s])
            k.start()
            k.join()
def ccScores(self):
    """Compute candidate-class (cc) scores.

    Pass 1 walks every data value of every column bottom-up through the
    ontology, counting how many values link to each class and storing
    the fraction under relationships[column][class]['cc'].  Pass 2
    pushes those edges to Neo4j as 'cc' relationships carrying 'fk';
    pass 3 sets each class node's CCS to the mean 'fk' over its incoming
    cc edges.
    """
    data=self.data
    columnNames=self.columnNames
    totalSize=self.totalSize
    relationships=self.relationships
    size=len(data)
    bitmap={}
    for i,column in enumerate(columnNames):
        relationships[column]={}
        bitmap[column]={} #this is a dictionary which is a set of flags per data value remembering if the increment already happened.
        for element in data:
            item=element[i]
            rlist=sparqlQuerypy.findBottomUp(item.strip())
            print 'number of nodes for', item.strip(), " is ", len(rlist)
            bitmap[column][item]={}
            for r in rlist:
                if r[0] not in bitmap[column][item].keys():
                    bitmap[column][item][r[0]]=0
                # NOTE(review): the 0/1 flag is written but never tested
                # before the increment below, so a class appearing twice in
                # rlist for the same value is counted twice -- confirm
                # whether findBottomUp can return duplicates.
                if r[0] not in relationships[column].keys():
                    relationships[column][r[0]]={}
                    relationships[column][r[0]]['name']='cc'
                if 'incoming' not in relationships[column][r[0]].keys():
                    relationships[column][r[0]]['incoming']=1
                    relationships[column][r[0]]['cc']=1.0/totalSize
                else:
                    relationships[column][r[0]]['incoming']+=1
                    relationships[column][r[0]]['cc']=relationships[column][r[0]]['incoming']*1.0/totalSize
                bitmap[column][item][r[0]]=1
    classSet=set() # A set to save all the possible cc classes for ease of retrieval later and to streamline it.
    for column in columnNames: #Loop to push the relations and nodes to Neo4j
        for classes in relationships[column].keys():
            classSet.add(classes)
            rel_data=Neo4jDrive.insertNodeAndRelationship(column,'cc',classes)[0]
            rel_data.properties['rel_class']='cc'
            rel_data.properties['fk']=relationships[column][classes]['cc']
            rel_data.push()
    for classes in classSet: #Loop to update the CCS score for each class after the previous loop is over. CCS=sum(fk)/no(fk) for the node.
        print classes
        cummulative=0 # The accumulator
        linkNumbers=0 # The denominator
        for link in Neo4jDrive.findIncomingCCLinks(classes): #loop to find incoming cc edges.
            cummulative+=link[0].properties['fk']
            linkNumbers+=1
        node=Neo4jDrive.findNodeByName(classes)
        node.properties['ccs']=cummulative*1.0/linkNumbers
        node.properties['type']='cc'
        node.push()
def run(self): support=self.support totalNumberOfValues=self.totalNumberOfValues*1.0 column=self.column columnNames=self.columnNames item=self.item rlist=sparqlQuerypy.findBottomUp(item.strip()) print 'number of nodes for', item.strip(), " is ", len(rlist) log.write('number of nodes for'+str( item.strip())+ " is "+ str(len(rlist))+'\n') flag=0 for r in rlist: rel_data=Neo4jDrive.insertNodeAndRelationship(columnNames[column],"cc",r[2]) rel_data=rel_data[0] node=Neo4jDrive.findNodeByName(r[2]) if r[2]=='http://dbpedia.org/ontology/PopulatedPlace': print columnNames[column], 'Happening' print 'potato',rel_data if rel_data.properties['incoming']==None: #find out why this is not happenings rel_data.properties['incoming']=1 rel_data.properties['ccs']=1/totalNumberOfValues rel_data.push() #print 'tomato',rel_data else: if flag==0: rel_data.properties['incoming']+=1 rel_data.push() rel_data.properties['ccs']=node.properties['incoming']/totalNumberOfValues flag=1 node.properties['type']='cc' node.properties['ccs']=0 numberOfLinks=0 for link in Neo4jDrive.findIncomingCCLinks(r[2]): node.properties['ccs']+=link[0].properties['ccs'] numberOfLinks+=1 if numberOfLinks>0: node.properties['ccs']/=numberOfLinks node.push() rel_data.properties['rel_class'] = 'cc' #rel_data.properties['ccs']=node.proper/(totalNumberOfValues*1.0) rel_data.push()
def run(self): support=self.support totalNumberOfValues=self.totalNumberOfValues column=self.column columnNames=self.columnNames item=self.item node=Neo4jDrive.findNodeByName(item) if node== None: Neo4jDrive.insertNodeAndRelationship(columnNames[column],'dataItems',item) node=Neo4jDrive.findNodeByName(item) node.properties['fvalue']=support[item] node.push() rlist=sparqlQuerypy.findBottomUp(item) for r in rlist: try: rel_data=Neo4jDrive.insertNodeAndRelationship(item,"cc",r[0]) rel_data1=Neo4jDrive.insertNodeAndRelationship(r[0],"dd",r[2]) node=node=Neo4jDrive.findNodeByName(r[2]) if node.properties['incoming']==None: node.properties['incoming']=1 else: node.properties['incoming']+=1 node.properties['type']='type' node.push() except : print columnNames[column],'cc',r[0] rel_data=rel_data[0] rel_data.properties['rel_class'] = 'cc' rel_data.properties['support']=support[item]/(totalNumberOfValues*1.0) rel_data.push()
def main():
    """Register `nameOfFile` and its columns in Neo4j, then score every
    distinct data item of every column via an itemThread (each thread is
    joined immediately, so the work runs serially)."""
    Neo4jDrive.insertNode(nameOfFile)
    columnNames = CSVRead.readCSV(nameOfFile, firstRow=True,
                                  choice=[0, 1, 2, 3, 4])
    for name in columnNames:
        Neo4jDrive.insertNodeAndRelationship(nameOfFile, "Column", name)
    # Recover the column count from the graph we just populated.
    numberOfColumns = sum(
        1 for _ in Neo4jDrive.findRelationshipsOfNode(nameOfFile, "Column"))
    for column in range(numberOfColumns):
        support = CSVRead.getSupport(nameOfFile, column)
        totalNumberOfValues = CSVRead.numberOfItems(support)
        for item in support.keys():
            worker = itemThread(item, columnNames, column, support,
                                totalNumberOfValues)
            worker.start()
            worker.join()
def run(self):
    """dmsThread worker: find properties connecting this thread's two
    data values (label1, label2).  Bindings carrying an explicit domain
    ('d') are linked directly; the rest are matched against the cc
    classes of the permuted column via a usage query, skipping overly
    generic properties (>= 15000 usages).
    """
    rlist=sparqlQuerypy.findProperty2(self.label1,self.label2)
    print '------------------'
    log.write('----------------\n')
    log.write(str(datetime.datetime.now())+'\n')
    log.write(self.label1+self.label2)
    print self.label1,self.label2#,rlist
    cache=[]  # properties whose usage has already been queried
    # Placeholder so the cache-hit path reuses the previous result.
    propertyUsage=[1]
    for r in rlist:
        if u'd' in r.keys():
            # Explicit rdfs:domain binding.
            self.addProperty(r['p']['value'])
            rel_data=Neo4jDrive.insertNodeAndRelationship(r['p']['value'],"domain",r['d']['value'])[0]
            rel_data['name']='domain'
            rel_data.push()
        else:
            # No domain: intersect the property's observed subject
            # classes with the permuted column's cc classes.
            ccClasses=Neo4jDrive.findCCNodes(self.columnNames[self.perm_column])
            # SPARQL values list "(<uri>,<uri>)".
            buildString="("
            for i in ccClasses:
                buildString+='<'+i+'>,'
            buildString=buildString[:-1]
            buildString+=")"
            if r['p']['value'] not in cache:
                propertyUsage=sparqlQuerypy.findPropertyClassesSecond(r['p']['value'],buildString)
                cache+=[r['p']['value']]
            print len(propertyUsage),r['p']['value']
            if len(propertyUsage)<15000:  # skip overly generic properties
                for item in (set([k['r']['value'] for k in propertyUsage]) & set(ccClasses)):
                    self.addProperty(r['p']['value'])
                    rel_data=Neo4jDrive.insertNodeAndRelationship(r['p']['value'],"domain",item)[0]
                    rel_data['name']="domain"
                    rel_data.push()
                    node=Neo4jDrive.findNodeByName(item)
                    node.properties['hyp']='yes'
                    node.properties['type']='cc'
                    node.push()
                    self.incrementDms(rel_data)
    #for each table we have to put a score on the link between the what and what? The property and its domain? But then how is the score calculated? Is it number of columns in the table by total in that table or is it completely unique?
def addProperty(self, p):
    """Record property p as evidence for this column pair.

    Adds p to the global hypothesis set, bumps the property node's
    'dcsincoming' counter and DCS score (dcsincoming / size), and
    maintains the column--p-->perm_column relationship's 'count' and
    DMS score (count / size).

    Fix: the original guarded the init branch on
    rel.properties['propCount'], a key that is never written anywhere,
    so the branch ran on every call and 'count' was stuck at 1; the
    guard now tests the 'count' key this function actually maintains.
    """
    Neo4jDrive.insertNodeAndRelationship(self.columnNames[self.column], "property", p)
    hypothesisSet.add(p)
    node = Neo4jDrive.findNodeByName(p)
    if node.properties['dcsincoming'] == None:
        node.properties['dcsincoming'] = 1
        node.properties['dcs'] = 1 / (self.size * 1.0)
    else:
        node.properties['dcsincoming'] += 1
        node.properties['dcs'] = node.properties['dcsincoming'] / (self.size * 1.0)
    node.properties['type'] = 'property'
    node.push()
    rel = Neo4jDrive.insertRelationship(self.columnNames[self.column], p,
                                        self.columnNames[self.perm_column])[0]
    if rel.properties['count'] == None:
        rel.properties['type'] = 'property_rel'
        rel.properties['name'] = p
        rel.properties['count'] = 1
        rel.properties['dms'] = rel.properties['count'] / (self.size * 1.0)
    else:
        rel.properties['count'] += 1
        rel.properties['dms'] = rel.properties['count'] / (self.size * 1.0)
    rel.push()
def main():
    """Load both CSV tables into Neo4j (table node, column nodes, data),
    then repeatedly score successive `sample`-sized slices of each table
    with a runThread until the convergence cutoff fires."""
    columnNames=[]
    colNam={}
    csvitems={}
    size={}
    tables=["StatesandCapitals.csv","RiversandSourceState.csv"]
    for i, nameOfFile in enumerate(tables): #pushes each table as a node into the graph along with the columns
        Neo4jDrive.insertNode(nameOfFile)
        node=Neo4jDrive.findNodeByName(nameOfFile)
        node.properties['type']='table'
        node.push()
        #end of push
        columnNames+=[CSVRead.readCSV(nameOfFile,firstRow=True, choice=[0,1])]
        columnNames[i]=[c.strip() for c in columnNames[i]]
        colNam[nameOfFile]=[c.strip() for c in columnNames[i]]
        for j,name in enumerate(columnNames[i]):
            # Column node plus table->column edge, both tagged 'Column'.
            z=Neo4jDrive.insertNodeAndRelationship(nameOfFile,"Column",name)[0]
            node=Neo4jDrive.findNodeByName(name)
            node.properties['type']='Column'
            node.push()
            z.properties['type']="Column"
            z.push()
        #end of the Column Pushing
        csvitems[nameOfFile]=CSVRead.readCSV(nameOfFile,firstRow=False,choice=[0,1])[1:] #stores each data set in a dictionary of lists
        size[nameOfFile]=[len(csvitems[nameOfFile])] #stores the sizes of the lists in a dictionary called size
        random.shuffle(csvitems[nameOfFile]) #shuffles for randomness
    relationships={}
    iterations=1
    convergence=False #the test flag for whether convergence has been reached
    while(not convergence):
        # One `sample`-row slice per table per round, scored serially.
        for table in tables:
            start=sample*(iterations-1)
            end=sample*iterations
            rt=runThread(table, csvitems[table][start:end], colNam[table],end,relationships)
            rt.start()
            rt.join()
        iterations+=1
        # NOTE(review): `end` is the slice bound of the *last* table of
        # the round; the >5 cutoff looks like a temporary test limit --
        # confirm before relying on it.
        if end>5:convergence=True
def run(self):
    """Attach a 'cc' edge from this worker's column to every class
    reached bottom-up from its item, bumping each class node's incoming
    counter and scoring the edge by the item's support fraction."""
    col = self.columnNames[self.column]
    for r in sparqlQuerypy.findBottomUp(self.item):
        result = Neo4jDrive.insertNodeAndRelationship(col, "cc", r[2])
        target = Neo4jDrive.findNodeByName(r[2])
        # Count how many cc links the class has received so far.
        if target.properties["incoming"] is None:
            target.properties["incoming"] = 1
        else:
            target.properties["incoming"] += 1
        target.properties["type"] = "type"
        target.push()
        edge = result[0]
        edge.properties["rel_class"] = "cc"
        edge.properties["support"] = self.support[self.item] / (self.totalNumberOfValues * 1.0)
        edge.push()
def addProperty(self,p): print self.a, p rel_data=Neo4jDrive.insertNodeAndRelationship(self.a,"cp",p)[0] rel_data.properties['type']='cp' self.hyplock.acquire() hypothesisSet.add(p) self.hyplock.release() node=Neo4jDrive.findNodeByName(p) if rel_data.properties['incoming']==None: rel_data.properties['incoming']=1 rel_data.properties['dms']=1/(self.size*1.0) pr=p for j in range(len(pr)-1,0,-1): if pr[j]=='/': pr=pr[j+1:] break rel_data.properties['lms']=self.levenshtein(self.a,pr) else: rel_data.properties['incoming']+=1 rel_data.properties['dms']=node.properties['incoming']/(self.size*1.0) rel_data.push() node.properties['type']='property' node.properties['hyp']='yes' node.push()
# Export the scored domain hypotheses from Neo4j to a CSV report.
# NOTE(review): this snippet appears truncated -- the computed `overall`
# is never written out here; confirm against the fuller variant.
import Neo4jDrive
import CSVWrite
from py2neo import Graph
import csv
import math

# Credentials redacted in source.
graph = Graph("http://*****:*****@localhost:7474/db/data/")
i = 5
number = 6
with open('../csv/eggs%s.csv' % number, 'wb') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='{')
    writer.writerow(
        ['Domain Class', 'CCS Score', 'DCS Score', 'Table', 'Overall Score'])
    theTable = []
    domains = {}
    numberOfColumns = Neo4jDrive.findTotalNumberOfColumns()[0][0]
    # Every node flagged as a hypothesis, with its stored ccs and DCS.
    for record in graph.cypher.execute(
            "MATCH (n) where n.hyp='yes' return n.name, n.ccs, n.DCS"):
        domain = record[0]
        ccs = record[1]
        # DCS recomputed as (columns mentioning the domain) / (all columns).
        dcs = (Neo4jDrive.findNumberOfColumns(domain)[0][0] *
               1.0) / numberOfColumns
        r = []
        table = Neo4jDrive.tableMembership(domain)
        if ccs != None and dcs != None and ccs != 0 and dcs != 0:
            # Overall = magnitude * entropy (balance of ccs vs dcs) * table share.
            csvs = math.sqrt((ccs * ccs) + (dcs * dcs))
            entropy = -(ccs) / (ccs + dcs) * math.log(
                ccs / (ccs + dcs)) - (dcs) / (ccs + dcs) * math.log(dcs /
                                                                    (ccs + dcs))
            overall = csvs * entropy * table
def main():
    """Register the CSV file and its columns, then for every column walk
    each distinct value bottom-up, recording item--cc-->class and
    class--dd-->superclass edges and support scores inline (no threads).
    """
    Neo4jDrive.insertNode(nameOfFile)
    columnNames=CSVRead.readCSV(nameOfFile,firstRow=True, choice=[0,1,2,3,4])
    for name in columnNames:
        Neo4jDrive.insertNodeAndRelationship(nameOfFile,"Column",name)
    #support=CSVRead.getSupport(nameOfFile,0)
    #totalNumberOfValues=CSVRead.numberOfItems(support)
    # Column count recovered from the graph rather than the CSV header.
    for column in range(sum([1 for _ in Neo4jDrive.findRelationshipsOfNode(nameOfFile,"Column")])):
        support=CSVRead.getSupport(nameOfFile,column)
        totalNumberOfValues=CSVRead.numberOfItems(support)
        #print i.end_node
        #cNode=Neo4jDrive.findNodeByName(columnNames[column])
        for item in support.keys():
            node=Neo4jDrive.findNodeByName(item)
            if node== None:
                Neo4jDrive.insertNodeAndRelationship(columnNames[column],'dataItems',item)
                node=Neo4jDrive.findNodeByName(item)
            node.properties['fvalue']=support[item]  # raw frequency of the value
            node.push()
            rlist=sparqlQuerypy.findBottomUp(item)
            for r in rlist:
                try:
                    rel_data=Neo4jDrive.insertNodeAndRelationship(item,"cc",r[0])
                    rel_data1=Neo4jDrive.insertNodeAndRelationship(r[0],"dd",r[2])
                    node=node=Neo4jDrive.findNodeByName(r[2])
                    if node.properties['incoming']==None:
                        node.properties['incoming']=1
                    else:
                        node.properties['incoming']+=1
                    node.properties['type']='type'
                    node.push()
                except :
                    # NOTE(review): bare except; and when it fires the
                    # code below still runs with an unbound or stale
                    # rel_data -- likely needs a `continue` here.
                    print columnNames[column],'cc',r[0]
                rel_data=rel_data[0]
                rel_data.properties['rel_class'] = 'cc'
                rel_data.properties['support']=support[item]/(totalNumberOfValues*1.0)
                rel_data.push()
def dmsScore(self):
    """Compute domain-match (DMS) scores for every ordered column pair.

    For each row, properties connecting element[i] to element[j] are
    counted (deduplicated per row via `bitmap`); a property's dms is
    count/totalSize.  Properties with an explicit domain binding ('d')
    are recorded directly; the rest have their domains inferred by
    intersecting their observed subject classes with column2's cc
    classes.  A final pass uploads the column1--property-->column2,
    column2--cp-->property and property--domain-->domain edges to Neo4j.

    NOTE(review): reconstructed from a whitespace-mangled source -- the
    nesting chosen below is the reading that avoids KeyErrors; confirm
    against the original history.
    """
    data=self.data
    columnNames=self.columnNames
    totalSize=self.totalSize
    relationships=self.relationships
    size=len(data)
    cache=[]
    bitmap={}
    for i,column1 in enumerate(columnNames):
        if column1 not in relationships.keys():
            relationships[column1]={}
        if column1 not in bitmap.keys():
            bitmap[column1]={}
        for j,column2 in enumerate(columnNames):
            if column2 not in relationships.keys():
                relationships[column2]={}
            if i==j:
                continue
            for element in data:
                print '--------------------'
                print element[i],'-->',element[j]
                item=(element[i],element[j])
                rlist=sparqlQuerypy.findProperty2(element[i].strip(),element[j].strip())
                cache=[]  # usage-query cache, reset per row
                for r in rlist:
                    if column2 not in relationships[column1].keys():
                        relationships[column1][column2]={}
                    if column2 not in bitmap[column1].keys():
                        bitmap[column1][column2]={}
                    if item not in bitmap[column1][column2]:
                        bitmap[column1][column2][item]={}
                    # NOTE(review): the flag is reset to 0 for every
                    # binding, which appears to defeat the per-row
                    # deduplication when a property repeats in rlist --
                    # TODO confirm intent.
                    bitmap[column1][column2][item][r['p']['value']]=0
                    if r['p']['value'] not in relationships[column1][column2].keys():
                        relationships[column1][column2][r['p']['value']]={}
                    if u'd' in r.keys():
                        # Explicit domain binding.
                        print 'u d is in r.keys()'
                        relationships[column1][column2][r['p']['value']]['name']='property'
                        if 'count' not in relationships[column1][column2][r['p']['value']].keys():
                            relationships[column1][column2][r['p']['value']]['count']=1.0
                        if bitmap[column1][column2][item][r['p']['value']]==0:
                            relationships[column1][column2][r['p']['value']]['count']+=1
                            bitmap[column1][column2][item][r['p']['value']]=1
                        print relationships[column1][column2][r['p']['value']]['count']
                        relationships[column1][column2][r['p']['value']]['dms']=relationships[column1][column2][r['p']['value']]['count']/totalSize
                        if r['p']['value'] not in relationships[column2].keys():
                            relationships[column2][r['p']['value']]={}
                            relationships[column2][r['p']['value']]['name']='cp'
                        if r['p']['value'] not in relationships.keys():
                            relationships[r['p']['value']]={}
                        if r['d']['value'] not in relationships[r['p']['value']].keys():
                            relationships[r['p']['value']][r['d']['value']]={'name':'domain'}
                        #-----------------TODO: add to hypothesis-------------#
                    else:
                        # No domain: infer from column2's cc classes.
                        ccClasses=Neo4jDrive.findCCNodes(column2)
                        # SPARQL values list "(<uri>,<uri>)".
                        buildString="("
                        for ii in ccClasses:
                            buildString+='<'+ii+'>,'
                        buildString=buildString[:-1]
                        buildString+=")"
                        if r['p']['value'] not in cache:
                            propertyUsage=sparqlQuerypy.findPropertyClassesSecond(r['p']['value'],buildString)
                            cache+=[r['p']['value']]
                        #bitmap[column1][column2][item][r['p']['value']]=0
                        for domain in (set([k['r']['value'] for k in propertyUsage]) & set(ccClasses)):
                            relationships[column1][column2][r['p']['value']]['name']='property'
                            if 'count' not in relationships[column1][column2][r['p']['value']].keys():
                                relationships[column1][column2][r['p']['value']]['count']=1.0
                            print "item and r['p']['value'], is", item,r['p']['value']
                            if bitmap[column1][column2][item][r['p']['value']]==0:
                                relationships[column1][column2][r['p']['value']]['count']+=1
                                bitmap[column1][column2][item][r['p']['value']]=1
                            print relationships[column1][column2][r['p']['value']]['count']
                            relationships[column1][column2][r['p']['value']]['dms']=relationships[column1][column2][r['p']['value']]['count']/totalSize*1.0
                            if r['p']['value'] not in relationships[column2].keys():
                                relationships[column2][r['p']['value']]={}
                                relationships[column2][r['p']['value']]['name']='cp'
                            if r['p']['value'] not in relationships.keys():
                                relationships[r['p']['value']]={}
                            # NOTE(review): guard tests `item` but the
                            # assignment keys on `domain` -- looks like a
                            # copy/paste slip; confirm intent.
                            if item not in relationships[r['p']['value']].keys():
                                relationships[r['p']['value']][domain]={'name':'domain'}
            bitmap[column1][column2]=None  # free the per-pair flags
    #-------------------------add to Hypothesis----------------------#
    #-----------------Uploading to Neo4j----------------------------#
    for i,column1 in enumerate(columnNames):
        for j,column2 in enumerate(columnNames):
            if column1==column2:
                continue
            if column2 not in relationships[column1].keys():
                continue
            for rel in relationships[column1][column2].keys():
                # column1 --rel--> column2 edge carrying the dms score.
                rel_data=Neo4jDrive.insertNodeAndRelationship(column1,rel,column2)[0]
                rel_data.properties['type']='property'
                rel_data.properties['name']=rel
                if 'dms' in relationships[column1][column2][rel].keys():
                    rel_data.properties['dms']=relationships[column1][column2][rel]['dms']
                else:
                    rel_data.properties['dms']=0
                rel_data.push()
                rel_data=Neo4jDrive.insertNodeAndRelationship(column2,'cp',rel)[0]
                rel_data.properties['type']='cp'
                rel_data.push()
                for domain in relationships[rel].keys():
                    rel_data=Neo4jDrive.insertNodeAndRelationship(rel,'domain',domain)[0]
                    rel_data.properties['type']='domain'
                    rel_data.push()
# Export the scored domain hypotheses from Neo4j to ../csv/eggs6.csv.
import Neo4jDrive
import CSVWrite
from py2neo import Graph
import csv
import math

# Credentials redacted in source.
graph = Graph("http://*****:*****@localhost:7474/db/data/")
i=5
number=6
with open('../csv/eggs%s.csv'%number,'wb') as csvfile:
    writer=csv.writer(csvfile, delimiter=',',quotechar='{')
    writer.writerow(['Domain Class','CCS Score','DCS Score', 'Table','Overall Score'])
    theTable=[]
    domains={}
    numberOfColumns=Neo4jDrive.findTotalNumberOfColumns()[0][0]
    # Every node flagged as a hypothesis, with its stored ccs and DCS.
    for record in graph.cypher.execute("MATCH (n) where n.hyp='yes' return n.name, n.ccs, n.DCS"):
        domain=record[0]
        ccs=record[1]
        # DCS recomputed as (columns mentioning the domain) / (all columns).
        dcs=(Neo4jDrive.findNumberOfColumns(domain)[0][0]*1.0)/numberOfColumns
        r=[]
        table=Neo4jDrive.tableMembership(domain)
        if ccs!=None and dcs!=None and ccs!=0 and dcs!=0:
            # Overall = magnitude * entropy (balance of ccs vs dcs) * table share.
            csvs=math.sqrt((ccs*ccs)+(dcs*dcs))
            entropy=-(ccs)/(ccs+dcs)*math.log(ccs/(ccs+dcs))-(dcs)/(ccs+dcs)*math.log(dcs/(ccs+dcs))
            overall=csvs*entropy*table
        else:
            overall='-'  # sentinel for unscorable rows
        domains[domain]=overall
        r.append(domain)
        r.append(ccs)
        r.append(dcs)
        # NOTE(review): snippet ends here in source; row `r` is built but
        # never written -- presumably writer.writerow(r) follows.
def main():
    """Register the CSV file and its columns, then for every column walk
    each distinct value bottom-up, recording item--cc-->class and
    class--dd-->superclass edges with support scores, inline."""
    Neo4jDrive.insertNode(nameOfFile)
    columnNames = CSVRead.readCSV(nameOfFile, firstRow=True, choice=[0, 1, 2, 3, 4])
    for name in columnNames:
        Neo4jDrive.insertNodeAndRelationship(nameOfFile, "Column", name)
    # support=CSVRead.getSupport(nameOfFile,0)
    # totalNumberOfValues=CSVRead.numberOfItems(support)
    # Column count recovered from the graph rather than the CSV header.
    for column in range(sum([1 for _ in Neo4jDrive.findRelationshipsOfNode(nameOfFile, "Column")])):
        support = CSVRead.getSupport(nameOfFile, column)
        totalNumberOfValues = CSVRead.numberOfItems(support)
        # print i.end_node
        # cNode=Neo4jDrive.findNodeByName(columnNames[column])
        for item in support.keys():
            node = Neo4jDrive.findNodeByName(item)
            if node == None:
                Neo4jDrive.insertNodeAndRelationship(columnNames[column], "dataItems", item)
                node = Neo4jDrive.findNodeByName(item)
            node.properties["fvalue"] = support[item]  # raw frequency of the value
            node.push()
            rlist = sparqlQuerypy.findBottomUp(item)
            for r in rlist:
                try:
                    rel_data = Neo4jDrive.insertNodeAndRelationship(item, "cc", r[0])
                    rel_data1 = Neo4jDrive.insertNodeAndRelationship(r[0], "dd", r[2])
                    node = node = Neo4jDrive.findNodeByName(r[2])
                    if node.properties["incoming"] == None:
                        node.properties["incoming"] = 1
                    else:
                        node.properties["incoming"] += 1
                    node.properties["type"] = "type"
                    node.push()
                except:
                    # NOTE(review): bare except; rel_data below may be
                    # unbound or stale when this path fires.
                    print columnNames[column], "cc", r[0]
                rel_data = rel_data[0]
                rel_data.properties["rel_class"] = "cc"
                rel_data.properties["support"] = support[item] / (totalNumberOfValues * 1.0)
                rel_data.push()