import os
import sys
import gzip
import cPickle
from math import log

# Lexicon, Tensor, PERSP_TYPES and PERSP2PIVDIM are assumed to be provided
# by a companion module of this package (the corresponding import is not
# shown in this excerpt).


class MemStore:

    def __init__(self,trace=False):
        self.lexicon = Lexicon()
        self.sources = Tensor(rank=4)
        self.corpus = Tensor(rank=3)
        self.perspectives = dict([(x,Tensor(rank=2)) for x in PERSP_TYPES])
        self.types = {}
        self.synonyms = {}
        self.trace = trace

    def convert(self,statement):
        """
        Backwards-compatibility function for converting between integer and
        string representations of statements.
        """

        return tuple([self.lexicon[x] for x in statement])

    def incorporate(self,path,ext='.tsv'):
        """
        Imports statements into the store, processing all files with the
        specified extension ext in the path location. The lexicon and
        sources structures are updated (not overwritten) in the process.
        """

        # first pass to update the lexicon
        expressions = []
        for fname in [os.path.join(path,x) for x in os.listdir(path) if \
        os.path.isfile(os.path.join(path,x)) and \
        os.path.splitext(x)[-1].lower() == ext.lower()]:
            for line in open(fname,'r'):
                try:
                    s,p,o,prov,rel = line.split('\t')[:5]
                    # the relevance is parsed here only to validate the line
                    rel = float(rel)
                    expressions += [s,p,o,prov]
                except:
                    sys.stderr.write('W (loading memory-based store) - '+\
                        'something wrong with line:\n%s' % (line,))
        self.lexicon.update(expressions)
        # second pass to update the sources tensor
        for fname in [os.path.join(path,x) for x in os.listdir(path) if \
        os.path.isfile(os.path.join(path,x)) and \
        os.path.splitext(x)[-1].lower() == ext.lower()]:
            for line in open(fname,'r'):
                try:
                    s,p,o,prov,rel = line.split('\t')[:5]
                    rel = float(rel)
                    key = tuple([self.lexicon[x] for x in [s,p,o,prov]])
                    self.sources[key] = rel
                except:
                    sys.stderr.write('W (loading memory-based store) - '+\
                        'something wrong with line:\n%s' % (line,))

    def dump(self,filename):
        # straightforward (but somewhat slow) serialisation using cPickle
        f = open(filename,'wb')
        cPickle.dump(self,f)
        f.close()

    def load(self,filename):
        # straightforward (but somewhat slow) deserialisation using cPickle;
        # the unpickled attributes are copied over to this instance (plain
        # assignment to self would only rebind the local name)
        f = open(filename,'rb')
        loaded = cPickle.load(f)
        f.close()
        self.__dict__.update(loaded.__dict__)
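    # A hypothetical example of the five-column input format consumed by
    # incorporate() above (tabs shown as <TAB>; the vocabulary and weights
    # are made up purely for illustration):
    #
    #   dog<TAB>related_to<TAB>cat<TAB>document_042<TAB>0.85
    #
    # The first three columns form the statement, the fourth identifies its
    # provenance and the fifth gives the source's relevance weight.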
    def exp(self,path,compress=True,core_only=True):
        # exporting the whole store as tab-separated value files to a
        # directory (gzip compression is used by default)
        # note that only the lexicon, sources and corpus structures are
        # exported - any precomputed corpus perspectives have to be
        # re-created!
        # also, integer indices are used - for a lexicalised (human-readable)
        # export of sources and corpus, use the exportSources() and
        # exportCorpus() functions

        # setting the filenames
        lex_fn = os.path.join(path,'lexicon.tsv')
        src_fn = os.path.join(path,'sources.tsv')
        crp_fn = os.path.join(path,'corpus.tsv')
        if compress:
            lex_fn += '.gz'
            src_fn += '.gz'
            crp_fn += '.gz'
        opener, mode = open, 'w'
        if compress:
            opener, mode = gzip.open, 'wb'
        lex_f = opener(lex_fn,mode)
        src_f = opener(src_fn,mode)
        crp_f = opener(crp_fn,mode)
        self.lexicon.to_file(lex_f)
        self.sources.to_file(src_f)
        self.corpus.to_file(crp_f)
        lex_f.close()
        src_f.close()
        crp_f.close()

    def imp(self,path,compress=True):
        # importing the whole store from tab-separated value files in a
        # directory - effectively an inverse of the exp() function
        lex_fn = os.path.join(path,'lexicon.tsv')
        src_fn = os.path.join(path,'sources.tsv')
        crp_fn = os.path.join(path,'corpus.tsv')
        if compress:
            lex_fn += '.gz'
            src_fn += '.gz'
            crp_fn += '.gz'
        opener, mode = open, 'r'
        if compress:
            opener, mode = gzip.open, 'rb'
        lex_f = opener(lex_fn,mode)
        src_f = opener(src_fn,mode)
        crp_f = opener(crp_fn,mode)
        self.lexicon.from_file(lex_f)
        self.sources.from_file(src_f)
        self.corpus.from_file(crp_f)
        lex_f.close()
        src_f.close()
        crp_f.close()

    def computeCorpus(self):
        # number of all triples
        N = 0
        # x -> number of independent occurrences in the store
        indep_freq = {}
        # (x,y) -> number of joint occurrences in the store
        joint_freq = {}
        # (s,p,o) -> number of occurrences
        tripl_freq = {}
        # (s,p,o) -> (provenance, relevance)
        spo2pr = {}
        # going through all the statements in the sources
        for s,p,o,d in self.sources.keys():
            N += 1
            if s in indep_freq:
                indep_freq[s] += 1
            else:
                indep_freq[s] = 1
            if o in indep_freq:
                indep_freq[o] += 1
            else:
                indep_freq[o] = 1
            if (s,o) in joint_freq:
                joint_freq[(s,o)] += 1
            else:
                joint_freq[(s,o)] = 1
            if (s,p,o) in tripl_freq:
                tripl_freq[(s,p,o)] += 1
            else:
                tripl_freq[(s,p,o)] = 1
            if (s,p,o) not in spo2pr:
                spo2pr[(s,p,o)] = []
            spo2pr[(s,p,o)].append((d,self.sources[(s,p,o,d)]))
        # going only through the unique triples now, regardless of their
        # provenance
        for s,p,o in spo2pr:
            # a list of relevances of the particular statement's sources
            src_rels = [x[1] for x in spo2pr[(s,p,o)]]
            # joint frequency of the subject and object, in either order
            joint = joint_freq[(s,o)]
            if (o,s) in joint_freq:
                joint += joint_freq[(o,s)]
            # absolute frequency of the triple times its mutual information
            # score
            fMI = 0.0
            try:
                fMI = tripl_freq[(s,p,o)]*\
                    log(float(N*joint)/(indep_freq[s]*indep_freq[o]),2)
            except ValueError:
                continue
            # setting the corpus tensor value, weighted by the mean
            # relevance of the statement's sources
            self.corpus[(s,p,o)] = fMI*(float(sum(src_rels))/len(src_rels))

    def normaliseCorpus(self,cut_off=0.95,min_quo=0.1):
        # normalising the corpus by the weight at the cut_off quantile,
        # i.e. a value greater than or equal to the cut_off fraction of all
        # weight values; normalised values above 1 are clipped to 1, and
        # values below zero are set to the min_quo fraction of the minimal
        # positive weight
        ws = sorted(self.corpus.values())
        norm_cons = ws[int(cut_off*len(ws))]
        min_norm = min([x for x in ws if x > 0])*min_quo
        for key in self.corpus:
            w = self.corpus[key]/norm_cons
            if w < 0:
                w = min_norm
            if w > 1:
                w = 1.0
            self.corpus[key] = w

    def computePerspective(self,ptype):
        self.perspectives[ptype] = self.corpus.matricise(PERSP2PIVDIM[ptype])

    def indexSources(self):
        self.sources.index()

    def indexCorpus(self):
        self.corpus.index()

    def indexPerspective(self,ptype):
        self.perspectives[ptype].index()
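    # A worked illustration of the computeCorpus() weighting above (the
    # numbers are made up): with N = 100 statements overall, where the
    # subject s occurs 4 times, the object o occurs 5 times, the (s,o) pair
    # co-occurs 2 times and the full (s,p,o) triple occurs 2 times, we get
    #
    #   fMI = 2 * log2((100*2)/(4*5)) = 2 * log2(10) ~= 6.64
    #
    # which is then multiplied by the mean relevance of the triple's sources
    # to obtain the corpus weight.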
    def getProvenance(self,statement):
        # getting the statement elements
        s,p,o = statement
        # getting integer ID versions of the statement elements
        if type(s) in [unicode,str]:
            s = self.lexicon[s]
        if type(p) in [unicode,str]:
            p = self.lexicon[p]
        if type(o) in [unicode,str]:
            o = self.lexicon[o]
        # evaluating the query on the sources tensor and collating the
        # provenance identifiers from the matching keys
        return [x[3] for x, rel in self.sources.query((s,p,o,None))]

    def getRelevance(self,prov):
        if type(prov) in [unicode,str]:
            prov = self.lexicon[prov]
        return max([rel for x, rel in \
            self.sources.query((None,None,None,prov))])

    def exportSources(self,filename,lexicalised=True):
        # export the sources tensor to a file in a tab-separated value
        # format, either with integer or lexicalised keys
        f = open(filename,'w')
        if lexicalised:
            f.write('\n'.join(['\t'.join([self.lexicon[x] for x in \
                [s,p,o,d]]+[str(w)]) for (s,p,o,d),w in \
                self.sources.items()]))
        else:
            f.write('\n'.join(['\t'.join([str(x) for x in [s,p,o,d,w]]) for \
                (s,p,o,d),w in self.sources.items()]))
        f.close()

    def exportCorpus(self,filename,lexicalised=True):
        # export the corpus tensor to a file in a tab-separated value
        # format, either with integer or lexicalised keys
        f = open(filename,'w')
        if lexicalised:
            f.write('\n'.join(['\t'.join([self.lexicon[x] for x in \
                [s,p,o]]+[str(w)]) for (s,p,o),w in self.corpus.items()]))
        else:
            f.write('\n'.join(['\t'.join([str(x) for x in [s,p,o,w]]) for \
                (s,p,o),w in self.corpus.items()]))
        f.close()
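
# A minimal usage sketch of the store (the ./data directory and the output
# filename are hypothetical and serve only as an illustration):
if __name__ == '__main__':
    store = MemStore(trace=True)
    store.incorporate('./data')       # load *.tsv statement files
    store.computeCorpus()             # compute the fMI-based corpus weights
    store.normaliseCorpus()           # scale the weights into (0,1]
    store.exportCorpus('corpus.tsv')  # write the lexicalised corpus out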