def contexttoindexneighbours(self, item, number=10, weights=False, permutationname="nil"):
    """Return the items whose index vectors are most similar to the given
    item's context vector, i.e. items which have occurred in contexts
    with the item.

    If number is falsy (None or 0) all items are returned.  When weights
    is true the result is a list of (item, cosine) pairs; otherwise a
    plain list of items, both sorted by descending cosine similarity.
    """
    permutation = self.permutationcollection[permutationname]
    target = self.contextspace[item]
    # Cosine between the target context vector and each (permuted) index vector.
    similarities = {
        candidate: sparsevectors.sparsecosine(
            target,
            sparsevectors.permute(self.indexspace[candidate], permutation))
        for candidate in self.indexspace
    }
    if not number:
        number = len(similarities)
    ranked = sorted(similarities, key=similarities.get, reverse=True)[:number]
    if weights:
        return [(candidate, similarities[candidate]) for candidate in ranked]
    return ranked
def useoperator(self, vector, operator):
    """Return vector permuted by the named operator's permutation.

    A falsy operator means "no operation": the vector is returned
    untouched.  An operator not yet known to this space is registered
    on first use via addoperator.
    """
    if not operator:
        return vector
    if not self.isoperator(operator):
        self.addoperator(operator)
    return sparsevectors.permute(vector, self.permutationcollection[operator])
def addintoitem(self, item, vector, weight=1, operator=None):
    """Accumulate a normalised (optionally permuted) vector into the
    item's context vector, weighted by weight.  Unknown items are
    registered first; marks the space as changed.
    """
    if not self.contains(item):
        self.additem(item)
    if operator is not None:
        # NOTE(review): `operator` is applied directly as a permutation here,
        # whereas useoperator looks the name up in permutationcollection —
        # confirm callers pass a permutation object, not an operator name.
        vector = sparsevectors.permute(vector, operator)
    contribution = sparsevectors.normalise(vector)
    self.contextspace[item] = sparsevectors.sparseadd(
        self.contextspace[item], contribution, weight)
    self.changed = True
def applyoperator(self, item, operator, constant, weight):
    """Add the named constant vector, permuted by the named operator's
    permutation and normalised, into the item's context space, and also
    into its morphology space (for the "morphology" operator) or its
    attribute space (for every other operator), weighted by weight.
    """
    # The permuted, normalised constant is identical in every branch;
    # compute it once instead of up to three times.
    # NOTE(review): assumes sparseadd does not retain or mutate its second
    # argument — confirm against the sparsevectors implementation.
    increment = sparsevectors.normalise(
        sparsevectors.permute(self.constantcollection[constant],
                              self.permutationcollection[operator]))
    self.contextspace[item] = sparsevectors.sparseadd(
        self.contextspace[item], increment, weight)
    if operator == "morphology":
        self.morphologyspace[item] = sparsevectors.sparseadd(
            self.morphologyspace[item], increment, weight)
    else:
        self.attributespace[item] = sparsevectors.sparseadd(
            self.attributespace[item], increment, weight)
def indextocontextneighbours(self, item, number=10, weights=False, permutationname="nil"):
    """Return the items whose context vectors are most similar to the
    given item's (permuted) index vector — the mirror operation of
    contexttoindexneighbours.

    If number is falsy (None or 0) all items are returned.  When weights
    is true the result is a list of (item, cosine) pairs; otherwise a
    plain list of items, both sorted by descending cosine similarity.
    """
    permutation = self.permutationcollection[permutationname]
    # Hoisted out of the loop: the probe vector is loop-invariant and was
    # previously re-permuted once per candidate item.
    probe = sparsevectors.permute(self.indexspace[item], permutation)
    neighbourhood = {}
    for i in self.contextspace:
        neighbourhood[i] = sparsevectors.sparsecosine(probe, self.contextspace[i])
    # Consistent with contexttoindexneighbours: falsy number means "all".
    if not number:
        number = len(neighbourhood)
    if weights:
        r = sorted(neighbourhood.items(),
                   key=lambda k: neighbourhood[k[0]], reverse=True)[:number]
    else:
        r = sorted(neighbourhood,
                   key=lambda k: neighbourhood[k], reverse=True)[:number]
    return r
def postriplevector(self, text, poswindow=3):
    # Build a vector encoding part-of-speech tag sequences of the text:
    # each tag permutes an accumulator vector, which is added into the
    # result once per window.
    poses = nltk.pos_tag(text)
    # NOTE(review): the "+ 2" extends the range beyond the usual
    # len(poses) - poswindow + 1, so (for the default poswindow=3) the
    # last two windows are shorter than poswindow — confirm the trailing
    # short windows are intended and not an off-by error.
    windows = [poses[ii:ii + poswindow] for ii in range(len(poses) - poswindow + 1 + 2)]
    # Seed accumulator from the dedicated "vector" entry of pospermutations.
    onevector = self.pospermutations["vector"]
    vector = sparsevectors.newemptyvector(self.dimensionality)
    for sequence in windows:
        for item in sequence:
            # item is a (token, tag) pair from nltk.pos_tag; permutations
            # are keyed by tag and created lazily on first sight.
            if item[1] not in self.pospermutations:
                self.pospermutations[item[1]] = sparsevectors.createpermutation(self.dimensionality)
            onevector = sparsevectors.permute(onevector, self.pospermutations[item[1]])
        # NOTE(review): onevector is never reset between windows, so every
        # window's contribution carries the permutation history of all
        # preceding windows — verify this cumulative behaviour is intended.
        vector = sparsevectors.sparseadd(vector, onevector)
    return vector
def processfile(file):
    # Read a text file, sentence-tokenise it, parse each sentence, and
    # accumulate per-utterance vectors into utterancespace and a whole-text
    # vector into textspace[file].  Returns the text vector.
    global sentencestorage, utterancespace
    sentenceindex = 0
    textvector = wordspace.newemptyvector()
    with open(file, "r", encoding="utf-8") as textfile:
        rawtext = textfile.read().lower()
    # Flatten newlines, strip double quotes, collapse runs of whitespace.
    rawtext = re.sub('\n', ' ', rawtext)
    rawtext = re.sub('\"', ' ', rawtext)
    rawtext = re.sub('\s+', ' ', rawtext)
    sents = sent_tokenize(rawtext)
    for sentence in sents:
        sentenceindex += 1
        sentencestorage[sentenceindex] = sentence
        allsurfacewords = nltk.word_tokenize(sentence)
        wordspace.chkwordspace(allsurfacewords, debug)
        analyses = []
        try:
            analyses = semanticdependencyparse.semanticdepparse(
                sentence.lower(), debug)
        # NOTE(review): bare except hides everything, including
        # KeyboardInterrupt — consider narrowing to Exception.
        except:
            logger("PARSE ERROR " + str(sentenceindex) + "\t" + sentence, error)
        kk = 0
        for analysis in analyses:
            words = analysis.values()
            wordspace.checkwordspacelist(words, debug)
            # Lazily create a permutation for every role seen in the analysis.
            for role in analysis:
                if role not in wordspace.permutationcollection:
                    wordspace.permutationcollection[
                        role] = sparsevectors.createpermutation(
                            wordspace.dimensionality)
            u = getvector(analysis, sentence)
            win = 1
            sentencesequence = 0
            startindexforthistext = 0
            # NOTE(review): with sentencesequence hard-set to 0 this loop never
            # executes — the discourse-context mixing below is dead code.
            # Presumably sentencesequence was meant to be a configurable
            # window size; confirm before enabling.
            while win < sentencesequence:
                if sentenceindex - win > startindexforthistext:
                    u = sparsevectors.sparseadd(
                        u,
                        sparsevectors.permute(
                            sparsevectors.normalise(
                                utterancespace[sentenceindex - win]),
                            wordspace.permutationcollection["discourse"]))
                win += 1
            # NOTE(review): bumping sentenceindex for every analysis after the
            # first gives each extra analysis its own utterance slot, but
            # desynchronises sentenceindex from sentencestorage — verify.
            if kk > 0:
                sentenceindex += 1
            utterancespace[sentenceindex] = u
            textvector = sparsevectors.sparseadd(textvector, u, 1)
            kk += 1
    textspace[file] = textvector
    return textvector
def onesequencevector(self, subsequence, accumulator=None, loglevel=False):
    """Fold a sequence of labels into a vector: the accumulator (the
    space's sequencelabel vector by default) is permuted once per label,
    front to back.  Unseen labels get a fresh permutation, which marks
    the space as changed.
    """
    if accumulator is None:
        accumulator = self.sequencelabel
    # Iterative form of the original head/tail recursion; the explicit
    # "!= []" test reproduces the original's behaviour exactly.
    while subsequence != []:
        label = subsequence[0]  # type: str
        subsequence = subsequence[1:]
        if label not in self.permutations:
            self.permutations[label] = sparsevectors.createpermutation(
                self.dimensionality)
            self.changed = True
        accumulator = sparsevectors.permute(accumulator, self.permutations[label])
    return accumulator
def getvector(roleworddict, sentencestring):
    # Build an utterance vector from a role->word analysis plus several
    # surface features of the sentence string.
    uvector = {}  # vector for test item
    # Role-tagged words: each word's index vector, normalised and permuted
    # by its role's permutation, weighted by the word's frequency weight.
    for role in roleworddict:
        item = roleworddict[role]
        uvector = sparsevectors.sparseadd(
            uvector,
            sparsevectors.permute(
                sparsevectors.normalise(wordspace.indexspace[item]),
                wordspace.permutationcollection[role]),
            wordspace.frequencyweight(item))
    # Lexical n-gram windows over the lowercased sentence.
    lexicalwindow = 1
    if lexicalwindow > 0:
        wds = word_tokenize(sentencestring.lower())
        windows = [
            wds[i:i + lexicalwindow] for i in range(len(wds) - lexicalwindow + 1)
        ]
        for sequence in windows:
            thisvector = {}
            for item in sequence:
                # Shift the running window vector by the "sequence"
                # permutation, then add the next word's index vector.
                thisvector = sparsevectors.sparseadd(
                    sparsevectors.permute(
                        thisvector,
                        wordspace.permutationcollection["sequence"]),
                    wordspace.indexspace[item],
                    wordspace.frequencyweight(item))
            uvector = sparsevectors.sparseadd(
                uvector, sparsevectors.normalise(thisvector))
    # Part-of-speech n-gram windows over the original-case sentence.
    pos = 1
    if pos > 0:
        wds = word_tokenize(sentencestring)
        posanalyses = nltk.pos_tag(wds)
        poslist = [i[1] for i in posanalyses]
        # NOTE(review): reuses lexicalwindow as the POS window size —
        # confirm a shared window size is intended.
        windows = [
            poslist[i:i + lexicalwindow]
            for i in range(len(poslist) - lexicalwindow + 1)
        ]
        for sequence in windows:
            thisvector = {}
            for item in sequence:
                # NOTE(review): item is a POS TAG here, yet it is used as a
                # key into wordspace.indexspace and frequencyweight — this
                # will KeyError unless tags are registered in the word
                # space; verify.
                thisvector = sparsevectors.sparseadd(
                    sparsevectors.permute(
                        thisvector,
                        wordspace.permutationcollection["sequence"]),
                    wordspace.indexspace[item],
                    wordspace.frequencyweight(item))
            uvector = sparsevectors.sparseadd(
                uvector, sparsevectors.normalise(thisvector))
    # Stylistic features: sentence length, adverbs, negations, hedges,
    # amplifiers, each contributing a dedicated global feature vector.
    style = True
    if style:
        wds = word_tokenize(sentencestring)
        cpw = len(sentencestring) / len(wds)  # NOTE(review): computed but unused
        wps = len(wds)  # words per sentence
        sl = True
        if sl:
            if wps > 8:
                uvector = sparsevectors.sparseadd(uvector, longsentencevector)
            if wps < 5:
                uvector = sparsevectors.sparseadd(uvector, shortsentencevector)
        posanalyses = nltk.pos_tag(wds)
        poslist = [i[1] for i in posanalyses]
        for poses in poslist:
            # Penn Treebank adverb tags.
            if poses == "RB" or poses == "RBR" or poses == "RBS":
                uvector = sparsevectors.sparseadd(uvector, adverbvector)
        for w in wds:
            if w in negationlist:
                uvector = sparsevectors.sparseadd(uvector, negationvector)
            if w in hedgelist:
                uvector = sparsevectors.sparseadd(uvector,
                                                  hedgevector)
            if w in amplifierlist:
                uvector = sparsevectors.sparseadd(uvector, amplifiervector)
    # attitude terms
    # verb stats
    # seq newordgrams
    # verb classes use wordspace!
    # sent sequences
    return uvector