def __init__(self, dimensionality: int = 2000, denseness: int = 10, name: str = "no name"):
    self.name = name
    self.indexspace = {}           # dict: string -> sparse vector
    self.contextspace = {}         # dict: string -> denser vector
    self.tag = {}                  # dict: string -> string
    self.tagged = {}               # dict: string -> list of strings
    self.dimensionality = dimensionality
    self.denseness = denseness
    self.permutationcollection = {
        "nil": list(range(self.dimensionality)),   # identity permutation
        "before": sparsevectors.createpermutation(self.dimensionality),
        "after": sparsevectors.createpermutation(self.dimensionality)
    }
    self.observedfrequency = {}    # dict: string -> int
    self.constantdenseness = 10
    self.languagemodel = LanguageModel()
    self.poswindow = 3
    self.changed = False
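# Usage sketch (assumptions: this __init__ belongs to a semantic-space class,
# called SemanticSpace here purely for illustration, and the sparsevectors
# module and LanguageModel class used above are importable in this package):
#
#     space = SemanticSpace(dimensionality=2000, denseness=10, name="demo")
#     # space.permutationcollection starts out with "nil" (identity),
#     # "before" and "after"; further operators are added with addoperator below.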
def postriplevector(self, text, poswindow=3):
    # Encode the part-of-speech sequence of a tokenised text: each sliding
    # window of POS tags is folded into one vector by successive permutations,
    # and the window vectors are summed into one sparse vector.
    poses = nltk.pos_tag(text)
    # The "+ 2" lets the last two windows be shorter than poswindow.
    windows = [poses[ii:ii + poswindow]
               for ii in range(len(poses) - poswindow + 1 + 2)]
    vector = sparsevectors.newemptyvector(self.dimensionality)
    for sequence in windows:
        # Start each window from the base vector stored under "vector".
        onevector = self.pospermutations["vector"]
        for item in sequence:
            if item[1] not in self.pospermutations:
                self.pospermutations[item[1]] = \
                    sparsevectors.createpermutation(self.dimensionality)
            onevector = sparsevectors.permute(onevector, self.pospermutations[item[1]])
        vector = sparsevectors.sparseadd(vector, onevector)
    return vector
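# Usage sketch for postriplevector (assumptions: nltk with its tokeniser and
# POS tagger data is available, and self.pospermutations already holds a base
# entry under the key "vector"):
#
#     tokens = nltk.word_tokenize("the cat sat on the mat")
#     posvector = space.postriplevector(tokens, poswindow=3)
#     # posvector is a sparse vector summarising the POS tag sequence of the text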
def processfile(file):
    global sentencestorage, utterancespace
    sentenceindex = 0
    textvector = wordspace.newemptyvector()
    with open(file, "r", encoding="utf-8") as textfile:
        rawtext = textfile.read().lower()
    rawtext = re.sub("\n", " ", rawtext)
    rawtext = re.sub('"', " ", rawtext)
    rawtext = re.sub(r"\s+", " ", rawtext)
    sents = sent_tokenize(rawtext)
    for sentence in sents:
        sentenceindex += 1
        sentencestorage[sentenceindex] = sentence
        allsurfacewords = nltk.word_tokenize(sentence)
        wordspace.chkwordspace(allsurfacewords, debug)
        analyses = []
        try:
            analyses = semanticdependencyparse.semanticdepparse(
                sentence.lower(), debug)
        except Exception:
            logger("PARSE ERROR " + str(sentenceindex) + "\t" + sentence, error)
        kk = 0
        for analysis in analyses:
            words = analysis.values()
            wordspace.checkwordspacelist(words, debug)
            # Make sure every semantic role seen in the analysis has a permutation.
            for role in analysis:
                if role not in wordspace.permutationcollection:
                    wordspace.permutationcollection[role] = \
                        sparsevectors.createpermutation(wordspace.dimensionality)
            u = getvector(analysis, sentence)
            win = 1
            # Number of preceding sentences to fold in; 0 disables the
            # discourse window, so the loop below does not run as configured.
            sentencesequence = 0
            startindexforthistext = 0
            while win < sentencesequence:
                if sentenceindex - win > startindexforthistext:
                    u = sparsevectors.sparseadd(
                        u,
                        sparsevectors.permute(
                            sparsevectors.normalise(utterancespace[sentenceindex - win]),
                            wordspace.permutationcollection["discourse"]))
                win += 1
            # Additional analyses of the same sentence get their own slots.
            if kk > 0:
                sentenceindex += 1
            utterancespace[sentenceindex] = u
            textvector = sparsevectors.sparseadd(textvector, u, 1)
            kk += 1
    textspace[file] = textvector
    return textvector
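# Usage sketch for processfile (assumptions: the surrounding module defines the
# globals used above -- sentencestorage, utterancespace, textspace, wordspace,
# debug, error, logger, getvector and semanticdependencyparse -- and the file
# path below is purely illustrative):
#
#     sentencestorage, utterancespace, textspace = {}, {}, {}
#     vec = processfile("corpus/sample.txt")
#     # textspace now maps the file path to the sum of its sentence vectors,
#     # and utterancespace holds one vector per analysed sentence.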
def onesequencevector(self, subsequence, accumulator=None, loglevel=False):
    # Recursively fold a sequence of labels into one vector by applying each
    # label's permutation in turn, creating permutations on first sight.
    if accumulator is None:
        accumulator = self.sequencelabel
    if not subsequence:
        return accumulator
    head = subsequence[0]  # type: str
    tail = subsequence[1:]
    if head not in self.permutations:
        self.permutations[head] = sparsevectors.createpermutation(
            self.dimensionality)
        self.changed = True
    return self.onesequencevector(
        tail, sparsevectors.permute(accumulator, self.permutations[head]))
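# Usage sketch for onesequencevector (assumptions: self.sequencelabel is the
# base vector for sequences and self.permutations is the per-label permutation
# table; both are expected to be set up elsewhere in the class):
#
#     seqvector = space.onesequencevector(["det", "noun", "verb"])
#     # equivalent to permuting self.sequencelabel by the "det", "noun" and
#     # "verb" permutations in that order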
def addoperator(self, item):
    # Register a new named operator (role) with its own random permutation.
    self.permutationcollection[item] = sparsevectors.createpermutation(
        self.dimensionality)
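# Usage sketch for addoperator (assumption: space is an instance of the
# enclosing class):
#
#     space.addoperator("discourse")
#     # processfile above looks up a "discourse" permutation when it folds in
#     # preceding sentences, so registering it up front is one way to provide it.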