def textvector(self, string, frequencyweighting=True, loglevel=False):
    uvector = sparsevectors.newemptyvector(self.dimensionality)
    if self.window > 0:
        # Character n-gram mode: slide a window of self.window characters
        # over the string and add one vector per window.
        windows = [string[ii:ii + self.window]
                   for ii in range(len(string) - self.window + 1)]
        for sequence in windows:
            thisvector = self.makevector(sequence)
            if frequencyweighting:
                factor = self.frequencyweight(sequence)
            else:
                factor = 1
            logger(sequence + " " + str(factor), loglevel)
            if loglevel:
                logger(str(sparsevectors.sparsecosine(
                    uvector, sparsevectors.normalise(thisvector))), loglevel)
            uvector = sparsevectors.sparseadd(
                uvector, sparsevectors.normalise(thisvector), factor)
    else:
        # Word mode: tokenise and add one index vector per token (or per
        # type, if binary frequencies are requested).
        words = nltk.word_tokenize(string)
        if self.binaryfrequencies:
            wordlist = set(words)  # not a list, a set, but hey
        else:
            wordlist = words
        for w in wordlist:
            if frequencyweighting:
                factor = self.frequencyweight(w)
            else:
                factor = 1
            if w not in self.indexspace:
                self.additem(w)
            else:
                self.observe(w)
            uvector = sparsevectors.sparseadd(
                uvector, sparsevectors.normalise(self.indexspace[w]), factor)
    return uvector
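# A minimal sketch of the character-window split that textvector uses above;
# standalone plain Python, no project modules needed, illustrative values only:
demo_string, demo_window = "banana", 3
demo_windows = [demo_string[ii:ii + demo_window]
                for ii in range(len(demo_string) - demo_window + 1)]
print(demo_windows)  # ['ban', 'ana', 'nan', 'ana']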
def observecollocation(self, item, otheritem):
    if not self.contains(item):
        self.additem(item)
    if not self.contains(otheritem):
        self.additem(otheritem)
    # Symmetric update: each item's context vector absorbs the other's
    # normalised index vector.
    self.addintoitem(item, sparsevectors.normalise(self.indexspace[otheritem]))
    self.addintoitem(otheritem, sparsevectors.normalise(self.indexspace[item]))
def observecollocation(self, item, otheritem, operator="nil"):
    # Variant of observecollocation: updates only the first item's context
    # vector; the operator argument is currently unused.
    if not self.contains(item):
        self.additem(item)
    if not self.contains(otheritem):
        self.additem(otheritem)
    self.contextspace[item] = sparsevectors.sparseadd(
        self.contextspace[item],
        sparsevectors.normalise(self.indexspace[otheritem]))
def applyoperator(self, item, operator, constant, weight):
    # Permute the constant vector with the operator's permutation once and
    # reuse it for every target space (behaviour unchanged, computed once).
    operatorvector = sparsevectors.normalise(
        sparsevectors.permute(self.constantcollection[constant],
                              self.permutationcollection[operator]))
    self.contextspace[item] = sparsevectors.sparseadd(
        self.contextspace[item], operatorvector, weight)
    if operator == "morphology":
        self.morphologyspace[item] = sparsevectors.sparseadd(
            self.morphologyspace[item], operatorvector, weight)
    else:
        self.attributespace[item] = sparsevectors.sparseadd(
            self.attributespace[item], operatorvector, weight)
def addintoitem(self, item, vector, weight=1, operator=None):
    if not self.contains(item):
        self.additem(item)
    if operator is not None:
        vector = sparsevectors.permute(vector, operator)
    self.contextspace[item] = sparsevectors.sparseadd(
        self.contextspace[item], sparsevectors.normalise(vector), weight)
    self.changed = True
def addintoitem(self, item, vector, weight=1):
    # Variant that inlines item creation instead of calling additem. Note
    # that for an unseen item the incoming vector is replaced by a fresh
    # random index vector before being added in (as written in the source).
    if not self.contains(item):
        vector = sparsevectors.newrandomvector(self.dimensionality,
                                               self.denseness)
        self.indexspace[item] = vector
        self.globalfrequency[item] = 0
        self.contextspace[item] = sparsevectors.newemptyvector(
            self.dimensionality)
    self.contextspace[item] = sparsevectors.sparseadd(
        self.contextspace[item], sparsevectors.normalise(vector), weight)
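# A minimal sketch of the weighted-accumulation pattern the addintoitem
# variants rely on, assuming the project-local sparsevectors module is
# importable; dimensionality and denseness values are illustrative:
import sparsevectors

dim, dens = 2000, 10
a = sparsevectors.newrandomvector(dim, dens)
b = sparsevectors.newrandomvector(dim, dens)
u = sparsevectors.newemptyvector(dim)
u = sparsevectors.sparseadd(u, sparsevectors.normalise(a), 2)
u = sparsevectors.sparseadd(u, sparsevectors.normalise(b), 1)
# a was added with twice the weight of b, so u should lie closer to a:
print(sparsevectors.sparsecosine(u, a), sparsevectors.sparsecosine(u, b))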
def sequencevector(self, sequence, initialvector=None, loglevel=False):
    if initialvector is None:
        initialvector = sparsevectors.newemptyvector(self.dimensionality)
    windowlist = self.windows(sequence)
    logger(str(windowlist), loglevel)
    for w in windowlist:
        initialvector = sparsevectors.sparseadd(
            initialvector,
            sparsevectors.normalise(self.onesequencevector(w, None, loglevel)))
    return initialvector
def processfile(file):
    global sentencestorage, utterancespace
    sentenceindex = 0
    textvector = wordspace.newemptyvector()
    with open(file, "r", encoding="utf-8") as textfile:
        rawtext = textfile.read().lower()
    rawtext = re.sub(r'\n', ' ', rawtext)
    rawtext = re.sub(r'"', ' ', rawtext)
    rawtext = re.sub(r'\s+', ' ', rawtext)
    sents = sent_tokenize(rawtext)
    for sentence in sents:
        sentenceindex += 1
        sentencestorage[sentenceindex] = sentence
        allsurfacewords = nltk.word_tokenize(sentence)
        wordspace.chkwordspace(allsurfacewords, debug)
        analyses = []
        try:
            analyses = semanticdependencyparse.semanticdepparse(
                sentence.lower(), debug)
        except Exception:
            logger("PARSE ERROR " + str(sentenceindex) + "\t" + sentence,
                   error)
        kk = 0
        for analysis in analyses:
            words = analysis.values()
            wordspace.checkwordspacelist(words, debug)
            for role in analysis:
                if role not in wordspace.permutationcollection:
                    wordspace.permutationcollection[role] = \
                        sparsevectors.createpermutation(
                            wordspace.dimensionality)
            u = getvector(analysis, sentence)
            win = 1
            sentencesequence = 0  # discourse window size; 0 disables the loop below
            startindexforthistext = 0
            while win < sentencesequence:
                if sentenceindex - win > startindexforthistext:
                    u = sparsevectors.sparseadd(
                        u,
                        sparsevectors.permute(
                            sparsevectors.normalise(
                                utterancespace[sentenceindex - win]),
                            wordspace.permutationcollection["discourse"]))
                win += 1
            if kk > 0:
                sentenceindex += 1  # additional analyses get their own index
            utterancespace[sentenceindex] = u
            textvector = sparsevectors.sparseadd(textvector, u, 1)
            kk += 1
    textspace[file] = textvector
    return textvector
def rolevector(roledict, initialvector=None, loglevel=False):
    if initialvector is None:
        initialvector = sparsevectors.newemptyvector(dimensionality)
    for role in roledict:
        for item in roledict[role]:
            ctxspace.observe(item, False, debug)
            tmp = initialvector
            initialvector = sparsevectors.sparseadd(
                initialvector,
                sparsevectors.normalise(
                    ctxspace.useoperator(ctxspace.indexspace[item], role)))
            if loglevel:
                logger(role + " " + item + " " +
                       str(sparsevectors.sparsecosine(tmp, initialvector)),
                       loglevel)
    return initialvector
def tweetvector(string):
    uvector = sparsevectors.newemptyvector(ngramspace.dimensionality)
    if window > 0:
        windows = [string[ii:ii + window]
                   for ii in range(len(string) - window + 1)]
        for sequence in windows:
            if ngramspace.contains(sequence):
                thisvector = ngramspace.indexspace[sequence]
                # ngramspace.observe(sequence)
                # should we be learning stuff now? naaw.
            else:
                thisvector = stringspace.makevector(sequence)
                # ngramspace.additem(sequence, thisvector)
                # should it be added to the cache? naaw.
            factor = ngramspace.frequencyweight(sequence)
            uvector = sparsevectors.sparseadd(
                uvector, sparsevectors.normalise(thisvector), factor)
    return uvector
def tokenvector(tokenlist, initialvector=None, weights=True, loglevel=False):
    if initialvector is None:
        initialvector = sparsevectors.newemptyvector(dimensionality)
    for item in tokenlist:
        if not weights or str(item).startswith("JiK"):
            # cxg features should not be weighted the same way lex feats are
            weight = 1
        else:
            weight = ctxspace.languagemodel.frequencyweight(item, True)
        ctxspace.observe(item, True)
        tmp = initialvector
        initialvector = sparsevectors.sparseadd(
            initialvector,
            sparsevectors.normalise(ctxspace.contextspace[item]), weight)
        if loglevel:
            logger(item + " " + str(weight) + " " +
                   str(sparsevectors.sparsecosine(tmp, initialvector)),
                   loglevel)
    return initialvector
def textvector(self, words, frequencyweighting=True,
               binaryfrequencies=False, loglevel=False):
    self.docs += 1
    uvector = sparsevectors.newemptyvector(self.dimensionality)
    if binaryfrequencies:
        wordlist = set(words)  # not a list, a set, but hey
    else:
        wordlist = words
    for w in wordlist:
        if frequencyweighting:
            factor = self.frequencyweight(w)
        else:
            factor = 1
        if w not in self.indexspace:
            self.additem(w)
        else:
            self.observe(w)
        # note: without binaryfrequencies this counts per token, not per document
        self.df[w] += 1
        uvector = sparsevectors.sparseadd(
            uvector, sparsevectors.normalise(self.indexspace[w]), factor)
    return uvector
def getvector(roleworddict, sentencestring):
    uvector = {}  # vector for the test item; sparse vectors are plain dicts
    # Role-bound lexical vectors: permute each word's index vector with its
    # role permutation before adding it in.
    for role in roleworddict:
        item = roleworddict[role]
        uvector = sparsevectors.sparseadd(
            uvector,
            sparsevectors.permute(
                sparsevectors.normalise(wordspace.indexspace[item]),
                wordspace.permutationcollection[role]),
            wordspace.frequencyweight(item))
    lexicalwindow = 1
    if lexicalwindow > 0:
        wds = word_tokenize(sentencestring.lower())
        windows = [wds[i:i + lexicalwindow]
                   for i in range(len(wds) - lexicalwindow + 1)]
        for sequence in windows:
            thisvector = {}
            for item in sequence:
                thisvector = sparsevectors.sparseadd(
                    sparsevectors.permute(
                        thisvector,
                        wordspace.permutationcollection["sequence"]),
                    wordspace.indexspace[item],
                    wordspace.frequencyweight(item))
            uvector = sparsevectors.sparseadd(
                uvector, sparsevectors.normalise(thisvector))
    pos = 1
    if pos > 0:
        # Same window scheme over part-of-speech tags.
        wds = word_tokenize(sentencestring)
        posanalyses = nltk.pos_tag(wds)
        poslist = [i[1] for i in posanalyses]
        windows = [poslist[i:i + lexicalwindow]
                   for i in range(len(poslist) - lexicalwindow + 1)]
        for sequence in windows:
            thisvector = {}
            for item in sequence:
                thisvector = sparsevectors.sparseadd(
                    sparsevectors.permute(
                        thisvector,
                        wordspace.permutationcollection["sequence"]),
                    wordspace.indexspace[item],
                    wordspace.frequencyweight(item))
            uvector = sparsevectors.sparseadd(
                uvector, sparsevectors.normalise(thisvector))
    style = True
    if style:
        wds = word_tokenize(sentencestring)
        cpw = len(sentencestring) / len(wds)  # characters per word (currently unused)
        wps = len(wds)  # words per sentence
        sl = True
        if sl:
            if wps > 8:
                uvector = sparsevectors.sparseadd(uvector, longsentencevector)
            if wps < 5:
                uvector = sparsevectors.sparseadd(uvector, shortsentencevector)
        posanalyses = nltk.pos_tag(wds)
        poslist = [i[1] for i in posanalyses]
        for poses in poslist:
            if poses in ("RB", "RBR", "RBS"):
                uvector = sparsevectors.sparseadd(uvector, adverbvector)
        for w in wds:
            if w in negationlist:
                uvector = sparsevectors.sparseadd(uvector, negationvector)
            if w in hedgelist:
                uvector = sparsevectors.sparseadd(uvector, hedgevector)
            if w in amplifierlist:
                uvector = sparsevectors.sparseadd(uvector, amplifiervector)
    # still to do:
    # attitude terms
    # verb stats
    # seq newordgrams
    # verb classes use wordspace!
    # sent sequences
    return uvector
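# A minimal sketch of the role-binding pattern (permute, normalise, add) that
# getvector uses for semantic roles, assuming the project-local sparsevectors
# module is importable; dimensionality and denseness are illustrative:
import sparsevectors

dim = 2000
agentpermutation = sparsevectors.createpermutation(dim)
wordvector = sparsevectors.newrandomvector(dim, 10)
u = sparsevectors.newemptyvector(dim)
u = sparsevectors.sparseadd(
    u,
    sparsevectors.permute(sparsevectors.normalise(wordvector),
                          agentpermutation),
    1)
# Probing u with the same word permuted by the same role recovers the
# binding; probing with the unpermuted word should score near zero:
probe = sparsevectors.permute(sparsevectors.normalise(wordvector),
                              agentpermutation)
print(sparsevectors.sparsecosine(u, probe))
print(sparsevectors.sparsecosine(u, wordvector))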
print(probe, mc, n[mc], sentencerepository[mc])
print(space.contexttoindexneighbourswithweights(probe))
for v in vectorrepository:
    print(v, sentencerepository[v], sep="\t", end="\t")
    # print(v, vectorrepository[v])
    ww = nltk.word_tokenize(sentencerepository[v])
    vec = sparsevectors.newemptyvector(dimensionality)
    # for www in ww:
    #     print(www, space.indexspace[www], space.globalfrequency[www],
    #           space.frequencyweight(www),
    #           sparsevectors.sparsecosine(space.indexspace[www],
    #                                      vectorrepository[v]))
    nvn = {}
    for www in ww:
        nvn[www] = sparsevectors.sparsecosine(space.indexspace[www],
                                              vectorrepository[v])
        vec = sparsevectors.sparseadd(
            vec, sparsevectors.normalise(space.indexspace[www]),
            space.frequencyweight(www))
    m = sorted(ww, key=lambda k: nvn[k], reverse=True)[:5]
    for mc in m:
        if nvn[mc] > 0.0001:
            print(mc, nvn[mc], sep=":", end="\t")
    print()
if False:  # debug dump, disabled
    for w in space.items():
        print(w, space.globalfrequency[w], space.indexspace[w], sep="\t")
        print("\t\t", space.contextspace[w])
# show that constructional items work the same way
# show that permuted semantic roles work: "semantic grep"
logger("Started training.", monitor) textindex = 0 textspace = SemanticSpace(dimensionality, denseness) modifiedtextspace = SemanticSpace(dimensionality, denseness) squintfeaturespace = SemanticSpace(dimensionality, denseness) fullspace = SemanticSpace(dimensionality, denseness) textdepot = {} modifiedtextdepot = {} featuredepot = {} e = xml.etree.ElementTree.parse(textfile).getroot() for b in e.iter("document"): textindex += 1 tvector = sparsevectors.normalise( stringspace.textvector(b.text, frequencyweighting)) textspace.additem(textindex, tvector) newtext = squintinglinguist.generalise(b.text) mvector = sparsevectors.normalise( stringspace.textvector(newtext, frequencyweighting)) modifiedtextspace.additem(textindex, mvector) features = squintinglinguist.featurise(b.text) fvector = sparsevectors.newemptyvector(dimensionality) for feature in features: fv = stringspace.getvector(feature) fvector = sparsevectors.sparseadd(fvector, sparsevectors.normalise(fv), stringspace.frequencyweight(feature)) fvector = sparsevectors.normalise(fvector) squintfeaturespace.additem(textindex, fvector) pvector = sparsevectors.normalise(stringspace.postriplevector(b.text)) avector = sparsevectors.sparseadd(
            monitor)
authorindex = 0
testitemspace = SemanticSpace()
nn = 0
for file in testfiles:
    authorname = file.split(".")[0].split("/")[-1]
    authorindex += 1
    logger("Reading " + str(authorindex) + " " + file, monitor)
    workingvector = sparsevectors.newemptyvector(dimensionality)
    e = xml.etree.ElementTree.parse(file).getroot()
    for b in e.iter("document"):
        origtext = b.text
        avector = sparsevectors.newemptyvector(dimensionality)
        if fulltext:
            avector = sparsevectors.normalise(
                stringspace.textvector(origtext, frequencyweighting))
        if generalise:
            newtext = squintinglinguist.generalise(origtext)
            avector = sparsevectors.sparseadd(
                avector,
                sparsevectors.normalise(
                    stringspace.textvector(newtext, frequencyweighting)))
        if featurise:
            features = squintinglinguist.featurise(origtext)
            for feature in features:
                fv = stringspace.getvector(feature)
                avector = sparsevectors.sparseadd(
                    avector, sparsevectors.normalise(fv),
                    stringspace.frequencyweight(feature))
        if postriples:
            posttriplevector = stringspace.postriplevector(origtext)
def doallthefiles(rangelimit=4000):
    filelist = {}
    seenfile = {}
    antal_frag = 0
    # Collect, for each index, one file per category; drop indices where any
    # category file is missing.
    for ix in range(rangelimit):
        filelist[ix] = {}
        seenfile[ix] = True
        for cat in categories:
            fn = "{}{}.of_{:0>4d}.json.txt".format(path, cat, ix)
            try:
                os.stat(fn)
                filelist[ix][cat] = fn
            except OSError:
                seenfile[ix] = None
                logger("index {} did not match up {} file: {}".format(
                    ix, cat, fn), error)
                del filelist[ix]
                break  # one missing file disqualifies the whole index
    logger("number of files: {}".format(len(filelist)), monitor)
    conditions = ["wp", "wd", "wn", "wdp", "wnp", "wnd", "wndp"]
    # Which tuple positions each condition combines: position 0 is the word,
    # positions 1-3 are the three label categories.
    conditionindices = {
        "wp": (0, 1),
        "wd": (0, 2),
        "wn": (0, 3),
        "wnp": (0, 1, 2),
        "wnd": (0, 1, 3),
        "wdp": (0, 2, 3),
        "wndp": (0, 1, 2, 3),
    }
    vocabulary = {cc: Counter() for cc in conditions}
    vocabulary_words = Counter()
    vocabulary_labels = Counter()
    outfrag = {}
    for fileindex in filelist:
        if seenfile[fileindex]:
            zippy = mergefiles(filelist[fileindex][categories[0]],
                               filelist[fileindex][categories[1]],
                               filelist[fileindex][categories[2]],
                               filelist[fileindex][categories[3]])
            outfiles = {
                cc: open('{}{}/new_{:0>4d}.txt'.format(outpath, cc,
                                                       fileindex), "w+")
                for cc in conditions
            }
            for fragment in zippy:
                antal_frag += 1
                for cc in conditions:
                    outfrag[cc] = []
                for oneitem in fragment:
                    vocabulary_words.update([oneitem[0]])
                    vocabulary_labels.update([oneitem[1]])
                    vocabulary_labels.update([oneitem[2]])
                    vocabulary_labels.update([oneitem[3]])
                    for cc in conditions:
                        parts = [oneitem[i] for i in conditionindices[cc]]
                        vocabulary[cc].update([joinstring.join(parts)])
                        outfrag[cc].append("".join(parts))
                for cc in conditions:
                    outfiles[cc].write(" ".join(outfrag[cc]) + "\n")
            for cc in conditions:
                outfiles[cc].close()
    logger("number of fragments: {}".format(antal_frag), monitor)
    vocab_words = {w for w, c in vocabulary_words.items() if c >= MINCOUNT}
    size_vocab = len(vocab_words)
    logger("number of words (std): {}".format(size_vocab), monitor)
    embeddings = {}
    for w in vocab_words:
        embeddings[w] = sparsevectors.newrandomvector(dimensionality, density)
    vocab_labels = {w for w, c in vocabulary_labels.items() if c >= MINCOUNT}
    size_vocab = len(vocab_labels)
    logger("number of tags in total: {}".format(size_vocab), monitor)
    labelembeddings = {}
    for w in vocab_labels:
        try:
            labelembeddings[w] = sparsevectors.newrandomvector(
                dimensionality, labeldensity)
        except IndexError:
            logger("Indexerror: {}".format(w), error)
    for cc in conditions:
        vocab_words = {w for w, c in vocabulary[cc].items() if c >= MINCOUNT}
        size_vocab = len(vocab_words)
        compositeembeddings = {}
        logger("number of words in {}: {}".format(cc, size_vocab), monitor)
        with open('{}{}/vocab.words.txt'.format(outpath, cc), "w+") as f:
            for wdl in sorted(list(vocab_words)):
                wd = "".join(wdl.split(joinstring))
                f.write('{}\n'.format(wd))
                # Composite vector: the word's embedding plus one label
                # embedding per attached label.
                vv = embeddings[wdl.split(joinstring)[0]]
                for ll in wdl.split(joinstring)[1:]:
                    vv = sparsevectors.sparseadd(vv, labelembeddings[ll])
                compositeembeddings[wd] = sparsevectors.listify(
                    sparsevectors.normalise(vv), dimensionality)
        with open('{}{}/compositevectors.txt'.format(outpath, cc), "w+") as f:
            for www in compositeembeddings:
                f.write("{} {}\n".format(
                    www, " ".join(map(str, compositeembeddings[www]))))
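# A minimal sketch for reading the composite vectors written above back in,
# assuming the space-separated "word v1 v2 ..." layout that doallthefiles
# produces; the condition "wndp" and the outpath variable are illustrative:
compositeembeddings = {}
with open('{}{}/compositevectors.txt'.format(outpath, "wndp")) as f:
    for line in f:
        fields = line.split()
        compositeembeddings[fields[0]] = [float(x) for x in fields[1:]]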