def addconstant(self, item):
    self.changed = True
    self.additem(item,
                 sparsevectors.newrandomvector(
                     self.dimensionality,
                     self.dimensionality // self.constantdenseness))
def additem(self, item, vector="dummy"): if vector is "dummy": vector = sparsevectors.newrandomvector(self.dimensionality, self.denseness) if not self.contains(item): self.indexspace[item] = vector self.contextspace[item] = sparsevectors.newemptyvector( self.dimensionality)
def addintoitem(self, item, vector, weight=1):
    if not self.contains(item):
        # Create a fresh index vector for the new item without clobbering
        # the `vector` argument: the original reassigned `vector` here, so
        # the caller-supplied vector was silently replaced by the random
        # index vector before being added into context space.
        self.indexspace[item] = sparsevectors.newrandomvector(
            self.dimensionality, self.denseness)
        self.globalfrequency[item] = 0
        self.contextspace[item] = sparsevectors.newemptyvector(
            self.dimensionality)
    self.contextspace[item] = sparsevectors.sparseadd(
        self.contextspace[item], sparsevectors.normalise(vector), weight)
def additem(self, item, vector=None):
    """
    Add a new item to the space. Add a randomly generated index vector
    (unless one is given as an argument or one is already recorded in
    index space); add an empty context vector, prepping the LanguageModel
    to accommodate the item. Should normally be called from observe() but
    also at times from addintoitem().
    """
    if item not in self.indexspace:
        if vector is None:
            vector = sparsevectors.newrandomvector(self.dimensionality,
                                                   self.denseness)
        self.indexspace[item] = vector
        self.contextspace[item] = sparsevectors.newemptyvector(
            self.dimensionality)
        self.changed = True
        self.observedfrequency[item] = 0
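# For reference, a minimal sketch of the two sparsevectors primitives the
# methods above rely on, assuming the usual random-indexing convention of
# ternary vectors stored as sparse {index: value} dicts, and treating the
# second argument of newrandomvector as the number of nonzero coordinates
# (consistent with calls like dimensionality // 10 above). This is a guess
# at the module's behaviour, not its actual implementation.
import random

def newemptyvector(dimensionality):
    # Empty sparse vector: no nonzero coordinates.
    return {}

def newrandomvector(dimensionality, denseness):
    # Ternary random index vector: `denseness` nonzero positions drawn at
    # random, alternating +1 and -1.
    vector = {}
    for i, position in enumerate(random.sample(range(dimensionality),
                                               denseness)):
        vector[position] = 1 if i % 2 == 0 else -1
    return vector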
def additem(self, item, vector="dummy"): if vector is "dummy": vector = sparsevectors.newrandomvector(self.dimensionality, self.denseness) if not self.contains(item): self.indexspace[item] = vector self.globalfrequency[item] = 1 self.contextspace[item] = sparsevectors.newemptyvector(self.dimensionality) self.attributespace[item] = sparsevectors.newemptyvector(self.dimensionality) self.morphologyspace[item] = sparsevectors.newemptyvector(self.dimensionality) # self.textspace[item] = sparsevectors.newemptyvector(self.dimensionality) # self.utterancespace[item] = sparsevectors.newemptyvector(self.dimensionality) # self.authorspace[item] = sparsevectors.newemptyvector(self.dimensionality) self.bign += 1
def additemintoitem(self, item, otheritem, weight=1, operator=None):
    """
    Update the context vector of item by adding in the index vector of
    otheritem multiplied by the scalar weight. If item is unknown, add it
    to the space. If otheritem is unknown, add only an index vector for it.
    :param item: str
    :param otheritem: str
    :param weight: float
    :param operator: permutation (list) or None
    :return: None
    """
    if not self.contains(item):
        self.additem(item)
    if otheritem not in self.indexspace:
        self.indexspace[otheritem] = sparsevectors.newrandomvector(
            self.dimensionality, self.denseness)
    self.addintoitem(item, self.indexspace[otheritem], weight, operator)
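# The context update above bottoms out in sparseadd and normalise. A
# plausible dict-based reading of those two helpers, consistent with the
# call sites (sparseadd takes an optional weight) but assumed rather than
# taken from the actual sparsevectors module:
import math

def sparseadd(vector, othervector, weight=1):
    # Weighted elementwise sum of two sparse {index: value} vectors.
    result = dict(vector)
    for i, value in othervector.items():
        result[i] = result.get(i, 0) + weight * value
    return result

def normalise(vector):
    # Scale a sparse vector to unit Euclidean length.
    norm = math.sqrt(sum(v * v for v in vector.values()))
    return {i: v / norm for i, v in vector.items()} if norm else dict(vector)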
def __init__(self, dimensionality=2000, window=3, sequencelabel=None,
             permutations=None):
    # Avoid a mutable default argument: the original's permutations={}
    # would share one dict across all instances.
    if permutations is None:
        permutations = {}
    self.window = window
    self.changed = False
    self.dimensionality = dimensionality
    if sequencelabel is None:
        self.sequencelabel = sparsevectors.newrandomvector(
            dimensionality, dimensionality // 10)
        self.changed = True
    else:
        self.sequencelabel = sequencelabel
    self.permutations = permutations
    self.error = True
    self.debug = False
    self.monitor = False
    str(seenw[something]) + "\n")  # tail of a write() call truncated in the source
wordstatsoutfile.flush()
wordstatsoutfile.close()
logger("Computing ngram frequency-based weights with " + str(i) +
       " files " + str(file), monitor)
e = xml.etree.ElementTree.parse(file).getroot()
for b in e.iter("document"):
    string = b.text
    words = word_tokenize(string)
    # The original computed str(string).replace("\n", "") and discarded
    # the result; assign it so character windows do not span line breaks.
    string = string.replace("\n", "")
    windows = [string[ii:ii + window]
               for ii in range(len(string) - window + 1)]
    for sequence in windows:
        seen[sequence] += 1
        if seen[sequence] == 1:
            thisvector = stringspace.makevector(sequence)
            itemj = {}
            itemj["string"] = sequence
            itemj["indexvector"] = thisvector
            pickle.dump(itemj, ngramvectoroutfile)
    for word in words:
        seenw[word] += 1
        if seenw[word] == 1:
            itemj = {}
            # The original stored `sequence` here, i.e. the last character
            # ngram from the loop above, rather than the word itself.
            itemj["string"] = word
            itemj["indexvector"] = sparsevectors.newrandomvector(
                dimensionality, denseness)
            pickle.dump(itemj, wordvectoroutfile)
def additem(self, item):
    self.indexspace[item] = sparsevectors.newrandomvector(self.dimensionality,
                                                          self.denseness)
    self.globalfrequency[item] = 1
    self.bign += 1
def doallthefiles(rangelimit=4000):
    filelist = {}
    seenfile = {}
    antal_frag = 0
    for ix in range(rangelimit):
        filelist[ix] = {}
        seenfile[ix] = True
        for cat in categories:
            fn = "{}{}.of_{:0>4d}.json.txt".format(path, cat, ix)
            try:
                os.stat(fn)
                filelist[ix][cat] = fn
            except OSError:
                # The original used a bare except; OSError is what os.stat
                # raises for a missing file.
                seenfile[ix] = None
                logger("index {} did not match up {} file: {}".format(
                    ix, cat, fn), error)
        # Drop incomplete indices after the category loop: the original
        # deleted filelist[ix] inside it, which raises KeyError on the
        # next category of the same index.
        if seenfile[ix] is None:
            del filelist[ix]
    logger("number of files: {}".format(len(filelist)), monitor)
    conditions = ["wp", "wd", "wn", "wdp", "wnp", "wnd", "wndp"]
    # Which fields of each item feed each condition, exactly as in the
    # original per-condition blocks.
    fieldindices = {"wp": (0, 1), "wd": (0, 2), "wn": (0, 3),
                    "wnp": (0, 1, 2), "wnd": (0, 1, 3),
                    "wdp": (0, 2, 3), "wndp": (0, 1, 2, 3)}
    vocabulary = {cc: Counter() for cc in conditions}
    vocabulary_words = Counter()
    vocabulary_labels = Counter()
    outfrag = {}
    for fileindex in filelist:
        if seenfile[fileindex]:
            zippy = mergefiles(filelist[fileindex][categories[0]],
                               filelist[fileindex][categories[1]],
                               filelist[fileindex][categories[2]],
                               filelist[fileindex][categories[3]])
            outfiles = {cc: open('{}{}/new_{:0>4d}.txt'.format(
                outpath, cc, fileindex), "w+") for cc in conditions}
            for fragment in zippy:
                antal_frag += 1
                for cc in conditions:
                    outfrag[cc] = []
                for oneitem in fragment:
                    vocabulary_words.update([oneitem[0]])
                    vocabulary_labels.update([oneitem[1], oneitem[2],
                                              oneitem[3]])
                    for cc in conditions:
                        fields = [oneitem[i] for i in fieldindices[cc]]
                        # joinstring-delimited key for the vocabulary,
                        # plain concatenation for the output token.
                        vocabulary[cc].update([joinstring.join(fields)])
                        outfrag[cc].append("".join(fields))
                for cc in conditions:
                    outfiles[cc].write(" ".join(outfrag[cc]) + "\n")
            for cc in conditions:
                outfiles[cc].close()
    logger("number of fragments: {}".format(antal_frag), monitor)
    vocab_words = {w for w, c in vocabulary_words.items() if c >= MINCOUNT}
    size_vocab = len(vocab_words)
    logger("number of words, std: {}".format(size_vocab), monitor)
    embeddings = {}
    for w in vocab_words:
        embeddings[w] = sparsevectors.newrandomvector(dimensionality, density)
    vocab_labels = {w for w, c in vocabulary_labels.items() if c >= MINCOUNT}
    size_vocab = len(vocab_labels)
    logger("number of tags in total: {}".format(size_vocab), monitor)
    labelembeddings = {}
    for w in vocab_labels:
        try:
            labelembeddings[w] = sparsevectors.newrandomvector(
                dimensionality, labeldensity)
        except IndexError:
            logger("Indexerror: {}".format(w), error)
    for cc in conditions:
        vocab_words = {w for w, c in vocabulary[cc].items() if c >= MINCOUNT}
        size_vocab = len(vocab_words)
        compositeembeddings = {}
        logger("number of words in {}: {}".format(cc, size_vocab), monitor)
        with open('{}{}/vocab.words.txt'.format(outpath, cc), "w+") as f:
            for wdl in sorted(list(vocab_words)):
                # Composite vector: the word's index vector plus the index
                # vectors of its labels, normalised and densified.
                wd = "".join(wdl.split(joinstring))
                f.write('{}\n'.format(wd))
                vv = embeddings[wdl.split(joinstring)[0]]
                for ll in wdl.split(joinstring)[1:]:
                    vv = sparsevectors.sparseadd(vv, labelembeddings[ll])
                compositeembeddings[wd] = sparsevectors.listify(
                    sparsevectors.normalise(vv), dimensionality)
        with open('{}{}/compositevectors.txt'.format(outpath, cc), "w+") as f:
            for www in compositeembeddings:
                f.write("{} {}\n".format(
                    www, " ".join(map(str, compositeembeddings[www]))))
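# `listify`, used above and in the npz script below, presumably expands a
# sparse {index: value} vector into a dense fixed-length list; a minimal
# sketch under that assumption (not the actual sparsevectors module):
def listify(vector, dimensionality):
    # Dense list of floats with the sparse entries filled in.
    dense = [0.0] * dimensionality
    for i, value in vector.items():
        dense[i] = value
    return dense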
def addconstant(self, item):
    self.additem(item,
                 sparsevectors.newrandomvector(self.dimensionality,
                                               self.dimensionality // 10))
path = "/home/jussi/aktuellt/2018.recfut/tf_ner/data/recfut/" # read words file if __name__ == '__main__': # Load vocab with open(path + "vocab.words.txt", "r+") as f: word_to_idx = {line.strip(): idx for idx, line in enumerate(f)} size_vocab = len(word_to_idx) print("antal ord {}".format(size_vocab)) # Array of zeros embeddings = np.zeros((size_vocab, dimensionality)) for word in word_to_idx: vector = sparsevectors.newrandomvector(dimensionality, density) word_idx = word_to_idx[word] embeddings[word_idx] = sparsevectors.listify(vector, dimensionality) np.savez_compressed(path + 'randomindex.npz', embeddings=embeddings) with open(path + "vocab.words.txt", "r+") as f: word_to_idx = {line.strip(): idx for idx, line in enumerate(f)} size_vocab = len(word_to_idx) print("antal ord {}".format(size_vocab)) # Array of zeros embeddings = np.zeros((size_vocab, dimensionality)) for word in word_to_idx:
def addconstant(self, item):
    self.constantcollection[item] = sparsevectors.newrandomvector(
        self.dimensionality, self.denseness)