import pickle

import numpy as np
from sklearn.linear_model import LogisticRegression
# Vocab (a word <-> id mapping with GetID/GetWord/GetVocabSize/Lock) is provided elsewhere in this project.


class LRmulticlass(object):
    def __init__(self):
        self.model = None

    def json2Vocab(self, jsonInstance):
        # map each feature name to its vocab id, keeping the feature value
        vocabd = {}
        for k in jsonInstance.keys():
            vocabd[self.vocab.GetID(k)] = jsonInstance[k]
        return vocabd

    def json2Vector(self, jsonInstance):
        # dense feature vector; ids start at 1, so column index is id - 1
        result = np.zeros(self.vocab.GetVocabSize())
        for k in jsonInstance.keys():
            if self.vocab.GetID(k) > 0:
                result[self.vocab.GetID(k) - 1] = jsonInstance[k]
        return result

    def Train(self, jsonDataset):
        x, y = [d[0] for d in jsonDataset], [int(d[1]) for d in jsonDataset]
        self.vocab = Vocab()
        x_vocabd = [self.json2Vocab(d) for d in x]
        with open("vocab_train.save", 'wb') as vocabfile:
            pickle.dump(self.vocab, vocabfile)
        self.vocab.Lock()
        X_matrix = np.zeros((len(x_vocabd), self.vocab.GetVocabSize()))
        for i in range(len(x_vocabd)):
            for (j, v) in x_vocabd[i].items():
                X_matrix[i, j - 1] = v
        lrmulti = LogisticRegression(solver='lbfgs', multi_class='multinomial')
        lrmulti.fit(X_matrix, np.array(y))
        self.model = lrmulti

    def Predict(self, jsonInstance):
        with open("vocab_train.save", 'rb') as vocabfile:
            self.vocab = pickle.load(vocabfile)
        self.vocab.Lock()
        return self.model.predict(self.json2Vector(jsonInstance).reshape(1, -1))

    def PredictProba(self, jsonInstance):
        return self.model.predict_proba(self.json2Vector(jsonInstance).reshape(1, -1))

    def printWeights(self, outFile):
        # write the per-class feature weights, sorted by weight
        with open(outFile, 'w') as fwout:
            classes = self.model.coef_.shape[0]
            for i in range(classes):
                fwout.write("Class %s\n" % i)
                curCatWeights = self.model.coef_[i]
                for j in np.argsort(curCatWeights):
                    try:
                        fwout.write("%s\t%s\n" % (self.vocab.GetWord(j + 1), curCatWeights[j]))
                    except KeyError:
                        pass
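# Hedged usage sketch (not from the original source): assuming a Vocab
# implementation with the interface noted above, LRmulticlass can be trained
# on toy JSON-style feature dicts and queried like this. The toy feature
# names, labels and the "weights.txt" path are illustrative assumptions.
toy_dataset = [
    ({"red": 1.0, "round": 1.0}, 0),      # (feature dict, class label)
    ({"yellow": 1.0, "long": 1.0}, 1),
    ({"red": 1.0, "shiny": 1.0}, 0),
]
clf = LRmulticlass()
clf.Train(toy_dataset)
print(clf.Predict({"red": 1.0, "round": 1.0}))          # predicted class id
print(clf.PredictProba({"yellow": 1.0, "long": 1.0}))   # per-class probabilities
clf.printWeights("weights.txt")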
def test():
    max_len = 5
    vocab_file = 'data/vocab_mc5.txt'
    vocab_src = Vocab('model_vocab')
    vocab_src.load_vocab(vocab_file)
    vocab_size = vocab_src.get_n_words
    # random scores turned into per-step probability distributions over the vocab
    outputs = torch.randn(max_len, vocab_size)
    outputs = functional.softmax(outputs, dim=-1)
    beam_search_decoder(outputs, 3, vocab_src)
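# Hedged sketch (assumption, not the original implementation): beam_search_decoder
# is called above but not defined in this snippet. A minimal version, assuming
# `outputs` is a (max_len, vocab_size) matrix of per-step probabilities and `k`
# is the beam width, could look like this; the (sequence, score) tuple layout
# and the unused `vocab` argument are assumptions.
import math

def beam_search_decoder_sketch(outputs, k, vocab=None):
    """Return the k highest-scoring index sequences under step-wise log-probabilities."""
    sequences = [([], 0.0)]
    for step in outputs:                    # step: probabilities over the vocabulary
        candidates = []
        for seq, score in sequences:
            for idx, p in enumerate(step):
                candidates.append((seq + [idx], score + math.log(float(p) + 1e-12)))
        # keep only the k best partial sequences
        candidates.sort(key=lambda pair: pair[1], reverse=True)
        sequences = candidates[:k]
    return sequences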
def __init__(self, opt):
    self.opt = opt  # command-line options
    self.char_vocab = Vocab('char')
    self.tag_vocab = Vocab('tag', is_tag=True)
    self.char_embedding = None  # borrowed from elsewhere
    # In any case, define the data fields here.
    self.char_embedding_dim = 256
    # Iterators for the training / validation / test data.
    self.train_iter = None
    self.dev_iter = None
    self.test_iter = None
def Train(self, jsonDataset):
    x, y = [d[0] for d in jsonDataset], [int(d[1]) for d in jsonDataset]
    self.vocab = Vocab()
    x_vocabd = [self.json2Vocab(d) for d in x]
    self.vocab.Lock()
    X_matrix = np.zeros((len(x_vocabd), self.vocab.GetVocabSize()))
    for i in range(len(x_vocabd)):
        for (j, v) in x_vocabd[i].items():
            X_matrix[i, j - 1] = v
    lrmulti = LogisticRegression(solver='lbfgs', multi_class='multinomial', C=0.3)
    lrmulti.fit(X_matrix, np.array(y))
    self.model = lrmulti
def __init__(self, batch, train_kbest=None, train_gold=None, dev_kbest=None, dev_gold=None,
             test_kbest=None, test_gold=None, vocab_path=None):
    self.vocab = None
    self.train_kbest = train_kbest
    self.train_gold = train_gold
    self.dev_kbest = dev_kbest
    self.dev_gold = dev_gold
    self.batch = batch
    self.test_kbest = test_kbest
    self.test_gold = test_gold
    if os.path.exists(vocab_path):
        print('load vocab')
        self.max_degree, self.vocab = data_util.load_dict(vocab_path)
    else:
        print('create vocab')
        self.vocab = Vocab.Vocab(self.train_gold)
        print('get max_degree')
        self.max_degree = self.get_max_degree()
        print('save dictionary')
        data_util.save_dict(self.vocab, self.max_degree, vocab_path)
    print('vocab size: ' + str(self.vocab.size()))
    print('max_degree: ' + str(self.max_degree))
    print('get dev data')
    self.dev_data = dev_reader.read_dev(dev_kbest, dev_gold, self.vocab)
    print('number of dev: ' + str(len(self.dev_data)))
    print('get train data')
    self.train_data = dev_reader.read_dev(train_kbest, train_gold, self.vocab)
    print('number of train: ' + str(len(self.train_data)))
def add_album(album):
    '''This can only be called once at a time!'''
    album = RDF.Node(RDF.Uri(album))
    cmd = ['gnupod_addsong.pl', '--decode=mp3']
    for track in TripleStore.model.get_targets(album, Vocab.ns['mo'].track):
        cmd.append(Vocab.track_filename(track))
    artist = TripleStore.model.get_target(album, Vocab.ns['foaf'].maker)
    for tag in Vocab.tags(artist):
        cmd += ['-p', tag]
    for tag in Vocab.tags(album):
        cmd += ['-p', tag]
    subprocess.call(cmd)
def parseQuestion(self, line, number):
    if line[TYPE_POSITION] == 'T/F':
        # strip surrounding quotes so the question does not print with them
        question = TrueFalse(
            line[ANSWER_POSITION].strip(),
            line[QUESTION_POSITION].strip().strip('"')
        )
        self.quiz.addQuestion(question)
    elif line[TYPE_POSITION] == 'V':
        question = Vocab(line[ANSWER_POSITION].strip(),
                         line[QUESTION_POSITION].strip().strip('"'))
        self.quiz.addQuestion(question)
    elif line[TYPE_POSITION] == 'M':
        question = MultipleChoice(
            line[MULT_ANSWER_POSITION].strip(),
            line[QUESTION_POSITION].strip().strip('"'))
        for option in line[QUESTION_POSITION + 1:MULT_ANSWER_POSITION]:
            question.addOption(option)
        self.quiz.addQuestion(question)
    else:
        print("Unknown question type. Skipping question on line", number)
def load_data():
    global data_collection
    start = time.time()
    if os.path.exists(paras.DATA_COL):
        with open(paras.DATA_COL, 'rb') as f:
            data_collection = pickle.load(f)
    else:
        vocab = Vocab.Vocab()
        schemas = Schema.load_schema()
        train_data = Data.load_data(paras.TRAIN_DATA_MERGE, schemas)
        test_data = Data.load_data(paras.TEST_DATA, schemas)
        train_data.get_indexes(vocab)
        test_data.get_indexes(vocab)
        print(Data.tot1)
        print(Data.tot2)
        print('train_data number:', len(train_data.data))
        data_collection = DataCollection(vocab, schemas, train_data, test_data)
        with open(paras.DATA_COL, 'wb') as f:
            pickle.dump(data_collection, f)
    end = time.time()
    data_collection.vocab.print_info()
    data_collection.schemas.print_info()
    print('load data time cost:', end - start)
class Data:
    def __init__(self, opt):
        self.opt = opt  # command-line options
        self.char_vocab = Vocab('char')
        self.tag_vocab = Vocab('tag', is_tag=True)
        self.char_embedding = None  # borrowed from elsewhere
        # In any case, define the data fields here.
        self.char_embedding_dim = 256
        # Iterators for the training / validation / test data.
        self.train_iter = None
        self.dev_iter = None
        self.test_iter = None
        # train_data uses the BIO tagging scheme; MSRA is already BIO, so no
        # other schemes are handled yet. This will need extending as datasets are added.

    def build_tag_vocab(self, train_data):
        """
        Build the tag vocab. tag_vocab has no unk token, but a pad token is added.
        :param train_data: training set, a DataFrame whose sentences are char lists
        :return:
        """
        print('building tag vocab')
        for tag_list in train_data['label']:
            for tag in tag_list:
                self.tag_vocab.add(tag)
        self.tag_vocab.tag_add_pad()

    def build_char_vocab(self, train_data, dev_data, test_data):
        """
        Build the char vocab (and tags) from the datasets, so that the iterable
        datasets can later be generated directly from the files.
        :return:
        """
        print('building char vocab')
        for sentence in train_data['sentence']:
            for token in sentence:
                self.char_vocab.add(token)
        for sentence in test_data['sentence']:
            for token in sentence:
                self.char_vocab.add(token)
        for sentence in dev_data['sentence']:
            for token in sentence:
                self.char_vocab.add(token)

    # Used on the first run to build and save vocab, tag_vocab and the
    # pretrained embedding; after this, everything except the datasets is ready.
    def build_vocab_pipeline(self):
        if self.opt.load_data is None:
            train_data = get_data(self.opt.train)
            dev_data = get_data(self.opt.dev)
            test_data = get_data(self.opt.test)
            self.build_char_vocab(train_data, dev_data, test_data)
            self.build_tag_vocab(train_data)
            # The vocabs are ready; now load the pretrained Chinese character embeddings.
            self.load_char_pretrained_embedding('data/news_char_256.vec')
            # Save the vocabs and the embedding matrix.
            self.char_vocab.save(self.opt.save_data + os.sep + 'char_vocab')
            self.tag_vocab.save(self.opt.save_data + os.sep + 'tag_vocab')
            print('saving vector of char')
            pretrained_file_name = 'char_embedding_matrix_' + str(self.char_embedding_dim)
            np.save(self.opt.save_data + os.sep + pretrained_file_name, self.char_embedding)
        else:
            # Load everything saved on the first run: the two vocabs and the embedding matrix.
            self.char_vocab.load(self.opt.load_data + os.sep + 'char_vocab')
            self.tag_vocab.load(self.opt.load_data + os.sep + 'tag_vocab')
            self.char_embedding = np.load(self.opt.load_data + os.sep + 'char_embedding_matrix_256.npy')
            self.char_embedding_dim = self.char_embedding.shape[1]

    def build_data(self, batch_size):
        # The vocabs and embeddings are initialised here; then the data iterators
        # are built. batch_size lives in the model config, so it is passed in.
        self.build_vocab_pipeline()
        if self.opt.status.lower() == 'train':
            self.train_iter = data_iterator(self.opt.train, self.char_vocab, self.tag_vocab, batch_size)
            self.dev_iter = data_iterator(self.opt.dev, self.char_vocab, self.tag_vocab, batch_size)
        elif self.opt.status.lower() == 'test':
            self.test_iter = data_iterator(self.opt.test, self.char_vocab, self.tag_vocab, batch_size)
        elif self.opt.status.lower() == 'decode':
            pass
        else:
            print('input error: train or test or decode')

    def load_char_pretrained_embedding(self, char_pretrained_path):
        self.char_embedding, self.char_embedding_dim = load_pretrained_embedding(char_pretrained_path, self.char_vocab)
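# Hedged usage sketch (argument names assumed to match those read by Data above,
# not taken from the original source): `opt` only needs the fields Data uses --
# train/dev/test paths, save_data/load_data directories and a status flag --
# so argparse can supply it.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--train', default='data/train.txt')
parser.add_argument('--dev', default='data/dev.txt')
parser.add_argument('--test', default='data/test.txt')
parser.add_argument('--save_data', default='save')
parser.add_argument('--load_data', default=None)
parser.add_argument('--status', default='train')
opt = parser.parse_args()

data = Data(opt)
data.build_data(batch_size=32)  # builds vocabs, embeddings and the train/dev iterators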
if self.shuffle == 1:
    print('DataProvider.readNextCache shuffle')
    random.shuffle(self.samples)
else:
    # sort by group size
    print('DataProvider.readNextCache sort')
    self.samples.sort(key=lambda SampleGroup: len(SampleGroup.samples), reverse=False)


if __name__ == "__main__":
    lang = "CMN"
    path = "../data/"
    filePath = path + "train.f.txt"
    hotFilePath = path + "NodeHot.txt"
    muti_name_file = path + "multiname.txt"
    att_file = ""
    batch_size = 8
    cache_size = 20000
    maxSampCount = 0
    shuffle = 1
    word_vocab = Vocab(path + "voc_char.txt")
    word_vocab.load()
    kb_vocab = Vocab(path + "voc_kb.txt")
    kb_vocab.load()
    kbp_type_vocab = Vocab(path + "kb_type.txt")
    kbp_type_vocab.load()
    kb_type_vocab = Vocab(path + "fb_type.txt")
    kb_type_vocab.load()
    dp = DataProvider(lang, filePath, hotFilePath, muti_name_file, att_file, batch_size,
                      cache_size, maxSampCount, shuffle, word_vocab, kb_vocab,
                      kbp_type_vocab, kb_type_vocab)
Rare_word = "Rare"
Numeric = "Numeric"
AllCap = "AllCap"
LastCap = "LastCap"
Delim = "Delimiter"

min_count = 5
TagCount = {}
TriGramCount = {}
BiGramCount = {}
UniGramCount = {}
mp = {}
delimiters = [",", "''", "``", "#", "$", "(", ")", ".", ":", ";", "%", "-", "}", "{"]  # yet to be done
TotalCount = 0
V = Vocab.VocabGenerator()

# Hyper-parameters
lamb1 = 0.65
lamb2 = 0.25
lamb3 = 0.1


def TagClean(s):
    if s in delimiters:
        return "DLM"
    elif s == "PRP$":
        return "PRP"
    elif s == "WP$":
        return "WP"
    elif s == "RBR" or s == "RBS" or s == "RB" or s == "WRB":
        return "RB"
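# Hedged sketch (assumption, not the original code): the hyper-parameters
# lamb1/lamb2/lamb3 above look like weights for linearly interpolating trigram,
# bigram and unigram tag probabilities (deleted interpolation). Given the count
# dictionaries TriGramCount, BiGramCount, UniGramCount and TotalCount, such an
# estimate is typically computed as below; the function name and tuple keys are assumed.
def interpolated_tag_prob(t1, t2, t3):
    tri = TriGramCount.get((t1, t2, t3), 0) / BiGramCount[(t1, t2)] if BiGramCount.get((t1, t2)) else 0.0
    bi = BiGramCount.get((t2, t3), 0) / UniGramCount[t2] if UniGramCount.get(t2) else 0.0
    uni = UniGramCount.get(t3, 0) / TotalCount if TotalCount else 0.0
    return lamb1 * tri + lamb2 * bi + lamb3 * uni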
import Vocab
from config import config
import pandas as pd

config_test = config.Configurable(r'C:\Users\ACH\Desktop\PycharmProjects\pythonProject\config\db.conf')
# training set
train = pd.read_csv(config_test.train_dir)[config_test.cloums.split(',')]
dev = pd.read_csv(config_test.dev_dir)[config_test.cloums.split(',')]
temp = pd.concat([train, dev], axis=0)
print(temp.head(10))
# build the vocabulary
vocab_pre = Vocab.Vocab_built(max_len=50)
vocab = vocab_pre.get_vocab_comments(train)
# print(vocab.stoi)  # {word: id}, e.g. '<unk>': 0, '<pad>': 1, 'the': 2, 'a': 3, 'and': 4
# Note: padding must use id 1 (<pad>) here, not the <unk> id.
def GET(self, tag):
    artists, albums = Vocab.artists_albums_tagged(tag)
    return render.tagged(tag, artists, albums)
def POST(self):
    i = web.input()
    Vocab.rate(i.uri, int(i.rating) * 2)
def GET(self):
    web.header('Content-Type', 'text/html; charset=utf-8')
    artists, albums = Vocab.artists_albums()
    return render.albums(artists, albums)
def POST(self):
    i = web.input()
    Vocab.tag(i.uri, i.tags)
        if key_str in self.cache_score2:
            return self.cache_score2[key_str]
        dis = self.tfidf_cos_dis(v1, v2)
        if wikiIsNull:
            dis = avgDis
        if dis > maxDis:
            dis = maxDis
        dis = dis / maxDis
        # discretise the normalised distance into buckets 0~9
        dis = int((dis - 0.000001) * 10)
        self.cache_score2[key_str] = dis
        if len(self.cache_score2) > self.cache_max_size:
            self.cache_score2.clear()
        return dis


if __name__ == "__main__":
    vocab = Vocab("../data/voc_char.txt")
    vocab.load()
    mat = DocMatcher()
    mat._loadIDF("../data/IDF.txt", vocab)
    v1 = ['a', 'f', 'c', '%UNK%', '%UNK%']
    v2 = ['a', 'b', 'e', 'askasasas', 'aaaaaaasadad']
    v3 = []
    v4 = []
    for v in v1:
        v3.append(vocab.search(v))
    for v in v2:
        v4.append(vocab.search(v))
    print(mat.tfidf_cos_dis(v3, v4))
def loadVocab(self, vocab_file, vocab_name):
    vob = Vocab(vocab_file)
    vob.load()
    if self.debug_mod >= 1:
        vob.printVocab(vocab_name, 10)
    return vob
def __init__(self):
    '''
    You can add more arguments, for example action and model paths.
    You need to load your model here.
    actions: provides indices for actions; it has the same order as the
    data/vocabs.actions file.
    '''
    # If you prefer to have your own index for actions, change this.
    self.actions = [
        'SHIFT',
        'LEFT-ARC:rroot', 'LEFT-ARC:cc', 'LEFT-ARC:number', 'LEFT-ARC:ccomp',
        'LEFT-ARC:possessive', 'LEFT-ARC:prt', 'LEFT-ARC:num', 'LEFT-ARC:nsubjpass',
        'LEFT-ARC:csubj', 'LEFT-ARC:conj', 'LEFT-ARC:dobj', 'LEFT-ARC:nn',
        'LEFT-ARC:neg', 'LEFT-ARC:discourse', 'LEFT-ARC:mark', 'LEFT-ARC:auxpass',
        'LEFT-ARC:infmod', 'LEFT-ARC:mwe', 'LEFT-ARC:advcl', 'LEFT-ARC:aux',
        'LEFT-ARC:prep', 'LEFT-ARC:parataxis', 'LEFT-ARC:nsubj', 'LEFT-ARC:<null>',
        'LEFT-ARC:rcmod', 'LEFT-ARC:advmod', 'LEFT-ARC:punct', 'LEFT-ARC:quantmod',
        'LEFT-ARC:tmod', 'LEFT-ARC:acomp', 'LEFT-ARC:pcomp', 'LEFT-ARC:poss',
        'LEFT-ARC:npadvmod', 'LEFT-ARC:xcomp', 'LEFT-ARC:cop', 'LEFT-ARC:partmod',
        'LEFT-ARC:dep', 'LEFT-ARC:appos', 'LEFT-ARC:det', 'LEFT-ARC:amod',
        'LEFT-ARC:pobj', 'LEFT-ARC:iobj', 'LEFT-ARC:expl', 'LEFT-ARC:predet',
        'LEFT-ARC:preconj', 'LEFT-ARC:root',
        'RIGHT-ARC:rroot', 'RIGHT-ARC:cc', 'RIGHT-ARC:number', 'RIGHT-ARC:ccomp',
        'RIGHT-ARC:possessive', 'RIGHT-ARC:prt', 'RIGHT-ARC:num', 'RIGHT-ARC:nsubjpass',
        'RIGHT-ARC:csubj', 'RIGHT-ARC:conj', 'RIGHT-ARC:dobj', 'RIGHT-ARC:nn',
        'RIGHT-ARC:neg', 'RIGHT-ARC:discourse', 'RIGHT-ARC:mark', 'RIGHT-ARC:auxpass',
        'RIGHT-ARC:infmod', 'RIGHT-ARC:mwe', 'RIGHT-ARC:advcl', 'RIGHT-ARC:aux',
        'RIGHT-ARC:prep', 'RIGHT-ARC:parataxis', 'RIGHT-ARC:nsubj', 'RIGHT-ARC:<null>',
        'RIGHT-ARC:rcmod', 'RIGHT-ARC:advmod', 'RIGHT-ARC:punct', 'RIGHT-ARC:quantmod',
        'RIGHT-ARC:tmod', 'RIGHT-ARC:acomp', 'RIGHT-ARC:pcomp', 'RIGHT-ARC:poss',
        'RIGHT-ARC:npadvmod', 'RIGHT-ARC:xcomp', 'RIGHT-ARC:cop', 'RIGHT-ARC:partmod',
        'RIGHT-ARC:dep', 'RIGHT-ARC:appos', 'RIGHT-ARC:det', 'RIGHT-ARC:amod',
        'RIGHT-ARC:pobj', 'RIGHT-ARC:iobj', 'RIGHT-ARC:expl', 'RIGHT-ARC:predet',
        'RIGHT-ARC:preconj', 'RIGHT-ARC:root'
    ]

    parser = OptionParser()
    parser.add_option("--train", dest="train_file", metavar="FILE", default=None)
    parser.add_option("--train_data", dest="train_data_file", metavar="FILE", default='data/train.data')
    parser.add_option("--test", dest="test_file", metavar="FILE", default=None)
    parser.add_option("--output", dest="output_file", metavar="FILE", default=None)
    parser.add_option("--model", dest="model_path", metavar="FILE", default='src/trained3.model')
    # For part 2 this option changes to trained2.model, since each part's model
    # is saved separately (similarly, trained3.model for part 3):
    # parser.add_option("--model", dest="model_path", metavar="FILE", default='src/trained2.model')
    parser.add_option("--vocab", dest="vocab_path", metavar="FILE", default=None)
    parser.add_option("--we", type="int", dest="we", default=64)
    parser.add_option("--pe", type="int", dest="pe", default=32)
    parser.add_option("--de", type="int", dest="de", default=32)
    parser.add_option("--hidden", type="int", dest="hidden", default=200)
    # For part 2 this option changes to 400:
    # parser.add_option("--hidden", type="int", dest="hidden", default=400)
    parser.add_option("--minibatch", type="int", dest="minibatch", default=1000)
    parser.add_option("--epochs", type="int", dest="epochs", default=7)
    (options, args) = parser.parse_args()

    net_properties = NetProperties(options.we, options.pe, options.de, options.hidden, options.minibatch)

    # creating the vocabulary
    vocab = Vocab()

    # constructing the network
    self.network = Network(vocab, net_properties)

    if os.path.isfile(options.model_path):
        print("Loading saved model")
        # loading the trained network model
        self.network.load(options.model_path)
    else:
        print("Training model")
        # training
        self.network.train(options.train_data_file, options.epochs)
        # saving the network
        self.network.save(options.model_path)
def POST(self):
    i = web.input()
    Vocab.rate(i.uri, int(i.rating) * 2)