import pickle

import numpy as np
from sklearn.linear_model import LogisticRegression


# Vocab is the project's vocabulary helper (GetID/GetWord/Lock/GetVocabSize).
class LRmulticlass(object):
    def __init__(self):
        self.model = None

    def json2Vocab(self, jsonInstance):
        vocabd = {}
        for k in jsonInstance.keys():
            vocabd[self.vocab.GetID(k)] = jsonInstance[k]
        return vocabd

    def json2Vector(self, jsonInstance):
        result = np.zeros(self.vocab.GetVocabSize())
        for k in jsonInstance.keys():
            if self.vocab.GetID(k) > 0:
                #print self.vocab.GetID(k)
                result[self.vocab.GetID(k) - 1] = jsonInstance[k]
        return result

    def Train(self, jsonDataset):
        x, y = [d[0] for d in jsonDataset], [int(d[1]) for d in jsonDataset]
        self.vocab = Vocab()
        x_vocabd = [self.json2Vocab(d) for d in x]
        with open("vocab_train.save", 'wb') as vocabfile:
            pickle.dump(self.vocab, vocabfile)
        self.vocab.Lock()
        X_matrix = np.zeros((len(x_vocabd), self.vocab.GetVocabSize()))
        for i in range(len(x_vocabd)):
            for (j, v) in x_vocabd[i].items():
                X_matrix[i, j - 1] = v
        lrmulti = LogisticRegression(solver='lbfgs', multi_class='multinomial')
        lrmulti.fit(X_matrix, np.array(y))
        self.model = lrmulti

    def Predict(self, jsonInstance):
        with open("vocab_train.save", 'rb') as vocabfile:
            self.vocab = pickle.load(vocabfile)
        self.vocab.Lock()
        return self.model.predict(
            self.json2Vector(jsonInstance).reshape(1, -1))

    def PredictProba(self, jsonInstance):
        return self.model.predict_proba(
            self.json2Vector(jsonInstance).reshape(1, -1))

    def printWeights(self, outFile):
        with open(outFile, 'w') as fwout:
            classes = self.model.coef_.shape[0]
            for i in range(classes):
                fwout.write("Class %s\n" % i)
                curCatWeights = self.model.coef_[i]
                for j in np.argsort(curCatWeights):
                    try:
                        fwout.write("%s\t%s\n" %
                                    (self.vocab.GetWord(j + 1), curCatWeights[j]))
                    except KeyError:
                        pass
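
A minimal usage sketch for the class above, assuming the project's Vocab helper (GetID/GetWord/Lock/GetVocabSize) behaves as the methods expect; the toy dataset is purely illustrative:

# Hypothetical toy data: Train() expects (feature-dict, label) pairs.
toy_dataset = [
    ({"good": 1.0, "great": 2.0}, 1),
    ({"bad": 1.0, "awful": 1.0}, 0),
]
clf = LRmulticlass()
clf.Train(toy_dataset)
print(clf.Predict({"good": 1.0}))        # predicted class label
print(clf.PredictProba({"good": 1.0}))   # per-class probabilities
clf.printWeights("weights.txt")          # one sorted weight block per class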
Example #2
import torch
from torch.nn import functional

# Vocab and beam_search_decoder come from the surrounding project.
def test():
    max_len = 5
    vocab_file = 'data/vocab_mc5.txt'
    vocab_src = Vocab('model_vocab')
    vocab_src.load_vocab(vocab_file)
    vocab_size = vocab_src.get_n_words
    outputs = torch.randn(max_len, vocab_size)
    outputs = functional.softmax(outputs, dim=-1)
    beam_search_decoder(outputs, 3, vocab_src)
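
For reference, a per-step beam search over a (max_len, vocab_size) probability matrix fits in a few lines; this is an illustrative stand-in (it ignores the vocab argument and assumes a NumPy array, e.g. outputs.numpy()), not the project's beam_search_decoder:

import numpy as np

def simple_beam_search(probs, k):
    # Keep the k best index sequences under a sum-of-log-probabilities score.
    sequences = [([], 0.0)]
    for row in probs:
        candidates = []
        for seq, score in sequences:
            for idx, p in enumerate(row):
                candidates.append((seq + [idx], score + np.log(p + 1e-12)))
        candidates.sort(key=lambda c: c[1], reverse=True)
        sequences = candidates[:k]
    return sequences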
Example #3
 def __init__(self, opt):
     self.opt = opt   # command-line options
     self.char_vocab = Vocab('char')
     self.tag_vocab = Vocab('tag', is_tag=True)
     self.char_embedding = None  # borrowed from another implementation
     # in any case, define the data fields here
     self.char_embedding_dim = 256
     # iterators holding the train / dev / test data
     self.train_iter = None
     self.dev_iter = None
     self.test_iter = None
Example #4
	def Train(self, jsonDataset):
		x, y = [d[0] for d in jsonDataset], [int(d[1]) for d in jsonDataset]
		self.vocab = Vocab()
		x_vocabd = [self.json2Vocab(d) for d in x]
		self.vocab.Lock()
		X_matrix = np.zeros((len(x_vocabd), self.vocab.GetVocabSize()))
		for i in range(len(x_vocabd)):
			for (j,v) in x_vocabd[i].items():
				X_matrix[i,j-1] = v
		lrmulti = LogisticRegression(solver='lbfgs', multi_class='multinomial', C=0.3)
		lrmulti.fit(X_matrix, np.array(y))
		self.model = lrmulti
Example #5
 def __init__(self, batch, train_kbest=None, train_gold=None, dev_kbest=None, dev_gold=None,
              test_kbest=None, test_gold=None, vocab_path=None):
     self.vocab = None
     self.train_kbest = train_kbest
     self.train_gold = train_gold
     self.dev_kbest = dev_kbest
     self.dev_gold = dev_gold
     self.batch = batch
     self.test_kbest = test_kbest
     self.test_gold = test_gold
     if os.path.exists(vocab_path):
         print('load vocab')
         self.max_degree, self.vocab = data_util.load_dict(vocab_path)
     else:
         print('create vocab')
         self.vocab = Vocab.Vocab(self.train_gold)
         print('get max_degree')
         self.max_degree = self.get_max_degree()
         print('save dictionary')
         data_util.save_dict(self.vocab, self.max_degree, vocab_path)
     print('vocab size: ' + str(self.vocab.size()))
     print('max_degree: ' + str(self.max_degree))
     print('get dev data')
     self.dev_data = dev_reader.read_dev(dev_kbest, dev_gold, self.vocab)
     print('number of dev: ' + str(len(self.dev_data)))
     #self.test_data = dev_reader.read_dev(test_kbest,test_gold,self.vocab)
     # print 'create train batch'
     # self.train_iter = train_iterator.train_iterator(train_kbest,train_gold,self.vocab,self.batch)
     print('get train data')
     self.train_data = dev_reader.read_dev(train_kbest, train_gold, self.vocab)
     print('number of train: ' + str(len(self.train_data)))
Example #6
def add_album(album):
    '''this can only be called once at a time!'''
    album = RDF.Node(RDF.Uri(album))

    cmd = ['gnupod_addsong.pl', '--decode=mp3']

    for track in TripleStore.model.get_targets(album, Vocab.ns['mo'].track):
        cmd.append(Vocab.track_filename(track))

    artist = TripleStore.model.get_target(album, Vocab.ns['foaf'].maker)
    for tag in Vocab.tags(artist):
        cmd += ['-p', tag]

    for tag in Vocab.tags(album):
        cmd += ['-p', tag]

    subprocess.call(cmd)
Example #7
File: gnupod.py  Project: bct/rdf-music
def add_album(album):
  '''this can only be called once at a time!'''
  album = RDF.Node(RDF.Uri(album))

  cmd = ['gnupod_addsong.pl', '--decode=mp3']

  for track in TripleStore.model.get_targets(album, Vocab.ns['mo'].track):
    cmd.append(Vocab.track_filename(track))

  artist = TripleStore.model.get_target(album, Vocab.ns['foaf'].maker)
  for tag in Vocab.tags(artist):
    cmd += ['-p', tag]

  for tag in Vocab.tags(album):
    cmd += ['-p', tag]

  subprocess.call(cmd)
Example #8
 def parseQuestion(self, line, number):
     if line[TYPE_POSITION] == 'T/F':
         question = TrueFalse(
             line[ANSWER_POSITION].strip(),
             line[QUESTION_POSITION].strip().strip('"')
         )  # i feel like I shouldn't need to do this, but I don't want it to print with quotes
         self.quiz.addQuestion(question)
     elif line[TYPE_POSITION] == 'V':
         question = Vocab(line[ANSWER_POSITION].strip(),
                          line[QUESTION_POSITION].strip().strip('"'))
         self.quiz.addQuestion(question)
     elif line[TYPE_POSITION] == 'M':
         question = MultipleChoice(
             line[MULT_ANSWER_POSITION].strip(),
             line[QUESTION_POSITION].strip().strip('"'))
         for option in line[QUESTION_POSITION + 1:MULT_ANSWER_POSITION]:
             question.addOption(option)
         self.quiz.addQuestion(question)
     else:
         print("Unknown Question Type. Skipping Question on line", number)
Example #9
def load_data():
    global data_collection
    start = time.time()
    if os.path.exists(paras.DATA_COL):
        with open(paras.DATA_COL, 'rb') as f:
            data_collection = pickle.load(f)
    else:
        vocab = Vocab.Vocab()
        schemas = Schema.load_schema()
        train_data = Data.load_data(paras.TRAIN_DATA_MERGE, schemas)
        test_data = Data.load_data(paras.TEST_DATA, schemas)
        train_data.get_indexes(vocab)
        test_data.get_indexes(vocab)
        print(Data.tot1)
        print(Data.tot2)
        print('train_data number:', len(train_data.data))
        data_collection = DataCollection(vocab, schemas, train_data, test_data)
        with open(paras.DATA_COL, 'wb') as f:
            pickle.dump(data_collection, f)
    end = time.time()
    data_collection.vocab.print_info()
    data_collection.schemas.print_info()
    print('load data time cost:', end - start)
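
The load-or-build-and-cache pattern above can be factored into a small helper; a generic sketch (the names are illustrative, not from the project):

import os
import pickle

def load_or_build(cache_path, build_fn):
    # Return the cached object if present, otherwise build it and cache it.
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    obj = build_fn()
    with open(cache_path, 'wb') as f:
        pickle.dump(obj, f)
    return obj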
Example #10
class Data:
    def __init__(self, opt):
        self.opt = opt   # command-line options
        self.char_vocab = Vocab('char')
        self.tag_vocab = Vocab('tag', is_tag=True)
        self.char_embedding = None  # borrowed from another implementation
        # in any case, define the data fields here
        self.char_embedding_dim = 256
        # iterators holding the train / dev / test data
        self.train_iter = None
        self.dev_iter = None
        self.test_iter = None

    # train_data uses the BIO tagging scheme throughout; MSRA is already BIO, so no other schemes are handled here.
    # As more datasets are added, this will certainly need extending.
    def build_tag_vocab(self, train_data):
        """
        Build the tag vocab. tag_vocab has no unk_token, but a pad token is added.
        :param train_data: the training set, a DataFrame whose sentence column is a list of chars
        :return:
        """
        print('building tag vocab')
        for tag_list in train_data['label']:
            for tag in tag_list:
                self.tag_vocab.add(tag)
        self.tag_vocab.tag_add_pad()

    def build_char_vocab(self, train_data, dev_data, test_data):
        """
        Build the char vocab from the datasets, so that iterable datasets can later be generated directly from the files.
        :return:
        """
        print('building char vocab')
        for sentence in train_data['sentence']:
            for token in sentence:
                self.char_vocab.add(token)
        for sentence in test_data['sentence']:
            for token in sentence:
                self.char_vocab.add(token)
        for sentence in dev_data['sentence']:
            for token in sentence:
                self.char_vocab.add(token)

    # Used on the first run: build and save vocab, tag_vocab and the pretrained embedding.
    # After the code below, everything except the datasets themselves is ready.
    def build_vocab_pipeline(self):
        if self.opt.load_data is None:
            train_data = get_data(self.opt.train)
            dev_data = get_data(self.opt.dev)
            test_data = get_data(self.opt.test)
            self.build_char_vocab(train_data, dev_data, test_data)
            self.build_tag_vocab(train_data)
            # The vocabs have now been built on this first run.
            # Next, load the pretrained Chinese character vectors.
            self.load_char_pretrained_embedding('data/news_char_256.vec')
            # Then save all of these artifacts.
            self.char_vocab.save(self.opt.save_data + os.sep + 'char_vocab')
            self.tag_vocab.save(self.opt.save_data + os.sep + 'tag_vocab')
            # Save the embedding matrix.
            print('saving vector of char')
            pretrained_file_name = 'char_embedding_matrix_' + str(self.char_embedding_dim)
            np.save(self.opt.save_data + os.sep + pretrained_file_name, self.char_embedding)
        else:
            # In this case, load the saved artifacts directly.
            # First the two vocabs, then the embedding matrix.
            self.char_vocab.load(self.opt.load_data + os.sep + 'char_vocab')
            self.tag_vocab.load(self.opt.load_data + os.sep + 'tag_vocab')
            self.char_embedding = np.load(self.opt.load_data + os.sep + 'char_embedding_matrix_256.npy')
            self.char_embedding_dim = self.char_embedding.shape[1]

    def build_data(self, batch_size):
        # The call below initializes the vocabs and the embedding matrix.
        self.build_vocab_pipeline()
        # Next, prepare the data iterators and attach them to this Data object.
        # Note: batch_size lives in the model config, so remember to pass it in.
        if self.opt.status.lower() == 'train':
            self.train_iter = data_iterator(self.opt.train, self.char_vocab, self.tag_vocab, batch_size)
            self.dev_iter = data_iterator(self.opt.dev, self.char_vocab, self.tag_vocab, batch_size)
        elif self.opt.status.lower() == 'test':
            self.test_iter = data_iterator(self.opt.test, self.char_vocab, self.tag_vocab, batch_size)
        elif self.opt.status.lower() == 'decode':
            pass
        else:
            print('input error: expected train, test or decode')

    def load_char_pretrained_embedding(self, char_pretrained_path):
        self.char_embedding, self.char_embedding_dim = load_pretrained_embedding(char_pretrained_path, self.char_vocab)
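
A rough driver for the Data class above might look like the following; the options object is hypothetical and only mirrors the opt fields the methods actually read (train/dev/test paths, save_data/load_data directories, status), and it assumes the project's data files and helper functions are available:

from argparse import Namespace

# Hypothetical options; field names mirror the attributes used above.
opt = Namespace(train='data/train.txt', dev='data/dev.txt', test='data/test.txt',
                save_data='save', load_data=None, status='train')
data = Data(opt)
data.build_data(batch_size=32)   # builds vocabs, embeddings and the train/dev iterators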
Example #11
        if (self.shuffle == 1):
            print('DataProvider.readNextCache shuffle')
            random.shuffle(self.samples)
        else:  # sort by group size
            print('DataProvider.readNextCache sort')
            self.samples.sort(key=lambda SampleGroup: len(SampleGroup.samples),
                              reverse=False)

if __name__ == "__main__":
    lang = "CMN"
    path = "../data/"
    filePath = path + "train.f.txt"
    hotFilePath = path + "NodeHot.txt"
    muti_name_file = path + "multiname.txt"
    att_file = ""
    batch_size = 8
    cache_size = 20000
    maxSampCount = 0
    shuffle = 1
    word_vocab = Vocab(path + "voc_char.txt")
    word_vocab.load()
    kb_vocab = Vocab(path + "voc_kb.txt")
    kb_vocab.load()
    kbp_type_vocab = Vocab(path + "kb_type.txt")
    kbp_type_vocab.load()
    kb_type_vocab = Vocab(path + "fb_type.txt")
    kb_type_vocab.load()
    dp = DataProvider(lang, filePath, hotFilePath, muti_name_file, att_file,
                      batch_size, cache_size, maxSampCount, shuffle,
                      word_vocab, kb_vocab, kbp_type_vocab, kb_type_vocab)
Example #12
Rare_word = "Rare"
Numeric = "Numeric"
AllCap = "AllCap"
LastCap = "LastCap"
Delim = "Delimiter"
min_count = 5
TagCount = {}
TriGramCount = {}
BiGramCount = {}
UniGramCount = {}
mp={}
delimiters = [",","\'\'","``","#","$","(",")",".",":",";","%","-","}","{"]
#Yet to be done
TotalCount=0
V = {}
V = Vocab.VocabGenerator()

# Hyper-parameters
lamb1 = 0.65
lamb2 = 0.25
lamb3 = 0.1

def TagClean(s):
	if s in delimiters:
		return "DLM"		
	elif s=="PRP$":
		return "PRP"
	elif s=="WP$":
		return "WP"
	elif s=="RBR" or s=="RBS" or s=="RB" or s=="WRB":
		return "RB"
Example #13
import Vocab
from config import config
import pandas as pd
config_test = config.Configurable(r'C:\Users\ACH\Desktop\PycharmProjects\pythonProject\config\db.conf')
# 训练集合
train = pd.read_csv(config_test.train_dir)[config_test.cloums.split(',')]
dev = pd.read_csv(config_test.dev_dir)[config_test.cloums.split(',')]
temp = pd.concat([train,dev],axis=0)
print(temp.head(10))
# build the vocabulary
vocab_pre = Vocab.Vocab_built(max_len=50)
vocab = vocab_pre.get_vocab_comments(train)
# print(vocab.stoi) gives {word: id}, e.g. '<unk>': 0, '<pad>': 1, 'the': 2, 'a': 3, 'and': 4, ...
'''
Note: padding must use index 1 ('<pad>'), not 0 ('<unk>').
'''
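
The note above matters when batching: sequences must be padded with the id of '<pad>' (1 in the stoi shown), not with 0, which is '<unk>'. A minimal, hypothetical padding helper:

def pad_ids(ids, max_len, pad_id=1):
    # Truncate or right-pad a list of token ids to max_len with the pad id (not unk).
    return ids[:max_len] + [pad_id] * max(0, max_len - len(ids))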
Example #14
    def GET(self, tag):
        artists, albums = Vocab.artists_albums_tagged(tag)

        return render.tagged(tag, artists, albums)
Example #15
 def POST(self):
     i = web.input()
     Vocab.rate(i.uri, int(i.rating) * 2)
Example #16
    def GET(self):
        web.header('Content-Type', 'text/html; charset=utf-8')

        artists, albums = Vocab.artists_albums()

        return render.albums(artists, albums)
Example #17
    def POST(self):
        i = web.input()

        Vocab.tag(i.uri, i.tags)
Example #18
File: DocMatcher.py  Project: zjulins/Rank
        if key_str in self.cache_score2:
            return self.cache_score2[key_str]

        dis = self.tfidf_cos_dis(v1, v2)
        if(wikiIsNull == True):
            dis = avgDis
        if(dis > maxDis):
            dis = maxDis
        dis = dis / maxDis
        dis = int((dis - 0.000001) * 10)

        self.cache_score2[key_str] = dis
        if(len(self.cache_score2) > self.cache_max_size):
            self.cache_score2.clear()

        return dis #return 0~9
        
if __name__ == "__main__":
    vocab=Vocab("../data/voc_char.txt")
    vocab.load()
    mat=DocMatcher()
    mat._loadIDF("../data/IDF.txt",vocab)
    v1=['a','f','c','%UNK%','%UNK%']
    v2=['a','b','e','askasasas','aaaaaaasadad']
    v3=[]
    v4=[]
    for v in v1:
        v3.append(vocab.search(v))
    for v in v2:
        v4.append(vocab.search(v))
    print(mat.tfidf_cos_dis(v3, v4))
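
The quantization above, dis = int((dis - 0.000001) * 10), maps a normalized distance in [0, 1] to a bucket from 0 to 9; the small epsilon keeps dis == maxDis in the top bucket. A quick check:

for dis in (0.0, 0.05, 0.5, 0.999, 1.0):
    print(dis, int((dis - 0.000001) * 10))   # buckets: 0, 0, 4, 9, 9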
Example #19
 def loadVocab(self, vocab_file, vocab_name):
     vob = Vocab(vocab_file)
     vob.load()
     if (self.debug_mod >= 1):
         vob.printVocab(vocab_name, 10)
     return vob
Example #20
    def __init__(self):
        '''
            You can add more arguments, for example action and model paths.
            You need to load your model here.
            actions: provides indices for actions.
            It has the same order as the data/vocabs.actions file.
        '''
        # if you prefer to have your own index for actions, change this.
        self.actions = [
            'SHIFT', 'LEFT-ARC:rroot', 'LEFT-ARC:cc', 'LEFT-ARC:number',
            'LEFT-ARC:ccomp', 'LEFT-ARC:possessive', 'LEFT-ARC:prt',
            'LEFT-ARC:num', 'LEFT-ARC:nsubjpass', 'LEFT-ARC:csubj',
            'LEFT-ARC:conj', 'LEFT-ARC:dobj', 'LEFT-ARC:nn', 'LEFT-ARC:neg',
            'LEFT-ARC:discourse', 'LEFT-ARC:mark', 'LEFT-ARC:auxpass',
            'LEFT-ARC:infmod', 'LEFT-ARC:mwe', 'LEFT-ARC:advcl',
            'LEFT-ARC:aux', 'LEFT-ARC:prep', 'LEFT-ARC:parataxis',
            'LEFT-ARC:nsubj', 'LEFT-ARC:<null>', 'LEFT-ARC:rcmod',
            'LEFT-ARC:advmod', 'LEFT-ARC:punct', 'LEFT-ARC:quantmod',
            'LEFT-ARC:tmod', 'LEFT-ARC:acomp', 'LEFT-ARC:pcomp',
            'LEFT-ARC:poss', 'LEFT-ARC:npadvmod', 'LEFT-ARC:xcomp',
            'LEFT-ARC:cop', 'LEFT-ARC:partmod', 'LEFT-ARC:dep',
            'LEFT-ARC:appos', 'LEFT-ARC:det', 'LEFT-ARC:amod', 'LEFT-ARC:pobj',
            'LEFT-ARC:iobj', 'LEFT-ARC:expl', 'LEFT-ARC:predet',
            'LEFT-ARC:preconj', 'LEFT-ARC:root', 'RIGHT-ARC:rroot',
            'RIGHT-ARC:cc', 'RIGHT-ARC:number', 'RIGHT-ARC:ccomp',
            'RIGHT-ARC:possessive', 'RIGHT-ARC:prt', 'RIGHT-ARC:num',
            'RIGHT-ARC:nsubjpass', 'RIGHT-ARC:csubj', 'RIGHT-ARC:conj',
            'RIGHT-ARC:dobj', 'RIGHT-ARC:nn', 'RIGHT-ARC:neg',
            'RIGHT-ARC:discourse', 'RIGHT-ARC:mark', 'RIGHT-ARC:auxpass',
            'RIGHT-ARC:infmod', 'RIGHT-ARC:mwe', 'RIGHT-ARC:advcl',
            'RIGHT-ARC:aux', 'RIGHT-ARC:prep', 'RIGHT-ARC:parataxis',
            'RIGHT-ARC:nsubj', 'RIGHT-ARC:<null>', 'RIGHT-ARC:rcmod',
            'RIGHT-ARC:advmod', 'RIGHT-ARC:punct', 'RIGHT-ARC:quantmod',
            'RIGHT-ARC:tmod', 'RIGHT-ARC:acomp', 'RIGHT-ARC:pcomp',
            'RIGHT-ARC:poss', 'RIGHT-ARC:npadvmod', 'RIGHT-ARC:xcomp',
            'RIGHT-ARC:cop', 'RIGHT-ARC:partmod', 'RIGHT-ARC:dep',
            'RIGHT-ARC:appos', 'RIGHT-ARC:det', 'RIGHT-ARC:amod',
            'RIGHT-ARC:pobj', 'RIGHT-ARC:iobj', 'RIGHT-ARC:expl',
            'RIGHT-ARC:predet', 'RIGHT-ARC:preconj', 'RIGHT-ARC:root'
        ]

        parser = OptionParser()
        parser.add_option("--train",
                          dest="train_file",
                          metavar="FILE",
                          default=None)
        parser.add_option("--train_data",
                          dest="train_data_file",
                          metavar="FILE",
                          default='data/train.data')
        parser.add_option("--test",
                          dest="test_file",
                          metavar="FILE",
                          default=None)
        parser.add_option("--output",
                          dest="output_file",
                          metavar="FILE",
                          default=None)
        parser.add_option("--model",
                          dest="model_path",
                          metavar="FILE",
                          default='src/trained3.model')
        # This option changes to trained2.model for part 2 as we are saving each model separately per part
        # (similarly use trained3.model for part 3)
        # parser.add_option("--model", dest="model_path", metavar="FILE", default='src/trained2.model')
        parser.add_option("--vocab",
                          dest="vocab_path",
                          metavar="FILE",
                          default=None)
        parser.add_option("--we", type="int", dest="we", default=64)
        parser.add_option("--pe", type="int", dest="pe", default=32)
        parser.add_option("--de", type="int", dest="de", default=32)
        parser.add_option("--hidden", type="int", dest="hidden", default=200)
        # This option changes to 400 for Part 2
        # parser.add_option("--hidden", type="int", dest="hidden", default=400)
        parser.add_option("--minibatch",
                          type="int",
                          dest="minibatch",
                          default=1000)
        parser.add_option("--epochs", type="int", dest="epochs", default=7)

        (options, args) = parser.parse_args()

        net_properties = NetProperties(options.we, options.pe, options.de,
                                       options.hidden, options.minibatch)

        # creating vocabulary file
        vocab = Vocab()

        # constructing network
        self.network = Network(vocab, net_properties)

        if os.path.isfile(options.model_path):
            print("Loading saved model")
            # loading network trained model
            self.network.load(options.model_path)
        else:
            print("Training model")
            # training
            self.network.train(options.train_data_file, options.epochs)

            # saving network
            self.network.save(options.model_path)
Example #21
File: viewify.py  Project: bct/rdf-music
  def GET(self):
    web.header('Content-Type', 'text/html; charset=utf-8')

    artists, albums = Vocab.artists_albums()

    return render.albums(artists, albums)
Example #22
File: viewify.py  Project: bct/rdf-music
 def POST(self):
   i = web.input()
   Vocab.rate(i.uri, int(i.rating)*2)
Example #23
File: viewify.py  Project: bct/rdf-music
  def GET(self, tag):
    artists, albums = Vocab.artists_albums_tagged(tag)

    return render.tagged(tag, artists, albums)
Example #24
File: viewify.py  Project: bct/rdf-music
  def POST(self):
    i = web.input()

    Vocab.tag(i.uri, i.tags)