def getDataSet(opt):
    # build the dataset object via the dataloader package
    import dataloader
    dataset = dataloader.getDataset(opt)
    # dataset.process() returns the processed data files
    # (e.g. train.txt, test.txt and dev.txt under the data directory)
    return dataset.process()
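
# Usage sketch (an assumption, not part of the original snippet): with an
# options object from opts.parse_opt(), the returned processed files can be
# iterated directly, as loadDataWithoutEmbedding does further below.
if __name__ == "__main__":
    import opts
    opt = opts.parse_opt()
    for processed_file in getDataSet(opt):
        print(processed_file)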
Example #2
def getDataset(opt):
    # use the dataloader part
    import dataloader
    dataset = dataloader.getDataset(opt)
    # return the processed file names (text and label)
    # via dataset.getFormatedData()
    return dataset.getFormatedData()
Example #3
    opt.alphabet = alphabet

    # optionally persist the vocabulary: alphabet.dump(opt.dataset + ".alphabet")
    for data in datas:
        if "bert" not in opt.model.lower():
            # map each word to its alphabet index, truncate to max_seq_len,
            # and pad the remainder with the padding token
            data["text"] = data["text"].apply(
                lambda text: [alphabet.get(word, alphabet.unknow_token) for word in text[:opt.max_seq_len]]
                + [alphabet.padding_token] * int(opt.max_seq_len - len(text)))
        else:
            # BERT-style models are preprocessed with their own tokenizer
            data["text"] = data["text"].apply(
                process_with_bert, tokenizer=tokenizer, max_seq_len=opt.max_seq_len)
        data["label"] = data["label"].apply(lambda text: label_alphabet.get(text))

    # wrap each processed DataFrame in a BucketIterator
    return map(lambda x: BucketIterator(x, opt), datas)
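
# Illustration sketch (hypothetical toy values, not from the original code):
# what the non-BERT branch above computes for a single text, with a plain
# dict standing in for the project's alphabet object.
def _pad_to_indices(text, alphabet, unknow_token, padding_token, max_seq_len):
    # truncate to max_seq_len, map words to ids, then pad to a fixed length
    ids = [alphabet.get(word, unknow_token) for word in text[:max_seq_len]]
    return ids + [padding_token] * int(max_seq_len - len(text))

# e.g. _pad_to_indices(["good", "movie", "tonight"], {"good": 2, "movie": 3},
#                      unknow_token=1, padding_token=0, max_seq_len=5)
# -> [2, 3, 1, 0, 0]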

def loadDataWithoutEmbedding(opt):
    import pandas as pd
    datas = []
    for filename in getDataSet(opt):
        # each processed file is a tab-separated (text, label) table
        df = pd.read_csv(filename, header=None, sep="\t", names=["text", "label"]).fillna('0')
        df["text"] = df["text"].str.lower()
        datas.append((df["text"], df["label"]))
    return datas
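
# Usage sketch (an assumption): loadDataWithoutEmbedding returns one
# (text Series, label Series) pair per processed file, which can be
# inspected like this.
def _describe_splits(opt):
    for texts, labels in loadDataWithoutEmbedding(opt):
        print(len(texts), "examples,", labels.nunique(), "distinct labels")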

if __name__ == "__main__":
    import opts
    opt = opts.parse_opt()
    opt.max_seq_len = -1
    import dataloader
    dataset = dataloader.getDataset(opt)
    datas = loadData(opt)


def getDataSet(opt):
    import dataloader
    dataset = dataloader.getDataset(opt)
    return dataset.getFormatedData()