def sub_process(numIters, pool, id):
    dictionary = Dictionary()
    dictionary.add_word('<pad>')  # add padding word
    with open(args.output + str(id), 'w') as fout:
        qdar = tqdm.tqdm(range(numIters), total=numIters, ascii=True)
        for i in qdar:
            # for item in pool:
            item = pool[i]
            # words = tokenizer(' '.join(item['text'].split()))
            words = SymSpellCheck(item['text'])
            data = {
                'label': int(item['stars']) - 1,
                'text': list(map(lambda x: proc_token(x), words))
            }
            fout.write(json.dumps(data) + '\n')
            fout.flush()
            # for item in data['text']:
            #     dictionary.add_word(item)
            # qdar.set_postfix(dictSize=str(len(dictionary)))
    with open(args.dict + str(id), 'w') as fout:  # save dictionary for fast next process
        fout.write(json.dumps(dictionary.idx2word) + '\n')
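# The snippets in this section rely on a `Dictionary` helper from `util` that is not
# shown here. The sketch below is an assumption of the minimal interface they exercise
# (add_word, word2idx/idx2word, len(), and loading from the JSON idx2word dump written
# by sub_process above); the real util.Dictionary may differ.
import json

class MinimalDictionary:  # hypothetical stand-in, not the real util.Dictionary
    def __init__(self, path=None):
        self.word2idx = {}
        self.idx2word = []
        if path is not None:
            # rebuild from a JSON list of words, as written by sub_process above
            with open(path, 'r') as fin:
                for word in json.loads(fin.readline()):
                    self.add_word(word)

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = len(self.idx2word)
            self.idx2word.append(word)
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)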
def get_wiki_curated_NDWdata():
    X, y = get_wiki_curated_data()
    train_idx = []
    dd = Dictionary()
    for idx, label in enumerate(y):
        if label == '':
            continue
        if not dd.isDW(label):
            train_idx.append(idx)
    y = [y[i] for i in train_idx]
    X = X[train_idx]
    return X, y
def get_wiki_bow_DWdata():
    X, y = get_wiki_bow_data()
    train_idx = []
    NDW_idx = []
    dd = Dictionary()
    for idx, label in enumerate(y):
        if label == '':
            continue
        if dd.isDW(label):
            train_idx.append(idx)
        else:
            NDW_idx.append(idx)
    NDW_y = [y[i] for i in NDW_idx]
    NDW_X = X[NDW_idx]
    y = [y[i] for i in train_idx]
    X = X[train_idx]
    return X, y, NDW_X, NDW_y
def read_hownet(hownet_path):
    hownet = Dictionary()
    with open(hownet_path, encoding='UTF8') as f:
        sense_id = -1   # renamed from `id` to avoid shadowing the builtin
        word = ''       # renamed from `str` to avoid shadowing the builtin
        kdml = ''
        for line in f.readlines():
            if line.startswith('NO'):
                stage = 1
            elif line.startswith('W_C'):
                stage = 2
            elif line.startswith('DEF'):
                stage = 3
            else:
                stage = 0
            if stage == 1:
                sense_id = int(line[4:-1])
            elif stage == 2:
                word = line[4:-1]
            elif stage == 3:
                kdml = line[4:-1]
                hownet.add_sense(sense_id, word, kdml)
    return hownet
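# Hedged sketch of the record layout read_hownet appears to expect: 4-character
# prefixes ("NO.=", "W_C=", "DEF="), so that line[4:-1] strips the prefix and the
# trailing newline, with one sense added per DEF line as reconstructed above.
# The sample content and file name are illustrative only.
sample = (
    "NO.=000000000001\n"
    "W_C=阿\n"
    "DEF={FuncWord|功能词}\n"
)
with open('hownet_sample.txt', 'w', encoding='UTF8') as f:
    f.write(sample)

hownet = read_hownet('hownet_sample.txt')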
class Galileo:
    def __init__(self):
        self.assistant = Assistant(lang='de')
        self.dictionary = Dictionary()
        set_lang('de')

    def start_service(self):
        while True:
            if "galileo" in self.assistant.listen():
                self.assistant.speak(
                    'Hallo, möchtest du eine Erklärung erhalten oder eine Erklärung eingeben?'
                )
                answer = self.assistant.listen()
                if not self.check_answer(answer):
                    continue
                if 'erhalten' in answer:
                    self.process_explanation()
                elif 'eingeben' in answer:
                    self.get_explanation()
                else:
                    self.assistant.speak('Tut mir leid, das kann ich noch nicht!')

    def process_explanation(self):
        self.assistant.speak("Gib bitte ein Thema an!")
        answer = self.assistant.listen()
        if not self.check_answer(answer):
            return
        for topic in self.dictionary.get_dict():
            if topic in answer:
                self.assistant.speak(
                    f'Die Erklärung zu dem Thema {topic} lautet: {self.dictionary.get_dict()[topic]}!'
                )
                return
        try:
            self.assistant.speak(page(search(answer)[0]).summary.split('\n')[0])
        except (IndexError, PageError):
            self.assistant.speak(
                'Zu diesem Thema wurde noch keine Erklärung eingegeben, deshalb kann ich dir leider nicht helfen.'
            )

    def get_explanation(self):
        self.assistant.speak('Wie lautet dein Thema?')
        first_answer = self.assistant.listen()
        if not self.check_answer(first_answer):
            return
        self.assistant.speak('Gib die Erklärung zum entsprechenden Thema ein!')
        second_answer = self.assistant.listen()
        if not self.check_answer(second_answer):
            return
        self.assistant.speak(
            f'Deine Erklärung für das Thema {first_answer} war {second_answer}! Ist das für dich in Ordnung?'
        )
        third_answer = self.assistant.listen()
        if not self.check_answer(third_answer):
            return
        if 'ja' in third_answer:
            self.dictionary.add_value(first_answer, second_answer)
            self.assistant.speak('Danke, die anderen Kinder werden sich freuen!')
        else:
            self.assistant.speak('Ok, dann breche ich den aktuellen Vorgang ab!')

    def check_answer(self, answer):
        if answer == '':
            self.assistant.speak(
                'Da du nicht mehr mit mir geredet hast, breche ich den aktuellen Vorgang ab!'
            )
            return False
        else:
            return True
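# Minimal usage sketch for the class above. It assumes Assistant, Dictionary,
# set_lang and the wikipedia helpers (search, page, PageError) are imported
# elsewhere in the module; the entry-point guard is an illustrative addition.
if __name__ == '__main__':
    galileo = Galileo()
    galileo.start_service()  # blocks, listening for the wake word "galileo"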
if torch.cuda.is_available():
    print("WARNING: CUDA device detected, continue to use cpu device!")
    device = torch.device('cpu')
    torch.manual_seed(args.seed)
else:
    device = torch.device('cpu')
    torch.manual_seed(args.seed)
random.seed(args.seed)

# Load Dictionary
assert os.path.exists(args.train_data), "No training data detected!"
assert os.path.exists(args.val_data), "No validation data detected!"
assert os.path.exists(args.test_data), "No test data detected!"
print('Begin to load the dictionary.')
dictionary = Dictionary(path=args.dictionary)
# n_token: number of tokens in the dictionary
n_token = len(dictionary)
# initialize the classifier; passing the dictionary inside the config dict keeps the
# call site readable and is a pattern worth reusing
# important: remember to change the type when switching to another model
if args.encoder == "CNN":
    model = Classifier_CNN({
        'dropout': args.dropout,
        'ntoken': n_token,
        'ninp': args.emsize,
        'encoder': args.encoder,
        'nfc': args.nfc,
        'dictionary': dictionary,
        'word-vector': args.word_vector,
def main(argv):
    # Python 2 module: print statements and itertools.ifilter/imap are intentional.
    import gzip, os
    from itertools import ifilter, imap
    from util import ureader, uwriter, closing, Dictionary, SpanishSet, EnglishSet

    if '-h' in sys.argv:
        usage()

    # Reverse source and target during collection
    reverse = '-r' in argv
    _contextual = '-c' in argv
    _probabilities = '-p' in argv
    if not '-f' in argv:
        usage()
    p_args = 1 + argv.index('-f')
    src_lang, tgt_lang, in_fname, out_fname = argv[p_args:p_args + 4]

    # The input stream
    def check_record(categories):
        def _do_check((sent_id, src_id, src_word, src_pos, tgt_id, tgt_word, tgt_pos)):
            "Check <en, es> is a pair of words with pos in categories."
            pos = src_pos[0]  # same as tgt_pos[0]
            return pos in categories and is_word(src_word, 'en') and is_word(tgt_word, 'es')
        return _do_check

    print 'Creating dictionaries ...',
    dictionary = Dictionary(en=EnglishSet(), es=SpanishSet())
    is_word = dictionary.is_word
    print 'done.'

    def streams_for(categories, mode):
        return map(lambda c: uwriter(gzip.open('{}.{}.gz'.format(out_fname, c.lower()), mode)),
                   categories)

    def noncontextual():
        if '-c' in argv or '-p' in argv:
            # Just to be sure: -c and -n are mutually exclusive
            usage()
        categories = 'nv'
        idx = categories.index

        def fields(rec):
            return idx(rec[3][0]), rec[2], rec[5]

        print 'Reading', in_fname, '...',
        in_stream = ifilter(check_record(categories),
                            map(lambda l: l.lower().split(), ureader(gzip.open(in_fname))))
        print 'done.'
        # Output files, one for each category
        with closing(*streams_for(categories, 'wb')) as out_streams:
            print 'Processing noncontextually', in_stream, 'for', categories, '...',
            in_recs = imap(fields, in_stream)
            collected = collect(in_recs, categories, reverse=reverse)
            process(collected, out_streams, src_lang, tgt_lang, categories, reverse)
            print 'done.'

    def contextual():
        """Contextual, with probabilities."""
        from itertools import product
        from util import grouped

        fname = argv[1 + argv.index('-c')]
        if not os.access(fname, os.F_OK):
            print 'Cannot access', fname
            usage()
        print 'Reading relations ...',
        lines = ureader(gzip.open(fname)).readlines()
        print 'done.'
        print 'Extracting records ...',
        recs = [l[:-1].lower().split() for l in lines]
        print 'done.'
        print 'Indexing relations ...',
        ctx = dict(((int(dep[0]), int(dep[2])), dep) for dep in recs)
        print 'done.'
        UNK = u'__unk__'
        print 'Gathering categories ...',
        categories = list(set(r[3] for r in recs)) + [UNK]
        print '({})'.format(u', '.join(categories)), 'done.'
        idx = categories.index
        src, tgt, src_id = (2, 5, 1) if not reverse else (5, 2, 4)

        def fields(rec):
            s_id, w_id = int(rec[0]), int(rec[src_id])
            try:
                dep = ctx[(s_id, w_id)]
            except KeyError:
                dep = [UNK] * 6
            return idx(dep[3]), (rec[src], dep[4]), rec[tgt]

        print 'Reading', in_fname, '...',
        in_stream = ifilter(check_record('v'),
                            map(lambda l: l.lower().split(), ureader(gzip.open(in_fname))))
        print 'done.'
        # Output files, one for each category
        dotjoin = '.'.join
        fnames = [dotjoin(p) for p in product(categories, ('px', 'pmi'))]
        with closing(*streams_for(fnames, 'wb')) as out_streams:
            print 'Processing', in_fname, 'for', categories, '...',
            in_recs = imap(fields, in_stream)
            collected = collect3(in_recs, categories, reverse=reverse)
            print 'done.\nOutputting files', ', '.join(fnames), '...'
            process_p2(collected, grouped(2, out_streams), src_lang, tgt_lang)
            print 'done.'

    def contextual_noprob():
        """Contextual, without probabilities."""
        fname = argv[1 + argv.index('-c')]
        if not os.access(fname, os.F_OK):
            print 'Cannot access', fname
            usage()
        print "Contextual, no probabilities."
        print 'Reading relations ...',
        lines = ureader(gzip.open(fname)).readlines()
        print 'done.'
        print 'Extracting records ...',
        recs = [l[:-1].lower().split() for l in lines]
        print 'done.'
        print 'Indexing relations ...',
        # Record schema: sentence_id, noun_id, verb_id, noun, verb
        ctx = dict(((int(dep[0]), int(dep[2])), dep) for dep in recs)
        print 'done.'
        UNK = u'__unk__'
        print 'Gathering categories ...',
        categories = list(set(r[3] for r in recs)) + [UNK]
        print '({})'.format(u', '.join(categories)), 'done.'
        index = categories.index
        src, tgt, src_id = (2, 5, 1) if not reverse else (5, 2, 4)

        def fields(rec):
            s_id, w_id = int(rec[0]), int(rec[src_id])
            try:
                dep = ctx[(s_id, w_id)]
            except KeyError:
                dep = [UNK] * 6
            return index(dep[3]), (rec[src], rec[tgt]), dep[4]

        print 'Reading', in_fname, '...',
        in_stream = ifilter(check_record('v'),
                            map(lambda l: l.lower().split(), ureader(gzip.open(in_fname))))
        print 'done.'
        # Output files, one for each category
        with closing(*streams_for(categories, 'wb')) as out_streams:
            print 'Processing', in_fname, 'for', categories, '...',
            in_recs = imap(fields, in_stream)
            collected = collect(in_recs, categories)
            print 'done.'
            assert len(collected) == len(out_streams), '{} != {}'.format(collected, out_streams)
            print 'Outputting files', ', '.join(categories), '...'
            process_counts(collected, out_streams, src_lang, tgt_lang)
            print 'done.'

    def probabilities():
        if '-c' in argv or '-n' in argv:
            # Just to be sure: -c and -n are mutually exclusive
            usage()
        src, tgt = (2, 5) if not reverse else (5, 2)

        def fields(rec):
            return rec[3][0], rec[src], rec[tgt]

        # categories = ['v.px', 'v.pmi']
        categories = ['n.px', 'n.pmi']
        print 'Reading', in_fname, '...',
        in_stream = ifilter(check_record('n'),
                            map(lambda l: l.lower().split(), ureader(gzip.open(in_fname))))
        print 'done.'
        # Output files, one for each category
        with closing(*streams_for(categories, 'wb')) as out_streams:
            in_recs = imap(fields, in_stream)
            print 'Processing probabilities in', in_fname, 'for', categories, '...',
            collected = collect2(in_recs, reverse=reverse)
            print 'done.'
            print 'Outputting files ...',
            process_p(collected, out_streams, src_lang, tgt_lang)
            print 'done.'

    if not _contextual:
        noncontextual()
    elif not _probabilities:
        contextual_noprob()
    else:
        probabilities()
import argparse
import random
from util import Dictionary
import spacy

if __name__ == '__main__':
    parser = argparse.ArgumentParser('Tokenizer')
    parser.add_argument('--input', type=str, default='', help='input file')
    parser.add_argument('--output', type=str, default='', help='output file')
    parser.add_argument('--labels', type=str, default='', help='label file')
    parser.add_argument('--dict', type=str, default='', help='dictionary file')
    parser.add_argument('--label-data', action='store_true',
                        help='to parse label file into json format')
    parser.add_argument('--shuffle', action='store_true',
                        help='output shuffled data to file')
    args = parser.parse_args()

    tokenizer = spacy.load('en_core_web_md')
    dictionary = Dictionary()
    dictionary.add_word('<pad>')  # add padding word
    lab2int = {}
    int2lab = {}
    with open(args.labels, 'r') as labfile:
        for line in labfile:
            labint, labtext = line.strip().split('\t')
            labint = int(labint)
            lab2int[labtext] = labint
            int2lab[labint] = labtext
    with open(args.output, 'w') as fout:
        lines = open(args.input).readlines()
        if args.shuffle:
            random.shuffle(lines)
        for i, line in enumerate(lines):
            if not line.startswith("#STARTDIALOGUE"):
                # data: input<tab>label<tab>response<tab>interp<tab>correct<tab>...
def main(cfg):
    # Set the random seed manually for reproducibility.
    torch.manual_seed(cfg.seed)
    if torch.cuda.is_available():
        if not cfg.cuda:
            print("WARNING: You have a CUDA device, so you should probably run with --cuda")
        else:
            torch.cuda.manual_seed(cfg.seed)
    random.seed(cfg.seed)

    # Load Dictionary
    assert os.path.exists(cfg.data.train_data)
    assert os.path.exists(cfg.data.val_data)
    print('Begin to load the dictionary.')
    global dictionary
    dictionary = Dictionary(path=cfg.data.dictionary)

    global best_val_loss
    global best_acc
    best_val_loss = None
    best_acc = None

    n_token = len(dictionary)
    global model
    model = Classifier({
        'dropout': cfg.model.dropout,
        'ntoken': n_token,
        'nlayers': cfg.model.nlayers,
        'nhid': cfg.model.nhid,
        'ninp': cfg.model.emsize,
        'pooling': 'all',
        'attention-unit': cfg.model.attention_unit,
        'attention-hops': cfg.model.attention_hops,
        'nfc': cfg.model.nfc,
        'dictionary': dictionary,
        'word-vector': cfg.data.word_vector,
        'class-number': cfg.class_number
    })
    if cfg.cuda:
        model = model.cuda()

    global I
    I = torch.zeros(cfg.training.batch_size, cfg.model.attention_hops, cfg.model.attention_hops)
    for i in range(cfg.training.batch_size):
        for j in range(cfg.model.attention_hops):
            I.data[i][j][j] = 1
    if cfg.cuda:
        I = I.cuda()

    global criterion
    global optimizer
    criterion = nn.CrossEntropyLoss()
    if cfg.training.optimizer == 'Adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=cfg.training.lr,
                               betas=[0.9, 0.999],
                               eps=1e-8,
                               weight_decay=0)
    elif cfg.training.optimizer == 'SGD':
        optimizer = optim.SGD(model.parameters(),
                              lr=cfg.training.lr,
                              momentum=0.9,
                              weight_decay=0.01)
    else:
        raise Exception('For other optimizers, please add it yourself. '
                        'Supported ones are: SGD and Adam.')

    print('Begin to load data.')
    global data_train
    data_train = open(cfg.data.train_data).readlines()
    global data_val
    data_val = open(cfg.data.val_data).readlines()
    try:
        for epoch in range(cfg.training.epochs):
            train(epoch, cfg)
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exit from training early.')

    data_val = open(cfg.data.test_data).readlines()
    evaluate_start_time = time.time()
    test_loss, acc = evaluate(cfg)
    print('-' * 89)
    fmt = '| test | time: {:5.2f}s | test loss (pure) {:5.4f} | Acc {:8.4f}'
    print(fmt.format((time.time() - evaluate_start_time), test_loss, acc))
    print('-' * 89)
    exit(0)
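# Hedged sketch of the config structure main(cfg) reads. The keys mirror the
# accesses above; the concrete values and the use of OmegaConf are assumptions,
# not part of the original script.
from omegaconf import OmegaConf

cfg = OmegaConf.create({
    'seed': 1111,
    'cuda': False,
    'class_number': 5,
    'data': {
        'train_data': 'data/train.json',
        'val_data': 'data/val.json',
        'test_data': 'data/test.json',
        'dictionary': 'data/dict.json',
        'word_vector': '',
    },
    'model': {
        'dropout': 0.5, 'nlayers': 2, 'nhid': 300, 'emsize': 300,
        'attention_unit': 350, 'attention_hops': 4, 'nfc': 512,
    },
    'training': {
        'batch_size': 32, 'optimizer': 'Adam', 'lr': 0.001, 'epochs': 10,
    },
})
# main(cfg)  # would run training with the settings above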
def load_data_set(vocab_size, dataset_type, level='l3'):
    """
    Loads the dataset.
    Args:
        dataset_type: Dbpedia or WIKI or WOS
        vocab_size: {int} size of the vocabulary
    Returns:
        x_train: {df} with col_names=['text','label']
        x_val: {df} df['text'][0] is an np array with indices of words
        x_test: {df} df['label'][0] is an np.int with indice of the label category
        word_to_id : {dict} words mapped to indices
        cat2id: {dict} categories mapped to indices
    """
    save_list_str = ["x_train", "x_val", "x_test", "dictionary", "dataLoader.cat2id"]
    return_list = []
    if os.path.exists("./data/%s.x_train.p" % dataset_type):
        print("---loading pre-process %s data---" % dataset_type)
        # load already processed data
        for item in save_list_str:
            with open("./data/%s." % dataset_type + item + ".p", "rb") as f:
                return_list.append(pickle.load(f))
        print("----finish data loading----{%d} train,{%d} val, {%d} test" %
              (len(return_list[0]), len(return_list[1]), len(return_list[2])))
        print("{%d} words in dictionary, {%d} classes----" %
              (len(return_list[3]), len(return_list[4])))
        dictionary = return_list[3]
        data_loc = '/home/ml/ksinha4/mlp/hier-class/data'
        dataLoader = Data_Utility()
        dataLoader.cat2id = return_list[4]
        pdb.set_trace()
        x_test = dataLoader.read(data_loc=data_loc,
                                 file_name="wos_data_n_test.csv",
                                 column='l2')
        x_test['label'] = dataLoader.transfer_cat_to_id(x_test['label'])
        return_list[2] = x_test
        return return_list

    print("----initial %s data loading and processing----" % dataset_type)
    dataLoader = Data_Utility()
    if dataset_type == "DBpedia":
        data_loc = '/home/ml/ksinha4/mlp/hier-class/data'
        x_train = dataLoader.read(data_loc=data_loc, file_name="df_small_train.csv", column=level)
        x_test = dataLoader.read(data_loc=data_loc, file_name="df_small_test.csv", column=level)
    elif dataset_type == "WIKI":
        data_loc = '/home/ml/ksinha4/datasets/data_WIKI'
        x_train = dataLoader.read(data_loc=data_loc, file_name="full_docs_2_train.csv", column=level)
        x_test = dataLoader.read(data_loc=data_loc, file_name="full_docs_2_test.csv", column=level)
    # "/home/ml/ksinha4/datasets/data_WOS/WebOfScience/WOS46985"
    elif dataset_type == "WOS":
        data_loc = '/home/ml/ksinha4/mlp/hier-class/data'
        x_train = dataLoader.read(data_loc=data_loc, file_name="wos_data_n_train.csv", column=level)
        x_test = dataLoader.read(data_loc=data_loc, file_name="wos_data_n_test.csv", column=level)
    else:
        raise Exception('this dataset type is not implemented yet')

    x_val = x_train[:int(0.1 * len(x_train))]
    x_train = x_train[int(0.1 * len(x_train)):]
    print("----finish data loading----{%d} train,{%d} val, {%d} test" %
          (len(x_train), len(x_val), len(x_test)))

    # processing dictionary and cat2id
    dictionary = Dictionary()
    dictionary.word2idx, dictionary.idx2word = dataLoader.assign_word_ids(
        x_train['text'].append(x_val['text']), vocab_size=vocab_size)
    dataLoader.assign_category_ids(list(x_train['label']) +
                                   list(x_val['label']) + list(x_test['label']))
    x_train['label'] = dataLoader.transfer_cat_to_id(x_train['label'])
    x_val['label'] = dataLoader.transfer_cat_to_id(x_val['label'])
    x_test['label'] = dataLoader.transfer_cat_to_id(x_test['label'])
    print("----processed {%d} word_2_id, {%d}cat_2_id----" %
          (len(dictionary.word2idx), len(dataLoader.cat2id)))

    # save the processed files in pickle
    save_list = [x_train, x_val, x_test, dictionary, dataLoader.cat2id]
    for i in range(len(save_list_str)):
        if not os.path.exists("./data"):
            os.mkdir("./data")
        with open("./data/%s.%s.p" % (dataset_type, save_list_str[i]), 'wb') as f:
            pickle.dump(save_list[i], f)

    return x_train, x_val, x_test, dictionary, dataLoader.cat2id
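# Hedged usage sketch for load_data_set: the dataset name and vocabulary size are
# illustrative. On the first run it preprocesses and pickles under ./data; on later
# runs it reloads the pickles (note the cached branch above still contains a
# pdb.set_trace() breakpoint).
x_train, x_val, x_test, dictionary, cat2id = load_data_set(
    vocab_size=30000, dataset_type="WOS", level='l3')
print(len(dictionary.word2idx), "words,", len(cat2id), "classes")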
from util import Dictionary
import json

print('Begin to load the dictionary.')
dictList = []
for i in range(16):
    dictList.append(Dictionary(path='./Data/data_clean/dict' + str(i)))

for i in range(15):
    for word in list(dictList[i + 1].word2idx.keys()):
        dictList[0].add_word(word)

print('dict size: ' + str(len(dictList[0])))

with open('./Data/data_clean/dictall', 'w') as fout:  # save dictionary for fast next process
    fout.write(json.dumps(dictList[0].idx2word) + '\n')

with open('./Data/data_clean/trainsetall', "w") as fout:
    for i in range(16):
        with open('./Data/data_clean/trainset' + str(i), "r") as infile:
            lines = infile.readlines()
            for line in lines:
                fout.write(line)
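# Hedged round-trip sketch: assuming Dictionary(path=...) reads back the JSON
# idx2word line written above (the same convention the training scripts use with
# Dictionary(path=args.dictionary)), the merged dictionary should reload intact.
merged = Dictionary(path='./Data/data_clean/dictall')
assert len(merged) == len(dictList[0]), 'merged dictionary should round-trip'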