def fromText2Image(filedir='dataset'):
    assert os.path.exists(filedir), 'directory %s does not exist.' % (filedir)
    textfiles = {
        'acceleration': '%s/text/acceleration' % filedir,
        # 'underfitting': '%s/text/underfitting' % filedir
    }
    for key, value in textfiles.items():
        # training
        aim_path = '%s/training/' % (value.replace('text', 'image'))
        utils.mkdir(aim_path)
        trainingText = utils.read_text(filename='%s/training.txt' % (value)).split('\n')
        if not len(os.listdir(aim_path)) == 2 * len(trainingText):
            print 'generating %s image' % (aim_path)
            mp_generate(textlines=trainingText, aim_path=aim_path, threads_num=8)
            del trainingText
        else:
            pass
        # test
        aim_path = '%s/test/' % (value.replace('text', 'image'))
        utils.mkdir(aim_path)
        testText = utils.read_text(filename='%s/test.txt' % (value)).split('\n')
        if not len(os.listdir(aim_path)) == 2 * len(testText):
            print 'generating %s image' % (aim_path)
            mp_generate(textlines=testText, aim_path=aim_path, threads_num=8)
            del testText
        else:
            pass
def main():
    arg_parser = argparse.ArgumentParser(description="Neural Machine Translation Testing")
    arg_parser.add_argument("--model_file", required=True, help="Model File")
    arg_parser.add_argument("--valid_data", required=True, nargs="+", help="Validation_data")
    args = arg_parser.parse_args()
    args = vars(args)
    print(args)

    model = Seq2Seq.load(args["model_file"])
    print(model)
    model.device = "cpu"

    tr_dev_dataset_fn, en_dev_dataset_fn = args["valid_data"]
    tr_valid_data = read_text(tr_dev_dataset_fn)
    en_valid_data = read_text(en_dev_dataset_fn)
    valid_data = list(zip(tr_valid_data, en_valid_data))
    src_valid, tgt_valid = add_start_end_tokens(valid_data)

    hypotheses = beam_search(model, src_valid, beam_size=3, max_decoding_time_step=70)
    top_hypotheses = [hyps[0] for hyps in hypotheses]
    bleu_score = compute_corpus_level_bleu_score(tgt_valid, top_hypotheses)
    print('Corpus BLEU: {}'.format(bleu_score * 100))
def main(argv=None):
    org_dic, _ = load_dictionary(FLAGS.data, MAX_VOCAB_SIZE, FLAGS.data_dir)
    train_texts, train_labels = read_text("%s/train" % FLAGS.data, FLAGS.data_dir)
    test_texts, test_labels = read_text("%s/test" % FLAGS.data, FLAGS.data_dir)
    train_seqs, train_seqs_mask = text_encoder(train_texts, org_dic, config.word_max_len[FLAGS.data])
    test_seqs, test_seqs_mask = text_encoder(test_texts, org_dic, config.word_max_len[FLAGS.data])
    print("Dataset ", FLAGS.data, " loaded!")

    glove_embedding_matrix = np.load(FLAGS.data_dir + "aux_files/embeddings_glove_%s_%d.npy" % (FLAGS.data, MAX_VOCAB_SIZE))
    dist_mat = np.load(FLAGS.data_dir + "aux_files/small_dist_counter_%s_%d.npy" % (FLAGS.data, MAX_VOCAB_SIZE))
    for stop_word in config.stop_words:
        if stop_word in org_dic:
            dist_mat[org_dic[stop_word], :, :] = 0

    train_log = train(
        train_seqs,
        train_seqs_mask,
        train_labels,
        test_seqs,
        test_seqs_mask,
        test_labels,
        glove_embedding_matrix,
        dist_mat,
        config.num_classes[FLAGS.data],
    )
def __init__(self, split=None, transform_function=None): self.path = "path-of-trainging-dataset" self.transform_function = transform_function fname_path = self.path path_pointJSON = "%s/pointDict.json" % self.path if split == "train": self.imgNames = [ t.replace(".jpg\n", "") for t in ut.read_text(fname_path + "/train.txt") ] elif split == "val": self.imgNames = [ t.replace(".jpg \n", "") for t in ut.read_text(fname_path + "/train_val.txt") ] elif split == "test": self.imgNames = [ t.replace("\n", "") for t in ut.read_text(fname_path + "/test.txt") ] if os.path.exists(path_pointJSON): self.pointsJSON = ut.load_json(path_pointJSON) else: pointDict = get_pointDict(self.path, self.imgNames) ut.save_json(path_pointJSON, pointDict) self.split = split self.n_classes = 2
def __init__(self, split=None, transform_function=None):
    self.path = path = "//mnt/datasets/public/issam/VOCdevkit/VOC2007/"
    self.transform_function = transform_function

    fname_path = "%s/ImageSets/Main" % path
    path_pointJSON = "%s/pointDict.json" % path

    if split == "train":
        self.imgNames = [
            t.replace("\n", "")
            for t in ut.read_text(fname_path + "/train.txt")
        ]
    elif split == "val":
        self.imgNames = [
            t.replace("\n", "")
            for t in ut.read_text(fname_path + "/val.txt")
        ]
    elif split == "test":
        self.imgNames = [
            t.replace("\n", "")
            for t in ut.read_text(fname_path + "/test.txt")
        ]

    if os.path.exists(path_pointJSON):
        self.pointsJSON = ut.load_json(path_pointJSON)
    else:
        pointDict = get_pointDict(path, self.imgNames)
        ut.save_json(path_pointJSON, pointDict)

    # for j, key in enumerate(pointDict):
    #     print(j)
    #     pList1 = pointDict[key]
    #     pList2 = self.pointsJSON[key]
    #     for p1 in pList1:
    #         y, x = p1["y"], p1["x"]
    #         flag = False
    #         for p2 in pList2:
    #             y2, x2 = p2["y"], p2["x"]
    #             if y == y2 and x == x2:
    #                 flag = True
    #                 break
    #         assert flag == True

    self.split = split
    self.n_classes = 21
def __validate_mentions(doc_list_file, mentions, dst_miss_match_file):
    print 'checking miss match'
    doc_mentions = Mention.arrange_mentions_by_docid(mentions)
    doc_paths = load_doc_paths(doc_list_file)
    doc_head = '<?xml version="1.0" encoding="utf-8"?>\n'
    miss_match_cnt = 0
    fout = open(dst_miss_match_file, 'wb')
    for doc_path in doc_paths:
        docid = doc_id_from_path(doc_path)
        cur_doc_mentions = doc_mentions.get(docid, list())
        if not cur_doc_mentions:
            continue
        doc_text = read_text(doc_path, True)
        if doc_text.startswith(doc_head):
            doc_text = doc_text[len(doc_head):]
        for m in cur_doc_mentions:
            name_in_doc = doc_text[m.beg_pos:m.end_pos + 1]
            if name_in_doc != m.name:
                miss_match_cnt += 1
                fout.write('%s\t%s\t%d\t%d\t%s\n' % (docid, m.name.encode('utf-8'), m.beg_pos,
                                                     m.end_pos, name_in_doc.encode('utf-8')))
                # print '%s\t%s\t%d\t%d\t%s' % (docid, m.name, m.beg_pos, m.end_pos, name_in_doc)
    fout.close()
    print miss_match_cnt, 'miss match'
def write_cited_contexts(doc_ids, index_folder, files_folder):
    contexts = defaultdict(list)
    for doc_id in progress(doc_ids):
        citations = get_cited_papers(doc_id)
        for cited, start, end in citations:
            text = utils.read_text(doc_id)
            contexts[cited].append(find_sentence(text, start, end))
        # if len(contexts) > 100000: break

    fields = [pylucene.DocField("id", stored=True, indexed=False),
              pylucene.DocField("contexts", stored=False, indexed=True)]
    index = pylucene.Index(index_folder, fields)

    print "Writing contexts to file for %d documents." % len(contexts)
    for i, (doc_id, ctxs) in enumerate(contexts.items()):
        text = u"\n".join(ctxs)
        index.add(id=doc_id, contexts=text)

        # Commit and print progress every 1K entries
        if i % 1000 == 0 and i:
            index.commit()
            print "%d documents indexed and written to disk." % i

        # Also write contexts into files
        with open(os.path.join(files_folder, "%s.txt" % doc_id), "w") as f:
            print >> f, text.encode("UTF-8")

    index.close()
def gen(batch_size=50):
    current_index = batch_size
    index = ['/01%04d' % i for i in xrange(batch_size)]
    while 1:
        X = np.asanyarray(
            [utils.read_npy(traingPath + i + '.npy') for i in index],
            dtype=NP_DTYPE)
        str2index = lambda line: [chrst[i] for i in line]
        y = np.asanyarray([
            str2index(line) for line in
            [utils.read_text(traingPath + i + '.txt') for i in index]
        ], dtype=NP_DTYPE)
        if current_index + batch_size > trainingFileNum:
            index = range(current_index, trainingFileNum) + range(
                0, batch_size + current_index - trainingFileNum)
            index = ['/01%04d' % i for i in index]
            current_index = batch_size + current_index - trainingFileNum
        else:
            index = [
                '/01%04d' % i
                for i in range(current_index, current_index + batch_size)
            ]
            current_index += batch_size
        yield [
            X, y,
            np.ones(batch_size) * int(width - 2),
            np.ones(batch_size) * n_len
        ], np.ones(batch_size)
def __init__(self, root="", split=None, transform_function=None):
    self.split = split
    self.n_classes = 2
    self.transform_function = transform_function

    ############################
    # self.path_base = "/home/tammy/LCFCN/datasets/TRANCOS_v3"
    self.path_base = "/floyd/input/logcounting"
    # self.path_base = "/mnt/datasets/public/issam/Trancos/"

    if split == "train":
        fname = self.path_base + "/image_sets/training.txt"
    elif split == "val":
        fname = self.path_base + "/image_sets/validation.txt"
    elif split == "test":
        fname = self.path_base + "/image_sets/test.txt"

    self.img_names = [name.replace(".jpg\n", "") for name in ut.read_text(fname)]
    self.path = self.path_base + "/images/"
    self.path_dots = self.path_base + "/dots/"
    assert os.path.exists(self.path + self.img_names[0] + ".jpg")
def extend_matrix(sourceDir, targetDir, encodingMethod=1):
    utils.mkdir(targetDir)
    print 'extending matrix of %s ...' % (sourceDir)
    index = sorted([fl for fl in os.listdir(sourceDir) if fl.endswith('txt')])
    extendData = [
        sourceDir + '/' + fl
        for fl in sorted(index, key=lambda name: int(name.split('.')[0]))
    ]
    widthDict = utils.width_dict(filename='material/width.txt')
    for i, fl in enumerate(extendData):
        tmpMatrix = np.zeros(shape=(1, maxlen, chrstlen), dtype=float)
        sntc = utils.read_text(fl)
        for j, char in enumerate(sntc):
            tmpMatrix[0, j, chrst[char]] = 1.0
        extendMatrix = model.predict(tmpMatrix)[0]
        # print ''.join([chrst_reversed[ch] for ch in [np.argmax(line) for line in extendMatrix]])
        mat = utils.extend_matrix(original=extendMatrix,
                                  length=600,
                                  string=sntc,
                                  widthDict=widthDict,
                                  chrst=chrst,
                                  encodingMethod=encodingMethod)
        utils.write_npy(filename='%s/%s.npy' % (targetDir, index[i].split('.')[0]), mat=mat)
def input_text(self):
    if not self.edition or not self.user:
        raise quit()
    print('\ntype in the review. (exit by typing two continuous line breaks.)\n')
    self.text = utils.read_text()
    print('\nthanks.')
def gen_for_test(start, batch_size=50):
    index = ['/01%04d' % i for i in xrange(start, start + batch_size)]
    while 1:
        X = np.asanyarray(
            [utils.read_npy(testPath + i + '.npy') for i in index],
            dtype=NP_DTYPE)
        y = [utils.read_text(testPath + i + '.txt') for i in index]
        yield X, y
def build(name) -> [str, None]:
    school = cfg_schools.data[name]
    ext_name = school.get('section-times')
    if ext_name is None:
        return None
    src_path = os.path.join(SRC_DIR, name + '.' + ext_name)
    text = utils.read_text(src_path)
    text = text.strip().replace('\n', SPACE)
    text = SPACE + 'sectionTimes: ' + text
    return TAG_BEFORE + text + TAG_AFTER
def main():
    readme = utils.read_text(README_NAME)
    start = readme.index(TAG_ADAPTED_TABLE_START) + len(TAG_ADAPTED_TABLE_START)
    end = readme.index(TAG_ADAPTED_TABLE_END)
    with utils.open_text(README_NAME, 'w') as f:
        f.write(readme[:start])
        f.write('\n')
        f.write(create_adapted_table())
        f.write(readme[end:])
def make_vocab(cls, data_path, save_path):
    texts = read_text(data_path)
    words = [word for text in tqdm(texts)
             for word in cls.tokenizer.morphs(preprocess_text(text))]
    word_counter = Counter(words)
    vocab = {"[PAD]": 0, "[UNK]": 1}
    idx = 2
    for word, count in word_counter.most_common():
        vocab[word] = idx
        idx += 1
    save_json(save_path, vocab)
def main():
    try:
        path = sys.argv[1]
    except IndexError:
        print("Error: No input files")
        sys.exit()
    text = read_text(path)
    # print
    p = Parser()
    info = p.parse(text)
    pprint(info)
def make_dataset(self, data_dir, save_dir):
    data = utils.read_text(data_dir)
    f = open(save_dir, 'wb')
    print('generating sentences')
    sents = utils.generate_sentences(data)
    print('converting sentences to code points')
    sents_cp = utils.convert_sent_to_id(sents)
    for i in tqdm(sents_cp):
        x = i[:-1]
        y = i[1:]
        pkl.dump((x, y), f)
    f.close()
    print('done')
def __extract_nom_mentions(nom_dict_file, doc_list_file, words_pos_file, dst_nom_mentions_file):
    noms = load_nom_dict(nom_dict_file)
    nom_name_list = [n for n in noms]
    nom_name_list.sort(key=lambda x: -len(x))
    nom_name_list = [n.split(' ') for n in nom_name_list]
    doc_path_dict = __load_doc_paths_as_dict(doc_list_file)

    mentions = list()
    f_wp = open(words_pos_file, 'r')
    for i, line in enumerate(f_wp):
        vals = line.rstrip().split('\t')
        docid = vals[0]
        if (i + 1) % 10 == 0:
            print i + 1, docid
        doc_path = doc_path_dict[docid]
        doc_text = read_text(doc_path).decode('utf-8')
        if doc_text.startswith(doc_head):
            doc_text = doc_text[len(doc_head):]
        num_sentences = int(vals[1])
        for j in xrange(num_sentences):
            sentence = __next_sentence_in_words_pos_file(f_wp)
            words = [tup[0].lower() for tup in sentence]
            # print words
            hit_spans, hit_indices = find_phrases_in_words(nom_name_list, words, False)
            for hit_span, hit_idx in izip(hit_spans, hit_indices):
                beg_pos = sentence[hit_span[0]][3]
                end_pos = sentence[hit_span[1] - 1][4]
                tags = [tup[2] for tup in sentence[hit_span[0]:hit_span[1]]]
                # print tags
                # if 'NN' not in tags and 'NNP' not in tags:
                #     continue
                if 'NN' not in tags:
                    continue
                name = doc_text[beg_pos:end_pos + 1].replace('\n', ' ')
                if '<' in name or 'http:' in name or '>' in name:
                    continue
                m = Mention(name=name, beg_pos=beg_pos, end_pos=end_pos, docid=docid,
                            mention_type='NOM', entity_type='PER', kbid='NIL00000')
                mentions.append(m)
                # print sentence[hit_span[0]], sentence[hit_span[1]]
                # print nom_name_list[hit_idx], name
        # break
    f_wp.close()
    Mention.save_as_edl_file(mentions, dst_nom_mentions_file)
def build_fs_dict(token_date_dir, result_dir, fs_percent=1):
    PMI_SO = result_dir + os.sep + 'PMI-SO'
    print 'Reading text...'
    doc_str_list_token, doc_class_list_token = utils.read_text(
        [token_date_dir + os.sep + x for x in FNAME_LIST], SAMP_TAG)
    print 'End Reading'
    doc_terms_list_train = utils.get_doc_terms_list(doc_str_list_token)
    class_set = utils.get_class_set(doc_class_list_token)
    term_set = utils.get_term_set(doc_terms_list_train)
    print 'PMI-SO Sentiment Lexicon Construction...'
    df_term = utils.stat_df_term(term_set, doc_terms_list_train)
    df_class = utils.stat_df_class(class_set, doc_class_list_token)
    df_term_class = utils.stat_df_term_class(term_set, class_set, doc_terms_list_train, doc_class_list_token)
    term_set_fs, term_score_list = utils.feature_selection_mi(df_class, df_term_class)
    save_score_list(term_score_list, term_set_fs, PMI_SO)
def make_nikaya(sutra_urls):
    nikaya = _MyNikaya()
    nikaya.title_chinese = '長部'
    nikaya.title_pali = 'Digha Nikāya',
    nikaya.abbreviation = 'DN'

    for url in sutra_urls:
        chinese, pali, modified = read_text(url)
        header_lines, main_lines = split_chinese_lines(chinese)
        info = analyse_header(header_lines)

        if info.pin_title is not None:
            if not nikaya.pins or nikaya.pins[-1].title != info.pin_title:
                pin = _Pin()
                pin.serial = info.pin_serial
                pin.title = info.pin_title
                nikaya.pins.append(pin)

        sutra = Sutra()
        sutra.serial_start = info.sutra_serial_start
        sutra.serial_end = info.sutra_serial_end
        sutra.pali = pali
        sutra.chinese = chinese
        sutra.main_lines = main_lines
        sutra.modified = modified
        sutra.serial = sutra.serial_start
        sutra.title = info.sutra_title
        sutra.sec_title = sutra.serial + ' ' + sutra.title
        sutra.abbreviation = '{}.{}'.format(nikaya.abbreviation, sutra.serial)

        nikaya.pins[-1].sutras.append(sutra)

    return nikaya
def load_data(self, trainingNum=None, testNum=None):
    self.trainingNum = trainingNum
    self.testNum = testNum
    text = utils.read_text(filename=self.filename)
    sentences = [
        text[i:i + self.maxlen] for i in xrange(0, len(text), self.step)
    ]
    sentences = [sntc for sntc in sentences if len(sntc) == self.maxlen]
    random.shuffle(sentences)
    if trainingNum is not None:
        self.traingData = sentences[:self.trainingNum]
    if testNum is not None:
        self.testData = sentences[-self.testNum:]
    del sentences
def make_vocab(cls, data_path, save_path):
    data = read_text(data_path)
    texts = make_texts(data)
    letters = []
    for text in tqdm(texts):
        text = preprocess_text(text)
        for char in text:
            try:
                ls = hgtk.letter.decompose(char)
            except:
                ls = ["[NUM]", "[NUM]", "[NUM]"]
            letters.extend(ls)
    letter_counter = Counter(letters)
    vocab = {"[PAD]": 0, "[UNK]": 1}
    idx = 2
    for char, count in letter_counter.most_common():
        vocab[char] = idx
        idx += 1
    save_json(save_path, vocab)
def get_tokens_per_citation(doc_id):
    '''
    Fetches all papers cited by paper 'doc_id', gets the contexts around these
    citations and returns them in a dict structure
    {cited_paper_id: [token1, token2, ..., tokenN]}. If a paper is cited at
    more than one location, the tokens for each context are merged together.
    '''
    citations = contexts.get_cited_papers(doc_id)
    text = utils.read_text(doc_id)

    tokens_per_citation = defaultdict(list)
    ctxs = {}
    for cited, start, end in citations:
        # Only process citation if cited paper is known (cited != None)
        if cited:
            if (start, end) not in ctxs:
                ctxs[(start, end)] = tokenizer.tokenize(
                    contexts.find_sentence(text, start, end))
            tokens_per_citation[cited] += ctxs[(start, end)]

    return tokens_per_citation
def main(input_file, vocabulary_file):
    """Automatically check and correct the spelling of a file."""
    vocabulary = utils.read_vocabulary(vocabulary_file)
    logging.info("Read %i words.", len(vocabulary))
    text = utils.read_text(input_file)
    check(text, vocabulary)
nb_epochs = 100
train_batch_size = 128
val_batch_size = 256
sample_mode = 'argmax'
reverse = True
data_path = './data'
train_books = ['nietzsche.txt', 'pride_and_prejudice.txt',
               'shakespeare.txt', 'war_and_peace.txt']
val_books = ['wonderland.txt']

if __name__ == '__main__':
    # Prepare training data.
    text = read_text(data_path, train_books)
    vocab = tokenize(text)
    vocab = list(filter(None, set(vocab)))

    # `maxlen` is the length of the longest word in the vocabulary
    # plus two SOS and EOS characters.
    maxlen = max([len(token) for token in vocab]) + 2
    train_encoder, train_decoder, train_target = transform(
        vocab, maxlen, error_rate=error_rate, shuffle=False)
    print(train_encoder[:10])
    print(train_decoder[:10])
    print(train_target[:10])

    input_chars = set(' '.join(train_encoder))
    target_chars = set(' '.join(train_decoder))
    nb_input_chars = len(input_chars)
def input_text(self):
    if not self.edition or not self.user:
        raise quit()
    print '\ntype in the review. (exit by typing two continuous line breaks.)\n'
    self.text = utils.read_text()
    print '\nthanks.'
parser.add_argument("--train_w2v", help="train word2vec model from input file", action="store_true") parser.add_argument("--train_cnn", help="train CNN model from input labels and data", action="store_true") parser.add_argument("--predict_cnn", help="use CNN model to predict labels for test data", action="store_true") parser.add_argument("--validate_cnn", action="store_true") args = parser.parse_args() if args.train_d2v: text = [] for i in args.di: text += utils.read_text(i) utils.logger.info("text reading finished") tokens = utils.tokenize_paragraph_d2v(text) utils.logger.info("text tokenizing finished") utils.compute_paragraph_doc2vec(tokens, vector_size=args.vector_size, epochs=25, workers=multiprocessing.cpu_count(), model_path=args.dm) utils.logger.info("doc2vec training finished") elif args.train_w2v: text = [] for i in args.wi: text += utils.read_text(i) utils.logger.info("text reading finished") tokens = utils.tokenize_paragraph_w2v(text)
    print('\nConverting {} ...'.format(TMP_TEST_PROTOTXT_FILE))
    convert_prototxt()

    print('\nMaking {} ...'.format(DST_DEPLOY_PROTOTXT_FILE))
    make_deploy_prototxt()

    print('\nPostprocessing {} ...'.format(DST_TEST_PROTOTXT_FILE))
    postprocess_test_prototxt()

    info = {}
    info['weights'] = WEIGHTS_FILE
    info['lmdb_dir'] = LMDB_TARGET_DIR
    info['lmdb_image_count'] = LMDB_IMAGE_COUNT
    info['test_prototxt_f32'] = SRC_TEST_PROTOTXT_FILE
    info['test_prototxt_i8'] = DST_TEST_PROTOTXT_FILE
    info['deploy_prototxt_f32'] = SRC_DEPLOY_PROTOTXT_FILE
    info['deploy_prototxt_i8'] = DST_DEPLOY_PROTOTXT_FILE
    info['label_map_file'] = LABEL_MAP_FILE
    info['name_size_file'] = NAME_SIZE_FILE
    utils.write_json(PREPARED_INFO_FILE, info)

finally:
    print('\nFinalizing...')
    if caffe_package_init_file_uid:
        if os.path.isfile(caffe_package_init_file):
            existed_uid = utils.read_text(caffe_package_init_file)
            print('{} exists'.format(caffe_package_init_file))
            if existed_uid == caffe_package_init_file_uid:
                os.remove(caffe_package_init_file)
                print('Removed')
def _read_standoff(self, corpus_dir, encoding="UTF-8"):
    docs = {}
    for filename in glob(os.path.join(corpus_dir, "*.ann")):
        doc = read_text(filename.replace(".ann", ".txt"), encoding=encoding)
        cursor = 0
        start_offsets = {}
        end_offsets = {}
        sentences = []
        for sentence_index, sentence in enumerate(doc.split("\n")):
            tokens = sentence.split(" ")
            for token_index, token in enumerate(tokens):
                start_offsets[cursor] = (sentence_index, token_index)
                end_offsets[cursor + len(token)] = (sentence_index, token_index)
                cursor += len(token) + 1
            sentences.append({"tokens": tokens, "mentions": []})
        assert len(doc) == cursor - 1

        mentions = {}
        references = {}
        for line in read_lines(filename, encoding=encoding):
            if line.startswith("T"):
                matcher = MENTION_PATTERN.match(line)
                mention_id, mention_label, mention_start_offset, mention_end_offset, mention_string = (
                    matcher.groups())
                assert mention_id not in mentions
                if mention_label in self.get_labels():
                    mentions[mention_id] = {
                        "id": mention_id,
                        "label": mention_label,
                        "start": int(mention_start_offset),
                        "end": int(mention_end_offset),
                        "string": mention_string,
                        "references": {},
                    }
            elif line.startswith("N"):
                matcher = REFERENCE_PATTERN.match(line)
                reference_id, mention_id, resource_name, record_id, reference_string = (
                    matcher.groups())
                assert reference_id not in references
                references[reference_id] = {
                    "id": reference_id,
                    "mention": mention_id,
                    "resource": resource_name,
                    "record": record_id,
                    "string": reference_string,
                }

        for reference in references.values():
            if reference["mention"].startswith("T"):
                resource_record_pair = (reference["resource"], reference["record"])
                assert (resource_record_pair
                        not in mentions[reference["mention"]]["references"])
                mentions[reference["mention"]]["references"][
                    resource_record_pair] = reference["string"]

        seen_mentions = defaultdict(dict)
        for mention in mentions.values():
            left_sentence_index, mention_start_offset = start_offsets[mention["start"]]
            right_sentence_index, mention_end_offset = end_offsets[mention["end"]]
            assert (left_sentence_index == right_sentence_index
                    and mention_start_offset <= mention_end_offset
                    and " ".join(sentences[left_sentence_index]["tokens"]
                                 [mention_start_offset:mention_end_offset + 1])
                    == mention["string"])
            if (
                mention_start_offset,
                mention_end_offset,
                mention["label"],
            ) in seen_mentions[left_sentence_index]:
                seen_mention = seen_mentions[left_sentence_index][
                    mention_start_offset, mention_end_offset, mention["label"]]
                assert not (seen_mention["references"] and mention["references"]
                            and seen_mention["references"] != mention["references"])
                seen_mention["references"].update(mention["references"])
            else:
                sentences[left_sentence_index]["mentions"].append({
                    "id": mention["id"],
                    "label": mention["label"],
                    "start": mention_start_offset,
                    "end": mention_end_offset,
                    "references": mention["references"],
                })
                seen_mentions[left_sentence_index][
                    mention_start_offset, mention_end_offset, mention["label"]] = mention

        docs[os.path.basename(filename)] = {"sentences": sentences}
    return docs
def make_nikaya(sutra_urls): nikaya = MyNikaya() nikaya.title_chinese = "增支部" nikaya.title_pali = ("Aṅguttara nikāya",) nikaya.abbreviation = "AN" for url in sutra_urls: chinese, pali, modified = read_text(url) header_lines, main_lines = split_chinese_lines(chinese) info = analyse_header(header_lines) if info.ji_serial is not None: if not nikaya.jis or nikaya.jis[-1].serial != info.ji_serial: ji = Ji() ji.serial = info.ji_serial nikaya.jis.append(ji) if info.pin_serial is not None: if not nikaya.jis[-1].pins or nikaya.jis[-1].pins[-1].serial != info.pin_serial: pin = Pin() pin.serial = info.pin_serial pin.title = info.pin_title nikaya.jis[-1].pins.append(pin) sutra = Sutra() sutra.serial_start = info.sutra_serial_start sutra.serial_end = info.sutra_serial_end sutra.pali = pali sutra.chinese = chinese sutra.main_lines = main_lines sutra.modified = modified if sutra.serial_start == sutra.serial_end: sutra.serial = sutra.serial_start else: sutra.serial = "{}-{}".format(sutra.serial_start, sutra.serial_end) if info.sutra_title: sutra.title = info.sutra_title else: sutra.title = "" if sutra.title: sutra.sec_title = sutra.serial + " " + sutra.title else: sutra.sec_title = sutra.serial sutra.abbreviation = "{}.{}.{}".format(nikaya.abbreviation, nikaya.jis[-1].serial, sutra.serial) nikaya.jis[-1].pins[-1].sutras.append(sutra) return nikaya
print("Reading {}...".format(args.inputdir)) d = {} d1 = [] X = [] Y = [] for infile in glob.glob(args.inputdir + '/*/*/*'): dic = {} instance = os.path.split(os.path.dirname(infile))[-1] review_file = open(infile,'r').read() X.append(review_file) Y.append(instance) if instance not in d: d[instance] = [] d[instance].append(review_file) X, _ = read_text(X) df = pd.DataFrame(X) df = df.fillna(0) original_author_names = Y.copy() Y = label_encoder(Y) # Do what you need to read the documents here. print("Constructing table with {} feature dimensions and {}% test instances...".format(args.dims, args.testsize)) # Build the table here. X = reduce_dim(df, args.dims) train_X, test_X, train_Y, test_Y, tag = shuffle_split(X, Y, test_split = args.testsize) train_X = pd.DataFrame(train_X) test_X = pd.DataFrame(test_X) train_Y = pd.DataFrame(train_Y)
    model.add(LSTM(256, dropout=0.1, recurrent_dropout=0.1))
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=[
            'accuracy',
        ],
    )
    return model


if __name__ == '__main__':
    text = read_text([FILE_NAME, FILE_NAME2])
    text = preprocess(text, force=True)

    max_words = 100000000
    WINDOW = 4
    tokenizer = Tokenizer(num_words=max_words, filters='"#$%&()*+-/:;<=>@[\]^_`{|}~')
    tokenizer.fit_on_texts(text)
    X_train = tokenizer.texts_to_sequences(text)
    print('Train shape:', np.array(X_train).shape)

    X_train_time, Y_train_time = create_dataset(np.array(X_train), WINDOW)
    vocab_size = len(tokenizer.word_index) + 1
import os

import unidecode


def read_text(data_path, list_of_books):
    text = ''
    for book in list_of_books:
        file_path = os.path.join(data_path, book)
        strings = unidecode.unidecode(open(file_path).read())
        text += strings + ' '
    return text


test_path = 'E:\\level4\\second\\NLP\\project\\deep-spell-checkr-master\\data'
test_book = ['input.txt']
test_sentence = read_text(test_path, test_book)

true_path = 'E:\\level4\\second\\NLP\\project\\deep-spell-checkr-master\\data'
true_book = ['true.txt']
true_sentences = read_text(true_path, true_book)

string = ''
arr = []
for x in true_sentences:
    if x == ' ':
        arr.append(string)
        string = ''
    else:
        string = string + x
true = arr

if __name__ == '__main__':
    text = read_text(data_path, books)
    vocab = tokenize(text)
error_rate = 0.6
reverse = True
model_path = './models/seq2seq.h5'
hidden_size = 512
sample_mode = 'argmax'
data_path = './data'
books = [
    'nietzsche.txt', 'pride_and_prejudice.txt',
    'shakespeare.txt', 'war_and_peace.txt'
]
test_sentence = 'The rabbit-hole went straight on like a tunnel for some way, and then dipped suddenly down, so suddenly that Alice had not a moment to think about stopping herself before she found herself falling down a very deep well.'

if __name__ == '__main__':
    text = read_text(data_path, books)
    vocab = tokenize(text)
    vocab = list(filter(None, set(vocab)))

    # `maxlen` is the length of the longest word in the vocabulary
    # plus two SOS and EOS characters.
    maxlen = max([len(token) for token in vocab]) + 2
    train_encoder, train_decoder, train_target = transform(
        vocab, maxlen, error_rate=error_rate, shuffle=False)

    tokens = tokenize(test_sentence)
    tokens = list(filter(None, tokens))
    nb_tokens = len(tokens)
    misspelled_tokens, _, target_tokens = transform(
        tokens, maxlen, error_rate=error_rate, shuffle=False)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
@author: Yang
@time: 17-11-14 9:18 PM
'''
import os
import re

import matplotlib.pyplot as plt
import numpy as np

import utils

plt.figure()

# , 'underfitting'
Types = ['acceleration']
for dataType in Types:
    dirs = 'model/%s/image/' % (dataType)
    for subdir in os.listdir(dirs):
        logFile = dirs + subdir + '/log.txt'
        logLines = utils.read_text(filename=logFile).strip().split('\n')
        logLines = [re.findall(r'\d+\.?\d*', line) for line in logLines]
        logLines = np.asarray(logLines, dtype=np.float)
        plt.plot(logLines[:, -1], label=logFile)

plt.legend(loc='best')
plt.grid()
plt.show()
import utils as ut

if __name__ == '__main__':
    lines = ut.read_text("inputs/day1_1.txt")
    digits = lines[0]
    n_digits = len(digits)

    d_sum = 0
    for i in range(n_digits):
        d = digits[i]
        if i == (n_digits - 1):
            d_next = digits[0]
        else:
            d_next = digits[i + 1]
        if int(d) == int(d_next):
            d_sum += int(d)
    print(d_sum)
def make_nikaya(sutra_urls):
    nikaya = MyNikaya()
    nikaya.title_chinese = '相應部'
    nikaya.title_pali = 'Saṃyutta Nikāya',
    nikaya.abbreviation = 'SN'

    for url in sutra_urls:
        chinese, pali, modified = read_text(url)
        header_lines, main_lines = split_chinese_lines(chinese)
        info = analyse_header(header_lines)

        if info.pian_serial is not None:
            if not nikaya.subs or nikaya.subs[-1].serial != info.pian_serial:
                pian = Pian()
                pian.serial = info.pian_serial
                pian.title = info.pian_title
                nikaya.subs.append(pian)

        if info.xiangying_serial is not None:
            if not nikaya.subs[-1].subs or nikaya.subs[-1].subs[-1].serial != info.xiangying_serial:
                xiangying = XiangYing()
                xiangying.serial = info.xiangying_serial
                xiangying.title = info.xiangying_title
                xiangying.sec_title = '{} {}'.format(xiangying.serial, xiangying.title)
                nikaya.subs[-1].subs.append(xiangying)

        if info.pin_serial is not None:
            if not nikaya.subs[-1].subs[-1].subs or nikaya.subs[-1].subs[-1].subs[-1].serial != info.pin_serial:
                pin = Pin()
                pin.serial = info.pin_serial
                pin.title = info.pin_title
                nikaya.subs[-1].subs[-1].subs.append(pin)

        if not nikaya.pians[-1].xiangyings[-1].pins:
            pin = Pin()
            pin.serial = 1
            pin.title = '(未分品)'
            nikaya.pians[-1].xiangyings[-1].pins.append(pin)

        sutra = Sutra()
        sutra.serial_start = info.sutra_serial_start
        sutra.serial_end = info.sutra_serial_end
        sutra.pali = pali
        sutra.chinese = chinese
        sutra.main_lines = main_lines
        sutra.modified = modified

        if sutra.serial_start == sutra.serial_end:
            sutra.serial = sutra.serial_start
        else:
            sutra.serial = '{}-{}'.format(sutra.serial_start, sutra.serial_end)

        if info.sutra_title:
            sutra.title = info.sutra_title
        else:
            sutra.title = ''

        if sutra.title:
            sutra.sec_title = sutra.serial + ' ' + sutra.title
        else:
            sutra.sec_title = sutra.serial

        sutra.abbreviation = '{}.{}.{}'.format(nikaya.abbreviation,
                                               nikaya.pians[-1].xiangyings[-1].serial,
                                               sutra.serial)

        nikaya.pians[-1].xiangyings[-1].pins[-1].sutras.append(sutra)

    return nikaya