def save_vectors():
    if not MODEL_FILEPATH.exists():
        raise TrainedModelNotFoundError()
    # First line is the "<vocab size> <vector dim>" header; each following
    # line is a word followed by its vector components.
    write_lines(
        [f"{len(model.words)} {len(model[model.words[0]])}"] +
        [w + ' ' + ' '.join(map(str, model[w])) for w in model.words],
        MODEL_FILEPATH.with_suffix('.vec'))
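# For reference, the emitted file follows the word2vec text format. This is
# an illustrative sketch, not output from the actual model:
#
#     3 4
#     the 0.12 -0.48 0.33 0.05
#     of 0.91 0.02 -0.11 0.76
#     and -0.27 0.64 0.18 -0.39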
def generate_for_each(macro_prefix, header_guard_prefix, supported_size,
                      nested_loops_count, output_dir):
    '''
    Generate FOR_EACH macros, one per required nesting level. Each FOR_EACH
    supports a __VA_ARGS__ length up to the specified `supported_size`.
    '''
    lines = utils.generate_header_begin(macro_prefix, header_guard_prefix,
                                        'for_each')
    lines.append('')

    # Generate #includes for the CAT, DEFER, and INC macros.
    lines.append('#include "{}arg.hpp"'.format(macro_prefix.lower()))
    lines.append('#include "{}cat.hpp"'.format(macro_prefix.lower()))
    lines.append('#include "{}inc.hpp"'.format(macro_prefix.lower()))
    lines.append('')

    # Generate one FOR_EACH macro per nesting level.
    for x in range(nested_loops_count):
        # Generate the macro which invokes the correct iteration.
        lines += generate_for_each_dispatch_macro(macro_prefix, x)
        lines.append('')
        # Generate the ITERN macros.
        for i in range(supported_size, 1, -1):
            lines += generate_for_each_itern_macro(macro_prefix, x, i)
        # Generate the ITER1 macro.
        lines += generate_for_each_iter1_macro(macro_prefix, x)
        lines.append('')

    lines.append(utils.generate_header_end())
    utils.write_lines(
        utils.get_output_file_name(macro_prefix, 'for_each', output_dir),
        lines)
def _write_ft_file(filename, text_key=TEXT_KEY, label_key=LABEL_KEY,
                   context_key=CONTEXT_KEY, remove_empty=True, clean=True,
                   relevant_labels=None, label_prefix=LABEL_PREFIX,
                   context_prefix=CONTEXT_PREFIX):
    data = _load_data(filename, text_key=text_key, remove_empty=remove_empty)
    if clean:
        data = _clean_texts(data, text_key=text_key)
        data = _clean_labels(data, label_key=label_key,
                             relevant_labels=relevant_labels)
        data = _clean_contexts(data, context_key=context_key,
                               context_prefix=context_prefix)

    def add_prefix(label):
        return f"{label_prefix}{label if len(label) > 0 else 'unknown'}"

    def make_row(label, text, context):
        # Row layout: prefixed label, optional context token, then the text.
        row = [add_prefix(label)] + ([context] if context else []) + [text]
        return ' '.join(row).strip()

    TRAIN_FT_FILEPATH.parent.mkdir(parents=True, exist_ok=True)
    texts = [
        make_row(row[label_key], row[text_key], row.get(context_key, ''))
        for row in data
    ]
    write_lines(texts, TRAIN_FT_FILEPATH)
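# A resulting training line would look roughly like this (illustrative
# values; assumes LABEL_PREFIX is fastText's conventional '__label__' and
# that _clean_contexts prefixed the context token):
#
#     __label__billing ctx_mobile my invoice shows a double charge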
def main():
    config = utils.Config()

    filenames = os.listdir(
        os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt", "segmented"))
    filenames = [n for n in filenames if n.endswith(".txt")]
    filenames.sort()

    utils.mkdir(
        os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                     "preprocessed"))

    for filename in pyprind.prog_bar(filenames):
        path_seg = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                                "segmented", filename)
        path_raw = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                                "raw", filename)
        path_dst = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                                "preprocessed",
                                filename.replace(".txt", ".edus"))

        # Input
        edus = utils.read_lines(path_seg, process=lambda line: line)
        edus = remove_empty_lines(filename, edus)
        raw_lines = utils.read_lines(path_raw, process=lambda line: line)
        raw_lines = remove_empty_lines(filename, raw_lines)
        assert count_chars(edus) == count_chars(raw_lines)

        # Processing
        edus = convert_edus(edus, raw_lines)
        assert count_chars(edus) == count_chars(raw_lines)

        # Output
        utils.write_lines(path_dst, edus)
def srt2ss(path, is_ass=True):
    timer = utils.Timer()
    timer.start()
    subs = utils.load_sub_file(path)
    start_time = utils.get_start_time(subs, 'ass')
    end_time = utils.get_end_time(subs, 'ass')
    plaintext = utils.get_plaintext(subs)
    sub_block = []
    # Fixed fields of each ASS "Dialogue" event. An ASS event carries three
    # margins: MarginL, MarginR, and MarginV.
    LAYER = 0
    STYLE = 'Default'
    NAME = ''
    MARGINL = 0
    MARGINR = 0
    MARGINV = 0
    EFFECT = ''
    for i in range(len(subs)):
        sub_block.append('Dialogue: %d,%s,%s,%s,%s,%d,%d,%d,%s,%s' %
                         (LAYER, start_time[i], end_time[i], STYLE, NAME,
                          MARGINL, MARGINR, MARGINV, EFFECT, plaintext[i]))
    # Name the output after the input file (strip the extension).
    output_filename = path.rsplit('.', 1)[0]
    utils.write_txt('%s.ass' % output_filename, script_info())
    utils.write_lines('%s.ass' % output_filename, sub_block, mode='a')
    timer.stop()
    print('Conversion complete in %.2f seconds' % timer.elapsed)
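# A generated event line then looks like this (illustrative timing and text):
#
#     Dialogue: 0,0:00:01.00,0:00:03.50,Default,,0,0,0,,First subtitle line
#
# The two trailing empty fields are Name and Effect, which this converter
# leaves blank.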
def pre_process_data(raw_data, tokenizer, config, logger):
    '''
    raw_data: a directory or a specific file
    '''
    vocab_file = os.path.join(config.tokenized_data_dir, 'vocab.txt')
    sample_file = os.path.join(config.tokenized_data_dir, 'samples.txt')
    if os.path.isfile(vocab_file) and os.path.isfile(sample_file):
        logger.info("vocab file and sample file already exist!")
        return Data(vocab_file, sample_file, config, logger)

    logger.info("Generate vocabulary and tokenized samples.")
    if os.path.isfile(raw_data):
        raw_data = [raw_data]
    else:
        raw_data = glob.glob(os.path.join(raw_data, '*'))

    samples = set()
    for file in raw_data:
        for qa in parse_raw_file(file):
            q, a = qa[0], qa[1]
            tokenized_q = tokenize_one_line(
                sentence=q,
                cut_fun=tokenizer.tokenize,
                specical_symbol=config.special_symbol,
                mode=config.source_language_type,
                lower=config.source_language_lower)
            tokenized_a = tokenize_one_line(
                sentence=a,
                cut_fun=tokenizer.tokenize,
                specical_symbol=config.special_symbol,
                mode=config.target_language_type,
                lower=config.target_language_lower)
            samples.add(tokenized_q + "\t" + tokenized_a)
    logger.info('sample size: {}'.format(len(samples)))
    logger.info("save samples in '{}'".format(sample_file))
    write_lines(sample_file, samples)

    source_vocab, target_vocab, special_vocab = create_vocabulary(
        samples, config.special_symbol)
    # Normalize all three vocabularies to sets so that discard() and the
    # union below behave consistently.
    source_vocab = set(source_vocab)
    target_vocab = set(target_vocab)
    special_vocab = set(special_vocab)
    for s_symbol in config.vocab_remains:
        source_vocab.discard(s_symbol)
        target_vocab.discard(s_symbol)
        special_vocab.discard(s_symbol)
    logger.info('vocab size: {}'.format(
        len(source_vocab) + len(target_vocab) + len(special_vocab) +
        len(config.vocab_remains)))
    logger.info('save vocabulary in "{}"'.format(vocab_file))
    with open(vocab_file, 'w', encoding='utf8') as f:
        for line in config.vocab_remains:
            f.write(line + '\n')
        for line in special_vocab:
            f.write(line + '\n')
        for line in source_vocab | target_vocab:
            f.write(line + '\n')
    return Data(vocab_file, sample_file, config, logger)
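# The vocabulary file is laid out in three blocks, one token per line:
# config.vocab_remains first, then the special symbols, then the union of
# source and target vocabularies. Illustrative sketch (the reserved tokens
# are made-up examples, not values from the source):
#
#     <pad>
#     <unk>
#     ...special symbols...
#     ...source/target words...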
def hit_keyword(fn):
    keywords = []
    with open(fn) as f:
        while True:
            line = f.readline()
            if not line:
                break
            keywords.append(line.strip("\n"))
            # Skip the next two lines of each three-line record; only the
            # first line carries the keyword.
            f.readline()
            f.readline()
    from utils import write_lines
    write_lines("./data/query_hit_keyword.dat", keywords)
def generate_cat(macro_prefix, header_prefix, supported_size, output_dir):
    '''
    Generate the CAT macro, which concatenates its two arguments.
    '''
    lines = utils.generate_header_begin(macro_prefix, header_prefix, 'cat')
    lines.append('')
    lines += utils.get_cat_lines(macro_prefix, '', supported_size)
    lines.append('')
    lines.append(utils.generate_header_end())
    utils.write_lines(
        utils.get_output_file_name(macro_prefix, 'cat', output_dir), lines)
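# The classic two-level pattern such a CAT macro expands to (illustrative,
# not the actual template text; 'PP_' stands in for macro_prefix):
#
#     #define PP_CAT(a, b) PP_CAT_IMPL(a, b)
#     #define PP_CAT_IMPL(a, b) a##b
#
# The extra level of indirection forces `a` and `b` to be macro-expanded
# before the token paste, so PP_CAT(foo, __LINE__) yields e.g. foo42 rather
# than foo__LINE__.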
def main():
    config = utils.Config()

    path_out = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt")
    utils.mkdir(path_out)
    utils.mkdir(os.path.join(path_out, "raw"))

    sections = os.listdir(config.getpath("ptbwsj"))
    sections.sort()

    rstdt_wsj_filenames = get_rstdt_wsj_filenames()

    count = 0
    for sec_i, section in enumerate(sections):
        print("[%d/%d] Processing %s" %
              (sec_i + 1, len(sections),
               os.path.join(config.getpath("ptbwsj"), section)))
        filenames = os.listdir(os.path.join(config.getpath("ptbwsj"),
                                            section))
        filenames = [n for n in filenames if n.startswith("wsj_")]
        filenames.sort()
        for filename in filenames:
            if filename in rstdt_wsj_filenames:
                print("Skipped %s (which is contained in RST-DT)" % filename)
                continue
            count += 1
            # Read the article, falling back to Latin-1 when it is not
            # valid UTF-8.
            try:
                lines = utils.read_lines(
                    os.path.join(config.getpath("ptbwsj"), section, filename),
                    process=lambda line: line)
            except UnicodeDecodeError:
                lines = []
                for line in codecs.open(
                        os.path.join(config.getpath("ptbwsj"), section,
                                     filename), "r", "latin-1"):
                    lines.append(line.strip())
            # Remove the ".START" marker and the leading empty lines.
            assert lines[0] == ".START"
            lines = lines[1:]
            top_empty_count = 0
            for line_i in range(len(lines)):
                if lines[line_i] == "":
                    top_empty_count += 1
                else:
                    break
            lines = lines[top_empty_count:]
            utils.write_lines(
                os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                             "raw", filename + ".txt"),
                lines)
    print("Processed %d files." % count)
def generate_arg(macro_prefix, header_guard_prefix, supported_size,
                 output_dir):
    '''
    Generate the ARG_LENGTH macro.
    '''
    # Seed lines with the header guard.
    lines = utils.generate_header_begin(macro_prefix, header_guard_prefix,
                                        'arg')
    lines.append('')

    # Generate ARG_NTH, where N = `supported_size` + 1, which gives back the
    # Nth of the variable arguments.
    args_list = ['_{}'.format(i) for i in range(1, supported_size + 1)]
    args_str = ', '.join(args_list)
    arg_nth = ARG_NTH_TEMPLATE.format(macro_prefix.upper(),
                                      supported_size + 1, args_str)
    lines.append(arg_nth)
    lines.append('')

    # Generate ARG_LENGTH for getting the length of the variable arguments.
    # Works for lists of 1 to `supported_size` arguments.
    lengths_list = ['{}'.format(i) for i in range(supported_size, 0, -1)]
    lengths_str = ', '.join(lengths_list)
    arg_length = ARG_LENGTH_TEMPLATE.format(macro_prefix.upper(),
                                            supported_size + 1, lengths_str)
    lines.append(arg_length)
    lines.append('')

    # Generate the macro for getting the second of the variable arguments.
    # Uppercased for consistency with the other ARG templates.
    lines.append(ARG_2ND_TEMPLATE.format(macro_prefix.upper()))
    lines.append('')

    # Generate CAT for use in the ARG macros.
    lines += utils.get_cat_lines(macro_prefix, 'ARG_', 2)
    lines.append('')

    # Generate ARG_IS_SINGLE for checking whether the variable arguments are
    # of size 1.
    lines.append(ARG_IS_SINGLE_TEMPLATE.format(macro_prefix.upper()))
    lines.append('')

    # Generate ARG_IS_SINGULAR for checking whether a de-parenthesized
    # argument is a list of size 1.
    lines.append(ARG_IS_SINGULAR_TEMPLATE.format(macro_prefix.upper()))
    lines.append('')

    lines.append(utils.generate_header_end())
    utils.write_lines(
        utils.get_output_file_name(macro_prefix, 'arg', output_dir), lines)
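# How ARG_LENGTH works (illustrative expansion, not the actual template
# text; 'PP_' stands in for macro_prefix, with supported_size = 4):
#
#     #define PP_ARG_NTH(_1, _2, _3, _4, N, ...) N
#     #define PP_ARG_LENGTH(...) PP_ARG_NTH(__VA_ARGS__, 4, 3, 2, 1)
#
# PP_ARG_LENGTH(a, b, c) expands to PP_ARG_NTH(a, b, c, 4, 3, 2, 1); the
# three real arguments shift the descending count so that slot N lands on 3.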
def ss2srt(path, chinese_only=False, english_only=False, Tchinese_only=False):
    timer = utils.Timer()
    timer.start()
    subs = utils.load_sub_file(path)
    start_time = utils.get_start_time(subs, 'srt')
    end_time = utils.get_end_time(subs, 'srt')
    plaintext = utils.get_plaintext(subs)
    format_sub = []
    if chinese_only:
        pass  # TODO: not implemented
    elif english_only:
        pass  # TODO: not implemented
    elif Tchinese_only:
        pass  # TODO: not implemented
    else:
        # Simplified Chinese & English bilingual output.
        for i in range(len(subs)):
            format_sub.append('%s\n' % (i + 1))
            if i > 0 and start_time[i] == start_time[i - 1] \
                    and end_time[i] == end_time[i - 1]:
                # Same time span as the previous cue: emit both lines as one
                # merged block.
                format_sub.append('%s --> %s\n' % (start_time[i],
                                                   end_time[i]))
                format_sub.append('%s' % (plaintext[i - 1]))
                format_sub.append('%s\n' % (plaintext[i]))
            elif i + 1 < len(subs) and start_time[i] == start_time[i + 1] \
                    and end_time[i] == end_time[i + 1]:
                # The next cue shares this time span: drop the index line
                # just added and let the next iteration emit the merged block.
                format_sub.pop()
            else:
                format_sub.append('%s --> %s\n' % (start_time[i],
                                                   end_time[i]))
                format_sub.append('%s\n' % (plaintext[i]))
    # Name the output after the input file (strip the extension).
    output_filename = path.rsplit('.', 1)[0]
    utils.write_lines('%s.srt' % output_filename, format_sub)
    timer.stop()
    print('Conversion complete in %.2f seconds' % timer.elapsed)
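# A merged bilingual cue follows the usual SRT layout (illustrative values):
#
#     1
#     00:00:01,000 --> 00:00:03,500
#     第一行中文
#     First English line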
def generate_inc(macro_prefix, header_guard_prefix, supported_size,
                 output_dir):
    '''
    Generate the INC macros, each of which gives back a number one higher
    than its index.
    '''
    lines = utils.generate_header_begin(macro_prefix, header_guard_prefix,
                                        'inc')
    lines.append('')
    lines.append('// INC_N gives back N+1')
    # Generate the INC_N macros, which give back N + 1.
    inc_macros = [
        INC_TEMPLATE.format(macro_prefix.upper(), i, i + 1)
        for i in range(supported_size)
    ]
    lines += inc_macros
    lines.append('')
    lines.append(utils.generate_header_end())
    utils.write_lines(
        utils.get_output_file_name(macro_prefix, 'inc', output_dir), lines)
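# The generated header plausibly contains lines like the following (this
# assumes INC_TEMPLATE is a simple '#define {0}INC_{1} {2}' pattern, which
# is not shown in the source; 'PP_' stands in for macro_prefix):
#
#     #define PP_INC_0 1
#     #define PP_INC_1 2
#     #define PP_INC_2 3
#
# FOR_EACH can use these together with CAT to step its iteration counter,
# since the preprocessor has no arithmetic of its own.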
def extract_plain_text(path, english_only=False, chinese_only=False):
    timer = utils.Timer()
    timer.start()
    subs = utils.load_sub_file(path)
    plaintext = utils.get_plaintext(subs)
    # Name the output after the input file (strip the extension).
    output_file_name = path.rsplit('.', 1)[0]
    if english_only and chinese_only:
        print('Chinese only and English only cannot be checked '
              'at the same time')
        sys.exit(0)
    elif chinese_only:
        chinese_lines = [utils.chinese_only(line) + '\n'
                         for line in plaintext]
        utils.write_lines('%s.txt' % output_file_name, chinese_lines)
    elif english_only:
        english_lines = [utils.english_only(line) + '\n'
                         for line in plaintext]
        utils.write_lines('%s.txt' % output_file_name, english_lines)
    else:
        utils.write_lines('%s.txt' % output_file_name, plaintext)
    timer.stop()
    print('Extraction complete in %.2f seconds' % timer.elapsed)
def main(*args):
    assert len(args) >= 2

    # Load pre-trained embeddings and build the model.
    word_embeddings = np.load("embedding/word_embeddings.npy")
    position_embeddings_1 = np.load("embedding/position_embeddings_1.npy")
    position_embeddings_2 = np.load("embedding/position_embeddings_2.npy")
    embeddings = make_dict(word_embeddings, position_embeddings_1,
                           position_embeddings_2)
    from models import build_model
    model = build_model(embeddings)
    weights_path = args[0]
    model.load_weights(weights_path)

    # Encode the input sentences into feature arrays.
    dis2idx_1 = json_load("embedding/dis2idx_1.json")
    dis2idx_2 = json_load("embedding/dis2idx_2.json")
    word2idx = json_load("embedding/word2idx.json")
    encoder = Encoder(word2idx, dis2idx_1, dis2idx_2)
    input_file = args[1]
    sentences, y = read_input(input_file)
    data = list(
        map(list, zip(*[s.generate_features(encoder) for s in sentences])))

    # Predict relations and map indices back to relation names.
    scores = model.predict(data, verbose=False)
    predictions = scores.argmax(-1)
    idx2relation = read_relations("origin_data/relations.txt")
    outputs = [
        "{} {}".format(prediction, idx2relation[prediction])
        for prediction in predictions
    ]
    print("\n".join(outputs))

    # Write predictions and the misclassified cases to a timestamped folder.
    timestamp = int(datetime.now().timestamp())
    output_folder = "output/test/%d" % timestamp
    os.makedirs(output_folder, exist_ok=True)
    print("output folder: %s" % output_folder)
    output_file = os.path.join(output_folder, 'output.txt')
    error_list_file = os.path.join(output_folder, 'error_list.txt')
    error_predictions_file = os.path.join(output_folder,
                                          'error_predictions.txt')
    write_lines(output_file, outputs)
    error_list = []
    error_predictions = []
    for sentence, label, prediction in zip(sentences, y, predictions):
        if label != prediction:
            error_list.append('{} {}'.format(label, str(sentence)))
            error_predictions.append(
                '{} {}'.format(prediction, idx2relation[prediction]))
    write_lines(error_list_file, error_list)
    write_lines(error_predictions_file, error_predictions)
# Methods of the User base class (the class definition itself sits outside
# this excerpt).
def random_login(self):
    tsp = utils.random_timestamp()
    outlier = random.random() < OUTLIERS_RATE
    row = (self.uid, tsp, self.random_country(tsp, outlier))
    return outlier, row

def random_country(self, tsp, outlier):
    pass


class Sedentary(User):
    def random_country(self, _, outlier):
        # bool indexes as 0/1: countries[0] is the usual country,
        # countries[1] the anomalous one.
        return self.countries[outlier]


class BusinessTraveler(User):
    def random_country(self, tsp, outlier):
        # The traveler is normally abroad on weekends, so XOR with the
        # weekend flag makes an outlier the opposite of the usual location.
        return self.countries[outlier ^ utils.isweekend(tsp)]


class FrequentFlyer(User):
    def __init__(self, userid):
        super().__init__(userid)
        self.has_outliers = False

    def random_country(self, tsp, outlier):
        # Any country is plausible for a frequent flyer, so no login counts
        # as an outlier.
        return random.choice(self.countries)


OUTLIERS_RATE = 0.05

for uid, init in enumerate((Sedentary, BusinessTraveler, FrequentFlyer)):
    user = init(uid)
    utils.write_lines("logins{}".format(user.uid), 500, user.random_login,
                      user.has_outliers)
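# A minimal sketch of the User base class these subclasses assume; the names
# and defaults below are inferred from usage, not taken from the source:
#
#     class User:
#         def __init__(self, userid):
#             self.uid = userid
#             self.countries = ["home", "abroad"]  # illustrative values
#             self.has_outliers = True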
if is_(opts.file):
    queries.extend(get_words(opts.file, i=opts.start, j=opts.stop))
if is_(opts.query):
    queries.extend(opts.query)

for i, q in enumerate(queries):
    LOGGER.info('+++ QUERY #%s: %s +++\n' % (i, q))
    RESULT_DIR = osp.join(RESULT_PREFIX, q)
    mkdir_p(RESULT_DIR)

    if opts.load_urls:
        urls = read_lines(osp.join(opts.load_urls, q, 'urls.txt'))
    elif opts.sch:
        urls = image_search(q, opts.target)
        write_lines(urls, osp.join(RESULT_DIR, 'urls.txt'))

    if opts.load_preds:
        preds = read_lines(osp.join(opts.load_preds, q, 'preds.txt'))
    elif opts.rsch:
        preds = reverse_search_urls(q, *urls, lang=opts.target,
                                    n_img=opts.n_img)
        write_lines(preds, osp.join(RESULT_DIR, 'preds.txt'))

    # TODO
    # if opts.pred:
    #     for top_n in 1, 3, 5, 10, 20, 25:
    #         for use_lang in True, False:
    #             pred_filtered = filter_results(preds, q, lang=opts.lang)
def main(args):
    assert args.inside_rstdt ^ args.outside_rstdt

    config = utils.Config()

    nlp = spacy.load("en_core_web_sm",
                     disable=["tagger", "parser", "ner", "textcat"])

    # Collect file names in RST-DT
    rstdt_train_filenames = os.listdir(
        os.path.join(config.getpath("data"), "rstdt", "wsj", "train"))
    rstdt_test_filenames = os.listdir(
        os.path.join(config.getpath("data"), "rstdt", "wsj", "test"))
    rstdt_train_filenames = [
        n for n in rstdt_train_filenames if n.endswith(".edus.tokens")
    ]
    rstdt_test_filenames = [
        n for n in rstdt_test_filenames if n.endswith(".edus.tokens")
    ]
    rstdt_train_filenames = [
        n[:-len(".edus.tokens")] for n in rstdt_train_filenames
    ]
    rstdt_test_filenames = [
        n[:-len(".edus.tokens")] for n in rstdt_test_filenames
    ]
    assert len(rstdt_train_filenames) == 347
    assert len(rstdt_test_filenames) == 38

    if args.outside_rstdt:
        # Prepare the target directory: /path/to/data/ptbwsj_wo_rstdt
        utils.mkdir(os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt"))

    sections = os.listdir(config.getpath("ptbwsj"))
    sections.sort()

    count = 0
    for section in pyprind.prog_bar(sections):
        # File names of articles in PTB-WSJ
        filenames = os.listdir(os.path.join(config.getpath("ptbwsj"),
                                            section))
        filenames = [n for n in filenames if n.startswith("wsj_")]
        filenames.sort()
        for filename in filenames:
            # Read the text of the article, falling back to Latin-1 when it
            # is not valid UTF-8.
            try:
                lines = utils.read_lines(
                    os.path.join(config.getpath("ptbwsj"), section, filename),
                    process=lambda line: line)
            except UnicodeDecodeError:
                lines = []
                for line in codecs.open(
                        os.path.join(config.getpath("ptbwsj"), section,
                                     filename), "r", "latin-1"):
                    lines.append(line.strip())

            # Remove the ".START" markers
            assert lines[0] == ".START"
            lines = lines[1:]
            for i in range(len(lines)):
                lines[i] = lines[i].replace(".START", "")
                lines[i] = " ".join(lines[i].split())

            # Remove the beginning empty lines
            top_empty_count = 0
            for line_i in range(len(lines)):
                if lines[line_i] == "":
                    top_empty_count += 1
                else:
                    break
            lines = lines[top_empty_count:]

            # Tokenization
            tokenized_lines = []
            for line in lines:
                if line == "":
                    tokens = ""
                else:
                    doc = nlp(line)
                    tokens = [token.text for token in doc]
                    tokens = " ".join(tokens)
                tokenized_lines.append(tokens)

            if args.inside_rstdt:
                if filename in rstdt_train_filenames:
                    # File inside the RST-DT training set
                    utils.write_lines(
                        os.path.join(config.getpath("data"), "rstdt", "wsj",
                                     "train", filename + ".doc.tokens"),
                        tokenized_lines)
                    count += 1
                elif filename in rstdt_test_filenames:
                    # File inside the RST-DT test set
                    utils.write_lines(
                        os.path.join(config.getpath("data"), "rstdt", "wsj",
                                     "test", filename + ".doc.tokens"),
                        tokenized_lines)
                    count += 1
                else:
                    continue
            else:
                if filename in rstdt_train_filenames:
                    continue
                elif filename in rstdt_test_filenames:
                    continue
                else:
                    # File outside RST-DT
                    utils.write_lines(
                        os.path.join(config.getpath("data"),
                                     "ptbwsj_wo_rstdt",
                                     filename + ".doc.tokens"),
                        tokenized_lines)
                    count += 1

    print("Processed %d files." % count)