def generate_answers(config, model, processor, qn_uuid_data, context_token_data, qn_token_data):
    uuid2ans = {}  # maps uuid to string containing predicted answer
    data_size = len(qn_uuid_data)
    num_batches = ((data_size - 1) // config.batch_size) + 1
    batch_num = 0
    detokenizer = MosesDetokenizer()
    print("Generating answers...")

    for batch in get_batch_generator(processor.word2id, qn_uuid_data,
                                     context_token_data, qn_token_data,
                                     config.batch_size, config.context_len,
                                     config.question_len):

        # Get the predicted spans
        pred_start_batch, pred_end_batch = processor.test_one_batch(batch, model)

        # Convert pred_start_batch and pred_end_batch to lists of length batch_size
        pred_start_batch = pred_start_batch.tolist()
        pred_end_batch = pred_end_batch.tolist()

        # For each example in the batch:
        for ex_idx, (pred_start, pred_end) in enumerate(
                zip(pred_start_batch, pred_end_batch)):

            # Original context tokens (no UNKs or padding) for this example
            context_tokens = batch.context_tokens[ex_idx]  # list of strings

            # Check the predicted span is in range
            assert pred_start in range(len(context_tokens))
            assert pred_end in range(len(context_tokens))

            # Predicted answer tokens
            pred_ans_tokens = context_tokens[pred_start:pred_end + 1]  # list of strings

            # Detokenize and add to dict
            uuid = batch.uuids[ex_idx]
            uuid2ans[uuid] = detokenizer.detokenize(pred_ans_tokens, return_str=True)

        batch_num += 1
        if batch_num % 10 == 0:
            print("Generated answers for %i/%i batches = %.2f%%" %
                  (batch_num, num_batches, batch_num * 100.0 / num_batches))

    print("Finished generating answers for dataset.")
    return uuid2ans
def main(args):
    detok = MosesDetokenizer("fi")
    with open(args.infile, "r") as fi, open(args.outfile, "w") as fo:
        for line in fi:
            data = json.loads(line.strip())
            # Re-join Moses-tokenized text and undo HTML escaping
            data["text"] = html.unescape(detok(data["text"].split()))
            fo.write(json.dumps(data, ensure_ascii=False) + "\n")
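# Usage sketch for main() above; the argparse wiring is hypothetical (the
# snippet only assumes args.infile and args.outfile exist). A JSONL record
# like {"text": "Hyvää huomenta !"} comes out as {"text": "Hyvää huomenta!"},
# since mosestokenizer's MosesDetokenizer is called directly on the token
# list and reattaches punctuation.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("infile")
    parser.add_argument("outfile")
    main(parser.parse_args())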
def clean_txt(filename):
    with open(filename, 'rt') as file:
        text = file.read()

    # split into word tokens
    tokens = word_tokenize(text)

    # rebuild a plain string from the tokens
    with MosesDetokenizer('en') as detokenize:
        text_string = detokenize(tokens)

    # optional stemming:
    # snowball = SnowballStemmer("english", ignore_stopwords=True)
    # stemmed = [snowball.stem(word) for word in tokens]

    print(type(text_string))
    print(text_string)

    # stricter alternative pattern:
    # dates = re.findall(r'[A-Z][a-z]{1,8}\s\d{1,3}([a-z]{1,3})?,\s\d{2,4}', text_string)
    dates = re.findall(r"[a-zA-Z]{4,9} \d+", text_string)
    print(dates)
def __init__(self, server, servable_name, t2t_usr_dir, problem, data_dir,
             timeout_secs):
    super(EnZhNmtClient, self).__init__()
    tf.logging.set_verbosity(tf.logging.INFO)
    validate_flags(server, servable_name)
    usr_dir.import_usr_dir(t2t_usr_dir)
    self.problem = registry.problem(problem)
    self.hparams = tf.contrib.training.HParams(
        data_dir=os.path.expanduser(data_dir))
    self.problem.get_hparams(self.hparams)
    self.request_fn = make_request_fn(server, servable_name, timeout_secs)
    self.moses_tokenizer = MosesTokenizer('en')
    self.moses_detokenizer = MosesDetokenizer('zh')
    if problem.endswith("_rev"):
        fname = "targets"
    else:
        fname = "inputs" if self.problem.has_inputs else "targets"
    self.input_encoder = self.problem.feature_info[fname].encoder
    if problem.endswith("_rev"):
        self.output_decoder = self.problem.feature_info["inputs"].encoder
    else:
        self.output_decoder = self.problem.feature_info["targets"].encoder
def add_trade_mark_sign(string):
    # Append U+2122 to every token longer than five characters
    data = [
        word + '\N{TRADE MARK SIGN}' if len(word) > 5 else word
        for sent in sent_tokenize(string) for word in word_tokenize(sent)
    ]
    with MosesDetokenizer() as detokenize:
        res = detokenize(data)
    return res
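# Quick check for add_trade_mark_sign (needs NLTK's punkt data; exact
# spacing may vary slightly across detokenizer versions):
#
#   >>> add_trade_mark_sign("Short words pass, longer tokens acquire it.")
#   'Short words pass, longer™ tokens™ acquire™ it.'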
def detok_copy(self) -> Any:
    logging.warning("Creating a Moses-detokenized copy of this corpus!")
    detokenize_inner = MosesDetokenizer('en')
    detokenize = lambda x: detokenize_inner(x.split())
    corpus = Corpus()
    for utt_id, sent in self.items():
        corpus[utt_id] = detokenize(sent)
    return corpus
def get_detokenized_target(self, trg, batch_size):
    targets = []
    with MosesDetokenizer(self.trg_lang) as detok:
        for i in range(batch_size):
            # Convert the i-th sentence from token ids back to text,
            # then apply Moses detokenization
            t = self.tokenizer.detokenize(trg[:, i].tolist())
            t = detok(t.split())
            targets.append(t)
    return targets
def main(system_outputs_folder, clustered_outputs_folder, output_file):
    random.seed(37)
    detokenize = MosesDetokenizer('en')
    inputs, preds, scores, systems = load_directory(system_outputs_folder,
                                                    clustered_outputs_folder,
                                                    detokenize)
    rows = make_rows(inputs, preds, scores, systems)
    output_csv(rows, output_file)
def __init__(self):
    tf.logging.set_verbosity(tf.logging.INFO)
    validate_flags()
    usr_dir.import_usr_dir(FLAGS.t2t_usr_dir)
    self.problem = registry.problem(FLAGS.problem)
    self.hparams = tf.contrib.training.HParams(
        data_dir=os.path.expanduser(FLAGS.data_dir))
    self.problem.get_hparams(self.hparams)
    self.request_fn = make_request_fn()
    self.moses_detokenizer = MosesDetokenizer('en')
def main(opt):
    if not os.path.exists(opt.output_dir):
        os.makedirs(opt.output_dir)
    bc = BertClient()
    detokenize = MosesDetokenizer('en')
    for json_file in glob.glob(os.path.join(opt.input_dir, '*.json')):
        out_json_file = os.path.join(opt.output_dir,
                                     os.path.basename(json_file))
        # Check to make sure the output file doesn't already exist
        if os.path.isfile(out_json_file):
            print("SKIPPING: " + json_file)
            continue
        with open(json_file, 'r') as f:
            try:
                experiment = json.load(f)
                print('Processing ' + json_file)
            except Exception:
                print('Error processing ' + json_file)
                print('Skipping it.')
                continue
        for ex_num, example in enumerate(experiment):
            if ex_num % 10 == 0:
                print("Clustering output: " + str(ex_num))
            candidates = example['pred']
            scores = example['scores']
            candidates, scores = remove_duplicates(candidates, scores)
            if opt.method == 'kmeans':
                candidates, scores = kmeans_filtering(
                    candidates, scores, opt.num_cands, True, bc, detokenize)
            elif opt.method == 'distance':
                candidates, scores = distance_filtering(
                    candidates, scores, opt.num_cands, False, bc, detokenize)
            elif opt.method == 'kmeans_mod':
                candidates, scores = kmeans_mod_filtering(
                    candidates, scores, opt.num_cands, True, bc, detokenize)
            else:
                raise ValueError('Not a valid filtering method')
            example['pred'] = candidates
            example['scores'] = scores
        with open(out_json_file, 'w') as f:
            json.dump(experiment, f)
def __init__(self, pos, config, config_global, logger, entity_linker_name: str):
    super(BaseEntityLinkingStage, self).__init__(pos, config, config_global, logger)
    self._entity_linker_name = entity_linker_name
    self._entity_linker_cache = self._provide_cache(
        self._entity_linker_name, human_readable=False)

    # note: we are not using NLTK TreebankWordDetokenizer here, because that one replaces double quotes with two
    # single quotes, which makes mappings between the tokenized and detokenized strings needlessly complicated
    self._detokenizer = MosesDetokenizer("en")
def __init__(self, pos, config, config_global, logger):
    super(SentenceBertEmbeddingFeaturePreparationStage, self).__init__(
        pos, config, config_global, logger)
    self._pretrained_model_name = config["pretrained_model_name"]
    self._cache = self._provide_cache("sentence_bert", bind_parameters=config)

    # note: we are not using NLTK TreebankWordDetokenizer here, because that one replaces double quotes with two
    # single quotes, which makes mappings between the tokenized and detokenized strings needlessly complicated
    self._detokenizer = MosesDetokenizer("en")
def __init__(self):
    tf.logging.set_verbosity(tf.logging.INFO)
    validate_flags()
    usr_dir.import_usr_dir(FLAGS.t2t_usr_dir)
    self.problem = registry.problem(FLAGS.problem)
    self.hparams = tf.contrib.training.HParams(
        data_dir=os.path.expanduser(FLAGS.data_dir))
    self.problem.get_hparams(self.hparams)
    self.request_fn = make_request_fn()
    self.tokenizer = MosesTokenizer('en')
    self.moses_detokenizer = MosesDetokenizer('zh')
    # Sentence-boundary regex; a raw string so \w and \. are regex escapes
    self.delimiter = re.compile(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s")
def make_human_readable(files: list, detokenize: bool):
    if detokenize:
        detokenizer = MosesDetokenizer("en")
    print("Postprocessing on {} files...".format(len(files)))
    for file in files:
        print("Working on: {}".format(file))
        with open(file, "r") as fin, open(file + ".human_readable", "w") as fout:
            for line in fin:
                cleanline = strip_chars(line)
                if detokenize:
                    cleanline = detokenizer(cleanline.strip().split())
                fout.write("{}\n".format(cleanline))
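# Usage sketch (strip_chars comes from the same module; the glob pattern is
# hypothetical). Each input file gets a ".human_readable" sibling with Moses
# detokenization applied line by line:
#
#   make_human_readable(glob.glob("outputs/*.txt"), detokenize=True)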
def main(test_file, output_file, lang):
    lang = lang.split('_')[0]  # e.g. "en_GB" -> "en"
    detokenize = MosesDetokenizer(lang)
    sent_id = 1
    with open(output_file, 'w+') as out:
        for line in open(test_file):
            if line.startswith('# text ='):
                line = line.split('=', 1)[1].strip()
                words = line.split()
                out_text = detokenize(words)
                out.write(f'# sent_id = {sent_id}\n# text = {out_text}\n\n')
                sent_id += 1
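# Effect on a CoNLL-U style input: a tokenized "# text =" line such as
#
#   # text = Hello , world !
#
# is rewritten as
#
#   # sent_id = 1
#   # text = Hello, world!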
def moses_detokenize(self, inp: Path, out: Path, col=0, lang='en', post_op=None):
    log.info(f"detok : {inp} --> {out}")
    tok_lines = IO.get_lines(inp, col=col, line_mapper=lambda x: x.split())
    with MosesDetokenizer(lang=lang) as detok:
        detok_lines = (detok(tok_line) for tok_line in tok_lines)
        if post_op:
            detok_lines = (post_op(line) for line in detok_lines)
        IO.write_lines(out, detok_lines)
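# Usage sketch (hypothetical paths; IO is the same codebase's helper).
# post_op chains one extra transformation after detokenization, e.g. HTML
# unescaping:
#
#   self.moses_detokenize(Path("hyp.tok.en"), Path("hyp.detok.en"),
#                         lang='en', post_op=html.unescape)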
def main(args):
    splits = MosesSentenceSplitter('fi')
    detok = MosesDetokenizer("fi")
    with open(args.infile, "r") as fi, open(args.outfile, "w") as fo:
        for line in fi:
            data = json.loads(line.strip())
            # Undo Moses tokenization first if the corpus was stored tokenized
            text = (html.unescape(detok(data["text"].split()))
                    if args.moses_tokenized else data["text"])
            sents = splits([text])
            for i, s in enumerate(sents):
                d = data.copy()
                d["text"] = s
                if "id" in d:
                    d["id"] = d["id"] + f"-s{i}"
                fo.write(json.dumps(d, ensure_ascii=False) + "\n")
class MosesTokenizer(Tokenizer):
    def __init__(self):
        super().__init__()
        self._tokenizer = NLTKMosesTokenizer()
        self._detokenizer = MosesDetokenizer()

    def tokenize(self, sentence):
        return self._tokenizer.tokenize(sentence)

    def detokenize(self, tokens):
        """Unescape Moses punctuation tokens.

        Replaces escape sequences like ``&#91;`` with the original
        characters (such as '['), so they better align to the original text.
        """
        return [self._detokenizer.unescape_xml(t) for t in tokens]
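# Usage sketch for the wrapper above. Note that detokenize() here only
# unescapes Moses' XML entities token by token; it deliberately does not
# re-join the tokens into a string:
#
#   tok = MosesTokenizer()
#   tokens = tok.tokenize("a [bracketed] word")  # '[' is escaped to '&#91;'
#   tok.detokenize(tokens)                       # brackets restored per token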
def main(system_outputs_folder, clustered_outputs_folder, outputs_folder_100,
         input_file, gold_output_file, output_file):
    random.seed(37)
    detokenize = MosesDetokenizer('en')

    # Gets predicted responses from all systems
    inputs, preds, scores, systems = load_directory(system_outputs_folder,
                                                    clustered_outputs_folder,
                                                    outputs_folder_100,
                                                    detokenize)

    # Gets gold responses
    gold_dict = get_gold_responses(input_file, gold_output_file, detokenize)

    # Formats responses into rows for the MTurk experiment
    rows = make_rows(inputs, preds, scores, systems, gold_dict)
    output_csv(rows, output_file)
def __init__(self, srclang, targetlang, sourcebpe=None, targetbpe=None,
             sourcespm=None, targetspm=None):
    self.bpe_source = None
    self.bpe_target = None
    self.sp_processor_source = None
    self.sp_processor_target = None
    self.sentences = []

    # load BPE models for pre-processing
    if sourcebpe:
        # print("load BPE codes from " + sourcebpe, flush=True)
        with open(sourcebpe, 'r', encoding="utf-8") as BPEcodes:
            self.bpe_source = BPE(BPEcodes)
    if targetbpe:
        # print("load BPE codes from " + targetbpe, flush=True)
        with open(targetbpe, 'r', encoding="utf-8") as BPEcodes:
            self.bpe_target = BPE(BPEcodes)

    # load SentencePiece models for pre-processing
    if sourcespm:
        # print("load sentence piece model from " + sourcespm, flush=True)
        self.sp_processor_source = sentencepiece.SentencePieceProcessor()
        self.sp_processor_source.Load(sourcespm)
    if targetspm:
        # print("load sentence piece model from " + targetspm, flush=True)
        self.sp_processor_target = sentencepiece.SentencePieceProcessor()
        self.sp_processor_target.Load(targetspm)

    # pre- and post-processing tools
    self.tokenizer = None
    self.detokenizer = None
    # TODO: should we have support for other sentence splitters?
    self.sentence_splitter = MosesSentenceSplitter(srclang)
    self.normalizer = MosesPunctuationNormalizer(srclang)
    if self.bpe_source:
        self.tokenizer = MosesTokenizer(srclang)
        self.detokenizer = MosesDetokenizer(targetlang)
def translate(self, src, trg):
    """Given source and target tokenized tensors, returns the detokenized
    translations from the model as well as the detokenized targets.

    Args:
        src: tuple of (source tensor, source lengths)
        trg: tuple of (target tensor, target lengths)

    Returns:
        (output, targets): lists of detokenized hypothesis and reference strings
    """
    src, src_len = src
    trg, trg_len = trg
    device = next(self.model.parameters()).device
    batch_size = src.shape[1]
    bos = [self.insert_target_start] * (batch_size * self.beam_size)
    bos = torch.tensor(bos, dtype=torch.int64, device=device).view(1, -1)

    if self.beam_size == 1:
        generator = self.generator.greedy_search
    else:
        generator = self.generator.beam_search

    with torch.no_grad():
        context = self.model.encode(src, src_len)
        context = [context, src_len, None]
        preds, lengths, counter = generator(batch_size, bos, context)

    preds = preds.cpu()
    targets = self.get_detokenized_target(trg, batch_size)

    output = []
    with MosesDetokenizer(self.trg_lang) as detokenizer:
        for pred in preds:
            pred = pred.tolist()
            detok = self.tokenizer.detokenize(pred)
            detok = detokenizer(detok.split())
            output.append(detok)
    return output, targets
def stopwords_e_pontuacao(self, instancia):
    # tokenize with nltk
    instancia = instancia.split()

    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    instancia = [w.translate(table) for w in instancia]

    # keep only purely alphabetic tokens
    instancia = [word for word in instancia if word.isalpha()]

    # filter out stopwords
    stopwords = nltk.corpus.stopwords.words('portuguese') + [
        'aqui', 'a', 'rs', 'é', '/', 'fdp', '%', 'pfvr', 'cadê', 'né', 'q',
        'pq', '#', '@', 'mt', 'youtube', 'hj', 'dnv', 'mto', 'vc', 'eh',
        'r$', 'rt', 'via', 'vía'
    ]
    stopwords.remove("não")
    instancia = [w for w in instancia if w not in stopwords]

    # detokenize (needed to build a TextBlob object from the result)
    with MosesDetokenizer('pt') as detokenize:
        instancia = detokenize(instancia)
    return instancia
def detokenize(wordsOrSentences, joinSentences=True, logger=None, verbose=True):
    global multiReplacerSingleton
    wordsOrSentences = copy.deepcopy(wordsOrSentences)
    words = wordsOrSentences
    if multiReplacerSingleton is None:
        # extra fix-ups the Moses detokenizer does not handle
        repls = {
            # " ,": ",",
            # " .": ".",
            # " ?": "?",
            # " !": "!",
            # " )": ")",
            # "( ": "(",
            # " :": ":",
            # " '": "'",
            " n't": "n't",
        }
        multiReplacerSingleton = MultiReplacer(repls)

    with MosesDetokenizer('en') as detokenizeSingleton:

        def __detokenizeWords(words):
            text = detokenizeSingleton(words)
            text = multiReplacerSingleton.replace(text)
            return text

        if words is None or len(words) == 0:
            return ""
        if isinstance(words[0], list):
            # a list of sentences, each a list of words
            sentences = words
            for i in range(len(sentences)):
                words = sentences[i]
                text = __detokenizeWords(words)
                sentences[i] = text
            if joinSentences:
                return "\n".join(sentences)
            else:
                return sentences
        elif isinstance(words[0], str):
            # a flat list of words
            return __detokenizeWords(words)
        else:
            logError("words[0] must be either a list (so words are sentences) or a str",
                     logger, verbose=verbose)
            return None
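# Usage sketch (MultiReplacer and logError come from the same codebase):
#
#   >>> detokenize(["I", "do", "n't", "know", "."])
#   "I don't know."
#   >>> detokenize([["Hello", "world", "!"], ["Bye", "."]])
#   'Hello world!\nBye.'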
def get_embs(candidates, normalize=False):
    """Returns the sequence embedding for each candidate."""
    bc = BertClient()
    detokenize = MosesDetokenizer('en')
    detoked_cands = []
    for i, cand in enumerate(candidates):
        detoked = detokenize(cand)
        detoked_cands.append(detoked)
        if len(detoked) == 0:
            # flag empty candidates, which the BERT server cannot encode
            print(i)
            print(cand)
            print(detoked)
    embs = bc.encode(detoked_cands)
    if normalize:
        embs = [e / np.linalg.norm(e) for e in embs]
    # This line is necessary depending on how the BERT server is set up.
    # embs = [np.mean(emb, 0) for emb in embs]
    return embs
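# Usage sketch: requires a running bert-serving-server instance reachable by
# BertClient; each candidate is a list of Moses tokens.
#
#   embs = get_embs([["hello", "there", "!"], ["how", "are", "you", "?"]],
#                   normalize=True)
#   # -> one unit-norm vector per candidate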
def random_inflect(source: str, inflection_counts: Dict[str, int] = None) -> str:
    have_inflections = {'NOUN', 'VERB', 'ADJ'}
    # Tokenize the sentence
    tokenized = MosesTokenizer(lang='en').tokenize(source)
    # Remember and strip sentence-initial capitalization
    upper = False
    if tokenized[0][0].isupper():
        upper = True
        tokenized[0] = tokenized[0].lower()
    # POS tag words in the sentence
    pos_tagged = nltk.pos_tag(tokenized, tagset='universal')
    for i, word in enumerate(tokenized):
        lemmas = lemminflect.getAllLemmas(word)
        # Only operate on content words (nouns/verbs/adjectives)
        if lemmas and pos_tagged[i][1] in have_inflections and pos_tagged[i][1] in lemmas:
            lemma = lemmas[pos_tagged[i][1]][0]
            inflections = (i, [
                (tag, infl)
                for tag, tup in lemminflect.getAllInflections(
                    lemma, upos=pos_tagged[i][1]).items()
                for infl in tup
            ])
            if inflections[1]:
                # Use the inflection distribution for weighted random sampling
                # if specified; otherwise sample uniformly
                if inflection_counts:
                    counts = [inflection_counts[tag] for tag, infl in inflections[1]]
                    inflection = random.choices(inflections[1], weights=counts)[0][1]
                else:
                    inflection = random.choices(inflections[1])[0][1]
                tokenized[i] = inflection
    if upper:
        tokenized[0] = tokenized[0].title()
    return MosesDetokenizer(lang='en').detokenize(tokenized)
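# Usage sketch for random_inflect (sacremoses tokenizer API; output varies
# run to run because inflections are sampled at random):
#
#   random.seed(0)
#   random_inflect("The dogs bark loudly.")
#   # e.g. -> 'The dog barked loudly.'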
import copy
import logging
from typing import Sequence, Union

import torch
from allennlp.common.checks import ConfigurationError
from allennlp.common.params import Params
from allennlp.modules.seq2seq_encoders.seq2seq_encoder import Seq2SeqEncoder
from allennlp.nn.util import device_mapping, masked_softmax
from mosestokenizer import MosesDetokenizer
from torch.autograd import Variable
from torch.nn import Dropout, Linear, Parameter, init

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name

SOS_TOK, EOS_TOK = "<SOS>", "<EOS>"

# Note: using the full 'detokenize()' method is not recommended, since it does
# a poor job of adding correct whitespace. Use unescape_xml() only.
_MOSES_DETOKENIZER = MosesDetokenizer()


def copy_iter(elems):
    """Simple iterator yielding copies of elements."""
    for elem in elems:
        yield copy.deepcopy(elem)


def wrap_singleton_string(item: Union[Sequence, str]):
    """ Wrap a single string as a list. """
    if isinstance(item, str):
        # Can't check if iterable, because a string is an iterable of
        # characters, which is not what we want.
        return [item]
    return item
def main(opt):
    bc = BertClient()
    detokenize = MosesDetokenizer('en')
    all_results = {}
    for json_file in glob.glob(os.path.join(opt.dir, '*.json')):
        with open(json_file, 'r') as f:
            try:
                experiment = json.load(f)
                print('Processing ' + json_file)
            except Exception:
                print('Error processing ' + json_file)
                print('Skipping it.')
                continue  # skip files that fail to parse
        exp_name = os.path.basename(json_file).replace('.json', '')
        eval_results = []
        for example in experiment['results']:
            candidates = example['pred']
            ex_results = {}
            ex_results['dist_from_mean_emb'] = eval_emb_stats(candidates, bc, detokenize)
            ex_results['num_distinct_1grams'] = eval_distinct_k(candidates, 1)
            ex_results['num_distinct_2grams'] = eval_distinct_k(candidates, 2)
            ex_results['entropy_2grams'] = eval_entropy(candidates, 2)
            ex_results['entropy_4grams'] = eval_entropy(candidates, 4)
            min_edit, mean_edit, max_edit = eval_edit_distance(candidates)
            ex_results['min_edit_distance'] = min_edit
            ex_results['mean_edit_distance'] = mean_edit
            ex_results['max_edit_distance'] = max_edit
            eval_results.append(ex_results)
        all_results[exp_name] = {
            'ex_results': eval_results,
            'perplexity': experiment['ppl'],
            'score': experiment['score']
        }

    per_experiment_keys = ['perplexity', 'score']
    per_example_keys = list(all_results[exp_name]['ex_results'][0].keys())
    outfile = os.path.join(opt.dir, 'results.csv')
    with open(outfile, 'w') as csv_file:
        fieldnames = ['exp'] + per_experiment_keys + per_example_keys
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        for exp_name, results in all_results.items():
            csv_line = {'exp': exp_name}
            for key in per_experiment_keys:
                csv_line[key] = results[key]
            for key in per_example_keys:
                # drop NaN entries before averaging
                csv_line[key] = np.mean([
                    r[key] for r in results['ex_results']
                    if not np.isnan(r[key])
                ])
            writer.writerow(csv_line)
    print('Evaluation results written to %s' % outfile)
def __init__(self, system_id, apply_disc=False):
    super().__init__(system_id, apply_disc=apply_disc)
    self.keyword_model = os.environ.get(
        "CWC_KEYWORD_MODEL_" + system_id.upper(),
        self.model_folder + "/ROC_title_keyword_e500_h1000_edr0.4_hdr0.1_511_lr10.pt")
    self.keyword_vocab = os.environ.get(
        "CWC_KEYWORD_VOCAB_" + system_id.upper(),
        self.model_folder + "/ROC_title_keyword_e500_h1000_edr0.4_hdr0.1_511_lr10.pkl")
    self.story_model = os.environ.get(
        "CWC_STORY_MODEL_" + system_id.upper(),
        self.model_folder + "/ROC_title_key_story_e1000_h1500_edr0.2_hdr0.1_511_lr10.pt")
    self.story_vocab = os.environ.get(
        "CWC_STORY_VOCAB_" + system_id.upper(),
        self.model_folder + "/ROC_title_key_story_e1000_h1500_edr0.2_hdr0.1_511_lr10.pkl")
    self.scorers_config = os.environ.get(
        "CWC_SCORERS_CONFIG_" + system_id.upper(),
        self.model_folder + "/scorer_weights_abl.tsv")
    self.gold_titles = os.environ.get(
        "CWC_GOLD_TITLES_" + system_id.upper(),
        self.data_folder + "/ROCStories_all_merge_tokenize.titlesepkey.all")
    torch.manual_seed(self.torch_seed)

    # Load models and vocab dictionaries, init stopping symbols for generation
    self.kw_model = load_model(self.keyword_model, self.use_cuda)
    self.st_model = load_model(self.story_model, self.use_cuda)
    self.kw_dict = load_pickle(self.keyword_vocab)
    self.st_dict = load_pickle(self.story_vocab)
    self.kw_vocab_size = len(self.kw_dict)
    self.st_vocab_size = len(self.st_dict)
    self.st_eos_id = self.st_dict.word2idx[self.story_end]
    self.st_unk_id = self.st_dict.word2idx[self.story_unk]
    # self.kw_eos_id = self.kw_dict.word2idx[self.story_end]  # this is clearly wrong but seems to not be used ever
    self.kw_eot_id = self.kw_dict.word2idx[self.title_end]
    self.kw_end_id = self.kw_dict.word2idx[self.kw_end]
    self.kw_sep_id = self.kw_dict.word2idx[self.kw_sep]
    self.st_sep_id = self.st_dict.word2idx[self.story_sep]
    # self.special_chars = [self.kw_end, self.story_end, self.kw_sep, self.story_sep, self.title_end]
    self.title2storyline = read_gold_storylines(self.gold_titles, self.title_end)
    self.special_chars = SPECIAL_CHARACTERS
    self.nlp = init_nlp_model()
    self.detokenizer = MosesDetokenizer('en')

    if self.apply_disc:
        print("%s: Using BeamRerankDecoder" % (self.system_id))
        scorers, coefs = read_scorers(self.scorers_config, self.use_cuda)
        self.decoder = BeamRerankDecoder(self.st_model,
                                         scorers,
                                         coefs,
                                         beam_size=self.beam_size,
                                         sep=self.st_sep_id,
                                         temperature=None,
                                         terms=[self.st_eos_id],
                                         forbidden=[self.st_unk_id, self.st_eos_id],
                                         use_cuda=self.use_cuda)
    else:
        print("%s: Using BeamSearchDecoder" % (self.system_id))
        self.decoder = BeamSearchDecoder(self.st_model,
                                         self.beam_size,
                                         self.st_eos_id,
                                         verbosity=False,
                                         dictionary=self.st_dict,
                                         sep=self.st_sep_id)
parser = argparse.ArgumentParser()
parser.add_argument('split_directory', type=str)
args = parser.parse_args()

file_paths = glob.glob(os.path.join(args.split_directory, '*.test.csv'))
assert len(file_paths) == 1
test_filepath = file_paths[0]

print(f'Read csv file from {test_filepath}')
test_df = pd.read_csv(test_filepath, encoding='utf-8')

en_tokenizer = MosesTokenizer()
en_detokenizer = MosesDetokenizer()

# Round-trip English through Moses to normalize spacing
test_df['en'] = test_df['en'].apply(
    lambda x: en_detokenizer(en_tokenizer(x)))
test_df['th'] = test_df['th'].apply(
    lambda x: ' '.join(th_word_space_tokenize(x))).apply(th_detokenize)

test_df[['en']].to_csv(os.path.join(args.split_directory, 'test.detok.en'),
                       encoding='utf-8',
                       sep="\t",
                       index=False,
                       header=False,
                       escapechar="",
                       quotechar="",
                       quoting=csv.QUOTE_NONE)
test_df[['th']].to_csv(os.path.join(args.split_directory, 'test.detok.th'),
                       encoding='utf-8',
                       sep="\t",
                       index=False,
                       header=False,
                       escapechar="",
                       quotechar="",
                       quoting=csv.QUOTE_NONE)
def __init__(self, dependency, nlp):
    self.detokenizer = MosesDetokenizer('en')
    self.dependency = dependency
    self.nlp = nlp