def create_batches(sentences, src2dst_dict, text_processor: TextProcessor, resume_index=0, end_index=-1):
    print(len(src2dst_dict))
    print("Getting batches...")
    index = 0
    for sid in src2dst_dict.keys():
        index += 1
        if index >= end_index and end_index > 0:
            break
        if index <= resume_index:
            continue
        tids = list(src2dst_dict[sid])
        source_tokenized = torch.LongTensor(tok_sen(sentences[sid]))
        trans_cands = list(map(lambda i: torch.LongTensor(tok_sen(sentences[i])), tids))
        candidates = pad_sequence(trans_cands, batch_first=True, padding_value=text_processor.pad_token_id())
        target_langs = list(map(lambda i: text_processor.lang_id(sentences[i].strip().split(" ")[0]), tids))
        src_lang = torch.LongTensor([text_processor.lang_id(sentences[sid].strip().split(" ")[0])])
        yield sid, source_tokenized, torch.LongTensor(tids), candidates, src_lang, torch.LongTensor(target_langs)
def test_data(self):
    path_dir_name = os.path.dirname(os.path.realpath(__file__))
    data_path = os.path.join(path_dir_name, "sample.txt")
    with tempfile.TemporaryDirectory() as tmpdirname:
        processor = TextProcessor()
        processor.train_tokenizer([data_path], vocab_size=1000, to_save_dir=tmpdirname,
                                  languages={"<mzn>": 0, "<glk>": 1})
        create_batches.write(text_processor=processor, cache_dir=tmpdirname, seq_len=512,
                             txt_file=data_path, sen_block_size=10)
        dataset = TextDataset(save_cache_dir=tmpdirname, max_cache_size=3)
        assert dataset.line_num == 70

        dataset.__getitem__(3)
        assert len(dataset.current_cache) == 3

        dataset.__getitem__(9)
        assert len(dataset.current_cache) == 3

        dataset.__getitem__(69)
        assert len(dataset.current_cache) == 2
def __init__(self, text_processor: TextProcessor, config: BertConfig = None, encoder: BertModel = None,
             enc_layer: int = 6, embed_dim: int = 768, intermediate_dim: int = 3072):
    super(LM, self).__init__()
    self.text_processor: TextProcessor = text_processor

    if config is not None:
        self.config = config
    else:
        self.config = lm_config.get_config(vocab_size=text_processor.tokenizer.get_vocab_size(),
                                           pad_token_id=text_processor.pad_token_id(),
                                           bos_token_id=text_processor.bos_token_id(),
                                           eos_token_id=text_processor.sep_token_id(),
                                           enc_layer=enc_layer, embed_dim=embed_dim,
                                           intermediate_dim=intermediate_dim)
        self.config["type_vocab_size"] = len(text_processor.languages)
        self.config = BertConfig(**self.config)

    self.masked_lm = BertOutputLayer(self.config)
    if encoder is None:
        self.encoder: BertModel = BertModel(self.config)
        self.encoder.init_weights()
    else:
        self.encoder = encoder
    self.encoder._tie_or_clone_weights(self.masked_lm.decoder, self.encoder.embeddings.word_embeddings)
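# A hedged sketch (the field values below are assumptions, not the repo's lm_config defaults) of the config
# handling done in LM.__init__ above: build a plain dict, inject type_vocab_size, then wrap it in a
# transformers BertConfig.
from transformers import BertConfig

config_dict = dict(vocab_size=1000, pad_token_id=0, bos_token_id=1, eos_token_id=2,
                   num_hidden_layers=6, hidden_size=768, intermediate_size=3072)
config_dict["type_vocab_size"] = 2          # e.g. one token-type id per language tag
config = BertConfig(**config_dict)
print(config.type_vocab_size, config.num_hidden_layers)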
def mask_text(mask_prob, pads, texts, text_processor: TextProcessor, mask_eos: bool = True):
    assert 0 < mask_prob < 1
    mask = torch.empty(texts.size()).uniform_(0, 1) < mask_prob
    mask[~pads] = False  # We should not mask pads.
    if not mask_eos:
        eos_idx = texts == text_processor.sep_token_id()
        mask[eos_idx] = False  # We should not mask end-of-sentence (usually in case of BART training).

    masked_ids = texts[mask]
    random_index = lambda: random.randint(len(text_processor.special_tokens), text_processor.vocab_size() - 1)
    rand_select = lambda r, c: text_processor.mask_token_id() if r < 0.8 else (
        random_index() if r < 0.9 else int(masked_ids[c]))
    replacements = list(map(lambda i: rand_select(random.random(), i), range(masked_ids.size(0))))
    texts[mask] = torch.LongTensor(replacements)
    return mask, masked_ids, texts
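# A minimal, self-contained sketch of the same 80%/10%/10% replacement policy that mask_text applies above,
# with hypothetical stand-ins (pad_id, mask_id, num_special, vocab_size) instead of a real TextProcessor;
# it only illustrates the policy and is not the repository's implementation.
import random

import torch

pad_id, mask_id, num_special, vocab_size = 0, 4, 5, 100   # hypothetical token-id layout

texts = torch.randint(num_special, vocab_size, (2, 8))    # toy batch of token ids
texts[:, -2:] = pad_id                                    # pretend the last two positions are padding
pads = texts != pad_id                                    # True where a real (non-pad) token sits

mask_prob = 0.15
mask = torch.empty(texts.size()).uniform_(0, 1) < mask_prob
mask[~pads] = False                                       # never mask padding positions

masked_ids = texts[mask]                                  # original ids kept as the MLM targets
replacements = []
for i in range(masked_ids.size(0)):
    r = random.random()
    if r < 0.8:                                           # 80%: replace with the <mask> id
        replacements.append(mask_id)
    elif r < 0.9:                                         # 10%: replace with a random non-special token
        replacements.append(random.randint(num_special, vocab_size - 1))
    else:                                                 # 10%: keep the original token
        replacements.append(int(masked_ids[i]))
texts[mask] = torch.LongTensor(replacements)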
def test_albert_init(self):
    path_dir_name = os.path.dirname(os.path.realpath(__file__))
    data_path = os.path.join(path_dir_name, "sample.txt")
    with tempfile.TemporaryDirectory() as tmpdirname:
        processor = TextProcessor()
        processor.train_tokenizer([data_path], vocab_size=1000, to_save_dir=tmpdirname, languages={"<en>": 0})
        lm = LM(text_processor=processor)
        assert lm.encoder.base_model.embeddings.word_embeddings.num_embeddings == 1000

        lm.save(tmpdirname)
        new_lm = LM.load(tmpdirname)
        assert new_lm.config == lm.config
def write(text_processor: TextProcessor, output_file: str, txt_file: str, output_txt: bool = False):
    with open(txt_file, "r") as fp, open(output_file, "w") as writer:
        for line in fp:
            if len(line.strip()) == 0:
                continue
            tok_line = text_processor.tokenize_one_line(line.strip(), ignore_middle_eos=True)
            if output_txt:
                tokenized = [text_processor.id2token(tok) for tok in tok_line][1:-1]
                tokenized = list(map(lambda tok: tok if tok != "<unk>" else "unk", tokenized))
            else:
                tokenized = [str(tok) for tok in tok_line]
            writer.write(" ".join(tokenized) + "\n")
def write(text_processor: TextProcessor, output_file: str, input_file: str, max_len: int, sample_size: int):
    with open(input_file, "r") as r:
        obj = json.load(r)
    annotations = obj["annotations"]
    captions = list(map(lambda annotation: caption_data(annotation), annotations))
    print(len(captions))

    skipped_long_sens = 0
    image_path_dict, unique_images = dict(), dict()
    tok_captions = {}
    image_ids = {}
    for ci, c in enumerate(captions):
        if ci % 1000 == 0:
            print(ci, "/", len(captions), "->", len(tok_captions), len(unique_images), end="\r")
        tok_sen = text_processor.tokenize_one_sentence(c[1])
        if len(tok_sen) > max_len:
            skipped_long_sens += 1
            continue

        path = c[0]
        if path not in image_path_dict:
            image_id = len(unique_images)
            unique_images[image_id] = path
            image_path_dict[path] = image_id
        else:
            image_id = image_path_dict[path]

        caption_id = len(tok_captions)
        tok_captions[caption_id] = tok_sen
        image_ids[caption_id] = image_id

        if (ci + 1) >= sample_size and sample_size > 0:
            break

    print("Skipped long sentences:", skipped_long_sens, "from", len(captions))
    tok_captions_sorted = sorted(tok_captions.items(), key=lambda item: len(item[1]))
    caption_sorted = list(map(lambda e: (image_ids[e[0]], e[1]), tok_captions_sorted))
    print("Longest sentence", len(tok_captions_sorted[-1][1]))
    with open(output_file, "wb") as wfp:
        marshal.dump((unique_images, caption_sorted), wfp)
    print("Dumped", len(caption_sorted), "captions from", len(unique_images), "unique images")
def write(text_processor: TextProcessor, output_file: str, input_file: str, max_len: int, sample_size: int, lang):
    eos = "</s>"
    if lang is not None:
        lang = "<" + lang + ">"

    skipped_long_sens = 0
    image_path_dict, unique_images = dict(), dict()
    tok_captions = {}
    image_ids = {}

    with open(input_file, "r") as r:
        for ci, line in enumerate(r):
            try:
                path, caption = line.strip().split("\t")
                if lang is not None and not caption.startswith(lang):
                    caption = " ".join([lang, caption, eos])
                tok_sen = text_processor.tokenize_one_sentence(caption)
                if len(tok_sen) > max_len:
                    skipped_long_sens += 1
                    continue

                if "." not in path:  # Path has no extension; try the common image extensions.
                    if os.path.exists(path + ".jpg"):
                        path = path + ".jpg"
                    elif os.path.exists(path + ".jpeg"):
                        path = path + ".jpeg"
                    elif os.path.exists(path + ".JPG"):
                        path = path + ".JPG"
                    elif os.path.exists(path + ".png"):
                        path = path + ".png"
                    elif os.path.exists(path + ".PNG"):
                        path = path + ".PNG"

                if path not in image_path_dict:
                    image_id = len(unique_images)
                    unique_images[image_id] = path
                    image_path_dict[path] = image_id
                else:
                    image_id = image_path_dict[path]

                caption_id = len(tok_captions)
                tok_captions[caption_id] = tok_sen
                image_ids[caption_id] = image_id

                if (ci + 1) >= sample_size and sample_size > 0:
                    break
            except:  # Skip malformed lines.
                print(line.strip())

    print("Skipped long sentences:", skipped_long_sens)
    tok_captions_sorted = sorted(tok_captions.items(), key=lambda item: len(item[1]))
    caption_sorted = list(map(lambda e: (image_ids[e[0]], e[1]), tok_captions_sorted))
    print("Longest sentence", len(tok_captions_sorted[-1][1]))
    with open(output_file, "wb") as wfp:
        marshal.dump((unique_images, caption_sorted), wfp)
    print("Dumped", len(caption_sorted), "captions from", len(unique_images), "unique images")
def load(out_dir: str, tok_dir: str):
    text_processor = TextProcessor(tok_model_path=tok_dir)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    with open(os.path.join(out_dir, "mt_config"), "rb") as fp:
        enc_layer, embed_dim, intermediate_dim = pickle.load(fp)
    mt_model = Caption2Image(text_processor=text_processor, enc_layer=enc_layer, embed_dim=embed_dim,
                             intermediate_dim=intermediate_dim)
    mt_model.load_state_dict(torch.load(os.path.join(out_dir, "mt_model.state_dict"), map_location=device),
                             strict=False)
    return mt_model
def load(out_dir: str):
    text_processor = TextProcessor(tok_model_path=out_dir)
    with open(os.path.join(out_dir, "config"), "rb") as fp:
        config = pickle.load(fp)
        if isinstance(config, dict):
            # For older configs
            config = BertConfig(**config)
    lm = LM(text_processor=text_processor, config=config)
    lm.load_state_dict(torch.load(os.path.join(out_dir, "model.state_dict")))
    return lm
def __init__(self, text_processor: TextProcessor, enc_layer: int = 6, embed_dim: int = 768,
             intermediate_dim: int = 3072):
    super(Caption2Image, self).__init__()
    self.text_processor: TextProcessor = text_processor
    self.config = lm_config.get_config(vocab_size=text_processor.tokenizer.get_vocab_size(),
                                       pad_token_id=text_processor.pad_token_id(),
                                       bos_token_id=text_processor.bos_token_id(),
                                       eos_token_id=text_processor.sep_token_id(),
                                       enc_layer=enc_layer, embed_dim=embed_dim,
                                       intermediate_dim=intermediate_dim)
    self.enc_layer = enc_layer
    self.embed_dim = embed_dim
    self.intermediate_dim = intermediate_dim
    self.config["type_vocab_size"] = len(text_processor.languages)
    self.config = BertConfig(**self.config)

    self.encoder = BertEncoderModel(self.config)
    self.encoder.init_weights()

    self.input_attention = nn.Linear(self.config.hidden_size, 1)
    self.decoder = nn.Linear(self.config.hidden_size, 49 * self.config.hidden_size)
def mass_mask(mask_prob, pad_indices, src_text, text_processor: TextProcessor) -> Dict:
    """
    20% of times, mask from start to middle
    20% of times, mask from middle to end
    60% of times, mask a random index
    """
    index_range = pad_indices - (1 - mask_prob) * pad_indices
    src_mask = torch.zeros(src_text.size(), dtype=torch.bool)
    to_recover = []
    to_recover_pos = []
    for i, irange in enumerate(index_range):
        range_size = int(pad_indices[i] / 2)
        r = random.random()
        last_idx = int(math.ceil(irange))
        if r > 0.8:
            start = 1
        elif r > 0.6:
            start = last_idx
        else:
            start = random.randint(2, last_idx) if last_idx >= 2 else 2
        end = start + range_size
        src_mask[i, start:end] = True
        to_recover.append(src_text[i, start - 1:end])
        to_recover_pos.append(torch.arange(start - 1, end))
    to_recover = pad_sequence(to_recover, batch_first=True, padding_value=text_processor.pad_token_id())
    to_recover_pos = pad_sequence(to_recover_pos, batch_first=True, padding_value=int(src_text.size(-1)) - 1)

    assert 0 < mask_prob < 1
    masked_ids = src_text[:, 1:][src_mask[:, 1:]]
    mask_idx = src_text[src_mask]
    random_index = lambda: random.randint(len(text_processor.special_tokens), text_processor.vocab_size() - 1)
    rand_select = lambda r, c: text_processor.mask_token_id() if r < 0.8 else (
        random_index() if r < 0.9 else int(mask_idx[c]))
    replacements = list(map(lambda i: rand_select(random.random(), i), range(mask_idx.size(0))))
    src_text[src_mask] = torch.LongTensor(replacements)
    return {"src_mask": src_mask, "targets": masked_ids, "src_text": src_text, "to_recover": to_recover,
            "positions": to_recover_pos, "mask_idx": mask_idx}
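# A toy illustration (not repository code) of the span-start policy documented in mass_mask above, assuming a
# single sequence whose first pad appears at index pad_index and mask_prob = 0.5.
import math
import random

pad_index, mask_prob = 10, 0.5
irange = pad_index - (1 - mask_prob) * pad_index   # size of the candidate start range
range_size = int(pad_index / 2)                    # half of the sequence gets masked
last_idx = int(math.ceil(irange))

r = random.random()
if r > 0.8:                                        # ~20%: mask from the start
    start = 1
elif r > 0.6:                                      # ~20%: mask up to the end
    start = last_idx
else:                                              # ~60%: mask a random span in between
    start = random.randint(2, last_idx) if last_idx >= 2 else 2
end = start + range_size
print("masked span covers positions", start, "to", end - 1)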
def get_tokenizer(train_path: Optional[str] = None, model_path: Optional[str] = None,
                  vocab_size: Optional[int] = None) -> TextProcessor:
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    print("Training Tokenizer...")
    text_processor = TextProcessor()
    print("Writing raw text...")
    languages = set()
    with open(train_path + ".tmp", "w") as wf:
        with open(train_path, "r") as rf:
            for i, line in enumerate(rf):
                spl = [sen.strip() for sen in line.split("</s>") if len(sen.strip()) > 0]
                if len(spl) == 0:
                    continue
                if spl[0].startswith("<"):
                    sen_split = spl[0].strip().split(" ")
                    spl[0] = " ".join(sen_split[1:])
                    languages.add(sen_split[0])
                wf.write("\n".join(spl))
                wf.write("\n")
                if (i + 1) % 1000000 == 0:
                    print(i + 1, "\r", end="")
    print("Writing raw text done!")
    print(" ".join(languages))

    text_processor.train_tokenizer(paths=[train_path + ".tmp"], vocab_size=vocab_size, to_save_dir=model_path,
                                   languages={l: i for i, l in enumerate(sorted(languages))})
    print("Removing temporary file!")
    os.system("rm " + train_path + ".tmp &")
    print("done!")
    return text_processor  # The signature promises a TextProcessor, so return the trained processor.
def load(cls, out_dir: str, tok_dir: str, use_obj: bool = False):
    text_processor = TextProcessor(tok_model_path=tok_dir)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    with open(os.path.join(out_dir, "mt_config"), "rb") as fp:
        (lang_dec, use_proposals, enc_layer, dec_layer, embed_dim, intermediate_dim, tie_embed, resnet_depth,
         freeze_image) = pickle.load(fp)
        mt_model = cls(text_processor=text_processor, lang_dec=lang_dec, use_proposals=use_proposals,
                       tie_embed=tie_embed, enc_layer=enc_layer, dec_layer=dec_layer, embed_dim=embed_dim,
                       intermediate_dim=intermediate_dim, freeze_image=freeze_image, resnet_depth=resnet_depth,
                       use_obj=use_obj)
        mt_model.load_state_dict(torch.load(os.path.join(out_dir, "mt_model.state_dict"), map_location=device),
                                 strict=False)
        return mt_model
def test_train_tokenizer(self):
    path_dir_name = os.path.dirname(os.path.realpath(__file__))
    data_path = os.path.join(path_dir_name, "sample.txt")
    with tempfile.TemporaryDirectory() as tmpdirname:
        processor = TextProcessor()
        processor.train_tokenizer([data_path], vocab_size=1000, to_save_dir=tmpdirname, languages={"<en>": 0})
        assert processor.tokenizer.get_vocab_size() == 1000

        sen1 = "Obama signed many landmark bills into law during his first two years in office."
        assert processor._tokenize(sen1) is not None
        many_sens = "\n".join([sen1] * 10)
        assert len(processor.tokenize(many_sens)) == 10

        new_processor = TextProcessor(tok_model_path=tmpdirname)
        assert new_processor.tokenizer.get_vocab_size() == 1000
        sen2 = "Obama signed many landmark bills into law during his first two years in office."
        assert new_processor._tokenize(sen2) is not None
def load_raw_data(self):
    textprocessor = TextProcessor(self.in_dir, self.dictionary_file, self.hashtag_file)
    textprocessor.load_dictioanry()
    textprocessor.load_hashtag()

    dat = pd.read_csv(self.in_dir + '/' + self.input_file, header=None)
    dat.columns = ['tweet', 'hashtag']
    # n = len(dat)
    # nlist = range(0,n)
    dat['id'] = None
    dat = dat[['id', 'tweet', 'hashtag']]

    total = ['id', 'tweet', 'hashtag']
    total = total + list(textprocessor.hashtag)
    dat = dat.reindex(columns=list(total), fill_value=0)

    dat['tweet'] = dat['tweet'].apply(textprocessor.cleanup)
    dat['tweet'] = dat['tweet'].apply(textprocessor.informal_norm)
    dat['hashtag'] = dat['hashtag'].apply(textprocessor.del_hashtag)
    dat = dat.drop(dat[dat['hashtag'].map(len) < 1].index).reset_index(drop=True)
    dat['tweet'] = dat['tweet'].apply(textprocessor.drop_tweet)
    dat = dat.drop(dat[dat['tweet'].map(len) < 1].index).reset_index(drop=True)

    n = len(dat)
    nlist = range(0, n)
    dat['id'] = nlist

    # assign label
    for i in range(len(dat['hashtag'])):
        tmp_list = dat['hashtag'][i].split(",")
        for j in range(len(tmp_list)):
            tmp_list[j] = tmp_list[j].replace(' ', '')
            dat[tmp_list[j]][i] = 1

    return dat.drop(columns=['hashtag'])
def __init__(self):
    self.textprocessor = TextProcessor()
    self.low_weight = 0.001
    self.missclick_weight = 0.1
    self.insert_weight = 0.1
    self.delete_weight = 0.1
        target_langs = list(map(lambda i: text_processor.lang_id(sentences[i].strip().split(" ")[0]), tids))
        src_lang = torch.LongTensor([text_processor.lang_id(sentences[sid].strip().split(" ")[0])])
        yield sid, source_tokenized, torch.LongTensor(tids), candidates, src_lang, torch.LongTensor(target_langs)


if __name__ == "__main__":
    parser = get_option_parser()
    (options, args) = parser.parse_args()

    print("Loading text processor...")
    text_processor = TextProcessor(options.tokenizer_path)
    num_processors = max(torch.cuda.device_count(), 1)

    print("Loading model...")
    model = Seq2Seq.load(Seq2Seq, options.model, tok_dir=options.tokenizer_path)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    num_gpu = torch.cuda.device_count()
    assert num_gpu <= 1
    if options.fp16:
        model = amp.initialize(model, opt_level="O2")

    max_capacity = options.total_capacity * 1000000
def train(options):
    lex_dict = None
    if options.dict_path is not None:
        lex_dict = get_lex_dict(options.dict_path)
    if not os.path.exists(options.model_path):
        os.makedirs(options.model_path)

    text_processor = TextProcessor(options.tokenizer_path)
    assert text_processor.pad_token_id() == 0

    image_captioner = Seq2Seq.load(ImageCaptioning, options.pretrained_path, tok_dir=options.tokenizer_path)
    txt2ImageModel = Caption2Image(text_processor=text_processor, enc_layer=options.encoder_layer,
                                   embed_dim=options.embed_dim, intermediate_dim=options.intermediate_layer_dim)
    print("Model initialization done!")

    # We assume that the collator function returns a list with the size of the number of gpus
    # (in the case of cpus, a list of size one).
    collator = dataset.ImageTextCollator()
    num_batches = max(1, torch.cuda.device_count())

    optimizer = build_optimizer(txt2ImageModel, options.learning_rate, warump_steps=options.warmup)
    trainer = Caption2ImageTrainer(model=txt2ImageModel, caption_model=image_captioner,
                                   mask_prob=options.mask_prob, optimizer=optimizer, clip=options.clip,
                                   beam_width=options.beam_width, max_len_a=options.max_len_a,
                                   max_len_b=options.max_len_b, len_penalty_ratio=options.len_penalty_ratio,
                                   fp16=options.fp16, mm_mode=options.mm_mode)

    pin_memory = torch.cuda.is_available()
    img_train_loader = ImageMTTrainer.get_img_loader(collator, dataset.ImageCaptionDatasetwNegSamples,
                                                     options.train_path, txt2ImageModel, num_batches, options,
                                                     pin_memory, lex_dict=lex_dict)
    img_dev_loader = ImageMTTrainer.get_img_loader(collator, dataset.ImageCaptionDatasetwNegSamples,
                                                   options.dev_path, txt2ImageModel, num_batches, options,
                                                   pin_memory, lex_dict=lex_dict, shuffle=False, denom=2)

    step, train_epoch = 0, 1
    while options.step > 0 and step < options.step:
        print("train epoch", train_epoch)
        step = trainer.train_epoch(img_data_iter=img_train_loader, img_dev_data_iter=img_dev_loader,
                                   max_step=options.step, lex_dict=lex_dict, saving_path=options.model_path,
                                   step=step)
        train_epoch += 1
def train(options):
    if not os.path.exists(options.model_path):
        os.makedirs(options.model_path)

    text_processor = TextProcessor(options.tokenizer_path)

    lm_class = ReformerLM if options.reformer else LM
    if options.pretrained_path is None:
        lm = lm_class(text_processor=text_processor, size=options.model_size)
    else:
        lm = lm_class.load(options.pretrained_path)

    if options.reformer:
        lm.config.hidden_dropout_prob = options.dropout
        lm.config.local_attention_probs_dropout_prob = options.dropout
        lm.config.lsh_attention_probs_dropout_prob = options.dropout
    else:
        LMTrainer.config_dropout(lm, options.dropout)

    train_data = dataset.TextDataset(save_cache_dir=options.train_path, max_cache_size=options.cache_size)
    dev_data = dataset.TextDataset(save_cache_dir=options.dev_path, max_cache_size=options.cache_size,
                                   load_all=True)

    if options.continue_train:
        with open(os.path.join(options.pretrained_path, "optim"), "rb") as fp:
            optimizer = pickle.load(fp)
    else:
        optimizer = build_optimizer(lm, options.learning_rate, options.warmup)

    trainer = LMTrainer(model=lm, mask_prob=options.mask_prob, optimizer=optimizer, clip=options.clip)

    collator = dataset.TextCollator(pad_idx=text_processor.pad_token_id())
    train_sampler, dev_sampler = None, None
    pin_memory = torch.cuda.is_available()

    loader = data_utils.DataLoader(train_data, batch_size=options.batch, shuffle=False, pin_memory=pin_memory,
                                   collate_fn=collator, sampler=train_sampler)
    dev_loader = data_utils.DataLoader(dev_data, batch_size=options.batch, shuffle=False, pin_memory=pin_memory,
                                       collate_fn=collator, sampler=dev_sampler)

    step, train_epoch = 0, 1
    while step <= options.step:
        print("train epoch", train_epoch)
        step = trainer.train_epoch(data_iter=loader, dev_data_iter=dev_loader, saving_path=options.model_path,
                                   step=step)
        train_epoch += 1
def __init__(self, root_img_dir: str, data_bin_file: str, max_capacity: int, text_processor: TextProcessor,
             max_img_per_batch: int, lex_dict=None, ngpu=1, use_neg_samples: bool = False):
    self.ngpu = ngpu
    self.lex_dict = lex_dict
    self.size_transform = transforms.Resize(256)
    self.crop = transforms.CenterCrop(224)
    self.to_tensor = transforms.ToTensor()
    self.img_normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    self.pad_idx = text_processor.pad_token_id()
    self.batches = []
    self.root_img_dir = root_img_dir
    max_capacity *= 1000000
    self.image_batches = []
    self.lang_ids = set()
    self.all_captions = []
    self.use_neg_samples = use_neg_samples

    print("Start", datetime.datetime.now())
    cur_batch, cur_imgs, cur_lex_cand_batch = [], [], []
    cur_max_len = 0
    with open(data_bin_file, "rb") as fp:
        self.unique_images, captions = marshal.load(fp)
        lang_id = text_processor.id2token(captions[0][1][0])
        self.lang_ids.add(int(captions[0][1][0]))
        self.lang = text_processor.languages[lang_id] if lang_id in text_processor.languages else 0

        for caption_info in captions:
            image_id, caption = caption_info
            if self.unique_images[image_id].lower().endswith(".png"):
                continue
            caption = torch.LongTensor(caption)
            cur_batch.append(caption)
            self.all_captions.append(caption)
            if self.lex_dict is not None:
                lex_cands = get_lex_suggestions(self.lex_dict, caption, text_processor.pad_token_id())
                cur_lex_cand_batch.append(lex_cands)

            cur_imgs.append(image_id)
            cur_max_len = max(cur_max_len, len(caption))
            batch_capacity_size = 2 * (cur_max_len ** 3) * len(cur_batch)
            if (len(cur_imgs) > max_img_per_batch or batch_capacity_size > max_capacity) \
                    and len(cur_batch[:-1]) >= self.ngpu and len(cur_batch) > 1:
                batch_tensor = pad_sequence(cur_batch[:-1], batch_first=True, padding_value=self.pad_idx)
                lex_cand_batch = None
                if self.lex_dict is not None:
                    lex_cand_batch = pad_sequence(cur_lex_cand_batch[:-1], batch_first=True,
                                                  padding_value=self.pad_idx)
                    cur_lex_cand_batch = [cur_lex_cand_batch[-1]]
                pads = batch_tensor != self.pad_idx
                pad_indices = [int(pads.size(1)) - 1] * int(pads.size(0))
                pindices = torch.nonzero(~pads)
                for (r, c) in pindices:
                    pad_indices[r] = min(pad_indices[r], int(c))

                self.batches.append((batch_tensor, pads, torch.LongTensor(pad_indices), lex_cand_batch))
                self.image_batches.append(cur_imgs[:-1])

                cur_batch = [cur_batch[-1]]
                cur_imgs = [cur_imgs[-1]]
                cur_max_len = len(cur_batch[0])

        if len(cur_batch) > 0:
            batch_tensor = pad_sequence(cur_batch, batch_first=True, padding_value=self.pad_idx)
            pads = batch_tensor != self.pad_idx
            pad_indices = [int(pads.size(1)) - 1] * int(pads.size(0))
            lex_cand_batch = None
            if self.lex_dict is not None:
                lex_cand_batch = pad_sequence(cur_lex_cand_batch, batch_first=True, padding_value=self.pad_idx)
            pindices = torch.nonzero(~pads)
            for (r, c) in pindices:
                pad_indices[r] = min(pad_indices[r], int(c))

            self.batches.append((batch_tensor, pads, torch.LongTensor(pad_indices), lex_cand_batch))
            self.image_batches.append(cur_imgs)

    print("Loaded %d image batches of %d unique images and %d all captions!" % (
        len(self.batches), len(self.unique_images), len(self.all_captions)))
    print("End", datetime.datetime.now())
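# A simplified, self-contained check (values are hypothetical) of the batch-flush heuristic used in the dataset
# constructor above: a batch is closed once the estimated cost 2 * max_len**3 * batch_size exceeds max_capacity,
# or the number of images passes max_img_per_batch; the extra per-GPU guard from the constructor is omitted here.
max_capacity = 20 * 1000000        # e.g. a capacity option of 20
max_img_per_batch = 32


def should_flush(cur_max_len: int, batch_size: int, num_imgs: int) -> bool:
    batch_capacity_size = 2 * (cur_max_len ** 3) * batch_size
    return num_imgs > max_img_per_batch or batch_capacity_size > max_capacity


print(should_flush(cur_max_len=50, batch_size=64, num_imgs=10))   # False: 2 * 50**3 * 64 = 16,000,000
print(should_flush(cur_max_len=60, batch_size=64, num_imgs=10))   # True:  2 * 60**3 * 64 = 27,648,000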
                      dest="output_file", help="Output pickle file.", metavar="FILE", default=None)
    parser.add_option("--tok", dest="tokenizer_path", help="Path to the tokenizer folder", metavar="FILE",
                      default=None)
    parser.add_option("--max-len", dest="max_len", help="Maximum tokenized caption length", type="int",
                      default=256)
    parser.add_option("--sample", dest="sample_size", type="int", default=-1)
    (options, args) = parser.parse_args()
    return options


if __name__ == "__main__":
    options = get_options()
    tokenizer = TextProcessor(options.tokenizer_path)
    print("Writing batches")
    write(text_processor=tokenizer, output_file=options.output_file, input_file=options.file,
          max_len=options.max_len, sample_size=options.sample_size)
    print("Finished")
def __init__(self, text_processor: TextProcessor, lang_dec: bool = True, use_proposals=False, tie_embed=False,
             enc_layer: int = 6, dec_layer: int = 3, embed_dim: int = 768, intermediate_dim: int = 3072,
             freeze_image: bool = False, resnet_depth: int = 1, use_obj: bool = False):
    super(Seq2Seq, self).__init__()
    self.text_processor: TextProcessor = text_processor

    self.config = lm_config.get_config(vocab_size=text_processor.tokenizer.get_vocab_size(),
                                       pad_token_id=text_processor.pad_token_id(),
                                       bos_token_id=text_processor.bos_token_id(),
                                       eos_token_id=text_processor.sep_token_id(),
                                       enc_layer=enc_layer, embed_dim=embed_dim,
                                       intermediate_dim=intermediate_dim)
    self.enc_layer = enc_layer
    self.dec_layer = dec_layer
    self.embed_dim = embed_dim
    self.intermediate_dim = intermediate_dim
    self.config["type_vocab_size"] = len(text_processor.languages)
    self.config = BertConfig(**self.config)

    dec_config = copy.deepcopy(self.config)
    dec_config.num_hidden_layers = self.dec_layer

    self.encoder = BertEncoderModel(self.config)
    self.encoder.init_weights()

    self.lang_dec = lang_dec
    self.tie_embed = tie_embed
    if not lang_dec:
        self.decoder = BertDecoderModel(dec_config)
        self.encoder._tie_or_clone_weights(self.encoder.embeddings.position_embeddings,
                                           self.decoder.embeddings.position_embeddings)
        self.encoder._tie_or_clone_weights(self.encoder.embeddings.token_type_embeddings,
                                           self.decoder.embeddings.token_type_embeddings)
        self.encoder._tie_or_clone_weights(self.encoder.embeddings.word_embeddings,
                                           self.decoder.embeddings.word_embeddings)

        if tie_embed:
            self.output_layer = BertOutputLayer(dec_config)
            self.encoder._tie_or_clone_weights(self.output_layer, self.encoder.embeddings.word_embeddings)
            self.encoder._tie_or_clone_weights(self.encoder.embeddings.position_embeddings,
                                               self.decoder.embeddings.position_embeddings)
            self.decoder._tie_or_clone_weights(self.output_layer, self.decoder.embeddings.word_embeddings)
        else:
            self.output_layer = nn.ModuleList([BertOutputLayer(dec_config) for _ in text_processor.languages])

        if len(self.encoder.encoder.layer) == len(self.decoder.decoder.layer):
            for i in range(len(self.encoder.encoder.layer)):
                self.decoder.decoder.layer[i].attention = self.encoder.encoder.layer[i].attention
    else:
        dec = BertDecoderModel(dec_config)
        self.decoder = nn.ModuleList([copy.deepcopy(dec) for _ in text_processor.languages])
        self.output_layer = nn.ModuleList([BertOutputLayer(dec_config) for _ in text_processor.languages])
        for i, dec in enumerate(self.decoder):
            if tie_embed:
                self.encoder._tie_or_clone_weights(self.output_layer[i], self.encoder.embeddings.word_embeddings)
            dec.embeddings.position_embeddings = self.encoder.embeddings.position_embeddings
            dec._tie_or_clone_weights(self.output_layer[i], dec.embeddings.word_embeddings)
            dec._tie_or_clone_weights(self.encoder.embeddings.token_type_embeddings,
                                      dec.embeddings.token_type_embeddings)

    self.use_proposals = use_proposals
    if self.use_proposals:
        self.proposal_embedding = self.encoder.embeddings.word_embeddings
        self.lexical_gate = nn.Parameter(torch.zeros(1, self.config.hidden_size).fill_(0.1), requires_grad=True)
        self.lexical_layer_norm = nn.LayerNorm(self.config.hidden_size, eps=self.config.layer_norm_eps)

    self.freeze_image = freeze_image
    self.resnet_depth = resnet_depth
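# A minimal illustration (not the repository helper itself) of the embedding/output weight tying that Seq2Seq
# performs above via _tie_or_clone_weights: the output projection shares its weight matrix with the word
# embedding table, so both point at the same Parameter. Sizes below are arbitrary.
import torch.nn as nn

vocab_size, hidden = 1000, 768
word_embeddings = nn.Embedding(vocab_size, hidden)
output_layer = nn.Linear(hidden, vocab_size, bias=False)
output_layer.weight = word_embeddings.weight               # tie: a single shared Parameter
assert output_layer.weight.data_ptr() == word_embeddings.weight.data_ptr()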
def test_albert_seq2seq_init(self):
    path_dir_name = os.path.dirname(os.path.realpath(__file__))
    data_path = os.path.join(path_dir_name, "sample.txt")
    with tempfile.TemporaryDirectory() as tmpdirname:
        processor = TextProcessor()
        processor.train_tokenizer([data_path], vocab_size=1000, to_save_dir=tmpdirname,
                                  languages={"<en>": 0, "<fa>": 1})
        seq2seq = Seq2Seq(text_processor=processor)

        src_inputs = torch.tensor([[1, 2, 3, 4, 5, processor.pad_token_id(), processor.pad_token_id()],
                                   [1, 2, 3, 4, 5, 6, processor.pad_token_id()]])
        tgt_inputs = torch.tensor([[6, 8, 7, processor.pad_token_id(), processor.pad_token_id()],
                                   [6, 8, 7, 8, processor.pad_token_id()]])
        src_mask = (src_inputs != processor.pad_token_id())
        tgt_mask = (tgt_inputs != processor.pad_token_id())
        src_langs = torch.tensor([[0], [0]]).squeeze()
        tgt_langs = torch.tensor([[1], [1]]).squeeze()

        seq_output = seq2seq(src_inputs, tgt_inputs, src_mask, tgt_mask, src_langs, tgt_langs, log_softmax=True)
        assert list(seq_output.size()) == [5, processor.vocab_size()]

        seq_output = seq2seq(src_inputs, tgt_inputs, src_mask, tgt_mask, src_langs, tgt_langs)
        assert list(seq_output.size()) == [5, processor.vocab_size()]
        sorted_examples = list(map(lambda len_item: examples[len_item[0]], sorted_lens))
        with open(output_file + "." + str(part_num), "wb") as fw:
            marshal.dump(sorted_examples, fw)


def get_options():
    global options
    parser = OptionParser()
    parser.add_option("--src", dest="src_data_path", help="Path to the source txt file", metavar="FILE",
                      default=None)
    parser.add_option("--dst", dest="dst_data_path", help="Path to the target txt file", metavar="FILE",
                      default=None)
    parser.add_option("--output", dest="output_path", help="Output marshal file", metavar="FILE", default=None)
    parser.add_option("--tok", dest="tokenizer_path", help="Path to the tokenizer folder", metavar="FILE",
                      default=None)
    parser.add_option("--max_seq_len", dest="max_seq_len", help="Max sequence length", type="int", default=175)
    parser.add_option("--min_seq_len", dest="min_seq_len", help="Min sequence length", type="int", default=1)
    parser.add_option("--src-lang", dest="src_lang", type="str", default=None)
    parser.add_option("--dst-lang", dest="dst_lang", type="str", default=None)
    (options, args) = parser.parse_args()
    return options


if __name__ == "__main__":
    options = get_options()
    tokenizer = TextProcessor(options.tokenizer_path)
    print(datetime.datetime.now(), "Writing batches")
    src_lang = tokenizer.token_id("<" + options.src_lang + ">")
    dst_lang = tokenizer.token_id("<" + options.dst_lang + ">") if options.dst_lang is not None else None
    write(text_processor=tokenizer, output_file=options.output_path, src_txt_file=options.src_data_path,
          dst_txt_file=options.dst_data_path, src_lang=src_lang, dst_lang=dst_lang)
    print(datetime.datetime.now(), "Finished")
def write(text_processor: TextProcessor, output_file: str, src_txt_file: str, src_lang: int,
          dst_txt_file: str = None, dst_lang: int = None, min_len: int = 1, max_len: int = 175):
    examples = {}
    line_num = 0
    src_lang_str = text_processor.languages[text_processor.id2token(src_lang)]
    lens = {}

    if dst_txt_file is not None:
        dst_lang_str = text_processor.languages[text_processor.id2token(dst_lang)]
        with open(src_txt_file, "r") as s_fp, open(dst_txt_file, "r") as d_fp:
            for src_line, dst_line in zip(s_fp, d_fp):
                if len(src_line.strip()) == 0 or len(dst_line.strip()) == 0:
                    continue
                src_tok_line = text_processor.tokenize_one_sentence_with_langid(src_line.strip(), src_lang)
                dst_tok_line = text_processor.tokenize_one_sentence_with_langid(dst_line.strip(), dst_lang)
                if min_len <= len(src_tok_line) <= max_len and min_len <= len(dst_tok_line) <= max_len:
                    examples[line_num] = (src_tok_line, dst_tok_line, src_lang_str, dst_lang_str)
                    lens[line_num] = len(dst_tok_line)
                    line_num += 1
                    if line_num % 1000 == 0:
                        print(line_num, end="\r")

        print("\nSorting")
        sorted_lens = sorted(lens.items(), key=lambda item: item[1])
        sorted_examples = []
        print("Sorted examples")
        for len_item in sorted_lens:
            line_num = len(sorted_examples)
            sorted_examples.append(examples[len_item[0]])

        print("Dumping")
        with open(output_file, "wb") as fw:
            marshal.dump(sorted_examples, fw)
    else:
        part_num = 0
        # Used for MASS training where we only have source sentences.
        with open(src_txt_file, "r") as s_fp:
            for src_line in s_fp:
                if len(src_line.strip()) == 0:
                    continue
                src_tok_line = text_processor.tokenize_one_sentence_with_langid(src_line.strip(), src_lang)
                if min_len <= len(src_tok_line) <= max_len:
                    examples[line_num] = (src_tok_line, src_lang_str)
                    lens[line_num] = len(src_tok_line)
                    line_num += 1
                    if line_num % 1000 == 0:
                        print(line_num, "\r", end="")
                if len(examples) >= 6000000:
                    print(datetime.datetime.now(), "Sorting and writing", part_num)
                    sorted_lens = sorted(lens.items(), key=lambda item: item[1])
                    sorted_examples = list(map(lambda len_item: examples[len_item[0]], sorted_lens))
                    with open(output_file + "." + str(part_num), "wb") as fw:
                        marshal.dump(sorted_examples, fw)
                    examples = {}
                    lens = {}
                    part_num += 1

        if len(examples) > 0:
            print(datetime.datetime.now(), "Sorting and writing", part_num)
            sorted_lens = sorted(lens.items(), key=lambda item: item[1])
            sorted_examples = list(map(lambda len_item: examples[len_item[0]], sorted_lens))
            with open(output_file + "." + str(part_num), "wb") as fw:
                marshal.dump(sorted_examples, fw)
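# A tiny, self-contained illustration (toy data, hypothetical output path) of the length-sorted dump format
# produced by write above: examples are sorted by token length so later batching groups similar lengths together.
import marshal

examples = {0: ([1, 5, 9, 2], "<en>"), 1: ([1, 7, 2], "<en>"), 2: ([1, 4, 6, 8, 3, 2], "<en>")}
lens = {k: len(v[0]) for k, v in examples.items()}
sorted_lens = sorted(lens.items(), key=lambda item: item[1])
sorted_examples = list(map(lambda len_item: examples[len_item[0]], sorted_lens))

with open("toy.mass.0", "wb") as fw:
    marshal.dump(sorted_examples, fw)
with open("toy.mass.0", "rb") as fr:
    print(marshal.load(fr))   # shortest example first: [([1, 7, 2], '<en>'), ([1, 5, 9, 2], '<en>'), ...]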
class HtmlProcessor:
    def __init__(self, directory="./htmls/", MAX_DOCUMENTS=10000):
        # directory where all the .html files are stored
        self.directory = directory
        # the number of ads needed to perform the analysis
        self.MAX_DOCUMENTS = MAX_DOCUMENTS
        # this variable will contain the text processor object needed to extract info
        # from the descriptions of the house advertisements
        self.tp = None
        # the tfidf dictionary that will contain each tfidf value
        self.tfidf = {}
        # this variable will contain the Euclidean norm of each document (description) in the corpus (collection of ads)
        self.euclidean_norm = {}
        # this variable contains the tfidf values normalized so that the tfidf vector
        # of each document has unit Euclidean norm
        self.tfidf_normalized = {}

    def createFirstDataset(self):
        # variable used to keep track of the index of each house ad
        ad_index = 0
        # fields contained in the JSON value that are of interest for the purposes of the assignment
        fields = ['locali', 'bagni', 'prezzo', 'superficie']
        # lists that will contain the values of each column of the dataframe
        price = []
        locali = []
        superficie = []
        bagni = []
        piano = []
        allFeatures = [price, locali, superficie, bagni, piano]
        # this list will contain the index of the dataframe
        indexes = []
        # start a row sample with the fields set to 0
        row_sample = dict.fromkeys(fields, 0)
        # this variable will contain the sample for the 'piano' variable
        piano_sample = None
        # name of the attribute that contains the piano value
        attribute_name_piano = 'title'

        # for each html file
        for filePath in tqdm(listdir(self.directory)):
            if (ad_index >= self.MAX_DOCUMENTS):
                break
            with open(self.directory + filePath, "r") as filereader:
                html = filereader.read()
            # Initialize the BeautifulSoup parser
            soup = BeautifulSoup(html, 'html.parser')
            # find the json document where most of the features are contained
            jsonFile = soup.find(id='js-hydration').text.strip()
            # extract a python dictionary value from the JSON value
            metadata = json.loads(jsonFile)

            # toCheck is an auxiliary variable to control the flow: since there is more than one loop,
            # it is not possible to use only the 'continue' statement to manage the flow.
            # The flag @toCheck starts as True; if something bad happens its state is changed to False and,
            # once the code is back in the main loop, the 'continue' statement is invoked.
            toCheck = True
            for field in fields:
                # if one of the fields is not present then it is a 'bad' house ad.
                # Three checks are performed:
                # 1- if the current @field is not present in the @metadata dictionary
                #    then the house ad is not good and the flag @toCheck is updated
                # 2- if the flag @toCheck is already False, the loop is broken
                # 3- if the JSON value is None then the features of interest aren't present
                #    and the loop must be broken.
                if field not in metadata or not toCheck or metadata[field] is None:
                    toCheck = False
                    break
                # now it is certain that the field is present.
                # Three further checks are performed:
                # 1- The '+' (plus) char in the string: sometimes the exact number of 'locali' or 'bagni'
                #    isn't given, so the advertiser puts a plus sign after the number
                #    (so '5+' means 5, 6, 7, ..., N locali)
                # 2- The '-' character is present when the advertiser gives a range of values;
                #    for instance a house price of '180.000-300.000' means from 180k to 300k
                # 3- The 'da' string literally means 'from' and indicates that the advertiser gives
                #    a lower limit for one of the features.
                # All 3 cases must be avoided because the assignment requires a unique value
                # for every feature.
                if not ('+' in metadata[field] or '-' in metadata[field] or 'da' in metadata[field]):
                    # split the value
                    tmp = metadata[field].split(' ')
                    # the flag state is set to False
                    toCheck = False
                    for string in tmp:
                        # the '€' and '.' characters are stripped from the string
                        # because of the encoding the advertiser uses for the price
                        if (string.replace('.', '').replace('€', '').isdigit()):
                            # if the string is a digit then the flag can be set back to True
                            toCheck = True
                            row_sample[field] = int(string.replace('.', '').replace('€', ''))
                else:
                    # if one of the previous conditions is encountered then it is not possible
                    # to include the house ad in the collection for the subsequent analysis
                    toCheck = False
                    break

            # at this point, the code flow is back in the main loop and if the flag is not True
            # then a 'continue' statement is invoked
            if not toCheck:
                continue

            # now it remains to extract the piano, the description and the link.
            # extracting the piano field
            piano_abbr = soup.find('abbr', attrs={'class': 'text-bold im-abbr'})
            # if the tag where the 'piano' value can be found is not present in the html,
            # or the tag doesn't have the attribute with the 'piano' value, then the house ad
            # must be discarded
            if (piano_abbr is not None) and attribute_name_piano in piano_abbr.attrs:
                # if the piano is 'terra' (T), 'rialzato' (R) or 'seminterrato' (S)
                # the piano field is set to 1.
                if piano_abbr.attrs[attribute_name_piano] in {'T', 'R', 'S', 'Piano terra'}:
                    piano_sample = 1
                elif piano_abbr.attrs[attribute_name_piano].isdigit():
                    piano_sample = int(piano_abbr.attrs[attribute_name_piano])
                else:
                    continue
            else:
                # the ad doesn't contain the piano info, therefore it is a 'bad' ad
                continue

            # extracting the description
            description_div = soup.find('div', attrs={'class': 'col-xs-12 description-text text-compressed'})
            # if the tag that contains the description is not present in the html file
            # then the house ad must be discarded
            if (description_div is None):
                continue

            # extract the link
            link_tag = soup.find('link', attrs={'rel': 'canonical'})
            # if the tag that contains the link is not present in the html file
            # then the house ad must be discarded
            if 'href' not in link_tag.attrs:
                continue

            # At this point of the code flow, it is certain that the house ad
            # can be included in the collection for the subsequent analysis.
            # open a file and store the description on a text file
            with open("./descriptions/descriptionAD#" + str(ad_index) + ".txt", "w") as text_file:
                text_file.write(description_div.text.strip())

            # retrieve the link value in the 'href' attribute of the tag
            adLink = link_tag.attrs['href']
            # update the value of ad_index
            ad_index += 1

            # Here it is possible to appreciate the use of auxiliary variables for the fields of interest:
            # if each value were appended to its list immediately, then any time an issue arises
            # (every time a "break" or "continue" statement appears) the element would need to be popped
            # from the list. Instead, with auxiliary variables it is possible to first assign them
            # and then append all of them at the end, when it is certain no issues are present
            # for that specific house ad.
            indexes.append(adLink)
            price.append(row_sample['prezzo'])
            locali.append(row_sample['locali'])
            superficie.append(row_sample['superficie'])
            bagni.append(row_sample['bagni'])
            piano.append(piano_sample)

        # columns of the dataframe
        columnsDf = ["price", "locali", "superficie", "bagni", "piano"]
        # initialize the dataframe to be stored on the filesystem
        toReturn = pd.DataFrame()
        # iterate over each column of the dataframe to be returned
        # and assign the corresponding list of values to it
        for i in range(len(columnsDf)):
            # assign the list of values to the specific column
            toReturn[columnsDf[i]] = allFeatures[i]
        # set the indexes
        toReturn.index = indexes
        # save the dataframe on the filesystem
        toReturn.to_csv('dataframe1.tsv', sep='\t')

    def createSecondDataset(self, datasetFileName="df2.tsv"):
        # the text processor object is initialized
        self.tp = TextProcessor()
        # the term encoding is built
        self.tp.buildEncoding("encoding.pickle")
        # the term frequency dictionary is built
        self.tp.buildTf(fileName='tf.pickle')
        # the function which builds the tfidf, still not normalized, is invoked
        self._createtfidf()
        # the function which computes the Euclidean norm of each description in the house ad collection
        self._computeEuclideanNorm()
        # this function creates the tfidf dictionary, but normalized
        self._createNormalizedtfidf()
        # the dataframe is created
        pd.DataFrame.from_dict(data=self.tfidf_normalized, columns=range(1, self.tp.VOCABULARY_SIZE + 1),
                               orient='index').to_csv(datasetFileName, sep='\t')

    def _createNormalizedtfidf(self):
        # build the dictionary that contains the NORMALIZED tfidf values:
        # each tfidf value is divided by the Euclidean norm of the document in which the word is contained.
        # Iterate over the keys of the @tfidf dictionary. REMEMBER: this dict has the doc indexes as keys.
        for doc_index in self.tfidf:
            # at the beginning the tfidf vector is a vector of zeros
            self.tfidf_normalized[doc_index] = [0] * self.tp.VOCABULARY_SIZE
            # the tfidf values for that document are retrieved
            tfidf_list = self.tfidf[doc_index]
            # iterate over each tuple @tfidfTuple (structure: (wordId, tfidf{wordID,doc_index}))
            for tfidfTuple in tfidf_list:
                # access the tfidf vector at position @tfidfTuple[0]-1, which is the index of the related word,
                # and update it with the tfidf value divided by the Euclidean norm of the document
                self.tfidf_normalized[doc_index][tfidfTuple[0] - 1] = (
                    tfidfTuple[1] / self.euclidean_norm[doc_index])

    # compute the Euclidean norm of each document
    def _computeEuclideanNorm(self):
        # for each doc_index in the tfidf dictionary.
        # REMEMBER: why is the doc_index used as the key of the @tfidf dictionary?
        # Because in this way it is much easier to build the final matrix containing the tfidf values.
        for doc_index in self.tfidf:
            sum_of_squares = 0
            # all the words contained in the document @doc_index are retrieved together with
            # the related tf-idf value (both contained in @tfidfTuple)
            tfidf_list = self.tfidf[doc_index]
            # iterate over each tuple @tfidfTuple (structure: (wordId, tfidf{wordID,doc_index}))
            for tfidfTuple in tfidf_list:
                # retrieve the tfidf value
                tmp = tfidfTuple[1]
                sum_of_squares += (tmp ** 2)
            # the Euclidean norm is equal to the square root of the sum of the squares of
            # each component of the vector
            self.euclidean_norm[doc_index] = sum_of_squares ** 0.5

    def _createtfidf(self):
        # for each word (equivalently, for each column of the tfidf matrix)
        for word in self.tp.tf:
            # df is the document frequency of the word, i.e. the number of documents that contain it;
            # given the structure of the @tp.tf dictionary, df is equal to the length of the list tp.tf[word]
            df = len(self.tp.tf[word])
            # the wordID is given by the term encoding in the dictionary @tp.term_enc
            wordId = self.tp.term_enc[word]
            # for each tuple contained in the list @tp.tf[word]. REMEMBER: the tuple has length 2;
            # the first element is the doc_index and the second is the tf value
            for tfTuple in self.tp.tf[word]:
                # retrieve the doc_index
                doc_index = int(tfTuple[0])
                # retrieve the term frequency of that word in the document @doc_index
                tfValue = tfTuple[1]
                # calculate the idf value
                idf = log(self.tp.NUMBER_OF_DOCS / df + 1)
                # if the doc_index is already a key in the dictionary then the new tuple is simply appended
                if doc_index in self.tfidf:
                    self.tfidf[int(doc_index)].append((int(wordId) - 1, tfValue * idf))
                # otherwise a new entry for that doc_index is added whose value is a list with a single
                # element, the tuple (wordId, tfidf{wordID,doc_index})
                else:
                    self.tfidf[int(doc_index)] = [(int(wordId) - 1, tfValue * idf)]
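# A toy, self-contained tf-idf computation (made-up values) following the same formulas as _createtfidf and
# _computeEuclideanNorm above: idf = log(N / df + 1), and each document vector is divided by its Euclidean norm.
from math import log

NUMBER_OF_DOCS = 4
tf = {"casa": [(0, 0.5), (1, 0.25)], "giardino": [(0, 0.5)]}   # word -> [(doc_index, term frequency), ...]

tfidf = {}
for word, postings in tf.items():
    df = len(postings)                                         # document frequency of the word
    idf = log(NUMBER_OF_DOCS / df + 1)
    for doc_index, tf_value in postings:
        tfidf.setdefault(doc_index, []).append((word, tf_value * idf))

for doc_index, pairs in tfidf.items():
    norm = sum(value ** 2 for _, value in pairs) ** 0.5        # Euclidean norm of the document vector
    print(doc_index, [(word, value / norm) for word, value in pairs])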
import time

from textprocessor import TextProcessor
from bayes_classifier import BayesClassifier

form_file = 'formy'
polish_texts = ['dramat', 'popul', 'proza', 'publ', 'wp']
path_to_files = './data/'
show_simmilar = True
num_of_simmilar = 4

if __name__ == '__main__':
    textprocessor = TextProcessor()
    textprocessor.create_dictionary(path_to_file=path_to_files, form_file=form_file)
    textprocessor.improve_dictionary(path_to_files=path_to_files, polish_texts=polish_texts)
    dict_of_words = textprocessor.dict_of_words

    input_word = input('Napisz pojedyncze slowo:\n')  # "Write a single word:"
    start = time.time()
    input_word = textprocessor.map_chars(input_word)

    if not input_word in dict_of_words:
        bayes_classifier = BayesClassifier()
        simmilar_words = bayes_classifier.calculate(f'{input_word}', dict_of_words)
        unmapped_words = []
        for word in simmilar_words:
            unmapped_words.append(textprocessor.unmap_words(word))
        print(f'Slowo nie wystepuje w polskim jezyku.')  # "The word does not exist in Polish."
        print(f'Moze chodzilo o \'{unmapped_words[0]}\'?')  # "Did you mean '...'?"
        show_hints = input('Chcesz zobaczyc inne mozliwosci? t/n\n')  # "Do you want to see other options? y/n"
        if (show_hints == 't'):
            print(f'Inne możliwosci {unmapped_words[1:num_of_simmilar]}\n')  # "Other possibilities ..."
    else:
              'sp', 'llc', 'co', 'ltd', 'tel', 'email', ' ', 'tel', 'fax', 'gmail', 'com', 'eu', 'pl',
              'telfax', 'office', 'burg', 'poland']

fileName = "./data/pap.txt"
note_idx = 98

# https://www.datascienceassn.org/sites/default/files/users/user1/lsa_presentation_final.pdf?fbclid=IwAR3ax6JNemqmWzfau24-UwePT7isOEDP5mAE3jbCQG92dITVVwV9ZS7CYiA

if __name__ == '__main__':
    start = time.time()

    ### PREPROCESSING
    textProcessor = TextProcessor()
    textProcessor.create_dictionary("data", "odm.txt")

    lineWords = []
    for line in open(fileName, 'r', encoding='utf-8'):
        read_line = line.replace('#', '').strip('\n').strip(' ')
        if not read_line.isdigit():
            lineWords.append(textProcessor.preprocess(read_line))

    textProcessor.create_frequency_dict(lineWords)
    preprocessed = []
    textProcessor.pre_process_vol_2(lineWords)

    ### Document-term matrix
    stop_list = get_stop_list()