Example #1
0
def create_batches(sentences,
                   src2dst_dict,
                   text_processor: TextProcessor,
                   resume_index=0,
                   end_index=-1):
    """Lazily yield one tensor bundle per source sentence id.

    Iterates over the keys of ``src2dst_dict`` counting from 1. Entries whose
    1-based position is ``<= resume_index`` are skipped, and iteration stops
    once the position reaches ``end_index`` (only when ``end_index > 0``).

    Each yielded tuple is
    ``(sid, source_tokens, candidate_ids, padded_candidates, src_lang, target_langs)``.
    The language of a sentence is looked up from its first space-separated
    token via ``text_processor.lang_id``.
    """
    print(len(src2dst_dict))

    print("Getting batches...")

    def language_of(sentence_id):
        # The first space-separated token of the stripped sentence is passed to lang_id.
        return text_processor.lang_id(
            sentences[sentence_id].strip().split(" ")[0])

    for position, sid in enumerate(src2dst_dict.keys(), start=1):
        if end_index > 0 and position >= end_index:
            break
        if position <= resume_index:
            continue

        tids = list(src2dst_dict[sid])
        source_tokenized = torch.LongTensor(tok_sen(sentences[sid]))
        trans_cands = [torch.LongTensor(tok_sen(sentences[tid])) for tid in tids]
        # Candidates have different lengths; pad them into one 2-D tensor.
        candidates = pad_sequence(trans_cands,
                                  batch_first=True,
                                  padding_value=text_processor.pad_token_id())
        target_langs = [language_of(tid) for tid in tids]
        src_lang = torch.LongTensor([language_of(sid)])

        yield (sid, source_tokenized, torch.LongTensor(tids), candidates,
               src_lang, torch.LongTensor(target_langs))
Example #2
0
    def test_data(self):
        """Train a small tokenizer, write a cached dataset and check cache sizes.

        The data file is ``sample.txt`` located next to this test module.
        """
        here = os.path.dirname(os.path.realpath(__file__))
        sample_file = os.path.join(here, "sample.txt")
        # NOTE(review): "<glk" has no closing ">" unlike "<mzn>" -- confirm this is intended.
        language_ids = {"<mzn>": 0, "<glk": 1}

        with tempfile.TemporaryDirectory() as cache_dir:
            processor = TextProcessor()
            processor.train_tokenizer([sample_file],
                                      vocab_size=1000,
                                      to_save_dir=cache_dir,
                                      languages=language_ids)
            create_batches.write(text_processor=processor,
                                 cache_dir=cache_dir,
                                 seq_len=512,
                                 txt_file=sample_file,
                                 sen_block_size=10)
            dataset = TextDataset(save_cache_dir=cache_dir, max_cache_size=3)
            assert dataset.line_num == 70

            # (item index to fetch, expected size of the cache afterwards)
            for item_index, expected_cache_len in ((3, 3), (9, 3), (69, 2)):
                dataset.__getitem__(item_index)
                assert len(dataset.current_cache) == expected_cache_len
Example #3
0
    def __init__(self,
                 text_processor: TextProcessor,
                 config: BertConfig = None,
                 encoder: BertModel = None,
                 enc_layer: int = 6,
                 embed_dim: int = 768,
                 intermediate_dim: int = 3072):
        """Build the language model around a BERT encoder.

        :param text_processor: supplies vocabulary size, special-token ids and
            the language table.
        :param config: ready-made ``BertConfig``; when ``None`` one is built
            from ``text_processor`` and the size arguments.
        :param encoder: existing encoder to reuse; when ``None`` a fresh
            ``BertModel`` is created and its weights initialised.
        :param enc_layer: forwarded to ``lm_config.get_config``.
        :param embed_dim: forwarded to ``lm_config.get_config``.
        :param intermediate_dim: forwarded to ``lm_config.get_config``.
        """
        super(LM, self).__init__()
        self.text_processor = text_processor

        if config is None:
            settings = lm_config.get_config(
                vocab_size=text_processor.tokenizer.get_vocab_size(),
                pad_token_id=text_processor.pad_token_id(),
                bos_token_id=text_processor.bos_token_id(),
                eos_token_id=text_processor.sep_token_id(),
                enc_layer=enc_layer,
                embed_dim=embed_dim,
                intermediate_dim=intermediate_dim)
            # One token-type id per known language.
            settings["type_vocab_size"] = len(text_processor.languages)
            config = BertConfig(**settings)
        self.config = config

        self.masked_lm = BertOutputLayer(self.config)
        if encoder is not None:
            self.encoder = encoder
        else:
            self.encoder = BertModel(self.config)
            self.encoder.init_weights()
        # Tie/clone the output decoder weights with the input word embeddings.
        self.encoder._tie_or_clone_weights(
            self.masked_lm.decoder, self.encoder.embeddings.word_embeddings)
Example #4
0
def mask_text(mask_prob, pads, texts, text_processor: TextProcessor, mask_eos: bool = True):
    """Randomly corrupt token ids in ``texts`` for masked-LM style training.

    Each position is selected with probability ``mask_prob``; positions where
    ``pads`` is False are never selected, and positions equal to the separator
    id are excluded when ``mask_eos`` is False. Selected ids are replaced by the
    mask id (draw < 0.8), a random id in
    ``[len(special_tokens), vocab_size() - 1]`` (0.8 <= draw < 0.9), or left
    unchanged (otherwise).

    ``texts`` is modified in place.

    :return: ``(mask, masked_ids, texts)`` where ``masked_ids`` holds the
        original ids at the selected positions.
    """
    assert 0 < mask_prob < 1
    mask = torch.empty(texts.size()).uniform_(0, 1) < mask_prob
    # presumably ``pads`` is True on real tokens, so ``~pads`` marks padding -- verify against caller
    mask[~pads] = False
    if not mask_eos:
        # Keep end-of-sentence tokens intact.
        mask[texts == text_processor.sep_token_id()] = False

    masked_ids = texts[mask]

    replacements = []
    for position in range(masked_ids.size(0)):
        draw = random.random()
        if draw < 0.8:
            replacement = text_processor.mask_token_id()
        elif draw < 0.9:
            replacement = random.randint(len(text_processor.special_tokens),
                                         text_processor.vocab_size() - 1)
        else:
            replacement = int(masked_ids[position])
        replacements.append(replacement)

    texts[mask] = torch.LongTensor(replacements)
    return mask, masked_ids, texts
Example #5
0
 def createSecondDataset(self, datasetFileName="df2.tsv"):
     """Build the normalized tf-idf table and save it as a tab-separated file.

     :param datasetFileName: path of the TSV file to write.
     """
     # Text processor with its term encoding and term-frequency dictionary.
     self.tp = TextProcessor()
     self.tp.buildEncoding("encoding.pickle")
     self.tp.buildTf(fileName='tf.pickle')

     # Raw tf-idf, then the Euclidean norms, then the normalized tf-idf.
     self._createtfidf()
     self._computeEuclideanNorm()
     self._createNormalizedtfidf()

     # One row per key of tfidf_normalized, one column per vocabulary index (1-based).
     vocabulary_columns = range(1, self.tp.VOCABULARY_SIZE + 1)
     frame = pd.DataFrame.from_dict(data=self.tfidf_normalized,
                                    columns=vocabulary_columns,
                                    orient='index')
     frame.to_csv(datasetFileName, sep='\t')
Example #6
0
    def test_albert_init(self):
        """Check the embedding size of a fresh LM and that save/load keeps its config."""
        sample_file = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "sample.txt")

        with tempfile.TemporaryDirectory() as work_dir:
            processor = TextProcessor()
            processor.train_tokenizer([sample_file],
                                      vocab_size=1000,
                                      to_save_dir=work_dir,
                                      languages={"<en>": 0})

            lm = LM(text_processor=processor)
            word_embeddings = lm.encoder.base_model.embeddings.word_embeddings
            assert word_embeddings.num_embeddings == 1000

            # Round-trip through disk and compare configurations.
            lm.save(work_dir)
            new_lm = LM.load(work_dir)
            assert new_lm.config == lm.config
Example #7
0
def write(text_processor: TextProcessor,
          output_file: str,
          txt_file: str,
          output_txt: bool = False):
    """Tokenize every non-blank line of ``txt_file`` and write it to ``output_file``.

    :param text_processor: tokenizer wrapper providing ``tokenize_one_line`` and
        ``id2token``.
    :param output_file: destination path; one output line per non-blank input line.
    :param txt_file: source text file, read line by line.
    :param output_txt: when True, write token strings (first and last token
        dropped, ``"<unk>"`` written as ``"unk"``); otherwise write token ids.
    """
    with open(txt_file, "r") as fp, open(output_file, "w") as writer:
        for line in fp:
            # Strip once; the original repeated the same emptiness test twice.
            stripped = line.strip()
            if not stripped:
                continue
            tok_line = text_processor.tokenize_one_line(stripped,
                                                        ignore_middle_eos=True)

            if output_txt:
                # Convert every id first, then drop the first and last token.
                pieces = [text_processor.id2token(tok) for tok in tok_line][1:-1]
                tokenized = ["unk" if piece == "<unk>" else piece for piece in pieces]
            else:
                tokenized = [str(tok) for tok in tok_line]
            writer.write(" ".join(tokenized) + "\n")
def write(text_processor: TextProcessor, output_file: str, input_file: str,
          max_len: int, sample_size: int):
    """Tokenize captions from a JSON annotation file and marshal them to disk.

    ``input_file`` must be JSON with an ``"annotations"`` list; each annotation
    is converted by ``caption_data`` into a pair whose first item is the image
    path and whose second item is the caption text.

    The output is ``marshal.dump((unique_images, caption_sorted))`` where
    ``unique_images`` maps image id -> path and ``caption_sorted`` is a list of
    ``(image_id, token_ids)`` sorted by token count (shortest first).

    :param text_processor: tokenizer wrapper providing ``tokenize_one_sentence``.
    :param output_file: path of the binary file to write.
    :param input_file: path of the JSON annotation file.
    :param max_len: captions with more tokens than this are skipped.
    :param sample_size: when positive, stop after a kept caption whose 1-based
        position reaches this value.
    """
    with open(input_file, "r") as r:
        obj = json.load(r)

    annotations = obj["annotations"]
    captions = [caption_data(annotation) for annotation in annotations]
    print(len(captions))

    skipped_long_sens = 0
    image_path_dict, unique_images = dict(), dict()

    tok_captions = {}
    image_ids = {}
    for ci, c in enumerate(captions):
        if ci % 1000 == 0:
            print(ci,
                  "/",
                  len(captions),
                  "->",
                  len(tok_captions),
                  len(unique_images),
                  end="\r")
        tok_sen = text_processor.tokenize_one_sentence(c[1])
        if len(tok_sen) > max_len:
            skipped_long_sens += 1
            continue

        path = c[0]
        image_id = image_path_dict.get(path)
        if image_id is None:
            # First time this image is seen: give it the next free id.
            image_id = len(unique_images)
            unique_images[image_id] = path
            image_path_dict[path] = image_id

        caption_id = len(tok_captions)
        tok_captions[caption_id] = tok_sen
        image_ids[caption_id] = image_id

        if (ci + 1) >= sample_size and sample_size > 0:
            break

    print("Skipped long sentences:", skipped_long_sens, "from", len(captions))
    tok_captions_sorted = sorted(tok_captions.items(),
                                 key=lambda item: len(item[1]))
    caption_sorted = [(image_ids[caption_id], tokens)
                      for caption_id, tokens in tok_captions_sorted]
    # Guard against an empty result (no annotations, or everything too long),
    # which previously raised IndexError before anything was written.
    longest = len(tok_captions_sorted[-1][1]) if tok_captions_sorted else 0
    print("Longest sentence", longest)
    with open(output_file, "wb") as wfp:
        marshal.dump((unique_images, caption_sorted), wfp)
    print("Dumped", len(caption_sorted), "captions from", len(unique_images),
          "unique images")
def write(text_processor: TextProcessor, output_file: str, input_file: str, max_len: int, sample_size: int, lang):
    """Tokenize captions from a tab-separated file and marshal them to disk.

    Each input line must be ``<image path>\\t<caption>``. Lines that cannot be
    processed are printed and skipped (best effort).

    The output is ``marshal.dump((unique_images, caption_sorted))`` where
    ``unique_images`` maps image id -> path and ``caption_sorted`` is a list of
    ``(image_id, token_ids)`` sorted by token count (shortest first).

    :param text_processor: tokenizer wrapper providing ``tokenize_one_sentence``.
    :param output_file: path of the binary file to write.
    :param input_file: path of the tab-separated caption file.
    :param max_len: captions with more tokens than this are skipped.
    :param sample_size: when positive, stop after a kept caption whose 1-based
        line position reaches this value.
    :param lang: language code without brackets, or ``None``. When given,
        captions not already starting with ``<lang>`` are wrapped as
        ``"<lang> caption </s>"``.
    """
    eos = "</s>"
    if lang is not None:
        lang = "<" + lang + ">"

    # Extensions probed, in this order, for paths that contain no "." at all.
    candidate_extensions = (".jpg", ".jpeg", ".JPG", ".png", ".PNG")

    skipped_long_sens = 0
    image_path_dict, unique_images = dict(), dict()

    tok_captions = {}
    image_ids = {}
    with open(input_file, "r") as r:
        for ci, line in enumerate(r):
            try:
                path, caption = line.strip().split("\t")
                if lang is not None and not caption.startswith(lang):
                    caption = " ".join([lang, caption, eos])
                tok_sen = text_processor.tokenize_one_sentence(caption)
                if len(tok_sen) > max_len:
                    skipped_long_sens += 1
                    continue
                if "." not in path:  # No extension: use the first one that exists on disk.
                    for extension in candidate_extensions:
                        if os.path.exists(path + extension):
                            path = path + extension
                            break

                image_id = image_path_dict.get(path)
                if image_id is None:
                    # First time this image is seen: give it the next free id.
                    image_id = len(unique_images)
                    unique_images[image_id] = path
                    image_path_dict[path] = image_id

                caption_id = len(tok_captions)
                tok_captions[caption_id] = tok_sen
                image_ids[caption_id] = image_id

                if (ci + 1) >= sample_size and sample_size > 0:
                    break
            except Exception:
                # Still best effort, but no longer swallows KeyboardInterrupt /
                # SystemExit the way the bare ``except:`` did.
                print(line.strip())

    print("Skipped long sentences:", skipped_long_sens)
    tok_captions_sorted = sorted(tok_captions.items(), key=lambda item: len(item[1]))
    caption_sorted = [(image_ids[caption_id], tokens) for caption_id, tokens in tok_captions_sorted]
    # Guard against an empty result, which previously raised IndexError
    # before anything was written.
    longest = len(tok_captions_sorted[-1][1]) if tok_captions_sorted else 0
    print("Longest sentence", longest)
    with open(output_file, "wb") as wfp:
        marshal.dump((unique_images, caption_sorted), wfp)
    print("Dumped", len(caption_sorted), "captions from", len(unique_images), "unique images")
Example #10
0
    def load(out_dir: str, tok_dir: str):
        """Rebuild a ``Caption2Image`` model from files in ``out_dir``.

        Reads the pickled ``(enc_layer, embed_dim, intermediate_dim)`` tuple from
        ``mt_config`` and the weights from ``mt_model.state_dict``. Weights are
        mapped to CUDA when available, otherwise to CPU, and loaded with
        ``strict=False``.

        :param out_dir: directory containing the saved config and weights.
        :param tok_dir: directory passed to ``TextProcessor`` as the tokenizer path.
        :return: the reconstructed model.
        """
        config_path = os.path.join(out_dir, "mt_config")
        text_processor = TextProcessor(tok_model_path=tok_dir)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # NOTE: pickle.load can run arbitrary code; only load trusted checkpoints.
        with open(config_path, "rb") as fp:
            enc_layer, embed_dim, intermediate_dim = pickle.load(fp)

            mt_model = Caption2Image(text_processor=text_processor,
                                     enc_layer=enc_layer,
                                     embed_dim=embed_dim,
                                     intermediate_dim=intermediate_dim)
            weights = torch.load(os.path.join(out_dir, "mt_model.state_dict"),
                                 map_location=device)
            mt_model.load_state_dict(weights, strict=False)
            return mt_model
Example #11
0
 def load(out_dir: str):
     """Rebuild an ``LM`` from the tokenizer, config and weights in ``out_dir``.

     Reads the pickled config from ``config`` (a plain dict from older
     checkpoints is converted to ``BertConfig``) and the weights from
     ``model.state_dict``.

     :param out_dir: directory holding the tokenizer files, ``config`` and
         ``model.state_dict``.
     :return: the reconstructed language model.
     """
     text_processor = TextProcessor(tok_model_path=out_dir)
     # Map saved tensors onto an available device so that a checkpoint written
     # on a GPU machine can still be loaded on a CPU-only host.
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     # NOTE: pickle.load can run arbitrary code; only load trusted checkpoints.
     with open(os.path.join(out_dir, "config"), "rb") as fp:
         config = pickle.load(fp)
         if isinstance(config, dict):
             # For older configs
             config = BertConfig(**config)
         lm = LM(text_processor=text_processor, config=config)
         lm.load_state_dict(
             torch.load(os.path.join(out_dir, "model.state_dict"),
                        map_location=device))
         return lm
Example #12
0
    def __init__(self, text_processor: TextProcessor, enc_layer: int = 6, embed_dim: int = 768,
                 intermediate_dim: int = 3072):
        """Build the caption encoder plus its attention and decoder projections.

        :param text_processor: supplies vocabulary size, special-token ids and
            the language table.
        :param enc_layer: forwarded to ``lm_config.get_config`` and stored.
        :param embed_dim: forwarded to ``lm_config.get_config`` and stored.
        :param intermediate_dim: forwarded to ``lm_config.get_config`` and stored.
        """
        super(Caption2Image, self).__init__()
        self.text_processor = text_processor

        settings = lm_config.get_config(vocab_size=text_processor.tokenizer.get_vocab_size(),
                                        pad_token_id=text_processor.pad_token_id(),
                                        bos_token_id=text_processor.bos_token_id(),
                                        eos_token_id=text_processor.sep_token_id(),
                                        enc_layer=enc_layer,
                                        embed_dim=embed_dim,
                                        intermediate_dim=intermediate_dim)

        # Kept so the model can be rebuilt with the same sizes.
        self.enc_layer = enc_layer
        self.embed_dim = embed_dim
        self.intermediate_dim = intermediate_dim

        # One token-type id per known language.
        settings["type_vocab_size"] = len(text_processor.languages)
        self.config = BertConfig(**settings)

        self.encoder = BertEncoderModel(self.config)
        self.encoder.init_weights()

        hidden_size = self.config.hidden_size
        self.input_attention = nn.Linear(hidden_size, 1)
        # NOTE(review): 49 presumably corresponds to a 7x7 image feature grid -- confirm.
        self.decoder = nn.Linear(hidden_size, 49 * hidden_size)
Example #13
0
def mass_mask(mask_prob, pad_indices, src_text, text_processor: TextProcessor) -> Dict:
    """
        Mask one contiguous span per row of ``src_text`` (modified in place).

        The span length is ``int(pad_indices[i] / 2)``. Its start is chosen from a
        uniform draw ``r``:
            r > 0.8        -> start at index 1
            0.6 < r <= 0.8 -> start at ceil(mask_prob * pad_indices[i])
            otherwise      -> random start in [2, that ceiling] (2 if the ceiling < 2)

        Masked ids are then replaced by the mask id (80%), a random non-special id
        (10%) or left unchanged (10%).

        Returns a dict with keys "src_mask", "targets", "src_text", "to_recover",
        "positions" and "mask_idx".
    """
    # Algebraically this equals mask_prob * pad_indices.
    index_range = pad_indices - (1 - mask_prob) * pad_indices
    src_mask = torch.zeros(src_text.size(), dtype=torch.bool)
    recover_tokens = []
    recover_positions = []
    for row, span_limit in enumerate(index_range):
        span_len = int(pad_indices[row] / 2)
        draw = random.random()
        last_idx = int(math.ceil(span_limit))
        if draw > 0.8:
            start = 1
        elif draw > 0.6:
            start = last_idx
        elif last_idx >= 2:
            start = random.randint(2, last_idx)
        else:
            start = 2

        end = start + span_len
        src_mask[row, start:end] = True
        # The recovered window also includes the token just before the span.
        recover_tokens.append(src_text[row, start - 1:end])
        recover_positions.append(torch.arange(start - 1, end))
    to_recover = pad_sequence(recover_tokens, batch_first=True,
                              padding_value=text_processor.pad_token_id())
    to_recover_pos = pad_sequence(recover_positions, batch_first=True,
                                  padding_value=int(src_text.size(-1)) - 1)

    assert 0 < mask_prob < 1
    # Both selections are taken before src_text is overwritten below.
    masked_ids = src_text[:, 1:][src_mask[:, 1:]]
    mask_idx = src_text[src_mask]

    replacements = []
    for position in range(mask_idx.size(0)):
        draw = random.random()
        if draw < 0.8:
            replacement = text_processor.mask_token_id()
        elif draw < 0.9:
            replacement = random.randint(len(text_processor.special_tokens),
                                         text_processor.vocab_size() - 1)
        else:
            replacement = int(mask_idx[position])
        replacements.append(replacement)
    src_text[src_mask] = torch.LongTensor(replacements)

    return {"src_mask": src_mask, "targets": masked_ids, "src_text": src_text, "to_recover": to_recover,
            "positions": to_recover_pos, "mask_idx": mask_idx}
def get_tokenizer(train_path: Optional[str] = None,
                  model_path: Optional[str] = None,
                  vocab_size: Optional[int] = None) -> TextProcessor:
    """Train a tokenizer on ``train_path`` and save it under ``model_path``.

    Each input line is split on ``"</s>"`` into sentences. If the first
    sentence starts with ``"<"``, its first space-separated token is taken as a
    language tag, recorded, and removed from the text. The cleaned sentences are
    written one per line to ``train_path + ".tmp"``, which is used for training
    and removed afterwards.

    :param train_path: path of the raw training text.
    :param model_path: directory where the tokenizer is saved (created if missing).
    :param vocab_size: vocabulary size passed to ``train_tokenizer``.
    :return: the trained ``TextProcessor``.
    """
    import os  # local import keeps this block self-contained

    # exist_ok avoids the check-then-create race of the explicit exists() test.
    os.makedirs(model_path, exist_ok=True)

    print("Training Tokenizer...")
    text_processor = TextProcessor()
    print("Writing raw text...")
    languages = set()
    tmp_path = train_path + ".tmp"
    try:
        with open(tmp_path, "w") as wf, open(train_path, "r") as rf:
            for i, line in enumerate(rf):
                spl = [
                    sen.strip() for sen in line.split("</s>")
                    if len(sen.strip()) > 0
                ]
                if not spl:
                    continue
                if spl[0].startswith("<"):
                    # Peel the leading language tag off the first sentence.
                    sen_split = spl[0].strip().split(" ")
                    spl[0] = " ".join(sen_split[1:])
                    languages.add(sen_split[0])
                wf.write("\n".join(spl))
                wf.write("\n")
                if (i + 1) % 1000000 == 0:
                    print(i + 1, "\r", end="")
        print("Writing raw text done!")

        print(" ".join(languages))
        text_processor.train_tokenizer(
            paths=[tmp_path],
            vocab_size=vocab_size,
            to_save_dir=model_path,
            languages={l: i
                       for i, l in enumerate(sorted(languages))})
    finally:
        # Remove the temp file portably, without building a shell command from
        # the path, and also when training fails.
        print("Removing temporary file!")
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    print("done!")
    # The signature promises a TextProcessor; the original fell off the end and
    # returned None.
    return text_processor
Example #15
0
    def load(cls, out_dir: str, tok_dir: str, use_obj: bool = False):
        """Rebuild a model of type ``cls`` from files in ``out_dir``.

        Reads a pickled 9-tuple of constructor settings from ``mt_config`` and the
        weights from ``mt_model.state_dict``. Weights are mapped to CUDA when
        available, otherwise to CPU, and loaded with ``strict=False``.

        :param out_dir: directory containing the saved config and weights.
        :param tok_dir: directory passed to ``TextProcessor`` as the tokenizer path.
        :param use_obj: forwarded to the ``cls`` constructor.
        :return: the reconstructed model.
        """
        text_processor = TextProcessor(tok_model_path=tok_dir)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # NOTE: pickle.load can run arbitrary code; only load trusted checkpoints.
        with open(os.path.join(out_dir, "mt_config"), "rb") as fp:
            saved_settings = pickle.load(fp)
            (lang_dec, use_proposals, enc_layer, dec_layer, embed_dim,
             intermediate_dim, tie_embed, resnet_depth, freeze_image) = saved_settings

            mt_model = cls(text_processor=text_processor,
                           lang_dec=lang_dec,
                           use_proposals=use_proposals,
                           tie_embed=tie_embed,
                           enc_layer=enc_layer,
                           dec_layer=dec_layer,
                           embed_dim=embed_dim,
                           intermediate_dim=intermediate_dim,
                           freeze_image=freeze_image,
                           resnet_depth=resnet_depth,
                           use_obj=use_obj)

            weights = torch.load(os.path.join(out_dir, "mt_model.state_dict"),
                                 map_location=device)
            mt_model.load_state_dict(weights, strict=False)
            return mt_model
Example #16
0
    def test_train_tokenizer(self):
        """Train a tokenizer, then check both it and a copy reloaded from disk."""
        path_dir_name = os.path.dirname(os.path.realpath(__file__))
        data_path = os.path.join(path_dir_name, "sample.txt")

        with tempfile.TemporaryDirectory() as tmpdirname:
            processor = TextProcessor()
            processor.train_tokenizer([data_path],
                                      vocab_size=1000,
                                      to_save_dir=tmpdirname,
                                      languages={"<en>": 0})
            assert processor.tokenizer.get_vocab_size() == 1000
            sen1 = "Obama signed many landmark bills into law during his first two years in office."
            assert processor._tokenize(sen1) is not None

            many_sens = "\n".join([sen1] * 10)
            assert len(processor.tokenize(many_sens)) == 10

            # Reload from the saved directory and exercise the *reloaded*
            # processor; the original asserted on ``processor`` again, so the
            # reloaded tokenizer was never actually used.
            new_processor = TextProcessor(tok_model_path=tmpdirname)
            assert new_processor.tokenizer.get_vocab_size() == 1000
            sen2 = "Obama signed many landmark bills into law during his first two years in office."
            assert new_processor._tokenize(sen2) is not None
Example #17
0
    def load_raw_data(self):
        """Load the raw tweet CSV and turn it into a multi-label indicator table.

        Reads ``self.in_dir + '/' + self.input_file`` (no header; two columns
        named ``tweet`` and ``hashtag``), cleans both columns with the text
        processor, drops rows whose hashtag or tweet becomes empty, and sets one
        0/1 column per entry of ``textprocessor.hashtag``.

        :return: DataFrame with ``id``, ``tweet`` and one indicator column per
            hashtag (the raw ``hashtag`` column is dropped).
        :raises KeyError: if a row names a hashtag that has no indicator column.
        """
        textprocessor = TextProcessor(self.in_dir, self.dictionary_file,
                                      self.hashtag_file)
        textprocessor.load_dictioanry()
        textprocessor.load_hashtag()
        dat = pd.read_csv(self.in_dir + '/' + self.input_file, header=None)

        dat.columns = ['tweet', 'hashtag']
        dat['id'] = None
        dat = dat[['id', 'tweet', 'hashtag']]

        # Add one zero-filled indicator column per known hashtag.
        total = ['id', 'tweet', 'hashtag'] + list(textprocessor.hashtag)
        dat = dat.reindex(columns=total, fill_value=0)
        dat['tweet'] = dat['tweet'].apply(textprocessor.cleanup)
        dat['tweet'] = dat['tweet'].apply(textprocessor.informal_norm)

        dat['hashtag'] = dat['hashtag'].apply(textprocessor.del_hashtag)
        dat = dat.drop(
            dat[dat['hashtag'].map(len) < 1].index).reset_index(drop=True)

        dat['tweet'] = dat['tweet'].apply(textprocessor.drop_tweet)
        dat = dat.drop(
            dat[dat['tweet'].map(len) < 1].index).reset_index(drop=True)
        dat['id'] = range(len(dat))

        # Assign labels. After reset_index the index is 0..n-1, so the
        # enumerate position is also the row label.
        for row, tags in enumerate(dat['hashtag']):
            for tag in tags.split(","):
                label = tag.replace(' ', '')
                if label not in dat.columns:
                    # Keep the original failure mode instead of silently
                    # growing a new column.
                    raise KeyError(label)
                # Single-step assignment: the chained form ``dat[label][row] = 1``
                # may write to a temporary copy and is a no-op under pandas
                # copy-on-write.
                dat.at[row, label] = 1
        return dat.drop(columns=['hashtag'])
Example #18
0
 def __init__(self):
     """Create the text processor and set the default weights."""
     self.textprocessor = TextProcessor()
     # NOTE(review): weight semantics are not visible here; names suggest edit-operation costs -- confirm.
     self.low_weight = 0.001
     self.missclick_weight = self.insert_weight = self.delete_weight = 0.1
Example #19
0
        target_langs = list(
            map(
                lambda i: text_processor.lang_id(sentences[i].strip().split(
                    " ")[0]), tids))
        src_lang = torch.LongTensor(
            [text_processor.lang_id(sentences[sid].strip().split(" ")[0])])
        yield sid, source_tokenized, torch.LongTensor(
            tids), candidates, src_lang, torch.LongTensor(target_langs)


# Script entry point: parse options, load the tokenizer and the saved model.
if __name__ == "__main__":
    parser = get_option_parser()
    (options, args) = parser.parse_args()

    print("Loading text processor...")
    text_processor = TextProcessor(options.tokenizer_path)
    # At least 1, even when no CUDA device is present.
    num_processors = max(torch.cuda.device_count(), 1)

    print("Loading model...")
    # The model class is passed explicitly as the first argument to load().
    model = Seq2Seq.load(Seq2Seq,
                         options.model,
                         tok_dir=options.tokenizer_path)
    # Use the GPU when available, otherwise stay on CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    num_gpu = torch.cuda.device_count()

    # This script refuses to run with more than one visible GPU.
    assert num_gpu <= 1
    if options.fp16:
        # NOTE(review): `amp` is presumably NVIDIA apex mixed precision -- confirm against the imports.
        model = amp.initialize(model, opt_level="O2")

    # total_capacity is given in units of one million.
    max_capacity = options.total_capacity * 1000000
    def train(options):
        """Train a ``Caption2Image`` model with the help of a pretrained captioner.

        Creates ``options.model_path`` if needed, loads the tokenizer and the
        pretrained image-captioning model, builds the text-to-image model, its
        optimizer, trainer and the train/dev image loaders, then runs epochs
        until ``options.step`` steps are reached. Nothing runs when
        ``options.step`` is not positive.

        :param options: parsed command-line options.
        """
        lex_dict = get_lex_dict(options.dict_path) if options.dict_path is not None else None
        if not os.path.exists(options.model_path):
            os.makedirs(options.model_path)

        text_processor = TextProcessor(options.tokenizer_path)
        assert text_processor.pad_token_id() == 0

        image_captioner = Seq2Seq.load(ImageCaptioning,
                                       options.pretrained_path,
                                       tok_dir=options.tokenizer_path)
        txt2ImageModel = Caption2Image(
            text_processor=text_processor,
            enc_layer=options.encoder_layer,
            embed_dim=options.embed_dim,
            intermediate_dim=options.intermediate_layer_dim)

        print("Model initialization done!")

        # One batch slot per GPU, and at least one when no GPU is present.
        # NOTE(review): the collator is assumed to return a list of that size -- confirm.
        collator = dataset.ImageTextCollator()
        num_batches = max(1, torch.cuda.device_count())

        optimizer = build_optimizer(txt2ImageModel,
                                    options.learning_rate,
                                    warump_steps=options.warmup)

        trainer = Caption2ImageTrainer(
            model=txt2ImageModel,
            caption_model=image_captioner,
            mask_prob=options.mask_prob,
            optimizer=optimizer,
            clip=options.clip,
            beam_width=options.beam_width,
            max_len_a=options.max_len_a,
            max_len_b=options.max_len_b,
            len_penalty_ratio=options.len_penalty_ratio,
            fp16=options.fp16,
            mm_mode=options.mm_mode)

        pin_memory = torch.cuda.is_available()

        def make_loader(data_path, **extra):
            # Both loaders share everything except the data path and a few extras.
            return ImageMTTrainer.get_img_loader(
                collator,
                dataset.ImageCaptionDatasetwNegSamples,
                data_path,
                txt2ImageModel,
                num_batches,
                options,
                pin_memory,
                lex_dict=lex_dict,
                **extra)

        img_train_loader = make_loader(options.train_path)
        img_dev_loader = make_loader(options.dev_path, shuffle=False, denom=2)

        step = 0
        train_epoch = 1
        while options.step > 0 and step < options.step:
            print("train epoch", train_epoch)
            step = trainer.train_epoch(img_data_iter=img_train_loader,
                                       img_dev_data_iter=img_dev_loader,
                                       max_step=options.step,
                                       lex_dict=lex_dict,
                                       saving_path=options.model_path,
                                       step=step)
            train_epoch += 1
Example #21
0
    def train(options):
        """Train (or continue training) a masked language model.

        Creates ``options.model_path`` if needed, builds or loads the model
        (``ReformerLM`` when ``options.reformer`` is set, otherwise ``LM``),
        applies the dropout setting, prepares train/dev datasets and loaders,
        and runs epochs while ``step <= options.step``.

        :param options: parsed command-line options.
        """
        # exist_ok avoids the check-then-create race of an explicit exists() test.
        os.makedirs(options.model_path, exist_ok=True)

        text_processor = TextProcessor(options.tokenizer_path)

        lm_class = ReformerLM if options.reformer else LM
        if options.pretrained_path is None:
            lm = lm_class(text_processor=text_processor,
                          size=options.model_size)
        else:
            lm = lm_class.load(options.pretrained_path)

        if options.reformer:
            # The reformer config exposes three separate dropout settings.
            lm.config.hidden_dropout_prob = options.dropout
            lm.config.local_attention_probs_dropout_prob = options.dropout
            lm.config.lsh_attention_probs_dropout_prob = options.dropout
        else:
            LMTrainer.config_dropout(lm, options.dropout)

        train_data = dataset.TextDataset(save_cache_dir=options.train_path,
                                         max_cache_size=options.cache_size)
        dev_data = dataset.TextDataset(save_cache_dir=options.dev_path,
                                       max_cache_size=options.cache_size,
                                       load_all=True)

        if options.continue_train:
            # Resume with the optimizer pickled next to the pretrained model.
            # NOTE: pickle.load can run arbitrary code; only load trusted files.
            with open(os.path.join(options.pretrained_path, "optim"),
                      "rb") as fp:
                optimizer = pickle.load(fp)
        else:
            optimizer = build_optimizer(lm, options.learning_rate,
                                        options.warmup)

        trainer = LMTrainer(model=lm,
                            mask_prob=options.mask_prob,
                            optimizer=optimizer,
                            clip=options.clip)

        collator = dataset.TextCollator(pad_idx=text_processor.pad_token_id())
        train_sampler, dev_sampler = None, None

        pin_memory = torch.cuda.is_available()
        loader = data_utils.DataLoader(train_data,
                                       batch_size=options.batch,
                                       shuffle=False,
                                       pin_memory=pin_memory,
                                       collate_fn=collator,
                                       sampler=train_sampler)
        dev_loader = data_utils.DataLoader(dev_data,
                                           batch_size=options.batch,
                                           shuffle=False,
                                           pin_memory=pin_memory,
                                           collate_fn=collator,
                                           sampler=dev_sampler)

        step, train_epoch = 0, 1
        while step <= options.step:
            print("train epoch", train_epoch)
            step = trainer.train_epoch(data_iter=loader,
                                       dev_data_iter=dev_loader,
                                       saving_path=options.model_path,
                                       step=step)
            # The counter was never advanced, so every epoch was logged as 1.
            train_epoch += 1
Example #22
0
    def __init__(self,
                 root_img_dir: str,
                 data_bin_file: str,
                 max_capacity: int,
                 text_processor: TextProcessor,
                 max_img_per_batch: int,
                 lex_dict=None,
                 ngpu=1,
                 use_neg_samples: bool = False):
        """
        Load marshalled caption data and pre-group the captions into padded batches.

        :param root_img_dir: stored on the instance as ``self.root_img_dir``.
        :param data_bin_file: file holding a marshalled ``(unique_images, captions)``
            pair, where each caption entry is ``(image_id, token_ids)``.
        :param max_capacity: batch capacity limit in millions (multiplied by
            1,000,000 below).
        :param text_processor: supplies the pad id, ``id2token`` and the language table.
        :param max_img_per_batch: a batch is closed once it holds more images than this.
        :param lex_dict: optional lexicon; when given, lexical candidates are
            computed per caption with ``get_lex_suggestions``.
        :param ngpu: a batch is only closed when it would hold at least this many captions.
        :param use_neg_samples: stored on the instance as ``self.use_neg_samples``.
        """
        self.ngpu = ngpu
        self.lex_dict = lex_dict
        # Image transforms are only created here; they are not applied in this method.
        self.size_transform = transforms.Resize(256)
        self.crop = transforms.CenterCrop(224)
        self.to_tensor = transforms.ToTensor()
        self.img_normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                  std=[0.229, 0.224, 0.225])
        self.pad_idx = text_processor.pad_token_id()
        self.batches = []
        self.root_img_dir = root_img_dir
        # Capacity is given in millions.
        max_capacity *= 1000000
        self.image_batches = []
        self.lang_ids = set()
        self.all_captions = []
        self.use_neg_samples = use_neg_samples

        print("Start", datetime.datetime.now())
        # Accumulators for the batch currently being filled.
        cur_batch, cur_imgs, cur_lex_cand_batch = [], [], []
        cur_max_len = 0
        with open(data_bin_file, "rb") as fp:
            self.unique_images, captions = marshal.load(fp)
            # The language is taken from the first token of the first caption only.
            lang_id = text_processor.id2token(captions[0][1][0])
            self.lang_ids.add(int(captions[0][1][0]))
            self.lang = text_processor.languages[
                lang_id] if lang_id in text_processor.languages else 0
            for caption_info in captions:
                image_id, caption = caption_info
                # Captions of images whose path ends with ".png" (any case) are skipped.
                if self.unique_images[image_id].lower().endswith(".png"):
                    continue
                caption = torch.LongTensor(caption)
                cur_batch.append(caption)
                self.all_captions.append(caption)
                if self.lex_dict is not None:
                    lex_cands = get_lex_suggestions(
                        self.lex_dict, caption, text_processor.pad_token_id())
                    cur_lex_cand_batch.append(lex_cands)

                cur_imgs.append(image_id)
                cur_max_len = max(cur_max_len, len(caption))
                # Capacity estimate: 2 * (longest caption)^3 * number of captions.
                batch_capacity_size = 2 * (cur_max_len**3) * len(cur_batch)
                # Once a limit is exceeded, emit everything except the caption just
                # added; that caption seeds the next batch.
                if (len(cur_imgs) > max_img_per_batch
                        or batch_capacity_size > max_capacity
                    ) and len(
                        cur_batch[:-1]) >= self.ngpu and len(cur_batch) > 1:
                    batch_tensor = pad_sequence(cur_batch[:-1],
                                                batch_first=True,
                                                padding_value=self.pad_idx)
                    lex_cand_batch = None
                    if self.lex_dict is not None:
                        lex_cand_batch = pad_sequence(
                            cur_lex_cand_batch[:-1],
                            batch_first=True,
                            padding_value=self.pad_idx)
                        cur_lex_cand_batch = [cur_lex_cand_batch[-1]]
                    # pads is True where the token is not the pad id.
                    pads = batch_tensor != self.pad_idx
                    # Per row: smallest column index holding padding, defaulting to
                    # the last column when the row has none.
                    pad_indices = [int(pads.size(1)) - 1] * int(pads.size(0))
                    pindices = torch.nonzero(~pads)
                    for (r, c) in pindices:
                        pad_indices[r] = min(pad_indices[r], int(c))

                    self.batches.append(
                        (batch_tensor, pads, torch.LongTensor(pad_indices),
                         lex_cand_batch))
                    self.image_batches.append(cur_imgs[:-1])

                    # Restart the accumulators with the held-back caption.
                    cur_batch = [cur_batch[-1]]
                    cur_imgs = [cur_imgs[-1]]
                    cur_max_len = len(cur_batch[0])

            # Flush whatever is left as the final batch.
            if len(cur_batch) > 0:
                batch_tensor = pad_sequence(cur_batch,
                                            batch_first=True,
                                            padding_value=self.pad_idx)
                pads = batch_tensor != self.pad_idx
                pad_indices = [int(pads.size(1)) - 1] * int(pads.size(0))
                lex_cand_batch = None
                if self.lex_dict is not None:
                    lex_cand_batch = pad_sequence(cur_lex_cand_batch,
                                                  batch_first=True,
                                                  padding_value=self.pad_idx)

                pindices = torch.nonzero(~pads)
                for (r, c) in pindices:
                    pad_indices[r] = min(pad_indices[r], int(c))

                self.batches.append(
                    (batch_tensor, pads, torch.LongTensor(pad_indices),
                     lex_cand_batch))
                self.image_batches.append(cur_imgs)

        print(
            "Loaded %d image batches of %d unique images and %d all captions!"
            % (len(self.batches), len(
                self.unique_images), len(self.all_captions)))
        print("End", datetime.datetime.now())
                      dest="output_file",
                      help="Output pickle file.",
                      metavar="FILE",
                      default=None)
    parser.add_option("--tok",
                      dest="tokenizer_path",
                      help="Path to the tokenizer folder",
                      metavar="FILE",
                      default=None)
    parser.add_option("--max-len",
                      dest="max_len",
                      help="Maximum tokenized caption length",
                      type="int",
                      default=256)
    parser.add_option("--sample", dest="sample_size", type="int", default=-1)
    (options, args) = parser.parse_args()
    return options


if __name__ == "__main__":
    # Command-line entry point: parse the flags, load the tokenizer from the
    # folder given by --tok, then hand everything to write().
    options = get_options()
    tokenizer = TextProcessor(options.tokenizer_path)

    print("Writing batches")
    write(
        input_file=options.file,
        output_file=options.output_file,
        text_processor=tokenizer,
        max_len=options.max_len,
        sample_size=options.sample_size,
    )
    print("Finished")
Example #24
0
    def __init__(self, text_processor: TextProcessor, lang_dec: bool = True, use_proposals=False, tie_embed=False,
                 enc_layer: int = 6, dec_layer: int = 3, embed_dim: int = 768, intermediate_dim: int = 3072,
                 freeze_image: bool = False, resnet_depth: int = 1, use_obj: bool=False):
        """Build the encoder, the decoder(s) and the output layer(s) of the model.

        Args:
            text_processor: Supplies the vocabulary size, the pad/bos/sep token ids
                and the ``languages`` collection used to size per-language modules.
            lang_dec: When true, one decoder copy and one output layer are created
                per entry of ``text_processor.languages``; otherwise a single
                decoder is created and its embeddings are tied to the encoder's.
            use_proposals: When true, creates the lexical-proposal modules
                (``proposal_embedding``, ``lexical_gate``, ``lexical_layer_norm``).
            tie_embed: Controls whether output layers are tied to the encoder word
                embeddings (see the branches below).
            enc_layer: Layer count forwarded to ``lm_config.get_config``.
            dec_layer: Stored as the decoder config's ``num_hidden_layers``.
            embed_dim: Forwarded to ``lm_config.get_config``.
            intermediate_dim: Forwarded to ``lm_config.get_config``.
            freeze_image: Stored on the instance; not otherwise used here.
            resnet_depth: Stored on the instance; not otherwise used here.
            use_obj: Not referenced in this constructor body.
                NOTE(review): confirm whether it is intentionally unused.
        """
        super(Seq2Seq, self).__init__()
        self.text_processor: TextProcessor = text_processor
        # The sep token doubles as the end-of-sequence token in the config.
        self.config = lm_config.get_config(vocab_size=text_processor.tokenizer.get_vocab_size(),
                                           pad_token_id=text_processor.pad_token_id(),
                                           bos_token_id=text_processor.bos_token_id(),
                                           eos_token_id=text_processor.sep_token_id(),
                                           enc_layer=enc_layer, embed_dim=embed_dim, intermediate_dim=intermediate_dim)

        self.enc_layer = enc_layer
        self.dec_layer = dec_layer
        self.embed_dim = embed_dim
        self.intermediate_dim = intermediate_dim
        # One token-type id per language.
        self.config["type_vocab_size"] = len(text_processor.languages)
        self.config = BertConfig(**self.config)
        # The decoder config is a copy of the encoder config that differs only in depth.
        dec_config = copy.deepcopy(self.config)
        dec_config.num_hidden_layers = self.dec_layer

        self.encoder = BertEncoderModel(self.config)
        self.encoder.init_weights()
        self.lang_dec = lang_dec
        self.tie_embed = tie_embed
        if not lang_dec:
            # Single decoder shared by all languages.
            self.decoder = BertDecoderModel(dec_config)
            # NOTE(review): `_tie_or_clone_weights` is presumably the transformers helper
            # that makes its first argument share the weights of its second - confirm
            # the argument order against the installed library version.
            self.encoder._tie_or_clone_weights(self.encoder.embeddings.position_embeddings,
                                               self.decoder.embeddings.position_embeddings)
            self.encoder._tie_or_clone_weights(self.encoder.embeddings.token_type_embeddings,
                                               self.decoder.embeddings.token_type_embeddings)
            self.encoder._tie_or_clone_weights(self.encoder.embeddings.word_embeddings,
                                               self.decoder.embeddings.word_embeddings)

            if tie_embed:
                # One output layer for every language, tied to the word embeddings.
                self.output_layer = BertOutputLayer(dec_config)
                self.encoder._tie_or_clone_weights(self.output_layer, self.encoder.embeddings.word_embeddings)
                # NOTE(review): this repeats the position-embedding tie already done above.
                self.encoder._tie_or_clone_weights(self.encoder.embeddings.position_embeddings,
                                                   self.decoder.embeddings.position_embeddings)
                self.decoder._tie_or_clone_weights(self.output_layer, self.decoder.embeddings.word_embeddings)
            else:
                # Independent output layer per language.
                self.output_layer = nn.ModuleList([BertOutputLayer(dec_config) for _ in text_processor.languages])

            # With equal depth, each decoder layer reuses the attention module object
            # of the encoder layer at the same index.
            if len(self.encoder.encoder.layer) == len(self.decoder.decoder.layer):
                for i in range(len(self.encoder.encoder.layer)):
                    self.decoder.decoder.layer[i].attention = self.encoder.encoder.layer[i].attention

        else:
            # Language-specific decoders: deep copies of one freshly built decoder,
            # plus one output layer per language.
            dec = BertDecoderModel(dec_config)
            self.decoder = nn.ModuleList([copy.deepcopy(dec) for _ in text_processor.languages])
            self.output_layer = nn.ModuleList([BertOutputLayer(dec_config) for _ in text_processor.languages])
            for i, dec in enumerate(self.decoder):
                if tie_embed:
                    self.encoder._tie_or_clone_weights(self.output_layer[i], self.encoder.embeddings.word_embeddings)
                    # The decoder uses the very same position-embedding module as the encoder.
                    dec.embeddings.position_embeddings = self.encoder.embeddings.position_embeddings
                dec._tie_or_clone_weights(self.output_layer[i], dec.embeddings.word_embeddings)
                dec._tie_or_clone_weights(self.encoder.embeddings.token_type_embeddings,
                                          dec.embeddings.token_type_embeddings)

        self.use_proposals = use_proposals
        if self.use_proposals:
            # Proposals are embedded with the encoder's own word-embedding module.
            self.proposal_embedding = self.encoder.embeddings.word_embeddings
            # Learnable gate of shape (1, hidden_size), initialised to 0.1 everywhere.
            self.lexical_gate = nn.Parameter(torch.zeros(1, self.config.hidden_size).fill_(0.1), requires_grad=True)
            self.lexical_layer_norm = nn.LayerNorm(self.config.hidden_size, eps=self.config.layer_norm_eps)

        self.freeze_image = freeze_image
        self.resnet_depth = resnet_depth
Example #25
0
    def test_albert_seq2seq_init(self):
        """Check the output size of a freshly built Seq2Seq on a toy batch.

        A tokenizer is trained on ``sample.txt`` (next to this file), a model
        is built with default settings, and the forward pass is run with and
        without ``log_softmax``; both outputs must have size
        ``[5, vocab_size]``.
        """
        here = os.path.dirname(os.path.realpath(__file__))
        sample_file = os.path.join(here, "sample.txt")

        with tempfile.TemporaryDirectory() as work_dir:
            processor = TextProcessor()
            processor.train_tokenizer([sample_file],
                                      vocab_size=1000,
                                      to_save_dir=work_dir,
                                      languages={"<en>": 0, "<fa>": 1})
            seq2seq = Seq2Seq(text_processor=processor)

            pad = processor.pad_token_id()
            src_inputs = torch.tensor([[1, 2, 3, 4, 5, pad, pad],
                                       [1, 2, 3, 4, 5, 6, pad]])
            tgt_inputs = torch.tensor([[6, 8, 7, pad, pad],
                                       [6, 8, 7, 8, pad]])
            src_mask = src_inputs != pad
            tgt_mask = tgt_inputs != pad
            src_langs = torch.tensor([[0], [0]]).squeeze()
            tgt_langs = torch.tensor([[1], [1]]).squeeze()

            # First pass asks for log-probabilities, second uses the default.
            for extra_kwargs in ({"log_softmax": True}, {}):
                seq_output = seq2seq(src_inputs, tgt_inputs, src_mask,
                                     tgt_mask, src_langs, tgt_langs,
                                     **extra_kwargs)
                assert list(seq_output.size()) == [5, processor.vocab_size()]
Example #26
0
            sorted_examples = list(map(lambda len_item: examples[len_item[0]], sorted_lens))
            with open(output_file + "." + str(part_num), "wb") as fw:
                marshal.dump(sorted_examples, fw)


def get_options():
    """Parse the command-line flags of the batch-writing script.

    Reads ``sys.argv`` through ``optparse``. The parsed values are also stored
    in the module-level ``options`` name, as before.

    Returns:
        The ``optparse.Values`` object with attributes ``src_data_path``,
        ``dst_data_path``, ``output_path``, ``tokenizer_path``,
        ``max_seq_len`` (default 175), ``min_seq_len`` (default 1),
        ``src_lang`` and ``dst_lang`` (all others default to ``None``).
    """
    global options
    parser = OptionParser()
    parser.add_option("--src", dest="src_data_path", help="Path to the source txt file", metavar="FILE", default=None)
    parser.add_option("--dst", dest="dst_data_path", help="Path to the target txt file", metavar="FILE", default=None)
    parser.add_option("--output", dest="output_path", help="Output marshal file ", metavar="FILE", default=None)
    parser.add_option("--tok", dest="tokenizer_path", help="Path to the tokenizer folder", metavar="FILE", default=None)
    parser.add_option("--max_seq_len", dest="max_seq_len", help="Max sequence length", type="int", default=175)
    # The help text used to say "Max" here, a copy-paste slip from the flag above.
    parser.add_option("--min_seq_len", dest="min_seq_len", help="Min sequence length", type="int", default=1)
    parser.add_option("--src-lang", dest="src_lang", type="str", default=None)
    parser.add_option("--dst-lang", dest="dst_lang", type="str", default=None)
    options, _ = parser.parse_args()
    return options


if __name__ == "__main__":
    # Command-line entry point: resolve the language tokens to ids and write
    # the tokenized examples to the output file.
    options = get_options()
    tokenizer = TextProcessor(options.tokenizer_path)

    print(datetime.datetime.now(), "Writing batches")
    # Language tags are looked up in the vocabulary as "<xx>" tokens.
    src_lang = tokenizer.token_id("<" + options.src_lang + ">")
    if options.dst_lang is None:
        dst_lang = None
    else:
        dst_lang = tokenizer.token_id("<" + options.dst_lang + ">")
    # NOTE(review): --max_seq_len / --min_seq_len are parsed but not passed on
    # here - confirm whether write() should receive them.
    write(
        text_processor=tokenizer,
        output_file=options.output_path,
        src_txt_file=options.src_data_path,
        dst_txt_file=options.dst_data_path,
        src_lang=src_lang,
        dst_lang=dst_lang,
    )
    print(datetime.datetime.now(), "Finished")
Example #27
0
def _sorted_by_length(examples, lens):
    """Return the values of ``examples`` ordered by ascending value in ``lens``.

    Both dicts share the same keys; ``sorted`` is stable, so equal lengths
    keep their insertion order.
    """
    return [examples[key] for key, _ in sorted(lens.items(), key=lambda item: item[1])]


def _dump_part(examples, lens, output_file, part_num):
    """Sort one chunk by length and marshal it to ``<output_file>.<part_num>``."""
    print(datetime.datetime.now(), "Sorting and writing", part_num)
    sorted_examples = _sorted_by_length(examples, lens)
    with open(output_file + "." + str(part_num), "wb") as fw:
        marshal.dump(sorted_examples, fw)


def write(text_processor: TextProcessor, output_file: str, src_txt_file: str, src_lang: int,
          dst_txt_file: str = None, dst_lang: int = None, min_len: int = 1, max_len: int = 175,
          part_size: int = 6000000):
    """Tokenize text files and marshal the examples sorted by token length.

    Parallel mode (``dst_txt_file`` given): source and target files are read
    in lockstep. Each kept pair becomes
    ``(src_tokens, dst_tokens, src_lang_str, dst_lang_str)``; the pairs are
    sorted by target length and dumped to ``output_file``.

    Monolingual mode (``dst_txt_file`` is None, used for MASS training where
    only source sentences exist): each kept line becomes
    ``(src_tokens, src_lang_str)``. Examples are sorted by length and dumped
    in chunks to ``output_file.0``, ``output_file.1``, ...

    Args:
        text_processor: Tokenizer providing ``languages``, ``id2token`` and
            ``tokenize_one_sentence_with_langid``.
        output_file: Output path (parallel mode) or path prefix (monolingual).
        src_txt_file: Source text file, one sentence per line.
        src_lang: Token id of the source language tag.
        dst_txt_file: Optional target text file aligned with the source.
        dst_lang: Token id of the target language tag (parallel mode).
        min_len: Minimum tokenized length (inclusive) for an example.
        max_len: Maximum tokenized length (inclusive) for an example.
        part_size: Monolingual mode only; number of examples per output part.
    """
    examples = {}
    lens = {}
    line_num = 0
    src_lang_str = text_processor.languages[text_processor.id2token(src_lang)]

    if dst_txt_file is not None:
        dst_lang_str = text_processor.languages[text_processor.id2token(dst_lang)]
        with open(src_txt_file, "r") as s_fp, open(dst_txt_file, "r") as d_fp:
            for src_line, dst_line in zip(s_fp, d_fp):
                src_text, dst_text = src_line.strip(), dst_line.strip()
                if not src_text or not dst_text:
                    continue
                src_tok_line = text_processor.tokenize_one_sentence_with_langid(src_text, src_lang)
                dst_tok_line = text_processor.tokenize_one_sentence_with_langid(dst_text, dst_lang)
                if not (min_len <= len(src_tok_line) <= max_len and min_len <= len(dst_tok_line) <= max_len):
                    continue

                examples[line_num] = (src_tok_line, dst_tok_line, src_lang_str, dst_lang_str)
                lens[line_num] = len(dst_tok_line)
                line_num += 1
                # Report progress only when the counter actually advanced;
                # otherwise every rejected pair would re-print the same number.
                if line_num % 1000 == 0:
                    print(line_num, end="\r")

        print("\nSorting")
        sorted_examples = _sorted_by_length(examples, lens)
        print("Sorted examples")

        print("Dumping")
        with open(output_file, "wb") as fw:
            marshal.dump(sorted_examples, fw)
        return

    part_num = 0
    with open(src_txt_file, "r") as s_fp:
        for src_line in s_fp:
            src_text = src_line.strip()
            if not src_text:
                continue
            src_tok_line = text_processor.tokenize_one_sentence_with_langid(src_text, src_lang)
            if min_len <= len(src_tok_line) <= max_len:
                examples[line_num] = (src_tok_line, src_lang_str)
                lens[line_num] = len(src_tok_line)
                line_num += 1
                if line_num % 1000 == 0:
                    print(line_num, "\r", end="")

            # Flush a full chunk to its own part file to bound memory use.
            if len(examples) >= part_size:
                _dump_part(examples, lens, output_file, part_num)
                examples = {}
                lens = {}
                part_num += 1

    # Whatever is left after the last full chunk goes into a final part.
    if examples:
        _dump_part(examples, lens, output_file, part_num)
Example #28
0
class HtmlProcessor:
    """Builds two datasets from a directory of house-advertisement HTML pages.

    ``createFirstDataset`` extracts numeric features (price, rooms, surface,
    bathrooms, floor) from every usable ad, stores each ad description in its
    own text file and writes the feature table to ``dataframe1.tsv``.

    ``createSecondDataset`` builds a tf-idf matrix of the descriptions in
    which every document vector has unit Euclidean norm, and writes it as a
    TSV file.
    """

    def __init__(self, directory="./htmls/", MAX_DOCUMENTS=10000):
        """Store the configuration and create the empty tf-idf containers.

        Args:
            directory: Folder holding the ``.html`` files. It is concatenated
                directly with each file name, so it should end with a path
                separator.
            MAX_DOCUMENTS: Number of accepted ads after which
                ``createFirstDataset`` stops reading files.
        """
        self.directory = directory
        self.MAX_DOCUMENTS = MAX_DOCUMENTS
        # Text processor used on the ad descriptions; created lazily by
        # createSecondDataset.
        self.tp = None
        # doc_index -> list of (zero-based column, tf-idf value) tuples.
        self.tfidf = {}
        # doc_index -> Euclidean norm of that document's tf-idf vector.
        self.euclidean_norm = {}
        # doc_index -> dense tf-idf vector scaled to unit Euclidean norm.
        self.tfidf_normalized = {}

    def createFirstDataset(self):
        """Parse the HTML ads and write the numeric feature table.

        An ad is discarded when any required field is missing, is ``None``,
        expresses a range or lower bound ('+', '-', 'da') or has no numeric
        token; when the floor, description or canonical link cannot be
        found; or when the hydration JSON tag is absent.

        Side effects: writes ``./descriptions/descriptionAD#<n>.txt`` for each
        accepted ad and ``dataframe1.tsv`` (indexed by ad link).
        """
        # Running index of accepted ads (also numbers the description files).
        ad_index = 0
        # JSON fields required for every ad.
        fields = ['locali', 'bagni', 'prezzo', 'superficie']
        # Column value lists; order must match columnsDf below.
        price = []
        locali = []
        superficie = []
        bagni = []
        piano = []
        allFeatures = [price, locali, superficie, bagni, piano]
        indexes = []
        # Scratch values for the current ad: they are appended to the column
        # lists only once the whole ad has been validated, so a rejected ad
        # never needs to be popped back out.
        row_sample = dict.fromkeys(fields, 0)
        piano_sample = None
        # Attribute of the <abbr> tag that holds the floor value.
        attribute_name_piano = 'title'

        for filePath in tqdm(listdir(self.directory)):
            if ad_index >= self.MAX_DOCUMENTS:
                break
            with open(self.directory + filePath, "r") as filereader:
                html = filereader.read()
            soup = BeautifulSoup(html, 'html.parser')
            # Most features live in a JSON document embedded in the page.
            hydration_tag = soup.find(id='js-hydration')
            if hydration_tag is None:
                # No embedded JSON: treat it like any other unusable ad
                # instead of crashing the whole run.
                continue
            metadata = json.loads(hydration_tag.text.strip())

            # Flag telling the outer loop whether the ad is still acceptable.
            toCheck = True
            for field in fields:
                if field not in metadata or metadata[field] is None:
                    toCheck = False
                    break
                value = metadata[field]
                # '5+' (five or more), '180.000-300.000' (range) and 'da ...'
                # ("from", a lower bound) do not give a single value.
                if '+' in value or '-' in value or 'da' in value:
                    toCheck = False
                    break
                # Accept the field only if one of its tokens is a number once
                # the thousands separator and the euro sign are removed; the
                # last such token wins.
                toCheck = False
                for string in value.split(' '):
                    cleaned = string.replace('.', '').replace('€', '')
                    if cleaned.isdigit():
                        toCheck = True
                        row_sample[field] = int(cleaned)
                if not toCheck:
                    break

            if not toCheck:
                continue

            # Floor ('piano'): read from the attribute of a dedicated <abbr>.
            piano_abbr = soup.find('abbr',
                                   attrs={'class': 'text-bold im-abbr'})
            if piano_abbr is None or attribute_name_piano not in piano_abbr.attrs:
                continue
            piano_value = piano_abbr.attrs[attribute_name_piano]
            # Ground ('T'), raised ('R') and basement ('S') floors count as 1.
            if piano_value in {'T', 'R', 'S', 'Piano terra'}:
                piano_sample = 1
            elif piano_value.isdigit():
                piano_sample = int(piano_value)
            else:
                continue

            description_div = soup.find(
                'div',
                attrs={'class': 'col-xs-12 description-text text-compressed'})
            if description_div is None:
                continue

            # The canonical link identifies the ad in the dataframe index.
            link_tag = soup.find('link', attrs={'rel': 'canonical'})
            if link_tag is None or 'href' not in link_tag.attrs:
                continue

            # The ad is valid from here on: persist it.
            with open("./descriptions/descriptionAD#" + str(ad_index) + ".txt",
                      "w") as text_file:
                text_file.write(description_div.text.strip())
            adLink = link_tag.attrs['href']
            ad_index += 1

            indexes.append(adLink)
            price.append(row_sample['prezzo'])
            locali.append(row_sample['locali'])
            superficie.append(row_sample['superficie'])
            bagni.append(row_sample['bagni'])
            piano.append(piano_sample)

        columnsDf = ["price", "locali", "superficie", "bagni", "piano"]
        toReturn = pd.DataFrame()
        for column_name, column_values in zip(columnsDf, allFeatures):
            toReturn[column_name] = column_values
        toReturn.index = indexes
        toReturn.to_csv('dataframe1.tsv', sep='\t')

    def createSecondDataset(self, datasetFileName="df2.tsv"):
        """Build the normalized tf-idf matrix and write it as a TSV file.

        Args:
            datasetFileName: Path of the TSV file to write. Rows are document
                indexes; columns are labelled 1..VOCABULARY_SIZE.
        """
        self.tp = TextProcessor()
        # Term encoding and term frequencies are built by the text processor.
        self.tp.buildEncoding("encoding.pickle")
        self.tp.buildTf(fileName='tf.pickle')
        # Raw tf-idf -> per-document norms -> unit-norm vectors.
        self._createtfidf()
        self._computeEuclideanNorm()
        self._createNormalizedtfidf()
        pd.DataFrame.from_dict(data=self.tfidf_normalized,
                               columns=range(1, self.tp.VOCABULARY_SIZE + 1),
                               orient='index').to_csv(datasetFileName,
                                                      sep='\t')

    def _createNormalizedtfidf(self):
        """Fill ``tfidf_normalized`` with dense unit-norm tf-idf vectors.

        Requires ``tfidf`` and ``euclidean_norm`` to be populated already.
        """
        for doc_index, tfidf_list in self.tfidf.items():
            vector = [0] * self.tp.VOCABULARY_SIZE
            self.tfidf_normalized[doc_index] = vector
            norm = self.euclidean_norm[doc_index]
            if norm == 0:
                # Every component is zero, so the vector stays all zeros.
                continue
            for column, value in tfidf_list:
                # `column` is already zero-based (see _createtfidf). It used
                # to be decremented a second time here, which shifted every
                # value one column left and wrapped the first word to the end.
                vector[column] = value / norm

    def _computeEuclideanNorm(self):
        """Compute the Euclidean norm of every document's tf-idf vector."""
        for doc_index, tfidf_list in self.tfidf.items():
            # Square root of the sum of the squared tf-idf components.
            self.euclidean_norm[doc_index] = sum(
                value**2 for _, value in tfidf_list)**0.5

    def _createtfidf(self):
        """Fill ``tfidf`` from the text processor's term frequencies.

        ``tp.tf`` maps each word to a list of ``(doc_index, tf)`` tuples, so
        the document frequency of a word is the length of that list. The
        result is keyed by document index, which makes building the final
        matrix row by row straightforward.
        """
        for word, postings in self.tp.tf.items():
            df = len(postings)
            if df == 0:
                # No document contains the word: nothing to add (and no idf).
                continue
            # Word ids are presumably 1-based (the output columns are labelled
            # 1..VOCABULARY_SIZE), hence the shift to a zero-based column.
            column = int(self.tp.term_enc[word]) - 1
            # idf depends only on the word, so compute it once per word.
            idf = log(self.tp.NUMBER_OF_DOCS / df + 1)
            for doc_id, tfValue in postings:
                self.tfidf.setdefault(int(doc_id), []).append(
                    (column, tfValue * idf))
Example #29
0
import time;
from textprocessor import TextProcessor
from bayes_classifier import BayesClassifier

form_file = 'formy'
polish_texts = ['dramat', 'popul', 'proza', 'publ', 'wp']
path_to_files = './data/'
show_simmilar = True;
num_of_simmilar = 4

if __name__ == '__main__':
    textprocessor = TextProcessor()
    textprocessor.create_dictionary(path_to_file = path_to_files, form_file=form_file)
    textprocessor.improve_dictionary(path_to_files = path_to_files, polish_texts=polish_texts)
    dict_of_words = textprocessor.dict_of_words;

    input_word = input('Napisz pojedyncze slowo:\n')
    start = time.time();
    input_word = textprocessor.map_chars(input_word)
    if not input_word in dict_of_words:
        bayes_classifier = BayesClassifier()
        simmilar_words = bayes_classifier.calculate(f'{input_word}', dict_of_words)
        unmapped_words = []
        for word in simmilar_words:
            unmapped_words.append(textprocessor.unmap_words(word))
        print(f'Slowo nie wystepuje w polskim jezyku.')
        print(f'Moze chodzilo o \'{unmapped_words[0]}\'?')
        show_hints = input('Chcesz zobaczyc inne mozliwosci? t/n\n')
        if(show_hints == 't'):
            print(f'Inne możliwosci {unmapped_words[1:num_of_simmilar]}\n')
    else:
Example #30
0
        'sp', 'llc', 'co', 'ltd', 'tel', 'email', ' ', 'tel', 'fax', 'gmail',
        'com', 'eu', 'pl', 'telfax', 'office', 'burg', 'poland'
    ]


fileName = "./data/pap.txt"
note_idx = 98

#https://www.datascienceassn.org/sites/default/files/users/user1/lsa_presentation_final.pdf?fbclid=IwAR3ax6JNemqmWzfau24-UwePT7isOEDP5mAE3jbCQG92dITVVwV9ZS7CYiA

if __name__ == '__main__':
    start = time.time()

    ### PREPROCESSING

    textProcessor = TextProcessor()
    textProcessor.create_dictionary("data", "odm.txt")

    lineWords = []
    for line in open(fileName, 'r', encoding='utf-8'):
        read_line = line.replace('#', '').strip('\n').strip(' ')
        if not read_line.isdigit():
            lineWords.append(textProcessor.preprocess(read_line))

    textProcessor.create_frequency_dict(lineWords)
    preprocessed = []
    textProcessor.pre_process_vol_2(lineWords)

    ### Document-term matrix

    stop_list = get_stop_list()