Example #1
def preProcess():
    print('PreProcess Reuters Corpus')
    start_time = time.time()
    docs = 0
    bad = 0
    tokenizer = Tokenizer()

    if not os.path.isdir(Paths.base):
        os.makedirs(Paths.base)

    with open(Paths.text_index, 'w') as fileid_out:
        with codecs.open(Paths.texts_clean, 'w', 'utf-8-sig') as out:
            with codecs.open(Paths.reuter_test, 'w', 'utf-8-sig') as test:
                for f in reuters.fileids():
                    contents = reuters.open(f).read()
                    try:
                        tokens = tokenizer.tokenize(contents)
                        docs += 1
                        if docs % 1000 == 0:
                            print("Normalised %d documents" % docs)

                        out.write(' '.join(tokens) + "\n")
                        # if f.startswith("train"):
                        #
                        # else:
                        #     test.write(' '.join(tokens) + "\n")
                        fileid_out.write(f + "\n")

                    except UnicodeDecodeError:
                        bad += 1
    print("Normalised %d documents" % docs)
    print("Skipped %d bad documents" % bad)
    print('Finished building train file ' + Paths.texts_clean)
    end_time = time.time()
    print('(Time to preprocess Reuters Corpus: %s)' % (end_time - start_time))
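The function relies on imports that the listing omits. A possible preamble is sketched below: the standard-library modules and the NLTK Reuters corpus reader are certain, while `Tokenizer` and `Paths` are project-local helpers, so the module they are imported from here is only a placeholder.

import os
import time
import codecs

from nltk.corpus import reuters  # NLTK Reuters corpus reader

# Project-local helpers (assumed; placeholder module name): a Tokenizer exposing
# tokenize(), and a Paths namespace holding the base / text_index / texts_clean /
# reuter_test locations.
from preprocessing import Tokenizer, Paths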
Example #2
def append(h_in, h_out, split):
    tkn = Tokenizer(args.tagger)
    data = h_in[split]  # train, dev, test
    cur_size = len(data['product'])

    bcateid = data['bcateid'][()]
    mcateid = data['mcateid'][()]
    scateid = data['scateid'][()]
    dcateid = data['dcateid'][()]

    def get_label(i, vocab_type="bmsd"):
        b = bcateid[i]
        m = mcateid[i]
        s = scateid[i]
        d = dcateid[i]

        if split == 'train':
            if vocab_type == "bmsd":
                y = bmsd_vocab['%s>%s>%s>%s' % (b, m, s, d)]
            else:
                raise ValueError('unsupported vocab_type: %s' % vocab_type)
            return y
        else:
            return -1

    h_out['img_feat'] = data['img_feat'][:]
    h_out['pid'] = data['pid'][:]
    h_out['label'] = [get_label(i, vocab_type="bmsd") for i in range(cur_size)]

    for col in columns:
        result = []
        for i in range(cur_size):
            txt = normalize(data[col][i], col_type=col)
            words = tkn.tokenize(txt)
            result.append(np.string_(words))

        h_out[col] = np.array(result, dtype="S1000")
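The function also depends on module-level names that are not shown: `args` (parsed options with a `tagger` attribute), `columns` (the text columns to tokenize), `bmsd_vocab` (a mapping from the joined `b>m>s>d` category path to a label index), and a `normalize` helper. A minimal driver sketch, assuming the inputs are h5py files laid out the way the code expects; the file names and split names are placeholders.

import h5py

# Hypothetical driver: copy each split of a source HDF5 file into a
# preprocessed output file. All paths here are assumptions.
with h5py.File('data.chunk.01.h5', 'r') as h_in, \
        h5py.File('preprocessed.h5', 'w') as out_file:
    for split in ('train', 'dev', 'test'):
        if split not in h_in:
            continue
        append(h_in, out_file.create_group(split), split)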
Example #3
class DataLoader(object):
    """Load the images and labels from the database and process into batches
    Attributes:
        data_base_dir (str): Folder with the processed images
        label_path (str): File with latex math formulas
        max_aspect_ratio (int): Maximum aspect ratio (width/height) for images
        max_encoder_l_h (int): Maximum size for the images height
        max_encoder_l_w (int): Maximum size for the images width
        max_decoder_l (int): Maximum number of tokens for the latex formula
    """
    def __init__(self, data_base_dir, label_path, max_aspect_ratio,
                 max_encoder_l_h, max_encoder_l_w, max_decoder_l,
                 max_vocab_size, initial_id2voc, initial_voc2id):

        # folder with processed images
        self.data_base_dir = data_base_dir
        # .lst file with formulas
        self.label_path = label_path
        self.max_width = 10000
        self.max_aspect_ratio = max_aspect_ratio
        self.max_encoder_l_h = max_encoder_l_h
        self.max_encoder_l_w = max_encoder_l_w
        self.max_decoder_l = max_decoder_l
        self.min_aspect_ratio = 0.5
        self.vocab_size = max_vocab_size
        self.tokenizer = Tokenizer(initial_id2voc, initial_voc2id)
        # buffer to save groups of batches with same width and height
        self.buffer = defaultdict(lambda: defaultdict(list))

    def process_batch(self, buf, img_width, img_height):
        """ Return a batch of images with labels and take it out of the buffer
        Args:
            buf (:obj:dict:dict:list): object containing images according
                                       to the images size and width
            img_width (int): size of the image's width in the batch
            img_height (int): size of the image's height in the batch
        """
        # store images and targets in tensors
        batch_size = len(buf[img_width][img_height])
        images = torch.Tensor(batch_size, 1, img_height, img_width)
        img_paths = []
        max_target_length = max([
            len(buf_element[1]) for buf_element in buf[img_width][img_height]
        ])

        for k in range(batch_size):
            img_paths.append(buf[img_width][img_height][k][2])
            images[k] = torch.from_numpy(buf[img_width][img_height][k][0])

        targets = torch.zeros(batch_size, max_target_length - 1)
        targets_eval = torch.zeros(batch_size, max_target_length - 1)

        num_nonzer = 0
        for m in range(len(buf[img_width][img_height])):
            num_nonzer = (num_nonzer + len(buf[img_width][img_height][m][1]) -
                          2)

            for j in range(len(buf[img_width][img_height][m][1]) - 1):
                targets[m][j] = buf[img_width][img_height][m][1][j]
                targets_eval[m][j] = buf[img_width][img_height][m][1][j + 1]
        # restart buffer
        buf[img_width][img_height] = []
        return images, targets, targets_eval, num_nonzer, img_paths

    def create_data_generator(self, batch_size, directory_path):
        """ Create a generator that will yield the images and labels
        Args:
            batch_size (int): size of the batch to generate
            directory_path (string): path of the file containing
                                     filenames of the images and formulas
        """
        image_list = read_formulas_directory(directory_path)

        for i in range(0, len(image_list)):
            # Get the image path and read the image
            img_path = image_list[i][0]
            img = imageio.imread("../data/images_processed/" + img_path)
            # Convert color image to grayscale
            # (the shape of the image object changes from (h,w,3) to (h,w))
            rgb2gray_weights = [0.299, 0.587, 0.114]
            img = np.average(img, weights=rgb2gray_weights, axis=2)
            # Get the formula number and save it to a list
            label_str = image_list[i][1]
            # tokenize function
            label_list = self.tokenizer.tokenize(self.label_path, label_str)

            origH = img.shape[0]
            origW = img.shape[1]

            # if list of tokens is too big, truncate
            if len(label_list) > self.max_decoder_l:
                label_list = label_list[:self.max_decoder_l]

            bounds_check = (len(label_list), math.floor(origH / 8.0),
                            math.floor(origW / 8.0))
            bounds_tuple = (self.max_decoder_l, self.max_encoder_l_h,
                            self.max_encoder_l_w)
            # check every bound element-wise; a plain tuple comparison would be
            # lexicographic and could let oversized images through
            if all(check <= bound
                   for check, bound in zip(bounds_check, bounds_tuple)):
                # clamp the aspect ratio so it stays between the
                # defined max and min aspect ratios
                aspect_ratio = origW / origH
                aspect_ratio = min(aspect_ratio, self.max_aspect_ratio)
                aspect_ratio = max(aspect_ratio, self.min_aspect_ratio)

                imgW = origW
                imgH = origH

                self.buffer[imgW][imgH].append([img, label_list, img_path])
                # when buffer reaches batch_size,
                # return images and targets as tensors
                if len(self.buffer[imgW][imgH]) == batch_size:
                    images, targets, targets_eval, num_nonzer, img_paths = (
                        self.process_batch(self.buffer, imgW, imgH))
                    yield images, targets, targets_eval, num_nonzer, img_paths

        # after iterating all the images, return any incomplete batches
        # still stored in the buffer
        for imgW in self.buffer:
            for imgH in self.buffer[imgW]:
                if len(self.buffer[imgW][imgH]) > 0:
                    images, targets, targets_eval, num_nonzer, img_paths = (
                        self.process_batch(self.buffer, imgW, imgH))
                    yield images, targets, targets_eval, num_nonzer, img_paths
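A sketch of how the generator might be consumed; every constructor value, vocabulary mapping, and path below is a placeholder rather than a value from the original project.

# Hypothetical driver; the vocabularies and paths are placeholders built elsewhere.
id2voc, voc2id = {0: '<pad>'}, {'<pad>': 0}

loader = DataLoader(data_base_dir='../data/images_processed/',
                    label_path='../data/formulas.norm.lst',
                    max_aspect_ratio=8,
                    max_encoder_l_h=20,
                    max_encoder_l_w=50,
                    max_decoder_l=150,
                    max_vocab_size=500,
                    initial_id2voc=id2voc,
                    initial_voc2id=voc2id)

generator = loader.create_data_generator(
    batch_size=20, directory_path='../data/train_filter.lst')

for images, targets, targets_eval, num_nonzero, img_paths in generator:
    pass  # feed `images` and `targets` to the encoder-decoder model here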
Example #4
class ExsBuilder:
    """ExsBuilder produces a list of examples given a document set"""
    def __init__(self,
                 bert_model='bert-base-uncased',
                 file_emb='',
                 vocab_size=150000,
                 min_src_nsents=1,
                 max_src_nsents=50,
                 min_src_ntokens_per_sent=3,
                 max_src_ntokens_per_sent=100):
        logger.info('=== Initializing an example builder'.ljust(80, '='))
        self.min_src_nsents = min_src_nsents
        self.max_src_nsents = max_src_nsents
        self.min_src_ntokens_per_sent = min_src_ntokens_per_sent
        self.max_src_ntokens_per_sent = max_src_ntokens_per_sent

        logger.debug(f'Loading BERT pre-trained model [{bert_model}]')
        self.tokB = BertTokenizer.from_pretrained(bert_model)
        self.tokC = None
        if file_emb != '':
            logger.debug('Loading the WBMET dictionary for custom tokenizer')
            self.tokC = Tokenizer(vocab_size=vocab_size)
            self.tokC.from_pretrained(file_emb)
        self.doc_lbl_freq = [0, 0]  # document-level [irrel, rel]
        self.ext_lbl_freq = [0, 0]  # token-level [irrel, rel]

    @staticmethod
    def tokenize(data, src_keys=['title', 'body'], tgt_key='text'):
        """Use Stanford CoreNLP tokenizer to tokenize all the documents."""
        REMAP = {
            "-LRB-": "(",
            "-RRB-": ")",
            "-LCB-": "{",
            "-RCB-": "}",
            "-LSB-": "[",
            "-RSB-": "]",
            "``": '"',
            "''": '"'
        }
        with CoreNLPClient(annotators=['tokenize', 'ssplit'], threads=CPU_CNT)\
                as client:
            for did, d in tqdm(data.items()):
                text = ''
                for k in src_keys:
                    text += d[k] + ' '
                ann = client.annotate(text.strip())
                tokens = []  # list of tokenized sentences
                for sent in ann.sentence:
                    tokens.append([
                        REMAP[t.word] if t.word in REMAP else t.word.lower()
                        for t in sent.token
                    ])
                d[tgt_key] = tokens

    def encode(self, exs):
        """Convert sequences into indicies and create data entries for
        model inputs"""
        rtn = []
        logger.info('Encoding examples...')
        for qid, did, rel, doc, flds, mesh, keywords in tqdm(exs):
            entry = {
                'qid': qid,
                'did': did,
                'src': [],
                'src_sent_lens': [],
                'tgtB': [],
                'tgtB_sent_lens': [],
                'tgtC': [],
                'tgtC_sent_lens': []
            }

            # src
            for s in doc:  # CoreNLP tokenized sequences (list of sentences)
                if len(s) <= self.min_src_ntokens_per_sent:
                    continue
                src_str = ' '.join(s[:self.max_src_ntokens_per_sent])
                entry['src'] += self.tokB.convert_tokens_to_ids(
                    self.tokB.tokenize(src_str))
                entry['src_sent_lens'].append(len(entry['src']))
            if len(entry['src']) == 0:
                continue

            # tgt - fields
            tgt_tokens = set()  # Used in identifying token-level labels
            for seq in flds:  # flds (disease, gene, demo)
                # BERT
                ids = self.tokB.convert_tokens_to_ids(self.tokB.tokenize(seq))
                tgt_tokens.update(ids)
                entry['tgtB'] += ids
                entry['tgtB_sent_lens'].append(len(entry['tgtB']))
                # BMET
                ids = self.tokC.convert_tokens_to_ids(self.tokC.tokenize(seq))
                ids = list(filter(lambda x: x > 1, ids))  # Remove UNKs
                entry['tgtC'] += ids
                entry['tgtC_sent_lens'].append(len(entry['tgtC']))

            # tgt - mesh
            mesh = [f'εmesh_{t}' for t in mesh[0].lower().split()]
            ids = self.tokC.convert_tokens_to_ids(mesh)
            ids = list(filter(lambda x: x > 1, ids))  # Remove UNKs
            entry['tgtC'] += ids
            entry['tgtC_sent_lens'].append(len(entry['tgtC']))

            # tgt - keywords
            seq = ' '.join(keywords)
            ids = self.tokC.convert_tokens_to_ids(self.tokC.tokenize(seq))
            ids = list(filter(lambda x: x > 1, ids))  # Remove UNKs
            tgt_tokens.update(ids)
            entry['tgtC'] += ids
            entry['tgtC_sent_lens'].append(len(entry['tgtC']))
            entry['token_labels'] = \
                [1 if t in tgt_tokens else 0 for t in entry['src']]
            sum_ = sum(entry['token_labels'])
            self.ext_lbl_freq[0] += len(entry['token_labels']) - sum_
            self.ext_lbl_freq[1] += sum_
            entry['doc_label'] = 0 if rel == 0 else 1
            rtn.append(entry)
        return rtn

    def build_trec_exs(self, topics, docs):
        """For each topic and doc pair, encode them, and construct example list
        """
        exs = list()
        # Tokenize document using Stanford CoreNLP Tokenizer
        logger.debug(
            'Tokenizing %s documents using Stanford CoreNLP '
            'Tokenizer...', len(docs))
        self.tokenize(docs)

        # Add positive examples
        for qid in topics:
            for did, rel in topics[qid]['docs']:
                if did not in docs or \
                        len(docs[did]['text']) < self.min_src_nsents:
                    continue
                d = docs[did]
                # Complete keywords: doc_keywords > doc_mesh > q_mesh
                keywords = d['keywords'] if len(d['keywords']) > 0 \
                    else d['mesh_names']
                if len(keywords) == 0 and rel > 0:
                    keywords = [topics[qid]['mesh'][1]]

                exs.append(
                    (qid, did, rel, d['text'][:self.max_src_nsents],
                     topics[qid]['fields'], topics[qid]['mesh'], keywords))
                self.doc_lbl_freq[int(rel > 0)] += 1

        # Add negative examples
        neg_docs_ids = [did for did, d in docs.items() if not d['pos']]
        qids = random.choices(list(topics.keys()), k=len(neg_docs_ids))
        for i, did in enumerate(neg_docs_ids):
            exs.append(
                (qids[i], did, 0, docs[did]['text'][:self.max_src_nsents],
                 topics[qids[i]]['fields'], topics[qids[i]]['mesh'], []))
            self.doc_lbl_freq[0] += 1
        random.shuffle(exs)
        rtn = self.encode(exs)

        return rtn

    # todo. Following function will be changed
    def build(self, examples, docs):
        """Bulding examples is done in two modes: one for data preparation and
        the other for prediction.

        In data preparation,
        - `exs` are quries in TREC ref datasets
        - `docs` consists of pos and neg documents prepared by `read_pubmed_docs`

        In prediction,
        - `exs` only contains one query with no labels
        - `docs` the retrieved documents from Solr search results

        """
        # Tokenize documents and build examples with doc_labels
        exs = []
        # Title and Text are multivalued ('text_general' in Solr)
        results = docs
        docs = {}
        for r in results:
            title = ' '.join(r['ArticleTitle'] if 'ArticleTitle' in r else [])
            body = ' '.join(r['AbstractText'] if 'AbstractText' in r else [])
            docs[r['id']] = (title + ' ' + body).strip()
        logger.debug(f'Tokenizing {len(docs)} retrieved docs...')
        pos_docs = self.tokenize(docs)

        # Build examples (with dummy label -1)
        qid = list(examples.keys())[0]  # There's only one anyways
        logger.info(f'Preparing examples for {qid}...')
        for did, text in pos_docs.items():
            if len(pos_docs[did]) < self.min_src_nsents:
                continue
            exs.append((qid, did, -1, pos_docs[did][:self.max_src_nsents],
                        examples[qid]['topics']))

        data = self.encode(exs)
        return data
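The expected shapes of `topics` and `docs` can be read off `build_trec_exs` and `encode`. A hypothetical call is sketched below; the query, the documents, and the WBMET embedding path are invented placeholders, and a running Stanford CoreNLP server is assumed for the tokenization step.

# Hypothetical inputs, shaped the way build_trec_exs reads them.
topics = {
    'q1': {
        'docs': [('d1', 1), ('d2', 0)],                # (doc id, relevance)
        'fields': ['melanoma', 'braf v600e', 'male'],  # disease / gene / demographic
        'mesh': ['d008545', 'melanoma'],               # (mesh id, mesh name)
    }
}
docs = {
    'd1': {'title': 'A melanoma case report',
           'body': 'Patient with BRAF V600E mutation.',
           'keywords': ['melanoma'], 'mesh_names': ['melanoma'], 'pos': True},
    'd2': {'title': 'Unrelated report',
           'body': 'No relevant findings were observed.',
           'keywords': [], 'mesh_names': [], 'pos': False},
}

builder = ExsBuilder(bert_model='bert-base-uncased', file_emb='wbmet_embeddings.vec')
entries = builder.build_trec_exs(topics, docs)  # list of encoded entry dicts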
Example #5
class TrecTopics(TextDirectoryCorpus):
    def __init__(self,
                 topics_path,
                 min_depth=0,
                 max_depth=None,
                 metadata=True,
                 lemmatization=True,
                 use_stop=True,
                 pattern=None,
                 exclude_pattern=None,
                 **kwargs):
        super(TrecTopics, self).__init__(topics_path,
                                         dictionary={},
                                         metadata=metadata,
                                         min_depth=min_depth,
                                         max_depth=max_depth,
                                         pattern=pattern,
                                         exclude_pattern=exclude_pattern,
                                         lines_are_documents=True,
                                         **kwargs)

        self.topics = {}
        self.topics_vecs = None
        self.topic_row_maps = {}
        self.oov = {}

        self.tokenizer = Tokenizer(minimum_len=TOKEN_MIN_LEN,
                                   maximum_len=TOKEN_MAX_LEN,
                                   lowercase=True,
                                   output_lemma=lemmatization,
                                   use_stopwords=use_stop,
                                   extra_stopwords=EXTRA_STOPWORDS)

    def get_texts(self):

        inside_top = False
        inside_desc = False
        inside_narr = False
        topic_no = None
        title = ""
        desc = ""
        narr = ""
        for line in self.getstream():
            if line.startswith("<top>"):
                inside_top = True
                continue

            if inside_desc:
                if line.startswith("<"):
                    inside_desc = False
                else:
                    # desc += line + linesep
                    desc += line + " "
                    continue

            if inside_narr:
                if line.startswith("<"):
                    inside_narr = False
                else:
                    # narr += line + linesep
                    narr += line + " "
                    continue

            if inside_top:
                if line.startswith("<num>"):
                    topic_no = line[line.find("Number:", len("<num>")) +
                                    len("Number:"):].strip()
                elif line.startswith("<title>"):
                    title = line[len("<title>"):].strip().replace("Topic:", "")
                elif line.startswith("<desc>"):
                    inside_desc = True
                elif line.startswith("<narr>"):
                    inside_narr = True
                elif line.startswith("</top>"):
                    inside_top = False
                    yield (int(topic_no),
                           self.tokenizer.tokenize(title),
                           self.tokenizer.tokenize(desc),
                           self.tokenizer.tokenize(narr))
                    title = ""
                    desc = ""
                    narr = ""

    def init(self):
        for topic_no, title, desc, narr in self.get_texts():
            self.topics[topic_no] = {
                "title": title,
                "desc": desc,
                "narr": narr
            }

    def get_title(self, topic_no):
        return self.topics[topic_no]["title"]

    def get_desc(self, topic_no):
        return self.topics[topic_no]["desc"]

    def get_narr(self, topic_no):
        return self.topics[topic_no]["narr"]

    def total_topics(self):
        return len(self.topics)

    def indexedize(self, vocab_dict, topic_no, term_list):
        index_list = []
        for term in term_list:
            if term in vocab_dict:
                index_list.append(vocab_dict[term].index)
            else:
                self.oov.setdefault(topic_no, set()).add(term)
        return index_list

    def vectorize(self,
                  vocab_dict,
                  include_title=True,
                  include_desc=False,
                  include_narr=False,
                  norm='l2'):
        assert include_title or include_desc or include_narr

        vector_length = len(vocab_dict)

        topic_row = [0]
        index_col = []
        freq_val = []

        for topic_no, title, desc, narr in self.get_texts():

            self.topic_row_maps.setdefault(topic_no, len(self.topic_row_maps))

            indexed_word_list = []
            if include_title:
                indexed_word_list.extend(
                    self.indexedize(vocab_dict, topic_no, title))
            if include_desc:
                indexed_word_list.extend(
                    self.indexedize(vocab_dict, topic_no, desc))
            if include_narr:
                indexed_word_list.extend(
                    self.indexedize(vocab_dict, topic_no, narr))

            for index in indexed_word_list:
                index_col.append(index)
                freq_val.append(1)
            topic_row.append(len(index_col))

        self.topics_vecs = normalize(
            csr_matrix((freq_val, index_col, topic_row),
                       dtype=int,
                       shape=(len(topic_row) - 1, vector_length)).toarray(),
            norm=norm,
            axis=1)

        # self.topics_vecs = csr_matrix((freq_val, index_col, topic_row), dtype=int,
        #                               shape=(len(topic_row)-1, vector_length)).toarray()

    def get_topic_vector(self, topic_no):
        assert 51 <= topic_no <= 450
        return self.topics_vecs[self.topic_row_maps[topic_no]]
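A minimal usage sketch, assuming a directory of TREC topic files and an older-style (pre-4.0) gensim `KeyedVectors` whose `vocab` values expose an `.index` attribute, which is what `indexedize` expects; all paths and the topic number are placeholders.

# Hypothetical driver; the topic directory and the vectors file are placeholders.
from gensim.models import KeyedVectors

topics = TrecTopics('data/trec/topics/')
topics.init()                                 # parse the <top> blocks
print(topics.total_topics(), topics.get_title(301))

wv = KeyedVectors.load('word2vec.kv')
topics.vectorize(wv.vocab,                    # pre-4.0 gensim: entries carry .index
                 include_title=True,
                 include_desc=True)
print(topics.get_topic_vector(301))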