예제 #1
0
def build_word_dict(args, examples, fields, dict_size=None,
                    no_special_token=False):
    """Return a dictionary from question and document words in
    provided examples.
    """
    word_dict = Vocabulary(no_special_token)
    for w in load_words(args, examples, fields, dict_size):
        word_dict.add(w)
    return word_dict
예제 #2
0
def build_word_dict(args, examples, fields, dict_size=None,
                    special_token="pad_unk", attrname="tokens"):
    """Return a dictionary from question and document words in
    provided examples.
    """
    word_dict = Vocabulary(no_special_token)
    for w in load_words(args, examples, fields, dict_size, \
                       num_spec_tokens=len(special_tokens.split("_")),\
                       attrname=attrname):
        word_dict.add(w)
    return word_dict
예제 #3
0
 def _insert(iterable):
     words = []
     for w in iterable:
         w = Vocabulary.normalize(w)
         if valid_words and w not in valid_words:
             continue
         words.append(w)
     word_count.update(words)
예제 #4
0
def top_summary_words(args, examples, word_dict):
    """Count and return the most common question words in provided examples."""
    word_count = Counter()
    for ex in examples:
        for w in ex['summary'].tokens:
            w = Vocabulary.normalize(w)
            if w in word_dict:
                word_count.update([w])
    return word_count.most_common(args.tune_partial)
예제 #5
0
def index_embedding_words(embedding_file):
    """Put all the words in embedding_file into a set."""
    words = set()
    with open(embedding_file) as f:
        for line in tqdm(f, total=count_file_lines(embedding_file)):
            w = Vocabulary.normalize(line.rstrip().split(' ')[0])
            words.add(w)

    words.update([BOS_WORD, EOS_WORD, PAD_WORD, UNK_WORD])
    return words
예제 #6
0
 def form_src_vocab(self) -> None:
     self.src_vocab = Vocabulary()
     assert self.src_vocab.remove(BOS_WORD)
     assert self.src_vocab.remove(EOS_WORD)
     self.src_vocab.add_tokens(self.tokens)
예제 #7
0
class Code(object):
    """
    Code containing annotated text, original text, selection label and
    all the extractive spans that can be an answer for the associated question.
    """
    def __init__(self, _id=None):
        self._id = _id
        self._language = None
        self._text = None
        self._tokens = []
        self._type = []
        self._mask = []
        self.src_vocab = None  # required for Copy Attention

    @property
    def id(self) -> str:
        return self._id

    @property
    def language(self) -> str:
        return self._language

    @language.setter
    def language(self, param: str) -> None:
        self._language = param

    @property
    def text(self) -> str:
        return self._text

    @text.setter
    def text(self, param: str) -> None:
        self._text = param

    @property
    def type(self) -> list:
        return self._type

    @type.setter
    def type(self, param: list) -> None:
        assert isinstance(param, list)
        self._type = param

    @property
    def mask(self) -> list:
        return self._mask

    @mask.setter
    def mask(self, param: list) -> None:
        assert isinstance(param, list)
        self._mask = param

    @property
    def tokens(self) -> list:
        return self._tokens

    @tokens.setter
    def tokens(self, param: list) -> None:
        assert isinstance(param, list)
        self._tokens = param
        self.form_src_vocab()

    def form_src_vocab(self) -> None:
        self.src_vocab = Vocabulary()
        assert self.src_vocab.remove(BOS_WORD)
        assert self.src_vocab.remove(EOS_WORD)
        self.src_vocab.add_tokens(self.tokens)

    def vectorize(self, word_dict, _type='word') -> list:
        if _type == 'word':
            return [word_dict[w] for w in self.tokens]
        elif _type == 'char':
            return [
                word_dict.word_to_char_ids(w).tolist() for w in self.tokens
            ]
        else:
            assert False
예제 #8
0
 def _insert(iterable):
     words = []
     for w in iterable:
         w = Vocabulary.normalize(w)
         words.append(w)
     word_count.update(words)