def load_data(args, filename, max_examples=-1, dataset_name='msmarco'):
    """Load examples from preprocessed file. One example per line, JSON encoded."""

    # Load JSON lines
    with open(filename) as f:
        data = [
            json.loads(line)
            for line in tqdm(f, total=count_file_lines(filename))
        ]

    examples = []
    # model_type is parsed but unused in this variant (the variant further below branches on it)
    model_type = args.model_type.upper()
    for example in tqdm(data):
        if dataset_name == 'msmarco':
            session_queries = []
            for query in example['query']:
                qObj = Query(query['id'])
                qObj.text = ' '.join(query['tokens'])
                qtokens = query['tokens']
                qtokens = [BOS_WORD] + qtokens + [EOS_WORD]

                if len(qtokens) == 0 or len(qtokens) > args.max_query_len:
                    continue

                qObj.tokens = qtokens

                # --- record the candidate documents
                candidates = []
                for candidate in query['candidates']:
                    document = Document(candidate['id'])
                    # TODO: what should we use for documents? title/content?
                    content_tokens = candidate['content'].split()
                    if len(content_tokens) == 0:
                        continue

                    content_tokens = content_tokens[:args.max_doc_len]
                    document.tokens = content_tokens
                    assert isinstance(candidate['label'], bool)
                    document.label = 1 if candidate['label'] else 0
                    candidates.append(document)

                if len(candidates) == args.num_candidates:
                    qObj.documents = candidates
                    session_queries.append(qObj)

            # sessions must contain at least 2 queries
            if len(session_queries) < 2:
                continue

            session = Session(example['session_id'])
            session.queries = session_queries
            examples.append(session)

        if max_examples != -1 and len(examples) >= max_examples:
            break

    return examples
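
# --- A minimal sketch of the container objects the loaders above and below
# fill in. The real Query, Document, and Session classes live elsewhere in
# the project; only the attributes and methods used by load_data() are shown
# here, and their implementations are assumptions inferred from that usage.
class Document:
    def __init__(self, doc_id):
        self.id = doc_id
        self.tokens = []        # whitespace-split content tokens
        self.label = 0          # 1 if the candidate is relevant, else 0


class Query:
    def __init__(self, query_id):
        self.id = query_id
        self.text = ''          # original query string
        self.tokens = []        # [BOS_WORD] + query tokens + [EOS_WORD]
        self.documents = []     # candidate Document objects


class Session:
    def __init__(self, session_id):
        self.id = session_id
        self.queries = []       # ordered Query objects in the session

    def add_query(self, query):
        self.queries.append(query)

    def add_one_query(self, queries):
        # ACG variant below: collapse several previous queries into a single
        # query whose tokens are the concatenation of theirs (assumed).
        merged = Query(queries[-1].id)
        merged.text = ' '.join(q.text for q in queries)
        merged.tokens = [t for q in queries for t in q.tokens]
        self.queries.append(merged)

    def __len__(self):
        return len(self.queries)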
def load_data(args, filename, max_examples=-1, dataset_name='msmarco'):
    """Load examples from preprocessed file. One example per line, JSON encoded."""

    # Load JSON lines
    with open(filename) as f:
        data = [
            json.loads(line)
            for line in tqdm(f, total=count_file_lines(filename))
        ]

    examples = []
    # arrange the data according to the model type
    model_type = args.model_type.upper()
    for example in tqdm(data):
        if dataset_name == 'msmarco':
            session_queries = []
            for query in example['query']:
                qObj = Query(query['id'])
                qObj.text = ' '.join(query['tokens'])
                qtokens = query['tokens']
                qtokens = [BOS_WORD] + qtokens + [EOS_WORD]

                if len(qtokens) == 0 or len(qtokens) > args.max_query_len:
                    continue

                qObj.tokens = qtokens
                session_queries.append(qObj)

            # sessions must contain at least 2 queries
            if len(session_queries) < 2:
                continue

            if model_type == 'SEQ2SEQ':
                # every session will contain only 2 queries
                for i in range(len(session_queries) - 1):
                    session = Session(example['session_id'] + str(i))
                    session.queries = session_queries[i:i + 2]
                    assert len(session) == 2
                    examples.append(session)
            elif model_type == 'ACG':
                # every session will contain only 2 queries, but the first is
                # the concatenation of all previous queries up to timestep i
                for i in range(len(session_queries) - 1):
                    session = Session(example['session_id'] + str(i))
                    session.add_one_query(session_queries[0:i + 1])
                    session.add_query(session_queries[i + 1])
                    assert len(session) == 2
                    examples.append(session)
            elif model_type == 'HREDQS':
                session = Session(example['session_id'])
                session.queries = session_queries
                examples.append(session)

        if max_examples != -1 and len(examples) >= max_examples:
            break

    return examples
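
# --- Quick illustration (not part of the original file) of how the SEQ2SEQ
# and ACG branches above slice a session of queries q1..q4 into two-query
# sessions. Both emit len(session_queries) - 1 pairs, but ACG's source side
# is the concatenation of every query seen so far.
queries = ['q1', 'q2', 'q3', 'q4']
seq2seq_pairs = [(queries[i], queries[i + 1])
                 for i in range(len(queries) - 1)]
acg_pairs = [(' '.join(queries[:i + 1]), queries[i + 1])
             for i in range(len(queries) - 1)]
print(seq2seq_pairs)  # [('q1', 'q2'), ('q2', 'q3'), ('q3', 'q4')]
print(acg_pairs)      # [('q1', 'q2'), ('q1 q2', 'q3'), ('q1 q2 q3', 'q4')]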
def index_embedding_words(embedding_file):
    """Put all the words in embedding_file into a set."""
    words = set()
    with open(embedding_file) as f:
        for line in tqdm(f, total=count_file_lines(embedding_file)):
            w = Vocabulary.normalize(line.rstrip().split(' ')[0])
            words.add(w)

    words.update([BOS_WORD, EOS_WORD, PAD_WORD, UNK_WORD])
    return words
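
# --- count_file_lines() is used throughout these loaders to give tqdm a
# total but is not defined in this snippet. A minimal sketch of what it
# presumably does (the real helper may read binary chunks for speed):
def count_file_lines(filename):
    """Return the number of lines in a text file."""
    with open(filename) as f:
        return sum(1 for _ in f)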
    def load_embeddings(self, words, embedding_file):
        """Load pretrained embeddings for a given list of words, if they exist.
        Args:
            words: iterable of tokens. Only those that are indexed in the
              dictionary are kept.
            embedding_file: path to text file of embeddings, space separated.
        """
        emb_layer = self.network.embedder.word_embeddings
        words = {w for w in words if w in self.src_dict}
        logger.info('Loading pre-trained embeddings for %d words from %s' %
                    (len(words), embedding_file))

        # When normalized, some words are duplicated. (Average the embeddings).
        vec_counts, embedding = {}, {}
        with open(embedding_file, encoding='utf8') as f:
            # Skip first line if of form count/dim.
            line = f.readline().rstrip().split(' ')
            if len(line) != 2:
                f.seek(0)

            duplicates = set()
            for line in tqdm(f, total=count_file_lines(embedding_file)):
                parsed = line.rstrip().split(' ')
                assert (len(parsed) == emb_layer.word_vec_size + 1)
                w = self.src_dict.normalize(parsed[0])
                if w in words:
                    vec = torch.Tensor([float(i) for i in parsed[1:]])
                    if w not in vec_counts:
                        vec_counts[w] = 1
                        embedding[w] = vec
                    else:
                        duplicates.add(w)
                        vec_counts[w] = vec_counts[w] + 1
                        embedding[w].add_(vec)

            if len(duplicates) > 0:
                logger.warning('Duplicate embedding found for %s' %
                               ', '.join(duplicates))

        for w, c in vec_counts.items():
            embedding[w].div_(c)

        emb_layer.init_word_vectors(self.src_dict, embedding,
                                    self.args.fix_embeddings)
        logger.info('Loaded %d embeddings (%.2f%%)' %
                    (len(vec_counts), 100 * len(vec_counts) / len(words)))
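
# --- Typical wiring of index_embedding_words() with load_embeddings() above
# (a usage sketch; the `model` object and `args.embedding_file` are assumed
# names, not taken from this file). Since load_embeddings() already filters
# to words present in self.src_dict, the full set of words appearing in the
# embedding file can be passed straight through:
valid_words = index_embedding_words(args.embedding_file)
model.load_embeddings(valid_words, args.embedding_file)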
def load_data(args,
              filename,
              max_examples=-1,
              dataset_name='msmarco'):
    """Load examples from preprocessed file. One example per line, JSON encoded."""

    # Load JSON lines
    with open(filename) as f:
        data = [json.loads(line) for line in
                tqdm(f, total=count_file_lines(filename))]

    examples = []
    for session in tqdm(data):
        if dataset_name == 'msmarco':
            for query in session['query']:
                qObj = Query(query['id'])
                qtokens = query['tokens']
                qtokens = [BOS_WORD] + qtokens + [EOS_WORD]

                if len(qtokens) == 0 or len(qtokens) > args.max_query_len:
                    continue
                if len(query['candidates']) != args.num_candidates:
                    continue

                if args.use_char_ngram > 0:
                    char_n_grams = []
                    offset = args.use_char_ngram
                    for i in range(len(qtokens)):
                        term = '#' + qtokens[i] + '#'
                        for j in range(0, len(term) - offset + 1):
                            char_n_grams.append(term[j:j + offset])
                    qtokens = char_n_grams

                qObj.tokens = qtokens
                candidates = []
                for candidate in query['candidates']:
                    document = Document(candidate['id'])
                    # TODO: what should we use for documents? title/content?
                    content_tokens = candidate['content'].split()
                    if len(content_tokens) == 0:
                        continue
                    content_tokens = content_tokens[:args.max_doc_len - 2]
                    content_tokens = [BOS_WORD] + content_tokens + [EOS_WORD]

                    if args.use_char_ngram > 0:
                        char_n_grams = []
                        offset = args.use_char_ngram
                        for i in range(len(content_tokens)):
                            term = '#' + content_tokens[i] + '#'
                            for j in range(0, len(term) - offset + 1):
                                char_n_grams.append(term[j:j + offset])
                        content_tokens = char_n_grams

                    document.tokens = content_tokens
                    assert isinstance(candidate['label'], bool)
                    document.label = 1 if candidate['label'] else 0
                    candidates.append(document)

                if len(candidates) == args.num_candidates:
                    qObj.documents = candidates
                    examples.append(qObj)

        if max_examples != -1 and len(examples) >= max_examples:
            break

    return examples
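
# --- The two inline blocks above build character n-grams identically; an
# equivalent standalone helper (a sketch, not part of the original file)
# makes the transformation easier to see:
def char_ngrams(tokens, n):
    """Wrap each token in '#' boundary markers and emit its character n-grams."""
    grams = []
    for token in tokens:
        term = '#' + token + '#'
        for j in range(len(term) - n + 1):
            grams.append(term[j:j + n])
    return grams

# e.g. char_ngrams(['cat'], 3) -> ['#ca', 'cat', 'at#']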