Example #1
    def set_embeddings(self):
        # Read word embeddings.
        if not self.opt.get('embedding_file'):
            logger.warning(
                '[ WARNING: No embeddings provided. Keeping random initialization. ]'
            )
            return
        logger.info('[ Loading pre-trained embeddings ]')
        embeddings = load_embeddings(self.opt, self.word_dict)
        logger.info('[ Num embeddings = %d ]' % embeddings.size(0))

        # Sanity check dimensions
        new_size = embeddings.size()
        old_size = self.network.embedding.weight.size()
        if new_size[1] != old_size[1]:
            raise RuntimeError('Embedding dimensions do not match.')
        if new_size[0] != old_size[0]:
            logger.warning(
                '[ WARNING: Number of embeddings changed (%d->%d) ]'
                % (old_size[0], new_size[0])
            )

        # Swap weights
        self.network.embedding.weight.data = embeddings

        # If partially tuning the embeddings, keep the old values
        if self.opt['tune_partial'] > 0:
            if self.opt['tune_partial'] + 2 < embeddings.size(0):
                fixed_embedding = embeddings[self.opt['tune_partial'] + 2 :]
                self.network.fixed_embedding = fixed_embedding
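The fixed_embedding buffer saved above only matters if something copies it back after optimizer updates. A minimal sketch of that companion step, patterned on DrQA-style models (the method name and its placement are assumptions, not taken from this file):

    def reset_parameters(self):
        # Hypothetical sketch: restore the rows beyond the tuned range to their
        # pre-trained values after each optimizer step, so only the first
        # tune_partial + 2 embedding rows are actually fine-tuned.
        if self.opt['tune_partial'] > 0:
            offset = self.opt['tune_partial'] + 2
            if offset < self.network.embedding.weight.data.size(0):
                self.network.embedding.weight.data[offset:] = self.network.fixed_embedding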
Example #2
def split_into_seen_unseen(dpath: str):
    """
    Following WoW, documents overlap across train, valid, and test_seen, but not with
    test_unseen. Do an 80:10:5:5 split between train, valid, test_seen, and
    test_unseen, or as close to it as possible.

    We need roughly 205 conversations for test_unseen; movies 1 and 3 have 90 and 117
    conversations, respectively, which is about that many.
    """
    random.seed(42)
    cdir = os.path.join(dpath, "conversations")
    new = {"train": {}, "valid": {}, "test_seen": {}, "test_unseen": {}}
    for fold in ["test", "valid", "train"]:
        with PathManager.open(os.path.join(cdir, f"{fold}_deduped.json")) as f:
            data = json.load(f)
        for k, v in data.items():
            if v["wikiDocumentIdx"] == 1 or v["wikiDocumentIdx"] == 3:
                new["test_unseen"][k] = v
            else:
                rand = random.randint(1, 95)
                if rand <= 80:
                    new["train"][k] = v
                elif rand <= 90:
                    new["valid"][k] = v
                else:
                    new["test_seen"][k] = v

    for fold in new:
        with PathManager.open(
                os.path.join(cdir, f"{fold}_split_seen_unseen.json"),
                "w+") as f:
            json.dump(new[fold], f, indent=2)
        c_cnt = len(new[fold])
        logger.info(f"Seen/unseen {fold} conversation count: {c_cnt}")
Example #3
def main():
    random.seed(42)

    # Get command line arguments
    parser = ParlaiParser(True, True)
    RemoteAgentAgent.add_cmdline_args(parser)
    opt = parser.parse_args()

    remote = RemoteAgentAgent(opt)
    if opt.get('task'):
        world = create_task(opt, [remote])
    else:
        if opt.get('model'):
            local = create_agent(opt)
        else:
            local = LocalHumanAgent(opt)
        # the remote-host goes **second**
        agents = [local, remote] if not opt['remote_host'] else [remote, local]
        world = DialogPartnerWorld(opt, agents)

    # Talk to the remote agent
    with world:
        while True:
            world.parley()
            logger.info(world.display())
Example #4
def truncate(data, row, col):
    global MAX_SZ
    if len(data) > MAX_SZ:
        over = len(data) - MAX_SZ
        pct = 100 * over / len(data)
        logger.info('Data size is too large for scipy to index all of it. '
                    'Throwing out {} entries ({:.2f}% of data).'.format(over, pct))
        data = data[:MAX_SZ]
        row = row[:MAX_SZ]
        col = col[:MAX_SZ]
    return data, row, col
Example #5
    def __init__(self, tfidf_path=None, strict=True):
        """
        Args:
            tfidf_path: path to saved model file
            strict: fail on empty queries or continue (and return empty result)
        """
        # Load from disk
        logger.info('Loading %s' % tfidf_path)
        matrix, metadata = utils.load_sparse_csr(tfidf_path)
        self.doc_mat = matrix
        self.ngrams = metadata['ngram']
        self.hash_size = metadata['hash_size']
        self.tokenizer = tokenizers.get_class(metadata['tokenizer'])()
        self.doc_freqs = metadata['doc_freqs'].squeeze()
        self.doc_dict = metadata.get('doc_dict', None)
        self.num_docs = self.doc_mat.shape[1] - 1
        self.strict = strict
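A query against a ranker initialized this way typically hashes the query n-grams into the same hash_size space, weights them with the stored doc_freqs, and ranks documents by a sparse dot product with doc_mat. A rough sketch patterned on the original DrQA TfidfDocRanker (the tokenizer's ngrams method and utils.hash are assumed helpers; np and sp are numpy and scipy.sparse, assumed imported at module level):

    def closest_docs(self, query, k=1):
        # Sketch: build a 1 x hash_size TF-IDF vector for the query and rank
        # documents by dot product with the document matrix.
        ngrams = self.tokenizer.tokenize(query).ngrams(n=self.ngrams, uncased=True)
        wids = [utils.hash(g, self.hash_size) for g in ngrams]
        if len(wids) == 0:
            if self.strict:
                raise RuntimeError('No valid words in query: %s' % query)
            return [], []
        wids_unique, wids_counts = np.unique(wids, return_counts=True)
        tfs = np.log1p(wids_counts)
        Ns = self.doc_freqs[wids_unique]
        idfs = np.log((self.num_docs - Ns + 0.5) / (Ns + 0.5))
        idfs[idfs < 0] = 0
        spvec = sp.csr_matrix(
            (tfs * idfs, wids_unique, [0, len(wids_unique)]),
            shape=(1, self.hash_size),
        )
        scores = spvec * self.doc_mat  # 1 x num_docs sparse row
        best = np.argsort(-scores.data)[:k]
        doc_scores = scores.data[best]
        # self.doc_dict, when present, would map these column indices back to ids.
        doc_ids = scores.indices[best]
        return doc_ids, doc_scores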
Example #6
def build_deduped_split(dpath: str):
    """
    The original CMU-DoG release has 110 conversation ids that appear in more than one of train/valid/test.

    Get rid of the duplication.
    """
    cdir = os.path.join(dpath, "conversations")
    data = {}
    for fold in ["test", "valid", "train"]:
        fpath = os.path.join(cdir, f"{fold}.json")
        with PathManager.open(fpath) as f:
            data[fold] = json.load(f)

    train_len = len(data["train"])
    valid_len = len(data["valid"])
    test_len = len(data["test"])
    logger.info(
        f"Converation count with duplicates: train-{train_len}, valid-{valid_len}, test-{test_len}"
    )

    train_valid = set(data["train"].keys()) & set(data["valid"].keys())
    train_test = set(data["train"].keys()) & set(data["test"].keys())
    valid_test = set(data["valid"].keys()) & set(data["test"].keys())

    for key in train_valid:
        data["train"].pop(key)
    for key in train_test:
        data["train"].pop(key)
    for key in valid_test:
        data["test"].pop(key)

    train_len = len(data["train"])
    valid_len = len(data["valid"])
    test_len = len(data["test"])
    logger.info(
        f"Converation count without duplicates: train-{train_len}, valid-{valid_len}, test-{test_len}"
    )

    for fold in ["test", "valid", "train"]:
        fpath = os.path.join(cdir, f"{fold}_deduped.json")
        with PathManager.open(fpath, "w+") as f:
            json.dump(data[fold], f, indent=2)
Example #7
def get_count_matrix(args, db_opts):
    """
    Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    global MAX_SZ
    with DocDB(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(args.num_workers,
                          initializer=init,
                          initargs=(tok_class, db_opts))

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) +
                    '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
            if len(data) > MAX_SZ:
                break
        if len(data) > MAX_SZ:
            logger.info('Reached max indexable size, breaking.')
            break
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    data, row, col = truncate(data, row, col)

    count_matrix = sp.csr_matrix((data, (row, col)),
                                 shape=(args.hash_size, len(doc_ids) + 1))
    count_matrix.sum_duplicates()
    return count_matrix
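The count callable mapped over each batch is not shown here. In DrQA-style builders it fetches a document's text, hashes its n-grams, and returns sparse (row, col, data) triples. A sketch along those lines (fetch_text and tokenize are assumed module-level helpers backed by the worker initializer init; Counter from collections and the utils module are also assumed available):

def count(ngram, hash_size, doc_id):
    # Sketch: hashed n-gram counts for a single document, returned in the
    # (row=word hash, col=doc index, data=count) format the caller extends.
    row, col, data = [], [], []
    tokens = tokenize(utils.normalize(fetch_text(doc_id)))
    grams = tokens.ngrams(n=ngram, uncased=True, filter_fn=utils.filter_ngram)
    counts = Counter(utils.hash(g, hash_size) for g in grams)
    row.extend(counts.keys())
    # Assumes doc ids are the integer column indices used by the caller.
    col.extend([doc_id] * len(counts))
    data.extend(counts.values())
    return row, col, data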
Example #8
def run(args):
    # ParlAI version of run method, modified slightly
    logger.info('Counting words...')
    count_matrix = get_count_matrix(args, {'db_path': args.db_path})

    logger.info('Making tfidf vectors...')
    tfidf = get_tfidf_matrix(count_matrix)

    logger.info('Getting word-doc frequencies...')
    freqs = get_doc_freqs(count_matrix)

    filename = args.out_dir

    logger.info('Saving to %s' % filename)
    metadata = {
        'doc_freqs': freqs,
        'tokenizer': args.tokenizer,
        'hash_size': args.hash_size,
        'ngram': args.ngram,
    }

    utils.save_sparse_csr(filename, tfidf, metadata)
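run relies on get_tfidf_matrix and get_doc_freqs, which are not reproduced above. In the DrQA-style pipeline these are typically small: a document-frequency vector from the binarized counts, and a log(1 + tf) term weighting multiplied by a BM25-flavoured idf. A sketch of both, assuming numpy and scipy.sparse are imported as np and sp:

def get_doc_freqs(cnts):
    # Number of documents each hashed word appears in.
    binary = (cnts > 0).astype(int)
    return np.array(binary.sum(1)).squeeze()


def get_tfidf_matrix(cnts):
    # TF-IDF weighting of the count matrix: log(1 + tf) * idf, with the idf
    # clipped at zero.
    Ns = get_doc_freqs(cnts)
    idfs = np.log((cnts.shape[1] - Ns + 0.5) / (Ns + 0.5))
    idfs[idfs < 0] = 0
    idfs = sp.diags(idfs, 0)
    tfs = cnts.log1p()
    return idfs.dot(tfs)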
Example #9
def get_count_matrix_t(args, db_opts):
    """
    Form a sparse word to document count matrix (inverted index, torch ver).

    M[i, j] = # times word i appears in document j.
    """
    global MAX_SZ
    with DocDB(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(args.num_workers,
                          initializer=init,
                          initargs=(tok_class, db_opts))

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) +
                    '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    count_matrix = torch.sparse.FloatTensor(
        torch.LongTensor([row, col]),
        torch.FloatTensor(data),
        torch.Size([args.hash_size, len(doc_ids) + 1]),
    ).coalesce()
    return count_matrix
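A possible follow-up for the torch variant, mirroring what get_doc_freqs does for the scipy path: binarize the coalesced counts and sum over the document dimension. This helper is an assumption, not part of the file:

def get_doc_freqs_t(count_matrix):
    # Number of documents each hashed word appears in, computed from the
    # coalesced torch sparse count matrix.
    binary = torch.sparse.FloatTensor(
        count_matrix._indices(),
        torch.ones(count_matrix._nnz()),
        count_matrix.size(),
    )
    return torch.sparse.sum(binary, dim=1).to_dense()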
Example #10
def store_contents(opt,
                   task,
                   save_path,
                   context_length=-1,
                   include_labels=True):
    """
    Preprocess and store a corpus of documents in sqlite.

    Args:
        opt: options used to construct the teacher for the given task.
        task: ParlAI task of text (and possibly values) to store.
        save_path: path to the output sqlite db.
        context_length: number of past utterances to keep as context for each
            stored document (-1 for unlimited).
        include_labels: whether to append chosen labels to the running context.
    """
    if os.path.isfile(save_path):
        raise RuntimeError('%s already exists! Not overwriting.' % save_path)

    logger.info('Reading into database...')
    conn = sqlite3.connect(save_path)
    c = conn.cursor()
    c.execute('CREATE TABLE documents (id INTEGER PRIMARY KEY, text, value);')
    if not task:
        logger.info('No data to initialize table: just creating table.')
        logger.info('Add more data by passing observations to the agent.')
        logger.info('Committing...')
        conn.commit()
        conn.close()
        return

    ordered_opt = opt.copy()
    dt = opt.get('datatype', '').split(':')
    ordered_opt['datatype'] = ':'.join([dt[0], 'ordered'] + dt[1:])
    ordered_opt['batchsize'] = 1
    ordered_opt['numthreads'] = 1
    ordered_opt['task'] = task
    teacher = create_task_agent_from_taskname(ordered_opt)[0]

    episode_done = False
    current = []
    triples = []
    context_length = context_length if context_length >= 0 else None
    context = deque(maxlen=context_length)
    with tqdm(total=teacher.num_episodes()) as pbar:
        while not teacher.epoch_done():
            # collect examples in episode
            while not episode_done:
                action = teacher.act()
                current.append(action)
                episode_done = action['episode_done']

            for ex in current:
                if 'text' in ex:
                    text = ex['text']
                    context.append(text)
                    if len(context) > 1:
                        text = '\n'.join(context)

                # add labels to context
                labels = ex.get('labels', ex.get('eval_labels'))
                label = None
                if labels is not None:
                    label = random.choice(labels)
                    if include_labels:
                        context.append(label)
                # use None for ID to auto-assign doc ids--we don't need to
                # ever reverse-lookup them
                triples.append((None, text, label))

            c.executemany('INSERT OR IGNORE INTO documents VALUES (?,?,?)',
                          triples)
            pbar.update()

            # reset flags and content
            episode_done = False
            triples.clear()
            current.clear()
            context.clear()

    logger.info('Read %d examples from %d episodes.' %
                (teacher.num_examples(), teacher.num_episodes()))
    logger.info('Committing...')
    conn.commit()
    conn.close()
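Once stored, the documents can be read back with plain sqlite. A hypothetical check of what store_contents wrote (the path is a placeholder):

conn = sqlite3.connect('/path/to/saved.db')  # placeholder path
c = conn.cursor()
c.execute('SELECT COUNT(*) FROM documents;')
print('stored %d documents' % c.fetchone()[0])
c.execute('SELECT text, value FROM documents LIMIT 1;')
print(c.fetchone())
conn.close()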
Example #11
def main():
    # Get command line arguments
    parser = ParlaiParser()
    parser.add_argument('-n', '--num-iters', default=10, type=int)
    parser.add_argument('-a', '--num-agents', default=1, type=int)
    opt = parser.parse_args()

    agents = []
    for _ in range(opt['num_agents']):
        agents.append(Agent(opt))

    opt['datatype'] = 'train'
    world_train = create_task(opt, agents)

    opt['datatype'] = 'valid'
    world_valid = create_task(opt, agents)

    start = time.time()
    # train / valid loop
    for _ in range(1):
        logger.info('[ training ]')
        for _ in range(opt['num_iters']):  # train for a bit
            world_train.parley()

        logger.info('[ training summary. ]')
        logger.info(world_train.report())

        logger.info('[ validating ]')
        for _ in range(1):  # check valid accuracy
            world_valid.parley()

        logger.info('[ validation summary. ]')
        logger.info(world_valid.report())

    logger.info('finished in {} s'.format(round(time.time() - start, 2)))
Example #12
    parser.add_argument(
        '--tokenizer',
        type=str,
        default='simple',
        help=("String option specifying tokenizer type to use "
              "(e.g. 'corenlp')"),
    )
    parser.add_argument(
        '--num-workers',
        type=int,
        default=None,
        help='Number of CPU processes (for tokenizing, etc)',
    )
    args = parser.parse_args()

    logger.info('Counting words...')
    count_matrix, doc_dict = get_count_matrix(args, {'db_path': args.db_path})

    logger.info('Making tfidf vectors...')
    tfidf = get_tfidf_matrix(count_matrix)

    logger.info('Getting word-doc frequencies...')
    freqs = get_doc_freqs(count_matrix)

    basename = os.path.splitext(os.path.basename(args.db_path))[0]
    basename += '-tfidf-ngram=%d-hash=%d-tokenizer=%s' % (
        args.ngram,
        args.hash_size,
        args.tokenizer,
    )
    filename = os.path.join(args.out_dir, basename)