def reformat_socher_vectors():
    """
    Formats the files output by Socher (2011)'s Matlab code into Byblo-compatible files.

    Before running this, a list of all phrases needs to be extracted from the labelled data, and these need to
    be composed with Socher's Matlab code. See the note "Socher vectors" in Evernote.
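
    Expects the module-level paths: `plaintext_socher_input_file` (one phrase per line, the input given to
    Socher's code), `socher_output_phrases_file` (the composition output, with *UNKNOWN* marking phrases that
    contain unknown words) and `socher_output_vectors_file` (a comma-separated matrix with one 100-dimensional
    row per phrase).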

    """
    logging.info('Reformatting events file %s ---> %s',
                 socher_output_vectors_file, socher_composed_vectors_file)

    # Socher's code removes all PoS tags, so we can't translate his output
    # back to a DocumentFeature. Let's read the input to his code instead and
    # get the corresponding output vectors
    # get a list of all phrases that we attempted to compose
    with open(plaintext_socher_input_file) as infile:
        composed_phrases = [DocumentFeature.from_string(line.strip()) for line in infile]

    # get a list of all phrases where composition worked (no unknown words)
    with open(socher_output_phrases_file) as infile:
        success = [i for i, line in enumerate(infile) if '*UNKNOWN*' not in line]
    # pick out just the phrases that composed successfully
    composed_phrases = itemgetter(*success)(composed_phrases)
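    # (note: itemgetter(*success) returns a single element rather than a tuple when `success`
    # contains exactly one index; the code below assumes there is more than one successful phrase)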

    # load all vectors, remove those containing unknown words
    mat = np.loadtxt(socher_output_vectors_file, delimiter=',')
    mat = mat[success, :]
    assert len(composed_phrases) == mat.shape[0]  # same number of rows

    # do the actual writing
    write_vectors_to_hdf(sp.coo_matrix(mat),
                         composed_phrases,
                         ['RAE-feat%d' % i for i in range(100)],  # Socher provides 100-dimensional vectors
                         socher_composed_vectors_file)
    def to_tsv(self, events_path, entries_path='', features_path='',
               entry_filter=lambda x: True, row_transform=lambda x: x,
               gzipped=False, enforce_word_entry_pos_format=True, dense_hd5=False):
        """
        Writes this thesaurus to a Byblo-compatible file like the one it was most likely read from. In the
        process, converts every entry to a DocumentFeature, so all entries must be parsable into one. May
        reorder the features of each entry.

        :param events_path: file to write to
        :param entry_filter: Called for every DocumentFeature that is an entry in this thesaurus. The vector will
         only be written if this callable returns True.
        :param row_transform: Callable; any transformation that might need to be applied to each entry before
         converting it to a DocumentFeature. This is needed because some entries (e.g. african/J:amod-HEAD:leader)
         are not directly convertible (it needs to be african/J_leader/N). Use this if the entries cannot otherwise
         be converted to a DocumentFeature, e.g. if the data isn't PoS tagged.
        :param dense_hd5: if True, convert to a pandas `DataFrame` and write to a compressed HDF file. This is about
         30% faster and produces roughly 30% smaller files than using `gzipped`, but is only suitable for matrices
         with a small number of columns; this method enforces a hard limit of 1000. Requires PyTables and HDF5.
        :return: the file name
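
        Example (a minimal usage sketch; assumes `vectors` is an instance of this class loaded elsewhere,
        e.g. via `from_tsv`):

        >>> vectors.to_tsv('out.events.filtered.strings', dense_hd5=True)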
        """
        if enforce_word_entry_pos_format:
            rows = {i: DocumentFeature.from_string(row_transform(feat)) for (feat, i) in self.name2row.items()}
        else:
            rows = {i: feat for (feat, i) in self.name2row.items()}

        if dense_hd5 and len(self.columns) <= 1000:
            write_vectors_to_hdf(self.matrix, self.row_names, self.columns, events_path)
        else:
            write_vectors_to_disk(coo_matrix(self.matrix), rows, self.columns, events_path,
                                  features_path=features_path, entries_path=entries_path,
                                  entry_filter=entry_filter, gzipped=gzipped)
        return events_path
def merge_vectors(composed_dir, unigrams, output, workers=4, chunk_size=10000):
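    """
    Reduces the unigram vectors to `DIMS` dimensions with truncated SVD, then projects the composed phrase
    vectors found in `composed_dir` (read in chunks of `chunk_size` files) into the same space and writes
    both to HDF files whose names are prefixed with `output`.
    """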
    # this particular dataset uses spaces instead of underscores; set this explicitly to avoid parsing issues
    DocumentFeature.ngram_separator = " "
    DIMS = 100  # SVD dimensionality

    files = glob(os.path.join(composed_dir, "*apt.vec.gz"))
    logging.info("Found %d composed phrase files", len(files))

    # ignore anything that isn't a unigram, it will cause problems later
    unigrams = Vectors.from_tsv(unigrams, row_filter=lambda x, y: y.type == "1-GRAM")
    logging.info("Found %d unigram vectors", len(unigrams))

    mat, cols, rows = unigrams.to_sparse_matrix()
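    # give the vectorizer the unigram matrix's column order so that transform() further down maps
    # composed phrase vectors onto exactly the same feature columns as the unigram matrix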
    unigrams.v.vocabulary_ = {x: i for i, x in enumerate(list(cols))}
    cols = set(cols)
    svd = TruncatedSVD(DIMS, random_state=0)
    logging.info("Reducing dimensionality of matrix of shape %r...", mat.shape)
    start = time.time()
    reduced_mat = svd.fit_transform(mat)
    logging.info("Reduced using %s from shape %r to shape %r in %s seconds",
                 svd, mat.shape, reduced_mat.shape, time.time() - start)
    write_vectors_to_hdf(
        reduced_mat,
        rows,
        ["SVD:feat{0:03d}".format(i) for i in range(reduced_mat.shape[1])],
        "%s-unigrams-SVD%d" % (output, DIMS),
    )
    del mat
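    # note: `grouper` presumably pads the final chunk with None up to chunk_size, hence the
    # `if f` filter when dispatching work in the loop below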

    for i, chunk in enumerate(grouper(chunk_size, files)):
        d = {}
        logging.info("Reading composed vectors, chunk %d...", i)
        for phrase, features in Parallel(n_jobs=workers)(delayed(_read_vector)(f) for f in chunk if f):
            if features:
                d[phrase] = features

        logging.info("Found %d non-empty composed vectors in this chunk, running SVD now...", len(d))
        if not d:
            continue

        composed_vec = Vectors(d, column_filter=lambda foo: foo in cols)
        # vectorize second matrix with the vocabulary (columns) of the first thesaurus to ensure shapes match
        # "project" composed matrix into space of unigram thesaurus
        extra_matrix = unigrams.v.transform([dict(fv) for fv in composed_vec.values()])
        assert extra_matrix.shape == (len(composed_vec), len(cols))
        logging.info("Composed matrix is of shape %r before SVD", extra_matrix.shape)

        extra_matrix = svd.transform(extra_matrix)
        write_vectors_to_hdf(
            extra_matrix,
            list(composed_vec.keys()),
            ["SVD:feat{0:03d}".format(i) for i in range(extra_matrix.shape[1])],
            "%s-phrases-chunk%d-SVD%d" % (output, i, DIMS),
        )
        del composed_vec
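
# Minimal usage sketch (hypothetical paths; assumes the composed *apt.vec.gz files were produced by an
# upstream composition step):
# merge_vectors('composed_vectors/', 'unigrams.events.filtered.strings', 'output/wiki', workers=8)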
def _write_to_disk(reduced_mat, prefix, rows, use_hdf=True):
    events_file = prefix + '.events.filtered.strings'
    if use_hdf:
        write_vectors_to_hdf(reduced_mat, rows,
                             ['SVD:feat{0:03d}'.format(i) for i in range(reduced_mat.shape[1])],
                             events_file)
    else:
        write_vectors_to_disk(reduced_mat, rows,
                              ['SVD:feat{0:03d}'.format(i) for i in range(reduced_mat.shape[1])],
                              events_file)
def run_glove():
    logging.info('Starting training')
    with temp_chdir(args.glove_dir):
        run_and_log_output('sh {} {}'.format(glove_script, unlabelled_data))

    # convert their format to ours
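    # (the raw GloVe output is plain text: one token per line followed by its space-separated dimension
    # values, with no header row, hence sep=' ', index_col=0, header=None below)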
    df = pd.read_csv(raw_vectors_file, sep=' ', index_col=0, header=None)
    logging.info('Done training, filtering junk and converting %d vectors to Byblo-compatible format', len(df))
    # remove any junk-looking tokens, they'll get in the way later
    mask = [DocumentFeature.from_string(x).type != 'EMPTY' and 3 < len(x) < 20 for x in df.index]
    logging.info('Keeping %d entries', sum(mask))
    logging.info('Shape of vectors before filtering %r', df.shape)
    df = df[mask]
    logging.info('Shape of vectors after filtering %r', df.shape)
    cols = ['f%d' % i for i in range(df.shape[1])]
    mkdirs_if_not_exists(output_dir)
    write_vectors_to_hdf(df.values, df.index, cols, formatted_vectors_file)
def compose_and_write_vectors(unigram_vectors, short_vector_dataset_name, composer_classes, remove_pos=False,
                              pretrained_Baroni_composer_file=None, pretrained_Guevara_composer_file=None,
                              pretrained_Gref_composer_file=None, categorical_vector_matrix_file=None,
                              output_dir='.', gzipped=True, dense_hd5=False,
                              row_filter=default_row_filter):
    """
    Extracts all composable features from a labelled classification corpus and dumps a composed vector for each of
    them to disk. The output file will also contain all unigram vectors that were passed in (and only the unigrams).
    :param unigram_vectors: a file in Byblo events format that contains vectors for all unigrams, OR
    a Vectors object. This will be used in the composition process.
    :type unigram_vectors: str or Vectors
    :param pretrained_Baroni_composer_file: path to pre-trained Baroni AN/NN composer file
    :param output_dir: directory to write the composed vectors to
    :param composer_classes: what composers to use
    :type composer_classes: list
    """

    phrases_to_compose = get_all_document_features(remove_pos=remove_pos)
    # if this isn't a Vectors object assume it's the name of a file containing vectors and load them
    if not isinstance(unigram_vectors, Vectors):
        # ensure there are only unigrams in the set of unigram vectors;
        # composers do not need any n-gram vectors contained in this file, and they may well be
        # observed ones
        unigram_vectors = Vectors.from_tsv(unigram_vectors,
                                           row_filter=row_filter)
        logging.info('Starting composition with %d unigram vectors', len(unigram_vectors))

    # doing this loop in parallel isn't worth it as pickling or shelving `vectors` is so slow
    # it negates any gains from using multiple cores
    for composer_class in composer_classes:
        if composer_class == BaroniComposer:
            assert pretrained_Baroni_composer_file is not None
            composer = BaroniComposer(unigram_vectors, pretrained_Baroni_composer_file)
        elif composer_class == GuevaraComposer:
            assert pretrained_Guevara_composer_file is not None
            composer = GuevaraComposer(unigram_vectors, pretrained_Guevara_composer_file)
        elif composer_class == GrefenstetteMultistepComposer:
            assert pretrained_Gref_composer_file is not None
            composer = GrefenstetteMultistepComposer(unigram_vectors, pretrained_Gref_composer_file)
        elif composer_class in [CopyObject, FrobeniusAdd, FrobeniusMult]:
            composer = composer_class(categorical_vector_matrix_file, unigram_vectors)
        else:
            composer = composer_class(unigram_vectors)

        try:
            # compose_all returns all unigrams and composed phrases
            mat, cols, rows = composer.compose_all(phrases_to_compose)

            events_path = os.path.join(output_dir,
                                       'composed_%s_%s.events.filtered.strings' % (short_vector_dataset_name,
                                                                                   composer.name))
            if dense_hd5:
                write_vectors_to_hdf(mat, rows, cols, events_path)
            else:
                rows2idx = {i: DocumentFeature.from_string(x) for (x, i) in rows.items()}
                write_vectors_to_disk(mat.tocoo(), rows2idx, cols, events_path,
                                      entry_filter=lambda x: x.type in {'AN', 'NN', 'VO', 'SVO', '1-GRAM'},
                                      gzipped=gzipped)
        except ValueError as e:
            logging.error('Composition failed for composer %s', composer.name)
            logging.error(e)
            continue
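
# Minimal usage sketch (hypothetical file names; `SomeComposer` stands in for any of the composer classes
# this function accepts):
# compose_and_write_vectors('unigrams.events.filtered.strings', 'wiki',
#                           [SomeComposer], output_dir='composed_vectors/')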