def write_clean_turian_unigrams():
    """
    Extracts unigram embeddings from Socher's binary distribution. These can be used by other composers.

    There are only 50k embeddings (presumably for the most frequent tokens in the corpus). The words have not
    been processed- there are punctuation-only tokens, uppercased words and non-lemmatized words. There isn't
    any PoS tag filtering either- words like "to", "while" and "there".

    I remove punctuation, then lowercase and lemmatize each entry. Multiple entries may map to the
    same canonical form. I select the shortest original entry (ties are broken by giving preference to
    words that are already lowercased). This could have been done better.
    Only vectors for the selected entries are kept. There's 33k canonical
    forms left, many of which are not nouns/adjs/verbs.

    We don't have a PoS tag for the canonical forms. I get around the problem by creating 3 copies of each
    canonical form and expand "cat" to cat/N, cat/J and cat/V, which all share the same vector.
    """
    logging.info('Writing Turian unigrams to %s', turian_unigram_vectors_file)
    mat = loadmat(socher_unigram_embedding_matlab)
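    # 'words' holds the vocabulary and 'We' the embedding matrix with one column per word,
    # so transpose it so that each row of the DataFrame is a word vector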
    words = [w[0] for w in mat['words'].ravel()]
    df = pd.DataFrame(mat['We'].T, index=words)

    lmtzr = WordNetLemmatizer()
    clean_to_dirty = defaultdict(list)  # canonical -> [non-canonical]
    dirty_to_clean = dict()  # non-canonical -> canonical
    to_keep = set()  # which non-canonical forms we will keep
    # TODO the representative form could be chosen based on corpus frequency instead

    for w in words:
        if set(w) & set(string.punctuation + string.digits):
            # not a real word: contains digits or punctuation
            continue

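        # collapse case and inflection so that e.g. "Cats", "cats" and "cat" all map to the same lemma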
        lemma = lmtzr.lemmatize(w.lower())
        clean_to_dirty[lemma].append(w)
        dirty_to_clean[w] = lemma

    # decide which of possibly many non-canonical forms with the same lemma to keep
    # prefer shorter and lowercased non-canonical forms
    for lemma, dirty_list in clean_to_dirty.items():
        # min() also handles the single-candidate case
        best_dirty = min(dirty_list, key=lambda w: (len(w), not w.islower()))
        to_keep.add(best_dirty)

    # keep only the selected non-canonical forms
    ddf = df[df.index.isin(to_keep)]
    # canonicalize whatever is left
    ddf.index = [lmtzr.lemmatize(w.lower()) for w in ddf.index]

    # we don't know the PoS tags of the canonical forms, so create a copy for each tag:
    # "cat" becomes cat/N, cat/J and cat/V, all sharing the same vector
    new_index = ['%s/%s' % (w, pos) for pos in 'NJV' for w in ddf.index]
    new_data = np.vstack([ddf.values] * 3)
    ddf = pd.DataFrame(new_data, index=new_index)
    dv = DenseVectors(ddf, allow_lexical_overlap=True)
    dv.to_tsv(turian_unigram_vectors_file)
    logging.info('Done')


def generate(output, dim):
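    """
    Generates random vectors of the given dimensionality for every document feature
    (including unigrams) and writes them to `output`. The fixed seed keeps the output
    reproducible across runs.
    """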
    np.random.seed(0)
    feats = ['rand%d' % i for i in range(dim)]
    phrases = list(get_all_document_features(include_unigrams=True))
    vectors = np.random.random((len(phrases), dim))

    v = DenseVectors(pd.DataFrame(vectors, index=phrases, columns=feats))
    v.to_tsv(output, dense_hd5=True)


def test_distributional_with_vector_clusters(conf, tmpdir):
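    """
    Builds random vectors for the features of the training data, clusters them, then
    checks that the experiment pipeline runs with a KmeansVectorizer (document features
    become cluster ids) at every debug level.
    """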
    # generate random vectors for the appropriate features and cluster them first
    x_tr, _, _, _ = get_tokenized_data(conf['training_data'], conf['tokenizer'])
    feats = FeatureExtractor().extract_features_from_tree_list([foo[0] for foo in x_tr])
    vectors = np.random.random((len(feats), 10))
    v = DenseVectors(pd.DataFrame(vectors, index=feats))
    tmpfile = str(tmpdir.join('tmp_random_vectors'))
    v.to_tsv(tmpfile, dense_hd5=True)

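    # group the random vectors into a handful of clusters; the pipeline will later
    # represent documents by cluster ids rather than by the original features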
    tmpclusters = str(tmpdir.join('tmp_random_clusters'))
    cluster_vectors(tmpfile, tmpclusters, n_clusters=5, n_jobs=1)

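    # rewire the config to use the clustered vectors and the k-means-based vectorizer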
    conf['vector_sources']['neighbours_file'] = []
    conf['vectorizer']['class'] = 'eval.pipeline.multivectors.KmeansVectorizer'
    conf['vector_sources']['clusters_file'] = tmpclusters
    # the features of the document are cluster ids, not phrases
    # no point in checking if they are in the thesaurus
    conf['feature_selection']['must_be_in_thesaurus'] = False

    for debug_level in [0, 1, 2]:
        conf['debug_level'] = debug_level
        run_experiment(conf)