def run_glove():
    """Train GloVe on the reformatted corpus, then filter the resulting vectors and convert them."""
    logging.info('Starting GloVe training')
    with temp_chdir(args.glove_dir):
        run_and_log_output('sh {} {}'.format(glove_script, unlabelled_data))

    # convert GloVe's plain-text vectors (one "<token> v1 ... vN" line per word) to our Byblo-compatible format
    df = pd.read_csv(raw_vectors_file, sep=' ', index_col=0, header=None)
    logging.info('Done training; filtering junk and converting %d vectors to Byblo-compatible format', len(df))
    # drop malformed tokens and tokens of implausible length; they would cause trouble downstream
    mask = [DocumentFeature.from_string(x).type != 'EMPTY' and 3 < len(x) < 20 for x in df.index]
    logging.info('Keeping %d entries', sum(mask))
    logging.info('Shape of vectors before filtering %r', df.shape)
    df = df[mask]
    logging.info('Shape of vectors after filtering %r', df.shape)
    cols = ['f%d' % i for i in range(df.shape[1])]
    mkdirs_if_not_exists(output_dir)
    write_vectors_to_hdf(df.values, df.index, cols, formatted_vectors_file)
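

# write_vectors_to_hdf is a helper defined elsewhere in this codebase. The sketch below shows
# the assumed behaviour (rebuild a labelled frame and persist it via pandas' HDF5 support);
# the _sketch name and the HDF key are illustrative, not the project's actual implementation.
def _write_vectors_to_hdf_sketch(matrix, row_index, column_names, path):
    import pandas as pd
    frame = pd.DataFrame(matrix, index=row_index, columns=column_names)
    frame.to_hdf(path, key='matrix', mode='w')  # requires the pytables package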


def reformat_data():
    # GloVe expects the entire training corpus on a single line
    logging.info('Starting corpus reformat')
    run_and_log_output('cat {}/* > tmp'.format(pos_only_data_dir))
    run_and_log_output('tr "\\n" " " < tmp > {}'.format(unlabelled_data))
    run_and_log_output('rm -f tmp')
    logging.info('Done with reformat')
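

# The shell pipeline above (cat + tr) amounts to the following pure-Python version; this is an
# illustrative sketch reusing the same module-level settings, not a drop-in replacement.
def _reformat_data_pure_python():
    import glob
    with open(unlabelled_data, 'w') as outfile:
        for path in sorted(glob.glob('{}/*'.format(pos_only_data_dir))):
            with open(path) as infile:
                # replace newlines with spaces so the whole corpus ends up on one line
                outfile.write(infile.read().replace('\n', ' '))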


def run_socher_code():
    # symlink the input file Socher's code expects to the list of phrases I'm interested in
    force_symlink(phrases_to_compose, socher_input_file)
    with temp_chdir(socher_base_dir):
        run_and_log_output('./phrase2Vector.sh')  # this takes a while
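

# force_symlink and temp_chdir are small helpers defined elsewhere in this codebase; the
# sketches below show the assumed behaviour. The _sketch names are illustrative only.
import os
from contextlib import contextmanager


def _force_symlink_sketch(source, link_name):
    # recreate the symlink even if a stale file or link already exists at link_name
    if os.path.lexists(link_name):
        os.remove(link_name)
    os.symlink(source, link_name)


@contextmanager
def _temp_chdir_sketch(path):
    # run the body of the with-block inside `path`, then restore the previous working directory
    old_dir = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(old_dir)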