def run_glove():
    """Train GloVe on the reformatted corpus and convert its output to
    a Byblo-compatible HDF file of filtered word vectors."""
    logging.info('Starting training')
    # the GloVe driver script must be run from its own directory
    with temp_chdir(args.glove_dir):
        run_and_log_output('sh {} {}'.format(glove_script, unlabelled_data))

    # GloVe emits space-separated text with the token in the first column;
    # load it so tokens become the index
    vectors = pd.read_csv(raw_vectors_file, sep=' ', index_col=0, header=None)
    logging.info('Done training, filtering junk and converting %d vectors to Byblo-compatible format',
                 len(vectors))

    # drop malformed/junk tokens that would cause trouble downstream:
    # anything that parses as EMPTY, or has an implausible length
    keep = [DocumentFeature.from_string(token).type != 'EMPTY' and 3 < len(token) < 20
            for token in vectors.index]
    logging.info('Keeping %d entries', sum(keep))
    logging.info('Shape of vectors before filtering %r', vectors.shape)
    vectors = vectors[keep]
    logging.info('Shape of vectors after filtering %r', vectors.shape)

    # synthetic feature names, one per embedding dimension
    feature_names = ['f%d' % dim for dim in range(vectors.shape[1])]
    mkdirs_if_not_exists(output_dir)
    write_vectors_to_hdf(vectors.values, vectors.index, feature_names, formatted_vectors_file)
def reformat_data():
    """Flatten the corpus into a single line: GloVe requires the entire
    training corpus to sit on one row."""
    logging.info('Starting corpus reformat')
    # concatenate all corpus files, replace newlines with spaces, clean up
    commands = [
        'cat {}/* > tmp'.format(pos_only_data_dir),
        'tr "\\n" " " < tmp > {}'.format(unlabelled_data),
        'rm -f tmp',
    ]
    for command in commands:
        run_and_log_output(command)
    logging.info('Done with reformat')
def run_socher_code():
    """Run Socher et al.'s phrase composition pipeline over the phrases
    of interest."""
    # Socher's code reads its input from a fixed location; point that
    # location at the list of phrases we want composed
    force_symlink(phrases_to_compose, socher_input_file)
    composition_script = './phrase2Vector.sh'
    with temp_chdir(socher_base_dir):
        # long-running external step
        run_and_log_output(composition_script)