# Assumed imports for the functions in this file: os, glob, codecs, pandas,
# and OptionParser are standard; fh (file handling), dirs, defines, ds,
# labels, and tokenizer are project-local helper modules.
import os
import glob
import codecs

import pandas as pd
from optparse import OptionParser


def preprocess_for_brown_clustering():
    input_filename = dirs.data_processed_text_file
    articles = fh.read_json(input_filename)
    keys = articles.keys()
    keys.sort()
    # items == keys here, so every document is written; kept as a hook for
    # restricting the output to a subset of documents
    items = keys
    print len(items)

    processed_dict = {}
    output_filename = fh.make_filename(dirs.data_processed_brown_dir, 'input', 'txt')
    with codecs.open(output_filename, 'w', encoding='utf-8') as output_file:
        for k in keys:
            text = articles[k]
            tokens = []
            sentences = text.split('\n')
            for s in sentences:
                sent_tokens = tokenizer.split_into_words(
                    s, reattach=False, split_off_quotes=False,
                    lemmatize=False, replace_numbers=True)
                tokens = tokens + sent_tokens
            if k in items:
                output_file.write(' '.join(tokens) + '\n')
            processed_dict[k] = tokens

    output_filename = fh.make_filename(dirs.data_processed_brown_dir, 'processed', 'json')
    fh.write_to_json(processed_dict, output_filename)
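# Usage note (hedged): 'input.txt' ends up with one whitespace-tokenized
# document per line, the format expected by standard Brown clustering tools
# (e.g. Percy Liang's brown-cluster; --text and --c are that tool's flags,
# and the cluster count is an illustrative choice):
#
#   ./wcluster --text <data_processed_brown_dir>/input.txt --c 1000
#
# A quick sanity check of the file written above:
def check_brown_input(path):
    with codecs.open(path, encoding='utf-8') as input_file:
        lines = input_file.readlines()
    print len(lines), 'documents,', sum(len(l.split()) for l in lines), 'tokens'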
def run_pipeline(skip_corenlp=True, corenlp_dir=None, overwrite=False,
                 extension='.xml', nice=False):
    output_dir = fh.makedirs(dirs.data_stanford_dir)
    temp_dir = fh.makedirs(dirs.data_raw_sentences_dir)
    xml_dir = fh.makedirs(output_dir, 'xml')

    # part 1: now done by the preprocessing tools
    """
    print "Splitting files"
    split_into_files(input_filename, temp_dir)
    """

    # part 2: run CoreNLP on any files that have not yet been processed
    if not skip_corenlp:
        filelist_filename = fh.make_filename(output_dir, 'filelist', 'txt')
        text_files = glob.glob(os.path.join(temp_dir, '*.txt'))
        text_files.sort()
        files = []
        if overwrite:
            print "Reprocessing all files"
            files = text_files
        else:
            for f in text_files:
                basename = os.path.basename(f)
                if not os.path.exists(os.path.join(xml_dir, basename + extension)):
                    files.append(f)
        print len(files), "files to process"
        with open(filelist_filename, 'w') as output_file:
            for f in files:
                output_file.write(f + '\n')
        if len(files) > 0:
            properties_file = os.path.join(os.getcwd(), 'core', 'external', 'CoreNLP.properties')
            print "Calling corenlp"
            call_corenlp(filelist_filename, xml_dir, corenlp_dir, properties_file, nice)

    # part 3: parse the CoreNLP XML output
    # (CoreNLP names each output file <input basename>.txt<extension>)
    print "Parsing xml"
    xml_filelist_filename = fh.make_filename(output_dir, 'xml_filelist', 'txt')
    files = glob.glob(os.path.join(xml_dir, '*.txt' + extension))
    with open(xml_filelist_filename, 'w') as output_file:
        for f in files:
            output_file.write(f + '\n')
    summary, dependencies = parse_xml_files(xml_filelist_filename, output_dir)

    # part 4: write one file per feature type
    print "Writing summary"
    #parsed_filename = fh.make_filename(output_dir, 'parsed', 'json')
    parse_summary_to_files(summary, dependencies, output_dir)
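# Hedged sketch of what call_corenlp (defined elsewhere in this repo) most
# likely does, assuming the standard Stanford CoreNLP command-line interface
# (-props, -filelist, and -outputDirectory are real CoreNLP flags); the
# memory setting and classpath layout are illustrative assumptions.
import subprocess

def call_corenlp_sketch(filelist, xml_dir, corenlp_dir, properties_file, nice=False):
    cmd = ['java', '-Xmx4g',
           '-cp', os.path.join(corenlp_dir, '*'),
           'edu.stanford.nlp.pipeline.StanfordCoreNLP',
           '-props', properties_file,
           '-filelist', filelist,
           '-outputDirectory', xml_dir]
    if nice:
        cmd = ['nice'] + cmd  # lower the priority of the CoreNLP job
    subprocess.check_call(cmd)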
def main():
    usage = "%prog exp_dir_test_fold_dir"
    parser = OptionParser(usage=usage)
    parser.add_option('-t', dest='test_fold', default=0,
                      help='Test fold; default=%default')
    (options, args) = parser.parse_args()
    test_fold = options.test_fold  # currently unused
    exp_dir = args[0]

    results = pd.DataFrame(columns=('masked', 'test', 'valid', 'dir'))
    run_dirs = glob.glob(os.path.join(exp_dir, 'bayes*reuse*'))
    for i, run_dir in enumerate(run_dirs):
        run_num = int(fh.get_basename_wo_ext(run_dir).split('_')[-1])
        if run_num <= 40 and '1_' not in fh.get_basename_wo_ext(run_dir):
            results_dir = os.path.join(run_dir, 'results')
            test_file = fh.make_filename(results_dir, 'test_macro_f1', 'csv')
            valid_file = fh.make_filename(results_dir, 'valid_cv_macro_f1', 'csv')
            masked_valid_file = fh.make_filename(results_dir, 'masked_valid_cv_macro_f1', 'csv')
            try:
                # header=0: the first row holds the column names (incl. 'overall');
                # the original header=False relied on False == 0
                test = pd.read_csv(test_file, header=0, index_col=0)
                valid = pd.read_csv(valid_file, header=0, index_col=0)
                masked_valid = pd.read_csv(masked_valid_file, header=0, index_col=0)
                #results.loc[run_num, 'iteration'] = run_num
                results.loc[i, 'masked'] = masked_valid['overall'].mean()
                results.loc[i, 'test'] = test['overall'].mean()
                results.loc[i, 'valid'] = valid['overall'].mean()
                results.loc[i, 'dir'] = fh.get_basename_wo_ext(run_dir)
            except Exception:
                # skip runs with missing or malformed results files
                continue

    results.to_csv(fh.make_filename(exp_dir, 'summary', 'csv'), columns=results.columns)

    sorted_results = results.sort_values('valid')
    print sorted_results
    print "best by masked"
    sorted_results = results.sort_values('masked')
    print sorted_results.values[-1, :]
    print "best by valid"
    sorted_results = results.sort_values('valid')
    print sorted_results.values[-1, :]
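# Example (hedged): this summarizer expects a layout like the one below,
# where each bayes*reuse* run directory holds F1 CSVs with an 'overall'
# column; the run-directory name is illustrative, and the script filename
# in the invocation is hypothetical.
#
#   exp_dir/
#     bayes_opt_LR_alphas_reuse_7/
#       results/
#         test_macro_f1.csv
#         valid_cv_macro_f1.csv
#         masked_valid_cv_macro_f1.csv
#
# Invocation:  python summarize_runs.py path/to/exp_dir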
def split_into_files(input_filename, output_dir):
    data = fh.read_json(input_filename)
    keys = data.keys()
    keys.sort()
    filelist = []
    for key in keys:
        key = key.rstrip('\n')
        line = data[key].rstrip('\n')
        normalized_filename = os.path.join(output_dir, key + '.txt')
        filelist.append(normalized_filename)
        with codecs.open(normalized_filename, 'w', encoding='utf-8') as output_file:
            output_file.write(line)
    filelist_filename = fh.make_filename(output_dir, 'filelist', 'txt')
    fh.write_list_to_text(filelist, filelist_filename)
    return filelist_filename
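# Usage sketch (hedged; the paths are illustrative): given a JSON file
# mapping document ids to raw text, write one .txt file per document plus
# a filelist that run_pipeline above can hand to CoreNLP.
#
#   filelist = split_into_files('data/processed/text.json',
#                               'data/raw/sentences')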
def preprocess_for_easysrl():
    input_filename = dirs.data_processed_text_file
    articles = fh.read_json(input_filename)
    labeled = list(ds.get_all_documents())
    labeled.sort()
    output_filename = fh.make_filename(dirs.data_easysrl_dir, 'input', 'txt')
    with codecs.open(output_filename, 'w', encoding='utf-8') as output_file:
        for k in labeled:
            # mark the start of each document, then write one sentence per line
            output_file.write(k + ' starts here\n')
            text = articles[k]
            paragraphs = text.split('\n\n')
            for p in paragraphs:
                sentences = tokenizer.split_sentences(p.strip())
                for s in sentences:
                    output_file.write(s.strip() + '\n')
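# The resulting EasySRL input interleaves document markers with one sentence
# per line, e.g. (the ids and text here are made up):
#
#   doc_0001 starts here
#   The bill passed the House .
#   It now moves to the Senate .
#   doc_0002 starts here
#   ...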
def main():
    exp_dir = defines.exp_dir
    exp_name = 'bayes_opt_LR_alphas_reuse'
    df = pd.DataFrame()
    basenames = ['test_acc.csv', 'test_micro_f1.csv', 'test_macro_f1.csv', 'test_pp.csv']
    rownames = ['model accuracy', 'model micro f1', 'model macro f1',
                'model percent perfect']
    for i, basename in enumerate(basenames):
        rowname = rownames[i]
        files = glob.glob(os.path.join(exp_dir, '*', 'test_fold_0', exp_name,
                                       'results', basename))
        gather_results(df, files, rowname)

    files = glob.glob(os.path.join(defines.data_raw_labels_dir, '*.csv'))
    for csv_file in files:
        dataset = fh.get_basename(csv_file)
        codes = labels.get_dataset_labels(dataset)
        if dataset in df.columns:
            df.loc['Number of responses', dataset] = codes.shape[0]
            df.loc['Number of labels', dataset] = codes.shape[1]

    # note: results are written to a hard-coded personal directory
    output_dir = '/Users/dcard/Dropbox/CMU/DAP/results/'
    output_filename = fh.make_filename(output_dir, exp_name, 'csv')
    df.to_csv(output_filename)
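# Hypothetical sketch of gather_results (defined elsewhere in the repo): the
# assumption here is that each results CSV has an 'overall' column and that
# the dataset name sits four directory levels above the results file. The
# internals are guesses for illustration, not the repo's actual API.
def gather_results_sketch(df, files, rowname):
    for results_file in files:
        # .../<dataset>/test_fold_0/<exp_name>/results/<basename>
        dataset = results_file.split(os.sep)[-5]
        values = pd.read_csv(results_file, header=0, index_col=0)
        df.loc[rowname, dataset] = values['overall'].mean()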
def parse_xml_files(xml_filelist_filename, output_dir):
    filelist = fh.read_text(xml_filelist_filename)
    parsed_files = {}
    sentiments = {}
    dependencies = {}
    dependency_tuples = {}
    entities = {}
    coref = {}
    coref_entities = {}
    coref_heads = {}
    all_groups = {}
    jk_grams = {}
    amalgram_pairs = {}

    for f in filelist:
        f = f.rstrip('\n')
        print f
        # peel off both .txt and .xml
        basename = fh.get_basename_wo_ext(fh.get_basename_wo_ext(f))
        sentences, doc_sentiments, doc_dependencies, doc_dependency_tuples, \
            doc_entities, doc_coref, groups, _, \
            doc_coref_entities, doc_coref_heads = parse_xml_output(f)
        parsed_files[basename] = sentences
        sentiments[basename] = doc_sentiments
        dependencies[basename] = doc_dependencies
        dependency_tuples[basename] = doc_dependency_tuples
        entities[basename] = doc_entities
        coref[basename] = doc_coref
        coref_entities[basename] = doc_coref_entities
        coref_heads[basename] = doc_coref_heads

        doc_jk_grams, doc_jk_indices = find_jk_grams(sentences)
        jk_grams[basename] = doc_jk_grams

        # save word/tag pairs for amalgram
        #amalgram_dir = os.path.join(dirs.data_amalgram_dir, 'input')
        #if not os.path.exists(amalgram_dir):
        #    os.makedirs(amalgram_dir)
        tagged_sents = [[(t['word'], t['POS']) for t in s] for s in sentences]
        amalgram_pairs[basename] = tagged_sents

        # uncomment for extracting story elements...
        parsed_dir = os.path.join(output_dir, 'parsed')
        if not os.path.exists(parsed_dir):
            os.makedirs(parsed_dir)
        parsed_filename = os.path.join(parsed_dir, basename + '.json')
        fh.write_to_json(sentences, parsed_filename, sort_keys=False)

    sentiment_filename = fh.make_filename(output_dir, 'sentiments', 'json')
    fh.write_to_json(sentiments, sentiment_filename, sort_keys=False)

    dependencies_filename = fh.make_filename(output_dir, 'dependency_tuple_ids', 'json')
    fh.write_to_json(dependency_tuples, dependencies_filename, sort_keys=False)

    coref_filename = fh.make_filename(output_dir, 'entities', 'json')
    fh.write_to_json(coref, coref_filename, sort_keys=False)

    jkgrams_filename = fh.make_filename(output_dir, 'jkgrams', 'json')
    fh.write_to_json(jk_grams, jkgrams_filename, sort_keys=False)

    coref_heads_filename = fh.make_filename(output_dir, 'coref_heads', 'json')
    fh.write_to_json(coref_heads, coref_heads_filename, sort_keys=False)

    # write all word/tag pairs to a single file for amalgram ...
    amalgram_keys = amalgram_pairs.keys()
    amalgram_keys.sort()
    amalgram_data_file = os.path.join(dirs.data_amalgram_dir, 'input.txt')
    with codecs.open(amalgram_data_file, 'w', encoding='utf-8') as output_file:
        for k in amalgram_keys:
            sents = amalgram_pairs[k]
            for s in sents:
                for p in s:
                    output_file.write(p[0] + '\t' + p[1] + '\n')
                output_file.write('\n')

    # ... and also one file per document
    for k in amalgram_keys:
        amalgram_data_file = os.path.join(dirs.data_amalgram_dir, k + '.txt')
        with codecs.open(amalgram_data_file, 'w', encoding='utf-8') as output_file:
            sents = amalgram_pairs[k]
            for s in sents:
                for p in s:
                    output_file.write(p[0] + '\t' + p[1] + '\n')
                output_file.write('\n')

    # index: one line per sentence giving the document it came from
    amalgram_index_file = os.path.join(dirs.data_amalgram_dir, 'index.txt')
    with codecs.open(amalgram_index_file, 'w', encoding='utf-8') as output_file:
        for k in amalgram_keys:
            sents = amalgram_pairs[k]
            for s in sents:
                output_file.write(k + '\n')

    #all_groups_filename = fh.make_filename(output_dir, 'all_groups', 'json')
    #fh.write_to_json(all_groups, all_groups_filename)

    return parsed_files, dependencies
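# Hypothetical sketch of find_jk_grams (defined elsewhere): 'jk' presumably
# refers to the Justeson & Katz (1995) noun-phrase filter, consistent with
# the [a*]n+[pn+]* TODO below. This simplified version keeps multi-word
# (adj|noun)* noun spans and is an illustration, not the repo's implementation
# (which also returns the matching indices).
import re

def find_jk_grams_sketch(sentences):
    grams = []
    for s in sentences:
        # map each POS tag to a letter: a = adjective, n = noun, x = other
        letters = ''.join('a' if t['POS'].startswith('JJ')
                          else 'n' if t['POS'].startswith('NN')
                          else 'x' for t in s)
        for m in re.finditer(r'[an]*n', letters):
            if m.end() - m.start() >= 2:  # keep multi-word candidates only
                grams.append(' '.join(t['word'] for t in s[m.start():m.end()]))
    return grams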
def parse_summary_to_files(parsed, dependencies, output_dir):
    words = {}
    lemmas = {}
    pos = {}
    ner = {}
    word_pos = {}
    lemma_pos = {}
    word_ner = {}
    lemma_ner = {}
    dependency_links = {}
    dependency_heads = {}
    dependency_tails = {}
    dependency_tuples = {}
    dependency_pairs = {}
    dicts = [words, lemmas, pos, ner, word_pos, lemma_pos, word_ner, lemma_ner]
    dicts2 = [dependency_links, dependency_heads, dependency_tails,
              dependency_tuples, dependency_pairs]

    last_ner_tag = None
    for key in parsed.keys():
        # TODO: Actually want [a*]n+[pn+]*
        for d in dicts:
            d[key] = []
        sentences = parsed[key]
        for s in sentences:
            for d in dicts:
                d[key].append([])
            for token in s:
                words[key][-1].append(token['word'])
                lemmas[key][-1].append(token['lemma'])
                pos[key][-1].append(token['POS'])
                word_pos[key][-1].append(token['word'] + '_' + token['POS'])
                lemma_pos[key][-1].append(token['lemma'] + '_' + token['POS'])
                if token['NER'] != 'O':
                    # if the tag matches the last tag, concatenate to the old entries
                    if token['NER'] == last_ner_tag and len(word_ner[key][-1]) > 0:
                        word_ner[key][-1][-1] = '_'.join(
                            word_ner[key][-1][-1].split('_')[:-1]
                            + [token['word'], token['NER']])
                        lemma_ner[key][-1][-1] = '_'.join(
                            lemma_ner[key][-1][-1].split('_')[:-1]
                            + [token['lemma'], token['NER']])
                    else:
                        ner[key][-1].append(token['NER'])
                        word_ner[key][-1].append(token['word'] + '_' + token['NER'])
                        lemma_ner[key][-1].append(token['lemma'] + '_' + token['NER'])
                    last_ner_tag = token['NER']
                else:
                    # reset so entities separated by non-entity tokens
                    # are not merged together
                    last_ner_tag = None

        # join the word and lemma lists into documents
        words[key] = '\n'.join([' '.join(sentence_tokens) for sentence_tokens in words[key]])
        lemmas[key] = '\n'.join([' '.join(sentence_tokens) for sentence_tokens in lemmas[key]])

    words_filename = fh.make_filename(output_dir, 'words', 'json')
    fh.write_to_json(words, words_filename, sort_keys=False)

    lemmas_filename = fh.make_filename(output_dir, 'lemmas', 'json')
    fh.write_to_json(lemmas, lemmas_filename, sort_keys=False)

    pos_filename = fh.make_filename(output_dir, 'pos', 'json')
    fh.write_to_json(pos, pos_filename, sort_keys=False)

    ner_filename = fh.make_filename(output_dir, 'ner', 'json')
    fh.write_to_json(ner, ner_filename, sort_keys=False)

    word_pos_filename = fh.make_filename(output_dir, 'word_pos', 'json')
    fh.write_to_json(word_pos, word_pos_filename, sort_keys=False)

    lemma_pos_filename = fh.make_filename(output_dir, 'lemma_pos', 'json')
    fh.write_to_json(lemma_pos, lemma_pos_filename, sort_keys=False)

    word_ner_filename = fh.make_filename(output_dir, 'word_ner', 'json')
    fh.write_to_json(word_ner, word_ner_filename, sort_keys=False)

    lemma_ner_filename = fh.make_filename(output_dir, 'lemma_ner', 'json')
    fh.write_to_json(lemma_ner, lemma_ner_filename, sort_keys=False)

    for key in dependencies.keys():
        for d in dicts2:
            d[key] = []
        sentences = dependencies[key]
        for s in sentences:
            for d in dicts2:
                d[key].append([])
            # each dependency appears to be a (head, relation, tail) triple
            for dep in s:
                dependency_links[key][-1].append(dep[1])
                dependency_heads[key][-1].append(dep[0] + '_' + dep[1])
                dependency_tails[key][-1].append(dep[1] + '_' + dep[2])
                dependency_tuples[key][-1].append(dep[0] + '_' + dep[1] + '_' + dep[2])
                dependency_pairs[key][-1].append(dep[0] + '_' + dep[2])

    dep_filename = fh.make_filename(output_dir, 'dependency_links', 'json')
    fh.write_to_json(dependency_links, dep_filename, sort_keys=False)

    dep_filename = fh.make_filename(output_dir, 'dependency_heads', 'json')
    fh.write_to_json(dependency_heads, dep_filename, sort_keys=False)

    dep_filename = fh.make_filename(output_dir, 'dependency_tails', 'json')
    fh.write_to_json(dependency_tails, dep_filename, sort_keys=False)

    dep_filename = fh.make_filename(output_dir, 'dependency_tuples', 'json')
    fh.write_to_json(dependency_tuples, dep_filename, sort_keys=False)

    dep_filename = fh.make_filename(output_dir, 'dependency_pairs', 'json')
    fh.write_to_json(dependency_pairs, dep_filename, sort_keys=False)
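# Illustrative example (hedged) of the data this function expects and writes:
# 'parsed' maps each document id to a list of sentences, each a list of token
# dicts as produced by parse_xml_files; the ids and values below are made up.
parsed_demo = {'doc1': [[
    {'word': 'Barack', 'lemma': 'Barack', 'POS': 'NNP', 'NER': 'PERSON'},
    {'word': 'Obama', 'lemma': 'Obama', 'POS': 'NNP', 'NER': 'PERSON'},
    {'word': 'spoke', 'lemma': 'speak', 'POS': 'VBD', 'NER': 'O'},
]]}
# After parse_summary_to_files(parsed_demo, {}, out_dir):
#   words.json     -> {'doc1': 'Barack Obama spoke'}
#   word_ner.json  -> {'doc1': [['Barack_Obama_PERSON']]}  (adjacent PERSON
#                     tokens are merged by the concatenation logic above)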