def interface():
    args = argparse.ArgumentParser()
    args.add_argument("-i", "--inst-file", help="Institution profiles")
    args.add_argument("-f", "--faculty-file", help="Faculty profiles")
    args.add_argument("-g", "--gs-dir", help="Directory of GS profiles")
    args.add_argument("-d", "--dblp-dir", help="Directory of DBLP profiles")
    args = args.parse_args()
    return args


if __name__ == "__main__":
    args = interface()

    inst = institution_parser.parse_institution_records(open(args.inst_file, "rU"))
    faculty = load_assistant_profs(open(args.faculty_file, "rU"), inst)

    # gs_prefix = os.path.join(args.gs_dir, 'GSP_')
    dblp_prefix = os.path.join(args.dblp_dir, "DBLP_")

    for f in faculty:
        # Check for each profile, download if missing
        """
        if 'gs' in f:
            gs_file = os.path.join(args.gs_dir, GS_FILE % f['gs'])
            if not os.path.isfile(gs_file):
                print 'GS -> ', f['facultyName']
                download_all_gs_pages(f['gs'], gs_prefix)
        """
        if "dblp" in f:
            dblp_file = os.path.join(args.dblp_dir, DBLP_FILE % f["dblp"])
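            # Sketch (not in the original fragment): the DBLP branch presumably
            # mirrors the commented-out GS logic above -- skip profiles already
            # cached on disk and fetch the rest. The helper name
            # download_dblp_page is a hypothetical stand-in for whatever
            # downloader the original script calls here.
            if not os.path.isfile(dblp_file):
                print 'DBLP -> ', f['facultyName']
                download_dblp_page(f['dblp'], dblp_prefix)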
            name = line.split(':', 1)[-1].strip()
            if name == next_name:
                output.write('# dblp_n : %d\n' % num_papers[next_ind])
                output.write('# dblp_n_2011 : %d\n' % num_papers_2011[next_ind])
                next_ind += 1
                if next_ind < max_ind:
                    next_name = names[next_ind]
                else:
                    done = True

    if not done:
        print 'WARNING: failed to link all z-scores!'

    output.close()


if __name__ == "__main__":
    args = interface()
    inst = institution_parser.parse_institution_records(open(args.inst_file))
    faculty = load_assistant_profs(open(args.faculty_file, 'rU'), inst)
    load.load_all_publications(faculty, args.dblp_dir, gs_dir=None)

    dists, tots = get_paper_counts_by_topic(faculty)
    means, stds = get_topic_means_stds(dists, tots)
    print means
    print stds

    set_zscores(faculty, means, stds)
    #add_zscores_to_file(faculty, args.faculty_file, args.output_file)
    add_counts_to_file(faculty, args.faculty_file, args.output_file)
    if len(w) < 3:
        return None
    return w


def add_words_from_title(words, title, stop_words, lem):
    for word in title.split():
        w = word_filter(word, stop_words, lem)
        if w:
            words.append(w)


if __name__ == "__main__":
    args = interface()
    faculty = load.load_assistant_profs(open(args.input_file))
    load.load_all_publications(faculty, args.dblp_dir, args.gs_dir)

    lem = WordNetLemmatizer()
    stop_words = stopwords.words('english')
    if args.custom_stops:
        custom_words = [word_filter(w.strip(), [], lem)
                        for w in open(args.custom_stops, 'rU')]
        stop_words += custom_words

    for f in faculty:
        tag = None
        if 'dblp_pubs' in f:
    # Finish writing the last record(s)
    if line:
        # Didn't reach the end yet, so grab the next line
        line = in_fp.readline()
        while line:
            output.write(line)
            line = in_fp.readline()

    print 'Linked %d DBLP profiles and %d GS profiles in new output file' % (dblp_linked, gs_linked)
    output.close()


if __name__ == "__main__":
    args = interface()
    faculty = load_assistant_profs(open(args.faculty_file, 'rU'))

    link_gs_profiles(faculty, args.gs_file)
    link_dblp_profiles(faculty, args.dblp_file)
    add_links_to_file(faculty, args.faculty_file, args.output_file)

    covered = 0
    both = 0
    for f in faculty:
        if 'gs' in f or 'dblp' in f:
            covered += 1
        else:
            print f['facultyName']
        if 'gs' in f and 'dblp' in f:
            both += 1

    print '%d of %d have at least one of the two profiles.' % (covered, len(faculty))
    print '%d of %d have both profiles.' % (both, len(faculty))
    if len(w) < 3:
        return None
    return w


def add_words_from_title(words, title, stop_words, lem):
    for word in title.split():
        w = word_filter(word, stop_words, lem)
        if w:
            words.append(w)


if __name__ == "__main__":
    args = interface()
    inst = institution_parser.parse_institution_records(open(args.inst_file))
    faculty = load.load_assistant_profs(open(args.fac_file), inst)
    load.load_all_publications(faculty, args.dblp_dir, args.gs_dir)

    lem = WordNetLemmatizer()
    stop_words = stopwords.words('english')
    if args.custom_stops:
        custom_words = [word_filter(w.strip(), [], lem)
                        for w in open(args.custom_stops, 'rU')]
        stop_words += custom_words

    written = 0
    for f in faculty:
        tag = None
        words = []
        if 'dblp_pubs' in f:
            tag = f['dblp']
    if len(w) < 3:
        return None
    return w


def add_words_from_title(words, title, stop_words, lem):
    for word in title.split():
        w = word_filter(word, stop_words, lem)
        if w:
            words.append(w)


if __name__ == "__main__":
    args = interface()
    inst = institution_parser.parse_institution_records(open(args.inst_file))
    faculty = load.load_assistant_profs(open(args.fac_file), inst)
    load.load_all_publications(faculty, args.dblp_dir, args.gs_dir)

    lem = WordNetLemmatizer()
    stop_words = stopwords.words('english')
    if args.custom_stops:
        custom_words = [word_filter(w.strip(), [], lem)
                        for w in open(args.custom_stops, 'rU')]
        stop_words += custom_words

    written = 0
    for f in faculty:
        tag = None
        words = []
HS_INST_FILE = os.path.join(DATA_DIR, 'inst_hs_CURRENT.txt')

# Colors
ACCENT_COLOR_1 = np.array([176., 116., 232.]) / 255.

# Load the standard set of files
# Business
bs_inst = institution_parser.parse_institution_records(open(BS_INST_FILE, 'rU'))
all_bs_faculty = [person for person in
                  faculty_parser.parse_faculty_records(open(BS_FACULTY_FILE, 'rU'),
                                                       school_info=bs_inst,
                                                       ranking='pi')]
bs_faculty = load.load_assistant_profs(open(BS_FACULTY_FILE, 'rU'),
                                       school_info=bs_inst,
                                       ranking='pi',
                                       year_start=1970,
                                       year_stop=2012)
# bs_faculty_df = convert_faculty_list_to_df(bs_faculty)

# Computer Science
cs_inst = institution_parser.parse_institution_records(open(CS_INST_FILE, 'rU'))
all_cs_faculty = [person for person in
                  faculty_parser.parse_faculty_records(open(CS_FACULTY_FILE, 'rU'),
                                                       school_info=cs_inst,
                                                       ranking='pi')]
cs_faculty = load.load_assistant_profs(open(CS_FACULTY_FILE, 'rU'),
                                       school_info=cs_inst,
                                       ranking='pi',
                                       year_start=1970,
    if len(w) < 3:
        return None
    return w


def add_words_from_title(words, title, stop_words, lem):
    for word in title.split():
        w = word_filter(word, stop_words, lem)
        if w:
            words.append(w)


if __name__ == "__main__":
    args = interface()
    faculty = load.load_assistant_profs(open(args.input_file))
    load.load_all_publications(faculty, args.dblp_dir, args.gs_dir)

    lem = WordNetLemmatizer()
    stop_words = stopwords.words('english')
    if args.custom_stops:
        custom_words = [word_filter(w.strip(), [], lem)
                        for w in open(args.custom_stops, 'rU')]
        stop_words += custom_words

    for f in faculty:
        tag = None
        if 'dblp_pubs' in f:
            tag = f['dblp']
            try:
                title = np.random.choice(f['dblp_pubs'])['title']