from tqdm import tqdm

import utils.data as u_data
# NOTE: the import paths below are assumptions; adjust them to the project layout.
import semantic_scholar.sssearch as sssearch
import features.paper_features as p_features
from utils.net import NET


def generate():
    # search init: fuzzy-match papers against the Semantic Scholar dump
    s3 = sssearch.SSSearch(
        u_data.semantic_scholar_dir,  # data dir path
        {                             # match thresholds
            "author": 50,
            "title": 50,
            "year": 2
        })

    # graph init
    graph = NET("network_citation_net")
    # TODO: color attribute for each paper
    # TODO: print statistics

    # iterate through papers (limited to the first 10 conferences)
    conf_filenames = [fn for fn in u_data.getConferenceFilenames()]
    conf_filenames = conf_filenames[:10]

    print("------------------------------------------------")
    print("[#] Analyzing Data:")
    for conf_filename in tqdm(conf_filenames):
        conf_name = conf_filename.replace(".json", "")
        data = u_data.getPapers(conf_filename)
        papers = data['papers']
        for paper in papers:
            key = paper["key"]
            bibtex = p_features.getCitationsData(key)
            s3entry, _ = s3.query("2017", None, paper["title"])

            # Dict organization:
            #   paper   : {key, title, authors}
            #   bibtex  : {author, journal, title, year}
            #   s3entry : {id, authors, title, abstract, journalName}

            # make node for this paper
            graph.addNode(s3entry["id"], {})  # TODO: node attributes

            # make edges to papers that this paper cites
            e_id = key
            uid_suffix = 0  # make same-paper edge ids unique
            for i in range(bibtex["size"]):
                s3cit, _ = s3.query(bibtex["year"], bibtex["author"],
                                    bibtex["title"])
                graph.addNode(s3cit["id"], {})  # TODO: node attributes
                graph.addEdge(e_id + str(uid_suffix), s3entry["id"],
                              s3cit["id"])
                uid_suffix += 1

    print("------------------------------------------------")
    print("[%] Writing file:")
    graph.write(u_data.systems_papers_directory + "net/")
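# Minimal entry point (an addition, not in the original script); assumes
# generate() needs no setup beyond the configured data directories.
if __name__ == "__main__":
    generate()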
import utils.data as u_data


def getAllAuthorsAttribute(attr):
    conf_filenames = u_data.getConferenceFilenames()
    values = {}
    for conf_filename in conf_filenames:
        authors = u_data.getAuthors(conf_filename)
        for a_name, a_data in authors.items():
            if isinstance(a_data, str) or attr not in a_data:
                continue
            val = a_data[attr]
            if a_name in values:
                # update existing entry, keeping the largest value seen
                values[a_name] = max(values[a_name], val)
            else:
                # add entry
                values[a_name] = val
    return values
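# Usage sketch (an addition): collect each author's highest recorded h-index.
# The "hindex" attribute name is an assumption drawn from the attribute files
# written by the collaboration-network script.
if __name__ == "__main__":
    hindices = getAllAuthorsAttribute("hindex")
    print(len(hindices), "authors with an 'hindex' value")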
import semantic_scholar.s2data as s2data
import utils.data as u_data
from tqdm import tqdm

# generate a shell script that greps the raw Semantic Scholar dump for papers
# citing our titles
fn_grepciters = u_data.systems_papers_directory + "script/grepciters.sh"
grepciters = open(fn_grepciters, "w+")

# remove old data file
grepciters.write("rm " + s2data.citers_fn + "\n")

# loop through titles of papers
cfns = [fn for fn in u_data.getConferenceFilenames()]
i = 0
l = len(cfns) - 1
for cfn in tqdm(cfns):
    cname = cfn.replace(".json", "")
    data = u_data.getPapers(cfn)
    papers = data["papers"]
    for p in papers:
        title = p["title"].replace('"', '\\"')
        # search the raw data, appending results to citers.json
        grepciters.write('grep -h -m 1 "' + title + '" '
                         + u_data.semantic_scholar_dir + '*.json >> '
                         + s2data.citers_fn + '\n')
        grepciters.write('echo "[' + str(i) + '/' + str(l) + '] '
                         + title + '"\n')
    i += 1

grepciters.close()
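# The generated script is meant to be run from the shell afterwards, e.g.
# (invocation is illustrative; the actual path depends on configuration):
#   bash <systems_papers_directory>/script/grepciters.sh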
import numpy as np
from tqdm import tqdm

import utils.data as u_data
# NOTE: a_features, u_colors, u_combos, utils, NET, and `directory` are
# assumed to come from project-local imports/config not shown here.


def generate(color_attribute):
    # graph
    graph = NET("collaboration_net" + "_color=" + color_attribute)

    # color attribute for each author
    color_attribute_dict = a_features.getAllAuthorsAttribute(color_attribute)
    color_attribute_dict_values = list(color_attribute_dict.values())
    color_attribute_min = min(color_attribute_dict_values)
    color_attribute_max = max(color_attribute_dict_values)

    with open(directory + graph.name + "_attributes.txt", "w+") as file:
        file.write("name " + color_attribute + "\n")
        for a_name, val in color_attribute_dict.items():
            file.write("\"" + a_name + "\" " + str(val) + "\n")

    # print statistics
    print("------------------------------------------------")
    print(color_attribute + " statistics:")
    print(" - min    :", color_attribute_min)
    print(" - max    :", color_attribute_max)
    print(" - mean   :", np.mean(color_attribute_dict_values))
    print(" - median :", np.median(color_attribute_dict_values))
    print(" - std    :", np.std(color_attribute_dict_values))

    def colorAttributeToColor(val):
        # min-max normalize into [0, 1], then blend from blue (low) to red (high)
        span = color_attribute_max - color_attribute_min
        norm = (val - color_attribute_min) / span if span else 0.0
        return u_colors.RGBToHexColor(norm, 0.0, 1 - norm)

    # iterate through papers
    conf_filenames = [fn for fn in u_data.getConferenceFilenames()]
    print("------------------------------------------------")
    features = a_features.getAllAuthorFeatures()
    print("[#] Analyzing Data:")
    for conf_filename in tqdm(conf_filenames):
        data = u_data.getPapers(conf_filename)
        papers = data['papers']
        for paper in papers:
            # paper data
            uid_suffix = 0  # make same-paper edges unique
            paper_key = paper['key']
            author_names = [utils.author_name(a)[0] for a in paper['authors']]
            author_unames = [
                a_features.getAuthorUName(a, features)
                for a in paper['authors']
            ]

            # iterate through collaborations (unordered author pairs)
            indices = [i for i in range(len(author_names))]
            for (i1, i2) in u_combos.pairs_unordered(indices):
                vals = [0]  # for calculating max val in pair
                aus = [author_unames[i1], author_unames[i2]]
                ans = [author_names[i1], author_names[i2]]
                for i in range(2):
                    au, an = aus[i], ans[i]
                    # attributes
                    a_attrs = {}
                    # color_attribute
                    if an in color_attribute_dict:
                        val = color_attribute_dict[an]
                        vals.append(val)
                        a_attrs[color_attribute] = val
                        # a_attrs["color"] = colorAttributeToColor(val)
                    # add node to graph
                    graph.addNode(au, a_attrs)
                # add edge to graph, colored by the larger attribute in the pair
                e_attrs = {"color": colorAttributeToColor(max(vals))}
                weight = 1
                graph.addEdge(paper_key + "_" + str(uid_suffix),
                              aus[0], aus[1], weight, e_attrs)
                uid_suffix += 1

    print("------------------------------------------------")
    print("[%] Writing file:")
    graph.write(directory)
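# Example invocation (an addition, not in the original script); "hindex" is the
# attribute this script's variables were originally named after, but any
# numeric per-author attribute exposed by a_features should work.
if __name__ == "__main__":
    generate("hindex")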
import json

import utils.data as data

# merge every conference's papers into a single JSON file
papers = []
cfns = data.getConferenceFilenames()
for cfn in cfns:
    papers += data.getPapers(cfn)["papers"]

with open(data.conf_directory + "allConferencePapers.json", "w+") as f:
    json.dump(papers, f)
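# Optional sanity check (an addition): reload the merged file and report how
# many papers it holds.
with open(data.conf_directory + "allConferencePapers.json") as f:
    print(len(json.load(f)), "papers merged")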