def get_s2_conferences():
    """Map each s2 paper id to the conference it was published in.

    Returns:
        dict: s2 paper_id => conf_name, containing only those s2 papers
        whose title matches (case-insensitively) a paper in some conference.
    """
    # all_conf_papers : [(conf_name, conf_papers)]
    all_conf_papers = list(get_each_conference_papers())
    papers = s2data.get_dict_gA()

    # Build a lowercase-title -> conference lookup table ONCE, instead of
    # re-scanning every conference's paper list for every s2 paper
    # (O(P * C) -> O(P + C)).  setdefault keeps the FIRST conference seen
    # for a duplicated title, matching the original scan order.
    title_to_conf = {}
    for conf_name, conf_papers in all_conf_papers:
        for paper in conf_papers:
            title_to_conf.setdefault(paper["title"].lower(), conf_name)

    s2_paper_confs = {}  # paper_id => conf_name
    # iterate through all s2 papers
    for paper_id, paper in papers.items():
        conf_name = title_to_conf.get(paper["title"].lower())
        if conf_name:
            s2_paper_confs[paper_id] = conf_name
    return s2_paper_confs
def get_s2id_to_key():
    """Build a mapping from semantic-scholar id to conference paper key.

    HAVE:
        raw_papers    :: [ raw_paper = (title, key) ]
        s2id_to_paper :: s2id => paper
        bad_to_good   :: bad_title => good_title
    GOAL:
        s2id_to_key   :: s2id => paper_key
    """
    conference_papers = list(get_each_conference_papers())
    raw_papers = []
    for _conf_name, conf_papers in conference_papers:
        raw_papers.extend(conf_papers)
    s2id_to_paper = s2data.get_dict_gA()
    bad_to_good = missing_dicts.get_bad_to_good()

    # good_to_s2id :: good_title => s2id
    good_to_s2id = {paper["title"]: s2id for s2id, paper in s2id_to_paper.items()}

    # title_to_id :: lowercased title (good or bad spelling) => s2id
    title_to_id = {}
    # fill good titles
    for good_title, s2id in good_to_s2id.items():
        title_to_id[good_title.lower()] = s2id
    # fill bad titles; a bad title whose good counterpart is not a known
    # s2 title raises KeyError here (deliberate data invariant)
    for bad_title, good_title in bad_to_good.items():
        title_to_id[bad_title.lower()] = title_to_id[good_title.lower()]

    # title_to_key :: lowercased title => key
    title_to_key = {paper["title"].lower(): paper["key"] for paper in raw_papers}

    # s2id_to_key :: s2id => paper_key
    s2id_to_key = {}
    missing = 0
    for title, key in tqdm(title_to_key.items()):
        s2id = title_to_id.get(title)
        if s2id is None:
            missing += 1
            continue
        s2id_to_key[s2id] = key
    print("missing:", missing)
    return s2id_to_key
def generate():
    """Build and write the paper-level citation graph as a GEXF file."""
    ################################################################
    print("[#] Initializing GEXF")
    # graph init
    graph = GEXF("citations_papers")
    # parameters
    graph.setParameter("graph", "defaultedgetype", "directed")
    # attributes
    graph.addAttribute("node", "conference", "string", "")
    graph.addAttribute("node", "title", "string", "")
    graph.addAttribute("node", "year", "string", "")
    # TODO: color attribute for each paper
    # TODO: print statistics
    ################################################################
    print("[#] Loading Data:")
    gA = s2data.get_dict_gA()
    ################################################################
    print("[#] Analyzing Data:")

    def safeindex(d, k):
        # tolerate missing fields in the s2 records
        return d[k] if k in d else "MISSING"

    # BUG FIX: the edge id used to be the paper *title*, which collides
    # for every paper with more than one outgoing citation.  Use a unique
    # incrementing id instead (same scheme as the conference-level graph).
    edge_id = 0
    for paper_id, paper in gA.items():  # renamed from `id` (shadowed builtin)
        graph.addNode(paper_id, {
            "title": safeindex(paper, "title"),
            "conference": safeindex(paper, "venue"),
            "year": str(safeindex(paper, "year")),
        })
        # some records may lack "outCitations"; treat as no outgoing edges
        for out_id in paper.get("outCitations", []):
            graph.addEdge(str(edge_id), paper_id, out_id, 1)
            edge_id += 1
    ################################################################
    print("[#] Writing file:")
    graph.write("/home/blancheh/SystemsAnalysis/systems-papers/gexf/")
import utils.combinatorics as u_combos
import utils.debug as debug
from tqdm import tqdm
import networkx as nx

# modules
import utils.data as u_data
import authors.author_features as a_features
import semantic_scholar.s2data as s2data
from papers_network.papers_network import PapersNetwork

################################################################################
# Load Data

debug.message("Loading Data")
papers = s2data.get_dict_gA()
print()

################################################################################
# Initialization

debug.message("Creating Papers Network")
G = PapersNetwork()
G.add_papers(papers)
G.fill_graph()

# (was wrapped in an always-true `if True:` toggle; runs unconditionally)
debug.message("Analyzing Network Statistics")
G.save_adjacency_matrix_csv()
quit()
def generate():
    """Build and write the conference-level citation graph as a GEXF file."""
    ################################################################
    print("[#] Initializing GEXF")
    # graph init
    graph = GEXF("citations_conferences")
    # parameters
    graph.setParameter("graph", "defaultedgetype", "directed")
    # TODO: color attribute for each paper
    # TODO: print statistics
    ################################################################
    print("[#] Loading Data:")
    gA = s2data.get_dict_gA()
    gB = s2data.get_dict_gB()
    ################################################################
    print("[#] Analyzing Data:")

    # seen conference names; a set (was a list) makes the membership
    # test in addNode_safe O(1) instead of O(n) per paper
    conferences = set()
    edge_id = 0
    missing_count = 0

    def addNode_safe(conf):
        # add each conference node exactly once
        if conf not in conferences:
            graph.addNode(conf, {})
            conferences.add(conf)

    for source_id, source_paper in gA.items():
        # source node
        source_conf = conf_utils.normalize_conference(source_paper["venue"])
        if len(source_conf) == 0:
            continue
        addNode_safe(source_conf)
        # for each outcite
        for target_id in source_paper["outCitations"]:
            if target_id not in gB:
                missing_count += 1
                continue
            # target node
            target_conf = conf_utils.normalize_conference(gB[target_id]["venue"])
            if len(target_conf) == 0:
                continue
            addNode_safe(target_conf)
            # one edge per citation; str id keeps GEXF edge ids unique
            graph.addEdge(str(edge_id), source_conf, target_conf)
            edge_id += 1

    print("[>] missing count:", missing_count)
    ################################################################
    print("[#] Writing file:")
    graph.write("/home/blancheh/SystemsAnalysis/systems-papers/gexf/")
import re
import math
import json

#
# Goal: find the citers that weren't grepped into `citersdict`
#
# DEPRECATED
#   compile all of the s2 corpus files together into one big megafile,
#   so its faster to grep it for titles
#

gA_dict = s2data.get_dict_gA()
# one big string haystack so title lookups become substring searches
gA_string = str(gA_dict)

#
# search for all the papers
#
missing = {}

# loop through titles of papers
cfns = list(u_data.getConferenceFilenames())
last_i = len(cfns) - 1

# start index
i = 0
import utils.strings as util_str
import conferences.conferences as conf_data
import semantic_scholar.s2data as s2_data
import utils.json as u_json
from tqdm import tqdm

# output locations for the missing-title matching artifacts
DATA_DIR = "find_missing/data/"
MISSING_RAWTITLES_FN = DATA_DIR + "missing_rawtitles.txt"
RAWTITLE_TO_S2TITLE_FN = DATA_DIR + "rawtitle_to_s2title.json"
RAWTITLE_TO_S2ID_FN = DATA_DIR + "rawtitle_to_s2id.json"

conferences = list(conf_data.get_each_conference_papers())
# flatten [(conf_name, conf_papers)] into a single list of raw papers
rawpapers = sum([conf_papers for conf_name, conf_papers in conferences], [])
known_papers = s2_data.get_dict_gA().values()

# NOTE(review): unused in the visible chunk — presumably an edit-distance
# cutoff used further down; confirm against the full file.
match_threshold = 5

def is_match(s1, s2):
    # case-insensitive exact title comparison
    return s1.lower() == s2.lower()

rawtitle_to_s2title = {}
rawtitle_to_s2id = {}

def is_known(rawpaper):
    # NOTE(review): this definition appears truncated in this chunk; the
    # loop body continues beyond the visible source.
    rawtitle = rawpaper["title"]
    for paper in known_papers:
        known_title = paper["title"]
# modules
from gexf.gexf import GEXF
import utils.data as u_data
import semantic_scholar.s2data as s2data
import chord.chord as chord
import chord.chord_colors as chord_colors

################################################################
# parameters

threshold = 150

################################################################
print("[#] Loading Data:")
gA = s2data.get_dict_gA()
gB = s2data.get_dict_gB()

################################################################
print("[#] Analyzing Data:")

# conferences :: { source_conference : { target_conference : #citations } }
conferences = {}

def inc_conf(source, target):
    """Bump the source->target citation counter, creating entries on demand."""
    targets = conferences.setdefault(source, {})
    targets[target] = targets.get(target, 0) + 1
import utils.strings as util_str
import conferences.conferences as conf_data
import semantic_scholar.s2data as s2_data
import json
from tqdm import tqdm

id_to_paper = s2_data.get_dict_gA()
# id_to_title :: s2 paper id => title
id_to_title = {p_id: p["title"] for p_id, p in id_to_paper.items()}

PARENT_DIR = "find_missing/"
BAD_TITLE_TO_ID_FN = PARENT_DIR + "bad_title_to_id.json"
PARTIAL_TO_GOOD_FN = PARENT_DIR + "partial_to_good.json"
BAD_TO_GOOD_FN = PARENT_DIR + "bad_to_good.json"

def load_json(fn):
    """Read and parse the JSON file at fn."""
    with open(fn, "r+") as f:
        return json.load(f)

def save_json(fn, obj):
    """Write obj to fn as pretty-printed JSON."""
    with open(fn, "w+") as f:
        json.dump(obj, f, indent=4)

bad_title_to_id = load_json(BAD_TITLE_TO_ID_FN)
partial_to_good = load_json(PARTIAL_TO_GOOD_FN)

# get good titles for bad titles
import utils.strings as util_str
import conferences.conferences as conf_data
import semantic_scholar.s2data as s2_data
import utils.json as u_json
from tqdm import tqdm

# file locations for the edit-distance matching pipeline
DATA_DIR = "find_missing/data/"
MISSING_RAWTITLES_FN = DATA_DIR + "missing_rawtitles.txt"
MISSING_RAWTITLES_ED_FN = DATA_DIR + "missing_rawtitles_editdistance.txt"
RAWTITLE_TO_S2TITLE_FN = DATA_DIR + "rawtitle_to_s2title.json"
RAWTITLE_TO_S2ID_FN = DATA_DIR + "rawtitle_to_s2id.json"
MISSING_RAWTITLES_ED_IDS_FN = DATA_DIR + "missing_rawtitles_editdistance_ids.txt"

s2id_to_s2paper = s2_data.get_dict_gA()

# raw titles matched by edit distance, and their recorded s2 ids
with open(MISSING_RAWTITLES_ED_FN, "r+") as file:
    found_rawtitles = [line.strip() for line in file]
with open(MISSING_RAWTITLES_ED_IDS_FN, "r+") as file:
    found_s2ids = [line.strip() for line in file]

rawtitle_to_s2id = u_json.load_json(RAWTITLE_TO_S2ID_FN)
rawtitle_to_s2title = u_json.load_json(RAWTITLE_TO_S2TITLE_FN)

# walk the rawtitle/s2id pairs in lockstep (still raises IndexError if the
# ids file is shorter than the titles file, as the original did)
for i, found_rawtitle in enumerate(found_rawtitles):
    found_s2id = found_s2ids[i]
    # "!" appears to mark an entry with no accepted match — skip it
    if found_s2id == "!":
        continue
    if found_s2id not in s2id_to_s2paper:
        print("unfound:", found_s2id)
        continue
import utils.strings as util_str import conferences.conferences as conf_data import semantic_scholar.s2data as s2_data from tqdm import tqdm VERSION = 2 source_fn = "finding_missing/find_missing_papers_result.txt" target_fn = "finding_missing/find_missing_papers_result_{0}.txt".format(VERSION) known = s2_data.get_dict_gA() known_titles = [ p["title"] for p in known.values() ] title_to_id = { p["title"] : p_id for p_id, p in known.items() } # DONE # conferences = list(conf_data.get_each_conference_papers()) # papers = sum([ conf_papers for conf_name, conf_papers in conferences ], []) # paper_titles = [ p["title"] for p in papers ] with open(source_fn) as file: paper_titles = [ line.strip() for line in file ] match_threshold = 5 def is_match(s1, s2): return s1.lower() == s2.lower() # return or util_str.editDistance(s1, s2) <= match_threshold def is_known(paper_title): if paper_title in known_titles: return True return any([ is_match(paper_title, known_title)