def extract_metadata():
    # Extract text and bibliographic metadata from every PDF under
    # <opendata>/temp/, writing three outputs under <home>/output/:
    # a human-readable debug dump (paper-pdf.txt), a CSV summary
    # (paper-pdf.csv) and a JSON list of all records (paper-pdf.json).
    # Python 2 script: print statements, byte/unicode mixing.
    with open("config.json") as f:
        global_config = json.load(f)
    print global_config
    # collect every PDF in the opendata temp folder
    filepath = os.path.join(global_config["opendata"],"temp/*.pdf")
    filenames = glob.glob(filepath)
    print len(filenames)
    list_field = []  # NOTE(review): never used in this function
    filename_output_text = os.path.join(global_config["home"],"output/paper-pdf.txt")
    filename_output_csv = os.path.join(global_config["home"],"output/paper-pdf.csv")
    list_paper = []
    with codecs.open(filename_output_text, "wb","utf-8") as f:
        with open(filename_output_csv, "wb") as fcsv:
            writer = UnicodeWriter(fcsv)
            for filename in filenames:
                # debugging hook to process a single problematic paper:
                #if '87970177' not in filename:
                #    continue
                with open(filename,'r') as fpdf:
                    f.write(u"=================================\n\r")
                    f.write(filename)
                    f.write(u'\n\r')
                    f.write(u'\n\r')
                    # first pass: text of page 1 only (title/abstract area)
                    ret = lib_pdf.pdf2text(fpdf, maxpages=1)
                    for p in ["title","number_of_pages", "text"]:
                        f.write("\n")
                        f.write("\n")
                        f.write(p)
                        f.write("\n")
                        print
                        if p == "number_of_pages":
                            content = str(ret[p])
                        else:
                            content = ret[p]
                        # assumes ret[p] is a utf-8 byte string; on a py2
                        # unicode value this decode would implicitly
                        # ascii-encode first -- TODO confirm lib_pdf's types
                        f.write(content.decode("utf-8",errors="ignore"))
                    # second pass: ISWC-specific metadata
                    # (title/author/keyword/abstract per the CSV row below)
                    ret = lib_pdf.pdf2metadata_iswc(fpdf)
                    # paper_id = numeric basename without the ".pdf" suffix
                    ret["paper_id"]= int(filename.split("/")[-1][:-4])
                    # every paper must have parsed at least one author
                    assert ret["author"]
                    list_paper.append(ret)
                    print json.dumps(ret,indent=4)
                    row = UtilString.json2list(ret, ["title","paper_id","author", "keyword","abstract"])
                    writer.writerow(row)
                #break
    # finally, dump every extracted record as one JSON document
    filename_output_json = os.path.join(global_config["home"],"output/paper-pdf.json")
    content = lib_data.json2text(list_paper)
    with codecs.open(filename_output_json, "w","utf-8") as f:
        f.write(content)
def main():
    # Build the master paper list from the publisher's "CorrespondingAuthors"
    # spreadsheets: read both proceedings CSVs, derive category / page ranges /
    # open-access links per paper, and write paper-excel.csv and
    # paper-excel.json under <home>/output/.  Python 2 script.
    filename = "config.json"
    filename = os.path.join(os.path.dirname(__file__), filename)
    with open(filename) as f:
        global_config = json.load(f)
    print global_config
    # one entry per proceedings volume
    list_input = [
        {"filename": "8796CorrespondingAuthors.csv",
         #TODO
         #"link_publisher":"tba",
         "proceedings_uri": "http://data.semanticweb.org/conference/iswc/2014/proceedings-1",
        },
        {"filename": "8797CorrespondingAuthors.csv",
         #"link_publisher":"tba",
         "proceedings_uri": "http://data.semanticweb.org/conference/iswc/2014/proceedings-2",
        },
    ]
    # output CSV column order
    list_field = [
        "author",
        "title",
        "pages",
        "year",
        "link_open_access",
        "link_publisher",
        "proceedings_uri",
        "paper_uri",
        "source_uri",
        "keywords",
        "abstract",
        "uri_me",
        "category",
        "source",
        "start_page",
        "paper_id",
        "EOL",
    ]
    # spreadsheet column header -> our field name
    map_key = {
        "Title":"title",
        "Authors":"author",
        "Start Page":"start_page",
        "Folder Index":"paper_id",
        "Paper no.":"paper_no",
    }
    # per-volume constants copied onto each paper when present in the input
    list_key = {
        "link_publisher",
        "proceedings_uri",
    }
    list_item = []
    counter = collections.Counter()  # papers per category prefix, for the summary print
    for input in list_input:
        filename = os.path.join( global_config["home"],"data", input["filename"])
        print filename
        with open(filename,'r') as f:
            csvreader = UnicodeReader(f)
            headers = csvreader.next()
            prev_item = None
            for row in csvreader:
                entry = dict(zip(headers, row))
                print entry
                item = {
                    "year":2014,
                    "uri_me":"http://data.semanticweb.org/conference/iswc/2014",
                    #"EOL":"EOL",
                }
                for k,v in map_key.items():
                    item[v] = entry[k].strip()
                for k in list_key:
                    if k in input:
                        item[k] = input[k]
                # category comes from the "Paper no." prefix:
                # DC* = Doctoral Consortium, otherwise first letter R/D/I
                temp = entry["Paper no."]
                if temp.startswith("DC"):
                    counter["DC"] += 1
                    category = "Doctoral Consortium Paper"
                else:
                    counter[temp[0]] += 1
                    map_category = {
                        "R": "Research Track Paper",
                        "D": "Replication, Benchmark, Data and Software Track Paper",
                        "I": "Semantic Web In Use Track Paper",
                    }
                    category = map_category[temp[0]]
                item["category"]= category
                list_item.append(item)
                # page range of the PREVIOUS paper ends where this one starts
                if prev_item:
                    prev_item["pages"]= "{}-{}".format(prev_item["start_page"], int(item["start_page"]) - 1)
                prev_item = item
            # last paper of the volume gets an open-ended range "start-"
            # NOTE(review): raises TypeError if the CSV had no data rows
            # (prev_item stays None) -- presumably never happens; confirm
            prev_item["pages"]= "{}-".format(prev_item["start_page"])
    #update: paper uri
    for item in list_item:
        #paper_name = re.sub("\W+", "-", item[u"title"]).lower()
        paper_name = slugify.slugify(item[u"title"])
        print item[u"title"]
        print paper_name
        item["link_open_access"] = "https://github.com/lidingpku/iswc2014/raw/master/paper/{}-{}.pdf".format(item['paper_id'],paper_name)
        print item["link_open_access"]
    print counter.most_common()
    print len(list_item)
    #create file
    filename = "paper-excel.csv"
    filename = os.path.join(global_config["home"],"output", filename)
    print filename
    with open(filename, "w") as f:
        csvwriter = UnicodeWriter(f)
        csvwriter.writerow(list_field)
        for item in list_item:
            row = UtilString.json2list(item, list_field)
            csvwriter.writerow(row)
    filename = "paper-excel.json"
    filename = os.path.join(global_config["home"],"output", filename)
    print filename
    with codecs.open(filename, "w","utf-8") as f:
        f.write(lib_data.json2text(list_item))
def render_json(data):
    # Expand the combined conference dict (papers/talks/sessions/events/tracks)
    # in place and render all derived outputs under <home>/output/:
    # several JSON dumps, the HTML program, a paper-abstract CSV and an
    # event TSV.  Mutates `data` (papers absorb talk fields, events absorb
    # session fields).  Python 2 script.
    global_config = load_gloabl_config()  # sic: external helper is named "gloabl"
    # raw dump of the unexpanded data
    filename = "iswc2014-data.json"
    filename = os.path.join(global_config["home"],"output", filename)
    with codecs.open(filename, "w","utf-8") as f:
        f.write(lib_data.json2text(data))
    # per-section dumps
    for xtype in ["events","talks"]:
        filename = "iswc2014-data-{}.json".format(xtype)
        filename = os.path.join(global_config["home"],"output", filename)
        with codecs.open(filename, "w","utf-8") as f:
            temp = {xtype: data[xtype]}
            f.write(lib_data.json2text(temp))
    # id -> record indexes
    map_paper = {}
    for paper in data["papers"]:
        map_paper[paper["paper_id"]] = paper
    map_talk = {}
    for talk in data["talks"]:
        map_talk[talk["id"]] = talk
    # attach full paper records to each session, index sessions by name
    map_session = {}
    for session in data["sessions"]:
        list_paper_in_session = []
        for paper_id in session["paper_list"]:
            paper = map_paper[paper_id]
            list_paper_in_session.append(paper)
        session["paper_all"] = list_paper_in_session
        map_session[session["session_name"]] = session
    # attach session data and full talk records to each event
    for event in data["events"]:
        list_paper_in_session = []
        if "session_name" in event and event["session_name"] in map_session:
            session = map_session[event["session_name"]]
            event.update(session)
            for paper_id in session["paper_list"]:
                paper = map_paper[paper_id]
                talk = map_talk[paper_id]
                paper.update(talk)  # merge talk schedule into the paper record
                list_paper_in_session.append(paper)
        event["talk_all"] = list_paper_in_session
    #dc, industry
    # tracks[3] (Doctoral Consortium) and tracks[4] (Industry) get their own
    # "track_<id>" section: flat list if papers have no session, otherwise
    # grouped by session and sorted by session_time
    for track_index in [3,4]:
        track = data['tracks'][track_index]
        list_paper = []
        map_session = {}
        for paper_id in track["paper_list"]:
            paper = map_paper[paper_id]
            if "session_name" in paper:
                session_name = paper["session_name"]
                map_session[session_name] = map_session.get(session_name, lib_data.json_update({"paper_all":[]}, paper, ["session_name", "session_time"]))
                map_session[session_name]["paper_all"].append(paper)
                map_session[session_name]["paper_count"] = len(map_session[session_name]["paper_all"])
            else:
                list_paper.append(paper)
        section_id = "track_{}".format(track['track_id'])
        if list_paper:
            data[section_id] = list_paper
        else:
            data[section_id] = sorted(map_session.values(), key=lambda session: session["session_time"])
    # dump of the fully expanded data
    filename = "iswc2014-data-expand.json"
    filename = os.path.join(global_config["home"],"output", filename)
    with codecs.open(filename, "w","utf-8") as f:
        f.write(lib_data.json2text(data))
    # HTML program page
    template = mustache_template.program2
    output = pystache.render(template, data)
    filename = "iswc2014-program.htm"
    filename = os.path.join(global_config["home"],"output", filename)
    with codecs.open(filename, "w","utf-8") as f:
        f.write(output)
    # paper abstracts as CSV
    template = mustache_template.paper_csv
    output = pystache.render(template, data)
    filename = "iswc2014-paper-abstract.csv"
    filename = os.path.join(global_config["home"],"output", filename)
    with codecs.open(filename, "w","utf-8") as f:
        f.write(output)
    # events as TSV, with cosmetic clean-ups
    template = mustache_template.event_tsv
    output = pystache.render(template, data)
    output = output.replace("T01","T1")
    output = output.replace("T02","T2")
    # pystache HTML-escapes '&' as '&amp;' in {{...}} interpolations; undo
    # that for the plain-text TSV.  FIX: the original line was the no-op
    # output.replace("&","&") -- almost certainly a mangled unescape.
    output = output.replace("&amp;","&")
    output = output.replace(" (*)","")
    output = output.replace(" (**)","")
    filename = "iswc2014-event.tsv"
    filename = os.path.join(global_config["home"],"output", filename)
    with codecs.open(filename, "w","utf-8") as f:
        f.write(output)
def create_json():
    # Assemble the combined conference dict consumed by render_json():
    # {"events", "sessions", "tracks", "papers", "talks"} built from the
    # excel/pdf/session/event maps returned by load_paper_json().
    # Python 2 script; returns the assembled dict.
    map_paper_excel, map_paper_excel_no, map_paper_pdf, map_paper_session, map_name_session, map_event_session = load_paper_json()
    ret = {}
    #event_index
    list_event = sorted(map_event_session.values(), key=lambda event: event["id"])
    #print lib_data.json2text(list_session)
    ret["events"]= list_event
    # session name -> hosting event, used below to schedule talks
    map_session_event = {}
    for event in map_event_session.values():
        if "session_name" in event:
            map_session_event[event["session_name"]] = event
    #session_index
    # group papers into sessions, preserving session_index order
    map_session = {}
    for paper in sorted(map_paper_session.values(), key=lambda paper: paper["session_index"]):
        session_id = paper["session_id"]
        # first paper of a session seeds the session record
        session_info = map_session.get(session_id, lib_data.json_update({},paper, ["session_no","session_name","session_id"]))
        map_session[session_id] =session_info
        # map the spreadsheet "Paper no." to our numeric paper_id
        paper_no = paper["Paper no."]
        paper_id = map_paper_excel_no[paper_no]['paper_id']
        paper_list = session_info.get("paper_list",[])
        lib_data.list_append_unique(paper_list, paper_id)
        session_info["paper_list"] =paper_list
        session_info["paper_count"]= len(session_info["paper_list"])
    list_session = map_session.values()
    # add name-keyed sessions (e.g. industry) that already carry a paper_list
    for session in map_name_session.values():
        if "paper_list" in session:
            list_session.append(session)
    list_session = sorted(list_session, key=lambda paper: paper["session_id"])
    #print lib_data.json2text(list_session)
    ret["sessions"]= list_session
    #Track_index
    # NOTE: render_json() indexes tracks[3] (DC) and tracks[4] (Industry),
    # so the order of TRACK_MAP is significant
    map_track = {}
    TRACK_MAP=[
        {"track_id":"In Use", "track_name":"In Use Track", "category": "Semantic Web In Use Track Paper"},
        {"track_id":"RDBS", "track_name":"Replication, Benchmark, Data and Software Track","category": "Replication, Benchmark, Data and Software Track Paper"},
        {"track_id":"Research", "track_name":"Research Track","category": "Research Track Paper"},
        {"track_id":"DC", "track_name":"Doctoral Consortium", "category":"Doctoral Consortium Paper"},
        {"track_id":"Industry", "track_name":"Industry Track","category": "Industry Track Paper"},
    ]
    # bucket paper ids by category
    for paper in map_paper_excel.values():
        category = paper["category"]
        track = map_track.get(category, {"category": category})
        map_track[category]=track
        paper_id = paper['paper_id']
        paper_list = track.get("paper_list",[])
        lib_data.list_append_unique(paper_list, paper_id)
        track["paper_list"] = sorted(paper_list)
    print lib_data.json2text(map_track.keys())
    # copy the buckets onto the canonical TRACK_MAP entries
    for track in TRACK_MAP:
        if track["category"] in map_track:
            track["paper_list"]= map_track[track["category"]]["paper_list"]
            track["paper_count"]= len(track["paper_list"])
    ret["tracks"] = TRACK_MAP
    #print lib_data.json2text(TRACK_MAP)
    #map_paper_id2info
    # enrich excel records with pdf-extracted fields; industry papers have
    # no pdf record and are skipped
    for paper_id, paper in map_paper_excel.items():
        if paper_id.startswith("industry"):
            continue
        paper_pdf = map_paper_pdf.get(paper_id)
        # NOTE(review): paper_pdf may be None if the pdf record is missing;
        # the json_update call presumably tolerates that, but the
        # number_of_pages access below would raise -- confirm
        lib_data.json_update(paper, paper_pdf, ["keywords", "abstract","number_of_pages"])
        if "pages" in paper:
            # recompute the exact end page from the real pdf page count
            end_page = int(paper["start_page"])+ paper_pdf["number_of_pages"] - 1
            paper["pages"]= "{}-{}".format(paper["start_page"], end_page)
    list_paper = sorted(map_paper_excel.values(), key=lambda paper: paper["paper_id"])
    ret["papers"] = list_paper
    # schedule one talk per paper within its session, packing talks
    # back-to-back from the session's start time
    list_talk = []
    for session_info in ret["sessions"]:
        session_name = session_info["session_name"]
        start_diff = 0  # minutes since the session start
        for paper_id in session_info["paper_list"]:
            paper_info = map_paper_excel[paper_id]
            event = map_session_event[session_name]
            # talk length in minutes, by session type / paper marker
            if "Regular Talks" in session_name:
                diff_len = 15
            elif "Pechakucha" in session_name:
                diff_len = 10
            elif paper_info["paper_no"].endswith("*"):
                diff_len= 10
            else:
                diff_len= 20
            talk = {
                "day": event["day"],
                "start": time_add(event["start"], start_diff),
                "end": time_add(event["start"], start_diff+diff_len),
                "event": event["id"],
                "paper": paper_id,
                "paper_title": paper_info["title"],
                "paper_author": paper_info["author"],
                "id": paper_id,
            }
            print talk
            list_talk.append(talk)
            start_diff +=diff_len
    ret["talks"] = list_talk
    print lib_data.json2text(ret)
    return ret
def load_paper_json(): filename = "config.json" filename = os.path.join(os.path.dirname(__file__), filename) with open(filename) as f: global_config = json.load(f) print global_config filename = "industry.txt" filename = os.path.join(global_config["home"],"data", filename) with open(filename, "r") as f: content = f.read() ret =[] list_line = [] line_prev = None session_name = "Regular Talks" session_time=None time_slot = None paper_index = 1 for line in content.split("\n"): line=line.strip() if len(line)<=0: continue if line[0] == "1": #time was given time_slot, session_x = line.split(" ", 1) if "session" in session_x.lower(): session_name = session_x session_time = time_slot rest_of_line = " ".join(line.split(" ")[1:]) else: rest_of_line = line if "." in rest_of_line: parts = rest_of_line.rsplit(".",1) paper_id = "industry%02d" % paper_index paper_index+=1 item ={ "title":parts[1].strip(), "author":parts[0], "category":"Industry Track Paper", "session_name": session_name, "session_time": session_time, "paper_id": paper_id, } if line[0] == "1": item["talk_time"]= time_slot print '"{}","{}"'.format(item["author"],item["title"]) ret.append(item) line_prev=line print len(list_line) print lib_data.json2text(ret) filename = "paper-industry.json" filename = os.path.join(global_config["home"],"output", filename) with codecs.open(filename, "w","utf-8") as f: f.write(lib_data.json2text(ret)) return ret