def main():
    """Copy every paper's temporary PDF to its open-access file name.

    Reads the paper list from ``<home>/output/paper-excel.json``. For each
    paper, the last path segment of ``link_open_access`` is the target file
    name, and the part of that name before the first ``-`` is the paper id.
    The file ``<opendata>/temp/<paper id>.pdf`` is then copied (with
    metadata, via ``shutil.copy2``) to ``<opendata>/paper/<target name>``.
    Progress is printed to stdout.
    """
    config = load_gloabl_config()

    # load json to get paper uri
    json_path = os.path.join(config["home"], "output", "paper-excel.json")
    with open(json_path, 'r') as handle:
        papers = lib_data.text2json(handle.read())
    print(len(papers))

    for paper in papers:
        link = paper["link_open_access"]
        print(link)

        # "<paper id>-<rest>" is the published name; the temp copy is
        # stored under the bare paper id.
        target_name = link.split("/")[-1]
        paper_id = target_name.split("-", 1)[0]

        source_path = os.path.join(
            config["opendata"], "temp/{}.pdf".format(paper_id))
        target_path = os.path.join(
            config["opendata"], "paper/{}".format(target_name))

        print("%s %s" % (source_path, target_path))
        shutil.copy2(source_path, target_path)
def _read_output_json(global_config, basename):
    """Parse ``<home>/output/<basename>`` with ``lib_data.text2json``."""
    path = os.path.join(global_config["home"], "output", basename)
    with open(path, 'r') as f:
        return lib_data.text2json(f.read())


def _index_by_field(records, field):
    """Map ``str(record[field])`` to its record; a later duplicate wins."""
    return {str(record[field]): record for record in records}


def _group_industry_sessions(list_paper_industry):
    """Build one session entry per distinct ``session_name``.

    Each entry carries ``session_time`` (taken from the first paper seen for
    that session), ``session_name``, ``session_id``/``session_index``
    (101, 102, ... in order of first appearance), ``paper_list`` and
    ``paper_count``.
    """
    map_name_session = {}
    seen_session_names = set()
    for paper in list_paper_industry:
        paper_id = str(paper["paper_id"])
        session_name = paper["session_name"]

        seen_session_names.add(session_name)
        default_session_id = 100 + len(seen_session_names)
        # setdefault keeps the entry created by the first paper of a session.
        entry = map_name_session.setdefault(session_name, {
            "session_time": paper["session_time"],
            "session_name": session_name,
            "session_id": default_session_id,
            "session_index": default_session_id,
        })

        paper_list = entry.setdefault("paper_list", [])
        lib_data.list_append_unique(paper_list, paper_id)
        entry["paper_count"] = len(paper_list)
    return map_name_session


def _load_session_csv(global_config, map_name_session):
    """Read ``<home>/data/session.csv`` and return paper no. -> row entry.

    A row without a "Paper no." value is a session header: it sets the
    current session ("Session no", stripped "Title") and restarts the
    per-session paper counter. Every other row is a paper of the current
    session and is annotated with ``session_no``, ``session_id`` (the last
    space-separated token of the session number, as an int),
    ``session_name`` and ``session_index``.

    Side effect: ``map_name_session[session_name]`` is set to each paper
    row in turn, so it ends up holding the last paper row of the session.
    NOTE(review): that replaces any richer entry already stored under the
    same name -- confirm this is intended.
    """
    path = os.path.join(global_config["home"], "data", "session.csv")
    map_paper_session = {}
    with open(path, 'r') as f:
        csvreader = UnicodeReader(f)
        headers = next(csvreader)
        session_no = None
        session_name = None
        session_index = 1
        for row in csvreader:
            entry = dict(zip(headers, row))
            if not entry.get("Paper no."):
                # Session header row: start a new session.
                session_no = entry["Session no"]
                session_name = entry["Title"].strip()
                session_index = 1
                continue

            entry["session_no"] = session_no
            entry["session_id"] = int(session_no.split(" ")[-1])
            entry["session_name"] = session_name
            entry["session_index"] = session_index
            session_index += 1
            map_paper_session[entry["Paper no."]] = entry
            map_name_session[session_name] = entry
    return map_paper_session


def _session_name_from_event(label):
    """Strip the track prefix and any "(...)" note from an event label."""
    name = label.replace("Session:", "")
    name = name.replace("Industry Track:", "")
    name = re.sub(r"\([^\)]+\)", "", name)
    return name.strip()


def _load_event_csv(global_config, map_name_session):
    """Read ``<home>/data/event.csv`` and return event id -> event dict.

    Rows whose first cell starts with ``#`` are skipped. Apart from the
    "Time" ("start-end") and "day" columns, every non-empty cell becomes one
    event whose ``location`` is the column header and whose ``name`` is the
    stripped cell text. Ids are assigned 1, 2, ... in the order cells are
    visited (dict iteration order of the row).

    Cells containing "Session" or "Industry Track:" must resolve to a name
    present in ``map_name_session``; that name is stored as
    ``session_name`` on the event.

    Raises:
        AssertionError: if such a cell names a session that is unknown.
    """
    path = os.path.join(global_config["home"], "data", "event.csv")
    map_event_session = {}
    with open(path, 'r') as f:
        csvreader = UnicodeReader(f)
        headers = next(csvreader)
        for row in csvreader:
            if row[0].startswith("#"):
                continue

            entry = dict(zip(headers, row))
            print(entry)
            event_start, event_end = entry["Time"].split("-")
            event_day = entry["day"]

            for k, v in entry.items():
                if k in ("Time", "day") or not v:
                    continue

                event_id = len(map_event_session) + 1
                event = {
                    "day": event_day,
                    "start": event_start.strip(),
                    "end": event_end.strip(),
                    "name": v.strip(),
                    "location": k,
                    "id": event_id,
                }

                if "Session" in v or "Industry Track:" in v:
                    session_name = _session_name_from_event(v)
                    if session_name not in map_name_session:
                        print(session_name)
                        # Raised explicitly (not via ``assert``) so the
                        # check still runs under ``python -O``.
                        raise AssertionError(
                            "event refers to unknown session: %r"
                            % (session_name,))
                    event["session_name"] = session_name

                map_event_session[event_id] = event
    return map_event_session


def load_paper_json():
    """Load all paper, session and event metadata of the conference.

    Inputs, all located through ``load_gloabl_config()``:
    ``<home>/output/paper-excel.json``, ``paper-industry.json`` and
    ``paper-pdf.json``, plus ``<home>/data/session.csv`` and ``event.csv``.
    Sizes of the loaded collections are printed to stdout along the way.

    Returns:
        A 6-tuple of dicts:
        ``map_paper_excel`` (str paper_id -> paper, Excel and industry
        papers, industry winning on a shared id),
        ``map_paper_excel_no`` (str paper_no -> Excel paper),
        ``map_paper_pdf`` (str paper_id -> PDF paper record),
        ``map_paper_session`` ("Paper no." -> session.csv row entry),
        ``map_name_session`` (session name -> session entry),
        ``map_event_session`` (event id -> event).

    Raises:
        AssertionError: if event.csv refers to a session name that is in
            neither the industry papers nor session.csv.
    """
    global_config = load_gloabl_config()

    list_paper_excel = _read_output_json(global_config, "paper-excel.json")
    print(len(list_paper_excel))
    map_paper_excel = _index_by_field(list_paper_excel, "paper_id")
    map_paper_excel_no = _index_by_field(list_paper_excel, "paper_no")

    list_paper_industry = _read_output_json(
        global_config, "paper-industry.json")
    # Industry papers share the id lookup with the Excel papers.
    map_paper_excel.update(_index_by_field(list_paper_industry, "paper_id"))
    map_name_session = _group_industry_sessions(list_paper_industry)

    list_paper_pdf = _read_output_json(global_config, "paper-pdf.json")
    print(len(list_paper_pdf))
    map_paper_pdf = _index_by_field(list_paper_pdf, "paper_id")

    map_paper_session = _load_session_csv(global_config, map_name_session)
    print(len(map_paper_session))

    map_event_session = _load_event_csv(global_config, map_name_session)
    # Report the map that was just built (the paper/session count was
    # already printed above).
    print(len(map_event_session))

    return (map_paper_excel, map_paper_excel_no, map_paper_pdf,
            map_paper_session, map_name_session, map_event_session)