def main():
    """Copy each paper's temporary PDF to its open-access file name.

    Reads ``<home>/output/paper-excel.json`` (``home`` comes from the global
    config), and for every record derives two paths under the configured
    ``opendata`` directory from the record's ``link_open_access`` value:

    * source:      ``temp/<paper_id>.pdf`` where ``paper_id`` is the part of
      the link's last path segment that precedes the first ``-``;
    * destination: ``paper/<last path segment of the link>``.

    The source file is then copied to the destination with ``shutil.copy2``.
    Progress is printed to stdout.
    """
    config = load_gloabl_config()

    # Load the JSON listing that carries each paper's open-access URI.
    listing_path = os.path.join(config["home"], "output", "paper-excel.json")
    with open(listing_path, 'r') as handle:
        papers = lib_data.text2json(handle.read())

    print(len(papers))

    for info in papers:
        print(info["link_open_access"])

        # Last URL segment is the published file name; the text before its
        # first "-" is used as the paper id.
        target_name = info["link_open_access"].split("/")[-1]
        paper_id = target_name.split("-", 1)[0]

        source_path = os.path.join(
            config["opendata"], "temp/{}.pdf".format(paper_id))
        target_path = os.path.join(
            config["opendata"], "paper/{}".format(target_name))

        print("%s %s" % (source_path, target_path))
        shutil.copy2(source_path, target_path)
# Example no. 2
# 0
def _read_output_json(global_config, basename):
    """Read ``<home>/output/<basename>`` and parse it with ``lib_data.text2json``."""
    path = os.path.join(global_config["home"], "output", basename)
    with open(path, 'r') as f:
        return lib_data.text2json(f.read())


def _merge_industry_papers(list_paper_industry, map_paper_excel, map_name_session):
    """Register industry papers and group them by session name (in place).

    Every record is stored in ``map_paper_excel`` under ``str(paper_id)``
    (replacing any record already stored under that id) and its id is added
    to the ``paper_list`` of the matching entry in ``map_name_session``.
    """
    set_session_name = set()
    for paper in list_paper_industry:
        paper_id = str(paper["paper_id"])
        map_paper_excel[paper_id] = paper

        session_name = paper["session_name"]
        set_session_name.add(session_name)

        # Synthetic id: 100 + number of distinct session names seen so far.
        # Only used when the session has no entry yet.
        default_session_id = 100 + len(set_session_name)
        entry = map_name_session.setdefault(session_name, {
            "session_time": paper["session_time"],
            "session_name": session_name,
            "session_id": default_session_id,
            "session_index": default_session_id,
        })

        paper_list = entry.get("paper_list", [])
        # NOTE(review): the helper's return value is ignored, so it is
        # presumably mutating ``paper_list`` in place -- confirm in lib_data.
        lib_data.list_append_unique(paper_list, paper_id)
        entry["paper_list"] = paper_list
        entry["paper_count"] = len(entry["paper_list"])


def _load_session_csv(filename, map_name_session):
    """Parse the session CSV into a map keyed by the "Paper no." column.

    Rows with an empty "Paper no." are treated as session header rows and
    set the current session ("Session no" / "Title"); all other rows are
    paper rows that inherit the current session. ``map_name_session`` is
    updated in place: each paper row overwrites the entry for its session
    name, so the last paper row of a session is what remains there.
    """
    map_paper_session = {}
    with open(filename, 'r') as f:
        csvreader = UnicodeReader(f)
        headers = next(csvreader)
        session_no = None
        session_name = None
        session_index = 1
        for row in csvreader:
            entry = dict(zip(headers, row))

            if not entry.get("Paper no."):
                # Session header row: start a new session, restart numbering.
                session_no = entry["Session no"]
                session_name = entry["Title"].strip()
                session_index = 1
                continue

            # A paper row is expected to follow a session header row; if none
            # was seen yet, session_no is still None and .split() below fails.
            entry["session_no"] = session_no
            # Numeric id is the last space-separated token of "Session no".
            entry["session_id"] = int(session_no.split(" ")[-1])
            entry["session_name"] = session_name
            entry["session_index"] = session_index
            session_index += 1
            map_paper_session[entry["Paper no."]] = entry

            map_name_session[session_name] = entry
    return map_paper_session


def _load_event_csv(filename, map_name_session):
    """Parse the event CSV into a map of 1-based event id -> event dict.

    Each row has a "Time" column (``start-end``), a "day" column, and any
    number of further columns; every non-empty cell in those further columns
    becomes one event whose ``location`` is the column header. Cells that
    mention "Session" or "Industry Track:" must resolve to a key of
    ``map_name_session``; otherwise ``AssertionError`` is raised.
    """
    map_event_session = {}
    # Strips parenthesised notes such as "(...)" from a session title.
    parenthesised = re.compile(r"\([^\)]+\)")

    with open(filename, 'r') as f:
        csvreader = UnicodeReader(f)
        headers = next(csvreader)
        for row in csvreader:
            # Skip blank rows (row[0] would raise IndexError) and rows that
            # are commented out with a leading "#".
            if not row or row[0].startswith("#"):
                continue

            entry = dict(zip(headers, row))

            print(entry)

            event_start, event_end = entry["Time"].split("-")
            event_day = entry["day"]

            for k, v in entry.items():
                if k in ["Time", "day"]:
                    continue
                if not v:
                    continue

                event_id = len(map_event_session) + 1
                event = {
                    "day": event_day,
                    "start": event_start.strip(),
                    "end": event_end.strip(),
                    "name": v.strip(),
                    "location": k,
                    "id": event_id,
                }

                if "Session" in v or "Industry Track:" in v:
                    session_name = v.replace("Session:", "")
                    session_name = session_name.replace("Industry Track:", "")
                    session_name = parenthesised.sub("", session_name)
                    session_name = session_name.strip()

                    if session_name not in map_name_session:
                        print(session_name)
                        # Explicit raise instead of ``assert`` so the check
                        # still runs under ``python -O``.
                        raise AssertionError(
                            "unknown session name in event file: %r"
                            % (session_name,))

                    event["session_name"] = session_name

                map_event_session[event_id] = event
    return map_event_session


def load_paper_json():
    """Load all paper, session and event data files into lookup maps.

    Files read (``home`` comes from the global config):

    * ``<home>/output/paper-excel.json``
    * ``<home>/output/paper-industry.json``
    * ``<home>/output/paper-pdf.json``
    * ``<home>/data/session.csv``
    * ``<home>/data/event.csv``

    Record counts are printed to stdout as the files are processed.

    Returns:
        A 6-tuple ``(map_paper_excel, map_paper_excel_no, map_paper_pdf,
        map_paper_session, map_name_session, map_event_session)``:

        * ``map_paper_excel``: ``str(paper_id)`` -> record from
          paper-excel.json, plus/overridden by records from
          paper-industry.json.
        * ``map_paper_excel_no``: ``str(paper_no)`` -> record from
          paper-excel.json.
        * ``map_paper_pdf``: ``str(paper_id)`` -> record from paper-pdf.json.
        * ``map_paper_session``: "Paper no." value -> session.csv row dict
          extended with ``session_no``, ``session_id``, ``session_name`` and
          ``session_index``.
        * ``map_name_session``: session name -> session entry (industry
          sessions carry ``paper_list``/``paper_count``; sessions from
          session.csv map to the last paper row of that session).
        * ``map_event_session``: 1-based event id -> event dict.

    Raises:
        AssertionError: if event.csv names a session that is not present in
            ``map_name_session``.
    """
    global_config = load_gloabl_config()

    list_paper_excel = _read_output_json(global_config, "paper-excel.json")
    print(len(list_paper_excel))
    map_paper_excel = {}
    map_paper_excel_no = {}
    for paper in list_paper_excel:
        map_paper_excel[str(paper["paper_id"])] = paper
        map_paper_excel_no[str(paper["paper_no"])] = paper

    map_name_session = {}

    list_paper_industry = _read_output_json(global_config, "paper-industry.json")
    _merge_industry_papers(list_paper_industry, map_paper_excel, map_name_session)

    list_paper_pdf = _read_output_json(global_config, "paper-pdf.json")
    print(len(list_paper_pdf))
    map_paper_pdf = {}
    for paper in list_paper_pdf:
        map_paper_pdf[str(paper["paper_id"])] = paper

    filename = os.path.join(global_config["home"], "data", "session.csv")
    map_paper_session = _load_session_csv(filename, map_name_session)
    print(len(map_paper_session))

    filename = os.path.join(global_config["home"], "data", "event.csv")
    map_event_session = _load_event_csv(filename, map_name_session)
    # Report the number of events just loaded (previously this repeated the
    # paper-session count).
    print(len(map_event_session))

    return map_paper_excel, map_paper_excel_no, map_paper_pdf, map_paper_session, map_name_session, map_event_session