def extract_metadata():
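    # Extract first-page text and ISWC metadata from every PDF under <opendata>/temp
    # and write the results to output/paper-pdf.txt, .csv and .json.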
    with open("config.json") as f:
        global_config = json.load(f)
    print global_config

    filepath = os.path.join(global_config["opendata"],"temp/*.pdf")
    filenames = glob.glob(filepath)
    print len(filenames)

    list_field = []

    filename_output_text = os.path.join(global_config["home"],"output/paper-pdf.txt")
    filename_output_csv  = os.path.join(global_config["home"],"output/paper-pdf.csv")
    list_paper = []
    with codecs.open(filename_output_text, "wb","utf-8") as f:
        with open(filename_output_csv, "wb") as fcsv:
            writer = UnicodeWriter(fcsv)

            for filename in filenames:
                #if '87970177' not in filename:
                #    continue

                with open(filename,'r') as fpdf:
                    f.write(u"=================================\n\r")
                    f.write(filename)
                    f.write(u'\n\r')
                    f.write(u'\n\r')
                    ret = lib_pdf.pdf2text(fpdf, maxpages=1)
                    for p in ["title","number_of_pages", "text"]:
                        f.write("\n")
                        f.write("\n")
                        f.write(p)
                        f.write("\n")
                        print
                        if p == "number_of_pages":
                            content = str(ret[p])
                        else:
                            content = ret[p]

                        # str.decode() in Python 2 does not accept keyword arguments, so pass "ignore" positionally
                        f.write(content.decode("utf-8", "ignore"))

                    ret = lib_pdf.pdf2metadata_iswc(fpdf)
                    # paper_id is the PDF basename without the ".pdf" extension, e.g. 87970177
                    ret["paper_id"] = int(filename.split("/")[-1][:-4])
                    assert ret["author"]
                    list_paper.append(ret)
                    print json.dumps(ret,indent=4)
                    row = UtilString.json2list(ret, ["title","paper_id","author", "keyword","abstract"])
                    writer.writerow(row)

                #break

    filename_output_json  = os.path.join(global_config["home"],"output/paper-pdf.json")
    content = lib_data.json2text(list_paper)
    with codecs.open(filename_output_json, "w","utf-8") as f:
        f.write(content)
def main():
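    # Read the corresponding-author CSV exports, build one record per paper with its
    # track category, page range and open-access link, then write output/paper-excel.csv and .json.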
    filename = "config.json"
    filename = os.path.join(os.path.dirname(__file__), filename)
    with open(filename) as f:
        global_config = json.load(f)
    print global_config


    list_input = [
        {"filename": "8796CorrespondingAuthors.csv",
         #TODO
         #"link_publisher":"tba",
         "proceedings_uri": "http://data.semanticweb.org/conference/iswc/2014/proceedings-1",
         },
        {"filename": "8797CorrespondingAuthors.csv",
         #"link_publisher":"tba",
         "proceedings_uri": "http://data.semanticweb.org/conference/iswc/2014/proceedings-2",
         },
    ]

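    # Column order for the paper-excel.csv output.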
    list_field=[
        "author",
        "title",
        "pages",
        "year",
        "link_open_access",
        "link_publisher",
        "proceedings_uri",
        "paper_uri",
        "source_uri",
        "keywords",
        "abstract",
        "uri_me",
        "category",
        "source",
        "start_page",
        "paper_id",
        "EOL",
    ]
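    # Map spreadsheet column headers to internal field names.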
    map_key = {
        "Title":"title",
        "Authors":"author",
        "Start Page":"start_page",
        "Folder Index":"paper_id",
        "Paper no.":"paper_no",
    }

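    # Per-input fields copied verbatim onto every paper record from that file.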
    list_key = {
        "link_publisher",
        "proceedings_uri",
    }

    list_item = []
    counter = collections.Counter()

    for input_spec in list_input:
        filename = os.path.join(global_config["home"], "data", input_spec["filename"])
        print filename
        with open(filename, 'r') as f:
            csvreader = UnicodeReader(f)
            headers = csvreader.next()

            prev_item = None
            for row in csvreader:
                entry = dict(zip(headers, row))

                print entry

                item = {
                    "year":2014,
                    "uri_me":"http://data.semanticweb.org/conference/iswc/2014",
                    #"EOL":"EOL",
                }
                for k,v in map_key.items():
                    item[v] = entry[k].strip()

                for k in list_key:
                    if k in input_spec:
                        item[k] = input_spec[k]

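                # Derive the track category from the "Paper no." prefix (DC*, R*, D* or I*).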
                temp = entry["Paper no."]
                if temp.startswith("DC"):
                    counter["DC"] += 1
                    category = "Doctoral Consortium Paper"
                else:
                    counter[temp[0]] += 1
                    map_category = {
                        "R": "Research Track Paper",
                        "D": "Replication, Benchmark, Data and Software Track Paper",
                        "I": "Semantic Web In Use Track Paper",
                    }
                    category = map_category[temp[0]]

                item["category"]= category

                list_item.append(item)

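                # Infer each paper's page range from the next paper's start page;
                # the last paper in each file gets an open-ended range after the loop.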
                if prev_item:
                    prev_item["pages"]= "{}-{}".format(prev_item["start_page"], int(item["start_page"]) - 1)

                prev_item = item

            prev_item["pages"]= "{}-".format(prev_item["start_page"])

    # Update each paper: derive the open-access PDF link from the paper id and the slugified title.
    for item in list_item:

        #paper_name = re.sub("\W+", "-", item[u"title"]).lower()
        paper_name = slugify.slugify(item[u"title"])
        print item[u"title"]
        print paper_name

        item["link_open_access"] = "https://github.com/lidingpku/iswc2014/raw/master/paper/{}-{}.pdf".format(item['paper_id'],paper_name)
        print item["link_open_access"]


    print counter.most_common()
    print len(list_item)

    #create file
    filename = "paper-excel.csv"
    filename = os.path.join(global_config["home"],"output", filename)
    print filename
    with open(filename, "w") as f:
        csvwriter = UnicodeWriter(f)
        csvwriter.writerow(list_field)

        for item in list_item:
            row = UtilString.json2list(item, list_field)
            csvwriter.writerow(row)

    filename = "paper-excel.json"
    filename = os.path.join(global_config["home"],"output", filename)
    print filename
    with codecs.open(filename, "w","utf-8") as f:
        f.write(lib_data.json2text(list_item))
def render_json(data):
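    # Dump the aggregated data to JSON (full, per-type and expanded with session/talk details),
    # then render the HTML program, the paper-abstract CSV and the event TSV via mustache templates.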
    global_config = load_gloabl_config()


    filename = "iswc2014-data.json"
    filename = os.path.join(global_config["home"],"output", filename)
    with codecs.open(filename, "w","utf-8") as f:
        f.write(lib_data.json2text(data))



    for xtype in ["events","talks"]:
        filename = "iswc2014-data-{}.json".format(xtype)
        filename = os.path.join(global_config["home"],"output", filename)
        with codecs.open(filename, "w","utf-8") as f:
            temp = {xtype: data[xtype]}
            f.write(lib_data.json2text(temp))





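    # Index papers, talks and sessions by id so events and tracks can be expanded with their contents.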
    map_paper = {}
    for paper in data["papers"]:
        map_paper[paper["paper_id"]] = paper

    map_talk = {}
    for talk in data["talks"]:
        map_talk[talk["id"]]=talk

    map_session = {}
    for session in data["sessions"]:
        list_paper_in_session = []
        for paper_id in session["paper_list"]:
            paper = map_paper[paper_id]
            list_paper_in_session.append(paper)
        session["paper_all"]= list_paper_in_session

        map_session[session["session_name"]] = session

    for event in data["events"]:
        list_paper_in_session = []
        if "session_name" in event and event["session_name"] in map_session:
            session = map_session[event["session_name"]]
            event.update(session)
            for paper_id in session["paper_list"]:
                paper = map_paper[paper_id]
                talk = map_talk[paper_id]
                paper.update(talk)
                list_paper_in_session.append(paper)
            event["talk_all"]= list_paper_in_session

    # DC and Industry tracks (tracks[3] and tracks[4]): group their papers by session where session info exists.
    for track_index in [3,4]:
        track = data['tracks'][track_index]
        list_paper = []
        map_session = {}

        for paper_id in track["paper_list"]:
            paper = map_paper[paper_id]
            if "session_name" in paper:
                session_name = paper["session_name"]
                map_session[session_name] = map_session.get(session_name, lib_data.json_update({"paper_all":[]}, paper, ["session_name", "session_time"]))
                map_session[session_name]["paper_all"].append(paper)
                map_session[session_name]["paper_count"]= len(map_session[session_name]["paper_all"])
            else:
                list_paper.append(paper)


        section_id = "track_{}".format(track['track_id'])
        if list_paper:
            data[section_id] = list_paper
        else:
            data[section_id] = sorted(map_session.values(), key=lambda session: session["session_time"])




    filename = "iswc2014-data-expand.json"
    filename = os.path.join(global_config["home"],"output", filename)
    with codecs.open(filename, "w","utf-8") as f:
        f.write(lib_data.json2text(data))



    template = mustache_template.program2
    output = pystache.render(template, data)



    filename = "iswc2014-program.htm"
    filename = os.path.join(global_config["home"],"output", filename)
    with codecs.open(filename, "w","utf-8") as f:
        f.write(output)




    template = mustache_template.paper_csv
    output = pystache.render(template, data)

    filename = "iswc2014-paper-abstract.csv"
    filename = os.path.join(global_config["home"],"output", filename)
    with codecs.open(filename, "w","utf-8") as f:
        f.write(output)




    template = mustache_template.event_tsv
    output = pystache.render(template, data)
    output = output.replace("T01","T1")
    output = output.replace("T02","T2")
    output = output.replace("&amp;", "&")  # un-escape "&amp;" left by the HTML-escaping mustache render
    output = output.replace(" (*)","")
    output = output.replace(" (**)","")

    filename = "iswc2014-event.tsv"
    filename = os.path.join(global_config["home"],"output", filename)
    with codecs.open(filename, "w","utf-8") as f:
        f.write(output)
def create_json():
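    # Build the combined conference dataset: events, sessions, tracks, papers and
    # per-paper talks with computed start/end times.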
    map_paper_excel, map_paper_excel_no, map_paper_pdf, map_paper_session, map_name_session, map_event_session = load_paper_json()


    ret = {}
    #event_index
    list_event = sorted(map_event_session.values(), key=lambda event: event["id"])
    #print lib_data.json2text(list_session)
    ret["events"]= list_event

    map_session_event = {}
    for event in map_event_session.values():
        if "session_name" in event:
            map_session_event[event["session_name"]] = event



    #session_index
    map_session = {}
    for paper in sorted(map_paper_session.values(), key=lambda paper: paper["session_index"]):
        session_id = paper["session_id"]
        session_info = map_session.get(session_id, lib_data.json_update({},paper, ["session_no","session_name","session_id"]))
        map_session[session_id] =session_info

        paper_no = paper["Paper no."]
        paper_id = map_paper_excel_no[paper_no]['paper_id']
        paper_list = session_info.get("paper_list",[])
        lib_data.list_append_unique(paper_list, paper_id)
        session_info["paper_list"] =paper_list
        session_info["paper_count"]= len(session_info["paper_list"])


    list_session = map_session.values()
    for session in map_name_session.values():
        if "paper_list" in session:
            list_session.append(session)

    list_session = sorted(list_session, key=lambda session: session["session_id"])

    #print lib_data.json2text(list_session)
    ret["sessions"]= list_session

    #Track_index
    map_track = {}
    TRACK_MAP=[
        {"track_id":"In Use", "track_name":"In Use Track", "category": "Semantic Web In Use Track Paper"},
        {"track_id":"RDBS", "track_name":"Replication, Benchmark, Data and Software Track","category": "Replication, Benchmark, Data and Software Track Paper"},
        {"track_id":"Research", "track_name":"Research Track","category": "Research Track Paper"},
        {"track_id":"DC", "track_name":"Doctoral Consortium", "category":"Doctoral Consortium Paper"},
        {"track_id":"Industry", "track_name":"Industry Track","category": "Industry Track Paper"},
    ]

    for paper in map_paper_excel.values():
        category = paper["category"]
        track = map_track.get(category, {"category": category})
        map_track[category]=track

        paper_id = paper['paper_id']
        paper_list = track.get("paper_list",[])
        lib_data.list_append_unique(paper_list, paper_id)
        track["paper_list"] = sorted(paper_list)

    print lib_data.json2text(map_track.keys())

    for track in TRACK_MAP:
        if track["category"] in map_track:
            track["paper_list"]= map_track[track["category"]]["paper_list"]
            track["paper_count"]= len(track["paper_list"])

    ret["tracks"] = TRACK_MAP
    #print lib_data.json2text(TRACK_MAP)

    #map_paper_id2info
    for paper_id, paper in map_paper_excel.items():
        if paper_id.startswith("industry"):
            continue

        paper_pdf = map_paper_pdf.get(paper_id)

        lib_data.json_update(paper, paper_pdf, ["keywords", "abstract","number_of_pages"])
        if "pages" in paper:
            end_page = int(paper["start_page"])+ paper_pdf["number_of_pages"] - 1
            paper["pages"]= "{}-{}".format(paper["start_page"], end_page)

    list_paper = sorted(map_paper_excel.values(), key=lambda paper: paper["paper_id"])

    ret["papers"] = list_paper


    list_talk = []
    for session_info in ret["sessions"]:
        session_name = session_info["session_name"]
        start_diff = 0
        for paper_id in session_info["paper_list"]:
            paper_info = map_paper_excel[paper_id]

            event = map_session_event[session_name]
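            # Talk length in minutes: 15 in "Regular Talks" sessions, 10 for Pechakucha
            # sessions or starred papers, 20 otherwise.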
            if "Regular Talks" in session_name:
                diff_len = 15
            elif "Pechakucha" in session_name:
                diff_len = 10
            elif paper_info["paper_no"].endswith("*"):
                diff_len= 10
            else:
                diff_len= 20



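            # One talk record per paper; start_diff accumulates so talks follow each other within the session.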
            talk = {
                "day": event["day"],
                "start": time_add(event["start"], start_diff),
                "end": time_add(event["start"], start_diff+diff_len),
                "event": event["id"],
                "paper": paper_id,
                "paper_title": paper_info["title"],
                "paper_author": paper_info["author"],
                "id": paper_id,
            }
            print talk

            list_talk.append(talk)

            start_diff +=diff_len

    ret["talks"] = list_talk



    print lib_data.json2text(ret)

    return ret
def load_paper_json():
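    # Parse the industry-track programme from data/industry.txt into paper records
    # and write them to output/paper-industry.json.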
    filename = "config.json"
    filename = os.path.join(os.path.dirname(__file__), filename)
    with open(filename) as f:
        global_config = json.load(f)
    print global_config


    filename = "industry.txt"
    filename = os.path.join(global_config["home"],"data", filename)
    with open(filename, "r") as f:
        content = f.read()

    ret =[]
    list_line = []
    line_prev = None
    session_name = "Regular Talks"
    session_time=None
    time_slot = None
    paper_index = 1
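    # Parse industry.txt line by line: a line starting with "1" carries a time slot
    # (and possibly a session name); a line containing "." splits on its last period
    # into author and title parts.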
    for line in content.split("\n"):
        line=line.strip()

        if len(line)<=0:
            continue

        if line[0] == "1":
            #time was given
            time_slot, session_x = line.split(" ", 1)
            if "session" in session_x.lower():
                session_name = session_x
                session_time = time_slot

            rest_of_line = " ".join(line.split(" ")[1:])
        else:
            rest_of_line = line

        if "." in rest_of_line:
            parts = rest_of_line.rsplit(".",1)
            paper_id = "industry%02d" % paper_index
            paper_index+=1

            item ={
                "title":parts[1].strip(),
                "author":parts[0],
                "category":"Industry Track Paper",
                "session_name": session_name,
                "session_time": session_time,
                "paper_id": paper_id,
            }

            if line[0] == "1":
                item["talk_time"]= time_slot


            print '"{}","{}"'.format(item["author"],item["title"])

            ret.append(item)

        line_prev=line

    print len(list_line)
    print lib_data.json2text(ret)

    filename = "paper-industry.json"
    filename = os.path.join(global_config["home"],"output", filename)
    with codecs.open(filename, "w","utf-8") as f:
        f.write(lib_data.json2text(ret))

    return ret