コード例 #1
0
def extract_metadata():
    with open("config.json") as f:
        global_config = json.load(f)
    print global_config

    filepath = os.path.join(global_config["opendata"],"temp/*.pdf")
    filenames = glob.glob(filepath)
    print len(filenames)

    list_field = []

    filename_output_text = os.path.join(global_config["home"],"output/paper-pdf.txt")
    filename_output_csv  = os.path.join(global_config["home"],"output/paper-pdf.csv")
    list_paper = []
    with codecs.open(filename_output_text, "wb","utf-8") as f:
        with open(filename_output_csv, "wb") as fcsv:
            writer = UnicodeWriter(fcsv)

            for filename in filenames:
                #if '87970177' not in filename:
                #    continue

                with open(filename,'r') as fpdf:
                    f.write(u"=================================\n\r")
                    f.write(filename)
                    f.write(u'\n\r')
                    f.write(u'\n\r')
                    ret = lib_pdf.pdf2text(fpdf, maxpages=1)
                    for p in ["title","number_of_pages", "text"]:
                        f.write("\n")
                        f.write("\n")
                        f.write(p)
                        f.write("\n")
                        print
                        if p == "number_of_pages":
                            content = str(ret[p])
                        else:
                            content = ret[p]

                        f.write(content.decode("utf-8",errors="ignore"))

                    ret = lib_pdf.pdf2metadata_iswc(fpdf)
                    ret["paper_id"]= int(filename.split("/")[-1][:-4])
                    assert ret["author"]
                    list_paper.append(ret)
                    print json.dumps(ret,indent=4)
                    row = UtilString.json2list(ret, ["title","paper_id","author", "keyword","abstract"])
                    writer.writerow(row)

                #break

    filename_output_json  = os.path.join(global_config["home"],"output/paper-pdf.json")
    content = lib_data.json2text(list_paper)
    with codecs.open(filename_output_json, "w","utf-8") as f:
        f.write(content)
コード例 #2
0
def main():
    filename = "config.json"
    filename = os.path.join(os.path.dirname(__file__), filename)
    with open(filename) as f:
        global_config = json.load(f)
    print global_config


    list_input = [
        {"filename": "8796CorrespondingAuthors.csv",
         #TODO
         #"link_publisher":"tba",
         "proceedings_uri": "http://data.semanticweb.org/conference/iswc/2014/proceedings-1",
         },
        {"filename": "8797CorrespondingAuthors.csv",
         #"link_publisher":"tba",
         "proceedings_uri": "http://data.semanticweb.org/conference/iswc/2014/proceedings-2",
         },
    ]

    list_field=[
        "author",
        "title",
        "pages",
        "year",
        "link_open_access",
        "link_publisher",
        "proceedings_uri",
        "paper_uri",
        "source_uri",
        "keywords",
        "abstract",
        "uri_me",
        "category",
        "source",
        "start_page",
        "paper_id",
        "EOL",
    ]
    map_key = {
        "Title":"title",
        "Authors":"author",
        "Start Page":"start_page",
        "Folder Index":"paper_id",
        "Paper no.":"paper_no",
    }

    list_key = {
        "link_publisher",
        "proceedings_uri",
    }

    list_item = []
    counter = collections.Counter()

    for input in list_input:
        filename = os.path.join( global_config["home"],"data", input["filename"])
        print filename
        with open(filename,'r') as f:
            csvreader = UnicodeReader(f)
            headers = csvreader.next()

            prev_item = None
            for row in csvreader:
                entry = dict(zip(headers, row))

                print entry

                item = {
                    "year":2014,
                    "uri_me":"http://data.semanticweb.org/conference/iswc/2014",
                    #"EOL":"EOL",
                }
                for k,v in map_key.items():
                    item[v] = entry[k].strip()

                for k in list_key:
                    if k in input:
                        item[k] = input[k]

                temp = entry["Paper no."]
                if temp.startswith("DC"):
                    counter["DC"] += 1
                    category = "Doctoral Consortium Paper"
                else:
                    counter[temp[0]] += 1
                    map_category = {
                        "R": "Research Track Paper",
                        "D": "Replication, Benchmark, Data and Software Track Paper",
                        "I": "Semantic Web In Use Track Paper",
                    }
                    category = map_category[temp[0]]

                item["category"]= category

                list_item.append(item)

                if prev_item:
                    prev_item["pages"]= "{}-{}".format(prev_item["start_page"], int(item["start_page"]) - 1)

                prev_item = item

            prev_item["pages"]= "{}-".format(prev_item["start_page"])

    #update: paper uri
    for item in list_item:

        #paper_name = re.sub("\W+", "-", item[u"title"]).lower()
        paper_name = slugify.slugify(item[u"title"])
        print item[u"title"]
        print paper_name

        item["link_open_access"] = "https://github.com/lidingpku/iswc2014/raw/master/paper/{}-{}.pdf".format(item['paper_id'],paper_name)
        print item["link_open_access"]


    print counter.most_common()
    print len(list_item)

    #create file
    filename = "paper-excel.csv"
    filename = os.path.join(global_config["home"],"output", filename)
    print filename
    with open(filename, "w") as f:
        csvwriter = UnicodeWriter(f)
        csvwriter.writerow(list_field)

        for item in list_item:
            row = UtilString.json2list(item, list_field)
            csvwriter.writerow(row)

    filename = "paper-excel.json"
    filename = os.path.join(global_config["home"],"output", filename)
    print filename
    with codecs.open(filename, "w","utf-8") as f:
        f.write(lib_data.json2text(list_item))
コード例 #3
0
def _merge_registration(json_person, entries, suffix, flag):
    """Merge registration CSV rows into json_person, keyed by lower-cased name.

    Each entry contributes "email_<suffix>" and "id_<suffix>" fields and
    sets the boolean *flag* ("paid" or "attend") on the person record,
    creating the record with default False flags if the name is new.
    """
    for entry in entries:
        key = entry["name"].lower()
        if key in json_person:
            data = json_person[key]
        else:
            data = {"name": entry["name"],
                    "paid": False,
                    "attend": False,
                    "paper": []}
            json_person[key] = data
        data["email_" + suffix] = entry["email"]
        data["id_" + suffix] = entry["id"]
        data[flag] = True


def main():
    """Join ISWC 2013 payment/attendance CSVs with the paper list.

    Produces stat_paper.csv (per paper: which authors paid / attend)
    and stat_person.csv (per person: paid/attend flags and papers).
    """
    ###################################################################
    # load config file
    with open("config.json") as f:
        global_config = json.load(f)

    # load three csv files, aggregate them to form a join;
    # the payments/attendees merge used to be two duplicated loops
    json_person = {}
    filename = os.path.join(global_config["home"], "data/work/iswc2013/raw/payments.csv")
    _merge_registration(json_person, UtilCsv.csv2json(filename), "payment", "paid")

    filename = os.path.join(global_config["home"], "data/work/iswc2013/raw/attendees.csv")
    _merge_registration(json_person, UtilCsv.csv2json(filename), "attendees", "attend")

    json_output = []

    filename = os.path.join(global_config["home"], "data/source/iswc-2013-paper.csv")
    json_paper = UtilCsv.csv2json(filename)

    # split authors and match each name against registered people
    for entry in json_paper:
        title = entry["title"]
        entry["author_list"] = [x.strip() for x in entry["author"].split(',')]

        data_paper = {"paid": [], "attend": []}
        for key in ["title", "category", "author"]:
            data_paper[key] = entry[key]

        json_output.append(data_paper)

        for name in entry["author_list"]:
            key = name.lower()
            if key in json_person:
                json_person[key]["paper"].append(title)

                if json_person[key]["paid"]:
                    data_paper["paid"].append(name)
                if json_person[key]["attend"]:
                    data_paper["attend"].append(name)

    filename_output = os.path.join(global_config["home"], "data/work/iswc2013/raw/stat_paper.csv")
    with open(filename_output, "w") as f:
        csvwriter = UnicodeWriter(f)

        headers = ["category", "author", "title", "paid", "attend"]
        csvwriter.writerow(headers)

        for entry in json_output:
            row = UtilString.json2list(entry, headers)
            csvwriter.writerow(row)

    filename_output = os.path.join(global_config["home"], "data/work/iswc2013/raw/stat_person.csv")
    with open(filename_output, "w") as f:
        csvwriter = UnicodeWriter(f)

        headers = ["name", "paid", "attend", "paper"]
        csvwriter.writerow(headers)

        for entry in sorted(json_person.values(), key=lambda x: x["name"]):
            row = UtilString.json2list(entry, headers)
            csvwriter.writerow(row)
コード例 #4
0
def easychair_paper_author(
	filename_input_paper, filename_input_author,
	filename_output_paper, filename_output_author):
	"""Scrape two EasyChair HTML exports into paper and author CSVs.

	filename_input_author: HTML page with the author table
	    (table class "ct_table", one row per author).
	filename_input_paper: HTML page with one div.paper per paper
	    (span.authors, span.title) followed by div.abstract blocks.
	filename_output_paper: CSV written with columns
	    authors / title / abstract.
	filename_output_author: CSV written with columns
	    Author / Affiliation / Country / Email / Homepage.
	"""
	#process author
	with open(filename_input_author) as f:
		html_doc = f.read()
	soup = BeautifulSoup(html_doc)

	# every known author record, keyed by display name
	map_author_all ={}

	attr_pattern = { "class" : "ct_table"}
	list_author = HtmlUtil.extract_table(soup, attr_pattern)
	for author in list_author["rows"]:
		#print author
		name = author[u"Author"]
		map_author_all[name] = author
		# default; filled in later from links in the paper listing
		author["Homepage"] = ""

	print "{} authors found".format(len(map_author_all))

	#process paper
	with open(filename_input_paper) as f:
		html_doc = f.read()
	soup = BeautifulSoup(html_doc)

	list_paper =[]

	list_author = []
	list_div_paper = soup.find_all('div', attrs= { "class" : "paper"})
	for div_paper in list_div_paper:
		paper ={}
		list_paper.append(paper)

		for cls in ["authors", "title"]:
			div_cls = div_paper.find('span', attrs= { "class" : cls})
			#print div_cls.text

			if cls =="authors":
				# normalize the author string: "A, B and C." -> "A, B, C"
				text = div_cls.text.encode('utf8')
				text = text.replace(" and ", ", ")
				text = re.sub("\.\s*$","", text)
				text = text.strip()
				paper[cls] = text


				# linked author names carry a homepage URL
				for el in HtmlUtil.extract_links(div_cls):
					name = el["text"]
					list_author.append( name)
					if name not in map_author_all:
						print "ERROR: name [{}] not in author, with homepage".format(name)
						map_author_all[name]={"Author": name}

					# repair a recurring malformed-protocol artifact in links
					el["link"] = el["link"].replace("http://http:/","http://")
					map_author_all[name]["Homepage"]=el["link"]

				# also register every comma-separated (unlinked) name
				for x in text.split(","):
					name =  x.strip()
					list_author.append( name )
					if name not in map_author_all:
						print "ERROR: name [{}] not in author, without homepage".format(name)
						map_author_all[name]={"Author": name}

			else:
				paper[cls] = div_cls.text.encode('utf8')

	# dedupe and sort collected author names for the output CSV
	list_author = sorted(set(list_author))

	# abstracts are matched to papers by document order
	# NOTE(review): assumes exactly one div.abstract per div.paper -- confirm
	list_div_abstract = soup.find_all('div', attrs= { "class" : "abstract"})
	for index, div_abstract in enumerate(list_div_abstract):
		abstract = div_abstract.text.encode('utf8').replace("Abstract:", "")
		abstract = abstract.strip()
		list_paper[index]["abstract"] =abstract

	print "{} papers write".format(len(list_paper))
	with open(filename_output_paper, "w") as f:
		csvwriter = UnicodeWriter(f)
		headers = ["authors", "title","abstract"]
		csvwriter.writerow(headers)

		for paper in list_paper:
			row = UtilString.json2list(paper, headers)
			csvwriter.writerow(row)

	print "{} authors write".format(len(list_author))
	with open(filename_output_author, "w") as f:
		csvwriter = UnicodeWriter(f)
		headers = ["Author", "Affiliation","Country","Email","Homepage"]
		csvwriter.writerow(headers)

		for name in list_author:
			author = map_author_all[name]
			row = UtilString.json2list(author, headers)
			csvwriter.writerow(row)