def getPapersAbstract(): from collections import defaultdict from dcclient import DataCenterClient f = open("E:\\ids.txt") f.next() ids = [] import codecs f_out = codecs.open("E:\\abstracts_1.txt","w",encoding="utf-8") from bs4 import UnicodeDammit for line in f: x = line.split("\n") ids.append(int(x[0])) print len(ids) c = DataCenterClient("tcp://10.1.1.211:32011") for i in range(len(ids)/1000): print "DUMP %s"%(i*1000) x = c.getPublicationsById(ids[i*1000:(i+1)*1000]) id_set = set(ids) count = 0 abs = {} conf = {} authors = {} title = {} year = {} for p in x.publications: abs[p.id] = p.abs.replace("\n"," ").replace("\t"," ") conf[p.id] = p.jconf_name authors[p.id] = ",".join([str(a) for a in p.author_ids]) title[p.id] = p.title year[p.id] = p.year for p in abs: if len(abs[p]) > 2: f_out.write("%s\n%s\n%s\n%s\n%s\n%s\n"%(p,year[p],conf[p],authors[p],title[p],UnicodeDammit(abs[p]).markup))
def getPapersCitation(): from collections import defaultdict from dcclient import DataCenterClient f = open("E:\\vis.txt") f.next() ids = [] for line in f: x = line.split("\t") ids.append(int(x[0])) c = DataCenterClient("tcp://10.1.1.211:32011") x = c.getPublicationsById(ids) id_set = set(ids) count = 0 citation = defaultdict(set) for p in x.publications: for y in p.cite_pubs: if y in id_set: print count count += 1 citation[p.id].add(y) for y in p.cited_by_pubs: if y in id_set: print count count += 1 citation[y].add(p.id) f_out = open("E:\\citation.txt", "w") for p in citation: for q in citation[p]: f_out.write("%s\t%s\n" % (p, q))
def getPapersCitation(): from collections import defaultdict from dcclient import DataCenterClient f = open("E:\\vis.txt") f.next() ids = [] for line in f: x = line.split("\t") ids.append(int(x[0])) c = DataCenterClient("tcp://10.1.1.211:32011") x = c.getPublicationsById(ids) id_set = set(ids) count = 0 citation = defaultdict(set) for p in x.publications: for y in p.cite_pubs: if y in id_set: print count count += 1 citation[p.id].add(y) for y in p.cited_by_pubs: if y in id_set: print count count += 1 citation[y].add(p.id) f_out = open("E:\\citation.txt","w") for p in citation: for q in citation[p]: f_out.write("%s\t%s\n"%(p,q))
def getPapersAbstract(): from collections import defaultdict from dcclient import DataCenterClient f = open("E:\\ids.txt") f.next() ids = [] import codecs f_out = codecs.open("E:\\abstracts_1.txt", "w", encoding="utf-8") from bs4 import UnicodeDammit for line in f: x = line.split("\n") ids.append(int(x[0])) print len(ids) c = DataCenterClient("tcp://10.1.1.211:32011") for i in range(len(ids) / 1000): print "DUMP %s" % (i * 1000) x = c.getPublicationsById(ids[i * 1000:(i + 1) * 1000]) id_set = set(ids) count = 0 abs = {} conf = {} authors = {} title = {} year = {} for p in x.publications: abs[p.id] = p.abs.replace("\n", " ").replace("\t", " ") conf[p.id] = p.jconf_name authors[p.id] = ",".join([str(a) for a in p.author_ids]) title[p.id] = p.title year[p.id] = p.year for p in abs: if len(abs[p]) > 2: f_out.write("%s\n%s\n%s\n%s\n%s\n%s\n" % (p, year[p], conf[p], authors[p], title[p], UnicodeDammit(abs[p]).markup))
def getCitationNetwork():
    """Crawl the "deep learning" citation neighborhood and dump it as JSON.

    Searches the data-center service, extracts every post-1970 hit plus the
    first ring of cited / citing publications, and writes the collected
    items to pubs_dump.json. Per-term frequency counters (overall and per
    year) are maintained alongside.

    Note: the unused `data_fields` list and the unused sorted term ranking
    from the original were removed; neither affected the output.
    """
    from collections import defaultdict
    from dcclient import DataCenterClient  # consistency: siblings import locally
    import json

    client = DataCenterClient("tcp://10.1.1.211:32011")
    result = client.searchPublications("deep learning")

    items = []
    cite_pubs = []
    key_terms = defaultdict(int)
    year_terms = defaultdict(lambda: defaultdict(int))

    def _tally(pub, terms):
        # Count each key term globally and bucketed by publication year.
        for t in terms:
            key_terms[t.lower()] += 1
            year_terms[pub.year][t.lower()] += 1

    for p in result.publications:
        if p.year <= 1970:
            continue  # drop records with missing/bogus years
        item, children, parents, kt = extractPublication(p)
        if children:
            items.append(item)
            cite_pubs.extend(children)
            cite_pubs.extend(parents)
        # NOTE(review): tallied per publication here; the collapsed original
        # is ambiguous about whether this sat inside the `if` — confirm.
        _tally(p, kt)

    cite_pubs = list(set(cite_pubs))  # de-duplicate before the bulk fetch
    neighbors = client.getPublicationsById(cite_pubs)
    for p in neighbors.publications:
        if p.year <= 1970:
            continue
        item, children, parents, kt = extractPublication(p)
        # Fix: the original tested "len(children) > 0 and len(children) > 0"
        # — the same condition twice; a single truthiness check suffices.
        if children:
            items.append(item)
            cite_pubs.extend(children)
        _tally(p, kt)

    with open("pubs_dump.json", "w") as dump:
        dump.write(json.dumps(items))
def getPapersAbstractYearConf(): from collections import defaultdict from dcclient import DataCenterClient import codecs from bs4 import UnicodeDammit import os f = open("E:\\ids.txt") f.next() ids = [] for line in f: x = line.split("\n") ids.append(int(x[0])) c = DataCenterClient("tcp://10.1.1.211:32011") def createFile(year, conf): if not os.path.exists(str(year)): os.makedirs(str(year)) return codecs.open(os.path.join(str(year), conf), "w", encoding="utf-8") #files = defaultdict(dict) files = {} for i in range(len(ids) / 10000): print "DUMP %s" % (i * 10000) x = c.getPublicationsById(ids[i * 10000:(i + 1) * 10000]) id_set = set(ids) count = 0 abs = {} conf = {} title = {} year = {} for p in x.publications: abs[p.id] = p.abs.replace("\n", " ").replace("\t", " ") conf[p.id] = p.jconf_name #.replace("/"," ").replace("*"," ") title[p.id] = p.title year[p.id] = p.year for p in abs: if len(abs[p]) > 2 and len(conf[p]) > 1: #if not files[year[p]].has_key(conf[p]): if not files.has_key(year[p]): files[year[p]] = codecs.open(str(year[p]), "w", encoding="utf-8") #files[year[p]][conf[p]] = createFile(year[p], conf[p]) file = files[year[p]] file.write( "%s\n%s\n%s\n%s\n" % (p, conf[p], title[p], UnicodeDammit(abs[p]).markup))
def getCitationNetwork():
    """Collect the "deep learning" result set plus its citation ring.

    Post-1970 hits and their cited/citing neighbors are extracted via
    extractPublication and dumped to pubs_dump.json; key-term frequencies
    are tallied overall and per year along the way.
    """
    import time
    import datetime
    from collections import defaultdict

    client = DataCenterClient("tcp://10.1.1.211:32011")
    hits = client.searchPublications("deep learning")

    # NOTE(review): this field list is never read below — looks like
    # copy-paste residue from another script.
    data_fields = ["id", "mid", "uid", "parent", "type", "t",
                   "user_created_at", "followers_count", "statuses_count",
                   "friends_count", "username", "text", "words",
                   "verified", "emotion"]

    items = []
    cite_pubs = []
    key_terms = defaultdict(int)
    year_terms = defaultdict(lambda: defaultdict(int))

    for pub in hits.publications:
        if pub.year <= 1970:
            continue  # skip entries with unusable years
        entry, kids, folks, terms = extractPublication(pub)
        if kids:
            items.append(entry)
            cite_pubs.extend(kids)
            cite_pubs.extend(folks)
        # NOTE(review): tally placement reconstructed from a collapsed
        # original — confirm it was not nested under the `if`.
        for term in terms:
            key_terms[term.lower()] += 1
            year_terms[pub.year][term.lower()] += 1

    cite_pubs = list(set(cite_pubs))
    ring = client.getPublicationsById(cite_pubs)
    for pub in ring.publications:
        if pub.year <= 1970:
            continue
        entry, kids, folks, terms = extractPublication(pub)
        # Equivalent to the original's doubled "len(kids) > 0 and
        # len(kids) > 0" test.
        if kids:
            items.append(entry)
            cite_pubs.extend(kids)
        for term in terms:
            key_terms[term.lower()] += 1
            year_terms[pub.year][term.lower()] += 1

    sorted_key_terms = sorted(key_terms.items(),
                              key=lambda kv: kv[1], reverse=True)

    import json
    dump = open("pubs_dump.json", "w")
    dump.write(json.dumps(items))
    dump.close()
def getPapersAbstractYearConf(): from collections import defaultdict from dcclient import DataCenterClient import codecs from bs4 import UnicodeDammit import os f = open("E:\\ids.txt") f.next() ids = [] for line in f: x = line.split("\n") ids.append(int(x[0])) c = DataCenterClient("tcp://10.1.1.211:32011") def createFile(year, conf): if not os.path.exists(str(year)): os.makedirs(str(year)) return codecs.open(os.path.join(str(year), conf), "w", encoding="utf-8") #files = defaultdict(dict) files = {} for i in range(len(ids) / 10000): print "DUMP %s" % (i * 10000) x = c.getPublicationsById(ids[i * 10000:(i + 1) * 10000]) id_set = set(ids) count = 0 abs = {} conf = {} title = {} year = {} for p in x.publications: abs[p.id] = p.abs.replace("\n", " ").replace("\t", " ") conf[p.id] = p.jconf_name#.replace("/"," ").replace("*"," ") title[p.id] = p.title year[p.id] = p.year for p in abs: if len(abs[p]) > 2 and len(conf[p]) > 1: #if not files[year[p]].has_key(conf[p]): if not files.has_key(year[p]): files[year[p]] = codecs.open(str(year[p]), "w", encoding="utf-8") #files[year[p]][conf[p]] = createFile(year[p], conf[p]) file = files[year[p]] file.write("%s\n%s\n%s\n%s\n" % (p, conf[p], title[p], UnicodeDammit(abs[p]).markup))