def getPapersAbstract():
    from dcclient import DataCenterClient
    import codecs
    from bs4 import UnicodeDammit

    f = open("E:\\ids.txt")
    f.next()  # skip the header line
    ids = []
    for line in f:
        x = line.split("\n")
        ids.append(int(x[0]))
    print len(ids)
    f_out = codecs.open("E:\\abstracts_1.txt", "w", encoding="utf-8")
    c = DataCenterClient("tcp://10.1.1.211:32011")
    # Fetch in batches of 1000; ids beyond the last full batch are skipped.
    for i in range(len(ids) / 1000):
        print "DUMP %s" % (i * 1000)
        x = c.getPublicationsById(ids[i * 1000:(i + 1) * 1000])
        abstracts = {}
        conf = {}
        authors = {}
        title = {}
        year = {}
        for p in x.publications:
            abstracts[p.id] = p.abs.replace("\n", " ").replace("\t", " ")
            conf[p.id] = p.jconf_name
            authors[p.id] = ",".join([str(a) for a in p.author_ids])
            title[p.id] = p.title
            year[p.id] = p.year
        # One six-line record per publication with a non-trivial abstract:
        # id, year, venue, author ids, title, abstract.
        for p in abstracts:
            if len(abstracts[p]) > 2:
                f_out.write("%s\n%s\n%s\n%s\n%s\n%s\n" % (
                    p, year[p], conf[p], authors[p], title[p],
                    UnicodeDammit(abstracts[p]).markup))
    f_out.close()
def main():
    import time
    import datetime
    import json
    import pickle
    from dcclient import DataCenterClient

    c = DataCenterClient("tcp://10.1.1.211:32011")
    x = c.searchPublications("data mining")
    # Field names for the items built below; the last two ("verified",
    # "emotion") are not populated by the 13-element records.
    data_fields = ["id", "mid", "uid", "parent", "type", "t",
                   "user_created_at", "followers_count", "statuses_count",
                   "friends_count", "username", "text", "words",
                   "verified", "emotion"]
    items = []
    for p in x.publications:
        au = "0"
        if len(p.author_ids) > 0:
            au = p.author_ids[0]
        # Timestamp for Jan 1 of the publication year.
        dt = datetime.datetime(p.year, 1, 1, 1, 1)
        t = int(time.mktime(dt.timetuple()))
        children = [str(cid) for cid in p.cited_by_pubs]
        y = [str(p.id), str(p.id), str(au), children, 0, t, t,
             p.n_citations, p.n_citations, p.n_citations,
             p.authors, p.title, "hello,world"]  # placeholder "words" field
        items.append(y)
    dump = open("pubs_dump.json", "w")
    dump.write(json.dumps(items))
    dump.close()
    terms = pickle.load(open("..\\static\\pickle\\terms_dump_all.pickle"))
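# A minimal read-back sketch, not part of the original module: load
# pubs_dump.json (written by main above) into dicts keyed by data_fields.
# The name load_pubs_dump is hypothetical.
def load_pubs_dump(path="pubs_dump.json"):
    import json
    data_fields = ["id", "mid", "uid", "parent", "type", "t",
                   "user_created_at", "followers_count", "statuses_count",
                   "friends_count", "username", "text", "words",
                   "verified", "emotion"]
    rows = json.load(open(path))
    # Each stored row has 13 values, so the trailing "verified"/"emotion"
    # fields simply stay absent from the resulting dicts.
    return [dict(zip(data_fields, row)) for row in rows]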
def getPapersCitation():
    from collections import defaultdict
    from dcclient import DataCenterClient

    f = open("E:\\vis.txt")
    f.next()  # skip the header line
    ids = []
    for line in f:
        x = line.split("\t")
        ids.append(int(x[0]))
    c = DataCenterClient("tcp://10.1.1.211:32011")
    x = c.getPublicationsById(ids)
    id_set = set(ids)
    count = 0
    # citation[p] holds the set of publications cited by p, restricted to
    # the ids loaded above.
    citation = defaultdict(set)
    for p in x.publications:
        for y in p.cite_pubs:
            if y in id_set:
                print count
                count += 1
                citation[p.id].add(y)
        for y in p.cited_by_pubs:
            if y in id_set:
                print count
                count += 1
                citation[y].add(p.id)
    f_out = open("E:\\citation.txt", "w")
    for p in citation:
        for q in citation[p]:
            f_out.write("%s\t%s\n" % (p, q))
    f_out.close()
def getPapersAbstractYearConf1():
    from dcclient import DataCenterClient
    import codecs
    from bs4 import UnicodeDammit

    c = DataCenterClient("tcp://10.1.1.211:32011")
    # One output file per year under pubs\, with four-line records:
    # id, venue, title, abstract.
    files = {}
    confs = [5056, 4906, 4276, 1935, 2651, 3399, 3183, 4650, 2039, 1938,
             710, 3489]
    for y in confs:
        x = c.getPublicationsByConfId([y])
        abstracts = {}
        conf = {}
        title = {}
        year = {}
        for p in x.publications:
            abstracts[p.id] = p.abs.replace("\n", " ").replace("\t", " ")
            conf[p.id] = p.jconf_name
            title[p.id] = p.title
            year[p.id] = p.year
        for p in abstracts:
            if len(abstracts[p]) > 2 and len(conf[p]) > 1:
                if year[p] not in files:
                    files[year[p]] = codecs.open("pubs\\" + str(year[p]),
                                                 "w", encoding="utf-8")
                out = files[year[p]]
                out.write("%s\n%s\n%s\n%s\n" % (
                    p, conf[p], title[p], UnicodeDammit(abstracts[p]).markup))
    for f in files:
        files[f].close()
def getTopicTrend():
    import json
    from collections import defaultdict
    from dcclient import DataCenterClient

    c = DataCenterClient("tcp://10.1.1.211:32011")
    x = c.searchAuthors("deep learning")
    pubs = []
    key_terms = defaultdict(int)                        # term -> total count
    year_terms = defaultdict(lambda: defaultdict(int))  # year -> term -> count
    for a in x.authors:
        result = c.getPublicationsByAuthorId([a.naid])
        for p in result.publications:
            if p.year > 1970:
                item, children, kt = extractPublication(p)
                pubs.append(item)
                for k in kt:
                    key_terms[k.lower()] += 1
                    year_terms[p.year][k.lower()] += 1
    sorted_key_terms = sorted(key_terms.items(), key=lambda x: x[1],
                              reverse=True)
    # Per-year term weight: the fraction of a term's total occurrences that
    # fall in that year (a tf-idf-like score).
    year_tfidf = defaultdict(list)
    for y in year_terms:
        for k in year_terms[y]:
            if key_terms[k] > 1:
                year_tfidf[y].append({
                    "text": k,
                    "size": float(year_terms[y][k]) / key_terms[k]
                })
    dump = open("pubs_dump.json", "w")
    dump.write(json.dumps(pubs))
    dump.close()
    dump = open("year_tfidf.json", "w")
    dump.write(json.dumps(year_tfidf))
    dump.close()
def getCitationNetwork():
    import json
    from collections import defaultdict
    from dcclient import DataCenterClient

    c = DataCenterClient("tcp://10.1.1.211:32011")
    x = c.searchPublications("deep learning")
    items = []
    cite_pubs = []
    key_terms = defaultdict(int)
    year_terms = defaultdict(lambda: defaultdict(int))
    # First pass: the search results themselves.
    for p in x.publications:
        if p.year <= 1970:
            continue
        item, children, parents, kt = extractPublication(p)
        if len(children) > 0:
            items.append(item)
            cite_pubs.extend(children)
            cite_pubs.extend(parents)
            for k in kt:
                key_terms[k.lower()] += 1
                year_terms[p.year][k.lower()] += 1
    # Second pass: the publications citing or cited by the first set.
    cite_pubs = list(set(cite_pubs))
    x = c.getPublicationsById(cite_pubs)
    for p in x.publications:
        if p.year <= 1970:
            continue
        item, children, parents, kt = extractPublication(p)
        if len(children) > 0:
            items.append(item)
            cite_pubs.extend(children)
            for k in kt:
                key_terms[k.lower()] += 1
                year_terms[p.year][k.lower()] += 1
    sorted_key_terms = sorted(key_terms.items(), key=lambda x: x[1],
                              reverse=True)
    dump = open("pubs_dump.json", "w")
    dump.write(json.dumps(items))
    dump.close()
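# extractPublication is called by getTopicTrend and getCitationNetwork but is
# not defined in this module. The sketch below is a hypothetical
# reconstruction, not the original helper: the item layout and the key_terms
# attribute are assumptions, and the four-value return follows the unpacking
# in getCitationNetwork (getTopicTrend unpacks only three values, so it
# likely targets an earlier variant without parents).
def extractPublication(p):
    import time
    import datetime
    # Timestamp for Jan 1 of the publication year, as in main().
    dt = datetime.datetime(p.year, 1, 1, 1, 1)
    t = int(time.mktime(dt.timetuple()))
    item = [str(p.id), p.title, str(p.authors), p.year, t, p.n_citations]
    children = [str(c) for c in p.cited_by_pubs]  # papers that cite p
    parents = [str(c) for c in p.cite_pubs]       # papers that p cites
    return item, children, parents, p.key_terms   # key_terms is assumed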
def getPapersAbstractYearConf():
    from dcclient import DataCenterClient
    import codecs
    from bs4 import UnicodeDammit

    f = open("E:\\ids.txt")
    f.next()  # skip the header line
    ids = []
    for line in f:
        x = line.split("\n")
        ids.append(int(x[0]))
    c = DataCenterClient("tcp://10.1.1.211:32011")
    # One output file per year, with four-line records:
    # id, venue, title, abstract.
    files = {}
    # Fetch in batches of 10000; ids beyond the last full batch are skipped.
    for i in range(len(ids) / 10000):
        print "DUMP %s" % (i * 10000)
        x = c.getPublicationsById(ids[i * 10000:(i + 1) * 10000])
        abstracts = {}
        conf = {}
        title = {}
        year = {}
        for p in x.publications:
            abstracts[p.id] = p.abs.replace("\n", " ").replace("\t", " ")
            conf[p.id] = p.jconf_name
            title[p.id] = p.title
            year[p.id] = p.year
        for p in abstracts:
            if len(abstracts[p]) > 2 and len(conf[p]) > 1:
                if year[p] not in files:
                    files[year[p]] = codecs.open(str(year[p]), "w",
                                                 encoding="utf-8")
                out = files[year[p]]
                out.write("%s\n%s\n%s\n%s\n" % (
                    p, conf[p], title[p], UnicodeDammit(abstracts[p]).markup))
    for f in files:
        files[f].close()
def get_data_from_datacenter():
    from dcclient import DataCenterClient
    import codecs
    from bs4 import UnicodeDammit

    c = DataCenterClient("tcp://10.1.1.211:32011")
    f = open("E:\\ids.txt")
    f.next()  # skip the header line
    ids = []
    for line in f:
        x = line.split("\n")
        ids.append(int(x[0]))
    print len(ids)
    # Output path assumed to mirror getPapersAbstract above.
    f_out = codecs.open("E:\\abstracts_1.txt", "w", encoding="utf-8")
    for i in range(len(ids) / 1000):
        print "DUMP %s" % (i * 1000)
        x = c.getPublicationsById(ids[i * 1000:(i + 1) * 1000])
        abstracts = {}
        conf = {}
        authors = {}
        title = {}
        year = {}
        for p in x.publications:
            abstracts[p.id] = p.abs.replace("\n", " ").replace("\t", " ")
            conf[p.id] = p.jconf_name
            authors[p.id] = ",".join([str(a) for a in p.author_ids])
            title[p.id] = p.title
            year[p.id] = p.year
        for p in abstracts:
            if len(abstracts[p]) > 2:
                f_out.write("%s\n%s\n%s\n%s\n%s\n%s\n" % (
                    p, year[p], conf[p], authors[p], title[p],
                    UnicodeDammit(abstracts[p]).markup))
    f_out.close()
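# A minimal command-line entry point, added as a sketch (the original module
# has none): call whichever routine is needed. Every routine assumes the
# dcclient service is reachable at tcp://10.1.1.211:32011.
if __name__ == "__main__":
    getPapersCitation()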