def getObsCoreFile(fname):
    """Read a psv-format obscore file and index its rows by obs_id.

    Takes psv format file, returns a dict with keys the filenames to
    use, the values an array of tuples for that filename; each tuple is
    (vals, anbool), where anbool says whether the info is taken from the
    corresponding vals.  vals is the conversion of the psv row into a
    dict.

    NOTE(review): if several rows share an obs_id, later rows replace
    earlier ones (assignment, not append) -- confirm this is intended.
    """
    (rdr, fh) = open_obscore(fname)
    h_at = {}
    for row in rdr:
        vals = row2dict(row)
        # anbool is fixed at 1 here: every row's info is considered usable
        # in this simple variant (no imcscor filtering).
        h_at[vals['obs_id']] = [(vals, 1)]
    fh.close()
    return h_at
def writeBibliographyFile(fname, ohead, bibcodes, format="n3"):
    """Write out bibliographic records using the obscore table in fname.

    bibcodes is a dictionary with key: obsid, value: list of bibcodes.
    The output is written to ohead.<format>.
    """
    (reader, handle) = open_obscore(fname)
    graph = makeGraph()
    total_bibcodes = 0
    for record in reader:
        observation_id = get_column(record, "obs_id")
        url = get_column(record, "access_url")
        # Observation date with internal whitespace collapsed to "_".
        date_tag = "_".join(get_column(record, "date_obs").split())
        for (prefix, codes) in bibcodes.iteritems():
            # The HUT bibcodes appear to use obsid values which are
            # prefixes of the obscore ones.
            if observation_id.startswith(prefix):
                # URIs that represent the data and observation objects
                # from the obscore table.
                hash_part = base64.urlsafe_b64encode(url[::-1])
                label = observation_id + "-" + date_tag
                daturi = mkURI(
                    "/obsv/data/MAST/obsid/{0}/".format(label), hash_part)
                obsuri = mkURI(
                    "/obsv/observation/MAST/obsid/{0}/".format(label))
                # Link every bibcode to the data/observation URIs.
                for code in codes:
                    biburi = URIRef(ads_baseurl + "/bib#" + cleanFragment(code))
                    gadd(graph, biburi, adsbase.aboutScienceProduct, daturi)
                    gadd(graph, biburi, adsbase.aboutScienceProcess, obsuri)
                total_bibcodes += len(codes)
    print("# bibcodes = {0}".format(total_bibcodes))
    handle.close()
    writeGraph(graph, "{0}.{1}".format(ohead, format), format=format)
def writeBibliographyFile(fname, ohead, bibcodes, format="n3"):
    """Write out bibliographic records using the obscore table in fname.

    bibcodes is a dictionary with key: obsid, value: list of bibcodes.
    The output is written to ohead.<format>

    NOTE(review): this file defines writeBibliographyFile twice; this
    later definition shadows the earlier one at import time.
    """
    (rdr, fh) = open_obscore(fname)
    graph = makeGraph()
    nbib = 0  # running count of bibcodes linked, reported at the end
    for row in rdr:
        obs_id = get_column(row, "obs_id")
        access_url = get_column(row, "access_url")
        # Observation date with internal whitespace collapsed to "_".
        thedate="_".join(get_column(row, "date_obs").split())
        for (k,bs) in bibcodes.iteritems():
            # The HUT bibcodes appear to use obsid values which are
            # prefixes of the obscore ones.
            # >> I'm not sure if this is relevant to FUSE, but am keeping
            # the stuff here just in case because it's also in WUPPE.
            #
            if not obs_id.startswith(k): continue
            # Create the URIs that represent the data and observation
            # objects from the obscore table.  The hash is the reversed
            # access URL, base64-encoded (URL-safe alphabet).
            uri_hash = base64.urlsafe_b64encode(access_url[::-1])
            #daturi = mkURI("/obsv/MAST/obsid/{0}/data/".format(obs_id), uri_hash)
            #obsuri = mkURI("/obsv/MAST/obsid/{0}/observation/".format(obs_id), uri_hash)
            daturi = mkURI("/obsv/data/MAST/obsid/{0}/".format(obs_id+"-"+thedate), uri_hash)
            #obsuri = mkURI("/obsv/observation/MAST/obsid/{0}/".format(obs_id), uri_hash)
            obsuri = mkURI("/obsv/observation/MAST/obsid/{0}/".format(obs_id+"-"+thedate))
            # Loop through each bibcode, linking them to the
            # data/observation URIs.
            for b in bs:
                biburi = URIRef(ads_baseurl + "/bib#" + cleanFragment(b))
                gadd(graph, biburi, adsbase.aboutScienceProduct, daturi)
                gadd(graph, biburi, adsbase.aboutScienceProcess, obsuri)
            nbib += len(bs)
    print("# bibcodes = {0}".format(nbib))
    fh.close()
    writeGraph(graph, "{0}.{1}".format(ohead, format), format=format)
def getObsCoreFile(fname, ohead, nsplit=10000, format="n3"): """Convert the given obscore file from MAST (in psv format) into RDF. Rows that can not be converted are ignored (an error message is displayed on STDERR in this case). Since the input file is large we now split apart the output every nsplit rows. The output is written to ohead.<i>.<format> where i is a counter, starting at 1 """ (rdr, fh) = open_obscore(fname) rnum = 0 rpass = 0 idx = 1 graph = makeGraph() for row in rdr: rnum += 1 try: gr = addObsCoreRow(row) graph += gr rpass += 1 except Exception, e: sys.stderr.write("ERROR: row# {0}\n{1}\n".format(rnum, str(e))) if rnum % 500 == 0: print ("Processed row: {0}".format(rnum)) if rnum % nsplit == 0: # TODO: do we want to catch IO errors here? writeGraph(graph, "{0}.{1}.{2}".format(ohead, idx, format), format=format) idx += 1 graph = makeGraph()
def getObsCoreFile(odhfname, fname, ohead, nsplit=10000, format="n3"):
    """Convert the given obscore file from MAST (in psv format) into RDF.

    Rows that can not be converted are ignored (an error message is
    displayed on STDERR in this case).

    Since the input file is large we now split apart the output every
    nsplit rows. The output is written to ohead.<i>.<format> where i is
    a counter, starting at 1
    """
    # Filled in by addObsCoreObs; dumped via str() to odhfname at the end.
    obsdatahash={}
    (rdr, fh) = open_obscore(fname)
    # NOTE(review): rnum/rpass/idx/nsplit are unused in this variant;
    # they look like leftovers from the row-splitting version.
    rnum = 0
    rpass = 0
    idx = 1
    graph = makeGraph()
    globalrowdict={}  # obs_id -> list of (vals, at_time, access_name, d2key, anbool)
    h_at={}           # "<obs_id>=<time>" -> list of row tuples
    # ---- Pass 1: bucket every row by obs_id -----------------------------
    for row in rdr:
        vals=row2dict(row)
        obs_id = vals['obs_id']
        # Observation date with internal whitespace collapsed to "_".
        at_time="_".join(vals['date_obs'].split())
        if obs_id == '':
            raise ValueError("No obs_id value in this row!")
        access_url = vals['access_url']
        # Product base name: file name portion up to any '_ph_' marker.
        access_name=access_url.split('/')[-1].split('_ph_')[0]
        if access_name.find('_sum') !=-1:
            access_name=access_name.split('_sum')[0]
        # anbool==1 means this row's metadata is usable; imcscor products
        # are stripped of the suffix and flagged 0.
        anbool=1
        if access_name.find('_imcscor')!=-1:
            access_name=access_name.split('_imcscor')[0]
            anbool=0
        #print "access url", access_url
        if access_url.strip() == '':
            raise ValueError("Empty access_url for row")
        # Day/night/all markers embedded in the URL after the obs_id.
        dayfind=access_url.find(obs_id+"_d")
        nightfind=access_url.find(obs_id+"_n")
        afind=access_url.find(obs_id+"_a")
        if dayfind!=-1:
            d2key=obs_id+"_d"
            #dkey=obs_id#lets not to day separately
        elif afind!=-1:
            d2key=obs_id+"_a"
        elif nightfind!=-1:
            # NOTE(review): night rows get the "_a" key, identical to the
            # "all" branch above -- "_n" may have been intended; confirm.
            d2key=obs_id+"_a"
        else:
            d2key=obs_id
        #dkey=obs_id+"--"+access_name
        dkey=obs_id
        if not globalrowdict.has_key(dkey):
            globalrowdict[dkey]=[]
        globalrowdict[dkey].append((vals, at_time, access_name, d2key, anbool))
    #print "LLLLL"
    # ---- Pass 2: regroup each obs_id bucket by access_name --------------
    for dkey in globalrowdict.keys():
        print "grd", dkey, len(globalrowdict[dkey])
        dalen=len(globalrowdict[dkey])
        h_an={}
        for ele in globalrowdict[dkey]:
            vals, at_time, access_name, d2key, anbool=ele
            print "time",at_time, dkey, access_name, anbool
            if not h_an.has_key(access_name):
                h_an[access_name]=[]
            # Keep the timestamp only for usable rows (or when the bucket
            # holds a single row); the rest get None and inherit a time
            # in the loop below.
            if anbool==1 or dalen==1:
                h_an[access_name].append((ele, at_time))
            else:
                h_an[access_name].append((ele, None))
        #print "han", h_an
        h_an2={}
        for item in h_an.keys():
            #print "hanitem", h_an[item]
            thetimelist=[e[1] for e in h_an[item] if e[1]!=None]
            if len(thetimelist)>=1:
                thetime=thetimelist[0]
            else:
                #This happens like in pupaeast when there is only imscor
                # NOTE(review): thetime is NOT assigned in this branch --
                # the value leaks from a previous iteration, or a
                # NameError is raised on the first one. Confirm intent.
                print "OOOOOOOOOOOOOOOPS", len(thetimelist)
            h_an2[item]=[(e[0],thetime) for e in h_an[item]]
        print "deekee",dkey
        # ---- Pass 3: key the rows by "<obs_id>=<time>" ------------------
        for k in h_an2.keys():
            for item in h_an2[k]:
                #print "<<<",item[0][0],">>>"
                if not h_at.has_key(dkey+"="+item[1]):
                    h_at[dkey+"="+item[1]]=[]
                h_at[dkey+"="+item[1]].append(item[0])
    # ---- Output: one RDF graph file per "<obs_id>=<time>" key -----------
    for oid in h_at.keys():
        print "OID",oid
        #print "<<<",h_at[oid],">>>"
        graph=addObsCoreObs(oid,h_at[oid], obsdatahash)
        writeGraph(graph, "{0}.{1}.{2}".format(ohead, oid, format), format=format)
    fh.close()
    # Persist the accumulated observation-data hash as a Python literal.
    fd=open(odhfname,"w")
    fd.write(str(obsdatahash))
    fd.close()
def getObsCoreFile(fname):
    """ Takes psv format file, returns a dict with keys the filenames
    to use, the values an array of tuples for that filename, each
    tuple, the vals, anbool, where anbool says is info is taken from
    corresponding vals. vals is the conversion of the psv row into a
    dict. """
    (rdr, fh) = open_obscore(fname)
    # NOTE(review): rnum/rpass/idx are unused in this variant; they look
    # like leftovers from the RDF-splitting version of this function.
    rnum = 0
    rpass = 0
    idx = 1
    globalrowdict = {}  # obs_id -> list of (vals, at_time, access_name, d2key, anbool)
    h_at = {}           # "<obs_id>=<time>" -> list of (vals, anbool)
    # ---- Pass 1: bucket every row by obs_id -----------------------------
    for row in rdr:
        vals = row2dict(row)
        obs_id = vals['obs_id']
        # Observation date with internal whitespace collapsed to "_".
        at_time = "_".join(vals['date_obs'].split())
        if obs_id == '':
            raise ValueError("No obs_id value in this row!")
        access_url = vals['access_url']
        # Product base name: file name portion up to any '_ph_' marker.
        access_name = access_url.split('/')[-1].split('_ph_')[0]
        if access_name.find('_sum') != -1:
            access_name = access_name.split('_sum')[0]
        # anbool==1 means this row's metadata is usable; imcscor products
        # are stripped of the suffix and flagged 0.
        anbool = 1
        if access_name.find('_imcscor') != -1:
            access_name = access_name.split('_imcscor')[0]
            anbool = 0
        #print "access url", access_url
        if access_url.strip() == '':
            raise ValueError("Empty access_url for row")
        # Day/night/all markers embedded in the URL after the obs_id.
        dayfind = access_url.find(obs_id + "_d")
        nightfind = access_url.find(obs_id + "_n")
        afind = access_url.find(obs_id + "_a")
        if dayfind != -1:
            d2key = obs_id + "_d"
            #dkey=obs_id#lets not to day separately
        elif afind != -1:
            d2key = obs_id + "_a"
        elif nightfind != -1:
            # NOTE(review): night rows get the "_a" key, identical to the
            # "all" branch above -- "_n" may have been intended; confirm.
            d2key = obs_id + "_a"
        else:
            d2key = obs_id
        #dkey=obs_id+"--"+access_name
        dkey = obs_id
        if not globalrowdict.has_key(dkey):
            globalrowdict[dkey] = []
        globalrowdict[dkey].append((vals, at_time, access_name, d2key, anbool))
    #print "LLLLL"
    # ---- Pass 2: regroup each obs_id bucket by access_name --------------
    for dkey in globalrowdict.keys():
        print "grd", dkey, len(globalrowdict[dkey])
        dalen = len(globalrowdict[dkey])
        h_an = {}
        for ele in globalrowdict[dkey]:
            vals, at_time, access_name, d2key, anbool = ele
            print "time", at_time, dkey, access_name, anbool
            if not h_an.has_key(access_name):
                h_an[access_name] = []
            # Keep the timestamp only for usable rows (or when the bucket
            # holds a single row); the rest get None and inherit a time
            # in the loop below.
            if anbool == 1 or dalen == 1:
                h_an[access_name].append((ele, at_time))
            else:
                h_an[access_name].append((ele, None))
        #print "han", h_an
        h_an2 = {}
        for item in h_an.keys():
            #print "hanitem", h_an[item]
            thetimelist = [e[1] for e in h_an[item] if e[1] != None]
            if len(thetimelist) >= 1:
                thetime = thetimelist[0]
            else:
                #This happens like in pupaeast when there is only imscor
                # NOTE(review): thetime is NOT assigned in this branch --
                # the value leaks from a previous iteration, or a
                # NameError is raised on the first one. Confirm intent.
                print "OOOOOOOOOOOOOOOPS", len(thetimelist)
            h_an2[item] = [(e[0], thetime) for e in h_an[item]]
        print "deekee", dkey
        # ---- Pass 3: key the rows by "<obs_id>=<time>" ------------------
        for k in h_an2.keys():
            for item in h_an2[k]:
                #print "<<<",item[0][0],">>>"
                if not h_at.has_key(dkey + "=" + item[1]):
                    h_at[dkey + "=" + item[1]] = []
                h_at[dkey + "=" + item[1]].append((item[0][0], item[0][4]))
                #add the anbool and vals in here
                #unbool tells you which row contains the information we ought to use
                #in this case not imscor. in default case anbool=1 for everything.
    fh.close()
    return h_at
def getObsCoreFile(odhfname, fname, ohead, nsplit=10000, format="n3"):
    """Convert the given obscore file from MAST (in psv format) into RDF.

    Rows that can not be converted are ignored (an error message is
    displayed on STDERR in this case).

    Since the input file is large we now split apart the output every
    nsplit rows. The output is written to ohead.<i>.<format> where i is
    a counter, starting at 1

    NOTE(review): this file defines getObsCoreFile several times; each
    later definition shadows the earlier ones at import time.
    """
    # Filled in by addObsCoreObs; dumped via str() to odhfname at the end.
    obsdatahash = {}
    (rdr, fh) = open_obscore(fname)
    # NOTE(review): rnum/rpass/idx/nsplit are unused in this variant;
    # they look like leftovers from the row-splitting version.
    rnum = 0
    rpass = 0
    idx = 1
    graph = makeGraph()
    globalrowdict = {}  # obs_id -> list of (vals, at_time, access_name, d2key, anbool)
    h_at = {}           # "<obs_id>=<time>" -> list of row tuples
    # ---- Pass 1: bucket every row by obs_id -----------------------------
    for row in rdr:
        vals = row2dict(row)
        obs_id = vals['obs_id']
        # Observation date with internal whitespace collapsed to "_".
        at_time = "_".join(vals['date_obs'].split())
        if obs_id == '':
            raise ValueError("No obs_id value in this row!")
        access_url = vals['access_url']
        # Product base name: file name portion up to any '_ph_' marker.
        access_name = access_url.split('/')[-1].split('_ph_')[0]
        if access_name.find('_sum') != -1:
            access_name = access_name.split('_sum')[0]
        # anbool==1 means this row's metadata is usable; imcscor products
        # are stripped of the suffix and flagged 0.
        anbool = 1
        if access_name.find('_imcscor') != -1:
            access_name = access_name.split('_imcscor')[0]
            anbool = 0
        #print "access url", access_url
        if access_url.strip() == '':
            raise ValueError("Empty access_url for row")
        # Day/night/all markers embedded in the URL after the obs_id.
        dayfind = access_url.find(obs_id + "_d")
        nightfind = access_url.find(obs_id + "_n")
        afind = access_url.find(obs_id + "_a")
        if dayfind != -1:
            d2key = obs_id + "_d"
            #dkey=obs_id#lets not to day separately
        elif afind != -1:
            d2key = obs_id + "_a"
        elif nightfind != -1:
            # NOTE(review): night rows get the "_a" key, identical to the
            # "all" branch above -- "_n" may have been intended; confirm.
            d2key = obs_id + "_a"
        else:
            d2key = obs_id
        #dkey=obs_id+"--"+access_name
        dkey = obs_id
        if not globalrowdict.has_key(dkey):
            globalrowdict[dkey] = []
        globalrowdict[dkey].append((vals, at_time, access_name, d2key, anbool))
    #print "LLLLL"
    # ---- Pass 2: regroup each obs_id bucket by access_name --------------
    for dkey in globalrowdict.keys():
        print "grd", dkey, len(globalrowdict[dkey])
        dalen = len(globalrowdict[dkey])
        h_an = {}
        for ele in globalrowdict[dkey]:
            vals, at_time, access_name, d2key, anbool = ele
            print "time", at_time, dkey, access_name, anbool
            if not h_an.has_key(access_name):
                h_an[access_name] = []
            # Keep the timestamp only for usable rows (or when the bucket
            # holds a single row); the rest get None and inherit a time
            # in the loop below.
            if anbool == 1 or dalen == 1:
                h_an[access_name].append((ele, at_time))
            else:
                h_an[access_name].append((ele, None))
        #print "han", h_an
        h_an2 = {}
        for item in h_an.keys():
            #print "hanitem", h_an[item]
            thetimelist = [e[1] for e in h_an[item] if e[1] != None]
            if len(thetimelist) >= 1:
                thetime = thetimelist[0]
            else:
                #This happens like in pupaeast when there is only imscor
                # NOTE(review): thetime is NOT assigned in this branch --
                # the value leaks from a previous iteration, or a
                # NameError is raised on the first one. Confirm intent.
                print "OOOOOOOOOOOOOOOPS", len(thetimelist)
            h_an2[item] = [(e[0], thetime) for e in h_an[item]]
        print "deekee", dkey
        # ---- Pass 3: key the rows by "<obs_id>=<time>" ------------------
        for k in h_an2.keys():
            for item in h_an2[k]:
                #print "<<<",item[0][0],">>>"
                if not h_at.has_key(dkey + "=" + item[1]):
                    h_at[dkey + "=" + item[1]] = []
                h_at[dkey + "=" + item[1]].append(item[0])
    # ---- Output: one RDF graph file per "<obs_id>=<time>" key -----------
    for oid in h_at.keys():
        print "OID", oid
        #print "<<<",h_at[oid],">>>"
        graph = addObsCoreObs(oid, h_at[oid], obsdatahash)
        writeGraph(graph, "{0}.{1}.{2}".format(ohead, oid, format), format=format)
    fh.close()
    # Persist the accumulated observation-data hash as a Python literal.
    fd = open(odhfname, "w")
    fd.write(str(obsdatahash))
    fd.close()
def getObsCoreFile(fname):
    """ Takes psv format file, returns a dict with keys the filenames
    to use, the values an array of tuples for that filename, each
    tuple, the vals, anbool, where anbool says is info is taken from
    corresponding vals. vals is the conversion of the psv row into a
    dict.

    NOTE(review): this file defines getObsCoreFile several times; this
    final definition is the one visible after import. """
    (rdr, fh) = open_obscore(fname)
    # NOTE(review): rnum/rpass/idx are unused in this variant; they look
    # like leftovers from the RDF-splitting version of this function.
    rnum = 0
    rpass = 0
    idx = 1
    globalrowdict={}  # obs_id -> list of (vals, at_time, access_name, d2key, anbool)
    h_at={}           # "<obs_id>=<time>" -> list of (vals, anbool)
    # ---- Pass 1: bucket every row by obs_id -----------------------------
    for row in rdr:
        vals=row2dict(row)
        obs_id = vals['obs_id']
        # Observation date with internal whitespace collapsed to "_".
        at_time="_".join(vals['date_obs'].split())
        if obs_id == '':
            raise ValueError("No obs_id value in this row!")
        access_url = vals['access_url']
        # Product base name: file name portion up to any '_ph_' marker.
        access_name=access_url.split('/')[-1].split('_ph_')[0]
        if access_name.find('_sum') !=-1:
            access_name=access_name.split('_sum')[0]
        # anbool==1 means this row's metadata is usable; imcscor products
        # are stripped of the suffix and flagged 0.
        anbool=1
        if access_name.find('_imcscor')!=-1:
            access_name=access_name.split('_imcscor')[0]
            anbool=0
        #print "access url", access_url
        if access_url.strip() == '':
            raise ValueError("Empty access_url for row")
        # Day/night/all markers embedded in the URL after the obs_id.
        dayfind=access_url.find(obs_id+"_d")
        nightfind=access_url.find(obs_id+"_n")
        afind=access_url.find(obs_id+"_a")
        if dayfind!=-1:
            d2key=obs_id+"_d"
            #dkey=obs_id#lets not to day separately
        elif afind!=-1:
            d2key=obs_id+"_a"
        elif nightfind!=-1:
            # NOTE(review): night rows get the "_a" key, identical to the
            # "all" branch above -- "_n" may have been intended; confirm.
            d2key=obs_id+"_a"
        else:
            d2key=obs_id
        #dkey=obs_id+"--"+access_name
        dkey=obs_id
        if not globalrowdict.has_key(dkey):
            globalrowdict[dkey]=[]
        globalrowdict[dkey].append((vals, at_time, access_name, d2key, anbool))
    #print "LLLLL"
    # ---- Pass 2: regroup each obs_id bucket by access_name --------------
    for dkey in globalrowdict.keys():
        print "grd", dkey, len(globalrowdict[dkey])
        dalen=len(globalrowdict[dkey])
        h_an={}
        for ele in globalrowdict[dkey]:
            vals, at_time, access_name, d2key, anbool=ele
            print "time",at_time, dkey, access_name, anbool
            if not h_an.has_key(access_name):
                h_an[access_name]=[]
            # Keep the timestamp only for usable rows (or when the bucket
            # holds a single row); the rest get None and inherit a time
            # in the loop below.
            if anbool==1 or dalen==1:
                h_an[access_name].append((ele, at_time))
            else:
                h_an[access_name].append((ele, None))
        #print "han", h_an
        h_an2={}
        for item in h_an.keys():
            #print "hanitem", h_an[item]
            thetimelist=[e[1] for e in h_an[item] if e[1]!=None]
            if len(thetimelist)>=1:
                thetime=thetimelist[0]
            else:
                #This happens like in pupaeast when there is only imscor
                # NOTE(review): thetime is NOT assigned in this branch --
                # the value leaks from a previous iteration, or a
                # NameError is raised on the first one. Confirm intent.
                print "OOOOOOOOOOOOOOOPS", len(thetimelist)
            h_an2[item]=[(e[0],thetime) for e in h_an[item]]
        print "deekee",dkey
        # ---- Pass 3: key the rows by "<obs_id>=<time>" ------------------
        for k in h_an2.keys():
            for item in h_an2[k]:
                #print "<<<",item[0][0],">>>"
                if not h_at.has_key(dkey+"="+item[1]):
                    h_at[dkey+"="+item[1]]=[]
                h_at[dkey+"="+item[1]].append((item[0][0], item[0][4]))
                #add the anbool and vals in here
                #unbool tells you which row contains the information we ought to use
                #in this case not imscor. in default case anbool=1 for everything.
    fh.close()
    return h_at