Example #1
def getObsCoreFile(fname):
    """
    Takes psv format file, returns a dict with keys the filenames to use, the values
    an array of tuples for that filename, each tuple, the vals, anbool, where anbool says is
    info is taken from corresponding vals. vals is the conversion of the psv row into a dict.
    """
    
    (rdr, fh) = open_obscore(fname)

    rnum = 0
    rpass = 0

    idx = 1
    
    globalrowdict = {}
    h_at = {}
    for row in rdr:
        vals = row2dict(row)
        obs_id = vals['obs_id']
        h_at[obs_id] = [(vals, 1)]

    fh.close()
    return h_at
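
The helpers open_obscore and row2dict are not defined in these examples. Judging only from how they are called, a minimal sketch might look like the following; the pipe delimiter and the column list are assumptions, not the project's actual implementation.

import csv

def open_obscore(fname):
    # Open the pipe-separated obscore table and hand back (reader, file handle),
    # which is how the examples above unpack the return value.
    fh = open(fname, "r")
    rdr = csv.reader(fh, delimiter="|")
    return (rdr, fh)

def row2dict(row):
    # Map one psv row onto column names. The column list here is a hypothetical
    # subset; the real code presumably uses the file header or the full ObsCore schema.
    columns = ["obs_id", "date_obs", "access_url"]
    return dict(zip(columns, [v.strip() for v in row]))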
Example #2
def getObsCoreFile(fname):
    """
    Takes psv format file, returns a dict with keys the filenames to use, the values
    an array of tuples for that filename, each tuple, the vals, anbool, where anbool says is
    info is taken from corresponding vals. vals is the conversion of the psv row into a dict.
    """
    
    (rdr, fh) = open_obscore(fname)

    rnum = 0
    rpass = 0

    idx = 1
    
    globalrowdict = {}
    h_at = {}
    for row in rdr:
        vals = row2dict(row)
        obs_id = vals['obs_id']
        h_at[obs_id] = [(vals, 1)]

    fh.close()
    return h_at
Example #3
def writeBibliographyFile(fname, ohead, bibcodes, format="n3"):
    """Write out bibliographic records using the obscore table in fname.

    bibcodes is a dictionary with key: obsid, value: list of bibcodes.

    The output is written to
    
        ohead.<format>

    """

    (rdr, fh) = open_obscore(fname)

    graph = makeGraph()

    nbib = 0
    for row in rdr:

        obs_id = get_column(row, "obs_id")
        access_url = get_column(row, "access_url")
        thedate = "_".join(get_column(row, "date_obs").split())
        for (k, bs) in bibcodes.iteritems():
            # The HUT bibcodes appear to use obsid values which are
            # prefixes of the obscore ones.
            #
            if not obs_id.startswith(k):
                continue

            # Create the URIs that represent the data and observation objects from
            # the obscore table.
            #
            uri_hash = base64.urlsafe_b64encode(access_url[::-1])
            #daturi = mkURI("/obsv/MAST/obsid/{0}/data/".format(obs_id), uri_hash)
            #obsuri = mkURI("/obsv/MAST/obsid/{0}/observation/".format(obs_id), uri_hash)
            daturi = mkURI(
                "/obsv/data/MAST/obsid/{0}/".format(obs_id + "-" + thedate),
                uri_hash)
            #obsuri = mkURI("/obsv/observation/MAST/obsid/{0}/".format(obs_id), uri_hash)
            obsuri = mkURI(
                "/obsv/observation/MAST/obsid/{0}/".format(obs_id + "-" +
                                                           thedate))
            # Loop through each bibcode, linking them to the data/observation URIs
            #
            for b in bs:
                biburi = URIRef(ads_baseurl + "/bib#" + cleanFragment(b))
                gadd(graph, biburi, adsbase.aboutScienceProduct, daturi)
                gadd(graph, biburi, adsbase.aboutScienceProcess, obsuri)

            nbib += len(bs)
            print("# bibcodes = {0}".format(nbib))

    fh.close()

    writeGraph(graph, "{0}.{1}".format(ohead, format), format=format)
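
The uri_hash above is simply the URL-safe base64 encoding of the reversed access_url, which gives a fragment-safe token that is stable for a given URL. A quick illustration with a made-up URL (Python 2, as in the code above):

import base64

access_url = "http://archive.example.edu/pub/hut/xyz_ph_image.fits"  # made-up URL
uri_hash = base64.urlsafe_b64encode(access_url[::-1])
print(uri_hash)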
Example #4
def writeBibliographyFile(fname, ohead, bibcodes, format="n3"):
    """Write out bibliographic records using the obscore table in fname.

    bibcodes is a dictionary with key: obsid, value: list of bibcodes.

    The output is written to
    
        ohead.<format>

    """

    (rdr, fh) = open_obscore(fname)

    graph = makeGraph()

    nbib = 0
    for row in rdr:

        obs_id = get_column(row, "obs_id")
        access_url = get_column(row, "access_url")
        thedate="_".join(get_column(row, "date_obs").split())
        for (k,bs) in bibcodes.iteritems():
            # The HUT bibcodes appear to use obsid values which are
            # prefixes of the obscore ones.
            # >> I'm not sure if this is relevant to FUSE, but am keeping
            # the stuff here just in case because it's also in WUPPE.
            #
            if not obs_id.startswith(k):
                continue

            # Create the URIs that represent the data and observation objects from
            # the obscore table.
            #
            uri_hash = base64.urlsafe_b64encode(access_url[::-1])
            #daturi = mkURI("/obsv/MAST/obsid/{0}/data/".format(obs_id), uri_hash)
            #obsuri = mkURI("/obsv/MAST/obsid/{0}/observation/".format(obs_id), uri_hash)
            daturi = mkURI("/obsv/data/MAST/obsid/{0}/".format(obs_id+"-"+thedate), uri_hash)
            #obsuri = mkURI("/obsv/observation/MAST/obsid/{0}/".format(obs_id), uri_hash)
            obsuri = mkURI("/obsv/observation/MAST/obsid/{0}/".format(obs_id+"-"+thedate))
            # Loop through each bibcode, linking them to the data/observation URIs
            #
            for b in bs:
                biburi = URIRef(ads_baseurl + "/bib#" + cleanFragment(b))
                gadd(graph, biburi, adsbase.aboutScienceProduct, daturi)
                gadd(graph, biburi, adsbase.aboutScienceProcess, obsuri)


            nbib += len(bs)
            print("# bibcodes = {0}".format(nbib))

    fh.close()

    writeGraph(graph, "{0}.{1}".format(ohead, format), format=format)
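
Both variants expect bibcodes to map an obs_id (or, for HUT, an obs_id prefix) to a list of bibcodes; every obscore row whose obs_id starts with that key gets linked to each of those bibcodes. A toy call might look like this (file names, keys, and bibcodes are all placeholders):

bibcodes = {
    "hut0100": ["1993Jrnl..123..456A"],                        # matches obs_id values such as "hut0100-01"
    "hut0203": ["1994Jrnl..234..567B", "1995Jrnl..345..678C"],
}
writeBibliographyFile("hut_obscore.psv", "hut_bib", bibcodes, format="n3")
# -> writes the resulting graph to hut_bib.n3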
Example #5
def getObsCoreFile(fname, ohead, nsplit=10000, format="n3"):
    """Convert the given obscore file from MAST (in psv format) into
    RDF.

    Rows that can not be converted are ignored (an error message is
    displayed on STDERR in this case).

    Since the input file is large we now split apart the output every
    nsplit rows. The output is written to
    
        ohead.<i>.<format>

    where i is a counter, starting at 1
    """

    (rdr, fh) = open_obscore(fname)

    rnum = 0
    rpass = 0

    idx = 1
    graph = makeGraph()

    for row in rdr:
        rnum += 1
        try:
            gr = addObsCoreRow(row)
            graph += gr
            rpass += 1
        
        except Exception as e:
            sys.stderr.write("ERROR: row# {0}\n{1}\n".format(rnum, str(e)))

        if rnum % 500 == 0:
            print("Processed row: {0}".format(rnum))

        if rnum % nsplit == 0:
            # TODO: do we want to catch IO errors here?
            writeGraph(graph,
                       "{0}.{1}.{2}".format(ohead, idx, format),
                       format=format)
            idx += 1
            graph = makeGraph()

    # Write out any rows left over after the last full chunk, then close the input.
    if rnum % nsplit != 0:
        writeGraph(graph,
                   "{0}.{1}.{2}".format(ohead, idx, format),
                   format=format)
    fh.close()
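
With the default nsplit of 10000, a call such as the following (file names made up) writes one RDF file per chunk of 10000 rows, plus a final file for any leftover rows:

getObsCoreFile("mast_obscore.psv", "mast_obscore_rdf", nsplit=10000, format="n3")
# For a 25000-row table this produces:
#   mast_obscore_rdf.1.n3   (rows 1-10000)
#   mast_obscore_rdf.2.n3   (rows 10001-20000)
#   mast_obscore_rdf.3.n3   (rows 20001-25000)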
Example #6
def getObsCoreFile(odhfname, fname, ohead, nsplit=10000, format="n3"):
    """Convert the given obscore file from MAST (in psv format) into
    RDF.

    Rows that can not be converted are ignored (an error message is
    displayed on STDERR in this case).

    Since the input file is large we now split apart the output every
    nsplit rows. The output is written to
    
        ohead.<i>.<format>

    where i is a counter, starting at 1
    """
    obsdatahash = {}
    (rdr, fh) = open_obscore(fname)

    rnum = 0
    rpass = 0

    idx = 1
    graph = makeGraph()
    globalrowdict = {}
    h_at = {}
    for row in rdr:
        vals = row2dict(row)
        obs_id = vals['obs_id']
        at_time = "_".join(vals['date_obs'].split())
        if obs_id == '':
            raise ValueError("No obs_id value in this row!")
        access_url = vals['access_url']
        access_name = access_url.split('/')[-1].split('_ph_')[0]
        if access_name.find('_sum') != -1:
            access_name = access_name.split('_sum')[0]
        anbool = 1
        if access_name.find('_imcscor') != -1:
            access_name = access_name.split('_imcscor')[0]
            anbool = 0
        #print "access url", access_url
        if access_url.strip() == '':
            raise ValueError("Empty access_url for row")
        dayfind = access_url.find(obs_id + "_d")
        nightfind = access_url.find(obs_id + "_n")
        afind = access_url.find(obs_id + "_a")
        if dayfind != -1:
            d2key = obs_id + "_d"
            #dkey=obs_id#lets not to day separately
        elif afind != -1:
            d2key = obs_id + "_a"
        elif nightfind != -1:
            d2key = obs_id + "_a"
        else:
            d2key = obs_id

        #dkey=obs_id+"--"+access_name
        dkey = obs_id
        if not globalrowdict.has_key(dkey):
            globalrowdict[dkey] = []
        globalrowdict[dkey].append((vals, at_time, access_name, d2key, anbool))
    
    #print "LLLLL"    
    for dkey in globalrowdict.keys():
        print "grd", dkey, len(globalrowdict[dkey])
        dalen=len(globalrowdict[dkey])
        h_an={}
        for ele in globalrowdict[dkey]:
            vals, at_time, access_name, d2key, anbool=ele
            print "time",at_time, dkey, access_name, anbool
            if not h_an.has_key(access_name):
                h_an[access_name]=[]
            if anbool==1 or dalen==1:
                h_an[access_name].append((ele, at_time))
            else:
                h_an[access_name].append((ele, None))
        #print "han", h_an
        h_an2={}        
        for item in h_an.keys():
            #Sprint "hanitem", h_an[item]
            thetimelist=[e[1] for e in h_an[item] if e[1]!=None]
            if len(thetimelist)>=1:
                thetime=thetimelist[0]
            else:
                #This happens like in pupaeast when there is only imscor
                print "OOOOOOOOOOOOOOOPS", len(thetimelist)
            h_an2[item]=[(e[0],thetime) for e in h_an[item]]
        print "deekee",dkey
        for k in h_an2.keys():
            for item in h_an2[k]:
                #print "<<<",item[0][0],">>>"
                if not h_at.has_key(dkey+"="+item[1]):
                    h_at[dkey+"="+item[1]]=[]
                h_at[dkey+"="+item[1]].append(item[0])
            
            
                    
            
        
        
    for oid in h_at.keys():
        print "OID", oid
        #print "<<<",h_at[oid],">>>"
        graph = addObsCoreObs(oid, h_at[oid], obsdatahash)
        writeGraph(graph,
                   "{0}.{1}.{2}".format(ohead, oid, format),
                   format=format)

    fh.close()
    fd = open(odhfname, "w")
    fd.write(str(obsdatahash))
    fd.close()
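
The access_name / anbool / d2key bookkeeping in the row loop is easiest to follow on a concrete, made-up access_url:

# Made-up values, traced through the parsing logic above.
obs_id = "wuppe0042"
access_url = "http://archive.example.edu/pub/wuppe0042_d_imcscor_ph_123.fits"

access_name = access_url.split('/')[-1].split('_ph_')[0]   # "wuppe0042_d_imcscor"
anbool = 1
if access_name.find('_imcscor') != -1:
    access_name = access_name.split('_imcscor')[0]          # "wuppe0042_d"
    anbool = 0   # an _imcscor row is not the primary source of information

# obs_id + "_d" occurs in the URL, so this row gets the "day" key...
d2key = obs_id + "_d"                                        # "wuppe0042_d"
# ...but rows are ultimately grouped per obs_id.
dkey = obs_id                                                # "wuppe0042"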
Example #7
def getObsCoreFile(fname):
    """
    Takes psv format file, returns a dict with keys the filenames to use, the values
    an array of tuples for that filename, each tuple, the vals, anbool, where anbool says is
    info is taken from corresponding vals. vals is the conversion of the psv row into a dict.
    """

    (rdr, fh) = open_obscore(fname)

    rnum = 0
    rpass = 0

    idx = 1

    globalrowdict = {}
    h_at = {}
    for row in rdr:
        vals = row2dict(row)
        obs_id = vals['obs_id']
        at_time = "_".join(vals['date_obs'].split())
        if obs_id == '':
            raise ValueError("No obs_id value in this row!")
        access_url = vals['access_url']
        access_name = access_url.split('/')[-1].split('_ph_')[0]
        if access_name.find('_sum') != -1:
            access_name = access_name.split('_sum')[0]
        anbool = 1
        if access_name.find('_imcscor') != -1:
            access_name = access_name.split('_imcscor')[0]
            anbool = 0
        #print "access url", access_url
        if access_url.strip() == '':
            raise ValueError("Empty access_url for row")
        dayfind = access_url.find(obs_id + "_d")
        nightfind = access_url.find(obs_id + "_n")
        afind = access_url.find(obs_id + "_a")
        if dayfind != -1:
            d2key = obs_id + "_d"
            #dkey=obs_id#lets not to day separately
        elif afind != -1:
            d2key = obs_id + "_a"
        elif nightfind != -1:
            d2key = obs_id + "_a"
        else:
            d2key = obs_id

        #dkey=obs_id+"--"+access_name
        dkey = obs_id
        if not globalrowdict.has_key(dkey):
            globalrowdict[dkey] = []
        globalrowdict[dkey].append((vals, at_time, access_name, d2key, anbool))

    #print "LLLLL"
    for dkey in globalrowdict.keys():
        print "grd", dkey, len(globalrowdict[dkey])
        dalen = len(globalrowdict[dkey])
        h_an = {}
        for ele in globalrowdict[dkey]:
            vals, at_time, access_name, d2key, anbool = ele
            print "time", at_time, dkey, access_name, anbool
            if not h_an.has_key(access_name):
                h_an[access_name] = []
            if anbool == 1 or dalen == 1:
                h_an[access_name].append((ele, at_time))
            else:
                h_an[access_name].append((ele, None))
        #print "han", h_an
        h_an2 = {}
        for item in h_an.keys():
            #Sprint "hanitem", h_an[item]
            thetimelist = [e[1] for e in h_an[item] if e[1] != None]
            if len(thetimelist) >= 1:
                thetime = thetimelist[0]
            else:
                #This happens like in pupaeast when there is only imscor
                print "OOOOOOOOOOOOOOOPS", len(thetimelist)
            h_an2[item] = [(e[0], thetime) for e in h_an[item]]
        print "deekee", dkey
        for k in h_an2.keys():
            for item in h_an2[k]:
                #print "<<<",item[0][0],">>>"
                if not h_at.has_key(dkey + "=" + item[1]):
                    h_at[dkey + "=" + item[1]] = []
                h_at[dkey + "=" + item[1]].append((item[0][0], item[0][4]))
                #add the anbool and vals in here
                #anbool tells you which row contains the information we ought to use
                #(in this case not imscor); in the default case anbool=1 for everything.

    fh.close()
    return h_at
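
So this variant returns a dict keyed by "<obs_id>=<observation time>", and each value lists the (vals, anbool) pairs that share that observation and time. Schematically, with made-up values:

h_at = getObsCoreFile("wuppe_obscore.psv")   # hypothetical input file
# h_at might look like:
# {
#   "wuppe0042=1991-12-07_10:15:00": [
#       ({"obs_id": "wuppe0042", "date_obs": "1991-12-07 10:15:00", ...}, 1),  # primary row
#       ({"obs_id": "wuppe0042", "date_obs": "1991-12-07 10:15:00", ...}, 0),  # _imcscor row
#   ],
# }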
Example #8
def getObsCoreFile(odhfname, fname, ohead, nsplit=10000, format="n3"):
    """Convert the given obscore file from MAST (in psv format) into
    RDF.

    Rows that can not be converted are ignored (an error message is
    displayed on STDERR in this case).

    Since the input file is large we now split apart the output every
    nsplit rows. The output is written to
    
        ohead.<i>.<format>

    where i is a counter, starting at 1
    """
    obsdatahash = {}
    (rdr, fh) = open_obscore(fname)

    rnum = 0
    rpass = 0

    idx = 1
    graph = makeGraph()
    globalrowdict = {}
    h_at = {}
    for row in rdr:
        vals = row2dict(row)
        obs_id = vals['obs_id']
        at_time = "_".join(vals['date_obs'].split())
        if obs_id == '':
            raise ValueError("No obs_id value in this row!")
        access_url = vals['access_url']
        access_name = access_url.split('/')[-1].split('_ph_')[0]
        if access_name.find('_sum') != -1:
            access_name = access_name.split('_sum')[0]
        anbool = 1
        if access_name.find('_imcscor') != -1:
            access_name = access_name.split('_imcscor')[0]
            anbool = 0
        #print "access url", access_url
        if access_url.strip() == '':
            raise ValueError("Empty access_url for row")
        dayfind = access_url.find(obs_id + "_d")
        nightfind = access_url.find(obs_id + "_n")
        afind = access_url.find(obs_id + "_a")
        if dayfind != -1:
            d2key = obs_id + "_d"
            #dkey=obs_id#lets not to day separately
        elif afind != -1:
            d2key = obs_id + "_a"
        elif nightfind != -1:
            d2key = obs_id + "_a"
        else:
            d2key = obs_id

        #dkey=obs_id+"--"+access_name
        dkey = obs_id
        if not globalrowdict.has_key(dkey):
            globalrowdict[dkey] = []
        globalrowdict[dkey].append((vals, at_time, access_name, d2key, anbool))

    #print "LLLLL"
    for dkey in globalrowdict.keys():
        print "grd", dkey, len(globalrowdict[dkey])
        dalen = len(globalrowdict[dkey])
        h_an = {}
        for ele in globalrowdict[dkey]:
            vals, at_time, access_name, d2key, anbool = ele
            print "time", at_time, dkey, access_name, anbool
            if not h_an.has_key(access_name):
                h_an[access_name] = []
            if anbool == 1 or dalen == 1:
                h_an[access_name].append((ele, at_time))
            else:
                h_an[access_name].append((ele, None))
        #print "han", h_an
        h_an2 = {}
        for item in h_an.keys():
            #Sprint "hanitem", h_an[item]
            thetimelist = [e[1] for e in h_an[item] if e[1] != None]
            if len(thetimelist) >= 1:
                thetime = thetimelist[0]
            else:
                #This happens like in pupaeast when there is only imscor
                print "OOOOOOOOOOOOOOOPS", len(thetimelist)
            h_an2[item] = [(e[0], thetime) for e in h_an[item]]
        print "deekee", dkey
        for k in h_an2.keys():
            for item in h_an2[k]:
                #print "<<<",item[0][0],">>>"
                if not h_at.has_key(dkey + "=" + item[1]):
                    h_at[dkey + "=" + item[1]] = []
                h_at[dkey + "=" + item[1]].append(item[0])

    for oid in h_at.keys():
        print "OID", oid
        #print "<<<",h_at[oid],">>>"
        graph = addObsCoreObs(oid, h_at[oid], obsdatahash)
        writeGraph(graph,
                   "{0}.{1}.{2}".format(ohead, oid, format),
                   format=format)

    fh.close()
    fd = open(odhfname, "w")
    fd.write(str(obsdatahash))
    fd.close()
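
The two-pass grouping (h_an, then h_an2) exists so that rows which are not the primary source of information (anbool == 0, i.e. the _imcscor products) still land under the same "<obs_id>=<time>" key as their primary row: within each access_name the time is taken from a row with anbool == 1 and copied onto the others. A stripped-down sketch of that idea (the single-row special case is ignored here):

def propagate_time(rows):
    # rows: list of (row, at_time, anbool) tuples sharing one access_name.
    # Pair every row with the time of the primary (anbool == 1) row, which is
    # what the h_an -> h_an2 step above does in miniature.
    times = [t for (_, t, anbool) in rows if anbool == 1]
    thetime = times[0] if times else None   # None mirrors the "OOOOOOOOOOOOOOOPS" branch
    return [(row, thetime) for (row, _, _) in rows]

# Made-up rows: the primary product carries the time, the _imcscor one does not.
rows = [({"obs_id": "wuppe0042"}, "1991-12-07_10:15:00", 1),
        ({"obs_id": "wuppe0042"}, None, 0)]
print(propagate_time(rows))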
Example #9
def getObsCoreFile(fname):
    """
    Takes psv format file, returns a dict with keys the filenames to use, the values
    an array of tuples for that filename, each tuple, the vals, anbool, where anbool says is
    info is taken from corresponding vals. vals is the conversion of the psv row into a dict.
    """
    
    (rdr, fh) = open_obscore(fname)

    rnum = 0
    rpass = 0

    idx = 1
    
    globalrowdict = {}
    h_at = {}
    for row in rdr:
        vals = row2dict(row)
        obs_id = vals['obs_id']
        at_time = "_".join(vals['date_obs'].split())
        if obs_id == '':
            raise ValueError("No obs_id value in this row!")
        access_url = vals['access_url']
        access_name = access_url.split('/')[-1].split('_ph_')[0]
        if access_name.find('_sum') != -1:
            access_name = access_name.split('_sum')[0]
        anbool = 1
        if access_name.find('_imcscor') != -1:
            access_name = access_name.split('_imcscor')[0]
            anbool = 0
        #print "access url", access_url
        if access_url.strip() == '':
            raise ValueError("Empty access_url for row")
        dayfind = access_url.find(obs_id + "_d")
        nightfind = access_url.find(obs_id + "_n")
        afind = access_url.find(obs_id + "_a")
        if dayfind != -1:
            d2key = obs_id + "_d"
            #dkey=obs_id#lets not to day separately
        elif afind != -1:
            d2key = obs_id + "_a"
        elif nightfind != -1:
            d2key = obs_id + "_a"
        else:
            d2key = obs_id

        #dkey=obs_id+"--"+access_name
        dkey = obs_id
        if not globalrowdict.has_key(dkey):
            globalrowdict[dkey] = []
        globalrowdict[dkey].append((vals, at_time, access_name, d2key, anbool))
    
    #print "LLLLL"    
    for dkey in globalrowdict.keys():
        print "grd", dkey, len(globalrowdict[dkey])
        dalen=len(globalrowdict[dkey])
        h_an={}
        for ele in globalrowdict[dkey]:
            vals, at_time, access_name, d2key, anbool=ele
            print "time",at_time, dkey, access_name, anbool
            if not h_an.has_key(access_name):
                h_an[access_name]=[]
            if anbool==1 or dalen==1:
                h_an[access_name].append((ele, at_time))
            else:
                h_an[access_name].append((ele, None))
        #print "han", h_an
        h_an2={}        
        for item in h_an.keys():
            #Sprint "hanitem", h_an[item]
            thetimelist=[e[1] for e in h_an[item] if e[1]!=None]
            if len(thetimelist)>=1:
                thetime=thetimelist[0]
            else:
                #This happens like in pupaeast when there is only imscor
                print "OOOOOOOOOOOOOOOPS", len(thetimelist)
            h_an2[item]=[(e[0],thetime) for e in h_an[item]]
        print "deekee",dkey
        for k in h_an2.keys():
            for item in h_an2[k]:
                #print "<<<",item[0][0],">>>"
                if not h_at.has_key(dkey+"="+item[1]):
                    h_at[dkey+"="+item[1]]=[]
                h_at[dkey+"="+item[1]].append((item[0][0], item[0][4]))
                #add the anbool and vals in here
                #unbool tells you which row contains the information we ought to use
                #in this case not imscor. in default case anbool=1 for everything.
            
                    
            
    fh.close()    
    return h_at