Example #1
def json_download(fname, url, retry=3, ret=False):
    """
    Download the CVE JSON-format zip and extract it.
    """
    while retry > 0:
        retry -= 1  # count this attempt up front so a failed download cannot retry forever
        try:
            print("[+] DOWNLOAD %s to %s (%s tries left)" % (fname, url, retry))
            r = requests.get(url, stream=True)  # stream so the zip is not held in memory
            with open(path('../data/json', fname), 'wb') as f:
                shutil.copyfileobj(r.raw, f)
            with zipfile.ZipFile(path('../data/json', fname)) as zf:
                print("[+] UNZIP %s" % fname)
                retry = 0  # success: leave the retry loop
                zf.extractall(path=path('../data/json'))
                ret = True
        except Exception as e:
            ret = False
            print("[DOWNLOAD ERROR] %s error:%s" % (url, repr(e)))

    return ret
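
A minimal usage sketch (not part of the original example), assuming requests, shutil, zipfile and the project's path() helper are imported; the feed URL is the same one cve_monitor uses in Example #5:

ok = json_download(
    'nvdcve-1.1-modified.json.zip',
    'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-modified.json.zip')
print('[+] download succeeded: %s' % ok)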
Example #2
def mitre_expdb_all(reparse=False):  # todo 存储 对比新增
    """
    Extract CVE exploit labels from cve.mitre.org, supplementing the Exploit
    tags that nvd.nist sometimes omits from its Resources field.
    Update strategy: full refresh; returns all data.
    :return cve_exp: tuple_dict mapping CVE id to EDB id, e.g. {'cve-id': ('cve-id', 'edb-id')}
    """
    hfile = path('../data/nvd', 'source-EXPLOIT-DB.html')
    if reparse and os.path.exists(hfile):
        os.remove(hfile)  # force a fresh download below
    if not os.path.exists(hfile):
        r = requests.get(
            'https://cve.mitre.org/data/refs/refmap/source-EXPLOIT-DB.html')
        html = r.content
        with codecs.open(hfile, 'wb') as f:
            f.write(html)
    cve_exp = dict()
    if os.path.exists(hfile):
        with codecs.open(hfile, 'rb') as f:
            soup = BeautifulSoup(f, 'html.parser')
            for tr in soup.find_all('tr'):
                exp_db = ''
                cve = ''
                for td in tr.find_all('td'):
                    t = str(td)

                    m = re.search(r'EXPLOIT-DB:(\d+)', t)
                    if m:
                        exp_db = m.group(1)
                    else:
                        found = re.findall(r'(CVE-[\d]+-[\d]+)', t)
                        if found:
                            cve = found

                if exp_db and cve:
                    if isinstance(cve, list):
                        for c in cve:
                            cve_exp[c] = (c, exp_db)
                    else:
                        cve_exp[cve] = (cve, exp_db)
    print('[+] Searched all exp from remote mitre expdb')
    return cve_exp
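
A short usage sketch (added for illustration, not in the original):

cve_exp = mitre_expdb_all()
print('[+] %d CVEs carry an EXPLOIT-DB reference' % len(cve_exp))
# each value is a (cve_id, edb_id) tuple, e.g. cve_exp.get('CVE-2019-0708')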
Example #3
def draw_pic(
        table='nvd',
        column='CVE_Items_cve_problemtype_problemtype_data_description_value',
        time='2020',
        top=5):
    """
    Plot trend charts, e.g. for CWE; extensible to other fields.
    """
    if len(time) == 2:
        start = 0
        end = 5
    elif len(time) == 4:
        start = 6
        end = 8
    else:
        start = 9
        end = 11
    so = SQLite("data/cve.db")
    sql = "select cast(substr(CVE_Items_publishedDate,{start},{end}) as int)as ct,{column},count(CVE_Items_cve_CVE_data_meta_ID) from {table} \
          where CVE_Items_publishedDate like '%{time}%' \
          group by substr(CVE_Items_publishedDate,{start},{end}),{column} \
          order by ct DESC".format(column=column,
                                   table=table,
                                   time=time,
                                   start=start,
                                   end=end)
    r = so.query(sql)
    value_list = [i for i in r]

    # split
    r = []
    for i in value_list:
        if i[1] not in ['', None]:
            if ';' in i[1]:
                cwe = i[1].split(';')
                for c in cwe:
                    temp = (i[0], c, i[2])
                    r.append(temp)
            else:
                r.append(i)
    # merge
    rr = dict()
    for i in r:
        if i[0] in rr:
            if i[1] in rr[i[0]]:
                rr[i[0]][i[1]] += i[2]
            else:
                rr[i[0]][i[1]] = i[2]
        else:
            rr[i[0]] = {i[1]: i[2]}

    for k, v in rr.items():
        rr[k] = OrderedDict(
            sorted(v.items(), key=lambda item: item[1], reverse=True))

    # Compute the top CWEs across all time buckets, ranked by position
    unique_cwe = list(set([ki for k, v in rr.items() for ki, kv in v.items()]))
    most_value = []
    if len(unique_cwe) > top:
        for k, v in rr.items():
            t = 0
            for i in rr[k]:
                if i not in ['NVD-CWE-Other', 'NVD-CWE-noinfo']:
                    most_value.append(i)
                    t += 1
                    if t == top:
                        break

    topN = Counter(most_value).most_common(top)

    # Count each top CWE per time bucket
    topN_year = dict()
    for j in topN:
        temp = dict()
        for k, v in rr.items():
            if j[0] in v:
                temp[k] = v[j[0]]
            else:
                temp[k] = 0
        topN_year[j[0]] = temp

    plt.rcParams['font.sans-serif'] = ['SimHei']  # a font that can render CJK labels
    plt.rcParams['font.family'] = 'sans-serif'
    plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly
    for k, v in topN_year.items():
        plt.plot(list(v.keys()), list(v.values()), label=k)
        plt.xticks(rotation=45)
        plt.yticks(rotation=30)
    plt.legend()
    plt.xlabel('Time')
    plt.ylabel('CVE Number')
    if column == 'CVE_Items_cve_problemtype_problemtype_data_description_value':
        title = '%s-CWE-trend' % time
    else:
        title = '%s-%s' % (time, column)  # fallback so title is always defined
    plt.title(title)
    if not os.path.exists(path('../data/img')):
        os.mkdir(path('../data/img'))
    plt.savefig(path('../data/img', '{title}.png'.format(title=title)))
    plt.close('all')
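
A hedged usage sketch (added; assumes data/cve.db exists and the SQLite wrapper, matplotlib, Counter and OrderedDict are imported as in the original project):

draw_pic(table='nvd', time='2020', top=5)  # monthly top-5 CWE trend for 2020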
Example #4
def do_molecfit(headers, spectra, wave=[], mode='HARPS', load_previous=False):
    """This is a function that pipes a list of s1d spectra into molecfit, and
    executes it. It first launches the molecfit GUI on the middle spectrum of the
    sequence, and then loops through the entire list, returning the transmission
    spectra of the Earth's atmosphere in the same order as the list provided.
    These can then be used to correct the s1d spectra or the e2ds spectra.
    Note that the s1d spectra are assumed to be in the barycentric frame in vacuum,
    but that the output transmission spectrum is in the observer's frame, and e2ds
    files are in air wavelengths by default.

    If you have run do_molecfit before, and want to reuse the output of the previous
    run for whatever reason, set the load_previous keyword to True. This will reload
    the list of transmission spectra created last time, if available.
    """

    import pdb
    import numpy as np
    import matplotlib.pyplot as plt
    import sys
    import os.path
    import lib.utils as ut
    import pickle
    import copy
    molecfit_input_folder = '/Users/hoeijmakers/Molecfit/share/molecfit/spectra/cross_cor/'
    molecfit_prog_folder = '/Users/hoeijmakers/Molecfit/bin/'
    temp_specname = copy.deepcopy(
        mode)  #The name of the temporary file used (without extension).
    #The spectrum will be named like this.fits There should be a this.par file as well,
    #that contains a line pointing molecfit to this.fits:
    parname = temp_specname + '.par'

    #====== ||  START OF PROGRAM   ||======#
    N = len(headers)
    if N != len(spectra):
        print(
            'ERROR in prep_for_molecfit: Length of list of headers is not equal to length of list of spectra (%s , %s)'
            % (N, len(spectra)))
        sys.exit()

    #Test that the input root and molecfit roots exist; that the molecfit root contains the molecfit executables.
    #that the input root contains the desired parfile and later fitsfile.
    molecfit_input_root = ut.path(molecfit_input_folder)
    molecfit_prog_root = ut.path(molecfit_prog_folder)
    if not os.path.isdir(molecfit_input_root):
        print('ERROR in prep_for_molecfit: ' + molecfit_input_root +
              ' does not exist!')
        sys.exit()
    if not os.path.isdir(molecfit_prog_root):
        print('ERROR in prep_for_molecfit: ' + molecfit_prog_root +
              ' does not exist!')
        sys.exit()
    if not os.path.isfile(molecfit_input_root + parname):
        print('ERROR in prep_for_molecfit: ' + molecfit_input_root +
              parname + ' does not exist!')
        sys.exit()
    if not os.path.isfile(molecfit_prog_root + 'molecfit'):
        print('ERROR in prep_for_molecfit: ' + molecfit_prog_root +
              'molecfit does not exist!')
        sys.exit()
    if not os.path.isfile(molecfit_prog_root + 'molecfit_gui'):
        print('ERROR in do_molecfit: ' + molecfit_prog_root +
              'molecfit_gui does not exist!')
        sys.exit()

    pickle_outpath = molecfit_input_root + 'previous_run_of_do_molecfit.pkl'

    if load_previous:
        if not os.path.isfile(pickle_outpath):
            print(
                'WARNING in do_molecfit: Previously saved run is not available.'
            )
            print('The user will have to re-fit.')
            print('That run will then be saved.')
            load_previous = False
        else:
            with open(pickle_outpath, "rb") as pickle_in:
                list_of_wls, list_of_fxc, list_of_trans = pickle.load(pickle_in)

    if not load_previous:
        list_of_wls = []
        list_of_fxc = []
        list_of_trans = []

        middle_i = int(
            round(0.5 * N)
        )  #We initialize molecfit on the middle spectrum of the time series.
        write_file_to_molecfit(molecfit_input_root,
                               temp_specname + '.fits',
                               headers,
                               spectra,
                               middle_i,
                               mode=mode,
                               wave=wave)
        print(molecfit_input_root)
        print(temp_specname + '.fits')
        print(headers[middle_i])

        execute_molecfit(molecfit_prog_root,
                         molecfit_input_root + parname,
                         gui=True)
        wl, fx, trans = retrieve_output_molecfit(molecfit_input_root +
                                                 temp_specname)
        remove_output_molecfit(molecfit_input_root, temp_specname)

        for i in range(N):  #range(len(spectra)):
            print('Fitting spectrum %s from %s' % (i + 1, len(spectra)))
            t1 = ut.start()
            write_file_to_molecfit(molecfit_input_root,
                                   temp_specname + '.fits',
                                   headers,
                                   spectra,
                                   i,
                                   mode=mode,
                                   wave=wave)
            execute_molecfit(molecfit_prog_root,
                             molecfit_input_root + parname,
                             gui=False)
            wl, fx, trans = retrieve_output_molecfit(molecfit_input_root +
                                                     temp_specname)
            remove_output_molecfit(molecfit_input_root, temp_specname)
            list_of_wls.append(wl * 1000.0)  #Convert to nm.
            list_of_fxc.append(fx / trans)
            list_of_trans.append(trans)
            ut.end(t1)

        pickle_outpath = molecfit_input_root + 'previous_run_of_do_molecfit.pkl'
        with open(pickle_outpath, 'wb') as f:
            pickle.dump((list_of_wls, list_of_fxc, list_of_trans), f)

    to_do_manually = check_fit_gui(list_of_wls, list_of_fxc, list_of_trans)
    if len(to_do_manually) > 0:
        print('The following spectra were selected to redo manually:')
        print(to_do_manually)
        #CHECK THAT THIS FUNCTIONALITY WORKS:
        for i in to_do_manually:
            write_file_to_molecfit(molecfit_input_root,
                                   temp_specname + '.fits',
                                   headers,
                                   spectra,
                                   int(i),
                                   mode=mode,
                                   wave=wave)
            execute_molecfit(molecfit_prog_root,
                             molecfit_input_root + parname,
                             gui=True)
            wl, fx, trans = retrieve_output_molecfit(molecfit_input_root +
                                                     temp_specname)
            list_of_wls[int(i)] = wl * 1000.0  #Convert to nm.
            list_of_fxc[int(i)] = fx / trans  # telluric-corrected flux, as in the main loop
            list_of_trans[int(i)] = trans
    return (list_of_wls, list_of_trans)
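
A minimal usage sketch (illustrative only; headers and spectra would come from a reader such as read_ESPRESSO_S2D below, and the molecfit install paths are hard-coded at the top of the function):

wls, trans = do_molecfit(headers, spectra, mode='HARPS', load_previous=True)
# trans[i] is the telluric transmission spectrum matching spectra[i]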
Example #5
def cve_monitor(monitor_init=False):
    """
    Parse the stock of data offline; apply incremental updates online.
    :return ret:boolean, if parsed json to sqlite3 or not
    :return cve_day_add:tuple_dict, cve added in given time
    :return exp_day_add:tuple_dict, exp added in given time
    """
    json_list = []
    # The config flag that controls stock parsing arrives as a string,
    # hence the comparison with 'True'; build the list of files to parse.
    if monitor_init == 'True':
        # Clear existing CVE data
        so = SQLite('data/cve.db')
        so.execute('delete from nvd')

        zips = glob.glob('data/json/nvdcve-1.1-*.json.zip')
        for z in zips:
            with zipfile.ZipFile(z) as zf:
                print("[+] UNZIP %s" % z)
                zf.extractall(path=path('data/json'))

        jsons_stock = glob.glob('data/json/nvdcve-1.1-*.json')
        json_list.extend(jsons_stock)
    else:
        # Download the latest "modified" feed
        modified_zip, modified_link = (
            'nvdcve-1.1-modified.json.zip',
            'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-modified.json.zip'
        )
        json_download(modified_zip, modified_link)
        json_list.extend([path('../data/json', 'nvdcve-1.1-modified.json')])

    # Parse stock and incremental CVE data with the same code path
    ret = False
    cve_day_add = dict()
    exp_day_add = dict()
    for j in json_list:
        # Pre-insert step: collect CVE exploits newly added in the modified feed
        exp_day_add = dict()
        cve_day_add = dict()
        if "modified" in j:
            sql, modified_data = json2tuple_dict(j)
            modified_time = time_delta(delta=-1, format="%Y-%m-%d")
            cve_exps = day_modified_exp(modified_data,
                                        time=modified_time,
                                        key='Exploit')
            cve_list = list(cve_exps.values())
            cve_day_add = day_modified_exp(modified_data, time=modified_time)
            for cve in cve_list:
                so, exid = cve_exists_where(
                    db='cve',
                    table='nvd',
                    key=['CVE_Items_cve_CVE_data_meta_ID'],
                    where='CVE_Items_cve_CVE_data_meta_ID="{d}"'.format(
                        d=cve[0]))
                so1, eid = cve_exists_where(
                    db='cve',
                    table='nvd',
                    key=['CVE_Items_cve_CVE_data_meta_ID'],
                    where=
                    'CVE_Items_cve_CVE_data_meta_ID="{d}" and CVE_Items_cve_references_reference_data_tags not like "%Exploit%"'
                    .format(d=cve[0]))
                if exid:
                    if eid:
                        print(
                            '[+] %s exists in previous cve.db without exploit, so added'
                            % cve[0])
                        exp_day_add[cve[0]] = cve
                    else:
                        print(
                            '[+] %s exists in previous cve.db with exploit, so skipped'
                            % cve[0])
                else:
                    print(
                        '[+] %s is new to cve.db, so added'
                        % cve[0])
                    exp_day_add[cve[0]] = cve

        # Insert the day's newly added exploits from NVD into exp.db
        first_part_exp_add = list(exp_day_add.values())
        exp_table(first_part_exp_add)
        # Now insert into cve.db proper
        sql, cve_data = json2tuple_dict(j)
        ret = sqlite_insert(sql, cve_data, dir_name='data/cve.db')
        print("[+] Parsed %s to sqlite3" % j)

    return ret, cve_day_add, exp_day_add
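
A brief usage sketch (added for illustration; the monitor flag typically comes from a config file as a string):

ret, cve_day_add, exp_day_add = cve_monitor(monitor_init='True')  # full re-parse of the stock
ret, cve_day_add, exp_day_add = cve_monitor()                     # incremental daily update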
Example #6
def read_ESPRESSO_S2D_JULIA(inpath,outname,air=True,molecfit=True):
    """
    Reads in the ESPRESSO files and prepares them for use in molecfit;
    this function then calls do_molecfit from the molecfit module.
    input:
        inpath: type: string, path to the S2D ESPRESSO files
        outname: type: string, path to where the telluric correction should be saved
        air: type: boolean, is the wavelength in air or vacuum
        molecfit: type: boolean, function can be run with or without starting molecfit
    Note for Jens: The wavelength is much easier in ESPRESSO compared to HARPS, so I kicked out
    all the parts needed for that in the HARPS function.
    Since Romain has for some reason only given me the S2D files and not the S1D files (God knows why),
    and also only fiber A (again, God knows why), this function only does molecfit on the sodium orders.
    Author: Julia V. Seidel.
    """
    import os
    import pdb
    from astropy.io import fits
    import numpy as np
    import matplotlib.pyplot as plt
    import sys
    import utils as ut
    import molecfit as mol
    import pyfits
    import copy
    import scipy.interpolate as interp
    import pickle

    #First check the input:
    ut.typetest('inpath in read_ESPRESSO_S2D ',inpath,str)
    ut.typetest('outname in read_ESPRESSO_S2D ',outname,str)
    ut.typetest('air in read_ESPRESSO_S2D ',air,bool)
    if not os.path.exists(inpath):
        print("ERROR in read_ESPRESSO_S2D: Data input path (%s) does not exist." % inpath)
        sys.exit()

    filelist=os.listdir(inpath)
    N=len(filelist)

    if len(filelist) == 0:
        print("ERROR in read_ESPRESSO_S2D: input folder (%s) is empty." % inpath)
        sys.exit()

    #The following variables define the lists in which all the necessary data will be stored.
    framename=[]
    header=[]
    type=[]
    texp=np.array([])
    date=[]
    npx=np.array([])
    norders=np.array([])
    mjd=np.array([])
    ccfmjd=np.array([])
    s1dmjd=np.array([])
    s2d=[]
    airmass=np.array([])
    berv=np.array([])
    wave=[]

    outpath = ut.path(outname)
    if not os.path.exists(outpath):
        os.makedirs(outpath)

    #ccftotal = 0 #This will hold the sum of the CCFs
    s2d_count = 0
    sci_count = 0



    for i in range(N):
        if filelist[i].endswith('S2D_A.fits'):
            s2d_count += 1
            print(filelist[i])
            #data,hdr=fits.getdata(inpath+filelist[i],header=True)

            hdul = fits.open(inpath+filelist[i])
            data = copy.deepcopy(hdul[1].data)
            hdr1 = hdul[0].header
            hdr2 = hdul[1].header
            wavedata=copy.deepcopy(hdul[5].data)
            wave.append(wavedata)
            hdul.close()
            del hdul[1].data  # free the science-extension data that was deep-copied above
            if hdr2['EXTNAME'] == 'SCIDATA':
                framename.append(filelist[i])
                header.append(hdr1)
                type.append(hdr2['EXTNAME'])
                texp=np.append(texp,hdr1['EXPTIME'])
                date.append(hdr1['DATE-OBS'])
                mjd=np.append(mjd,hdr1['MJD-OBS'])
                npx=np.append(npx,hdr2['NAXIS1'])
                norders=np.append(norders,hdr2['NAXIS2'])
                s2d.append(data)

                sci_count += 1
                berv=np.append(berv,hdr1['HIERARCH ESO QC BERV'])
                airmass=np.append(airmass,0.5*(hdr1['HIERARCH ESO TEL3 AIRM START']+hdr1['HIERARCH ESO TEL3 AIRM END']))


    #Now we catch some errors:
    #-The above should have read a certain number of s2d files.
    #-A certain number of these should be SCIENCE frames.
    #-There should be at least one WAVE file.
    #-All exposures should have the same number of spectral orders.
    #-All orders should have the same number of pixels (this is true for ESPRESSO).
    #-The wave frame should have the same dimensions as the order frames.
    #-If nowave is set, test that all frames used the same wave_A calibrator.
    #-The blaze file needs to have the same shape as the s2d files.
    #-The number of s1d files should be the same as the number of s2d files.



    # if s2d_count != s1d_count:
#    #     print('ERROR in read_ESPRESSO_s2d: The numbers of 1ds and s2d files are different.')
#    #     print("These are the files and their types:")
#    #     for i in range(len(type)):
#    #         print('   '+framename[i]+'  %s' % type[i])
#    #     sys.exit()
#    if s2d_count == 0:
#        print("ERROR in read_ESPRESSO_s2d: The input folder (%s) does not contain files ending in s2d.fits." % inpath)
#        sys.exit()
#    if sci_count == 0:
#        print("ERROR in read_ESPRESSO_s2d: The input folder (%2) contains s2d files, but none of them are classified as SCIENCE frames with the HIERARCH ESO DPR CATG keyword.")
#        print("These are the files and their types:")
#        for i in range(len(type)):
#            print('   '+framename[i]+'  %s' % type[i])
#        sys.exit()
#    if np.max(np.abs(norders-norders[0])) == 0:
#        norders=int(norders[0])
#    else:
#        print("ERROR in read_ESPRESSO_s2d: Not all files have the same number of orders.")
#        print("These are the files and their number of orders:")
#        for i in range(len(type)):
#            print('   '+framename[i]+'  %s' % norders[i])
#        sys.exit()
#    if np.max(np.abs(npx-npx[0])) == 0:
#        npx=int(npx[0])
#    else:
#        print("ERROR IN read_ESPRESSO_s2d: Not all files have the same number of pixels.")
#        print("These are the files and their number of pixels:")
#        for i in range(len(type)):
#            print('   '+framename[i]+'  %s' % npx[i])
#        sys.exit()
#    if np.max(np.abs(nrv-nrv[0])) == 0:
#        nrv=int(nrv[0])
#    else:
#        print("ERROR IN read_ESPRESSO_s2d: Not all files have the same number of pixels.")
#        print("These are the files and their number of pixels:")
#        for i in range(len(type)):
#            print('   '+framename[i]+'  %s' % npx[i])
#        sys.exit()



#
#    if len(s1dhdr) != len(s2d) and molecfit == True:
#        print('ERROR in read_ESPRESSO_s2d: The number of s1d SCIENCE files and s2d SCIENCE files is not the same. (%s vs %s)' % (len(s1dhdr),len(s2d)))
#        print('Switching off the molecfit option will suppress this error.')



    #Ok, so now we should have ended up with a number of lists that contain all
    #the relevant information of our science frames.
    #We determine how to sort the resulting lists in time:
#    sorting = np.argsort(mjd)
    s2dsorting = np.argsort(mjd)

    #First sort the s1d files for application of molecfit.
    if molecfit == True:
        s2dhdr_sorted=[]
        s2d_sorted=[]
        wave_sorted=[]
        for i in s2dsorting:
            s2dhdr_sorted.append(header[i])
            s2d_sorted.append(s2d[i])
            wave_sorted.append(wave[i])

        # print('Molecfit will be executed onto the files in this order:')
        # for x in s1dhdr_sorted:
        #     print(x['DATE-OBS'])
        list_of_wls,list_of_trans = mol.do_molecfit(s2dhdr_sorted,s2d_sorted,mode='ESPRESSO',load_previous=False,order=116,wave=wave_sorted)
        mol.write_telluric_transmission_to_file(list_of_wls,list_of_trans,outpath+'telluric_transmission_spectra.pkl')




#    ccftotal = 0.0
    #Now we loop over all exposures and collect the i-th order from each exposure,
    #put these into a new matrix and save them to FITS images:
    f=open(outpath+'obs_times','w',newline='\n')
    headerline = 'MJD'+'\t'+'DATE'+'\t'+'EXPTIME'+'\t'+'MEAN AIRMASS'+'\t'+'BERV (km/s)'+'\t'+'FILE NAME'
    f.write(headerline+'\n')  # column header for the obs_times table

    for i in range(int(norders[0])):
        order = np.zeros((sci_count,int(npx[0])))

        wave_axis = wave[0][i]/10.0#Convert to nm.
#        ccforder = np.zeros((ccf_count,nrv))
        print('CONSTRUCTING ORDER %s' % i)
        c = 0#To count the number of science frames that have passed. The counter
        # c is not equal to j because the list of files contains not only SCIENCE
        # frames.
#        cc = 0#Same for ccfs
#        for j in range(len(ccfsorting)):
#            ccf=ccfs[ccfsorting[j]]
#            ccforder[cc,:] = ccf[i,:]
#            cc+=1
        for j in range(len(s2dsorting)):#Loop over exposures
            if i ==0:
                print('---'+type[s2dsorting[j]]+'  '+date[s2dsorting[j]])

            exposure = s2d[s2dsorting[j]]


            order[c,:] = exposure[i,:]
            #T_i = interp.interp1d(list_of_wls[j],list_of_trans[j])#This should be time-sorted, just as the s2d files.
            #Do a manual check here that the MJDs are identical.
            #Also, determiine what to do with airtovac.
            #tel_order[c,:] = T_i[wave_axis]
            #Now I also need to write it to file.
            if i ==0:#Only do it the first time, not for every order.
                line = str(mjd[s2dsorting[j]])+'\t'+date[s2dsorting[j]]+'\t'+str(texp[s2dsorting[j]])+'\t'+str(airmass[s2dsorting[j]])+'\t'+str(berv[s2dsorting[j]])+'\t'+framename[s2dsorting[j]]+'\n'
                f.write(line)
            c+=1
#        ccftotal+=ccforder



#        fits.writeto(outpath+'ccf_'+str(i)+'.fits',ccforder,overwrite=True)
        fits.writeto(outpath+'order_'+str(i)+'.fits',order,overwrite=True)
        fits.writeto(outpath+'wave_'+str(i)+'.fits',wave_axis,overwrite=True)
#    fits.writeto(outpath+'ccftotal.fits',ccftotal,overwrite=True)
    f.close()
    print('Time-table written to '+outpath+'obs_times')
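
A usage sketch (illustrative; both paths are assumptions):

read_ESPRESSO_S2D_JULIA('/data/espresso/raw/', '/data/espresso/telluric/', air=True, molecfit=True)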
Example #7
def read_ESPRESSO_S2D(inpath,outname,air=True,nowave=False,molecfit=True,mode='HARPS'):
    """THIS IS A PYTHON TRANSLATION OF READ_DATA (BELOW). IT SHOULD WORK
    WITHOUT A PATHS FILE: JUST A FOLDER THAT CONTAINS ONLY FITS FILES, AND THEN
    IT WORKS FROM THE KEYWORDS TO DO EVERYTHING AUTOMATICALLY.

    WRITE GOOD TESTS AND DOCUMENTATION.

    ALSO, ULTIMATELY THIS WILL NEED A WRAPPER THAT CAN SWITCH BETWEEN DIFFERENT STANDARD DATASETS.
    IN THE CASE OF UVES (AND MAYBE MOST OTHER DATASETS) IT WILL NEED TO DEAL WITH BERV CORRECTIONS.
    GREAT WAY TO DO THIS IS HERE: https://docs.astropy.org/en/stable/coordinates/velocities.html
    DID THAT WITH JEHAN FOR 55 CNC E.

    Set the nowave keyword to True if the dataset has no wave files associated with it.
    This may happen if you downloaded ESO Advanced Data Products, which include
    reduced science e2ds's but not reduced wave e2ds's. The wavelength solution
    is still encoded in the fits header however, so we take it from there, instead.


    IF IN THE FUTURE A BERV KEYWORD WOULD BE MISSING, I HAVE INCLUDED AN ASTROPY
    IMPLEMENTATION THAT ACCURATELY CALCULATES THE BERV FROM THE MJD. SEE SYSTEM_PARAMETERS.PY
    """
    import os
    import pdb
    from astropy.io import fits
    import numpy as np
    import matplotlib.pyplot as plt
    import sys
    import lib.utils as ut
    import lib.molecfit as mol
    import pyfits
    import copy
    import scipy.interpolate as interp
    import pickle
    import lib.constants as const

    #First check the input:
    ut.typetest('inpath in read_ESPRESSO_S2D ',inpath,str)
    ut.typetest('outname in read_ESPRESSO_S2D ',outname,str)
    ut.typetest('air in read_ESPRESSO_S2D ',air,bool)
    if not os.path.exists(inpath):
        print("ERROR in read_ESPRESSO_S2D: Data input path (%s) does not exist." % inpath)
        sys.exit()

    filelist=os.listdir(inpath)
    N=len(filelist)

    if len(filelist) == 0:
        print("ERROR in read_ESPRESSO_S2D: input folder (%s) is empty." % inpath)
        sys.exit()

    #The following variables define the lists in which all the necessary data will be stored.
    framename=[]
    header=[]
    s1dhdr=[]
    type=[]
    texp=np.array([])
    date=[]
    mjd=np.array([])
    ccfmjd=np.array([])
    s1dmjd=np.array([])
    npx=np.array([])
    nrv=np.array([])
    norders=np.array([])
    e2ds=[]
    s1d=[]
    wave1d=[]
    airmass=np.array([])
    berv=np.array([])
    wave=[]
    blaze=[]
    ccfs=[]
    wavefile_used = []
    outpath = ut.path('data/'+outname)
    if not os.path.exists(outpath):
        os.makedirs(outpath)

    #ccftotal = 0 #This will hold the sum of the CCFs
    e2ds_count = 0
    sci_count = 0
    wave_count = 0
    ccf_count = 0
    blaze_count = 0
    s1d_count = 0


    #MODE SWITCHING HERE:
    catkeyword = 'EXTNAME'
    bervkeyword = 'HIERARCH ESO QC BERV'

    airmass_keyword1 = 'HIERARCH ESO TEL'
    airmass_keyword2 = ' AIRM '
    airmass_keyword3_start = 'START'
    airmass_keyword3_end = 'END'


    # berv=np.append(berv,hdr1['HIERARCH ESO QC BERV'])
    # airmass=np.append(airmass,0.5*(hdr1['HIERARCH ESO TEL3 AIRM START']+hdr1['HIERARCH ESO TEL3 AIRM END']))



    for i in range(N):
        if filelist[i].endswith('S2D_A.fits'):
            e2ds_count += 1
            print(filelist[i])
            #data,hdr=fits.getdata(inpath+filelist[i],header=True)

            hdul = fits.open(inpath+filelist[i])
            data = copy.deepcopy(hdul[1].data)
            hdr = hdul[0].header
            hdr2 = hdul[1].header
            wavedata=copy.deepcopy(hdul[5].data)
            hdul.close()
            del hdul[1].data

            if hdr2[catkeyword] == 'SCIDATA':
                print('science keyword found')
                framename.append(filelist[i])
                header.append(hdr)
                type.append('SCIENCE')
                texp=np.append(texp,hdr['EXPTIME'])
                date.append(hdr['DATE-OBS'])
                mjd=np.append(mjd,hdr['MJD-OBS'])
                npx=np.append(npx,hdr2['NAXIS1'])
                norders=np.append(norders,hdr2['NAXIS2'])
                e2ds.append(data)

                sci_count += 1
                berv=np.append(berv,hdr[bervkeyword]*1000.0)

                telescope = hdr['TELESCOP'][-1]
                airmass = np.append(airmass,0.5*(hdr[airmass_keyword1+telescope+' AIRM START']+hdr[airmass_keyword1+telescope+' AIRM END']))
                wave.append(wavedata*(1.0-hdr[bervkeyword]*1000.0/const.c))
                #Ok.! So unlike HARPS, ESPRESSO wavelengths are BERV corrected in the S2Ds.
                #WHY!!!?. WELL SO BE IT. IN ORDER TO HAVE E2DSes THAT ARE ON THE SAME GRID, AS REQUIRED, WE UNDO THE BERV CORRECTION HERE.
                #WHEN COMPARING WAVE[0] WITH WAVE[1], YOU SHOULD SEE THAT THE DIFFERENCE IS NILL.
                #THATS WHY LATER WE JUST USE WAVE[0] AS THE REPRESENTATIVE GRID FOR ALL.

        if filelist[i].endswith('CCF_A.fits'):
            #ccf,hdr=fits.getdata(inpath+filelist[i],header=True)
            hdul = fits.open(inpath+filelist[i])
            ccf = copy.deepcopy(hdul[1].data)
            hdr = hdul[0].header
            hdr2 = hdul[1].header
            hdul.close()
            del hdul[1].data

            if hdr2[catkeyword] == 'SCIDATA':
                print('CCF ADDED')
                #ccftotal+=ccf
                ccfs.append(ccf)
                ccfmjd=np.append(ccfmjd,hdr['MJD-OBS'])
                nrv=np.append(nrv,hdr2['NAXIS1'])
                ccf_count += 1

        if filelist[i].endswith('S1D_A.fits'):
            hdul = fits.open(inpath+filelist[i])
            data_table = copy.deepcopy(hdul[1].data)
            hdr = hdul[0].header
            hdr2 = hdul[1].header
            hdul.close()
            del hdul[1].data
            if hdr['HIERARCH ESO PRO SCIENCE']:
                s1d.append(data_table.field(2))
                wave1d.append(data_table.field(1))
                s1dhdr.append(hdr)
                s1dmjd=np.append(s1dmjd,hdr['MJD-OBS'])
                s1d_count += 1
    #Now we catch some errors:
    #-The above should have read a certain number of e2ds files.
    #-A certain number of these should be SCIENCE frames.
    #-There should be at least one WAVE file.
    #-All exposures should have the same number of spectral orders.
    #-All orders should have the same number of pixels (this is true for HARPS).
    #-The wave frame should have the same dimensions as the order frames.
    #-If nowave is set, test that all frames used the same wave_A calibrator.
    #-The blaze file needs to have the same shape as the e2ds files.
    #-The number of s1d files should be the same as the number of e2ds files.


    if ccf_count != sci_count:
        print("ERROR in read_ESPRESSO_S2D: The number of science CCFs differs from the number of science frames.")
        sys.exit()
    # if e2ds_count != s1d_count:
    #     print('ERROR in read_HARPS_e2ds: The numbers of 1ds and e2ds files are different.')
    #     print("These are the files and their types:")
    #     for i in range(len(type)):
    #         print('   '+framename[i]+'  %s' % type[i])
    #     sys.exit()
    if e2ds_count == 0:
        print("ERROR in read_ESPRESSO_S2D: The input folder (%s) does not contain files ending in S2D_A.fits." % inpath)
        sys.exit()
    if sci_count == 0:
        print("ERROR in read_ESPRESSO_S2D: The input folder (%s) contains S2D files, but none of them are classified as SCIENCE frames with the HIERARCH ESO DPR CATG/OBS-TYPE keyword." % inpath)
        print("These are the files and their types:")
        for i in range(len(type)):
            print('   '+framename[i]+'  %s' % type[i])
        sys.exit()
    if np.max(np.abs(norders-norders[0])) == 0:
        norders=int(norders[0])
    else:
        print("ERROR in read_ESPRESSO_S2D: Not all files have the same number of orders.")
        print("These are the files and their number of orders:")
        for i in range(len(type)):
            print('   '+framename[i]+'  %s' % norders[i])
        sys.exit()
    if np.max(np.abs(npx-npx[0])) == 0:
        npx=int(npx[0])
    else:
        print("ERROR IN read_ESPRESSO_S2D: Not all files have the same number of pixels.")
        print("These are the files and their number of pixels:")
        for i in range(len(type)):
            print('   '+framename[i]+'  %s' % npx[i])
        sys.exit()
    if np.max(np.abs(nrv-nrv[0])) == 0:
        nrv=int(nrv[0])
    else:
        print("ERROR IN read_ESPRESSO_S2D: Not all CCFs have the same number of velocity points.")
        print("These are the files and their number of velocity points:")
        for i in range(len(type)):
            print('   '+framename[i]+'  %s' % nrv[i])
        sys.exit()


    # print(wave[0][0,:]-wave[1][0,:])
    # print(wave1d[0]-wave1d[2])
    wave=wave[0]#SELECT ONLY THE FIRST WAVE FRAME. The rest is ignored.
    wave1d=wave1d[0]
    # else:
    #     if nowave == False:
    #         print("ERROR in read_HARPS_e2ds: No wave_A.fits file was detected.")
    #         print("These are the files in the folder:")
    #         for i in range(N):
    #             print(filelist[i])
    #         print("This may have happened if you downloaded the HARPS data from the")
    #         print("ADP query form, which doesn't include wave_A files (as far as I")
    #         print("have seen). Set the /nowave keyword in your call to read_HARPS_e2ds")
    #         print("if you indeed do not expect a wave_A file to be present.")
    # if nowave == True:
    #     if all(x == wavefile_used[0] for x in wavefile_used):
    #         print("Nowave is set, and simple wavelength calibration extraction")
    #         print("works, as all files in the dataset used the same wave_A file.")
    #         wave=wave[0]
    #     else:
    #         print("ERROR IN read_HARPS_e2ds: Nowave is set, but not all files")
    #         print("in the dataset used the same wave_A file when the pipeline was")
    #         print("run. Catching this requres an interpolation step that is currently")
    #         print("not yet implemented. Exiting. These are the filenames and their")
    #         print("wave_A file used:")
    #         for i in range(N-1):
    #             print('   '+framename[i]+'  %s' % wavefile_used[0])
    #         wave=wave[0]
    #         print("I ALLOW YOU TO CONTINUE BUT USING ONLY THE FIRST WAVELENGTH")
    #         print("SOLUTION. A PART OF THE DATA MAY BE AFFECTED BY HAVING ASSUMED")
    #         print("THE WRONG SOLUTION. If you are doing transits, you don't need")
    #         print("this kind of precision.")


    if np.shape(wave) != np.shape(e2ds[0]):
        print("ERROR in read_ESPRESSO_S2D: A wave file was detected but its shape (%s,%s) does not match that of the orders (%s,%s)" % (np.shape(wave)[0],np.shape(wave)[1],np.shape(e2ds[0])[0],np.shape(e2ds[0])[1]))
    if len(s1dhdr) != len(e2ds) and molecfit == True:
        print('ERROR in read_ESPRESSO_S2D: The number of s1d SCIENCE files and e2ds SCIENCE files is not the same. (%s vs %s)' % (len(s1dhdr),len(e2ds)))
        print('Switching off the molecfit option will suppress this error.')


    #Ok, so now we should have ended up with a number of lists that contain all
    #the relevant information of our science frames.
    #We determine how to sort the resulting lists in time:
    sorting = np.argsort(mjd)
    ccfsorting = np.argsort(ccfmjd)
    s1dsorting = np.argsort(s1dmjd)


    #First sort the s1d files for application of molecfit.
    if molecfit == True:
        s1dhdr_sorted=[]
        s1d_sorted=[]
        for i in range(len(s1dsorting)):
            s1dhdr_sorted.append(s1dhdr[s1dsorting[i]])
            s1d_sorted.append(s1d[s1dsorting[i]])

        # print(s1dhdr_sorted[0])
        #
        # f=open('ILOVEHEADERS','w')
        # for k in s1dhdr_sorted[0]:
        #     f.write(str(k)+'\n')
        # f.close()
        #
        # sys.exit()
        list_of_wls,list_of_trans = mol.do_molecfit(s1dhdr_sorted,s1d_sorted,wave=wave1d,load_previous=False,mode=mode)
        mol.write_telluric_transmission_to_file(list_of_wls,list_of_trans,outpath+'telluric_transmission_spectra.pkl')


    ccftotal = 0.0
    #Now we loop over all exposures and collect the i-th order from each exposure,
    #put these into a new matrix and save them to FITS images:
    f=open(outpath+'obs_times','w',newline='\n')
    headerline = 'MJD'+'\t'+'DATE'+'\t'+'EXPTIME'+'\t'+'MEAN AIRMASS'+'\t'+'BERV (km/s)'+'\t'+'FILE NAME'
    f.write(headerline+'\n')  # column header for the obs_times table
    for i in range(norders):
        order = np.zeros((sci_count,npx))
        ccforder = np.zeros((ccf_count,nrv))
        wave_axis = wave[i,:]/10.0#Convert to nm.
        print('CONSTRUCTING ORDER %s' % i)
        c = 0#To count the number of science frames that have passed. The counter
        # c is not equal to j because the list of files contains not only SCIENCE
        # frames.
        cc = 0#Same for ccfs
        for j in range(len(ccfsorting)):
            ccf=ccfs[ccfsorting[j]]
            ccforder[cc,:] = ccf[i,:]
            cc+=1
        for j in range(len(sorting)):#Loop over exposures
            if i ==0:
                print('---'+type[sorting[j]]+'  '+date[sorting[j]])
            if type[sorting[j]] == 'SCIENCE':#This check may be redundant.
                exposure = e2ds[sorting[j]]
                order[c,:] = exposure[i,:]
                #T_i = interp.interp1d(list_of_wls[j],list_of_trans[j])#This should be time-sorted, just as the e2ds files.
                #Do a manual check here that the MJDs are identical.
                #Also, determiine what to do with airtovac.
                #tel_order[c,:] = T_i[wave_axis]
                #Now I also need to write it to file.
                if i ==0:#Only do it the first time, not for every order.
                    line = str(mjd[sorting[j]])+'\t'+date[sorting[j]]+'\t'+str(texp[sorting[j]])+'\t'+str(airmass[sorting[j]])+'\t'+str(berv[sorting[j]])+'\t'+framename[sorting[j]]+'\n'
                    f.write(line)
                c+=1
        ccftotal+=ccforder



        fits.writeto(outpath+'ccf_'+str(i)+'.fits',ccforder,overwrite=True)
        fits.writeto(outpath+'order_'+str(i)+'.fits',order,overwrite=True)
        fits.writeto(outpath+'wave_'+str(i)+'.fits',wave_axis,overwrite=True)
    fits.writeto(outpath+'ccftotal.fits',ccftotal,overwrite=True)
    f.close()
    print('Time-table written to '+outpath+'obs_times')
    print('WARNING: FORMATTING IS STILL SCREWED UP!')
    print('FIGURE OUT HOW TO FORMAT THOSE LINES IN A MORE HUMAN READABLE WAY')
    print('WHEN YOU HAVE INTERNET AGAIN.')
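
A usage sketch (illustrative; the input path is an assumption, and output lands under data/<outname>):

read_ESPRESSO_S2D('/data/espresso/raw/', 'my_target', air=True, molecfit=True, mode='ESPRESSO')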
Example #8
def exp_model(epoch, delta=(0,)):
    """
    EXP training and prediction model.
    :param epoch: tuple_dict of CVEs added in the current run
    :param delta: iterable of day offsets to predict over (it is iterated below,
                  so it must be a sequence rather than a bare int)
    :return: (epoch_exp_proba, day_exp_proba, month_exp_proba), each a list of rows or None
    """
    so, cve = cve_query(db='cve', table='nvd', key=['*'])
    cve_df = pd.DataFrame(cve, columns=cve_tags)
    cve_csv = cve_df[[
        'CVE_Items_cve_CVE_data_meta_ID',
        'CVE_Items_cve_description_description_data_value',
        'CVE_Items_publishedDate', 'CVE_EXP_label'
    ]]
    cve_csv.to_csv('CVE_EXP_2020.csv', index=False)
    # Guard against model drift: retrain once a month
    month_day = time_delta(delta=0, format='%Y-%m-%d')
    if int(month_day.split('-')[2]) == 404:  # ECS host is short on memory, so retraining is effectively disabled (day 404 never matches)
        print('[+] Retrain model')
        x = cve_df['CVE_Items_cve_description_description_data_value'].astype(
            'str')
        y = cve_df['CVE_EXP_label'].astype('int')
        nlp = wordindex(char_level=False)
        fx, fy = nlp.fit_transform(x, y)
        joblib.dump(nlp, 'data/model/nlp.h5')
        train_x, valid_x, train_y, valid_y = train_test_split(
            fx, fy, random_state=2019, test_size=0.3)
        model = textcnn(input_type='wordindex',
                        max_len=nlp.max_length,
                        input_dim=nlp.input_dim,
                        output_dim=16,
                        class_num=1)
        model.fit(train_x,
                  train_y,
                  validation_data=(valid_x, valid_y),
                  epochs=1,
                  batch_size=128)
        joblib.dump(model, 'data/model/textcnn.h5')

    print('[+] Load predict model')
    nlp = joblib.load('data/model/nlp.h5')
    model = joblib.load('data/model/textcnn.h5')
    # Test: predict CVEs added in the current run today
    if epoch:
        cve_epoch = pd.DataFrame(
            list(epoch.values()),
            columns=[
                'CVE_Items_cve_CVE_data_meta_ID',
                'CVE_Items_cve_description_description_data_value',
                'CVE_Items_publishedDate', 'CVE_Items_lastModifiedDate',
                'Source'
            ])
        print(cve_epoch)
        x = cve_epoch[
            'CVE_Items_cve_description_description_data_value'].astype('str')
        fx = nlp.transform(x)
        #model.summary()
        pre = model.predict(fx)
        pre = pd.DataFrame(pre)
        epoch_exp_proba = pd.concat([
            cve_epoch[[
                'CVE_Items_cve_CVE_data_meta_ID',
                'CVE_Items_cve_description_description_data_value',
                'CVE_Items_publishedDate'
            ]], pre
        ],
                                    axis=1)
        epoch_exp_proba.columns = [
            'CVE_ID', 'Description', 'PubDate', 'EXP_Proba'
        ]
        epoch_exp_proba = epoch_exp_proba.sort_values(by='EXP_Proba',
                                                      ascending=False)
        epoch_exp_proba = epoch_exp_proba.values.tolist()
    else:
        epoch_exp_proba = None
        print('[INFO] No CVE Today Epoch')

    # Test: predict CVEs added today
    cve = []
    for d in delta:
        modified_time = time_delta(delta=d, format="%Y-%m-%d")
        so, tmp = cve_query_where(
            db='cve',
            table='nvd',
            key=['*'],
            where='CVE_Items_publishedDate like "%{}%"'.format(modified_time))
        cve = cve + tmp
    if cve:
        cve_df = pd.DataFrame(cve, columns=cve_tags)
        x = cve_df['CVE_Items_cve_description_description_data_value'].astype(
            'str')
        fx = nlp.transform(x)
        #model.summary()
        pre = model.predict(fx)
        pre = pd.DataFrame(pre)
        day_exp_proba = pd.concat([
            cve_df[[
                'CVE_Items_cve_CVE_data_meta_ID',
                'CVE_Items_cve_description_description_data_value',
                'CVE_Items_publishedDate'
            ]], pre
        ],
                                  axis=1)
        day_exp_proba.columns = [
            'CVE_ID', 'Description', 'PubDate', 'EXP_Proba'
        ]
        day_exp_proba = day_exp_proba.sort_values(by='EXP_Proba',
                                                  ascending=False)
        day_exp_proba = day_exp_proba.values.tolist()
    else:
        day_exp_proba = None
        print('[INFO] No CVE Today')

    # Test: predict CVEs added this month
    time = time_delta(format="%Y-%m")
    so, cve = cve_query_where(
        db='cve',
        table='nvd',
        key=['*'],
        where='CVE_Items_publishedDate like "%{}%"'.format(time))
    if cve:
        cve_df = pd.DataFrame(cve, columns=cve_tags)
        x = cve_df['CVE_Items_cve_description_description_data_value'].astype(
            'str')
        fx = nlp.transform(x)
        pre = model.predict(fx)
        pre = pd.DataFrame(pre)
        month_exp_proba = pd.concat([
            cve_df[[
                'CVE_Items_cve_CVE_data_meta_ID',
                'CVE_Items_cve_description_description_data_value',
                'CVE_Items_publishedDate'
            ]], pre
        ],
                                    axis=1)
        month_exp_proba.columns = [
            'CVE_ID', 'Description', 'PubDate', 'EXP_Proba'
        ]
        month_exp_proba = month_exp_proba.sort_values(by='EXP_Proba',
                                                      ascending=False)

        so, exist_cid = cve_exists(db='exp', table='exps', key=['cve_id'])
        month_exp_proba['Ground_Truth'] = month_exp_proba.apply(
            lambda x: 1 if x['CVE_ID'] in exist_cid else 'None', axis=1)
        month_exp_proba = month_exp_proba.values.tolist()
    else:
        month_exp_proba = None
        print('[INFO] No CVE Month')
    with open(path('../data/log', 'cveflow.log'), 'a+') as f:
        f.write('[Done] CVE EXP Prediction\n')
    return epoch_exp_proba, day_exp_proba, month_exp_proba
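
A usage sketch (illustrative; epoch is the cve_day_add dict returned by cve_monitor in Example #5):

ret, cve_day_add, exp_day_add = cve_monitor()
epoch_p, day_p, month_p = exp_model(cve_day_add, delta=(0, -1))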