def _needs_polarization(list_features):
  """Returns True if at least one polarization attribute is requested."""
  return any(name in list_features
             for name in ('Rectilinearity', 'Planarity', 'Azimuth', 'Incidence'))


def _extract_trace_features(opt, s, list_features, dic):
  """
  Runs the extractor selected by opt.opdict['option'] on one trace.
  Returns dic unchanged for any option other than 'norm' or 'hash'.
  """
  option = opt.opdict['option']
  if option == 'norm':
    return extract_norm_features(s, list_features, dic)
  if option == 'hash':
    return extract_hash_features(s, list_features, dic, opt.opdict['permut_file'], plot=False)
  return dic


def _add_polarization_features(df, s, event_id, last_comp, list_features, mat, *read_args, **read_kwargs):
  """
  Computes the polarization attributes of one event and writes every
  requested one into the Z, N and E rows of df (in place).
  Extra arguments are forwarded to s.read_all_files after mat.
  """
  # Mean duration and mean P-onset over the last processed component, E and Z.
  keys = [(event_id, 'BOR', c) for c in (last_comp, 'E', 'Z')]
  d_mean = (df.Dur[keys[0]] + df.Dur[keys[1]] + df.Dur[keys[2]]) / 3.
  po_mean = int((df.Ponset[keys[0]] + df.Ponset[keys[1]] + df.Ponset[keys[2]]) / 3)

  s.read_all_files(mat, *read_args, **read_kwargs)
  rect, plan, az, iang = polarization_analysis(s, d_mean, po_mean, plot=False)

  values = (('Rectilinearity', rect), ('Planarity', plan), ('Azimuth', az), ('Incidence', iang))
  for name, value in values:
    # Only touch the columns that were actually requested.
    if name in list_features:
      for c in ('Z', 'N', 'E'):
        # Same column-then-row assignment as the attribute form df.<name>[key] = value.
        df[name][(event_id, 'BOR', c)] = value


def _extract_train_event(df, opt, mat, list_features, i, event_id, event_type):
  """
  Extracts the features of training event number i of class event_type
  for every channel, stores them under index (event_id,'BOR',channel)
  and returns the extended DataFrame.
  """
  counter = 0
  for comp in opt.opdict['channels']:
    counter = counter + 1
    dic = pd.DataFrame(columns=list_features, index=[(event_id, 'BOR', comp)])
    dic['EventType'] = event_type
    dic['Ponset'] = 0

    s = SeismicTraces(mat, comp, train=[i, event_type])
    # Features are only extracted when the trace object carries more than two attributes.
    if len(s.__dict__) > 2:
      dic = _extract_trace_features(opt, s, list_features, dic)
      df = df.append(dic)

  # Polarization is only computed when exactly three channels were looped over.
  if counter == 3 and _needs_polarization(list_features):
    _add_polarization_features(df, s, event_id, comp, list_features, mat, train=[i, event_type])
  return df


def read_data_for_features_extraction(set='test',save=False):
  """
  Extracts the features from the seismic files of the test or training set.

  set: 'test' reads every TestSet/SigEve_<n>.mat file found in opdict['datadir']
       and takes the event type from the label file (row whose Date equals <n>);
       'train' reads TrainingSetPlusSig_2.mat and writes to
       <outdir>/features/<feat_train> instead of the default feature file.
       Any other value extracts nothing.
  save: if set, the pandas DataFrame is written as a .csv file. The program
        exits (sys.exit) if that file already exists.

  Returns nothing.
  """
  from scipy.io import loadmat
  from options import MultiOptions
  opt = MultiOptions()

  if set == 'train':
    opt.opdict['feat_filename'] = '%s/features/%s'%(opt.opdict['outdir'],opt.opdict['feat_train'])
  print(opt.opdict['feat_filename'])

  # Refuse to overwrite an existing feature file.
  if save and os.path.exists(opt.opdict['feat_filename']):
    print("WARNING !! File %s already exists"%opt.opdict['feat_filename'])
    print("Check if you really want to replace it...")
    sys.exit()

  list_features = opt.opdict['feat_list']
  df = pd.DataFrame(columns=list_features)

  if set == 'test':
    datafiles = glob.glob(os.path.join(opt.opdict['datadir'],'TestSet/SigEve_*'))
    # Sort the files by event number (numeric order, not lexicographic order).
    event_numbers = sorted(
      int(os.path.basename(path).split('_')[1].split('.mat')[0]) for path in datafiles)

    tsort = opt.read_csvfile(opt.opdict['label_filename'])
    tsort.index = tsort.Date

    # Positions in the sorted file list (not event numbers) skipped in 'hash' mode.
    # NOTE(review): the reason for skipping them is not visible here - confirm.
    skipped_hash_files = (409,1026,1027,1028,1993,2121,2122,2123,2424,2441,3029,3058,
                          3735,3785,3852,3930,4200,4463,4464,4746,6150,6382,6672,6733)

    for ifile,numfile in enumerate(event_numbers):
      matfile = os.path.join(opt.opdict['datadir'],'TestSet/SigEve_%d.mat'%numfile)
      print("%s %s"%(ifile,matfile))
      mat = loadmat(matfile)

      for counter,comp in enumerate(opt.opdict['channels'],1):
        dic = pd.DataFrame(columns=list_features,index=[(numfile,'BOR',comp)])
        dic['EventType'] = tsort[tsort.Date==numfile].Type.values[0]
        dic['Ponset'] = 0

        s = SeismicTraces(mat,comp)
        # Features are only extracted when the trace object carries more than two attributes.
        if len(s.__dict__) > 2:
          if opt.opdict['option'] == 'hash' and ifile in skipped_hash_files:
            continue
          dic = _extract_trace_features(opt,s,list_features,dic)
          df = df.append(dic)

        # Once the third channel is done, add the polarization attributes of the event.
        if counter == 3 and _needs_polarization(list_features):
          _add_polarization_features(df,s,numfile,comp,list_features,mat,False)

  elif set == 'train':
    datafile = os.path.join(opt.opdict['datadir'],'TrainingSetPlusSig_2.mat')
    mat = loadmat(datafile)

    # Offset added to the VT event numbers; stays 0 if no EB event is processed.
    neb = 0
    for i in range(mat['KurtoEB'].shape[1]):
      print("EB %s"%i)
      # NOTE(review): only EB events 10 and 61 are processed - looks like a
      # leftover debugging filter; confirm before relying on "all files".
      if i not in (10,61):
        continue
      df = _extract_train_event(df,opt,mat,list_features,i,i,'EB')
      neb = i+1

    for i in range(mat['KurtoVT'].shape[1]):
      print("VT %s"%(i+neb))
      # NOTE(review): only VT event 5 is processed - same remark as above.
      if i != 5:
        continue
      df = _extract_train_event(df,opt,mat,list_features,i,i+neb,'VT')

  if save:
    print("Features written in %s"%opt.opdict['feat_filename'])
    df.to_csv(opt.opdict['feat_filename'])
# Example #2
# 0
def _needs_polarization(list_features):
    """Returns True if at least one polarization attribute is requested."""
    return any(
        name in list_features
        for name in ('Rectilinearity', 'Planarity', 'Azimuth', 'Incidence'))


def _extract_trace_features(opt, s, list_features, dic):
    """
    Runs the extractor selected by opt.opdict['option'] on one trace.
    Returns dic unchanged for any option other than 'norm' or 'hash'.
    """
    option = opt.opdict['option']
    if option == 'norm':
        return extract_norm_features(s, list_features, dic)
    if option == 'hash':
        return extract_hash_features(s,
                                     list_features,
                                     dic,
                                     opt.opdict['permut_file'],
                                     plot=False)
    return dic


def _add_polarization_features(df, s, event_id, last_comp, list_features, mat,
                               *read_args, **read_kwargs):
    """
    Computes the polarization attributes of one event and writes every
    requested one into the Z, N and E rows of df (in place).
    Extra arguments are forwarded to s.read_all_files after mat.
    """
    # Mean duration and mean P-onset over the last processed component, E and Z.
    keys = [(event_id, 'BOR', c) for c in (last_comp, 'E', 'Z')]
    d_mean = (df.Dur[keys[0]] + df.Dur[keys[1]] + df.Dur[keys[2]]) / 3.
    po_mean = int(
        (df.Ponset[keys[0]] + df.Ponset[keys[1]] + df.Ponset[keys[2]]) / 3)

    s.read_all_files(mat, *read_args, **read_kwargs)
    rect, plan, az, iang = polarization_analysis(s,
                                                 d_mean,
                                                 po_mean,
                                                 plot=False)

    values = (('Rectilinearity', rect), ('Planarity', plan), ('Azimuth', az),
              ('Incidence', iang))
    for name, value in values:
        # Only touch the columns that were actually requested.
        if name in list_features:
            for c in ('Z', 'N', 'E'):
                # Same column-then-row assignment as the attribute form
                # df.<name>[key] = value.
                df[name][(event_id, 'BOR', c)] = value


def _extract_train_event(df, opt, mat, list_features, i, event_id,
                         event_type):
    """
    Extracts the features of training event number i of class event_type
    for every channel, stores them under index (event_id,'BOR',channel)
    and returns the extended DataFrame.
    """
    counter = 0
    for comp in opt.opdict['channels']:
        counter = counter + 1
        dic = pd.DataFrame(columns=list_features,
                           index=[(event_id, 'BOR', comp)])
        dic['EventType'] = event_type
        dic['Ponset'] = 0

        s = SeismicTraces(mat, comp, train=[i, event_type])
        # Features are only extracted when the trace object carries more
        # than two attributes.
        if len(s.__dict__) > 2:
            dic = _extract_trace_features(opt, s, list_features, dic)
            df = df.append(dic)

    # Polarization is only computed when exactly three channels were looped over.
    if counter == 3 and _needs_polarization(list_features):
        _add_polarization_features(df,
                                   s,
                                   event_id,
                                   comp,
                                   list_features,
                                   mat,
                                   train=[i, event_type])
    return df


def read_data_for_features_extraction(set='test', save=False):
    """
    Extracts the features from the seismic files of the test or training set.

    set: 'test' reads every TestSet/SigEve_<n>.mat file found in
         opdict['datadir'] and takes the event type from the label file
         (row whose Date equals <n>); 'train' reads TrainingSetPlusSig_2.mat
         and writes to <outdir>/features/<feat_train> instead of the default
         feature file. Any other value extracts nothing.
    save: if set, the pandas DataFrame is written as a .csv file. The program
          exits (sys.exit) if that file already exists.

    Returns nothing.
    """
    from scipy.io import loadmat
    from options import MultiOptions
    opt = MultiOptions()

    if set == 'train':
        opt.opdict['feat_filename'] = '%s/features/%s' % (
            opt.opdict['outdir'], opt.opdict['feat_train'])
    print(opt.opdict['feat_filename'])

    # Refuse to overwrite an existing feature file.
    if save and os.path.exists(opt.opdict['feat_filename']):
        print("WARNING !! File %s already exists" %
              opt.opdict['feat_filename'])
        print("Check if you really want to replace it...")
        sys.exit()

    list_features = opt.opdict['feat_list']
    df = pd.DataFrame(columns=list_features)

    if set == 'test':
        datafiles = glob.glob(
            os.path.join(opt.opdict['datadir'], 'TestSet/SigEve_*'))
        # Sort the files by event number (numeric order, not lexicographic).
        event_numbers = sorted(
            int(os.path.basename(path).split('_')[1].split('.mat')[0])
            for path in datafiles)

        tsort = opt.read_csvfile(opt.opdict['label_filename'])
        tsort.index = tsort.Date

        # Positions in the sorted file list (not event numbers) skipped in
        # 'hash' mode.
        # NOTE(review): the reason for skipping them is not visible here -
        # confirm.
        skipped_hash_files = (
            409, 1026, 1027, 1028, 1993, 2121, 2122, 2123,
            2424, 2441, 3029, 3058, 3735, 3785, 3852, 3930,
            4200, 4463, 4464, 4746, 6150, 6382, 6672, 6733,
        )

        for ifile, numfile in enumerate(event_numbers):
            matfile = os.path.join(opt.opdict['datadir'],
                                   'TestSet/SigEve_%d.mat' % numfile)
            print("%s %s" % (ifile, matfile))
            mat = loadmat(matfile)

            for counter, comp in enumerate(opt.opdict['channels'], 1):
                dic = pd.DataFrame(columns=list_features,
                                   index=[(numfile, 'BOR', comp)])
                dic['EventType'] = tsort[tsort.Date == numfile].Type.values[0]
                dic['Ponset'] = 0

                s = SeismicTraces(mat, comp)
                # Features are only extracted when the trace object carries
                # more than two attributes.
                if len(s.__dict__) > 2:
                    if (opt.opdict['option'] == 'hash'
                            and ifile in skipped_hash_files):
                        continue
                    dic = _extract_trace_features(opt, s, list_features, dic)
                    df = df.append(dic)

                # Once the third channel is done, add the polarization
                # attributes of the event.
                if counter == 3 and _needs_polarization(list_features):
                    _add_polarization_features(df, s, numfile, comp,
                                               list_features, mat, False)

    elif set == 'train':
        datafile = os.path.join(opt.opdict['datadir'],
                                'TrainingSetPlusSig_2.mat')
        mat = loadmat(datafile)

        # Offset added to the VT event numbers; stays 0 if no EB event is
        # processed.
        neb = 0
        for i in range(mat['KurtoEB'].shape[1]):
            print("EB %s" % i)
            # NOTE(review): only EB events 10 and 61 are processed - looks
            # like a leftover debugging filter; confirm before relying on
            # "all files".
            if i not in (10, 61):
                continue
            df = _extract_train_event(df, opt, mat, list_features, i, i, 'EB')
            neb = i + 1

        for i in range(mat['KurtoVT'].shape[1]):
            print("VT %s" % (i + neb))
            # NOTE(review): only VT event 5 is processed - same remark as
            # above.
            if i != 5:
                continue
            df = _extract_train_event(df, opt, mat, list_features, i, i + neb,
                                      'VT')

    if save:
        print("Features written in %s" % opt.opdict['feat_filename'])
        df.to_csv(opt.opdict['feat_filename'])