Example #1
def main():
    """
    Creates tables 1, 2 and 3 and loads data into them.
    """
    session, cluster = set_keyspace()
    extract_data()
    df = pd.read_csv('processed_data.csv')

    # create tables
    for query in create_table_queries:
        create_table(session, query)

    # loading data to the tables
    for row in df.values:
        table1_data = tuple(row[i] for i in (0, 9, 5, 8, 3))  # a tuple, like the others
        table2_data = (row[0], row[9], row[8], row[3], row[1] + ' ' + row[4], row[10])
        table3_data = (row[9], row[1] + ' ' + row[4], row[10])
        # load all of them
        load_table(session, table1_insert_query, table1_data)
        load_table(session, table2_insert_query, table2_data)
        load_table(session, table3_insert_query, table3_data)

    test(session)

    # drop_tables(session)

    session.shutdown()
    cluster.shutdown()
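
The helpers used by main() above (set_keyspace, create_table, load_table) are not shown. A minimal sketch of what they might look like, assuming the DataStax cassandra-driver package and a local cluster; the keyspace name is a placeholder, not taken from the source:

from cassandra.cluster import Cluster

# Hypothetical helpers matching the calls in main(); the keyspace name is an assumption.
def set_keyspace():
    cluster = Cluster(['127.0.0.1'])
    session = cluster.connect()
    session.execute(
        "CREATE KEYSPACE IF NOT EXISTS example_ks "
        "WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1}"
    )
    session.set_keyspace('example_ks')
    return session, cluster

def create_table(session, query):
    session.execute(query)

def load_table(session, insert_query, data):
    session.execute(insert_query, data)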
Example #2
    def test_extract_data(self):
        root = join('testdata', 'test_extract_data')
        for fname in os.listdir(root):
            if splitext(fname)[1] != '.ess':
                continue
            with open(join(root, fname) + '.json') as f:
                cdata = json.loads(f.read())
            plugininfo, fidarray, playerdata, playerflags, required_plugins = \
                extract.extract_data(join(root, fname))
            self.assertEqual(cdata['plugininfo'], plugininfo)
            self.assertEqual(cdata['fidarray'], fidarray)
            self.assertEqual(sorted(cdata['required plugins']),
                             sorted(required_plugins))
            self.assertEqual(cdata['flags'], playerflags)
            self.assertEqual(cdata['hair color'], playerdata['hair color'])
            self.assertEqual(cdata['skin color'], playerdata['skin color'])
            self.assertEqual(cdata['head texture'], playerdata['head texture'])
            self.assertEqual(cdata['headparts'], playerdata['headparts'])
            self.assertEqual(cdata['unknown1'], playerdata['unknown1'])
            self.assertEqual(cdata['face morph values'],
                             playerdata['face morph values'])
            self.assertEqual(cdata['faceparts'], playerdata['faceparts'])
            if 'female' in cdata:
                self.assertIn('female', playerdata)
                self.assertEqual(cdata['female'], playerdata['female'])
            if 'race' in cdata:
                self.assertIn('race', playerdata)
                self.assertEqual(cdata['race'], playerdata['race'])
Example #3
  def extract_random_segments_for_given_patient_during_warning(self, segment_label, patient_no):
    # during warning related to AR(l) autocorrelation lag
    current_patient_ = patient_list[patient_no]
    patient_ann_ = current_patient_[:-4] + '-nsrr.xml'
    ann_, onset_, duration_ = extract_anns(TEST_ANN_PATH + patient_ann_)
    eeg_dict_, info_dict_ = extract_data(TEST_DATA_PATH + current_patient_, ann_, onset_)
    return eeg_dict_[segment_label][np.random.choice(len(eeg_dict_[segment_label]))]
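
Several of the EEG examples in this collection assume the project helpers extract_anns and extract_data. Their implementations are not shown; the signatures below are only inferred from the call sites (stub bodies, hypothetical parameter names):

# Inferred signatures only; the real helpers parse NSRR annotation XML
# and slice the EDF recording into labeled segments.
def extract_anns(ann_path):
    """Return (annotations, onsets, durations) for one NSRR annotation file."""
    ...

def extract_data(edf_path, ann, onset, end_time=None, preprocess=None, return_stats=False):
    """Return a dict mapping sleep-stage label -> list of segments
    (plus an info/stats object, depending on the call site)."""
    ...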
Example #4
  def extract_test_segments_for_given_patient(self, patient_no):  # helper
    current_patient = patient_list[patient_no]
    patient_ann = current_patient[:-4] + '-nsrr.xml'
    ann, onset, duration = extract_anns(TEST_ANN_PATH + patient_ann)
    eeg_dict, info_dict = extract_data(TEST_DATA_PATH + current_patient, ann, onset, duration[-1])

    len_dict = {}
    for i in eeg_dict.keys():
      len_dict[i] = len(eeg_dict[i])

    selected_tuples = []
    for _ in range(excess_segments_needed_per_patient):
      for i in eeg_dict.keys():
        if i == 4:
          continue
        if len_dict[i] != 0:
          selected_tuples.append((int(i), eeg_dict[i][np.random.choice(len(eeg_dict[i]))]))

    print(f"RAM: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024} MB")

    for t in selected_tuples:
      yield t
Example #5
    def get_ref_dicts(self):

        # indices of fit dicts, found beforehand; the FitDictFinder class can
        # be used to obtain the desired dicts
        fit_ref_list = [3, 10, 19, 21, 22, 26, 28, 33, 36, 38, 40, 42, 43, 51, 52, 53, 55, 56, 57, 60, 62,
                        64, 70, 72, 73, 82, 83, 85, 87, 89, 90, 93, 95, 101, 103, 107, 109, 113, 115, 116,
                        117, 123, 130, 131, 135, 138, 139, 141, 148, 155, 156, 157, 159]
        selected_ref_indices = list(
            np.random.choice(fit_ref_list,
                             size=self.num_patients,
                             replace=False))

        for i in selected_ref_indices:
            ref = patient_list[i]
            ann_ref = ref[:-4] + '-nsrr.xml'

            ann, onset, duration = extract_anns(TRAIN_ANN_PATH + ann_ref)
            preprocess = 'std'
            eeg_dict = extract_data(TRAIN_DATA_PATH + ref,
                                    ann,
                                    onset,
                                    duration[-1],
                                    preprocess=preprocess)

            len_dict = {}
            for label in eeg_dict.keys():
                len_dict[label] = len(eeg_dict[label])

            print(len_dict)
            print(ref)
            print(ann_ref)

            yield eeg_dict
Example #7
    def extract_random_segments_for_given_patient_during_warning(
            self, segment_label,
            patient_no):  # during warning related to AR(l) autocorrelation lag

        current_patient_ = self.patient_list[patient_no]
        print(f'\nCurrent Patient: {current_patient_}')
        patient_ann_ = current_patient_[:-4] + '-nsrr.xml'
        ann_, onset_, duration_ = extract_anns(self.ann_path + patient_ann_)
        eeg_dict_, info_dict_ = extract_data(self.data_path + current_patient_,
                                             ann_, onset_, duration_[-1])
        # Note: np.random.choice(len - 1) never samples the final segment
        # (and assumes at least two segments are available).
        return (int(segment_label), eeg_dict_[segment_label][np.random.choice(
            len(eeg_dict_[segment_label]) - 1)])
Example #8
    def extract_random_segments_for_given_patient(
            self, patient_no, num_segs_chosen_per_patient):  #helper

        current_patient = self.patient_list[patient_no]
        print(f'\nCurrent Patient: {current_patient}')
        patient_ann = current_patient[:-4] + '-nsrr.xml'
        ann, onset, duration = extract_anns(self.ann_path + patient_ann)
        eeg_dict, info_dict = extract_data(self.data_path + current_patient,
                                           ann, onset, duration[-1])
        len_dict = {}

        for i in eeg_dict.keys():
            len_dict[i] = len(eeg_dict[i])

        print(len_dict)

        tuples = []  # all (label, segment)
        for label in eeg_dict.keys():
            for seg in range(len_dict[label]):
                tuples.append((int(label), eeg_dict[label][seg]))

        random.shuffle(tuples)

        selected_tuples = []
        for _ in range(num_segs_chosen_per_patient):
            # popping after shuffling is equivalent to sampling randomly
            selected_tuples.append(tuples.pop())

        del tuples

        for t in selected_tuples:
            yield t
Example #9
    def __init__(self):
        data, target = extract_data()

        # splits the data into training and testing
        X_train, X_test, Y_train, Y_test = \
            train_test_split(data, target, test_size=0.3)

        clf = RandomForestClassifier(n_estimators=500)

        clf.fit(X_train, Y_train)

        y_pred = clf.predict(X_test)

        print("Accuracy: ", metrics.accuracy_score(Y_test, y_pred))
Example #10
def write_final(dirname, work, final, extract_methods):
    df = extract_data(work)

    if 'csv' in extract_methods:
        csv = os.path.join(final, dirname + ".csv")
        df.to_csv(csv, index=False, header=True)
        print("\tSUCCESS: Extracted data from .out file. CSV written to ./final/%s.csv" % dirname)

    if 'sqlite3' in extract_methods:
        db_path = os.path.join(final, "data.db")
        conn = sqlite3.connect(
            db_path, timeout=10)  # 10-second timeout to avoid write deadlock
        try:
            # append via DataFrame.to_sql (modern replacement for the
            # long-removed pandas sqlio.write_frame used in the original)
            df.to_sql('trees_fvsaggregate', conn, if_exists='append', index=False)
        except sqlite3.IntegrityError as e:
            if str(e).endswith("are not unique"):
                # drop the conflicting rows and rerun
                cursor = conn.cursor()

                delete_sql = """DELETE FROM trees_fvsaggregate
                  WHERE var = '%(var)s'
                  AND rx = %(rx)d
                  AND cond = %(cond)d
                  AND site = %(site)d
                  AND climate = '%(climate)s'
                """ % df.iloc[0]  # assume the dataframe has the same data

                res = cursor.execute(delete_sql)
                if res.rowcount > 0:
                    print("\tNOTICE : Deleting %d old rows from ./final/data.db" % res.rowcount)

                # try again
                df.to_sql('trees_fvsaggregate', conn, if_exists='append', index=False)

            else:
                # something else went wrong
                conn.rollback()
                raise

        conn.commit()
        conn.close()
        print("\tSUCCESS: Extracted data from .out file. Row appended to ./final/data.db")
Example #11
    def __init__(self, depth=10, min_leaf=10, num_trees=10):
        data, target = extract_data()

        # splits the data into training and testing
        X_train, X_test, Y_train, Y_test = \
            train_test_split(data, target, test_size=0.3)

        self.max_depth = depth
        self.min_leaf = min_leaf
        self.n_features = int(sqrt(len(data[0]) - 1))
        self.num_trees = num_trees

        self.combine_lists(X_train, Y_train)
        self.combine_lists(X_test, Y_test)

        self.generate_random_forest(X_train, Y_train, X_test, Y_test)
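
combine_lists is not shown. Since its return value is ignored and n_features is computed as sqrt(len(data[0]) - 1), it presumably appends each target to its feature row in place; a hypothetical sketch:

# Hypothetical: mutate each feature row so its last element is the label,
# matching the "last column is the target" convention implied by len(data[0]) - 1.
def combine_lists(self, rows, labels):
    for row, label in zip(rows, labels):
        row.append(label)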
Example #12
def scrape_world_select():
    dt = datetime.now()
    response = get_osrs_world_select()
    status_code = response.status_code

    if response.ok:
        world_data, total_player_data = extract_data(response)

        world_data, total_player_count = transform_data(
            world_data, total_player_data, dt)

        load_data(world_data, total_player_count)
    else:
        print('Bad Response - HTTP', status_code)

    update_logs(dt, status_code)
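
get_osrs_world_select is not shown. A minimal sketch, assuming the requests library; the world-select URL here is an assumption rather than something taken from the source:

import requests

# Hypothetical fetch helper; the URL is a placeholder.
WORLD_SELECT_URL = 'https://oldschool.runescape.com/slu'

def get_osrs_world_select():
    """Fetch the OSRS world-select page and return the raw HTTP response."""
    return requests.get(WORLD_SELECT_URL, timeout=10)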
Example #13
def specific_patient_dataset(patient_no):
    current_patient = patient_list[patient_no]
    patient_ann = current_patient[:-4] + '-nsrr.xml'
    ann, onset, duration = extract_anns(TRAIN_ANN_PATH + patient_ann)
    eeg_dict, info_dict = extract_data(TRAIN_DATA_PATH + current_patient, ann,
                                       onset)
    len_dict = {}
    eeg_dict_list = []

    for i in eeg_dict.keys():
        len_dict[i] = len(eeg_dict[i])
        if i == 4 and len_dict[i] != 0:
            print(
                f"{len_dict[i]} keys of segment 4 in patient no: {patient_no}")
        random.shuffle(eeg_dict[i])
    eeg_dict_list.append(eeg_dict)
    return eeg_dict_list, len_dict
Example #14
    def extract_random_segments_for_given_patient(self, patient_no,
                                                  num_segs_chosen):  # helper

        current_patient = patient_list[patient_no]
        patient_ann = current_patient[:-4] + '-nsrr.xml'
        ann, onset, duration = extract_anns(TRAIN_ANN_PATH + patient_ann)
        eeg_dict, info_dict = extract_data(TRAIN_DATA_PATH + current_patient,
                                           ann, onset, duration[-1])
        len_dict = {}
        labels_available = []

        for i in eeg_dict.keys():
            len_dict[i] = len(eeg_dict[i])
            if len_dict[i] != 0:
                labels_available.append(i)

        tuples = []  #all (label, segment)
        for label in eeg_dict.keys():
            for seg in range(len_dict[label]):
                tuples.append((int(label), eeg_dict[label][seg]))

        random.shuffle(tuples)

        selected_tuples = []
        for i in range(num_segs_chosen):
            # popping after shuffling is equivalent to sampling randomly
            selected_tuples.append(tuples.pop())
            print(
                f"{i}th segment chosen of label {selected_tuples[i][0]} from patient {patient_no}"
            )
        #print(f"RAM: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024} MB")
        del tuples
        print(
            f"RAM: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024} MB"
        )
        process = psutil.Process(os.getpid())
        print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), \
              " | Proc size: " + humanize.naturalsize( process.memory_info().rss))

        for t in selected_tuples:
            yield t
Example #15
    def get_fit_dict_indices(self):
        fit_dict_indices = []

        for i in range(len(patient_list)):
            current_patient = patient_list[i]
            patient_ann = current_patient[:-4] + '-nsrr.xml'
            ann, onset, duration = extract_anns(TRAIN_ANN_PATH + patient_ann)
            preprocess = 'std'
            eeg_dict, info_dict = extract_data(TRAIN_DATA_PATH +
                                               current_patient,
                                               ann,
                                               onset,
                                               duration[-1],
                                               preprocess=preprocess)
            flag = self.is_dict_fit(eeg_dict)
            print(i, flag)
            if flag:
                fit_dict_indices.append(i)
                print(fit_dict_indices)

        return fit_dict_indices
Example #16
def buildDB():

    # clean the data file at private/filename
    noisy_file = os.path.join(request.folder, 'private', '027__BTECH__6TH SEM.txt')
    clean_file = os.path.join(request.folder, 'private', 'cleanFile.txt')
    preprocess.cleanData(noisy_file, clean_file)

    # extract college information
    lines = preprocess.readFile(clean_file)
    colleges = extract.extract_data(lines)

    # insert into table colleges
    for name in colleges.keys():
        db.colleges.insert(name=name)

    # insert into table students
    for name in colleges.keys():
        for student in colleges[name].students:

            credits = (4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1)
            total_credits = sum(credits)

            # credit-weighted total of the 11 subject marks
            percentage = sum(credits[i] * int(student.marks[i])
                             for i in range(len(credits)))
            percentage = (percentage * 1.0) / total_credits

            if percentage < 50.0:
                continue

            collegeid = db(db.colleges.name == name).select()[0]['id']
            db.students.insert(colleges_id=collegeid, rollNo=student.rollNo[1:], name=student.name,
                               subj1=int(student.marks[0]), subj2=int(student.marks[1]), subj3=int(student.marks[2]),
                               subj4=int(student.marks[3]), subj5=int(student.marks[4]), subj6=int(student.marks[5]),
                               subj7=int(student.marks[6]), subj8=int(student.marks[7]), subj9=int(student.marks[8]),
                               subj10=int(student.marks[9]), subj11=int(student.marks[10]), percentage=percentage)

    return 'db built'
Example #18
    def extract_random_segments_for_given_patient_during_warning(
            self, segment_label,
            patient_no):  # during warning related to AR(l) autocorrelation lag

        current_patient_ = patient_list[patient_no]
        patient_ann_ = current_patient_[:-4] + '-nsrr.xml'
        ann_, onset_, duration_ = extract_anns(TRAIN_ANN_PATH + patient_ann_)
        eeg_dict_, info_dict_ = extract_data(
            TRAIN_DATA_PATH + current_patient_, ann_, onset_, duration_[-1])
        len_dict = {}
        labels_available = []

        for i in eeg_dict_.keys():
            len_dict[i] = len(eeg_dict_[i])
            if len_dict[i] != 0:
                labels_available.append(i)
        # note: the passed segment_label is overridden by a randomly chosen
        # available label, and the middle segment of that label is returned
        segment_label = np.random.choice(labels_available)
        return (int(segment_label),
                eeg_dict_[segment_label][(len(eeg_dict_[segment_label]) - 1) //
                                         2])
Example #19
    def extract_test_segments_for_given_patient(self, patient_no):  #helper

        current_patient = patient_list[patient_no]
        patient_ann = current_patient[:-4] + '-nsrr.xml'
        ann, onset, duration = extract_anns(TEST_ANN_PATH + patient_ann)
        preprocess = None  #no-preprocessing
        eeg_dict, stat = extract_data(TEST_DATA_PATH + current_patient,
                                      ann,
                                      onset,
                                      duration[-1],
                                      preprocess=preprocess,
                                      return_stats=True)

        len_dict = {}
        for i in eeg_dict.keys():
            len_dict[i] = len(eeg_dict[i])

        selected_tuples = []

        for i in eeg_dict.keys():
            if len_dict[i] != 0:
                # label 1 gets up to 11 segments, every other label up to 10
                max_segs = 11 if i == 1 else 10
                seg_indices = np.random.choice(len(eeg_dict[i]),
                                               min(max_segs, len(eeg_dict[i])),
                                               replace=False)
                for j in seg_indices:
                    selected_tuples.append((int(i), eeg_dict[i][j]))

        for t in selected_tuples:
            yield t, stat
Example #20
  def extract_random_segments_for_given_patient(self, patient_no, num_segs_chosen_per_patient):  # helper

    current_patient = self.patient_list[patient_no]  
    patient_ann = current_patient[:-4] + '-nsrr.xml'
    ann, onset, duration = extract_anns(self.ann_path + patient_ann)
    preprocess = None  #getting un-preprocessed segments
    eeg_dict, stat = extract_data(self.data_path + current_patient, ann, onset, duration[-1], preprocess=preprocess, return_stats=True)
    self.stats.append(stat)

    len_dict = {}
    for i in eeg_dict.keys(): 
      len_dict[i] = len(eeg_dict[i])

    selected_tuples = []
    labels = []
    tuples = []    #all (label, segment)

    for label in [1]:
      for seg in range(len_dict[label]): 
        selected_tuples.append((int(label), eeg_dict[label][seg]))
        labels.append(label)

    for label in [0, 2, 3, 4]:
      for seg in range(len_dict[label]): 
        tuples.append((int(label), eeg_dict[label][seg]))

    random.shuffle(tuples)

    for _ in range(num_segs_chosen_per_patient - len(selected_tuples)):
      t = tuples.pop()
      selected_tuples.append(t)
      labels.append(t[0])

    del tuples

    self.segs_global.extend(labels)
    self.data_list.extend(selected_tuples)
    del selected_tuples
Example #21
def main():
    result = False
    time = sys.argv[1]

    for value in LOAD_SURVEY_TYPES.values():
        result = load_data(value, time)
        if not result:
            print("\nExecution of Data Load of " + value + " Failed")
            break

    if result:
        print("\nExecution of Data Load Successful")
        result = procedure_execute(time)
        if result:
            print("\nExecution of Stored Procedures Successful")
            result = extract_data(time)
            if result:
                print("\nExecution of Data Extract Successful")
                print("\nExecution for " + time + " is Completed")
            else:
                print("\nExecution of Data Extract Failed")
        else:
            print("\nExecution of Stored Procedures Failed")
Example #22
# If the DenseNet features were pre-computed, we don't have to recompute them
load_densenet_features = os.path.isfile(densenet_features_path)
load_frames = os.path.isdir(frames_path)


## EXTRACTION ##

# If not already done, we extract the relevant frames from the raw MUG dataset
if not load_frames:
    extract_frames(subjects_path, frames_path)


# Now we extract the training and target data from the frames
if not load_densenet_features:
    x, y = extract_data(frames_path)
else:
    with open(y_data_path, 'rb') as f:
        y = pickle.load(f)


if load_densenet_features:
    with open(densenet_features_path, 'rb') as f:
        densenet_features = pickle.load(f)
else:
    # Create DenseNet model
    densenet = DenseNet121(include_top=False, input_shape=(img_size, img_size, nb_channels))
    densenet = Model(inputs=densenet.input,
                     outputs=GlobalAveragePooling2D(name='avg_pool')(densenet.output))

    # Extract the DenseNet features
    densenet_features = extract_model_features(densenet, x)
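
extract_model_features is not defined in this excerpt. Assuming the features are simply a forward pass of the frames through the truncated DenseNet, a minimal sketch:

# Hypothetical helper: run the frames through the pooled DenseNet
# and return the resulting feature vectors.
def extract_model_features(model, x, batch_size=32):
    return model.predict(x, batch_size=batch_size)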