Example #1
0
def index():
    """Render the main page with the drought-rate plot and the station map.

    Only rows of ``drought`` whose ``country_name`` is 'Czech Republic' are
    shown. Each figure receives its own ``TapTool`` with an ``OpenURL``
    callback pointing at the station-detail URL, and is then split into a
    script/div pair by ``components`` for the ``main_page.html`` template.
    """
    czech_stations = drought >> mask(X.country_name == 'Czech Republic')
    colored = utils.drought_add_facecolor(czech_stations)
    # "@station" is left as-is in the URL; presumably it is substituted from
    # the tapped glyph's data on the client side -- confirm against the plots.
    detail_url = "/station_detail/station_id=@station"

    # Build each figure and attach its tap tool before building the next one,
    # in the same order as before: rate plot first, then the map.
    figures = []
    for make_figure in (utils.drought_rate_plot, utils.drought_map):
        figure = make_figure(colored)
        figure.add_tools(TapTool(callback=OpenURL(url=detail_url)))
        figures.append(figure)

    (script_drought, div_drought), (script_map, div_map) = [
        components(figure) for figure in figures
    ]

    template_context = dict(
        stations=czech_stations,
        js_resources=js_resources,
        css_resources=css_resources,
        script_drought=script_drought,
        script_map=script_map,
        div_drought=div_drought,
        div_map=div_map,
    )
    return render_template('main_page.html', **template_context)
	def build(self):
		"""Render the Training Explorer page.

		Draws the sidebar controls, stores the chosen action in
		``self.action_to_do`` and the entered identifier in ``self.a_number``,
		sets the matching flag (``view_skills``, ``add_skill`` or ``overall``)
		to True, and renders the view for every flag that is set, provided
		an identifier was entered. Finally shows ``self.logo`` and
		``self.today`` in the sidebar.

		Changes compared with the previous version:

		* Flags that were never set are read with ``getattr(..., False)``
		  instead of relying on a blanket ``except AttributeError: pass``,
		  which also hid unrelated errors (for example a missing column).
		* ``st.multi_select`` does not exist; ``st.multiselect`` is used, so
		  the "finished courses" widget is actually rendered.
		* The "no courses selected" error is reported when 'Accept' is
		  pressed with an empty selection, not before the button is pressed.
		"""
		st.title('Training Explorer')

		st.sidebar.text('Options')
		self.action_to_do = st.sidebar.selectbox('What would you like to do?', ('View Skills', 'Add New Skill', 'View Overall Performance'))
		self.a_number = st.sidebar.text_input('Enter employee number or name')
		# Remember which action was requested; flags are only ever switched on here.
		if self.action_to_do == 'View Skills':
			self.view_skills = True
		elif self.action_to_do == 'Add New Skill':
			self.add_skill = True
		elif self.action_to_do == 'View Overall Performance':
			self.overall = True

		has_query = len(self.a_number) > 0

		if getattr(self, 'view_skills', False) and has_query:
			q_ = (self.database >> df.mask(self.database.EmployeeNum==self.a_number))
			try:
				# unique()[0] raises IndexError when no row matched the query.
				emp_name = q_['ResourceName'].unique()[0]
				a_number = q_['EmployeeNum'].unique()[0]
				bu = q_['Capability'].unique()[0]
			except IndexError:
				st.write('Error: value entered not found in database')
			else:
				courses = q_[['CourseName', 'Status']]
				courses.index = np.arange(0, q_.shape[0])

				st.header('Employee Information')
				self.employee_im = np.array(Image.open('blank_profile.png'))
				st.image(cv.resize(self.employee_im, (150,150), cv.INTER_CUBIC), use_column_width=False)
				st.subheader('Name: {}'.format(emp_name))
				st.write('Employee Number: {}'.format(a_number))
				st.write('Capability: {}'.format(bu))
				st.write('Courses:')
				st.table(courses)

				# Count the employee's courses per status.
				total_courses = len(q_['Status'])
				completed = len(q_ >> df.mask(q_.Status=='completed'))
				in_progress = len(q_ >> df.mask(q_.Status=='in-progress'))
				started = len(q_ >> df.mask(q_.Status=='registered-but-not-started'))

				st.write('total courses: {}'.format(total_courses))
				st.write('completed: {}'.format(completed))
				st.write('in progress: {}'.format(in_progress))
				st.write('started: {}'.format(started))

				st.write('Have you finished any courses?')
				# TODO(review): the option list is empty, as in the original
				# call; presumably it should offer the employee's courses.
				st.multiselect('', ())

		if getattr(self, 'add_skill', False) and has_query:
			st.write('Adding new skill for {}'.format(self.a_number))
			course_list = tuple(self.database['CourseName'].unique())
			self.courses_chosen = st.multiselect('The following are available', course_list)
			st.subheader('You have chosen the following courses')
			for course in self.courses_chosen:
				st.write(course)
			if st.button('Accept'):
				if len(self.courses_chosen) > 0:
					self.register_new_courses(self.a_number, self.courses_chosen)
				else:
					st.write('Error: You did not select any courses')

		if getattr(self, 'overall', False) and has_query:
			st.write('Viewing Overall Perfomance')
			st.write(self.database)

		st.sidebar.image(self.logo, use_column_width=True)
		st.sidebar.text(self.today)
"""Extracts source data for one year and station to CSV file for manual inspection."""

import utils
import logging
import constants
from dfply import X, mask

# Extraction parameters: which year/station to pull and where files live.
year = 2019
station = 'SPE00120323'
by_year_path = '../../data/by_year'
outfile = '../../data/manual_inspection/extract.csv'
filename = f"{year}.csv.gz"

# Send DEBUG-level messages to a stream handler using the project log format.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format=constants.logfmt,
    level=logging.DEBUG,
)
logging.debug(f"extraction started for year={year} and station={station}")

# Load the whole year, then narrow it down step by step, logging the number
# of remaining rows after each step.
df = utils.df_file(filename, by_year_path)
logging.debug(f"{len(df)} rows extracted for year={year}")

df_station = df >> mask(X.station == station)
logging.debug(f"{len(df_station)} rows for station={station}")

df_prcp = df_station >> mask(X.element == 'PRCP')
logging.debug(f"{len(df_prcp)} rows with precipitation element")

# Write the filtered rows without the index column.
df_prcp.to_csv(outfile, index=False)
logging.debug(f"{len(df_prcp)} rows saved to {outfile}")
Example #4
0
    def create_fmap(mapping, similarity, angle, ranked=ranked, dims=dims):
        """Place every feature on an integer (x, y, z) grid.

        Steps, in order:

        1. Combine the feature names (``similarity.index``) with the first
           three columns of ``mapping`` as ``dim1``, ``dim2`` and ``dim3``.
        2. If ``ranked``, replace ``dim1`` and ``dim2`` by their ranks.
        3. Spread the features evenly over ``1..dims`` along ``dim1`` and
           ``dim2`` (``resize_x``, ``resize_y``).
        4. Rotate those two columns with ``rotate_2_col_mat`` by ``angle``
           and spread the rotated coordinates over ``1..dims`` again,
           giving ``x`` and ``y``.
        5. Order by ``dim3`` and number the features sharing one (x, y)
           cell as ``z = 1, 2, ...``.

        :param mapping: data frame whose first three columns hold the
            feature coordinates, row-aligned with ``similarity.index``.
        :param similarity: data frame whose index lists the feature names.
        :param angle: rotation angle passed to ``rotate_2_col_mat``.
        :param ranked: replace ``dim1``/``dim2`` by ranks before gridding.
        :param dims: largest grid coordinate along x and y.
        :return: data frame indexed by feature with columns ``x``, ``y``
            and ``z``, arranged by ``z``, ``y``, ``x``.

        Changes compared with the previous version: ``np.round_`` and
        ``DataFrame.append`` (removed in NumPy 2.0 / pandas 2.0) are
        replaced by ``np.round`` and ``pd.concat``, and the z-numbering loop
        now walks the distinct (x, y) cells themselves. It used to look up
        the rows labelled ``0..n_cells-1``, which could visit one cell twice
        and skip another whenever several features shared a cell.
        """
        data = pd.DataFrame.from_dict({
            'feature': similarity.index.values.tolist(),
            'dim1': mapping.iloc[:, 0].tolist(),
            'dim2': mapping.iloc[:, 1].tolist(),
            'dim3': mapping.iloc[:, 2].tolist(),
        })

        if ranked:
            data = data >> arrange(
                X.dim1) >> mutate(dim1=np.arange(data.shape[0]) + 1)
            data = data >> arrange(
                X.dim2) >> mutate(dim2=np.arange(data.shape[0]) + 1)

        # Spread the features evenly over 1..dims along each axis.
        data = data >> arrange(X.dim1)
        data = data >> mutate(
            resize_x=np.round(np.linspace(1, dims, data.shape[0])))
        data.resize_x = data.resize_x.astype(int)
        data = data >> arrange(X.dim2)
        data = data >> mutate(
            resize_y=np.round(np.linspace(1, dims, data.shape[0])))
        data.resize_y = data.resize_y.astype(int)

        # Rotate the gridded coordinates, then grid the rotated ones again.
        rotated = pd.DataFrame.from_dict({
            'rot_x': data.resize_x,
            'rot_y': data.resize_y
        })
        rotated = rotate_2_col_mat(rotated, angle)
        data = data >> bind_cols(rotated)
        del rotated

        data = data >> arrange(X.rot_x)
        data = data >> mutate(x=np.round(np.linspace(1, dims, data.shape[0])))
        data.x = data.x.astype(int)

        data = data >> arrange(X.rot_y)
        data = data >> mutate(y=np.round(np.linspace(1, dims, data.shape[0])))
        data.y = data.y.astype(int)

        data = data >> arrange(X.dim3)

        # Number the features inside every occupied (x, y) cell in dim3
        # order; that running number becomes the z coordinate.
        cells = (data >> select(X.x, X.y)).drop_duplicates()
        stacked = []
        for cell_x, cell_y in zip(cells['x'].tolist(), cells['y'].tolist()):
            members = data >> mask(X.x == cell_x)
            members = members >> mask(X.y == cell_y)
            members = members >> mutate(
                z=(np.arange(members.shape[0]) + 1).tolist())
            stacked.append(members)
        data = pd.concat(stacked)
        del cells, stacked

        # Keep only features listed in similarity.index, in that order.
        data2a = similarity.index.values
        data2b = data >> mask(data.feature.isin(data2a))
        data2a = pd.DataFrame.from_dict({'feature': data2a})
        data2a = data2a >> mask(
            data2a.feature.isin(data2b['feature'].to_numpy()))
        data = data2a >> left_join(data2b, by='feature')
        del data2a, data2b

        data = data.set_index('feature')
        data = data >> select(X.x, X.y, X.z)
        data = data >> arrange(X.z, X.y, X.x)

        return data
Example #5
0
# Mapping from the class labels stored in others.txt to pandas dtypes;
# any other label is read as 'object'.
_CLASS_TO_DTYPE = {
    'numeric': 'float64',
    'integer': 'int64',
    'factor': 'category',
}


def _parse_others(text):
    """Split the content of others.txt into ``{section name: section body}``.

    A line starting with '>>' opens a section; its name is the line with
    every '>>' removed. The body is all following lines up to the next
    header, joined by single spaces. Lines before the first header are
    ignored; a repeated section name keeps the last body.
    """
    sections = {}
    current = None
    for line in re.split('\n', text):
        if re.search('^>>', line):
            current = re.sub('>>', '', line)
            sections[current] = []
        elif current is not None:
            sections[current].append(line)
    return {name: ' '.join(body) for name, body in sections.items()}


def _dtype_map(names, classes):
    """Return ``{column name: pandas dtype}`` for parallel name/class lists."""
    return {
        names[i]: _CLASS_TO_DTYPE.get(label, 'object')
        for i, label in enumerate(classes)
    }


def _na_to_nan(items):
    """Return *items* as a list with every 'NA' string replaced by NaN."""
    return [np.nan if item == 'NA' else item for item in items]


def _load_extracted(path):
    """Build the ExpressionSet from the files extracted into *path*."""
    with open(path + '/others.txt', 'r') as f:
        others = _parse_others(f.read())

    sample_names = re.split('\\s', others['sampleNames'])
    feature_names = re.split('\\s', others['featureNames'])

    # Expression table: rows are feature positions, columns are samples.
    adata = pd.read_csv(path + '/exprs.csv', names=sample_names)
    adata.index = feature_names

    # Phenotype table and its column descriptions.
    pdata_names = re.split('\\s', others['varLabels'])
    pdata_dtype = re.split('\\s', others['varClass'])
    pdata = pd.read_csv(path + '/pData.csv',
                        names=pdata_names,
                        dtype=_dtype_map(pdata_names, pdata_dtype))
    pdata.index = sample_names
    pmetadata = pd.DataFrame(
        _na_to_nan(re.split('\\s', others['varMetadata'])),
        index=pdata_names,
        columns=['labelDescription'])
    pdata = AnnotatedDataFrame(pdata, pmetadata)

    # Feature table, indexed by position id.
    fdata_names = re.split('\\s', others['fvarLabels'])
    fdata_dtype = re.split('\\s', others['fvarClass'])
    fdata = pd.read_csv(path + '/fData.csv',
                        names=fdata_names,
                        dtype=_dtype_map(fdata_names, fdata_dtype))
    fdata.index = feature_names
    fdata.index.name = 'pos_id'

    sim_names = re.split('\\s', others['simNames'])
    similarity = pd.read_csv(path + '/similarity.csv',
                             names=sim_names,
                             dtype={name: 'float64' for name in sim_names})
    similarity.index = sim_names

    # Position ids are split on 'x', 'y' and 'z'; elements 1..3 of each
    # split are the coordinates. The largest coordinate per axis gives the
    # grid size. Converting to int up front also covers a single column,
    # where the values previously stayed strings and broke the reshape.
    ontomap = adata.transpose()
    coords = [re.split('x|y|z', name) for name in ontomap.columns.values]
    dim = [None] + [max(int(c[axis]) for c in coords) for axis in (1, 2, 3)]
    ontomap = ontomap.to_numpy()
    ontomap = ontomap.reshape(ontomap.shape[0] * ontomap.shape[1])
    ontomap = np.array(ontomap)
    ontomap = ontomap.reshape(adata.shape[1], dim[2], dim[1], dim[3])

    # One table of member features (with x, y, z) per non-'feature' column
    # of the feature table, holding the rows flagged with 1 in that column.
    ontotype = {}
    data0 = fdata >> select(~X.feature)
    for i in np.arange(fdata.shape[1] - 1):
        data = data0.iloc[:, i]
        data = data.reset_index(inplace=False)
        data = data.rename(columns={data.columns.values[1]: 'ontotype'})
        data = data >> mask(X.ontotype == 1)
        data = data >> left_join(fdata.reset_index(inplace=False),
                                 var='pos_id')
        data = data >> select(X.pos_id, X.feature)

        data2 = data >> separate(X.pos_id, ['a', 'x', 'y', 'z'], sep='x|y|z')
        data2 = data2[['feature', 'x', 'y', 'z']].set_index('feature')

        ontotype[data0.columns.values[i]] = data2

    # 'root' lists every position that carries a feature.
    ontotype['root'] = fdata.reset_index(inplace=False) >> select(
        X.pos_id, X.feature)
    ontotype['root'] = ontotype['root'] >> mutate(f_str=X.feature.astype(str))
    ontotype['root'] = ontotype['root'] >> mask(
        X.f_str != 'nan') >> select(~X.f_str)
    ontotype['root'] = ontotype['root'] >> separate(
        X.pos_id, ['a', 'x', 'y', 'z'], sep='x|y|z')
    ontotype['root'] = ontotype['root'][['feature', 'x', 'y',
                                         'z']].set_index('feature')

    fmetadata = pd.DataFrame(
        _na_to_nan(re.split('\\s', others['fvarMetadata'])),
        index=fdata_names,
        columns=['labelDescription'])
    fdata.index.name = None
    fdata = AnnotatedDataFrame(fdata, fmetadata)

    ontology_names = re.split('\\s', others['ontoNames'])
    ontology_dtype = re.split('\\s', others['ontoClass'])
    ontology = pd.read_csv(path + '/ontology.csv',
                           names=ontology_names,
                           dtype=_dtype_map(ontology_names, ontology_dtype))

    xData = MIAME(name=others['name'],
                  lab=others['lab'],
                  contact=others['contact'],
                  title=others['title'],
                  abstract=others['abstract'],
                  url=others['url'],
                  pubMedIds=others['pubMedIds'],
                  other={
                      'similarity': similarity,
                      'ontomap': ontomap,
                      'ontotype': ontotype,
                      'ontology': ontology
                  })

    # An annotation that starts with spaces is treated as empty; otherwise
    # runs of spaces are collapsed to one.
    if re.match('^( +)', others['annotation']):
        annot = ''
    else:
        annot = re.sub(' +', ' ', others['annotation'])

    return ExpressionSet(assayData=adata.to_numpy(),
                         phenoData=pdata,
                         featureData=fdata,
                         experimentData=xData,
                         annotation=annot)


def read(path):
    """
    Read a .ts.tar.gz file to a TidySet

    This function reads multiple files archived by tar with gzip compression
    to a TidySet. The archive is extracted into a temporary directory named
    after the archive without its '.ts.tar.gz' suffix; that directory is
    removed again before returning, also when reading fails.

    :param path: A character of .ts.tar.gz file path (include file extension).
    :return: output A TidySet, an ExpressionSet with three tables. Function of
        write_ts_tar_gz can write this file from the TidySet.
    :raises FileExistsError: if the extraction directory already exists
        (nothing is deleted in that case).
    """
    import shutil

    filename = path
    # Strip only a literal, trailing '.ts.tar.gz' (the unescaped pattern
    # also matched other characters and positions).
    path = re.sub(r'\.ts\.tar\.gz$', '', filename)
    # Must fail on an existing directory so the cleanup below can never
    # remove something this call did not create.
    os.mkdir(path)
    try:
        with tarfile.open(filename) as tar:
            # NOTE(review): extractall trusts member paths inside the
            # archive; only use this on archives from a trusted source.
            tar.extractall(path)
        return _load_extracted(path)
    finally:
        shutil.rmtree(path)
Example #6
0
def compile(value,
            outcome,
            similarity,
            mapping,
            ontology,
            ranked=True,
            dims=7,
            decreasing=False,
            seed_num=33):
    """
    Make a TidySet for visible neural network (VNN) modeling

    This function creates a TidySet, an ExpressionSet class to orchestrate five
    data into a single set of three tables.

    :param value: Instance-feature value, a pandas data frame with rows for
        instances and columns for features. All rows in value should have
        names. All values should be floating numbers.
    :param outcome: Outcome, a single-column pandas data frame of binary
        integers with the same rows as the instances. The row numbers and the
        order of outcome should be the same with those of value. Value of 0
        and 1 should refer to non-event and event outcome, respectively.
    :param similarity: Feature similarity, a square pandas data frame of
        floating numbers containing feature-feature similarity measures.
    :param mapping: Feature three-dimensional mapping, a pandas data frame of
        floating numbers with rows for features and three columns for three
        dimensions where the features are mapped onto.
    :param ontology: Ontology, a pandas data frame with rows for ontologies
        and four columns for source, target, similarity, and relation. Feature
        (source)-ontology (target) relation should be annotated as 'feature',
        while ontology-ontology relation should be annotated as 'is_a'. To
        differentiate between feature and ontology names, a prefix of 'ONT:'
        precedes an ontology name. All columns except similarity in ontology
        should be strings. Similarity (a floating number) is a minimum
        threshold by which either features or ontologies (source) belong to
        an ontology (target). Its index is reset to 0..n-1 in place.
    :param ranked: If True, the first two mapping dimensions are replaced by
        their ranks before the features are placed on the grid.
    :param dims: Largest grid coordinate along x and y.
    :param decreasing: Sort direction used when ordering the candidate
        rotation angles by their channel count.
    :param seed_num: Seed passed to ``np.random.seed`` before one of the
        angles with the fewest channels is drawn.
    :return: output TidySet, an ExpressionSet with three tables. Instance-
        feature value and outcome pandas data frame are compiled as a
        phenotype pandas data frame with rows for instances and columns for
        features and outcome. Instance-feature value and feature three-
        dimensional mapping pandas data frame are compiled as an expression
        two-dimensional array with rows for positions of features and columns
        for instances. The mapping, similarity, and ontology pandas data frame
        are compiled as a feature pandas data frame with rows for positions of
        features and columns for feature names and ontological relations. For
        easier access, the similarity pandas data frame, ontomap
        four-dimensional numpy array, ontotype dictionary of pandas data
        frame, and ontology pandas data frame are included in experiment
        notes that can be called using function of notes.

    Changes compared with the previous version:

    * The random draw now selects one of the best angles; previously the
      drawn position itself was used as the rotation angle.
    * The feature-value table keeps its own column labels after being
      reordered to the feature-map order; it used to be relabelled with the
      original column order.
    * Ontology flattening accumulates every processed row; previously each
      pass kept only the feature rows plus the last processed row. A pass
      that cannot expand any row now ends the loop instead of repeating.
    * No NameError when the ontology has no 'ONT:' sources.
    * ``np.round_`` / ``DataFrame.append`` (removed in NumPy 2.0 / pandas
      2.0) are replaced by ``np.round`` / ``pd.concat``.
    """

    pb = ProgressBar(8)
    tick = 0
    pb.start()

    tick += 1
    pb.update(tick)  #1

    def rotate_2_col_mat(X, angle):
        # Multiply the two columns of X by a 2x2 rotation matrix built from
        # `angle` (degrees, sign flipped); index and column names are kept.
        angle = (math.pi / 180 * angle) * -1
        M = np.array([
            math.cos(angle),
            math.sin(angle), -math.sin(angle),
            math.cos(angle)
        ])
        M = M.reshape(2, 2)
        M = np.dot(X.to_numpy(), M)
        M = pd.DataFrame(M,
                         index=X.index.values.tolist(),
                         columns=X.columns.values.tolist())
        return M

    def create_fmap(mapping, similarity, angle, ranked=ranked, dims=dims):
        # Place every feature on an integer (x, y, z) grid: grid the first
        # two mapping dimensions onto 1..dims, rotate by `angle`, grid again,
        # then stack features sharing an (x, y) cell along z in dim3 order.
        data = pd.DataFrame.from_dict({
            'feature': similarity.index.values.tolist(),
            'dim1': mapping.iloc[:, 0].tolist(),
            'dim2': mapping.iloc[:, 1].tolist(),
            'dim3': mapping.iloc[:, 2].tolist(),
        })

        if ranked:
            data = data >> arrange(
                X.dim1) >> mutate(dim1=np.arange(data.shape[0]) + 1)
            data = data >> arrange(
                X.dim2) >> mutate(dim2=np.arange(data.shape[0]) + 1)

        data = data >> arrange(X.dim1)
        data = data >> mutate(
            resize_x=np.round(np.linspace(1, dims, data.shape[0])))
        data.resize_x = data.resize_x.astype(int)
        data = data >> arrange(X.dim2)
        data = data >> mutate(
            resize_y=np.round(np.linspace(1, dims, data.shape[0])))
        data.resize_y = data.resize_y.astype(int)

        rotated = pd.DataFrame.from_dict({
            'rot_x': data.resize_x,
            'rot_y': data.resize_y
        })
        rotated = rotate_2_col_mat(rotated, angle)
        data = data >> bind_cols(rotated)
        del rotated

        data = data >> arrange(X.rot_x)
        data = data >> mutate(x=np.round(np.linspace(1, dims, data.shape[0])))
        data.x = data.x.astype(int)

        data = data >> arrange(X.rot_y)
        data = data >> mutate(y=np.round(np.linspace(1, dims, data.shape[0])))
        data.y = data.y.astype(int)

        data = data >> arrange(X.dim3)

        # Walk the distinct (x, y) cells themselves. Looking up rows by the
        # labels 0..n_cells-1, as before, could visit a cell twice and skip
        # another whenever several features shared a cell.
        cells = (data >> select(X.x, X.y)).drop_duplicates()
        stacked = []
        for cell_x, cell_y in zip(cells['x'].tolist(), cells['y'].tolist()):
            members = data >> mask(X.x == cell_x)
            members = members >> mask(X.y == cell_y)
            members = members >> mutate(
                z=(np.arange(members.shape[0]) + 1).tolist())
            stacked.append(members)
        data = pd.concat(stacked)
        del cells, stacked

        data2a = similarity.index.values
        data2b = data >> mask(data.feature.isin(data2a))
        data2a = pd.DataFrame.from_dict({'feature': data2a})
        data2a = data2a >> mask(
            data2a.feature.isin(data2b['feature'].to_numpy()))
        data = data2a >> left_join(data2b, by='feature')
        del data2a, data2b

        data = data.set_index('feature')
        data = data >> select(X.x, X.y, X.z)
        data = data >> arrange(X.z, X.y, X.x)

        return data

    def order_angle_by_channel(mapping,
                               similarity,
                               ranked=ranked,
                               dims=dims,
                               decreasing=False):
        # For every angle 1..360, record the largest z (channel count) of
        # the resulting feature map, then sort the angles by that count.
        angles = np.arange(360) + 1
        channels = [
            np.max(create_fmap(mapping, similarity, i, ranked, dims)['z'])
            for i in angles
        ]
        data = pd.DataFrame.from_dict({
            'angle': angles,
            'channel': np.array(channels)
        })
        data = data >> arrange(X.channel, ascending=decreasing == False)
        return data

    tick += 1
    pb.update(tick)  #2
    np.random.seed(seed_num)
    angle = order_angle_by_channel(mapping, similarity, ranked, dims,
                                   decreasing)
    angle = angle >> mask(X.channel == np.min(angle['channel']))
    candidates = angle['angle'].values
    # Same random draw as before, but used to pick one of the candidate
    # angles rather than being taken as the angle itself.
    pick = np.random.choice(
        np.arange(candidates.shape[0]).tolist(), 1, False)
    angle = candidates[pick[0]]

    tick += 1
    pb.update(tick)  #3
    fmap = create_fmap(mapping, similarity, angle, ranked, dims)

    # Reorder the value columns to the feature-map order, keeping each
    # column under its own label.
    fval = value[fmap.index.values.tolist()]

    # Full grid of positions 1..max along x, y and z.
    fboth = fmap >> summarize_each([np.max], X.x, X.y, X.z)
    fboth = fboth.to_numpy()
    data = [(np.arange(int(fboth[0, i])) + 1).tolist()
            for i in np.arange(fboth.shape[1])]

    fboth = np.meshgrid(data[0], data[1], data[2])
    del data
    fboth = np.array(fboth).T.reshape(-1, 3)
    fboth = pd.DataFrame(fboth, columns=fmap.columns.values)
    fboth = fboth >> arrange(X.z, X.y, X.x)

    fboth = fboth >> left_join(fmap.reset_index(inplace=False),
                               by=['x', 'y', 'z'])

    # Positions that actually hold a feature.
    idx = [str(i) != 'nan' for i in fboth['feature'].values.tolist()]

    fval = fval[fboth['feature'][idx]].to_numpy()
    fval = np.matrix.transpose(fval)
    fval = pd.DataFrame(fval,
                        index=fboth['feature'][idx],
                        columns=value.index.values)

    fboth = fboth >> left_join(fval.reset_index(inplace=False), by='feature')

    # Build the position id 'x<x>y<y>z<z>' from the three coordinates.
    fboth = fboth >> mutate(x_='x') >> unite(
        'x', ['x_', 'x'], remove=False, sep='')
    fboth = fboth >> select(~X.x_)
    fboth = fboth >> unite('pos_id', ['x', 'y'], remove=True, sep='y')
    fboth = fboth >> unite('pos_id', ['pos_id', 'z'], remove=False, sep='z')
    fboth = fboth >> select(~X.z)

    ori_ontology = ontology

    def str_detect(string, pattern):
        return [pattern in i for i in string]

    # Flatten the hierarchy: a row whose source is an ontology is replaced
    # by one 'feature' row per feature currently attached to that source,
    # pointing at the row's target. Repeat until no ontology sources remain.
    while np.sum(str_detect(ontology['source'], 'ONT:')) > 0:
        data = ontology >> mask(X.relation == 'feature')
        pieces = []
        expanded = False
        for i in np.arange(ontology.shape[0]):
            piece = ontology.iloc[[i], :]
            source = ontology['source'].iloc[i]
            if 'ONT:' in source:
                members = data >> mask(X.target == source)
                if members.shape[0] > 0:
                    piece = pd.DataFrame.from_dict({
                        'source': members['source'],
                        'target': ontology['target'].iloc[i],
                        'similarity': ontology['similarity'].iloc[i],
                        'relation': 'feature'
                    })
                    expanded = True
            pieces.append(piece)
        ontology = pd.concat(pieces, ignore_index=True)
        if not expanded:
            # Nothing could be expanded in this pass, so another pass would
            # give the same table; the remaining ontology-source rows are
            # left in place.
            break

    tick += 1
    pb.update(tick)  #4
    adata = fboth >> select(~X.feature)
    adata = adata.set_index('pos_id')
    adata = adata.fillna(0)

    pdata = value >> mutate(outcome=outcome.astype(int))
    pdata = pdata >> select(X.outcome, fmap.index.values.tolist())

    fdata = fboth >> select(X.pos_id, X.feature)
    fdata2 = ontology >> select(X.source, X.target)
    fdata2 = fdata2.drop_duplicates()
    fdata2 = fdata2 >> separate(X.target, ['t1', 't2'])
    fdata2 = fdata2 >> mutate(t1='ONT') >> unite('target', ['t1', 't2'],
                                                 sep='')
    fdata2 = fdata2 >> mutate(included=1) >> spread(X.target, X.included)
    fdata2 = fdata2 >> rename(feature=X.source)
    fdata = fdata >> left_join(fdata2, by='feature')
    del fdata2
    fdata = fdata.set_index('pos_id')

    tick += 1
    pb.update(tick)  #5
    # Position ids are split on 'x', 'y' and 'z'; elements 1..3 of each
    # split are the coordinates. The largest coordinate per axis gives the
    # grid size. Converting to int up front also covers a single position,
    # where the values previously stayed strings and broke the reshape.
    ontomap = adata.transpose()
    coords = [re.split('x|y|z', name) for name in ontomap.columns.values]
    dim = [None] + [max(int(c[axis]) for c in coords) for axis in (1, 2, 3)]
    ontomap = ontomap.to_numpy()
    ontomap = ontomap.reshape(ontomap.shape[0] * ontomap.shape[1])
    ontomap = np.array(ontomap)
    ontomap = ontomap.reshape(adata.shape[1], dim[2], dim[1], dim[3])

    tick += 1
    pb.update(tick)  #6
    # One table of member features (with x, y, z) per ontology column of
    # the feature table, holding the rows flagged with 1 in that column.
    ontotype = {}
    data0 = fdata >> select(~X.feature)
    for i in np.arange(fdata.shape[1] - 1):
        data = data0.iloc[:, i]
        data = data.reset_index(inplace=False)
        data = data.rename(columns={data.columns.values[1]: 'ontotype'})
        data = data >> mask(X.ontotype == 1)
        data = data >> left_join(fdata.reset_index(inplace=False),
                                 var='pos_id')
        data = data >> select(X.pos_id, X.feature)

        data2 = data >> separate(X.pos_id, ['a', 'x', 'y', 'z'], sep='x|y|z')
        data2 = data2[['feature', 'x', 'y', 'z']].set_index('feature')

        ontotype[data0.columns.values[i]] = data2

    # 'root' lists every position that carries a feature.
    ontotype['root'] = fdata.reset_index(inplace=False) >> select(
        X.pos_id, X.feature)
    ontotype['root'] = ontotype['root'] >> mutate(f_str=X.feature.astype(str))
    ontotype['root'] = ontotype['root'] >> mask(
        X.f_str != 'nan') >> select(~X.f_str)
    ontotype['root'] = ontotype['root'] >> separate(
        X.pos_id, ['a', 'x', 'y', 'z'], sep='x|y|z')
    ontotype['root'] = ontotype['root'][['feature', 'x', 'y',
                                         'z']].set_index('feature')

    # Reorder rows and columns of the similarity table to the feature-map
    # order.
    data2a = fmap.reset_index(inplace=False)
    data2b = similarity[data2a['feature'].to_numpy().tolist()]
    data2b = data2b.reset_index(inplace=False)
    data2a = data2a >> rename(index=X.feature)
    similarity = data2a >> left_join(data2b, by='index')
    del data2a, data2b
    similarity = similarity >> select(~X.x, ~X.y, ~X.z)
    similarity = similarity.set_index('index')
    similarity.index.name = None

    tick += 1
    pb.update(tick)  #7
    adata.index.name = None
    fdata.index.name = None
    # ori_ontology is the caller's data frame, so this resets its index too.
    ori_ontology.index = pd.Index(np.arange(ori_ontology.shape[0]))
    output = ExpressionSet(assayData=adata.to_numpy(),
                           phenoData=AnnotatedDataFrame(pdata),
                           featureData=AnnotatedDataFrame(fdata),
                           experimentData=MIAME(
                               other={
                                   'similarity': similarity,
                                   'ontomap': ontomap,
                                   'ontotype': ontotype,
                                   'ontology': ori_ontology
                               }))

    tick += 1
    pb.update(tick)  #8
    return output
    def exec(self):

        try:
            log.info('[START] {}'.format("exec"))

            # fileInfoPattrn = '{}/{}/{}'.format(globalVar['inpPath'], serviceName, 'data/csv/inp_01.csv')
            # fileInfo = glob.glob(fileInfoPattrn)
            # if (len(fileInfo) < 1): raise Exception("[ERROR] fileInfo : {} : {}".format("자료를 확인해주세요.", fileInfoPattrn))

            # saveFile = '{}/{}_{}'.format(globalVar['figPath'], serviceName, '2021_nagano_S1_01_raw.png')
            # log.info('[CHECK] saveFile : {}'.format(saveFile))

            # fileInfo = fileList[0]
            # for i, fileInfo in enumerate(fileList):
            #     globalVar['inpData{0:02d}'.format(i + 1)] = pd.read_csv(fileInfo, na_filter=False)

            # breakpoint()

            # 파일 읽기
            fileInfoPattrn = '{}/{}/{}'.format(globalVar['inpPath'],
                                               serviceName,
                                               'data/csv/inp_*.csv')
            fileList = glob.glob(fileInfoPattrn)
            if (len(fileList) < 6):
                raise Exception("[ERROR] fileInfo : {} : {}".format(
                    "자료를 확인해주세요.", fileInfoPattrn))

            inpData01 = pd.read_csv(fileList[0], na_filter=False)
            inpData02 = pd.read_csv(fileList[1], na_filter=False)
            inpData03 = pd.read_csv(fileList[2], na_filter=False)
            inpData04 = pd.read_csv(fileList[3], na_filter=False)
            inpData05 = pd.read_csv(fileList[4], na_filter=False)
            inpData06 = pd.read_csv(fileList[5], na_filter=False)

            # 기간 및 자치구에 따른 데이터 병합
            data = ((
                inpData01 >> dfply.left_join(inpData02, by=('기간', '자치구')) >>
                dfply.left_join(inpData03, by=('기간', '자치구')) >>
                dfply.left_join(inpData04, by=('기간', '자치구')) >>
                dfply.left_join(inpData05, by=('기간', '자치구')) >>
                dfply.left_join(inpData06, by=('기간', '자치구')) >>
                dfply.mask(dfply.X.자치구 != '합계') >> dfply.drop([
                    '합계_검거', '살인_발생', '살인_검거', '강도_발생', '강도_검거', '강간강제추행_발생',
                    '강간강제추행_검거', '절도_발생', '절도_검거', '폭력_발생', '폭력_검거', '합계', '소계'
                ])))

            # 컬럼 개수 : 42개
            len(data.columns.values)

            # 컬럼 형태
            # dataStep1.dtypes

            # ======================================================
            #  범죄횟수를 기준으로 각 상관계수 행렬 시각화
            # ======================================================
            # data = pd.DataFrame(data.dropna(axis=0))

            tmpColY = data.iloc[:, 2]
            tmpColXStep1 = data.iloc[:, 3:21:1]
            tmpColXStep2 = data.iloc[:, 22:41:1]

            dataStep1 = pd.concat([tmpColY, tmpColXStep1], axis=1)
            dataStep1Corr = dataStep1.corr(method='pearson')
            saveImg = '{}/{}_{}'.format(globalVar['figPath'], serviceName,
                                        '상관계수 상단 행렬.png')

            makeCorrPlot(dataStep1, saveImg)

            dataStep2 = pd.concat([tmpColY, tmpColXStep2], axis=1)
            dataStep2Corr = dataStep2.corr(method='pearson')
            saveImg = '{}/{}_{}'.format(globalVar['figPath'], serviceName,
                                        '상관계수 하단 행렬.png')

            makeCorrPlot(dataStep2, saveImg)

            # ===================================================================
            #  전체 데이터셋 (기간, 자치구)을 이용한 독립변수 및 종속 변수 선정
            # ===================================================================
            dataL1 = ((data >> dfply.drop(dfply.X.기간, dfply.X.자치구)))

            # ===================================================================
            #  [상관분석 > 유의미한 변수] 전체 데이터셋 (기간, 자치구)을 이용한 독립변수 및 종속 변수 선정
            # ===================================================================
            selCol = [
                '범죄횟수', '지구대파출소치안센터', '119안전센터', 'CCTV설치현황', '비거주용건물내주택',
                '계_사업체수', '계_종사자수'
            ]
            dataL1 = data[selCol]

            # 결측값에 대한 행 제거 (그에 따른 index 변화로 인해 pd.DataFrame 재변환)
            dataL2 = pd.DataFrame(dataL1.dropna(axis=0))
            dataL2.rename(columns={'범죄횟수': 'total'}, inplace=True)

            # 요약 통계량
            dataL2.describe()

            # 자치구 데이터셋 (기간 평균)을 이용한 독립변수 및 종속 변수 선정
            # selCol = ['기간', '자치구', '범죄횟수', '지구대파출소치안센터', 'CCTV설치현황', '전체세대', '비거주용건물내주택', '계_사업체수']
            # dataL1 = data[selCol]
            #
            # pd.plotting.scatter_matrix(dataL1)
            # plt.show()
            #
            # dataL2 = ((dataL1 >>
            #      group_by(X.자치구) >>
            #      summarize(
            #          total=X.범죄횟수.mean()
            #          , maenX1=X.지구대파출소치안센터.mean()
            #          , maenX2=X.CCTV설치현황.mean()
            #          , maenX3=X.전체세대.mean()
            #          , maenX4=X.비거주용건물내주택.mean()
            #          , maenX5=X.계_사업체수.mean()
            #          ) >>
            #         # arrange(X.number, ascending=False)
            #         drop(X.자치구)
            #      ))

            # ========================================
            #  회귀모형 수행
            # ========================================
            selVarList = list(
                dataL2.columns[~dataL2.columns.str.contains('total')])

            # 다중선형회귀 모형
            result = train_test_linreg(dataL2, selVarList)

            # 릿지 모형
            # result = train_test_ridge(dataL2, selVarList, 1.0)

            # =======================================
            #  시각화
            # ======================================
            # 트레이닝 데이터
            trainValY = result['Y_train'].values
            trainPredValY = result['Y_pred_train']
            saveImg = '{}/{}_{}'.format(globalVar['figPath'], serviceName,
                                        '트레이닝 데이터_상관계수 행렬.png')

            makeScatterPlot(trainValY, trainPredValY, saveImg)

            # 테스트 데이터
            testValY = result['Y_test'].values
            testPredValY = result['Y_pred_test']
            saveImg = '{}/{}_{}'.format(globalVar['figPath'], serviceName,
                                        '테스트 데이터_상관계수 행렬.png')

            makeScatterPlot(testValY, testPredValY, saveImg)

            # =======================================
            #  교차검증 수행
            # ======================================
            X = dataL2[selVarList]
            Y = dataL2.total
            X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                                Y,
                                                                test_size=0.4)

            # Pre-allocate models and corresponding parameter candidates
            models = []
            params = []

            model = ('Linear', LinearRegression())
            param = {}

            models.append(model)
            params.append(param)

            log.info("[CHECK] models : {%s}", models)
            log.info("[CHECK] params : {%s}", params)

            kfold = KFold(n_splits=10, shuffle=True)

            results = []

            # [교차검증] 트레이닝 데이터
            for i in range(1):
                model = models[i]
                param = params[i]
                result = gridsearch_cv_for_regression(model=model,
                                                      param=param,
                                                      kfold=kfold,
                                                      train_input=X_train,
                                                      train_target=Y_train)
                result.best_score_
                results.append(result)

            # [교차검증] 테스트 데이터
            for i in range(len(results)):
                testValY = Y_test.values
                testPredValY = results[i].predict(X_test)

                saveImg = '{}/{}_{}'.format(globalVar['figPath'], serviceName,
                                            '테스트 데이터_산점도.png')
                makeScatterPlot(testValY, testPredValY, saveImg)

        except Exception as e:
            log.error("Exception : {}".format(e))
            raise e
        finally:
            log.info('[END] {}'.format("exec"))
Example #8
0
import geopandas as gpd

import dfply

import matplotlib.patheffects as pe

import pandas as pd

## Load the per-state migration summary table (must provide the STATEFP and
## emigrants columns used below).
ca_sum = pd.read_csv("/Users/alanleach/Desktop/ca-migration/ca-sum.csv")

## Download the 2017 TIGER/Line state boundaries.
geo_state = gpd.read_file(
    "https://www2.census.gov/geo/tiger/TIGER2017/STATE/tl_2017_us_state.zip")

## Build the map layer in one pipeline:
##   1. drop Alaska and Hawaii,
##   2. cast STATEFP to int so it can be joined against ca_sum,
##   3. left-join the migration summary on STATEFP,
##   4. discard every row that still contains a missing value,
##   5. cast the emigrant counts to int,
##   6. reproject to the cylindrical equal-area (Gall-Peters) projection.
geo_state_cont = (
    (geo_state >> dfply.mask(dfply.X.NAME != 'Alaska',
                             dfply.X.NAME != 'Hawaii'))
    .assign(STATEFP=lambda frame: frame.STATEFP.astype(int))
    .merge(ca_sum, on='STATEFP', how='left')
    .dropna()
    .assign(emigrants=lambda frame: frame.emigrants.astype(int))
    .to_crs(
        "+proj=cea +lon_0=0 +lat_ts=45 +x_0=0 +y_0=0 +ellps=WGS84 +units=m +no_defs"
    )
)

## Independent copy of the layer, used for the polygon centroids that label
## the states.
geo_state_points = geo_state_cont.copy()