コード例 #1
0
ファイル: fullseq.py プロジェクト: lis123kr/GPA
	def maf5list(self, ws, book):
		"""Write to ``ws`` the rows of every sheet that sit on a MAF >= 5% position.

		Steps, as performed by the code below:
		1. add a ``'maf'`` column (minor / sum * 100) to each frame in ``book.BP35``;
		2. outer-merge the rows with ``maf >= 5.0`` of all sheets and collect
		   their Dumas positions;
		3. outer-merge, across all sheets, the rows located at those positions;
		4. copy the merged table into ``ws`` cell by cell, skipping NaN values.

		:param ws: worksheet-like object exposing ``cell(row=, column=, value=)``.
		:param book: object providing ``nsheets``, ``BP35`` and the ``col_*`` names.
		"""
		from analyzer import dataframe_to_rows
		from pandas import merge
		infolog("writing list of data")

		key_cols = [book.col_GenomeStructure, book.col_RepeatRegion, book.col_ORF, book.col_DumaPosition, book.col_DumaSeq]
		merge_opts = dict(on=key_cols, how='outer', left_index=True, right_index=True)

		# Pass 1: compute MAF per sheet and merge the rows at or above 5%.
		high_maf = None
		for sheet_no in range(book.nsheets):
			frame = book.BP35[sheet_no]
			frame['maf'] = divide(frame[['minor']], frame[['sum']]) * 100
			selected = frame[frame['maf'] >= 5.0]
			if high_maf is None:
				high_maf = selected
			else:
				high_maf = merge(high_maf, selected, **merge_opts)
		positions = high_maf[book.col_DumaPosition].values.tolist()

		# Pass 2: merge every sheet's rows that lie on one of those positions.
		combined = None
		for sheet_no in range(book.nsheets):
			frame = book.BP35[sheet_no]
			matching = frame[frame[book.col_DumaPosition].isin(positions)]
			if combined is None:
				combined = matching
			else:
				combined = merge(combined, matching, **merge_opts)

		# Worksheet coordinates are 1-based; NaN cells are left untouched.
		for row_no, record in enumerate(dataframe_to_rows(combined, index=False), 1):
			for col_no, cell_value in enumerate(record, 1):
				if str(cell_value) == 'nan':
					continue
				ws.cell(row=row_no, column=col_no, value=cell_value)
コード例 #2
0
ファイル: fullseq.py プロジェクト: lis123kr/GPA
	def sheet6(self,ws, book):
		"""Write, per MAF range, the count of each major/minor base combination.

		For every range ``self.s1[i]``..``self.s2[i]`` one group of columns is
		written starting at column B: a label column ("Major/Minor" followed by
		every ``major/minor`` pair of ``book.col_basepair``), then one column per
		sheet holding the number of in-range rows with that pair.  One spare
		column separates consecutive groups.

		:param ws: worksheet addressed with ``ws['B2'] = value`` style keys.
		:param book: provides ``nsheets``, ``sheet_list``, ``BP35``,
			``col_basepair`` and ``get_Number_of_GPS``.
		"""
		infolog("Start sheet6")
		col = 'B'
		for i, lower in enumerate(self.s1):
			upper = self.s2[i]
			ws[col+'2'] = str(lower) + '~' + str(upper) + '%'

			# Rows of each sheet whose MAF falls inside the current range.
			in_range = []
			for n in range(book.nsheets):
				minor_ = book.get_Number_of_GPS(book.BP35[n], lower, upper)[2]
				in_range.append(book.BP35[n].loc[minor_.index])

			# All ordered (major, minor) index pairs with distinct bases.
			base_count = len(book.col_basepair)
			pairs = [(a, b) for a in range(base_count) for b in range(base_count) if a != b]

			# Label column.
			ws[col+'3'] = "Major/Minor"
			for offset, (a, b) in enumerate(pairs):
				ws[col+str(4 + offset)] = book.col_basepair[a] + '/' + book.col_basepair[b]
			col = next_col(col)

			# One count column per sheet.
			for x in range(book.nsheets):
				ws[col+'3'] = book.sheet_list[x]
				frame = in_range[x]
				for offset, (a, b) in enumerate(pairs):
					hits = frame[logical_and(frame['major_idx'] == a, frame['minor_idx'] == b)]
					ws[col+str(4 + offset)] = len(hits)
				col = next_col(col)

			# Spare column between range groups.
			col = next_col(col)
		infolog("End sheet6")
コード例 #3
0
ファイル: book.py プロジェクト: lis123kr/GPA
    def mergesequence(self):
        """Outer-merge all sheets into ``self.P0`` and record the sequenced rows.

        Each frame of ``self.sheets`` is merged into ``self.P0`` on the five
        Dumas key columns.  The rows of the frame whose ``col_Sequence`` is not
        ``'-'`` are appended to ``self.BPRaw`` and their count to
        ``self.BPRawLength``.  Finally ``self.Dumas`` is set to the rows of
        ``self.P0`` whose ``col_DumaSeq`` is not ``'-'``.
        """
        from analyzer import infolog
        from pandas import merge
        infolog("mergesequence start")

        join_keys = [
            self.col_GenomeStructure,
            self.col_RepeatRegion, self.col_ORF,
            self.col_DumaPosition, self.col_DumaSeq
        ]
        for frame in self.sheets:
            if self.P0 is not None:
                self.P0 = merge(self.P0, frame, how='outer', on=join_keys)
            else:
                # The first sheet seeds the merged table.
                self.P0 = frame

            sequenced = frame[frame[self.col_Sequence] != '-']
            self.BPRaw.append(sequenced)
            self.BPRawLength.append(len(sequenced))

        has_duma_seq = self.P0[self.col_DumaSeq] != '-'
        self.Dumas = self.P0[has_duma_seq]
        infolog("mergesequence end")
コード例 #4
0
	def BaseComp(self, ws, types_, book):
		"""
			Write, per sheet, how many merged rows fall on each major/minor base pair.

			Only rows whose major base is the same on both sides and whose minor
			base is comparable are counted (a minor that was absent on the _x side
			counts as an increase):
			1. major_idx_x == major_idx_y - drops substitutions
			2. (minor_idx_x == minor_idx_y) or ((minor_idx_x != minor_idx_y) and minor_x == 0)

			:param ws: worksheet addressed with ``ws['A1'] = value`` style keys.
			:param types_: ``"INC"`` selects ``self.BPmergedinc``; any other value
				selects ``self.BPmergeddec``.
			:param book: provides ``nsheets`` and ``sheet_list``.
		"""
		infolog("lowhigh BaseComp start")
		ws.merge_cells(start_row=1, start_column=1, end_row=2, end_column=1)
		ws['B1'], ws['B2'] = 'Major', 'Minor'
		ws['D1'], ws['H1'], ws['L1'], ws['P1'] = 'A', 'G', 'C', 'T'
		ws['G2'], ws['K2'], ws['O2'] = 'a', 'a', 'a'
		ws['C2'], ws['L2'], ws['P2'] = 'g', 'g', 'g'
		ws['D2'], ws['H2'], ws['Q2'] = 'c', 'c', 'c'
		ws['E2'], ws['I2'], ws['M2'] = 't', 't', 't'
		cols = ['A','G','C','T']

		# Compare by value: ``is`` on a str literal tests object identity and
		# only works by accident of interning.
		bp = self.BPmergedinc if types_ == "INC" else self.BPmergeddec

		for r in range(book.nsheets):
			ws['A'+str(r+3)] = book.sheet_list[r]

		for r in range(book.nsheets):
			col = 'C'
			frame = bp[r]
			if frame.empty:
				# Placeholder frames (e.g. a bare pd.DataFrame()) have none of the
				# *_idx columns; every count for that sheet is simply 0.
				tmp = None
			else:
				cond1 = frame['major_idx_x'] == frame['major_idx_y']
				cond2 = frame['minor_idx_x'] == frame['minor_idx_y']
				cond3 = logical_and( frame['minor_idx_x'] != frame['minor_idx_y'], frame['minor_x'] == 0 )

				idx = logical_and(cond1, logical_or(cond2, cond3))
				tmp = frame[ idx.values ]

			for a in range(len(cols)):
				for b in range(len(cols)):
					if a == b:
						continue
					# major is already equal between _x and _y; counting by the _y
					# minor means rows whose _x minor count was 0 need no special case.
					if tmp is None:
						count = 0
					else:
						count = len(tmp[ logical_and(tmp['major_idx_y'] == a, tmp['minor_idx_y'] == b).values ])
					ws[col+str(r+3)] = count
					col = next_col(col)
				# One spare column between major-base groups.
				col = next_col(col)

		infolog("lowhigh BaseComp end")
コード例 #5
0
	def Init_Individual(self, book):
		"""Build the per-pair increase/decrease tables for every ordered sheet pair.

		For each pair of sheets ``i < j`` a name ``"<sheet i>-><sheet j>"`` is
		recorded and one frame is appended to both ``self.BPmergedinc`` and
		``self.BPmergeddec``: the merged rows whose major base index is equal on
		both sides and whose MAF difference (j - i, respectively i - j) is at
		least 5.0.  Pairs without usable rows get an empty ``pd.DataFrame()``.

		Side effects: ``book.sheet_list`` and ``book.nsheets`` are replaced by
		the pair names and their count.

		:param book: provides ``BP35``, ``sheet_list``, ``col_DumaPosition`` and
			``get_Number_of_GPS``.
		"""
		infolog("lowhigh init start")
		sheet_names = []
		BPmaf = []
		for bp in book.BP35:
			# rows with 5% <= MAF < 51%
			infolog("bp 5.0 ~ 50.0")
			_, _, minor_ = book.get_Number_of_GPS(bp, 5.0, 51.0)
			BPmaf.append(bp.loc[minor_.index])

		for i, x in enumerate(BPmaf):
			for j, y in enumerate(BPmaf):
				if i >= j:
					continue
				infolog("{0} -> {1}".format(i,j))
				sheet_names.append(book.sheet_list[i]+'->'+book.sheet_list[j])
				# Merge the MAF >= 5% rows of the low sheet with those of the high sheet:
				# the analysis runs on positions where at least one side qualifies.
				dumaspos = pd.merge(x,y, how='outer', on=self.dumascol, suffixes=('_x', '_y'), right_index=True, left_index=True)[[book.col_DumaPosition]]
				positions = dumaspos[book.col_DumaPosition].values.tolist()

				# Pull the rows at those Dumas positions from both full sheets.
				if positions:
					x1 = book.BP35[i][ book.BP35[i][book.col_DumaPosition].isin(positions) ]
					y1 = book.BP35[j][ book.BP35[j][book.col_DumaPosition].isin(positions) ]
				else:
					x1, y1 = pd.DataFrame(), pd.DataFrame()

				# Check emptiness by value (``is 0`` tests identity) and before
				# merging, because a column-less frame cannot be merged on key columns.
				if len(x1) == 0 or len(y1) == 0:
					self.BPmergedinc.append(pd.DataFrame())
					self.BPmergeddec.append(pd.DataFrame())
					continue

				merged = pd.merge(x1,y1, how='outer', on=self.dumascol, suffixes=('_x', '_y'), right_index=True, left_index=True)

				x_maf = divide(x1[['minor']], x1[['sum']]) * 100
				y_maf = divide(y1[['minor']], y1[['sum']]) * 100

				# Positions that increase and positions that decrease.
				# Original author's note: an index present on one side only yields
				# NaN, which is meant to be removed by dropna below.
				merged['diffofinc'] = y_maf - x_maf
				merged['diffofdec'] = x_maf - y_maf

				cond1 = merged['major_idx_x'] == merged['major_idx_y']
				cond2 = merged['diffofinc'] >= 5.0
				cond3 = merged['diffofdec'] >= 5.0

				# dropna(how='all') only removes rows in which every column is NaN.
				self.BPmergedinc.append(merged[ logical_and(cond1, cond2).values ].dropna(how='all'))
				self.BPmergeddec.append(merged[ logical_and(cond1, cond3).values ].dropna(how='all'))

		book.sheet_list = sheet_names
		book.nsheets = len(sheet_names)
		infolog("lowhigh init end")
コード例 #6
0
ファイル: fullseq.py プロジェクト: lis123kr/GPA
	def sheet2(self, ws, book):
		"""Write the mean MAF of every sheet for each configured MAF range.

		Layout: A/B = strain name (merged), C = genome length taken from
		``book.BPRawLength``, D onward = one column per range
		``self.s1[k] <= f < self.s2[k]`` holding ``round(s / l, 3)`` followed by
		``'%'``, where ``s`` and ``l`` are the first two values returned by
		``book.get_Number_of_GPS``; ``'-'`` is written when ``l`` is 0.

		:param ws: worksheet addressed with ``ws['A1'] = value`` style keys.
		:param book: provides ``nsheets``, ``sheet_list``, ``BPRawLength``,
			``BP35``, ``BPxMinor`` and ``get_Number_of_GPS``.
		"""
		infolog("Start Sheet2")

		ws['A1'] = "strain"
		ws.merge_cells(start_row=1, start_column=1, end_row=2, end_column=2)
		ws['C1'] = "Genome length (bp)"
		ws.merge_cells(start_row=1, start_column=3, end_row=2, end_column=3)
		ws['D1'] = "Range of minor allele frequency(%)"
		ws.merge_cells(start_row=1, start_column=4, end_row=1, end_column=4+len(self.s1))

		col = 'D'
		for ss1, ss2 in zip(self.s1, self.s2):
			ws[col+'2'] = str(ss1) + '≤f<' + str(ss2)
			col = next_col(col)

		for i in range(book.nsheets):
			infolog("Writing {0} sheet".format(book.sheet_list[i]))

			ws['A' + str(i+3)] = book.sheet_list[i]
			ws['C' + str(i+3)] = book.BPRawLength[i]
			ws.merge_cells(start_row=3+i, start_column=1, end_row=3+i, end_column=2)

			col = 'D'
			for r1, r2 in zip(self.s1, self.s2):
				if len(book.BPxMinor[i]) != 0:
					s, l, _ = book.get_Number_of_GPS(book.BP35[i], r1, r2)
				else:
					# No minor data for this sheet: the former ``(0,0)`` fallback
					# could not be unpacked into three names.
					s, l = 0, 0
				# Compare by value; ``l is not 0`` tests object identity.
				ws[col + str(i+3)] = str(round(s/l, 3))+'%' if l != 0 else '-'
				col = next_col(col)
		infolog("End Sheet2")
コード例 #7
0
	def PolymorphicSite(self, ws, types_, book):
		"""Write, per sheet, how many rows have a MAF difference inside each range.

		For ``types_ == 'INC'`` the ``'diffofinc'`` column of
		``self.BPmergedinc[i]`` is counted, otherwise the ``'diffofdec'`` column
		of ``self.BPmergeddec[i]``.  Row 2 carries the range labels
		``s1≤n<s2``; rows 3.. carry one sheet each, starting at column C.

		:param ws: worksheet addressed with ``ws['A1'] = value`` style keys.
		:param types_: ``'INC'`` or anything else (treated as decrease).
		:param book: provides ``nsheets`` and ``sheet_list``.
		"""
		infolog("lowhigh PolymorphicSite start")
		# NOTE(review): the title and the C1 caption say "INC"/"Increase" even when
		# types_ is not 'INC'; left unchanged because callers may rely on them.
		ws.title = "INC-Polymorphic site"
		# A col
		ws['A1'] = types_
		ws.merge_cells(start_row=1, start_column=1, end_row=2, end_column=2)

		# C col
		ws['C1'] = "Increase in genetic polymrphism (%)"
		ws.merge_cells(start_row=1, start_column=3, end_row=1, end_column=6)

		from numpy import logical_and
		is_inc = types_ == 'INC'
		col_diff = 'diffofinc' if is_inc else 'diffofdec'
		for i in range(book.nsheets):
			ws['A' + str(i+3)] = book.sheet_list[i]
			ws.merge_cells(start_row=i+3, start_column=1, end_row=i+3, end_column=2)
			col = 'C'
			frame = self.BPmergedinc[i] if is_inc else self.BPmergeddec[i]
			# Placeholder frames (bare pd.DataFrame()) have no diff column at all;
			# treat them as "no rows" instead of failing on the column lookup.
			bp = frame[[col_diff]] if col_diff in frame.columns else None
			for s1_, s2_ in zip(self.s1, self.s2):
				ws[col+'2'] = str(s1_) + '≤n<' + str(s2_)
				if bp is None or len(bp) == 0:
					count = 0
				else:
					count = len(bp[ logical_and(bp >= s1_, bp < s2_).values])
				ws[col+str(i+3)] = count
				col = next_col(col)
		infolog("lowhigh PolymorphicSite end")
コード例 #8
0
ファイル: book.py プロジェクト: lis123kr/GPA
    def preprocessing(self):
        """Derive the depth-filtered frames and their major/minor columns.

        For every frame in ``self.BPRaw``:
        - its length is appended to ``self.BPRawLength``;
        - a ``"sum"`` column (row-wise total of ``self.col_basepair``) is added
          to the raw frame itself;
        - the key columns are selected and rows with
          ``sum >= self.constraints`` are kept;
        - ``self.get_major_minor`` supplies the major/minor values, which are
          appended to ``self.BPxMajor`` / ``self.BPxMinor`` and stored as the
          ``'major'`` / ``'minor'`` columns (0 when the result is empty);
        - the filtered frame is appended to ``self.BP35``.

        Afterwards every frame in ``self.BP35`` receives ``'major_idx'`` and
        ``'minor_idx'``: the positions, within ``self.col_basepair``, of the
        largest and second-largest count of the row.
        """
        from analyzer import infolog
        from pandas import DataFrame
        infolog("preprocessing start")
        for bp in self.BPRaw:
            # NOTE(review): mergesequence also appends to BPRawLength; confirm
            # that the list is not filled twice when both are run.
            self.BPRawLength.append(len(bp))

            # Add a column with the total of the base counts.
            bp["sum"] = bp[self.col_basepair].sum(axis=1)

            bp = bp[[
                self.col_GenomeStructure, self.col_RepeatRegion,
                self.col_DumaPosition, self.col_DumaSeq, self.col_ORF,
                self.col_Sequence, "sum"
            ] + self.col_basepair]
            # Keep rows whose total reaches the threshold. The explicit copy
            # makes the column assignments below act on an independent frame
            # rather than on a slice (avoids pandas' chained-assignment warning).
            bp = bp[bp["sum"] >= self.constraints].copy()
            pmajor, pminor = self.get_major_minor(bp[self.col_basepair])
            # Keep the major and minor values.
            self.BPxMajor.append(pmajor)
            self.BPxMinor.append(pminor)
            # Assigning an empty pminor/pmajor would fail, so fall back to 0.
            # Compare by value; ``is not 0`` tests object identity.
            bp['minor'] = pminor if len(pminor) != 0 else 0
            bp['major'] = pmajor if len(pmajor) != 0 else 0

            self.BP35.append(bp)

        # Add the index columns of the major and minor base.
        for bp in self.BP35:
            # Negate so that an ascending argsort ranks the largest count first
            # (original intent: ties resolved in base-pair column order —
            # TODO confirm, the default argsort kind is not stable).
            argx = -bp[self.col_basepair]
            bps = argx.values.argsort(axis=1)
            bp['minor_idx'] = DataFrame(bps[:, 1], index=bp.index)
            bp['major_idx'] = DataFrame(bps[:, 0], index=bp.index)
        infolog("preprocessing end")
コード例 #9
0
ファイル: fullseq.py プロジェクト: lis123kr/GPA
	def sheet1(self,ws, book):
		"""Write the "Polymorphic site" overview sheet.

		Layout: A/B = strain name (merged), C = genome length from
		``book.BPRawLength``, D = ``round(s / l, 3)`` plus ``'%'`` for the range
		5.0..51.0 (``s`` and ``l`` being the first two values returned by
		``book.get_Number_of_GPS``), E onward = the second returned value for
		each range ``self.s1[k] <= n < self.s2[k]``.

		:param ws: worksheet addressed with ``ws['A1'] = value`` style keys.
		:param book: provides ``nsheets``, ``sheet_list``, ``BPRawLength``,
			``BP35`` and ``get_Number_of_GPS``.
		"""
		infolog("Start Sheet1")

		ws.title = "Polymorphic site"
		ws['A1'] = "strain"
		ws['C1'] = "Genome length (bp)"
		ws['D1'] = "Average of MAF"
		ws['E1'] = "Number of polymorphic site"
		ws.merge_cells(start_row=1, start_column=5, end_row=1, end_column=5+len(self.s1))
		ws.merge_cells(start_row=1, start_column=1, end_row=2, end_column=2)
		ws.merge_cells(start_row=1, start_column=3, end_row=2, end_column=3)
		ws.merge_cells(start_row=1, start_column=4, end_row=2, end_column=4)

		col = 'E'
		for ci in range(len(self.s1)):
			ws[col+'2'] = str(self.s1[ci]) + '≤n<' + str(self.s2[ci])
			col = next_col(col)

		for ni in range(book.nsheets):
			infolog("Writing {} sheet".format(book.sheet_list[ni]))
			ws['A' + str(ni+3)] = book.sheet_list[ni]
			ws.merge_cells(start_row=3+ni, start_column=1, end_row=3+ni, end_column=2)
			ws['C' + str(ni+3)] = book.BPRawLength[ni]
			s, l, _ = book.get_Number_of_GPS(book.BP35[ni], 5.0, 51.0)
			# Guard the division the same way sheet2 and sheet7 do: with no
			# matching row there is no average to report.
			ws['D' + str(ni+3)] = str(round(s / l, 3)) + '%' if l != 0 else '-'

			col = 'E'
			for ss1, ss2 in zip(self.s1, self.s2):
				ws[col + str(ni+3)] = book.get_Number_of_GPS(book.BP35[ni], ss1, ss2)[1]
				col = next_col(col)

		infolog("End Sheet1")		
コード例 #10
0
ファイル: fullseq.py プロジェクト: lis123kr/GPA
	def sheet4_5(self, ws, title, book):
		"""Write the per-ORF (sheet 4) or per-NCR (sheet 5) statistics sheet.

		The left part lists, for every entry of ``book.ORF`` (``title == "ORF"``)
		or ``book.NCR`` (any other title), the number of rows of each
		``book.BPRaw`` frame whose ``col_ORF`` equals that entry, plus a total.
		Then, for every MAF range ``self.s1[i]``..``self.s2[i]``, three groups
		of ``book.nsheets`` columns plus one summary column are written:
		"Number of GPS", "Average MAF at GPS" and "Number of GPS / Length * 100".

		:param ws: worksheet addressed with ``ws['A1'] = value`` style keys.
		:param title: ``"ORF"`` or another label (treated as NCR).
		:param book: provides ``filename``, ``nsheets``, ``sheet_list``, ``ORF``,
			``NCR``, ``BPRaw``, ``BP35``, ``col_ORF`` and ``get_Number_of_GPS``.
		"""
		# All comparisons against literals below are by value; the previous
		# ``is "ORF"`` / ``is 0`` forms tested object identity.
		is_orf = title == "ORF"
		infolog("Start sheet {0}".format(4 if is_orf else 5))

		def accumulate(ref, amount):
			# Add ``amount`` to the running total kept in cell ``ref``
			# (an empty cell starts the total).
			current = ws[ref].value
			ws[ref] = amount if current is None else float(current) + amount

		ws['B2'] = book.filename
		ws["C2"] = "(BP_full)Length"
		ws.merge_cells(start_row=2, start_column=3, end_row=2, end_column=3+book.nsheets-1)
		ws["B3"] = title
		col = 'C'
		for i in range(0, book.nsheets):
			ws[col + str(3)] = book.sheet_list[i]
			col = next_col(col)

		# Full data - rows below the depth threshold are included here.
		col = 'B'
		orf_ = book.ORF if is_orf else book.NCR
		rows = list(range(4, len(orf_) + 4))
		total_row = len(orf_) + 4
		for c in range(0, book.nsheets+1):
			cnt_ = 0
			for i in range(0, len(orf_)):
				if c == 0:
					# First column holds the ORF/NCR names.
					ws[col+str(rows[i])] = orf_[i]
				else:
					tmp = len(book.BPRaw[c-1][ book.BPRaw[c-1][book.col_ORF] == orf_[i] ])
					ws[col+str(rows[i])] = tmp
					cnt_ += tmp
			ws[col + str(total_row)] = cnt_ if c != 0 else 'total'
			col = next_col(col)

		# s1~s2 ranges
		col = chr(ord('A')+3+book.nsheets)
		for i in range(0, len(self.s1)):
			# Numeric index of the first "Number of GPS" column of this range.
			co = (5+3*book.nsheets) * i + (5+book.nsheets)
			ws[col+str(2)] = str(self.s1[i]) + '~' + str(self.s2[i]) +'%' if self.s2[i] != 51.0 else str(self.s1[i]) + '%~'
			ws[col+str(3)] = title # NCR or ORF
			for ix in range(0, len(orf_)):
				ws[col+str(rows[ix])] = orf_[ix]
			ws[col+str(total_row)] = 'total'

			# Number of GPS of each sheet
			gps = list()
			col = next_col(col)
			ws[col+str(2)] = "Number of GPS"
			ws.merge_cells(start_row=2, start_column=co, end_row=2, end_column=co+book.nsheets)

			# dcol: summary column right after the per-sheet GPS columns.
			dcol = next_n_col(col, book.nsheets)
			for n in range(0, book.nsheets):
				ws[col+str(3)] = book.sheet_list[n]
				_, _, tx_minor = book.get_Number_of_GPS(book.BP35[n], self.s1[i], self.s2[i])
				tx_rows = book.BP35[n].loc[ tx_minor.index ]

				g = []
				cnt_ = 0
				for ix in range(0, len(orf_)):
					mrows = tx_rows[ tx_rows[book.col_ORF] == orf_[ix] ]
					ws[col+str(rows[ix])] = len(mrows)
					cnt_ += len(mrows)
					g.append(len(mrows))
					accumulate(dcol+str(rows[ix]), len(mrows))

				ws[col+str(total_row)] = cnt_
				accumulate(dcol+str(total_row), cnt_)
				g.append(cnt_)
				gps.append(g)
				col = next_col(col)
			col = next_col(col)

			# gcol: summary column right after the per-sheet average-MAF columns.
			gcol = next_n_col(col, book.nsheets)
			# Average MAF at GPS of each sheet
			ws[col+str(2)] = "Average MAF at GPS"
			ws.merge_cells(start_row=2, start_column=co+book.nsheets+1, end_row=2, end_column=co+2*book.nsheets+1)
			for n in range(0, book.nsheets):
				val, cnt_ = 0, 0
				ws[col+str(3)] = book.sheet_list[n]
				_, _, tx_minor = book.get_Number_of_GPS(book.BP35[n], self.s1[i], self.s2[i])
				tx_rows = book.BP35[n].loc[ tx_minor.index ]
				for ix in range(0, len(orf_)):
					mrows = tx_rows[ tx_rows[book.col_ORF] == orf_[ix] ]
					if len(mrows) == 0:
						ws[col+str(rows[ix])] = '-'
						# Make sure the running MAF total exists even without rows.
						if ws[gcol+str(rows[ix])].value is None:
							ws[gcol+str(rows[ix])] = 0
					else:
						maf_ = divide(mrows[["minor"]], mrows[["sum"]]) * 100
						maf_sum = maf_.sum()[0]
						val += maf_sum
						cnt_ += len(maf_)
						ws[col+str(rows[ix])] = str(round(maf_sum / len(maf_), 3)) + '%'
						accumulate(gcol+str(rows[ix]), maf_sum)
				# total
				ws[col+str(total_row)] = str(round(val / cnt_, 3)) + '%' if cnt_ != 0 else '-'
				accumulate(gcol+str(total_row), val)
				col = next_col(col)
			col = next_col(col)

			# "Number of GPS / Length" of each sheet
			ws[col+str(2)] = "Number of GPS / Length * 100"
			ws.merge_cells(start_row=2, start_column=co+2*book.nsheets+2, end_row=2, end_column=co+3*book.nsheets+2)
			# ncol walks the length columns written at the start (C, D, ...).
			ncol = 'C'
			d2col = next_n_col(col, book.nsheets)
			for n in range(0, book.nsheets):
				ws[col+str(3)] = book.sheet_list[n]
				for ix in range(0, len(orf_)):
					l_ = int(str(ws[ncol+str(rows[ix])].value))
					ws[col+str(rows[ix])] = (gps[n][ix] / l_)*100 if l_ != 0 else '-'
					accumulate(d2col+str(rows[ix]), l_)
				# total
				l_ = int(str(ws[ncol+str(total_row)].value))
				ws[col+str(total_row)] = (gps[n][-1] / l_)*100 if l_ != 0 else '-'
				accumulate(d2col+str(total_row), l_)

				ncol = next_col(ncol)
				col = next_col(col)

			# Turn the running totals of the three summary columns into their final values.
			# NOTE(review): gcol is divided by the already-averaged dcol, and d2col
			# uses a hard-coded 3 rather than book.nsheets — confirm both are intended.
			for row_ in range(4, len(orf_)+5):
				ws[dcol+str(row_)] = float(ws[dcol+str(row_)].value) / book.nsheets if float(ws[dcol+str(row_)].value) != 0 else 0
				ws[gcol+str(row_)] = str(round(float(ws[gcol+str(row_)].value) / float(ws[dcol+str(row_)].value), 3)) + '%' if float(ws[dcol+str(row_)].value) != 0 else '-'
				ws[d2col+str(row_)] = float(ws[dcol+str(row_)].value) / (float(ws[d2col+str(row_)].value) / 3) * 100 if float(ws[d2col+str(row_)].value) != 0 else 0

			# end of this s1/s2 range: skip two columns before the next group
			col = next_col(next_col(col))
		infolog("End sheet {0}".format(4 if is_orf else 5))
コード例 #11
0
	def Init_Average(self, book):
		"""Build the group-A versus group-B average MAF comparison tables.

		Every sheet name must start with ``'A'`` or ``'B'`` (asserted).  The
		frames of ``book.BP35`` are merged on ``self.dumascol``, keeping rows
		whose major base index agrees between the merged sides, and the MAF
		columns are averaged per group into ``maf_A`` / ``maf_B``.  Three frames
		are produced and appended, in this order, to both ``self.BPmergedinc``
		and ``self.BPmergeddec``: all merged rows, the rows where every sheet has
		MAF >= 5, and the rows where any sheet has MAF >= 5.

		Side effects: three ``<book.filename>_*.xlsx`` files are written, and
		``book.sheet_list`` / ``book.nsheets`` are replaced by
		``['All', 'All_5', 'Any_5']`` and 3.

		:param book: provides ``sheet_list``, ``nsheets``, ``BP35`` and ``filename``.
		"""
		for sn in book.sheet_list:
			assert sn[0] == 'A' or sn[0] == 'B', "sheet name"

		# Per-position MAF average of groups A and B;
		# analyse the change from Avg(A) to Avg(B).
		infolog("Init_Average")

		# Final columns needed: dumascol(5), diffofdec, diffofinc, minor_idx_x,
		# minor_idx_y, major(2) / minor.
		# Row sets needed: any MAF >= 5%, all MAF >= 5%, every row with a value.
		#
		# Chosen values: minor_idx_x -> the last of group A / minor_idx_y -> the last of group B
		#                minor_x -> mean minor of A / minor_y -> mean minor of B
		for j, i in enumerate(book.BP35):
			i['maf_'+book.sheet_list[j]] = divide(i[['minor']], i[['sum']]) * 100
			i[book.sheet_list[j][0].upper()+'_minor_idx'] = i[['minor_idx']]
			book.BP35[j] = i.drop(columns='minor_idx')

		merged = book.BP35[0].drop(columns=['seq', 'sum', 'A', 'G', 'C', 'T', 'major'])
		merged['maf_'+book.sheet_list[0][0].upper()] = book.BP35[0]['maf_'+book.sheet_list[0]]
		for i in range(1, book.nsheets):
			book.BP35[i] = book.BP35[i].drop(columns=['seq', 'sum', 'A', 'G', 'C', 'T', 'major'])
			grp = book.sheet_list[i][0].upper()
			col_maf = 'maf_' + grp
			col_minor_idx = grp+'_minor_idx'

			merged = pd.merge(merged, book.BP35[i], on=self.dumascol, how='outer')
			# ``Index.contains`` no longer exists in current pandas; ``in`` is
			# the equivalent membership test.
			if col_maf not in merged.columns:
				merged[col_maf] = merged['maf_' + book.sheet_list[i] ]
			else :
				merged[col_maf] = merged[[col_maf, 'maf_'+book.sheet_list[i]]].sum(axis=1)

			# A second sheet of the same group produced _x/_y copies: keep the newer one.
			if col_minor_idx+'_x' in merged.columns:
				merged[col_minor_idx] = merged[col_minor_idx+'_y']
				merged = merged.drop(columns=col_minor_idx+'_x').rename(index=str, columns={col_minor_idx+'_y' : col_minor_idx})

			merged = merged[ merged['major_idx_x'] == merged['major_idx_y'] ]
			merged = merged.drop(columns = 'major_idx_x').rename(index=str, columns={"major_idx_y": "major_idx"})

		Acol, Bcol = [], []
		for i in range(len(book.sheet_list)):
			if book.sheet_list[i][0].upper() == 'A':
				Acol.append('maf_'+book.sheet_list[i])
			else:
				Bcol.append('maf_'+book.sheet_list[i])

		# Turn the per-group MAF sums into means.
		merged['maf_A'] = merged['maf_A'] / len(Acol)
		merged['maf_B'] = merged['maf_B'] / len(Bcol)

		A_all, A_any = True, False
		B_all, B_any = True, False
		for i in Acol:
			A_all = logical_and( A_all, merged[i] >= 5.0 )
			A_any = logical_or( A_any, merged[i] >= 5.0 )

		for i in Bcol:
			B_all = logical_and( B_all, merged[i] >= 5.0 )
			B_any = logical_or( B_any, merged[i] >= 5.0 )

		merged['minor_idx_x'] = merged['A_minor_idx'].T[-1:].T
		merged['minor_idx_y'] = merged['B_minor_idx'].T[-1:].T

		merged['major_idx_x'] = merged['major_idx']
		merged['major_idx_y'] = merged['major_idx']
		merged = merged.drop(columns = ['A_minor_idx', 'B_minor_idx'])

		# NOTE(review): sum(axis=1) presumes 'minor_x' / 'minor_y' select several
		# same-named columns after the merges — confirm against real data.
		tmpx = merged['minor_x'].sum(axis=1) / len(Acol)
		tmpy = merged['minor_y'].sum(axis=1) / len(Bcol)
		merged = merged.drop(columns=['minor_x', 'minor_y'])
		merged['minor_x'] = tmpx
		merged['minor_y'] = tmpy

		merged['diffofinc'] = merged['maf_B'] - merged['maf_A']
		merged['diffofdec'] = merged['maf_A'] - merged['maf_B']

		cond2 = merged['diffofinc'] >= 5.0
		cond3 = merged['diffofdec'] >= 5.0

		maf_5_all = merged[ logical_and(A_all, B_all).values ]
		maf_5_any = merged[ logical_or(A_any, B_any).values ]

		merged.to_excel('{}_merged.xlsx'.format(book.filename))
		maf_5_all.to_excel('{}_maf_5_all.xlsx'.format(book.filename))
		maf_5_any.to_excel('{}_maf_5_any.xlsx'.format(book.filename))

		self.BPmergedinc.append( merged)
		self.BPmergedinc.append(maf_5_all)
		self.BPmergedinc.append(maf_5_any)

		self.BPmergeddec.append( merged)
		self.BPmergeddec.append(maf_5_all)
		self.BPmergeddec.append(maf_5_any)
		book.sheet_list = ['All', 'All_5', 'Any_5']
		book.nsheets = len(book.sheet_list)
		infolog("End Init_Average")
コード例 #12
0
	def ORFNCR(self, ws, types_, title, book):
		"""Write the per-ORF or per-NCR increase/decrease statistics sheet.

		Left part: column A lists the entries of ``book.ORF`` (``title == 'ORF'``)
		or ``book.NCR``, column B their row count in ``book.Dumas``, followed by
		one column per sheet with the row count of the merged frame; the last row
		holds the totals.  Then, for each range ``self.s1[i]``..``self.s2[i]``:
		a name column, per-sheet counts of rows whose diff lies in the range,
		per-sheet mean diffs, and per-sheet "count / Dumas length" ratios.

		:param ws: worksheet addressed with ``ws['A1'] = value`` style keys.
		:param types_: ``'INC'`` uses ``self.BPmergedinc`` / ``'diffofinc'``;
			anything else uses ``self.BPmergeddec`` / ``'diffofdec'``.
		:param title: ``'ORF'`` or another label (treated as NCR).
		:param book: provides ``ORF``, ``NCR``, ``Dumas``, ``col_ORF``,
			``nsheets`` and ``sheet_list``.
		"""
		infolog("lowhigh {0} start".format(title))

		ws['A1'] = title
		ws.merge_cells(start_row=1, start_column=1, end_row=2, end_column=1)
		ws['B1'] = 'Length'
		ws.merge_cells(start_row=1, start_column=2, end_row=2, end_column=2)

		# Compare by value; ``title is 'ORF'`` tested object identity.
		orfncr = book.ORF if title == 'ORF' else book.NCR
		col_diff = "diffofinc" if types_ == 'INC' else "diffofdec"
		bp = self.BPmergedinc if types_ == 'INC' else self.BPmergeddec

		col = 'B'
		# si == -1 fills the Dumas length column; si >= 0 fills one sheet column.
		for si in range(-1, book.nsheets):
			sum_ = 0
			if si != -1:
				ws[col+'2'] = book.sheet_list[si]
			for r in range(len(orfncr)):
				if si == -1:
					ws['A'+str(r+3)] = orfncr[r]
					cnt = len(book.Dumas[ book.Dumas[book.col_ORF] == orfncr[r] ])
				else:
					cnt = len( bp[si][ bp[si][book.col_ORF] == orfncr[r] ] )
				ws[col+str(r+3)] = cnt
				sum_ += cnt
			ws[col+str(3+len(orfncr))] = sum_
			col = next_col(col)
		ws['A'+str(3+len(orfncr))] = 'Total'

		col = next_col(col)
		for i in range(len(self.s1)):
			# ORF or NCR names
			ws[col+'1'] = str(self.s1[i])+'~'+str(self.s2[i])
			for r in range(len(orfncr)):
				ws[col+str(r+3)] = orfncr[r]
			col = next_col(col)

			# Remember where the count columns start; the ratio section reads them back.
			col_GPS = col
			ws[col+'1'] = '{0} in genetic polymorphism'.format("Increase" if types_ == 'INC' else "Decrease")
			for s in range(book.nsheets):
				ws[col+'2'] = book.sheet_list[s]
				bpx = bp[s][ logical_and(bp[s][col_diff] >= self.s1[i], bp[s][col_diff] < self.s2[i]).values ]

				sum_ = 0
				for r in range(3, 3+len(orfncr)):
					cnt = len(bpx[ bpx[book.col_ORF] == orfncr[r-3]])
					ws[col+str(r)] = cnt
					sum_ += cnt
				ws[col+str(3+len(orfncr))] = sum_
				col = next_col(col)

			ws[col+'1'] = 'Average {0}'.format("Increase" if types_ == 'INC' else "Decrease")
			for s in range(book.nsheets):
				ws[col+'2'] = book.sheet_list[s]
				bpx = bp[s][ logical_and(bp[s][col_diff] >= self.s1[i], bp[s][col_diff] < self.s2[i]).values ]
				for r in range(3, 3+len(orfncr)):
					cnt = bpx[ bpx[book.col_ORF] == orfncr[r-3]][[col_diff]]
					ws[col+str(r)] = str(round(cnt.sum()[0] / len(cnt), 3))+'%' if len(cnt) != 0 else 'N/A'
				col = next_col(col)

			ws[col+'1'] = 'Number of GPS / Length'
			for sheet_no in range(book.nsheets):
				ws[col+'2'] = book.sheet_list[sheet_no]
				for r in range(3, 3+len(orfncr)):
					# Separate names so the sheet index is not overwritten.
					gps_count, length = float(ws[col_GPS+str(r)].value), float(ws['B'+str(r)].value)
					ws[col+str(r)] = str( round(gps_count/length, 6) ) if length != 0 else 'N/A'
				col_GPS = next_col(col_GPS)
				col = next_col(col)
			col = next_col(col)

		infolog("lowhigh {0} end".format(title))
コード例 #13
0
	def GenomeStr(self, ws, types_, book):
		"""Write per-region increase/decrease statistics, one table per MAF range.

		Each range ``self.s1[i]``..``self.s2[i]`` gets its own block of rows.
		Columns A/B list the entries of ``book.GenomeStructure`` and
		``book.RepeatRegion`` (each followed by a total) plus "ORF" and "NCR"
		rows, with their row counts in ``book.Dumas``.  From column C on there is
		one count column per sheet followed by one mean-diff column per sheet,
		computed on the rows whose diff lies inside the range.

		:param ws: worksheet addressed with ``ws['A1'] = value`` style keys.
		:param types_: ``'INC'`` uses ``self.BPmergedinc`` / ``'diffofinc'``;
			anything else uses ``self.BPmergeddec`` / ``'diffofdec'``.
		:param book: provides ``GenomeStructure``, ``RepeatRegion``, ``ORF``,
			``NCR``, ``Dumas``, the ``col_*`` names, ``nsheets`` and ``sheet_list``.
		"""
		infolog("lowhigh GenomeStr start")
		# One value comparison, reused below (the headers used ``is 'INC'``,
		# which tests object identity).
		is_inc = types_ == 'INC'
		bp = self.BPmergedinc if is_inc else self.BPmergeddec
		col_diff = "diffofinc" if is_inc else "diffofdec"
		direction = "Increase" if is_inc else "Decrease"

		# s1 ~ s2
		for i in range(len(self.s1)):
			# First row of the block that belongs to this range.
			rows = i*(14+len(book.GenomeStructure) + len(book.RepeatRegion))+ 1

			ws['A'+str(rows)] = 'Region'
			ws.merge_cells(start_row=rows, start_column=1, end_row=rows+1, end_column=1)
			ws['B'+str(rows)] = 'Dumas Length'
			ws.merge_cells(start_row=rows, start_column=2, end_row=rows+1, end_column=2)

			for r in range(2, 2+len(book.GenomeStructure)):
				ws['A' + str(rows+r)] = book.GenomeStructure[r-2]
				ws['B' + str(rows+r)] = len(book.Dumas[ book.Dumas[book.col_GenomeStructure] == book.GenomeStructure[r-2] ])

			ws['A' + str(rows+2+len(book.GenomeStructure))] = 'Total'
			ws['B' + str(rows+2+len(book.GenomeStructure))] = len(book.Dumas)

			sum_ = 0
			for r in range(len(book.RepeatRegion)):
				row = rows+r+3+len(book.GenomeStructure)
				ws['A' + str(row)] = book.RepeatRegion[r]
				cnt = len(book.Dumas[ book.Dumas[book.col_RepeatRegion] == book.RepeatRegion[r] ])
				ws['B' + str(row)] = cnt
				sum_ += cnt

			leng = rows+ len(book.GenomeStructure) + len(book.RepeatRegion)
			ws['A' + str(leng+3)] = 'Total'
			ws['B' + str(leng+3)] = sum_
			ws['A' + str(leng+4)] = 'ORF'
			ws['B' + str(leng+4)] = len(book.Dumas[ book.Dumas[book.col_ORF].isin(book.ORF)])
			ws['A' + str(leng+5)] = 'NCR'
			ws['B' + str(leng+5)] = len(book.Dumas[ book.Dumas[book.col_ORF].isin(book.NCR)])

			col = 'C'
			ws[col+str(rows)] = '{0} in genetic polymorphism'.format(direction)
			ws.merge_cells(start_row=rows, start_column=3, end_row=rows, end_column=2+book.nsheets)
			ws[chr(ord(col)+book.nsheets)+str(rows)] = "Average {0}".format(direction)
			ws.merge_cells(start_row=rows, start_column=3+book.nsheets, end_row=rows, end_column=2+2*book.nsheets)
			for s in range(book.nsheets):
				idx = logical_and( bp[s][col_diff] >= self.s1[i], bp[s][col_diff] < self.s2[i] )
				bpx = bp[s][idx.values]

				# ncol: the "average" column paired with count column ``col``.
				ncol = chr(ord(col)+book.nsheets)
				ws[col+str(rows+1)] = book.sheet_list[s]
				ws[ncol+str(rows+1)] = book.sheet_list[s]

				sum_, avg_ = 0, 0
				for r in range(2, 2+len(book.GenomeStructure)):
					cnt = bpx[ bpx[book.col_GenomeStructure] == book.GenomeStructure[r-2]][[col_diff]]
					ws[col+str(rows+r)] = len(cnt)
					ws[ncol+str(rows+r)] = str(round((cnt.sum()[0] / len(cnt)), 3))+'%' if len(cnt) != 0 else 'N/A'
					avg_ += cnt.sum()[0]
					sum_ += len(cnt)
				# Total
				ws[col+str(rows+2+len(book.GenomeStructure))] = sum_
				ws[ncol+str(rows+2+len(book.GenomeStructure))] = str(round(avg_/sum_, 3))+'%' if sum_ != 0 else 'N/A'

				sum_, avg_ = 0, 0
				for r in range(len(book.RepeatRegion)):
					row = rows+r+3+len(book.GenomeStructure)
					cnt = bpx[ bpx[book.col_RepeatRegion] == book.RepeatRegion[r]][[col_diff]]
					ws[col+str(row)] = len(cnt)
					ws[ncol+str(row)] = str(round((cnt.sum()[0] / len(cnt)),3))+'%' if len(cnt) != 0 else 'N/A'
					sum_ += len(cnt)
					avg_ += cnt.sum()[0]
				ws[col+str(3+leng)] = sum_
				ws[ncol+str(3+leng)] = str(round(avg_/sum_,3))+'%' if sum_ != 0 else 'N/A'
				# ORF & NCR
				cnt = bpx[ bpx[book.col_ORF].isin(book.ORF)][[col_diff]]
				ws[col+str(4+leng)] = len(cnt)
				ws[ncol+str(4+leng)] = str(round(cnt.sum()[0] / len(cnt), 3))+'%' if len(cnt) != 0 else 'N/A'
				cnt = bpx[ bpx[book.col_ORF].isin(book.NCR)][[col_diff]]
				ws[col+str(5+leng)] = len(cnt)
				ws[ncol+str(5+leng)] = str(round(cnt.sum()[0] / len(cnt), 3))+'%' if len(cnt) != 0 else 'N/A'
				col = next_col(col)

			# Range label, placed book.nsheets columns to the right of the last average column.
			ncol = chr(ord(ncol)+book.nsheets)
			ws[ncol+str(rows+2)] = str(self.s1[i])+"~"+str(self.s2[i])
		infolog("lowhigh GenomeStr end")
コード例 #14
0
ファイル: fullseq.py プロジェクト: lis123kr/GPA
	def sheet7(self, ws, book):
		"""Write, per MAF range, the count and mean MAF of every major/minor pair.

		Each range ``self.s1[i]``..``self.s2[i]`` gets a block of rows: a header,
		one row per sheet with the number of in-range rows for each of the 12
		major/minor combinations (columns F onward), a ``sum`` row with the
		per-combination totals, one mean-MAF row per sheet, and a final row with
		the mean MAF per combination over all sheets.

		:param ws: worksheet addressed with ``ws['A1'] = value`` style keys.
		:param book: provides ``nsheets``, ``sheet_list``, ``BP35``,
			``col_basepair`` and ``get_Number_of_GPS``.
		"""
		infolog("Start sheet7")
		for i in range(0, len(self.s1)):
			# First row of the block that belongs to this range.
			r = i * (book.nsheets + 12) + 2
			ws["A"+str(r)] = str(self.s1[i]) + "~" + str(self.s2[i]) + "%"

			ws["B"+str(r)] = "Virus"
			ws.merge_cells(start_row=r, start_column=2, end_row=r+2, end_column=2)

			ws["C"+str(r)] = "Number of GPS"
			ws.merge_cells(start_row=r, start_column=3, end_row=r+2, end_column=3)

			ws["D"+str(r)] = "GPS Mean"
			ws.merge_cells(start_row=r, start_column=4, end_row=r+2, end_column=4)

			ws["E"+str(r)] = "Major"
			ws.merge_cells(start_row=r, start_column=5, end_row=r+1, end_column=5)
			ws["E"+str(r+2)] = "Minor"

			ws['F'+str(r)] = 'A'
			ws.merge_cells(start_row=r, start_column=6, end_row=r+1, end_column=8)
			ws['I'+str(r)] = 'G'
			ws.merge_cells(start_row=r, start_column=9, end_row=r+1, end_column=11)
			ws['L'+str(r)] = 'C'
			ws.merge_cells(start_row=r, start_column=12, end_row=r+1, end_column=14)
			ws['O'+str(r)] = 'T'
			ws.merge_cells(start_row=r, start_column=15, end_row=r+1, end_column=17)

			# Rows holding the totals over all sheets: counts and MAF sums.
			numrow, mnrow = r+3+book.nsheets, r+5+2*book.nsheets
			ws['E'+str(numrow)] = 'sum'
			for x in range(0, book.nsheets):
				row, mafrow = r+x+3, r+book.nsheets+x+5
				s, l, minor_ = book.get_Number_of_GPS(book.BP35[x], self.s1[i], self.s2[i])
				mxr = book.BP35[x].loc[minor_.index]
				ws["B"+str(row)] = book.sheet_list[x]
				ws['C'+str(row)] = len(minor_)
				ws['D'+str(row)] = str(round(s/l, 3)) + '%' if l != 0 else '-'

				col = 'F'
				for a in range(0, 4):
					for b in range(0, 4):
						if a == b:
							continue
						ws[col+str(r+2)] = book.col_basepair[b].lower()
						ext_ = mxr[logical_and( mxr['major_idx']==a, mxr['minor_idx']==b )]

						ws[col+str(row)] = len(ext_)

						if len(ext_) != 0:
							# Sum of the MAF values; written out as their mean.
							maf_ =  (divide( ext_[['minor']], ext_[['sum']]) * 100).sum()[0]
							ws[col+str(mafrow)] = str(round( maf_ / len(ext_), 3)) + '%'
						else:
							# No rows for this combination: contribute nothing to the
							# total instead of re-adding the previous combination's sum.
							maf_ = 0
							ws[col+str(mafrow)] = 'N/A'

						# Running totals over all sheets: row count ...
						if ws[col+str(numrow)].value is None:
							ws[col+str(numrow)] =  len(ext_)
						else:
							ws[col+str(numrow)].value += len(ext_)

						# ... and MAF sum.
						if ws[col+str(mnrow)].value is None:
							ws[col+str(mnrow)] = maf_
						else:
							ws[col+str(mnrow)].value += maf_

						col = next_col(col)
			# Total MAF: turn each accumulated sum into a mean over the accumulated count.
			col = 'F'
			for c in range(0, 12):
				total_count = ws[col+str(numrow)].value
				# An untouched cell (no sheets) is treated like a zero count.
				if total_count not in (0, None):
					ws[col+str(mnrow)] = str(round( float(ws[col+str(mnrow)].value) / float(total_count), 3))+'%'
				else:
					ws[col+str(mnrow)] = 'N/A'
				col = next_col(col)

		infolog("End sheet7")
コード例 #15
0
ファイル: book.py プロジェクト: lis123kr/GPA
    def get_Number_of_GPS(self, BP, s1, s2):
        """
        Select the rows of ``BP`` whose MAF lies in the half-open range [s1, s2).

        MAF is computed as ``minor / sum * 100`` from the ``'minor'`` and
        ``'sum'`` columns of ``BP``.

        Returns a 3-tuple:
        - the sum of the MAF values of the selected rows,
        - the number of selected rows,
        - the selected rows of the ``'minor'`` column (a one-column selection
          that keeps the index of ``BP``).

        Progress messages and the columns of ``BP`` are sent to ``infolog``.
        """
        from analyzer import infolog
        infolog("before import")
        from numpy import divide, logical_and
        infolog("before divide")
        infolog(BP.columns)
        # One-column frames on both sides, so the result is a one-column frame too.
        maf_ = divide(BP[['minor']], BP[['sum']]) * 100
        infolog("before and")
        # Lower bound inclusive, upper bound exclusive.
        idx = logical_and(maf_ >= s1, maf_ < s2)
        # Converted to a plain (nested) list and used as a boolean row mask below.
        idx = idx.values.tolist()
        infolog("before return")
        infolog(maf_[idx].sum()[0])
        return maf_[idx].sum()[0], len(BP[idx]), BP[['minor']][idx]
コード例 #16
0
ファイル: fullseq.py プロジェクト: lis123kr/GPA
	def sheet3(self,ws, book):
		"""Write the per-region GPS summary of every MAF range into ``ws``.

		For each ``(self.s1[i], self.s2[i])`` range one block of rows is written:

		* columns A-C: the region labels (every GenomeStructure, "Total", every
		  RepeatRegion, "Total", "ORF", "NCR") and the number of ``book.Dumas``
		  rows belonging to each of them;
		* "Number of GPS": one column per sheet plus a summary column;
		* "Average MAF at GPS": one column per sheet plus a summary column;
		* "Number of GPS / length * 100": one column per sheet plus a summary
		  column;
		* one more column holding the "<s1>~<s2>%" label of the range.

		After the last range, ``Full_Seq_in_sheet3`` is called to append the
		full-sequence length table a few columns further right.

		Args:
			ws: worksheet written through ``ws[<cell>] = value``, read through
				``ws[<cell>].value`` and merged through ``ws.merge_cells(...)``
				(presumably an openpyxl worksheet).
			book: object providing ``GenomeStructure``, ``RepeatRegion``, ``ORF``,
				``NCR``, ``Dumas``, ``BP35``, ``sheet_list``, ``nsheets``,
				``filename``, the ``col_*`` column names and
				``get_Number_of_GPS``.
		"""
		infolog("Start sheet3")

		n_genome = len(book.GenomeStructure)
		n_repeat = len(book.RepeatRegion)

		def accumulate(cell, amount, as_float=True):
			# Add ``amount`` to the running total kept in a worksheet cell; an
			# empty cell is initialised with ``amount`` itself.
			previous = ws[cell].value
			if previous is None:
				ws[cell] = amount
			elif as_float:
				ws[cell] = float(previous) + amount
			else:
				ws[cell] = previous + amount

		for i in range(0, len(self.s1)):
			infolog("Writing {0} ~ {1}".format(self.s1[i], self.s2[i]))

			# r: first worksheet row of the block that holds the data of the
			# i-th (s1, s2) range.
			r = i*(15 + n_genome + n_repeat) + 1
			genome_total_row = r + 3 + n_genome
			repeat_first_row = r + 4 + n_genome
			# nr: "Total" row of the repeat regions; ORF and NCR follow it.
			nr = repeat_first_row + n_repeat

			ws['A'+str(r)] = "Region"
			ws.merge_cells(start_row=r, start_column=1, end_row=r+2, end_column=2)
			ws['C'+str(r)] = "Dumas Length (bp)"
			ws.merge_cells(start_row=r, start_column=3, end_row=r+2, end_column=3)

			ws['A'+str(r+3)] = "Genome Structure"
			ws.merge_cells(start_row=r+3, start_column=1, end_row=genome_total_row, end_column=1)
			# The label must sit in the top-left cell of the range merged on the
			# next line.  (It used to be written to the fixed row r+10, which is
			# that cell only when there are exactly six genome structures.)
			ws['A'+str(repeat_first_row)] = "Repeat region"
			ws.merge_cells(start_row=repeat_first_row, start_column=1, end_row=nr, end_column=1)

			# Columns B and C: the GenomeStructure / RepeatRegion categories and
			# their lengths.
			# G_rows: rows that receive the GenomeStructure data.
			# R_rows: rows that receive the RepeatRegion data.
			G_rows = list(range(r+3, genome_total_row))
			cnt_num = 0
			for ix in range(0, n_genome):
				ws['B' + str(G_rows[ix])] = book.GenomeStructure[ix]
				# Length is counted on the Dumas rows.
				nlen = len(book.Dumas[ book.Dumas[ book.col_GenomeStructure ] == book.GenomeStructure[ix]])
				ws['C' + str(G_rows[ix])] = nlen
				cnt_num += nlen
			ws['B' + str(genome_total_row)] = "Total"
			ws['C' + str(genome_total_row)] = cnt_num

			R_rows = list(range(repeat_first_row, nr))
			cnt_num = 0
			for ix in range(0, n_repeat):
				ws['B' + str(R_rows[ix])] = book.RepeatRegion[ix]
				nlen = len(book.Dumas[ book.Dumas[ book.col_RepeatRegion ] == book.RepeatRegion[ix]])
				ws['C'+str(R_rows[ix])] = nlen
				cnt_num += nlen
			ws['B'+str(nr)] = 'Total'
			ws['C' + str(nr)] = cnt_num

			# ORF / NCR labels and their full lengths.
			ws["A"+str(nr+1)] = "ORF"
			ws["A"+str(nr+2)] = "NCR"
			ws['C'+str(nr+1)] = len(book.Dumas[ book.Dumas[book.col_ORF].isin(book.ORF)])
			ws['C'+str(nr+2)] = len(book.Dumas[ book.Dumas[book.col_ORF].isin(book.NCR)])
			# end of columns B and C

			# ncol walks over the "Number of GPS" columns again later, when
			# "Number of GPS / length" is computed.
			ncol = col = 'D'

			# ---- "Number of GPS" -------------------------------------------
			ws[col+str(r)] = "Number of GPS"
			# gcol: summary column of this section (presumably the column right
			# after the per-sheet columns -- depends on next_n_col).
			gcol = next_n_col(col, book.nsheets)
			for c in range(0, book.nsheets):
				ws[col+str(r+2)] = book.sheet_list[c]
				if c == 0:
					# Write the file name and merge the two header rows once.
					# Doing it for every sheet wrote into cells that were already
					# part of the merged range.
					ws[col+str(r+1)] = book.filename
					ws.merge_cells(start_row=r, start_column=4, end_row=r, end_column=4 + book.nsheets)
					ws.merge_cells(start_row=r+1, start_column=4, end_row=r+1, end_column=4 + book.nsheets)

				# BP35: original note says "35 or above".
				_, _, minor_ = book.get_Number_of_GPS(book.BP35[c], self.s1[i], self.s2[i])

				tx_rows = book.BP35[c].loc[ minor_.index ]
				tx_orf = tx_rows[ tx_rows[ book.col_ORF ].isin(book.ORF)]
				tx_ncr = tx_rows[ tx_rows[ book.col_ORF ].isin(book.NCR)]

				self.insert_value_in_cell(ws, book, G_rows, col, gcol, book.BP35[c], minor_, book.col_GenomeStructure , "GPS")
				self.insert_value_in_cell(ws, book, R_rows, col, gcol, book.BP35[c], minor_, book.col_RepeatRegion , "GPS")
				ws[col+str(nr+1)] = len(tx_orf)
				ws[col+str(nr+2)] = len(tx_ncr)
				# 180612: keep running totals over all sheets in the summary column.
				accumulate(gcol+str(nr+1), len(tx_orf))
				accumulate(gcol+str(nr+2), len(tx_ncr))
				col = next_col(col)
			col = next_col(col)

			# ---- "Average MAF at GPS" --------------------------------------
			ws[col+str(r)] = "Average MAF at GPS"
			dcol = next_n_col(col, book.nsheets)
			for c in range(0, book.nsheets):
				ws[col+str(r+2)] = book.sheet_list[c]
				if c == 0:
					ws[col+str(r+1)] = book.filename
					ws.merge_cells(start_row=r, start_column=5 + book.nsheets, end_row=r, end_column=5 + 2 * book.nsheets)
					ws.merge_cells(start_row=r+1, start_column=5 + book.nsheets, end_row=r+1, end_column=5 + 2 * book.nsheets)

				_, _, minor_ = book.get_Number_of_GPS(book.BP35[c], self.s1[i], self.s2[i])

				tx_rows = book.BP35[c].loc[ minor_.index ]
				tx_orf = tx_rows[ tx_rows[ book.col_ORF ].isin(book.ORF)]
				tx_ncr = tx_rows[ tx_rows[ book.col_ORF ].isin(book.NCR)]

				self.insert_value_in_cell(ws, book, G_rows, col, dcol, book.BP35[c], minor_, book.col_GenomeStructure , "MAF")
				self.insert_value_in_cell(ws, book, R_rows, col, dcol, book.BP35[c], minor_, book.col_RepeatRegion , "MAF")

				# 180612: ORF / NCR rows.
				for row_, subset in ((nr+1, tx_orf), (nr+2, tx_ncr)):
					# Sum of the MAF percentages of the selected rows.  The rows
					# of ``subset`` come from BP35[c], so its own 'sum' column is
					# used; dividing Series avoids label alignment between a
					# 'minor' frame and a 'sum' frame.
					maf_ = (subset['minor'] / subset['sum'] * 100).sum()
					ws[col+str(row_)] = str(round( maf_ /len(subset), 3)) +'%' if len(subset) != 0 else '-'
					accumulate(dcol+str(row_), maf_)

				col = next_col(col)

			col = next_col(col)

			# ---- "Number of GPS / length * 100" ----------------------------
			d2col = next_n_col(col, book.nsheets)
			ws[col+str(r)] = "Number of GPS / length * 100"

			for c in range(0, book.nsheets):
				ws[col+str(r+2)] = book.sheet_list[c]
				if c == 0:
					ws[col+str(r+1)] = book.filename
					ws.merge_cells(start_row=r, start_column=6 + 2 *book.nsheets, end_row=r, end_column=6 + 3 *book.nsheets)
					ws.merge_cells(start_row=r+1, start_column=6 + 2 *book.nsheets, end_row=r+1, end_column=6 + 3 *book.nsheets)

				bp = book.BP35[c]
				sum_ = 0
				# Rows in order: genome structures, their total, repeat regions,
				# their total.
				for ridx, rs in enumerate(range(r+3, nr+1)):
					if ridx < n_genome:
						llen_ = len(bp[ bp[ book.col_GenomeStructure] == book.GenomeStructure[ridx]])
						sum_ += llen_
					elif ridx == n_genome:
						llen_ = sum_
						sum_ = 0
					elif ridx < n_genome + 1 + n_repeat:
						llen_ = len(bp[ bp[ book.col_RepeatRegion] == book.RepeatRegion[ridx-1-n_genome]])
						sum_ += llen_
					else:
						llen_ = sum_

					ws[col + str(rs)] = (ws[ncol+str(rs)].value / llen_) * 100 if llen_ != 0 else '-'

					# 180612: running total of the lengths over all sheets.
					accumulate(d2col + str(rs), llen_, as_float=False)

				# ORF & NCR
				for rs_, members in ((nr+1, book.ORF), (nr+2, book.NCR)):
					tmp = len(bp[ bp[book.col_ORF].isin(members)])
					ws[col+str(rs_)] = (ws[ncol+str(rs_)].value / tmp) * 100 if tmp != 0 else '-'
					# 180612: running total of the lengths over all sheets.
					accumulate(d2col + str(rs_), tmp, as_float=False)

				ncol = next_col(ncol)
				col = next_col(col)

			# 180612: turn the running totals of the three summary columns into
			# their final values.
			for rs in range(r+3, nr+3):
				# The average MAF needs the *total* GPS count, so it has to be
				# computed before gcol is replaced by the per-sheet mean below.
				maf_total = float(ws[dcol+str(rs)].value)
				ws[dcol+str(rs)] = str(round(maf_total / float(ws[gcol+str(rs)].value), 3)) + '%' if maf_total != 0 else '-'

				gps_mean = float(ws[gcol+str(rs)].value) / book.nsheets
				ws[gcol+str(rs)].value = gps_mean

				length_total = float(ws[d2col + str(rs)].value)
				ws[d2col + str(rs)] = (gps_mean / ( length_total / book.nsheets)) * 100 if length_total != 0.0 else 0

			col = next_col(col)
			ws[col+str(r+3)] = str(self.s1[i]) + "~" + str(self.s2[i]) + "%"
		infolog("finished writing GenomeStructure & RepeatRegion")

		# Full-sequence table, three columns to the right of the last column used.
		col = next_col(next_col(next_col(col)))
		self.Full_Seq_in_sheet3(ws, col, book)
		infolog("End sheet3")
コード例 #17
0
ファイル: fullseq.py プロジェクト: lis123kr/GPA
	def Full_Seq_in_sheet3(self, ws, col, book):
		"""Write the per-sheet length table, starting at worksheet column ``col``.

		Column ``col`` receives the category labels from row 4 downwards: every
		GenomeStructure, "Total", every RepeatRegion, "Total", "ORF" and "NCR".
		Each sheet then gets two further columns: the row counts taken from
		``book.BPRaw[i]`` (headed by the sheet name) and the row counts taken
		from ``book.BP35[i]``.

		Args:
			ws: worksheet written through ``ws[<cell>] = value``.
			col: column letter(s) of the label column.
			book: object providing ``GenomeStructure``, ``RepeatRegion``, ``ORF``,
				``NCR``, ``BPRaw``, ``BP35``, ``sheet_list``, ``nsheets`` and the
				``col_*`` column names.
		"""
		infolog("Start Full_Seq_in_sheet3")

		n_genome = len(book.GenomeStructure)
		n_repeat = len(book.RepeatRegion)
		# rs_: "Total" row of the repeat regions; ORF and NCR follow it.
		rs_ = 5 + n_genome + n_repeat

		# Label column: GenomeStructure & RepeatRegion categories, ORF & NCR.
		for k in range(n_genome):
			ws[col+str(4+k)] = book.GenomeStructure[k]
		ws[col+str(4+n_genome)] = 'Total'
		for k in range(n_repeat):
			ws[col+str(5+n_genome+k)] = book.RepeatRegion[k]
		ws[col+str(rs_)] = 'Total'
		ws[col+str(rs_+1)] = 'ORF'
		ws[col+str(rs_+2)] = 'NCR'

		# Two count columns per sheet.
		col = next_col(col)
		for i in range(0, book.nsheets):
			infolog("{0} Writing {1} sheet".format(time.time(), i))
			ws[col+str(2)] = book.sheet_list[i]

			# Original note: includes values of 35 and below.
			self._write_length_column(ws, col, '(BP_full)Length', book.BPRaw[i], book)

			# Original note: base pairs of 35 or above.
			col = next_col(col)
			self._write_length_column(ws, col, '(BP_35이상)Length', book.BP35[i], book)
			col = next_col(col)
		infolog("End Full_Seq_in_sheet3")

	def _write_length_column(self, ws, col, label, frame, book):
		"""Fill column ``col`` with the number of ``frame`` rows per category.

		``label`` goes into row 3; the counts follow in the same row layout as
		the label column written by ``Full_Seq_in_sheet3``.  An empty ``frame``
		is never indexed: every count is written as 0.  The emptiness check is
		made on the frame that is actually counted (the BP35 genome-structure
		counts used to be guarded by the length of the raw frame instead).
		"""
		n_genome = len(book.GenomeStructure)
		n_repeat = len(book.RepeatRegion)
		last_total_row = 5 + n_genome + n_repeat
		has_rows = len(frame) != 0

		ws[col+str(3)] = label

		total = 0
		for k in range(n_genome):
			count = len(frame[ frame[book.col_GenomeStructure] == book.GenomeStructure[k]]) if has_rows else 0
			ws[col+str(4+k)] = count
			total += count
		ws[col+str(4+n_genome)] = total

		total = 0
		for k in range(n_repeat):
			count = len(frame[ frame[book.col_RepeatRegion] == book.RepeatRegion[k]]) if has_rows else 0
			ws[col+str(5+n_genome+k)] = count
			total += count
		ws[col+str(last_total_row)] = total

		# ORF & NCR
		ws[col+str(last_total_row+1)] = len(frame[ frame[book.col_ORF].isin(book.ORF)]) if has_rows else 0
		ws[col+str(last_total_row+2)] = len(frame[ frame[book.col_ORF].isin(book.NCR)]) if has_rows else 0